aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2016-02-24 21:32:58 +0000
committerDimitry Andric <dim@FreeBSD.org>2016-02-24 21:32:58 +0000
commitd9c9bd8485071afb22adcd2bb08f6a8e5e587ed6 (patch)
tree53c036d35173ba19f107d9afe07678667d270bee
parent3f4bde29a30d8c43db5cbe8f5541ebc5d1fdc6af (diff)
downloadsrc-d9c9bd8485071afb22adcd2bb08f6a8e5e587ed6.tar.gz
src-d9c9bd8485071afb22adcd2bb08f6a8e5e587ed6.zip
Vendor import of llvm release_38 branch r261684:vendor/llvm/llvm-release_38-r261684
Notes
Notes: svn path=/vendor/llvm/dist/; revision=296003 svn path=/vendor/llvm/llvm-release_38-r261684/; revision=296004; tag=vendor/llvm/llvm-release_38-r261684
-rw-r--r--lib/CodeGen/RegAllocFast.cpp10
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp12
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.h2
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp205
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.h35
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp34
-rw-r--r--lib/Target/X86/X86ISelLowering.h3
-rw-r--r--lib/Target/X86/X86InstrCompiler.td4
-rw-r--r--test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-shrink-wrapping.ll85
-rw-r--r--test/CodeGen/ARM/Windows/alloca.ll4
-rw-r--r--test/CodeGen/PowerPC/pr26690.ll118
-rw-r--r--test/CodeGen/X86/i386-tlscall-fastregalloc.ll26
-rw-r--r--test/CodeGen/X86/tls-shrink-wrapping.ll60
14 files changed, 540 insertions, 60 deletions
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index f4c076fea0e7..8d7a7213ba07 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -1002,11 +1002,13 @@ void RAFast::AllocateBasicBlock() {
unsigned DefOpEnd = MI->getNumOperands();
if (MI->isCall()) {
- // Spill all virtregs before a call. This serves two purposes: 1. If an
+ // Spill all virtregs before a call. This serves one purpose: If an
// exception is thrown, the landing pad is going to expect to find
- // registers in their spill slots, and 2. we don't have to wade through
- // all the <imp-def> operands on the call instruction.
- DefOpEnd = VirtOpEnd;
+ // registers in their spill slots.
+ // Note: although this is appealing to just consider all definitions
+ // as call-clobbered, this is not correct because some of those
+ // definitions may be used later on and we do not want to reuse
+ // those for virtual registers in between.
DEBUG(dbgs() << " Spilling remaining registers before call.\n");
spillAll(MI);
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 11ae8005370d..3f63d049c34e 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -275,6 +275,18 @@ static bool isCSSave(MachineInstr *MBBI) {
MBBI->getOpcode() == AArch64::STPDpre;
}
+bool AArch64FrameLowering::canUseAsPrologue(
+ const MachineBasicBlock &MBB) const {
+ const MachineFunction *MF = MBB.getParent();
+ const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+ // Don't need a scratch register if we're not going to re-align the stack.
+ // Otherwise, we may need a scratch register to be available and we do not
+ // support that for now.
+ return !RegInfo->needsStackRealignment(*MF);
+}
+
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 427afdf4acbf..7d8354c38787 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -37,6 +37,8 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
int resolveFrameIndexReference(const MachineFunction &MF, int FI,
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index beab844c6025..3fd509ae27f4 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -556,16 +556,42 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
}
}
-bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
- bool UseAtEnd,
- unsigned *ScratchRegister) const {
+/* This function will do the following:
+ - If MBB is an entry or exit block, set SR1 and SR2 to R0 and R12
+ respectively (defaults recommended by the ABI) and return true
+ - If MBB is not an entry block, initialize the register scavenger and look
+ for available registers.
+ - If the defaults (R0/R12) are available, return true
+ - If TwoUniqueRegsRequired is set to true, it looks for two unique
+ registers. Otherwise, look for a single available register.
+ - If the required registers are found, set SR1 and SR2 and return true.
+ - If the required registers are not found, set SR2 or both SR1 and SR2 to
+ PPC::NoRegister and return false.
+
+ Note that if both SR1 and SR2 are valid parameters and TwoUniqueRegsRequired
+ is not set, this function will attempt to find two different registers, but
+ still return true if only one register is available (and set SR1 == SR2).
+*/
+bool
+PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
+ bool UseAtEnd,
+ bool TwoUniqueRegsRequired,
+ unsigned *SR1,
+ unsigned *SR2) const {
RegScavenger RS;
- unsigned R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0;
+ unsigned R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0;
+ unsigned R12 = Subtarget.isPPC64() ? PPC::X12 : PPC::R12;
+
+ // Set the defaults for the two scratch registers.
+ if (SR1)
+ *SR1 = R0;
- if (ScratchRegister)
- *ScratchRegister = R0;
+ if (SR2) {
+ assert (SR1 && "Asking for the second scratch register but not the first?");
+ *SR2 = R12;
+ }
- // If MBB is an entry or exit block, use R0 as the scratch register
+ // If MBB is an entry or exit block, use R0 and R12 as the scratch registers.
if ((UseAtEnd && MBB->isReturnBlock()) ||
(!UseAtEnd && (&MBB->getParent()->front() == MBB)))
return true;
@@ -573,8 +599,8 @@ bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
RS.enterBasicBlock(MBB);
if (UseAtEnd && !MBB->empty()) {
- // The scratch register will be used at the end of the block, so must consider
- // all registers used within the block
+ // The scratch register will be used at the end of the block, so must
+ // consider all registers used within the block
MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator();
// If no terminator, back iterator up to previous instruction.
@@ -584,35 +610,86 @@ bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
if (MBBI != MBB->begin())
RS.forward(MBBI);
}
-
- if (!RS.isRegUsed(R0))
+
+ // If the two registers are available, we're all good.
+ // Note that we only return here if both R0 and R12 are available because
+ // although the function may not require two unique registers, it may benefit
+ // from having two so we should try to provide them.
+ if (!RS.isRegUsed(R0) && !RS.isRegUsed(R12))
return true;
- unsigned Reg = RS.FindUnusedReg(Subtarget.isPPC64() ? &PPC::G8RCRegClass
- : &PPC::GPRCRegClass);
-
- // Make sure the register scavenger was able to find an available register
- // If not, use R0 but return false to indicate no register was available and
- // R0 must be used (as recommended by the ABI)
- if (Reg == 0)
- return false;
+ // Get the list of callee-saved registers for the target.
+ const PPCRegisterInfo *RegInfo =
+ static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MBB->getParent());
+
+ // Get all the available registers in the block.
+ BitVector BV = RS.getRegsAvailable(Subtarget.isPPC64() ? &PPC::G8RCRegClass :
+ &PPC::GPRCRegClass);
+
+ // We shouldn't use callee-saved registers as scratch registers as they may be
+ // available when looking for a candidate block for shrink wrapping but not
+ // available when the actual prologue/epilogue is being emitted because they
+ // were added as live-in to the prologue block by PrologueEpilogueInserter.
+ for (int i = 0; CSRegs[i]; ++i)
+ BV.reset(CSRegs[i]);
+
+ // Set the first scratch register to the first available one.
+ if (SR1) {
+ int FirstScratchReg = BV.find_first();
+ *SR1 = FirstScratchReg == -1 ? (unsigned)PPC::NoRegister : FirstScratchReg;
+ }
- if (ScratchRegister)
- *ScratchRegister = Reg;
+ // If there is another one available, set the second scratch register to that.
+ // Otherwise, set it to either PPC::NoRegister if this function requires two
+ // or to whatever SR1 is set to if this function doesn't require two.
+ if (SR2) {
+ int SecondScratchReg = BV.find_next(*SR1);
+ if (SecondScratchReg != -1)
+ *SR2 = SecondScratchReg;
+ else
+ *SR2 = TwoUniqueRegsRequired ? (unsigned)PPC::NoRegister : *SR1;
+ }
+
+ // Now that we've done our best to provide both registers, double check
+ // whether we were unable to provide enough.
+ if (BV.count() < (TwoUniqueRegsRequired ? 2 : 1))
+ return false;
return true;
}
+// We need a scratch register for spilling LR and for spilling CR. By default,
+// we use two scratch registers to hide latency. However, if only one scratch
+// register is available, we can adjust for that by not overlapping the spill
+// code. However, if we need to realign the stack (i.e. have a base pointer)
+// and the stack frame is large, we need two scratch registers.
+bool
+PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const {
+ const PPCRegisterInfo *RegInfo =
+ static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ MachineFunction &MF = *(MBB->getParent());
+ bool HasBP = RegInfo->hasBasePointer(MF);
+ unsigned FrameSize = determineFrameLayout(MF, false);
+ int NegFrameSize = -FrameSize;
+ bool IsLargeFrame = !isInt<16>(NegFrameSize);
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned MaxAlign = MFI->getMaxAlignment();
+
+ return IsLargeFrame && HasBP && MaxAlign > 1;
+}
+
bool PPCFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
- return findScratchRegister(TmpMBB, false, nullptr);
+ return findScratchRegister(TmpMBB, false,
+ twoUniqueScratchRegsRequired(TmpMBB));
}
bool PPCFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
- return findScratchRegister(TmpMBB, true, nullptr);
+ return findScratchRegister(TmpMBB, true);
}
void PPCFrameLowering::emitPrologue(MachineFunction &MF,
@@ -664,6 +741,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
bool MustSaveLR = FI->mustSaveLR();
const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+ bool MustSaveCR = !MustSaveCRs.empty();
// Do we have a frame pointer and/or base pointer for this function?
bool HasFP = hasFP(MF);
bool HasBP = RegInfo->hasBasePointer(MF);
@@ -701,9 +779,15 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) &&
"FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4.");
- findScratchRegister(&MBB, false, &ScratchReg);
- assert(ScratchReg && "No scratch register!");
-
+ // Using the same bool variable as below to suppress compiler warnings.
+ bool SingleScratchReg =
+ findScratchRegister(&MBB, false, twoUniqueScratchRegsRequired(&MBB),
+ &ScratchReg, &TempReg);
+ assert(SingleScratchReg &&
+ "Required number of registers not available in this block");
+
+ SingleScratchReg = ScratchReg == TempReg;
+
int LROffset = getReturnSaveOffset();
int FPOffset = 0;
@@ -748,13 +832,30 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// indexed into with a simple STDU/STWU/STD/STW immediate offset operand.
bool isLargeFrame = !isInt<16>(NegFrameSize);
+ assert((isPPC64 || !MustSaveCR) &&
+ "Prologue CR saving supported only in 64-bit mode");
+
+ // If we need to spill the CR and the LR but we don't have two separate
+ // registers available, we must spill them one at a time
+ if (MustSaveCR && SingleScratchReg && MustSaveLR) {
+ // FIXME: In the ELFv2 ABI, we are not required to save all CR fields.
+ // If only one or two CR fields are clobbered, it could be more
+ // efficient to use mfocrf to selectively save just those fields.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg);
+ for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+ MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
+ .addReg(TempReg, getKillRegState(true))
+ .addImm(8)
+ .addReg(SPReg);
+ }
+
if (MustSaveLR)
BuildMI(MBB, MBBI, dl, MFLRInst, ScratchReg);
- assert((isPPC64 || MustSaveCRs.empty()) &&
- "Prologue CR saving supported only in 64-bit mode");
-
- if (!MustSaveCRs.empty()) { // will only occur for PPC64
+ if (MustSaveCR &&
+ !(SingleScratchReg && MustSaveLR)) { // will only occur for PPC64
// FIXME: In the ELFv2 ABI, we are not required to save all CR fields.
// If only one or two CR fields are clobbered, it could be more
// efficient to use mfocrf to selectively save just those fields.
@@ -792,7 +893,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
.addImm(LROffset)
.addReg(SPReg);
- if (!MustSaveCRs.empty()) // will only occur for PPC64
+ if (MustSaveCR &&
+ !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64
BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
.addReg(TempReg, getKillRegState(true))
.addImm(8)
@@ -811,6 +913,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
.addReg(SPReg);
}
+ // This condition must be kept in sync with canUseAsPrologue.
if (HasBP && MaxAlign > 1) {
if (isPPC64)
BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg)
@@ -828,6 +931,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
.addReg(ScratchReg, RegState::Kill)
.addImm(NegFrameSize);
} else {
+ assert(!SingleScratchReg && "Only a single scratch reg available");
BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg)
.addImm(NegFrameSize >> 16);
BuildMI(MBB, MBBI, dl, OrImmInst, TempReg)
@@ -951,7 +1055,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// For SVR4, don't emit a move for the CR spill slot if we haven't
// spilled CRs.
if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
- && MustSaveCRs.empty())
+ && !MustSaveCR)
continue;
// For 64-bit SVR4 when we have spilled CRs, the spill location
@@ -1005,6 +1109,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
bool MustSaveLR = FI->mustSaveLR();
const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+ bool MustSaveCR = !MustSaveCRs.empty();
// Do we have a frame pointer and/or base pointer for this function?
bool HasFP = hasFP(MF);
bool HasBP = RegInfo->hasBasePointer(MF);
@@ -1026,14 +1131,19 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
: PPC::ADDI );
const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8
: PPC::ADD4 );
-
+
int LROffset = getReturnSaveOffset();
int FPOffset = 0;
- findScratchRegister(&MBB, true, &ScratchReg);
- assert(ScratchReg && "No scratch register!");
-
+ // Using the same bool variable as below to suppress compiler warnings.
+ bool SingleScratchReg = findScratchRegister(&MBB, true, false, &ScratchReg,
+ &TempReg);
+ assert(SingleScratchReg &&
+ "Could not find an available scratch register");
+
+ SingleScratchReg = ScratchReg == TempReg;
+
if (HasFP) {
if (isSVR4ABI) {
MachineFrameInfo *FFI = MF.getFrameInfo();
@@ -1130,15 +1240,27 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
+ assert((isPPC64 || !MustSaveCR) &&
+ "Epilogue CR restoring supported only in 64-bit mode");
+
+ // If we need to save both the LR and the CR and we only have one available
+ // scratch register, we must do them one at a time.
+ if (MustSaveCR && SingleScratchReg && MustSaveLR) {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
+ .addImm(8)
+ .addReg(SPReg);
+ for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
+ .addReg(TempReg, getKillRegState(i == e-1));
+ }
+
if (MustSaveLR)
BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
.addImm(LROffset)
.addReg(SPReg);
- assert((isPPC64 || MustSaveCRs.empty()) &&
- "Epilogue CR restoring supported only in 64-bit mode");
-
- if (!MustSaveCRs.empty()) // will only occur for PPC64
+ if (MustSaveCR &&
+ !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64
BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
.addImm(8)
.addReg(SPReg);
@@ -1160,7 +1282,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
.addImm(BPOffset)
.addReg(SPReg);
- if (!MustSaveCRs.empty()) // will only occur for PPC64
+ if (MustSaveCR &&
+ !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64
for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
.addReg(TempReg, getKillRegState(i == e-1));
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index bbe1329a5352..f1f3f0b831a7 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -30,28 +30,41 @@ class PPCFrameLowering: public TargetFrameLowering {
const unsigned BasePointerSaveOffset;
/**
- * \brief Find a register that can be used in function prologue and epilogue
+ * \brief Find register[s] that can be used in function prologue and epilogue
*
- * Find a register that can be use as the scratch register in function
+ * Find register[s] that can be used as scratch register[s] in function
* prologue and epilogue to save various registers (Link Register, Base
- * Pointer, etc.). Prefer R0, if it is available. If it is not available,
- * then choose a different register.
+ * Pointer, etc.). Prefer R0/R12, if available. Otherwise choose whatever
+ * register[s] are available.
*
- * This method will return true if an available register was found (including
- * R0). If no available registers are found, the method returns false and sets
- * ScratchRegister to R0, as per the recommendation in the ABI.
+ * This method will return true if it is able to find enough unique scratch
+ * registers (1 or 2 depending on the requirement). If it is unable to find
+ * enough available registers in the block, it will return false and set
+ * any passed output parameter that corresponds to a required unique register
+ * to PPC::NoRegister.
*
* \param[in] MBB The machine basic block to find an available register for
* \param[in] UseAtEnd Specify whether the scratch register will be used at
* the end of the basic block (i.e., will the scratch
* register kill a register defined in the basic block)
- * \param[out] ScratchRegister The scratch register to use
- * \return true if a scratch register was found. false of a scratch register
- * was not found and R0 is being used as the default.
+ * \param[in] TwoUniqueRegsRequired Specify whether this basic block will
+ * require two unique scratch registers.
+ * \param[out] SR1 The scratch register to use
+ * \param[out] SR2 The second scratch register. If this pointer is not null
+ * the function will attempt to set it to an available
+ * register regardless of whether there is a hard requirement
+ * for two unique scratch registers.
+ * \return true if the required number of registers was found.
+ * false if the required number of scratch registers weren't available.
+ * If either output parameter refers to a required scratch register
+ * that isn't available, it will be set to an invalid value.
*/
bool findScratchRegister(MachineBasicBlock *MBB,
bool UseAtEnd,
- unsigned *ScratchRegister) const;
+ bool TwoUniqueRegsRequired = false,
+ unsigned *SR1 = nullptr,
+ unsigned *SR2 = nullptr) const;
+ bool twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const;
public:
PPCFrameLowering(const PPCSubtarget &STI);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c12a3ed43d29..dd9966f9e179 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -22228,6 +22228,35 @@ X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI,
}
MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSAddr(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ // So, here we replace TLSADDR with the sequence:
+ // adjust_stackdown -> TLSADDR -> adjust_stackup.
+ // We need this because TLSADDR is lowered into calls
+ // inside MC, therefore without the two markers shrink-wrapping
+ // may push the prologue/epilogue past them.
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction &MF = *BB->getParent();
+
+ // Emit CALLSEQ_START right before the instruction.
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ MachineInstrBuilder CallseqStart =
+ BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0);
+ BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
+
+ // Emit CALLSEQ_END right after the instruction.
+ // We don't call erase from parent because we want to keep the
+ // original instruction around.
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ MachineInstrBuilder CallseqEnd =
+ BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
+ BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
+
+ return BB;
+}
+
+MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
MachineBasicBlock *BB) const {
// This is pretty easy. We're taking the value that we received from
@@ -22607,6 +22636,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::TCRETURNri64:
case X86::TCRETURNmi64:
return BB;
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ return EmitLoweredTLSAddr(MI, BB);
case X86::WIN_ALLOCA:
return EmitLoweredWinAlloca(MI, BB);
case X86::CATCHRET:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 0ab786e08e02..b67958a9c498 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -1129,6 +1129,9 @@ namespace llvm {
MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
MachineBasicBlock *BB) const;
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 96a29ca8c370..c709c8aca9fa 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -436,7 +436,7 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- Uses = [ESP] in {
+ usesCustomInserter = 1, Uses = [ESP] in {
def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLS_addr32",
[(X86tlsaddr tls32addr:$sym)]>,
@@ -456,7 +456,7 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- Uses = [RSP] in {
+ usesCustomInserter = 1, Uses = [RSP] in {
def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_addr64",
[(X86tlsaddr tls64addr:$sym)]>,
diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
index 1820b8163a90..90093f94d0ad 100644
--- a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -522,10 +522,10 @@ bb1:
; CHECK-LABEL: realign_conditional2
; Extra realignment in the prologue (performance issue).
-; CHECK: tbz {{.*}} .[[LABEL:.*]]
; CHECK: sub x9, sp, #32 // =32
; CHECK: and sp, x9, #0xffffffffffffffe0
; CHECK: mov x19, sp
+; CHECK: tbz {{.*}} .[[LABEL:.*]]
; Stack is realigned in a non-entry BB.
; CHECK: sub [[REG:x[01-9]+]], sp, #64
; CHECK: and sp, [[REG]], #0xffffffffffffffe0
diff --git a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
index 2ecd66ddf5d4..4d751f501d4a 100644
--- a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -630,3 +630,88 @@ loop2b: ; preds = %loop1
end:
ret void
}
+
+; Don't do shrink-wrapping when we need to re-align the stack pointer.
+; See bug 26642.
+; CHECK-LABEL: stack_realign:
+; CHECK-NOT: lsl w[[LSL1:[0-9]+]], w0, w1
+; CHECK-NOT: lsl w[[LSL2:[0-9]+]], w1, w0
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: sub x{{[0-9]+}}, sp, #16
+; CHECK-DAG: lsl w[[LSL1:[0-9]+]], w0, w1
+; CHECK-DAG: lsl w[[LSL2:[0-9]+]], w1, w0
+; CHECK-DAG: str w[[LSL1]],
+; CHECK-DAG: str w[[LSL2]],
+
+define i32 @stack_realign(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2) {
+ %tmp = alloca i32, align 32
+ %shl1 = shl i32 %a, %b
+ %shl2 = shl i32 %b, %a
+ %tmp2 = icmp slt i32 %a, %b
+ br i1 %tmp2, label %true, label %false
+
+true:
+ store i32 %a, i32* %tmp, align 4
+ %tmp4 = load i32, i32* %tmp
+ br label %false
+
+false:
+ %tmp.0 = phi i32 [ %tmp4, %true ], [ %a, %0 ]
+ store i32 %shl1, i32* %ptr1
+ store i32 %shl2, i32* %ptr2
+ ret i32 %tmp.0
+}
+
+; Re-aligned stack pointer with all caller-save regs live. See bug
+; 26642. In this case we currently avoid shrink wrapping because
+; ensuring we have a scratch register to re-align the stack pointer is
+; too complicated. Output should be the same for both enabled and
+; disabled shrink wrapping.
+; CHECK-LABEL: stack_realign2:
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #-{{[0-9]+}}]!
+; CHECK: add x29, sp, #{{[0-9]+}}
+; CHECK: lsl {{w[0-9]+}}, w0, w1
+
+define void @stack_realign2(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2, i32* %ptr3, i32* %ptr4, i32* %ptr5, i32* %ptr6) {
+ %tmp = alloca i32, align 32
+ %tmp1 = shl i32 %a, %b
+ %tmp2 = shl i32 %b, %a
+ %tmp3 = lshr i32 %a, %b
+ %tmp4 = lshr i32 %b, %a
+ %tmp5 = add i32 %b, %a
+ %tmp6 = sub i32 %b, %a
+ %tmp7 = add i32 %tmp1, %tmp2
+ %tmp8 = sub i32 %tmp2, %tmp3
+ %tmp9 = add i32 %tmp3, %tmp4
+ %tmp10 = add i32 %tmp4, %tmp5
+ %cmp = icmp slt i32 %a, %b
+ br i1 %cmp, label %true, label %false
+
+true:
+ store i32 %a, i32* %tmp, align 4
+ call void asm sideeffect "nop", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28}"() nounwind
+ br label %false
+
+false:
+ store i32 %tmp1, i32* %ptr1, align 4
+ store i32 %tmp2, i32* %ptr2, align 4
+ store i32 %tmp3, i32* %ptr3, align 4
+ store i32 %tmp4, i32* %ptr4, align 4
+ store i32 %tmp5, i32* %ptr5, align 4
+ store i32 %tmp6, i32* %ptr6, align 4
+ %idx1 = getelementptr inbounds i32, i32* %ptr1, i64 1
+ store i32 %a, i32* %idx1, align 4
+ %idx2 = getelementptr inbounds i32, i32* %ptr1, i64 2
+ store i32 %b, i32* %idx2, align 4
+ %idx3 = getelementptr inbounds i32, i32* %ptr1, i64 3
+ store i32 %tmp7, i32* %idx3, align 4
+ %idx4 = getelementptr inbounds i32, i32* %ptr1, i64 4
+ store i32 %tmp8, i32* %idx4, align 4
+ %idx5 = getelementptr inbounds i32, i32* %ptr1, i64 5
+ store i32 %tmp9, i32* %idx5, align 4
+ %idx6 = getelementptr inbounds i32, i32* %ptr1, i64 6
+ store i32 %tmp10, i32* %idx6, align 4
+
+ ret void
+}
diff --git a/test/CodeGen/ARM/Windows/alloca.ll b/test/CodeGen/ARM/Windows/alloca.ll
index 6a3d002ab3b3..0f20ffbd36db 100644
--- a/test/CodeGen/ARM/Windows/alloca.ll
+++ b/test/CodeGen/ARM/Windows/alloca.ll
@@ -13,7 +13,9 @@ entry:
}
; CHECK: bl num_entries
-; CHECK: movs [[R1:r[0-9]+]], #7
+; Any register is actually valid here, but turns out we use lr,
+; because we do not have the kill flag on R0.
+; CHECK: mov.w [[R1:lr]], #7
; CHECK: add.w [[R0:r[0-9]+]], [[R1]], [[R0]], lsl #2
; CHECK: bic [[R0]], [[R0]], #7
; CHECK: lsrs r4, [[R0]], #2
diff --git a/test/CodeGen/PowerPC/pr26690.ll b/test/CodeGen/PowerPC/pr26690.ll
new file mode 100644
index 000000000000..3e7662409d51
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr26690.ll
@@ -0,0 +1,118 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+%struct.anon = type { %struct.anon.0, %struct.anon.1 }
+%struct.anon.0 = type { i32 }
+%struct.anon.1 = type { i32 }
+
+@i = common global i32 0, align 4
+@b = common global i32* null, align 8
+@c = common global i32 0, align 4
+@a = common global i32 0, align 4
+@h = common global i32 0, align 4
+@g = common global i32 0, align 4
+@j = common global i32 0, align 4
+@f = common global %struct.anon zeroinitializer, align 4
+@d = common global i32 0, align 4
+@e = common global i32 0, align 4
+
+; Function Attrs: norecurse nounwind
+define signext i32 @fn1(i32* nocapture %p1, i32 signext %p2, i32* nocapture %p3) {
+entry:
+ %0 = load i32, i32* @i, align 4, !tbaa !1
+ %cond = icmp eq i32 %0, 8
+ br i1 %cond, label %if.end16, label %while.cond.preheader
+
+while.cond.preheader: ; preds = %entry
+ %1 = load i32*, i32** @b, align 8, !tbaa !5
+ %2 = load i32, i32* %1, align 4, !tbaa !1
+ %tobool18 = icmp eq i32 %2, 0
+ br i1 %tobool18, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph: ; preds = %while.cond.preheader
+ %.pre = load i32, i32* @c, align 4, !tbaa !1
+ br label %while.body
+
+while.body: ; preds = %while.body.backedge, %while.body.lr.ph
+ switch i32 %.pre, label %while.body.backedge [
+ i32 0, label %sw.bb1
+ i32 8, label %sw.bb1
+ i32 6, label %sw.bb1
+ i32 24, label %while.cond.backedge
+ ]
+
+while.body.backedge: ; preds = %while.body, %while.cond.backedge
+ br label %while.body
+
+sw.bb1: ; preds = %while.body, %while.body, %while.body
+ store i32 2, i32* @a, align 4, !tbaa !1
+ br label %while.cond.backedge
+
+while.cond.backedge: ; preds = %while.body, %sw.bb1
+ store i32 4, i32* @a, align 4, !tbaa !1
+ %.pre19 = load i32, i32* %1, align 4, !tbaa !1
+ %tobool = icmp eq i32 %.pre19, 0
+ br i1 %tobool, label %while.end.loopexit, label %while.body.backedge
+
+while.end.loopexit: ; preds = %while.cond.backedge
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %while.cond.preheader
+ %3 = load i32, i32* @h, align 4, !tbaa !1
+ %mul = mul nsw i32 %0, %3
+ %4 = load i32, i32* @g, align 4, !tbaa !1
+ %mul4 = mul nsw i32 %mul, %4
+ store i32 %mul4, i32* @j, align 4, !tbaa !1
+ %5 = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @f, i64 0, i32 0, i32 0), align 4, !tbaa !7
+ %tobool5 = icmp eq i32 %5, 0
+ br i1 %tobool5, label %if.end, label %if.then
+
+if.then: ; preds = %while.end
+ %div = sdiv i32 %5, %mul
+ store i32 %div, i32* @g, align 4, !tbaa !1
+ br label %if.end
+
+if.end: ; preds = %while.end, %if.then
+ %6 = phi i32 [ %4, %while.end ], [ %div, %if.then ]
+ %7 = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @f, i64 0, i32 1, i32 0), align 4, !tbaa !10
+ %tobool7 = icmp ne i32 %7, 0
+ %tobool8 = icmp ne i32 %mul4, 0
+ %or.cond = and i1 %tobool7, %tobool8
+ %tobool10 = icmp ne i32 %0, 0
+ %or.cond17 = and i1 %or.cond, %tobool10
+ br i1 %or.cond17, label %if.then11, label %if.end13
+
+if.then11: ; preds = %if.end
+ store i32 %3, i32* @d, align 4, !tbaa !1
+ %8 = load i32, i32* @e, align 4, !tbaa !1
+ store i32 %8, i32* %p3, align 4, !tbaa !1
+ %.pre20 = load i32, i32* @g, align 4, !tbaa !1
+ br label %if.end13
+
+if.end13: ; preds = %if.then11, %if.end
+ %9 = phi i32 [ %.pre20, %if.then11 ], [ %6, %if.end ]
+ %tobool14 = icmp eq i32 %9, 0
+ br i1 %tobool14, label %if.end16, label %if.then15
+
+if.then15: ; preds = %if.end13
+ store i32 %p2, i32* %p1, align 4, !tbaa !1
+ br label %if.end16
+
+if.end16: ; preds = %entry, %if.end13, %if.then15
+ ret i32 2
+}
+
+; CHECK: mfcr {{[0-9]+}}
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.9.0 (trunk 261520)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"any pointer", !3, i64 0}
+!7 = !{!8, !2, i64 0}
+!8 = !{!"", !9, i64 0, !9, i64 4}
+!9 = !{!"", !2, i64 0}
+!10 = !{!8, !2, i64 4}
diff --git a/test/CodeGen/X86/i386-tlscall-fastregalloc.ll b/test/CodeGen/X86/i386-tlscall-fastregalloc.ll
new file mode 100644
index 000000000000..775c0c1b3784
--- /dev/null
+++ b/test/CodeGen/X86/i386-tlscall-fastregalloc.ll
@@ -0,0 +1,26 @@
+; RUN: llc %s -o - -O0 -regalloc=fast | FileCheck %s
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.10"
+
+@c = external global i8, align 1
+@p = thread_local global i8* null, align 4
+
+; Check that regalloc fast correctly preserves EAX that is set by the TLS call
+; until the actual use.
+; PR26485.
+;
+; CHECK-LABEL: f:
+; Get p.
+; CHECK: movl _p@{{[0-9a-zA-Z]+}}, [[P_ADDR:%[a-z]+]]
+; CHECK-NEXT: calll *([[P_ADDR]])
+; At this point eax contains the address of p.
+; Load c address.
+; Make sure we do not clobber eax.
+; CHECK-NEXT: movl L_c{{[^,]*}}, [[C_ADDR:%e[b-z]x+]]
+; Store c address into p.
+; CHECK-NEXT: movl [[C_ADDR]], (%eax)
+define void @f() #0 {
+entry:
+ store i8* @c, i8** @p, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/tls-shrink-wrapping.ll b/test/CodeGen/X86/tls-shrink-wrapping.ll
new file mode 100644
index 000000000000..37c1754c0be8
--- /dev/null
+++ b/test/CodeGen/X86/tls-shrink-wrapping.ll
@@ -0,0 +1,60 @@
+; Testcase generated from the following code:
+; extern __thread int i;
+; void f();
+; int g(void) {
+; if (i) {
+; i = 0;
+; f();
+; }
+; return i;
+; }
+; We want to make sure that TLS variables are not accessed before
+; the stack frame is set up.
+
+; RUN: llc < %s -relocation-model=pic | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-freebsd11.0"
+
+@i = external thread_local global i32, align 4
+
+define i32 @g() #0 {
+entry:
+ %tmp = load i32, i32* @i, align 4
+ %tobool = icmp eq i32 %tmp, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ store i32 0, i32* @i, align 4
+ tail call void (...) @f() #2
+ %.pre = load i32, i32* @i, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %tmp1 = phi i32 [ 0, %entry ], [ %.pre, %if.then ]
+ ret i32 %tmp1
+}
+
+; CHECK: g: # @g
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: # BB#0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .Ltmp2:
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: data16
+; CHECK-NEXT: leaq i@TLSGD(%rip), %rdi
+
+declare void @f(...) #1
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }