1 files changed, 299 insertions, 166 deletions
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 61bcef8cb760..830354ab23f0 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -1,4 +1,4 @@
-//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
+//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,17 +15,29 @@
 #include "UnicodeCharSets.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.h"
 #include "clang/Lex/LexDiagnostic.h"
 #include "clang/Lex/LiteralSupport.h"
+#include "clang/Lex/MultipleIncludeOpt.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Lex/Token.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/NativeFormatting.h"
 #include "llvm/Support/UnicodeCharRanges.h"
 #include <algorithm>
 #include <cassert>
@@ -63,7 +75,7 @@ tok::ObjCKeywordKind Token::getObjCKeywordID() const {
 // Lexer Class Implementation
 //===----------------------------------------------------------------------===//
 
-void Lexer::anchor() { }
+void Lexer::anchor() {}
 
 void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                       const char *BufEnd) {
@@ -120,31 +132,21 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
 /// assumes that the associated file buffer and Preprocessor objects will
 /// outlive it, so it doesn't take ownership of either of them.
 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
-  : PreprocessorLexer(&PP, FID),
-    FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
-    LangOpts(PP.getLangOpts()) {
-
+    : PreprocessorLexer(&PP, FID),
+      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
+      LangOpts(PP.getLangOpts()) {
   InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
             InputFile->getBufferEnd());
 
   resetExtendedTokenMode();
 }
 
-void Lexer::resetExtendedTokenMode() {
-  assert(PP && "Cannot reset token mode without a preprocessor");
-  if (LangOpts.TraditionalCPP)
-    SetKeepWhitespaceMode(true);
-  else
-    SetCommentRetentionState(PP->getCommentRetentionState());
-}
-
 /// Lexer constructor - Create a new raw lexer object.  This object is only
 /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
 /// range will outlive it, so it doesn't take ownership of it.
 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
              const char *BufStart, const char *BufPtr, const char *BufEnd)
-  : FileLoc(fileloc), LangOpts(langOpts) {
-
+    : FileLoc(fileloc), LangOpts(langOpts) {
   InitLexer(BufStart, BufPtr, BufEnd);
 
   // We *are* in raw mode.
@@ -159,6 +161,14 @@ Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
     : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
             FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
 
+void Lexer::resetExtendedTokenMode() {
+  assert(PP && "Cannot reset token mode without a preprocessor");
+  if (LangOpts.TraditionalCPP)
+    SetKeepWhitespaceMode(true);
+  else
+    SetCommentRetentionState(PP->getCommentRetentionState());
+}
+
 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
 /// _Pragma expansion.  This has a variety of magic semantics that this method
 /// sets up.  It returns a new'd Lexer that must be delete'd when done.
@@ -209,30 +219,39 @@ Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
   return L;
 }
 
-/// Stringify - Convert the specified string into a C string, with surrounding
-/// ""'s, and with escaped \ and " characters.
+template <typename T> static void StringifyImpl(T &Str, char Quote) {
+  typename T::size_type i = 0, e = Str.size();
+  while (i < e) {
+    if (Str[i] == '\\' || Str[i] == Quote) {
+      Str.insert(Str.begin() + i, '\\');
+      i += 2;
+      ++e;
+    } else if (Str[i] == '\n' || Str[i] == '\r') {
+      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
+      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
+          Str[i] != Str[i + 1]) {
+        Str[i] = '\\';
+        Str[i + 1] = 'n';
+      } else {
+        // Replace '\n' and '\r' to '\\' followed by 'n'.
+        Str[i] = '\\';
+        Str.insert(Str.begin() + i + 1, 'n');
+        ++e;
+      }
+      i += 2;
+    } else
+      ++i;
+  }
+}
+
 std::string Lexer::Stringify(StringRef Str, bool Charify) {
   std::string Result = Str;
   char Quote = Charify ? '\'' : '"';
-  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
-    if (Result[i] == '\\' || Result[i] == Quote) {
-      Result.insert(Result.begin()+i, '\\');
-      ++i; ++e;
-    }
-  }
+  StringifyImpl(Result, Quote);
   return Result;
 }
 
-/// Stringify - Convert the specified string into a C string by escaping '\'
-/// and " characters.  This does not add surrounding ""'s to the string.
-void Lexer::Stringify(SmallVectorImpl<char> &Str) {
-  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
-    if (Str[i] == '\\' || Str[i] == '"') {
-      Str.insert(Str.begin()+i, '\\');
-      ++i; ++e;
-    }
-  }
-}
+void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
 
 //===----------------------------------------------------------------------===//
 // Token Spelling
@@ -307,7 +326,7 @@ StringRef Lexer::getSpelling(SourceLocation loc,
   StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
   if (invalidTemp) {
     if (invalid) *invalid = true;
-    return StringRef();
+    return {};
   }
 
   const char *tokenBegin = file.data() + locInfo.second;
@@ -345,7 +364,7 @@ std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
   if (Invalid)
     *Invalid = CharDataInvalid;
   if (CharDataInvalid)
-    return std::string();
+    return {};
 
   // If this token contains nothing interesting, return it directly.
   if (!Tok.needsCleaning())
@@ -367,7 +386,7 @@ std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
 /// to point to a constant buffer with the data already in it (avoiding a
 /// copy).  The caller is not allowed to modify the returned buffer pointer
 /// if an internal buffer is returned.
-unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 
+unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                             const SourceManager &SourceMgr,
                             const LangOptions &LangOpts, bool *Invalid) {
   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
@@ -463,19 +482,15 @@ static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
   const char *BufStart = Buffer.data();
   if (Offset >= Buffer.size())
     return nullptr;
-  const char *StrData = BufStart + Offset;
-
-  if (StrData[0] == '\n' || StrData[0] == '\r')
-    return StrData;
 
-  const char *LexStart = StrData;
-  while (LexStart != BufStart) {
-    if (LexStart[0] == '\n' || LexStart[0] == '\r') {
+  const char *LexStart = BufStart + Offset;
+  for (; LexStart != BufStart; --LexStart) {
+    if (isVerticalWhitespace(LexStart[0]) &&
+        !Lexer::isNewLineEscaped(BufStart, LexStart)) {
+      // LexStart should point at first character of logical line.
       ++LexStart;
       break;
     }
-
-    --LexStart;
   }
   return LexStart;
 }
@@ -487,7 +502,7 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
   if (LocInfo.first.isInvalid())
     return Loc;
-  
+
   bool Invalid = false;
   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
   if (Invalid)
@@ -499,31 +514,31 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
   const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
   if (!LexStart || LexStart == StrData)
     return Loc;
-  
+
   // Create a lexer starting at the beginning of this token.
   SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
   Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
                  Buffer.end());
   TheLexer.SetCommentRetentionState(true);
-  
+
   // Lex tokens until we find the token that contains the source location.
   Token TheTok;
   do {
     TheLexer.LexFromRawLexer(TheTok);
-    
+
     if (TheLexer.getBufferLocation() > StrData) {
       // Lexing this token has taken the lexer past the source location we're
       // looking for. If the current token encompasses our source location,
       // return the beginning of that token.
       if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
         return TheTok.getLocation();
-      
+
       // We ended up skipping over the source location entirely, which means
       // that it points into whitespace. We're done here.
       break;
     }
   } while (TheTok.getKind() != tok::eof);
-  
+
   // We've passed our source location; just return the original source location.
   return Loc;
 }
@@ -531,34 +546,34 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
                                           const SourceManager &SM,
                                           const LangOptions &LangOpts) {
- if (Loc.isFileID())
-   return getBeginningOfFileToken(Loc, SM, LangOpts);
- 
- if (!SM.isMacroArgExpansion(Loc))
-   return Loc;
-
- SourceLocation FileLoc = SM.getSpellingLoc(Loc);
- SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
- std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
- std::pair<FileID, unsigned> BeginFileLocInfo
-   = SM.getDecomposedLoc(BeginFileLoc);
- assert(FileLocInfo.first == BeginFileLocInfo.first &&
-        FileLocInfo.second >= BeginFileLocInfo.second);
- return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
+  if (Loc.isFileID())
+    return getBeginningOfFileToken(Loc, SM, LangOpts);
+
+  if (!SM.isMacroArgExpansion(Loc))
+    return Loc;
+
+  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
+  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
+  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
+  std::pair<FileID, unsigned> BeginFileLocInfo =
+      SM.getDecomposedLoc(BeginFileLoc);
+  assert(FileLocInfo.first == BeginFileLocInfo.first &&
+         FileLocInfo.second >= BeginFileLocInfo.second);
+  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
 }
 
 namespace {
 
-  enum PreambleDirectiveKind {
-    PDK_Skipped,
-    PDK_Unknown
-  };
+enum PreambleDirectiveKind {
+  PDK_Skipped,
+  PDK_Unknown
+};
 
-} // end anonymous namespace
+} // namespace
 
-std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
-                                                 const LangOptions &LangOpts,
-                                                 unsigned MaxLines) {
+PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
+                                      const LangOptions &LangOpts,
+                                      unsigned MaxLines) {
   // Create a lexer starting at the beginning of the file. Note that we use a
   // "fake" file source location at offset 1 so that the lexer will track our
   // position within the file.
@@ -568,9 +583,6 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
                  Buffer.end());
   TheLexer.SetCommentRetentionState(true);
 
-  // StartLoc will differ from FileLoc if there is a BOM that was skipped.
-  SourceLocation StartLoc = TheLexer.getSourceLocation();
-
   bool InPreprocessorDirective = false;
   Token TheTok;
   SourceLocation ActiveCommentLoc;
@@ -599,17 +611,17 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
       if (TheTok.getKind() == tok::eof) {
         break;
       }
-      
+
       // If we haven't hit the end of the preprocessor directive, skip this
       // token.
       if (!TheTok.isAtStartOfLine())
         continue;
-        
+
       // We've passed the end of the preprocessor directive, and will look
       // at this token again below.
       InPreprocessorDirective = false;
     }
-    
+
     // Keep track of the # of lines in the preamble.
     if (TheTok.isAtStartOfLine()) {
       unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
@@ -626,13 +638,13 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
         ActiveCommentLoc = TheTok.getLocation();
       continue;
     }
-    
+
     if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
-      // This is the start of a preprocessor directive. 
+      // This is the start of a preprocessor directive.
       Token HashTok = TheTok;
       InPreprocessorDirective = true;
       ActiveCommentLoc = SourceLocation();
-      
+
       // Figure out which directive this is. Since we're lexing raw tokens,
       // we don't have an identifier table available. Instead, just look at
       // the raw identifier to recognize and categorize preprocessor directives.
@@ -672,7 +684,7 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
           break;
         }
       }
-      
+
       // We only end up here if we didn't recognize the preprocessor
       // directive or it was one that can't occur in the preamble at this
       // point. Roll back the current token to the location of the '#'.
@@ -685,14 +697,14 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
     // the preamble.
     break;
   } while (true);
-  
+
   SourceLocation End;
   if (ActiveCommentLoc.isValid())
     End = ActiveCommentLoc; // don't truncate a decl comment.
   else
     End = TheTok.getLocation();
 
-  return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
+  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
                         TheTok.isAtStartOfLine());
 }
 
@@ -707,13 +719,13 @@ SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
   // trigraphs.
   bool Invalid = false;
   const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
-  
+
   // If they request the first char of the token, we're trivially done.
   if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
     return TokStart;
-  
+
   unsigned PhysOffset = 0;
-  
+
   // The usual case is that tokens don't contain anything interesting.  Skip
   // over the uninteresting characters.  If a token only consists of simple
   // chars, this method is extremely fast.
@@ -724,7 +736,7 @@ SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
     --CharNo;
     ++PhysOffset;
   }
-  
+
   // If we have a character that may be a trigraph or escaped newline, use a
   // lexer to parse it correctly.
   for (; CharNo; --CharNo) {
@@ -733,14 +745,14 @@ SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
     TokPtr += Size;
     PhysOffset += Size;
   }
-  
+
   // Final detail: if we end up on an escaped newline, we want to return the
   // location of the actual byte of the token.  For example foo\<newline>bar
   // advanced by 3 should return the location of b, not of \\.  One compounding
   // detail of this is that the escape may be made by a trigraph.
   if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
     PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
-  
+
   return TokStart.getLocWithOffset(PhysOffset);
 }
 
@@ -763,11 +775,11 @@ SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                           const SourceManager &SM,
                                           const LangOptions &LangOpts) {
   if (Loc.isInvalid())
-    return SourceLocation();
+    return {};
 
   if (Loc.isMacroID()) {
     if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
-      return SourceLocation(); // Points inside the macro expansion.
+      return {}; // Points inside the macro expansion.
   }
 
   unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
@@ -775,7 +787,7 @@ SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
     Len = Len - Offset;
   else
     return Loc;
-  
+
   return Loc.getLocWithOffset(Len);
 }
 
@@ -838,7 +850,7 @@ static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
   if (Range.isTokenRange()) {
     End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
     if (End.isInvalid())
-      return CharSourceRange();
+      return {};
   }
 
   // Break down the source locations.
@@ -846,12 +858,12 @@ static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
   unsigned BeginOffs;
   std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
   if (FID.isInvalid())
-    return CharSourceRange();
+    return {};
 
   unsigned EndOffs;
   if (!SM.isInFileID(End, FID, &EndOffs) ||
       BeginOffs > EndOffs)
-    return CharSourceRange();
+    return {};
 
   return CharSourceRange::getCharRange(Begin, End);
 }
@@ -862,14 +874,14 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
   SourceLocation Begin = Range.getBegin();
   SourceLocation End = Range.getEnd();
   if (Begin.isInvalid() || End.isInvalid())
-    return CharSourceRange();
+    return {};
 
   if (Begin.isFileID() && End.isFileID())
     return makeRangeFromFileLocs(Range, SM, LangOpts);
 
   if (Begin.isMacroID() && End.isFileID()) {
     if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
-      return CharSourceRange();
+      return {};
     Range.setBegin(Begin);
     return makeRangeFromFileLocs(Range, SM, LangOpts);
   }
@@ -879,7 +891,7 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                                           &End)) ||
         (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                            &End)))
-      return CharSourceRange();
+      return {};
     Range.setEnd(End);
     return makeRangeFromFileLocs(Range, SM, LangOpts);
   }
@@ -900,13 +912,13 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
   const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
                                                         &Invalid);
   if (Invalid)
-    return CharSourceRange();
+    return {};
 
   if (BeginEntry.getExpansion().isMacroArgExpansion()) {
     const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
                                                         &Invalid);
     if (Invalid)
-      return CharSourceRange();
+      return {};
 
     if (EndEntry.getExpansion().isMacroArgExpansion() &&
         BeginEntry.getExpansion().getExpansionLocStart() ==
@@ -917,7 +929,7 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
     }
   }
 
-  return CharSourceRange();
+  return {};
 }
 
 StringRef Lexer::getSourceText(CharSourceRange Range,
@@ -927,21 +939,21 @@ StringRef Lexer::getSourceText(CharSourceRange Range,
   Range = makeFileCharRange(Range, SM, LangOpts);
   if (Range.isInvalid()) {
     if (Invalid) *Invalid = true;
-    return StringRef();
+    return {};
   }
 
   // Break down the source location.
   std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
   if (beginInfo.first.isInvalid()) {
     if (Invalid) *Invalid = true;
-    return StringRef();
+    return {};
   }
 
   unsigned EndOffs;
   if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
       beginInfo.second > EndOffs) {
     if (Invalid) *Invalid = true;
-    return StringRef();
+    return {};
   }
 
   // Try to the load the file buffer.
@@ -949,7 +961,7 @@ StringRef Lexer::getSourceText(CharSourceRange Range,
   StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
   if (invalidTemp) {
     if (Invalid) *Invalid = true;
-    return StringRef();
+    return {};
   }
 
   if (Invalid) *Invalid = false;
@@ -972,7 +984,7 @@ StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
 
     // For macro arguments we need to check that the argument did not come
     // from an inner macro, e.g: "MAC1( MAC2(foo) )"
-    
+
     // Loc points to the argument id of the macro definition, move to the
     // macro expansion.
     Loc = SM.getImmediateExpansionRange(Loc).first;
@@ -1013,7 +1025,7 @@ StringRef Lexer::getImmediateMacroNameForDiagnostics(
   // If the macro's spelling has no FileID, then it's actually a token paste
   // or stringization (or similar) and not a macro at all.
   if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
-    return StringRef();
+    return {};
 
   // Find the spelling location of the start of the non-argument expansion
   // range. This is where the macro name was spelled in order to begin
@@ -1032,20 +1044,40 @@ bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
   return isIdentifierBody(c, LangOpts.DollarIdents);
 }
 
+bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
+  assert(isVerticalWhitespace(Str[0]));
+  if (Str - 1 < BufferStart)
+    return false;
+
+  if ((Str[0] == '\n' && Str[-1] == '\r') ||
+      (Str[0] == '\r' && Str[-1] == '\n')) {
+    if (Str - 2 < BufferStart)
+      return false;
+    --Str;
+  }
+  --Str;
+
+  // Rewind to first non-space character:
+  while (Str > BufferStart && isHorizontalWhitespace(*Str))
+    --Str;
+
+  return *Str == '\\';
+}
+
 StringRef Lexer::getIndentationForLine(SourceLocation Loc,
                                        const SourceManager &SM) {
   if (Loc.isInvalid() || Loc.isMacroID())
-    return "";
+    return {};
   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
   if (LocInfo.first.isInvalid())
-    return "";
+    return {};
   bool Invalid = false;
   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
   if (Invalid)
-    return "";
+    return {};
   const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
   if (!Line)
-    return "";
+    return {};
   StringRef Rest = Buffer.substr(Line - Buffer.data());
   size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
   return NumWhitespaceChars == StringRef::npos
@@ -1199,18 +1231,12 @@ const char *Lexer::SkipEscapedNewLines(const char *P) {
   }
 }
 
-/// \brief Checks that the given token is the first token that occurs after the
-/// given location (this excludes comments and whitespace). Returns the location
-/// immediately after the specified token. If the token is not found or the
-/// location is inside a macro, the returned source location will be invalid.
-SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
-                                        tok::TokenKind TKind,
-                                        const SourceManager &SM,
-                                        const LangOptions &LangOpts,
-                                        bool SkipTrailingWhitespaceAndNewLine) {
+Optional<Token> Lexer::findNextToken(SourceLocation Loc,
+                                     const SourceManager &SM,
+                                     const LangOptions &LangOpts) {
   if (Loc.isMacroID()) {
     if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
-      return SourceLocation();
+      return None;
   }
   Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
 
@@ -1221,7 +1247,7 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
   bool InvalidTemp = false;
   StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
   if (InvalidTemp)
-    return SourceLocation();
+    return None;
 
   const char *TokenBegin = File.data() + LocInfo.second;
 
@@ -1231,15 +1257,25 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
   // Find the token.
   Token Tok;
   lexer.LexFromRawLexer(Tok);
-  if (Tok.isNot(TKind))
-    return SourceLocation();
-  SourceLocation TokenLoc = Tok.getLocation();
+  return Tok;
+}
+
+/// \brief Checks that the given token is the first token that occurs after the
+/// given location (this excludes comments and whitespace). Returns the location
+/// immediately after the specified token. If the token is not found or the
+/// location is inside a macro, the returned source location will be invalid.
+SourceLocation Lexer::findLocationAfterToken(
+    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
+    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
+  Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
+  if (!Tok || Tok->isNot(TKind))
+    return {};
+  SourceLocation TokenLoc = Tok->getLocation();
 
   // Calculate how much whitespace needs to be skipped if any.
   unsigned NumWhitespaceChars = 0;
   if (SkipTrailingWhitespaceAndNewLine) {
-    const char *TokenEnd = SM.getCharacterData(TokenLoc) +
-                           Tok.getLength();
+    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
     unsigned char C = *TokenEnd;
     while (isHorizontalWhitespace(C)) {
       C = *(++TokenEnd);
@@ -1256,7 +1292,7 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
     }
   }
 
-  return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
+  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
 }
 
 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
@@ -1274,7 +1310,6 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
 ///
 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
 /// be updated to match.
-///
 char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                                Token *Tok) {
   // If we have a slash, look for an escaped newline.
@@ -1378,9 +1413,9 @@ Slash:
 // Helper methods for lexing.
 //===----------------------------------------------------------------------===//
 
-/// \brief Routine that indiscriminately skips bytes in the source file.
-void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
-  BufferPtr += Bytes;
+/// \brief Routine that indiscriminately sets the offset into the source file.
+void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
+  BufferPtr = BufferStart + Offset;
   if (BufferPtr > BufferEnd)
     BufferPtr = BufferEnd;
   // FIXME: What exactly does the StartOfLine bit mean?  There are two
@@ -1466,6 +1501,75 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
   }
 }
 
+/// After encountering UTF-8 character C and interpreting it as an identifier
+/// character, check whether it's a homoglyph for a common non-identifier
+/// source character that is unlikely to be an intentional identifier
+/// character and warn if so.
+static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
+                                       CharSourceRange Range) {
+  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
+  struct HomoglyphPair {
+    uint32_t Character;
+    char LooksLike;
+    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
+  };
+  static constexpr HomoglyphPair SortedHomoglyphs[] = {
+    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
+    {U'\u037e', ';'}, // GREEK QUESTION MARK
+    {U'\u2212', '-'}, // MINUS SIGN
+    {U'\u2215', '/'}, // DIVISION SLASH
+    {U'\u2216', '\\'}, // SET MINUS
+    {U'\u2217', '*'}, // ASTERISK OPERATOR
+    {U'\u2223', '|'}, // DIVIDES
+    {U'\u2227', '^'}, // LOGICAL AND
+    {U'\u2236', ':'}, // RATIO
+    {U'\u223c', '~'}, // TILDE OPERATOR
+    {U'\ua789', ':'}, // MODIFIER LETTER COLON
+    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
+    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
+    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
+    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
+    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
+    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
+    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
+    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
+    {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
+    {U'\uff0c', ','}, // FULLWIDTH COMMA
+    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
+    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
+    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
+    {U'\uff1a', ':'}, // FULLWIDTH COLON
+    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
+    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
+    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
+    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
+    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
+    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
+    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
+    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
+    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
+    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
+    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
+    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
+    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
+    {U'\uff5e', '~'}, // FULLWIDTH TILDE
+    {0, 0}
+  };
+  auto Homoglyph =
+      std::lower_bound(std::begin(SortedHomoglyphs),
+                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
+  if (Homoglyph->Character == C) {
+    llvm::SmallString<5> CharBuf;
+    {
+      llvm::raw_svector_ostream CharOS(CharBuf);
+      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
+    }
+    const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
+    Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
+        << Range << CharBuf << LooksLikeStr;
+  }
+}
+
 bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                     Token &Result) {
   const char *UCNPtr = CurPtr + Size;
@@ -1500,10 +1604,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
       !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
     return false;
 
-  if (!isLexingRawMode())
+  if (!isLexingRawMode()) {
     maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr),
                               /*IsFirst=*/false);
+    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
+                               makeCharRange(*this, CurPtr, UnicodePtr));
+  }
 
   CurPtr = UnicodePtr;
   return true;
@@ -1569,7 +1676,6 @@ FinishIdentifier:
       CurPtr = ConsumeChar(CurPtr, Size, Result);
       C = getCharAndSize(CurPtr, Size);
       continue;
-
     } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
       C = getCharAndSize(CurPtr, Size);
       continue;
@@ -1632,7 +1738,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
     if (!LangOpts.C99) {
       if (!isHexaLiteral(BufferPtr, LangOpts))
         IsHexFloat = false;
-      else if (!getLangOpts().CPlusPlus1z &&
+      else if (!getLangOpts().CPlusPlus17 &&
                std::find(BufferPtr, CurPtr, '_') != CurPtr)
         IsHexFloat = false;
     }
@@ -1778,7 +1884,7 @@ bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
     // getAndAdvanceChar.
     if (C == '\\')
       C = getAndAdvanceChar(CurPtr, Result);
-    
+
     if (C == '\n' || C == '\r' ||             // Newline.
         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
@@ -1786,7 +1892,7 @@ bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
       return true;
     }
-    
+
     if (C == 0) {
       if (isCodeCompletionPoint(CurPtr-1)) {
         PP->CodeCompleteNaturalLanguage();
@@ -2000,7 +2106,6 @@ bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
 /// Update BufferPtr to point to the next non-whitespace character and return.
 ///
 /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
-///
 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
   // Whitespace - Skip it, then return the token after the whitespace.
@@ -2131,7 +2236,8 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
     // If we read multiple characters, and one of those characters was a \r or
     // \n, then we had an escaped newline within the comment.  Emit diagnostic
     // unless the next line is also a // comment.
-    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
+    if (CurPtr != OldPtr + 1 && C != '/' &&
+        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
       for (; OldPtr != CurPtr; ++OldPtr)
         if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
           // Okay, we found a // comment that ends in a newline, if the next
@@ -2214,7 +2320,7 @@ bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
   std::string Spelling = PP->getSpelling(Result, &Invalid);
   if (Invalid)
     return true;
-  
+
   assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
   Spelling[1] = '*';   // Change prefix to "/*".
   Spelling += "*/";    // add suffix.
@@ -2540,7 +2646,7 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
       resetExtendedTokenMode();
     return true;  // Have a token.
   }
- 
+
   // If we are in raw mode, return this event as an EOF token.  Let the caller
   // that put us in raw mode handle the event.
   if (isLexingRawMode()) {
@@ -2549,7 +2655,7 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
     FormTokenWithChars(Result, BufferEnd, tok::eof);
     return true;
   }
-  
+
   if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
     PP->setRecordedPreambleConditionalStack(ConditionalStack);
     ConditionalStack.clear();
@@ -2661,7 +2767,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
   if (CurPtr != BufferStart &&
       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
     return false;
-  
+
   // Check to see if we have <<<<<<< or >>>>.
   if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
       !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
@@ -2671,7 +2777,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
   // it.
   if (CurrentConflictMarkerState || isLexingRawMode())
     return false;
-  
+
   ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
 
   // Check to see if there is an ending marker somewhere in the buffer at the
@@ -2681,7 +2787,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
     // Diagnose this, and ignore to the end of line.
     Diag(CurPtr, diag::err_conflict_marker);
     CurrentConflictMarkerState = Kind;
-    
+
     // Skip ahead to the end of line.  We know this exists because the
     // end-of-conflict marker starts with \r or \n.
     while (*CurPtr != '\r' && *CurPtr != '\n') {
@@ -2691,7 +2797,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
     BufferPtr = CurPtr;
     return true;
   }
-  
+
   // No end of conflict marker found.
   return false;
 }
@@ -2705,35 +2811,35 @@ bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
   if (CurPtr != BufferStart &&
       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
     return false;
-  
+
   // If we have a situation where we don't care about conflict markers, ignore
   // it.
   if (!CurrentConflictMarkerState || isLexingRawMode())
     return false;
-  
+
   // Check to see if we have the marker (4 characters in a row).
   for (unsigned i = 1; i != 4; ++i)
     if (CurPtr[i] != CurPtr[0])
       return false;
-  
+
   // If we do have it, search for the end of the conflict marker.  This could
   // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
   // be the end of conflict marker.
   if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
                                         CurrentConflictMarkerState)) {
     CurPtr = End;
-    
+
     // Skip ahead to the end of line.
     while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
       ++CurPtr;
-    
+
     BufferPtr = CurPtr;
-    
+
     // No longer in the conflict marker.
     CurrentConflictMarkerState = CMK_None;
     return true;
   }
-  
+
   return false;
 }
 
@@ -2872,7 +2978,6 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
     }
 
     return 0;
-
   } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
     // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
     // We don't use isLexingRawMode() here because we need to diagnose bad
@@ -3042,7 +3147,7 @@ LexNextToken:
     // We know the lexer hasn't changed, so just try again with this lexer.
     // (We manually eliminate the tail call to avoid recursion.)
     goto LexNextToken;
-      
+
   case 26:  // DOS & CP/M EOF: "^Z".
     // If we're in Microsoft extensions mode, treat this as end of file.
     if (LangOpts.MicrosoftExt) {
@@ -3054,9 +3159,12 @@ LexNextToken:
     // If Microsoft extensions are disabled, this is just random garbage.
     Kind = tok::unknown;
     break;
-      
-  case '\n':
+
   case '\r':
+    if (CurPtr[0] == '\n')
+      Char = getAndAdvanceChar(CurPtr, Result);
+    LLVM_FALLTHROUGH;
+  case '\n':
     // If we are inside a preprocessor directive and we see the end of line,
     // we know we are done with the directive, so return an EOD token.
     if (ParsingPreprocessorDirective) {
@@ -3114,7 +3222,7 @@ LexNextToken:
     // We only saw whitespace, so just try again with this lexer.
     // (We manually eliminate the tail call to avoid recursion.)
     goto LexNextToken;
-      
+
   // C99 6.4.4.1: Integer Constants.
   // C99 6.4.4.2: Floating Constants.
   case '0': case '1': case '2': case '3': case '4':
@@ -3157,7 +3265,7 @@ LexNextToken:
                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                            SizeTmp2, Result),
                                tok::utf8_string_literal);
-        if (Char2 == '\'' && LangOpts.CPlusPlus1z)
+        if (Char2 == '\'' && LangOpts.CPlusPlus17)
           return LexCharConstant(
               Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                   SizeTmp2, Result),
@@ -3501,6 +3609,24 @@ LexNextToken:
         Kind = tok::lessless;
       }
     } else if (Char == '=') {
+      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
+      if (After == '>') {
+        if (getLangOpts().CPlusPlus2a) {
+          if (!isLexingRawMode())
+            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
+          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                               SizeTmp2, Result);
+          Kind = tok::spaceship;
+          break;
+        }
+        // Suggest adding a space between the '<=' and the '>' to avoid a
+        // change in semantics if this turns up in C++ <=17 mode.
+        if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
+          Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
+            << FixItHint::CreateInsertion(
+                   getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
+        }
+      }
       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
       Kind = tok::lessequal;
     } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
@@ -3526,7 +3652,8 @@ LexNextToken:
     } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
       Kind = tok::l_brace;
-    } else if (Char == '#' && lexEditorPlaceholder(Result, CurPtr)) {
+    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
+               lexEditorPlaceholder(Result, CurPtr)) {
       return true;
     } else {
       Kind = tok::less;
@@ -3594,7 +3721,9 @@ LexNextToken:
     if (LangOpts.Digraphs && Char == '>') {
       Kind = tok::r_square; // ':>' -> ']'
       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
-    } else if (LangOpts.CPlusPlus && Char == ':') {
+    } else if ((LangOpts.CPlusPlus ||
+                LangOpts.DoubleSquareBracketAttributes) &&
+               Char == ':') {
       Kind = tok::coloncolon;
       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
     } else {
@@ -3610,7 +3739,7 @@ LexNextToken:
       // If this is '====' and we're in a conflict marker, ignore it.
       if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
         goto LexNextToken;
-      
+
       Kind = tok::equalequal;
       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
     } else {
@@ -3681,6 +3810,7 @@ LexNextToken:
     // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
     // an escaped newline.
     --CurPtr;
+    const char *UTF8StartPtr = CurPtr;
     llvm::ConversionResult Status =
         llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                   (const llvm::UTF8 *)BufferEnd,
@@ -3695,9 +3825,12 @@ LexNextToken:
         // (We manually eliminate the tail call to avoid recursion.)
         goto LexNextToken;
       }
+      if (!isLexingRawMode())
+        maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
+                                   makeCharRange(*this, UTF8StartPtr, CurPtr));
       return LexUnicode(Result, CodePoint, CurPtr);
     }
-    
+
     if (isLexingRawMode() || ParsingPreprocessorDirective ||
         PP->isPreprocessedOutput()) {
       ++CurPtr;