Diffstat (limited to 'lib/Lex/Lexer.cpp')
-rw-r--r--  lib/Lex/Lexer.cpp  465
1 file changed, 299 insertions(+), 166 deletions(-)
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 61bcef8cb760..830354ab23f0 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -1,4 +1,4 @@
-//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
+//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,17 +15,29 @@
#include "UnicodeCharSets.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/LiteralSupport.h"
+#include "clang/Lex/MultipleIncludeOpt.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Lex/Token.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/UnicodeCharRanges.h"
#include <algorithm>
#include <cassert>
@@ -63,7 +75,7 @@ tok::ObjCKeywordKind Token::getObjCKeywordID() const {
// Lexer Class Implementation
//===----------------------------------------------------------------------===//
-void Lexer::anchor() { }
+void Lexer::anchor() {}
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
const char *BufEnd) {
@@ -120,31 +132,21 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
- : PreprocessorLexer(&PP, FID),
- FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
- LangOpts(PP.getLangOpts()) {
-
+ : PreprocessorLexer(&PP, FID),
+ FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
+ LangOpts(PP.getLangOpts()) {
InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
InputFile->getBufferEnd());
resetExtendedTokenMode();
}
-void Lexer::resetExtendedTokenMode() {
- assert(PP && "Cannot reset token mode without a preprocessor");
- if (LangOpts.TraditionalCPP)
- SetKeepWhitespaceMode(true);
- else
- SetCommentRetentionState(PP->getCommentRetentionState());
-}
-
/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
const char *BufStart, const char *BufPtr, const char *BufEnd)
- : FileLoc(fileloc), LangOpts(langOpts) {
-
+ : FileLoc(fileloc), LangOpts(langOpts) {
InitLexer(BufStart, BufPtr, BufEnd);
// We *are* in raw mode.
@@ -159,6 +161,14 @@ Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
: Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
+void Lexer::resetExtendedTokenMode() {
+ assert(PP && "Cannot reset token mode without a preprocessor");
+ if (LangOpts.TraditionalCPP)
+ SetKeepWhitespaceMode(true);
+ else
+ SetCommentRetentionState(PP->getCommentRetentionState());
+}
+
/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion. This has a variety of magic semantics that this method
/// sets up. It returns a new'd Lexer that must be delete'd when done.
@@ -209,30 +219,39 @@ Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
return L;
}
-/// Stringify - Convert the specified string into a C string, with surrounding
-/// ""'s, and with escaped \ and " characters.
+template <typename T> static void StringifyImpl(T &Str, char Quote) {
+ typename T::size_type i = 0, e = Str.size();
+ while (i < e) {
+ if (Str[i] == '\\' || Str[i] == Quote) {
+ Str.insert(Str.begin() + i, '\\');
+ i += 2;
+ ++e;
+ } else if (Str[i] == '\n' || Str[i] == '\r') {
+ // Replace '\r\n' and '\n\r' with '\\' followed by 'n'.
+ if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
+ Str[i] != Str[i + 1]) {
+ Str[i] = '\\';
+ Str[i + 1] = 'n';
+ } else {
+ // Replace '\n' and '\r' with '\\' followed by 'n'.
+ Str[i] = '\\';
+ Str.insert(Str.begin() + i + 1, 'n');
+ ++e;
+ }
+ i += 2;
+ } else
+ ++i;
+ }
+}
+
std::string Lexer::Stringify(StringRef Str, bool Charify) {
std::string Result = Str;
char Quote = Charify ? '\'' : '"';
- for (unsigned i = 0, e = Result.size(); i != e; ++i) {
- if (Result[i] == '\\' || Result[i] == Quote) {
- Result.insert(Result.begin()+i, '\\');
- ++i; ++e;
- }
- }
+ StringifyImpl(Result, Quote);
return Result;
}
-/// Stringify - Convert the specified string into a C string by escaping '\'
-/// and " characters. This does not add surrounding ""'s to the string.
-void Lexer::Stringify(SmallVectorImpl<char> &Str) {
- for (unsigned i = 0, e = Str.size(); i != e; ++i) {
- if (Str[i] == '\\' || Str[i] == '"') {
- Str.insert(Str.begin()+i, '\\');
- ++i; ++e;
- }
- }
-}
+void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
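Aside, for illustration only (not part of the patch): the escaping behaviour of the new StringifyImpl, restated as a standalone function so it can be exercised outside clang. The name stringifySketch is made up; only the loop mirrors the code above.

#include <cassert>
#include <string>

// Mirrors the StringifyImpl loop: escape backslash/quote, and turn any
// newline (or a "\r\n" / "\n\r" pair) into the two characters '\' and 'n'.
static std::string stringifySketch(std::string Str, char Quote = '"') {
  for (std::string::size_type i = 0, e = Str.size(); i < e;) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      Str.insert(Str.begin() + i, '\\');
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      if (i + 1 < e && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';      // a "\r\n" or "\n\r" pair becomes "\n"
        Str[i + 1] = 'n';
      } else {
        Str[i] = '\\';      // a lone newline becomes "\n"
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else {
      ++i;
    }
  }
  return Str;
}

int main() {
  assert(stringifySketch("a\"b\\c\n") == "a\\\"b\\\\c\\n");
  assert(stringifySketch("x\r\ny") == "x\\ny");
}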
//===----------------------------------------------------------------------===//
// Token Spelling
@@ -307,7 +326,7 @@ StringRef Lexer::getSpelling(SourceLocation loc,
StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
if (invalidTemp) {
if (invalid) *invalid = true;
- return StringRef();
+ return {};
}
const char *tokenBegin = file.data() + locInfo.second;
@@ -345,7 +364,7 @@ std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
if (Invalid)
*Invalid = CharDataInvalid;
if (CharDataInvalid)
- return std::string();
+ return {};
// If this token contains nothing interesting, return it directly.
if (!Tok.needsCleaning())
@@ -367,7 +386,7 @@ std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
/// to point to a constant buffer with the data already in it (avoiding a
/// copy). The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
-unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
+unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
const SourceManager &SourceMgr,
const LangOptions &LangOpts, bool *Invalid) {
assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
@@ -463,19 +482,15 @@ static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
const char *BufStart = Buffer.data();
if (Offset >= Buffer.size())
return nullptr;
- const char *StrData = BufStart + Offset;
-
- if (StrData[0] == '\n' || StrData[0] == '\r')
- return StrData;
- const char *LexStart = StrData;
- while (LexStart != BufStart) {
- if (LexStart[0] == '\n' || LexStart[0] == '\r') {
+ const char *LexStart = BufStart + Offset;
+ for (; LexStart != BufStart; --LexStart) {
+ if (isVerticalWhitespace(LexStart[0]) &&
+ !Lexer::isNewLineEscaped(BufStart, LexStart)) {
+ // LexStart should point at first character of logical line.
++LexStart;
break;
}
-
- --LexStart;
}
return LexStart;
}
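Aside, a made-up illustration of the new behaviour: findBeginningOfLine now walks back to the start of the logical line, so a backslash-escaped newline (detected by the Lexer::isNewLineEscaped helper added later in this patch, assumed here to be declared in Lexer.h) no longer counts as a line boundary.

#include "clang/Lex/Lexer.h"
#include <cassert>

static void isNewLineEscapedDemo() {
  const char Buf[] = "#define WIDTH \\\n  80\n";
  // The '\n' at offset 15 follows a backslash: an escaped, line-continuing
  // newline, so offsets inside "  80" still belong to the "#define" line.
  assert(clang::Lexer::isNewLineEscaped(Buf, Buf + 15));
  // The final '\n' genuinely ends the line.
  assert(!clang::Lexer::isNewLineEscaped(Buf, Buf + sizeof(Buf) - 2));
}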
@@ -487,7 +502,7 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
if (LocInfo.first.isInvalid())
return Loc;
-
+
bool Invalid = false;
StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
if (Invalid)
@@ -499,31 +514,31 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
if (!LexStart || LexStart == StrData)
return Loc;
-
+
// Create a lexer starting at the beginning of this token.
SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
Buffer.end());
TheLexer.SetCommentRetentionState(true);
-
+
// Lex tokens until we find the token that contains the source location.
Token TheTok;
do {
TheLexer.LexFromRawLexer(TheTok);
-
+
if (TheLexer.getBufferLocation() > StrData) {
// Lexing this token has taken the lexer past the source location we're
// looking for. If the current token encompasses our source location,
// return the beginning of that token.
if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
return TheTok.getLocation();
-
+
// We ended up skipping over the source location entirely, which means
// that it points into whitespace. We're done here.
break;
}
} while (TheTok.getKind() != tok::eof);
-
+
// We've passed our source location; just return the original source location.
return Loc;
}
@@ -531,34 +546,34 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
const SourceManager &SM,
const LangOptions &LangOpts) {
- if (Loc.isFileID())
- return getBeginningOfFileToken(Loc, SM, LangOpts);
-
- if (!SM.isMacroArgExpansion(Loc))
- return Loc;
-
- SourceLocation FileLoc = SM.getSpellingLoc(Loc);
- SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
- std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
- std::pair<FileID, unsigned> BeginFileLocInfo
- = SM.getDecomposedLoc(BeginFileLoc);
- assert(FileLocInfo.first == BeginFileLocInfo.first &&
- FileLocInfo.second >= BeginFileLocInfo.second);
- return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
+ if (Loc.isFileID())
+ return getBeginningOfFileToken(Loc, SM, LangOpts);
+
+ if (!SM.isMacroArgExpansion(Loc))
+ return Loc;
+
+ SourceLocation FileLoc = SM.getSpellingLoc(Loc);
+ SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
+ std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
+ std::pair<FileID, unsigned> BeginFileLocInfo =
+ SM.getDecomposedLoc(BeginFileLoc);
+ assert(FileLocInfo.first == BeginFileLocInfo.first &&
+ FileLocInfo.second >= BeginFileLocInfo.second);
+ return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}
namespace {
- enum PreambleDirectiveKind {
- PDK_Skipped,
- PDK_Unknown
- };
+enum PreambleDirectiveKind {
+ PDK_Skipped,
+ PDK_Unknown
+};
-} // end anonymous namespace
+} // namespace
-std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
- const LangOptions &LangOpts,
- unsigned MaxLines) {
+PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
+ const LangOptions &LangOpts,
+ unsigned MaxLines) {
// Create a lexer starting at the beginning of the file. Note that we use a
// "fake" file source location at offset 1 so that the lexer will track our
// position within the file.
@@ -568,9 +583,6 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
Buffer.end());
TheLexer.SetCommentRetentionState(true);
- // StartLoc will differ from FileLoc if there is a BOM that was skipped.
- SourceLocation StartLoc = TheLexer.getSourceLocation();
-
bool InPreprocessorDirective = false;
Token TheTok;
SourceLocation ActiveCommentLoc;
@@ -599,17 +611,17 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
if (TheTok.getKind() == tok::eof) {
break;
}
-
+
// If we haven't hit the end of the preprocessor directive, skip this
// token.
if (!TheTok.isAtStartOfLine())
continue;
-
+
// We've passed the end of the preprocessor directive, and will look
// at this token again below.
InPreprocessorDirective = false;
}
-
+
// Keep track of the # of lines in the preamble.
if (TheTok.isAtStartOfLine()) {
unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
@@ -626,13 +638,13 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
ActiveCommentLoc = TheTok.getLocation();
continue;
}
-
+
if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
- // This is the start of a preprocessor directive.
+ // This is the start of a preprocessor directive.
Token HashTok = TheTok;
InPreprocessorDirective = true;
ActiveCommentLoc = SourceLocation();
-
+
// Figure out which directive this is. Since we're lexing raw tokens,
// we don't have an identifier table available. Instead, just look at
// the raw identifier to recognize and categorize preprocessor directives.
@@ -672,7 +684,7 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
break;
}
}
-
+
// We only end up here if we didn't recognize the preprocessor
// directive or it was one that can't occur in the preamble at this
// point. Roll back the current token to the location of the '#'.
@@ -685,14 +697,14 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
// the preamble.
break;
} while (true);
-
+
SourceLocation End;
if (ActiveCommentLoc.isValid())
End = ActiveCommentLoc; // don't truncate a decl comment.
else
End = TheTok.getLocation();
- return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
+ return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
TheTok.isAtStartOfLine());
}
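Aside: ComputePreamble now returns the named PreambleBounds struct instead of a std::pair<unsigned, bool>. A hedged caller-side sketch; the member name Size (and the PreambleEndsAtStartOfLine flag) come from the Lexer.h side of this change and are not visible in this diff.

#include "clang/Basic/LangOptions.h"
#include "clang/Lex/Lexer.h"
#include "llvm/ADT/StringRef.h"

// Take the preamble prefix of a buffer using the named struct rather than
// the old pair's .first.
static llvm::StringRef takePreamble(llvm::StringRef Buffer,
                                    const clang::LangOptions &LangOpts) {
  clang::PreambleBounds Bounds =
      clang::Lexer::ComputePreamble(Buffer, LangOpts, /*MaxLines=*/0);
  return Buffer.take_front(Bounds.Size);
}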
@@ -707,13 +719,13 @@ SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
// trigraphs.
bool Invalid = false;
const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
-
+
// If they request the first char of the token, we're trivially done.
if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
return TokStart;
-
+
unsigned PhysOffset = 0;
-
+
// The usual case is that tokens don't contain anything interesting. Skip
// over the uninteresting characters. If a token only consists of simple
// chars, this method is extremely fast.
@@ -724,7 +736,7 @@ SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
--CharNo;
++PhysOffset;
}
-
+
// If we have a character that may be a trigraph or escaped newline, use a
// lexer to parse it correctly.
for (; CharNo; --CharNo) {
@@ -733,14 +745,14 @@ SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
TokPtr += Size;
PhysOffset += Size;
}
-
+
// Final detail: if we end up on an escaped newline, we want to return the
// location of the actual byte of the token. For example foo\<newline>bar
// advanced by 3 should return the location of b, not of \\. One compounding
// detail of this is that the escape may be made by a trigraph.
if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
-
+
return TokStart.getLocWithOffset(PhysOffset);
}
@@ -763,11 +775,11 @@ SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
const SourceManager &SM,
const LangOptions &LangOpts) {
if (Loc.isInvalid())
- return SourceLocation();
+ return {};
if (Loc.isMacroID()) {
if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
- return SourceLocation(); // Points inside the macro expansion.
+ return {}; // Points inside the macro expansion.
}
unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
@@ -775,7 +787,7 @@ SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
Len = Len - Offset;
else
return Loc;
-
+
return Loc.getLocWithOffset(Len);
}
@@ -838,7 +850,7 @@ static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
if (Range.isTokenRange()) {
End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
if (End.isInvalid())
- return CharSourceRange();
+ return {};
}
// Break down the source locations.
@@ -846,12 +858,12 @@ static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
unsigned BeginOffs;
std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
if (FID.isInvalid())
- return CharSourceRange();
+ return {};
unsigned EndOffs;
if (!SM.isInFileID(End, FID, &EndOffs) ||
BeginOffs > EndOffs)
- return CharSourceRange();
+ return {};
return CharSourceRange::getCharRange(Begin, End);
}
@@ -862,14 +874,14 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
SourceLocation Begin = Range.getBegin();
SourceLocation End = Range.getEnd();
if (Begin.isInvalid() || End.isInvalid())
- return CharSourceRange();
+ return {};
if (Begin.isFileID() && End.isFileID())
return makeRangeFromFileLocs(Range, SM, LangOpts);
if (Begin.isMacroID() && End.isFileID()) {
if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
- return CharSourceRange();
+ return {};
Range.setBegin(Begin);
return makeRangeFromFileLocs(Range, SM, LangOpts);
}
@@ -879,7 +891,7 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
&End)) ||
(Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
&End)))
- return CharSourceRange();
+ return {};
Range.setEnd(End);
return makeRangeFromFileLocs(Range, SM, LangOpts);
}
@@ -900,13 +912,13 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
&Invalid);
if (Invalid)
- return CharSourceRange();
+ return {};
if (BeginEntry.getExpansion().isMacroArgExpansion()) {
const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
&Invalid);
if (Invalid)
- return CharSourceRange();
+ return {};
if (EndEntry.getExpansion().isMacroArgExpansion() &&
BeginEntry.getExpansion().getExpansionLocStart() ==
@@ -917,7 +929,7 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
}
}
- return CharSourceRange();
+ return {};
}
StringRef Lexer::getSourceText(CharSourceRange Range,
@@ -927,21 +939,21 @@ StringRef Lexer::getSourceText(CharSourceRange Range,
Range = makeFileCharRange(Range, SM, LangOpts);
if (Range.isInvalid()) {
if (Invalid) *Invalid = true;
- return StringRef();
+ return {};
}
// Break down the source location.
std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
if (beginInfo.first.isInvalid()) {
if (Invalid) *Invalid = true;
- return StringRef();
+ return {};
}
unsigned EndOffs;
if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
beginInfo.second > EndOffs) {
if (Invalid) *Invalid = true;
- return StringRef();
+ return {};
}
// Try to the load the file buffer.
@@ -949,7 +961,7 @@ StringRef Lexer::getSourceText(CharSourceRange Range,
StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
if (invalidTemp) {
if (Invalid) *Invalid = true;
- return StringRef();
+ return {};
}
if (Invalid) *Invalid = false;
@@ -972,7 +984,7 @@ StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
// For macro arguments we need to check that the argument did not come
// from an inner macro, e.g: "MAC1( MAC2(foo) )"
-
+
// Loc points to the argument id of the macro definition, move to the
// macro expansion.
Loc = SM.getImmediateExpansionRange(Loc).first;
@@ -1013,7 +1025,7 @@ StringRef Lexer::getImmediateMacroNameForDiagnostics(
// If the macro's spelling has no FileID, then it's actually a token paste
// or stringization (or similar) and not a macro at all.
if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
- return StringRef();
+ return {};
// Find the spelling location of the start of the non-argument expansion
// range. This is where the macro name was spelled in order to begin
@@ -1032,20 +1044,40 @@ bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
return isIdentifierBody(c, LangOpts.DollarIdents);
}
+bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
+ assert(isVerticalWhitespace(Str[0]));
+ if (Str - 1 < BufferStart)
+ return false;
+
+ if ((Str[0] == '\n' && Str[-1] == '\r') ||
+ (Str[0] == '\r' && Str[-1] == '\n')) {
+ if (Str - 2 < BufferStart)
+ return false;
+ --Str;
+ }
+ --Str;
+
+ // Rewind to first non-space character:
+ while (Str > BufferStart && isHorizontalWhitespace(*Str))
+ --Str;
+
+ return *Str == '\\';
+}
+
StringRef Lexer::getIndentationForLine(SourceLocation Loc,
const SourceManager &SM) {
if (Loc.isInvalid() || Loc.isMacroID())
- return "";
+ return {};
std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
if (LocInfo.first.isInvalid())
- return "";
+ return {};
bool Invalid = false;
StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
if (Invalid)
- return "";
+ return {};
const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
if (!Line)
- return "";
+ return {};
StringRef Rest = Buffer.substr(Line - Buffer.data());
size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
return NumWhitespaceChars == StringRef::npos
@@ -1199,18 +1231,12 @@ const char *Lexer::SkipEscapedNewLines(const char *P) {
}
}
-/// \brief Checks that the given token is the first token that occurs after the
-/// given location (this excludes comments and whitespace). Returns the location
-/// immediately after the specified token. If the token is not found or the
-/// location is inside a macro, the returned source location will be invalid.
-SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
- tok::TokenKind TKind,
- const SourceManager &SM,
- const LangOptions &LangOpts,
- bool SkipTrailingWhitespaceAndNewLine) {
+Optional<Token> Lexer::findNextToken(SourceLocation Loc,
+ const SourceManager &SM,
+ const LangOptions &LangOpts) {
if (Loc.isMacroID()) {
if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
- return SourceLocation();
+ return None;
}
Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
@@ -1221,7 +1247,7 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
bool InvalidTemp = false;
StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
if (InvalidTemp)
- return SourceLocation();
+ return None;
const char *TokenBegin = File.data() + LocInfo.second;
@@ -1231,15 +1257,25 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
// Find the token.
Token Tok;
lexer.LexFromRawLexer(Tok);
- if (Tok.isNot(TKind))
- return SourceLocation();
- SourceLocation TokenLoc = Tok.getLocation();
+ return Tok;
+}
+
+/// \brief Checks that the given token is the first token that occurs after the
+/// given location (this excludes comments and whitespace). Returns the location
+/// immediately after the specified token. If the token is not found or the
+/// location is inside a macro, the returned source location will be invalid.
+SourceLocation Lexer::findLocationAfterToken(
+ SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
+ const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
+ Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
+ if (!Tok || Tok->isNot(TKind))
+ return {};
+ SourceLocation TokenLoc = Tok->getLocation();
// Calculate how much whitespace needs to be skipped if any.
unsigned NumWhitespaceChars = 0;
if (SkipTrailingWhitespaceAndNewLine) {
- const char *TokenEnd = SM.getCharacterData(TokenLoc) +
- Tok.getLength();
+ const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
unsigned char C = *TokenEnd;
while (isHorizontalWhitespace(C)) {
C = *(++TokenEnd);
@@ -1256,7 +1292,7 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
}
}
- return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
+ return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
}
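Aside: the raw-lexing core of findLocationAfterToken is now exposed as Lexer::findNextToken. A hedged usage sketch; findSemiAfter and its parameters are hypothetical caller code, not part of the patch.

#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/Optional.h"

// Return the location just past a ';' that directly follows Loc, or an
// invalid location if the next raw token is not a semicolon.
static clang::SourceLocation
findSemiAfter(clang::SourceLocation Loc, const clang::SourceManager &SM,
              const clang::LangOptions &LangOpts) {
  using namespace clang;
  if (llvm::Optional<Token> Next = Lexer::findNextToken(Loc, SM, LangOpts))
    if (Next->is(tok::semi))
      return Next->getLocation().getLocWithOffset(Next->getLength());
  return SourceLocation();
}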
/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
@@ -1274,7 +1310,6 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
-///
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
Token *Tok) {
// If we have a slash, look for an escaped newline.
@@ -1378,9 +1413,9 @@ Slash:
// Helper methods for lexing.
//===----------------------------------------------------------------------===//
-/// \brief Routine that indiscriminately skips bytes in the source file.
-void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
- BufferPtr += Bytes;
+/// \brief Routine that indiscriminately sets the offset into the source file.
+void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
+ BufferPtr = BufferStart + Offset;
if (BufferPtr > BufferEnd)
BufferPtr = BufferEnd;
// FIXME: What exactly does the StartOfLine bit mean? There are two
@@ -1466,6 +1501,75 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
}
}
+/// After encountering UTF-8 character C and interpreting it as an identifier
+/// character, check whether it's a homoglyph for a common non-identifier
+/// source character that is unlikely to be an intentional identifier
+/// character and warn if so.
+static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
+ CharSourceRange Range) {
+ // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
+ struct HomoglyphPair {
+ uint32_t Character;
+ char LooksLike;
+ bool operator<(HomoglyphPair R) const { return Character < R.Character; }
+ };
+ static constexpr HomoglyphPair SortedHomoglyphs[] = {
+ {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
+ {U'\u037e', ';'}, // GREEK QUESTION MARK
+ {U'\u2212', '-'}, // MINUS SIGN
+ {U'\u2215', '/'}, // DIVISION SLASH
+ {U'\u2216', '\\'}, // SET MINUS
+ {U'\u2217', '*'}, // ASTERISK OPERATOR
+ {U'\u2223', '|'}, // DIVIDES
+ {U'\u2227', '^'}, // LOGICAL AND
+ {U'\u2236', ':'}, // RATIO
+ {U'\u223c', '~'}, // TILDE OPERATOR
+ {U'\ua789', ':'}, // MODIFIER LETTER COLON
+ {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
+ {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
+ {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
+ {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
+ {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
+ {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
+ {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
+ {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
+ {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
+ {U'\uff0c', ','}, // FULLWIDTH COMMA
+ {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
+ {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
+ {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
+ {U'\uff1a', ':'}, // FULLWIDTH COLON
+ {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
+ {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
+ {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
+ {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
+ {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
+ {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
+ {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
+ {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
+ {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
+ {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
+ {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
+ {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
+ {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
+ {U'\uff5e', '~'}, // FULLWIDTH TILDE
+ {0, 0}
+ };
+ auto Homoglyph =
+ std::lower_bound(std::begin(SortedHomoglyphs),
+ std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
+ if (Homoglyph->Character == C) {
+ llvm::SmallString<5> CharBuf;
+ {
+ llvm::raw_svector_ostream CharOS(CharBuf);
+ llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
+ }
+ const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
+ Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
+ << Range << CharBuf << LooksLikeStr;
+ }
+}
+
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
Token &Result) {
const char *UCNPtr = CurPtr + Size;
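Aside, for illustration: the homoglyph check above is a std::lower_bound over a table sorted by code point. A standalone restatement with a three-entry subset of SortedHomoglyphs (not the clang entry point):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>

namespace {
struct HomoglyphPair {
  uint32_t Character;
  char LooksLike;
  bool operator<(HomoglyphPair R) const { return Character < R.Character; }
};
// Subset of the table above, still sorted by code point.
constexpr HomoglyphPair Subset[] = {
    {0x037e, ';'}, // GREEK QUESTION MARK
    {0x2212, '-'}, // MINUS SIGN
    {0xff1b, ';'}, // FULLWIDTH SEMICOLON
};
// Return the ASCII character a code point resembles, or 0 if none.
char looksLike(uint32_t C) {
  auto It = std::lower_bound(std::begin(Subset), std::end(Subset),
                             HomoglyphPair{C, '\0'});
  return (It != std::end(Subset) && It->Character == C) ? It->LooksLike : 0;
}
} // namespace

int main() {
  assert(looksLike(0x037e) == ';');
  assert(looksLike(0x0041) == 0); // plain 'A' has no homoglyph entry
}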
@@ -1500,10 +1604,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
return false;
- if (!isLexingRawMode())
+ if (!isLexingRawMode()) {
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr),
/*IsFirst=*/false);
+ maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
+ makeCharRange(*this, CurPtr, UnicodePtr));
+ }
CurPtr = UnicodePtr;
return true;
@@ -1569,7 +1676,6 @@ FinishIdentifier:
CurPtr = ConsumeChar(CurPtr, Size, Result);
C = getCharAndSize(CurPtr, Size);
continue;
-
} else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
C = getCharAndSize(CurPtr, Size);
continue;
@@ -1632,7 +1738,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
if (!LangOpts.C99) {
if (!isHexaLiteral(BufferPtr, LangOpts))
IsHexFloat = false;
- else if (!getLangOpts().CPlusPlus1z &&
+ else if (!getLangOpts().CPlusPlus17 &&
std::find(BufferPtr, CurPtr, '_') != CurPtr)
IsHexFloat = false;
}
@@ -1778,7 +1884,7 @@ bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
// getAndAdvanceChar.
if (C == '\\')
C = getAndAdvanceChar(CurPtr, Result);
-
+
if (C == '\n' || C == '\r' || // Newline.
(C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
@@ -1786,7 +1892,7 @@ bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
FormTokenWithChars(Result, CurPtr-1, tok::unknown);
return true;
}
-
+
if (C == 0) {
if (isCodeCompletionPoint(CurPtr-1)) {
PP->CodeCompleteNaturalLanguage();
@@ -2000,7 +2106,6 @@ bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
-///
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
bool &TokAtPhysicalStartOfLine) {
// Whitespace - Skip it, then return the token after the whitespace.
@@ -2131,7 +2236,8 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
// If we read multiple characters, and one of those characters was a \r or
// \n, then we had an escaped newline within the comment. Emit diagnostic
// unless the next line is also a // comment.
- if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
+ if (CurPtr != OldPtr + 1 && C != '/' &&
+ (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
for (; OldPtr != CurPtr; ++OldPtr)
if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
// Okay, we found a // comment that ends in a newline, if the next
@@ -2214,7 +2320,7 @@ bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
std::string Spelling = PP->getSpelling(Result, &Invalid);
if (Invalid)
return true;
-
+
assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
Spelling[1] = '*'; // Change prefix to "/*".
Spelling += "*/"; // add suffix.
@@ -2540,7 +2646,7 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
resetExtendedTokenMode();
return true; // Have a token.
}
-
+
// If we are in raw mode, return this event as an EOF token. Let the caller
// that put us in raw mode handle the event.
if (isLexingRawMode()) {
@@ -2549,7 +2655,7 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
FormTokenWithChars(Result, BufferEnd, tok::eof);
return true;
}
-
+
if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
PP->setRecordedPreambleConditionalStack(ConditionalStack);
ConditionalStack.clear();
@@ -2661,7 +2767,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
if (CurPtr != BufferStart &&
CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
return false;
-
+
// Check to see if we have <<<<<<< or >>>>.
if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
!StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
@@ -2671,7 +2777,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
// it.
if (CurrentConflictMarkerState || isLexingRawMode())
return false;
-
+
ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
// Check to see if there is an ending marker somewhere in the buffer at the
@@ -2681,7 +2787,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
// Diagnose this, and ignore to the end of line.
Diag(CurPtr, diag::err_conflict_marker);
CurrentConflictMarkerState = Kind;
-
+
// Skip ahead to the end of line. We know this exists because the
// end-of-conflict marker starts with \r or \n.
while (*CurPtr != '\r' && *CurPtr != '\n') {
@@ -2691,7 +2797,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
BufferPtr = CurPtr;
return true;
}
-
+
// No end of conflict marker found.
return false;
}
@@ -2705,35 +2811,35 @@ bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
if (CurPtr != BufferStart &&
CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
return false;
-
+
// If we have a situation where we don't care about conflict markers, ignore
// it.
if (!CurrentConflictMarkerState || isLexingRawMode())
return false;
-
+
// Check to see if we have the marker (4 characters in a row).
for (unsigned i = 1; i != 4; ++i)
if (CurPtr[i] != CurPtr[0])
return false;
-
+
// If we do have it, search for the end of the conflict marker. This could
// fail if it got skipped with a '#if 0' or something. Note that CurPtr might
// be the end of conflict marker.
if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
CurrentConflictMarkerState)) {
CurPtr = End;
-
+
// Skip ahead to the end of line.
while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
++CurPtr;
-
+
BufferPtr = CurPtr;
-
+
// No longer in the conflict marker.
CurrentConflictMarkerState = CMK_None;
return true;
}
-
+
return false;
}
@@ -2872,7 +2978,6 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
}
return 0;
-
} else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
// C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
// We don't use isLexingRawMode() here because we need to diagnose bad
@@ -3042,7 +3147,7 @@ LexNextToken:
// We know the lexer hasn't changed, so just try again with this lexer.
// (We manually eliminate the tail call to avoid recursion.)
goto LexNextToken;
-
+
case 26: // DOS & CP/M EOF: "^Z".
// If we're in Microsoft extensions mode, treat this as end of file.
if (LangOpts.MicrosoftExt) {
@@ -3054,9 +3159,12 @@ LexNextToken:
// If Microsoft extensions are disabled, this is just random garbage.
Kind = tok::unknown;
break;
-
- case '\n':
+
case '\r':
+ if (CurPtr[0] == '\n')
+ Char = getAndAdvanceChar(CurPtr, Result);
+ LLVM_FALLTHROUGH;
+ case '\n':
// If we are inside a preprocessor directive and we see the end of line,
// we know we are done with the directive, so return an EOD token.
if (ParsingPreprocessorDirective) {
@@ -3114,7 +3222,7 @@ LexNextToken:
// We only saw whitespace, so just try again with this lexer.
// (We manually eliminate the tail call to avoid recursion.)
goto LexNextToken;
-
+
// C99 6.4.4.1: Integer Constants.
// C99 6.4.4.2: Floating Constants.
case '0': case '1': case '2': case '3': case '4':
@@ -3157,7 +3265,7 @@ LexNextToken:
ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
SizeTmp2, Result),
tok::utf8_string_literal);
- if (Char2 == '\'' && LangOpts.CPlusPlus1z)
+ if (Char2 == '\'' && LangOpts.CPlusPlus17)
return LexCharConstant(
Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
SizeTmp2, Result),
@@ -3501,6 +3609,24 @@ LexNextToken:
Kind = tok::lessless;
}
} else if (Char == '=') {
+ char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
+ if (After == '>') {
+ if (getLangOpts().CPlusPlus2a) {
+ if (!isLexingRawMode())
+ Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
+ CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+ SizeTmp2, Result);
+ Kind = tok::spaceship;
+ break;
+ }
+ // Suggest adding a space between the '<=' and the '>' to avoid a
+ // change in semantics if this turns up in C++ <=17 mode.
+ if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
+ Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
+ << FixItHint::CreateInsertion(
+ getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
+ }
+ }
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
Kind = tok::lessequal;
} else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
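Aside: with this hunk, '<=' immediately followed by '>' lexes as tok::spaceship under -std=c++2a (with a C++17-compat warning), while earlier C++ modes get warn_cxx2a_compat_spaceship plus a fix-it inserting a space. A made-up C++17 example of the pattern being warned about:

struct A {};
bool operator<=(A, A);
template <bool (*)(A, A)> struct X {};
// "operator<=" is immediately followed by '>': two tokens in C++17, but a
// C++2a lexer would fold them into one '<=>' token, so the fix-it
// suggests spelling it "X<&operator<= > x;".
X<&operator<=> x;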
@@ -3526,7 +3652,8 @@ LexNextToken:
} else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
Kind = tok::l_brace;
- } else if (Char == '#' && lexEditorPlaceholder(Result, CurPtr)) {
+ } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
+ lexEditorPlaceholder(Result, CurPtr)) {
return true;
} else {
Kind = tok::less;
@@ -3594,7 +3721,9 @@ LexNextToken:
if (LangOpts.Digraphs && Char == '>') {
Kind = tok::r_square; // ':>' -> ']'
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
- } else if (LangOpts.CPlusPlus && Char == ':') {
+ } else if ((LangOpts.CPlusPlus ||
+ LangOpts.DoubleSquareBracketAttributes) &&
+ Char == ':') {
Kind = tok::coloncolon;
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
} else {
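Aside: '::' now also lexes as tok::coloncolon outside C++ when LangOpts.DoubleSquareBracketAttributes is set, which (as far as this diff shows) exists to support scoped names inside [[...]] attributes. A made-up example, written as C++ but equally relevant to C with double-square-bracket attributes enabled:

int f(int n) {
  switch (n) {
  case 0:
    ++n;
    [[clang::fallthrough]]; // 'clang' '::' 'fallthrough', where the '::'
                            // is the coloncolon token this change is about
  default:
    return n;
  }
}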
@@ -3610,7 +3739,7 @@ LexNextToken:
// If this is '====' and we're in a conflict marker, ignore it.
if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
goto LexNextToken;
-
+
Kind = tok::equalequal;
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
} else {
@@ -3681,6 +3810,7 @@ LexNextToken:
// We can't just reset CurPtr to BufferPtr because BufferPtr may point to
// an escaped newline.
--CurPtr;
+ const char *UTF8StartPtr = CurPtr;
llvm::ConversionResult Status =
llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
(const llvm::UTF8 *)BufferEnd,
@@ -3695,9 +3825,12 @@ LexNextToken:
// (We manually eliminate the tail call to avoid recursion.)
goto LexNextToken;
}
+ if (!isLexingRawMode())
+ maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
+ makeCharRange(*this, UTF8StartPtr, CurPtr));
return LexUnicode(Result, CodePoint, CurPtr);
}
-
+
if (isLexingRawMode() || ParsingPreprocessorDirective ||
PP->isPreprocessedOutput()) {
++CurPtr;