diff options
Diffstat (limited to 'lib/Lex/Lexer.cpp')
-rw-r--r-- | lib/Lex/Lexer.cpp | 465 |
1 files changed, 299 insertions, 166 deletions
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index 61bcef8cb760..830354ab23f0 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -1,4 +1,4 @@ -//===--- Lexer.cpp - C Language Family Lexer ------------------------------===// +//===- Lexer.cpp - C Language Family Lexer --------------------------------===// // // The LLVM Compiler Infrastructure // @@ -15,17 +15,29 @@ #include "UnicodeCharSets.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.h" #include "clang/Lex/LexDiagnostic.h" #include "clang/Lex/LiteralSupport.h" +#include "clang/Lex/MultipleIncludeOpt.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Lex/Token.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/LLVM.h" +#include "clang/Basic/TokenKinds.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/NativeFormatting.h" #include "llvm/Support/UnicodeCharRanges.h" #include <algorithm> #include <cassert> @@ -63,7 +75,7 @@ tok::ObjCKeywordKind Token::getObjCKeywordID() const { // Lexer Class Implementation //===----------------------------------------------------------------------===// -void Lexer::anchor() { } +void Lexer::anchor() {} void Lexer::InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd) { @@ -120,31 +132,21 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr, /// assumes that the associated file buffer and Preprocessor objects will /// outlive it, so it doesn't take ownership of either of them. Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) - : PreprocessorLexer(&PP, FID), - FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), - LangOpts(PP.getLangOpts()) { - + : PreprocessorLexer(&PP, FID), + FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), + LangOpts(PP.getLangOpts()) { InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), InputFile->getBufferEnd()); resetExtendedTokenMode(); } -void Lexer::resetExtendedTokenMode() { - assert(PP && "Cannot reset token mode without a preprocessor"); - if (LangOpts.TraditionalCPP) - SetKeepWhitespaceMode(true); - else - SetCommentRetentionState(PP->getCommentRetentionState()); -} - /// Lexer constructor - Create a new raw lexer object. This object is only /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text /// range will outlive it, so it doesn't take ownership of it. Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, const char *BufStart, const char *BufPtr, const char *BufEnd) - : FileLoc(fileloc), LangOpts(langOpts) { - + : FileLoc(fileloc), LangOpts(langOpts) { InitLexer(BufStart, BufPtr, BufEnd); // We *are* in raw mode. @@ -159,6 +161,14 @@ Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(), FromFile->getBufferStart(), FromFile->getBufferEnd()) {} +void Lexer::resetExtendedTokenMode() { + assert(PP && "Cannot reset token mode without a preprocessor"); + if (LangOpts.TraditionalCPP) + SetKeepWhitespaceMode(true); + else + SetCommentRetentionState(PP->getCommentRetentionState()); +} + /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for /// _Pragma expansion. This has a variety of magic semantics that this method /// sets up. It returns a new'd Lexer that must be delete'd when done. @@ -209,30 +219,39 @@ Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, return L; } -/// Stringify - Convert the specified string into a C string, with surrounding -/// ""'s, and with escaped \ and " characters. +template <typename T> static void StringifyImpl(T &Str, char Quote) { + typename T::size_type i = 0, e = Str.size(); + while (i < e) { + if (Str[i] == '\\' || Str[i] == Quote) { + Str.insert(Str.begin() + i, '\\'); + i += 2; + ++e; + } else if (Str[i] == '\n' || Str[i] == '\r') { + // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. + if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && + Str[i] != Str[i + 1]) { + Str[i] = '\\'; + Str[i + 1] = 'n'; + } else { + // Replace '\n' and '\r' to '\\' followed by 'n'. + Str[i] = '\\'; + Str.insert(Str.begin() + i + 1, 'n'); + ++e; + } + i += 2; + } else + ++i; + } +} + std::string Lexer::Stringify(StringRef Str, bool Charify) { std::string Result = Str; char Quote = Charify ? '\'' : '"'; - for (unsigned i = 0, e = Result.size(); i != e; ++i) { - if (Result[i] == '\\' || Result[i] == Quote) { - Result.insert(Result.begin()+i, '\\'); - ++i; ++e; - } - } + StringifyImpl(Result, Quote); return Result; } -/// Stringify - Convert the specified string into a C string by escaping '\' -/// and " characters. This does not add surrounding ""'s to the string. -void Lexer::Stringify(SmallVectorImpl<char> &Str) { - for (unsigned i = 0, e = Str.size(); i != e; ++i) { - if (Str[i] == '\\' || Str[i] == '"') { - Str.insert(Str.begin()+i, '\\'); - ++i; ++e; - } - } -} +void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } //===----------------------------------------------------------------------===// // Token Spelling @@ -307,7 +326,7 @@ StringRef Lexer::getSpelling(SourceLocation loc, StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); if (invalidTemp) { if (invalid) *invalid = true; - return StringRef(); + return {}; } const char *tokenBegin = file.data() + locInfo.second; @@ -345,7 +364,7 @@ std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, if (Invalid) *Invalid = CharDataInvalid; if (CharDataInvalid) - return std::string(); + return {}; // If this token contains nothing interesting, return it directly. if (!Tok.needsCleaning()) @@ -367,7 +386,7 @@ std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, /// to point to a constant buffer with the data already in it (avoiding a /// copy). The caller is not allowed to modify the returned buffer pointer /// if an internal buffer is returned. -unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, +unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid) { assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); @@ -463,19 +482,15 @@ static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { const char *BufStart = Buffer.data(); if (Offset >= Buffer.size()) return nullptr; - const char *StrData = BufStart + Offset; - - if (StrData[0] == '\n' || StrData[0] == '\r') - return StrData; - const char *LexStart = StrData; - while (LexStart != BufStart) { - if (LexStart[0] == '\n' || LexStart[0] == '\r') { + const char *LexStart = BufStart + Offset; + for (; LexStart != BufStart; --LexStart) { + if (isVerticalWhitespace(LexStart[0]) && + !Lexer::isNewLineEscaped(BufStart, LexStart)) { + // LexStart should point at first character of logical line. ++LexStart; break; } - - --LexStart; } return LexStart; } @@ -487,7 +502,7 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc, std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); if (LocInfo.first.isInvalid()) return Loc; - + bool Invalid = false; StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); if (Invalid) @@ -499,31 +514,31 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); if (!LexStart || LexStart == StrData) return Loc; - + // Create a lexer starting at the beginning of this token. SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, Buffer.end()); TheLexer.SetCommentRetentionState(true); - + // Lex tokens until we find the token that contains the source location. Token TheTok; do { TheLexer.LexFromRawLexer(TheTok); - + if (TheLexer.getBufferLocation() > StrData) { // Lexing this token has taken the lexer past the source location we're // looking for. If the current token encompasses our source location, // return the beginning of that token. if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) return TheTok.getLocation(); - + // We ended up skipping over the source location entirely, which means // that it points into whitespace. We're done here. break; } } while (TheTok.getKind() != tok::eof); - + // We've passed our source location; just return the original source location. return Loc; } @@ -531,34 +546,34 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc, SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { - if (Loc.isFileID()) - return getBeginningOfFileToken(Loc, SM, LangOpts); - - if (!SM.isMacroArgExpansion(Loc)) - return Loc; - - SourceLocation FileLoc = SM.getSpellingLoc(Loc); - SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); - std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); - std::pair<FileID, unsigned> BeginFileLocInfo - = SM.getDecomposedLoc(BeginFileLoc); - assert(FileLocInfo.first == BeginFileLocInfo.first && - FileLocInfo.second >= BeginFileLocInfo.second); - return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); + if (Loc.isFileID()) + return getBeginningOfFileToken(Loc, SM, LangOpts); + + if (!SM.isMacroArgExpansion(Loc)) + return Loc; + + SourceLocation FileLoc = SM.getSpellingLoc(Loc); + SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); + std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); + std::pair<FileID, unsigned> BeginFileLocInfo = + SM.getDecomposedLoc(BeginFileLoc); + assert(FileLocInfo.first == BeginFileLocInfo.first && + FileLocInfo.second >= BeginFileLocInfo.second); + return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); } namespace { - enum PreambleDirectiveKind { - PDK_Skipped, - PDK_Unknown - }; +enum PreambleDirectiveKind { + PDK_Skipped, + PDK_Unknown +}; -} // end anonymous namespace +} // namespace -std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer, - const LangOptions &LangOpts, - unsigned MaxLines) { +PreambleBounds Lexer::ComputePreamble(StringRef Buffer, + const LangOptions &LangOpts, + unsigned MaxLines) { // Create a lexer starting at the beginning of the file. Note that we use a // "fake" file source location at offset 1 so that the lexer will track our // position within the file. @@ -568,9 +583,6 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer, Buffer.end()); TheLexer.SetCommentRetentionState(true); - // StartLoc will differ from FileLoc if there is a BOM that was skipped. - SourceLocation StartLoc = TheLexer.getSourceLocation(); - bool InPreprocessorDirective = false; Token TheTok; SourceLocation ActiveCommentLoc; @@ -599,17 +611,17 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer, if (TheTok.getKind() == tok::eof) { break; } - + // If we haven't hit the end of the preprocessor directive, skip this // token. if (!TheTok.isAtStartOfLine()) continue; - + // We've passed the end of the preprocessor directive, and will look // at this token again below. InPreprocessorDirective = false; } - + // Keep track of the # of lines in the preamble. if (TheTok.isAtStartOfLine()) { unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; @@ -626,13 +638,13 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer, ActiveCommentLoc = TheTok.getLocation(); continue; } - + if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { - // This is the start of a preprocessor directive. + // This is the start of a preprocessor directive. Token HashTok = TheTok; InPreprocessorDirective = true; ActiveCommentLoc = SourceLocation(); - + // Figure out which directive this is. Since we're lexing raw tokens, // we don't have an identifier table available. Instead, just look at // the raw identifier to recognize and categorize preprocessor directives. @@ -672,7 +684,7 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer, break; } } - + // We only end up here if we didn't recognize the preprocessor // directive or it was one that can't occur in the preamble at this // point. Roll back the current token to the location of the '#'. @@ -685,14 +697,14 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer, // the preamble. break; } while (true); - + SourceLocation End; if (ActiveCommentLoc.isValid()) End = ActiveCommentLoc; // don't truncate a decl comment. else End = TheTok.getLocation(); - return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(), + return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), TheTok.isAtStartOfLine()); } @@ -707,13 +719,13 @@ SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart, // trigraphs. bool Invalid = false; const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); - + // If they request the first char of the token, we're trivially done. if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) return TokStart; - + unsigned PhysOffset = 0; - + // The usual case is that tokens don't contain anything interesting. Skip // over the uninteresting characters. If a token only consists of simple // chars, this method is extremely fast. @@ -724,7 +736,7 @@ SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart, --CharNo; ++PhysOffset; } - + // If we have a character that may be a trigraph or escaped newline, use a // lexer to parse it correctly. for (; CharNo; --CharNo) { @@ -733,14 +745,14 @@ SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart, TokPtr += Size; PhysOffset += Size; } - + // Final detail: if we end up on an escaped newline, we want to return the // location of the actual byte of the token. For example foo\<newline>bar // advanced by 3 should return the location of b, not of \\. One compounding // detail of this is that the escape may be made by a trigraph. if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; - + return TokStart.getLocWithOffset(PhysOffset); } @@ -763,11 +775,11 @@ SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts) { if (Loc.isInvalid()) - return SourceLocation(); + return {}; if (Loc.isMacroID()) { if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) - return SourceLocation(); // Points inside the macro expansion. + return {}; // Points inside the macro expansion. } unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); @@ -775,7 +787,7 @@ SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, Len = Len - Offset; else return Loc; - + return Loc.getLocWithOffset(Len); } @@ -838,7 +850,7 @@ static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, if (Range.isTokenRange()) { End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); if (End.isInvalid()) - return CharSourceRange(); + return {}; } // Break down the source locations. @@ -846,12 +858,12 @@ static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, unsigned BeginOffs; std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); if (FID.isInvalid()) - return CharSourceRange(); + return {}; unsigned EndOffs; if (!SM.isInFileID(End, FID, &EndOffs) || BeginOffs > EndOffs) - return CharSourceRange(); + return {}; return CharSourceRange::getCharRange(Begin, End); } @@ -862,14 +874,14 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, SourceLocation Begin = Range.getBegin(); SourceLocation End = Range.getEnd(); if (Begin.isInvalid() || End.isInvalid()) - return CharSourceRange(); + return {}; if (Begin.isFileID() && End.isFileID()) return makeRangeFromFileLocs(Range, SM, LangOpts); if (Begin.isMacroID() && End.isFileID()) { if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) - return CharSourceRange(); + return {}; Range.setBegin(Begin); return makeRangeFromFileLocs(Range, SM, LangOpts); } @@ -879,7 +891,7 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, &End)) || (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts, &End))) - return CharSourceRange(); + return {}; Range.setEnd(End); return makeRangeFromFileLocs(Range, SM, LangOpts); } @@ -900,13 +912,13 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), &Invalid); if (Invalid) - return CharSourceRange(); + return {}; if (BeginEntry.getExpansion().isMacroArgExpansion()) { const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End), &Invalid); if (Invalid) - return CharSourceRange(); + return {}; if (EndEntry.getExpansion().isMacroArgExpansion() && BeginEntry.getExpansion().getExpansionLocStart() == @@ -917,7 +929,7 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, } } - return CharSourceRange(); + return {}; } StringRef Lexer::getSourceText(CharSourceRange Range, @@ -927,21 +939,21 @@ StringRef Lexer::getSourceText(CharSourceRange Range, Range = makeFileCharRange(Range, SM, LangOpts); if (Range.isInvalid()) { if (Invalid) *Invalid = true; - return StringRef(); + return {}; } // Break down the source location. std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); if (beginInfo.first.isInvalid()) { if (Invalid) *Invalid = true; - return StringRef(); + return {}; } unsigned EndOffs; if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || beginInfo.second > EndOffs) { if (Invalid) *Invalid = true; - return StringRef(); + return {}; } // Try to the load the file buffer. @@ -949,7 +961,7 @@ StringRef Lexer::getSourceText(CharSourceRange Range, StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); if (invalidTemp) { if (Invalid) *Invalid = true; - return StringRef(); + return {}; } if (Invalid) *Invalid = false; @@ -972,7 +984,7 @@ StringRef Lexer::getImmediateMacroName(SourceLocation Loc, // For macro arguments we need to check that the argument did not come // from an inner macro, e.g: "MAC1( MAC2(foo) )" - + // Loc points to the argument id of the macro definition, move to the // macro expansion. Loc = SM.getImmediateExpansionRange(Loc).first; @@ -1013,7 +1025,7 @@ StringRef Lexer::getImmediateMacroNameForDiagnostics( // If the macro's spelling has no FileID, then it's actually a token paste // or stringization (or similar) and not a macro at all. if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc)))) - return StringRef(); + return {}; // Find the spelling location of the start of the non-argument expansion // range. This is where the macro name was spelled in order to begin @@ -1032,20 +1044,40 @@ bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { return isIdentifierBody(c, LangOpts.DollarIdents); } +bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { + assert(isVerticalWhitespace(Str[0])); + if (Str - 1 < BufferStart) + return false; + + if ((Str[0] == '\n' && Str[-1] == '\r') || + (Str[0] == '\r' && Str[-1] == '\n')) { + if (Str - 2 < BufferStart) + return false; + --Str; + } + --Str; + + // Rewind to first non-space character: + while (Str > BufferStart && isHorizontalWhitespace(*Str)) + --Str; + + return *Str == '\\'; +} + StringRef Lexer::getIndentationForLine(SourceLocation Loc, const SourceManager &SM) { if (Loc.isInvalid() || Loc.isMacroID()) - return ""; + return {}; std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); if (LocInfo.first.isInvalid()) - return ""; + return {}; bool Invalid = false; StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); if (Invalid) - return ""; + return {}; const char *Line = findBeginningOfLine(Buffer, LocInfo.second); if (!Line) - return ""; + return {}; StringRef Rest = Buffer.substr(Line - Buffer.data()); size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); return NumWhitespaceChars == StringRef::npos @@ -1199,18 +1231,12 @@ const char *Lexer::SkipEscapedNewLines(const char *P) { } } -/// \brief Checks that the given token is the first token that occurs after the -/// given location (this excludes comments and whitespace). Returns the location -/// immediately after the specified token. If the token is not found or the -/// location is inside a macro, the returned source location will be invalid. -SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc, - tok::TokenKind TKind, - const SourceManager &SM, - const LangOptions &LangOpts, - bool SkipTrailingWhitespaceAndNewLine) { +Optional<Token> Lexer::findNextToken(SourceLocation Loc, + const SourceManager &SM, + const LangOptions &LangOpts) { if (Loc.isMacroID()) { if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) - return SourceLocation(); + return None; } Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); @@ -1221,7 +1247,7 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc, bool InvalidTemp = false; StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); if (InvalidTemp) - return SourceLocation(); + return None; const char *TokenBegin = File.data() + LocInfo.second; @@ -1231,15 +1257,25 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc, // Find the token. Token Tok; lexer.LexFromRawLexer(Tok); - if (Tok.isNot(TKind)) - return SourceLocation(); - SourceLocation TokenLoc = Tok.getLocation(); + return Tok; +} + +/// \brief Checks that the given token is the first token that occurs after the +/// given location (this excludes comments and whitespace). Returns the location +/// immediately after the specified token. If the token is not found or the +/// location is inside a macro, the returned source location will be invalid. +SourceLocation Lexer::findLocationAfterToken( + SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, + const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { + Optional<Token> Tok = findNextToken(Loc, SM, LangOpts); + if (!Tok || Tok->isNot(TKind)) + return {}; + SourceLocation TokenLoc = Tok->getLocation(); // Calculate how much whitespace needs to be skipped if any. unsigned NumWhitespaceChars = 0; if (SkipTrailingWhitespaceAndNewLine) { - const char *TokenEnd = SM.getCharacterData(TokenLoc) + - Tok.getLength(); + const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); unsigned char C = *TokenEnd; while (isHorizontalWhitespace(C)) { C = *(++TokenEnd); @@ -1256,7 +1292,7 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc, } } - return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars); + return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); } /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, @@ -1274,7 +1310,6 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc, /// /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should /// be updated to match. -/// char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, Token *Tok) { // If we have a slash, look for an escaped newline. @@ -1378,9 +1413,9 @@ Slash: // Helper methods for lexing. //===----------------------------------------------------------------------===// -/// \brief Routine that indiscriminately skips bytes in the source file. -void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) { - BufferPtr += Bytes; +/// \brief Routine that indiscriminately sets the offset into the source file. +void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { + BufferPtr = BufferStart + Offset; if (BufferPtr > BufferEnd) BufferPtr = BufferEnd; // FIXME: What exactly does the StartOfLine bit mean? There are two @@ -1466,6 +1501,75 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, } } +/// After encountering UTF-8 character C and interpreting it as an identifier +/// character, check whether it's a homoglyph for a common non-identifier +/// source character that is unlikely to be an intentional identifier +/// character and warn if so. +static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, + CharSourceRange Range) { + // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). + struct HomoglyphPair { + uint32_t Character; + char LooksLike; + bool operator<(HomoglyphPair R) const { return Character < R.Character; } + }; + static constexpr HomoglyphPair SortedHomoglyphs[] = { + {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK + {U'\u037e', ';'}, // GREEK QUESTION MARK + {U'\u2212', '-'}, // MINUS SIGN + {U'\u2215', '/'}, // DIVISION SLASH + {U'\u2216', '\\'}, // SET MINUS + {U'\u2217', '*'}, // ASTERISK OPERATOR + {U'\u2223', '|'}, // DIVIDES + {U'\u2227', '^'}, // LOGICAL AND + {U'\u2236', ':'}, // RATIO + {U'\u223c', '~'}, // TILDE OPERATOR + {U'\ua789', ':'}, // MODIFIER LETTER COLON + {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK + {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN + {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN + {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN + {U'\uff06', '&'}, // FULLWIDTH AMPERSAND + {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS + {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS + {U'\uff0a', '*'}, // FULLWIDTH ASTERISK + {U'\uff0b', '+'}, // FULLWIDTH ASTERISK + {U'\uff0c', ','}, // FULLWIDTH COMMA + {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS + {U'\uff0e', '.'}, // FULLWIDTH FULL STOP + {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS + {U'\uff1a', ':'}, // FULLWIDTH COLON + {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON + {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN + {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN + {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN + {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK + {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT + {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET + {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS + {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET + {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT + {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET + {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE + {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET + {U'\uff5e', '~'}, // FULLWIDTH TILDE + {0, 0} + }; + auto Homoglyph = + std::lower_bound(std::begin(SortedHomoglyphs), + std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); + if (Homoglyph->Character == C) { + llvm::SmallString<5> CharBuf; + { + llvm::raw_svector_ostream CharOS(CharBuf); + llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); + } + const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; + Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) + << Range << CharBuf << LooksLikeStr; + } +} + bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, Token &Result) { const char *UCNPtr = CurPtr + Size; @@ -1500,10 +1604,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) return false; - if (!isLexingRawMode()) + if (!isLexingRawMode()) { maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false); + maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UnicodePtr)); + } CurPtr = UnicodePtr; return true; @@ -1569,7 +1676,6 @@ FinishIdentifier: CurPtr = ConsumeChar(CurPtr, Size, Result); C = getCharAndSize(CurPtr, Size); continue; - } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { C = getCharAndSize(CurPtr, Size); continue; @@ -1632,7 +1738,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { if (!LangOpts.C99) { if (!isHexaLiteral(BufferPtr, LangOpts)) IsHexFloat = false; - else if (!getLangOpts().CPlusPlus1z && + else if (!getLangOpts().CPlusPlus17 && std::find(BufferPtr, CurPtr, '_') != CurPtr) IsHexFloat = false; } @@ -1778,7 +1884,7 @@ bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, // getAndAdvanceChar. if (C == '\\') C = getAndAdvanceChar(CurPtr, Result); - + if (C == '\n' || C == '\r' || // Newline. (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) @@ -1786,7 +1892,7 @@ bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, FormTokenWithChars(Result, CurPtr-1, tok::unknown); return true; } - + if (C == 0) { if (isCodeCompletionPoint(CurPtr-1)) { PP->CodeCompleteNaturalLanguage(); @@ -2000,7 +2106,6 @@ bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, /// Update BufferPtr to point to the next non-whitespace character and return. /// /// This method forms a token and returns true if KeepWhitespaceMode is enabled. -/// bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr, bool &TokAtPhysicalStartOfLine) { // Whitespace - Skip it, then return the token after the whitespace. @@ -2131,7 +2236,8 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr, // If we read multiple characters, and one of those characters was a \r or // \n, then we had an escaped newline within the comment. Emit diagnostic // unless the next line is also a // comment. - if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') { + if (CurPtr != OldPtr + 1 && C != '/' && + (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) { for (; OldPtr != CurPtr; ++OldPtr) if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { // Okay, we found a // comment that ends in a newline, if the next @@ -2214,7 +2320,7 @@ bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) { std::string Spelling = PP->getSpelling(Result, &Invalid); if (Invalid) return true; - + assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?"); Spelling[1] = '*'; // Change prefix to "/*". Spelling += "*/"; // add suffix. @@ -2540,7 +2646,7 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { resetExtendedTokenMode(); return true; // Have a token. } - + // If we are in raw mode, return this event as an EOF token. Let the caller // that put us in raw mode handle the event. if (isLexingRawMode()) { @@ -2549,7 +2655,7 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { FormTokenWithChars(Result, BufferEnd, tok::eof); return true; } - + if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { PP->setRecordedPreambleConditionalStack(ConditionalStack); ConditionalStack.clear(); @@ -2661,7 +2767,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { if (CurPtr != BufferStart && CurPtr[-1] != '\n' && CurPtr[-1] != '\r') return false; - + // Check to see if we have <<<<<<< or >>>>. if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") && !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")) @@ -2671,7 +2777,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { // it. if (CurrentConflictMarkerState || isLexingRawMode()) return false; - + ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; // Check to see if there is an ending marker somewhere in the buffer at the @@ -2681,7 +2787,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { // Diagnose this, and ignore to the end of line. Diag(CurPtr, diag::err_conflict_marker); CurrentConflictMarkerState = Kind; - + // Skip ahead to the end of line. We know this exists because the // end-of-conflict marker starts with \r or \n. while (*CurPtr != '\r' && *CurPtr != '\n') { @@ -2691,7 +2797,7 @@ bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { BufferPtr = CurPtr; return true; } - + // No end of conflict marker found. return false; } @@ -2705,35 +2811,35 @@ bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { if (CurPtr != BufferStart && CurPtr[-1] != '\n' && CurPtr[-1] != '\r') return false; - + // If we have a situation where we don't care about conflict markers, ignore // it. if (!CurrentConflictMarkerState || isLexingRawMode()) return false; - + // Check to see if we have the marker (4 characters in a row). for (unsigned i = 1; i != 4; ++i) if (CurPtr[i] != CurPtr[0]) return false; - + // If we do have it, search for the end of the conflict marker. This could // fail if it got skipped with a '#if 0' or something. Note that CurPtr might // be the end of conflict marker. if (const char *End = FindConflictEnd(CurPtr, BufferEnd, CurrentConflictMarkerState)) { CurPtr = End; - + // Skip ahead to the end of line. while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') ++CurPtr; - + BufferPtr = CurPtr; - + // No longer in the conflict marker. CurrentConflictMarkerState = CMK_None; return true; } - + return false; } @@ -2872,7 +2978,6 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, } return 0; - } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. // We don't use isLexingRawMode() here because we need to diagnose bad @@ -3042,7 +3147,7 @@ LexNextToken: // We know the lexer hasn't changed, so just try again with this lexer. // (We manually eliminate the tail call to avoid recursion.) goto LexNextToken; - + case 26: // DOS & CP/M EOF: "^Z". // If we're in Microsoft extensions mode, treat this as end of file. if (LangOpts.MicrosoftExt) { @@ -3054,9 +3159,12 @@ LexNextToken: // If Microsoft extensions are disabled, this is just random garbage. Kind = tok::unknown; break; - - case '\n': + case '\r': + if (CurPtr[0] == '\n') + Char = getAndAdvanceChar(CurPtr, Result); + LLVM_FALLTHROUGH; + case '\n': // If we are inside a preprocessor directive and we see the end of line, // we know we are done with the directive, so return an EOD token. if (ParsingPreprocessorDirective) { @@ -3114,7 +3222,7 @@ LexNextToken: // We only saw whitespace, so just try again with this lexer. // (We manually eliminate the tail call to avoid recursion.) goto LexNextToken; - + // C99 6.4.4.1: Integer Constants. // C99 6.4.4.2: Floating Constants. case '0': case '1': case '2': case '3': case '4': @@ -3157,7 +3265,7 @@ LexNextToken: ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), SizeTmp2, Result), tok::utf8_string_literal); - if (Char2 == '\'' && LangOpts.CPlusPlus1z) + if (Char2 == '\'' && LangOpts.CPlusPlus17) return LexCharConstant( Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), SizeTmp2, Result), @@ -3501,6 +3609,24 @@ LexNextToken: Kind = tok::lessless; } } else if (Char == '=') { + char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); + if (After == '>') { + if (getLangOpts().CPlusPlus2a) { + if (!isLexingRawMode()) + Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); + CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result); + Kind = tok::spaceship; + break; + } + // Suggest adding a space between the '<=' and the '>' to avoid a + // change in semantics if this turns up in C++ <=17 mode. + if (getLangOpts().CPlusPlus && !isLexingRawMode()) { + Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship) + << FixItHint::CreateInsertion( + getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); + } + } CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); Kind = tok::lessequal; } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' @@ -3526,7 +3652,8 @@ LexNextToken: } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); Kind = tok::l_brace; - } else if (Char == '#' && lexEditorPlaceholder(Result, CurPtr)) { + } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && + lexEditorPlaceholder(Result, CurPtr)) { return true; } else { Kind = tok::less; @@ -3594,7 +3721,9 @@ LexNextToken: if (LangOpts.Digraphs && Char == '>') { Kind = tok::r_square; // ':>' -> ']' CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); - } else if (LangOpts.CPlusPlus && Char == ':') { + } else if ((LangOpts.CPlusPlus || + LangOpts.DoubleSquareBracketAttributes) && + Char == ':') { Kind = tok::coloncolon; CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); } else { @@ -3610,7 +3739,7 @@ LexNextToken: // If this is '====' and we're in a conflict marker, ignore it. if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) goto LexNextToken; - + Kind = tok::equalequal; CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); } else { @@ -3681,6 +3810,7 @@ LexNextToken: // We can't just reset CurPtr to BufferPtr because BufferPtr may point to // an escaped newline. --CurPtr; + const char *UTF8StartPtr = CurPtr; llvm::ConversionResult Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, (const llvm::UTF8 *)BufferEnd, @@ -3695,9 +3825,12 @@ LexNextToken: // (We manually eliminate the tail call to avoid recursion.) goto LexNextToken; } + if (!isLexingRawMode()) + maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, UTF8StartPtr, CurPtr)); return LexUnicode(Result, CodePoint, CurPtr); } - + if (isLexingRawMode() || ParsingPreprocessorDirective || PP->isPreprocessedOutput()) { ++CurPtr; |