aboutsummaryrefslogtreecommitdiff
path: root/lib/Lex/Lexer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Lex/Lexer.cpp')
-rw-r--r-- lib/Lex/Lexer.cpp 436
1 files changed, 266 insertions, 170 deletions
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 6f1043ae7353..c8b9a5d5420a 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -39,7 +39,7 @@ static void InitCharacterInfo();
// Token Class Implementation
//===----------------------------------------------------------------------===//
-/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
+/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
if (IdentifierInfo *II = getIdentifierInfo())
return II->getObjCKeywordID() == objcKey;
@@ -57,35 +57,36 @@ tok::ObjCKeywordKind Token::getObjCKeywordID() const {
// Lexer Class Implementation
//===----------------------------------------------------------------------===//
-void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
+void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
const char *BufEnd) {
InitCharacterInfo();
-
+
BufferStart = BufStart;
BufferPtr = BufPtr;
BufferEnd = BufEnd;
-
+
assert(BufEnd[0] == 0 &&
"We assume that the input buffer has a null character at the end"
" to simplify lexing!");
-
- Is_PragmaLexer = false;
+ Is_PragmaLexer = false;
+ IsEofCodeCompletion = false;
+
// Start of the file is a start of line.
IsAtStartOfLine = true;
-
+
// We have not just parsed a '#', so we are not inside a directive.
ParsingPreprocessorDirective = false;
-
+
// We have not just parsed #include, so we are not expecting a filename.
ParsingFilename = false;
-
+
// We are not in raw mode. Raw mode disables diagnostics and interpretation
// of tokens (e.g. identifiers, thus disabling macro expansion). It is used
// to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
// or otherwise skipping over tokens.
LexingRawMode = false;
-
+
// Default to not keeping comments.
ExtendedTokenMode = 0;
}
@@ -98,14 +99,18 @@ Lexer::Lexer(FileID FID, Preprocessor &PP)
: PreprocessorLexer(&PP, FID),
FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
Features(PP.getLangOptions()) {
-
+
const llvm::MemoryBuffer *InputFile = PP.getSourceManager().getBuffer(FID);
InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
InputFile->getBufferEnd());
-
+
// Default to keeping comments if the preprocessor wants them.
SetCommentRetentionState(PP.getCommentRetentionState());
+
+ // If the input file is truncated, the EOF is a code-completion token.
+ if (PP.getSourceManager().isTruncatedFile(FID))
+ IsEofCodeCompletion = true;
}
/// Lexer constructor - Create a new raw lexer object. This object is only
@@ -116,7 +121,7 @@ Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
: FileLoc(fileloc), Features(features) {
InitLexer(BufStart, BufPtr, BufEnd);
-
+
// We *are* in raw mode.
LexingRawMode = true;
}
@@ -128,9 +133,9 @@ Lexer::Lexer(FileID FID, const SourceManager &SM, const LangOptions &features)
: FileLoc(SM.getLocForStartOfFile(FID)), Features(features) {
const llvm::MemoryBuffer *FromFile = SM.getBuffer(FID);
- InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
+ InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
FromFile->getBufferEnd());
-
+
// We *are* in raw mode.
LexingRawMode = true;
}
@@ -150,7 +155,7 @@ Lexer::Lexer(FileID FID, const SourceManager &SM, const LangOptions &features)
/// interface that could handle this stuff. This would pull GetMappedTokenLoc
/// out of the critical path of the lexer!
///
-Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
+Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
SourceLocation InstantiationLocStart,
SourceLocation InstantiationLocEnd,
unsigned TokLen, Preprocessor &PP) {
@@ -159,12 +164,12 @@ Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
// Create the lexer as if we were going to lex the file normally.
FileID SpellingFID = SM.getFileID(SpellingLoc);
Lexer *L = new Lexer(SpellingFID, PP);
-
+
// Now that the lexer is created, change the start/end locations so that we
// just lex the subsection of the file that we want. This is lexing from a
// scratch buffer.
const char *StrData = SM.getCharacterData(SpellingLoc);
-
+
L->BufferPtr = StrData;
L->BufferEnd = StrData+TokLen;
assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
@@ -174,11 +179,11 @@ Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID),
InstantiationLocStart,
InstantiationLocEnd, TokLen);
-
+
// Ensure that the lexer thinks it is inside a directive, so that end \n will
// return an EOM token.
L->ParsingPreprocessorDirective = true;
-
+
// This lexer really is for _Pragma.
L->Is_PragmaLexer = true;
return L;
@@ -220,7 +225,7 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
const LangOptions &LangOpts) {
// TODO: this could be special cased for common tokens like identifiers, ')',
// etc to make this faster, if it mattered. Just look at StrData[0] to handle
- // all obviously single-char tokens. This could use
+ // all obviously single-char tokens. This could use
// Lexer::isObviouslySimpleCharacter for example to handle identifiers or
// something.
@@ -233,6 +238,7 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
// Create a lexer starting at the beginning of this token.
Lexer TheLexer(Loc, LangOpts, Buffer.first, StrData, Buffer.second);
+ TheLexer.SetCommentRetentionState(true);
Token TheTok;
TheLexer.LexFromRawLexer(TheTok);
return TheTok.getLength();
@@ -242,8 +248,6 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
// Character information.
//===----------------------------------------------------------------------===//
-static unsigned char CharInfo[256];
-
enum {
CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0'
CHAR_VERT_WS = 0x02, // '\r', '\n'
@@ -253,25 +257,98 @@ enum {
CHAR_PERIOD = 0x20 // .
};
+// Statically initialize CharInfo table based on ASCII character set
+// Reference: FreeBSD 7.2 /usr/share/misc/ascii
+static const unsigned char CharInfo[256] =
+{
+// 0 NUL 1 SOH 2 STX 3 ETX
+// 4 EOT 5 ENQ 6 ACK 7 BEL
+ 0 , 0 , 0 , 0 ,
+ 0 , 0 , 0 , 0 ,
+// 8 BS 9 HT 10 NL 11 VT
+//12 NP 13 CR 14 SO 15 SI
+ 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
+ CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 ,
+//16 DLE 17 DC1 18 DC2 19 DC3
+//20 DC4 21 NAK 22 SYN 23 ETB
+ 0 , 0 , 0 , 0 ,
+ 0 , 0 , 0 , 0 ,
+//24 CAN 25 EM 26 SUB 27 ESC
+//28 FS 29 GS 30 RS 31 US
+ 0 , 0 , 0 , 0 ,
+ 0 , 0 , 0 , 0 ,
+//32 SP 33 ! 34 " 35 #
+//36 $ 37 % 38 & 39 '
+ CHAR_HORZ_WS, 0 , 0 , 0 ,
+ 0 , 0 , 0 , 0 ,
+//40 ( 41 ) 42 * 43 +
+//44 , 45 - 46 . 47 /
+ 0 , 0 , 0 , 0 ,
+ 0 , 0 , CHAR_PERIOD , 0 ,
+//48 0 49 1 50 2 51 3
+//52 4 53 5 54 6 55 7
+ CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
+ CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
+//56 8 57 9 58 : 59 ;
+//60 < 61 = 62 > 63 ?
+ CHAR_NUMBER , CHAR_NUMBER , 0 , 0 ,
+ 0 , 0 , 0 , 0 ,
+//64 @ 65 A 66 B 67 C
+//68 D 69 E 70 F 71 G
+ 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+//72 H 73 I 74 J 75 K
+//76 L 77 M 78 N 79 O
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+//80 P 81 Q 82 R 83 S
+//84 T 85 U 86 V 87 W
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+//88 X 89 Y 90 Z 91 [
+//92 \ 93 ] 94 ^ 95 _
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 ,
+ 0 , 0 , 0 , CHAR_UNDER ,
+//96 ` 97 a 98 b 99 c
+//100 d 101 e 102 f 103 g
+ 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+//104 h 105 i 106 j 107 k
+//108 l 109 m 110 n 111 o
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+//112 p 113 q 114 r 115 s
+//116 t 117 u 118 v 119 w
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
+//120 x 121 y 122 z 123 {
+//124 | 125 } 126 ~ 127 DEL
+ CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 ,
+ 0 , 0 , 0 , 0
+};
+
static void InitCharacterInfo() {
static bool isInited = false;
if (isInited) return;
- isInited = true;
-
- // Intiialize the CharInfo table.
- // TODO: statically initialize this.
- CharInfo[(int)' '] = CharInfo[(int)'\t'] =
- CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS;
- CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS;
-
- CharInfo[(int)'_'] = CHAR_UNDER;
- CharInfo[(int)'.'] = CHAR_PERIOD;
- for (unsigned i = 'a'; i <= 'z'; ++i)
- CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER;
+ // check the statically-initialized CharInfo table
+ assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
+ assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
+ assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
+ assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
+ assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
+ assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
+ assert(CHAR_UNDER == CharInfo[(int)'_']);
+ assert(CHAR_PERIOD == CharInfo[(int)'.']);
+ for (unsigned i = 'a'; i <= 'z'; ++i) {
+ assert(CHAR_LETTER == CharInfo[i]);
+ assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
+ }
for (unsigned i = '0'; i <= '9'; ++i)
- CharInfo[i] = CHAR_NUMBER;
+ assert(CHAR_NUMBER == CharInfo[i]);
+ isInited = true;
}
+
/// isIdentifierBody - Return true if this is the body character of an
/// identifier, which is [a-zA-Z0-9_].
static inline bool isIdentifierBody(unsigned char c) {
@@ -294,7 +371,7 @@ static inline bool isWhitespace(unsigned char c) {
/// isNumberBody - Return true if this is the body character of a
/// preprocessing number, which is [a-zA-Z0-9_.].
static inline bool isNumberBody(unsigned char c) {
- return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
+ return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
true : false;
}
@@ -315,22 +392,22 @@ static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
SourceLocation FileLoc,
unsigned CharNo, unsigned TokLen) {
assert(FileLoc.isMacroID() && "Must be an instantiation");
-
+
// Otherwise, we're lexing "mapped tokens". This is used for things like
// _Pragma handling. Combine the instantiation location of FileLoc with the
// spelling location.
SourceManager &SM = PP.getSourceManager();
-
+
// Create a new SLoc which is expanded from Instantiation(FileLoc) but whose
// characters come from spelling(FileLoc)+Offset.
SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo);
-
+
// Figure out the expansion loc range, which is the range covered by the
// original _Pragma(...) sequence.
std::pair<SourceLocation,SourceLocation> II =
SM.getImmediateInstantiationRange(FileLoc);
-
+
return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen);
}
@@ -346,7 +423,7 @@ SourceLocation Lexer::getSourceLocation(const char *Loc,
unsigned CharNo = Loc-BufferStart;
if (FileLoc.isFileID())
return FileLoc.getFileLocWithOffset(CharNo);
-
+
// Otherwise, this is the _Pragma lexer case, which pretends that all of the
// tokens are lexed from where the _Pragma was defined.
assert(PP && "This doesn't work on raw lexers");
@@ -387,13 +464,13 @@ static char GetTrigraphCharForLetter(char Letter) {
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
char Res = GetTrigraphCharForLetter(*CP);
if (!Res || !L) return Res;
-
+
if (!L->getFeatures().Trigraphs) {
if (!L->isLexingRawMode())
L->Diag(CP-2, diag::trigraph_ignored);
return 0;
}
-
+
if (!L->isLexingRawMode())
L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res;
return Res;
@@ -401,12 +478,12 @@ static char DecodeTrigraphChar(const char *CP, Lexer *L) {
/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
-/// trigraph equivalent on entry to this function.
+/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
unsigned Size = 0;
while (isWhitespace(Ptr[Size])) {
++Size;
-
+
if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
continue;
@@ -414,10 +491,10 @@ unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
Ptr[Size-1] != Ptr[Size])
++Size;
-
+
return Size;
- }
-
+ }
+
// Not an escaped newline, must be a \t or something else.
return 0;
}
@@ -438,7 +515,7 @@ const char *Lexer::SkipEscapedNewLines(const char *P) {
} else {
return P;
}
-
+
unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
if (NewLineSize == 0) return P;
P = AfterEscape+NewLineSize;
@@ -472,7 +549,7 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
Slash:
// Common case, backslash-char where the char is not whitespace.
if (!isWhitespace(Ptr[0])) return '\\';
-
+
// See if we have optional whitespace characters between the slash and
// newline.
if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
@@ -482,18 +559,18 @@ Slash:
// Warn if there was whitespace between the backslash and newline.
if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
Diag(Ptr, diag::backslash_newline_space);
-
+
// Found backslash<whitespace><newline>. Parse the char after it.
Size += EscapedNewLineSize;
Ptr += EscapedNewLineSize;
// Use slow version to accumulate a correct size field.
return getCharAndSizeSlow(Ptr, Size, Tok);
}
-
+
// Otherwise, this is not an escaped newline, just return the slash.
return '\\';
}
-
+
// If this is a trigraph, process it.
if (Ptr[0] == '?' && Ptr[1] == '?') {
// If this is actually a legal trigraph (not something like "??x"), emit
@@ -508,7 +585,7 @@ Slash:
return C;
}
}
-
+
// If this is neither, return a single character.
++Size;
return *Ptr;
@@ -530,21 +607,21 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
Slash:
// Common case, backslash-char where the char is not whitespace.
if (!isWhitespace(Ptr[0])) return '\\';
-
+
// See if we have optional whitespace characters followed by a newline.
if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
// Found backslash<whitespace><newline>. Parse the char after it.
Size += EscapedNewLineSize;
Ptr += EscapedNewLineSize;
-
+
// Use slow version to accumulate a correct size field.
return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
}
-
+
// Otherwise, this is not an escaped newline, just return the slash.
return '\\';
}
-
+
// If this is a trigraph, process it.
if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
// If this is actually a legal trigraph (not something like "??x"), return
@@ -556,7 +633,7 @@ Slash:
return C;
}
}
-
+
// If this is neither, return a single character.
++Size;
return *Ptr;
@@ -582,34 +659,34 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
FinishIdentifier:
const char *IdStart = BufferPtr;
FormTokenWithChars(Result, CurPtr, tok::identifier);
-
+
// If we are in raw mode, return this identifier raw. There is no need to
// look up identifier information or attempt to macro expand it.
if (LexingRawMode) return;
-
+
// Fill in Result.IdentifierInfo, looking up the identifier in the
// identifier table.
IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart);
-
+
// Change the kind of this identifier to the appropriate token kind, e.g.
// turning "for" into a keyword.
Result.setKind(II->getTokenID());
-
+
// Finally, now that we know we have an identifier, pass this off to the
// preprocessor, which may macro expand it or something.
if (II->isHandleIdentifierCase())
PP->HandleIdentifier(Result);
return;
}
-
+
// Otherwise, $,\,? in identifier found. Enter slower path.
-
+
C = getCharAndSize(CurPtr, Size);
while (1) {
if (C == '$') {
// If we hit a $ and they are not supported in identifiers, we are done.
if (!Features.DollarIdents) goto FinishIdentifier;
-
+
// Otherwise, emit a diagnostic and continue.
if (!isLexingRawMode())
Diag(CurPtr, diag::ext_dollar_in_identifier);
@@ -645,7 +722,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
PrevCh = C;
C = getCharAndSize(CurPtr, Size);
}
-
+
// If we fell out, check for a sign, due to 1e+12. If we have one, continue.
if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
@@ -653,7 +730,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
// If we have a hex FP constant, continue.
if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p'))
return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
-
+
// Update the location of token as well as BufferPtr.
const char *TokStart = BufferPtr;
FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
@@ -664,7 +741,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
/// either " or L".
void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
const char *NulCharacter = 0; // Does this string contain the \0 character?
-
+
char C = getAndAdvanceChar(CurPtr, Result);
while (C != '"') {
// Skip escaped characters.
@@ -682,7 +759,7 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
}
C = getAndAdvanceChar(CurPtr, Result);
}
-
+
// If a nul character existed in the string, warn about it.
if (NulCharacter && !isLexingRawMode())
Diag(NulCharacter, diag::null_in_string);
@@ -716,11 +793,11 @@ void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
}
C = getAndAdvanceChar(CurPtr, Result);
}
-
+
// If a nul character existed in the string, warn about it.
if (NulCharacter && !isLexingRawMode())
Diag(NulCharacter, diag::null_in_string);
-
+
// Update the location of token as well as BufferPtr.
const char *TokStart = BufferPtr;
FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
@@ -745,7 +822,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
// FIXME: UCN's.
C = getAndAdvanceChar(CurPtr, Result);
}
-
+
if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') {
++CurPtr;
} else {
@@ -767,7 +844,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
C = getAndAdvanceChar(CurPtr, Result);
} while (C != '\'');
}
-
+
if (NulCharacter && !isLexingRawMode())
Diag(NulCharacter, diag::null_in_char);
@@ -789,17 +866,17 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
// Skip horizontal whitespace very aggressively.
while (isHorizontalWhitespace(Char))
Char = *++CurPtr;
-
+
// Otherwise if we have something other than whitespace, we're done.
if (Char != '\n' && Char != '\r')
break;
-
+
if (ParsingPreprocessorDirective) {
// End of preprocessor directive line, let LexTokenInternal handle this.
BufferPtr = CurPtr;
return false;
}
-
+
// ok, but handle newline.
// The returned token is at the start of the line.
Result.setFlag(Token::StartOfLine);
@@ -818,7 +895,7 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
FormTokenWithChars(Result, CurPtr, tok::unknown);
return true;
}
-
+
BufferPtr = CurPtr;
return false;
}
@@ -832,12 +909,12 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
// extension warning.
if (!Features.BCPLComment && !isLexingRawMode()) {
Diag(BufferPtr, diag::ext_bcpl_comment);
-
+
// Mark them enabled so we only emit one warning for this translation
// unit.
Features.BCPLComment = true;
}
-
+
// Scan over the body of the comment. The common case, when scanning, is that
// the comment contains normal ascii characters with nothing interesting in
// them. As such, optimize for this case with the inner loop.
@@ -847,7 +924,7 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
// FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character.
// If we find a \n character, scan backwards, checking to see if it's an
// escaped newline, like we do for block comments.
-
+
// Skip over characters in the fast loop.
while (C != 0 && // Potentially EOF.
C != '\\' && // Potentially escaped newline.
@@ -858,7 +935,7 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
// If this is a newline, we're done.
if (C == '\n' || C == '\r')
break; // Found the newline? Break out!
-
+
// Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
// properly decode the character. Read it in raw mode to avoid emitting
// diagnostics about things like trigraphs. If we see an escaped newline,
@@ -876,7 +953,7 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
--CurPtr;
C = 'x'; // doesn't matter what this is.
}
-
+
// If we read multiple characters, and one of those characters was a \r or
// \n, then we had an escaped newline within the comment. Emit diagnostic
// unless the next line is also a // comment.
@@ -892,21 +969,21 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
break;
}
-
+
if (!isLexingRawMode())
Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
break;
}
}
-
+
if (CurPtr == BufferEnd+1) { --CurPtr; break; }
} while (C != '\n' && C != '\r');
// Found but did not consume the newline.
if (PP)
- PP->HandleComment(SourceRange(getSourceLocation(BufferPtr),
+ PP->HandleComment(SourceRange(getSourceLocation(BufferPtr),
getSourceLocation(CurPtr)));
-
+
// If we are returning comments as tokens, return this comment as a token.
if (inKeepCommentMode())
return SaveBCPLComment(Result, CurPtr);
@@ -917,14 +994,14 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
BufferPtr = CurPtr;
return false;
}
-
+
// Otherwise, eat the \n character. We don't care if this is a \n\r or
// \r\n sequence. This is an efficiency hack (because we know the \n can't
// contribute to another token), it isn't needed for correctness. Note that
// this is ok even in KeepWhitespaceMode, because we would have returned the
// comment above in that mode.
++CurPtr;
-
+
// The next returned token is at the start of the line.
Result.setFlag(Token::StartOfLine);
// No leading whitespace seen so far.
@@ -939,17 +1016,17 @@ bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
// If we're not in a preprocessor directive, just return the // comment
// directly.
FormTokenWithChars(Result, CurPtr, tok::comment);
-
+
if (!ParsingPreprocessorDirective)
return true;
-
+
// If this BCPL-style comment is in a macro definition, transmogrify it into
// a C-style block comment.
std::string Spelling = PP->getSpelling(Result);
assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
Spelling[1] = '*'; // Change prefix to "/*".
Spelling += "*/"; // add suffix.
-
+
Result.setKind(tok::comment);
PP->CreateString(&Spelling[0], Spelling.size(), Result,
Result.getLocation());
@@ -959,13 +1036,13 @@ bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified
/// newline character (either \n or \r) is part of an escaped newline sequence.
/// Issue a diagnostic if so. We know the newline is inside a block comment.
-static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
+static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
Lexer *L) {
assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
-
+
// Back up off the newline.
--CurPtr;
-
+
// If this is a two-character newline sequence, skip the other character.
if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
// \n\n or \r\r -> not escaped newline.
@@ -974,7 +1051,7 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
// \n\r or \r\n -> skip the newline.
--CurPtr;
}
-
+
// If we have horizontal whitespace, skip over it. We allow whitespace
// between the slash and newline.
bool HasSpace = false;
@@ -982,7 +1059,7 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
--CurPtr;
HasSpace = true;
}
-
+
// If we have a slash, we know this is an escaped newline.
if (*CurPtr == '\\') {
if (CurPtr[-1] != '*') return false;
@@ -991,7 +1068,7 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
CurPtr[-3] != '*')
return false;
-
+
// This is the trigraph ending the comment. Emit a stern warning!
CurPtr -= 2;
@@ -1005,15 +1082,15 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
if (!L->isLexingRawMode())
L->Diag(CurPtr, diag::trigraph_ends_block_comment);
}
-
+
// Warn about having an escaped newline between the */ characters.
if (!L->isLexingRawMode())
L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
-
+
// If there was space between the backslash and newline, warn about it.
if (HasSpace && !L->isLexingRawMode())
L->Diag(CurPtr, diag::backslash_newline_space);
-
+
return true;
}
@@ -1049,23 +1126,23 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
if (!isLexingRawMode())
Diag(BufferPtr, diag::err_unterminated_block_comment);
--CurPtr;
-
+
// KeepWhitespaceMode should return this broken comment as a token. Since
// it isn't a well formed comment, just return it as an 'unknown' token.
if (isKeepWhitespaceMode()) {
FormTokenWithChars(Result, CurPtr, tok::unknown);
return true;
}
-
+
BufferPtr = CurPtr;
return false;
}
-
+
// Check to see if the first character after the '/*' is another /. If so,
// then this slash does not end the block comment, it is part of it.
if (C == '/')
C = *CurPtr++;
-
+
while (1) {
// Skip over all non-interesting characters until we find end of buffer or a
// (probably ending) '/' character.
@@ -1073,7 +1150,7 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
// While not aligned to a 16-byte boundary.
while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
C = *CurPtr++;
-
+
if (C == '/') goto FoundSlash;
#ifdef __SSE2__
@@ -1084,13 +1161,13 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
CurPtr += 16;
#elif __ALTIVEC__
__vector unsigned char Slashes = {
- '/', '/', '/', '/', '/', '/', '/', '/',
+ '/', '/', '/', '/', '/', '/', '/', '/',
'/', '/', '/', '/', '/', '/', '/', '/'
};
while (CurPtr+16 <= BufferEnd &&
!vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
CurPtr += 16;
-#else
+#else
// Scan for '/' quickly. Many block comments are very large.
while (CurPtr[0] != '/' &&
CurPtr[1] != '/' &&
@@ -1100,20 +1177,20 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
CurPtr += 4;
}
#endif
-
+
// It has to be one of the bytes scanned, increment to it and read one.
C = *CurPtr++;
}
-
+
// Loop to scan the remainder.
while (C != '/' && C != '\0')
C = *CurPtr++;
-
+
FoundSlash:
if (C == '/') {
if (CurPtr[-2] == '*') // We found the final */. We're done!
break;
-
+
if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
// We found the final */, though it had an escaped newline between the
@@ -1135,22 +1212,22 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
// after the /*, but this would involve lexing a lot of what really is the
// comment, which surely would confuse the parser.
--CurPtr;
-
+
// KeepWhitespaceMode should return this broken comment as a token. Since
// it isn't a well formed comment, just return it as an 'unknown' token.
if (isKeepWhitespaceMode()) {
FormTokenWithChars(Result, CurPtr, tok::unknown);
return true;
}
-
+
BufferPtr = CurPtr;
return false;
}
C = *CurPtr++;
}
-
- if (PP)
- PP->HandleComment(SourceRange(getSourceLocation(BufferPtr),
+
+ if (PP)
+ PP->HandleComment(SourceRange(getSourceLocation(BufferPtr),
getSourceLocation(CurPtr)));
// If we are returning comments as tokens, return this comment as a token.
@@ -1208,11 +1285,11 @@ std::string Lexer::ReadToEndOfLine() {
// Okay, we found the end of the line. First, back up past the \0, \r, \n.
assert(CurPtr[-1] == Char && "Trigraphs for newline?");
BufferPtr = CurPtr-1;
-
+
// Next, lex the character, which should handle the EOM transition.
Lex(Tmp);
assert(Tmp.is(tok::eom) && "Unexpected token!");
-
+
// Finally, we're done, return the string we found.
return Result;
}
@@ -1232,12 +1309,12 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
ParsingPreprocessorDirective = false;
// Update the location of token as well as BufferPtr.
FormTokenWithChars(Result, CurPtr, tok::eom);
-
+
// Restore comment saving mode, in case it was disabled for directive.
SetCommentRetentionState(PP->getCommentRetentionState());
return true; // Have a token.
- }
-
+ }
+
// If we are in raw mode, return this event as an EOF token. Let the caller
// that put us in raw mode handle the event.
if (isLexingRawMode()) {
@@ -1246,23 +1323,44 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
FormTokenWithChars(Result, BufferEnd, tok::eof);
return true;
}
-
- // Otherwise, issue diagnostics for unterminated #if and missing newline.
+ // Otherwise, check if we are code-completing, then issue diagnostics for
+ // unterminated #if and missing newline.
+
+ if (IsEofCodeCompletion) {
+ bool isIntendedFile = true;
+ if (PP && FileLoc.isFileID()) {
+ SourceManager &SM = PP->getSourceManager();
+ isIntendedFile = SM.isTruncatedFile(SM.getFileID(FileLoc));
+ }
+
+ if (isIntendedFile) {
+ // We're at the end of the file, but we've been asked to consider the
+ // end of the file to be a code-completion token. Return the
+ // code-completion token.
+ Result.startToken();
+ FormTokenWithChars(Result, CurPtr, tok::code_completion);
+
+ // Only do the eof -> code_completion translation once.
+ IsEofCodeCompletion = false;
+ return true;
+ }
+ }
+
// If we are in a #if directive, emit an error.
while (!ConditionalStack.empty()) {
PP->Diag(ConditionalStack.back().IfLoc,
diag::err_pp_unterminated_conditional);
ConditionalStack.pop_back();
}
-
+
// C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
// a pedwarn.
if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
Diag(BufferEnd, diag::ext_no_newline_eof)
<< CodeModificationHint::CreateInsertion(getSourceLocation(BufferEnd),
"\n");
-
+
BufferPtr = CurPtr;
// Finally, let the preprocessor handle this.
@@ -1275,27 +1373,27 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
-
+
// Switch to 'skipping' mode. This will ensure that we can lex a token
// without emitting diagnostics, disables macro expansion, and will cause EOF
// to return an EOF token instead of popping the include stack.
LexingRawMode = true;
-
+
// Save state that can be changed while lexing so that we can restore it.
const char *TmpBufferPtr = BufferPtr;
bool inPPDirectiveMode = ParsingPreprocessorDirective;
-
+
Token Tok;
Tok.startToken();
LexTokenInternal(Tok);
-
+
// Restore state that may have changed.
BufferPtr = TmpBufferPtr;
ParsingPreprocessorDirective = inPPDirectiveMode;
-
+
// Restore the lexer back to non-skipping mode.
LexingRawMode = false;
-
+
if (Tok.is(tok::eof))
return 2;
return Tok.is(tok::l_paren);
@@ -1304,17 +1402,15 @@ unsigned Lexer::isNextPPTokenLParen() {
/// LexTokenInternal - This implements a simple C family lexer. It is an
/// extremely performance critical piece of code. This assumes that the buffer
-/// has a null character at the end of the file. Return true if an error
-/// occurred and compilation should terminate, false if normal. This returns a
-/// preprocessing token, not a normal token, as such, it is an internal
-/// interface. It assumes that the Flags of result have been cleared before
-/// calling this.
+/// has a null character at the end of the file. This returns a preprocessing
+/// token, not a normal token, as such, it is an internal interface. It assumes
+/// that the Flags of result have been cleared before calling this.
void Lexer::LexTokenInternal(Token &Result) {
LexNextToken:
// New token, can't need cleaning yet.
Result.clearFlag(Token::NeedsCleaning);
Result.setIdentifierInfo(0);
-
+
// CurPtr - Cache BufferPtr in an automatic variable.
const char *CurPtr = BufferPtr;
@@ -1323,7 +1419,7 @@ LexNextToken:
++CurPtr;
while ((*CurPtr == ' ') || (*CurPtr == '\t'))
++CurPtr;
-
+
// If we are keeping whitespace and other tokens, just return what we just
// skipped. The next lexer invocation will return the token after the
// whitespace.
@@ -1331,17 +1427,17 @@ LexNextToken:
FormTokenWithChars(Result, CurPtr, tok::unknown);
return;
}
-
+
BufferPtr = CurPtr;
Result.setFlag(Token::LeadingSpace);
}
-
+
unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
-
+
// Read a character, advancing over it.
char Char = getAndAdvanceChar(CurPtr, Result);
tok::TokenKind Kind;
-
+
switch (Char) {
case 0: // Null.
// Found end of file?
@@ -1354,13 +1450,13 @@ LexNextToken:
assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
return PPCache->Lex(Result);
}
-
+
if (!isLexingRawMode())
Diag(CurPtr-1, diag::null_in_file);
Result.setFlag(Token::LeadingSpace);
if (SkipWhitespace(Result, CurPtr))
return; // KeepWhitespaceMode
-
+
goto LexNextToken; // GCC isn't tail call eliminating.
case '\n':
case '\r':
@@ -1369,13 +1465,13 @@ LexNextToken:
if (ParsingPreprocessorDirective) {
// Done parsing the "line".
ParsingPreprocessorDirective = false;
-
+
// Restore comment saving mode, in case it was disabled for directive.
SetCommentRetentionState(PP->getCommentRetentionState());
-
+
// Since we consumed a newline, we are back at the start of a line.
IsAtStartOfLine = true;
-
+
Kind = tok::eom;
break;
}
@@ -1383,7 +1479,7 @@ LexNextToken:
Result.setFlag(Token::StartOfLine);
// No leading whitespace seen so far.
Result.clearFlag(Token::LeadingSpace);
-
+
if (SkipWhitespace(Result, CurPtr))
return; // KeepWhitespaceMode
goto LexNextToken; // GCC isn't tail call eliminating.
@@ -1398,7 +1494,7 @@ LexNextToken:
SkipIgnoredUnits:
CurPtr = BufferPtr;
-
+
// If the next token is obviously a // or /* */ comment, skip it efficiently
// too (without going through the big switch stmt).
if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
@@ -1420,7 +1516,7 @@ LexNextToken:
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
return LexNumericConstant(Result, CurPtr);
-
+
case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
@@ -1435,7 +1531,7 @@ LexNextToken:
if (Char == '\'')
return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
// FALL THROUGH, treating L like the start of an identifier.
-
+
// C99 6.4.2: Identifiers.
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
@@ -1458,10 +1554,10 @@ LexNextToken:
MIOpt.ReadToken();
return LexIdentifier(Result, CurPtr);
}
-
+
Kind = tok::unknown;
break;
-
+
// C99 6.4.4: Character Constants.
case '\'':
// Notify MIOpt that we read a non-whitespace/non-comment token.
@@ -1527,7 +1623,7 @@ LexNextToken:
Kind = tok::amp;
}
break;
- case '*':
+ case '*':
if (getCharAndSize(CurPtr, SizeTmp) == '=') {
Kind = tok::starequal;
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
@@ -1552,7 +1648,7 @@ LexNextToken:
if (Char == '-') { // --
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
Kind = tok::minusminus;
- } else if (Char == '>' && Features.CPlusPlus &&
+ } else if (Char == '>' && Features.CPlusPlus &&
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
SizeTmp2, Result);
@@ -1593,20 +1689,20 @@ LexNextToken:
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') {
if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
return; // KeepCommentMode
-
+
// It is common for the tokens immediately after a // comment to be
// whitespace (indentation for the next line). Instead of going through
// the big switch, handle it efficiently now.
goto SkipIgnoredUnits;
}
}
-
+
if (Char == '*') { // /**/ comment.
if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
return; // KeepCommentMode
goto LexNextToken; // GCC isn't tail call eliminating.
}
-
+
if (Char == '=') {
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
Kind = tok::slashequal;
@@ -1642,7 +1738,7 @@ LexNextToken:
if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) {
FormTokenWithChars(Result, CurPtr, tok::hash);
PP->HandleDirective(Result);
-
+
// As an optimization, if the preprocessor didn't switch lexers, tail
// recurse.
if (PP->isCurrentLexer(this)) {
@@ -1655,10 +1751,10 @@ LexNextToken:
}
goto LexNextToken; // GCC isn't tail call eliminating.
}
-
+
return PP->Lex(Result);
}
-
+
Kind = tok::hash;
}
} else {
@@ -1695,7 +1791,7 @@ LexNextToken:
if (Char == '=') {
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
Kind = tok::greaterequal;
- } else if (Char == '>' &&
+ } else if (Char == '>' &&
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
SizeTmp2, Result);
@@ -1736,7 +1832,7 @@ LexNextToken:
} else if (Features.CPlusPlus && Char == ':') {
Kind = tok::coloncolon;
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
- } else {
+ } else {
Kind = tok::colon;
}
break;
@@ -1748,7 +1844,7 @@ LexNextToken:
if (Char == '=') {
Kind = tok::equalequal;
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
- } else {
+ } else {
Kind = tok::equal;
}
break;
@@ -1773,7 +1869,7 @@ LexNextToken:
if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) {
FormTokenWithChars(Result, CurPtr, tok::hash);
PP->HandleDirective(Result);
-
+
// As an optimization, if the preprocessor didn't switch lexers, tail
// recurse.
if (PP->isCurrentLexer(this)) {
@@ -1788,7 +1884,7 @@ LexNextToken:
}
return PP->Lex(Result);
}
-
+
Kind = tok::hash;
}
break;
@@ -1800,7 +1896,7 @@ LexNextToken:
else
Kind = tok::unknown;
break;
-
+
case '\\':
// FIXME: UCN's.
// FALL THROUGH.
@@ -1808,7 +1904,7 @@ LexNextToken:
Kind = tok::unknown;
break;
}
-
+
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();