contrib/llvm-project/clang/include/clang/Tooling/Syntax/Tokens.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463

//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Record tokens that a preprocessor emits and define operations to map between
// the tokens written in a file and tokens produced by the preprocessor.
//
// When running the compiler, there are two token streams we are interested in:
//   - "spelled" tokens directly correspond to a substring written in some
//     source file.
//   - "expanded" tokens represent the result of preprocessing, parses consumes
//     this token stream to produce the AST.
//
// Expanded tokens correspond directly to locations found in the AST, allowing
// to find subranges of the token stream covered by various AST nodes. Spelled
// tokens correspond directly to the source code written by the user.
//
// To allow composing these two use-cases, we also define operations that map
// between expanded and spelled tokens that produced them (macro calls,
// directives, etc).
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H

#include "clang/Basic/FileManager.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <tuple>

namespace clang {
class Preprocessor;

namespace syntax {

/// A half-open character range inside a particular file, the start offset is
/// included and the end offset is excluded from the range.
struct FileRange {
  /// EXPECTS: File.isValid() && Begin <= End.
  FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
  FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
  ///          are the same.
  FileRange(const SourceManager &SM, SourceLocation BeginLoc,
            SourceLocation EndLoc);

  FileID file() const { return File; }
  /// Start is a start offset (inclusive) in the corresponding file.
  unsigned beginOffset() const { return Begin; }
  /// End offset (exclusive) in the corresponding file.
  unsigned endOffset() const { return End; }

  unsigned length() const { return End - Begin; }

  /// Check if \p Offset is inside the range.
  bool contains(unsigned Offset) const {
    return Begin <= Offset && Offset < End;
  }
  /// Check \p Offset is inside the range or equal to its endpoint.
  bool touches(unsigned Offset) const {
    return Begin <= Offset && Offset <= End;
  }

  /// Gets the substring that this FileRange refers to.
  llvm::StringRef text(const SourceManager &SM) const;

  /// Convert to the clang range. The returned range is always a char range,
  /// never a token range.
  CharSourceRange toCharRange(const SourceManager &SM) const;

  friend bool operator==(const FileRange &L, const FileRange &R) {
    return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
  }
  friend bool operator!=(const FileRange &L, const FileRange &R) {
    return !(L == R);
  }

private:
  FileID File;
  unsigned Begin;
  unsigned End;
};

/// For debugging purposes.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);

/// A token coming directly from a file or from a macro invocation. Has just
/// enough information to locate the token in the source code.
/// Can represent both expanded and spelled tokens.
class Token {
public:
  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
  /// EXPECTS: clang::Token is not an annotation token.
  explicit Token(const clang::Token &T);

  tok::TokenKind kind() const { return Kind; }
  /// Location of the first character of a token.
  SourceLocation location() const { return Location; }
  /// Location right after the last character of a token.
  SourceLocation endLocation() const {
    return Location.getLocWithOffset(Length);
  }
  unsigned length() const { return Length; }

  /// Get the substring covered by the token. Note that will include all
  /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
  ///    in\
  ///    t
  /// both have the same kind tok::kw_int, but results of text() are different.
  llvm::StringRef text(const SourceManager &SM) const;

  /// Gets a range of this token.
  /// EXPECTS: token comes from a file, not from a macro expansion.
  FileRange range(const SourceManager &SM) const;

  /// Given two tokens inside the same file, returns a file range that starts at
  /// \p First and ends at \p Last.
  /// EXPECTS: First and Last are file tokens from the same file, Last starts
  ///          after First.
  static FileRange range(const SourceManager &SM, const syntax::Token &First,
                         const syntax::Token &Last);

  std::string dumpForTests(const SourceManager &SM) const;
  /// For debugging purposes.
  std::string str() const;

private:
  SourceLocation Location;
  unsigned Length;
  tok::TokenKind Kind;
};
/// For debugging purposes. Equivalent to a call to Token::str().
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);

/// A list of tokens obtained by preprocessing a text buffer and operations to
/// map between the expanded and spelled tokens, i.e. TokenBuffer has
/// information about two token streams:
///    1. Expanded tokens: tokens produced by the preprocessor after all macro
///       replacements,
///    2. Spelled tokens: corresponding directly to the source code of a file
///       before any macro replacements occurred.
/// Here's an example to illustrate a difference between those two:
///     #define FOO 10
///     int a = FOO;
///
/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
/// Expanded tokens are {'int','a','=','10',';','eof'}.
///
/// Note that the expanded token stream has a tok::eof token at the end, the
/// spelled tokens never store a 'eof' token.
///
/// The full list expanded tokens can be obtained with expandedTokens(). Spelled
/// tokens for each of the files can be obtained via spelledTokens(FileID).
///
/// To map between the expanded and spelled tokens use findSpelledByExpanded().
///
/// To build a token buffer use the TokenCollector class. You can also compute
/// the spelled tokens of a file using the tokenize() helper.
///
/// FIXME: allow mappings into macro arguments.
class TokenBuffer {
public:
  TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}

  TokenBuffer(TokenBuffer &&) = default;
  TokenBuffer(const TokenBuffer &) = delete;
  TokenBuffer &operator=(TokenBuffer &&) = default;
  TokenBuffer &operator=(const TokenBuffer &) = delete;

  /// All tokens produced by the preprocessor after all macro replacements,
  /// directives, etc. Source locations found in the clang AST will always
  /// point to one of these tokens.
  /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
  /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
  ///        into two '>' tokens by the parser. However, TokenBuffer currently
  ///        keeps it as a single '>>' token.
  llvm::ArrayRef<syntax::Token> expandedTokens() const {
    return ExpandedTokens;
  }

  /// Builds a cache to make future calls to expandedToken(SourceRange) faster.
  /// Creates an index only once. Further calls to it will be no-op.
  void indexExpandedTokens();

  /// Returns the subrange of expandedTokens() corresponding to the closed
  /// token range R.
  /// Consider calling indexExpandedTokens() before for faster lookups.
  llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const;

  /// Returns the subrange of spelled tokens corresponding to AST node spanning
  /// \p Expanded. This is the text that should be replaced if a refactoring
  /// were to rewrite the node. If \p Expanded is empty, the returned value is
  /// llvm::None.
  ///
  /// Will fail if the expanded tokens do not correspond to a sequence of
  /// spelled tokens. E.g. for the following example:
  ///
  ///   #define FIRST f1 f2 f3
  ///   #define SECOND s1 s2 s3
  ///   #define ID2(X, Y) X Y
  ///
  ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
  ///   d ID2(e f g, h) i  // expanded tokens are: d e f g h i
  ///
  /// the results would be:
  ///   expanded   => spelled
  ///   ------------------------
  ///            a => a
  ///     s1 s2 s3 => SECOND
  ///   a f1 f2 f3 => a FIRST
  ///         a f1 => can't map
  ///        s1 s2 => can't map
  ///         e f  => e f
  ///         g h  => can't map
  ///
  /// EXPECTS: \p Expanded is a subrange of expandedTokens().
  /// Complexity is logarithmic.
  llvm::Optional<llvm::ArrayRef<syntax::Token>>
  spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;

  /// Find the subranges of expanded tokens, corresponding to \p Spelled.
  ///
  /// Some spelled tokens may not be present in the expanded token stream, so
  /// this function can return an empty vector, e.g. for tokens of macro
  /// directives or disabled preprocessor branches.
  ///
  /// Some spelled tokens can be duplicated in the expanded token stream
  /// multiple times and this function will return multiple results in those
  /// cases. This happens when \p Spelled is inside a macro argument.
  ///
  /// FIXME: return correct results on macro arguments. For now, we return an
  ///        empty list.
  ///
  /// (!) will return empty vector on tokens from #define body:
  /// E.g. for the following example:
  ///
  ///   #define FIRST(A) f1 A = A f2
  ///   #define SECOND s
  ///
  ///   a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s
  /// The results would be
  ///   spelled           => expanded
  ///   ------------------------
  ///   #define FIRST     => {}
  ///   a FIRST(arg)      => {a f1 arg = arg f2}
  ///   arg               => {arg, arg} // arg #1 is before `=` and arg #2 is
  ///                                   // after `=` in the expanded tokens.
  llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1>
  expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;

  /// An expansion produced by the preprocessor, includes macro expansions and
  /// preprocessor directives. Preprocessor always maps a non-empty range of
  /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a
  /// few examples of expansions:
  ///    #pragma once      // Expands to an empty range.
  ///    #define FOO 1 2 3 // Expands an empty range.
  ///    FOO               // Expands to "1 2 3".
  /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
  ///    #include <vector> // Expands to tokens produced by the include.
  struct Expansion {
    llvm::ArrayRef<syntax::Token> Spelled;
    llvm::ArrayRef<syntax::Token> Expanded;
  };
  /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
  /// a preprocessor directive) return the subrange of expanded tokens that the
  /// macro expands to.
  llvm::Optional<Expansion>
  expansionStartingAt(const syntax::Token *Spelled) const;
  /// Returns all expansions (partially) expanded from the specified tokens.
  /// This is the expansions whose Spelled range intersects \p Spelled.
  std::vector<Expansion>
  expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const;

  /// Lexed tokens of a file before preprocessing. E.g. for the following input
  ///     #define DECL(name) int name = 10
  ///     DECL(a);
  /// spelledTokens() returns
  ///    {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
  ///     "DECL", "(", "a", ")", ";"}
  llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;

  /// Returns the spelled Token starting at Loc, if there are no such tokens
  /// returns nullptr.
  const syntax::Token *spelledTokenAt(SourceLocation Loc) const;

  /// Get all tokens that expand a macro in \p FID. For the following input
  ///     #define FOO B
  ///     #define FOO2(X) int X
  ///     FOO2(XY)
  ///     int B;
  ///     FOO;
  /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5
  /// respecitvely).
  std::vector<const syntax::Token *> macroExpansions(FileID FID) const;

  const SourceManager &sourceManager() const { return *SourceMgr; }

  std::string dumpForTests() const;

private:
  /// Describes a mapping between a continuous subrange of spelled tokens and
  /// expanded tokens. Represents macro expansions, preprocessor directives,
  /// conditionally disabled pp regions, etc.
  ///   #define FOO 1+2
  ///   #define BAR(a) a + 1
  ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
  ///   BAR(1) // invocation #2, tokens = {'a', '+', '1'},
  ///                            macroTokens = {'BAR', '(', '1', ')'}.
  struct Mapping {
    // Positions in the corresponding spelled token stream. The corresponding
    // range is never empty.
    unsigned BeginSpelled = 0;
    unsigned EndSpelled = 0;
    // Positions in the expanded token stream. The corresponding range can be
    // empty.
    unsigned BeginExpanded = 0;
    unsigned EndExpanded = 0;

    /// For debugging purposes.
    std::string str() const;
  };
  /// Spelled tokens of the file with information about the subranges.
  struct MarkedFile {
    /// Lexed, but not preprocessed, tokens of the file. These map directly to
    /// text in the corresponding files and include tokens of all preprocessor
    /// directives.
    /// FIXME: spelled tokens don't change across FileID that map to the same
    ///        FileEntry. We could consider deduplicating them to save memory.
    std::vector<syntax::Token> SpelledTokens;
    /// A sorted list to convert between the spelled and expanded token streams.
    std::vector<Mapping> Mappings;
    /// The first expanded token produced for this FileID.
    unsigned BeginExpanded = 0;
    unsigned EndExpanded = 0;
  };

  friend class TokenCollector;

  /// Maps a single expanded token to its spelled counterpart or a mapping that
  /// produced it.
  std::pair<const syntax::Token *, const Mapping *>
  spelledForExpandedToken(const syntax::Token *Expanded) const;

  /// Returns a mapping starting before \p Spelled token, or nullptr if no
  /// such mapping exists.
  static const Mapping *
  mappingStartingBeforeSpelled(const MarkedFile &F,
                               const syntax::Token *Spelled);

  /// Convert a private Mapping to a public Expansion.
  Expansion makeExpansion(const MarkedFile &, const Mapping &) const;
  /// Returns the file that the Spelled tokens are taken from.
  /// Asserts that they are non-empty, from a tracked file, and in-bounds.
  const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;

  /// Token stream produced after preprocessing, conceputally this captures the
  /// same stream as 'clang -E' (excluding the preprocessor directives like
  /// #file, etc.).
  std::vector<syntax::Token> ExpandedTokens;
  // Index of ExpandedTokens for faster lookups by SourceLocation.
  llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex;
  llvm::DenseMap<FileID, MarkedFile> Files;
  // The value is never null, pointer instead of reference to avoid disabling
  // implicit assignment operator.
  const SourceManager *SourceMgr;
};

/// The spelled tokens that overlap or touch a spelling location Loc.
/// This always returns 0-2 tokens.
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens);

/// The identifier token that overlaps or touches a spelling location Loc.
/// If there is none, returns nullptr.
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          llvm::ArrayRef<syntax::Token> Tokens);
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          const syntax::TokenBuffer &Tokens);

/// Lex the text buffer, corresponding to \p FID, in raw mode and record the
/// resulting spelled tokens. Does minimal post-processing on raw identifiers,
/// setting the appropriate token kind (instead of the raw_identifier reported
/// by lexer in raw mode). This is a very low-level function, most users should
/// prefer to use TokenCollector. Lexing in raw mode produces wildly different
/// results from what one might expect when running a C++ frontend, e.g.
/// preprocessor does not run at all.
/// The result will *not* have a 'eof' token at the end.
std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
                                    const LangOptions &LO);
/// Similar to one above, instead of whole file tokenizes a part of it. Note
/// that, the first token might be incomplete if FR.startOffset is not at the
/// beginning of a token, and the last token returned will start before the
/// FR.endOffset but might end after it.
std::vector<syntax::Token>
tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);

/// Collects tokens for the main file while running the frontend action. An
/// instance of this object should be created on
/// FrontendAction::BeginSourceFile() and the results should be consumed after
/// FrontendAction::Execute() finishes.
class TokenCollector {
public:
  /// Adds the hooks to collect the tokens. Should be called before the
  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
  /// CreateASTConsumer().
  TokenCollector(Preprocessor &P);

  /// Finalizes token collection. Should be called after preprocessing is
  /// finished, i.e. after running Execute().
  LLVM_NODISCARD TokenBuffer consume() &&;

private:
  /// Maps from a start to an end spelling location of transformations
  /// performed by the preprocessor. These include:
  ///   1. range from '#' to the last token in the line for PP directives,
  ///   2. macro name and arguments for macro expansions.
  /// Note that we record only top-level macro expansions, intermediate
  /// expansions (e.g. inside macro arguments) are ignored.
  ///
  /// Used to find correct boundaries of macro calls and directives when
  /// building mappings from spelled to expanded tokens.
  ///
  /// Logically, at each point of the preprocessor execution there is a stack of
  /// macro expansions being processed and we could use it to recover the
  /// location information we need. However, the public preprocessor API only
  /// exposes the points when macro expansions start (when we push a macro onto
  /// the stack) and not when they end (when we pop a macro from the stack).
  /// To workaround this limitation, we rely on source location information
  /// stored in this map.
  using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>;
  class Builder;
  class CollectPPExpansions;

  std::vector<syntax::Token> Expanded;
  // FIXME: we only store macro expansions, also add directives(#pragma, etc.)
  PPExpansions Expansions;
  Preprocessor &PP;
  CollectPPExpansions *Collector;
};

} // namespace syntax
} // namespace clang

#endif