//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Record tokens that a preprocessor emits and define operations to map between
// the tokens written in a file and tokens produced by the preprocessor.
//
// When running the compiler, there are two token streams we are interested in:
//   - "spelled" tokens directly correspond to a substring written in some
//     source file.
//   - "expanded" tokens represent the result of preprocessing, parses consumes
//     this token stream to produce the AST.
//
// Expanded tokens correspond directly to locations found in the AST, which
// makes it possible to find the subranges of the token stream covered by
// various AST nodes. Spelled tokens correspond directly to the source code
// written by the user.
//
// To allow composing these two use-cases, we also define operations that map
// between expanded tokens and the spelled tokens that produced them (macro
// calls, directives, etc).
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H

#include "clang/Basic/FileManager.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <tuple>

namespace clang {
class Preprocessor;

namespace syntax {

/// A half-open character range inside a particular file; the start offset is
/// included in the range and the end offset is excluded from it.
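///
/// For illustration (a sketch only; SM is assumed to be a valid SourceManager
/// and Loc a file location):
///     FileRange R(SM, Loc, /*Length=*/3);
///     R.contains(R.beginOffset()); // true, the start offset is included.
///     R.contains(R.endOffset());   // false, the end offset is excluded.
///     R.touches(R.endOffset());    // true, the end offset is still touched.
///     R.text(SM);                  // the 3 characters starting at Loc.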
struct FileRange {
  /// EXPECTS: File.isValid() && Begin <= End.
  FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
  FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
  ///          are the same.
  FileRange(const SourceManager &SM, SourceLocation BeginLoc,
            SourceLocation EndLoc);

  FileID file() const { return File; }
  /// Start offset (inclusive) in the corresponding file.
  unsigned beginOffset() const { return Begin; }
  /// End offset (exclusive) in the corresponding file.
  unsigned endOffset() const { return End; }

  unsigned length() const { return End - Begin; }

  /// Check if \p Offset is inside the range.
  bool contains(unsigned Offset) const {
    return Begin <= Offset && Offset < End;
  }
  /// Check if \p Offset is inside the range or equal to its end offset.
  bool touches(unsigned Offset) const {
    return Begin <= Offset && Offset <= End;
  }

  /// Gets the substring that this FileRange refers to.
  llvm::StringRef text(const SourceManager &SM) const;

  friend bool operator==(const FileRange &L, const FileRange &R) {
    return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
  }
  friend bool operator!=(const FileRange &L, const FileRange &R) {
    return !(L == R);
  }

private:
  FileID File;
  unsigned Begin;
  unsigned End;
};

/// For debugging purposes.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);

/// A token coming directly from a file or from a macro invocation. Has just
/// enough information to locate the token in the source code.
/// Can represent both expanded and spelled tokens.
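///
/// For illustration (a sketch only; Tok is assumed to be a syntax::Token
/// obtained from a TokenBuffer and SM the corresponding SourceManager):
///     Tok.kind();    // e.g. tok::identifier.
///     Tok.text(SM);  // the spelling, e.g. "foo".
///     Tok.range(SM); // the covered FileRange, valid only for file tokens.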
class Token {
public:
  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
  /// EXPECTS: clang::Token is not an annotation token.
  explicit Token(const clang::Token &T);

  tok::TokenKind kind() const { return Kind; }
  /// Location of the first character of a token.
  SourceLocation location() const { return Location; }
  /// Location right after the last character of a token.
  SourceLocation endLocation() const {
    return Location.getLocWithOffset(Length);
  }
  unsigned length() const { return Length; }

  /// Get the substring covered by the token. Note that this will include all
  /// digraphs, newline continuations, etc. E.g. the tokens for 'int' and
  ///    in\
  ///    t
  /// both have the same kind tok::kw_int, but results of text() are different.
  llvm::StringRef text(const SourceManager &SM) const;

  /// Gets the file range covered by this token.
  /// EXPECTS: token comes from a file, not from a macro expansion.
  FileRange range(const SourceManager &SM) const;

  /// Given two tokens inside the same file, returns a file range that starts at
  /// \p First and ends at \p Last.
  /// EXPECTS: First and Last are file tokens from the same file, Last starts
  ///          after First.
  static FileRange range(const SourceManager &SM, const syntax::Token &First,
                         const syntax::Token &Last);

  std::string dumpForTests(const SourceManager &SM) const;
  /// For debugging purposes.
  std::string str() const;

private:
  SourceLocation Location;
  unsigned Length;
  tok::TokenKind Kind;
};
/// For debugging purposes. Equivalent to a call to Token::str().
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);

/// A list of tokens obtained by preprocessing a text buffer and operations to
/// map between the expanded and spelled tokens, i.e. TokenBuffer has
/// information about two token streams:
///    1. Expanded tokens: tokens produced by the preprocessor after all macro
///       replacements,
///    2. Spelled tokens: corresponding directly to the source code of a file
///       before any macro replacements occurred.
/// Here's an example to illustrate a difference between those two:
///     #define FOO 10
///     int a = FOO;
///
/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
/// Expanded tokens are {'int','a','=','10',';','eof'}.
///
/// Note that the expanded token stream has a tok::eof token at the end, while
/// the spelled tokens never store an 'eof' token.
///
/// The full list of expanded tokens can be obtained with expandedTokens().
/// Spelled tokens for each of the files can be obtained via
/// spelledTokens(FileID).
///
/// To map between the expanded and spelled tokens use spelledForExpanded().
///
/// To build a token buffer use the TokenCollector class. You can also compute
/// the spelled tokens of a file using the tokenize() helper.
///
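/// A rough usage sketch (illustrative only; Tokens is assumed to be a
/// TokenBuffer built by a TokenCollector, see below):
///     llvm::ArrayRef<syntax::Token> Expanded = Tokens.expandedTokens();
///     // Map the first few expanded tokens back to the spelled tokens that
///     // produced them, if such a range exists.
///     auto Spelled = Tokens.spelledForExpanded(Expanded.take_front(3));
///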
/// FIXME: allow to map from spelled to expanded tokens when use-case shows up.
/// FIXME: allow mappings into macro arguments.
class TokenBuffer {
public:
  TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
  /// All tokens produced by the preprocessor after all macro replacements,
  /// directives, etc. Source locations found in the clang AST will always
  /// point to one of these tokens.
  /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
  ///        into two '>' tokens by the parser. However, TokenBuffer currently
  ///        keeps it as a single '>>' token.
  llvm::ArrayRef<syntax::Token> expandedTokens() const {
    return ExpandedTokens;
  }

  /// Find the subrange of spelled tokens that produced the corresponding \p
  /// Expanded tokens.
  ///
  /// EXPECTS: \p Expanded is a subrange of expandedTokens().
  ///
  /// Will fail if the expanded tokens do not correspond to a
  /// sequence of spelled tokens. E.g. for the following example:
  ///
  ///   #define FIRST f1 f2 f3
  ///   #define SECOND s1 s2 s3
  ///
  ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
  ///
  /// the results would be:
  ///   expanded   => spelled
  ///   ------------------------
  ///            a => a
  ///     s1 s2 s3 => SECOND
  ///   a f1 f2 f3 => a FIRST
  ///         a f1 => can't map
  ///        s1 s2 => can't map
  ///
  /// If \p Expanded is empty, the returned value is llvm::None.
  /// Complexity is logarithmic.
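  ///
  /// For instance (an illustrative sketch; Tokens is assumed to be a populated
  /// TokenBuffer and Range a subrange of Tokens.expandedTokens()):
  ///     if (auto Spelled = Tokens.spelledForExpanded(Range))
  ///       llvm::errs() << Spelled->front().text(Tokens.sourceManager());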
  llvm::Optional<llvm::ArrayRef<syntax::Token>>
  spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;

  /// An expansion produced by the preprocessor; includes macro expansions and
  /// preprocessor directives. The preprocessor always maps a non-empty range
  /// of spelled tokens to a (possibly empty) range of expanded tokens. Here
  /// are a few examples of expansions:
  ///    #pragma once      // Expands to an empty range.
  ///    #define FOO 1 2 3 // Expands to an empty range.
  ///    FOO               // Expands to "1 2 3".
  /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
  ///    #include <vector> // Expands to tokens produced by the include.
  struct Expansion {
    llvm::ArrayRef<syntax::Token> Spelled;
    llvm::ArrayRef<syntax::Token> Expanded;
  };
  /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
  /// a preprocessor directive), returns the subrange of expanded tokens that
  /// the mapping produces.
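  ///
  /// For example (an illustrative sketch; Tokens is assumed to be a populated
  /// TokenBuffer and FID a file it knows about):
  ///     for (const syntax::Token &T : Tokens.spelledTokens(FID))
  ///       if (auto E = Tokens.expansionStartingAt(&T))
  ///         llvm::errs() << E->Spelled.front().text(Tokens.sourceManager());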
  llvm::Optional<Expansion>
  expansionStartingAt(const syntax::Token *Spelled) const;

  /// Lexed tokens of a file before preprocessing. E.g. for the following input
  ///     #define DECL(name) int name = 10
  ///     DECL(a);
  /// spelledTokens() returns
  ///    {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
  ///     "DECL", "(", "a", ")", ";"}.
  /// FIXME: we do not yet store tokens of directives, like #include, #define,
  ///        #pragma, etc.
  llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;

  /// Get all tokens that expand a macro in \p FID. For the following input
  ///     #define FOO B
  ///     #define FOO2(X) int X
  ///     FOO2(XY)
  ///     int B;
  ///     FOO;
  /// macroExpansions() returns {"FOO2", "FOO"} (from lines 3 and 5,
  /// respectively).
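  ///
  /// A possible use (sketch only; Tokens is a populated TokenBuffer): print
  /// the names of all macros expanded in the file:
  ///     for (const syntax::Token *Name : Tokens.macroExpansions(FID))
  ///       llvm::errs() << Name->text(Tokens.sourceManager()) << "\n";
  ///     // Prints "FOO2" and "FOO" for the input above.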
  std::vector<const syntax::Token *> macroExpansions(FileID FID) const;

  const SourceManager &sourceManager() const { return *SourceMgr; }

  std::string dumpForTests() const;

private:
  /// Describes a mapping between a continuous subrange of spelled tokens and
  /// expanded tokens. Represents macro expansions, preprocessor directives,
  /// conditionally disabled pp regions, etc.
  ///   #define FOO 1+2
  ///   #define BAR(a) a + 1
  ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
  ///   BAR(1) // invocation #2, tokens = {'1', '+', '1'},
  ///                            macroTokens = {'BAR', '(', '1', ')'}.
  struct Mapping {
    // Positions in the corresponding spelled token stream. The corresponding
    // range is never empty.
    unsigned BeginSpelled = 0;
    unsigned EndSpelled = 0;
    // Positions in the expanded token stream. The corresponding range can be
    // empty.
    unsigned BeginExpanded = 0;
    unsigned EndExpanded = 0;

    /// For debugging purposes.
    std::string str() const;
  };
  /// Spelled tokens of the file with information about the subranges.
  struct MarkedFile {
    /// Lexed, but not preprocessed, tokens of the file. These map directly to
    /// text in the corresponding files and include tokens of all preprocessor
    /// directives.
    /// FIXME: spelled tokens don't change across FileID that map to the same
    ///        FileEntry. We could consider deduplicating them to save memory.
    std::vector<syntax::Token> SpelledTokens;
    /// A sorted list to convert between the spelled and expanded token streams.
    std::vector<Mapping> Mappings;
    /// The first expanded token produced for this FileID.
    unsigned BeginExpanded = 0;
    unsigned EndExpanded = 0;
  };

  friend class TokenCollector;

  /// Maps a single expanded token to its spelled counterpart or a mapping that
  /// produced it.
  std::pair<const syntax::Token *, const Mapping *>
  spelledForExpandedToken(const syntax::Token *Expanded) const;

  /// Token stream produced after preprocessing; conceptually this captures the
  /// same stream as 'clang -E' (excluding the preprocessor directives like
  /// #file, etc.).
  std::vector<syntax::Token> ExpandedTokens;
  llvm::DenseMap<FileID, MarkedFile> Files;
  // The value is never null; we use a pointer instead of a reference to avoid
  // disabling the implicit assignment operator.
  const SourceManager *SourceMgr;
};

/// Lex the text buffer corresponding to \p FID in raw mode and record the
/// resulting spelled tokens. Does minimal post-processing on raw identifiers,
/// setting the appropriate token kind (instead of the raw_identifier reported
/// by the lexer in raw mode). This is a very low-level function; most users
/// should prefer to use TokenCollector. Lexing in raw mode produces wildly
/// different results from what one might expect when running a C++ frontend,
/// e.g. the preprocessor does not run at all.
/// The result will *not* have an 'eof' token at the end.
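///
/// For example (an illustrative sketch; SM and LangOpts are assumed to come
/// from a CompilerInstance or to be constructed manually):
///     std::vector<syntax::Token> Toks =
///         tokenize(SM.getMainFileID(), SM, LangOpts);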
std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
                                    const LangOptions &LO);

/// Collects tokens for the main file while running the frontend action. An
/// instance of this object should be created on
/// FrontendAction::BeginSourceFile() and the results should be consumed after
/// FrontendAction::Execute() finishes.
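///
/// A rough sketch of the intended flow (illustrative only; CI is assumed to be
/// the CompilerInstance driving the action):
///     // In BeginSourceFile() or CreateASTConsumer():
///     auto Collector =
///         std::make_unique<TokenCollector>(CI.getPreprocessor());
///     // After Execute() finishes:
///     TokenBuffer Tokens = std::move(*Collector).consume();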
class TokenCollector {
public:
  /// Adds the hooks to collect the tokens. Should be called before the
  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
  /// CreateASTConsumer().
  TokenCollector(Preprocessor &P);

  /// Finalizes token collection. Should be called after preprocessing is
  /// finished, i.e. after running Execute().
  LLVM_NODISCARD TokenBuffer consume() &&;

private:
  /// Maps from the start to the end spelling location of each transformation
  /// performed by the preprocessor. These include:
  ///   1. range from '#' to the last token in the line for PP directives,
  ///   2. macro name and arguments for macro expansions.
  /// Note that we record only top-level macro expansions, intermediate
  /// expansions (e.g. inside macro arguments) are ignored.
  ///
  /// Used to find correct boundaries of macro calls and directives when
  /// building mappings from spelled to expanded tokens.
  ///
  /// Logically, at each point of the preprocessor execution there is a stack of
  /// macro expansions being processed and we could use it to recover the
  /// location information we need. However, the public preprocessor API only
  /// exposes the points when macro expansions start (when we push a macro onto
  /// the stack) and not when they end (when we pop a macro from the stack).
  /// To work around this limitation, we rely on source location information
  /// stored in this map.
  using PPExpansions = llvm::DenseMap</*SourceLocation*/ int, SourceLocation>;
  class Builder;
  class CollectPPExpansions;

  std::vector<syntax::Token> Expanded;
  // FIXME: we only store macro expansions; also add directives (#pragma, etc.)
  PPExpansions Expansions;
  Preprocessor &PP;
  CollectPPExpansions *Collector;
};

} // namespace syntax
} // namespace clang

#endif