//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Record tokens that a preprocessor emits and define operations to map between
// the tokens written in a file and tokens produced by the preprocessor.
//
// When running the compiler, there are two token streams we are interested in:
//   - "spelled" tokens directly correspond to a substring written in some
//     source file.
//   - "expanded" tokens represent the result of preprocessing, the parser
//     consumes this token stream to produce the AST.
//
// Expanded tokens correspond directly to locations found in the AST, allowing
// to find subranges of the token stream covered by various AST nodes. Spelled
// tokens correspond directly to the source code written by the user.
//
// To allow composing these two use-cases, we also define operations that map
// between expanded and spelled tokens that produced them (macro calls,
// directives, etc).
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H

#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <optional>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

namespace clang {
class Preprocessor;

namespace syntax {

/// A half-open character range inside a particular file, the start offset is
/// included and the end offset is excluded from the range.
struct FileRange { /// EXPECTS: File.isValid() && Begin <= End. FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset); /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(). FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length); /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files /// are the same. FileRange(const SourceManager &SM, SourceLocation BeginLoc, SourceLocation EndLoc); FileID file() const { return File; } /// Start is a start offset (inclusive) in the corresponding file. unsigned beginOffset() const { return Begin; } /// End offset (exclusive) in the corresponding file. unsigned endOffset() const { return End; } unsigned length() const { return End - Begin; } /// Check if \p Offset is inside the range. bool contains(unsigned Offset) const { return Begin <= Offset && Offset < End; } /// Check \p Offset is inside the range or equal to its endpoint. bool touches(unsigned Offset) const { return Begin <= Offset && Offset <= End; } /// Gets the substring that this FileRange refers to. llvm::StringRef text(const SourceManager &SM) const; /// Convert to the clang range. The returned range is always a char range, /// never a token range. CharSourceRange toCharRange(const SourceManager &SM) const; friend bool operator==(const FileRange &L, const FileRange &R) { return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End); } friend bool operator!=(const FileRange &L, const FileRange &R) { return !(L == R); } private: FileID File; unsigned Begin; unsigned End; }; /// For debugging purposes. llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R); /// A token coming directly from a file or from a macro invocation. Has just /// enough information to locate the token in the source code. /// Can represent both expanded and spelled tokens. 
class Token { public: Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind); /// EXPECTS: clang::Token is not an annotation token. explicit Token(const clang::Token &T); tok::TokenKind kind() const { return Kind; } /// Location of the first character of a token. SourceLocation location() const { return Location; } /// Location right after the last character of a token. SourceLocation endLocation() const { return Location.getLocWithOffset(Length); } unsigned length() const { return Length; } /// Get the substring covered by the token. Note that will include all /// digraphs, newline continuations, etc. E.g. tokens for 'int' and /// in\ /// t /// both have the same kind tok::kw_int, but results of text() are different. llvm::StringRef text(const SourceManager &SM) const; /// Gets a range of this token. /// EXPECTS: token comes from a file, not from a macro expansion. FileRange range(const SourceManager &SM) const; /// Given two tokens inside the same file, returns a file range that starts at /// \p First and ends at \p Last. /// EXPECTS: First and Last are file tokens from the same file, Last starts /// after First. static FileRange range(const SourceManager &SM, const syntax::Token &First, const syntax::Token &Last); std::string dumpForTests(const SourceManager &SM) const; /// For debugging purposes. std::string str() const; private: SourceLocation Location; unsigned Length; tok::TokenKind Kind; }; /// For debugging purposes. Equivalent to a call to Token::str(). llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T); /// A list of tokens obtained by preprocessing a text buffer and operations to /// map between the expanded and spelled tokens, i.e. TokenBuffer has /// information about two token streams: /// 1. Expanded tokens: tokens produced by the preprocessor after all macro /// replacements, /// 2. Spelled tokens: corresponding directly to the source code of a file /// before any macro replacements occurred. 
/// Here's an example to illustrate a difference between those two: /// #define FOO 10 /// int a = FOO; /// /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}. /// Expanded tokens are {'int','a','=','10',';','eof'}. /// /// Note that the expanded token stream has a tok::eof token at the end, the /// spelled tokens never store a 'eof' token. /// /// The full list expanded tokens can be obtained with expandedTokens(). Spelled /// tokens for each of the files can be obtained via spelledTokens(FileID). /// /// To map between the expanded and spelled tokens use findSpelledByExpanded(). /// /// To build a token buffer use the TokenCollector class. You can also compute /// the spelled tokens of a file using the tokenize() helper. /// /// FIXME: allow mappings into macro arguments. class TokenBuffer { public: TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {} TokenBuffer(TokenBuffer &&) = default; TokenBuffer(const TokenBuffer &) = delete; TokenBuffer &operator=(TokenBuffer &&) = default; TokenBuffer &operator=(const TokenBuffer &) = delete; /// All tokens produced by the preprocessor after all macro replacements, /// directives, etc. Source locations found in the clang AST will always /// point to one of these tokens. /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()). /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split /// into two '>' tokens by the parser. However, TokenBuffer currently /// keeps it as a single '>>' token. llvm::ArrayRef expandedTokens() const { return ExpandedTokens; } /// Builds a cache to make future calls to expandedToken(SourceRange) faster. /// Creates an index only once. Further calls to it will be no-op. void indexExpandedTokens(); /// Returns the subrange of expandedTokens() corresponding to the closed /// token range R. /// Consider calling indexExpandedTokens() before for faster lookups. 
llvm::ArrayRef expandedTokens(SourceRange R) const; /// Returns the subrange of spelled tokens corresponding to AST node spanning /// \p Expanded. This is the text that should be replaced if a refactoring /// were to rewrite the node. If \p Expanded is empty, the returned value is /// std::nullopt. /// /// Will fail if the expanded tokens do not correspond to a sequence of /// spelled tokens. E.g. for the following example: /// /// #define FIRST f1 f2 f3 /// #define SECOND s1 s2 s3 /// #define ID2(X, Y) X Y /// /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c /// d ID2(e f g, h) i // expanded tokens are: d e f g h i /// /// the results would be: /// expanded => spelled /// ------------------------ /// a => a /// s1 s2 s3 => SECOND /// a f1 f2 f3 => a FIRST /// a f1 => can't map /// s1 s2 => can't map /// e f => e f /// g h => can't map /// /// EXPECTS: \p Expanded is a subrange of expandedTokens(). /// Complexity is logarithmic. std::optional> spelledForExpanded(llvm::ArrayRef Expanded) const; /// Find the subranges of expanded tokens, corresponding to \p Spelled. /// /// Some spelled tokens may not be present in the expanded token stream, so /// this function can return an empty vector, e.g. for tokens of macro /// directives or disabled preprocessor branches. /// /// Some spelled tokens can be duplicated in the expanded token stream /// multiple times and this function will return multiple results in those /// cases. This happens when \p Spelled is inside a macro argument. /// /// FIXME: return correct results on macro arguments. For now, we return an /// empty list. /// /// (!) will return empty vector on tokens from #define body: /// E.g. 
for the following example: /// /// #define FIRST(A) f1 A = A f2 /// #define SECOND s /// /// a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s /// The results would be /// spelled => expanded /// ------------------------ /// #define FIRST => {} /// a FIRST(arg) => {a f1 arg = arg f2} /// arg => {arg, arg} // arg #1 is before `=` and arg #2 is /// // after `=` in the expanded tokens. llvm::SmallVector, 1> expandedForSpelled(llvm::ArrayRef Spelled) const; /// An expansion produced by the preprocessor, includes macro expansions and /// preprocessor directives. Preprocessor always maps a non-empty range of /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a /// few examples of expansions: /// #pragma once // Expands to an empty range. /// #define FOO 1 2 3 // Expands an empty range. /// FOO // Expands to "1 2 3". /// FIXME(ibiryukov): implement this, currently #include expansions are empty. /// #include // Expands to tokens produced by the include. struct Expansion { llvm::ArrayRef Spelled; llvm::ArrayRef Expanded; }; /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting /// a preprocessor directive) return the subrange of expanded tokens that the /// macro expands to. std::optional expansionStartingAt(const syntax::Token *Spelled) const; /// Returns all expansions (partially) expanded from the specified tokens. /// This is the expansions whose Spelled range intersects \p Spelled. std::vector expansionsOverlapping(llvm::ArrayRef Spelled) const; /// Lexed tokens of a file before preprocessing. E.g. for the following input /// #define DECL(name) int name = 10 /// DECL(a); /// spelledTokens() returns /// {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10", /// "DECL", "(", "a", ")", ";"} llvm::ArrayRef spelledTokens(FileID FID) const; /// Returns the spelled Token starting at Loc, if there are no such tokens /// returns nullptr. 
const syntax::Token *spelledTokenAt(SourceLocation Loc) const; /// Get all tokens that expand a macro in \p FID. For the following input /// #define FOO B /// #define FOO2(X) int X /// FOO2(XY) /// int B; /// FOO; /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5 /// respecitvely). std::vector macroExpansions(FileID FID) const; const SourceManager &sourceManager() const { return *SourceMgr; } std::string dumpForTests() const; private: /// Describes a mapping between a continuous subrange of spelled tokens and /// expanded tokens. Represents macro expansions, preprocessor directives, /// conditionally disabled pp regions, etc. /// #define FOO 1+2 /// #define BAR(a) a + 1 /// FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}. /// BAR(1) // invocation #2, tokens = {'a', '+', '1'}, /// macroTokens = {'BAR', '(', '1', ')'}. struct Mapping { // Positions in the corresponding spelled token stream. The corresponding // range is never empty. unsigned BeginSpelled = 0; unsigned EndSpelled = 0; // Positions in the expanded token stream. The corresponding range can be // empty. unsigned BeginExpanded = 0; unsigned EndExpanded = 0; /// For debugging purposes. std::string str() const; }; /// Spelled tokens of the file with information about the subranges. struct MarkedFile { /// Lexed, but not preprocessed, tokens of the file. These map directly to /// text in the corresponding files and include tokens of all preprocessor /// directives. /// FIXME: spelled tokens don't change across FileID that map to the same /// FileEntry. We could consider deduplicating them to save memory. std::vector SpelledTokens; /// A sorted list to convert between the spelled and expanded token streams. std::vector Mappings; /// The first expanded token produced for this FileID. unsigned BeginExpanded = 0; unsigned EndExpanded = 0; }; friend class TokenCollector; /// Maps a single expanded token to its spelled counterpart or a mapping that /// produced it. 
std::pair spelledForExpandedToken(const syntax::Token *Expanded) const; /// Returns a mapping starting before \p Spelled token, or nullptr if no /// such mapping exists. static const Mapping * mappingStartingBeforeSpelled(const MarkedFile &F, const syntax::Token *Spelled); /// Convert a private Mapping to a public Expansion. Expansion makeExpansion(const MarkedFile &, const Mapping &) const; /// Returns the file that the Spelled tokens are taken from. /// Asserts that they are non-empty, from a tracked file, and in-bounds. const MarkedFile &fileForSpelled(llvm::ArrayRef Spelled) const; /// Token stream produced after preprocessing, conceputally this captures the /// same stream as 'clang -E' (excluding the preprocessor directives like /// #file, etc.). std::vector ExpandedTokens; // Index of ExpandedTokens for faster lookups by SourceLocation. llvm::DenseMap ExpandedTokIndex; llvm::DenseMap Files; // The value is never null, pointer instead of reference to avoid disabling // implicit assignment operator. const SourceManager *SourceMgr; }; /// The spelled tokens that overlap or touch a spelling location Loc. /// This always returns 0-2 tokens. llvm::ArrayRef spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens); llvm::ArrayRef spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef Tokens); /// The identifier token that overlaps or touches a spelling location Loc. /// If there is none, returns nullptr. const syntax::Token * spelledIdentifierTouching(SourceLocation Loc, llvm::ArrayRef Tokens); const syntax::Token * spelledIdentifierTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens); /// Lex the text buffer, corresponding to \p FID, in raw mode and record the /// resulting spelled tokens. Does minimal post-processing on raw identifiers, /// setting the appropriate token kind (instead of the raw_identifier reported /// by lexer in raw mode). This is a very low-level function, most users should /// prefer to use TokenCollector. 
Lexing in raw mode produces wildly different /// results from what one might expect when running a C++ frontend, e.g. /// preprocessor does not run at all. /// The result will *not* have a 'eof' token at the end. std::vector tokenize(FileID FID, const SourceManager &SM, const LangOptions &LO); /// Similar to one above, instead of whole file tokenizes a part of it. Note /// that, the first token might be incomplete if FR.startOffset is not at the /// beginning of a token, and the last token returned will start before the /// FR.endOffset but might end after it. std::vector tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO); /// Collects tokens for the main file while running the frontend action. An /// instance of this object should be created on /// FrontendAction::BeginSourceFile() and the results should be consumed after /// FrontendAction::Execute() finishes. class TokenCollector { public: /// Adds the hooks to collect the tokens. Should be called before the /// preprocessing starts, i.e. as a part of BeginSourceFile() or /// CreateASTConsumer(). TokenCollector(Preprocessor &P); /// Finalizes token collection. Should be called after preprocessing is /// finished, i.e. after running Execute(). [[nodiscard]] TokenBuffer consume() &&; private: /// Maps from a start to an end spelling location of transformations /// performed by the preprocessor. These include: /// 1. range from '#' to the last token in the line for PP directives, /// 2. macro name and arguments for macro expansions. /// Note that we record only top-level macro expansions, intermediate /// expansions (e.g. inside macro arguments) are ignored. /// /// Used to find correct boundaries of macro calls and directives when /// building mappings from spelled to expanded tokens. /// /// Logically, at each point of the preprocessor execution there is a stack of /// macro expansions being processed and we could use it to recover the /// location information we need. 
However, the public preprocessor API only /// exposes the points when macro expansions start (when we push a macro onto /// the stack) and not when they end (when we pop a macro from the stack). /// To workaround this limitation, we rely on source location information /// stored in this map. using PPExpansions = llvm::DenseMap; class Builder; class CollectPPExpansions; std::vector Expanded; // FIXME: we only store macro expansions, also add directives(#pragma, etc.) PPExpansions Expansions; Preprocessor &PP; CollectPPExpansions *Collector; }; } // namespace syntax } // namespace clang #endif