//===-- ClangHighlighter.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric
90b57cec5SDimitry Andric #include "ClangHighlighter.h"
100b57cec5SDimitry Andric
110b57cec5SDimitry Andric #include "lldb/Host/FileSystem.h"
120b57cec5SDimitry Andric #include "lldb/Target/Language.h"
130b57cec5SDimitry Andric #include "lldb/Utility/AnsiTerminal.h"
140b57cec5SDimitry Andric #include "lldb/Utility/StreamString.h"
150b57cec5SDimitry Andric
165ffd83dbSDimitry Andric #include "clang/Basic/FileManager.h"
170b57cec5SDimitry Andric #include "clang/Basic/SourceManager.h"
180b57cec5SDimitry Andric #include "clang/Lex/Lexer.h"
190b57cec5SDimitry Andric #include "llvm/ADT/StringSet.h"
200b57cec5SDimitry Andric #include "llvm/Support/MemoryBuffer.h"
21bdd1243dSDimitry Andric #include <optional>
220b57cec5SDimitry Andric
230b57cec5SDimitry Andric using namespace lldb_private;
240b57cec5SDimitry Andric
isKeyword(llvm::StringRef token) const250b57cec5SDimitry Andric bool ClangHighlighter::isKeyword(llvm::StringRef token) const {
2606c3fb27SDimitry Andric return keywords.contains(token);
270b57cec5SDimitry Andric }
280b57cec5SDimitry Andric
ClangHighlighter()290b57cec5SDimitry Andric ClangHighlighter::ClangHighlighter() {
300b57cec5SDimitry Andric #define KEYWORD(X, N) keywords.insert(#X);
310b57cec5SDimitry Andric #include "clang/Basic/TokenKinds.def"
320b57cec5SDimitry Andric }
330b57cec5SDimitry Andric
340b57cec5SDimitry Andric /// Determines which style should be applied to the given token.
350b57cec5SDimitry Andric /// \param highlighter
360b57cec5SDimitry Andric /// The current highlighter that should use the style.
370b57cec5SDimitry Andric /// \param token
380b57cec5SDimitry Andric /// The current token.
390b57cec5SDimitry Andric /// \param tok_str
400b57cec5SDimitry Andric /// The string in the source code the token represents.
410b57cec5SDimitry Andric /// \param options
420b57cec5SDimitry Andric /// The style we use for coloring the source code.
430b57cec5SDimitry Andric /// \param in_pp_directive
440b57cec5SDimitry Andric /// If we are currently in a preprocessor directive. NOTE: This is
450b57cec5SDimitry Andric /// passed by reference and will be updated if the current token starts
460b57cec5SDimitry Andric /// or ends a preprocessor directive.
470b57cec5SDimitry Andric /// \return
480b57cec5SDimitry Andric /// The ColorStyle that should be applied to the token.
490b57cec5SDimitry Andric static HighlightStyle::ColorStyle
determineClangStyle(const ClangHighlighter & highlighter,const clang::Token & token,llvm::StringRef tok_str,const HighlightStyle & options,bool & in_pp_directive)500b57cec5SDimitry Andric determineClangStyle(const ClangHighlighter &highlighter,
510b57cec5SDimitry Andric const clang::Token &token, llvm::StringRef tok_str,
520b57cec5SDimitry Andric const HighlightStyle &options, bool &in_pp_directive) {
530b57cec5SDimitry Andric using namespace clang;
540b57cec5SDimitry Andric
550b57cec5SDimitry Andric if (token.is(tok::comment)) {
560b57cec5SDimitry Andric // If we were in a preprocessor directive before, we now left it.
570b57cec5SDimitry Andric in_pp_directive = false;
580b57cec5SDimitry Andric return options.comment;
590b57cec5SDimitry Andric } else if (in_pp_directive || token.getKind() == tok::hash) {
600b57cec5SDimitry Andric // Let's assume that the rest of the line is a PP directive.
610b57cec5SDimitry Andric in_pp_directive = true;
620b57cec5SDimitry Andric // Preprocessor directives are hard to match, so we have to hack this in.
630b57cec5SDimitry Andric return options.pp_directive;
640b57cec5SDimitry Andric } else if (tok::isStringLiteral(token.getKind()))
650b57cec5SDimitry Andric return options.string_literal;
660b57cec5SDimitry Andric else if (tok::isLiteral(token.getKind()))
670b57cec5SDimitry Andric return options.scalar_literal;
680b57cec5SDimitry Andric else if (highlighter.isKeyword(tok_str))
690b57cec5SDimitry Andric return options.keyword;
700b57cec5SDimitry Andric else
710b57cec5SDimitry Andric switch (token.getKind()) {
720b57cec5SDimitry Andric case tok::raw_identifier:
730b57cec5SDimitry Andric case tok::identifier:
740b57cec5SDimitry Andric return options.identifier;
750b57cec5SDimitry Andric case tok::l_brace:
760b57cec5SDimitry Andric case tok::r_brace:
770b57cec5SDimitry Andric return options.braces;
780b57cec5SDimitry Andric case tok::l_square:
790b57cec5SDimitry Andric case tok::r_square:
800b57cec5SDimitry Andric return options.square_brackets;
810b57cec5SDimitry Andric case tok::l_paren:
820b57cec5SDimitry Andric case tok::r_paren:
830b57cec5SDimitry Andric return options.parentheses;
840b57cec5SDimitry Andric case tok::comma:
850b57cec5SDimitry Andric return options.comma;
860b57cec5SDimitry Andric case tok::coloncolon:
870b57cec5SDimitry Andric case tok::colon:
880b57cec5SDimitry Andric return options.colon;
890b57cec5SDimitry Andric
900b57cec5SDimitry Andric case tok::amp:
910b57cec5SDimitry Andric case tok::ampamp:
920b57cec5SDimitry Andric case tok::ampequal:
930b57cec5SDimitry Andric case tok::star:
940b57cec5SDimitry Andric case tok::starequal:
950b57cec5SDimitry Andric case tok::plus:
960b57cec5SDimitry Andric case tok::plusplus:
970b57cec5SDimitry Andric case tok::plusequal:
980b57cec5SDimitry Andric case tok::minus:
990b57cec5SDimitry Andric case tok::arrow:
1000b57cec5SDimitry Andric case tok::minusminus:
1010b57cec5SDimitry Andric case tok::minusequal:
1020b57cec5SDimitry Andric case tok::tilde:
1030b57cec5SDimitry Andric case tok::exclaim:
1040b57cec5SDimitry Andric case tok::exclaimequal:
1050b57cec5SDimitry Andric case tok::slash:
1060b57cec5SDimitry Andric case tok::slashequal:
1070b57cec5SDimitry Andric case tok::percent:
1080b57cec5SDimitry Andric case tok::percentequal:
1090b57cec5SDimitry Andric case tok::less:
1100b57cec5SDimitry Andric case tok::lessless:
1110b57cec5SDimitry Andric case tok::lessequal:
1120b57cec5SDimitry Andric case tok::lesslessequal:
1130b57cec5SDimitry Andric case tok::spaceship:
1140b57cec5SDimitry Andric case tok::greater:
1150b57cec5SDimitry Andric case tok::greatergreater:
1160b57cec5SDimitry Andric case tok::greaterequal:
1170b57cec5SDimitry Andric case tok::greatergreaterequal:
1180b57cec5SDimitry Andric case tok::caret:
1190b57cec5SDimitry Andric case tok::caretequal:
1200b57cec5SDimitry Andric case tok::pipe:
1210b57cec5SDimitry Andric case tok::pipepipe:
1220b57cec5SDimitry Andric case tok::pipeequal:
1230b57cec5SDimitry Andric case tok::question:
1240b57cec5SDimitry Andric case tok::equal:
1250b57cec5SDimitry Andric case tok::equalequal:
1260b57cec5SDimitry Andric return options.operators;
1270b57cec5SDimitry Andric default:
1280b57cec5SDimitry Andric break;
1290b57cec5SDimitry Andric }
1300b57cec5SDimitry Andric return HighlightStyle::ColorStyle();
1310b57cec5SDimitry Andric }
1320b57cec5SDimitry Andric
Highlight(const HighlightStyle & options,llvm::StringRef line,std::optional<size_t> cursor_pos,llvm::StringRef previous_lines,Stream & result) const1330b57cec5SDimitry Andric void ClangHighlighter::Highlight(const HighlightStyle &options,
1340b57cec5SDimitry Andric llvm::StringRef line,
135bdd1243dSDimitry Andric std::optional<size_t> cursor_pos,
1360b57cec5SDimitry Andric llvm::StringRef previous_lines,
1370b57cec5SDimitry Andric Stream &result) const {
1380b57cec5SDimitry Andric using namespace clang;
1390b57cec5SDimitry Andric
1400b57cec5SDimitry Andric FileSystemOptions file_opts;
1410b57cec5SDimitry Andric FileManager file_mgr(file_opts,
1420b57cec5SDimitry Andric FileSystem::Instance().GetVirtualFileSystem());
1430b57cec5SDimitry Andric
144480093f4SDimitry Andric // The line might end in a backslash which would cause Clang to drop the
145480093f4SDimitry Andric // backslash and the terminating new line. This makes sense when parsing C++,
146480093f4SDimitry Andric // but when highlighting we care about preserving the backslash/newline. To
147480093f4SDimitry Andric // not lose this information we remove the new line here so that Clang knows
148480093f4SDimitry Andric // this is just a single line we are highlighting. We add back the newline
149480093f4SDimitry Andric // after tokenizing.
150480093f4SDimitry Andric llvm::StringRef line_ending = "";
151480093f4SDimitry Andric // There are a few legal line endings Clang recognizes and we need to
152480093f4SDimitry Andric // temporarily remove from the string.
153480093f4SDimitry Andric if (line.consume_back("\r\n"))
154480093f4SDimitry Andric line_ending = "\r\n";
155480093f4SDimitry Andric else if (line.consume_back("\n"))
156480093f4SDimitry Andric line_ending = "\n";
157480093f4SDimitry Andric else if (line.consume_back("\r"))
158480093f4SDimitry Andric line_ending = "\r";
159480093f4SDimitry Andric
1600b57cec5SDimitry Andric unsigned line_number = previous_lines.count('\n') + 1U;
1610b57cec5SDimitry Andric
1620b57cec5SDimitry Andric // Let's build the actual source code Clang needs and setup some utility
1630b57cec5SDimitry Andric // objects.
1640b57cec5SDimitry Andric std::string full_source = previous_lines.str() + line.str();
1650b57cec5SDimitry Andric llvm::IntrusiveRefCntPtr<DiagnosticIDs> diag_ids(new DiagnosticIDs());
1660b57cec5SDimitry Andric llvm::IntrusiveRefCntPtr<DiagnosticOptions> diags_opts(
1670b57cec5SDimitry Andric new DiagnosticOptions());
1680b57cec5SDimitry Andric DiagnosticsEngine diags(diag_ids, diags_opts);
1690b57cec5SDimitry Andric clang::SourceManager SM(diags, file_mgr);
1700b57cec5SDimitry Andric auto buf = llvm::MemoryBuffer::getMemBuffer(full_source);
1710b57cec5SDimitry Andric
172e8d8bef9SDimitry Andric FileID FID = SM.createFileID(buf->getMemBufferRef());
1730b57cec5SDimitry Andric
1740b57cec5SDimitry Andric // Let's just enable the latest ObjC and C++ which should get most tokens
1750b57cec5SDimitry Andric // right.
1760b57cec5SDimitry Andric LangOptions Opts;
1770b57cec5SDimitry Andric Opts.ObjC = true;
1780b57cec5SDimitry Andric // FIXME: This should probably set CPlusPlus, CPlusPlus11, ... too
1790b57cec5SDimitry Andric Opts.CPlusPlus17 = true;
1800b57cec5SDimitry Andric Opts.LineComment = true;
1810b57cec5SDimitry Andric
182e8d8bef9SDimitry Andric Lexer lex(FID, buf->getMemBufferRef(), SM, Opts);
1830b57cec5SDimitry Andric // The lexer should keep whitespace around.
1840b57cec5SDimitry Andric lex.SetKeepWhitespaceMode(true);
1850b57cec5SDimitry Andric
1860b57cec5SDimitry Andric // Keeps track if we have entered a PP directive.
1870b57cec5SDimitry Andric bool in_pp_directive = false;
1880b57cec5SDimitry Andric
1890b57cec5SDimitry Andric // True once we actually lexed the user provided line.
1900b57cec5SDimitry Andric bool found_user_line = false;
1910b57cec5SDimitry Andric
1920b57cec5SDimitry Andric // True if we already highlighted the token under the cursor, false otherwise.
1930b57cec5SDimitry Andric bool highlighted_cursor = false;
1940b57cec5SDimitry Andric Token token;
1950b57cec5SDimitry Andric bool exit = false;
1960b57cec5SDimitry Andric while (!exit) {
1970b57cec5SDimitry Andric // Returns true if this is the last token we get from the lexer.
1980b57cec5SDimitry Andric exit = lex.LexFromRawLexer(token);
1990b57cec5SDimitry Andric
2000b57cec5SDimitry Andric bool invalid = false;
2010b57cec5SDimitry Andric unsigned current_line_number =
2020b57cec5SDimitry Andric SM.getSpellingLineNumber(token.getLocation(), &invalid);
2030b57cec5SDimitry Andric if (current_line_number != line_number)
2040b57cec5SDimitry Andric continue;
2050b57cec5SDimitry Andric found_user_line = true;
2060b57cec5SDimitry Andric
2070b57cec5SDimitry Andric // We don't need to print any tokens without a spelling line number.
2080b57cec5SDimitry Andric if (invalid)
2090b57cec5SDimitry Andric continue;
2100b57cec5SDimitry Andric
2110b57cec5SDimitry Andric // Same as above but with the column number.
2120b57cec5SDimitry Andric invalid = false;
2130b57cec5SDimitry Andric unsigned start = SM.getSpellingColumnNumber(token.getLocation(), &invalid);
2140b57cec5SDimitry Andric if (invalid)
2150b57cec5SDimitry Andric continue;
2160b57cec5SDimitry Andric // Column numbers start at 1, but indexes in our string start at 0.
2170b57cec5SDimitry Andric --start;
2180b57cec5SDimitry Andric
2190b57cec5SDimitry Andric // Annotations don't have a length, so let's skip them.
2200b57cec5SDimitry Andric if (token.isAnnotation())
2210b57cec5SDimitry Andric continue;
2220b57cec5SDimitry Andric
2230b57cec5SDimitry Andric // Extract the token string from our source code.
2240b57cec5SDimitry Andric llvm::StringRef tok_str = line.substr(start, token.getLength());
2250b57cec5SDimitry Andric
2260b57cec5SDimitry Andric // If the token is just an empty string, we can skip all the work below.
2270b57cec5SDimitry Andric if (tok_str.empty())
2280b57cec5SDimitry Andric continue;
2290b57cec5SDimitry Andric
2300b57cec5SDimitry Andric // If the cursor is inside this token, we have to apply the 'selected'
2310b57cec5SDimitry Andric // highlight style before applying the actual token color.
2320b57cec5SDimitry Andric llvm::StringRef to_print = tok_str;
2330b57cec5SDimitry Andric StreamString storage;
2340b57cec5SDimitry Andric auto end = start + token.getLength();
2350b57cec5SDimitry Andric if (cursor_pos && end > *cursor_pos && !highlighted_cursor) {
2360b57cec5SDimitry Andric highlighted_cursor = true;
2370b57cec5SDimitry Andric options.selected.Apply(storage, tok_str);
2380b57cec5SDimitry Andric to_print = storage.GetString();
2390b57cec5SDimitry Andric }
2400b57cec5SDimitry Andric
2410b57cec5SDimitry Andric // See how we are supposed to highlight this token.
2420b57cec5SDimitry Andric HighlightStyle::ColorStyle color =
2430b57cec5SDimitry Andric determineClangStyle(*this, token, tok_str, options, in_pp_directive);
2440b57cec5SDimitry Andric
2450b57cec5SDimitry Andric color.Apply(result, to_print);
2460b57cec5SDimitry Andric }
2470b57cec5SDimitry Andric
248480093f4SDimitry Andric // Add the line ending we trimmed before tokenizing.
249480093f4SDimitry Andric result << line_ending;
250480093f4SDimitry Andric
2510b57cec5SDimitry Andric // If we went over the whole file but couldn't find our own file, then
2520b57cec5SDimitry Andric // somehow our setup was wrong. When we're in release mode we just give the
2530b57cec5SDimitry Andric // user the normal line and pretend we don't know how to highlight it. In
2540b57cec5SDimitry Andric // debug mode we bail out with an assert as this should never happen.
2550b57cec5SDimitry Andric if (!found_user_line) {
2560b57cec5SDimitry Andric result << line;
2570b57cec5SDimitry Andric assert(false && "We couldn't find the user line in the input file?");
2580b57cec5SDimitry Andric }
2590b57cec5SDimitry Andric }
260