//===-- ClangHighlighter.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric 
90b57cec5SDimitry Andric #include "ClangHighlighter.h"
100b57cec5SDimitry Andric 
110b57cec5SDimitry Andric #include "lldb/Host/FileSystem.h"
120b57cec5SDimitry Andric #include "lldb/Target/Language.h"
130b57cec5SDimitry Andric #include "lldb/Utility/AnsiTerminal.h"
140b57cec5SDimitry Andric #include "lldb/Utility/StreamString.h"
150b57cec5SDimitry Andric 
165ffd83dbSDimitry Andric #include "clang/Basic/FileManager.h"
170b57cec5SDimitry Andric #include "clang/Basic/SourceManager.h"
180b57cec5SDimitry Andric #include "clang/Lex/Lexer.h"
190b57cec5SDimitry Andric #include "llvm/ADT/StringSet.h"
200b57cec5SDimitry Andric #include "llvm/Support/MemoryBuffer.h"
21bdd1243dSDimitry Andric #include <optional>
220b57cec5SDimitry Andric 
230b57cec5SDimitry Andric using namespace lldb_private;
240b57cec5SDimitry Andric 
isKeyword(llvm::StringRef token) const250b57cec5SDimitry Andric bool ClangHighlighter::isKeyword(llvm::StringRef token) const {
2606c3fb27SDimitry Andric   return keywords.contains(token);
270b57cec5SDimitry Andric }
280b57cec5SDimitry Andric 
ClangHighlighter()290b57cec5SDimitry Andric ClangHighlighter::ClangHighlighter() {
300b57cec5SDimitry Andric #define KEYWORD(X, N) keywords.insert(#X);
310b57cec5SDimitry Andric #include "clang/Basic/TokenKinds.def"
320b57cec5SDimitry Andric }
330b57cec5SDimitry Andric 
340b57cec5SDimitry Andric /// Determines which style should be applied to the given token.
350b57cec5SDimitry Andric /// \param highlighter
360b57cec5SDimitry Andric ///     The current highlighter that should use the style.
370b57cec5SDimitry Andric /// \param token
380b57cec5SDimitry Andric ///     The current token.
390b57cec5SDimitry Andric /// \param tok_str
400b57cec5SDimitry Andric ///     The string in the source code the token represents.
410b57cec5SDimitry Andric /// \param options
420b57cec5SDimitry Andric ///     The style we use for coloring the source code.
430b57cec5SDimitry Andric /// \param in_pp_directive
440b57cec5SDimitry Andric ///     If we are currently in a preprocessor directive. NOTE: This is
450b57cec5SDimitry Andric ///     passed by reference and will be updated if the current token starts
460b57cec5SDimitry Andric ///     or ends a preprocessor directive.
470b57cec5SDimitry Andric /// \return
480b57cec5SDimitry Andric ///     The ColorStyle that should be applied to the token.
490b57cec5SDimitry Andric static HighlightStyle::ColorStyle
determineClangStyle(const ClangHighlighter & highlighter,const clang::Token & token,llvm::StringRef tok_str,const HighlightStyle & options,bool & in_pp_directive)500b57cec5SDimitry Andric determineClangStyle(const ClangHighlighter &highlighter,
510b57cec5SDimitry Andric                     const clang::Token &token, llvm::StringRef tok_str,
520b57cec5SDimitry Andric                     const HighlightStyle &options, bool &in_pp_directive) {
530b57cec5SDimitry Andric   using namespace clang;
540b57cec5SDimitry Andric 
550b57cec5SDimitry Andric   if (token.is(tok::comment)) {
560b57cec5SDimitry Andric     // If we were in a preprocessor directive before, we now left it.
570b57cec5SDimitry Andric     in_pp_directive = false;
580b57cec5SDimitry Andric     return options.comment;
590b57cec5SDimitry Andric   } else if (in_pp_directive || token.getKind() == tok::hash) {
600b57cec5SDimitry Andric     // Let's assume that the rest of the line is a PP directive.
610b57cec5SDimitry Andric     in_pp_directive = true;
620b57cec5SDimitry Andric     // Preprocessor directives are hard to match, so we have to hack this in.
630b57cec5SDimitry Andric     return options.pp_directive;
640b57cec5SDimitry Andric   } else if (tok::isStringLiteral(token.getKind()))
650b57cec5SDimitry Andric     return options.string_literal;
660b57cec5SDimitry Andric   else if (tok::isLiteral(token.getKind()))
670b57cec5SDimitry Andric     return options.scalar_literal;
680b57cec5SDimitry Andric   else if (highlighter.isKeyword(tok_str))
690b57cec5SDimitry Andric     return options.keyword;
700b57cec5SDimitry Andric   else
710b57cec5SDimitry Andric     switch (token.getKind()) {
720b57cec5SDimitry Andric     case tok::raw_identifier:
730b57cec5SDimitry Andric     case tok::identifier:
740b57cec5SDimitry Andric       return options.identifier;
750b57cec5SDimitry Andric     case tok::l_brace:
760b57cec5SDimitry Andric     case tok::r_brace:
770b57cec5SDimitry Andric       return options.braces;
780b57cec5SDimitry Andric     case tok::l_square:
790b57cec5SDimitry Andric     case tok::r_square:
800b57cec5SDimitry Andric       return options.square_brackets;
810b57cec5SDimitry Andric     case tok::l_paren:
820b57cec5SDimitry Andric     case tok::r_paren:
830b57cec5SDimitry Andric       return options.parentheses;
840b57cec5SDimitry Andric     case tok::comma:
850b57cec5SDimitry Andric       return options.comma;
860b57cec5SDimitry Andric     case tok::coloncolon:
870b57cec5SDimitry Andric     case tok::colon:
880b57cec5SDimitry Andric       return options.colon;
890b57cec5SDimitry Andric 
900b57cec5SDimitry Andric     case tok::amp:
910b57cec5SDimitry Andric     case tok::ampamp:
920b57cec5SDimitry Andric     case tok::ampequal:
930b57cec5SDimitry Andric     case tok::star:
940b57cec5SDimitry Andric     case tok::starequal:
950b57cec5SDimitry Andric     case tok::plus:
960b57cec5SDimitry Andric     case tok::plusplus:
970b57cec5SDimitry Andric     case tok::plusequal:
980b57cec5SDimitry Andric     case tok::minus:
990b57cec5SDimitry Andric     case tok::arrow:
1000b57cec5SDimitry Andric     case tok::minusminus:
1010b57cec5SDimitry Andric     case tok::minusequal:
1020b57cec5SDimitry Andric     case tok::tilde:
1030b57cec5SDimitry Andric     case tok::exclaim:
1040b57cec5SDimitry Andric     case tok::exclaimequal:
1050b57cec5SDimitry Andric     case tok::slash:
1060b57cec5SDimitry Andric     case tok::slashequal:
1070b57cec5SDimitry Andric     case tok::percent:
1080b57cec5SDimitry Andric     case tok::percentequal:
1090b57cec5SDimitry Andric     case tok::less:
1100b57cec5SDimitry Andric     case tok::lessless:
1110b57cec5SDimitry Andric     case tok::lessequal:
1120b57cec5SDimitry Andric     case tok::lesslessequal:
1130b57cec5SDimitry Andric     case tok::spaceship:
1140b57cec5SDimitry Andric     case tok::greater:
1150b57cec5SDimitry Andric     case tok::greatergreater:
1160b57cec5SDimitry Andric     case tok::greaterequal:
1170b57cec5SDimitry Andric     case tok::greatergreaterequal:
1180b57cec5SDimitry Andric     case tok::caret:
1190b57cec5SDimitry Andric     case tok::caretequal:
1200b57cec5SDimitry Andric     case tok::pipe:
1210b57cec5SDimitry Andric     case tok::pipepipe:
1220b57cec5SDimitry Andric     case tok::pipeequal:
1230b57cec5SDimitry Andric     case tok::question:
1240b57cec5SDimitry Andric     case tok::equal:
1250b57cec5SDimitry Andric     case tok::equalequal:
1260b57cec5SDimitry Andric       return options.operators;
1270b57cec5SDimitry Andric     default:
1280b57cec5SDimitry Andric       break;
1290b57cec5SDimitry Andric     }
1300b57cec5SDimitry Andric   return HighlightStyle::ColorStyle();
1310b57cec5SDimitry Andric }
1320b57cec5SDimitry Andric 
Highlight(const HighlightStyle & options,llvm::StringRef line,std::optional<size_t> cursor_pos,llvm::StringRef previous_lines,Stream & result) const1330b57cec5SDimitry Andric void ClangHighlighter::Highlight(const HighlightStyle &options,
1340b57cec5SDimitry Andric                                  llvm::StringRef line,
135bdd1243dSDimitry Andric                                  std::optional<size_t> cursor_pos,
1360b57cec5SDimitry Andric                                  llvm::StringRef previous_lines,
1370b57cec5SDimitry Andric                                  Stream &result) const {
1380b57cec5SDimitry Andric   using namespace clang;
1390b57cec5SDimitry Andric 
1400b57cec5SDimitry Andric   FileSystemOptions file_opts;
1410b57cec5SDimitry Andric   FileManager file_mgr(file_opts,
1420b57cec5SDimitry Andric                        FileSystem::Instance().GetVirtualFileSystem());
1430b57cec5SDimitry Andric 
144480093f4SDimitry Andric   // The line might end in a backslash which would cause Clang to drop the
145480093f4SDimitry Andric   // backslash and the terminating new line. This makes sense when parsing C++,
146480093f4SDimitry Andric   // but when highlighting we care about preserving the backslash/newline. To
147480093f4SDimitry Andric   // not lose this information we remove the new line here so that Clang knows
148480093f4SDimitry Andric   // this is just a single line we are highlighting. We add back the newline
149480093f4SDimitry Andric   // after tokenizing.
150480093f4SDimitry Andric   llvm::StringRef line_ending = "";
151480093f4SDimitry Andric   // There are a few legal line endings Clang recognizes and we need to
152480093f4SDimitry Andric   // temporarily remove from the string.
153480093f4SDimitry Andric   if (line.consume_back("\r\n"))
154480093f4SDimitry Andric     line_ending = "\r\n";
155480093f4SDimitry Andric   else if (line.consume_back("\n"))
156480093f4SDimitry Andric     line_ending = "\n";
157480093f4SDimitry Andric   else if (line.consume_back("\r"))
158480093f4SDimitry Andric     line_ending = "\r";
159480093f4SDimitry Andric 
1600b57cec5SDimitry Andric   unsigned line_number = previous_lines.count('\n') + 1U;
1610b57cec5SDimitry Andric 
1620b57cec5SDimitry Andric   // Let's build the actual source code Clang needs and setup some utility
1630b57cec5SDimitry Andric   // objects.
1640b57cec5SDimitry Andric   std::string full_source = previous_lines.str() + line.str();
1650b57cec5SDimitry Andric   llvm::IntrusiveRefCntPtr<DiagnosticIDs> diag_ids(new DiagnosticIDs());
1660b57cec5SDimitry Andric   llvm::IntrusiveRefCntPtr<DiagnosticOptions> diags_opts(
1670b57cec5SDimitry Andric       new DiagnosticOptions());
1680b57cec5SDimitry Andric   DiagnosticsEngine diags(diag_ids, diags_opts);
1690b57cec5SDimitry Andric   clang::SourceManager SM(diags, file_mgr);
1700b57cec5SDimitry Andric   auto buf = llvm::MemoryBuffer::getMemBuffer(full_source);
1710b57cec5SDimitry Andric 
172e8d8bef9SDimitry Andric   FileID FID = SM.createFileID(buf->getMemBufferRef());
1730b57cec5SDimitry Andric 
1740b57cec5SDimitry Andric   // Let's just enable the latest ObjC and C++ which should get most tokens
1750b57cec5SDimitry Andric   // right.
1760b57cec5SDimitry Andric   LangOptions Opts;
1770b57cec5SDimitry Andric   Opts.ObjC = true;
1780b57cec5SDimitry Andric   // FIXME: This should probably set CPlusPlus, CPlusPlus11, ... too
1790b57cec5SDimitry Andric   Opts.CPlusPlus17 = true;
1800b57cec5SDimitry Andric   Opts.LineComment = true;
1810b57cec5SDimitry Andric 
182e8d8bef9SDimitry Andric   Lexer lex(FID, buf->getMemBufferRef(), SM, Opts);
1830b57cec5SDimitry Andric   // The lexer should keep whitespace around.
1840b57cec5SDimitry Andric   lex.SetKeepWhitespaceMode(true);
1850b57cec5SDimitry Andric 
1860b57cec5SDimitry Andric   // Keeps track if we have entered a PP directive.
1870b57cec5SDimitry Andric   bool in_pp_directive = false;
1880b57cec5SDimitry Andric 
1890b57cec5SDimitry Andric   // True once we actually lexed the user provided line.
1900b57cec5SDimitry Andric   bool found_user_line = false;
1910b57cec5SDimitry Andric 
1920b57cec5SDimitry Andric   // True if we already highlighted the token under the cursor, false otherwise.
1930b57cec5SDimitry Andric   bool highlighted_cursor = false;
1940b57cec5SDimitry Andric   Token token;
1950b57cec5SDimitry Andric   bool exit = false;
1960b57cec5SDimitry Andric   while (!exit) {
1970b57cec5SDimitry Andric     // Returns true if this is the last token we get from the lexer.
1980b57cec5SDimitry Andric     exit = lex.LexFromRawLexer(token);
1990b57cec5SDimitry Andric 
2000b57cec5SDimitry Andric     bool invalid = false;
2010b57cec5SDimitry Andric     unsigned current_line_number =
2020b57cec5SDimitry Andric         SM.getSpellingLineNumber(token.getLocation(), &invalid);
2030b57cec5SDimitry Andric     if (current_line_number != line_number)
2040b57cec5SDimitry Andric       continue;
2050b57cec5SDimitry Andric     found_user_line = true;
2060b57cec5SDimitry Andric 
2070b57cec5SDimitry Andric     // We don't need to print any tokens without a spelling line number.
2080b57cec5SDimitry Andric     if (invalid)
2090b57cec5SDimitry Andric       continue;
2100b57cec5SDimitry Andric 
2110b57cec5SDimitry Andric     // Same as above but with the column number.
2120b57cec5SDimitry Andric     invalid = false;
2130b57cec5SDimitry Andric     unsigned start = SM.getSpellingColumnNumber(token.getLocation(), &invalid);
2140b57cec5SDimitry Andric     if (invalid)
2150b57cec5SDimitry Andric       continue;
2160b57cec5SDimitry Andric     // Column numbers start at 1, but indexes in our string start at 0.
2170b57cec5SDimitry Andric     --start;
2180b57cec5SDimitry Andric 
2190b57cec5SDimitry Andric     // Annotations don't have a length, so let's skip them.
2200b57cec5SDimitry Andric     if (token.isAnnotation())
2210b57cec5SDimitry Andric       continue;
2220b57cec5SDimitry Andric 
2230b57cec5SDimitry Andric     // Extract the token string from our source code.
2240b57cec5SDimitry Andric     llvm::StringRef tok_str = line.substr(start, token.getLength());
2250b57cec5SDimitry Andric 
2260b57cec5SDimitry Andric     // If the token is just an empty string, we can skip all the work below.
2270b57cec5SDimitry Andric     if (tok_str.empty())
2280b57cec5SDimitry Andric       continue;
2290b57cec5SDimitry Andric 
2300b57cec5SDimitry Andric     // If the cursor is inside this token, we have to apply the 'selected'
2310b57cec5SDimitry Andric     // highlight style before applying the actual token color.
2320b57cec5SDimitry Andric     llvm::StringRef to_print = tok_str;
2330b57cec5SDimitry Andric     StreamString storage;
2340b57cec5SDimitry Andric     auto end = start + token.getLength();
2350b57cec5SDimitry Andric     if (cursor_pos && end > *cursor_pos && !highlighted_cursor) {
2360b57cec5SDimitry Andric       highlighted_cursor = true;
2370b57cec5SDimitry Andric       options.selected.Apply(storage, tok_str);
2380b57cec5SDimitry Andric       to_print = storage.GetString();
2390b57cec5SDimitry Andric     }
2400b57cec5SDimitry Andric 
2410b57cec5SDimitry Andric     // See how we are supposed to highlight this token.
2420b57cec5SDimitry Andric     HighlightStyle::ColorStyle color =
2430b57cec5SDimitry Andric         determineClangStyle(*this, token, tok_str, options, in_pp_directive);
2440b57cec5SDimitry Andric 
2450b57cec5SDimitry Andric     color.Apply(result, to_print);
2460b57cec5SDimitry Andric   }
2470b57cec5SDimitry Andric 
248480093f4SDimitry Andric   // Add the line ending we trimmed before tokenizing.
249480093f4SDimitry Andric   result << line_ending;
250480093f4SDimitry Andric 
2510b57cec5SDimitry Andric   // If we went over the whole file but couldn't find our own file, then
2520b57cec5SDimitry Andric   // somehow our setup was wrong. When we're in release mode we just give the
2530b57cec5SDimitry Andric   // user the normal line and pretend we don't know how to highlight it. In
2540b57cec5SDimitry Andric   // debug mode we bail out with an assert as this should never happen.
2550b57cec5SDimitry Andric   if (!found_user_line) {
2560b57cec5SDimitry Andric     result << line;
2570b57cec5SDimitry Andric     assert(false && "We couldn't find the user line in the input file?");
2580b57cec5SDimitry Andric   }
2590b57cec5SDimitry Andric }
260