1 //===-- ClangHighlighter.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ClangHighlighter.h"
10 
11 #include "lldb/Host/FileSystem.h"
12 #include "lldb/Target/Language.h"
13 #include "lldb/Utility/AnsiTerminal.h"
14 #include "lldb/Utility/StreamString.h"
15 
16 #include "clang/Basic/FileManager.h"
17 #include "clang/Basic/SourceManager.h"
18 #include "clang/Lex/Lexer.h"
19 #include "llvm/ADT/StringSet.h"
20 #include "llvm/Support/MemoryBuffer.h"
21 
22 using namespace lldb_private;
23 
24 bool ClangHighlighter::isKeyword(llvm::StringRef token) const {
25   return keywords.find(token) != keywords.end();
26 }
27 
28 ClangHighlighter::ClangHighlighter() {
29 #define KEYWORD(X, N) keywords.insert(#X);
30 #include "clang/Basic/TokenKinds.def"
31 }
32 
33 /// Determines which style should be applied to the given token.
34 /// \param highlighter
35 ///     The current highlighter that should use the style.
36 /// \param token
37 ///     The current token.
38 /// \param tok_str
39 ///     The string in the source code the token represents.
40 /// \param options
41 ///     The style we use for coloring the source code.
42 /// \param in_pp_directive
43 ///     If we are currently in a preprocessor directive. NOTE: This is
44 ///     passed by reference and will be updated if the current token starts
45 ///     or ends a preprocessor directive.
46 /// \return
47 ///     The ColorStyle that should be applied to the token.
48 static HighlightStyle::ColorStyle
49 determineClangStyle(const ClangHighlighter &highlighter,
50                     const clang::Token &token, llvm::StringRef tok_str,
51                     const HighlightStyle &options, bool &in_pp_directive) {
52   using namespace clang;
53 
54   if (token.is(tok::comment)) {
55     // If we were in a preprocessor directive before, we now left it.
56     in_pp_directive = false;
57     return options.comment;
58   } else if (in_pp_directive || token.getKind() == tok::hash) {
59     // Let's assume that the rest of the line is a PP directive.
60     in_pp_directive = true;
61     // Preprocessor directives are hard to match, so we have to hack this in.
62     return options.pp_directive;
63   } else if (tok::isStringLiteral(token.getKind()))
64     return options.string_literal;
65   else if (tok::isLiteral(token.getKind()))
66     return options.scalar_literal;
67   else if (highlighter.isKeyword(tok_str))
68     return options.keyword;
69   else
70     switch (token.getKind()) {
71     case tok::raw_identifier:
72     case tok::identifier:
73       return options.identifier;
74     case tok::l_brace:
75     case tok::r_brace:
76       return options.braces;
77     case tok::l_square:
78     case tok::r_square:
79       return options.square_brackets;
80     case tok::l_paren:
81     case tok::r_paren:
82       return options.parentheses;
83     case tok::comma:
84       return options.comma;
85     case tok::coloncolon:
86     case tok::colon:
87       return options.colon;
88 
89     case tok::amp:
90     case tok::ampamp:
91     case tok::ampequal:
92     case tok::star:
93     case tok::starequal:
94     case tok::plus:
95     case tok::plusplus:
96     case tok::plusequal:
97     case tok::minus:
98     case tok::arrow:
99     case tok::minusminus:
100     case tok::minusequal:
101     case tok::tilde:
102     case tok::exclaim:
103     case tok::exclaimequal:
104     case tok::slash:
105     case tok::slashequal:
106     case tok::percent:
107     case tok::percentequal:
108     case tok::less:
109     case tok::lessless:
110     case tok::lessequal:
111     case tok::lesslessequal:
112     case tok::spaceship:
113     case tok::greater:
114     case tok::greatergreater:
115     case tok::greaterequal:
116     case tok::greatergreaterequal:
117     case tok::caret:
118     case tok::caretequal:
119     case tok::pipe:
120     case tok::pipepipe:
121     case tok::pipeequal:
122     case tok::question:
123     case tok::equal:
124     case tok::equalequal:
125       return options.operators;
126     default:
127       break;
128     }
129   return HighlightStyle::ColorStyle();
130 }
131 
132 void ClangHighlighter::Highlight(const HighlightStyle &options,
133                                  llvm::StringRef line,
134                                  llvm::Optional<size_t> cursor_pos,
135                                  llvm::StringRef previous_lines,
136                                  Stream &result) const {
137   using namespace clang;
138 
139   FileSystemOptions file_opts;
140   FileManager file_mgr(file_opts,
141                        FileSystem::Instance().GetVirtualFileSystem());
142 
143   // The line might end in a backslash which would cause Clang to drop the
144   // backslash and the terminating new line. This makes sense when parsing C++,
145   // but when highlighting we care about preserving the backslash/newline. To
146   // not lose this information we remove the new line here so that Clang knows
147   // this is just a single line we are highlighting. We add back the newline
148   // after tokenizing.
149   llvm::StringRef line_ending = "";
150   // There are a few legal line endings Clang recognizes and we need to
151   // temporarily remove from the string.
152   if (line.consume_back("\r\n"))
153     line_ending = "\r\n";
154   else if (line.consume_back("\n"))
155     line_ending = "\n";
156   else if (line.consume_back("\r"))
157     line_ending = "\r";
158 
159   unsigned line_number = previous_lines.count('\n') + 1U;
160 
161   // Let's build the actual source code Clang needs and setup some utility
162   // objects.
163   std::string full_source = previous_lines.str() + line.str();
164   llvm::IntrusiveRefCntPtr<DiagnosticIDs> diag_ids(new DiagnosticIDs());
165   llvm::IntrusiveRefCntPtr<DiagnosticOptions> diags_opts(
166       new DiagnosticOptions());
167   DiagnosticsEngine diags(diag_ids, diags_opts);
168   clang::SourceManager SM(diags, file_mgr);
169   auto buf = llvm::MemoryBuffer::getMemBuffer(full_source);
170 
171   FileID FID = SM.createFileID(buf->getMemBufferRef());
172 
173   // Let's just enable the latest ObjC and C++ which should get most tokens
174   // right.
175   LangOptions Opts;
176   Opts.ObjC = true;
177   // FIXME: This should probably set CPlusPlus, CPlusPlus11, ... too
178   Opts.CPlusPlus17 = true;
179   Opts.LineComment = true;
180 
181   Lexer lex(FID, buf->getMemBufferRef(), SM, Opts);
182   // The lexer should keep whitespace around.
183   lex.SetKeepWhitespaceMode(true);
184 
185   // Keeps track if we have entered a PP directive.
186   bool in_pp_directive = false;
187 
188   // True once we actually lexed the user provided line.
189   bool found_user_line = false;
190 
191   // True if we already highlighted the token under the cursor, false otherwise.
192   bool highlighted_cursor = false;
193   Token token;
194   bool exit = false;
195   while (!exit) {
196     // Returns true if this is the last token we get from the lexer.
197     exit = lex.LexFromRawLexer(token);
198 
199     bool invalid = false;
200     unsigned current_line_number =
201         SM.getSpellingLineNumber(token.getLocation(), &invalid);
202     if (current_line_number != line_number)
203       continue;
204     found_user_line = true;
205 
206     // We don't need to print any tokens without a spelling line number.
207     if (invalid)
208       continue;
209 
210     // Same as above but with the column number.
211     invalid = false;
212     unsigned start = SM.getSpellingColumnNumber(token.getLocation(), &invalid);
213     if (invalid)
214       continue;
215     // Column numbers start at 1, but indexes in our string start at 0.
216     --start;
217 
218     // Annotations don't have a length, so let's skip them.
219     if (token.isAnnotation())
220       continue;
221 
222     // Extract the token string from our source code.
223     llvm::StringRef tok_str = line.substr(start, token.getLength());
224 
225     // If the token is just an empty string, we can skip all the work below.
226     if (tok_str.empty())
227       continue;
228 
229     // If the cursor is inside this token, we have to apply the 'selected'
230     // highlight style before applying the actual token color.
231     llvm::StringRef to_print = tok_str;
232     StreamString storage;
233     auto end = start + token.getLength();
234     if (cursor_pos && end > *cursor_pos && !highlighted_cursor) {
235       highlighted_cursor = true;
236       options.selected.Apply(storage, tok_str);
237       to_print = storage.GetString();
238     }
239 
240     // See how we are supposed to highlight this token.
241     HighlightStyle::ColorStyle color =
242         determineClangStyle(*this, token, tok_str, options, in_pp_directive);
243 
244     color.Apply(result, to_print);
245   }
246 
247   // Add the line ending we trimmed before tokenizing.
248   result << line_ending;
249 
250   // If we went over the whole file but couldn't find our own file, then
251   // somehow our setup was wrong. When we're in release mode we just give the
252   // user the normal line and pretend we don't know how to highlight it. In
253   // debug mode we bail out with an assert as this should never happen.
254   if (!found_user_line) {
255     result << line;
256     assert(false && "We couldn't find the user line in the input file?");
257   }
258 }
259