1 //===--- Markup.cpp -----------------------------------------*- C++-*------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "support/Markup.h"
9 #include "llvm/ADT/ArrayRef.h"
10 #include "llvm/ADT/STLExtras.h"
11 #include "llvm/ADT/SmallVector.h"
12 #include "llvm/ADT/StringExtras.h"
13 #include "llvm/ADT/StringRef.h"
14 #include "llvm/Support/Compiler.h"
15 #include "llvm/Support/ErrorHandling.h"
16 #include "llvm/Support/FormatVariadic.h"
17 #include "llvm/Support/raw_ostream.h"
18 #include <cstddef>
19 #include <iterator>
20 #include <memory>
21 #include <string>
22 #include <vector>
23 
24 namespace clang {
25 namespace clangd {
26 namespace markup {
27 namespace {
28 
29 // Is <contents a plausible start to an HTML tag?
30 // Contents may not be the rest of the line, but it's the rest of the plain
31 // text, so we expect to see at least the tag name.
looksLikeTag(llvm::StringRef Contents)32 bool looksLikeTag(llvm::StringRef Contents) {
33   if (Contents.empty())
34     return false;
35   if (Contents.front() == '!' || Contents.front() == '?' ||
36       Contents.front() == '/')
37     return true;
38   // Check the start of the tag name.
39   if (!llvm::isAlpha(Contents.front()))
40     return false;
41   // Drop rest of the tag name, and following whitespace.
42   Contents = Contents
43                  .drop_while([](char C) {
44                    return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':';
45                  })
46                  .drop_while(llvm::isSpace);
47   // The rest of the tag consists of attributes, which have restrictive names.
48   // If we hit '=', all bets are off (attribute values can contain anything).
49   for (; !Contents.empty(); Contents = Contents.drop_front()) {
50     if (llvm::isAlnum(Contents.front()) || llvm::isSpace(Contents.front()))
51       continue;
52     if (Contents.front() == '>' || Contents.startswith("/>"))
53       return true; // May close the tag.
54     if (Contents.front() == '=')
55       return true; // Don't try to parse attribute values.
56     return false;  // Random punctuation means this isn't a tag.
57   }
58   return true; // Potentially incomplete tag.
59 }
60 
61 // Tests whether C should be backslash-escaped in markdown.
62 // The string being escaped is Before + C + After. This is part of a paragraph.
63 // StartsLine indicates whether `Before` is the start of the line.
64 // After may not be everything until the end of the line.
65 //
66 // It's always safe to escape punctuation, but want minimal escaping.
67 // The strategy is to escape the first character of anything that might start
68 // a markdown grammar construct.
needsLeadingEscape(char C,llvm::StringRef Before,llvm::StringRef After,bool StartsLine)69 bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
70                         bool StartsLine) {
71   assert(Before.take_while(llvm::isSpace).empty());
72   auto RulerLength = [&]() -> /*Length*/ unsigned {
73     if (!StartsLine || !Before.empty())
74       return false;
75     llvm::StringRef A = After.rtrim();
76     return llvm::all_of(A, [C](char D) { return C == D; }) ? 1 + A.size() : 0;
77   };
78   auto IsBullet = [&]() {
79     return StartsLine && Before.empty() &&
80            (After.empty() || After.startswith(" "));
81   };
82   auto SpaceSurrounds = [&]() {
83     return (After.empty() || llvm::isSpace(After.front())) &&
84            (Before.empty() || llvm::isSpace(Before.back()));
85   };
86   auto WordSurrounds = [&]() {
87     return (!After.empty() && llvm::isAlnum(After.front())) &&
88            (!Before.empty() && llvm::isAlnum(Before.back()));
89   };
90 
91   switch (C) {
92   case '\\': // Escaped character.
93     return true;
94   case '`': // Code block or inline code
95     // Any number of backticks can delimit an inline code block that can end
96     // anywhere (including on another line). We must escape them all.
97     return true;
98   case '~': // Code block
99     return StartsLine && Before.empty() && After.startswith("~~");
100   case '#': { // ATX heading.
101     if (!StartsLine || !Before.empty())
102       return false;
103     llvm::StringRef Rest = After.ltrim(C);
104     return Rest.empty() || Rest.startswith(" ");
105   }
106   case ']': // Link or link reference.
107     // We escape ] rather than [ here, because it's more constrained:
108     //   ](...) is an in-line link
109     //   ]: is a link reference
110     // The following are only links if the link reference exists:
111     //   ] by itself is a shortcut link
112     //   ][...] is an out-of-line link
113     // Because we never emit link references, we don't need to handle these.
114     return After.startswith(":") || After.startswith("(");
115   case '=': // Setex heading.
116     return RulerLength() > 0;
117   case '_': // Horizontal ruler or matched delimiter.
118     if (RulerLength() >= 3)
119       return true;
120     // Not a delimiter if surrounded by space, or inside a word.
121     // (The rules at word boundaries are subtle).
122     return !(SpaceSurrounds() || WordSurrounds());
123   case '-': // Setex heading, horizontal ruler, or bullet.
124     if (RulerLength() > 0)
125       return true;
126     return IsBullet();
127   case '+': // Bullet list.
128     return IsBullet();
129   case '*': // Bullet list, horizontal ruler, or delimiter.
130     return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds();
131   case '<': // HTML tag (or autolink, which we choose not to escape)
132     return looksLikeTag(After);
133   case '>': // Quote marker. Needs escaping at start of line.
134     return StartsLine && Before.empty();
135   case '&': { // HTML entity reference
136     auto End = After.find(';');
137     if (End == llvm::StringRef::npos)
138       return false;
139     llvm::StringRef Content = After.substr(0, End);
140     if (Content.consume_front("#")) {
141       if (Content.consume_front("x") || Content.consume_front("X"))
142         return llvm::all_of(Content, llvm::isHexDigit);
143       return llvm::all_of(Content, llvm::isDigit);
144     }
145     return llvm::all_of(Content, llvm::isAlpha);
146   }
147   case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line.
148   case ')':
149     return StartsLine && !Before.empty() &&
150            llvm::all_of(Before, llvm::isDigit) && After.startswith(" ");
151   default:
152     return false;
153   }
154 }
155 
156 /// Escape a markdown text block. Ensures the punctuation will not introduce
157 /// any of the markdown constructs.
renderText(llvm::StringRef Input,bool StartsLine)158 std::string renderText(llvm::StringRef Input, bool StartsLine) {
159   std::string R;
160   for (unsigned I = 0; I < Input.size(); ++I) {
161     if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1),
162                            StartsLine))
163       R.push_back('\\');
164     R.push_back(Input[I]);
165   }
166   return R;
167 }
168 
169 /// Renders \p Input as an inline block of code in markdown. The returned value
170 /// is surrounded by backticks and the inner contents are properly escaped.
renderInlineBlock(llvm::StringRef Input)171 std::string renderInlineBlock(llvm::StringRef Input) {
172   std::string R;
173   // Double all backticks to make sure we don't close the inline block early.
174   for (size_t From = 0; From < Input.size();) {
175     size_t Next = Input.find("`", From);
176     R += Input.substr(From, Next - From);
177     if (Next == llvm::StringRef::npos)
178       break;
179     R += "``"; // double the found backtick.
180 
181     From = Next + 1;
182   }
183   // If results starts with a backtick, add spaces on both sides. The spaces
184   // are ignored by markdown renderers.
185   if (llvm::StringRef(R).startswith("`") || llvm::StringRef(R).endswith("`"))
186     return "` " + std::move(R) + " `";
187   // Markdown render should ignore first and last space if both are there. We
188   // add an extra pair of spaces in that case to make sure we render what the
189   // user intended.
190   if (llvm::StringRef(R).startswith(" ") && llvm::StringRef(R).endswith(" "))
191     return "` " + std::move(R) + " `";
192   return "`" + std::move(R) + "`";
193 }
194 
195 /// Get marker required for \p Input to represent a markdown codeblock. It
196 /// consists of at least 3 backticks(`). Although markdown also allows to use
197 /// tilde(~) for code blocks, they are never used.
getMarkerForCodeBlock(llvm::StringRef Input)198 std::string getMarkerForCodeBlock(llvm::StringRef Input) {
199   // Count the maximum number of consecutive backticks in \p Input. We need to
200   // start and end the code block with more.
201   unsigned MaxBackticks = 0;
202   unsigned Backticks = 0;
203   for (char C : Input) {
204     if (C == '`') {
205       ++Backticks;
206       continue;
207     }
208     MaxBackticks = std::max(MaxBackticks, Backticks);
209     Backticks = 0;
210   }
211   MaxBackticks = std::max(Backticks, MaxBackticks);
212   // Use the corresponding number of backticks to start and end a code block.
213   return std::string(/*Repeat=*/std::max(3u, MaxBackticks + 1), '`');
214 }
215 
216 // Trims the input and concatenates whitespace blocks into a single ` `.
canonicalizeSpaces(llvm::StringRef Input)217 std::string canonicalizeSpaces(llvm::StringRef Input) {
218   llvm::SmallVector<llvm::StringRef> Words;
219   llvm::SplitString(Input, Words);
220   return llvm::join(Words, " ");
221 }
222 
renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children,void (Block::* RenderFunc)(llvm::raw_ostream &)const)223 std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children,
224                          void (Block::*RenderFunc)(llvm::raw_ostream &) const) {
225   std::string R;
226   llvm::raw_string_ostream OS(R);
227 
228   // Trim rulers.
229   Children = Children.drop_while(
230       [](const std::unique_ptr<Block> &C) { return C->isRuler(); });
231   auto Last = llvm::find_if(
232       llvm::reverse(Children),
233       [](const std::unique_ptr<Block> &C) { return !C->isRuler(); });
234   Children = Children.drop_back(Children.end() - Last.base());
235 
236   bool LastBlockWasRuler = true;
237   for (const auto &C : Children) {
238     if (C->isRuler() && LastBlockWasRuler)
239       continue;
240     LastBlockWasRuler = C->isRuler();
241     ((*C).*RenderFunc)(OS);
242   }
243 
244   // Get rid of redundant empty lines introduced in plaintext while imitating
245   // padding in markdown.
246   std::string AdjustedResult;
247   llvm::StringRef TrimmedText(OS.str());
248   TrimmedText = TrimmedText.trim();
249 
250   llvm::copy_if(TrimmedText, std::back_inserter(AdjustedResult),
251                 [&TrimmedText](const char &C) {
252                   return !llvm::StringRef(TrimmedText.data(),
253                                           &C - TrimmedText.data() + 1)
254                               // We allow at most two newlines.
255                               .endswith("\n\n\n");
256                 });
257 
258   return AdjustedResult;
259 }
260 
261 // Separates two blocks with extra spacing. Note that it might render strangely
262 // in vscode if the trailing block is a codeblock, see
263 // https://github.com/microsoft/vscode/issues/88416 for details.
264 class Ruler : public Block {
265 public:
renderMarkdown(llvm::raw_ostream & OS) const266   void renderMarkdown(llvm::raw_ostream &OS) const override {
267     // Note that we need an extra new line before the ruler, otherwise we might
268     // make previous block a title instead of introducing a ruler.
269     OS << "\n---\n";
270   }
renderPlainText(llvm::raw_ostream & OS) const271   void renderPlainText(llvm::raw_ostream &OS) const override { OS << '\n'; }
clone() const272   std::unique_ptr<Block> clone() const override {
273     return std::make_unique<Ruler>(*this);
274   }
isRuler() const275   bool isRuler() const override { return true; }
276 };
277 
278 class CodeBlock : public Block {
279 public:
renderMarkdown(llvm::raw_ostream & OS) const280   void renderMarkdown(llvm::raw_ostream &OS) const override {
281     std::string Marker = getMarkerForCodeBlock(Contents);
282     // No need to pad from previous blocks, as they should end with a new line.
283     OS << Marker << Language << '\n' << Contents << '\n' << Marker << '\n';
284   }
285 
renderPlainText(llvm::raw_ostream & OS) const286   void renderPlainText(llvm::raw_ostream &OS) const override {
287     // In plaintext we want one empty line before and after codeblocks.
288     OS << '\n' << Contents << "\n\n";
289   }
290 
clone() const291   std::unique_ptr<Block> clone() const override {
292     return std::make_unique<CodeBlock>(*this);
293   }
294 
CodeBlock(std::string Contents,std::string Language)295   CodeBlock(std::string Contents, std::string Language)
296       : Contents(std::move(Contents)), Language(std::move(Language)) {}
297 
298 private:
299   std::string Contents;
300   std::string Language;
301 };
302 
303 // Inserts two spaces after each `\n` to indent each line. First line is not
304 // indented.
indentLines(llvm::StringRef Input)305 std::string indentLines(llvm::StringRef Input) {
306   assert(!Input.endswith("\n") && "Input should've been trimmed.");
307   std::string IndentedR;
308   // We'll add 2 spaces after each new line.
309   IndentedR.reserve(Input.size() + Input.count('\n') * 2);
310   for (char C : Input) {
311     IndentedR += C;
312     if (C == '\n')
313       IndentedR.append("  ");
314   }
315   return IndentedR;
316 }
317 
318 class Heading : public Paragraph {
319 public:
Heading(size_t Level)320   Heading(size_t Level) : Level(Level) {}
renderMarkdown(llvm::raw_ostream & OS) const321   void renderMarkdown(llvm::raw_ostream &OS) const override {
322     OS << std::string(Level, '#') << ' ';
323     Paragraph::renderMarkdown(OS);
324   }
325 
326 private:
327   size_t Level;
328 };
329 
330 } // namespace
331 
asMarkdown() const332 std::string Block::asMarkdown() const {
333   std::string R;
334   llvm::raw_string_ostream OS(R);
335   renderMarkdown(OS);
336   return llvm::StringRef(OS.str()).trim().str();
337 }
338 
asPlainText() const339 std::string Block::asPlainText() const {
340   std::string R;
341   llvm::raw_string_ostream OS(R);
342   renderPlainText(OS);
343   return llvm::StringRef(OS.str()).trim().str();
344 }
345 
renderMarkdown(llvm::raw_ostream & OS) const346 void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const {
347   bool NeedsSpace = false;
348   bool HasChunks = false;
349   for (auto &C : Chunks) {
350     if (C.SpaceBefore || NeedsSpace)
351       OS << " ";
352     switch (C.Kind) {
353     case Chunk::PlainText:
354       OS << renderText(C.Contents, !HasChunks);
355       break;
356     case Chunk::InlineCode:
357       OS << renderInlineBlock(C.Contents);
358       break;
359     }
360     HasChunks = true;
361     NeedsSpace = C.SpaceAfter;
362   }
363   // Paragraphs are translated into markdown lines, not markdown paragraphs.
364   // Therefore it only has a single linebreak afterwards.
365   // VSCode requires two spaces at the end of line to start a new one.
366   OS << "  \n";
367 }
368 
clone() const369 std::unique_ptr<Block> Paragraph::clone() const {
370   return std::make_unique<Paragraph>(*this);
371 }
372 
373 /// Choose a marker to delimit `Text` from a prioritized list of options.
374 /// This is more readable than escaping for plain-text.
chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,llvm::StringRef Text)375 llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
376                              llvm::StringRef Text) {
377   // Prefer a delimiter whose characters don't appear in the text.
378   for (llvm::StringRef S : Options)
379     if (Text.find_first_of(S) == llvm::StringRef::npos)
380       return S;
381   return Options.front();
382 }
383 
renderPlainText(llvm::raw_ostream & OS) const384 void Paragraph::renderPlainText(llvm::raw_ostream &OS) const {
385   bool NeedsSpace = false;
386   for (auto &C : Chunks) {
387     if (C.SpaceBefore || NeedsSpace)
388       OS << " ";
389     llvm::StringRef Marker = "";
390     if (C.Preserve && C.Kind == Chunk::InlineCode)
391       Marker = chooseMarker({"`", "'", "\""}, C.Contents);
392     OS << Marker << C.Contents << Marker;
393     NeedsSpace = C.SpaceAfter;
394   }
395   OS << '\n';
396 }
397 
renderMarkdown(llvm::raw_ostream & OS) const398 void BulletList::renderMarkdown(llvm::raw_ostream &OS) const {
399   for (auto &D : Items) {
400     // Instead of doing this we might prefer passing Indent to children to get
401     // rid of the copies, if it turns out to be a bottleneck.
402     OS << "- " << indentLines(D.asMarkdown()) << '\n';
403   }
404   // We need a new line after list to terminate it in markdown.
405   OS << '\n';
406 }
407 
renderPlainText(llvm::raw_ostream & OS) const408 void BulletList::renderPlainText(llvm::raw_ostream &OS) const {
409   for (auto &D : Items) {
410     // Instead of doing this we might prefer passing Indent to children to get
411     // rid of the copies, if it turns out to be a bottleneck.
412     OS << "- " << indentLines(D.asPlainText()) << '\n';
413   }
414 }
415 
appendSpace()416 Paragraph &Paragraph::appendSpace() {
417   if (!Chunks.empty())
418     Chunks.back().SpaceAfter = true;
419   return *this;
420 }
421 
appendText(llvm::StringRef Text)422 Paragraph &Paragraph::appendText(llvm::StringRef Text) {
423   std::string Norm = canonicalizeSpaces(Text);
424   if (Norm.empty())
425     return *this;
426   Chunks.emplace_back();
427   Chunk &C = Chunks.back();
428   C.Contents = std::move(Norm);
429   C.Kind = Chunk::PlainText;
430   C.SpaceBefore = llvm::isSpace(Text.front());
431   C.SpaceAfter = llvm::isSpace(Text.back());
432   return *this;
433 }
434 
appendCode(llvm::StringRef Code,bool Preserve)435 Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) {
436   bool AdjacentCode =
437       !Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode;
438   std::string Norm = canonicalizeSpaces(std::move(Code));
439   if (Norm.empty())
440     return *this;
441   Chunks.emplace_back();
442   Chunk &C = Chunks.back();
443   C.Contents = std::move(Norm);
444   C.Kind = Chunk::InlineCode;
445   C.Preserve = Preserve;
446   // Disallow adjacent code spans without spaces, markdown can't render them.
447   C.SpaceBefore = AdjacentCode;
448   return *this;
449 }
450 
clone() const451 std::unique_ptr<Block> BulletList::clone() const {
452   return std::make_unique<BulletList>(*this);
453 }
454 
addItem()455 class Document &BulletList::addItem() {
456   Items.emplace_back();
457   return Items.back();
458 }
459 
operator =(const Document & Other)460 Document &Document::operator=(const Document &Other) {
461   Children.clear();
462   for (const auto &C : Other.Children)
463     Children.push_back(C->clone());
464   return *this;
465 }
466 
append(Document Other)467 void Document::append(Document Other) {
468   std::move(Other.Children.begin(), Other.Children.end(),
469             std::back_inserter(Children));
470 }
471 
addParagraph()472 Paragraph &Document::addParagraph() {
473   Children.push_back(std::make_unique<Paragraph>());
474   return *static_cast<Paragraph *>(Children.back().get());
475 }
476 
addRuler()477 void Document::addRuler() { Children.push_back(std::make_unique<Ruler>()); }
478 
addCodeBlock(std::string Code,std::string Language)479 void Document::addCodeBlock(std::string Code, std::string Language) {
480   Children.emplace_back(
481       std::make_unique<CodeBlock>(std::move(Code), std::move(Language)));
482 }
483 
asMarkdown() const484 std::string Document::asMarkdown() const {
485   return renderBlocks(Children, &Block::renderMarkdown);
486 }
487 
asPlainText() const488 std::string Document::asPlainText() const {
489   return renderBlocks(Children, &Block::renderPlainText);
490 }
491 
addBulletList()492 BulletList &Document::addBulletList() {
493   Children.emplace_back(std::make_unique<BulletList>());
494   return *static_cast<BulletList *>(Children.back().get());
495 }
496 
addHeading(size_t Level)497 Paragraph &Document::addHeading(size_t Level) {
498   assert(Level > 0);
499   Children.emplace_back(std::make_unique<Heading>(Level));
500   return *static_cast<Paragraph *>(Children.back().get());
501 }
502 } // namespace markup
503 } // namespace clangd
504 } // namespace clang
505