//===--- ClangCommentHTMLNamedCharacterReferenceEmitter.cpp -----------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This tablegen backend emits an efficient function to translate HTML named
// character references to UTF-8 sequences.
//
//===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
14a7dea167SDimitry Andric #include "TableGenBackends.h"
150b57cec5SDimitry Andric #include "llvm/ADT/SmallString.h"
160b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h"
170b57cec5SDimitry Andric #include "llvm/TableGen/Error.h"
180b57cec5SDimitry Andric #include "llvm/TableGen/Record.h"
190b57cec5SDimitry Andric #include "llvm/TableGen/StringMatcher.h"
200b57cec5SDimitry Andric #include "llvm/TableGen/TableGenBackend.h"
210b57cec5SDimitry Andric #include <vector>
220b57cec5SDimitry Andric 
230b57cec5SDimitry Andric using namespace llvm;
240b57cec5SDimitry Andric 
250b57cec5SDimitry Andric /// Convert a code point to the corresponding UTF-8 sequence represented
260b57cec5SDimitry Andric /// as a C string literal.
270b57cec5SDimitry Andric ///
280b57cec5SDimitry Andric /// \returns true on success.
translateCodePointToUTF8(unsigned CodePoint,SmallVectorImpl<char> & CLiteral)290b57cec5SDimitry Andric static bool translateCodePointToUTF8(unsigned CodePoint,
300b57cec5SDimitry Andric                                      SmallVectorImpl<char> &CLiteral) {
310b57cec5SDimitry Andric   char Translated[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
320b57cec5SDimitry Andric   char *TranslatedPtr = Translated;
330b57cec5SDimitry Andric   if (!ConvertCodePointToUTF8(CodePoint, TranslatedPtr))
340b57cec5SDimitry Andric     return false;
350b57cec5SDimitry Andric 
360b57cec5SDimitry Andric   StringRef UTF8(Translated, TranslatedPtr - Translated);
370b57cec5SDimitry Andric 
380b57cec5SDimitry Andric   raw_svector_ostream OS(CLiteral);
390b57cec5SDimitry Andric   OS << "\"";
400b57cec5SDimitry Andric   for (size_t i = 0, e = UTF8.size(); i != e; ++i) {
410b57cec5SDimitry Andric     OS << "\\x";
420b57cec5SDimitry Andric     OS.write_hex(static_cast<unsigned char>(UTF8[i]));
430b57cec5SDimitry Andric   }
440b57cec5SDimitry Andric   OS << "\"";
450b57cec5SDimitry Andric 
460b57cec5SDimitry Andric   return true;
470b57cec5SDimitry Andric }
480b57cec5SDimitry Andric 
EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper & Records,raw_ostream & OS)49a7dea167SDimitry Andric void clang::EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records,
500b57cec5SDimitry Andric                                                          raw_ostream &OS) {
510b57cec5SDimitry Andric   std::vector<Record *> Tags = Records.getAllDerivedDefinitions("NCR");
520b57cec5SDimitry Andric   std::vector<StringMatcher::StringPair> NameToUTF8;
530b57cec5SDimitry Andric   SmallString<32> CLiteral;
540b57cec5SDimitry Andric   for (std::vector<Record *>::iterator I = Tags.begin(), E = Tags.end();
550b57cec5SDimitry Andric        I != E; ++I) {
560b57cec5SDimitry Andric     Record &Tag = **I;
575ffd83dbSDimitry Andric     std::string Spelling = std::string(Tag.getValueAsString("Spelling"));
580b57cec5SDimitry Andric     uint64_t CodePoint = Tag.getValueAsInt("CodePoint");
590b57cec5SDimitry Andric     CLiteral.clear();
600b57cec5SDimitry Andric     CLiteral.append("return ");
610b57cec5SDimitry Andric     if (!translateCodePointToUTF8(CodePoint, CLiteral)) {
620b57cec5SDimitry Andric       SrcMgr.PrintMessage(Tag.getLoc().front(),
630b57cec5SDimitry Andric                           SourceMgr::DK_Error,
640b57cec5SDimitry Andric                           Twine("invalid code point"));
650b57cec5SDimitry Andric       continue;
660b57cec5SDimitry Andric     }
670b57cec5SDimitry Andric     CLiteral.append(";");
680b57cec5SDimitry Andric 
697a6dacacSDimitry Andric     StringMatcher::StringPair Match(Spelling, std::string(CLiteral));
700b57cec5SDimitry Andric     NameToUTF8.push_back(Match);
710b57cec5SDimitry Andric   }
720b57cec5SDimitry Andric 
735f757f3fSDimitry Andric   emitSourceFileHeader("HTML named character reference to UTF-8 translation",
745f757f3fSDimitry Andric                        OS, Records);
750b57cec5SDimitry Andric 
760b57cec5SDimitry Andric   OS << "StringRef translateHTMLNamedCharacterReferenceToUTF8(\n"
770b57cec5SDimitry Andric         "                                             StringRef Name) {\n";
780b57cec5SDimitry Andric   StringMatcher("Name", NameToUTF8, OS).Emit();
790b57cec5SDimitry Andric   OS << "  return StringRef();\n"
800b57cec5SDimitry Andric      << "}\n\n";
810b57cec5SDimitry Andric }
82