1 //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
2 //-*- C++ -*-===//
3 //
4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 // See https://llvm.org/LICENSE.txt for license information.
6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements functions to map the name or alias of a unicode
11 // character to its codepoint.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/Support/Unicode.h"
19 
20 namespace llvm {
21 namespace sys {
22 namespace unicode {
23 
24 extern const char *UnicodeNameToCodepointDict;
25 extern const uint8_t *UnicodeNameToCodepointIndex;
26 extern const std::size_t UnicodeNameToCodepointIndexSize;
27 extern const std::size_t UnicodeNameToCodepointLargestNameSize;
28 
29 using BufferType = SmallString<64>;
30 
31 struct Node {
32   bool IsRoot = false;
33   char32_t Value = 0xFFFFFFFF;
34   uint32_t ChildrenOffset = 0;
35   bool HasSibling = false;
36   uint32_t Size = 0;
37   StringRef Name;
38   const Node *Parent = nullptr;
39 
isValidllvm::sys::unicode::Node40   constexpr bool isValid() const {
41     return !Name.empty() || Value == 0xFFFFFFFF;
42   }
hasChildrenllvm::sys::unicode::Node43   constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
44 
fullNamellvm::sys::unicode::Node45   std::string fullName() const {
46     std::string S;
47     // Reserve enough space for most unicode code points.
48     // The chosen value represent the 99th percentile of name size as of
49     // Unicode 15.0.
50     S.reserve(46);
51     const Node *N = this;
52     while (N) {
53       std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
54       N = N->Parent;
55     }
56     std::reverse(S.begin(), S.end());
57     return S;
58   }
59 };
60 
createRoot()61 static Node createRoot() {
62   Node N;
63   N.IsRoot = true;
64   N.ChildrenOffset = 1;
65   N.Size = 1;
66   return N;
67 }
68 
readNode(uint32_t Offset,const Node * Parent=nullptr)69 static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
70   if (Offset == 0)
71     return createRoot();
72 
73   uint32_t Origin = Offset;
74   Node N;
75   N.Parent = Parent;
76   uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
77   if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
78     return N;
79 
80   bool LongName = NameInfo & 0x40;
81   bool HasValue = NameInfo & 0x80;
82   std::size_t Size = NameInfo & ~0xC0;
83   if (LongName) {
84     uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
85     NameOffset |= UnicodeNameToCodepointIndex[Offset++];
86     N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
87   } else {
88     N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
89   }
90   if (HasValue) {
91     uint8_t H = UnicodeNameToCodepointIndex[Offset++];
92     uint8_t M = UnicodeNameToCodepointIndex[Offset++];
93     uint8_t L = UnicodeNameToCodepointIndex[Offset++];
94     N.Value = ((H << 16) | (M << 8) | L) >> 3;
95 
96     bool HasChildren = L & 0x02;
97     N.HasSibling = L & 0x01;
98 
99     if (HasChildren) {
100       N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
101       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
102       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
103     }
104   } else {
105     uint8_t H = UnicodeNameToCodepointIndex[Offset++];
106     N.HasSibling = H & 0x80;
107     bool HasChildren = H & 0x40;
108     H &= uint8_t(~0xC0);
109     if (HasChildren) {
110       N.ChildrenOffset = (H << 16);
111       N.ChildrenOffset |=
112           (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
113       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
114     }
115   }
116   N.Size = Offset - Origin;
117   return N;
118 }
119 
startsWith(StringRef Name,StringRef Needle,bool Strict,std::size_t & Consummed,char & PreviousCharInName,bool IsPrefix=false)120 static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
121                        std::size_t &Consummed, char &PreviousCharInName,
122                        bool IsPrefix = false) {
123 
124   Consummed = 0;
125   if (Strict) {
126     if (!Name.starts_with(Needle))
127       return false;
128     Consummed = Needle.size();
129     return true;
130   }
131   if (Needle.empty())
132     return true;
133 
134   auto NamePos = Name.begin();
135   auto NeedlePos = Needle.begin();
136 
137   char PreviousCharInNameOrigin = PreviousCharInName;
138   char PreviousCharInNeedle = *Needle.begin();
139   auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
140                          bool IsPrefix = false) {
141     while (It != End) {
142       const auto Next = std::next(It);
143       // Ignore spaces, underscore, medial hyphens
144       // The generator ensures a needle never ends (or starts) by a medial
145       // hyphen https://unicode.org/reports/tr44/#UAX44-LM2.
146       bool Ignore =
147           *It == ' ' || *It == '_' ||
148           (*It == '-' && isAlnum(PreviousChar) &&
149            ((Next != End && isAlnum(*Next)) || (Next == End && IsPrefix)));
150       PreviousChar = *It;
151       if (!Ignore)
152         break;
153       ++It;
154     }
155     return It;
156   };
157 
158   while (true) {
159     NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
160     NeedlePos =
161         IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
162     if (NeedlePos == Needle.end())
163       break;
164     if (NamePos == Name.end())
165       break;
166     if (toUpper(*NeedlePos) != toUpper(*NamePos))
167       break;
168     NeedlePos++;
169     NamePos++;
170   }
171   Consummed = std::distance(Name.begin(), NamePos);
172   if (NeedlePos != Needle.end()) {
173     PreviousCharInName = PreviousCharInNameOrigin;
174   }
175   return NeedlePos == Needle.end();
176 }
177 
178 static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset,StringRef Name,bool Strict,char PreviousCharInName,BufferType & Buffer,const Node * Parent=nullptr)179 compareNode(uint32_t Offset, StringRef Name, bool Strict,
180             char PreviousCharInName, BufferType &Buffer,
181             const Node *Parent = nullptr) {
182   Node N = readNode(Offset, Parent);
183   std::size_t Consummed = 0;
184   bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
185                                               PreviousCharInName);
186   if (!DoesStartWith)
187     return std::make_tuple(N, false, 0);
188 
189   if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
190     return std::make_tuple(N, true, N.Value);
191 
192   if (N.hasChildren()) {
193     uint32_t ChildOffset = N.ChildrenOffset;
194     for (;;) {
195       Node C;
196       bool Matches;
197       uint32_t Value;
198       std::tie(C, Matches, Value) =
199           compareNode(ChildOffset, Name.substr(Consummed), Strict,
200                       PreviousCharInName, Buffer, &N);
201       if (Matches) {
202         std::reverse_copy(C.Name.begin(), C.Name.end(),
203                           std::back_inserter(Buffer));
204         return std::make_tuple(N, true, Value);
205       }
206       ChildOffset += C.Size;
207       if (!C.HasSibling)
208         break;
209     }
210   }
211   return std::make_tuple(N, false, 0);
212 }
213 
214 static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset,StringRef Name,bool Strict,BufferType & Buffer)215 compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
216   return compareNode(Offset, Name, Strict, 0, Buffer);
217 }
218 
219 // clang-format off
220 constexpr const char *const HangulSyllables[][3] = {
221     { "G",  "A",   ""   },
222     { "GG", "AE",  "G"  },
223     { "N",  "YA",  "GG" },
224     { "D",  "YAE", "GS" },
225     { "DD", "EO",  "N", },
226     { "R",  "E",   "NJ" },
227     { "M",  "YEO", "NH" },
228     { "B",  "YE",  "D"  },
229     { "BB", "O",   "L"  },
230     { "S",  "WA",  "LG" },
231     { "SS", "WAE", "LM" },
232     { "",   "OE",  "LB" },
233     { "J",  "YO",  "LS" },
234     { "JJ", "U",   "LT" },
235     { "C",  "WEO", "LP" },
236     { "K",  "WE",  "LH" },
237     { "T",  "WI",  "M"  },
238     { "P",  "YU",  "B"  },
239     { "H",  "EU",  "BS" },
240     { 0,    "YI",  "S"  },
241     { 0,    "I",   "SS" },
242     { 0,    0,     "NG" },
243     { 0,    0,     "J"  },
244     { 0,    0,     "C"  },
245     { 0,    0,     "K"  },
246     { 0,    0,     "T"  },
247     { 0,    0,     "P"  },
248     { 0,    0,     "H"  }
249     };
250 // clang-format on
251 
252 // Unicode 15.0
253 // 3.12 Conjoining Jamo Behavior Common constants
254 constexpr const char32_t SBase = 0xAC00;
255 constexpr const uint32_t LCount = 19;
256 constexpr const uint32_t VCount = 21;
257 constexpr const uint32_t TCount = 28;
258 
findSyllable(StringRef Name,bool Strict,char & PreviousInName,int & Pos,int Column)259 static std::size_t findSyllable(StringRef Name, bool Strict,
260                                 char &PreviousInName, int &Pos, int Column) {
261   assert(Column == 0 || Column == 1 || Column == 2);
262   static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
263   int Len = -1;
264   int Prev = PreviousInName;
265   for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
266     StringRef Syllable(HangulSyllables[I][Column]);
267     if (int(Syllable.size()) <= Len)
268       continue;
269     std::size_t Consummed = 0;
270     char PreviousInNameCopy = PreviousInName;
271     bool DoesStartWith =
272         startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy);
273     if (!DoesStartWith)
274       continue;
275     Len = Consummed;
276     Pos = I;
277     Prev = PreviousInNameCopy;
278   }
279   if (Len == -1)
280     return 0;
281   PreviousInName = Prev;
282   return size_t(Len);
283 }
284 
285 static std::optional<char32_t>
nameToHangulCodePoint(StringRef Name,bool Strict,BufferType & Buffer)286 nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
287   Buffer.clear();
288   // Hangul Syllable Decomposition
289   std::size_t Consummed = 0;
290   char NameStart = 0;
291   bool DoesStartWith =
292       startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart);
293   if (!DoesStartWith)
294     return std::nullopt;
295   Name = Name.substr(Consummed);
296   int L = -1, V = -1, T = -1;
297   Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
298   Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
299   Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
300   if (L != -1 && V != -1 && T != -1 && Name.empty()) {
301     if (!Strict) {
302       Buffer.append("HANGUL SYLLABLE ");
303       if (L != -1)
304         Buffer.append(HangulSyllables[L][0]);
305       if (V != -1)
306         Buffer.append(HangulSyllables[V][1]);
307       if (T != -1)
308         Buffer.append(HangulSyllables[T][2]);
309     }
310     return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
311            std::uint32_t(T);
312   }
313   // Otherwise, it's an illegal syllable name.
314   return std::nullopt;
315 }
316 
317 struct GeneratedNamesData {
318   StringRef Prefix;
319   uint32_t Start;
320   uint32_t End;
321 };
322 
323 // Unicode 15.1 Table 4-8. Name Derivation Rule Prefix Strings
324 static const GeneratedNamesData GeneratedNamesDataTable[] = {
325     {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
326     {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
327     {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
328     {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
329     {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
330     {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
331     {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
332     {"CJK UNIFIED IDEOGRAPH-", 0x2EBF0, 0x2EE5D},
333     {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
334     {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
335     {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
336     {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
337     {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
338     {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
339     {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
340     {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
341     {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
342 };
343 
344 static std::optional<char32_t>
nameToGeneratedCodePoint(StringRef Name,bool Strict,BufferType & Buffer)345 nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
346   for (auto &&Item : GeneratedNamesDataTable) {
347     Buffer.clear();
348     std::size_t Consummed = 0;
349     char NameStart = 0;
350     bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
351                                     NameStart, /*IsPrefix=*/true);
352     if (!DoesStartWith)
353       continue;
354     auto Number = Name.substr(Consummed);
355     unsigned long long V = 0;
356     // Be consistent about mandating upper casing.
357     if (Strict &&
358         llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
359       return {};
360     if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
361       continue;
362     if (!Strict) {
363       Buffer.append(Item.Prefix);
364       Buffer.append(utohexstr(V, true));
365     }
366     return V;
367   }
368   return std::nullopt;
369 }
370 
nameToCodepoint(StringRef Name,bool Strict,BufferType & Buffer)371 static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
372                                                BufferType &Buffer) {
373   if (Name.empty())
374     return std::nullopt;
375 
376   std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
377   if (!Res)
378     Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
379   if (Res)
380     return *Res;
381 
382   Buffer.clear();
383   Node Node;
384   bool Matches;
385   uint32_t Value;
386   std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
387   if (Matches) {
388     std::reverse(Buffer.begin(), Buffer.end());
389     // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
390     // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
391     if (!Strict && Value == 0x116c && Name.contains_insensitive("O-E")) {
392       Buffer = "HANGUL JUNGSEONG O-E";
393       Value = 0x1180;
394     }
395     return Value;
396   }
397   return std::nullopt;
398 }
399 
nameToCodepointStrict(StringRef Name)400 std::optional<char32_t> nameToCodepointStrict(StringRef Name) {
401 
402   BufferType Buffer;
403   auto Opt = nameToCodepoint(Name, true, Buffer);
404   return Opt;
405 }
406 
407 std::optional<LooseMatchingResult>
nameToCodepointLooseMatching(StringRef Name)408 nameToCodepointLooseMatching(StringRef Name) {
409   BufferType Buffer;
410   auto Opt = nameToCodepoint(Name, false, Buffer);
411   if (!Opt)
412     return std::nullopt;
413   return LooseMatchingResult{*Opt, Buffer};
414 }
415 
416 // Find the unicode character whose editing distance to Pattern
417 // is shortest, using the Wagner–Fischer algorithm.
418 llvm::SmallVector<MatchForCodepointName>
nearestMatchesForCodepointName(StringRef Pattern,std::size_t MaxMatchesCount)419 nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
420   // We maintain a fixed size vector of matches,
421   // sorted by distance
422   // The worst match (with the biggest distance) are discarded when new elements
423   // are added.
424   std::size_t LargestEditDistance = 0;
425   llvm::SmallVector<MatchForCodepointName> Matches;
426   Matches.reserve(MaxMatchesCount + 1);
427 
428   auto Insert = [&](const Node &Node, uint32_t Distance,
429                     char32_t Value) -> bool {
430     if (Distance > LargestEditDistance) {
431       if (Matches.size() == MaxMatchesCount)
432         return false;
433       LargestEditDistance = Distance;
434     }
435     // To avoid allocations, the creation of the name is delayed
436     // as much as possible.
437     std::string Name;
438     auto GetName = [&] {
439       if (Name.empty())
440         Name = Node.fullName();
441       return Name;
442     };
443 
444     auto It = llvm::lower_bound(
445         Matches, Distance,
446         [&](const MatchForCodepointName &a, std::size_t Distance) {
447           if (Distance == a.Distance)
448             return a.Name < GetName();
449           return a.Distance < Distance;
450         });
451     if (It == Matches.end() && Matches.size() == MaxMatchesCount)
452       return false;
453 
454     MatchForCodepointName M{GetName(), Distance, Value};
455     Matches.insert(It, std::move(M));
456     if (Matches.size() > MaxMatchesCount)
457       Matches.pop_back();
458     return true;
459   };
460 
461   // We ignore case, space, hyphens, etc,
462   // in both the search pattern and the prospective names.
463   auto Normalize = [](StringRef Name) {
464     std::string Out;
465     Out.reserve(Name.size());
466     for (char C : Name) {
467       if (isAlnum(C))
468         Out.push_back(toUpper(C));
469     }
470     return Out;
471   };
472   std::string NormalizedName = Normalize(Pattern);
473 
474   // Allocate a matrix big enough for longest names.
475   const std::size_t Columns =
476       std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
477       1;
478 
479   LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
480       UnicodeNameToCodepointLargestNameSize + 1;
481 
482   std::vector<char> Distances(
483       Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
484 
485   auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
486     assert(Column < Columns);
487     assert(Row < Rows);
488     return Distances[Row * Columns + Column];
489   };
490 
491   for (std::size_t I = 0; I < Columns; I++)
492     Get(I, 0) = I;
493 
494   // Visit the childrens,
495   // Filling (and overriding) the matrix for the name fragment of each node
496   // iteratively. CompleteName is used to collect the actual name of potential
497   // match, respecting case and spacing.
498   auto VisitNode = [&](const Node &N, std::size_t Row,
499                        auto &VisitNode) -> void {
500     std::size_t J = 0;
501     for (; J < N.Name.size(); J++) {
502       if (!isAlnum(N.Name[J]))
503         continue;
504 
505       Get(0, Row) = Row;
506 
507       for (std::size_t I = 1; I < Columns; I++) {
508         const int Delete = Get(I - 1, Row) + 1;
509         const int Insert = Get(I, Row - 1) + 1;
510 
511         const int Replace =
512             Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
513 
514         Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
515       }
516 
517       Row++;
518     }
519 
520     unsigned Cost = Get(Columns - 1, Row - 1);
521     if (N.Value != 0xFFFFFFFF) {
522       Insert(N, Cost, N.Value);
523     }
524 
525     if (N.hasChildren()) {
526       auto ChildOffset = N.ChildrenOffset;
527       for (;;) {
528         Node C = readNode(ChildOffset, &N);
529         ChildOffset += C.Size;
530         if (!C.isValid())
531           break;
532         VisitNode(C, Row, VisitNode);
533         if (!C.HasSibling)
534           break;
535       }
536     }
537   };
538 
539   Node Root = createRoot();
540   VisitNode(Root, 1, VisitNode);
541   return Matches;
542 }
543 
544 } // namespace unicode
545 
546 } // namespace sys
547 } // namespace llvm
548