1 //===- llvm/ADT/StringExtras.h - Useful string functions --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains some functions that are useful when dealing with strings.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_ADT_STRINGEXTRAS_H
15 #define LLVM_ADT_STRINGEXTRAS_H
16 
17 #include "llvm/ADT/APSInt.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/SmallString.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/ADT/Twine.h"
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25 #include <cstdlib>
26 #include <cstring>
27 #include <iterator>
28 #include <string>
29 #include <utility>
30 
31 namespace llvm {
32 
33 class raw_ostream;
34 
/// Map a nibble value to its hexadecimal character.
///
/// \param X the value to convert; must be less than 16 (checked by assert).
/// \param LowerCase when true, the letter digits are produced as 'a'..'f'
///        instead of 'A'..'F'.
/// \returns the hexadecimal character for \p X.
inline char hexdigit(unsigned X, bool LowerCase = false) {
  assert(X < 16);
  static const char UpperDigits[] = "0123456789ABCDEF";
  // OR-ing in bit 0x20 lowercases 'A'..'F' and leaves '0'..'9' untouched,
  // because the decimal digits already have that bit set.
  const uint8_t CaseBit = LowerCase ? 0x20 : 0x00;
  const int Digit = UpperDigits[X] | CaseBit;
  return static_cast<char>(Digit);
}
43 
44 /// Given an array of c-style strings terminated by a null pointer, construct
45 /// a vector of StringRefs representing the same strings without the terminating
46 /// null string.
47 inline std::vector<StringRef> toStringRefArray(const char *const *Strings) {
48   std::vector<StringRef> Result;
49   while (*Strings)
50     Result.push_back(*Strings++);
51   return Result;
52 }
53 
54 /// Construct a string ref from a boolean.
55 inline StringRef toStringRef(bool B) { return StringRef(B ? "true" : "false"); }
56 
57 /// Construct a string ref from an array ref of unsigned chars.
58 inline StringRef toStringRef(ArrayRef<uint8_t> Input) {
59   return StringRef(reinterpret_cast<const char *>(Input.begin()), Input.size());
60 }
61 
62 /// Construct a string ref from an array ref of unsigned chars.
63 inline ArrayRef<uint8_t> arrayRefFromStringRef(StringRef Input) {
64   return {Input.bytes_begin(), Input.bytes_end()};
65 }
66 
/// Decode the hexadecimal digit \p C.
///
/// \returns the digit's value (0..15) for '0'..'9', 'A'..'F' and 'a'..'f';
/// for any other character, -1U (i.e. ~0U) is returned.
inline unsigned hexDigitValue(char C) {
  // Table entry for characters that are not hex digits. Converting it to the
  // unsigned return type yields ~0U.
  constexpr int16_t NA = -1;
  /* clang-format off */
  static const int16_t DigitValues[256] = {
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, NA, NA, NA, NA, NA, NA,  // '0'..'9'
    NA, 10, 11, 12, 13, 14, 15, NA, NA, NA, NA, NA, NA, NA, NA, NA,  // 'A'..'F'
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, 10, 11, 12, 13, 14, 15, NA, NA, NA, NA, NA, NA, NA, NA, NA,  // 'a'..'f'
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
  };
  /* clang-format on */
  // Index through unsigned char so characters with the high bit set cannot
  // produce a negative index.
  const unsigned char Index = static_cast<unsigned char>(C);
  return DigitValues[Index];
}
94 
/// Returns true when \p C is one of the decimal digits '0' through '9'.
inline bool isDigit(char C) {
  const bool BelowZero = C < '0';
  const bool AboveNine = C > '9';
  return !(BelowZero || AboveNine);
}
97 
98 /// Checks if character \p C is a hexadecimal numeric character.
99 inline bool isHexDigit(char C) { return hexDigitValue(C) != ~0U; }
100 
/// Returns true when \p C is a letter as classified by the "C" locale, that
/// is, in 'a'..'z' or 'A'..'Z'.
inline bool isAlpha(char C) {
  const bool IsLower = C >= 'a' && C <= 'z';
  const bool IsUpper = C >= 'A' && C <= 'Z';
  return IsLower || IsUpper;
}
105 
106 /// Checks whether character \p C is either a decimal digit or an uppercase or
107 /// lowercase letter as classified by "C" locale.
108 inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }
109 
/// Returns true when \p C is an ASCII character, i.e. its high bit is zero.
inline bool isASCII(char C) {
  return (static_cast<unsigned char>(C) & 0x80) == 0;
}
112 
113 /// Checks whether all characters in S are ASCII.
114 inline bool isASCII(llvm::StringRef S) {
115   for (char C : S)
116     if (LLVM_UNLIKELY(!isASCII(C)))
117       return false;
118   return true;
119 }
120 
/// Returns true when \p C is printable, i.e. lies in the range 0x20 (space)
/// through 0x7E ('~').
///
/// This is a locale-independent version of the C standard library isprint,
/// whose results may differ on different platforms.
inline bool isPrint(char C) {
  const unsigned char Code = static_cast<unsigned char>(C);
  if (Code < 0x20)
    return false;
  return Code < 0x7F;
}
129 
/// Returns true when \p C is whitespace in the "C" locale: space, form feed,
/// newline, carriage return, horizontal tab or vertical tab.
///
/// This is a locale-independent version of the C standard library isspace.
inline bool isSpace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\n':
  case '\v':
  case '\f':
  case '\r':
    return true;
  default:
    return false;
  }
}
137 
/// Returns the lowercase counterpart of \p x when it is in 'A'..'Z'; any other
/// character is returned unchanged.
inline char toLower(char x) {
  const bool IsUpper = 'A' <= x && x <= 'Z';
  return IsUpper ? static_cast<char>(x - 'A' + 'a') : x;
}
144 
/// Returns the uppercase counterpart of \p x when it is in 'a'..'z'; any other
/// character is returned unchanged.
inline char toUpper(char x) {
  const bool IsLower = 'a' <= x && x <= 'z';
  return IsLower ? static_cast<char>(x - 'a' + 'A') : x;
}
151 
152 inline std::string utohexstr(uint64_t X, bool LowerCase = false,
153                              unsigned Width = 0) {
154   char Buffer[17];
155   char *BufPtr = std::end(Buffer);
156 
157   if (X == 0) *--BufPtr = '0';
158 
159   for (unsigned i = 0; Width ? (i < Width) : X; ++i) {
160     unsigned char Mod = static_cast<unsigned char>(X) & 15;
161     *--BufPtr = hexdigit(Mod, LowerCase);
162     X >>= 4;
163   }
164 
165   return std::string(BufPtr, std::end(Buffer));
166 }
167 
168 /// Convert buffer \p Input to its hexadecimal representation.
169 /// The returned string is double the size of \p Input.
170 inline void toHex(ArrayRef<uint8_t> Input, bool LowerCase,
171                   SmallVectorImpl<char> &Output) {
172   const size_t Length = Input.size();
173   Output.resize_for_overwrite(Length * 2);
174 
175   for (size_t i = 0; i < Length; i++) {
176     const uint8_t c = Input[i];
177     Output[i * 2    ] = hexdigit(c >> 4, LowerCase);
178     Output[i * 2 + 1] = hexdigit(c & 15, LowerCase);
179   }
180 }
181 
182 inline std::string toHex(ArrayRef<uint8_t> Input, bool LowerCase = false) {
183   SmallString<16> Output;
184   toHex(Input, LowerCase, Output);
185   return std::string(Output);
186 }
187 
188 inline std::string toHex(StringRef Input, bool LowerCase = false) {
189   return toHex(arrayRefFromStringRef(Input), LowerCase);
190 }
191 
192 /// Store the binary representation of the two provided values, \p MSB and
193 /// \p LSB, that make up the nibbles of a hexadecimal digit. If \p MSB or \p LSB
194 /// do not correspond to proper nibbles of a hexadecimal digit, this method
195 /// returns false. Otherwise, returns true.
196 inline bool tryGetHexFromNibbles(char MSB, char LSB, uint8_t &Hex) {
197   unsigned U1 = hexDigitValue(MSB);
198   unsigned U2 = hexDigitValue(LSB);
199   if (U1 == ~0U || U2 == ~0U)
200     return false;
201 
202   Hex = static_cast<uint8_t>((U1 << 4) | U2);
203   return true;
204 }
205 
206 /// Return the binary representation of the two provided values, \p MSB and
207 /// \p LSB, that make up the nibbles of a hexadecimal digit.
208 inline uint8_t hexFromNibbles(char MSB, char LSB) {
209   uint8_t Hex = 0;
210   bool GotHex = tryGetHexFromNibbles(MSB, LSB, Hex);
211   (void)GotHex;
212   assert(GotHex && "MSB and/or LSB do not correspond to hex digits");
213   return Hex;
214 }
215 
216 /// Convert hexadecimal string \p Input to its binary representation and store
217 /// the result in \p Output. Returns true if the binary representation could be
218 /// converted from the hexadecimal string. Returns false if \p Input contains
219 /// non-hexadecimal digits. The output string is half the size of \p Input.
220 inline bool tryGetFromHex(StringRef Input, std::string &Output) {
221   if (Input.empty())
222     return true;
223 
224   // If the input string is not properly aligned on 2 nibbles we pad out the
225   // front with a 0 prefix; e.g. `ABC` -> `0ABC`.
226   Output.resize((Input.size() + 1) / 2);
227   char *OutputPtr = const_cast<char *>(Output.data());
228   if (Input.size() % 2 == 1) {
229     uint8_t Hex = 0;
230     if (!tryGetHexFromNibbles('0', Input.front(), Hex))
231       return false;
232     *OutputPtr++ = Hex;
233     Input = Input.drop_front();
234   }
235 
236   // Convert the nibble pairs (e.g. `9C`) into bytes (0x9C).
237   // With the padding above we know the input is aligned and the output expects
238   // exactly half as many bytes as nibbles in the input.
239   size_t InputSize = Input.size();
240   assert(InputSize % 2 == 0);
241   const char *InputPtr = Input.data();
242   for (size_t OutputIndex = 0; OutputIndex < InputSize / 2; ++OutputIndex) {
243     uint8_t Hex = 0;
244     if (!tryGetHexFromNibbles(InputPtr[OutputIndex * 2 + 0], // MSB
245                               InputPtr[OutputIndex * 2 + 1], // LSB
246                               Hex))
247       return false;
248     OutputPtr[OutputIndex] = Hex;
249   }
250   return true;
251 }
252 
253 /// Convert hexadecimal string \p Input to its binary representation.
254 /// The return string is half the size of \p Input.
255 inline std::string fromHex(StringRef Input) {
256   std::string Hex;
257   bool GotHex = tryGetFromHex(Input, Hex);
258   (void)GotHex;
259   assert(GotHex && "Input contains non hex digits");
260   return Hex;
261 }
262 
263 /// Convert the string \p S to an integer of the specified type using
264 /// the radix \p Base.  If \p Base is 0, auto-detects the radix.
265 /// Returns true if the number was successfully converted, false otherwise.
266 template <typename N> bool to_integer(StringRef S, N &Num, unsigned Base = 0) {
267   return !S.getAsInteger(Base, Num);
268 }
269 
270 namespace detail {
271 template <typename N>
272 inline bool to_float(const Twine &T, N &Num, N (*StrTo)(const char *, char **)) {
273   SmallString<32> Storage;
274   StringRef S = T.toNullTerminatedStringRef(Storage);
275   char *End;
276   N Temp = StrTo(S.data(), &End);
277   if (*End != '\0')
278     return false;
279   Num = Temp;
280   return true;
281 }
282 }
283 
284 inline bool to_float(const Twine &T, float &Num) {
285   return detail::to_float(T, Num, strtof);
286 }
287 
288 inline bool to_float(const Twine &T, double &Num) {
289   return detail::to_float(T, Num, strtod);
290 }
291 
292 inline bool to_float(const Twine &T, long double &Num) {
293   return detail::to_float(T, Num, strtold);
294 }
295 
/// Convert \p X to its decimal string representation.
///
/// \param X the magnitude to format.
/// \param isNeg when true, a '-' sign is placed in front of the digits.
inline std::string utostr(uint64_t X, bool isNeg = false) {
  // 20 digits cover the largest uint64_t; one more slot holds the sign.
  char Digits[21];
  char *Cursor = std::end(Digits);

  // Emit the digits from least to most significant, filling the buffer from
  // the back. The do/while form guarantees a single '0' for X == 0.
  do {
    *--Cursor = static_cast<char>('0' + X % 10);
    X /= 10;
  } while (X != 0);

  if (isNeg)
    *--Cursor = '-';
  return std::string(Cursor, std::end(Digits));
}
310 
311 inline std::string itostr(int64_t X) {
312   if (X < 0)
313     return utostr(static_cast<uint64_t>(1) + ~static_cast<uint64_t>(X), true);
314   else
315     return utostr(static_cast<uint64_t>(X));
316 }
317 
318 inline std::string toString(const APInt &I, unsigned Radix, bool Signed,
319                             bool formatAsCLiteral = false) {
320   SmallString<40> S;
321   I.toString(S, Radix, Signed, formatAsCLiteral);
322   return std::string(S.str());
323 }
324 
325 inline std::string toString(const APSInt &I, unsigned Radix) {
326   return toString(I, Radix, I.isSigned());
327 }
328 
/// StrInStrNoCase - Portable version of strcasestr.  Locates the first
/// occurrence of string 's2' in string 's1', ignoring case.  Returns
/// the offset of s2 in s1 or npos if s2 cannot be found.
StringRef::size_type StrInStrNoCase(StringRef s1, StringRef s2);
333 
/// getToken - This function extracts one token from source, ignoring any
/// leading characters that appear in the Delimiters string, and ending the
/// token at any of the characters that appear in the Delimiters string.  If
/// there are no tokens in the source string, an empty string is returned.
/// The function returns a pair containing the extracted token and the
/// remaining tail string.
///
/// \param Source the string to extract the token from.
/// \param Delimiters the set of delimiter characters; by default space, tab,
///        newline, vertical tab, form feed and carriage return.
std::pair<StringRef, StringRef> getToken(StringRef Source,
                                         StringRef Delimiters = " \t\n\v\f\r");
342 
/// SplitString - Split up the specified string according to the specified
/// delimiters, appending the result fragments to the output list.
///
/// \param Source the string to split.
/// \param OutFragments the list the resulting fragments are appended to.
/// \param Delimiters the set of delimiter characters; by default space, tab,
///        newline, vertical tab, form feed and carriage return.
void SplitString(StringRef Source,
                 SmallVectorImpl<StringRef> &OutFragments,
                 StringRef Delimiters = " \t\n\v\f\r");
348 
349 /// Returns the English suffix for an ordinal integer (-st, -nd, -rd, -th).
350 inline StringRef getOrdinalSuffix(unsigned Val) {
351   // It is critically important that we do this perfectly for
352   // user-written sequences with over 100 elements.
353   switch (Val % 100) {
354   case 11:
355   case 12:
356   case 13:
357     return "th";
358   default:
359     switch (Val % 10) {
360       case 1: return "st";
361       case 2: return "nd";
362       case 3: return "rd";
363       default: return "th";
364     }
365   }
366 }
367 
/// Print each character of the specified string, escaping it if it is not
/// printable or if it is an escape char.
///
/// \param Name the string to print.
/// \param Out the stream the characters are printed to.
void printEscapedString(StringRef Name, raw_ostream &Out);

/// Print each character of the specified string, escaping HTML special
/// characters.
///
/// \param String the string to print.
/// \param Out the stream the characters are printed to.
void printHTMLEscaped(StringRef String, raw_ostream &Out);

/// printLowerCase - Print each character as lowercase if it is uppercase.
///
/// \param String the string to print.
/// \param Out the stream the characters are printed to.
void printLowerCase(StringRef String, raw_ostream &Out);
378 
/// Converts a string from camel-case to snake-case by replacing all uppercase
/// letters with '_' followed by the letter in lowercase, except if the
/// uppercase letter is the first character of the string.
///
/// \param input the camel-case string to convert.
/// \returns the converted string.
std::string convertToSnakeFromCamelCase(StringRef input);

/// Converts a string from snake-case to camel-case by replacing all occurrences
/// of '_' followed by a lowercase letter with the letter in uppercase.
/// Optionally allow capitalization of the first letter (if it is a lowercase
/// letter).
///
/// \param input the snake-case string to convert.
/// \param capitalizeFirst whether a lowercase first letter is capitalized;
///        defaults to false.
/// \returns the converted string.
std::string convertToCamelFromSnakeCase(StringRef input,
                                        bool capitalizeFirst = false);
390 
391 namespace detail {
392 
393 template <typename IteratorT>
394 inline std::string join_impl(IteratorT Begin, IteratorT End,
395                              StringRef Separator, std::input_iterator_tag) {
396   std::string S;
397   if (Begin == End)
398     return S;
399 
400   S += (*Begin);
401   while (++Begin != End) {
402     S += Separator;
403     S += (*Begin);
404   }
405   return S;
406 }
407 
408 template <typename IteratorT>
409 inline std::string join_impl(IteratorT Begin, IteratorT End,
410                              StringRef Separator, std::forward_iterator_tag) {
411   std::string S;
412   if (Begin == End)
413     return S;
414 
415   size_t Len = (std::distance(Begin, End) - 1) * Separator.size();
416   for (IteratorT I = Begin; I != End; ++I)
417     Len += (*I).size();
418   S.reserve(Len);
419   size_t PrevCapacity = S.capacity();
420   (void)PrevCapacity;
421   S += (*Begin);
422   while (++Begin != End) {
423     S += Separator;
424     S += (*Begin);
425   }
426   assert(PrevCapacity == S.capacity() && "String grew during building");
427   return S;
428 }
429 
/// Recursion terminator for an empty item list: nothing is appended.
template <typename Sep>
inline void join_items_impl(std::string &, Sep) {}

/// Append the last item to \p Result; no separator follows it.
template <typename Sep, typename Arg>
inline void join_items_impl(std::string &Result, Sep, const Arg &Last) {
  Result += Last;
}

/// Append \p Head and the separator to \p Result, then recurse on the
/// remaining items.
template <typename Sep, typename Arg1, typename... Args>
inline void join_items_impl(std::string &Result, Sep Separator,
                            const Arg1 &Head, Args &&... Rest) {
  Result += Head;
  Result += Separator;
  join_items_impl(Result, Separator, std::forward<Args>(Rest)...);
}
446 
/// Length contributed by a single character item.
inline size_t join_one_item_size(char) { return 1; }

/// Length of a C string item; a null pointer counts as length 0.
inline size_t join_one_item_size(const char *S) {
  if (!S)
    return 0;
  return ::strlen(S);
}

/// Length of any other item, obtained from its size() member.
template <typename T> inline size_t join_one_item_size(const T &Str) {
  return Str.size();
}

/// Total length of an empty item list.
inline size_t join_items_size() { return 0; }

/// Total length of a list holding a single item.
template <typename A1> inline size_t join_items_size(const A1 &A) {
  return join_one_item_size(A);
}

/// Total length of all items: the first item plus the rest, recursively.
template <typename A1, typename... Args>
inline size_t join_items_size(const A1 &A, Args &&... Items) {
  const size_t HeadSize = join_one_item_size(A);
  const size_t TailSize = join_items_size(std::forward<Args>(Items)...);
  return HeadSize + TailSize;
}
463 
464 } // end namespace detail
465 
466 /// Joins the strings in the range [Begin, End), adding Separator between
467 /// the elements.
468 template <typename IteratorT>
469 inline std::string join(IteratorT Begin, IteratorT End, StringRef Separator) {
470   using tag = typename std::iterator_traits<IteratorT>::iterator_category;
471   return detail::join_impl(Begin, End, Separator, tag());
472 }
473 
474 /// Joins the strings in the range [R.begin(), R.end()), adding Separator
475 /// between the elements.
476 template <typename Range>
477 inline std::string join(Range &&R, StringRef Separator) {
478   return join(R.begin(), R.end(), Separator);
479 }
480 
481 /// Joins the strings in the parameter pack \p Items, adding \p Separator
482 /// between the elements.  All arguments must be implicitly convertible to
483 /// std::string, or there should be an overload of std::string::operator+=()
484 /// that accepts the argument explicitly.
485 template <typename Sep, typename... Args>
486 inline std::string join_items(Sep Separator, Args &&... Items) {
487   std::string Result;
488   if (sizeof...(Items) == 0)
489     return Result;
490 
491   size_t NS = detail::join_one_item_size(Separator);
492   size_t NI = detail::join_items_size(std::forward<Args>(Items)...);
493   Result.reserve(NI + (sizeof...(Items) - 1) * NS + 1);
494   detail::join_items_impl(Result, Separator, std::forward<Args>(Items)...);
495   return Result;
496 }
497 
498 /// A helper class to return the specified delimiter string after the first
499 /// invocation of operator StringRef().  Used to generate a comma-separated
500 /// list from a loop like so:
501 ///
502 /// \code
503 ///   ListSeparator LS;
504 ///   for (auto &I : C)
505 ///     OS << LS << I.getName();
506 /// \end
507 class ListSeparator {
508   bool First = true;
509   StringRef Separator;
510 
511 public:
512   ListSeparator(StringRef Separator = ", ") : Separator(Separator) {}
513   operator StringRef() {
514     if (First) {
515       First = false;
516       return {};
517     }
518     return Separator;
519   }
520 };
521 
522 /// A forward iterator over partitions of string over a separator.
523 class SplittingIterator
524     : public iterator_facade_base<SplittingIterator, std::forward_iterator_tag,
525                                   StringRef> {
526   char SeparatorStorage;
527   StringRef Current;
528   StringRef Next;
529   StringRef Separator;
530 
531 public:
532   SplittingIterator(StringRef Str, StringRef Separator)
533       : Next(Str), Separator(Separator) {
534     ++*this;
535   }
536 
537   SplittingIterator(StringRef Str, char Separator)
538       : SeparatorStorage(Separator), Next(Str),
539         Separator(&SeparatorStorage, 1) {
540     ++*this;
541   }
542 
543   SplittingIterator(const SplittingIterator &R)
544       : SeparatorStorage(R.SeparatorStorage), Current(R.Current), Next(R.Next),
545         Separator(R.Separator) {
546     if (R.Separator.data() == &R.SeparatorStorage)
547       Separator = StringRef(&SeparatorStorage, 1);
548   }
549 
550   SplittingIterator &operator=(const SplittingIterator &R) {
551     if (this == &R)
552       return *this;
553 
554     SeparatorStorage = R.SeparatorStorage;
555     Current = R.Current;
556     Next = R.Next;
557     Separator = R.Separator;
558     if (R.Separator.data() == &R.SeparatorStorage)
559       Separator = StringRef(&SeparatorStorage, 1);
560     return *this;
561   }
562 
563   bool operator==(const SplittingIterator &R) const {
564     assert(Separator == R.Separator);
565     return Current.data() == R.Current.data();
566   }
567 
568   const StringRef &operator*() const { return Current; }
569 
570   StringRef &operator*() { return Current; }
571 
572   SplittingIterator &operator++() {
573     std::tie(Current, Next) = Next.split(Separator);
574     return *this;
575   }
576 };
577 
578 /// Split the specified string over a separator and return a range-compatible
579 /// iterable over its partitions.  Used to permit conveniently iterating
580 /// over separated strings like so:
581 ///
582 /// \code
583 ///   for (StringRef x : llvm::split("foo,bar,baz", ","))
584 ///     ...;
585 /// \end
586 ///
587 /// Note that the passed string must remain valid throuhgout lifetime
588 /// of the iterators.
589 inline iterator_range<SplittingIterator> split(StringRef Str, StringRef Separator) {
590   return {SplittingIterator(Str, Separator),
591           SplittingIterator(StringRef(), Separator)};
592 }
593 
594 inline iterator_range<SplittingIterator> split(StringRef Str, char Separator) {
595   return {SplittingIterator(Str, Separator),
596           SplittingIterator(StringRef(), Separator)};
597 }
598 
599 } // end namespace llvm
600 
601 #endif // LLVM_ADT_STRINGEXTRAS_H
602