1 // madronalib: a C++ framework for DSP applications.
2 // Copyright (c) 2020 Madrona Labs LLC. http://www.madronalabs.com
3 // Distributed under the MIT license: http://madrona-labs.mit-license.org/
4 
5 #include "MLTextUtils.h"
6 
7 #include <cstring>
8 
9 #include "MLDSPScalarMath.h"
10 #include "MLMemoryUtils.h"
11 #include "aes256.h"
12 #include "utf.hpp"
13 
14 namespace ml
15 {
16 namespace textUtils
17 {
// Sentinel index returned by the find functions in this file when nothing matches.
static const int npos = -1;
19 
isDigit(CodePoint c)20 bool isDigit(CodePoint c)
21 {
22   if (c >= '0' && c <= '9') return true;
23   return false;
24 }
isASCII(CodePoint c)25 bool isASCII(CodePoint c) { return (c < 0x7f); }
26 
isLatin(CodePoint c)27 bool isLatin(CodePoint c)
28 {
29   // includes Latin-1 Supplement
30   return (c <= 0xFF);
31 }
32 
isWhitespace(CodePoint ch)33 bool isWhitespace(CodePoint ch)
34 {
35   return (ch >= 0x0009 && ch <= 0x000D) || ch == 0x0020 || ch == 0x0085 || ch == 0x00A0 ||
36          ch == 0x1680 || (ch >= 0x2000 && ch <= 0x200A) || ch == 0x2028 || ch == 0x2029 ||
37          ch == 0x202F || ch == 0x205F || ch == 0x3000;
38 }
39 
isCJK(CodePoint ch)40 bool isCJK(CodePoint ch)
41 {
42   return (ch >= 0x4E00 && ch <= 0x9FBF)      // CJK Unified Ideographs
43          || (ch >= 0x2E80 && ch <= 0x2FDF)   // CJK Radicals Supplement & Kangxi Radicals
44          || (ch >= 0x2FF0 && ch <= 0x30FF)   // Ideographic Description Characters, CJK Symbols
45                                              // and Punctuation & Japanese
46          || (ch >= 0x3100 && ch <= 0x31BF)   // Korean
47          || (ch >= 0xAC00 && ch <= 0xD7AF)   // Hangul Syllables
48          || (ch >= 0xF900 && ch <= 0xFAFF)   // CJK Compatibility Ideographs
49          || (ch >= 0xFE30 && ch <= 0xFE4F)   // CJK Compatibility Forms
50          || (ch >= 0x31C0 && ch <= 0x4DFF);  // Other exiensions
51 }
52 
// Parse the run of ASCII decimal digits at the start of a null-terminated
// char32_t string and return its value.
//
// p: null-terminated string of code points; may be null.
//
// Returns 0 if p is null or does not start with a digit. Parsing stops at the
// first non-digit. Returns -1 if the run reaches kMaxDigits digits or if its
// value does not fit in an int.
int digitsToNaturalNumber(const char32_t* p)
{
  constexpr int kMaxDigits = 16;
  constexpr int kMaxValue = std::numeric_limits<int>::max();

  if (!p) return 0;

  int value = 0;
  int count = 0;
  while (p[count])
  {
    // compare the full code point: narrowing it to char would make
    // code points such as U+0131 look like the digit '1'.
    const char32_t c = p[count];
    if ((c < U'0') || (c > U'9')) break;
    const int digit = static_cast<int>(c - U'0');

    // refuse to overflow: signed overflow is undefined behavior.
    if (value > (kMaxValue - digit) / 10) return -1;
    value = (value * 10) + digit;

    if (++count >= kMaxDigits) return -1;
  }
  return value;
}
76 
textToNaturalNumber(const TextFragment & frag)77 int textToNaturalNumber(const TextFragment& frag)
78 {
79   std::vector<CodePoint> vec = textToCodePoints(frag);
80   return digitsToNaturalNumber(vec.data());
81 }
82 
naturalNumberToText(int i)83 TextFragment naturalNumberToText(int i)
84 {
85   constexpr int kMaxDigits = 16;
86 
87   char buf[kMaxDigits]{};
88   char* p = buf + kMaxDigits - 1;
89   char* end = p;
90 
91   // null-terminate the string
92   *end = 0;
93 
94   // work backwards
95   do
96   {
97     p--;
98     if (p < buf) return "overflow";
99     *p = '0' + (i % 10);
100     i /= 10;
101   } while (i != 0);
102   return (TextFragment(p, end - p));
103 }
104 
105 // numeric
106 
// Convert a float to text without using the C library formatting functions.
//
// f: the value to convert.
// precision: number of significant decimal places used to decide when to stop
//   emitting digits; clamped from above to kMaxPrecision.
//
// Output forms produced by the code below:
//   - NaN gives "nan".
//   - negative values get a leading '-'.
//   - magnitudes greater than 1e38 give "inf".
//   - magnitudes smaller than 1e-38 give "0.".
//   - magnitudes whose decimal exponent is within +/- (kScientificStart - 1)
//     are written in plain decimal notation.
//   - everything else is written as mantissa, 'e', sign and a two-digit exponent.
//
// NOTE(review): buf holds kMaxDigits chars and the writes through writePtr are
// not bounds-checked; confirm that the digit loops cannot emit more than that.
// NOTE(review): precision is not clamped from below, so a very negative value
// would index past the end of powersOfTen when computing epsilon.
TextFragment floatNumberToText(float f, int precision)
{
  // const float maxFloat = std::numeric_limits<float>::max();
  constexpr int kMaxPrecision = 10;
  constexpr int kScientificStart = 5;
  constexpr int kMaxDigits = 32;
  constexpr int kTableZeroOffset = 38;
  // powersOfTen[kTableZeroOffset + n] == 10^n for n in [-38, 38].
  constexpr float powersOfTen[kTableZeroOffset * 2 + 1]{
      1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29, 1e-28, 1e-27, 1e-26,
      1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14, 1e-13,
      1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 1e-02, 1e-01, 1e+00,
      1e+01, 1e+02, 1e+03, 1e+04, 1e+05, 1e+06, 1e+07, 1e+08, 1e+09, 1e+10, 1e+11, 1e+12, 1e+13,
      1e+14, 1e+15, 1e+16, 1e+17, 1e+18, 1e+19, 1e+20, 1e+21, 1e+22, 1e+23, 1e+24, 1e+25, 1e+26,
      1e+27, 1e+28, 1e+29, 1e+30, 1e+31, 1e+32, 1e+33, 1e+34, 1e+35, 1e+36, 1e+37, 1e+38};

  char buf[kMaxDigits];
  char* writePtr = buf;
  float value = f;
  const int p = std::min(precision, kMaxPrecision);
  // epsilon is |f| * 10^-p, but never less than the smallest normal float.
  // The digit loops below stop once the remaining value drops to epsilon or less.
  const float epsilon =
      std::max((float)fabs(f * powersOfTen[kTableZeroOffset - p]), std::numeric_limits<float>::min());

  if (std::isnan(f))
  {
    *writePtr++ = 'n';
    *writePtr++ = 'a';
    *writePtr++ = 'n';
  }
  else
  {
    // from here on, value holds the magnitude; the sign has been written.
    if (value < 0)
    {
      value = -value;
      *writePtr++ = '-';
    }

    // larger than the biggest table entry (1e38): report as infinite
    if (value > powersOfTen[kTableZeroOffset * 2])
    {
      *writePtr++ = 'i';
      *writePtr++ = 'n';
      *writePtr++ = 'f';
    }
    // smaller than the smallest table entry (1e-38): report as zero
    else if (value < powersOfTen[0])
    {
      *writePtr++ = '0';
      *writePtr++ = '.';
    }
    else
    {
      // get the exponent using linear search, starting from center
      int y = kTableZeroOffset;
      while (value > powersOfTen[y])
      {
        y++;
      }
      while (value < powersOfTen[y])
      {
        y--;
      }
      // after both loops, powersOfTen[y] is the largest table entry <= value
      int exponent = y - kTableZeroOffset;
      int absExponent = std::abs(exponent);

      if (absExponent < kScientificStart)
      // write in decimal notation
      {
        // first write any leading zeroes
        if (exponent < -1)
        {
          *writePtr++ = '0';
          *writePtr++ = '.';
          int zeroes = -exponent - 1;
          for (int i = 0; i < zeroes; ++i)
          {
            *writePtr++ = '0';
          }
        }
        else if (exponent == -1)
        {
          *writePtr++ = '0';
        }

        // then write nonzero digits
        do
        {
          // the decimal point goes just before the tenths digit
          if (exponent == -1)
          {
            *writePtr++ = '.';
          }
          // extract the digit at the current power of ten, then remove it from value
          int onesInt = truncf(value * powersOfTen[kTableZeroOffset - exponent]);
          *writePtr++ = '0' + onesInt;
          value = value - onesInt * powersOfTen[kTableZeroOffset + exponent];
          exponent--;
          // keep going until the remainder is negligible AND all integer digits are out
        } while ((value > epsilon) || (exponent >= 0));
      }
      else
      // write in scientific notation
      {
        const char exponentSign = exponent >= 0 ? '+' : '-';

        // write mantissa
        int onesInt = value * powersOfTen[kTableZeroOffset - exponent];
        *writePtr++ = '0' + onesInt;
        *writePtr++ = '.';
        // emit fractional mantissa digits until the remainder is negligible
        while (value > epsilon)
        {
          value = value - onesInt * powersOfTen[kTableZeroOffset + exponent];
          exponent--;
          onesInt = value * powersOfTen[kTableZeroOffset - exponent];
          *writePtr++ = '0' + onesInt;
        }

        // write exponent
        // always two digits; absExponent is at most 38 here
        *writePtr++ = 'e';
        *writePtr++ = exponentSign;
        *writePtr++ = '0' + absExponent / 10;
        *writePtr++ = '0' + absExponent % 10;
      }
    }
  }
  return TextFragment(buf, writePtr - buf);
}
228 
fragmentContainsCodePoint(TextFragment f,CodePoint cp)229 bool fragmentContainsCodePoint(TextFragment f, CodePoint cp)
230 {
231   for (const CodePoint c : f)
232   {
233     if (c == cp) return true;
234   }
235   return false;
236 }
237 
textToFloatNumber(const TextFragment & frag)238 float textToFloatNumber(const TextFragment& frag)
239 {
240   float sign = 1;
241   float wholePart = 0, fracPart = 0, fracPlace = 1;
242   float exponentSign = 1, exponent = 0;
243   bool hasExp = false;
244   auto it = frag.begin();
245   const TextFragment digits{"0123456789"};
246   std::vector<std::pair<TextFragment, std::function<void()> > > segments{
247       {"NaN", [&]() { wholePart = std::numeric_limits<float>::quiet_NaN(); }},
248       {"-", [&]() { sign = -sign; }},
249       {"inf", [&]() { wholePart = std::numeric_limits<float>::infinity(); }},
250       {digits, [&]() { wholePart = wholePart * 10.0f + ((*it) - '0'); }},
251       {".", [&]() {}},
252       {digits, [&]() { fracPart += ((*it) - '0') * (fracPlace *= 0.1f); }},
253       {"e+", [&]() { hasExp = true; }},
254       {"-", [&]() { exponentSign = -exponentSign; }},
255       {digits, [&]() { exponent = exponent * 10.0f + ((*it) - '0'); }}};
256 
257   for (auto segment : segments)
258   {
259     while (fragmentContainsCodePoint(segment.first, *it))
260     {
261       segment.second();
262       ++it;
263     }
264   }
265 
266   float base = sign * (wholePart + fracPart);
267   return hasExp ? base * powf(10.f, exponent * exponentSign) : base;
268 }
269 
findFirst(const TextFragment & frag,const CodePoint b)270 int findFirst(const TextFragment& frag, const CodePoint b)
271 {
272   int r = npos;
273   if (!frag) return r;
274   int i = 0;
275   for (const CodePoint c : frag)
276   {
277     if (!validateCodePoint(c)) return r;
278     if (c == b)
279     {
280       r = i;
281       break;
282     }
283     i++;
284   }
285   return r;
286 }
287 
findLast(const TextFragment & frag,const CodePoint b)288 int findLast(const TextFragment& frag, const CodePoint b)
289 {
290   int r = npos;
291   if (!frag) return r;
292   int i = 0;
293   for (const CodePoint c : frag)
294   {
295     if (!validateCodePoint(c)) return r;
296     if (c == b)
297     {
298       r = i;
299     }
300     i++;
301   }
302   return r;
303 }
304 
findFirst(const TextFragment & frag,std::function<bool (CodePoint)> matchFn)305 int findFirst(const TextFragment& frag, std::function<bool(CodePoint)> matchFn)
306 {
307   int r = npos;
308   if (!frag) return r;
309   int i = 0;
310   for (const CodePoint c : frag)
311   {
312     if (!validateCodePoint(c)) return r;
313     if (matchFn(c))
314     {
315       r = i;
316       break;
317     }
318     i++;
319   }
320   return r;
321 }
322 
323 // TODO dumb, have to call matchFn on each code point because we have no reverse
324 // iterator
findLast(const TextFragment & frag,std::function<bool (CodePoint)> matchFn)325 int findLast(const TextFragment& frag, std::function<bool(CodePoint)> matchFn)
326 {
327   int r = npos;
328   if (!frag) return r;
329   int i = 0;
330   for (const CodePoint c : frag)
331   {
332     if (!validateCodePoint(c)) return r;
333     if (matchFn(c))
334     {
335       r = i;
336     }
337     i++;
338   }
339   return r;
340 }
341 
// Return the code points of frag in the half-open range [start, end) as a new
// TextFragment.
//
// start, end: positions counted in code points, not bytes.
//
// Returns an empty fragment if frag is empty, if start >= end, or if an invalid
// code point is found inside the requested range. If the range extends past the
// end of frag, it is clipped to the text that exists.
TextFragment subText(const TextFragment& frag, size_t start, size_t end)
{
  // this impl does an unnecessary copy, to keep TextFragment very simple for
  // now.
  if (!frag) return TextFragment();
  if (start >= end) return TextFragment();

  // temp buffer big enough to hold whole input fragment if needed.
  // we won't know the output fragment size in bytes until iterating the code
  // points.
  const size_t len = frag.lengthInBytes();
  SmallStackBuffer<char, kShortFragmentSizeInChars> temp(len);
  char* buf = temp.data();
  char* pb = buf;

  auto it = frag.begin();
  auto last = frag.end();

  // skip to the start position, never stepping beyond the end of the text
  size_t index = 0;
  for (; (index < start) && (it != last); ++index)
  {
    ++it;
  }

  // copy code points until the end position or the end of the text. Stopping at
  // the end of the text also guarantees the output fits in the temp buffer.
  for (; (index < end) && (it != last); ++index)
  {
    // write the codepoint as UTF-8 to the buffer
    if (!validateCodePoint(*it)) return TextFragment();
    pb = utf::internal::utf_traits<utf::utf8>::encode(*it, pb);
    ++it;
  }

  return TextFragment(buf, pb - buf);
}
374 
// Apply f to every code point of frag and return the resulting text.
// Returns an empty fragment if frag is empty.
TextFragment map(const TextFragment& frag, std::function<CodePoint(CodePoint)> f)
{
  if (!frag) return TextFragment();

  std::vector<CodePoint> points = textToCodePoints(frag);
  for (CodePoint& c : points)
  {
    c = f(c);
  }
  return codePointsToText(points);
}
382 
// Return a copy of frag that keeps only the code points for which matchFn
// returns true. Returns an empty fragment if frag is empty or contains an
// invalid code point.
TextFragment reduce(const TextFragment& frag, std::function<bool(CodePoint)> matchFn)
{
  if (!frag) return TextFragment();

  // the output can never be longer than the input, so size the scratch buffer to the input.
  SmallStackBuffer<char, kShortFragmentSizeInChars> scratch(frag.lengthInBytes());
  char* const outStart = scratch.data();
  char* outPos = outStart;

  for (const CodePoint c : frag)
  {
    if (!validateCodePoint(c)) return TextFragment();
    if (!matchFn(c)) continue;

    // re-encode the kept code point as UTF-8
    outPos = utf::internal::utf_traits<utf::utf8>::encode(c, outPos);
  }

  return TextFragment(outStart, outPos - outStart);
}
402 
split(TextFragment frag,CodePoint delimiter)403 std::vector<TextFragment> split(TextFragment frag, CodePoint delimiter)
404 {
405   std::vector<TextFragment> output;
406   int start = 0;
407   int end = 0;
408   int pieceLen = 0;
409   for (const CodePoint c : frag)
410   {
411     if (!validateCodePoint(c)) return std::vector<TextFragment>();
412     pieceLen++;
413     end++;
414     if (c == delimiter)
415     {
416       if (pieceLen > 1)
417       {
418         output.push_back(subText(frag, start, end - 1));
419       }
420       start = end;
421       pieceLen = 0;
422     }
423   }
424   if (pieceLen > 0)
425   {
426     output.push_back(subText(frag, start, end));
427   }
428   return output;
429 }
430 
join(const std::vector<TextFragment> & vec)431 TextFragment join(const std::vector<TextFragment>& vec)
432 {
433   TextFragment sum;
434   size_t len = vec.size();
435   for (int i = 0; i < len; ++i)
436   {
437     TextFragment frag = vec[i];
438     sum = TextFragment(sum, vec[i]);
439   }
440   return sum;
441 }
442 
join(const std::vector<TextFragment> & vec,CodePoint delimiter)443 TextFragment join(const std::vector<TextFragment>& vec, CodePoint delimiter)
444 {
445   TextFragment delimFrag(delimiter);
446   TextFragment sum;
447   size_t len = vec.size();
448   for (int i = 0; i < len; ++i)
449   {
450     TextFragment frag = vec[i];
451     sum = TextFragment(sum, vec[i]);
452     if ((i >= 0) && (i < len - 1))
453     {
454       sum = TextFragment(sum, delimFrag);
455     }
456   }
457   return sum;
458 }
459 
// Return frag with its final '.' and everything after it removed.
// If frag has no '.', or if the last '.' comes before the last '/', so that it
// belongs to a directory name rather than the file name, frag is returned
// unchanged.
TextFragment stripFileExtension(const TextFragment& frag)
{
  const int dotLoc = findLast(frag, '.');
  if (dotLoc < 0) return frag;

  // a dot inside a directory name ("my.dir/file") is not a file extension.
  const int slashLoc = findLast(frag, '/');
  if (slashLoc > dotLoc) return frag;

  return subText(frag, 0, dotLoc);
}
469 
// Return the part of frag that follows its last '/'.
// If frag contains no '/', it is returned unchanged.
TextFragment getShortFileName(const TextFragment& frag)
{
  const int slashLoc = findLast(frag, '/');
  if (slashLoc < 0) return frag;
  return subText(frag, slashLoc + 1, frag.lengthInCodePoints());
}
479 
// Return the part of frag that precedes its last '/', not including the slash.
// If frag contains no '/', it is returned unchanged.
TextFragment getPath(const TextFragment& frag)
{
  const int slashLoc = findLast(frag, '/');
  if (slashLoc < 0) return frag;
  return subText(frag, 0, slashLoc);
}
489 
490 // TODO extend to recognize Cyrillic and other scripts
bestScriptForTextFragment(const TextFragment & frag)491 Symbol bestScriptForTextFragment(const TextFragment& frag)
492 {
493   for (const CodePoint c : frag)
494   {
495     if (!validateCodePoint(c)) return "unknown";
496     // if there are any CJK characters, return CJK
497     if (isCJK(c))
498     {
499       return "cjk";
500     }
501     else if (!isLatin(c))
502     {
503       return "unknown";
504     }
505   }
506   return "latin";
507 }
508 
// Base64 alphabet: indices 0-63 are the encoding characters. The padding
// character '=' is stored at index 64, so that base64Decode() can recognize
// padding as an index that is not less than 64.
static const char base64table[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
// Returns the index of the first occurrence of c in the null-terminated string
// str, or -1 if c does not occur. The terminating null itself is never matched.
int indexOf(const char* str, char c)
{
  for (const char* p = str; *p != 0; ++p)
  {
    if (*p == c) return (int)(p - str);
  }
  return -1;
}
525 
base64Encode(const std::vector<uint8_t> & in)526 TextFragment base64Encode(const std::vector<uint8_t>& in)
527 {
528   size_t len = in.size();
529   std::vector<char> out;
530   int b;
531   for (size_t i = 0; i < len; i += 3)
532   {
533     b = (in[i] & 0xFC) >> 2;
534     out.push_back(base64table[b]);
535     b = (in[i] & 0x03) << 4;
536     if (i + 1 < len)
537     {
538       b |= (in[i + 1] & 0xF0) >> 4;
539       out.push_back(base64table[b]);
540       b = (in[i + 1] & 0x0F) << 2;
541       if (i + 2 < len)
542       {
543         b |= (in[i + 2] & 0xC0) >> 6;
544         out.push_back(base64table[b]);
545         b = in[i + 2] & 0x3F;
546         out.push_back(base64table[b]);
547       }
548       else
549       {
550         out.push_back(base64table[b]);
551         out.push_back('=');
552       }
553     }
554     else
555     {
556       out.push_back(base64table[b]);
557       out.push_back('=');
558       out.push_back('=');
559     }
560   }
561   out.push_back(0);
562   return TextFragment(out.data());
563 }
564 
base64Decode(const TextFragment & frag)565 std::vector<uint8_t> base64Decode(const TextFragment& frag)
566 {
567   size_t len = frag.lengthInBytes();
568   if (len % 4) return std::vector<uint8_t>();
569   std::vector<uint8_t> decoded;
570   const char* inChars = frag.getText();
571   int b[4];
572   for (int i = 0; i < len; i += 4)
573   {
574     for (int j = 0; j < 4; ++j)
575     {
576       b[j] = indexOf(base64table, inChars[i + j]);
577     }
578     decoded.push_back((b[0] << 2) | (b[1] >> 4));
579     if (b[2] < 64)
580     {
581       decoded.push_back((b[1] << 4) | (b[2] >> 2));
582       if (b[3] < 64)
583       {
584         decoded.push_back((b[2] << 6) | b[3]);
585       }
586     }
587   }
588   return decoded;
589 }
590 
// Return frag with leading and trailing whitespace removed.
// Returns an empty fragment if frag contains no non-whitespace code point.
TextFragment stripWhitespaceAtEnds(const TextFragment& frag)
{
  const std::function<bool(CodePoint)> isNotWhitespace = [](CodePoint c)
  { return !isWhitespace(c); };

  const int firstKept = findFirst(frag, isNotWhitespace);
  if (firstKept == npos) return TextFragment();

  const int lastKept = findLast(frag, isNotWhitespace);
  if (lastKept == npos) return TextFragment();

  return subText(frag, firstKept, lastKept + 1);
}
599 
// Return frag with every whitespace code point removed, wherever it occurs.
TextFragment stripAllWhitespace(const TextFragment& frag)
{
  const std::function<bool(CodePoint)> keepNonWhitespace = [](CodePoint c)
  { return !isWhitespace(c); };
  return reduce(frag, keepNonWhitespace);
}
605 
AES256CBCEncode(const std::vector<uint8_t> & input,const std::vector<uint8_t> & key,const std::vector<uint8_t> & iv)606 std::vector<uint8_t> AES256CBCEncode(const std::vector<uint8_t>& input,
607                                      const std::vector<uint8_t>& key,
608                                      const std::vector<uint8_t>& iv)
609 {
610   if (!(input.size() > 0) || !(key.size() == 32) || !(iv.size() == 32))
611     return std::vector<uint8_t>();
612 
613   aes256_context ctx;
614   aes256_init(&ctx, key.data());
615 
616   const int blockSize = 16;
617   size_t inputSize = input.size();
618   size_t blocks = inputSize / blockSize + 1;
619   size_t paddedSize = blockSize * (blocks);
620 
621   // add PKCS padding
622   std::vector<uint8_t> plaintext = input;
623   plaintext.resize(paddedSize);
624   size_t padBytes = paddedSize - inputSize;
625   for (size_t i = inputSize; i < paddedSize; ++i)
626   {
627     plaintext[i] = padBytes;
628   }
629 
630   std::vector<uint8_t> ciphertext(paddedSize);
631   uint8_t currentIV[blockSize];
632   uint8_t workVector[blockSize];
633 
634   for (size_t i = 0; i < blockSize; ++i)
635   {
636     currentIV[i] = iv[i];
637   }
638 
639   for (size_t b = 0; b < blocks; ++b)
640   {
641     // get plaintext XOR IV
642     for (size_t i = 0; i < blockSize; ++i)
643     {
644       workVector[i] = plaintext[b * blockSize + i] ^ currentIV[i];
645     }
646 
647     aes256_encrypt_ecb(&ctx, workVector);
648 
649     // write to ciphertext, get new IV
650     for (size_t i = 0; i < blockSize; ++i)
651     {
652       ciphertext[b * blockSize + i] = workVector[i];
653       currentIV[i] = workVector[i];
654     }
655   }
656 
657   aes256_done(&ctx);
658   return ciphertext;
659 }
660 
AES256CBCDecode(const std::vector<uint8_t> & cipher,const std::vector<uint8_t> & key,const std::vector<uint8_t> & iv)661 std::vector<uint8_t> AES256CBCDecode(const std::vector<uint8_t>& cipher,
662                                      const std::vector<uint8_t>& key,
663                                      const std::vector<uint8_t>& iv)
664 {
665   if (!(cipher.size() > 0) || (key.size() < 32) || (iv.size() < 32)) return std::vector<uint8_t>();
666 
667   aes256_context ctx;
668   aes256_init(&ctx, key.data());
669 
670   const int blockSize = 16;
671   size_t blocks = cipher.size() / blockSize;
672 
673   std::vector<uint8_t> plaintext(blockSize * blocks);
674 
675   uint8_t currentIV[blockSize];
676   uint8_t nextIV[blockSize];
677   uint8_t workVector[blockSize];
678 
679   for (int i = 0; i < blockSize; ++i)
680   {
681     currentIV[i] = iv[i];
682   }
683 
684   for (int b = 0; b < blocks; ++b)
685   {
686     // get next cipher block and use ciphertext as next IV
687     for (int i = 0; i < blockSize; ++i)
688     {
689       workVector[i] = cipher[b * blockSize + i];
690       nextIV[i] = workVector[i];
691     }
692 
693     aes256_decrypt_ecb(&ctx, workVector);
694 
695     // write to plaintext, XOR work vector with IV
696     for (int i = 0; i < blockSize; ++i)
697     {
698       workVector[i] ^= currentIV[i];
699       plaintext[b * blockSize + i] = workVector[i];
700       currentIV[i] = nextIV[i];
701     }
702   }
703 
704   aes256_done(&ctx);
705 
706   // remove PKCS padding
707   size_t paddedSize = plaintext.size();
708   if (paddedSize % blockSize == 0)
709   {
710     int padBytes = plaintext[paddedSize - 1];
711     if ((padBytes <= 16) && (padBytes < paddedSize))
712     {
713       plaintext.resize(paddedSize - padBytes);
714     }
715   }
716 
717   return plaintext;
718 }
719 
collate(const TextFragment & a,const TextFragment & b)720 bool collate(const TextFragment& a, const TextFragment& b)
721 {
722   auto ia = a.begin();
723   auto ib = b.begin();
724 
725   int iterEnds = 0;
726   while (!iterEnds)
727   {
728     CodePoint ca = *ia;
729     CodePoint cb = *ib;
730 
731     if (!validateCodePoint(ca)) return false;
732     if (!validateCodePoint(cb)) return false;
733 
734     if (ca != cb)
735     {
736       // the code points differ.
737       // compare codepoints, produce a result and bail.
738       if (isLatin(ca) && isLatin(cb))
739       {
740         char la = tolower(ca);
741         char lb = tolower(cb);
742 
743         if (la != lb)
744         {
745           // different letters
746           return la < lb;
747         }
748         else
749         {
750           // different cases but same letter. define lower case as less within
751           // letter.
752           return ca > cb;
753         }
754       }
755       else
756       {
757         // TODO collate other languages better using miniutf library.
758         return ca < cb;
759       }
760     }
761     else
762     {
763       ++ia;
764       ++ib;
765     }
766 
767     int aEnd = (ia == a.end());
768     int bEnd = (ib == b.end());
769     iterEnds = (aEnd << 1) | bEnd;
770   }
771 
772   switch (iterEnds)
773   {
774     case 1:  // b ended but not a: a > b.
775       return false;
776     case 2:  // a ended but not b: a < b.
777       return true;
778     case 3:   // both ended, a == b.
779     default:  // impossible
780       return false;
781   }
782   return false;
783 }
784 
785 #pragma mark Symbol utilities
786 
addFinalNumber(Symbol sym,int n)787 Symbol addFinalNumber(Symbol sym, int n)
788 {
789   TextFragment t(sym.getTextFragment(), textUtils::naturalNumberToText(n));
790   return Symbol(t.getText());
791 }
792 
stripFinalNumber(Symbol sym)793 Symbol stripFinalNumber(Symbol sym)
794 {
795   const TextFragment& frag = sym.getTextFragment();
796   size_t points = frag.lengthInCodePoints();
797 
798   // TODO make more readble using random access fragment class
799 
800   SmallStackBuffer<CodePoint, kShortFragmentSizeInCodePoints> temp(points + 1);
801   CodePoint* buf = temp.data();
802 
803   // read into char32 array for random access
804   int i = 0;
805   for (CodePoint c : frag)
806   {
807     if (!validateCodePoint(c)) return Symbol();
808     buf[i++] = c;
809   }
810 
811   // null terminate
812   buf[points] = 0;
813 
814   // no final number? return
815   if (!textUtils::isDigit(buf[points - 1])) return sym;
816 
817   // read backwards until non-digit
818   size_t firstDigitPos = 0;
819   for (size_t i = points - 2; i >= 0; --i)
820   {
821     char32_t c = buf[i];
822     if (!textUtils::isDigit(c))
823     {
824       firstDigitPos = i + 1;
825       break;
826     }
827   }
828 
829   ml::TextFragment subFrag(textUtils::subText(frag, 0, firstDigitPos));
830   return subFrag.getText();
831 }
832 
833 // if the symbol's text ends in a positive integer, return that number.
834 // Otherwise return 0.
getFinalNumber(Symbol sym)835 int getFinalNumber(Symbol sym)
836 {
837   // make temporary buffer of decoded code points, hopefully on stack
838   const TextFragment& frag = sym.getTextFragment();
839   size_t points = frag.lengthInCodePoints();
840 
841   // TODO make more readble using random access fragment class
842 
843   SmallStackBuffer<CodePoint, kShortFragmentSizeInCodePoints> decodedPoints(points + 1);
844   CodePoint* buf = decodedPoints.data();
845 
846   // read into char32 array for random access
847   int i = 0;
848   for (CodePoint c : frag)
849   {
850     if (!validateCodePoint(c)) return 0;
851     buf[i++] = c;
852   }
853 
854   // null terminate char32_t string
855   buf[i] = 0;
856 
857   // no final number? return
858   if (!textUtils::isDigit(buf[i - 1])) return 0;
859 
860   // read backwards until non-digit
861   int firstDigitPos = 0;
862   for (i--; i >= 0; --i)
863   {
864     char32_t c = buf[i];
865     if (!textUtils::isDigit(c))
866     {
867       firstDigitPos = i + 1;
868       break;
869     }
870   }
871 
872   // note, null terminated char32_t string needed
873   int r = digitsToNaturalNumber(buf + firstDigitPos);
874   return r;
875 }
876 
stripFinalCharacter(Symbol sym)877 Symbol stripFinalCharacter(Symbol sym)
878 {
879   TextFragment frag = sym.getTextFragment();
880   size_t len = frag.lengthInCodePoints();
881   return Symbol(subText(frag, 0, len - 1));
882 }
883 
884 #pragma mark NameMaker
885 
886 // base-26 arithmetic with letters (A = 0) produces A, B, ... Z, BA, BB ...
nextName()887 const TextFragment NameMaker::nextName()
888 {
889   std::vector<int> digits;
890   const int base = 26;
891   const char baseChar = 'A';
892   int a, m, d, rem;
893 
894   a = index++;
895 
896   if (!a)
897   {
898     digits.push_back(0);
899   }
900   else
901     while (a)
902     {
903       d = a / base;
904       m = d * base;
905       rem = a - m;
906       digits.push_back(rem);
907       a = d;
908     }
909 
910   int c = 0;
911   while (digits.size() && (c < maxLen - 1))
912   {
913     d = digits.back();
914     digits.pop_back();
915 
916     buf[c++] = static_cast<char>(d) + baseChar;
917   }
918 
919   buf[c++] = 0;
920   return TextFragment(buf);
921 }
922 
// Small deterministic pseudo-random generator: a 32-bit linear congruential
// sequence that always starts from a seed of zero.
class NoiseGen
{
 public:
  NoiseGen() = default;
  ~NoiseGen() = default;

  // Return the generator to its initial state.
  void reset() { mSeed = 0; }

  // Advance the sequence by one step. Arithmetic wraps modulo 2^32.
  inline void step() { mSeed = mSeed * 0x0019660D + 0x3C6EF35F; }

  // Advance the sequence and return the new 32-bit value.
  inline uint32_t getIntSample()
  {
    step();
    return mSeed;
  }

 private:
  uint32_t mSeed{0};
};
942 
943 static const char kLetters[33] = "aabcdeefghijklmnnoopqrssttuvwxyz";
vectorOfNonsenseSymbols(int len)944 std::vector<Symbol> vectorOfNonsenseSymbols(int len)
945 {
946   NoiseGen randSource;
947   std::vector<Symbol> words;
948   for (int i = 0; i < len; ++i)
949   {
950     std::string newStr;
951     uint32_t r32 = randSource.getIntSample() >> 16;
952     int wordLen = (r32 & 7) + 3;
953 
954     for (int j = 0; j < wordLen; ++j)
955     {
956       r32 = randSource.getIntSample() >> 16;
957       int idx = (r32 & 31);
958       newStr += (kLetters[idx]);
959     }
960     words.push_back(Symbol(newStr.c_str()));
961   }
962   return words;
963 }
964 
// Format a number as display text.
//
// number: the value to format.
// digits: requested field width in characters.
// precision: requested number of digits after the decimal point.
// doSign: if true, the "default", "ratio" and "db" modes always show a sign.
// mode: selects the output style:
//   "default" - plain fixed-point number.
//   "ratio"   - "a/b" if the number is within 0.001 of a/b for some a in 1..8
//               and b in 1..4, otherwise as "default".
//   "pitch1"  - fixed-point number, followed by a newline and "A<octave>" when
//               the number is within 0.01 of 27.5 times a power of two.
//   "pitch2"  - fixed-point number, followed by a newline and a note name with
//               octave when the number is within 0.01 of a semitone step above 27.5.
//   "db"      - fixed-point number followed by "dB".
//   Any other mode leaves the buffer empty, so empty text is returned.
//
// NOTE(review): both the format string and the result are built in 16-byte
// buffers; snprintf truncates anything longer.
ml::Text formatNumber(const float number, const int digits, const int precision, const bool doSign,
                      Symbol mode) throw()
{
  const std::vector<ml::Text> pitchNames{"A",  "A#", "B", "C",  "C#", "D",
                                         "D#", "E",  "F", "F#", "G",  "G#"};

  const int bufLength = 16;
  char numBuf[bufLength] = {0};
  char format[bufLength] = {0};
  float tweakedNumber;

  // get digits to display
  // m: field width, widened if needed to fit the requested precision
  // d: estimate of the number of digits before the decimal point
  // p: digits after the decimal point, reduced so d + p fits in m, never negative
  int m = (precision > 0) ? std::max(digits, precision + 1) : digits;
  int d = ceil(log10f(fabs(number) + 1.));
  int p = (d + precision > m) ? m - d : precision;
  p = std::max(p, 0);

  //  printf("---------number: %-+10.2f\n", number);
  //  printf("---------number: %-+10.8f\n", number);
  //  printf("max: %d, digits: %d, after decimal: %d\n", m, d, p);

  // In every mode below the printf format is itself built with snprintf. The
  // leading 'X' is a placeholder that is replaced with '%' afterwards,
  // presumably so the outer snprintf does not interpret it.
  tweakedNumber = number;
  if (mode == "default")
  {
    if (doSign)
    {
      snprintf(format, bufLength, "X-+0%1d.%1df", m, p);
    }
    else
    {
      snprintf(format, bufLength, "X-0%1d.%1df", m, p);
    }
    format[0] = 37;  // '%'
    snprintf(numBuf, bufLength, format, tweakedNumber);
  }
  else if (mode == "ratio")
  {
    // look for a small-integer ratio a/b close to the number
    bool done = false;
    for (int a = 1; a <= 8 && !done; ++a)
    {
      for (int b = 1; b <= 4 && !done; ++b)
      {
        if (fabs(number - (float)a / (float)b) < 0.001)
        {
          snprintf(numBuf, bufLength, "%d/%d", a, b);
          done = true;
        }
      }
    }
    // no ratio matched: fall back to the plain number format
    if (!done)
    {
      if (doSign)
      {
        snprintf(format, bufLength, "X-+0%1d.%1df", m, p);
      }
      else
      {
        snprintf(format, bufLength, "X-0%1d.%1df", m, p);
      }
      format[0] = 37;  // '%'
      snprintf(numBuf, bufLength, format, tweakedNumber);
    }
  }
  else if (mode == "pitch1")  // just show As
  {
    // octave: whole number of doublings above 27.5. The divisor is slightly
    // below 27.5 so that exact multiples are not truncated to the octave below.
    int octave = log2(number / (27.5f - 0.01f));
    float quant = (pow(2.f, (float)octave) * 27.5f);
    float distFromOctave = fabs(number - quant);
    if (distFromOctave < 0.01)
    {
      snprintf(format, bufLength, "X-0%1d.%1df\nA%d", m, p, octave);
    }
    else
    {
      snprintf(format, bufLength, "X-0%1d.%1df", m, p);
    }
    format[0] = 37;  // '%'
    snprintf(numBuf, bufLength, format, tweakedNumber);
  }
  else if (mode == "pitch2")  // show all notes
  {
    // note: whole number of semitone steps above 27.5
    int note = log2f(number / (27.5f - 0.01f)) * 12.f;
    float quantizedNotePitch = (pow(2.f, (float)note / 12.f) * 27.5f);
    float distFromNote = fabs(number - quantizedNotePitch);
    if (distFromNote < 0.01)
    {
      // pitchNames starts at A, and C is three semitones above A, so the
      // octave number is counted from C.
      // NOTE(review): note % 12 is negative if note is negative; confirm that
      // cannot reach this branch.
      const int octaveFromC = (note - 3) / 12;
      snprintf(format, bufLength, "X-0%1d.%1df\n%s%d", m, p, pitchNames[note % 12].getText(),
               octaveFromC);
    }
    else
    {
      snprintf(format, bufLength, "X-0%1d.%1df", m, p);
    }
    format[0] = 37;  // '%'
    snprintf(numBuf, bufLength, format, tweakedNumber);
  }
  else if (mode == "db")
  {
    if (doSign)
    {
      snprintf(format, bufLength, "X-+0%1d.%1dfdB", m, p);
    }
    else
    {
      snprintf(format, bufLength, "X-0%1d.%1dfdB", m, p);
    }
    format[0] = 37;  // '%'
    snprintf(numBuf, bufLength, format, tweakedNumber);
  }

  return Text(numBuf);
}
1078 
1079 }  // namespace textUtils
1080 }  // namespace ml
1081