1 // madronalib: a C++ framework for DSP applications.
2 // Copyright (c) 2020 Madrona Labs LLC. http://www.madronalabs.com
3 // Distributed under the MIT license: http://madrona-labs.mit-license.org/
4
#include "MLTextUtils.h"

#include <cstring>
#include <limits>

#include "MLDSPScalarMath.h"
#include "MLMemoryUtils.h"
#include "aes256.h"
#include "utf.hpp"
13
14 namespace ml
15 {
16 namespace textUtils
17 {
// Sentinel index: the findFirst() / findLast() functions in this file start
// from this value and return it when no code point matches.
static const int npos = -1;
19
isDigit(CodePoint c)20 bool isDigit(CodePoint c)
21 {
22 if (c >= '0' && c <= '9') return true;
23 return false;
24 }
isASCII(CodePoint c)25 bool isASCII(CodePoint c) { return (c < 0x7f); }
26
isLatin(CodePoint c)27 bool isLatin(CodePoint c)
28 {
29 // includes Latin-1 Supplement
30 return (c <= 0xFF);
31 }
32
isWhitespace(CodePoint ch)33 bool isWhitespace(CodePoint ch)
34 {
35 return (ch >= 0x0009 && ch <= 0x000D) || ch == 0x0020 || ch == 0x0085 || ch == 0x00A0 ||
36 ch == 0x1680 || (ch >= 0x2000 && ch <= 0x200A) || ch == 0x2028 || ch == 0x2029 ||
37 ch == 0x202F || ch == 0x205F || ch == 0x3000;
38 }
39
isCJK(CodePoint ch)40 bool isCJK(CodePoint ch)
41 {
42 return (ch >= 0x4E00 && ch <= 0x9FBF) // CJK Unified Ideographs
43 || (ch >= 0x2E80 && ch <= 0x2FDF) // CJK Radicals Supplement & Kangxi Radicals
44 || (ch >= 0x2FF0 && ch <= 0x30FF) // Ideographic Description Characters, CJK Symbols
45 // and Punctuation & Japanese
46 || (ch >= 0x3100 && ch <= 0x31BF) // Korean
47 || (ch >= 0xAC00 && ch <= 0xD7AF) // Hangul Syllables
48 || (ch >= 0xF900 && ch <= 0xFAFF) // CJK Compatibility Ideographs
49 || (ch >= 0xFE30 && ch <= 0xFE4F) // CJK Compatibility Forms
50 || (ch >= 0x31C0 && ch <= 0x4DFF); // Other exiensions
51 }
52
// Parses the run of ASCII decimal digits at the start of the null-terminated
// UTF-32 string p and returns its value.
//
// Parsing stops at the first non-digit or at the terminator. A null pointer,
// an empty string, or a string that does not start with a digit gives 0.
// Returns -1 when the run reaches kMaxDigits digits or when the value would
// not fit in an int.
int digitsToNaturalNumber(const char32_t* p)
{
  constexpr int kMaxDigits = 16;
  constexpr int kMaxValue = std::numeric_limits<int>::max();

  if (!p) return 0;

  int value = 0;
  int count = 0;
  while (p[count])
  {
    // compare at full char32_t width: narrowing to char would make code points
    // such as U+0130 look like the digit '0'.
    const char32_t c = p[count];
    if ((c < U'0') || (c > U'9')) break;
    const int digit = static_cast<int>(c - U'0');

    // refuse to overflow instead of invoking undefined behavior
    if (value > (kMaxValue - digit) / 10) return -1;
    value = (value * 10) + digit;

    if (++count >= kMaxDigits) return -1;
  }
  return value;
}
76
textToNaturalNumber(const TextFragment & frag)77 int textToNaturalNumber(const TextFragment& frag)
78 {
79 std::vector<CodePoint> vec = textToCodePoints(frag);
80 return digitsToNaturalNumber(vec.data());
81 }
82
naturalNumberToText(int i)83 TextFragment naturalNumberToText(int i)
84 {
85 constexpr int kMaxDigits = 16;
86
87 char buf[kMaxDigits]{};
88 char* p = buf + kMaxDigits - 1;
89 char* end = p;
90
91 // null-terminate the string
92 *end = 0;
93
94 // work backwards
95 do
96 {
97 p--;
98 if (p < buf) return "overflow";
99 *p = '0' + (i % 10);
100 i /= 10;
101 } while (i != 0);
102 return (TextFragment(p, end - p));
103 }
104
105 // numeric
106
// Converts f to text.
//
// f: the value to write.
// precision: number of significant decimal places wanted; values above
//   kMaxPrecision are clamped down to it.
//
// Output forms, as produced by the branches below:
//   - NaN is written as "nan".
//   - negative values get a leading '-'.
//   - magnitudes above 1e38 are written as "inf".
//   - magnitudes below 1e-38 (including zero) are written as "0.".
//   - otherwise decimal notation is used when the magnitude of the decimal
//     exponent is below kScientificStart, and scientific notation
//     (d.ddde+XX / d.ddde-XX, two exponent digits) is used for the rest.
//
// NOTE(review): writes into buf are not checked against kMaxDigits, and
// precision is not clamped from below. Confirm callers pass sane values.
TextFragment floatNumberToText(float f, int precision)
{
  // const float maxFloat = std::numeric_limits<float>::max();
  constexpr int kMaxPrecision = 10;
  constexpr int kScientificStart = 5;
  constexpr int kMaxDigits = 32;
  // index of 1e+00 in powersOfTen; powersOfTen[kTableZeroOffset + n] is 10^n
  constexpr int kTableZeroOffset = 38;
  constexpr float powersOfTen[kTableZeroOffset * 2 + 1]{
      1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29, 1e-28, 1e-27, 1e-26,
      1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14, 1e-13,
      1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 1e-02, 1e-01, 1e+00,
      1e+01, 1e+02, 1e+03, 1e+04, 1e+05, 1e+06, 1e+07, 1e+08, 1e+09, 1e+10, 1e+11, 1e+12, 1e+13,
      1e+14, 1e+15, 1e+16, 1e+17, 1e+18, 1e+19, 1e+20, 1e+21, 1e+22, 1e+23, 1e+24, 1e+25, 1e+26,
      1e+27, 1e+28, 1e+29, 1e+30, 1e+31, 1e+32, 1e+33, 1e+34, 1e+35, 1e+36, 1e+37, 1e+38};

  char buf[kMaxDigits];
  char* writePtr = buf;
  float value = f;
  const int p = std::min(precision, kMaxPrecision);
  // digit extraction stops once the remainder is no larger than |f| * 10^-p,
  // with the smallest positive normal float as a lower bound
  const float epsilon =
      std::max((float)fabs(f * powersOfTen[kTableZeroOffset - p]), std::numeric_limits<float>::min());

  if (std::isnan(f))
  {
    *writePtr++ = 'n';
    *writePtr++ = 'a';
    *writePtr++ = 'n';
  }
  else
  {
    // from here on value holds the magnitude; the sign is already written
    if (value < 0)
    {
      value = -value;
      *writePtr++ = '-';
    }

    if (value > powersOfTen[kTableZeroOffset * 2])
    {
      // larger than the biggest table entry (1e38)
      *writePtr++ = 'i';
      *writePtr++ = 'n';
      *writePtr++ = 'f';
    }
    else if (value < powersOfTen[0])
    {
      // smaller than the smallest table entry (1e-38), zero included
      *writePtr++ = '0';
      *writePtr++ = '.';
    }
    else
    {
      // get the exponent using linear search, starting from center
      // (afterwards powersOfTen[y] is the largest entry that is <= value)
      int y = kTableZeroOffset;
      while (value > powersOfTen[y])
      {
        y++;
      }
      while (value < powersOfTen[y])
      {
        y--;
      }
      int exponent = y - kTableZeroOffset;
      int absExponent = std::abs(exponent);

      if (absExponent < kScientificStart)
      // write in decimal notation
      {
        // first write any leading zeroes
        if (exponent < -1)
        {
          *writePtr++ = '0';
          *writePtr++ = '.';
          int zeroes = -exponent - 1;
          for (int i = 0; i < zeroes; ++i)
          {
            *writePtr++ = '0';
          }
        }
        else if (exponent == -1)
        {
          // the '.' itself is written by the digit loop below
          *writePtr++ = '0';
        }

        // then write nonzero digits
        do
        {
          // the decimal point goes right before the tenths digit
          if (exponent == -1)
          {
            *writePtr++ = '.';
          }
          // scale so the current digit is the ones place, emit it, then
          // subtract its contribution from the remaining value
          int onesInt = truncf(value * powersOfTen[kTableZeroOffset - exponent]);
          *writePtr++ = '0' + onesInt;
          value = value - onesInt * powersOfTen[kTableZeroOffset + exponent];
          exponent--;
          // keep going until the remainder is negligible and every integer
          // place down to the ones digit has been written
        } while ((value > epsilon) || (exponent >= 0));
      }
      else
      // write in scientific notation
      {
        // captured now because exponent is decremented while writing digits
        const char exponentSign = exponent >= 0 ? '+' : '-';

        // write mantissa
        int onesInt = value * powersOfTen[kTableZeroOffset - exponent];
        *writePtr++ = '0' + onesInt;
        *writePtr++ = '.';
        while (value > epsilon)
        {
          value = value - onesInt * powersOfTen[kTableZeroOffset + exponent];
          exponent--;
          onesInt = value * powersOfTen[kTableZeroOffset - exponent];
          *writePtr++ = '0' + onesInt;
        }

        // write exponent
        *writePtr++ = 'e';
        *writePtr++ = exponentSign;
        *writePtr++ = '0' + absExponent / 10;
        *writePtr++ = '0' + absExponent % 10;
      }
    }
  }
  return TextFragment(buf, writePtr - buf);
}
228
fragmentContainsCodePoint(TextFragment f,CodePoint cp)229 bool fragmentContainsCodePoint(TextFragment f, CodePoint cp)
230 {
231 for (const CodePoint c : f)
232 {
233 if (c == cp) return true;
234 }
235 return false;
236 }
237
textToFloatNumber(const TextFragment & frag)238 float textToFloatNumber(const TextFragment& frag)
239 {
240 float sign = 1;
241 float wholePart = 0, fracPart = 0, fracPlace = 1;
242 float exponentSign = 1, exponent = 0;
243 bool hasExp = false;
244 auto it = frag.begin();
245 const TextFragment digits{"0123456789"};
246 std::vector<std::pair<TextFragment, std::function<void()> > > segments{
247 {"NaN", [&]() { wholePart = std::numeric_limits<float>::quiet_NaN(); }},
248 {"-", [&]() { sign = -sign; }},
249 {"inf", [&]() { wholePart = std::numeric_limits<float>::infinity(); }},
250 {digits, [&]() { wholePart = wholePart * 10.0f + ((*it) - '0'); }},
251 {".", [&]() {}},
252 {digits, [&]() { fracPart += ((*it) - '0') * (fracPlace *= 0.1f); }},
253 {"e+", [&]() { hasExp = true; }},
254 {"-", [&]() { exponentSign = -exponentSign; }},
255 {digits, [&]() { exponent = exponent * 10.0f + ((*it) - '0'); }}};
256
257 for (auto segment : segments)
258 {
259 while (fragmentContainsCodePoint(segment.first, *it))
260 {
261 segment.second();
262 ++it;
263 }
264 }
265
266 float base = sign * (wholePart + fracPart);
267 return hasExp ? base * powf(10.f, exponent * exponentSign) : base;
268 }
269
findFirst(const TextFragment & frag,const CodePoint b)270 int findFirst(const TextFragment& frag, const CodePoint b)
271 {
272 int r = npos;
273 if (!frag) return r;
274 int i = 0;
275 for (const CodePoint c : frag)
276 {
277 if (!validateCodePoint(c)) return r;
278 if (c == b)
279 {
280 r = i;
281 break;
282 }
283 i++;
284 }
285 return r;
286 }
287
findLast(const TextFragment & frag,const CodePoint b)288 int findLast(const TextFragment& frag, const CodePoint b)
289 {
290 int r = npos;
291 if (!frag) return r;
292 int i = 0;
293 for (const CodePoint c : frag)
294 {
295 if (!validateCodePoint(c)) return r;
296 if (c == b)
297 {
298 r = i;
299 }
300 i++;
301 }
302 return r;
303 }
304
findFirst(const TextFragment & frag,std::function<bool (CodePoint)> matchFn)305 int findFirst(const TextFragment& frag, std::function<bool(CodePoint)> matchFn)
306 {
307 int r = npos;
308 if (!frag) return r;
309 int i = 0;
310 for (const CodePoint c : frag)
311 {
312 if (!validateCodePoint(c)) return r;
313 if (matchFn(c))
314 {
315 r = i;
316 break;
317 }
318 i++;
319 }
320 return r;
321 }
322
323 // TODO dumb, have to call matchFn on each code point because we have no reverse
324 // iterator
findLast(const TextFragment & frag,std::function<bool (CodePoint)> matchFn)325 int findLast(const TextFragment& frag, std::function<bool(CodePoint)> matchFn)
326 {
327 int r = npos;
328 if (!frag) return r;
329 int i = 0;
330 for (const CodePoint c : frag)
331 {
332 if (!validateCodePoint(c)) return r;
333 if (matchFn(c))
334 {
335 r = i;
336 }
337 i++;
338 }
339 return r;
340 }
341
// Returns the code points of frag in the half-open index range [start, end)
// as a new fragment.
//
// Returns an empty fragment when frag is empty, when start >= end, or when an
// invalid code point is met inside the requested range. Indices beyond the end
// of the text are clamped: iteration stops at frag.end(), so an oversized
// start or end can no longer read past the text or write past the temporary
// buffer.
TextFragment subText(const TextFragment& frag, size_t start, size_t end)
{
  // this impl does an unnecessary copy, to keep TextFragment very simple for
  // now.
  if (!frag) return TextFragment();
  if (start >= end) return TextFragment();

  // temp buffer big enough to hold whole input fragment if needed.
  // we won't know the output fragment size in bytes until iterating the code
  // points.
  size_t len = frag.lengthInBytes();
  SmallStackBuffer<char, kShortFragmentSizeInChars> temp(len);
  char* buf = temp.data();
  char* pb = buf;

  auto it = frag.begin();
  auto last = frag.end();

  // skip the code points before the requested range
  for (size_t i = 0; (i < start) && (it != last); ++i)
  {
    ++it;
  }

  for (size_t i = start; (i < end) && (it != last); ++i)
  {
    // write the codepoint as UTF-8 to the buffer
    if (!validateCodePoint(*it)) return TextFragment();
    pb = utf::internal::utf_traits<utf::utf8>::encode(*it, pb);
    ++it;
  }

  return TextFragment(buf, pb - buf);
}
374
// Returns a new fragment made by applying f to every code point of frag.
// An empty frag gives an empty fragment.
TextFragment map(const TextFragment& frag, std::function<CodePoint(CodePoint)> f)
{
  if (!frag) return TextFragment();

  std::vector<CodePoint> points = textToCodePoints(frag);
  for (CodePoint& point : points)
  {
    point = f(point);
  }
  return codePointsToText(points);
}
382
// Returns a new fragment holding only the code points of frag for which
// matchFn returns true, in their original order. Returns an empty fragment
// when frag is empty or contains an invalid code point.
TextFragment reduce(const TextFragment& frag, std::function<bool(CodePoint)> matchFn)
{
  if (!frag) return TextFragment();

  // scratch space sized to the byte length of the input
  SmallStackBuffer<char, kShortFragmentSizeInChars> scratch(frag.lengthInBytes());
  char* const outStart = scratch.data();
  char* outPos = outStart;

  for (const CodePoint c : frag)
  {
    if (!validateCodePoint(c)) return TextFragment();
    if (!matchFn(c)) continue;
    outPos = utf::internal::utf_traits<utf::utf8>::encode(c, outPos);
  }

  return TextFragment(outStart, outPos - outStart);
}
402
split(TextFragment frag,CodePoint delimiter)403 std::vector<TextFragment> split(TextFragment frag, CodePoint delimiter)
404 {
405 std::vector<TextFragment> output;
406 int start = 0;
407 int end = 0;
408 int pieceLen = 0;
409 for (const CodePoint c : frag)
410 {
411 if (!validateCodePoint(c)) return std::vector<TextFragment>();
412 pieceLen++;
413 end++;
414 if (c == delimiter)
415 {
416 if (pieceLen > 1)
417 {
418 output.push_back(subText(frag, start, end - 1));
419 }
420 start = end;
421 pieceLen = 0;
422 }
423 }
424 if (pieceLen > 0)
425 {
426 output.push_back(subText(frag, start, end));
427 }
428 return output;
429 }
430
join(const std::vector<TextFragment> & vec)431 TextFragment join(const std::vector<TextFragment>& vec)
432 {
433 TextFragment sum;
434 size_t len = vec.size();
435 for (int i = 0; i < len; ++i)
436 {
437 TextFragment frag = vec[i];
438 sum = TextFragment(sum, vec[i]);
439 }
440 return sum;
441 }
442
join(const std::vector<TextFragment> & vec,CodePoint delimiter)443 TextFragment join(const std::vector<TextFragment>& vec, CodePoint delimiter)
444 {
445 TextFragment delimFrag(delimiter);
446 TextFragment sum;
447 size_t len = vec.size();
448 for (int i = 0; i < len; ++i)
449 {
450 TextFragment frag = vec[i];
451 sum = TextFragment(sum, vec[i]);
452 if ((i >= 0) && (i < len - 1))
453 {
454 sum = TextFragment(sum, delimFrag);
455 }
456 }
457 return sum;
458 }
459
// Returns frag with its file extension removed: everything from the last '.'
// onwards is dropped. frag is returned unchanged when it has no '.', or when
// the last '.' comes before the last '/'. In that second case the dot belongs
// to a directory name, and cutting there would truncate the path
// ("a.b/c" stays "a.b/c").
TextFragment stripFileExtension(const TextFragment& frag)
{
  const int dotLoc = findLast(frag, '.');
  if (dotLoc < 0) return frag;

  const int slashLoc = findLast(frag, '/');
  if (dotLoc < slashLoc) return frag;

  return subText(frag, 0, dotLoc);
}
469
// Returns the part of frag after its last '/'. When frag has no '/', frag is
// returned unchanged.
TextFragment getShortFileName(const TextFragment& frag)
{
  const int lastSlash = findLast(frag, '/');
  if (lastSlash < 0) return frag;
  return subText(frag, lastSlash + 1, frag.lengthInCodePoints());
}
479
// Returns the part of frag before its last '/', without that slash. When frag
// has no '/', frag is returned unchanged.
TextFragment getPath(const TextFragment& frag)
{
  const int lastSlash = findLast(frag, '/');
  const bool hasSlash = (lastSlash >= 0);
  return hasSlash ? subText(frag, 0, lastSlash) : frag;
}
489
490 // TODO extend to recognize Cyrillic and other scripts
bestScriptForTextFragment(const TextFragment & frag)491 Symbol bestScriptForTextFragment(const TextFragment& frag)
492 {
493 for (const CodePoint c : frag)
494 {
495 if (!validateCodePoint(c)) return "unknown";
496 // if there are any CJK characters, return CJK
497 if (isCJK(c))
498 {
499 return "cjk";
500 }
501 else if (!isLatin(c))
502 {
503 return "unknown";
504 }
505 }
506 return "latin";
507 }
508
// Base64 alphabet used by base64Encode() and base64Decode(). Indices 0-63 are
// the value characters; index 64 is the '=' padding character, which
// base64Decode() recognizes by its index.
static const char base64table[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
// Returns the index of the first occurrence of c in the null-terminated string
// str, or -1 if c does not occur. The terminator itself is never matched.
int indexOf(const char* str, char c)
{
  for (const char* cursor = str; *cursor != 0; ++cursor)
  {
    if (*cursor == c) return (int)(cursor - str);
  }
  return -1;
}
525
base64Encode(const std::vector<uint8_t> & in)526 TextFragment base64Encode(const std::vector<uint8_t>& in)
527 {
528 size_t len = in.size();
529 std::vector<char> out;
530 int b;
531 for (size_t i = 0; i < len; i += 3)
532 {
533 b = (in[i] & 0xFC) >> 2;
534 out.push_back(base64table[b]);
535 b = (in[i] & 0x03) << 4;
536 if (i + 1 < len)
537 {
538 b |= (in[i + 1] & 0xF0) >> 4;
539 out.push_back(base64table[b]);
540 b = (in[i + 1] & 0x0F) << 2;
541 if (i + 2 < len)
542 {
543 b |= (in[i + 2] & 0xC0) >> 6;
544 out.push_back(base64table[b]);
545 b = in[i + 2] & 0x3F;
546 out.push_back(base64table[b]);
547 }
548 else
549 {
550 out.push_back(base64table[b]);
551 out.push_back('=');
552 }
553 }
554 else
555 {
556 out.push_back(base64table[b]);
557 out.push_back('=');
558 out.push_back('=');
559 }
560 }
561 out.push_back(0);
562 return TextFragment(out.data());
563 }
564
base64Decode(const TextFragment & frag)565 std::vector<uint8_t> base64Decode(const TextFragment& frag)
566 {
567 size_t len = frag.lengthInBytes();
568 if (len % 4) return std::vector<uint8_t>();
569 std::vector<uint8_t> decoded;
570 const char* inChars = frag.getText();
571 int b[4];
572 for (int i = 0; i < len; i += 4)
573 {
574 for (int j = 0; j < 4; ++j)
575 {
576 b[j] = indexOf(base64table, inChars[i + j]);
577 }
578 decoded.push_back((b[0] << 2) | (b[1] >> 4));
579 if (b[2] < 64)
580 {
581 decoded.push_back((b[1] << 4) | (b[2] >> 2));
582 if (b[3] < 64)
583 {
584 decoded.push_back((b[2] << 6) | b[3]);
585 }
586 }
587 }
588 return decoded;
589 }
590
// Returns frag without the whitespace at its start and end. Whitespace inside
// the text is kept. If frag has no non-whitespace code point, the result is
// an empty fragment.
TextFragment stripWhitespaceAtEnds(const TextFragment& frag)
{
  std::function<bool(CodePoint)> notWhitespace([](CodePoint c) { return !isWhitespace(c); });

  const int firstKept = findFirst(frag, notWhitespace);
  const int lastKept = findLast(frag, notWhitespace);

  const bool nothingToKeep = (firstKept == npos) || (lastKept == npos);
  if (nothingToKeep) return TextFragment();

  // subText takes a half-open range, hence the + 1
  return (subText(frag, firstKept, lastKept + 1));
}
599
// Returns frag with every whitespace code point removed, by keeping only the
// code points for which isWhitespace() is false.
TextFragment stripAllWhitespace(const TextFragment& frag)
{
  std::function<bool(CodePoint)> keep([](CodePoint c) { return !isWhitespace(c); });
  return reduce(frag, keep);
}
605
AES256CBCEncode(const std::vector<uint8_t> & input,const std::vector<uint8_t> & key,const std::vector<uint8_t> & iv)606 std::vector<uint8_t> AES256CBCEncode(const std::vector<uint8_t>& input,
607 const std::vector<uint8_t>& key,
608 const std::vector<uint8_t>& iv)
609 {
610 if (!(input.size() > 0) || !(key.size() == 32) || !(iv.size() == 32))
611 return std::vector<uint8_t>();
612
613 aes256_context ctx;
614 aes256_init(&ctx, key.data());
615
616 const int blockSize = 16;
617 size_t inputSize = input.size();
618 size_t blocks = inputSize / blockSize + 1;
619 size_t paddedSize = blockSize * (blocks);
620
621 // add PKCS padding
622 std::vector<uint8_t> plaintext = input;
623 plaintext.resize(paddedSize);
624 size_t padBytes = paddedSize - inputSize;
625 for (size_t i = inputSize; i < paddedSize; ++i)
626 {
627 plaintext[i] = padBytes;
628 }
629
630 std::vector<uint8_t> ciphertext(paddedSize);
631 uint8_t currentIV[blockSize];
632 uint8_t workVector[blockSize];
633
634 for (size_t i = 0; i < blockSize; ++i)
635 {
636 currentIV[i] = iv[i];
637 }
638
639 for (size_t b = 0; b < blocks; ++b)
640 {
641 // get plaintext XOR IV
642 for (size_t i = 0; i < blockSize; ++i)
643 {
644 workVector[i] = plaintext[b * blockSize + i] ^ currentIV[i];
645 }
646
647 aes256_encrypt_ecb(&ctx, workVector);
648
649 // write to ciphertext, get new IV
650 for (size_t i = 0; i < blockSize; ++i)
651 {
652 ciphertext[b * blockSize + i] = workVector[i];
653 currentIV[i] = workVector[i];
654 }
655 }
656
657 aes256_done(&ctx);
658 return ciphertext;
659 }
660
AES256CBCDecode(const std::vector<uint8_t> & cipher,const std::vector<uint8_t> & key,const std::vector<uint8_t> & iv)661 std::vector<uint8_t> AES256CBCDecode(const std::vector<uint8_t>& cipher,
662 const std::vector<uint8_t>& key,
663 const std::vector<uint8_t>& iv)
664 {
665 if (!(cipher.size() > 0) || (key.size() < 32) || (iv.size() < 32)) return std::vector<uint8_t>();
666
667 aes256_context ctx;
668 aes256_init(&ctx, key.data());
669
670 const int blockSize = 16;
671 size_t blocks = cipher.size() / blockSize;
672
673 std::vector<uint8_t> plaintext(blockSize * blocks);
674
675 uint8_t currentIV[blockSize];
676 uint8_t nextIV[blockSize];
677 uint8_t workVector[blockSize];
678
679 for (int i = 0; i < blockSize; ++i)
680 {
681 currentIV[i] = iv[i];
682 }
683
684 for (int b = 0; b < blocks; ++b)
685 {
686 // get next cipher block and use ciphertext as next IV
687 for (int i = 0; i < blockSize; ++i)
688 {
689 workVector[i] = cipher[b * blockSize + i];
690 nextIV[i] = workVector[i];
691 }
692
693 aes256_decrypt_ecb(&ctx, workVector);
694
695 // write to plaintext, XOR work vector with IV
696 for (int i = 0; i < blockSize; ++i)
697 {
698 workVector[i] ^= currentIV[i];
699 plaintext[b * blockSize + i] = workVector[i];
700 currentIV[i] = nextIV[i];
701 }
702 }
703
704 aes256_done(&ctx);
705
706 // remove PKCS padding
707 size_t paddedSize = plaintext.size();
708 if (paddedSize % blockSize == 0)
709 {
710 int padBytes = plaintext[paddedSize - 1];
711 if ((padBytes <= 16) && (padBytes < paddedSize))
712 {
713 plaintext.resize(paddedSize - padBytes);
714 }
715 }
716
717 return plaintext;
718 }
719
collate(const TextFragment & a,const TextFragment & b)720 bool collate(const TextFragment& a, const TextFragment& b)
721 {
722 auto ia = a.begin();
723 auto ib = b.begin();
724
725 int iterEnds = 0;
726 while (!iterEnds)
727 {
728 CodePoint ca = *ia;
729 CodePoint cb = *ib;
730
731 if (!validateCodePoint(ca)) return false;
732 if (!validateCodePoint(cb)) return false;
733
734 if (ca != cb)
735 {
736 // the code points differ.
737 // compare codepoints, produce a result and bail.
738 if (isLatin(ca) && isLatin(cb))
739 {
740 char la = tolower(ca);
741 char lb = tolower(cb);
742
743 if (la != lb)
744 {
745 // different letters
746 return la < lb;
747 }
748 else
749 {
750 // different cases but same letter. define lower case as less within
751 // letter.
752 return ca > cb;
753 }
754 }
755 else
756 {
757 // TODO collate other languages better using miniutf library.
758 return ca < cb;
759 }
760 }
761 else
762 {
763 ++ia;
764 ++ib;
765 }
766
767 int aEnd = (ia == a.end());
768 int bEnd = (ib == b.end());
769 iterEnds = (aEnd << 1) | bEnd;
770 }
771
772 switch (iterEnds)
773 {
774 case 1: // b ended but not a: a > b.
775 return false;
776 case 2: // a ended but not b: a < b.
777 return true;
778 case 3: // both ended, a == b.
779 default: // impossible
780 return false;
781 }
782 return false;
783 }
784
785 #pragma mark Symbol utilities
786
addFinalNumber(Symbol sym,int n)787 Symbol addFinalNumber(Symbol sym, int n)
788 {
789 TextFragment t(sym.getTextFragment(), textUtils::naturalNumberToText(n));
790 return Symbol(t.getText());
791 }
792
stripFinalNumber(Symbol sym)793 Symbol stripFinalNumber(Symbol sym)
794 {
795 const TextFragment& frag = sym.getTextFragment();
796 size_t points = frag.lengthInCodePoints();
797
798 // TODO make more readble using random access fragment class
799
800 SmallStackBuffer<CodePoint, kShortFragmentSizeInCodePoints> temp(points + 1);
801 CodePoint* buf = temp.data();
802
803 // read into char32 array for random access
804 int i = 0;
805 for (CodePoint c : frag)
806 {
807 if (!validateCodePoint(c)) return Symbol();
808 buf[i++] = c;
809 }
810
811 // null terminate
812 buf[points] = 0;
813
814 // no final number? return
815 if (!textUtils::isDigit(buf[points - 1])) return sym;
816
817 // read backwards until non-digit
818 size_t firstDigitPos = 0;
819 for (size_t i = points - 2; i >= 0; --i)
820 {
821 char32_t c = buf[i];
822 if (!textUtils::isDigit(c))
823 {
824 firstDigitPos = i + 1;
825 break;
826 }
827 }
828
829 ml::TextFragment subFrag(textUtils::subText(frag, 0, firstDigitPos));
830 return subFrag.getText();
831 }
832
833 // if the symbol's text ends in a positive integer, return that number.
834 // Otherwise return 0.
getFinalNumber(Symbol sym)835 int getFinalNumber(Symbol sym)
836 {
837 // make temporary buffer of decoded code points, hopefully on stack
838 const TextFragment& frag = sym.getTextFragment();
839 size_t points = frag.lengthInCodePoints();
840
841 // TODO make more readble using random access fragment class
842
843 SmallStackBuffer<CodePoint, kShortFragmentSizeInCodePoints> decodedPoints(points + 1);
844 CodePoint* buf = decodedPoints.data();
845
846 // read into char32 array for random access
847 int i = 0;
848 for (CodePoint c : frag)
849 {
850 if (!validateCodePoint(c)) return 0;
851 buf[i++] = c;
852 }
853
854 // null terminate char32_t string
855 buf[i] = 0;
856
857 // no final number? return
858 if (!textUtils::isDigit(buf[i - 1])) return 0;
859
860 // read backwards until non-digit
861 int firstDigitPos = 0;
862 for (i--; i >= 0; --i)
863 {
864 char32_t c = buf[i];
865 if (!textUtils::isDigit(c))
866 {
867 firstDigitPos = i + 1;
868 break;
869 }
870 }
871
872 // note, null terminated char32_t string needed
873 int r = digitsToNaturalNumber(buf + firstDigitPos);
874 return r;
875 }
876
stripFinalCharacter(Symbol sym)877 Symbol stripFinalCharacter(Symbol sym)
878 {
879 TextFragment frag = sym.getTextFragment();
880 size_t len = frag.lengthInCodePoints();
881 return Symbol(subText(frag, 0, len - 1));
882 }
883
884 #pragma mark NameMaker
885
886 // base-26 arithmetic with letters (A = 0) produces A, B, ... Z, BA, BB ...
nextName()887 const TextFragment NameMaker::nextName()
888 {
889 std::vector<int> digits;
890 const int base = 26;
891 const char baseChar = 'A';
892 int a, m, d, rem;
893
894 a = index++;
895
896 if (!a)
897 {
898 digits.push_back(0);
899 }
900 else
901 while (a)
902 {
903 d = a / base;
904 m = d * base;
905 rem = a - m;
906 digits.push_back(rem);
907 a = d;
908 }
909
910 int c = 0;
911 while (digits.size() && (c < maxLen - 1))
912 {
913 d = digits.back();
914 digits.pop_back();
915
916 buf[c++] = static_cast<char>(d) + baseChar;
917 }
918
919 buf[c++] = 0;
920 return TextFragment(buf);
921 }
922
// Small deterministic pseudo-random generator. Each step replaces the 32-bit
// state with state * kMultiplier + kIncrement (wrapping in uint32_t). The
// state starts at zero and reset() returns it to zero, so every instance
// produces the same sequence.
class NoiseGen
{
 public:
  NoiseGen() = default;
  ~NoiseGen() = default;

  // advance the state by one step
  inline void step() { mSeed = (mSeed * kMultiplier) + kIncrement; }

  // advance the state and return the new value
  inline uint32_t getIntSample()
  {
    step();
    return mSeed;
  }

  // restart the sequence from the beginning
  void reset() { mSeed = 0; }

 private:
  static constexpr uint32_t kMultiplier = 0x0019660D;
  static constexpr uint32_t kIncrement = 0x3C6EF35F;

  uint32_t mSeed{0};
};
942
// The 32 letters vectorOfNonsenseSymbols() picks from with a 5-bit index
// (plus the terminator). Some letters appear twice, which makes them twice as
// likely to be picked.
static const char kLetters[33] = "aabcdeefghijklmnnoopqrssttuvwxyz";
vectorOfNonsenseSymbols(int len)944 std::vector<Symbol> vectorOfNonsenseSymbols(int len)
945 {
946 NoiseGen randSource;
947 std::vector<Symbol> words;
948 for (int i = 0; i < len; ++i)
949 {
950 std::string newStr;
951 uint32_t r32 = randSource.getIntSample() >> 16;
952 int wordLen = (r32 & 7) + 3;
953
954 for (int j = 0; j < wordLen; ++j)
955 {
956 r32 = randSource.getIntSample() >> 16;
957 int idx = (r32 & 31);
958 newStr += (kLetters[idx]);
959 }
960 words.push_back(Symbol(newStr.c_str()));
961 }
962 return words;
963 }
964
formatNumber(const float number,const int digits,const int precision,const bool doSign,Symbol mode)965 ml::Text formatNumber(const float number, const int digits, const int precision, const bool doSign,
966 Symbol mode) throw()
967 {
968 const std::vector<ml::Text> pitchNames{"A", "A#", "B", "C", "C#", "D",
969 "D#", "E", "F", "F#", "G", "G#"};
970
971 const int bufLength = 16;
972 char numBuf[bufLength] = {0};
973 char format[bufLength] = {0};
974 float tweakedNumber;
975
976 // get digits to display
977 int m = (precision > 0) ? std::max(digits, precision + 1) : digits;
978 int d = ceil(log10f(fabs(number) + 1.));
979 int p = (d + precision > m) ? m - d : precision;
980 p = std::max(p, 0);
981
982 // printf("---------number: %-+10.2f\n", number);
983 // printf("---------number: %-+10.8f\n", number);
984 // printf("max: %d, digits: %d, after decimal: %d\n", m, d, p);
985
986 tweakedNumber = number;
987 if (mode == "default")
988 {
989 if (doSign)
990 {
991 snprintf(format, bufLength, "X-+0%1d.%1df", m, p);
992 }
993 else
994 {
995 snprintf(format, bufLength, "X-0%1d.%1df", m, p);
996 }
997 format[0] = 37; // '%'
998 snprintf(numBuf, bufLength, format, tweakedNumber);
999 }
1000 else if (mode == "ratio")
1001 {
1002 bool done = false;
1003 for (int a = 1; a <= 8 && !done; ++a)
1004 {
1005 for (int b = 1; b <= 4 && !done; ++b)
1006 {
1007 if (fabs(number - (float)a / (float)b) < 0.001)
1008 {
1009 snprintf(numBuf, bufLength, "%d/%d", a, b);
1010 done = true;
1011 }
1012 }
1013 }
1014 if (!done)
1015 {
1016 if (doSign)
1017 {
1018 snprintf(format, bufLength, "X-+0%1d.%1df", m, p);
1019 }
1020 else
1021 {
1022 snprintf(format, bufLength, "X-0%1d.%1df", m, p);
1023 }
1024 format[0] = 37; // '%'
1025 snprintf(numBuf, bufLength, format, tweakedNumber);
1026 }
1027 }
1028 else if (mode == "pitch1") // just show As
1029 {
1030 int octave = log2(number / (27.5f - 0.01f));
1031 float quant = (pow(2.f, (float)octave) * 27.5f);
1032 float distFromOctave = fabs(number - quant);
1033 if (distFromOctave < 0.01)
1034 {
1035 snprintf(format, bufLength, "X-0%1d.%1df\nA%d", m, p, octave);
1036 }
1037 else
1038 {
1039 snprintf(format, bufLength, "X-0%1d.%1df", m, p);
1040 }
1041 format[0] = 37; // '%'
1042 snprintf(numBuf, bufLength, format, tweakedNumber);
1043 }
1044 else if (mode == "pitch2") // show all notes
1045 {
1046 int note = log2f(number / (27.5f - 0.01f)) * 12.f;
1047 float quantizedNotePitch = (pow(2.f, (float)note / 12.f) * 27.5f);
1048 float distFromNote = fabs(number - quantizedNotePitch);
1049 if (distFromNote < 0.01)
1050 {
1051 const int octaveFromC = (note - 3) / 12;
1052 snprintf(format, bufLength, "X-0%1d.%1df\n%s%d", m, p, pitchNames[note % 12].getText(),
1053 octaveFromC);
1054 }
1055 else
1056 {
1057 snprintf(format, bufLength, "X-0%1d.%1df", m, p);
1058 }
1059 format[0] = 37; // '%'
1060 snprintf(numBuf, bufLength, format, tweakedNumber);
1061 }
1062 else if (mode == "db")
1063 {
1064 if (doSign)
1065 {
1066 snprintf(format, bufLength, "X-+0%1d.%1dfdB", m, p);
1067 }
1068 else
1069 {
1070 snprintf(format, bufLength, "X-0%1d.%1dfdB", m, p);
1071 }
1072 format[0] = 37; // '%'
1073 snprintf(numBuf, bufLength, format, tweakedNumber);
1074 }
1075
1076 return Text(numBuf);
1077 }
1078
1079 } // namespace textUtils
1080 } // namespace ml
1081