1 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This library is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU Lesser General Public License as published by the
6 // Free Software Foundation; either version 3 of the License, or (at your
7 // option) any later version.
8 //
9 // This library is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
12 // for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with this library.  If not, see <http://www.gnu.org/licenses/>.
16 
17 
18 #include "plink2_string.h"
19 
20 #ifdef __cplusplus
21 namespace plink2 {
22 #endif
23 
24 #if defined(__LP64__) && !defined(_GNU_SOURCE)
rawmemchr(const void * ss,int cc)25 CXXCONST_VOIDP rawmemchr(const void* ss, int cc) {
26   const uintptr_t starting_addr = R_CAST(uintptr_t, ss);
27   const VecI8* ss_viter = R_CAST(const VecI8*, RoundDownPow2(starting_addr, kBytesPerVec));
28   const VecI8 vvec_all_needle = veci8_set1(cc);
29   VecI8 cur_vvec = *ss_viter;
30   VecI8 needle_match_vvec = (cur_vvec == vvec_all_needle);
31   uint32_t matching_bytes = veci8_movemask(needle_match_vvec);
32   const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, ss_viter);
33   matching_bytes &= UINT32_MAX << leading_byte_ct;
34   // This is typically short-range, so the Memrchr() double-vector strategy is
35   // unlikely to be profitable.  (todo: experiment with header-inline)
36   while (!matching_bytes) {
37     ++ss_viter;
38     cur_vvec = *ss_viter;
39     needle_match_vvec = (cur_vvec == vvec_all_needle);
40     matching_bytes = veci8_movemask(needle_match_vvec);
41   }
42   const uint32_t byte_offset_in_vec = ctzu32(matching_bytes);
43   return R_CAST(CXXCONST_VOIDP, R_CAST(uintptr_t, ss_viter) + byte_offset_in_vec);
44 }
45 #endif
46 
47 #ifdef __LP64__
rawmemchr2(const void * ss,unsigned char ucc1,unsigned char ucc2)48 CXXCONST_VOIDP rawmemchr2(const void* ss, unsigned char ucc1, unsigned char ucc2) {
49   const uintptr_t starting_addr = R_CAST(uintptr_t, ss);
50   const VecI8* ss_viter = R_CAST(const VecI8*, RoundDownPow2(starting_addr, kBytesPerVec));
51   const VecI8 vvec_all_ucc1 = veci8_set1(ucc1);
52   const VecI8 vvec_all_ucc2 = veci8_set1(ucc2);
53   VecI8 cur_vvec = *ss_viter;
54   VecI8 ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
55   VecI8 ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
56   uint32_t matching_bytes = veci8_movemask(ucc1_match_vvec | ucc2_match_vvec);
57   const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, ss_viter);
58   matching_bytes &= UINT32_MAX << leading_byte_ct;
59   while (!matching_bytes) {
60     ++ss_viter;
61     cur_vvec = *ss_viter;
62     ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
63     ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
64     matching_bytes = veci8_movemask(ucc1_match_vvec | ucc2_match_vvec);
65   }
66   const uint32_t byte_offset_in_vec = ctzu32(matching_bytes);
67   return &(R_CAST(CXXCONST_CP, ss_viter)[byte_offset_in_vec]);
68 }
69 
rawmemchr3(const void * ss,unsigned char ucc1,unsigned char ucc2,unsigned char ucc3)70 CXXCONST_VOIDP rawmemchr3(const void* ss, unsigned char ucc1, unsigned char ucc2, unsigned char ucc3) {
71   const uintptr_t starting_addr = R_CAST(uintptr_t, ss);
72   const VecI8* ss_viter = R_CAST(const VecI8*, RoundDownPow2(starting_addr, kBytesPerVec));
73   const VecI8 vvec_all_ucc1 = veci8_set1(ucc1);
74   const VecI8 vvec_all_ucc2 = veci8_set1(ucc2);
75   const VecI8 vvec_all_ucc3 = veci8_set1(ucc3);
76   VecI8 cur_vvec = *ss_viter;
77   VecI8 ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
78   VecI8 ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
79   VecI8 ucc3_match_vvec = (cur_vvec == vvec_all_ucc3);
80   uint32_t matching_bytes = veci8_movemask(ucc1_match_vvec | ucc2_match_vvec | ucc3_match_vvec);
81   const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, ss_viter);
82   matching_bytes &= UINT32_MAX << leading_byte_ct;
83   while (!matching_bytes) {
84     ++ss_viter;
85     cur_vvec = *ss_viter;
86     ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
87     ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
88     ucc3_match_vvec = (cur_vvec == vvec_all_ucc3);
89     matching_bytes = veci8_movemask(ucc1_match_vvec | ucc2_match_vvec | ucc3_match_vvec);
90   }
91   const uint32_t byte_offset_in_vec = ctzu32(matching_bytes);
92   return &(R_CAST(CXXCONST_CP, ss_viter)[byte_offset_in_vec]);
93 }
94 
strchrnul3(const char * ss,unsigned char ucc1,unsigned char ucc2,unsigned char ucc3)95 CXXCONST_CP strchrnul3(const char* ss, unsigned char ucc1, unsigned char ucc2, unsigned char ucc3) {
96   const uintptr_t starting_addr = R_CAST(uintptr_t, ss);
97   const VecI8* ss_viter = R_CAST(const VecI8*, RoundDownPow2(starting_addr, kBytesPerVec));
98   const VecI8 vvec_all_zero = veci8_setzero();
99   const VecI8 vvec_all_ucc1 = veci8_set1(ucc1);
100   const VecI8 vvec_all_ucc2 = veci8_set1(ucc2);
101   const VecI8 vvec_all_ucc3 = veci8_set1(ucc3);
102   VecI8 cur_vvec = *ss_viter;
103   VecI8 zero_match_vvec = (cur_vvec == vvec_all_zero);
104   VecI8 ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
105   VecI8 ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
106   VecI8 ucc3_match_vvec = (cur_vvec == vvec_all_ucc3);
107   uint32_t matching_bytes = veci8_movemask(zero_match_vvec | ucc1_match_vvec | ucc2_match_vvec | ucc3_match_vvec);
108   const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, ss_viter);
109   matching_bytes &= UINT32_MAX << leading_byte_ct;
110   while (!matching_bytes) {
111     ++ss_viter;
112     cur_vvec = *ss_viter;
113     zero_match_vvec = (cur_vvec == vvec_all_zero);
114     ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
115     ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
116     ucc3_match_vvec = (cur_vvec == vvec_all_ucc3);
117     matching_bytes = veci8_movemask(zero_match_vvec | ucc1_match_vvec | ucc2_match_vvec | ucc3_match_vvec);
118   }
119   const uint32_t byte_offset_in_vec = ctzu32(matching_bytes);
120   return &(R_CAST(CXXCONST_CP, ss_viter)[byte_offset_in_vec]);
121 }
122 #else
rawmemchr2(const void * ss,unsigned char ucc1,unsigned char ucc2)123 CXXCONST_VOIDP rawmemchr2(const void* ss, unsigned char ucc1, unsigned char ucc2) {
124   for (const unsigned char* ss_iter = S_CAST(const unsigned char*, ss); ; ++ss_iter) {
125     unsigned char ucc = *ss_iter;
126     if ((ucc == ucc1) || (ucc == ucc2)) {
127       return S_CAST(CXXCONST_VOIDP, ss_iter);
128     }
129   }
130 }
131 
rawmemchr3(const void * ss,unsigned char ucc1,unsigned char ucc2,unsigned char ucc3)132 CXXCONST_VOIDP rawmemchr3(const void* ss, unsigned char ucc1, unsigned char ucc2, unsigned char ucc3) {
133   for (const unsigned char* ss_iter = S_CAST(const unsigned char*, ss); ; ++ss_iter) {
134     unsigned char ucc = *ss_iter;
135     if ((ucc == ucc1) || (ucc == ucc2) || (ucc == ucc3)) {
136       return S_CAST(CXXCONST_VOIDP, ss_iter);
137     }
138   }
139 }
140 
strchrnul3(const char * ss,unsigned char ucc1,unsigned char ucc2,unsigned char ucc3)141 CXXCONST_CP strchrnul3(const char* ss, unsigned char ucc1, unsigned char ucc2, unsigned char ucc3) {
142   for (; ; ++ss) {
143     unsigned char ucc = *ss;
144     // at this point, checking against a precomputed 256-byte array may be
145     // better?  test this.
146     if ((!ucc) || (ucc == ucc1) || (ucc == ucc2) || (ucc == ucc3)) {
147       return S_CAST(CXXCONST_CP, ss);
148     }
149   }
150 }
151 #endif
152 
WordWrap(uint32_t suffix_len,char * strbuf)153 void WordWrap(uint32_t suffix_len, char* strbuf) {
154   // Input: A null-terminated string with no intermediate newlines.  If
155   //        suffix_len is zero, there should be a terminating \n; otherwise,
156   //        the last character should be a space.  The allocation the string is
157   //        part of must include at least ~80 bytes past the string end.
158   // Effect: Spaces are replaced with newlines in a manner that plays well with
159   //         80 column terminal windows.  (Multi-space blocks are never
160   //         collapsed.)
161   // Considered UTF-8 awareness, but then decided against it after reading
162   //   https://denisbider.blogspot.com/2015/09/when-monospace-fonts-arent-unicode.html .
163   char* token_start = strbuf;
164   char* line_end = &(strbuf[79]);
165   char* token_end;
166   while (1) {
167     while (*token_start == ' ') {
168       ++token_start;
169     }
170     if (token_start > line_end) {
171       do {
172         *line_end = '\n';
173         line_end = &(line_end[80]);
174       } while (token_start > line_end);
175     }
176     token_end = strchr(token_start, ' ');
177     if (!token_end) {
178       if (&(token_start[79]) == line_end) {
179         return;
180       }
181       token_end = strnul(token_start);
182       if (!suffix_len) {
183         if (token_end <= &(line_end[1])) {
184           // okay if end-of-string is one past the end, because function
185           // assumes last character is \n in suffix_len == 0 case
186           assert(token_end[-1] == '\n');
187           return;
188         }
189       } else {
190         if (&(token_end[suffix_len]) <= line_end) {
191           return;
192         }
193         // because of terminal space assumption, token_start actually points
194         // to the end of the string
195         assert(token_start[-1] == ' ');
196       }
197       token_start[-1] = '\n';
198       return;
199     }
200     if (token_end > line_end) {
201       if (&(token_start[79]) != line_end) {
202         token_start[-1] = '\n';
203         line_end = &(token_start[79]);
204         if (token_end > line_end) {
205           // single really long token, can't do anything beyond putting it on
206           // its own line
207           *token_end = '\n';
208           line_end = &(token_end[80]);
209         }
210       } else {
211         // single really long token, *and* previous token was either
212         // nonexistent or long
213         *token_end = '\n';
214         line_end = &(token_end[80]);
215       }
216     }
217     token_start = &(token_end[1]);
218   }
219 }
220 
221 
222 static const uint32_t kPow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000};
223 
224 #ifdef USE_AVX2
225 static const unsigned char kLzUintSlenBase[] =
226 {9, 9, 9, 8,
227  8, 8, 7, 7,
228  7, 6, 6, 6,
229  6, 5, 5, 5,
230  4, 4, 4, 3,
231  3, 3, 3, 2,
232  2, 2, 1, 1,
233  1, 0, 0, 0,
234  1};  // UintSlen(0) needs to be 1, not zero
235 
UintSlen(uint32_t num)236 uint32_t UintSlen(uint32_t num) {
237   const uint32_t lz_ct = _lzcnt_u32(num);
238   const uint32_t slen_base = kLzUintSlenBase[lz_ct];
239   return slen_base + (num >= kPow10[slen_base]);
240 }
241 #else
242 // could also use something like ((32 - lz_ct) * 77) >> 8, since 77/256 is a
243 // sufficiently good approximation of ln(2)/ln(10), but that's a bit slower and
244 // this table doesn't take much space
245 //
246 // bugfix (29 Mar 2018): this table was totally wrong, ugh
247 static const unsigned char kBsrUintSlenBase[] =
248 {1, 1, 1, 1,
249  2, 2, 2, 3,
250  3, 3, 4, 4,
251  4, 4, 5, 5,
252  5, 6, 6, 6,
253  7, 7, 7, 7,
254  8, 8, 8, 9,
255  9, 9, 9, 9};
256 
UintSlen(uint32_t num)257 uint32_t UintSlen(uint32_t num) {
258   // tried divide-by-10 and divide-by-100 loops, they were slower
259   // also tried a hardcoded binary tree, it was better but still slower
260 
261   // bsru32(0) is undefined
262   if (num < 10) {
263     return 1;
264   }
265   const uint32_t top_bit_pos = bsru32(num);
266   const uint32_t slen_base = kBsrUintSlenBase[top_bit_pos];
267   return slen_base + (num >= kPow10[slen_base]);
268 }
269 #endif
270 
FirstUnequal4(const char * s1,const char * s2,uintptr_t slen)271 uintptr_t FirstUnequal4(const char* s1, const char* s2, uintptr_t slen) {
272   // See memequal() in plink2_base.
273 #ifdef __LP64__
274   if (slen < kBytesPerVec) {
275     if (slen < kBytesPerWord) {
276       uint32_t xor_result = (*R_CAST(const uint32_t*, s1)) ^ (*R_CAST(const uint32_t*, s2));
277       if (xor_result) {
278         return ctzu32(xor_result) / CHAR_BIT;
279       }
280       if (slen > 4) {
281         const uintptr_t final_offset = slen - 4;
282         xor_result = (*R_CAST(const uint32_t*, &(s1[final_offset]))) ^ (*R_CAST(const uint32_t*, &(s2[final_offset])));
283         if (xor_result) {
284           return final_offset + ctzu32(xor_result) / CHAR_BIT;
285         }
286       }
287       return slen;
288     }
289     const uintptr_t* s1_alias = R_CAST(const uintptr_t*, s1);
290     const uintptr_t* s2_alias = R_CAST(const uintptr_t*, s2);
291     const uintptr_t word_ct = slen / kBytesPerWord;
292     for (uint32_t widx = 0; widx != word_ct; ++widx) {
293       const uintptr_t xor_result = s1_alias[widx] ^ s2_alias[widx];
294       if (xor_result) {
295         return widx * kBytesPerWord + ctzw(xor_result) / CHAR_BIT;
296       }
297     }
298     if (slen % kBytesPerWord) {
299       const uintptr_t final_offset = slen - kBytesPerWord;
300       const uintptr_t xor_result = (*R_CAST(const uintptr_t*, &(s1[final_offset]))) ^ (*R_CAST(const uintptr_t*, &(s2[final_offset])));
301       if (xor_result) {
302         return final_offset + ctzw(xor_result) / CHAR_BIT;
303       }
304     }
305     return slen;
306   }
307   const VecUc* s1_alias = R_CAST(const VecUc*, s1);
308   const VecUc* s2_alias = R_CAST(const VecUc*, s2);
309   const uintptr_t vec_ct = slen / kBytesPerVec;
310   for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
311     const VecUc v1 = vecuc_loadu(&(s1_alias[vidx]));
312     const VecUc v2 = vecuc_loadu(&(s2_alias[vidx]));
313     const uint32_t eq_result = vecw_movemask(v1 == v2);
314     if (eq_result != kVec8thUintMax) {
315       return vidx * kBytesPerVec + ctzu32(~eq_result);
316     }
317   }
318   if (slen % kBytesPerVec) {
319     const uintptr_t final_offset = slen - kBytesPerVec;
320     const VecW v1 = vecw_loadu(&(s1[final_offset]));
321     const VecW v2 = vecw_loadu(&(s2[final_offset]));
322     const uint32_t eq_result = vecw_movemask(v1 == v2);
323     if (eq_result != kVec8thUintMax) {
324       return final_offset + ctzu32(~eq_result);
325     }
326   }
327 #else  // !__LP64__
328   const uintptr_t* s1_alias = R_CAST(const uintptr_t*, s1);
329   const uintptr_t* s2_alias = R_CAST(const uintptr_t*, s2);
330   const uintptr_t word_ct = slen / kBytesPerWord;
331   for (uintptr_t widx = 0; widx != word_ct; ++widx) {
332     const uintptr_t xor_result = s1_alias[widx] ^ s2_alias[widx];
333     if (xor_result) {
334       return widx * kBytesPerWord + ctzw(xor_result) / CHAR_BIT;
335     }
336   }
337   if (slen % kBytesPerWord) {
338     const uintptr_t final_offset = slen - kBytesPerWord;
339     const uintptr_t xor_result = (*R_CAST(const uintptr_t*, &(s1[final_offset]))) ^ (*R_CAST(const uintptr_t*, &(s2[final_offset])));
340     if (xor_result) {
341       return final_offset + ctzw(xor_result) / CHAR_BIT;
342     }
343   }
344 #endif
345   return slen;
346 }
347 
348 // May read (kBytesPerWord - 1) bytes past the end of each string.
349 // This can be quite a bit faster than stock strcmp on x86 for sorting, though
350 // benchmarking is necessary in general (compiler may perform its own strcmp
351 // optimizations).
strcmp_overread(const char * s1,const char * s2)352 int32_t strcmp_overread(const char* s1, const char* s2) {
353   const uintptr_t* s1_alias = R_CAST(const uintptr_t*, s1);
354   const uintptr_t* s2_alias = R_CAST(const uintptr_t*, s2);
355   for (uintptr_t widx = 0; ; ++widx) {
356     uintptr_t w1 = s1_alias[widx];
357     const uintptr_t zcheck = DetectFirstZeroByte(w1);
358     uintptr_t w2 = s2_alias[widx];
359     if (zcheck) {
360       // Mask out bytes past the known null.
361       const uintptr_t mask = zcheck ^ (zcheck - 1);
362       w1 &= mask;
363       w2 &= mask;
364       if (w1 == w2) {
365         return 0;
366       }
367     } else if (w1 == w2) {
368       continue;
369     }
370     // bugfix (30 Jun 2018): forgot to adhere to strcmp instead of std::sort
371     // interface
372 #ifdef __LP64__
373     return (__builtin_bswap64(w1) < __builtin_bswap64(w2))? -1 : 1;
374 #else
375     return (__builtin_bswap32(w1) < __builtin_bswap32(w2))? -1 : 1;
376 #endif
377   }
378 }
379 
strcmp_casted(const void * s1,const void * s2)380 int32_t strcmp_casted(const void* s1, const void* s2) {
381   return strcmp(S_CAST(const char*, s1), S_CAST(const char*, s2));
382 }
383 
strcmp_overread_casted(const void * s1,const void * s2)384 int32_t strcmp_overread_casted(const void* s1, const void* s2) {
385   return strcmp_overread(S_CAST(const char*, s1), S_CAST(const char*, s2));
386 }
387 
388 // PLINK 2's natural sort uses the following logic:
389 // - All alphabetic characters act as if they are capitalized, except for
390 // tiebreaking purposes (where ASCII is used).
391 // - Numbers are compared by magnitude, with the exception of...
392 // - Numbers with leading zero(es).  If you're putting extraneous zeroes in
393 // front of IDs, we assume they're there to force particular items to be sorted
394 // earlier, rather than just appearing at random.  So, unlike many natural sort
395 // implementations, we sort 00200 < 021 < 20: all numbers with n leading zeroes
396 // are sorted before all numbers with (n-1) leading zeroes; magnitude only
397 // applies if the leading zero counts match.  This handles e.g. subbasement
398 // room numbering properly.
399 //
400 // This won't always do what you want if your IDs have variable-length decimals
401 // in them (e.g. it yields 0.99 < 0.101); if you don't want to fall back on
402 // ASCII sort, enforce a fixed number of digits after the decimal point.  Also
403 // note that ASCII sort is outright better for e.g. numbers represented in
404 // hexadecimal or base 36.  In principle, it's possible to reliably autodetect
405 // some of these cases (especially hexadecimal numbers beginning with "0x"),
406 // but that'll never be perfect so we just let the user toggle the sort method.
strcmp_natural_scan_forward(const char * s1,const char * s2)407 int32_t strcmp_natural_scan_forward(const char* s1, const char* s2) {
408   // assumes s1 and s2 currently point to the middle of a mismatching number,
409   // where s1 < s2.
410   char c1;
411   char c2;
412   do {
413     c1 = *(++s1);
414     c2 = *(++s2);
415     if (IsNotDigit(c1)) {
416       return -1;
417     }
418   } while (IsDigit(c2));
419   return 1;
420 }
421 
422 // We have the following major states:
423 //   0 (initial): strings perfectly match so far, last char (if any) is
424 //                nonnumeric.
425 //   1: strings perfectly match so far, last char is numeric.
426 //   2: strings match except for capitalization, last char is nonnumeric.
427 //   3: strings match except for capitalization, last char is numeric.
428 // strcmp_natural_tiebroken() expresses the logic for states 2 and 3, while
429 // strcmp_natural_uncasted() handles states 0 and 1.
strcmp_natural_tiebroken(const char * s1,const char * s2)430 int32_t strcmp_natural_tiebroken(const char* s1, const char* s2) {
431   // assumes ties should be broken in favor of s2.
432   unsigned char uc1 = *(++s1);
433   unsigned char uc2 = *(++s2);
434   while (IsNotNzdigit(uc1) && IsNotNzdigit(uc2)) {
435     // state 2
436   strcmp_natural_tiebroken_state_2:
437     if (uc1 != uc2) {
438       if ((uc1 >= 'a') && (uc1 <= 'z')) {
439         uc1 -= 32;
440       }
441       if ((uc2 >= 'a') && (uc2 <= 'z')) {
442         uc2 -= 32;
443       }
444       if (uc1 < uc2) {
445         return -1;
446       }
447       if (uc1 > uc2) {
448         return 1;
449       }
450     } else if (!uc1) {
451       return -1;
452     }
453     uc1 = *(++s1);
454     uc2 = *(++s2);
455   }
456   if (IsNotNzdigit(uc1) || IsNotNzdigit(uc2)) {
457     return (uc1 < uc2)? -1 : 1;
458   }
459   do {
460     // state 3
461     if (uc1 != uc2) {
462       if (IsDigit(uc2)) {
463         if (uc1 < uc2) {
464           return strcmp_natural_scan_forward(s1, s2);
465         }
466         return -strcmp_natural_scan_forward(s2, s1);
467       }
468       return 1;
469     }
470     uc1 = *(++s1);
471     uc2 = *(++s2);
472   } while (IsDigit(uc1));
473   if (IsDigit(uc2)) {
474     return -1;
475   }
476   // skip the while (is_not_digit...) check
477   goto strcmp_natural_tiebroken_state_2;
478 }
479 
strcmp_natural_uncasted(const char * s1,const char * s2)480 int32_t strcmp_natural_uncasted(const char* s1, const char* s2) {
481   unsigned char uc1 = *s1;
482   unsigned char uc2 = *s2;
483   while (IsNotNzdigit(uc1) && IsNotNzdigit(uc2)) {
484     // state 0
485   strcmp_natural_uncasted_state_0:
486     if (uc1 != uc2) {
487       if ((uc1 >= 'a') && (uc1 <= 'z')) {
488         if (uc2 + 32 == uc1) {
489           return -strcmp_natural_tiebroken(s2, s1);
490         }
491         if ((uc2 < 'a') || (uc2 > 'z')) {
492           uc1 -= 32;
493         }
494       } else if ((uc2 >= 'a') && (uc2 <= 'z')) {
495         uc2 -= 32;
496         if (uc1 == uc2) {
497           return strcmp_natural_tiebroken(s1, s2);
498         }
499       }
500       return (uc1 < uc2)? -1 : 1;
501     }
502     if (!uc1) {
503       return 0;
504     }
505     uc1 = *(++s1);
506     uc2 = *(++s2);
507   }
508   if (IsNotNzdigit(uc1) || IsNotNzdigit(uc2)) {
509     return (uc1 < uc2)? -1 : 1;
510   }
511   do {
512     // state 1
513     if (uc1 != uc2) {
514       if (IsDigit(uc2)) {
515         if (uc1 < uc2) {
516           return strcmp_natural_scan_forward(s1, s2);
517         }
518         return -strcmp_natural_scan_forward(s2, s1);
519       }
520       return 1;
521     }
522     uc1 = *(++s1);
523     uc2 = *(++s2);
524   } while (IsDigit(uc1));
525   if (IsDigit(uc2)) {
526     return -1;
527   }
528   goto strcmp_natural_uncasted_state_0;
529 }
530 
strcmp_natural(const void * s1,const void * s2)531 int32_t strcmp_natural(const void* s1, const void* s2) {
532   return strcmp_natural_uncasted(S_CAST(const char*, s1), S_CAST(const char*, s2));
533 }
534 
strcmp_deref(const void * s1,const void * s2)535 int32_t strcmp_deref(const void* s1, const void* s2) {
536   return strcmp(*S_CAST(const char* const*, s1), *S_CAST(const char* const*, s2));
537 }
538 
strcmp_overread_deref(const void * s1,const void * s2)539 int32_t strcmp_overread_deref(const void* s1, const void* s2) {
540   return strcmp_overread(*S_CAST(const char* const*, s1), *S_CAST(const char* const*, s2));
541 }
542 
strcmp_natural_deref(const void * s1,const void * s2)543 int32_t strcmp_natural_deref(const void* s1, const void* s2) {
544   return strcmp_natural_uncasted(*S_CAST(const char* const*, s1), *S_CAST(const char* const*, s2));
545 }
546 
547 #ifdef __cplusplus
548 static_assert(sizeof(Strbuf28Ui) == 32, "Strbuf28Ui is not laid out as expected.");
549 static_assert(offsetof(Strbuf28Ui, orig_idx) == 28, "Strbuf28Ui is not laid out as expected.");
550 static_assert(sizeof(Strbuf60Ui) == 64, "Strbuf60Ui is not laid out as expected.");
551 static_assert(offsetof(Strbuf60Ui, orig_idx) == 60, "Strbuf60Ui is not laid out as expected.");
GetStrboxsortWentryBlen(uintptr_t max_str_blen)552 uintptr_t GetStrboxsortWentryBlen(uintptr_t max_str_blen) {
553   if (max_str_blen <= 28) {
554     return sizeof(Strbuf28Ui);
555   }
556   if (max_str_blen <= 60) {
557     return sizeof(Strbuf60Ui);
558   }
559   return max_str_blen;
560 }
561 #else
GetStrboxsortWentryBlen(uintptr_t max_str_blen)562 uintptr_t GetStrboxsortWentryBlen(uintptr_t max_str_blen) {
563   return MAXV(max_str_blen, sizeof(StrSortIndexedDeref));
564 }
565 #endif
566 
567 // Assumed that sort_wkspace has size >= str_ct *
568 // max(sizeof(StrSortIndexedDeref), max_str_blen).
569 // Must be ok to overread.
SortStrboxIndexed2Fallback(uintptr_t str_ct,uintptr_t max_str_blen,uint32_t use_nsort,char * strbox,uint32_t * id_map,void * sort_wkspace)570 void SortStrboxIndexed2Fallback(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map, void* sort_wkspace) {
571   StrSortIndexedDerefOverread* wkspace_alias = S_CAST(StrSortIndexedDerefOverread*, sort_wkspace);
572   for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
573     wkspace_alias[str_idx].strptr = &(strbox[str_idx * max_str_blen]);
574     wkspace_alias[str_idx].orig_idx = id_map[str_idx];
575   }
576   if (!use_nsort) {
577     STD_SORT_PAR_UNSEQ(str_ct, strcmp_overread_deref, wkspace_alias);
578   } else {
579 #ifdef __cplusplus
580     StrNsortIndexedDeref* wkspace_alias2 = R_CAST(StrNsortIndexedDeref*, wkspace_alias);
581     STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias2);
582 #else
583     STD_SORT_PAR_UNSEQ(str_ct, strcmp_natural_deref, wkspace_alias);
584 #endif
585   }
586   for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
587     id_map[str_idx] = wkspace_alias[str_idx].orig_idx;
588   }
589 #ifndef __cplusplus
590   if (max_str_blen < sizeof(StrSortIndexedDeref)) {
591     // actually better to use non-deref sort here, but just get this working
592     // properly for now
593     for (uint32_t new_idx = 0; new_idx != str_ct; ++new_idx) {
594       const char* strptr = wkspace_alias[new_idx].strptr;
595       // todo: check whether memcpy(., ., max_str_blen) tends to be better
596       strcpy(&(R_CAST(char*, wkspace_alias)[new_idx * max_str_blen]), strptr);
597     }
598   } else {
599 #endif
600     // bugfix: need to handle id_map[str_idx] != str_idx
601     uint32_t new_idx = str_ct;
602     do {
603       --new_idx;
604       const char* strptr = wkspace_alias[new_idx].strptr;
605       strcpy(&(R_CAST(char*, wkspace_alias)[new_idx * max_str_blen]), strptr);
606     } while (new_idx);
607 #ifndef __cplusplus
608   }
609 #endif
610   memcpy(strbox, wkspace_alias, str_ct * max_str_blen);
611 }
612 
613 #ifdef __cplusplus
614 typedef struct WordCmp32bStruct {
615   uintptr_t words[32 / kBytesPerWord];
operator <plink2::WordCmp32bStruct616   bool operator<(const struct WordCmp32bStruct& rhs) const {
617     uint32_t idx = 0;
618     do {
619       const uintptr_t cur_word = words[idx];
620       const uintptr_t rhs_word = rhs.words[idx];
621       if (cur_word != rhs_word) {
622 #  ifdef __LP64__
623         return __builtin_bswap64(cur_word) < __builtin_bswap64(rhs_word);
624 #  else
625         return __builtin_bswap32(cur_word) < __builtin_bswap32(rhs_word);
626 #  endif
627       }
628     } while (++idx < (32 / kBytesPerWord));
629     return false;
630   }
631 } WordCmp32b;
632 
633 typedef struct WordCmp64bStruct {
634   uintptr_t words[64 / kBytesPerWord];
operator <plink2::WordCmp64bStruct635   bool operator<(const struct WordCmp64bStruct& rhs) const {
636     uint32_t idx = 0;
637     do {
638       const uintptr_t cur_word = words[idx];
639       const uintptr_t rhs_word = rhs.words[idx];
640       if (cur_word != rhs_word) {
641 #  ifdef __LP64__
642         return __builtin_bswap64(cur_word) < __builtin_bswap64(rhs_word);
643 #  else
644         return __builtin_bswap32(cur_word) < __builtin_bswap32(rhs_word);
645 #  endif
646       }
647     } while (++idx < (64 / kBytesPerWord));
648     return false;
649   }
650 } WordCmp64b;
651 
652 static_assert(sizeof(WordCmp32b) == 32, "WordCmp32b does not have the expected size.");
653 static_assert(sizeof(WordCmp64b) == 64, "WordCmp64b does not have the expected size.");
654 
SortStrbox32bFinish(uintptr_t str_ct,uintptr_t max_str_blen,uint32_t use_nsort,Strbuf28Ui * filled_wkspace,char * sorted_strbox,uint32_t * id_map)655 void SortStrbox32bFinish(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, Strbuf28Ui* filled_wkspace, char* sorted_strbox, uint32_t* id_map) {
656   if (!use_nsort) {
657     WordCmp32b* wkspace_alias = R_CAST(WordCmp32b*, filled_wkspace);
658     STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias);
659   } else {
660     STD_SORT_PAR_UNSEQ(str_ct, nullptr, filled_wkspace);
661   }
662   for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
663     strcpy(&(sorted_strbox[str_idx * max_str_blen]), filled_wkspace[str_idx].strbuf);
664     id_map[str_idx] = filled_wkspace[str_idx].orig_idx;
665   }
666 }
667 
SortStrbox64bFinish(uintptr_t str_ct,uintptr_t max_str_blen,uint32_t use_nsort,Strbuf60Ui * filled_wkspace,char * sorted_strbox,uint32_t * id_map)668 void SortStrbox64bFinish(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, Strbuf60Ui* filled_wkspace, char* sorted_strbox, uint32_t* id_map) {
669   if (!use_nsort) {
670     WordCmp64b* wkspace_alias = R_CAST(WordCmp64b*, filled_wkspace);
671     STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias);
672   } else {
673     STD_SORT_PAR_UNSEQ(str_ct, nullptr, filled_wkspace);
674   }
675   for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
676     strcpy(&(sorted_strbox[str_idx * max_str_blen]), filled_wkspace[str_idx].strbuf);
677     id_map[str_idx] = filled_wkspace[str_idx].orig_idx;
678   }
679 }
680 
681 // Normally use SortStrboxIndexed(), but this version is necessary before
682 // g_bigstack has been allocated.
SortStrboxIndexed2(uintptr_t str_ct,uintptr_t max_str_blen,uint32_t use_nsort,char * strbox,uint32_t * id_map,void * sort_wkspace)683 void SortStrboxIndexed2(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map, void* sort_wkspace) {
684   if (max_str_blen <= 28) {
685     Strbuf28Ui* wkspace_alias = S_CAST(Strbuf28Ui*, sort_wkspace);
686     for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
687       const char* cur_str = &(strbox[str_idx * max_str_blen]);
688       strcpy(wkspace_alias[str_idx].strbuf, cur_str);
689       wkspace_alias[str_idx].orig_idx = id_map[str_idx];
690     }
691     SortStrbox32bFinish(str_ct, max_str_blen, use_nsort, wkspace_alias, strbox, id_map);
692     return;
693   }
694   if (max_str_blen <= 60) {
695     Strbuf60Ui* wkspace_alias = S_CAST(Strbuf60Ui*, sort_wkspace);
696     for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
697       const char* cur_str = &(strbox[str_idx * max_str_blen]);
698       strcpy(wkspace_alias[str_idx].strbuf, cur_str);
699       wkspace_alias[str_idx].orig_idx = id_map[str_idx];
700     }
701     SortStrbox64bFinish(str_ct, max_str_blen, use_nsort, wkspace_alias, strbox, id_map);
702     return;
703   }
704   SortStrboxIndexed2Fallback(str_ct, max_str_blen, use_nsort, strbox, id_map, sort_wkspace);
705 }
706 #endif  // __cplusplus
707 
708 // Must be ok to overread.
SortStrboxIndexedMalloc(uintptr_t str_ct,uintptr_t max_str_blen,char * strbox,uint32_t * id_map)709 BoolErr SortStrboxIndexedMalloc(uintptr_t str_ct, uintptr_t max_str_blen, char* strbox, uint32_t* id_map) {
710   if (str_ct < 2) {
711     return 0;
712   }
713   const uintptr_t wkspace_entry_blen = GetStrboxsortWentryBlen(max_str_blen);
714   unsigned char* sort_wkspace;
715   if (unlikely(pgl_malloc(str_ct * wkspace_entry_blen, &sort_wkspace))) {
716     return 1;
717   }
718   SortStrboxIndexed2(str_ct, max_str_blen, 0, strbox, id_map, sort_wkspace);
719   free(sort_wkspace);
720   return 0;
721 }
722 
CopyAndDedupSortedStrptrsToStrbox(const char * const * sorted_strptrs,uintptr_t str_ct,uintptr_t max_str_blen,char * strbox)723 uint32_t CopyAndDedupSortedStrptrsToStrbox(const char* const* sorted_strptrs, uintptr_t str_ct, uintptr_t max_str_blen, char* strbox) {
724   if (!str_ct) {
725     return 0;
726   }
727   const char* const* sorted_strptrs_iter = sorted_strptrs;
728   const char* const* sorted_strptrs_end = &(sorted_strptrs[str_ct]);
729   uintptr_t write_idx = 0;
730   uint32_t prev_slen = UINT32_MAX;
731   const char* prev_str = nullptr;
732   do {
733     const char* cur_str = *sorted_strptrs_iter++;
734     const uint32_t cur_slen = strlen(cur_str);
735     if ((cur_slen != prev_slen) || (!memequal(cur_str, prev_str, prev_slen))) {
736       memcpy(&(strbox[write_idx * max_str_blen]), cur_str, cur_slen + 1);
737       ++write_idx;
738       prev_str = cur_str;
739     }
740   } while (sorted_strptrs_iter != sorted_strptrs_end);
741   return write_idx;
742 }
743 
744 
StrptrArrSortMain(uintptr_t str_ct,uint32_t overread_ok,uint32_t use_nsort,StrSortIndexedDeref * wkspace_alias)745 void StrptrArrSortMain(uintptr_t str_ct, uint32_t overread_ok, uint32_t use_nsort, StrSortIndexedDeref* wkspace_alias) {
746   if (!use_nsort) {
747     if (overread_ok) {
748 #ifdef __cplusplus
749       StrSortIndexedDerefOverread* wkspace_alias2 = R_CAST(StrSortIndexedDerefOverread*, wkspace_alias);
750       STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias2);
751 #else
752       STD_SORT_PAR_UNSEQ(str_ct, strcmp_overread_deref, wkspace_alias);
753 #endif
754     } else {
755       STD_SORT_PAR_UNSEQ(str_ct, strcmp_deref, wkspace_alias);
756     }
757   } else {
758 #ifdef __cplusplus
759     StrNsortIndexedDeref* wkspace_alias2 = R_CAST(StrNsortIndexedDeref*, wkspace_alias);
760     STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias2);
761 #else
762     STD_SORT_PAR_UNSEQ(str_ct, strcmp_natural_deref, wkspace_alias);
763 #endif
764   }
765 }
766 
767 /*
768 uint32_t match_upper(const char* str_iter, const char* fixed_str) {
769   char cc = *fixed_str++;
770   do {
771     if ((((unsigned char)(*str_iter++)) & 0xdf) != ((unsigned char)cc)) {
772       return 0;
773     }
774     cc = *fixed_str++;
775   } while (cc);
776   return !(*str_iter);
777 }
778 */
779 
780 /*
781 #ifdef __LP64__
782 CXXCONST_CP FirstPrecharFar(const char* str_iter, uint32_t char_code) {
783   const uintptr_t starting_addr = R_CAST(uintptr_t, str_iter);
784   const VecUc* str_viter = R_CAST(const VecUc*, RoundDownPow2(starting_addr, kBytesPerVec));
785   const VecUc vvec_add = vecuc_set1(128 - char_code);
786   VecUc cur_vvec = *str_viter;
787   VecUc non_prechar_vvec = vecuc_adds(cur_vvec, vvec_add);
788   uint32_t matching_bytes = ~vecuc_movemask(non_prechar_vvec);
789   const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, str_viter);
790   matching_bytes &= UINT32_MAX << leading_byte_ct;
791   // This typically isn't *that* long-range, so the Memrchr()
792   // double-vector strategy is unlikely to be profitable?
793   while (!matching_bytes) {
794     ++str_viter;
795     cur_vvec = *str_viter;
796     non_prechar_vvec = vecuc_adds(cur_vvec, vvec_add);
797     matching_bytes = ~vecuc_movemask(non_prechar_vvec);
798   }
799   const uint32_t byte_offset_in_vec = ctzu32(matching_bytes);
800   return &(R_CAST(CXXCONST_CP, str_viter)[byte_offset_in_vec]);
801 }
802 #endif
803 */
804 
MatchUpperCounted(const char * str,const char * fixed_str,uint32_t ct)805 uint32_t MatchUpperCounted(const char* str, const char* fixed_str, uint32_t ct) {
806   for (uint32_t uii = 0; uii != ct; ++uii) {
807     if ((ctou32(str[uii]) & 0xdf) != ctou32(fixed_str[uii])) {
808       return 0;
809     }
810   }
811   return 1;
812 }
813 
814 // might want to make this std::array in the future?
815 static const char kToUpper[256] = {
816   '\0', '\1', '\2', '\3', '\4', '\5', '\6', '\7',
817   '\10', '\11', '\12', '\13', '\14', '\15', '\16', '\17',
818   '\20', '\21', '\22', '\23', '\24', '\25', '\26', '\27',
819   '\30', '\31', '\32', '\33', '\34', '\35', '\36', '\37',
820   '\40', '\41', '\42', '\43', '\44', '\45', '\46', '\47',
821   '\50', '\51', '\52', '\53', '\54', '\55', '\56', '\57',
822   '\60', '\61', '\62', '\63', '\64', '\65', '\66', '\67',
823   '\70', '\71', '\72', '\73', '\74', '\75', '\76', '\77',
824   '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
825   '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
826   '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
827   '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
828   '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
829   '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
830   '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
831   '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
832   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
833   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
834   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
835   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
836   '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
837   '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
838   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
839   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
840   '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
841   '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
842   '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
843   '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
844   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
845   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
846   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
847   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
848 };
849 
strcaseequal(const char * str1,const char * str2,uint32_t ct)850 uint32_t strcaseequal(const char* str1, const char* str2, uint32_t ct) {
851   for (uint32_t uii = 0; uii != ct; ++uii) {
852     if (kToUpper[ctou32(str1[uii])] != kToUpper[ctou32(str2[uii])]) {
853       return 0;
854     }
855   }
856   return 1;
857 }
858 
859 /*
860 void str_toupper(char* str_iter) {
861   while (1) {
862     const uint32_t uii = (unsigned char)(*str_iter);
863     if (!uii) {
864       return;
865     }
866     if (((uint32_t)(uii - 97)) < 26) {
867       // 'a' has ASCII code 97
868       *str_iter = uii - 32;
869     }
870     ++str_iter;
871   }
872 }
873 
874 void buf_toupper(uint32_t slen, char* strbuf) {
875   for (uint32_t pos = 0; pos != slen; ++pos) {
876     const uint32_t uii = (unsigned char)(strbuf[pos]);
877     if (((uint32_t)(uii - 97)) < 26) {
878       strbuf[pos] = uii - 32;
879     }
880   }
881 }
882 
883 void strcpy_toupper(char* target, const char* source) {
884   while (1) {
885     uint32_t uii = (unsigned char)(*source++);
886     if (!uii) {
887       return;
888     }
889     if (((uint32_t)(uii - 97)) < 26) {
890       uii -= 32;
891     }
892     *target++ = uii;
893   }
894 }
895 
896 char* memcpya_toupper(char* __restrict target, const char* __restrict source, uint32_t slen) {
897   for (uint32_t pos = 0; pos != slen; ++pos) {
898     uint32_t uii = ctou32(source[pos]);
899     if ((uii - 97) < 26) {
900       uii -= 32;
901     }
902     target[pos] = uii;
903   }
904   return &(target[slen]);
905 }
906 */
907 
908 
IsAlphanumeric(const char * str_iter)909 uint32_t IsAlphanumeric(const char* str_iter) {
910   for (; ; ++str_iter) {
911     uint32_t uii = ctou32(*str_iter);
912     if (!uii) {
913       return 1;
914     }
915     if (((uii - 48) > 9) && (((uii & 0xffffffdfU) - 65) > 25)) {
916       return 0;
917     }
918   }
919 }
920 
ScanPosintptr(const char * str_iter,uintptr_t * valp)921 BoolErr ScanPosintptr(const char* str_iter, uintptr_t* valp) {
922   // Reads an integer in [1, 2^kBitsPerWord - 1].  Assumes first character is
923   // nonspace.
924   assert(ctow(str_iter[0]) > 32);
925   uintptr_t val = ctow(*str_iter++) - 48;
926   if (val >= 10) {
927 #ifdef __LP64__
928     if (unlikely(val != 0xfffffffffffffffbLLU)) {
929       return 1;
930     }
931 #else
932     if (unlikely(val != 0xfffffffbU)) {
933       return 1;
934     }
935 #endif
936     val = ctow(*str_iter++) - 48;
937     if (unlikely(val >= 10)) {
938       return 1;
939     }
940   }
941   while (!val) {
942     val = ctow(*str_iter++) - 48;
943     if (unlikely(val >= 10)) {
944       return 1;
945     }
946   }
947 #ifdef __LP64__
948   // limit is 20 digits, we've already read one
949   const char* str_limit = &(str_iter[20]);
950 #else
951   const char* str_limit = &(str_iter[10]);
952 #endif
953   while (1) {
954     const uintptr_t cur_digit = ctow(*str_iter++) - 48;
955     if (cur_digit >= 10) {
956       *valp = val;
957       return 0;
958     }
959     const uintptr_t cur_digit2 = ctow(*str_iter++) - 48;
960     if (str_iter == str_limit) {
961       if (unlikely((cur_digit2 < 10) || ((val >= (~k0LU) / 10) && ((val > (~k0LU) / 10) || (cur_digit > (~k0LU) % 10))))) {
962         return 1;
963       }
964       *valp = val * 10 + cur_digit;
965       return 0;
966     }
967     if (cur_digit2 >= 10) {
968       *valp = val * 10 + cur_digit;
969       return 0;
970     }
971     val = val * 100 + cur_digit * 10 + cur_digit2;
972   }
973 }
974 
975 #ifdef __LP64__
ScanmovUintCappedFinish(uint64_t cap,const char ** str_iterp,uint32_t * valp)976 static inline BoolErr ScanmovUintCappedFinish(uint64_t cap, const char** str_iterp, uint32_t* valp) {
977   const char* str_iter = *str_iterp;
978   uint64_t val = *valp;
979   while (1) {
980     // a little bit of unrolling seems to help
981     const uint64_t cur_digit = ctou64(*str_iter++) - 48;
982     if (cur_digit >= 10) {
983       break;
984     }
985     // val = val * 10 + cur_digit;
986     const uint64_t cur_digit2 = ctou64(*str_iter++) - 48;
987     if (cur_digit2 >= 10) {
988       val = val * 10 + cur_digit;
989       if (unlikely(val > cap)) {
990         return 1;
991       }
992       break;
993     }
994     val = val * 100 + cur_digit * 10 + cur_digit2;
995     if (unlikely(val > cap)) {
996       return 1;
997     }
998   }
999   *valp = val;
1000   *str_iterp = &(str_iter[-1]);
1001   return 0;
1002 }
1003 
ScanmovPosintCapped(uint64_t cap,const char ** str_iterp,uint32_t * valp)1004 BoolErr ScanmovPosintCapped(uint64_t cap, const char** str_iterp, uint32_t* valp) {
1005   const char* str_iter = *str_iterp;
1006   *valp = ctou32(*str_iter++) - 48;
1007   if (*valp >= 10) {
1008     if (unlikely(*valp != 0xfffffffbU)) {
1009       return 1;
1010     }
1011     *valp = ctou32(*str_iter++) - 48;
1012     if (unlikely(*valp >= 10)) {
1013       return 1;
1014     }
1015   }
1016   while (!(*valp)) {
1017     *valp = ctou32(*str_iter++) - 48;
1018     if (unlikely((*valp) >= 10)) {
1019       return 1;
1020     }
1021   }
1022   *str_iterp = str_iter;
1023   return ScanmovUintCappedFinish(cap, str_iterp, valp);
1024 }
1025 
ScanmovUintCapped(uint64_t cap,const char ** str_iterp,uint32_t * valp)1026 BoolErr ScanmovUintCapped(uint64_t cap, const char** str_iterp, uint32_t* valp) {
1027   const char* str_iter = *str_iterp;
1028   *valp = ctou32(*str_iter++) - 48;
1029   if (*valp >= 10) {
1030     if (*valp != 0xfffffffbU) {
1031       // '-' has ascii code 45, so unsigned 45 - 48 = 0xfffffffdU
1032       if (unlikely((*valp != 0xfffffffdU) || (*str_iter != '0'))) {
1033         return 1;
1034       }
1035       // accept "-0", "-00", etc.
1036       while (*(++str_iter) == '0');
1037       *valp = 0;
1038       *str_iterp = str_iter;
1039       return (ctou32(*str_iter) - 48) < 10;
1040     }
1041     // accept leading '+'
1042     *valp = ctou32(*str_iter++) - 48;
1043     if (unlikely(*valp >= 10)) {
1044       return 1;
1045     }
1046   }
1047   *str_iterp = str_iter;
1048   return ScanmovUintCappedFinish(cap, str_iterp, valp);
1049 }
1050 
1051 // 2^{-31} < floor <= 0 <= cap < 2^31
ScanmovIntBounded(uint64_t abs_floor,uint64_t cap,const char ** str_iterp,int32_t * valp)1052 BoolErr ScanmovIntBounded(uint64_t abs_floor, uint64_t cap, const char** str_iterp, int32_t* valp) {
1053   const char* str_iter = *str_iterp;
1054   *valp = ctou32(*str_iter++) - 48;
1055   int32_t sign = 1;
1056   if (ctou32(*valp) >= 10) {
1057     if (*valp == -3) {
1058       sign = -1;
1059       cap = abs_floor;
1060     } else if (unlikely(*valp != -5)) {  // accept leading '+'
1061       return 1;
1062     }
1063     *valp = ctou32(*str_iter++) - 48;
1064     if (unlikely(*valp >= 10)) {
1065       return 1;
1066     }
1067   }
1068   *str_iterp = str_iter;
1069   if (ScanmovUintCappedFinish(cap, str_iterp, R_CAST(uint32_t*, valp))) {
1070     return 1;
1071   }
1072   *valp *= sign;
1073   return 0;
1074 }
1075 #else
ScanmovPosintCapped32(uint32_t cap_div_10,uint32_t cap_mod_10,const char ** str_iterp,uint32_t * valp)1076 BoolErr ScanmovPosintCapped32(uint32_t cap_div_10, uint32_t cap_mod_10, const char** str_iterp, uint32_t* valp) {
1077   const char* str_iter = *str_iterp;
1078   uint32_t val = ctou32(*str_iter++) - 48;
1079   if (val >= 10) {
1080     if (unlikely(val != 0xfffffffbU)) {
1081       return 1;
1082     }
1083     val = ctou32(*str_iter++) - 48;
1084     if (unlikely(val >= 10)) {
1085       return 1;
1086     }
1087   }
1088   while (!val) {
1089     val = ctou32(*str_iter++);
1090     if (unlikely(val >= 10)) {
1091       return 1;
1092     }
1093   }
1094   for (; ; ++str_iter) {
1095     const uint32_t cur_digit = ctou32(*str_iter) - 48;
1096     if (cur_digit >= 10) {
1097       *valp = val;
1098       *str_iterp = str_iter;
1099       return 0;
1100     }
1101     if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
1102       return 1;
1103     }
1104     val = val * 10 + cur_digit;
1105   }
1106 }
1107 
ScanmovUintCapped32(uint32_t cap_div_10,uint32_t cap_mod_10,const char ** str_iterp,uint32_t * valp)1108 BoolErr ScanmovUintCapped32(uint32_t cap_div_10, uint32_t cap_mod_10, const char** str_iterp, uint32_t* valp) {
1109   const char* str_iter = *str_iterp;
1110   uint32_t val = ctou32(*str_iter++) - 48;
1111   if (val >= 10) {
1112     if (val != 0xfffffffbU) {
1113       if (unlikely((val != 0xfffffffd) || (*str_iter != '0'))) {
1114         return 1;
1115       }
1116       while (*(++str_iter) == '0');
1117       *valp = 0;
1118       *str_iterp = str_iter;
1119       return (ctou32(*str_iter) - 48) < 10;
1120     }
1121     val = ctou32(*str_iter++) - 48;
1122     if (unlikely(val >= 10)) {
1123       return 1;
1124     }
1125   }
1126   for (; ; ++str_iter) {
1127     const uint32_t cur_digit = ctou32(*str_iter) - 48;
1128     if (cur_digit >= 10) {
1129       *valp = val;
1130       *str_iterp = str_iter;
1131       return 0;
1132     }
1133     if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
1134       return 1;
1135     }
1136     val = val * 10 + cur_digit;
1137   }
1138 }
1139 
ScanmovIntBounded32(uint32_t abs_floor_div_10,uint32_t abs_floor_mod_10,uint32_t cap_div_10,uint32_t cap_mod_10,const char ** str_iterp,int32_t * valp)1140 BoolErr ScanmovIntBounded32(uint32_t abs_floor_div_10, uint32_t abs_floor_mod_10, uint32_t cap_div_10, uint32_t cap_mod_10, const char** str_iterp, int32_t* valp) {
1141   const char* str_iter = *str_iterp;
1142   uint32_t val = ctou32(*str_iter++) - 48;
1143   int32_t sign = 1;
1144   if (val >= 10) {
1145     if (val == 0xfffffffdU) {
1146       sign = -1;
1147       cap_div_10 = abs_floor_div_10;
1148       cap_mod_10 = abs_floor_mod_10;
1149     } else if (unlikely(val != 0xfffffffbU)) {
1150       return 1;
1151     }
1152     val = ctou32(*str_iter++) - 48;
1153     if (unlikely(val >= 10)) {
1154       return 1;
1155     }
1156   }
1157   for (; ; ++str_iter) {
1158     const uint32_t cur_digit = ctou32(*str_iter) - 48;
1159     if (cur_digit >= 10) {
1160       *valp = sign * S_CAST(int32_t, val);
1161       *str_iterp = str_iter;
1162       return 0;
1163     }
1164     if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
1165       return 1;
1166     }
1167     val = val * 10 + cur_digit;
1168   }
1169 }
1170 #endif
1171 
1172 static const double kPositivePow10[] = {1, 1.0e1, 1.0e2, 1.0e3, 1.0e4, 1.0e5, 1.0e6, 1.0e7, 1.0e8, 1.0e9, 1.0e10, 1.0e11, 1.0e12, 1.0e13, 1.0e14, 1.0e15};
1173 static const double kPositivePowTen16[] = {1, 1.0e16, 1.0e32, 1.0e48, 1.0e64, 1.0e80, 1.0e96, 1.0e112, 1.0e128, 1.0e144, 1.0e160, 1.0e176, 1.0e192, 1.0e208, 1.0e224, 1.0e240};
1174 static const double kNegativePow10[] = {1, 1.0e-1, 1.0e-2, 1.0e-3, 1.0e-4, 1.0e-5, 1.0e-6, 1.0e-7, 1.0e-8, 1.0e-9, 1.0e-10, 1.0e-11, 1.0e-12, 1.0e-13, 1.0e-14, 1.0e-15};
1175 static const double kNegativePowTen16[] = {1, 1.0e-16, 1.0e-32, 1.0e-48, 1.0e-64, 1.0e-80, 1.0e-96, 1.0e-112};
1176 
ScanadvDouble(const char * str_iter,double * valp)1177 CXXCONST_CP ScanadvDouble(const char* str_iter, double* valp) {
1178   // requires first character to be nonspace (to succeed; it fails without
1179   //   segfaulting on space/eoln/null)
1180   // don't care about hexadecimal
1181   // ok to lose last ~2 bits of precision
1182   // ok if behavior undefined on >1GB strings in 32-bit case, >2GB for 64-bit
1183   // fail on nan/infinity/overflow instead of usual strtod behavior
1184   uint32_t cur_char_code = ctou32(*str_iter);
1185   const uint32_t is_negative = (cur_char_code == 45);
1186   if (is_negative || (cur_char_code == 43)) {
1187     cur_char_code = ctou32(*(++str_iter));
1188   }
1189   uint32_t cur_digit = cur_char_code - 48;
1190   intptr_t e10 = 0;
1191   const char* dot_ptr;
1192   int64_t digits;
1193 #ifdef __LP64__
1194   if (cur_digit < 10) {
1195     // ok, we have at least one digit
1196     digits = cur_digit;
1197     // to check: best to skip leading zeroes and compare against 17 instead of
1198     // 10^16?
1199     do {
1200       cur_digit = ctou32(*(++str_iter)) - 48;
1201       if (cur_digit >= 10) {
1202         if (cur_digit == 0xfffffffeU) {
1203           dot_ptr = str_iter;
1204           goto ScanadvDouble_parse_decimal;
1205         }
1206         goto ScanadvDouble_parse_exponent;
1207       }
1208       digits = digits * 10 + cur_digit;
1209     } while (digits < 10000000000000000LL);
1210     // we have 17 significant digits; count the rest, but don't worry about
1211     // contents
1212     // (could keep ~19 instead, but if we're systematically losing the last two
1213     // bits of precision anyway...)
1214     const char* last_sig_fig_ptr = str_iter;
1215     do {
1216       cur_digit = ctou32(*(++str_iter)) - 48;
1217     } while (cur_digit < 10);
1218     e10 = S_CAST(intptr_t, str_iter - last_sig_fig_ptr) - 1;
1219     if (cur_digit == 0xfffffffeU) {
1220       do {
1221         cur_digit = ctou32(*(++str_iter)) - 48;
1222       } while (cur_digit < 10);
1223     }
1224     goto ScanadvDouble_parse_exponent;
1225   }
1226   if (cur_digit != 0xfffffffeU) {
1227     return nullptr;
1228   }
1229   // first (nonsign) character is dot, verify we have a digit after it
1230   dot_ptr = str_iter;
1231   cur_digit = ctou32(*(++str_iter)) - 48;
1232   if (cur_digit >= 10) {
1233     return nullptr;
1234   }
1235   digits = cur_digit;
1236  ScanadvDouble_parse_decimal:
1237   while (1) {
1238     cur_digit = ctou32(*(++str_iter)) - 48;
1239     if (cur_digit >= 10) {
1240       e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1241       break;
1242     }
1243     digits = digits * 10 + cur_digit;
1244     if (digits >= 10000000000000000LL) {
1245       e10 = -S_CAST(intptr_t, str_iter - dot_ptr);
1246       do {
1247         cur_digit = ctou32(*(++str_iter)) - 48;
1248       } while (cur_digit < 10);
1249       break;
1250     }
1251   }
1252  ScanadvDouble_parse_exponent:
1253   if ((cur_digit & 0xdf) == 21) { // 'E' - '0' is 21
1254     cur_char_code = ctou32(*(++str_iter));
1255     const uint32_t exp_is_negative = (cur_char_code == 45);
1256     if (exp_is_negative || (cur_char_code == 43)) {
1257       cur_char_code = ctou32(*(++str_iter));
1258     }
1259     cur_digit = cur_char_code - 48;
1260     int32_t cur_exp = 0;
1261     while (cur_digit < 10) {
1262       if (cur_exp >= 214748364) {
1263         // may as well guard against exponent overflow
1264         if (!exp_is_negative) {
1265           return nullptr;
1266         }
1267         *valp = 0;
1268         do {
1269           cur_digit = ctou32(*(++str_iter)) - 48;
1270         } while (cur_digit < 10);
1271         return S_CAST(CXXCONST_CP, str_iter);
1272       }
1273       cur_exp = cur_exp * 10 + cur_digit;
1274       cur_digit = ctou32(*(++str_iter)) - 48;
1275     }
1276     if (exp_is_negative) {
1277       cur_exp = -cur_exp;
1278     }
1279     e10 += cur_exp;
1280   }
1281 #else  // not __LP64__
1282   int32_t digits_short;
1283   if (cur_digit < 10) {
1284     // ok, we have at least one digit
1285     digits_short = cur_digit;
1286     // to check: best to skip leading zeroes and compare against 17 instead of
1287     // 10^16?
1288     do {
1289       cur_digit = ctou32(*(++str_iter)) - 48;
1290       if (cur_digit >= 10) {
1291         if (cur_digit == 0xfffffffeU) {
1292           dot_ptr = str_iter;
1293           goto ScanadvDouble_parse_decimal;
1294         }
1295         digits = digits_short;
1296         goto ScanadvDouble_parse_exponent;
1297       }
1298       digits_short = digits_short * 10 + cur_digit;
1299     } while (digits_short < 100000000);
1300     digits = digits_short;
1301     do {
1302       cur_digit = ctou32(*(++str_iter)) - 48;
1303       if (cur_digit >= 10) {
1304         if (cur_digit == 0xfffffffeU) {
1305           dot_ptr = str_iter;
1306           goto ScanadvDouble_parse_decimal_long;
1307         }
1308         goto ScanadvDouble_parse_exponent;
1309       }
1310       digits = digits * 10 + cur_digit;
1311     } while (digits < 10000000000000000LL);
1312     // we have 17 significant digits; count the rest, but don't worry about
1313     // contents
1314     const char* last_sig_fig_ptr = str_iter;
1315     do {
1316       cur_digit = ctou32(*(++str_iter)) - 48;
1317     } while (cur_digit < 10);
1318     e10 = S_CAST(intptr_t, str_iter - last_sig_fig_ptr) - 1;
1319     if (cur_digit == 0xfffffffeU) {
1320       do {
1321         cur_digit = ctou32(*(++str_iter)) - 48;
1322       } while (cur_digit < 10);
1323     }
1324     goto ScanadvDouble_parse_exponent;
1325   }
1326   if (cur_digit != 0xfffffffeU) {
1327     return nullptr;
1328   }
1329   // first (nonsign) character is dot, verify we have a digit after it
1330   dot_ptr = str_iter;
1331   cur_digit = ctou32(*(++str_iter)) - 48;
1332   if (cur_digit >= 10) {
1333     return nullptr;
1334   }
1335   digits_short = cur_digit;
1336  ScanadvDouble_parse_decimal:
1337   while (1) {
1338     cur_digit = ctou32(*(++str_iter)) - 48;
1339     if (cur_digit >= 10) {
1340       e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1341       digits = digits_short;
1342       break;
1343     }
1344     digits_short = digits_short * 10 + cur_digit;
1345     if (digits_short >= 100000000) {
1346       digits = digits_short;
1347     ScanadvDouble_parse_decimal_long:
1348       while (1) {
1349         cur_digit = ctou32(*(++str_iter)) - 48;
1350         if (cur_digit >= 10) {
1351           e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1352           goto ScanadvDouble_parse_exponent;
1353         }
1354         digits = digits * 10 + cur_digit;
1355         if (digits >= 10000000000000000LL) {
1356           e10 = -S_CAST(intptr_t, str_iter - dot_ptr);
1357           do {
1358             cur_digit = ctou32(*(++str_iter)) - 48;
1359           } while (cur_digit < 10);
1360           goto ScanadvDouble_parse_exponent;
1361         }
1362       }
1363     }
1364   }
1365  ScanadvDouble_parse_exponent:
1366   if ((cur_digit & 0xdf) == 21) { // 'E' - '0' is 21
1367     cur_char_code = ctou32(*(++str_iter));
1368     const uint32_t exp_is_negative = (cur_char_code == 45);
1369     if (exp_is_negative || (cur_char_code == 43)) {
1370       cur_char_code = ctou32(*(++str_iter));
1371     }
1372     cur_digit = cur_char_code - 48;
1373     int32_t cur_exp = 0;
1374     while (cur_digit < 10) {
1375       if (cur_exp >= 107374182) {
1376         // may as well guard against exponent overflow
1377         // 2^30 instead of 2^31 limit since this gets added to e10
1378         if (!exp_is_negative) {
1379           return nullptr;
1380         }
1381         *valp = 0;
1382         do {
1383           cur_digit = ctou32(*(++str_iter)) - 48;
1384         } while (cur_digit < 10);
1385         return S_CAST(CXXCONST_CP, str_iter);
1386       }
1387       cur_exp = cur_exp * 10 + cur_digit;
1388       cur_digit = ctou32(*(++str_iter)) - 48;
1389     }
1390     if (exp_is_negative) {
1391       cur_exp = -cur_exp;
1392     }
1393     e10 += cur_exp;
1394   }
1395 #endif
1396   if (digits == 0) {
1397     *valp = 0;
1398     return S_CAST(CXXCONST_CP, str_iter);
1399   }
1400   if (is_negative) {
1401     digits = -digits;
1402   }
1403   double dxx = S_CAST(double, digits);
1404   if (e10) {
1405     if (e10 < 0) {
1406       uint32_t pos_exp = -e10;
1407       dxx *= kNegativePow10[pos_exp & 15];
1408       pos_exp /= 16;
1409       if (pos_exp) {
1410         dxx *= kNegativePowTen16[pos_exp & 7];
1411         if (pos_exp > 7) {
1412           if (pos_exp > 23) {
1413             dxx = 0;
1414           } else if (pos_exp > 15) {
1415             dxx *= 1.0e-256;
1416           } else {
1417             dxx *= 1.0e-128;
1418           }
1419         }
1420       }
1421     } else {
1422       uint32_t pos_exp = e10;
1423       dxx *= kPositivePow10[pos_exp & 15];
1424       pos_exp /= 16;
1425       if (pos_exp) {
1426         dxx *= kPositivePowTen16[pos_exp & 15];
1427         if (pos_exp > 15) {
1428           // overflow check
1429           // last digits are "54" instead of "57" since that's the threshold
1430           // beyond which multiply-by-1e256 overflows
1431           if ((pos_exp > 31) || (dxx > 1.7976931348623154e52)) {
1432             return nullptr;
1433           }
1434           dxx *= 1.0e256;
1435         }
1436       }
1437     }
1438   }
1439   *valp = dxx;
1440   return S_CAST(CXXCONST_CP, str_iter);
1441 }
1442 
ScanadvLn(const char * str_iter,double * ln_ptr)1443 CXXCONST_CP ScanadvLn(const char* str_iter, double* ln_ptr) {
1444   // revised ScanadvDouble() which currently requires number to be nonnegative
1445   // returns -DBL_MAX on 0
1446   uint32_t cur_char_code = ctou32(*str_iter);
1447   const uint32_t is_negative = (cur_char_code == 45);
1448   if (is_negative || (cur_char_code == 43)) {
1449     cur_char_code = ctou32(*(++str_iter));
1450   }
1451   uint32_t cur_digit = cur_char_code - 48;
1452   intptr_t e10 = 0;
1453   const char* dot_ptr;
1454   int64_t digits;
1455 #ifdef __LP64__
1456   if (cur_digit < 10) {
1457     // ok, we have at least one digit
1458     digits = cur_digit;
1459     // to check: best to skip leading zeroes and compare against 17 instead of
1460     // 10^16?
1461     do {
1462       cur_digit = ctou32(*(++str_iter)) - 48;
1463       if (cur_digit >= 10) {
1464         if (cur_digit == 0xfffffffeU) {
1465           dot_ptr = str_iter;
1466           goto ScanadvLn_parse_decimal;
1467         }
1468         goto ScanadvLn_parse_exponent;
1469       }
1470       digits = digits * 10 + cur_digit;
1471     } while (digits < 10000000000000000LL);
1472     // we have 17 significant digits; count the rest, but don't worry about
1473     // contents
1474     // (could keep ~19 instead, but if we're systematically losing the last two
1475     // bits of precision anyway...)
1476     const char* last_sig_fig_ptr = str_iter;
1477     do {
1478       cur_digit = ctou32(*(++str_iter)) - 48;
1479     } while (cur_digit < 10);
1480     e10 = S_CAST(intptr_t, str_iter - last_sig_fig_ptr) - 1;
1481     if (cur_digit == 0xfffffffeU) {
1482       do {
1483         cur_digit = ctou32(*(++str_iter)) - 48;
1484       } while (cur_digit < 10);
1485     }
1486     goto ScanadvLn_parse_exponent;
1487   }
1488   if (cur_digit != 0xfffffffeU) {
1489     return nullptr;
1490   }
1491   // first (nonsign) character is dot, verify we have a digit after it
1492   dot_ptr = str_iter;
1493   cur_digit = ctou32(*(++str_iter)) - 48;
1494   if (cur_digit >= 10) {
1495     return nullptr;
1496   }
1497   digits = cur_digit;
1498  ScanadvLn_parse_decimal:
1499   while (1) {
1500     cur_digit = ctou32(*(++str_iter)) - 48;
1501     if (cur_digit >= 10) {
1502       e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1503       break;
1504     }
1505     digits = digits * 10 + cur_digit;
1506     if (digits >= 10000000000000000LL) {
1507       e10 = -S_CAST(intptr_t, str_iter - dot_ptr);
1508       do {
1509         cur_digit = ctou32(*(++str_iter)) - 48;
1510       } while (cur_digit < 10);
1511       break;
1512     }
1513   }
1514  ScanadvLn_parse_exponent:
1515   if (is_negative && (digits != 0)) {
1516     return nullptr;
1517   }
1518   if ((cur_digit & 0xdf) == 21) { // 'E' - '0' is 21
1519     cur_char_code = ctou32(*(++str_iter));
1520     const uint32_t exp_is_negative = (cur_char_code == 45);
1521     if (exp_is_negative || (cur_char_code == 43)) {
1522       cur_char_code = ctou32(*(++str_iter));
1523     }
1524     cur_digit = cur_char_code - 48;
1525     int32_t cur_exp = 0;
1526     while (cur_digit < 10) {
1527       if (cur_exp >= 214748364) {
1528         // may as well guard against exponent overflow
1529         if (!exp_is_negative) {
1530           return nullptr;
1531         }
1532         *ln_ptr = -DBL_MAX;
1533         do {
1534           cur_digit = ctou32(*(++str_iter)) - 48;
1535         } while (cur_digit < 10);
1536         return S_CAST(CXXCONST_CP, str_iter);
1537       }
1538       cur_exp = cur_exp * 10 + cur_digit;
1539       cur_digit = ctou32(*(++str_iter)) - 48;
1540     }
1541     if (exp_is_negative) {
1542       cur_exp = -cur_exp;
1543     }
1544     e10 += cur_exp;
1545   }
1546 #else  // not __LP64__
1547   int32_t digits_short;
1548   if (cur_digit < 10) {
1549     // ok, we have at least one digit
1550     digits_short = cur_digit;
1551     // to check: best to skip leading zeroes and compare against 17 instead of
1552     // 10^16?
1553     do {
1554       cur_digit = ctou32(*(++str_iter)) - 48;
1555       if (cur_digit >= 10) {
1556         if (cur_digit == 0xfffffffeU) {
1557           dot_ptr = str_iter;
1558           goto ScanadvLn_parse_decimal;
1559         }
1560         digits = digits_short;
1561         goto ScanadvLn_parse_exponent;
1562       }
1563       digits_short = digits_short * 10 + cur_digit;
1564     } while (digits_short < 100000000);
1565     digits = digits_short;
1566     do {
1567       cur_digit = ctou32(*(++str_iter)) - 48;
1568       if (cur_digit >= 10) {
1569         if (cur_digit == 0xfffffffeU) {
1570           dot_ptr = str_iter;
1571           goto ScanadvLn_parse_decimal_long;
1572         }
1573         goto ScanadvLn_parse_exponent;
1574       }
1575       digits = digits * 10 + cur_digit;
1576     } while (digits < 10000000000000000LL);
1577     // we have 17 significant digits; count the rest, but don't worry about
1578     // contents
1579     const char* last_sig_fig_ptr = str_iter;
1580     do {
1581       cur_digit = ctou32(*(++str_iter)) - 48;
1582     } while (cur_digit < 10);
1583     e10 = S_CAST(intptr_t, str_iter - last_sig_fig_ptr) - 1;
1584     if (cur_digit == 0xfffffffeU) {
1585       do {
1586         cur_digit = ctou32(*(++str_iter)) - 48;
1587       } while (cur_digit < 10);
1588     }
1589     goto ScanadvLn_parse_exponent;
1590   }
1591   if (cur_digit != 0xfffffffeU) {
1592     return nullptr;
1593   }
1594   // first (nonsign) character is dot, verify we have a digit after it
1595   dot_ptr = str_iter;
1596   cur_digit = ctou32(*(++str_iter)) - 48;
1597   if (cur_digit >= 10) {
1598     return nullptr;
1599   }
1600   digits_short = cur_digit;
1601  ScanadvLn_parse_decimal:
1602   while (1) {
1603     cur_digit = ctou32(*(++str_iter)) - 48;
1604     if (cur_digit >= 10) {
1605       e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1606       digits = digits_short;
1607       break;
1608     }
1609     digits_short = digits_short * 10 + cur_digit;
1610     if (digits_short >= 100000000) {
1611       digits = digits_short;
1612     ScanadvLn_parse_decimal_long:
1613       while (1) {
1614         cur_digit = ctou32(*(++str_iter)) - 48;
1615         if (cur_digit >= 10) {
1616           e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1617           goto ScanadvLn_parse_exponent;
1618         }
1619         digits = digits * 10 + cur_digit;
1620         if (digits >= 10000000000000000LL) {
1621           e10 = -S_CAST(intptr_t, str_iter - dot_ptr);
1622           do {
1623             cur_digit = ctou32(*(++str_iter)) - 48;
1624           } while (cur_digit < 10);
1625           goto ScanadvLn_parse_exponent;
1626         }
1627       }
1628     }
1629   }
1630  ScanadvLn_parse_exponent:
1631   if (is_negative && (digits != 0)) {
1632     return nullptr;
1633   }
1634   if ((cur_digit & 0xdf) == 21) { // 'E' - '0' is 21
1635     cur_char_code = ctou32(*(++str_iter));
1636     const uint32_t exp_is_negative = (cur_char_code == 45);
1637     if (exp_is_negative || (cur_char_code == 43)) {
1638       cur_char_code = ctou32(*(++str_iter));
1639     }
1640     cur_digit = cur_char_code - 48;
1641     int32_t cur_exp = 0;
1642     while (cur_digit < 10) {
1643       if (cur_exp >= 107374182) {
1644         // may as well guard against exponent overflow
1645         if (!exp_is_negative) {
1646           return nullptr;
1647         }
1648         *ln_ptr = -DBL_MAX;
1649         do {
1650           cur_digit = ctou32(*(++str_iter)) - 48;
1651         } while (cur_digit < 10);
1652         return S_CAST(CXXCONST_CP, str_iter);
1653       }
1654       cur_exp = cur_exp * 10 + cur_digit;
1655       cur_digit = ctou32(*(++str_iter)) - 48;
1656     }
1657     if (exp_is_negative) {
1658       cur_exp = -cur_exp;
1659     }
1660     e10 += cur_exp;
1661   }
1662 #endif
1663   if (digits == 0) {
1664     *ln_ptr = -DBL_MAX;
1665     return S_CAST(CXXCONST_CP, str_iter);
1666   }
1667   double ln_val = log(S_CAST(double, digits));
1668   if (e10) {
1669     ln_val += e10 * kLn10;
1670   }
1671   *ln_ptr = ln_val;
1672   return S_CAST(CXXCONST_CP, str_iter);
1673 }
1674 
ScanPosintCappedx(const char * str_iter,uint64_t cap,uint32_t * valp)1675 BoolErr ScanPosintCappedx(const char* str_iter, uint64_t cap, uint32_t* valp) {
1676   double val;
1677   if ((!ScantokDouble(str_iter, &val)) || (val < 1.0) || (val > S_CAST(double, cap))) {
1678     return 1;
1679   }
1680   *valp = S_CAST(uint32_t, val);
1681   return (val != S_CAST(double, *valp));
1682 }
1683 
ScanUintCappedx(const char * str_iter,uint64_t cap,uint32_t * valp)1684 BoolErr ScanUintCappedx(const char* str_iter, uint64_t cap, uint32_t* valp) {
1685   double val;
1686   if ((!ScantokDouble(str_iter, &val)) || (val < 0.0) || (val > S_CAST(double, cap))) {
1687     return 1;
1688   }
1689   *valp = S_CAST(uint32_t, val);
1690   return (val != S_CAST(double, *valp));
1691 }
1692 
ScanIntAbsBoundedx(const char * str_iter,int64_t bound,int32_t * valp)1693 BoolErr ScanIntAbsBoundedx(const char* str_iter, int64_t bound, int32_t* valp) {
1694   const double bound_d = S_CAST(double, bound);
1695   double val;
1696   if ((!ScantokDouble(str_iter, &val)) || (val < -bound_d) || (val > bound_d)) {
1697     return 1;
1698   }
1699   *valp = S_CAST(int32_t, val);
1700   return (val != S_CAST(double, *valp));
1701 }
1702 
ScanPosintptrx(const char * str_iter,uintptr_t * valp)1703 BoolErr ScanPosintptrx(const char* str_iter, uintptr_t* valp) {
1704   double val;
1705   if ((!ScantokDouble(str_iter, &val)) || (val < 1.0) || (val > S_CAST(double, ~k0LU))) {
1706     return 1;
1707   }
1708   *valp = S_CAST(uintptr_t, val);
1709   return (val != S_CAST(double, *valp));
1710 }
1711 
GetTopTwoUi(const uint32_t * __restrict uint_arr,uintptr_t uia_size,uintptr_t * __restrict top_idx_ptr,uintptr_t * __restrict second_idx_ptr)1712 void GetTopTwoUi(const uint32_t* __restrict uint_arr, uintptr_t uia_size, uintptr_t* __restrict top_idx_ptr, uintptr_t* __restrict second_idx_ptr) {
1713   assert(uia_size > 1);
1714   uintptr_t top_idx = (uint_arr[1] > uint_arr[0])? 1 : 0;
1715   uintptr_t second_idx = 1 ^ top_idx;
1716   uint32_t top_val = uint_arr[top_idx];
1717   uint32_t second_val = uint_arr[second_idx];
1718   uintptr_t cur_idx;
1719   uintptr_t cur_val;
1720   for (cur_idx = 2; cur_idx != uia_size; ++cur_idx) {
1721     cur_val = uint_arr[cur_idx];
1722     if (cur_val > second_val) {
1723       if (cur_val > top_val) {
1724         second_val = top_val;
1725         second_idx = top_idx;
1726         top_val = cur_val;
1727         top_idx = cur_idx;
1728       } else {
1729         second_val = cur_val;
1730         second_idx = cur_idx;
1731       }
1732     }
1733   }
1734   *top_idx_ptr = top_idx;
1735   *second_idx_ptr = second_idx;
1736 }
1737 
1738 #ifdef USE_AVX2
NextTokenMultFar(const char * str_iter,uint32_t ct)1739 CXXCONST_CP NextTokenMultFar(const char* str_iter, uint32_t ct) {
1740   // assert(ct);
1741   if (!str_iter) {
1742     return nullptr;
1743   }
1744   uint32_t transition_bits_to_skip_p1 = ct * 2;
1745   const uintptr_t starting_addr = R_CAST(uintptr_t, str_iter);
1746   const VecUc* str_viter_start = R_CAST(const VecUc*, RoundDownPow2(starting_addr, kBytesPerVec));
1747   const VecUc vvec_all_tab = vecuc_set1(9);
1748   const VecUc vvec_all_space = vecuc_set1(32);
1749   VecUc cur_vvec = *str_viter_start;
1750   VecUc tabspace_vvec = (cur_vvec == vvec_all_tab) | (cur_vvec == vvec_all_space);
1751   // Underlying intrinsics are _mm256_cmpeq_... and _mm256_cmpgt_...
1752   // So we don't really want to use <= or !=.
1753   // (Er, there's also an signed vs. unsigned comparison impedence mismatch:
1754   // there is no unsigned version of _mm256_cmpgt_epi8.  So we don't really
1755   // want to use gcc-vector-extension < at all here when we can use a mask to
1756   // do the same job.)
1757   const VecUc vvec_all96 = vecuc_set1(96);
1758   VecUc post31_vvec = vecuc_adds(cur_vvec, vvec_all96);
1759   uint32_t delimiter_bytes = vecuc_movemask(tabspace_vvec);
1760   const uint32_t initial_nonterminating_bytes = vecuc_movemask(post31_vvec) | delimiter_bytes;
1761   const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, str_viter_start);
1762   uint32_t leading_mask = UINT32_MAX << leading_byte_ct;
1763   delimiter_bytes &= leading_mask;
1764   uint32_t terminating_bytes = leading_mask & (~initial_nonterminating_bytes);
1765   uint32_t prev_delim_highbit = 0;
1766   for (const VecUc* str_viter = str_viter_start; ; ) {
1767     // The number of 0->1 + 1->0 bit transitions in x is
1768     //   popcount(x ^ (x << 1)).
1769     uint32_t cur_transitions = S_CAST(Vec8thUint, delimiter_bytes ^ (delimiter_bytes << 1) ^ prev_delim_highbit);
1770     uint32_t cur_transition_ct = PopcountVec8thUint(cur_transitions);
1771     if (cur_transition_ct >= transition_bits_to_skip_p1) {
1772       cur_transitions = ClearBottomSetBits(transition_bits_to_skip_p1 - 1, cur_transitions);
1773       uint32_t byte_offset_in_vec = ctzu32(cur_transitions);
1774       if (terminating_bytes << (31 - byte_offset_in_vec)) {
1775         return nullptr;
1776       }
1777       return &(R_CAST(CXXCONST_CP, str_viter)[byte_offset_in_vec]);
1778     }
1779     if (terminating_bytes) {
1780       return nullptr;
1781     }
1782     transition_bits_to_skip_p1 -= cur_transition_ct;
1783     prev_delim_highbit = transition_bits_to_skip_p1 & 1;
1784     ++str_viter;
1785     cur_vvec = *str_viter;
1786     tabspace_vvec = (cur_vvec == vvec_all_tab) | (cur_vvec == vvec_all_space);
1787     post31_vvec = vecuc_adds(cur_vvec, vvec_all96);
1788     delimiter_bytes = vecuc_movemask(tabspace_vvec);
1789     terminating_bytes = ~(vecuc_movemask(post31_vvec) | delimiter_bytes);
1790   }
1791 }
1792 
TokenLexK0(const char * str_iter,const uint32_t * col_types,const uint32_t * col_skips,uint32_t relevant_col_ct,const char ** token_ptrs,uint32_t * token_slens)1793 const char* TokenLexK0(const char* str_iter, const uint32_t* col_types, const uint32_t* col_skips, uint32_t relevant_col_ct, const char** token_ptrs, uint32_t* token_slens) {
1794   const uint32_t relevant_col_ct_m1 = relevant_col_ct - 1;
1795   uint32_t relevant_col_idx = 0;
1796   uint32_t transition_bits_to_skip_p1 = col_skips[relevant_col_idx] * 2;
1797   const uintptr_t starting_addr = R_CAST(uintptr_t, str_iter);
1798   const VecUc* str_viter = R_CAST(const VecUc*, RoundDownPow2(starting_addr, kBytesPerVec));
1799   const VecUc vvec_all_tab = vecuc_set1(9);
1800   const VecUc vvec_all_space = vecuc_set1(32);
1801   VecUc cur_vvec = *str_viter;
1802   VecUc tabspace_vvec = (cur_vvec == vvec_all_tab) | (cur_vvec == vvec_all_space);
1803   const VecUc vvec_all96 = vecuc_set1(96);
1804   VecUc post31_vvec = vecuc_adds(cur_vvec, vvec_all96);
1805   uint32_t delimiter_bytes = vecuc_movemask(tabspace_vvec);
1806   const uint32_t initial_nonterminating_bytes = vecuc_movemask(post31_vvec) | delimiter_bytes;
1807   const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, str_viter);
1808   uint32_t leading_mask = UINT32_MAX << leading_byte_ct;
1809   delimiter_bytes &= leading_mask;
1810   uint32_t terminating_bytes = leading_mask & (~initial_nonterminating_bytes);
1811   uint32_t prev_delim_highbit = 0;
1812   if (!transition_bits_to_skip_p1) {
1813     // Special case: col_skips[0] can be zero.  When it is, str_iter must point
1814     // to the beginning of a token.
1815     prev_delim_highbit = -leading_mask;
1816     transition_bits_to_skip_p1 = 1;
1817   }
1818   while (1) {
1819     // The number of 0->1 + 1->0 bit transitions in x is
1820     //   popcount(x ^ (x << 1)).
1821     uint32_t cur_transitions = S_CAST(Vec8thUint, delimiter_bytes ^ (delimiter_bytes << 1) ^ prev_delim_highbit);
1822     uint32_t cur_transition_ct = PopcountVec8thUint(cur_transitions);
1823     while (cur_transition_ct >= transition_bits_to_skip_p1) {
1824       cur_transitions = ClearBottomSetBits(transition_bits_to_skip_p1 - 1, cur_transitions);
1825       uint32_t byte_offset_in_vec = ctzu32(cur_transitions);
1826       const char* token_start = &(R_CAST(const char*, str_viter)[byte_offset_in_vec]);
1827       const uint32_t cur_col_type = col_types[relevant_col_idx];
1828       token_ptrs[cur_col_type] = token_start;
1829       // Find end of token.
1830       if (relevant_col_idx == relevant_col_ct_m1) {
1831         if (terminating_bytes << (31 - byte_offset_in_vec)) {
1832           return nullptr;
1833         }
1834         // Handle this separately, since it's ok for the last token to be
1835         // terminated by something other than tab/space.
1836         // cur_transitions is a misnomer here; we just need the bottom bit to
1837         // tell us where the token end is.
1838 
1839         // 33..255 is a bit more annoying to check for efficiently than
1840         // 32..255.
1841         const VecUc vvec_all95 = vecuc_set1(95);
1842         cur_transitions &= cur_transitions - 1;
1843         cur_transitions |= terminating_bytes;
1844         while (!cur_transitions) {
1845           ++str_viter;
1846           cur_vvec = *str_viter;
1847           const VecUc postspace_vvec = vecuc_adds(cur_vvec, vvec_all95);
1848           cur_transitions = S_CAST(Vec8thUint, ~vecuc_movemask(postspace_vvec));
1849         }
1850         byte_offset_in_vec = ctzu32(cur_transitions);
1851         const char* token_end = &(R_CAST(const char*, str_viter)[byte_offset_in_vec]);
1852         token_slens[cur_col_type] = token_end - token_start;
1853         return token_end;
1854       }
1855       cur_transition_ct -= transition_bits_to_skip_p1;
1856       if (!cur_transition_ct) {
1857         // Token end is in a later vector.
1858         do {
1859           if (terminating_bytes) {
1860             return nullptr;
1861           }
1862           ++str_viter;
1863           cur_vvec = *str_viter;
1864           tabspace_vvec = (cur_vvec == vvec_all_tab) | (cur_vvec == vvec_all_space);
1865           post31_vvec = vecuc_adds(cur_vvec, vvec_all96);
1866           delimiter_bytes = vecuc_movemask(tabspace_vvec);
1867           terminating_bytes = ~(vecuc_movemask(post31_vvec) | delimiter_bytes);
1868         } while (!delimiter_bytes);
1869         cur_transitions = S_CAST(Vec8thUint, delimiter_bytes ^ (delimiter_bytes << 1));
1870         cur_transition_ct = PopcountVec8thUint(cur_transitions);
1871       } else {
1872         cur_transitions &= cur_transitions - 1;
1873       }
1874       byte_offset_in_vec = ctzu32(cur_transitions);
1875       const char* token_end = &(R_CAST(const char*, str_viter)[byte_offset_in_vec]);
1876       token_slens[cur_col_type] = token_end - token_start;
1877       ++relevant_col_idx;
1878       transition_bits_to_skip_p1 = 2 * col_skips[relevant_col_idx];
1879     }
1880     if (terminating_bytes) {
1881       return nullptr;
1882     }
1883     transition_bits_to_skip_p1 -= cur_transition_ct;
1884     prev_delim_highbit = transition_bits_to_skip_p1 & 1;
1885     ++str_viter;
1886     cur_vvec = *str_viter;
1887     tabspace_vvec = (cur_vvec == vvec_all_tab) | (cur_vvec == vvec_all_space);
1888     post31_vvec = vecuc_adds(cur_vvec, vvec_all96);
1889     delimiter_bytes = vecuc_movemask(tabspace_vvec);
1890     terminating_bytes = ~(vecuc_movemask(post31_vvec) | delimiter_bytes);
1891   }
1892 }
1893 #endif  // USE_AVX2
1894 
NextCsvMult(const char * str_iter,uint32_t ct)1895 CXXCONST_CP NextCsvMult(const char* str_iter, uint32_t ct) {
1896   if (!str_iter) {
1897     return nullptr;
1898   }
1899   // assumes initial spaces in current token have been skipped
1900   // ok if we're at the end of the token
1901   unsigned char ucc = *str_iter;
1902   assert(ucc != ' ');
1903   while (ucc >= ' ') {
1904     // avoid strchr to keep "ASCII code < 32 == newline" consistent
1905     // (tab handling is quirky right now--permitted at the beginning of a
1906     // token, but treated as newline later--but it should never appear so
1907     // no point in e.g. adding an extra parameter to FirstNonTspace();
1908     // just need to make sure the quirky behavior is consistent.)
1909     if (ucc != ',') {
1910       ucc = *(++str_iter);
1911       continue;
1912     }
1913     do {
1914       ucc = *(++str_iter);
1915     } while ((ucc == ' ') || (ucc == '\t'));
1916     if (!(--ct)) {
1917       return S_CAST(CXXCONST_CP, str_iter);
1918     }
1919   }
1920   return nullptr;
1921 }
1922 
1923 #ifdef USE_AVX2
CsvLexK(const char * str_iter,const uint32_t * col_types,const uint32_t * col_skips,uint32_t relevant_col_ct,const char ** token_ptrs,uint32_t * token_slens)1924 const char* CsvLexK(const char* str_iter, const uint32_t* col_types, const uint32_t* col_skips, uint32_t relevant_col_ct, const char** token_ptrs, uint32_t* token_slens) {
1925   // Temporary; optimize later.
1926   for (uint32_t relevant_col_idx = 0; relevant_col_idx != relevant_col_ct; ++relevant_col_idx) {
1927     const uint32_t cur_col_type = col_types[relevant_col_idx];
1928     str_iter = NextCsvMult(str_iter, col_skips[relevant_col_idx]);
1929     if (!str_iter) {
1930       return nullptr;
1931     }
1932     token_ptrs[cur_col_type] = str_iter;
1933     const char* token_end = CsvFieldEnd(str_iter);
1934     token_slens[cur_col_type] = token_end - str_iter;
1935     str_iter = token_end;
1936   }
1937   return str_iter;
1938 }
1939 #endif
1940 
CountTokens(const char * str_iter)1941 uint32_t CountTokens(const char* str_iter) {
1942   uint32_t token_ct = 0;
1943   str_iter = FirstNonTspace(str_iter);
1944   while (!IsEolnKns(*str_iter)) {
1945     ++token_ct;
1946     str_iter = FirstNonTspace(CurTokenEnd(str_iter));
1947   }
1948   return token_ct;
1949 }
1950 
1951 /*
1952 uint32_t CommaOrSpaceCountTokens(const char* str_iter, uint32_t comma_delim) {
1953   if (comma_delim) {
1954     // assumes nonempty line (treats trailing empty string as a token).
1955     uint32_t token_ct = 1;
1956     unsigned char ucc = (unsigned char)(*str_iter++);
1957     while (1) {
1958       if (ucc < 32) {
1959         return token_ct;
1960       }
1961       if (ucc == ',') {
1962         // spelled out due to const qualifier
1963         do {
1964           ucc = (unsigned char)(*str_iter++);
1965         } while ((ucc == ' ') || (ucc == '\t'));
1966         token_ct++;
1967         continue;
1968       }
1969       ucc = (unsigned char)(*str_iter++);
1970     }
1971   }
1972   return CountTokens(str_iter);
1973 }
1974 */
1975 
CountAndMeasureMultistr(const char * multistr,uintptr_t * max_blen_ptr)1976 uint32_t CountAndMeasureMultistr(const char* multistr, uintptr_t* max_blen_ptr) {
1977   // Straightforward to accelerate this with movemask if it ever matters.
1978   // (Doesn't matter now since it's only used for command-line processing.)
1979   uint32_t ct = 0;
1980   uintptr_t max_blen = *max_blen_ptr;
1981   while (*multistr) {
1982     const uintptr_t blen = strlen(multistr) + 1;
1983     if (blen > max_blen) {
1984       max_blen = blen;
1985     }
1986     multistr = &(multistr[blen]);
1987     ++ct;
1988   } while (*multistr);
1989   *max_blen_ptr = max_blen;
1990   return ct;
1991 }
1992 
1993 // number-to-string encoders
1994 
1995 const uint16_t kDigitPair[] = {
1996   0x3030, 0x3130, 0x3230, 0x3330, 0x3430, 0x3530, 0x3630, 0x3730, 0x3830, 0x3930,
1997   0x3031, 0x3131, 0x3231, 0x3331, 0x3431, 0x3531, 0x3631, 0x3731, 0x3831, 0x3931,
1998   0x3032, 0x3132, 0x3232, 0x3332, 0x3432, 0x3532, 0x3632, 0x3732, 0x3832, 0x3932,
1999   0x3033, 0x3133, 0x3233, 0x3333, 0x3433, 0x3533, 0x3633, 0x3733, 0x3833, 0x3933,
2000   0x3034, 0x3134, 0x3234, 0x3334, 0x3434, 0x3534, 0x3634, 0x3734, 0x3834, 0x3934,
2001   0x3035, 0x3135, 0x3235, 0x3335, 0x3435, 0x3535, 0x3635, 0x3735, 0x3835, 0x3935,
2002   0x3036, 0x3136, 0x3236, 0x3336, 0x3436, 0x3536, 0x3636, 0x3736, 0x3836, 0x3936,
2003   0x3037, 0x3137, 0x3237, 0x3337, 0x3437, 0x3537, 0x3637, 0x3737, 0x3837, 0x3937,
2004   0x3038, 0x3138, 0x3238, 0x3338, 0x3438, 0x3538, 0x3638, 0x3738, 0x3838, 0x3938,
2005   0x3039, 0x3139, 0x3239, 0x3339, 0x3439, 0x3539, 0x3639, 0x3739, 0x3839, 0x3939};
2006 
u32toa(uint32_t uii,char * start)2007 char* u32toa(uint32_t uii, char* start) {
2008   // Memory-efficient fast integer writer.  (You can do a bit better sometimes
2009   // by using a larger lookup table, but on average I doubt that pays off.)
2010   // Returns a pointer to the end of the integer (not null-terminated).
2011   //
2012   // Nearly identical to 'branchlut' from
2013   // https://github.com/miloyip/itoa-benchmark , except that the hardcoded
2014   // binary search is more balanced (start by comparing 6+ digits vs. <6,
2015   // instead of 9+ digits vs. <8).  This tends to be slightly better unless the
2016   // integers are almost uniformly distributed over [0, 2^32).
2017   //
2018   // Todo: compare against an_itoa in https://github.com/appnexus/acf/ .
2019   //
2020   // (Making the first comparison 7+ digits vs. <7 would seem to make sense,
2021   // but it seems to benchmark slightly worse on my Mac?)
2022   //
2023   // (Since we want to execute different code depending on the number of
2024   // digits, the UintSlen() approach doesn't pay off.)
2025   uint32_t quotient;
2026   if (uii < 100000) {
2027     if (uii < 100) {
2028       if (uii >= 10) {
2029         goto u32toa_just2;
2030       }
2031       *start++ = '0' + uii;
2032       return start;
2033     }
2034     if (uii < 10000) {
2035       if (uii >= 1000) {
2036         goto u32toa_just4;
2037       }
2038       quotient = uii / 100;
2039       *start++ = '0' + quotient;
2040       goto u32toa_2left;
2041     }
2042     quotient = uii / 10000;
2043     *start++ = '0' + quotient;
2044     goto u32toa_4left;
2045   }
2046   if (uii < 100000000) {
2047     if (uii < 1000000) {
2048       goto u32toa_just6;
2049     }
2050     if (uii >= 10000000) {
2051       goto u32toa_just8;
2052     }
2053     quotient = uii / 1000000;
2054     *start++ = '0' + quotient;
2055     goto u32toa_6left;
2056   }
2057   quotient = uii / 100000000;
2058   if (uii < 1000000000) {
2059     *start++ = '0' + quotient;
2060   } else {
2061     start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2062   }
2063   uii -= quotient * 100000000;
2064  u32toa_just8:
2065   quotient = uii / 1000000;
2066   start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2067  u32toa_6left:
2068   uii -= quotient * 1000000;
2069  u32toa_just6:
2070   quotient = uii / 10000;
2071   start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2072  u32toa_4left:
2073   uii -= quotient * 10000;
2074  u32toa_just4:
2075   quotient = uii / 100;
2076   start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2077  u32toa_2left:
2078   uii -= quotient * 100;
2079  u32toa_just2:
2080   return memcpya_k(start, &(kDigitPair[uii]), 2);
2081 }
2082 
i32toa(int32_t ii,char * start)2083 char* i32toa(int32_t ii, char* start) {
2084   uint32_t uii = ii;
2085   if (ii < 0) {
2086     // -INT_MIN is undefined, but negating the unsigned int equivalent works
2087     *start++ = '-';
2088     uii = -uii;
2089   }
2090   return u32toa(uii, start);
2091 }
2092 
uitoa_z4(uint32_t uii,char * start)2093 char* uitoa_z4(uint32_t uii, char* start) {
2094   uint32_t quotient = uii / 100;
2095   assert(quotient < 100);
2096   uii -= 100 * quotient;
2097   start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2098   return memcpya_k(start, &(kDigitPair[uii]), 2);
2099 }
2100 
u32toa_z5(uint32_t uii,char * start)2101 char* u32toa_z5(uint32_t uii, char* start) {
2102   uint32_t quotient = uii / 10000;
2103   *start++ = '0' + quotient;
2104   return uitoa_z4(uii - 10000 * quotient, start);
2105 }
2106 
uitoa_z6(uint32_t uii,char * start)2107 char* uitoa_z6(uint32_t uii, char* start) {
2108   uint32_t quotient = uii / 10000;
2109   start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2110   return uitoa_z4(uii - 10000 * quotient, start);
2111 }
2112 
uitoa_z8(uint32_t uii,char * start)2113 char* uitoa_z8(uint32_t uii, char* start) {
2114   uint32_t quotient = uii / 1000000;
2115   start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2116   return uitoa_z6(uii - 1000000 * quotient, start);
2117 }
2118 
i64toa(int64_t llii,char * start)2119 char* i64toa(int64_t llii, char* start) {
2120   uint64_t ullii = llii;
2121   uint64_t top_digits;
2122   uint32_t bottom_eight;
2123   uint32_t middle_eight;
2124   if (llii < 0) {
2125     *start++ = '-';
2126     ullii = -ullii;
2127   }
2128   if (ullii <= 0xffffffffLLU) {
2129     return u32toa(S_CAST(uint32_t, ullii), start);
2130   }
2131   top_digits = ullii / 100000000;
2132   bottom_eight = S_CAST(uint32_t, ullii - (top_digits * 100000000));
2133   if (top_digits <= 0xffffffffLLU) {
2134     start = u32toa(S_CAST(uint32_t, top_digits), start);
2135     return uitoa_z8(bottom_eight, start);
2136   }
2137   ullii = top_digits / 100000000;
2138   middle_eight = S_CAST(uint32_t, top_digits - (ullii * 100000000));
2139   start = u32toa(S_CAST(uint32_t, ullii), start);
2140   start = uitoa_z8(middle_eight, start);
2141   return uitoa_z8(bottom_eight, start);
2142 }
2143 
2144 
u32toa_trunc4(uint32_t uii,char * start)2145 char* u32toa_trunc4(uint32_t uii, char* start) {
2146   uint32_t quotient = uii / 100;
2147   memcpy_k(start, &(kDigitPair[quotient]), 2);
2148   uii -= 100 * quotient;
2149   if (uii) {
2150     start += 2;
2151     memcpy_k(start, &(kDigitPair[uii]), 2);
2152   }
2153   if (start[1] != '0') {
2154     return &(start[2]);
2155   }
2156   return &(start[1]);
2157 }
2158 
uitoa_trunc6(uint32_t uii,char * start)2159 static inline char* uitoa_trunc6(uint32_t uii, char* start) {
2160   uint32_t quotient = uii / 10000;
2161   memcpy_k(start, &(kDigitPair[quotient]), 2);
2162   uii -= 10000 * quotient;
2163   if (uii) {
2164     quotient = uii / 100;
2165     start += 2;
2166     memcpy_k(start, &(kDigitPair[quotient]), 2);
2167     uii -= 100 * quotient;
2168     if (uii) {
2169       start += 2;
2170       memcpy_k(start, &(kDigitPair[uii]), 2);
2171     }
2172   }
2173   if (start[1] != '0') {
2174     return &(start[2]);
2175   }
2176   return &(start[1]);
2177 }
2178 
uitoa_trunc8(uint32_t uii,char * start)2179 static inline char* uitoa_trunc8(uint32_t uii, char* start) {
2180   uint32_t quotient = uii / 1000000;
2181   memcpy_k(start, &(kDigitPair[quotient]), 2);
2182   uii -= 1000000 * quotient;
2183   if (uii) {
2184     quotient = uii / 10000;
2185     start += 2;
2186     memcpy_k(start, &(kDigitPair[quotient]), 2);
2187     uii -= 10000 * quotient;
2188     if (uii) {
2189       quotient = uii / 100;
2190       start += 2;
2191       memcpy_k(start, &(kDigitPair[quotient]), 2);
2192       uii -= 100 * quotient;
2193       if (uii) {
2194         start += 2;
2195         memcpy_k(start, &(kDigitPair[uii]), 2);
2196       }
2197     }
2198   }
2199   if (start[1] != '0') {
2200     return &(start[2]);
2201   }
2202   return &(start[1]);
2203 }
2204 
rtoa_p5(uint32_t remainder,char * start)2205 static inline char* rtoa_p5(uint32_t remainder, char* start) {
2206   if (!remainder) {
2207     return start;
2208   }
2209   *start++ = '.';
2210   uint32_t quotient = remainder / 1000;
2211   memcpy_k(start, &(kDigitPair[quotient]), 2);
2212   remainder -= 1000 * quotient;
2213   if (remainder) {
2214     quotient = remainder / 10;
2215     start += 2;
2216     memcpy_k(start, &(kDigitPair[quotient]), 2);
2217     remainder -= 10 * quotient;
2218     if (remainder) {
2219       start[2] = '0' + remainder;
2220       return &(start[3]);
2221     }
2222   }
2223   if (start[1] != '0') {
2224     return &(start[2]);
2225   }
2226   return &(start[1]);
2227 }
2228 
qrtoa_1p5(uint32_t quotient,uint32_t remainder,char * start)2229 static inline char* qrtoa_1p5(uint32_t quotient, uint32_t remainder, char* start) {
2230   *start++ = '0' + quotient;
2231   return rtoa_p5(remainder, start);
2232 }
2233 
qrtoa_1p7(uint32_t quotient,uint32_t remainder,char * start)2234 static inline char* qrtoa_1p7(uint32_t quotient, uint32_t remainder, char* start) {
2235   *start++ = '0' + quotient;
2236   if (!remainder) {
2237     return start;
2238   }
2239   *start++ = '.';
2240   quotient = remainder / 100000;
2241   memcpy_k(start, &(kDigitPair[quotient]), 2);
2242   remainder -= 100000 * quotient;
2243   if (remainder) {
2244     quotient = remainder / 1000;
2245     start += 2;
2246     memcpy_k(start, &(kDigitPair[quotient]), 2);
2247     remainder -= 1000 * quotient;
2248     if (remainder) {
2249       quotient = remainder / 10;
2250       start += 2;
2251       memcpy_k(start, &(kDigitPair[quotient]), 2);
2252       remainder -= 10 * quotient;
2253       if (remainder) {
2254         start[2] = '0' + remainder;
2255         return &(start[3]);
2256       }
2257     }
2258   }
2259   if (start[1] != '0') {
2260     return &(start[2]);
2261   }
2262   return &(start[1]);
2263 }
2264 
2265 // Okay, time to do banker's rounding when printing doubles.  14 digits of
2266 // precision are used in judging equality to 0.5 (actual precision of doubles
2267 // is 15-17 digits); the intention is to capture all directly loaded or exactly
2268 // computed edge cases (so enough tolerance is needed to survive the internal
2269 // multiplications by powers of 10, etc.), while rounding a negligible number
2270 // of honest-to-god 0.4999999s up and 0.5000001s down.
2271 // To avoid inadvertent printing of an extra digit, there's a deliberate gap
2272 // between the 99.9994999...-type bounds and the largest numbers that would
2273 // actually round down.
2274 static const STD_ARRAY_DECL(double, 2, kBankerRound6) = STD_ARRAY_INIT_START() {0.4999995, 0.5000005} STD_ARRAY_INIT_END();
2275 static const STD_ARRAY_DECL(double, 2, kBankerRound8) = STD_ARRAY_INIT_START() {0.499999995, 0.500000005} STD_ARRAY_INIT_END();
2276 
2277 static inline uint32_t BankerRoundD(double dxx, STD_ARRAY_KREF(double, 2) banker_round) {
2278   uint32_t result = S_CAST(int32_t, dxx);
2279   return result + S_CAST(int32_t, (dxx - u31tod(result)) + banker_round[result & 1]);
2280 }
2281 
2282 // These are separate functions so the compiler can optimize the integer
2283 // divisions.
2284 static inline void BankerRoundD1(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2285   dxx *= 10;
2286   uint32_t remainder = S_CAST(int32_t, dxx);
2287   remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2288   *quotientp = remainder / 10;
2289   *remainderp = remainder - (*quotientp) * 10;
2290 }
2291 
2292 static inline void BankerRoundD2(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2293   dxx *= 100;
2294   uint32_t remainder = S_CAST(int32_t, dxx);
2295   remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2296   *quotientp = remainder / 100;
2297   *remainderp = remainder - (*quotientp) * 100;
2298 }
2299 
2300 static inline void BankerRoundD3(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2301   dxx *= 1000;
2302   uint32_t remainder = S_CAST(int32_t, dxx);
2303   remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2304   *quotientp = remainder / 1000;
2305   *remainderp = remainder - (*quotientp) * 1000;
2306 }
2307 
2308 static inline void BankerRoundD4(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2309   dxx *= 10000;
2310   uint32_t remainder = S_CAST(int32_t, dxx);
2311   remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2312   *quotientp = remainder / 10000;
2313   *remainderp = remainder - (*quotientp) * 10000;
2314 }
2315 
2316 static inline void BankerRoundD5(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2317   dxx *= 100000;
2318   uint32_t remainder = S_CAST(int32_t, dxx);
2319   remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2320   *quotientp = remainder / 100000;
2321   *remainderp = remainder - (*quotientp) * 100000;
2322 }
2323 
2324 static inline void BankerRoundD6(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2325   dxx *= 1000000;
2326   uint32_t remainder = S_CAST(int32_t, dxx);
2327   remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2328   *quotientp = remainder / 1000000;
2329   *remainderp = remainder - (*quotientp) * 1000000;
2330 }
2331 
2332 static inline void BankerRoundD7(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2333   dxx *= 10000000;
2334   uint32_t remainder = S_CAST(int32_t, dxx);
2335   remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2336   *quotientp = remainder / 10000000;
2337   *remainderp = remainder - (*quotientp) * 10000000;
2338 }
2339 
dtoa_so6(double dxx,char * start)2340 char* dtoa_so6(double dxx, char* start) {
2341   // 6 sig fig number, 0.999995 <= dxx < 999999.5
2342   // 'so' = "significand only"
2343   // Just hardcoding all six cases, in the absence of a better approach...
2344   uint32_t uii;
2345   uint32_t quotient;
2346   uint32_t remainder;
2347   if (dxx < 99.999949999999) {
2348     if (dxx < 9.9999949999999) {
2349       BankerRoundD5(dxx, kBankerRound8, &quotient, &remainder);
2350       return qrtoa_1p5(quotient, remainder, start);
2351     }
2352     BankerRoundD4(dxx, kBankerRound8, &quotient, &remainder);
2353     start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2354     if (!remainder) {
2355       return start;
2356     }
2357     *start++ = '.';
2358     quotient = remainder / 100;
2359     memcpy_k(start, &(kDigitPair[quotient]), 2);
2360     remainder -= 100 * quotient;
2361     if (remainder) {
2362       start += 2;
2363     dtoa_so6_pretail:
2364       memcpy_k(start, &(kDigitPair[remainder]), 2);
2365     }
2366   dtoa_so6_tail:
2367     if (start[1] != '0') {
2368       return &(start[2]);
2369     }
2370     return &(start[1]);
2371   }
2372   if (dxx < 9999.9949999999) {
2373     if (dxx < 999.99949999999) {
2374       BankerRoundD3(dxx, kBankerRound8, &uii, &remainder);
2375       quotient = uii / 100;
2376       *start++ = '0' + quotient;
2377       quotient = uii - 100 * quotient;
2378       start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2379       if (!remainder) {
2380         return start;
2381       }
2382       *start++ = '.';
2383       quotient = remainder / 10;
2384       memcpy_k(start, &(kDigitPair[quotient]), 2);
2385       remainder -= quotient * 10;
2386       if (!remainder) {
2387         goto dtoa_so6_tail;
2388       }
2389       start[2] = '0' + remainder;
2390       return &(start[3]);
2391     }
2392     BankerRoundD2(dxx, kBankerRound8, &uii, &remainder);
2393     quotient = uii / 100;
2394     start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2395     quotient = uii - (100 * quotient);
2396     start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2397     if (!remainder) {
2398       return start;
2399     }
2400     *start++ = '.';
2401     goto dtoa_so6_pretail;
2402   }
2403   if (dxx >= 99999.949999999) {
2404     return uitoa_z6(BankerRoundD(dxx, kBankerRound8), start);
2405   }
2406   BankerRoundD1(dxx, kBankerRound8, &uii, &remainder);
2407   quotient = uii / 10000;
2408   *start = '0' + quotient;
2409   uii -= 10000 * quotient;
2410   quotient = uii / 100;
2411   start = memcpya_k(&(start[1]), &(kDigitPair[quotient]), 2);
2412   uii = uii - 100 * quotient;
2413   start = memcpya_k(start, &(kDigitPair[uii]), 2);
2414   if (!remainder) {
2415     return start;
2416   }
2417   *start++ = '.';
2418   *start = '0' + remainder;
2419   return &(start[1]);
2420 }
2421 
dtoa_so8(double dxx,char * start)2422 char* dtoa_so8(double dxx, char* start) {
2423   // 8 sig fig number, 0.99999995 <= dxx < 99999999.5
2424   uint32_t uii;
2425   uint32_t quotient;
2426   uint32_t remainder;
2427   if (dxx < 99.999999499999) {
2428     if (dxx < 9.9999999499999) {
2429       BankerRoundD7(dxx, kBankerRound6, &quotient, &remainder);
2430       return qrtoa_1p7(quotient, remainder, start);
2431     }
2432     BankerRoundD6(dxx, kBankerRound6, &quotient, &remainder);
2433     start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2434     if (!remainder) {
2435       return start;
2436     }
2437     *start++ = '.';
2438     quotient = remainder / 10000;
2439     memcpy_k(start, &(kDigitPair[quotient]), 2);
2440     remainder -= 10000 * quotient;
2441     if (remainder) {
2442       start += 2;
2443     dtoa_so8_pretail4:
2444       quotient = remainder / 100;
2445       memcpy_k(start, &(kDigitPair[quotient]), 2);
2446       remainder -= 100 * quotient;
2447       if (remainder) {
2448         start += 2;
2449       dtoa_so8_pretail2:
2450         memcpy_k(start, &(kDigitPair[remainder]), 2);
2451       }
2452     }
2453   dtoa_so8_tail:
2454     if (start[1] != '0') {
2455       return &(start[2]);
2456     }
2457     return &(start[1]);
2458   }
2459   if (dxx < 9999.9999499999) {
2460     if (dxx < 999.99999499999) {
2461       BankerRoundD5(dxx, kBankerRound6, &uii, &remainder);
2462       quotient = uii / 100;
2463       *start++ = '0' + quotient;
2464       quotient = uii - 100 * quotient;
2465       start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2466       if (!remainder) {
2467         return start;
2468       }
2469       *start++ = '.';
2470       quotient = remainder / 1000;
2471       memcpy_k(start, &(kDigitPair[quotient]), 2);
2472       remainder -= quotient * 1000;
2473       if (!remainder) {
2474         goto dtoa_so8_tail;
2475       }
2476       start += 2;
2477     dtoa_so8_pretail3:
2478       quotient = remainder / 10;
2479       memcpy_k(start, &(kDigitPair[quotient]), 2);
2480       remainder -= quotient * 10;
2481       if (!remainder) {
2482         goto dtoa_so8_tail;
2483       }
2484       start[2] = '0' + remainder;
2485       return &(start[3]);
2486     }
2487     BankerRoundD4(dxx, kBankerRound6, &uii, &remainder);
2488     quotient = uii / 100;
2489     start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2490     quotient = uii - (100 * quotient);
2491     start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2492     if (!remainder) {
2493       return start;
2494     }
2495     *start++ = '.';
2496     goto dtoa_so8_pretail4;
2497   }
2498   if (dxx < 999999.99499999) {
2499     if (dxx < 99999.999499999) {
2500       BankerRoundD3(dxx, kBankerRound6, &uii, &remainder);
2501       quotient = uii / 10000;
2502       *start = '0' + quotient;
2503       uii -= 10000 * quotient;
2504       quotient = uii / 100;
2505       start = memcpya_k(&(start[1]), &(kDigitPair[quotient]), 2);
2506       uii -= 100 * quotient;
2507       start = memcpya_k(start, &(kDigitPair[uii]), 2);
2508       if (!remainder) {
2509         return start;
2510       }
2511       *start++ = '.';
2512       goto dtoa_so8_pretail3;
2513     }
2514     BankerRoundD2(dxx, kBankerRound6, &uii, &remainder);
2515     quotient = uii / 10000;
2516     start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2517     uii -= 10000 * quotient;
2518     quotient = uii / 100;
2519     start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2520     uii -= 100 * quotient;
2521     start = memcpya_k(start, &(kDigitPair[uii]), 2);
2522     if (!remainder) {
2523       return start;
2524     }
2525     *start++ = '.';
2526     goto dtoa_so8_pretail2;
2527   }
2528   if (dxx >= 9999999.9499999) {
2529     return uitoa_z8(BankerRoundD(dxx, kBankerRound6), start);
2530   }
2531   BankerRoundD1(dxx, kBankerRound6, &uii, &remainder);
2532   quotient = uii / 1000000;
2533   *start = '0' + quotient;
2534   uii -= 1000000 * quotient;
2535   quotient = uii / 10000;
2536   start = memcpya_k(&(start[1]), &(kDigitPair[quotient]), 2);
2537   uii -= 10000 * quotient;
2538   quotient = uii / 100;
2539   start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2540   uii -= 100 * quotient;
2541   start = memcpya_k(start, &(kDigitPair[uii]), 2);
2542   if (!remainder) {
2543     return start;
2544   }
2545   *start = '.';
2546   start[1] = '0' + remainder;
2547   return &(start[2]);
2548 }
2549 
dtoa_g(double dxx,char * start)2550 char* dtoa_g(double dxx, char* start) {
2551   uint32_t xp10 = 0;
2552   uint32_t quotient;
2553   uint32_t remainder;
2554   if (dxx != dxx) {
2555     return strcpya_k(start, "nan");
2556   }
2557   if (dxx < 0) {
2558     *start++ = '-';
2559     dxx = -dxx;
2560   }
2561   if (dxx < 9.9999949999999e-5) {
2562     // 6 sig fig exponential notation, small
2563     if (dxx < 9.9999949999999e-16) {
2564       if (dxx < 9.9999949999999e-128) {
2565         if (dxx == 0.0) {
2566           *start = '0';
2567           return &(start[1]);
2568         }
2569         if (dxx < 9.9999949999999e-256) {
2570           dxx *= 1.0e256;
2571           xp10 |= 256;
2572         } else {
2573           dxx *= 1.0e128;
2574           xp10 |= 128;
2575         }
2576       }
2577       if (dxx < 9.9999949999999e-64) {
2578         dxx *= 1.0e64;
2579         xp10 |= 64;
2580       }
2581       if (dxx < 9.9999949999999e-32) {
2582         dxx *= 1.0e32;
2583         xp10 |= 32;
2584       }
2585       if (dxx < 9.9999949999999e-16) {
2586         dxx *= 1.0e16;
2587         xp10 |= 16;
2588       }
2589     }
2590     if (dxx < 9.9999949999999e-8) {
2591       dxx *= 100000000;
2592       xp10 |= 8;
2593     }
2594     if (dxx < 9.9999949999999e-4) {
2595       dxx *= 10000;
2596       xp10 |= 4;
2597     }
2598     if (dxx < 9.9999949999999e-2) {
2599       dxx *= 100;
2600       xp10 |= 2;
2601     }
2602     if (dxx < 9.9999949999999e-1) {
2603       dxx *= 10;
2604       ++xp10;
2605     }
2606     BankerRoundD5(dxx, kBankerRound8, &quotient, &remainder);
2607     start = memcpya_k(qrtoa_1p5(quotient, remainder, start), "e-", 2);
2608     if (xp10 >= 100) {
2609       quotient = xp10 / 100;
2610       *start++ = '0' + quotient;
2611       xp10 -= 100 * quotient;
2612     }
2613     return memcpya_k(start, &(kDigitPair[xp10]), 2);
2614   }
2615   if (dxx >= 999999.49999999) {
2616     // 6 sig fig exponential notation, large
2617     if (dxx >= 9.9999949999999e15) {
2618       if (dxx >= 9.9999949999999e127) {
2619         if (dxx > DBL_MAX) {
2620           return strcpya_k(start, "inf");
2621         }
2622         if (dxx >= 9.9999949999999e255) {
2623           dxx *= 1.0e-256;
2624           xp10 |= 256;
2625         } else {
2626           dxx *= 1.0e-128;
2627           xp10 |= 128;
2628         }
2629       }
2630       if (dxx >= 9.9999949999999e63) {
2631         dxx *= 1.0e-64;
2632         xp10 |= 64;
2633       }
2634       if (dxx >= 9.9999949999999e31) {
2635         dxx *= 1.0e-32;
2636         xp10 |= 32;
2637       }
2638       if (dxx >= 9.9999949999999e15) {
2639         dxx *= 1.0e-16;
2640         xp10 |= 16;
2641       }
2642     }
2643     if (dxx >= 9.9999949999999e7) {
2644       dxx *= 1.0e-8;
2645       xp10 |= 8;
2646     }
2647     if (dxx >= 9.9999949999999e3) {
2648       dxx *= 1.0e-4;
2649       xp10 |= 4;
2650     }
2651     if (dxx >= 9.9999949999999e1) {
2652       dxx *= 1.0e-2;
2653       xp10 |= 2;
2654     }
2655     if (dxx >= 9.9999949999999e0) {
2656       dxx *= 1.0e-1;
2657       xp10++;
2658     }
2659     BankerRoundD5(dxx, kBankerRound8, &quotient, &remainder);
2660     start = memcpya_k(qrtoa_1p5(quotient, remainder, start), "e+", 2);
2661     if (xp10 >= 100) {
2662       quotient = xp10 / 100;
2663       *start++ = '0' + quotient;
2664       xp10 -= 100 * quotient;
2665     }
2666     return memcpya_k(start, &(kDigitPair[xp10]), 2);
2667   }
2668   if (dxx >= 0.99999949999999) {
2669     return dtoa_so6(dxx, start);
2670   }
2671   // 6 sig fig decimal, no less than ~0.0001
2672   start = memcpya_k(start, "0.", 2);
2673   if (dxx < 9.9999949999999e-3) {
2674     dxx *= 100;
2675     start = memcpya_k(start, "00", 2);
2676   }
2677   if (dxx < 9.9999949999999e-2) {
2678     dxx *= 10;
2679     *start++ = '0';
2680   }
2681   return uitoa_trunc6(BankerRoundD(dxx * 1000000, kBankerRound8), start);
2682 }
2683 
dtoa_g_p8(double dxx,char * start)2684 char* dtoa_g_p8(double dxx, char* start) {
2685   uint32_t xp10 = 0;
2686   char wbuf[16];
2687   char* wpos = wbuf;
2688   uint32_t quotient;
2689   uint32_t remainder;
2690   if (dxx != dxx) {
2691     return strcpya_k(start, "nan");
2692   }
2693   if (dxx < 0) {
2694     *wpos++ = '-';
2695     dxx = -dxx;
2696   }
2697   if (dxx < 9.9999999499999e-5) {
2698     // 8 sig fig exponential notation, small
2699     if (dxx < 9.9999999499999e-16) {
2700       if (dxx < 9.9999999499999e-128) {
2701         if (dxx == 0.0) {
2702           *start = '0';
2703           return &(start[1]);
2704         }
2705         if (dxx < 9.9999999499999e-256) {
2706           dxx *= 1.0e256;
2707           xp10 |= 256;
2708         } else {
2709           dxx *= 1.0e128;
2710           xp10 |= 128;
2711         }
2712       }
2713       if (dxx < 9.9999999499999e-64) {
2714         dxx *= 1.0e64;
2715         xp10 |= 64;
2716       }
2717       if (dxx < 9.9999999499999e-32) {
2718         dxx *= 1.0e32;
2719         xp10 |= 32;
2720       }
2721       if (dxx < 9.9999999499999e-16) {
2722         dxx *= 1.0e16;
2723         xp10 |= 16;
2724       }
2725     }
2726     if (dxx < 9.9999999499999e-8) {
2727       dxx *= 100000000;
2728       xp10 |= 8;
2729     }
2730     if (dxx < 9.9999999499999e-4) {
2731       dxx *= 10000;
2732       xp10 |= 4;
2733     }
2734     if (dxx < 9.9999999499999e-2) {
2735       dxx *= 100;
2736       xp10 |= 2;
2737     }
2738     if (dxx < 9.9999999499999e-1) {
2739       dxx *= 10;
2740       ++xp10;
2741     }
2742     BankerRoundD7(dxx, kBankerRound6, &quotient, &remainder);
2743     wpos = qrtoa_1p7(quotient, remainder, wpos);
2744     remainder = wpos - wbuf;
2745     if (xp10 >= 100) {
2746       start = memcpya(start, wbuf, remainder);
2747       quotient = xp10 / 100;
2748       start = memcpyax_k(start, "e-", 2, '0' + quotient);
2749       xp10 -= 100 * quotient;
2750     } else {
2751       start = memcpya(start, wbuf, remainder);
2752       start = memcpya_k(start, "e-", 2);
2753     }
2754     return memcpya_k(start, &(kDigitPair[xp10]), 2);
2755   }
2756   if (dxx >= 99999999.499999) {
2757     // 8 sig fig exponential notation, large
2758     if (dxx >= 9.9999999499999e15) {
2759       if (dxx >= 9.9999999499999e127) {
2760         if (dxx > DBL_MAX) {
2761           if (wpos == wbuf) {
2762             return memcpya(start, " inf", 4);
2763           }
2764           return memcpya(start, "-inf", 4);
2765         }
2766         if (dxx >= 9.9999999499999e255) {
2767           dxx *= 1.0e-256;
2768           xp10 |= 256;
2769         } else {
2770           dxx *= 1.0e-128;
2771           xp10 |= 128;
2772         }
2773       }
2774       if (dxx >= 9.9999999499999e63) {
2775         dxx *= 1.0e-64;
2776         xp10 |= 64;
2777       }
2778       if (dxx >= 9.9999999499999e31) {
2779         dxx *= 1.0e-32;
2780         xp10 |= 32;
2781       }
2782       if (dxx >= 9.9999999499999e15) {
2783         dxx *= 1.0e-16;
2784         xp10 |= 16;
2785       }
2786     }
2787     if (dxx >= 9.9999999499999e7) {
2788       dxx *= 1.0e-8;
2789       xp10 |= 8;
2790     }
2791     if (dxx >= 9.9999999499999e3) {
2792       dxx *= 1.0e-4;
2793       xp10 |= 4;
2794     }
2795     if (dxx >= 9.9999999499999e1) {
2796       dxx *= 1.0e-2;
2797       xp10 |= 2;
2798     }
2799     if (dxx >= 9.9999999499999e0) {
2800       dxx *= 1.0e-1;
2801       ++xp10;
2802     }
2803     BankerRoundD7(dxx, kBankerRound6, &quotient, &remainder);
2804     wpos = qrtoa_1p7(quotient, remainder, wpos);
2805     remainder = wpos - wbuf;
2806     if (xp10 >= 100) {
2807       start = memcpya(start, wbuf, remainder);
2808       quotient = xp10 / 100;
2809       start = memcpyax_k(start, "e+", 2, '0' + quotient);
2810       xp10 -= 100 * quotient;
2811     } else {
2812       start = memcpya(start, wbuf, remainder);
2813       start = memcpya_k(start, "e+", 2);
2814     }
2815     return memcpya_k(start, &(kDigitPair[xp10]), 2);
2816   }
2817   if (dxx >= 0.99999999499999) {
2818     wpos = dtoa_so8(dxx, wpos);
2819   } else {
2820     // 8 sig fig decimal, no less than ~0.0001
2821     wpos = memcpya_k(wpos, "0.", 2);
2822     if (dxx < 9.9999999499999e-3) {
2823       dxx *= 100;
2824       wpos = memcpya_k(wpos, "00", 2);
2825     }
2826     if (dxx < 9.9999999499999e-2) {
2827       dxx *= 10;
2828       *wpos++ = '0';
2829     }
2830     wpos = uitoa_trunc8(BankerRoundD(dxx * 100000000, kBankerRound6), wpos);
2831   }
2832   remainder = wpos - wbuf;
2833   return memcpya(start, wbuf, remainder);
2834 }
2835 
2836 // "prob" means that the number is guaranteed to be in [0, 1].
2837 // no leading space is printed.  trailing zeroes (/decimal point) are erased
2838 //   iff there is equality to ~13 decimal places.
dtoa_f_probp6_spaced(double dxx,char * start)2839 char* dtoa_f_probp6_spaced(double dxx, char* start) {
2840   double dxx_10_6 = dxx * 1000000;
2841   const uint32_t dec_digits = BankerRoundD(dxx_10_6, kBankerRound8);
2842   *start++ = '0' + (dec_digits == 1000000);
2843   *start++ = '.';
2844   start = uitoa_z6(dec_digits, start);
2845   if (fabs(dxx_10_6 - u31tod(dec_digits)) >= 0.00000005) {
2846     return start;
2847   }
2848   TrailingZeroesToSpaces(start);
2849   return start;
2850 }
2851 
dtoa_f_probp6_clipped(double dxx,char * start)2852 char* dtoa_f_probp6_clipped(double dxx, char* start) {
2853   double dxx_10_6 = dxx * 1000000;
2854   const uint32_t dec_digits = BankerRoundD(dxx_10_6, kBankerRound8);
2855   *start++ = '0' + (dec_digits == 1000000);
2856   *start++ = '.';
2857   start = uitoa_z6(dec_digits, start);
2858   if (fabs(dxx_10_6 - u31tod(dec_digits)) >= 0.00000005) {
2859     return start;
2860   }
2861   return ClipTrailingZeroes(start);
2862 }
2863 
2864 /*
2865 char* dtoa_f_p5_clipped(double dxx, char* start) {
2866   if (dxx != dxx) {
2867     return strcpya_k(start, "nan");
2868   }
2869   if (dxx < 0.0) {
2870     // note that "-0" will be printed for very small negative numbers; do we
2871     // want this?
2872     *start++ = '-';
2873     dxx = -dxx;
2874   }
2875 #ifdef __LP64__
2876   if (dxx < 4294967295.999994) {
2877     // We could use different levels of banker's rounding for different-size
2878     // quotients, but that's overkill for now; revisit after basic dosage
2879     // support is working.
2880     dxx *= 100000;
2881     uint64_t remainder = (int64_t)dxx;
2882     remainder += (int64_t)((dxx - ((int64_t)remainder)) + kBankerRound6[remainder & 1]);
2883     uint64_t quotient = remainder / 100000;
2884     remainder = remainder - quotient * 100000;
2885     start = u32toa(quotient, start);
2886     return rtoa_p5(remainder, start);
2887   }
2888 #else
2889   if (dxx < 2147483647.999994) {
2890     // avoid 64-bit integer math in 32-bit build.
2891     // (todo: a bit of benchmarking)
2892     const uintptr_t quotient = (intptr_t)dxx;
2893     const double remainder_d = (dxx - ((intptr_t)quotient)) * 100000;
2894     const uint32_t remainder_d_trunc = (int32_t)remainder_d;
2895     const uint32_t remainder = (int32_t)(remainder_d + kBankerRound6[remainder_d_trunc & 1]);
2896     start = u32toa(quotient, start);
2897     return rtoa_p5(remainder, start);
2898   }
2899 #endif
2900   if (dxx == INFINITY) {
2901     return strcpya_k(start, "inf");
2902   }
2903   // just punt larger numbers to glibc for now, this isn't a bottleneck
2904   start += sprintf(start, "%.5f", dxx);
2905   // .5f doesn't strip trailing zeroes, do that manually
2906   for (uint32_t uii = 0; uii != 5; ++uii) {
2907     if (start[-1] != '0') {
2908       return start;
2909     }
2910     --start;
2911   }
2912   return &(start[-1]);  // strip the decimal point
2913 }
2914 */
2915 
2916 // todo: benchmark the exponential-notation part of this vs. dtoa_g(); maybe
2917 // dtoa_g() should actually call this (or at least the exponential-notation
2918 // part, put into its own function) for the most extreme values?
lntoa_g(double ln_val,char * start)2919 char* lntoa_g(double ln_val, char* start) {
2920   // log(999999.49999999)
2921   if (ln_val < 13.81551005796414) {
2922     // log(9.9999949999999e-5)
2923     if (ln_val > -9.210340871976317) {
2924       // No exponential notation.
2925 
2926       // log(0.99999949999999)
2927       if (ln_val > -5.000001349509205e-7) {
2928         // may as well fast-path x=1; since the most common use-case for this
2929         // function is p-value printing, x=1 should happen a lot more than x>1.
2930         // log(1.0000050000001)
2931         if (ln_val < 4.999987599993995e-6) {
2932           *start++ = '1';
2933           return start;
2934         }
2935         return dtoa_so6(exp(ln_val), start);
2936       }
2937       double dxx = exp(ln_val);
2938       // 6 sig fig decimal, no less than ~0.0001
2939       start = memcpya_k(start, "0.", 2);
2940       if (dxx < 9.9999949999999e-3) {
2941         dxx *= 100;
2942         start = memcpya_k(start, "00", 2);
2943       }
2944       if (dxx < 9.9999949999999e-2) {
2945         dxx *= 10;
2946         *start++ = '0';
2947       }
2948       return uitoa_trunc6(BankerRoundD(dxx * 1000000, kBankerRound8), start);
2949     }
2950     // if exponent is in danger of overflowing int32, just print '0'
2951     if (ln_val < 0x7ffffffb * (-kLn10)) {
2952       *start++ = '0';
2953       return start;
2954     }
2955   } else {
2956     // if exponent is in danger of overflowing int32, just print 'inf'
2957     if (ln_val > 0x7ffffffb * kLn10) {
2958       return strcpya_k(start, "inf");
2959     }
2960   }
2961   int32_t xp10 = S_CAST(int32_t, (5.000001349509205e-7 + ln_val) * kRecipLn10);
2962   double mantissa = exp(xp10 * (-kLn10) + ln_val);
2963   // mantissa will usually be in [.9999995, 9.999995], but |ln_val| can be
2964   // larger than 2^32, and floating point errors in either direction are
2965   // definitely possible (<20 bits of precision).
2966   if (mantissa < 0.99999949999999) {
2967     mantissa *= 10;
2968     xp10 -= 1;
2969   } else if (mantissa > 9.9999949999999) {
2970     mantissa *= 0.1;
2971     xp10 += 1;
2972   }
2973   uint32_t quotient;
2974   uint32_t remainder;
2975   BankerRoundD5(mantissa, kBankerRound8, &quotient, &remainder);
2976   start = qrtoa_1p5(quotient, remainder, start);
2977   if (xp10 < 0) {
2978     start = memcpya_k(start, "e-", 2);
2979     if (xp10 > -10) {
2980       *start++ = '0';
2981     }
2982     return u32toa(-xp10, start);
2983   }
2984   start = memcpya_k(start, "e+", 2);
2985   if (xp10 < 10) {
2986     *start++ = '0';
2987   }
2988   return u32toa(xp10, start);
2989 }
2990 
2991 // Previously had specialized float-printing functions, but upon reflection it
2992 // makes sense to just promote to double like printf does.
2993 
2994 
ScanForDuplicateIds(const char * sorted_ids,uintptr_t id_ct,uintptr_t max_id_blen)2995 CXXCONST_CP ScanForDuplicateIds(const char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_blen) {
2996   --id_ct;
2997   for (uintptr_t id_idx = 0; id_idx != id_ct; ++id_idx) {
2998     if (strequal_overread(&(sorted_ids[id_idx * max_id_blen]), &(sorted_ids[(id_idx + 1) * max_id_blen]))) {
2999       return S_CAST(CXXCONST_CP, &(sorted_ids[id_idx * max_id_blen]));
3000     }
3001   }
3002   return nullptr;
3003 }
3004 
CollapseDuplicateIds(uintptr_t id_ct,uintptr_t max_id_blen,char * sorted_ids,uint32_t * id_starts)3005 uint32_t CollapseDuplicateIds(uintptr_t id_ct, uintptr_t max_id_blen, char* sorted_ids, uint32_t* id_starts) {
3006   // Collapses array of sorted IDs to remove duplicates, and writes
3007   // pre-collapse positions to id_starts (so e.g. duplication count of any
3008   // sample ID can be determined via subtraction) if it isn't nullptr.
3009   // Returns id_ct of collapsed array.
3010   if (!id_ct) {
3011     return 0;
3012   }
3013   uintptr_t read_idx = 1;
3014   uintptr_t write_idx;
3015   if (id_starts) {
3016     id_starts[0] = 0;
3017     for (; read_idx != id_ct; ++read_idx) {
3018       if (strequal_overread(&(sorted_ids[(read_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
3019         break;
3020       }
3021       id_starts[read_idx] = read_idx;
3022     }
3023     write_idx = read_idx;
3024     while (++read_idx < id_ct) {
3025       // this loop can probably be improved with string-length tracking...
3026       if (!strequal_overread(&(sorted_ids[(write_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
3027         strcpy(&(sorted_ids[write_idx * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]));
3028         id_starts[write_idx++] = read_idx;
3029       }
3030     }
3031   } else {
3032     for (; read_idx != id_ct; ++read_idx) {
3033       if (strequal_overread(&(sorted_ids[(read_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
3034         break;
3035       }
3036     }
3037     write_idx = read_idx;
3038     while (++read_idx < id_ct) {
3039       if (!strequal_overread(&(sorted_ids[(write_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
3040         strcpy(&(sorted_ids[write_idx * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]));
3041         ++write_idx;
3042       }
3043     }
3044   }
3045   return write_idx;
3046 }
3047 
3048 
bsearch_str(const char * idbuf,const char * sorted_strbox,uintptr_t cur_id_slen,uintptr_t max_id_blen,uintptr_t end_idx)3049 int32_t bsearch_str(const char* idbuf, const char* sorted_strbox, uintptr_t cur_id_slen, uintptr_t max_id_blen, uintptr_t end_idx) {
3050   // does not assume null-terminated idbuf, or nonempty array.
3051   if (cur_id_slen >= max_id_blen) {
3052     return -1;
3053   }
3054   uintptr_t start_idx = 0;
3055   while (start_idx < end_idx) {
3056     const uintptr_t mid_idx = (start_idx + end_idx) / 2;
3057     const int32_t ii = Memcmp(idbuf, &(sorted_strbox[mid_idx * max_id_blen]), cur_id_slen);
3058     if (ii > 0) {
3059       start_idx = mid_idx + 1;
3060     } else if ((ii < 0) || sorted_strbox[mid_idx * max_id_blen + cur_id_slen]) {
3061       end_idx = mid_idx;
3062     } else {
3063       return S_CAST(uint32_t, mid_idx);
3064     }
3065   }
3066   return -1;
3067 }
3068 
bsearch_str_natural(const char * idbuf,const char * sorted_strbox,uintptr_t max_id_blen,uintptr_t end_idx)3069 int32_t bsearch_str_natural(const char* idbuf, const char* sorted_strbox, uintptr_t max_id_blen, uintptr_t end_idx) {
3070   // unlike bsearch_str(), caller is responsible for slen >= max_id_blen check
3071   // if appropriate here
3072   uintptr_t start_idx = 0;
3073   while (start_idx < end_idx) {
3074     const uintptr_t mid_idx = (start_idx + end_idx) / 2;
3075     const int32_t ii = strcmp_natural(idbuf, &(sorted_strbox[mid_idx * max_id_blen]));
3076     if (ii > 0) {
3077       start_idx = mid_idx + 1;
3078     } else if (ii < 0) {
3079       end_idx = mid_idx;
3080     } else {
3081       return S_CAST(uint32_t, mid_idx);
3082     }
3083   }
3084   return -1;
3085 }
3086 
bsearch_str_lb(const char * idbuf,const char * sorted_strbox,uintptr_t cur_id_slen,uintptr_t max_id_blen,uintptr_t end_idx)3087 uintptr_t bsearch_str_lb(const char* idbuf, const char* sorted_strbox, uintptr_t cur_id_slen, uintptr_t max_id_blen, uintptr_t end_idx) {
3088   // returns number of elements in sorted_strbox[] less than idbuf.
3089   if (cur_id_slen > max_id_blen) {
3090     cur_id_slen = max_id_blen;
3091   }
3092   uintptr_t start_idx = 0;
3093   while (start_idx < end_idx) {
3094     const uintptr_t mid_idx = (start_idx + end_idx) / 2;
3095     if (Memcmp(idbuf, &(sorted_strbox[mid_idx * max_id_blen]), cur_id_slen) > 0) {
3096       start_idx = mid_idx + 1;
3097     } else {
3098       end_idx = mid_idx;
3099     }
3100   }
3101   return start_idx;
3102 }
3103 
ExpsearchStrLb(const char * idbuf,const char * sorted_strbox,uintptr_t cur_id_slen,uintptr_t max_id_blen,uintptr_t end_idx,uintptr_t cur_idx)3104 uintptr_t ExpsearchStrLb(const char* idbuf, const char* sorted_strbox, uintptr_t cur_id_slen, uintptr_t max_id_blen, uintptr_t end_idx, uintptr_t cur_idx) {
3105   uintptr_t next_incr = 1;
3106   uintptr_t start_idx = cur_idx;
3107   while (cur_idx < end_idx) {
3108     if (Memcmp(idbuf, &(sorted_strbox[cur_idx * max_id_blen]), cur_id_slen) <= 0) {
3109       end_idx = cur_idx;
3110       break;
3111     }
3112     start_idx = cur_idx + 1;
3113     cur_idx += next_incr;
3114     next_incr *= 2;
3115   }
3116   while (start_idx < end_idx) {
3117     const uintptr_t mid_idx = (start_idx + end_idx) / 2;
3118     if (Memcmp(idbuf, &(sorted_strbox[mid_idx * max_id_blen]), cur_id_slen) > 0) {
3119       start_idx = mid_idx + 1;
3120     } else {
3121       end_idx = mid_idx;
3122     }
3123   }
3124   return start_idx;
3125 }
3126 
ExpsearchNsortStrLb(const char * idbuf,const char * nsorted_strbox,uintptr_t max_id_blen,uintptr_t end_idx,uintptr_t cur_idx)3127 uintptr_t ExpsearchNsortStrLb(const char* idbuf, const char* nsorted_strbox, uintptr_t max_id_blen, uintptr_t end_idx, uintptr_t cur_idx) {
3128   uintptr_t next_incr = 1;
3129   uintptr_t start_idx = cur_idx;
3130   while (cur_idx < end_idx) {
3131     if (strcmp_natural(idbuf, &(nsorted_strbox[cur_idx * max_id_blen])) <= 0) {
3132       end_idx = cur_idx;
3133       break;
3134     }
3135     start_idx = cur_idx + 1;
3136     cur_idx += next_incr;
3137     next_incr *= 2;
3138   }
3139   while (start_idx < end_idx) {
3140     const uintptr_t mid_idx = (start_idx + end_idx) / 2;
3141     if (strcmp_natural(idbuf, &(nsorted_strbox[mid_idx * max_id_blen])) > 0) {
3142       start_idx = mid_idx + 1;
3143     } else {
3144       end_idx = mid_idx;
3145     }
3146   }
3147   return start_idx;
3148 }
3149 
IsInfStr(const char * ss,uint32_t slen,uint32_t * is_neg_ptr)3150 uint32_t IsInfStr(const char* ss, uint32_t slen, uint32_t* is_neg_ptr) {
3151   const char first_char = *ss;
3152   if (first_char == '-') {
3153     *is_neg_ptr = 1;
3154     ++ss;
3155     --slen;
3156   } else if (first_char == '+') {
3157     ++ss;
3158     --slen;
3159   }
3160   if (slen == 3) {
3161     uint32_t four_chars;
3162     memcpy(&four_chars, ss, 4);  // OVERREAD
3163     // assumes little-endian
3164     return ((four_chars & 0xdfdfdf) == 0x464e49);
3165   }
3166   if (slen != 8) {
3167     return 0;
3168   }
3169   uint64_t eight_chars;
3170   memcpy(&eight_chars, ss, 8);
3171   return ((eight_chars & 0xdfdfdfdfdfdfdfdfLLU) == 0x5954494e49464e49LLU);
3172 }
3173 
3174 
3175 #ifdef __LP64__
Memrchr(const char * str_start,char needle,uintptr_t slen)3176 CXXCONST_CP Memrchr(const char* str_start, char needle, uintptr_t slen) {
3177   const VecI8 vvec_all_needle = veci8_set1(needle);
3178   const uintptr_t str_end_addr = R_CAST(uintptr_t, str_start) + slen;
3179   const uint32_t trailing_byte_ct = str_end_addr % kBytesPerVec;
3180   const VecI8* str_rev_viter = R_CAST(const VecI8*, RoundDownPow2(str_end_addr, kBytesPerVec));
3181   if (trailing_byte_ct) {
3182     // This is a GNU vector extension parallel-equality check, which gets
3183     // translated to e.g. _mm256_cmpeq_epi8().
3184     // As long as we're performing aligned reads, it's safe to read bytes
3185     // beyond str_end as long as they're in the same vector; we only risk
3186     // violating process read permissions if we cross a page boundary.
3187     // (For this reason, I don't bother with AVX unaligned reads.)
3188     const VecI8 match_vvec = (*str_rev_viter == vvec_all_needle);
3189     uint32_t matching_bytes = veci8_movemask(match_vvec);
3190     matching_bytes &= (1U << (trailing_byte_ct % kBytesPerVec)) - 1;
3191     if (str_start > R_CAST(const char*, str_rev_viter)) {
3192       const uint32_t leading_byte_ct = R_CAST(uintptr_t, str_start) % kBytesPerVec;
3193       matching_bytes &= -(1U << leading_byte_ct);
3194       // Special-case this, since main_loop_iter_ct below would underflow.
3195       if (!matching_bytes) {
3196         return nullptr;
3197       }
3198     }
3199     if (matching_bytes) {
3200       const uint32_t byte_offset_in_vec = bsru32(matching_bytes);
3201       return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3202     }
3203   }
3204   const uintptr_t main_loop_iter_ct = (R_CAST(uintptr_t, str_rev_viter) - R_CAST(uintptr_t, str_start)) / (2 * kBytesPerVec);
3205   for (uintptr_t ulii = 0; ulii != main_loop_iter_ct; ++ulii) {
3206     // For long lines, looping over two vectors at a time is most efficient on
3207     // my Mac (also tried 1 and 4).
3208     --str_rev_viter;
3209     const VecI8 match_vvec1 = (*str_rev_viter == vvec_all_needle);
3210     --str_rev_viter;
3211     const VecI8 match_vvec0 = (*str_rev_viter == vvec_all_needle);
3212     const uint32_t matching_bytes = veci8_movemask(match_vvec1 | match_vvec0);
3213     if (matching_bytes) {
3214       const uint32_t matching_bytes1 = veci8_movemask(match_vvec1);
3215       if (matching_bytes1) {
3216         const uint32_t byte_offset_in_vec = bsru32(matching_bytes1);
3217         return &(R_CAST(CXXCONST_CP, &(str_rev_viter[1]))[byte_offset_in_vec]);
3218       }
3219       const uint32_t byte_offset_in_vec = bsru32(matching_bytes);
3220       return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3221     }
3222   }
3223   while (1) {
3224     uintptr_t remaining_byte_ct_underflow = R_CAST(uintptr_t, str_rev_viter) - R_CAST(uintptr_t, str_start);
3225     if (S_CAST(intptr_t, remaining_byte_ct_underflow) <= 0) {
3226       return nullptr;
3227     }
3228     --str_rev_viter;
3229     const VecI8 match_vvec = (*str_rev_viter == vvec_all_needle);
3230     const uint32_t matching_bytes = veci8_movemask(match_vvec);
3231     if (matching_bytes) {
3232       const uint32_t byte_offset_in_vec = bsru32(matching_bytes);
3233       if (byte_offset_in_vec + remaining_byte_ct_underflow < kBytesPerVec) {
3234         return nullptr;
3235       }
3236       return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3237     }
3238   }
3239 }
3240 
LastSpaceOrEoln(const char * str_start,uintptr_t slen)3241 CXXCONST_CP LastSpaceOrEoln(const char* str_start, uintptr_t slen) {
3242   // See TokenLexK0().
3243   const VecUc vvec_all95 = vecuc_set1(95);
3244   const uintptr_t str_end_addr = R_CAST(uintptr_t, str_start) + slen;
3245   const uint32_t trailing_byte_ct = str_end_addr % kBytesPerVec;
3246   const VecUc* str_rev_viter = R_CAST(const VecUc*, RoundDownPow2(str_end_addr, kBytesPerVec));
3247   if (trailing_byte_ct) {
3248     // As long as we're performing aligned reads, it's safe to read bytes
3249     // beyond str_end as long as they're in the same vector; we only risk
3250     // violating process read permissions if we cross a page boundary.
3251     const VecUc postspace_vvec = vecuc_adds(*str_rev_viter, vvec_all95);
3252     uint32_t nontoken_bytes = S_CAST(Vec8thUint, ~vecuc_movemask(postspace_vvec));
3253     nontoken_bytes &= (1U << (trailing_byte_ct % kBytesPerVec)) - 1;
3254     if (str_start > R_CAST(const char*, str_rev_viter)) {
3255       const uint32_t leading_byte_ct = R_CAST(uintptr_t, str_start) % kBytesPerVec;
3256       nontoken_bytes &= -(1U << leading_byte_ct);
3257       // Special-case this, since main_loop_iter_ct below would underflow.
3258       if (!nontoken_bytes) {
3259         return nullptr;
3260       }
3261     }
3262     if (nontoken_bytes) {
3263       const uint32_t byte_offset_in_vec = bsru32(nontoken_bytes);
3264       return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3265     }
3266   }
3267   const uintptr_t main_loop_iter_ct = (R_CAST(uintptr_t, str_rev_viter) - R_CAST(uintptr_t, str_start)) / (2 * kBytesPerVec);
3268   for (uintptr_t ulii = 0; ulii != main_loop_iter_ct; ++ulii) {
3269     --str_rev_viter;
3270     const VecUc postspace_vvec1 = vecuc_adds(*str_rev_viter, vvec_all95);
3271     --str_rev_viter;
3272     const VecUc postspace_vvec0 = vecuc_adds(*str_rev_viter, vvec_all95);
3273     const uint32_t nontoken_bytes = S_CAST(Vec8thUint, ~vecuc_movemask(postspace_vvec1 & postspace_vvec0));
3274     if (nontoken_bytes) {
3275       const uint32_t nontoken_bytes1 = S_CAST(Vec8thUint, ~vecuc_movemask(postspace_vvec1));
3276       if (nontoken_bytes1) {
3277         const uint32_t byte_offset_in_vec = bsru32(nontoken_bytes1);
3278         return &(R_CAST(CXXCONST_CP, &(str_rev_viter[1]))[byte_offset_in_vec]);
3279       }
3280       const uint32_t byte_offset_in_vec = bsru32(nontoken_bytes);
3281       return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3282     }
3283   }
3284   while (1) {
3285     uintptr_t remaining_byte_ct_underflow = R_CAST(uintptr_t, str_rev_viter) - R_CAST(uintptr_t, str_start);
3286     if (S_CAST(intptr_t, remaining_byte_ct_underflow) <= 0) {
3287       return nullptr;
3288     }
3289     --str_rev_viter;
3290     const VecUc postspace_vvec = vecuc_adds(*str_rev_viter, vvec_all95);
3291     const uint32_t nontoken_bytes = S_CAST(Vec8thUint, ~vecuc_movemask(postspace_vvec));
3292     if (nontoken_bytes) {
3293       const uint32_t byte_offset_in_vec = bsru32(nontoken_bytes);
3294       if (byte_offset_in_vec + remaining_byte_ct_underflow < kBytesPerVec) {
3295         return nullptr;
3296       }
3297       return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3298     }
3299   }
3300 }
3301 #endif  // __LP64__
3302 
3303 /*
3304 void ReplaceAllInstances(char old_char, char new_char, uint32_t slen, char* dst) {
3305   // probable todo: add SIMD version
3306   while (1) {
3307     char* strchr_result = memchr(dst, old_char, slen);
3308     if (!strchr_result) {
3309       return;
3310     }
3311     *strchr_result++ = new_char;
3312     slen -= strchr_result - dst;
3313     dst = strchr_result;
3314   }
3315 }
3316 */
3317 
TabsToSpaces(char * ss_iter)3318 void TabsToSpaces(char* ss_iter) {
3319   while (1) {
3320     ss_iter = strchr(ss_iter, '\t');
3321     if (!ss_iter) {
3322       return;
3323     }
3324     *ss_iter++ = ' ';
3325   }
3326 }
3327 
ReplaceCharAdvChecked(char old_char,char new_char,char ** str_ptr)3328 BoolErr ReplaceCharAdvChecked(char old_char, char new_char, char** str_ptr) {
3329   char* token_end;
3330   for (char* str_iter = *str_ptr; ; str_iter = &(token_end[1])) {
3331     token_end = strchrnul2(str_iter, old_char, new_char);
3332     if ((*token_end) != old_char) {
3333       if (likely(!(*token_end))) {
3334         *str_ptr = token_end;
3335         return 0;
3336       }
3337       return 1;
3338     }
3339     *token_end = new_char;
3340   }
3341 }
3342 
3343 #ifdef __cplusplus
3344 }  // namespace plink2
3345 #endif
3346