1 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This library is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU Lesser General Public License as published by the
6 // Free Software Foundation; either version 3 of the License, or (at your
7 // option) any later version.
8 //
9 // This library is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
12 // for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with this library. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "plink2_string.h"
19
20 #ifdef __cplusplus
21 namespace plink2 {
22 #endif
23
24 #if defined(__LP64__) && !defined(_GNU_SOURCE)
rawmemchr(const void * ss,int cc)25 CXXCONST_VOIDP rawmemchr(const void* ss, int cc) {
26 const uintptr_t starting_addr = R_CAST(uintptr_t, ss);
27 const VecI8* ss_viter = R_CAST(const VecI8*, RoundDownPow2(starting_addr, kBytesPerVec));
28 const VecI8 vvec_all_needle = veci8_set1(cc);
29 VecI8 cur_vvec = *ss_viter;
30 VecI8 needle_match_vvec = (cur_vvec == vvec_all_needle);
31 uint32_t matching_bytes = veci8_movemask(needle_match_vvec);
32 const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, ss_viter);
33 matching_bytes &= UINT32_MAX << leading_byte_ct;
34 // This is typically short-range, so the Memrchr() double-vector strategy is
35 // unlikely to be profitable. (todo: experiment with header-inline)
36 while (!matching_bytes) {
37 ++ss_viter;
38 cur_vvec = *ss_viter;
39 needle_match_vvec = (cur_vvec == vvec_all_needle);
40 matching_bytes = veci8_movemask(needle_match_vvec);
41 }
42 const uint32_t byte_offset_in_vec = ctzu32(matching_bytes);
43 return R_CAST(CXXCONST_VOIDP, R_CAST(uintptr_t, ss_viter) + byte_offset_in_vec);
44 }
45 #endif
46
47 #ifdef __LP64__
rawmemchr2(const void * ss,unsigned char ucc1,unsigned char ucc2)48 CXXCONST_VOIDP rawmemchr2(const void* ss, unsigned char ucc1, unsigned char ucc2) {
49 const uintptr_t starting_addr = R_CAST(uintptr_t, ss);
50 const VecI8* ss_viter = R_CAST(const VecI8*, RoundDownPow2(starting_addr, kBytesPerVec));
51 const VecI8 vvec_all_ucc1 = veci8_set1(ucc1);
52 const VecI8 vvec_all_ucc2 = veci8_set1(ucc2);
53 VecI8 cur_vvec = *ss_viter;
54 VecI8 ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
55 VecI8 ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
56 uint32_t matching_bytes = veci8_movemask(ucc1_match_vvec | ucc2_match_vvec);
57 const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, ss_viter);
58 matching_bytes &= UINT32_MAX << leading_byte_ct;
59 while (!matching_bytes) {
60 ++ss_viter;
61 cur_vvec = *ss_viter;
62 ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
63 ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
64 matching_bytes = veci8_movemask(ucc1_match_vvec | ucc2_match_vvec);
65 }
66 const uint32_t byte_offset_in_vec = ctzu32(matching_bytes);
67 return &(R_CAST(CXXCONST_CP, ss_viter)[byte_offset_in_vec]);
68 }
69
rawmemchr3(const void * ss,unsigned char ucc1,unsigned char ucc2,unsigned char ucc3)70 CXXCONST_VOIDP rawmemchr3(const void* ss, unsigned char ucc1, unsigned char ucc2, unsigned char ucc3) {
71 const uintptr_t starting_addr = R_CAST(uintptr_t, ss);
72 const VecI8* ss_viter = R_CAST(const VecI8*, RoundDownPow2(starting_addr, kBytesPerVec));
73 const VecI8 vvec_all_ucc1 = veci8_set1(ucc1);
74 const VecI8 vvec_all_ucc2 = veci8_set1(ucc2);
75 const VecI8 vvec_all_ucc3 = veci8_set1(ucc3);
76 VecI8 cur_vvec = *ss_viter;
77 VecI8 ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
78 VecI8 ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
79 VecI8 ucc3_match_vvec = (cur_vvec == vvec_all_ucc3);
80 uint32_t matching_bytes = veci8_movemask(ucc1_match_vvec | ucc2_match_vvec | ucc3_match_vvec);
81 const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, ss_viter);
82 matching_bytes &= UINT32_MAX << leading_byte_ct;
83 while (!matching_bytes) {
84 ++ss_viter;
85 cur_vvec = *ss_viter;
86 ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
87 ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
88 ucc3_match_vvec = (cur_vvec == vvec_all_ucc3);
89 matching_bytes = veci8_movemask(ucc1_match_vvec | ucc2_match_vvec | ucc3_match_vvec);
90 }
91 const uint32_t byte_offset_in_vec = ctzu32(matching_bytes);
92 return &(R_CAST(CXXCONST_CP, ss_viter)[byte_offset_in_vec]);
93 }
94
strchrnul3(const char * ss,unsigned char ucc1,unsigned char ucc2,unsigned char ucc3)95 CXXCONST_CP strchrnul3(const char* ss, unsigned char ucc1, unsigned char ucc2, unsigned char ucc3) {
96 const uintptr_t starting_addr = R_CAST(uintptr_t, ss);
97 const VecI8* ss_viter = R_CAST(const VecI8*, RoundDownPow2(starting_addr, kBytesPerVec));
98 const VecI8 vvec_all_zero = veci8_setzero();
99 const VecI8 vvec_all_ucc1 = veci8_set1(ucc1);
100 const VecI8 vvec_all_ucc2 = veci8_set1(ucc2);
101 const VecI8 vvec_all_ucc3 = veci8_set1(ucc3);
102 VecI8 cur_vvec = *ss_viter;
103 VecI8 zero_match_vvec = (cur_vvec == vvec_all_zero);
104 VecI8 ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
105 VecI8 ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
106 VecI8 ucc3_match_vvec = (cur_vvec == vvec_all_ucc3);
107 uint32_t matching_bytes = veci8_movemask(zero_match_vvec | ucc1_match_vvec | ucc2_match_vvec | ucc3_match_vvec);
108 const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, ss_viter);
109 matching_bytes &= UINT32_MAX << leading_byte_ct;
110 while (!matching_bytes) {
111 ++ss_viter;
112 cur_vvec = *ss_viter;
113 zero_match_vvec = (cur_vvec == vvec_all_zero);
114 ucc1_match_vvec = (cur_vvec == vvec_all_ucc1);
115 ucc2_match_vvec = (cur_vvec == vvec_all_ucc2);
116 ucc3_match_vvec = (cur_vvec == vvec_all_ucc3);
117 matching_bytes = veci8_movemask(zero_match_vvec | ucc1_match_vvec | ucc2_match_vvec | ucc3_match_vvec);
118 }
119 const uint32_t byte_offset_in_vec = ctzu32(matching_bytes);
120 return &(R_CAST(CXXCONST_CP, ss_viter)[byte_offset_in_vec]);
121 }
122 #else
rawmemchr2(const void * ss,unsigned char ucc1,unsigned char ucc2)123 CXXCONST_VOIDP rawmemchr2(const void* ss, unsigned char ucc1, unsigned char ucc2) {
124 for (const unsigned char* ss_iter = S_CAST(const unsigned char*, ss); ; ++ss_iter) {
125 unsigned char ucc = *ss_iter;
126 if ((ucc == ucc1) || (ucc == ucc2)) {
127 return S_CAST(CXXCONST_VOIDP, ss_iter);
128 }
129 }
130 }
131
rawmemchr3(const void * ss,unsigned char ucc1,unsigned char ucc2,unsigned char ucc3)132 CXXCONST_VOIDP rawmemchr3(const void* ss, unsigned char ucc1, unsigned char ucc2, unsigned char ucc3) {
133 for (const unsigned char* ss_iter = S_CAST(const unsigned char*, ss); ; ++ss_iter) {
134 unsigned char ucc = *ss_iter;
135 if ((ucc == ucc1) || (ucc == ucc2) || (ucc == ucc3)) {
136 return S_CAST(CXXCONST_VOIDP, ss_iter);
137 }
138 }
139 }
140
strchrnul3(const char * ss,unsigned char ucc1,unsigned char ucc2,unsigned char ucc3)141 CXXCONST_CP strchrnul3(const char* ss, unsigned char ucc1, unsigned char ucc2, unsigned char ucc3) {
142 for (; ; ++ss) {
143 unsigned char ucc = *ss;
144 // at this point, checking against a precomputed 256-byte array may be
145 // better? test this.
146 if ((!ucc) || (ucc == ucc1) || (ucc == ucc2) || (ucc == ucc3)) {
147 return S_CAST(CXXCONST_CP, ss);
148 }
149 }
150 }
151 #endif
152
WordWrap(uint32_t suffix_len,char * strbuf)153 void WordWrap(uint32_t suffix_len, char* strbuf) {
154 // Input: A null-terminated string with no intermediate newlines. If
155 // suffix_len is zero, there should be a terminating \n; otherwise,
156 // the last character should be a space. The allocation the string is
157 // part of must include at least ~80 bytes past the string end.
158 // Effect: Spaces are replaced with newlines in a manner that plays well with
159 // 80 column terminal windows. (Multi-space blocks are never
160 // collapsed.)
161 // Considered UTF-8 awareness, but then decided against it after reading
162 // https://denisbider.blogspot.com/2015/09/when-monospace-fonts-arent-unicode.html .
163 char* token_start = strbuf;
164 char* line_end = &(strbuf[79]);
165 char* token_end;
166 while (1) {
167 while (*token_start == ' ') {
168 ++token_start;
169 }
170 if (token_start > line_end) {
171 do {
172 *line_end = '\n';
173 line_end = &(line_end[80]);
174 } while (token_start > line_end);
175 }
176 token_end = strchr(token_start, ' ');
177 if (!token_end) {
178 if (&(token_start[79]) == line_end) {
179 return;
180 }
181 token_end = strnul(token_start);
182 if (!suffix_len) {
183 if (token_end <= &(line_end[1])) {
184 // okay if end-of-string is one past the end, because function
185 // assumes last character is \n in suffix_len == 0 case
186 assert(token_end[-1] == '\n');
187 return;
188 }
189 } else {
190 if (&(token_end[suffix_len]) <= line_end) {
191 return;
192 }
193 // because of terminal space assumption, token_start actually points
194 // to the end of the string
195 assert(token_start[-1] == ' ');
196 }
197 token_start[-1] = '\n';
198 return;
199 }
200 if (token_end > line_end) {
201 if (&(token_start[79]) != line_end) {
202 token_start[-1] = '\n';
203 line_end = &(token_start[79]);
204 if (token_end > line_end) {
205 // single really long token, can't do anything beyond putting it on
206 // its own line
207 *token_end = '\n';
208 line_end = &(token_end[80]);
209 }
210 } else {
211 // single really long token, *and* previous token was either
212 // nonexistent or long
213 *token_end = '\n';
214 line_end = &(token_end[80]);
215 }
216 }
217 token_start = &(token_end[1]);
218 }
219 }
220
221
222 static const uint32_t kPow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000};
223
224 #ifdef USE_AVX2
225 static const unsigned char kLzUintSlenBase[] =
226 {9, 9, 9, 8,
227 8, 8, 7, 7,
228 7, 6, 6, 6,
229 6, 5, 5, 5,
230 4, 4, 4, 3,
231 3, 3, 3, 2,
232 2, 2, 1, 1,
233 1, 0, 0, 0,
234 1}; // UintSlen(0) needs to be 1, not zero
235
UintSlen(uint32_t num)236 uint32_t UintSlen(uint32_t num) {
237 const uint32_t lz_ct = _lzcnt_u32(num);
238 const uint32_t slen_base = kLzUintSlenBase[lz_ct];
239 return slen_base + (num >= kPow10[slen_base]);
240 }
241 #else
242 // could also use something like ((32 - lz_ct) * 77) >> 8, since 77/256 is a
243 // sufficiently good approximation of ln(2)/ln(10), but that's a bit slower and
244 // this table doesn't take much space
245 //
246 // bugfix (29 Mar 2018): this table was totally wrong, ugh
247 static const unsigned char kBsrUintSlenBase[] =
248 {1, 1, 1, 1,
249 2, 2, 2, 3,
250 3, 3, 4, 4,
251 4, 4, 5, 5,
252 5, 6, 6, 6,
253 7, 7, 7, 7,
254 8, 8, 8, 9,
255 9, 9, 9, 9};
256
UintSlen(uint32_t num)257 uint32_t UintSlen(uint32_t num) {
258 // tried divide-by-10 and divide-by-100 loops, they were slower
259 // also tried a hardcoded binary tree, it was better but still slower
260
261 // bsru32(0) is undefined
262 if (num < 10) {
263 return 1;
264 }
265 const uint32_t top_bit_pos = bsru32(num);
266 const uint32_t slen_base = kBsrUintSlenBase[top_bit_pos];
267 return slen_base + (num >= kPow10[slen_base]);
268 }
269 #endif
270
FirstUnequal4(const char * s1,const char * s2,uintptr_t slen)271 uintptr_t FirstUnequal4(const char* s1, const char* s2, uintptr_t slen) {
272 // See memequal() in plink2_base.
273 #ifdef __LP64__
274 if (slen < kBytesPerVec) {
275 if (slen < kBytesPerWord) {
276 uint32_t xor_result = (*R_CAST(const uint32_t*, s1)) ^ (*R_CAST(const uint32_t*, s2));
277 if (xor_result) {
278 return ctzu32(xor_result) / CHAR_BIT;
279 }
280 if (slen > 4) {
281 const uintptr_t final_offset = slen - 4;
282 xor_result = (*R_CAST(const uint32_t*, &(s1[final_offset]))) ^ (*R_CAST(const uint32_t*, &(s2[final_offset])));
283 if (xor_result) {
284 return final_offset + ctzu32(xor_result) / CHAR_BIT;
285 }
286 }
287 return slen;
288 }
289 const uintptr_t* s1_alias = R_CAST(const uintptr_t*, s1);
290 const uintptr_t* s2_alias = R_CAST(const uintptr_t*, s2);
291 const uintptr_t word_ct = slen / kBytesPerWord;
292 for (uint32_t widx = 0; widx != word_ct; ++widx) {
293 const uintptr_t xor_result = s1_alias[widx] ^ s2_alias[widx];
294 if (xor_result) {
295 return widx * kBytesPerWord + ctzw(xor_result) / CHAR_BIT;
296 }
297 }
298 if (slen % kBytesPerWord) {
299 const uintptr_t final_offset = slen - kBytesPerWord;
300 const uintptr_t xor_result = (*R_CAST(const uintptr_t*, &(s1[final_offset]))) ^ (*R_CAST(const uintptr_t*, &(s2[final_offset])));
301 if (xor_result) {
302 return final_offset + ctzw(xor_result) / CHAR_BIT;
303 }
304 }
305 return slen;
306 }
307 const VecUc* s1_alias = R_CAST(const VecUc*, s1);
308 const VecUc* s2_alias = R_CAST(const VecUc*, s2);
309 const uintptr_t vec_ct = slen / kBytesPerVec;
310 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
311 const VecUc v1 = vecuc_loadu(&(s1_alias[vidx]));
312 const VecUc v2 = vecuc_loadu(&(s2_alias[vidx]));
313 const uint32_t eq_result = vecw_movemask(v1 == v2);
314 if (eq_result != kVec8thUintMax) {
315 return vidx * kBytesPerVec + ctzu32(~eq_result);
316 }
317 }
318 if (slen % kBytesPerVec) {
319 const uintptr_t final_offset = slen - kBytesPerVec;
320 const VecW v1 = vecw_loadu(&(s1[final_offset]));
321 const VecW v2 = vecw_loadu(&(s2[final_offset]));
322 const uint32_t eq_result = vecw_movemask(v1 == v2);
323 if (eq_result != kVec8thUintMax) {
324 return final_offset + ctzu32(~eq_result);
325 }
326 }
327 #else // !__LP64__
328 const uintptr_t* s1_alias = R_CAST(const uintptr_t*, s1);
329 const uintptr_t* s2_alias = R_CAST(const uintptr_t*, s2);
330 const uintptr_t word_ct = slen / kBytesPerWord;
331 for (uintptr_t widx = 0; widx != word_ct; ++widx) {
332 const uintptr_t xor_result = s1_alias[widx] ^ s2_alias[widx];
333 if (xor_result) {
334 return widx * kBytesPerWord + ctzw(xor_result) / CHAR_BIT;
335 }
336 }
337 if (slen % kBytesPerWord) {
338 const uintptr_t final_offset = slen - kBytesPerWord;
339 const uintptr_t xor_result = (*R_CAST(const uintptr_t*, &(s1[final_offset]))) ^ (*R_CAST(const uintptr_t*, &(s2[final_offset])));
340 if (xor_result) {
341 return final_offset + ctzw(xor_result) / CHAR_BIT;
342 }
343 }
344 #endif
345 return slen;
346 }
347
348 // May read (kBytesPerWord - 1) bytes past the end of each string.
349 // This can be quite a bit faster than stock strcmp on x86 for sorting, though
350 // benchmarking is necessary in general (compiler may perform its own strcmp
351 // optimizations).
strcmp_overread(const char * s1,const char * s2)352 int32_t strcmp_overread(const char* s1, const char* s2) {
353 const uintptr_t* s1_alias = R_CAST(const uintptr_t*, s1);
354 const uintptr_t* s2_alias = R_CAST(const uintptr_t*, s2);
355 for (uintptr_t widx = 0; ; ++widx) {
356 uintptr_t w1 = s1_alias[widx];
357 const uintptr_t zcheck = DetectFirstZeroByte(w1);
358 uintptr_t w2 = s2_alias[widx];
359 if (zcheck) {
360 // Mask out bytes past the known null.
361 const uintptr_t mask = zcheck ^ (zcheck - 1);
362 w1 &= mask;
363 w2 &= mask;
364 if (w1 == w2) {
365 return 0;
366 }
367 } else if (w1 == w2) {
368 continue;
369 }
370 // bugfix (30 Jun 2018): forgot to adhere to strcmp instead of std::sort
371 // interface
372 #ifdef __LP64__
373 return (__builtin_bswap64(w1) < __builtin_bswap64(w2))? -1 : 1;
374 #else
375 return (__builtin_bswap32(w1) < __builtin_bswap32(w2))? -1 : 1;
376 #endif
377 }
378 }
379
strcmp_casted(const void * s1,const void * s2)380 int32_t strcmp_casted(const void* s1, const void* s2) {
381 return strcmp(S_CAST(const char*, s1), S_CAST(const char*, s2));
382 }
383
strcmp_overread_casted(const void * s1,const void * s2)384 int32_t strcmp_overread_casted(const void* s1, const void* s2) {
385 return strcmp_overread(S_CAST(const char*, s1), S_CAST(const char*, s2));
386 }
387
388 // PLINK 2's natural sort uses the following logic:
389 // - All alphabetic characters act as if they are capitalized, except for
390 // tiebreaking purposes (where ASCII is used).
391 // - Numbers are compared by magnitude, with the exception of...
392 // - Numbers with leading zero(es). If you're putting extraneous zeroes in
393 // front of IDs, we assume they're there to force particular items to be sorted
394 // earlier, rather than just appearing at random. So, unlike many natural sort
395 // implementations, we sort 00200 < 021 < 20: all numbers with n leading zeroes
396 // are sorted before all numbers with (n-1) leading zeroes; magnitude only
397 // applies if the leading zero counts match. This handles e.g. subbasement
398 // room numbering properly.
399 //
400 // This won't always do what you want if your IDs have variable-length decimals
401 // in them (e.g. it yields 0.99 < 0.101); if you don't want to fall back on
402 // ASCII sort, enforce a fixed number of digits after the decimal point. Also
403 // note that ASCII sort is outright better for e.g. numbers represented in
404 // hexadecimal or base 36. In principle, it's possible to reliably autodetect
405 // some of these cases (especially hexadecimal numbers beginning with "0x"),
406 // but that'll never be perfect so we just let the user toggle the sort method.
strcmp_natural_scan_forward(const char * s1,const char * s2)407 int32_t strcmp_natural_scan_forward(const char* s1, const char* s2) {
408 // assumes s1 and s2 currently point to the middle of a mismatching number,
409 // where s1 < s2.
410 char c1;
411 char c2;
412 do {
413 c1 = *(++s1);
414 c2 = *(++s2);
415 if (IsNotDigit(c1)) {
416 return -1;
417 }
418 } while (IsDigit(c2));
419 return 1;
420 }
421
422 // We have the following major states:
423 // 0 (initial): strings perfectly match so far, last char (if any) is
424 // nonnumeric.
425 // 1: strings perfectly match so far, last char is numeric.
426 // 2: strings match except for capitalization, last char is nonnumeric.
427 // 3: strings match except for capitalization, last char is numeric.
428 // strcmp_natural_tiebroken() expresses the logic for states 2 and 3, while
429 // strcmp_natural_uncasted() handles states 0 and 1.
strcmp_natural_tiebroken(const char * s1,const char * s2)430 int32_t strcmp_natural_tiebroken(const char* s1, const char* s2) {
431 // assumes ties should be broken in favor of s2.
432 unsigned char uc1 = *(++s1);
433 unsigned char uc2 = *(++s2);
434 while (IsNotNzdigit(uc1) && IsNotNzdigit(uc2)) {
435 // state 2
436 strcmp_natural_tiebroken_state_2:
437 if (uc1 != uc2) {
438 if ((uc1 >= 'a') && (uc1 <= 'z')) {
439 uc1 -= 32;
440 }
441 if ((uc2 >= 'a') && (uc2 <= 'z')) {
442 uc2 -= 32;
443 }
444 if (uc1 < uc2) {
445 return -1;
446 }
447 if (uc1 > uc2) {
448 return 1;
449 }
450 } else if (!uc1) {
451 return -1;
452 }
453 uc1 = *(++s1);
454 uc2 = *(++s2);
455 }
456 if (IsNotNzdigit(uc1) || IsNotNzdigit(uc2)) {
457 return (uc1 < uc2)? -1 : 1;
458 }
459 do {
460 // state 3
461 if (uc1 != uc2) {
462 if (IsDigit(uc2)) {
463 if (uc1 < uc2) {
464 return strcmp_natural_scan_forward(s1, s2);
465 }
466 return -strcmp_natural_scan_forward(s2, s1);
467 }
468 return 1;
469 }
470 uc1 = *(++s1);
471 uc2 = *(++s2);
472 } while (IsDigit(uc1));
473 if (IsDigit(uc2)) {
474 return -1;
475 }
476 // skip the while (is_not_digit...) check
477 goto strcmp_natural_tiebroken_state_2;
478 }
479
strcmp_natural_uncasted(const char * s1,const char * s2)480 int32_t strcmp_natural_uncasted(const char* s1, const char* s2) {
481 unsigned char uc1 = *s1;
482 unsigned char uc2 = *s2;
483 while (IsNotNzdigit(uc1) && IsNotNzdigit(uc2)) {
484 // state 0
485 strcmp_natural_uncasted_state_0:
486 if (uc1 != uc2) {
487 if ((uc1 >= 'a') && (uc1 <= 'z')) {
488 if (uc2 + 32 == uc1) {
489 return -strcmp_natural_tiebroken(s2, s1);
490 }
491 if ((uc2 < 'a') || (uc2 > 'z')) {
492 uc1 -= 32;
493 }
494 } else if ((uc2 >= 'a') && (uc2 <= 'z')) {
495 uc2 -= 32;
496 if (uc1 == uc2) {
497 return strcmp_natural_tiebroken(s1, s2);
498 }
499 }
500 return (uc1 < uc2)? -1 : 1;
501 }
502 if (!uc1) {
503 return 0;
504 }
505 uc1 = *(++s1);
506 uc2 = *(++s2);
507 }
508 if (IsNotNzdigit(uc1) || IsNotNzdigit(uc2)) {
509 return (uc1 < uc2)? -1 : 1;
510 }
511 do {
512 // state 1
513 if (uc1 != uc2) {
514 if (IsDigit(uc2)) {
515 if (uc1 < uc2) {
516 return strcmp_natural_scan_forward(s1, s2);
517 }
518 return -strcmp_natural_scan_forward(s2, s1);
519 }
520 return 1;
521 }
522 uc1 = *(++s1);
523 uc2 = *(++s2);
524 } while (IsDigit(uc1));
525 if (IsDigit(uc2)) {
526 return -1;
527 }
528 goto strcmp_natural_uncasted_state_0;
529 }
530
strcmp_natural(const void * s1,const void * s2)531 int32_t strcmp_natural(const void* s1, const void* s2) {
532 return strcmp_natural_uncasted(S_CAST(const char*, s1), S_CAST(const char*, s2));
533 }
534
strcmp_deref(const void * s1,const void * s2)535 int32_t strcmp_deref(const void* s1, const void* s2) {
536 return strcmp(*S_CAST(const char* const*, s1), *S_CAST(const char* const*, s2));
537 }
538
strcmp_overread_deref(const void * s1,const void * s2)539 int32_t strcmp_overread_deref(const void* s1, const void* s2) {
540 return strcmp_overread(*S_CAST(const char* const*, s1), *S_CAST(const char* const*, s2));
541 }
542
strcmp_natural_deref(const void * s1,const void * s2)543 int32_t strcmp_natural_deref(const void* s1, const void* s2) {
544 return strcmp_natural_uncasted(*S_CAST(const char* const*, s1), *S_CAST(const char* const*, s2));
545 }
546
547 #ifdef __cplusplus
548 static_assert(sizeof(Strbuf28Ui) == 32, "Strbuf28Ui is not laid out as expected.");
549 static_assert(offsetof(Strbuf28Ui, orig_idx) == 28, "Strbuf28Ui is not laid out as expected.");
550 static_assert(sizeof(Strbuf60Ui) == 64, "Strbuf60Ui is not laid out as expected.");
551 static_assert(offsetof(Strbuf60Ui, orig_idx) == 60, "Strbuf60Ui is not laid out as expected.");
GetStrboxsortWentryBlen(uintptr_t max_str_blen)552 uintptr_t GetStrboxsortWentryBlen(uintptr_t max_str_blen) {
553 if (max_str_blen <= 28) {
554 return sizeof(Strbuf28Ui);
555 }
556 if (max_str_blen <= 60) {
557 return sizeof(Strbuf60Ui);
558 }
559 return max_str_blen;
560 }
561 #else
GetStrboxsortWentryBlen(uintptr_t max_str_blen)562 uintptr_t GetStrboxsortWentryBlen(uintptr_t max_str_blen) {
563 return MAXV(max_str_blen, sizeof(StrSortIndexedDeref));
564 }
565 #endif
566
567 // Assumed that sort_wkspace has size >= str_ct *
568 // max(sizeof(StrSortIndexedDeref), max_str_blen).
569 // Must be ok to overread.
SortStrboxIndexed2Fallback(uintptr_t str_ct,uintptr_t max_str_blen,uint32_t use_nsort,char * strbox,uint32_t * id_map,void * sort_wkspace)570 void SortStrboxIndexed2Fallback(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map, void* sort_wkspace) {
571 StrSortIndexedDerefOverread* wkspace_alias = S_CAST(StrSortIndexedDerefOverread*, sort_wkspace);
572 for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
573 wkspace_alias[str_idx].strptr = &(strbox[str_idx * max_str_blen]);
574 wkspace_alias[str_idx].orig_idx = id_map[str_idx];
575 }
576 if (!use_nsort) {
577 STD_SORT_PAR_UNSEQ(str_ct, strcmp_overread_deref, wkspace_alias);
578 } else {
579 #ifdef __cplusplus
580 StrNsortIndexedDeref* wkspace_alias2 = R_CAST(StrNsortIndexedDeref*, wkspace_alias);
581 STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias2);
582 #else
583 STD_SORT_PAR_UNSEQ(str_ct, strcmp_natural_deref, wkspace_alias);
584 #endif
585 }
586 for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
587 id_map[str_idx] = wkspace_alias[str_idx].orig_idx;
588 }
589 #ifndef __cplusplus
590 if (max_str_blen < sizeof(StrSortIndexedDeref)) {
591 // actually better to use non-deref sort here, but just get this working
592 // properly for now
593 for (uint32_t new_idx = 0; new_idx != str_ct; ++new_idx) {
594 const char* strptr = wkspace_alias[new_idx].strptr;
595 // todo: check whether memcpy(., ., max_str_blen) tends to be better
596 strcpy(&(R_CAST(char*, wkspace_alias)[new_idx * max_str_blen]), strptr);
597 }
598 } else {
599 #endif
600 // bugfix: need to handle id_map[str_idx] != str_idx
601 uint32_t new_idx = str_ct;
602 do {
603 --new_idx;
604 const char* strptr = wkspace_alias[new_idx].strptr;
605 strcpy(&(R_CAST(char*, wkspace_alias)[new_idx * max_str_blen]), strptr);
606 } while (new_idx);
607 #ifndef __cplusplus
608 }
609 #endif
610 memcpy(strbox, wkspace_alias, str_ct * max_str_blen);
611 }
612
613 #ifdef __cplusplus
614 typedef struct WordCmp32bStruct {
615 uintptr_t words[32 / kBytesPerWord];
operator <plink2::WordCmp32bStruct616 bool operator<(const struct WordCmp32bStruct& rhs) const {
617 uint32_t idx = 0;
618 do {
619 const uintptr_t cur_word = words[idx];
620 const uintptr_t rhs_word = rhs.words[idx];
621 if (cur_word != rhs_word) {
622 # ifdef __LP64__
623 return __builtin_bswap64(cur_word) < __builtin_bswap64(rhs_word);
624 # else
625 return __builtin_bswap32(cur_word) < __builtin_bswap32(rhs_word);
626 # endif
627 }
628 } while (++idx < (32 / kBytesPerWord));
629 return false;
630 }
631 } WordCmp32b;
632
633 typedef struct WordCmp64bStruct {
634 uintptr_t words[64 / kBytesPerWord];
operator <plink2::WordCmp64bStruct635 bool operator<(const struct WordCmp64bStruct& rhs) const {
636 uint32_t idx = 0;
637 do {
638 const uintptr_t cur_word = words[idx];
639 const uintptr_t rhs_word = rhs.words[idx];
640 if (cur_word != rhs_word) {
641 # ifdef __LP64__
642 return __builtin_bswap64(cur_word) < __builtin_bswap64(rhs_word);
643 # else
644 return __builtin_bswap32(cur_word) < __builtin_bswap32(rhs_word);
645 # endif
646 }
647 } while (++idx < (64 / kBytesPerWord));
648 return false;
649 }
650 } WordCmp64b;
651
652 static_assert(sizeof(WordCmp32b) == 32, "WordCmp32b does not have the expected size.");
653 static_assert(sizeof(WordCmp64b) == 64, "WordCmp64b does not have the expected size.");
654
SortStrbox32bFinish(uintptr_t str_ct,uintptr_t max_str_blen,uint32_t use_nsort,Strbuf28Ui * filled_wkspace,char * sorted_strbox,uint32_t * id_map)655 void SortStrbox32bFinish(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, Strbuf28Ui* filled_wkspace, char* sorted_strbox, uint32_t* id_map) {
656 if (!use_nsort) {
657 WordCmp32b* wkspace_alias = R_CAST(WordCmp32b*, filled_wkspace);
658 STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias);
659 } else {
660 STD_SORT_PAR_UNSEQ(str_ct, nullptr, filled_wkspace);
661 }
662 for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
663 strcpy(&(sorted_strbox[str_idx * max_str_blen]), filled_wkspace[str_idx].strbuf);
664 id_map[str_idx] = filled_wkspace[str_idx].orig_idx;
665 }
666 }
667
SortStrbox64bFinish(uintptr_t str_ct,uintptr_t max_str_blen,uint32_t use_nsort,Strbuf60Ui * filled_wkspace,char * sorted_strbox,uint32_t * id_map)668 void SortStrbox64bFinish(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, Strbuf60Ui* filled_wkspace, char* sorted_strbox, uint32_t* id_map) {
669 if (!use_nsort) {
670 WordCmp64b* wkspace_alias = R_CAST(WordCmp64b*, filled_wkspace);
671 STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias);
672 } else {
673 STD_SORT_PAR_UNSEQ(str_ct, nullptr, filled_wkspace);
674 }
675 for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
676 strcpy(&(sorted_strbox[str_idx * max_str_blen]), filled_wkspace[str_idx].strbuf);
677 id_map[str_idx] = filled_wkspace[str_idx].orig_idx;
678 }
679 }
680
681 // Normally use SortStrboxIndexed(), but this version is necessary before
682 // g_bigstack has been allocated.
SortStrboxIndexed2(uintptr_t str_ct,uintptr_t max_str_blen,uint32_t use_nsort,char * strbox,uint32_t * id_map,void * sort_wkspace)683 void SortStrboxIndexed2(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map, void* sort_wkspace) {
684 if (max_str_blen <= 28) {
685 Strbuf28Ui* wkspace_alias = S_CAST(Strbuf28Ui*, sort_wkspace);
686 for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
687 const char* cur_str = &(strbox[str_idx * max_str_blen]);
688 strcpy(wkspace_alias[str_idx].strbuf, cur_str);
689 wkspace_alias[str_idx].orig_idx = id_map[str_idx];
690 }
691 SortStrbox32bFinish(str_ct, max_str_blen, use_nsort, wkspace_alias, strbox, id_map);
692 return;
693 }
694 if (max_str_blen <= 60) {
695 Strbuf60Ui* wkspace_alias = S_CAST(Strbuf60Ui*, sort_wkspace);
696 for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
697 const char* cur_str = &(strbox[str_idx * max_str_blen]);
698 strcpy(wkspace_alias[str_idx].strbuf, cur_str);
699 wkspace_alias[str_idx].orig_idx = id_map[str_idx];
700 }
701 SortStrbox64bFinish(str_ct, max_str_blen, use_nsort, wkspace_alias, strbox, id_map);
702 return;
703 }
704 SortStrboxIndexed2Fallback(str_ct, max_str_blen, use_nsort, strbox, id_map, sort_wkspace);
705 }
706 #endif // __cplusplus
707
708 // Must be ok to overread.
SortStrboxIndexedMalloc(uintptr_t str_ct,uintptr_t max_str_blen,char * strbox,uint32_t * id_map)709 BoolErr SortStrboxIndexedMalloc(uintptr_t str_ct, uintptr_t max_str_blen, char* strbox, uint32_t* id_map) {
710 if (str_ct < 2) {
711 return 0;
712 }
713 const uintptr_t wkspace_entry_blen = GetStrboxsortWentryBlen(max_str_blen);
714 unsigned char* sort_wkspace;
715 if (unlikely(pgl_malloc(str_ct * wkspace_entry_blen, &sort_wkspace))) {
716 return 1;
717 }
718 SortStrboxIndexed2(str_ct, max_str_blen, 0, strbox, id_map, sort_wkspace);
719 free(sort_wkspace);
720 return 0;
721 }
722
CopyAndDedupSortedStrptrsToStrbox(const char * const * sorted_strptrs,uintptr_t str_ct,uintptr_t max_str_blen,char * strbox)723 uint32_t CopyAndDedupSortedStrptrsToStrbox(const char* const* sorted_strptrs, uintptr_t str_ct, uintptr_t max_str_blen, char* strbox) {
724 if (!str_ct) {
725 return 0;
726 }
727 const char* const* sorted_strptrs_iter = sorted_strptrs;
728 const char* const* sorted_strptrs_end = &(sorted_strptrs[str_ct]);
729 uintptr_t write_idx = 0;
730 uint32_t prev_slen = UINT32_MAX;
731 const char* prev_str = nullptr;
732 do {
733 const char* cur_str = *sorted_strptrs_iter++;
734 const uint32_t cur_slen = strlen(cur_str);
735 if ((cur_slen != prev_slen) || (!memequal(cur_str, prev_str, prev_slen))) {
736 memcpy(&(strbox[write_idx * max_str_blen]), cur_str, cur_slen + 1);
737 ++write_idx;
738 prev_str = cur_str;
739 }
740 } while (sorted_strptrs_iter != sorted_strptrs_end);
741 return write_idx;
742 }
743
744
StrptrArrSortMain(uintptr_t str_ct,uint32_t overread_ok,uint32_t use_nsort,StrSortIndexedDeref * wkspace_alias)745 void StrptrArrSortMain(uintptr_t str_ct, uint32_t overread_ok, uint32_t use_nsort, StrSortIndexedDeref* wkspace_alias) {
746 if (!use_nsort) {
747 if (overread_ok) {
748 #ifdef __cplusplus
749 StrSortIndexedDerefOverread* wkspace_alias2 = R_CAST(StrSortIndexedDerefOverread*, wkspace_alias);
750 STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias2);
751 #else
752 STD_SORT_PAR_UNSEQ(str_ct, strcmp_overread_deref, wkspace_alias);
753 #endif
754 } else {
755 STD_SORT_PAR_UNSEQ(str_ct, strcmp_deref, wkspace_alias);
756 }
757 } else {
758 #ifdef __cplusplus
759 StrNsortIndexedDeref* wkspace_alias2 = R_CAST(StrNsortIndexedDeref*, wkspace_alias);
760 STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias2);
761 #else
762 STD_SORT_PAR_UNSEQ(str_ct, strcmp_natural_deref, wkspace_alias);
763 #endif
764 }
765 }
766
767 /*
768 uint32_t match_upper(const char* str_iter, const char* fixed_str) {
769 char cc = *fixed_str++;
770 do {
771 if ((((unsigned char)(*str_iter++)) & 0xdf) != ((unsigned char)cc)) {
772 return 0;
773 }
774 cc = *fixed_str++;
775 } while (cc);
776 return !(*str_iter);
777 }
778 */
779
780 /*
781 #ifdef __LP64__
782 CXXCONST_CP FirstPrecharFar(const char* str_iter, uint32_t char_code) {
783 const uintptr_t starting_addr = R_CAST(uintptr_t, str_iter);
784 const VecUc* str_viter = R_CAST(const VecUc*, RoundDownPow2(starting_addr, kBytesPerVec));
785 const VecUc vvec_add = vecuc_set1(128 - char_code);
786 VecUc cur_vvec = *str_viter;
787 VecUc non_prechar_vvec = vecuc_adds(cur_vvec, vvec_add);
788 uint32_t matching_bytes = ~vecuc_movemask(non_prechar_vvec);
789 const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, str_viter);
790 matching_bytes &= UINT32_MAX << leading_byte_ct;
791 // This typically isn't *that* long-range, so the Memrchr()
792 // double-vector strategy is unlikely to be profitable?
793 while (!matching_bytes) {
794 ++str_viter;
795 cur_vvec = *str_viter;
796 non_prechar_vvec = vecuc_adds(cur_vvec, vvec_add);
797 matching_bytes = ~vecuc_movemask(non_prechar_vvec);
798 }
799 const uint32_t byte_offset_in_vec = ctzu32(matching_bytes);
800 return &(R_CAST(CXXCONST_CP, str_viter)[byte_offset_in_vec]);
801 }
802 #endif
803 */
804
MatchUpperCounted(const char * str,const char * fixed_str,uint32_t ct)805 uint32_t MatchUpperCounted(const char* str, const char* fixed_str, uint32_t ct) {
806 for (uint32_t uii = 0; uii != ct; ++uii) {
807 if ((ctou32(str[uii]) & 0xdf) != ctou32(fixed_str[uii])) {
808 return 0;
809 }
810 }
811 return 1;
812 }
813
814 // might want to make this std::array in the future?
815 static const char kToUpper[256] = {
816 '\0', '\1', '\2', '\3', '\4', '\5', '\6', '\7',
817 '\10', '\11', '\12', '\13', '\14', '\15', '\16', '\17',
818 '\20', '\21', '\22', '\23', '\24', '\25', '\26', '\27',
819 '\30', '\31', '\32', '\33', '\34', '\35', '\36', '\37',
820 '\40', '\41', '\42', '\43', '\44', '\45', '\46', '\47',
821 '\50', '\51', '\52', '\53', '\54', '\55', '\56', '\57',
822 '\60', '\61', '\62', '\63', '\64', '\65', '\66', '\67',
823 '\70', '\71', '\72', '\73', '\74', '\75', '\76', '\77',
824 '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
825 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
826 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
827 '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
828 '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
829 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
830 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
831 '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
832 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
833 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
834 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
835 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
836 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
837 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
838 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
839 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
840 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
841 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
842 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
843 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
844 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
845 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
846 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
847 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
848 };
849
strcaseequal(const char * str1,const char * str2,uint32_t ct)850 uint32_t strcaseequal(const char* str1, const char* str2, uint32_t ct) {
851 for (uint32_t uii = 0; uii != ct; ++uii) {
852 if (kToUpper[ctou32(str1[uii])] != kToUpper[ctou32(str2[uii])]) {
853 return 0;
854 }
855 }
856 return 1;
857 }
858
859 /*
860 void str_toupper(char* str_iter) {
861 while (1) {
862 const uint32_t uii = (unsigned char)(*str_iter);
863 if (!uii) {
864 return;
865 }
866 if (((uint32_t)(uii - 97)) < 26) {
867 // 'a' has ASCII code 97
868 *str_iter = uii - 32;
869 }
870 ++str_iter;
871 }
872 }
873
874 void buf_toupper(uint32_t slen, char* strbuf) {
875 for (uint32_t pos = 0; pos != slen; ++pos) {
876 const uint32_t uii = (unsigned char)(strbuf[pos]);
877 if (((uint32_t)(uii - 97)) < 26) {
878 strbuf[pos] = uii - 32;
879 }
880 }
881 }
882
883 void strcpy_toupper(char* target, const char* source) {
884 while (1) {
885 uint32_t uii = (unsigned char)(*source++);
886 if (!uii) {
887 return;
888 }
889 if (((uint32_t)(uii - 97)) < 26) {
890 uii -= 32;
891 }
892 *target++ = uii;
893 }
894 }
895
896 char* memcpya_toupper(char* __restrict target, const char* __restrict source, uint32_t slen) {
897 for (uint32_t pos = 0; pos != slen; ++pos) {
898 uint32_t uii = ctou32(source[pos]);
899 if ((uii - 97) < 26) {
900 uii -= 32;
901 }
902 target[pos] = uii;
903 }
904 return &(target[slen]);
905 }
906 */
907
908
IsAlphanumeric(const char * str_iter)909 uint32_t IsAlphanumeric(const char* str_iter) {
910 for (; ; ++str_iter) {
911 uint32_t uii = ctou32(*str_iter);
912 if (!uii) {
913 return 1;
914 }
915 if (((uii - 48) > 9) && (((uii & 0xffffffdfU) - 65) > 25)) {
916 return 0;
917 }
918 }
919 }
920
ScanPosintptr(const char * str_iter,uintptr_t * valp)921 BoolErr ScanPosintptr(const char* str_iter, uintptr_t* valp) {
922 // Reads an integer in [1, 2^kBitsPerWord - 1]. Assumes first character is
923 // nonspace.
924 assert(ctow(str_iter[0]) > 32);
925 uintptr_t val = ctow(*str_iter++) - 48;
926 if (val >= 10) {
927 #ifdef __LP64__
928 if (unlikely(val != 0xfffffffffffffffbLLU)) {
929 return 1;
930 }
931 #else
932 if (unlikely(val != 0xfffffffbU)) {
933 return 1;
934 }
935 #endif
936 val = ctow(*str_iter++) - 48;
937 if (unlikely(val >= 10)) {
938 return 1;
939 }
940 }
941 while (!val) {
942 val = ctow(*str_iter++) - 48;
943 if (unlikely(val >= 10)) {
944 return 1;
945 }
946 }
947 #ifdef __LP64__
948 // limit is 20 digits, we've already read one
949 const char* str_limit = &(str_iter[20]);
950 #else
951 const char* str_limit = &(str_iter[10]);
952 #endif
953 while (1) {
954 const uintptr_t cur_digit = ctow(*str_iter++) - 48;
955 if (cur_digit >= 10) {
956 *valp = val;
957 return 0;
958 }
959 const uintptr_t cur_digit2 = ctow(*str_iter++) - 48;
960 if (str_iter == str_limit) {
961 if (unlikely((cur_digit2 < 10) || ((val >= (~k0LU) / 10) && ((val > (~k0LU) / 10) || (cur_digit > (~k0LU) % 10))))) {
962 return 1;
963 }
964 *valp = val * 10 + cur_digit;
965 return 0;
966 }
967 if (cur_digit2 >= 10) {
968 *valp = val * 10 + cur_digit;
969 return 0;
970 }
971 val = val * 100 + cur_digit * 10 + cur_digit2;
972 }
973 }
974
975 #ifdef __LP64__
ScanmovUintCappedFinish(uint64_t cap,const char ** str_iterp,uint32_t * valp)976 static inline BoolErr ScanmovUintCappedFinish(uint64_t cap, const char** str_iterp, uint32_t* valp) {
977 const char* str_iter = *str_iterp;
978 uint64_t val = *valp;
979 while (1) {
980 // a little bit of unrolling seems to help
981 const uint64_t cur_digit = ctou64(*str_iter++) - 48;
982 if (cur_digit >= 10) {
983 break;
984 }
985 // val = val * 10 + cur_digit;
986 const uint64_t cur_digit2 = ctou64(*str_iter++) - 48;
987 if (cur_digit2 >= 10) {
988 val = val * 10 + cur_digit;
989 if (unlikely(val > cap)) {
990 return 1;
991 }
992 break;
993 }
994 val = val * 100 + cur_digit * 10 + cur_digit2;
995 if (unlikely(val > cap)) {
996 return 1;
997 }
998 }
999 *valp = val;
1000 *str_iterp = &(str_iter[-1]);
1001 return 0;
1002 }
1003
ScanmovPosintCapped(uint64_t cap,const char ** str_iterp,uint32_t * valp)1004 BoolErr ScanmovPosintCapped(uint64_t cap, const char** str_iterp, uint32_t* valp) {
1005 const char* str_iter = *str_iterp;
1006 *valp = ctou32(*str_iter++) - 48;
1007 if (*valp >= 10) {
1008 if (unlikely(*valp != 0xfffffffbU)) {
1009 return 1;
1010 }
1011 *valp = ctou32(*str_iter++) - 48;
1012 if (unlikely(*valp >= 10)) {
1013 return 1;
1014 }
1015 }
1016 while (!(*valp)) {
1017 *valp = ctou32(*str_iter++) - 48;
1018 if (unlikely((*valp) >= 10)) {
1019 return 1;
1020 }
1021 }
1022 *str_iterp = str_iter;
1023 return ScanmovUintCappedFinish(cap, str_iterp, valp);
1024 }
1025
ScanmovUintCapped(uint64_t cap,const char ** str_iterp,uint32_t * valp)1026 BoolErr ScanmovUintCapped(uint64_t cap, const char** str_iterp, uint32_t* valp) {
1027 const char* str_iter = *str_iterp;
1028 *valp = ctou32(*str_iter++) - 48;
1029 if (*valp >= 10) {
1030 if (*valp != 0xfffffffbU) {
1031 // '-' has ascii code 45, so unsigned 45 - 48 = 0xfffffffdU
1032 if (unlikely((*valp != 0xfffffffdU) || (*str_iter != '0'))) {
1033 return 1;
1034 }
1035 // accept "-0", "-00", etc.
1036 while (*(++str_iter) == '0');
1037 *valp = 0;
1038 *str_iterp = str_iter;
1039 return (ctou32(*str_iter) - 48) < 10;
1040 }
1041 // accept leading '+'
1042 *valp = ctou32(*str_iter++) - 48;
1043 if (unlikely(*valp >= 10)) {
1044 return 1;
1045 }
1046 }
1047 *str_iterp = str_iter;
1048 return ScanmovUintCappedFinish(cap, str_iterp, valp);
1049 }
1050
1051 // 2^{-31} < floor <= 0 <= cap < 2^31
ScanmovIntBounded(uint64_t abs_floor,uint64_t cap,const char ** str_iterp,int32_t * valp)1052 BoolErr ScanmovIntBounded(uint64_t abs_floor, uint64_t cap, const char** str_iterp, int32_t* valp) {
1053 const char* str_iter = *str_iterp;
1054 *valp = ctou32(*str_iter++) - 48;
1055 int32_t sign = 1;
1056 if (ctou32(*valp) >= 10) {
1057 if (*valp == -3) {
1058 sign = -1;
1059 cap = abs_floor;
1060 } else if (unlikely(*valp != -5)) { // accept leading '+'
1061 return 1;
1062 }
1063 *valp = ctou32(*str_iter++) - 48;
1064 if (unlikely(*valp >= 10)) {
1065 return 1;
1066 }
1067 }
1068 *str_iterp = str_iter;
1069 if (ScanmovUintCappedFinish(cap, str_iterp, R_CAST(uint32_t*, valp))) {
1070 return 1;
1071 }
1072 *valp *= sign;
1073 return 0;
1074 }
1075 #else
ScanmovPosintCapped32(uint32_t cap_div_10,uint32_t cap_mod_10,const char ** str_iterp,uint32_t * valp)1076 BoolErr ScanmovPosintCapped32(uint32_t cap_div_10, uint32_t cap_mod_10, const char** str_iterp, uint32_t* valp) {
1077 const char* str_iter = *str_iterp;
1078 uint32_t val = ctou32(*str_iter++) - 48;
1079 if (val >= 10) {
1080 if (unlikely(val != 0xfffffffbU)) {
1081 return 1;
1082 }
1083 val = ctou32(*str_iter++) - 48;
1084 if (unlikely(val >= 10)) {
1085 return 1;
1086 }
1087 }
1088 while (!val) {
1089 val = ctou32(*str_iter++);
1090 if (unlikely(val >= 10)) {
1091 return 1;
1092 }
1093 }
1094 for (; ; ++str_iter) {
1095 const uint32_t cur_digit = ctou32(*str_iter) - 48;
1096 if (cur_digit >= 10) {
1097 *valp = val;
1098 *str_iterp = str_iter;
1099 return 0;
1100 }
1101 if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
1102 return 1;
1103 }
1104 val = val * 10 + cur_digit;
1105 }
1106 }
1107
ScanmovUintCapped32(uint32_t cap_div_10,uint32_t cap_mod_10,const char ** str_iterp,uint32_t * valp)1108 BoolErr ScanmovUintCapped32(uint32_t cap_div_10, uint32_t cap_mod_10, const char** str_iterp, uint32_t* valp) {
1109 const char* str_iter = *str_iterp;
1110 uint32_t val = ctou32(*str_iter++) - 48;
1111 if (val >= 10) {
1112 if (val != 0xfffffffbU) {
1113 if (unlikely((val != 0xfffffffd) || (*str_iter != '0'))) {
1114 return 1;
1115 }
1116 while (*(++str_iter) == '0');
1117 *valp = 0;
1118 *str_iterp = str_iter;
1119 return (ctou32(*str_iter) - 48) < 10;
1120 }
1121 val = ctou32(*str_iter++) - 48;
1122 if (unlikely(val >= 10)) {
1123 return 1;
1124 }
1125 }
1126 for (; ; ++str_iter) {
1127 const uint32_t cur_digit = ctou32(*str_iter) - 48;
1128 if (cur_digit >= 10) {
1129 *valp = val;
1130 *str_iterp = str_iter;
1131 return 0;
1132 }
1133 if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
1134 return 1;
1135 }
1136 val = val * 10 + cur_digit;
1137 }
1138 }
1139
ScanmovIntBounded32(uint32_t abs_floor_div_10,uint32_t abs_floor_mod_10,uint32_t cap_div_10,uint32_t cap_mod_10,const char ** str_iterp,int32_t * valp)1140 BoolErr ScanmovIntBounded32(uint32_t abs_floor_div_10, uint32_t abs_floor_mod_10, uint32_t cap_div_10, uint32_t cap_mod_10, const char** str_iterp, int32_t* valp) {
1141 const char* str_iter = *str_iterp;
1142 uint32_t val = ctou32(*str_iter++) - 48;
1143 int32_t sign = 1;
1144 if (val >= 10) {
1145 if (val == 0xfffffffdU) {
1146 sign = -1;
1147 cap_div_10 = abs_floor_div_10;
1148 cap_mod_10 = abs_floor_mod_10;
1149 } else if (unlikely(val != 0xfffffffbU)) {
1150 return 1;
1151 }
1152 val = ctou32(*str_iter++) - 48;
1153 if (unlikely(val >= 10)) {
1154 return 1;
1155 }
1156 }
1157 for (; ; ++str_iter) {
1158 const uint32_t cur_digit = ctou32(*str_iter) - 48;
1159 if (cur_digit >= 10) {
1160 *valp = sign * S_CAST(int32_t, val);
1161 *str_iterp = str_iter;
1162 return 0;
1163 }
1164 if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
1165 return 1;
1166 }
1167 val = val * 10 + cur_digit;
1168 }
1169 }
1170 #endif
1171
1172 static const double kPositivePow10[] = {1, 1.0e1, 1.0e2, 1.0e3, 1.0e4, 1.0e5, 1.0e6, 1.0e7, 1.0e8, 1.0e9, 1.0e10, 1.0e11, 1.0e12, 1.0e13, 1.0e14, 1.0e15};
1173 static const double kPositivePowTen16[] = {1, 1.0e16, 1.0e32, 1.0e48, 1.0e64, 1.0e80, 1.0e96, 1.0e112, 1.0e128, 1.0e144, 1.0e160, 1.0e176, 1.0e192, 1.0e208, 1.0e224, 1.0e240};
1174 static const double kNegativePow10[] = {1, 1.0e-1, 1.0e-2, 1.0e-3, 1.0e-4, 1.0e-5, 1.0e-6, 1.0e-7, 1.0e-8, 1.0e-9, 1.0e-10, 1.0e-11, 1.0e-12, 1.0e-13, 1.0e-14, 1.0e-15};
1175 static const double kNegativePowTen16[] = {1, 1.0e-16, 1.0e-32, 1.0e-48, 1.0e-64, 1.0e-80, 1.0e-96, 1.0e-112};
1176
ScanadvDouble(const char * str_iter,double * valp)1177 CXXCONST_CP ScanadvDouble(const char* str_iter, double* valp) {
1178 // requires first character to be nonspace (to succeed; it fails without
1179 // segfaulting on space/eoln/null)
1180 // don't care about hexadecimal
1181 // ok to lose last ~2 bits of precision
1182 // ok if behavior undefined on >1GB strings in 32-bit case, >2GB for 64-bit
1183 // fail on nan/infinity/overflow instead of usual strtod behavior
1184 uint32_t cur_char_code = ctou32(*str_iter);
1185 const uint32_t is_negative = (cur_char_code == 45);
1186 if (is_negative || (cur_char_code == 43)) {
1187 cur_char_code = ctou32(*(++str_iter));
1188 }
1189 uint32_t cur_digit = cur_char_code - 48;
1190 intptr_t e10 = 0;
1191 const char* dot_ptr;
1192 int64_t digits;
1193 #ifdef __LP64__
1194 if (cur_digit < 10) {
1195 // ok, we have at least one digit
1196 digits = cur_digit;
1197 // to check: best to skip leading zeroes and compare against 17 instead of
1198 // 10^16?
1199 do {
1200 cur_digit = ctou32(*(++str_iter)) - 48;
1201 if (cur_digit >= 10) {
1202 if (cur_digit == 0xfffffffeU) {
1203 dot_ptr = str_iter;
1204 goto ScanadvDouble_parse_decimal;
1205 }
1206 goto ScanadvDouble_parse_exponent;
1207 }
1208 digits = digits * 10 + cur_digit;
1209 } while (digits < 10000000000000000LL);
1210 // we have 17 significant digits; count the rest, but don't worry about
1211 // contents
1212 // (could keep ~19 instead, but if we're systematically losing the last two
1213 // bits of precision anyway...)
1214 const char* last_sig_fig_ptr = str_iter;
1215 do {
1216 cur_digit = ctou32(*(++str_iter)) - 48;
1217 } while (cur_digit < 10);
1218 e10 = S_CAST(intptr_t, str_iter - last_sig_fig_ptr) - 1;
1219 if (cur_digit == 0xfffffffeU) {
1220 do {
1221 cur_digit = ctou32(*(++str_iter)) - 48;
1222 } while (cur_digit < 10);
1223 }
1224 goto ScanadvDouble_parse_exponent;
1225 }
1226 if (cur_digit != 0xfffffffeU) {
1227 return nullptr;
1228 }
1229 // first (nonsign) character is dot, verify we have a digit after it
1230 dot_ptr = str_iter;
1231 cur_digit = ctou32(*(++str_iter)) - 48;
1232 if (cur_digit >= 10) {
1233 return nullptr;
1234 }
1235 digits = cur_digit;
1236 ScanadvDouble_parse_decimal:
1237 while (1) {
1238 cur_digit = ctou32(*(++str_iter)) - 48;
1239 if (cur_digit >= 10) {
1240 e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1241 break;
1242 }
1243 digits = digits * 10 + cur_digit;
1244 if (digits >= 10000000000000000LL) {
1245 e10 = -S_CAST(intptr_t, str_iter - dot_ptr);
1246 do {
1247 cur_digit = ctou32(*(++str_iter)) - 48;
1248 } while (cur_digit < 10);
1249 break;
1250 }
1251 }
1252 ScanadvDouble_parse_exponent:
1253 if ((cur_digit & 0xdf) == 21) { // 'E' - '0' is 21
1254 cur_char_code = ctou32(*(++str_iter));
1255 const uint32_t exp_is_negative = (cur_char_code == 45);
1256 if (exp_is_negative || (cur_char_code == 43)) {
1257 cur_char_code = ctou32(*(++str_iter));
1258 }
1259 cur_digit = cur_char_code - 48;
1260 int32_t cur_exp = 0;
1261 while (cur_digit < 10) {
1262 if (cur_exp >= 214748364) {
1263 // may as well guard against exponent overflow
1264 if (!exp_is_negative) {
1265 return nullptr;
1266 }
1267 *valp = 0;
1268 do {
1269 cur_digit = ctou32(*(++str_iter)) - 48;
1270 } while (cur_digit < 10);
1271 return S_CAST(CXXCONST_CP, str_iter);
1272 }
1273 cur_exp = cur_exp * 10 + cur_digit;
1274 cur_digit = ctou32(*(++str_iter)) - 48;
1275 }
1276 if (exp_is_negative) {
1277 cur_exp = -cur_exp;
1278 }
1279 e10 += cur_exp;
1280 }
1281 #else // not __LP64__
1282 int32_t digits_short;
1283 if (cur_digit < 10) {
1284 // ok, we have at least one digit
1285 digits_short = cur_digit;
1286 // to check: best to skip leading zeroes and compare against 17 instead of
1287 // 10^16?
1288 do {
1289 cur_digit = ctou32(*(++str_iter)) - 48;
1290 if (cur_digit >= 10) {
1291 if (cur_digit == 0xfffffffeU) {
1292 dot_ptr = str_iter;
1293 goto ScanadvDouble_parse_decimal;
1294 }
1295 digits = digits_short;
1296 goto ScanadvDouble_parse_exponent;
1297 }
1298 digits_short = digits_short * 10 + cur_digit;
1299 } while (digits_short < 100000000);
1300 digits = digits_short;
1301 do {
1302 cur_digit = ctou32(*(++str_iter)) - 48;
1303 if (cur_digit >= 10) {
1304 if (cur_digit == 0xfffffffeU) {
1305 dot_ptr = str_iter;
1306 goto ScanadvDouble_parse_decimal_long;
1307 }
1308 goto ScanadvDouble_parse_exponent;
1309 }
1310 digits = digits * 10 + cur_digit;
1311 } while (digits < 10000000000000000LL);
1312 // we have 17 significant digits; count the rest, but don't worry about
1313 // contents
1314 const char* last_sig_fig_ptr = str_iter;
1315 do {
1316 cur_digit = ctou32(*(++str_iter)) - 48;
1317 } while (cur_digit < 10);
1318 e10 = S_CAST(intptr_t, str_iter - last_sig_fig_ptr) - 1;
1319 if (cur_digit == 0xfffffffeU) {
1320 do {
1321 cur_digit = ctou32(*(++str_iter)) - 48;
1322 } while (cur_digit < 10);
1323 }
1324 goto ScanadvDouble_parse_exponent;
1325 }
1326 if (cur_digit != 0xfffffffeU) {
1327 return nullptr;
1328 }
1329 // first (nonsign) character is dot, verify we have a digit after it
1330 dot_ptr = str_iter;
1331 cur_digit = ctou32(*(++str_iter)) - 48;
1332 if (cur_digit >= 10) {
1333 return nullptr;
1334 }
1335 digits_short = cur_digit;
1336 ScanadvDouble_parse_decimal:
1337 while (1) {
1338 cur_digit = ctou32(*(++str_iter)) - 48;
1339 if (cur_digit >= 10) {
1340 e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1341 digits = digits_short;
1342 break;
1343 }
1344 digits_short = digits_short * 10 + cur_digit;
1345 if (digits_short >= 100000000) {
1346 digits = digits_short;
1347 ScanadvDouble_parse_decimal_long:
1348 while (1) {
1349 cur_digit = ctou32(*(++str_iter)) - 48;
1350 if (cur_digit >= 10) {
1351 e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1352 goto ScanadvDouble_parse_exponent;
1353 }
1354 digits = digits * 10 + cur_digit;
1355 if (digits >= 10000000000000000LL) {
1356 e10 = -S_CAST(intptr_t, str_iter - dot_ptr);
1357 do {
1358 cur_digit = ctou32(*(++str_iter)) - 48;
1359 } while (cur_digit < 10);
1360 goto ScanadvDouble_parse_exponent;
1361 }
1362 }
1363 }
1364 }
1365 ScanadvDouble_parse_exponent:
1366 if ((cur_digit & 0xdf) == 21) { // 'E' - '0' is 21
1367 cur_char_code = ctou32(*(++str_iter));
1368 const uint32_t exp_is_negative = (cur_char_code == 45);
1369 if (exp_is_negative || (cur_char_code == 43)) {
1370 cur_char_code = ctou32(*(++str_iter));
1371 }
1372 cur_digit = cur_char_code - 48;
1373 int32_t cur_exp = 0;
1374 while (cur_digit < 10) {
1375 if (cur_exp >= 107374182) {
1376 // may as well guard against exponent overflow
1377 // 2^30 instead of 2^31 limit since this gets added to e10
1378 if (!exp_is_negative) {
1379 return nullptr;
1380 }
1381 *valp = 0;
1382 do {
1383 cur_digit = ctou32(*(++str_iter)) - 48;
1384 } while (cur_digit < 10);
1385 return S_CAST(CXXCONST_CP, str_iter);
1386 }
1387 cur_exp = cur_exp * 10 + cur_digit;
1388 cur_digit = ctou32(*(++str_iter)) - 48;
1389 }
1390 if (exp_is_negative) {
1391 cur_exp = -cur_exp;
1392 }
1393 e10 += cur_exp;
1394 }
1395 #endif
1396 if (digits == 0) {
1397 *valp = 0;
1398 return S_CAST(CXXCONST_CP, str_iter);
1399 }
1400 if (is_negative) {
1401 digits = -digits;
1402 }
1403 double dxx = S_CAST(double, digits);
1404 if (e10) {
1405 if (e10 < 0) {
1406 uint32_t pos_exp = -e10;
1407 dxx *= kNegativePow10[pos_exp & 15];
1408 pos_exp /= 16;
1409 if (pos_exp) {
1410 dxx *= kNegativePowTen16[pos_exp & 7];
1411 if (pos_exp > 7) {
1412 if (pos_exp > 23) {
1413 dxx = 0;
1414 } else if (pos_exp > 15) {
1415 dxx *= 1.0e-256;
1416 } else {
1417 dxx *= 1.0e-128;
1418 }
1419 }
1420 }
1421 } else {
1422 uint32_t pos_exp = e10;
1423 dxx *= kPositivePow10[pos_exp & 15];
1424 pos_exp /= 16;
1425 if (pos_exp) {
1426 dxx *= kPositivePowTen16[pos_exp & 15];
1427 if (pos_exp > 15) {
1428 // overflow check
1429 // last digits are "54" instead of "57" since that's the threshold
1430 // beyond which multiply-by-1e256 overflows
1431 if ((pos_exp > 31) || (dxx > 1.7976931348623154e52)) {
1432 return nullptr;
1433 }
1434 dxx *= 1.0e256;
1435 }
1436 }
1437 }
1438 }
1439 *valp = dxx;
1440 return S_CAST(CXXCONST_CP, str_iter);
1441 }
1442
ScanadvLn(const char * str_iter,double * ln_ptr)1443 CXXCONST_CP ScanadvLn(const char* str_iter, double* ln_ptr) {
1444 // revised ScanadvDouble() which currently requires number to be nonnegative
1445 // returns -DBL_MAX on 0
1446 uint32_t cur_char_code = ctou32(*str_iter);
1447 const uint32_t is_negative = (cur_char_code == 45);
1448 if (is_negative || (cur_char_code == 43)) {
1449 cur_char_code = ctou32(*(++str_iter));
1450 }
1451 uint32_t cur_digit = cur_char_code - 48;
1452 intptr_t e10 = 0;
1453 const char* dot_ptr;
1454 int64_t digits;
1455 #ifdef __LP64__
1456 if (cur_digit < 10) {
1457 // ok, we have at least one digit
1458 digits = cur_digit;
1459 // to check: best to skip leading zeroes and compare against 17 instead of
1460 // 10^16?
1461 do {
1462 cur_digit = ctou32(*(++str_iter)) - 48;
1463 if (cur_digit >= 10) {
1464 if (cur_digit == 0xfffffffeU) {
1465 dot_ptr = str_iter;
1466 goto ScanadvLn_parse_decimal;
1467 }
1468 goto ScanadvLn_parse_exponent;
1469 }
1470 digits = digits * 10 + cur_digit;
1471 } while (digits < 10000000000000000LL);
1472 // we have 17 significant digits; count the rest, but don't worry about
1473 // contents
1474 // (could keep ~19 instead, but if we're systematically losing the last two
1475 // bits of precision anyway...)
1476 const char* last_sig_fig_ptr = str_iter;
1477 do {
1478 cur_digit = ctou32(*(++str_iter)) - 48;
1479 } while (cur_digit < 10);
1480 e10 = S_CAST(intptr_t, str_iter - last_sig_fig_ptr) - 1;
1481 if (cur_digit == 0xfffffffeU) {
1482 do {
1483 cur_digit = ctou32(*(++str_iter)) - 48;
1484 } while (cur_digit < 10);
1485 }
1486 goto ScanadvLn_parse_exponent;
1487 }
1488 if (cur_digit != 0xfffffffeU) {
1489 return nullptr;
1490 }
1491 // first (nonsign) character is dot, verify we have a digit after it
1492 dot_ptr = str_iter;
1493 cur_digit = ctou32(*(++str_iter)) - 48;
1494 if (cur_digit >= 10) {
1495 return nullptr;
1496 }
1497 digits = cur_digit;
1498 ScanadvLn_parse_decimal:
1499 while (1) {
1500 cur_digit = ctou32(*(++str_iter)) - 48;
1501 if (cur_digit >= 10) {
1502 e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1503 break;
1504 }
1505 digits = digits * 10 + cur_digit;
1506 if (digits >= 10000000000000000LL) {
1507 e10 = -S_CAST(intptr_t, str_iter - dot_ptr);
1508 do {
1509 cur_digit = ctou32(*(++str_iter)) - 48;
1510 } while (cur_digit < 10);
1511 break;
1512 }
1513 }
1514 ScanadvLn_parse_exponent:
1515 if (is_negative && (digits != 0)) {
1516 return nullptr;
1517 }
1518 if ((cur_digit & 0xdf) == 21) { // 'E' - '0' is 21
1519 cur_char_code = ctou32(*(++str_iter));
1520 const uint32_t exp_is_negative = (cur_char_code == 45);
1521 if (exp_is_negative || (cur_char_code == 43)) {
1522 cur_char_code = ctou32(*(++str_iter));
1523 }
1524 cur_digit = cur_char_code - 48;
1525 int32_t cur_exp = 0;
1526 while (cur_digit < 10) {
1527 if (cur_exp >= 214748364) {
1528 // may as well guard against exponent overflow
1529 if (!exp_is_negative) {
1530 return nullptr;
1531 }
1532 *ln_ptr = -DBL_MAX;
1533 do {
1534 cur_digit = ctou32(*(++str_iter)) - 48;
1535 } while (cur_digit < 10);
1536 return S_CAST(CXXCONST_CP, str_iter);
1537 }
1538 cur_exp = cur_exp * 10 + cur_digit;
1539 cur_digit = ctou32(*(++str_iter)) - 48;
1540 }
1541 if (exp_is_negative) {
1542 cur_exp = -cur_exp;
1543 }
1544 e10 += cur_exp;
1545 }
1546 #else // not __LP64__
1547 int32_t digits_short;
1548 if (cur_digit < 10) {
1549 // ok, we have at least one digit
1550 digits_short = cur_digit;
1551 // to check: best to skip leading zeroes and compare against 17 instead of
1552 // 10^16?
1553 do {
1554 cur_digit = ctou32(*(++str_iter)) - 48;
1555 if (cur_digit >= 10) {
1556 if (cur_digit == 0xfffffffeU) {
1557 dot_ptr = str_iter;
1558 goto ScanadvLn_parse_decimal;
1559 }
1560 digits = digits_short;
1561 goto ScanadvLn_parse_exponent;
1562 }
1563 digits_short = digits_short * 10 + cur_digit;
1564 } while (digits_short < 100000000);
1565 digits = digits_short;
1566 do {
1567 cur_digit = ctou32(*(++str_iter)) - 48;
1568 if (cur_digit >= 10) {
1569 if (cur_digit == 0xfffffffeU) {
1570 dot_ptr = str_iter;
1571 goto ScanadvLn_parse_decimal_long;
1572 }
1573 goto ScanadvLn_parse_exponent;
1574 }
1575 digits = digits * 10 + cur_digit;
1576 } while (digits < 10000000000000000LL);
1577 // we have 17 significant digits; count the rest, but don't worry about
1578 // contents
1579 const char* last_sig_fig_ptr = str_iter;
1580 do {
1581 cur_digit = ctou32(*(++str_iter)) - 48;
1582 } while (cur_digit < 10);
1583 e10 = S_CAST(intptr_t, str_iter - last_sig_fig_ptr) - 1;
1584 if (cur_digit == 0xfffffffeU) {
1585 do {
1586 cur_digit = ctou32(*(++str_iter)) - 48;
1587 } while (cur_digit < 10);
1588 }
1589 goto ScanadvLn_parse_exponent;
1590 }
1591 if (cur_digit != 0xfffffffeU) {
1592 return nullptr;
1593 }
1594 // first (nonsign) character is dot, verify we have a digit after it
1595 dot_ptr = str_iter;
1596 cur_digit = ctou32(*(++str_iter)) - 48;
1597 if (cur_digit >= 10) {
1598 return nullptr;
1599 }
1600 digits_short = cur_digit;
1601 ScanadvLn_parse_decimal:
1602 while (1) {
1603 cur_digit = ctou32(*(++str_iter)) - 48;
1604 if (cur_digit >= 10) {
1605 e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1606 digits = digits_short;
1607 break;
1608 }
1609 digits_short = digits_short * 10 + cur_digit;
1610 if (digits_short >= 100000000) {
1611 digits = digits_short;
1612 ScanadvLn_parse_decimal_long:
1613 while (1) {
1614 cur_digit = ctou32(*(++str_iter)) - 48;
1615 if (cur_digit >= 10) {
1616 e10 = 1 - S_CAST(intptr_t, str_iter - dot_ptr);
1617 goto ScanadvLn_parse_exponent;
1618 }
1619 digits = digits * 10 + cur_digit;
1620 if (digits >= 10000000000000000LL) {
1621 e10 = -S_CAST(intptr_t, str_iter - dot_ptr);
1622 do {
1623 cur_digit = ctou32(*(++str_iter)) - 48;
1624 } while (cur_digit < 10);
1625 goto ScanadvLn_parse_exponent;
1626 }
1627 }
1628 }
1629 }
1630 ScanadvLn_parse_exponent:
1631 if (is_negative && (digits != 0)) {
1632 return nullptr;
1633 }
1634 if ((cur_digit & 0xdf) == 21) { // 'E' - '0' is 21
1635 cur_char_code = ctou32(*(++str_iter));
1636 const uint32_t exp_is_negative = (cur_char_code == 45);
1637 if (exp_is_negative || (cur_char_code == 43)) {
1638 cur_char_code = ctou32(*(++str_iter));
1639 }
1640 cur_digit = cur_char_code - 48;
1641 int32_t cur_exp = 0;
1642 while (cur_digit < 10) {
1643 if (cur_exp >= 107374182) {
1644 // may as well guard against exponent overflow
1645 if (!exp_is_negative) {
1646 return nullptr;
1647 }
1648 *ln_ptr = -DBL_MAX;
1649 do {
1650 cur_digit = ctou32(*(++str_iter)) - 48;
1651 } while (cur_digit < 10);
1652 return S_CAST(CXXCONST_CP, str_iter);
1653 }
1654 cur_exp = cur_exp * 10 + cur_digit;
1655 cur_digit = ctou32(*(++str_iter)) - 48;
1656 }
1657 if (exp_is_negative) {
1658 cur_exp = -cur_exp;
1659 }
1660 e10 += cur_exp;
1661 }
1662 #endif
1663 if (digits == 0) {
1664 *ln_ptr = -DBL_MAX;
1665 return S_CAST(CXXCONST_CP, str_iter);
1666 }
1667 double ln_val = log(S_CAST(double, digits));
1668 if (e10) {
1669 ln_val += e10 * kLn10;
1670 }
1671 *ln_ptr = ln_val;
1672 return S_CAST(CXXCONST_CP, str_iter);
1673 }
1674
ScanPosintCappedx(const char * str_iter,uint64_t cap,uint32_t * valp)1675 BoolErr ScanPosintCappedx(const char* str_iter, uint64_t cap, uint32_t* valp) {
1676 double val;
1677 if ((!ScantokDouble(str_iter, &val)) || (val < 1.0) || (val > S_CAST(double, cap))) {
1678 return 1;
1679 }
1680 *valp = S_CAST(uint32_t, val);
1681 return (val != S_CAST(double, *valp));
1682 }
1683
ScanUintCappedx(const char * str_iter,uint64_t cap,uint32_t * valp)1684 BoolErr ScanUintCappedx(const char* str_iter, uint64_t cap, uint32_t* valp) {
1685 double val;
1686 if ((!ScantokDouble(str_iter, &val)) || (val < 0.0) || (val > S_CAST(double, cap))) {
1687 return 1;
1688 }
1689 *valp = S_CAST(uint32_t, val);
1690 return (val != S_CAST(double, *valp));
1691 }
1692
ScanIntAbsBoundedx(const char * str_iter,int64_t bound,int32_t * valp)1693 BoolErr ScanIntAbsBoundedx(const char* str_iter, int64_t bound, int32_t* valp) {
1694 const double bound_d = S_CAST(double, bound);
1695 double val;
1696 if ((!ScantokDouble(str_iter, &val)) || (val < -bound_d) || (val > bound_d)) {
1697 return 1;
1698 }
1699 *valp = S_CAST(int32_t, val);
1700 return (val != S_CAST(double, *valp));
1701 }
1702
ScanPosintptrx(const char * str_iter,uintptr_t * valp)1703 BoolErr ScanPosintptrx(const char* str_iter, uintptr_t* valp) {
1704 double val;
1705 if ((!ScantokDouble(str_iter, &val)) || (val < 1.0) || (val > S_CAST(double, ~k0LU))) {
1706 return 1;
1707 }
1708 *valp = S_CAST(uintptr_t, val);
1709 return (val != S_CAST(double, *valp));
1710 }
1711
GetTopTwoUi(const uint32_t * __restrict uint_arr,uintptr_t uia_size,uintptr_t * __restrict top_idx_ptr,uintptr_t * __restrict second_idx_ptr)1712 void GetTopTwoUi(const uint32_t* __restrict uint_arr, uintptr_t uia_size, uintptr_t* __restrict top_idx_ptr, uintptr_t* __restrict second_idx_ptr) {
1713 assert(uia_size > 1);
1714 uintptr_t top_idx = (uint_arr[1] > uint_arr[0])? 1 : 0;
1715 uintptr_t second_idx = 1 ^ top_idx;
1716 uint32_t top_val = uint_arr[top_idx];
1717 uint32_t second_val = uint_arr[second_idx];
1718 uintptr_t cur_idx;
1719 uintptr_t cur_val;
1720 for (cur_idx = 2; cur_idx != uia_size; ++cur_idx) {
1721 cur_val = uint_arr[cur_idx];
1722 if (cur_val > second_val) {
1723 if (cur_val > top_val) {
1724 second_val = top_val;
1725 second_idx = top_idx;
1726 top_val = cur_val;
1727 top_idx = cur_idx;
1728 } else {
1729 second_val = cur_val;
1730 second_idx = cur_idx;
1731 }
1732 }
1733 }
1734 *top_idx_ptr = top_idx;
1735 *second_idx_ptr = second_idx;
1736 }
1737
1738 #ifdef USE_AVX2
NextTokenMultFar(const char * str_iter,uint32_t ct)1739 CXXCONST_CP NextTokenMultFar(const char* str_iter, uint32_t ct) {
1740 // assert(ct);
1741 if (!str_iter) {
1742 return nullptr;
1743 }
1744 uint32_t transition_bits_to_skip_p1 = ct * 2;
1745 const uintptr_t starting_addr = R_CAST(uintptr_t, str_iter);
1746 const VecUc* str_viter_start = R_CAST(const VecUc*, RoundDownPow2(starting_addr, kBytesPerVec));
1747 const VecUc vvec_all_tab = vecuc_set1(9);
1748 const VecUc vvec_all_space = vecuc_set1(32);
1749 VecUc cur_vvec = *str_viter_start;
1750 VecUc tabspace_vvec = (cur_vvec == vvec_all_tab) | (cur_vvec == vvec_all_space);
1751 // Underlying intrinsics are _mm256_cmpeq_... and _mm256_cmpgt_...
1752 // So we don't really want to use <= or !=.
1753 // (Er, there's also an signed vs. unsigned comparison impedence mismatch:
1754 // there is no unsigned version of _mm256_cmpgt_epi8. So we don't really
1755 // want to use gcc-vector-extension < at all here when we can use a mask to
1756 // do the same job.)
1757 const VecUc vvec_all96 = vecuc_set1(96);
1758 VecUc post31_vvec = vecuc_adds(cur_vvec, vvec_all96);
1759 uint32_t delimiter_bytes = vecuc_movemask(tabspace_vvec);
1760 const uint32_t initial_nonterminating_bytes = vecuc_movemask(post31_vvec) | delimiter_bytes;
1761 const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, str_viter_start);
1762 uint32_t leading_mask = UINT32_MAX << leading_byte_ct;
1763 delimiter_bytes &= leading_mask;
1764 uint32_t terminating_bytes = leading_mask & (~initial_nonterminating_bytes);
1765 uint32_t prev_delim_highbit = 0;
1766 for (const VecUc* str_viter = str_viter_start; ; ) {
1767 // The number of 0->1 + 1->0 bit transitions in x is
1768 // popcount(x ^ (x << 1)).
1769 uint32_t cur_transitions = S_CAST(Vec8thUint, delimiter_bytes ^ (delimiter_bytes << 1) ^ prev_delim_highbit);
1770 uint32_t cur_transition_ct = PopcountVec8thUint(cur_transitions);
1771 if (cur_transition_ct >= transition_bits_to_skip_p1) {
1772 cur_transitions = ClearBottomSetBits(transition_bits_to_skip_p1 - 1, cur_transitions);
1773 uint32_t byte_offset_in_vec = ctzu32(cur_transitions);
1774 if (terminating_bytes << (31 - byte_offset_in_vec)) {
1775 return nullptr;
1776 }
1777 return &(R_CAST(CXXCONST_CP, str_viter)[byte_offset_in_vec]);
1778 }
1779 if (terminating_bytes) {
1780 return nullptr;
1781 }
1782 transition_bits_to_skip_p1 -= cur_transition_ct;
1783 prev_delim_highbit = transition_bits_to_skip_p1 & 1;
1784 ++str_viter;
1785 cur_vvec = *str_viter;
1786 tabspace_vvec = (cur_vvec == vvec_all_tab) | (cur_vvec == vvec_all_space);
1787 post31_vvec = vecuc_adds(cur_vvec, vvec_all96);
1788 delimiter_bytes = vecuc_movemask(tabspace_vvec);
1789 terminating_bytes = ~(vecuc_movemask(post31_vvec) | delimiter_bytes);
1790 }
1791 }
1792
TokenLexK0(const char * str_iter,const uint32_t * col_types,const uint32_t * col_skips,uint32_t relevant_col_ct,const char ** token_ptrs,uint32_t * token_slens)1793 const char* TokenLexK0(const char* str_iter, const uint32_t* col_types, const uint32_t* col_skips, uint32_t relevant_col_ct, const char** token_ptrs, uint32_t* token_slens) {
1794 const uint32_t relevant_col_ct_m1 = relevant_col_ct - 1;
1795 uint32_t relevant_col_idx = 0;
1796 uint32_t transition_bits_to_skip_p1 = col_skips[relevant_col_idx] * 2;
1797 const uintptr_t starting_addr = R_CAST(uintptr_t, str_iter);
1798 const VecUc* str_viter = R_CAST(const VecUc*, RoundDownPow2(starting_addr, kBytesPerVec));
1799 const VecUc vvec_all_tab = vecuc_set1(9);
1800 const VecUc vvec_all_space = vecuc_set1(32);
1801 VecUc cur_vvec = *str_viter;
1802 VecUc tabspace_vvec = (cur_vvec == vvec_all_tab) | (cur_vvec == vvec_all_space);
1803 const VecUc vvec_all96 = vecuc_set1(96);
1804 VecUc post31_vvec = vecuc_adds(cur_vvec, vvec_all96);
1805 uint32_t delimiter_bytes = vecuc_movemask(tabspace_vvec);
1806 const uint32_t initial_nonterminating_bytes = vecuc_movemask(post31_vvec) | delimiter_bytes;
1807 const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, str_viter);
1808 uint32_t leading_mask = UINT32_MAX << leading_byte_ct;
1809 delimiter_bytes &= leading_mask;
1810 uint32_t terminating_bytes = leading_mask & (~initial_nonterminating_bytes);
1811 uint32_t prev_delim_highbit = 0;
1812 if (!transition_bits_to_skip_p1) {
1813 // Special case: col_skips[0] can be zero. When it is, str_iter must point
1814 // to the beginning of a token.
1815 prev_delim_highbit = -leading_mask;
1816 transition_bits_to_skip_p1 = 1;
1817 }
1818 while (1) {
1819 // The number of 0->1 + 1->0 bit transitions in x is
1820 // popcount(x ^ (x << 1)).
1821 uint32_t cur_transitions = S_CAST(Vec8thUint, delimiter_bytes ^ (delimiter_bytes << 1) ^ prev_delim_highbit);
1822 uint32_t cur_transition_ct = PopcountVec8thUint(cur_transitions);
1823 while (cur_transition_ct >= transition_bits_to_skip_p1) {
1824 cur_transitions = ClearBottomSetBits(transition_bits_to_skip_p1 - 1, cur_transitions);
1825 uint32_t byte_offset_in_vec = ctzu32(cur_transitions);
1826 const char* token_start = &(R_CAST(const char*, str_viter)[byte_offset_in_vec]);
1827 const uint32_t cur_col_type = col_types[relevant_col_idx];
1828 token_ptrs[cur_col_type] = token_start;
1829 // Find end of token.
1830 if (relevant_col_idx == relevant_col_ct_m1) {
1831 if (terminating_bytes << (31 - byte_offset_in_vec)) {
1832 return nullptr;
1833 }
1834 // Handle this separately, since it's ok for the last token to be
1835 // terminated by something other than tab/space.
1836 // cur_transitions is a misnomer here; we just need the bottom bit to
1837 // tell us where the token end is.
1838
1839 // 33..255 is a bit more annoying to check for efficiently than
1840 // 32..255.
1841 const VecUc vvec_all95 = vecuc_set1(95);
1842 cur_transitions &= cur_transitions - 1;
1843 cur_transitions |= terminating_bytes;
1844 while (!cur_transitions) {
1845 ++str_viter;
1846 cur_vvec = *str_viter;
1847 const VecUc postspace_vvec = vecuc_adds(cur_vvec, vvec_all95);
1848 cur_transitions = S_CAST(Vec8thUint, ~vecuc_movemask(postspace_vvec));
1849 }
1850 byte_offset_in_vec = ctzu32(cur_transitions);
1851 const char* token_end = &(R_CAST(const char*, str_viter)[byte_offset_in_vec]);
1852 token_slens[cur_col_type] = token_end - token_start;
1853 return token_end;
1854 }
1855 cur_transition_ct -= transition_bits_to_skip_p1;
1856 if (!cur_transition_ct) {
1857 // Token end is in a later vector.
1858 do {
1859 if (terminating_bytes) {
1860 return nullptr;
1861 }
1862 ++str_viter;
1863 cur_vvec = *str_viter;
1864 tabspace_vvec = (cur_vvec == vvec_all_tab) | (cur_vvec == vvec_all_space);
1865 post31_vvec = vecuc_adds(cur_vvec, vvec_all96);
1866 delimiter_bytes = vecuc_movemask(tabspace_vvec);
1867 terminating_bytes = ~(vecuc_movemask(post31_vvec) | delimiter_bytes);
1868 } while (!delimiter_bytes);
1869 cur_transitions = S_CAST(Vec8thUint, delimiter_bytes ^ (delimiter_bytes << 1));
1870 cur_transition_ct = PopcountVec8thUint(cur_transitions);
1871 } else {
1872 cur_transitions &= cur_transitions - 1;
1873 }
1874 byte_offset_in_vec = ctzu32(cur_transitions);
1875 const char* token_end = &(R_CAST(const char*, str_viter)[byte_offset_in_vec]);
1876 token_slens[cur_col_type] = token_end - token_start;
1877 ++relevant_col_idx;
1878 transition_bits_to_skip_p1 = 2 * col_skips[relevant_col_idx];
1879 }
1880 if (terminating_bytes) {
1881 return nullptr;
1882 }
1883 transition_bits_to_skip_p1 -= cur_transition_ct;
1884 prev_delim_highbit = transition_bits_to_skip_p1 & 1;
1885 ++str_viter;
1886 cur_vvec = *str_viter;
1887 tabspace_vvec = (cur_vvec == vvec_all_tab) | (cur_vvec == vvec_all_space);
1888 post31_vvec = vecuc_adds(cur_vvec, vvec_all96);
1889 delimiter_bytes = vecuc_movemask(tabspace_vvec);
1890 terminating_bytes = ~(vecuc_movemask(post31_vvec) | delimiter_bytes);
1891 }
1892 }
1893 #endif // USE_AVX2
1894
NextCsvMult(const char * str_iter,uint32_t ct)1895 CXXCONST_CP NextCsvMult(const char* str_iter, uint32_t ct) {
1896 if (!str_iter) {
1897 return nullptr;
1898 }
1899 // assumes initial spaces in current token have been skipped
1900 // ok if we're at the end of the token
1901 unsigned char ucc = *str_iter;
1902 assert(ucc != ' ');
1903 while (ucc >= ' ') {
1904 // avoid strchr to keep "ASCII code < 32 == newline" consistent
1905 // (tab handling is quirky right now--permitted at the beginning of a
1906 // token, but treated as newline later--but it should never appear so
1907 // no point in e.g. adding an extra parameter to FirstNonTspace();
1908 // just need to make sure the quirky behavior is consistent.)
1909 if (ucc != ',') {
1910 ucc = *(++str_iter);
1911 continue;
1912 }
1913 do {
1914 ucc = *(++str_iter);
1915 } while ((ucc == ' ') || (ucc == '\t'));
1916 if (!(--ct)) {
1917 return S_CAST(CXXCONST_CP, str_iter);
1918 }
1919 }
1920 return nullptr;
1921 }
1922
1923 #ifdef USE_AVX2
CsvLexK(const char * str_iter,const uint32_t * col_types,const uint32_t * col_skips,uint32_t relevant_col_ct,const char ** token_ptrs,uint32_t * token_slens)1924 const char* CsvLexK(const char* str_iter, const uint32_t* col_types, const uint32_t* col_skips, uint32_t relevant_col_ct, const char** token_ptrs, uint32_t* token_slens) {
1925 // Temporary; optimize later.
1926 for (uint32_t relevant_col_idx = 0; relevant_col_idx != relevant_col_ct; ++relevant_col_idx) {
1927 const uint32_t cur_col_type = col_types[relevant_col_idx];
1928 str_iter = NextCsvMult(str_iter, col_skips[relevant_col_idx]);
1929 if (!str_iter) {
1930 return nullptr;
1931 }
1932 token_ptrs[cur_col_type] = str_iter;
1933 const char* token_end = CsvFieldEnd(str_iter);
1934 token_slens[cur_col_type] = token_end - str_iter;
1935 str_iter = token_end;
1936 }
1937 return str_iter;
1938 }
1939 #endif
1940
CountTokens(const char * str_iter)1941 uint32_t CountTokens(const char* str_iter) {
1942 uint32_t token_ct = 0;
1943 str_iter = FirstNonTspace(str_iter);
1944 while (!IsEolnKns(*str_iter)) {
1945 ++token_ct;
1946 str_iter = FirstNonTspace(CurTokenEnd(str_iter));
1947 }
1948 return token_ct;
1949 }
1950
1951 /*
1952 uint32_t CommaOrSpaceCountTokens(const char* str_iter, uint32_t comma_delim) {
1953 if (comma_delim) {
1954 // assumes nonempty line (treats trailing empty string as a token).
1955 uint32_t token_ct = 1;
1956 unsigned char ucc = (unsigned char)(*str_iter++);
1957 while (1) {
1958 if (ucc < 32) {
1959 return token_ct;
1960 }
1961 if (ucc == ',') {
1962 // spelled out due to const qualifier
1963 do {
1964 ucc = (unsigned char)(*str_iter++);
1965 } while ((ucc == ' ') || (ucc == '\t'));
1966 token_ct++;
1967 continue;
1968 }
1969 ucc = (unsigned char)(*str_iter++);
1970 }
1971 }
1972 return CountTokens(str_iter);
1973 }
1974 */
1975
CountAndMeasureMultistr(const char * multistr,uintptr_t * max_blen_ptr)1976 uint32_t CountAndMeasureMultistr(const char* multistr, uintptr_t* max_blen_ptr) {
1977 // Straightforward to accelerate this with movemask if it ever matters.
1978 // (Doesn't matter now since it's only used for command-line processing.)
1979 uint32_t ct = 0;
1980 uintptr_t max_blen = *max_blen_ptr;
1981 while (*multistr) {
1982 const uintptr_t blen = strlen(multistr) + 1;
1983 if (blen > max_blen) {
1984 max_blen = blen;
1985 }
1986 multistr = &(multistr[blen]);
1987 ++ct;
1988 } while (*multistr);
1989 *max_blen_ptr = max_blen;
1990 return ct;
1991 }
1992
1993 // number-to-string encoders
1994
1995 const uint16_t kDigitPair[] = {
1996 0x3030, 0x3130, 0x3230, 0x3330, 0x3430, 0x3530, 0x3630, 0x3730, 0x3830, 0x3930,
1997 0x3031, 0x3131, 0x3231, 0x3331, 0x3431, 0x3531, 0x3631, 0x3731, 0x3831, 0x3931,
1998 0x3032, 0x3132, 0x3232, 0x3332, 0x3432, 0x3532, 0x3632, 0x3732, 0x3832, 0x3932,
1999 0x3033, 0x3133, 0x3233, 0x3333, 0x3433, 0x3533, 0x3633, 0x3733, 0x3833, 0x3933,
2000 0x3034, 0x3134, 0x3234, 0x3334, 0x3434, 0x3534, 0x3634, 0x3734, 0x3834, 0x3934,
2001 0x3035, 0x3135, 0x3235, 0x3335, 0x3435, 0x3535, 0x3635, 0x3735, 0x3835, 0x3935,
2002 0x3036, 0x3136, 0x3236, 0x3336, 0x3436, 0x3536, 0x3636, 0x3736, 0x3836, 0x3936,
2003 0x3037, 0x3137, 0x3237, 0x3337, 0x3437, 0x3537, 0x3637, 0x3737, 0x3837, 0x3937,
2004 0x3038, 0x3138, 0x3238, 0x3338, 0x3438, 0x3538, 0x3638, 0x3738, 0x3838, 0x3938,
2005 0x3039, 0x3139, 0x3239, 0x3339, 0x3439, 0x3539, 0x3639, 0x3739, 0x3839, 0x3939};
2006
u32toa(uint32_t uii,char * start)2007 char* u32toa(uint32_t uii, char* start) {
2008 // Memory-efficient fast integer writer. (You can do a bit better sometimes
2009 // by using a larger lookup table, but on average I doubt that pays off.)
2010 // Returns a pointer to the end of the integer (not null-terminated).
2011 //
2012 // Nearly identical to 'branchlut' from
2013 // https://github.com/miloyip/itoa-benchmark , except that the hardcoded
2014 // binary search is more balanced (start by comparing 6+ digits vs. <6,
2015 // instead of 9+ digits vs. <8). This tends to be slightly better unless the
2016 // integers are almost uniformly distributed over [0, 2^32).
2017 //
2018 // Todo: compare against an_itoa in https://github.com/appnexus/acf/ .
2019 //
2020 // (Making the first comparison 7+ digits vs. <7 would seem to make sense,
2021 // but it seems to benchmark slightly worse on my Mac?)
2022 //
2023 // (Since we want to execute different code depending on the number of
2024 // digits, the UintSlen() approach doesn't pay off.)
2025 uint32_t quotient;
2026 if (uii < 100000) {
2027 if (uii < 100) {
2028 if (uii >= 10) {
2029 goto u32toa_just2;
2030 }
2031 *start++ = '0' + uii;
2032 return start;
2033 }
2034 if (uii < 10000) {
2035 if (uii >= 1000) {
2036 goto u32toa_just4;
2037 }
2038 quotient = uii / 100;
2039 *start++ = '0' + quotient;
2040 goto u32toa_2left;
2041 }
2042 quotient = uii / 10000;
2043 *start++ = '0' + quotient;
2044 goto u32toa_4left;
2045 }
2046 if (uii < 100000000) {
2047 if (uii < 1000000) {
2048 goto u32toa_just6;
2049 }
2050 if (uii >= 10000000) {
2051 goto u32toa_just8;
2052 }
2053 quotient = uii / 1000000;
2054 *start++ = '0' + quotient;
2055 goto u32toa_6left;
2056 }
2057 quotient = uii / 100000000;
2058 if (uii < 1000000000) {
2059 *start++ = '0' + quotient;
2060 } else {
2061 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2062 }
2063 uii -= quotient * 100000000;
2064 u32toa_just8:
2065 quotient = uii / 1000000;
2066 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2067 u32toa_6left:
2068 uii -= quotient * 1000000;
2069 u32toa_just6:
2070 quotient = uii / 10000;
2071 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2072 u32toa_4left:
2073 uii -= quotient * 10000;
2074 u32toa_just4:
2075 quotient = uii / 100;
2076 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2077 u32toa_2left:
2078 uii -= quotient * 100;
2079 u32toa_just2:
2080 return memcpya_k(start, &(kDigitPair[uii]), 2);
2081 }
2082
i32toa(int32_t ii,char * start)2083 char* i32toa(int32_t ii, char* start) {
2084 uint32_t uii = ii;
2085 if (ii < 0) {
2086 // -INT_MIN is undefined, but negating the unsigned int equivalent works
2087 *start++ = '-';
2088 uii = -uii;
2089 }
2090 return u32toa(uii, start);
2091 }
2092
uitoa_z4(uint32_t uii,char * start)2093 char* uitoa_z4(uint32_t uii, char* start) {
2094 uint32_t quotient = uii / 100;
2095 assert(quotient < 100);
2096 uii -= 100 * quotient;
2097 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2098 return memcpya_k(start, &(kDigitPair[uii]), 2);
2099 }
2100
u32toa_z5(uint32_t uii,char * start)2101 char* u32toa_z5(uint32_t uii, char* start) {
2102 uint32_t quotient = uii / 10000;
2103 *start++ = '0' + quotient;
2104 return uitoa_z4(uii - 10000 * quotient, start);
2105 }
2106
uitoa_z6(uint32_t uii,char * start)2107 char* uitoa_z6(uint32_t uii, char* start) {
2108 uint32_t quotient = uii / 10000;
2109 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2110 return uitoa_z4(uii - 10000 * quotient, start);
2111 }
2112
uitoa_z8(uint32_t uii,char * start)2113 char* uitoa_z8(uint32_t uii, char* start) {
2114 uint32_t quotient = uii / 1000000;
2115 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2116 return uitoa_z6(uii - 1000000 * quotient, start);
2117 }
2118
i64toa(int64_t llii,char * start)2119 char* i64toa(int64_t llii, char* start) {
2120 uint64_t ullii = llii;
2121 uint64_t top_digits;
2122 uint32_t bottom_eight;
2123 uint32_t middle_eight;
2124 if (llii < 0) {
2125 *start++ = '-';
2126 ullii = -ullii;
2127 }
2128 if (ullii <= 0xffffffffLLU) {
2129 return u32toa(S_CAST(uint32_t, ullii), start);
2130 }
2131 top_digits = ullii / 100000000;
2132 bottom_eight = S_CAST(uint32_t, ullii - (top_digits * 100000000));
2133 if (top_digits <= 0xffffffffLLU) {
2134 start = u32toa(S_CAST(uint32_t, top_digits), start);
2135 return uitoa_z8(bottom_eight, start);
2136 }
2137 ullii = top_digits / 100000000;
2138 middle_eight = S_CAST(uint32_t, top_digits - (ullii * 100000000));
2139 start = u32toa(S_CAST(uint32_t, ullii), start);
2140 start = uitoa_z8(middle_eight, start);
2141 return uitoa_z8(bottom_eight, start);
2142 }
2143
2144
u32toa_trunc4(uint32_t uii,char * start)2145 char* u32toa_trunc4(uint32_t uii, char* start) {
2146 uint32_t quotient = uii / 100;
2147 memcpy_k(start, &(kDigitPair[quotient]), 2);
2148 uii -= 100 * quotient;
2149 if (uii) {
2150 start += 2;
2151 memcpy_k(start, &(kDigitPair[uii]), 2);
2152 }
2153 if (start[1] != '0') {
2154 return &(start[2]);
2155 }
2156 return &(start[1]);
2157 }
2158
uitoa_trunc6(uint32_t uii,char * start)2159 static inline char* uitoa_trunc6(uint32_t uii, char* start) {
2160 uint32_t quotient = uii / 10000;
2161 memcpy_k(start, &(kDigitPair[quotient]), 2);
2162 uii -= 10000 * quotient;
2163 if (uii) {
2164 quotient = uii / 100;
2165 start += 2;
2166 memcpy_k(start, &(kDigitPair[quotient]), 2);
2167 uii -= 100 * quotient;
2168 if (uii) {
2169 start += 2;
2170 memcpy_k(start, &(kDigitPair[uii]), 2);
2171 }
2172 }
2173 if (start[1] != '0') {
2174 return &(start[2]);
2175 }
2176 return &(start[1]);
2177 }
2178
uitoa_trunc8(uint32_t uii,char * start)2179 static inline char* uitoa_trunc8(uint32_t uii, char* start) {
2180 uint32_t quotient = uii / 1000000;
2181 memcpy_k(start, &(kDigitPair[quotient]), 2);
2182 uii -= 1000000 * quotient;
2183 if (uii) {
2184 quotient = uii / 10000;
2185 start += 2;
2186 memcpy_k(start, &(kDigitPair[quotient]), 2);
2187 uii -= 10000 * quotient;
2188 if (uii) {
2189 quotient = uii / 100;
2190 start += 2;
2191 memcpy_k(start, &(kDigitPair[quotient]), 2);
2192 uii -= 100 * quotient;
2193 if (uii) {
2194 start += 2;
2195 memcpy_k(start, &(kDigitPair[uii]), 2);
2196 }
2197 }
2198 }
2199 if (start[1] != '0') {
2200 return &(start[2]);
2201 }
2202 return &(start[1]);
2203 }
2204
rtoa_p5(uint32_t remainder,char * start)2205 static inline char* rtoa_p5(uint32_t remainder, char* start) {
2206 if (!remainder) {
2207 return start;
2208 }
2209 *start++ = '.';
2210 uint32_t quotient = remainder / 1000;
2211 memcpy_k(start, &(kDigitPair[quotient]), 2);
2212 remainder -= 1000 * quotient;
2213 if (remainder) {
2214 quotient = remainder / 10;
2215 start += 2;
2216 memcpy_k(start, &(kDigitPair[quotient]), 2);
2217 remainder -= 10 * quotient;
2218 if (remainder) {
2219 start[2] = '0' + remainder;
2220 return &(start[3]);
2221 }
2222 }
2223 if (start[1] != '0') {
2224 return &(start[2]);
2225 }
2226 return &(start[1]);
2227 }
2228
qrtoa_1p5(uint32_t quotient,uint32_t remainder,char * start)2229 static inline char* qrtoa_1p5(uint32_t quotient, uint32_t remainder, char* start) {
2230 *start++ = '0' + quotient;
2231 return rtoa_p5(remainder, start);
2232 }
2233
qrtoa_1p7(uint32_t quotient,uint32_t remainder,char * start)2234 static inline char* qrtoa_1p7(uint32_t quotient, uint32_t remainder, char* start) {
2235 *start++ = '0' + quotient;
2236 if (!remainder) {
2237 return start;
2238 }
2239 *start++ = '.';
2240 quotient = remainder / 100000;
2241 memcpy_k(start, &(kDigitPair[quotient]), 2);
2242 remainder -= 100000 * quotient;
2243 if (remainder) {
2244 quotient = remainder / 1000;
2245 start += 2;
2246 memcpy_k(start, &(kDigitPair[quotient]), 2);
2247 remainder -= 1000 * quotient;
2248 if (remainder) {
2249 quotient = remainder / 10;
2250 start += 2;
2251 memcpy_k(start, &(kDigitPair[quotient]), 2);
2252 remainder -= 10 * quotient;
2253 if (remainder) {
2254 start[2] = '0' + remainder;
2255 return &(start[3]);
2256 }
2257 }
2258 }
2259 if (start[1] != '0') {
2260 return &(start[2]);
2261 }
2262 return &(start[1]);
2263 }
2264
2265 // Okay, time to do banker's rounding when printing doubles. 14 digits of
2266 // precision are used in judging equality to 0.5 (actual precision of doubles
2267 // is 15-17 digits); the intention is to capture all directly loaded or exactly
2268 // computed edge cases (so enough tolerance is needed to survive the internal
2269 // multiplications by powers of 10, etc.), while rounding a negligible number
2270 // of honest-to-god 0.4999999s up and 0.5000001s down.
2271 // To avoid inadvertent printing of an extra digit, there's a deliberate gap
2272 // between the 99.9994999...-type bounds and the largest numbers that would
2273 // actually round down.
2274 static const STD_ARRAY_DECL(double, 2, kBankerRound6) = STD_ARRAY_INIT_START() {0.4999995, 0.5000005} STD_ARRAY_INIT_END();
2275 static const STD_ARRAY_DECL(double, 2, kBankerRound8) = STD_ARRAY_INIT_START() {0.499999995, 0.500000005} STD_ARRAY_INIT_END();
2276
2277 static inline uint32_t BankerRoundD(double dxx, STD_ARRAY_KREF(double, 2) banker_round) {
2278 uint32_t result = S_CAST(int32_t, dxx);
2279 return result + S_CAST(int32_t, (dxx - u31tod(result)) + banker_round[result & 1]);
2280 }
2281
2282 // These are separate functions so the compiler can optimize the integer
2283 // divisions.
2284 static inline void BankerRoundD1(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2285 dxx *= 10;
2286 uint32_t remainder = S_CAST(int32_t, dxx);
2287 remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2288 *quotientp = remainder / 10;
2289 *remainderp = remainder - (*quotientp) * 10;
2290 }
2291
2292 static inline void BankerRoundD2(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2293 dxx *= 100;
2294 uint32_t remainder = S_CAST(int32_t, dxx);
2295 remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2296 *quotientp = remainder / 100;
2297 *remainderp = remainder - (*quotientp) * 100;
2298 }
2299
2300 static inline void BankerRoundD3(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2301 dxx *= 1000;
2302 uint32_t remainder = S_CAST(int32_t, dxx);
2303 remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2304 *quotientp = remainder / 1000;
2305 *remainderp = remainder - (*quotientp) * 1000;
2306 }
2307
2308 static inline void BankerRoundD4(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2309 dxx *= 10000;
2310 uint32_t remainder = S_CAST(int32_t, dxx);
2311 remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2312 *quotientp = remainder / 10000;
2313 *remainderp = remainder - (*quotientp) * 10000;
2314 }
2315
2316 static inline void BankerRoundD5(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2317 dxx *= 100000;
2318 uint32_t remainder = S_CAST(int32_t, dxx);
2319 remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2320 *quotientp = remainder / 100000;
2321 *remainderp = remainder - (*quotientp) * 100000;
2322 }
2323
2324 static inline void BankerRoundD6(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2325 dxx *= 1000000;
2326 uint32_t remainder = S_CAST(int32_t, dxx);
2327 remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2328 *quotientp = remainder / 1000000;
2329 *remainderp = remainder - (*quotientp) * 1000000;
2330 }
2331
2332 static inline void BankerRoundD7(double dxx, STD_ARRAY_KREF(double, 2) banker_round, uint32_t* quotientp, uint32_t* remainderp) {
2333 dxx *= 10000000;
2334 uint32_t remainder = S_CAST(int32_t, dxx);
2335 remainder += S_CAST(int32_t, (dxx - u31tod(remainder)) + banker_round[remainder & 1]);
2336 *quotientp = remainder / 10000000;
2337 *remainderp = remainder - (*quotientp) * 10000000;
2338 }
2339
dtoa_so6(double dxx,char * start)2340 char* dtoa_so6(double dxx, char* start) {
2341 // 6 sig fig number, 0.999995 <= dxx < 999999.5
2342 // 'so' = "significand only"
2343 // Just hardcoding all six cases, in the absence of a better approach...
2344 uint32_t uii;
2345 uint32_t quotient;
2346 uint32_t remainder;
2347 if (dxx < 99.999949999999) {
2348 if (dxx < 9.9999949999999) {
2349 BankerRoundD5(dxx, kBankerRound8, "ient, &remainder);
2350 return qrtoa_1p5(quotient, remainder, start);
2351 }
2352 BankerRoundD4(dxx, kBankerRound8, "ient, &remainder);
2353 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2354 if (!remainder) {
2355 return start;
2356 }
2357 *start++ = '.';
2358 quotient = remainder / 100;
2359 memcpy_k(start, &(kDigitPair[quotient]), 2);
2360 remainder -= 100 * quotient;
2361 if (remainder) {
2362 start += 2;
2363 dtoa_so6_pretail:
2364 memcpy_k(start, &(kDigitPair[remainder]), 2);
2365 }
2366 dtoa_so6_tail:
2367 if (start[1] != '0') {
2368 return &(start[2]);
2369 }
2370 return &(start[1]);
2371 }
2372 if (dxx < 9999.9949999999) {
2373 if (dxx < 999.99949999999) {
2374 BankerRoundD3(dxx, kBankerRound8, &uii, &remainder);
2375 quotient = uii / 100;
2376 *start++ = '0' + quotient;
2377 quotient = uii - 100 * quotient;
2378 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2379 if (!remainder) {
2380 return start;
2381 }
2382 *start++ = '.';
2383 quotient = remainder / 10;
2384 memcpy_k(start, &(kDigitPair[quotient]), 2);
2385 remainder -= quotient * 10;
2386 if (!remainder) {
2387 goto dtoa_so6_tail;
2388 }
2389 start[2] = '0' + remainder;
2390 return &(start[3]);
2391 }
2392 BankerRoundD2(dxx, kBankerRound8, &uii, &remainder);
2393 quotient = uii / 100;
2394 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2395 quotient = uii - (100 * quotient);
2396 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2397 if (!remainder) {
2398 return start;
2399 }
2400 *start++ = '.';
2401 goto dtoa_so6_pretail;
2402 }
2403 if (dxx >= 99999.949999999) {
2404 return uitoa_z6(BankerRoundD(dxx, kBankerRound8), start);
2405 }
2406 BankerRoundD1(dxx, kBankerRound8, &uii, &remainder);
2407 quotient = uii / 10000;
2408 *start = '0' + quotient;
2409 uii -= 10000 * quotient;
2410 quotient = uii / 100;
2411 start = memcpya_k(&(start[1]), &(kDigitPair[quotient]), 2);
2412 uii = uii - 100 * quotient;
2413 start = memcpya_k(start, &(kDigitPair[uii]), 2);
2414 if (!remainder) {
2415 return start;
2416 }
2417 *start++ = '.';
2418 *start = '0' + remainder;
2419 return &(start[1]);
2420 }
2421
dtoa_so8(double dxx,char * start)2422 char* dtoa_so8(double dxx, char* start) {
2423 // 8 sig fig number, 0.99999995 <= dxx < 99999999.5
2424 uint32_t uii;
2425 uint32_t quotient;
2426 uint32_t remainder;
2427 if (dxx < 99.999999499999) {
2428 if (dxx < 9.9999999499999) {
2429 BankerRoundD7(dxx, kBankerRound6, "ient, &remainder);
2430 return qrtoa_1p7(quotient, remainder, start);
2431 }
2432 BankerRoundD6(dxx, kBankerRound6, "ient, &remainder);
2433 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2434 if (!remainder) {
2435 return start;
2436 }
2437 *start++ = '.';
2438 quotient = remainder / 10000;
2439 memcpy_k(start, &(kDigitPair[quotient]), 2);
2440 remainder -= 10000 * quotient;
2441 if (remainder) {
2442 start += 2;
2443 dtoa_so8_pretail4:
2444 quotient = remainder / 100;
2445 memcpy_k(start, &(kDigitPair[quotient]), 2);
2446 remainder -= 100 * quotient;
2447 if (remainder) {
2448 start += 2;
2449 dtoa_so8_pretail2:
2450 memcpy_k(start, &(kDigitPair[remainder]), 2);
2451 }
2452 }
2453 dtoa_so8_tail:
2454 if (start[1] != '0') {
2455 return &(start[2]);
2456 }
2457 return &(start[1]);
2458 }
2459 if (dxx < 9999.9999499999) {
2460 if (dxx < 999.99999499999) {
2461 BankerRoundD5(dxx, kBankerRound6, &uii, &remainder);
2462 quotient = uii / 100;
2463 *start++ = '0' + quotient;
2464 quotient = uii - 100 * quotient;
2465 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2466 if (!remainder) {
2467 return start;
2468 }
2469 *start++ = '.';
2470 quotient = remainder / 1000;
2471 memcpy_k(start, &(kDigitPair[quotient]), 2);
2472 remainder -= quotient * 1000;
2473 if (!remainder) {
2474 goto dtoa_so8_tail;
2475 }
2476 start += 2;
2477 dtoa_so8_pretail3:
2478 quotient = remainder / 10;
2479 memcpy_k(start, &(kDigitPair[quotient]), 2);
2480 remainder -= quotient * 10;
2481 if (!remainder) {
2482 goto dtoa_so8_tail;
2483 }
2484 start[2] = '0' + remainder;
2485 return &(start[3]);
2486 }
2487 BankerRoundD4(dxx, kBankerRound6, &uii, &remainder);
2488 quotient = uii / 100;
2489 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2490 quotient = uii - (100 * quotient);
2491 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2492 if (!remainder) {
2493 return start;
2494 }
2495 *start++ = '.';
2496 goto dtoa_so8_pretail4;
2497 }
2498 if (dxx < 999999.99499999) {
2499 if (dxx < 99999.999499999) {
2500 BankerRoundD3(dxx, kBankerRound6, &uii, &remainder);
2501 quotient = uii / 10000;
2502 *start = '0' + quotient;
2503 uii -= 10000 * quotient;
2504 quotient = uii / 100;
2505 start = memcpya_k(&(start[1]), &(kDigitPair[quotient]), 2);
2506 uii -= 100 * quotient;
2507 start = memcpya_k(start, &(kDigitPair[uii]), 2);
2508 if (!remainder) {
2509 return start;
2510 }
2511 *start++ = '.';
2512 goto dtoa_so8_pretail3;
2513 }
2514 BankerRoundD2(dxx, kBankerRound6, &uii, &remainder);
2515 quotient = uii / 10000;
2516 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2517 uii -= 10000 * quotient;
2518 quotient = uii / 100;
2519 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2520 uii -= 100 * quotient;
2521 start = memcpya_k(start, &(kDigitPair[uii]), 2);
2522 if (!remainder) {
2523 return start;
2524 }
2525 *start++ = '.';
2526 goto dtoa_so8_pretail2;
2527 }
2528 if (dxx >= 9999999.9499999) {
2529 return uitoa_z8(BankerRoundD(dxx, kBankerRound6), start);
2530 }
2531 BankerRoundD1(dxx, kBankerRound6, &uii, &remainder);
2532 quotient = uii / 1000000;
2533 *start = '0' + quotient;
2534 uii -= 1000000 * quotient;
2535 quotient = uii / 10000;
2536 start = memcpya_k(&(start[1]), &(kDigitPair[quotient]), 2);
2537 uii -= 10000 * quotient;
2538 quotient = uii / 100;
2539 start = memcpya_k(start, &(kDigitPair[quotient]), 2);
2540 uii -= 100 * quotient;
2541 start = memcpya_k(start, &(kDigitPair[uii]), 2);
2542 if (!remainder) {
2543 return start;
2544 }
2545 *start = '.';
2546 start[1] = '0' + remainder;
2547 return &(start[2]);
2548 }
2549
dtoa_g(double dxx,char * start)2550 char* dtoa_g(double dxx, char* start) {
2551 uint32_t xp10 = 0;
2552 uint32_t quotient;
2553 uint32_t remainder;
2554 if (dxx != dxx) {
2555 return strcpya_k(start, "nan");
2556 }
2557 if (dxx < 0) {
2558 *start++ = '-';
2559 dxx = -dxx;
2560 }
2561 if (dxx < 9.9999949999999e-5) {
2562 // 6 sig fig exponential notation, small
2563 if (dxx < 9.9999949999999e-16) {
2564 if (dxx < 9.9999949999999e-128) {
2565 if (dxx == 0.0) {
2566 *start = '0';
2567 return &(start[1]);
2568 }
2569 if (dxx < 9.9999949999999e-256) {
2570 dxx *= 1.0e256;
2571 xp10 |= 256;
2572 } else {
2573 dxx *= 1.0e128;
2574 xp10 |= 128;
2575 }
2576 }
2577 if (dxx < 9.9999949999999e-64) {
2578 dxx *= 1.0e64;
2579 xp10 |= 64;
2580 }
2581 if (dxx < 9.9999949999999e-32) {
2582 dxx *= 1.0e32;
2583 xp10 |= 32;
2584 }
2585 if (dxx < 9.9999949999999e-16) {
2586 dxx *= 1.0e16;
2587 xp10 |= 16;
2588 }
2589 }
2590 if (dxx < 9.9999949999999e-8) {
2591 dxx *= 100000000;
2592 xp10 |= 8;
2593 }
2594 if (dxx < 9.9999949999999e-4) {
2595 dxx *= 10000;
2596 xp10 |= 4;
2597 }
2598 if (dxx < 9.9999949999999e-2) {
2599 dxx *= 100;
2600 xp10 |= 2;
2601 }
2602 if (dxx < 9.9999949999999e-1) {
2603 dxx *= 10;
2604 ++xp10;
2605 }
2606 BankerRoundD5(dxx, kBankerRound8, "ient, &remainder);
2607 start = memcpya_k(qrtoa_1p5(quotient, remainder, start), "e-", 2);
2608 if (xp10 >= 100) {
2609 quotient = xp10 / 100;
2610 *start++ = '0' + quotient;
2611 xp10 -= 100 * quotient;
2612 }
2613 return memcpya_k(start, &(kDigitPair[xp10]), 2);
2614 }
2615 if (dxx >= 999999.49999999) {
2616 // 6 sig fig exponential notation, large
2617 if (dxx >= 9.9999949999999e15) {
2618 if (dxx >= 9.9999949999999e127) {
2619 if (dxx > DBL_MAX) {
2620 return strcpya_k(start, "inf");
2621 }
2622 if (dxx >= 9.9999949999999e255) {
2623 dxx *= 1.0e-256;
2624 xp10 |= 256;
2625 } else {
2626 dxx *= 1.0e-128;
2627 xp10 |= 128;
2628 }
2629 }
2630 if (dxx >= 9.9999949999999e63) {
2631 dxx *= 1.0e-64;
2632 xp10 |= 64;
2633 }
2634 if (dxx >= 9.9999949999999e31) {
2635 dxx *= 1.0e-32;
2636 xp10 |= 32;
2637 }
2638 if (dxx >= 9.9999949999999e15) {
2639 dxx *= 1.0e-16;
2640 xp10 |= 16;
2641 }
2642 }
2643 if (dxx >= 9.9999949999999e7) {
2644 dxx *= 1.0e-8;
2645 xp10 |= 8;
2646 }
2647 if (dxx >= 9.9999949999999e3) {
2648 dxx *= 1.0e-4;
2649 xp10 |= 4;
2650 }
2651 if (dxx >= 9.9999949999999e1) {
2652 dxx *= 1.0e-2;
2653 xp10 |= 2;
2654 }
2655 if (dxx >= 9.9999949999999e0) {
2656 dxx *= 1.0e-1;
2657 xp10++;
2658 }
2659 BankerRoundD5(dxx, kBankerRound8, "ient, &remainder);
2660 start = memcpya_k(qrtoa_1p5(quotient, remainder, start), "e+", 2);
2661 if (xp10 >= 100) {
2662 quotient = xp10 / 100;
2663 *start++ = '0' + quotient;
2664 xp10 -= 100 * quotient;
2665 }
2666 return memcpya_k(start, &(kDigitPair[xp10]), 2);
2667 }
2668 if (dxx >= 0.99999949999999) {
2669 return dtoa_so6(dxx, start);
2670 }
2671 // 6 sig fig decimal, no less than ~0.0001
2672 start = memcpya_k(start, "0.", 2);
2673 if (dxx < 9.9999949999999e-3) {
2674 dxx *= 100;
2675 start = memcpya_k(start, "00", 2);
2676 }
2677 if (dxx < 9.9999949999999e-2) {
2678 dxx *= 10;
2679 *start++ = '0';
2680 }
2681 return uitoa_trunc6(BankerRoundD(dxx * 1000000, kBankerRound8), start);
2682 }
2683
dtoa_g_p8(double dxx,char * start)2684 char* dtoa_g_p8(double dxx, char* start) {
2685 uint32_t xp10 = 0;
2686 char wbuf[16];
2687 char* wpos = wbuf;
2688 uint32_t quotient;
2689 uint32_t remainder;
2690 if (dxx != dxx) {
2691 return strcpya_k(start, "nan");
2692 }
2693 if (dxx < 0) {
2694 *wpos++ = '-';
2695 dxx = -dxx;
2696 }
2697 if (dxx < 9.9999999499999e-5) {
2698 // 8 sig fig exponential notation, small
2699 if (dxx < 9.9999999499999e-16) {
2700 if (dxx < 9.9999999499999e-128) {
2701 if (dxx == 0.0) {
2702 *start = '0';
2703 return &(start[1]);
2704 }
2705 if (dxx < 9.9999999499999e-256) {
2706 dxx *= 1.0e256;
2707 xp10 |= 256;
2708 } else {
2709 dxx *= 1.0e128;
2710 xp10 |= 128;
2711 }
2712 }
2713 if (dxx < 9.9999999499999e-64) {
2714 dxx *= 1.0e64;
2715 xp10 |= 64;
2716 }
2717 if (dxx < 9.9999999499999e-32) {
2718 dxx *= 1.0e32;
2719 xp10 |= 32;
2720 }
2721 if (dxx < 9.9999999499999e-16) {
2722 dxx *= 1.0e16;
2723 xp10 |= 16;
2724 }
2725 }
2726 if (dxx < 9.9999999499999e-8) {
2727 dxx *= 100000000;
2728 xp10 |= 8;
2729 }
2730 if (dxx < 9.9999999499999e-4) {
2731 dxx *= 10000;
2732 xp10 |= 4;
2733 }
2734 if (dxx < 9.9999999499999e-2) {
2735 dxx *= 100;
2736 xp10 |= 2;
2737 }
2738 if (dxx < 9.9999999499999e-1) {
2739 dxx *= 10;
2740 ++xp10;
2741 }
2742 BankerRoundD7(dxx, kBankerRound6, "ient, &remainder);
2743 wpos = qrtoa_1p7(quotient, remainder, wpos);
2744 remainder = wpos - wbuf;
2745 if (xp10 >= 100) {
2746 start = memcpya(start, wbuf, remainder);
2747 quotient = xp10 / 100;
2748 start = memcpyax_k(start, "e-", 2, '0' + quotient);
2749 xp10 -= 100 * quotient;
2750 } else {
2751 start = memcpya(start, wbuf, remainder);
2752 start = memcpya_k(start, "e-", 2);
2753 }
2754 return memcpya_k(start, &(kDigitPair[xp10]), 2);
2755 }
2756 if (dxx >= 99999999.499999) {
2757 // 8 sig fig exponential notation, large
2758 if (dxx >= 9.9999999499999e15) {
2759 if (dxx >= 9.9999999499999e127) {
2760 if (dxx > DBL_MAX) {
2761 if (wpos == wbuf) {
2762 return memcpya(start, " inf", 4);
2763 }
2764 return memcpya(start, "-inf", 4);
2765 }
2766 if (dxx >= 9.9999999499999e255) {
2767 dxx *= 1.0e-256;
2768 xp10 |= 256;
2769 } else {
2770 dxx *= 1.0e-128;
2771 xp10 |= 128;
2772 }
2773 }
2774 if (dxx >= 9.9999999499999e63) {
2775 dxx *= 1.0e-64;
2776 xp10 |= 64;
2777 }
2778 if (dxx >= 9.9999999499999e31) {
2779 dxx *= 1.0e-32;
2780 xp10 |= 32;
2781 }
2782 if (dxx >= 9.9999999499999e15) {
2783 dxx *= 1.0e-16;
2784 xp10 |= 16;
2785 }
2786 }
2787 if (dxx >= 9.9999999499999e7) {
2788 dxx *= 1.0e-8;
2789 xp10 |= 8;
2790 }
2791 if (dxx >= 9.9999999499999e3) {
2792 dxx *= 1.0e-4;
2793 xp10 |= 4;
2794 }
2795 if (dxx >= 9.9999999499999e1) {
2796 dxx *= 1.0e-2;
2797 xp10 |= 2;
2798 }
2799 if (dxx >= 9.9999999499999e0) {
2800 dxx *= 1.0e-1;
2801 ++xp10;
2802 }
2803 BankerRoundD7(dxx, kBankerRound6, "ient, &remainder);
2804 wpos = qrtoa_1p7(quotient, remainder, wpos);
2805 remainder = wpos - wbuf;
2806 if (xp10 >= 100) {
2807 start = memcpya(start, wbuf, remainder);
2808 quotient = xp10 / 100;
2809 start = memcpyax_k(start, "e+", 2, '0' + quotient);
2810 xp10 -= 100 * quotient;
2811 } else {
2812 start = memcpya(start, wbuf, remainder);
2813 start = memcpya_k(start, "e+", 2);
2814 }
2815 return memcpya_k(start, &(kDigitPair[xp10]), 2);
2816 }
2817 if (dxx >= 0.99999999499999) {
2818 wpos = dtoa_so8(dxx, wpos);
2819 } else {
2820 // 8 sig fig decimal, no less than ~0.0001
2821 wpos = memcpya_k(wpos, "0.", 2);
2822 if (dxx < 9.9999999499999e-3) {
2823 dxx *= 100;
2824 wpos = memcpya_k(wpos, "00", 2);
2825 }
2826 if (dxx < 9.9999999499999e-2) {
2827 dxx *= 10;
2828 *wpos++ = '0';
2829 }
2830 wpos = uitoa_trunc8(BankerRoundD(dxx * 100000000, kBankerRound6), wpos);
2831 }
2832 remainder = wpos - wbuf;
2833 return memcpya(start, wbuf, remainder);
2834 }
2835
2836 // "prob" means that the number is guaranteed to be in [0, 1].
2837 // no leading space is printed. trailing zeroes (/decimal point) are erased
2838 // iff there is equality to ~13 decimal places.
dtoa_f_probp6_spaced(double dxx,char * start)2839 char* dtoa_f_probp6_spaced(double dxx, char* start) {
2840 double dxx_10_6 = dxx * 1000000;
2841 const uint32_t dec_digits = BankerRoundD(dxx_10_6, kBankerRound8);
2842 *start++ = '0' + (dec_digits == 1000000);
2843 *start++ = '.';
2844 start = uitoa_z6(dec_digits, start);
2845 if (fabs(dxx_10_6 - u31tod(dec_digits)) >= 0.00000005) {
2846 return start;
2847 }
2848 TrailingZeroesToSpaces(start);
2849 return start;
2850 }
2851
dtoa_f_probp6_clipped(double dxx,char * start)2852 char* dtoa_f_probp6_clipped(double dxx, char* start) {
2853 double dxx_10_6 = dxx * 1000000;
2854 const uint32_t dec_digits = BankerRoundD(dxx_10_6, kBankerRound8);
2855 *start++ = '0' + (dec_digits == 1000000);
2856 *start++ = '.';
2857 start = uitoa_z6(dec_digits, start);
2858 if (fabs(dxx_10_6 - u31tod(dec_digits)) >= 0.00000005) {
2859 return start;
2860 }
2861 return ClipTrailingZeroes(start);
2862 }
2863
2864 /*
2865 char* dtoa_f_p5_clipped(double dxx, char* start) {
2866 if (dxx != dxx) {
2867 return strcpya_k(start, "nan");
2868 }
2869 if (dxx < 0.0) {
2870 // note that "-0" will be printed for very small negative numbers; do we
2871 // want this?
2872 *start++ = '-';
2873 dxx = -dxx;
2874 }
2875 #ifdef __LP64__
2876 if (dxx < 4294967295.999994) {
2877 // We could use different levels of banker's rounding for different-size
2878 // quotients, but that's overkill for now; revisit after basic dosage
2879 // support is working.
2880 dxx *= 100000;
2881 uint64_t remainder = (int64_t)dxx;
2882 remainder += (int64_t)((dxx - ((int64_t)remainder)) + kBankerRound6[remainder & 1]);
2883 uint64_t quotient = remainder / 100000;
2884 remainder = remainder - quotient * 100000;
2885 start = u32toa(quotient, start);
2886 return rtoa_p5(remainder, start);
2887 }
2888 #else
2889 if (dxx < 2147483647.999994) {
2890 // avoid 64-bit integer math in 32-bit build.
2891 // (todo: a bit of benchmarking)
2892 const uintptr_t quotient = (intptr_t)dxx;
2893 const double remainder_d = (dxx - ((intptr_t)quotient)) * 100000;
2894 const uint32_t remainder_d_trunc = (int32_t)remainder_d;
2895 const uint32_t remainder = (int32_t)(remainder_d + kBankerRound6[remainder_d_trunc & 1]);
2896 start = u32toa(quotient, start);
2897 return rtoa_p5(remainder, start);
2898 }
2899 #endif
2900 if (dxx == INFINITY) {
2901 return strcpya_k(start, "inf");
2902 }
2903 // just punt larger numbers to glibc for now, this isn't a bottleneck
2904 start += sprintf(start, "%.5f", dxx);
2905 // .5f doesn't strip trailing zeroes, do that manually
2906 for (uint32_t uii = 0; uii != 5; ++uii) {
2907 if (start[-1] != '0') {
2908 return start;
2909 }
2910 --start;
2911 }
2912 return &(start[-1]); // strip the decimal point
2913 }
2914 */
2915
2916 // todo: benchmark the exponential-notation part of this vs. dtoa_g(); maybe
2917 // dtoa_g() should actually call this (or at least the exponential-notation
2918 // part, put into its own function) for the most extreme values?
lntoa_g(double ln_val,char * start)2919 char* lntoa_g(double ln_val, char* start) {
2920 // log(999999.49999999)
2921 if (ln_val < 13.81551005796414) {
2922 // log(9.9999949999999e-5)
2923 if (ln_val > -9.210340871976317) {
2924 // No exponential notation.
2925
2926 // log(0.99999949999999)
2927 if (ln_val > -5.000001349509205e-7) {
2928 // may as well fast-path x=1; since the most common use-case for this
2929 // function is p-value printing, x=1 should happen a lot more than x>1.
2930 // log(1.0000050000001)
2931 if (ln_val < 4.999987599993995e-6) {
2932 *start++ = '1';
2933 return start;
2934 }
2935 return dtoa_so6(exp(ln_val), start);
2936 }
2937 double dxx = exp(ln_val);
2938 // 6 sig fig decimal, no less than ~0.0001
2939 start = memcpya_k(start, "0.", 2);
2940 if (dxx < 9.9999949999999e-3) {
2941 dxx *= 100;
2942 start = memcpya_k(start, "00", 2);
2943 }
2944 if (dxx < 9.9999949999999e-2) {
2945 dxx *= 10;
2946 *start++ = '0';
2947 }
2948 return uitoa_trunc6(BankerRoundD(dxx * 1000000, kBankerRound8), start);
2949 }
2950 // if exponent is in danger of overflowing int32, just print '0'
2951 if (ln_val < 0x7ffffffb * (-kLn10)) {
2952 *start++ = '0';
2953 return start;
2954 }
2955 } else {
2956 // if exponent is in danger of overflowing int32, just print 'inf'
2957 if (ln_val > 0x7ffffffb * kLn10) {
2958 return strcpya_k(start, "inf");
2959 }
2960 }
2961 int32_t xp10 = S_CAST(int32_t, (5.000001349509205e-7 + ln_val) * kRecipLn10);
2962 double mantissa = exp(xp10 * (-kLn10) + ln_val);
2963 // mantissa will usually be in [.9999995, 9.999995], but |ln_val| can be
2964 // larger than 2^32, and floating point errors in either direction are
2965 // definitely possible (<20 bits of precision).
2966 if (mantissa < 0.99999949999999) {
2967 mantissa *= 10;
2968 xp10 -= 1;
2969 } else if (mantissa > 9.9999949999999) {
2970 mantissa *= 0.1;
2971 xp10 += 1;
2972 }
2973 uint32_t quotient;
2974 uint32_t remainder;
2975 BankerRoundD5(mantissa, kBankerRound8, "ient, &remainder);
2976 start = qrtoa_1p5(quotient, remainder, start);
2977 if (xp10 < 0) {
2978 start = memcpya_k(start, "e-", 2);
2979 if (xp10 > -10) {
2980 *start++ = '0';
2981 }
2982 return u32toa(-xp10, start);
2983 }
2984 start = memcpya_k(start, "e+", 2);
2985 if (xp10 < 10) {
2986 *start++ = '0';
2987 }
2988 return u32toa(xp10, start);
2989 }
2990
2991 // Previously had specialized float-printing functions, but upon reflection it
2992 // makes sense to just promote to double like printf does.
2993
2994
ScanForDuplicateIds(const char * sorted_ids,uintptr_t id_ct,uintptr_t max_id_blen)2995 CXXCONST_CP ScanForDuplicateIds(const char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_blen) {
2996 --id_ct;
2997 for (uintptr_t id_idx = 0; id_idx != id_ct; ++id_idx) {
2998 if (strequal_overread(&(sorted_ids[id_idx * max_id_blen]), &(sorted_ids[(id_idx + 1) * max_id_blen]))) {
2999 return S_CAST(CXXCONST_CP, &(sorted_ids[id_idx * max_id_blen]));
3000 }
3001 }
3002 return nullptr;
3003 }
3004
CollapseDuplicateIds(uintptr_t id_ct,uintptr_t max_id_blen,char * sorted_ids,uint32_t * id_starts)3005 uint32_t CollapseDuplicateIds(uintptr_t id_ct, uintptr_t max_id_blen, char* sorted_ids, uint32_t* id_starts) {
3006 // Collapses array of sorted IDs to remove duplicates, and writes
3007 // pre-collapse positions to id_starts (so e.g. duplication count of any
3008 // sample ID can be determined via subtraction) if it isn't nullptr.
3009 // Returns id_ct of collapsed array.
3010 if (!id_ct) {
3011 return 0;
3012 }
3013 uintptr_t read_idx = 1;
3014 uintptr_t write_idx;
3015 if (id_starts) {
3016 id_starts[0] = 0;
3017 for (; read_idx != id_ct; ++read_idx) {
3018 if (strequal_overread(&(sorted_ids[(read_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
3019 break;
3020 }
3021 id_starts[read_idx] = read_idx;
3022 }
3023 write_idx = read_idx;
3024 while (++read_idx < id_ct) {
3025 // this loop can probably be improved with string-length tracking...
3026 if (!strequal_overread(&(sorted_ids[(write_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
3027 strcpy(&(sorted_ids[write_idx * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]));
3028 id_starts[write_idx++] = read_idx;
3029 }
3030 }
3031 } else {
3032 for (; read_idx != id_ct; ++read_idx) {
3033 if (strequal_overread(&(sorted_ids[(read_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
3034 break;
3035 }
3036 }
3037 write_idx = read_idx;
3038 while (++read_idx < id_ct) {
3039 if (!strequal_overread(&(sorted_ids[(write_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
3040 strcpy(&(sorted_ids[write_idx * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]));
3041 ++write_idx;
3042 }
3043 }
3044 }
3045 return write_idx;
3046 }
3047
3048
bsearch_str(const char * idbuf,const char * sorted_strbox,uintptr_t cur_id_slen,uintptr_t max_id_blen,uintptr_t end_idx)3049 int32_t bsearch_str(const char* idbuf, const char* sorted_strbox, uintptr_t cur_id_slen, uintptr_t max_id_blen, uintptr_t end_idx) {
3050 // does not assume null-terminated idbuf, or nonempty array.
3051 if (cur_id_slen >= max_id_blen) {
3052 return -1;
3053 }
3054 uintptr_t start_idx = 0;
3055 while (start_idx < end_idx) {
3056 const uintptr_t mid_idx = (start_idx + end_idx) / 2;
3057 const int32_t ii = Memcmp(idbuf, &(sorted_strbox[mid_idx * max_id_blen]), cur_id_slen);
3058 if (ii > 0) {
3059 start_idx = mid_idx + 1;
3060 } else if ((ii < 0) || sorted_strbox[mid_idx * max_id_blen + cur_id_slen]) {
3061 end_idx = mid_idx;
3062 } else {
3063 return S_CAST(uint32_t, mid_idx);
3064 }
3065 }
3066 return -1;
3067 }
3068
bsearch_str_natural(const char * idbuf,const char * sorted_strbox,uintptr_t max_id_blen,uintptr_t end_idx)3069 int32_t bsearch_str_natural(const char* idbuf, const char* sorted_strbox, uintptr_t max_id_blen, uintptr_t end_idx) {
3070 // unlike bsearch_str(), caller is responsible for slen >= max_id_blen check
3071 // if appropriate here
3072 uintptr_t start_idx = 0;
3073 while (start_idx < end_idx) {
3074 const uintptr_t mid_idx = (start_idx + end_idx) / 2;
3075 const int32_t ii = strcmp_natural(idbuf, &(sorted_strbox[mid_idx * max_id_blen]));
3076 if (ii > 0) {
3077 start_idx = mid_idx + 1;
3078 } else if (ii < 0) {
3079 end_idx = mid_idx;
3080 } else {
3081 return S_CAST(uint32_t, mid_idx);
3082 }
3083 }
3084 return -1;
3085 }
3086
bsearch_str_lb(const char * idbuf,const char * sorted_strbox,uintptr_t cur_id_slen,uintptr_t max_id_blen,uintptr_t end_idx)3087 uintptr_t bsearch_str_lb(const char* idbuf, const char* sorted_strbox, uintptr_t cur_id_slen, uintptr_t max_id_blen, uintptr_t end_idx) {
3088 // returns number of elements in sorted_strbox[] less than idbuf.
3089 if (cur_id_slen > max_id_blen) {
3090 cur_id_slen = max_id_blen;
3091 }
3092 uintptr_t start_idx = 0;
3093 while (start_idx < end_idx) {
3094 const uintptr_t mid_idx = (start_idx + end_idx) / 2;
3095 if (Memcmp(idbuf, &(sorted_strbox[mid_idx * max_id_blen]), cur_id_slen) > 0) {
3096 start_idx = mid_idx + 1;
3097 } else {
3098 end_idx = mid_idx;
3099 }
3100 }
3101 return start_idx;
3102 }
3103
ExpsearchStrLb(const char * idbuf,const char * sorted_strbox,uintptr_t cur_id_slen,uintptr_t max_id_blen,uintptr_t end_idx,uintptr_t cur_idx)3104 uintptr_t ExpsearchStrLb(const char* idbuf, const char* sorted_strbox, uintptr_t cur_id_slen, uintptr_t max_id_blen, uintptr_t end_idx, uintptr_t cur_idx) {
3105 uintptr_t next_incr = 1;
3106 uintptr_t start_idx = cur_idx;
3107 while (cur_idx < end_idx) {
3108 if (Memcmp(idbuf, &(sorted_strbox[cur_idx * max_id_blen]), cur_id_slen) <= 0) {
3109 end_idx = cur_idx;
3110 break;
3111 }
3112 start_idx = cur_idx + 1;
3113 cur_idx += next_incr;
3114 next_incr *= 2;
3115 }
3116 while (start_idx < end_idx) {
3117 const uintptr_t mid_idx = (start_idx + end_idx) / 2;
3118 if (Memcmp(idbuf, &(sorted_strbox[mid_idx * max_id_blen]), cur_id_slen) > 0) {
3119 start_idx = mid_idx + 1;
3120 } else {
3121 end_idx = mid_idx;
3122 }
3123 }
3124 return start_idx;
3125 }
3126
ExpsearchNsortStrLb(const char * idbuf,const char * nsorted_strbox,uintptr_t max_id_blen,uintptr_t end_idx,uintptr_t cur_idx)3127 uintptr_t ExpsearchNsortStrLb(const char* idbuf, const char* nsorted_strbox, uintptr_t max_id_blen, uintptr_t end_idx, uintptr_t cur_idx) {
3128 uintptr_t next_incr = 1;
3129 uintptr_t start_idx = cur_idx;
3130 while (cur_idx < end_idx) {
3131 if (strcmp_natural(idbuf, &(nsorted_strbox[cur_idx * max_id_blen])) <= 0) {
3132 end_idx = cur_idx;
3133 break;
3134 }
3135 start_idx = cur_idx + 1;
3136 cur_idx += next_incr;
3137 next_incr *= 2;
3138 }
3139 while (start_idx < end_idx) {
3140 const uintptr_t mid_idx = (start_idx + end_idx) / 2;
3141 if (strcmp_natural(idbuf, &(nsorted_strbox[mid_idx * max_id_blen])) > 0) {
3142 start_idx = mid_idx + 1;
3143 } else {
3144 end_idx = mid_idx;
3145 }
3146 }
3147 return start_idx;
3148 }
3149
IsInfStr(const char * ss,uint32_t slen,uint32_t * is_neg_ptr)3150 uint32_t IsInfStr(const char* ss, uint32_t slen, uint32_t* is_neg_ptr) {
3151 const char first_char = *ss;
3152 if (first_char == '-') {
3153 *is_neg_ptr = 1;
3154 ++ss;
3155 --slen;
3156 } else if (first_char == '+') {
3157 ++ss;
3158 --slen;
3159 }
3160 if (slen == 3) {
3161 uint32_t four_chars;
3162 memcpy(&four_chars, ss, 4); // OVERREAD
3163 // assumes little-endian
3164 return ((four_chars & 0xdfdfdf) == 0x464e49);
3165 }
3166 if (slen != 8) {
3167 return 0;
3168 }
3169 uint64_t eight_chars;
3170 memcpy(&eight_chars, ss, 8);
3171 return ((eight_chars & 0xdfdfdfdfdfdfdfdfLLU) == 0x5954494e49464e49LLU);
3172 }
3173
3174
3175 #ifdef __LP64__
Memrchr(const char * str_start,char needle,uintptr_t slen)3176 CXXCONST_CP Memrchr(const char* str_start, char needle, uintptr_t slen) {
3177 const VecI8 vvec_all_needle = veci8_set1(needle);
3178 const uintptr_t str_end_addr = R_CAST(uintptr_t, str_start) + slen;
3179 const uint32_t trailing_byte_ct = str_end_addr % kBytesPerVec;
3180 const VecI8* str_rev_viter = R_CAST(const VecI8*, RoundDownPow2(str_end_addr, kBytesPerVec));
3181 if (trailing_byte_ct) {
3182 // This is a GNU vector extension parallel-equality check, which gets
3183 // translated to e.g. _mm256_cmpeq_epi8().
3184 // As long as we're performing aligned reads, it's safe to read bytes
3185 // beyond str_end as long as they're in the same vector; we only risk
3186 // violating process read permissions if we cross a page boundary.
3187 // (For this reason, I don't bother with AVX unaligned reads.)
3188 const VecI8 match_vvec = (*str_rev_viter == vvec_all_needle);
3189 uint32_t matching_bytes = veci8_movemask(match_vvec);
3190 matching_bytes &= (1U << (trailing_byte_ct % kBytesPerVec)) - 1;
3191 if (str_start > R_CAST(const char*, str_rev_viter)) {
3192 const uint32_t leading_byte_ct = R_CAST(uintptr_t, str_start) % kBytesPerVec;
3193 matching_bytes &= -(1U << leading_byte_ct);
3194 // Special-case this, since main_loop_iter_ct below would underflow.
3195 if (!matching_bytes) {
3196 return nullptr;
3197 }
3198 }
3199 if (matching_bytes) {
3200 const uint32_t byte_offset_in_vec = bsru32(matching_bytes);
3201 return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3202 }
3203 }
3204 const uintptr_t main_loop_iter_ct = (R_CAST(uintptr_t, str_rev_viter) - R_CAST(uintptr_t, str_start)) / (2 * kBytesPerVec);
3205 for (uintptr_t ulii = 0; ulii != main_loop_iter_ct; ++ulii) {
3206 // For long lines, looping over two vectors at a time is most efficient on
3207 // my Mac (also tried 1 and 4).
3208 --str_rev_viter;
3209 const VecI8 match_vvec1 = (*str_rev_viter == vvec_all_needle);
3210 --str_rev_viter;
3211 const VecI8 match_vvec0 = (*str_rev_viter == vvec_all_needle);
3212 const uint32_t matching_bytes = veci8_movemask(match_vvec1 | match_vvec0);
3213 if (matching_bytes) {
3214 const uint32_t matching_bytes1 = veci8_movemask(match_vvec1);
3215 if (matching_bytes1) {
3216 const uint32_t byte_offset_in_vec = bsru32(matching_bytes1);
3217 return &(R_CAST(CXXCONST_CP, &(str_rev_viter[1]))[byte_offset_in_vec]);
3218 }
3219 const uint32_t byte_offset_in_vec = bsru32(matching_bytes);
3220 return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3221 }
3222 }
3223 while (1) {
3224 uintptr_t remaining_byte_ct_underflow = R_CAST(uintptr_t, str_rev_viter) - R_CAST(uintptr_t, str_start);
3225 if (S_CAST(intptr_t, remaining_byte_ct_underflow) <= 0) {
3226 return nullptr;
3227 }
3228 --str_rev_viter;
3229 const VecI8 match_vvec = (*str_rev_viter == vvec_all_needle);
3230 const uint32_t matching_bytes = veci8_movemask(match_vvec);
3231 if (matching_bytes) {
3232 const uint32_t byte_offset_in_vec = bsru32(matching_bytes);
3233 if (byte_offset_in_vec + remaining_byte_ct_underflow < kBytesPerVec) {
3234 return nullptr;
3235 }
3236 return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3237 }
3238 }
3239 }
3240
LastSpaceOrEoln(const char * str_start,uintptr_t slen)3241 CXXCONST_CP LastSpaceOrEoln(const char* str_start, uintptr_t slen) {
3242 // See TokenLexK0().
3243 const VecUc vvec_all95 = vecuc_set1(95);
3244 const uintptr_t str_end_addr = R_CAST(uintptr_t, str_start) + slen;
3245 const uint32_t trailing_byte_ct = str_end_addr % kBytesPerVec;
3246 const VecUc* str_rev_viter = R_CAST(const VecUc*, RoundDownPow2(str_end_addr, kBytesPerVec));
3247 if (trailing_byte_ct) {
3248 // As long as we're performing aligned reads, it's safe to read bytes
3249 // beyond str_end as long as they're in the same vector; we only risk
3250 // violating process read permissions if we cross a page boundary.
3251 const VecUc postspace_vvec = vecuc_adds(*str_rev_viter, vvec_all95);
3252 uint32_t nontoken_bytes = S_CAST(Vec8thUint, ~vecuc_movemask(postspace_vvec));
3253 nontoken_bytes &= (1U << (trailing_byte_ct % kBytesPerVec)) - 1;
3254 if (str_start > R_CAST(const char*, str_rev_viter)) {
3255 const uint32_t leading_byte_ct = R_CAST(uintptr_t, str_start) % kBytesPerVec;
3256 nontoken_bytes &= -(1U << leading_byte_ct);
3257 // Special-case this, since main_loop_iter_ct below would underflow.
3258 if (!nontoken_bytes) {
3259 return nullptr;
3260 }
3261 }
3262 if (nontoken_bytes) {
3263 const uint32_t byte_offset_in_vec = bsru32(nontoken_bytes);
3264 return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3265 }
3266 }
3267 const uintptr_t main_loop_iter_ct = (R_CAST(uintptr_t, str_rev_viter) - R_CAST(uintptr_t, str_start)) / (2 * kBytesPerVec);
3268 for (uintptr_t ulii = 0; ulii != main_loop_iter_ct; ++ulii) {
3269 --str_rev_viter;
3270 const VecUc postspace_vvec1 = vecuc_adds(*str_rev_viter, vvec_all95);
3271 --str_rev_viter;
3272 const VecUc postspace_vvec0 = vecuc_adds(*str_rev_viter, vvec_all95);
3273 const uint32_t nontoken_bytes = S_CAST(Vec8thUint, ~vecuc_movemask(postspace_vvec1 & postspace_vvec0));
3274 if (nontoken_bytes) {
3275 const uint32_t nontoken_bytes1 = S_CAST(Vec8thUint, ~vecuc_movemask(postspace_vvec1));
3276 if (nontoken_bytes1) {
3277 const uint32_t byte_offset_in_vec = bsru32(nontoken_bytes1);
3278 return &(R_CAST(CXXCONST_CP, &(str_rev_viter[1]))[byte_offset_in_vec]);
3279 }
3280 const uint32_t byte_offset_in_vec = bsru32(nontoken_bytes);
3281 return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3282 }
3283 }
3284 while (1) {
3285 uintptr_t remaining_byte_ct_underflow = R_CAST(uintptr_t, str_rev_viter) - R_CAST(uintptr_t, str_start);
3286 if (S_CAST(intptr_t, remaining_byte_ct_underflow) <= 0) {
3287 return nullptr;
3288 }
3289 --str_rev_viter;
3290 const VecUc postspace_vvec = vecuc_adds(*str_rev_viter, vvec_all95);
3291 const uint32_t nontoken_bytes = S_CAST(Vec8thUint, ~vecuc_movemask(postspace_vvec));
3292 if (nontoken_bytes) {
3293 const uint32_t byte_offset_in_vec = bsru32(nontoken_bytes);
3294 if (byte_offset_in_vec + remaining_byte_ct_underflow < kBytesPerVec) {
3295 return nullptr;
3296 }
3297 return &(R_CAST(CXXCONST_CP, str_rev_viter)[byte_offset_in_vec]);
3298 }
3299 }
3300 }
3301 #endif // __LP64__
3302
3303 /*
3304 void ReplaceAllInstances(char old_char, char new_char, uint32_t slen, char* dst) {
3305 // probable todo: add SIMD version
3306 while (1) {
3307 char* strchr_result = memchr(dst, old_char, slen);
3308 if (!strchr_result) {
3309 return;
3310 }
3311 *strchr_result++ = new_char;
3312 slen -= strchr_result - dst;
3313 dst = strchr_result;
3314 }
3315 }
3316 */
3317
TabsToSpaces(char * ss_iter)3318 void TabsToSpaces(char* ss_iter) {
3319 while (1) {
3320 ss_iter = strchr(ss_iter, '\t');
3321 if (!ss_iter) {
3322 return;
3323 }
3324 *ss_iter++ = ' ';
3325 }
3326 }
3327
ReplaceCharAdvChecked(char old_char,char new_char,char ** str_ptr)3328 BoolErr ReplaceCharAdvChecked(char old_char, char new_char, char** str_ptr) {
3329 char* token_end;
3330 for (char* str_iter = *str_ptr; ; str_iter = &(token_end[1])) {
3331 token_end = strchrnul2(str_iter, old_char, new_char);
3332 if ((*token_end) != old_char) {
3333 if (likely(!(*token_end))) {
3334 *str_ptr = token_end;
3335 return 0;
3336 }
3337 return 1;
3338 }
3339 *token_end = new_char;
3340 }
3341 }
3342
3343 #ifdef __cplusplus
3344 } // namespace plink2
3345 #endif
3346