1 // This library is part of PLINK 2, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This library is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU Lesser General Public License as published by the
6 // Free Software Foundation; either version 3 of the License, or (at your
7 // option) any later version.
8 //
9 // This library is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
12 // for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with this library. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "plink2_base.h"
19
20 #ifdef __cplusplus
21 namespace plink2 {
22 #endif
23
// Byte count of the most recent failed allocation request; set by
// pgl_malloc() on failure (useful for out-of-memory error messages), never
// cleared in this file.
uintptr_t g_failed_alloc_attempt_size = 0;
25
#if (__GNUC__ <= 4) && (__GNUC_MINOR__ < 7) && !defined(__APPLE__)
// Out-of-line fallback for old gcc (<= 4.6); newer toolchains presumably get
// an inline definition from plink2_base.h instead.
//
// malloc() wrapper: stores the allocation through pp and returns 0 on
// success; on failure, records the requested size in
// g_failed_alloc_attempt_size and returns 1.
BoolErr pgl_malloc(uintptr_t size, void* pp) {
  unsigned char* alloc_result = S_CAST(unsigned char*, malloc(size));
  *S_CAST(unsigned char**, pp) = alloc_result;
  if (likely(alloc_result)) {
    return 0;
  }
  g_failed_alloc_attempt_size = size;
  return 1;
}
#endif
36
fwrite_checked(const void * buf,uintptr_t len,FILE * outfile)37 BoolErr fwrite_checked(const void* buf, uintptr_t len, FILE* outfile) {
38 while (len > kMaxBytesPerIO) {
39 // OS X fwrite() doesn't support 2GiB+ writes
40 // typical disk block size is 4kb, so 0x7ffff000 is the largest sensible
41 // write size
42 // bugfix (9 Mar 2018): forgot a 'not' here...
43 if (unlikely(!fwrite_unlocked(buf, kMaxBytesPerIO, 1, outfile))) {
44 return 1;
45 }
46 buf = &(S_CAST(const unsigned char*, buf)[kMaxBytesPerIO]);
47 len -= kMaxBytesPerIO;
48 }
49 uintptr_t written_byte_ct = fwrite_unlocked(buf, 1, len, outfile);
50 // must do the right thing when len == 0
51 return (written_byte_ct != len);
52 }
53
54 /*
55 IntErr fread_checked2(void* buf, uintptr_t len, FILE* infile, uintptr_t* bytes_read_ptr) {
56 uintptr_t bytes_read = 0;
57 while (len > kMaxBytesPerIO) {
58 const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, kMaxBytesPerIO, infile);
59 bytes_read += cur_bytes_read;
60 if (cur_bytes_read != kMaxBytesPerIO) {
61 *bytes_read_ptr = bytes_read;
62 return ferror_unlocked(infile);
63 }
64 buf = &(((char*)buf)[kMaxBytesPerIO]);
65 len -= kMaxBytesPerIO;
66 }
67 bytes_read += fread_unlocked(buf, 1, len, infile);
68 *bytes_read_ptr = bytes_read;
69 // could skip ferror_unlocked call if bytes_read == original len
70 return ferror_unlocked(infile);
71 }
72 */
73
fread_checked(void * buf,uintptr_t len,FILE * infile)74 BoolErr fread_checked(void* buf, uintptr_t len, FILE* infile) {
75 while (len > kMaxBytesPerIO) {
76 const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, kMaxBytesPerIO, infile);
77 if (unlikely(cur_bytes_read != kMaxBytesPerIO)) {
78 return 1;
79 }
80 buf = &(S_CAST(unsigned char*, buf)[kMaxBytesPerIO]);
81 len -= kMaxBytesPerIO;
82 }
83 const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, len, infile);
84 return (cur_bytes_read != len);
85 }
86
87 #ifdef __LP64__
// Consumes any remaining decimal digits at str_iter, extending the partial
// value already stored in *valp.  Parsing stops at the first non-digit
// character.  Returns 1 on overflow past cap (leaving *valp unchanged),
// otherwise writes the final value to *valp and returns 0.
// The loop handles digits two at a time; 64-bit intermediates cannot
// overflow since val <= cap is re-established after every pair.
static inline BoolErr ScanUintCappedFinish(const char* str_iter, uint64_t cap, uint32_t* valp) {
  uint64_t val = *valp;
  while (1) {
    // a little bit of unrolling seems to help
    const uint64_t cur_digit = ctou64(*str_iter++) - 48;
    if (cur_digit >= 10) {
      break;
    }
    // val = val * 10 + cur_digit;
    const uint64_t cur_digit2 = ctou64(*str_iter++) - 48;
    if (cur_digit2 >= 10) {
      // Exactly one more digit: fold it in, then check the cap.
      val = val * 10 + cur_digit;
      if (unlikely(val > cap)) {
        return 1;
      }
      break;
    }
    // Two more digits: fold both in with a single cap check.
    val = val * 100 + cur_digit * 10 + cur_digit2;
    if (unlikely(val > cap)) {
      return 1;
    }
  }
  *valp = val;
  return 0;
}
113
// Reads a positive integer in [1, cap] from str_iter.  Assumes the first
// character is nonspace.  Returns 1 on parse failure or overflow (in which
// case *valp may hold a garbage partial value), 0 on success with the value
// in *valp.
BoolErr ScanPosintCapped(const char* str_iter, uint64_t cap, uint32_t* valp) {
  // '0' has ascii code 48
  assert(ctou32(str_iter[0]) > 32);
  *valp = ctou32(*str_iter++) - 48;
  if (*valp >= 10) {
    // permit leading '+' (ascii 43), but not '++' or '+-'
    // ('+' - '0' wraps to 0xfffffffbU in unsigned arithmetic.)
    // reasonable to use unlikely() here since these functions aren't used for
    // numeric vs. non-numeric classification anyway due to erroring out on
    // overflow
    if (unlikely(*valp != 0xfffffffbU)) {
      return 1;
    }
    *valp = ctou32(*str_iter++) - 48;
    if (unlikely(*valp >= 10)) {
      return 1;
    }
  }
  // Skip leading zeroes.  Note that this rejects plain "0"/"00"/...: a
  // nonzero digit must appear before the first non-digit character.
  while (!(*valp)) {
    *valp = ctou32(*str_iter++) - 48;
    if (unlikely((*valp) >= 10)) {
      return 1;
    }
  }
  return ScanUintCappedFinish(str_iter, cap, valp);
}
139
140 // Note that NumericRangeListToBitarr() can call this in an ignore-overflow
141 // mode. If similar logic ever goes into an inner loop, remove all unlikely()
142 // annotations in this function and its children.
ScanUintCapped(const char * str_iter,uint64_t cap,uint32_t * valp)143 BoolErr ScanUintCapped(const char* str_iter, uint64_t cap, uint32_t* valp) {
144 // Reads an integer in [0, cap]. Assumes first character is nonspace.
145 assert(ctou32(str_iter[0]) > 32);
146 uint32_t val = ctou32(*str_iter++) - 48;
147 if (val >= 10) {
148 if (val != 0xfffffffbU) {
149 // '-' has ascii code 45, so unsigned 45 - 48 = 0xfffffffdU
150 if (unlikely((val != 0xfffffffdU) || (*str_iter != '0'))) {
151 return 1;
152 }
153 // accept "-0", "-00", etc.
154 while (*(++str_iter) == '0');
155 *valp = 0;
156 return (ctou32(*str_iter) - 48) < 10;
157 }
158 // accept leading '+'
159 val = ctou32(*str_iter++) - 48;
160 if (unlikely(val >= 10)) {
161 return 1;
162 }
163 }
164 *valp = val;
165 return ScanUintCappedFinish(str_iter, cap, valp);
166 }
167
// Reads an integer in [-bound, bound].  Assumes first character is nonspace.
// Returns 1 on parse failure or overflow (leaving a garbage partial value in
// *valp), 0 on success.
// NOTE(review): the final sign flip looks like it requires bound <=
// INT32_MAX to avoid signed overflow -- confirm against callers.
BoolErr ScanIntAbsBounded(const char* str_iter, uint64_t bound, int32_t* valp) {
  assert(ctou32(str_iter[0]) > 32);
  // *valp is used as scratch for the first-digit value here; the ctou32()
  // in the comparison below reinterprets it as unsigned.
  *valp = ctou32(*str_iter++) - 48;
  int32_t sign = 1;
  if (ctou32(*valp) >= 10) {
    // '-' (ascii 45) - 48 == -3; '+' (ascii 43) - 48 == -5.
    if (*valp == -3) {
      sign = -1;
    } else if (unlikely(*valp != -5)) {
      return 1;
    }
    *valp = ctou32(*str_iter++) - 48;
    if (unlikely(*valp >= 10)) {
      return 1;
    }
  }
  // Accumulate the magnitude, treating *valp as unsigned storage.
  if (unlikely(ScanUintCappedFinish(str_iter, bound, R_CAST(uint32_t*, valp)))) {
    return 1;
  }
  *valp *= sign;
  return 0;
}
190 #else // not __LP64__
ScanPosintCapped32(const char * str_iter,uint32_t cap_div_10,uint32_t cap_mod_10,uint32_t * valp)191 BoolErr ScanPosintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
192 // '0' has ascii code 48
193 assert(ctou32(str_iter[0]) > 32);
194 uint32_t val = ctou32(*str_iter++) - 48;
195 if (val >= 10) {
196 if (unlikely(val != 0xfffffffbU)) {
197 return 1;
198 }
199 val = ctou32(*str_iter++) - 48;
200 if (unlikely(val >= 10)) {
201 return 1;
202 }
203 }
204 while (!val) {
205 val = ctou32(*str_iter++) - 48;
206 if (unlikely(val >= 10)) {
207 return 1;
208 }
209 }
210 for (; ; ++str_iter) {
211 const uint32_t cur_digit = ctou32(*str_iter) - 48;
212 if (cur_digit >= 10) {
213 *valp = val;
214 return 0;
215 }
216 // avoid integer overflow in middle of computation
217 if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
218 return 1;
219 }
220 val = val * 10 + cur_digit;
221 }
222 }
223
ScanUintCapped32(const char * str_iter,uint32_t cap_div_10,uint32_t cap_mod_10,uint32_t * valp)224 BoolErr ScanUintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
225 // Reads an integer in [0, cap]. Assumes first character is nonspace.
226 assert(ctou32(str_iter[0]) > 32);
227 uint32_t val = ctou32(*str_iter++) - 48;
228 if (val >= 10) {
229 if (val != 0xfffffffbU) {
230 if (unlikely((val != 0xfffffffdU) || (*str_iter != '0'))) {
231 return 1;
232 }
233 while (*(++str_iter) == '0');
234 *valp = 0;
235 return (ctou32(*str_iter) - 48) < 10;
236 }
237 val = ctou32(*str_iter++) - 48;
238 if (unlikely(val >= 10)) {
239 return 1;
240 }
241 }
242 for (; ; ++str_iter) {
243 const uint32_t cur_digit = ctou32(*str_iter) - 48;
244 if (cur_digit >= 10) {
245 *valp = val;
246 return 0;
247 }
248 if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
249 return 1;
250 }
251 val = val * 10 + cur_digit;
252 }
253 }
254
ScanIntAbsBounded32(const char * str_iter,uint32_t bound_div_10,uint32_t bound_mod_10,int32_t * valp)255 BoolErr ScanIntAbsBounded32(const char* str_iter, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp) {
256 // Reads an integer in [-bound, bound]. Assumes first character is nonspace.
257 assert(ctou32(str_iter[0]) > 32);
258 uint32_t val = ctou32(*str_iter++) - 48;
259 int32_t sign = 1;
260 if (val >= 10) {
261 if (val == 0xfffffffdU) {
262 sign = -1;
263 } else if (unlikely(val != 0xfffffffbU)) {
264 return 1;
265 }
266 val = ctou32(*str_iter++) - 48;
267 if (unlikely(val >= 10)) {
268 return 1;
269 }
270 }
271 for (; ; ++str_iter) {
272 const uint32_t cur_digit = ctou32(*str_iter) - 48;
273 if (cur_digit >= 10) {
274 *valp = sign * S_CAST(int32_t, val);
275 return 0;
276 }
277 if (unlikely((val >= bound_div_10) && ((val > bound_div_10) || (cur_digit > bound_mod_10)))) {
278 return 1;
279 }
280 val = val * 10 + cur_digit;
281 }
282 }
283 #endif
284
aligned_malloc(uintptr_t size,uintptr_t alignment,void * aligned_pp)285 BoolErr aligned_malloc(uintptr_t size, uintptr_t alignment, void* aligned_pp) {
286 // Assumes malloc returns word-aligned addresses.
287 assert(alignment);
288 assert(!(alignment % kBytesPerWord));
289 uintptr_t malloc_addr;
290 if (unlikely(pgl_malloc(size + alignment, &malloc_addr))) {
291 return 1;
292 }
293 assert(!(malloc_addr % kBytesPerWord));
294 uintptr_t** casted_aligned_pp = S_CAST(uintptr_t**, aligned_pp);
295 *casted_aligned_pp = R_CAST(uintptr_t*, RoundDownPow2(malloc_addr + alignment, alignment));
296 (*casted_aligned_pp)[-1] = malloc_addr;
297 return 0;
298 }
299
300 #ifdef __LP64__
// Returns 1 iff the byte_ct bytes at m1 and m2 are identical, 0 otherwise.
// Hand-optimized for the short comparisons common in this codebase; relies
// on unaligned 2/4/8-byte and vector loads, which the supported 64-bit
// targets permit.  The small-size paths use overlapping windows (e.g. first
// and last 4 bytes) instead of a scalar tail loop.
int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct) {
  const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
  const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
  if (byte_ct < 16 + (kBytesPerVec / 2)) {
    if (byte_ct < kBytesPerWord) {
      if (byte_ct < 4) {
        if (byte_ct < 2) {
          // Zero-length ranges compare equal.
          return (!byte_ct) || (m1_uc[0] == m2_uc[0]);
        }
        if ((*S_CAST(const uint16_t*, m1)) != (*S_CAST(const uint16_t*, m2))) {
          return 0;
        }
        if ((byte_ct == 3) && (m1_uc[2] != m2_uc[2])) {
          return 0;
        }
        return 1;
      }
      // 4..7 bytes: compare the first and last 4-byte windows; they overlap
      // when byte_ct < 8, covering every byte in between.
      if ((*R_CAST(const uint32_t*, m1_uc)) != (*R_CAST(const uint32_t*, m2_uc))) {
        return 0;
      }
      if (byte_ct > 4) {
        const uintptr_t final_offset = byte_ct - 4;
        if ((*R_CAST(const uint32_t*, &(m1_uc[final_offset]))) != (*R_CAST(const uint32_t*, &(m2_uc[final_offset])))) {
          return 0;
        }
      }
      return 1;
    }
    // Word-at-a-time path: leading full words, then one possibly-overlapping
    // trailing word.
    const uintptr_t* m1_alias = R_CAST(const uintptr_t*, m1_uc);
    const uintptr_t* m2_alias = R_CAST(const uintptr_t*, m2_uc);
    if (m1_alias[0] != m2_alias[0]) {
      return 0;
    }
    if (byte_ct >= 16) {
      if (m1_alias[1] != m2_alias[1]) {
        return 0;
      }
# ifdef USE_AVX2
      // kBytesPerVec == 32 here, so byte_ct can reach 31 and a third full
      // word may be needed before the trailing-word check.
      if (byte_ct >= 24) {
        if (m1_alias[2] != m2_alias[2]) {
          return 0;
        }
      }
# endif
    }
    if (byte_ct % kBytesPerWord) {
      const uintptr_t final_offset = byte_ct - kBytesPerWord;
      if ((*R_CAST(const uintptr_t*, &(m1_uc[final_offset]))) != (*R_CAST(const uintptr_t*, &(m2_uc[final_offset])))) {
        return 0;
      }
    }
    return 1;
  }
  // Don't use VecW since _mm_cmpeq_epi64() not defined until SSE4.1.
  const VecUc* m1_alias = S_CAST(const VecUc*, m1);
  const VecUc* m2_alias = S_CAST(const VecUc*, m2);
  const uintptr_t vec_ct = byte_ct / kBytesPerVec;
  for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
    // tried unrolling this, doesn't make a difference
    const VecUc v1 = vecuc_loadu(&(m1_alias[vidx]));
    const VecUc v2 = vecuc_loadu(&(m2_alias[vidx]));
    // movemask yields one bit per byte; all-ones means every byte matched.
    if (vecuc_movemask(v1 == v2) != kVec8thUintMax) {
      return 0;
    }
  }
  if (byte_ct % kBytesPerVec) {
    // put this last instead of first, for better behavior when inputs are
    // aligned
    const uintptr_t final_offset = byte_ct - kBytesPerVec;
    const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset]));
    const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset]));
    if (vecuc_movemask(v1 == v2) != kVec8thUintMax) {
      return 0;
    }
  }
  return 1;
}
378
379 // clang/gcc memcmp is not that well-optimized for the short strings we usually
380 // compare.
// memcmp() replacement returning exactly -1/0/1 (lexicographic byte order).
// Exists because clang/gcc memcmp is not that well-optimized for the short
// strings we usually compare.  Like memequal(), uses unaligned loads with
// overlapping first/last windows; the __builtin_bswap calls convert
// little-endian memory order into an integer whose numeric comparison
// matches byte-wise lexicographic comparison (this path assumes a
// little-endian 64-bit target).
int32_t Memcmp(const void* m1, const void* m2, uintptr_t byte_ct) {
  const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
  const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
  // tried larger crossover threshold, doesn't help
  if (byte_ct < kBytesPerVec) {
    if (byte_ct < kBytesPerWord) {
      if (byte_ct < 4) {
        // 0..3 bytes: plain byte loop.
        for (uintptr_t pos = 0; pos != byte_ct; ++pos) {
          const unsigned char ucc1 = m1_uc[pos];
          const unsigned char ucc2 = m2_uc[pos];
          if (ucc1 != ucc2) {
            return (ucc1 < ucc2)? -1 : 1;
          }
        }
        return 0;
      }
      // 4..7 bytes: first and last 4-byte windows (overlapping when
      // byte_ct < 8).
      uint32_t m1_u32 = *S_CAST(const uint32_t*, m1);
      uint32_t m2_u32 = *S_CAST(const uint32_t*, m2);
      if (m1_u32 != m2_u32) {
        return (__builtin_bswap32(m1_u32) < __builtin_bswap32(m2_u32))? -1 : 1;
      }
      if (byte_ct > 4) {
        const uintptr_t final_offset = byte_ct - 4;
        m1_u32 = *R_CAST(const uint32_t*, &(m1_uc[final_offset]));
        m2_u32 = *R_CAST(const uint32_t*, &(m2_uc[final_offset]));
        if (m1_u32 != m2_u32) {
          return (__builtin_bswap32(m1_u32) < __builtin_bswap32(m2_u32))? -1 : 1;
        }
      }
      return 0;
    }
    // Word-at-a-time path: leading full words, then one possibly-overlapping
    // trailing word.
    const uintptr_t* m1_alias = R_CAST(const uintptr_t*, m1_uc);
    const uintptr_t* m2_alias = R_CAST(const uintptr_t*, m2_uc);
    uintptr_t m1_word = m1_alias[0];
    uintptr_t m2_word = m2_alias[0];
    if (m1_word != m2_word) {
      return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
    }
# ifdef USE_AVX2
    // kBytesPerVec == 32 here, so byte_ct can reach 31 and up to two more
    // full words may be needed before the trailing-word check.
    if (byte_ct >= 16) {
      m1_word = m1_alias[1];
      m2_word = m2_alias[1];
      if (m1_word != m2_word) {
        return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
      }
      if (byte_ct >= 24) {
        m1_word = m1_alias[2];
        m2_word = m2_alias[2];
        if (m1_word != m2_word) {
          return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
        }
      }
    }
# endif
    if (byte_ct % kBytesPerWord) {
      const uintptr_t final_offset = byte_ct - kBytesPerWord;
      m1_word = *R_CAST(const uintptr_t*, &(m1_uc[final_offset]));
      m2_word = *R_CAST(const uintptr_t*, &(m2_uc[final_offset]));
      if (m1_word != m2_word) {
        return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
      }
    }
    return 0;
  }
  const VecUc* m1_alias = S_CAST(const VecUc*, m1);
  const VecUc* m2_alias = S_CAST(const VecUc*, m2);
  const uintptr_t fullvec_ct = byte_ct / kBytesPerVec;
  // uh, clang/LLVM -O2 optimizes this better when comparison is != instead of
  // <? ugh, time to change all of the for loops...
  // (and yes, both -O3 configurations generate worse code here)
  // at least for loop is better than do-while loop even when 1 iteration is
  // guaranteed...
  for (uintptr_t vidx = 0; vidx != fullvec_ct; ++vidx) {
    const VecUc v1 = vecuc_loadu(&(m1_alias[vidx]));
    const VecUc v2 = vecuc_loadu(&(m2_alias[vidx]));
    // is this even worthwhile now in non-AVX2 case?
    const uint32_t movemask_result = vecuc_movemask(v1 == v2);
    if (movemask_result != kVec8thUintMax) {
      // First mismatching byte decides the ordering.
      const uintptr_t diff_pos = vidx * kBytesPerVec + ctzu32(~movemask_result);
      return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
    }
  }
  if (byte_ct % kBytesPerVec) {
    // Possibly-overlapping trailing vector; overlapped bytes were already
    // verified equal, so any mismatch found here is a genuine new one.
    const uintptr_t final_offset = byte_ct - kBytesPerVec;
    const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset]));
    const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset]));
    const uint32_t movemask_result = vecuc_movemask(v1 == v2);
    if (movemask_result != kVec8thUintMax) {
      const uintptr_t diff_pos = final_offset + ctzu32(~movemask_result);
      return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
    }
  }
  return 0;
}
475 #endif
476
477 #ifdef __cplusplus
478 } // namespace plink2
479 #endif
480