// This library is part of PLINK 2, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by the
// Free Software Foundation; either version 3 of the License, or (at your
// option) any later version.
//
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
// for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library.  If not, see <http://www.gnu.org/licenses/>.


#include "plink2_base.h"

#ifdef __cplusplus
namespace plink2 {
#endif

uintptr_t g_failed_alloc_attempt_size = 0;

#if (__GNUC__ <= 4) && (__GNUC_MINOR__ < 7) && !defined(__APPLE__)
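// Out-of-line pgl_malloc() definition for old (pre-4.7, non-Apple) gcc;
// newer compilers presumably get an inline definition from plink2_base.h.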
BoolErr pgl_malloc(uintptr_t size, void* pp) {
  *S_CAST(unsigned char**, pp) = S_CAST(unsigned char*, malloc(size));
  if (likely(*S_CAST(unsigned char**, pp))) {
    return 0;
  }
  g_failed_alloc_attempt_size = size;
  return 1;
}
#endif

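// Returns 0 on success, 1 on write failure.  Writes larger than
// kMaxBytesPerIO are split into chunks, and len == 0 is handled correctly.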
BoolErr fwrite_checked(const void* buf, uintptr_t len, FILE* outfile) {
  while (len > kMaxBytesPerIO) {
    // OS X fwrite() doesn't support 2GiB+ writes
    // typical disk block size is 4kb, so 0x7ffff000 is the largest sensible
    // write size
    // bugfix (9 Mar 2018): forgot a 'not' here...
    if (unlikely(!fwrite_unlocked(buf, kMaxBytesPerIO, 1, outfile))) {
      return 1;
    }
    buf = &(S_CAST(const unsigned char*, buf)[kMaxBytesPerIO]);
    len -= kMaxBytesPerIO;
  }
  uintptr_t written_byte_ct = fwrite_unlocked(buf, 1, len, outfile);
  // must do the right thing when len == 0
  return (written_byte_ct != len);
}

/*
IntErr fread_checked2(void* buf, uintptr_t len, FILE* infile, uintptr_t* bytes_read_ptr) {
  uintptr_t bytes_read = 0;
  while (len > kMaxBytesPerIO) {
    const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, kMaxBytesPerIO, infile);
    bytes_read += cur_bytes_read;
    if (cur_bytes_read != kMaxBytesPerIO) {
      *bytes_read_ptr = bytes_read;
      return ferror_unlocked(infile);
    }
    buf = &(((char*)buf)[kMaxBytesPerIO]);
    len -= kMaxBytesPerIO;
  }
  bytes_read += fread_unlocked(buf, 1, len, infile);
  *bytes_read_ptr = bytes_read;
  // could skip ferror_unlocked call if bytes_read == original len
  return ferror_unlocked(infile);
}
*/

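// Returns 0 iff exactly len bytes were read; 1 on a short read (I/O error or
// premature EOF).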
BoolErr fread_checked(void* buf, uintptr_t len, FILE* infile) {
  while (len > kMaxBytesPerIO) {
    const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, kMaxBytesPerIO, infile);
    if (unlikely(cur_bytes_read != kMaxBytesPerIO)) {
      return 1;
    }
    buf = &(S_CAST(unsigned char*, buf)[kMaxBytesPerIO]);
    len -= kMaxBytesPerIO;
  }
  const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, len, infile);
  return (cur_bytes_read != len);
}

#ifdef __LP64__
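// The integer scanners below rely on unsigned wraparound: ctou32/ctou64(c) - 48
// maps '0'..'9' to 0..9 and every other character (including '\0', '+', and
// '-') to a value >= 10, so a single comparison both classifies the character
// and terminates the parse.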
static inline BoolErr ScanUintCappedFinish(const char* str_iter, uint64_t cap, uint32_t* valp) {
  uint64_t val = *valp;
  while (1) {
    // a little bit of unrolling seems to help
    const uint64_t cur_digit = ctou64(*str_iter++) - 48;
    if (cur_digit >= 10) {
      break;
    }
    // val = val * 10 + cur_digit;
    const uint64_t cur_digit2 = ctou64(*str_iter++) - 48;
    if (cur_digit2 >= 10) {
      val = val * 10 + cur_digit;
      if (unlikely(val > cap)) {
        return 1;
      }
      break;
    }
    val = val * 100 + cur_digit * 10 + cur_digit2;
    if (unlikely(val > cap)) {
      return 1;
    }
  }
  *valp = val;
  return 0;
}

BoolErr ScanPosintCapped(const char* str_iter, uint64_t cap, uint32_t* valp) {
  // '0' has ascii code 48
  assert(ctou32(str_iter[0]) > 32);
  *valp = ctou32(*str_iter++) - 48;
  if (*valp >= 10) {
    // permit leading '+' (ascii 43), but not '++' or '+-'
    // reasonable to use unlikely() here since these functions aren't used for
    // numeric vs. non-numeric classification anyway due to erroring out on
    // overflow
    if (unlikely(*valp != 0xfffffffbU)) {
      return 1;
    }
    *valp = ctou32(*str_iter++) - 48;
    if (unlikely(*valp >= 10)) {
      return 1;
    }
  }
  while (!(*valp)) {
    *valp = ctou32(*str_iter++) - 48;
    if (unlikely((*valp) >= 10)) {
      return 1;
    }
  }
  return ScanUintCappedFinish(str_iter, cap, valp);
}

// Note that NumericRangeListToBitarr() can call this in an ignore-overflow
// mode.  If similar logic ever goes into an inner loop, remove all unlikely()
// annotations in this function and its children.
BoolErr ScanUintCapped(const char* str_iter, uint64_t cap, uint32_t* valp) {
  // Reads an integer in [0, cap].  Assumes first character is nonspace.
  assert(ctou32(str_iter[0]) > 32);
  uint32_t val = ctou32(*str_iter++) - 48;
  if (val >= 10) {
    if (val != 0xfffffffbU) {
      // '-' has ascii code 45, so unsigned 45 - 48 = 0xfffffffdU
      if (unlikely((val != 0xfffffffdU) || (*str_iter != '0'))) {
        return 1;
      }
      // accept "-0", "-00", etc.
      while (*(++str_iter) == '0');
      *valp = 0;
      return (ctou32(*str_iter) - 48) < 10;
    }
    // accept leading '+'
    val = ctou32(*str_iter++) - 48;
    if (unlikely(val >= 10)) {
      return 1;
    }
  }
  *valp = val;
  return ScanUintCappedFinish(str_iter, cap, valp);
}

BoolErr ScanIntAbsBounded(const char* str_iter, uint64_t bound, int32_t* valp) {
  // Reads an integer in [-bound, bound].  Assumes first character is nonspace.
  assert(ctou32(str_iter[0]) > 32);
  *valp = ctou32(*str_iter++) - 48;
  int32_t sign = 1;
  if (ctou32(*valp) >= 10) {
    if (*valp == -3) {
      sign = -1;
    } else if (unlikely(*valp != -5)) {
      return 1;
    }
    *valp = ctou32(*str_iter++) - 48;
    if (unlikely(*valp >= 10)) {
      return 1;
    }
  }
  if (unlikely(ScanUintCappedFinish(str_iter, bound, R_CAST(uint32_t*, valp)))) {
    return 1;
  }
  *valp *= sign;
  return 0;
}
#else  // not __LP64__
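// 32-bit builds take the cap as (cap / 10, cap % 10) so overflow can be
// detected before the multiply-add, without any 64-bit arithmetic.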
BoolErr ScanPosintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
  // '0' has ascii code 48
  assert(ctou32(str_iter[0]) > 32);
  uint32_t val = ctou32(*str_iter++) - 48;
  if (val >= 10) {
    if (unlikely(val != 0xfffffffbU)) {
      return 1;
    }
    val = ctou32(*str_iter++) - 48;
    if (unlikely(val >= 10)) {
      return 1;
    }
  }
  while (!val) {
    val = ctou32(*str_iter++) - 48;
    if (unlikely(val >= 10)) {
      return 1;
    }
  }
  for (; ; ++str_iter) {
    const uint32_t cur_digit = ctou32(*str_iter) - 48;
    if (cur_digit >= 10) {
      *valp = val;
      return 0;
    }
    // avoid integer overflow in middle of computation
    if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
      return 1;
    }
    val = val * 10 + cur_digit;
  }
}

BoolErr ScanUintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
  // Reads an integer in [0, cap].  Assumes first character is nonspace.
  assert(ctou32(str_iter[0]) > 32);
  uint32_t val = ctou32(*str_iter++) - 48;
  if (val >= 10) {
    if (val != 0xfffffffbU) {
      if (unlikely((val != 0xfffffffdU) || (*str_iter != '0'))) {
        return 1;
      }
      while (*(++str_iter) == '0');
      *valp = 0;
      return (ctou32(*str_iter) - 48) < 10;
    }
    val = ctou32(*str_iter++) - 48;
    if (unlikely(val >= 10)) {
      return 1;
    }
  }
  for (; ; ++str_iter) {
    const uint32_t cur_digit = ctou32(*str_iter) - 48;
    if (cur_digit >= 10) {
      *valp = val;
      return 0;
    }
    if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
      return 1;
    }
    val = val * 10 + cur_digit;
  }
}

BoolErr ScanIntAbsBounded32(const char* str_iter, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp) {
  // Reads an integer in [-bound, bound].  Assumes first character is nonspace.
  assert(ctou32(str_iter[0]) > 32);
  uint32_t val = ctou32(*str_iter++) - 48;
  int32_t sign = 1;
  if (val >= 10) {
    if (val == 0xfffffffdU) {
      sign = -1;
    } else if (unlikely(val != 0xfffffffbU)) {
      return 1;
    }
    val = ctou32(*str_iter++) - 48;
    if (unlikely(val >= 10)) {
      return 1;
    }
  }
  for (; ; ++str_iter) {
    const uint32_t cur_digit = ctou32(*str_iter) - 48;
    if (cur_digit >= 10) {
      *valp = sign * S_CAST(int32_t, val);
      return 0;
    }
    if (unlikely((val >= bound_div_10) && ((val > bound_div_10) || (cur_digit > bound_mod_10)))) {
      return 1;
    }
    val = val * 10 + cur_digit;
  }
}
#endif

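// Over-allocates by 'alignment' bytes so an aligned block can be carved out
// with at least one word of headroom; the original malloc() address is stashed
// in the word immediately preceding the returned pointer, for the matching
// free routine to recover.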
BoolErr aligned_malloc(uintptr_t size, uintptr_t alignment, void* aligned_pp) {
  // Assumes malloc returns word-aligned addresses.
  assert(alignment);
  assert(!(alignment % kBytesPerWord));
  uintptr_t malloc_addr;
  if (unlikely(pgl_malloc(size + alignment, &malloc_addr))) {
    return 1;
  }
  assert(!(malloc_addr % kBytesPerWord));
  uintptr_t** casted_aligned_pp = S_CAST(uintptr_t**, aligned_pp);
  *casted_aligned_pp = R_CAST(uintptr_t*, RoundDownPow2(malloc_addr + alignment, alignment));
  (*casted_aligned_pp)[-1] = malloc_addr;
  return 0;
}
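
// A minimal sketch of the corresponding deallocation, assuming the stashed
// address is simply passed to free() (the real helper is expected to live in
// plink2_base.h):
//   void aligned_free(void* aligned_ptr) {
//     free(R_CAST(void*, (S_CAST(uintptr_t*, aligned_ptr))[-1]));
//   }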

#ifdef __LP64__
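// memequal() checks short buffers with (possibly overlapping) scalar loads,
// and longer ones with unaligned vector loads plus movemask, so no
// byte-at-a-time tail loop is needed.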
int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct) {
  const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
  const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
  if (byte_ct < 16 + (kBytesPerVec / 2)) {
    if (byte_ct < kBytesPerWord) {
      if (byte_ct < 4) {
        if (byte_ct < 2) {
          return (!byte_ct) || (m1_uc[0] == m2_uc[0]);
        }
        if ((*S_CAST(const uint16_t*, m1)) != (*S_CAST(const uint16_t*, m2))) {
          return 0;
        }
        if ((byte_ct == 3) && (m1_uc[2] != m2_uc[2])) {
          return 0;
        }
        return 1;
      }
      if ((*R_CAST(const uint32_t*, m1_uc)) != (*R_CAST(const uint32_t*, m2_uc))) {
        return 0;
      }
      if (byte_ct > 4) {
        const uintptr_t final_offset = byte_ct - 4;
        if ((*R_CAST(const uint32_t*, &(m1_uc[final_offset]))) != (*R_CAST(const uint32_t*, &(m2_uc[final_offset])))) {
          return 0;
        }
      }
      return 1;
    }
    const uintptr_t* m1_alias = R_CAST(const uintptr_t*, m1_uc);
    const uintptr_t* m2_alias = R_CAST(const uintptr_t*, m2_uc);
    if (m1_alias[0] != m2_alias[0]) {
      return 0;
    }
    if (byte_ct >= 16) {
      if (m1_alias[1] != m2_alias[1]) {
        return 0;
      }
#  ifdef USE_AVX2
      if (byte_ct >= 24) {
        if (m1_alias[2] != m2_alias[2]) {
          return 0;
        }
      }
#  endif
    }
    if (byte_ct % kBytesPerWord) {
      const uintptr_t final_offset = byte_ct - kBytesPerWord;
      if ((*R_CAST(const uintptr_t*, &(m1_uc[final_offset]))) != (*R_CAST(const uintptr_t*, &(m2_uc[final_offset])))) {
        return 0;
      }
    }
    return 1;
  }
  // Don't use VecW since _mm_cmpeq_epi64() not defined until SSE4.1.
  const VecUc* m1_alias = S_CAST(const VecUc*, m1);
  const VecUc* m2_alias = S_CAST(const VecUc*, m2);
  const uintptr_t vec_ct = byte_ct / kBytesPerVec;
  for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
    // tried unrolling this, doesn't make a difference
    const VecUc v1 = vecuc_loadu(&(m1_alias[vidx]));
    const VecUc v2 = vecuc_loadu(&(m2_alias[vidx]));
    if (vecuc_movemask(v1 == v2) != kVec8thUintMax) {
      return 0;
    }
  }
  if (byte_ct % kBytesPerVec) {
    // put this last instead of first, for better behavior when inputs are
    // aligned
    const uintptr_t final_offset = byte_ct - kBytesPerVec;
    const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset]));
    const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset]));
    if (vecuc_movemask(v1 == v2) != kVec8thUintMax) {
      return 0;
    }
  }
  return 1;
}

// clang/gcc memcmp is not that well-optimized for the short strings we usually
// compare.
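// Returns -1/0/1 rather than an arbitrary-magnitude difference.  Bytes are
// compared as unsigned; __builtin_bswap{32,64}() reorders each loaded word so
// that, on the little-endian hosts this code targets, the first differing byte
// determines the result.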
int32_t Memcmp(const void* m1, const void* m2, uintptr_t byte_ct) {
  const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
  const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
  // tried larger crossover threshold, doesn't help
  if (byte_ct < kBytesPerVec) {
    if (byte_ct < kBytesPerWord) {
      if (byte_ct < 4) {
        for (uintptr_t pos = 0; pos != byte_ct; ++pos) {
          const unsigned char ucc1 = m1_uc[pos];
          const unsigned char ucc2 = m2_uc[pos];
          if (ucc1 != ucc2) {
            return (ucc1 < ucc2)? -1 : 1;
          }
        }
        return 0;
      }
      uint32_t m1_u32 = *S_CAST(const uint32_t*, m1);
      uint32_t m2_u32 = *S_CAST(const uint32_t*, m2);
      if (m1_u32 != m2_u32) {
        return (__builtin_bswap32(m1_u32) < __builtin_bswap32(m2_u32))? -1 : 1;
      }
      if (byte_ct > 4) {
        const uintptr_t final_offset = byte_ct - 4;
        m1_u32 = *R_CAST(const uint32_t*, &(m1_uc[final_offset]));
        m2_u32 = *R_CAST(const uint32_t*, &(m2_uc[final_offset]));
        if (m1_u32 != m2_u32) {
          return (__builtin_bswap32(m1_u32) < __builtin_bswap32(m2_u32))? -1 : 1;
        }
      }
      return 0;
    }
    const uintptr_t* m1_alias = R_CAST(const uintptr_t*, m1_uc);
    const uintptr_t* m2_alias = R_CAST(const uintptr_t*, m2_uc);
    uintptr_t m1_word = m1_alias[0];
    uintptr_t m2_word = m2_alias[0];
    if (m1_word != m2_word) {
      return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
    }
#  ifdef USE_AVX2
    if (byte_ct >= 16) {
      m1_word = m1_alias[1];
      m2_word = m2_alias[1];
      if (m1_word != m2_word) {
        return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
      }
      if (byte_ct >= 24) {
        m1_word = m1_alias[2];
        m2_word = m2_alias[2];
        if (m1_word != m2_word) {
          return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
        }
      }
    }
#  endif
    if (byte_ct % kBytesPerWord) {
      const uintptr_t final_offset = byte_ct - kBytesPerWord;
      m1_word = *R_CAST(const uintptr_t*, &(m1_uc[final_offset]));
      m2_word = *R_CAST(const uintptr_t*, &(m2_uc[final_offset]));
      if (m1_word != m2_word) {
        return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
      }
    }
    return 0;
  }
  const VecUc* m1_alias = S_CAST(const VecUc*, m1);
  const VecUc* m2_alias = S_CAST(const VecUc*, m2);
  const uintptr_t fullvec_ct = byte_ct / kBytesPerVec;
  // uh, clang/LLVM -O2 optimizes this better when comparison is != instead of
  // <?  ugh, time to change all of the for loops...
  // (and yes, both -O3 configurations generate worse code here)
  // at least for loop is better than do-while loop even when 1 iteration is
  // guaranteed...
  for (uintptr_t vidx = 0; vidx != fullvec_ct; ++vidx) {
    const VecUc v1 = vecuc_loadu(&(m1_alias[vidx]));
    const VecUc v2 = vecuc_loadu(&(m2_alias[vidx]));
    // is this even worthwhile now in non-AVX2 case?
    const uint32_t movemask_result = vecuc_movemask(v1 == v2);
    if (movemask_result != kVec8thUintMax) {
      const uintptr_t diff_pos = vidx * kBytesPerVec + ctzu32(~movemask_result);
      return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
    }
  }
  if (byte_ct % kBytesPerVec) {
    const uintptr_t final_offset = byte_ct - kBytesPerVec;
    const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset]));
    const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset]));
    const uint32_t movemask_result = vecuc_movemask(v1 == v2);
    if (movemask_result != kVec8thUintMax) {
      const uintptr_t diff_pos = final_offset + ctzu32(~movemask_result);
      return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
    }
  }
  return 0;
}
#endif

#ifdef __cplusplus
}  // namespace plink2
#endif