1
2 /******************************************************************************
3 *
4 * This file is part of meryl-utility, a collection of miscellaneous code
5 * used by Meryl, Canu and others.
6 *
7 * This software is based on:
8 * 'Canu' v2.0 (https://github.com/marbl/canu)
9 * which is based on:
10 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
11 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
12 *
13 * Except as indicated otherwise, this is a 'United States Government Work',
14 * and is released in the public domain.
15 *
16 * File 'README.licenses' in the root directory of this distribution
17 * contains full conditions and disclaimers.
18 */
19
20 #ifndef TYPES_H
21 #define TYPES_H
22
// Never build with _GLIBCXX_PARALLEL.  Its performance is atrocious, and
// its sort is not in-place, so memory use blows up.  AS_global.C has more
// commentary.
#undef _GLIBCXX_PARALLEL

// ISO C99 (7.18.2, 7.18.4, 7.8.1) says these must be defined to get
// INT32_MAX et al.  Define whichever ones are still missing.
#if !defined(__STDC_CONSTANT_MACROS)
#  define __STDC_CONSTANT_MACROS
#endif

#if !defined(__STDC_LIMIT_MACROS)
#  define __STDC_LIMIT_MACROS
#endif

#if !defined(__STDC_FORMAT_MACROS)
#  define __STDC_FORMAT_MACROS
#endif
38
// Mark a parameter as intentionally unused.  UNUSED(name) renames the
// parameter to UNUSED_name and, when __GNUC__ is defined, also attaches the
// 'unused' attribute to it.  This matters for gcc (newer versions in
// particular), which complains about unused parameters.  Thanks to
// ideasman42 at
//   http://stackoverflow.com/questions/3599160/unused-parameter-warnings-in-c-code.
#if defined(__GNUC__)
#  define UNUSED(x) UNUSED_ ## x __attribute__((__unused__))
#else
#  define UNUSED(x) UNUSED_ ## x
#endif
47
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <stdint.h>
51 #include <inttypes.h>
52 #include <limits.h>
53 #include <unistd.h>
54 #include <string.h>
55 #include <ctype.h>
56
57 #include <float.h>
58 #include <cmath>
59
60 #include <assert.h>
61 #include <errno.h>
62 #include <time.h>
63
64 #include <sys/types.h>
65 #include <sys/stat.h>
66
67 #include <omp.h>
68
69 #include <limits>
70 #include <set>
71 #include <vector>
72
// Refuse to compile when the build explicitly requests a 32-bit off_t.
#ifdef _FILE_OFFSET_BITS
#  if _FILE_OFFSET_BITS == 32
#    error I do not support 32-bit off_t.
#  endif
#endif
76
// Friendlier names for the fixed-width integer types.

using uint128 = unsigned __int128;
using int128  = __int128;

using uint64  = uint64_t;
using int64   = int64_t;

using uint32  = uint32_t;
using int32   = int32_t;

using uint16  = uint16_t;
using int16   = int16_t;

using uint8   = uint8_t;
using int8    = int8_t;

// There's no way to assign a constant value to the 128-bit integers
// directly, so these helpers assemble one from two 64-bit integers:  'a'
// supplies the high 64 bits and 'b' supplies the low 64 bits.  This only
// really makes sense for the unsigned flavor, e.g., when used for bit
// packed quantities.
//
// build_int128() copies the bit patterns of 'a' and 'b' into the result.
// 'b' is deliberately NOT sign-extended, so a negative 'b' cannot clobber
// the high half supplied by 'a', and no negative value is ever shifted.

constexpr inline uint128 build_uint128(uint64 a, uint64 b) {
  return (static_cast<uint128>(a) << 64) | static_cast<uint128>(b);
}

constexpr inline int128 build_int128(int64 a, int64 b) {
  return static_cast<int128>(build_uint128(static_cast<uint64>(a),
                                           static_cast<uint64>(b)));
}
101
102 // Some handy constants.
103 //
104 // numeric_limits<> on the 128-bit types is undefined, so we're forced to do
105 // it the hard way.
106
107 constexpr uint128 uint128zero = 0;
108 constexpr uint128 uint128one = 1;
109 constexpr uint128 uint128min = 0;
110 constexpr uint128 uint128max = (uint128)(0xffffffffffffffffllu) << 64 | (uint128)(0xffffffffffffffffllu);
111
112 constexpr int128 int128zero = 0;
113 constexpr int128 int128one = 1;
114 constexpr int128 int128min = (uint128)(0x8000000000000000llu) << 64 | (uint128)(0x0000000000000000llu);
115 constexpr int128 int128max = (uint128)(0x7fffffffffffffffllu) << 64 | (uint128)(0xffffffffffffffffllu);
116
117 constexpr uint64 uint64zero = 0;
118 constexpr uint64 uint64one = 1;
119 constexpr uint64 uint64min = std::numeric_limits<uint64>::min();
120 constexpr uint64 uint64max = std::numeric_limits<uint64>::max();
121
122 constexpr int64 int64zero = 0;
123 constexpr int64 int64one = 1;
124 constexpr int64 int64min = std::numeric_limits<int64>::min();
125 constexpr int64 int64max = std::numeric_limits<int64>::max();
126
127 constexpr uint32 uint32zero = 0;
128 constexpr uint32 uint32one = 1;
129 constexpr uint32 uint32min = std::numeric_limits<uint32>::min();
130 constexpr uint32 uint32max = std::numeric_limits<uint32>::max();
131
132 constexpr int32 int32zero = 0;
133 constexpr int32 int32one = 1;
134 constexpr int32 int32min = std::numeric_limits<int32>::min();
135 constexpr int32 int32max = std::numeric_limits<int32>::max();
136
137 constexpr uint16 uint16zero = 0;
138 constexpr uint16 uint16one = 1;
139 constexpr uint16 uint16min = std::numeric_limits<uint16>::min();
140 constexpr uint16 uint16max = std::numeric_limits<uint16>::max();
141
142 constexpr int16 int16zero = 0;
143 constexpr int16 int16one = 1;
144 constexpr int16 int16min = std::numeric_limits<int16>::min();
145 constexpr int16 int16max = std::numeric_limits<int16>::max();
146
147 constexpr uint8 uint8zero = 0;
148 constexpr uint8 uint8one = 1;
149 constexpr uint8 uint8min = std::numeric_limits<uint8>::min();
150 constexpr uint8 uint8max = std::numeric_limits<uint8>::max();
151
152 constexpr int8 int8zero = 0;
153 constexpr int8 int8one = 1;
154 constexpr int8 int8min = std::numeric_limits<int8>::min();
155 constexpr int8 int8max = std::numeric_limits<int8>::max();
156
157 // Conversion from floating point to integer. lrint() rounds the
158 // floating-point argument to an integer value, using the current rounding
159 // mode. This mode can be set with std::fesetround().
160
doubletoint64(double d)161 inline int64 doubletoint64(double d) { return(std::llrint(d)); }
doubletoint32(double d)162 inline int32 doubletoint32(double d) { return(std:: lrint(d)); }
163
164 // Decoding stings into numbers (and a boolean).
165 // - The first set simply convert the string to a number and return that
166 // number.
167 // - The second set converts the string to a number and returns a pointer
168 // to the letter in the string just after the number.
169 //
170 // There probably should be a strtobool() of the second form, but I'm not
171 // really sure what to do with the 'invalid' case that is currently treated
172 // as 'false'.
173
174 uint128 strtoullll(char const *nptr, char **endptr); // The original strtoul() et al take char**
175 int128 strtollll (char const *nptr, char **endptr); // as the second arg.
176
strtouint128(char const * str)177 inline uint128 strtouint128(char const *str) { return((uint128)strtoullll(str, nullptr)); }
strtoint128(char const * str)178 inline int128 strtoint128 (char const *str) { return( (int128)strtollll (str, nullptr)); }
strtouint64(char const * str)179 inline uint64 strtouint64 (char const *str) { return((uint64) strtoull (str, nullptr, 10)); }
strtoint64(char const * str)180 inline int64 strtoint64 (char const *str) { return( (int64) strtoll (str, nullptr, 10)); }
strtouint32(char const * str)181 inline uint32 strtouint32 (char const *str) { return((uint32) strtoul (str, nullptr, 10)); }
strtoint32(char const * str)182 inline int32 strtoint32 (char const *str) { return( (int32) strtol (str, nullptr, 10)); }
strtouint16(char const * str)183 inline uint16 strtouint16 (char const *str) { return((uint16) strtoul (str, nullptr, 10)); } // WARNING: these convert to
strtoint16(char const * str)184 inline int16 strtoint16 (char const *str) { return( (int16) strtol (str, nullptr, 10)); } // a 32-bit integer then cast
strtouint8(char const * str)185 inline uint8 strtouint8 (char const *str) { return((uint8) strtoul (str, nullptr, 10)); } // to 16- or 8-bit integers.
strtoint8(char const * str)186 inline int8 strtoint8 (char const *str) { return( (int8) strtol (str, nullptr, 10)); }
strtofloat(char const * str)187 inline float strtofloat (char const *str) { return( (float) strtof (str, nullptr)); }
strtodouble(char const * str)188 inline double strtodouble (char const *str) { return((double) strtod (str, nullptr)); }
189
strtonumber(char const * str,uint128 & num)190 inline char const *strtonumber(char const *str, uint128 &num) { char *rem; num = (uint128)strtoullll(str, &rem); return(rem); }
strtonumber(char const * str,int128 & num)191 inline char const *strtonumber(char const *str, int128 &num) { char *rem; num = (int128)strtollll (str, &rem); return(rem); }
strtonumber(char const * str,uint64 & num)192 inline char const *strtonumber(char const *str, uint64 &num) { char *rem; num = (uint64) strtoull (str, &rem, 10); return(rem); }
strtonumber(char const * str,int64 & num)193 inline char const *strtonumber(char const *str, int64 &num) { char *rem; num = (int64) strtoll (str, &rem, 10); return(rem); }
strtonumber(char const * str,uint32 & num)194 inline char const *strtonumber(char const *str, uint32 &num) { char *rem; num = (uint32) strtoul (str, &rem, 10); return(rem); }
strtonumber(char const * str,int32 & num)195 inline char const *strtonumber(char const *str, int32 &num) { char *rem; num = (int32) strtol (str, &rem, 10); return(rem); }
strtonumber(char const * str,uint16 & num)196 inline char const *strtonumber(char const *str, uint16 &num) { char *rem; num = (uint16) strtoul (str, &rem, 10); return(rem); }
strtonumber(char const * str,int16 & num)197 inline char const *strtonumber(char const *str, int16 &num) { char *rem; num = (int16) strtol (str, &rem, 10); return(rem); }
strtonumber(char const * str,uint8 & num)198 inline char const *strtonumber(char const *str, uint8 &num) { char *rem; num = (uint8) strtoul (str, &rem, 10); return(rem); }
strtonumber(char const * str,int8 & num)199 inline char const *strtonumber(char const *str, int8 &num) { char *rem; num = (int8) strtol (str, &rem, 10); return(rem); }
strtonumber(char const * str,float & num)200 inline char const *strtonumber(char const *str, float &num) { char *rem; num = (double) strtof (str, &rem); return(rem); }
strtonumber(char const * str,double & num)201 inline char const *strtonumber(char const *str, double &num) { char *rem; num = (double) strtod (str, &rem); return(rem); }
202
// Decode a string into a boolean.
//
// Returns true for the single letters 'y', 'Y', 't', 'T', '1' and '+', and
// for the words "yes" and "true" in any mix of upper and lower case.
// Everything else, including a null pointer and the empty string, is false.
inline bool strtobool(char const *str) {
  if (str == nullptr)
    return false;

  if (str[0] == 0)
    return false;

  //  Exactly one letter: only a handful of letters mean 'true'.
  if (str[1] == 0) {
    switch (str[0]) {
      case 'y':
      case 'Y':
      case 't':
      case 'T':
      case '1':
      case '+':
        return true;
      default:
        return false;
    }
  }

  //  Longer strings must be one of the accepted words.
  const bool saysYes  = (strcasecmp(str, "yes")  == 0);
  const bool saysTrue = (strcasecmp(str, "true") == 0);

  return saysYes || saysTrue;
}
222
// Test if a character is of the desired encoding.  Each test returns a
// proper bool.

// True only for the NUL byte.
inline bool isNUL(char c) {
  return c == '\0';
}

// True for '!' through '~'; the space is not included.
inline bool isVisible(char c) {
  return !((c < '!') || ('~' < c));
}

// True for 'a'-'z' and 'A'-'Z'.
inline bool isLetter(char c) {
  const bool lower = (c >= 'a') && (c <= 'z');
  const bool upper = (c >= 'A') && (c <= 'Z');

  return lower || upper;
}

// True for space, newline, tab and carriage return.
inline bool isWhiteSpace(char c) {
  switch (c) {
    case ' ':
    case '\n':
    case '\t':
    case '\r':
      return true;
    default:
      return false;
  }
}

// True for '!', '#' and the NUL byte.
inline bool isComment(char c) {
  switch (c) {
    case '!':
    case '#':
    case '\0':
      return true;
    default:
      return false;
  }
}

// True for ':' and '='.
inline bool isDelimiter(char c) {
  switch (c) {
    case ':':
    case '=':
      return true;
    default:
      return false;
  }
}

// True for '0' and '1'.
inline bool isBinDigit(char c) {
  return (c == '0') || (c == '1');
}

// True for '0' through '7'.
inline bool isOctDigit(char c) {
  return (c >= '0') && (c <= '7');
}

// True for '0' through '9'.
inline bool isDecDigit(char c) {
  return (c >= '0') && (c <= '9');
}

// True for '0'-'9', 'a'-'f' and 'A'-'F'.
inline bool isHexDigit(char c) {
  if ((c >= '0') && (c <= '9'))
    return true;

  if ((c >= 'a') && (c <= 'f'))
    return true;

  return (c >= 'A') && (c <= 'F');
}
244
// Tests on whole strings; these are defined elsewhere.
//
// NOTE(review): 'dot' is presumably the character accepted as the decimal
// point, with NUL meaning that none is accepted -- confirm against the
// definition of isDecNumber().

bool isBinNumber(char const *s);
bool isOctNumber(char const *s);
bool isDecNumber(char const *s, char dot = '.');
bool isHexNumber(char const *s);

// isDecNumber() with 'dot' set to the NUL byte.
inline bool isDecInteger(char const *s) {
  return isDecNumber(s, '\0');
}

// isDecNumber() with 'dot' set to '.'.
inline bool isDecFloat(char const *s) {
  return isDecNumber(s, '.');
}
251
// Disallow the usual character tests because of their goofy return values.
//
// Any macro versions are removed first, then an overload taking 'char' is
// declared as deleted for each test, so that calling one with a 'char'
// argument fails to compile.

#undef isalnum
#undef isalpha
#undef iscntrl
#undef isdigit
#undef isgraph
#undef islower
#undef isprint
#undef ispunct
#undef isspace
#undef isupper
#undef isxdigit
#undef isnumber

inline int isalnum (char) = delete;
inline int isalpha (char) = delete;
inline int iscntrl (char) = delete;
inline int isdigit (char) = delete;
inline int isgraph (char) = delete;
inline int islower (char) = delete;
inline int isprint (char) = delete;
inline int ispunct (char) = delete;
inline int isspace (char) = delete;
inline int isupper (char) = delete;
inline int isxdigit(char) = delete;
inline int isnumber(char) = delete;
279
280 // Convert an ascii binary, octal, decimal or hexadecimal letter to an
281 // integer. No type checking is performed; you've already called
282 // isHexNumber() et al, right?
283 //
284 // The pieces of asciiHexToInteger() are as follows:
285 // (d & 0xf) // Decodes '0'-'9' as 0-9, 'a' - 'f' as 1-6
286 // (d >> 6) // Decodes digits as 0, letters as 1.
287 // ((d >> 6) << 3) // Decodes digits as 0, letters as 8.
288
asciiBinToInteger(char d)289 inline uint8 asciiBinToInteger(char d) { return(d - '0'); } // Pretty trivial.
asciiOctToInteger(char d)290 inline uint8 asciiOctToInteger(char d) { return(d - '0'); }
asciiDecToInteger(char d)291 inline uint8 asciiDecToInteger(char d) { return(d - '0'); }
asciiHexToInteger(char d)292 inline uint8 asciiHexToInteger(char d) { return(((uint8)d & 0xf) + ((uint8)d >> 6) + (((uint8)d >> 6) << 3)); }
293
294 // Convert an integer to a printable letter. If it's not a printable
295 // letter, returns '.'.
296
297 inline
298 char
integerToLetter(uint32 i)299 integerToLetter(uint32 i) {
300 return(((' ' <= i) && (i <= '~')) ? i : '.');
301 }
302
// Convert a string representing a set of numbers to
//  - the first and last values (for strings of the form '#' or '#-#')
//  - a vector of the low values and a vector of the high values
//  - a set of the values
//
// The string should be comprised of multiple comma separated ranges:
//  - #     a single number
//  - #-#   a range of numbers
//  - #/#   a one-out-of-N specification
//
// The first form returns a pointer to the letter after the decoded values.
//
// If a single number is encountered in the first or second forms, both
// 'bgn' and 'end' are set to that value.
//
// If 'numberType' is a 128-bit integer, only 64-bit integers can be
// converted.

template<typename numberType>
char const *decodeRange(char const *range, numberType &bgn, numberType &end);

template<typename numberType>
void decodeRange(char const *range, std::vector<numberType> &bgn, std::vector<numberType> &end);

template<typename numberType>
void decodeRange(char const *range, std::set<numberType> &values);
324
325 // Convert an unsigned integer representing bits or bytes to
326 // a floating point number representing GB or MB.
327
bitsToGB(uint64 bits)328 inline double bitsToGB(uint64 bits) { return(bits / 8 / 1024.0 / 1024.0 / 1024.0); }
bitsToMB(uint64 bits)329 inline double bitsToMB(uint64 bits) { return(bits / 8 / 1024.0 / 1024.0); }
330
331 // Convert an unsigned integer to one with 3 significant digit number, and
332 // also return the correct SI base.
333
334 uint64 scaledNumber(uint64 n, uint32 div=1024); // Return n between 0 and div,
335 char scaledUnit (uint64 n, uint32 div=1024); // and the SI unit of that
336 const char *scaledName (uint64 n, uint32 div=1024); // scaling.
337
338 // Convert an unsigned integer to a character string in the desired base.
339 //
340 // char *toXXX(v, str)
341 // Expects a pre-allocated character buffer 'str' with enough space for
342 // the output string and a NUL terminating byte. It returns a pointer
343 // to the NUL byte. A 128-bit integer in:
344 // binary needs 129 bytes
345 // octal needs 44 bytes
346 // decimal needs 40 bytes (it's 340,282,366,920,938,463,463,374,607,431,768,211,455)
347 // hexadecimal needs 33 bytes
348 //
349 // char const *toXX(v)
350 // Returns a pointer to one of 32 private string buffers. This is
351 // thread safe, as long as you don't use it more than 32 times at once.
352 //
353 // Both forms take an optional 'width' (in bits) to display. The actual
354 // width used is the minimum of this width and the number of bits in the
355 // type. toDec() accepts the width, but doesn't use it.
356
357 template<typename uintType> char *toBin(uintType value, char *out, uint32 width=128);
358 template<typename uintType> char *toOct(uintType value, char *out, uint32 width=128);
359 template<typename uintType> char *toDec(uintType value, char *out, uint32 width=128);
360 template<typename uintType> char *toHex(uintType value, char *out, uint32 width=128);
361
362 template<typename uintType> char const *toBin(uintType value, uint32 width=128);
363 template<typename uintType> char const *toOct(uintType value, uint32 width=128);
364 template<typename uintType> char const *toDec(uintType value, uint32 width=128);
365 template<typename uintType> char const *toHex(uintType value, uint32 width=128);
366
367 // Format specifications for printf()
368
// Pointers.  The '0' flag is undefined for the 'p' conversion, and common C
// libraries already emit a "0x" prefix for it, so use the plain conversion
// and let the library pick the representation.
#define F_PTR      "%p"
// Each type gets three macros:
//   F_xxx   - a complete conversion specification.
//   F_xxxP  - only the length modifier and conversion letter (no '%'), for
//             building a custom specification.
//   F_xxxI  - a specification whose field width is supplied as an argument
//             ("%*").

// Characters
#define F_CP       "c"
#define F_C        "%"    F_CP
#define F_CI       "%*"   F_CP

// Strings
#define F_STRP     "s"
#define F_STR      "%"    F_STRP
#define F_STRI     "%*"   F_STRP

// Integers
#define F_S16P     PRId16
#define F_S16      "%"    F_S16P
#define F_S16I     "%*"   F_S16P

#define F_U16P     PRIu16
#define F_U16      "%"    F_U16P
#define F_U16I     "%*"   F_U16P

#define F_S32P     PRId32
#define F_S32      "%"    F_S32P
#define F_S32I     "%*"   F_S32P

#define F_U32P     PRIu32
#define F_U32      "%"    F_U32P
#define F_U32I     "%*"   F_U32P

#define F_S64P     PRId64
#define F_S64      "%"    F_S64P
#define F_S64I     "%*"   F_S64P

#define F_U64P     PRIu64
#define F_U64      "%"    F_U64P
#define F_U64I     "%*"   F_U64P

// 64-bit hexadecimal; F_X64 is zero padded to 16 digits.
#define F_X64P     PRIx64
#define F_X64      "%016" F_X64P
#define F_X64I     "%*"   F_X64P

// Floating points
#define F_F32P     "f"
#define F_F32      "%"    F_F32P
#define F_F32I     "%*"   F_F32P

#define F_F64P     "lf"
#define F_F64      "%"    F_F64P
#define F_F64I     "%*"   F_F64P

// Standard typedefs; off_t is printed as a signed 64-bit integer.
#define F_SIZE_TP  "zu"
#define F_SIZE_T   "%"    F_SIZE_TP
#define F_SIZE_TI  "%*"   F_SIZE_TP

#define F_OFF_TP   F_S64P
#define F_OFF_T    F_S64
#define F_OFF_TI   F_S64I
409
410
411 #endif // TYPES_H
412