1 #ifndef __PGENLIB_MISC_H__
2 #define __PGENLIB_MISC_H__
3
4 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
5 // Christopher Chang.
6 //
7 // This library is free software: you can redistribute it and/or modify it
8 // under the terms of the GNU Lesser General Public License as published by the
9 // Free Software Foundation; either version 3 of the License, or (at your
10 // option) any later version.
11 //
12 // This library is distributed in the hope that it will be useful, but WITHOUT
13 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
15 // for more details.
16 //
17 // You should have received a copy of the GNU Lesser General Public License
18 // along with this library. If not, see <http://www.gnu.org/licenses/>.
19
20
21 // Low-level C99/C++03/C++11 library for reading .pgen (PLINK 2.0 binary) files
22 // (designed to produce good lowest-common-denominator binaries across
23 // Windows/OS X/Linux).
24 //
25 // File format design:
26 // - With the header loaded, it is possible to efficiently access a variant by
27 // its index. Since records can now be variable-length, this sometimes
28 // requires storage of record lengths.
29 // - Due to the power of LD-based compression, we permit a variant record to
30 // just store a list of differences from an earlier, fully stored variant.
31 // However, only short-range dependence is permitted; sequential processing
32 // of the file only requires caching of the most recent explicitly stored
33 // variant.
34 // - Like the plink1 format, this is balanced for relatively easy reading and
35 // writing; in particular, the mode-0x10/0x11 header is not read-optimized,
36 // it passes up some obvious compression opportunities which would make it
37 // more difficult to write e.g. an efficient file merger. This isn't a big
38 // deal if we don't have a huge number of one-sample .pgen files sharing a
39 // single .bim file (or equivalent). (If they don't share the same .bim
40 // file, .bim overhead > .pgen overhead.) If we ever do, we can define an
41 // additional mode to handle that case more efficiently.
42 // - Building blocks are arrays of 1-bit, 2-bit, 4-bit, 1-byte, 2-byte, 3-byte,
43 // and 4-byte values. 3/5/6/7(/9...)-bit values don't play well with
44 // bitwise operations, and when it's important, there's usually a natural way
45 // to split them into power-of-2-bit components.
46 // (unsigned integers known to be smaller than 2^24, but not known to be
47 // smaller than 2^16, are stored as 3-byte values on disk and "decompressed"
48 // to uint32_t during loading.)
49 // - Missing value is usually all-1s. (Only exceptions right now: plink1
50 // backward compatibility mode; presence/absence of rare alts for variants
51 // with >2 alt alleles is an array of 1-bit values, where absence = 0; and
52 // presence/absence of phasing info is similar.) Time to move away from 01
53 // nonsense.
54 // - Individual variant records are prohibited from being >= 4GiB, to reduce
55 // integer overflow issues. (This may be reduced to 2GiB later, but I'll
56 // attempt to handle the 2-4GiB range properly for now since it's conceivable
57 // for multiallelic records in very large datasets to reach that size.)
58 // - (later todo: include stuff like file creation command in .pvar header;
59 // that doesn't really belong in a binary file.)
60 // See the bottom of this header file, and the pgen_spec/ subdirectory of
61 // plink-ng on GitHub, for details.
62
63 // Additional parameter conventions:
64 // - "nyparr" indicates a word-aligned, packed array of 2-bit values, while
65 // "nypvec" is the vector-aligned equivalent. "nybblearr" marks the much
66 // rarer case of a packed array of 4-bit values.
67 // - "nypvec_01" indicates a packed, vector-aligned array of 2-bit values where
68 // each value is zero or one. This data structure was used quite a bit by
69 // plink 1.9 for operating on a subset of a 2-bit-genotype array.
70 // - "genoarr"/"genovec" indicates a nyparr/nypvec containing genotype
71 // information.
72 // - "interleaved_vec" is plink 2.0's preferred alternative to nypvec_01: we
73 // basically stack pairs of adjacent vectors on top of each other and unpack
74 // on the fly, since that tends to be faster than having to access twice as
75 // much memory.
76
77 #include "plink2_bits.h"
78
79 // 10000 * major + 100 * minor + patch
80 // Exception to CONSTI32, since we want the preprocessor to have access to this
81 // value. Named with all caps as a consequence.
82 #define PGENLIB_INTERNAL_VERNUM 1504
83
84 #ifdef __cplusplus
85 namespace plink2 {
86 #endif
87
88 // other configuration-ish values needed by plink2_common subset
89 typedef unsigned char AlleleCode;
90 typedef uint16_t DoubleAlleleCode;
91 static_assert(sizeof(DoubleAlleleCode) == 2 * sizeof(AlleleCode), "Inconsistent AlleleCode and DoubleAlleleCode definitions.");
92 // Set this to 65534 if AlleleCode is uint16_t, 2^24 - 1 if uint32_t.
93 CONSTI32(kPglMaxAltAlleleCt, 254);
94 #ifdef __cplusplus
95 # define kMissingAlleleCode S_CAST(plink2::AlleleCode, -1)
96 #else
97 # define kMissingAlleleCode S_CAST(AlleleCode, -1)
98 #endif
99 CONSTI32(kAlleleCodesPerVec, kBytesPerVec / sizeof(AlleleCode));
100
101 // more verbose than (val + 3) / 4, but may as well make semantic meaning
102 // obvious; any explicit DivUp(val, 4) expressions should have a different
103 // meaning
104 // (not needed for bitct -> bytect, DivUp(val, CHAR_BIT) is clear enough)
// Bytes needed to store val 2-bit entries (4 entries per byte).
HEADER_INLINE uintptr_t NypCtToByteCt(uintptr_t val) {
  return DivUp(val, 4);
}
108
// Vectors needed to store val 2-bit entries.
HEADER_INLINE uintptr_t NypCtToVecCt(uintptr_t val) {
  return DivUp(val, kNypsPerVec);
}
112
// Words needed to store val 2-bit entries (kBitsPerWordD2 entries per word).
HEADER_INLINE uintptr_t NypCtToWordCt(uintptr_t val) {
  return DivUp(val, kBitsPerWordD2);
}
116
NypCtToAlignedWordCt(uintptr_t val)117 HEADER_INLINE uintptr_t NypCtToAlignedWordCt(uintptr_t val) {
118 return kWordsPerVec * NypCtToVecCt(val);
119 }
120
// Cachelines needed to store val 2-bit entries.
HEADER_INLINE uintptr_t NypCtToCachelineCt(uintptr_t val) {
  return DivUp(val, kNypsPerCacheline);
}
124
// Vectors needed to store val AlleleCode values.
HEADER_INLINE uintptr_t AlleleCodeCtToVecCt(uintptr_t val) {
  return DivUp(val, kAlleleCodesPerVec);
}
128
AlleleCodeCtToAlignedWordCt(uintptr_t val)129 HEADER_INLINE uintptr_t AlleleCodeCtToAlignedWordCt(uintptr_t val) {
130 return kWordsPerVec * AlleleCodeCtToVecCt(val);
131 }
132
GetNyparrEntry(const uintptr_t * nyparr,uint32_t idx)133 HEADER_INLINE uintptr_t GetNyparrEntry(const uintptr_t* nyparr, uint32_t idx) {
134 return (nyparr[idx / kBitsPerWordD2] >> (2 * (idx % kBitsPerWordD2))) & 3;
135 }
136
137 // todo: check if this optimizes newval=0 out
AssignNyparrEntry(uint32_t idx,uintptr_t newval,uintptr_t * nyparr)138 HEADER_INLINE void AssignNyparrEntry(uint32_t idx, uintptr_t newval, uintptr_t* nyparr) {
139 const uint32_t bit_shift_ct = 2 * (idx % kBitsPerWordD2);
140 uintptr_t* wordp = &(nyparr[idx / kBitsPerWordD2]);
141 *wordp = ((*wordp) & (~((3 * k1LU) << bit_shift_ct))) | (newval << bit_shift_ct);
142 }
143
ClearNyparrEntry(uint32_t idx,uintptr_t * nyparr)144 HEADER_INLINE void ClearNyparrEntry(uint32_t idx, uintptr_t* nyparr) {
145 nyparr[idx / kBitsPerWordD2] &= ~((3 * k1LU) << (idx % kBitsPerWordD2));
146 }
147
148 // returns a word with low bit in each pair set at each 00.
Word00(uintptr_t ww)149 HEADER_INLINE uintptr_t Word00(uintptr_t ww) {
150 return (~(ww | (ww >> 1))) & kMask5555;
151 }
152
Word01(uintptr_t ww)153 HEADER_INLINE uintptr_t Word01(uintptr_t ww) {
154 return ww & (~(ww >> 1)) & kMask5555;
155 }
156
157 // returns a word with *low* bit in each pair set at each 10.
Word10(uintptr_t ww)158 HEADER_INLINE uintptr_t Word10(uintptr_t ww) {
159 return (~ww) & (ww >> 1) & kMask5555;
160 }
161
Word11(uintptr_t ww)162 HEADER_INLINE uintptr_t Word11(uintptr_t ww) {
163 return ww & (ww >> 1) & kMask5555;
164 }
165
Pack01ToHalfword(uintptr_t ww)166 HEADER_INLINE Halfword Pack01ToHalfword(uintptr_t ww) {
167 return PackWordToHalfwordMask5555(ww & (~(ww >> 1)));
168 }
169
Pack11ToHalfword(uintptr_t ww)170 HEADER_INLINE Halfword Pack11ToHalfword(uintptr_t ww) {
171 return PackWordToHalfwordMask5555(ww & (ww >> 1));
172 }
173
#ifdef USE_SSE42
// Popcount of a word whose 2-bit entries are all 00 or 01.  With hardware
// popcount, the generic PopcountWord() is already optimal.
HEADER_INLINE uint32_t Popcount01Word(uintptr_t val) {
  return PopcountWord(val);
}

// Popcount of a word where only the bottom bit of each nybble may be set.
HEADER_INLINE uint32_t Popcount0001Word(uintptr_t val) {
  return PopcountWord(val);
}
#else
HEADER_INLINE uint32_t Popcount01Word(uintptr_t val) {
  // Summing the 2-bit entries equals the popcount when each entry is 0 or 1.
  return NypsumWord(val);
}

HEADER_INLINE uint32_t Popcount0001Word(uintptr_t val) {
# ifdef __LP64__
  // (val * kMask1111) >> 60 can barely overflow, sigh
  // (so peel off bit 0 and add it back in at the end)
  const uintptr_t val0 = val & 1;
  return (((val - val0) * kMask1111) >> 60) + val0;
# else
  return (val * kMask1111) >> 28;
# endif
}
#endif
197
198 // safe errstr_buf size for pgen_init_phase{1,2}()
199 CONSTI32(kPglErrstrBufBlen, kPglFnamesize + 256);
200
201 // assumes subset_mask has trailing zeroes up to the next vector boundary
202 void FillInterleavedMaskVec(const uintptr_t* __restrict subset_mask, uint32_t base_vec_ct, uintptr_t* interleaved_mask_vec);
203
CopyNyparr(const uintptr_t * __restrict source_nyparr,uint32_t nyparr_entry_ct,uintptr_t * __restrict target_nyparr)204 HEADER_INLINE void CopyNyparr(const uintptr_t* __restrict source_nyparr, uint32_t nyparr_entry_ct, uintptr_t* __restrict target_nyparr) {
205 memcpy(target_nyparr, source_nyparr, NypCtToWordCt(nyparr_entry_ct) * kBytesPerWord);
206 }
207
208 // may want bit past the end of subset_mask (i.e. position
209 // raw_nyparr_entry_ct) to always be allocated and unset. This removes the
210 // need for some explicit end-of-bitarray checks.
211 void CopyNyparrNonemptySubset(const uintptr_t* __restrict raw_nyparr, const uintptr_t* __restrict subset_mask, uint32_t raw_nyparr_entry_ct, uint32_t subset_entry_ct, uintptr_t* __restrict output_nyparr);
212
213 // Copies a bit from raw_bitarr for each genoarr entry matching match_word.
214 // (match_word must be a multiple of kMask5555.)
215 void CopyGenomatchSubset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict genoarr, uintptr_t match_word, uint32_t write_bit_idx_start, uint32_t bit_ct, uintptr_t* __restrict output_bitarr);
216
217 void ExpandBytearrFromGenoarr(const void* __restrict compact_bitarr, const uintptr_t* __restrict genoarr, uintptr_t match_word, uint32_t genoword_ct, uint32_t expand_size, uint32_t read_start_bit, uintptr_t* __restrict target);
218
219
220 // These functions are "unsafe" since they assume trailing bits of last
221 // genovec/genoarr word are zeroed out.
222 void GenovecCount12Unsafe(const uintptr_t* genovec, uint32_t sample_ct, uint32_t* __restrict raw_01_ctp, uint32_t* __restrict raw_10_ctp);
223
224 void Count3FreqVec6(const VecW* geno_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp);
225
226 // vector-alignment preferred.
227 void GenoarrCountFreqsUnsafe(const uintptr_t* genoarr, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);
228
229 // geno_vvec now allowed to be unaligned.
230 void CountSubset3FreqVec6(const VecW* __restrict geno_vvec, const VecW* __restrict interleaved_mask_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp);
231
232 // genoarr vector-alignment preferred.
233 void GenoarrCountSubsetFreqs(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);
234
235 // slower GenoarrCountSubsetFreqs() which does not require
236 // sample_include_interleaved_vec to be precomputed
237 void GenoarrCountSubsetFreqs2(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);
238
239 void GenoarrCountInvsubsetFreqs2(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_exclude, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);
240
241 void GenoarrCountSubsetIntersectFreqs(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict subset1, const uintptr_t* __restrict subset2, uint32_t raw_sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);
242
243 void GenovecInvertUnsafe(uint32_t sample_ct, uintptr_t* genovec);
244
InvertGenoWordUnsafe(uintptr_t geno_word)245 HEADER_INLINE uintptr_t InvertGenoWordUnsafe(uintptr_t geno_word) {
246 return (geno_word ^ ((~(geno_word << 1)) & kMaskAAAA));
247 }
248
249 // too easy to forget to multiply by 2
ZeroTrailingNyps(uintptr_t nyp_ct,uintptr_t * bitarr)250 HEADER_INLINE void ZeroTrailingNyps(uintptr_t nyp_ct, uintptr_t* bitarr) {
251 ZeroTrailingBits(nyp_ct * 2, bitarr);
252 }
253
SetTrailingNyps(uintptr_t nyp_ct,uintptr_t * bitarr)254 HEADER_INLINE void SetTrailingNyps(uintptr_t nyp_ct, uintptr_t* bitarr) {
255 const uintptr_t trail_ct = nyp_ct % kBitsPerWordD2;
256 if (trail_ct) {
257 bitarr[nyp_ct / kBitsPerWordD2] |= (~k0LU) << (nyp_ct * 2);
258 }
259 }
260
// A VINT is a sequence of bytes where each byte stores just 7 bits of an
// integer, and the high bit is set when the integer has more nonzero bits.
263 // See e.g.
264 // https://developers.google.com/protocol-buffers/docs/encoding#varints
265 // (Note that protocol buffers used "group varints" at one point, but then
266 // abandoned them. I suspect they'd be simultaneously slower and less
267 // compact here.)
268
Vint32Append(uint32_t uii,unsigned char * buf)269 HEADER_INLINE unsigned char* Vint32Append(uint32_t uii, unsigned char* buf) {
270 while (uii > 127) {
271 *buf++ = (uii & 127) + 128;
272 uii >>= 7;
273 }
274 *buf++ = uii;
275 return buf;
276 }
277
// Decodes the varint at *buf_iterp, advancing the iterator past the bytes
// consumed.  buf_end is one past the last readable byte.
// Returns 0x80000000U on read-past-end instead of UINT32_MAX so overflow check
// works properly in 32-bit build.  Named "GetVint31" to make it more obvious
// that a 2^31 return value can't be legitimate.
HEADER_INLINE uint32_t GetVint31(const unsigned char* buf_end, const unsigned char** buf_iterp) {
  if (likely(buf_end > (*buf_iterp))) {
    uint32_t vint32 = *((*buf_iterp)++);
    if (vint32 <= 127) {
      // single-byte fast path: continuation (high) bit unset
      return vint32;
    }
    vint32 &= 127;
    uint32_t shift = 7;
    while (likely(buf_end > (*buf_iterp))) {
      uint32_t uii = *((*buf_iterp)++);
      vint32 |= (uii & 127) << shift;
      if (uii <= 127) {
        return vint32;
      }
      shift += 7;
      // currently don't check for shift >= 32 (that's what ValidateVint31()
      // is for).
    }
  }
  return 0x80000000U;
}
302
303 // Input must be validated, or bufp must be >= 5 characters before the end of
304 // the read buffer. Currently unused.
305 // todo: check if this has enough of a speed advantage over GetVint31() to
306 // justify using this in the main loops and catching SIGSEGV. (seems to be no
307 // more than 3%?)
308 /*
309 HEADER_INLINE uint32_t GetVint31Unsafe(const unsigned char** buf_iterp) {
310 uint32_t vint32 = *(*buf_iterp)++;
311 if (vint32 <= 127) {
312 return vint32;
313 }
314 vint32 &= 127;
315 for (uint32_t shift = 7; shift != 35; shift += 7) {
316 uint32_t uii = *(*buf_iterp)++;
317 vint32 |= (uii & 127) << shift;
318 if (uii <= 127) {
319 return vint32;
320 }
321 }
322 return 0x80000000U;
323 }
324 */
325
326 // Does not update buf_iter.
PeekVint31(const unsigned char * buf_iter,const unsigned char * buf_end)327 HEADER_INLINE uint32_t PeekVint31(const unsigned char* buf_iter, const unsigned char* buf_end) {
328 if (likely(buf_end > buf_iter)) {
329 uint32_t vint32 = *buf_iter++;
330 if (vint32 <= 127) {
331 return vint32;
332 }
333 vint32 &= 127;
334 uint32_t shift = 7;
335 while (likely(buf_end > buf_iter)) {
336 uint32_t uii = *buf_iter++;
337 vint32 |= (uii & 127) << shift;
338 if (uii <= 127) {
339 return vint32;
340 }
341 shift += 7;
342 }
343 }
344 return 0x80000000U;
345 }
346
347 /*
348 HEADER_INLINE uint32_t FGetVint31(FILE* ff) {
349 // Can't be used when multiple threads are reading from ff.
350 uint32_t vint32 = getc_unlocked(ff);
351 if (vint32 <= 127) {
352 return vint32;
353 }
354 vint32 &= 127;
355 for (uint32_t shift = 7; shift != 35; shift += 7) {
356 uint32_t uii = getc_unlocked(ff);
357 vint32 |= (uii & 127) << shift;
358 if (uii <= 127) {
359 return vint32;
360 }
361 }
362 return 0x80000000U;
363 }
364
365 HEADER_INLINE void FPutVint31(uint32_t uii, FILE* ff) {
366 // caller's responsibility to periodically check ferror
367 while (uii > 127) {
368 putc_unlocked((uii & 127) + 128, ff);
369 uii >>= 7;
370 }
371 putc_unlocked(uii, ff);
372 }
373 */
374
375 // Need this for sparse multiallelic dosage.
Vint64Append(uint64_t ullii,unsigned char * buf)376 HEADER_INLINE unsigned char* Vint64Append(uint64_t ullii, unsigned char* buf) {
377 while (ullii > 127) {
378 *buf++ = (ullii & 127) + 128;
379 ullii >>= 7;
380 }
381 *buf++ = ullii;
382 return buf;
383 }
384
// Decodes the 64-bit varint at *buf_iterp, advancing the iterator past the
// bytes consumed.
// Returns 2^63 on read-past-end, and named GetVint63 to make it more obvious
// that a 2^63 return value can't be legitimate.
HEADER_INLINE uint64_t GetVint63(const unsigned char* buf_end, const unsigned char** buf_iterp) {
  if (likely(buf_end > (*buf_iterp))) {
    uint64_t vint64 = *((*buf_iterp)++);
    if (vint64 <= 127) {
      // single-byte fast path: continuation (high) bit unset
      return vint64;
    }
    vint64 &= 127;
    uint32_t shift = 7;
    while (likely(buf_end > (*buf_iterp))) {
      uint64_t ullii = *((*buf_iterp)++);
      vint64 |= (ullii & 127) << shift;
      if (ullii <= 127) {
        return vint64;
      }
      shift += 7;
      // currently don't check for shift >= 64 (that's what ValidateVint63()
      // will be for).
    }
  }
  return (1LLU << 63);
}
408
409 // main batch size
410 CONSTI32(kPglNypTransposeBatch, kNypsPerCacheline);
411
412 // word width of each matrix row
413 CONSTI32(kPglNypTransposeWords, kWordsPerCacheline);
414
415 #ifdef __LP64__
416 CONSTI32(kPglNypTransposeBufbytes, (kPglNypTransposeBatch * kPglNypTransposeBatch) / 2);
417
418 void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1);
419 #else // !__LP64__
420 CONSTI32(kPglNypTransposeBufbytes, (kPglNypTransposeBatch * kPglNypTransposeBatch) / 2);
421
422 void TransposeNypblock32(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1);
423 #endif
424 CONSTI32(kPglNypTransposeBufwords, kPglNypTransposeBufbytes / kBytesPerWord);
425
426 // - up to 256x256; vecaligned_buf must have size 16k (64-bit) or 32k (32-bit)
427 // - does NOT zero out trailing bits, because main application is ind-major-bed
428 // <-> plink2 format conversion, where the zeroing would be undone...
429 // - important: write_iter must be allocated up to at least
430 // RoundUpPow2(write_batch_size, 4) rows (may want to remove this
431 // requirement)
432
// Dispatches to the word-width-specific transpose kernel; vecaligned_buf is
// split into two equal halves and passed as the kernel's scratch buffers.
HEADER_INLINE void TransposeNypblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, VecW* vecaligned_buf) {
#ifdef __LP64__
  // assert(!(write_ul_stride % 2));
  TransposeNypblock64(read_iter, read_ul_stride, write_ul_stride, read_batch_size, write_batch_size, write_iter, R_CAST(unsigned char*, vecaligned_buf), &(R_CAST(unsigned char*, vecaligned_buf)[kPglNypTransposeBufbytes / 2]));
#else
  TransposeNypblock32(read_iter, read_ul_stride, write_ul_stride, read_batch_size, write_batch_size, write_iter, R_CAST(unsigned char*, vecaligned_buf), &(R_CAST(unsigned char*, vecaligned_buf)[kPglNypTransposeBufbytes / 2]));
#endif
}
441
442
443 // replaces each x with (32768 - x)
444 // okay for dosage_main to be nullptr if dosage_ct == 0
445 void BiallelicDosage16Invert(uint32_t dosage_ct, uint16_t* dosage_main);
446
447 // replaces each x with -x
448 void BiallelicDphase16Invert(uint32_t dphase_ct, int16_t* dphase_delta);
449
450 // currently does zero trailing halfword
451 void GenoarrToMissingnessUnsafe(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict missingness);
452
453 // currently does not zero trailing halfword
454 void GenoarrToNonmissingnessUnsafe(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict nonmissingness);
455
456 // hom_buf gets set bits when genoarr value is 0 or 2.
457 // ref2het_buf gets set bits when genoarr value is 0 or 1.
458 // N.B. assumes trailing bits of loadbuf have been filled with 1s, not 0s
459 // Also takes genoarr word count instead of sample count.
460 void SplitHomRef2hetUnsafeW(const uintptr_t* genoarr, uint32_t inword_ct, uintptr_t* __restrict hom_buf, uintptr_t* __restrict ref2het_buf);
461
462
463 void SplitHomRef2het(const uintptr_t* genoarr, uint32_t sample_ct, uintptr_t* __restrict hom_buf, uintptr_t* __restrict ref2het_buf);
464
465
466 // These functions use 16- or 256-element lookup tables to apply functions of
467 // the form
468 // f: {0,1,2,3} -> x
469 // to genoarr, saving the output to result[].
470 // 256-element tables result in a substantially faster inner loop, but they are
471 // more expensive to set up and consume a non-negligible fraction of L1 cache,
472 // so they aren't always the right choice.
473 // When lookup table rows are 16 bytes, they are assumed to be 16-byte aligned
474 // in 64-bit builds. result[] is not assumed to be aligned.
475 void GenoarrLookup256x1bx4(const uintptr_t* genoarr, const void* table256x1bx4, uint32_t sample_ct, void* __restrict result);
476
477 void GenoarrLookup16x4bx2(const uintptr_t* genoarr, const void* table16x4bx2, uint32_t sample_ct, void* result);
478
479 void GenoarrLookup256x2bx4(const uintptr_t* genoarr, const void* table256x2bx4, uint32_t sample_ct, void* result);
480
481 void GenoarrLookup4x16b(const uintptr_t* genoarr, const void* table4x16b, uint32_t sample_ct, void* result);
482
483 #define PAIR_TABLE16(a, b, c, d) \
484 {(a), (a), (b), (a), (c), (a), (d), (a), \
485 (a), (b), (b), (b), (c), (b), (d), (b), \
486 (a), (c), (b), (c), (c), (c), (d), (c), \
487 (a), (d), (b), (d), (c), (d), (d), (d)}
488
489 void GenoarrLookup16x8bx2(const uintptr_t* genoarr, const void* table16x8bx2, uint32_t sample_ct, void* result);
490
491 #define QUAD_TABLE256_INTERNAL2(a, b, c, d, f2, f3, f4) \
492 (a), (f2), (f3), (f4), \
493 (b), (f2), (f3), (f4), \
494 (c), (f2), (f3), (f4), \
495 (d), (f2), (f3), (f4)
496 #define QUAD_TABLE256_INTERNAL3(a, b, c, d, f3, f4) \
497 QUAD_TABLE256_INTERNAL2((a), (b), (c), (d), (a), (f3), (f4)), \
498 QUAD_TABLE256_INTERNAL2((a), (b), (c), (d), (b), (f3), (f4)), \
499 QUAD_TABLE256_INTERNAL2((a), (b), (c), (d), (c), (f3), (f4)), \
500 QUAD_TABLE256_INTERNAL2((a), (b), (c), (d), (d), (f3), (f4))
501 #define QUAD_TABLE256_INTERNAL4(a, b, c, d, f4) \
502 QUAD_TABLE256_INTERNAL3((a), (b), (c), (d), (a), (f4)), \
503 QUAD_TABLE256_INTERNAL3((a), (b), (c), (d), (b), (f4)), \
504 QUAD_TABLE256_INTERNAL3((a), (b), (c), (d), (c), (f4)), \
505 QUAD_TABLE256_INTERNAL3((a), (b), (c), (d), (d), (f4))
506 #define QUAD_TABLE256(a, b, c, d) \
507 {QUAD_TABLE256_INTERNAL4((a), (b), (c), (d), (a)), \
508 QUAD_TABLE256_INTERNAL4((a), (b), (c), (d), (b)), \
509 QUAD_TABLE256_INTERNAL4((a), (b), (c), (d), (c)), \
510 QUAD_TABLE256_INTERNAL4((a), (b), (c), (d), (d))}
511
512 void GenoarrLookup256x4bx4(const uintptr_t* genoarr, const void* table256x4bx4, uint32_t sample_ct, void* result);
513
514 // Lookup table initialization functions. table[0][0], [1][0], [2][0], and
515 // [3][0] must be initialized to f(0), f(1), f(2), and f(3) respectively.
516 void InitLookup16x4bx2(void* table16x4bx2);
517
518 void InitLookup16x8bx2(void* table16x8bx2);
519
520 void InitLookup256x1bx4(void* table256x1bx4);
521
522 void InitLookup256x2bx4(void* table256x2bx4);
523
524 void InitLookup256x4bx4(void* table256x4bx4);
525
526 void PhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x4bx2, uint32_t sample_ct, void* result);
527
528 // [0][0]..[3][0], [17][0], and [19][0] should contain the relevant values
529 void InitPhaseLookup4b(void* table56x4bx2);
530
531 void PhaseLookup8b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x8bx2, uint32_t sample_ct, void* result);
532
533 void InitPhaseLookup8b(void* table56x8bx2);
534
535 // het-haploid prohibited. 64-entry table suffices: we use the same bits for
536 // phasepresent and sex_male since they can't be true simultaneously.
537 // phaseinfo is xor'd with bits 1 and 3 instead of 1 and 2.
538 void PhaseXNohhLookup4b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const uintptr_t* sex_male, const void* table64x4bx2, uint32_t sample_ct, void* result);
539
540 // [0][0]..[3][0], [16][0]..[19][0]
541 void InitPhaseXNohhLookup4b(void* table64x4bx2);
542
543 // uses same table as PhaseXNohhLookup
544 void GenoarrSexLookup4b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x4bx2, uint32_t sample_ct, void* result);
545
546 void InitPhaseXNohhLookup8b(void* table64x8bx2);
547
548 void GenoarrSexLookup8b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x8bx2, uint32_t sample_ct, void* result);
549
550 // Unlike PhaseLookup4b(), this allows the cur_phased bit to be set when the
551 // genoarr entry is not 01 (het).
552 void VcfPhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x4bx2, uint32_t sample_ct, void* __restrict result);
553
554 // Precondition:
555 // [0], [2], [4], [6] initialized with unphased entries
556 // [32], [34], [36], [38] initialized with phased-unflipped entries
557 // [162] initialized with phased-flipped case
558 void InitVcfPhaseLookup4b(void* table246x4bx2);
559
560 void VcfPhaseLookup2b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x2bx2, uint32_t sample_ct, void* __restrict result);
561
562 void InitVcfPhaseLookup2b(void* table246x2bx2);
563
564
// Analogue of BitIter1x: iterates over genoarr entries equal to the 2-bit
// value replicated across match_word.  *widxp and *cur_bitsp carry iteration
// state between calls; returns the bit offset of the next matching entry
// within word *widxp (2 * in-word entry index).  Assumes at least one match
// remains (otherwise the word loop runs past the array).
HEADER_INLINE uint32_t GenoIter1x(const uintptr_t* __restrict genoarr, uintptr_t match_word, uintptr_t* __restrict widxp, uintptr_t* __restrict cur_bitsp) {
  uintptr_t cur_bits = *cur_bitsp;
  while (!cur_bits) {
    // xor makes matching entries 00; the next line marks each 00 pair with
    // its low bit (same trick as Word00()).
    cur_bits = genoarr[++(*widxp)] ^ match_word;
    cur_bits = (~(cur_bits | (cur_bits >> 1))) & kMask5555;
  }
  // clear the lowest set bit so the next call advances
  *cur_bitsp = cur_bits & (cur_bits - 1);
  return ctzw(cur_bits);
}
575
576 // For every missing entry in genoarr, clear the corresponding subset and
577 // sparse_vals entries.
578 void ClearGenoarrMissing1bit8Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals);
579
580 void ClearGenoarrMissing1bit16Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals);
581
582 double MultiallelicDiploidMinimac3R2(const uint64_t* __restrict sums, const uint64_t* __restrict hap_ssqs_x2, uint32_t nm_sample_ct, uint32_t allele_ct, uint32_t extra_phased_het_ct);
583
MultiallelicDiploidMachR2(const uint64_t * __restrict sums,const uint64_t * __restrict ssqs,uint32_t nm_sample_ct,uint32_t allele_ct)584 HEADER_INLINE double MultiallelicDiploidMachR2(const uint64_t* __restrict sums, const uint64_t* __restrict ssqs, uint32_t nm_sample_ct, uint32_t allele_ct) {
585 return 2 * MultiallelicDiploidMinimac3R2(sums, ssqs, nm_sample_ct, allele_ct, 0);
586 }
587
588 // ----- end plink2_common subset -----
589
590 // other configuration-ish values
591 // this part of the specification is set in stone.
592
593 CONSTI32(kPglVblockSize, 65536);
594
595 // Currently chosen so that it plus kPglFwriteBlockSize + kCacheline - 2 is
596 // < 2^32, so DivUp(kPglMaxBytesPerVariant + kPglFwriteBlockSize - 1,
597 // kCacheline) doesn't overflow.
598 static const uint32_t kPglMaxBytesPerVariant = 0xfffdffc0U;
599 // CONSTI32(kPglMaxBytesPerDataTrack, 0x7ffff000);
600 // static_assert(kMaxBytesPerIO >= (int32_t)kPglMaxBytesPerDataTrack, "pgenlib assumes a single variant data track always fits in one fread/fwrite operation.");
601
602 // currently must be power of 2, and multiple of (kBitsPerWord / 2)
603 CONSTI32(kPglDifflistGroupSize, 64);
604
605 FLAGSET_DEF_START()
606 kfPgenGlobal0,
607 kfPgenGlobalLdCompressionPresent = (1 << 0),
608 kfPgenGlobalDifflistOrLdPresent = (1 << 1),
609
610 // Only guaranteed to be set when present if phase or dosage also present.
611 kfPgenGlobalMultiallelicHardcallFound = (1 << 2),
612
613 kfPgenGlobalHardcallPhasePresent = (1 << 3),
614 kfPgenGlobalDosagePresent = (1 << 4),
615 kfPgenGlobalDosagePhasePresent = (1 << 5),
616 kfPgenGlobalAllNonref = (1 << 6)
617 FLAGSET_DEF_END(PgenGlobalFlags);
618
619 // difflist/LD compression must not involve more than
620 // raw_sample_ct / kPglMaxDifflistLenDivisor
621 // entries. (however, returned difflists can have up to twice as many entries,
622 // when a variant is LD-compressed and the reference variant is
623 // difflist-compressed.)
624 // This value can be considered set in stone.
625 CONSTI32(kPglMaxDifflistLenDivisor, 8);
626
627 // Threshold for using a deltalist to represent a bitarray on disk (currently
628 // relevant for dosage data). This is a tunable parameter, but must be >=
629 // kPglMaxDifflistLenDivisor.
630 CONSTI32(kPglMaxDeltalistLenDivisor, 9);
631
632 void PgrDifflistToGenovecUnsafe(const uintptr_t* __restrict raregeno, const uint32_t* difflist_sample_ids, uintptr_t difflist_common_geno, uint32_t sample_ct, uint32_t difflist_len, uintptr_t* __restrict genovec);
633
634 // This covers all the possibilities. Todo: switch all functions exporting
635 // multiallelic codes and/or phased dosage to use this struct. (Biallelic
636 // phased hardcalls and unphased dosages are simple enough for this to be
637 // overkill, though.)
typedef struct PgenVariantStruct {
  NONCOPYABLE(PgenVariantStruct);
  // primary 2-bit hardcall array
  uintptr_t* genovec;
  // multiallelic patches: positions where a 01/10 hardcall is overridden
  // with explicit rare-alt allele codes
  uintptr_t* patch_01_set;
  AlleleCode* patch_01_vals;
  uintptr_t* patch_10_set;
  AlleleCode* patch_10_vals;
  // hardcall phase
  uintptr_t* phasepresent;
  uintptr_t* phaseinfo;
  // biallelic dosage
  uintptr_t* dosage_present;
  uint16_t* dosage_main;
  // multiallelic dosage
  uintptr_t* multidosage_present;
  unsigned char* multidosage_cts;
  AlleleCode* multidosage_codes;
  uint16_t* multidosage_vals;
  // biallelic phased dosage
  uintptr_t* dphase_present;
  int16_t* dphase_delta;
  // multiallelic phased dosage
  uintptr_t* multidphase_present;
  unsigned char* multidphase_cts;
  AlleleCode* multidphase_codes;
  int16_t* multidphase_delta;

  // entry/sample counts for the corresponding arrays above
  uint32_t patch_01_ct;
  uint32_t patch_10_ct;
  uint32_t phasepresent_ct;
  uint32_t dosage_ct;
  uint32_t multidosage_sample_ct;
  uint32_t dphase_ct;
  uint32_t multidphase_sample_ct;
} PgenVariant;
668
GetAux1bAlleleEntryByteCt(uint32_t allele_ct,uint32_t rare10_ct)669 HEADER_INLINE uintptr_t GetAux1bAlleleEntryByteCt(uint32_t allele_ct, uint32_t rare10_ct) {
670 assert(allele_ct >= 3);
671 if (allele_ct == 3) {
672 return DivUp(rare10_ct, 8);
673 }
674 if (allele_ct < 6) {
675 return DivUp(rare10_ct, 2);
676 }
677 // one byte per entry for allele_ct <= 17, two bytes for 18..256
678 return ((allele_ct >= 18) + 1) * rare10_ct;
679 // todo: allele_ct > 257
680 }
681
// Lookup table used when converting 2-bit hardcalls to packed pairs of 8-bit
// allele codes.  NOTE(review): the exact index encoding (1024 entries
// suggests several hardcalls, or extra context bits, per index) is not
// visible in this header -- confirm in the implementation.
extern const uint16_t kHcToAlleleCodes[1024];

// Converts the sparse multiallelic representation in *pgvp (genovec plus
// patch_01/patch_10 tracks) into a dense wide_codes array, two AlleleCodes
// per sample.  Permits missing codes, does not remap.
void PglMultiallelicSparseToDenseMiss(const PgenVariant* pgvp, uint32_t sample_ct, AlleleCode* __restrict wide_codes);
686
687 // The actual format:
688 // 1. 2 magic bytes 0x6c 0x1b.
689 //
690 // 2. Mode byte.
691 // 0x01 = plink1 variant-major.
692 // 0x02 = plink2 basic variant-major. variant/sample counts in header,
693 // 00 = hom ref, 01 = het, 10 = hom alt, 11 = missing. (vrtype 0)
694 // 0x03 = plink2 basic unphased dosage (vrtype 0x40)
695 // 0x04 = plink2 basic phased dosage (vrtype 0xc0)
696 // These are designed to be easy to write. Note that the dosage formats
697 // require hardcalls to be stored as well; however, you can just set them
698 // to all-missing and then use
699 // plink2 --hard-call-threshold <...> --make-pgen
700 // to populate them.
701 //
702 // 0x10 = variable-type and/or variable-length records present.
703 // 0x11 = mode 0x10, but with phase set information at the end of the
704 // file.
705 // 0x05..0x0f and 0x12..0x7f are reserved for possible use by future
706 // versions of the PGEN specification, and 0 is off-limits (PLINK 1
707 // sample-major .bed).
708 // 0x80..0xff can be safely used by developers for their own purposes.
709 //
710 // 3. If not plink1-format,
711 // a. 4-byte # of variants; call this M.
712 // b. 4-byte # of samples, call this N.
713 // c. Additional 1-byte header 'control' value (PgenHeaderCtrl). May be
714 // extended in the future.
715 // bits 0-3: Indicates vrtype and variant record length storage widths.
716 // If bit 3 is unset, bits 0-1 store (vrec_len_byte_ct - 1), while bit
717 // 2 is set iff phase or dosage info is present (requiring 8 bits
718 // instead of 4 bits for vrtypes).
719 // If bit 3 is set, a specialized encoding is used which combines the
720 // two pieces of information (reducing the overhead for files with few
721 // samples). The following encodings are now defined (note that there
722 // was a change of plans in Mar 2019):
723 // 8: 1 bit per fused vrtype-length. Unset = vrtype 5, set = vrtype
724 // 0.
725 // 9: 2 bits, multiallelic. 0 = vrtype 5, 1 = vrtype 0, 2-3 = vrtype
726 // 8 with that many more bytes than vrtype 0. Note that this is
727 // limited to 16 ALT alleles.
728 // 10: 2 bits, phased. 0 = vrtype 5, 1 = vrtype 0, 2-3 = vrtype 16
729 // with that many minus 1 bytes beyond vrtype 0. While this is
730 // also aimed at the single-sample use case, it technically
731 // supports up to 15 always-phased or 7 partially-phased samples.
732 // 11: 4 bits, multiallelic + phased. 0 = vrtype 5, 1 = vrtype 0, 2-7
733 // = vrtype 8 with that many bytes beyond vrtype 0, 9 = vrtype 16
734 // phase info requiring just 1 byte, 10-15 = vrtype 24 with (x-7)
735 // extra bytes required between multiallelic and phased tracks.
736 // 12: 2 bits, dosage, must be single-sample. 0 = vrtype 5, 1 =
737 // vrtype 0, 2 = vrtype 0x45 with 2 bytes, 3 = vrtype 0x40 with 3
738 // total bytes.
739 // 13: reserved for single-sample multiallelic + dosage.
740 // 14: 4 bits, phased + dosage, must be single-sample. 0 and 1 as
741 // usual, 3 = vrtype 16 with 1 phaseinfo byte, 4 = vrtype 0x45
742 // with 2 bytes, 5 = vrtype 0x40 with 3 total bytes, 12 = vrtype
743 // 0xc5 with 4 total bytes, 13 = vrtype 0xc0 with 5 total bytes,
744 // 15 = vrtype 0xe0 with 6 total bytes
745 // 15: reserved for single-sample multiallelic + phased dosage.
746 // bits 4-5: allele count storage (00 = unstored, 01-11 = bytes per ct)
747 // bits 6-7: nonref flags info (00 = unstored, 01 = all ref/alt, 10 =
748 // never ref/alt, 11 = explicitly stored)
749 // Bits 0-5 do not apply to the fixed-length modes (currently 0x02-0x04)
750 // and should be zeroed out in that case.
751 //
752 // 4. If mode 0x10/0x11,
753 // a. Array of 8-byte fpos values for the first variant in each vblock.
754 // (Note that this suggests a way to support in-place insertions: some
755 // unused space can be left between the vblocks.)
756 // b. Sequence of header blocks, each containing information about
757 // kPglVblockSize variants (except the last may be shorter). All values
758 // are known-width, to allow e.g. plink2 --make-pgen/--pmerge to compress
759 // all variant records first, then fseek to the beginning of the output
760 // file and write the header.
761 // i. array of 4-bit or 1-byte vrtypes.
762 // ii. array of variant record lengths (each occupying vrec_len_byte_ct
763 // bytes, or 2-4 bits).
764 // iii. if bits 4-5 of {3c} aren't 00, array of alt allele counts.
765 // iv. nonref flags info, if explicitly stored
766 // (this representation allows more efficient random access)
767 // If mode 0x02-0x04, and nonref flags info explicitly stored, just that
768 // bitarray.
769 //
770 // 5. The variant records. See below for details.
771
772 // Difflist format:
773 // a. <difflist_len VINT>
774 // If difflist_len is zero, that's it. Otherwise, the difflist is organized
775 // into 64-element groups (the last group will usually be smaller), to make
776 // extraction of e.g. a single sample less painful. Note that with 20k
777 // samples, a difflist is space-saving even with MAF 5%:
778 // ~1/400 hom alt + ~38/400 het = (~39/400) * 20k
779 // = ~1950 sample IDs.
780 // that's 31 groups, requiring about 2 + 62 + 30 + 488 + 1919 = 2501 bytes
781 // (can be slightly higher since a few ID deltas may be larger than 127);
782 // uncompressed storage requires 5000 bytes.
783 // b. <array of group start sample IDs, each of sample_id_byte_ct>
784 // c. <array of 1-byte <delta segment lengths minus 63>, with last entry
785 // omitted>
786 // d. <optional payload of fixed-width genotype values>
787 // (in retrospect, it might have been better to position this after (e)
788 // to avoid entanglement with the packed-bitarray definition, oh well...)
789 // e. one "delta segment"/group: <array of <group size - 1> VINT values,
790 // each indicating the difference between the current and previous sample
791 // IDs; i.e. value is 1 for two consecutive samples>
792
793 // Variant record type ('vrtype') coding:
794 // bits 0-2:
795 // 000 = Simple 2-bit encoding.
796 // 100, 110, 111 = Simple difflist. Low two bits store the base value. (101
797 // isn't here since Hardy-Weinberg equilibrium prevents
798 // all het ref/alt from arising much in practice, outside of
799 // alignment/variant-calling technical artifacts that should
800 // be removed.)
801 // 010 = Differences-from-earlier-variant encoding ("LD compression"). The
802 // last variant without this type of encoding is the base.
803 // To simplify random access logic, the first variant in each vblock is
804 // prohibited from using this encoding.
805 // 011 = Inverted differences-from-earlier-variant encoding. (This covers
806 // the case where a reference allele is 'wrong'.) When decoding, the
807 // difflist should be processed first, then the entire genovec should
808 // be flipped.
809 // 001 = 1-bit + difflist representation. Suppose most calls are hom ref or
810 // het (e.g. a 20% MAF variant with ~4% hom alt1, ~36% het ref/alt1,
811 // ~64% hom ref), then the main datatrack has just the low bits of the
812 // usual 2-bit codes. This is followed by a difflist containing the
813 // hom alt1 and missing genotypes.
814 // The main datatrack is preceded by a single byte indicating what
815 // the two common values are: 2 low bits = <set value - unset value>,
816 // next 2 bits = unset value (6 possibilities). Top 4 bits are
817 // reserved.
818 // 101 = All alleles are reference, no missing data. The main datatrack is
819 // empty in this case. Although this saves only 1 byte per variant
820 // over vrtype 100, this actually makes a huge difference for
821 // single-sample files.
822 // Since this was not defined until mid-2019, the standard plink2
823 // alpha-test binaries will not use this encoding. However,
824 // alpha-2-final and later binaries interpret this encoding correctly.
825 // If your workflow makes heavy use of single-sample .pgen files, you
826 // can add -DFUTURE_ENCODER during compilation to unlock this feature.
827 //
828 // bit 3: multiallelic hardcalls present with alt2/alt3/... present? If yes,
829 // auxiliary data track #1 disambiguates the 0b01 (ref/altx) and 0b10
830 // (altx/alty, x may equal y) hardcalls. This contains a format byte,
831 // followed by a list of ref/altx patches, then a list of altx/alty
832 // patches. All unpatched genotypes are ref/alt1 or alt1/alt1.
833 // The bottom 4 bits of the format byte describe how the ref/altx patch
834 // set is stored.
835 // 0 = Starts with a bitarray with <total ref/altx count> bits (divide by 8
836 // and round up to get byte count of this component; any trailing bits in
837 // the last byte must be 0), where each set bit corresponds to presence
838 // of a rarealt (i.e. alt2/alt3/...).
839 // ExpandBytearr(aux1_first_quarter, all_01, raw_sample_ctl, ct_01, 0,
840 // patch_01);
841 // can be used to convert this into a set of sample IDs, though we may
842 // want to avoid an intermediate unpacking step in practice. Note that
843 // when we're passing in sample_ct < raw_sample_ct and the main datatrack
844 // is LD-compressed, we'd like ldbase_raw_genovec to be cached.
845 // This is followed by a packed array of fixed-width <allele idx - 2>
846 // values, where the width depends on the total number of alt alleles.
847 // 2 alts: width ZERO. All set bits in the first bitarray correspond
848 // to ref/alt2.
849 // 3 alts: width 1 bit. Set bits correspond to ref/alt3, clear bits
850 // correspond to ref/alt2.
851 // 4-5 alts: width 2 bits. 0b00 corresponds to ref/alt2, 0b01 =
852 // ref/alt3, 0b10 = ref/alt4, etc.
853 // 6-17 alts: width 4 bits.
854 // 18-257 alts: width 8 bits.
855 // 258-65537 alts: width 16 bits.
856 // 65538-16777215 alts: width 24 bits. Reasonable to prohibit more than
857 // 2^24 - 1 = 16777215, since variant records are
858 // limited to 4 GiB. I can imagine some
859 // applications of >65534 in highly variable
860 // regions, though, and it doesn't actually cost us
861 // anything to define a way to represent it. (A
862 // plink2 binary compiled with AlleleCode typedef'd
863 // as uint32_t will run more slowly, of course, but
864 // most binaries will not be compiled that way.)
865 // 1 = Same as mode 0, except the initial bitarray is replaced by a difflist
866 // with sample IDs. (We could make that piece somewhat smaller by
867 // storing 0-based ref/altx indexes instead, but I'm pretty sure that
868 // isn't worth the performance penalty of requiring all_01 and more
869 // complicated unpacking. Though we'll need to peek at aux1[0] before
870 // decompressing the main datatrack to exploit this.)
871 // 15 = Empty patch set. Might remove this (storing this as mode 1 just
872 // takes 1 more byte), but it may enable some relevant performance
873 // optimizations.
874 // 2-14 are reserved for future use. We don't define an efficient way to
875 // represent a variant that e.g. has more alt2s than alt1s for now, since alt
876 // alleles will usually be sorted in order of decreasing frequency, but maybe
877 // this will matter in the future.
878 //
879 // The top 4 bits describe how the altx/alty patch set is stored. 0/1/15
880 // have the same meaning as they do for the ref/altx patch set; the only
881 // thing that changes is the format of the packed array of values at the end.
882 // 2 alts: width 1. This is treated as a special case. Set bits correspond
883 // to alt2/alt2, clear = alt1/alt2.
884 // 3-4 alts: width 2+2 bits. Each stores <allele idx - 1>, with the smaller
885 // number in the lower bits. E.g. alt1/alt2 is stored as 0b0100;
886 // alt3/alt3 is stored as 0b1010.
887 // 5-16 alts: width 4+4 bits.
888 // 17-256 alts: width 8+8 bits.
889 // 257-65536 alts: width 16+16 bits.
890 // 65537-16777215 alts: width 24+24 bits.
891 //
892 // bit 4: hardcall phased? If yes, auxiliary data track #2 contains phasing
893 // information for heterozygous calls.
894 // The first *bit* of the track indicates whether an explicit
895 // "phasepresent" bitarray is stored. If it's set, the next het_ct bits
896 // are 1-bit values, where 0 = no phasing info known, and 1 = phasing
897 // info present. If it's unset, phasing info is present for every het
898 // call.
899 // This is followed by a "phaseinfo" bitarray, where 0 = unswapped,
900 // 1 = swapped (e.g. "1|0" in VCF).
901 // This track is normally unpacked into fixed-size bitarrays when
902 // loaded, but a raw mode is also provided (which doesn't support
903 // subsetting).
904 // By default, entire chromosomes/contigs are assumed to be phased
905 // together. (Todo: support contiguous phase sets.)
906 //
907 // bits 5-6:
908 // 00 = no dosage data.
909 // 01 = dosage list. Auxiliary data track #3 contains a delta-encoded list
910 // of sample IDs (like a difflist, but with no genotypes). Track #4
911 // contains a 16-bit (0..2^15; 65535 missing value is only permitted in
912 // unconditional-dosage case) value expressing the sum of all alt allele
913 // dosages. (Yes, making this the ref allele dosage would have been a
914 // bit cleaner, but it's too late now.)
915 // If the variant is multiallelic, nonzero alt2/alt3/... dosages are
916 // likely to be sparse. So,
917 // - track #5 contains a delta-encoded list describing which
918 // <sample_uidx x rarealt dosage> entries are nonzero, where rarealt
919 // index is in the lowest bits and sample_uidx can be computed via
920 // right-shift (to avoid integer-division headaches, especially
921 // important since indexes in this list can be larger than 2^32).
922 // We use sample_uidx here to make subsetting less painful.
923 // Since each nonzero dosage value requires 16 bits, and
924 // delta-encoding of a dense list adds less than 9 bits per entry,
925 // there isn't much point in supporting a dense bitarray mode here.
926 // To keep memory requirements sane for biobank-scale datasets when
927 // the number of alt alleles is very large, each sample is prohibited
928 // from having more than 255 nonzero allele dosages (this makes no
929 // difference when sizeof(AlleleCode) == 1, but it may matter later).
930 // - track #6 contains the rarealt nonzero dosage values.
931 // Note that this and the other dosage modes are in ADDITION to
932 // hardcalls. This increases filesize by up to 12.5%, but makes the
933 // reader substantially simpler; --hard-call-threshold logic is nicely
934 // compartmentalized.
935 // 10 = unconditional dosage (just track #4).
936 // 11 = dosage bitarray. In this case, auxiliary data track #3 contains an
937 // array of 1-bit values indicating which samples have dosages. If the
938 // variant is multiallelic, tracks #5 and 6 are as described above.
939 // bgen 1.2 format no longer permits fractional missingness, so no good
940 // reason for us to support it.
941 // Considered putting *all* dosage data at the end of the file (like I will
942 // do for phase set info); this could actually be worthwhile for
943 // unconditional dosages, but it doesn't work well when only some samples
944 // have dosage data.
945 // bit 7: some dosages explicitly phased? If yes, and dosages are not
946 // unconditionally present, auxiliary data track #7 is a bitarray of
947 // length dosage_ct indicating whether dphase_delta exists for that
948 // sample. Note that this is technically independent of bit 4; either
949 // can be set without the other. (However, in practice, bit 4 is almost
950 // always set when bit 7 is, since that enables more efficient storage
951 // of 0|0.99, 1|0.02, and similar pairs.)
952 // When phased dosages are present, track #8 contains values
953 // representing <(hap1 alt prob) - (hap2 alt prob)>, etc., where the
954 // underlying values are represented in [0..16384] (so the signed
955 // difference is in [-16384..16384]). Track #4 contains the
956 // corresponding sums; parity does NOT need to match (necessary to allow
957 // this since we treat omitted values as zero; and since we are allowing
958 // it, no point in making parity match in other situations either). In
959 // fixed-width case, -32768 should be stored in track #8 when the entire
960 // call is missing, while 0 and missing-phase are considered synonymous.
961 // In the biallelic case, if a hardcall is phased, a dosage is present,
962 // and no explicit dosage-phase is, we define it to mean the unique
963 // dphase_delta sequence with maximal absolute value, and --make-pgen
964 // takes advantage of it. This definition technically works for
965 // triallelic variants as well, but it breaks down with 4 alleles, so we
966 // prohibit hardcall-phase + dosage + no-dosage-phase with more than 2
967 // alleles.
968 // In the multiallelic case, tracks #9 and #10 are analogous to #5 and
969 // #6.
970 //
971 // Representation of variable ploidy (MT) was considered, but rejected since
972 // dosages should be at least as appropriate for MT.
973 // Oxford/VCF-style storage of separate probabilities for every possible
974 // genotype (e.g. P(AA), P(AB), P(BB) instead of just 2P(AA) + P(AB) and
975 // 2P(BB) + P(AB)) is tentatively rejected due to (i) lack of relevance to
976 // PLINK's analysis functions and (ii) high storage cost where we can afford it
977 // least. In principle, this is subject to reevaluation if (i) changes, but
978 // given the poor interaction with phased dosages, it's probably better to just
979 // think of them as permanently outside PLINK's scope.
980
981 #ifdef __cplusplus
982 } // namespace plink2
983 #endif
984
985 #endif // __PGENLIB_MISC_H__
986