#ifndef __PGENLIB_MISC_H__
#define __PGENLIB_MISC_H__

// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by the
// Free Software Foundation; either version 3 of the License, or (at your
// option) any later version.
//
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
// for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library.  If not, see <http://www.gnu.org/licenses/>.


// Low-level C99/C++03/C++11 library for reading .pgen (PLINK 2.0 binary) files
// (designed to produce good lowest-common-denominator binaries across
// Windows/OS X/Linux).
//
// File format design:
// - With the header loaded, it is possible to efficiently access a variant by
//   its index.  Since records can now be variable-length, this sometimes
//   requires storage of record lengths.
// - Due to the power of LD-based compression, we permit a variant record to
//   just store a list of differences from an earlier, fully stored variant.
//   However, only short-range dependence is permitted; sequential processing
//   of the file only requires caching of the most recent explicitly stored
//   variant.
// - Like the plink1 format, this is balanced for relatively easy reading and
//   writing; in particular, the mode-0x10/0x11 header is not read-optimized.
//   It passes up some obvious compression opportunities which would make it
//   more difficult to write e.g. an efficient file merger.  This isn't a big
//   deal if we don't have a huge number of one-sample .pgen files sharing a
//   single .bim file (or equivalent).  (If they don't share the same .bim
//   file, .bim overhead > .pgen overhead.)  If we ever do, we can define an
//   additional mode to handle that case more efficiently.
// - Building blocks are arrays of 1-bit, 2-bit, 4-bit, 1-byte, 2-byte, 3-byte,
//   and 4-byte values.  3/5/6/7(/9...)-bit values don't play well with
//   bitwise operations, and when it's important, there's usually a natural way
//   to split them into power-of-2-bit components.
//   (Unsigned integers known to be smaller than 2^24, but not known to be
//   smaller than 2^16, are stored as 3-byte values on disk and "decompressed"
//   to uint32_t during loading.)
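//   A minimal sketch (not part of the library) of that 3-byte-to-uint32_t
//   "decompression", assuming a little-endian byte stream src:
//     uint32_t val = S_CAST(uint32_t, src[0]) |
//                    (S_CAST(uint32_t, src[1]) << 8) |
//                    (S_CAST(uint32_t, src[2]) << 16);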
// - Missing value is usually all-1s.  (Only exceptions right now: plink1
//   backward compatibility mode; presence/absence of rare alts for variants
//   with >2 alt alleles is an array of 1-bit values, where absence = 0; and
//   presence/absence of phasing info is similar.)  Time to move away from 01
//   nonsense.
// - Individual variant records are prohibited from being >= 4GiB, to reduce
//   integer overflow issues.  (This may be reduced to 2GiB later, but I'll
//   attempt to handle the 2-4GiB range properly for now since it's conceivable
//   for multiallelic records in very large datasets to reach that size.)
// - (later todo: include stuff like file creation command in .pvar header;
//   that doesn't really belong in a binary file.)
// See the bottom of this header file, and the pgen_spec/ subdirectory of
// plink-ng on GitHub, for details.

// Additional parameter conventions:
// - "nyparr" indicates a word-aligned, packed array of 2-bit values, while
//   "nypvec" is the vector-aligned equivalent.  "nybblearr" marks the much
//   rarer case of a packed array of 4-bit values.
// - "nypvec_01" indicates a packed, vector-aligned array of 2-bit values where
//   each value is zero or one.  This data structure was used quite a bit by
//   plink 1.9 for operating on a subset of a 2-bit-genotype array.
// - "genoarr"/"genovec" indicates a nyparr/nypvec containing genotype
//   information.
// - "interleaved_vec" is plink 2.0's preferred alternative to nypvec_01: we
//   basically stack pairs of adjacent vectors on top of each other and unpack
//   on the fly, since that tends to be faster than having to access twice as
//   much memory.
#include "plink2_bits.h"

// 10000 * major + 100 * minor + patch
// Exception to CONSTI32, since we want the preprocessor to have access to this
// value.  Named with all caps as a consequence.
#define PGENLIB_INTERNAL_VERNUM 1504

#ifdef __cplusplus
namespace plink2 {
#endif

// other configuration-ish values needed by plink2_common subset
typedef unsigned char AlleleCode;
typedef uint16_t DoubleAlleleCode;
static_assert(sizeof(DoubleAlleleCode) == 2 * sizeof(AlleleCode), "Inconsistent AlleleCode and DoubleAlleleCode definitions.");
// Set this to 65534 if AlleleCode is uint16_t, 2^24 - 1 if uint32_t.
CONSTI32(kPglMaxAltAlleleCt, 254);
#ifdef __cplusplus
#  define kMissingAlleleCode S_CAST(plink2::AlleleCode, -1)
#else
#  define kMissingAlleleCode S_CAST(AlleleCode, -1)
#endif
CONSTI32(kAlleleCodesPerVec, kBytesPerVec / sizeof(AlleleCode));

// more verbose than (val + 3) / 4, but may as well make semantic meaning
// obvious; any explicit DivUp(val, 4) expressions should have a different
// meaning
// (not needed for bitct -> bytect, DivUp(val, CHAR_BIT) is clear enough)
HEADER_INLINE uintptr_t NypCtToByteCt(uintptr_t val) {
  return DivUp(val, 4);
}

HEADER_INLINE uintptr_t NypCtToVecCt(uintptr_t val) {
  return DivUp(val, kNypsPerVec);
}

HEADER_INLINE uintptr_t NypCtToWordCt(uintptr_t val) {
  return DivUp(val, kBitsPerWordD2);
}

HEADER_INLINE uintptr_t NypCtToAlignedWordCt(uintptr_t val) {
  return kWordsPerVec * NypCtToVecCt(val);
}

HEADER_INLINE uintptr_t NypCtToCachelineCt(uintptr_t val) {
  return DivUp(val, kNypsPerCacheline);
}

HEADER_INLINE uintptr_t AlleleCodeCtToVecCt(uintptr_t val) {
  return DivUp(val, kAlleleCodesPerVec);
}

HEADER_INLINE uintptr_t AlleleCodeCtToAlignedWordCt(uintptr_t val) {
  return kWordsPerVec * AlleleCodeCtToVecCt(val);
}

HEADER_INLINE uintptr_t GetNyparrEntry(const uintptr_t* nyparr, uint32_t idx) {
  return (nyparr[idx / kBitsPerWordD2] >> (2 * (idx % kBitsPerWordD2))) & 3;
}

// todo: check if this optimizes newval=0 out
HEADER_INLINE void AssignNyparrEntry(uint32_t idx, uintptr_t newval, uintptr_t* nyparr) {
  const uint32_t bit_shift_ct = 2 * (idx % kBitsPerWordD2);
  uintptr_t* wordp = &(nyparr[idx / kBitsPerWordD2]);
  *wordp = ((*wordp) & (~((3 * k1LU) << bit_shift_ct))) | (newval << bit_shift_ct);
}

HEADER_INLINE void ClearNyparrEntry(uint32_t idx, uintptr_t* nyparr) {
  nyparr[idx / kBitsPerWordD2] &= ~((3 * k1LU) << (2 * (idx % kBitsPerWordD2)));
}
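
// Illustrative round trip (not part of the library): on a 64-bit build,
// kBitsPerWordD2 == 32, so entry 35 occupies bits 6..7 of nyparr[1].
/*
  AssignNyparrEntry(35, 2, nyparr);  // nyparr[1] bits 6..7 := 0b10
  assert(GetNyparrEntry(nyparr, 35) == 2);
  ClearNyparrEntry(35, nyparr);      // nyparr[1] bits 6..7 := 0b00
*/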

// returns a word with low bit in each pair set at each 00.
HEADER_INLINE uintptr_t Word00(uintptr_t ww) {
  return (~(ww | (ww >> 1))) & kMask5555;
}

HEADER_INLINE uintptr_t Word01(uintptr_t ww) {
  return ww & (~(ww >> 1)) & kMask5555;
}

// returns a word with *low* bit in each pair set at each 10.
HEADER_INLINE uintptr_t Word10(uintptr_t ww) {
  return (~ww) & (ww >> 1) & kMask5555;
}

HEADER_INLINE uintptr_t Word11(uintptr_t ww) {
  return ww & (ww >> 1) & kMask5555;
}
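
// Worked example (not part of the library): for the low byte 0b11100100,
// i.e. 2-bit entries (3, 2, 1, 0) from high to low,
//   Word01() yields 0b00000100 (low bit of the 0b01 pair set), and
//   Word10() yields 0b00010000 (low bit of the 0b10 pair set).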

HEADER_INLINE Halfword Pack01ToHalfword(uintptr_t ww) {
  return PackWordToHalfwordMask5555(ww & (~(ww >> 1)));
}

HEADER_INLINE Halfword Pack11ToHalfword(uintptr_t ww) {
  return PackWordToHalfwordMask5555(ww & (ww >> 1));
}

#ifdef USE_SSE42
HEADER_INLINE uint32_t Popcount01Word(uintptr_t val) {
  return PopcountWord(val);
}

HEADER_INLINE uint32_t Popcount0001Word(uintptr_t val) {
  return PopcountWord(val);
}
#else
HEADER_INLINE uint32_t Popcount01Word(uintptr_t val) {
  return NypsumWord(val);
}

HEADER_INLINE uint32_t Popcount0001Word(uintptr_t val) {
#  ifdef __LP64__
  // (val * kMask1111) >> 60 can barely overflow, sigh
  const uintptr_t val0 = val & 1;
  return (((val - val0) * kMask1111) >> 60) + val0;
#  else
  return (val * kMask1111) >> 28;
#  endif
}
#endif
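
// (Why the 64-bit Popcount0001Word dance is needed: multiplying a 0001-pattern
// word by kMask1111 accumulates the sum of all 16 nibbles into the top nibble,
// and a popcount of 16 doesn't fit in 4 bits.  Peeling off bit 0 first caps
// the remaining sum at 15.)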

// safe errstr_buf size for pgen_init_phase{1,2}()
CONSTI32(kPglErrstrBufBlen, kPglFnamesize + 256);

// assumes subset_mask has trailing zeroes up to the next vector boundary
void FillInterleavedMaskVec(const uintptr_t* __restrict subset_mask, uint32_t base_vec_ct, uintptr_t* interleaved_mask_vec);

HEADER_INLINE void CopyNyparr(const uintptr_t* __restrict source_nyparr, uint32_t nyparr_entry_ct, uintptr_t* __restrict target_nyparr) {
  memcpy(target_nyparr, source_nyparr, NypCtToWordCt(nyparr_entry_ct) * kBytesPerWord);
}

// We may want the bit past the end of subset_mask (i.e. bit position
// raw_nyparr_entry_ct) to always be allocated and unset, since that removes
// the need for some explicit end-of-bitarray checks.
void CopyNyparrNonemptySubset(const uintptr_t* __restrict raw_nyparr, const uintptr_t* __restrict subset_mask, uint32_t raw_nyparr_entry_ct, uint32_t subset_entry_ct, uintptr_t* __restrict output_nyparr);

// Copies a bit from raw_bitarr for each genoarr entry matching match_word.
// (match_word must be a multiple of kMask5555.)
void CopyGenomatchSubset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict genoarr, uintptr_t match_word, uint32_t write_bit_idx_start, uint32_t bit_ct, uintptr_t* __restrict output_bitarr);

void ExpandBytearrFromGenoarr(const void* __restrict compact_bitarr, const uintptr_t* __restrict genoarr, uintptr_t match_word, uint32_t genoword_ct, uint32_t expand_size, uint32_t read_start_bit, uintptr_t* __restrict target);


// These functions are "unsafe" since they assume trailing bits of last
// genovec/genoarr word are zeroed out.
void GenovecCount12Unsafe(const uintptr_t* genovec, uint32_t sample_ct, uint32_t* __restrict raw_01_ctp, uint32_t* __restrict raw_10_ctp);

void Count3FreqVec6(const VecW* geno_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp);

// vector-alignment preferred.
void GenoarrCountFreqsUnsafe(const uintptr_t* genoarr, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);

// geno_vvec now allowed to be unaligned.
void CountSubset3FreqVec6(const VecW* __restrict geno_vvec, const VecW* __restrict interleaved_mask_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp);

// genoarr vector-alignment preferred.
void GenoarrCountSubsetFreqs(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);

// slower GenoarrCountSubsetFreqs() which does not require
// sample_include_interleaved_vec to be precomputed
void GenoarrCountSubsetFreqs2(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);

void GenoarrCountInvsubsetFreqs2(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_exclude, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);

void GenoarrCountSubsetIntersectFreqs(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict subset1, const uintptr_t* __restrict subset2, uint32_t raw_sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);

void GenovecInvertUnsafe(uint32_t sample_ct, uintptr_t* genovec);

HEADER_INLINE uintptr_t InvertGenoWordUnsafe(uintptr_t geno_word) {
  return (geno_word ^ ((~(geno_word << 1)) & kMaskAAAA));
}

// too easy to forget to multiply by 2
HEADER_INLINE void ZeroTrailingNyps(uintptr_t nyp_ct, uintptr_t* bitarr) {
  ZeroTrailingBits(nyp_ct * 2, bitarr);
}

HEADER_INLINE void SetTrailingNyps(uintptr_t nyp_ct, uintptr_t* bitarr) {
  const uintptr_t trail_ct = nyp_ct % kBitsPerWordD2;
  if (trail_ct) {
    // shift by (2 * trail_ct) rather than (nyp_ct * 2), so the shift count is
    // always less than kBitsPerWord
    bitarr[nyp_ct / kBitsPerWordD2] |= (~k0LU) << (2 * trail_ct);
  }
}

// A VINT is a sequence of bytes where each byte stores just 7 bits of an
// integer, and the high bit is set when the integer has more nonzero bits.
// See e.g.
//   https://developers.google.com/protocol-buffers/docs/encoding#varints
// (Note that protocol buffers used "group varints" at one point, but then
// abandoned them.  I suspect they'd be simultaneously slower and less
// compact here.)
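// Worked example: 300 = 0b1'0010'1100 is encoded as the two bytes 0xac 0x02
// (low 7 bits 0b0101100 with the continuation bit set, then 0b10).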

HEADER_INLINE unsigned char* Vint32Append(uint32_t uii, unsigned char* buf) {
  while (uii > 127) {
    *buf++ = (uii & 127) + 128;
    uii >>= 7;
  }
  *buf++ = uii;
  return buf;
}

// Returns 0x80000000U on read-past-end instead of UINT32_MAX so overflow check
// works properly in 32-bit build.  Named "GetVint31" to make it more obvious
// that a 2^31 return value can't be legitimate.
HEADER_INLINE uint32_t GetVint31(const unsigned char* buf_end, const unsigned char** buf_iterp) {
  if (likely(buf_end > (*buf_iterp))) {
    uint32_t vint32 = *((*buf_iterp)++);
    if (vint32 <= 127) {
      return vint32;
    }
    vint32 &= 127;
    uint32_t shift = 7;
    while (likely(buf_end > (*buf_iterp))) {
      uint32_t uii = *((*buf_iterp)++);
      vint32 |= (uii & 127) << shift;
      if (uii <= 127) {
        return vint32;
      }
      shift += 7;
      // currently don't check for shift >= 32 (that's what ValidateVint31()
      // is for).
    }
  }
  return 0x80000000U;
}

// Input must be validated, or *buf_iterp must be >= 5 characters before the
// end of the read buffer.  Currently unused.
// todo: check if this has enough of a speed advantage over GetVint31() to
// justify using this in the main loops and catching SIGSEGV.  (seems to be no
// more than 3%?)
/*
HEADER_INLINE uint32_t GetVint31Unsafe(const unsigned char** buf_iterp) {
  uint32_t vint32 = *(*buf_iterp)++;
  if (vint32 <= 127) {
    return vint32;
  }
  vint32 &= 127;
  for (uint32_t shift = 7; shift != 35; shift += 7) {
    uint32_t uii = *(*buf_iterp)++;
    vint32 |= (uii & 127) << shift;
    if (uii <= 127) {
      return vint32;
    }
  }
  return 0x80000000U;
}
*/

// Does not update buf_iter.
HEADER_INLINE uint32_t PeekVint31(const unsigned char* buf_iter, const unsigned char* buf_end) {
  if (likely(buf_end > buf_iter)) {
    uint32_t vint32 = *buf_iter++;
    if (vint32 <= 127) {
      return vint32;
    }
    vint32 &= 127;
    uint32_t shift = 7;
    while (likely(buf_end > buf_iter)) {
      uint32_t uii = *buf_iter++;
      vint32 |= (uii & 127) << shift;
      if (uii <= 127) {
        return vint32;
      }
      shift += 7;
    }
  }
  return 0x80000000U;
}

/*
HEADER_INLINE uint32_t FGetVint31(FILE* ff) {
  // Can't be used when multiple threads are reading from ff.
  uint32_t vint32 = getc_unlocked(ff);
  if (vint32 <= 127) {
    return vint32;
  }
  vint32 &= 127;
  for (uint32_t shift = 7; shift != 35; shift += 7) {
    uint32_t uii = getc_unlocked(ff);
    vint32 |= (uii & 127) << shift;
    if (uii <= 127) {
      return vint32;
    }
  }
  return 0x80000000U;
}

HEADER_INLINE void FPutVint31(uint32_t uii, FILE* ff) {
  // caller's responsibility to periodically check ferror
  while (uii > 127) {
    putc_unlocked((uii & 127) + 128, ff);
    uii >>= 7;
  }
  putc_unlocked(uii, ff);
}
*/

// Need this for sparse multiallelic dosage.
HEADER_INLINE unsigned char* Vint64Append(uint64_t ullii, unsigned char* buf) {
  while (ullii > 127) {
    *buf++ = (ullii & 127) + 128;
    ullii >>= 7;
  }
  *buf++ = ullii;
  return buf;
}

// Returns 2^63 on read-past-end, and named GetVint63 to make it more obvious
// that a 2^63 return value can't be legitimate.
HEADER_INLINE uint64_t GetVint63(const unsigned char* buf_end, const unsigned char** buf_iterp) {
  if (likely(buf_end > (*buf_iterp))) {
    uint64_t vint64 = *((*buf_iterp)++);
    if (vint64 <= 127) {
      return vint64;
    }
    vint64 &= 127;
    uint32_t shift = 7;
    while (likely(buf_end > (*buf_iterp))) {
      uint64_t ullii = *((*buf_iterp)++);
      vint64 |= (ullii & 127) << shift;
      if (ullii <= 127) {
        return vint64;
      }
      shift += 7;
      // currently don't check for shift >= 64 (that's what ValidateVint63()
      // will be for).
    }
  }
  return (1LLU << 63);
}

// main batch size
CONSTI32(kPglNypTransposeBatch, kNypsPerCacheline);

// word width of each matrix row
CONSTI32(kPglNypTransposeWords, kWordsPerCacheline);

#ifdef __LP64__
CONSTI32(kPglNypTransposeBufbytes, (kPglNypTransposeBatch * kPglNypTransposeBatch) / 2);

void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1);
#else  // !__LP64__
CONSTI32(kPglNypTransposeBufbytes, (kPglNypTransposeBatch * kPglNypTransposeBatch) / 2);

void TransposeNypblock32(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1);
#endif
CONSTI32(kPglNypTransposeBufwords, kPglNypTransposeBufbytes / kBytesPerWord);

// - up to 256x256; vecaligned_buf must have size 16k (64-bit) or 32k (32-bit)
// - does NOT zero out trailing bits, because the main application is
//   ind-major-bed <-> plink2 format conversion, where the zeroing would be
//   undone...
// - important: write_iter must be allocated up to at least
//   RoundUpPow2(write_batch_size, 4) rows (may want to remove this
//   requirement)

HEADER_INLINE void TransposeNypblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, VecW* vecaligned_buf) {
#ifdef __LP64__
  // assert(!(write_ul_stride % 2));
  TransposeNypblock64(read_iter, read_ul_stride, write_ul_stride, read_batch_size, write_batch_size, write_iter, R_CAST(unsigned char*, vecaligned_buf), &(R_CAST(unsigned char*, vecaligned_buf)[kPglNypTransposeBufbytes / 2]));
#else
  TransposeNypblock32(read_iter, read_ul_stride, write_ul_stride, read_batch_size, write_batch_size, write_iter, R_CAST(unsigned char*, vecaligned_buf), &(R_CAST(unsigned char*, vecaligned_buf)[kPglNypTransposeBufbytes / 2]));
#endif
}


// replaces each x with (32768 - x)
// okay for dosage_main to be nullptr if dosage_ct == 0
void BiallelicDosage16Invert(uint32_t dosage_ct, uint16_t* dosage_main);

// replaces each x with -x
void BiallelicDphase16Invert(uint32_t dphase_ct, int16_t* dphase_delta);

// currently does zero trailing halfword
void GenoarrToMissingnessUnsafe(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict missingness);

// currently does not zero trailing halfword
void GenoarrToNonmissingnessUnsafe(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict nonmissingness);

// hom_buf gets set bits when genoarr value is 0 or 2.
// ref2het_buf gets set bits when genoarr value is 0 or 1.
// N.B. assumes trailing bits of loadbuf have been filled with 1s, not 0s
// Also takes genoarr word count instead of sample count.
void SplitHomRef2hetUnsafeW(const uintptr_t* genoarr, uint32_t inword_ct, uintptr_t* __restrict hom_buf, uintptr_t* __restrict ref2het_buf);


void SplitHomRef2het(const uintptr_t* genoarr, uint32_t sample_ct, uintptr_t* __restrict hom_buf, uintptr_t* __restrict ref2het_buf);


// These functions use 16- or 256-element lookup tables to apply functions of
// the form
//   f: {0,1,2,3} -> x
// to genoarr, saving the output to result[].
// 256-element tables result in a substantially faster inner loop, but they are
// more expensive to set up and consume a non-negligible fraction of L1 cache,
// so they aren't always the right choice.
// When lookup table rows are 16 bytes, they are assumed to be 16-byte aligned
// in 64-bit builds.  result[] is not assumed to be aligned.
void GenoarrLookup256x1bx4(const uintptr_t* genoarr, const void* table256x1bx4, uint32_t sample_ct, void* __restrict result);

void GenoarrLookup16x4bx2(const uintptr_t* genoarr, const void* table16x4bx2, uint32_t sample_ct, void* result);

void GenoarrLookup256x2bx4(const uintptr_t* genoarr, const void* table256x2bx4, uint32_t sample_ct, void* result);

void GenoarrLookup4x16b(const uintptr_t* genoarr, const void* table4x16b, uint32_t sample_ct, void* result);

#define PAIR_TABLE16(a, b, c, d) \
  {(a), (a), (b), (a), (c), (a), (d), (a), \
  (a), (b), (b), (b), (c), (b), (d), (b), \
  (a), (c), (b), (c), (c), (c), (d), (c), \
  (a), (d), (b), (d), (c), (d), (d), (d)}
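
// Illustrative use of the 16x4bx2 kernel (not part of the library; the
// function name and float values, including 3.0f for missing, are arbitrary
// placeholders):
/*
HEADER_INLINE void GenoarrToFloatDosages(const uintptr_t* genoarr, uint32_t sample_ct, float* dosage_buf) {
  // dosage_buf assumed to have room for sample_ct entries, rounded up to a
  // multiple of 2 since the kernel processes genotype pairs.
  static const float kGenoToDosage[32] = PAIR_TABLE16(0.0f, 1.0f, 2.0f, 3.0f);
  GenoarrLookup16x4bx2(genoarr, kGenoToDosage, sample_ct, dosage_buf);
}
*/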

void GenoarrLookup16x8bx2(const uintptr_t* genoarr, const void* table16x8bx2, uint32_t sample_ct, void* result);

#define QUAD_TABLE256_INTERNAL2(a, b, c, d, f2, f3, f4) \
  (a), (f2), (f3), (f4), \
  (b), (f2), (f3), (f4), \
  (c), (f2), (f3), (f4), \
  (d), (f2), (f3), (f4)
#define QUAD_TABLE256_INTERNAL3(a, b, c, d, f3, f4) \
  QUAD_TABLE256_INTERNAL2((a), (b), (c), (d), (a), (f3), (f4)), \
  QUAD_TABLE256_INTERNAL2((a), (b), (c), (d), (b), (f3), (f4)), \
  QUAD_TABLE256_INTERNAL2((a), (b), (c), (d), (c), (f3), (f4)), \
  QUAD_TABLE256_INTERNAL2((a), (b), (c), (d), (d), (f3), (f4))
#define QUAD_TABLE256_INTERNAL4(a, b, c, d, f4) \
  QUAD_TABLE256_INTERNAL3((a), (b), (c), (d), (a), (f4)), \
  QUAD_TABLE256_INTERNAL3((a), (b), (c), (d), (b), (f4)), \
  QUAD_TABLE256_INTERNAL3((a), (b), (c), (d), (c), (f4)), \
  QUAD_TABLE256_INTERNAL3((a), (b), (c), (d), (d), (f4))
#define QUAD_TABLE256(a, b, c, d) \
  {QUAD_TABLE256_INTERNAL4((a), (b), (c), (d), (a)), \
   QUAD_TABLE256_INTERNAL4((a), (b), (c), (d), (b)), \
   QUAD_TABLE256_INTERNAL4((a), (b), (c), (d), (c)), \
   QUAD_TABLE256_INTERNAL4((a), (b), (c), (d), (d))}

void GenoarrLookup256x4bx4(const uintptr_t* genoarr, const void* table256x4bx4, uint32_t sample_ct, void* result);

// Lookup table initialization functions.  table[0][0], [1][0], [2][0], and
// [3][0] must be initialized to f(0), f(1), f(2), and f(3) respectively.
void InitLookup16x4bx2(void* table16x4bx2);

void InitLookup16x8bx2(void* table16x8bx2);

void InitLookup256x1bx4(void* table256x1bx4);

void InitLookup256x2bx4(void* table256x2bx4);

void InitLookup256x4bx4(void* table256x4bx4);

void PhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x4bx2, uint32_t sample_ct, void* result);

// [0][0]..[3][0], [17][0], and [19][0] should contain the relevant values
void InitPhaseLookup4b(void* table56x4bx2);

void PhaseLookup8b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x8bx2, uint32_t sample_ct, void* result);

void InitPhaseLookup8b(void* table56x8bx2);

// het-haploid prohibited.  64-entry table suffices: we use the same bits for
// phasepresent and sex_male since they can't be true simultaneously.
// phaseinfo is xor'd with bits 1 and 3 instead of 1 and 2.
void PhaseXNohhLookup4b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const uintptr_t* sex_male, const void* table64x4bx2, uint32_t sample_ct, void* result);

// [0][0]..[3][0], [16][0]..[19][0]
void InitPhaseXNohhLookup4b(void* table64x4bx2);

// uses same table as PhaseXNohhLookup
void GenoarrSexLookup4b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x4bx2, uint32_t sample_ct, void* result);

void InitPhaseXNohhLookup8b(void* table64x8bx2);

void GenoarrSexLookup8b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x8bx2, uint32_t sample_ct, void* result);

// Unlike PhaseLookup4b(), this allows the cur_phased bit to be set when the
// genoarr entry is not 01 (het).
void VcfPhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x4bx2, uint32_t sample_ct, void* __restrict result);

// Precondition:
//   [0], [2], [4], [6] initialized with unphased entries
//   [32], [34], [36], [38] initialized with phased-unflipped entries
//   [162] initialized with phased-flipped case
void InitVcfPhaseLookup4b(void* table246x4bx2);

void VcfPhaseLookup2b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x2bx2, uint32_t sample_ct, void* __restrict result);

void InitVcfPhaseLookup2b(void* table246x2bx2);


// Analogue of BitIter1x.
HEADER_INLINE uint32_t GenoIter1x(const uintptr_t* __restrict genoarr, uintptr_t match_word, uintptr_t* __restrict widxp, uintptr_t* __restrict cur_bitsp) {
  uintptr_t cur_bits = *cur_bitsp;
  while (!cur_bits) {
    cur_bits = genoarr[++(*widxp)] ^ match_word;
    cur_bits = (~(cur_bits | (cur_bits >> 1))) & kMask5555;
  }
  *cur_bitsp = cur_bits & (cur_bits - 1);
  return ctzw(cur_bits);
}
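
// Illustrative use (not part of the library): iterate over all missing (0b11)
// entries.  Assumes the caller already knows missing_ct, since the scan loop
// above has no bounds check.
/*
  const uintptr_t match_word = 3 * kMask5555;  // 0b11 repeated
  uintptr_t widx = ~k0LU;  // first ++(*widxp) inside GenoIter1x lands on word 0
  uintptr_t cur_bits = 0;
  for (uint32_t uii = 0; uii != missing_ct; ++uii) {
    const uint32_t internal_bit_idx = GenoIter1x(genoarr, match_word, &widx, &cur_bits);
    const uintptr_t sample_idx = widx * kBitsPerWordD2 + (internal_bit_idx / 2);
    // ... process sample_idx ...
  }
*/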

// For every missing entry in genoarr, clear the corresponding subset and
// sparse_vals entries.
void ClearGenoarrMissing1bit8Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals);

void ClearGenoarrMissing1bit16Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals);

double MultiallelicDiploidMinimac3R2(const uint64_t* __restrict sums, const uint64_t* __restrict hap_ssqs_x2, uint32_t nm_sample_ct, uint32_t allele_ct, uint32_t extra_phased_het_ct);

HEADER_INLINE double MultiallelicDiploidMachR2(const uint64_t* __restrict sums, const uint64_t* __restrict ssqs, uint32_t nm_sample_ct, uint32_t allele_ct) {
  return 2 * MultiallelicDiploidMinimac3R2(sums, ssqs, nm_sample_ct, allele_ct, 0);
}

// ----- end plink2_common subset -----

// other configuration-ish values
// this part of the specification is set in stone.

CONSTI32(kPglVblockSize, 65536);

// Currently chosen so that it plus kPglFwriteBlockSize + kCacheline - 2 is
// < 2^32, so DivUp(kPglMaxBytesPerVariant + kPglFwriteBlockSize - 1,
// kCacheline) doesn't overflow.
static const uint32_t kPglMaxBytesPerVariant = 0xfffdffc0U;
// CONSTI32(kPglMaxBytesPerDataTrack, 0x7ffff000);
// static_assert(kMaxBytesPerIO >= (int32_t)kPglMaxBytesPerDataTrack, "pgenlib assumes a single variant data track always fits in one fread/fwrite operation.");

// currently must be power of 2, and multiple of (kBitsPerWord / 2)
CONSTI32(kPglDifflistGroupSize, 64);

FLAGSET_DEF_START()
  kfPgenGlobal0,
  kfPgenGlobalLdCompressionPresent = (1 << 0),
  kfPgenGlobalDifflistOrLdPresent = (1 << 1),

  // Only guaranteed to be set when present if phase or dosage also present.
  kfPgenGlobalMultiallelicHardcallFound = (1 << 2),

  kfPgenGlobalHardcallPhasePresent = (1 << 3),
  kfPgenGlobalDosagePresent = (1 << 4),
  kfPgenGlobalDosagePhasePresent = (1 << 5),
  kfPgenGlobalAllNonref = (1 << 6)
FLAGSET_DEF_END(PgenGlobalFlags);

// difflist/LD compression must not involve more than
//   raw_sample_ct / kPglMaxDifflistLenDivisor
// entries.  (however, returned difflists can have up to twice as many entries,
// when a variant is LD-compressed and the reference variant is
// difflist-compressed.)
// This value can be considered set in stone.
CONSTI32(kPglMaxDifflistLenDivisor, 8);

// Threshold for using a deltalist to represent a bitarray on disk (currently
// relevant for dosage data).  This is a tunable parameter, but must be >=
// kPglMaxDifflistLenDivisor.
CONSTI32(kPglMaxDeltalistLenDivisor, 9);

void PgrDifflistToGenovecUnsafe(const uintptr_t* __restrict raregeno, const uint32_t* difflist_sample_ids, uintptr_t difflist_common_geno, uint32_t sample_ct, uint32_t difflist_len, uintptr_t* __restrict genovec);

// This covers all the possibilities.  Todo: switch all functions exporting
// multiallelic codes and/or phased dosage to use this struct.  (Biallelic
// phased hardcalls and unphased dosages are simple enough for this to be
// overkill, though.)
typedef struct PgenVariantStruct {
  NONCOPYABLE(PgenVariantStruct);
  uintptr_t* genovec;
  uintptr_t* patch_01_set;
  AlleleCode* patch_01_vals;
  uintptr_t* patch_10_set;
  AlleleCode* patch_10_vals;
  uintptr_t* phasepresent;
  uintptr_t* phaseinfo;
  uintptr_t* dosage_present;
  uint16_t* dosage_main;
  uintptr_t* multidosage_present;
  unsigned char* multidosage_cts;
  AlleleCode* multidosage_codes;
  uint16_t* multidosage_vals;
  uintptr_t* dphase_present;
  int16_t* dphase_delta;
  uintptr_t* multidphase_present;
  unsigned char* multidphase_cts;
  AlleleCode* multidphase_codes;
  int16_t* multidphase_delta;

  uint32_t patch_01_ct;
  uint32_t patch_10_ct;
  uint32_t phasepresent_ct;
  uint32_t dosage_ct;
  uint32_t multidosage_sample_ct;
  uint32_t dphase_ct;
  uint32_t multidphase_sample_ct;
} PgenVariant;

HEADER_INLINE uintptr_t GetAux1bAlleleEntryByteCt(uint32_t allele_ct, uint32_t rare10_ct) {
  assert(allele_ct >= 3);
  if (allele_ct == 3) {
    return DivUp(rare10_ct, 8);
  }
  if (allele_ct < 6) {
    return DivUp(rare10_ct, 2);
  }
  // one byte per entry for allele_ct <= 17, two bytes for 18..256
  return ((allele_ct >= 18) + 1) * rare10_ct;
  // todo: allele_ct > 257
}
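
// Worked example: with allele_ct == 5 (4 alt alleles), each aux1b entry packs
// two 2-bit <allele idx - 1> values into half a byte, so rare10_ct == 5
// entries occupy DivUp(5, 2) == 3 bytes.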

extern const uint16_t kHcToAlleleCodes[1024];

// Permits missing codes, does not remap.
void PglMultiallelicSparseToDenseMiss(const PgenVariant* pgvp, uint32_t sample_ct, AlleleCode* __restrict wide_codes);

// The actual format:
// 1. 2 magic bytes 0x6c 0x1b.
//
// 2. Mode byte.
//      0x01 = plink1 variant-major.
//      0x02 = plink2 basic variant-major.  variant/sample counts in header,
//             00 = hom ref, 01 = het, 10 = hom alt, 11 = missing.  (vrtype 0)
//      0x03 = plink2 basic unphased dosage (vrtype 0x40)
//      0x04 = plink2 basic phased dosage (vrtype 0xc0)
//      These are designed to be easy to write.  Note that the dosage formats
//      require hardcalls to be stored as well; however, you can just set them
//      to all-missing and then use
//        plink2 --hard-call-threshold <...> --make-pgen
//      to populate them.
//
//      0x10 = variable-type and/or variable-length records present.
//      0x11 = mode 0x10, but with phase set information at the end of the
//             file.
//      0x05..0x0f and 0x12..0x7f are reserved for possible use by future
//      versions of the PGEN specification, and 0 is off-limits (PLINK 1
//      sample-major .bed).
//      0x80..0xff can be safely used by developers for their own purposes.
//
// 3. If not plink1-format,
//    a. 4-byte # of variants; call this M.
//    b. 4-byte # of samples, call this N.
//    c. Additional 1-byte header 'control' value (PgenHeaderCtrl).  May be
//       extended in the future.
//       bits 0-3: Indicates vrtype and variant record length storage widths.
//         If bit 3 is unset, bits 0-1 store (vrec_len_byte_ct - 1), while bit
//         2 is set iff phase or dosage info is present (requiring 8 bits
//         instead of 4 bits for vrtypes).
//         If bit 3 is set, a specialized encoding is used which combines the
//         two pieces of information (reducing the overhead for files with few
//         samples).  The following encodings are now defined (note that there
//         was a change of plans in Mar 2019):
//           8: 1 bit per fused vrtype-length.  Unset = vrtype 5, set = vrtype
//              0.
//           9: 2 bits, multiallelic.  0 = vrtype 5, 1 = vrtype 0, 2-3 = vrtype
//              8 with that many more bytes than vrtype 0.  Note that this is
//              limited to 16 ALT alleles.
//          10: 2 bits, phased.  0 = vrtype 5, 1 = vrtype 0, 2-3 = vrtype 16
//              with that many minus 1 bytes beyond vrtype 0.  While this is
//              also aimed at the single-sample use case, it technically
//              supports up to 15 always-phased or 7 partially-phased samples.
//          11: 4 bits, multiallelic + phased.  0 = vrtype 5, 1 = vrtype 0, 2-7
//              = vrtype 8 with that many bytes beyond vrtype 0, 9 = vrtype 16
//              phase info requiring just 1 byte, 10-15 = vrtype 24 with (x-7)
//              extra bytes required between multiallelic and phased tracks.
//          12: 2 bits, dosage, must be single-sample.  0 = vrtype 5, 1 =
//              vrtype 0, 2 = vrtype 0x45 with 2 bytes, 3 = vrtype 0x40 with 3
//              total bytes.
//          13: reserved for single-sample multiallelic + dosage.
//          14: 4 bits, phased + dosage, must be single-sample.  0 and 1 as
//              usual, 3 = vrtype 16 with 1 phaseinfo byte, 4 = vrtype 0x45
//              with 2 bytes, 5 = vrtype 0x40 with 3 total bytes, 12 = vrtype
//              0xc5 with 4 total bytes, 13 = vrtype 0xc0 with 5 total bytes,
//              15 = vrtype 0xe0 with 6 total bytes
//          15: reserved for single-sample multiallelic + phased dosage.
//       bits 4-5: allele count storage (00 = unstored, 01-11 = bytes per ct)
//       bits 6-7: nonref flags info (00 = unstored, 01 = all ref/alt, 10 =
//                 never ref/alt, 11 = explicitly stored)
//       Bits 0-5 do not apply to the fixed-length modes (currently 0x02-0x04)
//       and should be zeroed out in that case.
//
// 4. If mode 0x10/0x11,
//    a. Array of 8-byte fpos values for the first variant in each vblock.
//       (Note that this suggests a way to support in-place insertions: some
//       unused space can be left between the vblocks.)
//    b. Sequence of header blocks, each containing information about
//       kPglVblockSize variants (except the last may be shorter).  All values
//       are known-width, to allow e.g. plink2 --make-pgen/--pmerge to compress
//       all variant records first, then fseek to the beginning of the output
//       file and write the header.
//         i. array of 4-bit or 1-byte vrtypes.
//        ii. array of variant record lengths (each occupying vrec_len_byte_ct
//            bytes, or 2-4 bits).
//       iii. if bits 4-5 of {3c} aren't 00, array of alt allele counts.
//        iv. nonref flags info, if explicitly stored
//      (this representation allows more efficient random access)
//    If mode 0x02-0x04, and nonref flags info explicitly stored, just that
//    bitarray.
//
// 5. The variant records.  See below for details.
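//
// For example, a mode-0x02 file with M=1000 variants and N=50 samples would
// start with the 12 header bytes
//   6c 1b 02  e8 03 00 00  32 00 00 00  00
// (magic number, mode, M and N as little-endian uint32s, zeroed control
// byte), followed by 1000 genotype records of NypCtToByteCt(50) == 13 bytes
// each.  (Byte-level sketch for illustration; consult the pgen_spec/
// documents for the authoritative layout.)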

// Difflist format:
//   a. <difflist_len VINT>
//   If difflist_len is zero, that's it.  Otherwise, the difflist is organized
//   into 64-element groups (the last group will usually be smaller), to make
//   extraction of e.g. a single sample less painful.  Note that with 20k
//   samples, a difflist is space-saving even with MAF 5%:
//     ~1/400 hom alt + ~38/400 het = (~39/400) * 20k
//                                  = ~1950 sample IDs.
//     that's 31 groups, requiring about 2 + 62 + 30 + 488 + 1919 = 2501 bytes
//     (can be slightly higher since a few ID deltas may be larger than 127);
//     uncompressed storage requires 5000 bytes.
//   b. <array of group start sample IDs, each of sample_id_byte_ct>
//   c. <array of 1-byte <delta segment lengths minus 63>, with last entry
//      omitted>
//   d. <optional payload of fixed-width genotype values>
//      (in retrospect, it might have been better to position this after (e)
//      to avoid entanglement with the packed-bitarray definition, oh well...)
//   e. one "delta segment"/group: <array of <group size - 1> VINT values,
//      each indicating the difference between the current and previous sample
//      IDs; i.e. value is 1 for two consecutive samples>
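//
// Worked example (illustration only, ignoring the optional genotype payload):
// a single-group difflist over sample IDs {5, 8, 200}, assuming
// sample_id_byte_ct == 1, is serialized as
//   03        difflist_len VINT (3)
//   05        group start sample ID (5)
//             (no segment-length bytes: the only group's entry is omitted)
//   03        delta VINT: 8 - 5
//   c0 01     delta VINT: 200 - 8 = 192, which needs two varint bytes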

// Variant record type ('vrtype') coding:
// bits 0-2:
//   000 = Simple 2-bit encoding.
//   100, 110, 111 = Simple difflist.  Low two bits store the base value.  (101
//                   isn't here since Hardy-Weinberg equilibrium prevents
//                   all het ref/alt from arising much in practice, outside of
//                   alignment/variant-calling technical artifacts that should
//                   be removed.)
//   010 = Differences-from-earlier-variant encoding ("LD compression").  The
//         last variant without this type of encoding is the base.
//         To simplify random access logic, the first variant in each vblock is
//         prohibited from using this encoding.
//   011 = Inverted differences-from-earlier-variant encoding.  (This covers
//         the case where a reference allele is 'wrong'.)  When decoding, the
//         difflist should be processed first, then the entire genovec should
//         be flipped.
//   001 = 1-bit + difflist representation.  Suppose most calls are hom ref or
//         het (e.g. a 20% MAF variant with ~4% hom alt1, ~32% het ref/alt1,
//         ~64% hom ref); then the main datatrack has just the low bits of the
//         usual 2-bit codes.  This is followed by a difflist containing the
//         hom alt1 and missing genotypes.
//         The main datatrack is preceded by a single byte indicating what
//         the two common values are: 2 low bits = <set value - unset value>,
//         next 2 bits = unset value (6 possibilities).  Top 4 bits are
//         reserved.
//   101 = All alleles are reference, no missing data.  The main datatrack is
//         empty in this case.  Although this saves only 1 byte per variant
//         over vrtype 100, this actually makes a huge difference for
//         single-sample files.
//         Since this was not defined until mid-2019, the standard plink2
//         alpha-test binaries will not use this encoding.  However,
//         alpha-2-final and later binaries interpret this encoding correctly.
//         If your workflow makes heavy use of single-sample .pgen files, you
//         can add -DFUTURE_ENCODER during compilation to unlock this feature.
//
// bit 3: multiallelic hardcalls present with alt2/alt3/... present?  If yes,
//        auxiliary data track #1 disambiguates the 0b01 (ref/altx) and 0b10
//        (altx/alty, x may equal y) hardcalls.  This contains a format byte,
//        followed by a list of ref/altx patches, then a list of altx/alty
//        patches.  All unpatched genotypes are ref/alt1 or alt1/alt1.
//        The bottom 4 bits of the format byte describe how the ref/altx patch
//        set is stored.
//   0 = Starts with a bitarray with <total ref/altx count> bits (divide by 8
//       and round up to get byte count of this component; any trailing bits in
//       the last byte must be 0), where each set bit corresponds to presence
//       of a rarealt (i.e. alt2/alt3/...).
//       ExpandBytearr(aux1_first_quarter, all_01, raw_sample_ctl, ct_01, 0,
//                     patch_01);
//       can be used to convert this into a set of sample IDs, though we may
//       want to avoid an intermediate unpacking step in practice.  Note that
//       when we're passing in sample_ct < raw_sample_ct and the main datatrack
//       is LD-compressed, we'd like ldbase_raw_genovec to be cached.
//       This is followed by a packed array of fixed-width <allele idx - 2>
//       values, where the width depends on the total number of alt alleles.
//       2 alts: width ZERO.  All set bits in the first bitarray correspond
//               to ref/alt2.
//       3 alts: width 1 bit.  Set bits correspond to ref/alt3, clear bits
//               correspond to ref/alt2.
//       4-5 alts: width 2 bits.  0b00 corresponds to ref/alt2, 0b01 =
//                 ref/alt3, 0b10 = ref/alt4, etc.
//       6-17 alts: width 4 bits.
//       18-257 alts: width 8 bits.
//       258-65537 alts: width 16 bits.
//       65538-16777215 alts: width 24 bits.  Reasonable to prohibit more than
//                            2^24 - 1 = 16777215, since variant records are
//                            limited to 4 GiB.  I can imagine some
//                            applications of >65534 in highly variable
//                            regions, though, and it doesn't actually cost us
//                            anything to define a way to represent it.  (A
//                            plink2 binary compiled with AlleleCode typedef'd
//                            as uint32_t will run more slowly, of course, but
//                            most binaries will not be compiled that way.)
//   1 = Same as mode 0, except the initial bitarray is replaced by a difflist
//       with sample IDs.  (We could make that piece somewhat smaller by
//       storing 0-based ref/altx indexes instead, but I'm pretty sure that
//       isn't worth the performance penalty of requiring all_01 and more
//       complicated unpacking.  Though we'll need to peek at aux1[0] before
//       decompressing the main datatrack to exploit this.)
//   15 = Empty patch set.  Might remove this (storing this as mode 1 just
//        takes 1 more byte), but it may enable some relevant performance
//        optimizations.
//   2-14 are reserved for future use.  We don't define an efficient way to
//   represent a variant that e.g. has more alt2s than alt1s for now, since alt
//   alleles will usually be sorted in order of decreasing frequency, but maybe
//   this will matter in the future.
//
//   The top 4 bits describe how the altx/alty patch set is stored.  0/1/15
//   have the same meaning as they do for the ref/altx patch set; the only
//   thing that changes is the format of the packed array of values at the end.
//   2 alts: width 1.  This is treated as a special case.  Set bits correspond
//           to alt2/alt2, clear = alt1/alt2.
//   3-4 alts: width 2+2 bits.  Each stores <allele idx - 1>, with the smaller
//             number in the lower bits.  E.g. alt1/alt2 is stored as 0b0100;
//             alt3/alt3 is stored as 0b1010.
//   5-16 alts: width 4+4 bits.
//   17-256 alts: width 8+8 bits.
//   257-65536 alts: width 16+16 bits.
//   65537-16777215 alts: width 24+24 bits.
//
// bit 4: hardcall phased?  If yes, auxiliary data track #2 contains phasing
//        information for heterozygous calls.
//        The first *bit* of the track indicates whether an explicit
//        "phasepresent" bitarray is stored.  If it's set, the next het_ct bits
//        are 1-bit values, where 0 = no phasing info known, and 1 = phasing
//        info present.  If it's unset, phasing info is present for every het
//        call.
//        This is followed by a "phaseinfo" bitarray, where 0 = unswapped,
//        1 = swapped (e.g. "1|0" in VCF).
//        This track is normally unpacked into fixed-size bitarrays when
//        loaded, but a raw mode is also provided (which doesn't support
//        subsetting).
//        By default, entire chromosomes/contigs are assumed to be phased
//        together.  (Todo: support contiguous phase sets.)
//
// bits 5-6:
//   00 = no dosage data.
//   01 = dosage list.  Auxiliary data track #3 contains a delta-encoded list
//        of sample IDs (like a difflist, but with no genotypes).  Track #4
//        contains a 16-bit (0..2^15; the 65535 missing value is only permitted
//        in the unconditional-dosage case) value expressing the sum of all alt
//        allele dosages.  (Yes, making this the ref allele dosage would have
//        been a bit cleaner, but it's too late now.)
//        If the variant is multiallelic, nonzero alt2/alt3/... dosages are
//        likely to be sparse.  So,
//        - track #5 contains a delta-encoded list describing which
//          <sample_uidx x rarealt dosage> entries are nonzero, where rarealt
//          index is in the lowest bits and sample_uidx can be computed via
//          right-shift (to avoid integer-division headaches, especially
//          important since indexes in this list can be larger than 2^32).
//          We use sample_uidx here to make subsetting less painful.
//          Since each nonzero dosage value requires 16 bits, and
//          delta-encoding of a dense list adds less than 9 bits per entry,
//          there isn't much point in supporting a dense bitarray mode here.
//          To keep memory requirements sane for biobank-scale datasets when
//          the number of alt alleles is very large, each sample is prohibited
//          from having more than 255 nonzero allele dosages (this makes no
//          difference when sizeof(AlleleCode) == 1, but it may matter later).
//        - track #6 contains the rarealt nonzero dosage values.
//        Note that this and the other dosage modes are in ADDITION to
//        hardcalls.  This increases filesize by up to 12.5%, but makes the
//        reader substantially simpler; --hard-call-threshold logic is nicely
//        compartmentalized.
//   10 = unconditional dosage (just track #4).
//   11 = dosage bitarray.  In this case, auxiliary data track #3 contains an
//        array of 1-bit values indicating which samples have dosages.  If the
//        variant is multiallelic, tracks #5 and 6 are as described above.
//   bgen 1.2 format no longer permits fractional missingness, so no good
//   reason for us to support it.
//   Considered putting *all* dosage data at the end of the file (like I will
//   do for phase set info); this could actually be worthwhile for
//   unconditional dosages, but it doesn't work well when only some samples
//   have dosage data.
// bit 7: some dosages explicitly phased?  If yes, and dosages are not
//        unconditionally present, auxiliary data track #7 is a bitarray of
//        length dosage_ct indicating whether dphase_delta exists for that
//        sample.  Note that this is technically independent of bit 4; either
//        can be set without the other.  (However, in practice, bit 4 is almost
//        always set when bit 7 is, since that enables more efficient storage
//        of 0|0.99, 1|0.02, and similar pairs.)
//        When phased dosages are present, track #8 contains values
//        representing <(hap1 alt prob) - (hap2 alt prob)>, etc., where the
//        underlying values are represented in [0..16384] (so the signed
//        difference is in [-16384..16384]).  Track #4 contains the
//        corresponding sums; parity does NOT need to match (necessary to allow
//        this since we treat omitted values as zero; and since we are allowing
//        it, no point in making parity match in other situations either).  In
//        the fixed-width case, -32768 should be stored in track #8 when the
//        entire call is missing, while 0 and missing-phase are considered
//        synonymous.
//        In the biallelic case, if a hardcall is phased, a dosage is present,
//        and no explicit dosage-phase is stored, we define it to mean the
//        unique dphase_delta sequence with maximal absolute value, and
//        --make-pgen takes advantage of it.  This definition technically works
//        for triallelic variants as well, but it breaks down with 4 alleles,
//        so we prohibit hardcall-phase + dosage + no-dosage-phase with more
//        than 2 alleles.
//        In the multiallelic case, tracks #9 and #10 are analogous to #5 and
//        #6.
//
// Representation of variable ploidy (MT) was considered, but rejected since
// dosages should be at least as appropriate for MT.
// Oxford/VCF-style storage of separate probabilities for every possible
// genotype (e.g. P(AA), P(AB), P(BB) instead of just 2P(AA) + P(AB) and
// 2P(BB) + P(AB)) is tentatively rejected due to (i) lack of relevance to
// PLINK's analysis functions and (ii) high storage cost where we can afford it
// least.  In principle, this is subject to reevaluation if (i) changes, but
// given the poor interaction with phased dosages, it's probably better to just
// think of them as permanently outside PLINK's scope.

#ifdef __cplusplus
}  // namespace plink2
#endif

#endif  // __PGENLIB_MISC_H__