1 #ifndef __PGENLIB_READ_H__
2 #define __PGENLIB_READ_H__
3 
4 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
5 // Christopher Chang.
6 //
7 // This library is free software: you can redistribute it and/or modify it
8 // under the terms of the GNU Lesser General Public License as published by the
9 // Free Software Foundation; either version 3 of the License, or (at your
10 // option) any later version.
11 //
12 // This library is distributed in the hope that it will be useful, but WITHOUT
13 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
15 // for more details.
16 //
17 // You should have received a copy of the GNU Lesser General Public License
18 // along with this library.  If not, see <http://www.gnu.org/licenses/>.
19 
20 
21 // pgenlib_read contains reader-specific code.
22 
23 #include "pgenlib_misc.h"
24 
25 #ifdef __cplusplus
26 namespace plink2 {
27 #endif
28 
29 // mmap is a horrible idea for 32-bit builds, and as long as we have non-mmap
30 // code we may as well not worry about Win64 CreateFileMapping.
31 
32 // also, OS X mmap implementation seems to be crappy for large sequentially
33 // accessed files, compared to Linux.
34 
35 // possible todo: SIGBUS handling?  do we ever want to try to recover from an
36 // I/O error?
37 #if defined(_WIN32) || !defined(__LP64__)
38 #  define NO_MMAP
39 #endif
40 
41 FLAGSET_DEF_START()
42   kfPgrLdcache0,
43   kfPgrLdcacheNyp = (1 << 0),
44   kfPgrLdcacheDifflist = (1 << 1),
45   kfPgrLdcacheRawNyp = (1 << 2),
46   // may also want RawDifflist
47   kfPgrLdcacheBasicGenocounts = (1 << 3)
48 FLAGSET_DEF_END(PgrLdcacheFlags);
49 
50 // PgenFileInfo and PgenReader are the main exported "classes".
51 // Exported functions involving these data structure should all have
52 // "pgfi"/"pgr" in their names.
53 
54 // Note that this can be default-copied.
55 typedef struct PgenFileInfoStruct {
56   // ----- Header information, constant after initialization -----
57   uint32_t raw_variant_ct;
58   uint32_t raw_sample_ct;
59 
60   // 0 if variant records aren't all the same length.
61   // If they are (e.g. PLINK 1 encoding; or vrtype bits 0-5 unset), we just
62   // fseek to
63   //   const_fpos_offset + const_vrec_width * ((uint64_t)variant_idx).
64   uint64_t const_fpos_offset;
65 
66   uint32_t const_vrec_width;
67 
68   // see below.  positioned here instead of slightly later due to struct
69   // packing behavior.
70   uint32_t const_vrtype;  // 256 for plink 1 encoding, UINT32_MAX for nonconst
71 
72   // size (raw_variant_ct + 1), so that the number of bytes of (zero-based)
73   // variant n is var_fpos[n+1] - var_fpos[n].  nullptr if
74   // const_vrec_width is nonzero.
75   // It's not difficult to save some memory here (e.g. unless we're dealing
76   // with >256 TB files, it's trivial to go from 8 bytes down to 6 bytes per
77   // entry), but I doubt that's worth the trouble; let's worry about
78   // O(mn)-or-worse stuff, and on-disk stuff, first.
79   uint64_t* var_fpos;
80 
81   // Variant record type codes.
82   // base pointer is null if mode is 0x01-0x04 (const_vrtype != UINT32_MAX).
83   // if not nullptr, required to be length >=
84   //   max(raw_variant_ct + 1, RoundUpPow2(raw_variant_ct, kBytesPerWord))
85   unsigned char* vrtypes;
86 
87   // alt allele counts.
88 
89   // This can be nullptr if all alt allele counts are 1.
90   // (actually, we store the allele index offsets, so
91   // (allele_idx_offsets[n+1] - allele_idx_offsets[n]) is the number of alleles
92   // for variant n.  Otherwise, we'd need another data structure to support
93   // fast allele name lookup.)
94   uintptr_t* allele_idx_offsets;
95 
96   uintptr_t* nonref_flags;
97 
98   // If pgr.nonref_flags is nullptr and kfPgenGlobalAllNonref is unset, all
99   // reference alleles are assumed to be correct.
100   PgenGlobalFlags gflags;
101 
102   uint32_t max_allele_ct;
103   // uint32_t max_dosage_allele_ct;  // might need this later
104 
105   // * nullptr if using mmap
106   // * if using per-variant fread(), this is non-null during Pgen_file_info
107   //   initialization, but it's then "moved" to the first Pgen_reader and set
108   //   to nullptr.
109   FILE* shared_ff;
110 
111   const unsigned char* block_base;  // nullptr if using per-variant fread()
112   uint64_t block_offset;  // 0 for mmap
113 #ifndef NO_MMAP
114   uint64_t file_size;
115 #endif
116 } PgenFileInfo;
117 
118 typedef struct PgenReaderMainStruct {
119   NONCOPYABLE(PgenReaderMainStruct);
120   // would like to make this const, but that makes initialization really
121   // annoying in C99
122   struct PgenFileInfoStruct fi;
123 
124   // ----- Mutable state -----
125   // If we don't fseek, what's the next variant we'd read?  (Still relevant
126   // with mmap due to how LD decompression is implemented.)
127   uint32_t fp_vidx;
128 
129   // ** per-variant fread()-only **
130   FILE* ff;
131   unsigned char* fread_buf;
132   // ** end per-variant fread()-only **
133 
134   // if LD compression is present, cache the last non-LD-compressed variant
135   uint32_t ldbase_vidx;
136 
137   // flags indicating which base_variant buffers are populated
138   PgrLdcacheFlags ldbase_stypes;
139 
140   uint32_t ldbase_difflist_len;
141 
142   // these should be treated as private after initial allocation.
143   // not currently guaranteed to have trailing zeroes.
144   uintptr_t* ldbase_raw_genovec;  // now allocated even with no LD compression
145   uintptr_t* ldbase_genovec;
146   uintptr_t* ldbase_raregeno;
147 
148   // when ldbase_difflist_ids[] is initialized, element [ldbase_difflist_len]
149   // must be set to sample_ct.
150   uint32_t* ldbase_difflist_sample_ids;
151 
152   // common genotype can be looked up from vrtypes[]
153 
154   STD_ARRAY_DECL(uint32_t, 4, ldbase_basic_genocounts);
155 
156   // now only allocated if multiallelic variants, phase, and/or dosage present
157   // most commonly used for unsubsetted genovec; all_hets can be computed from
158   // this and patch_10_{set,vals}, and then aux2 can be interpreted.
159   // can also be used for other purposes after we're done processing aux2.
160   uintptr_t* workspace_vec;
161 
162   // currently must hold (raw_sample_ct / kPglMaxDifflistLenDivisor)
163   // entries; may need to double the sizes later
164   // some top-level interface functions use these, so several lower-level
165   // functions cannot
166   uintptr_t* workspace_raregeno_vec;
167   uint32_t* workspace_difflist_sample_ids;
168 
169   // must hold (raw_sample_ct / kPglMaxDifflistLenDivisor) entries
170   uintptr_t* workspace_raregeno_tmp_loadbuf;
171 
172   uintptr_t* workspace_aux1x_present;
173   uint64_t* workspace_imp_r2;  // needed in multiallelic case
174 
175   uintptr_t* workspace_all_hets;
176   uintptr_t* workspace_subset;  // currently used for hphase decoding
177 
178   uintptr_t* workspace_dosage_present;
179   uintptr_t* workspace_dphase_present;
180 
181   // phase set loading (mode 0x11) unimplemented for now; should be a sequence
182   // of (sample ID, [uint32_t phase set begin, set end), [set begin, set end),
183   // ...).
184 } PgenReaderMain;
185 
186 typedef struct PgenReaderStruct {
187 #ifdef __cplusplus
GET_PRIVATE_mPgenReaderStruct188   PgenReaderMain& GET_PRIVATE_m() { return m; }
GET_PRIVATE_mPgenReaderStruct189   PgenReaderMain const& GET_PRIVATE_m() const { return m; }
190  private:
191 #endif
192   PgenReaderMain m;
193 } PgenReader;
194 
195 CONSTI32(kPglVrtypePlink1, 256);
196 
GetPgfiVrtype(const PgenFileInfo * pgfip,uint32_t vidx)197 HEADER_INLINE uint32_t GetPgfiVrtype(const PgenFileInfo* pgfip, uint32_t vidx) {
198   if (pgfip->vrtypes) {
199     return pgfip->vrtypes[vidx];
200   }
201   return pgfip->const_vrtype;
202 }
203 
GetPgfiFpos(const PgenFileInfo * pgfip,uintptr_t vidx)204 HEADER_INLINE uint64_t GetPgfiFpos(const PgenFileInfo* pgfip, uintptr_t vidx) {
205   if (pgfip->var_fpos) {
206     return pgfip->var_fpos[vidx];
207   }
208   return pgfip->const_fpos_offset + pgfip->const_vrec_width * S_CAST(uint64_t, vidx);
209 }
210 
GetPgfiVrecWidth(const PgenFileInfo * pgfip,uint32_t vidx)211 HEADER_INLINE uint32_t GetPgfiVrecWidth(const PgenFileInfo* pgfip, uint32_t vidx) {
212   if (pgfip->var_fpos) {
213     return pgfip->var_fpos[vidx + 1] - pgfip->var_fpos[vidx];
214   }
215   return pgfip->const_vrec_width;
216 }
217 
PgfiIsSimpleFormat(const PgenFileInfo * pgfip)218 HEADER_INLINE uint32_t PgfiIsSimpleFormat(const PgenFileInfo* pgfip) {
219   return (pgfip->const_vrtype != UINT32_MAX);
220 }
221 
VrtypeDifflist(uint32_t vrtype)222 HEADER_INLINE uint32_t VrtypeDifflist(uint32_t vrtype) {
223   return (vrtype & 4) && ((vrtype & 3) != 1);
224 }
225 
VrtypeLdCompressed(uint32_t vrtype)226 HEADER_INLINE uint32_t VrtypeLdCompressed(uint32_t vrtype) {
227   return (vrtype & 6) == 2;
228 }
229 
230 // Only checks for rarealt-containing hardcall.  Multiallelic dosage may still
231 // be present when this returns zero.
VrtypeMultiallelicHc(uint32_t vrtype)232 HEADER_INLINE uint32_t VrtypeMultiallelicHc(uint32_t vrtype) {
233   return (vrtype & 8);
234 }
235 
VrtypeHphase(uint32_t vrtype)236 HEADER_INLINE uint32_t VrtypeHphase(uint32_t vrtype) {
237   return (vrtype & 0x10);
238 }
239 
VrtypeAuxTracksPresent(uint32_t vrtype)240 HEADER_INLINE uint32_t VrtypeAuxTracksPresent(uint32_t vrtype) {
241   return (vrtype & 0x78);
242 }
243 
VrtypeVariableWidth(uint32_t vrtype)244 HEADER_INLINE uint32_t VrtypeVariableWidth(uint32_t vrtype) {
245   return (vrtype & 0x3e);
246 }
247 
VrtypeDosage(uint32_t vrtype)248 HEADER_INLINE uint32_t VrtypeDosage(uint32_t vrtype) {
249   return (vrtype & 0x60);
250 }
251 
252 static_assert(kPglMaxAltAlleleCt <= 254, "GetAux1xAlleleEntryByteCt() needs to be updated.");
GetAux1aAlleleEntryByteCt(uint32_t allele_ct,uint32_t rare01_ct)253 HEADER_INLINE uintptr_t GetAux1aAlleleEntryByteCt(uint32_t allele_ct, uint32_t rare01_ct) {
254   assert(allele_ct >= 3);
255   if (allele_ct == 3) {
256     return 0;
257   }
258   if (allele_ct == 4) {
259     return DivUp(rare01_ct, 8);
260   }
261   if (allele_ct <= 6) {
262     return DivUp(rare01_ct, 4);
263   }
264   if (allele_ct <= 18) {
265     return DivUp(rare01_ct, 2);
266   }
267   return rare01_ct;
268 }
269 
PgrGetFreadBuf(PgenReader * pgr_ptr)270 HEADER_INLINE unsigned char* PgrGetFreadBuf(PgenReader* pgr_ptr) {
271   PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
272   return pgrp->fread_buf;
273 }
274 
PgrGetVrtypes(PgenReader * pgr_ptr)275 HEADER_INLINE unsigned char* PgrGetVrtypes(PgenReader* pgr_ptr) {
276   PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
277   return pgrp->fi.vrtypes;
278 }
279 
PgrGetNonrefFlags(PgenReader * pgr_ptr)280 HEADER_INLINE uintptr_t* PgrGetNonrefFlags(PgenReader* pgr_ptr) {
281   PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
282   return pgrp->fi.nonref_flags;
283 }
284 
PgrGetGflags(const PgenReader * pgr_ptr)285 HEADER_INLINE PgenGlobalFlags PgrGetGflags(const PgenReader* pgr_ptr) {
286   const PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
287   return pgrp->fi.gflags;
288 }
289 
PgrGetMaxAlleleCt(const PgenReader * pgr_ptr)290 HEADER_INLINE uint32_t PgrGetMaxAlleleCt(const PgenReader* pgr_ptr) {
291   const PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
292   return pgrp->fi.max_allele_ct;
293 }
294 
PgrSetFreadBuf(unsigned char * fread_buf,PgenReader * pgr_ptr)295 HEADER_INLINE void PgrSetFreadBuf(unsigned char* fread_buf, PgenReader* pgr_ptr) {
296   PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
297   pgrp->fread_buf = fread_buf;
298 }
299 
PgrCopyBaseAndOffset(const PgenFileInfo * pgfip,uint32_t thread_ct,PgenReader ** pgr_ptr_arr)300 HEADER_INLINE void PgrCopyBaseAndOffset(const PgenFileInfo* pgfip, uint32_t thread_ct, PgenReader** pgr_ptr_arr) {
301   for (uint32_t tidx = 0; tidx != thread_ct; ++tidx) {
302     PgenReaderMain* pgrp = &GET_PRIVATE(*(pgr_ptr_arr[tidx]), m);
303     pgrp->fi.block_base = pgfip->block_base;
304     pgrp->fi.block_offset = pgfip->block_offset;
305   }
306 }
307 
308 // This is necessary when changing sample_include, unless the new query is
309 // iterating from the first variant.  (Which can almost never be assumed in
310 // plink2 since variant_include[] may not include the first variant.)
PgrClearLdCache(PgenReader * pgr_ptr)311 HEADER_INLINE void PgrClearLdCache(PgenReader* pgr_ptr) {
312   PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
313   pgrp->ldbase_stypes &= kfPgrLdcacheRawNyp;
314 
315   // bugfix, LdLoadNecessary() was otherwise claiming that reload wasn't
316   // necessary in certain cases
317   pgrp->ldbase_vidx = 0x80000000U;
318 }
319 
320 // Design change (30 Nov 2019): It is easy to forget to call PgrClearLdCache
321 // when changing sample_include.  However, each sample_include change must be
322 // accompanied by a sample_include_cumulative_popcounts update.  So, if we
323 // define a sample_include_cumulative_popcounts wrapper-type which can only be
324 // initialized by a function that also clears a PgenReader LD cache, and modify
325 // all PgrGet... functions to require this wrapper-type, the frequency of
326 // foot-shooting should go down.
327 //
328 // The key usage rule is: only use this as a local variable type, and define
329 // only one of these per function (unless you're using multiple PgenReaders
330 // simultaneously, anyway).  If you're changing the sample-subset when entering
331 // and exiting chrY, call PgrSetSampleSubsetIndex on your single
332 // PgrSampleSubsetIndex at the time you're crossing a chrY boundary.  Don't
333 // define two preinitialized PgrSetSampleSubsetIndexes...
334 // (possible todo: if compiling as C++ and NDEBUG isn't defined, add a counter
335 // field to PgenReader which is initialized to zero, asserted to be zero and
336 // then incremented by PgrSetSampleSubsetIndex, and decremented by the
337 // PgrSampleSubsetIndex destructor.)
338 typedef struct PgrSampleSubsetIndexStruct {
339 #ifdef __cplusplus
GET_PRIVATE_cumulative_popcountsPgrSampleSubsetIndexStruct340   const uint32_t*& GET_PRIVATE_cumulative_popcounts() { return cumulative_popcounts; }
GET_PRIVATE_cumulative_popcountsPgrSampleSubsetIndexStruct341   const uint32_t* const& GET_PRIVATE_cumulative_popcounts() const { return cumulative_popcounts; }
342  private:
343 #endif
344   const uint32_t* cumulative_popcounts;
345 } PgrSampleSubsetIndex;
346 
PgrSetSampleSubsetIndex(const uint32_t * sample_include_cumulative_popcounts,PgenReader * pgr_ptr,PgrSampleSubsetIndex * pssi_ptr)347 HEADER_INLINE void PgrSetSampleSubsetIndex(const uint32_t* sample_include_cumulative_popcounts, PgenReader* pgr_ptr, PgrSampleSubsetIndex* pssi_ptr) {
348   GET_PRIVATE(*pssi_ptr, cumulative_popcounts) = sample_include_cumulative_popcounts;
349   PgrClearLdCache(pgr_ptr);
350 }
351 
PgrClearSampleSubsetIndex(PgenReader * pgr_ptr,PgrSampleSubsetIndex * pssi_ptr)352 HEADER_INLINE void PgrClearSampleSubsetIndex(PgenReader* pgr_ptr, PgrSampleSubsetIndex* pssi_ptr) {
353   GET_PRIVATE(*pssi_ptr, cumulative_popcounts) = nullptr;
354   if (pgr_ptr) {
355     PgrClearLdCache(pgr_ptr);
356   }
357 }
358 
PgrSetBaseAndOffset0(unsigned char * block_base,uint32_t thread_ct,PgenReader ** pgr_ptr_arr)359 HEADER_INLINE void PgrSetBaseAndOffset0(unsigned char* block_base, uint32_t thread_ct, PgenReader** pgr_ptr_arr) {
360   for (uint32_t tidx = 0; tidx != thread_ct; ++tidx) {
361     PgenReader* pgr_ptr = pgr_ptr_arr[tidx];
362     PgrClearLdCache(pgr_ptr);
363     PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
364     pgrp->fi.block_base = block_base;
365     pgrp->fi.block_offset = 0;
366   }
367 }
368 
369 // PgenFileInfo initialization is split into two phases, to decouple
370 // plink2's arena allocator from this library.  (obvious todo: provide a simple
371 // malloc-using PgenReader constructor for anyone who doesn't want to worry
372 // about these details.)
373 //
374 // Phase 1: Open the .pgen; verify that the initial bytes are consistent with
375 //   the file format; load/verify sample and variant counts, initialize
376 //   pgfi.const_vrtype, pgfi.const_vrec_width, and pgfi.const_fpos_offset;
377 //   determine initial memory allocation requirement.  first_alloc_cacheline_ct
378 //   does not include allele counts and nonref flags, since it may be more
379 //   appropriate to allocate those arrays earlier (during loading of a
380 //   .bim-like file).
381 //
382 //   pgfi.var_fpos is set to nullptr if pgfi.const_vrec_width is nonzero.
383 //   pgfi.vrtypes/var_allele_cts are set to nullptr in the plink1-format case.
384 //
385 //   raw_sample_ct and raw_variant_ct should be UINT32_MAX if not previously
386 //   known.
387 //
388 // Intermission: Caller obtains a block of pgfi_alloc_cacheline_ct * 64 bytes,
389 //   64-byte aligned.  The cachealigned_malloc() function can be used for this
390 //   purpose.  If necessary, pgfi.allele_idx_offsets and pgfi.nonref_flags
391 //   should be pointed at already-loaded data, or allocated so they can be
392 //   loaded during phase 2.
393 //
394 // Phase 2: Initialize most pointers in the PgenReader struct to appropriate
395 //   positions in first_alloc.  For modes 0x10-0x11, load pgfi.var_fpos and
396 //   pgfi.vrtypes, load/validate pgfi.allele_idx_offsets and pgfi.nonref_flags
397 //   if appropriate, and initialize pgfi.gflags, pgfi.max_allele_ct, and
398 //   pgfi.max_dosage_allele_ct.
399 //
400 // Finally, if block-fread mode is being used, pgfi.block_base must be
401 //   initialized to point to a memory large enough to handle the largest
402 //   pgfi_block_read() operation that will be attempted.
403 //   pgfi_blockload_get_cacheline_req() can be used to determine the necessary
404 //   buffer size.
405 
406 // This type may change if we introduce a more read-optimized format in the
407 // future.  Right now it just tracks the presence/absence of two optional
408 // pieces of information: allele counts and nonref flags.
409 typedef uint32_t PgenHeaderCtrl;
410 
411 void PreinitPgfi(PgenFileInfo* pgfip);
412 
413 // There are three modes of operation:
414 // 1. mmaped file.  Appropriate for handling multiple queries across different
415 //    parts of the genome in parallel.  Suboptimal for whole-genome queries.
416 //    Doesn't currently run on Windows.
417 // 2. fread block-load.  Block-load operations are single-threaded, while
418 //    decompression/counting is multithreaded.  Appropriate for whole-genome
419 //    queries, since even with a SSD, reading from multiple parts of a file
420 //    simultaneously doesn't work well.
421 // 3. fread single-variant-at-a-time.  Simpler interface than block-load, and
422 //    doesn't share its inability to handle multiple queries at a time, but
423 //    less performant for CPU-heavy operations on the whole genome.
424 //
425 // To specify mode 1, pass in use_mmap == 1 here.
426 // To specify mode 2, pass in use_mmap == 0 here, and use_blockload == 1 during
427 //   phase2.
428 // To specify mode 3, pass in use_mmap == 0 here, and use_blockload == 0 during
429 //   phase2.
430 //
431 // Update (7 Jan 2018): raw_variant_ct must be in [1, 2^31 - 3], and
432 //   raw_sample_ct must be in [1, 2^31 - 2].
433 PglErr PgfiInitPhase1(const char* fname, uint32_t raw_variant_ct, uint32_t raw_sample_ct, uint32_t use_mmap, PgenHeaderCtrl* header_ctrl_ptr, PgenFileInfo* pgfip, uintptr_t* pgfi_alloc_cacheline_ct_ptr, char* errstr_buf);
434 
435 // If allele_cts_already_loaded is set, but they're present in the file,
436 // they'll be validated; similarly for nonref_flags_already_loaded.
437 PglErr PgfiInitPhase2(PgenHeaderCtrl header_ctrl, uint32_t allele_cts_already_loaded, uint32_t nonref_flags_already_loaded, uint32_t use_blockload, uint32_t vblock_idx_start, uint32_t vidx_end, uint32_t* max_vrec_width_ptr, PgenFileInfo* pgfip, unsigned char* pgfi_alloc, uintptr_t* pgr_alloc_cacheline_ct_ptr, char* errstr_buf);
438 
439 
440 uint64_t PgfiMultireadGetCachelineReq(const uintptr_t* variant_include, const PgenFileInfo* pgfip, uint32_t variant_ct, uint32_t block_size);
441 
442 // variant_include can be nullptr; in that case, we simply load all the
443 // variants (load_variant_ct must be variant_uidx_end - variant_uidx_start).)
444 // IMPORTANT: pgfi.block_offset must be manually copied to each reader for now.
445 //   (todo: probably replace pgr.fi with a pointer.  when doing that, need to
446 //   ensure multiple per-variant readers still works.)
447 PglErr PgfiMultiread(const uintptr_t* variant_include, uint32_t variant_uidx_start, uint32_t variant_uidx_end, uint32_t load_variant_ct, PgenFileInfo* pgfip);
448 
449 
450 void PreinitPgr(PgenReader* pgr_ptr);
451 
452 // Before PgrInit() is called, the caller must obtain a block of
453 // pgr_alloc_cacheline_ct * 64 bytes (this value is returned by
454 // pgfi_init_phase2), 64-byte aligned; this is the pgr_alloc parameter.
455 //
456 // There's also a modal usage difference:
457 //
458 // * Modes 1-2 (mmap, block-fread): There is one PgenFileInfo per file
459 //   which doesn't belong to any reader.  After it's initialized, multiple
460 //   PgenReaders can be based off of it.  When the PgenFileInfo is
461 //   destroyed, those PgenReaders are invalidated and should be destroyed if
462 //   that hasn't already happened.
463 //
464 //   fname parameter must be nullptr.
465 //
466 // * Mode 3 (per-variant fread): Destruction of the original PgenFileInfo
467 //   struct does not invalidate any extant PgenReader instances (at least
468 //   from pgenlib_read's perspective).  Instead, destruction of the
469 //   corresponding memory block or allele_idx_offsets/nonref_flags invalidates
470 //   the associated PgenReaders.
471 //
472 //   The only difference between the first reader and later readers of the same
473 //   file is that the first reader steals the shared_ff used to read the
474 //   header.
475 //
476 //   fname parameter must be non-null.
477 
478 // max_vrec_width ignored when using mode 1 or 2.
479 PglErr PgrInit(const char* fname, uint32_t max_vrec_width, PgenFileInfo* pgfip, PgenReader* pgr_ptr, unsigned char* pgr_alloc);
480 
481 // practically all these functions require genovec to be allocated up to
482 // vector, not word, boundary
483 void PgrPlink1ToPlink2InplaceUnsafe(uint32_t sample_ct, uintptr_t* genovec);
484 
485 void PgrPlink2ToPlink1InplaceUnsafe(uint32_t sample_ct, uintptr_t* genovec);
486 
487 // Function names for the main reader functions were getting ridiculous.
488 // New naming scheme:
489 // * PgrGet() is the basic two-bit genovec loader.  All ALT alleles are treated
490 //   as equivalent.  (00 = hom ref, 01 = het ref, 10 = two alt alleles, 11 =
491 //   missing.)
492 // * PgrGetInv1() is similar, except that the allele index to treat as REF can
493 //   be changed.
494 // * PgrGet1() only counts the specified allele.  To minimize inversion costs,
495 //   GetInv1() should be called on major alleles and Get1() should be called on
496 //   minor ones.
497 // * PgrGetM() is the multiallelic loader which doesn't collapse multiple
498 //   alleles into one.  This retrieves a sparse form identical to what
499 //   PwcAppendMultiallelicSparse takes.
500 //   Multiallelic-dosage read functions (PgrReadRaw() included) will probably
501 //   fill a 3-part data structure of the following form:
502 //   1. Bitarray indicating which samples have at least one rarealt dosage.
503 //   2. unsigned char array where, if bits a, b, and c are the only set ones in
504 //      the first array, the first three elements of the second array are
505 //      rarealt dosage counts (1..255) for those three samples.  (Could also
506 //      put those in positions [a], [b], and [c], but that produces worse
507 //      memory access locality, and it makes sense to treat multiallelic
508 //      dosages as fundamentally sparse.)
509 //   3. Let R := MINV(255, allele_ct - 2).
510 //      a. Length-(sample_ct x R) array of AlleleCodes.
511 //      b. Length-(sample_ct x R) array of uint16_t dosage (or int16_t dphase)
512 //         values.
513 //      Again we use the sparse representation, with payload values packed at
514 //      the beginning.
515 //   (--indiv-sort algorithm: initialize an array of uintptr_ts of length
516 //   sample_ct where [k] has that sample's start index in the payload arrays.)
517 // * PgrGetDifflistOrGenovec() opportunistically returns the sparse genotype
518 //   representation ('difflist'), for functions capable of taking advantage of
519 //   it.  See SampleCountsThread() in plink2_misc for a usage example.
520 // * PgrGetCounts() is equivalent to calling PgrGet() and then counting the
521 //   number of 00s, 01s, 10s, and 11s, without the overhead of fully expanding
522 //   the compressed data, etc.
523 // * P suffix = also returns hardcall-phase information.
524 // * D suffix = also returns dosage information.
525 // * Dp suffix = also returns hardcall-phase, dosage and phased-dosage
526 //   information.
527 // * PgrGet2() and PgrGet2P() loads biallelic (possibly phased) hardcalls from
528 //   a possibly-multiallelic variant.  Any hardcall where either allele is not
529 //   one of the specified two alleles is set to missing.
530 //   There is no dosage-supporting version of this because rescaling sucks.
531 
532 // This will normally extract only the genotype indexes corresponding to set
533 // bits in sample_include.  Set sample_ct == raw_sample_ct if you don't want
534 // any subsetting to occur (in this case sample_include is ignored, can be
535 // nullptr).
536 // sample_ct cannot be zero.  Trailing bits of genovec are not zeroed out.
537 // Ok if genovec only has space for sample_ct values.
538 PglErr PgrGet(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec);
539 
540 // Loads the specified variant as a difflist if that's more efficient, setting
541 // difflist_common_geno to the common genotype value in that case.  Otherwise,
542 // genovec is populated and difflist_common_geno is set to UINT32_MAX.
543 //
544 // max_simple_difflist_len must be smaller than sample_ct.
545 //
546 // Note that the returned difflist_len can be much larger than
547 // max_simple_difflist_len when the variant is LD-encoded; it's bounded by
548 //   2 * (raw_sample_ct / kPglMaxDifflistLenDivisor).
549 PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr);
550 
551 // genocounts[0] = # hom ref, [1] = # het ref, [2] = two alts, [3] = missing
552 PglErr PgrGetCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts);
553 
554 // genocounts[0] = # of hardcalls with two copies of specified allele
555 // genocounts[1] = # of hardcalls with exactly one copy of specified allele
556 // genocounts[2] = # of hardcalls with no copies
557 // genocounts[3] = missing
558 PglErr PgrGetInv1Counts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts);
559 
560 PglErr IMPLPgrGet1(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_countvec);
561 
562 // Loads a nypvec with counts of a single allele (allele_idx 0 corresponds to
563 // the reference allele, allele_idx 1 corresponds to alt1, etc.).  0b11 ==
564 // missing call.
565 // Note that calling this with allele_idx == 0 is similar to a plink1 load
566 // (except with missing == 0b11, of course).
567 // todo: provide a difflist interface once anyone wants it.
PgrGet1(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_countvec)568 HEADER_INLINE PglErr PgrGet1(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec) {
569   PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
570   const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
571   return IMPLPgrGet1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_countvec);
572 }
573 
574 PglErr IMPLPgrGetInv1(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec);
575 
PgrGetInv1(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_invcountvec)576 HEADER_INLINE PglErr PgrGetInv1(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec) {
577   PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
578   const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
579   return IMPLPgrGetInv1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_invcountvec);
580 }
581 
582 PglErr IMPLPgrGet2(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReaderMain* pgrp, uintptr_t* __restrict genovec);
583 
PgrGet2(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx0,uint32_t allele_idx1,PgenReader * pgr_ptr,uintptr_t * __restrict genovec)584 HEADER_INLINE PglErr PgrGet2(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReader* pgr_ptr, uintptr_t* __restrict genovec) {
585   PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
586   const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
587   return IMPLPgrGet2(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx0, allele_idx1, pgrp, genovec);
588 }
589 
590 void PreinitPgv(PgenVariant* pgvp);
591 
592 PglErr PgrGetM(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp);
593 
594 // possible todo: add functions which directly support MAF-based queries.  Note
595 // that when the difflist representation is used, we can disqualify some
596 // low-MAF variants without actually loading the genotype data, since the size
597 // of the record puts an upper bound on the alt allele frequency.
598 
599 // requires trailing bits of genoarr to be zeroed out, AND does not update high
600 // bits of last word if raw_sample_ctl2 is odd.
601 void DetectGenoarrHetsHw(const uintptr_t*__restrict genoarr, uint32_t raw_sample_ctl2, Halfword* __restrict all_hets_hw);
602 
603 // requires trailing bits of genoarr to be zeroed out.
PgrDetectGenoarrHetsUnsafe(const uintptr_t * __restrict genoarr,uint32_t raw_sample_ctl2,uintptr_t * __restrict all_hets)604 HEADER_INLINE void PgrDetectGenoarrHetsUnsafe(const uintptr_t*__restrict genoarr, uint32_t raw_sample_ctl2, uintptr_t* __restrict all_hets) {
605   Halfword* all_hets_alias = R_CAST(Halfword*, all_hets);
606   DetectGenoarrHetsHw(genoarr, raw_sample_ctl2, all_hets_alias);
607   if (raw_sample_ctl2 % 2) {
608     all_hets_alias[raw_sample_ctl2] = 0;
609   }
610 }
611 
PgrDetectGenoarrHets(const uintptr_t * __restrict genoarr,uint32_t raw_sample_ct,uintptr_t * __restrict all_hets)612 HEADER_INLINE void PgrDetectGenoarrHets(const uintptr_t* __restrict genoarr, uint32_t raw_sample_ct, uintptr_t* __restrict all_hets) {
613   DetectGenoarrHetsHw(genoarr, NypCtToWordCt(raw_sample_ct), R_CAST(Halfword*, all_hets));
614   ZeroTrailingBits(raw_sample_ct, all_hets);
615 }
616 
617 // sample_ct > 0.  ok for trailing bits of genoarr to not be zeroed out.
618 void PgrDetectGenoarrHetsMultiallelic(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t raw_sample_ct, uintptr_t* __restrict all_hets);
619 
620 // cannot assume phaseinfo bit is clear when phasepresent is clear.
621 PglErr PgrGetP(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr);
622 
623 PglErr PgrGet1P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr);
624 
625 PglErr IMPLPgrGetInv1P(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr);
626 
PgrGetInv1P(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_invcountvec,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * __restrict phasepresent_ct_ptr)627 HEADER_INLINE PglErr PgrGetInv1P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
628   PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
629   const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
630   return IMPLPgrGetInv1P(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_invcountvec, phasepresent, phaseinfo, phasepresent_ct_ptr);
631 }
632 
633 PglErr PgrGet2P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr);
634 
635 PglErr PgrGetMP(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp);
636 
637 PglErr IMPLPgrGetD(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr);
638 
639 // if dosage_present and dosage_main are nullptr, dosage data is ignored
PgrGetD(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,uintptr_t * __restrict genovec,uintptr_t * __restrict dosage_present,uint16_t * dosage_main,uint32_t * dosage_ct_ptr)640 HEADER_INLINE PglErr PgrGetD(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr) {
641   PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
642   const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
643   return IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, genovec, dosage_present, dosage_main, dosage_ct_ptr);
644 }
645 
646 PglErr PgrGet1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr);
647 
648 PglErr PgrGetInv1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr);
649 
650 // When computing either form of imputation-r2, this function requires the
651 // variant to be biallelic; PgrGetMDCounts must be called in that multiallelic
652 // case.
653 // imp_r2_ptr must be non-null when is_minimac3_r2 is set.
654 PglErr PgrGetDCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t is_minimac3_r2, PgenReader* pgr_ptr, double* imp_r2_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* __restrict all_dosages);
655 
656 PglErr PgrGetMDCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t is_minimac3_r2, PgenReader* pgr_ptr, double* __restrict imp_r2_ptr, uint32_t* __restrict het_ctp, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* __restrict all_dosages);
657 
658 PglErr PgrGetMD(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp);
659 
660 PglErr IMPLPgrGetDp(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, PgenVariant* pgvp);
661 
662 // ok for both dosage_present and dosage_main to be nullptr when no dosage data
663 // is present
664 // ok for dphase_present/dphase_delta to be nullptr; dphase_ct always set to 0
665 // in that case
PgrGetDp(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,PgenVariant * pgvp)666 HEADER_INLINE PglErr PgrGetDp(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
667   PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
668   const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
669   return IMPLPgrGetDp(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, pgvp);
670 }
671 
672 // pgvp->genovec filled with inverse-counts for specified allele
673 PglErr PgrGetInv1Dp(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, PgenVariant* pgvp);
674 
675 PglErr PgrGetMDp(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp);
676 
677 // interface used by --make-pgen, just performs basic LD/difflist decompression
678 // to maximize parallelism
679 PglErr PgrGetRaw(uint32_t vidx, PgenGlobalFlags read_gflags, PgenReader* pgr_ptr, uintptr_t** loadbuf_iter_ptr, unsigned char* loaded_vrtype_ptr);
680 
681 PglErr PgrValidate(PgenReader* pgr_ptr, uintptr_t* genovec_buf, char* errstr_buf);
682 
683 // missingness bit is set iff hardcall is not present (even if dosage info *is*
684 // present)
685 PglErr PgrGetMissingness(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict missingness, uintptr_t* __restrict genovec_buf);
686 
687 // either missingness_hc (hardcall) or missingness_dosage must be non-null for
688 // now
689 // missingness_dosage must be vector-aligned
690 PglErr PgrGetMissingnessD(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict missingness_hc, uintptr_t* __restrict missingness_dosage, uintptr_t* __restrict hets, uintptr_t* __restrict genovec_buf);
691 
692 
693 // error-return iff reterr was success and was changed to kPglRetReadFail (i.e.
694 // an error message should be printed).
695 BoolErr CleanupPgfi(PgenFileInfo* pgfip, PglErr* reterrp);
696 
697 BoolErr CleanupPgr(PgenReader* pgr_ptr, PglErr* reterrp);
698 
699 #ifdef __cplusplus
700 }  // namespace plink2
701 #endif
702 
703 #endif  // __PGENLIB_READ_H__
704