1 #ifndef __PGENLIB_READ_H__
2 #define __PGENLIB_READ_H__
3
4 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
5 // Christopher Chang.
6 //
7 // This library is free software: you can redistribute it and/or modify it
8 // under the terms of the GNU Lesser General Public License as published by the
9 // Free Software Foundation; either version 3 of the License, or (at your
10 // option) any later version.
11 //
12 // This library is distributed in the hope that it will be useful, but WITHOUT
13 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
15 // for more details.
16 //
17 // You should have received a copy of the GNU Lesser General Public License
18 // along with this library. If not, see <http://www.gnu.org/licenses/>.
19
20
21 // pgenlib_read contains reader-specific code.
22
23 #include "pgenlib_misc.h"
24
25 #ifdef __cplusplus
26 namespace plink2 {
27 #endif
28
29 // mmap is a horrible idea for 32-bit builds, and as long as we have non-mmap
30 // code we may as well not worry about Win64 CreateFileMapping.
31
32 // also, OS X mmap implementation seems to be crappy for large sequentially
33 // accessed files, compared to Linux.
34
35 // possible todo: SIGBUS handling? do we ever want to try to recover from an
36 // I/O error?
37 #if defined(_WIN32) || !defined(__LP64__)
38 # define NO_MMAP
39 #endif
40
41 FLAGSET_DEF_START()
42 kfPgrLdcache0,
43 kfPgrLdcacheNyp = (1 << 0),
44 kfPgrLdcacheDifflist = (1 << 1),
45 kfPgrLdcacheRawNyp = (1 << 2),
46 // may also want RawDifflist
47 kfPgrLdcacheBasicGenocounts = (1 << 3)
48 FLAGSET_DEF_END(PgrLdcacheFlags);
49
50 // PgenFileInfo and PgenReader are the main exported "classes".
51 // Exported functions involving these data structure should all have
52 // "pgfi"/"pgr" in their names.
53
54 // Note that this can be default-copied.
55 typedef struct PgenFileInfoStruct {
56 // ----- Header information, constant after initialization -----
57 uint32_t raw_variant_ct;
58 uint32_t raw_sample_ct;
59
60 // 0 if variant records aren't all the same length.
61 // If they are (e.g. PLINK 1 encoding; or vrtype bits 0-5 unset), we just
62 // fseek to
63 // const_fpos_offset + const_vrec_width * ((uint64_t)variant_idx).
64 uint64_t const_fpos_offset;
65
66 uint32_t const_vrec_width;
67
68 // see below. positioned here instead of slightly later due to struct
69 // packing behavior.
70 uint32_t const_vrtype; // 256 for plink 1 encoding, UINT32_MAX for nonconst
71
72 // size (raw_variant_ct + 1), so that the number of bytes of (zero-based)
73 // variant n is var_fpos[n+1] - var_fpos[n]. nullptr if
74 // const_vrec_width is nonzero.
75 // It's not difficult to save some memory here (e.g. unless we're dealing
76 // with >256 TB files, it's trivial to go from 8 bytes down to 6 bytes per
77 // entry), but I doubt that's worth the trouble; let's worry about
78 // O(mn)-or-worse stuff, and on-disk stuff, first.
79 uint64_t* var_fpos;
80
81 // Variant record type codes.
82 // base pointer is null if mode is 0x01-0x04 (const_vrtype != UINT32_MAX).
83 // if not nullptr, required to be length >=
84 // max(raw_variant_ct + 1, RoundUpPow2(raw_variant_ct, kBytesPerWord))
85 unsigned char* vrtypes;
86
87 // alt allele counts.
88
89 // This can be nullptr if all alt allele counts are 1.
90 // (actually, we store the allele index offsets, so
91 // (allele_idx_offsets[n+1] - allele_idx_offsets[n]) is the number of alleles
92 // for variant n. Otherwise, we'd need another data structure to support
93 // fast allele name lookup.)
94 uintptr_t* allele_idx_offsets;
95
96 uintptr_t* nonref_flags;
97
98 // If pgr.nonref_flags is nullptr and kfPgenGlobalAllNonref is unset, all
99 // reference alleles are assumed to be correct.
100 PgenGlobalFlags gflags;
101
102 uint32_t max_allele_ct;
103 // uint32_t max_dosage_allele_ct; // might need this later
104
105 // * nullptr if using mmap
106 // * if using per-variant fread(), this is non-null during Pgen_file_info
107 // initialization, but it's then "moved" to the first Pgen_reader and set
108 // to nullptr.
109 FILE* shared_ff;
110
111 const unsigned char* block_base; // nullptr if using per-variant fread()
112 uint64_t block_offset; // 0 for mmap
113 #ifndef NO_MMAP
114 uint64_t file_size;
115 #endif
116 } PgenFileInfo;
117
118 typedef struct PgenReaderMainStruct {
119 NONCOPYABLE(PgenReaderMainStruct);
120 // would like to make this const, but that makes initialization really
121 // annoying in C99
122 struct PgenFileInfoStruct fi;
123
124 // ----- Mutable state -----
125 // If we don't fseek, what's the next variant we'd read? (Still relevant
126 // with mmap due to how LD decompression is implemented.)
127 uint32_t fp_vidx;
128
129 // ** per-variant fread()-only **
130 FILE* ff;
131 unsigned char* fread_buf;
132 // ** end per-variant fread()-only **
133
134 // if LD compression is present, cache the last non-LD-compressed variant
135 uint32_t ldbase_vidx;
136
137 // flags indicating which base_variant buffers are populated
138 PgrLdcacheFlags ldbase_stypes;
139
140 uint32_t ldbase_difflist_len;
141
142 // these should be treated as private after initial allocation.
143 // not currently guaranteed to have trailing zeroes.
144 uintptr_t* ldbase_raw_genovec; // now allocated even with no LD compression
145 uintptr_t* ldbase_genovec;
146 uintptr_t* ldbase_raregeno;
147
148 // when ldbase_difflist_ids[] is initialized, element [ldbase_difflist_len]
149 // must be set to sample_ct.
150 uint32_t* ldbase_difflist_sample_ids;
151
152 // common genotype can be looked up from vrtypes[]
153
154 STD_ARRAY_DECL(uint32_t, 4, ldbase_basic_genocounts);
155
156 // now only allocated if multiallelic variants, phase, and/or dosage present
157 // most commonly used for unsubsetted genovec; all_hets can be computed from
158 // this and patch_10_{set,vals}, and then aux2 can be interpreted.
159 // can also be used for other purposes after we're done processing aux2.
160 uintptr_t* workspace_vec;
161
162 // currently must hold (raw_sample_ct / kPglMaxDifflistLenDivisor)
163 // entries; may need to double the sizes later
164 // some top-level interface functions use these, so several lower-level
165 // functions cannot
166 uintptr_t* workspace_raregeno_vec;
167 uint32_t* workspace_difflist_sample_ids;
168
169 // must hold (raw_sample_ct / kPglMaxDifflistLenDivisor) entries
170 uintptr_t* workspace_raregeno_tmp_loadbuf;
171
172 uintptr_t* workspace_aux1x_present;
173 uint64_t* workspace_imp_r2; // needed in multiallelic case
174
175 uintptr_t* workspace_all_hets;
176 uintptr_t* workspace_subset; // currently used for hphase decoding
177
178 uintptr_t* workspace_dosage_present;
179 uintptr_t* workspace_dphase_present;
180
181 // phase set loading (mode 0x11) unimplemented for now; should be a sequence
182 // of (sample ID, [uint32_t phase set begin, set end), [set begin, set end),
183 // ...).
184 } PgenReaderMain;
185
186 typedef struct PgenReaderStruct {
187 #ifdef __cplusplus
GET_PRIVATE_mPgenReaderStruct188 PgenReaderMain& GET_PRIVATE_m() { return m; }
GET_PRIVATE_mPgenReaderStruct189 PgenReaderMain const& GET_PRIVATE_m() const { return m; }
190 private:
191 #endif
192 PgenReaderMain m;
193 } PgenReader;
194
195 CONSTI32(kPglVrtypePlink1, 256);
196
GetPgfiVrtype(const PgenFileInfo * pgfip,uint32_t vidx)197 HEADER_INLINE uint32_t GetPgfiVrtype(const PgenFileInfo* pgfip, uint32_t vidx) {
198 if (pgfip->vrtypes) {
199 return pgfip->vrtypes[vidx];
200 }
201 return pgfip->const_vrtype;
202 }
203
GetPgfiFpos(const PgenFileInfo * pgfip,uintptr_t vidx)204 HEADER_INLINE uint64_t GetPgfiFpos(const PgenFileInfo* pgfip, uintptr_t vidx) {
205 if (pgfip->var_fpos) {
206 return pgfip->var_fpos[vidx];
207 }
208 return pgfip->const_fpos_offset + pgfip->const_vrec_width * S_CAST(uint64_t, vidx);
209 }
210
GetPgfiVrecWidth(const PgenFileInfo * pgfip,uint32_t vidx)211 HEADER_INLINE uint32_t GetPgfiVrecWidth(const PgenFileInfo* pgfip, uint32_t vidx) {
212 if (pgfip->var_fpos) {
213 return pgfip->var_fpos[vidx + 1] - pgfip->var_fpos[vidx];
214 }
215 return pgfip->const_vrec_width;
216 }
217
PgfiIsSimpleFormat(const PgenFileInfo * pgfip)218 HEADER_INLINE uint32_t PgfiIsSimpleFormat(const PgenFileInfo* pgfip) {
219 return (pgfip->const_vrtype != UINT32_MAX);
220 }
221
VrtypeDifflist(uint32_t vrtype)222 HEADER_INLINE uint32_t VrtypeDifflist(uint32_t vrtype) {
223 return (vrtype & 4) && ((vrtype & 3) != 1);
224 }
225
VrtypeLdCompressed(uint32_t vrtype)226 HEADER_INLINE uint32_t VrtypeLdCompressed(uint32_t vrtype) {
227 return (vrtype & 6) == 2;
228 }
229
230 // Only checks for rarealt-containing hardcall. Multiallelic dosage may still
231 // be present when this returns zero.
VrtypeMultiallelicHc(uint32_t vrtype)232 HEADER_INLINE uint32_t VrtypeMultiallelicHc(uint32_t vrtype) {
233 return (vrtype & 8);
234 }
235
VrtypeHphase(uint32_t vrtype)236 HEADER_INLINE uint32_t VrtypeHphase(uint32_t vrtype) {
237 return (vrtype & 0x10);
238 }
239
VrtypeAuxTracksPresent(uint32_t vrtype)240 HEADER_INLINE uint32_t VrtypeAuxTracksPresent(uint32_t vrtype) {
241 return (vrtype & 0x78);
242 }
243
VrtypeVariableWidth(uint32_t vrtype)244 HEADER_INLINE uint32_t VrtypeVariableWidth(uint32_t vrtype) {
245 return (vrtype & 0x3e);
246 }
247
VrtypeDosage(uint32_t vrtype)248 HEADER_INLINE uint32_t VrtypeDosage(uint32_t vrtype) {
249 return (vrtype & 0x60);
250 }
251
252 static_assert(kPglMaxAltAlleleCt <= 254, "GetAux1xAlleleEntryByteCt() needs to be updated.");
GetAux1aAlleleEntryByteCt(uint32_t allele_ct,uint32_t rare01_ct)253 HEADER_INLINE uintptr_t GetAux1aAlleleEntryByteCt(uint32_t allele_ct, uint32_t rare01_ct) {
254 assert(allele_ct >= 3);
255 if (allele_ct == 3) {
256 return 0;
257 }
258 if (allele_ct == 4) {
259 return DivUp(rare01_ct, 8);
260 }
261 if (allele_ct <= 6) {
262 return DivUp(rare01_ct, 4);
263 }
264 if (allele_ct <= 18) {
265 return DivUp(rare01_ct, 2);
266 }
267 return rare01_ct;
268 }
269
PgrGetFreadBuf(PgenReader * pgr_ptr)270 HEADER_INLINE unsigned char* PgrGetFreadBuf(PgenReader* pgr_ptr) {
271 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
272 return pgrp->fread_buf;
273 }
274
PgrGetVrtypes(PgenReader * pgr_ptr)275 HEADER_INLINE unsigned char* PgrGetVrtypes(PgenReader* pgr_ptr) {
276 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
277 return pgrp->fi.vrtypes;
278 }
279
PgrGetNonrefFlags(PgenReader * pgr_ptr)280 HEADER_INLINE uintptr_t* PgrGetNonrefFlags(PgenReader* pgr_ptr) {
281 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
282 return pgrp->fi.nonref_flags;
283 }
284
PgrGetGflags(const PgenReader * pgr_ptr)285 HEADER_INLINE PgenGlobalFlags PgrGetGflags(const PgenReader* pgr_ptr) {
286 const PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
287 return pgrp->fi.gflags;
288 }
289
PgrGetMaxAlleleCt(const PgenReader * pgr_ptr)290 HEADER_INLINE uint32_t PgrGetMaxAlleleCt(const PgenReader* pgr_ptr) {
291 const PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
292 return pgrp->fi.max_allele_ct;
293 }
294
PgrSetFreadBuf(unsigned char * fread_buf,PgenReader * pgr_ptr)295 HEADER_INLINE void PgrSetFreadBuf(unsigned char* fread_buf, PgenReader* pgr_ptr) {
296 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
297 pgrp->fread_buf = fread_buf;
298 }
299
PgrCopyBaseAndOffset(const PgenFileInfo * pgfip,uint32_t thread_ct,PgenReader ** pgr_ptr_arr)300 HEADER_INLINE void PgrCopyBaseAndOffset(const PgenFileInfo* pgfip, uint32_t thread_ct, PgenReader** pgr_ptr_arr) {
301 for (uint32_t tidx = 0; tidx != thread_ct; ++tidx) {
302 PgenReaderMain* pgrp = &GET_PRIVATE(*(pgr_ptr_arr[tidx]), m);
303 pgrp->fi.block_base = pgfip->block_base;
304 pgrp->fi.block_offset = pgfip->block_offset;
305 }
306 }
307
308 // This is necessary when changing sample_include, unless the new query is
309 // iterating from the first variant. (Which can almost never be assumed in
310 // plink2 since variant_include[] may not include the first variant.)
PgrClearLdCache(PgenReader * pgr_ptr)311 HEADER_INLINE void PgrClearLdCache(PgenReader* pgr_ptr) {
312 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
313 pgrp->ldbase_stypes &= kfPgrLdcacheRawNyp;
314
315 // bugfix, LdLoadNecessary() was otherwise claiming that reload wasn't
316 // necessary in certain cases
317 pgrp->ldbase_vidx = 0x80000000U;
318 }
319
320 // Design change (30 Nov 2019): It is easy to forget to call PgrClearLdCache
321 // when changing sample_include. However, each sample_include change must be
322 // accompanied by a sample_include_cumulative_popcounts update. So, if we
323 // define a sample_include_cumulative_popcounts wrapper-type which can only be
324 // initialized by a function that also clears a PgenReader LD cache, and modify
325 // all PgrGet... functions to require this wrapper-type, the frequency of
326 // foot-shooting should go down.
327 //
328 // The key usage rule is: only use this as a local variable type, and define
329 // only one of these per function (unless you're using multiple PgenReaders
330 // simultaneously, anyway). If you're changing the sample-subset when entering
331 // and exiting chrY, call PgrSetSampleSubsetIndex on your single
332 // PgrSampleSubsetIndex at the time you're crossing a chrY boundary. Don't
333 // define two preinitialized PgrSetSampleSubsetIndexes...
334 // (possible todo: if compiling as C++ and NDEBUG isn't defined, add a counter
335 // field to PgenReader which is initialized to zero, asserted to be zero and
336 // then incremented by PgrSetSampleSubsetIndex, and decremented by the
337 // PgrSampleSubsetIndex destructor.)
338 typedef struct PgrSampleSubsetIndexStruct {
339 #ifdef __cplusplus
GET_PRIVATE_cumulative_popcountsPgrSampleSubsetIndexStruct340 const uint32_t*& GET_PRIVATE_cumulative_popcounts() { return cumulative_popcounts; }
GET_PRIVATE_cumulative_popcountsPgrSampleSubsetIndexStruct341 const uint32_t* const& GET_PRIVATE_cumulative_popcounts() const { return cumulative_popcounts; }
342 private:
343 #endif
344 const uint32_t* cumulative_popcounts;
345 } PgrSampleSubsetIndex;
346
PgrSetSampleSubsetIndex(const uint32_t * sample_include_cumulative_popcounts,PgenReader * pgr_ptr,PgrSampleSubsetIndex * pssi_ptr)347 HEADER_INLINE void PgrSetSampleSubsetIndex(const uint32_t* sample_include_cumulative_popcounts, PgenReader* pgr_ptr, PgrSampleSubsetIndex* pssi_ptr) {
348 GET_PRIVATE(*pssi_ptr, cumulative_popcounts) = sample_include_cumulative_popcounts;
349 PgrClearLdCache(pgr_ptr);
350 }
351
PgrClearSampleSubsetIndex(PgenReader * pgr_ptr,PgrSampleSubsetIndex * pssi_ptr)352 HEADER_INLINE void PgrClearSampleSubsetIndex(PgenReader* pgr_ptr, PgrSampleSubsetIndex* pssi_ptr) {
353 GET_PRIVATE(*pssi_ptr, cumulative_popcounts) = nullptr;
354 if (pgr_ptr) {
355 PgrClearLdCache(pgr_ptr);
356 }
357 }
358
PgrSetBaseAndOffset0(unsigned char * block_base,uint32_t thread_ct,PgenReader ** pgr_ptr_arr)359 HEADER_INLINE void PgrSetBaseAndOffset0(unsigned char* block_base, uint32_t thread_ct, PgenReader** pgr_ptr_arr) {
360 for (uint32_t tidx = 0; tidx != thread_ct; ++tidx) {
361 PgenReader* pgr_ptr = pgr_ptr_arr[tidx];
362 PgrClearLdCache(pgr_ptr);
363 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
364 pgrp->fi.block_base = block_base;
365 pgrp->fi.block_offset = 0;
366 }
367 }
368
369 // PgenFileInfo initialization is split into two phases, to decouple
370 // plink2's arena allocator from this library. (obvious todo: provide a simple
371 // malloc-using PgenReader constructor for anyone who doesn't want to worry
372 // about these details.)
373 //
374 // Phase 1: Open the .pgen; verify that the initial bytes are consistent with
375 // the file format; load/verify sample and variant counts, initialize
376 // pgfi.const_vrtype, pgfi.const_vrec_width, and pgfi.const_fpos_offset;
377 // determine initial memory allocation requirement. first_alloc_cacheline_ct
378 // does not include allele counts and nonref flags, since it may be more
379 // appropriate to allocate those arrays earlier (during loading of a
380 // .bim-like file).
381 //
382 // pgfi.var_fpos is set to nullptr if pgfi.const_vrec_width is nonzero.
383 // pgfi.vrtypes/var_allele_cts are set to nullptr in the plink1-format case.
384 //
385 // raw_sample_ct and raw_variant_ct should be UINT32_MAX if not previously
386 // known.
387 //
388 // Intermission: Caller obtains a block of pgfi_alloc_cacheline_ct * 64 bytes,
389 // 64-byte aligned. The cachealigned_malloc() function can be used for this
390 // purpose. If necessary, pgfi.allele_idx_offsets and pgfi.nonref_flags
391 // should be pointed at already-loaded data, or allocated so they can be
392 // loaded during phase 2.
393 //
394 // Phase 2: Initialize most pointers in the PgenReader struct to appropriate
395 // positions in first_alloc. For modes 0x10-0x11, load pgfi.var_fpos and
396 // pgfi.vrtypes, load/validate pgfi.allele_idx_offsets and pgfi.nonref_flags
397 // if appropriate, and initialize pgfi.gflags, pgfi.max_allele_ct, and
398 // pgfi.max_dosage_allele_ct.
399 //
400 // Finally, if block-fread mode is being used, pgfi.block_base must be
401 // initialized to point to a memory large enough to handle the largest
402 // pgfi_block_read() operation that will be attempted.
403 // pgfi_blockload_get_cacheline_req() can be used to determine the necessary
404 // buffer size.
405
406 // This type may change if we introduce a more read-optimized format in the
407 // future. Right now it just tracks the presence/absence of two optional
408 // pieces of information: allele counts and nonref flags.
409 typedef uint32_t PgenHeaderCtrl;
410
411 void PreinitPgfi(PgenFileInfo* pgfip);
412
413 // There are three modes of operation:
414 // 1. mmaped file. Appropriate for handling multiple queries across different
415 // parts of the genome in parallel. Suboptimal for whole-genome queries.
416 // Doesn't currently run on Windows.
417 // 2. fread block-load. Block-load operations are single-threaded, while
418 // decompression/counting is multithreaded. Appropriate for whole-genome
419 // queries, since even with a SSD, reading from multiple parts of a file
420 // simultaneously doesn't work well.
421 // 3. fread single-variant-at-a-time. Simpler interface than block-load, and
422 // doesn't share its inability to handle multiple queries at a time, but
423 // less performant for CPU-heavy operations on the whole genome.
424 //
425 // To specify mode 1, pass in use_mmap == 1 here.
426 // To specify mode 2, pass in use_mmap == 0 here, and use_blockload == 1 during
427 // phase2.
428 // To specify mode 3, pass in use_mmap == 0 here, and use_blockload == 0 during
429 // phase2.
430 //
431 // Update (7 Jan 2018): raw_variant_ct must be in [1, 2^31 - 3], and
432 // raw_sample_ct must be in [1, 2^31 - 2].
433 PglErr PgfiInitPhase1(const char* fname, uint32_t raw_variant_ct, uint32_t raw_sample_ct, uint32_t use_mmap, PgenHeaderCtrl* header_ctrl_ptr, PgenFileInfo* pgfip, uintptr_t* pgfi_alloc_cacheline_ct_ptr, char* errstr_buf);
434
435 // If allele_cts_already_loaded is set, but they're present in the file,
436 // they'll be validated; similarly for nonref_flags_already_loaded.
437 PglErr PgfiInitPhase2(PgenHeaderCtrl header_ctrl, uint32_t allele_cts_already_loaded, uint32_t nonref_flags_already_loaded, uint32_t use_blockload, uint32_t vblock_idx_start, uint32_t vidx_end, uint32_t* max_vrec_width_ptr, PgenFileInfo* pgfip, unsigned char* pgfi_alloc, uintptr_t* pgr_alloc_cacheline_ct_ptr, char* errstr_buf);
438
439
440 uint64_t PgfiMultireadGetCachelineReq(const uintptr_t* variant_include, const PgenFileInfo* pgfip, uint32_t variant_ct, uint32_t block_size);
441
442 // variant_include can be nullptr; in that case, we simply load all the
443 // variants (load_variant_ct must be variant_uidx_end - variant_uidx_start).)
444 // IMPORTANT: pgfi.block_offset must be manually copied to each reader for now.
445 // (todo: probably replace pgr.fi with a pointer. when doing that, need to
446 // ensure multiple per-variant readers still works.)
447 PglErr PgfiMultiread(const uintptr_t* variant_include, uint32_t variant_uidx_start, uint32_t variant_uidx_end, uint32_t load_variant_ct, PgenFileInfo* pgfip);
448
449
450 void PreinitPgr(PgenReader* pgr_ptr);
451
452 // Before PgrInit() is called, the caller must obtain a block of
453 // pgr_alloc_cacheline_ct * 64 bytes (this value is returned by
454 // pgfi_init_phase2), 64-byte aligned; this is the pgr_alloc parameter.
455 //
456 // There's also a modal usage difference:
457 //
458 // * Modes 1-2 (mmap, block-fread): There is one PgenFileInfo per file
459 // which doesn't belong to any reader. After it's initialized, multiple
460 // PgenReaders can be based off of it. When the PgenFileInfo is
461 // destroyed, those PgenReaders are invalidated and should be destroyed if
462 // that hasn't already happened.
463 //
464 // fname parameter must be nullptr.
465 //
466 // * Mode 3 (per-variant fread): Destruction of the original PgenFileInfo
467 // struct does not invalidate any extant PgenReader instances (at least
468 // from pgenlib_read's perspective). Instead, destruction of the
469 // corresponding memory block or allele_idx_offsets/nonref_flags invalidates
470 // the associated PgenReaders.
471 //
472 // The only difference between the first reader and later readers of the same
473 // file is that the first reader steals the shared_ff used to read the
474 // header.
475 //
476 // fname parameter must be non-null.
477
478 // max_vrec_width ignored when using mode 1 or 2.
479 PglErr PgrInit(const char* fname, uint32_t max_vrec_width, PgenFileInfo* pgfip, PgenReader* pgr_ptr, unsigned char* pgr_alloc);
480
481 // practically all these functions require genovec to be allocated up to
482 // vector, not word, boundary
483 void PgrPlink1ToPlink2InplaceUnsafe(uint32_t sample_ct, uintptr_t* genovec);
484
485 void PgrPlink2ToPlink1InplaceUnsafe(uint32_t sample_ct, uintptr_t* genovec);
486
487 // Function names for the main reader functions were getting ridiculous.
488 // New naming scheme:
489 // * PgrGet() is the basic two-bit genovec loader. All ALT alleles are treated
490 // as equivalent. (00 = hom ref, 01 = het ref, 10 = two alt alleles, 11 =
491 // missing.)
492 // * PgrGetInv1() is similar, except that the allele index to treat as REF can
493 // be changed.
494 // * PgrGet1() only counts the specified allele. To minimize inversion costs,
495 // GetInv1() should be called on major alleles and Get1() should be called on
496 // minor ones.
497 // * PgrGetM() is the multiallelic loader which doesn't collapse multiple
498 // alleles into one. This retrieves a sparse form identical to what
499 // PwcAppendMultiallelicSparse takes.
500 // Multiallelic-dosage read functions (PgrReadRaw() included) will probably
501 // fill a 3-part data structure of the following form:
502 // 1. Bitarray indicating which samples have at least one rarealt dosage.
503 // 2. unsigned char array where, if bits a, b, and c are the only set ones in
504 // the first array, the first three elements of the second array are
505 // rarealt dosage counts (1..255) for those three samples. (Could also
506 // put those in positions [a], [b], and [c], but that produces worse
507 // memory access locality, and it makes sense to treat multiallelic
508 // dosages as fundamentally sparse.)
509 // 3. Let R := MINV(255, allele_ct - 2).
510 // a. Length-(sample_ct x R) array of AlleleCodes.
511 // b. Length-(sample_ct x R) array of uint16_t dosage (or int16_t dphase)
512 // values.
513 // Again we use the sparse representation, with payload values packed at
514 // the beginning.
515 // (--indiv-sort algorithm: initialize an array of uintptr_ts of length
516 // sample_ct where [k] has that sample's start index in the payload arrays.)
517 // * PgrGetDifflistOrGenovec() opportunistically returns the sparse genotype
518 // representation ('difflist'), for functions capable of taking advantage of
519 // it. See SampleCountsThread() in plink2_misc for a usage example.
520 // * PgrGetCounts() is equivalent to calling PgrGet() and then counting the
521 // number of 00s, 01s, 10s, and 11s, without the overhead of fully expanding
522 // the compressed data, etc.
523 // * P suffix = also returns hardcall-phase information.
524 // * D suffix = also returns dosage information.
525 // * Dp suffix = also returns hardcall-phase, dosage and phased-dosage
526 // information.
527 // * PgrGet2() and PgrGet2P() loads biallelic (possibly phased) hardcalls from
528 // a possibly-multiallelic variant. Any hardcall where either allele is not
529 // one of the specified two alleles is set to missing.
530 // There is no dosage-supporting version of this because rescaling sucks.
531
532 // This will normally extract only the genotype indexes corresponding to set
533 // bits in sample_include. Set sample_ct == raw_sample_ct if you don't want
534 // any subsetting to occur (in this case sample_include is ignored, can be
535 // nullptr).
536 // sample_ct cannot be zero. Trailing bits of genovec are not zeroed out.
537 // Ok if genovec only has space for sample_ct values.
538 PglErr PgrGet(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec);
539
540 // Loads the specified variant as a difflist if that's more efficient, setting
541 // difflist_common_geno to the common genotype value in that case. Otherwise,
542 // genovec is populated and difflist_common_geno is set to UINT32_MAX.
543 //
544 // max_simple_difflist_len must be smaller than sample_ct.
545 //
546 // Note that the returned difflist_len can be much larger than
547 // max_simple_difflist_len when the variant is LD-encoded; it's bounded by
548 // 2 * (raw_sample_ct / kPglMaxDifflistLenDivisor).
549 PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr);
550
551 // genocounts[0] = # hom ref, [1] = # het ref, [2] = two alts, [3] = missing
552 PglErr PgrGetCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts);
553
554 // genocounts[0] = # of hardcalls with two copies of specified allele
555 // genocounts[1] = # of hardcalls with exactly one copy of specified allele
556 // genocounts[2] = # of hardcalls with no copies
557 // genocounts[3] = missing
558 PglErr PgrGetInv1Counts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts);
559
560 PglErr IMPLPgrGet1(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_countvec);
561
562 // Loads a nypvec with counts of a single allele (allele_idx 0 corresponds to
563 // the reference allele, allele_idx 1 corresponds to alt1, etc.). 0b11 ==
564 // missing call.
565 // Note that calling this with allele_idx == 0 is similar to a plink1 load
566 // (except with missing == 0b11, of course).
567 // todo: provide a difflist interface once anyone wants it.
PgrGet1(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_countvec)568 HEADER_INLINE PglErr PgrGet1(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec) {
569 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
570 const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
571 return IMPLPgrGet1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_countvec);
572 }
573
574 PglErr IMPLPgrGetInv1(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec);
575
PgrGetInv1(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_invcountvec)576 HEADER_INLINE PglErr PgrGetInv1(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec) {
577 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
578 const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
579 return IMPLPgrGetInv1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_invcountvec);
580 }
581
582 PglErr IMPLPgrGet2(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReaderMain* pgrp, uintptr_t* __restrict genovec);
583
PgrGet2(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx0,uint32_t allele_idx1,PgenReader * pgr_ptr,uintptr_t * __restrict genovec)584 HEADER_INLINE PglErr PgrGet2(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReader* pgr_ptr, uintptr_t* __restrict genovec) {
585 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
586 const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
587 return IMPLPgrGet2(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx0, allele_idx1, pgrp, genovec);
588 }
589
590 void PreinitPgv(PgenVariant* pgvp);
591
592 PglErr PgrGetM(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp);
593
594 // possible todo: add functions which directly support MAF-based queries. Note
595 // that when the difflist representation is used, we can disqualify some
596 // low-MAF variants without actually loading the genotype data, since the size
597 // of the record puts an upper bound on the alt allele frequency.
598
599 // requires trailing bits of genoarr to be zeroed out, AND does not update high
600 // bits of last word if raw_sample_ctl2 is odd.
601 void DetectGenoarrHetsHw(const uintptr_t*__restrict genoarr, uint32_t raw_sample_ctl2, Halfword* __restrict all_hets_hw);
602
603 // requires trailing bits of genoarr to be zeroed out.
PgrDetectGenoarrHetsUnsafe(const uintptr_t * __restrict genoarr,uint32_t raw_sample_ctl2,uintptr_t * __restrict all_hets)604 HEADER_INLINE void PgrDetectGenoarrHetsUnsafe(const uintptr_t*__restrict genoarr, uint32_t raw_sample_ctl2, uintptr_t* __restrict all_hets) {
605 Halfword* all_hets_alias = R_CAST(Halfword*, all_hets);
606 DetectGenoarrHetsHw(genoarr, raw_sample_ctl2, all_hets_alias);
607 if (raw_sample_ctl2 % 2) {
608 all_hets_alias[raw_sample_ctl2] = 0;
609 }
610 }
611
PgrDetectGenoarrHets(const uintptr_t * __restrict genoarr,uint32_t raw_sample_ct,uintptr_t * __restrict all_hets)612 HEADER_INLINE void PgrDetectGenoarrHets(const uintptr_t* __restrict genoarr, uint32_t raw_sample_ct, uintptr_t* __restrict all_hets) {
613 DetectGenoarrHetsHw(genoarr, NypCtToWordCt(raw_sample_ct), R_CAST(Halfword*, all_hets));
614 ZeroTrailingBits(raw_sample_ct, all_hets);
615 }
616
617 // sample_ct > 0. ok for trailing bits of genoarr to not be zeroed out.
618 void PgrDetectGenoarrHetsMultiallelic(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t raw_sample_ct, uintptr_t* __restrict all_hets);
619
620 // cannot assume phaseinfo bit is clear when phasepresent is clear.
621 PglErr PgrGetP(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr);
622
623 PglErr PgrGet1P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr);
624
625 PglErr IMPLPgrGetInv1P(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr);
626
PgrGetInv1P(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_invcountvec,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * __restrict phasepresent_ct_ptr)627 HEADER_INLINE PglErr PgrGetInv1P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
628 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
629 const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
630 return IMPLPgrGetInv1P(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_invcountvec, phasepresent, phaseinfo, phasepresent_ct_ptr);
631 }
632
633 PglErr PgrGet2P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr);
634
635 PglErr PgrGetMP(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp);
636
637 PglErr IMPLPgrGetD(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr);
638
639 // if dosage_present and dosage_main are nullptr, dosage data is ignored
PgrGetD(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,uintptr_t * __restrict genovec,uintptr_t * __restrict dosage_present,uint16_t * dosage_main,uint32_t * dosage_ct_ptr)640 HEADER_INLINE PglErr PgrGetD(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr) {
641 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
642 const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
643 return IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, genovec, dosage_present, dosage_main, dosage_ct_ptr);
644 }
645
646 PglErr PgrGet1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr);
647
648 PglErr PgrGetInv1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr);
649
650 // When computing either form of imputation-r2, this function requires the
651 // variant to be biallelic; PgrGetMDCounts must be called in that multiallelic
652 // case.
653 // imp_r2_ptr must be non-null when is_minimac3_r2 is set.
654 PglErr PgrGetDCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t is_minimac3_r2, PgenReader* pgr_ptr, double* imp_r2_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* __restrict all_dosages);
655
656 PglErr PgrGetMDCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t is_minimac3_r2, PgenReader* pgr_ptr, double* __restrict imp_r2_ptr, uint32_t* __restrict het_ctp, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* __restrict all_dosages);
657
658 PglErr PgrGetMD(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp);
659
660 PglErr IMPLPgrGetDp(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, PgenVariant* pgvp);
661
662 // ok for both dosage_present and dosage_main to be nullptr when no dosage data
663 // is present
664 // ok for dphase_present/dphase_delta to be nullptr; dphase_ct always set to 0
665 // in that case
PgrGetDp(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,PgenVariant * pgvp)666 HEADER_INLINE PglErr PgrGetDp(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
667 PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
668 const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
669 return IMPLPgrGetDp(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, pgvp);
670 }
671
672 // pgvp->genovec filled with inverse-counts for specified allele
673 PglErr PgrGetInv1Dp(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, PgenVariant* pgvp);
674
675 PglErr PgrGetMDp(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp);
676
677 // interface used by --make-pgen, just performs basic LD/difflist decompression
678 // to maximize parallelism
679 PglErr PgrGetRaw(uint32_t vidx, PgenGlobalFlags read_gflags, PgenReader* pgr_ptr, uintptr_t** loadbuf_iter_ptr, unsigned char* loaded_vrtype_ptr);
680
681 PglErr PgrValidate(PgenReader* pgr_ptr, uintptr_t* genovec_buf, char* errstr_buf);
682
683 // missingness bit is set iff hardcall is not present (even if dosage info *is*
684 // present)
685 PglErr PgrGetMissingness(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict missingness, uintptr_t* __restrict genovec_buf);
686
687 // either missingness_hc (hardcall) or missingness_dosage must be non-null for
688 // now
689 // missingness_dosage must be vector-aligned
690 PglErr PgrGetMissingnessD(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict missingness_hc, uintptr_t* __restrict missingness_dosage, uintptr_t* __restrict hets, uintptr_t* __restrict genovec_buf);
691
692
693 // error-return iff reterr was success and was changed to kPglRetReadFail (i.e.
694 // an error message should be printed).
695 BoolErr CleanupPgfi(PgenFileInfo* pgfip, PglErr* reterrp);
696
697 BoolErr CleanupPgr(PgenReader* pgr_ptr, PglErr* reterrp);
698
699 #ifdef __cplusplus
700 } // namespace plink2
701 #endif
702
703 #endif // __PGENLIB_READ_H__
704