1 // This file is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License as published by the Free
6 // Software Foundation, either version 3 of the License, or (at your option)
7 // any later version.
8 //
9 // This program is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 // more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "plink2_compress_stream.h"
19 #include "plink2_matrix.h"
20 #include "plink2_matrix_calc.h"
21 #include "plink2_random.h"
22
23 #ifdef __cplusplus
24 namespace plink2 {
25 #endif
26
InitScore(ScoreInfo * score_info_ptr)27 void InitScore(ScoreInfo* score_info_ptr) {
28 score_info_ptr->flags = kfScore0;
29 score_info_ptr->varid_col_p1 = 1;
30 score_info_ptr->allele_col_p1 = 0; // defensive
31 score_info_ptr->input_fname = nullptr;
32 InitRangeList(&(score_info_ptr->input_col_idx_range_list));
33
34 score_info_ptr->qsr_range_fname = nullptr;
35 score_info_ptr->qsr_data_fname = nullptr;
36 score_info_ptr->qsr_varid_col_p1 = 1;
37 score_info_ptr->qsr_val_col_p1 = 0; // defensive
38 }
39
CleanupScore(ScoreInfo * score_info_ptr)40 void CleanupScore(ScoreInfo* score_info_ptr) {
41 free_cond(score_info_ptr->input_fname);
42 CleanupRangeList(&(score_info_ptr->input_col_idx_range_list));
43
44 free_cond(score_info_ptr->qsr_range_fname);
45 free_cond(score_info_ptr->qsr_data_fname);
46 }
47
48
TriangleDivide(int64_t cur_prod_x2,int32_t modif)49 uint32_t TriangleDivide(int64_t cur_prod_x2, int32_t modif) {
50 // return smallest integer vv for which (vv * (vv + modif)) is no smaller
51 // than cur_prod_x2, and neither term in the product is negative.
52 int64_t vv;
53 if (cur_prod_x2 == 0) {
54 if (modif < 0) {
55 return -modif;
56 }
57 return 0;
58 }
59 vv = S_CAST(int64_t, sqrt(S_CAST(double, cur_prod_x2)));
60 while ((vv - 1) * (vv + modif - 1) >= cur_prod_x2) {
61 vv--;
62 }
63 while (vv * (vv + modif) < cur_prod_x2) {
64 vv++;
65 }
66 return vv;
67 }
68
ParallelBounds(uint32_t ct,int32_t start,uint32_t parallel_idx,uint32_t parallel_tot,int32_t * __restrict bound_start_ptr,int32_t * __restrict bound_end_ptr)69 void ParallelBounds(uint32_t ct, int32_t start, uint32_t parallel_idx, uint32_t parallel_tot, int32_t* __restrict bound_start_ptr, int32_t* __restrict bound_end_ptr) {
70 int32_t modif = 1 - start * 2;
71 int64_t ct_tot = S_CAST(int64_t, ct) * (ct + modif);
72 *bound_start_ptr = TriangleDivide((ct_tot * parallel_idx) / parallel_tot, modif);
73 *bound_end_ptr = TriangleDivide((ct_tot * (parallel_idx + 1)) / parallel_tot, modif);
74 }
75
// Fills target_arr[0..piece_ct] with row boundaries that split this
// parallel-job's share of a triangular computation (as determined by
// ParallelBounds()) into piece_ct chunks of roughly equal triangular area.
// Interior boundaries are rounded so the cumulative cell counts stay
// divisible by align; set align to 1 for no alignment.
void TriangleFill(uint32_t ct, uint32_t piece_ct, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t start, uint32_t align, uint32_t* target_arr) {
  int32_t modif = 1 - start * 2;
  int64_t cur_prod_x2;
  int32_t lbound;
  int32_t ubound;
  uint32_t uii;
  uint32_t align_m1;
  ParallelBounds(ct, start, parallel_idx, parallel_tot, &lbound, &ubound);
  // x(x+1)/2 is divisible by y iff (x % (2y)) is 0 or (2y - 1).
  align *= 2;
  align_m1 = align - 1;
  target_arr[0] = lbound;
  target_arr[piece_ct] = ubound;
  // cur_prod_x2 tracks twice the number of cells below the current boundary;
  // ct_tr is the (doubled) per-piece cell quota.
  cur_prod_x2 = S_CAST(int64_t, lbound) * (lbound + modif);
  const int64_t ct_tr = (S_CAST(int64_t, ubound) * (ubound + modif) - cur_prod_x2) / piece_ct;
  for (uint32_t piece_idx = 1; piece_idx != piece_ct; ++piece_idx) {
    cur_prod_x2 += ct_tr;
    lbound = TriangleDivide(cur_prod_x2, modif);
    // Round the boundary up to the next align-compatible row (see the
    // divisibility identity above).
    uii = (lbound - S_CAST(int32_t, start)) & align_m1;
    if ((uii) && (uii != align_m1)) {
      lbound = start + ((lbound - S_CAST(int32_t, start)) | align_m1);
    }
    // lack of this check caused a nasty bug earlier
    if (S_CAST(uint32_t, lbound) > ct) {
      lbound = ct;
    }
    target_arr[piece_idx] = lbound;
  }
}
106
// Returns the number of passes needed to process rows
// [start_idx, end_idx) of a triangular matrix (upper-left triangle omitted
// when is_no_diag is set) when at most cells_avail cells can be buffered per
// pass.  Returns 0 if cells_avail is insufficient (i.e. smaller than the
// longest single row).
uint32_t CountTrianglePasses(uintptr_t start_idx, uintptr_t end_idx, uintptr_t is_no_diag, uintptr_t cells_avail) {
  start_idx -= is_no_diag;
  end_idx -= is_no_diag;
  if (cells_avail < end_idx) {
    return 0;
  }
  cells_avail *= 2;  // don't want to worry about /2 in triangular numbers
  const uint64_t end_tri = S_CAST(uint64_t, end_idx) * (end_idx + 1);
  uint64_t start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
  for (uint32_t pass_ct = 1; ; ++pass_ct) {
    const uint64_t delta_tri = end_tri - start_tri;
    if (delta_tri <= cells_avail) {
      return pass_ct;
    }
    // Advance to the largest row whose doubled triangular number does not
    // exceed the current pass's quota; the sqrt estimate may be one row too
    // high, so correct downward if needed.
    const uint64_t next_target = start_tri + cells_avail;
    start_idx = S_CAST(int64_t, sqrt(u63tod(next_target)));
    start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
    if (start_tri > next_target) {
      --start_idx;
      start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
      assert(start_tri <= next_target);
    }
  }
}
132
// Returns the (exclusive) end row of the pass beginning at start_idx, i.e.
// the largest row bound such that the rows' cells fit in cells_avail, capped
// at grand_end_idx.  Companion to CountTrianglePasses(); uses the same
// doubled-triangular-number arithmetic and sqrt estimate.
uint64_t NextTrianglePass(uintptr_t start_idx, uintptr_t grand_end_idx, uintptr_t is_no_diag, uintptr_t cells_avail) {
  cells_avail *= 2;
  start_idx -= is_no_diag;
  grand_end_idx -= is_no_diag;
  const uint64_t end_tri = S_CAST(uint64_t, grand_end_idx) * (grand_end_idx + 1);
  uint64_t start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
  const uint64_t delta_tri = end_tri - start_tri;
  if (delta_tri <= cells_avail) {
    // Everything remaining fits in one pass.
    return grand_end_idx + is_no_diag;
  }
  const uint64_t next_target = start_tri + cells_avail;
  start_idx = S_CAST(int64_t, sqrt(u63tod(next_target)));
  start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
  // The final subtraction corrects the sqrt estimate when it overshot by one.
  return start_idx + is_no_diag - (start_tri > next_target);
}
148
// Fills target_arr[0..piece_ct] with row boundaries dividing rows
// [start_idx, end_idx) of a triangular matrix into piece_ct chunks of
// roughly equal cell count (is_no_diag shifts the triangle when the diagonal
// is excluded).
void TriangleLoadBalance(uint32_t piece_ct, uintptr_t start_idx, uintptr_t end_idx, uint32_t is_no_diag, uint32_t* target_arr) {
  target_arr[0] = start_idx;
  target_arr[piece_ct] = end_idx;
  start_idx -= is_no_diag;
  end_idx -= is_no_diag;
  const uint64_t end_tri = S_CAST(uint64_t, end_idx) * (end_idx + 1);
  uint64_t cur_target = S_CAST(uint64_t, start_idx) * (start_idx + 1);
  const uint64_t std_size = (end_tri - cur_target) / piece_ct;
  for (uint32_t piece_idx = 1; piece_idx != piece_ct; ++piece_idx) {
    // don't use cur_target = start_tri + (piece_idx * delta_tri) / piece_ct
    // because of potential overflow
    cur_target += std_size;
    start_idx = S_CAST(int64_t, sqrt(u63tod(cur_target)));
    const uint64_t start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
    if (start_tri > cur_target) {
      // sqrt estimate overshot by one row.
      --start_idx;
    }
    target_arr[piece_idx] = start_idx + is_no_diag;
  }
}
169
// Removes samples until no "constraint" bit remains in kinship_table (a
// square bitmatrix with orig_sample_ctl words per row; bit (i,j) set when
// samples i and j form a constrained pair).  Greedy heuristic: while edges
// remain, prefer to delete the partner of a degree-one vertex; otherwise
// delete a maximal-degree vertex.  kinship_table is clobbered in the process
// ("destructive").  sample_include and *sample_ct_ptr are updated in place.
// Returns kPglRetNomem on allocation failure, kPglRetSuccess otherwise.
PglErr KinshipPruneDestructive(uintptr_t* kinship_table, uintptr_t* sample_include, uint32_t* sample_ct_ptr) {
  PglErr reterr = kPglRetSuccess;
  {
    const uintptr_t orig_sample_ct = *sample_ct_ptr;
    const uintptr_t orig_sample_ctl = BitCtToWordCt(orig_sample_ct);
    uintptr_t* sample_include_collapsed_nz;  // bit set iff vertex degree > 0
    uintptr_t* sample_remove_collapsed;  // bit set iff sample was pruned
    uint32_t* vertex_degree;  // remaining constraint count for each sample
    if (unlikely(
            bigstack_calloc_w(orig_sample_ctl, &sample_include_collapsed_nz) ||
            bigstack_calloc_w(orig_sample_ctl, &sample_remove_collapsed) ||
            bigstack_alloc_u32(orig_sample_ct, &vertex_degree))) {
      goto KinshipPruneDestructive_ret_NOMEM;
    }
    // 1. count the number of constraints for each remaining sample
    uint32_t degree_1_vertex_ct = 0;
    for (uint32_t sample_idx = 0; sample_idx != orig_sample_ct; ++sample_idx) {
      const uintptr_t woffset = sample_idx * orig_sample_ctl;
      const uintptr_t* read_iter1 = &(kinship_table[woffset]);
      // don't currently guarantee vector-alignment of kinship_table rows, so
      // can't use PopcountWords(). (change this?)
      uint32_t cur_degree = 0;
      for (uint32_t widx = 0; widx != orig_sample_ctl; ++widx) {
        const uintptr_t cur_word = *read_iter1++;
        cur_degree += PopcountWord(cur_word);
      }
      if (cur_degree) {
        vertex_degree[sample_idx] = cur_degree;
        degree_1_vertex_ct += (cur_degree == 1);
        SetBit(sample_idx, sample_include_collapsed_nz);
      }
    }
    uint32_t cur_sample_nz_ct = PopcountWords(sample_include_collapsed_nz, orig_sample_ctl);
    // 2. as long as edges remain,
    //    a. remove partner of first degree-one vertex, if such a vertex
    //       exists
    //    b. otherwise, remove first maximal-degree vertex
    // (similar to plink 1.9 rel_cutoff_batch(), but data structure is not
    // triangular since more speed is needed)
    while (cur_sample_nz_ct) {
      uint32_t prune_uidx;
      uint32_t cur_degree;
      if (degree_1_vertex_ct) {
        uint32_t degree_1_vertex_uidx = 0;
        while (1) {
          // sparse
          degree_1_vertex_uidx = AdvTo1Bit(sample_include_collapsed_nz, degree_1_vertex_uidx);
          if (vertex_degree[degree_1_vertex_uidx] == 1) {
            break;
          }
          ++degree_1_vertex_uidx;
        }
        // find partner
        prune_uidx = AdvTo1Bit(&(kinship_table[degree_1_vertex_uidx * orig_sample_ctl]), 0);
        cur_degree = vertex_degree[prune_uidx];
      } else {
        // no degree-1 vertex: linear scan for the maximal-degree vertex
        uint32_t sample_uidx = AdvTo1Bit(sample_include_collapsed_nz, 0);
        cur_degree = vertex_degree[sample_uidx];
        prune_uidx = sample_uidx;
        for (uint32_t sample_idx = 1; sample_idx != cur_sample_nz_ct; ++sample_idx) {
          // sparse
          sample_uidx = AdvTo1Bit(sample_include_collapsed_nz, sample_uidx + 1);
          const uint32_t new_degree = vertex_degree[sample_uidx];
          if (new_degree > cur_degree) {
            cur_degree = new_degree;
            prune_uidx = sample_uidx;
          }
        }
      }
      // remove row/column
      uintptr_t* cur_kinship_col = &(kinship_table[prune_uidx / kBitsPerWord]);
      const uintptr_t kinship_col_mask = ~(k1LU << (prune_uidx % kBitsPerWord));
      uintptr_t* cur_kinship_row = &(kinship_table[prune_uidx * orig_sample_ctl]);
      uint32_t sample_uidx = 0;
      for (uint32_t partner_idx = 0; partner_idx != cur_degree; ++partner_idx, ++sample_uidx) {
        // sparse
        sample_uidx = AdvTo1Bit(cur_kinship_row, sample_uidx);
        const uint32_t new_degree = vertex_degree[sample_uidx] - 1;
        if (!new_degree) {
          ClearBit(sample_uidx, sample_include_collapsed_nz);
          --degree_1_vertex_ct;
          --cur_sample_nz_ct;
          // unnecessary to write to kinship_table[] or vertex_degree[]
        } else {
          cur_kinship_col[sample_uidx * orig_sample_ctl] &= kinship_col_mask;
          degree_1_vertex_ct += (new_degree == 1);
          vertex_degree[sample_uidx] = new_degree;
        }
      }
      if (vertex_degree[prune_uidx] == 1) {
        --degree_1_vertex_ct;
      }
      sample_remove_collapsed[prune_uidx / kBitsPerWord] |= ~kinship_col_mask;
      sample_include_collapsed_nz[prune_uidx / kBitsPerWord] &= kinship_col_mask;
      // unnecessary to update current kinship_table[] row
      --cur_sample_nz_ct;
    }
    // Translate the collapsed removal flags back into sample_include bit
    // clears and the final sample count.
    uint32_t sample_ct = orig_sample_ct;
    uintptr_t sample_widx = 0;
    uintptr_t cur_bits = sample_include[0];
    for (uint32_t sample_idx = 0; sample_idx != orig_sample_ct; ++sample_idx) {
      const uintptr_t lowbit = BitIter1y(sample_include, &sample_widx, &cur_bits);
      if (IsSet(sample_remove_collapsed, sample_idx)) {
        sample_include[sample_widx] ^= lowbit;
        --sample_ct;
      }
    }
    *sample_ct_ptr = sample_ct;
  }
  while (0) {
  KinshipPruneDestructive_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  }
  return reterr;
}
285
// Applies --king-cutoff using previously-computed KING coefficients: reads
// sample IDs from {king_cutoff_fprefix}.king.id and a strictly-triangular
// coefficient matrix from {king_cutoff_fprefix}.king.bin (double- or
// single-precision, auto-detected from file size), records every pair with
// coefficient > king_cutoff as a constraint in a square bit-table, and then
// prunes samples via KinshipPruneDestructive().  sample_include and
// *sample_ct_ptr are updated in place; king_cutoff_fprefix has extensions
// appended to it during execution.
PglErr KingCutoffBatch(const SampleIdInfo* siip, uint32_t raw_sample_ct, double king_cutoff, uintptr_t* sample_include, char* king_cutoff_fprefix, uint32_t* sample_ct_ptr) {
  unsigned char* bigstack_mark = g_bigstack_base;
  FILE* binfile = nullptr;
  char* fprefix_end = &(king_cutoff_fprefix[strlen(king_cutoff_fprefix)]);
  uintptr_t line_idx = 0;
  PglErr reterr = kPglRetSuccess;
  TextStream txs;
  PreinitTextStream(&txs);
  {
    uint32_t sample_ct = *sample_ct_ptr;
    const uint32_t orig_sample_ctl = BitCtToWordCt(sample_ct);
    uintptr_t* kinship_table;
    uint32_t* sample_uidx_to_king_uidx;
    if (unlikely(
            bigstack_calloc_w(sample_ct * orig_sample_ctl, &kinship_table) ||
            bigstack_alloc_u32(raw_sample_ct, &sample_uidx_to_king_uidx))) {
      goto KingCutoffBatch_ret_NOMEM;
    }

    // Pass 1: parse {prefix}.king.id, mapping each currently-loaded sample
    // to its row/column index in the .bin matrix.  Entries stay UINT32_MAX
    // for samples absent from the ID file.
    snprintf(fprefix_end, 9, ".king.id");
    reterr = InitTextStream(king_cutoff_fprefix, kTextStreamBlenFast, 1, &txs);
    if (unlikely(reterr)) {
      goto KingCutoffBatch_ret_TSTREAM_FAIL;
    }
    // bugfix (18 Aug 2018): this missed some xid_mode possibilities
    // todo: try to simplify this interface, it's bordering on incomprehensible
    char* line_start;
    XidMode xid_mode;
    reterr = LoadXidHeader("king-cutoff", (siip->sids || (siip->flags & kfSampleIdStrictSid0))? kfXidHeader0 : kfXidHeaderIgnoreSid, &line_idx, &txs, &xid_mode, &line_start, nullptr);
    if (unlikely(reterr)) {
      if (reterr == kPglRetEof) {
        logerrputs("Error: Empty --king-cutoff ID file.\n");
        goto KingCutoffBatch_ret_MALFORMED_INPUT;
      }
      goto KingCutoffBatch_ret_TSTREAM_XID_FAIL;
    }

    uint32_t* xid_map;  // IDs not collapsed
    char* sorted_xidbox;
    uintptr_t max_xid_blen;
    reterr = SortedXidboxInitAlloc(sample_include, siip, sample_ct, 0, xid_mode, 0, &sorted_xidbox, &xid_map, &max_xid_blen);
    if (unlikely(reterr)) {
      goto KingCutoffBatch_ret_1;
    }
    char* idbuf;
    if (unlikely(bigstack_alloc_c(max_xid_blen, &idbuf))) {
      goto KingCutoffBatch_ret_NOMEM;
    }
    SetAllU32Arr(raw_sample_ct, sample_uidx_to_king_uidx);
    uintptr_t king_id_ct = 0;
    if (*line_start == '#') {
      // skip the header line
      ++line_idx;
      line_start = TextGet(&txs);
    }
    for (; line_start; ++line_idx, line_start = TextGet(&txs)) {
      const char* linebuf_iter = line_start;
      uint32_t sample_uidx;
      if (SortedXidboxReadFind(sorted_xidbox, xid_map, max_xid_blen, sample_ct, 0, xid_mode, &linebuf_iter, &sample_uidx, idbuf)) {
        if (unlikely(!linebuf_iter)) {
          goto KingCutoffBatch_ret_MISSING_TOKENS;
        }
        // ID not among loaded samples; its matrix row/column is skipped
        // later via king_include.
        continue;
      }
      if (unlikely(sample_uidx_to_king_uidx[sample_uidx] != UINT32_MAX)) {
        // Replace tabs with spaces so the ID reads naturally in the error
        // message.
        char* first_tab = AdvToDelim(idbuf, '\t');
        char* second_tab = strchr(&(first_tab[1]), '\t');
        *first_tab = ' ';
        if (second_tab) {
          *second_tab = ' ';
        }
        snprintf(g_logbuf, kLogbufSize, "Error: Duplicate ID '%s' in %s .\n", idbuf, king_cutoff_fprefix);
        goto KingCutoffBatch_ret_MALFORMED_INPUT_WW;
      }
      sample_uidx_to_king_uidx[sample_uidx] = king_id_ct;
      ++king_id_ct;
    }
    if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
      goto KingCutoffBatch_ret_TSTREAM_FAIL;
    }

    BigstackReset(TextStreamMemStart(&txs));
    if (unlikely(CleanupTextStream2(king_cutoff_fprefix, &txs, &reterr))) {
      goto KingCutoffBatch_ret_1;
    }
    // king_include: which .bin rows/columns correspond to still-loaded
    // samples.  king_uidx_to_sample_idx: inverse of the pass-1 mapping.
    uintptr_t* king_include;
    uint32_t* king_uidx_to_sample_idx;
    if (unlikely(
            bigstack_calloc_w(BitCtToWordCt(king_id_ct), &king_include) ||
            bigstack_alloc_u32(king_id_ct, &king_uidx_to_sample_idx))) {
      goto KingCutoffBatch_ret_NOMEM;
    }
    uintptr_t sample_uidx_base = 0;
    uintptr_t sample_include_bits = sample_include[0];
    for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
      const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
      const uint32_t king_uidx = sample_uidx_to_king_uidx[sample_uidx];
      if (king_uidx != UINT32_MAX) {
        SetBit(king_uidx, king_include);
        king_uidx_to_sample_idx[king_uidx] = sample_idx;
      }
    }
    // Pass 2: scan {prefix}.king.bin row by row.  Precision is inferred
    // from the size of the strictly-lower-triangular matrix.
    snprintf(fprefix_end, 10, ".king.bin");
    if (unlikely(fopen_checked(king_cutoff_fprefix, FOPEN_RB, &binfile))) {
      goto KingCutoffBatch_ret_OPEN_FAIL;
    }
    if (unlikely(fseeko(binfile, 0, SEEK_END))) {
      goto KingCutoffBatch_ret_READ_FAIL;
    }
    const uint64_t fsize = ftello(binfile);
    const uint64_t fsize_double_expected = (king_id_ct * (S_CAST(uint64_t, king_id_ct) - 1) * (sizeof(double) / 2));
    const uint32_t is_double = (fsize == fsize_double_expected);
    rewind(binfile);
    const uint32_t first_king_uidx = AdvBoundedTo1Bit(king_include, 0, king_id_ct);
    uintptr_t king_uidx = AdvBoundedTo1Bit(king_include, first_king_uidx + 1, king_id_ct);
    if (king_uidx > 1) {
      // Seek directly to the first row of interest; row i starts at
      // triangle element i*(i-1)/2.
      if (fseeko(binfile, king_uidx * (S_CAST(uint64_t, king_uidx) - 1) * (2 + (2 * is_double)), SEEK_SET)) {
        goto KingCutoffBatch_ret_READ_FAIL;
      }
    }
    uintptr_t constraint_ct = 0;
    if (is_double) {
      // fread limit
      assert(king_id_ct <= ((kMaxBytesPerIO / sizeof(double)) + 1));
      double* king_drow;
      if (unlikely(bigstack_alloc_d(king_id_ct - 1, &king_drow))) {
        goto KingCutoffBatch_ret_NOMEM;
      }
      for (uint32_t king_idx = 1; king_uidx != king_id_ct; ++king_idx, ++king_uidx) {
        if (!IsSet(king_include, king_uidx)) {
          // Skip rows belonging to samples that are no longer loaded.
          king_uidx = AdvBoundedTo1Bit(king_include, king_uidx + 1, king_id_ct);
          if (king_uidx == king_id_ct) {
            break;
          }
          if (unlikely(fseeko(binfile, S_CAST(uint64_t, king_uidx) * (king_uidx - 1) * (sizeof(double) / 2), SEEK_SET))) {
            goto KingCutoffBatch_ret_READ_FAIL;
          }
        }
        if (unlikely(!fread_unlocked(king_drow, king_uidx * sizeof(double), 1, binfile))) {
          goto KingCutoffBatch_ret_READ_FAIL;
        }
        const uintptr_t sample_idx = king_uidx_to_sample_idx[king_uidx];
        uintptr_t* kinship_table_row = &(kinship_table[sample_idx * orig_sample_ctl]);
        uintptr_t* kinship_table_col = &(kinship_table[sample_idx / kBitsPerWord]);
        const uintptr_t kinship_new_bit = k1LU << (sample_idx % kBitsPerWord);
        uintptr_t king_uidx2_base;
        uintptr_t king_include_bits;
        BitIter1Start(king_include, first_king_uidx, &king_uidx2_base, &king_include_bits);
        for (uint32_t king_idx2 = 0; king_idx2 != king_idx; ++king_idx2) {
          const uintptr_t king_uidx2 = BitIter1(king_include, &king_uidx2_base, &king_include_bits);
          if (king_drow[king_uidx2] > king_cutoff) {
            // Set both (i,j) and (j,i) bits in the square constraint table.
            const uintptr_t sample_idx2 = king_uidx_to_sample_idx[king_uidx2];
            SetBit(sample_idx2, kinship_table_row);
            kinship_table_col[sample_idx2 * orig_sample_ctl] |= kinship_new_bit;
            ++constraint_ct;
          }
        }
      }
    } else {
      if (unlikely(fsize != (fsize_double_expected / 2))) {
        const uint64_t fsize_double_square = king_id_ct * S_CAST(uint64_t, king_id_ct) * sizeof(double);
        if ((fsize == fsize_double_square) || (fsize == fsize_double_square / 2)) {
          logerrputs("Error: --king-cutoff currently requires a *triangular* .bin file; the provided\nfile appears to be square.\n");
        } else {
          logerrprintfww("Error: Invalid --king-cutoff .bin file size (expected %" PRIu64 " or %" PRIu64 " bytes).\n", fsize_double_expected / 2, fsize_double_expected);
        }
        goto KingCutoffBatch_ret_MALFORMED_INPUT;
      }
      assert(king_id_ct <= ((0x7ffff000 / sizeof(float)) + 1));
      const float king_cutoff_f = S_CAST(float, king_cutoff);
      float* king_frow;
      if (unlikely(bigstack_alloc_f(king_id_ct - 1, &king_frow))) {
        goto KingCutoffBatch_ret_NOMEM;
      }
      // Same loop as the double-precision branch above.
      for (uint32_t king_idx = 1; king_uidx != king_id_ct; ++king_idx, ++king_uidx) {
        if (!IsSet(king_include, king_uidx)) {
          king_uidx = AdvBoundedTo1Bit(king_include, king_uidx + 1, king_id_ct);
          if (king_uidx == king_id_ct) {
            break;
          }
          if (unlikely(fseeko(binfile, S_CAST(uint64_t, king_uidx) * (king_uidx - 1) * (sizeof(float) / 2), SEEK_SET))) {
            goto KingCutoffBatch_ret_READ_FAIL;
          }
        }
        if (unlikely(!fread_unlocked(king_frow, king_uidx * sizeof(float), 1, binfile))) {
          goto KingCutoffBatch_ret_READ_FAIL;
        }
        const uintptr_t sample_idx = king_uidx_to_sample_idx[king_uidx];
        uintptr_t* kinship_table_row = &(kinship_table[sample_idx * orig_sample_ctl]);
        uintptr_t* kinship_table_col = &(kinship_table[sample_idx / kBitsPerWord]);
        const uintptr_t kinship_new_bit = k1LU << (sample_idx % kBitsPerWord);
        uintptr_t king_uidx2_base;
        uintptr_t king_include_bits;
        BitIter1Start(king_include, first_king_uidx, &king_uidx2_base, &king_include_bits);
        for (uint32_t king_idx2 = 0; king_idx2 != king_idx; ++king_idx2) {
          const uintptr_t king_uidx2 = BitIter1(king_include, &king_uidx2_base, &king_include_bits);
          if (king_frow[king_uidx2] > king_cutoff_f) {
            const uintptr_t sample_idx2 = king_uidx_to_sample_idx[king_uidx2];
            SetBit(sample_idx2, kinship_table_row);
            kinship_table_col[sample_idx2 * orig_sample_ctl] |= kinship_new_bit;
            ++constraint_ct;
          }
        }
      }
    }
    logprintf("--king-cutoff: %" PRIuPTR " constraint%s loaded.\n", constraint_ct, (constraint_ct == 1)? "" : "s");
    BigstackReset(sample_uidx_to_king_uidx);
    if (unlikely(KinshipPruneDestructive(kinship_table, sample_include, sample_ct_ptr))) {
      goto KingCutoffBatch_ret_NOMEM;
    }
  }
  while (0) {
  KingCutoffBatch_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  KingCutoffBatch_ret_OPEN_FAIL:
    reterr = kPglRetOpenFail;
    break;
  KingCutoffBatch_ret_READ_FAIL:
    logerrprintfww(kErrprintfFread, king_cutoff_fprefix, strerror(errno));
    reterr = kPglRetReadFail;
    break;
  KingCutoffBatch_ret_MISSING_TOKENS:
    logerrprintfww("Error: Fewer tokens than expected on line %" PRIuPTR " of %s .\n", line_idx, king_cutoff_fprefix);
    reterr = kPglRetMalformedInput;
    break;
  KingCutoffBatch_ret_TSTREAM_XID_FAIL:
    if (!TextStreamErrcode(&txs)) {
      break;
    }
  KingCutoffBatch_ret_TSTREAM_FAIL:
    TextStreamErrPrint(king_cutoff_fprefix, &txs);
    break;
  KingCutoffBatch_ret_MALFORMED_INPUT_WW:
    WordWrapB(0);
    logerrputsb();
  KingCutoffBatch_ret_MALFORMED_INPUT:
    reterr = kPglRetMalformedInput;
    break;
  }
 KingCutoffBatch_ret_1:
  fclose_cond(binfile);
  if (CleanupTextStream(&txs, &reterr)) {
    // Restore the .king.id extension so the error message names the file
    // the text stream was reading.
    snprintf(fprefix_end, 9, ".king.id");
    logerrprintfww(kErrprintfFread, king_cutoff_fprefix, strerror(errno));
  }
  BigstackReset(bigstack_mark);
  return reterr;
}
534
// Offsets of the per-pair accumulators within a single king_counts[] entry
// (each entry spans homhom_needed_p4 uint32s; see CalcKingSparseThread).
CONSTI32(kKingOffsetIbs0, 0);
CONSTI32(kKingOffsetHethet, 1);
CONSTI32(kKingOffsetHet2Hom1, 2);
CONSTI32(kKingOffsetHet1Hom2, 3);
CONSTI32(kKingOffsetHomhom, 4);
540
// Shared context for CalcKingSparseThread workers (installed as the
// ThreadGroup's context pointer).
typedef struct CalcKingSparseCtxStruct {
  const uintptr_t* variant_include_orig;
  uintptr_t* sample_include;
  uint32_t* sample_include_cumulative_popcounts;
  // Rows [row_start_idx, row_end_idx) of the triangular result are covered
  // by this run.
  uint32_t row_start_idx;
  uint32_t row_end_idx;
  uint32_t homhom_needed;

  // A variant is handled by the sparse path when >= (sample_ct -
  // max_sparse_ct) samples share one genotype; see min_common_ct in
  // CalcKingSparseThread.
  uint32_t max_sparse_ct;

  uint32_t read_block_size;  // guaranteed to be power of 2

  // Per-thread reader state and genotype buffers.
  PgenReader** pgr_ptrs;
  uintptr_t** genovecs;
  uint32_t* read_variant_uidx_starts;

  // this has length >= 3 * max_sparse_ct
  // (split by the worker into three rare-genotype index sub-buffers)
  uint32_t** thread_idx_bufs;

  uint32_t cur_block_size;

  // Per-thread singleton counters; added up at the end (see the accounting
  // comment in CalcKingSparseThread).
  uint32_t** thread_singleton_het_cts;
  uint32_t** thread_singleton_hom_cts;
  uint32_t** thread_singleton_missing_cts;
  uint32_t* thread_skip_cts;

  // single global copy
  uint32_t* king_counts;

  // Double-buffered (indexed by block parity) per-thread bitmasks of
  // variants flagged as sparse within each read block.
  uintptr_t** thread_sparse_excludes[2];

  // Set by a worker when PgrGet fails.
  PglErr reterr;
} CalcKingSparseCtx;
574
CalcKingSparseThread(void * raw_arg)575 THREAD_FUNC_DECL CalcKingSparseThread(void* raw_arg) {
576 ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
577 const uintptr_t tidx = arg->tidx;
578 CalcKingSparseCtx* ctx = S_CAST(CalcKingSparseCtx*, arg->sharedp->context);
579
580 const uintptr_t* variant_include_orig = ctx->variant_include_orig;
581 const uintptr_t* sample_include = ctx->sample_include;
582
583 PgenReader* pgrp = ctx->pgr_ptrs[tidx];
584 PgrSampleSubsetIndex pssi;
585 PgrSetSampleSubsetIndex(ctx->sample_include_cumulative_popcounts, pgrp, &pssi);
586 uintptr_t* genovec = ctx->genovecs[tidx];
587 uint32_t row_start_idx = ctx->row_start_idx;
588 const uint64_t tri_start = ((row_start_idx - 1) * S_CAST(uint64_t, row_start_idx)) / 2;
589 if (row_start_idx == 1) {
590 row_start_idx = 0;
591 }
592 const uint32_t sample_ct = ctx->row_end_idx;
593 const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
594 const uint32_t remainder = sample_ct % kBitsPerWordD2;
595 const uint32_t calc_thread_ct = GetThreadCt(arg->sharedp);
596 const uint32_t homhom_needed = ctx->homhom_needed;
597 const uintptr_t homhom_needed_p4 = homhom_needed + 4;
598 const uint32_t max_sparse_ct = ctx->max_sparse_ct;
599 const uint32_t read_block_size_mask = ctx->read_block_size - 1;
600 const uint32_t read_block_sizel = ctx->read_block_size / kBitsPerWord;
601
602 uint32_t* idx_bufs[4];
603 idx_bufs[0] = nullptr;
604 idx_bufs[1] = ctx->thread_idx_bufs[tidx];
605 idx_bufs[2] = &(idx_bufs[1][max_sparse_ct]);
606 idx_bufs[3] = &(idx_bufs[2][max_sparse_ct]);
607 const uint32_t min_common_ct = sample_ct - max_sparse_ct;
608
609 uint32_t* singleton_het_cts = ctx->thread_singleton_het_cts[tidx];
610 uint32_t* singleton_hom_cts = ctx->thread_singleton_hom_cts[tidx];
611 uint32_t* singleton_missing_cts = ctx->thread_singleton_missing_cts[tidx];
612 ZeroU32Arr(sample_ct, singleton_het_cts);
613 ZeroU32Arr(sample_ct, singleton_hom_cts);
614 ZeroU32Arr(sample_ct, singleton_missing_cts);
615 uint32_t skip_ct = 0;
616
617 uint32_t* king_counts = ctx->king_counts;
618 {
619 // This matrix can be huge, so we multithread zero-initialization.
620 const uint64_t entry_ct = homhom_needed_p4 * (((sample_ct - 1) * S_CAST(uint64_t, sample_ct)) / 2 - tri_start);
621 const uintptr_t fill_start = RoundDownPow2((tidx * entry_ct) / calc_thread_ct, kInt32PerCacheline);
622 uintptr_t fill_end = entry_ct;
623 if (tidx + 1 != calc_thread_ct) {
624 fill_end = RoundDownPow2(((tidx + 1) * entry_ct) / calc_thread_ct, kInt32PerCacheline);
625 }
626 ZeroU32Arr(fill_end - fill_start, &(king_counts[fill_start]));
627 }
628 uint32_t parity = 0;
629 // sync.Once before main loop; we need the other threads to be done with
630 // their zero-initialization jobs before we can proceed.
631 while (!THREAD_BLOCK_FINISH(arg)) {
632 const uint32_t cur_block_size = ctx->cur_block_size;
633 const uint32_t idx_end = ((tidx + 1) * cur_block_size) / calc_thread_ct;
634 uintptr_t variant_uidx_base;
635 uintptr_t variant_include_bits;
636 BitIter1Start(variant_include_orig, ctx->read_variant_uidx_starts[tidx], &variant_uidx_base, &variant_include_bits);
637 uintptr_t* sparse_exclude = ctx->thread_sparse_excludes[parity][tidx];
638 ZeroWArr(read_block_sizel, sparse_exclude);
639 // probable todo: better load-balancing
640 for (uint32_t cur_idx = (tidx * cur_block_size) / calc_thread_ct; cur_idx != idx_end; ++cur_idx) {
641 const uint32_t variant_uidx = BitIter1(variant_include_orig, &variant_uidx_base, &variant_include_bits);
642 // tried DifflistOrGenovec, difference was negligible. Not really worth
643 // considering it when calculation is inherently >O(mn).
644 PglErr reterr = PgrGet(sample_include, pssi, sample_ct, variant_uidx, pgrp, genovec);
645 if (unlikely(reterr)) {
646 ctx->reterr = reterr;
647 goto CalcKingSparseThread_err;
648 }
649 STD_ARRAY_DECL(uint32_t, 4, genocounts);
650 ZeroTrailingNyps(sample_ct, genovec);
651 GenoarrCountFreqsUnsafe(genovec, sample_ct, genocounts);
652 uintptr_t mask_word;
653 uintptr_t common_idx;
654 if (genocounts[0] >= min_common_ct) {
655 common_idx = 0;
656 mask_word = 0;
657 } else if (genocounts[2] >= min_common_ct) {
658 common_idx = 2;
659 mask_word = kMaskAAAA;
660 } else if (genocounts[3] >= min_common_ct) {
661 common_idx = 3;
662 mask_word = ~k0LU;
663 ++skip_ct;
664 } else {
665 if ((!homhom_needed) && ((genocounts[0] + genocounts[3] == sample_ct) || (genocounts[2] + genocounts[3] == sample_ct))) {
666 SetBit(variant_uidx & read_block_size_mask, sparse_exclude);
667 ++skip_ct;
668 }
669 continue;
670 }
671 SetBit(variant_uidx & read_block_size_mask, sparse_exclude);
672 if (genocounts[common_idx] == sample_ct) {
673 continue;
674 }
675 if (remainder) {
676 genovec[sample_ctl2 - 1] |= mask_word << (2 * remainder);
677 }
678 uint32_t* idx_buf_iters[4];
679 memcpy(idx_buf_iters, idx_bufs, 4 * sizeof(intptr_t));
680 for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
681 uintptr_t xor_word = genovec[widx] ^ mask_word;
682 if (xor_word) {
683 const uint32_t offset_base = widx * kBitsPerWordD2;
684 do {
685 const uint32_t shift_ct = ctzw(xor_word) & (~1);
686 const uint32_t cur_xor = (xor_word >> shift_ct) & 3;
687 *(idx_buf_iters[cur_xor])++ = offset_base + (shift_ct / 2);
688 xor_word &= ~((3 * k1LU) << shift_ct);
689 } while (xor_word);
690 }
691 }
692 // We do two things here.
693 // 1. Update singleton_{het,hom,missing}_cts for every observed rare
694 // genotype. This is enough for correct accounting for any pair
695 // involving only one (or none) of these rare genotypes, and the
696 // arrays are small enough that each thread can keep its own copy
697 // (they're added up at the end).
698 // 2. For each pair of rare genotypes, atomically correct the main
699 // king_counts[] array. This is messy (9x2 cases) but conceptually
700 // straightforward.
701 const uint32_t* het_idxs = idx_bufs[common_idx ^ 1];
702 const uint32_t het_ct = genocounts[1];
703 if (common_idx != 3) {
704 const uint32_t* other_hom_idxs = idx_bufs[2];
705 const uint32_t* missing_idxs = idx_bufs[common_idx ^ 3];
706 const uint32_t other_hom_ct = idx_buf_iters[2] - other_hom_idxs;
707 const uint32_t missing_ct = genocounts[3];
708 for (uint32_t uii = 0; uii != het_ct; ++uii) {
709 // We want to iterate over one row at a time, for better
710 // memory-access locality. So the outer loop must correspond to the
711 // larger sample-index.
712 const uintptr_t sample_idx_hi = het_idxs[uii];
713 singleton_het_cts[sample_idx_hi] += 1;
714 if (sample_idx_hi < row_start_idx) {
715 continue;
716 }
717 const uintptr_t tri_base = (S_CAST(uint64_t, sample_idx_hi) * (sample_idx_hi - 1)) / 2 - tri_start;
718 for (uint32_t ujj = 0; ujj != uii; ++ujj) {
719 const uintptr_t sample_idx_lo = het_idxs[ujj];
720 const uintptr_t tri_coord = tri_base + sample_idx_lo;
721 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
722 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHethet]), 1, __ATOMIC_RELAXED);
723 __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetHet2Hom1]), 1, __ATOMIC_RELAXED);
724 __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetHet1Hom2]), 1, __ATOMIC_RELAXED);
725 if (homhom_needed) {
726 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
727 }
728 }
729 for (uint32_t ujj = 0; ujj != other_hom_ct; ++ujj) {
730 const uintptr_t sample_idx_lo = other_hom_idxs[ujj];
731 if (sample_idx_lo > sample_idx_hi) {
732 break;
733 }
734 const uintptr_t tri_coord = tri_base + sample_idx_lo;
735 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
736 __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetIbs0]), 1, __ATOMIC_RELAXED);
737 }
738 for (uint32_t ujj = 0; ujj != missing_ct; ++ujj) {
739 const uintptr_t sample_idx_lo = missing_idxs[ujj];
740 if (sample_idx_lo > sample_idx_hi) {
741 break;
742 }
743 const uintptr_t tri_coord = tri_base + sample_idx_lo;
744 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
745 __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetHet2Hom1]), 1, __ATOMIC_RELAXED);
746 if (homhom_needed) {
747 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
748 }
749 }
750 }
751 for (uint32_t uii = 0; uii != other_hom_ct; ++uii) {
752 const uintptr_t sample_idx_hi = other_hom_idxs[uii];
753 singleton_hom_cts[sample_idx_hi] += 1;
754 if (sample_idx_hi < row_start_idx) {
755 continue;
756 }
757 const uintptr_t tri_base = (S_CAST(uint64_t, sample_idx_hi) * (sample_idx_hi - 1)) / 2 - tri_start;
758 for (uint32_t ujj = 0; ujj != uii; ++ujj) {
759 const uintptr_t sample_idx_lo = other_hom_idxs[ujj];
760 const uintptr_t tri_coord = tri_base + sample_idx_lo;
761 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
762 __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetIbs0]), 2, __ATOMIC_RELAXED);
763 }
764 for (uint32_t ujj = 0; ujj != het_ct; ++ujj) {
765 const uintptr_t sample_idx_lo = het_idxs[ujj];
766 if (sample_idx_lo > sample_idx_hi) {
767 break;
768 }
769 const uintptr_t tri_coord = tri_base + sample_idx_lo;
770 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
771 __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetIbs0]), 1, __ATOMIC_RELAXED);
772 }
773 for (uint32_t ujj = 0; ujj != missing_ct; ++ujj) {
774 const uintptr_t sample_idx_lo = missing_idxs[ujj];
775 if (sample_idx_lo > sample_idx_hi) {
776 break;
777 }
778 const uintptr_t tri_coord = tri_base + sample_idx_lo;
779 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
780 __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetIbs0]), 1, __ATOMIC_RELAXED);
781 if (homhom_needed) {
782 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
783 }
784 }
785 }
786 for (uint32_t uii = 0; uii != missing_ct; ++uii) {
787 const uintptr_t sample_idx_hi = missing_idxs[uii];
788 singleton_missing_cts[sample_idx_hi] += 1;
789 if (sample_idx_hi < row_start_idx) {
790 continue;
791 }
792 const uintptr_t tri_base = (S_CAST(uint64_t, sample_idx_hi) * (sample_idx_hi - 1)) / 2 - tri_start;
793 if (homhom_needed) {
794 for (uint32_t ujj = 0; ujj != uii; ++ujj) {
795 const uintptr_t sample_idx_lo = missing_idxs[ujj];
796 const uintptr_t tri_coord = tri_base + sample_idx_lo;
797 // bugfix (12 Nov 2019): added 4 twice
798 uint32_t* king_counts_ptr = &(king_counts[tri_coord * 5]);
799 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
800 }
801 }
802 for (uint32_t ujj = 0; ujj != het_ct; ++ujj) {
803 const uintptr_t sample_idx_lo = het_idxs[ujj];
804 if (sample_idx_lo > sample_idx_hi) {
805 break;
806 }
807 const uintptr_t tri_coord = tri_base + sample_idx_lo;
808 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
809 __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetHet1Hom2]), 1, __ATOMIC_RELAXED);
810 if (homhom_needed) {
811 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
812 }
813 }
814 for (uint32_t ujj = 0; ujj != other_hom_ct; ++ujj) {
815 const uintptr_t sample_idx_lo = other_hom_idxs[ujj];
816 if (sample_idx_lo > sample_idx_hi) {
817 break;
818 }
819 const uintptr_t tri_coord = tri_base + sample_idx_lo;
820 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
821 __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetIbs0]), 1, __ATOMIC_RELAXED);
822 if (homhom_needed) {
823 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
824 }
825 }
826 }
827 } else {
828 // merge hom0 and hom2 cases.
829 for (uint32_t hom_geno = 0; hom_geno != 4; hom_geno += 2) {
830 const uint32_t* cur_hom_idxs = idx_bufs[3 - hom_geno];
831 const uint32_t* opp_hom_idxs = idx_bufs[1 + hom_geno];
832 const uint32_t cur_hom_ct = genocounts[hom_geno];
833 const uint32_t opp_hom_ct = genocounts[2 - hom_geno];
834 for (uint32_t uii = 0; uii != cur_hom_ct; ++uii) {
835 const uintptr_t sample_idx_hi = cur_hom_idxs[uii];
836 if (sample_idx_hi < row_start_idx) {
837 continue;
838 }
839 const uintptr_t tri_base = (S_CAST(uint64_t, sample_idx_hi) * (sample_idx_hi - 1)) / 2 - tri_start;
840 if (homhom_needed) {
841 for (uint32_t ujj = 0; ujj != uii; ++ujj) {
842 const uintptr_t sample_idx_lo = cur_hom_idxs[ujj];
843 const uintptr_t tri_coord = tri_base + sample_idx_lo;
844 uint32_t* king_counts_ptr = &(king_counts[tri_coord * 5]);
845 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
846 }
847 }
848 for (uint32_t ujj = 0; ujj != het_ct; ++ujj) {
849 const uintptr_t sample_idx_lo = het_idxs[ujj];
850 if (sample_idx_lo > sample_idx_hi) {
851 break;
852 }
853 const uintptr_t tri_coord = tri_base + sample_idx_lo;
854 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
855 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHet1Hom2]), 1, __ATOMIC_RELAXED);
856 }
857 for (uint32_t ujj = 0; ujj != opp_hom_ct; ++ujj) {
858 const uintptr_t sample_idx_lo = opp_hom_idxs[ujj];
859 if (sample_idx_lo > sample_idx_hi) {
860 break;
861 }
862 const uintptr_t tri_coord = tri_base + sample_idx_lo;
863 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
864 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetIbs0]), 1, __ATOMIC_RELAXED);
865 if (homhom_needed) {
866 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
867 }
868 }
869 }
870 }
871 const uint32_t* hom0_idxs = idx_bufs[3];
872 const uint32_t* hom2_idxs = idx_bufs[1];
873 const uint32_t hom0_ct = genocounts[0];
874 const uint32_t hom2_ct = genocounts[2];
875 for (uint32_t uii = 0; uii != het_ct; ++uii) {
876 const uintptr_t sample_idx_hi = het_idxs[uii];
877 if (sample_idx_hi < row_start_idx) {
878 continue;
879 }
880 const uintptr_t tri_base = (S_CAST(uint64_t, sample_idx_hi) * (sample_idx_hi - 1)) / 2 - tri_start;
881 for (uint32_t ujj = 0; ujj != uii; ++ujj) {
882 const uintptr_t sample_idx_lo = het_idxs[ujj];
883 const uintptr_t tri_coord = tri_base + sample_idx_lo;
884 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
885 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHethet]), 1, __ATOMIC_RELAXED);
886 }
887 for (uint32_t ujj = 0; ujj != hom0_ct; ++ujj) {
888 const uintptr_t sample_idx_lo = hom0_idxs[ujj];
889 if (sample_idx_lo > sample_idx_hi) {
890 break;
891 }
892 const uintptr_t tri_coord = tri_base + sample_idx_lo;
893 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
894 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHet2Hom1]), 1, __ATOMIC_RELAXED);
895 }
896 for (uint32_t ujj = 0; ujj != hom2_ct; ++ujj) {
897 const uintptr_t sample_idx_lo = hom2_idxs[ujj];
898 if (sample_idx_lo > sample_idx_hi) {
899 break;
900 }
901 const uintptr_t tri_coord = tri_base + sample_idx_lo;
902 uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
903 __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHet2Hom1]), 1, __ATOMIC_RELAXED);
904 }
905 }
906 }
907 }
908 CalcKingSparseThread_err:
909 parity = 1 - parity;
910 }
911 ctx->thread_skip_cts[tidx] = skip_ct;
912 THREAD_RETURN;
913 }
914
915 #ifdef USE_SSE42
916 CONSTI32(kKingMultiplex, 1024);
917 CONSTI32(kKingMultiplexWords, kKingMultiplex / kBitsPerWord);
IncrKing(const uintptr_t * smaj_hom,const uintptr_t * smaj_ref2het,uint32_t start_idx,uint32_t end_idx,uint32_t * king_counts_iter)918 void IncrKing(const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts_iter) {
919 // Tried adding another level of blocking, but couldn't get it to make a
920 // difference.
921 for (uint32_t second_idx = start_idx; second_idx != end_idx; ++second_idx) {
922 // technically overflows for huge sample_ct
923 const uint32_t second_offset = second_idx * kKingMultiplexWords;
924 const uintptr_t* second_hom = &(smaj_hom[second_offset]);
925 const uintptr_t* second_ref2het = &(smaj_ref2het[second_offset]);
926 const uintptr_t* first_hom_iter = smaj_hom;
927 const uintptr_t* first_ref2het_iter = smaj_ref2het;
928 while (first_hom_iter < second_hom) {
929 uint32_t acc_ibs0 = 0;
930 uint32_t acc_hethet = 0;
931 uint32_t acc_het2hom1 = 0;
932 uint32_t acc_het1hom2 = 0;
933 for (uint32_t widx = 0; widx != kKingMultiplexWords; ++widx) {
934 const uintptr_t hom1 = first_hom_iter[widx];
935 const uintptr_t hom2 = second_hom[widx];
936 const uintptr_t ref2het1 = first_ref2het_iter[widx];
937 const uintptr_t ref2het2 = second_ref2het[widx];
938 const uintptr_t homhom = hom1 & hom2;
939 const uintptr_t het1 = ref2het1 & (~hom1);
940 const uintptr_t het2 = ref2het2 & (~hom2);
941 acc_ibs0 += PopcountWord((ref2het1 ^ ref2het2) & homhom);
942 acc_hethet += PopcountWord(het1 & het2);
943 acc_het2hom1 += PopcountWord(hom1 & het2);
944 acc_het1hom2 += PopcountWord(hom2 & het1);
945 }
946 king_counts_iter[kKingOffsetIbs0] += acc_ibs0;
947 king_counts_iter[kKingOffsetHethet] += acc_hethet;
948 king_counts_iter[kKingOffsetHet2Hom1] += acc_het2hom1;
949 king_counts_iter[kKingOffsetHet1Hom2] += acc_het1hom2;
950 king_counts_iter = &(king_counts_iter[4]);
951
952 first_hom_iter = &(first_hom_iter[kKingMultiplexWords]);
953 first_ref2het_iter = &(first_ref2het_iter[kKingMultiplexWords]);
954 }
955 }
956 }
957
IncrKingHomhom(const uintptr_t * smaj_hom,const uintptr_t * smaj_ref2het,uint32_t start_idx,uint32_t end_idx,uint32_t * king_counts_iter)958 void IncrKingHomhom(const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts_iter) {
959 for (uint32_t second_idx = start_idx; second_idx != end_idx; ++second_idx) {
960 // technically overflows for huge sample_ct
961 const uint32_t second_offset = second_idx * kKingMultiplexWords;
962 const uintptr_t* second_hom = &(smaj_hom[second_offset]);
963 const uintptr_t* second_ref2het = &(smaj_ref2het[second_offset]);
964 const uintptr_t* first_hom_iter = smaj_hom;
965 const uintptr_t* first_ref2het_iter = smaj_ref2het;
966 while (first_hom_iter < second_hom) {
967 uint32_t acc_homhom = 0;
968 uint32_t acc_ibs0 = 0;
969 uint32_t acc_hethet = 0;
970 uint32_t acc_het2hom1 = 0;
971 uint32_t acc_het1hom2 = 0;
972 for (uint32_t widx = 0; widx != kKingMultiplexWords; ++widx) {
973 const uintptr_t hom1 = first_hom_iter[widx];
974 const uintptr_t hom2 = second_hom[widx];
975 const uintptr_t ref2het1 = first_ref2het_iter[widx];
976 const uintptr_t ref2het2 = second_ref2het[widx];
977 const uintptr_t homhom = hom1 & hom2;
978 const uintptr_t het1 = ref2het1 & (~hom1);
979 const uintptr_t het2 = ref2het2 & (~hom2);
980 acc_homhom += PopcountWord(homhom);
981 acc_ibs0 += PopcountWord((ref2het1 ^ ref2het2) & homhom);
982 acc_hethet += PopcountWord(het1 & het2);
983 acc_het2hom1 += PopcountWord(hom1 & het2);
984 acc_het1hom2 += PopcountWord(hom2 & het1);
985 }
986 king_counts_iter[kKingOffsetIbs0] += acc_ibs0;
987 king_counts_iter[kKingOffsetHethet] += acc_hethet;
988 king_counts_iter[kKingOffsetHet2Hom1] += acc_het2hom1;
989 king_counts_iter[kKingOffsetHet1Hom2] += acc_het1hom2;
990 king_counts_iter[kKingOffsetHomhom] += acc_homhom;
991 king_counts_iter = &(king_counts_iter[5]);
992
993 first_hom_iter = &(first_hom_iter[kKingMultiplexWords]);
994 first_ref2het_iter = &(first_ref2het_iter[kKingMultiplexWords]);
995 }
996 }
997 }
998 #else // !USE_SSE42
999 # ifdef __LP64__
1000 CONSTI32(kKingMultiplex, 1536);
1001 # else
1002 CONSTI32(kKingMultiplex, 960);
1003 # endif
1004 static_assert(kKingMultiplex % (3 * kBitsPerVec) == 0, "Invalid kKingMultiplex value.");
1005 CONSTI32(kKingMultiplexWords, kKingMultiplex / kBitsPerWord);
1006 CONSTI32(kKingMultiplexVecs, kKingMultiplex / kBitsPerVec);
// Without SSE4.2, PopcountWord() is relatively expensive, so use bit-sliced
// Lauradoux/Walisch accumulators instead; Harley-Seal would require too many
// variables.
// Vectorized fallback version of IncrKing: same contract as the SSE4.2
// variant (accumulate 4 KING-robust counts per sample pair, advancing
// king_counts_iter by 4 per pair), but popcounts are computed with bit-sliced
// partial sums instead of a hardware popcount instruction.  Three vectors are
// reduced to 4-bit lane sums per outer step (the step-of-3 is backed by the
// kKingMultiplex % (3 * kBitsPerVec) == 0 static_assert above), then folded
// to 8-bit lanes and horizontally summed once per strip.
void IncrKing(const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts_iter) {
  // Bit-slicing masks for the 1->2, 2->4, and 4->8 bit reduction stages.
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  for (uint32_t second_idx = start_idx; second_idx != end_idx; ++second_idx) {
    // technically overflows for huge sample_ct
    const uint32_t second_offset = second_idx * kKingMultiplexWords;
    const VecW* second_hom = R_CAST(const VecW*, &(smaj_hom[second_offset]));
    const VecW* second_ref2het = R_CAST(const VecW*, &(smaj_ref2het[second_offset]));
    const VecW* first_hom_iter = R_CAST(const VecW*, smaj_hom);
    const VecW* first_ref2het_iter = R_CAST(const VecW*, smaj_ref2het);
    // Iterate over all 'first' rows preceding the 'second' row.
    while (first_hom_iter < second_hom) {
      // Per-pair accumulators; lanes hold 8-bit partial sums until the final
      // horizontal reduction below.
      UniVec acc_ibs0;
      UniVec acc_hethet;
      UniVec acc_het2hom1;
      UniVec acc_het1hom2;
      acc_ibs0.vw = vecw_setzero();
      acc_hethet.vw = vecw_setzero();
      acc_het2hom1.vw = vecw_setzero();
      acc_het1hom2.vw = vecw_setzero();
      for (uint32_t vec_idx = 0; vec_idx < kKingMultiplexVecs; vec_idx += 3) {
        VecW hom1 = first_hom_iter[vec_idx];
        VecW hom2 = second_hom[vec_idx];
        VecW ref2het1 = first_ref2het_iter[vec_idx];
        VecW ref2het2 = second_ref2het[vec_idx];
        VecW het1 = vecw_and_notfirst(hom1, ref2het1);
        VecW het2 = vecw_and_notfirst(hom2, ref2het2);
        // Classify each genotype-bit position for this vector.
        VecW agg_ibs0 = (ref2het1 ^ ref2het2) & (hom1 & hom2);
        VecW agg_hethet = het1 & het2;
        VecW agg_het2hom1 = hom1 & het2;
        VecW agg_het1hom2 = hom2 & het1;
        // 1-bit -> 2-bit partial sums.
        agg_ibs0 = agg_ibs0 - (vecw_srli(agg_ibs0, 1) & m1);
        agg_hethet = agg_hethet - (vecw_srli(agg_hethet, 1) & m1);
        agg_het2hom1 = agg_het2hom1 - (vecw_srli(agg_het2hom1, 1) & m1);
        agg_het1hom2 = agg_het1hom2 - (vecw_srli(agg_het1hom2, 1) & m1);
        // 2-bit -> 4-bit partial sums.
        agg_ibs0 = (agg_ibs0 & m2) + (vecw_srli(agg_ibs0, 2) & m2);
        agg_hethet = (agg_hethet & m2) + (vecw_srli(agg_hethet, 2) & m2);
        agg_het2hom1 = (agg_het2hom1 & m2) + (vecw_srli(agg_het2hom1, 2) & m2);
        agg_het1hom2 = (agg_het1hom2 & m2) + (vecw_srli(agg_het1hom2, 2) & m2);

        // Merge the next two vectors' 4-bit partial sums into agg_* before
        // the 4->8 bit fold.
        for (uint32_t offset = 1; offset != 3; ++offset) {
          hom1 = first_hom_iter[vec_idx + offset];
          hom2 = second_hom[vec_idx + offset];
          ref2het1 = first_ref2het_iter[vec_idx + offset];
          ref2het2 = second_ref2het[vec_idx + offset];
          het1 = vecw_and_notfirst(hom1, ref2het1);
          het2 = vecw_and_notfirst(hom2, ref2het2);
          VecW cur_ibs0 = (ref2het1 ^ ref2het2) & (hom1 & hom2);
          VecW cur_hethet = het1 & het2;
          VecW cur_het2hom1 = hom1 & het2;
          VecW cur_het1hom2 = hom2 & het1;
          cur_ibs0 = cur_ibs0 - (vecw_srli(cur_ibs0, 1) & m1);
          cur_hethet = cur_hethet - (vecw_srli(cur_hethet, 1) & m1);
          cur_het2hom1 = cur_het2hom1 - (vecw_srli(cur_het2hom1, 1) & m1);
          cur_het1hom2 = cur_het1hom2 - (vecw_srli(cur_het1hom2, 1) & m1);
          agg_ibs0 += (cur_ibs0 & m2) + (vecw_srli(cur_ibs0, 2) & m2);
          agg_hethet += (cur_hethet & m2) + (vecw_srli(cur_hethet, 2) & m2);
          agg_het2hom1 += (cur_het2hom1 & m2) + (vecw_srli(cur_het2hom1, 2) & m2);
          agg_het1hom2 += (cur_het1hom2 & m2) + (vecw_srli(cur_het1hom2, 2) & m2);
        }
        // 4-bit -> 8-bit fold, added into the per-pair accumulators.
        acc_ibs0.vw = acc_ibs0.vw + (agg_ibs0 & m4) + (vecw_srli(agg_ibs0, 4) & m4);
        acc_hethet.vw = acc_hethet.vw + (agg_hethet & m4) + (vecw_srli(agg_hethet, 4) & m4);
        acc_het2hom1.vw = acc_het2hom1.vw + (agg_het2hom1 & m4) + (vecw_srli(agg_het2hom1, 4) & m4);
        acc_het1hom2.vw = acc_het1hom2.vw + (agg_het1hom2 & m4) + (vecw_srli(agg_het1hom2, 4) & m4);
      }
      // Final 8-bit -> 16-bit fold, then horizontal sum across lanes.
      const VecW m8 = VCONST_W(kMask00FF);
      acc_ibs0.vw = (acc_ibs0.vw & m8) + (vecw_srli(acc_ibs0.vw, 8) & m8);
      acc_hethet.vw = (acc_hethet.vw & m8) + (vecw_srli(acc_hethet.vw, 8) & m8);
      acc_het2hom1.vw = (acc_het2hom1.vw & m8) + (vecw_srli(acc_het2hom1.vw, 8) & m8);
      acc_het1hom2.vw = (acc_het1hom2.vw & m8) + (vecw_srli(acc_het1hom2.vw, 8) & m8);
      king_counts_iter[kKingOffsetIbs0] += UniVecHsum16(acc_ibs0);
      king_counts_iter[kKingOffsetHethet] += UniVecHsum16(acc_hethet);
      king_counts_iter[kKingOffsetHet2Hom1] += UniVecHsum16(acc_het2hom1);
      king_counts_iter[kKingOffsetHet1Hom2] += UniVecHsum16(acc_het1hom2);
      king_counts_iter = &(king_counts_iter[4]);

      first_hom_iter = &(first_hom_iter[kKingMultiplexVecs]);
      first_ref2het_iter = &(first_ref2het_iter[kKingMultiplexVecs]);
    }
  }
}
1090
// Vectorized fallback version of IncrKingHomhom: identical structure to the
// fallback IncrKing above, plus a fifth (homhom) accumulator; each pair
// occupies 5 uint32s.
void IncrKingHomhom(const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts_iter) {
  // Bit-slicing masks for the 1->2, 2->4, and 4->8 bit reduction stages.
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  for (uint32_t second_idx = start_idx; second_idx != end_idx; ++second_idx) {
    // technically overflows for huge sample_ct
    const uint32_t second_offset = second_idx * kKingMultiplexWords;
    const VecW* second_hom = R_CAST(const VecW*, &(smaj_hom[second_offset]));
    const VecW* second_ref2het = R_CAST(const VecW*, &(smaj_ref2het[second_offset]));
    const VecW* first_hom_iter = R_CAST(const VecW*, smaj_hom);
    const VecW* first_ref2het_iter = R_CAST(const VecW*, smaj_ref2het);
    // Iterate over all 'first' rows preceding the 'second' row.
    while (first_hom_iter < second_hom) {
      UniVec acc_homhom;
      UniVec acc_ibs0;
      UniVec acc_hethet;
      UniVec acc_het2hom1;
      UniVec acc_het1hom2;
      acc_homhom.vw = vecw_setzero();
      acc_ibs0.vw = vecw_setzero();
      acc_hethet.vw = vecw_setzero();
      acc_het2hom1.vw = vecw_setzero();
      acc_het1hom2.vw = vecw_setzero();
      for (uint32_t vec_idx = 0; vec_idx < kKingMultiplexVecs; vec_idx += 3) {
        VecW hom1 = first_hom_iter[vec_idx];
        VecW hom2 = second_hom[vec_idx];
        VecW ref2het1 = first_ref2het_iter[vec_idx];
        VecW ref2het2 = second_ref2het[vec_idx];
        VecW agg_homhom = hom1 & hom2;
        VecW het1 = vecw_and_notfirst(hom1, ref2het1);
        VecW het2 = vecw_and_notfirst(hom2, ref2het2);
        VecW agg_ibs0 = (ref2het1 ^ ref2het2) & agg_homhom;
        VecW agg_hethet = het1 & het2;
        VecW agg_het2hom1 = hom1 & het2;
        VecW agg_het1hom2 = hom2 & het1;
        // 1-bit -> 2-bit partial sums.
        agg_homhom = agg_homhom - (vecw_srli(agg_homhom, 1) & m1);
        agg_ibs0 = agg_ibs0 - (vecw_srli(agg_ibs0, 1) & m1);
        agg_hethet = agg_hethet - (vecw_srli(agg_hethet, 1) & m1);
        agg_het2hom1 = agg_het2hom1 - (vecw_srli(agg_het2hom1, 1) & m1);
        agg_het1hom2 = agg_het1hom2 - (vecw_srli(agg_het1hom2, 1) & m1);
        // 2-bit -> 4-bit partial sums.
        agg_homhom = (agg_homhom & m2) + (vecw_srli(agg_homhom, 2) & m2);
        agg_ibs0 = (agg_ibs0 & m2) + (vecw_srli(agg_ibs0, 2) & m2);
        agg_hethet = (agg_hethet & m2) + (vecw_srli(agg_hethet, 2) & m2);
        agg_het2hom1 = (agg_het2hom1 & m2) + (vecw_srli(agg_het2hom1, 2) & m2);
        agg_het1hom2 = (agg_het1hom2 & m2) + (vecw_srli(agg_het1hom2, 2) & m2);

        // Merge the next two vectors' 4-bit partial sums before the 4->8 fold.
        for (uint32_t offset = 1; offset != 3; ++offset) {
          hom1 = first_hom_iter[vec_idx + offset];
          hom2 = second_hom[vec_idx + offset];
          ref2het1 = first_ref2het_iter[vec_idx + offset];
          ref2het2 = second_ref2het[vec_idx + offset];
          VecW cur_homhom = hom1 & hom2;
          het1 = vecw_and_notfirst(hom1, ref2het1);
          het2 = vecw_and_notfirst(hom2, ref2het2);
          VecW cur_ibs0 = (ref2het1 ^ ref2het2) & cur_homhom;
          VecW cur_hethet = het1 & het2;
          VecW cur_het2hom1 = hom1 & het2;
          VecW cur_het1hom2 = hom2 & het1;
          cur_homhom = cur_homhom - (vecw_srli(cur_homhom, 1) & m1);
          cur_ibs0 = cur_ibs0 - (vecw_srli(cur_ibs0, 1) & m1);
          cur_hethet = cur_hethet - (vecw_srli(cur_hethet, 1) & m1);
          cur_het2hom1 = cur_het2hom1 - (vecw_srli(cur_het2hom1, 1) & m1);
          cur_het1hom2 = cur_het1hom2 - (vecw_srli(cur_het1hom2, 1) & m1);
          agg_homhom += (cur_homhom & m2) + (vecw_srli(cur_homhom, 2) & m2);
          agg_ibs0 += (cur_ibs0 & m2) + (vecw_srli(cur_ibs0, 2) & m2);
          agg_hethet += (cur_hethet & m2) + (vecw_srli(cur_hethet, 2) & m2);
          agg_het2hom1 += (cur_het2hom1 & m2) + (vecw_srli(cur_het2hom1, 2) & m2);
          agg_het1hom2 += (cur_het1hom2 & m2) + (vecw_srli(cur_het1hom2, 2) & m2);
        }
        // 4-bit -> 8-bit fold into the per-pair accumulators.
        acc_homhom.vw = acc_homhom.vw + (agg_homhom & m4) + (vecw_srli(agg_homhom, 4) & m4);
        acc_ibs0.vw = acc_ibs0.vw + (agg_ibs0 & m4) + (vecw_srli(agg_ibs0, 4) & m4);
        acc_hethet.vw = acc_hethet.vw + (agg_hethet & m4) + (vecw_srli(agg_hethet, 4) & m4);
        acc_het2hom1.vw = acc_het2hom1.vw + (agg_het2hom1 & m4) + (vecw_srli(agg_het2hom1, 4) & m4);
        acc_het1hom2.vw = acc_het1hom2.vw + (agg_het1hom2 & m4) + (vecw_srli(agg_het1hom2, 4) & m4);
      }
      // Final 8-bit -> 16-bit fold, then horizontal sum across lanes.
      const VecW m8 = VCONST_W(kMask00FF);
      acc_homhom.vw = (acc_homhom.vw & m8) + (vecw_srli(acc_homhom.vw, 8) & m8);
      acc_ibs0.vw = (acc_ibs0.vw & m8) + (vecw_srli(acc_ibs0.vw, 8) & m8);
      acc_hethet.vw = (acc_hethet.vw & m8) + (vecw_srli(acc_hethet.vw, 8) & m8);
      acc_het2hom1.vw = (acc_het2hom1.vw & m8) + (vecw_srli(acc_het2hom1.vw, 8) & m8);
      acc_het1hom2.vw = (acc_het1hom2.vw & m8) + (vecw_srli(acc_het1hom2.vw, 8) & m8);
      king_counts_iter[kKingOffsetIbs0] += UniVecHsum16(acc_ibs0);
      king_counts_iter[kKingOffsetHethet] += UniVecHsum16(acc_hethet);
      king_counts_iter[kKingOffsetHet2Hom1] += UniVecHsum16(acc_het2hom1);
      king_counts_iter[kKingOffsetHet1Hom2] += UniVecHsum16(acc_het1hom2);
      king_counts_iter[kKingOffsetHomhom] += UniVecHsum16(acc_homhom);
      king_counts_iter = &(king_counts_iter[5]);

      first_hom_iter = &(first_hom_iter[kKingMultiplexVecs]);
      first_ref2het_iter = &(first_ref2het_iter[kKingMultiplexVecs]);
    }
  }
}
1183 #endif
1184 static_assert(!(kKingMultiplexWords % 2), "kKingMultiplexWords must be even for safe bit-transpose.");
1185
// Shared context for the dense KING-robust worker threads.
typedef struct CalcKingDenseCtxStruct {
  // Sample-major hom / ref2het bitmatrices, double-buffered: index is the
  // 'parity' toggled each block in CalcKingDenseThread.
  uintptr_t* smaj_hom[2];
  uintptr_t* smaj_ref2het[2];
  // Nonzero -> 5 counters (including homhom) per sample pair, else 4.
  uint32_t homhom_needed;

  // thread_start[tidx]..thread_start[tidx+1] is the row range owned by thread
  // tidx; thread_start[0] is the first row held in memory.
  uint32_t* thread_start;

  // Flattened lower-triangle counter array, 4 or 5 uint32s per pair.
  uint32_t* king_counts;
} CalcKingDenseCtx;
1195
CalcKingDenseThread(void * raw_arg)1196 THREAD_FUNC_DECL CalcKingDenseThread(void* raw_arg) {
1197 ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
1198 const uintptr_t tidx = arg->tidx;
1199 CalcKingDenseCtx* ctx = S_CAST(CalcKingDenseCtx*, arg->sharedp->context);
1200
1201 const uint64_t mem_start_idx = ctx->thread_start[0];
1202 const uint64_t start_idx = ctx->thread_start[tidx];
1203 const uint32_t end_idx = ctx->thread_start[tidx + 1];
1204 const uint32_t homhom_needed = ctx->homhom_needed;
1205 uint32_t parity = 0;
1206 do {
1207 if (homhom_needed) {
1208 IncrKingHomhom(ctx->smaj_hom[parity], ctx->smaj_ref2het[parity], start_idx, end_idx, &(ctx->king_counts[((start_idx * (start_idx - 1) - mem_start_idx * (mem_start_idx - 1)) / 2) * 5]));
1209 } else {
1210 IncrKing(ctx->smaj_hom[parity], ctx->smaj_ref2het[parity], start_idx, end_idx, &(ctx->king_counts[(start_idx * (start_idx - 1) - mem_start_idx * (mem_start_idx - 1)) * 2]));
1211 }
1212 parity = 1 - parity;
1213 } while (!THREAD_BLOCK_FINISH(arg));
1214 THREAD_RETURN;
1215 }
1216
1217 /*
1218 double ComputeKinship(const uint32_t* king_counts_entry) {
1219 const uint32_t ibs0_ct = king_counts_entry[kKingOffsetIbs0];
1220 const uint32_t hethet_ct = king_counts_entry[kKingOffsetHethet];
1221 const uint32_t het2hom1_ct = king_counts_entry[kKingOffsetHet2Hom1];
1222 const uint32_t het1hom2_ct = king_counts_entry[kKingOffsetHet1Hom2];
1223 const intptr_t smaller_het_ct = hethet_ct + MINV(het1hom2_ct, het2hom1_ct);
1224 return 0.5 - (S_CAST(double, 4 * S_CAST(intptr_t, ibs0_ct) + het1hom2_ct + het2hom1_ct) / S_CAST(double, 4 * smaller_het_ct));
1225 }
1226 */
1227
1228 // '2' refers to the larger index here
ComputeKinship(const uint32_t * king_counts_entry,uint32_t singleton_het1_ct,uint32_t singleton_hom1_ct,uint32_t singleton_het2_ct,uint32_t singleton_hom2_ct)1229 double ComputeKinship(const uint32_t* king_counts_entry, uint32_t singleton_het1_ct, uint32_t singleton_hom1_ct, uint32_t singleton_het2_ct, uint32_t singleton_hom2_ct) {
1230 const uint32_t ibs0_ct = king_counts_entry[kKingOffsetIbs0] + singleton_hom1_ct + singleton_hom2_ct;
1231 const uint32_t hethet_ct = king_counts_entry[kKingOffsetHethet];
1232 const uint32_t het2hom1_ct = king_counts_entry[kKingOffsetHet2Hom1] + singleton_het2_ct;
1233 const uint32_t het1hom2_ct = king_counts_entry[kKingOffsetHet1Hom2] + singleton_het1_ct;
1234 const intptr_t smaller_het_ct = hethet_ct + MINV(het1hom2_ct, het2hom1_ct);
1235 return 0.5 - (S_CAST(double, 4 * S_CAST(intptr_t, ibs0_ct) + het1hom2_ct + het2hom1_ct) / S_CAST(double, 4 * smaller_het_ct));
1236 }
1237
1238 // could also return pointer to end?
SetKingMatrixFname(KingFlags king_flags,uint32_t parallel_idx,uint32_t parallel_tot,char * outname_end)1239 void SetKingMatrixFname(KingFlags king_flags, uint32_t parallel_idx, uint32_t parallel_tot, char* outname_end) {
1240 if (!(king_flags & (kfKingMatrixBin | kfKingMatrixBin4))) {
1241 char* outname_end2 = strcpya_k(outname_end, ".king");
1242 const uint32_t output_zst = king_flags & kfKingMatrixZs;
1243 if (parallel_tot != 1) {
1244 *outname_end2++ = '.';
1245 outname_end2 = u32toa(parallel_idx + 1, outname_end2);
1246 }
1247 if (output_zst) {
1248 outname_end2 = strcpya_k(outname_end2, ".zst");
1249 }
1250 *outname_end2 = '\0';
1251 return;
1252 }
1253 char* outname_end2 = strcpya_k(outname_end, ".king.bin");
1254 if (parallel_tot != 1) {
1255 *outname_end2++ = '.';
1256 outname_end2 = u32toa(parallel_idx + 1, outname_end2);
1257 }
1258 *outname_end2 = '\0';
1259 }
1260
SetKingTableFname(KingFlags king_flags,uint32_t parallel_idx,uint32_t parallel_tot,char * outname_end)1261 void SetKingTableFname(KingFlags king_flags, uint32_t parallel_idx, uint32_t parallel_tot, char* outname_end) {
1262 char* outname_end2 = strcpya_k(outname_end, ".kin0");
1263 const uint32_t output_zst = king_flags & kfKingTableZs;
1264 if (parallel_tot != 1) {
1265 *outname_end2++ = '.';
1266 outname_end2 = u32toa(parallel_idx + 1, outname_end2);
1267 }
1268 if (output_zst) {
1269 outname_end2 = strcpya_k(outname_end2, ".zst");
1270 }
1271 *outname_end2 = '\0';
1272 }
1273
AppendKingTableHeader(KingFlags king_flags,uint32_t king_col_fid,uint32_t king_col_sid,char * cswritep)1274 char* AppendKingTableHeader(KingFlags king_flags, uint32_t king_col_fid, uint32_t king_col_sid, char* cswritep) {
1275 *cswritep++ = '#';
1276 if (king_flags & kfKingColId) {
1277 if (king_col_fid) {
1278 cswritep = strcpya_k(cswritep, "FID1\t");
1279 }
1280 // Was 'ID1' before alpha 3, but that's inconsistent with other plink2
1281 // commands, and in the meantime the header line still doesn't perfectly
1282 // match KING due to e.g. capitalization.
1283 cswritep = strcpya_k(cswritep, "IID1\t");
1284 if (king_col_sid) {
1285 cswritep = strcpya_k(cswritep, "SID1\t");
1286 }
1287 if (king_col_fid) {
1288 cswritep = strcpya_k(cswritep, "FID2\t");
1289 }
1290 cswritep = strcpya_k(cswritep, "IID2\t");
1291 if (king_col_sid) {
1292 cswritep = strcpya_k(cswritep, "SID2\t");
1293 }
1294 }
1295 if (king_flags & kfKingColNsnp) {
1296 cswritep = strcpya_k(cswritep, "NSNP\t");
1297 }
1298 if (king_flags & kfKingColHethet) {
1299 cswritep = strcpya_k(cswritep, "HETHET\t");
1300 }
1301 if (king_flags & kfKingColIbs0) {
1302 cswritep = strcpya_k(cswritep, "IBS0\t");
1303 }
1304 if (king_flags & kfKingColIbs1) {
1305 cswritep = strcpya_k(cswritep, "HET1_HOM2\tHET2_HOM1\t");
1306 }
1307 if (king_flags & kfKingColKinship) {
1308 cswritep = strcpya_k(cswritep, "KINSHIP\t");
1309 }
1310 DecrAppendBinaryEoln(&cswritep);
1311 return cswritep;
1312 }
1313
// Per-architecture cap used to size the sparse-genotype index buffers (see
// max_sparse_ct / thread_idx_bufs in CalcKing).
uint32_t KingMaxSparseCt(uint32_t row_end_idx) {
#ifdef USE_AVX2
  const uint32_t divisor = 33;
#else
  const uint32_t divisor = 30;
#endif
  return row_end_idx / divisor;
}
1321
CalcKing(const SampleIdInfo * siip,const uintptr_t * variant_include_orig,const ChrInfo * cip,uint32_t raw_sample_ct,uint32_t orig_sample_ct,uint32_t raw_variant_ct,uint32_t variant_ct,double king_cutoff,double king_table_filter,KingFlags king_flags,uint32_t parallel_idx,uint32_t parallel_tot,uint32_t max_thread_ct,uintptr_t pgr_alloc_cacheline_ct,PgenFileInfo * pgfip,PgenReader * simple_pgrp,uintptr_t * sample_include,uint32_t * sample_ct_ptr,char * outname,char * outname_end)1322 PglErr CalcKing(const SampleIdInfo* siip, const uintptr_t* variant_include_orig, const ChrInfo* cip, uint32_t raw_sample_ct, uint32_t orig_sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, double king_cutoff, double king_table_filter, KingFlags king_flags, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, PgenFileInfo* pgfip, PgenReader* simple_pgrp, uintptr_t* sample_include, uint32_t* sample_ct_ptr, char* outname, char* outname_end) {
1323 unsigned char* bigstack_mark = g_bigstack_base;
1324 FILE* outfile = nullptr;
1325 char* cswritep = nullptr;
1326 char* cswritetp = nullptr;
1327 CompressStreamState css;
1328 CompressStreamState csst;
1329 ThreadGroup tg;
1330 PglErr reterr = kPglRetSuccess;
1331 PreinitCstream(&css);
1332 PreinitCstream(&csst);
1333 PreinitThreads(&tg);
1334 {
1335 const KingFlags matrix_shape = king_flags & kfKingMatrixShapemask;
1336 const char* flagname = matrix_shape? "--make-king" : ((king_flags & kfKingColAll)? "--make-king-table" : "--king-cutoff");
1337 if (unlikely(IsSet(cip->haploid_mask, 0))) {
1338 logerrprintf("Error: %s cannot be used on haploid genomes.\n", flagname);
1339 goto CalcKing_ret_INCONSISTENT_INPUT;
1340 }
1341 uint32_t sample_ct = *sample_ct_ptr;
1342 if (unlikely(sample_ct < 2)) {
1343 logerrprintf("Error: %s requires at least 2 samples.\n", flagname);
1344 goto CalcKing_ret_DEGENERATE_DATA;
1345 }
1346 #ifdef __LP64__
1347 // there's also a UINT32_MAX / kKingMultiplexWords limit, but that's not
1348 // relevant for now
1349 if (unlikely(sample_ct > 134000000)) {
1350 // for text output, 134m * 16 is just below kMaxLongLine
1351 logerrprintf("Error: %s does not support > 134000000 samples.\n", flagname);
1352 reterr = kPglRetNotYetSupported;
1353 goto CalcKing_ret_1;
1354 }
1355 #endif
1356 const uintptr_t sample_ctl = BitCtToWordCt(sample_ct);
1357 uintptr_t* kinship_table = nullptr;
1358 if (king_cutoff != -1) {
1359 if (unlikely(bigstack_calloc_w(sample_ct * sample_ctl, &kinship_table))) {
1360 goto CalcKing_ret_NOMEM;
1361 }
1362 }
1363 const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
1364 const uint32_t non_autosomal_variant_ct = CountNonAutosomalVariants(variant_include_orig, cip, 1, 1);
1365 if (non_autosomal_variant_ct) {
1366 uintptr_t* variant_include_next;
1367 if (unlikely(bigstack_alloc_w(raw_variant_ctl, &variant_include_next))) {
1368 goto CalcKing_ret_NOMEM;
1369 }
1370 logprintf("Excluding %u variant%s on non-autosomes from KING-robust calculation.\n", non_autosomal_variant_ct, (non_autosomal_variant_ct == 1)? "" : "s");
1371 variant_ct -= non_autosomal_variant_ct;
1372 if (!variant_ct) {
1373 logerrprintf("Error: No variants remaining for KING-robust calculation.\n");
1374 goto CalcKing_ret_DEGENERATE_DATA;
1375 }
1376 memcpy(variant_include_next, variant_include_orig, raw_variant_ctl * sizeof(intptr_t));
1377 ExcludeNonAutosomalVariants(cip, variant_include_next);
1378 variant_include_orig = variant_include_next;
1379 }
1380 uintptr_t* variant_include;
1381
1382 if (unlikely(
1383 bigstack_alloc_w(raw_variant_ctl, &variant_include))) {
1384 goto CalcKing_ret_NOMEM;
1385 }
1386
1387 uint32_t grand_row_start_idx;
1388 uint32_t grand_row_end_idx;
1389 ParallelBounds(sample_ct, 1, parallel_idx, parallel_tot, R_CAST(int32_t*, &grand_row_start_idx), R_CAST(int32_t*, &grand_row_end_idx));
1390
1391 // possible todo: allow this to change between passes
1392 uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
1393 if (calc_thread_ct > sample_ct / 32) {
1394 calc_thread_ct = sample_ct / 32;
1395 }
1396 if (!calc_thread_ct) {
1397 calc_thread_ct = 1;
1398 }
1399 const uint32_t homhom_needed = (king_flags & kfKingColNsnp) || ((!(king_flags & kfKingCounts)) && (king_flags & (kfKingColHethet | kfKingColIbs0 | kfKingColIbs1)));
1400 CalcKingSparseCtx sparse_ctx;
1401 uint32_t sparse_read_block_size = 0;
1402 STD_ARRAY_DECL(unsigned char*, 2, main_loadbufs);
1403 // These values are now permitted to underflow from sparse-optimization.
1404 // Might want to change them to int32_t*.
1405 uint32_t* singleton_het_cts;
1406 uint32_t* singleton_hom_cts;
1407 uint32_t* singleton_missing_cts;
1408 {
1409 sparse_ctx.variant_include_orig = variant_include_orig;
1410 sparse_ctx.homhom_needed = homhom_needed;
1411 const uint32_t max_sparse_ct = KingMaxSparseCt(grand_row_end_idx);
1412 // Ok for this to be a slight underestimate, since bigstack_left()/8 is
1413 // an arbitrary limit anyway.
1414 const uintptr_t thread_xalloc_cacheline_ct = DivUp((3 * k1LU) * (max_sparse_ct + grand_row_end_idx), kInt32PerCacheline) + ((kPglVblockSize * 2) / kBitsPerCacheline);
1415 if (unlikely(PgenMtLoadInit(variant_include_orig, grand_row_end_idx, variant_ct, bigstack_left() / 8, pgr_alloc_cacheline_ct, thread_xalloc_cacheline_ct, 0, 0, pgfip, &calc_thread_ct, &sparse_ctx.genovecs, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, &sparse_read_block_size, nullptr, main_loadbufs, &sparse_ctx.pgr_ptrs, &sparse_ctx.read_variant_uidx_starts))) {
1416 goto CalcKing_ret_NOMEM;
1417 }
1418 sparse_ctx.read_block_size = sparse_read_block_size;
1419 sparse_ctx.reterr = kPglRetSuccess;
1420 if (unlikely(
1421 bigstack_alloc_u32p(calc_thread_ct, &sparse_ctx.thread_idx_bufs) ||
1422 bigstack_alloc_u32p(calc_thread_ct, &sparse_ctx.thread_singleton_het_cts) ||
1423 bigstack_alloc_u32p(calc_thread_ct, &sparse_ctx.thread_singleton_hom_cts) ||
1424 bigstack_alloc_u32p(calc_thread_ct, &sparse_ctx.thread_singleton_missing_cts) ||
1425 bigstack_alloc_u32(calc_thread_ct, &sparse_ctx.thread_skip_cts) ||
1426 bigstack_alloc_wp(calc_thread_ct, &sparse_ctx.thread_sparse_excludes[0]) ||
1427 bigstack_alloc_wp(calc_thread_ct, &sparse_ctx.thread_sparse_excludes[1]))) {
1428 goto CalcKing_ret_NOMEM;
1429 }
1430 const uint32_t read_block_sizel = sparse_read_block_size / kBitsPerWord;
1431 for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
1432 if (unlikely(
1433 bigstack_alloc_u32(3 * max_sparse_ct, &(sparse_ctx.thread_idx_bufs[tidx])) ||
1434 bigstack_alloc_u32(grand_row_end_idx, &(sparse_ctx.thread_singleton_het_cts[tidx])) ||
1435 bigstack_alloc_u32(grand_row_end_idx, &(sparse_ctx.thread_singleton_hom_cts[tidx])) ||
1436 bigstack_alloc_u32(grand_row_end_idx, &(sparse_ctx.thread_singleton_missing_cts[tidx])) ||
1437 bigstack_alloc_w(read_block_sizel, &(sparse_ctx.thread_sparse_excludes[0][tidx])) ||
1438 bigstack_alloc_w(read_block_sizel, &(sparse_ctx.thread_sparse_excludes[1][tidx])))) {
1439 goto CalcKing_ret_NOMEM;
1440 }
1441 }
1442 singleton_het_cts = sparse_ctx.thread_singleton_het_cts[0];
1443 singleton_hom_cts = sparse_ctx.thread_singleton_hom_cts[0];
1444 singleton_missing_cts = sparse_ctx.thread_singleton_missing_cts[0];
1445 }
1446
1447 CalcKingDenseCtx dense_ctx;
1448 if (unlikely(
1449 SetThreadCt(calc_thread_ct, &tg) ||
1450 bigstack_alloc_u32(calc_thread_ct + 1, &dense_ctx.thread_start))) {
1451 goto CalcKing_ret_NOMEM;
1452 }
1453 const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
1454 const uint32_t grei_ctaw = BitCtToAlignedWordCt(grand_row_end_idx);
1455 const uint32_t grei_ctaw2 = NypCtToAlignedWordCt(grand_row_end_idx);
1456 dense_ctx.homhom_needed = homhom_needed;
1457 const uint32_t king_bufsizew = kKingMultiplexWords * grand_row_end_idx;
1458 const uint32_t homhom_needed_p4 = dense_ctx.homhom_needed + 4;
1459 uintptr_t* cur_sample_include;
1460 uint32_t* sample_include_cumulative_popcounts;
1461 uintptr_t* loadbuf;
1462 uintptr_t* splitbuf_hom;
1463 uintptr_t* splitbuf_ref2het;
1464 VecW* vecaligned_buf;
1465 if (unlikely(
1466 bigstack_alloc_w(raw_sample_ctl, &cur_sample_include) ||
1467 bigstack_alloc_u32(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
1468 bigstack_alloc_w(grei_ctaw2, &loadbuf) ||
1469 bigstack_alloc_w(kPglBitTransposeBatch * grei_ctaw, &splitbuf_hom) ||
1470 bigstack_alloc_w(kPglBitTransposeBatch * grei_ctaw, &splitbuf_ref2het) ||
1471 bigstack_alloc_w(king_bufsizew, &(dense_ctx.smaj_hom[0])) ||
1472 bigstack_alloc_w(king_bufsizew, &(dense_ctx.smaj_ref2het[0])) ||
1473 bigstack_alloc_w(king_bufsizew, &(dense_ctx.smaj_hom[1])) ||
1474 bigstack_alloc_w(king_bufsizew, &(dense_ctx.smaj_ref2het[1])) ||
1475 bigstack_alloc_v(kPglBitTransposeBufvecs, &vecaligned_buf))) {
1476 goto CalcKing_ret_NOMEM;
1477 }
1478
1479 // Make this automatically multipass when there's insufficient memory. So
1480 // we open the output file(s) here, and just append in the main loop.
1481 unsigned char* numbuf = nullptr;
1482 if (matrix_shape) {
1483 SetKingMatrixFname(king_flags, parallel_idx, parallel_tot, outname_end);
1484 if (!(king_flags & (kfKingMatrixBin | kfKingMatrixBin4))) {
1485 // text matrix
1486 // won't be >2gb since sample_ct <= 134m
1487 const uint32_t overflow_buf_size = kCompressStreamBlock + 16 * sample_ct;
1488 reterr = InitCstreamAlloc(outname, 0, king_flags & kfKingMatrixZs, max_thread_ct, overflow_buf_size, &css, &cswritep);
1489 if (unlikely(reterr)) {
1490 goto CalcKing_ret_1;
1491 }
1492 } else {
1493 if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
1494 goto CalcKing_ret_OPEN_FAIL;
1495 }
1496 if (unlikely(bigstack_alloc_uc(sample_ct * 4 * (2 - ((king_flags / kfKingMatrixBin4) & 1)), &numbuf))) {
1497 goto CalcKing_ret_OPEN_FAIL;
1498 }
1499 }
1500 }
1501 uint32_t king_col_fid = 0;
1502 uint32_t king_col_sid = 0;
1503 uintptr_t max_sample_fmtid_blen = 0;
1504 char* collapsed_sample_fmtids = nullptr;
1505 if (king_flags & kfKingColAll) {
1506 const uint32_t overflow_buf_size = kCompressStreamBlock + kMaxMediumLine;
1507 SetKingTableFname(king_flags, parallel_idx, parallel_tot, outname_end);
1508 reterr = InitCstreamAlloc(outname, 0, king_flags & kfKingTableZs, max_thread_ct, overflow_buf_size, &csst, &cswritetp);
1509 if (unlikely(reterr)) {
1510 goto CalcKing_ret_1;
1511 }
1512
1513 king_col_fid = FidColIsRequired(siip, king_flags / kfKingColMaybefid);
1514 king_col_sid = SidColIsRequired(siip->sids, king_flags / kfKingColMaybesid);
1515 if (!parallel_idx) {
1516 cswritetp = AppendKingTableHeader(king_flags, king_col_fid, king_col_sid, cswritetp);
1517 }
1518 if (unlikely(CollapsedSampleFmtidInitAlloc(sample_include, siip, grand_row_end_idx, king_col_fid, king_col_sid, &collapsed_sample_fmtids, &max_sample_fmtid_blen))) {
1519 goto CalcKing_ret_NOMEM;
1520 }
1521 }
1522 uint64_t king_table_filter_ct = 0;
1523 const uintptr_t cells_avail = bigstack_left() / (sizeof(int32_t) * homhom_needed_p4);
1524 const uint32_t pass_ct = CountTrianglePasses(grand_row_start_idx, grand_row_end_idx, 1, cells_avail);
1525 if (unlikely(!pass_ct)) {
1526 goto CalcKing_ret_NOMEM;
1527 }
1528 if (unlikely((pass_ct > 1) && (king_flags & kfKingMatrixSq))) {
1529 logerrputs("Insufficient memory for --make-king square output. Try square0 or triangle\nshape instead.\n");
1530 goto CalcKing_ret_NOMEM;
1531 }
1532 uint32_t row_end_idx = grand_row_start_idx;
1533 sparse_ctx.king_counts = R_CAST(uint32_t*, g_bigstack_base);
1534 dense_ctx.king_counts = sparse_ctx.king_counts;
1535 for (uint32_t pass_idx_p1 = 1; pass_idx_p1 <= pass_ct; ++pass_idx_p1) {
1536 const uint32_t row_start_idx = row_end_idx;
1537 row_end_idx = NextTrianglePass(row_start_idx, grand_row_end_idx, 1, cells_avail);
1538 TriangleLoadBalance(calc_thread_ct, row_start_idx, row_end_idx, 1, dense_ctx.thread_start);
1539 memcpy(cur_sample_include, sample_include, raw_sample_ctl * sizeof(intptr_t));
1540     // bugfix (20 Nov 2019): forgot that --parallel could cause the old
1541     // row_end_idx != grand_row_end_idx comparison to not work
1542 if (row_end_idx != orig_sample_ct) {
1543 uint32_t sample_uidx_end = IdxToUidxBasic(sample_include, row_end_idx);
1544 ClearBitsNz(sample_uidx_end, raw_sample_ct, cur_sample_include);
1545 }
1546 FillCumulativePopcounts(cur_sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
1547 pgfip->block_base = main_loadbufs[0]; // needed after first pass
1548 // Update (9 Nov 2019): The one-time singleton/monomorphic scan has been
1549 // replaced with a more effective sparse-variant scan which happens on
1550 // every pass.
1551 sparse_ctx.sample_include = cur_sample_include;
1552 sparse_ctx.sample_include_cumulative_popcounts = sample_include_cumulative_popcounts;
1553 sparse_ctx.row_start_idx = row_start_idx;
1554 sparse_ctx.row_end_idx = row_end_idx;
1555 sparse_ctx.max_sparse_ct = KingMaxSparseCt(row_end_idx);
1556 logprintf("%s pass %u/%u: Scanning for rare variants... ", flagname, pass_idx_p1, pass_ct);
1557 fputs("0%", stdout);
1558 fflush(stdout);
1559 SetThreadFuncAndData(CalcKingSparseThread, &sparse_ctx, &tg);
1560 if (unlikely(SpawnThreads(&tg))) {
1561 goto CalcKing_ret_THREAD_CREATE_FAIL;
1562 }
1563 memcpy(variant_include, variant_include_orig, raw_variant_ctl * sizeof(intptr_t));
1564
1565 {
1566 const uint32_t read_block_sizel = sparse_read_block_size / kBitsPerWord;
1567 uint32_t prev_read_block_idx = 0;
1568 uint32_t read_block_idx = 0;
1569 uint32_t pct = 0;
1570 uint32_t next_print_variant_idx = variant_ct / 100;
1571 uint32_t parity = 0;
1572 for (uint32_t variant_idx = 0; ; ) {
1573 const uint32_t cur_block_size = MultireadNonempty(variant_include_orig, &tg, raw_variant_ct, sparse_read_block_size, pgfip, &read_block_idx, &reterr);
1574 if (unlikely(reterr)) {
1575 goto CalcKing_ret_PGR_FAIL;
1576 }
1577 JoinThreads(&tg);
1578 reterr = sparse_ctx.reterr;
1579 if (unlikely(reterr)) {
1580 goto CalcKing_ret_PGR_FAIL;
1581 }
1582 if (!IsLastBlock(&tg)) {
1583 sparse_ctx.cur_block_size = cur_block_size;
1584 ComputeUidxStartPartition(variant_include_orig, cur_block_size, calc_thread_ct, read_block_idx * sparse_read_block_size, sparse_ctx.read_variant_uidx_starts);
1585 PgrCopyBaseAndOffset(pgfip, calc_thread_ct, sparse_ctx.pgr_ptrs);
1586 if (variant_idx + cur_block_size == variant_ct) {
1587 DeclareLastThreadBlock(&tg);
1588 }
1589 SpawnThreads(&tg);
1590 }
1591 parity = 1 - parity;
1592 if (variant_idx) {
1593 uintptr_t* variant_include_update = &(variant_include[prev_read_block_idx * read_block_sizel]);
1594 for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
1595 BitvecInvmask(sparse_ctx.thread_sparse_excludes[parity][tidx], read_block_sizel, variant_include_update);
1596 }
1597 if (variant_idx == variant_ct) {
1598 break;
1599 }
1600 if (variant_idx >= next_print_variant_idx) {
1601 if (pct > 10) {
1602 putc_unlocked('\b', stdout);
1603 }
1604 pct = (variant_idx * 100LLU) / variant_ct;
1605 printf("\b\b%u%%", pct++);
1606 fflush(stdout);
1607 next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
1608 }
1609 }
1610 prev_read_block_idx = read_block_idx;
1611 ++read_block_idx;
1612 variant_idx += cur_block_size;
1613 pgfip->block_base = main_loadbufs[parity];
1614 }
1615 if (pct > 10) {
1616 putc_unlocked('\b', stdout);
1617 }
1618 }
1619 fputs("\b\b", stdout);
1620 logputs("done.\n");
1621 const uint32_t cur_variant_ct = PopcountWords(variant_include, raw_variant_ctl);
1622 uint32_t sparse_variant_ct = variant_ct - cur_variant_ct;
1623 logprintf("%u variant%s handled by initial scan (%u remaining).\n", sparse_variant_ct, (sparse_variant_ct == 1)? "" : "s", cur_variant_ct);
1624 uint32_t skip_ct = sparse_ctx.thread_skip_cts[0];
1625 const uint32_t vec_ct = DivUp(row_end_idx, kInt32PerVec);
1626 for (uint32_t tidx = 1; tidx != calc_thread_ct; ++tidx) {
1627 U32CastVecAdd(sparse_ctx.thread_singleton_het_cts[tidx], vec_ct, singleton_het_cts);
1628 U32CastVecAdd(sparse_ctx.thread_singleton_hom_cts[tidx], vec_ct, singleton_hom_cts);
1629 U32CastVecAdd(sparse_ctx.thread_singleton_missing_cts[tidx], vec_ct, singleton_missing_cts);
1630 skip_ct += sparse_ctx.thread_skip_cts[tidx];
1631 }
1632 sparse_variant_ct -= skip_ct;
1633 if (cur_variant_ct) {
1634 SetThreadFuncAndData(CalcKingDenseThread, &dense_ctx, &tg);
1635 const uint32_t row_end_idxaw = BitCtToAlignedWordCt(row_end_idx);
1636 const uint32_t row_end_idxaw2 = NypCtToAlignedWordCt(row_end_idx);
1637 if (row_end_idxaw % 2) {
1638 const uint32_t cur_king_bufsizew = kKingMultiplexWords * row_end_idx;
1639 uintptr_t* smaj_hom0_last = &(dense_ctx.smaj_hom[0][kKingMultiplexWords - 1]);
1640 uintptr_t* smaj_ref2het0_last = &(dense_ctx.smaj_ref2het[0][kKingMultiplexWords - 1]);
1641 uintptr_t* smaj_hom1_last = &(dense_ctx.smaj_hom[1][kKingMultiplexWords - 1]);
1642 uintptr_t* smaj_ref2het1_last = &(dense_ctx.smaj_ref2het[1][kKingMultiplexWords - 1]);
1643 for (uint32_t offset = 0; offset < cur_king_bufsizew; offset += kKingMultiplexWords) {
1644 smaj_hom0_last[offset] = 0;
1645 smaj_ref2het0_last[offset] = 0;
1646 smaj_hom1_last[offset] = 0;
1647 smaj_ref2het1_last[offset] = 0;
1648 }
1649 }
1650 uintptr_t variant_uidx_base = 0;
1651 uintptr_t cur_bits = variant_include[0];
1652 uint32_t variants_completed = 0;
1653 uint32_t parity = 0;
1654 const uint32_t sample_batch_ct_m1 = (row_end_idx - 1) / kPglBitTransposeBatch;
1655 // Similar to plink 1.9 --genome. For each pair of samples S1-S2, we
1656 // need to determine counts of the following:
1657 // * S1 hom-S2 opposite hom
1658 // * het-het
1659 // * S1 hom-S2 het
1660 // * S2 hom-S1 het
1661 // * sometimes S1 hom-S2 same hom
1662 // * (nonmissing determined via subtraction)
1663 // We handle this as follows:
1664 // 1. set n=0, reader thread loads first kKingMultiplex variants and
1665 // converts+transposes the data to a sample-major format suitable
1666 // for multithreaded computation.
1667 // 2. spawn threads
1668 //
1669 // 3. increment n by 1
1670 // 4. load block n unless eof
1671 // 5. permit threads to continue to next block, unless eof
1672 // 6. goto step 3 unless eof
1673 //
1674 // 7. write results
1675 // Results are always reported in lower-triangular order, rather than
1676 // KING's upper-triangular order, since the former plays more nicely
1677 // with incremental addition of samples.
1678 PgrSampleSubsetIndex pssi;
1679 PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, simple_pgrp, &pssi);
1680 do {
1681 const uint32_t cur_block_size = MINV(cur_variant_ct - variants_completed, kKingMultiplex);
1682 uintptr_t* cur_smaj_hom = dense_ctx.smaj_hom[parity];
1683 uintptr_t* cur_smaj_ref2het = dense_ctx.smaj_ref2het[parity];
1684 // "block" = distance computation granularity, usually 1024 or 1536
1685 // variants
1686 // "batch" = variant-major-to-sample-major transpose granularity,
1687 // currently 512 variants
1688 uint32_t variant_batch_size = kPglBitTransposeBatch;
1689 uint32_t variant_batch_size_rounded_up = kPglBitTransposeBatch;
1690 const uint32_t write_batch_ct_m1 = (cur_block_size - 1) / kPglBitTransposeBatch;
1691 for (uint32_t write_batch_idx = 0; ; ++write_batch_idx) {
1692 if (write_batch_idx >= write_batch_ct_m1) {
1693 if (write_batch_idx > write_batch_ct_m1) {
1694 break;
1695 }
1696 variant_batch_size = ModNz(cur_block_size, kPglBitTransposeBatch);
1697 variant_batch_size_rounded_up = variant_batch_size;
1698 const uint32_t variant_batch_size_rem = variant_batch_size % kBitsPerWord;
1699 if (variant_batch_size_rem) {
1700 const uint32_t trailing_variant_ct = kBitsPerWord - variant_batch_size_rem;
1701 variant_batch_size_rounded_up += trailing_variant_ct;
1702 ZeroWArr(trailing_variant_ct * row_end_idxaw, &(splitbuf_hom[variant_batch_size * row_end_idxaw]));
1703 ZeroWArr(trailing_variant_ct * row_end_idxaw, &(splitbuf_ref2het[variant_batch_size * row_end_idxaw]));
1704 }
1705 }
1706 uintptr_t* hom_iter = splitbuf_hom;
1707 uintptr_t* ref2het_iter = splitbuf_ref2het;
1708 for (uint32_t uii = 0; uii != variant_batch_size; ++uii) {
1709 const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
1710 // Model does not cleanly generalize to multiallelic variants
1711 // (unless there's something I overlooked, which is quite
1712 // possible).
1713 // Thought about using major allele counts in that case, but that
1714 // sacrifices a really nice property of this method: estimated
1715 // relationship coefficient between each pair of samples is
1716 // independent of estimated allele frequencies. And the accuracy
1717 // improvement we'd get in return is microscopic. So we stick to
1718 // REF/ALT allele counts instead.
1719 reterr = PgrGet(cur_sample_include, pssi, row_end_idx, variant_uidx, simple_pgrp, loadbuf);
1720 if (unlikely(reterr)) {
1721 goto CalcKing_ret_PGR_FAIL;
1722 }
1723 SetTrailingNyps(row_end_idx, loadbuf);
1724 SplitHomRef2hetUnsafeW(loadbuf, row_end_idxaw2, hom_iter, ref2het_iter);
1725 hom_iter = &(hom_iter[row_end_idxaw]);
1726 ref2het_iter = &(ref2het_iter[row_end_idxaw]);
1727 }
1728 // uintptr_t* read_iter = loadbuf;
1729 uintptr_t* write_hom_iter = &(cur_smaj_hom[write_batch_idx * kPglBitTransposeWords]);
1730 uintptr_t* write_ref2het_iter = &(cur_smaj_ref2het[write_batch_idx * kPglBitTransposeWords]);
1731 uint32_t write_batch_size = kPglBitTransposeBatch;
1732 for (uint32_t sample_batch_idx = 0; ; ++sample_batch_idx) {
1733 if (sample_batch_idx >= sample_batch_ct_m1) {
1734 if (sample_batch_idx > sample_batch_ct_m1) {
1735 break;
1736 }
1737 write_batch_size = ModNz(row_end_idx, kPglBitTransposeBatch);
1738 }
1739 // bugfix: read_batch_size must be rounded up to word boundary,
1740 // since we want to one-out instead of zero-out the trailing bits
1741 //
1742 // bugfix: if we always use kPglBitTransposeBatch instead of
1743 // variant_batch_size_rounded_up, we read/write past the
1744 // kKingMultiplex limit and clobber the first variants of the
1745 // next sample with garbage.
1746 TransposeBitblock(&(splitbuf_hom[sample_batch_idx * kPglBitTransposeWords]), row_end_idxaw, kKingMultiplexWords, variant_batch_size_rounded_up, write_batch_size, write_hom_iter, vecaligned_buf);
1747 TransposeBitblock(&(splitbuf_ref2het[sample_batch_idx * kPglBitTransposeWords]), row_end_idxaw, kKingMultiplexWords, variant_batch_size_rounded_up, write_batch_size, write_ref2het_iter, vecaligned_buf);
1748 write_hom_iter = &(write_hom_iter[kKingMultiplex * kPglBitTransposeWords]);
1749 write_ref2het_iter = &(write_ref2het_iter[kKingMultiplex * kPglBitTransposeWords]);
1750 }
1751 }
1752 const uint32_t cur_block_sizew = BitCtToWordCt(cur_block_size);
1753 if (cur_block_sizew < kKingMultiplexWords) {
1754 uintptr_t* write_hom_iter = &(cur_smaj_hom[cur_block_sizew]);
1755 uintptr_t* write_ref2het_iter = &(cur_smaj_ref2het[cur_block_sizew]);
1756 const uint32_t write_word_ct = kKingMultiplexWords - cur_block_sizew;
1757 for (uint32_t sample_idx = 0; sample_idx != row_end_idx; ++sample_idx) {
1758 ZeroWArr(write_word_ct, write_hom_iter);
1759 ZeroWArr(write_word_ct, write_ref2het_iter);
1760 write_hom_iter = &(write_hom_iter[kKingMultiplexWords]);
1761 write_ref2het_iter = &(write_ref2het_iter[kKingMultiplexWords]);
1762 }
1763 }
1764 if (variants_completed) {
1765 JoinThreads(&tg);
1766 // CalcKingThread() never errors out
1767 }
1768 // this update must occur after JoinThreads() call
1769 if (variants_completed + cur_block_size == cur_variant_ct) {
1770 DeclareLastThreadBlock(&tg);
1771 }
1772 if (unlikely(SpawnThreads(&tg))) {
1773 goto CalcKing_ret_THREAD_CREATE_FAIL;
1774 }
1775 printf("\r%s pass %u/%u: %u variants complete.", flagname, pass_idx_p1, pass_ct, variants_completed);
1776 fflush(stdout);
1777 variants_completed += cur_block_size;
1778 parity = 1 - parity;
1779 } while (!IsLastBlock(&tg));
1780 JoinThreads(&tg);
1781 }
1782 if (matrix_shape || (king_flags & kfKingColAll)) {
1783 printf("\r%s pass %u/%u: Writing... \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", flagname, pass_idx_p1, pass_ct);
1784 fflush(stdout);
1785 // allow simultaneous --make-king + --make-king-table
1786 if (matrix_shape) {
1787 if (!(king_flags & (kfKingMatrixBin | kfKingMatrixBin4))) {
1788 const uint32_t is_squarex = king_flags & (kfKingMatrixSq | kfKingMatrixSq0);
1789 const uint32_t is_square0 = king_flags & kfKingMatrixSq0;
1790 uint32_t* results_iter = dense_ctx.king_counts;
1791 uint32_t sample_idx1 = row_start_idx;
1792 if (is_squarex && (!parallel_idx) && (pass_idx_p1)) {
1793 // dump "empty" first row
1794 sample_idx1 = 0;
1795 }
1796 for (; sample_idx1 != row_end_idx; ++sample_idx1) {
1797 const uint32_t singleton_het1_ct = singleton_het_cts[sample_idx1];
1798 const uint32_t singleton_hom1_ct = singleton_hom_cts[sample_idx1];
1799 for (uint32_t sample_idx2 = 0; sample_idx2 < sample_idx1; ++sample_idx2) {
1800 const double kinship_coeff = ComputeKinship(results_iter, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2], singleton_het1_ct, singleton_hom1_ct);
1801 if (kinship_table && (kinship_coeff > king_cutoff)) {
1802 SetBit(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
1803 SetBit(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
1804 }
1805 cswritep = dtoa_g(kinship_coeff, cswritep);
1806 *cswritep++ = '\t';
1807 results_iter = &(results_iter[homhom_needed_p4]);
1808 }
1809 if (is_squarex) {
1810 cswritep = strcpya_k(cswritep, "0.5");
1811 if (is_square0) {
1812 // (roughly same performance as creating a tab-zero constant
1813 // buffer in advance)
1814 const uint32_t zcount = sample_ct - sample_idx1 - 1;
1815 const uint32_t wct = DivUp(zcount, kBytesPerWord / 2);
1816 // assumes little-endian
1817 const uintptr_t tabzero_word = 0x3009 * kMask0001;
1818 #ifdef __arm__
1819 # error "Unaligned accesses in CalcKing()."
1820 #endif
1821 uintptr_t* writep_alias = R_CAST(uintptr_t*, cswritep);
1822 for (uintptr_t widx = 0; widx != wct; ++widx) {
1823 *writep_alias++ = tabzero_word;
1824 }
1825 cswritep = &(cswritep[zcount * 2]);
1826 } else {
1827 const uint32_t* results_iter2 = &(results_iter[sample_idx1 * homhom_needed_p4]);
1828 // 0
1829 // 1 2
1830 // 3 4 5
1831 // 6 7 8 9
1832 // 10 11 12 13 14
1833
1834 // sample_idx1 = 0: [0] 0 1 3 6 10...
1835 // sample_idx1 = 1: [1] 2 4 7 11...
1836 // sample_idx1 = 2: [3] 5 8 12...
1837 // sample_idx1 = 3: [6] 9 13...
1838 for (uint32_t sample_idx2 = sample_idx1 + 1; sample_idx2 != sample_ct; ++sample_idx2) {
1839 *cswritep++ = '\t';
1840 cswritep = dtoa_g(ComputeKinship(results_iter2, singleton_het1_ct, singleton_hom1_ct, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2]), cswritep);
1841 results_iter2 = &(results_iter2[sample_idx2 * homhom_needed_p4]);
1842 }
1843 }
1844 ++cswritep;
1845 }
1846 DecrAppendBinaryEoln(&cswritep);
1847 if (unlikely(Cswrite(&css, &cswritep))) {
1848 goto CalcKing_ret_WRITE_FAIL;
1849 }
1850 }
1851 } else {
1852 // binary matrix output
1853 // er, probably want to revise this so there's less duplicated code
1854 // from text matrix output...
1855 const uint32_t is_squarex = king_flags & (kfKingMatrixSq | kfKingMatrixSq0);
1856 const uint32_t is_square0 = king_flags & kfKingMatrixSq0;
1857 uint32_t* results_iter = dense_ctx.king_counts;
1858 uint32_t sample_idx1 = row_start_idx;
1859 if (is_squarex && (!parallel_idx)) {
1860 sample_idx1 = 0;
1861 }
1862 if (king_flags & kfKingMatrixBin4) {
1863 float* write_row = R_CAST(float*, numbuf);
1864 uintptr_t row_byte_ct = sample_ct * sizeof(float);
1865 for (; sample_idx1 != row_end_idx; ++sample_idx1) {
1866 const uint32_t singleton_het1_ct = singleton_het_cts[sample_idx1];
1867 const uint32_t singleton_hom1_ct = singleton_hom_cts[sample_idx1];
1868 for (uint32_t sample_idx2 = 0; sample_idx2 != sample_idx1; ++sample_idx2) {
1869 const double kinship_coeff = ComputeKinship(results_iter, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2], singleton_het1_ct, singleton_hom1_ct);
1870 if (kinship_table && (kinship_coeff > king_cutoff)) {
1871 SetBit(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
1872 SetBit(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
1873 }
1874 write_row[sample_idx2] = S_CAST(float, kinship_coeff);
1875 results_iter = &(results_iter[homhom_needed_p4]);
1876 }
1877 if (is_squarex) {
1878 write_row[sample_idx1] = 0.5f;
1879 if (is_square0) {
1880 const uint32_t right_fill_idx = sample_idx1 + 1;
1881 ZeroFArr(sample_ct - right_fill_idx, &(write_row[right_fill_idx]));
1882 } else {
1883 const uint32_t* results_iter2 = &(results_iter[sample_idx1 * homhom_needed_p4]);
1884 for (uint32_t sample_idx2 = sample_idx1 + 1; sample_idx2 != sample_ct; ++sample_idx2) {
1885 write_row[sample_idx2] = S_CAST(float, ComputeKinship(results_iter2, singleton_het1_ct, singleton_hom1_ct, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2]));
1886 results_iter2 = &(results_iter2[sample_idx2 * homhom_needed_p4]);
1887 }
1888 }
1889 } else {
1890 row_byte_ct = sample_idx1 * sizeof(float);
1891 }
1892 if (unlikely(fwrite_checked(write_row, row_byte_ct, outfile))) {
1893 goto CalcKing_ret_WRITE_FAIL;
1894 }
1895 }
1896 } else {
1897 double* write_row = R_CAST(double*, numbuf);
1898 uintptr_t row_byte_ct = sample_ct * sizeof(double);
1899 for (; sample_idx1 != row_end_idx; ++sample_idx1) {
1900 const uint32_t singleton_het1_ct = singleton_het_cts[sample_idx1];
1901 const uint32_t singleton_hom1_ct = singleton_hom_cts[sample_idx1];
1902 for (uint32_t sample_idx2 = 0; sample_idx2 != sample_idx1; ++sample_idx2) {
1903 const double kinship_coeff = ComputeKinship(results_iter, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2], singleton_het1_ct, singleton_hom1_ct);
1904 if (kinship_table && (kinship_coeff > king_cutoff)) {
1905 SetBit(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
1906 SetBit(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
1907 }
1908 write_row[sample_idx2] = kinship_coeff;
1909 results_iter = &(results_iter[homhom_needed_p4]);
1910 }
1911 if (is_squarex) {
1912 write_row[sample_idx1] = 0.5;
1913 if (is_square0) {
1914 const uint32_t right_fill_idx = sample_idx1 + 1;
1915 ZeroDArr(sample_ct - right_fill_idx, &(write_row[right_fill_idx]));
1916 } else {
1917 const uint32_t* results_iter2 = &(results_iter[sample_idx1 * homhom_needed_p4]);
1918 for (uint32_t sample_idx2 = sample_idx1 + 1; sample_idx2 != sample_ct; ++sample_idx2) {
1919 write_row[sample_idx2] = ComputeKinship(results_iter2, singleton_het1_ct, singleton_hom1_ct, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2]);
1920 results_iter2 = &(results_iter2[sample_idx2 * homhom_needed_p4]);
1921 }
1922 }
1923 } else {
1924 row_byte_ct = sample_idx1 * sizeof(double);
1925 }
1926 if (unlikely(fwrite_checked(write_row, row_byte_ct, outfile))) {
1927 goto CalcKing_ret_WRITE_FAIL;
1928 }
1929 }
1930 }
1931 }
1932 }
1933 if (king_flags & kfKingColAll) {
1934 uintptr_t* kinship_table_backup = nullptr;
1935 if (matrix_shape) {
1936 // We already updated the table; don't do it again.
1937 kinship_table_backup = kinship_table;
1938 kinship_table = nullptr;
1939 }
1940 const uint32_t king_col_id = king_flags & kfKingColId;
1941 const uint32_t king_col_nsnp = king_flags & kfKingColNsnp;
1942 const uint32_t king_col_hethet = king_flags & kfKingColHethet;
1943 const uint32_t king_col_ibs0 = king_flags & kfKingColIbs0;
1944 const uint32_t king_col_ibs1 = king_flags & kfKingColIbs1;
1945 const uint32_t king_col_kinship = king_flags & kfKingColKinship;
1946 const uint32_t report_counts = king_flags & kfKingCounts;
1947 uint32_t* results_iter = dense_ctx.king_counts;
1948 double nonmiss_recip = 0.0;
1949 for (uint32_t sample_idx1 = row_start_idx; sample_idx1 != row_end_idx; ++sample_idx1) {
1950 const char* sample_fmtid1 = &(collapsed_sample_fmtids[max_sample_fmtid_blen * sample_idx1]);
1951 const uint32_t singleton_het1_ct = singleton_het_cts[sample_idx1];
1952 const uint32_t singleton_hom1_ct = singleton_hom_cts[sample_idx1];
1953 const uint32_t sample_fmtid1_slen = strlen(sample_fmtid1);
1954 for (uint32_t sample_idx2 = 0; sample_idx2 != sample_idx1; ++sample_idx2, results_iter = &(results_iter[homhom_needed_p4])) {
1955 const uint32_t singleton_het2_ct = singleton_het_cts[sample_idx2];
1956 const uint32_t singleton_hom2_ct = singleton_hom_cts[sample_idx2];
1957 const uint32_t ibs0_ct = results_iter[kKingOffsetIbs0] + singleton_hom2_ct + singleton_hom1_ct;
1958 const uint32_t hethet_ct = results_iter[kKingOffsetHethet];
1959 // '2' here refers to the larger index, so this is swapped
1960 const uint32_t het2hom1_ct = results_iter[kKingOffsetHet2Hom1] + singleton_het1_ct;
1961 const uint32_t het1hom2_ct = results_iter[kKingOffsetHet1Hom2] + singleton_het2_ct;
1962 const intptr_t smaller_het_ct = hethet_ct + MINV(het1hom2_ct, het2hom1_ct);
1963 const double kinship_coeff = 0.5 - (S_CAST(double, 4 * S_CAST(intptr_t, ibs0_ct) + het1hom2_ct + het2hom1_ct) / S_CAST(double, 4 * smaller_het_ct));
1964 if (kinship_table && (kinship_coeff > king_cutoff)) {
1965 SetBit(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
1966 SetBit(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
1967 }
1968 // edge case fix (18 Nov 2017): kinship_coeff can be -inf when
1969 // smaller_het_ct is zero. Don't filter those lines out when
1970 // --king-table-filter wasn't specified.
1971 if ((king_table_filter != -DBL_MAX) && (kinship_coeff < king_table_filter)) {
1972 ++king_table_filter_ct;
1973 continue;
1974 }
1975 if (king_col_id) {
1976 cswritetp = memcpyax(cswritetp, sample_fmtid1, sample_fmtid1_slen, '\t');
1977 cswritetp = strcpyax(cswritetp, &(collapsed_sample_fmtids[max_sample_fmtid_blen * sample_idx2]), '\t');
1978 }
1979 if (homhom_needed_p4 == 5) {
1980 const uint32_t homhom_ct = results_iter[kKingOffsetHomhom] + sparse_variant_ct - singleton_het2_ct - singleton_missing_cts[sample_idx2] - singleton_het1_ct - singleton_missing_cts[sample_idx1];
1981 const uint32_t nonmiss_ct = het1hom2_ct + het2hom1_ct + homhom_ct + hethet_ct;
1982 if (king_col_nsnp) {
1983 cswritetp = u32toa_x(nonmiss_ct, '\t', cswritetp);
1984 }
1985 if (!report_counts) {
1986 nonmiss_recip = 1.0 / u31tod(nonmiss_ct);
1987 }
1988 }
1989 if (king_col_hethet) {
1990 if (report_counts) {
1991 cswritetp = u32toa(hethet_ct, cswritetp);
1992 } else {
1993 cswritetp = dtoa_g(nonmiss_recip * u31tod(hethet_ct), cswritetp);
1994 }
1995 *cswritetp++ = '\t';
1996 }
1997 if (king_col_ibs0) {
1998 if (report_counts) {
1999 cswritetp = u32toa(ibs0_ct, cswritetp);
2000 } else {
2001 cswritetp = dtoa_g(nonmiss_recip * u31tod(ibs0_ct), cswritetp);
2002 }
2003 *cswritetp++ = '\t';
2004 }
2005 if (king_col_ibs1) {
2006 if (report_counts) {
2007 cswritetp = u32toa_x(het1hom2_ct, '\t', cswritetp);
2008 cswritetp = u32toa(het2hom1_ct, cswritetp);
2009 } else {
2010 cswritetp = dtoa_g(nonmiss_recip * u31tod(het1hom2_ct), cswritetp);
2011 *cswritetp++ = '\t';
2012 cswritetp = dtoa_g(nonmiss_recip * u31tod(het2hom1_ct), cswritetp);
2013 }
2014 *cswritetp++ = '\t';
2015 }
2016 if (king_col_kinship) {
2017 cswritetp = dtoa_g(kinship_coeff, cswritetp);
2018 ++cswritetp;
2019 }
2020 DecrAppendBinaryEoln(&cswritetp);
2021 if (unlikely(Cswrite(&csst, &cswritetp))) {
2022 goto CalcKing_ret_WRITE_FAIL;
2023 }
2024 }
2025 }
2026
2027 if (matrix_shape) {
2028 kinship_table = kinship_table_backup;
2029 }
2030 }
2031 } else {
2032 printf("\r%s pass %u/%u: Condensing... \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", flagname, pass_idx_p1, pass_ct);
2033 fflush(stdout);
2034 uint32_t* results_iter = dense_ctx.king_counts;
2035 for (uint32_t sample_idx1 = row_start_idx; sample_idx1 != row_end_idx; ++sample_idx1) {
2036 const uint32_t singleton_het1_ct = singleton_het_cts[sample_idx1];
2037 const uint32_t singleton_hom1_ct = singleton_hom_cts[sample_idx1];
2038 for (uint32_t sample_idx2 = 0; sample_idx2 != sample_idx1; ++sample_idx2) {
2039 const double kinship_coeff = ComputeKinship(results_iter, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2], singleton_het1_ct, singleton_hom1_ct);
2040 if (kinship_coeff > king_cutoff) {
2041 SetBit(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
2042 SetBit(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
2043 }
2044 results_iter = &(results_iter[homhom_needed_p4]);
2045 }
2046 }
2047 }
2048 fputs(" done.\n", stdout);
2049 }
2050 logprintf("%s: %u variant%s processed.\n", flagname, variant_ct, (variant_ct == 1)? "" : "s");
2051 // end-of-loop operations
2052 if (matrix_shape) {
2053 if (!(king_flags & (kfKingMatrixBin | kfKingMatrixBin4))) {
2054 if (unlikely(CswriteCloseNull(&css, cswritep))) {
2055 goto CalcKing_ret_WRITE_FAIL;
2056 }
2057 } else {
2058 if (unlikely(fclose_null(&outfile))) {
2059 goto CalcKing_ret_WRITE_FAIL;
2060 }
2061 }
2062 // Necessary to regenerate filename since it may have been overwritten by
2063 // --make-king-table.
2064 SetKingMatrixFname(king_flags, parallel_idx, parallel_tot, outname_end);
2065
2066 char* write_iter = strcpya_k(g_logbuf, "Results written to ");
2067 const uint32_t outname_base_slen = S_CAST(uintptr_t, outname_end - outname);
2068 write_iter = memcpya(write_iter, outname, outname_base_slen + strlen(outname_end));
2069 write_iter = strcpya_k(write_iter, " and ");
2070 strcpy_k(&(outname_end[5]), ".id");
2071 write_iter = memcpya(write_iter, outname, outname_base_slen + 8);
2072 strcpy_k(write_iter, " .\n");
2073 WordWrapB(0);
2074 logputsb();
2075 reterr = WriteSampleIds(sample_include, siip, outname, sample_ct);
2076 if (unlikely(reterr)) {
2077 goto CalcKing_ret_1;
2078 }
2079 }
2080 if (king_flags & kfKingColAll) {
2081 if (unlikely(CswriteCloseNull(&csst, cswritetp))) {
2082 goto CalcKing_ret_WRITE_FAIL;
2083 }
2084 SetKingTableFname(king_flags, parallel_idx, parallel_tot, outname_end);
2085 char* write_iter = strcpya_k(g_logbuf, "Results written to ");
2086 const uint32_t outname_base_slen = S_CAST(uintptr_t, outname_end - outname);
2087 write_iter = memcpya(write_iter, outname, outname_base_slen + strlen(outname_end));
2088 if ((!parallel_idx) && (!(king_flags & kfKingColId))) {
2089 write_iter = strcpya_k(write_iter, " and ");
2090 strcpy_k(&(outname_end[5]), ".id");
2091 write_iter = memcpya(write_iter, outname, outname_base_slen + 8);
2092 strcpy_k(write_iter, " .\n");
2093 WordWrapB(0);
2094 logputsb();
2095 reterr = WriteSampleIds(sample_include, siip, outname, sample_ct);
2096 if (unlikely(reterr)) {
2097 goto CalcKing_ret_1;
2098 }
2099 } else {
2100 strcpy_k(write_iter, " .\n");
2101 WordWrapB(0);
2102 logputsb();
2103 }
2104 if (king_table_filter != -DBL_MAX) {
2105 const uint64_t grand_tot_cells = (S_CAST(uint64_t, grand_row_end_idx) * (grand_row_end_idx - 1) - S_CAST(uint64_t, grand_row_start_idx) * (grand_row_start_idx - 1)) / 2;
2106 const uint64_t reported_ct = grand_tot_cells - king_table_filter_ct;
2107 logprintf("--king-table-filter: %" PRIu64 " relationship%s reported (%" PRIu64 " filtered out).\n", reported_ct, (reported_ct == 1)? "" : "s", king_table_filter_ct);
2108 }
2109 }
2110 if (kinship_table) {
2111 BigstackReset(sample_include_cumulative_popcounts);
2112 *sample_ct_ptr = sample_ct;
2113 if (unlikely(KinshipPruneDestructive(kinship_table, sample_include, sample_ct_ptr))) {
2114 goto CalcKing_ret_NOMEM;
2115 }
2116 }
2117 }
2118 while (0) {
2119 CalcKing_ret_NOMEM:
2120 reterr = kPglRetNomem;
2121 break;
2122 CalcKing_ret_OPEN_FAIL:
2123 reterr = kPglRetOpenFail;
2124 break;
2125 CalcKing_ret_PGR_FAIL:
2126 PgenErrPrintN(reterr);
2127 break;
2128 CalcKing_ret_WRITE_FAIL:
2129 reterr = kPglRetWriteFail;
2130 break;
2131 CalcKing_ret_INCONSISTENT_INPUT:
2132 reterr = kPglRetInconsistentInput;
2133 break;
2134 CalcKing_ret_THREAD_CREATE_FAIL:
2135 reterr = kPglRetThreadCreateFail;
2136 break;
2137 CalcKing_ret_DEGENERATE_DATA:
2138 reterr = kPglRetDegenerateData;
2139 break;
2140 }
2141 CalcKing_ret_1:
2142 CleanupThreads(&tg);
2143 CswriteCloseCond(&csst, cswritetp);
2144 CswriteCloseCond(&css, cswritep);
2145 fclose_cond(outfile);
2146 BigstackReset(bigstack_mark);
2147 return reterr;
2148 }
2149
2150 #ifdef USE_SSE42
IncrKingSubset(const uint32_t * loaded_sample_idx_pairs,const uintptr_t * smaj_hom,const uintptr_t * smaj_ref2het,uint32_t start_idx,uint32_t end_idx,uint32_t * king_counts)2151 void IncrKingSubset(const uint32_t* loaded_sample_idx_pairs, const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts) {
2152 const uint32_t* sample_idx_pair_iter = &(loaded_sample_idx_pairs[(2 * k1LU) * start_idx]);
2153 const uint32_t* sample_idx_pair_stop = &(loaded_sample_idx_pairs[(2 * k1LU) * end_idx]);
2154 uint32_t* king_counts_iter = &(king_counts[(4 * k1LU) * start_idx]);
2155 while (sample_idx_pair_iter != sample_idx_pair_stop) {
2156 // technically overflows for huge sample_ct
2157 const uint32_t first_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2158 const uint32_t second_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2159 const uintptr_t* first_hom = &(smaj_hom[first_offset]);
2160 const uintptr_t* first_ref2het = &(smaj_ref2het[first_offset]);
2161 const uintptr_t* second_hom = &(smaj_hom[second_offset]);
2162 const uintptr_t* second_ref2het = &(smaj_ref2het[second_offset]);
2163 uint32_t acc_ibs0 = 0;
2164 uint32_t acc_hethet = 0;
2165 uint32_t acc_het2hom1 = 0;
2166 uint32_t acc_het1hom2 = 0;
2167 for (uint32_t widx = 0; widx != kKingMultiplexWords; ++widx) {
2168 const uintptr_t hom1 = first_hom[widx];
2169 const uintptr_t hom2 = second_hom[widx];
2170 const uintptr_t ref2het1 = first_ref2het[widx];
2171 const uintptr_t ref2het2 = second_ref2het[widx];
2172 const uintptr_t homhom = hom1 & hom2;
2173 const uintptr_t het1 = ref2het1 & (~hom1);
2174 const uintptr_t het2 = ref2het2 & (~hom2);
2175 acc_ibs0 += PopcountWord((ref2het1 ^ ref2het2) & homhom);
2176 acc_hethet += PopcountWord(het1 & het2);
2177 acc_het2hom1 += PopcountWord(hom1 & het2);
2178 acc_het1hom2 += PopcountWord(hom2 & het1);
2179 }
2180 *king_counts_iter++ += acc_ibs0;
2181 *king_counts_iter++ += acc_hethet;
2182 *king_counts_iter++ += acc_het2hom1;
2183 *king_counts_iter++ += acc_het1hom2;
2184 }
2185 }
2186
IncrKingSubsetHomhom(const uint32_t * loaded_sample_idx_pairs,const uintptr_t * smaj_hom,const uintptr_t * smaj_ref2het,uint32_t start_idx,uint32_t end_idx,uint32_t * king_counts)2187 void IncrKingSubsetHomhom(const uint32_t* loaded_sample_idx_pairs, const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts) {
2188 const uint32_t* sample_idx_pair_iter = &(loaded_sample_idx_pairs[(2 * k1LU) * start_idx]);
2189 const uint32_t* sample_idx_pair_stop = &(loaded_sample_idx_pairs[(2 * k1LU) * end_idx]);
2190 uint32_t* king_counts_iter = &(king_counts[(5 * k1LU) * start_idx]);
2191 while (sample_idx_pair_iter != sample_idx_pair_stop) {
2192 // technically overflows for huge sample_ct
2193 const uint32_t first_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2194 const uint32_t second_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2195 const uintptr_t* first_hom = &(smaj_hom[first_offset]);
2196 const uintptr_t* first_ref2het = &(smaj_ref2het[first_offset]);
2197 const uintptr_t* second_hom = &(smaj_hom[second_offset]);
2198 const uintptr_t* second_ref2het = &(smaj_ref2het[second_offset]);
2199 uint32_t acc_homhom = 0;
2200 uint32_t acc_ibs0 = 0;
2201 uint32_t acc_hethet = 0;
2202 uint32_t acc_het2hom1 = 0;
2203 uint32_t acc_het1hom2 = 0;
2204 for (uint32_t widx = 0; widx != kKingMultiplexWords; ++widx) {
2205 const uintptr_t hom1 = first_hom[widx];
2206 const uintptr_t hom2 = second_hom[widx];
2207 const uintptr_t ref2het1 = first_ref2het[widx];
2208 const uintptr_t ref2het2 = second_ref2het[widx];
2209 const uintptr_t homhom = hom1 & hom2;
2210 const uintptr_t het1 = ref2het1 & (~hom1);
2211 const uintptr_t het2 = ref2het2 & (~hom2);
2212 acc_homhom += PopcountWord(homhom);
2213 acc_ibs0 += PopcountWord((ref2het1 ^ ref2het2) & homhom);
2214 acc_hethet += PopcountWord(het1 & het2);
2215 acc_het2hom1 += PopcountWord(hom1 & het2);
2216 acc_het1hom2 += PopcountWord(hom2 & het1);
2217 }
2218 *king_counts_iter++ += acc_ibs0;
2219 *king_counts_iter++ += acc_hethet;
2220 *king_counts_iter++ += acc_het2hom1;
2221 *king_counts_iter++ += acc_het1hom2;
2222 *king_counts_iter++ += acc_homhom;
2223 }
2224 }
2225 #else
void IncrKingSubset(const uint32_t* loaded_sample_idx_pairs, const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts) {
  // Vertical-popcount code path (no fast hardware popcount assumed).  For
  // every queued sample pair with index in [start_idx, end_idx), adds this
  // variant batch's IBS0, het-het, (hom1,het2), and (het1,hom2) counts to the
  // pair's running totals in king_counts (stride 4).
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  const uint32_t* sample_idx_pair_iter = &(loaded_sample_idx_pairs[(2 * k1LU) * start_idx]);
  const uint32_t* sample_idx_pair_stop = &(loaded_sample_idx_pairs[(2 * k1LU) * end_idx]);
  uint32_t* king_counts_iter = &(king_counts[(4 * k1LU) * start_idx]);
  while (sample_idx_pair_iter != sample_idx_pair_stop) {
    // technically overflows for huge sample_ct
    const uint32_t first_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
    const uint32_t second_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
    const VecW* first_hom = R_CAST(const VecW*, &(smaj_hom[first_offset]));
    const VecW* first_ref2het = R_CAST(const VecW*, &(smaj_ref2het[first_offset]));
    const VecW* second_hom = R_CAST(const VecW*, &(smaj_hom[second_offset]));
    const VecW* second_ref2het = R_CAST(const VecW*, &(smaj_ref2het[second_offset]));
    // Vertical accumulators; lanes are folded to 8-bit width before being
    // added here, so kKingMultiplexVecs must be small enough that a byte
    // lane cannot reach 256 within one call.
    UniVec acc_ibs0;
    UniVec acc_hethet;
    UniVec acc_het2hom1;
    UniVec acc_het1hom2;
    acc_ibs0.vw = vecw_setzero();
    acc_hethet.vw = vecw_setzero();
    acc_het2hom1.vw = vecw_setzero();
    acc_het1hom2.vw = vecw_setzero();
    // Vectors are processed in groups of 3: three 2-bit-stage results can be
    // merged at the 4-bit stage (max 3*4 = 12 per nibble) without overflow.
    for (uint32_t vec_idx = 0; vec_idx < kKingMultiplexVecs; vec_idx += 3) {
      VecW hom1 = first_hom[vec_idx];
      VecW hom2 = second_hom[vec_idx];
      VecW ref2het1 = first_ref2het[vec_idx];
      VecW ref2het2 = second_ref2het[vec_idx];
      // het iff ref2het bit set and hom bit clear.
      VecW het1 = vecw_and_notfirst(hom1, ref2het1);
      VecW het2 = vecw_and_notfirst(hom2, ref2het2);
      // IBS0: opposite homozygotes (both hom, different ref2het state).
      VecW agg_ibs0 = (ref2het1 ^ ref2het2) & (hom1 & hom2);
      VecW agg_hethet = het1 & het2;
      VecW agg_het2hom1 = hom1 & het2;
      VecW agg_het1hom2 = hom2 & het1;
      // Classic SWAR popcount, stage 1 (2-bit partial sums)...
      agg_ibs0 = agg_ibs0 - (vecw_srli(agg_ibs0, 1) & m1);
      agg_hethet = agg_hethet - (vecw_srli(agg_hethet, 1) & m1);
      agg_het2hom1 = agg_het2hom1 - (vecw_srli(agg_het2hom1, 1) & m1);
      agg_het1hom2 = agg_het1hom2 - (vecw_srli(agg_het1hom2, 1) & m1);
      // ...and stage 2 (4-bit partial sums).
      agg_ibs0 = (agg_ibs0 & m2) + (vecw_srli(agg_ibs0, 2) & m2);
      agg_hethet = (agg_hethet & m2) + (vecw_srli(agg_hethet, 2) & m2);
      agg_het2hom1 = (agg_het2hom1 & m2) + (vecw_srli(agg_het2hom1, 2) & m2);
      agg_het1hom2 = (agg_het1hom2 & m2) + (vecw_srli(agg_het1hom2, 2) & m2);

      // Fold the remaining two vectors of the group into the same 4-bit
      // partial sums.
      for (uint32_t offset = 1; offset != 3; ++offset) {
        hom1 = first_hom[vec_idx + offset];
        hom2 = second_hom[vec_idx + offset];
        ref2het1 = first_ref2het[vec_idx + offset];
        ref2het2 = second_ref2het[vec_idx + offset];
        het1 = vecw_and_notfirst(hom1, ref2het1);
        het2 = vecw_and_notfirst(hom2, ref2het2);
        VecW cur_ibs0 = (ref2het1 ^ ref2het2) & (hom1 & hom2);
        VecW cur_hethet = het1 & het2;
        VecW cur_het2hom1 = hom1 & het2;
        VecW cur_het1hom2 = hom2 & het1;
        cur_ibs0 = cur_ibs0 - (vecw_srli(cur_ibs0, 1) & m1);
        cur_hethet = cur_hethet - (vecw_srli(cur_hethet, 1) & m1);
        cur_het2hom1 = cur_het2hom1 - (vecw_srli(cur_het2hom1, 1) & m1);
        cur_het1hom2 = cur_het1hom2 - (vecw_srli(cur_het1hom2, 1) & m1);
        agg_ibs0 += (cur_ibs0 & m2) + (vecw_srli(cur_ibs0, 2) & m2);
        agg_hethet += (cur_hethet & m2) + (vecw_srli(cur_hethet, 2) & m2);
        agg_het2hom1 += (cur_het2hom1 & m2) + (vecw_srli(cur_het2hom1, 2) & m2);
        agg_het1hom2 += (cur_het1hom2 & m2) + (vecw_srli(cur_het1hom2, 2) & m2);
      }
      // Compress 4-bit lanes to 8-bit lanes and add to the accumulators.
      acc_ibs0.vw = acc_ibs0.vw + (agg_ibs0 & m4) + (vecw_srli(agg_ibs0, 4) & m4);
      acc_hethet.vw = acc_hethet.vw + (agg_hethet & m4) + (vecw_srli(agg_hethet, 4) & m4);
      acc_het2hom1.vw = acc_het2hom1.vw + (agg_het2hom1 & m4) + (vecw_srli(agg_het2hom1, 4) & m4);
      acc_het1hom2.vw = acc_het1hom2.vw + (agg_het1hom2 & m4) + (vecw_srli(agg_het1hom2, 4) & m4);
    }
    // Widen 8-bit lanes to 16-bit, then sum the 16-bit lanes horizontally.
    const VecW m8 = VCONST_W(kMask00FF);
    acc_ibs0.vw = (acc_ibs0.vw & m8) + (vecw_srli(acc_ibs0.vw, 8) & m8);
    acc_hethet.vw = (acc_hethet.vw & m8) + (vecw_srli(acc_hethet.vw, 8) & m8);
    acc_het2hom1.vw = (acc_het2hom1.vw & m8) + (vecw_srli(acc_het2hom1.vw, 8) & m8);
    acc_het1hom2.vw = (acc_het1hom2.vw & m8) + (vecw_srli(acc_het1hom2.vw, 8) & m8);
    *king_counts_iter++ += UniVecHsum16(acc_ibs0);
    *king_counts_iter++ += UniVecHsum16(acc_hethet);
    *king_counts_iter++ += UniVecHsum16(acc_het2hom1);
    *king_counts_iter++ += UniVecHsum16(acc_het1hom2);
  }
}
2305
void IncrKingSubsetHomhom(const uint32_t* loaded_sample_idx_pairs, const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts) {
  // Vertical-popcount code path with hom-hom tracking.  Identical to
  // IncrKingSubset() above except that a fifth per-pair counter (hom-hom
  // site count) is maintained, so king_counts has a stride of 5.
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  const uint32_t* sample_idx_pair_iter = &(loaded_sample_idx_pairs[(2 * k1LU) * start_idx]);
  const uint32_t* sample_idx_pair_stop = &(loaded_sample_idx_pairs[(2 * k1LU) * end_idx]);
  uint32_t* king_counts_iter = &(king_counts[(5 * k1LU) * start_idx]);
  while (sample_idx_pair_iter != sample_idx_pair_stop) {
    // technically overflows for huge sample_ct
    const uint32_t first_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
    const uint32_t second_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
    const VecW* first_hom = R_CAST(const VecW*, &(smaj_hom[first_offset]));
    const VecW* first_ref2het = R_CAST(const VecW*, &(smaj_ref2het[first_offset]));
    const VecW* second_hom = R_CAST(const VecW*, &(smaj_hom[second_offset]));
    const VecW* second_ref2het = R_CAST(const VecW*, &(smaj_ref2het[second_offset]));
    // Vertical accumulators with 8-bit lanes; see IncrKingSubset() for the
    // overflow constraints.
    UniVec acc_homhom;
    UniVec acc_ibs0;
    UniVec acc_hethet;
    UniVec acc_het2hom1;
    UniVec acc_het1hom2;
    acc_homhom.vw = vecw_setzero();
    acc_ibs0.vw = vecw_setzero();
    acc_hethet.vw = vecw_setzero();
    acc_het2hom1.vw = vecw_setzero();
    acc_het1hom2.vw = vecw_setzero();
    // Groups of 3 vectors share one 4-bit-stage merge; see IncrKingSubset().
    for (uint32_t vec_idx = 0; vec_idx < kKingMultiplexVecs; vec_idx += 3) {
      VecW hom1 = first_hom[vec_idx];
      VecW hom2 = second_hom[vec_idx];
      VecW ref2het1 = first_ref2het[vec_idx];
      VecW ref2het2 = second_ref2het[vec_idx];
      VecW agg_homhom = hom1 & hom2;
      // het iff ref2het bit set and hom bit clear.
      VecW het1 = vecw_and_notfirst(hom1, ref2het1);
      VecW het2 = vecw_and_notfirst(hom2, ref2het2);
      // IBS0: opposite homozygotes.
      VecW agg_ibs0 = (ref2het1 ^ ref2het2) & agg_homhom;
      VecW agg_hethet = het1 & het2;
      VecW agg_het2hom1 = hom1 & het2;
      VecW agg_het1hom2 = hom2 & het1;
      // SWAR popcount stage 1 (2-bit partial sums)...
      agg_homhom = agg_homhom - (vecw_srli(agg_homhom, 1) & m1);
      agg_ibs0 = agg_ibs0 - (vecw_srli(agg_ibs0, 1) & m1);
      agg_hethet = agg_hethet - (vecw_srli(agg_hethet, 1) & m1);
      agg_het2hom1 = agg_het2hom1 - (vecw_srli(agg_het2hom1, 1) & m1);
      agg_het1hom2 = agg_het1hom2 - (vecw_srli(agg_het1hom2, 1) & m1);
      // ...and stage 2 (4-bit partial sums).
      agg_homhom = (agg_homhom & m2) + (vecw_srli(agg_homhom, 2) & m2);
      agg_ibs0 = (agg_ibs0 & m2) + (vecw_srli(agg_ibs0, 2) & m2);
      agg_hethet = (agg_hethet & m2) + (vecw_srli(agg_hethet, 2) & m2);
      agg_het2hom1 = (agg_het2hom1 & m2) + (vecw_srli(agg_het2hom1, 2) & m2);
      agg_het1hom2 = (agg_het1hom2 & m2) + (vecw_srli(agg_het1hom2, 2) & m2);

      // Fold the remaining two vectors of the group into the same 4-bit
      // partial sums.
      for (uint32_t offset = 1; offset != 3; ++offset) {
        hom1 = first_hom[vec_idx + offset];
        hom2 = second_hom[vec_idx + offset];
        ref2het1 = first_ref2het[vec_idx + offset];
        ref2het2 = second_ref2het[vec_idx + offset];
        VecW cur_homhom = hom1 & hom2;
        het1 = vecw_and_notfirst(hom1, ref2het1);
        het2 = vecw_and_notfirst(hom2, ref2het2);
        VecW cur_ibs0 = (ref2het1 ^ ref2het2) & cur_homhom;
        VecW cur_hethet = het1 & het2;
        VecW cur_het2hom1 = hom1 & het2;
        VecW cur_het1hom2 = hom2 & het1;
        cur_homhom = cur_homhom - (vecw_srli(cur_homhom, 1) & m1);
        cur_ibs0 = cur_ibs0 - (vecw_srli(cur_ibs0, 1) & m1);
        cur_hethet = cur_hethet - (vecw_srli(cur_hethet, 1) & m1);
        cur_het2hom1 = cur_het2hom1 - (vecw_srli(cur_het2hom1, 1) & m1);
        cur_het1hom2 = cur_het1hom2 - (vecw_srli(cur_het1hom2, 1) & m1);
        agg_homhom += (cur_homhom & m2) + (vecw_srli(cur_homhom, 2) & m2);
        agg_ibs0 += (cur_ibs0 & m2) + (vecw_srli(cur_ibs0, 2) & m2);
        agg_hethet += (cur_hethet & m2) + (vecw_srli(cur_hethet, 2) & m2);
        agg_het2hom1 += (cur_het2hom1 & m2) + (vecw_srli(cur_het2hom1, 2) & m2);
        agg_het1hom2 += (cur_het1hom2 & m2) + (vecw_srli(cur_het1hom2, 2) & m2);
      }
      // Compress 4-bit lanes to 8-bit lanes and add to the accumulators.
      acc_homhom.vw = acc_homhom.vw + (agg_homhom & m4) + (vecw_srli(agg_homhom, 4) & m4);
      acc_ibs0.vw = acc_ibs0.vw + (agg_ibs0 & m4) + (vecw_srli(agg_ibs0, 4) & m4);
      acc_hethet.vw = acc_hethet.vw + (agg_hethet & m4) + (vecw_srli(agg_hethet, 4) & m4);
      acc_het2hom1.vw = acc_het2hom1.vw + (agg_het2hom1 & m4) + (vecw_srli(agg_het2hom1, 4) & m4);
      acc_het1hom2.vw = acc_het1hom2.vw + (agg_het1hom2 & m4) + (vecw_srli(agg_het1hom2, 4) & m4);
    }
    // Widen 8-bit lanes to 16-bit, then sum horizontally.  hom-hom is stored
    // last, after the four counts shared with the non-homhom layout.
    const VecW m8 = VCONST_W(kMask00FF);
    acc_homhom.vw = (acc_homhom.vw & m8) + (vecw_srli(acc_homhom.vw, 8) & m8);
    acc_ibs0.vw = (acc_ibs0.vw & m8) + (vecw_srli(acc_ibs0.vw, 8) & m8);
    acc_hethet.vw = (acc_hethet.vw & m8) + (vecw_srli(acc_hethet.vw, 8) & m8);
    acc_het2hom1.vw = (acc_het2hom1.vw & m8) + (vecw_srli(acc_het2hom1.vw, 8) & m8);
    acc_het1hom2.vw = (acc_het1hom2.vw & m8) + (vecw_srli(acc_het1hom2.vw, 8) & m8);
    *king_counts_iter++ += UniVecHsum16(acc_ibs0);
    *king_counts_iter++ += UniVecHsum16(acc_hethet);
    *king_counts_iter++ += UniVecHsum16(acc_het2hom1);
    *king_counts_iter++ += UniVecHsum16(acc_het1hom2);
    *king_counts_iter++ += UniVecHsum16(acc_homhom);
  }
}
2396 #endif
2397
typedef struct CalcKingTableSubsetCtxStruct {
  // Shared state for CalcKingTableSubsetThread workers.
  // Double-buffered (parity-indexed) sample-major genotype bitsets for the
  // variant batch currently being processed.
  uintptr_t* smaj_hom[2];
  uintptr_t* smaj_ref2het[2];
  // Flat array of {sample_idx1, sample_idx2} pairs to process.
  uint32_t* loaded_sample_idx_pairs;
  // Nonzero iff the hom-hom counter must also be tracked (5 counters per
  // pair instead of 4).
  uint32_t homhom_needed;

  // thread_start[tidx]..thread_start[tidx+1] is the half-open range of pair
  // indexes assigned to thread tidx.
  uint32_t* thread_start;

  // Running per-pair counters; stride 4, or 5 when homhom_needed.
  uint32_t* king_counts;
} CalcKingTableSubsetCtx;
2408
CalcKingTableSubsetThread(void * raw_arg)2409 THREAD_FUNC_DECL CalcKingTableSubsetThread(void* raw_arg) {
2410 ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
2411 const uintptr_t tidx = arg->tidx;
2412 CalcKingTableSubsetCtx* ctx = S_CAST(CalcKingTableSubsetCtx*, arg->sharedp->context);
2413
2414 const uint32_t start_idx = ctx->thread_start[tidx];
2415 const uint32_t end_idx = ctx->thread_start[tidx + 1];
2416 const uint32_t homhom_needed = ctx->homhom_needed;
2417 uint32_t parity = 0;
2418 do {
2419 if (homhom_needed) {
2420 IncrKingSubsetHomhom(ctx->loaded_sample_idx_pairs, ctx->smaj_hom[parity], ctx->smaj_ref2het[parity], start_idx, end_idx, ctx->king_counts);
2421 } else {
2422 IncrKingSubset(ctx->loaded_sample_idx_pairs, ctx->smaj_hom[parity], ctx->smaj_ref2het[parity], start_idx, end_idx, ctx->king_counts);
2423 }
2424 parity = 1 - parity;
2425 } while (!THREAD_BLOCK_FINISH(arg));
2426 THREAD_RETURN;
2427 }
2428
// Scans the --king-table-subset file from the current TextStream position,
// appending qualifying sample-index pairs to loaded_sample_idx_pairs.
// - Lines whose sample IDs aren't in sorted_xidbox are skipped; so are lines
//   failing the rel_check same-FID test, or with kinship below
//   king_table_subset_thresh (when that threshold is active).
// - *pair_idx_ptr counts qualifying pairs across calls; only pairs with
//   cross-call index in [pair_idx_start, pair_idx_stop) are stored.
// - If is_first_parallel_scan is set, scanning continues to EOF after
//   pair_idx_stop is reached (without storing further pairs) so the caller
//   learns the total qualifying-pair count.
// Assumes the header line was already consumed when *pair_idx_ptr == 0; when
// positive, the stream is assumed to be that far into the file.
PglErr KingTableSubsetLoad(const char* sorted_xidbox, const uint32_t* xid_map, uintptr_t max_xid_blen, uintptr_t orig_sample_ct, double king_table_subset_thresh, XidMode xid_mode, uint32_t skip_sid, uint32_t rel_check, uint32_t kinship_skip, uint32_t is_first_parallel_scan, uint64_t pair_idx_start, uint64_t pair_idx_stop, uintptr_t line_idx, TextStream* txsp, uint64_t* pair_idx_ptr, uint32_t* loaded_sample_idx_pairs, char* idbuf) {
  PglErr reterr = kPglRetSuccess;
  {
    uint64_t pair_idx = *pair_idx_ptr;
    // Assumes header line already read if pair_idx == 0, and if pair_idx is
    // positive, we're that far into the file.
    uint32_t* loaded_sample_idx_pairs_iter = loaded_sample_idx_pairs;
    ++line_idx;
    for (char* line_iter = TextLineEnd(txsp); TextGetUnsafe2(txsp, &line_iter); line_iter = AdvPastDelim(line_iter, '\n'), ++line_idx) {
      const char* linebuf_iter = line_iter;
      uint32_t sample_uidx1;
      if (SortedXidboxReadFind(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, 0, xid_mode, &linebuf_iter, &sample_uidx1, idbuf)) {
        // Lookup failure with non-null linebuf_iter just means the ID isn't
        // loaded: skip the line.  A null linebuf_iter signals a parse error.
        if (unlikely(!linebuf_iter)) {
          goto KingTableSubsetLoad_ret_MISSING_TOKENS;
        }
        line_iter = K_CAST(char*, linebuf_iter);
        continue;
      }
      linebuf_iter = FirstNonTspace(linebuf_iter);
      if (skip_sid) {
        // SID column present in the file but not used for matching: skip it.
        if (unlikely(IsEolnKns(*linebuf_iter))) {
          goto KingTableSubsetLoad_ret_MISSING_TOKENS;
        }
        linebuf_iter = FirstNonTspace(CurTokenEnd(linebuf_iter));
      }
      if (rel_check) {
        // linebuf_iter must point to the start of the second FID, while
        // line_iter points to the start of the first.
        const uint32_t first_fid_slen = CurTokenEnd(line_iter) - line_iter;
        const uint32_t second_fid_slen = CurTokenEnd(linebuf_iter) - linebuf_iter;
        if ((first_fid_slen != second_fid_slen) || (!memequal(line_iter, linebuf_iter, first_fid_slen))) {
          // Different FIDs: not a rel-check pair, skip.
          line_iter = K_CAST(char*, linebuf_iter);
          continue;
        }
      }
      uint32_t sample_uidx2;
      if (SortedXidboxReadFind(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, 0, xid_mode, &linebuf_iter, &sample_uidx2, idbuf)) {
        if (unlikely(!linebuf_iter)) {
          goto KingTableSubsetLoad_ret_MISSING_TOKENS;
        }
        line_iter = K_CAST(char*, linebuf_iter);
        continue;
      }
      if (unlikely(sample_uidx1 == sample_uidx2)) {
        // could technically be due to unloaded SID, so use inconsistent-input
        // error code
        snprintf(g_logbuf, kLogbufSize, "Error: Identical sample IDs on line %" PRIuPTR " of --king-table-subset file.\n", line_idx);
        goto KingTableSubsetLoad_ret_INCONSISTENT_INPUT_WW;
      }
      if (king_table_subset_thresh != -DBL_MAX) {
        // Threshold active: locate the kinship column (kinship_skip tokens
        // ahead) and compare.
        linebuf_iter = FirstNonTspace(linebuf_iter);
        linebuf_iter = NextTokenMult0(linebuf_iter, kinship_skip);
        if (unlikely(!linebuf_iter)) {
          goto KingTableSubsetLoad_ret_MISSING_TOKENS;
        }
        double cur_kinship;
        const char* kinship_end = ScanadvDouble(linebuf_iter, &cur_kinship);
        if (!kinship_end) {
          // Non-numeric kinship value (e.g. "NA"): skip the line.
          line_iter = K_CAST(char*, linebuf_iter);
          continue;
        }
        if (unlikely(!IsSpaceOrEoln(*kinship_end))) {
          // Numeric prefix followed by junk: malformed token.
          kinship_end = CurTokenEnd(kinship_end);
          *K_CAST(char*, kinship_end) = '\0';
          logerrprintfww("Error: Invalid numeric token '%s' on line %" PRIuPTR " of --king-table-subset file.\n", linebuf_iter, line_idx);
          goto KingTableSubsetLoad_ret_MALFORMED_INPUT;
        }
        if (cur_kinship < king_table_subset_thresh) {
          line_iter = K_CAST(char*, kinship_end);
          continue;
        }
      }
      line_iter = K_CAST(char*, linebuf_iter);
      if (pair_idx < pair_idx_start) {
        // Qualifying pair, but belongs to an earlier --parallel shard/pass:
        // count it without storing.
        ++pair_idx;
        continue;
      }
      *loaded_sample_idx_pairs_iter++ = sample_uidx1;
      *loaded_sample_idx_pairs_iter++ = sample_uidx2;
      ++pair_idx;
      if (pair_idx == pair_idx_stop) {
        if (!is_first_parallel_scan) {
          TextSetPos(AdvPastDelim(line_iter, '\n'), txsp);
          goto KingTableSubsetLoad_finish;
        }
        // large --parallel job, first pass: count number of valid pairs, don't
        // save the remainder
        pair_idx_start = ~0LLU;
      }
    }
    if (unlikely(TextStreamErrcode2(txsp, &reterr))) {
      goto KingTableSubsetLoad_ret_TSTREAM_FAIL;
    }
  KingTableSubsetLoad_finish:
    *pair_idx_ptr = pair_idx;
  }
  while (0) {
  KingTableSubsetLoad_ret_TSTREAM_FAIL:
    TextStreamErrPrint("--king-table-subset file", txsp);
    break;
  KingTableSubsetLoad_ret_MALFORMED_INPUT:
    reterr = kPglRetMalformedInput;
    break;
  KingTableSubsetLoad_ret_MISSING_TOKENS:
    snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --king-table-subset file has fewer tokens than expected.\n", line_idx);
  KingTableSubsetLoad_ret_INCONSISTENT_INPUT_WW:
    WordWrapB(0);
    logerrputsb();
    reterr = kPglRetInconsistentInput;
    break;
  }
  return reterr;
}
2542
// Resumable cursor over within-FID sample pairs; state is saved/restored by
// GetRelCheckPairs() so enumeration can continue across multiple passes.
typedef struct FidPairIteratorStruct {
  uint32_t block_start_idx;  // first nsorted_xidbox index of current FID block
  uint32_t block_end_idx;  // one past the last index of the current FID block
  uint32_t idx1;  // larger member of the current pair (idx1 >= idx2)
  uint32_t idx2;  // smaller member of the current pair
} FidPairIterator;
2549
InitFidPairIterator(FidPairIterator * fpip)2550 void InitFidPairIterator(FidPairIterator* fpip) {
2551 fpip->block_start_idx = UINT32_MAX; // deliberate overflow
2552 fpip->block_end_idx = 0;
2553 fpip->idx1 = 0;
2554 fpip->idx2 = 0; // defensive
2555 }
2556
CountRelCheckPairs(const char * nsorted_xidbox,uintptr_t max_xid_blen,uintptr_t orig_sample_ct,char * idbuf)2557 uint64_t CountRelCheckPairs(const char* nsorted_xidbox, uintptr_t max_xid_blen, uintptr_t orig_sample_ct, char* idbuf) {
2558 uint64_t total = 0;
2559 for (uintptr_t block_start_idx = 0; block_start_idx != orig_sample_ct; ) {
2560 const char* fid_start = &(nsorted_xidbox[block_start_idx * max_xid_blen]);
2561 const uint32_t fid_slen = AdvToDelim(fid_start, '\t') - fid_start;
2562 memcpy(idbuf, fid_start, fid_slen);
2563 idbuf[fid_slen] = ' ';
2564 // bugfix (14 Jan 2020): forgot that natural-sorting was used...
2565 idbuf[fid_slen + 1] = '\0';
2566 const uintptr_t block_end_idx = ExpsearchNsortStrLb(idbuf, nsorted_xidbox, max_xid_blen, orig_sample_ct, block_start_idx + 1);
2567 const uint64_t cur_block_size = block_end_idx - block_start_idx;
2568 total += (cur_block_size * (cur_block_size - 1)) / 2;
2569 block_start_idx = block_end_idx;
2570 }
2571 return total;
2572 }
2573
// Support "--make-king-table rel-check" without an actual subset-file.
//
// Enumerates within-FID sample pairs (idx1 > idx2, both inside the same
// same-FID block of the natural-sorted ID box), appending pairs with global
// pair-index in [pair_idx_start, pair_idx_stop) to loaded_sample_idx_pairs
// as {uidx1, uidx2}.  Iteration state is saved to *fpip so a later call can
// resume where this one stopped.  On the first --parallel scan,
// *pair_idx_ptr is set to the grand total pair count once pair_idx_stop is
// reached.
void GetRelCheckPairs(const char* nsorted_xidbox, const uint32_t* xid_map, uintptr_t max_xid_blen, uintptr_t orig_sample_ct, uint32_t is_first_parallel_scan, uint64_t pair_idx_start, uint64_t pair_idx_stop, FidPairIterator* fpip, uint64_t* pair_idx_ptr, uint32_t* loaded_sample_idx_pairs, char* idbuf) {
  uint32_t block_start_idx = fpip->block_start_idx;
  uint32_t block_end_idx = fpip->block_end_idx;
  uint32_t idx1 = fpip->idx1;
  uint32_t idx2 = fpip->idx2;
  uint64_t pair_idx = *pair_idx_ptr;
  uint32_t* loaded_sample_idx_pairs_iter = loaded_sample_idx_pairs;
  while (1) {
    // Finish the current FID block: for each idx1, pair it with every
    // earlier index in the block.
    for (; idx1 != block_end_idx; ++idx1) {
      // idx1 >= idx2.
      uint32_t cur_pair_ct = idx1 - idx2;
      uint32_t idx2_stop = idx1;
      if (pair_idx_stop - pair_idx < cur_pair_ct) {
        // Output quota runs out partway through this idx1's pairs.
        cur_pair_ct = pair_idx_stop - pair_idx;
        idx2_stop = idx2 + cur_pair_ct;
      }
      if (pair_idx < pair_idx_start) {
        // Still skipping pairs that belong to an earlier shard/pass; advance
        // idx2 past them without storing.
        const uint64_t skip_ct = pair_idx_start - pair_idx;
        if (skip_ct >= cur_pair_ct) {
          idx2 = idx2_stop;
        } else {
          idx2 += skip_ct;
        }
        // pair_idx is updated correctly after the inner loop
      }
      const uint32_t sample_uidx1 = xid_map[idx1];
      for (; idx2 != idx2_stop; ++idx2) {
        const uint32_t sample_uidx2 = xid_map[idx2];
        *loaded_sample_idx_pairs_iter++ = sample_uidx1;
        *loaded_sample_idx_pairs_iter++ = sample_uidx2;
      }
      pair_idx += cur_pair_ct;
      if (pair_idx == pair_idx_stop) {
        if (is_first_parallel_scan) {
          // First --parallel pass: report the grand total instead of the
          // position reached.
          pair_idx = CountRelCheckPairs(nsorted_xidbox, max_xid_blen, orig_sample_ct, idbuf);
        }
        goto GetRelCheckPairs_early_exit;
      }
      idx2 = block_start_idx;
    }
    // Advance to the next FID block, or stop at end of the ID box.
    block_start_idx = block_end_idx;
    if (block_start_idx == orig_sample_ct) {
      break;
    }
    idx2 = block_start_idx;
    const char* fid_start = &(nsorted_xidbox[block_start_idx * max_xid_blen]);
    const uint32_t fid_slen = AdvToDelim(fid_start, '\t') - fid_start;
    // "<FID> " probe + natural-sort lower bound locates the end of this
    // FID's block; see CountRelCheckPairs().
    memcpy(idbuf, fid_start, fid_slen);
    idbuf[fid_slen] = ' ';
    idbuf[fid_slen + 1] = '\0';
    block_end_idx = ExpsearchNsortStrLb(idbuf, nsorted_xidbox, max_xid_blen, orig_sample_ct, block_start_idx + 1);
  }
 GetRelCheckPairs_early_exit:
  // Persist cursor so the next call resumes here.
  *pair_idx_ptr = pair_idx;
  fpip->block_start_idx = block_start_idx;
  fpip->block_end_idx = block_end_idx;
  fpip->idx1 = idx1;
  fpip->idx2 = idx2;
}
2634
CalcKingTableSubset(const uintptr_t * orig_sample_include,const SampleIdInfo * siip,const uintptr_t * variant_include,const ChrInfo * cip,const char * subset_fname,uint32_t raw_sample_ct,uint32_t orig_sample_ct,uint32_t raw_variant_ct,uint32_t variant_ct,double king_table_filter,double king_table_subset_thresh,uint32_t rel_check,KingFlags king_flags,uint32_t parallel_idx,uint32_t parallel_tot,uint32_t max_thread_ct,PgenReader * simple_pgrp,char * outname,char * outname_end)2635 PglErr CalcKingTableSubset(const uintptr_t* orig_sample_include, const SampleIdInfo* siip, const uintptr_t* variant_include, const ChrInfo* cip, const char* subset_fname, uint32_t raw_sample_ct, uint32_t orig_sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, double king_table_filter, double king_table_subset_thresh, uint32_t rel_check, KingFlags king_flags, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t max_thread_ct, PgenReader* simple_pgrp, char* outname, char* outname_end) {
2636 // subset_fname permitted to be nullptr when rel_check is true.
2637 unsigned char* bigstack_mark = g_bigstack_base;
2638 FILE* outfile = nullptr;
2639 char* cswritep = nullptr;
2640 PglErr reterr = kPglRetSuccess;
2641 TextStream txs;
2642 CompressStreamState css;
2643 ThreadGroup tg;
2644 PreinitTextStream(&txs);
2645 PreinitCstream(&css);
2646 PreinitThreads(&tg);
2647 {
2648 if (unlikely(IsSet(cip->haploid_mask, 0))) {
2649 logerrputs("Error: --make-king-table cannot be used on haploid genomes.\n");
2650 goto CalcKingTableSubset_ret_INCONSISTENT_INPUT;
2651 }
2652 reterr = ConditionalAllocateNonAutosomalVariants(cip, "--make-king-table", raw_variant_ct, &variant_include, &variant_ct);
2653 if (unlikely(reterr)) {
2654 goto CalcKingTableSubset_ret_1;
2655 }
2656 // 1. Write output header line if necessary.
2657 // 2. Count number of relevant sample pairs (higher uidx in high 32 bits),
2658 // and load as much as may be useful during first pass (usually there
2659 // will be only one pass).
2660 // 3. If list is empty, error out.
2661 // 4. If --parallel, discard part of the list, then exit if remainder
2662 // empty.
2663 // 5. If remainder of list is too large to process in one pass, determine
2664 // number of necessary passes. If output filename refers to the same
2665 // thing as input file, append ~ to input filename.
2666 // Loop:
2667 // * Determine which sample indexes appear in this part of the list.
2668 // Compute current cumulative_popcounts, perform uidx -> idx conversion.
2669 // (Don't bother sorting the pairs, since that prevents
2670 // --parallel/multipass mode from delivering the same results.)
2671 // * Execute usual KING-robust computation, write .kin0 entries.
2672 // * If not last pass, reload input .kin0, etc.
2673 //
2674 // Could store the pairs in a more compact manner, but can live with 50%
2675 // space bloat for now.
2676 const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
2677 uint32_t sample_ctaw = BitCtToAlignedWordCt(orig_sample_ct);
2678 uint32_t sample_ctaw2 = NypCtToAlignedWordCt(orig_sample_ct);
2679 uint32_t king_bufsizew = kKingMultiplexWords * orig_sample_ct;
2680 uintptr_t* cur_sample_include;
2681 uint32_t* sample_include_cumulative_popcounts;
2682 uintptr_t* loadbuf;
2683 uintptr_t* splitbuf_hom;
2684 uintptr_t* splitbuf_ref2het;
2685 VecW* vecaligned_buf;
2686 // ok if allocations are a bit oversized
2687 CalcKingTableSubsetCtx ctx;
2688 if (unlikely(
2689 bigstack_alloc_w(raw_sample_ctl, &cur_sample_include) ||
2690 bigstack_alloc_u32(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
2691 bigstack_alloc_w(sample_ctaw2, &loadbuf) ||
2692 bigstack_alloc_w(kPglBitTransposeBatch * sample_ctaw, &splitbuf_hom) ||
2693 bigstack_alloc_w(kPglBitTransposeBatch * sample_ctaw, &splitbuf_ref2het) ||
2694 bigstack_alloc_w(king_bufsizew, &(ctx.smaj_hom[0])) ||
2695 bigstack_alloc_w(king_bufsizew, &(ctx.smaj_ref2het[0])) ||
2696 bigstack_alloc_w(king_bufsizew, &(ctx.smaj_hom[1])) ||
2697 bigstack_alloc_w(king_bufsizew, &(ctx.smaj_ref2het[1])) ||
2698 bigstack_alloc_v(kPglBitTransposeBufvecs, &vecaligned_buf))) {
2699 goto CalcKingTableSubset_ret_NOMEM;
2700 }
2701 SetKingTableFname(king_flags, parallel_idx, parallel_tot, outname_end);
2702 if (subset_fname) {
2703 uint32_t fname_slen;
2704 #ifdef _WIN32
2705 fname_slen = GetFullPathName(subset_fname, kPglFnamesize, g_textbuf, nullptr);
2706 if (unlikely((!fname_slen) || (fname_slen > kPglFnamesize)))
2707 #else
2708 if (unlikely(!realpath(subset_fname, g_textbuf)))
2709 #endif
2710 {
2711 logerrprintfww(kErrprintfFopen, subset_fname, strerror(errno));
2712 goto CalcKingTableSubset_ret_OPEN_FAIL;
2713 }
2714 if (RealpathIdentical(outname, g_textbuf, &(g_textbuf[kPglFnamesize + 64]))) {
2715 logerrputs("Warning: --king-table-subset input filename matches --make-king-table output\nfilename. Appending '~' to input filename.\n");
2716 fname_slen = strlen(subset_fname);
2717 memcpy(g_textbuf, subset_fname, fname_slen);
2718 strcpy_k(&(g_textbuf[fname_slen]), "~");
2719 if (unlikely(rename(subset_fname, g_textbuf))) {
2720 logerrputs("Error: Failed to append '~' to --king-table-subset input filename.\n");
2721 goto CalcKingTableSubset_ret_OPEN_FAIL;
2722 }
2723 subset_fname = g_textbuf;
2724 }
2725 }
2726
2727 // Safe to "write" the header line now, if necessary.
2728 reterr = InitCstreamAlloc(outname, 0, king_flags & kfKingTableZs, max_thread_ct, kMaxMediumLine + kCompressStreamBlock, &css, &cswritep);
2729 if (unlikely(reterr)) {
2730 goto CalcKingTableSubset_ret_1;
2731 }
2732 const uint32_t king_col_fid = FidColIsRequired(siip, king_flags / kfKingColMaybefid);
2733 const uint32_t king_col_sid = SidColIsRequired(siip->sids, king_flags / kfKingColMaybesid);
2734 if (!parallel_idx) {
2735 cswritep = AppendKingTableHeader(king_flags, king_col_fid, king_col_sid, cswritep);
2736 }
2737 const uintptr_t max_sample_fmtid_blen = GetMaxSampleFmtidBlen(siip, king_col_fid, king_col_sid);
2738 char* collapsed_sample_fmtids;
2739 if (unlikely(bigstack_alloc_c(max_sample_fmtid_blen * orig_sample_ct, &collapsed_sample_fmtids))) {
2740 goto CalcKingTableSubset_ret_NOMEM;
2741 }
2742 // possible todo: allow this to change between passes
2743 uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
2744 if (calc_thread_ct > orig_sample_ct / 32) {
2745 calc_thread_ct = orig_sample_ct / 32;
2746 }
2747 if (!calc_thread_ct) {
2748 calc_thread_ct = 1;
2749 }
2750 // could eventually have 64-bit g_thread_start?
2751 if (unlikely(
2752 SetThreadCt(calc_thread_ct, &tg) ||
2753 bigstack_alloc_u32(calc_thread_ct + 1, &ctx.thread_start))) {
2754 goto CalcKingTableSubset_ret_NOMEM;
2755 }
2756
2757 uintptr_t line_idx = 0;
2758 uint32_t kinship_skip = 0;
2759 uint32_t skip_sid = 0;
2760 XidMode xid_mode = siip->sids? kfXidModeFidIidSid : kfXidModeIidSid;
2761 if (subset_fname) {
2762 reterr = InitTextStream(subset_fname, kTextStreamBlenFast, 1, &txs);
2763 if (unlikely(reterr)) {
2764 if (reterr == kPglRetEof) {
2765 logerrputs("Error: Empty --king-table-subset file.\n");
2766 goto CalcKingTableSubset_ret_MALFORMED_INPUT;
2767 }
2768 goto CalcKingTableSubset_ret_TSTREAM_FAIL;
2769 }
2770 ++line_idx;
2771 const char* linebuf_iter = TextGet(&txs);
2772 if (unlikely(!linebuf_iter)) {
2773 if (!TextStreamErrcode2(&txs, &reterr)) {
2774 logerrputs("Error: Empty --king-table-subset file.\n");
2775 goto CalcKingTableSubset_ret_MALFORMED_INPUT;
2776 }
2777 goto CalcKingTableSubset_ret_TSTREAM_FAIL;
2778 }
2779 const char* token_end = CurTokenEnd(linebuf_iter);
2780 uint32_t token_slen = token_end - linebuf_iter;
2781 // Make this work with both KING- and plink2-generated .kin0 files.
2782 uint32_t fid_present = strequal_k(linebuf_iter, "#FID1", token_slen) || strequal_k(linebuf_iter, "FID", token_slen);
2783 if (fid_present) {
2784 linebuf_iter = FirstNonTspace(token_end);
2785 token_end = CurTokenEnd(linebuf_iter);
2786 token_slen = token_end - linebuf_iter;
2787 xid_mode = kfXidModeFidIid;
2788 } else {
2789 if (unlikely(*linebuf_iter != '#')) {
2790 goto CalcKingTableSubset_ret_INVALID_HEADER;
2791 }
2792 ++linebuf_iter;
2793 --token_slen;
2794 xid_mode = kfXidModeIid;
2795 }
2796 if (unlikely((!strequal_k(linebuf_iter, "ID1", token_slen)) && (!strequal_k(linebuf_iter, "IID1", token_slen)))) {
2797 goto CalcKingTableSubset_ret_INVALID_HEADER;
2798 }
2799 linebuf_iter = FirstNonTspace(token_end);
2800 token_end = CurTokenEnd(linebuf_iter);
2801 token_slen = token_end - linebuf_iter;
2802 if (strequal_k(linebuf_iter, "SID1", token_slen)) {
2803 if (siip->sids) {
2804 xid_mode = fid_present? kfXidModeFidIidSid : kfXidModeIidSid;
2805 } else {
2806 skip_sid = 1;
2807 }
2808 linebuf_iter = FirstNonTspace(token_end);
2809 token_end = CurTokenEnd(linebuf_iter);
2810 token_slen = token_end - linebuf_iter;
2811 }
2812 if (fid_present) {
2813 if (unlikely(!strequal_k(linebuf_iter, "FID2", token_slen))) {
2814 goto CalcKingTableSubset_ret_INVALID_HEADER;
2815 }
2816 linebuf_iter = FirstNonTspace(token_end);
2817 token_end = CurTokenEnd(linebuf_iter);
2818 token_slen = token_end - linebuf_iter;
2819 }
2820 if (unlikely((!strequal_k(linebuf_iter, "ID2", token_slen)) && (!strequal_k(linebuf_iter, "IID2", token_slen)))) {
2821 goto CalcKingTableSubset_ret_INVALID_HEADER;
2822 }
2823 if (xid_mode == kfXidModeFidIidSid) {
2824 // technically don't need to check this in skip_sid case
2825 linebuf_iter = FirstNonTspace(token_end);
2826 token_end = CurTokenEnd(linebuf_iter);
2827 token_slen = token_end - linebuf_iter;
2828 if (unlikely(!strequal_k(linebuf_iter, "SID2", token_slen))) {
2829 goto CalcKingTableSubset_ret_INVALID_HEADER;
2830 }
2831 }
2832 if (king_table_subset_thresh != -DBL_MAX) {
2833 king_table_subset_thresh *= 1.0 - kSmallEpsilon;
2834 while (1) {
2835 linebuf_iter = FirstNonTspace(token_end);
2836 token_end = CurTokenEnd(linebuf_iter);
2837 token_slen = token_end - linebuf_iter;
2838 if (unlikely(!token_slen)) {
2839 logerrputs("Error: No kinship-coefficient column in --king-table-subset file.\n");
2840 goto CalcKingTableSubset_ret_INCONSISTENT_INPUT;
2841 }
2842 if (strequal_k(linebuf_iter, "KINSHIP", token_slen) || strequal_k(linebuf_iter, "Kinship", token_slen)) {
2843 break;
2844 }
2845 ++kinship_skip;
2846 }
2847 }
2848 }
2849
2850 uint32_t* xid_map; // IDs not collapsed
2851 char* sorted_xidbox;
2852 uintptr_t max_xid_blen;
2853 // may as well use natural-sort order in rel-check-only case
2854 reterr = SortedXidboxInitAlloc(orig_sample_include, siip, orig_sample_ct, 0, xid_mode, (!subset_fname), &sorted_xidbox, &xid_map, &max_xid_blen);
2855 if (unlikely(reterr)) {
2856 goto CalcKingTableSubset_ret_1;
2857 }
2858 char* idbuf;
2859 if (unlikely(bigstack_alloc_c(max_xid_blen, &idbuf))) {
2860 goto CalcKingTableSubset_ret_NOMEM;
2861 }
2862
2863 ctx.homhom_needed = (king_flags & kfKingColNsnp) || ((!(king_flags & kfKingCounts)) && (king_flags & (kfKingColHethet | kfKingColIbs0 | kfKingColIbs1)));
2864 const uint32_t homhom_needed_p4 = ctx.homhom_needed + 4;
2865 // if homhom_needed, 8 + 20 bytes per pair, otherwise 8 + 16
2866 uintptr_t pair_buf_capacity = bigstack_left();
2867 if (unlikely(pair_buf_capacity < 2 * kCacheline)) {
2868 goto CalcKingTableSubset_ret_NOMEM;
2869 }
2870 // adverse rounding
2871 pair_buf_capacity = (pair_buf_capacity - 2 * kCacheline) / (24 + 4 * ctx.homhom_needed);
2872 if (pair_buf_capacity > 0xffffffffU) {
2873 // 32-bit ctx.thread_start[] for now
2874 pair_buf_capacity = 0xffffffffU;
2875 }
2876 ctx.loaded_sample_idx_pairs = S_CAST(uint32_t*, bigstack_alloc_raw_rd(pair_buf_capacity * 2 * sizeof(int32_t)));
2877 ctx.king_counts = R_CAST(uint32_t*, g_bigstack_base);
2878 SetThreadFuncAndData(CalcKingTableSubsetThread, &ctx, &tg);
2879
2880 FidPairIterator fpi;
2881 InitFidPairIterator(&fpi);
2882
2883 uint64_t pair_idx = 0;
2884 if (!subset_fname) {
2885 GetRelCheckPairs(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, (parallel_tot != 1), 0, pair_buf_capacity, &fpi, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
2886 } else {
2887 fputs("Scanning --king-table-subset file...", stdout);
2888 fflush(stdout);
2889 reterr = KingTableSubsetLoad(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, king_table_subset_thresh, xid_mode, skip_sid, rel_check, kinship_skip, (parallel_tot != 1), 0, pair_buf_capacity, line_idx, &txs, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
2890 if (unlikely(reterr)) {
2891 goto CalcKingTableSubset_ret_1;
2892 }
2893 }
2894 uint64_t pair_idx_global_start = 0;
2895 uint64_t pair_idx_global_stop = ~0LLU;
2896 if (parallel_tot != 1) {
2897 const uint64_t parallel_pair_ct = pair_idx;
2898 pair_idx_global_start = (parallel_idx * parallel_pair_ct) / parallel_tot;
2899 pair_idx_global_stop = ((parallel_idx + 1) * parallel_pair_ct) / parallel_tot;
2900 if (pair_idx > pair_buf_capacity) {
2901 // may as well document possible overflow
2902 if (unlikely(parallel_pair_ct > ((~0LLU) / kParallelMax))) {
2903 if (!subset_fname) {
2904 // This is easy to support if there's ever a need, of course.
2905 logerrputs("Error: Too many \"--make-king-table rel-check\" sample pairs for this " PROG_NAME_STR "\nbuild.\n");
2906 } else {
2907 logerrputs("Error: Too many --king-table-subset sample pairs for this " PROG_NAME_STR " build.\n");
2908 }
2909 reterr = kPglRetNotYetSupported;
2910 goto CalcKingTableSubset_ret_1;
2911 }
2912 if (pair_idx_global_stop > pair_buf_capacity) {
2913 // large --parallel job
2914 pair_idx = 0;
2915 if (!subset_fname) {
2916 InitFidPairIterator(&fpi);
2917 GetRelCheckPairs(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, 0, pair_idx_global_start, MINV(pair_idx_global_stop, pair_idx_global_start + pair_buf_capacity), &fpi, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
2918 } else {
2919 reterr = TextRewind(&txs);
2920 if (unlikely(reterr)) {
2921 goto CalcKingTableSubset_ret_TSTREAM_FAIL;
2922 }
2923 // bugfix (4 Oct 2019): forgot a bunch of reinitialization here
2924 line_idx = 1;
2925 char* header_throwaway;
2926 reterr = TextNextLineLstrip(&txs, &header_throwaway);
2927 if (unlikely(reterr)) {
2928 goto CalcKingTableSubset_ret_TSTREAM_REWIND_FAIL;
2929 }
2930 reterr = KingTableSubsetLoad(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, king_table_subset_thresh, xid_mode, skip_sid, rel_check, kinship_skip, 0, pair_idx_global_start, MINV(pair_idx_global_stop, pair_idx_global_start + pair_buf_capacity), line_idx, &txs, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
2931 if (unlikely(reterr)) {
2932 goto CalcKingTableSubset_ret_1;
2933 }
2934 }
2935 } else {
2936 pair_idx = pair_idx_global_stop;
2937 if (pair_idx_global_start) {
2938 memmove(ctx.loaded_sample_idx_pairs, &(ctx.loaded_sample_idx_pairs[pair_idx_global_start * 2]), (pair_idx_global_stop - pair_idx_global_start) * 2 * sizeof(int32_t));
2939 }
2940 }
2941 } else {
2942 pair_idx = pair_idx_global_stop;
2943 if (pair_idx_global_start) {
2944 memmove(ctx.loaded_sample_idx_pairs, &(ctx.loaded_sample_idx_pairs[pair_idx_global_start * 2]), (pair_idx_global_stop - pair_idx_global_start) * 2 * sizeof(int32_t));
2945 }
2946 }
2947 }
2948 uint64_t pair_idx_cur_start = pair_idx_global_start;
2949 uint64_t king_table_filter_ct = 0;
2950 uintptr_t pass_idx = 1;
2951 while (pair_idx_cur_start < pair_idx) {
2952 ZeroWArr(raw_sample_ctl, cur_sample_include);
2953 const uintptr_t cur_pair_ct = pair_idx - pair_idx_cur_start;
2954 const uintptr_t cur_pair_ct_x2 = 2 * cur_pair_ct;
2955 for (uintptr_t ulii = 0; ulii != cur_pair_ct_x2; ++ulii) {
2956 SetBit(ctx.loaded_sample_idx_pairs[ulii], cur_sample_include);
2957 }
2958 FillCumulativePopcounts(cur_sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
2959 const uint32_t cur_sample_ct = sample_include_cumulative_popcounts[raw_sample_ctl - 1] + PopcountWord(cur_sample_include[raw_sample_ctl - 1]);
2960 const uint32_t cur_sample_ctaw = BitCtToAlignedWordCt(cur_sample_ct);
2961 const uint32_t cur_sample_ctaw2 = NypCtToAlignedWordCt(cur_sample_ct);
2962 if (cur_sample_ct != raw_sample_ct) {
2963 for (uintptr_t ulii = 0; ulii != cur_pair_ct_x2; ++ulii) {
2964 ctx.loaded_sample_idx_pairs[ulii] = RawToSubsettedPos(cur_sample_include, sample_include_cumulative_popcounts, ctx.loaded_sample_idx_pairs[ulii]);
2965 }
2966 }
2967 ZeroU32Arr(cur_pair_ct * homhom_needed_p4, ctx.king_counts);
2968 CollapsedSampleFmtidInit(cur_sample_include, siip, cur_sample_ct, king_col_fid, king_col_sid, max_sample_fmtid_blen, collapsed_sample_fmtids);
2969 for (uint32_t tidx = 0; tidx <= calc_thread_ct; ++tidx) {
2970 ctx.thread_start[tidx] = (tidx * S_CAST(uint64_t, cur_pair_ct)) / calc_thread_ct;
2971 }
2972 if (pass_idx != 1) {
2973 ReinitThreads(&tg);
2974 }
2975 // possible todo: singleton/monomorphic optimization for sufficiently
2976 // large jobs
2977 uintptr_t variant_uidx_base = 0;
2978 uintptr_t cur_bits = variant_include[0];
2979 uint32_t variants_completed = 0;
2980 uint32_t parity = 0;
2981 const uint32_t sample_batch_ct_m1 = (cur_sample_ct - 1) / kPglBitTransposeBatch;
2982 PgrSampleSubsetIndex pssi;
2983 PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, simple_pgrp, &pssi);
2984 do {
2985 const uint32_t cur_block_size = MINV(variant_ct - variants_completed, kKingMultiplex);
2986 uintptr_t* cur_smaj_hom = ctx.smaj_hom[parity];
2987 uintptr_t* cur_smaj_ref2het = ctx.smaj_ref2het[parity];
2988 // "block" = distance computation granularity, usually 1024 or 1536
2989 // variants
2990 // "batch" = variant-major-to-sample-major transpose granularity,
2991 // currently 512 variants
2992 uint32_t variant_batch_size = kPglBitTransposeBatch;
2993 uint32_t variant_batch_size_rounded_up = kPglBitTransposeBatch;
2994 const uint32_t write_batch_ct_m1 = (cur_block_size - 1) / kPglBitTransposeBatch;
2995 for (uint32_t write_batch_idx = 0; ; ++write_batch_idx) {
2996 if (write_batch_idx >= write_batch_ct_m1) {
2997 if (write_batch_idx > write_batch_ct_m1) {
2998 break;
2999 }
3000 variant_batch_size = ModNz(cur_block_size, kPglBitTransposeBatch);
3001 variant_batch_size_rounded_up = variant_batch_size;
3002 const uint32_t variant_batch_size_rem = variant_batch_size % kBitsPerWord;
3003 if (variant_batch_size_rem) {
3004 const uint32_t trailing_variant_ct = kBitsPerWord - variant_batch_size_rem;
3005 variant_batch_size_rounded_up += trailing_variant_ct;
3006 ZeroWArr(trailing_variant_ct * cur_sample_ctaw, &(splitbuf_hom[variant_batch_size * cur_sample_ctaw]));
3007 ZeroWArr(trailing_variant_ct * cur_sample_ctaw, &(splitbuf_ref2het[variant_batch_size * cur_sample_ctaw]));
3008 }
3009 }
3010 uintptr_t* hom_iter = splitbuf_hom;
3011 uintptr_t* ref2het_iter = splitbuf_ref2het;
3012 for (uint32_t uii = 0; uii != variant_batch_size; ++uii) {
3013 const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
3014 reterr = PgrGet(cur_sample_include, pssi, cur_sample_ct, variant_uidx, simple_pgrp, loadbuf);
3015 if (unlikely(reterr)) {
3016 goto CalcKingTableSubset_ret_PGR_FAIL;
3017 }
3018 // may want to support some sort of low-MAF optimization here
3019 SetTrailingNyps(cur_sample_ct, loadbuf);
3020 SplitHomRef2hetUnsafeW(loadbuf, cur_sample_ctaw2, hom_iter, ref2het_iter);
3021 hom_iter = &(hom_iter[cur_sample_ctaw]);
3022 ref2het_iter = &(ref2het_iter[cur_sample_ctaw]);
3023 }
3024 // uintptr_t* read_iter = loadbuf;
3025 uintptr_t* write_hom_iter = &(cur_smaj_hom[write_batch_idx * kPglBitTransposeWords]);
3026 uintptr_t* write_ref2het_iter = &(cur_smaj_ref2het[write_batch_idx * kPglBitTransposeWords]);
3027 uint32_t write_batch_size = kPglBitTransposeBatch;
3028 for (uint32_t sample_batch_idx = 0; ; ++sample_batch_idx) {
3029 if (sample_batch_idx >= sample_batch_ct_m1) {
3030 if (sample_batch_idx > sample_batch_ct_m1) {
3031 break;
3032 }
3033 write_batch_size = ModNz(cur_sample_ct, kPglBitTransposeBatch);
3034 }
3035 // bugfix: read_batch_size must be rounded up to word boundary,
3036 // since we want to one-out instead of zero-out the trailing bits
3037 //
3038 // bugfix: if we always use kPglBitTransposeBatch instead of
3039 // variant_batch_size_rounded_up, we read/write past the
3040 // kKingMultiplex limit and clobber the first variants of the next
3041 // sample with garbage.
3042 TransposeBitblock(&(splitbuf_hom[sample_batch_idx * kPglBitTransposeWords]), cur_sample_ctaw, kKingMultiplexWords, variant_batch_size_rounded_up, write_batch_size, write_hom_iter, vecaligned_buf);
3043 TransposeBitblock(&(splitbuf_ref2het[sample_batch_idx * kPglBitTransposeWords]), cur_sample_ctaw, kKingMultiplexWords, variant_batch_size_rounded_up, write_batch_size, write_ref2het_iter, vecaligned_buf);
3044 write_hom_iter = &(write_hom_iter[kKingMultiplex * kPglBitTransposeWords]);
3045 write_ref2het_iter = &(write_ref2het_iter[kKingMultiplex * kPglBitTransposeWords]);
3046 }
3047 }
3048 const uint32_t cur_block_sizew = BitCtToWordCt(cur_block_size);
3049 if (cur_block_sizew < kKingMultiplexWords) {
3050 uintptr_t* write_hom_iter = &(cur_smaj_hom[cur_block_sizew]);
3051 uintptr_t* write_ref2het_iter = &(cur_smaj_ref2het[cur_block_sizew]);
3052 const uint32_t write_word_ct = kKingMultiplexWords - cur_block_sizew;
3053 for (uint32_t sample_idx = 0; sample_idx != cur_sample_ct; ++sample_idx) {
3054 ZeroWArr(write_word_ct, write_hom_iter);
3055 ZeroWArr(write_word_ct, write_ref2het_iter);
3056 write_hom_iter = &(write_hom_iter[kKingMultiplexWords]);
3057 write_ref2het_iter = &(write_ref2het_iter[kKingMultiplexWords]);
3058 }
3059 }
3060 if (variants_completed) {
3061 JoinThreads(&tg);
3062 // CalcKingTableSubsetThread() never errors out
3063 }
3064 // this update must occur after JoinThreads() call
3065 if (variants_completed + cur_block_size == variant_ct) {
3066 DeclareLastThreadBlock(&tg);
3067 }
3068 if (unlikely(SpawnThreads(&tg))) {
3069 goto CalcKingTableSubset_ret_THREAD_CREATE_FAIL;
3070 }
3071 printf("\r--make-king-table pass %" PRIuPTR ": %u variants complete.", pass_idx, variants_completed);
3072 fflush(stdout);
3073 variants_completed += cur_block_size;
3074 parity = 1 - parity;
3075 } while (!IsLastBlock(&tg));
3076 JoinThreads(&tg);
3077 printf("\r--make-king-table pass %" PRIuPTR ": Writing... \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", pass_idx);
3078 fflush(stdout);
3079
3080 const uint32_t king_col_id = king_flags & kfKingColId;
3081 const uint32_t king_col_nsnp = king_flags & kfKingColNsnp;
3082 const uint32_t king_col_hethet = king_flags & kfKingColHethet;
3083 const uint32_t king_col_ibs0 = king_flags & kfKingColIbs0;
3084 const uint32_t king_col_ibs1 = king_flags & kfKingColIbs1;
3085 const uint32_t king_col_kinship = king_flags & kfKingColKinship;
3086 const uint32_t report_counts = king_flags & kfKingCounts;
3087 uint32_t* results_iter = ctx.king_counts;
3088 double nonmiss_recip = 0.0;
3089 for (uintptr_t cur_pair_idx = 0; cur_pair_idx != cur_pair_ct; ++cur_pair_idx, results_iter = &(results_iter[homhom_needed_p4])) {
3090 const uint32_t ibs0_ct = results_iter[kKingOffsetIbs0];
3091 const uint32_t hethet_ct = results_iter[kKingOffsetHethet];
3092 const uint32_t het2hom1_ct = results_iter[kKingOffsetHet2Hom1];
3093 const uint32_t het1hom2_ct = results_iter[kKingOffsetHet1Hom2];
3094 const intptr_t smaller_het_ct = hethet_ct + MINV(het1hom2_ct, het2hom1_ct);
3095 const double kinship_coeff = 0.5 - (S_CAST(double, 4 * S_CAST(intptr_t, ibs0_ct) + het1hom2_ct + het2hom1_ct) / S_CAST(double, 4 * smaller_het_ct));
3096 if ((king_table_filter != -DBL_MAX) && (kinship_coeff < king_table_filter)) {
3097 ++king_table_filter_ct;
3098 continue;
3099 }
3100 const uint32_t sample_idx1 = ctx.loaded_sample_idx_pairs[2 * cur_pair_idx];
3101 const uint32_t sample_idx2 = ctx.loaded_sample_idx_pairs[2 * cur_pair_idx + 1];
3102 if (king_col_id) {
3103 cswritep = strcpyax(cswritep, &(collapsed_sample_fmtids[max_sample_fmtid_blen * sample_idx1]), '\t');
3104 cswritep = strcpyax(cswritep, &(collapsed_sample_fmtids[max_sample_fmtid_blen * sample_idx2]), '\t');
3105 }
3106 if (homhom_needed_p4 == 5) {
3107 const uint32_t homhom_ct = results_iter[kKingOffsetHomhom];
3108 const uint32_t nonmiss_ct = het1hom2_ct + het2hom1_ct + homhom_ct + hethet_ct;
3109 if (king_col_nsnp) {
3110 cswritep = u32toa_x(nonmiss_ct, '\t', cswritep);
3111 }
3112 if (!report_counts) {
3113 nonmiss_recip = 1.0 / u31tod(nonmiss_ct);
3114 }
3115 }
3116 if (king_col_hethet) {
3117 if (report_counts) {
3118 cswritep = u32toa(hethet_ct, cswritep);
3119 } else {
3120 cswritep = dtoa_g(nonmiss_recip * u31tod(hethet_ct), cswritep);
3121 }
3122 *cswritep++ = '\t';
3123 }
3124 if (king_col_ibs0) {
3125 if (report_counts) {
3126 cswritep = u32toa(ibs0_ct, cswritep);
3127 } else {
3128 cswritep = dtoa_g(nonmiss_recip * u31tod(ibs0_ct), cswritep);
3129 }
3130 *cswritep++ = '\t';
3131 }
3132 if (king_col_ibs1) {
3133 if (report_counts) {
3134 cswritep = u32toa_x(het1hom2_ct, '\t', cswritep);
3135 cswritep = u32toa(het2hom1_ct, cswritep);
3136 } else {
3137 cswritep = dtoa_g(nonmiss_recip * u31tod(het1hom2_ct), cswritep);
3138 *cswritep++ = '\t';
3139 cswritep = dtoa_g(nonmiss_recip * u31tod(het2hom1_ct), cswritep);
3140 }
3141 *cswritep++ = '\t';
3142 }
3143 if (king_col_kinship) {
3144 cswritep = dtoa_g(kinship_coeff, cswritep);
3145 ++cswritep;
3146 }
3147 DecrAppendBinaryEoln(&cswritep);
3148 if (unlikely(Cswrite(&css, &cswritep))) {
3149 goto CalcKingTableSubset_ret_WRITE_FAIL;
3150 }
3151 }
3152
3153 putc_unlocked('\r', stdout);
3154 const uint64_t pair_complete_ct = pair_idx - pair_idx_global_start;
3155 logprintf("Subsetted --make-king-table: %" PRIu64 " pair%s complete.\n", pair_complete_ct, (pair_complete_ct == 1)? "" : "s");
3156 if (TextEof(&txs) || (pair_idx == pair_idx_global_stop)) {
3157 break;
3158 }
3159 pair_idx_cur_start = pair_idx;
3160 if (!subset_fname) {
3161 GetRelCheckPairs(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, 0, pair_idx_global_start, MINV(pair_idx_global_stop, pair_idx_global_start + pair_buf_capacity), &fpi, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
3162 } else {
3163 fputs("Scanning --king-table-subset file...", stdout);
3164 fflush(stdout);
3165 reterr = KingTableSubsetLoad(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, king_table_subset_thresh, xid_mode, skip_sid, rel_check, kinship_skip, 0, pair_idx_cur_start, MINV(pair_idx_global_stop, pair_idx_cur_start + pair_buf_capacity), line_idx, &txs, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
3166 if (unlikely(reterr)) {
3167 goto CalcKingTableSubset_ret_1;
3168 }
3169 }
3170 ++pass_idx;
3171 }
3172 if (unlikely(CswriteCloseNull(&css, cswritep))) {
3173 goto CalcKingTableSubset_ret_WRITE_FAIL;
3174 }
3175 logprintfww("Results written to %s .\n", outname);
3176 if (king_table_filter != -DBL_MAX) {
3177 const uint64_t reported_ct = pair_idx - pair_idx_global_start - king_table_filter_ct;
3178 logprintf("--king-table-filter: %" PRIu64 " relationship%s reported (%" PRIu64 " filtered out).\n", reported_ct, (reported_ct == 1)? "" : "s", king_table_filter_ct);
3179 }
3180 }
3181 while (0) {
3182 CalcKingTableSubset_ret_NOMEM:
3183 reterr = kPglRetNomem;
3184 break;
3185 CalcKingTableSubset_ret_OPEN_FAIL:
3186 reterr = kPglRetOpenFail;
3187 break;
3188 CalcKingTableSubset_ret_TSTREAM_REWIND_FAIL:
3189 TextStreamErrPrintRewind("--king-table-subset file", &txs, &reterr);
3190 break;
3191 CalcKingTableSubset_ret_TSTREAM_FAIL:
3192 TextStreamErrPrint("--king-table-subset file", &txs);
3193 break;
3194 CalcKingTableSubset_ret_PGR_FAIL:
3195 PgenErrPrintN(reterr);
3196 break;
3197 CalcKingTableSubset_ret_WRITE_FAIL:
3198 reterr = kPglRetWriteFail;
3199 break;
3200 CalcKingTableSubset_ret_INVALID_HEADER:
3201 logerrputs("Error: Invalid header line in --king-table-subset file.\n");
3202 CalcKingTableSubset_ret_MALFORMED_INPUT:
3203 reterr = kPglRetMalformedInput;
3204 break;
3205 CalcKingTableSubset_ret_INCONSISTENT_INPUT:
3206 reterr = kPglRetInconsistentInput;
3207 break;
3208 CalcKingTableSubset_ret_THREAD_CREATE_FAIL:
3209 reterr = kPglRetThreadCreateFail;
3210 break;
3211 }
3212 CalcKingTableSubset_ret_1:
3213 CleanupThreads(&tg);
3214 CleanupTextStream2("--king-table-subset file", &txs, &reterr);
3215 CswriteCloseCond(&css, cswritep);
3216 fclose_cond(outfile);
3217 BigstackReset(bigstack_mark);
3218 return reterr;
3219 }
3220
3221 // assumes trailing bits of genovec are zeroed out
ExpandCenteredVarmaj(const uintptr_t * genovec,const uintptr_t * dosage_present,const Dosage * dosage_main,uint32_t variance_standardize,uint32_t is_haploid,uint32_t sample_ct,uint32_t dosage_ct,double ref_freq,double * normed_dosages)3222 PglErr ExpandCenteredVarmaj(const uintptr_t* genovec, const uintptr_t* dosage_present, const Dosage* dosage_main, uint32_t variance_standardize, uint32_t is_haploid, uint32_t sample_ct, uint32_t dosage_ct, double ref_freq, double* normed_dosages) {
3223 const double alt_freq = 1.0 - ref_freq;
3224 double inv_stdev;
3225 if (variance_standardize) {
3226 const double variance = 2 * ref_freq * alt_freq;
3227 if (!(variance > kSmallEpsilon)) {
3228 // See LoadMultiallelicCenteredVarmaj(). This check was tightened up in
3229 // alpha 3 to reject all-het and monomorphic-wrong-allele variants.
3230 STD_ARRAY_DECL(uint32_t, 4, genocounts);
3231 GenoarrCountFreqsUnsafe(genovec, sample_ct, genocounts);
3232 if (unlikely(dosage_ct || genocounts[1])) {
3233 return kPglRetDegenerateData;
3234 }
3235 if (variance != variance) {
3236 if (unlikely(genocounts[0] || genocounts[2])) {
3237 return kPglRetDegenerateData;
3238 }
3239 } else {
3240 if (ref_freq > 0.5) {
3241 if (unlikely(genocounts[2])) {
3242 return kPglRetDegenerateData;
3243 }
3244 } else {
3245 if (unlikely(genocounts[0])) {
3246 return kPglRetDegenerateData;
3247 }
3248 }
3249 }
3250 ZeroDArr(sample_ct, normed_dosages);
3251 return kPglRetSuccess;
3252 }
3253 inv_stdev = 1.0 / sqrt(variance);
3254 if (is_haploid) {
3255 // For our purposes, variance is doubled in haploid case.
3256 inv_stdev *= (1.0 / kSqrt2);
3257 }
3258 // possible todo:
3259 // * Could use one inv_stdev for males and one for nonmales for chrX
3260 // --score (while still leaving that out of GRM... or just leave males
3261 // out there?). This depends on dosage compensation model; discussed in
3262 // e.g. GCTA paper.
3263 } else {
3264 // Extra factor of 2 removed from haploid 'cov' formula in alpha 3.
3265 inv_stdev = is_haploid? 0.5 : 1.0;
3266 }
3267 PopulateRescaledDosage(genovec, dosage_present, dosage_main, inv_stdev, -2 * alt_freq * inv_stdev, 0.0, sample_ct, dosage_ct, normed_dosages);
3268 return kPglRetSuccess;
3269 }
3270
3271 // This breaks the "don't pass pssi between functions" rule since it's a thin
3272 // wrapper around PgrGetInv1D().
LoadBiallelicCenteredVarmaj(const uintptr_t * sample_include,PgrSampleSubsetIndex pssi,uint32_t variance_standardize,uint32_t is_haploid,uint32_t sample_ct,uint32_t variant_uidx,double ref_freq,PgenReader * simple_pgrp,uint32_t * missing_presentp,double * normed_dosages,uintptr_t * genovec_buf,uintptr_t * dosage_present_buf,Dosage * dosage_main_buf)3273 PglErr LoadBiallelicCenteredVarmaj(const uintptr_t* sample_include, PgrSampleSubsetIndex pssi, uint32_t variance_standardize, uint32_t is_haploid, uint32_t sample_ct, uint32_t variant_uidx, double ref_freq, PgenReader* simple_pgrp, uint32_t* missing_presentp, double* normed_dosages, uintptr_t* genovec_buf, uintptr_t* dosage_present_buf, Dosage* dosage_main_buf) {
3274 uint32_t dosage_ct;
3275 PglErr reterr = PgrGetD(sample_include, pssi, sample_ct, variant_uidx, simple_pgrp, genovec_buf, dosage_present_buf, dosage_main_buf, &dosage_ct);
3276 if (unlikely(reterr)) {
3277 // don't print malformed-.pgen error message here, since this is called
3278 // from multithreaded loops
3279 return reterr;
3280 }
3281 ZeroTrailingNyps(sample_ct, genovec_buf);
3282 if (missing_presentp) {
3283 // missing_present assumed to be initialized to 0
3284 // this should probably be a library function...
3285 const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
3286 if (!dosage_ct) {
3287 for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
3288 const uintptr_t detect_11 = Word11(genovec_buf[widx]);
3289 if (detect_11) {
3290 *missing_presentp = 1;
3291 break;
3292 }
3293 }
3294 } else {
3295 Halfword* dosage_present_alias = R_CAST(Halfword*, dosage_present_buf);
3296 for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
3297 const uintptr_t detect_11 = Word11(genovec_buf[widx]);
3298 if (detect_11) {
3299 if (PackWordToHalfword(detect_11) & (~dosage_present_alias[widx])) {
3300 *missing_presentp = 1;
3301 break;
3302 }
3303 }
3304 }
3305 }
3306 }
3307 return ExpandCenteredVarmaj(genovec_buf, dosage_present_buf, dosage_main_buf, variance_standardize, is_haploid, sample_ct, dosage_ct, ref_freq, normed_dosages);
3308 }
3309
ComputeDiploidMultiallelicVariance(const double * cur_allele_freqs,uint32_t cur_allele_ct)3310 double ComputeDiploidMultiallelicVariance(const double* cur_allele_freqs, uint32_t cur_allele_ct) {
3311 const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
3312 double variance = 0.0;
3313 double freq_sum = 0.0;
3314 for (uint32_t allele_idx = 0; allele_idx != cur_allele_ct_m1; ++allele_idx) {
3315 const double cur_allele_freq = cur_allele_freqs[allele_idx];
3316 variance += cur_allele_freq * (1.0 - cur_allele_freq);
3317 freq_sum += cur_allele_freq;
3318 }
3319 if (freq_sum < 1.0 - kSmallEpsilon) {
3320 const double last_allele_freq = 1.0 - freq_sum;
3321 variance += freq_sum * last_allele_freq;
3322 }
3323 return variance;
3324 }
3325
3326 // Assumes trailing bits of pgvp->genovec have been zeroed out.
CheckMultiallelicDegenVariance(const PgenVariant * pgvp,const double * cur_allele_freqs,uint32_t sample_ct,uint32_t cur_allele_ct,double variance)3327 BoolErr CheckMultiallelicDegenVariance(const PgenVariant* pgvp, const double* cur_allele_freqs, uint32_t sample_ct, uint32_t cur_allele_ct, double variance) {
3328 // One allele has 100% frequency (or all frequencies are NaN).
3329 // If it's the REF allele, error out unless all nonmissing genotypes are
3330 // homozygous-ref, in which case this row can be filled with zeroes (or
3331 // omitted).
3332 // If it's ALT1, error out unless all nonmissing genotypes are hom-ALT1, etc.
3333 const uintptr_t* genovec_buf = pgvp->genovec;
3334 STD_ARRAY_DECL(uint32_t, 4, genocounts);
3335 GenoarrCountFreqsUnsafe(genovec_buf, sample_ct, genocounts);
3336 if (unlikely(pgvp->dosage_ct || genocounts[1])) {
3337 return 1;
3338 }
3339 const uint32_t nm_sample_ct = genocounts[2];
3340 if (variance != variance) {
3341 // NaN frequency is possible when all founder genotypes/dosages are
3342 // missing. Error out in this case unless all other genotypes/dosages are
3343 // also missing.
3344 return (genocounts[0] || nm_sample_ct);
3345 }
3346 if (cur_allele_freqs[0] > 0.5) {
3347 return (nm_sample_ct != 0);
3348 }
3349 if (unlikely(genocounts[0])) {
3350 return 1;
3351 }
3352 if (cur_allele_freqs[1] > 0.5) {
3353 return (pgvp->patch_10_ct != 0);
3354 }
3355 if (pgvp->patch_10_ct != nm_sample_ct) {
3356 return 0;
3357 }
3358 const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
3359 uint32_t mono_allele_idx;
3360 for (mono_allele_idx = 2; mono_allele_idx != cur_allele_ct_m1; ++mono_allele_idx) {
3361 if (cur_allele_freqs[mono_allele_idx] > 0.5) {
3362 break;
3363 }
3364 }
3365 return !AllBytesAreX(pgvp->patch_10_vals, mono_allele_idx, 2 * nm_sample_ct);
3366 }
3367
// Loads one multiallelic variant and writes (allele_idx_end -
// allele_idx_start) rows of centered (and, if variance_standardize,
// variance-standardized) per-allele dosages to normed_dosages[], one row of
// sample_ct doubles per allele in [allele_idx_start, allele_idx_end).
// - cur_allele_freqs[] stores the first (cur_allele_ct - 1) allele
//   frequencies; the last allele's frequency is implied.
// - If missing_presentp is non-null, *missing_presentp (assumed
//   zero-initialized) is set to 1 when any sample has a missing genotype not
//   covered by a dosage.
// - allele_1copy_buf[] is scratch space with at least cur_allele_ct entries.
// Returns kPglRetDegenerateData when variance is ~0 but the genotypes are
// not actually monomorphic; returns an all-zero block when they are.
PglErr LoadMultiallelicCenteredVarmaj(const uintptr_t* sample_include, PgrSampleSubsetIndex pssi, const double* cur_allele_freqs, uint32_t variance_standardize, uint32_t is_haploid, uint32_t sample_ct, uint32_t variant_uidx, uint32_t cur_allele_ct, uint32_t allele_idx_start, uint32_t allele_idx_end, PgenReader* simple_pgrp, uint32_t* missing_presentp, double* normed_dosages, PgenVariant* pgvp, double* allele_1copy_buf) {
  // This handles cur_allele_ct == 2 correctly. But we typically don't use it
  // in that case since it does ~2x as much work as necessary: the two
  // normed_dosages[] rows are identical except for opposite sign, so it's best
  // to combine them into one row.
  PglErr reterr = PgrGetMD(sample_include, pssi, sample_ct, variant_uidx, simple_pgrp, pgvp);
  if (unlikely(reterr)) {
    return reterr;
  }
  ZeroTrailingNyps(sample_ct, pgvp->genovec);
  const uintptr_t* genovec_buf = pgvp->genovec;
  if (missing_presentp) {
    // missing_present assumed to be initialized to 0
    const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
    if (!pgvp->dosage_ct) {
      // No dosages: any 0b11 genotype entry is a missing call.
      for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
        const uintptr_t detect_11 = Word11(genovec_buf[widx]);
        if (detect_11) {
          *missing_presentp = 1;
          break;
        }
      }
    } else {
      // With dosages, a 0b11 genotype only counts as missing if no dosage is
      // present for that sample.
      Halfword* dosage_present_alias = R_CAST(Halfword*, pgvp->dosage_present);
      for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
        const uintptr_t detect_11 = Word11(genovec_buf[widx]);
        if (detect_11) {
          if (PackWordToHalfword(detect_11) & (~dosage_present_alias[widx])) {
            *missing_presentp = 1;
            break;
          }
        }
      }
    }
  }
  // Reconstruct the implicit last-allele frequency from the stored ones.
  const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
  double freq_sum = cur_allele_freqs[0];
  for (uint32_t uii = 1; uii != cur_allele_ct_m1; ++uii) {
    freq_sum += cur_allele_freqs[uii];
  }
  const double last_allele_freq = 1.0 - freq_sum;
  double inv_stdev;
  if (variance_standardize) {
    const double variance = ComputeDiploidMultiallelicVariance(cur_allele_freqs, cur_allele_ct);
    if (!(variance > kSmallEpsilon)) {
      // (Near-)zero variance: only acceptable if the variant is genuinely
      // monomorphic; then the whole requested block is zero-filled.
      if (unlikely(CheckMultiallelicDegenVariance(pgvp, cur_allele_freqs, sample_ct, cur_allele_ct, variance))) {
        return kPglRetDegenerateData;
      }
      ZeroDArr(S_CAST(uintptr_t, sample_ct) * (allele_idx_end - allele_idx_start), normed_dosages);
      return kPglRetSuccess;
    }
    // Per-allele diploid variance is 2 * p * (1 - p); variance here is the
    // sum of p * (1 - p) terms, hence the extra 1/sqrt(2) factor.
    inv_stdev = (1.0 / kSqrt2) / sqrt(variance);
    if (is_haploid) {
      // One copy instead of two: halve the scale (applied as 1/sqrt(2) since
      // it multiplies the already-halved standardization factor).
      inv_stdev *= (1.0 / kSqrt2);
    }
  } else {
    inv_stdev = is_haploid? (0.5 / kSqrt2) : (1.0 / kSqrt2);
  }
  if (!pgvp->dosage_ct) {
    // diploid:
    // \sum_i x_i * (1 - x_i)
    double lookup_vals[32] ALIGNV16;
    // Bias the row pointer so that row allele_idx lives at
    // normed_dosages0[allele_idx * sample_ct].
    double* normed_dosages0 = normed_dosages - (allele_idx_start * S_CAST(uintptr_t, sample_ct));
    double alt1_intercept = 0.0;
    for (uint32_t allele_idx = allele_idx_start; allele_idx != allele_idx_end; ++allele_idx) {
      double cur_allele_freq;
      if (allele_idx != cur_allele_ct_m1) {
        cur_allele_freq = cur_allele_freqs[allele_idx];
      } else {
        cur_allele_freq = last_allele_freq;
      }
      // Centered value for k copies of this allele is
      // (k - 2 * freq) * inv_stdev; intercept is the 0-copy value.
      const double intercept = -2 * cur_allele_freq * inv_stdev;
      if (!allele_idx) {
        // genovec entry of 0 corresponds to 2 copies of REF allele, etc.
        lookup_vals[0] = intercept + 2 * inv_stdev;
        lookup_vals[2] = intercept + inv_stdev;
        lookup_vals[4] = intercept;
        lookup_vals[6] = 0.0;  // missing genotype -> 0 (mean after centering)
        InitLookup16x8bx2(lookup_vals);
        GenoarrLookup16x8bx2(genovec_buf, lookup_vals, sample_ct, normed_dosages0);
        continue;
      }
      // 1-copy value for this allele, used by the patch fixups below.
      allele_1copy_buf[allele_idx] = intercept + inv_stdev;
      if (allele_idx == 1) {
        alt1_intercept = intercept;
        // Initial fill assumes every non-REF allele copy is ALT1; patch_01/
        // patch_10 entries are corrected afterwards.
        lookup_vals[0] = intercept;
        lookup_vals[2] = intercept + inv_stdev;
        lookup_vals[4] = intercept + 2 * inv_stdev;
        lookup_vals[6] = 0.0;
        InitLookup16x8bx2(lookup_vals);
        GenoarrLookup16x8bx2(genovec_buf, lookup_vals, sample_ct, &(normed_dosages0[sample_ct]));
      } else {
        // Rare-allele rows start at the 0-copy value everywhere; carriers are
        // patched in below.
        // NOTE(review): missing genotypes also get 'intercept' here rather
        // than 0.0 as in the REF/ALT1 rows -- confirm this matches intended
        // missing handling upstream.
        double* normed_dosages_cur_allele = &(normed_dosages0[allele_idx * S_CAST(uintptr_t, sample_ct)]);
        for (uint32_t uii = 0; uii != sample_ct; ++uii) {
          normed_dosages_cur_allele[uii] = intercept;
        }
      }
    }
    // patch_01: het REF/ALTx with x >= 2.  patch_10: ALTx/ALTy genotypes that
    // are not hom-ALT1.
    const uintptr_t* patch_01_set = pgvp->patch_01_set;
    const AlleleCode* patch_01_vals = pgvp->patch_01_vals;
    const uintptr_t* patch_10_set = pgvp->patch_10_set;
    const AlleleCode* patch_10_vals = pgvp->patch_10_vals;
    const uint32_t patch_01_ct = pgvp->patch_01_ct;
    const uint32_t patch_10_ct = pgvp->patch_10_ct;
    if ((allele_idx_start < 2) && (allele_idx_end >= 2)) {
      // ALT1 row is inside the window, so its over-counted values must be
      // corrected in addition to filling the rare-allele rows.
      if (patch_01_ct) {
        uintptr_t sample_idx_base = 0;
        uintptr_t cur_bits = patch_01_set[0];
        if (cur_allele_ct == allele_idx_end) {
          // Full window: no bounds checks on allele codes needed.
          for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
            const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
            normed_dosages0[sample_ct + sample_idx] = alt1_intercept;
            const uintptr_t cur_allele_code = patch_01_vals[uii];
            normed_dosages0[cur_allele_code * sample_ct + sample_idx] = allele_1copy_buf[cur_allele_code];
          }
        } else {
          for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
            const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
            normed_dosages0[sample_ct + sample_idx] = alt1_intercept;
            const uintptr_t cur_allele_code = patch_01_vals[uii];
            if (cur_allele_code < allele_idx_end) {
              normed_dosages0[cur_allele_code * sample_ct + sample_idx] = allele_1copy_buf[cur_allele_code];
            }
          }
        }
      }
      if (patch_10_ct) {
        uintptr_t sample_idx_base = 0;
        uintptr_t cur_bits = patch_10_set[0];
        if (cur_allele_ct == allele_idx_end) {
          for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
            const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits);
            // Undo the 2-copies-of-ALT1 assumption; if ac0 == 1, the correct
            // 1-copy value is rewritten just below.
            normed_dosages0[sample_ct + sample_idx] = alt1_intercept;
            const uintptr_t ac0 = patch_10_vals[2 * uii];
            const uintptr_t ac1 = patch_10_vals[2 * uii + 1];
            const double ac0_1copy_val = allele_1copy_buf[ac0];
            if (ac0 == ac1) {
              // Homozygous rare allele: 2-copy value.
              normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val + inv_stdev;
            } else {
              normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val;
              normed_dosages0[ac1 * sample_ct + sample_idx] = allele_1copy_buf[ac1];
            }
          }
        } else {
          for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
            const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits);
            normed_dosages0[sample_ct + sample_idx] = alt1_intercept;
            const uintptr_t ac0 = patch_10_vals[2 * uii];
            // Allele codes are stored in nondecreasing order, so ac1 >= ac0.
            if (ac0 >= allele_idx_end) {
              continue;
            }
            const uintptr_t ac1 = patch_10_vals[2 * uii + 1];
            const double ac0_1copy_val = allele_1copy_buf[ac0];
            if (ac0 == ac1) {
              normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val + inv_stdev;
            } else {
              normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val;
              if (ac1 < allele_idx_end) {
                normed_dosages0[ac1 * sample_ct + sample_idx] = allele_1copy_buf[ac1];
              }
            }
          }
        }
      }
    } else {
      // Window contains only rare-allele (>= ALT2) rows, or only REF/ALT1;
      // patch entries are applied with full range checks.
      if (patch_01_ct) {
        uintptr_t sample_idx_base = 0;
        uintptr_t cur_bits = patch_01_set[0];
        for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
          const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
          const uintptr_t cur_allele_code = patch_01_vals[uii];
          if ((cur_allele_code >= allele_idx_start) && (cur_allele_code < allele_idx_end)) {
            normed_dosages0[cur_allele_code * sample_ct + sample_idx] = allele_1copy_buf[cur_allele_code];
          }
        }
      }
      if (patch_10_ct) {
        uintptr_t sample_idx_base = 0;
        uintptr_t cur_bits = patch_10_set[0];
        for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
          const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits);
          const uintptr_t ac0 = patch_10_vals[2 * uii];
          if (ac0 >= allele_idx_end) {
            continue;
          }
          const uintptr_t ac1 = patch_10_vals[2 * uii + 1];
          if (ac1 < allele_idx_start) {
            continue;
          }
          const double ac0_1copy_val = allele_1copy_buf[ac0];
          if (ac0 == ac1) {
            normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val + inv_stdev;
          } else {
            if (ac0 >= allele_idx_start) {
              normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val;
            }
            if (ac1 < allele_idx_end) {
              normed_dosages0[ac1 * sample_ct + sample_idx] = allele_1copy_buf[ac1];
            }
          }
        }
      }
    }
    return kPglRetSuccess;
  }
  // Multiallelic dosages require a different code path which hasn't been
  // written yet; fail hard rather than return wrong results.
  fputs("true multiallelic dosages not yet supported by LoadMultiallelicCenteredVarmaj()\n", stderr);
  exit(S_CAST(int32_t, kPglRetNotYetSupported));
  return kPglRetSuccess;
}
3577
// Fills one variant-major block of up to *cur_batch_sizep centered/
// standardized allele rows, advancing a resumable cursor over the selected
// variants.  Biallelic variants contribute one row; a multiallelic variant
// contributes cur_allele_ct rows and may be split across consecutive calls
// (*incomplete_allele_idxp records where to resume).
// In/out cursor state: *variant_idxp, *variant_uidxp, *allele_idx_basep,
// *cur_allele_ctp, *incomplete_allele_idxp.  On the final block,
// *cur_batch_sizep is reduced to the number of rows actually written.
// If variant_include_has_missing is non-null, bits are set for variants with
// at least one missing genotype.
PglErr LoadCenteredVarmajBlock(const uintptr_t* sample_include, PgrSampleSubsetIndex pssi, const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, const double* allele_freqs, uint32_t variance_standardize, uint32_t is_haploid, uint32_t sample_ct, uint32_t variant_ct, PgenReader* simple_pgrp, double* normed_vmaj_iter, uintptr_t* variant_include_has_missing, uint32_t* cur_batch_sizep, uint32_t* variant_idxp, uintptr_t* variant_uidxp, uintptr_t* allele_idx_basep, uint32_t* cur_allele_ctp, uint32_t* incomplete_allele_idxp, PgenVariant* pgvp, double* allele_1copy_buf) {
  const uint32_t std_batch_size = *cur_batch_sizep;
  uint32_t variant_idx = *variant_idxp;
  uintptr_t variant_uidx = *variant_uidxp;
  uintptr_t allele_idx_base = *allele_idx_basep;
  uint32_t cur_allele_ct = *cur_allele_ctp;
  uint32_t incomplete_allele_idx = *incomplete_allele_idxp;
  uintptr_t variant_uidx_base;
  uintptr_t cur_bits;
  // If we're resuming mid-variant, keep processing variant_uidx itself and
  // start the bit-iterator just past it.
  BitIter1Start(variant_include, variant_uidx + (incomplete_allele_idx != 0), &variant_uidx_base, &cur_bits);
  for (uint32_t allele_bidx = 0; allele_bidx != std_batch_size; ) {
    uint32_t missing_present = 0;
    if (!incomplete_allele_idx) {
      variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
      if (!allele_idx_offsets) {
        // All-biallelic file: one stored frequency per variant.
        allele_idx_base = variant_uidx;
      } else {
        allele_idx_base = allele_idx_offsets[variant_uidx];
        cur_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_base;
        // allele_freqs[] stores (allele_ct - 1) entries per variant, so
        // subtract the variant index to convert the offset.
        allele_idx_base -= variant_uidx;
      }
    }
    uint32_t allele_idx_stop;
    uint32_t allele_idx_end;
    PglErr reterr;
    if (cur_allele_ct == 2) {
      // Biallelic: a single combined row (the two per-allele rows would just
      // be sign-flips of each other).
      allele_idx_stop = 1;
      allele_idx_end = 1;
      reterr = LoadBiallelicCenteredVarmaj(sample_include, pssi, variance_standardize, is_haploid, sample_ct, variant_uidx, allele_freqs[allele_idx_base], simple_pgrp, variant_include_has_missing? (&missing_present) : nullptr, normed_vmaj_iter, pgvp->genovec, pgvp->dosage_present, pgvp->dosage_main);
    } else {
      allele_idx_end = cur_allele_ct;
      // Clip to the remaining room in this batch; the variant resumes next
      // call if it doesn't fit.
      allele_idx_stop = std_batch_size + incomplete_allele_idx - allele_bidx;
      if (allele_idx_stop > allele_idx_end) {
        allele_idx_stop = allele_idx_end;
      }
      reterr = LoadMultiallelicCenteredVarmaj(sample_include, pssi, &(allele_freqs[allele_idx_base]), variance_standardize, is_haploid, sample_ct, variant_uidx, cur_allele_ct, incomplete_allele_idx, allele_idx_stop, simple_pgrp, variant_include_has_missing? (&missing_present) : nullptr, normed_vmaj_iter, pgvp, allele_1copy_buf);
    }
    if (unlikely(reterr)) {
      if (reterr == kPglRetDegenerateData) {
        logputs("\n");
        logerrputs("Error: Zero-MAF variant is not actually monomorphic. (This is possible when\ne.g. MAF is estimated from founders, but the minor allele was only observed in\nnonfounders. In any case, you should be using e.g. --maf to filter out all\nvery-low-MAF variants, since the relationship matrix distance formula does not\nhandle them well.)\n");
      }
      return reterr;
    }
    if (missing_present) {
      SetBit(variant_uidx, variant_include_has_missing);
    }
    const uintptr_t incr = allele_idx_stop - incomplete_allele_idx;
    normed_vmaj_iter = &(normed_vmaj_iter[incr * sample_ct]);
    allele_bidx += incr;
    if (allele_idx_stop == allele_idx_end) {
      // Finished this variant.
      if (++variant_idx == variant_ct) {
        // Last (possibly short) batch: report how many rows were written.
        *cur_batch_sizep = allele_bidx;
        break;
      }
      incomplete_allele_idx = 0;
    } else {
      // Batch filled up mid-variant; resume here next call.
      incomplete_allele_idx = allele_idx_stop;
    }
  }
  *variant_idxp = variant_idx;
  // When the variant completed, store uidx+1 so the next call's iterator
  // starts past it; when incomplete, store uidx itself to reprocess it.
  *variant_uidxp = variant_uidx + (incomplete_allele_idx == 0);
  *allele_idx_basep = allele_idx_base;
  *cur_allele_ctp = cur_allele_ct;
  *incomplete_allele_idxp = incomplete_allele_idx;
  return kPglRetSuccess;
}
3645
// Number of normed-dosage rows processed per (double-buffered) GRM batch.
CONSTI32(kGrmVariantBlockSize, 144);

// Shared state for the GRM accumulation worker thread(s).
typedef struct CalcGrmPartCtxStruct {
  // thread_start[tidx]..thread_start[tidx+1] is the GRM row range owned by
  // thread tidx (partitioned mode); see CalcGrmPartThread().
  uint32_t* thread_start;
  uint32_t sample_ct;

  // Rows in the batch currently handed to the workers; 0 = nothing to do.
  uint32_t cur_batch_size;
  // Double buffers: the main thread fills one parity while workers consume
  // the other.
  double* normed_dosage_vmaj_bufs[2];
  // Sample-major copies; only allocated in the partitioned (multi-thread /
  // --parallel) case.
  double* normed_dosage_smaj_bufs[2];

  double* grm;
} CalcGrmPartCtx;
3658
3659 // turns out dsyrk_ does exactly what we want here
CalcGrmThread(void * raw_arg)3660 THREAD_FUNC_DECL CalcGrmThread(void* raw_arg) {
3661 ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
3662 assert(!arg->tidx);
3663 CalcGrmPartCtx* ctx = S_CAST(CalcGrmPartCtx*, arg->sharedp->context);
3664 const uint32_t sample_ct = ctx->sample_ct;
3665 double* grm = ctx->grm;
3666 uint32_t parity = 0;
3667 do {
3668 const uint32_t cur_batch_size = ctx->cur_batch_size;
3669 if (cur_batch_size) {
3670 TransposeMultiplySelfIncr(ctx->normed_dosage_vmaj_bufs[parity], sample_ct, cur_batch_size, grm);
3671 }
3672 parity = 1 - parity;
3673 } while (!THREAD_BLOCK_FINISH(arg));
3674 THREAD_RETURN;
3675 }
3676
3677 // can't use dsyrk_, so we manually partition the GRM piece we need to compute
3678 // into an appropriate number of sub-pieces
CalcGrmPartThread(void * raw_arg)3679 THREAD_FUNC_DECL CalcGrmPartThread(void* raw_arg) {
3680 ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
3681 const uintptr_t tidx = arg->tidx;
3682 CalcGrmPartCtx* ctx = S_CAST(CalcGrmPartCtx*, arg->sharedp->context);
3683
3684 const uintptr_t sample_ct = ctx->sample_ct;
3685 const uintptr_t first_thread_row_start_idx = ctx->thread_start[0];
3686 const uintptr_t row_start_idx = ctx->thread_start[tidx];
3687 const uintptr_t row_ct = ctx->thread_start[tidx + 1] - row_start_idx;
3688 double* grm_piece = &(ctx->grm[(row_start_idx - first_thread_row_start_idx) * sample_ct]);
3689 uint32_t parity = 0;
3690 do {
3691 const uintptr_t cur_batch_size = ctx->cur_batch_size;
3692 if (cur_batch_size) {
3693 double* normed_vmaj = ctx->normed_dosage_vmaj_bufs[parity];
3694 double* normed_smaj = ctx->normed_dosage_smaj_bufs[parity];
3695 RowMajorMatrixMultiplyIncr(&(normed_smaj[row_start_idx * cur_batch_size]), normed_vmaj, row_ct, sample_ct, cur_batch_size, grm_piece);
3696 }
3697 parity = 1 - parity;
3698 } while (!THREAD_BLOCK_FINISH(arg));
3699 THREAD_RETURN;
3700 }
3701
3702 CONSTI32(kDblMissingBlockWordCt, 2);
3703 CONSTI32(kDblMissingBlockSize, kDblMissingBlockWordCt * kBitsPerWord);
3704
3705 typedef struct CalcDblMissingCtxStruct {
3706 uint32_t* thread_start;
3707 // missing_nz bit is set iff that sample has at least one missing entry in
3708 // current block
3709 uintptr_t* missing_nz[2];
3710 uintptr_t* missing_smaj[2];
3711 uint32_t* missing_dbl_exclude_cts;
3712 } CalcDblMissingCtx;
3713
// Worker: for each pair of samples in this thread's row range, accumulates
// the number of variants (in the current block) where BOTH samples are
// missing, into the lower-triangular missing_dbl_exclude_cts[] array.
// Only samples with at least one missing entry in the block (missing_nz)
// are visited.
THREAD_FUNC_DECL CalcDblMissingThread(void* raw_arg) {
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  const uintptr_t tidx = arg->tidx;
  CalcDblMissingCtx* ctx = S_CAST(CalcDblMissingCtx*, arg->sharedp->context);

  // missing_dbl_exclude_cts[] storage starts at the triangle offset of the
  // first row handled by thread 0.
  const uint64_t first_thread_row_start_idx = ctx->thread_start[0];
  const uint64_t dbl_exclude_offset = (first_thread_row_start_idx * (first_thread_row_start_idx - 1)) / 2;
  const uint32_t row_start_idx = ctx->thread_start[tidx];
  const uintptr_t row_end_idx = ctx->thread_start[tidx + 1];
  uint32_t* missing_dbl_exclude_cts = ctx->missing_dbl_exclude_cts;
  uint32_t parity = 0;
  do {
    const uintptr_t* missing_nz = ctx->missing_nz[parity];
    const uintptr_t* missing_smaj = ctx->missing_smaj[parity];
    // First sample (anywhere) with missingness; column iteration always
    // starts here.
    const uint32_t first_idx = AdvBoundedTo1Bit(missing_nz, 0, row_end_idx);
    uint32_t sample_idx = first_idx;
    // Number of missing-nz samples strictly before sample_idx, i.e. the
    // number of column-partners to visit for the current row.
    uint32_t prev_missing_nz_ct = 0;
    if (sample_idx < row_start_idx) {
      // Skip rows below this thread's range, but count how many missing-nz
      // samples were skipped (they remain column partners).
      sample_idx = AdvBoundedTo1Bit(missing_nz, row_start_idx, row_end_idx);
      if (sample_idx != row_end_idx) {
        prev_missing_nz_ct = PopcountBitRange(missing_nz, 0, row_start_idx);
      }
    }
    while (sample_idx < row_end_idx) {
      // todo: compare this explicit unroll with ordinary iteration over a
      // cur_words[] array
      // todo: try 1 word at a time, and 30 words at a time
      const uintptr_t cur_word0 = missing_smaj[sample_idx * kDblMissingBlockWordCt];
      const uintptr_t cur_word1 = missing_smaj[sample_idx * kDblMissingBlockWordCt + 1];
#ifndef __LP64__
      // 32-bit words: 4 words cover the same 128-variant block.
      const uintptr_t cur_word2 = missing_smaj[sample_idx * kDblMissingBlockWordCt + 2];
      const uintptr_t cur_word3 = missing_smaj[sample_idx * kDblMissingBlockWordCt + 3];
#endif
      uintptr_t sample_idx2_base;
      uintptr_t cur_bits;
      BitIter1Start(missing_nz, first_idx, &sample_idx2_base, &cur_bits);
      // (sample_idx - 1) underflow ok
      // Row sample_idx's entries live at triangle offset
      // sample_idx*(sample_idx-1)/2, rebased by dbl_exclude_offset.
      uint32_t* write_base = &(missing_dbl_exclude_cts[((S_CAST(uint64_t, sample_idx) * (sample_idx - 1)) / 2) - dbl_exclude_offset]);
      for (uint32_t uii = 0; uii != prev_missing_nz_ct; ++uii) {
        const uint32_t sample_idx2 = BitIter1(missing_nz, &sample_idx2_base, &cur_bits);
        const uintptr_t* cur_missing_smaj_base = &(missing_smaj[sample_idx2 * kDblMissingBlockWordCt]);
        // Bits set in the AND are variants missing in BOTH samples.
        const uintptr_t cur_and0 = cur_word0 & cur_missing_smaj_base[0];
        const uintptr_t cur_and1 = cur_word1 & cur_missing_smaj_base[1];
#ifdef __LP64__
        if (cur_and0 || cur_and1) {
          write_base[sample_idx2] += Popcount2Words(cur_and0, cur_and1);
        }
#else
        const uintptr_t cur_and2 = cur_word2 & cur_missing_smaj_base[2];
        const uintptr_t cur_and3 = cur_word3 & cur_missing_smaj_base[3];
        if (cur_and0 || cur_and1 || cur_and2 || cur_and3) {
          write_base[sample_idx2] += Popcount4Words(cur_and0, cur_and1, cur_and2, cur_and3);
        }
#endif
      }
      // The row we just finished becomes a column partner for later rows.
      ++prev_missing_nz_ct;
      sample_idx = AdvBoundedTo1Bit(missing_nz, sample_idx + 1, row_end_idx);
    }
    parity = 1 - parity;
  } while (!THREAD_BLOCK_FINISH(arg));
  THREAD_RETURN;
}
3776
// Computes, for missingness correction of the GRM:
// - (*missing_cts_ptr)[i]: number of selected variants where sample i is
//   missing;
// - (*missing_dbl_exclude_cts_ptr): lower-triangular array of pairwise
//   both-missing counts for rows [row_start_idx, row_end_idx).
// Both arrays are allocated on the bigstack and returned to the caller;
// everything else allocated here is freed on exit.
// Pipeline per 128-variant block: load variant-major missingness bitvectors,
// transpose to sample-major, compute per-sample counts and the missing_nz
// index on the main thread, then hand the block to CalcDblMissingThread()
// workers (double-buffered) for the pairwise counts.
PglErr CalcMissingMatrix(const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, const uintptr_t* variant_include, uint32_t sample_ct, uint32_t variant_ct, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t row_start_idx, uintptr_t row_end_idx, uint32_t max_thread_ct, PgenReader* simple_pgrp, uint32_t** missing_cts_ptr, uint32_t** missing_dbl_exclude_cts_ptr) {
  unsigned char* bigstack_mark = g_bigstack_base;
  ThreadGroup tg;
  PreinitThreads(&tg);
  PglErr reterr = kPglRetSuccess;
  {
    const uintptr_t row_end_idxl = BitCtToWordCt(row_end_idx);
    // bugfix (1 Oct 2017): missing_vmaj rows must be vector-aligned
    const uintptr_t row_end_idxaw = BitCtToAlignedWordCt(row_end_idx);
    uintptr_t* missing_vmaj = nullptr;
    uintptr_t* genovec_buf = nullptr;
    CalcDblMissingCtx ctx;
    // Pairwise-count allocation covers triangle rows
    // [row_start_idx, row_end_idx); the subtraction rebases to that window.
    if (bigstack_calloc_u32(row_end_idx, missing_cts_ptr) ||
        bigstack_calloc_u32((S_CAST(uint64_t, row_end_idx) * (row_end_idx - 1) - S_CAST(uint64_t, row_start_idx) * (row_start_idx - 1)) / 2, missing_dbl_exclude_cts_ptr) ||
        bigstack_calloc_w(row_end_idxl, &ctx.missing_nz[0]) ||
        bigstack_calloc_w(row_end_idxl, &ctx.missing_nz[1]) ||
        bigstack_alloc_w(NypCtToWordCt(row_end_idx), &genovec_buf) ||
        bigstack_alloc_w(row_end_idxaw * (k1LU * kDblMissingBlockSize), &missing_vmaj) ||
        bigstack_alloc_w(RoundUpPow2(row_end_idx, 2) * kDblMissingBlockWordCt, &ctx.missing_smaj[0]) ||
        bigstack_alloc_w(RoundUpPow2(row_end_idx, 2) * kDblMissingBlockWordCt, &ctx.missing_smaj[1])) {
      goto CalcMissingMatrix_ret_NOMEM;
    }
    uint32_t* missing_cts = *missing_cts_ptr;
    uint32_t* missing_dbl_exclude_cts = *missing_dbl_exclude_cts_ptr;
    ctx.missing_dbl_exclude_cts = missing_dbl_exclude_cts;
    VecW* transpose_bitblock_wkspace = S_CAST(VecW*, bigstack_alloc_raw(kPglBitTransposeBufbytes));
    // Reserve one thread for the main (load/transpose) loop when plenty are
    // available.
    uint32_t calc_thread_ct = (max_thread_ct > 8)? (max_thread_ct - 1) : max_thread_ct;
    if (unlikely(
            SetThreadCt(calc_thread_ct, &tg) ||
            bigstack_alloc_u32(calc_thread_ct + 1, &ctx.thread_start))) {
      goto CalcMissingMatrix_ret_NOMEM;
    }
    // note that this ctx.thread_start[] may have different values than the one
    // computed by CalcGrm(), since calc_thread_ct changes in the MTBLAS and
    // OS X cases.
    TriangleFill(sample_ct, calc_thread_ct, parallel_idx, parallel_tot, 0, 1, ctx.thread_start);
    assert(ctx.thread_start[0] == row_start_idx);
    assert(ctx.thread_start[calc_thread_ct] == row_end_idx);
    SetThreadFuncAndData(CalcDblMissingThread, &ctx, &tg);
    const uint32_t sample_transpose_batch_ct_m1 = (row_end_idx - 1) / kPglBitTransposeBatch;

    uintptr_t variant_uidx_base = 0;
    uintptr_t cur_bits = variant_include[0];
    uint32_t parity = 0;
    uint32_t pct = 0;
    uint32_t next_print_variant_idx = variant_ct / 100;
    // caller's responsibility to print this
    // logputs("Correcting for missingness: ");
    fputs("0%", stdout);
    fflush(stdout);
    PgrSampleSubsetIndex pssi;
    PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, simple_pgrp, &pssi);
    for (uint32_t cur_variant_idx_start = 0; ; ) {
      uint32_t cur_batch_size = 0;
      if (!IsLastBlock(&tg)) {
        // Stage the next block while the workers process the previous one.
        cur_batch_size = kDblMissingBlockSize;
        uint32_t cur_variant_idx_end = cur_variant_idx_start + cur_batch_size;
        if (cur_variant_idx_end > variant_ct) {
          cur_batch_size = variant_ct - cur_variant_idx_start;
          cur_variant_idx_end = variant_ct;
          // Zero-pad the final partial block so the transpose below stays
          // uniform.
          ZeroWArr((kDblMissingBlockSize - cur_batch_size) * row_end_idxaw, &(missing_vmaj[cur_batch_size * row_end_idxaw]));
        }
        uintptr_t* missing_vmaj_iter = missing_vmaj;
        // Load one missingness bitvector (over the first row_end_idx
        // samples) per variant.
        for (uint32_t variant_idx = cur_variant_idx_start; variant_idx != cur_variant_idx_end; ++variant_idx) {
          const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
          reterr = PgrGetMissingnessD(sample_include, pssi, row_end_idx, variant_uidx, simple_pgrp, nullptr, missing_vmaj_iter, nullptr, genovec_buf);
          if (unlikely(reterr)) {
            goto CalcMissingMatrix_ret_PGR_FAIL;
          }
          missing_vmaj_iter = &(missing_vmaj_iter[row_end_idxaw]);
        }
        uintptr_t* cur_missing_smaj_iter = ctx.missing_smaj[parity];
        uint32_t sample_batch_size = kPglBitTransposeBatch;
        // Transpose variant-major -> sample-major, kPglBitTransposeBatch
        // samples at a time.
        for (uint32_t sample_transpose_batch_idx = 0; ; ++sample_transpose_batch_idx) {
          if (sample_transpose_batch_idx >= sample_transpose_batch_ct_m1) {
            if (sample_transpose_batch_idx > sample_transpose_batch_ct_m1) {
              break;
            }
            sample_batch_size = ModNz(row_end_idx, kPglBitTransposeBatch);
          }
          // missing_smaj offset needs to be 64-bit if kDblMissingBlockWordCt
          // increases
          TransposeBitblock(&(missing_vmaj[sample_transpose_batch_idx * kPglBitTransposeWords]), row_end_idxaw, kDblMissingBlockWordCt, kDblMissingBlockSize, sample_batch_size, &(cur_missing_smaj_iter[sample_transpose_batch_idx * kPglBitTransposeBatch * kDblMissingBlockWordCt]), transpose_bitblock_wkspace);
        }
        uintptr_t* cur_missing_nz = ctx.missing_nz[parity];
        ZeroWArr(row_end_idxl, cur_missing_nz);
        // Per-sample missing counts, plus the missing_nz index that lets the
        // workers skip samples with no missingness in this block.
        for (uint32_t sample_idx = 0; sample_idx != row_end_idx; ++sample_idx) {
          const uintptr_t cur_word0 = *cur_missing_smaj_iter++;
          const uintptr_t cur_word1 = *cur_missing_smaj_iter++;
#ifdef __LP64__
          if (cur_word0 || cur_word1) {
            SetBit(sample_idx, cur_missing_nz);
            missing_cts[sample_idx] += Popcount2Words(cur_word0, cur_word1);
          }
#else
          const uintptr_t cur_word2 = *cur_missing_smaj_iter++;
          const uintptr_t cur_word3 = *cur_missing_smaj_iter++;
          if (cur_word0 || cur_word1 || cur_word2 || cur_word3) {
            SetBit(sample_idx, cur_missing_nz);
            missing_cts[sample_idx] += Popcount4Words(cur_word0, cur_word1, cur_word2, cur_word3);
          }
#endif
        }
      }
      if (cur_variant_idx_start) {
        JoinThreads(&tg);
        // CalcDblMissingThread() never errors out
        if (IsLastBlock(&tg)) {
          break;
        }
        if (cur_variant_idx_start >= next_print_variant_idx) {
          // Progress display: percentages >= 10 occupy an extra column.
          if (pct > 10) {
            putc_unlocked('\b', stdout);
          }
          pct = (cur_variant_idx_start * 100LLU) / variant_ct;
          printf("\b\b%u%%", pct++);
          fflush(stdout);
          next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
        }
      }
      if (cur_variant_idx_start + cur_batch_size == variant_ct) {
        DeclareLastThreadBlock(&tg);
      }
      if (unlikely(SpawnThreads(&tg))) {
        goto CalcMissingMatrix_ret_THREAD_CREATE_FAIL;
      }
      cur_variant_idx_start += cur_batch_size;
      parity = 1 - parity;
    }
    if (pct > 10) {
      putc_unlocked('\b', stdout);
    }
    fputs("\b\b", stdout);
    logputs("done.\n");
    // Keep only the two result arrays (allocated first); free everything
    // from ctx.missing_nz[0] up.
    bigstack_mark = R_CAST(unsigned char*, ctx.missing_nz[0]);
  }
  while (0) {
  CalcMissingMatrix_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  CalcMissingMatrix_ret_PGR_FAIL:
    PgenErrPrintN(reterr);
    break;
  CalcMissingMatrix_ret_THREAD_CREATE_FAIL:
    reterr = kPglRetThreadCreateFail;
    break;
  }
  CleanupThreads(&tg);
  BigstackReset(bigstack_mark);
  return reterr;
}
3928
CalcGrm(const uintptr_t * orig_sample_include,const SampleIdInfo * siip,const uintptr_t * variant_include,const ChrInfo * cip,const uintptr_t * allele_idx_offsets,const double * allele_freqs,uint32_t raw_sample_ct,uint32_t sample_ct,uint32_t raw_variant_ct,uint32_t variant_ct,uint32_t max_allele_ct,GrmFlags grm_flags,uint32_t parallel_idx,uint32_t parallel_tot,uint32_t max_thread_ct,PgenReader * simple_pgrp,char * outname,char * outname_end,double ** grm_ptr)3929 PglErr CalcGrm(const uintptr_t* orig_sample_include, const SampleIdInfo* siip, const uintptr_t* variant_include, const ChrInfo* cip, const uintptr_t* allele_idx_offsets, const double* allele_freqs, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, GrmFlags grm_flags, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t max_thread_ct, PgenReader* simple_pgrp, char* outname, char* outname_end, double** grm_ptr) {
3930 unsigned char* bigstack_mark = g_bigstack_base;
3931 unsigned char* bigstack_end_mark = g_bigstack_end;
3932 FILE* outfile = nullptr;
3933 char* cswritep = nullptr;
3934 CompressStreamState css;
3935 ThreadGroup tg;
3936 PglErr reterr = kPglRetSuccess;
3937 PreinitCstream(&css);
3938 PreinitThreads(&tg);
3939 {
3940 assert(variant_ct);
3941 #if defined(__APPLE__) || defined(USE_MTBLAS)
3942 uint32_t calc_thread_ct = 1;
3943 #else
3944 uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
3945 if (calc_thread_ct * parallel_tot > sample_ct / 32) {
3946 calc_thread_ct = sample_ct / (32 * parallel_tot);
3947 if (!calc_thread_ct) {
3948 calc_thread_ct = 1;
3949 }
3950 }
3951 #endif
3952 if (unlikely(sample_ct < 2)) {
3953 logerrputs("Error: GRM construction requires at least two samples.\n");
3954 goto CalcGrm_ret_DEGENERATE_DATA;
3955 }
3956 const uintptr_t* sample_include = orig_sample_include;
3957 const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
3958 uint32_t row_start_idx = 0;
3959 uintptr_t row_end_idx = sample_ct;
3960 uint32_t* thread_start = nullptr;
3961 if ((calc_thread_ct != 1) || (parallel_tot != 1)) {
3962 // note that grm should be allocated on bottom if no --parallel, since it
3963 // may continue to be used after function exit. So we allocate this on
3964 // top.
3965 if (unlikely(bigstack_end_alloc_u32(calc_thread_ct + 1, &thread_start))) {
3966 goto CalcGrm_ret_NOMEM;
3967 }
3968 // slightly different from plink 1.9 since we don't bother to treat the
3969 // diagonal as a special case any more.
3970 TriangleFill(sample_ct, calc_thread_ct, parallel_idx, parallel_tot, 0, 1, thread_start);
3971 row_start_idx = thread_start[0];
3972 row_end_idx = thread_start[calc_thread_ct];
3973 if (row_end_idx < sample_ct) {
3974 // 0
3975 // 0 0
3976 // 0 0 0
3977 // 0 0 0 0
3978 // 1 1 1 1 1
3979 // 1 1 1 1 1 1
3980 // 2 2 2 2 2 2 2
3981 // 2 2 2 2 2 2 2 2
3982 // If we're computing part 0, we never need to load the last 4 samples;
3983 // if part 1, we don't need the last two; etc.
3984 uintptr_t* new_sample_include;
3985 if (unlikely(bigstack_alloc_w(raw_sample_ctl, &new_sample_include))) {
3986 goto CalcGrm_ret_NOMEM;
3987 }
3988 const uint32_t sample_uidx_end = 1 + IdxToUidxBasic(orig_sample_include, row_end_idx - 1);
3989 memcpy(new_sample_include, orig_sample_include, RoundUpPow2(sample_uidx_end, kBitsPerWord) / CHAR_BIT);
3990 ClearBitsNz(sample_uidx_end, raw_sample_ctl * kBitsPerWord, new_sample_include);
3991 sample_include = new_sample_include;
3992 }
3993 if ((!parallel_idx) && (calc_thread_ct == 1)) {
3994 thread_start = nullptr;
3995 }
3996 }
3997
3998 CalcGrmPartCtx ctx;
3999 ctx.thread_start = thread_start;
4000 double* grm;
4001 if (unlikely(
4002 SetThreadCt(calc_thread_ct, &tg))) {
4003 goto CalcGrm_ret_NOMEM;
4004 }
4005 if (unlikely(
4006 bigstack_calloc_d((row_end_idx - row_start_idx) * row_end_idx, &grm))) {
4007 if (!grm_ptr) {
4008 logerrputs("Error: Out of memory. If you are SURE you are performing the right matrix\ncomputation, you can split it into smaller pieces with --parallel, and then\nconcatenate the results. But before you try this, make sure the program you're\nproviding the matrix to can actually handle such a large input file.\n");
4009 } else {
4010 // Need to edit this if there are ever non-PCA ways to get here.
4011 if (!(grm_flags & (kfGrmMatrixShapemask | kfGrmListmask | kfGrmBin))) {
4012 logerrputs("Error: Out of memory. Consider \"--pca approx\" instead.\n");
4013 } else {
4014 logerrputs("Error: Out of memory. Consider \"--pca approx\" (and not writing the GRM to\ndisk) instead.\n");
4015 }
4016 }
4017 goto CalcGrm_ret_NOMEM_CUSTOM;
4018 }
4019 ctx.sample_ct = row_end_idx;
4020 ctx.grm = grm;
4021 uint32_t* sample_include_cumulative_popcounts;
4022 PgenVariant pgv;
4023 double* allele_1copy_buf;
4024 if (unlikely(
4025 bigstack_alloc_u32(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
4026 BigstackAllocPgv(row_end_idx, allele_idx_offsets != nullptr, PgrGetGflags(simple_pgrp), &pgv) ||
4027 bigstack_alloc_d(max_allele_ct, &allele_1copy_buf))) {
4028 goto CalcGrm_ret_NOMEM;
4029 }
4030 FillCumulativePopcounts(sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
4031 reterr = ConditionalAllocateNonAutosomalVariants(cip, "GRM construction", raw_variant_ct, &variant_include, &variant_ct);
4032 if (unlikely(reterr)) {
4033 goto CalcGrm_ret_1;
4034 }
4035 if (unlikely(
4036 bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &ctx.normed_dosage_vmaj_bufs[0]) ||
4037 bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &ctx.normed_dosage_vmaj_bufs[1]))) {
4038 goto CalcGrm_ret_NOMEM;
4039 }
4040 const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
4041 uintptr_t* variant_include_has_missing = nullptr;
4042 if (!(grm_flags & kfGrmMeanimpute)) {
4043 if (unlikely(bigstack_calloc_w(raw_variant_ctl, &variant_include_has_missing))) {
4044 goto CalcGrm_ret_NOMEM;
4045 }
4046 }
4047 if (thread_start) {
4048 if (unlikely(
4049 bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &ctx.normed_dosage_smaj_bufs[0]) ||
4050 bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &ctx.normed_dosage_smaj_bufs[1]))) {
4051 goto CalcGrm_ret_NOMEM;
4052 }
4053 SetThreadFuncAndData(CalcGrmPartThread, &ctx, &tg);
4054 } else {
4055 // defensive
4056 ctx.normed_dosage_smaj_bufs[0] = nullptr;
4057 ctx.normed_dosage_smaj_bufs[1] = nullptr;
4058 SetThreadFuncAndData(CalcGrmThread, &ctx, &tg);
4059 }
4060 #ifdef USE_MTBLAS
4061 const uint32_t blas_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
4062 BLAS_SET_NUM_THREADS(blas_thread_ct);
4063 #endif
4064 // Main workflow:
4065 // 1. Set n=0, load batch 0
4066 //
4067 // 2. Spawn threads processing batch n
4068 // 3. Increment n by 1
4069 // 4. Load batch n unless eof
4070 // 5. Join threads
4071 // 6. Goto step 2 unless eof
4072 const uint32_t variance_standardize = !(grm_flags & kfGrmCov);
4073 const uint32_t is_haploid = cip->haploid_mask[0] & 1;
4074 uint32_t cur_batch_size = kGrmVariantBlockSize;
4075 uint32_t variant_idx_start = 0;
4076 uint32_t variant_idx = 0;
4077 uintptr_t variant_uidx = 0;
4078 uintptr_t allele_idx_base = 0;
4079 uint32_t cur_allele_ct = 2;
4080 uint32_t incomplete_allele_idx = 0;
4081 uint32_t parity = 0;
4082 uint32_t is_not_first_block = 0;
4083 uint32_t pct = 0;
4084 uint32_t next_print_variant_idx = variant_ct / 100;
4085 logputs("Constructing GRM: ");
4086 fputs("0%", stdout);
4087 fflush(stdout);
4088 PgrSampleSubsetIndex pssi;
4089 PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, simple_pgrp, &pssi);
4090 while (1) {
4091 if (!IsLastBlock(&tg)) {
4092 double* normed_vmaj = ctx.normed_dosage_vmaj_bufs[parity];
4093 reterr = LoadCenteredVarmajBlock(sample_include, pssi, variant_include, allele_idx_offsets, allele_freqs, variance_standardize, is_haploid, row_end_idx, variant_ct, simple_pgrp, normed_vmaj, variant_include_has_missing, &cur_batch_size, &variant_idx, &variant_uidx, &allele_idx_base, &cur_allele_ct, &incomplete_allele_idx, &pgv, allele_1copy_buf);
4094 if (unlikely(reterr)) {
4095 goto CalcGrm_ret_PGR_FAIL;
4096 }
4097 if (thread_start) {
4098 MatrixTransposeCopy(normed_vmaj, cur_batch_size, row_end_idx, ctx.normed_dosage_smaj_bufs[parity]);
4099 }
4100 }
4101 if (is_not_first_block) {
4102 JoinThreads(&tg);
4103 // CalcGrmPartThread() and CalcGrmThread() never error out
4104 if (IsLastBlock(&tg)) {
4105 break;
4106 }
4107 if (variant_idx_start >= next_print_variant_idx) {
4108 if (pct > 10) {
4109 putc_unlocked('\b', stdout);
4110 }
4111 pct = (variant_idx_start * 100LLU) / variant_ct;
4112 printf("\b\b%u%%", pct++);
4113 fflush(stdout);
4114 next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
4115 }
4116 }
4117 ctx.cur_batch_size = cur_batch_size;
4118 if (variant_idx == variant_ct) {
4119 DeclareLastThreadBlock(&tg);
4120 cur_batch_size = 0;
4121 }
4122 if (unlikely(SpawnThreads(&tg))) {
4123 goto CalcGrm_ret_THREAD_CREATE_FAIL;
4124 }
4125 is_not_first_block = 1;
4126 variant_idx_start = variant_idx;
4127 parity = 1 - parity;
4128 }
4129 BLAS_SET_NUM_THREADS(1);
4130 if (pct > 10) {
4131 putc_unlocked('\b', stdout);
4132 }
4133 fputs("\b\b", stdout);
4134 logputs("done.\n");
4135 uint32_t* missing_cts = nullptr; // stays null iff meanimpute
4136 uint32_t* missing_dbl_exclude_cts = nullptr;
4137 if (variant_include_has_missing) {
4138 const uint32_t variant_ct_with_missing = PopcountWords(variant_include_has_missing, raw_variant_ctl);
4139 // if no missing calls at all, act as if meanimpute was on
4140 if (variant_ct_with_missing) {
4141 logputs("Correcting for missingness... ");
4142 reterr = CalcMissingMatrix(sample_include, sample_include_cumulative_popcounts, variant_include_has_missing, sample_ct, variant_ct_with_missing, parallel_idx, parallel_tot, row_start_idx, row_end_idx, max_thread_ct, simple_pgrp, &missing_cts, &missing_dbl_exclude_cts);
4143 if (unlikely(reterr)) {
4144 goto CalcGrm_ret_1;
4145 }
4146 }
4147 }
4148 if (missing_cts) {
4149 // could parallelize this loop if it ever matters
4150 const uint32_t* missing_dbl_exclude_iter = missing_dbl_exclude_cts;
4151 for (uintptr_t row_idx = row_start_idx; row_idx != row_end_idx; ++row_idx) {
4152 const uint32_t variant_ct_base = variant_ct - missing_cts[row_idx];
4153 double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4154 for (uint32_t col_idx = 0; col_idx != row_idx; ++col_idx) {
4155 *grm_iter++ /= u31tod(variant_ct_base - missing_cts[col_idx] + (*missing_dbl_exclude_iter++));
4156 }
4157 *grm_iter++ /= u31tod(variant_ct_base);
4158 }
4159 } else {
4160 const double variant_ct_recip = 1.0 / u31tod(variant_ct);
4161 for (uintptr_t row_idx = row_start_idx; row_idx != row_end_idx; ++row_idx) {
4162 double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4163 for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
4164 *grm_iter++ *= variant_ct_recip;
4165 }
4166 }
4167 }
4168 // N.B. Only the lower right of grm[] is valid when parallel_tot == 1.
4169
4170 // possible todo: allow simultaneous --make-rel and
4171 // --make-grm-list/--make-grm-bin
4172 // (note that this routine may also be called by --pca, which may not write
4173 // a matrix to disk at all.)
4174 if (grm_flags & (kfGrmMatrixShapemask | kfGrmListmask | kfGrmBin)) {
4175 const GrmFlags matrix_shape = grm_flags & kfGrmMatrixShapemask;
4176 char* log_write_iter;
4177 if (matrix_shape) {
4178 // --make-rel
4179 fputs("--make-rel: Writing...", stdout);
4180 fflush(stdout);
4181 if (grm_flags & kfGrmMatrixBin) {
4182 char* outname_end2 = strcpya_k(outname_end, ".rel.bin");
4183 if (parallel_tot != 1) {
4184 *outname_end2++ = '.';
4185 outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4186 }
4187 *outname_end2 = '\0';
4188 if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
4189 goto CalcGrm_ret_OPEN_FAIL;
4190 }
4191 double* write_double_buf = nullptr;
4192 if (matrix_shape == kfGrmMatrixSq0) {
4193 write_double_buf = R_CAST(double*, g_textbuf);
4194 ZeroDArr(kTextbufMainSize / sizeof(double), write_double_buf);
4195 } else if (matrix_shape == kfGrmMatrixSq) {
4196 if (unlikely(bigstack_alloc_d(row_end_idx - row_start_idx - 1, &write_double_buf))) {
4197 goto CalcGrm_ret_NOMEM;
4198 }
4199 }
4200 for (uintptr_t row_idx = row_start_idx; ; ) {
4201 const double* grm_row = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4202 ++row_idx;
4203 if (unlikely(fwrite_checked(grm_row, row_idx * sizeof(double), outfile))) {
4204 goto CalcGrm_ret_WRITE_FAIL;
4205 }
4206 if (row_idx == row_end_idx) {
4207 break;
4208 }
4209 if (matrix_shape == kfGrmMatrixSq0) {
4210 uintptr_t zbytes_to_dump = (sample_ct - row_idx) * sizeof(double);
4211 while (zbytes_to_dump >= kTextbufMainSize) {
4212 if (unlikely(fwrite_checked(write_double_buf, kTextbufMainSize, outfile))) {
4213 goto CalcGrm_ret_WRITE_FAIL;
4214 }
4215 zbytes_to_dump -= kTextbufMainSize;
4216 }
4217 if (zbytes_to_dump) {
4218 if (unlikely(fwrite_checked(write_double_buf, zbytes_to_dump, outfile))) {
4219 goto CalcGrm_ret_WRITE_FAIL;
4220 }
4221 }
4222 } else if (matrix_shape == kfGrmMatrixSq) {
4223 double* write_double_iter = write_double_buf;
4224 const double* grm_col = &(grm[row_idx - 1]);
4225 for (uintptr_t row_idx2 = row_idx; row_idx2 != sample_ct; ++row_idx2) {
4226 *write_double_iter++ = grm_col[(row_idx2 - row_start_idx) * sample_ct];
4227 }
4228 if (unlikely(fwrite_checked(write_double_buf, (sample_ct - row_idx) * sizeof(double), outfile))) {
4229 goto CalcGrm_ret_WRITE_FAIL;
4230 }
4231 }
4232 }
4233 if (unlikely(fclose_null(&outfile))) {
4234 goto CalcGrm_ret_WRITE_FAIL;
4235 }
4236 } else if (grm_flags & kfGrmMatrixBin4) {
4237 // downcode all entries to floats
4238 char* outname_end2 = strcpya_k(outname_end, ".rel.bin");
4239 if (parallel_tot != 1) {
4240 *outname_end2++ = '.';
4241 outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4242 }
4243 *outname_end2 = '\0';
4244 if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
4245 goto CalcGrm_ret_OPEN_FAIL;
4246 }
4247 float* write_float_buf;
4248 if (unlikely(bigstack_alloc_f(row_end_idx, &write_float_buf))) {
4249 goto CalcGrm_ret_NOMEM;
4250 }
4251 uintptr_t row_idx = row_start_idx;
4252 do {
4253 const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4254 float* write_float_iter = write_float_buf;
4255 for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
4256 *write_float_iter++ = S_CAST(float, *grm_iter++);
4257 }
4258 ++row_idx;
4259 if (matrix_shape == kfGrmMatrixSq0) {
4260 ZeroFArr(sample_ct - row_idx, write_float_iter);
4261 write_float_iter = &(write_float_buf[sample_ct]);
4262 } else if (matrix_shape == kfGrmMatrixSq) {
4263 const double* grm_col = &(grm[row_idx - 1]);
4264 for (uintptr_t row_idx2 = row_idx; row_idx2 != sample_ct; ++row_idx2) {
4265 *write_float_iter++ = S_CAST(float, grm_col[(row_idx2 - row_start_idx) * sample_ct]);
4266 }
4267 }
4268 if (unlikely(fwrite_checked(write_float_buf, sizeof(float) * S_CAST(uintptr_t, write_float_iter - write_float_buf), outfile))) {
4269 goto CalcGrm_ret_WRITE_FAIL;
4270 }
4271 } while (row_idx < row_end_idx);
4272 if (unlikely(fclose_null(&outfile))) {
4273 goto CalcGrm_ret_WRITE_FAIL;
4274 }
4275 } else {
4276 char* outname_end2 = strcpya_k(outname_end, ".rel");
4277 if (parallel_tot != 1) {
4278 *outname_end2++ = '.';
4279 outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4280 }
4281 const uint32_t output_zst = (grm_flags / kfGrmMatrixZs) & 1;
4282 if (output_zst) {
4283 outname_end2 = strcpya_k(outname_end2, ".zst");
4284 }
4285 *outname_end2 = '\0';
4286 reterr = InitCstreamAlloc(outname, 0, output_zst, max_thread_ct, kCompressStreamBlock + 16 * row_end_idx, &css, &cswritep);
4287 if (unlikely(reterr)) {
4288 goto CalcGrm_ret_1;
4289 }
4290 uintptr_t row_idx = row_start_idx;
4291 do {
4292 const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4293 ++row_idx;
4294 for (uint32_t col_idx = 0; col_idx != row_idx; ++col_idx) {
4295 cswritep = dtoa_g(*grm_iter++, cswritep);
4296 *cswritep++ = '\t';
4297 }
4298 if (matrix_shape == kfGrmMatrixSq0) {
4299 // (roughly same performance as creating a zero-tab constant
4300 // buffer in advance)
4301 const uint32_t zcount = sample_ct - row_idx;
4302 const uint32_t wct = DivUp(zcount, kBytesPerWord / 2);
4303 // assumes little-endian
4304 const uintptr_t zerotab_word = 0x930 * kMask0001;
4305 #ifdef __arm__
4306 # error "Unaligned accesses in CalcGrm()."
4307 #endif
4308 uintptr_t* writep_alias = R_CAST(uintptr_t*, cswritep);
4309 for (uintptr_t widx = 0; widx != wct; ++widx) {
4310 *writep_alias++ = zerotab_word;
4311 }
4312 cswritep = &(cswritep[zcount * 2]);
4313 } else if (matrix_shape == kfGrmMatrixSq) {
4314 const double* grm_col = &(grm[row_idx - 1]);
4315 for (uintptr_t row_idx2 = row_idx; row_idx2 != sample_ct; ++row_idx2) {
4316 cswritep = dtoa_g(grm_col[(row_idx2 - row_start_idx) * sample_ct], cswritep);
4317 *cswritep++ = '\t';
4318 }
4319 }
4320 DecrAppendBinaryEoln(&cswritep);
4321 if (unlikely(Cswrite(&css, &cswritep))) {
4322 goto CalcGrm_ret_WRITE_FAIL;
4323 }
4324 } while (row_idx < row_end_idx);
4325 if (unlikely(CswriteCloseNull(&css, cswritep))) {
4326 goto CalcGrm_ret_WRITE_FAIL;
4327 }
4328 }
4329 putc_unlocked('\r', stdout);
4330 log_write_iter = strcpya_k(g_logbuf, "--make-rel: GRM ");
4331 if (parallel_tot != 1) {
4332 log_write_iter = strcpya_k(log_write_iter, "component ");
4333 }
4334 log_write_iter = strcpya_k(log_write_iter, "written to ");
4335 log_write_iter = strcpya(log_write_iter, outname);
4336 } else {
4337 const uint32_t* missing_dbl_exclude_iter = missing_dbl_exclude_cts;
4338 if (grm_flags & kfGrmBin) {
4339 // --make-grm-bin
4340 float* write_float_buf;
4341 if (unlikely(bigstack_alloc_f(row_end_idx, &write_float_buf))) {
4342 goto CalcGrm_ret_NOMEM;
4343 }
4344 char* outname_end2 = strcpya_k(outname_end, ".grm.bin");
4345 if (parallel_tot != 1) {
4346 *outname_end2++ = '.';
4347 outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4348 }
4349 *outname_end2 = '\0';
4350 if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
4351 goto CalcGrm_ret_OPEN_FAIL;
4352 }
4353 fputs("--make-grm-bin: Writing...", stdout);
4354 fflush(stdout);
4355 for (uintptr_t row_idx = row_start_idx; row_idx != row_end_idx; ++row_idx) {
4356 const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4357 for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
4358 write_float_buf[col_idx] = S_CAST(float, *grm_iter++);
4359 }
4360 if (unlikely(fwrite_checked(write_float_buf, (row_idx + 1) * sizeof(float), outfile))) {
4361 goto CalcGrm_ret_WRITE_FAIL;
4362 }
4363 }
4364 if (unlikely(fclose_null(&outfile))) {
4365 goto CalcGrm_ret_WRITE_FAIL;
4366 }
4367
4368 outname_end2 = strcpya_k(outname_end, ".grm.N.bin");
4369 if (parallel_tot != 1) {
4370 *outname_end2++ = '.';
4371 outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4372 }
4373 *outname_end2 = '\0';
4374 if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
4375 goto CalcGrm_ret_OPEN_FAIL;
4376 }
4377 if (!missing_cts) {
4378 // trivial case: write the same number repeatedly
4379 const uintptr_t tot_cells = (S_CAST(uint64_t, row_end_idx) * (row_end_idx - 1) - S_CAST(uint64_t, row_start_idx) * (row_start_idx - 1)) / 2;
4380 const float variant_ctf = u31tof(variant_ct);
4381 write_float_buf = R_CAST(float*, g_textbuf);
4382 for (uint32_t uii = 0; uii != (kTextbufMainSize / sizeof(float)); ++uii) {
4383 write_float_buf[uii] = variant_ctf;
4384 }
4385 const uintptr_t full_write_ct = tot_cells / (kTextbufMainSize / sizeof(float));
4386 for (uintptr_t ulii = 0; ulii != full_write_ct; ++ulii) {
4387 if (unlikely(fwrite_checked(write_float_buf, kTextbufMainSize, outfile))) {
4388 goto CalcGrm_ret_WRITE_FAIL;
4389 }
4390 }
4391 const uintptr_t remainder = tot_cells % (kTextbufMainSize / sizeof(float));
4392 if (remainder) {
4393 if (unlikely(fwrite_checked(write_float_buf, remainder * sizeof(float), outfile))) {
4394 goto CalcGrm_ret_WRITE_FAIL;
4395 }
4396 }
4397 } else {
4398 for (uintptr_t row_idx = row_start_idx; row_idx != row_end_idx; ++row_idx) {
4399 const uint32_t variant_ct_base = variant_ct - missing_cts[row_idx];
4400 for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
4401 uint32_t cur_obs_ct = variant_ct_base;
4402 if (col_idx != row_idx) {
4403 cur_obs_ct = cur_obs_ct - missing_cts[col_idx] + (*missing_dbl_exclude_iter++);
4404 }
4405 write_float_buf[col_idx] = u31tof(cur_obs_ct);
4406 }
4407 if (unlikely(fwrite_checked(write_float_buf, (row_idx + 1) * sizeof(float), outfile))) {
4408 goto CalcGrm_ret_WRITE_FAIL;
4409 }
4410 }
4411 }
4412 if (unlikely(fclose_null(&outfile))) {
4413 goto CalcGrm_ret_WRITE_FAIL;
4414 }
4415 putc_unlocked('\r', stdout);
4416 const uint32_t outname_copy_byte_ct = 5 + S_CAST(uintptr_t, outname_end - outname);
4417 log_write_iter = strcpya_k(g_logbuf, "--make-grm-bin: GRM ");
4418 if (parallel_tot != 1) {
4419 log_write_iter = strcpya_k(log_write_iter, "component ");
4420 }
4421 log_write_iter = strcpya_k(log_write_iter, "written to ");
4422 log_write_iter = memcpya(log_write_iter, outname, outname_copy_byte_ct);
4423 log_write_iter = strcpya_k(log_write_iter, "bin");
4424 if (parallel_tot != 1) {
4425 *log_write_iter++ = '.';
4426 log_write_iter = u32toa(parallel_idx + 1, log_write_iter);
4427 }
4428 log_write_iter = strcpya_k(log_write_iter, " , ");
4429 if (parallel_idx) {
4430 log_write_iter = strcpya_k(log_write_iter, "and ");
4431 }
4432 log_write_iter = strcpya_k(log_write_iter, "observation counts to ");
4433 log_write_iter = memcpya(log_write_iter, outname, outname_end2 - outname);
4434 } else {
4435 // --make-grm-list
4436 char* outname_end2 = strcpya_k(outname_end, ".grm");
4437 if (parallel_tot != 1) {
4438 *outname_end2++ = '.';
4439 outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4440 }
4441 if (grm_flags & kfGrmListZs) {
4442 outname_end2 = strcpya_k(outname_end2, ".zst");
4443 }
4444 *outname_end2 = '\0';
4445 reterr = InitCstreamAlloc(outname, 0, !(grm_flags & kfGrmListNoGz), max_thread_ct, kCompressStreamBlock + kMaxMediumLine, &css, &cswritep);
4446 if (unlikely(reterr)) {
4447 goto CalcGrm_ret_1;
4448 }
4449 fputs("--make-grm-list: Writing...", stdout);
4450 fflush(stdout);
4451 for (uintptr_t row_idx = row_start_idx; row_idx != row_end_idx; ++row_idx) {
4452 uint32_t variant_ct_base = variant_ct;
4453 if (missing_cts) {
4454 variant_ct_base -= missing_cts[row_idx];
4455 }
4456 const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4457 for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
4458 cswritep = u32toa_x(row_idx + 1, '\t', cswritep);
4459 cswritep = u32toa_x(col_idx + 1, '\t', cswritep);
4460 if (missing_cts) {
4461 uint32_t cur_obs_ct = variant_ct_base;
4462 if (col_idx != row_idx) {
4463 cur_obs_ct = cur_obs_ct - missing_cts[col_idx] + (*missing_dbl_exclude_iter++);
4464 }
4465 cswritep = u32toa(cur_obs_ct, cswritep);
4466 } else {
4467 cswritep = u32toa(variant_ct_base, cswritep);
4468 }
4469 *cswritep++ = '\t';
4470 cswritep = dtoa_g(*grm_iter++, cswritep);
4471 AppendBinaryEoln(&cswritep);
4472 if (unlikely(Cswrite(&css, &cswritep))) {
4473 goto CalcGrm_ret_WRITE_FAIL;
4474 }
4475 }
4476 }
4477 if (unlikely(CswriteCloseNull(&css, cswritep))) {
4478 goto CalcGrm_ret_WRITE_FAIL;
4479 }
4480 putc_unlocked('\r', stdout);
4481 log_write_iter = strcpya_k(g_logbuf, "--make-grm-list: GRM ");
4482 if (parallel_tot != 1) {
4483 log_write_iter = strcpya_k(log_write_iter, "component ");
4484 }
4485 log_write_iter = strcpya_k(log_write_iter, "written to ");
4486 log_write_iter = strcpya(log_write_iter, outname);
4487 }
4488 }
4489 if (!parallel_idx) {
4490 SampleIdFlags id_print_flags = siip->flags & kfSampleIdFidPresent;
4491 if (grm_flags & kfGrmNoIdHeader) {
4492 id_print_flags |= kfSampleIdNoIdHeader;
4493 if (grm_flags & kfGrmNoIdHeaderIidOnly) {
4494 id_print_flags |= kfSampleIdNoIdHeaderIidOnly;
4495 }
4496 }
4497 snprintf(&(outname_end[4]), kMaxOutfnameExtBlen - 4, ".id");
4498 reterr = WriteSampleIdsOverride(orig_sample_include, siip, outname, sample_ct, id_print_flags);
4499 if (unlikely(reterr)) {
4500 goto CalcGrm_ret_1;
4501 }
4502 log_write_iter = strcpya_k(log_write_iter, " , and IDs to ");
4503 log_write_iter = strcpya(log_write_iter, outname);
4504 }
4505 snprintf(log_write_iter, kLogbufSize - 2 * kPglFnamesize - 256, " .\n");
4506 WordWrapB(0);
4507 logputsb();
4508 }
4509
4510 if (grm_ptr) {
4511 *grm_ptr = grm;
4512 // allocation right on top of grm[]
4513 bigstack_mark = R_CAST(unsigned char*, sample_include_cumulative_popcounts);
4514 }
4515 }
4516 while (0) {
4517 CalcGrm_ret_NOMEM:
4518 reterr = kPglRetNomem;
4519 break;
4520 CalcGrm_ret_NOMEM_CUSTOM:
4521 reterr = kPglRetNomemCustomMsg;
4522 break;
4523 CalcGrm_ret_OPEN_FAIL:
4524 reterr = kPglRetOpenFail;
4525 break;
4526 CalcGrm_ret_PGR_FAIL:
4527 PgenErrPrintN(reterr);
4528 break;
4529 CalcGrm_ret_WRITE_FAIL:
4530 reterr = kPglRetWriteFail;
4531 break;
4532 CalcGrm_ret_THREAD_CREATE_FAIL:
4533 reterr = kPglRetThreadCreateFail;
4534 break;
4535 CalcGrm_ret_DEGENERATE_DATA:
4536 reterr = kPglRetDegenerateData;
4537 break;
4538 }
4539 CalcGrm_ret_1:
4540 CswriteCloseCond(&css, cswritep);
4541 fclose_cond(outfile);
4542 CleanupThreads(&tg);
4543 BLAS_SET_NUM_THREADS(1);
4544 BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
4545 return reterr;
4546 }
4547
4548 // should be able to remove NOLAPACK later since we already have a non-LAPACK
4549 // SVD implementation
4550 #ifndef NOLAPACK
4551 // this seems to be better than 256 (due to avoidance of cache critical
4552 // stride?)
4553 // (still want this to be a multiple of 8, for cleaner multithreading)
4554 CONSTI32(kPcaVariantBlockSize, 240);
4555
// Shared state for the CalcPcaXtxaThread/CalcPcaXaThread/CalcPcaXtbThread
// worker threads.
typedef struct CalcPcaCtxStruct {
  uint32_t sample_ct;
  uint32_t pc_ct;

  // Double-buffered variant-major blocks; workers index these with the
  // current batch parity.
  double* yy_bufs[2];

  // Number of variants in the batch currently being processed.
  uint32_t cur_batch_size;

  // Read-only input matrix shared by all workers.
  double* g1;
  // Output matrix; (pc_ct + 1) * pc_ct * 2 columns per variant row.
  double* qq;
  // Per-thread transpose scratch buffers (indexed by tidx).
  double** y_transpose_bufs;
  // Per-thread partial-sum accumulators (indexed by tidx).
  double** g2_bb_part_bufs;
} CalcPcaCtx;
4569
CalcPcaXtxaThread(void * raw_arg)4570 THREAD_FUNC_DECL CalcPcaXtxaThread(void* raw_arg) {
4571 ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
4572 const uintptr_t tidx = arg->tidx;
4573 CalcPcaCtx* ctx = S_CAST(CalcPcaCtx*, arg->sharedp->context);
4574
4575 const uint32_t sample_ct = ctx->sample_ct;
4576 const uint32_t pc_ct_x2 = ctx->pc_ct * 2;
4577 const uintptr_t qq_col_ct = (ctx->pc_ct + 1) * pc_ct_x2;
4578 const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
4579 const double* g1 = ctx->g1;
4580 double* qq_iter = ctx->qq;
4581 double* y_transpose_buf = ctx->y_transpose_bufs[tidx];
4582 double* g2_part_buf = ctx->g2_bb_part_bufs[tidx];
4583 uint32_t parity = 0;
4584 do {
4585 const uint32_t cur_batch_size = ctx->cur_batch_size;
4586 if (vidx_offset < cur_batch_size) {
4587 uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
4588 if (cur_thread_batch_size > kPcaVariantBlockSize) {
4589 cur_thread_batch_size = kPcaVariantBlockSize;
4590 }
4591 const double* yy_buf = &(ctx->yy_bufs[parity][S_CAST(uintptr_t, vidx_offset) * sample_ct]);
4592 double* cur_qq = &(qq_iter[vidx_offset * qq_col_ct]);
4593 RowMajorMatrixMultiplyStrided(yy_buf, g1, cur_thread_batch_size, sample_ct, pc_ct_x2, pc_ct_x2, sample_ct, qq_col_ct, cur_qq);
4594 MatrixTransposeCopy(yy_buf, cur_thread_batch_size, sample_ct, y_transpose_buf);
4595 RowMajorMatrixMultiplyStridedIncr(y_transpose_buf, cur_qq, sample_ct, cur_thread_batch_size, pc_ct_x2, qq_col_ct, cur_thread_batch_size, pc_ct_x2, g2_part_buf);
4596 qq_iter = &(qq_iter[cur_batch_size * qq_col_ct]);
4597 }
4598 parity = 1 - parity;
4599 } while (!THREAD_BLOCK_FINISH(arg));
4600 THREAD_RETURN;
4601 }
4602
CalcPcaXaThread(void * raw_arg)4603 THREAD_FUNC_DECL CalcPcaXaThread(void* raw_arg) {
4604 ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
4605 const uintptr_t tidx = arg->tidx;
4606 CalcPcaCtx* ctx = S_CAST(CalcPcaCtx*, arg->sharedp->context);
4607
4608 const uint32_t sample_ct = ctx->sample_ct;
4609 const uint32_t pc_ct_x2 = ctx->pc_ct * 2;
4610 const uintptr_t qq_col_ct = (ctx->pc_ct + 1) * pc_ct_x2;
4611 const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
4612 const double* g1 = ctx->g1;
4613 double* qq_iter = ctx->qq;
4614 uint32_t parity = 0;
4615 do {
4616 const uint32_t cur_batch_size = ctx->cur_batch_size;
4617 if (vidx_offset < cur_batch_size) {
4618 uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
4619 if (cur_thread_batch_size > kPcaVariantBlockSize) {
4620 cur_thread_batch_size = kPcaVariantBlockSize;
4621 }
4622 const double* yy_buf = &(ctx->yy_bufs[parity][S_CAST(uintptr_t, vidx_offset) * sample_ct]);
4623 double* cur_qq = &(qq_iter[vidx_offset * qq_col_ct]);
4624 RowMajorMatrixMultiplyStrided(yy_buf, g1, cur_thread_batch_size, sample_ct, pc_ct_x2, pc_ct_x2, sample_ct, qq_col_ct, cur_qq);
4625 qq_iter = &(qq_iter[cur_batch_size * qq_col_ct]);
4626 }
4627 parity = 1 - parity;
4628 } while (!THREAD_BLOCK_FINISH(arg));
4629 THREAD_RETURN;
4630 }
4631
CalcPcaXtbThread(void * raw_arg)4632 THREAD_FUNC_DECL CalcPcaXtbThread(void* raw_arg) {
4633 ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
4634 const uintptr_t tidx = arg->tidx;
4635 CalcPcaCtx* ctx = S_CAST(CalcPcaCtx*, arg->sharedp->context);
4636
4637 const uint32_t sample_ct = ctx->sample_ct;
4638 const uint32_t pc_ct_x2 = ctx->pc_ct * 2;
4639 const uintptr_t qq_col_ct = (ctx->pc_ct + 1) * pc_ct_x2;
4640 const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
4641 const double* qq_iter = &(ctx->qq[vidx_offset * qq_col_ct]);
4642 double* y_transpose_buf = ctx->y_transpose_bufs[tidx];
4643 double* bb_part_buf = ctx->g2_bb_part_bufs[tidx];
4644 uint32_t parity = 0;
4645 do {
4646 const uint32_t cur_batch_size = ctx->cur_batch_size;
4647 if (vidx_offset < cur_batch_size) {
4648 uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
4649 if (cur_thread_batch_size > kPcaVariantBlockSize) {
4650 cur_thread_batch_size = kPcaVariantBlockSize;
4651 }
4652 const double* yy_buf = &(ctx->yy_bufs[parity][S_CAST(uintptr_t, vidx_offset) * sample_ct]);
4653 MatrixTransposeCopy(yy_buf, cur_thread_batch_size, sample_ct, y_transpose_buf);
4654 RowMajorMatrixMultiplyIncr(y_transpose_buf, qq_iter, sample_ct, qq_col_ct, cur_thread_batch_size, bb_part_buf);
4655 qq_iter = &(qq_iter[cur_batch_size * qq_col_ct]);
4656 }
4657 parity = 1 - parity;
4658 } while (!THREAD_BLOCK_FINISH(arg));
4659 THREAD_RETURN;
4660 }
4661
// Shared state for the CalcPcaVarWtsThread workers.
typedef struct CalcPcaVarWtsCtxStruct {
  uint32_t sample_ct;
  uint32_t pc_ct;

  // Sample-major sample-weight matrix with pc_ct columns.
  double* sample_wts_smaj;

  // Double-buffered variant-major blocks; workers index these with the
  // current batch parity.
  double* yy_bufs[2];

  // Number of variants in the batch currently being processed.
  uint32_t cur_batch_size;

  // Double-buffered output: pc_ct unscaled weights per variant row.
  double* var_wts;
} CalcPcaVarWtsCtx;
4674
CalcPcaVarWtsThread(void * raw_arg)4675 THREAD_FUNC_DECL CalcPcaVarWtsThread(void* raw_arg) {
4676 ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
4677 const uintptr_t tidx = arg->tidx;
4678 CalcPcaVarWtsCtx* ctx = S_CAST(CalcPcaVarWtsCtx*, arg->sharedp->context);
4679
4680 const uint32_t sample_ct = ctx->sample_ct;
4681 const uint32_t pc_ct = ctx->pc_ct;
4682 const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
4683
4684 // either first batch size is calc_thread_ct * kPcaVariantBlockSize, or there
4685 // is only one batch
4686 const uintptr_t var_wts_part_size = S_CAST(uintptr_t, pc_ct) * ctx->cur_batch_size;
4687
4688 const double* sample_wts = ctx->sample_wts_smaj; // sample-major, pc_ct columns
4689 uint32_t parity = 0;
4690 do {
4691 const uint32_t cur_batch_size = ctx->cur_batch_size;
4692 if (vidx_offset < cur_batch_size) {
4693 uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
4694 if (cur_thread_batch_size > kPcaVariantBlockSize) {
4695 cur_thread_batch_size = kPcaVariantBlockSize;
4696 }
4697 const double* yy_buf = &(ctx->yy_bufs[parity][S_CAST(uintptr_t, vidx_offset) * sample_ct]);
4698 // Variant weight matrix = X^T * S * D^{-1/2}, where X^T is the
4699 // variance-standardized genotype matrix, S is the sample weight matrix,
4700 // and D is a diagonal eigenvalue matrix.
4701 // We postpone the D^{-1/2} part for now, but it's straightforward to
4702 // switch to using precomputed (S * D^{-1/2}).
4703 double* cur_var_wts_part = &(ctx->var_wts[parity * var_wts_part_size + vidx_offset * S_CAST(uintptr_t, pc_ct)]);
4704 RowMajorMatrixMultiply(yy_buf, sample_wts, cur_thread_batch_size, pc_ct, sample_ct, cur_var_wts_part);
4705 }
4706 parity = 1 - parity;
4707 } while (!THREAD_BLOCK_FINISH(arg));
4708 THREAD_RETURN;
4709 }
4710
FlushBiallelicVarWts(const uintptr_t * variant_include,const ChrInfo * cip,const uint32_t * variant_bps,const char * const * variant_ids,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,const AlleleCode * maj_alleles,const double * var_wts_iter,const double * eigval_inv_sqrts,uint32_t batch_size,uint32_t pc_ct,PcaFlags pca_flags,CompressStreamState * cssp,char ** cswritepp,char * chr_buf,uint32_t * variant_idxp,uintptr_t * variant_uidxp,uint32_t * chr_fo_idxp,uint32_t * chr_endp,uint32_t * chr_buf_blenp)4711 PglErr FlushBiallelicVarWts(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const AlleleCode* maj_alleles, const double* var_wts_iter, const double* eigval_inv_sqrts, uint32_t batch_size, uint32_t pc_ct, PcaFlags pca_flags, CompressStreamState* cssp, char** cswritepp, char* chr_buf, uint32_t* variant_idxp, uintptr_t* variant_uidxp, uint32_t* chr_fo_idxp, uint32_t* chr_endp, uint32_t* chr_buf_blenp) {
4712 char* cswritep = *cswritepp;
4713 uint32_t variant_idx = *variant_idxp;
4714 uintptr_t variant_uidx = *variant_uidxp;
4715 uint32_t chr_fo_idx = *chr_fo_idxp;
4716 uint32_t chr_end = *chr_endp;
4717 uint32_t chr_buf_blen = *chr_buf_blenp;
4718
4719 const uint32_t variant_idx_stop = variant_idx + batch_size;
4720 const uint32_t ref_col = pca_flags & kfPcaVcolRef;
4721 const uint32_t alt1_col = pca_flags & kfPcaVcolAlt1;
4722 const uint32_t alt_col = pca_flags & kfPcaVcolAlt;
4723 const uint32_t maj_col = pca_flags & kfPcaVcolMaj;
4724 const uint32_t nonmaj_col = pca_flags & kfPcaVcolNonmaj;
4725
4726 uintptr_t variant_uidx_base;
4727 uintptr_t cur_bits;
4728 BitIter1Start(variant_include, variant_uidx, &variant_uidx_base, &cur_bits);
4729 for (; variant_idx != variant_idx_stop; ++variant_idx) {
4730 variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
4731 if (chr_buf) {
4732 // ok to skip this logic if chr_col not printed
4733 if (variant_uidx >= chr_end) {
4734 do {
4735 ++chr_fo_idx;
4736 chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
4737 } while (variant_uidx >= chr_end);
4738 const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
4739 char* chr_name_end = chrtoa(cip, chr_idx, chr_buf);
4740 *chr_name_end = '\t';
4741 chr_buf_blen = 1 + S_CAST(uintptr_t, chr_name_end - chr_buf);
4742 }
4743 cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
4744 }
4745 if (variant_bps) {
4746 cswritep = u32toa_x(variant_bps[variant_uidx], '\t', cswritep);
4747 }
4748 cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
4749 uintptr_t allele_idx_offset_base = variant_uidx * 2;
4750 if (allele_idx_offsets) {
4751 allele_idx_offset_base = allele_idx_offsets[variant_uidx];
4752 }
4753 const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
4754 if (ref_col) {
4755 *cswritep++ = '\t';
4756 cswritep = strcpya(cswritep, cur_alleles[0]);
4757 }
4758 if (alt1_col) {
4759 *cswritep++ = '\t';
4760 cswritep = strcpya(cswritep, cur_alleles[1]);
4761 }
4762 if (alt_col) {
4763 *cswritep++ = '\t';
4764 // guaranteed biallelic
4765 cswritep = strcpya(cswritep, cur_alleles[1]);
4766 }
4767 const uint32_t maj_allele_idx = maj_alleles[variant_uidx];
4768 if (maj_col) {
4769 if (unlikely(Cswrite(cssp, &cswritep))) {
4770 return kPglRetWriteFail;
4771 }
4772 *cswritep++ = '\t';
4773 cswritep = strcpya(cswritep, cur_alleles[maj_allele_idx]);
4774 }
4775 if (nonmaj_col) {
4776 *cswritep++ = '\t';
4777 cswritep = strcpya(cswritep, cur_alleles[1 - maj_allele_idx]);
4778 }
4779 if (!maj_allele_idx) {
4780 for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
4781 *cswritep++ = '\t';
4782 // could avoid these multiplications by premultiplying the
4783 // sample weight matrix
4784 cswritep = dtoa_g((*var_wts_iter++) * eigval_inv_sqrts[pc_idx], cswritep);
4785 }
4786 } else {
4787 for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
4788 *cswritep++ = '\t';
4789 cswritep = dtoa_g((*var_wts_iter++) * (-eigval_inv_sqrts[pc_idx]), cswritep);
4790 }
4791 }
4792 AppendBinaryEoln(&cswritep);
4793 if (unlikely(Cswrite(cssp, &cswritep))) {
4794 // bugfix (15 Dec 2017): prevent buffer overflow when ALT, MAJ,
4795 // and NONMAJ columns all missing.
4796 return kPglRetWriteFail;
4797 }
4798 }
4799 *cswritepp = cswritep;
4800 *variant_idxp = variant_idx_stop;
4801 *variant_uidxp = variant_uidx + 1;
4802 *chr_fo_idxp = chr_fo_idx;
4803 *chr_endp = chr_end;
4804 *chr_buf_blenp = chr_buf_blen;
4805 return kPglRetSuccess;
4806 }
4807
// Streams one batch of --pca allele-wts output rows (one text line per
// allele) to the compressed stream managed by *cssp.
//
// This function is resumable: every trailing pointer parameter is in/out
// state that is saved on return and restored on the next call, so a single
// multiallelic variant's alleles may be split across consecutive batches.
// On entry, nonzero *incomplete_allele_idxp means the previous call stopped
// partway through the alleles of *variant_uidxp (whose allele_idx_offset_base
// and cur_allele_ct are also carried over).
//
// batch_size counts weight *rows* consumed from var_wts_iter, not output
// lines: a biallelic variant has one shared weight row but produces two
// output lines (see the mult/rewind logic below).
//
// var_wts_iter: pc_ct weights per row.
// eigval_inv_sqrts: per-PC scaling factors (1/sqrt(eigenvalue), computed by
//   the caller); each written weight is wt * eigval_inv_sqrt.
// chr_buf: non-null iff the CHROM column was requested; holds the current
//   "<chr name>\t" prefix of length *chr_buf_blenp.
// Returns kPglRetSuccess, or kPglRetWriteFail if a Cswrite flush fails.
PglErr FlushAlleleWts(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const double* var_wts_iter, const double* eigval_inv_sqrts, uint32_t batch_size, uint32_t pc_ct, PcaFlags pca_flags, CompressStreamState* cssp, char** cswritepp, char* chr_buf, uint32_t* variant_idxp, uintptr_t* variant_uidxp, uintptr_t* allele_idx_offset_basep, uint32_t* cur_allele_ctp, uint32_t* incomplete_allele_idxp, uint32_t* chr_fo_idxp, uint32_t* chr_endp, uint32_t* chr_buf_blenp) {
  // Load all resumable state into locals; written back before returning.
  char* cswritep = *cswritepp;
  uint32_t variant_idx = *variant_idxp;
  uintptr_t variant_uidx = *variant_uidxp;
  uintptr_t allele_idx_offset_base = *allele_idx_offset_basep;
  uint32_t cur_allele_ct = *cur_allele_ctp;
  uint32_t incomplete_allele_idx = *incomplete_allele_idxp;
  uint32_t chr_fo_idx = *chr_fo_idxp;
  uint32_t chr_end = *chr_endp;
  uint32_t chr_buf_blen = *chr_buf_blenp;

  // Optional output columns (A1 is unconditional; see below).
  const uint32_t ref_col = pca_flags & kfPcaVcolRef;
  const uint32_t alt1_col = pca_flags & kfPcaVcolAlt1;
  const uint32_t alt_col = pca_flags & kfPcaVcolAlt;
  const uint32_t ax_col = pca_flags & kfPcaVcolAx;

  uintptr_t variant_uidx_base;
  uintptr_t cur_bits;
  // If resuming mid-variant, the current variant_uidx was already consumed by
  // the previous call, so start the bit iterator one position past it.
  BitIter1Start(variant_include, variant_uidx + (incomplete_allele_idx != 0), &variant_uidx_base, &cur_bits);
  // allele_bidx tracks weight rows consumed; incremented by incr at the
  // bottom of the loop body.
  for (uint32_t allele_bidx = 0; allele_bidx != batch_size; ) {
    if (!incomplete_allele_idx) {
      // Starting a fresh variant: advance to its uidx and refresh the
      // chromosome prefix and allele-offset bookkeeping.
      variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
      if (chr_buf && (variant_uidx >= chr_end)) {
        do {
          ++chr_fo_idx;
          chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
        } while (variant_uidx >= chr_end);
        const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
        char* chr_name_end = chrtoa(cip, chr_idx, chr_buf);
        *chr_name_end = '\t';
        chr_buf_blen = 1 + S_CAST(uintptr_t, chr_name_end - chr_buf);
      }
      if (!allele_idx_offsets) {
        // No offsets array => all variants biallelic, offsets are 2*uidx.
        allele_idx_offset_base = variant_uidx * 2;
      } else {
        allele_idx_offset_base = allele_idx_offsets[variant_uidx];
        cur_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
      }
    }
    // Decide how many of this variant's alleles to emit in this batch, and
    // how many weight rows (incr) that consumes.
    uint32_t allele_idx_end = cur_allele_ct;
    uint32_t allele_idx_stop;
    uint32_t incr;
    if (cur_allele_ct == 2) {
      // Biallelic: both allele lines share one weight row (incr = 1).
      allele_idx_stop = 2;
      incr = 1;
    } else {
      // Multiallelic: one weight row per allele; clamp to the remaining
      // batch capacity, possibly leaving the variant incomplete.
      allele_idx_stop = batch_size + incomplete_allele_idx - allele_bidx;
      if (allele_idx_stop > allele_idx_end) {
        allele_idx_stop = allele_idx_end;
      }
      incr = allele_idx_stop - incomplete_allele_idx;
    }
    const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
    for (uint32_t allele_idx = incomplete_allele_idx; allele_idx != allele_idx_stop; ++allele_idx) {
      if (chr_buf) {
        cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
      }
      if (variant_bps) {
        cswritep = u32toa_x(variant_bps[variant_uidx], '\t', cswritep);
      }
      cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
      if (ref_col) {
        *cswritep++ = '\t';
        cswritep = strcpya(cswritep, cur_alleles[0]);
      }
      if (alt1_col) {
        *cswritep++ = '\t';
        cswritep = strcpya(cswritep, cur_alleles[1]);
      }
      if (alt_col) {
        *cswritep++ = '\t';
        // Comma-separated list of all ALT alleles; flush inside the loop
        // since allele counts/lengths are unbounded.
        for (uint32_t allele_idx2 = 1; allele_idx2 != cur_allele_ct; ++allele_idx2) {
          if (unlikely(Cswrite(cssp, &cswritep))) {
            return kPglRetWriteFail;
          }
          cswritep = strcpyax(cswritep, cur_alleles[allele_idx2], ',');
        }
        // Remove the trailing comma (loop always runs at least once since
        // cur_allele_ct >= 2).
        --cswritep;
      }
      // A1 col always present
      if (unlikely(Cswrite(cssp, &cswritep))) {
        return kPglRetWriteFail;
      }
      *cswritep++ = '\t';
      cswritep = strcpya(cswritep, cur_alleles[allele_idx]);
      if (ax_col) {
        *cswritep++ = '\t';
        // All alleles other than A1, comma-separated.
        for (uint32_t allele_idx2 = 0; allele_idx2 != cur_allele_ct; ++allele_idx2) {
          if (allele_idx2 == allele_idx) {
            continue;
          }
          if (unlikely(Cswrite(cssp, &cswritep))) {
            return kPglRetWriteFail;
          }
          cswritep = strcpyax(cswritep, cur_alleles[allele_idx2], ',');
        }
        // Remove trailing comma; at least one non-A1 allele always exists.
        --cswritep;
      }
      if (cur_allele_ct == 2) {
        // Biallelic convention: allele 0 gets +0.5 * wt, allele 1 gets
        // -0.5 * wt, both from the same weight row.
        const double mult = allele_idx? -0.5 : 0.5;
        for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
          *cswritep++ = '\t';
          cswritep = dtoa_g((*var_wts_iter++) * mult * eigval_inv_sqrts[pc_idx], cswritep);
        }
        if (!allele_idx) {
          // Rewind so the second allele line reuses the same pc_ct weights.
          var_wts_iter -= pc_ct;
        }
      } else {
        // Multiallelic: each allele has its own weight row, written as-is.
        for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
          *cswritep++ = '\t';
          cswritep = dtoa_g((*var_wts_iter++) * eigval_inv_sqrts[pc_idx], cswritep);
        }
      }
      AppendBinaryEoln(&cswritep);
      if (unlikely(Cswrite(cssp, &cswritep))) {
        return kPglRetWriteFail;
      }
    }
    allele_bidx += incr;
    if (allele_idx_stop == allele_idx_end) {
      // Finished this variant's alleles.
      ++variant_idx;
      incomplete_allele_idx = 0;
    } else {
      // Ran out of batch mid-variant; resume here on the next call.
      incomplete_allele_idx = allele_idx_stop;
    }
  }
  // Save state for the next call.  variant_uidx is advanced past the current
  // variant only if it was fully emitted.
  *cswritepp = cswritep;
  *variant_idxp = variant_idx;
  *variant_uidxp = variant_uidx + (incomplete_allele_idx == 0);
  *allele_idx_offset_basep = allele_idx_offset_base;
  *cur_allele_ctp = cur_allele_ct;
  *incomplete_allele_idxp = incomplete_allele_idx;
  *chr_fo_idxp = chr_fo_idx;
  *chr_endp = chr_end;
  *chr_buf_blenp = chr_buf_blen;
  return kPglRetSuccess;
}
4945
CalcPca(const uintptr_t * sample_include,const SampleIdInfo * siip,const uintptr_t * variant_include,const ChrInfo * cip,const uint32_t * variant_bps,const char * const * variant_ids,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,const AlleleCode * maj_alleles,const double * allele_freqs,uint32_t raw_sample_ct,uintptr_t pca_sample_ct,uint32_t raw_variant_ct,uint32_t variant_ct,uint32_t max_allele_ct,uint32_t max_allele_slen,uint32_t pc_ct,PcaFlags pca_flags,uint32_t max_thread_ct,PgenReader * simple_pgrp,sfmt_t * sfmtp,double * grm,char * outname,char * outname_end)4946 PglErr CalcPca(const uintptr_t* sample_include, const SampleIdInfo* siip, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const AlleleCode* maj_alleles, const double* allele_freqs, uint32_t raw_sample_ct, uintptr_t pca_sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t pc_ct, PcaFlags pca_flags, uint32_t max_thread_ct, PgenReader* simple_pgrp, sfmt_t* sfmtp, double* grm, char* outname, char* outname_end) {
4947 unsigned char* bigstack_mark = g_bigstack_base;
4948 FILE* outfile = nullptr;
4949 char* cswritep = nullptr;
4950 CompressStreamState css;
4951 ThreadGroup tg;
4952 PreinitThreads(&tg);
4953 PglErr reterr = kPglRetSuccess;
4954 PreinitCstream(&css);
4955 {
4956 const uint32_t write_fid = FidColIsRequired(siip, pca_flags / kfPcaScolMaybefid);
4957 const char* sample_ids = siip->sample_ids;
4958 const char* sids = siip->sids;
4959 const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
4960 const uintptr_t max_sid_blen = siip->max_sid_blen;
4961 const uint32_t write_sid = SidColIsRequired(sids, pca_flags / kfPcaScolMaybesid);
4962 const uint32_t is_approx = (pca_flags / kfPcaApprox) & 1;
4963 reterr = ConditionalAllocateNonAutosomalVariants(cip, is_approx? "PCA approximation" : "PCA", raw_variant_ct, &variant_include, &variant_ct);
4964 if (unlikely(reterr)) {
4965 goto CalcPca_ret_1;
4966 }
4967 #ifdef __APPLE__
4968 // min OS X version is 10.7, so we can take Grand Central Dispatch dgemm
4969 // for granted
4970 // (tried this with Linux MKL + OpenMP as well, but results were inferior)
4971 uint32_t calc_thread_ct = 1;
4972 #else
4973 // I/O thread generally has <1/8 of workload
4974 // TODO: recheck this, now that I/O thread is also responsible for fully
4975 // expanding dosages. Still shouldn't be a big deal, but we probably want
4976 // sample_ct to affect the decision boundary now.
4977 uint32_t calc_thread_ct = (max_thread_ct > 8)? (max_thread_ct - 1) : max_thread_ct;
4978 if ((calc_thread_ct - 1) * kPcaVariantBlockSize >= variant_ct) {
4979 calc_thread_ct = 1 + (variant_ct - 1) / kPcaVariantBlockSize;
4980 }
4981 #endif
4982 if (unlikely(pc_ct > pca_sample_ct)) {
4983 // minor update (alpha 3): just error out here instead of trying to
4984 // auto-adjust PC count, number of .eigenvec output columns should be
4985 // easily predictable
4986 logerrprintf("Error: Too few samples to compute %u PCs with \"--pca approx\".\n", pc_ct);
4987 goto CalcPca_ret_DEGENERATE_DATA;
4988 }
4989 const uint32_t wts_requested = ((pca_flags & (kfPcaAlleleWts | kfPcaBiallelicVarWts)) != 0);
4990 const uint32_t biallelic_variant_ct = CountBiallelicVariants(variant_include, allele_idx_offsets, variant_ct);
4991 double* cur_var_wts = nullptr;
4992 double* eigval_inv_sqrts = nullptr;
4993 char* chr_buf = nullptr;
4994 uintptr_t overflow_buf_size = 3 * kMaxMediumLine;
4995 if (wts_requested) {
4996 if (pca_flags & kfPcaBiallelicVarWts) {
4997 if (unlikely(biallelic_variant_ct != variant_ct)) {
4998 logerrputs("Error: Multiallelic variant present in \"--pca biallelic-var-wts\" run.\n");
4999 goto CalcPca_ret_INCONSISTENT_INPUT;
5000 }
5001 }
5002 if (unlikely(
5003 bigstack_alloc_d(pc_ct, &cur_var_wts) ||
5004 bigstack_alloc_d(pc_ct, &eigval_inv_sqrts))) {
5005 goto CalcPca_ret_NOMEM;
5006 }
5007 uint32_t max_chr_blen = 0;
5008 if (pca_flags & kfPcaVcolChrom) {
5009 max_chr_blen = GetMaxChrSlen(cip) + 1;
5010 if (unlikely(bigstack_alloc_c(max_chr_blen, &chr_buf))) {
5011 goto CalcPca_ret_NOMEM;
5012 }
5013 }
5014 const uintptr_t overflow_buf_size2 = RoundUpPow2(kCompressStreamBlock + max_chr_blen + kMaxIdSlen + 2 * max_allele_slen + 32 + 16 * pc_ct, kCacheline);
5015 if (overflow_buf_size2 > overflow_buf_size) {
5016 overflow_buf_size = overflow_buf_size2;
5017 }
5018 }
5019 uintptr_t writebuf_alloc = overflow_buf_size;
5020 if (pca_flags & kfPcaVarZs) {
5021 writebuf_alloc += CstreamWkspaceReq(overflow_buf_size);
5022 }
5023 // temporary
5024 // todo: additional --pca-clusters allocations
5025 const uintptr_t* pca_sample_include = sample_include;
5026 const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
5027 uint32_t* pca_sample_include_cumulative_popcounts;
5028 PgenVariant pgv;
5029 double* allele_1copy_buf;
5030 double* eigvals;
5031 CalcPcaCtx ctx;
5032 if (unlikely(
5033 bigstack_alloc_u32(raw_sample_ctl, &pca_sample_include_cumulative_popcounts) ||
5034 BigstackAllocPgv(pca_sample_ct, allele_idx_offsets != nullptr, PgrGetGflags(simple_pgrp), &pgv) ||
5035 bigstack_alloc_d(max_allele_ct, &allele_1copy_buf) ||
5036 bigstack_alloc_d(pc_ct, &eigvals) ||
5037 SetThreadCt(calc_thread_ct, &tg))) {
5038 goto CalcPca_ret_NOMEM;
5039 }
5040 FillCumulativePopcounts(pca_sample_include, raw_sample_ctl, pca_sample_include_cumulative_popcounts);
5041 PgrSampleSubsetIndex pssi;
5042 PgrSetSampleSubsetIndex(pca_sample_include_cumulative_popcounts, simple_pgrp, &pssi);
5043 ctx.sample_ct = pca_sample_ct;
5044 ctx.pc_ct = pc_ct;
5045 const uintptr_t pca_row_ct = CountAlleles(variant_include, allele_idx_offsets, raw_variant_ct, variant_ct) - biallelic_variant_ct;
5046 const uint32_t is_haploid = cip->haploid_mask[0] & 1;
5047 uint32_t cur_allele_ct = 2;
5048 double* qq = nullptr;
5049 double* eigvecs_smaj;
5050 char* writebuf;
5051 if (is_approx) {
5052 if (pca_sample_ct <= 5000) {
5053 logerrputs("Warning: \"--pca approx\" is only recommended for analysis of >5000 samples.\n");
5054 }
5055 if (pca_row_ct > 5000000) {
5056 logerrputs("Warning: Use of \"--pca approx\" on >5m rows is not advisable. Apply a MAF\nfilter if you haven't done so yet, and consider LD-pruning your variant set as\nwell.\n");
5057 }
5058 // This is ported from EIGENSOFT 6 src/ksrc/kjg_fpca.c , which is in turn
5059 // primarily based on Halko N, Martinsson P, Shkolnisky Y, Tygert M
5060 // (2011) An Algorithm for the Principal Component Analysis of Large Data
5061 // Sets.
5062 const uintptr_t pc_ct_x2 = pc_ct * 2;
5063 const uintptr_t qq_col_ct = (pc_ct + 1) * pc_ct_x2;
5064 // bugfix (30 Jan 2019): First SvdRect() call returns min(variant_ct,
5065 // qq_col_ct) singular vectors; this was previously assumed to always be
5066 // qq_col_ct, and very inaccurate results were produced when the
5067 // assumption wasn't true.
5068 // Simplest solution is to force the user to request fewer PCs, since the
5069 // final PCs wouldn't be accurate anyway.
5070 if (qq_col_ct > variant_ct) {
5071 logerrprintfww("Error: Too few variants to compute %u PCs with \"--pca approx\" (%u required).\n", pc_ct, qq_col_ct);
5072 goto CalcPca_ret_DEGENERATE_DATA;
5073 }
5074 #ifndef LAPACK_ILP64
5075 if (unlikely((pca_row_ct * S_CAST(uint64_t, qq_col_ct)) > 0x7effffff)) {
5076 logerrputs("Error: \"--pca approx\" problem instance too large for this " PROG_NAME_STR " build. If\nthis is really the computation you want, use a " PROG_NAME_STR " build with large-matrix\nsupport.\n");
5077 goto CalcPca_ret_INCONSISTENT_INPUT;
5078 }
5079 #endif
5080 const double variant_ct_recip = 1.0 / u31tod(variant_ct);
5081
5082 const uintptr_t gg_size = pca_sample_ct * pc_ct_x2;
5083 __CLPK_integer svd_rect_lwork;
5084 #ifdef LAPACK_ILP64
5085 GetSvdRectLwork(MAXV(pca_sample_ct, pca_row_ct), qq_col_ct, &svd_rect_lwork);
5086 #else
5087 if (unlikely(GetSvdRectLwork(MAXV(pca_sample_ct, pca_row_ct), qq_col_ct, &svd_rect_lwork))) {
5088 logerrputs("Error: \"--pca approx\" problem instance too large for this " PROG_NAME_STR " build. If\nthis is really the computation you want, use a " PROG_NAME_STR " build with large-matrix\nsupport.\n");
5089 goto CalcPca_ret_INCONSISTENT_INPUT;
5090 }
5091 #endif
5092 uintptr_t svd_rect_wkspace_size = (svd_rect_lwork + qq_col_ct * qq_col_ct) * sizeof(double);
5093 if (svd_rect_wkspace_size < writebuf_alloc) {
5094 // used as writebuf later
5095 svd_rect_wkspace_size = writebuf_alloc;
5096 }
5097
5098 unsigned char* svd_rect_wkspace;
5099 double* ss;
5100 double* g1;
5101 if (unlikely(
5102 bigstack_alloc_d(qq_col_ct, &ss) ||
5103 bigstack_alloc_d(pca_row_ct * qq_col_ct, &qq) ||
5104 bigstack_alloc_dp(calc_thread_ct, &ctx.y_transpose_bufs) ||
5105 bigstack_alloc_dp(calc_thread_ct, &ctx.g2_bb_part_bufs) ||
5106 bigstack_alloc_uc(svd_rect_wkspace_size, &svd_rect_wkspace) ||
5107 bigstack_alloc_d(gg_size, &g1))) {
5108 goto CalcPca_ret_NOMEM;
5109 }
5110 const uintptr_t yy_alloc_incr = RoundUpPow2(kPcaVariantBlockSize * pca_sample_ct * sizeof(double), kCacheline);
5111 const uintptr_t b_size = pca_sample_ct * qq_col_ct;
5112 const uintptr_t g2_bb_part_alloc = RoundUpPow2(b_size * sizeof(double), kCacheline);
5113 // bugfix (16 Jan 2020)
5114 const uintptr_t per_thread_alloc = 3 * yy_alloc_incr + g2_bb_part_alloc;
5115
5116 const uintptr_t bigstack_avail = bigstack_left();
5117 if (per_thread_alloc * calc_thread_ct > bigstack_avail) {
5118 if (unlikely(bigstack_avail < per_thread_alloc)) {
5119 goto CalcPca_ret_NOMEM;
5120 }
5121 calc_thread_ct = bigstack_avail / per_thread_alloc;
5122 }
5123 const uintptr_t yy_main_alloc = RoundUpPow2(kPcaVariantBlockSize * calc_thread_ct * pca_sample_ct * sizeof(double), kCacheline);
5124 ctx.yy_bufs[0] = S_CAST(double*, bigstack_alloc_raw(yy_main_alloc));
5125 ctx.yy_bufs[1] = S_CAST(double*, bigstack_alloc_raw(yy_main_alloc));
5126 for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
5127 ctx.y_transpose_bufs[tidx] = S_CAST(double*, bigstack_alloc_raw(yy_alloc_incr));
5128 ctx.g2_bb_part_bufs[tidx] = S_CAST(double*, bigstack_alloc_raw(g2_bb_part_alloc));
5129 }
5130 FillGaussianDArr(gg_size / 2, max_thread_ct, sfmtp, g1);
5131 ctx.g1 = g1;
5132 #ifdef __APPLE__
5133 fputs("Projecting random vectors... ", stdout);
5134 #else
5135 printf("Projecting random vectors (%u compute thread%s)... ", calc_thread_ct, (calc_thread_ct == 1)? "" : "s");
5136 #endif
5137 fflush(stdout);
5138 for (uint32_t iter_idx = 0; iter_idx <= pc_ct; ++iter_idx) {
5139 // kjg_fpca_XTXA(), kjg_fpca_XA()
5140 if (iter_idx < pc_ct) {
5141 SetThreadFuncAndData(CalcPcaXtxaThread, &ctx, &tg);
5142 } else {
5143 SetThreadFuncAndData(CalcPcaXaThread, &ctx, &tg);
5144 }
5145 for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
5146 ZeroDArr(gg_size, ctx.g2_bb_part_bufs[tidx]);
5147 }
5148 double* qq_iter = &(qq[iter_idx * pc_ct_x2]); // offset on first row
5149 ctx.qq = qq_iter;
5150
5151 // Main workflow:
5152 // 1. Set n=0, load batch 0
5153 //
5154 // 2. Spawn threads processing batch n
5155 // 3. Increment n by 1
5156 // 4. Load batch n unless eof
5157 // 5. Join threads
5158 // 6. Goto step 2 unless eof
5159 //
5160 // 7. Assemble next g1 by summing g2_parts
5161 uint32_t cur_batch_size = calc_thread_ct * kPcaVariantBlockSize;
5162 uint32_t variant_idx = 0;
5163 uintptr_t variant_uidx = 0;
5164 uintptr_t allele_idx_base = 0;
5165 uint32_t incomplete_allele_idx = 0;
5166 uint32_t parity = 0;
5167 uint32_t is_not_first_block = 0;
5168 while (1) {
5169 if (!IsLastBlock(&tg)) {
5170 reterr = LoadCenteredVarmajBlock(pca_sample_include, pssi, variant_include, allele_idx_offsets, allele_freqs, 1, is_haploid, pca_sample_ct, variant_ct, simple_pgrp, ctx.yy_bufs[parity], nullptr, &cur_batch_size, &variant_idx, &variant_uidx, &allele_idx_base, &cur_allele_ct, &incomplete_allele_idx, &pgv, allele_1copy_buf);
5171 if (unlikely(reterr)) {
5172 goto CalcPca_ret_PGR_FAIL;
5173 }
5174 }
5175 if (is_not_first_block) {
5176 JoinThreads(&tg);
5177 if (IsLastBlock(&tg)) {
5178 break;
5179 }
5180 }
5181 ctx.cur_batch_size = cur_batch_size;
5182 if (variant_idx == variant_ct) {
5183 DeclareLastThreadBlock(&tg);
5184 cur_batch_size = 0;
5185 }
5186 if (unlikely(SpawnThreads(&tg))) {
5187 goto CalcPca_ret_THREAD_CREATE_FAIL;
5188 }
5189 is_not_first_block = 1;
5190 parity = 1 - parity;
5191 }
5192 if (iter_idx < pc_ct) {
5193 memcpy(g1, ctx.g2_bb_part_bufs[0], gg_size * sizeof(double));
5194 for (uint32_t tidx = 1; tidx != calc_thread_ct; ++tidx) {
5195 const double* cur_g2_part = ctx.g2_bb_part_bufs[tidx];
5196 for (uintptr_t ulii = 0; ulii != gg_size; ++ulii) {
5197 g1[ulii] += cur_g2_part[ulii];
5198 }
5199 }
5200 for (uintptr_t ulii = 0; ulii != gg_size; ++ulii) {
5201 g1[ulii] *= variant_ct_recip;
5202 }
5203 }
5204 #ifdef __APPLE__
5205 printf("\rProjecting random vectors... %u/%u", iter_idx + 1, pc_ct + 1);
5206 #else
5207 printf("\rProjecting random vectors (%u compute thread%s)... %u/%u", calc_thread_ct, (calc_thread_ct == 1)? "" : "s", iter_idx + 1, pc_ct + 1);
5208 #endif
5209 fflush(stdout);
5210 }
5211 fputs(".\n", stdout);
5212 logputs("Computing SVD of Krylov matrix... ");
5213 fflush(stdout);
5214 BLAS_SET_NUM_THREADS(max_thread_ct);
5215 IntErr svd_rect_err = SvdRect(pca_row_ct, qq_col_ct, svd_rect_lwork, qq, ss, svd_rect_wkspace);
5216 if (unlikely(svd_rect_err)) {
5217 logputs("\n");
5218 snprintf(g_logbuf, kLogbufSize, "Error: Failed to compute SVD of Krylov matrix (DGESVD info=%d).\n", S_CAST(int32_t, svd_rect_err));
5219 goto CalcPca_ret_DEGENERATE_DATA_2;
5220 }
5221 BLAS_SET_NUM_THREADS(1);
5222 logputs("done.\nRecovering top PCs from range approximation... ");
5223 fflush(stdout);
5224
5225 // kjg_fpca_XTB()
5226 for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
5227 ZeroDArr(b_size, ctx.g2_bb_part_bufs[tidx]);
5228 }
5229 SetThreadFuncAndData(CalcPcaXtbThread, &ctx, &tg);
5230 ctx.qq = qq;
5231 uint32_t cur_batch_size = calc_thread_ct * kPcaVariantBlockSize;
5232 uint32_t variant_idx = 0;
5233 uintptr_t variant_uidx = 0;
5234 uintptr_t allele_idx_base = 0;
5235 uint32_t incomplete_allele_idx = 0;
5236 uint32_t parity = 0;
5237 uint32_t is_not_first_block = 0;
5238 while (1) {
5239 if (!IsLastBlock(&tg)) {
5240 reterr = LoadCenteredVarmajBlock(pca_sample_include, pssi, variant_include, allele_idx_offsets, allele_freqs, 1, is_haploid, pca_sample_ct, variant_ct, simple_pgrp, ctx.yy_bufs[parity], nullptr, &cur_batch_size, &variant_idx, &variant_uidx, &allele_idx_base, &cur_allele_ct, &incomplete_allele_idx, &pgv, allele_1copy_buf);
5241 if (unlikely(reterr)) {
5242 // this error *didn't* happen on an earlier pass, so assign blame
5243 // to I/O instead
5244 goto CalcPca_ret_REWIND_FAIL;
5245 }
5246 }
5247 if (is_not_first_block) {
5248 JoinThreads(&tg);
5249 if (IsLastBlock(&tg)) {
5250 break;
5251 }
5252 }
5253 ctx.cur_batch_size = cur_batch_size;
5254 if (variant_idx == variant_ct) {
5255 DeclareLastThreadBlock(&tg);
5256 cur_batch_size = 0;
5257 }
5258 if (unlikely(SpawnThreads(&tg))) {
5259 goto CalcPca_ret_THREAD_CREATE_FAIL;
5260 }
5261 is_not_first_block = 1;
5262 parity = 1 - parity;
5263 }
5264 double* bb = ctx.g2_bb_part_bufs[0];
5265 for (uint32_t tidx = 1; tidx != calc_thread_ct; ++tidx) {
5266 const double* cur_bb_part = ctx.g2_bb_part_bufs[tidx];
5267 for (uintptr_t ulii = 0; ulii != b_size; ++ulii) {
5268 bb[ulii] += cur_bb_part[ulii];
5269 }
5270 }
5271 BLAS_SET_NUM_THREADS(max_thread_ct);
5272 svd_rect_err = SvdRect(pca_sample_ct, qq_col_ct, svd_rect_lwork, bb, ss, svd_rect_wkspace);
5273 if (unlikely(svd_rect_err)) {
5274 logputs("\n");
5275 snprintf(g_logbuf, kLogbufSize, "Error: Failed to compute SVD of final matrix (DGESVD info=%d).\n", S_CAST(int32_t, svd_rect_err));
5276 goto CalcPca_ret_DEGENERATE_DATA_2;
5277 }
5278 BLAS_SET_NUM_THREADS(1);
5279 logputs("done.\n");
5280 eigvecs_smaj = g1;
5281 for (uint32_t sample_idx = 0; sample_idx != pca_sample_ct; ++sample_idx) {
5282 memcpy(&(eigvecs_smaj[sample_idx * S_CAST(uintptr_t, pc_ct)]), &(bb[sample_idx * qq_col_ct]), pc_ct * sizeof(double));
5283 }
5284 for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
5285 eigvals[pc_idx] = ss[pc_idx] * ss[pc_idx] * variant_ct_recip;
5286 }
5287 writebuf = R_CAST(char*, svd_rect_wkspace);
5288 // bugfix (25 Jun 2018): eigvals[] computation was missing a divide-by-2
5289 // somewhere, in both diploid and haploid cases.
5290 // update (30 Jan 2019): er, actually, no.
5291 if (is_haploid) {
5292 for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
5293 eigvals[pc_idx] *= 0.5;
5294 }
5295 }
5296 } else {
5297 __CLPK_integer lwork;
5298 __CLPK_integer liwork;
5299 uintptr_t wkspace_byte_ct;
5300 if (unlikely(GetExtractEigvecsLworks(pca_sample_ct, pc_ct, &lwork, &liwork, &wkspace_byte_ct))) {
5301 goto CalcPca_ret_NOMEM;
5302 }
5303 const uintptr_t eigvecs_smaj_alloc = pc_ct * pca_sample_ct * sizeof(double);
5304 if (wkspace_byte_ct < eigvecs_smaj_alloc) {
5305 wkspace_byte_ct = eigvecs_smaj_alloc;
5306 }
5307 double* reverse_eigvecs_pcmaj;
5308 unsigned char* extract_eigvecs_wkspace;
5309 if (unlikely(
5310 bigstack_alloc_d(pc_ct * pca_sample_ct, &reverse_eigvecs_pcmaj) ||
5311 bigstack_alloc_uc(wkspace_byte_ct, &extract_eigvecs_wkspace))) {
5312 goto CalcPca_ret_NOMEM;
5313 }
5314 logprintf("Extracting eigenvalue%s and eigenvector%s... ", (pc_ct == 1)? "" : "s", (pc_ct == 1)? "" : "s");
5315 fflush(stdout);
5316 BLAS_SET_NUM_THREADS(max_thread_ct);
5317 // not putting unlikely() here for now.
5318 if (ExtractEigvecs(pca_sample_ct, pc_ct, lwork, liwork, grm, eigvals, reverse_eigvecs_pcmaj, extract_eigvecs_wkspace)) {
5319 logerrputs("Error: Failed to extract eigenvector(s) from GRM.\n");
5320 goto CalcPca_ret_DEGENERATE_DATA;
5321 }
5322 BLAS_SET_NUM_THREADS(1);
5323 logputs("done.\n");
5324 eigvecs_smaj = R_CAST(double*, extract_eigvecs_wkspace);
5325 BigstackShrinkTop(eigvecs_smaj, eigvecs_smaj_alloc);
5326 if (unlikely(bigstack_alloc_c(writebuf_alloc, &writebuf))) {
5327 goto CalcPca_ret_NOMEM;
5328 }
5329
5330 // ExtractEigvecs() results are in reverse order, and we also need to
5331 // transpose eigenvectors to sample-major
5332 const uint32_t pc_ct_m1 = pc_ct - 1;
5333 const uint32_t pc_ct_div2 = pc_ct / 2;
5334 for (uint32_t pc_idx = 0; pc_idx != pc_ct_div2; ++pc_idx) {
5335 double tmp_eigval = eigvals[pc_idx];
5336 eigvals[pc_idx] = eigvals[pc_ct_m1 - pc_idx];
5337 eigvals[pc_ct_m1 - pc_idx] = tmp_eigval;
5338 }
5339 double* eigvecs_smaj_iter = eigvecs_smaj;
5340 for (uint32_t sample_idx = 0; sample_idx != pca_sample_ct; ++sample_idx) {
5341 uintptr_t pc_inv_idx = pc_ct;
5342 const double* reverse_eigvecs_col = &(reverse_eigvecs_pcmaj[sample_idx]);
5343 do {
5344 --pc_inv_idx;
5345 *eigvecs_smaj_iter++ = reverse_eigvecs_col[pc_inv_idx * pca_sample_ct];
5346 } while (pc_inv_idx);
5347 }
5348 }
5349 // (later: --pca-cluster-names, --pca-clusters)
5350 char* writebuf_flush = &(writebuf[kMaxMediumLine]);
5351
5352 if (wts_requested) {
5353 CalcPcaVarWtsCtx vwctx;
5354 vwctx.sample_ct = pca_sample_ct;
5355 vwctx.pc_ct = pc_ct;
5356 vwctx.sample_wts_smaj = eigvecs_smaj;
5357 for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
5358 eigval_inv_sqrts[pc_idx] = 1.0 / sqrt(eigvals[pc_idx]);
5359 }
5360
5361 const uint32_t allele_wts = (pca_flags / kfPcaAlleleWts) & 1;
5362 const uint32_t output_zst = (pca_flags / kfPcaVarZs) & 1;
5363 if (allele_wts) {
5364 OutnameZstSet(".eigenvec.allele", output_zst, outname_end);
5365 } else {
5366 OutnameZstSet(".eigenvec.var", output_zst, outname_end);
5367 }
5368 reterr = InitCstream(outname, 0, output_zst, max_thread_ct, overflow_buf_size, writebuf, R_CAST(unsigned char*, &(writebuf[overflow_buf_size])), &css);
5369 if (unlikely(reterr)) {
5370 goto CalcPca_ret_1;
5371 }
5372 cswritep = writebuf;
5373 *cswritep++ = '#';
5374 if (chr_buf) {
5375 cswritep = strcpya_k(cswritep, "CHROM\t");
5376 }
5377 if (pca_flags & kfPcaVcolPos) {
5378 cswritep = strcpya_k(cswritep, "POS\t");
5379 } else {
5380 variant_bps = nullptr;
5381 }
5382 cswritep = strcpya_k(cswritep, "ID");
5383 if (pca_flags & kfPcaVcolRef) {
5384 cswritep = strcpya_k(cswritep, "\tREF");
5385 }
5386 if (pca_flags & kfPcaVcolAlt1) {
5387 cswritep = strcpya_k(cswritep, "\tALT1");
5388 }
5389 if (pca_flags & kfPcaVcolAlt) {
5390 cswritep = strcpya_k(cswritep, "\tALT");
5391 }
5392 if (allele_wts) {
5393 cswritep = strcpya_k(cswritep, "\tA1");
5394 }
5395 if (pca_flags & kfPcaVcolAx) {
5396 cswritep = strcpya_k(cswritep, "\tAX");
5397 }
5398 if (pca_flags & kfPcaVcolMaj) {
5399 cswritep = strcpya_k(cswritep, "\tMAJ");
5400 }
5401 if (pca_flags & kfPcaVcolNonmaj) {
5402 cswritep = strcpya_k(cswritep, "\tNONMAJ");
5403 }
5404 for (uint32_t pc_idx = 1; pc_idx <= pc_ct; ++pc_idx) {
5405 cswritep = strcpya_k(cswritep, "\tPC");
5406 cswritep = u32toa(pc_idx, cswritep);
5407 }
5408 AppendBinaryEoln(&cswritep);
5409
5410 // Main workflow:
5411 // 1. Set n=0, load batch 0
5412 //
5413 // 2. Spawn threads processing batch n
5414 // 3. If n>0, write results and update projection for block (n-1)
5415 // 4. Increment n by 1
5416 // 5. Load batch n unless eof
5417 // 6. Join threads
5418 // 7. Goto step 2 unless eof
5419 //
5420 // 8. Write results and update projection for last block
5421 #ifndef __APPLE__
5422 if (output_zst) {
5423 // compression is relatively expensive?
5424 calc_thread_ct = 1;
5425 }
5426 #endif
5427 uintptr_t var_wts_part_size;
5428 double* var_wts = qq;
5429 if (var_wts) {
5430 var_wts_part_size = (MINV(pca_row_ct, calc_thread_ct * kPcaVariantBlockSize)) * S_CAST(uintptr_t, pc_ct);
5431 vwctx.yy_bufs[0] = ctx.yy_bufs[0];
5432 vwctx.yy_bufs[1] = ctx.yy_bufs[1];
5433 vwctx.var_wts = ctx.qq;
5434 } else {
5435 // non-approximate PCA, some buffers have not been allocated yet
5436
5437 // if grm[] (which we no longer need) has at least as much remaining
5438 // space as bigstack, allocate from grm
5439 unsigned char* arena_bottom = R_CAST(unsigned char*, grm);
5440 unsigned char* arena_top = bigstack_mark;
5441 uintptr_t arena_avail = arena_top - arena_bottom;
5442 if (arena_avail < bigstack_left()) {
5443 arena_bottom = g_bigstack_base;
5444 arena_top = g_bigstack_end;
5445 arena_avail = bigstack_left();
5446 }
5447 const uintptr_t var_wts_part_alloc = RoundUpPow2(2 * kPcaVariantBlockSize * sizeof(double) * pc_ct, kCacheline);
5448 const uintptr_t yy_alloc_incr = RoundUpPow2(kPcaVariantBlockSize * pca_sample_ct * sizeof(double), kCacheline);
5449 const uintptr_t per_thread_alloc = 2 * yy_alloc_incr + var_wts_part_alloc;
5450 if (per_thread_alloc * calc_thread_ct > arena_avail) {
5451 if (unlikely(arena_avail < per_thread_alloc)) {
5452 goto CalcPca_ret_NOMEM;
5453 }
5454 calc_thread_ct = arena_avail / per_thread_alloc;
5455 }
5456 const uintptr_t yy_main_alloc = RoundUpPow2(kPcaVariantBlockSize * calc_thread_ct * pca_sample_ct * sizeof(double), kCacheline);
5457 vwctx.yy_bufs[0] = S_CAST(double*, arena_alloc_raw(yy_main_alloc, &arena_bottom));
5458 vwctx.yy_bufs[1] = S_CAST(double*, arena_alloc_raw(yy_main_alloc, &arena_bottom));
5459 var_wts_part_size = (MINV(pca_row_ct, calc_thread_ct * kPcaVariantBlockSize)) * S_CAST(uintptr_t, pc_ct);
5460 var_wts = S_CAST(double*, arena_alloc_raw_rd(2 * var_wts_part_size * sizeof(double), &arena_bottom));
5461 vwctx.var_wts = var_wts;
5462 #ifndef NDEBUG
5463 if (arena_top == g_bigstack_end) {
5464 // we shouldn't make any more allocations, but just in case...
5465 g_bigstack_base = arena_bottom;
5466 }
5467 #endif
5468 }
5469 if (unlikely(SetThreadCt(calc_thread_ct, &tg))) {
5470 goto CalcPca_ret_NOMEM;
5471 }
5472 SetThreadFuncAndData(CalcPcaVarWtsThread, &vwctx, &tg);
5473 uint32_t prev_batch_size = 0;
5474 uint32_t cur_batch_size = calc_thread_ct * kPcaVariantBlockSize;
5475
5476 uint32_t variant_idx_load = 0;
5477 uintptr_t variant_uidx_load = 0;
5478 uintptr_t allele_idx_base_load = 0;
5479 uint32_t cur_allele_ct_load = 2;
5480 uint32_t incomplete_allele_idx_load = 0;
5481
5482 uint32_t variant_idx_write = 0;
5483 uintptr_t variant_uidx_write = 0;
5484 uintptr_t allele_idx_offset_write = 0;
5485 // cur_allele_ct = 2;
5486 uint32_t incomplete_allele_idx_write = 0;
5487 uint32_t chr_fo_idx = UINT32_MAX;
5488 uint32_t chr_end = 0;
5489 uint32_t chr_buf_blen = 0;
5490
5491 uint32_t parity = 0;
5492 uint32_t is_not_first_block = 0;
5493 while (1) {
5494 if (!IsLastBlock(&tg)) {
5495 reterr = LoadCenteredVarmajBlock(pca_sample_include, pssi, variant_include, allele_idx_offsets, allele_freqs, 1, is_haploid, pca_sample_ct, variant_ct, simple_pgrp, vwctx.yy_bufs[parity], nullptr, &cur_batch_size, &variant_idx_load, &variant_uidx_load, &allele_idx_base_load, &cur_allele_ct_load, &incomplete_allele_idx_load, &pgv, allele_1copy_buf);
5496 if (unlikely(reterr)) {
5497 goto CalcPca_ret_PGR_FAIL;
5498 }
5499 }
5500 if (is_not_first_block) {
5501 JoinThreads(&tg);
5502 }
5503 if (!IsLastBlock(&tg)) {
5504 vwctx.cur_batch_size = cur_batch_size;
5505 if (variant_idx_load == variant_ct) {
5506 DeclareLastThreadBlock(&tg);
5507 }
5508 if (unlikely(SpawnThreads(&tg))) {
5509 goto CalcPca_ret_THREAD_CREATE_FAIL;
5510 }
5511 }
5512 parity = 1 - parity;
5513 if (is_not_first_block) {
5514 // write *previous* block results
5515 const double* var_wts_iter = &(var_wts[parity * var_wts_part_size]);
5516 // (todo: update projection here)
5517 if (allele_wts) {
5518 reterr = FlushAlleleWts(variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, var_wts_iter, eigval_inv_sqrts, prev_batch_size, pc_ct, pca_flags, &css, &cswritep, chr_buf, &variant_idx_write, &variant_uidx_write, &allele_idx_offset_write, &cur_allele_ct, &incomplete_allele_idx_write, &chr_fo_idx, &chr_end, &chr_buf_blen);
5519 } else {
5520 reterr = FlushBiallelicVarWts(variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, maj_alleles, var_wts_iter, eigval_inv_sqrts, prev_batch_size, pc_ct, pca_flags, &css, &cswritep, chr_buf, &variant_idx_write, &variant_uidx_write, &chr_fo_idx, &chr_end, &chr_buf_blen);
5521 }
5522 if (unlikely(reterr)) {
5523 // only write_fail possible in practice
5524 goto CalcPca_ret_1;
5525 }
5526 if (variant_idx_write == variant_ct) {
5527 break;
5528 }
5529 }
5530 is_not_first_block = 1;
5531 prev_batch_size = cur_batch_size;
5532 }
5533 if (unlikely(CswriteCloseNull(&css, cswritep))) {
5534 goto CalcPca_ret_WRITE_FAIL;
5535 }
5536 logprintfww("--pca%s: %s weights written to %s .\n", is_approx? " approx" : "", allele_wts? "Allele" : "Variant", outname);
5537 }
5538
5539 snprintf(outname_end, kMaxOutfnameExtBlen, ".eigenvec");
5540 if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
5541 goto CalcPca_ret_OPEN_FAIL;
5542 }
5543 char* write_iter = writebuf;
5544 *write_iter++ = '#';
5545 if (write_fid) {
5546 write_iter = strcpya_k(write_iter, "FID\t");
5547 }
5548 write_iter = strcpya_k(write_iter, "IID");
5549 if (write_sid) {
5550 write_iter = strcpya_k(write_iter, "\tSID");
5551 }
5552 for (uint32_t pc_idx = 1; pc_idx <= pc_ct; ++pc_idx) {
5553 write_iter = strcpya_k(write_iter, "\tPC");
5554 write_iter = u32toa(pc_idx, write_iter);
5555 }
5556 AppendBinaryEoln(&write_iter);
5557 const uint32_t sample_ct = pca_sample_ct;
5558 uintptr_t sample_uidx_base = 0;
5559 uintptr_t sample_include_bits = sample_include[0];
5560 for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
5561 const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
5562 const char* cur_sample_id = &(sample_ids[max_sample_id_blen * sample_uidx]);
5563 if (!write_fid) {
5564 cur_sample_id = AdvPastDelim(cur_sample_id, '\t');
5565 }
5566 write_iter = strcpya(write_iter, cur_sample_id);
5567 if (write_sid) {
5568 *write_iter++ = '\t';
5569 if (sids) {
5570 write_iter = strcpya(write_iter, &(sids[max_sid_blen * sample_uidx]));
5571 } else {
5572 *write_iter++ = '0';
5573 }
5574 }
5575 double* sample_wts_iter = &(eigvecs_smaj[sample_idx * pc_ct]);
5576 // todo: read from proj_sample_wts instead when pca_sample_include bit
5577 // not set
5578 for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
5579 *write_iter++ = '\t';
5580 write_iter = dtoa_g(*sample_wts_iter++, write_iter);
5581 }
5582 AppendBinaryEoln(&write_iter);
5583 if (unlikely(fwrite_ck(writebuf_flush, outfile, &write_iter))) {
5584 goto CalcPca_ret_WRITE_FAIL;
5585 }
5586 }
5587 if (unlikely(fclose_flush_null(writebuf_flush, write_iter, &outfile))) {
5588 goto CalcPca_ret_WRITE_FAIL;
5589 }
5590
5591 snprintf(outname_end, kMaxOutfnameExtBlen, ".eigenval");
5592 if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
5593 goto CalcPca_ret_OPEN_FAIL;
5594 }
5595 write_iter = writebuf;
5596 for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
5597 write_iter = dtoa_g(eigvals[pc_idx], write_iter);
5598 AppendBinaryEoln(&write_iter);
5599 }
5600 if (unlikely(fclose_flush_null(writebuf_flush, write_iter, &outfile))) {
5601 goto CalcPca_ret_WRITE_FAIL;
5602 }
5603 *outname_end = '\0';
5604 logprintfww("--pca%s: Eigenvector%s written to %s.eigenvec , and eigenvalue%s written to %s.eigenval .\n", is_approx? " approx" : "", (pc_ct == 1)? "" : "s", outname, (pc_ct == 1)? "" : "s", outname);
5605 }
5606 while (0) {
5607 CalcPca_ret_NOMEM:
5608 reterr = kPglRetNomem;
5609 break;
5610 CalcPca_ret_OPEN_FAIL:
5611 reterr = kPglRetOpenFail;
5612 break;
5613 CalcPca_ret_PGR_FAIL:
5614 PgenErrPrintN(reterr);
5615 break;
5616 CalcPca_ret_REWIND_FAIL:
5617 logerrprintfww(kErrprintfRewind, ".pgen file");
5618 break;
5619 CalcPca_ret_WRITE_FAIL:
5620 reterr = kPglRetWriteFail;
5621 break;
5622 CalcPca_ret_INCONSISTENT_INPUT:
5623 reterr = kPglRetInconsistentInput;
5624 break;
5625 CalcPca_ret_THREAD_CREATE_FAIL:
5626 reterr = kPglRetThreadCreateFail;
5627 break;
5628 CalcPca_ret_DEGENERATE_DATA_2:
5629 logerrputsb();
5630 CalcPca_ret_DEGENERATE_DATA:
5631 reterr = kPglRetDegenerateData;
5632 break;
5633 }
5634 CalcPca_ret_1:
5635 CleanupThreads(&tg);
5636 BLAS_SET_NUM_THREADS(1);
5637 CswriteCloseCond(&css, cswritep);
5638 fclose_cond(outfile);
5639 if (grm) {
5640 // nothing after --pca in the plink2 order of operations uses grm[]
5641 BigstackReset(grm);
5642 } else {
5643 BigstackReset(bigstack_mark);
5644 }
5645 return reterr;
5646 }
5647 #endif
5648
5649 // to test: do we actually want cur_dosage_ints to be uint64_t* instead of
5650 // uint32_t*?
5651 // also, should this be moved to plink2_common?
FillCurDdosageInts(const uintptr_t * genovec_buf,const uintptr_t * dosage_present,const Dosage * dosage_main_buf,uint32_t sample_ct,uint32_t dosage_ct,uint32_t is_diploid_p1,uint64_t * cur_ddosage_ints)5652 void FillCurDdosageInts(const uintptr_t* genovec_buf, const uintptr_t* dosage_present, const Dosage* dosage_main_buf, uint32_t sample_ct, uint32_t dosage_ct, uint32_t is_diploid_p1, uint64_t* cur_ddosage_ints) {
5653 uint64_t lookup_table[32] ALIGNV16;
5654 lookup_table[0] = 0;
5655 lookup_table[2] = is_diploid_p1 * kDosageMid;
5656 lookup_table[4] = is_diploid_p1 * kDosageMax;
5657 lookup_table[6] = 0;
5658 InitLookup16x8bx2(lookup_table);
5659 GenoarrLookup16x8bx2(genovec_buf, lookup_table, sample_ct, cur_ddosage_ints);
5660 if (!dosage_ct) {
5661 return;
5662 }
5663 uintptr_t sample_idx_base = 0;
5664 uintptr_t cur_bits = dosage_present[0];
5665 for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
5666 const uintptr_t sample_idx = BitIter1(dosage_present, &sample_idx_base, &cur_bits);
5667 cur_ddosage_ints[sample_idx] = dosage_main_buf[dosage_idx] * is_diploid_p1;
5668 }
5669 }
5670
// Number of variants loaded and multiplied per --score compute batch; also
// the row stride of the per-batch coefficient/dosage matrices.
CONSTI32(kScoreVariantBlockSize, 240);
5672
// Shared state between ScoreReport (producer) and CalcScoreThread (consumer).
typedef struct CalcScoreCtxStruct {
  // Total score-output column count (score columns, times --q-score-range
  // range count when ranges are present).
  uint32_t score_final_col_ct;
  uint32_t sample_ct;

  // Double buffers: the main thread fills index [parity] while the worker
  // consumes the other; parity alternates every block.
  double* dosages_vmaj[2];      // variant-major dosages for current block
  double* score_coefs_cmaj[2];  // column-major coefficients for current block

  // Variant count in the current block; 0 means nothing to multiply.
  uint32_t cur_batch_size;

  // Column-major running totals, incremented by each block's matrix product.
  double* final_scores_cmaj;
} CalcScoreCtx;
5684
CalcScoreThread(void * raw_arg)5685 THREAD_FUNC_DECL CalcScoreThread(void* raw_arg) {
5686 ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
5687 // don't bother to explicitly multithread for now
5688 assert(!arg->tidx);
5689 CalcScoreCtx* ctx = S_CAST(CalcScoreCtx*, arg->sharedp->context);
5690
5691 double* final_scores_cmaj = ctx->final_scores_cmaj;
5692 const uint32_t score_final_col_ct = ctx->score_final_col_ct;
5693 const uint32_t sample_ct = ctx->sample_ct;
5694 uint32_t parity = 0;
5695 do {
5696 const uint32_t cur_batch_size = ctx->cur_batch_size;
5697 if (cur_batch_size) {
5698 RowMajorMatrixMultiplyStridedIncr(ctx->score_coefs_cmaj[parity], ctx->dosages_vmaj[parity], score_final_col_ct, kScoreVariantBlockSize, sample_ct, sample_ct, cur_batch_size, sample_ct, final_scores_cmaj);
5699 }
5700 parity = 1 - parity;
5701 } while (!THREAD_BLOCK_FINISH(arg));
5702 THREAD_RETURN;
5703 }
5704
// One parsed line of a --q-score-range range file: a named p-value interval.
// Bounds are inclusive on both ends.
typedef struct ParsedQscoreRangeStruct {
  char* range_name;  // null-terminated; storage carved from workspace end
  double lbound;     // inclusive lower bound
  double ubound;     // inclusive upper bound
} ParsedQscoreRange;
5710
ScoreReport(const uintptr_t * sample_include,const SampleIdInfo * siip,const uintptr_t * sex_male,const PhenoCol * pheno_cols,const char * pheno_names,const uintptr_t * variant_include,const ChrInfo * cip,const char * const * variant_ids,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,const double * allele_freqs,const ScoreInfo * score_info_ptr,uint32_t raw_sample_ct,uint32_t sample_ct,uint32_t pheno_ct,uintptr_t max_pheno_name_blen,uint32_t raw_variant_ct,uint32_t variant_ct,uint32_t max_variant_id_slen,uint32_t xchr_model,uint32_t max_thread_ct,PgenReader * simple_pgrp,char * outname,char * outname_end)5711 PglErr ScoreReport(const uintptr_t* sample_include, const SampleIdInfo* siip, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uintptr_t* variant_include, const ChrInfo* cip, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const double* allele_freqs, const ScoreInfo* score_info_ptr, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_variant_id_slen, uint32_t xchr_model, uint32_t max_thread_ct, PgenReader* simple_pgrp, char* outname, char* outname_end) {
5712 unsigned char* bigstack_mark = g_bigstack_base;
5713 unsigned char* bigstack_end_mark = g_bigstack_end;
5714 uintptr_t line_idx = 0;
5715 char* cswritep = nullptr;
5716 PglErr reterr = kPglRetSuccess;
5717 TextStream score_txs;
5718 ThreadGroup tg;
5719 CompressStreamState css;
5720 PreinitTextStream(&score_txs);
5721 PreinitThreads(&tg);
5722 PreinitCstream(&css);
5723 {
5724 const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
5725 if (!xchr_model) {
5726 uint32_t x_code;
5727 if (XymtExists(cip, kChrOffsetX, &x_code)) {
5728 uint32_t x_chr_fo_idx = cip->chr_idx_to_foidx[x_code];
5729 uint32_t x_start = cip->chr_fo_vidx_start[x_chr_fo_idx];
5730 uint32_t x_end = cip->chr_fo_vidx_start[x_chr_fo_idx + 1];
5731 if (!AllBitsAreZero(variant_include, x_start, x_end)) {
5732 uintptr_t* variant_include_no_x;
5733 if (unlikely(bigstack_alloc_w(raw_variant_ctl, &variant_include_no_x))) {
5734 goto ScoreReport_ret_NOMEM;
5735 }
5736 memcpy(variant_include_no_x, variant_include, raw_variant_ctl * sizeof(intptr_t));
5737 ClearBitsNz(x_start, x_end, variant_include_no_x);
5738 variant_include = variant_include_no_x;
5739 }
5740 }
5741 } else if (xchr_model == 2) {
5742 xchr_model = 0;
5743 }
5744 // now xchr_model is set iff it's 1
5745
5746 const ScoreFlags flags = score_info_ptr->flags;
5747 const uint32_t output_zst = (flags / kfScoreZs) & 1;
5748 uint32_t* variant_id_htable = nullptr;
5749 uint32_t variant_id_htable_size = 0;
5750 uint32_t* variant_include_cumulative_popcounts = nullptr;
5751 uintptr_t* qsr_include = nullptr;
5752 char** range_names = nullptr;
5753 uintptr_t qsr_ct = 0;
5754 if (score_info_ptr->qsr_range_fname) {
5755 // Limit this to ~1/8 of available memory, since memory may be tight with
5756 // many ranges.
5757 variant_id_htable_size = GetHtableFastSize(variant_ct);
5758 const uintptr_t htable_size_limit = bigstack_left() / (8 * sizeof(int32_t));
5759 if (variant_id_htable_size > htable_size_limit) {
5760 variant_id_htable_size = htable_size_limit;
5761 const uint32_t htable_size_min = GetHtableMinSize(variant_ct);
5762 if (htable_size_min > variant_id_htable_size) {
5763 variant_id_htable_size = htable_size_min;
5764 }
5765 }
5766 if (unlikely(
5767 bigstack_alloc_u32(variant_id_htable_size, &variant_id_htable))) {
5768 goto ScoreReport_ret_NOMEM;
5769 }
5770 reterr = PopulateIdHtableMt(nullptr, variant_include, variant_ids, variant_ct, 0, variant_id_htable_size, max_thread_ct, nullptr, variant_id_htable, nullptr);
5771 if (unlikely(reterr)) {
5772 goto ScoreReport_ret_1;
5773 }
5774 // Strictly speaking, textFILE would be more appropriate for the range
5775 // file since it should be tiny, but it doesn't really matter.
5776 // We still reserve bigstack_left() / 8 for the line-buffer since the
5777 // data file usually contains allele codes, and we use TextRetarget()
5778 // below to use the buffer allocated here for the data file too (and for
5779 // the score file later).
5780 reterr = SizeAndInitTextStream(score_info_ptr->qsr_range_fname, bigstack_left() / 8, 1, &score_txs);
5781 if (unlikely(reterr)) {
5782 goto ScoreReport_ret_QSR_RANGE_TSTREAM_FAIL;
5783 }
5784 unsigned char* bigstack_mark2 = g_bigstack_base;
5785 // strlen("<prefix>.<range name>.sscore[.zst]") < kPglFnamesize
5786 const uint32_t max_name_slen = kPglFnamesize - S_CAST(uintptr_t, outname_end - outname) - 9 - output_zst * 4;
5787 ParsedQscoreRange* parsed_qscore_ranges = R_CAST(ParsedQscoreRange*, g_bigstack_base);
5788 unsigned char* tmp_alloc_end = g_bigstack_end;
5789 uintptr_t miss_ct = 0;
5790 while (1) {
5791 ++line_idx;
5792 const char* line_start = TextGet(&score_txs);
5793 if (!line_start) {
5794 if (likely(!TextStreamErrcode2(&score_txs, &reterr))) {
5795 break;
5796 }
5797 goto ScoreReport_ret_QSR_RANGE_TSTREAM_FAIL;
5798 }
5799 // range name, p-value lower bound, p-value upper bound
5800 const char* range_name_end = CurTokenEnd(line_start);
5801 const char* lbound_start = FirstNonTspace(range_name_end);
5802 double lbound;
5803 // PLINK 1.9 documentation promises that lines with too few entries or
5804 // nonnumeric values in the second and third column are ignored.
5805 const char* lbound_end = ScantokDouble(lbound_start, &lbound);
5806 if (!lbound_end) {
5807 continue;
5808 }
5809 const char* ubound_start = FirstNonTspace(lbound_end);
5810 double ubound;
5811 const char* ubound_end = ScantokDouble(ubound_start, &ubound);
5812 if (!ubound_end) {
5813 continue;
5814 }
5815 if (unlikely(lbound > ubound)) {
5816 snprintf(g_logbuf, kLogbufSize, "Error: Upper bound < lower bound on line %" PRIuPTR " of --q-score-range range file.\n", line_idx);
5817 goto ScoreReport_ret_MALFORMED_INPUT_WW;
5818 }
5819 const uint32_t name_slen = range_name_end - line_start;
5820 if (name_slen > max_name_slen) {
5821 snprintf(g_logbuf, kLogbufSize, "Error: Name too long on line %" PRIuPTR " of --q-score-range range file.\n", line_idx);
5822 }
5823 unsigned char* tmp_alloc_base = R_CAST(unsigned char*, &(parsed_qscore_ranges[qsr_ct]));
5824 if (S_CAST(uintptr_t, tmp_alloc_end - tmp_alloc_base) <= name_slen + sizeof(ParsedQscoreRange)) {
5825 goto ScoreReport_ret_NOMEM;
5826 }
5827 tmp_alloc_end -= name_slen + 1;
5828 char* stored_name = R_CAST(char*, tmp_alloc_end);
5829 memcpyx(stored_name, line_start, name_slen, '\0');
5830 parsed_qscore_ranges[qsr_ct].range_name = stored_name;
5831 parsed_qscore_ranges[qsr_ct].lbound = lbound;
5832 parsed_qscore_ranges[qsr_ct].ubound = ubound;
5833 ++qsr_ct;
5834 }
5835 if (unlikely(!qsr_ct)) {
5836 logerrputs("Error: Empty --q-score-range range file.\n");
5837 goto ScoreReport_ret_INCONSISTENT_INPUT;
5838 }
5839 BigstackBaseSet(&(parsed_qscore_ranges[qsr_ct]));
5840 BigstackEndSet(tmp_alloc_end);
5841 #ifndef LAPACK_ILP64
5842 if (unlikely(qsr_ct > (0x7fffffff / kScoreVariantBlockSize))) {
5843 logerrputs("Error: --q-score-range range count too large for this " PROG_NAME_STR " build. If this is\nreally the computation you want, use a " PROG_NAME_STR " build with large-matrix support.\n");
5844 goto ScoreReport_ret_INCONSISTENT_INPUT;
5845 }
5846 # ifndef __LP64__
5847 const uint64_t bit_ct = S_CAST(uint64_t, qsr_ct) * variant_ct;
5848 if (unlikely(bit_ct > 0xffffffffU)) {
5849 goto ScoreReport_ret_NOMEM;
5850 }
5851 # endif
5852 #endif
5853 if (unlikely(
5854 (g_bigstack_base > g_bigstack_end) ||
5855 bigstack_end_alloc_u32(raw_variant_ctl, &variant_include_cumulative_popcounts) ||
5856 bigstack_end_calloc_w(BitCtToWordCt(S_CAST(uint64_t, qsr_ct) * variant_ct), &qsr_include) ||
5857 bigstack_end_alloc_cp(qsr_ct, &range_names))) {
5858 goto ScoreReport_ret_NOMEM;
5859 }
5860 for (uintptr_t qsr_idx = 0; qsr_idx != qsr_ct; ++qsr_idx) {
5861 range_names[qsr_idx] = parsed_qscore_ranges[qsr_idx].range_name;
5862 }
5863 const uint32_t variant_ctl = BitCtToWordCt(variant_ct);
5864 uintptr_t* already_seen;
5865 if (unlikely(
5866 bigstack_calloc_w(variant_ctl, &already_seen))) {
5867 goto ScoreReport_ret_NOMEM;
5868 }
5869 FillCumulativePopcounts(variant_include, raw_variant_ctl, variant_include_cumulative_popcounts);
5870 reterr = TextRetarget(score_info_ptr->qsr_data_fname, &score_txs);
5871 if (unlikely(reterr)) {
5872 goto ScoreReport_ret_QSR_RANGE_TSTREAM_FAIL;
5873 }
5874 const uint32_t colid_first = (score_info_ptr->qsr_varid_col_p1 < score_info_ptr->qsr_val_col_p1);
5875 uint32_t colmin;
5876 uint32_t coldiff;
5877 if (colid_first) {
5878 colmin = score_info_ptr->qsr_varid_col_p1 - 1;
5879 coldiff = score_info_ptr->qsr_val_col_p1 - score_info_ptr->qsr_varid_col_p1;
5880 } else {
5881 colmin = score_info_ptr->qsr_val_col_p1 - 1;
5882 coldiff = score_info_ptr->qsr_varid_col_p1 - score_info_ptr->qsr_val_col_p1;
5883 }
5884 line_idx = 0;
5885 miss_ct = 0;
5886 if (flags & kfScoreQsrHeader) {
5887 ++line_idx;
5888 if (unlikely(!TextGet(&score_txs))) {
5889 if (!TextStreamErrcode2(&score_txs, &reterr)) {
5890 logerrputs("Error: Empty --q-score-range data file.\n");
5891 goto ScoreReport_ret_MALFORMED_INPUT;
5892 }
5893 goto ScoreReport_ret_QSR_DATA_TSTREAM_FAIL;
5894 }
5895 }
5896 double* min_vals = nullptr;
5897 if (flags & kfScoreQsrMin) {
5898 // something like this is needed to handle --glm output for
5899 // multiallelic variants.
5900 // (possible todo: --glm modifier which requests all-allele joint tests
5901 // for multiallelic variants)
5902 if (unlikely(bigstack_alloc_d(variant_ct, &min_vals))) {
5903 goto ScoreReport_ret_NOMEM;
5904 }
5905 }
5906 while (1) {
5907 ++line_idx;
5908 const char* line_start = TextGet(&score_txs);
5909 if (!line_start) {
5910 if (likely(!TextStreamErrcode2(&score_txs, &reterr))) {
5911 break;
5912 }
5913 goto ScoreReport_ret_QSR_DATA_TSTREAM_FAIL;
5914 }
5915 const char* colid_ptr;
5916 const char* colval_ptr;
5917 if (colid_first) {
5918 colid_ptr = NextTokenMult0(line_start, colmin);
5919 colval_ptr = NextTokenMult(colid_ptr, coldiff);
5920 if (unlikely(!colval_ptr)) {
5921 goto ScoreReport_ret_QSR_DATA_MISSING_TOKENS;
5922 }
5923 } else {
5924 colval_ptr = NextTokenMult0(line_start, colmin);
5925 colid_ptr = NextTokenMult(colval_ptr, coldiff);
5926 if (unlikely(!colid_ptr)) {
5927 goto ScoreReport_ret_QSR_DATA_MISSING_TOKENS;
5928 }
5929 }
5930 const uint32_t varid_slen = strlen_se(colid_ptr);
5931 const uint32_t variant_uidx = VariantIdDupflagHtableFind(colid_ptr, variant_ids, variant_id_htable, varid_slen, variant_id_htable_size, max_variant_id_slen);
5932 if ((variant_uidx >> 31) || (!IsSet(variant_include, variant_uidx))) {
5933 ++miss_ct;
5934 continue;
5935 }
5936 double cur_val;
5937 if (!ScantokDouble(colval_ptr, &cur_val)) {
5938 // Tolerate NA without erroring out. (Could count this as seen, but
          // that would force the min_vals logic to be more complicated.)
5940 const char* colval_end = CurTokenEnd(colval_ptr);
5941 if (likely(IsNanStr(colval_ptr, colval_end - colval_ptr))) {
5942 continue;
5943 }
5944 *K_CAST(char*, colval_end) = '\0';
5945 logerrprintfww("Error: Invalid value '%s' on line %" PRIuPTR " of --q-score-range data file.\n", colval_ptr, line_idx);
5946 goto ScoreReport_ret_MALFORMED_INPUT;
5947 }
5948 const uint32_t variant_idx = RawToSubsettedPos(variant_include, variant_include_cumulative_popcounts, variant_uidx);
5949 const uintptr_t bit_idx_base = variant_idx * qsr_ct;
5950 if (min_vals) {
5951 if (IsSet(already_seen, variant_idx)) {
5952 if (min_vals[variant_idx] <= cur_val) {
5953 continue;
5954 }
5955 ClearBitsNz(bit_idx_base, bit_idx_base + qsr_ct, qsr_include);
5956 }
5957 min_vals[variant_idx] = cur_val;
5958 } else {
5959 if (IsSet(already_seen, variant_idx)) {
5960 logerrprintfww("Error: Duplicate ID '%s' in --q-score-range data file. (Add the 'min' modifier if this is a multiallelic variant that you want to use the minimum p-value for.)\n", variant_ids[variant_uidx]);
5961 goto ScoreReport_ret_MALFORMED_INPUT;
5962 }
5963 }
5964 SetBit(variant_idx, already_seen);
5965 for (uintptr_t qsr_idx = 0; qsr_idx != qsr_ct; ++qsr_idx) {
5966 if ((cur_val < parsed_qscore_ranges[qsr_idx].lbound) || (cur_val > parsed_qscore_ranges[qsr_idx].ubound)) {
5967 continue;
5968 }
5969 SetBit(bit_idx_base + qsr_idx, qsr_include);
5970 }
5971 }
5972 const uint32_t qsr_variant_ct = PopcountWords(already_seen, variant_ctl);
5973 if (unlikely(!qsr_variant_ct)) {
5974 logerrputs("Error: No valid entries in --q-score-range data file.\n");
5975 goto ScoreReport_ret_INCONSISTENT_INPUT;
5976 }
5977 logprintf("--q-score-range: %" PRIuPTR " range%s and %u variant%s loaded.\n", qsr_ct, (qsr_ct == 1)? "" : "s", qsr_variant_ct, (qsr_variant_ct == 1)? "" : "s");
5978 if (miss_ct) {
5979 logerrprintf("Warning: %" PRIuPTR " line%s skipped in --q-score-range data file.\n", miss_ct, (miss_ct == 1)? "" : "s");
5980 }
5981 // possible todo: replace variant_include with already_seen, and compact
5982 // qsr_include.
5983 // but for now, we just free already_seen, and in the common use cases
5984 // this should be fine.
5985 reterr = TextRetarget(score_info_ptr->input_fname, &score_txs);
5986 if (unlikely(reterr)) {
5987 goto ScoreReport_ret_QSR_DATA_TSTREAM_FAIL;
5988 }
5989 BigstackReset(bigstack_mark2);
5990 line_idx = 0;
5991 } else {
5992 reterr = SizeAndInitTextStream(score_info_ptr->input_fname, bigstack_left() / 8, 1, &score_txs);
5993 if (unlikely(reterr)) {
5994 goto ScoreReport_ret_TSTREAM_FAIL;
5995 }
5996 }
5997 uint32_t lines_to_skip_p1 = 1 + ((flags / kfScoreHeaderIgnore) & 1);
5998 char* line_start;
5999 for (uint32_t uii = 0; uii != lines_to_skip_p1; ++uii) {
6000 ++line_idx;
6001 line_start = TextGet(&score_txs);
6002 if (unlikely(!line_start)) {
6003 if (!TextStreamErrcode2(&score_txs, &reterr)) {
6004 logerrputs("Error: Empty --score file.\n");
6005 goto ScoreReport_ret_MALFORMED_INPUT;
6006 }
6007 goto ScoreReport_ret_TSTREAM_FAIL;
6008 }
6009 }
6010 uint32_t last_col_idx = CountTokens(line_start);
6011 const uint32_t varid_col_idx = score_info_ptr->varid_col_p1 - 1;
6012 const uint32_t allele_col_idx = score_info_ptr->allele_col_p1 - 1;
6013 if (unlikely(MAXV(varid_col_idx, allele_col_idx) >= last_col_idx)) {
6014 goto ScoreReport_ret_MISSING_TOKENS;
6015 }
6016 uint32_t* score_col_idx_deltas = nullptr;
6017 uintptr_t score_col_ct = 1;
6018 if (!score_info_ptr->input_col_idx_range_list.name_ct) {
6019 if (unlikely(allele_col_idx == last_col_idx)) {
6020 goto ScoreReport_ret_MISSING_TOKENS;
6021 }
6022 if (unlikely(bigstack_alloc_u32(1, &score_col_idx_deltas))) {
6023 goto ScoreReport_ret_NOMEM;
6024 }
6025 // catch edge case
6026 if (unlikely(allele_col_idx + 1 == varid_col_idx)) {
6027 logerrputs("Error: --score variant ID column index matches a coefficient column index.\n");
6028 goto ScoreReport_ret_INVALID_CMDLINE;
6029 }
6030 score_col_idx_deltas[0] = allele_col_idx + 1;
6031 } else {
6032 unsigned char* bigstack_end_mark2 = g_bigstack_end;
6033 const uint32_t last_col_idxl = BitCtToWordCt(last_col_idx);
6034 uintptr_t* score_col_bitarr;
6035 if (unlikely(bigstack_end_calloc_w(last_col_idxl, &score_col_bitarr))) {
6036 goto ScoreReport_ret_NOMEM;
6037 }
6038 if (unlikely(NumericRangeListToBitarr(&(score_info_ptr->input_col_idx_range_list), last_col_idx, 1, 0, score_col_bitarr))) {
6039 goto ScoreReport_ret_MISSING_TOKENS;
6040 }
6041 if (unlikely(IsSet(score_col_bitarr, varid_col_idx))) {
6042 logerrputs("Error: --score variant ID column index matches a coefficient column index.\n");
6043 goto ScoreReport_ret_INVALID_CMDLINE;
6044 }
6045 if (unlikely(IsSet(score_col_bitarr, allele_col_idx))) {
6046 logerrputs("Error: --score allele column index matches a coefficient column index.\n");
6047 goto ScoreReport_ret_INVALID_CMDLINE;
6048 }
6049 score_col_ct = PopcountWords(score_col_bitarr, last_col_idxl);
6050 if (unlikely(bigstack_alloc_u32(score_col_ct, &score_col_idx_deltas))) {
6051 goto ScoreReport_ret_NOMEM;
6052 }
6053 uintptr_t col_uidx_base = 0;
6054 uintptr_t score_col_bitarr_bits = score_col_bitarr[0];
6055 for (uintptr_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6056 const uint32_t col_uidx = BitIter1(score_col_bitarr, &col_uidx_base, &score_col_bitarr_bits);
6057 score_col_idx_deltas[score_col_idx] = col_uidx;
6058 }
6059 // now convert to deltas
6060 for (uintptr_t score_col_idx = score_col_ct - 1; score_col_idx; --score_col_idx) {
6061 score_col_idx_deltas[score_col_idx] -= score_col_idx_deltas[score_col_idx - 1];
6062 }
6063 BigstackEndReset(bigstack_end_mark2);
6064 }
6065 char** score_col_names;
6066 if (unlikely(bigstack_alloc_cp(score_col_ct, &score_col_names))) {
6067 goto ScoreReport_ret_NOMEM;
6068 }
6069 char* write_iter = R_CAST(char*, g_bigstack_base);
6070 // don't have to worry about overflow, since linebuf was limited to 1/8
6071 // of available workspace.
6072 if (flags & kfScoreHeaderRead) {
6073 char* read_iter = line_start;
6074 for (uintptr_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6075 read_iter = NextTokenMult0(read_iter, score_col_idx_deltas[score_col_idx]);
6076 if (unlikely(!read_iter)) {
6077 goto ScoreReport_ret_MISSING_TOKENS;
6078 }
6079 score_col_names[score_col_idx] = write_iter;
6080 char* token_end = CurTokenEnd(read_iter);
6081 const uint32_t slen = token_end - read_iter;
6082 write_iter = memcpyax(write_iter, read_iter, slen, '\0');
6083 }
6084 } else {
6085 for (uintptr_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6086 score_col_names[score_col_idx] = write_iter;
6087 write_iter = strcpya_k(write_iter, "SCORE");
6088 write_iter = u32toa_x(score_col_idx + 1, '\0', write_iter);
6089 }
6090 }
6091 BigstackBaseSet(write_iter);
6092
6093 uint32_t score_final_col_ct = score_col_ct;
6094 if (qsr_ct) {
6095 const uint64_t prod = S_CAST(uint64_t, qsr_ct) * score_col_ct;
6096 if (prod > 0x7fffffff) {
6097 // little point in supporting this even in large-matrix build
6098 logerrputs("Error: <--score column count> * <--q-score-range range count> too large.\n");
6099 goto ScoreReport_ret_INCONSISTENT_INPUT;
6100 }
6101 #ifndef LAPACK_ILP64
6102 if (unlikely(prod > (0x7fffffff / kScoreVariantBlockSize))) {
6103 logerrputs("Error: <--score column count> * <--q-score-range range count> too large for\nthis " PROG_NAME_STR " build. If this is really the computation you want, use a " PROG_NAME_STR "\nbuild with large-matrix support.\n");
6104 goto ScoreReport_ret_INCONSISTENT_INPUT;
6105 }
6106 #endif
6107 score_final_col_ct = qsr_ct * score_col_ct;
6108 #ifndef LAPACK_ILP64
6109 } else {
6110 if (unlikely(score_final_col_ct > (0x7fffffff / kScoreVariantBlockSize))) {
6111 logerrputs("Error: --score column count too large for this " PROG_NAME_STR " build. If this is really\nthe computation you want, use a " PROG_NAME_STR " build with large-matrix support.\n");
6112 goto ScoreReport_ret_INCONSISTENT_INPUT;
6113 }
6114 #endif
6115 }
6116 CalcScoreCtx ctx;
6117 ctx.score_final_col_ct = score_final_col_ct;
6118 ctx.sample_ct = sample_ct;
6119 ctx.cur_batch_size = kScoreVariantBlockSize;
6120 if (unlikely(SetThreadCt(1, &tg))) {
6121 goto ScoreReport_ret_NOMEM;
6122 }
6123 const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
6124 const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
6125 const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
6126 const uint32_t acc1_vec_ct = BitCtToVecCt(sample_ct);
6127 const uint32_t acc4_vec_ct = acc1_vec_ct * 4;
6128 const uint32_t acc8_vec_ct = acc1_vec_ct * 8;
6129 const uint32_t write_score_avgs = (flags / kfScoreColScoreAvgs) & 1;
6130 const uint32_t write_score_sums = (flags / kfScoreColScoreSums) & 1;
6131 const uintptr_t overflow_buf_size = RoundUpPow2((score_col_ct * (write_score_avgs + write_score_sums) + pheno_ct) * 16 + 3 * kMaxIdSlen + kCompressStreamBlock + 64, kCacheline);
6132 uintptr_t overflow_buf_alloc = overflow_buf_size;
6133 if (flags & (kfScoreZs | kfScoreListVariantsZs)) {
6134 overflow_buf_alloc += CstreamWkspaceReq(overflow_buf_size);
6135 }
6136 uintptr_t raw_allele_ct = 2 * raw_variant_ct;
6137 if (allele_idx_offsets) {
6138 raw_allele_ct = allele_idx_offsets[raw_variant_ct];
6139 }
6140 const uintptr_t raw_allele_ctl = BitCtToWordCt(raw_allele_ct);
6141 uint32_t* sample_include_cumulative_popcounts = nullptr;
6142 uintptr_t* sex_nonmale_collapsed = nullptr;
6143 uintptr_t* genovec_buf = nullptr;
6144 uintptr_t* dosage_present_buf = nullptr;
6145 Dosage* dosage_main_buf = nullptr;
6146 uintptr_t* missing_acc1 = nullptr;
6147 uintptr_t* missing_male_acc1 = nullptr;
6148 uint64_t* ddosage_sums;
6149 uint64_t* ddosage_incrs;
6150 uintptr_t* already_seen_variants;
6151 uintptr_t* already_seen_alleles;
6152 char* overflow_buf = nullptr;
6153 if (unlikely(
6154 bigstack_alloc_d((kScoreVariantBlockSize * k1LU) * sample_ct, &(ctx.dosages_vmaj[0])) ||
6155 bigstack_alloc_d((kScoreVariantBlockSize * k1LU) * sample_ct, &(ctx.dosages_vmaj[1])) ||
6156 bigstack_alloc_d(kScoreVariantBlockSize * score_final_col_ct, &(ctx.score_coefs_cmaj[0])) ||
6157 bigstack_alloc_d(kScoreVariantBlockSize * score_final_col_ct, &(ctx.score_coefs_cmaj[1])) ||
6158 bigstack_calloc_d(score_final_col_ct * sample_ct, &ctx.final_scores_cmaj) ||
6159 // bugfix (4 Nov 2017): need raw_sample_ctl here, not sample_ctl
6160 bigstack_alloc_u32(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
6161 bigstack_alloc_w(sample_ctl, &sex_nonmale_collapsed) ||
6162 bigstack_alloc_w(sample_ctl2, &genovec_buf) ||
6163 bigstack_alloc_w(sample_ctl, &dosage_present_buf) ||
6164 bigstack_alloc_dosage(sample_ct, &dosage_main_buf) ||
6165 bigstack_alloc_w(45 * acc1_vec_ct * kWordsPerVec, &missing_acc1) ||
6166 bigstack_alloc_w(45 * acc1_vec_ct * kWordsPerVec, &missing_male_acc1) ||
6167 bigstack_calloc_u64(sample_ct, &ddosage_sums) ||
6168 bigstack_calloc_u64(sample_ct, &ddosage_incrs) ||
6169 bigstack_calloc_w(raw_variant_ctl, &already_seen_variants) ||
6170 bigstack_calloc_w(raw_allele_ctl, &already_seen_alleles) ||
6171 bigstack_alloc_c(overflow_buf_alloc, &overflow_buf))) {
6172 goto ScoreReport_ret_NOMEM;
6173 }
6174 SetThreadFuncAndData(CalcScoreThread, &ctx, &tg);
6175
6176 VecW* missing_diploid_acc4 = &(R_CAST(VecW*, missing_acc1)[acc1_vec_ct]);
6177 VecW* missing_diploid_acc8 = &(missing_diploid_acc4[acc4_vec_ct]);
6178 VecW* missing_diploid_acc32 = &(missing_diploid_acc8[acc8_vec_ct]);
6179 VecW* missing_haploid_acc4 = &(R_CAST(VecW*, missing_male_acc1)[acc1_vec_ct]);
6180 VecW* missing_haploid_acc8 = &(missing_haploid_acc4[acc4_vec_ct]);
6181 VecW* missing_haploid_acc32 = &(missing_haploid_acc8[acc8_vec_ct]);
6182 ZeroVecArr(acc4_vec_ct, missing_diploid_acc4);
6183 ZeroVecArr(acc8_vec_ct, missing_diploid_acc8);
6184 ZeroVecArr(acc8_vec_ct * 4, missing_diploid_acc32);
6185 ZeroVecArr(acc4_vec_ct, missing_haploid_acc4);
6186 ZeroVecArr(acc8_vec_ct, missing_haploid_acc8);
6187 ZeroVecArr(acc8_vec_ct * 4, missing_haploid_acc32);
6188 FillCumulativePopcounts(sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
6189 CopyBitarrSubset(sex_male, sample_include, sample_ct, sex_nonmale_collapsed);
6190 AlignedBitarrInvert(sample_ct, sex_nonmale_collapsed);
6191 const uint32_t nonmale_ct = PopcountWords(sex_nonmale_collapsed, sample_ctl);
6192 const uint32_t male_ct = sample_ct - nonmale_ct;
6193 if (!variant_id_htable) {
6194 reterr = AllocAndPopulateIdHtableMt(variant_include, variant_ids, variant_ct, 0, max_thread_ct, &variant_id_htable, nullptr, &variant_id_htable_size, nullptr);
6195 if (unlikely(reterr)) {
6196 goto ScoreReport_ret_1;
6197 }
6198 }
6199
6200 const uint32_t ignore_dup_ids = (flags / kfScoreIgnoreDupIds) & 1;
6201 const uint32_t list_variants = (flags / kfScoreListVariants) & 1;
6202 if (list_variants) {
6203 const uint32_t list_variants_zst = (flags / kfScoreListVariantsZs) & 1;
6204 OutnameZstSet(".sscore.vars", list_variants_zst, outname_end);
6205 reterr = InitCstream(outname, 0, list_variants_zst, max_thread_ct, overflow_buf_size, overflow_buf, R_CAST(unsigned char*, &(overflow_buf[overflow_buf_size])), &css);
6206 if (unlikely(reterr)) {
6207 goto ScoreReport_ret_1;
6208 }
6209 cswritep = overflow_buf;
6210 }
6211
6212 const uint32_t x_code = cip->xymt_codes[kChrOffsetX];
6213 const uint32_t y_code = cip->xymt_codes[kChrOffsetY];
6214 const uint32_t mt_code = cip->xymt_codes[kChrOffsetMT];
6215 const uint32_t model_dominant = (flags / kfScoreDominant) & 1;
6216 const uint32_t domrec = model_dominant || (flags & kfScoreRecessive);
6217 const uint32_t variance_standardize = (flags / kfScoreVarianceStandardize) & 1;
6218 const uint32_t center = variance_standardize || (flags & kfScoreCenter);
6219 const uint32_t no_meanimpute = (flags / kfScoreNoMeanimpute) & 1;
6220 const uint32_t se_mode = (flags / kfScoreSe) & 1;
6221 uint32_t block_vidx = 0;
6222 uint32_t parity = 0;
6223 uint32_t cur_allele_ct = 2;
6224 double* cur_dosages_vmaj_iter = ctx.dosages_vmaj[0];
6225 double* cur_score_coefs_cmaj = ctx.score_coefs_cmaj[0];
6226 double geno_slope = kRecipDosageMax;
6227 double geno_intercept = 0.0;
6228 double cur_allele_freq = 0.0;
6229 uint32_t variant_ct_rem15 = 15;
6230 uint32_t variant_ct_rem255d15 = 17;
6231 uint32_t variant_hap_ct_rem15 = 15;
6232 uint32_t variant_hap_ct_rem255d15 = 17;
6233 uint32_t allele_ct_base = 0;
6234 int32_t male_allele_ct_delta = 0;
6235 uint32_t valid_variant_ct = 0;
6236 uintptr_t missing_var_id_ct = 0;
6237 uintptr_t duplicated_var_id_ct = 0;
6238 uintptr_t missing_allele_code_ct = 0;
6239 #ifdef USE_MTBLAS
6240 const uint32_t matrix_multiply_thread_ct = (max_thread_ct > 1)? (max_thread_ct - 1) : 1;
6241 BLAS_SET_NUM_THREADS(matrix_multiply_thread_ct);
6242 #endif
6243 PgrSampleSubsetIndex pssi;
6244 PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, simple_pgrp, &pssi);
6245 if (flags & kfScoreHeaderRead) {
6246 ++line_idx;
6247 line_start = TextGet(&score_txs);
6248 }
6249 for (; line_start; ++line_idx, line_start = TextGet(&score_txs)) {
6250 // varid_col_idx and allele_col_idx will almost always be very small
6251 char* variant_id_start = NextTokenMult0(line_start, varid_col_idx);
6252 if (unlikely(!variant_id_start)) {
6253 goto ScoreReport_ret_MISSING_TOKENS;
6254 }
6255 char* variant_id_token_end = CurTokenEnd(variant_id_start);
6256 const uint32_t variant_id_slen = variant_id_token_end - variant_id_start;
6257 uint32_t variant_uidx = VariantIdDupflagHtableFind(variant_id_start, variant_ids, variant_id_htable, variant_id_slen, variant_id_htable_size, max_variant_id_slen);
6258 if (variant_uidx >> 31) {
6259 ++missing_var_id_ct;
6260 if (variant_uidx != UINT32_MAX) {
6261 if (unlikely(!ignore_dup_ids)) {
6262 snprintf(g_logbuf, kLogbufSize, "Error: --score variant ID '%s' appears multiple times in main dataset.\n", variant_ids[variant_uidx & 0x7fffffff]);
6263 goto ScoreReport_ret_INCONSISTENT_INPUT_WW;
6264 }
6265 ++duplicated_var_id_ct;
6266 // subtract this from missing_var_id_ct later
6267 }
6268 continue;
6269 }
6270 char* allele_start = NextTokenMult0(line_start, allele_col_idx);
6271 if (unlikely(!allele_start)) {
6272 goto ScoreReport_ret_MISSING_TOKENS;
6273 }
6274 uintptr_t allele_idx_offset_base;
6275 if (!allele_idx_offsets) {
6276 allele_idx_offset_base = variant_uidx * 2;
6277 } else {
6278 allele_idx_offset_base = allele_idx_offsets[variant_uidx];
6279 cur_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
6280 }
6281 char* allele_end = CurTokenEnd(allele_start);
6282 char allele_end_char = *allele_end;
6283 *allele_end = '\0';
6284 const uint32_t allele_blen = 1 + S_CAST(uintptr_t, allele_end - allele_start);
6285 const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
6286
6287 uint32_t cur_allele_idx = 0;
6288 for (; cur_allele_idx != cur_allele_ct; ++cur_allele_idx) {
6289 if (memequal(allele_start, cur_alleles[cur_allele_idx], allele_blen)) {
6290 break;
6291 }
6292 }
6293 // compiler is smart enough to avoid repeating this test
6294 if (cur_allele_idx == cur_allele_ct) {
6295 ++missing_allele_code_ct;
6296 continue;
6297 }
6298 const uintptr_t allele_uidx = allele_idx_offset_base + cur_allele_idx;
6299 if (unlikely(IsSet(already_seen_alleles, allele_uidx))) {
6300 char* errwrite_iter = strcpya_k(g_logbuf, "Error: ");
6301 // Don't write allele code, since it might be too long for the buffer.
6302 if (!cur_allele_idx) {
6303 errwrite_iter = strcpya_k(errwrite_iter, "REF");
6304 } else {
6305 errwrite_iter = strcpya_k(errwrite_iter, "ALT");
6306 errwrite_iter = u32toa(cur_allele_idx, errwrite_iter);
6307 }
6308 errwrite_iter = strcpya_k(errwrite_iter, " allele for variant '");
6309 errwrite_iter = strcpya(errwrite_iter, variant_ids[variant_uidx]);
6310 strcpy_k(errwrite_iter, "' appears multiple times in --score file.\n");
6311 goto ScoreReport_ret_MALFORMED_INPUT_WW;
6312 }
6313 SetBit(allele_uidx, already_seen_alleles);
6314 const uint32_t is_new_variant = 1 - IsSet(already_seen_variants, variant_uidx);
6315 SetBit(variant_uidx, already_seen_variants);
6316
6317 // okay, the variant and allele are in our dataset. Load it.
6318 // (possible todo: avoid reloading the same variant multiple times in a
6319 // row.)
6320 uint32_t dosage_ct;
6321 reterr = PgrGet1D(sample_include, pssi, sample_ct, variant_uidx, cur_allele_idx, simple_pgrp, genovec_buf, dosage_present_buf, dosage_main_buf, &dosage_ct);
6322 if (unlikely(reterr)) {
6323 goto ScoreReport_ret_PGR_FAIL;
6324 }
6325 const uint32_t chr_idx = GetVariantChr(cip, variant_uidx);
6326 uint32_t is_nonx_haploid = IsSet(cip->haploid_mask, chr_idx);
6327 if (unlikely(domrec && is_nonx_haploid)) {
6328 logerrputs("Error: --score 'dominant' and 'recessive' modifiers cannot be used with haploid\nchromosomes.\n");
6329 goto ScoreReport_ret_INCONSISTENT_INPUT;
6330 }
6331 uint32_t is_relevant_x = (chr_idx == x_code);
6332 if (unlikely(variance_standardize && (is_relevant_x || (chr_idx == mt_code)))) {
6333 logerrputs("Error: --score 'variance-standardize' cannot be used with chrX or MT.\n");
6334 goto ScoreReport_ret_INCONSISTENT_INPUT;
6335 }
6336 is_nonx_haploid = (!is_relevant_x) && is_nonx_haploid;
6337
6338 // only if --xchr-model 1 (which is no longer the default)
6339 is_relevant_x = is_relevant_x && xchr_model;
6340
6341 const uint32_t is_y = (chr_idx == y_code);
6342 ZeroTrailingNyps(sample_ct, genovec_buf);
6343 GenoarrToMissingnessUnsafe(genovec_buf, sample_ct, missing_acc1);
6344 if (dosage_ct) {
6345 BitvecInvmask(dosage_present_buf, sample_ctl, missing_acc1);
6346 }
6347 FillCurDdosageInts(genovec_buf, dosage_present_buf, dosage_main_buf, sample_ct, dosage_ct, 2 - is_nonx_haploid, ddosage_incrs);
6348 double ploidy_d;
6349 if (is_nonx_haploid) {
6350 if (is_y) {
6351 uintptr_t sample_idx_base = 0;
6352 uintptr_t sex_nonmale_collapsed_bits = sex_nonmale_collapsed[0];
6353 for (uint32_t nonmale_idx = 0; nonmale_idx != nonmale_ct; ++nonmale_idx) {
6354 const uintptr_t sample_idx = BitIter1(sex_nonmale_collapsed, &sample_idx_base, &sex_nonmale_collapsed_bits);
6355 ddosage_incrs[sample_idx] = 0;
6356 }
6357 male_allele_ct_delta += is_new_variant;
6358 BitvecInvmask(sex_nonmale_collapsed, sample_ctl, missing_acc1);
6359 } else {
6360 allele_ct_base += is_new_variant;
6361 }
6362 if (is_new_variant) {
6363 VcountIncr1To4(missing_acc1, acc1_vec_ct, missing_haploid_acc4);
6364 if (!(--variant_hap_ct_rem15)) {
6365 Vcount0Incr4To8(acc4_vec_ct, missing_haploid_acc4, missing_haploid_acc8);
6366 variant_hap_ct_rem15 = 15;
6367 if (!(--variant_hap_ct_rem255d15)) {
6368 Vcount0Incr8To32(acc8_vec_ct, missing_haploid_acc8, missing_haploid_acc32);
6369 variant_hap_ct_rem255d15 = 17;
6370 }
6371 }
6372 }
6373 if (is_y) {
6374 memcpy(missing_male_acc1, missing_acc1, sample_ctl * sizeof(intptr_t));
6375 BitvecOr(sex_nonmale_collapsed, sample_ctl, missing_acc1);
6376 }
6377 ploidy_d = 1.0;
6378 } else {
6379 if (is_relevant_x) {
6380 uintptr_t sample_idx_base = 0;
6381 uintptr_t sex_nonmale_collapsed_inv_bits = ~sex_nonmale_collapsed[0];
6382 for (uint32_t male_idx = 0; male_idx != male_ct; ++male_idx) {
6383 const uintptr_t sample_idx = BitIter0(sex_nonmale_collapsed, &sample_idx_base, &sex_nonmale_collapsed_inv_bits);
6384 ddosage_incrs[sample_idx] /= 2;
6385 }
6386 BitvecInvmaskCopy(missing_acc1, sex_nonmale_collapsed, sample_ctl, missing_male_acc1);
6387 BitvecAnd(sex_nonmale_collapsed, sample_ctl, missing_acc1);
6388 }
6389 if (is_new_variant) {
6390 VcountIncr1To4(missing_acc1, acc1_vec_ct, missing_diploid_acc4);
6391 if (!(--variant_ct_rem15)) {
6392 Vcount0Incr4To8(acc4_vec_ct, missing_diploid_acc4, missing_diploid_acc8);
6393 variant_ct_rem15 = 15;
6394 if (!(--variant_ct_rem255d15)) {
6395 Vcount0Incr8To32(acc8_vec_ct, missing_diploid_acc8, missing_diploid_acc32);
6396 variant_ct_rem255d15 = 17;
6397 }
6398 }
6399 allele_ct_base += 2;
6400 }
6401 if (is_relevant_x) {
6402 if (is_new_variant) {
6403 --male_allele_ct_delta;
6404 VcountIncr1To4(missing_male_acc1, acc1_vec_ct, missing_haploid_acc4);
6405 if (!(--variant_hap_ct_rem15)) {
6406 Vcount0Incr4To8(acc4_vec_ct, missing_haploid_acc4, missing_haploid_acc8);
6407 variant_hap_ct_rem15 = 15;
6408 if (!(--variant_hap_ct_rem255d15)) {
6409 Vcount0Incr8To32(acc8_vec_ct, missing_haploid_acc8, missing_haploid_acc32);
6410 variant_hap_ct_rem255d15 = 17;
6411 }
6412 }
6413 }
6414 BitvecOr(missing_male_acc1, sample_ctl, missing_acc1);
6415 }
6416 if (!domrec) {
6417 ploidy_d = 2.0;
6418 } else {
6419 if (model_dominant) {
6420 for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
6421 if (ddosage_incrs[sample_idx] > kDosageMax) {
6422 ddosage_incrs[sample_idx] = kDosageMax;
6423 }
6424 }
6425 } else {
6426 for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
6427 uint64_t cur_ddosage_incr = ddosage_incrs[sample_idx];
6428 if (cur_ddosage_incr <= kDosageMax) {
6429 cur_ddosage_incr = 0;
6430 } else {
6431 cur_ddosage_incr -= kDosageMax;
6432 }
6433 ddosage_incrs[sample_idx] = cur_ddosage_incr;
6434 }
6435 }
6436 ploidy_d = 1.0;
6437 }
6438 }
6439 for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
6440 ddosage_sums[sample_idx] += ddosage_incrs[sample_idx];
6441 }
6442 if (allele_freqs) {
6443 cur_allele_freq = GetAlleleFreq(&(allele_freqs[allele_idx_offset_base - variant_uidx]), cur_allele_idx, cur_allele_ct);
6444 }
6445 if (center) {
6446 if (variance_standardize) {
6447 const double variance = ploidy_d * 0.5 * ComputeDiploidMultiallelicVariance(&(allele_freqs[allele_idx_offset_base - variant_uidx]), cur_allele_ct);
6448 if (!(variance > kSmallEpsilon)) {
6449 // ZeroTrailingNyps(sample_ct, genovec_buf);
6450 STD_ARRAY_DECL(uint32_t, 4, genocounts);
6451 GenoarrCountFreqsUnsafe(genovec_buf, sample_ct, genocounts);
6452 if (unlikely(dosage_ct || genocounts[1] || genocounts[2])) {
6453 snprintf(g_logbuf, kLogbufSize, "Error: --score variance-standardize failure for variant '%s': estimated allele frequency is zero or NaN, but not all dosages are zero. (This is possible when e.g. allele frequencies are estimated from founders, but the allele is only observed in nonfounders.)\n", variant_ids[variant_uidx]);
6454 goto ScoreReport_ret_DEGENERATE_DATA_WW;
6455 }
6456 geno_slope = 0.0;
6457 } else {
6458 geno_slope = kRecipDosageMax / sqrt(variance);
6459 }
6460 }
6461 // (ploidy * cur_allele_freq * kDosageMax) * geno_slope +
6462 // geno_intercept == 0
6463 // bugfix: must use "-1.0 *" instead of - to avoid unsigned int
6464 // wraparound
6465 geno_intercept = (-1.0 * kDosageMax) * ploidy_d * cur_allele_freq * geno_slope;
6466 }
6467 const uint32_t missing_ct = PopcountWords(missing_acc1, sample_ctl);
6468 const uint32_t nm_sample_ct = sample_ct - missing_ct;
6469 if (missing_ct) {
6470 double missing_effect = 0.0;
6471 if (!no_meanimpute) {
6472 missing_effect = kDosageMax * cur_allele_freq * geno_slope;
6473 }
6474 uintptr_t sample_idx_base = 0;
6475 if (is_y || is_relevant_x) {
6476 ZeroDArr(sample_ct, cur_dosages_vmaj_iter);
6477 if (!no_meanimpute) {
6478 const uint32_t male_missing_ct = PopcountWords(missing_male_acc1, sample_ctl);
6479 uintptr_t missing_male_acc1_bits = missing_male_acc1[0];
6480 for (uint32_t male_missing_idx = 0; male_missing_idx != male_missing_ct; ++male_missing_idx) {
6481 const uintptr_t sample_idx = BitIter1(missing_male_acc1, &sample_idx_base, &missing_male_acc1_bits);
6482 cur_dosages_vmaj_iter[sample_idx] = missing_effect;
6483 }
6484 if (is_relevant_x) {
6485 // missing_male_acc1 not used after this point, so okay to
6486 // use buffer for nonmales
6487 BitvecAndCopy(missing_acc1, sex_nonmale_collapsed, sample_ctl, missing_male_acc1);
6488 missing_effect *= 2;
6489 // bugfix (8 Jul 2018): need to reset sample_idx
6490 sample_idx_base = 0;
6491 missing_male_acc1_bits = missing_male_acc1[0];
6492 const uint32_t nonmale_missing_ct = PopcountWords(missing_male_acc1, sample_ctl);
6493 for (uint32_t nonmale_missing_idx = 0; nonmale_missing_idx != nonmale_missing_ct; ++nonmale_missing_idx) {
6494 const uintptr_t sample_idx = BitIter1(missing_male_acc1, &sample_idx_base, &missing_male_acc1_bits);
6495 cur_dosages_vmaj_iter[sample_idx] = missing_effect;
6496 }
6497 }
6498 }
6499 } else {
6500 missing_effect *= ploidy_d;
6501 uintptr_t missing_acc1_bits = missing_acc1[0];
6502 for (uint32_t missing_idx = 0; missing_idx != missing_ct; ++missing_idx) {
6503 const uintptr_t sample_idx = BitIter1(missing_acc1, &sample_idx_base, &missing_acc1_bits);
6504 cur_dosages_vmaj_iter[sample_idx] = missing_effect;
6505 }
6506 }
6507 }
6508 uintptr_t sample_idx_base = 0;
6509 uintptr_t missing_acc1_inv_bits = ~missing_acc1[0];
6510 for (uint32_t nm_sample_idx = 0; nm_sample_idx != nm_sample_ct; ++nm_sample_idx) {
6511 const uintptr_t sample_idx = BitIter0(missing_acc1, &sample_idx_base, &missing_acc1_inv_bits);
6512 cur_dosages_vmaj_iter[sample_idx] = u63tod(ddosage_incrs[sample_idx]) * geno_slope + geno_intercept;
6513 }
6514 if (se_mode) {
6515 // Suppose our score coefficients are drawn from independent Gaussians.
6516 // Then the variance of the final score average is the sum of the
6517 // variances of the individual terms, divided by (T^2) where T is the
6518 // number of terms. These individual variances are of the form
6519 // (<genotype value> * <stdev>)^2.
6520 //
6521 // Thus, we can use the same inner loop to compute standard errors, as
6522 // long as
6523 // 1. we square the genotypes and the standard errors before matrix
6524 // multiplication, and
6525 // 2. we take the square root of the sums at the end.
6526 for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
6527 cur_dosages_vmaj_iter[sample_idx] *= cur_dosages_vmaj_iter[sample_idx];
6528 }
6529 }
6530 cur_dosages_vmaj_iter = &(cur_dosages_vmaj_iter[sample_ct]);
6531
6532 *allele_end = allele_end_char;
6533 double* cur_score_coefs_iter = &(cur_score_coefs_cmaj[block_vidx]);
6534 const char* read_iter = line_start;
6535 for (uint32_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6536 read_iter = NextTokenMult0(read_iter, score_col_idx_deltas[score_col_idx]);
6537 if (unlikely(!read_iter)) {
6538 goto ScoreReport_ret_MISSING_TOKENS;
6539 }
6540 double raw_coef;
6541 const char* token_end = ScantokDouble(read_iter, &raw_coef);
6542 if (unlikely(!token_end)) {
6543 snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --score file has an invalid coefficient.\n", line_idx);
6544 goto ScoreReport_ret_MALFORMED_INPUT_2;
6545 }
6546 if (!qsr_ct) {
6547 *cur_score_coefs_iter = raw_coef;
6548 cur_score_coefs_iter = &(cur_score_coefs_iter[kScoreVariantBlockSize]);
6549 } else {
6550 const uintptr_t bit_idx_base = RawToSubsettedPos(variant_include, variant_include_cumulative_popcounts, variant_uidx) * qsr_ct;
6551 for (uint32_t qsr_idx = 0; qsr_idx != qsr_ct; ++qsr_idx) {
6552 double cur_coef = raw_coef * u31tod(IsSet(qsr_include, qsr_idx + bit_idx_base));
6553 *cur_score_coefs_iter = cur_coef;
6554 cur_score_coefs_iter = &(cur_score_coefs_iter[kScoreVariantBlockSize]);
6555 }
6556 }
6557 read_iter = token_end;
6558 }
6559 if (is_new_variant) {
6560 if (list_variants) {
6561 cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
6562 AppendBinaryEoln(&cswritep);
6563 if (unlikely(Cswrite(&css, &cswritep))) {
6564 goto ScoreReport_ret_WRITE_FAIL;
6565 }
6566 }
6567 ++valid_variant_ct;
6568 if (!(valid_variant_ct % 10000)) {
6569 printf("\r--score: %uk variants loaded.", valid_variant_ct / 1000);
6570 fflush(stdout);
6571 }
6572 }
6573 ++block_vidx;
6574 if (block_vidx == kScoreVariantBlockSize) {
6575 if (se_mode) {
6576 for (uintptr_t ulii = 0; ulii != kScoreVariantBlockSize * score_final_col_ct; ++ulii) {
6577 cur_score_coefs_cmaj[ulii] *= cur_score_coefs_cmaj[ulii];
6578 }
6579 }
6580 parity = 1 - parity;
6581 const uint32_t is_not_first_block = ThreadsAreActive(&tg);
6582 if (is_not_first_block) {
6583 JoinThreads(&tg);
6584 // CalcScoreThread() never errors out
6585 }
6586 if (unlikely(SpawnThreads(&tg))) {
6587 goto ScoreReport_ret_THREAD_CREATE_FAIL;
6588 }
6589 cur_dosages_vmaj_iter = ctx.dosages_vmaj[parity];
6590 cur_score_coefs_cmaj = ctx.score_coefs_cmaj[parity];
6591 block_vidx = 0;
6592 }
6593 }
6594 if (unlikely(TextStreamErrcode2(&score_txs, &reterr))) {
6595 goto ScoreReport_ret_TSTREAM_FAIL;
6596 }
6597 VcountIncr4To8(missing_diploid_acc4, acc4_vec_ct, missing_diploid_acc8);
6598 VcountIncr8To32(missing_diploid_acc8, acc8_vec_ct, missing_diploid_acc32);
6599 VcountIncr4To8(missing_haploid_acc4, acc4_vec_ct, missing_haploid_acc8);
6600 VcountIncr8To32(missing_haploid_acc8, acc8_vec_ct, missing_haploid_acc32);
6601 const uint32_t is_not_first_block = ThreadsAreActive(&tg);
6602 putc_unlocked('\r', stdout);
6603 if (missing_var_id_ct || missing_allele_code_ct || duplicated_var_id_ct) {
6604 missing_var_id_ct -= duplicated_var_id_ct;
6605 if (!missing_var_id_ct) {
6606 if (missing_allele_code_ct) {
6607 snprintf(g_logbuf, kLogbufSize, "Warning: %" PRIuPTR " --score file entr%s.\n", missing_allele_code_ct, (missing_allele_code_ct == 1)? "y was skipped due to a mismatching allele code" : "ies were skipped due to mismatching allele codes");
6608 }
6609 } else if (!missing_allele_code_ct) {
6610 snprintf(g_logbuf, kLogbufSize, "Warning: %" PRIuPTR " --score file entr%s.\n", missing_var_id_ct, (missing_var_id_ct == 1)? "y was skipped due to a missing variant ID" : "ies were skipped due to missing variant IDs");
6611 } else {
6612 snprintf(g_logbuf, kLogbufSize, "Warning: %" PRIuPTR " --score file entr%s, and %" PRIuPTR " %s.\n", missing_var_id_ct, (missing_var_id_ct == 1)? "y was skipped due to a missing variant ID" : "ies were skipped due to missing variant IDs", missing_allele_code_ct, (missing_allele_code_ct == 1)? "was skipped due to a mismatching allele code" : "were skipped due to mismatching allele codes");
6613 }
6614 WordWrapB(0);
6615 logerrputsb();
6616 if (duplicated_var_id_ct) {
6617 logerrprintfww("Warning: %" PRIuPTR " --score file entr%s appear multiple times in the main dataset.\n", duplicated_var_id_ct, (duplicated_var_id_ct == 1)? "y was skipped since its variant ID" : "ies were skipped since their variant IDs");
6618 }
6619 if (!list_variants) {
6620 logerrputs("(Add the 'list-variants' modifier to see which variants were actually used for\nscoring.)\n");
6621 }
6622 }
6623 if (block_vidx) {
6624 if (is_not_first_block) {
6625 JoinThreads(&tg);
6626 }
6627 } else if (unlikely(!valid_variant_ct)) {
6628 logerrputs("Error: No valid variants in --score file.\n");
6629 goto ScoreReport_ret_DEGENERATE_DATA;
6630 } else {
6631 JoinThreads(&tg);
6632 }
6633 DeclareLastThreadBlock(&tg);
6634 ctx.cur_batch_size = block_vidx;
6635 if (se_mode) {
6636 for (uintptr_t score_final_col_idx = 0; score_final_col_idx != score_final_col_ct; ++score_final_col_idx) {
6637 double* cur_score_coefs_row = &(cur_score_coefs_cmaj[score_final_col_idx * kScoreVariantBlockSize]);
6638 for (uint32_t uii = 0; uii != block_vidx; ++uii) {
6639 cur_score_coefs_row[uii] *= cur_score_coefs_row[uii];
6640 }
6641 }
6642 }
6643 if (unlikely(SpawnThreads(&tg))) {
6644 goto ScoreReport_ret_THREAD_CREATE_FAIL;
6645 }
6646 JoinThreads(&tg);
6647 if (se_mode) {
6648 // sample_ct * score_final_col_ct
6649 for (uintptr_t ulii = 0; ulii != sample_ct * score_final_col_ct; ++ulii) {
6650 ctx.final_scores_cmaj[ulii] = sqrt(ctx.final_scores_cmaj[ulii]);
6651 }
6652 }
6653 logprintf("--score: %u variant%s processed.\n", valid_variant_ct, (valid_variant_ct == 1)? "" : "s");
6654 if (list_variants) {
6655 if (unlikely(CswriteCloseNull(&css, cswritep))) {
6656 goto ScoreReport_ret_WRITE_FAIL;
6657 }
6658 cswritep = nullptr;
6659 logprintf("Variant list written to %s .\n", outname);
6660 }
6661
6662 const uint32_t qsr_ct_nz = qsr_ct + (qsr_ct == 0);
6663 for (uint32_t qsr_idx = 0; qsr_idx != qsr_ct_nz; ++qsr_idx) {
6664 char* outname_end2 = outname_end;
6665 if (range_names) {
6666 *outname_end2++ = '.';
6667 outname_end2 = strcpya(outname_end2, range_names[qsr_idx]);
6668 }
6669 OutnameZstSet(".sscore", output_zst, outname_end2);
6670 reterr = InitCstream(outname, 0, output_zst, max_thread_ct, overflow_buf_size, overflow_buf, R_CAST(unsigned char*, &(overflow_buf[overflow_buf_size])), &css);
6671 if (unlikely(reterr)) {
6672 goto ScoreReport_ret_1;
6673 }
6674 cswritep = overflow_buf;
6675 const uint32_t write_fid = FidColIsRequired(siip, flags / kfScoreColMaybefid);
6676 const char* sample_ids = siip->sample_ids;
6677 const char* sids = siip->sids;
6678 const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
6679 const uintptr_t max_sid_blen = siip->max_sid_blen;
6680 const uint32_t write_sid = SidColIsRequired(sids, flags / kfScoreColMaybesid);
6681 const uint32_t write_empty_pheno = (flags & kfScoreColPheno1) && (!pheno_ct);
6682 const uint32_t write_phenos = (flags & (kfScoreColPheno1 | kfScoreColPhenos)) && pheno_ct;
6683 if (write_phenos && (!(flags & kfScoreColPhenos))) {
6684 pheno_ct = 1;
6685 }
6686 *cswritep++ = '#';
6687 if (write_fid) {
6688 cswritep = strcpya_k(cswritep, "FID\t");
6689 }
6690 cswritep = strcpya_k(cswritep, "IID");
6691 if (write_sid) {
6692 cswritep = strcpya_k(cswritep, "\tSID");
6693 }
6694 if (write_phenos) {
6695 for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
6696 *cswritep++ = '\t';
6697 cswritep = strcpya(cswritep, &(pheno_names[pheno_idx * max_pheno_name_blen]));
6698 if (unlikely(Cswrite(&css, &cswritep))) {
6699 goto ScoreReport_ret_WRITE_FAIL;
6700 }
6701 }
6702 } else if (write_empty_pheno) {
6703 cswritep = strcpya_k(cswritep, "\tPHENO1");
6704 }
6705 const uint32_t write_nallele = (flags / kfScoreColNallele) & 1;
6706 if (write_nallele) {
6707 cswritep = strcpya_k(cswritep, "\tALLELE_CT");
6708 }
6709 const uint32_t write_denom = (flags / kfScoreColDenom) & 1;
6710 if (write_denom) {
6711 cswritep = strcpya_k(cswritep, "\tDENOM");
6712 }
6713 const uint32_t write_dosage_sum = (flags / kfScoreColDosageSum) & 1;
6714 if (write_dosage_sum) {
6715 cswritep = strcpya_k(cswritep, "\tNAMED_ALLELE_DOSAGE_SUM");
6716 }
6717 if (write_score_avgs) {
6718 for (uint32_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6719 *cswritep++ = '\t';
6720 cswritep = strcpya(cswritep, score_col_names[score_col_idx]);
6721 cswritep = strcpya_k(cswritep, "_AVG");
6722 if (unlikely(Cswrite(&css, &cswritep))) {
6723 goto ScoreReport_ret_WRITE_FAIL;
6724 }
6725 }
6726 }
6727 if (write_score_sums) {
6728 for (uint32_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6729 *cswritep++ = '\t';
6730 cswritep = strcpya(cswritep, score_col_names[score_col_idx]);
6731 cswritep = strcpya_k(cswritep, "_SUM");
6732 if (unlikely(Cswrite(&css, &cswritep))) {
6733 goto ScoreReport_ret_WRITE_FAIL;
6734 }
6735 }
6736 }
6737 AppendBinaryEoln(&cswritep);
6738 const uint32_t* scrambled_missing_diploid_cts = R_CAST(uint32_t*, missing_diploid_acc32);
6739 const uint32_t* scrambled_missing_haploid_cts = R_CAST(uint32_t*, missing_haploid_acc32);
6740 const char* output_missing_pheno = g_output_missing_pheno;
6741 const uint32_t omp_slen = strlen(output_missing_pheno);
6742
6743 uintptr_t sample_uidx_base = 0;
6744 uintptr_t sample_include_bits = sample_include[0];
6745 for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
6746 const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
6747 const char* cur_sample_id = &(sample_ids[sample_uidx * max_sample_id_blen]);
6748 if (!write_fid) {
6749 cur_sample_id = AdvPastDelim(cur_sample_id, '\t');
6750 }
6751 cswritep = strcpya(cswritep, cur_sample_id);
6752 if (write_sid) {
6753 *cswritep++ = '\t';
6754 if (sids) {
6755 cswritep = strcpya(cswritep, &(sids[max_sid_blen * sample_uidx]));
6756 } else {
6757 *cswritep++ = '0';
6758 }
6759 }
6760 if (write_phenos) {
6761 // er, this probably belongs in its own function
6762 for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
6763 const PhenoCol* cur_pheno_col = &(pheno_cols[pheno_idx]);
6764 const PhenoDtype type_code = cur_pheno_col->type_code;
6765 *cswritep++ = '\t';
6766 if (type_code <= kPhenoDtypeQt) {
6767 if (!IsSet(cur_pheno_col->nonmiss, sample_uidx)) {
6768 cswritep = memcpya(cswritep, output_missing_pheno, omp_slen);
6769 } else if (type_code == kPhenoDtypeCc) {
6770 *cswritep++ = '1' + IsSet(cur_pheno_col->data.cc, sample_uidx);
6771 } else {
6772 cswritep = dtoa_g(cur_pheno_col->data.qt[sample_uidx], cswritep);
6773 }
6774 } else {
6775 // category index guaranteed to be zero for missing values
6776 cswritep = strcpya(cswritep, cur_pheno_col->category_names[cur_pheno_col->data.cat[sample_uidx]]);
6777 if (unlikely(Cswrite(&css, &cswritep))) {
6778 goto ScoreReport_ret_WRITE_FAIL;
6779 }
6780 }
6781 }
6782 } else if (write_empty_pheno) {
6783 *cswritep++ = '\t';
6784 cswritep = memcpya(cswritep, output_missing_pheno, omp_slen);
6785 }
6786 const uint32_t scrambled_idx = VcountScramble1(sample_idx);
6787 uint32_t denom = allele_ct_base + IsSet(sex_male, sample_uidx) * male_allele_ct_delta;
6788 const uint32_t nallele = denom - 2 * scrambled_missing_diploid_cts[scrambled_idx] - scrambled_missing_haploid_cts[scrambled_idx];
6789 if (write_nallele) {
6790 *cswritep++ = '\t';
6791 cswritep = u32toa(nallele, cswritep);
6792 }
6793 if (no_meanimpute) {
6794 denom = nallele;
6795 }
6796 if (write_denom) {
6797 *cswritep++ = '\t';
6798 cswritep = u32toa(denom, cswritep);
6799 }
6800 if (write_dosage_sum) {
6801 *cswritep++ = '\t';
6802 cswritep = ddosagetoa(ddosage_sums[sample_idx], cswritep);
6803 }
6804 const double* final_score_col = &(ctx.final_scores_cmaj[sample_idx]);
6805 if (write_score_avgs) {
6806 const double denom_recip = 1.0 / S_CAST(double, denom);
6807 for (uintptr_t score_final_col_idx = qsr_idx; score_final_col_idx < score_final_col_ct; score_final_col_idx += qsr_ct_nz) {
6808 *cswritep++ = '\t';
6809 cswritep = dtoa_g(final_score_col[score_final_col_idx * sample_ct] * denom_recip, cswritep);
6810 }
6811 }
6812 if (write_score_sums) {
6813 for (uint32_t score_final_col_idx = qsr_idx; score_final_col_idx < score_final_col_ct; score_final_col_idx += qsr_ct_nz) {
6814 *cswritep++ = '\t';
6815 cswritep = dtoa_g(final_score_col[score_final_col_idx * sample_ct], cswritep);
6816 }
6817 }
6818 AppendBinaryEoln(&cswritep);
6819 if (unlikely(Cswrite(&css, &cswritep))) {
6820 goto ScoreReport_ret_WRITE_FAIL;
6821 }
6822 }
6823 if (unlikely(CswriteCloseNull(&css, cswritep))) {
6824 goto ScoreReport_ret_WRITE_FAIL;
6825 }
6826 }
6827 if (!qsr_ct) {
6828 logprintfww("--score: Results written to %s .\n", outname);
6829 } else {
6830 *outname_end = '\0';
6831 logprintfww("--score + --q-score-range: Results written to %s.<range name>.sscore%s .\n", outname, output_zst? ".zst" : "");
6832 }
6833 }
6834 while (0) {
6835 ScoreReport_ret_TSTREAM_FAIL:
6836 TextStreamErrPrint("--score file", &score_txs);
6837 break;
6838 ScoreReport_ret_QSR_RANGE_TSTREAM_FAIL:
6839 TextStreamErrPrint("--q-score-range range file", &score_txs);
6840 break;
6841 ScoreReport_ret_QSR_DATA_TSTREAM_FAIL:
6842 TextStreamErrPrint("--q-score-range data file", &score_txs);
6843 break;
6844 ScoreReport_ret_NOMEM:
6845 reterr = kPglRetNomem;
6846 break;
6847 ScoreReport_ret_PGR_FAIL:
6848 PgenErrPrintN(reterr);
6849 break;
6850 ScoreReport_ret_WRITE_FAIL:
6851 reterr = kPglRetWriteFail;
6852 break;
6853 ScoreReport_ret_INVALID_CMDLINE:
6854 reterr = kPglRetInvalidCmdline;
6855 break;
6856 ScoreReport_ret_MALFORMED_INPUT_WW:
6857 WordWrapB(0);
6858 ScoreReport_ret_MALFORMED_INPUT_2:
6859 logputs("\n");
6860 logerrputsb();
6861 ScoreReport_ret_MALFORMED_INPUT:
6862 reterr = kPglRetMalformedInput;
6863 break;
6864 ScoreReport_ret_QSR_DATA_MISSING_TOKENS:
6865 logerrprintfww("Error: Line %" PRIuPTR " of --q-score-range data file has fewer tokens than expected.\n", line_idx);
6866 reterr = kPglRetInconsistentInput;
6867 break;
6868 ScoreReport_ret_MISSING_TOKENS:
6869 logputs("\n");
6870 logerrprintfww("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, score_info_ptr->input_fname);
6871 reterr = kPglRetInconsistentInput;
6872 break;
6873 ScoreReport_ret_INCONSISTENT_INPUT_WW:
6874 WordWrapB(0);
6875 logputs("\n");
6876 logerrputsb();
6877 ScoreReport_ret_INCONSISTENT_INPUT:
6878 reterr = kPglRetInconsistentInput;
6879 break;
6880 ScoreReport_ret_THREAD_CREATE_FAIL:
6881 reterr = kPglRetThreadCreateFail;
6882 break;
6883 ScoreReport_ret_DEGENERATE_DATA_WW:
6884 WordWrapB(0);
6885 logputs("\n");
6886 logerrputsb();
6887 ScoreReport_ret_DEGENERATE_DATA:
6888 reterr = kPglRetDegenerateData;
6889 break;
6890 }
6891 ScoreReport_ret_1:
6892 CswriteCloseCond(&css, cswritep);
6893 CleanupThreads(&tg);
6894 BLAS_SET_NUM_THREADS(1);
6895 CleanupTextStream2("--score file", &score_txs, &reterr);
6896 BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
6897 return reterr;
6898 }
6899
// Shared context for VscoreThread() workers (variant-score computation).
// The first group of fields is read-only input shared by all threads; the
// per-thread arrays are indexed by the worker's tidx.
typedef struct VscoreCtxStruct {
  // Read-only inputs.
  const uintptr_t* variant_include;
  const ChrInfo* cip;
  const uintptr_t* allele_idx_offsets;
  const double* allele_freqs;
  const uintptr_t* sample_include;
  const uint32_t* sample_include_cumulative_popcounts;
  const uintptr_t* sex_male_collapsed;
  const uintptr_t* sex_male_interleaved_vec;
  // Sample-major weight matrix; presumably sample_ct x vscore_ct -- confirm
  // against the initialization code.
  const double* wts_smaj;
  uint32_t vscore_ct;
  uint32_t sample_ct;
  uint32_t male_ct;
  // Nonzero iff --xchr-model 1 is in effect; controls chrX handling in
  // VscoreThread().
  uint32_t is_xchr_model_1;

  // Per-thread state, indexed by tidx.
  PgenReader** pgr_ptrs;
  uintptr_t** genovecs;
  uintptr_t** raregenos;
  uint32_t** difflist_sample_id_bufs;
  // dosage_presents may be nullptr (no dosage data); VscoreThread() checks
  // before dereferencing.  dosage_mains parallels dosage_presents.
  uintptr_t** dosage_presents;
  Dosage** dosage_mains;
  // Starting variant_uidx for each thread's slice of the current block.
  uint32_t* read_variant_uidx_starts;

  // Number of variants in the block currently being processed.
  uint32_t cur_block_size;

  // Per-thread scratch buffers.
  double** dosage_vmaj_bufs;
  double** tmp_result_bufs;

  // variant-major; double-buffered (indexed by parity) so computation can
  // overlap with the consumer of the previous block's results.
  double* results[2];

  // Per-variant missing-genotype counts, double-buffered like results[].
  uint32_t* missing_cts[2];

  // only kPglRetMalformedInput possible, no atomic ops needed
  PglErr reterr;
} VscoreCtx;
6936
// Number of variants staged per inner processing batch in VscoreThread()
// (e.g. it sizes the cur_bidxs[] staging array there).
// This setting seems optimal on my Mac (smaller doesn't take full advantage of
// AVX, larger creates cache problems?).
CONSTI32(kVscoreBlockSize, 32);
6940
THREAD_FUNC_DECL VscoreThread(void* raw_arg) {
  // Worker thread for --variant-score.
  //
  // For each variant assigned to this thread, computes the vscore_ct-element
  // vector
  //   result[k] = sum over samples of (rescaled ALT dosage) * wts_smaj[s][k]
  // and writes it to ctx->results[parity] (double-buffered by parity).  When
  // ctx->missing_cts[parity] is non-null, the per-variant missing-genotype
  // count is also recorded.
  //
  // Two computation strategies:
  // * Sparse path: when the variant is mostly hom-ref (and no chrX/chrY
  //   special handling applies), only samples with non-ref/missing genotypes
  //   are accumulated directly into tmp_result_buf.
  // * Dense path: rescaled per-sample dosage rows for up to kVscoreBlockSize
  //   variants are gathered into dosage_vmaj, then multiplied against the
  //   sample-major weight matrix with one RowMajorMatrixMultiply() call.
  //
  // Only kPglRetMalformedInput-class errors are reported, via ctx->reterr
  // (per the comment on that field, no atomic ops are needed).
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  const uintptr_t tidx = arg->tidx;
  VscoreCtx* ctx = S_CAST(VscoreCtx*, arg->sharedp->context);

  // Shared read-only inputs.
  const uintptr_t* variant_include = ctx->variant_include;
  const ChrInfo* cip = ctx->cip;
  const uintptr_t* allele_idx_offsets = ctx->allele_idx_offsets;
  const double* allele_freqs = ctx->allele_freqs;
  const uintptr_t* sample_include = ctx->sample_include;
  const uintptr_t* sex_male = ctx->sex_male_collapsed;
  const uintptr_t* sex_male_interleaved_vec = ctx->sex_male_interleaved_vec;
  const double* wts_smaj = ctx->wts_smaj;

  // Per-thread reader and scratch buffers.
  PgenReader* pgrp = ctx->pgr_ptrs[tidx];
  PgrSampleSubsetIndex pssi;
  PgrSetSampleSubsetIndex(ctx->sample_include_cumulative_popcounts, pgrp, &pssi);
  uintptr_t* genovec = ctx->genovecs[tidx];
  uintptr_t* raregeno = ctx->raregenos[tidx];
  uint32_t* difflist_sample_ids = ctx->difflist_sample_id_bufs[tidx];
  uintptr_t* dosage_present = nullptr;
  Dosage* dosage_main = nullptr;
  if (ctx->dosage_presents) {
    // Dosage buffers only exist when the dataset contains dosage data.
    dosage_present = ctx->dosage_presents[tidx];
    dosage_main = ctx->dosage_mains[tidx];
  }

  const uintptr_t vscore_ct = ctx->vscore_ct;
  const uintptr_t sample_ct = ctx->sample_ct;
  const uint32_t male_ct = ctx->male_ct;
  const uint32_t nonmale_ct = sample_ct - male_ct;
  const uint32_t x_code = cip->xymt_codes[kChrOffsetX];
  const uint32_t y_code = cip->xymt_codes[kChrOffsetY];
  const uint32_t is_xchr_model_1 = ctx->is_xchr_model_1;
  const uint32_t calc_thread_ct = GetThreadCt(arg->sharedp);

  // Threshold for taking the sparse path: at most 1/9 of samples may have a
  // non-hom-ref entry.
  const uint32_t max_sparse = sample_ct / 9;

  double* tmp_result_buf = ctx->tmp_result_bufs[tidx];
  // Maps dense-path matrix rows back to their variant_bidx destinations.
  uint16_t cur_bidxs[kVscoreBlockSize];

  double* dosage_vmaj = ctx->dosage_vmaj_bufs[tidx];

  // Per-chromosome state, refreshed whenever variant_uidx crosses chr_end.
  uint32_t is_y = 0;
  uint32_t is_x_or_y = 0;
  uint32_t is_nonxy_haploid = 0;
  uint32_t chr_end = 0;
  double slope = 0.0;

  uint32_t dosage_ct = 0;

  uint32_t parity = 0;
  do {
    const uintptr_t cur_block_size = ctx->cur_block_size;
    // This thread handles the [tidx, tidx+1) / calc_thread_ct slice of the
    // current block.
    const uint32_t bidx_end = ((tidx + 1) * cur_block_size) / calc_thread_ct;
    double* cur_results = ctx->results[parity];
    uint32_t* missing_cts = ctx->missing_cts[parity];
    // Number of rows currently queued in dosage_vmaj for the dense path.
    uintptr_t row_idx = 0;
    uintptr_t variant_uidx_base;
    uintptr_t variant_include_bits;
    BitIter1Start(variant_include, ctx->read_variant_uidx_starts[tidx], &variant_uidx_base, &variant_include_bits);
    for (uint32_t variant_bidx = (tidx * cur_block_size) / calc_thread_ct; variant_bidx != bidx_end; ++variant_bidx) {
      const uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &variant_include_bits);
      if (variant_uidx >= chr_end) {
        // Entered a new chromosome; recompute ploidy/sex-chromosome flags.
        const uint32_t chr_fo_idx = GetVariantChrFoIdx(cip, variant_uidx);
        const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
        chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
        is_y = 0;
        is_nonxy_haploid = 0;
        if (chr_idx == x_code) {
          // chrX only needs special (male-halving) treatment under
          // --xchr-model 1.
          is_x_or_y = is_xchr_model_1;
        } else if (chr_idx == y_code) {
          is_x_or_y = 1;
          is_y = 1;
        } else {
          is_x_or_y = 0;
          is_nonxy_haploid = IsSet(cip->haploid_mask, chr_idx);
        }
        // Haploid dosages contribute at half weight.
        slope = (is_nonxy_haploid || is_y)? 0.5 : 1.0;
      }
      double ref_freq;
      if (!allele_idx_offsets) {
        // Biallelic-only dataset: one frequency per variant.
        ref_freq = allele_freqs[variant_uidx];
      } else {
        // Multiallelic-capable layout stores (allele_ct - 1) freqs per
        // variant; this indexes the REF frequency.
        ref_freq = allele_freqs[allele_idx_offsets[variant_uidx] - variant_uidx];
      }
      // Mean-imputed dosage substituted for missing genotypes: expected ALT
      // dosage given the REF allele frequency, scaled by slope.
      const double missing_val = slope * 2 * (1.0 - ref_freq);
      if (!dosage_present) {
        // Hardcall-only read; try to get a sparse difflist representation.
        uint32_t difflist_common_geno;
        uint32_t difflist_len;
        PglErr reterr = PgrGetDifflistOrGenovec(sample_include, pssi, sample_ct, max_sparse, variant_uidx, pgrp, genovec, &difflist_common_geno, raregeno, difflist_sample_ids, &difflist_len);
        if (unlikely(reterr)) {
          ctx->reterr = reterr;
          goto VscoreThread_err;
        }
        if (difflist_common_geno != UINT32_MAX) {
          if ((!is_x_or_y) && (!difflist_common_geno)) {
            // Sparse path: common genotype is hom-ref, so only the difflist
            // entries contribute.
            double* target = &(cur_results[variant_bidx * vscore_ct]);
            uint32_t missing_ct = 0;
            if (!difflist_len) {
              // Monomorphic hom-ref: all scores are zero.
              ZeroDArr(vscore_ct, target);
            } else {
              ZeroTrailingNyps(difflist_len, raregeno);
              // tmp_result_buf rows: per-genotype weight-sum accumulators
              // (indexed by inverted 2-bit genotype; see below).
              ZeroDArr(vscore_ct * 3, tmp_result_buf);
              const uint32_t word_ct_m1 = (difflist_len - 1) / kBitsPerWordD2;
              uint32_t loop_len = kBitsPerWordD2;
              for (uint32_t widx = 0; ; ++widx) {
                if (widx >= word_ct_m1) {
                  if (widx > word_ct_m1) {
                    break;
                  }
                  // Last word may be partially filled.
                  loop_len = ModNz(difflist_len, kBitsPerWordD2);
                }
                // slightly nicer to work with 2..0 than 1..3 row-indexes
                uintptr_t raregeno_word = raregeno[widx];
                uintptr_t raregeno_invword = ~raregeno_word;
                // Genotype 3 (0b11) = missing; count them in one popcount.
                missing_ct += Popcount01Word(raregeno_word & (raregeno_word >> 1) & kMask5555);
                const uint32_t* cur_difflist_sample_ids = &(difflist_sample_ids[widx * kBitsPerWordD2]);
                for (uint32_t uii = 0; uii != loop_len; ++uii) {
                  const uintptr_t sample_idx = cur_difflist_sample_ids[uii];
                  // cur_invgeno: 2 = het, 1 = hom-alt, 0 = missing.
                  const uint32_t cur_invgeno = raregeno_invword & 3;
                  const double* incr_src = &(wts_smaj[sample_idx * vscore_ct]);
                  double* incr_dst = &(tmp_result_buf[cur_invgeno * vscore_ct]);
                  for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
                    incr_dst[ulii] += incr_src[ulii];
                  }
                  raregeno_invword = raregeno_invword >> 2;
                }
              }
              // Combine per-genotype accumulators:
              //   row 2 = het carriers, row 1 = hom-alt, row 0 = missing.
              if (!is_nonxy_haploid) {
                // Diploid: hom-alt counts double relative to het.
                for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
                  target[ulii] = 2 * tmp_result_buf[ulii + vscore_ct] + tmp_result_buf[ulii + 2 * vscore_ct];
                }
              } else {
                // Haploid: same combination at half weight (slope == 0.5).
                for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
                  target[ulii] = tmp_result_buf[ulii + vscore_ct] + 0.5 * tmp_result_buf[ulii + 2 * vscore_ct];
                }
              }
              if (missing_ct) {
                // Mean-impute the missing samples.
                for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
                  target[ulii] += missing_val * tmp_result_buf[ulii];
                }
              }
            }
            if (missing_cts) {
              missing_cts[variant_bidx] = missing_ct;
            }
            // Sparse path fully handled this variant.
            continue;
          }
          // Sparse representation returned but sparse path not applicable;
          // expand to a full genovec and fall through to the dense path.
          PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, sample_ct, difflist_len, genovec);
        }
      } else {
        // Dosage-aware read.
        PglErr reterr = PgrGetD(sample_include, pssi, sample_ct, variant_uidx, pgrp, genovec, dosage_present, dosage_main, &dosage_ct);
        if (unlikely(reterr)) {
          ctx->reterr = reterr;
          goto VscoreThread_err;
        }
        if ((!is_x_or_y) && (dosage_ct <= max_sparse)) {
          STD_ARRAY_DECL(uint32_t, 4, genocounts);
          ZeroTrailingNyps(sample_ct, genovec);
          if (!dosage_ct) {
            // dosage_present contains garbage if dosage_ct == 0; might want to
            // append 'Unsafe' to PgrGetD and similar function names...
            ZeroWArr(BitCtToWordCt(sample_ct), dosage_present);
          }
          // Count hardcall genotypes among samples WITHOUT explicit dosages.
          GenoarrCountInvsubsetFreqs2(genovec, dosage_present, sample_ct, sample_ct - dosage_ct, genocounts);
          if (genocounts[0] >= sample_ct - max_sparse) {
            // Sparse path with dosages: mostly hom-ref hardcalls, plus up to
            // max_sparse explicit dosages handled separately below.
            double* target = &(cur_results[variant_bidx * vscore_ct]);
            if (genocounts[0] == sample_ct) {
              ZeroDArr(vscore_ct, target);
            } else {
              ZeroDArr(vscore_ct * 3, tmp_result_buf);
              const Halfword* dosage_present_alias = R_CAST(Halfword*, dosage_present);
              const uint32_t sample_ctl2 = DivUp(sample_ct, kBitsPerWordD2);
              for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
                uintptr_t geno_word = genovec[widx];
                if (!geno_word) {
                  // All hom-ref in this word; nothing to accumulate.
                  continue;
                }
                // Mask out samples with explicit dosages (handled later).
                const uintptr_t dosage_mask = UnpackHalfwordToWord(dosage_present_alias[widx]);
                geno_word = geno_word & (~(dosage_mask * 3));
                if (!geno_word) {
                  continue;
                }
                const double* cur_wts_smaj = &(wts_smaj[widx * kBitsPerWordD2 * vscore_ct]);
                // Iterate over set 2-bit genotype entries within the word.
                do {
                  // Round the lowest set bit down to its nyp boundary.
                  const uint32_t shift_ct = ctzw(geno_word) & (~1);
                  // Same inverted-genotype row convention as the hardcall
                  // sparse path: 2 = het, 1 = hom-alt, 0 = missing.
                  const uintptr_t cur_invgeno = 3 & (~(geno_word >> shift_ct));
                  const double* incr_src = &(cur_wts_smaj[(shift_ct / 2) * vscore_ct]);
                  double* incr_dst = &(tmp_result_buf[cur_invgeno * vscore_ct]);
                  for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
                    incr_dst[ulii] += incr_src[ulii];
                  }
                  // Clear the processed nyp.
                  geno_word &= ~((3 * k1LU) << shift_ct);
                } while (geno_word);
              }
              if (!is_nonxy_haploid) {
                for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
                  target[ulii] = 2 * tmp_result_buf[ulii + vscore_ct] + tmp_result_buf[ulii + 2 * vscore_ct];
                }
              } else {
                for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
                  target[ulii] = tmp_result_buf[ulii + vscore_ct] + 0.5 * tmp_result_buf[ulii + 2 * vscore_ct];
                }
              }
              if (genocounts[3]) {
                // Mean-impute missing hardcalls.
                for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
                  target[ulii] += missing_val * tmp_result_buf[ulii];
                }
              }
              // Now add the explicit-dosage samples.
              uintptr_t sample_idx_base = 0;
              uintptr_t dosage_present_bits = dosage_present[0];
              for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
                const uintptr_t sample_idx = BitIter1(dosage_present, &sample_idx_base, &dosage_present_bits);
                const double* incr_src = &(wts_smaj[sample_idx * vscore_ct]);
                // Convert from fixed-point dosage units to [0, 2] scale.
                const double cur_dosage = slope * kRecipDosageMid * u31tod(dosage_main[dosage_idx]);
                for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
                  target[ulii] += cur_dosage * incr_src[ulii];
                }
              }
            }
            if (missing_cts) {
              missing_cts[variant_bidx] = genocounts[3];
            }
            continue;
          }
        }
      }

      // Dense path: queue this variant's rescaled dosage row, flushing the
      // batch through one matrix multiply whenever it fills up.
      if (row_idx == kVscoreBlockSize) {
        RowMajorMatrixMultiply(dosage_vmaj, wts_smaj, kVscoreBlockSize, vscore_ct, sample_ct, tmp_result_buf);
        const double* tmp_result_iter = tmp_result_buf;
        for (uintptr_t ulii = 0; ulii != kVscoreBlockSize; ++ulii) {
          // Scatter product rows back to their owning variants.
          const uintptr_t cur_bidx = cur_bidxs[ulii];
          memcpy(&(cur_results[cur_bidx * vscore_ct]), tmp_result_iter, vscore_ct * sizeof(double));
          tmp_result_iter = &(tmp_result_iter[vscore_ct]);
        }
        row_idx = 0;
      }
      cur_bidxs[row_idx] = variant_bidx;
      double* cur_row = &(dosage_vmaj[row_idx * sample_ct]);
      ++row_idx;
      PopulateRescaledDosage(genovec, dosage_present, dosage_main, slope, 0.0, missing_val, sample_ct, dosage_ct, cur_row);
      if (is_x_or_y) {
        // Instead of doing this for every variant, we could precompute
        // chrX/chrY weight matrices with male weights halved/nonmale weights
        // zeroed out. But the number of chrY variants is typically small
        // enough (and how often will --xchr-model 1 be used, anyway?) that I
        // don't think it's worth it.
        uintptr_t sample_uidx_base = 0;
        if (is_y) {
          // zero out nonmale values
          uintptr_t sex_male_invbits = ~sex_male[0];
          for (uint32_t nonmale_idx = 0; nonmale_idx != nonmale_ct; ++nonmale_idx) {
            const uintptr_t sample_uidx = BitIter0(sex_male, &sample_uidx_base, &sex_male_invbits);
            cur_row[sample_uidx] = 0.0;
          }
        } else {
          // xchr_model 1: halve male values
          uintptr_t sex_male_bits = sex_male[0];
          for (uint32_t male_idx = 0; male_idx != male_ct; ++male_idx) {
            const uintptr_t sample_uidx = BitIter1(sex_male, &sample_uidx_base, &sex_male_bits);
            cur_row[sample_uidx] *= 0.5;
          }
        }
      }
      if (missing_cts) {
        ZeroTrailingNyps(sample_ct, genovec);
        uint32_t missing_ct;
        if (!dosage_ct) {
          if (!is_y) {
            missing_ct = GenoarrCountMissingUnsafe(genovec, sample_ct);
          } else {
            // chrY: only male samples count toward the denominator.
            missing_ct = GenoarrCountMissingSubset(genovec, sex_male_interleaved_vec, sample_ct);
          }
        } else {
          if (!is_y) {
            missing_ct = GenoarrCountMissingInvsubsetUnsafe(genovec, dosage_present, sample_ct);
          } else {
            // include males, exclude dosages
            // genovec holds 2 bits/sample, so two genovec words pair with one
            // sex_male/dosage_present word.  The +kBitsPerWordD2-1 rounds
            // fullword_ct UP when the remainder exceeds kBitsPerWordD2
            // samples, so the tail below only ever needs a single genovec
            // word (trailing nyps/bits are zeroed, making the rounded-up main
            // iteration safe).
            const uint32_t fullword_ct = (sample_ct + kBitsPerWordD2 - 1) / kBitsPerWord;
            missing_ct = 0;
            for (uint32_t widx = 0; widx != fullword_ct; ++widx) {
              uintptr_t w1 = genovec[2 * widx];
              uintptr_t w2 = genovec[2 * widx + 1];
              // Detect missing (0b11) nyps, then pack to 1 bit/sample.
              w1 = w1 & (w1 >> 1);
              w2 = w2 & (w2 >> 1);
              w1 = PackWordToHalfwordMask5555(w1);
              w2 = PackWordToHalfwordMask5555(w2);
              const uintptr_t ww = w1 | (w2 << kBitsPerWordD2);
              missing_ct += PopcountWord(ww & sex_male[widx] & (~dosage_present[widx]));
            }
            if (sample_ct > fullword_ct * kBitsPerWord) {
              // Tail of at most kBitsPerWordD2 samples: one genovec word.
              uintptr_t w1 = genovec[2 * fullword_ct];
              w1 = w1 & (w1 >> 1);
              w1 = PackWordToHalfwordMask5555(w1);
              missing_ct += PopcountWord(w1 & sex_male[fullword_ct] & (~dosage_present[fullword_ct]));
            }
          }
        }
        missing_cts[variant_bidx] = missing_ct;
      }
    }
    // Flush any partially-filled dense batch.
    if (row_idx) {
      RowMajorMatrixMultiply(dosage_vmaj, wts_smaj, row_idx, vscore_ct, sample_ct, tmp_result_buf);
      const double* tmp_result_iter = tmp_result_buf;
      for (uintptr_t ulii = 0; ulii != row_idx; ++ulii) {
        uintptr_t cur_bidx = cur_bidxs[ulii];
        memcpy(&(cur_results[cur_bidx * vscore_ct]), tmp_result_iter, vscore_ct * sizeof(double));
        tmp_result_iter = &(tmp_result_iter[vscore_ct]);
      }
    }
    // On error we still fall through to THREAD_BLOCK_FINISH so the thread
    // group's synchronization protocol is respected.
  VscoreThread_err:
    parity = 1 - parity;
  } while (!THREAD_BLOCK_FINISH(arg));
  THREAD_RETURN;
}
7258
Vscore(const uintptr_t * variant_include,const ChrInfo * cip,const uint32_t * variant_bps,const char * const * variant_ids,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,const uintptr_t * sample_include,const SampleIdInfo * siip,const uintptr_t * sex_male,const double * allele_freqs,const char * in_fname,const RangeList * col_idx_range_listp,uint32_t raw_variant_ct,uint32_t variant_ct,uint32_t raw_sample_ct,uint32_t sample_ct,uint32_t max_allele_slen,VscoreFlags flags,uint32_t xchr_model,uint32_t max_thread_ct,uintptr_t pgr_alloc_cacheline_ct,PgenFileInfo * pgfip,char * outname,char * outname_end)7259 PglErr Vscore(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* sample_include, const SampleIdInfo* siip, const uintptr_t* sex_male, const double* allele_freqs, const char* in_fname, const RangeList* col_idx_range_listp, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t max_allele_slen, VscoreFlags flags, uint32_t xchr_model, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, PgenFileInfo* pgfip, char* outname, char* outname_end) {
7260 unsigned char* bigstack_mark = g_bigstack_base;
7261 unsigned char* bigstack_end_mark = g_bigstack_end;
7262 uintptr_t line_idx = 0;
7263 char* cswritep = nullptr;
7264 FILE* binfile = nullptr;
7265 PglErr reterr = kPglRetSuccess;
7266 TextStream txs;
7267 ThreadGroup tg;
7268 CompressStreamState css;
7269 PreinitTextStream(&txs);
7270 PreinitThreads(&tg);
7271 PreinitCstream(&css);
7272 {
7273 // unsurprisingly, lots of overlap with --score
7274 const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
7275 if (!xchr_model) {
7276 uint32_t x_code;
7277 if (XymtExists(cip, kChrOffsetX, &x_code)) {
7278 uint32_t x_chr_fo_idx = cip->chr_idx_to_foidx[x_code];
7279 uint32_t x_start = cip->chr_fo_vidx_start[x_chr_fo_idx];
7280 uint32_t x_end = cip->chr_fo_vidx_start[x_chr_fo_idx + 1];
7281 if (!AllBitsAreZero(variant_include, x_start, x_end)) {
7282 uintptr_t* variant_include_no_x;
7283 if (unlikely(bigstack_alloc_w(raw_variant_ctl, &variant_include_no_x))) {
7284 goto Vscore_ret_NOMEM;
7285 }
7286 memcpy(variant_include_no_x, variant_include, raw_variant_ctl * sizeof(intptr_t));
7287 variant_ct -= PopcountBitRange(variant_include, x_start, x_end);
7288 if (!variant_ct) {
7289 logerrputs("Error: No --variant-score variants remaining after --xchr-model 0.\n");
7290 goto Vscore_ret_INCONSISTENT_INPUT;
7291 }
7292 ClearBitsNz(x_start, x_end, variant_include_no_x);
7293 variant_include = variant_include_no_x;
7294 }
7295 }
7296 } else if (xchr_model == 2) {
7297 xchr_model = 0;
7298 }
7299 // now xchr_model is set iff it's 1
7300
7301 // see KeepFcol() and SampleSortFileMap()
7302 char* line_start;
7303 XidMode xid_mode;
7304 reterr = OpenAndLoadXidHeader(in_fname, "variant-score", (siip->sids || (siip->flags & kfSampleIdStrictSid0))? kfXidHeaderFixedWidth : kfXidHeaderFixedWidthIgnoreSid, kTextStreamBlenFast, &txs, &xid_mode, &line_idx, &line_start, nullptr);
7305 if (unlikely(reterr)) {
7306 if (reterr == kPglRetEof) {
7307 logerrputs("Error: Empty --variant-score file.\n");
7308 reterr = kPglRetMalformedInput;
7309 }
7310 goto Vscore_ret_1;
7311 }
7312 const uint32_t id_col_ct = GetXidColCt(xid_mode);
7313 const uint32_t col_ct = CountTokens(line_start);
7314 if (unlikely(id_col_ct == col_ct)) {
7315 logerrputs("Error: No score columns in --variant-score file.\n");
7316 goto Vscore_ret_MALFORMED_INPUT;
7317 }
7318 uintptr_t vscore_ct;
7319 uint32_t* col_idx_deltas;
7320 if (!col_idx_range_listp->name_ct) {
7321 vscore_ct = col_ct - id_col_ct;
7322 if (unlikely(bigstack_alloc_u32(vscore_ct, &col_idx_deltas))) {
7323 goto Vscore_ret_NOMEM;
7324 }
7325 for (uint32_t uii = 0; uii != vscore_ct; ++uii) {
7326 col_idx_deltas[uii] = 1;
7327 }
7328 } else {
7329 const uint32_t col_ctl = BitCtToWordCt(col_ct);
7330 uintptr_t* vscore_col_bitarr;
7331 if (unlikely(bigstack_calloc_w(col_ctl, &vscore_col_bitarr))) {
7332 goto Vscore_ret_NOMEM;
7333 }
7334 if (unlikely(NumericRangeListToBitarr(col_idx_range_listp, col_ct, 1, 0, vscore_col_bitarr))) {
7335 goto Vscore_ret_MISSING_TOKENS;
7336 }
7337 if (vscore_col_bitarr[0] & ((1 << id_col_ct) - 1)) {
7338 logerrputs("Error: --vscore-col-nums argument overlaps with ID columns.\n");
7339 goto Vscore_ret_INCONSISTENT_INPUT;
7340 }
7341 vscore_ct = PopcountWords(vscore_col_bitarr, col_ctl);
7342 // since we don't allow overflow, this should be guaranteed to be
7343 // positive
7344 assert(vscore_ct);
7345 if (unlikely(bigstack_alloc_u32(vscore_ct, &col_idx_deltas))) {
7346 goto Vscore_ret_NOMEM;
7347 }
7348 uintptr_t col_uidx_base = 0;
7349 uintptr_t vscore_col_bitarr_bits = vscore_col_bitarr[0];
7350 for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx) {
7351 const uint32_t col_uidx = BitIter1(vscore_col_bitarr, &col_uidx_base, &vscore_col_bitarr_bits);
7352 col_idx_deltas[vscore_idx] = col_uidx;
7353 }
7354 // now convert to deltas
7355 for (uintptr_t vscore_idx = vscore_ct - 1; vscore_idx; --vscore_idx) {
7356 col_idx_deltas[vscore_idx] -= col_idx_deltas[vscore_idx - 1];
7357 }
7358 col_idx_deltas[0] -= id_col_ct - 1;
7359 }
7360 char** vscore_names;
7361 if (unlikely(bigstack_end_alloc_cp(vscore_ct, &vscore_names))) {
7362 goto Vscore_ret_NOMEM;
7363 }
7364 const uint32_t is_header_line = (line_start[0] == '#');
7365 unsigned char* tmp_alloc_base = g_bigstack_base;
7366 unsigned char* tmp_alloc_end = g_bigstack_end;
7367 if (is_header_line) {
7368 const char* name_iter = NextTokenMult0(line_start, id_col_ct - 1);
7369 for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx) {
7370 name_iter = NextTokenMult(name_iter, col_idx_deltas[vscore_idx]);
7371 const char* name_end = CurTokenEnd(name_iter);
7372 // don't actually need to enforce unique names, though we could print a
7373 // warning later
7374 const uint32_t cur_slen = name_end - name_iter;
7375 if (cur_slen > kMaxIdSlen) {
7376 snprintf(g_logbuf, kLogbufSize, "Error: Variant-score name in column %" PRIuPTR " of %s is too long.\n", vscore_idx + id_col_ct + 1, in_fname);
7377 goto Vscore_ret_MALFORMED_INPUT_WW;
7378 }
7379 if (StoreStringAtEnd(tmp_alloc_base, name_iter, cur_slen, &tmp_alloc_end, &(vscore_names[vscore_idx]))) {
7380 goto Vscore_ret_NOMEM;
7381 }
7382 name_iter = name_end;
7383 }
7384 ++line_idx;
7385 line_start = TextGet(&txs);
7386 } else {
7387 for (uintptr_t vscore_num = 1; vscore_num <= vscore_ct; ++vscore_num) {
7388 const uint32_t cur_blen = 7 + UintSlen(vscore_num);
7389 if (PtrWSubCk(tmp_alloc_base, cur_blen, &tmp_alloc_end)) {
7390 goto Vscore_ret_NOMEM;
7391 }
7392 char* cur_name_iter = R_CAST(char*, tmp_alloc_end);
7393 vscore_names[vscore_num - 1] = cur_name_iter;
7394 cur_name_iter = strcpya_k(cur_name_iter, "VSCORE");
7395 cur_name_iter = u32toa(vscore_num, cur_name_iter);
7396 *cur_name_iter = '\0';
7397 }
7398 }
7399 BigstackEndSet(tmp_alloc_end);
7400 uint32_t* xid_map;
7401 char* sorted_xidbox;
7402 uintptr_t max_xid_blen;
7403 reterr = SortedXidboxInitAlloc(sample_include, siip, sample_ct, 0, xid_mode, 0, &sorted_xidbox, &xid_map, &max_xid_blen);
7404 if (unlikely(reterr)) {
7405 goto Vscore_ret_1;
7406 }
7407 const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
7408 #ifndef __LP64__
7409 if (sample_ct * S_CAST(uint64_t, vscore_ct) >= 0x80000000U / sizeof(double)) {
7410 goto Vscore_ret_NOMEM;
7411 }
7412 #endif
7413 char* idbuf;
7414 uintptr_t* already_seen;
7415 double* raw_wts;
7416 uint32_t* sample_uidx_order;
7417 if (unlikely(
7418 bigstack_alloc_c(siip->max_sample_id_blen, &idbuf) ||
7419 bigstack_alloc_u32(sample_ct, &sample_uidx_order) ||
7420 bigstack_alloc_d(sample_ct * vscore_ct, &raw_wts) ||
7421 bigstack_end_calloc_w(raw_sample_ctl, &already_seen))) {
7422 goto Vscore_ret_NOMEM;
7423 }
7424 uintptr_t miss_ct = 0;
7425 uint32_t hit_ct = 0;
7426
7427 for (double* raw_wts_iter = raw_wts; line_start; ++line_idx, line_start = TextGet(&txs)) {
7428 if (unlikely(line_start[0] == '#')) {
7429 snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --variant-score file starts with a '#'. (This is only permitted before the first nonheader line, and if a #FID/IID header line is present it must denote the end of the header block.)\n", line_idx);
7430 goto Vscore_ret_MALFORMED_INPUT_WW;
7431 }
7432 const char* linebuf_iter = line_start;
7433 uint32_t sample_uidx;
7434 if (SortedXidboxReadFind(sorted_xidbox, xid_map, max_xid_blen, sample_ct, 0, xid_mode, &linebuf_iter, &sample_uidx, idbuf)) {
7435 if (unlikely(!linebuf_iter)) {
7436 goto Vscore_ret_MISSING_TOKENS;
7437 }
7438 ++miss_ct;
7439 continue;
7440 }
7441 if (unlikely(IsSet(already_seen, sample_uidx))) {
7442 char* tab_iter = AdvToDelim(idbuf, '\t');
7443 *tab_iter = ' ';
7444 if (xid_mode & kfXidModeFlagSid) {
7445 *AdvToDelim(&(tab_iter[1]), '\t') = ' ';
7446 }
7447 snprintf(g_logbuf, kLogbufSize, "Error: Duplicate sample ID '%s' in --variant-score file.\n", idbuf);
7448 goto Vscore_ret_MALFORMED_INPUT_WW;
7449 }
7450 SetBit(sample_uidx, already_seen);
7451 sample_uidx_order[hit_ct] = sample_uidx;
7452 for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx, ++raw_wts_iter) {
7453 linebuf_iter = NextTokenMult(linebuf_iter, col_idx_deltas[vscore_idx]);
7454 if (unlikely(!linebuf_iter)) {
7455 goto Vscore_ret_MISSING_TOKENS;
7456 }
7457 const char* token_end = ScantokDouble(linebuf_iter, raw_wts_iter);
7458 if (unlikely(!token_end)) {
7459 token_end = CurTokenEnd(linebuf_iter);
7460 *K_CAST(char*, token_end) = '\0';
7461 snprintf(g_logbuf, kLogbufSize, "Error: Invalid coefficient '%s' on line %" PRIuPTR " of --variant-score file.\n", linebuf_iter, line_idx);
7462 goto Vscore_ret_MALFORMED_INPUT_WW;
7463 }
7464 linebuf_iter = token_end;
7465 }
7466 ++hit_ct;
7467 }
7468 if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
7469 goto Vscore_ret_TSTREAM_FAIL;
7470 }
7471 if (unlikely(CleanupTextStream2(in_fname, &txs, &reterr))) {
7472 goto Vscore_ret_1;
7473 }
7474 if (!hit_ct) {
7475 logerrputs("Error: No valid entries in --variant-score file.\n");
7476 goto Vscore_ret_INCONSISTENT_INPUT;
7477 }
7478 sample_include = already_seen;
7479 sample_ct = hit_ct;
7480 #if defined(__LP64__) && !defined(LAPACK_ILP64)
7481 if (sample_ct * vscore_ct > 0x7fffffff) {
7482 logerrputs("Error: --variant-score input matrix too large for this " PROG_NAME_STR " build. If this\nis really the computation you want, use a " PROG_NAME_STR " build with large-matrix\nsupport.\n");
7483 goto Vscore_ret_INCONSISTENT_INPUT;
7484 }
7485 #endif
7486 VscoreCtx ctx;
7487 ctx.variant_include = variant_include;
7488 ctx.cip = cip;
7489 ctx.allele_idx_offsets = allele_idx_offsets;
7490 ctx.allele_freqs = allele_freqs;
7491 ctx.sample_include = sample_include;
7492 const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
7493 const uint32_t dosage_is_present = pgfip->gflags & kfPgenGlobalDosagePresent;
7494 uint32_t calc_thread_ct = max_thread_ct;
7495 uint32_t compress_thread_ct = 1;
7496 const uint32_t output_zst = (flags / kfVscoreZs) & 1;
7497 snprintf(outname_end, kMaxOutfnameExtBlen, ".vscore");
7498 if (flags & kfVscoreBin) {
7499 snprintf(&(outname_end[7]), kMaxOutfnameExtBlen - 7, ".cols");
7500 if (unlikely(fopen_checked(outname, FOPEN_WB, &binfile))) {
7501 goto Vscore_ret_OPEN_FAIL;
7502 }
7503 for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx) {
7504 fputs(vscore_names[vscore_idx], binfile);
7505 #ifdef _WIN32
7506 putc_unlocked('\r', binfile);
7507 #endif
7508 putc_unlocked('\n', binfile);
7509 }
7510 if (unlikely(fclose_null(&binfile))) {
7511 goto Vscore_ret_WRITE_FAIL;
7512 }
7513 snprintf(&(outname_end[7]), kMaxOutfnameExtBlen - 7, ".bin");
7514 if (unlikely(fopen_checked(outname, FOPEN_WB, &binfile))) {
7515 goto Vscore_ret_OPEN_FAIL;
7516 }
7517 snprintf(&(outname_end[7]), kMaxOutfnameExtBlen - 7, ".vars");
7518 if (output_zst) {
7519 snprintf(&(outname_end[12]), kMaxOutfnameExtBlen - 12, ".zst");
7520 }
7521 } else if (output_zst) {
7522 snprintf(&(outname_end[7]), kMaxOutfnameExtBlen - 7, ".zst");
7523 if (calc_thread_ct > 1) {
7524 // The more samples there are, the higher the compute:compress ratio we
7525 // want, though this is not a linear relationship due to the sparse
7526 // optimization.
7527 // 1:1 split seems to work well for a few thousand samples; I'm
7528 // guessing that ~7:1 is better for hundreds of thousands.
7529 if (sample_ct < 8192) {
7530 compress_thread_ct = calc_thread_ct / 2;
7531 } else {
7532 const uint32_t log2_sample_ct_m10 = bsru32(sample_ct) - 10;
7533 // 3/8, 4/16, 5/24, ...
7534 compress_thread_ct = (calc_thread_ct * log2_sample_ct_m10) / (8 * (log2_sample_ct_m10 - 2));
7535 if (!compress_thread_ct) {
7536 compress_thread_ct = 1;
7537 }
7538 }
7539 calc_thread_ct -= compress_thread_ct;
7540 }
7541 }
7542 {
7543 uint32_t* sample_include_cumulative_popcounts;
7544 double* wts_smaj;
7545 if (unlikely(
7546 bigstack_end_alloc_u32(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
7547 bigstack_end_alloc_d(sample_ct * vscore_ct, &wts_smaj))) {
7548 goto Vscore_ret_NOMEM;
7549 }
7550 FillCumulativePopcounts(sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
7551 ctx.sample_include_cumulative_popcounts = sample_include_cumulative_popcounts;
7552 logprintfww("--variant-score: %" PRIuPTR " score-vector%s loaded for %u sample%s.\n", vscore_ct, (vscore_ct == 1)? "" : "s", sample_ct, (sample_ct == 1)? "" : "s");
7553 if (miss_ct) {
7554 logerrprintf("Warning: %" PRIuPTR " line%s skipped in --variant-score file.\n", miss_ct, (miss_ct == 1)? "" : "s");
7555 }
7556 const double* wts_read_iter = raw_wts;
7557 for (uint32_t uii = 0; uii != sample_ct; ++uii) {
7558 const uint32_t sample_uidx = sample_uidx_order[uii];
7559 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_uidx);
7560 memcpy(&(wts_smaj[sample_idx * vscore_ct]), wts_read_iter, vscore_ct * sizeof(double));
7561 wts_read_iter = &(wts_read_iter[vscore_ct]);
7562 }
7563 ctx.wts_smaj = wts_smaj;
7564 BigstackReset(bigstack_mark);
7565 const uint32_t sample_ctv = BitCtToVecCt(sample_ct);
7566 uintptr_t* sex_male_collapsed;
7567 uintptr_t* sex_male_interleaved_vec;
7568 if (unlikely(
7569 bigstack_alloc_w(sample_ctl, &sex_male_collapsed) ||
7570 bigstack_alloc_w(sample_ctv * kWordsPerVec, &sex_male_interleaved_vec) ||
7571 bigstack_alloc_wp(calc_thread_ct, &ctx.raregenos) ||
7572 bigstack_alloc_u32p(calc_thread_ct, &ctx.difflist_sample_id_bufs) ||
7573 bigstack_alloc_dp(calc_thread_ct, &ctx.dosage_vmaj_bufs) ||
7574 bigstack_alloc_dp(calc_thread_ct, &ctx.tmp_result_bufs))) {
7575 goto Vscore_ret_NOMEM;
7576 }
7577 CopyBitarrSubset(sex_male, sample_include, sample_ct, sex_male_collapsed);
7578 FillInterleavedMaskVec(sex_male_collapsed, sample_ctv, sex_male_interleaved_vec);
7579 ctx.sex_male_collapsed = sex_male_collapsed;
7580 ctx.sex_male_interleaved_vec = sex_male_interleaved_vec;
7581 }
7582 ctx.vscore_ct = vscore_ct;
7583 ctx.sample_ct = sample_ct;
7584 const uint32_t male_ct = PopcountWords(ctx.sex_male_collapsed, sample_ctl);
7585 ctx.male_ct = male_ct;
7586 ctx.is_xchr_model_1 = xchr_model;
7587
7588 const uint32_t chr_col = (flags / kfVscoreColChrom) & 1;
7589 char* chr_buf = nullptr;
7590 uint32_t max_chr_blen = 0;
7591 if (chr_col) {
7592 max_chr_blen = GetMaxChrSlen(cip) + 1;
7593 if (unlikely(bigstack_alloc_c(max_chr_blen, &chr_buf))) {
7594 goto Vscore_ret_NOMEM;
7595 }
7596 }
7597 const uint32_t ref_col = (flags / kfVscoreColRef) & 1;
7598 const uint32_t alt1_col = (flags / kfVscoreColAlt1) & 1;
7599 const uint32_t alt_col = (flags / kfVscoreColAlt) & 1;
7600 uintptr_t overflow_buf_size;
7601 if (binfile) {
7602 overflow_buf_size = kCompressStreamBlock + kMaxIdSlen + 16;
7603 } else {
7604 overflow_buf_size = kCompressStreamBlock + max_chr_blen * chr_col + kMaxIdSlen + 128 + (24 * k1LU) * vscore_ct + MAXV(ref_col + alt1_col, alt_col) * max_allele_slen;
7605 }
7606 reterr = InitCstreamAlloc(outname, 0, output_zst, compress_thread_ct, overflow_buf_size, &css, &cswritep);
7607 if (unlikely(reterr)) {
7608 goto Vscore_ret_1;
7609 }
7610 const uint32_t nmiss_col = (flags / kfVscoreColNmiss) & 1;
7611 const uint32_t nobs_col = (flags / kfVscoreColNobs) & 1;
7612 if (!binfile) {
7613 *cswritep++ = '#';
7614 if (chr_col) {
7615 cswritep = strcpya_k(cswritep, "CHROM\t");
7616 }
7617 if (flags & kfVscoreColPos) {
7618 cswritep = strcpya_k(cswritep, "POS\t");
7619 } else {
7620 variant_bps = nullptr;
7621 }
7622 cswritep = strcpya_k(cswritep, "ID");
7623 if (ref_col) {
7624 cswritep = strcpya_k(cswritep, "\tREF");
7625 }
7626 if (alt1_col) {
7627 cswritep = strcpya_k(cswritep, "\tALT1");
7628 }
7629 if (alt_col) {
7630 cswritep = strcpya_k(cswritep, "\tALT");
7631 }
7632 if (flags & kfVscoreColAltfreq) {
7633 cswritep = strcpya_k(cswritep, "\tALT_FREQ");
7634 } else {
7635 allele_freqs = nullptr;
7636 }
7637 if (nmiss_col) {
7638 cswritep = strcpya_k(cswritep, "\tMISSING_CT");
7639 }
7640 if (nobs_col) {
7641 cswritep = strcpya_k(cswritep, "\tOBS_CT");
7642 }
7643 for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx) {
7644 *cswritep++ = '\t';
7645 cswritep = strcpya(cswritep, vscore_names[vscore_idx]);
7646 if (unlikely(Cswrite(&css, &cswritep))) {
7647 goto Vscore_ret_WRITE_FAIL;
7648 }
7649 }
7650 AppendBinaryEoln(&cswritep);
7651 }
7652
7653 if (nmiss_col || nobs_col) {
7654 if (unlikely(
7655 bigstack_alloc_u32(kPglVblockSize, &ctx.missing_cts[0]) ||
7656 bigstack_alloc_u32(kPglVblockSize, &ctx.missing_cts[1]))) {
7657 goto Vscore_ret_NOMEM;
7658 }
7659 } else {
7660 ctx.missing_cts[0] = nullptr;
7661 ctx.missing_cts[1] = nullptr;
7662 }
7663
7664 const uint32_t max_returned_difflist_len = 2 * (raw_sample_ct / kPglMaxDifflistLenDivisor);
7665 // * Per-thread raregeno buffers must have space for
7666 // max_returned_difflist_len nyps, and difflist_sample_ids buffers need
7667 // space for that many uint32s.
7668 // * Per-thread dosage_vmaj buffers must have space for
7669 // kVscoreBlockSize * sample_ct elements.
7670 // * Per-thread result buffers must have space for kVscoreBlockSize *
7671 // vscore_ct elements.
7672 const uintptr_t thread_xalloc_cacheline_ct = DivUp(max_returned_difflist_len, kNypsPerCacheline) + DivUp(max_returned_difflist_len, kInt32PerCacheline) + DivUp(kVscoreBlockSize * S_CAST(uintptr_t, sample_ct) * sizeof(double), kCacheline) + DivUp(kVscoreBlockSize * vscore_ct * sizeof(double), kCacheline);
7673
7674 // ctx.results must have space for 2 * vscore_ct * read_block_size doubles.
7675 const uintptr_t per_variant_xalloc_byte_ct = 2 * vscore_ct * sizeof(double);
7676 STD_ARRAY_DECL(unsigned char*, 2, main_loadbufs);
7677 // defensive
7678 ctx.dosage_presents = nullptr;
7679 ctx.dosage_mains = nullptr;
7680 uint32_t read_block_size;
7681 if (unlikely(PgenMtLoadInit(variant_include, sample_ct, variant_ct, bigstack_left(), pgr_alloc_cacheline_ct, thread_xalloc_cacheline_ct, per_variant_xalloc_byte_ct, 0, pgfip, &calc_thread_ct, &ctx.genovecs, nullptr, nullptr, nullptr, dosage_is_present? (&ctx.dosage_presents) : nullptr, dosage_is_present? (&ctx.dosage_mains) : nullptr, nullptr, nullptr, &read_block_size, nullptr, main_loadbufs, &ctx.pgr_ptrs, &ctx.read_variant_uidx_starts))) {
7682 goto Vscore_ret_NOMEM;
7683 }
7684 if (unlikely(SetThreadCt(calc_thread_ct, &tg))) {
7685 goto Vscore_ret_NOMEM;
7686 }
7687 {
7688 // could vector-align individual allocations and only cacheline-align at
7689 // thread boundaries, but the savings are microscopic
7690 const uintptr_t raregeno_alloc = kCacheline * DivUp(max_returned_difflist_len, kNypsPerCacheline);
7691 const uintptr_t difflist_sample_ids_alloc = RoundUpPow2(max_returned_difflist_len * sizeof(int32_t), kCacheline);
7692 const uintptr_t dosage_vmaj_alloc = RoundUpPow2(kVscoreBlockSize * S_CAST(uintptr_t, sample_ct) * sizeof(double), kCacheline);
7693 const uintptr_t tmp_result_alloc = RoundUpPow2(kVscoreBlockSize * vscore_ct * sizeof(double), kCacheline);
7694 for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
7695 ctx.raregenos[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(raregeno_alloc));
7696 ctx.difflist_sample_id_bufs[tidx] = S_CAST(uint32_t*, bigstack_alloc_raw(difflist_sample_ids_alloc));
7697 ctx.dosage_vmaj_bufs[tidx] = S_CAST(double*, bigstack_alloc_raw(dosage_vmaj_alloc));
7698 ctx.tmp_result_bufs[tidx] = S_CAST(double*, bigstack_alloc_raw(tmp_result_alloc));
7699 }
7700 }
7701 const uintptr_t results_byte_ct = RoundUpPow2(per_variant_xalloc_byte_ct * read_block_size, kCacheline);
7702 ctx.results[0] = S_CAST(double*, bigstack_alloc_raw(results_byte_ct));
7703 ctx.results[1] = S_CAST(double*, bigstack_alloc_raw(results_byte_ct));
7704 assert(g_bigstack_base <= g_bigstack_end);
7705 ctx.reterr = kPglRetSuccess;
7706 SetThreadFuncAndData(VscoreThread, &ctx, &tg);
7707
7708 fputs("--variant-score: 0%", stdout);
7709 fflush(stdout);
7710 const uint32_t y_code = cip->xymt_codes[kChrOffsetY];
7711 // Main workflow:
7712 // 1. Set n=0, load/skip block 0
7713 //
7714 // 2. Spawn threads processing block n
7715 // 3. If n>0, write results for block (n-1)
7716 // 4. Increment n by 1
7717 // 5. Load/skip block n unless eof
7718 // 6. Join threads
7719 // 7. Goto step 2 unless eof
7720 //
7721 // 8. Write results for last block
7722 uintptr_t write_variant_uidx_base = 0;
7723 uintptr_t cur_bits = variant_include[0];
7724 uint32_t prev_block_size = 0;
7725 uint32_t pct = 0;
7726 uint32_t next_print_variant_idx = variant_ct / 100;
7727 uint32_t parity = 0;
7728 uint32_t read_block_idx = 0;
7729 uint32_t chr_fo_idx = UINT32_MAX;
7730 uint32_t chr_end = 0;
7731 uint32_t chr_buf_blen = 0;
7732 uint32_t cur_sample_ct = 0;
7733 uint32_t cur_allele_ct = 2;
7734 for (uint32_t variant_idx = 0; ; ++read_block_idx) {
7735 const uint32_t cur_block_size = MultireadNonempty(variant_include, &tg, raw_variant_ct, read_block_size, pgfip, &read_block_idx, &reterr);
7736 if (unlikely(reterr)) {
7737 goto Vscore_ret_PGR_FAIL;
7738 }
7739 if (variant_idx) {
7740 JoinThreads(&tg);
7741 reterr = ctx.reterr;
7742 if (unlikely(reterr)) {
7743 goto Vscore_ret_PGR_FAIL;
7744 }
7745 }
7746 if (!IsLastBlock(&tg)) {
7747 // it may make sense to put this boilerplate into its own function,
7748 // too...
7749 ctx.cur_block_size = cur_block_size;
7750 ComputeUidxStartPartition(variant_include, cur_block_size, calc_thread_ct, read_block_idx * read_block_size, ctx.read_variant_uidx_starts);
7751 PgrCopyBaseAndOffset(pgfip, calc_thread_ct, ctx.pgr_ptrs);
7752 if (variant_idx + cur_block_size == variant_ct) {
7753 DeclareLastThreadBlock(&tg);
7754 }
7755 if (unlikely(SpawnThreads(&tg))) {
7756 goto Vscore_ret_THREAD_CREATE_FAIL;
7757 }
7758 }
7759 parity = 1 - parity;
7760 if (variant_idx) {
7761 // write *previous* block results
7762 const double* cur_results_iter = ctx.results[parity];
7763 const uint32_t* cur_missing_cts = ctx.missing_cts[parity];
7764 for (uint32_t variant_bidx = 0; variant_bidx != prev_block_size; ++variant_bidx) {
7765 const uint32_t write_variant_uidx = BitIter1(variant_include, &write_variant_uidx_base, &cur_bits);
7766 if (write_variant_uidx >= chr_end) {
7767 do {
7768 ++chr_fo_idx;
7769 chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
7770 } while (write_variant_uidx >= chr_end);
7771 const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
7772 cur_sample_ct = (chr_idx == y_code)? male_ct : sample_ct;
7773 if (chr_buf) {
7774 char* chr_name_end = chrtoa(cip, chr_idx, chr_buf);
7775 *chr_name_end = '\t';
7776 chr_buf_blen = 1 + S_CAST(uintptr_t, chr_name_end - chr_buf);
7777 }
7778 }
7779 if (binfile) {
7780 // may as well write variant-ID file in this loop
7781 cswritep = strcpya(cswritep, variant_ids[write_variant_uidx]);
7782 AppendBinaryEoln(&cswritep);
7783 if (unlikely(Cswrite(&css, &cswritep))) {
7784 goto Vscore_ret_WRITE_FAIL;
7785 }
7786 continue;
7787 }
7788 if (chr_col) {
7789 cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
7790 }
7791 if (variant_bps) {
7792 cswritep = u32toa_x(variant_bps[write_variant_uidx], '\t', cswritep);
7793 }
7794 cswritep = strcpya(cswritep, variant_ids[write_variant_uidx]);
7795 uintptr_t allele_idx_offset_base = write_variant_uidx * 2;
7796 if (allele_idx_offsets) {
7797 allele_idx_offset_base = allele_idx_offsets[write_variant_uidx];
7798 cur_allele_ct = allele_idx_offsets[write_variant_uidx + 1] - allele_idx_offset_base;
7799 }
7800 const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
7801 if (ref_col) {
7802 *cswritep++ = '\t';
7803 cswritep = strcpya(cswritep, cur_alleles[0]);
7804 }
7805 if (alt1_col) {
7806 *cswritep++ = '\t';
7807 cswritep = strcpya(cswritep, cur_alleles[1]);
7808 }
7809 if (alt_col) {
7810 *cswritep++ = '\t';
7811 for (uint32_t allele_idx = 1; allele_idx != cur_allele_ct; ++allele_idx) {
7812 if (unlikely(Cswrite(&css, &cswritep))) {
7813 goto Vscore_ret_WRITE_FAIL;
7814 }
7815 cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
7816 }
7817 --cswritep;
7818 }
7819 if (allele_freqs) {
7820 *cswritep++ = '\t';
7821 cswritep = dtoa_g(1.0 - allele_freqs[allele_idx_offset_base - write_variant_uidx], cswritep);
7822 }
7823 if (nmiss_col) {
7824 *cswritep++ = '\t';
7825 cswritep = u32toa(cur_missing_cts[variant_bidx], cswritep);
7826 }
7827 if (nobs_col) {
7828 *cswritep++ = '\t';
7829 cswritep = u32toa(cur_sample_ct - cur_missing_cts[variant_bidx], cswritep);
7830 }
7831 for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx) {
7832 *cswritep++ = '\t';
7833 cswritep = dtoa_g(*cur_results_iter++, cswritep);
7834 }
7835 AppendBinaryEoln(&cswritep);
7836 if (unlikely(Cswrite(&css, &cswritep))) {
7837 goto Vscore_ret_WRITE_FAIL;
7838 }
7839 }
7840 if (binfile) {
7841 if (unlikely(fwrite_checked(cur_results_iter, vscore_ct * prev_block_size * sizeof(double), binfile))) {
7842 goto Vscore_ret_WRITE_FAIL;
7843 }
7844 }
7845 if (variant_idx == variant_ct) {
7846 break;
7847 }
7848 if (variant_idx >= next_print_variant_idx) {
7849 if (pct > 10) {
7850 putc_unlocked('\b', stdout);
7851 }
7852 pct = (variant_idx * 100LLU) / variant_ct;
7853 printf("\b\b%u%%", pct++);
7854 fflush(stdout);
7855 next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
7856 }
7857 }
7858 prev_block_size = cur_block_size;
7859 variant_idx += cur_block_size;
7860 pgfip->block_base = main_loadbufs[parity];
7861 }
7862 if (unlikely(CswriteCloseNull(&css, cswritep))) {
7863 goto Vscore_ret_WRITE_FAIL;
7864 }
7865 putc_unlocked('\r', stdout);
7866 if (!binfile) {
7867 logprintfww("--variant-score: Results written to %s .\n", outname);
7868 } else {
7869 if (unlikely(fclose_null(&binfile))) {
7870 goto Vscore_ret_WRITE_FAIL;
7871 }
7872 outname_end[8] = '\0';
7873 logprintfww("--variant-score: Score matrix written to %sbin , and associated column and variant ID labels written to %scols and %svars%s .\n", outname, outname, outname, output_zst? ".zst" : "");
7874 }
7875 }
7876 while (0) {
7877 Vscore_ret_NOMEM:
7878 reterr = kPglRetNomem;
7879 break;
7880 Vscore_ret_OPEN_FAIL:
7881 reterr = kPglRetOpenFail;
7882 break;
7883 Vscore_ret_TSTREAM_FAIL:
7884 TextStreamErrPrint("--variant-score file", &txs);
7885 break;
7886 Vscore_ret_PGR_FAIL:
7887 PgenErrPrintN(reterr);
7888 break;
7889 Vscore_ret_WRITE_FAIL:
7890 reterr = kPglRetWriteFail;
7891 break;
7892 Vscore_ret_MALFORMED_INPUT_WW:
7893 WordWrapB(0);
7894 logerrputsb();
7895 Vscore_ret_MALFORMED_INPUT:
7896 reterr = kPglRetMalformedInput;
7897 break;
7898 Vscore_ret_MISSING_TOKENS:
7899 logerrprintfww("Error: Line %" PRIuPTR " of --variant-score file has fewer tokens than expected.\n", line_idx);
7900 Vscore_ret_INCONSISTENT_INPUT:
7901 reterr = kPglRetInconsistentInput;
7902 break;
7903 Vscore_ret_THREAD_CREATE_FAIL:
7904 reterr = kPglRetThreadCreateFail;
7905 break;
7906 }
7907 Vscore_ret_1:
7908 fclose_cond(binfile);
7909 CswriteCloseCond(&css, cswritep);
7910 CleanupThreads(&tg);
7911 CleanupTextStream2("--variant-score file", &txs, &reterr);
7912 BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
7913 return reterr;
7914 }
7915
7916 #ifdef __cplusplus
7917 } // namespace plink2
7918 #endif
7919