// This file is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.


#include "plink2_compress_stream.h"
#include "plink2_matrix.h"
#include "plink2_matrix_calc.h"
#include "plink2_random.h"

#ifdef __cplusplus
namespace plink2 {
#endif

void InitScore(ScoreInfo* score_info_ptr) {
  score_info_ptr->flags = kfScore0;
  score_info_ptr->varid_col_p1 = 1;
  score_info_ptr->allele_col_p1 = 0;  // defensive
  score_info_ptr->input_fname = nullptr;
  InitRangeList(&(score_info_ptr->input_col_idx_range_list));

  score_info_ptr->qsr_range_fname = nullptr;
  score_info_ptr->qsr_data_fname = nullptr;
  score_info_ptr->qsr_varid_col_p1 = 1;
  score_info_ptr->qsr_val_col_p1 = 0;  // defensive
}

void CleanupScore(ScoreInfo* score_info_ptr) {
  free_cond(score_info_ptr->input_fname);
  CleanupRangeList(&(score_info_ptr->input_col_idx_range_list));

  free_cond(score_info_ptr->qsr_range_fname);
  free_cond(score_info_ptr->qsr_data_fname);
}


uint32_t TriangleDivide(int64_t cur_prod_x2, int32_t modif) {
  // return smallest integer vv for which (vv * (vv + modif)) is no smaller
  // than cur_prod_x2, and neither term in the product is negative.
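  // e.g. TriangleDivide(12, 1) == 3, since 3 * 4 = 12 >= 12 while
  // 2 * 3 = 6 < 12.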
  int64_t vv;
  if (cur_prod_x2 == 0) {
    if (modif < 0) {
      return -modif;
    }
    return 0;
  }
  vv = S_CAST(int64_t, sqrt(S_CAST(double, cur_prod_x2)));
  while ((vv - 1) * (vv + modif - 1) >= cur_prod_x2) {
    vv--;
  }
  while (vv * (vv + modif) < cur_prod_x2) {
    vv++;
  }
  return vv;
}

void ParallelBounds(uint32_t ct, int32_t start, uint32_t parallel_idx, uint32_t parallel_tot, int32_t* __restrict bound_start_ptr, int32_t* __restrict bound_end_ptr) {
  int32_t modif = 1 - start * 2;
  int64_t ct_tot = S_CAST(int64_t, ct) * (ct + modif);
  *bound_start_ptr = TriangleDivide((ct_tot * parallel_idx) / parallel_tot, modif);
  *bound_end_ptr = TriangleDivide((ct_tot * (parallel_idx + 1)) / parallel_tot, modif);
}
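// Worked example: with ct == 4 and start == 0 (diagonal included, so rows
// 0..3 hold 1+2+3+4 = 10 cells) and parallel_tot == 2, the two jobs get row
// ranges [0, 3) and [3, 4): rows 0-2 cover 6 cells, row 3 the remaining 4.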

// set align to 1 for no alignment
void TriangleFill(uint32_t ct, uint32_t piece_ct, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t start, uint32_t align, uint32_t* target_arr) {
  int32_t modif = 1 - start * 2;
  int64_t cur_prod_x2;
  int32_t lbound;
  int32_t ubound;
  uint32_t uii;
  uint32_t align_m1;
  ParallelBounds(ct, start, parallel_idx, parallel_tot, &lbound, &ubound);
  // x(x+1)/2 is divisible by y iff (x % (2y)) is 0 or (2y - 1).
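  // (This holds when y is a power of two, as the alignment values used here
  // are: exactly one of x and x+1 is even, so 2y divides x(x+1) iff 2y
  // divides that even factor, i.e. x % (2y) is 0 or 2y - 1.)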
  align *= 2;
  align_m1 = align - 1;
  target_arr[0] = lbound;
  target_arr[piece_ct] = ubound;
  cur_prod_x2 = S_CAST(int64_t, lbound) * (lbound + modif);
  const int64_t ct_tr = (S_CAST(int64_t, ubound) * (ubound + modif) - cur_prod_x2) / piece_ct;
  for (uint32_t piece_idx = 1; piece_idx != piece_ct; ++piece_idx) {
    cur_prod_x2 += ct_tr;
    lbound = TriangleDivide(cur_prod_x2, modif);
    uii = (lbound - S_CAST(int32_t, start)) & align_m1;
    if (uii && (uii != align_m1)) {
      lbound = start + ((lbound - S_CAST(int32_t, start)) | align_m1);
    }
    // lack of this check caused a nasty bug earlier
    if (S_CAST(uint32_t, lbound) > ct) {
      lbound = ct;
    }
    target_arr[piece_idx] = lbound;
  }
}

// Returns 0 if cells_avail is insufficient.
uint32_t CountTrianglePasses(uintptr_t start_idx, uintptr_t end_idx, uintptr_t is_no_diag, uintptr_t cells_avail) {
  start_idx -= is_no_diag;
  end_idx -= is_no_diag;
  if (cells_avail < end_idx) {
    return 0;
  }
  cells_avail *= 2;  // don't want to worry about /2 in triangular numbers
  const uint64_t end_tri = S_CAST(uint64_t, end_idx) * (end_idx + 1);
  uint64_t start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
  for (uint32_t pass_ct = 1; ; ++pass_ct) {
    const uint64_t delta_tri = end_tri - start_tri;
    if (delta_tri <= cells_avail) {
      return pass_ct;
    }
    const uint64_t next_target = start_tri + cells_avail;
    start_idx = S_CAST(int64_t, sqrt(u63tod(next_target)));
    start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
    if (start_tri > next_target) {
      --start_idx;
      start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
      assert(start_tri <= next_target);
    }
  }
}

uint64_t NextTrianglePass(uintptr_t start_idx, uintptr_t grand_end_idx, uintptr_t is_no_diag, uintptr_t cells_avail) {
  cells_avail *= 2;
  start_idx -= is_no_diag;
  grand_end_idx -= is_no_diag;
  const uint64_t end_tri = S_CAST(uint64_t, grand_end_idx) * (grand_end_idx + 1);
  uint64_t start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
  const uint64_t delta_tri = end_tri - start_tri;
  if (delta_tri <= cells_avail) {
    return grand_end_idx + is_no_diag;
  }
  const uint64_t next_target = start_tri + cells_avail;
  start_idx = S_CAST(int64_t, sqrt(u63tod(next_target)));
  start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
  return start_idx + is_no_diag - (start_tri > next_target);
}

void TriangleLoadBalance(uint32_t piece_ct, uintptr_t start_idx, uintptr_t end_idx, uint32_t is_no_diag, uint32_t* target_arr) {
  target_arr[0] = start_idx;
  target_arr[piece_ct] = end_idx;
  start_idx -= is_no_diag;
  end_idx -= is_no_diag;
  const uint64_t end_tri = S_CAST(uint64_t, end_idx) * (end_idx + 1);
  uint64_t cur_target = S_CAST(uint64_t, start_idx) * (start_idx + 1);
  const uint64_t std_size = (end_tri - cur_target) / piece_ct;
  for (uint32_t piece_idx = 1; piece_idx != piece_ct; ++piece_idx) {
    // don't use cur_target = start_tri + (piece_idx * delta_tri) / piece_ct
    // because of potential overflow
    cur_target += std_size;
    start_idx = S_CAST(int64_t, sqrt(u63tod(cur_target)));
    const uint64_t start_tri = S_CAST(uint64_t, start_idx) * (start_idx + 1);
    if (start_tri > cur_target) {
      --start_idx;
    }
    target_arr[piece_idx] = start_idx + is_no_diag;
  }
}

PglErr KinshipPruneDestructive(uintptr_t* kinship_table, uintptr_t* sample_include, uint32_t* sample_ct_ptr) {
  PglErr reterr = kPglRetSuccess;
  {
    const uintptr_t orig_sample_ct = *sample_ct_ptr;
    const uintptr_t orig_sample_ctl = BitCtToWordCt(orig_sample_ct);
    uintptr_t* sample_include_collapsed_nz;
    uintptr_t* sample_remove_collapsed;
    uint32_t* vertex_degree;
    if (unlikely(
            bigstack_calloc_w(orig_sample_ctl, &sample_include_collapsed_nz) ||
            bigstack_calloc_w(orig_sample_ctl, &sample_remove_collapsed) ||
            bigstack_alloc_u32(orig_sample_ct, &vertex_degree))) {
      goto KinshipPruneDestructive_ret_NOMEM;
    }
    // 1. count the number of constraints for each remaining sample
    uint32_t degree_1_vertex_ct = 0;
    for (uint32_t sample_idx = 0; sample_idx != orig_sample_ct; ++sample_idx) {
      const uintptr_t woffset = sample_idx * orig_sample_ctl;
      const uintptr_t* read_iter1 = &(kinship_table[woffset]);
      // don't currently guarantee vector-alignment of kinship_table rows, so
      // can't use PopcountWords().  (change this?)
      uint32_t cur_degree = 0;
      for (uint32_t widx = 0; widx != orig_sample_ctl; ++widx) {
        const uintptr_t cur_word = *read_iter1++;
        cur_degree += PopcountWord(cur_word);
      }
      if (cur_degree) {
        vertex_degree[sample_idx] = cur_degree;
        degree_1_vertex_ct += (cur_degree == 1);
        SetBit(sample_idx, sample_include_collapsed_nz);
      }
    }
    uint32_t cur_sample_nz_ct = PopcountWords(sample_include_collapsed_nz, orig_sample_ctl);
    // 2. as long as edges remain,
    //    a. remove partner of first degree-one vertex, if such a vertex exists
    //    b. otherwise, remove first maximal-degree vertex
    //    (similar to plink 1.9 rel_cutoff_batch(), but data structure is not
    //    triangular since more speed is needed)
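    //    Rule (a) is safe: if A-B is the only edge incident to B, removing A
    //    satisfies that constraint and possibly others, so it is never worse
    //    than removing B.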
    while (cur_sample_nz_ct) {
      uint32_t prune_uidx;
      uint32_t cur_degree;
      if (degree_1_vertex_ct) {
        uint32_t degree_1_vertex_uidx = 0;
        while (1) {
          // sparse
          degree_1_vertex_uidx = AdvTo1Bit(sample_include_collapsed_nz, degree_1_vertex_uidx);
          if (vertex_degree[degree_1_vertex_uidx] == 1) {
            break;
          }
          ++degree_1_vertex_uidx;
        }
        // find partner
        prune_uidx = AdvTo1Bit(&(kinship_table[degree_1_vertex_uidx * orig_sample_ctl]), 0);
        cur_degree = vertex_degree[prune_uidx];
      } else {
        uint32_t sample_uidx = AdvTo1Bit(sample_include_collapsed_nz, 0);
        cur_degree = vertex_degree[sample_uidx];
        prune_uidx = sample_uidx;
        for (uint32_t sample_idx = 1; sample_idx != cur_sample_nz_ct; ++sample_idx) {
          // sparse
          sample_uidx = AdvTo1Bit(sample_include_collapsed_nz, sample_uidx + 1);
          const uint32_t new_degree = vertex_degree[sample_uidx];
          if (new_degree > cur_degree) {
            cur_degree = new_degree;
            prune_uidx = sample_uidx;
          }
        }
      }
      // remove row/column
      uintptr_t* cur_kinship_col = &(kinship_table[prune_uidx / kBitsPerWord]);
      const uintptr_t kinship_col_mask = ~(k1LU << (prune_uidx % kBitsPerWord));
      uintptr_t* cur_kinship_row = &(kinship_table[prune_uidx * orig_sample_ctl]);
      uint32_t sample_uidx = 0;
      for (uint32_t partner_idx = 0; partner_idx != cur_degree; ++partner_idx, ++sample_uidx) {
        // sparse
        sample_uidx = AdvTo1Bit(cur_kinship_row, sample_uidx);
        const uint32_t new_degree = vertex_degree[sample_uidx] - 1;
        if (!new_degree) {
          ClearBit(sample_uidx, sample_include_collapsed_nz);
          --degree_1_vertex_ct;
          --cur_sample_nz_ct;
          // unnecessary to write to kinship_table[] or vertex_degree[]
        } else {
          cur_kinship_col[sample_uidx * orig_sample_ctl] &= kinship_col_mask;
          degree_1_vertex_ct += (new_degree == 1);
          vertex_degree[sample_uidx] = new_degree;
        }
      }
      if (vertex_degree[prune_uidx] == 1) {
        --degree_1_vertex_ct;
      }
      sample_remove_collapsed[prune_uidx / kBitsPerWord] |= ~kinship_col_mask;
      sample_include_collapsed_nz[prune_uidx / kBitsPerWord] &= kinship_col_mask;
      // unnecessary to update current kinship_table[] row
      --cur_sample_nz_ct;
    }
    uint32_t sample_ct = orig_sample_ct;
    uintptr_t sample_widx = 0;
    uintptr_t cur_bits = sample_include[0];
    for (uint32_t sample_idx = 0; sample_idx != orig_sample_ct; ++sample_idx) {
      const uintptr_t lowbit = BitIter1y(sample_include, &sample_widx, &cur_bits);
      if (IsSet(sample_remove_collapsed, sample_idx)) {
        sample_include[sample_widx] ^= lowbit;
        --sample_ct;
      }
    }
    *sample_ct_ptr = sample_ct;
  }
  while (0) {
  KinshipPruneDestructive_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  }
  return reterr;
}

PglErr KingCutoffBatch(const SampleIdInfo* siip, uint32_t raw_sample_ct, double king_cutoff, uintptr_t* sample_include, char* king_cutoff_fprefix, uint32_t* sample_ct_ptr) {
  unsigned char* bigstack_mark = g_bigstack_base;
  FILE* binfile = nullptr;
  char* fprefix_end = &(king_cutoff_fprefix[strlen(king_cutoff_fprefix)]);
  uintptr_t line_idx = 0;
  PglErr reterr = kPglRetSuccess;
  TextStream txs;
  PreinitTextStream(&txs);
  {
    uint32_t sample_ct = *sample_ct_ptr;
    const uint32_t orig_sample_ctl = BitCtToWordCt(sample_ct);
    uintptr_t* kinship_table;
    uint32_t* sample_uidx_to_king_uidx;
    if (unlikely(
            bigstack_calloc_w(sample_ct * orig_sample_ctl, &kinship_table) ||
            bigstack_alloc_u32(raw_sample_ct, &sample_uidx_to_king_uidx))) {
      goto KingCutoffBatch_ret_NOMEM;
    }

    snprintf(fprefix_end, 9, ".king.id");
    reterr = InitTextStream(king_cutoff_fprefix, kTextStreamBlenFast, 1, &txs);
    if (unlikely(reterr)) {
      goto KingCutoffBatch_ret_TSTREAM_FAIL;
    }
    // bugfix (18 Aug 2018): this missed some xid_mode possibilities
    // todo: try to simplify this interface, it's bordering on incomprehensible
    char* line_start;
    XidMode xid_mode;
    reterr = LoadXidHeader("king-cutoff", (siip->sids || (siip->flags & kfSampleIdStrictSid0))? kfXidHeader0 : kfXidHeaderIgnoreSid, &line_idx, &txs, &xid_mode, &line_start, nullptr);
    if (unlikely(reterr)) {
      if (reterr == kPglRetEof) {
        logerrputs("Error: Empty --king-cutoff ID file.\n");
        goto KingCutoffBatch_ret_MALFORMED_INPUT;
      }
      goto KingCutoffBatch_ret_TSTREAM_XID_FAIL;
    }

    uint32_t* xid_map;  // IDs not collapsed
    char* sorted_xidbox;
    uintptr_t max_xid_blen;
    reterr = SortedXidboxInitAlloc(sample_include, siip, sample_ct, 0, xid_mode, 0, &sorted_xidbox, &xid_map, &max_xid_blen);
    if (unlikely(reterr)) {
      goto KingCutoffBatch_ret_1;
    }
    char* idbuf;
    if (unlikely(bigstack_alloc_c(max_xid_blen, &idbuf))) {
      goto KingCutoffBatch_ret_NOMEM;
    }
    SetAllU32Arr(raw_sample_ct, sample_uidx_to_king_uidx);
    uintptr_t king_id_ct = 0;
    if (*line_start == '#') {
      ++line_idx;
      line_start = TextGet(&txs);
    }
    for (; line_start; ++line_idx, line_start = TextGet(&txs)) {
      const char* linebuf_iter = line_start;
      uint32_t sample_uidx;
      if (SortedXidboxReadFind(sorted_xidbox, xid_map, max_xid_blen, sample_ct, 0, xid_mode, &linebuf_iter, &sample_uidx, idbuf)) {
        if (unlikely(!linebuf_iter)) {
          goto KingCutoffBatch_ret_MISSING_TOKENS;
        }
        continue;
      }
      if (unlikely(sample_uidx_to_king_uidx[sample_uidx] != UINT32_MAX)) {
        char* first_tab = AdvToDelim(idbuf, '\t');
        char* second_tab = strchr(&(first_tab[1]), '\t');
        *first_tab = ' ';
        if (second_tab) {
          *second_tab = ' ';
        }
        snprintf(g_logbuf, kLogbufSize, "Error: Duplicate ID '%s' in %s .\n", idbuf, king_cutoff_fprefix);
        goto KingCutoffBatch_ret_MALFORMED_INPUT_WW;
      }
      sample_uidx_to_king_uidx[sample_uidx] = king_id_ct;
      ++king_id_ct;
    }
    if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
      goto KingCutoffBatch_ret_TSTREAM_FAIL;
    }

    BigstackReset(TextStreamMemStart(&txs));
    if (unlikely(CleanupTextStream2(king_cutoff_fprefix, &txs, &reterr))) {
      goto KingCutoffBatch_ret_1;
    }
    uintptr_t* king_include;
    uint32_t* king_uidx_to_sample_idx;
    if (unlikely(
            bigstack_calloc_w(BitCtToWordCt(king_id_ct), &king_include) ||
            bigstack_alloc_u32(king_id_ct, &king_uidx_to_sample_idx))) {
      goto KingCutoffBatch_ret_NOMEM;
    }
    uintptr_t sample_uidx_base = 0;
    uintptr_t sample_include_bits = sample_include[0];
    for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
      const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
      const uint32_t king_uidx = sample_uidx_to_king_uidx[sample_uidx];
      if (king_uidx != UINT32_MAX) {
        SetBit(king_uidx, king_include);
        king_uidx_to_sample_idx[king_uidx] = sample_idx;
      }
    }
    snprintf(fprefix_end, 10, ".king.bin");
    if (unlikely(fopen_checked(king_cutoff_fprefix, FOPEN_RB, &binfile))) {
      goto KingCutoffBatch_ret_OPEN_FAIL;
    }
    if (unlikely(fseeko(binfile, 0, SEEK_END))) {
      goto KingCutoffBatch_ret_READ_FAIL;
    }
    const uint64_t fsize = ftello(binfile);
    const uint64_t fsize_double_expected = (king_id_ct * (S_CAST(uint64_t, king_id_ct) - 1) * (sizeof(double) / 2));
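    // Triangular .king.bin layout: king_id_ct * (king_id_ct - 1) / 2 values,
    // 8 bytes each in double-precision mode (half that in single-precision
    // mode, checked in the else-branch below).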
    const uint32_t is_double = (fsize == fsize_double_expected);
    rewind(binfile);
    const uint32_t first_king_uidx = AdvBoundedTo1Bit(king_include, 0, king_id_ct);
    uintptr_t king_uidx = AdvBoundedTo1Bit(king_include, first_king_uidx + 1, king_id_ct);
    if (king_uidx > 1) {
      if (fseeko(binfile, king_uidx * (S_CAST(uint64_t, king_uidx) - 1) * (2 + (2 * is_double)), SEEK_SET)) {
        goto KingCutoffBatch_ret_READ_FAIL;
      }
    }
    uintptr_t constraint_ct = 0;
    if (is_double) {
      // fread limit
      assert(king_id_ct <= ((kMaxBytesPerIO / sizeof(double)) + 1));
      double* king_drow;
      if (unlikely(bigstack_alloc_d(king_id_ct - 1, &king_drow))) {
        goto KingCutoffBatch_ret_NOMEM;
      }
      for (uint32_t king_idx = 1; king_uidx != king_id_ct; ++king_idx, ++king_uidx) {
        if (!IsSet(king_include, king_uidx)) {
          king_uidx = AdvBoundedTo1Bit(king_include, king_uidx + 1, king_id_ct);
          if (king_uidx == king_id_ct) {
            break;
          }
          if (unlikely(fseeko(binfile, S_CAST(uint64_t, king_uidx) * (king_uidx - 1) * (sizeof(double) / 2), SEEK_SET))) {
            goto KingCutoffBatch_ret_READ_FAIL;
          }
        }
        if (unlikely(!fread_unlocked(king_drow, king_uidx * sizeof(double), 1, binfile))) {
          goto KingCutoffBatch_ret_READ_FAIL;
        }
        const uintptr_t sample_idx = king_uidx_to_sample_idx[king_uidx];
        uintptr_t* kinship_table_row = &(kinship_table[sample_idx * orig_sample_ctl]);
        uintptr_t* kinship_table_col = &(kinship_table[sample_idx / kBitsPerWord]);
        const uintptr_t kinship_new_bit = k1LU << (sample_idx % kBitsPerWord);
        uintptr_t king_uidx2_base;
        uintptr_t king_include_bits;
        BitIter1Start(king_include, first_king_uidx, &king_uidx2_base, &king_include_bits);
        for (uint32_t king_idx2 = 0; king_idx2 != king_idx; ++king_idx2) {
          const uintptr_t king_uidx2 = BitIter1(king_include, &king_uidx2_base, &king_include_bits);
          if (king_drow[king_uidx2] > king_cutoff) {
            const uintptr_t sample_idx2 = king_uidx_to_sample_idx[king_uidx2];
            SetBit(sample_idx2, kinship_table_row);
            kinship_table_col[sample_idx2 * orig_sample_ctl] |= kinship_new_bit;
            ++constraint_ct;
          }
        }
      }
    } else {
      if (unlikely(fsize != (fsize_double_expected / 2))) {
        const uint64_t fsize_double_square = king_id_ct * S_CAST(uint64_t, king_id_ct) * sizeof(double);
        if ((fsize == fsize_double_square) || (fsize == fsize_double_square / 2)) {
          logerrputs("Error: --king-cutoff currently requires a *triangular* .bin file; the provided\nfile appears to be square.\n");
        } else {
          logerrprintfww("Error: Invalid --king-cutoff .bin file size (expected %" PRIu64 " or %" PRIu64 " bytes).\n", fsize_double_expected / 2, fsize_double_expected);
        }
        goto KingCutoffBatch_ret_MALFORMED_INPUT;
      }
      assert(king_id_ct <= ((0x7ffff000 / sizeof(float)) + 1));
      const float king_cutoff_f = S_CAST(float, king_cutoff);
      float* king_frow;
      if (unlikely(bigstack_alloc_f(king_id_ct - 1, &king_frow))) {
        goto KingCutoffBatch_ret_NOMEM;
      }
      for (uint32_t king_idx = 1; king_uidx != king_id_ct; ++king_idx, ++king_uidx) {
        if (!IsSet(king_include, king_uidx)) {
          king_uidx = AdvBoundedTo1Bit(king_include, king_uidx + 1, king_id_ct);
          if (king_uidx == king_id_ct) {
            break;
          }
          if (unlikely(fseeko(binfile, S_CAST(uint64_t, king_uidx) * (king_uidx - 1) * (sizeof(float) / 2), SEEK_SET))) {
            goto KingCutoffBatch_ret_READ_FAIL;
          }
        }
        if (unlikely(!fread_unlocked(king_frow, king_uidx * sizeof(float), 1, binfile))) {
          goto KingCutoffBatch_ret_READ_FAIL;
        }
        const uintptr_t sample_idx = king_uidx_to_sample_idx[king_uidx];
        uintptr_t* kinship_table_row = &(kinship_table[sample_idx * orig_sample_ctl]);
        uintptr_t* kinship_table_col = &(kinship_table[sample_idx / kBitsPerWord]);
        const uintptr_t kinship_new_bit = k1LU << (sample_idx % kBitsPerWord);
        uintptr_t king_uidx2_base;
        uintptr_t king_include_bits;
        BitIter1Start(king_include, first_king_uidx, &king_uidx2_base, &king_include_bits);
        for (uint32_t king_idx2 = 0; king_idx2 != king_idx; ++king_idx2) {
          const uintptr_t king_uidx2 = BitIter1(king_include, &king_uidx2_base, &king_include_bits);
          if (king_frow[king_uidx2] > king_cutoff_f) {
            const uintptr_t sample_idx2 = king_uidx_to_sample_idx[king_uidx2];
            SetBit(sample_idx2, kinship_table_row);
            kinship_table_col[sample_idx2 * orig_sample_ctl] |= kinship_new_bit;
            ++constraint_ct;
          }
        }
      }
    }
    logprintf("--king-cutoff: %" PRIuPTR " constraint%s loaded.\n", constraint_ct, (constraint_ct == 1)? "" : "s");
    BigstackReset(sample_uidx_to_king_uidx);
    if (unlikely(KinshipPruneDestructive(kinship_table, sample_include, sample_ct_ptr))) {
      goto KingCutoffBatch_ret_NOMEM;
    }
  }
  while (0) {
  KingCutoffBatch_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  KingCutoffBatch_ret_OPEN_FAIL:
    reterr = kPglRetOpenFail;
    break;
  KingCutoffBatch_ret_READ_FAIL:
    logerrprintfww(kErrprintfFread, king_cutoff_fprefix, strerror(errno));
    reterr = kPglRetReadFail;
    break;
  KingCutoffBatch_ret_MISSING_TOKENS:
    logerrprintfww("Error: Fewer tokens than expected on line %" PRIuPTR " of %s .\n", line_idx, king_cutoff_fprefix);
    reterr = kPglRetMalformedInput;
    break;
  KingCutoffBatch_ret_TSTREAM_XID_FAIL:
    if (!TextStreamErrcode(&txs)) {
      break;
    }
  KingCutoffBatch_ret_TSTREAM_FAIL:
    TextStreamErrPrint(king_cutoff_fprefix, &txs);
    break;
  KingCutoffBatch_ret_MALFORMED_INPUT_WW:
    WordWrapB(0);
    logerrputsb();
  KingCutoffBatch_ret_MALFORMED_INPUT:
    reterr = kPglRetMalformedInput;
    break;
  }
 KingCutoffBatch_ret_1:
  fclose_cond(binfile);
  if (CleanupTextStream(&txs, &reterr)) {
    snprintf(fprefix_end, 9, ".king.id");
    logerrprintfww(kErrprintfFread, king_cutoff_fprefix, strerror(errno));
  }
  BigstackReset(bigstack_mark);
  return reterr;
}

CONSTI32(kKingOffsetIbs0, 0);
CONSTI32(kKingOffsetHethet, 1);
CONSTI32(kKingOffsetHet2Hom1, 2);
CONSTI32(kKingOffsetHet1Hom2, 3);
CONSTI32(kKingOffsetHomhom, 4);

typedef struct CalcKingSparseCtxStruct {
  const uintptr_t* variant_include_orig;
  uintptr_t* sample_include;
  uint32_t* sample_include_cumulative_popcounts;
  uint32_t row_start_idx;
  uint32_t row_end_idx;
  uint32_t homhom_needed;

  uint32_t max_sparse_ct;

  uint32_t read_block_size;  // guaranteed to be power of 2

  PgenReader** pgr_ptrs;
  uintptr_t** genovecs;
  uint32_t* read_variant_uidx_starts;

  // this has length >= 3 * max_sparse_ct
  uint32_t** thread_idx_bufs;

  uint32_t cur_block_size;

  uint32_t** thread_singleton_het_cts;
  uint32_t** thread_singleton_hom_cts;
  uint32_t** thread_singleton_missing_cts;
  uint32_t* thread_skip_cts;

  // single global copy
  uint32_t* king_counts;

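  // double-buffered, indexed by block parity (which flips once per
  // main-loop iteration in CalcKingSparseThread() below)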
  uintptr_t** thread_sparse_excludes[2];

  PglErr reterr;
} CalcKingSparseCtx;

THREAD_FUNC_DECL CalcKingSparseThread(void* raw_arg) {
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  const uintptr_t tidx = arg->tidx;
  CalcKingSparseCtx* ctx = S_CAST(CalcKingSparseCtx*, arg->sharedp->context);

  const uintptr_t* variant_include_orig = ctx->variant_include_orig;
  const uintptr_t* sample_include = ctx->sample_include;

  PgenReader* pgrp = ctx->pgr_ptrs[tidx];
  PgrSampleSubsetIndex pssi;
  PgrSetSampleSubsetIndex(ctx->sample_include_cumulative_popcounts, pgrp, &pssi);
  uintptr_t* genovec = ctx->genovecs[tidx];
  uint32_t row_start_idx = ctx->row_start_idx;
  const uint64_t tri_start = ((row_start_idx - 1) * S_CAST(uint64_t, row_start_idx)) / 2;
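  // Pair (i, j) with i > j lives at triangular index i * (i - 1) / 2 + j;
  // tri_start is the index of the first pair owned by this row range.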
  if (row_start_idx == 1) {
    row_start_idx = 0;
  }
  const uint32_t sample_ct = ctx->row_end_idx;
  const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
  const uint32_t remainder = sample_ct % kBitsPerWordD2;
  const uint32_t calc_thread_ct = GetThreadCt(arg->sharedp);
  const uint32_t homhom_needed = ctx->homhom_needed;
  const uintptr_t homhom_needed_p4 = homhom_needed + 4;
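  // Each pair's record in king_counts[] is homhom_needed_p4 uint32s, laid
  // out per the kKingOffset* constants above.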
  const uint32_t max_sparse_ct = ctx->max_sparse_ct;
  const uint32_t read_block_size_mask = ctx->read_block_size - 1;
  const uint32_t read_block_sizel = ctx->read_block_size / kBitsPerWord;

  uint32_t* idx_bufs[4];
  idx_bufs[0] = nullptr;
  idx_bufs[1] = ctx->thread_idx_bufs[tidx];
  idx_bufs[2] = &(idx_bufs[1][max_sparse_ct]);
  idx_bufs[3] = &(idx_bufs[2][max_sparse_ct]);
  const uint32_t min_common_ct = sample_ct - max_sparse_ct;

  uint32_t* singleton_het_cts = ctx->thread_singleton_het_cts[tidx];
  uint32_t* singleton_hom_cts = ctx->thread_singleton_hom_cts[tidx];
  uint32_t* singleton_missing_cts = ctx->thread_singleton_missing_cts[tidx];
  ZeroU32Arr(sample_ct, singleton_het_cts);
  ZeroU32Arr(sample_ct, singleton_hom_cts);
  ZeroU32Arr(sample_ct, singleton_missing_cts);
  uint32_t skip_ct = 0;

  uint32_t* king_counts = ctx->king_counts;
  {
    // This matrix can be huge, so we multithread zero-initialization.
    const uint64_t entry_ct = homhom_needed_p4 * (((sample_ct - 1) * S_CAST(uint64_t, sample_ct)) / 2 - tri_start);
    const uintptr_t fill_start = RoundDownPow2((tidx * entry_ct) / calc_thread_ct, kInt32PerCacheline);
    uintptr_t fill_end = entry_ct;
    if (tidx + 1 != calc_thread_ct) {
      fill_end = RoundDownPow2(((tidx + 1) * entry_ct) / calc_thread_ct, kInt32PerCacheline);
    }
    ZeroU32Arr(fill_end - fill_start, &(king_counts[fill_start]));
  }
  uint32_t parity = 0;
  // One synchronization point before the main loop: the other threads must
  // finish their zero-initialization jobs before we can proceed.
  while (!THREAD_BLOCK_FINISH(arg)) {
    const uint32_t cur_block_size = ctx->cur_block_size;
    const uint32_t idx_end = ((tidx + 1) * cur_block_size) / calc_thread_ct;
    uintptr_t variant_uidx_base;
    uintptr_t variant_include_bits;
    BitIter1Start(variant_include_orig, ctx->read_variant_uidx_starts[tidx], &variant_uidx_base, &variant_include_bits);
    uintptr_t* sparse_exclude = ctx->thread_sparse_excludes[parity][tidx];
    ZeroWArr(read_block_sizel, sparse_exclude);
    // probable todo: better load-balancing
    for (uint32_t cur_idx = (tidx * cur_block_size) / calc_thread_ct; cur_idx != idx_end; ++cur_idx) {
      const uint32_t variant_uidx = BitIter1(variant_include_orig, &variant_uidx_base, &variant_include_bits);
      // tried DifflistOrGenovec, difference was negligible.  Not really worth
      // considering it when calculation is inherently >O(mn).
      PglErr reterr = PgrGet(sample_include, pssi, sample_ct, variant_uidx, pgrp, genovec);
      if (unlikely(reterr)) {
        ctx->reterr = reterr;
        goto CalcKingSparseThread_err;
      }
      STD_ARRAY_DECL(uint32_t, 4, genocounts);
      ZeroTrailingNyps(sample_ct, genovec);
      GenoarrCountFreqsUnsafe(genovec, sample_ct, genocounts);
      uintptr_t mask_word;
      uintptr_t common_idx;
      if (genocounts[0] >= min_common_ct) {
        common_idx = 0;
        mask_word = 0;
      } else if (genocounts[2] >= min_common_ct) {
        common_idx = 2;
        mask_word = kMaskAAAA;
      } else if (genocounts[3] >= min_common_ct) {
        common_idx = 3;
        mask_word = ~k0LU;
        ++skip_ct;
      } else {
        if ((!homhom_needed) && ((genocounts[0] + genocounts[3] == sample_ct) || (genocounts[2] + genocounts[3] == sample_ct))) {
          SetBit(variant_uidx & read_block_size_mask, sparse_exclude);
          ++skip_ct;
        }
        continue;
      }
      SetBit(variant_uidx & read_block_size_mask, sparse_exclude);
      if (genocounts[common_idx] == sample_ct) {
        continue;
      }
      if (remainder) {
        genovec[sample_ctl2 - 1] |= mask_word << (2 * remainder);
      }
      uint32_t* idx_buf_iters[4];
      memcpy(idx_buf_iters, idx_bufs, 4 * sizeof(intptr_t));
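      // XORing each 2-bit genotype against the common genotype (mask_word)
      // zeroes out the common entries; each nonzero 2-bit code then indexes
      // idx_bufs[], so the loop below collects the sample-indices of the
      // rare genotypes, bucketed by code.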
      for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
        uintptr_t xor_word = genovec[widx] ^ mask_word;
        if (xor_word) {
          const uint32_t offset_base = widx * kBitsPerWordD2;
          do {
            const uint32_t shift_ct = ctzw(xor_word) & (~1);
            const uint32_t cur_xor = (xor_word >> shift_ct) & 3;
            *(idx_buf_iters[cur_xor])++ = offset_base + (shift_ct / 2);
            xor_word &= ~((3 * k1LU) << shift_ct);
          } while (xor_word);
        }
      }
      // We do two things here.
      // 1. Update singleton_{het,hom,missing}_cts for every observed rare
      //    genotype.  This is enough for correct accounting for any pair
      //    involving only one (or none) of these rare genotypes, and the
      //    arrays are small enough that each thread can keep its own copy
      //    (they're added up at the end).
      // 2. For each pair of rare genotypes, atomically correct the main
      //    king_counts[] array.  This is messy (9x2 cases) but conceptually
      //    straightforward.
      const uint32_t* het_idxs = idx_bufs[common_idx ^ 1];
      const uint32_t het_ct = genocounts[1];
      if (common_idx != 3) {
        const uint32_t* other_hom_idxs = idx_bufs[2];
        const uint32_t* missing_idxs = idx_bufs[common_idx ^ 3];
        const uint32_t other_hom_ct = idx_buf_iters[2] - other_hom_idxs;
        const uint32_t missing_ct = genocounts[3];
        for (uint32_t uii = 0; uii != het_ct; ++uii) {
          // We want to iterate over one row at a time, for better
          // memory-access locality.  So the outer loop must correspond to the
          // larger sample-index.
          const uintptr_t sample_idx_hi = het_idxs[uii];
          singleton_het_cts[sample_idx_hi] += 1;
          if (sample_idx_hi < row_start_idx) {
            continue;
          }
          const uintptr_t tri_base = (S_CAST(uint64_t, sample_idx_hi) * (sample_idx_hi - 1)) / 2 - tri_start;
          for (uint32_t ujj = 0; ujj != uii; ++ujj) {
            const uintptr_t sample_idx_lo = het_idxs[ujj];
            const uintptr_t tri_coord = tri_base + sample_idx_lo;
            uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
            __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHethet]), 1, __ATOMIC_RELAXED);
            __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetHet2Hom1]), 1, __ATOMIC_RELAXED);
            __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetHet1Hom2]), 1, __ATOMIC_RELAXED);
            if (homhom_needed) {
              __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
            }
          }
          for (uint32_t ujj = 0; ujj != other_hom_ct; ++ujj) {
            const uintptr_t sample_idx_lo = other_hom_idxs[ujj];
            if (sample_idx_lo > sample_idx_hi) {
              break;
            }
            const uintptr_t tri_coord = tri_base + sample_idx_lo;
            uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
            __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetIbs0]), 1, __ATOMIC_RELAXED);
          }
          for (uint32_t ujj = 0; ujj != missing_ct; ++ujj) {
            const uintptr_t sample_idx_lo = missing_idxs[ujj];
            if (sample_idx_lo > sample_idx_hi) {
              break;
            }
            const uintptr_t tri_coord = tri_base + sample_idx_lo;
            uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
            __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetHet2Hom1]), 1, __ATOMIC_RELAXED);
            if (homhom_needed) {
              __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
            }
          }
        }
        for (uint32_t uii = 0; uii != other_hom_ct; ++uii) {
          const uintptr_t sample_idx_hi = other_hom_idxs[uii];
          singleton_hom_cts[sample_idx_hi] += 1;
          if (sample_idx_hi < row_start_idx) {
            continue;
          }
          const uintptr_t tri_base = (S_CAST(uint64_t, sample_idx_hi) * (sample_idx_hi - 1)) / 2 - tri_start;
          for (uint32_t ujj = 0; ujj != uii; ++ujj) {
            const uintptr_t sample_idx_lo = other_hom_idxs[ujj];
            const uintptr_t tri_coord = tri_base + sample_idx_lo;
            uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
            __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetIbs0]), 2, __ATOMIC_RELAXED);
          }
          for (uint32_t ujj = 0; ujj != het_ct; ++ujj) {
            const uintptr_t sample_idx_lo = het_idxs[ujj];
            if (sample_idx_lo > sample_idx_hi) {
              break;
            }
            const uintptr_t tri_coord = tri_base + sample_idx_lo;
            uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
            __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetIbs0]), 1, __ATOMIC_RELAXED);
          }
          for (uint32_t ujj = 0; ujj != missing_ct; ++ujj) {
            const uintptr_t sample_idx_lo = missing_idxs[ujj];
            if (sample_idx_lo > sample_idx_hi) {
              break;
            }
            const uintptr_t tri_coord = tri_base + sample_idx_lo;
            uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
            __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetIbs0]), 1, __ATOMIC_RELAXED);
            if (homhom_needed) {
              __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
            }
          }
        }
        for (uint32_t uii = 0; uii != missing_ct; ++uii) {
          const uintptr_t sample_idx_hi = missing_idxs[uii];
          singleton_missing_cts[sample_idx_hi] += 1;
          if (sample_idx_hi < row_start_idx) {
            continue;
          }
          const uintptr_t tri_base = (S_CAST(uint64_t, sample_idx_hi) * (sample_idx_hi - 1)) / 2 - tri_start;
          if (homhom_needed) {
            for (uint32_t ujj = 0; ujj != uii; ++ujj) {
              const uintptr_t sample_idx_lo = missing_idxs[ujj];
              const uintptr_t tri_coord = tri_base + sample_idx_lo;
              // bugfix (12 Nov 2019): added 4 twice
              uint32_t* king_counts_ptr = &(king_counts[tri_coord * 5]);
              __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
            }
          }
          for (uint32_t ujj = 0; ujj != het_ct; ++ujj) {
            const uintptr_t sample_idx_lo = het_idxs[ujj];
            if (sample_idx_lo > sample_idx_hi) {
              break;
            }
            const uintptr_t tri_coord = tri_base + sample_idx_lo;
            uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
            __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetHet1Hom2]), 1, __ATOMIC_RELAXED);
            if (homhom_needed) {
              __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
            }
          }
          for (uint32_t ujj = 0; ujj != other_hom_ct; ++ujj) {
            const uintptr_t sample_idx_lo = other_hom_idxs[ujj];
            if (sample_idx_lo > sample_idx_hi) {
              break;
            }
            const uintptr_t tri_coord = tri_base + sample_idx_lo;
            uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
            __atomic_fetch_sub(&(king_counts_ptr[kKingOffsetIbs0]), 1, __ATOMIC_RELAXED);
            if (homhom_needed) {
              __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
            }
          }
        }
      } else {
        // merge hom0 and hom2 cases.
        for (uint32_t hom_geno = 0; hom_geno != 4; hom_geno += 2) {
          const uint32_t* cur_hom_idxs = idx_bufs[3 - hom_geno];
          const uint32_t* opp_hom_idxs = idx_bufs[1 + hom_geno];
          const uint32_t cur_hom_ct = genocounts[hom_geno];
          const uint32_t opp_hom_ct = genocounts[2 - hom_geno];
          for (uint32_t uii = 0; uii != cur_hom_ct; ++uii) {
            const uintptr_t sample_idx_hi = cur_hom_idxs[uii];
            if (sample_idx_hi < row_start_idx) {
              continue;
            }
            const uintptr_t tri_base = (S_CAST(uint64_t, sample_idx_hi) * (sample_idx_hi - 1)) / 2 - tri_start;
            if (homhom_needed) {
              for (uint32_t ujj = 0; ujj != uii; ++ujj) {
                const uintptr_t sample_idx_lo = cur_hom_idxs[ujj];
                const uintptr_t tri_coord = tri_base + sample_idx_lo;
                uint32_t* king_counts_ptr = &(king_counts[tri_coord * 5]);
                __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
              }
            }
            for (uint32_t ujj = 0; ujj != het_ct; ++ujj) {
              const uintptr_t sample_idx_lo = het_idxs[ujj];
              if (sample_idx_lo > sample_idx_hi) {
                break;
              }
              const uintptr_t tri_coord = tri_base + sample_idx_lo;
              uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
              __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHet1Hom2]), 1, __ATOMIC_RELAXED);
            }
            for (uint32_t ujj = 0; ujj != opp_hom_ct; ++ujj) {
              const uintptr_t sample_idx_lo = opp_hom_idxs[ujj];
              if (sample_idx_lo > sample_idx_hi) {
                break;
              }
              const uintptr_t tri_coord = tri_base + sample_idx_lo;
              uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
              __atomic_fetch_add(&(king_counts_ptr[kKingOffsetIbs0]), 1, __ATOMIC_RELAXED);
              if (homhom_needed) {
                __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHomhom]), 1, __ATOMIC_RELAXED);
              }
            }
          }
        }
        const uint32_t* hom0_idxs = idx_bufs[3];
        const uint32_t* hom2_idxs = idx_bufs[1];
        const uint32_t hom0_ct = genocounts[0];
        const uint32_t hom2_ct = genocounts[2];
        for (uint32_t uii = 0; uii != het_ct; ++uii) {
          const uintptr_t sample_idx_hi = het_idxs[uii];
          if (sample_idx_hi < row_start_idx) {
            continue;
          }
          const uintptr_t tri_base = (S_CAST(uint64_t, sample_idx_hi) * (sample_idx_hi - 1)) / 2 - tri_start;
          for (uint32_t ujj = 0; ujj != uii; ++ujj) {
            const uintptr_t sample_idx_lo = het_idxs[ujj];
            const uintptr_t tri_coord = tri_base + sample_idx_lo;
            uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
            __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHethet]), 1, __ATOMIC_RELAXED);
          }
          for (uint32_t ujj = 0; ujj != hom0_ct; ++ujj) {
            const uintptr_t sample_idx_lo = hom0_idxs[ujj];
            if (sample_idx_lo > sample_idx_hi) {
              break;
            }
            const uintptr_t tri_coord = tri_base + sample_idx_lo;
            uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
            __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHet2Hom1]), 1, __ATOMIC_RELAXED);
          }
          for (uint32_t ujj = 0; ujj != hom2_ct; ++ujj) {
            const uintptr_t sample_idx_lo = hom2_idxs[ujj];
            if (sample_idx_lo > sample_idx_hi) {
              break;
            }
            const uintptr_t tri_coord = tri_base + sample_idx_lo;
            uint32_t* king_counts_ptr = &(king_counts[tri_coord * homhom_needed_p4]);
            __atomic_fetch_add(&(king_counts_ptr[kKingOffsetHet2Hom1]), 1, __ATOMIC_RELAXED);
          }
        }
      }
    }
  CalcKingSparseThread_err:
    parity = 1 - parity;
  }
  ctx->thread_skip_cts[tidx] = skip_ct;
  THREAD_RETURN;
}

#ifdef USE_SSE42
CONSTI32(kKingMultiplex, 1024);
CONSTI32(kKingMultiplexWords, kKingMultiplex / kBitsPerWord);
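// Bit-encoding, per the bit-ops below: for each variant slot, the smaj_hom
// bit is set iff the genotype is a non-missing homozygote, and the
// smaj_ref2het bit distinguishes the two homozygotes (so
// (ref2het1 ^ ref2het2) & hom1 & hom2 counts IBS0 slots) or, for
// non-homozygotes, is set iff the genotype is het (so ref2het & (~hom)
// extracts hets).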
void IncrKing(const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts_iter) {
  // Tried adding another level of blocking, but couldn't get it to make a
  // difference.
  for (uint32_t second_idx = start_idx; second_idx != end_idx; ++second_idx) {
    // technically overflows for huge sample_ct
    const uint32_t second_offset = second_idx * kKingMultiplexWords;
    const uintptr_t* second_hom = &(smaj_hom[second_offset]);
    const uintptr_t* second_ref2het = &(smaj_ref2het[second_offset]);
    const uintptr_t* first_hom_iter = smaj_hom;
    const uintptr_t* first_ref2het_iter = smaj_ref2het;
    while (first_hom_iter < second_hom) {
      uint32_t acc_ibs0 = 0;
      uint32_t acc_hethet = 0;
      uint32_t acc_het2hom1 = 0;
      uint32_t acc_het1hom2 = 0;
      for (uint32_t widx = 0; widx != kKingMultiplexWords; ++widx) {
        const uintptr_t hom1 = first_hom_iter[widx];
        const uintptr_t hom2 = second_hom[widx];
        const uintptr_t ref2het1 = first_ref2het_iter[widx];
        const uintptr_t ref2het2 = second_ref2het[widx];
        const uintptr_t homhom = hom1 & hom2;
        const uintptr_t het1 = ref2het1 & (~hom1);
        const uintptr_t het2 = ref2het2 & (~hom2);
        acc_ibs0 += PopcountWord((ref2het1 ^ ref2het2) & homhom);
        acc_hethet += PopcountWord(het1 & het2);
        acc_het2hom1 += PopcountWord(hom1 & het2);
        acc_het1hom2 += PopcountWord(hom2 & het1);
      }
      king_counts_iter[kKingOffsetIbs0] += acc_ibs0;
      king_counts_iter[kKingOffsetHethet] += acc_hethet;
      king_counts_iter[kKingOffsetHet2Hom1] += acc_het2hom1;
      king_counts_iter[kKingOffsetHet1Hom2] += acc_het1hom2;
      king_counts_iter = &(king_counts_iter[4]);

      first_hom_iter = &(first_hom_iter[kKingMultiplexWords]);
      first_ref2het_iter = &(first_ref2het_iter[kKingMultiplexWords]);
    }
  }
}

void IncrKingHomhom(const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts_iter) {
  for (uint32_t second_idx = start_idx; second_idx != end_idx; ++second_idx) {
    // technically overflows for huge sample_ct
    const uint32_t second_offset = second_idx * kKingMultiplexWords;
    const uintptr_t* second_hom = &(smaj_hom[second_offset]);
    const uintptr_t* second_ref2het = &(smaj_ref2het[second_offset]);
    const uintptr_t* first_hom_iter = smaj_hom;
    const uintptr_t* first_ref2het_iter = smaj_ref2het;
    while (first_hom_iter < second_hom) {
      uint32_t acc_homhom = 0;
      uint32_t acc_ibs0 = 0;
      uint32_t acc_hethet = 0;
      uint32_t acc_het2hom1 = 0;
      uint32_t acc_het1hom2 = 0;
      for (uint32_t widx = 0; widx != kKingMultiplexWords; ++widx) {
        const uintptr_t hom1 = first_hom_iter[widx];
        const uintptr_t hom2 = second_hom[widx];
        const uintptr_t ref2het1 = first_ref2het_iter[widx];
        const uintptr_t ref2het2 = second_ref2het[widx];
        const uintptr_t homhom = hom1 & hom2;
        const uintptr_t het1 = ref2het1 & (~hom1);
        const uintptr_t het2 = ref2het2 & (~hom2);
        acc_homhom += PopcountWord(homhom);
        acc_ibs0 += PopcountWord((ref2het1 ^ ref2het2) & homhom);
        acc_hethet += PopcountWord(het1 & het2);
        acc_het2hom1 += PopcountWord(hom1 & het2);
        acc_het1hom2 += PopcountWord(hom2 & het1);
      }
      king_counts_iter[kKingOffsetIbs0] += acc_ibs0;
      king_counts_iter[kKingOffsetHethet] += acc_hethet;
      king_counts_iter[kKingOffsetHet2Hom1] += acc_het2hom1;
      king_counts_iter[kKingOffsetHet1Hom2] += acc_het1hom2;
      king_counts_iter[kKingOffsetHomhom] += acc_homhom;
      king_counts_iter = &(king_counts_iter[5]);

      first_hom_iter = &(first_hom_iter[kKingMultiplexWords]);
      first_ref2het_iter = &(first_ref2het_iter[kKingMultiplexWords]);
    }
  }
}
#else  // !USE_SSE42
#  ifdef __LP64__
CONSTI32(kKingMultiplex, 1536);
#  else
CONSTI32(kKingMultiplex, 960);
#  endif
static_assert(kKingMultiplex % (3 * kBitsPerVec) == 0, "Invalid kKingMultiplex value.");
CONSTI32(kKingMultiplexWords, kKingMultiplex / kBitsPerWord);
CONSTI32(kKingMultiplexVecs, kKingMultiplex / kBitsPerVec);
// Without SSE4.2, there is no cheap PopcountWord().  Use Lauradoux/Walisch
// accumulators, since Harley-Seal requires too many variables.
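// Scheme: each group of 3 vectors is reduced from 1-bit to 2-bit counts
// (mask m1), then to 4-bit counts (m2), and the group sums are accumulated
// into 8-bit lanes (m4).  The 8-bit lanes cannot overflow: each absorbs at
// most kKingMultiplexVecs / 3 group sums of at most 24 apiece, i.e.
// 8 * kKingMultiplexVecs < 256 for both kKingMultiplex settings above.  A
// final m8 fold widens to 16-bit lanes for UniVecHsum16().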
void IncrKing(const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts_iter) {
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  for (uint32_t second_idx = start_idx; second_idx != end_idx; ++second_idx) {
    // technically overflows for huge sample_ct
    const uint32_t second_offset = second_idx * kKingMultiplexWords;
    const VecW* second_hom = R_CAST(const VecW*, &(smaj_hom[second_offset]));
    const VecW* second_ref2het = R_CAST(const VecW*, &(smaj_ref2het[second_offset]));
    const VecW* first_hom_iter = R_CAST(const VecW*, smaj_hom);
    const VecW* first_ref2het_iter = R_CAST(const VecW*, smaj_ref2het);
    while (first_hom_iter < second_hom) {
      UniVec acc_ibs0;
      UniVec acc_hethet;
      UniVec acc_het2hom1;
      UniVec acc_het1hom2;
      acc_ibs0.vw = vecw_setzero();
      acc_hethet.vw = vecw_setzero();
      acc_het2hom1.vw = vecw_setzero();
      acc_het1hom2.vw = vecw_setzero();
      for (uint32_t vec_idx = 0; vec_idx < kKingMultiplexVecs; vec_idx += 3) {
        VecW hom1 = first_hom_iter[vec_idx];
        VecW hom2 = second_hom[vec_idx];
        VecW ref2het1 = first_ref2het_iter[vec_idx];
        VecW ref2het2 = second_ref2het[vec_idx];
        VecW het1 = vecw_and_notfirst(hom1, ref2het1);
        VecW het2 = vecw_and_notfirst(hom2, ref2het2);
        VecW agg_ibs0 = (ref2het1 ^ ref2het2) & (hom1 & hom2);
        VecW agg_hethet = het1 & het2;
        VecW agg_het2hom1 = hom1 & het2;
        VecW agg_het1hom2 = hom2 & het1;
        agg_ibs0 = agg_ibs0 - (vecw_srli(agg_ibs0, 1) & m1);
        agg_hethet = agg_hethet - (vecw_srli(agg_hethet, 1) & m1);
        agg_het2hom1 = agg_het2hom1 - (vecw_srli(agg_het2hom1, 1) & m1);
        agg_het1hom2 = agg_het1hom2 - (vecw_srli(agg_het1hom2, 1) & m1);
        agg_ibs0 = (agg_ibs0 & m2) + (vecw_srli(agg_ibs0, 2) & m2);
        agg_hethet = (agg_hethet & m2) + (vecw_srli(agg_hethet, 2) & m2);
        agg_het2hom1 = (agg_het2hom1 & m2) + (vecw_srli(agg_het2hom1, 2) & m2);
        agg_het1hom2 = (agg_het1hom2 & m2) + (vecw_srli(agg_het1hom2, 2) & m2);

        for (uint32_t offset = 1; offset != 3; ++offset) {
          hom1 = first_hom_iter[vec_idx + offset];
          hom2 = second_hom[vec_idx + offset];
          ref2het1 = first_ref2het_iter[vec_idx + offset];
          ref2het2 = second_ref2het[vec_idx + offset];
          het1 = vecw_and_notfirst(hom1, ref2het1);
          het2 = vecw_and_notfirst(hom2, ref2het2);
          VecW cur_ibs0 = (ref2het1 ^ ref2het2) & (hom1 & hom2);
          VecW cur_hethet = het1 & het2;
          VecW cur_het2hom1 = hom1 & het2;
          VecW cur_het1hom2 = hom2 & het1;
          cur_ibs0 = cur_ibs0 - (vecw_srli(cur_ibs0, 1) & m1);
          cur_hethet = cur_hethet - (vecw_srli(cur_hethet, 1) & m1);
          cur_het2hom1 = cur_het2hom1 - (vecw_srli(cur_het2hom1, 1) & m1);
          cur_het1hom2 = cur_het1hom2 - (vecw_srli(cur_het1hom2, 1) & m1);
          agg_ibs0 += (cur_ibs0 & m2) + (vecw_srli(cur_ibs0, 2) & m2);
          agg_hethet += (cur_hethet & m2) + (vecw_srli(cur_hethet, 2) & m2);
          agg_het2hom1 += (cur_het2hom1 & m2) + (vecw_srli(cur_het2hom1, 2) & m2);
          agg_het1hom2 += (cur_het1hom2 & m2) + (vecw_srli(cur_het1hom2, 2) & m2);
        }
        acc_ibs0.vw = acc_ibs0.vw + (agg_ibs0 & m4) + (vecw_srli(agg_ibs0, 4) & m4);
        acc_hethet.vw = acc_hethet.vw + (agg_hethet & m4) + (vecw_srli(agg_hethet, 4) & m4);
        acc_het2hom1.vw = acc_het2hom1.vw + (agg_het2hom1 & m4) + (vecw_srli(agg_het2hom1, 4) & m4);
        acc_het1hom2.vw = acc_het1hom2.vw + (agg_het1hom2 & m4) + (vecw_srli(agg_het1hom2, 4) & m4);
      }
      const VecW m8 = VCONST_W(kMask00FF);
      acc_ibs0.vw = (acc_ibs0.vw & m8) + (vecw_srli(acc_ibs0.vw, 8) & m8);
      acc_hethet.vw = (acc_hethet.vw & m8) + (vecw_srli(acc_hethet.vw, 8) & m8);
      acc_het2hom1.vw = (acc_het2hom1.vw & m8) + (vecw_srli(acc_het2hom1.vw, 8) & m8);
      acc_het1hom2.vw = (acc_het1hom2.vw & m8) + (vecw_srli(acc_het1hom2.vw, 8) & m8);
      king_counts_iter[kKingOffsetIbs0] += UniVecHsum16(acc_ibs0);
      king_counts_iter[kKingOffsetHethet] += UniVecHsum16(acc_hethet);
      king_counts_iter[kKingOffsetHet2Hom1] += UniVecHsum16(acc_het2hom1);
      king_counts_iter[kKingOffsetHet1Hom2] += UniVecHsum16(acc_het1hom2);
      king_counts_iter = &(king_counts_iter[4]);

      first_hom_iter = &(first_hom_iter[kKingMultiplexVecs]);
      first_ref2het_iter = &(first_ref2het_iter[kKingMultiplexVecs]);
    }
  }
}

void IncrKingHomhom(const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts_iter) {
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  for (uint32_t second_idx = start_idx; second_idx != end_idx; ++second_idx) {
    // technically overflows for huge sample_ct
    const uint32_t second_offset = second_idx * kKingMultiplexWords;
    const VecW* second_hom = R_CAST(const VecW*, &(smaj_hom[second_offset]));
    const VecW* second_ref2het = R_CAST(const VecW*, &(smaj_ref2het[second_offset]));
    const VecW* first_hom_iter = R_CAST(const VecW*, smaj_hom);
    const VecW* first_ref2het_iter = R_CAST(const VecW*, smaj_ref2het);
    while (first_hom_iter < second_hom) {
      UniVec acc_homhom;
      UniVec acc_ibs0;
      UniVec acc_hethet;
      UniVec acc_het2hom1;
      UniVec acc_het1hom2;
      acc_homhom.vw = vecw_setzero();
      acc_ibs0.vw = vecw_setzero();
      acc_hethet.vw = vecw_setzero();
      acc_het2hom1.vw = vecw_setzero();
      acc_het1hom2.vw = vecw_setzero();
      for (uint32_t vec_idx = 0; vec_idx < kKingMultiplexVecs; vec_idx += 3) {
        VecW hom1 = first_hom_iter[vec_idx];
        VecW hom2 = second_hom[vec_idx];
        VecW ref2het1 = first_ref2het_iter[vec_idx];
        VecW ref2het2 = second_ref2het[vec_idx];
        VecW agg_homhom = hom1 & hom2;
        VecW het1 = vecw_and_notfirst(hom1, ref2het1);
        VecW het2 = vecw_and_notfirst(hom2, ref2het2);
        VecW agg_ibs0 = (ref2het1 ^ ref2het2) & agg_homhom;
        VecW agg_hethet = het1 & het2;
        VecW agg_het2hom1 = hom1 & het2;
        VecW agg_het1hom2 = hom2 & het1;
        agg_homhom = agg_homhom - (vecw_srli(agg_homhom, 1) & m1);
        agg_ibs0 = agg_ibs0 - (vecw_srli(agg_ibs0, 1) & m1);
        agg_hethet = agg_hethet - (vecw_srli(agg_hethet, 1) & m1);
        agg_het2hom1 = agg_het2hom1 - (vecw_srli(agg_het2hom1, 1) & m1);
        agg_het1hom2 = agg_het1hom2 - (vecw_srli(agg_het1hom2, 1) & m1);
        agg_homhom = (agg_homhom & m2) + (vecw_srli(agg_homhom, 2) & m2);
        agg_ibs0 = (agg_ibs0 & m2) + (vecw_srli(agg_ibs0, 2) & m2);
        agg_hethet = (agg_hethet & m2) + (vecw_srli(agg_hethet, 2) & m2);
        agg_het2hom1 = (agg_het2hom1 & m2) + (vecw_srli(agg_het2hom1, 2) & m2);
        agg_het1hom2 = (agg_het1hom2 & m2) + (vecw_srli(agg_het1hom2, 2) & m2);

        for (uint32_t offset = 1; offset != 3; ++offset) {
          hom1 = first_hom_iter[vec_idx + offset];
          hom2 = second_hom[vec_idx + offset];
          ref2het1 = first_ref2het_iter[vec_idx + offset];
          ref2het2 = second_ref2het[vec_idx + offset];
          VecW cur_homhom = hom1 & hom2;
          het1 = vecw_and_notfirst(hom1, ref2het1);
          het2 = vecw_and_notfirst(hom2, ref2het2);
          VecW cur_ibs0 = (ref2het1 ^ ref2het2) & cur_homhom;
          VecW cur_hethet = het1 & het2;
          VecW cur_het2hom1 = hom1 & het2;
          VecW cur_het1hom2 = hom2 & het1;
          cur_homhom = cur_homhom - (vecw_srli(cur_homhom, 1) & m1);
          cur_ibs0 = cur_ibs0 - (vecw_srli(cur_ibs0, 1) & m1);
          cur_hethet = cur_hethet - (vecw_srli(cur_hethet, 1) & m1);
          cur_het2hom1 = cur_het2hom1 - (vecw_srli(cur_het2hom1, 1) & m1);
          cur_het1hom2 = cur_het1hom2 - (vecw_srli(cur_het1hom2, 1) & m1);
          agg_homhom += (cur_homhom & m2) + (vecw_srli(cur_homhom, 2) & m2);
          agg_ibs0 += (cur_ibs0 & m2) + (vecw_srli(cur_ibs0, 2) & m2);
          agg_hethet += (cur_hethet & m2) + (vecw_srli(cur_hethet, 2) & m2);
          agg_het2hom1 += (cur_het2hom1 & m2) + (vecw_srli(cur_het2hom1, 2) & m2);
          agg_het1hom2 += (cur_het1hom2 & m2) + (vecw_srli(cur_het1hom2, 2) & m2);
        }
        acc_homhom.vw = acc_homhom.vw + (agg_homhom & m4) + (vecw_srli(agg_homhom, 4) & m4);
        acc_ibs0.vw = acc_ibs0.vw + (agg_ibs0 & m4) + (vecw_srli(agg_ibs0, 4) & m4);
        acc_hethet.vw = acc_hethet.vw + (agg_hethet & m4) + (vecw_srli(agg_hethet, 4) & m4);
        acc_het2hom1.vw = acc_het2hom1.vw + (agg_het2hom1 & m4) + (vecw_srli(agg_het2hom1, 4) & m4);
        acc_het1hom2.vw = acc_het1hom2.vw + (agg_het1hom2 & m4) + (vecw_srli(agg_het1hom2, 4) & m4);
      }
      const VecW m8 = VCONST_W(kMask00FF);
      acc_homhom.vw = (acc_homhom.vw & m8) + (vecw_srli(acc_homhom.vw, 8) & m8);
      acc_ibs0.vw = (acc_ibs0.vw & m8) + (vecw_srli(acc_ibs0.vw, 8) & m8);
      acc_hethet.vw = (acc_hethet.vw & m8) + (vecw_srli(acc_hethet.vw, 8) & m8);
      acc_het2hom1.vw = (acc_het2hom1.vw & m8) + (vecw_srli(acc_het2hom1.vw, 8) & m8);
      acc_het1hom2.vw = (acc_het1hom2.vw & m8) + (vecw_srli(acc_het1hom2.vw, 8) & m8);
      king_counts_iter[kKingOffsetIbs0] += UniVecHsum16(acc_ibs0);
      king_counts_iter[kKingOffsetHethet] += UniVecHsum16(acc_hethet);
      king_counts_iter[kKingOffsetHet2Hom1] += UniVecHsum16(acc_het2hom1);
      king_counts_iter[kKingOffsetHet1Hom2] += UniVecHsum16(acc_het1hom2);
      king_counts_iter[kKingOffsetHomhom] += UniVecHsum16(acc_homhom);
      king_counts_iter = &(king_counts_iter[5]);

      first_hom_iter = &(first_hom_iter[kKingMultiplexVecs]);
      first_ref2het_iter = &(first_ref2het_iter[kKingMultiplexVecs]);
    }
  }
}
#endif
static_assert(!(kKingMultiplexWords % 2), "kKingMultiplexWords must be even for safe bit-transpose.");

typedef struct CalcKingDenseCtxStruct {
  uintptr_t* smaj_hom[2];
  uintptr_t* smaj_ref2het[2];
  uint32_t homhom_needed;

  uint32_t* thread_start;

  uint32_t* king_counts;
} CalcKingDenseCtx;
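
// smaj_hom[]/smaj_ref2het[] are double-buffered: the compute threads work on
// buffer [parity] while the main thread loads and transposes the next
// variant block into buffer [1 - parity].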

THREAD_FUNC_DECL CalcKingDenseThread(void* raw_arg) {
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  const uintptr_t tidx = arg->tidx;
  CalcKingDenseCtx* ctx = S_CAST(CalcKingDenseCtx*, arg->sharedp->context);

  const uint64_t mem_start_idx = ctx->thread_start[0];
  const uint64_t start_idx = ctx->thread_start[tidx];
  const uint32_t end_idx = ctx->thread_start[tidx + 1];
  const uint32_t homhom_needed = ctx->homhom_needed;
  uint32_t parity = 0;
  do {
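    // king_counts holds one counter tuple per sample pair, in
    // lower-triangular order; row r owns r cells, so rows
    // [mem_start_idx, start_idx) occupy
    // (start_idx * (start_idx - 1) - mem_start_idx * (mem_start_idx - 1)) / 2
    // cells.  Scaling by the tuple width (5 with homhom, 4 without) yields
    // this thread's starting offset.  E.g. with mem_start_idx == 1 and
    // start_idx == 3, rows 1 and 2 contribute 1 + 2 = (3*2 - 1*0)/2 = 3
    // tuples.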
    if (homhom_needed) {
      IncrKingHomhom(ctx->smaj_hom[parity], ctx->smaj_ref2het[parity], start_idx, end_idx, &(ctx->king_counts[((start_idx * (start_idx - 1) - mem_start_idx * (mem_start_idx - 1)) / 2) * 5]));
    } else {
      IncrKing(ctx->smaj_hom[parity], ctx->smaj_ref2het[parity], start_idx, end_idx, &(ctx->king_counts[(start_idx * (start_idx - 1) - mem_start_idx * (mem_start_idx - 1)) * 2]));
    }
    parity = 1 - parity;
  } while (!THREAD_BLOCK_FINISH(arg));
  THREAD_RETURN;
}

/*
double ComputeKinship(const uint32_t* king_counts_entry) {
  const uint32_t ibs0_ct = king_counts_entry[kKingOffsetIbs0];
  const uint32_t hethet_ct = king_counts_entry[kKingOffsetHethet];
  const uint32_t het2hom1_ct = king_counts_entry[kKingOffsetHet2Hom1];
  const uint32_t het1hom2_ct = king_counts_entry[kKingOffsetHet1Hom2];
  const intptr_t smaller_het_ct = hethet_ct + MINV(het1hom2_ct, het2hom1_ct);
  return 0.5 - (S_CAST(double, 4 * S_CAST(intptr_t, ibs0_ct) + het1hom2_ct + het2hom1_ct) / S_CAST(double, 4 * smaller_het_ct));
}
*/

// '2' refers to the larger index here
double ComputeKinship(const uint32_t* king_counts_entry, uint32_t singleton_het1_ct, uint32_t singleton_hom1_ct, uint32_t singleton_het2_ct, uint32_t singleton_hom2_ct) {
  const uint32_t ibs0_ct = king_counts_entry[kKingOffsetIbs0] + singleton_hom1_ct + singleton_hom2_ct;
  const uint32_t hethet_ct = king_counts_entry[kKingOffsetHethet];
  const uint32_t het2hom1_ct = king_counts_entry[kKingOffsetHet2Hom1] + singleton_het2_ct;
  const uint32_t het1hom2_ct = king_counts_entry[kKingOffsetHet1Hom2] + singleton_het1_ct;
  const intptr_t smaller_het_ct = hethet_ct + MINV(het1hom2_ct, het2hom1_ct);
  return 0.5 - (S_CAST(double, 4 * S_CAST(intptr_t, ibs0_ct) + het1hom2_ct + het2hom1_ct) / S_CAST(double, 4 * smaller_het_ct));
}
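
// The expression above matches the KING-robust kinship estimator
// (Manichaikul et al. 2010):
//   phi-hat = 1/2 - (4 * N_IBS0 + N_het1hom2 + N_het2hom1)
//                   / (4 * (N_hethet + min(N_het1hom2, N_het2hom1)))
// where the denominator term equals min(N_het1, N_het2), since
// N_het{i} = N_hethet + N_het{i}hom{j}.  The singleton_* arguments restore
// contributions from variants the sparse scan pulled out of the dense
// computation: at a site where only one member of the pair carries a
// non-major genotype, a het there is necessarily het-with-hom, and a
// minor-allele hom is necessarily opposite-hom (IBS0), so those tallies can
// be folded in per-sample instead of per-pair.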

// could also return pointer to end?
void SetKingMatrixFname(KingFlags king_flags, uint32_t parallel_idx, uint32_t parallel_tot, char* outname_end) {
  if (!(king_flags & (kfKingMatrixBin | kfKingMatrixBin4))) {
    char* outname_end2 = strcpya_k(outname_end, ".king");
    const uint32_t output_zst = king_flags & kfKingMatrixZs;
    if (parallel_tot != 1) {
      *outname_end2++ = '.';
      outname_end2 = u32toa(parallel_idx + 1, outname_end2);
    }
    if (output_zst) {
      outname_end2 = strcpya_k(outname_end2, ".zst");
    }
    *outname_end2 = '\0';
    return;
  }
  char* outname_end2 = strcpya_k(outname_end, ".king.bin");
  if (parallel_tot != 1) {
    *outname_end2++ = '.';
    outname_end2 = u32toa(parallel_idx + 1, outname_end2);
  }
  *outname_end2 = '\0';
}

void SetKingTableFname(KingFlags king_flags, uint32_t parallel_idx, uint32_t parallel_tot, char* outname_end) {
  char* outname_end2 = strcpya_k(outname_end, ".kin0");
  const uint32_t output_zst = king_flags & kfKingTableZs;
  if (parallel_tot != 1) {
    *outname_end2++ = '.';
    outname_end2 = u32toa(parallel_idx + 1, outname_end2);
  }
  if (output_zst) {
    outname_end2 = strcpya_k(outname_end2, ".zst");
  }
  *outname_end2 = '\0';
}
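
// Resulting filenames, to make the branches above concrete: text matrix ->
// .king / .king.zst, binary matrix -> .king.bin (never zst-compressed),
// table -> .kin0 / .kin0.zst.  With --parallel, the 1-based job index is
// inserted before any .zst suffix, e.g. .king.2.zst or .king.bin.2.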

char* AppendKingTableHeader(KingFlags king_flags, uint32_t king_col_fid, uint32_t king_col_sid, char* cswritep) {
  *cswritep++ = '#';
  if (king_flags & kfKingColId) {
    if (king_col_fid) {
      cswritep = strcpya_k(cswritep, "FID1\t");
    }
    // Was 'ID1' before alpha 3, but that's inconsistent with other plink2
    // commands, and in the meantime the header line still doesn't perfectly
    // match KING due to e.g. capitalization.
    cswritep = strcpya_k(cswritep, "IID1\t");
    if (king_col_sid) {
      cswritep = strcpya_k(cswritep, "SID1\t");
    }
    if (king_col_fid) {
      cswritep = strcpya_k(cswritep, "FID2\t");
    }
    cswritep = strcpya_k(cswritep, "IID2\t");
    if (king_col_sid) {
      cswritep = strcpya_k(cswritep, "SID2\t");
    }
  }
  if (king_flags & kfKingColNsnp) {
    cswritep = strcpya_k(cswritep, "NSNP\t");
  }
  if (king_flags & kfKingColHethet) {
    cswritep = strcpya_k(cswritep, "HETHET\t");
  }
  if (king_flags & kfKingColIbs0) {
    cswritep = strcpya_k(cswritep, "IBS0\t");
  }
  if (king_flags & kfKingColIbs1) {
    cswritep = strcpya_k(cswritep, "HET1_HOM2\tHET2_HOM1\t");
  }
  if (king_flags & kfKingColKinship) {
    cswritep = strcpya_k(cswritep, "KINSHIP\t");
  }
  DecrAppendBinaryEoln(&cswritep);
  return cswritep;
}
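
// Example: with FID columns present, SIDs absent, and the
// NSNP/HETHET/IBS0/KINSHIP columns enabled, this emits
//   #FID1\tIID1\tFID2\tIID2\tNSNP\tHETHET\tIBS0\tKINSHIP
// (DecrAppendBinaryEoln() converts the trailing tab into the line
// terminator).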

uint32_t KingMaxSparseCt(uint32_t row_end_idx) {
#ifdef USE_AVX2
  return row_end_idx / 33;
#else
  return row_end_idx / 30;
#endif
}
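
// These divisors appear to be tuning constants: a variant is only worth
// diverting to the sparse path when its non-major-genotype count is small
// enough that per-entry updates beat the dense bit-parallel kernel, and the
// break-even point presumably differs a bit between AVX2 and narrower
// vector builds.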

PglErr CalcKing(const SampleIdInfo* siip, const uintptr_t* variant_include_orig, const ChrInfo* cip, uint32_t raw_sample_ct, uint32_t orig_sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, double king_cutoff, double king_table_filter, KingFlags king_flags, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, PgenFileInfo* pgfip, PgenReader* simple_pgrp, uintptr_t* sample_include, uint32_t* sample_ct_ptr, char* outname, char* outname_end) {
  unsigned char* bigstack_mark = g_bigstack_base;
  FILE* outfile = nullptr;
  char* cswritep = nullptr;
  char* cswritetp = nullptr;
  CompressStreamState css;
  CompressStreamState csst;
  ThreadGroup tg;
  PglErr reterr = kPglRetSuccess;
  PreinitCstream(&css);
  PreinitCstream(&csst);
  PreinitThreads(&tg);
  {
    const KingFlags matrix_shape = king_flags & kfKingMatrixShapemask;
    const char* flagname = matrix_shape? "--make-king" : ((king_flags & kfKingColAll)? "--make-king-table" : "--king-cutoff");
    if (unlikely(IsSet(cip->haploid_mask, 0))) {
      logerrprintf("Error: %s cannot be used on haploid genomes.\n", flagname);
      goto CalcKing_ret_INCONSISTENT_INPUT;
    }
    uint32_t sample_ct = *sample_ct_ptr;
    if (unlikely(sample_ct < 2)) {
      logerrprintf("Error: %s requires at least 2 samples.\n", flagname);
      goto CalcKing_ret_DEGENERATE_DATA;
    }
#ifdef __LP64__
    // there's also a UINT32_MAX / kKingMultiplexWords limit, but that's not
    // relevant for now
    if (unlikely(sample_ct > 134000000)) {
      // for text output, 134m * 16 is just below kMaxLongLine
      logerrprintf("Error: %s does not support > 134000000 samples.\n", flagname);
      reterr = kPglRetNotYetSupported;
      goto CalcKing_ret_1;
    }
#endif
    const uintptr_t sample_ctl = BitCtToWordCt(sample_ct);
    uintptr_t* kinship_table = nullptr;
    if (king_cutoff != -1) {
      if (unlikely(bigstack_calloc_w(sample_ct * sample_ctl, &kinship_table))) {
        goto CalcKing_ret_NOMEM;
      }
    }
    const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
    const uint32_t non_autosomal_variant_ct = CountNonAutosomalVariants(variant_include_orig, cip, 1, 1);
    if (non_autosomal_variant_ct) {
      uintptr_t* variant_include_next;
      if (unlikely(bigstack_alloc_w(raw_variant_ctl, &variant_include_next))) {
        goto CalcKing_ret_NOMEM;
      }
      logprintf("Excluding %u variant%s on non-autosomes from KING-robust calculation.\n", non_autosomal_variant_ct, (non_autosomal_variant_ct == 1)? "" : "s");
      variant_ct -= non_autosomal_variant_ct;
      if (!variant_ct) {
        logerrprintf("Error: No variants remaining for KING-robust calculation.\n");
        goto CalcKing_ret_DEGENERATE_DATA;
      }
      memcpy(variant_include_next, variant_include_orig, raw_variant_ctl * sizeof(intptr_t));
      ExcludeNonAutosomalVariants(cip, variant_include_next);
      variant_include_orig = variant_include_next;
    }
    uintptr_t* variant_include;

    if (unlikely(
            bigstack_alloc_w(raw_variant_ctl, &variant_include))) {
      goto CalcKing_ret_NOMEM;
    }

    uint32_t grand_row_start_idx;
    uint32_t grand_row_end_idx;
    ParallelBounds(sample_ct, 1, parallel_idx, parallel_tot, R_CAST(int32_t*, &grand_row_start_idx), R_CAST(int32_t*, &grand_row_end_idx));

    // possible todo: allow this to change between passes
    uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
    if (calc_thread_ct > sample_ct / 32) {
      calc_thread_ct = sample_ct / 32;
    }
    if (!calc_thread_ct) {
      calc_thread_ct = 1;
    }
    const uint32_t homhom_needed = (king_flags & kfKingColNsnp) || ((!(king_flags & kfKingCounts)) && (king_flags & (kfKingColHethet | kfKingColIbs0 | kfKingColIbs1)));
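    // homhom counts exist only to reconstruct the nonmissing-variant-pair
    // total: directly for the NSNP column, and as the denominator when
    // HETHET/IBS0/HET1_HOM2/HET2_HOM1 are reported as proportions rather
    // than raw counts.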
    CalcKingSparseCtx sparse_ctx;
    uint32_t sparse_read_block_size = 0;
    STD_ARRAY_DECL(unsigned char*, 2, main_loadbufs);
    // These values are now permitted to underflow, as a side effect of the
    // sparse-variant optimization.  Might want to change them to int32_t*.
    uint32_t* singleton_het_cts;
    uint32_t* singleton_hom_cts;
    uint32_t* singleton_missing_cts;
    {
      sparse_ctx.variant_include_orig = variant_include_orig;
      sparse_ctx.homhom_needed = homhom_needed;
      const uint32_t max_sparse_ct = KingMaxSparseCt(grand_row_end_idx);
      // Ok for this to be a slight underestimate, since bigstack_left()/8 is
      // an arbitrary limit anyway.
      const uintptr_t thread_xalloc_cacheline_ct = DivUp((3 * k1LU) * (max_sparse_ct + grand_row_end_idx), kInt32PerCacheline) + ((kPglVblockSize * 2) / kBitsPerCacheline);
      if (unlikely(PgenMtLoadInit(variant_include_orig, grand_row_end_idx, variant_ct, bigstack_left() / 8, pgr_alloc_cacheline_ct, thread_xalloc_cacheline_ct, 0, 0, pgfip, &calc_thread_ct, &sparse_ctx.genovecs, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, &sparse_read_block_size, nullptr, main_loadbufs, &sparse_ctx.pgr_ptrs, &sparse_ctx.read_variant_uidx_starts))) {
        goto CalcKing_ret_NOMEM;
      }
      sparse_ctx.read_block_size = sparse_read_block_size;
      sparse_ctx.reterr = kPglRetSuccess;
      if (unlikely(
              bigstack_alloc_u32p(calc_thread_ct, &sparse_ctx.thread_idx_bufs) ||
              bigstack_alloc_u32p(calc_thread_ct, &sparse_ctx.thread_singleton_het_cts) ||
              bigstack_alloc_u32p(calc_thread_ct, &sparse_ctx.thread_singleton_hom_cts) ||
              bigstack_alloc_u32p(calc_thread_ct, &sparse_ctx.thread_singleton_missing_cts) ||
              bigstack_alloc_u32(calc_thread_ct, &sparse_ctx.thread_skip_cts) ||
              bigstack_alloc_wp(calc_thread_ct, &sparse_ctx.thread_sparse_excludes[0]) ||
              bigstack_alloc_wp(calc_thread_ct, &sparse_ctx.thread_sparse_excludes[1]))) {
        goto CalcKing_ret_NOMEM;
      }
      const uint32_t read_block_sizel = sparse_read_block_size / kBitsPerWord;
      for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
        if (unlikely(
                bigstack_alloc_u32(3 * max_sparse_ct, &(sparse_ctx.thread_idx_bufs[tidx])) ||
                bigstack_alloc_u32(grand_row_end_idx, &(sparse_ctx.thread_singleton_het_cts[tidx])) ||
                bigstack_alloc_u32(grand_row_end_idx, &(sparse_ctx.thread_singleton_hom_cts[tidx])) ||
                bigstack_alloc_u32(grand_row_end_idx, &(sparse_ctx.thread_singleton_missing_cts[tidx])) ||
                bigstack_alloc_w(read_block_sizel, &(sparse_ctx.thread_sparse_excludes[0][tidx])) ||
                bigstack_alloc_w(read_block_sizel, &(sparse_ctx.thread_sparse_excludes[1][tidx])))) {
          goto CalcKing_ret_NOMEM;
        }
      }
      singleton_het_cts = sparse_ctx.thread_singleton_het_cts[0];
      singleton_hom_cts = sparse_ctx.thread_singleton_hom_cts[0];
      singleton_missing_cts = sparse_ctx.thread_singleton_missing_cts[0];
    }

    CalcKingDenseCtx dense_ctx;
    if (unlikely(
            SetThreadCt(calc_thread_ct, &tg) ||
            bigstack_alloc_u32(calc_thread_ct + 1, &dense_ctx.thread_start))) {
      goto CalcKing_ret_NOMEM;
    }
    const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
    const uint32_t grei_ctaw = BitCtToAlignedWordCt(grand_row_end_idx);
    const uint32_t grei_ctaw2 = NypCtToAlignedWordCt(grand_row_end_idx);
    dense_ctx.homhom_needed = homhom_needed;
    const uint32_t king_bufsizew = kKingMultiplexWords * grand_row_end_idx;
    const uint32_t homhom_needed_p4 = dense_ctx.homhom_needed + 4;
    uintptr_t* cur_sample_include;
    uint32_t* sample_include_cumulative_popcounts;
    uintptr_t* loadbuf;
    uintptr_t* splitbuf_hom;
    uintptr_t* splitbuf_ref2het;
    VecW* vecaligned_buf;
    if (unlikely(
            bigstack_alloc_w(raw_sample_ctl, &cur_sample_include) ||
            bigstack_alloc_u32(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
            bigstack_alloc_w(grei_ctaw2, &loadbuf) ||
            bigstack_alloc_w(kPglBitTransposeBatch * grei_ctaw, &splitbuf_hom) ||
            bigstack_alloc_w(kPglBitTransposeBatch * grei_ctaw, &splitbuf_ref2het) ||
            bigstack_alloc_w(king_bufsizew, &(dense_ctx.smaj_hom[0])) ||
            bigstack_alloc_w(king_bufsizew, &(dense_ctx.smaj_ref2het[0])) ||
            bigstack_alloc_w(king_bufsizew, &(dense_ctx.smaj_hom[1])) ||
            bigstack_alloc_w(king_bufsizew, &(dense_ctx.smaj_ref2het[1])) ||
            bigstack_alloc_v(kPglBitTransposeBufvecs, &vecaligned_buf))) {
      goto CalcKing_ret_NOMEM;
    }

    // Make this automatically multipass when there's insufficient memory.  So
    // we open the output file(s) here, and just append in the main loop.
    unsigned char* numbuf = nullptr;
    if (matrix_shape) {
      SetKingMatrixFname(king_flags, parallel_idx, parallel_tot, outname_end);
      if (!(king_flags & (kfKingMatrixBin | kfKingMatrixBin4))) {
        // text matrix
        // won't be >2gb since sample_ct <= 134m
        const uint32_t overflow_buf_size = kCompressStreamBlock + 16 * sample_ct;
        reterr = InitCstreamAlloc(outname, 0, king_flags & kfKingMatrixZs, max_thread_ct, overflow_buf_size, &css, &cswritep);
        if (unlikely(reterr)) {
          goto CalcKing_ret_1;
        }
      } else {
        if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
          goto CalcKing_ret_OPEN_FAIL;
        }
        if (unlikely(bigstack_alloc_uc(sample_ct * 4 * (2 - ((king_flags / kfKingMatrixBin4) & 1)), &numbuf))) {
          // bugfix: allocation failure is out-of-memory, not open-fail
          goto CalcKing_ret_NOMEM;
        }
      }
    }
    uint32_t king_col_fid = 0;
    uint32_t king_col_sid = 0;
    uintptr_t max_sample_fmtid_blen = 0;
    char* collapsed_sample_fmtids = nullptr;
    if (king_flags & kfKingColAll) {
      const uint32_t overflow_buf_size = kCompressStreamBlock + kMaxMediumLine;
      SetKingTableFname(king_flags, parallel_idx, parallel_tot, outname_end);
      reterr = InitCstreamAlloc(outname, 0, king_flags & kfKingTableZs, max_thread_ct, overflow_buf_size, &csst, &cswritetp);
      if (unlikely(reterr)) {
        goto CalcKing_ret_1;
      }

      king_col_fid = FidColIsRequired(siip, king_flags / kfKingColMaybefid);
      king_col_sid = SidColIsRequired(siip->sids, king_flags / kfKingColMaybesid);
      if (!parallel_idx) {
        cswritetp = AppendKingTableHeader(king_flags, king_col_fid, king_col_sid, cswritetp);
      }
      if (unlikely(CollapsedSampleFmtidInitAlloc(sample_include, siip, grand_row_end_idx, king_col_fid, king_col_sid, &collapsed_sample_fmtids, &max_sample_fmtid_blen))) {
        goto CalcKing_ret_NOMEM;
      }
    }
    uint64_t king_table_filter_ct = 0;
    const uintptr_t cells_avail = bigstack_left() / (sizeof(int32_t) * homhom_needed_p4);
    const uint32_t pass_ct = CountTrianglePasses(grand_row_start_idx, grand_row_end_idx, 1, cells_avail);
    if (unlikely(!pass_ct)) {
      goto CalcKing_ret_NOMEM;
    }
    if (unlikely((pass_ct > 1) && (king_flags & kfKingMatrixSq))) {
      logerrputs("Insufficient memory for --make-king square output.  Try square0 or triangle\nshape instead.\n");
      goto CalcKing_ret_NOMEM;
    }
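    // Multipass memory model: all remaining workspace is treated as one
    // array of per-pair counter tuples (homhom_needed_p4 uint32s each).
    // When the full lower triangle doesn't fit, CountTrianglePasses()
    // partitions the rows into bands that do, and each pass rereads the
    // genotype data for its band.  Full square output is the one shape that
    // can't be appended band-by-band, since each row also needs
    // upper-triangle values computed in later passes; hence the pass_ct > 1
    // check above.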
    uint32_t row_end_idx = grand_row_start_idx;
    sparse_ctx.king_counts = R_CAST(uint32_t*, g_bigstack_base);
    dense_ctx.king_counts = sparse_ctx.king_counts;
    for (uint32_t pass_idx_p1 = 1; pass_idx_p1 <= pass_ct; ++pass_idx_p1) {
      const uint32_t row_start_idx = row_end_idx;
      row_end_idx = NextTrianglePass(row_start_idx, grand_row_end_idx, 1, cells_avail);
      TriangleLoadBalance(calc_thread_ct, row_start_idx, row_end_idx, 1, dense_ctx.thread_start);
      memcpy(cur_sample_include, sample_include, raw_sample_ctl * sizeof(intptr_t));
      // bugfix (20 Nov 2019): forgot that --parallel could cause the old
      // row_end_idx != grand_row_end_idx comparison to not work here
      if (row_end_idx != orig_sample_ct) {
        uint32_t sample_uidx_end = IdxToUidxBasic(sample_include, row_end_idx);
        ClearBitsNz(sample_uidx_end, raw_sample_ct, cur_sample_include);
      }
      FillCumulativePopcounts(cur_sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
      pgfip->block_base = main_loadbufs[0];  // needed after first pass
      // Update (9 Nov 2019): The one-time singleton/monomorphic scan has been
      // replaced with a more effective sparse-variant scan which happens on
      // every pass.
      sparse_ctx.sample_include = cur_sample_include;
      sparse_ctx.sample_include_cumulative_popcounts = sample_include_cumulative_popcounts;
      sparse_ctx.row_start_idx = row_start_idx;
      sparse_ctx.row_end_idx = row_end_idx;
      sparse_ctx.max_sparse_ct = KingMaxSparseCt(row_end_idx);
      logprintf("%s pass %u/%u: Scanning for rare variants... ", flagname, pass_idx_p1, pass_ct);
      fputs("0%", stdout);
      fflush(stdout);
      SetThreadFuncAndData(CalcKingSparseThread, &sparse_ctx, &tg);
      if (unlikely(SpawnThreads(&tg))) {
        goto CalcKing_ret_THREAD_CREATE_FAIL;
      }
      memcpy(variant_include, variant_include_orig, raw_variant_ctl * sizeof(intptr_t));

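      // Pipelined sparse scan: MultireadNonempty() prefetches the next
      // variant block while the CalcKingSparseThread workers process the
      // current one.  Each worker records the sparse variants it consumed in
      // thread_sparse_excludes[parity]; one block behind the readahead,
      // those bits are cleared from variant_include, so the dense pass below
      // only sees the variants that remain.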
      {
        const uint32_t read_block_sizel = sparse_read_block_size / kBitsPerWord;
        uint32_t prev_read_block_idx = 0;
        uint32_t read_block_idx = 0;
        uint32_t pct = 0;
        uint32_t next_print_variant_idx = variant_ct / 100;
        uint32_t parity = 0;
        for (uint32_t variant_idx = 0; ; ) {
          const uint32_t cur_block_size = MultireadNonempty(variant_include_orig, &tg, raw_variant_ct, sparse_read_block_size, pgfip, &read_block_idx, &reterr);
          if (unlikely(reterr)) {
            goto CalcKing_ret_PGR_FAIL;
          }
          JoinThreads(&tg);
          reterr = sparse_ctx.reterr;
          if (unlikely(reterr)) {
            goto CalcKing_ret_PGR_FAIL;
          }
          if (!IsLastBlock(&tg)) {
            sparse_ctx.cur_block_size = cur_block_size;
            ComputeUidxStartPartition(variant_include_orig, cur_block_size, calc_thread_ct, read_block_idx * sparse_read_block_size, sparse_ctx.read_variant_uidx_starts);
            PgrCopyBaseAndOffset(pgfip, calc_thread_ct, sparse_ctx.pgr_ptrs);
            if (variant_idx + cur_block_size == variant_ct) {
              DeclareLastThreadBlock(&tg);
            }
            SpawnThreads(&tg);
          }
          parity = 1 - parity;
          if (variant_idx) {
            uintptr_t* variant_include_update = &(variant_include[prev_read_block_idx * read_block_sizel]);
            for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
              BitvecInvmask(sparse_ctx.thread_sparse_excludes[parity][tidx], read_block_sizel, variant_include_update);
            }
            if (variant_idx == variant_ct) {
              break;
            }
            if (variant_idx >= next_print_variant_idx) {
              if (pct > 10) {
                putc_unlocked('\b', stdout);
              }
              pct = (variant_idx * 100LLU) / variant_ct;
              printf("\b\b%u%%", pct++);
              fflush(stdout);
              next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
            }
          }
          prev_read_block_idx = read_block_idx;
          ++read_block_idx;
          variant_idx += cur_block_size;
          pgfip->block_base = main_loadbufs[parity];
        }
        if (pct > 10) {
          putc_unlocked('\b', stdout);
        }
      }
      fputs("\b\b", stdout);
      logputs("done.\n");
      const uint32_t cur_variant_ct = PopcountWords(variant_include, raw_variant_ctl);
      uint32_t sparse_variant_ct = variant_ct - cur_variant_ct;
      logprintf("%u variant%s handled by initial scan (%u remaining).\n", sparse_variant_ct, (sparse_variant_ct == 1)? "" : "s", cur_variant_ct);
      uint32_t skip_ct = sparse_ctx.thread_skip_cts[0];
      const uint32_t vec_ct = DivUp(row_end_idx, kInt32PerVec);
      for (uint32_t tidx = 1; tidx != calc_thread_ct; ++tidx) {
        U32CastVecAdd(sparse_ctx.thread_singleton_het_cts[tidx], vec_ct, singleton_het_cts);
        U32CastVecAdd(sparse_ctx.thread_singleton_hom_cts[tidx], vec_ct, singleton_hom_cts);
        U32CastVecAdd(sparse_ctx.thread_singleton_missing_cts[tidx], vec_ct, singleton_missing_cts);
        skip_ct += sparse_ctx.thread_skip_cts[tidx];
      }
      sparse_variant_ct -= skip_ct;
      if (cur_variant_ct) {
        SetThreadFuncAndData(CalcKingDenseThread, &dense_ctx, &tg);
        const uint32_t row_end_idxaw = BitCtToAlignedWordCt(row_end_idx);
        const uint32_t row_end_idxaw2 = NypCtToAlignedWordCt(row_end_idx);
        if (row_end_idxaw % 2) {
          const uint32_t cur_king_bufsizew = kKingMultiplexWords * row_end_idx;
          uintptr_t* smaj_hom0_last = &(dense_ctx.smaj_hom[0][kKingMultiplexWords - 1]);
          uintptr_t* smaj_ref2het0_last = &(dense_ctx.smaj_ref2het[0][kKingMultiplexWords - 1]);
          uintptr_t* smaj_hom1_last = &(dense_ctx.smaj_hom[1][kKingMultiplexWords - 1]);
          uintptr_t* smaj_ref2het1_last = &(dense_ctx.smaj_ref2het[1][kKingMultiplexWords - 1]);
          for (uint32_t offset = 0; offset < cur_king_bufsizew; offset += kKingMultiplexWords) {
            smaj_hom0_last[offset] = 0;
            smaj_ref2het0_last[offset] = 0;
            smaj_hom1_last[offset] = 0;
            smaj_ref2het1_last[offset] = 0;
          }
        }
        uintptr_t variant_uidx_base = 0;
        uintptr_t cur_bits = variant_include[0];
        uint32_t variants_completed = 0;
        uint32_t parity = 0;
        const uint32_t sample_batch_ct_m1 = (row_end_idx - 1) / kPglBitTransposeBatch;
        // Similar to plink 1.9 --genome.  For each pair of samples S1-S2, we
        // need to determine counts of the following:
        //   * S1 hom-S2 opposite hom
        //   * het-het
        //   * S1 hom-S2 het
        //   * S2 hom-S1 het
        //   * sometimes S1 hom-S2 same hom
        //   * (nonmissing determined via subtraction)
        // We handle this as follows:
        //   1. set n=0, reader thread loads first kKingMultiplex variants and
        //      converts+transposes the data to a sample-major format suitable
        //      for multithreaded computation.
        //   2. spawn threads
        //
        //   3. increment n by 1
        //   4. load block n unless eof
        //   5. permit threads to continue to next block, unless eof
        //   6. goto step 3 unless eof
        //
        //   7. write results
        // Results are always reported in lower-triangular order, rather than
        // KING's upper-triangular order, since the former plays more nicely
        // with incremental addition of samples.
        PgrSampleSubsetIndex pssi;
        PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, simple_pgrp, &pssi);
        do {
          const uint32_t cur_block_size = MINV(cur_variant_ct - variants_completed, kKingMultiplex);
          uintptr_t* cur_smaj_hom = dense_ctx.smaj_hom[parity];
          uintptr_t* cur_smaj_ref2het = dense_ctx.smaj_ref2het[parity];
          // "block" = distance computation granularity, usually 1024 or 1536
          //           variants
          // "batch" = variant-major-to-sample-major transpose granularity,
          //           currently 512 variants
          uint32_t variant_batch_size = kPglBitTransposeBatch;
          uint32_t variant_batch_size_rounded_up = kPglBitTransposeBatch;
          const uint32_t write_batch_ct_m1 = (cur_block_size - 1) / kPglBitTransposeBatch;
          for (uint32_t write_batch_idx = 0; ; ++write_batch_idx) {
            if (write_batch_idx >= write_batch_ct_m1) {
              if (write_batch_idx > write_batch_ct_m1) {
                break;
              }
              variant_batch_size = ModNz(cur_block_size, kPglBitTransposeBatch);
              variant_batch_size_rounded_up = variant_batch_size;
              const uint32_t variant_batch_size_rem = variant_batch_size % kBitsPerWord;
              if (variant_batch_size_rem) {
                const uint32_t trailing_variant_ct = kBitsPerWord - variant_batch_size_rem;
                variant_batch_size_rounded_up += trailing_variant_ct;
                ZeroWArr(trailing_variant_ct * row_end_idxaw, &(splitbuf_hom[variant_batch_size * row_end_idxaw]));
                ZeroWArr(trailing_variant_ct * row_end_idxaw, &(splitbuf_ref2het[variant_batch_size * row_end_idxaw]));
              }
            }
            uintptr_t* hom_iter = splitbuf_hom;
            uintptr_t* ref2het_iter = splitbuf_ref2het;
            for (uint32_t uii = 0; uii != variant_batch_size; ++uii) {
              const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
              // Model does not cleanly generalize to multiallelic variants
              // (unless there's something I overlooked, which is quite
              // possible).
              // Thought about using major allele counts in that case, but that
              // sacrifices a really nice property of this method: estimated
              // relationship coefficient between each pair of samples is
              // independent of estimated allele frequencies.  And the accuracy
              // improvement we'd get in return is microscopic.  So we stick to
              // REF/ALT allele counts instead.
              reterr = PgrGet(cur_sample_include, pssi, row_end_idx, variant_uidx, simple_pgrp, loadbuf);
              if (unlikely(reterr)) {
                goto CalcKing_ret_PGR_FAIL;
              }
              SetTrailingNyps(row_end_idx, loadbuf);
              SplitHomRef2hetUnsafeW(loadbuf, row_end_idxaw2, hom_iter, ref2het_iter);
              hom_iter = &(hom_iter[row_end_idxaw]);
              ref2het_iter = &(ref2het_iter[row_end_idxaw]);
            }
            // uintptr_t* read_iter = loadbuf;
            uintptr_t* write_hom_iter = &(cur_smaj_hom[write_batch_idx * kPglBitTransposeWords]);
            uintptr_t* write_ref2het_iter = &(cur_smaj_ref2het[write_batch_idx * kPglBitTransposeWords]);
            uint32_t write_batch_size = kPglBitTransposeBatch;
            for (uint32_t sample_batch_idx = 0; ; ++sample_batch_idx) {
              if (sample_batch_idx >= sample_batch_ct_m1) {
                if (sample_batch_idx > sample_batch_ct_m1) {
                  break;
                }
                write_batch_size = ModNz(row_end_idx, kPglBitTransposeBatch);
              }
              // bugfix: read_batch_size must be rounded up to word boundary,
              // since we want to one-out instead of zero-out the trailing bits
              //
              // bugfix: if we always use kPglBitTransposeBatch instead of
              // variant_batch_size_rounded_up, we read/write past the
              // kKingMultiplex limit and clobber the first variants of the
              // next sample with garbage.
              TransposeBitblock(&(splitbuf_hom[sample_batch_idx * kPglBitTransposeWords]), row_end_idxaw, kKingMultiplexWords, variant_batch_size_rounded_up, write_batch_size, write_hom_iter, vecaligned_buf);
              TransposeBitblock(&(splitbuf_ref2het[sample_batch_idx * kPglBitTransposeWords]), row_end_idxaw, kKingMultiplexWords, variant_batch_size_rounded_up, write_batch_size, write_ref2het_iter, vecaligned_buf);
              write_hom_iter = &(write_hom_iter[kKingMultiplex * kPglBitTransposeWords]);
              write_ref2het_iter = &(write_ref2het_iter[kKingMultiplex * kPglBitTransposeWords]);
            }
          }
          const uint32_t cur_block_sizew = BitCtToWordCt(cur_block_size);
          if (cur_block_sizew < kKingMultiplexWords) {
            uintptr_t* write_hom_iter = &(cur_smaj_hom[cur_block_sizew]);
            uintptr_t* write_ref2het_iter = &(cur_smaj_ref2het[cur_block_sizew]);
            const uint32_t write_word_ct = kKingMultiplexWords - cur_block_sizew;
            for (uint32_t sample_idx = 0; sample_idx != row_end_idx; ++sample_idx) {
              ZeroWArr(write_word_ct, write_hom_iter);
              ZeroWArr(write_word_ct, write_ref2het_iter);
              write_hom_iter = &(write_hom_iter[kKingMultiplexWords]);
              write_ref2het_iter = &(write_ref2het_iter[kKingMultiplexWords]);
            }
          }
          if (variants_completed) {
            JoinThreads(&tg);
            // CalcKingDenseThread() never errors out
          }
          // this update must occur after JoinThreads() call
          if (variants_completed + cur_block_size == cur_variant_ct) {
            DeclareLastThreadBlock(&tg);
          }
          if (unlikely(SpawnThreads(&tg))) {
            goto CalcKing_ret_THREAD_CREATE_FAIL;
          }
          printf("\r%s pass %u/%u: %u variants complete.", flagname, pass_idx_p1, pass_ct, variants_completed);
          fflush(stdout);
          variants_completed += cur_block_size;
          parity = 1 - parity;
        } while (!IsLastBlock(&tg));
        JoinThreads(&tg);
      }
      if (matrix_shape || (king_flags & kfKingColAll)) {
        printf("\r%s pass %u/%u: Writing...                   \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", flagname, pass_idx_p1, pass_ct);
        fflush(stdout);
        // allow simultaneous --make-king + --make-king-table
        if (matrix_shape) {
          if (!(king_flags & (kfKingMatrixBin | kfKingMatrixBin4))) {
            const uint32_t is_squarex = king_flags & (kfKingMatrixSq | kfKingMatrixSq0);
            const uint32_t is_square0 = king_flags & kfKingMatrixSq0;
            uint32_t* results_iter = dense_ctx.king_counts;
            uint32_t sample_idx1 = row_start_idx;
            if (is_squarex && (!parallel_idx) && (pass_idx_p1 == 1)) {
              // dump "empty" first row
              sample_idx1 = 0;
            }
            for (; sample_idx1 != row_end_idx; ++sample_idx1) {
              const uint32_t singleton_het1_ct = singleton_het_cts[sample_idx1];
              const uint32_t singleton_hom1_ct = singleton_hom_cts[sample_idx1];
              for (uint32_t sample_idx2 = 0; sample_idx2 < sample_idx1; ++sample_idx2) {
                const double kinship_coeff = ComputeKinship(results_iter, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2], singleton_het1_ct, singleton_hom1_ct);
                if (kinship_table && (kinship_coeff > king_cutoff)) {
                  SetBit(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
                  SetBit(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
                }
                cswritep = dtoa_g(kinship_coeff, cswritep);
                *cswritep++ = '\t';
                results_iter = &(results_iter[homhom_needed_p4]);
              }
              if (is_squarex) {
                cswritep = strcpya_k(cswritep, "0.5");
                if (is_square0) {
                  // (roughly same performance as creating a tab-zero constant
                  // buffer in advance)
                  const uint32_t zcount = sample_ct - sample_idx1 - 1;
                  const uint32_t wct = DivUp(zcount, kBytesPerWord / 2);
                  // assumes little-endian
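                  // (0x3009 is "\t0" as a 16-bit unit on little-endian
                  // hardware: low byte 0x09 == '\t', high byte 0x30 == '0'.
                  // Multiplying by kMask0001 replicates it across the word,
                  // so each word written below spells "\t0\t0...".)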
                  const uintptr_t tabzero_word = 0x3009 * kMask0001;
#ifdef __arm__
#  error "Unaligned accesses in CalcKing()."
#endif
                  uintptr_t* writep_alias = R_CAST(uintptr_t*, cswritep);
                  for (uintptr_t widx = 0; widx != wct; ++widx) {
                    *writep_alias++ = tabzero_word;
                  }
                  cswritep = &(cswritep[zcount * 2]);
                } else {
                  const uint32_t* results_iter2 = &(results_iter[sample_idx1 * homhom_needed_p4]);
                  // 0
                  // 1  2
                  // 3  4  5
                  // 6  7  8  9
                  // 10 11 12 13 14

                  // sample_idx1 = 0: [0] 0 1 3 6 10...
                  // sample_idx1 = 1: [1] 2 4 7 11...
                  // sample_idx1 = 2: [3] 5 8 12...
                  // sample_idx1 = 3: [6] 9 13...
                  for (uint32_t sample_idx2 = sample_idx1 + 1; sample_idx2 != sample_ct; ++sample_idx2) {
                    *cswritep++ = '\t';
                    cswritep = dtoa_g(ComputeKinship(results_iter2, singleton_het1_ct, singleton_hom1_ct, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2]), cswritep);
                    results_iter2 = &(results_iter2[sample_idx2 * homhom_needed_p4]);
                  }
                }
                ++cswritep;
              }
              DecrAppendBinaryEoln(&cswritep);
              if (unlikely(Cswrite(&css, &cswritep))) {
                goto CalcKing_ret_WRITE_FAIL;
              }
            }
          } else {
            // binary matrix output
            // er, probably want to revise this so there's less duplicated code
            // from text matrix output...
            const uint32_t is_squarex = king_flags & (kfKingMatrixSq | kfKingMatrixSq0);
            const uint32_t is_square0 = king_flags & kfKingMatrixSq0;
            uint32_t* results_iter = dense_ctx.king_counts;
            uint32_t sample_idx1 = row_start_idx;
            if (is_squarex && (!parallel_idx)) {
              sample_idx1 = 0;
            }
            if (king_flags & kfKingMatrixBin4) {
              float* write_row = R_CAST(float*, numbuf);
              uintptr_t row_byte_ct = sample_ct * sizeof(float);
              for (; sample_idx1 != row_end_idx; ++sample_idx1) {
                const uint32_t singleton_het1_ct = singleton_het_cts[sample_idx1];
                const uint32_t singleton_hom1_ct = singleton_hom_cts[sample_idx1];
                for (uint32_t sample_idx2 = 0; sample_idx2 != sample_idx1; ++sample_idx2) {
                  const double kinship_coeff = ComputeKinship(results_iter, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2], singleton_het1_ct, singleton_hom1_ct);
                  if (kinship_table && (kinship_coeff > king_cutoff)) {
                    SetBit(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
                    SetBit(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
                  }
                  write_row[sample_idx2] = S_CAST(float, kinship_coeff);
                  results_iter = &(results_iter[homhom_needed_p4]);
                }
                if (is_squarex) {
                  write_row[sample_idx1] = 0.5f;
                  if (is_square0) {
                    const uint32_t right_fill_idx = sample_idx1 + 1;
                    ZeroFArr(sample_ct - right_fill_idx, &(write_row[right_fill_idx]));
                  } else {
                    const uint32_t* results_iter2 = &(results_iter[sample_idx1 * homhom_needed_p4]);
                    for (uint32_t sample_idx2 = sample_idx1 + 1; sample_idx2 != sample_ct; ++sample_idx2) {
                      write_row[sample_idx2] = S_CAST(float, ComputeKinship(results_iter2, singleton_het1_ct, singleton_hom1_ct, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2]));
                      results_iter2 = &(results_iter2[sample_idx2 * homhom_needed_p4]);
                    }
                  }
                } else {
                  row_byte_ct = sample_idx1 * sizeof(float);
                }
                if (unlikely(fwrite_checked(write_row, row_byte_ct, outfile))) {
                  goto CalcKing_ret_WRITE_FAIL;
                }
              }
            } else {
              double* write_row = R_CAST(double*, numbuf);
              uintptr_t row_byte_ct = sample_ct * sizeof(double);
              for (; sample_idx1 != row_end_idx; ++sample_idx1) {
                const uint32_t singleton_het1_ct = singleton_het_cts[sample_idx1];
                const uint32_t singleton_hom1_ct = singleton_hom_cts[sample_idx1];
                for (uint32_t sample_idx2 = 0; sample_idx2 != sample_idx1; ++sample_idx2) {
                  const double kinship_coeff = ComputeKinship(results_iter, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2], singleton_het1_ct, singleton_hom1_ct);
                  if (kinship_table && (kinship_coeff > king_cutoff)) {
                    SetBit(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
                    SetBit(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
                  }
                  write_row[sample_idx2] = kinship_coeff;
                  results_iter = &(results_iter[homhom_needed_p4]);
                }
                if (is_squarex) {
                  write_row[sample_idx1] = 0.5;
                  if (is_square0) {
                    const uint32_t right_fill_idx = sample_idx1 + 1;
                    ZeroDArr(sample_ct - right_fill_idx, &(write_row[right_fill_idx]));
                  } else {
                    const uint32_t* results_iter2 = &(results_iter[sample_idx1 * homhom_needed_p4]);
                    for (uint32_t sample_idx2 = sample_idx1 + 1; sample_idx2 != sample_ct; ++sample_idx2) {
                      write_row[sample_idx2] = ComputeKinship(results_iter2, singleton_het1_ct, singleton_hom1_ct, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2]);
                      results_iter2 = &(results_iter2[sample_idx2 * homhom_needed_p4]);
                    }
                  }
                } else {
                  row_byte_ct = sample_idx1 * sizeof(double);
                }
                if (unlikely(fwrite_checked(write_row, row_byte_ct, outfile))) {
                  goto CalcKing_ret_WRITE_FAIL;
                }
              }
            }
          }
        }
        if (king_flags & kfKingColAll) {
          uintptr_t* kinship_table_backup = nullptr;
          if (matrix_shape) {
            // We already updated the table; don't do it again.
            kinship_table_backup = kinship_table;
            kinship_table = nullptr;
          }
          const uint32_t king_col_id = king_flags & kfKingColId;
          const uint32_t king_col_nsnp = king_flags & kfKingColNsnp;
          const uint32_t king_col_hethet = king_flags & kfKingColHethet;
          const uint32_t king_col_ibs0 = king_flags & kfKingColIbs0;
          const uint32_t king_col_ibs1 = king_flags & kfKingColIbs1;
          const uint32_t king_col_kinship = king_flags & kfKingColKinship;
          const uint32_t report_counts = king_flags & kfKingCounts;
          uint32_t* results_iter = dense_ctx.king_counts;
          double nonmiss_recip = 0.0;
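          // This loop repeats ComputeKinship()'s arithmetic inline instead
          // of calling it, since the intermediate counts (ibs0_ct,
          // hethet_ct, ...) are also needed for the optional output columns
          // below.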
1949           for (uint32_t sample_idx1 = row_start_idx; sample_idx1 != row_end_idx; ++sample_idx1) {
1950             const char* sample_fmtid1 = &(collapsed_sample_fmtids[max_sample_fmtid_blen * sample_idx1]);
1951             const uint32_t singleton_het1_ct = singleton_het_cts[sample_idx1];
1952             const uint32_t singleton_hom1_ct = singleton_hom_cts[sample_idx1];
1953             const uint32_t sample_fmtid1_slen = strlen(sample_fmtid1);
1954             for (uint32_t sample_idx2 = 0; sample_idx2 != sample_idx1; ++sample_idx2, results_iter = &(results_iter[homhom_needed_p4])) {
1955               const uint32_t singleton_het2_ct = singleton_het_cts[sample_idx2];
1956               const uint32_t singleton_hom2_ct = singleton_hom_cts[sample_idx2];
1957               const uint32_t ibs0_ct = results_iter[kKingOffsetIbs0] + singleton_hom2_ct + singleton_hom1_ct;
1958               const uint32_t hethet_ct = results_iter[kKingOffsetHethet];
1959               // '2' here refers to the larger index, so this is swapped
1960               const uint32_t het2hom1_ct = results_iter[kKingOffsetHet2Hom1] + singleton_het1_ct;
1961               const uint32_t het1hom2_ct = results_iter[kKingOffsetHet1Hom2] + singleton_het2_ct;
1962               const intptr_t smaller_het_ct = hethet_ct + MINV(het1hom2_ct, het2hom1_ct);
1963               const double kinship_coeff = 0.5 - (S_CAST(double, 4 * S_CAST(intptr_t, ibs0_ct) + het1hom2_ct + het2hom1_ct) / S_CAST(double, 4 * smaller_het_ct));
1964               if (kinship_table && (kinship_coeff > king_cutoff)) {
1965                 SetBit(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
1966                 SetBit(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
1967               }
1968               // edge case fix (18 Nov 2017): kinship_coeff can be -inf when
1969               // smaller_het_ct is zero.  Don't filter those lines out when
1970               // --king-table-filter wasn't specified.
1971               if ((king_table_filter != -DBL_MAX) && (kinship_coeff < king_table_filter)) {
1972                 ++king_table_filter_ct;
1973                 continue;
1974               }
1975               if (king_col_id) {
1976                 cswritetp = memcpyax(cswritetp, sample_fmtid1, sample_fmtid1_slen, '\t');
1977                 cswritetp = strcpyax(cswritetp, &(collapsed_sample_fmtids[max_sample_fmtid_blen * sample_idx2]), '\t');
1978               }
1979               if (homhom_needed_p4 == 5) {
1980                 const uint32_t homhom_ct = results_iter[kKingOffsetHomhom] + sparse_variant_ct - singleton_het2_ct - singleton_missing_cts[sample_idx2] - singleton_het1_ct - singleton_missing_cts[sample_idx1];
1981                 const uint32_t nonmiss_ct = het1hom2_ct + het2hom1_ct + homhom_ct + hethet_ct;
1982                 if (king_col_nsnp) {
1983                   cswritetp = u32toa_x(nonmiss_ct, '\t', cswritetp);
1984                 }
1985                 if (!report_counts) {
1986                   nonmiss_recip = 1.0 / u31tod(nonmiss_ct);
1987                 }
1988               }
1989               if (king_col_hethet) {
1990                 if (report_counts) {
1991                   cswritetp = u32toa(hethet_ct, cswritetp);
1992                 } else {
1993                   cswritetp = dtoa_g(nonmiss_recip * u31tod(hethet_ct), cswritetp);
1994                 }
1995                 *cswritetp++ = '\t';
1996               }
1997               if (king_col_ibs0) {
1998                 if (report_counts) {
1999                   cswritetp = u32toa(ibs0_ct, cswritetp);
2000                 } else {
2001                   cswritetp = dtoa_g(nonmiss_recip * u31tod(ibs0_ct), cswritetp);
2002                 }
2003                 *cswritetp++ = '\t';
2004               }
2005               if (king_col_ibs1) {
2006                 if (report_counts) {
2007                   cswritetp = u32toa_x(het1hom2_ct, '\t', cswritetp);
2008                   cswritetp = u32toa(het2hom1_ct, cswritetp);
2009                 } else {
2010                   cswritetp = dtoa_g(nonmiss_recip * u31tod(het1hom2_ct), cswritetp);
2011                   *cswritetp++ = '\t';
2012                   cswritetp = dtoa_g(nonmiss_recip * u31tod(het2hom1_ct), cswritetp);
2013                 }
2014                 *cswritetp++ = '\t';
2015               }
2016               if (king_col_kinship) {
2017                 cswritetp = dtoa_g(kinship_coeff, cswritetp);
2018                 ++cswritetp;
2019               }
2020               DecrAppendBinaryEoln(&cswritetp);
2021               if (unlikely(Cswrite(&csst, &cswritetp))) {
2022                 goto CalcKing_ret_WRITE_FAIL;
2023               }
2024             }
2025           }
2026 
2027           if (matrix_shape) {
2028             kinship_table = kinship_table_backup;
2029           }
2030         }
2031       } else {
2032         printf("\r%s pass %u/%u: Condensing...                \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", flagname, pass_idx_p1, pass_ct);
2033         fflush(stdout);
2034         uint32_t* results_iter = dense_ctx.king_counts;
2035         for (uint32_t sample_idx1 = row_start_idx; sample_idx1 != row_end_idx; ++sample_idx1) {
2036           const uint32_t singleton_het1_ct = singleton_het_cts[sample_idx1];
2037           const uint32_t singleton_hom1_ct = singleton_hom_cts[sample_idx1];
2038           for (uint32_t sample_idx2 = 0; sample_idx2 != sample_idx1; ++sample_idx2) {
2039             const double kinship_coeff = ComputeKinship(results_iter, singleton_het_cts[sample_idx2], singleton_hom_cts[sample_idx2], singleton_het1_ct, singleton_hom1_ct);
2040             if (kinship_coeff > king_cutoff) {
2041               SetBit(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
2042               SetBit(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
2043             }
2044             results_iter = &(results_iter[homhom_needed_p4]);
2045           }
2046         }
2047       }
2048       fputs(" done.\n", stdout);
2049     }
2050     logprintf("%s: %u variant%s processed.\n", flagname, variant_ct, (variant_ct == 1)? "" : "s");
2051     // end-of-loop operations
2052     if (matrix_shape) {
2053       if (!(king_flags & (kfKingMatrixBin | kfKingMatrixBin4))) {
2054         if (unlikely(CswriteCloseNull(&css, cswritep))) {
2055           goto CalcKing_ret_WRITE_FAIL;
2056         }
2057       } else {
2058         if (unlikely(fclose_null(&outfile))) {
2059           goto CalcKing_ret_WRITE_FAIL;
2060         }
2061       }
2062       // Necessary to regenerate filename since it may have been overwritten by
2063       // --make-king-table.
2064       SetKingMatrixFname(king_flags, parallel_idx, parallel_tot, outname_end);
2065 
2066       char* write_iter = strcpya_k(g_logbuf, "Results written to ");
2067       const uint32_t outname_base_slen = S_CAST(uintptr_t, outname_end - outname);
2068       write_iter = memcpya(write_iter, outname, outname_base_slen + strlen(outname_end));
2069       write_iter = strcpya_k(write_iter, " and ");
2070       strcpy_k(&(outname_end[5]), ".id");
2071       write_iter = memcpya(write_iter, outname, outname_base_slen + 8);
2072       strcpy_k(write_iter, " .\n");
2073       WordWrapB(0);
2074       logputsb();
2075       reterr = WriteSampleIds(sample_include, siip, outname, sample_ct);
2076       if (unlikely(reterr)) {
2077         goto CalcKing_ret_1;
2078       }
2079     }
2080     if (king_flags & kfKingColAll) {
2081       if (unlikely(CswriteCloseNull(&csst, cswritetp))) {
2082         goto CalcKing_ret_WRITE_FAIL;
2083       }
2084       SetKingTableFname(king_flags, parallel_idx, parallel_tot, outname_end);
2085       char* write_iter = strcpya_k(g_logbuf, "Results written to ");
2086       const uint32_t outname_base_slen = S_CAST(uintptr_t, outname_end - outname);
2087       write_iter = memcpya(write_iter, outname, outname_base_slen + strlen(outname_end));
2088       if ((!parallel_idx) && (!(king_flags & kfKingColId))) {
2089         write_iter = strcpya_k(write_iter, " and ");
2090         strcpy_k(&(outname_end[5]), ".id");
2091         write_iter = memcpya(write_iter, outname, outname_base_slen + 8);
2092         strcpy_k(write_iter, " .\n");
2093         WordWrapB(0);
2094         logputsb();
2095         reterr = WriteSampleIds(sample_include, siip, outname, sample_ct);
2096         if (unlikely(reterr)) {
2097           goto CalcKing_ret_1;
2098         }
2099       } else {
2100         strcpy_k(write_iter, " .\n");
2101         WordWrapB(0);
2102         logputsb();
2103       }
2104       if (king_table_filter != -DBL_MAX) {
2105         const uint64_t grand_tot_cells = (S_CAST(uint64_t, grand_row_end_idx) * (grand_row_end_idx - 1) - S_CAST(uint64_t, grand_row_start_idx) * (grand_row_start_idx - 1)) / 2;
2106         const uint64_t reported_ct = grand_tot_cells - king_table_filter_ct;
2107         logprintf("--king-table-filter: %" PRIu64 " relationship%s reported (%" PRIu64 " filtered out).\n", reported_ct, (reported_ct == 1)? "" : "s", king_table_filter_ct);
2108       }
2109     }
2110     if (kinship_table) {
2111       BigstackReset(sample_include_cumulative_popcounts);
2112       *sample_ct_ptr = sample_ct;
2113       if (unlikely(KinshipPruneDestructive(kinship_table, sample_include, sample_ct_ptr))) {
2114         goto CalcKing_ret_NOMEM;
2115       }
2116     }
2117   }
2118   while (0) {
2119   CalcKing_ret_NOMEM:
2120     reterr = kPglRetNomem;
2121     break;
2122   CalcKing_ret_OPEN_FAIL:
2123     reterr = kPglRetOpenFail;
2124     break;
2125   CalcKing_ret_PGR_FAIL:
2126     PgenErrPrintN(reterr);
2127     break;
2128   CalcKing_ret_WRITE_FAIL:
2129     reterr = kPglRetWriteFail;
2130     break;
2131   CalcKing_ret_INCONSISTENT_INPUT:
2132     reterr = kPglRetInconsistentInput;
2133     break;
2134   CalcKing_ret_THREAD_CREATE_FAIL:
2135     reterr = kPglRetThreadCreateFail;
2136     break;
2137   CalcKing_ret_DEGENERATE_DATA:
2138     reterr = kPglRetDegenerateData;
2139     break;
2140   }
2141  CalcKing_ret_1:
2142   CleanupThreads(&tg);
2143   CswriteCloseCond(&csst, cswritetp);
2144   CswriteCloseCond(&css, cswritep);
2145   fclose_cond(outfile);
2146   BigstackReset(bigstack_mark);
2147   return reterr;
2148 }
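
// The IncrKingSubset kernels below accumulate, for each preloaded sample
// pair, the per-variant counts KING-robust needs (IBS0, het-het, het2&hom1,
// het1&hom2, plus hom-hom when denominators must be reported), writing them
// into king_counts[] in that order; see the kKingOffset* lookups in the
// writer loop near the end of CalcKingTableSubset().  USE_SSE42 builds take
// the PopcountWord() path, since hardware POPCNT is available there, while
// other builds use the vectorized SWAR popcount variant.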
2149 
2150 #ifdef USE_SSE42
2151 void IncrKingSubset(const uint32_t* loaded_sample_idx_pairs, const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts) {
2152   const uint32_t* sample_idx_pair_iter = &(loaded_sample_idx_pairs[(2 * k1LU) * start_idx]);
2153   const uint32_t* sample_idx_pair_stop = &(loaded_sample_idx_pairs[(2 * k1LU) * end_idx]);
2154   uint32_t* king_counts_iter = &(king_counts[(4 * k1LU) * start_idx]);
2155   while (sample_idx_pair_iter != sample_idx_pair_stop) {
2156     // technically overflows for huge sample_ct
2157     const uint32_t first_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2158     const uint32_t second_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2159     const uintptr_t* first_hom = &(smaj_hom[first_offset]);
2160     const uintptr_t* first_ref2het = &(smaj_ref2het[first_offset]);
2161     const uintptr_t* second_hom = &(smaj_hom[second_offset]);
2162     const uintptr_t* second_ref2het = &(smaj_ref2het[second_offset]);
2163     uint32_t acc_ibs0 = 0;
2164     uint32_t acc_hethet = 0;
2165     uint32_t acc_het2hom1 = 0;
2166     uint32_t acc_het1hom2 = 0;
2167     for (uint32_t widx = 0; widx != kKingMultiplexWords; ++widx) {
2168       const uintptr_t hom1 = first_hom[widx];
2169       const uintptr_t hom2 = second_hom[widx];
2170       const uintptr_t ref2het1 = first_ref2het[widx];
2171       const uintptr_t ref2het2 = second_ref2het[widx];
2172       const uintptr_t homhom = hom1 & hom2;
2173       const uintptr_t het1 = ref2het1 & (~hom1);
2174       const uintptr_t het2 = ref2het2 & (~hom2);
2175       acc_ibs0 += PopcountWord((ref2het1 ^ ref2het2) & homhom);
2176       acc_hethet += PopcountWord(het1 & het2);
2177       acc_het2hom1 += PopcountWord(hom1 & het2);
2178       acc_het1hom2 += PopcountWord(hom2 & het1);
2179     }
2180     *king_counts_iter++ += acc_ibs0;
2181     *king_counts_iter++ += acc_hethet;
2182     *king_counts_iter++ += acc_het2hom1;
2183     *king_counts_iter++ += acc_het1hom2;
2184   }
2185 }
2186 
2187 void IncrKingSubsetHomhom(const uint32_t* loaded_sample_idx_pairs, const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts) {
2188   const uint32_t* sample_idx_pair_iter = &(loaded_sample_idx_pairs[(2 * k1LU) * start_idx]);
2189   const uint32_t* sample_idx_pair_stop = &(loaded_sample_idx_pairs[(2 * k1LU) * end_idx]);
2190   uint32_t* king_counts_iter = &(king_counts[(5 * k1LU) * start_idx]);
2191   while (sample_idx_pair_iter != sample_idx_pair_stop) {
2192     // technically overflows for huge sample_ct
2193     const uint32_t first_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2194     const uint32_t second_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2195     const uintptr_t* first_hom = &(smaj_hom[first_offset]);
2196     const uintptr_t* first_ref2het = &(smaj_ref2het[first_offset]);
2197     const uintptr_t* second_hom = &(smaj_hom[second_offset]);
2198     const uintptr_t* second_ref2het = &(smaj_ref2het[second_offset]);
2199     uint32_t acc_homhom = 0;
2200     uint32_t acc_ibs0 = 0;
2201     uint32_t acc_hethet = 0;
2202     uint32_t acc_het2hom1 = 0;
2203     uint32_t acc_het1hom2 = 0;
2204     for (uint32_t widx = 0; widx != kKingMultiplexWords; ++widx) {
2205       const uintptr_t hom1 = first_hom[widx];
2206       const uintptr_t hom2 = second_hom[widx];
2207       const uintptr_t ref2het1 = first_ref2het[widx];
2208       const uintptr_t ref2het2 = second_ref2het[widx];
2209       const uintptr_t homhom = hom1 & hom2;
2210       const uintptr_t het1 = ref2het1 & (~hom1);
2211       const uintptr_t het2 = ref2het2 & (~hom2);
2212       acc_homhom += PopcountWord(homhom);
2213       acc_ibs0 += PopcountWord((ref2het1 ^ ref2het2) & homhom);
2214       acc_hethet += PopcountWord(het1 & het2);
2215       acc_het2hom1 += PopcountWord(hom1 & het2);
2216       acc_het1hom2 += PopcountWord(hom2 & het1);
2217     }
2218     *king_counts_iter++ += acc_ibs0;
2219     *king_counts_iter++ += acc_hethet;
2220     *king_counts_iter++ += acc_het2hom1;
2221     *king_counts_iter++ += acc_het1hom2;
2222     *king_counts_iter++ += acc_homhom;
2223   }
2224 }
2225 #else
2226 void IncrKingSubset(const uint32_t* loaded_sample_idx_pairs, const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts) {
2227   const VecW m1 = VCONST_W(kMask5555);
2228   const VecW m2 = VCONST_W(kMask3333);
2229   const VecW m4 = VCONST_W(kMask0F0F);
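  // No hardware popcount here, so this is the standard SWAR reduction: the
  // m1/m2/m4 steps convert match bits into 2-bit, then 4-bit, then 8-bit
  // per-lane counts (for one word x, the first step is
  // x - ((x >> 1) & kMask5555), leaving each 2-bit field with a value in
  // 0..2).  Three vectors are folded together at the 4-bit stage (each 4-bit
  // field then holds at most 12), and kKingMultiplex is sized so the
  // byte-lane accumulators cannot overflow before the final m8 fold and
  // UniVecHsum16() horizontal sum.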
2230   const uint32_t* sample_idx_pair_iter = &(loaded_sample_idx_pairs[(2 * k1LU) * start_idx]);
2231   const uint32_t* sample_idx_pair_stop = &(loaded_sample_idx_pairs[(2 * k1LU) * end_idx]);
2232   uint32_t* king_counts_iter = &(king_counts[(4 * k1LU) * start_idx]);
2233   while (sample_idx_pair_iter != sample_idx_pair_stop) {
2234     // technically overflows for huge sample_ct
2235     const uint32_t first_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2236     const uint32_t second_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2237     const VecW* first_hom = R_CAST(const VecW*, &(smaj_hom[first_offset]));
2238     const VecW* first_ref2het = R_CAST(const VecW*, &(smaj_ref2het[first_offset]));
2239     const VecW* second_hom = R_CAST(const VecW*, &(smaj_hom[second_offset]));
2240     const VecW* second_ref2het = R_CAST(const VecW*, &(smaj_ref2het[second_offset]));
2241     UniVec acc_ibs0;
2242     UniVec acc_hethet;
2243     UniVec acc_het2hom1;
2244     UniVec acc_het1hom2;
2245     acc_ibs0.vw = vecw_setzero();
2246     acc_hethet.vw = vecw_setzero();
2247     acc_het2hom1.vw = vecw_setzero();
2248     acc_het1hom2.vw = vecw_setzero();
2249     for (uint32_t vec_idx = 0; vec_idx < kKingMultiplexVecs; vec_idx += 3) {
2250       VecW hom1 = first_hom[vec_idx];
2251       VecW hom2 = second_hom[vec_idx];
2252       VecW ref2het1 = first_ref2het[vec_idx];
2253       VecW ref2het2 = second_ref2het[vec_idx];
2254       VecW het1 = vecw_and_notfirst(hom1, ref2het1);
2255       VecW het2 = vecw_and_notfirst(hom2, ref2het2);
2256       VecW agg_ibs0 = (ref2het1 ^ ref2het2) & (hom1 & hom2);
2257       VecW agg_hethet = het1 & het2;
2258       VecW agg_het2hom1 = hom1 & het2;
2259       VecW agg_het1hom2 = hom2 & het1;
2260       agg_ibs0 = agg_ibs0 - (vecw_srli(agg_ibs0, 1) & m1);
2261       agg_hethet = agg_hethet - (vecw_srli(agg_hethet, 1) & m1);
2262       agg_het2hom1 = agg_het2hom1 - (vecw_srli(agg_het2hom1, 1) & m1);
2263       agg_het1hom2 = agg_het1hom2 - (vecw_srli(agg_het1hom2, 1) & m1);
2264       agg_ibs0 = (agg_ibs0 & m2) + (vecw_srli(agg_ibs0, 2) & m2);
2265       agg_hethet = (agg_hethet & m2) + (vecw_srli(agg_hethet, 2) & m2);
2266       agg_het2hom1 = (agg_het2hom1 & m2) + (vecw_srli(agg_het2hom1, 2) & m2);
2267       agg_het1hom2 = (agg_het1hom2 & m2) + (vecw_srli(agg_het1hom2, 2) & m2);
2268 
2269       for (uint32_t offset = 1; offset != 3; ++offset) {
2270         hom1 = first_hom[vec_idx + offset];
2271         hom2 = second_hom[vec_idx + offset];
2272         ref2het1 = first_ref2het[vec_idx + offset];
2273         ref2het2 = second_ref2het[vec_idx + offset];
2274         het1 = vecw_and_notfirst(hom1, ref2het1);
2275         het2 = vecw_and_notfirst(hom2, ref2het2);
2276         VecW cur_ibs0 = (ref2het1 ^ ref2het2) & (hom1 & hom2);
2277         VecW cur_hethet = het1 & het2;
2278         VecW cur_het2hom1 = hom1 & het2;
2279         VecW cur_het1hom2 = hom2 & het1;
2280         cur_ibs0 = cur_ibs0 - (vecw_srli(cur_ibs0, 1) & m1);
2281         cur_hethet = cur_hethet - (vecw_srli(cur_hethet, 1) & m1);
2282         cur_het2hom1 = cur_het2hom1 - (vecw_srli(cur_het2hom1, 1) & m1);
2283         cur_het1hom2 = cur_het1hom2 - (vecw_srli(cur_het1hom2, 1) & m1);
2284         agg_ibs0 += (cur_ibs0 & m2) + (vecw_srli(cur_ibs0, 2) & m2);
2285         agg_hethet += (cur_hethet & m2) + (vecw_srli(cur_hethet, 2) & m2);
2286         agg_het2hom1 += (cur_het2hom1 & m2) + (vecw_srli(cur_het2hom1, 2) & m2);
2287         agg_het1hom2 += (cur_het1hom2 & m2) + (vecw_srli(cur_het1hom2, 2) & m2);
2288       }
2289       acc_ibs0.vw = acc_ibs0.vw + (agg_ibs0 & m4) + (vecw_srli(agg_ibs0, 4) & m4);
2290       acc_hethet.vw = acc_hethet.vw + (agg_hethet & m4) + (vecw_srli(agg_hethet, 4) & m4);
2291       acc_het2hom1.vw = acc_het2hom1.vw + (agg_het2hom1 & m4) + (vecw_srli(agg_het2hom1, 4) & m4);
2292       acc_het1hom2.vw = acc_het1hom2.vw + (agg_het1hom2 & m4) + (vecw_srli(agg_het1hom2, 4) & m4);
2293     }
2294     const VecW m8 = VCONST_W(kMask00FF);
2295     acc_ibs0.vw = (acc_ibs0.vw & m8) + (vecw_srli(acc_ibs0.vw, 8) & m8);
2296     acc_hethet.vw = (acc_hethet.vw & m8) + (vecw_srli(acc_hethet.vw, 8) & m8);
2297     acc_het2hom1.vw = (acc_het2hom1.vw & m8) + (vecw_srli(acc_het2hom1.vw, 8) & m8);
2298     acc_het1hom2.vw = (acc_het1hom2.vw & m8) + (vecw_srli(acc_het1hom2.vw, 8) & m8);
2299     *king_counts_iter++ += UniVecHsum16(acc_ibs0);
2300     *king_counts_iter++ += UniVecHsum16(acc_hethet);
2301     *king_counts_iter++ += UniVecHsum16(acc_het2hom1);
2302     *king_counts_iter++ += UniVecHsum16(acc_het1hom2);
2303   }
2304 }
2305 
2306 void IncrKingSubsetHomhom(const uint32_t* loaded_sample_idx_pairs, const uintptr_t* smaj_hom, const uintptr_t* smaj_ref2het, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts) {
2307   const VecW m1 = VCONST_W(kMask5555);
2308   const VecW m2 = VCONST_W(kMask3333);
2309   const VecW m4 = VCONST_W(kMask0F0F);
2310   const uint32_t* sample_idx_pair_iter = &(loaded_sample_idx_pairs[(2 * k1LU) * start_idx]);
2311   const uint32_t* sample_idx_pair_stop = &(loaded_sample_idx_pairs[(2 * k1LU) * end_idx]);
2312   uint32_t* king_counts_iter = &(king_counts[(5 * k1LU) * start_idx]);
2313   while (sample_idx_pair_iter != sample_idx_pair_stop) {
2314     // technically overflows for huge sample_ct
2315     const uint32_t first_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2316     const uint32_t second_offset = (*sample_idx_pair_iter++) * kKingMultiplexWords;
2317     const VecW* first_hom = R_CAST(const VecW*, &(smaj_hom[first_offset]));
2318     const VecW* first_ref2het = R_CAST(const VecW*, &(smaj_ref2het[first_offset]));
2319     const VecW* second_hom = R_CAST(const VecW*, &(smaj_hom[second_offset]));
2320     const VecW* second_ref2het = R_CAST(const VecW*, &(smaj_ref2het[second_offset]));
2321     UniVec acc_homhom;
2322     UniVec acc_ibs0;
2323     UniVec acc_hethet;
2324     UniVec acc_het2hom1;
2325     UniVec acc_het1hom2;
2326     acc_homhom.vw = vecw_setzero();
2327     acc_ibs0.vw = vecw_setzero();
2328     acc_hethet.vw = vecw_setzero();
2329     acc_het2hom1.vw = vecw_setzero();
2330     acc_het1hom2.vw = vecw_setzero();
2331     for (uint32_t vec_idx = 0; vec_idx < kKingMultiplexVecs; vec_idx += 3) {
2332       VecW hom1 = first_hom[vec_idx];
2333       VecW hom2 = second_hom[vec_idx];
2334       VecW ref2het1 = first_ref2het[vec_idx];
2335       VecW ref2het2 = second_ref2het[vec_idx];
2336       VecW agg_homhom = hom1 & hom2;
2337       VecW het1 = vecw_and_notfirst(hom1, ref2het1);
2338       VecW het2 = vecw_and_notfirst(hom2, ref2het2);
2339       VecW agg_ibs0 = (ref2het1 ^ ref2het2) & agg_homhom;
2340       VecW agg_hethet = het1 & het2;
2341       VecW agg_het2hom1 = hom1 & het2;
2342       VecW agg_het1hom2 = hom2 & het1;
2343       agg_homhom = agg_homhom - (vecw_srli(agg_homhom, 1) & m1);
2344       agg_ibs0 = agg_ibs0 - (vecw_srli(agg_ibs0, 1) & m1);
2345       agg_hethet = agg_hethet - (vecw_srli(agg_hethet, 1) & m1);
2346       agg_het2hom1 = agg_het2hom1 - (vecw_srli(agg_het2hom1, 1) & m1);
2347       agg_het1hom2 = agg_het1hom2 - (vecw_srli(agg_het1hom2, 1) & m1);
2348       agg_homhom = (agg_homhom & m2) + (vecw_srli(agg_homhom, 2) & m2);
2349       agg_ibs0 = (agg_ibs0 & m2) + (vecw_srli(agg_ibs0, 2) & m2);
2350       agg_hethet = (agg_hethet & m2) + (vecw_srli(agg_hethet, 2) & m2);
2351       agg_het2hom1 = (agg_het2hom1 & m2) + (vecw_srli(agg_het2hom1, 2) & m2);
2352       agg_het1hom2 = (agg_het1hom2 & m2) + (vecw_srli(agg_het1hom2, 2) & m2);
2353 
2354       for (uint32_t offset = 1; offset != 3; ++offset) {
2355         hom1 = first_hom[vec_idx + offset];
2356         hom2 = second_hom[vec_idx + offset];
2357         ref2het1 = first_ref2het[vec_idx + offset];
2358         ref2het2 = second_ref2het[vec_idx + offset];
2359         VecW cur_homhom = hom1 & hom2;
2360         het1 = vecw_and_notfirst(hom1, ref2het1);
2361         het2 = vecw_and_notfirst(hom2, ref2het2);
2362         VecW cur_ibs0 = (ref2het1 ^ ref2het2) & cur_homhom;
2363         VecW cur_hethet = het1 & het2;
2364         VecW cur_het2hom1 = hom1 & het2;
2365         VecW cur_het1hom2 = hom2 & het1;
2366         cur_homhom = cur_homhom - (vecw_srli(cur_homhom, 1) & m1);
2367         cur_ibs0 = cur_ibs0 - (vecw_srli(cur_ibs0, 1) & m1);
2368         cur_hethet = cur_hethet - (vecw_srli(cur_hethet, 1) & m1);
2369         cur_het2hom1 = cur_het2hom1 - (vecw_srli(cur_het2hom1, 1) & m1);
2370         cur_het1hom2 = cur_het1hom2 - (vecw_srli(cur_het1hom2, 1) & m1);
2371         agg_homhom += (cur_homhom & m2) + (vecw_srli(cur_homhom, 2) & m2);
2372         agg_ibs0 += (cur_ibs0 & m2) + (vecw_srli(cur_ibs0, 2) & m2);
2373         agg_hethet += (cur_hethet & m2) + (vecw_srli(cur_hethet, 2) & m2);
2374         agg_het2hom1 += (cur_het2hom1 & m2) + (vecw_srli(cur_het2hom1, 2) & m2);
2375         agg_het1hom2 += (cur_het1hom2 & m2) + (vecw_srli(cur_het1hom2, 2) & m2);
2376       }
2377       acc_homhom.vw = acc_homhom.vw + (agg_homhom & m4) + (vecw_srli(agg_homhom, 4) & m4);
2378       acc_ibs0.vw = acc_ibs0.vw + (agg_ibs0 & m4) + (vecw_srli(agg_ibs0, 4) & m4);
2379       acc_hethet.vw = acc_hethet.vw + (agg_hethet & m4) + (vecw_srli(agg_hethet, 4) & m4);
2380       acc_het2hom1.vw = acc_het2hom1.vw + (agg_het2hom1 & m4) + (vecw_srli(agg_het2hom1, 4) & m4);
2381       acc_het1hom2.vw = acc_het1hom2.vw + (agg_het1hom2 & m4) + (vecw_srli(agg_het1hom2, 4) & m4);
2382     }
2383     const VecW m8 = VCONST_W(kMask00FF);
2384     acc_homhom.vw = (acc_homhom.vw & m8) + (vecw_srli(acc_homhom.vw, 8) & m8);
2385     acc_ibs0.vw = (acc_ibs0.vw & m8) + (vecw_srli(acc_ibs0.vw, 8) & m8);
2386     acc_hethet.vw = (acc_hethet.vw & m8) + (vecw_srli(acc_hethet.vw, 8) & m8);
2387     acc_het2hom1.vw = (acc_het2hom1.vw & m8) + (vecw_srli(acc_het2hom1.vw, 8) & m8);
2388     acc_het1hom2.vw = (acc_het1hom2.vw & m8) + (vecw_srli(acc_het1hom2.vw, 8) & m8);
2389     *king_counts_iter++ += UniVecHsum16(acc_ibs0);
2390     *king_counts_iter++ += UniVecHsum16(acc_hethet);
2391     *king_counts_iter++ += UniVecHsum16(acc_het2hom1);
2392     *king_counts_iter++ += UniVecHsum16(acc_het1hom2);
2393     *king_counts_iter++ += UniVecHsum16(acc_homhom);
2394   }
2395 }
2396 #endif
2397 
2398 typedef struct CalcKingTableSubsetCtxStruct {
2399   uintptr_t* smaj_hom[2];
2400   uintptr_t* smaj_ref2het[2];
2401   uint32_t* loaded_sample_idx_pairs;
2402   uint32_t homhom_needed;
2403 
2404   uint32_t* thread_start;
2405 
2406   uint32_t* king_counts;
2407 } CalcKingTableSubsetCtx;
2408 
2409 THREAD_FUNC_DECL CalcKingTableSubsetThread(void* raw_arg) {
2410   ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
2411   const uintptr_t tidx = arg->tidx;
2412   CalcKingTableSubsetCtx* ctx = S_CAST(CalcKingTableSubsetCtx*, arg->sharedp->context);
2413 
2414   const uint32_t start_idx = ctx->thread_start[tidx];
2415   const uint32_t end_idx = ctx->thread_start[tidx + 1];
2416   const uint32_t homhom_needed = ctx->homhom_needed;
2417   uint32_t parity = 0;
2418   do {
2419     if (homhom_needed) {
2420       IncrKingSubsetHomhom(ctx->loaded_sample_idx_pairs, ctx->smaj_hom[parity], ctx->smaj_ref2het[parity], start_idx, end_idx, ctx->king_counts);
2421     } else {
2422       IncrKingSubset(ctx->loaded_sample_idx_pairs, ctx->smaj_hom[parity], ctx->smaj_ref2het[parity], start_idx, end_idx, ctx->king_counts);
2423     }
2424     parity = 1 - parity;
2425   } while (!THREAD_BLOCK_FINISH(arg));
2426   THREAD_RETURN;
2427 }
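
// Note on the double buffering above: the workers read
// smaj_hom[parity]/smaj_ref2het[parity] while the main thread transposes the
// next variant block into the opposite parity's buffers; THREAD_BLOCK_FINISH()
// is the synchronization point at which the buffers swap roles.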
2428 
2429 PglErr KingTableSubsetLoad(const char* sorted_xidbox, const uint32_t* xid_map, uintptr_t max_xid_blen, uintptr_t orig_sample_ct, double king_table_subset_thresh, XidMode xid_mode, uint32_t skip_sid, uint32_t rel_check, uint32_t kinship_skip, uint32_t is_first_parallel_scan, uint64_t pair_idx_start, uint64_t pair_idx_stop, uintptr_t line_idx, TextStream* txsp, uint64_t* pair_idx_ptr, uint32_t* loaded_sample_idx_pairs, char* idbuf) {
2430   PglErr reterr = kPglRetSuccess;
2431   {
2432     uint64_t pair_idx = *pair_idx_ptr;
2433     // Assumes the header line has already been read if pair_idx == 0;
2434     // otherwise, assumes the stream is positioned pair_idx pairs in.
2435     uint32_t* loaded_sample_idx_pairs_iter = loaded_sample_idx_pairs;
2436     ++line_idx;
2437     for (char* line_iter = TextLineEnd(txsp); TextGetUnsafe2(txsp, &line_iter); line_iter = AdvPastDelim(line_iter, '\n'), ++line_idx) {
2438       const char* linebuf_iter = line_iter;
2439       uint32_t sample_uidx1;
2440       if (SortedXidboxReadFind(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, 0, xid_mode, &linebuf_iter, &sample_uidx1, idbuf)) {
2441         if (unlikely(!linebuf_iter)) {
2442           goto KingTableSubsetLoad_ret_MISSING_TOKENS;
2443         }
2444         line_iter = K_CAST(char*, linebuf_iter);
2445         continue;
2446       }
2447       linebuf_iter = FirstNonTspace(linebuf_iter);
2448       if (skip_sid) {
2449         if (unlikely(IsEolnKns(*linebuf_iter))) {
2450           goto KingTableSubsetLoad_ret_MISSING_TOKENS;
2451         }
2452         linebuf_iter = FirstNonTspace(CurTokenEnd(linebuf_iter));
2453       }
2454       if (rel_check) {
2455         // linebuf_iter must point to the start of the second FID, while
2456         // line_iter points to the start of the first.
2457         const uint32_t first_fid_slen = CurTokenEnd(line_iter) - line_iter;
2458         const uint32_t second_fid_slen = CurTokenEnd(linebuf_iter) - linebuf_iter;
2459         if ((first_fid_slen != second_fid_slen) || (!memequal(line_iter, linebuf_iter, first_fid_slen))) {
2460           line_iter = K_CAST(char*, linebuf_iter);
2461           continue;
2462         }
2463       }
2464       uint32_t sample_uidx2;
2465       if (SortedXidboxReadFind(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, 0, xid_mode, &linebuf_iter, &sample_uidx2, idbuf)) {
2466         if (unlikely(!linebuf_iter)) {
2467           goto KingTableSubsetLoad_ret_MISSING_TOKENS;
2468         }
2469         line_iter = K_CAST(char*, linebuf_iter);
2470         continue;
2471       }
2472       if (unlikely(sample_uidx1 == sample_uidx2)) {
2473         // could technically be due to unloaded SID, so use inconsistent-input
2474         // error code
2475         snprintf(g_logbuf, kLogbufSize, "Error: Identical sample IDs on line %" PRIuPTR " of --king-table-subset file.\n", line_idx);
2476         goto KingTableSubsetLoad_ret_INCONSISTENT_INPUT_WW;
2477       }
2478       if (king_table_subset_thresh != -DBL_MAX) {
2479         linebuf_iter = FirstNonTspace(linebuf_iter);
2480         linebuf_iter = NextTokenMult0(linebuf_iter, kinship_skip);
2481         if (unlikely(!linebuf_iter)) {
2482           goto KingTableSubsetLoad_ret_MISSING_TOKENS;
2483         }
2484         double cur_kinship;
2485         const char* kinship_end = ScanadvDouble(linebuf_iter, &cur_kinship);
2486         if (!kinship_end) {
2487           line_iter = K_CAST(char*, linebuf_iter);
2488           continue;
2489         }
2490         if (unlikely(!IsSpaceOrEoln(*kinship_end))) {
2491           kinship_end = CurTokenEnd(kinship_end);
2492           *K_CAST(char*, kinship_end) = '\0';
2493           logerrprintfww("Error: Invalid numeric token '%s' on line %" PRIuPTR " of --king-table-subset file.\n", linebuf_iter, line_idx);
2494           goto KingTableSubsetLoad_ret_MALFORMED_INPUT;
2495         }
2496         if (cur_kinship < king_table_subset_thresh) {
2497           line_iter = K_CAST(char*, kinship_end);
2498           continue;
2499         }
2500       }
2501       line_iter = K_CAST(char*, linebuf_iter);
2502       if (pair_idx < pair_idx_start) {
2503         ++pair_idx;
2504         continue;
2505       }
2506       *loaded_sample_idx_pairs_iter++ = sample_uidx1;
2507       *loaded_sample_idx_pairs_iter++ = sample_uidx2;
2508       ++pair_idx;
2509       if (pair_idx == pair_idx_stop) {
2510         if (!is_first_parallel_scan) {
2511           TextSetPos(AdvPastDelim(line_iter, '\n'), txsp);
2512           goto KingTableSubsetLoad_finish;
2513         }
2514         // large --parallel job, first pass: count number of valid pairs, don't
2515         // save the remainder
2516         pair_idx_start = ~0LLU;
2517       }
2518     }
2519     if (unlikely(TextStreamErrcode2(txsp, &reterr))) {
2520       goto KingTableSubsetLoad_ret_TSTREAM_FAIL;
2521     }
2522   KingTableSubsetLoad_finish:
2523     *pair_idx_ptr = pair_idx;
2524   }
2525   while (0) {
2526   KingTableSubsetLoad_ret_TSTREAM_FAIL:
2527     TextStreamErrPrint("--king-table-subset file", txsp);
2528     break;
2529   KingTableSubsetLoad_ret_MALFORMED_INPUT:
2530     reterr = kPglRetMalformedInput;
2531     break;
2532   KingTableSubsetLoad_ret_MISSING_TOKENS:
2533     snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --king-table-subset file has fewer tokens than expected.\n", line_idx);
2534   KingTableSubsetLoad_ret_INCONSISTENT_INPUT_WW:
2535     WordWrapB(0);
2536     logerrputsb();
2537     reterr = kPglRetInconsistentInput;
2538     break;
2539   }
2540   return reterr;
2541 }
2542 
2543 typedef struct FidPairIteratorStruct {
2544   uint32_t block_start_idx;
2545   uint32_t block_end_idx;
2546   uint32_t idx1;
2547   uint32_t idx2;
2548 } FidPairIterator;
2549 
2550 void InitFidPairIterator(FidPairIterator* fpip) {
2551   fpip->block_start_idx = UINT32_MAX;  // deliberate overflow
2552   fpip->block_end_idx = 0;
2553   fpip->idx1 = 0;
2554   fpip->idx2 = 0;  // defensive
2555 }
2556 
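// Counts same-FID pairs: each natural-sorted FID block of size n contributes
// n * (n - 1) / 2 pairs.  E.g. block sizes {4, 2, 1} yield 6 + 1 + 0 = 7;
// rel-check only ever compares samples sharing a FID.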
2557 uint64_t CountRelCheckPairs(const char* nsorted_xidbox, uintptr_t max_xid_blen, uintptr_t orig_sample_ct, char* idbuf) {
2558   uint64_t total = 0;
2559   for (uintptr_t block_start_idx = 0; block_start_idx != orig_sample_ct; ) {
2560     const char* fid_start = &(nsorted_xidbox[block_start_idx * max_xid_blen]);
2561     const uint32_t fid_slen = AdvToDelim(fid_start, '\t') - fid_start;
2562     memcpy(idbuf, fid_start, fid_slen);
2563     idbuf[fid_slen] = ' ';
2564     // bugfix (14 Jan 2020): forgot that natural-sorting was used...
2565     idbuf[fid_slen + 1] = '\0';
2566     const uintptr_t block_end_idx = ExpsearchNsortStrLb(idbuf, nsorted_xidbox, max_xid_blen, orig_sample_ct, block_start_idx + 1);
2567     const uint64_t cur_block_size = block_end_idx - block_start_idx;
2568     total += (cur_block_size * (cur_block_size - 1)) / 2;
2569     block_start_idx = block_end_idx;
2570   }
2571   return total;
2572 }
2573 
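// Enumerates same-FID sample pairs (idx2 < idx1 within each natural-sorted
// FID block) in a deterministic order, so that --parallel and multipass runs
// produce identical pair sequences.  FidPairIterator records the scan
// position, letting a later call resume where the previous pass stopped.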
2574 void GetRelCheckPairs(const char* nsorted_xidbox, const uint32_t* xid_map, uintptr_t max_xid_blen, uintptr_t orig_sample_ct, uint32_t is_first_parallel_scan, uint64_t pair_idx_start, uint64_t pair_idx_stop, FidPairIterator* fpip, uint64_t* pair_idx_ptr, uint32_t* loaded_sample_idx_pairs, char* idbuf) {
2575   // Support "--make-king-table rel-check" without an actual subset-file.
2576   uint32_t block_start_idx = fpip->block_start_idx;
2577   uint32_t block_end_idx = fpip->block_end_idx;
2578   uint32_t idx1 = fpip->idx1;
2579   uint32_t idx2 = fpip->idx2;
2580   uint64_t pair_idx = *pair_idx_ptr;
2581   uint32_t* loaded_sample_idx_pairs_iter = loaded_sample_idx_pairs;
2582   while (1) {
2583     for (; idx1 != block_end_idx; ++idx1) {
2584       // idx1 >= idx2.
2585       uint32_t cur_pair_ct = idx1 - idx2;
2586       uint32_t idx2_stop = idx1;
2587       if (pair_idx_stop - pair_idx < cur_pair_ct) {
2588         cur_pair_ct = pair_idx_stop - pair_idx;
2589         idx2_stop = idx2 + cur_pair_ct;
2590       }
2591       if (pair_idx < pair_idx_start) {
2592         const uint64_t skip_ct = pair_idx_start - pair_idx;
2593         if (skip_ct >= cur_pair_ct) {
2594           idx2 = idx2_stop;
2595         } else {
2596           idx2 += skip_ct;
2597         }
2598         // pair_idx is updated correctly after the inner loop
2599       }
2600       const uint32_t sample_uidx1 = xid_map[idx1];
2601       for (; idx2 != idx2_stop; ++idx2) {
2602         const uint32_t sample_uidx2 = xid_map[idx2];
2603         *loaded_sample_idx_pairs_iter++ = sample_uidx1;
2604         *loaded_sample_idx_pairs_iter++ = sample_uidx2;
2605       }
2606       pair_idx += cur_pair_ct;
2607       if (pair_idx == pair_idx_stop) {
2608         if (is_first_parallel_scan) {
2609           pair_idx = CountRelCheckPairs(nsorted_xidbox, max_xid_blen, orig_sample_ct, idbuf);
2610         }
2611         goto GetRelCheckPairs_early_exit;
2612       }
2613       idx2 = block_start_idx;
2614     }
2615     block_start_idx = block_end_idx;
2616     if (block_start_idx == orig_sample_ct) {
2617       break;
2618     }
2619     idx2 = block_start_idx;
2620     const char* fid_start = &(nsorted_xidbox[block_start_idx * max_xid_blen]);
2621     const uint32_t fid_slen = AdvToDelim(fid_start, '\t') - fid_start;
2622     memcpy(idbuf, fid_start, fid_slen);
2623     idbuf[fid_slen] = ' ';
2624     idbuf[fid_slen + 1] = '\0';
2625     block_end_idx = ExpsearchNsortStrLb(idbuf, nsorted_xidbox, max_xid_blen, orig_sample_ct, block_start_idx + 1);
2626   }
2627  GetRelCheckPairs_early_exit:
2628   *pair_idx_ptr = pair_idx;
2629   fpip->block_start_idx = block_start_idx;
2630   fpip->block_end_idx = block_end_idx;
2631   fpip->idx1 = idx1;
2632   fpip->idx2 = idx2;
2633 }
2634 
2635 PglErr CalcKingTableSubset(const uintptr_t* orig_sample_include, const SampleIdInfo* siip, const uintptr_t* variant_include, const ChrInfo* cip, const char* subset_fname, uint32_t raw_sample_ct, uint32_t orig_sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, double king_table_filter, double king_table_subset_thresh, uint32_t rel_check, KingFlags king_flags, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t max_thread_ct, PgenReader* simple_pgrp, char* outname, char* outname_end) {
2636   // subset_fname permitted to be nullptr when rel_check is true.
2637   unsigned char* bigstack_mark = g_bigstack_base;
2638   FILE* outfile = nullptr;
2639   char* cswritep = nullptr;
2640   PglErr reterr = kPglRetSuccess;
2641   TextStream txs;
2642   CompressStreamState css;
2643   ThreadGroup tg;
2644   PreinitTextStream(&txs);
2645   PreinitCstream(&css);
2646   PreinitThreads(&tg);
2647   {
2648     if (unlikely(IsSet(cip->haploid_mask, 0))) {
2649       logerrputs("Error: --make-king-table cannot be used on haploid genomes.\n");
2650       goto CalcKingTableSubset_ret_INCONSISTENT_INPUT;
2651     }
2652     reterr = ConditionalAllocateNonAutosomalVariants(cip, "--make-king-table", raw_variant_ct, &variant_include, &variant_ct);
2653     if (unlikely(reterr)) {
2654       goto CalcKingTableSubset_ret_1;
2655     }
2656     // 1. Write output header line if necessary.
2657     // 2. Count number of relevant sample pairs (higher uidx in high 32 bits),
2658     //    and load as much as may be useful during first pass (usually there
2659     //    will be only one pass).
2660     // 3. If list is empty, error out.
2661     // 4. If --parallel, discard part of the list, then exit if remainder
2662     //    empty.
2663     // 5. If remainder of list is too large to process in one pass, determine
2664     //    number of necessary passes.  If output filename refers to the same
2665     //    thing as input file, append ~ to input filename.
2666     // Loop:
2667     // * Determine which sample indexes appear in this part of the list.
2668     //   Compute current cumulative_popcounts, perform uidx -> idx conversion.
2669     //   (Don't bother sorting the pairs, since that prevents
2670     //   --parallel/multipass mode from delivering the same results.)
2671     // * Execute usual KING-robust computation, write .kin0 entries.
2672     // * If not last pass, reload input .kin0, etc.
2673     //
2674     // Could store the pairs in a more compact manner, but can live with 50%
2675     // space bloat for now.
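    //
    // Illustrative capacity arithmetic: each pending pair costs 8 bytes of
    // index storage plus 16 or 20 bytes of counters, i.e. 24-28 bytes.  With
    // e.g. 6 GiB of workspace left, that allows up to 6442450944 / 24 =
    // 268435456 pairs per pass, so multiple passes only occur for very large
    // subset lists.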
2676     const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
2677     uint32_t sample_ctaw = BitCtToAlignedWordCt(orig_sample_ct);
2678     uint32_t sample_ctaw2 = NypCtToAlignedWordCt(orig_sample_ct);
2679     uint32_t king_bufsizew = kKingMultiplexWords * orig_sample_ct;
2680     uintptr_t* cur_sample_include;
2681     uint32_t* sample_include_cumulative_popcounts;
2682     uintptr_t* loadbuf;
2683     uintptr_t* splitbuf_hom;
2684     uintptr_t* splitbuf_ref2het;
2685     VecW* vecaligned_buf;
2686     // ok if allocations are a bit oversized
2687     CalcKingTableSubsetCtx ctx;
2688     if (unlikely(
2689             bigstack_alloc_w(raw_sample_ctl, &cur_sample_include) ||
2690             bigstack_alloc_u32(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
2691             bigstack_alloc_w(sample_ctaw2, &loadbuf) ||
2692             bigstack_alloc_w(kPglBitTransposeBatch * sample_ctaw, &splitbuf_hom) ||
2693             bigstack_alloc_w(kPglBitTransposeBatch * sample_ctaw, &splitbuf_ref2het) ||
2694             bigstack_alloc_w(king_bufsizew, &(ctx.smaj_hom[0])) ||
2695             bigstack_alloc_w(king_bufsizew, &(ctx.smaj_ref2het[0])) ||
2696             bigstack_alloc_w(king_bufsizew, &(ctx.smaj_hom[1])) ||
2697             bigstack_alloc_w(king_bufsizew, &(ctx.smaj_ref2het[1])) ||
2698             bigstack_alloc_v(kPglBitTransposeBufvecs, &vecaligned_buf))) {
2699       goto CalcKingTableSubset_ret_NOMEM;
2700     }
2701     SetKingTableFname(king_flags, parallel_idx, parallel_tot, outname_end);
2702     if (subset_fname) {
2703       uint32_t fname_slen;
2704 #ifdef _WIN32
2705       fname_slen = GetFullPathName(subset_fname, kPglFnamesize, g_textbuf, nullptr);
2706       if (unlikely((!fname_slen) || (fname_slen > kPglFnamesize)))
2707 #else
2708       if (unlikely(!realpath(subset_fname, g_textbuf)))
2709 #endif
2710       {
2711         logerrprintfww(kErrprintfFopen, subset_fname, strerror(errno));
2712         goto CalcKingTableSubset_ret_OPEN_FAIL;
2713       }
2714       if (RealpathIdentical(outname, g_textbuf, &(g_textbuf[kPglFnamesize + 64]))) {
2715         logerrputs("Warning: --king-table-subset input filename matches --make-king-table output\nfilename.  Appending '~' to input filename.\n");
2716         fname_slen = strlen(subset_fname);
2717         memcpy(g_textbuf, subset_fname, fname_slen);
2718         strcpy_k(&(g_textbuf[fname_slen]), "~");
2719         if (unlikely(rename(subset_fname, g_textbuf))) {
2720           logerrputs("Error: Failed to append '~' to --king-table-subset input filename.\n");
2721           goto CalcKingTableSubset_ret_OPEN_FAIL;
2722         }
2723         subset_fname = g_textbuf;
2724       }
2725     }
2726 
2727     // Safe to "write" the header line now, if necessary.
2728     reterr = InitCstreamAlloc(outname, 0, king_flags & kfKingTableZs, max_thread_ct, kMaxMediumLine + kCompressStreamBlock, &css, &cswritep);
2729     if (unlikely(reterr)) {
2730       goto CalcKingTableSubset_ret_1;
2731     }
2732     const uint32_t king_col_fid = FidColIsRequired(siip, king_flags / kfKingColMaybefid);
2733     const uint32_t king_col_sid = SidColIsRequired(siip->sids, king_flags / kfKingColMaybesid);
2734     if (!parallel_idx) {
2735       cswritep = AppendKingTableHeader(king_flags, king_col_fid, king_col_sid, cswritep);
2736     }
2737     const uintptr_t max_sample_fmtid_blen = GetMaxSampleFmtidBlen(siip, king_col_fid, king_col_sid);
2738     char* collapsed_sample_fmtids;
2739     if (unlikely(bigstack_alloc_c(max_sample_fmtid_blen * orig_sample_ct, &collapsed_sample_fmtids))) {
2740       goto CalcKingTableSubset_ret_NOMEM;
2741     }
2742     // possible todo: allow this to change between passes
2743     uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
2744     if (calc_thread_ct > orig_sample_ct / 32) {
2745       calc_thread_ct = orig_sample_ct / 32;
2746     }
2747     if (!calc_thread_ct) {
2748       calc_thread_ct = 1;
2749     }
2750     // could eventually have 64-bit ctx.thread_start[]?
2751     if (unlikely(
2752             SetThreadCt(calc_thread_ct, &tg) ||
2753             bigstack_alloc_u32(calc_thread_ct + 1, &ctx.thread_start))) {
2754       goto CalcKingTableSubset_ret_NOMEM;
2755     }
2756 
2757     uintptr_t line_idx = 0;
2758     uint32_t kinship_skip = 0;
2759     uint32_t skip_sid = 0;
2760     XidMode xid_mode = siip->sids? kfXidModeFidIidSid : kfXidModeIidSid;
2761     if (subset_fname) {
2762       reterr = InitTextStream(subset_fname, kTextStreamBlenFast, 1, &txs);
2763       if (unlikely(reterr)) {
2764         if (reterr == kPglRetEof) {
2765           logerrputs("Error: Empty --king-table-subset file.\n");
2766           goto CalcKingTableSubset_ret_MALFORMED_INPUT;
2767         }
2768         goto CalcKingTableSubset_ret_TSTREAM_FAIL;
2769       }
2770       ++line_idx;
2771       const char* linebuf_iter = TextGet(&txs);
2772       if (unlikely(!linebuf_iter)) {
2773         if (!TextStreamErrcode2(&txs, &reterr)) {
2774           logerrputs("Error: Empty --king-table-subset file.\n");
2775           goto CalcKingTableSubset_ret_MALFORMED_INPUT;
2776         }
2777         goto CalcKingTableSubset_ret_TSTREAM_FAIL;
2778       }
2779       const char* token_end = CurTokenEnd(linebuf_iter);
2780       uint32_t token_slen = token_end - linebuf_iter;
2781       // Make this work with both KING- and plink2-generated .kin0 files.
2782       uint32_t fid_present = strequal_k(linebuf_iter, "#FID1", token_slen) || strequal_k(linebuf_iter, "FID", token_slen);
2783       if (fid_present) {
2784         linebuf_iter = FirstNonTspace(token_end);
2785         token_end = CurTokenEnd(linebuf_iter);
2786         token_slen = token_end - linebuf_iter;
2787         xid_mode = kfXidModeFidIid;
2788       } else {
2789         if (unlikely(*linebuf_iter != '#')) {
2790           goto CalcKingTableSubset_ret_INVALID_HEADER;
2791         }
2792         ++linebuf_iter;
2793         --token_slen;
2794         xid_mode = kfXidModeIid;
2795       }
2796       if (unlikely((!strequal_k(linebuf_iter, "ID1", token_slen)) && (!strequal_k(linebuf_iter, "IID1", token_slen)))) {
2797         goto CalcKingTableSubset_ret_INVALID_HEADER;
2798       }
2799       linebuf_iter = FirstNonTspace(token_end);
2800       token_end = CurTokenEnd(linebuf_iter);
2801       token_slen = token_end - linebuf_iter;
2802       if (strequal_k(linebuf_iter, "SID1", token_slen)) {
2803         if (siip->sids) {
2804           xid_mode = fid_present? kfXidModeFidIidSid : kfXidModeIidSid;
2805         } else {
2806           skip_sid = 1;
2807         }
2808         linebuf_iter = FirstNonTspace(token_end);
2809         token_end = CurTokenEnd(linebuf_iter);
2810         token_slen = token_end - linebuf_iter;
2811       }
2812       if (fid_present) {
2813         if (unlikely(!strequal_k(linebuf_iter, "FID2", token_slen))) {
2814           goto CalcKingTableSubset_ret_INVALID_HEADER;
2815         }
2816         linebuf_iter = FirstNonTspace(token_end);
2817         token_end = CurTokenEnd(linebuf_iter);
2818         token_slen = token_end - linebuf_iter;
2819       }
2820       if (unlikely((!strequal_k(linebuf_iter, "ID2", token_slen)) && (!strequal_k(linebuf_iter, "IID2", token_slen)))) {
2821         goto CalcKingTableSubset_ret_INVALID_HEADER;
2822       }
2823       if (xid_mode == kfXidModeFidIidSid) {
2824         // technically don't need to check this in skip_sid case
2825         linebuf_iter = FirstNonTspace(token_end);
2826         token_end = CurTokenEnd(linebuf_iter);
2827         token_slen = token_end - linebuf_iter;
2828         if (unlikely(!strequal_k(linebuf_iter, "SID2", token_slen))) {
2829           goto CalcKingTableSubset_ret_INVALID_HEADER;
2830         }
2831       }
2832       if (king_table_subset_thresh != -DBL_MAX) {
2833         king_table_subset_thresh *= 1.0 - kSmallEpsilon;
2834         while (1) {
2835           linebuf_iter = FirstNonTspace(token_end);
2836           token_end = CurTokenEnd(linebuf_iter);
2837           token_slen = token_end - linebuf_iter;
2838           if (unlikely(!token_slen)) {
2839             logerrputs("Error: No kinship-coefficient column in --king-table-subset file.\n");
2840             goto CalcKingTableSubset_ret_INCONSISTENT_INPUT;
2841           }
2842           if (strequal_k(linebuf_iter, "KINSHIP", token_slen) || strequal_k(linebuf_iter, "Kinship", token_slen)) {
2843             break;
2844           }
2845           ++kinship_skip;
2846         }
2847       }
2848     }
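
    // Illustrative accepted header (plink2-style .kin0; KING-generated
    // headers are also recognized):
    //   #FID1  IID1  SID1  FID2  IID2  SID2  ...  KINSHIP
    // The SID columns are skipped when no SIDs are loaded, and a
    // kinship-coefficient column is only required when a --king-table-subset
    // threshold was specified.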
2849 
2850     uint32_t* xid_map;  // IDs not collapsed
2851     char* sorted_xidbox;
2852     uintptr_t max_xid_blen;
2853     // may as well use natural-sort order in rel-check-only case
2854     reterr = SortedXidboxInitAlloc(orig_sample_include, siip, orig_sample_ct, 0, xid_mode, (!subset_fname), &sorted_xidbox, &xid_map, &max_xid_blen);
2855     if (unlikely(reterr)) {
2856       goto CalcKingTableSubset_ret_1;
2857     }
2858     char* idbuf;
2859     if (unlikely(bigstack_alloc_c(max_xid_blen, &idbuf))) {
2860       goto CalcKingTableSubset_ret_NOMEM;
2861     }
2862 
2863     ctx.homhom_needed = (king_flags & kfKingColNsnp) || ((!(king_flags & kfKingCounts)) && (king_flags & (kfKingColHethet | kfKingColIbs0 | kfKingColIbs1)));
2864     const uint32_t homhom_needed_p4 = ctx.homhom_needed + 4;
2865     // if homhom_needed, 8 + 20 bytes per pair, otherwise 8 + 16
2866     uintptr_t pair_buf_capacity = bigstack_left();
2867     if (unlikely(pair_buf_capacity < 2 * kCacheline)) {
2868       goto CalcKingTableSubset_ret_NOMEM;
2869     }
2870     // adverse rounding: keep 2 cachelines in reserve for alignment loss
2871     pair_buf_capacity = (pair_buf_capacity - 2 * kCacheline) / (24 + 4 * ctx.homhom_needed);
2872     if (pair_buf_capacity > 0xffffffffU) {
2873       // 32-bit ctx.thread_start[] for now
2874       pair_buf_capacity = 0xffffffffU;
2875     }
2876     ctx.loaded_sample_idx_pairs = S_CAST(uint32_t*, bigstack_alloc_raw_rd(pair_buf_capacity * 2 * sizeof(int32_t)));
2877     ctx.king_counts = R_CAST(uint32_t*, g_bigstack_base);
2878     SetThreadFuncAndData(CalcKingTableSubsetThread, &ctx, &tg);
2879 
2880     FidPairIterator fpi;
2881     InitFidPairIterator(&fpi);
2882 
2883     uint64_t pair_idx = 0;
2884     if (!subset_fname) {
2885       GetRelCheckPairs(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, (parallel_tot != 1), 0, pair_buf_capacity, &fpi, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
2886     } else {
2887       fputs("Scanning --king-table-subset file...", stdout);
2888       fflush(stdout);
2889       reterr = KingTableSubsetLoad(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, king_table_subset_thresh, xid_mode, skip_sid, rel_check, kinship_skip, (parallel_tot != 1), 0, pair_buf_capacity, line_idx, &txs, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
2890       if (unlikely(reterr)) {
2891         goto CalcKingTableSubset_ret_1;
2892       }
2893     }
2894     uint64_t pair_idx_global_start = 0;
2895     uint64_t pair_idx_global_stop = ~0LLU;
2896     if (parallel_tot != 1) {
2897       const uint64_t parallel_pair_ct = pair_idx;
2898       pair_idx_global_start = (parallel_idx * parallel_pair_ct) / parallel_tot;
2899       pair_idx_global_stop = ((parallel_idx + 1) * parallel_pair_ct) / parallel_tot;
2900       if (pair_idx > pair_buf_capacity) {
2901         // may as well document possible overflow
2902         if (unlikely(parallel_pair_ct > ((~0LLU) / kParallelMax))) {
2903           if (!subset_fname) {
2904             // This is easy to support if there's ever a need, of course.
2905             logerrputs("Error: Too many \"--make-king-table rel-check\" sample pairs for this " PROG_NAME_STR "\nbuild.\n");
2906           } else {
2907             logerrputs("Error: Too many --king-table-subset sample pairs for this " PROG_NAME_STR " build.\n");
2908           }
2909           reterr = kPglRetNotYetSupported;
2910           goto CalcKingTableSubset_ret_1;
2911         }
2912         if (pair_idx_global_stop > pair_buf_capacity) {
2913           // large --parallel job
2914           pair_idx = 0;
2915           if (!subset_fname) {
2916             InitFidPairIterator(&fpi);
2917             GetRelCheckPairs(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, 0, pair_idx_global_start, MINV(pair_idx_global_stop, pair_idx_global_start + pair_buf_capacity), &fpi, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
2918           } else {
2919             reterr = TextRewind(&txs);
2920             if (unlikely(reterr)) {
2921               goto CalcKingTableSubset_ret_TSTREAM_FAIL;
2922             }
2923             // bugfix (4 Oct 2019): forgot a bunch of reinitialization here
2924             line_idx = 1;
2925             char* header_throwaway;
2926             reterr = TextNextLineLstrip(&txs, &header_throwaway);
2927             if (unlikely(reterr)) {
2928               goto CalcKingTableSubset_ret_TSTREAM_REWIND_FAIL;
2929             }
2930             reterr = KingTableSubsetLoad(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, king_table_subset_thresh, xid_mode, skip_sid, rel_check, kinship_skip, 0, pair_idx_global_start, MINV(pair_idx_global_stop, pair_idx_global_start + pair_buf_capacity), line_idx, &txs, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
2931             if (unlikely(reterr)) {
2932               goto CalcKingTableSubset_ret_1;
2933             }
2934           }
2935         } else {
2936           pair_idx = pair_idx_global_stop;
2937           if (pair_idx_global_start) {
2938             memmove(ctx.loaded_sample_idx_pairs, &(ctx.loaded_sample_idx_pairs[pair_idx_global_start * 2]), (pair_idx_global_stop - pair_idx_global_start) * 2 * sizeof(int32_t));
2939           }
2940         }
2941       } else {
2942         pair_idx = pair_idx_global_stop;
2943         if (pair_idx_global_start) {
2944           memmove(ctx.loaded_sample_idx_pairs, &(ctx.loaded_sample_idx_pairs[pair_idx_global_start * 2]), (pair_idx_global_stop - pair_idx_global_start) * 2 * sizeof(int32_t));
2945         }
2946       }
2947     }
2948     uint64_t pair_idx_cur_start = pair_idx_global_start;
2949     uint64_t king_table_filter_ct = 0;
2950     uintptr_t pass_idx = 1;
2951     while (pair_idx_cur_start < pair_idx) {
2952       ZeroWArr(raw_sample_ctl, cur_sample_include);
2953       const uintptr_t cur_pair_ct = pair_idx - pair_idx_cur_start;
2954       const uintptr_t cur_pair_ct_x2 = 2 * cur_pair_ct;
2955       for (uintptr_t ulii = 0; ulii != cur_pair_ct_x2; ++ulii) {
2956         SetBit(ctx.loaded_sample_idx_pairs[ulii], cur_sample_include);
2957       }
2958       FillCumulativePopcounts(cur_sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
2959       const uint32_t cur_sample_ct = sample_include_cumulative_popcounts[raw_sample_ctl - 1] + PopcountWord(cur_sample_include[raw_sample_ctl - 1]);
2960       const uint32_t cur_sample_ctaw = BitCtToAlignedWordCt(cur_sample_ct);
2961       const uint32_t cur_sample_ctaw2 = NypCtToAlignedWordCt(cur_sample_ct);
2962       if (cur_sample_ct != raw_sample_ct) {
2963         for (uintptr_t ulii = 0; ulii != cur_pair_ct_x2; ++ulii) {
2964           ctx.loaded_sample_idx_pairs[ulii] = RawToSubsettedPos(cur_sample_include, sample_include_cumulative_popcounts, ctx.loaded_sample_idx_pairs[ulii]);
2965         }
2966       }
2967       ZeroU32Arr(cur_pair_ct * homhom_needed_p4, ctx.king_counts);
2968       CollapsedSampleFmtidInit(cur_sample_include, siip, cur_sample_ct, king_col_fid, king_col_sid, max_sample_fmtid_blen, collapsed_sample_fmtids);
2969       for (uint32_t tidx = 0; tidx <= calc_thread_ct; ++tidx) {
2970         ctx.thread_start[tidx] = (tidx * S_CAST(uint64_t, cur_pair_ct)) / calc_thread_ct;
2971       }
2972       if (pass_idx != 1) {
2973         ReinitThreads(&tg);
2974       }
2975       // possible todo: singleton/monomorphic optimization for sufficiently
2976       // large jobs
2977       uintptr_t variant_uidx_base = 0;
2978       uintptr_t cur_bits = variant_include[0];
2979       uint32_t variants_completed = 0;
2980       uint32_t parity = 0;
2981       const uint32_t sample_batch_ct_m1 = (cur_sample_ct - 1) / kPglBitTransposeBatch;
2982       PgrSampleSubsetIndex pssi;
2983       PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, simple_pgrp, &pssi);
2984       do {
2985         const uint32_t cur_block_size = MINV(variant_ct - variants_completed, kKingMultiplex);
2986         uintptr_t* cur_smaj_hom = ctx.smaj_hom[parity];
2987         uintptr_t* cur_smaj_ref2het = ctx.smaj_ref2het[parity];
2988         // "block" = distance computation granularity, usually 1024 or 1536
2989         //           variants
2990         // "batch" = variant-major-to-sample-major transpose granularity,
2991         //           currently 512 variants
2992         uint32_t variant_batch_size = kPglBitTransposeBatch;
2993         uint32_t variant_batch_size_rounded_up = kPglBitTransposeBatch;
2994         const uint32_t write_batch_ct_m1 = (cur_block_size - 1) / kPglBitTransposeBatch;
2995         for (uint32_t write_batch_idx = 0; ; ++write_batch_idx) {
2996           if (write_batch_idx >= write_batch_ct_m1) {
2997             if (write_batch_idx > write_batch_ct_m1) {
2998               break;
2999             }
3000             variant_batch_size = ModNz(cur_block_size, kPglBitTransposeBatch);
3001             variant_batch_size_rounded_up = variant_batch_size;
3002             const uint32_t variant_batch_size_rem = variant_batch_size % kBitsPerWord;
3003             if (variant_batch_size_rem) {
3004               const uint32_t trailing_variant_ct = kBitsPerWord - variant_batch_size_rem;
3005               variant_batch_size_rounded_up += trailing_variant_ct;
3006               ZeroWArr(trailing_variant_ct * cur_sample_ctaw, &(splitbuf_hom[variant_batch_size * cur_sample_ctaw]));
3007               ZeroWArr(trailing_variant_ct * cur_sample_ctaw, &(splitbuf_ref2het[variant_batch_size * cur_sample_ctaw]));
3008             }
3009           }
3010           uintptr_t* hom_iter = splitbuf_hom;
3011           uintptr_t* ref2het_iter = splitbuf_ref2het;
3012           for (uint32_t uii = 0; uii != variant_batch_size; ++uii) {
3013             const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
3014             reterr = PgrGet(cur_sample_include, pssi, cur_sample_ct, variant_uidx, simple_pgrp, loadbuf);
3015             if (unlikely(reterr)) {
3016               goto CalcKingTableSubset_ret_PGR_FAIL;
3017             }
3018             // may want to support some sort of low-MAF optimization here
3019             SetTrailingNyps(cur_sample_ct, loadbuf);
3020             SplitHomRef2hetUnsafeW(loadbuf, cur_sample_ctaw2, hom_iter, ref2het_iter);
3021             hom_iter = &(hom_iter[cur_sample_ctaw]);
3022             ref2het_iter = &(ref2het_iter[cur_sample_ctaw]);
3023           }
3024           // uintptr_t* read_iter = loadbuf;
3025           uintptr_t* write_hom_iter = &(cur_smaj_hom[write_batch_idx * kPglBitTransposeWords]);
3026           uintptr_t* write_ref2het_iter = &(cur_smaj_ref2het[write_batch_idx * kPglBitTransposeWords]);
3027           uint32_t write_batch_size = kPglBitTransposeBatch;
3028           for (uint32_t sample_batch_idx = 0; ; ++sample_batch_idx) {
3029             if (sample_batch_idx >= sample_batch_ct_m1) {
3030               if (sample_batch_idx > sample_batch_ct_m1) {
3031                 break;
3032               }
3033               write_batch_size = ModNz(cur_sample_ct, kPglBitTransposeBatch);
3034             }
3035             // bugfix: read_batch_size must be rounded up to word boundary,
3036             // since we want to one-out instead of zero-out the trailing bits
3037             //
3038             // bugfix: if we always use kPglBitTransposeBatch instead of
3039             // variant_batch_size_rounded_up, we read/write past the
3040             // kKingMultiplex limit and clobber the first variants of the next
3041             // sample with garbage.
3042             TransposeBitblock(&(splitbuf_hom[sample_batch_idx * kPglBitTransposeWords]), cur_sample_ctaw, kKingMultiplexWords, variant_batch_size_rounded_up, write_batch_size, write_hom_iter, vecaligned_buf);
3043             TransposeBitblock(&(splitbuf_ref2het[sample_batch_idx * kPglBitTransposeWords]), cur_sample_ctaw, kKingMultiplexWords, variant_batch_size_rounded_up, write_batch_size, write_ref2het_iter, vecaligned_buf);
3044             write_hom_iter = &(write_hom_iter[kKingMultiplex * kPglBitTransposeWords]);
3045             write_ref2het_iter = &(write_ref2het_iter[kKingMultiplex * kPglBitTransposeWords]);
3046           }
3047         }
3048         const uint32_t cur_block_sizew = BitCtToWordCt(cur_block_size);
3049         if (cur_block_sizew < kKingMultiplexWords) {
3050           uintptr_t* write_hom_iter = &(cur_smaj_hom[cur_block_sizew]);
3051           uintptr_t* write_ref2het_iter = &(cur_smaj_ref2het[cur_block_sizew]);
3052           const uint32_t write_word_ct = kKingMultiplexWords - cur_block_sizew;
3053           for (uint32_t sample_idx = 0; sample_idx != cur_sample_ct; ++sample_idx) {
3054             ZeroWArr(write_word_ct, write_hom_iter);
3055             ZeroWArr(write_word_ct, write_ref2het_iter);
3056             write_hom_iter = &(write_hom_iter[kKingMultiplexWords]);
3057             write_ref2het_iter = &(write_ref2het_iter[kKingMultiplexWords]);
3058           }
3059         }
3060         if (variants_completed) {
3061           JoinThreads(&tg);
3062           // CalcKingTableSubsetThread() never errors out
3063         }
3064         // this update must occur after JoinThreads() call
3065         if (variants_completed + cur_block_size == variant_ct) {
3066           DeclareLastThreadBlock(&tg);
3067         }
3068         if (unlikely(SpawnThreads(&tg))) {
3069           goto CalcKingTableSubset_ret_THREAD_CREATE_FAIL;
3070         }
3071         printf("\r--make-king-table pass %" PRIuPTR ": %u variants complete.", pass_idx, variants_completed);
3072         fflush(stdout);
3073         variants_completed += cur_block_size;
3074         parity = 1 - parity;
3075       } while (!IsLastBlock(&tg));
3076       JoinThreads(&tg);
3077       printf("\r--make-king-table pass %" PRIuPTR ": Writing...                   \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", pass_idx);
3078       fflush(stdout);
3079 
3080       const uint32_t king_col_id = king_flags & kfKingColId;
3081       const uint32_t king_col_nsnp = king_flags & kfKingColNsnp;
3082       const uint32_t king_col_hethet = king_flags & kfKingColHethet;
3083       const uint32_t king_col_ibs0 = king_flags & kfKingColIbs0;
3084       const uint32_t king_col_ibs1 = king_flags & kfKingColIbs1;
3085       const uint32_t king_col_kinship = king_flags & kfKingColKinship;
3086       const uint32_t report_counts = king_flags & kfKingCounts;
3087       uint32_t* results_iter = ctx.king_counts;
3088       double nonmiss_recip = 0.0;
3089       for (uintptr_t cur_pair_idx = 0; cur_pair_idx != cur_pair_ct; ++cur_pair_idx, results_iter = &(results_iter[homhom_needed_p4])) {
3090         const uint32_t ibs0_ct = results_iter[kKingOffsetIbs0];
3091         const uint32_t hethet_ct = results_iter[kKingOffsetHethet];
3092         const uint32_t het2hom1_ct = results_iter[kKingOffsetHet2Hom1];
3093         const uint32_t het1hom2_ct = results_iter[kKingOffsetHet1Hom2];
3094         const intptr_t smaller_het_ct = hethet_ct + MINV(het1hom2_ct, het2hom1_ct);
3095         const double kinship_coeff = 0.5 - (S_CAST(double, 4 * S_CAST(intptr_t, ibs0_ct) + het1hom2_ct + het2hom1_ct) / S_CAST(double, 4 * smaller_het_ct));
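        // This is the KING-robust within-family estimator of Manichaikul et
        // al. (2010): with N_Aa(1) = hethet_ct + het1hom2_ct and
        // N_Aa(2) = hethet_ct + het2hom1_ct,
        //   phi-hat = 1/2 + [2*N_AaAa - 4*N_AAaa - N_Aa(1) - N_Aa(2)]
        //                   / [4 * min(N_Aa(1), N_Aa(2))]
        // which reduces to the expression above because
        // min(N_Aa(1), N_Aa(2)) = hethet_ct + MINV(het1hom2_ct, het2hom1_ct).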
3096         if ((king_table_filter != -DBL_MAX) && (kinship_coeff < king_table_filter)) {
3097           ++king_table_filter_ct;
3098           continue;
3099         }
3100         const uint32_t sample_idx1 = ctx.loaded_sample_idx_pairs[2 * cur_pair_idx];
3101         const uint32_t sample_idx2 = ctx.loaded_sample_idx_pairs[2 * cur_pair_idx + 1];
3102         if (king_col_id) {
3103           cswritep = strcpyax(cswritep, &(collapsed_sample_fmtids[max_sample_fmtid_blen * sample_idx1]), '\t');
3104           cswritep = strcpyax(cswritep, &(collapsed_sample_fmtids[max_sample_fmtid_blen * sample_idx2]), '\t');
3105         }
3106         if (homhom_needed_p4 == 5) {
3107           const uint32_t homhom_ct = results_iter[kKingOffsetHomhom];
3108           const uint32_t nonmiss_ct = het1hom2_ct + het2hom1_ct + homhom_ct + hethet_ct;
3109           if (king_col_nsnp) {
3110             cswritep = u32toa_x(nonmiss_ct, '\t', cswritep);
3111           }
3112           if (!report_counts) {
3113             nonmiss_recip = 1.0 / u31tod(nonmiss_ct);
3114           }
3115         }
3116         if (king_col_hethet) {
3117           if (report_counts) {
3118             cswritep = u32toa(hethet_ct, cswritep);
3119           } else {
3120             cswritep = dtoa_g(nonmiss_recip * u31tod(hethet_ct), cswritep);
3121           }
3122           *cswritep++ = '\t';
3123         }
3124         if (king_col_ibs0) {
3125           if (report_counts) {
3126             cswritep = u32toa(ibs0_ct, cswritep);
3127           } else {
3128             cswritep = dtoa_g(nonmiss_recip * u31tod(ibs0_ct), cswritep);
3129           }
3130           *cswritep++ = '\t';
3131         }
3132         if (king_col_ibs1) {
3133           if (report_counts) {
3134             cswritep = u32toa_x(het1hom2_ct, '\t', cswritep);
3135             cswritep = u32toa(het2hom1_ct, cswritep);
3136           } else {
3137             cswritep = dtoa_g(nonmiss_recip * u31tod(het1hom2_ct), cswritep);
3138             *cswritep++ = '\t';
3139             cswritep = dtoa_g(nonmiss_recip * u31tod(het2hom1_ct), cswritep);
3140           }
3141           *cswritep++ = '\t';
3142         }
3143         if (king_col_kinship) {
3144           cswritep = dtoa_g(kinship_coeff, cswritep);
3145           ++cswritep;
3146         }
3147         DecrAppendBinaryEoln(&cswritep);
3148         if (unlikely(Cswrite(&css, &cswritep))) {
3149           goto CalcKingTableSubset_ret_WRITE_FAIL;
3150         }
3151       }
3152 
3153       putc_unlocked('\r', stdout);
3154       const uint64_t pair_complete_ct = pair_idx - pair_idx_global_start;
3155       logprintf("Subsetted --make-king-table: %" PRIu64 " pair%s complete.\n", pair_complete_ct, (pair_complete_ct == 1)? "" : "s");
3156       if (TextEof(&txs) || (pair_idx == pair_idx_global_stop)) {
3157         break;
3158       }
3159       pair_idx_cur_start = pair_idx;
3160       if (!subset_fname) {
3161         GetRelCheckPairs(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, 0, pair_idx_global_start, MINV(pair_idx_global_stop, pair_idx_global_start + pair_buf_capacity), &fpi, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
3162       } else {
3163         fputs("Scanning --king-table-subset file...", stdout);
3164         fflush(stdout);
3165         reterr = KingTableSubsetLoad(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, king_table_subset_thresh, xid_mode, skip_sid, rel_check, kinship_skip, 0, pair_idx_cur_start, MINV(pair_idx_global_stop, pair_idx_cur_start + pair_buf_capacity), line_idx, &txs, &pair_idx, ctx.loaded_sample_idx_pairs, idbuf);
3166         if (unlikely(reterr)) {
3167           goto CalcKingTableSubset_ret_1;
3168         }
3169       }
3170       ++pass_idx;
3171     }
3172     if (unlikely(CswriteCloseNull(&css, cswritep))) {
3173       goto CalcKingTableSubset_ret_WRITE_FAIL;
3174     }
3175     logprintfww("Results written to %s .\n", outname);
3176     if (king_table_filter != -DBL_MAX) {
3177       const uint64_t reported_ct = pair_idx - pair_idx_global_start - king_table_filter_ct;
3178       logprintf("--king-table-filter: %" PRIu64 " relationship%s reported (%" PRIu64 " filtered out).\n", reported_ct, (reported_ct == 1)? "" : "s", king_table_filter_ct);
3179     }
3180   }
3181   while (0) {
3182   CalcKingTableSubset_ret_NOMEM:
3183     reterr = kPglRetNomem;
3184     break;
3185   CalcKingTableSubset_ret_OPEN_FAIL:
3186     reterr = kPglRetOpenFail;
3187     break;
3188   CalcKingTableSubset_ret_TSTREAM_REWIND_FAIL:
3189     TextStreamErrPrintRewind("--king-table-subset file", &txs, &reterr);
3190     break;
3191   CalcKingTableSubset_ret_TSTREAM_FAIL:
3192     TextStreamErrPrint("--king-table-subset file", &txs);
3193     break;
3194   CalcKingTableSubset_ret_PGR_FAIL:
3195     PgenErrPrintN(reterr);
3196     break;
3197   CalcKingTableSubset_ret_WRITE_FAIL:
3198     reterr = kPglRetWriteFail;
3199     break;
3200   CalcKingTableSubset_ret_INVALID_HEADER:
3201     logerrputs("Error: Invalid header line in --king-table-subset file.\n");
3202   CalcKingTableSubset_ret_MALFORMED_INPUT:
3203     reterr = kPglRetMalformedInput;
3204     break;
3205   CalcKingTableSubset_ret_INCONSISTENT_INPUT:
3206     reterr = kPglRetInconsistentInput;
3207     break;
3208   CalcKingTableSubset_ret_THREAD_CREATE_FAIL:
3209     reterr = kPglRetThreadCreateFail;
3210     break;
3211   }
3212  CalcKingTableSubset_ret_1:
3213   CleanupThreads(&tg);
3214   CleanupTextStream2("--king-table-subset file", &txs, &reterr);
3215   CswriteCloseCond(&css, cswritep);
3216   fclose_cond(outfile);
3217   BigstackReset(bigstack_mark);
3218   return reterr;
3219 }
3220 
3221 // assumes trailing bits of genovec are zeroed out
ExpandCenteredVarmaj(const uintptr_t * genovec,const uintptr_t * dosage_present,const Dosage * dosage_main,uint32_t variance_standardize,uint32_t is_haploid,uint32_t sample_ct,uint32_t dosage_ct,double ref_freq,double * normed_dosages)3222 PglErr ExpandCenteredVarmaj(const uintptr_t* genovec, const uintptr_t* dosage_present, const Dosage* dosage_main, uint32_t variance_standardize, uint32_t is_haploid, uint32_t sample_ct, uint32_t dosage_ct, double ref_freq, double* normed_dosages) {
3223   const double alt_freq = 1.0 - ref_freq;
3224   double inv_stdev;
3225   if (variance_standardize) {
3226     const double variance = 2 * ref_freq * alt_freq;
3227     if (!(variance > kSmallEpsilon)) {
3228       // See LoadMultiallelicCenteredVarmaj().  This check was tightened up in
3229       // alpha 3 to reject all-het and monomorphic-wrong-allele variants.
3230       STD_ARRAY_DECL(uint32_t, 4, genocounts);
3231       GenoarrCountFreqsUnsafe(genovec, sample_ct, genocounts);
3232       if (unlikely(dosage_ct || genocounts[1])) {
3233         return kPglRetDegenerateData;
3234       }
3235       if (variance != variance) {
3236         if (unlikely(genocounts[0] || genocounts[2])) {
3237           return kPglRetDegenerateData;
3238         }
3239       } else {
3240         if (ref_freq > 0.5) {
3241           if (unlikely(genocounts[2])) {
3242             return kPglRetDegenerateData;
3243           }
3244         } else {
3245           if (unlikely(genocounts[0])) {
3246             return kPglRetDegenerateData;
3247           }
3248         }
3249       }
3250       ZeroDArr(sample_ct, normed_dosages);
3251       return kPglRetSuccess;
3252     }
3253     inv_stdev = 1.0 / sqrt(variance);
3254     if (is_haploid) {
3255       // For our purposes, variance is doubled in haploid case.
3256       inv_stdev *= (1.0 / kSqrt2);
3257     }
3258     // possible todo:
3259     // * Could use one inv_stdev for males and one for nonmales for chrX
3260     //   --score (while still leaving that out of GRM... or just leave males
3261     //   out there?).  This depends on dosage compensation model; discussed in
3262     //   e.g. GCTA paper.
3263   } else {
3264     // Extra factor of 2 removed from haploid 'cov' formula in alpha 3.
3265     inv_stdev = is_haploid? 0.5 : 1.0;
3266   }
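  // With the slope/intercept below, each sample's entry becomes
  //   normed = (x - 2 * alt_freq) * inv_stdev
  // where x is the ALT dosage in [0, 2].  (We read the trailing 0.0 argument
  // as the fill value for missing genotypes, i.e. mean imputation after
  // centering; callers that care track missingness separately and correct
  // afterwards.)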
  PopulateRescaledDosage(genovec, dosage_present, dosage_main, inv_stdev, -2 * alt_freq * inv_stdev, 0.0, sample_ct, dosage_ct, normed_dosages);
  return kPglRetSuccess;
}

// This breaks the "don't pass pssi between functions" rule since it's a thin
// wrapper around PgrGetInv1D().
PglErr LoadBiallelicCenteredVarmaj(const uintptr_t* sample_include, PgrSampleSubsetIndex pssi, uint32_t variance_standardize, uint32_t is_haploid, uint32_t sample_ct, uint32_t variant_uidx, double ref_freq, PgenReader* simple_pgrp, uint32_t* missing_presentp, double* normed_dosages, uintptr_t* genovec_buf, uintptr_t* dosage_present_buf, Dosage* dosage_main_buf) {
  uint32_t dosage_ct;
  PglErr reterr = PgrGetD(sample_include, pssi, sample_ct, variant_uidx, simple_pgrp, genovec_buf, dosage_present_buf, dosage_main_buf, &dosage_ct);
  if (unlikely(reterr)) {
    // don't print malformed-.pgen error message here, since this is called
    // from multithreaded loops
    return reterr;
  }
  ZeroTrailingNyps(sample_ct, genovec_buf);
  if (missing_presentp) {
    // missing_present assumed to be initialized to 0
    // this should probably be a library function...
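    // Detection logic, for reference: Word11() isolates the genotype entries
    // equal to 3 (missing hardcall).  In the dosage branch,
    // PackWordToHalfword() compresses that mask to 1 bit per sample so it can
    // be compared against dosage_present: a missing hardcall only counts as
    // missing when no dosage is present for that sample.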
    const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
    if (!dosage_ct) {
      for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
        const uintptr_t detect_11 = Word11(genovec_buf[widx]);
        if (detect_11) {
          *missing_presentp = 1;
          break;
        }
      }
    } else {
      Halfword* dosage_present_alias = R_CAST(Halfword*, dosage_present_buf);
      for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
        const uintptr_t detect_11 = Word11(genovec_buf[widx]);
        if (detect_11) {
          if (PackWordToHalfword(detect_11) & (~dosage_present_alias[widx])) {
            *missing_presentp = 1;
            break;
          }
        }
      }
    }
  }
  return ExpandCenteredVarmaj(genovec_buf, dosage_present_buf, dosage_main_buf, variance_standardize, is_haploid, sample_ct, dosage_ct, ref_freq, normed_dosages);
}

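// Diploid genotype variance for a multiallelic variant:
//   variance = \sum_k p_k * (1 - p_k) = 1 - \sum_k p_k^2.
// The last allele's frequency is not stored explicitly; it is inferred as
// 1 - (sum of the others) when that sum is meaningfully below 1.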
double ComputeDiploidMultiallelicVariance(const double* cur_allele_freqs, uint32_t cur_allele_ct) {
  const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
  double variance = 0.0;
  double freq_sum = 0.0;
  for (uint32_t allele_idx = 0; allele_idx != cur_allele_ct_m1; ++allele_idx) {
    const double cur_allele_freq = cur_allele_freqs[allele_idx];
    variance += cur_allele_freq * (1.0 - cur_allele_freq);
    freq_sum += cur_allele_freq;
  }
  if (freq_sum < 1.0 - kSmallEpsilon) {
    const double last_allele_freq = 1.0 - freq_sum;
    variance += freq_sum * last_allele_freq;
  }
  return variance;
}

// Assumes trailing bits of pgvp->genovec have been zeroed out.
BoolErr CheckMultiallelicDegenVariance(const PgenVariant* pgvp, const double* cur_allele_freqs, uint32_t sample_ct, uint32_t cur_allele_ct, double variance) {
  // One allele has 100% frequency (or all frequencies are NaN).
  // If it's the REF allele, error out unless all nonmissing genotypes are
  // homozygous-ref, in which case this row can be filled with zeroes (or
  // omitted).
  // If it's ALT1, error out unless all nonmissing genotypes are hom-ALT1, etc.
  const uintptr_t* genovec_buf = pgvp->genovec;
  STD_ARRAY_DECL(uint32_t, 4, genocounts);
  GenoarrCountFreqsUnsafe(genovec_buf, sample_ct, genocounts);
  if (unlikely(pgvp->dosage_ct || genocounts[1])) {
    return 1;
  }
  const uint32_t nm_sample_ct = genocounts[2];
  if (variance != variance) {
    // NaN frequency is possible when all founder genotypes/dosages are
    // missing.  Error out in this case unless all other genotypes/dosages are
    // also missing.
    return (genocounts[0] || nm_sample_ct);
  }
  if (cur_allele_freqs[0] > 0.5) {
    return (nm_sample_ct != 0);
  }
  if (unlikely(genocounts[0])) {
    return 1;
  }
  if (cur_allele_freqs[1] > 0.5) {
    return (pgvp->patch_10_ct != 0);
  }
  if (pgvp->patch_10_ct != nm_sample_ct) {
    return 0;
  }
  const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
  uint32_t mono_allele_idx;
  for (mono_allele_idx = 2; mono_allele_idx != cur_allele_ct_m1; ++mono_allele_idx) {
    if (cur_allele_freqs[mono_allele_idx] > 0.5) {
      break;
    }
  }
  return !AllBytesAreX(pgvp->patch_10_vals, mono_allele_idx, 2 * nm_sample_ct);
}

PglErr LoadMultiallelicCenteredVarmaj(const uintptr_t* sample_include, PgrSampleSubsetIndex pssi, const double* cur_allele_freqs, uint32_t variance_standardize, uint32_t is_haploid, uint32_t sample_ct, uint32_t variant_uidx, uint32_t cur_allele_ct, uint32_t allele_idx_start, uint32_t allele_idx_end, PgenReader* simple_pgrp, uint32_t* missing_presentp, double* normed_dosages, PgenVariant* pgvp, double* allele_1copy_buf) {
  // This handles cur_allele_ct == 2 correctly.  But we typically don't use it
  // in that case since it does ~2x as much work as necessary: the two
  // normed_dosages[] rows are identical except for opposite sign, so it's best
  // to combine them into one row.
  PglErr reterr = PgrGetMD(sample_include, pssi, sample_ct, variant_uidx, simple_pgrp, pgvp);
  if (unlikely(reterr)) {
    return reterr;
  }
  ZeroTrailingNyps(sample_ct, pgvp->genovec);
  const uintptr_t* genovec_buf = pgvp->genovec;
  if (missing_presentp) {
    // missing_present assumed to be initialized to 0
    const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
    if (!pgvp->dosage_ct) {
      for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
        const uintptr_t detect_11 = Word11(genovec_buf[widx]);
        if (detect_11) {
          *missing_presentp = 1;
          break;
        }
      }
    } else {
      Halfword* dosage_present_alias = R_CAST(Halfword*, pgvp->dosage_present);
      for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
        const uintptr_t detect_11 = Word11(genovec_buf[widx]);
        if (detect_11) {
          if (PackWordToHalfword(detect_11) & (~dosage_present_alias[widx])) {
            *missing_presentp = 1;
            break;
          }
        }
      }
    }
  }
  const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
  double freq_sum = cur_allele_freqs[0];
  for (uint32_t uii = 1; uii != cur_allele_ct_m1; ++uii) {
    freq_sum += cur_allele_freqs[uii];
  }
  const double last_allele_freq = 1.0 - freq_sum;
  double inv_stdev;
  if (variance_standardize) {
    const double variance = ComputeDiploidMultiallelicVariance(cur_allele_freqs, cur_allele_ct);
    if (!(variance > kSmallEpsilon)) {
      if (unlikely(CheckMultiallelicDegenVariance(pgvp, cur_allele_freqs, sample_ct, cur_allele_ct, variance))) {
        return kPglRetDegenerateData;
      }
      ZeroDArr(S_CAST(uintptr_t, sample_ct) * (allele_idx_end - allele_idx_start), normed_dosages);
      return kPglRetSuccess;
    }
    inv_stdev = (1.0 / kSqrt2) / sqrt(variance);
    if (is_haploid) {
      inv_stdev *= (1.0 / kSqrt2);
    }
  } else {
    inv_stdev = is_haploid? (0.5 / kSqrt2) : (1.0 / kSqrt2);
  }
  if (!pgvp->dosage_ct) {
    // diploid:
    //   \sum_i x_i * (1 - x_i)
    double lookup_vals[32] ALIGNV16;
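    // lookup_vals[] is keyed by 2-bit genotype code, with the four
    // per-genotype values written to slots 0/2/4/6; InitLookup16x8bx2()
    // then expands them (as the 16x8bx2 name suggests) into a 16-entry table
    // of value pairs, letting GenoarrLookup16x8bx2() translate two genotypes
    // per lookup.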
    double* normed_dosages0 = normed_dosages - (allele_idx_start * S_CAST(uintptr_t, sample_ct));
    double alt1_intercept = 0.0;
    for (uint32_t allele_idx = allele_idx_start; allele_idx != allele_idx_end; ++allele_idx) {
      double cur_allele_freq;
      if (allele_idx != cur_allele_ct_m1) {
        cur_allele_freq = cur_allele_freqs[allele_idx];
      } else {
        cur_allele_freq = last_allele_freq;
      }
      const double intercept = -2 * cur_allele_freq * inv_stdev;
      if (!allele_idx) {
        // genovec entry of 0 corresponds to 2 copies of REF allele, etc.
        lookup_vals[0] = intercept + 2 * inv_stdev;
        lookup_vals[2] = intercept + inv_stdev;
        lookup_vals[4] = intercept;
        lookup_vals[6] = 0.0;
        InitLookup16x8bx2(lookup_vals);
        GenoarrLookup16x8bx2(genovec_buf, lookup_vals, sample_ct, normed_dosages0);
        continue;
      }
      allele_1copy_buf[allele_idx] = intercept + inv_stdev;
      if (allele_idx == 1) {
        alt1_intercept = intercept;
        lookup_vals[0] = intercept;
        lookup_vals[2] = intercept + inv_stdev;
        lookup_vals[4] = intercept + 2 * inv_stdev;
        lookup_vals[6] = 0.0;
        InitLookup16x8bx2(lookup_vals);
        GenoarrLookup16x8bx2(genovec_buf, lookup_vals, sample_ct, &(normed_dosages0[sample_ct]));
      } else {
        double* normed_dosages_cur_allele = &(normed_dosages0[allele_idx * S_CAST(uintptr_t, sample_ct)]);
        for (uint32_t uii = 0; uii != sample_ct; ++uii) {
          normed_dosages_cur_allele[uii] = intercept;
        }
      }
    }
    const uintptr_t* patch_01_set = pgvp->patch_01_set;
    const AlleleCode* patch_01_vals = pgvp->patch_01_vals;
    const uintptr_t* patch_10_set = pgvp->patch_10_set;
    const AlleleCode* patch_10_vals = pgvp->patch_10_vals;
    const uint32_t patch_01_ct = pgvp->patch_01_ct;
    const uint32_t patch_10_ct = pgvp->patch_10_ct;
    if ((allele_idx_start < 2) && (allele_idx_end >= 2)) {
      if (patch_01_ct) {
        uintptr_t sample_idx_base = 0;
        uintptr_t cur_bits = patch_01_set[0];
        if (cur_allele_ct == allele_idx_end) {
          for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
            const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
            normed_dosages0[sample_ct + sample_idx] = alt1_intercept;
            const uintptr_t cur_allele_code = patch_01_vals[uii];
            normed_dosages0[cur_allele_code * sample_ct + sample_idx] = allele_1copy_buf[cur_allele_code];
          }
        } else {
          for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
            const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
            normed_dosages0[sample_ct + sample_idx] = alt1_intercept;
            const uintptr_t cur_allele_code = patch_01_vals[uii];
            if (cur_allele_code < allele_idx_end) {
              normed_dosages0[cur_allele_code * sample_ct + sample_idx] = allele_1copy_buf[cur_allele_code];
            }
          }
        }
      }
      if (patch_10_ct) {
        uintptr_t sample_idx_base = 0;
        uintptr_t cur_bits = patch_10_set[0];
        if (cur_allele_ct == allele_idx_end) {
          for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
            const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits);
            normed_dosages0[sample_ct + sample_idx] = alt1_intercept;
            const uintptr_t ac0 = patch_10_vals[2 * uii];
            const uintptr_t ac1 = patch_10_vals[2 * uii + 1];
            const double ac0_1copy_val = allele_1copy_buf[ac0];
            if (ac0 == ac1) {
              normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val + inv_stdev;
            } else {
              normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val;
              normed_dosages0[ac1 * sample_ct + sample_idx] = allele_1copy_buf[ac1];
            }
          }
        } else {
          for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
            const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits);
            normed_dosages0[sample_ct + sample_idx] = alt1_intercept;
            const uintptr_t ac0 = patch_10_vals[2 * uii];
            if (ac0 >= allele_idx_end) {
              continue;
            }
            const uintptr_t ac1 = patch_10_vals[2 * uii + 1];
            const double ac0_1copy_val = allele_1copy_buf[ac0];
            if (ac0 == ac1) {
              normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val + inv_stdev;
            } else {
              normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val;
              if (ac1 < allele_idx_end) {
                normed_dosages0[ac1 * sample_ct + sample_idx] = allele_1copy_buf[ac1];
              }
            }
          }
        }
      }
    } else {
      if (patch_01_ct) {
        uintptr_t sample_idx_base = 0;
        uintptr_t cur_bits = patch_01_set[0];
        for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
          const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
          const uintptr_t cur_allele_code = patch_01_vals[uii];
          if ((cur_allele_code >= allele_idx_start) && (cur_allele_code < allele_idx_end)) {
            normed_dosages0[cur_allele_code * sample_ct + sample_idx] = allele_1copy_buf[cur_allele_code];
          }
        }
      }
      if (patch_10_ct) {
        uintptr_t sample_idx_base = 0;
        uintptr_t cur_bits = patch_10_set[0];
        for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
          const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits);
          const uintptr_t ac0 = patch_10_vals[2 * uii];
          if (ac0 >= allele_idx_end) {
            continue;
          }
          const uintptr_t ac1 = patch_10_vals[2 * uii + 1];
          if (ac1 < allele_idx_start) {
            continue;
          }
          const double ac0_1copy_val = allele_1copy_buf[ac0];
          if (ac0 == ac1) {
            normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val + inv_stdev;
          } else {
            if (ac0 >= allele_idx_start) {
              normed_dosages0[ac0 * sample_ct + sample_idx] = ac0_1copy_val;
            }
            if (ac1 < allele_idx_end) {
              normed_dosages0[ac1 * sample_ct + sample_idx] = allele_1copy_buf[ac1];
            }
          }
        }
      }
    }
    return kPglRetSuccess;
  }
  fputs("true multiallelic dosages not yet supported by LoadMultiallelicCenteredVarmaj()\n", stderr);
  exit(S_CAST(int32_t, kPglRetNotYetSupported));
  return kPglRetSuccess;
}

PglErr LoadCenteredVarmajBlock(const uintptr_t* sample_include, PgrSampleSubsetIndex pssi, const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, const double* allele_freqs, uint32_t variance_standardize, uint32_t is_haploid, uint32_t sample_ct, uint32_t variant_ct, PgenReader* simple_pgrp, double* normed_vmaj_iter, uintptr_t* variant_include_has_missing, uint32_t* cur_batch_sizep, uint32_t* variant_idxp, uintptr_t* variant_uidxp, uintptr_t* allele_idx_basep, uint32_t* cur_allele_ctp, uint32_t* incomplete_allele_idxp, PgenVariant* pgvp, double* allele_1copy_buf) {
  const uint32_t std_batch_size = *cur_batch_sizep;
  uint32_t variant_idx = *variant_idxp;
  uintptr_t variant_uidx = *variant_uidxp;
  uintptr_t allele_idx_base = *allele_idx_basep;
  uint32_t cur_allele_ct = *cur_allele_ctp;
  uint32_t incomplete_allele_idx = *incomplete_allele_idxp;
  uintptr_t variant_uidx_base;
  uintptr_t cur_bits;
  BitIter1Start(variant_include, variant_uidx + (incomplete_allele_idx != 0), &variant_uidx_base, &cur_bits);
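  // If the previous block ended partway through a multiallelic variant
  // (incomplete_allele_idx != 0), the first loop iteration below reuses
  // variant_uidx without calling BitIter1(), so the iterator starts one
  // position past it.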
  for (uint32_t allele_bidx = 0; allele_bidx != std_batch_size; ) {
    uint32_t missing_present = 0;
    if (!incomplete_allele_idx) {
      variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
      if (!allele_idx_offsets) {
        allele_idx_base = variant_uidx;
      } else {
        allele_idx_base = allele_idx_offsets[variant_uidx];
        cur_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_base;
        allele_idx_base -= variant_uidx;
      }
    }
    uint32_t allele_idx_stop;
    uint32_t allele_idx_end;
    PglErr reterr;
    if (cur_allele_ct == 2) {
      allele_idx_stop = 1;
      allele_idx_end = 1;
      reterr = LoadBiallelicCenteredVarmaj(sample_include, pssi, variance_standardize, is_haploid, sample_ct, variant_uidx, allele_freqs[allele_idx_base], simple_pgrp, variant_include_has_missing? (&missing_present) : nullptr, normed_vmaj_iter, pgvp->genovec, pgvp->dosage_present, pgvp->dosage_main);
    } else {
      allele_idx_end = cur_allele_ct;
      allele_idx_stop = std_batch_size + incomplete_allele_idx - allele_bidx;
      if (allele_idx_stop > allele_idx_end) {
        allele_idx_stop = allele_idx_end;
      }
      reterr = LoadMultiallelicCenteredVarmaj(sample_include, pssi, &(allele_freqs[allele_idx_base]), variance_standardize, is_haploid, sample_ct, variant_uidx, cur_allele_ct, incomplete_allele_idx, allele_idx_stop, simple_pgrp, variant_include_has_missing? (&missing_present) : nullptr, normed_vmaj_iter, pgvp, allele_1copy_buf);
    }
    if (unlikely(reterr)) {
      if (reterr == kPglRetDegenerateData) {
        logputs("\n");
        logerrputs("Error: Zero-MAF variant is not actually monomorphic.  (This is possible when\ne.g. MAF is estimated from founders, but the minor allele was only observed in\nnonfounders.  In any case, you should be using e.g. --maf to filter out all\nvery-low-MAF variants, since the relationship matrix distance formula does not\nhandle them well.)\n");
      }
      return reterr;
    }
    if (missing_present) {
      SetBit(variant_uidx, variant_include_has_missing);
    }
    const uintptr_t incr = allele_idx_stop - incomplete_allele_idx;
    normed_vmaj_iter = &(normed_vmaj_iter[incr * sample_ct]);
    allele_bidx += incr;
    if (allele_idx_stop == allele_idx_end) {
      if (++variant_idx == variant_ct) {
        *cur_batch_sizep = allele_bidx;
        break;
      }
      incomplete_allele_idx = 0;
    } else {
      incomplete_allele_idx = allele_idx_stop;
    }
  }
  *variant_idxp = variant_idx;
  *variant_uidxp = variant_uidx + (incomplete_allele_idx == 0);
  *allele_idx_basep = allele_idx_base;
  *cur_allele_ctp = cur_allele_ct;
  *incomplete_allele_idxp = incomplete_allele_idx;
  return kPglRetSuccess;
}

CONSTI32(kGrmVariantBlockSize, 144);

typedef struct CalcGrmPartCtxStruct {
  uint32_t* thread_start;
  uint32_t sample_ct;

  uint32_t cur_batch_size;
  double* normed_dosage_vmaj_bufs[2];
  double* normed_dosage_smaj_bufs[2];

  double* grm;
} CalcGrmPartCtx;

// turns out dsyrk_ does exactly what we want here
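// (each batch contributes the rank-k update grm += N^T * N, where N is the
// cur_batch_size x sample_ct normalized variant-major matrix; that symmetric
// update is exactly what dsyrk_ computes)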
THREAD_FUNC_DECL CalcGrmThread(void* raw_arg) {
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  assert(!arg->tidx);
  CalcGrmPartCtx* ctx = S_CAST(CalcGrmPartCtx*, arg->sharedp->context);
  const uint32_t sample_ct = ctx->sample_ct;
  double* grm = ctx->grm;
  uint32_t parity = 0;
  do {
    const uint32_t cur_batch_size = ctx->cur_batch_size;
    if (cur_batch_size) {
      TransposeMultiplySelfIncr(ctx->normed_dosage_vmaj_bufs[parity], sample_ct, cur_batch_size, grm);
    }
    parity = 1 - parity;
  } while (!THREAD_BLOCK_FINISH(arg));
  THREAD_RETURN;
}

// can't use dsyrk_, so we manually partition the GRM piece we need to compute
// into an appropriate number of sub-pieces
THREAD_FUNC_DECL CalcGrmPartThread(void* raw_arg) {
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  const uintptr_t tidx = arg->tidx;
  CalcGrmPartCtx* ctx = S_CAST(CalcGrmPartCtx*, arg->sharedp->context);

  const uintptr_t sample_ct = ctx->sample_ct;
  const uintptr_t first_thread_row_start_idx = ctx->thread_start[0];
  const uintptr_t row_start_idx = ctx->thread_start[tidx];
  const uintptr_t row_ct = ctx->thread_start[tidx + 1] - row_start_idx;
  double* grm_piece = &(ctx->grm[(row_start_idx - first_thread_row_start_idx) * sample_ct]);
  uint32_t parity = 0;
  do {
    const uintptr_t cur_batch_size = ctx->cur_batch_size;
    if (cur_batch_size) {
      double* normed_vmaj = ctx->normed_dosage_vmaj_bufs[parity];
      double* normed_smaj = ctx->normed_dosage_smaj_bufs[parity];
      RowMajorMatrixMultiplyIncr(&(normed_smaj[row_start_idx * cur_batch_size]), normed_vmaj, row_ct, sample_ct, cur_batch_size, grm_piece);
    }
    parity = 1 - parity;
  } while (!THREAD_BLOCK_FINISH(arg));
  THREAD_RETURN;
}

CONSTI32(kDblMissingBlockWordCt, 2);
CONSTI32(kDblMissingBlockSize, kDblMissingBlockWordCt * kBitsPerWord);
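// (2 words * 64 bits = 128 variants' missingness bits per sample per block on
// 64-bit builds)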

typedef struct CalcDblMissingCtxStruct {
  uint32_t* thread_start;
  // missing_nz bit is set iff that sample has at least one missing entry in
  // current block
  uintptr_t* missing_nz[2];
  uintptr_t* missing_smaj[2];
  uint32_t* missing_dbl_exclude_cts;
} CalcDblMissingCtx;

THREAD_FUNC_DECL CalcDblMissingThread(void* raw_arg) {
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  const uintptr_t tidx = arg->tidx;
  CalcDblMissingCtx* ctx = S_CAST(CalcDblMissingCtx*, arg->sharedp->context);

  const uint64_t first_thread_row_start_idx = ctx->thread_start[0];
  const uint64_t dbl_exclude_offset = (first_thread_row_start_idx * (first_thread_row_start_idx - 1)) / 2;
  const uint32_t row_start_idx = ctx->thread_start[tidx];
  const uintptr_t row_end_idx = ctx->thread_start[tidx + 1];
  uint32_t* missing_dbl_exclude_cts = ctx->missing_dbl_exclude_cts;
  uint32_t parity = 0;
  do {
    const uintptr_t* missing_nz = ctx->missing_nz[parity];
    const uintptr_t* missing_smaj = ctx->missing_smaj[parity];
    const uint32_t first_idx = AdvBoundedTo1Bit(missing_nz, 0, row_end_idx);
    uint32_t sample_idx = first_idx;
    uint32_t prev_missing_nz_ct = 0;
    if (sample_idx < row_start_idx) {
      sample_idx = AdvBoundedTo1Bit(missing_nz, row_start_idx, row_end_idx);
      if (sample_idx != row_end_idx) {
        prev_missing_nz_ct = PopcountBitRange(missing_nz, 0, row_start_idx);
      }
    }
    while (sample_idx < row_end_idx) {
      // todo: compare this explicit unroll with ordinary iteration over a
      // cur_words[] array
      // todo: try 1 word at a time, and 30 words at a time
      const uintptr_t cur_word0 = missing_smaj[sample_idx * kDblMissingBlockWordCt];
      const uintptr_t cur_word1 = missing_smaj[sample_idx * kDblMissingBlockWordCt + 1];
#ifndef __LP64__
      const uintptr_t cur_word2 = missing_smaj[sample_idx * kDblMissingBlockWordCt + 2];
      const uintptr_t cur_word3 = missing_smaj[sample_idx * kDblMissingBlockWordCt + 3];
#endif
      uintptr_t sample_idx2_base;
      uintptr_t cur_bits;
      BitIter1Start(missing_nz, first_idx, &sample_idx2_base, &cur_bits);
      // (sample_idx - 1) underflow ok
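      // Lower-triangle linear indexing: the count for pair (i, j) with j < i
      // lives at offset i*(i-1)/2 + j; dbl_exclude_offset rebases that to
      // this --parallel piece's first row.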
      uint32_t* write_base = &(missing_dbl_exclude_cts[((S_CAST(uint64_t, sample_idx) * (sample_idx - 1)) / 2) - dbl_exclude_offset]);
      for (uint32_t uii = 0; uii != prev_missing_nz_ct; ++uii) {
        const uint32_t sample_idx2 = BitIter1(missing_nz, &sample_idx2_base, &cur_bits);
        const uintptr_t* cur_missing_smaj_base = &(missing_smaj[sample_idx2 * kDblMissingBlockWordCt]);
        const uintptr_t cur_and0 = cur_word0 & cur_missing_smaj_base[0];
        const uintptr_t cur_and1 = cur_word1 & cur_missing_smaj_base[1];
#ifdef __LP64__
        if (cur_and0 || cur_and1) {
          write_base[sample_idx2] += Popcount2Words(cur_and0, cur_and1);
        }
#else
        const uintptr_t cur_and2 = cur_word2 & cur_missing_smaj_base[2];
        const uintptr_t cur_and3 = cur_word3 & cur_missing_smaj_base[3];
        if (cur_and0 || cur_and1 || cur_and2 || cur_and3) {
          write_base[sample_idx2] += Popcount4Words(cur_and0, cur_and1, cur_and2, cur_and3);
        }
#endif
      }
      ++prev_missing_nz_ct;
      sample_idx = AdvBoundedTo1Bit(missing_nz, sample_idx + 1, row_end_idx);
    }
    parity = 1 - parity;
  } while (!THREAD_BLOCK_FINISH(arg));
  THREAD_RETURN;
}

PglErr CalcMissingMatrix(const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, const uintptr_t* variant_include, uint32_t sample_ct, uint32_t variant_ct, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t row_start_idx, uintptr_t row_end_idx, uint32_t max_thread_ct, PgenReader* simple_pgrp, uint32_t** missing_cts_ptr, uint32_t** missing_dbl_exclude_cts_ptr) {
  unsigned char* bigstack_mark = g_bigstack_base;
  ThreadGroup tg;
  PreinitThreads(&tg);
  PglErr reterr = kPglRetSuccess;
  {
    const uintptr_t row_end_idxl = BitCtToWordCt(row_end_idx);
    // bugfix (1 Oct 2017): missing_vmaj rows must be vector-aligned
    const uintptr_t row_end_idxaw = BitCtToAlignedWordCt(row_end_idx);
    uintptr_t* missing_vmaj = nullptr;
    uintptr_t* genovec_buf = nullptr;
    CalcDblMissingCtx ctx;
    if (bigstack_calloc_u32(row_end_idx, missing_cts_ptr) ||
        bigstack_calloc_u32((S_CAST(uint64_t, row_end_idx) * (row_end_idx - 1) - S_CAST(uint64_t, row_start_idx) * (row_start_idx - 1)) / 2, missing_dbl_exclude_cts_ptr) ||
        bigstack_calloc_w(row_end_idxl, &ctx.missing_nz[0]) ||
        bigstack_calloc_w(row_end_idxl, &ctx.missing_nz[1]) ||
        bigstack_alloc_w(NypCtToWordCt(row_end_idx), &genovec_buf) ||
        bigstack_alloc_w(row_end_idxaw * (k1LU * kDblMissingBlockSize), &missing_vmaj) ||
        bigstack_alloc_w(RoundUpPow2(row_end_idx, 2) * kDblMissingBlockWordCt, &ctx.missing_smaj[0]) ||
        bigstack_alloc_w(RoundUpPow2(row_end_idx, 2) * kDblMissingBlockWordCt, &ctx.missing_smaj[1])) {
      goto CalcMissingMatrix_ret_NOMEM;
    }
    uint32_t* missing_cts = *missing_cts_ptr;
    uint32_t* missing_dbl_exclude_cts = *missing_dbl_exclude_cts_ptr;
    ctx.missing_dbl_exclude_cts = missing_dbl_exclude_cts;
    VecW* transpose_bitblock_wkspace = S_CAST(VecW*, bigstack_alloc_raw(kPglBitTransposeBufbytes));
    uint32_t calc_thread_ct = (max_thread_ct > 8)? (max_thread_ct - 1) : max_thread_ct;
    if (unlikely(
            SetThreadCt(calc_thread_ct, &tg) ||
            bigstack_alloc_u32(calc_thread_ct + 1, &ctx.thread_start))) {
      goto CalcMissingMatrix_ret_NOMEM;
    }
    // note that this ctx.thread_start[] may have different values than the one
    // computed by CalcGrm(), since calc_thread_ct changes in the MTBLAS and
    // OS X cases.
    TriangleFill(sample_ct, calc_thread_ct, parallel_idx, parallel_tot, 0, 1, ctx.thread_start);
    assert(ctx.thread_start[0] == row_start_idx);
    assert(ctx.thread_start[calc_thread_ct] == row_end_idx);
    SetThreadFuncAndData(CalcDblMissingThread, &ctx, &tg);
    const uint32_t sample_transpose_batch_ct_m1 = (row_end_idx - 1) / kPglBitTransposeBatch;

    uintptr_t variant_uidx_base = 0;
    uintptr_t cur_bits = variant_include[0];
    uint32_t parity = 0;
    uint32_t pct = 0;
    uint32_t next_print_variant_idx = variant_ct / 100;
    // caller's responsibility to print this
    // logputs("Correcting for missingness: ");
    fputs("0%", stdout);
    fflush(stdout);
    PgrSampleSubsetIndex pssi;
    PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, simple_pgrp, &pssi);
    for (uint32_t cur_variant_idx_start = 0; ; ) {
      uint32_t cur_batch_size = 0;
      if (!IsLastBlock(&tg)) {
        cur_batch_size = kDblMissingBlockSize;
        uint32_t cur_variant_idx_end = cur_variant_idx_start + cur_batch_size;
        if (cur_variant_idx_end > variant_ct) {
          cur_batch_size = variant_ct - cur_variant_idx_start;
          cur_variant_idx_end = variant_ct;
          ZeroWArr((kDblMissingBlockSize - cur_batch_size) * row_end_idxaw, &(missing_vmaj[cur_batch_size * row_end_idxaw]));
        }
        uintptr_t* missing_vmaj_iter = missing_vmaj;
        for (uint32_t variant_idx = cur_variant_idx_start; variant_idx != cur_variant_idx_end; ++variant_idx) {
          const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
          reterr = PgrGetMissingnessD(sample_include, pssi, row_end_idx, variant_uidx, simple_pgrp, nullptr, missing_vmaj_iter, nullptr, genovec_buf);
          if (unlikely(reterr)) {
            goto CalcMissingMatrix_ret_PGR_FAIL;
          }
          missing_vmaj_iter = &(missing_vmaj_iter[row_end_idxaw]);
        }
        uintptr_t* cur_missing_smaj_iter = ctx.missing_smaj[parity];
        uint32_t sample_batch_size = kPglBitTransposeBatch;
        for (uint32_t sample_transpose_batch_idx = 0; ; ++sample_transpose_batch_idx) {
          if (sample_transpose_batch_idx >= sample_transpose_batch_ct_m1) {
            if (sample_transpose_batch_idx > sample_transpose_batch_ct_m1) {
              break;
            }
            sample_batch_size = ModNz(row_end_idx, kPglBitTransposeBatch);
          }
          // missing_smaj offset needs to be 64-bit if kDblMissingBlockWordCt
          // increases
          TransposeBitblock(&(missing_vmaj[sample_transpose_batch_idx * kPglBitTransposeWords]), row_end_idxaw, kDblMissingBlockWordCt, kDblMissingBlockSize, sample_batch_size, &(cur_missing_smaj_iter[sample_transpose_batch_idx * kPglBitTransposeBatch * kDblMissingBlockWordCt]), transpose_bitblock_wkspace);
        }
        uintptr_t* cur_missing_nz = ctx.missing_nz[parity];
        ZeroWArr(row_end_idxl, cur_missing_nz);
        for (uint32_t sample_idx = 0; sample_idx != row_end_idx; ++sample_idx) {
          const uintptr_t cur_word0 = *cur_missing_smaj_iter++;
          const uintptr_t cur_word1 = *cur_missing_smaj_iter++;
#ifdef __LP64__
          if (cur_word0 || cur_word1) {
            SetBit(sample_idx, cur_missing_nz);
            missing_cts[sample_idx] += Popcount2Words(cur_word0, cur_word1);
          }
#else
          const uintptr_t cur_word2 = *cur_missing_smaj_iter++;
          const uintptr_t cur_word3 = *cur_missing_smaj_iter++;
          if (cur_word0 || cur_word1 || cur_word2 || cur_word3) {
            SetBit(sample_idx, cur_missing_nz);
            missing_cts[sample_idx] += Popcount4Words(cur_word0, cur_word1, cur_word2, cur_word3);
          }
#endif
        }
      }
      if (cur_variant_idx_start) {
        JoinThreads(&tg);
        // CalcDblMissingThread() never errors out
        if (IsLastBlock(&tg)) {
          break;
        }
        if (cur_variant_idx_start >= next_print_variant_idx) {
          if (pct > 10) {
            putc_unlocked('\b', stdout);
          }
          pct = (cur_variant_idx_start * 100LLU) / variant_ct;
          printf("\b\b%u%%", pct++);
          fflush(stdout);
          next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
        }
      }
      if (cur_variant_idx_start + cur_batch_size == variant_ct) {
        DeclareLastThreadBlock(&tg);
      }
      if (unlikely(SpawnThreads(&tg))) {
        goto CalcMissingMatrix_ret_THREAD_CREATE_FAIL;
      }
      cur_variant_idx_start += cur_batch_size;
      parity = 1 - parity;
    }
    if (pct > 10) {
      putc_unlocked('\b', stdout);
    }
    fputs("\b\b", stdout);
    logputs("done.\n");
    bigstack_mark = R_CAST(unsigned char*, ctx.missing_nz[0]);
  }
  while (0) {
  CalcMissingMatrix_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  CalcMissingMatrix_ret_PGR_FAIL:
    PgenErrPrintN(reterr);
    break;
  CalcMissingMatrix_ret_THREAD_CREATE_FAIL:
    reterr = kPglRetThreadCreateFail;
    break;
  }
  CleanupThreads(&tg);
  BigstackReset(bigstack_mark);
  return reterr;
}

PglErr CalcGrm(const uintptr_t* orig_sample_include, const SampleIdInfo* siip, const uintptr_t* variant_include, const ChrInfo* cip, const uintptr_t* allele_idx_offsets, const double* allele_freqs, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, GrmFlags grm_flags, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t max_thread_ct, PgenReader* simple_pgrp, char* outname, char* outname_end, double** grm_ptr) {
  unsigned char* bigstack_mark = g_bigstack_base;
  unsigned char* bigstack_end_mark = g_bigstack_end;
  FILE* outfile = nullptr;
  char* cswritep = nullptr;
  CompressStreamState css;
  ThreadGroup tg;
  PglErr reterr = kPglRetSuccess;
  PreinitCstream(&css);
  PreinitThreads(&tg);
  {
    assert(variant_ct);
#if defined(__APPLE__) || defined(USE_MTBLAS)
    uint32_t calc_thread_ct = 1;
#else
    uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
    if (calc_thread_ct * parallel_tot > sample_ct / 32) {
      calc_thread_ct = sample_ct / (32 * parallel_tot);
      if (!calc_thread_ct) {
        calc_thread_ct = 1;
      }
    }
#endif
    if (unlikely(sample_ct < 2)) {
      logerrputs("Error: GRM construction requires at least two samples.\n");
      goto CalcGrm_ret_DEGENERATE_DATA;
    }
    const uintptr_t* sample_include = orig_sample_include;
    const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
    uint32_t row_start_idx = 0;
    uintptr_t row_end_idx = sample_ct;
    uint32_t* thread_start = nullptr;
    if ((calc_thread_ct != 1) || (parallel_tot != 1)) {
      // note that grm should be allocated on bottom if no --parallel, since it
      // may continue to be used after function exit.  So we allocate this on
      // top.
      if (unlikely(bigstack_end_alloc_u32(calc_thread_ct + 1, &thread_start))) {
        goto CalcGrm_ret_NOMEM;
      }
      // slightly different from plink 1.9 since we don't bother to treat the
      // diagonal as a special case any more.
      TriangleFill(sample_ct, calc_thread_ct, parallel_idx, parallel_tot, 0, 1, thread_start);
      row_start_idx = thread_start[0];
      row_end_idx = thread_start[calc_thread_ct];
      if (row_end_idx < sample_ct) {
        // 0
        // 0 0
        // 0 0 0
        // 0 0 0 0
        // 1 1 1 1 1
        // 1 1 1 1 1 1
        // 2 2 2 2 2 2 2
        // 2 2 2 2 2 2 2 2
        // If we're computing part 0, we never need to load the last 4 samples;
        // if part 1, we don't need the last two; etc.
        uintptr_t* new_sample_include;
        if (unlikely(bigstack_alloc_w(raw_sample_ctl, &new_sample_include))) {
          goto CalcGrm_ret_NOMEM;
        }
        const uint32_t sample_uidx_end = 1 + IdxToUidxBasic(orig_sample_include, row_end_idx - 1);
        memcpy(new_sample_include, orig_sample_include, RoundUpPow2(sample_uidx_end, kBitsPerWord) / CHAR_BIT);
        ClearBitsNz(sample_uidx_end, raw_sample_ctl * kBitsPerWord, new_sample_include);
        sample_include = new_sample_include;
      }
      if ((!parallel_idx) && (calc_thread_ct == 1)) {
        thread_start = nullptr;
      }
    }

    CalcGrmPartCtx ctx;
    ctx.thread_start = thread_start;
    double* grm;
    if (unlikely(
            SetThreadCt(calc_thread_ct, &tg))) {
      goto CalcGrm_ret_NOMEM;
    }
    if (unlikely(
            bigstack_calloc_d((row_end_idx - row_start_idx) * row_end_idx, &grm))) {
      if (!grm_ptr) {
        logerrputs("Error: Out of memory.  If you are SURE you are performing the right matrix\ncomputation, you can split it into smaller pieces with --parallel, and then\nconcatenate the results.  But before you try this, make sure the program you're\nproviding the matrix to can actually handle such a large input file.\n");
      } else {
        // Need to edit this if there are ever non-PCA ways to get here.
        if (!(grm_flags & (kfGrmMatrixShapemask | kfGrmListmask | kfGrmBin))) {
          logerrputs("Error: Out of memory.  Consider \"--pca approx\" instead.\n");
        } else {
          logerrputs("Error: Out of memory.  Consider \"--pca approx\" (and not writing the GRM to\ndisk) instead.\n");
        }
      }
      goto CalcGrm_ret_NOMEM_CUSTOM;
    }
    ctx.sample_ct = row_end_idx;
    ctx.grm = grm;
    uint32_t* sample_include_cumulative_popcounts;
    PgenVariant pgv;
    double* allele_1copy_buf;
    if (unlikely(
            bigstack_alloc_u32(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
            BigstackAllocPgv(row_end_idx, allele_idx_offsets != nullptr, PgrGetGflags(simple_pgrp), &pgv) ||
            bigstack_alloc_d(max_allele_ct, &allele_1copy_buf))) {
      goto CalcGrm_ret_NOMEM;
    }
    FillCumulativePopcounts(sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
    reterr = ConditionalAllocateNonAutosomalVariants(cip, "GRM construction", raw_variant_ct, &variant_include, &variant_ct);
    if (unlikely(reterr)) {
      goto CalcGrm_ret_1;
    }
    if (unlikely(
            bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &ctx.normed_dosage_vmaj_bufs[0]) ||
            bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &ctx.normed_dosage_vmaj_bufs[1]))) {
      goto CalcGrm_ret_NOMEM;
    }
    const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
    uintptr_t* variant_include_has_missing = nullptr;
    if (!(grm_flags & kfGrmMeanimpute)) {
      if (unlikely(bigstack_calloc_w(raw_variant_ctl, &variant_include_has_missing))) {
        goto CalcGrm_ret_NOMEM;
      }
    }
    if (thread_start) {
      if (unlikely(
              bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &ctx.normed_dosage_smaj_bufs[0]) ||
              bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &ctx.normed_dosage_smaj_bufs[1]))) {
        goto CalcGrm_ret_NOMEM;
      }
      SetThreadFuncAndData(CalcGrmPartThread, &ctx, &tg);
    } else {
      // defensive
      ctx.normed_dosage_smaj_bufs[0] = nullptr;
      ctx.normed_dosage_smaj_bufs[1] = nullptr;
      SetThreadFuncAndData(CalcGrmThread, &ctx, &tg);
    }
#ifdef USE_MTBLAS
    const uint32_t blas_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
    BLAS_SET_NUM_THREADS(blas_thread_ct);
#endif
    // Main workflow:
    // 1. Set n=0, load batch 0
    //
    // 2. Spawn threads processing batch n
    // 3. Increment n by 1
    // 4. Load batch n unless eof
    // 5. Join threads
    // 6. Goto step 2 unless eof
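    //
    // i.e. classic double-buffering: the load of batch n+1 overlaps the
    // matrix multiply on batch n, with 'parity' selecting which buffer is
    // being filled vs. consumed.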
    const uint32_t variance_standardize = !(grm_flags & kfGrmCov);
    const uint32_t is_haploid = cip->haploid_mask[0] & 1;
    uint32_t cur_batch_size = kGrmVariantBlockSize;
    uint32_t variant_idx_start = 0;
    uint32_t variant_idx = 0;
    uintptr_t variant_uidx = 0;
    uintptr_t allele_idx_base = 0;
    uint32_t cur_allele_ct = 2;
    uint32_t incomplete_allele_idx = 0;
    uint32_t parity = 0;
    uint32_t is_not_first_block = 0;
    uint32_t pct = 0;
    uint32_t next_print_variant_idx = variant_ct / 100;
    logputs("Constructing GRM: ");
    fputs("0%", stdout);
    fflush(stdout);
    PgrSampleSubsetIndex pssi;
    PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, simple_pgrp, &pssi);
    while (1) {
      if (!IsLastBlock(&tg)) {
        double* normed_vmaj = ctx.normed_dosage_vmaj_bufs[parity];
        reterr = LoadCenteredVarmajBlock(sample_include, pssi, variant_include, allele_idx_offsets, allele_freqs, variance_standardize, is_haploid, row_end_idx, variant_ct, simple_pgrp, normed_vmaj, variant_include_has_missing, &cur_batch_size, &variant_idx, &variant_uidx, &allele_idx_base, &cur_allele_ct, &incomplete_allele_idx, &pgv, allele_1copy_buf);
        if (unlikely(reterr)) {
          goto CalcGrm_ret_PGR_FAIL;
        }
        if (thread_start) {
          MatrixTransposeCopy(normed_vmaj, cur_batch_size, row_end_idx, ctx.normed_dosage_smaj_bufs[parity]);
        }
      }
      if (is_not_first_block) {
        JoinThreads(&tg);
        // CalcGrmPartThread() and CalcGrmThread() never error out
        if (IsLastBlock(&tg)) {
          break;
        }
        if (variant_idx_start >= next_print_variant_idx) {
          if (pct > 10) {
            putc_unlocked('\b', stdout);
          }
          pct = (variant_idx_start * 100LLU) / variant_ct;
          printf("\b\b%u%%", pct++);
          fflush(stdout);
          next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
        }
      }
      ctx.cur_batch_size = cur_batch_size;
      if (variant_idx == variant_ct) {
        DeclareLastThreadBlock(&tg);
        cur_batch_size = 0;
      }
      if (unlikely(SpawnThreads(&tg))) {
        goto CalcGrm_ret_THREAD_CREATE_FAIL;
      }
      is_not_first_block = 1;
      variant_idx_start = variant_idx;
      parity = 1 - parity;
    }
    BLAS_SET_NUM_THREADS(1);
    if (pct > 10) {
      putc_unlocked('\b', stdout);
    }
    fputs("\b\b", stdout);
    logputs("done.\n");
    uint32_t* missing_cts = nullptr;  // stays null iff meanimpute
    uint32_t* missing_dbl_exclude_cts = nullptr;
    if (variant_include_has_missing) {
      const uint32_t variant_ct_with_missing = PopcountWords(variant_include_has_missing, raw_variant_ctl);
      // if no missing calls at all, act as if meanimpute was on
      if (variant_ct_with_missing) {
        logputs("Correcting for missingness... ");
        reterr = CalcMissingMatrix(sample_include, sample_include_cumulative_popcounts, variant_include_has_missing, sample_ct, variant_ct_with_missing, parallel_idx, parallel_tot, row_start_idx, row_end_idx, max_thread_ct, simple_pgrp, &missing_cts, &missing_dbl_exclude_cts);
        if (unlikely(reterr)) {
          goto CalcGrm_ret_1;
        }
      }
    }
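    // The per-pair denominator below is the number of variants at which both
    // samples are nonmissing, by inclusion-exclusion:
    //   variant_ct - missing_cts[i] - missing_cts[j] + missing_dbl_exclude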
    if (missing_cts) {
      // could parallelize this loop if it ever matters
      const uint32_t* missing_dbl_exclude_iter = missing_dbl_exclude_cts;
      for (uintptr_t row_idx = row_start_idx; row_idx != row_end_idx; ++row_idx) {
        const uint32_t variant_ct_base = variant_ct - missing_cts[row_idx];
        double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
        for (uint32_t col_idx = 0; col_idx != row_idx; ++col_idx) {
          *grm_iter++ /= u31tod(variant_ct_base - missing_cts[col_idx] + (*missing_dbl_exclude_iter++));
        }
        *grm_iter++ /= u31tod(variant_ct_base);
      }
    } else {
      const double variant_ct_recip = 1.0 / u31tod(variant_ct);
      for (uintptr_t row_idx = row_start_idx; row_idx != row_end_idx; ++row_idx) {
        double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
        for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
          *grm_iter++ *= variant_ct_recip;
        }
      }
    }
    // N.B. Only the lower-left triangle (col_idx <= row_idx) of grm[] is
    // valid when parallel_tot == 1.
4169 
4170     // possible todo: allow simultaneous --make-rel and
4171     // --make-grm-list/--make-grm-bin
4172     // (note that this routine may also be called by --pca, which may not write
4173     // a matrix to disk at all.)
4174     if (grm_flags & (kfGrmMatrixShapemask | kfGrmListmask | kfGrmBin)) {
4175       const GrmFlags matrix_shape = grm_flags & kfGrmMatrixShapemask;
4176       char* log_write_iter;
4177       if (matrix_shape) {
4178         // --make-rel
4179         fputs("--make-rel: Writing...", stdout);
4180         fflush(stdout);
4181         if (grm_flags & kfGrmMatrixBin) {
4182           char* outname_end2 = strcpya_k(outname_end, ".rel.bin");
4183           if (parallel_tot != 1) {
4184             *outname_end2++ = '.';
4185             outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4186           }
4187           *outname_end2 = '\0';
4188           if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
4189             goto CalcGrm_ret_OPEN_FAIL;
4190           }
4191           double* write_double_buf = nullptr;
4192           if (matrix_shape == kfGrmMatrixSq0) {
4193             write_double_buf = R_CAST(double*, g_textbuf);
4194             ZeroDArr(kTextbufMainSize / sizeof(double), write_double_buf);
4195           } else if (matrix_shape == kfGrmMatrixSq) {
4196             if (unlikely(bigstack_alloc_d(row_end_idx - row_start_idx - 1, &write_double_buf))) {
4197               goto CalcGrm_ret_NOMEM;
4198             }
4199           }
4200           for (uintptr_t row_idx = row_start_idx; ; ) {
4201             const double* grm_row = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4202             ++row_idx;
4203             if (unlikely(fwrite_checked(grm_row, row_idx * sizeof(double), outfile))) {
4204               goto CalcGrm_ret_WRITE_FAIL;
4205             }
4206             if (row_idx == row_end_idx) {
4207               break;
4208             }
4209             if (matrix_shape == kfGrmMatrixSq0) {
4210               uintptr_t zbytes_to_dump = (sample_ct - row_idx) * sizeof(double);
4211               while (zbytes_to_dump >= kTextbufMainSize) {
4212                 if (unlikely(fwrite_checked(write_double_buf, kTextbufMainSize, outfile))) {
4213                   goto CalcGrm_ret_WRITE_FAIL;
4214                 }
4215                 zbytes_to_dump -= kTextbufMainSize;
4216               }
4217               if (zbytes_to_dump) {
4218                 if (unlikely(fwrite_checked(write_double_buf, zbytes_to_dump, outfile))) {
4219                   goto CalcGrm_ret_WRITE_FAIL;
4220                 }
4221               }
4222             } else if (matrix_shape == kfGrmMatrixSq) {
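              // Entries to the right of the diagonal come from the matching
              // column below it: the GRM is symmetric, and only the lower
              // triangle was computed.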
4223               double* write_double_iter = write_double_buf;
4224               const double* grm_col = &(grm[row_idx - 1]);
4225               for (uintptr_t row_idx2 = row_idx; row_idx2 != sample_ct; ++row_idx2) {
4226                 *write_double_iter++ = grm_col[(row_idx2 - row_start_idx) * sample_ct];
4227               }
4228               if (unlikely(fwrite_checked(write_double_buf, (sample_ct - row_idx) * sizeof(double), outfile))) {
4229                 goto CalcGrm_ret_WRITE_FAIL;
4230               }
4231             }
4232           }
4233           if (unlikely(fclose_null(&outfile))) {
4234             goto CalcGrm_ret_WRITE_FAIL;
4235           }
4236         } else if (grm_flags & kfGrmMatrixBin4) {
4237           // downcode all entries to floats
4238           char* outname_end2 = strcpya_k(outname_end, ".rel.bin");
4239           if (parallel_tot != 1) {
4240             *outname_end2++ = '.';
4241             outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4242           }
4243           *outname_end2 = '\0';
4244           if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
4245             goto CalcGrm_ret_OPEN_FAIL;
4246           }
4247           float* write_float_buf;
4248           if (unlikely(bigstack_alloc_f(row_end_idx, &write_float_buf))) {
4249             goto CalcGrm_ret_NOMEM;
4250           }
4251           uintptr_t row_idx = row_start_idx;
4252           do {
4253             const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4254             float* write_float_iter = write_float_buf;
4255             for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
4256               *write_float_iter++ = S_CAST(float, *grm_iter++);
4257             }
4258             ++row_idx;
4259             if (matrix_shape == kfGrmMatrixSq0) {
4260               ZeroFArr(sample_ct - row_idx, write_float_iter);
4261               write_float_iter = &(write_float_buf[sample_ct]);
4262             } else if (matrix_shape == kfGrmMatrixSq) {
4263               const double* grm_col = &(grm[row_idx - 1]);
4264               for (uintptr_t row_idx2 = row_idx; row_idx2 != sample_ct; ++row_idx2) {
4265                 *write_float_iter++ = S_CAST(float, grm_col[(row_idx2 - row_start_idx) * sample_ct]);
4266               }
4267             }
4268             if (unlikely(fwrite_checked(write_float_buf, sizeof(float) * S_CAST(uintptr_t, write_float_iter - write_float_buf), outfile))) {
4269               goto CalcGrm_ret_WRITE_FAIL;
4270             }
4271           } while (row_idx < row_end_idx);
4272           if (unlikely(fclose_null(&outfile))) {
4273             goto CalcGrm_ret_WRITE_FAIL;
4274           }
4275         } else {
4276           char* outname_end2 = strcpya_k(outname_end, ".rel");
4277           if (parallel_tot != 1) {
4278             *outname_end2++ = '.';
4279             outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4280           }
4281           const uint32_t output_zst = (grm_flags / kfGrmMatrixZs) & 1;
4282           if (output_zst) {
4283             outname_end2 = strcpya_k(outname_end2, ".zst");
4284           }
4285           *outname_end2 = '\0';
4286           reterr = InitCstreamAlloc(outname, 0, output_zst, max_thread_ct, kCompressStreamBlock + 16 * row_end_idx, &css, &cswritep);
4287           if (unlikely(reterr)) {
4288             goto CalcGrm_ret_1;
4289           }
4290           uintptr_t row_idx = row_start_idx;
4291           do {
4292             const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4293             ++row_idx;
4294             for (uint32_t col_idx = 0; col_idx != row_idx; ++col_idx) {
4295               cswritep = dtoa_g(*grm_iter++, cswritep);
4296               *cswritep++ = '\t';
4297             }
4298             if (matrix_shape == kfGrmMatrixSq0) {
4299               // (roughly same performance as creating a zero-tab constant
4300               // buffer in advance)
4301               const uint32_t zcount = sample_ct - row_idx;
4302               const uint32_t wct = DivUp(zcount, kBytesPerWord / 2);
4303               // assumes little-endian
4304               const uintptr_t zerotab_word = 0x930 * kMask0001;
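              // 0x930 packs tab (0x09) above '0' (0x30); multiplying by
              // kMask0001 (0x0001 repeated across the word) replicates the
              // little-endian byte pair "0\t", so each word holds
              // kBytesPerWord/2 copies of "0\t".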
4305 #ifdef __arm__
4306 #  error "Unaligned accesses in CalcGrm()."
4307 #endif
4308               uintptr_t* writep_alias = R_CAST(uintptr_t*, cswritep);
4309               for (uintptr_t widx = 0; widx != wct; ++widx) {
4310                 *writep_alias++ = zerotab_word;
4311               }
4312               cswritep = &(cswritep[zcount * 2]);
4313             } else if (matrix_shape == kfGrmMatrixSq) {
4314               const double* grm_col = &(grm[row_idx - 1]);
4315               for (uintptr_t row_idx2 = row_idx; row_idx2 != sample_ct; ++row_idx2) {
4316                 cswritep = dtoa_g(grm_col[(row_idx2 - row_start_idx) * sample_ct], cswritep);
4317                 *cswritep++ = '\t';
4318               }
4319             }
4320             DecrAppendBinaryEoln(&cswritep);
4321             if (unlikely(Cswrite(&css, &cswritep))) {
4322               goto CalcGrm_ret_WRITE_FAIL;
4323             }
4324           } while (row_idx < row_end_idx);
4325           if (unlikely(CswriteCloseNull(&css, cswritep))) {
4326             goto CalcGrm_ret_WRITE_FAIL;
4327           }
4328         }
4329         putc_unlocked('\r', stdout);
4330         log_write_iter = strcpya_k(g_logbuf, "--make-rel: GRM ");
4331         if (parallel_tot != 1) {
4332           log_write_iter = strcpya_k(log_write_iter, "component ");
4333         }
4334         log_write_iter = strcpya_k(log_write_iter, "written to ");
4335         log_write_iter = strcpya(log_write_iter, outname);
4336       } else {
4337         const uint32_t* missing_dbl_exclude_iter = missing_dbl_exclude_cts;
4338         if (grm_flags & kfGrmBin) {
4339           // --make-grm-bin
4340           float* write_float_buf;
4341           if (unlikely(bigstack_alloc_f(row_end_idx, &write_float_buf))) {
4342             goto CalcGrm_ret_NOMEM;
4343           }
4344           char* outname_end2 = strcpya_k(outname_end, ".grm.bin");
4345           if (parallel_tot != 1) {
4346             *outname_end2++ = '.';
4347             outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4348           }
4349           *outname_end2 = '\0';
4350           if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
4351             goto CalcGrm_ret_OPEN_FAIL;
4352           }
4353           fputs("--make-grm-bin: Writing...", stdout);
4354           fflush(stdout);
4355           for (uintptr_t row_idx = row_start_idx; row_idx != row_end_idx; ++row_idx) {
4356             const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4357             for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
4358               write_float_buf[col_idx] = S_CAST(float, *grm_iter++);
4359             }
4360             if (unlikely(fwrite_checked(write_float_buf, (row_idx + 1) * sizeof(float), outfile))) {
4361               goto CalcGrm_ret_WRITE_FAIL;
4362             }
4363           }
4364           if (unlikely(fclose_null(&outfile))) {
4365             goto CalcGrm_ret_WRITE_FAIL;
4366           }
4367 
4368           outname_end2 = strcpya_k(outname_end, ".grm.N.bin");
4369           if (parallel_tot != 1) {
4370             *outname_end2++ = '.';
4371             outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4372           }
4373           *outname_end2 = '\0';
4374           if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
4375             goto CalcGrm_ret_OPEN_FAIL;
4376           }
4377           if (!missing_cts) {
4378             // trivial case: write the same number repeatedly
4379             const uintptr_t tot_cells = (S_CAST(uint64_t, row_end_idx) * (row_end_idx + 1) - S_CAST(uint64_t, row_start_idx) * (row_start_idx + 1)) / 2;
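            // Row r of the lower triangle (diagonal included) has r+1 cells,
            // so rows [row_start_idx, row_end_idx) contain
            //   sum_{r=start}^{end-1} (r+1) = [end*(end+1) - start*(start+1)]/2
            // cells; e.g. start=0, end=4 gives (4*5 - 0)/2 = 10 = 1+2+3+4.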
4380             const float variant_ctf = u31tof(variant_ct);
4381             write_float_buf = R_CAST(float*, g_textbuf);
4382             for (uint32_t uii = 0; uii != (kTextbufMainSize / sizeof(float)); ++uii) {
4383               write_float_buf[uii] = variant_ctf;
4384             }
4385             const uintptr_t full_write_ct = tot_cells / (kTextbufMainSize / sizeof(float));
4386             for (uintptr_t ulii = 0; ulii != full_write_ct; ++ulii) {
4387               if (unlikely(fwrite_checked(write_float_buf, kTextbufMainSize, outfile))) {
4388                 goto CalcGrm_ret_WRITE_FAIL;
4389               }
4390             }
4391             const uintptr_t remainder = tot_cells % (kTextbufMainSize / sizeof(float));
4392             if (remainder) {
4393               if (unlikely(fwrite_checked(write_float_buf, remainder * sizeof(float), outfile))) {
4394                 goto CalcGrm_ret_WRITE_FAIL;
4395               }
4396             }
4397           } else {
4398             for (uintptr_t row_idx = row_start_idx; row_idx != row_end_idx; ++row_idx) {
4399               const uint32_t variant_ct_base = variant_ct - missing_cts[row_idx];
4400               for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
4401                 uint32_t cur_obs_ct = variant_ct_base;
4402                 if (col_idx != row_idx) {
4403                   cur_obs_ct = cur_obs_ct - missing_cts[col_idx] + (*missing_dbl_exclude_iter++);
4404                 }
4405                 write_float_buf[col_idx] = u31tof(cur_obs_ct);
4406               }
4407               if (unlikely(fwrite_checked(write_float_buf, (row_idx + 1) * sizeof(float), outfile))) {
4408                 goto CalcGrm_ret_WRITE_FAIL;
4409               }
4410             }
4411           }
4412           if (unlikely(fclose_null(&outfile))) {
4413             goto CalcGrm_ret_WRITE_FAIL;
4414           }
4415           putc_unlocked('\r', stdout);
4416           const uint32_t outname_copy_byte_ct = 5 + S_CAST(uintptr_t, outname_end - outname);
4417           log_write_iter = strcpya_k(g_logbuf, "--make-grm-bin: GRM ");
4418           if (parallel_tot != 1) {
4419             log_write_iter = strcpya_k(log_write_iter, "component ");
4420           }
4421           log_write_iter = strcpya_k(log_write_iter, "written to ");
4422           log_write_iter = memcpya(log_write_iter, outname, outname_copy_byte_ct);
4423           log_write_iter = strcpya_k(log_write_iter, "bin");
4424           if (parallel_tot != 1) {
4425             *log_write_iter++ = '.';
4426             log_write_iter = u32toa(parallel_idx + 1, log_write_iter);
4427           }
4428           log_write_iter = strcpya_k(log_write_iter, " , ");
4429           if (parallel_idx) {
4430             log_write_iter = strcpya_k(log_write_iter, "and ");
4431           }
4432           log_write_iter = strcpya_k(log_write_iter, "observation counts to ");
4433           log_write_iter = memcpya(log_write_iter, outname, outname_end2 - outname);
4434         } else {
4435           // --make-grm-list
4436           char* outname_end2 = strcpya_k(outname_end, ".grm");
4437           if (parallel_tot != 1) {
4438             *outname_end2++ = '.';
4439             outname_end2 = u32toa(parallel_idx + 1, outname_end2);
4440           }
4441           if (grm_flags & kfGrmListZs) {
4442             outname_end2 = strcpya_k(outname_end2, ".zst");
4443           }
4444           *outname_end2 = '\0';
4445           reterr = InitCstreamAlloc(outname, 0, (grm_flags / kfGrmListZs) & 1, max_thread_ct, kCompressStreamBlock + kMaxMediumLine, &css, &cswritep);
4446           if (unlikely(reterr)) {
4447             goto CalcGrm_ret_1;
4448           }
4449           fputs("--make-grm-list: Writing...", stdout);
4450           fflush(stdout);
4451           for (uintptr_t row_idx = row_start_idx; row_idx != row_end_idx; ++row_idx) {
4452             uint32_t variant_ct_base = variant_ct;
4453             if (missing_cts) {
4454               variant_ct_base -= missing_cts[row_idx];
4455             }
4456             const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
4457             for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
4458               cswritep = u32toa_x(row_idx + 1, '\t', cswritep);
4459               cswritep = u32toa_x(col_idx + 1, '\t', cswritep);
4460               if (missing_cts) {
4461                 uint32_t cur_obs_ct = variant_ct_base;
4462                 if (col_idx != row_idx) {
4463                   cur_obs_ct = cur_obs_ct - missing_cts[col_idx] + (*missing_dbl_exclude_iter++);
4464                 }
4465                 cswritep = u32toa(cur_obs_ct, cswritep);
4466               } else {
4467                 cswritep = u32toa(variant_ct_base, cswritep);
4468               }
4469               *cswritep++ = '\t';
4470               cswritep = dtoa_g(*grm_iter++, cswritep);
4471               AppendBinaryEoln(&cswritep);
4472               if (unlikely(Cswrite(&css, &cswritep))) {
4473                 goto CalcGrm_ret_WRITE_FAIL;
4474               }
4475             }
4476           }
4477           if (unlikely(CswriteCloseNull(&css, cswritep))) {
4478             goto CalcGrm_ret_WRITE_FAIL;
4479           }
4480           putc_unlocked('\r', stdout);
4481           log_write_iter = strcpya_k(g_logbuf, "--make-grm-list: GRM ");
4482           if (parallel_tot != 1) {
4483             log_write_iter = strcpya_k(log_write_iter, "component ");
4484           }
4485           log_write_iter = strcpya_k(log_write_iter, "written to ");
4486           log_write_iter = strcpya(log_write_iter, outname);
4487         }
4488       }
4489       if (!parallel_idx) {
4490         SampleIdFlags id_print_flags = siip->flags & kfSampleIdFidPresent;
4491         if (grm_flags & kfGrmNoIdHeader) {
4492           id_print_flags |= kfSampleIdNoIdHeader;
4493           if (grm_flags & kfGrmNoIdHeaderIidOnly) {
4494             id_print_flags |= kfSampleIdNoIdHeaderIidOnly;
4495           }
4496         }
4497         snprintf(&(outname_end[4]), kMaxOutfnameExtBlen - 4, ".id");
4498         reterr = WriteSampleIdsOverride(orig_sample_include, siip, outname, sample_ct, id_print_flags);
4499         if (unlikely(reterr)) {
4500           goto CalcGrm_ret_1;
4501         }
4502         log_write_iter = strcpya_k(log_write_iter, " , and IDs to ");
4503         log_write_iter = strcpya(log_write_iter, outname);
4504       }
4505       snprintf(log_write_iter, kLogbufSize - 2 * kPglFnamesize - 256, " .\n");
4506       WordWrapB(0);
4507       logputsb();
4508     }
4509 
4510     if (grm_ptr) {
4511       *grm_ptr = grm;
4512       // allocation right on top of grm[]
4513       bigstack_mark = R_CAST(unsigned char*, sample_include_cumulative_popcounts);
4514     }
4515   }
4516   while (0) {
4517   CalcGrm_ret_NOMEM:
4518     reterr = kPglRetNomem;
4519     break;
4520   CalcGrm_ret_NOMEM_CUSTOM:
4521     reterr = kPglRetNomemCustomMsg;
4522     break;
4523   CalcGrm_ret_OPEN_FAIL:
4524     reterr = kPglRetOpenFail;
4525     break;
4526   CalcGrm_ret_PGR_FAIL:
4527     PgenErrPrintN(reterr);
4528     break;
4529   CalcGrm_ret_WRITE_FAIL:
4530     reterr = kPglRetWriteFail;
4531     break;
4532   CalcGrm_ret_THREAD_CREATE_FAIL:
4533     reterr = kPglRetThreadCreateFail;
4534     break;
4535   CalcGrm_ret_DEGENERATE_DATA:
4536     reterr = kPglRetDegenerateData;
4537     break;
4538   }
4539  CalcGrm_ret_1:
4540   CswriteCloseCond(&css, cswritep);
4541   fclose_cond(outfile);
4542   CleanupThreads(&tg);
4543   BLAS_SET_NUM_THREADS(1);
4544   BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
4545   return reterr;
4546 }
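
// Not compiled into the build: a minimal standalone sketch of reading back
// the GCTA-compatible lower triangle emitted by --make-grm-bin above
// (float32, row-major, diagonal included).  ReadGrmBinLowerTriangle is a
// hypothetical helper name, not part of the plink2 API.
#if 0
static float* ReadGrmBinLowerTriangle(const char* fname, uint32_t sample_ct) {
  const uintptr_t cell_ct = (S_CAST(uintptr_t, sample_ct) * (sample_ct + 1)) / 2;
  float* grm_vals = S_CAST(float*, malloc(cell_ct * sizeof(float)));
  if (!grm_vals) {
    return nullptr;
  }
  FILE* infile = fopen(fname, FOPEN_RB);
  if ((!infile) || (fread(grm_vals, sizeof(float), cell_ct, infile) != cell_ct)) {
    free(grm_vals);
    if (infile) {
      fclose(infile);
    }
    return nullptr;
  }
  fclose(infile);
  // Entry (i,j) with i >= j lives at index i*(i+1)/2 + j.
  return grm_vals;
}
#endif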
4547 
4548 // should be able to remove NOLAPACK later since we already have a non-LAPACK
4549 // SVD implementation
4550 #ifndef NOLAPACK
4551 // this seems to be better than 256 (due to avoidance of cache critical
4552 // stride?)
4553 // (still want this to be a multiple of 8, for cleaner multithreading)
4554 CONSTI32(kPcaVariantBlockSize, 240);
4555 
4556 typedef struct CalcPcaCtxStruct {
4557   uint32_t sample_ct;
4558   uint32_t pc_ct;
4559 
4560   double* yy_bufs[2];
4561 
4562   uint32_t cur_batch_size;
4563 
4564   double* g1;
4565   double* qq;
4566   double** y_transpose_bufs;
4567   double** g2_bb_part_bufs;
4568 } CalcPcaCtx;
4569 
4570 THREAD_FUNC_DECL CalcPcaXtxaThread(void* raw_arg) {
4571   ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
4572   const uintptr_t tidx = arg->tidx;
4573   CalcPcaCtx* ctx = S_CAST(CalcPcaCtx*, arg->sharedp->context);
4574 
4575   const uint32_t sample_ct = ctx->sample_ct;
4576   const uint32_t pc_ct_x2 = ctx->pc_ct * 2;
4577   const uintptr_t qq_col_ct = (ctx->pc_ct + 1) * pc_ct_x2;
4578   const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
4579   const double* g1 = ctx->g1;
4580   double* qq_iter = ctx->qq;
4581   double* y_transpose_buf = ctx->y_transpose_bufs[tidx];
4582   double* g2_part_buf = ctx->g2_bb_part_bufs[tidx];
4583   uint32_t parity = 0;
4584   do {
4585     const uint32_t cur_batch_size = ctx->cur_batch_size;
4586     if (vidx_offset < cur_batch_size) {
4587       uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
4588       if (cur_thread_batch_size > kPcaVariantBlockSize) {
4589         cur_thread_batch_size = kPcaVariantBlockSize;
4590       }
4591       const double* yy_buf = &(ctx->yy_bufs[parity][S_CAST(uintptr_t, vidx_offset) * sample_ct]);
4592       double* cur_qq = &(qq_iter[vidx_offset * qq_col_ct]);
4593       RowMajorMatrixMultiplyStrided(yy_buf, g1, cur_thread_batch_size, sample_ct, pc_ct_x2, pc_ct_x2, sample_ct, qq_col_ct, cur_qq);
4594       MatrixTransposeCopy(yy_buf, cur_thread_batch_size, sample_ct, y_transpose_buf);
4595       RowMajorMatrixMultiplyStridedIncr(y_transpose_buf, cur_qq, sample_ct, cur_thread_batch_size, pc_ct_x2, qq_col_ct, cur_thread_batch_size, pc_ct_x2, g2_part_buf);
4596       qq_iter = &(qq_iter[cur_batch_size * qq_col_ct]);
4597     }
4598     parity = 1 - parity;
4599   } while (!THREAD_BLOCK_FINISH(arg));
4600   THREAD_RETURN;
4601 }
4602 
4603 THREAD_FUNC_DECL CalcPcaXaThread(void* raw_arg) {
4604   ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
4605   const uintptr_t tidx = arg->tidx;
4606   CalcPcaCtx* ctx = S_CAST(CalcPcaCtx*, arg->sharedp->context);
4607 
4608   const uint32_t sample_ct = ctx->sample_ct;
4609   const uint32_t pc_ct_x2 = ctx->pc_ct * 2;
4610   const uintptr_t qq_col_ct = (ctx->pc_ct + 1) * pc_ct_x2;
4611   const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
4612   const double* g1 = ctx->g1;
4613   double* qq_iter = ctx->qq;
4614   uint32_t parity = 0;
4615   do {
4616     const uint32_t cur_batch_size = ctx->cur_batch_size;
4617     if (vidx_offset < cur_batch_size) {
4618       uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
4619       if (cur_thread_batch_size > kPcaVariantBlockSize) {
4620         cur_thread_batch_size = kPcaVariantBlockSize;
4621       }
4622       const double* yy_buf = &(ctx->yy_bufs[parity][S_CAST(uintptr_t, vidx_offset) * sample_ct]);
4623       double* cur_qq = &(qq_iter[vidx_offset * qq_col_ct]);
4624       RowMajorMatrixMultiplyStrided(yy_buf, g1, cur_thread_batch_size, sample_ct, pc_ct_x2, pc_ct_x2, sample_ct, qq_col_ct, cur_qq);
4625       qq_iter = &(qq_iter[cur_batch_size * qq_col_ct]);
4626     }
4627     parity = 1 - parity;
4628   } while (!THREAD_BLOCK_FINISH(arg));
4629   THREAD_RETURN;
4630 }
4631 
4632 THREAD_FUNC_DECL CalcPcaXtbThread(void* raw_arg) {
4633   ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
4634   const uintptr_t tidx = arg->tidx;
4635   CalcPcaCtx* ctx = S_CAST(CalcPcaCtx*, arg->sharedp->context);
4636 
4637   const uint32_t sample_ct = ctx->sample_ct;
4638   const uint32_t pc_ct_x2 = ctx->pc_ct * 2;
4639   const uintptr_t qq_col_ct = (ctx->pc_ct + 1) * pc_ct_x2;
4640   const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
4641   const double* qq_iter = &(ctx->qq[vidx_offset * qq_col_ct]);
4642   double* y_transpose_buf = ctx->y_transpose_bufs[tidx];
4643   double* bb_part_buf = ctx->g2_bb_part_bufs[tidx];
4644   uint32_t parity = 0;
4645   do {
4646     const uint32_t cur_batch_size = ctx->cur_batch_size;
4647     if (vidx_offset < cur_batch_size) {
4648       uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
4649       if (cur_thread_batch_size > kPcaVariantBlockSize) {
4650         cur_thread_batch_size = kPcaVariantBlockSize;
4651       }
4652       const double* yy_buf = &(ctx->yy_bufs[parity][S_CAST(uintptr_t, vidx_offset) * sample_ct]);
4653       MatrixTransposeCopy(yy_buf, cur_thread_batch_size, sample_ct, y_transpose_buf);
4654       RowMajorMatrixMultiplyIncr(y_transpose_buf, qq_iter, sample_ct, qq_col_ct, cur_thread_batch_size, bb_part_buf);
4655       qq_iter = &(qq_iter[cur_batch_size * qq_col_ct]);
4656     }
4657     parity = 1 - parity;
4658   } while (!THREAD_BLOCK_FINISH(arg));
4659   THREAD_RETURN;
4660 }
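
// Roughly, in the notation of the Halko et al. randomized algorithm cited in
// CalcPca() below (X = variance-standardized variant-major genotype matrix,
// m = variant count):
// - CalcPcaXtxaThread computes a Krylov block Q_i = X * G_{i-1} and
//   accumulates G_i += X^T * Q_i (rescaled by 1/m after the thread join).
// - CalcPcaXaThread computes only the final Q block.
// - CalcPcaXtbThread accumulates B += X^T * U after the Krylov matrix's left
//   singular vectors U have replaced qq; the SVD of B then yields the
//   sample-major eigenvectors.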
4661 
4662 typedef struct CalcPcaVarWtsCtxStruct {
4663   uint32_t sample_ct;
4664   uint32_t pc_ct;
4665 
4666   double* sample_wts_smaj;
4667 
4668   double* yy_bufs[2];
4669 
4670   uint32_t cur_batch_size;
4671 
4672   double* var_wts;
4673 } CalcPcaVarWtsCtx;
4674 
4675 THREAD_FUNC_DECL CalcPcaVarWtsThread(void* raw_arg) {
4676   ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
4677   const uintptr_t tidx = arg->tidx;
4678   CalcPcaVarWtsCtx* ctx = S_CAST(CalcPcaVarWtsCtx*, arg->sharedp->context);
4679 
4680   const uint32_t sample_ct = ctx->sample_ct;
4681   const uint32_t pc_ct = ctx->pc_ct;
4682   const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
4683 
4684   // either first batch size is calc_thread_ct * kPcaVariantBlockSize, or there
4685   // is only one batch
4686   const uintptr_t var_wts_part_size = S_CAST(uintptr_t, pc_ct) * ctx->cur_batch_size;
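  // var_wts is double-buffered in batch-size units: while the worker threads
  // fill part `parity`, the main thread can flush part `1 - parity` to disk
  // (see the main workflow comment in CalcPca() below).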
4687 
4688   const double* sample_wts = ctx->sample_wts_smaj;  // sample-major, pc_ct columns
4689   uint32_t parity = 0;
4690   do {
4691     const uint32_t cur_batch_size = ctx->cur_batch_size;
4692     if (vidx_offset < cur_batch_size) {
4693       uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
4694       if (cur_thread_batch_size > kPcaVariantBlockSize) {
4695         cur_thread_batch_size = kPcaVariantBlockSize;
4696       }
4697       const double* yy_buf = &(ctx->yy_bufs[parity][S_CAST(uintptr_t, vidx_offset) * sample_ct]);
4698       // Variant weight matrix = X^T * S * D^{-1/2}, where X^T is the
4699       // variance-standardized genotype matrix, S is the sample weight matrix,
4700       // and D is a diagonal eigenvalue matrix.
4701       // We postpone the D^{-1/2} part for now, but it's straightforward to
4702       // switch to using precomputed (S * D^{-1/2}).
4703       double* cur_var_wts_part = &(ctx->var_wts[parity * var_wts_part_size + vidx_offset * S_CAST(uintptr_t, pc_ct)]);
4704       RowMajorMatrixMultiply(yy_buf, sample_wts, cur_thread_batch_size, pc_ct, sample_ct, cur_var_wts_part);
4705     }
4706     parity = 1 - parity;
4707   } while (!THREAD_BLOCK_FINISH(arg));
4708   THREAD_RETURN;
4709 }
4710 
4711 PglErr FlushBiallelicVarWts(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const AlleleCode* maj_alleles, const double* var_wts_iter, const double* eigval_inv_sqrts, uint32_t batch_size, uint32_t pc_ct, PcaFlags pca_flags, CompressStreamState* cssp, char** cswritepp, char* chr_buf, uint32_t* variant_idxp, uintptr_t* variant_uidxp, uint32_t* chr_fo_idxp, uint32_t* chr_endp, uint32_t* chr_buf_blenp) {
4712   char* cswritep = *cswritepp;
4713   uint32_t variant_idx = *variant_idxp;
4714   uintptr_t variant_uidx = *variant_uidxp;
4715   uint32_t chr_fo_idx = *chr_fo_idxp;
4716   uint32_t chr_end = *chr_endp;
4717   uint32_t chr_buf_blen = *chr_buf_blenp;
4718 
4719   const uint32_t variant_idx_stop = variant_idx + batch_size;
4720   const uint32_t ref_col = pca_flags & kfPcaVcolRef;
4721   const uint32_t alt1_col = pca_flags & kfPcaVcolAlt1;
4722   const uint32_t alt_col = pca_flags & kfPcaVcolAlt;
4723   const uint32_t maj_col = pca_flags & kfPcaVcolMaj;
4724   const uint32_t nonmaj_col = pca_flags & kfPcaVcolNonmaj;
4725 
4726   uintptr_t variant_uidx_base;
4727   uintptr_t cur_bits;
4728   BitIter1Start(variant_include, variant_uidx, &variant_uidx_base, &cur_bits);
4729   for (; variant_idx != variant_idx_stop; ++variant_idx) {
4730     variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
4731     if (chr_buf) {
4732       // ok to skip this logic if chr_col not printed
4733       if (variant_uidx >= chr_end) {
4734         do {
4735           ++chr_fo_idx;
4736           chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
4737         } while (variant_uidx >= chr_end);
4738         const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
4739         char* chr_name_end = chrtoa(cip, chr_idx, chr_buf);
4740         *chr_name_end = '\t';
4741         chr_buf_blen = 1 + S_CAST(uintptr_t, chr_name_end - chr_buf);
4742       }
4743       cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
4744     }
4745     if (variant_bps) {
4746       cswritep = u32toa_x(variant_bps[variant_uidx], '\t', cswritep);
4747     }
4748     cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
4749     uintptr_t allele_idx_offset_base = variant_uidx * 2;
4750     if (allele_idx_offsets) {
4751       allele_idx_offset_base = allele_idx_offsets[variant_uidx];
4752     }
4753     const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
4754     if (ref_col) {
4755       *cswritep++ = '\t';
4756       cswritep = strcpya(cswritep, cur_alleles[0]);
4757     }
4758     if (alt1_col) {
4759       *cswritep++ = '\t';
4760       cswritep = strcpya(cswritep, cur_alleles[1]);
4761     }
4762     if (alt_col) {
4763       *cswritep++ = '\t';
4764       // guaranteed biallelic
4765       cswritep = strcpya(cswritep, cur_alleles[1]);
4766     }
4767     const uint32_t maj_allele_idx = maj_alleles[variant_uidx];
4768     if (maj_col) {
4769       if (unlikely(Cswrite(cssp, &cswritep))) {
4770         return kPglRetWriteFail;
4771       }
4772       *cswritep++ = '\t';
4773       cswritep = strcpya(cswritep, cur_alleles[maj_allele_idx]);
4774     }
4775     if (nonmaj_col) {
4776       *cswritep++ = '\t';
4777       cswritep = strcpya(cswritep, cur_alleles[1 - maj_allele_idx]);
4778     }
4779     if (!maj_allele_idx) {
4780       for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
4781         *cswritep++ = '\t';
4782         // could avoid these multiplications by premultiplying the
4783         // sample weight matrix
4784         cswritep = dtoa_g((*var_wts_iter++) * eigval_inv_sqrts[pc_idx], cswritep);
4785       }
4786     } else {
4787       for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
4788         *cswritep++ = '\t';
4789         cswritep = dtoa_g((*var_wts_iter++) * (-eigval_inv_sqrts[pc_idx]), cswritep);
4790       }
4791     }
4792     AppendBinaryEoln(&cswritep);
4793     if (unlikely(Cswrite(cssp, &cswritep))) {
4794       // bugfix (15 Dec 2017): prevent buffer overflow when ALT, MAJ,
4795       // and NONMAJ columns all missing.
4796       return kPglRetWriteFail;
4797     }
4798   }
4799   *cswritepp = cswritep;
4800   *variant_idxp = variant_idx_stop;
4801   *variant_uidxp = variant_uidx + 1;
4802   *chr_fo_idxp = chr_fo_idx;
4803   *chr_endp = chr_end;
4804   *chr_buf_blenp = chr_buf_blen;
4805   return kPglRetSuccess;
4806 }
4807 
4808 PglErr FlushAlleleWts(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const double* var_wts_iter, const double* eigval_inv_sqrts, uint32_t batch_size, uint32_t pc_ct, PcaFlags pca_flags, CompressStreamState* cssp, char** cswritepp, char* chr_buf, uint32_t* variant_idxp, uintptr_t* variant_uidxp, uintptr_t* allele_idx_offset_basep, uint32_t* cur_allele_ctp, uint32_t* incomplete_allele_idxp, uint32_t* chr_fo_idxp, uint32_t* chr_endp, uint32_t* chr_buf_blenp) {
4809   char* cswritep = *cswritepp;
4810   uint32_t variant_idx = *variant_idxp;
4811   uintptr_t variant_uidx = *variant_uidxp;
4812   uintptr_t allele_idx_offset_base = *allele_idx_offset_basep;
4813   uint32_t cur_allele_ct = *cur_allele_ctp;
4814   uint32_t incomplete_allele_idx = *incomplete_allele_idxp;
4815   uint32_t chr_fo_idx = *chr_fo_idxp;
4816   uint32_t chr_end = *chr_endp;
4817   uint32_t chr_buf_blen = *chr_buf_blenp;
4818 
4819   const uint32_t ref_col = pca_flags & kfPcaVcolRef;
4820   const uint32_t alt1_col = pca_flags & kfPcaVcolAlt1;
4821   const uint32_t alt_col = pca_flags & kfPcaVcolAlt;
4822   const uint32_t ax_col = pca_flags & kfPcaVcolAx;
4823 
4824   uintptr_t variant_uidx_base;
4825   uintptr_t cur_bits;
4826   BitIter1Start(variant_include, variant_uidx + (incomplete_allele_idx != 0), &variant_uidx_base, &cur_bits);
4827   for (uint32_t allele_bidx = 0; allele_bidx != batch_size; ) {
4828     if (!incomplete_allele_idx) {
4829       variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
4830       if (chr_buf && (variant_uidx >= chr_end)) {
4831         do {
4832           ++chr_fo_idx;
4833           chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
4834         } while (variant_uidx >= chr_end);
4835         const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
4836         char* chr_name_end = chrtoa(cip, chr_idx, chr_buf);
4837         *chr_name_end = '\t';
4838         chr_buf_blen = 1 + S_CAST(uintptr_t, chr_name_end - chr_buf);
4839       }
4840       if (!allele_idx_offsets) {
4841         allele_idx_offset_base = variant_uidx * 2;
4842       } else {
4843         allele_idx_offset_base = allele_idx_offsets[variant_uidx];
4844         cur_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
4845       }
4846     }
4847     uint32_t allele_idx_end = cur_allele_ct;
4848     uint32_t allele_idx_stop;
4849     uint32_t incr;
4850     if (cur_allele_ct == 2) {
4851       allele_idx_stop = 2;
4852       incr = 1;
4853     } else {
4854       allele_idx_stop = batch_size + incomplete_allele_idx - allele_bidx;
4855       if (allele_idx_stop > allele_idx_end) {
4856         allele_idx_stop = allele_idx_end;
4857       }
4858       incr = allele_idx_stop - incomplete_allele_idx;
4859     }
4860     const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
4861     for (uint32_t allele_idx = incomplete_allele_idx; allele_idx != allele_idx_stop; ++allele_idx) {
4862       if (chr_buf) {
4863         cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
4864       }
4865       if (variant_bps) {
4866         cswritep = u32toa_x(variant_bps[variant_uidx], '\t', cswritep);
4867       }
4868       cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
4869       if (ref_col) {
4870         *cswritep++ = '\t';
4871         cswritep = strcpya(cswritep, cur_alleles[0]);
4872       }
4873       if (alt1_col) {
4874         *cswritep++ = '\t';
4875         cswritep = strcpya(cswritep, cur_alleles[1]);
4876       }
4877       if (alt_col) {
4878         *cswritep++ = '\t';
4879         for (uint32_t allele_idx2 = 1; allele_idx2 != cur_allele_ct; ++allele_idx2) {
4880           if (unlikely(Cswrite(cssp, &cswritep))) {
4881             return kPglRetWriteFail;
4882           }
4883           cswritep = strcpyax(cswritep, cur_alleles[allele_idx2], ',');
4884         }
4885         --cswritep;
4886       }
4887       // A1 col always present
4888       if (unlikely(Cswrite(cssp, &cswritep))) {
4889         return kPglRetWriteFail;
4890       }
4891       *cswritep++ = '\t';
4892       cswritep = strcpya(cswritep, cur_alleles[allele_idx]);
4893       if (ax_col) {
4894         *cswritep++ = '\t';
4895         for (uint32_t allele_idx2 = 0; allele_idx2 != cur_allele_ct; ++allele_idx2) {
4896           if (allele_idx2 == allele_idx) {
4897             continue;
4898           }
4899           if (unlikely(Cswrite(cssp, &cswritep))) {
4900             return kPglRetWriteFail;
4901           }
4902           cswritep = strcpyax(cswritep, cur_alleles[allele_idx2], ',');
4903         }
4904         --cswritep;
4905       }
4906       if (cur_allele_ct == 2) {
4907         const double mult = allele_idx? -0.5 : 0.5;
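        // A biallelic variant carries one underlying weight vector; it is
        // emitted twice, scaled by +0.5 for the first allele and -0.5 for the
        // second (so the two rows sum to zero), with var_wts_iter rewound
        // below so the second allele reuses the same weights.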
4908         for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
4909           *cswritep++ = '\t';
4910           cswritep = dtoa_g((*var_wts_iter++) * mult * eigval_inv_sqrts[pc_idx], cswritep);
4911         }
4912         if (!allele_idx) {
4913           var_wts_iter -= pc_ct;
4914         }
4915       } else {
4916         for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
4917           *cswritep++ = '\t';
4918           cswritep = dtoa_g((*var_wts_iter++) * eigval_inv_sqrts[pc_idx], cswritep);
4919         }
4920       }
4921       AppendBinaryEoln(&cswritep);
4922       if (unlikely(Cswrite(cssp, &cswritep))) {
4923         return kPglRetWriteFail;
4924       }
4925     }
4926     allele_bidx += incr;
4927     if (allele_idx_stop == allele_idx_end) {
4928       ++variant_idx;
4929       incomplete_allele_idx = 0;
4930     } else {
4931       incomplete_allele_idx = allele_idx_stop;
4932     }
4933   }
4934   *cswritepp = cswritep;
4935   *variant_idxp = variant_idx;
4936   *variant_uidxp = variant_uidx + (incomplete_allele_idx == 0);
4937   *allele_idx_offset_basep = allele_idx_offset_base;
4938   *cur_allele_ctp = cur_allele_ct;
4939   *incomplete_allele_idxp = incomplete_allele_idx;
4940   *chr_fo_idxp = chr_fo_idx;
4941   *chr_endp = chr_end;
4942   *chr_buf_blenp = chr_buf_blen;
4943   return kPglRetSuccess;
4944 }
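
// Illustrative example of the row structure FlushAlleleWts() produces for one
// triallelic variant with REF=A, ALT=C,T (variant ID "rs1" is hypothetical;
// columns other than ID/A1/AX elided):
//   rs1  A  C,T   <weights for A>
//   rs1  C  A,T   <weights for C>
//   rs1  T  A,C   <weights for T>
// Biallelic variants contribute two such rows sharing one weight vector with
// opposite signs.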
4945 
4946 PglErr CalcPca(const uintptr_t* sample_include, const SampleIdInfo* siip, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const AlleleCode* maj_alleles, const double* allele_freqs, uint32_t raw_sample_ct, uintptr_t pca_sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t pc_ct, PcaFlags pca_flags, uint32_t max_thread_ct, PgenReader* simple_pgrp, sfmt_t* sfmtp, double* grm, char* outname, char* outname_end) {
4947   unsigned char* bigstack_mark = g_bigstack_base;
4948   FILE* outfile = nullptr;
4949   char* cswritep = nullptr;
4950   CompressStreamState css;
4951   ThreadGroup tg;
4952   PreinitThreads(&tg);
4953   PglErr reterr = kPglRetSuccess;
4954   PreinitCstream(&css);
4955   {
4956     const uint32_t write_fid = FidColIsRequired(siip, pca_flags / kfPcaScolMaybefid);
4957     const char* sample_ids = siip->sample_ids;
4958     const char* sids = siip->sids;
4959     const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
4960     const uintptr_t max_sid_blen = siip->max_sid_blen;
4961     const uint32_t write_sid = SidColIsRequired(sids, pca_flags / kfPcaScolMaybesid);
4962     const uint32_t is_approx = (pca_flags / kfPcaApprox) & 1;
4963     reterr = ConditionalAllocateNonAutosomalVariants(cip, is_approx? "PCA approximation" : "PCA", raw_variant_ct, &variant_include, &variant_ct);
4964     if (unlikely(reterr)) {
4965       goto CalcPca_ret_1;
4966     }
4967 #ifdef __APPLE__
4968     // min OS X version is 10.7, so we can take Grand Central Dispatch dgemm
4969     // for granted
4970     // (tried this with Linux MKL + OpenMP as well, but results were inferior)
4971     uint32_t calc_thread_ct = 1;
4972 #else
4973     // I/O thread generally has <1/8 of workload
4974     // TODO: recheck this, now that I/O thread is also responsible for fully
4975     // expanding dosages.  Still shouldn't be a big deal, but we probably want
4976     // sample_ct to affect the decision boundary now.
4977     uint32_t calc_thread_ct = (max_thread_ct > 8)? (max_thread_ct - 1) : max_thread_ct;
4978     if ((calc_thread_ct - 1) * kPcaVariantBlockSize >= variant_ct) {
4979       calc_thread_ct = 1 + (variant_ct - 1) / kPcaVariantBlockSize;
4980     }
4981 #endif
4982     if (unlikely(pc_ct > pca_sample_ct)) {
4983       // minor update (alpha 3): just error out here instead of trying to
4984       // auto-adjust the PC count, since the number of .eigenvec output
4985       // columns should be easily predictable
4986       logerrprintf("Error: Too few samples to compute %u PCs.\n", pc_ct);
4987       goto CalcPca_ret_DEGENERATE_DATA;
4988     }
4989     const uint32_t wts_requested = ((pca_flags & (kfPcaAlleleWts | kfPcaBiallelicVarWts)) != 0);
4990     const uint32_t biallelic_variant_ct = CountBiallelicVariants(variant_include, allele_idx_offsets, variant_ct);
4991     double* cur_var_wts = nullptr;
4992     double* eigval_inv_sqrts = nullptr;
4993     char* chr_buf = nullptr;
4994     uintptr_t overflow_buf_size = 3 * kMaxMediumLine;
4995     if (wts_requested) {
4996       if (pca_flags & kfPcaBiallelicVarWts) {
4997         if (unlikely(biallelic_variant_ct != variant_ct)) {
4998           logerrputs("Error: Multiallelic variant present in \"--pca biallelic-var-wts\" run.\n");
4999           goto CalcPca_ret_INCONSISTENT_INPUT;
5000         }
5001       }
5002       if (unlikely(
5003               bigstack_alloc_d(pc_ct, &cur_var_wts) ||
5004               bigstack_alloc_d(pc_ct, &eigval_inv_sqrts))) {
5005         goto CalcPca_ret_NOMEM;
5006       }
5007       uint32_t max_chr_blen = 0;
5008       if (pca_flags & kfPcaVcolChrom) {
5009         max_chr_blen = GetMaxChrSlen(cip) + 1;
5010         if (unlikely(bigstack_alloc_c(max_chr_blen, &chr_buf))) {
5011           goto CalcPca_ret_NOMEM;
5012         }
5013       }
5014       const uintptr_t overflow_buf_size2 = RoundUpPow2(kCompressStreamBlock + max_chr_blen + kMaxIdSlen + 2 * max_allele_slen + 32 + 16 * pc_ct, kCacheline);
5015       if (overflow_buf_size2 > overflow_buf_size) {
5016         overflow_buf_size = overflow_buf_size2;
5017       }
5018     }
5019     uintptr_t writebuf_alloc = overflow_buf_size;
5020     if (pca_flags & kfPcaVarZs) {
5021       writebuf_alloc += CstreamWkspaceReq(overflow_buf_size);
5022     }
5023     // temporary
5024     // todo: additional --pca-clusters allocations
5025     const uintptr_t* pca_sample_include = sample_include;
5026     const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
5027     uint32_t* pca_sample_include_cumulative_popcounts;
5028     PgenVariant pgv;
5029     double* allele_1copy_buf;
5030     double* eigvals;
5031     CalcPcaCtx ctx;
5032     if (unlikely(
5033             bigstack_alloc_u32(raw_sample_ctl, &pca_sample_include_cumulative_popcounts) ||
5034             BigstackAllocPgv(pca_sample_ct, allele_idx_offsets != nullptr, PgrGetGflags(simple_pgrp), &pgv) ||
5035             bigstack_alloc_d(max_allele_ct, &allele_1copy_buf) ||
5036             bigstack_alloc_d(pc_ct, &eigvals) ||
5037             SetThreadCt(calc_thread_ct, &tg))) {
5038       goto CalcPca_ret_NOMEM;
5039     }
5040     FillCumulativePopcounts(pca_sample_include, raw_sample_ctl, pca_sample_include_cumulative_popcounts);
5041     PgrSampleSubsetIndex pssi;
5042     PgrSetSampleSubsetIndex(pca_sample_include_cumulative_popcounts, simple_pgrp, &pssi);
5043     ctx.sample_ct = pca_sample_ct;
5044     ctx.pc_ct = pc_ct;
5045     const uintptr_t pca_row_ct = CountAlleles(variant_include, allele_idx_offsets, raw_variant_ct, variant_ct) - biallelic_variant_ct;
5046     const uint32_t is_haploid = cip->haploid_mask[0] & 1;
5047     uint32_t cur_allele_ct = 2;
5048     double* qq = nullptr;
5049     double* eigvecs_smaj;
5050     char* writebuf;
5051     if (is_approx) {
5052       if (pca_sample_ct <= 5000) {
5053         logerrputs("Warning: \"--pca approx\" is only recommended for analysis of >5000 samples.\n");
5054       }
5055       if (pca_row_ct > 5000000) {
5056         logerrputs("Warning: Use of \"--pca approx\" on >5m rows is not advisable.  Apply a MAF\nfilter if you haven't done so yet, and consider LD-pruning your variant set as\nwell.\n");
5057       }
5058       // This is ported from EIGENSOFT 6 src/ksrc/kjg_fpca.c , which is in turn
5059       // primarily based on Halko N, Martinsson P, Shkolnisky Y, Tygert M
5060       // (2011) An Algorithm for the Principal Component Analysis of Large Data
5061       // Sets.
5062       const uintptr_t pc_ct_x2 = pc_ct * 2;
5063       const uintptr_t qq_col_ct = (pc_ct + 1) * pc_ct_x2;
5064       // bugfix (30 Jan 2019): First SvdRect() call returns min(variant_ct,
5065       // qq_col_ct) singular vectors; this was previously assumed to always be
5066       // qq_col_ct, and very inaccurate results were produced when the
5067       // assumption wasn't true.
5068       // Simplest solution is to force the user to request fewer PCs, since the
5069       // final PCs wouldn't be accurate anyway.
5070       if (qq_col_ct > variant_ct) {
5071         logerrprintfww("Error: Too few variants to compute %u PCs with \"--pca approx\" (%" PRIuPTR " required).\n", pc_ct, qq_col_ct);
5072         goto CalcPca_ret_DEGENERATE_DATA;
5073       }
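      // e.g. pc_ct=10 gives qq_col_ct = (10+1) * (2*10) = 220, so at least
      // 220 variants are required.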
5074 #ifndef LAPACK_ILP64
5075       if (unlikely((pca_row_ct * S_CAST(uint64_t, qq_col_ct)) > 0x7effffff)) {
5076         logerrputs("Error: \"--pca approx\" problem instance too large for this " PROG_NAME_STR " build.  If\nthis is really the computation you want, use a " PROG_NAME_STR " build with large-matrix\nsupport.\n");
5077         goto CalcPca_ret_INCONSISTENT_INPUT;
5078       }
5079 #endif
5080       const double variant_ct_recip = 1.0 / u31tod(variant_ct);
5081 
5082       const uintptr_t gg_size = pca_sample_ct * pc_ct_x2;
5083       __CLPK_integer svd_rect_lwork;
5084 #ifdef LAPACK_ILP64
5085       GetSvdRectLwork(MAXV(pca_sample_ct, pca_row_ct), qq_col_ct, &svd_rect_lwork);
5086 #else
5087       if (unlikely(GetSvdRectLwork(MAXV(pca_sample_ct, pca_row_ct), qq_col_ct, &svd_rect_lwork))) {
5088         logerrputs("Error: \"--pca approx\" problem instance too large for this " PROG_NAME_STR " build.  If\nthis is really the computation you want, use a " PROG_NAME_STR " build with large-matrix\nsupport.\n");
5089         goto CalcPca_ret_INCONSISTENT_INPUT;
5090       }
5091 #endif
5092       uintptr_t svd_rect_wkspace_size = (svd_rect_lwork + qq_col_ct * qq_col_ct) * sizeof(double);
5093       if (svd_rect_wkspace_size < writebuf_alloc) {
5094         // used as writebuf later
5095         svd_rect_wkspace_size = writebuf_alloc;
5096       }
5097 
5098       unsigned char* svd_rect_wkspace;
5099       double* ss;
5100       double* g1;
5101       if (unlikely(
5102               bigstack_alloc_d(qq_col_ct, &ss) ||
5103               bigstack_alloc_d(pca_row_ct * qq_col_ct, &qq) ||
5104               bigstack_alloc_dp(calc_thread_ct, &ctx.y_transpose_bufs) ||
5105               bigstack_alloc_dp(calc_thread_ct, &ctx.g2_bb_part_bufs) ||
5106               bigstack_alloc_uc(svd_rect_wkspace_size, &svd_rect_wkspace) ||
5107               bigstack_alloc_d(gg_size, &g1))) {
5108         goto CalcPca_ret_NOMEM;
5109       }
5110       const uintptr_t yy_alloc_incr = RoundUpPow2(kPcaVariantBlockSize * pca_sample_ct * sizeof(double), kCacheline);
5111       const uintptr_t b_size = pca_sample_ct * qq_col_ct;
5112       const uintptr_t g2_bb_part_alloc = RoundUpPow2(b_size * sizeof(double), kCacheline);
5113       // bugfix (16 Jan 2020)
5114       const uintptr_t per_thread_alloc = 3 * yy_alloc_incr + g2_bb_part_alloc;
5115 
5116       const uintptr_t bigstack_avail = bigstack_left();
5117       if (per_thread_alloc * calc_thread_ct > bigstack_avail) {
5118         if (unlikely(bigstack_avail < per_thread_alloc)) {
5119           goto CalcPca_ret_NOMEM;
5120         }
5121         calc_thread_ct = bigstack_avail / per_thread_alloc;
5122       }
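      // Rough scale of these buffers, assuming pca_sample_ct=100000 and
      // pc_ct=10 (so qq_col_ct=220): yy_alloc_incr ~= 240 * 100000 * 8 bytes
      // ~= 192 MB, g2_bb_part_alloc ~= 100000 * 220 * 8 bytes ~= 176 MB, i.e.
      // ~752 MB per compute thread before the shared qq allocation.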
5123       const uintptr_t yy_main_alloc = RoundUpPow2(kPcaVariantBlockSize * calc_thread_ct * pca_sample_ct * sizeof(double), kCacheline);
5124       ctx.yy_bufs[0] = S_CAST(double*, bigstack_alloc_raw(yy_main_alloc));
5125       ctx.yy_bufs[1] = S_CAST(double*, bigstack_alloc_raw(yy_main_alloc));
5126       for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
5127         ctx.y_transpose_bufs[tidx] = S_CAST(double*, bigstack_alloc_raw(yy_alloc_incr));
5128         ctx.g2_bb_part_bufs[tidx] = S_CAST(double*, bigstack_alloc_raw(g2_bb_part_alloc));
5129       }
5130       FillGaussianDArr(gg_size / 2, max_thread_ct, sfmtp, g1);
5131       ctx.g1 = g1;
5132 #ifdef __APPLE__
5133       fputs("Projecting random vectors... ", stdout);
5134 #else
5135       printf("Projecting random vectors (%u compute thread%s)... ", calc_thread_ct, (calc_thread_ct == 1)? "" : "s");
5136 #endif
5137       fflush(stdout);
5138       for (uint32_t iter_idx = 0; iter_idx <= pc_ct; ++iter_idx) {
5139         // kjg_fpca_XTXA(), kjg_fpca_XA()
5140         if (iter_idx < pc_ct) {
5141           SetThreadFuncAndData(CalcPcaXtxaThread, &ctx, &tg);
5142         } else {
5143           SetThreadFuncAndData(CalcPcaXaThread, &ctx, &tg);
5144         }
5145         for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
5146           ZeroDArr(gg_size, ctx.g2_bb_part_bufs[tidx]);
5147         }
5148         double* qq_iter = &(qq[iter_idx * pc_ct_x2]);  // offset on first row
5149         ctx.qq = qq_iter;
5150 
5151         // Main workflow:
5152         // 1. Set n=0, load batch 0
5153         //
5154         // 2. Spawn threads processing batch n
5155         // 3. Increment n by 1
5156         // 4. Load batch n unless eof
5157         // 5. Join threads
5158         // 6. Goto step 2 unless eof
5159         //
5160         // 7. Assemble next g1 by summing g2_parts
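        // yy_bufs[] is the double buffer behind this pipeline: `parity`
        // selects the half the compute threads read while the main thread
        // loads the next batch into the other half.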
5161         uint32_t cur_batch_size = calc_thread_ct * kPcaVariantBlockSize;
5162         uint32_t variant_idx = 0;
5163         uintptr_t variant_uidx = 0;
5164         uintptr_t allele_idx_base = 0;
5165         uint32_t incomplete_allele_idx = 0;
5166         uint32_t parity = 0;
5167         uint32_t is_not_first_block = 0;
5168         while (1) {
5169           if (!IsLastBlock(&tg)) {
5170             reterr = LoadCenteredVarmajBlock(pca_sample_include, pssi, variant_include, allele_idx_offsets, allele_freqs, 1, is_haploid, pca_sample_ct, variant_ct, simple_pgrp, ctx.yy_bufs[parity], nullptr, &cur_batch_size, &variant_idx, &variant_uidx, &allele_idx_base, &cur_allele_ct, &incomplete_allele_idx, &pgv, allele_1copy_buf);
5171             if (unlikely(reterr)) {
5172               goto CalcPca_ret_PGR_FAIL;
5173             }
5174           }
5175           if (is_not_first_block) {
5176             JoinThreads(&tg);
5177             if (IsLastBlock(&tg)) {
5178               break;
5179             }
5180           }
5181           ctx.cur_batch_size = cur_batch_size;
5182           if (variant_idx == variant_ct) {
5183             DeclareLastThreadBlock(&tg);
5184             cur_batch_size = 0;
5185           }
5186           if (unlikely(SpawnThreads(&tg))) {
5187             goto CalcPca_ret_THREAD_CREATE_FAIL;
5188           }
5189           is_not_first_block = 1;
5190           parity = 1 - parity;
5191         }
5192         if (iter_idx < pc_ct) {
5193           memcpy(g1, ctx.g2_bb_part_bufs[0], gg_size * sizeof(double));
5194           for (uint32_t tidx = 1; tidx != calc_thread_ct; ++tidx) {
5195             const double* cur_g2_part = ctx.g2_bb_part_bufs[tidx];
5196             for (uintptr_t ulii = 0; ulii != gg_size; ++ulii) {
5197               g1[ulii] += cur_g2_part[ulii];
5198             }
5199           }
5200           for (uintptr_t ulii = 0; ulii != gg_size; ++ulii) {
5201             g1[ulii] *= variant_ct_recip;
5202           }
5203         }
5204 #ifdef __APPLE__
5205         printf("\rProjecting random vectors... %u/%u", iter_idx + 1, pc_ct + 1);
5206 #else
5207         printf("\rProjecting random vectors (%u compute thread%s)... %u/%u", calc_thread_ct, (calc_thread_ct == 1)? "" : "s", iter_idx + 1, pc_ct + 1);
5208 #endif
5209         fflush(stdout);
5210       }
5211       fputs(".\n", stdout);
5212       logputs("Computing SVD of Krylov matrix... ");
5213       fflush(stdout);
5214       BLAS_SET_NUM_THREADS(max_thread_ct);
5215       IntErr svd_rect_err = SvdRect(pca_row_ct, qq_col_ct, svd_rect_lwork, qq, ss, svd_rect_wkspace);
5216       if (unlikely(svd_rect_err)) {
5217         logputs("\n");
5218         snprintf(g_logbuf, kLogbufSize, "Error: Failed to compute SVD of Krylov matrix (DGESVD info=%d).\n", S_CAST(int32_t, svd_rect_err));
5219         goto CalcPca_ret_DEGENERATE_DATA_2;
5220       }
5221       BLAS_SET_NUM_THREADS(1);
5222       logputs("done.\nRecovering top PCs from range approximation... ");
5223       fflush(stdout);
5224 
5225       // kjg_fpca_XTB()
5226       for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
5227         ZeroDArr(b_size, ctx.g2_bb_part_bufs[tidx]);
5228       }
5229       SetThreadFuncAndData(CalcPcaXtbThread, &ctx, &tg);
5230       ctx.qq = qq;
5231       uint32_t cur_batch_size = calc_thread_ct * kPcaVariantBlockSize;
5232       uint32_t variant_idx = 0;
5233       uintptr_t variant_uidx = 0;
5234       uintptr_t allele_idx_base = 0;
5235       uint32_t incomplete_allele_idx = 0;
5236       uint32_t parity = 0;
5237       uint32_t is_not_first_block = 0;
5238       while (1) {
5239         if (!IsLastBlock(&tg)) {
5240           reterr = LoadCenteredVarmajBlock(pca_sample_include, pssi, variant_include, allele_idx_offsets, allele_freqs, 1, is_haploid, pca_sample_ct, variant_ct, simple_pgrp, ctx.yy_bufs[parity], nullptr, &cur_batch_size, &variant_idx, &variant_uidx, &allele_idx_base, &cur_allele_ct, &incomplete_allele_idx, &pgv, allele_1copy_buf);
5241           if (unlikely(reterr)) {
5242             // this error *didn't* happen on an earlier pass, so assign blame
5243             // to I/O instead
5244             goto CalcPca_ret_REWIND_FAIL;
5245           }
5246         }
5247         if (is_not_first_block) {
5248           JoinThreads(&tg);
5249           if (IsLastBlock(&tg)) {
5250             break;
5251           }
5252         }
5253         ctx.cur_batch_size = cur_batch_size;
5254         if (variant_idx == variant_ct) {
5255           DeclareLastThreadBlock(&tg);
5256           cur_batch_size = 0;
5257         }
5258         if (unlikely(SpawnThreads(&tg))) {
5259           goto CalcPca_ret_THREAD_CREATE_FAIL;
5260         }
5261         is_not_first_block = 1;
5262         parity = 1 - parity;
5263       }
5264       double* bb = ctx.g2_bb_part_bufs[0];
5265       for (uint32_t tidx = 1; tidx != calc_thread_ct; ++tidx) {
5266         const double* cur_bb_part = ctx.g2_bb_part_bufs[tidx];
5267         for (uintptr_t ulii = 0; ulii != b_size; ++ulii) {
5268           bb[ulii] += cur_bb_part[ulii];
5269         }
5270       }
5271       BLAS_SET_NUM_THREADS(max_thread_ct);
5272       svd_rect_err = SvdRect(pca_sample_ct, qq_col_ct, svd_rect_lwork, bb, ss, svd_rect_wkspace);
5273       if (unlikely(svd_rect_err)) {
5274         logputs("\n");
5275         snprintf(g_logbuf, kLogbufSize, "Error: Failed to compute SVD of final matrix (DGESVD info=%d).\n", S_CAST(int32_t, svd_rect_err));
5276         goto CalcPca_ret_DEGENERATE_DATA_2;
5277       }
5278       BLAS_SET_NUM_THREADS(1);
5279       logputs("done.\n");
5280       eigvecs_smaj = g1;
5281       for (uint32_t sample_idx = 0; sample_idx != pca_sample_ct; ++sample_idx) {
5282         memcpy(&(eigvecs_smaj[sample_idx * S_CAST(uintptr_t, pc_ct)]), &(bb[sample_idx * qq_col_ct]), pc_ct * sizeof(double));
5283       }
5284       for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
5285         eigvals[pc_idx] = ss[pc_idx] * ss[pc_idx] * variant_ct_recip;
5286       }
5287       writebuf = R_CAST(char*, svd_rect_wkspace);
5288       // bugfix (25 Jun 2018): eigvals[] computation was missing a divide-by-2
5289       // somewhere, in both diploid and haploid cases.
5290       // update (30 Jan 2019): er, actually, no.
5291       if (is_haploid) {
5292         for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
5293           eigvals[pc_idx] *= 0.5;
5294         }
5295       }
5296     } else {
5297       __CLPK_integer lwork;
5298       __CLPK_integer liwork;
5299       uintptr_t wkspace_byte_ct;
5300       if (unlikely(GetExtractEigvecsLworks(pca_sample_ct, pc_ct, &lwork, &liwork, &wkspace_byte_ct))) {
5301         goto CalcPca_ret_NOMEM;
5302       }
5303       const uintptr_t eigvecs_smaj_alloc = pc_ct * pca_sample_ct * sizeof(double);
5304       if (wkspace_byte_ct < eigvecs_smaj_alloc) {
5305         wkspace_byte_ct = eigvecs_smaj_alloc;
5306       }
5307       double* reverse_eigvecs_pcmaj;
5308       unsigned char* extract_eigvecs_wkspace;
5309       if (unlikely(
5310               bigstack_alloc_d(pc_ct * pca_sample_ct, &reverse_eigvecs_pcmaj) ||
5311               bigstack_alloc_uc(wkspace_byte_ct, &extract_eigvecs_wkspace))) {
5312         goto CalcPca_ret_NOMEM;
5313       }
5314       logprintf("Extracting eigenvalue%s and eigenvector%s... ", (pc_ct == 1)? "" : "s", (pc_ct == 1)? "" : "s");
5315       fflush(stdout);
5316       BLAS_SET_NUM_THREADS(max_thread_ct);
5317       // not putting unlikely() here for now.
5318       if (ExtractEigvecs(pca_sample_ct, pc_ct, lwork, liwork, grm, eigvals, reverse_eigvecs_pcmaj, extract_eigvecs_wkspace)) {
5319         logerrputs("Error: Failed to extract eigenvector(s) from GRM.\n");
5320         goto CalcPca_ret_DEGENERATE_DATA;
5321       }
5322       BLAS_SET_NUM_THREADS(1);
5323       logputs("done.\n");
5324       eigvecs_smaj = R_CAST(double*, extract_eigvecs_wkspace);
5325       BigstackShrinkTop(eigvecs_smaj, eigvecs_smaj_alloc);
5326       if (unlikely(bigstack_alloc_c(writebuf_alloc, &writebuf))) {
5327         goto CalcPca_ret_NOMEM;
5328       }
5329 
5330       // ExtractEigvecs() results are in reverse order, and we also need to
5331       // transpose eigenvectors to sample-major
5332       const uint32_t pc_ct_m1 = pc_ct - 1;
5333       const uint32_t pc_ct_div2 = pc_ct / 2;
5334       for (uint32_t pc_idx = 0; pc_idx != pc_ct_div2; ++pc_idx) {
5335         double tmp_eigval = eigvals[pc_idx];
5336         eigvals[pc_idx] = eigvals[pc_ct_m1 - pc_idx];
5337         eigvals[pc_ct_m1 - pc_idx] = tmp_eigval;
5338       }
5339       double* eigvecs_smaj_iter = eigvecs_smaj;
5340       for (uint32_t sample_idx = 0; sample_idx != pca_sample_ct; ++sample_idx) {
5341         uintptr_t pc_inv_idx = pc_ct;
5342         const double* reverse_eigvecs_col = &(reverse_eigvecs_pcmaj[sample_idx]);
5343         do {
5344           --pc_inv_idx;
5345           *eigvecs_smaj_iter++ = reverse_eigvecs_col[pc_inv_idx * pca_sample_ct];
5346         } while (pc_inv_idx);
5347       }
5348     }
5349     // (later: --pca-cluster-names, --pca-clusters)
5350     char* writebuf_flush = &(writebuf[kMaxMediumLine]);
5351 
5352     if (wts_requested) {
5353       CalcPcaVarWtsCtx vwctx;
5354       vwctx.sample_ct = pca_sample_ct;
5355       vwctx.pc_ct = pc_ct;
5356       vwctx.sample_wts_smaj = eigvecs_smaj;
5357       for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
5358         eigval_inv_sqrts[pc_idx] = 1.0 / sqrt(eigvals[pc_idx]);
5359       }
5360 
5361       const uint32_t allele_wts = (pca_flags / kfPcaAlleleWts) & 1;
5362       const uint32_t output_zst = (pca_flags / kfPcaVarZs) & 1;
5363       if (allele_wts) {
5364         OutnameZstSet(".eigenvec.allele", output_zst, outname_end);
5365       } else {
5366         OutnameZstSet(".eigenvec.var", output_zst, outname_end);
5367       }
5368       reterr = InitCstream(outname, 0, output_zst, max_thread_ct, overflow_buf_size, writebuf, R_CAST(unsigned char*, &(writebuf[overflow_buf_size])), &css);
5369       if (unlikely(reterr)) {
5370         goto CalcPca_ret_1;
5371       }
5372       cswritep = writebuf;
5373       *cswritep++ = '#';
5374       if (chr_buf) {
5375         cswritep = strcpya_k(cswritep, "CHROM\t");
5376       }
5377       if (pca_flags & kfPcaVcolPos) {
5378         cswritep = strcpya_k(cswritep, "POS\t");
5379       } else {
5380         variant_bps = nullptr;
5381       }
5382       cswritep = strcpya_k(cswritep, "ID");
5383       if (pca_flags & kfPcaVcolRef) {
5384         cswritep = strcpya_k(cswritep, "\tREF");
5385       }
5386       if (pca_flags & kfPcaVcolAlt1) {
5387         cswritep = strcpya_k(cswritep, "\tALT1");
5388       }
5389       if (pca_flags & kfPcaVcolAlt) {
5390         cswritep = strcpya_k(cswritep, "\tALT");
5391       }
5392       if (allele_wts) {
5393         cswritep = strcpya_k(cswritep, "\tA1");
5394       }
5395       if (pca_flags & kfPcaVcolAx) {
5396         cswritep = strcpya_k(cswritep, "\tAX");
5397       }
5398       if (pca_flags & kfPcaVcolMaj) {
5399         cswritep = strcpya_k(cswritep, "\tMAJ");
5400       }
5401       if (pca_flags & kfPcaVcolNonmaj) {
5402         cswritep = strcpya_k(cswritep, "\tNONMAJ");
5403       }
5404       for (uint32_t pc_idx = 1; pc_idx <= pc_ct; ++pc_idx) {
5405         cswritep = strcpya_k(cswritep, "\tPC");
5406         cswritep = u32toa(pc_idx, cswritep);
5407       }
5408       AppendBinaryEoln(&cswritep);
5409 
5410       // Main workflow:
5411       // 1. Set n=0, load batch 0
5412       //
5413       // 2. Spawn threads processing batch n
5414       // 3. If n>0, write results and update projection for block (n-1)
5415       // 4. Increment n by 1
5416       // 5. Load batch n unless eof
5417       // 6. Join threads
5418       // 7. Goto step 2 unless eof
5419       //
5420       // 8. Write results and update projection for last block
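      // Batches ping-pong between yy_bufs[0]/yy_bufs[1] and the two var_wts
      // halves, indexed by 'parity', so the main thread can write batch n-1's
      // results and load batch n+1 while the workers transform batch n.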
5421 #ifndef __APPLE__
5422       if (output_zst) {
5423         // compression is relatively expensive?
5424         calc_thread_ct = 1;
5425       }
5426 #endif
5427       uintptr_t var_wts_part_size;
5428       double* var_wts = qq;
5429       if (var_wts) {
5430         var_wts_part_size = (MINV(pca_row_ct, calc_thread_ct * kPcaVariantBlockSize)) * S_CAST(uintptr_t, pc_ct);
5431         vwctx.yy_bufs[0] = ctx.yy_bufs[0];
5432         vwctx.yy_bufs[1] = ctx.yy_bufs[1];
5433         vwctx.var_wts = ctx.qq;
5434       } else {
5435         // non-approximate PCA, some buffers have not been allocated yet
5436 
5437         // if grm[] (which we no longer need) has at least as much remaining
5438         // space as bigstack, allocate from grm
5439         unsigned char* arena_bottom = R_CAST(unsigned char*, grm);
5440         unsigned char* arena_top = bigstack_mark;
5441         uintptr_t arena_avail = arena_top - arena_bottom;
5442         if (arena_avail < bigstack_left()) {
5443           arena_bottom = g_bigstack_base;
5444           arena_top = g_bigstack_end;
5445           arena_avail = bigstack_left();
5446         }
5447         const uintptr_t var_wts_part_alloc = RoundUpPow2(2 * kPcaVariantBlockSize * sizeof(double) * pc_ct, kCacheline);
5448         const uintptr_t yy_alloc_incr = RoundUpPow2(kPcaVariantBlockSize * pca_sample_ct * sizeof(double), kCacheline);
5449         const uintptr_t per_thread_alloc = 2 * yy_alloc_incr + var_wts_part_alloc;
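        // per worker thread: two genotype-block buffers (for the
        // double-buffered load) plus its slice of the two var_wts halves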
5450         if (per_thread_alloc * calc_thread_ct > arena_avail) {
5451           if (unlikely(arena_avail < per_thread_alloc)) {
5452             goto CalcPca_ret_NOMEM;
5453           }
5454           calc_thread_ct = arena_avail / per_thread_alloc;
5455         }
5456         const uintptr_t yy_main_alloc = RoundUpPow2(kPcaVariantBlockSize * calc_thread_ct * pca_sample_ct * sizeof(double), kCacheline);
5457         vwctx.yy_bufs[0] = S_CAST(double*, arena_alloc_raw(yy_main_alloc, &arena_bottom));
5458         vwctx.yy_bufs[1] = S_CAST(double*, arena_alloc_raw(yy_main_alloc, &arena_bottom));
5459         var_wts_part_size = (MINV(pca_row_ct, calc_thread_ct * kPcaVariantBlockSize)) * S_CAST(uintptr_t, pc_ct);
5460         var_wts = S_CAST(double*, arena_alloc_raw_rd(2 * var_wts_part_size * sizeof(double), &arena_bottom));
5461         vwctx.var_wts = var_wts;
5462 #ifndef NDEBUG
5463         if (arena_top == g_bigstack_end) {
5464           // we shouldn't make any more allocations, but just in case...
5465           g_bigstack_base = arena_bottom;
5466         }
5467 #endif
5468       }
5469       if (unlikely(SetThreadCt(calc_thread_ct, &tg))) {
5470         goto CalcPca_ret_NOMEM;
5471       }
5472       SetThreadFuncAndData(CalcPcaVarWtsThread, &vwctx, &tg);
5473       uint32_t prev_batch_size = 0;
5474       uint32_t cur_batch_size = calc_thread_ct * kPcaVariantBlockSize;
5475 
5476       uint32_t variant_idx_load = 0;
5477       uintptr_t variant_uidx_load = 0;
5478       uintptr_t allele_idx_base_load = 0;
5479       uint32_t cur_allele_ct_load = 2;
5480       uint32_t incomplete_allele_idx_load = 0;
5481 
5482       uint32_t variant_idx_write = 0;
5483       uintptr_t variant_uidx_write = 0;
5484       uintptr_t allele_idx_offset_write = 0;
5485       // cur_allele_ct = 2;
5486       uint32_t incomplete_allele_idx_write = 0;
5487       uint32_t chr_fo_idx = UINT32_MAX;
5488       uint32_t chr_end = 0;
5489       uint32_t chr_buf_blen = 0;
5490 
5491       uint32_t parity = 0;
5492       uint32_t is_not_first_block = 0;
5493       while (1) {
5494         if (!IsLastBlock(&tg)) {
5495           reterr = LoadCenteredVarmajBlock(pca_sample_include, pssi, variant_include, allele_idx_offsets, allele_freqs, 1, is_haploid, pca_sample_ct, variant_ct, simple_pgrp, vwctx.yy_bufs[parity], nullptr, &cur_batch_size, &variant_idx_load, &variant_uidx_load, &allele_idx_base_load, &cur_allele_ct_load, &incomplete_allele_idx_load, &pgv, allele_1copy_buf);
5496           if (unlikely(reterr)) {
5497             goto CalcPca_ret_PGR_FAIL;
5498           }
5499         }
5500         if (is_not_first_block) {
5501           JoinThreads(&tg);
5502         }
5503         if (!IsLastBlock(&tg)) {
5504           vwctx.cur_batch_size = cur_batch_size;
5505           if (variant_idx_load == variant_ct) {
5506             DeclareLastThreadBlock(&tg);
5507           }
5508           if (unlikely(SpawnThreads(&tg))) {
5509             goto CalcPca_ret_THREAD_CREATE_FAIL;
5510           }
5511         }
5512         parity = 1 - parity;
5513         if (is_not_first_block) {
5514           // write *previous* block results
5515           const double* var_wts_iter = &(var_wts[parity * var_wts_part_size]);
5516           // (todo: update projection here)
5517           if (allele_wts) {
5518             reterr = FlushAlleleWts(variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, var_wts_iter, eigval_inv_sqrts, prev_batch_size, pc_ct, pca_flags, &css, &cswritep, chr_buf, &variant_idx_write, &variant_uidx_write, &allele_idx_offset_write, &cur_allele_ct, &incomplete_allele_idx_write, &chr_fo_idx, &chr_end, &chr_buf_blen);
5519           } else {
5520             reterr = FlushBiallelicVarWts(variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, maj_alleles, var_wts_iter, eigval_inv_sqrts, prev_batch_size, pc_ct, pca_flags, &css, &cswritep, chr_buf, &variant_idx_write, &variant_uidx_write, &chr_fo_idx, &chr_end, &chr_buf_blen);
5521           }
5522           if (unlikely(reterr)) {
5523             // only write_fail possible in practice
5524             goto CalcPca_ret_1;
5525           }
5526           if (variant_idx_write == variant_ct) {
5527             break;
5528           }
5529         }
5530         is_not_first_block = 1;
5531         prev_batch_size = cur_batch_size;
5532       }
5533       if (unlikely(CswriteCloseNull(&css, cswritep))) {
5534         goto CalcPca_ret_WRITE_FAIL;
5535       }
5536       logprintfww("--pca%s: %s weights written to %s .\n", is_approx? " approx" : "", allele_wts? "Allele" : "Variant", outname);
5537     }
5538 
5539     snprintf(outname_end, kMaxOutfnameExtBlen, ".eigenvec");
5540     if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
5541       goto CalcPca_ret_OPEN_FAIL;
5542     }
5543     char* write_iter = writebuf;
5544     *write_iter++ = '#';
5545     if (write_fid) {
5546       write_iter = strcpya_k(write_iter, "FID\t");
5547     }
5548     write_iter = strcpya_k(write_iter, "IID");
5549     if (write_sid) {
5550       write_iter = strcpya_k(write_iter, "\tSID");
5551     }
5552     for (uint32_t pc_idx = 1; pc_idx <= pc_ct; ++pc_idx) {
5553       write_iter = strcpya_k(write_iter, "\tPC");
5554       write_iter = u32toa(pc_idx, write_iter);
5555     }
5556     AppendBinaryEoln(&write_iter);
5557     const uint32_t sample_ct = pca_sample_ct;
5558     uintptr_t sample_uidx_base = 0;
5559     uintptr_t sample_include_bits = sample_include[0];
5560     for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
5561       const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
5562       const char* cur_sample_id = &(sample_ids[max_sample_id_blen * sample_uidx]);
5563       if (!write_fid) {
5564         cur_sample_id = AdvPastDelim(cur_sample_id, '\t');
5565       }
5566       write_iter = strcpya(write_iter, cur_sample_id);
5567       if (write_sid) {
5568         *write_iter++ = '\t';
5569         if (sids) {
5570           write_iter = strcpya(write_iter, &(sids[max_sid_blen * sample_uidx]));
5571         } else {
5572           *write_iter++ = '0';
5573         }
5574       }
5575       double* sample_wts_iter = &(eigvecs_smaj[sample_idx * S_CAST(uintptr_t, pc_ct)]);
5576       // todo: read from proj_sample_wts instead when pca_sample_include bit
5577       // not set
5578       for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
5579         *write_iter++ = '\t';
5580         write_iter = dtoa_g(*sample_wts_iter++, write_iter);
5581       }
5582       AppendBinaryEoln(&write_iter);
5583       if (unlikely(fwrite_ck(writebuf_flush, outfile, &write_iter))) {
5584         goto CalcPca_ret_WRITE_FAIL;
5585       }
5586     }
5587     if (unlikely(fclose_flush_null(writebuf_flush, write_iter, &outfile))) {
5588       goto CalcPca_ret_WRITE_FAIL;
5589     }
5590 
5591     snprintf(outname_end, kMaxOutfnameExtBlen, ".eigenval");
5592     if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
5593       goto CalcPca_ret_OPEN_FAIL;
5594     }
5595     write_iter = writebuf;
5596     for (uint32_t pc_idx = 0; pc_idx != pc_ct; ++pc_idx) {
5597       write_iter = dtoa_g(eigvals[pc_idx], write_iter);
5598       AppendBinaryEoln(&write_iter);
5599     }
5600     if (unlikely(fclose_flush_null(writebuf_flush, write_iter, &outfile))) {
5601       goto CalcPca_ret_WRITE_FAIL;
5602     }
5603     *outname_end = '\0';
5604     logprintfww("--pca%s: Eigenvector%s written to %s.eigenvec , and eigenvalue%s written to %s.eigenval .\n", is_approx? " approx" : "", (pc_ct == 1)? "" : "s", outname, (pc_ct == 1)? "" : "s", outname);
5605   }
5606   while (0) {
5607   CalcPca_ret_NOMEM:
5608     reterr = kPglRetNomem;
5609     break;
5610   CalcPca_ret_OPEN_FAIL:
5611     reterr = kPglRetOpenFail;
5612     break;
5613   CalcPca_ret_PGR_FAIL:
5614     PgenErrPrintN(reterr);
5615     break;
5616   CalcPca_ret_REWIND_FAIL:
5617     logerrprintfww(kErrprintfRewind, ".pgen file");
    reterr = kPglRetRewindFail;
5618     break;
5619   CalcPca_ret_WRITE_FAIL:
5620     reterr = kPglRetWriteFail;
5621     break;
5622   CalcPca_ret_INCONSISTENT_INPUT:
5623     reterr = kPglRetInconsistentInput;
5624     break;
5625   CalcPca_ret_THREAD_CREATE_FAIL:
5626     reterr = kPglRetThreadCreateFail;
5627     break;
5628   CalcPca_ret_DEGENERATE_DATA_2:
5629     logerrputsb();
5630   CalcPca_ret_DEGENERATE_DATA:
5631     reterr = kPglRetDegenerateData;
5632     break;
5633   }
5634  CalcPca_ret_1:
5635   CleanupThreads(&tg);
5636   BLAS_SET_NUM_THREADS(1);
5637   CswriteCloseCond(&css, cswritep);
5638   fclose_cond(outfile);
5639   if (grm) {
5640     // nothing after --pca in the plink2 order of operations uses grm[]
5641     BigstackReset(grm);
5642   } else {
5643     BigstackReset(bigstack_mark);
5644   }
5645   return reterr;
5646 }
5647 #endif
5648 
5649 // to test: do we actually want cur_dosage_ints to be uint64_t* instead of
5650 // uint32_t*?
5651 // also, should this be moved to plink2_common?
5652 void FillCurDdosageInts(const uintptr_t* genovec_buf, const uintptr_t* dosage_present, const Dosage* dosage_main_buf, uint32_t sample_ct, uint32_t dosage_ct, uint32_t is_diploid_p1, uint64_t* cur_ddosage_ints) {
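  // genotypes 0/1/2/missing map to 0, kDosageMid, kDosageMax, 0 (table
  // entries 0/2/4/6); InitLookup16x8bx2() expands these four seed entries to
  // all 16 two-genotype combinations, letting GenoarrLookup16x8bx2()
  // translate two genotypes (4 bits) per lookup.  missingness is tracked
  // separately by the caller, so missing -> 0 is safe here.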
5653   uint64_t lookup_table[32] ALIGNV16;
5654   lookup_table[0] = 0;
5655   lookup_table[2] = is_diploid_p1 * kDosageMid;
5656   lookup_table[4] = is_diploid_p1 * kDosageMax;
5657   lookup_table[6] = 0;
5658   InitLookup16x8bx2(lookup_table);
5659   GenoarrLookup16x8bx2(genovec_buf, lookup_table, sample_ct, cur_ddosage_ints);
5660   if (!dosage_ct) {
5661     return;
5662   }
5663   uintptr_t sample_idx_base = 0;
5664   uintptr_t cur_bits = dosage_present[0];
5665   for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
5666     const uintptr_t sample_idx = BitIter1(dosage_present, &sample_idx_base, &cur_bits);
5667     cur_ddosage_ints[sample_idx] = dosage_main_buf[dosage_idx] * is_diploid_p1;
5668   }
5669 }
5670 
5671 CONSTI32(kScoreVariantBlockSize, 240);
5672 
5673 typedef struct CalcScoreCtxStruct {
5674   uint32_t score_final_col_ct;
5675   uint32_t sample_ct;
5676 
5677   double* dosages_vmaj[2];
5678   double* score_coefs_cmaj[2];
5679 
5680   uint32_t cur_batch_size;
5681 
5682   double* final_scores_cmaj;
5683 } CalcScoreCtx;
5684 
5685 THREAD_FUNC_DECL CalcScoreThread(void* raw_arg) {
5686   ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
5687   // don't bother to explicitly multithread for now
5688   assert(!arg->tidx);
5689   CalcScoreCtx* ctx = S_CAST(CalcScoreCtx*, arg->sharedp->context);
5690 
5691   double* final_scores_cmaj = ctx->final_scores_cmaj;
5692   const uint32_t score_final_col_ct = ctx->score_final_col_ct;
5693   const uint32_t sample_ct = ctx->sample_ct;
5694   uint32_t parity = 0;
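  // each iteration folds one variant block into the running totals:
  // final_scores_cmaj (score_final_col_ct x sample_ct) +=
  //   score_coefs_cmaj (score_final_col_ct x cur_batch_size, row stride
  //   kScoreVariantBlockSize) * dosages_vmaj (cur_batch_size x sample_ct)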
5695   do {
5696     const uint32_t cur_batch_size = ctx->cur_batch_size;
5697     if (cur_batch_size) {
5698       RowMajorMatrixMultiplyStridedIncr(ctx->score_coefs_cmaj[parity], ctx->dosages_vmaj[parity], score_final_col_ct, kScoreVariantBlockSize, sample_ct, sample_ct, cur_batch_size, sample_ct, final_scores_cmaj);
5699     }
5700     parity = 1 - parity;
5701   } while (!THREAD_BLOCK_FINISH(arg));
5702   THREAD_RETURN;
5703 }
5704 
5705 typedef struct ParsedQscoreRangeStruct {
5706   char* range_name;
5707   double lbound;
5708   double ubound;
5709 } ParsedQscoreRange;
5710 
5711 PglErr ScoreReport(const uintptr_t* sample_include, const SampleIdInfo* siip, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uintptr_t* variant_include, const ChrInfo* cip, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const double* allele_freqs, const ScoreInfo* score_info_ptr, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_variant_id_slen, uint32_t xchr_model, uint32_t max_thread_ct, PgenReader* simple_pgrp, char* outname, char* outname_end) {
5712   unsigned char* bigstack_mark = g_bigstack_base;
5713   unsigned char* bigstack_end_mark = g_bigstack_end;
5714   uintptr_t line_idx = 0;
5715   char* cswritep = nullptr;
5716   PglErr reterr = kPglRetSuccess;
5717   TextStream score_txs;
5718   ThreadGroup tg;
5719   CompressStreamState css;
5720   PreinitTextStream(&score_txs);
5721   PreinitThreads(&tg);
5722   PreinitCstream(&css);
5723   {
5724     const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
5725     if (!xchr_model) {
5726       uint32_t x_code;
5727       if (XymtExists(cip, kChrOffsetX, &x_code)) {
5728         uint32_t x_chr_fo_idx = cip->chr_idx_to_foidx[x_code];
5729         uint32_t x_start = cip->chr_fo_vidx_start[x_chr_fo_idx];
5730         uint32_t x_end = cip->chr_fo_vidx_start[x_chr_fo_idx + 1];
5731         if (!AllBitsAreZero(variant_include, x_start, x_end)) {
5732           uintptr_t* variant_include_no_x;
5733           if (unlikely(bigstack_alloc_w(raw_variant_ctl, &variant_include_no_x))) {
5734             goto ScoreReport_ret_NOMEM;
5735           }
5736           memcpy(variant_include_no_x, variant_include, raw_variant_ctl * sizeof(intptr_t));
5737           ClearBitsNz(x_start, x_end, variant_include_no_x);
5738           variant_include = variant_include_no_x;
5739         }
5740       }
5741     } else if (xchr_model == 2) {
5742       xchr_model = 0;
5743     }
5744     // now xchr_model is set iff it's 1
5745 
5746     const ScoreFlags flags = score_info_ptr->flags;
5747     const uint32_t output_zst = (flags / kfScoreZs) & 1;
5748     uint32_t* variant_id_htable = nullptr;
5749     uint32_t variant_id_htable_size = 0;
5750     uint32_t* variant_include_cumulative_popcounts = nullptr;
5751     uintptr_t* qsr_include = nullptr;
5752     char** range_names = nullptr;
5753     uintptr_t qsr_ct = 0;
5754     if (score_info_ptr->qsr_range_fname) {
5755       // Limit this to ~1/8 of available memory, since memory may be tight with
5756       // many ranges.
5757       variant_id_htable_size = GetHtableFastSize(variant_ct);
5758       const uintptr_t htable_size_limit = bigstack_left() / (8 * sizeof(int32_t));
5759       if (variant_id_htable_size > htable_size_limit) {
5760         variant_id_htable_size = htable_size_limit;
5761         const uint32_t htable_size_min = GetHtableMinSize(variant_ct);
5762         if (htable_size_min > variant_id_htable_size) {
5763           variant_id_htable_size = htable_size_min;
5764         }
5765       }
5766       if (unlikely(
5767               bigstack_alloc_u32(variant_id_htable_size, &variant_id_htable))) {
5768         goto ScoreReport_ret_NOMEM;
5769       }
5770       reterr = PopulateIdHtableMt(nullptr, variant_include, variant_ids, variant_ct, 0, variant_id_htable_size, max_thread_ct, nullptr, variant_id_htable, nullptr);
5771       if (unlikely(reterr)) {
5772         goto ScoreReport_ret_1;
5773       }
5774       // Strictly speaking, textFILE would be more appropriate for the range
5775       // file since it should be tiny, but it doesn't really matter.
5776       // We still reserve bigstack_left() / 8 for the line-buffer since the
5777       // data file usually contains allele codes, and we use TextRetarget()
5778       // below to use the buffer allocated here for the data file too (and for
5779       // the score file later).
5780       reterr = SizeAndInitTextStream(score_info_ptr->qsr_range_fname, bigstack_left() / 8, 1, &score_txs);
5781       if (unlikely(reterr)) {
5782         goto ScoreReport_ret_QSR_RANGE_TSTREAM_FAIL;
5783       }
5784       unsigned char* bigstack_mark2 = g_bigstack_base;
5785       // strlen("<prefix>.<range name>.sscore[.zst]") < kPglFnamesize
5786       const uint32_t max_name_slen = kPglFnamesize - S_CAST(uintptr_t, outname_end - outname) - 9 - output_zst * 4;
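      // 9 = 1 ('.' before the range name) + strlen(".sscore") + 1 (null
      // terminator); the output_zst * 4 term reserves space for ".zst"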
5787       ParsedQscoreRange* parsed_qscore_ranges = R_CAST(ParsedQscoreRange*, g_bigstack_base);
5788       unsigned char* tmp_alloc_end = g_bigstack_end;
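      // parsed_qscore_ranges grows upward from the workspace base while the
      // range-name strings grow downward from the end; the loop below fails
      // with NOMEM if the two fronts would collide.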
5789       uintptr_t miss_ct = 0;
5790       while (1) {
5791         ++line_idx;
5792         const char* line_start = TextGet(&score_txs);
5793         if (!line_start) {
5794           if (likely(!TextStreamErrcode2(&score_txs, &reterr))) {
5795             break;
5796           }
5797           goto ScoreReport_ret_QSR_RANGE_TSTREAM_FAIL;
5798         }
5799         // range name, p-value lower bound, p-value upper bound
5800         const char* range_name_end = CurTokenEnd(line_start);
5801         const char* lbound_start = FirstNonTspace(range_name_end);
5802         double lbound;
5803         // PLINK 1.9 documentation promises that lines with too few entries or
5804         // nonnumeric values in the second and third column are ignored.
5805         const char* lbound_end = ScantokDouble(lbound_start, &lbound);
5806         if (!lbound_end) {
5807           continue;
5808         }
5809         const char* ubound_start = FirstNonTspace(lbound_end);
5810         double ubound;
5811         const char* ubound_end = ScantokDouble(ubound_start, &ubound);
5812         if (!ubound_end) {
5813           continue;
5814         }
5815         if (unlikely(lbound > ubound)) {
5816           snprintf(g_logbuf, kLogbufSize, "Error: Upper bound < lower bound on line %" PRIuPTR " of --q-score-range range file.\n", line_idx);
5817           goto ScoreReport_ret_MALFORMED_INPUT_WW;
5818         }
5819         const uint32_t name_slen = range_name_end - line_start;
5820         if (unlikely(name_slen > max_name_slen)) {
5821           snprintf(g_logbuf, kLogbufSize, "Error: Name too long on line %" PRIuPTR " of --q-score-range range file.\n", line_idx);
          goto ScoreReport_ret_MALFORMED_INPUT_WW;
5822         }
5823         unsigned char* tmp_alloc_base = R_CAST(unsigned char*, &(parsed_qscore_ranges[qsr_ct]));
5824         if (S_CAST(uintptr_t, tmp_alloc_end - tmp_alloc_base) <= name_slen + sizeof(ParsedQscoreRange)) {
5825           goto ScoreReport_ret_NOMEM;
5826         }
5827         tmp_alloc_end -= name_slen + 1;
5828         char* stored_name = R_CAST(char*, tmp_alloc_end);
5829         memcpyx(stored_name, line_start, name_slen, '\0');
5830         parsed_qscore_ranges[qsr_ct].range_name = stored_name;
5831         parsed_qscore_ranges[qsr_ct].lbound = lbound;
5832         parsed_qscore_ranges[qsr_ct].ubound = ubound;
5833         ++qsr_ct;
5834       }
5835       if (unlikely(!qsr_ct)) {
5836         logerrputs("Error: Empty --q-score-range range file.\n");
5837         goto ScoreReport_ret_INCONSISTENT_INPUT;
5838       }
5839       BigstackBaseSet(&(parsed_qscore_ranges[qsr_ct]));
5840       BigstackEndSet(tmp_alloc_end);
5841 #ifndef LAPACK_ILP64
5842       if (unlikely(qsr_ct > (0x7fffffff / kScoreVariantBlockSize))) {
5843         logerrputs("Error: --q-score-range range count too large for this " PROG_NAME_STR " build.  If this is\nreally the computation you want, use a " PROG_NAME_STR " build with large-matrix support.\n");
5844         goto ScoreReport_ret_INCONSISTENT_INPUT;
5845       }
5846 #  ifndef __LP64__
5847       const uint64_t bit_ct = S_CAST(uint64_t, qsr_ct) * variant_ct;
5848       if (unlikely(bit_ct > 0xffffffffU)) {
5849         goto ScoreReport_ret_NOMEM;
5850       }
5851 #  endif
5852 #endif
5853       if (unlikely(
5854               (g_bigstack_base > g_bigstack_end) ||
5855               bigstack_end_alloc_u32(raw_variant_ctl, &variant_include_cumulative_popcounts) ||
5856               bigstack_end_calloc_w(BitCtToWordCt(S_CAST(uint64_t, qsr_ct) * variant_ct), &qsr_include) ||
5857               bigstack_end_alloc_cp(qsr_ct, &range_names))) {
5858         goto ScoreReport_ret_NOMEM;
5859       }
5860       for (uintptr_t qsr_idx = 0; qsr_idx != qsr_ct; ++qsr_idx) {
5861         range_names[qsr_idx] = parsed_qscore_ranges[qsr_idx].range_name;
5862       }
5863       const uint32_t variant_ctl = BitCtToWordCt(variant_ct);
5864       uintptr_t* already_seen;
5865       if (unlikely(
5866               bigstack_calloc_w(variant_ctl, &already_seen))) {
5867         goto ScoreReport_ret_NOMEM;
5868       }
5869       FillCumulativePopcounts(variant_include, raw_variant_ctl, variant_include_cumulative_popcounts);
5870       reterr = TextRetarget(score_info_ptr->qsr_data_fname, &score_txs);
5871       if (unlikely(reterr)) {
5872         goto ScoreReport_ret_QSR_RANGE_TSTREAM_FAIL;
5873       }
5874       const uint32_t colid_first = (score_info_ptr->qsr_varid_col_p1 < score_info_ptr->qsr_val_col_p1);
5875       uint32_t colmin;
5876       uint32_t coldiff;
5877       if (colid_first) {
5878         colmin = score_info_ptr->qsr_varid_col_p1 - 1;
5879         coldiff = score_info_ptr->qsr_val_col_p1 - score_info_ptr->qsr_varid_col_p1;
5880       } else {
5881         colmin = score_info_ptr->qsr_val_col_p1 - 1;
5882         coldiff = score_info_ptr->qsr_varid_col_p1 - score_info_ptr->qsr_val_col_p1;
5883       }
5884       line_idx = 0;
5885       miss_ct = 0;
5886       if (flags & kfScoreQsrHeader) {
5887         ++line_idx;
5888         if (unlikely(!TextGet(&score_txs))) {
5889           if (!TextStreamErrcode2(&score_txs, &reterr)) {
5890             logerrputs("Error: Empty --q-score-range data file.\n");
5891             goto ScoreReport_ret_MALFORMED_INPUT;
5892           }
5893           goto ScoreReport_ret_QSR_DATA_TSTREAM_FAIL;
5894         }
5895       }
5896       double* min_vals = nullptr;
5897       if (flags & kfScoreQsrMin) {
5898         // something like this is needed to handle --glm output for
5899         // multiallelic variants.
5900         // (possible todo: --glm modifier which requests all-allele joint tests
5901         // for multiallelic variants)
5902         if (unlikely(bigstack_alloc_d(variant_ct, &min_vals))) {
5903           goto ScoreReport_ret_NOMEM;
5904         }
5905       }
5906       while (1) {
5907         ++line_idx;
5908         const char* line_start = TextGet(&score_txs);
5909         if (!line_start) {
5910           if (likely(!TextStreamErrcode2(&score_txs, &reterr))) {
5911             break;
5912           }
5913           goto ScoreReport_ret_QSR_DATA_TSTREAM_FAIL;
5914         }
5915         const char* colid_ptr;
5916         const char* colval_ptr;
5917         if (colid_first) {
5918           colid_ptr = NextTokenMult0(line_start, colmin);
5919           colval_ptr = NextTokenMult(colid_ptr, coldiff);
5920           if (unlikely(!colval_ptr)) {
5921             goto ScoreReport_ret_QSR_DATA_MISSING_TOKENS;
5922           }
5923         } else {
5924           colval_ptr = NextTokenMult0(line_start, colmin);
5925           colid_ptr = NextTokenMult(colval_ptr, coldiff);
5926           if (unlikely(!colid_ptr)) {
5927             goto ScoreReport_ret_QSR_DATA_MISSING_TOKENS;
5928           }
5929         }
5930         const uint32_t varid_slen = strlen_se(colid_ptr);
5931         const uint32_t variant_uidx = VariantIdDupflagHtableFind(colid_ptr, variant_ids, variant_id_htable, varid_slen, variant_id_htable_size, max_variant_id_slen);
5932         if ((variant_uidx >> 31) || (!IsSet(variant_include, variant_uidx))) {
5933           ++miss_ct;
5934           continue;
5935         }
5936         double cur_val;
5937         if (!ScantokDouble(colval_ptr, &cur_val)) {
5938           // Tolerate NA without erroring out.  (Could count this as seen,
5939           // but that would require the min_vals logic to be more complicated.)
5940           const char* colval_end = CurTokenEnd(colval_ptr);
5941           if (likely(IsNanStr(colval_ptr, colval_end - colval_ptr))) {
5942             continue;
5943           }
5944           *K_CAST(char*, colval_end) = '\0';
5945           logerrprintfww("Error: Invalid value '%s' on line %" PRIuPTR " of --q-score-range data file.\n", colval_ptr, line_idx);
5946           goto ScoreReport_ret_MALFORMED_INPUT;
5947         }
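        // qsr_include is a (subsetted-)variant-major bitmap: bit
        // (variant_idx * qsr_ct + qsr_idx) is set iff this variant's value
        // falls within range qsr_idx's bounds.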
5948         const uint32_t variant_idx = RawToSubsettedPos(variant_include, variant_include_cumulative_popcounts, variant_uidx);
5949         const uintptr_t bit_idx_base = variant_idx * qsr_ct;
5950         if (min_vals) {
5951           if (IsSet(already_seen, variant_idx)) {
5952             if (min_vals[variant_idx] <= cur_val) {
5953               continue;
5954             }
5955             ClearBitsNz(bit_idx_base, bit_idx_base + qsr_ct, qsr_include);
5956           }
5957           min_vals[variant_idx] = cur_val;
5958         } else {
5959           if (IsSet(already_seen, variant_idx)) {
5960             logerrprintfww("Error: Duplicate ID '%s' in --q-score-range data file. (Add the 'min' modifier if this is a multiallelic variant that you want to use the minimum p-value for.)\n", variant_ids[variant_uidx]);
5961             goto ScoreReport_ret_MALFORMED_INPUT;
5962           }
5963         }
5964         SetBit(variant_idx, already_seen);
5965         for (uintptr_t qsr_idx = 0; qsr_idx != qsr_ct; ++qsr_idx) {
5966           if ((cur_val < parsed_qscore_ranges[qsr_idx].lbound) || (cur_val > parsed_qscore_ranges[qsr_idx].ubound)) {
5967             continue;
5968           }
5969           SetBit(bit_idx_base + qsr_idx, qsr_include);
5970         }
5971       }
5972       const uint32_t qsr_variant_ct = PopcountWords(already_seen, variant_ctl);
5973       if (unlikely(!qsr_variant_ct)) {
5974         logerrputs("Error: No valid entries in --q-score-range data file.\n");
5975         goto ScoreReport_ret_INCONSISTENT_INPUT;
5976       }
5977       logprintf("--q-score-range: %" PRIuPTR " range%s and %u variant%s loaded.\n", qsr_ct, (qsr_ct == 1)? "" : "s", qsr_variant_ct, (qsr_variant_ct == 1)? "" : "s");
5978       if (miss_ct) {
5979         logerrprintf("Warning: %" PRIuPTR " line%s skipped in --q-score-range data file.\n", miss_ct, (miss_ct == 1)? "" : "s");
5980       }
5981       // possible todo: replace variant_include with already_seen, and compact
5982       // qsr_include.
5983       // but for now, we just free already_seen, and in the common use cases
5984       // this should be fine.
5985       reterr = TextRetarget(score_info_ptr->input_fname, &score_txs);
5986       if (unlikely(reterr)) {
5987         goto ScoreReport_ret_QSR_DATA_TSTREAM_FAIL;
5988       }
5989       BigstackReset(bigstack_mark2);
5990       line_idx = 0;
5991     } else {
5992       reterr = SizeAndInitTextStream(score_info_ptr->input_fname, bigstack_left() / 8, 1, &score_txs);
5993       if (unlikely(reterr)) {
5994         goto ScoreReport_ret_TSTREAM_FAIL;
5995       }
5996     }
5997     uint32_t lines_to_skip_p1 = 1 + ((flags / kfScoreHeaderIgnore) & 1);
5998     char* line_start;
5999     for (uint32_t uii = 0; uii != lines_to_skip_p1; ++uii) {
6000       ++line_idx;
6001       line_start = TextGet(&score_txs);
6002       if (unlikely(!line_start)) {
6003         if (!TextStreamErrcode2(&score_txs, &reterr)) {
6004           logerrputs("Error: Empty --score file.\n");
6005           goto ScoreReport_ret_MALFORMED_INPUT;
6006         }
6007         goto ScoreReport_ret_TSTREAM_FAIL;
6008       }
6009     }
6010     uint32_t last_col_idx = CountTokens(line_start);
6011     const uint32_t varid_col_idx = score_info_ptr->varid_col_p1 - 1;
6012     const uint32_t allele_col_idx = score_info_ptr->allele_col_p1 - 1;
6013     if (unlikely(MAXV(varid_col_idx, allele_col_idx) >= last_col_idx)) {
6014       goto ScoreReport_ret_MISSING_TOKENS;
6015     }
6016     uint32_t* score_col_idx_deltas = nullptr;
6017     uintptr_t score_col_ct = 1;
6018     if (!score_info_ptr->input_col_idx_range_list.name_ct) {
      // the default coefficient column is allele_col_idx + 1, which must
      // still be a valid 0-based index (i.e. < last_col_idx)
6019       if (unlikely(allele_col_idx + 1 == last_col_idx)) {
6020         goto ScoreReport_ret_MISSING_TOKENS;
6021       }
6022       if (unlikely(bigstack_alloc_u32(1, &score_col_idx_deltas))) {
6023         goto ScoreReport_ret_NOMEM;
6024       }
6025       // catch edge case
6026       if (unlikely(allele_col_idx + 1 == varid_col_idx)) {
6027         logerrputs("Error: --score variant ID column index matches a coefficient column index.\n");
6028         goto ScoreReport_ret_INVALID_CMDLINE;
6029       }
6030       score_col_idx_deltas[0] = allele_col_idx + 1;
6031     } else {
6032       unsigned char* bigstack_end_mark2 = g_bigstack_end;
6033       const uint32_t last_col_idxl = BitCtToWordCt(last_col_idx);
6034       uintptr_t* score_col_bitarr;
6035       if (unlikely(bigstack_end_calloc_w(last_col_idxl, &score_col_bitarr))) {
6036         goto ScoreReport_ret_NOMEM;
6037       }
6038       if (unlikely(NumericRangeListToBitarr(&(score_info_ptr->input_col_idx_range_list), last_col_idx, 1, 0, score_col_bitarr))) {
6039         goto ScoreReport_ret_MISSING_TOKENS;
6040       }
6041       if (unlikely(IsSet(score_col_bitarr, varid_col_idx))) {
6042         logerrputs("Error: --score variant ID column index matches a coefficient column index.\n");
6043         goto ScoreReport_ret_INVALID_CMDLINE;
6044       }
6045       if (unlikely(IsSet(score_col_bitarr, allele_col_idx))) {
6046         logerrputs("Error: --score allele column index matches a coefficient column index.\n");
6047         goto ScoreReport_ret_INVALID_CMDLINE;
6048       }
6049       score_col_ct = PopcountWords(score_col_bitarr, last_col_idxl);
6050       if (unlikely(bigstack_alloc_u32(score_col_ct, &score_col_idx_deltas))) {
6051         goto ScoreReport_ret_NOMEM;
6052       }
6053       uintptr_t col_uidx_base = 0;
6054       uintptr_t score_col_bitarr_bits = score_col_bitarr[0];
6055       for (uintptr_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6056         const uint32_t col_uidx = BitIter1(score_col_bitarr, &col_uidx_base, &score_col_bitarr_bits);
6057         score_col_idx_deltas[score_col_idx] = col_uidx;
6058       }
6059       // now convert to deltas
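      // e.g. 0-based column indices {2, 4, 7} become {2, 2, 3}: the first
      // entry is an absolute offset from the line start, the rest are gaps
      // from the previous score column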
6060       for (uintptr_t score_col_idx = score_col_ct - 1; score_col_idx; --score_col_idx) {
6061         score_col_idx_deltas[score_col_idx] -= score_col_idx_deltas[score_col_idx - 1];
6062       }
6063       BigstackEndReset(bigstack_end_mark2);
6064     }
6065     char** score_col_names;
6066     if (unlikely(bigstack_alloc_cp(score_col_ct, &score_col_names))) {
6067       goto ScoreReport_ret_NOMEM;
6068     }
6069     char* write_iter = R_CAST(char*, g_bigstack_base);
6070     // don't have to worry about overflow, since linebuf was limited to 1/8
6071     // of available workspace.
6072     if (flags & kfScoreHeaderRead) {
6073       char* read_iter = line_start;
6074       for (uintptr_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6075         read_iter = NextTokenMult0(read_iter, score_col_idx_deltas[score_col_idx]);
6076         if (unlikely(!read_iter)) {
6077           goto ScoreReport_ret_MISSING_TOKENS;
6078         }
6079         score_col_names[score_col_idx] = write_iter;
6080         char* token_end = CurTokenEnd(read_iter);
6081         const uint32_t slen = token_end - read_iter;
6082         write_iter = memcpyax(write_iter, read_iter, slen, '\0');
6083       }
6084     } else {
6085       for (uintptr_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6086         score_col_names[score_col_idx] = write_iter;
6087         write_iter = strcpya_k(write_iter, "SCORE");
6088         write_iter = u32toa_x(score_col_idx + 1, '\0', write_iter);
6089       }
6090     }
6091     BigstackBaseSet(write_iter);
6092 
6093     uint32_t score_final_col_ct = score_col_ct;
6094     if (qsr_ct) {
6095       const uint64_t prod = S_CAST(uint64_t, qsr_ct) * score_col_ct;
6096       if (prod > 0x7fffffff) {
6097         // little point in supporting this even in large-matrix build
6098         logerrputs("Error: <--score column count> * <--q-score-range range count> too large.\n");
6099         goto ScoreReport_ret_INCONSISTENT_INPUT;
6100       }
6101 #ifndef LAPACK_ILP64
6102       if (unlikely(prod > (0x7fffffff / kScoreVariantBlockSize))) {
6103         logerrputs("Error: <--score column count> * <--q-score-range range count> too large for\nthis " PROG_NAME_STR " build.  If this is really the computation you want, use a " PROG_NAME_STR "\nbuild with large-matrix support.\n");
6104         goto ScoreReport_ret_INCONSISTENT_INPUT;
6105       }
6106 #endif
6107       score_final_col_ct = qsr_ct * score_col_ct;
6108 #ifndef LAPACK_ILP64
6109     } else {
6110       if (unlikely(score_final_col_ct > (0x7fffffff / kScoreVariantBlockSize))) {
6111         logerrputs("Error: --score column count too large for this " PROG_NAME_STR " build.  If this is really\nthe computation you want, use a " PROG_NAME_STR " build with large-matrix support.\n");
6112         goto ScoreReport_ret_INCONSISTENT_INPUT;
6113       }
6114 #endif
6115     }
6116     CalcScoreCtx ctx;
6117     ctx.score_final_col_ct = score_final_col_ct;
6118     ctx.sample_ct = sample_ct;
6119     ctx.cur_batch_size = kScoreVariantBlockSize;
6120     if (unlikely(SetThreadCt(1, &tg))) {
6121       goto ScoreReport_ret_NOMEM;
6122     }
6123     const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
6124     const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
6125     const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
6126     const uint32_t acc1_vec_ct = BitCtToVecCt(sample_ct);
6127     const uint32_t acc4_vec_ct = acc1_vec_ct * 4;
6128     const uint32_t acc8_vec_ct = acc1_vec_ct * 8;
6129     const uint32_t write_score_avgs = (flags / kfScoreColScoreAvgs) & 1;
6130     const uint32_t write_score_sums = (flags / kfScoreColScoreSums) & 1;
6131     const uintptr_t overflow_buf_size = RoundUpPow2((score_col_ct * (write_score_avgs + write_score_sums) + pheno_ct) * 16 + 3 * kMaxIdSlen + kCompressStreamBlock + 64, kCacheline);
6132     uintptr_t overflow_buf_alloc = overflow_buf_size;
6133     if (flags & (kfScoreZs | kfScoreListVariantsZs)) {
6134       overflow_buf_alloc += CstreamWkspaceReq(overflow_buf_size);
6135     }
6136     uintptr_t raw_allele_ct = 2 * raw_variant_ct;
6137     if (allele_idx_offsets) {
6138       raw_allele_ct = allele_idx_offsets[raw_variant_ct];
6139     }
6140     const uintptr_t raw_allele_ctl = BitCtToWordCt(raw_allele_ct);
6141     uint32_t* sample_include_cumulative_popcounts = nullptr;
6142     uintptr_t* sex_nonmale_collapsed = nullptr;
6143     uintptr_t* genovec_buf = nullptr;
6144     uintptr_t* dosage_present_buf = nullptr;
6145     Dosage* dosage_main_buf = nullptr;
6146     uintptr_t* missing_acc1 = nullptr;
6147     uintptr_t* missing_male_acc1 = nullptr;
6148     uint64_t* ddosage_sums;
6149     uint64_t* ddosage_incrs;
6150     uintptr_t* already_seen_variants;
6151     uintptr_t* already_seen_alleles;
6152     char* overflow_buf = nullptr;
6153     if (unlikely(
6154             bigstack_alloc_d((kScoreVariantBlockSize * k1LU) * sample_ct, &(ctx.dosages_vmaj[0])) ||
6155             bigstack_alloc_d((kScoreVariantBlockSize * k1LU) * sample_ct, &(ctx.dosages_vmaj[1])) ||
6156             bigstack_alloc_d(kScoreVariantBlockSize * S_CAST(uintptr_t, score_final_col_ct), &(ctx.score_coefs_cmaj[0])) ||
6157             bigstack_alloc_d(kScoreVariantBlockSize * S_CAST(uintptr_t, score_final_col_ct), &(ctx.score_coefs_cmaj[1])) ||
6158             bigstack_calloc_d(S_CAST(uintptr_t, score_final_col_ct) * sample_ct, &ctx.final_scores_cmaj) ||
6159             // bugfix (4 Nov 2017): need raw_sample_ctl here, not sample_ctl
6160             bigstack_alloc_u32(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
6161             bigstack_alloc_w(sample_ctl, &sex_nonmale_collapsed) ||
6162             bigstack_alloc_w(sample_ctl2, &genovec_buf) ||
6163             bigstack_alloc_w(sample_ctl, &dosage_present_buf) ||
6164             bigstack_alloc_dosage(sample_ct, &dosage_main_buf) ||
6165             bigstack_alloc_w(45 * acc1_vec_ct * kWordsPerVec, &missing_acc1) ||
6166             bigstack_alloc_w(45 * acc1_vec_ct * kWordsPerVec, &missing_male_acc1) ||
6167             bigstack_calloc_u64(sample_ct, &ddosage_sums) ||
6168             bigstack_calloc_u64(sample_ct, &ddosage_incrs) ||
6169             bigstack_calloc_w(raw_variant_ctl, &already_seen_variants) ||
6170             bigstack_calloc_w(raw_allele_ctl, &already_seen_alleles) ||
6171             bigstack_alloc_c(overflow_buf_alloc, &overflow_buf))) {
6172       goto ScoreReport_ret_NOMEM;
6173     }
6174     SetThreadFuncAndData(CalcScoreThread, &ctx, &tg);
6175 
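    // missing_acc1/missing_male_acc1 head two saturating-counter cascades
    // (1 -> 4 -> 8 -> 32 bits; 1+4+8+32 = 45 explains the allocation size
    // above).  4-bit counters can only absorb 15 increments and 8-bit
    // counters 255 = 15*17, hence the variant_ct_rem15/variant_ct_rem255d15
    // countdowns below that flush each tier before it overflows.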
6176     VecW* missing_diploid_acc4 = &(R_CAST(VecW*, missing_acc1)[acc1_vec_ct]);
6177     VecW* missing_diploid_acc8 = &(missing_diploid_acc4[acc4_vec_ct]);
6178     VecW* missing_diploid_acc32 = &(missing_diploid_acc8[acc8_vec_ct]);
6179     VecW* missing_haploid_acc4 = &(R_CAST(VecW*, missing_male_acc1)[acc1_vec_ct]);
6180     VecW* missing_haploid_acc8 = &(missing_haploid_acc4[acc4_vec_ct]);
6181     VecW* missing_haploid_acc32 = &(missing_haploid_acc8[acc8_vec_ct]);
6182     ZeroVecArr(acc4_vec_ct, missing_diploid_acc4);
6183     ZeroVecArr(acc8_vec_ct, missing_diploid_acc8);
6184     ZeroVecArr(acc8_vec_ct * 4, missing_diploid_acc32);
6185     ZeroVecArr(acc4_vec_ct, missing_haploid_acc4);
6186     ZeroVecArr(acc8_vec_ct, missing_haploid_acc8);
6187     ZeroVecArr(acc8_vec_ct * 4, missing_haploid_acc32);
6188     FillCumulativePopcounts(sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
6189     CopyBitarrSubset(sex_male, sample_include, sample_ct, sex_nonmale_collapsed);
6190     AlignedBitarrInvert(sample_ct, sex_nonmale_collapsed);
6191     const uint32_t nonmale_ct = PopcountWords(sex_nonmale_collapsed, sample_ctl);
6192     const uint32_t male_ct = sample_ct - nonmale_ct;
6193     if (!variant_id_htable) {
6194       reterr = AllocAndPopulateIdHtableMt(variant_include, variant_ids, variant_ct, 0, max_thread_ct, &variant_id_htable, nullptr, &variant_id_htable_size, nullptr);
6195       if (unlikely(reterr)) {
6196         goto ScoreReport_ret_1;
6197       }
6198     }
6199 
6200     const uint32_t ignore_dup_ids = (flags / kfScoreIgnoreDupIds) & 1;
6201     const uint32_t list_variants = (flags / kfScoreListVariants) & 1;
6202     if (list_variants) {
6203       const uint32_t list_variants_zst = (flags / kfScoreListVariantsZs) & 1;
6204       OutnameZstSet(".sscore.vars", list_variants_zst, outname_end);
6205       reterr = InitCstream(outname, 0, list_variants_zst, max_thread_ct, overflow_buf_size, overflow_buf, R_CAST(unsigned char*, &(overflow_buf[overflow_buf_size])), &css);
6206       if (unlikely(reterr)) {
6207         goto ScoreReport_ret_1;
6208       }
6209       cswritep = overflow_buf;
6210     }
6211 
6212     const uint32_t x_code = cip->xymt_codes[kChrOffsetX];
6213     const uint32_t y_code = cip->xymt_codes[kChrOffsetY];
6214     const uint32_t mt_code = cip->xymt_codes[kChrOffsetMT];
6215     const uint32_t model_dominant = (flags / kfScoreDominant) & 1;
6216     const uint32_t domrec = model_dominant || (flags & kfScoreRecessive);
6217     const uint32_t variance_standardize = (flags / kfScoreVarianceStandardize) & 1;
6218     const uint32_t center = variance_standardize || (flags & kfScoreCenter);
6219     const uint32_t no_meanimpute = (flags / kfScoreNoMeanimpute) & 1;
6220     const uint32_t se_mode = (flags / kfScoreSe) & 1;
6221     uint32_t block_vidx = 0;
6222     uint32_t parity = 0;
6223     uint32_t cur_allele_ct = 2;
6224     double* cur_dosages_vmaj_iter = ctx.dosages_vmaj[0];
6225     double* cur_score_coefs_cmaj = ctx.score_coefs_cmaj[0];
6226     double geno_slope = kRecipDosageMax;
6227     double geno_intercept = 0.0;
6228     double cur_allele_freq = 0.0;
6229     uint32_t variant_ct_rem15 = 15;
6230     uint32_t variant_ct_rem255d15 = 17;
6231     uint32_t variant_hap_ct_rem15 = 15;
6232     uint32_t variant_hap_ct_rem255d15 = 17;
6233     uint32_t allele_ct_base = 0;
6234     int32_t male_allele_ct_delta = 0;
6235     uint32_t valid_variant_ct = 0;
6236     uintptr_t missing_var_id_ct = 0;
6237     uintptr_t duplicated_var_id_ct = 0;
6238     uintptr_t missing_allele_code_ct = 0;
6239 #ifdef USE_MTBLAS
6240     const uint32_t matrix_multiply_thread_ct = (max_thread_ct > 1)? (max_thread_ct - 1) : 1;
6241     BLAS_SET_NUM_THREADS(matrix_multiply_thread_ct);
6242 #endif
6243     PgrSampleSubsetIndex pssi;
6244     PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, simple_pgrp, &pssi);
6245     if (flags & kfScoreHeaderRead) {
6246       ++line_idx;
6247       line_start = TextGet(&score_txs);
6248     }
6249     for (; line_start; ++line_idx, line_start = TextGet(&score_txs)) {
6250       // varid_col_idx and allele_col_idx will almost always be very small
6251       char* variant_id_start = NextTokenMult0(line_start, varid_col_idx);
6252       if (unlikely(!variant_id_start)) {
6253         goto ScoreReport_ret_MISSING_TOKENS;
6254       }
6255       char* variant_id_token_end = CurTokenEnd(variant_id_start);
6256       const uint32_t variant_id_slen = variant_id_token_end - variant_id_start;
6257       uint32_t variant_uidx = VariantIdDupflagHtableFind(variant_id_start, variant_ids, variant_id_htable, variant_id_slen, variant_id_htable_size, max_variant_id_slen);
6258       if (variant_uidx >> 31) {
6259         ++missing_var_id_ct;
6260         if (variant_uidx != UINT32_MAX) {
6261           if (unlikely(!ignore_dup_ids)) {
6262             snprintf(g_logbuf, kLogbufSize, "Error: --score variant ID '%s' appears multiple times in main dataset.\n", variant_ids[variant_uidx & 0x7fffffff]);
6263             goto ScoreReport_ret_INCONSISTENT_INPUT_WW;
6264           }
6265           ++duplicated_var_id_ct;
6266           // subtract this from missing_var_id_ct later
6267         }
6268         continue;
6269       }
6270       char* allele_start = NextTokenMult0(line_start, allele_col_idx);
6271       if (unlikely(!allele_start)) {
6272         goto ScoreReport_ret_MISSING_TOKENS;
6273       }
6274       uintptr_t allele_idx_offset_base;
6275       if (!allele_idx_offsets) {
6276         allele_idx_offset_base = variant_uidx * 2;
6277       } else {
6278         allele_idx_offset_base = allele_idx_offsets[variant_uidx];
6279         cur_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
6280       }
6281       char* allele_end = CurTokenEnd(allele_start);
6282       char allele_end_char = *allele_end;
6283       *allele_end = '\0';
6284       const uint32_t allele_blen = 1 + S_CAST(uintptr_t, allele_end - allele_start);
6285       const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
6286 
6287       uint32_t cur_allele_idx = 0;
6288       for (; cur_allele_idx != cur_allele_ct; ++cur_allele_idx) {
6289         if (memequal(allele_start, cur_alleles[cur_allele_idx], allele_blen)) {
6290           break;
6291         }
6292       }
6293       // compiler is smart enough to avoid repeating this test
6294       if (cur_allele_idx == cur_allele_ct) {
6295         ++missing_allele_code_ct;
6296         continue;
6297       }
6298       const uintptr_t allele_uidx = allele_idx_offset_base + cur_allele_idx;
6299       if (unlikely(IsSet(already_seen_alleles, allele_uidx))) {
6300         char* errwrite_iter = strcpya_k(g_logbuf, "Error: ");
6301         // Don't write allele code, since it might be too long for the buffer.
6302         if (!cur_allele_idx) {
6303           errwrite_iter = strcpya_k(errwrite_iter, "REF");
6304         } else {
6305           errwrite_iter = strcpya_k(errwrite_iter, "ALT");
6306           errwrite_iter = u32toa(cur_allele_idx, errwrite_iter);
6307         }
6308         errwrite_iter = strcpya_k(errwrite_iter, " allele for variant '");
6309         errwrite_iter = strcpya(errwrite_iter, variant_ids[variant_uidx]);
6310         strcpy_k(errwrite_iter, "' appears multiple times in --score file.\n");
6311         goto ScoreReport_ret_MALFORMED_INPUT_WW;
6312       }
6313       SetBit(allele_uidx, already_seen_alleles);
6314       const uint32_t is_new_variant = 1 - IsSet(already_seen_variants, variant_uidx);
6315       SetBit(variant_uidx, already_seen_variants);
6316 
6317       // okay, the variant and allele are in our dataset.  Load it.
6318       // (possible todo: avoid reloading the same variant multiple times in a
6319       // row.)
6320       uint32_t dosage_ct;
6321       reterr = PgrGet1D(sample_include, pssi, sample_ct, variant_uidx, cur_allele_idx, simple_pgrp, genovec_buf, dosage_present_buf, dosage_main_buf, &dosage_ct);
6322       if (unlikely(reterr)) {
6323         goto ScoreReport_ret_PGR_FAIL;
6324       }
6325       const uint32_t chr_idx = GetVariantChr(cip, variant_uidx);
6326       uint32_t is_nonx_haploid = IsSet(cip->haploid_mask, chr_idx);
6327       if (unlikely(domrec && is_nonx_haploid)) {
6328         logerrputs("Error: --score 'dominant' and 'recessive' modifiers cannot be used with haploid\nchromosomes.\n");
6329         goto ScoreReport_ret_INCONSISTENT_INPUT;
6330       }
6331       uint32_t is_relevant_x = (chr_idx == x_code);
6332       if (unlikely(variance_standardize && (is_relevant_x || (chr_idx == mt_code)))) {
6333         logerrputs("Error: --score 'variance-standardize' cannot be used with chrX or MT.\n");
6334         goto ScoreReport_ret_INCONSISTENT_INPUT;
6335       }
6336       is_nonx_haploid = (!is_relevant_x) && is_nonx_haploid;
6337 
6338       // only if --xchr-model 1 (which is no longer the default)
6339       is_relevant_x = is_relevant_x && xchr_model;
6340 
6341       const uint32_t is_y = (chr_idx == y_code);
6342       ZeroTrailingNyps(sample_ct, genovec_buf);
6343       GenoarrToMissingnessUnsafe(genovec_buf, sample_ct, missing_acc1);
6344       if (dosage_ct) {
6345         BitvecInvmask(dosage_present_buf, sample_ctl, missing_acc1);
6346       }
6347       FillCurDdosageInts(genovec_buf, dosage_present_buf, dosage_main_buf, sample_ct, dosage_ct, 2 - is_nonx_haploid, ddosage_incrs);
6348       double ploidy_d;
6349       if (is_nonx_haploid) {
6350         if (is_y) {
6351           uintptr_t sample_idx_base = 0;
6352           uintptr_t sex_nonmale_collapsed_bits = sex_nonmale_collapsed[0];
6353           for (uint32_t nonmale_idx = 0; nonmale_idx != nonmale_ct; ++nonmale_idx) {
6354             const uintptr_t sample_idx = BitIter1(sex_nonmale_collapsed, &sample_idx_base, &sex_nonmale_collapsed_bits);
6355             ddosage_incrs[sample_idx] = 0;
6356           }
6357           male_allele_ct_delta += is_new_variant;
6358           BitvecInvmask(sex_nonmale_collapsed, sample_ctl, missing_acc1);
6359         } else {
6360           allele_ct_base += is_new_variant;
6361         }
6362         if (is_new_variant) {
6363           VcountIncr1To4(missing_acc1, acc1_vec_ct, missing_haploid_acc4);
6364           if (!(--variant_hap_ct_rem15)) {
6365             Vcount0Incr4To8(acc4_vec_ct, missing_haploid_acc4, missing_haploid_acc8);
6366             variant_hap_ct_rem15 = 15;
6367             if (!(--variant_hap_ct_rem255d15)) {
6368               Vcount0Incr8To32(acc8_vec_ct, missing_haploid_acc8, missing_haploid_acc32);
6369               variant_hap_ct_rem255d15 = 17;
6370             }
6371           }
6372         }
6373         if (is_y) {
6374           memcpy(missing_male_acc1, missing_acc1, sample_ctl * sizeof(intptr_t));
6375           BitvecOr(sex_nonmale_collapsed, sample_ctl, missing_acc1);
6376         }
6377         ploidy_d = 1.0;
6378       } else {
6379         if (is_relevant_x) {
6380           uintptr_t sample_idx_base = 0;
6381           uintptr_t sex_nonmale_collapsed_inv_bits = ~sex_nonmale_collapsed[0];
6382           for (uint32_t male_idx = 0; male_idx != male_ct; ++male_idx) {
6383             const uintptr_t sample_idx = BitIter0(sex_nonmale_collapsed, &sample_idx_base, &sex_nonmale_collapsed_inv_bits);
6384             ddosage_incrs[sample_idx] /= 2;
6385           }
6386           BitvecInvmaskCopy(missing_acc1, sex_nonmale_collapsed, sample_ctl, missing_male_acc1);
6387           BitvecAnd(sex_nonmale_collapsed, sample_ctl, missing_acc1);
6388         }
6389         if (is_new_variant) {
6390           VcountIncr1To4(missing_acc1, acc1_vec_ct, missing_diploid_acc4);
6391           if (!(--variant_ct_rem15)) {
6392             Vcount0Incr4To8(acc4_vec_ct, missing_diploid_acc4, missing_diploid_acc8);
6393             variant_ct_rem15 = 15;
6394             if (!(--variant_ct_rem255d15)) {
6395               Vcount0Incr8To32(acc8_vec_ct, missing_diploid_acc8, missing_diploid_acc32);
6396               variant_ct_rem255d15 = 17;
6397             }
6398           }
6399           allele_ct_base += 2;
6400         }
6401         if (is_relevant_x) {
6402           if (is_new_variant) {
6403             --male_allele_ct_delta;
6404             VcountIncr1To4(missing_male_acc1, acc1_vec_ct, missing_haploid_acc4);
6405             if (!(--variant_hap_ct_rem15)) {
6406               Vcount0Incr4To8(acc4_vec_ct, missing_haploid_acc4, missing_haploid_acc8);
6407               variant_hap_ct_rem15 = 15;
6408               if (!(--variant_hap_ct_rem255d15)) {
6409                 Vcount0Incr8To32(acc8_vec_ct, missing_haploid_acc8, missing_haploid_acc32);
6410                 variant_hap_ct_rem255d15 = 17;
6411               }
6412             }
6413           }
6414           BitvecOr(missing_male_acc1, sample_ctl, missing_acc1);
6415         }
6416         if (!domrec) {
6417           ploidy_d = 2.0;
6418         } else {
6419           if (model_dominant) {
6420             for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
6421               if (ddosage_incrs[sample_idx] > kDosageMax) {
6422                 ddosage_incrs[sample_idx] = kDosageMax;
6423               }
6424             }
6425           } else {
6426             for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
6427               uint64_t cur_ddosage_incr = ddosage_incrs[sample_idx];
6428               if (cur_ddosage_incr <= kDosageMax) {
6429                 cur_ddosage_incr = 0;
6430               } else {
6431                 cur_ddosage_incr -= kDosageMax;
6432               }
6433               ddosage_incrs[sample_idx] = cur_ddosage_incr;
6434             }
6435           }
6436           ploidy_d = 1.0;
6437         }
6438       }
6439       for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
6440         ddosage_sums[sample_idx] += ddosage_incrs[sample_idx];
6441       }
6442       if (allele_freqs) {
6443         cur_allele_freq = GetAlleleFreq(&(allele_freqs[allele_idx_offset_base - variant_uidx]), cur_allele_idx, cur_allele_ct);
6444       }
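      // each dosage x will contribute (x * geno_slope + geno_intercept) *
      // coefficient; geno_intercept is chosen so that the expected dosage
      // (ploidy_d * cur_allele_freq * kDosageMax) maps to zero, and
      // variance-standardize additionally scales geno_slope by 1/sqrt(var).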
6445       if (center) {
6446         if (variance_standardize) {
6447           const double variance = ploidy_d * 0.5 * ComputeDiploidMultiallelicVariance(&(allele_freqs[allele_idx_offset_base - variant_uidx]), cur_allele_ct);
6448           if (!(variance > kSmallEpsilon)) {
6449             // ZeroTrailingNyps(sample_ct, genovec_buf);
6450             STD_ARRAY_DECL(uint32_t, 4, genocounts);
6451             GenoarrCountFreqsUnsafe(genovec_buf, sample_ct, genocounts);
6452             if (unlikely(dosage_ct || genocounts[1] || genocounts[2])) {
6453               snprintf(g_logbuf, kLogbufSize, "Error: --score variance-standardize failure for variant '%s': estimated allele frequency is zero or NaN, but not all dosages are zero. (This is possible when e.g. allele frequencies are estimated from founders, but the allele is only observed in nonfounders.)\n", variant_ids[variant_uidx]);
6454               goto ScoreReport_ret_DEGENERATE_DATA_WW;
6455             }
6456             geno_slope = 0.0;
6457           } else {
6458             geno_slope = kRecipDosageMax / sqrt(variance);
6459           }
6460         }
6461         // (ploidy * cur_allele_freq * kDosageMax) * geno_slope +
6462         //   geno_intercept == 0
6463         // bugfix: must use "-1.0 *" instead of - to avoid unsigned int
6464         //   wraparound
6465         geno_intercept = (-1.0 * kDosageMax) * ploidy_d * cur_allele_freq * geno_slope;
6466       }
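      // To summarize the centering math: a raw dosage d (in kDosageMax
      // units) is mapped to
      //   d * geno_slope + geno_intercept
      //     == (d - ploidy_d * cur_allele_freq * kDosageMax) * geno_slope,
      // which has mean zero; and under 'variance-standardize',
      // geno_slope == kRecipDosageMax / sqrt(variance) rescales the result
      // to unit variance as well.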
6467       const uint32_t missing_ct = PopcountWords(missing_acc1, sample_ctl);
6468       const uint32_t nm_sample_ct = sample_ct - missing_ct;
6469       if (missing_ct) {
6470         double missing_effect = 0.0;
6471         if (!no_meanimpute) {
6472           missing_effect = kDosageMax * cur_allele_freq * geno_slope;
6473         }
6474         uintptr_t sample_idx_base = 0;
6475         if (is_y || is_relevant_x) {
6476           ZeroDArr(sample_ct, cur_dosages_vmaj_iter);
6477           if (!no_meanimpute) {
6478             const uint32_t male_missing_ct = PopcountWords(missing_male_acc1, sample_ctl);
6479             uintptr_t missing_male_acc1_bits = missing_male_acc1[0];
6480             for (uint32_t male_missing_idx = 0; male_missing_idx != male_missing_ct; ++male_missing_idx) {
6481               const uintptr_t sample_idx = BitIter1(missing_male_acc1, &sample_idx_base, &missing_male_acc1_bits);
6482               cur_dosages_vmaj_iter[sample_idx] = missing_effect;
6483             }
6484             if (is_relevant_x) {
6485               // missing_male_acc1 is not used again after this point,
6486               // so it's okay to reuse the buffer for nonmales
6487               BitvecAndCopy(missing_acc1, sex_nonmale_collapsed, sample_ctl, missing_male_acc1);
6488               missing_effect *= 2;
6489               // bugfix (8 Jul 2018): need to reset sample_idx
6490               sample_idx_base = 0;
6491               missing_male_acc1_bits = missing_male_acc1[0];
6492               const uint32_t nonmale_missing_ct = PopcountWords(missing_male_acc1, sample_ctl);
6493               for (uint32_t nonmale_missing_idx = 0; nonmale_missing_idx != nonmale_missing_ct; ++nonmale_missing_idx) {
6494                 const uintptr_t sample_idx = BitIter1(missing_male_acc1, &sample_idx_base, &missing_male_acc1_bits);
6495                 cur_dosages_vmaj_iter[sample_idx] = missing_effect;
6496               }
6497             }
6498           }
6499         } else {
6500           missing_effect *= ploidy_d;
6501           uintptr_t missing_acc1_bits = missing_acc1[0];
6502           for (uint32_t missing_idx = 0; missing_idx != missing_ct; ++missing_idx) {
6503             const uintptr_t sample_idx = BitIter1(missing_acc1, &sample_idx_base, &missing_acc1_bits);
6504             cur_dosages_vmaj_iter[sample_idx] = missing_effect;
6505           }
6506         }
6507       }
6508       uintptr_t sample_idx_base = 0;
6509       uintptr_t missing_acc1_inv_bits = ~missing_acc1[0];
6510       for (uint32_t nm_sample_idx = 0; nm_sample_idx != nm_sample_ct; ++nm_sample_idx) {
6511         const uintptr_t sample_idx = BitIter0(missing_acc1, &sample_idx_base, &missing_acc1_inv_bits);
6512         cur_dosages_vmaj_iter[sample_idx] = u63tod(ddosage_incrs[sample_idx]) * geno_slope + geno_intercept;
6513       }
6514       if (se_mode) {
6515         // Suppose our score coefficients are drawn from independent Gaussians.
6516         // Then the variance of the final score average is the sum of the
6517         // variances of the individual terms, divided by (T^2) where T is the
6518         // number of terms.  These individual variances are of the form
6519         // (<genotype value> * <stdev>)^2.
6520         //
6521         // Thus, we can use the same inner loop to compute standard errors, as
6522         // long as
6523         //   1. we square the genotypes and the standard errors before matrix
6524         //      multiplication, and
6525         //   2. we take the square root of the sums at the end.
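        //
        // For example, with T == 2 variants, genotype values g1/g2 and
        // coefficient standard errors s1/s2, the score average
        // (g1*b1 + g2*b2) / 2 has variance ((g1*s1)^2 + (g2*s2)^2) / 4; the
        // squared terms are what the matrix multiply accumulates, and the
        // final sqrt is applied after the last block below.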
6526         for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
6527           cur_dosages_vmaj_iter[sample_idx] *= cur_dosages_vmaj_iter[sample_idx];
6528         }
6529       }
6530       cur_dosages_vmaj_iter = &(cur_dosages_vmaj_iter[sample_ct]);
6531 
6532       *allele_end = allele_end_char;
6533       double* cur_score_coefs_iter = &(cur_score_coefs_cmaj[block_vidx]);
6534       const char* read_iter = line_start;
6535       for (uint32_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6536         read_iter = NextTokenMult0(read_iter, score_col_idx_deltas[score_col_idx]);
6537         if (unlikely(!read_iter)) {
6538           goto ScoreReport_ret_MISSING_TOKENS;
6539         }
6540         double raw_coef;
6541         const char* token_end = ScantokDouble(read_iter, &raw_coef);
6542         if (unlikely(!token_end)) {
6543           snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --score file has an invalid coefficient.\n", line_idx);
6544           goto ScoreReport_ret_MALFORMED_INPUT_2;
6545         }
6546         if (!qsr_ct) {
6547           *cur_score_coefs_iter = raw_coef;
6548           cur_score_coefs_iter = &(cur_score_coefs_iter[kScoreVariantBlockSize]);
6549         } else {
6550           const uintptr_t bit_idx_base = RawToSubsettedPos(variant_include, variant_include_cumulative_popcounts, variant_uidx) * qsr_ct;
6551           for (uint32_t qsr_idx = 0; qsr_idx != qsr_ct; ++qsr_idx) {
6552             double cur_coef = raw_coef * u31tod(IsSet(qsr_include, qsr_idx + bit_idx_base));
6553             *cur_score_coefs_iter = cur_coef;
6554             cur_score_coefs_iter = &(cur_score_coefs_iter[kScoreVariantBlockSize]);
6555           }
6556         }
6557         read_iter = token_end;
6558       }
6559       if (is_new_variant) {
6560         if (list_variants) {
6561           cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
6562           AppendBinaryEoln(&cswritep);
6563           if (unlikely(Cswrite(&css, &cswritep))) {
6564             goto ScoreReport_ret_WRITE_FAIL;
6565           }
6566         }
6567         ++valid_variant_ct;
6568         if (!(valid_variant_ct % 10000)) {
6569           printf("\r--score: %uk variants loaded.", valid_variant_ct / 1000);
6570           fflush(stdout);
6571         }
6572       }
6573       ++block_vidx;
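      // Double-buffering: once kScoreVariantBlockSize variants have been
      // parsed into the current parity's buffers, they're handed off to the
      // worker threads while parsing continues into the other parity's
      // buffers; the JoinThreads() call guarantees the previous batch is
      // done before its buffers are overwritten.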
6574       if (block_vidx == kScoreVariantBlockSize) {
6575         if (se_mode) {
6576           for (uintptr_t ulii = 0; ulii != kScoreVariantBlockSize * score_final_col_ct; ++ulii) {
6577             cur_score_coefs_cmaj[ulii] *= cur_score_coefs_cmaj[ulii];
6578           }
6579         }
6580         parity = 1 - parity;
6581         const uint32_t is_not_first_block = ThreadsAreActive(&tg);
6582         if (is_not_first_block) {
6583           JoinThreads(&tg);
6584           // CalcScoreThread() never errors out
6585         }
6586         if (unlikely(SpawnThreads(&tg))) {
6587           goto ScoreReport_ret_THREAD_CREATE_FAIL;
6588         }
6589         cur_dosages_vmaj_iter = ctx.dosages_vmaj[parity];
6590         cur_score_coefs_cmaj = ctx.score_coefs_cmaj[parity];
6591         block_vidx = 0;
6592       }
6593     }
6594     if (unlikely(TextStreamErrcode2(&score_txs, &reterr))) {
6595       goto ScoreReport_ret_TSTREAM_FAIL;
6596     }
6597     VcountIncr4To8(missing_diploid_acc4, acc4_vec_ct, missing_diploid_acc8);
6598     VcountIncr8To32(missing_diploid_acc8, acc8_vec_ct, missing_diploid_acc32);
6599     VcountIncr4To8(missing_haploid_acc4, acc4_vec_ct, missing_haploid_acc8);
6600     VcountIncr8To32(missing_haploid_acc8, acc8_vec_ct, missing_haploid_acc32);
6601     const uint32_t is_not_first_block = ThreadsAreActive(&tg);
6602     putc_unlocked('\r', stdout);
6603     if (missing_var_id_ct || missing_allele_code_ct || duplicated_var_id_ct) {
6604       missing_var_id_ct -= duplicated_var_id_ct;
6605       if (!missing_var_id_ct) {
6606         if (missing_allele_code_ct) {
6607           snprintf(g_logbuf, kLogbufSize, "Warning: %" PRIuPTR " --score file entr%s.\n", missing_allele_code_ct, (missing_allele_code_ct == 1)? "y was skipped due to a mismatching allele code" : "ies were skipped due to mismatching allele codes");
6608         }
6609       } else if (!missing_allele_code_ct) {
6610         snprintf(g_logbuf, kLogbufSize, "Warning: %" PRIuPTR " --score file entr%s.\n", missing_var_id_ct, (missing_var_id_ct == 1)? "y was skipped due to a missing variant ID" : "ies were skipped due to missing variant IDs");
6611       } else {
6612         snprintf(g_logbuf, kLogbufSize, "Warning: %" PRIuPTR " --score file entr%s, and %" PRIuPTR " %s.\n", missing_var_id_ct, (missing_var_id_ct == 1)? "y was skipped due to a missing variant ID" : "ies were skipped due to missing variant IDs", missing_allele_code_ct, (missing_allele_code_ct == 1)? "was skipped due to a mismatching allele code" : "were skipped due to mismatching allele codes");
6613       }
6614       WordWrapB(0);
6615       logerrputsb();
6616       if (duplicated_var_id_ct) {
6617         logerrprintfww("Warning: %" PRIuPTR " --score file entr%s appear multiple times in the main dataset.\n", duplicated_var_id_ct, (duplicated_var_id_ct == 1)? "y was skipped since its variant ID" : "ies were skipped since their variant IDs");
6618       }
6619       if (!list_variants) {
6620         logerrputs("(Add the 'list-variants' modifier to see which variants were actually used for\nscoring.)\n");
6621       }
6622     }
6623     if (block_vidx) {
6624       if (is_not_first_block) {
6625         JoinThreads(&tg);
6626       }
6627     } else if (unlikely(!valid_variant_ct)) {
6628       logerrputs("Error: No valid variants in --score file.\n");
6629       goto ScoreReport_ret_DEGENERATE_DATA;
6630     } else {
6631       JoinThreads(&tg);
6632     }
6633     DeclareLastThreadBlock(&tg);
6634     ctx.cur_batch_size = block_vidx;
6635     if (se_mode) {
6636       for (uintptr_t score_final_col_idx = 0; score_final_col_idx != score_final_col_ct; ++score_final_col_idx) {
6637         double* cur_score_coefs_row = &(cur_score_coefs_cmaj[score_final_col_idx * kScoreVariantBlockSize]);
6638         for (uint32_t uii = 0; uii != block_vidx; ++uii) {
6639           cur_score_coefs_row[uii] *= cur_score_coefs_row[uii];
6640         }
6641       }
6642     }
6643     if (unlikely(SpawnThreads(&tg))) {
6644       goto ScoreReport_ret_THREAD_CREATE_FAIL;
6645     }
6646     JoinThreads(&tg);
6647     if (se_mode) {
6648       // sample_ct * score_final_col_ct
6649       for (uintptr_t ulii = 0; ulii != sample_ct * score_final_col_ct; ++ulii) {
6650         ctx.final_scores_cmaj[ulii] = sqrt(ctx.final_scores_cmaj[ulii]);
6651       }
6652     }
6653     logprintf("--score: %u variant%s processed.\n", valid_variant_ct, (valid_variant_ct == 1)? "" : "s");
6654     if (list_variants) {
6655       if (unlikely(CswriteCloseNull(&css, cswritep))) {
6656         goto ScoreReport_ret_WRITE_FAIL;
6657       }
6658       cswritep = nullptr;
6659       logprintf("Variant list written to %s .\n", outname);
6660     }
6661 
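    // qsr_ct_nz == MAX(qsr_ct, 1): without --q-score-range, exactly one
    // pass is made through the output loop below.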
6662     const uint32_t qsr_ct_nz = qsr_ct + (qsr_ct == 0);
6663     for (uint32_t qsr_idx = 0; qsr_idx != qsr_ct_nz; ++qsr_idx) {
6664       char* outname_end2 = outname_end;
6665       if (range_names) {
6666         *outname_end2++ = '.';
6667         outname_end2 = strcpya(outname_end2, range_names[qsr_idx]);
6668       }
6669       OutnameZstSet(".sscore", output_zst, outname_end2);
6670       reterr = InitCstream(outname, 0, output_zst, max_thread_ct, overflow_buf_size, overflow_buf, R_CAST(unsigned char*, &(overflow_buf[overflow_buf_size])), &css);
6671       if (unlikely(reterr)) {
6672         goto ScoreReport_ret_1;
6673       }
6674       cswritep = overflow_buf;
6675       const uint32_t write_fid = FidColIsRequired(siip, flags / kfScoreColMaybefid);
6676       const char* sample_ids = siip->sample_ids;
6677       const char* sids = siip->sids;
6678       const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
6679       const uintptr_t max_sid_blen = siip->max_sid_blen;
6680       const uint32_t write_sid = SidColIsRequired(sids, flags / kfScoreColMaybesid);
6681       const uint32_t write_empty_pheno = (flags & kfScoreColPheno1) && (!pheno_ct);
6682       const uint32_t write_phenos = (flags & (kfScoreColPheno1 | kfScoreColPhenos)) && pheno_ct;
6683       if (write_phenos && (!(flags & kfScoreColPhenos))) {
6684         pheno_ct = 1;
6685       }
6686       *cswritep++ = '#';
6687       if (write_fid) {
6688         cswritep = strcpya_k(cswritep, "FID\t");
6689       }
6690       cswritep = strcpya_k(cswritep, "IID");
6691       if (write_sid) {
6692         cswritep = strcpya_k(cswritep, "\tSID");
6693       }
6694       if (write_phenos) {
6695         for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
6696           *cswritep++ = '\t';
6697           cswritep = strcpya(cswritep, &(pheno_names[pheno_idx * max_pheno_name_blen]));
6698           if (unlikely(Cswrite(&css, &cswritep))) {
6699             goto ScoreReport_ret_WRITE_FAIL;
6700           }
6701         }
6702       } else if (write_empty_pheno) {
6703         cswritep = strcpya_k(cswritep, "\tPHENO1");
6704       }
6705       const uint32_t write_nallele = (flags / kfScoreColNallele) & 1;
6706       if (write_nallele) {
6707         cswritep = strcpya_k(cswritep, "\tALLELE_CT");
6708       }
6709       const uint32_t write_denom = (flags / kfScoreColDenom) & 1;
6710       if (write_denom) {
6711         cswritep = strcpya_k(cswritep, "\tDENOM");
6712       }
6713       const uint32_t write_dosage_sum = (flags / kfScoreColDosageSum) & 1;
6714       if (write_dosage_sum) {
6715         cswritep = strcpya_k(cswritep, "\tNAMED_ALLELE_DOSAGE_SUM");
6716       }
6717       if (write_score_avgs) {
6718         for (uint32_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6719           *cswritep++ = '\t';
6720           cswritep = strcpya(cswritep, score_col_names[score_col_idx]);
6721           cswritep = strcpya_k(cswritep, "_AVG");
6722           if (unlikely(Cswrite(&css, &cswritep))) {
6723             goto ScoreReport_ret_WRITE_FAIL;
6724           }
6725         }
6726       }
6727       if (write_score_sums) {
6728         for (uint32_t score_col_idx = 0; score_col_idx != score_col_ct; ++score_col_idx) {
6729           *cswritep++ = '\t';
6730           cswritep = strcpya(cswritep, score_col_names[score_col_idx]);
6731           cswritep = strcpya_k(cswritep, "_SUM");
6732           if (unlikely(Cswrite(&css, &cswritep))) {
6733             goto ScoreReport_ret_WRITE_FAIL;
6734           }
6735         }
6736       }
6737       AppendBinaryEoln(&cswritep);
6738       const uint32_t* scrambled_missing_diploid_cts = R_CAST(uint32_t*, missing_diploid_acc32);
6739       const uint32_t* scrambled_missing_haploid_cts = R_CAST(uint32_t*, missing_haploid_acc32);
6740       const char* output_missing_pheno = g_output_missing_pheno;
6741       const uint32_t omp_slen = strlen(output_missing_pheno);
6742 
6743       uintptr_t sample_uidx_base = 0;
6744       uintptr_t sample_include_bits = sample_include[0];
6745       for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
6746         const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
6747         const char* cur_sample_id = &(sample_ids[sample_uidx * max_sample_id_blen]);
6748         if (!write_fid) {
6749           cur_sample_id = AdvPastDelim(cur_sample_id, '\t');
6750         }
6751         cswritep = strcpya(cswritep, cur_sample_id);
6752         if (write_sid) {
6753           *cswritep++ = '\t';
6754           if (sids) {
6755             cswritep = strcpya(cswritep, &(sids[max_sid_blen * sample_uidx]));
6756           } else {
6757             *cswritep++ = '0';
6758           }
6759         }
6760         if (write_phenos) {
6761           // er, this probably belongs in its own function
6762           for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
6763             const PhenoCol* cur_pheno_col = &(pheno_cols[pheno_idx]);
6764             const PhenoDtype type_code = cur_pheno_col->type_code;
6765             *cswritep++ = '\t';
6766             if (type_code <= kPhenoDtypeQt) {
6767               if (!IsSet(cur_pheno_col->nonmiss, sample_uidx)) {
6768                 cswritep = memcpya(cswritep, output_missing_pheno, omp_slen);
6769               } else if (type_code == kPhenoDtypeCc) {
6770                 *cswritep++ = '1' + IsSet(cur_pheno_col->data.cc, sample_uidx);
6771               } else {
6772                 cswritep = dtoa_g(cur_pheno_col->data.qt[sample_uidx], cswritep);
6773               }
6774             } else {
6775               // category index guaranteed to be zero for missing values
6776               cswritep = strcpya(cswritep, cur_pheno_col->category_names[cur_pheno_col->data.cat[sample_uidx]]);
6777               if (unlikely(Cswrite(&css, &cswritep))) {
6778                 goto ScoreReport_ret_WRITE_FAIL;
6779               }
6780             }
6781           }
6782         } else if (write_empty_pheno) {
6783           *cswritep++ = '\t';
6784           cswritep = memcpya(cswritep, output_missing_pheno, omp_slen);
6785         }
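        // The 32-bit accumulators store per-sample missing counts in the
        // vector-interleaved order produced by the 1 -> 4 -> 8 -> 32 bit
        // cascade above, rather than in plain sample order;
        // VcountScramble1() maps sample_idx to the lane its counts actually
        // landed in.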
6786         const uint32_t scrambled_idx = VcountScramble1(sample_idx);
6787         uint32_t denom = allele_ct_base + IsSet(sex_male, sample_uidx) * male_allele_ct_delta;
6788         const uint32_t nallele = denom - 2 * scrambled_missing_diploid_cts[scrambled_idx] - scrambled_missing_haploid_cts[scrambled_idx];
6789         if (write_nallele) {
6790           *cswritep++ = '\t';
6791           cswritep = u32toa(nallele, cswritep);
6792         }
6793         if (no_meanimpute) {
6794           denom = nallele;
6795         }
6796         if (write_denom) {
6797           *cswritep++ = '\t';
6798           cswritep = u32toa(denom, cswritep);
6799         }
6800         if (write_dosage_sum) {
6801           *cswritep++ = '\t';
6802           cswritep = ddosagetoa(ddosage_sums[sample_idx], cswritep);
6803         }
6804         const double* final_score_col = &(ctx.final_scores_cmaj[sample_idx]);
6805         if (write_score_avgs) {
6806           const double denom_recip = 1.0 / S_CAST(double, denom);
6807           for (uintptr_t score_final_col_idx = qsr_idx; score_final_col_idx < score_final_col_ct; score_final_col_idx += qsr_ct_nz) {
6808             *cswritep++ = '\t';
6809             cswritep = dtoa_g(final_score_col[score_final_col_idx * sample_ct] * denom_recip, cswritep);
6810           }
6811         }
6812         if (write_score_sums) {
6813           for (uint32_t score_final_col_idx = qsr_idx; score_final_col_idx < score_final_col_ct; score_final_col_idx += qsr_ct_nz) {
6814             *cswritep++ = '\t';
6815             cswritep = dtoa_g(final_score_col[score_final_col_idx * sample_ct], cswritep);
6816           }
6817         }
6818         AppendBinaryEoln(&cswritep);
6819         if (unlikely(Cswrite(&css, &cswritep))) {
6820           goto ScoreReport_ret_WRITE_FAIL;
6821         }
6822       }
6823       if (unlikely(CswriteCloseNull(&css, cswritep))) {
6824         goto ScoreReport_ret_WRITE_FAIL;
6825       }
6826     }
6827     if (!qsr_ct) {
6828       logprintfww("--score: Results written to %s .\n", outname);
6829     } else {
6830       *outname_end = '\0';
6831       logprintfww("--score + --q-score-range: Results written to %s.<range name>.sscore%s .\n", outname, output_zst? ".zst" : "");
6832     }
6833   }
6834   while (0) {
6835   ScoreReport_ret_TSTREAM_FAIL:
6836     TextStreamErrPrint("--score file", &score_txs);
6837     break;
6838   ScoreReport_ret_QSR_RANGE_TSTREAM_FAIL:
6839     TextStreamErrPrint("--q-score-range range file", &score_txs);
6840     break;
6841   ScoreReport_ret_QSR_DATA_TSTREAM_FAIL:
6842     TextStreamErrPrint("--q-score-range data file", &score_txs);
6843     break;
6844   ScoreReport_ret_NOMEM:
6845     reterr = kPglRetNomem;
6846     break;
6847   ScoreReport_ret_PGR_FAIL:
6848     PgenErrPrintN(reterr);
6849     break;
6850   ScoreReport_ret_WRITE_FAIL:
6851     reterr = kPglRetWriteFail;
6852     break;
6853   ScoreReport_ret_INVALID_CMDLINE:
6854     reterr = kPglRetInvalidCmdline;
6855     break;
6856   ScoreReport_ret_MALFORMED_INPUT_WW:
6857     WordWrapB(0);
6858   ScoreReport_ret_MALFORMED_INPUT_2:
6859     logputs("\n");
6860     logerrputsb();
6861   ScoreReport_ret_MALFORMED_INPUT:
6862     reterr = kPglRetMalformedInput;
6863     break;
6864   ScoreReport_ret_QSR_DATA_MISSING_TOKENS:
6865     logerrprintfww("Error: Line %" PRIuPTR " of --q-score-range data file has fewer tokens than expected.\n", line_idx);
6866     reterr = kPglRetInconsistentInput;
6867     break;
6868   ScoreReport_ret_MISSING_TOKENS:
6869     logputs("\n");
6870     logerrprintfww("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, score_info_ptr->input_fname);
6871     reterr = kPglRetInconsistentInput;
6872     break;
6873   ScoreReport_ret_INCONSISTENT_INPUT_WW:
6874     WordWrapB(0);
6875     logputs("\n");
6876     logerrputsb();
6877   ScoreReport_ret_INCONSISTENT_INPUT:
6878     reterr = kPglRetInconsistentInput;
6879     break;
6880   ScoreReport_ret_THREAD_CREATE_FAIL:
6881     reterr = kPglRetThreadCreateFail;
6882     break;
6883   ScoreReport_ret_DEGENERATE_DATA_WW:
6884     WordWrapB(0);
6885     logputs("\n");
6886     logerrputsb();
6887   ScoreReport_ret_DEGENERATE_DATA:
6888     reterr = kPglRetDegenerateData;
6889     break;
6890   }
6891  ScoreReport_ret_1:
6892   CswriteCloseCond(&css, cswritep);
6893   CleanupThreads(&tg);
6894   BLAS_SET_NUM_THREADS(1);
6895   CleanupTextStream2("--score file", &score_txs, &reterr);
6896   BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
6897   return reterr;
6898 }
6899 
6900 typedef struct VscoreCtxStruct {
6901   const uintptr_t* variant_include;
6902   const ChrInfo* cip;
6903   const uintptr_t* allele_idx_offsets;
6904   const double* allele_freqs;
6905   const uintptr_t* sample_include;
6906   const uint32_t* sample_include_cumulative_popcounts;
6907   const uintptr_t* sex_male_collapsed;
6908   const uintptr_t* sex_male_interleaved_vec;
6909   const double* wts_smaj;
6910   uint32_t vscore_ct;
6911   uint32_t sample_ct;
6912   uint32_t male_ct;
6913   uint32_t is_xchr_model_1;
6914 
6915   PgenReader** pgr_ptrs;
6916   uintptr_t** genovecs;
6917   uintptr_t** raregenos;
6918   uint32_t** difflist_sample_id_bufs;
6919   uintptr_t** dosage_presents;
6920   Dosage** dosage_mains;
6921   uint32_t* read_variant_uidx_starts;
6922 
6923   uint32_t cur_block_size;
6924 
6925   double** dosage_vmaj_bufs;
6926   double** tmp_result_bufs;
6927 
6928   // variant-major
6929   double* results[2];
6930 
6931   uint32_t* missing_cts[2];
6932 
6933   // only kPglRetMalformedInput possible, no atomic ops needed
6934   PglErr reterr;
6935 } VscoreCtx;
6936 
6937 // This setting seems optimal on my Mac (smaller doesn't take full advantage of
6938 // AVX, larger creates cache problems?).
6939 CONSTI32(kVscoreBlockSize, 32);
6940 
6941 THREAD_FUNC_DECL VscoreThread(void* raw_arg) {
6942   ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
6943   const uintptr_t tidx = arg->tidx;
6944   VscoreCtx* ctx = S_CAST(VscoreCtx*, arg->sharedp->context);
6945 
6946   const uintptr_t* variant_include = ctx->variant_include;
6947   const ChrInfo* cip = ctx->cip;
6948   const uintptr_t* allele_idx_offsets = ctx->allele_idx_offsets;
6949   const double* allele_freqs = ctx->allele_freqs;
6950   const uintptr_t* sample_include = ctx->sample_include;
6951   const uintptr_t* sex_male = ctx->sex_male_collapsed;
6952   const uintptr_t* sex_male_interleaved_vec = ctx->sex_male_interleaved_vec;
6953   const double* wts_smaj = ctx->wts_smaj;
6954 
6955   PgenReader* pgrp = ctx->pgr_ptrs[tidx];
6956   PgrSampleSubsetIndex pssi;
6957   PgrSetSampleSubsetIndex(ctx->sample_include_cumulative_popcounts, pgrp, &pssi);
6958   uintptr_t* genovec = ctx->genovecs[tidx];
6959   uintptr_t* raregeno = ctx->raregenos[tidx];
6960   uint32_t* difflist_sample_ids = ctx->difflist_sample_id_bufs[tidx];
6961   uintptr_t* dosage_present = nullptr;
6962   Dosage* dosage_main = nullptr;
6963   if (ctx->dosage_presents) {
6964     dosage_present = ctx->dosage_presents[tidx];
6965     dosage_main = ctx->dosage_mains[tidx];
6966   }
6967 
6968   const uintptr_t vscore_ct = ctx->vscore_ct;
6969   const uintptr_t sample_ct = ctx->sample_ct;
6970   const uint32_t male_ct = ctx->male_ct;
6971   const uint32_t nonmale_ct = sample_ct - male_ct;
6972   const uint32_t x_code = cip->xymt_codes[kChrOffsetX];
6973   const uint32_t y_code = cip->xymt_codes[kChrOffsetY];
6974   const uint32_t is_xchr_model_1 = ctx->is_xchr_model_1;
6975   const uint32_t calc_thread_ct = GetThreadCt(arg->sharedp);
6976 
6977   const uint32_t max_sparse = sample_ct / 9;
6978 
6979   double* tmp_result_buf = ctx->tmp_result_bufs[tidx];
6980   uint16_t cur_bidxs[kVscoreBlockSize];
6981 
6982   double* dosage_vmaj = ctx->dosage_vmaj_bufs[tidx];
6983 
6984   uint32_t is_y = 0;
6985   uint32_t is_x_or_y = 0;
6986   uint32_t is_nonxy_haploid = 0;
6987   uint32_t chr_end = 0;
6988   double slope = 0.0;
6989 
6990   uint32_t dosage_ct = 0;
6991 
6992   uint32_t parity = 0;
6993   do {
6994     const uintptr_t cur_block_size = ctx->cur_block_size;
6995     const uint32_t bidx_end = ((tidx + 1) * cur_block_size) / calc_thread_ct;
6996     double* cur_results = ctx->results[parity];
6997     uint32_t* missing_cts = ctx->missing_cts[parity];
6998     uintptr_t row_idx = 0;
6999     uintptr_t variant_uidx_base;
7000     uintptr_t variant_include_bits;
7001     BitIter1Start(variant_include, ctx->read_variant_uidx_starts[tidx], &variant_uidx_base, &variant_include_bits);
7002     for (uint32_t variant_bidx = (tidx * cur_block_size) / calc_thread_ct; variant_bidx != bidx_end; ++variant_bidx) {
7003       const uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &variant_include_bits);
7004       if (variant_uidx >= chr_end) {
7005         const uint32_t chr_fo_idx = GetVariantChrFoIdx(cip, variant_uidx);
7006         const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
7007         chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
7008         is_y = 0;
7009         is_nonxy_haploid = 0;
7010         if (chr_idx == x_code) {
7011           is_x_or_y = is_xchr_model_1;
7012         } else if (chr_idx == y_code) {
7013           is_x_or_y = 1;
7014           is_y = 1;
7015         } else {
7016           is_x_or_y = 0;
7017           is_nonxy_haploid = IsSet(cip->haploid_mask, chr_idx);
7018         }
7019         slope = (is_nonxy_haploid || is_y)? 0.5 : 1.0;
7020       }
7021       double ref_freq;
7022       if (!allele_idx_offsets) {
7023         ref_freq = allele_freqs[variant_uidx];
7024       } else {
7025         ref_freq = allele_freqs[allele_idx_offsets[variant_uidx] - variant_uidx];
7026       }
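      // Mean imputation: a missing genotype contributes the expected ALT
      // dosage 2 * (1 - ref_freq), scaled by slope (0.5 on haploid
      // chromosomes, where a call represents a single allele).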
7027       const double missing_val = slope * 2 * (1.0 - ref_freq);
7028       if (!dosage_present) {
7029         uint32_t difflist_common_geno;
7030         uint32_t difflist_len;
7031         PglErr reterr = PgrGetDifflistOrGenovec(sample_include, pssi, sample_ct, max_sparse, variant_uidx, pgrp, genovec, &difflist_common_geno, raregeno, difflist_sample_ids, &difflist_len);
7032         if (unlikely(reterr)) {
7033           ctx->reterr = reterr;
7034           goto VscoreThread_err;
7035         }
7036         if (difflist_common_geno != UINT32_MAX) {
7037           if ((!is_x_or_y) && (!difflist_common_geno)) {
7038             double* target = &(cur_results[variant_bidx * vscore_ct]);
7039             uint32_t missing_ct = 0;
7040             if (!difflist_len) {
7041               ZeroDArr(vscore_ct, target);
7042             } else {
7043               ZeroTrailingNyps(difflist_len, raregeno);
7044               ZeroDArr(vscore_ct * 3, tmp_result_buf);
7045               const uint32_t word_ct_m1 = (difflist_len - 1) / kBitsPerWordD2;
7046               uint32_t loop_len = kBitsPerWordD2;
7047               for (uint32_t widx = 0; ; ++widx) {
7048                 if (widx >= word_ct_m1) {
7049                   if (widx > word_ct_m1) {
7050                     break;
7051                   }
7052                   loop_len = ModNz(difflist_len, kBitsPerWordD2);
7053                 }
7054                 // slightly nicer to work with inverted row-indexes 2..0
7055                 // (i.e. (~geno) & 3) than raw 1..3
7055                 uintptr_t raregeno_word = raregeno[widx];
7056                 uintptr_t raregeno_invword = ~raregeno_word;
7057                 missing_ct += Popcount01Word(raregeno_word & (raregeno_word >> 1) & kMask5555);
7058                 const uint32_t* cur_difflist_sample_ids = &(difflist_sample_ids[widx * kBitsPerWordD2]);
7059                 for (uint32_t uii = 0; uii != loop_len; ++uii) {
7060                   const uintptr_t sample_idx = cur_difflist_sample_ids[uii];
7061                   const uint32_t cur_invgeno = raregeno_invword & 3;
7062                   const double* incr_src = &(wts_smaj[sample_idx * vscore_ct]);
7063                   double* incr_dst = &(tmp_result_buf[cur_invgeno * vscore_ct]);
7064                   for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
7065                     incr_dst[ulii] += incr_src[ulii];
7066                   }
7067                   raregeno_invword = raregeno_invword >> 2;
7068                 }
7069               }
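              // At this point row 1 of tmp_result_buf holds the weight-sums
              // of hom-ALT samples, row 2 the hets, and row 0 the missing
              // calls (cur_invgeno == (~geno) & 3 maps genotypes 1/2/3 to
              // rows 2/1/0).  The diploid combine is 2*row1 + row2, the
              // haploid combine is row1 + 0.5*row2, and missing calls are
              // mean-imputed via missing_val * row0.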
7070               if (!is_nonxy_haploid) {
7071                 for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
7072                   target[ulii] = 2 * tmp_result_buf[ulii + vscore_ct] + tmp_result_buf[ulii + 2 * vscore_ct];
7073                 }
7074               } else {
7075                 for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
7076                   target[ulii] = tmp_result_buf[ulii + vscore_ct] + 0.5 * tmp_result_buf[ulii + 2 * vscore_ct];
7077                 }
7078               }
7079               if (missing_ct) {
7080                 for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
7081                   target[ulii] += missing_val * tmp_result_buf[ulii];
7082                 }
7083               }
7084             }
7085             if (missing_cts) {
7086               missing_cts[variant_bidx] = missing_ct;
7087             }
7088             continue;
7089           }
7090           PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, sample_ct, difflist_len, genovec);
7091         }
7092       } else {
7093         PglErr reterr = PgrGetD(sample_include, pssi, sample_ct, variant_uidx, pgrp, genovec, dosage_present, dosage_main, &dosage_ct);
7094         if (unlikely(reterr)) {
7095           ctx->reterr = reterr;
7096           goto VscoreThread_err;
7097         }
7098         if ((!is_x_or_y) && (dosage_ct <= max_sparse)) {
7099           STD_ARRAY_DECL(uint32_t, 4, genocounts);
7100           ZeroTrailingNyps(sample_ct, genovec);
7101           if (!dosage_ct) {
7102             // dosage_present contains garbage if dosage_ct == 0; might want to
7103             // append 'Unsafe' to PgrGetD and similar function names...
7104             ZeroWArr(BitCtToWordCt(sample_ct), dosage_present);
7105           }
7106           GenoarrCountInvsubsetFreqs2(genovec, dosage_present, sample_ct, sample_ct - dosage_ct, genocounts);
7107           if (genocounts[0] >= sample_ct - max_sparse) {
7108             double* target = &(cur_results[variant_bidx * vscore_ct]);
7109             if (genocounts[0] == sample_ct) {
7110               ZeroDArr(vscore_ct, target);
7111             } else {
7112               ZeroDArr(vscore_ct * 3, tmp_result_buf);
7113               const Halfword* dosage_present_alias = R_CAST(Halfword*, dosage_present);
7114               const uint32_t sample_ctl2 = DivUp(sample_ct, kBitsPerWordD2);
7115               for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
7116                 uintptr_t geno_word = genovec[widx];
7117                 if (!geno_word) {
7118                   continue;
7119                 }
7120                 const uintptr_t dosage_mask = UnpackHalfwordToWord(dosage_present_alias[widx]);
7121                 geno_word = geno_word & (~(dosage_mask * 3));
7122                 if (!geno_word) {
7123                   continue;
7124                 }
7125                 const double* cur_wts_smaj = &(wts_smaj[widx * kBitsPerWordD2 * vscore_ct]);
7126                 do {
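                  // ctzw() locates the lowest set bit; masking with ~1
                  // rounds the shift down to the containing 2-bit genotype
                  // lane, and 3 & ~(geno_word >> shift_ct) recovers the
                  // same inverted 2..0 row-index convention as the sparse
                  // difflist path.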
7127                   const uint32_t shift_ct = ctzw(geno_word) & (~1);
7128                   const uintptr_t cur_invgeno = 3 & (~(geno_word >> shift_ct));
7129                   const double* incr_src = &(cur_wts_smaj[(shift_ct / 2) * vscore_ct]);
7130                   double* incr_dst = &(tmp_result_buf[cur_invgeno * vscore_ct]);
7131                   for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
7132                     incr_dst[ulii] += incr_src[ulii];
7133                   }
7134                   geno_word &= ~((3 * k1LU) << shift_ct);
7135                 } while (geno_word);
7136               }
7137               if (!is_nonxy_haploid) {
7138                 for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
7139                   target[ulii] = 2 * tmp_result_buf[ulii + vscore_ct] + tmp_result_buf[ulii + 2 * vscore_ct];
7140                 }
7141               } else {
7142                 for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
7143                   target[ulii] = tmp_result_buf[ulii + vscore_ct] + 0.5 * tmp_result_buf[ulii + 2 * vscore_ct];
7144                 }
7145               }
7146               if (genocounts[3]) {
7147                 for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
7148                   target[ulii] += missing_val * tmp_result_buf[ulii];
7149                 }
7150               }
7151               uintptr_t sample_idx_base = 0;
7152               uintptr_t dosage_present_bits = dosage_present[0];
7153               for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
7154                 const uintptr_t sample_idx = BitIter1(dosage_present, &sample_idx_base, &dosage_present_bits);
7155                 const double* incr_src = &(wts_smaj[sample_idx * vscore_ct]);
7156                 const double cur_dosage = slope * kRecipDosageMid * u31tod(dosage_main[dosage_idx]);
7157                 for (uintptr_t ulii = 0; ulii != vscore_ct; ++ulii) {
7158                   target[ulii] += cur_dosage * incr_src[ulii];
7159                 }
7160               }
7161             }
7162             if (missing_cts) {
7163               missing_cts[variant_bidx] = genocounts[3];
7164             }
7165             continue;
7166           }
7167         }
7168       }
7169 
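      // Dense fallback: buffer this variant's rescaled dosage row, and once
      // kVscoreBlockSize rows have accumulated, compute all of their scores
      // with a single RowMajorMatrixMultiply() call; cur_bidxs[] remembers
      // which result row belongs to which variant.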
7170       if (row_idx == kVscoreBlockSize) {
7171         RowMajorMatrixMultiply(dosage_vmaj, wts_smaj, kVscoreBlockSize, vscore_ct, sample_ct, tmp_result_buf);
7172         const double* tmp_result_iter = tmp_result_buf;
7173         for (uintptr_t ulii = 0; ulii != kVscoreBlockSize; ++ulii) {
7174           const uintptr_t cur_bidx = cur_bidxs[ulii];
7175           memcpy(&(cur_results[cur_bidx * vscore_ct]), tmp_result_iter, vscore_ct * sizeof(double));
7176           tmp_result_iter = &(tmp_result_iter[vscore_ct]);
7177         }
7178         row_idx = 0;
7179       }
7180       cur_bidxs[row_idx] = variant_bidx;
7181       double* cur_row = &(dosage_vmaj[row_idx * sample_ct]);
7182       ++row_idx;
7183       PopulateRescaledDosage(genovec, dosage_present, dosage_main, slope, 0.0, missing_val, sample_ct, dosage_ct, cur_row);
7184       if (is_x_or_y) {
7185         // Instead of doing this for every variant, we could precompute
7186         // chrX/chrY weight matrices with male weights halved/nonmale weights
7187         // zeroed out.  But the number of chrY variants is typically small
7188         // enough (and how often will --xchr-model 1 be used, anyway?) that I
7189         // don't think it's worth it.
7190         uintptr_t sample_uidx_base = 0;
7191         if (is_y) {
7192           // zero out nonmale values
7193           uintptr_t sex_male_invbits = ~sex_male[0];
7194           for (uint32_t nonmale_idx = 0; nonmale_idx != nonmale_ct; ++nonmale_idx) {
7195             const uintptr_t sample_uidx = BitIter0(sex_male, &sample_uidx_base, &sex_male_invbits);
7196             cur_row[sample_uidx] = 0.0;
7197           }
7198         } else {
7199           // xchr_model 1: halve male values
7200           uintptr_t sex_male_bits = sex_male[0];
7201           for (uint32_t male_idx = 0; male_idx != male_ct; ++male_idx) {
7202             const uintptr_t sample_uidx = BitIter1(sex_male, &sample_uidx_base, &sex_male_bits);
7203             cur_row[sample_uidx] *= 0.5;
7204           }
7205         }
7206       }
7207       if (missing_cts) {
7208         ZeroTrailingNyps(sample_ct, genovec);
7209         uint32_t missing_ct;
7210         if (!dosage_ct) {
7211           if (!is_y) {
7212             missing_ct = GenoarrCountMissingUnsafe(genovec, sample_ct);
7213           } else {
7214             missing_ct = GenoarrCountMissingSubset(genovec, sex_male_interleaved_vec, sample_ct);
7215           }
7216         } else {
7217           if (!is_y) {
7218             missing_ct = GenoarrCountMissingInvsubsetUnsafe(genovec, dosage_present, sample_ct);
7219           } else {
7220             // include males, exclude dosages
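            // w & (w >> 1) sets the low bit of each nyp equal to 3
            // (missing), and PackWordToHalfwordMask5555() keeps exactly
            // those low bits, squeezing the 2-bit lanes down to 1 bit per
            // sample; two genovec words thus pack into one word that can be
            // intersected with the sex_male and dosage_present bitvectors.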
7221             const uint32_t fullword_ct = (sample_ct + kBitsPerWordD2 - 1) / kBitsPerWord;
7222             missing_ct = 0;
7223             for (uint32_t widx = 0; widx != fullword_ct; ++widx) {
7224               uintptr_t w1 = genovec[2 * widx];
7225               uintptr_t w2 = genovec[2 * widx + 1];
7226               w1 = w1 & (w1 >> 1);
7227               w2 = w2 & (w2 >> 1);
7228               w1 = PackWordToHalfwordMask5555(w1);
7229               w2 = PackWordToHalfwordMask5555(w2);
7230               const uintptr_t ww = w1 | (w2 << kBitsPerWordD2);
7231               missing_ct += PopcountWord(ww & sex_male[widx] & (~dosage_present[widx]));
7232             }
7233             if (sample_ct > fullword_ct * kBitsPerWord) {
7234               uintptr_t w1 = genovec[2 * fullword_ct];
7235               w1 = w1 & (w1 >> 1);
7236               w1 = PackWordToHalfwordMask5555(w1);
7237               missing_ct += PopcountWord(w1 & sex_male[fullword_ct] & (~dosage_present[fullword_ct]));
7238             }
7239           }
7240         }
7241         missing_cts[variant_bidx] = missing_ct;
7242       }
7243     }
7244     if (row_idx) {
7245       RowMajorMatrixMultiply(dosage_vmaj, wts_smaj, row_idx, vscore_ct, sample_ct, tmp_result_buf);
7246       const double* tmp_result_iter = tmp_result_buf;
7247       for (uintptr_t ulii = 0; ulii != row_idx; ++ulii) {
7248         uintptr_t cur_bidx = cur_bidxs[ulii];
7249         memcpy(&(cur_results[cur_bidx * vscore_ct]), tmp_result_iter, vscore_ct * sizeof(double));
7250         tmp_result_iter = &(tmp_result_iter[vscore_ct]);
7251       }
7252     }
7253   VscoreThread_err:
7254     parity = 1 - parity;
7255   } while (!THREAD_BLOCK_FINISH(arg));
7256   THREAD_RETURN;
7257 }
7258 
7259 PglErr Vscore(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* sample_include, const SampleIdInfo* siip, const uintptr_t* sex_male, const double* allele_freqs, const char* in_fname, const RangeList* col_idx_range_listp, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t max_allele_slen, VscoreFlags flags, uint32_t xchr_model, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, PgenFileInfo* pgfip, char* outname, char* outname_end) {
7260   unsigned char* bigstack_mark = g_bigstack_base;
7261   unsigned char* bigstack_end_mark = g_bigstack_end;
7262   uintptr_t line_idx = 0;
7263   char* cswritep = nullptr;
7264   FILE* binfile = nullptr;
7265   PglErr reterr = kPglRetSuccess;
7266   TextStream txs;
7267   ThreadGroup tg;
7268   CompressStreamState css;
7269   PreinitTextStream(&txs);
7270   PreinitThreads(&tg);
7271   PreinitCstream(&css);
7272   {
7273     // unsurprisingly, lots of overlap with --score
7274     const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
7275     if (!xchr_model) {
7276       uint32_t x_code;
7277       if (XymtExists(cip, kChrOffsetX, &x_code)) {
7278         uint32_t x_chr_fo_idx = cip->chr_idx_to_foidx[x_code];
7279         uint32_t x_start = cip->chr_fo_vidx_start[x_chr_fo_idx];
7280         uint32_t x_end = cip->chr_fo_vidx_start[x_chr_fo_idx + 1];
7281         if (!AllBitsAreZero(variant_include, x_start, x_end)) {
7282           uintptr_t* variant_include_no_x;
7283           if (unlikely(bigstack_alloc_w(raw_variant_ctl, &variant_include_no_x))) {
7284             goto Vscore_ret_NOMEM;
7285           }
7286           memcpy(variant_include_no_x, variant_include, raw_variant_ctl * sizeof(intptr_t));
7287           variant_ct -= PopcountBitRange(variant_include, x_start, x_end);
7288           if (!variant_ct) {
7289             logerrputs("Error: No --variant-score variants remaining after --xchr-model 0.\n");
7290             goto Vscore_ret_INCONSISTENT_INPUT;
7291           }
7292           ClearBitsNz(x_start, x_end, variant_include_no_x);
7293           variant_include = variant_include_no_x;
7294         }
7295       }
7296     } else if (xchr_model == 2) {
7297       xchr_model = 0;
7298     }
7299     // now xchr_model is set iff it's 1
7300 
7301     // see KeepFcol() and SampleSortFileMap()
7302     char* line_start;
7303     XidMode xid_mode;
7304     reterr = OpenAndLoadXidHeader(in_fname, "variant-score", (siip->sids || (siip->flags & kfSampleIdStrictSid0))? kfXidHeaderFixedWidth : kfXidHeaderFixedWidthIgnoreSid, kTextStreamBlenFast, &txs, &xid_mode, &line_idx, &line_start, nullptr);
7305     if (unlikely(reterr)) {
7306       if (reterr == kPglRetEof) {
7307         logerrputs("Error: Empty --variant-score file.\n");
7308         reterr = kPglRetMalformedInput;
7309       }
7310       goto Vscore_ret_1;
7311     }
7312     const uint32_t id_col_ct = GetXidColCt(xid_mode);
7313     const uint32_t col_ct = CountTokens(line_start);
7314     if (unlikely(id_col_ct == col_ct)) {
7315       logerrputs("Error: No score columns in --variant-score file.\n");
7316       goto Vscore_ret_MALFORMED_INPUT;
7317     }
7318     uintptr_t vscore_ct;
7319     uint32_t* col_idx_deltas;
7320     if (!col_idx_range_listp->name_ct) {
7321       vscore_ct = col_ct - id_col_ct;
7322       if (unlikely(bigstack_alloc_u32(vscore_ct, &col_idx_deltas))) {
7323         goto Vscore_ret_NOMEM;
7324       }
7325       for (uint32_t uii = 0; uii != vscore_ct; ++uii) {
7326         col_idx_deltas[uii] = 1;
7327       }
7328     } else {
7329       const uint32_t col_ctl = BitCtToWordCt(col_ct);
7330       uintptr_t* vscore_col_bitarr;
7331       if (unlikely(bigstack_calloc_w(col_ctl, &vscore_col_bitarr))) {
7332         goto Vscore_ret_NOMEM;
7333       }
7334       if (unlikely(NumericRangeListToBitarr(col_idx_range_listp, col_ct, 1, 0, vscore_col_bitarr))) {
7335         goto Vscore_ret_MISSING_TOKENS;
7336       }
7337       if (vscore_col_bitarr[0] & ((1 << id_col_ct) - 1)) {
7338         logerrputs("Error: --vscore-col-nums argument overlaps with ID columns.\n");
7339         goto Vscore_ret_INCONSISTENT_INPUT;
7340       }
7341       vscore_ct = PopcountWords(vscore_col_bitarr, col_ctl);
7342       // since we don't allow overflow, this should be guaranteed to be
7343       // positive
7344       assert(vscore_ct);
7345       if (unlikely(bigstack_alloc_u32(vscore_ct, &col_idx_deltas))) {
7346         goto Vscore_ret_NOMEM;
7347       }
7348       uintptr_t col_uidx_base = 0;
7349       uintptr_t vscore_col_bitarr_bits = vscore_col_bitarr[0];
7350       for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx) {
7351         const uint32_t col_uidx = BitIter1(vscore_col_bitarr, &col_uidx_base, &vscore_col_bitarr_bits);
7352         col_idx_deltas[vscore_idx] = col_uidx;
7353       }
7354       // now convert to deltas
7355       for (uintptr_t vscore_idx = vscore_ct - 1; vscore_idx; --vscore_idx) {
7356         col_idx_deltas[vscore_idx] -= col_idx_deltas[vscore_idx - 1];
7357       }
7358       col_idx_deltas[0] -= id_col_ct - 1;
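      // Example: with id_col_ct == 2 and selected 0-based columns {2, 4, 7},
      // col_idx_deltas becomes {1, 2, 3}; parsing then starts from the last
      // ID token and advances by one delta per score column, so each line
      // is scanned in a single left-to-right pass.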
7359     }
7360     char** vscore_names;
7361     if (unlikely(bigstack_end_alloc_cp(vscore_ct, &vscore_names))) {
7362       goto Vscore_ret_NOMEM;
7363     }
7364     const uint32_t is_header_line = (line_start[0] == '#');
7365     unsigned char* tmp_alloc_base = g_bigstack_base;
7366     unsigned char* tmp_alloc_end = g_bigstack_end;
7367     if (is_header_line) {
7368       const char* name_iter = NextTokenMult0(line_start, id_col_ct - 1);
7369       for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx) {
7370         name_iter = NextTokenMult(name_iter, col_idx_deltas[vscore_idx]);
7371         const char* name_end = CurTokenEnd(name_iter);
7372         // don't actually need to enforce unique names, though we could print a
7373         // warning later
7374         const uint32_t cur_slen = name_end - name_iter;
7375         if (cur_slen > kMaxIdSlen) {
7376           snprintf(g_logbuf, kLogbufSize, "Error: Variant-score name in column %" PRIuPTR " of %s is too long.\n", vscore_idx + id_col_ct + 1, in_fname);
7377           goto Vscore_ret_MALFORMED_INPUT_WW;
7378         }
7379         if (StoreStringAtEnd(tmp_alloc_base, name_iter, cur_slen, &tmp_alloc_end, &(vscore_names[vscore_idx]))) {
7380           goto Vscore_ret_NOMEM;
7381         }
7382         name_iter = name_end;
7383       }
7384       ++line_idx;
7385       line_start = TextGet(&txs);
7386     } else {
7387       for (uintptr_t vscore_num = 1; vscore_num <= vscore_ct; ++vscore_num) {
7388         const uint32_t cur_blen = 7 + UintSlen(vscore_num);
7389         if (PtrWSubCk(tmp_alloc_base, cur_blen, &tmp_alloc_end)) {
7390           goto Vscore_ret_NOMEM;
7391         }
7392         char* cur_name_iter = R_CAST(char*, tmp_alloc_end);
7393         vscore_names[vscore_num - 1] = cur_name_iter;
7394         cur_name_iter = strcpya_k(cur_name_iter, "VSCORE");
7395         cur_name_iter = u32toa(vscore_num, cur_name_iter);
7396         *cur_name_iter = '\0';
7397       }
7398     }
7399     BigstackEndSet(tmp_alloc_end);
7400     uint32_t* xid_map;
7401     char* sorted_xidbox;
7402     uintptr_t max_xid_blen;
7403     reterr = SortedXidboxInitAlloc(sample_include, siip, sample_ct, 0, xid_mode, 0, &sorted_xidbox, &xid_map, &max_xid_blen);
7404     if (unlikely(reterr)) {
7405       goto Vscore_ret_1;
7406     }
7407     const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
7408 #ifndef __LP64__
7409     if (sample_ct * S_CAST(uint64_t, vscore_ct) >= 0x80000000U / sizeof(double)) {
7410       goto Vscore_ret_NOMEM;
7411     }
7412 #endif
7413     char* idbuf;
7414     uintptr_t* already_seen;
7415     double* raw_wts;
7416     uint32_t* sample_uidx_order;
7417     if (unlikely(
7418             bigstack_alloc_c(siip->max_sample_id_blen, &idbuf) ||
7419             bigstack_alloc_u32(sample_ct, &sample_uidx_order) ||
7420             bigstack_alloc_d(sample_ct * vscore_ct, &raw_wts) ||
7421             bigstack_end_calloc_w(raw_sample_ctl, &already_seen))) {
7422       goto Vscore_ret_NOMEM;
7423     }
7424     uintptr_t miss_ct = 0;
7425     uint32_t hit_ct = 0;
7426 
7427     for (double* raw_wts_iter = raw_wts; line_start; ++line_idx, line_start = TextGet(&txs)) {
7428       if (unlikely(line_start[0] == '#')) {
7429         snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --variant-score file starts with a '#'. (This is only permitted before the first nonheader line, and if a #FID/IID header line is present it must denote the end of the header block.)\n", line_idx);
7430         goto Vscore_ret_MALFORMED_INPUT_WW;
7431       }
7432       const char* linebuf_iter = line_start;
7433       uint32_t sample_uidx;
7434       if (SortedXidboxReadFind(sorted_xidbox, xid_map, max_xid_blen, sample_ct, 0, xid_mode, &linebuf_iter, &sample_uidx, idbuf)) {
7435         if (unlikely(!linebuf_iter)) {
7436           goto Vscore_ret_MISSING_TOKENS;
7437         }
7438         ++miss_ct;
7439         continue;
7440       }
7441       if (unlikely(IsSet(already_seen, sample_uidx))) {
7442         char* tab_iter = AdvToDelim(idbuf, '\t');
7443         *tab_iter = ' ';
7444         if (xid_mode & kfXidModeFlagSid) {
7445           *AdvToDelim(&(tab_iter[1]), '\t') = ' ';
7446         }
7447         snprintf(g_logbuf, kLogbufSize, "Error: Duplicate sample ID '%s' in --variant-score file.\n", idbuf);
7448         goto Vscore_ret_MALFORMED_INPUT_WW;
7449       }
7450       SetBit(sample_uidx, already_seen);
7451       sample_uidx_order[hit_ct] = sample_uidx;
7452       for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx, ++raw_wts_iter) {
7453         linebuf_iter = NextTokenMult(linebuf_iter, col_idx_deltas[vscore_idx]);
7454         if (unlikely(!linebuf_iter)) {
7455           goto Vscore_ret_MISSING_TOKENS;
7456         }
7457         const char* token_end = ScantokDouble(linebuf_iter, raw_wts_iter);
7458         if (unlikely(!token_end)) {
7459           token_end = CurTokenEnd(linebuf_iter);
7460           *K_CAST(char*, token_end) = '\0';
7461           snprintf(g_logbuf, kLogbufSize, "Error: Invalid coefficient '%s' on line %" PRIuPTR " of --variant-score file.\n", linebuf_iter, line_idx);
7462           goto Vscore_ret_MALFORMED_INPUT_WW;
7463         }
7464         linebuf_iter = token_end;
7465       }
7466       ++hit_ct;
7467     }
7468     if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
7469       goto Vscore_ret_TSTREAM_FAIL;
7470     }
7471     if (unlikely(CleanupTextStream2(in_fname, &txs, &reterr))) {
7472       goto Vscore_ret_1;
7473     }
7474     if (!hit_ct) {
7475       logerrputs("Error: No valid entries in --variant-score file.\n");
7476       goto Vscore_ret_INCONSISTENT_INPUT;
7477     }
7478     sample_include = already_seen;
7479     sample_ct = hit_ct;
7480 #if defined(__LP64__) && !defined(LAPACK_ILP64)
7481     if (sample_ct * vscore_ct > 0x7fffffff) {
7482       logerrputs("Error: --variant-score input matrix too large for this " PROG_NAME_STR " build.  If this\nis really the computation you want, use a " PROG_NAME_STR " build with large-matrix\nsupport.\n");
7483       goto Vscore_ret_INCONSISTENT_INPUT;
7484     }
7485 #endif
7486     VscoreCtx ctx;
7487     ctx.variant_include = variant_include;
7488     ctx.cip = cip;
7489     ctx.allele_idx_offsets = allele_idx_offsets;
7490     ctx.allele_freqs = allele_freqs;
7491     ctx.sample_include = sample_include;
7492     const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
7493     const uint32_t dosage_is_present = pgfip->gflags & kfPgenGlobalDosagePresent;
7494     uint32_t calc_thread_ct = max_thread_ct;
7495     uint32_t compress_thread_ct = 1;
7496     const uint32_t output_zst = (flags / kfVscoreZs) & 1;
7497     snprintf(outname_end, kMaxOutfnameExtBlen, ".vscore");
7498     if (flags & kfVscoreBin) {
7499       snprintf(&(outname_end[7]), kMaxOutfnameExtBlen - 7, ".cols");
7500       if (unlikely(fopen_checked(outname, FOPEN_WB, &binfile))) {
7501         goto Vscore_ret_OPEN_FAIL;
7502       }
7503       for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx) {
7504         fputs(vscore_names[vscore_idx], binfile);
7505 #ifdef _WIN32
7506         putc_unlocked('\r', binfile);
7507 #endif
7508         putc_unlocked('\n', binfile);
7509       }
7510       if (unlikely(fclose_null(&binfile))) {
7511         goto Vscore_ret_WRITE_FAIL;
7512       }
7513       snprintf(&(outname_end[7]), kMaxOutfnameExtBlen - 7, ".bin");
7514       if (unlikely(fopen_checked(outname, FOPEN_WB, &binfile))) {
7515         goto Vscore_ret_OPEN_FAIL;
7516       }
7517       snprintf(&(outname_end[7]), kMaxOutfnameExtBlen - 7, ".vars");
7518       if (output_zst) {
7519         snprintf(&(outname_end[12]), kMaxOutfnameExtBlen - 12, ".zst");
7520       }
7521     } else if (output_zst) {
7522       snprintf(&(outname_end[7]), kMaxOutfnameExtBlen - 7, ".zst");
7523       if (calc_thread_ct > 1) {
7524         // The more samples there are, the higher the compute:compress ratio we
7525         // want, though this is not a linear relationship due to the sparse
7526         // optimization.
7527         // 1:1 split seems to work well for a few thousand samples; I'm
7528         // guessing that ~7:1 is better for hundreds of thousands.
7529         if (sample_ct < 8192) {
7530           compress_thread_ct = calc_thread_ct / 2;
7531         } else {
7532           const uint32_t log2_sample_ct_m10 = bsru32(sample_ct) - 10;
7533           // 3/8, 4/16, 5/24, ...
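          // e.g. sample_ct == 65536 gives log2_sample_ct_m10 == 6, so
          // roughly 6/32 of the threads compress; the compressor share
          // keeps shrinking as sample_ct grows.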
7534           compress_thread_ct = (calc_thread_ct * log2_sample_ct_m10) / (8 * (log2_sample_ct_m10 - 2));
7535           if (!compress_thread_ct) {
7536             compress_thread_ct = 1;
7537           }
7538         }
7539         calc_thread_ct -= compress_thread_ct;
7540       }
7541     }
7542     {
7543       uint32_t* sample_include_cumulative_popcounts;
7544       double* wts_smaj;
7545       if (unlikely(
7546               bigstack_end_alloc_u32(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
7547               bigstack_end_alloc_d(sample_ct * vscore_ct, &wts_smaj))) {
7548         goto Vscore_ret_NOMEM;
7549       }
7550       FillCumulativePopcounts(sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
7551       ctx.sample_include_cumulative_popcounts = sample_include_cumulative_popcounts;
7552       logprintfww("--variant-score: %" PRIuPTR " score-vector%s loaded for %u sample%s.\n", vscore_ct, (vscore_ct == 1)? "" : "s", sample_ct, (sample_ct == 1)? "" : "s");
7553       if (miss_ct) {
7554         logerrprintf("Warning: %" PRIuPTR " line%s skipped in --variant-score file.\n", miss_ct, (miss_ct == 1)? "" : "s");
7555       }
7556       const double* wts_read_iter = raw_wts;
7557       for (uint32_t uii = 0; uii != sample_ct; ++uii) {
7558         const uint32_t sample_uidx = sample_uidx_order[uii];
7559         const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_uidx);
7560         memcpy(&(wts_smaj[sample_idx * vscore_ct]), wts_read_iter, vscore_ct * sizeof(double));
7561         wts_read_iter = &(wts_read_iter[vscore_ct]);
7562       }
7563       ctx.wts_smaj = wts_smaj;
7564       BigstackReset(bigstack_mark);
      const uint32_t sample_ctv = BitCtToVecCt(sample_ct);
      uintptr_t* sex_male_collapsed;
      uintptr_t* sex_male_interleaved_vec;
      if (unlikely(
              bigstack_alloc_w(sample_ctl, &sex_male_collapsed) ||
              bigstack_alloc_w(sample_ctv * kWordsPerVec, &sex_male_interleaved_vec) ||
              bigstack_alloc_wp(calc_thread_ct, &ctx.raregenos) ||
              bigstack_alloc_u32p(calc_thread_ct, &ctx.difflist_sample_id_bufs) ||
              bigstack_alloc_dp(calc_thread_ct, &ctx.dosage_vmaj_bufs) ||
              bigstack_alloc_dp(calc_thread_ct, &ctx.tmp_result_bufs))) {
        goto Vscore_ret_NOMEM;
      }
      CopyBitarrSubset(sex_male, sample_include, sample_ct, sex_male_collapsed);
      FillInterleavedMaskVec(sex_male_collapsed, sample_ctv, sex_male_interleaved_vec);
      ctx.sex_male_collapsed = sex_male_collapsed;
      ctx.sex_male_interleaved_vec = sex_male_interleaved_vec;
    }
    ctx.vscore_ct = vscore_ct;
    ctx.sample_ct = sample_ct;
    const uint32_t male_ct = PopcountWords(ctx.sex_male_collapsed, sample_ctl);
    ctx.male_ct = male_ct;
    ctx.is_xchr_model_1 = xchr_model;

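    // Each kfVscoreCol* constant is a single power-of-2 bit, so dividing by
    // it and masking with 1 extracts the corresponding column flag as 0/1.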
    const uint32_t chr_col = (flags / kfVscoreColChrom) & 1;
    char* chr_buf = nullptr;
    uint32_t max_chr_blen = 0;
    if (chr_col) {
      max_chr_blen = GetMaxChrSlen(cip) + 1;
      if (unlikely(bigstack_alloc_c(max_chr_blen, &chr_buf))) {
        goto Vscore_ret_NOMEM;
      }
    }
    const uint32_t ref_col = (flags / kfVscoreColRef) & 1;
    const uint32_t alt1_col = (flags / kfVscoreColAlt1) & 1;
    const uint32_t alt_col = (flags / kfVscoreColAlt) & 1;
    uintptr_t overflow_buf_size;
    if (binfile) {
      overflow_buf_size = kCompressStreamBlock + kMaxIdSlen + 16;
    } else {
      overflow_buf_size = kCompressStreamBlock + max_chr_blen * chr_col + kMaxIdSlen + 128 + (24 * k1LU) * vscore_ct + MAXV(ref_col + alt1_col, alt_col) * max_allele_slen;
    }
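    // The non-.bin bound is a worst-case single output line: chromosome code,
    // variant ID, roughly 128 bytes of slack for the fixed numeric columns,
    // 24 bytes per score value (tab plus a dtoa_g-rendered double), and
    // REF/ALT allele text.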
    reterr = InitCstreamAlloc(outname, 0, output_zst, compress_thread_ct, overflow_buf_size, &css, &cswritep);
    if (unlikely(reterr)) {
      goto Vscore_ret_1;
    }
    const uint32_t nmiss_col = (flags / kfVscoreColNmiss) & 1;
    const uint32_t nobs_col = (flags / kfVscoreColNobs) & 1;
    if (!binfile) {
      *cswritep++ = '#';
      if (chr_col) {
        cswritep = strcpya_k(cswritep, "CHROM\t");
      }
      if (flags & kfVscoreColPos) {
        cswritep = strcpya_k(cswritep, "POS\t");
      } else {
        variant_bps = nullptr;
      }
      cswritep = strcpya_k(cswritep, "ID");
      if (ref_col) {
        cswritep = strcpya_k(cswritep, "\tREF");
      }
      if (alt1_col) {
        cswritep = strcpya_k(cswritep, "\tALT1");
      }
      if (alt_col) {
        cswritep = strcpya_k(cswritep, "\tALT");
      }
      if (flags & kfVscoreColAltfreq) {
        cswritep = strcpya_k(cswritep, "\tALT_FREQ");
      } else {
        allele_freqs = nullptr;
      }
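      // Nulling variant_bps/allele_freqs when their columns are disabled lets
      // the per-variant write loop below key on the pointers instead of
      // rechecking the column flags.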
      if (nmiss_col) {
        cswritep = strcpya_k(cswritep, "\tMISSING_CT");
      }
      if (nobs_col) {
        cswritep = strcpya_k(cswritep, "\tOBS_CT");
      }
      for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx) {
        *cswritep++ = '\t';
        cswritep = strcpya(cswritep, vscore_names[vscore_idx]);
        if (unlikely(Cswrite(&css, &cswritep))) {
          goto Vscore_ret_WRITE_FAIL;
        }
      }
      AppendBinaryEoln(&cswritep);
    }

    if (nmiss_col || nobs_col) {
      if (unlikely(
              bigstack_alloc_u32(kPglVblockSize, &ctx.missing_cts[0]) ||
              bigstack_alloc_u32(kPglVblockSize, &ctx.missing_cts[1]))) {
        goto Vscore_ret_NOMEM;
      }
    } else {
      ctx.missing_cts[0] = nullptr;
      ctx.missing_cts[1] = nullptr;
    }

    const uint32_t max_returned_difflist_len = 2 * (raw_sample_ct / kPglMaxDifflistLenDivisor);
    // * Per-thread raregeno buffers must have space for
    //   max_returned_difflist_len nyps, and difflist_sample_ids buffers need
    //   space for that many uint32s.
    // * Per-thread dosage_vmaj buffers must have space for
    //   kVscoreBlockSize * sample_ct elements.
    // * Per-thread result buffers must have space for kVscoreBlockSize *
    //   vscore_ct elements.
    const uintptr_t thread_xalloc_cacheline_ct = DivUp(max_returned_difflist_len, kNypsPerCacheline) + DivUp(max_returned_difflist_len, kInt32PerCacheline) + DivUp(kVscoreBlockSize * S_CAST(uintptr_t, sample_ct) * sizeof(double), kCacheline) + DivUp(kVscoreBlockSize * vscore_ct * sizeof(double), kCacheline);

    // ctx.results must have space for 2 * vscore_ct * read_block_size doubles.
    const uintptr_t per_variant_xalloc_byte_ct = 2 * vscore_ct * sizeof(double);
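    // (The factor of 2 reflects the double-buffering below: ctx.results[0]
    // and ctx.results[1] hold alternating blocks.)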
    STD_ARRAY_DECL(unsigned char*, 2, main_loadbufs);
    // defensive
    ctx.dosage_presents = nullptr;
    ctx.dosage_mains = nullptr;
    uint32_t read_block_size;
    if (unlikely(PgenMtLoadInit(variant_include, sample_ct, variant_ct, bigstack_left(), pgr_alloc_cacheline_ct, thread_xalloc_cacheline_ct, per_variant_xalloc_byte_ct, 0, pgfip, &calc_thread_ct, &ctx.genovecs, nullptr, nullptr, nullptr, dosage_is_present? (&ctx.dosage_presents) : nullptr, dosage_is_present? (&ctx.dosage_mains) : nullptr, nullptr, nullptr, &read_block_size, nullptr, main_loadbufs, &ctx.pgr_ptrs, &ctx.read_variant_uidx_starts))) {
      goto Vscore_ret_NOMEM;
    }
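    // PgenMtLoadInit may have reduced calc_thread_ct to fit the remaining
    // workspace, so the thread group is only sized now.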
    if (unlikely(SetThreadCt(calc_thread_ct, &tg))) {
      goto Vscore_ret_NOMEM;
    }
    {
      // could vector-align individual allocations and only cacheline-align at
      // thread boundaries, but the savings are microscopic
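      // These unchecked bigstack_alloc_raw calls are safe:
      // thread_xalloc_cacheline_ct above budgeted exactly these four buffers
      // per thread, and PgenMtLoadInit reserved that much space.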
      const uintptr_t raregeno_alloc = kCacheline * DivUp(max_returned_difflist_len, kNypsPerCacheline);
      const uintptr_t difflist_sample_ids_alloc = RoundUpPow2(max_returned_difflist_len * sizeof(int32_t), kCacheline);
      const uintptr_t dosage_vmaj_alloc = RoundUpPow2(kVscoreBlockSize * S_CAST(uintptr_t, sample_ct) * sizeof(double), kCacheline);
      const uintptr_t tmp_result_alloc = RoundUpPow2(kVscoreBlockSize * vscore_ct * sizeof(double), kCacheline);
      for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
        ctx.raregenos[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(raregeno_alloc));
        ctx.difflist_sample_id_bufs[tidx] = S_CAST(uint32_t*, bigstack_alloc_raw(difflist_sample_ids_alloc));
        ctx.dosage_vmaj_bufs[tidx] = S_CAST(double*, bigstack_alloc_raw(dosage_vmaj_alloc));
        ctx.tmp_result_bufs[tidx] = S_CAST(double*, bigstack_alloc_raw(tmp_result_alloc));
      }
    }
    const uintptr_t results_byte_ct = RoundUpPow2(per_variant_xalloc_byte_ct * read_block_size, kCacheline);
    ctx.results[0] = S_CAST(double*, bigstack_alloc_raw(results_byte_ct));
    ctx.results[1] = S_CAST(double*, bigstack_alloc_raw(results_byte_ct));
    assert(g_bigstack_base <= g_bigstack_end);
    ctx.reterr = kPglRetSuccess;
    SetThreadFuncAndData(VscoreThread, &ctx, &tg);

    fputs("--variant-score: 0%", stdout);
    fflush(stdout);
    const uint32_t y_code = cip->xymt_codes[kChrOffsetY];
    // Main workflow:
    // 1. Set n=0, load/skip block 0
    //
    // 2. Spawn threads processing block n
    // 3. If n>0, write results for block (n-1)
    // 4. Increment n by 1
    // 5. Load/skip block n unless eof
    // 6. Join threads
    // 7. Goto step 2 unless eof
    //
    // 8. Write results for last block
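    // The parity flag implements the double-buffering: while the worker
    // threads fill one results buffer for block n, the main thread drains the
    // other buffer for block n-1.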
    uintptr_t write_variant_uidx_base = 0;
    uintptr_t cur_bits = variant_include[0];
    uint32_t prev_block_size = 0;
    uint32_t pct = 0;
    uint32_t next_print_variant_idx = variant_ct / 100;
    uint32_t parity = 0;
    uint32_t read_block_idx = 0;
    uint32_t chr_fo_idx = UINT32_MAX;
    uint32_t chr_end = 0;
    uint32_t chr_buf_blen = 0;
    uint32_t cur_sample_ct = 0;
    uint32_t cur_allele_ct = 2;
    for (uint32_t variant_idx = 0; ; ++read_block_idx) {
      const uint32_t cur_block_size = MultireadNonempty(variant_include, &tg, raw_variant_ct, read_block_size, pgfip, &read_block_idx, &reterr);
      if (unlikely(reterr)) {
        goto Vscore_ret_PGR_FAIL;
      }
      if (variant_idx) {
        JoinThreads(&tg);
        reterr = ctx.reterr;
        if (unlikely(reterr)) {
          goto Vscore_ret_PGR_FAIL;
        }
      }
      if (!IsLastBlock(&tg)) {
        // it may make sense to put this boilerplate into its own function,
        // too...
        ctx.cur_block_size = cur_block_size;
        ComputeUidxStartPartition(variant_include, cur_block_size, calc_thread_ct, read_block_idx * read_block_size, ctx.read_variant_uidx_starts);
        PgrCopyBaseAndOffset(pgfip, calc_thread_ct, ctx.pgr_ptrs);
        if (variant_idx + cur_block_size == variant_ct) {
          DeclareLastThreadBlock(&tg);
        }
        if (unlikely(SpawnThreads(&tg))) {
          goto Vscore_ret_THREAD_CREATE_FAIL;
        }
      }
      parity = 1 - parity;
      if (variant_idx) {
        // write *previous* block results
        const double* cur_results_iter = ctx.results[parity];
        const uint32_t* cur_missing_cts = ctx.missing_cts[parity];
        for (uint32_t variant_bidx = 0; variant_bidx != prev_block_size; ++variant_bidx) {
          const uint32_t write_variant_uidx = BitIter1(variant_include, &write_variant_uidx_base, &cur_bits);
          if (write_variant_uidx >= chr_end) {
            do {
              ++chr_fo_idx;
              chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
            } while (write_variant_uidx >= chr_end);
            const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
            cur_sample_ct = (chr_idx == y_code)? male_ct : sample_ct;
            if (chr_buf) {
              char* chr_name_end = chrtoa(cip, chr_idx, chr_buf);
              *chr_name_end = '\t';
              chr_buf_blen = 1 + S_CAST(uintptr_t, chr_name_end - chr_buf);
            }
          }
          if (binfile) {
            // may as well write variant-ID file in this loop
            cswritep = strcpya(cswritep, variant_ids[write_variant_uidx]);
            AppendBinaryEoln(&cswritep);
            if (unlikely(Cswrite(&css, &cswritep))) {
              goto Vscore_ret_WRITE_FAIL;
            }
            continue;
          }
          if (chr_col) {
            cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
          }
          if (variant_bps) {
            cswritep = u32toa_x(variant_bps[write_variant_uidx], '\t', cswritep);
          }
          cswritep = strcpya(cswritep, variant_ids[write_variant_uidx]);
          uintptr_t allele_idx_offset_base = write_variant_uidx * 2;
          if (allele_idx_offsets) {
            allele_idx_offset_base = allele_idx_offsets[write_variant_uidx];
            cur_allele_ct = allele_idx_offsets[write_variant_uidx + 1] - allele_idx_offset_base;
          }
          const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
          if (ref_col) {
            *cswritep++ = '\t';
            cswritep = strcpya(cswritep, cur_alleles[0]);
          }
          if (alt1_col) {
            *cswritep++ = '\t';
            cswritep = strcpya(cswritep, cur_alleles[1]);
          }
          if (alt_col) {
            *cswritep++ = '\t';
            for (uint32_t allele_idx = 1; allele_idx != cur_allele_ct; ++allele_idx) {
              if (unlikely(Cswrite(&css, &cswritep))) {
                goto Vscore_ret_WRITE_FAIL;
              }
              cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
            }
            --cswritep;
          }
          if (allele_freqs) {
            *cswritep++ = '\t';
            cswritep = dtoa_g(1.0 - allele_freqs[allele_idx_offset_base - write_variant_uidx], cswritep);
          }
          if (nmiss_col) {
            *cswritep++ = '\t';
            cswritep = u32toa(cur_missing_cts[variant_bidx], cswritep);
          }
          if (nobs_col) {
            *cswritep++ = '\t';
            cswritep = u32toa(cur_sample_ct - cur_missing_cts[variant_bidx], cswritep);
          }
          for (uintptr_t vscore_idx = 0; vscore_idx != vscore_ct; ++vscore_idx) {
            *cswritep++ = '\t';
            cswritep = dtoa_g(*cur_results_iter++, cswritep);
          }
          AppendBinaryEoln(&cswritep);
          if (unlikely(Cswrite(&css, &cswritep))) {
            goto Vscore_ret_WRITE_FAIL;
          }
        }
        if (binfile) {
          if (unlikely(fwrite_checked(cur_results_iter, vscore_ct * prev_block_size * sizeof(double), binfile))) {
            goto Vscore_ret_WRITE_FAIL;
          }
        }
        if (variant_idx == variant_ct) {
          break;
        }
        if (variant_idx >= next_print_variant_idx) {
          if (pct > 10) {
            putc_unlocked('\b', stdout);
          }
          pct = (variant_idx * 100LLU) / variant_ct;
          printf("\b\b%u%%", pct++);
          fflush(stdout);
          next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
        }
      }
      prev_block_size = cur_block_size;
      variant_idx += cur_block_size;
      pgfip->block_base = main_loadbufs[parity];
    }
    if (unlikely(CswriteCloseNull(&css, cswritep))) {
      goto Vscore_ret_WRITE_FAIL;
    }
    putc_unlocked('\r', stdout);
    if (!binfile) {
      logprintfww("--variant-score: Results written to %s .\n", outname);
    } else {
      if (unlikely(fclose_null(&binfile))) {
        goto Vscore_ret_WRITE_FAIL;
      }
      outname_end[8] = '\0';
      logprintfww("--variant-score: Score matrix written to %sbin , and associated column and variant ID labels written to %scols and %svars%s .\n", outname, outname, outname, output_zst? ".zst" : "");
    }
  }
  while (0) {
  Vscore_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  Vscore_ret_OPEN_FAIL:
    reterr = kPglRetOpenFail;
    break;
  Vscore_ret_TSTREAM_FAIL:
    TextStreamErrPrint("--variant-score file", &txs);
    break;
  Vscore_ret_PGR_FAIL:
    PgenErrPrintN(reterr);
    break;
  Vscore_ret_WRITE_FAIL:
    reterr = kPglRetWriteFail;
    break;
  Vscore_ret_MALFORMED_INPUT_WW:
    WordWrapB(0);
    logerrputsb();
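    // fall through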
  Vscore_ret_MALFORMED_INPUT:
    reterr = kPglRetMalformedInput;
    break;
  Vscore_ret_MISSING_TOKENS:
    logerrprintfww("Error: Line %" PRIuPTR " of --variant-score file has fewer tokens than expected.\n", line_idx);
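    // fall through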
  Vscore_ret_INCONSISTENT_INPUT:
    reterr = kPglRetInconsistentInput;
    break;
  Vscore_ret_THREAD_CREATE_FAIL:
    reterr = kPglRetThreadCreateFail;
    break;
  }
 Vscore_ret_1:
  fclose_cond(binfile);
  CswriteCloseCond(&css, cswritep);
  CleanupThreads(&tg);
  CleanupTextStream2("--variant-score file", &txs, &reterr);
  BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
  return reterr;
}

#ifdef __cplusplus
}  // namespace plink2
#endif