1 // This file is part of PLINK 1.90, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 
17 
18 #include "plink_common.h"
19 
20 #include <stddef.h>
21 #include "plink_assoc.h"
22 #include "plink_glm.h"
23 #include "plink_ld.h"
24 #include "plink_stats.h"
25 #include "pigz.h"
26 
27 #define MULTIPLEX_LD 1920
28 #define MULTIPLEX_2LD (MULTIPLEX_LD * 2)
29 
// Resets the LD, epistasis, and clump option structs to their startup
// defaults.  Nothing is allocated here: every pointer member is set to
// nullptr and the SNP range list is put in its initial state via
// range_list_init().
//
// ldip:     LD / pruning / haplotype-block / flipscan / show-tags settings.
// epi_ip:   epistasis settings.
// clump_ip: clump settings.
void ld_epi_init(Ld_info* ldip, Epi_info* epi_ip, Clump_info* clump_ip) {
  // ---- Ld_info: owned strings / lists start out empty ----
  ldip->snpstr = nullptr;
  ldip->show_tags_fname = nullptr;
  range_list_init(&(ldip->snps_rl));

  // ---- Ld_info: flags and pruning parameters ----
  ldip->modifier = 0;
  ldip->prune_window_size = 0;
  ldip->prune_window_incr = 0;
  ldip->prune_last_param = 0.0;

  // ---- Ld_info: pairwise-report window (0xffffffffU / -1 defaults) ----
  ldip->window_size = 0xffffffffU;
  ldip->window_bp = 0xffffffffU;
  ldip->window_cm = -1;
  ldip->window_r2 = 0.2;

  // ---- Ld_info: haplotype-block parameters ----
  ldip->blocks_max_bp = 0xffffffffU;
  ldip->blocks_min_maf = 0.05;
  ldip->blocks_strong_lowci_outer = 71;
  ldip->blocks_strong_lowci = 72;
  ldip->blocks_strong_highci = 97;
  ldip->blocks_recomb_highci = 89;
  ldip->blocks_inform_frac = 0.95;

  // ---- Ld_info: flipscan and show-tags parameters ----
  ldip->flipscan_window_size = 10;
  ldip->flipscan_window_bp = 0xffffffffU;
  ldip->flipscan_thresh = 0.5;
  ldip->show_tags_bp = 250000;
  ldip->show_tags_r2 = 0.8;

  // ---- Epi_info ----
  epi_ip->ld_mkr1 = nullptr;
  epi_ip->ld_mkr2 = nullptr;
  epi_ip->twolocus_mkr1 = nullptr;
  epi_ip->twolocus_mkr2 = nullptr;
  epi_ip->summary_merge_prefix = nullptr;
  epi_ip->modifier = 0;
  epi_ip->case_only_gap = 1000000;
  epi_ip->epi1 = 0.0;
  epi_ip->epi2 = 0.01;
  epi_ip->je_cellmin = 5;

  // ---- Clump_info ----
  clump_ip->fnames_flattened = nullptr;
  clump_ip->annotate_flattened = nullptr;
  clump_ip->snpfield_search_order = nullptr;
  clump_ip->pfield_search_order = nullptr;
  clump_ip->range_fname = nullptr;
  clump_ip->modifier = 0;
  clump_ip->fname_ct = 0;
  clump_ip->bp_radius = 249999;
  clump_ip->range_border = 0;
  clump_ip->p1 = 1e-4;
  clump_ip->p2 = 1e-2;
  clump_ip->r2 = 0.5;
}
77 
// Releases everything the three option structs may own: each pointer member
// is handed to free_cond() and the SNP range list to free_range_list().
// The struct fields themselves are not reset afterwards.
void ld_epi_cleanup(Ld_info* ldip, Epi_info* epi_ip, Clump_info* clump_ip) {
  // Clump_info buffers.
  free_cond(clump_ip->range_fname);
  free_cond(clump_ip->pfield_search_order);
  free_cond(clump_ip->snpfield_search_order);
  free_cond(clump_ip->annotate_flattened);
  free_cond(clump_ip->fnames_flattened);

  // Epi_info buffers.
  free_cond(epi_ip->summary_merge_prefix);
  free_cond(epi_ip->twolocus_mkr2);
  free_cond(epi_ip->twolocus_mkr1);
  free_cond(epi_ip->ld_mkr2);
  free_cond(epi_ip->ld_mkr1);

  // Ld_info buffers and range list.
  free_range_list(&(ldip->snps_rl));
  free_cond(ldip->show_tags_fname);
  free_cond(ldip->snpstr);
}
93 
94 #ifdef __LP64__
// SSE2 kernel behind ld_dot_prod(): consumes 'iters' groups of three 128-bit
// vectors from each of vec1/vec2/mask1/mask2 and folds five biased partial
// sums into return_vals[0..4].  The encoding and the exact quantities
// accumulated are described in the comment at the top of the body.
static inline void ld_dot_prod_batch(__m128i* vec1, __m128i* vec2, __m128i* mask1, __m128i* mask2, int32_t* return_vals, uint32_t iters) {
  // Main routine for computation of \sum_i^M (x_i - \mu_x)(y_i - \mu_y), where
  // x_i, y_i \in \{-1, 0, 1\}, but there are missing values.
  //
  //
  // We decompose this sum into
  //   \sum_i x_iy_i - \mu_y\sum_i x_i - \mu_x\sum_i y_i +
  //   (M - # missing)\mu_x\mu_y.
  // *Without* missing values, this can be handled very cleanly.  The last
  // three terms can all be precomputed, and \sum_i x_iy_i can be handled in a
  // manner very similar to bitwise Hamming distance.  This is several times as
  // fast as the lookup tables used for relationship matrices.
  //
  // Unfortunately, when missing values are present,
  // \mu_y\sum_{i: nonmissing from y} x_i and
  // \mu_x\sum_{i: nonmissing from x} y_i must also be evaluated (and, in
  // practice, \mu_y\sum_{i: nonmissing from y} x_i^2 and
  // \mu_x\sum_{i: nonmissing from x} y_i^2 should be determined here as well);
  // this removes much of the speed advantage, and the best applications of the
  // underlying ternary dot product algorithm used here lie elsewhere.
  // Nevertheless, it is still faster, so we use it.
  // (possible todo: accelerated function when there really are no missing
  // values, similar to what is now done for --fast-epistasis)
  //
  //
  // Input:
  // * vec1 and vec2 are encoded -1 -> 00, 0/missing -> 01, 1 -> 10.
  // * mask1 and mask2 mask out missing values (i.e. 00 for missing, 11 for
  //   nonmissing).
  // * return_vals provides space for return values (five int32_t slots, which
  //   are updated in place rather than overwritten).
  // * iters is the number of 48-byte windows to process, anywhere from 1 to 10
  //   inclusive.  It must be nonzero: the loop below is a do-while.
  //
  // This function performs the update
  //   return_vals[0] += (-N) + \sum_i x_iy_i
  //   return_vals[1] += N_y + \sum_{i: nonmissing from y} x_i
  //   return_vals[2] += N_x + \sum_{i: nonmissing from x} y_i
  //   return_vals[3] += N_y - \sum_{i: nonmissing from y} x_i^2
  //   return_vals[4] += N_x - \sum_{i: nonmissing from x} y_i^2
  // where N is the number of samples processed after applying the missingness
  // masks indicated by the subscripts.
  //
  // Computation of terms [1]-[4] is based on the identity
  //   N_y + \sum_{i: nonmissing from y} x_i = popcount2(vec1 & mask2)
  // where "popcount2" refers to starting with two-bit integers instead of
  // one-bit integers in our summing process (this allows us to skip a few
  // operations).  (Once we can assume the presence of hardware popcount, a
  // slightly different implementation may be better.)
  //
  // The trickier [0] computation currently proceeds as follows:
  //
  // 1. zcheck := (vec1 | vec2) & 0x5555...
  // Detects whether at least one member of the pair has a 0/missing value.
  //
  // 2. popcount2(((vec1 ^ vec2) & (0xaaaa... - zcheck)) | zcheck)
  // Subtracting this *from* a bias will give us our desired \sum_i x_iy_i dot
  // product.
  //
  // MULTIPLEX_LD sets of values are usually handled per function call.  If
  // fewer values are present, the ends of all input vectors should be zeroed
  // out.

  // m1/m2/m4 select the low 1, 2, and 4 bits of every 2-, 4-, and 8-bit field
  // respectively.
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  __m128i loader1;
  __m128i loader2;
  __m128i sum1;
  __m128i sum2;
  __m128i sum11;
  __m128i sum22;
  __m128i sum12;
  __m128i tmp_sum1;
  __m128i tmp_sum2;
  __m128i tmp_sum12;
  // Per-byte running totals for terms [0], [1], [2], [3], [4] respectively.
  __univec acc;
  __univec acc1;
  __univec acc2;
  __univec acc11;
  __univec acc22;
  acc.vi = _mm_setzero_si128();
  acc1.vi = _mm_setzero_si128();
  acc2.vi = _mm_setzero_si128();
  acc11.vi = _mm_setzero_si128();
  acc22.vi = _mm_setzero_si128();
  do {
    // ---- first of three vectors in this 48-byte window ----
    loader1 = *vec1++;
    loader2 = *vec2++;
    // Deliberate crossover: sum1 starts as mask2 and sum2 as mask1, so that
    // after the ANDs below sum1 == vec1 & mask2 and sum2 == vec2 & mask1.
    sum1 = *mask2++;
    sum2 = *mask1++;
    // zcheck (step 1 of the header comment)
    sum12 = _mm_and_si128(_mm_or_si128(loader1, loader2), m1);
    sum1 = _mm_and_si128(sum1, loader1);
    sum2 = _mm_and_si128(sum2, loader2);
    // Low bit of each masked entry: set exactly where the masked code is 01.
    sum11 = _mm_and_si128(sum1, m1);
    sum22 = _mm_and_si128(sum2, m1);
    // use andnot to eliminate need for 0xaaaa... to occupy an xmm register
    // (within each 2-bit field, ~(01 + zcheck) equals 10 - zcheck)
    loader1 = _mm_andnot_si128(_mm_add_epi64(m1, sum12), _mm_xor_si128(loader1, loader2));
    sum12 = _mm_or_si128(sum12, loader1);

    // sum1, sum2, and sum12 now store the (biased) two-bit sums of
    // interest; merge to 4 bits to prevent overflow.  this merge can be
    // postponed for sum11 and sum22 because the individual terms are 0/1
    // instead of 0/1/2.
    sum1 = _mm_add_epi64(_mm_and_si128(sum1, m2), _mm_and_si128(_mm_srli_epi64(sum1, 2), m2));
    sum2 = _mm_add_epi64(_mm_and_si128(sum2, m2), _mm_and_si128(_mm_srli_epi64(sum2, 2), m2));
    sum12 = _mm_add_epi64(_mm_and_si128(sum12, m2), _mm_and_si128(_mm_srli_epi64(sum12, 2), m2));

    // ---- second vector: same steps into tmp_*, then added to the running
    // sums (sum11/sum22 still in 2-bit fields, the rest in 4-bit fields) ----
    loader1 = *vec1++;
    loader2 = *vec2++;
    tmp_sum1 = *mask2++;
    tmp_sum2 = *mask1++;
    tmp_sum12 = _mm_and_si128(_mm_or_si128(loader1, loader2), m1);
    tmp_sum1 = _mm_and_si128(tmp_sum1, loader1);
    tmp_sum2 = _mm_and_si128(tmp_sum2, loader2);
    sum11 = _mm_add_epi64(sum11, _mm_and_si128(tmp_sum1, m1));
    sum22 = _mm_add_epi64(sum22, _mm_and_si128(tmp_sum2, m1));
    loader1 = _mm_andnot_si128(_mm_add_epi64(m1, tmp_sum12), _mm_xor_si128(loader1, loader2));
    tmp_sum12 = _mm_or_si128(loader1, tmp_sum12);

    sum1 = _mm_add_epi64(sum1, _mm_add_epi64(_mm_and_si128(tmp_sum1, m2), _mm_and_si128(_mm_srli_epi64(tmp_sum1, 2), m2)));
    sum2 = _mm_add_epi64(sum2, _mm_add_epi64(_mm_and_si128(tmp_sum2, m2), _mm_and_si128(_mm_srli_epi64(tmp_sum2, 2), m2)));
    sum12 = _mm_add_epi64(sum12, _mm_add_epi64(_mm_and_si128(tmp_sum12, m2), _mm_and_si128(_mm_srli_epi64(tmp_sum12, 2), m2)));

    // ---- third vector ----
    loader1 = *vec1++;
    loader2 = *vec2++;
    tmp_sum1 = *mask2++;
    tmp_sum2 = *mask1++;
    tmp_sum12 = _mm_and_si128(_mm_or_si128(loader1, loader2), m1);
    tmp_sum1 = _mm_and_si128(tmp_sum1, loader1);
    tmp_sum2 = _mm_and_si128(tmp_sum2, loader2);
    sum11 = _mm_add_epi64(sum11, _mm_and_si128(tmp_sum1, m1));
    sum22 = _mm_add_epi64(sum22, _mm_and_si128(tmp_sum2, m1));
    loader1 = _mm_andnot_si128(_mm_add_epi64(m1, tmp_sum12), _mm_xor_si128(loader1, loader2));
    tmp_sum12 = _mm_or_si128(loader1, tmp_sum12);

    sum1 = _mm_add_epi64(sum1, _mm_add_epi64(_mm_and_si128(tmp_sum1, m2), _mm_and_si128(_mm_srli_epi64(tmp_sum1, 2), m2)));
    sum2 = _mm_add_epi64(sum2, _mm_add_epi64(_mm_and_si128(tmp_sum2, m2), _mm_and_si128(_mm_srli_epi64(tmp_sum2, 2), m2)));
    // sum11/sum22 have now absorbed three 0/1 terms per 2-bit field (max 3),
    // so this is the last point at which the 2-bit -> 4-bit merge is safe.
    sum11 = _mm_add_epi64(_mm_and_si128(sum11, m2), _mm_and_si128(_mm_srli_epi64(sum11, 2), m2));
    sum22 = _mm_add_epi64(_mm_and_si128(sum22, m2), _mm_and_si128(_mm_srli_epi64(sum22, 2), m2));
    sum12 = _mm_add_epi64(sum12, _mm_add_epi64(_mm_and_si128(tmp_sum12, m2), _mm_and_si128(_mm_srli_epi64(tmp_sum12, 2), m2)));

    // Widen the 4-bit fields to bytes and add them to the accumulators.
    // With only the 00/01/10 codes present, each byte grows by at most 24
    // per iteration for acc/acc1/acc2 and by at most 12 for acc11/acc22.
    acc1.vi = _mm_add_epi64(acc1.vi, _mm_add_epi64(_mm_and_si128(sum1, m4), _mm_and_si128(_mm_srli_epi64(sum1, 4), m4)));
    acc2.vi = _mm_add_epi64(acc2.vi, _mm_add_epi64(_mm_and_si128(sum2, m4), _mm_and_si128(_mm_srli_epi64(sum2, 4), m4)));
    acc11.vi = _mm_add_epi64(acc11.vi, _mm_add_epi64(_mm_and_si128(sum11, m4), _mm_and_si128(_mm_srli_epi64(sum11, 4), m4)));
    acc22.vi = _mm_add_epi64(acc22.vi, _mm_add_epi64(_mm_and_si128(sum22, m4), _mm_and_si128(_mm_srli_epi64(sum22, 4), m4)));
    acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(sum12, m4), _mm_and_si128(_mm_srli_epi64(sum12, 4), m4)));
  } while (--iters);
  // moved down because we've almost certainly run out of xmm registers
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  // Fold adjacent bytes into 16-bit lanes.  With more than 5 iterations
  // (MULTIPLEX_LD > 960) a pair of acc/acc1/acc2 bytes can sum past 255, so
  // each byte must be masked out *before* the add; with at most 5 iterations
  // the cheaper add-then-mask form cannot overflow a byte.
#if MULTIPLEX_LD > 960
  acc1.vi = _mm_add_epi64(_mm_and_si128(acc1.vi, m8), _mm_and_si128(_mm_srli_epi64(acc1.vi, 8), m8));
  acc2.vi = _mm_add_epi64(_mm_and_si128(acc2.vi, m8), _mm_and_si128(_mm_srli_epi64(acc2.vi, 8), m8));
  acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
#else
  acc1.vi = _mm_and_si128(_mm_add_epi64(acc1.vi, _mm_srli_epi64(acc1.vi, 8)), m8);
  acc2.vi = _mm_and_si128(_mm_add_epi64(acc2.vi, _mm_srli_epi64(acc2.vi, 8)), m8);
  acc.vi = _mm_and_si128(_mm_add_epi64(acc.vi, _mm_srli_epi64(acc.vi, 8)), m8);
#endif
  // acc11/acc22 bytes grow half as fast (0/1 terms), so add-then-mask stays
  // within a byte for up to 10 iterations.
  acc11.vi = _mm_and_si128(_mm_add_epi64(acc11.vi, _mm_srli_epi64(acc11.vi, 8)), m8);
  acc22.vi = _mm_and_si128(_mm_add_epi64(acc22.vi, _mm_srli_epi64(acc22.vi, 8)), m8);

  // Horizontal sum: add the two u8[] words of each accumulator (presumably
  // the 64-bit halves of the vector; __univec is declared elsewhere), then
  // multiply by 0x0001000100010001 so the top 16 bits hold the sum of the
  // four 16-bit lanes.  Term [0] is subtracted, the others are added.
  return_vals[0] -= ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  return_vals[1] += ((acc1.u8[0] + acc1.u8[1]) * 0x1000100010001LLU) >> 48;
  return_vals[2] += ((acc2.u8[0] + acc2.u8[1]) * 0x1000100010001LLU) >> 48;
  return_vals[3] += ((acc11.u8[0] + acc11.u8[1]) * 0x1000100010001LLU) >> 48;
  return_vals[4] += ((acc22.u8[0] + acc22.u8[1]) * 0x1000100010001LLU) >> 48;
}
264 
ld_dot_prod(uintptr_t * vec1,uintptr_t * vec2,uintptr_t * mask1,uintptr_t * mask2,int32_t * return_vals,uint32_t batch_ct_m1,uint32_t last_batch_size)265 void ld_dot_prod(uintptr_t* vec1, uintptr_t* vec2, uintptr_t* mask1, uintptr_t* mask2, int32_t* return_vals, uint32_t batch_ct_m1, uint32_t last_batch_size) {
266   while (batch_ct_m1--) {
267     ld_dot_prod_batch((__m128i*)vec1, (__m128i*)vec2, (__m128i*)mask1, (__m128i*)mask2, return_vals, MULTIPLEX_LD / 192);
268     vec1 = &(vec1[MULTIPLEX_LD / BITCT2]);
269     vec2 = &(vec2[MULTIPLEX_LD / BITCT2]);
270     mask1 = &(mask1[MULTIPLEX_LD / BITCT2]);
271     mask2 = &(mask2[MULTIPLEX_LD / BITCT2]);
272   }
273   ld_dot_prod_batch((__m128i*)vec1, (__m128i*)vec2, (__m128i*)mask1, (__m128i*)mask2, return_vals, last_batch_size);
274 }
275 
ld_dot_prod_nm_batch(__m128i * vec1,__m128i * vec2,uint32_t iters)276 static inline int32_t ld_dot_prod_nm_batch(__m128i* vec1, __m128i* vec2, uint32_t iters) {
277   // faster ld_dot_prod_batch() for no-missing-calls case.
278   const __m128i m1 = {FIVEMASK, FIVEMASK};
279   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
280   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
281   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
282   __m128i loader1;
283   __m128i loader2;
284   __m128i sum12;
285   __m128i tmp_sum12;
286   __univec acc;
287   acc.vi = _mm_setzero_si128();
288   do {
289     loader1 = *vec1++;
290     loader2 = *vec2++;
291     sum12 = _mm_and_si128(_mm_or_si128(loader1, loader2), m1);
292     loader1 = _mm_andnot_si128(_mm_add_epi64(m1, sum12), _mm_xor_si128(loader1, loader2));
293     sum12 = _mm_or_si128(sum12, loader1);
294     sum12 = _mm_add_epi64(_mm_and_si128(sum12, m2), _mm_and_si128(_mm_srli_epi64(sum12, 2), m2));
295 
296     loader1 = *vec1++;
297     loader2 = *vec2++;
298     tmp_sum12 = _mm_and_si128(_mm_or_si128(loader1, loader2), m1);
299     loader1 = _mm_andnot_si128(_mm_add_epi64(m1, tmp_sum12), _mm_xor_si128(loader1, loader2));
300     tmp_sum12 = _mm_or_si128(loader1, tmp_sum12);
301     sum12 = _mm_add_epi64(sum12, _mm_add_epi64(_mm_and_si128(tmp_sum12, m2), _mm_and_si128(_mm_srli_epi64(tmp_sum12, 2), m2)));
302 
303     loader1 = *vec1++;
304     loader2 = *vec2++;
305     tmp_sum12 = _mm_and_si128(_mm_or_si128(loader1, loader2), m1);
306     loader1 = _mm_andnot_si128(_mm_add_epi64(m1, tmp_sum12), _mm_xor_si128(loader1, loader2));
307     tmp_sum12 = _mm_or_si128(loader1, tmp_sum12);
308     sum12 = _mm_add_epi64(sum12, _mm_add_epi64(_mm_and_si128(tmp_sum12, m2), _mm_and_si128(_mm_srli_epi64(tmp_sum12, 2), m2)));
309 
310     acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(sum12, m4), _mm_and_si128(_mm_srli_epi64(sum12, 4), m4)));
311   } while (--iters);
312 #if MULTIPLEX_LD > 960
313   acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
314 #else
315   acc.vi = _mm_and_si128(_mm_add_epi64(acc.vi, _mm_srli_epi64(acc.vi, 8)), m8);
316 #endif
317   return (uint32_t)(((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48);
318 }
319 
ld_dot_prod_nm(uintptr_t * vec1,uintptr_t * vec2,uint32_t founder_ct,uint32_t batch_ct_m1,uint32_t last_batch_size)320 int32_t ld_dot_prod_nm(uintptr_t* vec1, uintptr_t* vec2, uint32_t founder_ct, uint32_t batch_ct_m1, uint32_t last_batch_size) {
321   // accelerated implementation for no-missing-loci case
322   int32_t result = (int32_t)founder_ct;
323   while (batch_ct_m1--) {
324     result -= ld_dot_prod_nm_batch((__m128i*)vec1, (__m128i*)vec2, MULTIPLEX_LD / 192);
325     vec1 = &(vec1[MULTIPLEX_LD / BITCT2]);
326     vec2 = &(vec2[MULTIPLEX_LD / BITCT2]);
327   }
328   result -= ld_dot_prod_nm_batch((__m128i*)vec1, (__m128i*)vec2, last_batch_size);
329   return result;
330 }
331 #else
// Scalar (non-__LP64__) kernel behind ld_dot_prod(): consumes 'iters' groups
// of three machine words from each of vec1/vec2/mask1/mask2 and folds five
// biased partial sums into return_vals[0..4].  The 0x33333333 / 0x0f0f0f0f /
// 0x01010101 constants assume 32-bit words.
static inline void ld_dot_prod_batch(uintptr_t* vec1, uintptr_t* vec2, uintptr_t* mask1, uintptr_t* mask2, int32_t* return_vals, uint32_t iters) {
  // Batch totals for terms [1], [2], [3], [4], [0] respectively.
  uint32_t final_sum1 = 0;
  uint32_t final_sum2 = 0;
  uint32_t final_sum11 = 0;
  uint32_t final_sum22 = 0;
  uint32_t final_sum12 = 0;
  uintptr_t loader1;
  uintptr_t loader2;
  uintptr_t sum1;
  uintptr_t sum2;
  uintptr_t sum11;
  uintptr_t sum22;
  uintptr_t sum12;
  uintptr_t tmp_sum1;
  uintptr_t tmp_sum2;
  uintptr_t tmp_sum12;
  do {
    // (The important part of the header comment on the 64-bit version is
    // copied below.)
    //
    // Input:
    // * vec1 and vec2 are encoded -1 -> 00, 0/missing -> 01, 1 -> 10.
    // * mask1 and mask2 mask out missing values (i.e. 00 for missing, 11 for
    //   nonmissing).
    // * return_vals provides space for return values (five int32_t slots,
    //   updated in place).
    // * iters is the number of 12-byte windows to process, anywhere from 1 to
    //   40 inclusive.  (No, this is not the interface you'd use for a
    //   general-purpose library.)  [32- and 64-bit differ here.]
    //   It must be nonzero, since this is a do-while loop.
    //
    // This function performs the update
    //   return_vals[0] += (-N) + \sum_i x_iy_i
    //   return_vals[1] += N_y + \sum_{i: nonmissing from y} x_i
    //   return_vals[2] += N_x + \sum_{i: nonmissing from x} y_i
    //   return_vals[3] += N_y - \sum_{i: nonmissing from y} x_i^2
    //   return_vals[4] += N_x - \sum_{i: nonmissing from x} y_i^2
    // where N is the number of samples processed after applying the
    // missingness masks indicated by the subscripts.
    //
    // Computation of terms [1]-[4] is based on the identity
    //   N_y + \sum_{i: nonmissing from y} x_i = popcount2(vec1 & mask2)
    // where "popcount2" refers to starting with two-bit integers instead of
    // one-bit integers in our summing process (this allows us to skip a few
    // operations).  (Once we can assume the presence of hardware popcount, a
    // slightly different implementation may be better.)
    //
    // The trickier [0] computation currently proceeds as follows:
    //
    // 1. zcheck := (vec1 | vec2) & 0x5555...
    // Detects whether at least one member of the pair has a 0/missing value.
    //
    // 2. popcount2(((vec1 ^ vec2) & (0xaaaa... - zcheck)) | zcheck)
    // Subtracting this *from* a bias will give us our desired \sum_i x_iy_i
    // dot product.


    // ---- first of three words in this window ----
    loader1 = *vec1++;
    loader2 = *vec2++;
    // Deliberate crossover: sum1 starts as mask2 and sum2 as mask1, so that
    // sum1 becomes vec1 & mask2 and sum2 becomes vec2 & mask1 below.
    sum1 = *mask2++;
    sum2 = *mask1++;
    // zcheck
    sum12 = (loader1 | loader2) & FIVEMASK;

    sum1 = sum1 & loader1;
    sum2 = sum2 & loader2;
    loader1 = (loader1 ^ loader2) & (AAAAMASK - sum12);
    sum12 = sum12 | loader1;
    // Low bit of each masked entry: set exactly where the masked code is 01.
    sum11 = sum1 & FIVEMASK;
    sum22 = sum2 & FIVEMASK;

    // Merge 2-bit fields into 4-bit fields (sum11/sum22 hold 0/1 terms, so
    // their merge is postponed until all three words have been added).
    sum1 = (sum1 & 0x33333333) + ((sum1 >> 2) & 0x33333333);
    sum2 = (sum2 & 0x33333333) + ((sum2 >> 2) & 0x33333333);
    sum12 = (sum12 & 0x33333333) + ((sum12 >> 2) & 0x33333333);

    // ---- second word: same steps into tmp_*, added to the running sums ----
    loader1 = *vec1++;
    loader2 = *vec2++;
    tmp_sum1 = *mask2++;
    tmp_sum2 = *mask1++;
    tmp_sum12 = (loader1 | loader2) & FIVEMASK;
    tmp_sum1 = tmp_sum1 & loader1;
    tmp_sum2 = tmp_sum2 & loader2;

    loader1 = (loader1 ^ loader2) & (AAAAMASK - tmp_sum12);
    tmp_sum12 = tmp_sum12 | loader1;
    sum11 += tmp_sum1 & FIVEMASK;
    sum22 += tmp_sum2 & FIVEMASK;

    sum1 += (tmp_sum1 & 0x33333333) + ((tmp_sum1 >> 2) & 0x33333333);
    sum2 += (tmp_sum2 & 0x33333333) + ((tmp_sum2 >> 2) & 0x33333333);
    sum12 += (tmp_sum12 & 0x33333333) + ((tmp_sum12 >> 2) & 0x33333333);

    // ---- third word ----
    loader1 = *vec1++;
    loader2 = *vec2++;
    tmp_sum1 = *mask2++;
    tmp_sum2 = *mask1++;
    tmp_sum12 = (loader1 | loader2) & FIVEMASK;

    tmp_sum1 = tmp_sum1 & loader1;
    tmp_sum2 = tmp_sum2 & loader2;
    loader1 = (loader1 ^ loader2) & (AAAAMASK - tmp_sum12);
    tmp_sum12 = tmp_sum12 | loader1;
    sum11 += tmp_sum1 & FIVEMASK;
    sum22 += tmp_sum2 & FIVEMASK;

    sum1 += (tmp_sum1 & 0x33333333) + ((tmp_sum1 >> 2) & 0x33333333);
    sum2 += (tmp_sum2 & 0x33333333) + ((tmp_sum2 >> 2) & 0x33333333);
    // sum11/sum22 now hold up to 3 per 2-bit field; merge them to 4 bits.
    sum11 = (sum11 & 0x33333333) + ((sum11 >> 2) & 0x33333333);
    sum22 = (sum22 & 0x33333333) + ((sum22 >> 2) & 0x33333333);
    sum12 += (tmp_sum12 & 0x33333333) + ((tmp_sum12 >> 2) & 0x33333333);

    // Widen the 4-bit fields to bytes.
    sum1 = (sum1 & 0x0f0f0f0f) + ((sum1 >> 4) & 0x0f0f0f0f);
    sum2 = (sum2 & 0x0f0f0f0f) + ((sum2 >> 4) & 0x0f0f0f0f);
    sum11 = (sum11 & 0x0f0f0f0f) + ((sum11 >> 4) & 0x0f0f0f0f);
    sum22 = (sum22 & 0x0f0f0f0f) + ((sum22 >> 4) & 0x0f0f0f0f);
    sum12 = (sum12 & 0x0f0f0f0f) + ((sum12 >> 4) & 0x0f0f0f0f);

    // Multiplying by 0x01010101 leaves the sum of the four bytes in the top
    // byte of a 32-bit word; >> 24 extracts it.
    // technically could do the multiply-and-shift only once every two rounds
    final_sum1 += (sum1 * 0x01010101) >> 24;
    final_sum2 += (sum2 * 0x01010101) >> 24;
    final_sum11 += (sum11 * 0x01010101) >> 24;
    final_sum22 += (sum22 * 0x01010101) >> 24;
    final_sum12 += (sum12 * 0x01010101) >> 24;
  } while (--iters);
  // Term [0] is subtracted from its running value; the others are added.
  return_vals[0] -= final_sum12;
  return_vals[1] += final_sum1;
  return_vals[2] += final_sum2;
  return_vals[3] += final_sum11;
  return_vals[4] += final_sum22;
}
459 
ld_dot_prod(uintptr_t * vec1,uintptr_t * vec2,uintptr_t * mask1,uintptr_t * mask2,int32_t * return_vals,uint32_t batch_ct_m1,uint32_t last_batch_size)460 void ld_dot_prod(uintptr_t* vec1, uintptr_t* vec2, uintptr_t* mask1, uintptr_t* mask2, int32_t* return_vals, uint32_t batch_ct_m1, uint32_t last_batch_size) {
461   while (batch_ct_m1--) {
462     ld_dot_prod_batch(vec1, vec2, mask1, mask2, return_vals, MULTIPLEX_LD / 48);
463     vec1 = &(vec1[MULTIPLEX_LD / BITCT2]);
464     vec2 = &(vec2[MULTIPLEX_LD / BITCT2]);
465     mask1 = &(mask1[MULTIPLEX_LD / BITCT2]);
466     mask2 = &(mask2[MULTIPLEX_LD / BITCT2]);
467   }
468   ld_dot_prod_batch(vec1, vec2, mask1, mask2, return_vals, last_batch_size);
469 }
470 
ld_dot_prod_nm_batch(uintptr_t * vec1,uintptr_t * vec2,uint32_t iters)471 static inline int32_t ld_dot_prod_nm_batch(uintptr_t* vec1, uintptr_t* vec2, uint32_t iters) {
472   uint32_t final_sum12 = 0;
473   uintptr_t loader1;
474   uintptr_t loader2;
475   uintptr_t sum12;
476   uintptr_t tmp_sum12;
477   do {
478     loader1 = *vec1++;
479     loader2 = *vec2++;
480     sum12 = (loader1 | loader2) & FIVEMASK;
481     loader1 = (loader1 ^ loader2) & (AAAAMASK - sum12);
482     sum12 = sum12 | loader1;
483     sum12 = (sum12 & 0x33333333) + ((sum12 >> 2) & 0x33333333);
484 
485     loader1 = *vec1++;
486     loader2 = *vec2++;
487     tmp_sum12 = (loader1 | loader2) & FIVEMASK;
488     loader1 = (loader1 ^ loader2) & (AAAAMASK - tmp_sum12);
489     tmp_sum12 = tmp_sum12 | loader1;
490     sum12 += (tmp_sum12 & 0x33333333) + ((tmp_sum12 >> 2) & 0x33333333);
491 
492     loader1 = *vec1++;
493     loader2 = *vec2++;
494     tmp_sum12 = (loader1 | loader2) & FIVEMASK;
495     loader1 = (loader1 ^ loader2) & (AAAAMASK - tmp_sum12);
496     tmp_sum12 = tmp_sum12 | loader1;
497     sum12 += (tmp_sum12 & 0x33333333) + ((tmp_sum12 >> 2) & 0x33333333);
498     sum12 = (sum12 & 0x0f0f0f0f) + ((sum12 >> 4) & 0x0f0f0f0f);
499 
500     final_sum12 += (sum12 * 0x01010101) >> 24;
501   } while (--iters);
502   return final_sum12;
503 }
504 
ld_dot_prod_nm(uintptr_t * vec1,uintptr_t * vec2,uint32_t founder_ct,uint32_t batch_ct_m1,uint32_t last_batch_size)505 int32_t ld_dot_prod_nm(uintptr_t* vec1, uintptr_t* vec2, uint32_t founder_ct, uint32_t batch_ct_m1, uint32_t last_batch_size) {
506   int32_t result = (int32_t)founder_ct;
507   while (batch_ct_m1--) {
508     result -= ld_dot_prod_nm_batch(vec1, vec2, MULTIPLEX_LD / 48);
509     vec1 = &(vec1[MULTIPLEX_LD / BITCT2]);
510     vec2 = &(vec2[MULTIPLEX_LD / BITCT2]);
511   }
512   result -= ld_dot_prod_nm_batch(vec1, vec2, last_batch_size);
513   return result;
514 }
515 #endif // __LP64__
516 
ld_process_load(uintptr_t * geno_buf,uintptr_t * mask_buf,uintptr_t * missing_buf,uint32_t * missing_ct_ptr,double * sum_ptr,double * variance_recip_ptr,uint32_t founder_ct,uint32_t is_x,uint32_t weighted_x,uint32_t nonmale_founder_ct,uintptr_t * founder_male_include2,uintptr_t * nonmale_geno,uintptr_t * nonmale_masks,uintptr_t nonmale_offset)517 uint32_t ld_process_load(uintptr_t* geno_buf, uintptr_t* mask_buf, uintptr_t* missing_buf, uint32_t* missing_ct_ptr, double* sum_ptr, double* variance_recip_ptr, uint32_t founder_ct, uint32_t is_x, uint32_t weighted_x, uint32_t nonmale_founder_ct, uintptr_t* founder_male_include2, uintptr_t* nonmale_geno, uintptr_t* nonmale_masks, uintptr_t nonmale_offset) {
518   uintptr_t* geno_ptr = geno_buf;
519   uintptr_t founder_ctl2 = QUATERCT_TO_WORDCT(founder_ct);
520   uintptr_t* geno_end = &(geno_buf[founder_ctl2]);
521   uintptr_t* mask_buf_ptr = mask_buf;
522   uintptr_t* missing_ptr = missing_buf;
523   uintptr_t new_missing = 0;
524   int64_t llii;
525   uint32_t missing_bit_offset = 0;
526   uint32_t ssq = 0;
527   uint32_t missing_ct = 0;
528   int32_t sum = -founder_ct;
529   uintptr_t* nm_mask_ptr;
530   uintptr_t cur_geno;
531   uintptr_t shifted_masked_geno;
532   uintptr_t new_geno;
533   uintptr_t new_mask;
534   while (1) {
535     // Desired encodings:
536     // new_geno: nonset homozygote -> 00
537     //           het/missing       -> 01
538     //           set homozygote    -> 10
539     // Given PLINK encoding xx, this is (xx - ((xx >> 1) & FIVEMASK)).
540     //
541     // new_mask: missing   -> 00
542     //           otherwise -> 11
543     // ...and this is (((xx >> 1) & FIVEMASK) | ((~xx) & FIVEMASK)) * 3.
544     //
545     // new_missing: missing   -> 1
546     //              otherwise -> 0
547     // This can be assembled via repeated CTZLU on ~new_mask.
548     cur_geno = *geno_ptr;
549     shifted_masked_geno = (cur_geno >> 1) & FIVEMASK;
550     new_geno = cur_geno - shifted_masked_geno;
551     *geno_ptr++ = new_geno;
552     new_mask = (((~cur_geno) & FIVEMASK) | shifted_masked_geno) * 3;
553     *mask_buf_ptr++ = new_mask;
554     new_mask = (~new_mask) & FIVEMASK;
555     while (new_mask) {
556       new_missing |= ONELU << (missing_bit_offset + (CTZLU(new_mask) / 2));
557       missing_ct++;
558       new_mask &= new_mask - 1;
559     }
560     if (geno_ptr == geno_end) {
561       break;
562     }
563     if (missing_bit_offset) {
564       missing_bit_offset = 0;
565       *missing_ptr++ = new_missing;
566       new_missing = 0;
567     } else {
568       missing_bit_offset = BITCT2;
569     }
570   }
571   *missing_ptr = new_missing;
572   if (is_x && (!weighted_x)) {
573     // special case #1: recode male clear homozygotes to 01 on X chromosome,
574     // for backwards compatibility
575     //
576     // this is a bit ugly (e.g. results are actually affected by which allele
577     // is A1), so may want to switch the default to mode 3
578     geno_ptr = geno_buf;
579     do {
580       new_geno = *geno_ptr;
581       *geno_ptr++ = new_geno + ((~(new_geno | (new_geno >> 1))) & (*founder_male_include2++));
582     } while (geno_ptr < geno_end);
583   }
584   geno_ptr = geno_buf;
585   while (1) {
586     new_geno = *geno_ptr++;
587     sum += popcount2_long(new_geno);
588     new_geno = (new_geno ^ FIVEMASK) & FIVEMASK;
589     if (geno_ptr == geno_end) {
590       break;
591     }
592     ssq += popcount2_long(new_geno);
593   }
594   // have to be careful with trailing zeroes here
595   ssq += popcount2_long(new_geno << (BITCT - 2 * (1 + ((founder_ct - 1) % BITCT2))));
596   if (founder_ct % BITCT2) {
597     mask_buf[founder_ct / BITCT2] &= (ONELU << (2 * (founder_ct % BITCT2))) - ONELU;
598   }
599   if (is_x && weighted_x) {
600     // special case #2: double-count nonmales
601     geno_ptr = geno_buf;
602     sum -= founder_ct;
603     nonmale_geno = &(nonmale_geno[nonmale_offset]);
604     nonmale_masks = &(nonmale_masks[nonmale_offset]);
605     mask_buf_ptr = mask_buf;
606     nm_mask_ptr = nonmale_masks;
607     while (1) {
608       new_mask = ~((*founder_male_include2) * 3);
609       new_geno = ((*geno_ptr++) & new_mask) | (*founder_male_include2++);
610       *nonmale_geno++ = new_geno;
611       *nm_mask_ptr++ = new_mask & (*mask_buf_ptr++);
612       sum += popcount2_long(new_geno);
613       new_geno = (new_geno ^ FIVEMASK) & FIVEMASK;
614       if (geno_ptr == geno_end) {
615 	break;
616       }
617       ssq += popcount2_long(new_geno);
618     }
619     ssq += popcount2_long(new_geno << (BITCT - 2 * (1 + ((founder_ct - 1) % BITCT2))));
620     missing_ct += founder_ct - (popcount_longs(nonmale_masks, founder_ctl2) / 2);
621     founder_ct *= 2;
622   } else if (!missing_ct) {
623     // save sum and (n^2)/variance, for faster processing of pairwise
624     // no-missing-calls case
625     llii = (int64_t)((uint64_t)ssq) * founder_ct - ((int64_t)sum) * sum;
626     if (!llii) {
627       return 0;
628     }
629     *missing_ct_ptr = 0;
630     *sum_ptr = (double)sum;
631     *variance_recip_ptr = 1.0 / ((double)llii);
632     return 1;
633   }
634   *missing_ct_ptr = missing_ct;
635   return (((int64_t)((uint64_t)ssq)) * (founder_ct - missing_ct) - ((int64_t)sum) * sum)? 1 : 0;
636 }
637 
// Starting at cur_uidx, returns the index of the first non-excluded variant
// whose chromosome code is nonzero and below chrom_code_end.  Chromosomes
// that fail that test are skipped wholesale by jumping to their end index.
// If no such variant exists, the value returned is whatever next_unset()
// yielded last, which is not below unfiltered_marker_ct.
uint32_t ld_prune_next_valid_chrom_start(uintptr_t* marker_exclude, uint32_t cur_uidx, Chrom_info* chrom_info_ptr, uint32_t chrom_code_end, uint32_t unfiltered_marker_ct) {
  uint32_t chrom_code;
  for (cur_uidx = next_unset(marker_exclude, cur_uidx, unfiltered_marker_ct);
       cur_uidx < unfiltered_marker_ct;
       cur_uidx = next_unset(marker_exclude, get_chrom_end_vidx(chrom_info_ptr, chrom_code), unfiltered_marker_ct)) {
    chrom_code = get_variant_chrom(chrom_info_ptr, cur_uidx);
    // --aec 0 support: code 0 is never a valid pruning chromosome
    if ((chrom_code != 0) && (chrom_code < chrom_code_end)) {
      break;
    }
  }
  return cur_uidx;
}
651 
// Sets up the first pruning window on the chromosome containing
// window_unfiltered_start.
//
// The window holds at most ld_window_size variants, or, when ld_window_kb is
// nonzero, the start variant plus every following non-excluded variant on the
// chromosome whose position is within 1000 * ld_window_size of it.
//
// Writes:
// * live_indices[0..n-1]: unfiltered indices of the n variants in the window.
// * start_arr[k-1] for k = 1..n: the unfiltered index following slot k-1
//   (the last one is the window end).
// * *cur_window_size_ptr = n, *window_unfiltered_end_ptr = window end,
//   *cur_chrom_ptr / *chrom_end_ptr = chromosome code and its end index,
//   *is_haploid_ptr / *is_x_ptr / *is_y_ptr = chromosome-type flags.
void ld_prune_start_chrom(uint32_t ld_window_kb, uint32_t* cur_chrom_ptr, uint32_t* chrom_end_ptr, uint32_t window_unfiltered_start, uint32_t* live_indices, uint32_t* start_arr, uint32_t* window_unfiltered_end_ptr, uint32_t ld_window_size, uint32_t* cur_window_size_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, uint32_t* marker_pos, uint32_t* is_haploid_ptr, uint32_t* is_x_ptr, uint32_t* is_y_ptr) {
  const uint32_t chrom_code = get_variant_chrom(chrom_info_ptr, window_unfiltered_start);
  const uint32_t chrom_end_uidx = get_chrom_end_vidx(chrom_info_ptr, chrom_code);
  uint32_t window_end_uidx = window_unfiltered_start + 1;
  uint32_t target_size;
  uint32_t slot;

  live_indices[0] = window_unfiltered_start;
  next_unset_ck(marker_exclude, chrom_end_uidx, &window_end_uidx);

  if (!ld_window_kb) {
    target_size = ld_window_size;
  } else {
    // Kilobase window: count how many variants fall within the span.
    const uint32_t bp_span = 1000 * ld_window_size;
    uint32_t scan_uidx = window_end_uidx;
    target_size = 1;
    while ((scan_uidx < chrom_end_uidx) && (marker_pos[scan_uidx] <= marker_pos[window_unfiltered_start] + bp_span)) {
      target_size++;
      scan_uidx++;
      next_unset_ck(marker_exclude, chrom_end_uidx, &scan_uidx);
    }
  }

  // Fill the remaining slots until the window is full or the chromosome ends.
  slot = 1;
  while ((slot < target_size) && (window_end_uidx != chrom_end_uidx)) {
    start_arr[slot - 1] = window_end_uidx;
    live_indices[slot] = window_end_uidx;
    window_end_uidx++;
    next_unset_ck(marker_exclude, chrom_end_uidx, &window_end_uidx);
    slot++;
  }

  *cur_window_size_ptr = slot;
  start_arr[slot - 1] = window_end_uidx;
  *cur_chrom_ptr = chrom_code;
  *chrom_end_ptr = chrom_end_uidx;
  *window_unfiltered_end_ptr = window_end_uidx;
  *is_haploid_ptr = IS_SET(chrom_info_ptr->haploid_mask, chrom_code);
  *is_x_ptr = (((int32_t)chrom_code) == chrom_info_ptr->xymt_codes[X_OFFSET]);
  *is_y_ptr = (((int32_t)chrom_code) == chrom_info_ptr->xymt_codes[Y_OFFSET]);
}
690 
int32_t ld_prune_write(char* outname, char* outname_end, uintptr_t* marker_exclude, uintptr_t* pruned_arr, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, uint32_t chrom_code_end) {
  // Emits the two pruning result lists, one variant ID per line:
  //   <outname>.prune.in  : variants whose bit is clear in pruned_arr
  //   <outname>.prune.out : variants set in pruned_arr but clear in
  //                         marker_exclude
  // Only chromosome codes in [1, chrom_code_end) that are set in
  // chrom_info_ptr->chrom_mask are visited.  outname_end points at the
  // position in outname where the extension is written; it is reset to an
  // empty extension on success.
  //
  // Returns 0 on success, RET_OPEN_FAIL if either file cannot be opened, or
  // RET_WRITE_FAIL if closing a file reports an error.
  FILE* outfile = nullptr;
  int32_t retval = 0;
  uint32_t chrom_code;
  uint32_t chrom_stop;
  uint32_t uidx;

  fputs("Writing...", stdout);
  fflush(stdout);

  // pass 1: retained variants
  strcpy(outname_end, ".prune.in");
  if (fopen_checked(outname, "w", &outfile)) {
    goto ld_prune_write_ret_OPEN_FAIL;
  }
  for (chrom_code = 1; chrom_code < chrom_code_end; chrom_code++) {
    if (is_set(chrom_info_ptr->chrom_mask, chrom_code)) {
      chrom_stop = get_chrom_end_vidx(chrom_info_ptr, chrom_code);
      uidx = get_chrom_start_vidx(chrom_info_ptr, chrom_code);
      while (uidx < chrom_stop) {
        // pruned_arr initialized to marker_exclude
        if (!IS_SET(pruned_arr, uidx)) {
          fputs(&(marker_ids[uidx * max_marker_id_len]), outfile);
          putc_unlocked('\n', outfile);
        }
        uidx++;
      }
    }
  }
  if (fclose_null(&outfile)) {
    goto ld_prune_write_ret_WRITE_FAIL;
  }

  // pass 2: variants removed by pruning (not ones that were already excluded)
  strcpy(outname_end, ".prune.out");
  if (fopen_checked(outname, "w", &outfile)) {
    goto ld_prune_write_ret_OPEN_FAIL;
  }
  for (chrom_code = 1; chrom_code < chrom_code_end; chrom_code++) {
    if (is_set(chrom_info_ptr->chrom_mask, chrom_code)) {
      chrom_stop = get_chrom_end_vidx(chrom_info_ptr, chrom_code);
      for (uidx = get_chrom_start_vidx(chrom_info_ptr, chrom_code); uidx < chrom_stop; uidx++) {
        if (IS_SET(marker_exclude, uidx) || (!IS_SET(pruned_arr, uidx))) {
          continue;
        }
        fputs(&(marker_ids[uidx * max_marker_id_len]), outfile);
        putc_unlocked('\n', outfile);
      }
    }
  }
  if (fclose_null(&outfile)) {
    goto ld_prune_write_ret_WRITE_FAIL;
  }

  *outname_end = '\0';
  putc_unlocked('\r', stdout);
  LOGPRINTFWW("Marker lists written to %s.prune.in and %s.prune.out .\n", outname, outname);

  while (0) {
  ld_prune_write_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  ld_prune_write_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
  }
  // no-op if the file was already closed above
  fclose_cond(outfile);
  return retval;
}
751 
int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uint32_t* marker_pos, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uintptr_t* sex_male, char* outname, char* outname_end, uint32_t hh_exists) {
  // LD-based variant pruning (--indep / --indep-pairwise).
  //
  // For each chromosome with a nonzero code below chrom_code_end, a window is
  // slid along the non-excluded variants.  The window is either
  // ldip->prune_window_size variants wide, or that many kilobases wide when
  // LD_PRUNE_KB_WINDOW is set in ldip->modifier; it advances by
  // ldip->prune_window_incr variants per step.  Within each window:
  //   * pairwise mode (LD_PRUNE_PAIRWISE): whenever a pair's r^2 exceeds
  //     ldip->prune_last_param (times 1 + SMALL_EPSILON), the variant with
  //     the lower MAF is pruned;
  //   * otherwise: pairs with r > 0.999999 are pruned first, then the
  //     correlation matrix of the survivors is inverted and the variant with
  //     the largest diagonal entry is pruned while that entry exceeds
  //     ldip->prune_last_param (presumably the VIF threshold described in the
  //     PLINK --indep documentation).
  // Variants for which ld_process_load() returns 0 are pruned as well.
  // Results are written by ld_prune_write() to <outname>.prune.in/.prune.out.
  //
  // Returns 0 on success (and also when pruning is skipped because there are
  // fewer than two founders), RET_NOMEM, RET_READ_FAIL, RET_INVALID_FORMAT
  // (fewer than two usable variants), RET_INVALID_CMDLINE (window too large
  // for --indep, or too many founders for --ld-xchr 3), or the error code
  // returned by ld_prune_write().  All bigstack allocations made here are
  // released before returning.
  //
  // Results are slightly different from PLINK 1.07 when missing data is
  // present, but that's due to a minor bug in 1.07 (sample per-marker
  // variances don't exclude the missing markers).

  // for future consideration: chromosome-based multithread/parallel?
  unsigned char* bigstack_mark = g_bigstack_base;
  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
  uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctv2 / 2);
  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
#ifdef __LP64__
  uintptr_t founder_ctv = BITCT_TO_ALIGNED_WORDCT(founder_ct);
#else
  uintptr_t founder_ctv = founder_ctl;
#endif
  uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
  uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
#ifdef __LP64__
  uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 192;
#else
  uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 48;
#endif
  // per-variant stride, in words, of the geno/geno_masks (and nonmale_*)
  // buffers
  uintptr_t founder_ct_192_long = founder_ct_mld_m1 * (MULTIPLEX_LD / BITCT2) + founder_ct_mld_rem * (192 / BITCT2);
  uintptr_t final_mask = get_final_mask(founder_ct);
  uint32_t weighted_founder_ct = founder_ct;
  // number of padding words at the end of each row beyond 2 * founder_ctl
  uint32_t founder_trail_ct = founder_ct_192_long - founder_ctl * 2;
  uint32_t pairwise = (ldip->modifier / LD_PRUNE_PAIRWISE) & 1;
  uint32_t ignore_x = (ldip->modifier / LD_IGNORE_X) & 1;
  uint32_t weighted_x = (ldip->modifier / LD_WEIGHTED_X) & 1;
  uint32_t window_is_kb = (ldip->modifier / LD_PRUNE_KB_WINDOW) & 1;
  uint32_t ld_window_size = ldip->prune_window_size;
  uint32_t ld_window_incr = ldip->prune_window_incr;
  double ld_last_param = ldip->prune_last_param;
  uint32_t nonmale_founder_ct = 0;
  uintptr_t window_max = 1;
  uintptr_t* geno = nullptr;
  uintptr_t* founder_include2 = nullptr;
  uintptr_t* founder_male_include2 = nullptr;
  uintptr_t* nonmale_geno = nullptr;
  uintptr_t* nonmale_masks = nullptr;
  double* cov_matrix = nullptr;
  double* new_cov_matrix = nullptr;
  MATRIX_INVERT_BUF1_TYPE* irow = nullptr;
  double* work = nullptr;
  uint32_t* idx_remap = nullptr;
  uint32_t tot_exclude_ct = 0;
  uint32_t at_least_one_prune = 0;
  uint32_t chrom_code_end = chrom_info_ptr->max_code + 1 + chrom_info_ptr->name_ct;
  int32_t retval = 0;
  uintptr_t* geno_masks;
  uintptr_t* geno_mmasks;
  uintptr_t* pruned_arr;
  uint32_t* live_indices;
  uint32_t* start_arr;
  uint32_t pct;
  uint32_t pct_thresh;
  uint32_t window_unfiltered_start;
  uint32_t window_unfiltered_end;
  uint32_t cur_window_size;
  uint32_t old_window_size;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  int32_t ii;
  uint32_t cur_chrom;
  uint32_t chrom_start;
  uint32_t chrom_end;
  uint32_t is_haploid;
  uint32_t is_x;
  uint32_t is_y;
  uintptr_t* loadbuf;
  double* sums;
  double* variance_recips; // entries are actually n^2 / variance
  uint32_t* missing_cts;
  uint32_t fixed_missing_ct;
  uintptr_t ulii;
  double dxx;
  double dyy;
  double cov12;
  uint32_t fixed_non_missing_ct;
  uint32_t non_missing_ct;
  int32_t dp_result[5];
  double non_missing_ctd;
  uintptr_t* geno_fixed_vec_ptr;
  uintptr_t* geno_var_vec_ptr;
  uintptr_t* mask_fixed_vec_ptr;
  uintptr_t* mask_var_vec_ptr;
  uintptr_t cur_exclude_ct;
  uint32_t prev_end;
  __CLPK_integer window_rem_li;
  uint32_t old_window_rem;
  uint32_t window_rem;
  uint32_t bsearch_min;
  uint32_t bsearch_max;
  uint32_t bsearch_cur;
  double prune_ld_thresh;

  if (founder_ct < 2) {
    // deliberately a warning, not an error: retval stays 0
    LOGERRPRINTF("Warning: Skipping --indep%s since there are less than two founders.\n(--make-founders may come in handy here.)\n", pairwise? "-pairwise" : "");
    goto ld_prune_ret_1;
  }
  if (is_set(chrom_info_ptr->chrom_mask, 0)) {
    // chromosome 0 variants (and, with zero_extra_chroms, all variants on
    // codes above max_code) are ignored; remove them from marker_ct
    ulii = count_chrom_markers(chrom_info_ptr, marker_exclude, 0);
    if (chrom_info_ptr->zero_extra_chroms) {
      for (uii = chrom_info_ptr->max_code + 1; uii < chrom_code_end; uii++) {
        ulii += count_chrom_markers(chrom_info_ptr, marker_exclude, uii);
      }
      chrom_code_end = chrom_info_ptr->max_code + 1;
    }
    marker_ct -= ulii;
    LOGPRINTF("--indep%s: Ignoring %" PRIuPTR " chromosome 0 variant%s.\n", pairwise? "-pairwise" : "", ulii, (ulii == 1)? "" : "s");
  }
  if (marker_ct < 2) {
    LOGERRPRINTF("Error: Too few valid variants for --indep%s.\n", pairwise? "-pairwise" : "");
    goto ld_prune_ret_INVALID_FORMAT;
  }

  // force founder_male_include2 allocation
  if (alloc_collapsed_haploid_filters(founder_info, sex_male, unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, &founder_include2, &founder_male_include2)) {
    goto ld_prune_ret_NOMEM;
  }
  if (weighted_x) {
    nonmale_founder_ct = founder_ct - popcount01_longs(founder_male_include2, founder_ctl);
    if (founder_ct + nonmale_founder_ct > 0x7fffffff) {
      // no, this shouldn't ever happen, but may as well document that there
      // theoretically is a 32-bit integer range issue here
      logerrprint("Error: Too many founders for --indep[-pairwise] + --ld-xchr 3.\n");
      // This is an error, so it must not fall through to the success return
      // code (previously it jumped straight to ld_prune_ret_1 with
      // retval == 0).
      goto ld_prune_ret_INVALID_CMDLINE;
    }
  }

  if (window_is_kb) {
    // determine maximum number of markers that may need to be loaded at once
    for (cur_chrom = 1; cur_chrom < chrom_code_end; cur_chrom++) {
      if (is_set(chrom_info_ptr->chrom_mask, cur_chrom)) {
        window_max = chrom_window_max(marker_pos, marker_exclude, chrom_info_ptr, cur_chrom, 0x7fffffff, ld_window_size * 1000, window_max);
      }
    }
  }
  if (pairwise) {
    prune_ld_thresh = ld_last_param * (1 + SMALL_EPSILON);
  } else {
#ifdef __LP64__
    if (window_max > 46340) {
      // todo: check what LAPACK's matrix inversion limit actually is.  Guess
      // sqrt(2^31 - 1) for now.
      logerrprint("Error: --indep does not currently support window sizes > 46340.\n");
      goto ld_prune_ret_INVALID_CMDLINE;
    }
#endif
    // r, not r2, in this case
    prune_ld_thresh = 0.999999;
  }

  window_unfiltered_start = ld_prune_next_valid_chrom_start(marker_exclude, 0, chrom_info_ptr, chrom_code_end, unfiltered_marker_ct);

  if (bigstack_alloc_ul(unfiltered_marker_ctl, &pruned_arr)) {
    goto ld_prune_ret_NOMEM;
  }

  // pruned_arr starts as a copy of marker_exclude and accumulates the bits of
  // every variant removed by pruning
  memcpy(pruned_arr, marker_exclude, unfiltered_marker_ctl * sizeof(intptr_t));

  if (!window_is_kb) {
    window_max = ld_window_size;
  }
  // per-window-slot buffers
  if (bigstack_alloc_ui(window_max, &live_indices) ||
      bigstack_alloc_ui(window_max, &start_arr) ||
      bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf) ||
      bigstack_alloc_ul(window_max * founder_ct_192_long, &geno) ||
      bigstack_alloc_ul(window_max * founder_ct_192_long, &geno_masks) ||
      bigstack_alloc_ul(window_max * founder_ctv, &geno_mmasks) ||
      bigstack_alloc_ui(window_max, &missing_cts) ||
      bigstack_alloc_d(window_max, &sums) ||
      bigstack_alloc_d(window_max, &variance_recips)) {
    goto ld_prune_ret_NOMEM;
  }
  if (weighted_x) {
    if (bigstack_alloc_ul(window_max * founder_ct_192_long, &nonmale_geno) ||
        bigstack_alloc_ul(window_max * founder_ct_192_long, &nonmale_masks)) {
      goto ld_prune_ret_NOMEM;
    }
  }
  // zero the trailing words of every row once up front
  for (ulii = 1; ulii <= window_max; ulii++) {
    fill_ulong_zero(founder_trail_ct + 2, &(geno[ulii * founder_ct_192_long - founder_trail_ct - 2]));
    fill_ulong_zero(founder_trail_ct + 2, &(geno_masks[ulii * founder_ct_192_long - founder_trail_ct - 2]));
    if (weighted_x) {
      fill_ulong_zero(founder_trail_ct + 2, &(nonmale_geno[ulii * founder_ct_192_long - founder_trail_ct - 2]));
      fill_ulong_zero(founder_trail_ct + 2, &(nonmale_masks[ulii * founder_ct_192_long - founder_trail_ct - 2]));
    }
  }
  if (!pairwise) {
    // workspace for the matrix-inversion stage
    if (bigstack_alloc_d(window_max * window_max, &cov_matrix) ||
        bigstack_alloc_d(window_max * window_max, &new_cov_matrix) ||
        bigstack_alloc_ui(window_max, &idx_remap)) {
      goto ld_prune_ret_NOMEM;
    }

    irow = (MATRIX_INVERT_BUF1_TYPE*)bigstack_alloc(window_max * MATRIX_INVERT_BUF1_CHECKED_ALLOC);
    if (!irow) {
      goto ld_prune_ret_NOMEM;
    }

    if (window_max < 4) {
      ulii = 4;
    } else {
      ulii = window_max;
    }
    if (bigstack_alloc_d(ulii * window_max, &work)) {
      goto ld_prune_ret_NOMEM;
    }
  }
  // one iteration per chromosome
  do {
    prev_end = 0;
    ld_prune_start_chrom(window_is_kb, &cur_chrom, &chrom_end, window_unfiltered_start, live_indices, start_arr, &window_unfiltered_end, ld_window_size, &cur_window_size, unfiltered_marker_ct, pruned_arr, chrom_info_ptr, marker_pos, &is_haploid, &is_x, &is_y);
    if (weighted_x) {
      if (is_x) {
        weighted_founder_ct = 2 * founder_ct;
      } else {
        weighted_founder_ct = founder_ct;
      }
    }
    old_window_size = 0;
    cur_exclude_ct = 0;
    if (cur_window_size > 1) {
      // load and preprocess every variant in the initial window
      for (ulii = 0; ulii < (uintptr_t)cur_window_size; ulii++) {
        uii = live_indices[ulii];
        if (fseeko(bedfile, bed_offset + (uii * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
          goto ld_prune_ret_READ_FAIL;
        }
        if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, uii), bedfile, loadbuf, &(geno[ulii * founder_ct_192_long]))) {
          goto ld_prune_ret_READ_FAIL;
        }
        if (is_haploid && hh_exists) {
          haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)(&(geno[ulii * founder_ct_192_long])));
        }
        if (!ld_process_load(&(geno[ulii * founder_ct_192_long]), &(geno_masks[ulii * founder_ct_192_long]), &(geno_mmasks[ulii * founder_ctv]), &(missing_cts[ulii]), &(sums[ulii]), &(variance_recips[ulii]), founder_ct, is_x && (!ignore_x), weighted_x, nonmale_founder_ct, founder_male_include2, nonmale_geno, nonmale_masks, ulii * founder_ct_192_long)) {
          SET_BIT(uii, pruned_arr);
          cur_exclude_ct++;
        }
      }
    }
    pct = 1;
    chrom_start = get_chrom_start_vidx(chrom_info_ptr, cur_chrom);
    pct_thresh = window_unfiltered_start + ((uint64_t)pct * (chrom_end - chrom_start)) / 100;
    while ((window_unfiltered_start < chrom_end) || (cur_window_size > 1)) {
      if (cur_window_size > 1) {
        // Stage 1: pairwise scan.  Repeat until a full pass over the window
        // prunes nothing.
        do {
          at_least_one_prune = 0;
          for (uii = 0; uii < cur_window_size - 1; uii++) {
            if (IS_SET(pruned_arr, live_indices[uii])) {
              continue;
            }
            fixed_missing_ct = missing_cts[uii];
            fixed_non_missing_ct = weighted_founder_ct - fixed_missing_ct;
            geno_fixed_vec_ptr = &(geno[uii * founder_ct_192_long]);
            mask_fixed_vec_ptr = &(geno_masks[uii * founder_ct_192_long]);
            // start_arr[uii] is the first unfiltered index that still needs
            // to be compared against slot uii; skip partners before it
            ujj = uii + 1;
            while (live_indices[ujj] < start_arr[uii]) {
              if (++ujj == cur_window_size) {
                break;
              }
            }
            for (; ujj < cur_window_size; ujj++) {
              if (IS_SET(pruned_arr, live_indices[ujj])) {
                continue;
              }
              geno_var_vec_ptr = &(geno[ujj * founder_ct_192_long]);
              if ((!fixed_missing_ct) && (!missing_cts[ujj]) && ((!is_x) || (!weighted_x))) {
                // no missing calls in either variant: use the cached sums
                // and variance terms
                cov12 = (double)(ld_dot_prod_nm(geno_fixed_vec_ptr, geno_var_vec_ptr, weighted_founder_ct, founder_ct_mld_m1, founder_ct_mld_rem) * ((int64_t)founder_ct)) - sums[uii] * sums[ujj];
                dxx = variance_recips[uii] * variance_recips[ujj];
              } else {
                mask_var_vec_ptr = &(geno_masks[ujj * founder_ct_192_long]);
                dp_result[0] = weighted_founder_ct;
                // reversed from what I initially thought because I'm passing
                // the ujj-associated buffers before the uii-associated ones.
                dp_result[1] = -((int32_t)fixed_non_missing_ct);
                dp_result[2] = missing_cts[ujj] - weighted_founder_ct;
                dp_result[3] = dp_result[1];
                dp_result[4] = dp_result[2];
                ld_dot_prod(geno_var_vec_ptr, geno_fixed_vec_ptr, mask_var_vec_ptr, mask_fixed_vec_ptr, dp_result, founder_ct_mld_m1, founder_ct_mld_rem);
                if (is_x && weighted_x) {
                  non_missing_ct = (popcount_longs_intersect(&(nonmale_masks[uii * founder_ct_192_long]), &(nonmale_masks[ujj * founder_ct_192_long]), 2 * founder_ctl) + popcount_longs_intersect(mask_fixed_vec_ptr, mask_var_vec_ptr, 2 * founder_ctl)) / 2;
                  ld_dot_prod(&(nonmale_geno[ujj * founder_ct_192_long]), &(nonmale_geno[uii * founder_ct_192_long]), &(nonmale_masks[ujj * founder_ct_192_long]), &(nonmale_masks[uii * founder_ct_192_long]), dp_result, founder_ct_mld_m1, founder_ct_mld_rem);
                } else {
                  non_missing_ct = fixed_non_missing_ct - missing_cts[ujj];
                  if (fixed_missing_ct && missing_cts[ujj]) {
                    // add back samples counted as missing twice
                    non_missing_ct += popcount_longs_intersect(&(geno_mmasks[uii * founder_ctv]), &(geno_mmasks[ujj * founder_ctv]), founder_ctl);
                  }
                }
                non_missing_ctd = (double)((int32_t)non_missing_ct);
                dxx = dp_result[1];
                dyy = dp_result[2];
                cov12 = dp_result[0] * non_missing_ctd - dxx * dyy;
                dxx = 1.0 / ((dp_result[3] * non_missing_ctd + dxx * dxx) * (dp_result[4] * non_missing_ctd + dyy * dyy));
              }
              if (!pairwise) {
                // store r for the matrix stage
                dxx = cov12 * sqrt(dxx);
                if (dxx != dxx) {
                  // force prune if 0/0 for now
                  dxx = 1.0;
                }
                cov_matrix[uii * window_max + ujj] = dxx;
              } else {
                // r^2
                dxx = cov12 * cov12 * dxx;
              }
              if (dxx > prune_ld_thresh) {
                at_least_one_prune = 1;
                cur_exclude_ct++;
                // remove marker with lower MAF
                // could cache MAFs of all current-window variants, but
                // get_maf() is too cheap for this to make a noticeable
                // difference
                if (get_maf(set_allele_freqs[live_indices[uii]]) < (1 - SMALL_EPSILON) * get_maf(set_allele_freqs[live_indices[ujj]])) {
                  /*
                  if (debug_print) {
                    printf("removed %u, kept %u, MAFs %g/%g, r2 %g\n", live_indices[uii], live_indices[ujj], get_maf(set_allele_freqs[live_indices[uii]]), get_maf(set_allele_freqs[live_indices[ujj]]), dxx);
                  }
                  */
                  SET_BIT(live_indices[uii], pruned_arr);
                } else {
                  /*
                  if (debug_print) {
                    printf("removed %u, kept %u, MAFs %g/%g, r2 %g\n", live_indices[ujj], live_indices[uii], get_maf(set_allele_freqs[live_indices[ujj]]), get_maf(set_allele_freqs[live_indices[uii]]), dxx);
                  }
                  */
                  SET_BIT(live_indices[ujj], pruned_arr);
                  // remember where to resume comparisons for slot uii on the
                  // next pass
                  ujj++;
                  while (ujj < cur_window_size) {
                    if (!IS_SET(pruned_arr, live_indices[ujj])) {
                      break;
                    }
                    ujj++;
                  }
                  if (ujj < cur_window_size) {
                    start_arr[uii] = live_indices[ujj];
                  }
                }
                break;
              }
            }
            if (ujj == cur_window_size) {
              // slot uii has now been compared against the whole window
              start_arr[uii] = window_unfiltered_end;
            }
          }
        } while (at_least_one_prune);
        if (!pairwise) {
          // Stage 2 (--indep only): matrix-inversion-based pruning.
          // idx_remap[] lists the window slots of the surviving variants;
          // the first old_window_rem of them were carried over from the
          // previous window.
          window_rem = 0;
          for (uii = 0; uii < old_window_size; uii++) {
            if (IS_SET(pruned_arr, live_indices[uii])) {
              continue;
            }
            idx_remap[window_rem++] = uii;
          }
          old_window_rem = window_rem;
          for (; uii < cur_window_size; uii++) {
            if (IS_SET(pruned_arr, live_indices[uii])) {
              continue;
            }
            idx_remap[window_rem++] = uii;
          }
          while (window_rem > 1) {
            // build the dense symmetric correlation matrix of the survivors
            new_cov_matrix[0] = 1.0;
            for (uii = 1; uii < window_rem; uii++) {
              ukk = idx_remap[uii];
              for (ujj = 0; ujj < uii; ujj++) {
                dxx = cov_matrix[idx_remap[ujj] * window_max + ukk];
                new_cov_matrix[ujj * window_rem + uii] = dxx;
                new_cov_matrix[uii * window_rem + ujj] = dxx;
              }
              new_cov_matrix[uii * (window_rem + 1)] = 1.0;
            }
            window_rem_li = window_rem;
            ii = invert_matrix_checked(window_rem_li, new_cov_matrix, irow, work);
            // nonzero ii: inversion failed
            while (ii) {
#ifdef NOLAPACK
              if (ii == -1) {
                goto ld_prune_ret_NOMEM;
              }
#endif
              // 1. binary search for minimum number of bottom right rows/
              //    columns that must be trimmed to get a nonsingular matrix
              bsearch_max = window_rem - 1;
              if (old_window_rem > bsearch_max) {
                // Normally we can assume that only loci not in the previous
                // window need to be considered here.  But, thanks to numeric
                // instability, we might still need to properly handle an
                // apparently-singular old submatrix?
                old_window_size = 0;
                old_window_rem = 0;
              }
              bsearch_min = old_window_rem;
              while (bsearch_min < bsearch_max) {
                bsearch_cur = (bsearch_min + bsearch_max) / 2;
                new_cov_matrix[0] = 1.0;
                for (uii = 1; uii < bsearch_cur; uii++) {
                  ukk = idx_remap[uii];
                  for (ujj = 0; ujj < uii; ujj++) {
                    dxx = cov_matrix[idx_remap[ujj] * window_max + ukk];
                    new_cov_matrix[ujj * bsearch_cur + uii] = dxx;
                    new_cov_matrix[uii * bsearch_cur + ujj] = dxx;
                  }
                  new_cov_matrix[uii * (bsearch_cur + 1)] = 1.0;
                }
                if (bsearch_cur) {
                  window_rem_li = bsearch_cur;
                  ii = invert_matrix_checked(window_rem_li, new_cov_matrix, irow, work);
                  if (!ii) {
                    bsearch_min = bsearch_cur + 1;
                  } else {
                    bsearch_max = bsearch_cur;
                  }
                } else {
                  bsearch_min = 1;
                }
              }

              // 2. the last trimmed row/column must be part of some linear
              //    combination.  prune *just* that, and retry.
              ujj = bsearch_min;
              // bug reported by Kaustubh was a violation of this:
              // assert(!IS_SET(pruned_arr, live_indices[idx_remap[ujj]]));
              SET_BIT(live_indices[idx_remap[ujj]], pruned_arr);
              cur_exclude_ct++;
              window_rem--;
              for (uii = ujj; uii < window_rem; uii++) {
                idx_remap[uii] = idx_remap[uii + 1];
              }
              new_cov_matrix[0] = 1.0;
              for (uii = 1; uii < window_rem; uii++) {
                ukk = idx_remap[uii];
                for (ujj = 0; ujj < uii; ujj++) {
                  dxx = cov_matrix[idx_remap[ujj] * window_max + ukk];
                  new_cov_matrix[ujj * window_rem + uii] = dxx;
                  new_cov_matrix[uii * window_rem + ujj] = dxx;
                }
                new_cov_matrix[uii * (window_rem + 1)] = 1.0;
              }
              window_rem_li = window_rem;
              ii = invert_matrix_checked(window_rem_li, new_cov_matrix, irow, work);
            }
            // find the largest diagonal entry of the inverted matrix
            dxx = new_cov_matrix[0];
            ujj = 0;
            for (uii = 1; uii < window_rem; uii++) {
              if (new_cov_matrix[uii * (window_rem + 1)] > dxx) {
                dxx = new_cov_matrix[uii * (window_rem + 1)];
                ujj = uii;
              }
            }
            if (dxx > ld_last_param) {
              // prune that variant and re-invert without it
              SET_BIT(live_indices[idx_remap[ujj]], pruned_arr);
              cur_exclude_ct++;
              window_rem--;
              if (idx_remap[ujj] < (uint32_t)old_window_size) {
                old_window_rem--;
              }
              for (uii = ujj; uii < window_rem; uii++) {
                idx_remap[uii] = idx_remap[uii + 1];
              }
            } else {
              // break out
              window_rem = 1;
            }
          }
        }
      }
      // advance the window start by up to ld_window_incr non-excluded
      // variants
      for (uii = 0; uii < ld_window_incr; uii++) {
        if (window_unfiltered_start == chrom_end) {
          break;
        }
        window_unfiltered_start++;
        next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_start);
      }
      if (window_unfiltered_start == chrom_end) {
        break;
      }
      if (window_unfiltered_start >= pct_thresh) {
        // progress indicator
        pct = ((window_unfiltered_start - chrom_start) * 100LLU) / (chrom_end - chrom_start);
        printf("\r%u%%", pct++);
        fflush(stdout);
        pct_thresh = chrom_start + (((uint64_t)pct * (chrom_end - chrom_start)) / 100);
      }
      ujj = 0;

      if (window_unfiltered_end < window_unfiltered_start) {
        window_unfiltered_end = window_unfiltered_start;
      }

      // copy back previously loaded/computed results
      while (live_indices[ujj] < window_unfiltered_start) {
        ujj++;
        if (ujj == cur_window_size) {
          break;
        }
      }
      // compact surviving, still-in-window variants to the front of every
      // per-slot buffer
      for (uii = 0; ujj < cur_window_size; ujj++) {
        if (IS_SET(pruned_arr, live_indices[ujj])) {
          continue;
        }
        memcpy(&(geno[uii * founder_ct_192_long]), &(geno[ujj * founder_ct_192_long]), founder_ct_192_long * sizeof(intptr_t));
        memcpy(&(geno_masks[uii * founder_ct_192_long]), &(geno_masks[ujj * founder_ct_192_long]), founder_ct_192_long * sizeof(intptr_t));
        if (is_x && weighted_x) {
          memcpy(&(nonmale_geno[uii * founder_ct_192_long]), &(nonmale_geno[ujj * founder_ct_192_long]), founder_ct_192_long * sizeof(intptr_t));
          memcpy(&(nonmale_masks[uii * founder_ct_192_long]), &(nonmale_masks[ujj * founder_ct_192_long]), founder_ct_192_long * sizeof(intptr_t));
        }
        memcpy(&(geno_mmasks[uii * founder_ctv]), &(geno_mmasks[ujj * founder_ctv]), founder_ctl * sizeof(intptr_t));
        live_indices[uii] = live_indices[ujj];
        start_arr[uii] = start_arr[ujj];
        missing_cts[uii] = missing_cts[ujj];
        sums[uii] = sums[ujj];
        variance_recips[uii] = variance_recips[ujj];
        if (!pairwise) {
          // carry over the already-computed correlations; idx_remap[ukk]
          // holds the old slot of the variant now in slot ukk
          for (ukk = 0; ukk < uii; ukk++) {
            cov_matrix[ukk * window_max + uii] = cov_matrix[idx_remap[ukk] * window_max + ujj];
          }
          idx_remap[uii] = ujj;
        }
        uii++;
      }

      prev_end = uii;
      cur_window_size = uii;
      // ujj := number of new variants to append to the window
      if (window_is_kb) {
        ujj = 0;
        ukk = window_unfiltered_end;
        while ((ukk < chrom_end) && (marker_pos[ukk] <= marker_pos[window_unfiltered_start] + (1000 * ld_window_size))) {
          ujj++;
          ukk++;
          next_unset_ck(marker_exclude, chrom_end, &ukk);
        }
      } else {
        ujj = ld_window_incr;
      }
      old_window_size = cur_window_size;
      for (uii = 0; uii < ujj; uii++) {
        if (window_unfiltered_end == chrom_end) {
          break;
        }
        live_indices[cur_window_size] = window_unfiltered_end;
        if (cur_window_size > prev_end) {
          start_arr[cur_window_size - 1] = window_unfiltered_end;
        }
        if (fseeko(bedfile, bed_offset + (window_unfiltered_end * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
          goto ld_prune_ret_READ_FAIL;
        }
        if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, window_unfiltered_end), bedfile, loadbuf, &(geno[cur_window_size * founder_ct_192_long]))) {
          goto ld_prune_ret_READ_FAIL;
        }
        if (is_haploid && hh_exists) {
          haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)(&(geno[cur_window_size * founder_ct_192_long])));
        }
        if (!ld_process_load(&(geno[cur_window_size * founder_ct_192_long]), &(geno_masks[cur_window_size * founder_ct_192_long]), &(geno_mmasks[cur_window_size * founder_ctv]), &(missing_cts[cur_window_size]), &(sums[cur_window_size]), &(variance_recips[cur_window_size]), founder_ct, is_x && (!ignore_x), weighted_x, nonmale_founder_ct, founder_male_include2, nonmale_geno, nonmale_masks, cur_window_size * founder_ct_192_long)) {
          SET_BIT(window_unfiltered_end, pruned_arr);
          cur_exclude_ct++;
        }
        cur_window_size++;
        window_unfiltered_end++;
        next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_end);
      }
      if (cur_window_size > prev_end) {
        start_arr[cur_window_size - 1] = window_unfiltered_end;
      }
    }
    putc_unlocked('\r', stdout);
    LOGPRINTF("Pruned %" PRIuPTR " variant%s from chromosome %u, leaving %" PRIuPTR ".\n", cur_exclude_ct, (cur_exclude_ct == 1)? "" : "s", cur_chrom, chrom_end - chrom_start - popcount_bit_idx(marker_exclude, chrom_start, chrom_end) - cur_exclude_ct);
    tot_exclude_ct += cur_exclude_ct;

    // advance chromosomes as necessary
    window_unfiltered_start = ld_prune_next_valid_chrom_start(pruned_arr, window_unfiltered_start, chrom_info_ptr, chrom_code_end, unfiltered_marker_ct);
  } while (window_unfiltered_start < unfiltered_marker_ct);

  LOGPRINTF("Pruning complete.  %u of %" PRIuPTR " variants removed.\n", tot_exclude_ct, marker_ct);
  retval = ld_prune_write(outname, outname_end, marker_exclude, pruned_arr, marker_ids, max_marker_id_len, chrom_info_ptr, chrom_code_end);
  if (retval) {
    goto ld_prune_ret_1;
  }

  while (0) {
  ld_prune_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  ld_prune_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  ld_prune_ret_INVALID_FORMAT:
    retval = RET_INVALID_FORMAT;
    break;
  // not LP64-only any more: the --ld-xchr 3 founder-count check also lands
  // here
  ld_prune_ret_INVALID_CMDLINE:
    retval = RET_INVALID_CMDLINE;
    break;
  }
 ld_prune_ret_1:
  bigstack_reset(bigstack_mark);
  return retval;
}
1350 
ld_process_load2(uintptr_t * geno_buf,uintptr_t * mask_buf,uint32_t * missing_ct_ptr,uint32_t founder_ct,uint32_t is_x,uintptr_t * founder_male_include2)1351 void ld_process_load2(uintptr_t* geno_buf, uintptr_t* mask_buf, uint32_t* missing_ct_ptr, uint32_t founder_ct, uint32_t is_x, uintptr_t* founder_male_include2) {
1352   // ld_process_load(), except no missing_buf[] to conserve memory (and no
1353   // --ld-xchr 3 support yet), and no zero-variance check (we just want to
1354   // dump nans in that case)
1355   uintptr_t* geno_ptr = geno_buf;
1356   uintptr_t founder_ctl2 = QUATERCT_TO_WORDCT(founder_ct);
1357   uintptr_t* geno_end = &(geno_buf[founder_ctl2]);
1358   uintptr_t* mask_buf_ptr = mask_buf;
1359   uintptr_t cur_geno;
1360   uintptr_t shifted_masked_geno;
1361   uintptr_t new_geno;
1362   uintptr_t new_mask;
1363   do {
1364     cur_geno = *geno_ptr;
1365     shifted_masked_geno = (cur_geno >> 1) & FIVEMASK;
1366     new_geno = cur_geno - shifted_masked_geno;
1367     *geno_ptr++ = new_geno;
1368     new_mask = (((~cur_geno) & FIVEMASK) | shifted_masked_geno) * 3;
1369     *mask_buf_ptr++ = new_mask;
1370   } while (geno_ptr < geno_end);
1371   if (is_x) {
1372     geno_ptr = geno_buf;
1373     do {
1374       new_geno = *geno_ptr;
1375       *geno_ptr++ = new_geno + ((~(new_geno | (new_geno >> 1))) & (*founder_male_include2++));
1376     } while (geno_ptr < geno_end);
1377   }
1378   if (founder_ct % BITCT2) {
1379     mask_buf[founder_ct / BITCT2] &= (ONELU << (2 * (founder_ct % BITCT2))) - ONELU;
1380   }
1381   *missing_ct_ptr = founder_ct - (popcount_longs(mask_buf, founder_ctl2) / 2);
1382 }
1383 
// Counts the 2-bit fields whose low bit is clear in BOTH lptr1[] and lptr2[]
// (i.e. popcount of (~(w1 | w2)) & FIVEMASK over the covered words).
//
// word12_ct:   number of 12-word groups handled by the unrolled main loop.
// word12_rem:  number of additional whole words handled one at a time
//              afterwards.
// lshift_last: if nonzero, one more word is read after those; its masked
//              value is shifted left by this many bits before counting, which
//              drops the highest-order fields of that word.
//
// Callers in this file pass the mask vectors built by ld_process_load2(), so
// the result is the number of samples missing at both variants.
uint32_t ld_missing_ct_intersect(uintptr_t* lptr1, uintptr_t* lptr2, uintptr_t word12_ct, uintptr_t word12_rem, uintptr_t lshift_last) {
  // variant of popcount_longs_intersect()
  uintptr_t tot = 0;
  uintptr_t* lptr1_end2;
#ifdef __LP64__
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  // NOTE(review): the vector dereferences below look like aligned 16-byte
  // loads, so both inputs are presumably 16-byte aligned -- confirm.
  __m128i* vptr1 = (__m128i*)lptr1;
  __m128i* vptr2 = (__m128i*)lptr2;
  __m128i* vend1;
  __m128i loader1;
  __m128i loader2;
  __univec acc;

  // Full batches of 10 groups (60 vectors).  Each inner iteration consumes 6
  // vectors and adds at most 24 to every byte of acc, so 10 iterations keep
  // the byte-wide partial sums at <= 240, i.e. below overflow.
  while (word12_ct >= 10) {
    word12_ct -= 10;
    vend1 = &(vptr1[60]);
  // The final partial batch re-enters here via the goto below, with vend1
  // set for the remaining groups and word12_ct zeroed so the enclosing while
  // loop exits afterwards.
  ld_missing_ct_intersect_main_loop:
    acc.vi = _mm_setzero_si128();
    do {
      // _mm_andnot_si128(a, b) is (~a) & b, so each term is (~(w1 | w2)) & m1.
      loader1 = _mm_andnot_si128(_mm_or_si128(*vptr2++, *vptr1++), m1);
      loader2 = _mm_andnot_si128(_mm_or_si128(*vptr2++, *vptr1++), m1);
      loader1 = _mm_add_epi64(loader1, _mm_andnot_si128(_mm_or_si128(*vptr2++, *vptr1++), m1));
      loader2 = _mm_add_epi64(loader2, _mm_andnot_si128(_mm_or_si128(*vptr2++, *vptr1++), m1));
      loader1 = _mm_add_epi64(loader1, _mm_andnot_si128(_mm_or_si128(*vptr2++, *vptr1++), m1));
      loader2 = _mm_add_epi64(loader2, _mm_andnot_si128(_mm_or_si128(*vptr2++, *vptr1++), m1));
      // 2-bit partial sums (<= 3 each) -> 4-bit sums -> 8-bit sums.
      loader1 = _mm_add_epi64(_mm_and_si128(loader1, m2), _mm_and_si128(_mm_srli_epi64(loader1, 2), m2));
      loader1 = _mm_add_epi64(loader1, _mm_add_epi64(_mm_and_si128(loader2, m2), _mm_and_si128(_mm_srli_epi64(loader2, 2), m2)));
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(loader1, m4), _mm_and_si128(_mm_srli_epi64(loader1, 4), m4)));
    } while (vptr1 < vend1);
    // Widen to 16-bit lanes, then sum the lanes of both 64-bit halves: the
    // multiply accumulates all four 16-bit lanes into the top 16 bits.
    acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
    tot += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (word12_ct) {
    // 6 vectors == 12 words per remaining group.
    vend1 = &(vptr1[word12_ct * 6]);
    word12_ct = 0;
    goto ld_missing_ct_intersect_main_loop;
  }
  lptr1 = (uintptr_t*)vptr1;
  lptr2 = (uintptr_t*)vptr2;
#else
  uintptr_t* lptr1_end = &(lptr1[word12_ct * 12]);
  uintptr_t tmp_stor;
  uintptr_t loader1;
  uintptr_t loader2;
  // Scalar version of the same reduction: each pass consumes 12 words as two
  // 6-word halves, accumulating byte-wide partial sums in tmp_stor.
  while (lptr1 < lptr1_end) {
    loader1 = (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader2 = (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader1 += (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader2 += (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader1 += (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader2 += (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader1 = (loader1 & 0x33333333) + ((loader1 >> 2) & 0x33333333);
    loader1 += (loader2 & 0x33333333) + ((loader2 >> 2) & 0x33333333);
    tmp_stor = (loader1 & 0x0f0f0f0f) + ((loader1 >> 4) & 0x0f0f0f0f);

    loader1 = (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader2 = (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader1 += (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader2 += (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader1 += (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader2 += (~((*lptr1++) | (*lptr2++))) & FIVEMASK;
    loader1 = (loader1 & 0x33333333) + ((loader1 >> 2) & 0x33333333);
    loader1 += (loader2 & 0x33333333) + ((loader2 >> 2) & 0x33333333);
    tmp_stor += (loader1 & 0x0f0f0f0f) + ((loader1 >> 4) & 0x0f0f0f0f);
    // Horizontal sum of the four bytes.
    tot += (tmp_stor * 0x01010101) >> 24;
  }
#endif
  // Leftover whole words that did not fill a 12-word group.
  lptr1_end2 = &(lptr1[word12_rem]);
  while (lptr1 < lptr1_end2) {
    tot += popcount2_long((~((*lptr1++) | (*lptr2++))) & FIVEMASK);
  }
  // Final partial word: the left shift discards the fields that lie past the
  // last entry.
  if (lshift_last) {
    tot += popcount2_long(((~((*lptr1) | (*lptr2))) & FIVEMASK) << lshift_last);
  }
  return tot;
}
1463 
// --flip-scan implementation.
//
// For every variant, signed correlations (r) with the other variants in a
// sliding window are computed separately among founder controls and founder
// cases.  A pair counts as a "positive" match when at least one of the two
// correlations reaches the threshold (ldip->flipscan_thresh) in absolute value
// and both have the same sign, and as a "negative" match when the signs
// differ.  One summary line per variant is written to <outname>.flipscan; when
// LD_FLIPSCAN_VERBOSE is set in ldip->modifier, the individual pairs of every
// variant with at least one negative match are also written to
// <outname>.flipscan.verbose.
//
// The window is limited by both ldip->flipscan_window_size (variant count) and
// ldip->flipscan_window_bp (base-pair distance).  Only samples that are
// founders with a nonmissing phenotype are used.
//
// Returns 0 on success, or RET_NOMEM / RET_OPEN_FAIL / RET_READ_FAIL /
// RET_WRITE_FAIL / RET_INVALID_CMDLINE.  All bigstack allocations are released
// and both output files are closed before returning.  outname_end is
// overwritten with the ".flipscan" extension.
int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uint32_t* marker_pos, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* founder_info, uintptr_t* sex_male, char* outname, char* outname_end, uint32_t hh_exists) {
  unsigned char* bigstack_mark = g_bigstack_base;
  FILE* outfile = nullptr;
  FILE* outfile_verbose = nullptr;
  uintptr_t* sample_include2 = nullptr;
  uintptr_t* sample_male_include2 = nullptr;
  double min_corr = ldip->flipscan_thresh * (1 - SMALL_EPSILON);
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t marker_idx = 0;
  uintptr_t max_window_size = 1;
  uintptr_t pct = 1;
  uintptr_t pct_thresh = marker_ct / 100;
  uint32_t verbose = (ldip->modifier / LD_FLIPSCAN_VERBOSE) & 1;
  uint32_t ignore_x = (ldip->modifier / LD_IGNORE_X) & 1;
  uint32_t max_window_locus_ct = ldip->flipscan_window_size - 1;
  uint32_t window_bp = ldip->flipscan_window_bp;
  uint32_t problem_ct = 0;
  int32_t retval = 0;
  // index 0 = controls, index 1 = cases in all two-element arrays below
  uintptr_t* founder_phenos[2];
  uintptr_t* pheno_male_include2[2];
  uintptr_t* window_geno[2];
  uintptr_t* window_mask[2];
  uintptr_t pheno_ct[2];
  uintptr_t pheno_ct_192_long[2];
  uint32_t pheno_ctl[2];
  uint32_t pheno_ct_mld_m1[2];
  uint32_t pheno_ct_mld_rem[2];
  int32_t dp_result[5];
  double* r_matrix;
  double* r_matrix_ptr;
  double* r_row_ptr;
  uintptr_t* loadbuf_raw;
  uintptr_t* window_geno_ptr;
  uintptr_t* window_mask_ptr;
  uintptr_t* geno_fixed_vec_ptr;
  uintptr_t* mask_fixed_vec_ptr;
  uintptr_t* geno_var_vec_ptr;
  uintptr_t* mask_var_vec_ptr;
  uint32_t* window_uidxs;
  uint32_t* window_cidx_starts;
  uint32_t* neg_uidx_buf;
  uint32_t* missing_cts;
  uint32_t* missing_cts_ptr;
  char* textbuf;
  char* wptr;
  char* wptr_start;
  char* wptr_start2;
  double pos_r_tot;
  double neg_r_tot;
  double ctrl_pheno;
  double case_pheno;
  double non_missing_ctd;
  double cov12;
  double dxx;
  double dyy;
  uintptr_t marker_uidx;
  uintptr_t cur_pheno_ct;
  uintptr_t window_cidx;
  uintptr_t window_cidx2;
  uintptr_t window_cidx3;
  uintptr_t marker_uidx2;
  uintptr_t marker_uidx3;
  uintptr_t cur_192_long;
  uintptr_t cur_ctwd12;
  uintptr_t cur_ctwd12_rem;
  uintptr_t lshift_last;
  uintptr_t ulii;
  uintptr_t uljj;
  uint32_t chrom_fo_idx;
  uint32_t chrom_idx;
  uint32_t chrom_end;
  uint32_t chrom_marker_ct;
  uint32_t chrom_marker_idx;
  uint32_t is_haploid;
  uint32_t is_x;
  uint32_t is_y;
  uint32_t is_case;
  uint32_t marker_pos_thresh;
  uint32_t pos_r_ct;
  uint32_t neg_r_ct;
  uint32_t fixed_missing_ct;
  uint32_t fixed_non_missing_ct;
  uint32_t non_missing_ct;
  uint32_t cur_mld_m1;
  uint32_t cur_mld_rem;
  uint32_t uii;
  // Use the global text buffer unless long IDs/alleles could overflow it.
  ulii = 2 * (max_marker_allele_len + plink_maxsnp) + 256;
  if (ulii <= MAXLINELEN) {
    textbuf = g_textbuf;
  } else {
    if (bigstack_alloc_c(ulii, &textbuf)) {
      goto flipscan_ret_NOMEM;
    }
  }
  if (bigstack_alloc_ul(unfiltered_sample_ctl, &(founder_phenos[0])) ||
      bigstack_alloc_ul(unfiltered_sample_ctl, &(founder_phenos[1])) ||
      bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw)) {
    goto flipscan_ret_NOMEM;
  }
  loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
  // founder_phenos[0] starts out as (founder AND phenotype nonmissing); it is
  // split into controls ([0]) and cases ([1]) below.
  memcpy(founder_phenos[0], founder_info, unfiltered_sample_ctl * sizeof(intptr_t));
  bitvec_and(pheno_nm, unfiltered_sample_ctl, founder_phenos[0]);
  if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_exists, 0, founder_phenos[0], sex_male, &sample_include2, &sample_male_include2)) {
    goto flipscan_ret_NOMEM;
  }
  memcpy(founder_phenos[1], founder_phenos[0], unfiltered_sample_ctl * sizeof(intptr_t));
  bitvec_and(pheno_c, unfiltered_sample_ctl, founder_phenos[1]);
  bitvec_andnot(pheno_c, unfiltered_sample_ctl, founder_phenos[0]);
  pheno_ct[0] = popcount_longs(founder_phenos[0], unfiltered_sample_ctl);
  pheno_ct[1] = popcount_longs(founder_phenos[1], unfiltered_sample_ctl);
  if ((!pheno_ct[0]) || (!pheno_ct[1])) {
    if (popcount_longs(founder_info, unfiltered_sample_ctl)) {
      logerrprint("Error: --flip-scan requires at least one case and one control, and only\nconsiders founders.\n");
    } else {
      logerrprint("Error: --flip-scan requires founders.  (--make-founders may come in handy\nhere.)\n");
    }
    goto flipscan_ret_INVALID_CMDLINE;
  }
  for (is_case = 0; is_case < 2; is_case++) {
    pheno_ctl[is_case] = BITCT_TO_WORDCT(pheno_ct[is_case]);

    // ulii == total number of blocks, all but last is size MULTIPLEX_LD
    ulii = (pheno_ct[is_case] + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
    pheno_ct_mld_m1[is_case] = ulii - 1;

    // number of size-{48,192} sub-blocks in trailing block
#ifdef __LP64__
    pheno_ct_mld_rem[is_case] = (MULTIPLEX_LD / 192) - (ulii * MULTIPLEX_LD - pheno_ct[is_case]) / 192;
#else
    pheno_ct_mld_rem[is_case] = (MULTIPLEX_LD / 48) - (ulii * MULTIPLEX_LD - pheno_ct[is_case]) / 48;
#endif

    // number of genotype words per variant, rounded up to the next 192-sample
    // boundary
    pheno_ct_192_long[is_case] = pheno_ct_mld_m1[is_case] * (MULTIPLEX_LD / BITCT2) + pheno_ct_mld_rem[is_case] * (192 / BITCT2);
  }
  for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
    max_window_size = chrom_window_max(marker_pos, marker_exclude, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], max_window_locus_ct * 2 + 1, window_bp * 2, max_window_size);
  }
  if (bigstack_alloc_ui(max_window_size, &window_uidxs) ||
      bigstack_alloc_ui(max_window_size, &window_cidx_starts) ||
      bigstack_alloc_ui(max_window_size, &neg_uidx_buf) ||
      bigstack_alloc_ul(pheno_ctl[0] * 2, &(pheno_male_include2[0])) ||
      bigstack_alloc_ul(pheno_ctl[1] * 2, &(pheno_male_include2[1])) ||
      bigstack_alloc_ui(max_window_size * 2, &missing_cts) ||
      bigstack_alloc_ul(max_window_size * pheno_ct_192_long[0], &(window_geno[0])) ||
      bigstack_alloc_ul(max_window_size * pheno_ct_192_long[0], &(window_mask[0])) ||
      bigstack_alloc_ul(max_window_size * pheno_ct_192_long[1], &(window_geno[1])) ||
      bigstack_alloc_ul(max_window_size * pheno_ct_192_long[1], &(window_mask[1])) ||
      // not advantageous to choose a very large block size here, so O(n^2)
      // memory is fine (though it can be avoided by calculating each
      // correlation twice).
      bigstack_alloc_d(max_window_size * max_window_size * 2, &r_matrix)) {
    goto flipscan_ret_NOMEM;
  }
  // bugfix: initialize r_matrix diagonal.  r_matrix is a max_window_size x
  // max_window_size matrix of interleaved (control, case) doubles, so
  // consecutive diagonal entries are (max_window_size + 1) * 2 doubles apart.
  // This stride must only be applied to r_matrix: neg_uidx_buf has just
  // max_window_size entries (indexing it this way writes out of bounds), and
  // it needs no initialization since entries [0, neg_r_ct) are always written
  // before they are read.
  ulii = (max_window_size + 1) * 2;
  for (uljj = 0; uljj < max_window_size; uljj++) {
    r_matrix[uljj * ulii] = 0.0;
    r_matrix[uljj * ulii + 1] = 0.0;
  }
  for (is_case = 0; is_case < 2; is_case++) {
    quaterarr_collapse_init(sex_male, unfiltered_sample_ct, founder_phenos[is_case], pheno_ct[is_case], pheno_male_include2[is_case]);
    window_geno_ptr = window_geno[is_case];
    window_mask_ptr = window_mask[is_case];
    cur_192_long = pheno_ct_192_long[is_case];
    // Zero the trailing words of every window slot once up front.
    ulii = 2 + pheno_ct_192_long[is_case] - pheno_ctl[is_case] * 2;
    for (uljj = 1; uljj <= max_window_size; uljj++) {
      fill_ulong_zero(ulii, &(window_geno_ptr[uljj * cur_192_long - ulii]));
      fill_ulong_zero(ulii, &(window_mask_ptr[uljj * cur_192_long - ulii]));
    }
  }

  memcpy(outname_end, ".flipscan", 10);
  if (fopen_checked(outname, "w", &outfile)) {
    goto flipscan_ret_OPEN_FAIL;
  }
  wptr = memcpya(textbuf, "   CHR ", 7);
  wptr = fw_strcpyn(plink_maxsnp, 3, "SNP", wptr);
  wptr = strcpya(wptr, "           BP   A1   A2        F    POS    R_POS    NEG    R_NEG NEGSNPS\n");
  if (fwrite_checked(textbuf, wptr - textbuf, outfile)) {
    goto flipscan_ret_WRITE_FAIL;
  }
  if (verbose) {
    memcpy(&(outname_end[9]), ".verbose", 9);
    if (fopen_checked(outname, "w", &outfile_verbose)) {
      goto flipscan_ret_OPEN_FAIL;
    }
    // restore outname to the main report's name for the final log message
    outname_end[9] = '\0';
    // er, this is a misalignment disaster
    wptr = memcpya(textbuf, "CHR_INDX ", 9);
    wptr = fw_strcpyn(plink_maxsnp, 8, "SNP_INDX", wptr);
    wptr = memcpya(wptr, "      BP_INDX A1_INDX ", 22);
    wptr = fw_strcpyn(plink_maxsnp, 8, "SNP_PAIR", wptr);
    wptr = strcpya(wptr, "      BP_PAIR A1_PAIR      R_A      R_U\n");
    if (fwrite_checked(textbuf, wptr - textbuf, outfile_verbose)) {
      goto flipscan_ret_WRITE_FAIL;
    }
  }
  printf("--flip-scan%s: 0%%", verbose? " verbose" : "");
  fflush(stdout);
  for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
    chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
    chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
    marker_uidx = next_unset(marker_exclude, chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx], chrom_end);
    chrom_marker_ct = chrom_end - marker_uidx - popcount_bit_idx(marker_exclude, marker_uidx, chrom_end);
    if (chrom_marker_ct < 2) {
      // no pairs on this chromosome; nothing is reported for it
      marker_idx += chrom_marker_ct;
      continue;
    }
    // The chromosome column is identical for every line of this chromosome,
    // so it is written into textbuf once.
    wptr_start = width_force(6, textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, textbuf));
    *wptr_start++ = ' ';
    is_haploid = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
    is_x = (chrom_idx == ((uint32_t)chrom_info_ptr->xymt_codes[X_OFFSET]));
    is_y = (chrom_idx == ((uint32_t)chrom_info_ptr->xymt_codes[Y_OFFSET]));
    if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto flipscan_ret_READ_FAIL;
    }
    chrom_marker_idx = 0;
    // window_cidx  = circular slot of the most recently loaded variant
    // window_cidx2 = circular slot of the oldest variant not yet reported
    window_cidx = max_window_size - 1;
    window_cidx2 = 0;
    do {
      if (++window_cidx == max_window_size) {
        window_cidx = 0;
      }
      window_uidxs[window_cidx] = marker_uidx;

      // circular index of beginning of window starting at current marker
      window_cidx_starts[window_cidx] = window_cidx2;
      if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
        goto flipscan_ret_READ_FAIL;
      }
      if (IS_SET(marker_reverse, marker_uidx)) {
        reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf_raw);
      }
      if (is_haploid && hh_exists) {
        haploid_fix(hh_exists, sample_include2, sample_male_include2, unfiltered_sample_ct, is_x, is_y, (unsigned char*)loadbuf_raw);
      }
      for (is_case = 0; is_case < 2; is_case++) {
        // similar to ld_block_thread() below
        cur_pheno_ct = pheno_ct[is_case];
        uii = cur_pheno_ct / BITCT2;
        cur_ctwd12 = uii / 12;
        cur_ctwd12_rem = uii - (12 * cur_ctwd12);
        lshift_last = 2 * ((0x7fffffc0 - cur_pheno_ct) % BITCT2);
        cur_mld_m1 = pheno_ct_mld_m1[is_case];
        cur_mld_rem = pheno_ct_mld_rem[is_case];
        cur_192_long = pheno_ct_192_long[is_case];
        window_geno_ptr = window_geno[is_case];
        window_mask_ptr = window_mask[is_case];
        missing_cts_ptr = &(missing_cts[is_case * max_window_size]);
        // control and case correlations are interleaved in r_matrix
        r_matrix_ptr = &(r_matrix[is_case]);
        geno_fixed_vec_ptr = &(window_geno_ptr[window_cidx * cur_192_long]);
        mask_fixed_vec_ptr = &(window_mask_ptr[window_cidx * cur_192_long]);
        copy_quaterarr_nonempty_subset(loadbuf_raw, founder_phenos[is_case], unfiltered_sample_ct, cur_pheno_ct, geno_fixed_vec_ptr);
        ld_process_load2(geno_fixed_vec_ptr, mask_fixed_vec_ptr, &fixed_missing_ct, cur_pheno_ct, is_x && (!ignore_x), pheno_male_include2[is_case]);
        fixed_non_missing_ct = cur_pheno_ct - fixed_missing_ct;
        missing_cts_ptr[window_cidx] = fixed_missing_ct;
        // Correlate the new variant against every variant still in the
        // window, filling both symmetric entries of r_matrix.
        window_cidx3 = window_cidx2;
        while (window_cidx3 != window_cidx) {
          geno_var_vec_ptr = &(window_geno_ptr[window_cidx3 * cur_192_long]);
          mask_var_vec_ptr = &(window_mask_ptr[window_cidx3 * cur_192_long]);
          non_missing_ct = fixed_non_missing_ct - missing_cts_ptr[window_cidx3];
          if (fixed_missing_ct && missing_cts_ptr[window_cidx3]) {
            // add back samples missing at both variants (subtracted twice)
            non_missing_ct += ld_missing_ct_intersect(mask_var_vec_ptr, mask_fixed_vec_ptr, cur_ctwd12, cur_ctwd12_rem, lshift_last);
          }
          if (non_missing_ct) {
            dp_result[0] = cur_pheno_ct;
            dp_result[1] = -((int32_t)fixed_non_missing_ct);
            dp_result[2] = missing_cts_ptr[window_cidx3] - cur_pheno_ct;
            dp_result[3] = dp_result[1];
            dp_result[4] = dp_result[2];
            ld_dot_prod(geno_var_vec_ptr, geno_fixed_vec_ptr, mask_var_vec_ptr, mask_fixed_vec_ptr, dp_result, cur_mld_m1, cur_mld_rem);
            non_missing_ctd = (double)((int32_t)non_missing_ct);
            dxx = dp_result[1];
            dyy = dp_result[2];
            cov12 = dp_result[0] * non_missing_ctd - dxx * dyy;
            dxx = (dp_result[3] * non_missing_ctd + dxx * dxx) * (dp_result[4] * non_missing_ctd + dyy * dyy);
            dxx = cov12 / sqrt(dxx);
          } else {
            dxx = 0.0;
          }
          r_matrix_ptr[2 * (window_cidx3 * max_window_size + window_cidx)] = dxx;
          r_matrix_ptr[2 * (window_cidx * max_window_size + window_cidx3)] = dxx;
          if (++window_cidx3 == max_window_size) {
            window_cidx3 = 0;
          }
        }
      }

      if (++chrom_marker_idx < chrom_marker_ct) {
        marker_uidx++;
        if (IS_SET(marker_exclude, marker_uidx)) {
          marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
          if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
            goto flipscan_ret_READ_FAIL;
          }
        }
        // Variants positioned before this threshold can no longer pair with
        // the next variant to be loaded, so they are ready to be reported.
        marker_pos_thresh = marker_pos[marker_uidx];
        if (marker_pos_thresh < window_bp) {
          marker_pos_thresh = 0;
        } else {
          marker_pos_thresh -= window_bp;
        }
      } else {
        // close out the chromosome
        marker_pos_thresh = 0x80000000U;
      }
      // only need to enforce window locus count constraint during first loop
      // iteration
      ulii = window_cidx2 + max_window_locus_ct;
      if (ulii >= max_window_size) {
        ulii -= max_window_size;
      }
      marker_uidx2 = window_uidxs[window_cidx2];
      if ((ulii == window_cidx) || (marker_pos[marker_uidx2] < marker_pos_thresh)) {
        do {
          // Tally positive/negative matches for the variant in slot
          // window_cidx2 over its full window.
          pos_r_tot = 0.0;
          neg_r_tot = 0.0;
          pos_r_ct = 0;
          neg_r_ct = 0;
          r_row_ptr = &(r_matrix[2 * max_window_size * window_cidx2]);
          window_cidx3 = window_cidx_starts[window_cidx2];
          while (1) {
            ctrl_pheno = r_row_ptr[2 * window_cidx3];
            case_pheno = r_row_ptr[2 * window_cidx3 + 1];
            if ((fabs(ctrl_pheno) >= min_corr) || (fabs(case_pheno) >= min_corr)) {
              dxx = fabs(ctrl_pheno) + fabs(case_pheno);
              if (case_pheno * ctrl_pheno >= 0.0) {
                pos_r_ct++;
                pos_r_tot += dxx;
              } else {
                neg_uidx_buf[neg_r_ct++] = window_uidxs[window_cidx3];
                neg_r_tot += dxx;
              }
            }
            if (window_cidx3 == window_cidx) {
              break;
            }
            if (++window_cidx3 == max_window_size) {
              window_cidx3 = 0;
            }
          }
          wptr_start2 = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr_start);
          wptr_start2 = memseta(wptr_start2, 32, 3);
          wptr_start2 = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr_start2);
          wptr_start2 = fw_strcpy(4, marker_allele_ptrs[2 * marker_uidx2], wptr_start2);
          *wptr_start2++ = ' ';
          wptr = fw_strcpy(4, marker_allele_ptrs[2 * marker_uidx2 + 1], wptr_start2);
          *wptr++ = ' ';
          wptr = dtoa_g_wxp3x(1.0 - set_allele_freqs[marker_uidx2], 8, ' ', wptr);
          wptr = uint32toa_w6x(pos_r_ct, ' ', wptr);
          if (!pos_r_ct) {
            wptr = memcpya(wptr, "      NA", 8);
          } else {
            // each match contributed |r_ctrl| + |r_case|, hence the factor 2
            wptr = dtoa_g_wxp3(pos_r_tot / ((int32_t)(pos_r_ct * 2)), 8, wptr);
          }
          *wptr++ = ' ';
          wptr = uint32toa_w6x(neg_r_ct, ' ', wptr);
          if (!neg_r_ct) {
            wptr = memcpya(wptr, "      NA", 8);
          } else {
            wptr = dtoa_g_wxp3(neg_r_tot / ((int32_t)(neg_r_ct * 2)), 8, wptr);
          }
          *wptr++ = ' ';
          if (fwrite_checked(textbuf, wptr - textbuf, outfile)) {
            goto flipscan_ret_WRITE_FAIL;
          }
          if (neg_r_ct) {
            // NEGSNPS column: '|'-delimited IDs of the negative matches
            for (ulii = 0; ulii < neg_r_ct; ulii++) {
              if (ulii) {
                putc_unlocked('|', outfile);
              }
              fputs(&(marker_ids[neg_uidx_buf[ulii] * max_marker_id_len]), outfile);
            }
            problem_ct++;
            if (verbose) {
              // textbuf up to wptr_start2 still holds the index variant's
              // CHR/SNP/BP/A1 fields; append each matching pair after them.
              window_cidx3 = window_cidx_starts[window_cidx2];
              while (1) {
                ctrl_pheno = r_row_ptr[2 * window_cidx3];
                case_pheno = r_row_ptr[2 * window_cidx3 + 1];
                if ((fabs(ctrl_pheno) >= min_corr) || (fabs(case_pheno) >= min_corr)) {
                  marker_uidx3 = window_uidxs[window_cidx3];
                  wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx3 * max_marker_id_len]), wptr_start2);
                  wptr = memseta(wptr, 32, 3);
                  wptr = uint32toa_w10x(marker_pos[marker_uidx3], ' ', wptr);
                  wptr = fw_strcpy(4, marker_allele_ptrs[2 * marker_uidx3], wptr);
                  *wptr++ = ' ';
                  wptr = dtoa_g_wxp3x(case_pheno, 8, ' ', wptr);
                  wptr = dtoa_g_wxp3x(ctrl_pheno, 8, '\n', wptr);
                  if (fwrite_checked(textbuf, wptr - textbuf, outfile_verbose)) {
                    goto flipscan_ret_WRITE_FAIL;
                  }
                }
                if (window_cidx3 == window_cidx) {
                  break;
                }
                if (++window_cidx3 == max_window_size) {
                  window_cidx3 = 0;
                }
              }
            }
          }
          putc_unlocked('\n', outfile);
          if (++marker_idx >= pct_thresh) {
            if (pct > 10) {
              putc_unlocked('\b', stdout);
            }
            pct = (marker_idx * 100LLU) / marker_ct;
            if (pct < 100) {
              printf("\b\b%" PRIuPTR "%%", pct);
              fflush(stdout);
              pct_thresh = ((++pct) * ((uint64_t)marker_ct)) / 100;
            }
          }
          // better to perform this comparison first
          if (window_cidx2 == window_cidx) {
            if (++window_cidx2 == max_window_size) {
              window_cidx2 = 0;
            }
            break;
          }
          if (++window_cidx2 == max_window_size) {
            window_cidx2 = 0;
          }
          marker_uidx2 = window_uidxs[window_cidx2];
        } while (marker_pos[marker_uidx2] < marker_pos_thresh);
      }
    } while (chrom_marker_idx < chrom_marker_ct);
  }
  if (fclose_null(&outfile)) {
    goto flipscan_ret_WRITE_FAIL;
  }
  if (verbose) {
    if (fclose_null(&outfile_verbose)) {
      goto flipscan_ret_WRITE_FAIL;
    }
  }
  putc_unlocked('\r', stdout);
  // not actually possible to have exactly one problem variant, heh
  LOGPRINTF("--flip-scan%s: %u variants with at least one negative LD match.\n", verbose? " verbose" : "", problem_ct);
  if (verbose) {
    LOGPRINTFWW("Report written to %s ; neg-match details written to %s.verbose .\n", outname, outname);
  } else {
    LOGPRINTFWW("Report written to %s .\n", outname);
  }
  while (0) {
  flipscan_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  flipscan_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  flipscan_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  flipscan_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
  flipscan_ret_INVALID_CMDLINE:
    retval = RET_INVALID_CMDLINE;
    break;
  }
  bigstack_reset(bigstack_mark);
  fclose_cond(outfile);
  fclose_cond(outfile_verbose);
  return retval;
}
1936 
// LD multithread globals
// File-scope state shared with the LD worker threads.  ld_block_thread()
// copies most of the first group into locals on entry.

// Genotype and mask vectors for the two variant blocks being compared.
// ld_block_thread() indexes them as [block index * g_ld_founder_ct_192_long].
static uintptr_t* g_ld_geno1;
static uintptr_t* g_ld_geno2;
static uintptr_t* g_ld_geno_masks1;
static uintptr_t* g_ld_geno_masks2;
// Per-variant missing-genotype counts for each block.
static uint32_t* g_ld_missing_cts1;
static uint32_t* g_ld_missing_cts2;
// Two entries per idx1 variant: [2k] and [2k + 1] are read by
// ld_block_thread() as the start and end of the idx2 range to evaluate.
static uint32_t* g_ld_interval1;
// Output matrices; ld_block_thread() writes doubles when g_ld_results is
// non-null and floats (g_ld_results_f) otherwise.
static double* g_ld_results;
static float* g_ld_results_f;
static double* g_ld_set_allele_freqs;
// [0, g_ld_idx1_block_size) is split evenly across g_ld_thread_ct threads.
static uintptr_t g_ld_idx1_block_size;
static uintptr_t g_ld_idx2_block_size;
static uintptr_t g_ld_idx2_block_start;
static uintptr_t g_ld_block_idx1;
static uintptr_t g_ld_marker_ct;
// Used by ld_block_thread() as the row stride of the results matrix.
static uintptr_t g_ld_marker_ctm8;
static uintptr_t g_ld_founder_ct;
static uintptr_t g_ld_founder_ct_192_long;
// Block-count arguments forwarded to ld_dot_prod().
static uint32_t g_ld_founder_ct_mld_m1;
static uint32_t g_ld_founder_ct_mld_rem;
static uint32_t g_ld_is_r2;
static uint32_t g_ld_thread_ct;

// with '--r2 dprime', males should be downweighted by a factor of 2 when
// considering two X chromosome variants, and by a factor of sqrt(2) when doing
// an inter-chromosome evaluation involving a single Xchr variant.  (The
// sqrt(2) factor is not implemented by PLINK 1.07, but the math compels its
// use.)
static uintptr_t* g_ld_sex_male;
static uintptr_t* g_ld_thread_wkspace;
static uint32_t g_ld_xstart1;
static uint32_t g_ld_xend1;
static uint32_t g_ld_xstart2;
static uint32_t g_ld_xend2;

// NOTE(review): the remaining globals look like report-writing state (output
// delimiter, variant metadata tables, iteration cursors); their consumers are
// not visible in this part of the file -- confirm before relying on this.
static char g_ld_delimiter;
static uint32_t g_ld_plink_maxsnp;
static char* g_ld_marker_ids;
static Chrom_info* g_ld_chrom_info_ptr;
static uint32_t* g_ld_marker_pos;
static double* g_ld_marker_cms;
static uintptr_t* g_ld_marker_exclude_idx1;
static uintptr_t* g_ld_marker_exclude;
static char** g_ld_marker_allele_ptrs;
static uintptr_t g_ld_max_marker_id_len;
static uintptr_t g_ld_marker_uidx1;
static uintptr_t g_ld_uidx2_start;
static uintptr_t g_ld_marker_uidx2;
static uintptr_t g_ld_block_idx2;
static double g_ld_window_r2;
static uint32_t g_ld_is_first_block;
static uint32_t g_ld_is_inter_chr;
static uint32_t g_ld_prefix_len;
static uint32_t g_ld_keep_sign;
static uint32_t g_ld_modifier;
1993 
ld_block_thread(void * arg)1994 THREAD_RET_TYPE ld_block_thread(void* arg) {
1995   uintptr_t tidx = (uintptr_t)arg;
1996   uint32_t thread_ct = g_ld_thread_ct;
1997   uintptr_t block_idx1_start = (tidx * g_ld_idx1_block_size) / thread_ct;
1998   uintptr_t block_idx1_end = ((tidx + 1) * g_ld_idx1_block_size) / thread_ct;
1999   uintptr_t marker_idx2_maxw = g_ld_marker_ctm8;
2000   uintptr_t founder_ct = g_ld_founder_ct;
2001   uintptr_t founder_ctwd = founder_ct / BITCT2;
2002   uintptr_t founder_ctwd12 = founder_ctwd / 12;
2003   uintptr_t founder_ctwd12_rem = founder_ctwd - (12 * founder_ctwd12);
2004   uintptr_t lshift_last = 2 * ((0x7fffffc0 - founder_ct) % BITCT2);
2005   uintptr_t founder_ct_192_long = g_ld_founder_ct_192_long;
2006   uintptr_t* geno1 = g_ld_geno1;
2007   uintptr_t* geno_masks1 = g_ld_geno_masks1;
2008   uint32_t* missing_cts1 = g_ld_missing_cts1;
2009   uint32_t* ld_interval1 = g_ld_interval1;
2010   uint32_t founder_ct_mld_m1 = g_ld_founder_ct_mld_m1;
2011   uint32_t founder_ct_mld_rem = g_ld_founder_ct_mld_rem;
2012   uint32_t is_r2 = g_ld_is_r2;
2013   uint32_t keep_sign = g_ld_keep_sign;
2014   double* results = g_ld_results;
2015   float* results_f = g_ld_results_f;
2016   double* rptr = nullptr;
2017   float* rptr_f = nullptr;
2018   int32_t dp_result[5];
2019   uintptr_t* geno_fixed_vec_ptr;
2020   uintptr_t* geno_var_vec_ptr;
2021   uintptr_t* mask_fixed_vec_ptr;
2022   uintptr_t* mask_var_vec_ptr;
2023   uintptr_t* geno2;
2024   uintptr_t* geno_masks2;
2025   uint32_t* missing_cts2;
2026   uintptr_t idx2_block_size;
2027   uintptr_t idx2_block_start;
2028   uintptr_t block_idx1;
2029   uintptr_t block_idx2;
2030   uintptr_t cur_block_idx2_end;
2031   double non_missing_ctd;
2032   double cov12;
2033   double dxx;
2034   double dyy;
2035   float non_missing_ctf;
2036   float cov12_f;
2037   float fxx;
2038   float fyy;
2039   uint32_t fixed_missing_ct;
2040   uint32_t fixed_non_missing_ct;
2041   uint32_t non_missing_ct;
2042   while (1) {
2043     idx2_block_size = g_ld_idx2_block_size;
2044     idx2_block_start = g_ld_idx2_block_start;
2045     geno2 = g_ld_geno2;
2046     geno_masks2 = g_ld_geno_masks2;
2047     missing_cts2 = g_ld_missing_cts2;
2048     for (block_idx1 = block_idx1_start; block_idx1 < block_idx1_end; block_idx1++) {
2049       fixed_non_missing_ct = ld_interval1[block_idx1 * 2]; // temporary redefine
2050       block_idx2 = fixed_non_missing_ct;
2051       cur_block_idx2_end = ld_interval1[block_idx1 * 2 + 1];
2052       if (block_idx2 < idx2_block_start) {
2053 	if (cur_block_idx2_end <= idx2_block_start) {
2054 	  continue;
2055 	}
2056 	block_idx2 = 0;
2057       } else {
2058 	block_idx2 -= idx2_block_start;
2059 	if (block_idx2 >= idx2_block_size) {
2060 	  // nondecreasing, so we can safely exit
2061 	  break;
2062 	}
2063       }
2064       cur_block_idx2_end -= idx2_block_start;
2065       if (cur_block_idx2_end > idx2_block_size) {
2066 	cur_block_idx2_end = idx2_block_size;
2067       }
2068       if (results) {
2069 	rptr = &(results[block_idx1 * marker_idx2_maxw + block_idx2 + idx2_block_start - fixed_non_missing_ct]);
2070       } else {
2071 	rptr_f = &(results_f[block_idx1 * marker_idx2_maxw + block_idx2 + idx2_block_start - fixed_non_missing_ct]);
2072       }
2073       fixed_missing_ct = missing_cts1[block_idx1];
2074       fixed_non_missing_ct = founder_ct - fixed_missing_ct;
2075       geno_fixed_vec_ptr = &(geno1[block_idx1 * founder_ct_192_long]);
2076       mask_fixed_vec_ptr = &(geno_masks1[block_idx1 * founder_ct_192_long]);
2077       for (; block_idx2 < cur_block_idx2_end; block_idx2++) {
2078 	geno_var_vec_ptr = &(geno2[block_idx2 * founder_ct_192_long]);
2079 	mask_var_vec_ptr = &(geno_masks2[block_idx2 * founder_ct_192_long]);
2080 	non_missing_ct = fixed_non_missing_ct - missing_cts2[block_idx2];
2081 	if (fixed_missing_ct && missing_cts2[block_idx2]) {
2082 	  non_missing_ct += ld_missing_ct_intersect(mask_var_vec_ptr, mask_fixed_vec_ptr, founder_ctwd12, founder_ctwd12_rem, lshift_last);
2083 	}
2084 	dp_result[0] = founder_ct;
2085 	dp_result[1] = -fixed_non_missing_ct;
2086 	dp_result[2] = missing_cts2[block_idx2] - founder_ct;
2087 	dp_result[3] = dp_result[1];
2088 	dp_result[4] = dp_result[2];
2089 	ld_dot_prod(geno_var_vec_ptr, geno_fixed_vec_ptr, mask_var_vec_ptr, mask_fixed_vec_ptr, dp_result, founder_ct_mld_m1, founder_ct_mld_rem);
2090 	if (results) {
2091 	  non_missing_ctd = (double)((int32_t)non_missing_ct);
2092 	  dxx = dp_result[1];
2093 	  dyy = dp_result[2];
2094 	  cov12 = dp_result[0] * non_missing_ctd - dxx * dyy;
2095 	  dxx = (dp_result[3] * non_missing_ctd + dxx * dxx) * (dp_result[4] * non_missing_ctd + dyy * dyy);
2096 	  if (!is_r2) {
2097 	    dxx = cov12 / sqrt(dxx);
2098 	  } else if (!keep_sign) {
2099 	    dxx = (cov12 * cov12) / dxx;
2100 	  } else {
2101 	    dxx = (fabs(cov12) * cov12) / dxx;
2102 	  }
2103 	  *rptr++ = dxx;
2104 	} else {
2105 	  non_missing_ctf = (float)((int32_t)non_missing_ct);
2106 	  fxx = dp_result[1];
2107 	  fyy = dp_result[2];
2108 	  cov12_f = dp_result[0] * non_missing_ctf - fxx * fyy;
2109 	  fxx = (dp_result[3] * non_missing_ctf + fxx * fxx) * (dp_result[4] * non_missing_ctf + fyy * fyy);
2110 	  if (!is_r2) {
2111 	    fxx = cov12_f / sqrt(fxx);
2112 	  } else if (!keep_sign) {
2113 	    fxx = (cov12_f * cov12_f) / fxx;
2114 	  } else {
2115 	    fxx = (fabs(cov12_f) * cov12_f) / fxx;
2116 	  }
2117 	  *rptr_f++ = fxx;
2118 	}
2119       }
2120     }
2121     if ((!tidx) || g_is_last_thread_block) {
2122       THREAD_RETURN;
2123     }
2124     THREAD_BLOCK_FINISH(tidx);
2125   }
2126 }
2127 
ld_matrix_emitn(uint32_t overflow_ct,unsigned char * readbuf)2128 uint32_t ld_matrix_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
2129   char* sptr_cur = (char*)(&(readbuf[overflow_ct]));
2130   char* readbuf_end = (char*)(&(readbuf[PIGZ_BLOCK_SIZE]));
2131   uintptr_t block_size1 = g_ld_idx1_block_size;
2132   uintptr_t marker_ct = g_ld_marker_ct;
2133   uintptr_t marker_ctm8 = g_ld_marker_ctm8;
2134   uintptr_t block_idx1 = g_ld_block_idx1;
2135   uintptr_t marker_idx = g_ld_idx2_block_start;
2136   uintptr_t marker_idx_end = g_ld_idx2_block_size;
2137   uint32_t is_square = ((g_ld_modifier & LD_MATRIX_SHAPEMASK) == LD_MATRIX_SQ);
2138   uint32_t is_square0 = ((g_ld_modifier & LD_MATRIX_SHAPEMASK) == LD_MATRIX_SQ0);
2139   char delimiter = g_ld_delimiter;
2140   double* results = g_ld_results;
2141   double* dptr;
2142   uintptr_t ulii;
2143   while (block_idx1 < block_size1) {
2144     dptr = &(results[block_idx1 * marker_ctm8 + marker_idx]);
2145     while (marker_idx < marker_idx_end) {
2146       sptr_cur = dtoa_gx(*dptr++, delimiter, sptr_cur);
2147       marker_idx++;
2148       if (sptr_cur > readbuf_end) {
2149 	goto ld_matrix_emitn_ret;
2150       }
2151     }
2152     if (is_square0 && (marker_idx < marker_ct)) {
2153       ulii = (((uintptr_t)(readbuf_end - sptr_cur)) + 1) / 2;
2154       // bugfix: can't be <= since tab delimiter wouldn't be handled correctly
2155       // on subsequent pass
2156       if (ulii < marker_ct - marker_idx) {
2157 	sptr_cur = memcpya(sptr_cur, g_textbuf, ulii * 2);
2158 	marker_idx += ulii;
2159 	goto ld_matrix_emitn_ret;
2160       } else {
2161 	sptr_cur = memcpya(sptr_cur, g_textbuf, (marker_ct - marker_idx) * 2);
2162 	marker_idx = marker_ct;
2163       }
2164     }
2165     if (delimiter == '\t') {
2166       sptr_cur--;
2167     }
2168     *sptr_cur++ = '\n';
2169     marker_idx = 0;
2170     if (!is_square) {
2171       marker_idx_end++;
2172     }
2173     block_idx1++;
2174   }
2175  ld_matrix_emitn_ret:
2176   g_ld_block_idx1 = block_idx1;
2177   g_ld_idx2_block_start = marker_idx;
2178   g_ld_idx2_block_size = marker_idx_end;
2179   return (uintptr_t)(((unsigned char*)sptr_cur) - readbuf);
2180 }
2181 
// Computes the all-pairs r / r^2 matrix over the included variants and writes
// it to outname, as text (optionally gzipped) or as raw double/float binary,
// in square, square0 (upper triangle zero-filled) or lower-triangle shape
// depending on ldip->modifier.
//
// Work is organized as an outer loop over "idx1" blocks of matrix rows and an
// inner loop over "idx2" blocks of columns; every (idx1 block, idx2 block)
// pair is handed to ld_block_thread() on thread_ct threads, which fill
// g_ld_results / g_ld_results_f.  After the last idx2 block, the finished
// rows are written out and the next idx1 block is started.
//
// parallel_idx / parallel_tot select the slice of rows handled by this run:
// an even split of rows for square output, or a split balanced by number of
// pairs (via triangle_divide()) for triangle/square0 output.
//
// unfiltered_marker_ct and sex_male are not used by this function.
//
// Returns 0 on success, or one of RET_NOMEM, RET_OPEN_FAIL, RET_READ_FAIL,
// RET_WRITE_FAIL, RET_INVALID_CMDLINE, RET_THREAD_CREATE_FAIL.  Workspace
// memory is not released here; the caller is trusted to reset it.
int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uint32_t parallel_idx, uint32_t parallel_tot, uintptr_t* sex_male, uintptr_t* founder_include2, uintptr_t* founder_male_include2, uintptr_t* loadbuf, char* outname, uint32_t hh_exists) {
  FILE* outfile = nullptr;
  uint32_t ld_modifier = ldip->modifier;
  uint32_t is_binary = ld_modifier & (LD_MATRIX_BIN | LD_MATRIX_BIN4);
  uint32_t is_square = ((ld_modifier & LD_MATRIX_SHAPEMASK) == LD_MATRIX_SQ);
  uint32_t is_square0 = ((ld_modifier & LD_MATRIX_SHAPEMASK) == LD_MATRIX_SQ0);
  uint32_t output_single_prec = (ld_modifier / LD_MATRIX_BIN4) & 1;
  uint32_t output_gz = ld_modifier & LD_REPORT_GZ;
  uint32_t ignore_x = (ld_modifier / LD_IGNORE_X) & 1;
  uintptr_t marker_ct = g_ld_marker_ct;
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t marker_ctm8 = round_up_pow2(marker_ct, 8);
  uintptr_t founder_ct = g_ld_founder_ct;
  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
  uintptr_t founder_ct_192_long = g_ld_founder_ct_192_long;
  uintptr_t final_mask = get_final_mask(founder_ct);
  uintptr_t marker_uidx_base = next_unset_unsafe(marker_exclude, 0);
  uintptr_t marker_uidx1 = marker_uidx_base;
  uintptr_t marker_idx1_start = (((uint64_t)parallel_idx) * marker_ct) / parallel_tot;
  uintptr_t marker_idx1 = marker_idx1_start;
  uintptr_t marker_idx1_end = (((uint64_t)(parallel_idx + 1)) * marker_ct) / parallel_tot;
  uintptr_t pct = 1;
  uint64_t job_size = marker_idx1_end - marker_idx1_start;
  uint64_t pct_thresh;
  Chrom_info* chrom_info_ptr = g_ld_chrom_info_ptr;
  uint32_t founder_trail_ct = founder_ct_192_long - founder_ctl * 2;
  uint32_t thread_ct = g_ld_thread_ct;
  uint32_t chrom_fo_idx = 0;
  uint32_t is_haploid = 0;
  uint32_t is_x = 0;
  uint32_t is_y = 0;
  uint32_t not_first_write = 0;
  int32_t retval = 0;
  unsigned char* bigstack_mark2;
  uintptr_t* ulptr;
  unsigned char* overflow_buf;
  uint64_t tests_completed;
  uintptr_t thread_workload;
  uintptr_t cur_idx2_block_size;
  uintptr_t marker_idx2_end;
  uintptr_t block_idx1;
  uintptr_t marker_uidx2;
  uintptr_t marker_idx2;
  uintptr_t block_idx2;
  uintptr_t idx1_block_size;
  uintptr_t idx2_block_size;
  uintptr_t ulii;
  uintptr_t uljj;
  uint32_t chrom_idx;
  uint32_t chrom_end;
  uint32_t is_last_block;

  if (bigstack_alloc_uc(262144, &overflow_buf)) {
    goto ld_report_matrix_ret_NOMEM;
  }
  if (output_single_prec) {
    // force divisibility by 16 instead (cacheline = 64 bytes, float = 4)
    marker_ctm8 = (marker_ctm8 + 8) & (~15);
  }
  if (is_binary) {
    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
      goto ld_report_matrix_ret_OPEN_FAIL;
    }
  }
  // claim up to half of memory with idx1 bufs; each marker costs
  //   founder_ct_192_long * sizeof(intptr_t) for genotype buffer
  // + founder_ct_192_long * sizeof(intptr_t) for missing mask buffer
  // + sizeof(int32_t) for g_ld_missing_cts1 entry
  // + 2 * sizeof(int32_t) for g_ld_interval1
  // + marker_ctm8 * sizeof(double) or marker_ctm16 * sizeof(float) for
  //     g_ld_results buffer
  // round down to multiple of thread_ct for better workload distribution
  ulii = founder_ct_192_long * 2 * sizeof(intptr_t) + 3 * sizeof(int32_t) + marker_ctm8 * (8 - 4 * output_single_prec);
  idx1_block_size = bigstack_left() / (ulii * 2);
  thread_workload = idx1_block_size / thread_ct;
  if (!thread_workload) {
    goto ld_report_matrix_ret_NOMEM;
  }
  idx1_block_size = thread_workload * thread_ct;
  if ((parallel_tot > 1) && (marker_ct < 2 * parallel_tot)) {
    LOGERRPRINTF("Error: Too few variants in --r%s run for --parallel %u %u.\n", g_ld_is_r2? "2" : "", parallel_idx + 1, parallel_tot);
    goto ld_report_matrix_ret_INVALID_CMDLINE;
  }
  if (!is_square) {
    // triangle/square0: row i costs (i + 1) pairs, so split the rows between
    // --parallel jobs by pair count rather than by row count
    job_size = ((uint64_t)marker_ct) * (marker_ct + 1);
    if (parallel_tot > 1) {
      job_size /= parallel_tot;
      marker_idx1_start = triangle_divide(job_size * parallel_idx, 1);
      if (parallel_idx + 1 < parallel_tot) {
        marker_idx1_end = triangle_divide(job_size * (parallel_idx + 1), 1);
      }
      // bugfix: the row cursor was initialized from the even split above and
      // must follow the rebalanced start, otherwise this job begins on the
      // wrong row and the progress computation below underflows
      marker_idx1 = marker_idx1_start;
      job_size = ((((uint64_t)marker_idx1_end) * (marker_idx1_end + 1)) - (((uint64_t)marker_idx1_start) * (marker_idx1_start + 1))) / 2;
    } else {
      job_size /= 2;
    }
  }
  pct_thresh = job_size / 100;
  if (idx1_block_size > marker_idx1_end - marker_idx1_start) {
    idx1_block_size = marker_idx1_end - marker_idx1_start;
  }
  // bugfix: these allocations were previously unchecked
  if (bigstack_alloc_ul(founder_ct_192_long * idx1_block_size, &g_ld_geno1) ||
      bigstack_alloc_ul(founder_ct_192_long * idx1_block_size, &g_ld_geno_masks1) ||
      bigstack_alloc_ui(idx1_block_size, &g_ld_missing_cts1) ||
      bigstack_alloc_ui(idx1_block_size * 2, &g_ld_interval1)) {
    goto ld_report_matrix_ret_NOMEM;
  }

  if (!output_single_prec) {
    // may want to set g_ld_results_f to nullptr
    if (bigstack_alloc_d(marker_ctm8 * idx1_block_size, &g_ld_results)) {
      goto ld_report_matrix_ret_NOMEM;
    }
  } else {
    // ld_block_thread() selects the float path when g_ld_results is null
    g_ld_results = nullptr;
    if (bigstack_alloc_f(marker_ctm8 * idx1_block_size, &g_ld_results_f)) {
      goto ld_report_matrix_ret_NOMEM;
    }
  }

  // claim the other half with idx2 buffer
  ulii -= marker_ctm8 * (8 - 4 * output_single_prec) + 2 * sizeof(int32_t);
  if (!output_single_prec) {
    idx2_block_size = (bigstack_left() / ulii) & (~(7 * ONELU));
  } else {
    idx2_block_size = (bigstack_left() / ulii) & (~(15 * ONELU));
  }
  if (idx2_block_size > marker_ctm8) {
    idx2_block_size = marker_ctm8;
  }
  bigstack_mark2 = g_bigstack_base;
  // shrink the idx2 block until all three idx2 buffers fit
  while (1) {
    if (!idx2_block_size) {
      goto ld_report_matrix_ret_NOMEM;
    }
    if (!(bigstack_alloc_ul(founder_ct_192_long * idx2_block_size, &g_ld_geno2) ||
          bigstack_alloc_ul(founder_ct_192_long * idx2_block_size, &g_ld_geno_masks2) ||
          bigstack_alloc_ui(idx2_block_size, &g_ld_missing_cts2))) {
      break;
    }
    bigstack_reset(bigstack_mark2);
    if (!output_single_prec) {
      idx2_block_size -= 8;
    } else {
      idx2_block_size -= 16;
    }
  }
  // zero the trailing words of every per-variant genotype/mask vector once;
  // the loaders below only overwrite the leading part
  uljj = founder_trail_ct + 2;
  for (ulii = 1; ulii <= idx1_block_size; ulii++) {
    fill_ulong_zero(uljj, &(g_ld_geno1[ulii * founder_ct_192_long - uljj]));
    fill_ulong_zero(uljj, &(g_ld_geno_masks1[ulii * founder_ct_192_long - uljj]));
  }
  for (ulii = 1; ulii <= idx2_block_size; ulii++) {
    fill_ulong_zero(uljj, &(g_ld_geno2[ulii * founder_ct_192_long - uljj]));
    fill_ulong_zero(uljj, &(g_ld_geno_masks2[ulii * founder_ct_192_long - uljj]));
  }
  if (is_square) {
    // every row covers columns [0, marker_ct)
    for (ulii = 0; ulii < idx1_block_size; ulii++) {
      g_ld_interval1[ulii * 2] = 0;
      g_ld_interval1[ulii * 2 + 1] = marker_ct;
    }
    g_ld_marker_ctm8 = marker_ctm8;
  } else {
    // row ends depend on the row index and are filled in per idx1 block
    for (ulii = 0; ulii < idx1_block_size; ulii++) {
      g_ld_interval1[ulii * 2] = 0;
    }
    if (is_square0) {
      // g_textbuf holds the zero padding for the upper triangle
      if (is_binary) {
        if (!output_single_prec) {
          fill_double_zero(MAXLINELEN / sizeof(double), (double*)g_textbuf);
        } else {
          fill_float_zero(MAXLINELEN / sizeof(float), (float*)g_textbuf);
        }
      } else {
        ulptr = (uintptr_t*)g_textbuf;
        // assume little-endian
        // 0[delim]0[delim]...
#ifdef __LP64__
        ulii = 0x30003000300030LLU | (0x100010001000100LLU * ((unsigned char)g_ld_delimiter));
#else
        ulii = 0x300030 | (0x1000100 * ((unsigned char)g_ld_delimiter));
#endif
        for (uljj = 0; uljj < MAXLINELEN / sizeof(intptr_t); uljj++) {
          *ulptr++ = ulii;
        }
      }
    }
  }
  if (marker_idx1) {
    marker_uidx1 = jump_forward_unset_unsafe(marker_exclude, marker_uidx1 + 1, marker_idx1);
  }
  g_ld_keep_sign = 0;
  sprintf(g_logbuf, "--r%s %s%s to %s ... ", g_ld_is_r2? "2" : "", is_square? "square" : (is_square0? "square0" : "triangle"), is_binary? (output_single_prec? " bin4" : " bin") : (output_gz? " gz" : ""), outname);
  wordwrapb(16); // strlen("99% [processing]")
  logprintb();
  fputs("0%", stdout);
  do {
    fputs(" [processing]", stdout);
    fflush(stdout);
    if (idx1_block_size > marker_idx1_end - marker_idx1) {
      // final, short idx1 block
      idx1_block_size = marker_idx1_end - marker_idx1;
      if (idx1_block_size < thread_ct) {
        thread_ct = idx1_block_size;
        g_ld_thread_ct = thread_ct;
      }
    }
    g_ld_idx1_block_size = idx1_block_size;
    // marker_uidx1_tmp = marker_uidx1;
    if (fseeko(bedfile, bed_offset + (marker_uidx1 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto ld_report_matrix_ret_READ_FAIL;
    }
    // load the row variants of this idx1 block
    chrom_end = 0;
    for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx1++, block_idx1++) {
      if (IS_SET(marker_exclude, marker_uidx1)) {
        marker_uidx1 = next_unset_ul_unsafe(marker_exclude, marker_uidx1);
        if (fseeko(bedfile, bed_offset + (marker_uidx1 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
          goto ld_report_matrix_ret_READ_FAIL;
        }
      }
      if (marker_uidx1 >= chrom_end) {
        chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx1);
        chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
        // remember where this chromosome ends so the lookup is only redone
        // when a boundary is crossed (previously chrom_end stayed 0 and the
        // lookup ran for every variant)
        chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
        is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
        is_x = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[X_OFFSET]);
        is_y = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[Y_OFFSET]);
      }
      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1), bedfile, loadbuf, &(g_ld_geno1[block_idx1 * founder_ct_192_long]))) {
        goto ld_report_matrix_ret_READ_FAIL;
      }
      if (is_haploid && hh_exists) {
        haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)(&(g_ld_geno1[block_idx1 * founder_ct_192_long])));
      }
      ld_process_load2(&(g_ld_geno1[block_idx1 * founder_ct_192_long]), &(g_ld_geno_masks1[block_idx1 * founder_ct_192_long]), &(g_ld_missing_cts1[block_idx1]), founder_ct, is_x && (!ignore_x), founder_male_include2);
    }
    marker_uidx2 = marker_uidx_base;
    marker_idx2 = 0;
    if (is_square) {
      marker_idx2_end = marker_ct;
    } else {
      // row (marker_idx1 + k) covers columns [0, marker_idx1 + k + 1)
      marker_idx2_end = marker_idx1 + idx1_block_size;
      for (ulii = 1; ulii <= idx1_block_size; ulii++) {
        g_ld_interval1[2 * ulii - 1] = ulii + marker_idx1;
      }
      if (!output_single_prec) {
        marker_ctm8 = round_up_pow2(marker_idx2_end, 8);
      } else {
        marker_ctm8 = round_up_pow2(marker_idx2_end, 16);
      }
      g_ld_marker_ctm8 = marker_ctm8;
    }
    chrom_end = 0;
    if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto ld_report_matrix_ret_READ_FAIL;
    }
    cur_idx2_block_size = idx2_block_size;
    do {
      if (cur_idx2_block_size > marker_idx2_end - marker_idx2) {
        cur_idx2_block_size = marker_idx2_end - marker_idx2;
      }
      // load the column variants of this idx2 block
      for (block_idx2 = 0; block_idx2 < cur_idx2_block_size; marker_uidx2++, block_idx2++) {
        if (IS_SET(marker_exclude, marker_uidx2)) {
          marker_uidx2 = next_unset_ul_unsafe(marker_exclude, marker_uidx2);
          if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
            goto ld_report_matrix_ret_READ_FAIL;
          }
        }
        if (marker_uidx2 >= chrom_end) {
          chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx2);
          chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
          chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
          is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
          is_x = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[X_OFFSET]);
          is_y = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[Y_OFFSET]);
        }
        if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf, &(g_ld_geno2[block_idx2 * founder_ct_192_long]))) {
          goto ld_report_matrix_ret_READ_FAIL;
        }
        if (is_haploid && hh_exists) {
          haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)(&(g_ld_geno2[block_idx2 * founder_ct_192_long])));
        }
        ld_process_load2(&(g_ld_geno2[block_idx2 * founder_ct_192_long]), &(g_ld_geno_masks2[block_idx2 * founder_ct_192_long]), &(g_ld_missing_cts2[block_idx2]), founder_ct, is_x && (!ignore_x), founder_male_include2);
      }
      g_ld_idx2_block_size = cur_idx2_block_size;
      g_ld_idx2_block_start = marker_idx2;
      marker_idx2 += cur_idx2_block_size;
      is_last_block = (marker_idx2 >= marker_idx2_end);
      if (spawn_threads2(threads, &ld_block_thread, thread_ct, is_last_block)) {
        goto ld_report_matrix_ret_THREAD_CREATE_FAIL;
      }
      // the calling thread does thread 0's share of the work
      ld_block_thread((void*)0);
      join_threads2(threads, thread_ct, is_last_block);
    } while (!is_last_block);
    fputs("\b\b\b\b\b\b\b\b\b\b\bwriting]   \b\b\b", stdout);
    fflush(stdout);
    if (is_binary) {
      if (!output_single_prec) {
        if (is_square) {
          for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
            // bugfix: each row holds marker_ct values.  The size of the last
            // idx2 block (previously used here) only equals marker_ct when
            // the whole idx2 pass fits in a single block.
            if (fwrite_checked(&(g_ld_results[block_idx1 * marker_ctm8]), marker_ct * sizeof(double), outfile)) {
              goto ld_report_matrix_ret_WRITE_FAIL;
            }
          }
        } else {
          for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
            if (fwrite_checked(&(g_ld_results[block_idx1 * marker_ctm8]), (block_idx1 + marker_idx1 + 1) * sizeof(double), outfile)) {
              goto ld_report_matrix_ret_WRITE_FAIL;
            }
            if (is_square0) {
              // zero-fill the rest of the row, at most one g_textbuf at a time
              ulii = marker_ct - block_idx1 - marker_idx1 - 1;
              while (ulii) {
                if (ulii > MAXLINELEN / sizeof(double)) {
                  uljj = MAXLINELEN / sizeof(double);
                  ulii -= MAXLINELEN / sizeof(double);
                } else {
                  uljj = ulii;
                  ulii = 0;
                }
                if (fwrite_checked(g_textbuf, uljj * sizeof(double), outfile)) {
                  goto ld_report_matrix_ret_WRITE_FAIL;
                }
              }
            }
          }
        }
      } else {
        if (is_square) {
          for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
            // bugfix: see the double-precision branch above
            if (fwrite_checked(&(g_ld_results_f[block_idx1 * marker_ctm8]), marker_ct * sizeof(float), outfile)) {
              goto ld_report_matrix_ret_WRITE_FAIL;
            }
          }
        } else {
          for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
            if (fwrite_checked(&(g_ld_results_f[block_idx1 * marker_ctm8]), (block_idx1 + marker_idx1 + 1) * sizeof(float), outfile)) {
              goto ld_report_matrix_ret_WRITE_FAIL;
            }
            if (is_square0) {
              ulii = marker_ct - block_idx1 - marker_idx1 - 1;
              while (ulii) {
                if (ulii > MAXLINELEN / sizeof(float)) {
                  uljj = MAXLINELEN / sizeof(float);
                  ulii -= MAXLINELEN / sizeof(float);
                } else {
                  uljj = ulii;
                  ulii = 0;
                }
                if (fwrite_checked(g_textbuf, uljj * sizeof(float), outfile)) {
                  goto ld_report_matrix_ret_WRITE_FAIL;
                }
              }
            }
          }
        }
      }
    } else {
      // text output: ld_matrix_emitn() resumes from these three globals
      g_ld_block_idx1 = 0;
      g_ld_idx2_block_start = 0;
      if (is_square) {
        g_ld_idx2_block_size = marker_ct;
      } else {
        g_ld_idx2_block_size = marker_idx1 + 1;
      }
      // NOTE(review): the results of parallel_compress() and
      // write_uncompressed() are discarded here; if they report failures
      // through their return value, a write error would go unnoticed --
      // confirm against their declarations.
      if (output_gz) {
        parallel_compress(outname, overflow_buf, not_first_write, ld_matrix_emitn);
      } else {
        write_uncompressed(outname, overflow_buf, not_first_write, ld_matrix_emitn);
      }
      not_first_write = 1;
    }
    marker_idx1 += idx1_block_size;
    fputs("\b\b\b\b\b\b\b\b\b\b          \b\b\b\b\b\b\b\b\b\b", stdout);
    if (is_square) {
      tests_completed = marker_idx1 - marker_idx1_start;
    } else {
      tests_completed = ((((uint64_t)marker_idx1) * (marker_idx1 + 1)) - (((uint64_t)marker_idx1_start) * (marker_idx1_start + 1))) / 2;
    }
    if (tests_completed >= pct_thresh) {
      if (pct > 10) {
        // previous percentage had two digits; erase the extra one
        putc_unlocked('\b', stdout);
      }
      pct = (tests_completed * 100LLU) / job_size;
      if (pct < 100) {
        printf("\b\b%" PRIuPTR "%%", pct);
        fflush(stdout);
        pct_thresh = ((++pct) * ((uint64_t)job_size)) / 100;
      }
    }
  } while (marker_idx1 < marker_idx1_end);
  fputs("\b\b", stdout);
  logprint("done.\n");
  if (is_binary) {
    if (fclose_null(&outfile)) {
      goto ld_report_matrix_ret_WRITE_FAIL;
    }
  }
  while (0) {
  ld_report_matrix_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  ld_report_matrix_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  ld_report_matrix_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  ld_report_matrix_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
  ld_report_matrix_ret_INVALID_CMDLINE:
    retval = RET_INVALID_CMDLINE;
    break;
  ld_report_matrix_ret_THREAD_CREATE_FAIL:
    retval = RET_THREAD_CREATE_FAIL;
    break;
  }
  fclose_cond(outfile);
  // trust parent to free memory
  return retval;
}
2597 
ld_regular_emitn(uint32_t overflow_ct,unsigned char * readbuf)2598 uint32_t ld_regular_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
2599   char* sptr_cur = (char*)(&(readbuf[overflow_ct]));
2600   char* readbuf_end = (char*)(&(readbuf[PIGZ_BLOCK_SIZE]));
2601   Chrom_info* chrom_info_ptr = g_ld_chrom_info_ptr;
2602   uintptr_t* marker_exclude_idx1 = g_ld_marker_exclude_idx1;
2603   uintptr_t* marker_exclude = g_ld_marker_exclude;
2604   uint32_t* marker_pos = g_ld_marker_pos;
2605   char* marker_ids = g_ld_marker_ids;
2606   char** marker_allele_ptrs = g_ld_marker_allele_ptrs;
2607   uint32_t* ld_interval1 = g_ld_interval1;
2608   double* results = g_ld_results;
2609   double* set_allele_freqs = g_ld_set_allele_freqs;
2610   char* fixed_a1 = nullptr;
2611   char* fixed_a2 = nullptr;
2612   uintptr_t max_marker_id_len = g_ld_max_marker_id_len;
2613   uintptr_t marker_uidx1 = g_ld_marker_uidx1;
2614   uintptr_t block_idx1 = g_ld_block_idx1;
2615   uintptr_t block_size1 = g_ld_idx1_block_size;
2616   uintptr_t marker_uidx2_start = g_ld_uidx2_start;
2617   uintptr_t block_idx2_start = g_ld_idx2_block_start;
2618   uintptr_t block_idx2 = g_ld_block_idx2;
2619   uintptr_t marker_idx2_maxw = g_ld_marker_ctm8;
2620   uintptr_t marker_uidx2 = g_ld_marker_uidx2;
2621   double window_r2 = g_ld_window_r2;
2622   uint32_t plink_maxsnp = g_ld_plink_maxsnp;
2623   uint32_t is_inter_chr = g_ld_is_inter_chr;
2624 
2625   // 0 = not d/dprime/dprime-signed
2626   uint32_t dprime_type = g_ld_modifier & LD_DX;
2627 
2628   uint32_t is_r2 = g_ld_is_r2;
2629   uint32_t prefix_len = g_ld_prefix_len;
2630   uint32_t chrom_fo_idx1 = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx1);
2631   uint32_t chrom_idx1 = chrom_info_ptr->chrom_file_order[chrom_fo_idx1];
2632   uint32_t chrom_end1 = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx1 + 1];
2633   uint32_t chrom_fo_idx2 = 0;
2634   uint32_t chrom_idx2 = 0;
2635   uint32_t fixed_a1_len = 0;
2636   uint32_t fixed_a2_len = 0;
2637   uintptr_t block_end2;
2638   uint32_t coupling;
2639   uint32_t chrom_end2;
2640   char* sptr2;
2641   double* dptr;
2642   double dxx;
2643   if (block_idx1 == block_size1) {
2644     goto ld_regular_emitn_ret;
2645   }
2646   if (block_idx2) {
2647     goto ld_regular_emitn_start_2;
2648   }
2649   // block_idx2 is only zero on initial call, never on reentry
2650   if (g_ld_is_first_block) {
2651     sptr_cur = memcpya(sptr_cur, " CHR_A         BP_A ", 20);
2652     sptr_cur = fw_strcpyn(g_ld_plink_maxsnp, 5, "SNP_A", sptr_cur);
2653     if (set_allele_freqs) {
2654       sptr_cur = memcpya(sptr_cur, "      MAF_A", 11);
2655     }
2656     sptr_cur = memcpya(sptr_cur, "  CHR_B         BP_B ", 21);
2657     sptr_cur = fw_strcpyn(g_ld_plink_maxsnp, 5, "SNP_B", sptr_cur);
2658     if (marker_allele_ptrs) {
2659       sptr_cur = memcpya(sptr_cur, "      PHASE", 11);
2660     }
2661     if (set_allele_freqs) {
2662       sptr_cur = memcpya(sptr_cur, "      MAF_B", 11);
2663     }
2664     sptr_cur = memseta(sptr_cur, 32, 11);
2665     sptr_cur = memcpyl3a(sptr_cur, is_r2? "R2 " : " R ");
2666     if (dprime_type) {
2667       sptr_cur = memcpya(sptr_cur, (dprime_type == LD_D)? "           D " : "          DP ", 13);
2668     }
2669     *sptr_cur++ = '\n';
2670   }
2671   goto ld_regular_emitn_start;
2672   do {
2673     marker_uidx1++;
2674     next_unset_ul_unsafe_ck(marker_exclude_idx1, &marker_uidx1);
2675     if (marker_uidx1 >= chrom_end1) {
2676       chrom_fo_idx1 = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx1);
2677       chrom_idx1 = chrom_info_ptr->chrom_file_order[chrom_fo_idx1];
2678       chrom_end1 = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx1 + 1];
2679     }
2680     block_idx2 = ld_interval1[2 * block_idx1];
2681     if (block_idx2_start < block_idx2) {
2682       marker_uidx2_start = jump_forward_unset_unsafe(marker_exclude, marker_uidx2_start + 1, block_idx2 - block_idx2_start);
2683       block_idx2_start = block_idx2;
2684     }
2685   ld_regular_emitn_start:
2686     marker_uidx2 = marker_uidx2_start;
2687     sptr2 = width_force(6, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx1, g_textbuf));
2688     sptr2 = memseta(sptr2, 32, 3);
2689     sptr2 = uint32toa_w10x(marker_pos[marker_uidx1], ' ', sptr2);
2690     sptr2 = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx1 * max_marker_id_len]), sptr2);
2691     *sptr2++ = ' ';
2692     if (set_allele_freqs) {
2693       sptr2 = width_force(10, sptr2, dtoa_g(1.0 - set_allele_freqs[marker_uidx1], sptr2));
2694       *sptr2++ = ' ';
2695     }
2696     if (!is_inter_chr) {
2697       sptr2 = width_force(6, sptr2, chrom_name_write(chrom_info_ptr, chrom_idx1, sptr2));
2698       sptr2 = memseta(sptr2, 32, 3);
2699     }
2700     prefix_len = (uintptr_t)(sptr2 - g_textbuf);
2701   ld_regular_emitn_start_2:
2702     if (marker_allele_ptrs) {
2703       fixed_a1 = marker_allele_ptrs[2 * marker_uidx1];
2704       fixed_a2 = marker_allele_ptrs[2 * marker_uidx1 + 1];
2705       fixed_a1_len = strlen(fixed_a1);
2706       fixed_a2_len = strlen(fixed_a2);
2707     }
2708     chrom_end2 = 0;
2709     block_end2 = ld_interval1[2 * block_idx1 + 1];
2710     dptr = &(results[(block_idx1 * marker_idx2_maxw + block_idx2 - block_idx2_start) * (1 + (dprime_type != 0))]);
2711     while (block_idx2 < block_end2) {
2712       next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx2);
2713       dxx = *dptr++;
2714       if (fabs(dxx) >= window_r2) {
2715 	sptr_cur = memcpya(sptr_cur, g_textbuf, prefix_len);
2716 	if (is_inter_chr) {
2717 	  if (marker_uidx2 >= chrom_end2) {
2718 	    chrom_fo_idx2 = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx2);
2719 	    chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
2720 	    chrom_end2 = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx2 + 1];
2721 	  }
2722 	  sptr_cur = width_force(6, sptr_cur, chrom_name_write(chrom_info_ptr, chrom_idx2, sptr_cur));
2723 	  sptr_cur = memseta(sptr_cur, 32, 3);
2724 	}
2725 	sptr_cur = uint32toa_w10x(marker_pos[marker_uidx2], ' ', sptr_cur);
2726 	sptr_cur = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), sptr_cur);
2727 	*sptr_cur++ = ' ';
2728 	if (marker_allele_ptrs) {
2729 	  coupling = (dxx > 0);
2730 	  sptr2 = memcpya(sptr_cur, fixed_a1, fixed_a1_len);
2731 	  sptr2 = strcpyax(sptr2, marker_allele_ptrs[2 * marker_uidx2 + (1 - coupling)], '/');
2732 	  sptr2 = memcpya(sptr2, fixed_a2, fixed_a2_len);
2733           sptr2 = strcpya(sptr2, marker_allele_ptrs[2 * marker_uidx2 + coupling]);
2734 	  sptr_cur = width_force(10, sptr_cur, sptr2);
2735 	  *sptr_cur++ = ' ';
2736 	}
2737 	if (set_allele_freqs) {
2738 	  sptr_cur = width_force(10, sptr_cur, dtoa_g(1.0 - set_allele_freqs[marker_uidx2], sptr_cur));
2739 	  *sptr_cur++ = ' ';
2740 	}
2741 	if (is_r2) {
2742 	  dxx = fabs(dxx);
2743 	}
2744 	sptr_cur = width_force(12, sptr_cur, dtoa_g(dxx, sptr_cur));
2745 	if (dprime_type) {
2746 	  *sptr_cur++ = ' ';
2747           sptr_cur = width_force(12, sptr_cur, dtoa_g(*dptr++, sptr_cur));
2748 	}
2749 	sptr_cur = memcpya(sptr_cur, " \n", 2);
2750       } else if (dprime_type) {
2751 	dptr++;
2752       }
2753       block_idx2++;
2754       marker_uidx2++;
2755       if (sptr_cur > readbuf_end) {
2756         goto ld_regular_emitn_ret;
2757       }
2758     }
2759   } while (++block_idx1 < block_size1);
2760  ld_regular_emitn_ret:
2761   g_ld_marker_uidx1 = marker_uidx1;
2762   g_ld_block_idx1 = block_idx1;
2763   g_ld_prefix_len = prefix_len;
2764   g_ld_uidx2_start = marker_uidx2_start;
2765   g_ld_idx2_block_start = block_idx2_start;
2766   g_ld_marker_uidx2 = marker_uidx2;
2767   g_ld_block_idx2 = block_idx2;
2768   return (uintptr_t)(((unsigned char*)sptr_cur) - readbuf);
2769 }
2770 
2771 // The following three functions are built around a data representation
2772 // introduced by Xiang Yan et al.'s BOOST software (the original bitwise
2773 // representation I came up with was less efficient); see
2774 // http://bioinformatics.ust.hk/BOOST.html .
2775 //
2776 // The BOOST implementation just evaluated four contingency table values; when
2777 // there is no missing data, the other five can be determined via subtraction.
// The two_locus_3x3_zmiss_tablev() function handles this case.  However, with
2779 // *only* that logic, all sites with missing data must be thrown out.
2780 // two_locus_3x3_tablev() handles the other cases, directly summing 6 or 9
2781 // table values when necessary.
2782 //
2783 // If permutation testing is added later, it should exploit the fact that
2784 // [cell xy value in case 3x3 table] + [cell xy value in ctrl 3x3 table]
2785 // is constant across permutations; i.e. we just need to determine the new case
2786 // contingency table, and then the control table falls out via subtraction.
2787 // Several ideas from PERMORY could also be applied.
load_and_split3(FILE * bedfile,uintptr_t * rawbuf,uint32_t unfiltered_sample_ct,uintptr_t * casebuf,uintptr_t * pheno_nm,uintptr_t * pheno_c,uint32_t case_ctv,uint32_t ctrl_ctv,uint32_t do_reverse,uint32_t is_case_only,uintptr_t * nm_info_ptr)2788 uint32_t load_and_split3(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sample_ct, uintptr_t* casebuf, uintptr_t* pheno_nm, uintptr_t* pheno_c, uint32_t case_ctv, uint32_t ctrl_ctv, uint32_t do_reverse, uint32_t is_case_only, uintptr_t* nm_info_ptr) {
2789   uintptr_t* rawbuf_end = &(rawbuf[unfiltered_sample_ct / BITCT2]);
2790   uintptr_t* ctrlbuf = &(casebuf[3 * case_ctv]);
2791   uintptr_t case_words[4];
2792   uintptr_t ctrl_words[4];
2793   uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
2794   uint32_t case_rem = 0;
2795   uint32_t ctrl_rem = 0;
2796   uint32_t read_shift_max = BITCT2;
2797   uint32_t sample_uidx = 0;
2798   uint32_t offset0_case = do_reverse * 2 * case_ctv;
2799   uint32_t offset2_case = (1 - do_reverse) * 2 * case_ctv;
2800   uint32_t offset0_ctrl = do_reverse * 2 * ctrl_ctv;
2801   uint32_t offset2_ctrl = (1 - do_reverse) * 2 * ctrl_ctv;
2802   uint32_t read_shift;
2803   uintptr_t read_word;
2804   uintptr_t ulii;
2805   if (bedfile) {
2806     // ld_report_dprime() preloads this and does het. haploid handling, etc.
2807     if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
2808       return RET_READ_FAIL;
2809     }
2810   }
2811   case_words[0] = 0;
2812   case_words[1] = 0;
2813   case_words[2] = 0;
2814   case_words[3] = 0;
2815   ctrl_words[0] = 0;
2816   ctrl_words[1] = 0;
2817   ctrl_words[2] = 0;
2818   ctrl_words[3] = 0;
2819   while (1) {
2820     while (rawbuf < rawbuf_end) {
2821       read_word = *rawbuf++;
2822       for (read_shift = 0; read_shift < read_shift_max; sample_uidx++, read_shift++) {
2823 	if (is_set(pheno_nm, sample_uidx)) {
2824 	  ulii = read_word & 3;
2825 	  if (is_set(pheno_c, sample_uidx)) {
2826 	    case_words[ulii] |= ONELU << case_rem;
2827 	    if (++case_rem == BITCT) {
2828 	      casebuf[offset0_case] = case_words[0];
2829 	      casebuf[case_ctv] = case_words[2];
2830 	      casebuf[offset2_case] = case_words[3];
2831 	      casebuf++;
2832 	      case_words[0] = 0;
2833 	      case_words[2] = 0;
2834 	      case_words[3] = 0;
2835 	      case_rem = 0;
2836 	    }
2837 	  } else if (!is_case_only) {
2838 	    ctrl_words[ulii] |= ONELU << ctrl_rem;
2839 	    if (++ctrl_rem == BITCT) {
2840 	      ctrlbuf[offset0_ctrl] = ctrl_words[0];
2841 	      ctrlbuf[ctrl_ctv] = ctrl_words[2];
2842 	      ctrlbuf[offset2_ctrl] = ctrl_words[3];
2843 	      ctrlbuf++;
2844 	      ctrl_words[0] = 0;
2845 	      ctrl_words[2] = 0;
2846 	      ctrl_words[3] = 0;
2847 	      ctrl_rem = 0;
2848 	    }
2849 	  }
2850 	}
2851 	read_word >>= 2;
2852       }
2853     }
2854     if (sample_uidx == unfiltered_sample_ct) {
2855       if (case_rem) {
2856 	casebuf[offset0_case] = case_words[0];
2857 	casebuf[case_ctv] = case_words[2];
2858 	casebuf[offset2_case] = case_words[3];
2859       }
2860       if (ctrl_rem) {
2861 	ctrlbuf[offset0_ctrl] = ctrl_words[0];
2862 	ctrlbuf[ctrl_ctv] = ctrl_words[2];
2863 	ctrlbuf[offset2_ctrl] = ctrl_words[3];
2864       }
2865       ulii = 3;
2866       if (case_words[1]) {
2867 	ulii -= 1;
2868       }
2869       if (ctrl_words[1]) {
2870 	ulii -= 2;
2871       }
2872       *nm_info_ptr = ulii;
2873       return 0;
2874     }
2875     rawbuf_end++;
2876     read_shift_max = unfiltered_sample_ct % BITCT2;
2877   }
2878 }
2879 
2880 #ifdef __LP64__
// Adds intersection popcounts between bit-vector arrays to counts_3x3.
//
// vec2 is treated as three consecutive runs of sample_ctv6 vectors each
// (vec20/vec21/vec22 below) and is rewound at the start of every iteration.
// vec1 is read straight through and never rewound, so iteration i consumes
// vec1[i * sample_ctv6 .. (i + 1) * sample_ctv6 - 1].  In each iteration,
//   counts_3x3[0] += popcount(vec1 run & vec20 run)
//   counts_3x3[1] += popcount(vec1 run & vec21 run)
//   counts_3x3[2] += popcount(vec1 run & vec22 run)
// and then counts_3x3 is advanced by 3, for iter_ct iterations in total.
// Counts are accumulated with +=, so the caller must zero them beforehand.
//
// Vectors are consumed three at a time.  A block is capped at 30 vectors (10
// passes) because each pass can add up to 24 to a byte of the accumulators,
// and 10 * 24 = 240 still fits in 8 bits.
static void two_locus_3x3_tablev(__m128i* vec1, __m128i* vec2, uint32_t* counts_3x3, uint32_t sample_ctv6, uint32_t iter_ct) {
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  __m128i* vec20;
  __m128i* vec21;
  __m128i* vec22;
  __m128i* vend1;
  __m128i loader1;
  __m128i loader20;
  __m128i loader21;
  __m128i loader22;
  __m128i count10;
  __m128i count11;
  __m128i count12;
  __m128i count20;
  __m128i count21;
  __m128i count22;
  __univec acc0;
  __univec acc1;
  __univec acc2;
  uint32_t ct;
  uint32_t ct2;
  while (iter_ct--) {
    ct = sample_ctv6;
    // Rewind the three vec2 runs; vec1 deliberately keeps advancing.
    vec20 = vec2;
    vec21 = &(vec20[sample_ctv6]);
    vec22 = &(vec20[2 * sample_ctv6]);
    while (ct >= 30) {
      ct -= 30;
      vend1 = &(vec1[30]);
      acc0.vi = _mm_setzero_si128();
      acc1.vi = _mm_setzero_si128();
      acc2.vi = _mm_setzero_si128();
      do {
      two_locus_3x3_tablev_outer:
	// First vector of the triple: AND, then reduce to 2-bit partial
	// popcounts (0..2 per field).
	loader1 = *vec1++;
	loader20 = *vec20++;
	loader21 = *vec21++;
	loader22 = *vec22++;
	count10 = _mm_and_si128(loader1, loader20);
	count11 = _mm_and_si128(loader1, loader21);
	count12 = _mm_and_si128(loader1, loader22);
	count10 = _mm_sub_epi64(count10, _mm_and_si128(_mm_srli_epi64(count10, 1), m1));
	count11 = _mm_sub_epi64(count11, _mm_and_si128(_mm_srli_epi64(count11, 1), m1));
	count12 = _mm_sub_epi64(count12, _mm_and_si128(_mm_srli_epi64(count12, 1), m1));
      two_locus_3x3_tablev_two_left:
        // unlike the zmiss variant, this apparently does not suffer from
	// enough register spill to justify shrinking the inner loop
	// Second vector of the triple, handled the same way into count2x.
	loader1 = *vec1++;
	loader20 = *vec20++;
	loader21 = *vec21++;
	loader22 = *vec22++;
	count20 = _mm_and_si128(loader1, loader20);
	count21 = _mm_and_si128(loader1, loader21);
	count22 = _mm_and_si128(loader1, loader22);
	count20 = _mm_sub_epi64(count20, _mm_and_si128(_mm_srli_epi64(count20, 1), m1));
	count21 = _mm_sub_epi64(count21, _mm_and_si128(_mm_srli_epi64(count21, 1), m1));
	count22 = _mm_sub_epi64(count22, _mm_and_si128(_mm_srli_epi64(count22, 1), m1));
      two_locus_3x3_tablev_one_left:
	// Third vector of the triple: its even-position bits are added to
	// count1x and its odd-position bits to count2x, raising each 2-bit
	// field to at most 3.
	loader1 = *vec1++;
	loader20 = *vec20++;
	loader21 = _mm_and_si128(loader1, loader20); // half1
	loader22 = _mm_and_si128(_mm_srli_epi64(loader21, 1), m1); // half2
	count10 = _mm_add_epi64(count10, _mm_and_si128(loader21, m1));
	count20 = _mm_add_epi64(count20, loader22);
	loader20 = *vec21++;
	loader21 = _mm_and_si128(loader1, loader20);
	loader22 = _mm_and_si128(_mm_srli_epi64(loader21, 1), m1);
	count11 = _mm_add_epi64(count11, _mm_and_si128(loader21, m1));
	count21 = _mm_add_epi64(count21, loader22);
	loader20 = *vec22++;
	loader21 = _mm_and_si128(loader1, loader20);
	loader22 = _mm_and_si128(_mm_srli_epi64(loader21, 1), m1);
	count12 = _mm_add_epi64(count12, _mm_and_si128(loader21, m1));
	count22 = _mm_add_epi64(count22, loader22);

	// Widen 2-bit fields to 4-bit fields (<= 12 after merging count2x into
	// count1x), then to bytes (<= 24), and add to the accumulators.
	count10 = _mm_add_epi64(_mm_and_si128(count10, m2), _mm_and_si128(_mm_srli_epi64(count10, 2), m2));
	count11 = _mm_add_epi64(_mm_and_si128(count11, m2), _mm_and_si128(_mm_srli_epi64(count11, 2), m2));
	count12 = _mm_add_epi64(_mm_and_si128(count12, m2), _mm_and_si128(_mm_srli_epi64(count12, 2), m2));
	count10 = _mm_add_epi64(count10, _mm_add_epi64(_mm_and_si128(count20, m2), _mm_and_si128(_mm_srli_epi64(count20, 2), m2)));
	count11 = _mm_add_epi64(count11, _mm_add_epi64(_mm_and_si128(count21, m2), _mm_and_si128(_mm_srli_epi64(count21, 2), m2)));
	count12 = _mm_add_epi64(count12, _mm_add_epi64(_mm_and_si128(count22, m2), _mm_and_si128(_mm_srli_epi64(count22, 2), m2)));
	acc0.vi = _mm_add_epi64(acc0.vi, _mm_add_epi64(_mm_and_si128(count10, m4), _mm_and_si128(_mm_srli_epi64(count10, 4), m4)));
	acc1.vi = _mm_add_epi64(acc1.vi, _mm_add_epi64(_mm_and_si128(count11, m4), _mm_and_si128(_mm_srli_epi64(count11, 4), m4)));
	acc2.vi = _mm_add_epi64(acc2.vi, _mm_add_epi64(_mm_and_si128(count12, m4), _mm_and_si128(_mm_srli_epi64(count12, 4), m4)));
      } while (vec1 < vend1);
      // Horizontal sum: bytes -> 16-bit lanes, then the multiply/shift adds
      // the four 16-bit lanes of (u8[0] + u8[1]) together.  (u8[] is
      // presumably the pair of 64-bit halves of the vector -- see the
      // __univec definition.)
      const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
      acc0.vi = _mm_add_epi64(_mm_and_si128(acc0.vi, m8), _mm_and_si128(_mm_srli_epi64(acc0.vi, 8), m8));
      acc1.vi = _mm_add_epi64(_mm_and_si128(acc1.vi, m8), _mm_and_si128(_mm_srli_epi64(acc1.vi, 8), m8));
      acc2.vi = _mm_add_epi64(_mm_and_si128(acc2.vi, m8), _mm_and_si128(_mm_srli_epi64(acc2.vi, 8), m8));
      counts_3x3[0] += ((acc0.u8[0] + acc0.u8[1]) * 0x1000100010001LLU) >> 48;
      counts_3x3[1] += ((acc1.u8[0] + acc1.u8[1]) * 0x1000100010001LLU) >> 48;
      counts_3x3[2] += ((acc2.u8[0] + acc2.u8[1]) * 0x1000100010001LLU) >> 48;
    }
    if (ct) {
      // Fewer than 30 vectors remain.  Re-enter the loop body above for one
      // final, shorter block: the first pass consumes ct % 3 vectors (or a
      // full triple if that is zero) by jumping to the matching label with
      // the skipped partial counters zeroed.  ct is cleared first so that
      // both the enclosing while and this if fall through afterwards.
      vend1 = &(vec1[ct]);
      ct2 = ct % 3;
      acc0.vi = _mm_setzero_si128();
      acc1.vi = _mm_setzero_si128();
      acc2.vi = _mm_setzero_si128();
      ct = 0;
      if (ct2) {
	count10 = _mm_setzero_si128();
	count11 = _mm_setzero_si128();
	count12 = _mm_setzero_si128();
	if (ct2 == 2) {
	  goto two_locus_3x3_tablev_two_left;
	}
	count20 = _mm_setzero_si128();
	count21 = _mm_setzero_si128();
	count22 = _mm_setzero_si128();
	goto two_locus_3x3_tablev_one_left;
      }
      goto two_locus_3x3_tablev_outer;
    }
    counts_3x3 = &(counts_3x3[3]);
  }
}
3000 
// Adds four intersection popcounts to counts_3x3, using only the first two
// runs (of sample_ctv6 vectors each) of both inputs:
//   counts_3x3[0] += popcount(veca0 run & vecb0 run)
//   counts_3x3[1] += popcount(veca0 run & vecb1 run)
//   counts_3x3[3] += popcount(veca1 run & vecb0 run)
//   counts_3x3[4] += popcount(veca1 run & vecb1 run)
// where veca1/vecb1 start sample_ctv6 vectors after veca0/vecb0.  No other
// counts_3x3 entry is touched.  Counts are accumulated with +=, so the caller
// must zero them beforehand.
//
// Vectors are consumed two at a time.  A block is capped at 30 vectors (15
// passes) because each pass can add up to 16 to a byte of the accumulators,
// and 15 * 16 = 240 still fits in 8 bits.
static inline void two_locus_3x3_zmiss_tablev(__m128i* veca0, __m128i* vecb0, uint32_t* counts_3x3, uint32_t sample_ctv6) {
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  __m128i* vecb1 = &(vecb0[sample_ctv6]);
  __m128i* veca1 = &(veca0[sample_ctv6]);
  __m128i* vend;
  __m128i loadera0;
  __m128i loaderb0;
  __m128i loaderb1;
  __m128i loadera1;
  __m128i countx00;
  __m128i countx01;
  __m128i countx11;
  __m128i countx10;
  __m128i county00;
  __m128i county01;
  __m128i county11;
  __m128i county10;
  __univec acc00;
  __univec acc01;
  __univec acc11;
  __univec acc10;
  uint32_t ct2;
  while (sample_ctv6 >= 30) {
    sample_ctv6 -= 30;
    vend = &(veca0[30]);
    acc00.vi = _mm_setzero_si128();
    acc01.vi = _mm_setzero_si128();
    acc11.vi = _mm_setzero_si128();
    acc10.vi = _mm_setzero_si128();
    do {
    two_locus_3x3_zmiss_tablev_outer:
      // First vector of the pair: AND the four (a, b) combinations, then
      // reduce to 2-bit and 4-bit partial popcounts (<= 4 per 4-bit field).
      loadera0 = *veca0++;
      loaderb0 = *vecb0++;
      loaderb1 = *vecb1++;
      loadera1 = *veca1++;
      countx00 = _mm_and_si128(loadera0, loaderb0);
      countx01 = _mm_and_si128(loadera0, loaderb1);
      countx11 = _mm_and_si128(loadera1, loaderb1);
      countx10 = _mm_and_si128(loadera1, loaderb0);
      countx00 = _mm_sub_epi64(countx00, _mm_and_si128(_mm_srli_epi64(countx00, 1), m1));
      countx01 = _mm_sub_epi64(countx01, _mm_and_si128(_mm_srli_epi64(countx01, 1), m1));
      countx11 = _mm_sub_epi64(countx11, _mm_and_si128(_mm_srli_epi64(countx11, 1), m1));
      countx10 = _mm_sub_epi64(countx10, _mm_and_si128(_mm_srli_epi64(countx10, 1), m1));
      countx00 = _mm_add_epi64(_mm_and_si128(countx00, m2), _mm_and_si128(_mm_srli_epi64(countx00, 2), m2));
      countx01 = _mm_add_epi64(_mm_and_si128(countx01, m2), _mm_and_si128(_mm_srli_epi64(countx01, 2), m2));
      countx11 = _mm_add_epi64(_mm_and_si128(countx11, m2), _mm_and_si128(_mm_srli_epi64(countx11, 2), m2));
      countx10 = _mm_add_epi64(_mm_and_si128(countx10, m2), _mm_and_si128(_mm_srli_epi64(countx10, 2), m2));
    two_locus_3x3_zmiss_tablev_one_left:
      // Second vector of the pair: same reduction into county, merged into
      // countx (<= 8 per 4-bit field), widened to bytes (<= 16) and added to
      // the accumulators.
      loadera0 = *veca0++;
      loaderb0 = *vecb0++;
      loaderb1 = *vecb1++;
      loadera1 = *veca1++;
      county00 = _mm_and_si128(loadera0, loaderb0);
      county01 = _mm_and_si128(loadera0, loaderb1);
      county11 = _mm_and_si128(loadera1, loaderb1);
      county10 = _mm_and_si128(loadera1, loaderb0);
      county00 = _mm_sub_epi64(county00, _mm_and_si128(_mm_srli_epi64(county00, 1), m1));
      county01 = _mm_sub_epi64(county01, _mm_and_si128(_mm_srli_epi64(county01, 1), m1));
      county11 = _mm_sub_epi64(county11, _mm_and_si128(_mm_srli_epi64(county11, 1), m1));
      county10 = _mm_sub_epi64(county10, _mm_and_si128(_mm_srli_epi64(county10, 1), m1));
      countx00 = _mm_add_epi64(countx00, _mm_add_epi64(_mm_and_si128(county00, m2), _mm_and_si128(_mm_srli_epi64(county00, 2), m2)));
      countx01 = _mm_add_epi64(countx01, _mm_add_epi64(_mm_and_si128(county01, m2), _mm_and_si128(_mm_srli_epi64(county01, 2), m2)));
      countx11 = _mm_add_epi64(countx11, _mm_add_epi64(_mm_and_si128(county11, m2), _mm_and_si128(_mm_srli_epi64(county11, 2), m2)));
      countx10 = _mm_add_epi64(countx10, _mm_add_epi64(_mm_and_si128(county10, m2), _mm_and_si128(_mm_srli_epi64(county10, 2), m2)));
      acc00.vi = _mm_add_epi64(acc00.vi, _mm_add_epi64(_mm_and_si128(countx00, m4), _mm_and_si128(_mm_srli_epi64(countx00, 4), m4)));
      acc01.vi = _mm_add_epi64(acc01.vi, _mm_add_epi64(_mm_and_si128(countx01, m4), _mm_and_si128(_mm_srli_epi64(countx01, 4), m4)));
      acc11.vi = _mm_add_epi64(acc11.vi, _mm_add_epi64(_mm_and_si128(countx11, m4), _mm_and_si128(_mm_srli_epi64(countx11, 4), m4)));
      acc10.vi = _mm_add_epi64(acc10.vi, _mm_add_epi64(_mm_and_si128(countx10, m4), _mm_and_si128(_mm_srli_epi64(countx10, 4), m4)));
    } while (veca0 < vend);
    // Horizontal sum: bytes -> 16-bit lanes, then the multiply/shift adds the
    // four 16-bit lanes of (u8[0] + u8[1]) together.  (u8[] is presumably the
    // pair of 64-bit halves of the vector -- see the __univec definition.)
    const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
    acc00.vi = _mm_add_epi64(_mm_and_si128(acc00.vi, m8), _mm_and_si128(_mm_srli_epi64(acc00.vi, 8), m8));
    acc01.vi = _mm_add_epi64(_mm_and_si128(acc01.vi, m8), _mm_and_si128(_mm_srli_epi64(acc01.vi, 8), m8));
    acc11.vi = _mm_add_epi64(_mm_and_si128(acc11.vi, m8), _mm_and_si128(_mm_srli_epi64(acc11.vi, 8), m8));
    acc10.vi = _mm_add_epi64(_mm_and_si128(acc10.vi, m8), _mm_and_si128(_mm_srli_epi64(acc10.vi, 8), m8));
    counts_3x3[0] += ((acc00.u8[0] + acc00.u8[1]) * 0x1000100010001LLU) >> 48;
    counts_3x3[1] += ((acc01.u8[0] + acc01.u8[1]) * 0x1000100010001LLU) >> 48;
    counts_3x3[4] += ((acc11.u8[0] + acc11.u8[1]) * 0x1000100010001LLU) >> 48;
    counts_3x3[3] += ((acc10.u8[0] + acc10.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (sample_ctv6) {
    // Fewer than 30 vectors remain.  Re-enter the loop body above for one
    // final, shorter block; if the remaining count is odd, the first pass
    // starts at the second-vector label with countx zeroed.  sample_ctv6 is
    // cleared first so that both the while and this if fall through
    // afterwards.
    vend = &(veca0[sample_ctv6]);
    ct2 = sample_ctv6 % 2;
    sample_ctv6 = 0;
    acc00.vi = _mm_setzero_si128();
    acc01.vi = _mm_setzero_si128();
    acc11.vi = _mm_setzero_si128();
    acc10.vi = _mm_setzero_si128();
    if (ct2) {
      countx00 = _mm_setzero_si128();
      countx01 = _mm_setzero_si128();
      countx11 = _mm_setzero_si128();
      countx10 = _mm_setzero_si128();
      goto two_locus_3x3_zmiss_tablev_one_left;
    }
    goto two_locus_3x3_zmiss_tablev_outer;
  }
}
3100 #endif
3101 
two_locus_count_table_zmiss1(uintptr_t * lptr1,uintptr_t * lptr2,uint32_t * counts_3x3,uint32_t sample_ctv3,uint32_t is_zmiss2)3102 static void two_locus_count_table_zmiss1(uintptr_t* lptr1, uintptr_t* lptr2, uint32_t* counts_3x3, uint32_t sample_ctv3, uint32_t is_zmiss2) {
3103 #ifdef __LP64__
3104   fill_uint_zero(6, counts_3x3);
3105   if (is_zmiss2) {
3106     two_locus_3x3_zmiss_tablev((__m128i*)lptr1, (__m128i*)lptr2, counts_3x3, sample_ctv3 / 2);
3107   } else {
3108     two_locus_3x3_tablev((__m128i*)lptr1, (__m128i*)lptr2, counts_3x3, sample_ctv3 / 2, 2);
3109   }
3110 #else
3111   counts_3x3[0] = popcount_longs_intersect(lptr1, lptr2, sample_ctv3);
3112   counts_3x3[1] = popcount_longs_intersect(lptr1, &(lptr2[sample_ctv3]), sample_ctv3);
3113   if (!is_zmiss2) {
3114     counts_3x3[2] = popcount_longs_intersect(lptr1, &(lptr2[2 * sample_ctv3]), sample_ctv3);
3115     counts_3x3[5] = popcount_longs_intersect(&(lptr1[sample_ctv3]), &(lptr2[2 * sample_ctv3]), sample_ctv3);
3116   }
3117   lptr1 = &(lptr1[sample_ctv3]);
3118   counts_3x3[3] = popcount_longs_intersect(lptr1, lptr2, sample_ctv3);
3119   counts_3x3[4] = popcount_longs_intersect(lptr1, &(lptr2[sample_ctv3]), sample_ctv3);
3120 #endif
3121 }
3122 
two_locus_count_table(uintptr_t * lptr1,uintptr_t * lptr2,uint32_t * counts_3x3,uint32_t sample_ctv3,uint32_t is_zmiss2)3123 static void two_locus_count_table(uintptr_t* lptr1, uintptr_t* lptr2, uint32_t* counts_3x3, uint32_t sample_ctv3, uint32_t is_zmiss2) {
3124 #ifdef __LP64__
3125   uint32_t uii;
3126   fill_uint_zero(9, counts_3x3);
3127   if (!is_zmiss2) {
3128     two_locus_3x3_tablev((__m128i*)lptr1, (__m128i*)lptr2, counts_3x3, sample_ctv3 / 2, 3);
3129   } else {
3130     two_locus_3x3_tablev((__m128i*)lptr2, (__m128i*)lptr1, counts_3x3, sample_ctv3 / 2, 2);
3131     uii = counts_3x3[1];
3132     counts_3x3[1] = counts_3x3[3];
3133     counts_3x3[3] = uii;
3134     counts_3x3[6] = counts_3x3[2];
3135     counts_3x3[7] = counts_3x3[5];
3136   }
3137 #else
3138   counts_3x3[0] = popcount_longs_intersect(lptr2, lptr1, sample_ctv3);
3139   counts_3x3[3] = popcount_longs_intersect(lptr2, &(lptr1[sample_ctv3]), sample_ctv3);
3140   counts_3x3[6] = popcount_longs_intersect(lptr2, &(lptr1[2 * sample_ctv3]), sample_ctv3);
3141   lptr2 = &(lptr2[sample_ctv3]);
3142   counts_3x3[1] = popcount_longs_intersect(lptr2, lptr1, sample_ctv3);
3143   counts_3x3[4] = popcount_longs_intersect(lptr2, &(lptr1[sample_ctv3]), sample_ctv3);
3144   counts_3x3[7] = popcount_longs_intersect(lptr2, &(lptr1[2 * sample_ctv3]), sample_ctv3);
3145   if (!is_zmiss2) {
3146     lptr2 = &(lptr2[sample_ctv3]);
3147     counts_3x3[2] = popcount_longs_intersect(lptr2, lptr1, sample_ctv3);
3148     counts_3x3[5] = popcount_longs_intersect(lptr2, &(lptr1[sample_ctv3]), sample_ctv3);
3149     counts_3x3[8] = popcount_longs_intersect(lptr2, &(lptr1[2 * sample_ctv3]), sample_ctv3);
3150   }
3151 #endif
3152 }
3153 
// Converts one or two 3x3 genotype count tables into the ingredients of the
// joint-effects test statistic.
//
//   group_ct: number of consecutive 9-cell tables in counts[].  The code only
//             distinguishes 1 from "not 1", and the local buffers are sized
//             for at most 2 tables.
//   counts: the tables; cell [8] of each is the [hom A2 x hom B2] baseline
//           cell.  The pointed-to values are only read.
//   diff_ptr, case_var_ptr, ctrl_var_ptr: outputs.
//     group_ct == 1: *diff_ptr is the table's lambda (or mu) value and
//       *case_var_ptr is the sum of all entries of the inverted 4x4 matrix
//       (NOT its reciprocal); *ctrl_var_ptr is not written.
//     otherwise: *case_var_ptr and *ctrl_var_ptr are the reciprocals of those
//       sums for tables 0 and 1, and
//       *diff_ptr = lambda_or_mu[0] * (*case_var_ptr) -
//                   lambda_or_mu[1] * (*ctrl_var_ptr).
//
// The result of invert_matrix() is not checked.
void fepi_counts_to_joint_effects_stats(uint32_t group_ct, uint32_t* counts, double* diff_ptr, double* case_var_ptr, double* ctrl_var_ptr) {
  // See JointEffects::evaluateStatistic().  This is slightly reordered to
  // avoid a bit of redundant calculation, but the logic is otherwise
  // identical.
  //
  // Two adjustments to the raw counts are applied:
  // 1. If any cell in either the case or control tables is zero, we add 0.5 to
  //    all cells in both tables.
  // 2. Then, if the [hom A2 x hom B2] cell in either the case or control table
  //    is less than 1% of the total (very unlikely since A2/B2 are normally
  //    major), multiply all other cells by a reduction factor and increase the
  //    [hom A2 x hom B2] cell by the total reduction (choosing the factor such
  //    that the [hom A2 x hom B2] cell ends up at about 1%).
  //
  // Then, we define
  //   i22_case := [hom A1 x hom B1] * [hom A2 x hom B2] /
  //               ([hom A1 x hom B2] * [hom A2 x hom B1])
  //   i21_case := [hom A1 x het] * [hom A2 x hom B2] /
  //               ([hom A1 x hom B2] * [hom A2 x het])
  //   i12_case := [het x hom B1] * [hom A2 x hom B2] /
  //               ([het x hom B2] * [hom A2 x hom B1])
  //   i11_case := [het x het] * [hom A2 x hom B2] /
  //               ([het x hom B2] * [hom A2 x het])
  //   (analogously for controls)
  //
  // At this point, two formulas may be applied to the (adjusted) counts:
  // 1. If i11 is greater than 0.5 for both cases and controls (this is usually
  //    true),
  //      xi0 := 0.5
  //      xi1 := 1.0
  //      xi2 := 1.0
  //      xi3 := 2 * i11_case / (2 * i11_case - 1)
  //      invq00 := 1.0 / [hom A2 x hom B2]
  //      invq01 := 1.0 / [hom A2 x het]
  //      ...
  //      inverse_matrix := [ (invq22+invq02+invq20+invq00)*xi0*xi0   (invq20+invq00)*xi0*xi1   (invq02+invq00)*xi0*xi2   invq00*xi0*xi3 ]^-1
  //                        [ ...   (invq21+invq20+invq01+invq00)*xi1*xi1   invq00*xi1*xi2   (invq01+invq00)*xi1*xi3 ]
  //                        [ ...   ...   (invq12+invq10+invq02+invq00)*xi2*xi2   (invq10+invq00)*xi2*xi3 ]
  //                        [ ...   ...   ... (invq11+invq10+invq01+invq00)*xi3*xi3 ]
  //      (bottom left is symmetric copy of upper right)
  //      row_totals_case[i] := sum(row i of inverse_matrix_case)
  //      total_inv_v_case := 1.0 / (row_totals_case[0] + [1] + [2] + [3])
  //      lambda_case := row_totals_case[0] * log(i22_case) * 0.5 +
  //                     row_totals_case[1] * log(i21_case) +
  //                     row_totals_case[2] * log(i12_case) +
  //                     row_totals_case[3] * log(2 * i11_case - 1)
  //      (analogous formulas for lambda_ctrl)
  //      diff := lambda_case * total_inv_v_case -
  //              lambda_ctrl * total_inv_v_ctrl
  //      chisq := diff * diff / (total_inv_v_case + total_inv_v_ctrl)
  //
  // 2. Otherwise,
  //      xi0 := sqrt(i22) / (2 * sqrt(i22) + 2)
  //      xi1 := i21 / (i21 + 1)
  //      xi2 := i12 / (i12 + 1)
  //      xi3 := 1.0
  //      (inverse_matrix, row_totals, total_inv_v defined as before)
  //      mu_case := row_totals_case[0] * log((sqrt(i22_case) + 1) * 0.5) +
  //                 row_totals_case[1] * log((i21_case + 1) * 0.5) +
  //                 row_totals_case[2] * log((i12_case + 1) * 0.5) +
  //                 row_totals_case[3] * log(i11_case)
  //      (similar for mu_ctrl)
  //      diff := mu_case * total_inv_v_case - mu_ctrl * total_inv_v_ctrl
  double dcounts[18];
  double invcounts[18];
  double ivv[8]; // i22_case in [0], i21_case in [1], ..., i22_ctrl in [4]...
  double xiv[8];
  double row_totals[8];
  double to_invert[16];
  MATRIX_INVERT_BUF1_TYPE int_1d_buf[4];
  double dbl_2d_buf[16];
  double tot_inv_v[2];
  double lambda_or_mu[2];
  double dxx;
  double dyy;
  double* dptr;
  double* dptr2;
  double* dptr3;
  uint32_t use_reg_stat;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  tot_inv_v[0] = 0.0;  // gcc7 maybe-uninitialized warning
  dptr = dcounts;
  // Adjustment 1: the second table's cells are only inspected when
  // group_ct != 1.
  if (counts[0] && counts[1] && counts[2] && counts[3] && counts[4] && counts[5] && counts[6] && counts[7] && counts[8] && ((group_ct == 1) || (counts[9] && counts[10] && counts[11] && counts[12] && counts[13] && counts[14] && counts[15] && counts[16] && counts[17]))) {
    // No zero cells: copy the counts unchanged.
    for (uii = 0; uii < group_ct; uii++) {
      dxx = 0;
      for (ujj = 0; ujj < 9; ujj++) {
	dyy = (double)((int32_t)(*counts++));
	*dptr++ = dyy;
	dxx += dyy;
      }
      // Here dxx is the table total and dyy is the baseline cell [8].
      if (dyy * 100 < dxx) {
	// This tends to come up with adjacent pairs of markers where MAF
	// "flips" from one side of 0.5 to the other.  Is this really a good
	// way to handle it?
	// Adjustment 2: scale the other eight cells by dyy and set the
	// baseline cell to 0.01 * dyy * dxx; the nine cells still sum to dxx.
	dyy = dxx / (1.01 * dxx - dyy);
        dptr = &(dptr[-9]);
	for (ujj = 0; ujj < 8; ujj++) {
	  *dptr *= dyy;
	  dptr++;
	}
	*dptr++ = 0.01 * dyy * dxx;
      }
    }
  } else {
    // At least one zero cell: add 0.5 to every cell of every table.
    for (uii = 0; uii < group_ct; uii++) {
      // Starting at -4.5 cancels the nine 0.5 increments, so dxx ends up as
      // the raw (unadjusted) table total.
      dxx = -4.5;
      for (ujj = 0; ujj < 9; ujj++) {
	dyy = 0.5 + (double)((int32_t)(*counts++));
	*dptr++ = dyy;
	dxx += dyy;
      }
      // dyy is the adjusted baseline cell; it is compared against the raw
      // total.
      if (dyy * 100 < dxx) {
	// The + 4.5 restores the adjusted sum of the other eight cells in the
	// denominator; after rescaling, the nine cells sum to dxx.
	dyy = dxx / (1.01 * dxx - dyy + 4.5);
        dptr = &(dptr[-9]);
	for (ujj = 0; ujj < 8; ujj++) {
	  *dptr *= dyy;
	  dptr++;
	}
	*dptr++ = 0.01 * dyy * dxx;
      }
    }
  }
  // Reciprocals of all adjusted cells.
  dptr = dcounts;
  dptr2 = invcounts;
  for (uii = 0; uii < group_ct; uii++) {
    for (ujj = 0; ujj < 9; ujj++) {
      *dptr2++ = 1.0 / (*dptr++);
    }
  }
  // i22, i21, i12, i11 for each table, in that order.  Note that this
  // do-while body runs once even if group_ct is 0.
  dptr2 = ivv;
  uii = 0;
  do {
    dptr = &(dcounts[uii * 9]);
    dptr3 = &(invcounts[uii * 9]);
    dxx = dptr[8];
    *dptr2++ = dxx * dptr[0] * dptr3[2] * dptr3[6];
    *dptr2++ = dxx * dptr[1] * dptr3[2] * dptr3[7];
    *dptr2++ = dxx * dptr[3] * dptr3[5] * dptr3[6];
    *dptr2++ = dxx * dptr[4] * dptr3[5] * dptr3[7];
  } while (++uii < group_ct);
  // Formula 1 requires i11 > 0.5 in every table in use; otherwise formula 2
  // is applied to all of them.
  use_reg_stat = (ivv[3] > 0.5) && ((group_ct == 1) || (ivv[7] > 0.5));
  if (use_reg_stat) {
    dptr2 = xiv;
    for (uii = 0; uii < group_ct; uii++) {
      dxx = 2 * ivv[3 + 4 * uii];
      *dptr2++ = 0.5;
      *dptr2++ = 1.0;
      *dptr2++ = 1.0;
      *dptr2++ = dxx / (dxx - 1);
    }
  } else {
    for (uii = 0; uii < group_ct; uii++) {
      dptr = &(ivv[uii * 4]);
      dptr2 = &(xiv[uii * 4]);
      dxx = sqrt(dptr[0]);
      dptr2[1] = dptr[1] / (dptr[1] + 1);
      dptr2[2] = dptr[2] / (dptr[2] + 1);
      dptr2[3] = 1.0;
      dptr2[0] = dxx / (2 * dxx + 2);
      dptr[0] = dxx; // original i22 is not used from here on
    }
  }
  // Build the symmetric 4x4 matrix for each table, invert it in place, and
  // sum the rows of the result.
  for (uii = 0; uii < group_ct; uii++) {
    dptr = &(invcounts[uii * 9]);
    dptr2 = &(xiv[uii * 4]);
    // invq00 = dptr[8]
    // invq01 = dptr[7]
    // ...
    // thank god this code doesn't need to be edited every day
    dxx = dptr[8];
    dyy = dptr2[0];
    to_invert[0] = (dptr[0] + dptr[2] + dptr[6] + dxx) * dyy * dyy;
    to_invert[1] = (dptr[2] + dxx) * dyy * dptr2[1];
    to_invert[2] = (dptr[6] + dxx) * dyy * dptr2[2];
    to_invert[3] = dxx * dyy * dptr2[3];
    dyy = dptr2[1];
    to_invert[4] = to_invert[1];
    to_invert[5] = (dptr[1] + dptr[2] + dptr[7] + dxx) * dyy * dyy;
    to_invert[6] = dxx * dyy * dptr2[2];
    to_invert[7] = (dptr[7] + dxx) * dyy * dptr2[3];
    dyy = dptr2[2];
    to_invert[8] = to_invert[2];
    to_invert[9] = to_invert[6];
    to_invert[10] = (dptr[3] + dptr[5] + dptr[6] + dxx) * dyy * dyy;
    to_invert[11] = (dptr[5] + dxx) * dyy * dptr2[3];
    dyy = dptr2[3];
    to_invert[12] = to_invert[3];
    to_invert[13] = to_invert[7];
    to_invert[14] = to_invert[11];
    to_invert[15] = (dptr[4] + dptr[5] + dptr[7] + dxx) * dyy * dyy;
    // NOTE(review): the return value is ignored, so a failed inversion would
    // go unnoticed here.
    invert_matrix(4, to_invert, int_1d_buf, dbl_2d_buf);
    dptr = to_invert;
    dptr2 = &(row_totals[uii * 4]);
    dxx = 0;
    for (ujj = 0; ujj < 4; ujj++) {
      dyy = 0;
      for (ukk = 0; ukk < 4; ukk++) {
	dyy += (*dptr++);
      }
      *dptr2++ = dyy;
      dxx += dyy;
    }
    // Sum of all 16 entries of the inverse; the reciprocal is taken at the
    // end (two-table case only).
    tot_inv_v[uii] = dxx;
  }
  if (use_reg_stat) {
    for (uii = 0; uii < group_ct; uii++) {
      dptr = &(row_totals[uii * 4]);
      dptr2 = &(ivv[uii * 4]);
      lambda_or_mu[uii] = dptr[0] * log(dptr2[0]) * 0.5 +
	                  dptr[1] * log(dptr2[1]) +
	                  dptr[2] * log(dptr2[2]) +
                          dptr[3] * log(2 * dptr2[3] - 1);
    }
  } else {
    for (uii = 0; uii < group_ct; uii++) {
      dptr = &(row_totals[uii * 4]);
      dptr2 = &(ivv[uii * 4]);
      // note that dptr2[0] has sqrt(i22) instead of i22
      // really minor thing to check: cheaper to subtract log(2) than multiply
      // by 0.5 inside log?  (I wouldn't think so: multiplication-by-0.5 is the
      // sort of thing which looks like it's eligible for automatic
      // optimization.)
      lambda_or_mu[uii] = dptr[0] * log((dptr2[0] + 1) * 0.5) +
	                  dptr[1] * log((dptr2[1] + 1) * 0.5) +
	                  dptr[2] * log((dptr2[2] + 1) * 0.5) +
	                  dptr[3] * log(dptr2[3]);
    }
  }
  dxx = tot_inv_v[0];
  if (group_ct == 1) {
    // Single-table mode: report the raw sum and the un-normalized lambda/mu;
    // *ctrl_var_ptr is left untouched.
    *case_var_ptr = dxx;
    *diff_ptr = lambda_or_mu[0];
    return;
  }
  dxx = 1.0 / dxx;
  dyy = 1.0 / tot_inv_v[1];
  *diff_ptr = lambda_or_mu[0] * dxx - lambda_or_mu[1] * dyy;
  *case_var_ptr = dxx;
  *ctrl_var_ptr = dyy;
}
3396 
// epistasis multithread globals
// All of these have internal linkage and static storage duration; only
// g_epi_boost_precalc2 has an explicit initializer.
// NOTE(review): names ending in 1 vs. 2 presumably refer to the first and
// second marker of each tested pair -- confirm against the worker threads.
static uint32_t* g_epi_geno1_offsets;
static double* g_epi_all_chisq;
static uintptr_t* g_epi_geno1;
static uintptr_t* g_epi_zmiss1;
static uint32_t* g_epi_idx1_block_bounds;
static uint32_t* g_epi_idx1_block_bounds16;
static double* g_epi_best_chisq1;
static uint32_t* g_epi_best_id1; // best partner ID
static uint32_t* g_epi_n_sig_ct1;
static uint32_t* g_epi_fail_ct1;
static uintptr_t* g_epi_geno2;
static uintptr_t* g_epi_zmiss2;
static uint32_t* g_epi_tot2;
static double* g_epi_boost_precalc2 = nullptr;
static double* g_epi_best_chisq2;
static uint32_t* g_epi_best_id2;
static uint32_t* g_epi_n_sig_ct2;
static uint32_t* g_epi_fail_ct2;
static double* g_epi_recip_cache;
// Scalar settings and loop state.
static uint32_t g_epi_thread_ct;
static uint32_t g_epi_case_ct;
static uint32_t g_epi_ctrl_ct;
static uint32_t g_epi_flag;
static uint32_t g_epi_cellmin;
static uintptr_t g_epi_marker_ct;
static uintptr_t g_epi_marker_idx1;
static uintptr_t g_epi_idx2_block_size;
static uintptr_t g_epi_idx2_block_start;
// Three-element threshold arrays (presumably squared alpha-derived cutoffs,
// judging by the names -- TODO confirm what the three slots index).
static double g_epi_alpha1sq[3];
static double g_epi_alpha2sq[3];
3428 
3429 // The following two functions are essentially ported from Statistics.cpp in
3430 // Richard Howey's CASSI software
3431 // (http://www.staff.ncl.ac.uk/richard.howey/cassi/index.html).  (CASSI is also
3432 // GPLv3-licensed; just remember to give credit to Howey if you redistribute a
3433 // variant of this code.  This would have been a friggin' nightmare to debug if
3434 // he hadn't already done all the real work.)
fepi_counts_to_stats(uint32_t * counts_3x3,uint32_t no_ueki,double * or_ptr,double * var_ptr)3435 static void fepi_counts_to_stats(uint32_t* counts_3x3, uint32_t no_ueki, double* or_ptr, double* var_ptr) {
3436   double c11 = (double)((int32_t)(4 * counts_3x3[0] + 2 * (counts_3x3[1] + counts_3x3[3]) + counts_3x3[4]));
3437   double c12 = (double)((int32_t)(4 * counts_3x3[2] + 2 * (counts_3x3[1] + counts_3x3[5]) + counts_3x3[4]));
3438   double c21 = (double)((int32_t)(4 * counts_3x3[6] + 2 * (counts_3x3[3] + counts_3x3[7]) + counts_3x3[4]));
3439   double c22 = (double)((int32_t)(4 * counts_3x3[8] + 2 * (counts_3x3[5] + counts_3x3[7]) + counts_3x3[4]));
3440   double rc11;
3441   double rc12;
3442   double rc21;
3443   double rc22;
3444   double dxx;
3445   uint32_t no_adj;
3446   if (!no_ueki) {
3447     // See AdjustedFastEpistasis::calculateLogOddsAdjustedVariance().
3448     no_adj = (counts_3x3[0] && counts_3x3[1] && counts_3x3[2] && counts_3x3[3] && counts_3x3[4] && counts_3x3[5] && counts_3x3[6] && counts_3x3[7] && counts_3x3[8]);
3449     if (!no_adj) {
3450       c11 += 4.5;
3451       c12 += 4.5;
3452       c21 += 4.5;
3453       c22 += 4.5;
3454     }
3455     rc11 = 1.0 / c11;
3456     rc12 = 1.0 / c12;
3457     rc21 = 1.0 / c21;
3458     rc22 = 1.0 / c22;
3459     *or_ptr = log(c11 * c22 * rc12 * rc21);
3460 
3461     c11 = rc11 - rc12; // bit2
3462     c12 = rc11 - rc21; // bit3
3463     dxx = rc11 - rc12 - rc21 + rc22; // bit5
3464     c21 = rc22 - rc12; // bit6
3465     c22 = rc22 - rc21; // bit8
3466 
3467     rc11 *= rc11;
3468     rc12 *= rc12;
3469     rc21 *= rc21;
3470     rc22 *= rc22;
3471     c11 *= c11;
3472     c12 *= c12;
3473     c21 *= c21;
3474     c22 *= c22;
3475     dxx *= dxx;
3476 
3477     if (no_adj) {
3478       *var_ptr = 4 * (4 * (rc11 * (double)((int32_t)counts_3x3[0]) +
3479 			   rc12 * (double)((int32_t)counts_3x3[2]) +
3480 			   rc21 * (double)((int32_t)counts_3x3[6]) +
3481 			   rc22 * (double)((int32_t)counts_3x3[8])) +
3482 		      c11 * (double)((int32_t)counts_3x3[1]) +
3483 		      c12 * (double)((int32_t)counts_3x3[3]) +
3484 		      c21 * (double)((int32_t)counts_3x3[5]) +
3485 		      c22 * (double)((int32_t)counts_3x3[7])) +
3486                  dxx * (double)((int32_t)counts_3x3[4]);
3487     } else {
3488       *var_ptr = 4 * (4 * (rc11 * ((double)((int32_t)counts_3x3[0]) + 0.5) +
3489 			   rc12 * ((double)((int32_t)counts_3x3[2]) + 0.5) +
3490 			   rc21 * ((double)((int32_t)counts_3x3[6]) + 0.5) +
3491 			   rc22 * ((double)((int32_t)counts_3x3[8]) + 0.5)) +
3492 		      c11 * ((double)((int32_t)counts_3x3[1]) + 0.5) +
3493 		      c12 * ((double)((int32_t)counts_3x3[3]) + 0.5) +
3494 		      c21 * ((double)((int32_t)counts_3x3[5]) + 0.5) +
3495 		      c22 * ((double)((int32_t)counts_3x3[7]) + 0.5)) +
3496                  dxx * ((double)((int32_t)counts_3x3[4]) + 0.5);
3497     }
3498   } else {
3499     rc11 = 1.0 / c11;
3500     rc12 = 1.0 / c12;
3501     rc21 = 1.0 / c21;
3502     rc22 = 1.0 / c22;
3503     *or_ptr = log(c11 * c22 * rc12 * rc21);
3504     *var_ptr = rc11 + rc12 + rc21 + rc22;
3505   }
3506 }
3507 
boost_calc_p_bc(uint32_t case0_ct,uint32_t case1_ct,uint32_t case2_ct,uint32_t ctrl0_ct,uint32_t ctrl1_ct,uint32_t ctrl2_ct,double * p_bc)3508 void boost_calc_p_bc(uint32_t case0_ct, uint32_t case1_ct, uint32_t case2_ct, uint32_t ctrl0_ct, uint32_t ctrl1_ct, uint32_t ctrl2_ct, double* p_bc) {
3509   double* recip_cache = g_epi_recip_cache;
3510   double tot_recip = recip_cache[case0_ct + case1_ct + case2_ct];
3511   p_bc[0] = ((int32_t)case0_ct) * tot_recip;
3512   p_bc[1] = ((int32_t)case1_ct) * tot_recip;
3513   p_bc[2] = ((int32_t)case2_ct) * tot_recip;
3514   tot_recip = recip_cache[ctrl0_ct + ctrl1_ct + ctrl2_ct];
3515   p_bc[3] = ((int32_t)ctrl0_ct) * tot_recip;
3516   p_bc[4] = ((int32_t)ctrl1_ct) * tot_recip;
3517   p_bc[5] = ((int32_t)ctrl2_ct) * tot_recip;
3518 }
3519 
boost_calc_p_ca(uint32_t case0_ct,uint32_t case1_ct,uint32_t case2_ct,uint32_t ctrl0_ct,uint32_t ctrl1_ct,uint32_t ctrl2_ct,double * p_ca,uint32_t * df_adj_ptr)3520 uint32_t boost_calc_p_ca(uint32_t case0_ct, uint32_t case1_ct, uint32_t case2_ct, uint32_t ctrl0_ct, uint32_t ctrl1_ct, uint32_t ctrl2_ct, double* p_ca, uint32_t* df_adj_ptr) {
3521   double* recip_cache = g_epi_recip_cache;
3522   uint32_t uii = case0_ct + ctrl0_ct;
3523   uint32_t df_adj = 0;
3524   double tot_recip;
3525   tot_recip = recip_cache[uii];
3526   if (!uii) {
3527     df_adj++;
3528   }
3529   p_ca[0] = ((int32_t)case0_ct) * tot_recip;
3530   p_ca[1] = ((int32_t)ctrl0_ct) * tot_recip;
3531   uii = case1_ct + ctrl1_ct;
3532   tot_recip = recip_cache[uii];
3533   if (!uii) {
3534     df_adj++;
3535   }
3536   p_ca[2] = ((int32_t)case1_ct) * tot_recip;
3537   p_ca[3] = ((int32_t)ctrl1_ct) * tot_recip;
3538   uii = case2_ct + ctrl2_ct;
3539   tot_recip = recip_cache[uii];
3540   if (!uii) {
3541     df_adj++;
3542   }
3543   p_ca[4] = ((int32_t)case2_ct) * tot_recip;
3544   p_ca[5] = ((int32_t)ctrl2_ct) * tot_recip;
3545   *df_adj_ptr = df_adj;
3546   return (df_adj > 1);
3547 }
3548 
// Two-stage BOOST interaction statistic for one pair of variants.
//
// counts:       18 cell counts, read as two 3x3 tables: counts[0..8] and
//               counts[9..17] (fast_epi_thread() fills the first from cases
//               and the second from controls).
// p_bc, p_ca:   6-entry proportion tables as produced by boost_calc_p_bc()
//               and boost_calc_p_ca().
// alpha1sq_ptr, alpha2sq_ptr: threshold arrays, indexed by the final df
//               adjustment.
// df_adj:       incoming df adjustment; incremented here if one of the three
//               pooled columns (counts[c], [c+3], [c+6] plus the same cells
//               of the second table) is entirely empty.
// chisq_ptr:    written ONLY when the first-stage value exceeds
//               alpha1sq_ptr[df_adj]; it then receives the second-stage value
//               with its lowest two bits overwritten by df_adj.
// sig_ct1_ptr, sig_ct2_ptr: both incremented when the returned value is
//               >= alpha2sq_ptr[df_adj].
//
// Returns NAN when two or more pooled columns are empty.  Otherwise returns
// the first-stage value if it does not exceed alpha1sq_ptr[df_adj], else the
// second-stage value, raised to alpha1sq_ptr[df_adj] if it fell below it.
double fepi_counts_to_boost_chisq(uint32_t* counts, double* p_bc, double* p_ca, double* alpha1sq_ptr, double* alpha2sq_ptr, uintptr_t df_adj, double* chisq_ptr, uint32_t* sig_ct1_ptr, uint32_t* sig_ct2_ptr) {
  // see BOOSTx64.c lines 625-903.
  double interaction_measure = 0.0;
  double tau = 0.0;
  double* recip_cache = g_epi_recip_cache;
  uint32_t* uiptr = counts;
  uint32_t sum = 0;
  uint32_t uoo = 0; // number of empty pooled columns seen so far
  double mu_xx[9]; // initially p_ab
  double mu_tmp[18];
  double mu0_tmp[18];
  double* dptr = mu_xx;
  double sum_recip;
  double dxx;
  double dyy;
  double mu_error;

  // dirty hack: encode df adjustment in low bits of *chisq_ptr
  // (NOTE(review): BYTECT is presumably the machine word size in bytes, so
  // ularr spans exactly one double -- confirm against its definition.)
  uintptr_t ularr[sizeof(double) / BYTECT];

  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  uint32_t umm;
  uint32_t unn;
  // Pool both tables and express each column's three cells as fractions of
  // the column total; also accumulate the grand total in 'sum'.
  for (uii = 0; uii < 3; uii++) {
    ujj = counts[uii] + counts[uii + 9];
    ukk = counts[uii + 3] + counts[uii + 12];
    umm = counts[uii + 6] + counts[uii + 15];
    unn = ujj + ukk + umm;
    if (!unn) {
      if (uoo++) {
	return NAN;
      }
      df_adj++;
    }
    sum += unn;
    dxx = recip_cache[unn];
    *dptr++ = ((int32_t)ujj) * dxx;
    *dptr++ = ((int32_t)ukk) * dxx;
    *dptr++ = ((int32_t)umm) * dxx;
  }
  // First stage: walk the 18 cells in storage order (uiptr), forming for each
  // the product of the three proportion tables; tau accumulates the products
  // and interaction_measure accumulates the count-weighted log terms.  The
  // three stanzas below handle the three columns of one table row.
  for (ukk = 0; ukk < 2; ukk++) {
    for (uii = 0; uii < 3; uii++) {
      dyy = p_ca[2 * uii + ukk];
      dptr = &(p_bc[3 * ukk]);
      dxx = mu_xx[uii] * (*dptr++) * dyy;
      tau += dxx;
      umm = *uiptr++;
      if (umm) {
	if (dxx != 0.0) {
	  //   Cx * log(Cx / y)
	  // = Cx * (log(C) + log(x / y))
	  // = Cx * log(C) + Cx * log(x / y)

	  // caching entropy as well would merely reduce a multiplication to
	  // an addition, which is almost certainly not worth the cost
	  interaction_measure -= ((int32_t)umm) * log(dxx * recip_cache[umm]);
	} else {
	  dxx = (double)((int32_t)umm);
	  interaction_measure += dxx * log(dxx);
	}
      }
      dxx = mu_xx[uii + 3] * (*dptr++) * dyy;
      tau += dxx;
      umm = *uiptr++;
      if (umm) {
	if (dxx != 0.0) {
	  interaction_measure -= ((int32_t)umm) * log(dxx * recip_cache[umm]);
	} else {
	  dxx = (double)((int32_t)umm);
	  interaction_measure += dxx * log(dxx);
	}
      }
      dxx = mu_xx[uii + 6] * (*dptr++) * dyy;
      tau += dxx;
      umm = *uiptr++;
      if (umm) {
	if (dxx != 0.0) {
	  interaction_measure -= ((int32_t)umm) * log(dxx * recip_cache[umm]);
	} else {
	  dxx = (double)((int32_t)umm);
	  interaction_measure += dxx * log(dxx);
	}
      }
    }
  }
  // interaction_measure = interaction_measure / sum - log(sum);
  // interaction_measure = (interaction_measure + log(tau)) * sum * 2;
  sum_recip = recip_cache[sum];
  interaction_measure = 2 * (interaction_measure + ((int32_t)sum) * log(tau * sum_recip));
  // > instead of >= for maximum compatibility, I guess
  if (interaction_measure > alpha1sq_ptr[df_adj]) {
    // Second stage: starting from all-ones, repeatedly rescale the 18 fitted
    // values mu_tmp[row * 6 + col * 2 + group] so that each of their three
    // sets of two-way margins matches the observed one in turn, until one
    // full pass changes the values by at most 0.001 in total (L1 distance).
    for (uii = 0; uii < 18; uii++) {
      mu_tmp[uii] = 1.0;
    }
    do {
      memcpy(mu0_tmp, mu_tmp, 18 * sizeof(double));
      dptr = mu_xx; // mu_ij
      for (uii = 0; uii < 18; uii += 2) {
        *dptr++ = mu_tmp[uii] + mu_tmp[uii + 1];
      }
      dptr = mu_tmp;
      for (uii = 0; uii < 9; uii++) {
	dxx = mu_xx[uii];
	if (dxx != 0.0) {
	  dxx = (double)((int32_t)(counts[uii] + counts[uii + 9])) / dxx;
	}
	*dptr *= dxx;
	dptr++;
	*dptr *= dxx;
	dptr++;
      }
      dptr = mu_xx; // mu_ik
      for (uii = 0; uii < 18; uii += 6) {
	for (ukk = uii; ukk < uii + 2; ukk++) {
          *dptr++ = mu_tmp[ukk] + mu_tmp[ukk + 2] + mu_tmp[ukk + 4];
	}
      }
      for (uii = 0; uii < 3; uii++) {
	for (ukk = 0; ukk < 2; ukk++) {
	  dxx = mu_xx[uii * 2 + ukk];
          if (dxx != 0.0) {
            dxx = ((double)((int32_t)(counts[ukk * 9 + uii * 3] + counts[ukk * 9 + uii * 3 + 1] + counts[ukk * 9 + uii * 3 + 2]))) / dxx;
	  }
	  mu_tmp[uii * 6 + ukk] *= dxx;
	  mu_tmp[uii * 6 + ukk + 2] *= dxx;
	  mu_tmp[uii * 6 + ukk + 4] *= dxx;
	}
      }
      dptr = mu_xx; // mu_jk
      for (uii = 0; uii < 6; uii++) {
        *dptr = mu_tmp[uii] + mu_tmp[uii + 6] + mu_tmp[uii + 12];
	dptr++;
      }
      for (ujj = 0; ujj < 3; ujj++) {
	for (ukk = 0; ukk < 2; ukk++) {
	  dxx = mu_xx[ujj * 2 + ukk];
          if (dxx != 0.0) {
	    dxx = ((double)((int32_t)(counts[ukk * 9 + ujj] + counts[ukk * 9 + ujj + 3] + counts[ukk * 9 + ujj + 6]))) / dxx;
	  }
          mu_tmp[ujj * 2 + ukk] *= dxx;
          mu_tmp[ujj * 2 + ukk + 6] *= dxx;
          mu_tmp[ujj * 2 + ukk + 12] *= dxx;
	}
      }
      mu_error = 0.0;
      for (uii = 0; uii < 18; uii++) {
        mu_error += fabs(mu_tmp[uii] - mu0_tmp[uii]);
      }
    } while (mu_error > 0.001);
    // Recompute the measure from the observed proportions (dxx) against the
    // fitted proportions (dyy).
    tau = 0.0;
    interaction_measure = 0.0;
    uiptr = counts;
    for (ukk = 0; ukk < 2; ukk++) {
      for (uii = 0; uii < 3; uii++) {
	for (ujj = 0; ujj < 3; ujj++) {
	  dxx = ((double)((int32_t)(*uiptr++))) * sum_recip;
	  dyy = mu_tmp[uii * 6 + ujj * 2 + ukk] * sum_recip;
	  if (dxx != 0.0) {
	    if (dyy != 0.0) {
	      interaction_measure += dxx * log(dxx / dyy);
	    } else {
              interaction_measure += dxx * log(dxx);
	    }
	  }
	  tau += dyy;
	}
      }
    }
    interaction_measure = (interaction_measure + log(tau)) * ((int32_t)(sum * 2));
    // The stored copy gets its low bits replaced; the local value (and hence
    // the return value) keeps full precision.
    memcpy(ularr, &interaction_measure, sizeof(double));
    // save df_adj in low two bits
    ularr[0] &= ~(3 * ONELU);
    ularr[0] |= df_adj;
    memcpy(chisq_ptr, ularr, sizeof(double));
    // never report less than the threshold that triggered the second stage
    if (interaction_measure < alpha1sq_ptr[df_adj]) {
      interaction_measure = alpha1sq_ptr[df_adj];
    }
  }
  if (interaction_measure >= alpha2sq_ptr[df_adj]) {
    *sig_ct1_ptr += 1;
    *sig_ct2_ptr += 1;
  }
  return interaction_measure;
}
3735 
fast_epi_thread(void * arg)3736 THREAD_RET_TYPE fast_epi_thread(void* arg) {
3737   uintptr_t tidx = (uintptr_t)arg;
3738   uintptr_t block_idx1_start = g_epi_idx1_block_bounds[tidx];
3739   uintptr_t block_idx1_end = g_epi_idx1_block_bounds[tidx + 1];
3740   uintptr_t idx1_block_start16 = g_epi_idx1_block_bounds16[tidx];
3741   uintptr_t marker_idx1 = g_epi_marker_idx1 + block_idx1_start;
3742   uintptr_t marker_ct = g_epi_marker_ct;
3743   uint32_t case_ct = g_epi_case_ct;
3744   uint32_t ctrl_ct = g_epi_ctrl_ct;
3745   uint32_t case_ctv3 = BITCT_TO_ALIGNED_WORDCT(case_ct);
3746   uint32_t ctrl_ctv3 = BITCT_TO_ALIGNED_WORDCT(ctrl_ct);
3747   uint32_t case_ctsplit = 3 * case_ctv3;
3748   uint32_t ctrl_ctsplit = 3 * ctrl_ctv3;
3749   uint32_t tot_ctsplit = case_ctsplit + ctrl_ctsplit;
3750   uint32_t is_case_only = (g_epi_flag / EPI_FAST_CASE_ONLY) & 1;
3751   uint32_t group_ct = 2 - is_case_only;
3752   uint32_t tot_stride = group_ct * 3;
3753   uint32_t no_ueki = (g_epi_flag / EPI_FAST_NO_UEKI) & 1;
3754   uint32_t is_boost = (g_epi_flag / EPI_FAST_BOOST) & 1;
3755   uint32_t do_joint_effects = (g_epi_flag / EPI_FAST_JOINT_EFFECTS) & 1;
3756   uint32_t cellmin = g_epi_cellmin;
3757   uint32_t best_id_fixed = 0;
3758   uint32_t is_first_half = 0;
3759   uintptr_t* geno1 = g_epi_geno1;
3760   uintptr_t* zmiss1 = g_epi_zmiss1;
3761   uintptr_t* cur_geno1 = nullptr;
3762   uintptr_t* cur_geno1_ctrls = nullptr;
3763   double* cur_boost_precalc2 = nullptr;
3764   double* p_bc_ptr = nullptr;
3765   uint32_t* geno1_offsets = g_epi_geno1_offsets;
3766   uint32_t* best_id1 = &(g_epi_best_id1[idx1_block_start16]);
3767   double* alpha1sq_ptr = g_epi_alpha1sq;
3768   double* alpha2sq_ptr = g_epi_alpha2sq;
3769   double alpha1sq = alpha1sq_ptr[0];
3770   double alpha2sq = alpha2sq_ptr[0];
3771   double ctrl_var = 0;
3772   uint32_t tot1[6];
3773   uint32_t counts[18];
3774   double p_bc_tmp[6];
3775   double p_ca_fixed[6];
3776   double p_ca_tmp[6];
3777   uintptr_t* geno2;
3778   uintptr_t* zmiss2;
3779   uintptr_t* cur_geno2;
3780   double* all_chisq_write;
3781   double* chisq2_ptr;
3782   double* boost_precalc2;
3783   double* all_chisq;
3784   double* best_chisq1;
3785   double* best_chisq2;
3786   double* p_ca_ptr;
3787   uint32_t* n_sig_ct1;
3788   uint32_t* fail_ct1;
3789   uint32_t* best_id2;
3790   uint32_t* n_sig_ct2;
3791   uint32_t* fail_ct2;
3792   uint32_t* tot2;
3793   uint32_t* cur_tot2;
3794   uintptr_t idx2_block_size;
3795   uintptr_t cur_idx2_block_size;
3796   uintptr_t idx2_block_start;
3797   uintptr_t idx2_block_end;
3798   uintptr_t idx2_block_sizea16;
3799   uintptr_t block_idx1;
3800   uintptr_t block_delta1;
3801   uintptr_t block_idx2;
3802   uintptr_t cur_zmiss2;
3803   uintptr_t cur_zmiss2_tmp;
3804   uintptr_t ulii;
3805   double best_chisq_fixed;
3806   double case_var;
3807   double ctrl_or;
3808   double dxx;
3809   double zsq;
3810   uint32_t nm_case_fixed;
3811   uint32_t nm_ctrl_fixed;
3812   uint32_t nm_fixed;
3813   uint32_t n_sig_ct_fixed;
3814   uint32_t fail_ct_fixed;
3815   uint32_t df_adj_base;
3816   uint32_t df_adj;
3817   tot1[3] = 0; // suppress warning
3818   tot1[4] = 0;
3819   tot1[5] = 0;
3820   while (1) {
3821     idx2_block_size = g_epi_idx2_block_size;
3822     cur_idx2_block_size = idx2_block_size;
3823     idx2_block_start = g_epi_idx2_block_start;
3824     idx2_block_end = idx2_block_start + idx2_block_size;
3825     idx2_block_sizea16 = round_up_pow2(idx2_block_size, 16);
3826     geno2 = g_epi_geno2;
3827     zmiss2 = g_epi_zmiss2;
3828     tot2 = g_epi_tot2;
3829     boost_precalc2 = g_epi_boost_precalc2;
3830     all_chisq = &(g_epi_all_chisq[idx2_block_start]);
3831     best_chisq1 = &(g_epi_best_chisq1[idx1_block_start16]);
3832     best_chisq2 = &(g_epi_best_chisq2[tidx * idx2_block_sizea16]);
3833     n_sig_ct1 = &(g_epi_n_sig_ct1[idx1_block_start16]);
3834     fail_ct1 = &(g_epi_fail_ct1[idx1_block_start16]);
3835     best_id2 = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
3836     n_sig_ct2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16]);
3837     fail_ct2 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16]);
3838     for (block_idx1 = block_idx1_start; block_idx1 < block_idx1_end; block_idx1++, marker_idx1++) {
3839       ulii = geno1_offsets[2 * block_idx1];
3840       if (ulii > idx2_block_start) {
3841 	block_idx2 = 0;
3842 	cur_idx2_block_size = ulii - idx2_block_start;
3843 	if (cur_idx2_block_size >= idx2_block_size) {
3844 	  cur_idx2_block_size = idx2_block_size;
3845 	} else {
3846 	  is_first_half = 1;
3847 	}
3848       } else {
3849 	ulii = geno1_offsets[2 * block_idx1 + 1];
3850 	if (ulii >= idx2_block_end) {
3851 	  // may not be done in set1 x all or set1 x set2 cases
3852 	  continue;
3853 	} else {
3854 	  if (ulii <= idx2_block_start) {
3855 	    block_idx2 = 0;
3856 	  } else {
3857 	    block_idx2 = ulii - idx2_block_start;
3858 	  }
3859 	}
3860       }
3861       cur_geno1 = &(geno1[block_idx1 * tot_ctsplit]);
3862       n_sig_ct_fixed = 0;
3863       fail_ct_fixed = 0;
3864       nm_case_fixed = is_set_ul(zmiss1, block_idx1 * 2);
3865       nm_ctrl_fixed = is_set_ul(zmiss1, block_idx1 * 2 + 1);
3866       nm_fixed = nm_case_fixed & nm_ctrl_fixed;
3867       tot1[0] = popcount_longs(cur_geno1, case_ctv3);
3868       tot1[1] = popcount_longs(&(cur_geno1[case_ctv3]), case_ctv3);
3869       tot1[2] = popcount_longs(&(cur_geno1[2 * case_ctv3]), case_ctv3);
3870       if (!is_case_only) {
3871 	cur_geno1_ctrls = &(cur_geno1[case_ctsplit]);
3872 	tot1[3] = popcount_longs(cur_geno1_ctrls, ctrl_ctv3);
3873 	tot1[4] = popcount_longs(&(cur_geno1_ctrls[ctrl_ctv3]), ctrl_ctv3);
3874 	tot1[5] = popcount_longs(&(cur_geno1_ctrls[2 * ctrl_ctv3]), ctrl_ctv3);
3875 	if (is_boost) {
3876 	  if (nm_fixed) {
3877 	    cur_boost_precalc2 = &(boost_precalc2[block_idx2 * 6]);
3878 	  } else {
3879 	    p_bc_ptr = p_bc_tmp;
3880 	  }
3881 	  boost_calc_p_ca(tot1[0], tot1[1], tot1[2], tot1[3], tot1[4], tot1[5], p_ca_fixed, &df_adj_base);
3882 	}
3883       }
3884       block_delta1 = block_idx1 - block_idx1_start;
3885       best_chisq_fixed = best_chisq1[block_delta1];
3886       all_chisq_write = &(all_chisq[block_idx1 * marker_ct]);
3887     fast_epi_thread_second_half:
3888       cur_geno2 = &(geno2[block_idx2 * tot_ctsplit]);
3889       chisq2_ptr = &(best_chisq2[block_idx2]);
3890       for (; block_idx2 < cur_idx2_block_size; block_idx2++, chisq2_ptr++, cur_geno2 = &(cur_geno2[tot_ctsplit])) {
3891 	cur_tot2 = &(tot2[block_idx2 * tot_stride]);
3892 	// this operation isn't extracting a 2-bit genotype, so don't use the
3893 	// macro
3894 	cur_zmiss2 = (zmiss2[block_idx2 / BITCT2] >> (2 * (block_idx2 % BITCT2))) & 3;
3895 	cur_zmiss2_tmp = cur_zmiss2 & 1;
3896 	if (nm_case_fixed) {
3897 	  two_locus_count_table_zmiss1(cur_geno1, cur_geno2, counts, case_ctv3, cur_zmiss2_tmp);
3898 	  if (cur_zmiss2_tmp) {
3899 	    counts[2] = tot1[0] - counts[0] - counts[1];
3900 	    counts[5] = tot1[1] - counts[3] - counts[4];
3901 	  }
3902 	  counts[6] = cur_tot2[0] - counts[0] - counts[3];
3903 	  counts[7] = cur_tot2[1] - counts[1] - counts[4];
3904 	  counts[8] = cur_tot2[2] - counts[2] - counts[5];
3905 	} else {
3906 	  two_locus_count_table(cur_geno1, cur_geno2, counts, case_ctv3, cur_zmiss2_tmp);
3907 	  if (cur_zmiss2_tmp) {
3908 	    counts[2] = tot1[0] - counts[0] - counts[1];
3909 	    counts[5] = tot1[1] - counts[3] - counts[4];
3910 	    counts[8] = tot1[2] - counts[6] - counts[7];
3911 	  }
3912 	}
3913 	if (!is_case_only) {
3914 	  cur_zmiss2_tmp = cur_zmiss2 >> 1;
3915 	  if (nm_ctrl_fixed) {
3916 	    two_locus_count_table_zmiss1(cur_geno1_ctrls, &(cur_geno2[case_ctsplit]), &(counts[9]), ctrl_ctv3, cur_zmiss2_tmp);
3917 	    if (cur_zmiss2_tmp) {
3918 	      counts[11] = tot1[3] - counts[9] - counts[10];
3919 	      counts[14] = tot1[4] - counts[12] - counts[13];
3920 	    }
3921 	    counts[15] = cur_tot2[3] - counts[9] - counts[12];
3922 	    counts[16] = cur_tot2[4] - counts[10] - counts[13];
3923 	    counts[17] = cur_tot2[5] - counts[11] - counts[14];
3924 	  } else {
3925 	    two_locus_count_table(cur_geno1_ctrls, &(cur_geno2[case_ctsplit]), &(counts[9]), ctrl_ctv3, cur_zmiss2_tmp);
3926 	    if (cur_zmiss2_tmp) {
3927 	      counts[11] = tot1[3] - counts[9] - counts[10];
3928 	      counts[14] = tot1[4] - counts[12] - counts[13];
3929 	      counts[17] = tot1[5] - counts[15] - counts[16];
3930 	    }
3931 	  }
3932 	}
3933 	if (!is_boost) {
3934 	  if (!do_joint_effects) {
3935 	    fepi_counts_to_stats(counts, no_ueki, &dxx, &case_var);
3936 	    if (!is_case_only) {
3937 	      fepi_counts_to_stats(&(counts[9]), no_ueki, &ctrl_or, &ctrl_var);
3938 	      dxx -= ctrl_or;
3939 	    }
3940 	  } else {
3941 	    if (cellmin) {
3942 	      if ((counts[0] < cellmin) || (counts[1] < cellmin) || (counts[2] < cellmin) || (counts[3] < cellmin) || (counts[4] < cellmin) || (counts[5] < cellmin) || (counts[6] < cellmin) || (counts[7] < cellmin) || (counts[8] < cellmin)) {
3943 		goto fast_epi_thread_fail;
3944 	      }
3945 	      if (!is_case_only) {
3946 		if ((counts[9] < cellmin) || (counts[10] < cellmin) || (counts[11] < cellmin) || (counts[12] < cellmin) || (counts[13] < cellmin) || (counts[14] < cellmin) || (counts[15] < cellmin) || (counts[16] < cellmin) || (counts[17] < cellmin)) {
3947 		  goto fast_epi_thread_fail;
3948 		}
3949 	      }
3950 	    }
3951 	    fepi_counts_to_joint_effects_stats(group_ct, counts, &dxx, &case_var, &ctrl_var);
3952 	  }
3953 	  zsq = dxx * dxx / (case_var + ctrl_var);
3954 	  if (!realnum(zsq)) {
3955 	    goto fast_epi_thread_fail;
3956 	  }
3957 	  if (zsq >= alpha1sq) {
3958 	    all_chisq_write[block_idx2] = zsq;
3959 	  }
3960 	  if (zsq >= alpha2sq) {
3961 	    n_sig_ct_fixed++;
3962 	    n_sig_ct2[block_idx2] += 1;
3963 	  }
3964 	fast_epi_thread_boost_save:
3965 	  if (zsq > best_chisq_fixed) {
3966 	    best_chisq_fixed = zsq;
3967 	    best_id_fixed = block_idx2 + idx2_block_start;
3968 	  }
3969 	  dxx = *chisq2_ptr;
3970 	  if (zsq > dxx) {
3971 	    *chisq2_ptr = zsq;
3972 	    best_id2[block_idx2] = marker_idx1;
3973 	  }
3974 	} else {
3975 	  if (nm_fixed) {
3976 	    p_bc_ptr = cur_boost_precalc2;
3977 	    cur_boost_precalc2 = &(cur_boost_precalc2[6]);
3978 	  } else {
3979 	    boost_calc_p_bc(counts[0] + counts[3] + counts[6], counts[1] + counts[4] + counts[7], counts[2] + counts[5] + counts[8], counts[9] + counts[12] + counts[15], counts[10] + counts[13] + counts[16], counts[11] + counts[14] + counts[17], p_bc_ptr);
3980 	  }
3981 	  if (cur_zmiss2 == 3) {
3982 	    p_ca_ptr = p_ca_fixed;
3983 	    df_adj = df_adj_base;
3984 	  } else {
3985 	    if (boost_calc_p_ca(counts[0] + counts[1] + counts[2], counts[3] + counts[4] + counts[5], counts[6] + counts[7] + counts[8], counts[9] + counts[10] + counts[11], counts[12] + counts[13] + counts[14], counts[15] + counts[16] + counts[17], p_ca_tmp, &df_adj)) {
3986 	      goto fast_epi_thread_fail;
3987 	    }
3988 	    p_ca_ptr = p_ca_tmp;
3989 	  }
3990 
3991 	  // if approximate zsq >= epi1 threshold but more accurate value is
3992 	  // not, we still want to save the more accurate value
3993 	  // also, we want epi2 counting to be df-sensitive
3994 	  // (punt on df/best_chisq for now)
3995 	  zsq = fepi_counts_to_boost_chisq(counts, p_bc_ptr, p_ca_ptr, alpha1sq_ptr, alpha2sq_ptr, df_adj, &(all_chisq_write[block_idx2]), &n_sig_ct_fixed, &(n_sig_ct2[block_idx2]));
3996 	  if (realnum(zsq)) {
3997 	    goto fast_epi_thread_boost_save;
3998 	  }
3999 	fast_epi_thread_fail:
4000 	  fail_ct_fixed++;
4001 	  fail_ct2[block_idx2] += 1;
4002 	  if (alpha1sq == 0.0) {
4003 	    // special case: log NA when '--epi1 1' specified
4004 	    all_chisq_write[block_idx2] = NAN;
4005 	  }
4006 	}
4007       }
4008       if (is_first_half) {
4009 	is_first_half = 0;
4010 	ulii = geno1_offsets[2 * block_idx1 + 1];
4011 	cur_idx2_block_size = idx2_block_size;
4012 	if (ulii < idx2_block_end) {
4013 	  // guaranteed to be larger than idx2_block_start, otherwise there
4014 	  // would have been no first half
4015 	  block_idx2 = ulii - idx2_block_start;
4016 	  if (is_boost && nm_fixed) {
4017 	    cur_boost_precalc2 = &(boost_precalc2[block_idx2 * 6]);
4018 	  }
4019 	  goto fast_epi_thread_second_half;
4020 	}
4021       }
4022       if (best_chisq_fixed > best_chisq1[block_delta1]) {
4023 	best_chisq1[block_delta1] = best_chisq_fixed;
4024 	best_id1[block_delta1] = best_id_fixed;
4025       }
4026       n_sig_ct1[block_delta1] = n_sig_ct_fixed;
4027       if (fail_ct_fixed) {
4028 	fail_ct1[block_delta1] = fail_ct_fixed;
4029       }
4030     }
4031     if ((!tidx) || g_is_last_thread_block) {
4032       THREAD_RETURN;
4033     }
4034     THREAD_BLOCK_FINISH(tidx);
4035   }
4036 }
4037 
4038 // epistasis linear/logistic regression multithread globals
4039 
// Inputs shared by the regression worker threads.  epi_linear_thread() reads
// g_epi_phenogeno1[block_idx1] as that idx1 entry's base "sum_a_pheno" value
// and g_epi_genosums1[2 * block_idx1] / [2 * block_idx1 + 1] as its base
// sum_a / sum_aa values.
// NOTE(review): the *2 arrays are presumably the idx2-block counterparts;
// their indexing is not visible in this part of the file -- confirm.
static double* g_epi_pheno_d2;
static double* g_epi_phenogeno1;
static double* g_epi_phenogeno2;
static uint32_t* g_epi_genosums1;
static uint32_t* g_epi_genosums2;
static double g_epi_pheno_sum;
static double g_epi_pheno_ssq;
static double g_epi_vif_thresh;

// epi_linear_thread() derives the per-entry word stride of the genotype
// arrays from this via QUATERCT_TO_WORDCT().
static uint32_t g_epi_pheno_nm_ct;
4050 
// Bundle of single-precision work-buffer pointers for the epistasis logistic
// regression code.
// NOTE(review): presumably one instance per worker thread (going by the
// struct name); buffer sizes and layouts are established outside this part of
// the file -- confirm before relying on them.
typedef struct epi_logistic_multithread_struct {
  float* cur_covars_cov_major;
  float* coef;
  float* pp;
  float* sample_1d_buf;
  float* pheno_buf;
  float* param_1d_buf;
  float* param_1d_buf2;
  float* param_2d_buf;
  float* param_2d_buf2;
} Epi_logistic_multithread;
4062 
// Handle to the Epi_logistic_multithread work-buffer bundle(s).
static Epi_logistic_multithread* g_epi_logistic_mt;
// NOTE(review): going by the names, a case/control phenotype bitarray and
// single-precision counterparts of g_epi_all_chisq / g_epi_best_chisq1 /
// g_epi_best_chisq2; none of them is used in this part of the file -- confirm.
static uintptr_t* g_epi_pheno_c;
static float* g_epi_all_chisq_f;
static float* g_epi_best_chisq_f1;
static float* g_epi_best_chisq_f2;
4068 
matrix_invert_4x4symm(double * dmatrix)4069 uint32_t matrix_invert_4x4symm(double* dmatrix) {
4070   double buf[16];
4071   double determinant;
4072   // initially, dww = A_{22}A_{34} - A_{23}A_{24}
4073   //            dxx = A_{23}A_{34} - A_{24}A_{33}
4074   //            dyy = A_{23}A_{44} - A_{24}A_{34}
4075   //            dzz = A_{33}A_{44} - A_{34}A_{34}
4076   double dww = dmatrix[5] * dmatrix[11] - dmatrix[6] * dmatrix[7];
4077   double dxx = dmatrix[6] * dmatrix[11] - dmatrix[7] * dmatrix[10];
4078   double dyy = dmatrix[6] * dmatrix[15] - dmatrix[7] * dmatrix[11];
4079   double dzz = dmatrix[10] * dmatrix[15] - dmatrix[11] * dmatrix[11];
4080   double dvv;
4081   double duu;
4082   buf[0] = dmatrix[5] * dzz
4083          - dmatrix[6] * dyy
4084          + dmatrix[7] * dxx;
4085   buf[1] = dmatrix[2] * dyy
4086          - dmatrix[1] * dzz
4087          - dmatrix[3] * dxx;
4088   buf[2] = dmatrix[1] * dyy
4089          + dmatrix[2] * (dmatrix[7] * dmatrix[7] - dmatrix[5] * dmatrix[15])
4090          + dmatrix[3] * dww;
4091   duu = dmatrix[5] * dmatrix[10] - dmatrix[6] * dmatrix[6];
4092   buf[3] = dmatrix[2] * dww
4093          - dmatrix[1] * dxx
4094          - dmatrix[3] * duu;
4095   determinant = dmatrix[0] * buf[0] + dmatrix[1] * buf[1] + dmatrix[2] * buf[2] + dmatrix[3] * buf[3];
4096   if (fabs(determinant) < EPSILON) {
4097     return 1;
4098   }
4099   buf[5] = dmatrix[0] * dzz
4100          + dmatrix[2] * (dmatrix[3] * dmatrix[11] - dmatrix[2] * dmatrix[15])
4101          + dmatrix[3] * (dmatrix[2] * dmatrix[11] - dmatrix[3] * dmatrix[10]);
4102   dzz = dmatrix[1] * dmatrix[15] - dmatrix[3] * dmatrix[7];
4103   buf[6] = dmatrix[2] * dzz
4104          - dmatrix[0] * dyy
4105          + dmatrix[3] * (dmatrix[3] * dmatrix[6] - dmatrix[1] * dmatrix[11]);
4106   dyy = dmatrix[1] * dmatrix[11] - dmatrix[2] * dmatrix[7];
4107   dvv = dmatrix[1] * dmatrix[10] - dmatrix[2] * dmatrix[6];
4108   buf[7] = dmatrix[0] * dxx
4109          - dmatrix[2] * dyy
4110          + dmatrix[3] * dvv;
4111   buf[10] = dmatrix[0] * (dmatrix[5] * dmatrix[15] - dmatrix[7] * dmatrix[7])
4112           - dmatrix[1] * dzz
4113           + dmatrix[3] * (dmatrix[1] * dmatrix[7] - dmatrix[3] * dmatrix[5]);
4114   dxx = dmatrix[1] * dmatrix[6] - dmatrix[2] * dmatrix[5];
4115   buf[11] = dmatrix[1] * dyy
4116           - dmatrix[0] * dww
4117           - dmatrix[3] * dxx;
4118   buf[15] = dmatrix[0] * duu
4119           - dmatrix[1] * dvv
4120           + dmatrix[2] * dxx;
4121   determinant = 1.0 / determinant; // now reciprocal
4122   dmatrix[0] = buf[0] * determinant;
4123   dmatrix[1] = buf[1] * determinant;
4124   dmatrix[2] = buf[2] * determinant;
4125   dmatrix[3] = buf[3] * determinant;
4126   dmatrix[4] = dmatrix[1];
4127   dmatrix[5] = buf[5] * determinant;
4128   dmatrix[6] = buf[6] * determinant;
4129   dmatrix[7] = buf[7] * determinant;
4130   dmatrix[8] = dmatrix[2];
4131   dmatrix[9] = dmatrix[6];
4132   dmatrix[10] = buf[10] * determinant;
4133   dmatrix[11] = buf[11] * determinant;
4134   dmatrix[12] = dmatrix[3];
4135   dmatrix[13] = dmatrix[7];
4136   dmatrix[14] = dmatrix[11];
4137   dmatrix[15] = buf[15] * determinant;
4138   return 0;
4139 }
4140 
epi_linear_thread(void * arg)4141 THREAD_RET_TYPE epi_linear_thread(void* arg) {
4142   uintptr_t tidx = (uintptr_t)arg;
4143   uintptr_t block_idx1_start = g_epi_idx1_block_bounds[tidx];
4144   uintptr_t block_idx1_end = g_epi_idx1_block_bounds[tidx + 1];
4145   uintptr_t idx1_block_start16 = g_epi_idx1_block_bounds16[tidx];
4146   uintptr_t marker_idx1 = g_epi_marker_idx1 + block_idx1_start;
4147   uintptr_t marker_ct = g_epi_marker_ct;
4148   double alpha1sq = g_epi_alpha1sq[0];
4149   double alpha2sq = g_epi_alpha2sq[0];
4150   double pheno_sum = g_epi_pheno_sum;
4151   double pheno_ssq = g_epi_pheno_ssq;
4152   double vif_thresh = g_epi_vif_thresh;
4153   uint32_t pheno_nm_ct = g_epi_pheno_nm_ct;
4154   uint32_t best_id_fixed = 0;
4155   uint32_t is_first_half = 0;
4156   uintptr_t pheno_nm_ctl2 = QUATERCT_TO_WORDCT(pheno_nm_ct);
4157   uintptr_t* geno1 = g_epi_geno1;
4158   double* pheno_d2 = g_epi_pheno_d2;
4159   uint32_t* geno1_offsets = g_epi_geno1_offsets;
4160   uint32_t* best_id1 = &(g_epi_best_id1[idx1_block_start16]);
4161   const double dconst[] = {1.0, 2.0, 2.0, 4.0};
4162   double dmatrix_buf[16];
4163   double dmatrix_buf2[4];
4164 
4165   // sum(aa), sum(ab), sum(bb), sum(aab), sum(abb), and sum(aabb) can all be
4166   // derived from these four quantities.
4167   uint32_t cur_minor_cts[4]; // 11, 12, 21, 22
4168 
4169   uintptr_t* cur_geno1;
4170   uintptr_t* geno2;
4171   uintptr_t* cur_geno2;
4172   double* phenogeno1;
4173   double* phenogeno2;
4174   double* all_chisq_write;
4175   double* chisq2_ptr;
4176   double* all_chisq;
4177   double* best_chisq1;
4178   double* best_chisq2;
4179   double* dptr;
4180   double* dptr2;
4181   uint32_t* n_sig_ct1;
4182   uint32_t* fail_ct1;
4183   uint32_t* best_id2;
4184   uint32_t* n_sig_ct2;
4185   uint32_t* fail_ct2;
4186   uint32_t* genosums1;
4187   uint32_t* genosums2;
4188   uintptr_t idx2_block_size;
4189   uintptr_t cur_idx2_block_size;
4190   uintptr_t idx2_block_start;
4191   uintptr_t idx2_block_end;
4192   uintptr_t idx2_block_sizea16;
4193   uintptr_t block_idx1;
4194   uintptr_t block_delta1;
4195   uintptr_t block_idx2;
4196   uintptr_t cur_word1;
4197   uintptr_t cur_word2;
4198   uintptr_t active_mask;
4199   uintptr_t param_idx;
4200   uintptr_t param_idx2;
4201   uintptr_t cur_sum_aab;
4202   uintptr_t cur_sum_abb;
4203   uintptr_t cur_sum_aabb;
4204   uintptr_t ulii;
4205   uintptr_t uljj;
4206   double best_chisq_fixed;
4207   double sum_a_pheno_base;
4208   double cur_pheno_sum;
4209   double cur_pheno_ssq;
4210   double cur_sum_a_pheno;
4211   double cur_sum_b_pheno;
4212   double cur_sum_ab_pheno;
4213   double sample_ctd;
4214   double sample_ct_recip;
4215   double sample_ct_m1_recip;
4216   double cur_sum_ad;
4217   double cur_sum_bd;
4218   double cur_sum_abd;
4219   double determinant;
4220   double min_sigma;
4221   double sigma;
4222   double dxx;
4223   double dyy;
4224   double dzz;
4225   double dww;
4226   double dvv;
4227   double duu;
4228   double zsq;
4229   uint32_t n_sig_ct_fixed;
4230   uint32_t fail_ct_fixed;
4231 
4232   uint32_t sum_a_base;
4233   uint32_t sum_aa_base;
4234   uint32_t cur_sum_a;
4235   uint32_t cur_sum_aa;
4236   uint32_t cur_sum_b;
4237   uint32_t cur_sum_bb;
4238   uint32_t cur_sum_ab;
4239   uint32_t widx;
4240   uint32_t sample_idx;
4241   uint32_t cur_sample_ct;
4242   uint32_t woffset;
4243   while (1) {
4244     idx2_block_size = g_epi_idx2_block_size;
4245     cur_idx2_block_size = idx2_block_size;
4246     idx2_block_start = g_epi_idx2_block_start;
4247     idx2_block_end = idx2_block_start + idx2_block_size;
4248     idx2_block_sizea16 = round_up_pow2(idx2_block_size, 16);
4249     geno2 = g_epi_geno2;
4250     phenogeno1 = g_epi_phenogeno1;
4251     phenogeno2 = g_epi_phenogeno2;
4252     genosums1 = g_epi_genosums1;
4253     genosums2 = g_epi_genosums2;
4254     all_chisq = &(g_epi_all_chisq[2 * idx2_block_start]);
4255     best_chisq1 = &(g_epi_best_chisq1[idx1_block_start16]);
4256     best_chisq2 = &(g_epi_best_chisq2[tidx * idx2_block_sizea16]);
4257     n_sig_ct1 = &(g_epi_n_sig_ct1[idx1_block_start16]);
4258     fail_ct1 = &(g_epi_fail_ct1[idx1_block_start16]);
4259     best_id2 = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
4260     n_sig_ct2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16]);
4261     fail_ct2 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16]);
4262     for (block_idx1 = block_idx1_start; block_idx1 < block_idx1_end; block_idx1++, marker_idx1++) {
4263       ulii = geno1_offsets[2 * block_idx1];
4264       if (ulii > idx2_block_start) {
4265         block_idx2 = 0;
4266         cur_idx2_block_size = ulii - idx2_block_start;
4267 	if (cur_idx2_block_size >= idx2_block_size) {
4268           cur_idx2_block_size = idx2_block_size;
4269 	} else {
4270 	  is_first_half = 1;
4271         }
4272       } else {
4273         ulii = geno1_offsets[2 * block_idx1 + 1];
4274         if (ulii >= idx2_block_end) {
4275           // may not be done in set1 x all or set1 x set2 cases
4276           continue;
4277 	} else {
4278           if (ulii <= idx2_block_start) {
4279             block_idx2 = 0;
4280 	  } else {
4281             block_idx2 = ulii - idx2_block_start;
4282 	  }
4283 	}
4284       }
4285       cur_geno1 = &(geno1[block_idx1 * pheno_nm_ctl2]);
4286       n_sig_ct_fixed = 0;
4287       fail_ct_fixed = 0;
4288       block_delta1 = block_idx1 - block_idx1_start;
4289       best_chisq_fixed = best_chisq1[block_delta1];
4290       sum_a_pheno_base = phenogeno1[block_idx1];
4291       sum_a_base = genosums1[2 * block_idx1];
4292       sum_aa_base = genosums1[2 * block_idx1 + 1];
4293 
4294       // [0] = chisq, [1] = beta
4295       all_chisq_write = &(all_chisq[block_idx1 * marker_ct * 2]);
4296 
4297     epi_linear_thread_second_half:
4298       cur_geno2 = &(geno2[block_idx2 * pheno_nm_ctl2]);
4299       chisq2_ptr = &(best_chisq2[block_idx2]);
4300       for (; block_idx2 < cur_idx2_block_size; block_idx2++, chisq2_ptr++, cur_geno2 = &(cur_geno2[pheno_nm_ctl2])) {
4301 	// Our covariates are 1, genotype A (in {0, 1, 2}), genotype B, and
4302 	//   [genotype A] * [genotype B].
4303 	// The ordinary least squares solution to this system is
4304 	//   (X^T X)^{-1} X^T Y
4305 	// where X^T X is the following 4x4 matrix (where n = # of samples):
4306 	//   [ n       sum(A)   sum(B)   sum(AB)   ]
4307 	//   [ sum(A)  sum(AA)  sum(AB)  sum(AAB)  ]
4308 	//   [ sum(B)  sum(AB)  sum(BB)  sum(ABB)  ]
4309 	//   [ sum(AB) sum(AAB) sum(ABB) sum(AABB) ]
4310         // (sum(.) denotes the sum of that (product of) genotypes, across all
4311 	// samples.)
4312 	// Meanwhile, X^T Y is the following 4x1 matrix:
4313 	//   [ sum(pheno)      ]
4314 	//   [ sum(A * pheno)  ]
4315 	//   [ sum(B * pheno)  ]
4316 	//   [ sum(AB * pheno) ]
4317 	// Crucially, the VIF and valid parameters checks can also operate
4318 	// purely on the terms above and sum(pheno * pheno).
4319 
4320 	// these nine values can be mostly precomputed; just need to subtract
4321 	// from them sometimes when missing values are present.
4322 	cur_pheno_sum = pheno_sum;
4323 	cur_pheno_ssq = pheno_ssq;
4324 	cur_sum_a_pheno = sum_a_pheno_base;
4325 	cur_sum_b_pheno = phenogeno2[block_idx2];
4326 	cur_sum_a = sum_a_base;
4327 	cur_sum_aa = sum_aa_base;
4328 	cur_sum_b = genosums2[block_idx2 * 2];
4329 	cur_sum_bb = genosums2[block_idx2 * 2 + 1];
4330 	cur_sample_ct = pheno_nm_ct;
4331 
4332 	cur_sum_ab_pheno = 0.0;
4333 	fill_uint_zero(4, cur_minor_cts);
4334 	for (widx = 0; widx < pheno_nm_ctl2; widx++) {
4335 	  sample_idx = widx * BITCT2;
4336           cur_word1 = cur_geno1[widx];
4337           cur_word2 = cur_geno2[widx];
4338 	  // we can entirely skip 5 common cases: 00/00, 00/01, 00/10, 01/00,
4339 	  // 10/00.
4340 	  active_mask = cur_word1 | cur_word2;
4341 	  active_mask = (active_mask & (active_mask >> 1) & FIVEMASK) | (cur_word1 & cur_word2);
4342 	  dptr = &(pheno_d2[sample_idx]);
4343 	  while (active_mask) {
4344             woffset = CTZLU(active_mask) / 2;
4345 	    dxx = dptr[woffset];
4346 	    woffset *= 2;
4347 	    ulii = (cur_word1 >> woffset) & (3 * ONELU);
4348             uljj = (cur_word2 >> woffset) & (3 * ONELU);
4349 	    active_mask &= ~((3 * ONELU) << woffset);
4350 	    if (ulii && uljj) {
4351 	      if (ulii == 3) {
4352 		if (uljj == 1) {
4353 		  cur_sum_b_pheno -= dxx;
4354 		  cur_sum_b--;
4355 		  cur_sum_bb--;
4356 		} else if (uljj == 2) {
4357 		  cur_sum_b_pheno -= 2 * dxx;
4358 		  cur_sum_b -= 2;
4359 		  cur_sum_bb -= 4;
4360 		}
4361 	      } else if (uljj == 3) {
4362 		// ulii must be 1 or 2
4363 		cur_sum_a_pheno -= dxx;
4364 		if (ulii == 2) {
4365 		  cur_sum_a_pheno -= dxx;
4366 		}
4367 		cur_sum_a -= ulii;
4368 		cur_sum_aa -= ulii * ulii;
4369 	      } else {
4370 		ulii = ulii * 2 + uljj - 3;
4371 		cur_sum_ab_pheno += dconst[ulii] * dxx;
4372 		cur_minor_cts[ulii] += 1;
4373 		continue;
4374 	      }
4375 	    }
4376 	    cur_pheno_sum -= dxx;
4377 	    cur_pheno_ssq -= dxx * dxx;
4378 	    cur_sample_ct--;
4379 	  }
4380 	}
4381 	if (cur_sample_ct <= 4) {
4382           goto epi_linear_thread_regression_fail;
4383 	}
4384 	// VIF check.  Mirrors glm_check_vif(), but param_ct is hardcoded to 4
4385 	// and we avoid additional iteration over the sample_idxs.
4386 	sample_ctd = (double)((int32_t)cur_sample_ct);
4387 	sample_ct_recip = 1.0 / sample_ctd;
4388 	sample_ct_m1_recip = 1.0 / ((double)((int32_t)(cur_sample_ct - 1)));
4389 	cur_sum_ab = cur_minor_cts[0] + 2 * (cur_minor_cts[1] + cur_minor_cts[2]) + 4 * cur_minor_cts[3];
4390 	cur_sum_aab = cur_minor_cts[0] + 2 * cur_minor_cts[1] + 4 * cur_minor_cts[2] + (8 * ONELU) * cur_minor_cts[3];
4391 	cur_sum_abb = cur_minor_cts[0] + 4 * cur_minor_cts[1] + 2 * cur_minor_cts[2] + (8 * ONELU) * cur_minor_cts[3];
4392 	cur_sum_aabb = cur_minor_cts[0] + 4 * (cur_minor_cts[1] + cur_minor_cts[2]) + (16 * ONELU) * cur_minor_cts[3];
4393 
4394 	cur_sum_ad = (double)((int32_t)cur_sum_a);
4395 	cur_sum_bd = (double)((int32_t)cur_sum_b);
4396 	cur_sum_abd = (double)((int32_t)cur_sum_ab);
4397 
4398 	// some genotype means
4399 	dxx = cur_sum_bd * sample_ct_recip;
4400 	dyy = cur_sum_abd * sample_ct_recip;
4401 
4402 	dww = ((double)((int32_t)cur_sum_aa)) - cur_sum_ad * cur_sum_ad * sample_ct_recip;
4403 	dvv = ((double)((int32_t)cur_sum_bb)) - cur_sum_bd * dxx;
4404 	duu = ((double)((intptr_t)cur_sum_aabb)) - cur_sum_abd * dyy;
4405 	if ((dww <= 0) || (dvv <= 0) || (duu <= 0)) {
4406 	  goto epi_linear_thread_regression_fail;
4407 	}
4408 	dww = 1.0 / sqrt(dww * sample_ct_m1_recip);
4409 	dvv = 1.0 / sqrt(dvv * sample_ct_m1_recip);
4410 	duu = 1.0 / sqrt(duu * sample_ct_m1_recip);
4411 
4412 	dxx = (cur_sum_abd - cur_sum_ad * dxx) * sample_ct_m1_recip;
4413 	dzz = (((double)((intptr_t)cur_sum_abb)) - cur_sum_bd * dyy) * sample_ct_m1_recip;
4414 	dyy = (((double)((intptr_t)cur_sum_aab)) - cur_sum_ad * dyy) * sample_ct_m1_recip;
4415 	// now dxx = A_{12}, dyy = A_{13}, dzz = A_{23}
4416 
4417 	dxx *= dww * dvv;
4418 	dyy *= dww * duu;
4419 	dzz *= dvv * duu;
4420 	if ((dxx > 0.999) || (dyy > 0.999) || (dzz > 0.999)) {
4421 	  goto epi_linear_thread_regression_fail;
4422 	}
4423 	// Use analytic formula for 3x3 symmetric matrix inverse.
4424 	// det A = A_{11}A_{22}A_{33} + 2 * A_{12}A_{13}A_{23}
4425 	//       - A_{11}(A_{23}^2) - A_{22}(A_{13}^2) - A_{33}(A_{12}^2)
4426 	// upper left of inverse = (A_{22}A_{33} - (A_{23}^2))(det A)^{-1}
4427         //                middle = (A_{11}A_{33} - (A_{13}^2))(det A)^{-1}
4428 	//           lower right = (A_{11}A_{22} - (A_{12}^2))(det A)^{-1}
4429 	dww = dxx * dxx;
4430 	dvv = dyy * dyy;
4431 	duu = dzz * dzz;
4432 	determinant = 1 + 2 * dxx * dyy * dzz - dww - dvv - duu;
4433 	if (fabs(determinant) < EPSILON) {
4434 	  goto epi_linear_thread_regression_fail;
4435 	}
4436 	// (1 - x^2)/det > vif_thresh
4437 	// if det > 0:
4438 	//   1 - x^2 > vif_thresh * det
4439 	//   1 - vif_thresh * det > x^2
4440 	// otherwise:
4441 	//   1 - x^2 < vif_thresh * det
4442 	//   1 - vif_thresh * det < x^2
4443         dxx = 1 - vif_thresh * determinant; // now a threshold
4444 	if (((determinant > 0) && ((dxx > dww) || (dxx > dvv) || (dxx > duu))) || ((determinant < 0) && ((dxx < dww) || (dxx < dvv) || (dxx < duu)))) {
4445 	  goto epi_linear_thread_regression_fail;
4446 	}
4447 
4448 	// VIF check done, now perform linear regression
4449 	dmatrix_buf[0] = sample_ctd;
4450 	dmatrix_buf[1] = cur_sum_ad;
4451         dmatrix_buf[2] = cur_sum_bd;
4452 	dmatrix_buf[3] = cur_sum_abd;
4453 	dmatrix_buf[5] = (double)((int32_t)cur_sum_aa);
4454 	dmatrix_buf[6] = cur_sum_abd;
4455 	dmatrix_buf[7] = (double)((intptr_t)cur_sum_aab);
4456 	dmatrix_buf[10] = (double)((int32_t)cur_sum_bb);
4457 	dmatrix_buf[11] = (double)((intptr_t)cur_sum_abb);
4458 	dmatrix_buf[15] = (double)((intptr_t)cur_sum_aabb);
4459 	if (matrix_invert_4x4symm(dmatrix_buf)) {
4460 	  goto epi_linear_thread_regression_fail;
4461 	}
4462 
4463 	for (param_idx = 0; param_idx < 4; param_idx++) {
4464 	  dmatrix_buf2[param_idx] = sqrt(dmatrix_buf[param_idx * 5]);
4465 	}
4466         for (param_idx = 1; param_idx < 4; param_idx++) {
4467           dxx = 0.99999 * dmatrix_buf2[param_idx];
4468           dptr = &(dmatrix_buf[param_idx * 4]);
4469           dptr2 = dmatrix_buf2;
4470           for (param_idx2 = 0; param_idx2 < param_idx; param_idx2++) {
4471             if ((*dptr++) > dxx * (*dptr2++)) {
4472               goto epi_linear_thread_regression_fail;
4473 	    }
4474 	  }
4475 	}
4476         min_sigma = MAXV(dmatrix_buf[5], dmatrix_buf[10]);
4477 	if (dmatrix_buf[15] > min_sigma) {
4478           min_sigma = dmatrix_buf[15];
4479 	}
4480 	min_sigma = 1e-20 / min_sigma;
4481 
4482 	for (param_idx = 0; param_idx < 4; param_idx++) {
4483 	  dptr = &(dmatrix_buf[param_idx * 4]);
4484 	  dmatrix_buf2[param_idx] = cur_pheno_sum * dptr[0] + cur_sum_a_pheno * dptr[1] + cur_sum_b_pheno * dptr[2] + cur_sum_ab_pheno * dptr[3];
4485 	}
4486 	// dmatrix_buf2[0..3] now has linear regression result
4487 
4488 	// partial = coef[0] + A * coef[1] + B * coef[2] + AB * coef[3] - pheno
4489 	// sigma = \sum_{all samples} (partial * partial)
4490 	//       = \sum (coef[0]^2
4491         //               + 2 * A * coef[0] * coef[1]
4492 	//               + 2 * B * coef[0] * coef[2]
4493 	//               + 2 * AB * coef[0] * coef[3]
4494 	//               - 2 * coef[0] * pheno
4495 	//               + AA * coef[1]^2
4496 	//               + 2 * AB * coef[1] * coef[2]
4497 	//               + 2 * AAB * coef[1] * coef[3]
4498 	//               - 2 * A * coef[1] * pheno
4499 	//               + BB * coef[2]^2
4500 	//               + 2 * ABB * coef[2] * coef[3]
4501 	//               - 2 * B * coef[2] * pheno
4502 	//               + AABB * coef[3]^2
4503 	//               - 2 * AB * coef[3] * pheno
4504 	//               + pheno * pheno
4505 	sigma = dmatrix_buf2[0] * dmatrix_buf2[0] * sample_ctd
4506 	      + dmatrix_buf2[1] * dmatrix_buf2[1] * ((double)((int32_t)cur_sum_aa))
4507 	      + dmatrix_buf2[2] * dmatrix_buf2[2] * ((double)((int32_t)cur_sum_bb))
4508 	      + dmatrix_buf2[3] * dmatrix_buf2[3] * ((double)((intptr_t)cur_sum_aabb))
4509               + cur_pheno_ssq
4510 	      + 2 * (dmatrix_buf2[0] * (dmatrix_buf2[1] * cur_sum_ad
4511                                       + dmatrix_buf2[2] * cur_sum_bd
4512                                       + dmatrix_buf2[3] * cur_sum_abd
4513 				      - cur_pheno_sum)
4514 		   + dmatrix_buf2[1] * (dmatrix_buf2[2] * cur_sum_abd
4515 				      + dmatrix_buf2[3] * ((double)((intptr_t)cur_sum_aab))
4516 				      - cur_sum_a_pheno)
4517 		   + dmatrix_buf2[2] * (dmatrix_buf2[3] * ((double)((intptr_t)cur_sum_abb))
4518 				      - cur_sum_b_pheno)
4519 		   - dmatrix_buf2[3] * cur_sum_ab_pheno);
4520 	sigma /= (double)((int32_t)(cur_sample_ct - 4));
4521         if (sigma < min_sigma) {
4522           goto epi_linear_thread_regression_fail;
4523 	}
4524 
4525 	// dmatrix_buf2[3] = linear regression beta for AB term
4526 	// sqrt(dmatrix_buf[15] * sigma) = standard error for AB term
4527 	dxx = dmatrix_buf2[3];
4528 	zsq = (dxx * dxx) / (dmatrix_buf[15] * sigma);
4529 	if (zsq >= alpha1sq) {
4530           all_chisq_write[2 * block_idx2] = zsq;
4531           all_chisq_write[2 * block_idx2 + 1] = dxx;
4532 	}
4533 	if (zsq >= alpha2sq) {
4534           n_sig_ct_fixed++;
4535 	  n_sig_ct2[block_idx2] += 1;
4536 	}
4537 	if (zsq > best_chisq_fixed) {
4538           best_chisq_fixed = zsq;
4539 	  best_id_fixed = block_idx2 + idx2_block_start;
4540 	}
4541         dxx = *chisq2_ptr;
4542         if (zsq > dxx) {
4543           *chisq2_ptr = zsq;
4544 	  best_id2[block_idx2] = marker_idx1;
4545 	}
4546 	while (0) {
4547 	epi_linear_thread_regression_fail:
4548 	  zsq = 0;
4549 	  fail_ct_fixed++;
4550 	  fail_ct2[block_idx2] += 1;
4551 	  if (alpha1sq == 0.0) {
4552 	    // special case: log NA when '--epi1 1' specified
4553 	    all_chisq_write[block_idx2 * 2] = NAN;
4554 	    all_chisq_write[block_idx2 * 2 + 1] = NAN;
4555 	  }
4556 	}
4557       }
4558       if (is_first_half) {
4559         is_first_half = 0;
4560 	ulii = geno1_offsets[2 * block_idx1 + 1];
4561         cur_idx2_block_size = idx2_block_size;
4562         if (ulii < idx2_block_end) {
4563 	  // guaranteed to be larger than idx2_block_start, otherwise there
4564 	  // would have been no first half
4565           block_idx2 = ulii - idx2_block_start;
4566 	  goto epi_linear_thread_second_half;
4567 	}
4568       }
4569       if (best_chisq_fixed > best_chisq1[block_delta1]) {
4570         best_chisq1[block_delta1] = best_chisq_fixed;
4571 	best_id1[block_delta1] = best_id_fixed;
4572       }
4573       n_sig_ct1[block_delta1] = n_sig_ct_fixed;
4574       if (fail_ct_fixed) {
4575         fail_ct1[block_delta1] = fail_ct_fixed;
4576       }
4577     }
4578     if ((!tidx) || g_is_last_thread_block) {
4579       THREAD_RETURN;
4580     }
4581     THREAD_BLOCK_FINISH(tidx);
4582   }
4583 }
4584 
// Worker thread for the logistic-regression variant-pair ("epi") scan.
//
// arg is the thread index (tidx) cast to a pointer.  tidx selects this
// thread's range of first-variant block indices,
// [g_epi_idx1_block_bounds[tidx], g_epi_idx1_block_bounds[tidx + 1]), and its
// private scratch buffers in g_epi_logistic_mt[tidx].
//
// For every (variant 1, variant 2) pair visited in the current second-variant
// ("idx2") block, the thread
//   1. counts the samples for which neither 2-bit genotype equals 3 (the
//      value this code treats as missing),
//   2. fills a covariate-major float matrix with the rows 1, A, B, A*B and a
//      matching phenotype vector,
//   3. calls logistic_regression() with 4 parameters, and
//   4. forms zsq = coef[3]^2 / param_2d_buf[15] for the A*B term.
// zsq and coef[3] are written to the g_epi_all_chisq_f row of variant 1 when
// zsq >= alpha1sq; per-variant best statistics, significant-pair counts and
// failure counts are updated in the "1" (per first variant) and "2" (per
// thread, per second variant) arrays.  Any failed pair is counted in the
// fail_ct arrays instead, and logged as NAN only when alpha1sq == 0.
//
// The outer while (1) repeats this for each idx2 block; the thread returns
// when tidx == 0 or g_is_last_thread_block is set.
THREAD_RET_TYPE epi_logistic_thread(void* arg) {
  uintptr_t tidx = (uintptr_t)arg;
  uintptr_t block_idx1_start = g_epi_idx1_block_bounds[tidx];
  uintptr_t block_idx1_end = g_epi_idx1_block_bounds[tidx + 1];
  uintptr_t idx1_block_start16 = g_epi_idx1_block_bounds16[tidx];
  uintptr_t marker_idx1 = g_epi_marker_idx1 + block_idx1_start;
  uintptr_t marker_ct = g_epi_marker_ct;
  // thresholds are compared against the float statistic zsq below
  float alpha1sq = (float)g_epi_alpha1sq[0];
  float alpha2sq = (float)g_epi_alpha2sq[0];
  uint32_t pheno_nm_ct = g_epi_pheno_nm_ct;
  uint32_t best_id_fixed = 0;
  // set while only the part of the idx2 block before geno1_offsets[2 * i] has
  // been processed for the current variant 1
  uint32_t is_first_half = 0;
  // number of words per variant in the 2-bit-per-sample genotype arrays
  uintptr_t pheno_nm_ctl2 = QUATERCT_TO_WORDCT(pheno_nm_ct);
  uintptr_t* geno1 = g_epi_geno1;
  uintptr_t* pheno_c = g_epi_pheno_c;
  // per-thread regression workspace
  float* covars_cov_major = g_epi_logistic_mt[tidx].cur_covars_cov_major;
  float* coef = g_epi_logistic_mt[tidx].coef;
  float* pp = g_epi_logistic_mt[tidx].pp;
  float* sample_1d_buf = g_epi_logistic_mt[tidx].sample_1d_buf;
  float* pheno_buf = g_epi_logistic_mt[tidx].pheno_buf;
  float* param_1d_buf = g_epi_logistic_mt[tidx].param_1d_buf;
  float* param_1d_buf2 = g_epi_logistic_mt[tidx].param_1d_buf2;
  float* param_2d_buf = g_epi_logistic_mt[tidx].param_2d_buf;
  float* param_2d_buf2 = g_epi_logistic_mt[tidx].param_2d_buf2;
  uint32_t* geno1_offsets = g_epi_geno1_offsets;
  uint32_t* best_id1 = &(g_epi_best_id1[idx1_block_start16]);
  uintptr_t* cur_geno1;
  uintptr_t* geno2;
  uintptr_t* cur_geno2;
  float* all_chisq_write;
  float* chisq2_ptr;
  float* all_chisq;
  float* best_chisq1;
  float* best_chisq2;
  float* fptr;
  float* fptr2;
  uint32_t* n_sig_ct1;
  uint32_t* fail_ct1;
  uint32_t* best_id2;
  uint32_t* n_sig_ct2;
  uint32_t* fail_ct2;
  uintptr_t idx2_block_size;
  uintptr_t cur_idx2_block_size;
  uintptr_t idx2_block_start;
  uintptr_t idx2_block_end;
  uintptr_t idx2_block_sizea16;
  uintptr_t block_idx1;
  uintptr_t block_delta1;
  uintptr_t block_idx2;
  uintptr_t cur_word1;
  uintptr_t cur_word2;
  uintptr_t param_idx;
  uintptr_t param_idx2;
  uintptr_t cur_sample_cta4;
  uintptr_t ulii;
  uintptr_t uljj;
  float best_chisq_fixed;
  // todo
  float fxx;
  float fyy;
  float zsq;
  uint32_t n_sig_ct_fixed;
  uint32_t fail_ct_fixed;
  uint32_t widx;
  uint32_t loop_end;
  uint32_t sample_idx;
  uint32_t cur_sample_ct;
  while (1) {
    // Re-read the per-block globals on every pass: the idx2 block (and the
    // result arrays derived from it) may differ between passes.
    idx2_block_size = g_epi_idx2_block_size;
    cur_idx2_block_size = idx2_block_size;
    idx2_block_start = g_epi_idx2_block_start;
    idx2_block_end = idx2_block_start + idx2_block_size;
    // presumably idx2_block_size rounded up to a multiple of 16; used as the
    // per-thread stride of the "2" result arrays below
    idx2_block_sizea16 = round_up_pow2(idx2_block_size, 16);
    geno2 = g_epi_geno2;
    all_chisq = &(g_epi_all_chisq_f[2 * idx2_block_start]);
    best_chisq1 = &(g_epi_best_chisq_f1[idx1_block_start16]);
    best_chisq2 = &(g_epi_best_chisq_f2[tidx * idx2_block_sizea16]);
    n_sig_ct1 = &(g_epi_n_sig_ct1[idx1_block_start16]);
    fail_ct1 = &(g_epi_fail_ct1[idx1_block_start16]);
    best_id2 = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
    n_sig_ct2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16]);
    fail_ct2 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16]);
    for (block_idx1 = block_idx1_start; block_idx1 < block_idx1_end; block_idx1++, marker_idx1++) {
      // geno1_offsets[2 * i] and geno1_offsets[2 * i + 1] are used here as the
      // bounds of an idx2 interval that is NOT tested against variant i:
      // indices below the first value form the "first half", indices from the
      // second value onward form the "second half".
      ulii = geno1_offsets[2 * block_idx1];
      if (ulii > idx2_block_start) {
        block_idx2 = 0;
        cur_idx2_block_size = ulii - idx2_block_start;
	if (cur_idx2_block_size >= idx2_block_size) {
          cur_idx2_block_size = idx2_block_size;
	} else {
	  is_first_half = 1;
        }
      } else {
        ulii = geno1_offsets[2 * block_idx1 + 1];
        if (ulii >= idx2_block_end) {
          // may not be done in set1 x all or set1 x set2 cases
          continue;
	} else {
          if (ulii <= idx2_block_start) {
            block_idx2 = 0;
	  } else {
            block_idx2 = ulii - idx2_block_start;
	  }
	}
      }
      cur_geno1 = &(geno1[block_idx1 * pheno_nm_ctl2]);
      n_sig_ct_fixed = 0;
      fail_ct_fixed = 0;
      block_delta1 = block_idx1 - block_idx1_start;
      best_chisq_fixed = best_chisq1[block_delta1];

      // [0] = chisq, [1] = beta
      all_chisq_write = &(all_chisq[block_idx1 * marker_ct * 2]);

      // Re-entered via goto after the first half, with block_idx2 and
      // cur_idx2_block_size updated to cover the second half.
    epi_logistic_thread_second_half:
      cur_geno2 = &(geno2[block_idx2 * pheno_nm_ctl2]);
      chisq2_ptr = &(best_chisq2[block_idx2]);
      for (; block_idx2 < cur_idx2_block_size; block_idx2++, chisq2_ptr++, cur_geno2 = &(cur_geno2[pheno_nm_ctl2])) {
        fptr = covars_cov_major;
	fptr2 = pheno_buf;
	cur_sample_ct = pheno_nm_ct;
	// this part is similar to glm_logistic().

	// 1. determine number of samples with at least one missing genotype
	for (widx = 0; widx < pheno_nm_ctl2; widx++) {
	  cur_word1 = cur_geno1[widx];
	  cur_word2 = cur_geno2[widx];
	  // w & (w >> 1) leaves the low bit of a 2-bit field set only when
	  // that field is 3; assumes FIVEMASK keeps just those low bits and
	  // popcount2_long() counts them -- TODO confirm against the helpers
	  cur_word1 = cur_word1 & (cur_word1 >> 1);
	  cur_word2 = cur_word2 & (cur_word2 >> 1);
	  cur_sample_ct -= popcount2_long((cur_word1 | cur_word2) & FIVEMASK);
	}
        // indexed by [genotype A + 4 * genotype B]; only 0..2, 4..6, 8..10
        // are ever written
        unsigned char geno_pair_present[12];
	// a 4-parameter model is not fit on 4 or fewer complete samples
	if (cur_sample_ct <= 4) {
	  goto epi_logistic_thread_regression_fail;
	}
	// 2. now populate covariate-major matrix with 16-byte-aligned,
	//    trailing-entries-zeroed rows
        // quasi-bugfix (13 Sep 2018): reliably detect when this matrix is not
        // of full rank, and skip the regression in that case.
        memset(geno_pair_present, 0, 12);
	// row stride in floats (4 floats = 16 bytes)
	cur_sample_cta4 = round_up_pow2(cur_sample_ct, 4);
	for (widx = 0; widx < pheno_nm_ctl2; widx++) {
	  sample_idx = widx * BITCT2;
          cur_word1 = cur_geno1[widx];
          cur_word2 = cur_geno2[widx];
          loop_end = sample_idx + BITCT2;
	  if (loop_end > pheno_nm_ct) {
            loop_end = pheno_nm_ct;
	  }
          for (; sample_idx < loop_end; sample_idx++) {
            ulii = cur_word1 & (3 * ONELU);
            uljj = cur_word2 & (3 * ONELU);
	    // samples with a missing genotype at either variant are dropped,
	    // so fptr/fptr2 advance only for retained samples
	    if ((ulii != 3) && (uljj != 3)) {
              *fptr = 1.0;
              geno_pair_present[ulii + uljj * 4] = 1;
	      fxx = (float)((intptr_t)ulii);
	      fyy = (float)((intptr_t)uljj);
	      // maybe this is faster with continuous writes instead of
	      // continuous reads?  can experiment later
              fptr[cur_sample_cta4] = fxx;
	      fptr[2 * cur_sample_cta4] = fyy;
              fptr[3 * cur_sample_cta4] = fxx * fyy;
	      fptr++;
	      *fptr2++ = (float)((int32_t)is_set(pheno_c, sample_idx));
	    }
            cur_word1 >>= 2;
            cur_word2 >>= 2;
	  }
	}
        // index 5 is the (1, 1) cell, which belongs to all four 2x2 squares;
        // if it is occupied none of them can be empty
        if (!geno_pair_present[5]) {
          // not full rank if any 2x2 square in the 3x3 contingency table is
          // empty.
          if (((!geno_pair_present[0]) && (!geno_pair_present[1]) && (!geno_pair_present[4])) ||
              ((!geno_pair_present[1]) && (!geno_pair_present[2]) && (!geno_pair_present[6])) ||
              ((!geno_pair_present[4]) && (!geno_pair_present[8]) && (!geno_pair_present[9])) ||
              ((!geno_pair_present[6]) && (!geno_pair_present[9]) && (!geno_pair_present[10]))) {
            goto epi_logistic_thread_regression_fail;
          }
        }
	// zero the padding at the end of each of the four covariate rows and
	// of the phenotype vector
	if (cur_sample_ct < cur_sample_cta4) {
	  loop_end = cur_sample_cta4 - cur_sample_ct;
	  fill_float_zero(loop_end, fptr);
	  fill_float_zero(loop_end, &(fptr[cur_sample_cta4]));
	  fill_float_zero(loop_end, &(fptr[2 * cur_sample_cta4]));
	  fill_float_zero(loop_end, &(fptr[3 * cur_sample_cta4]));
	  fill_float_zero(loop_end, fptr2);
	}

	// start every fit from all-zero coefficients; a nonzero return from
	// logistic_regression() is treated as failure
	fill_float_zero(4, coef);
	if (logistic_regression(cur_sample_ct, 4, sample_1d_buf, param_2d_buf, param_1d_buf, param_2d_buf2, param_1d_buf2, covars_cov_major, pheno_buf, coef, pp)) {
          goto epi_logistic_thread_regression_fail;
	}

	// compute S
	// One solve per unit vector, each result stored as row param_idx of
	// param_2d_buf.  NOTE(review): this looks like inverting the matrix
	// that logistic_regression() left in param_2d_buf2 -- confirm against
	// solve_linear_system()'s contract.
	for (param_idx = 0; param_idx < 4; param_idx++) {
          fill_float_zero(4, param_1d_buf);
          param_1d_buf[param_idx] = 1.0;
	  solve_linear_system(param_2d_buf2, param_1d_buf, param_1d_buf2, 4);
	  memcpy(&(param_2d_buf[param_idx * 4]), param_1d_buf2, 4 * sizeof(float));
	}
	// diagonal entries (index param_idx * 5) of the non-intercept terms
	// must be real and at least 1e-20; param_2d_buf2 is reused from here
	// on to hold the square roots of the diagonal
	for (param_idx = 1; param_idx < 4; param_idx++) {
          fxx = param_2d_buf[param_idx * 5];
	  if ((fxx < 1e-20) || (!realnum(fxx))) {
	    goto epi_logistic_thread_regression_fail;
	  }
          param_2d_buf2[param_idx] = sqrtf(fxx);
	}
	param_2d_buf2[0] = sqrtf(param_2d_buf[0]);
	// fail if any below-diagonal entry S[i][j] exceeds
	// 0.99999 * sqrt(S[i][i]) * sqrt(S[j][j])
	for (param_idx = 1; param_idx < 4; param_idx++) {
          fxx = 0.99999 * param_2d_buf2[param_idx];
	  fptr = &(param_2d_buf[param_idx * 4]);
	  fptr2 = param_2d_buf2;
	  for (param_idx2 = 0; param_idx2 < param_idx; param_idx2++) {
            if ((*fptr++) > fxx * (*fptr2++)) {
	      goto epi_logistic_thread_regression_fail;
	    }
	  }
	}

	// coef[3] = logistic regression beta for AB term
	// sqrt(param_2d_buf[15]) = standard error for AB term
	zsq = coef[3] * coef[3] / param_2d_buf[15];
	if (zsq >= alpha1sq) {
          all_chisq_write[2 * block_idx2] = zsq;
          all_chisq_write[2 * block_idx2 + 1] = coef[3];
	}
	if (zsq >= alpha2sq) {
          n_sig_ct_fixed++;
	  n_sig_ct2[block_idx2] += 1;
	}
	// track the best partner seen so far for variant 1 ...
	if (zsq > best_chisq_fixed) {
          best_chisq_fixed = zsq;
	  best_id_fixed = block_idx2 + idx2_block_start;
	}
	// ... and, in this thread's slice, the best partner for variant 2
        fxx = *chisq2_ptr;
        if (zsq > fxx) {
          *chisq2_ptr = zsq;
	  best_id2[block_idx2] = marker_idx1;
	}
	// The body of this while (0) never runs on the normal path; it is
	// entered only through the goto label inside it.
	while (0) {
	epi_logistic_thread_regression_fail:
	  zsq = 0;
	  fail_ct_fixed++;
	  fail_ct2[block_idx2] += 1;
	  if (alpha1sq == 0.0) {
	    // special case: log NA when '--epi1 1' specified
	    all_chisq_write[block_idx2 * 2] = NAN;
	    all_chisq_write[block_idx2 * 2 + 1] = NAN;
	  }
	}
      }
      if (is_first_half) {
        // first half done; restore the full block size and, if the skipped
        // interval ends inside this block, continue with the second half
        is_first_half = 0;
	ulii = geno1_offsets[2 * block_idx1 + 1];
        cur_idx2_block_size = idx2_block_size;
        if (ulii < idx2_block_end) {
          block_idx2 = ulii - idx2_block_start;
	  goto epi_logistic_thread_second_half;
	}
      }
      // publish the per-variant-1 results for this idx2 block
      if (best_chisq_fixed > best_chisq1[block_delta1]) {
        best_chisq1[block_delta1] = best_chisq_fixed;
	best_id1[block_delta1] = best_id_fixed;
      }
      // note: overwritten (not accumulated) with this block's count
      n_sig_ct1[block_delta1] = n_sig_ct_fixed;
      if (fail_ct_fixed) {
        fail_ct1[block_delta1] = fail_ct_fixed;
      }
    }
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    // presumably blocks until the next idx2 block is ready -- see the
    // THREAD_BLOCK_FINISH definition
    THREAD_BLOCK_FINISH(tidx);
  }
}
4860 
calc_lnlike(double known11,double known12,double known21,double known22,double center_ct_d,double freq11,double freq12,double freq21,double freq22,double half_hethet_share,double freq11_incr)4861 double calc_lnlike(double known11, double known12, double known21, double known22, double center_ct_d, double freq11, double freq12, double freq21, double freq22, double half_hethet_share, double freq11_incr) {
4862   double lnlike;
4863   freq11 += freq11_incr;
4864   freq22 += freq11_incr;
4865   freq12 += half_hethet_share - freq11_incr;
4866   freq21 += half_hethet_share - freq11_incr;
4867   lnlike = center_ct_d * log(freq11 * freq22 + freq12 * freq21);
4868   if (known11 != 0.0) {
4869     lnlike += known11 * log(freq11);
4870   }
4871   if (known12 != 0.0) {
4872     lnlike += known12 * log(freq12);
4873   }
4874   if (known21 != 0.0) {
4875     lnlike += known21 * log(freq21);
4876   }
4877   if (known22 != 0.0) {
4878     lnlike += known22 * log(freq22);
4879   }
4880   return lnlike;
4881 }
4882 
em_phase_hethet(double known11,double known12,double known21,double known22,uint32_t center_ct,double * freq1x_ptr,double * freq2x_ptr,double * freqx1_ptr,double * freqx2_ptr,double * freq11_ptr,uint32_t * onside_sol_ct_ptr)4883 uint32_t em_phase_hethet(double known11, double known12, double known21, double known22, uint32_t center_ct, double* freq1x_ptr, double* freq2x_ptr, double* freqx1_ptr, double* freqx2_ptr, double* freq11_ptr, uint32_t* onside_sol_ct_ptr) {
4884   // Returns 1 if at least one SNP is monomorphic over all valid observations;
4885   // returns 0 otherwise, and fills all frequencies using the maximum
4886   // likelihood solution to the cubic equation.
4887   // (We're discontinuing most use of EM phasing since better algorithms have
4888   // been developed, but the two marker case is mathematically clean and fast
4889   // enough that it'll probably remain useful as an input for some of those
4890   // better algorithms...)
4891   double center_ct_d = (int32_t)center_ct;
4892   double twice_tot = known11 + known12 + known21 + known22 + 2 * center_ct_d;
4893   uint32_t sol_start_idx = 0;
4894   uint32_t sol_end_idx = 1;
4895   double solutions[3];
4896   double twice_tot_recip;
4897   double half_hethet_share;
4898   double freq11;
4899   double freq12;
4900   double freq21;
4901   double freq22;
4902   double prod_1122;
4903   double prod_1221;
4904   double incr_1122;
4905   double best_sol;
4906   double best_lnlike;
4907   double cur_lnlike;
4908   double freq1x;
4909   double freq2x;
4910   double freqx1;
4911   double freqx2;
4912   double lbound;
4913   double dxx;
4914   uint32_t cur_sol_idx;
4915   // shouldn't have to worry about subtractive cancellation problems here
4916   if (twice_tot == 0.0) {
4917     return 1;
4918   }
4919   twice_tot_recip = 1.0 / twice_tot;
4920   freq11 = known11 * twice_tot_recip;
4921   freq12 = known12 * twice_tot_recip;
4922   freq21 = known21 * twice_tot_recip;
4923   freq22 = known22 * twice_tot_recip;
4924   prod_1122 = freq11 * freq22;
4925   prod_1221 = freq12 * freq21;
4926   half_hethet_share = center_ct_d * twice_tot_recip;
4927   // the following four values should all be guaranteed nonzero except in the
4928   // NAN case
4929   freq1x = freq11 + freq12 + half_hethet_share;
4930   freq2x = 1.0 - freq1x;
4931   freqx1 = freq11 + freq21 + half_hethet_share;
4932   freqx2 = 1.0 - freqx1;
4933   if (center_ct) {
4934     if ((prod_1122 != 0.0) || (prod_1221 != 0.0)) {
4935       sol_end_idx = cubic_real_roots(0.5 * (freq11 + freq22 - freq12 - freq21 - 3 * half_hethet_share), 0.5 * (prod_1122 + prod_1221 + half_hethet_share * (freq12 + freq21 - freq11 - freq22 + half_hethet_share)), -0.5 * half_hethet_share * prod_1122, solutions);
4936       while (sol_end_idx && (solutions[sol_end_idx - 1] > half_hethet_share + SMALLISH_EPSILON)) {
4937 	sol_end_idx--;
4938       }
4939       while ((sol_start_idx < sol_end_idx) && (solutions[sol_start_idx] < -SMALLISH_EPSILON)) {
4940 	sol_start_idx++;
4941       }
4942       if (sol_start_idx == sol_end_idx) {
4943 	// Lost a planet Master Obi-Wan has.  How embarrassing...
4944 	// lost root must be a double root at one of the boundary points, just
4945 	// check their likelihoods
4946 	sol_start_idx = 0;
4947 	sol_end_idx = 2;
4948 	solutions[0] = 0;
4949 	solutions[1] = half_hethet_share;
4950       } else {
4951 	if (solutions[sol_start_idx] < 0) {
4952 	  solutions[sol_start_idx] = 0;
4953 	}
4954 	if (solutions[sol_end_idx - 1] > half_hethet_share) {
4955 	  solutions[sol_end_idx - 1] = half_hethet_share;
4956 	}
4957       }
4958     } else {
4959       solutions[0] = 0;
4960       // bugfix (6 Oct 2017): need to use all nonzero values here
4961       const double nonzero_freq_xx = freq11 + freq22;
4962       const double nonzero_freq_xy = freq12 + freq21;
4963       if ((nonzero_freq_xx + SMALLISH_EPSILON < half_hethet_share + nonzero_freq_xy) && (nonzero_freq_xy + SMALLISH_EPSILON < half_hethet_share + nonzero_freq_xx)) {
4964 	sol_end_idx = 3;
4965 	solutions[1] = (half_hethet_share + nonzero_freq_xy - nonzero_freq_xx) * 0.5;
4966 	solutions[2] = half_hethet_share;
4967       } else {
4968 	sol_end_idx = 2;
4969 	solutions[1] = half_hethet_share;
4970       }
4971     }
4972     best_sol = solutions[sol_start_idx];
4973     if (sol_end_idx > sol_start_idx + 1) {
4974       // select largest log likelihood
4975       best_lnlike = calc_lnlike(known11, known12, known21, known22, center_ct_d, freq11, freq12, freq21, freq22, half_hethet_share, best_sol);
4976       cur_sol_idx = sol_start_idx + 1;
4977       do {
4978 	incr_1122 = solutions[cur_sol_idx];
4979         cur_lnlike = calc_lnlike(known11, known12, known21, known22, center_ct_d, freq11, freq12, freq21, freq22, half_hethet_share, incr_1122);
4980 	if (cur_lnlike > best_lnlike) {
4981           cur_lnlike = best_lnlike;
4982           best_sol = incr_1122;
4983 	}
4984       } while (++cur_sol_idx < sol_end_idx);
4985     }
4986     if (onside_sol_ct_ptr && (sol_end_idx > sol_start_idx + 1)) {
4987       if (freqx1 * freq1x >= freq11) {
4988 	dxx = freq1x * freqx1 - freq11;
4989 	if (dxx > half_hethet_share) {
4990 	  dxx = half_hethet_share;
4991 	}
4992       } else {
4993 	dxx = 0.0;
4994       }
4995       // okay to NOT count suboptimal boundary points because they don't permit
4996       // direction changes within the main interval
4997       // this should exactly match haploview_blocks_classify()'s D sign check
4998       if ((freq11 + best_sol) - freqx1 * freq1x >= 0.0) {
4999 	if (best_sol > dxx + SMALLISH_EPSILON) {
5000           lbound = dxx + SMALLISH_EPSILON;
5001 	} else {
5002 	  lbound = dxx;
5003 	}
5004 	if (best_sol < half_hethet_share - SMALLISH_EPSILON) {
5005 	  half_hethet_share -= SMALLISH_EPSILON;
5006 	}
5007       } else {
5008 	if (best_sol > SMALLISH_EPSILON) {
5009 	  lbound = SMALLISH_EPSILON;
5010 	} else {
5011 	  lbound = 0.0;
5012 	}
5013 	if (best_sol < dxx - SMALLISH_EPSILON) {
5014 	  half_hethet_share = dxx - SMALLISH_EPSILON;
5015 	} else {
5016 	  half_hethet_share = dxx;
5017 	}
5018       }
5019       for (cur_sol_idx = sol_start_idx; cur_sol_idx < sol_end_idx; cur_sol_idx++) {
5020 	if (solutions[cur_sol_idx] < lbound) {
5021 	  sol_start_idx++;
5022 	}
5023 	if (solutions[cur_sol_idx] > half_hethet_share) {
5024 	  break;
5025 	}
5026       }
5027       if (cur_sol_idx >= sol_start_idx + 2) {
5028 	*onside_sol_ct_ptr = cur_sol_idx - sol_start_idx;
5029       }
5030     }
5031     freq11 += best_sol;
5032   } else if ((prod_1122 == 0.0) && (prod_1221 == 0.0)) {
5033     return 1;
5034   }
5035   *freq1x_ptr = freq1x;
5036   *freq2x_ptr = freq2x;
5037   *freqx1_ptr = freqx1;
5038   *freqx2_ptr = freqx2;
5039   *freq11_ptr = freq11;
5040   return 0;
5041 }
5042 
em_phase_hethet_nobase(uint32_t * counts,uint32_t is_x1,uint32_t is_x2,double * freq1x_ptr,double * freq2x_ptr,double * freqx1_ptr,double * freqx2_ptr,double * freq11_ptr)5043 uint32_t em_phase_hethet_nobase(uint32_t* counts, uint32_t is_x1, uint32_t is_x2, double* freq1x_ptr, double* freq2x_ptr, double* freqx1_ptr, double* freqx2_ptr, double* freq11_ptr) {
5044   // if is_x1 and/or is_x2 is set, counts[9]..[17] are male-only counts.
5045   double known11 = (double)(2 * counts[0] + counts[1] + counts[3]);
5046   double known12 = (double)(2 * counts[2] + counts[1] + counts[5]);
5047   double known21 = (double)(2 * counts[6] + counts[3] + counts[7]);
5048   double known22 = (double)(2 * counts[8] + counts[5] + counts[7]);
5049   if (is_x1 || is_x2) {
5050     if (is_x1 && is_x2) {
5051       known11 -= (double)((int32_t)counts[9]);
5052       known12 -= (double)((int32_t)counts[11]);
5053       known21 -= (double)((int32_t)counts[15]);
5054       known22 -= (double)((int32_t)counts[17]);
5055     } else if (is_x1) {
5056       known11 -= ((double)(2 * counts[9] + counts[10])) * (1.0 - SQRT_HALF);
5057       known12 -= ((double)(2 * counts[11] + counts[10])) * (1.0 - SQRT_HALF);
5058       known21 -= ((double)(2 * counts[15] + counts[16])) * (1.0 - SQRT_HALF);
5059       known22 -= ((double)(2 * counts[17] + counts[16])) * (1.0 - SQRT_HALF);
5060     } else {
5061       known11 -= ((double)(2 * counts[9] + counts[12])) * (1.0 - SQRT_HALF);
5062       known12 -= ((double)(2 * counts[11] + counts[12])) * (1.0 - SQRT_HALF);
5063       known21 -= ((double)(2 * counts[15] + counts[14])) * (1.0 - SQRT_HALF);
5064       known22 -= ((double)(2 * counts[17] + counts[14])) * (1.0 - SQRT_HALF);
5065     }
5066   }
5067   return em_phase_hethet(known11, known12, known21, known22, counts[4], freq1x_ptr, freq2x_ptr, freqx1_ptr, freqx2_ptr, freq11_ptr, nullptr);
5068 }
5069 
ld_dprime_thread(void * arg)5070 THREAD_RET_TYPE ld_dprime_thread(void* arg) {
5071   uintptr_t tidx = (uintptr_t)arg;
5072   uintptr_t block_idx1_start = (tidx * g_ld_idx1_block_size) / g_ld_thread_ct;
5073   uintptr_t block_idx1_end = ((tidx + 1) * g_ld_idx1_block_size) / g_ld_thread_ct;
5074   uintptr_t marker_idx2_maxw = g_ld_marker_ctm8;
5075   uintptr_t founder_ct = g_ld_founder_ct;
5076   uint32_t founder_ctv3 = BITCT_TO_ALIGNED_WORDCT(founder_ct);
5077   uint32_t founder_ctsplit = 3 * founder_ctv3;
5078   uintptr_t* geno1 = g_ld_geno1;
5079   uintptr_t* zmiss1 = g_epi_zmiss1;
5080   uintptr_t* sex_male = g_ld_sex_male;
5081   uintptr_t* cur_geno1_male = nullptr;
5082   uint32_t* ld_interval1 = g_ld_interval1;
5083   uint32_t is_dprime = g_ld_modifier & (LD_DPRIME | LD_DPRIME_SIGNED);
5084   uint32_t is_dprime_unsigned = g_ld_modifier & LD_DPRIME;
5085   uint32_t is_r2 = g_ld_is_r2;
5086   uint32_t xstart1 = g_ld_xstart1;
5087   uint32_t xend1 = g_ld_xend1;
5088   double* results = g_ld_results;
5089   uint32_t tot1[6];
5090   uint32_t counts[18];
5091   uintptr_t* cur_geno1;
5092   uintptr_t* cur_geno2;
5093   uintptr_t* geno2;
5094   uintptr_t* zmiss2;
5095   double* rptr;
5096   uint32_t* tot2;
5097   uint32_t* cur_tot2;
5098   uintptr_t idx2_block_size;
5099   uintptr_t idx2_block_start;
5100   uintptr_t block_idx1;
5101   uintptr_t block_idx2;
5102   uintptr_t cur_zmiss2;
5103   uintptr_t cur_block_idx2_end;
5104   double freq11;
5105   double freq11_expected;
5106   double freq1x;
5107   double freq2x;
5108   double freqx1;
5109   double freqx2;
5110   double dxx;
5111   uint32_t xstart2;
5112   uint32_t xend2;
5113   uint32_t x2_present;
5114   uint32_t is_x1;
5115   uint32_t is_x2;
5116   uint32_t nm_fixed;
5117   if (g_ld_thread_wkspace) {
5118     cur_geno1_male = &(g_ld_thread_wkspace[tidx * round_up_pow2(founder_ctsplit, CACHELINE_WORD)]);
5119   }
5120   // suppress warning
5121   fill_uint_zero(3, &(tot1[3]));
5122   while (1) {
5123     idx2_block_size = g_ld_idx2_block_size;
5124     idx2_block_start = g_ld_idx2_block_start;
5125     geno2 = g_ld_geno2;
5126     zmiss2 = g_epi_zmiss2;
5127     tot2 = g_epi_tot2;
5128     xstart2 = g_ld_xstart2;
5129     xend2 = g_ld_xend2;
5130     x2_present = (g_ld_thread_wkspace && (idx2_block_start < xend2) && (idx2_block_start + idx2_block_size > xstart2));
5131     for (block_idx1 = block_idx1_start; block_idx1 < block_idx1_end; block_idx1++) {
5132       cur_zmiss2 = ld_interval1[block_idx1 * 2];
5133       block_idx2 = cur_zmiss2;
5134       cur_block_idx2_end = ld_interval1[block_idx1 * 2 + 1];
5135       if (block_idx2 < idx2_block_start) {
5136 	if (cur_block_idx2_end <= idx2_block_start) {
5137 	  continue;
5138 	}
5139 	block_idx2 = 0;
5140       } else {
5141 	block_idx2 -= idx2_block_start;
5142 	if (block_idx2 >= idx2_block_size) {
5143 	  break;
5144 	}
5145       }
5146       cur_block_idx2_end -= idx2_block_start;
5147       if (cur_block_idx2_end > idx2_block_size) {
5148 	cur_block_idx2_end = idx2_block_size;
5149       }
5150       is_x1 = (block_idx1 >= xstart1) && (block_idx1 < xend1);
5151       nm_fixed = is_set_ul(zmiss1, block_idx1);
5152       cur_geno1 = &(geno1[block_idx1 * founder_ctsplit]);
5153       tot1[0] = popcount_longs(cur_geno1, founder_ctv3);
5154       tot1[1] = popcount_longs(&(cur_geno1[founder_ctv3]), founder_ctv3);
5155       tot1[2] = popcount_longs(&(cur_geno1[2 * founder_ctv3]), founder_ctv3);
5156       if (is_x1 || x2_present) {
5157 	memcpy(cur_geno1_male, cur_geno1, founder_ctsplit * sizeof(intptr_t));
5158         bitvec_and(sex_male, founder_ctv3, cur_geno1_male);
5159         tot1[3] = popcount_longs(cur_geno1_male, founder_ctv3);
5160         bitvec_and(sex_male, founder_ctv3, &(cur_geno1_male[founder_ctv3]));
5161 	tot1[4] = popcount_longs(&(cur_geno1_male[founder_ctv3]), founder_ctv3);
5162         bitvec_and(sex_male, founder_ctv3, &(cur_geno1_male[2 * founder_ctv3]));
5163 	tot1[5] = popcount_longs(&(cur_geno1_male[2 * founder_ctv3]), founder_ctv3);
5164       }
5165       cur_geno2 = &(geno2[block_idx2 * founder_ctsplit]);
5166       rptr = &(results[2 * block_idx1 * marker_idx2_maxw]);
5167       for (; block_idx2 < cur_block_idx2_end; block_idx2++, cur_geno2 = &(cur_geno2[founder_ctsplit])) {
5168 	cur_tot2 = &(tot2[block_idx2 * 3]);
5169 	cur_zmiss2 = is_set_ul(zmiss2, block_idx2);
5170 	if (nm_fixed) {
5171 	  two_locus_count_table_zmiss1(cur_geno1, cur_geno2, counts, founder_ctv3, cur_zmiss2);
5172 	  if (cur_zmiss2) {
5173 	    counts[2] = tot1[0] - counts[0] - counts[1];
5174 	    counts[5] = tot1[1] - counts[3] - counts[4];
5175 	  }
5176 	  counts[6] = cur_tot2[0] - counts[0] - counts[3];
5177 	  counts[7] = cur_tot2[1] - counts[1] - counts[4];
5178 	  counts[8] = cur_tot2[2] - counts[2] - counts[5];
5179 	} else {
5180 	  two_locus_count_table(cur_geno1, cur_geno2, counts, founder_ctv3, cur_zmiss2);
5181 	  if (cur_zmiss2) {
5182 	    counts[2] = tot1[0] - counts[0] - counts[1];
5183 	    counts[5] = tot1[1] - counts[3] - counts[4];
5184 	    counts[8] = tot1[2] - counts[6] - counts[7];
5185 	  }
5186 	}
5187 	is_x2 = ((block_idx2 < xend2) && (block_idx2 >= xstart2));
5188 	if (is_x1 || is_x2) {
5189 	  two_locus_count_table(cur_geno1_male, cur_geno2, &(counts[9]), founder_ctv3, cur_zmiss2);
5190 	  if (cur_zmiss2) {
5191 	    counts[11] = tot1[3] - counts[9] - counts[10];
5192 	    counts[14] = tot1[4] - counts[12] - counts[13];
5193 	    counts[17] = tot1[5] - counts[15] - counts[16];
5194 	  }
5195 	}
5196 	if (em_phase_hethet_nobase(counts, is_x1, is_x2, &freq1x, &freq2x, &freqx1, &freqx2, &freq11)) {
5197 	  *rptr++ = NAN;
5198 	  *rptr++ = NAN;
5199 	  continue;
5200 	}
5201 	freq11_expected = freqx1 * freq1x; // fA * fB temp var
5202 	// a bit of numeric instability here, but not tragic since this is the
5203 	// end of the calculation
5204 	dxx = freq11 - freq11_expected; // D
5205 	if (fabs(dxx) < SMALL_EPSILON) {
5206 	  *rptr++ = 0;
5207 	  *rptr = 0;
5208 	} else {
5209 	  if (is_r2) {
5210 	    *rptr = fabs(dxx) * dxx / (freq11_expected * freq2x * freqx2);
5211 	  } else {
5212 	    *rptr = dxx / sqrt(freq11_expected * freq2x * freqx2);
5213 	  }
5214 	  rptr++;
5215 	  if (is_dprime) {
5216 	    if (dxx >= 0) {
5217 	      dxx /= MINV(freqx1 * freq2x, freqx2 * freq1x);
5218 	    } else {
5219 	      if (is_dprime_unsigned) {
5220 		dxx = -dxx;
5221 	      }
5222 	      dxx /= MINV(freq11_expected, freqx2 * freq2x);
5223 	    }
5224 	  }
5225 	  *rptr = dxx;
5226 	}
5227 	rptr++;
5228       }
5229     }
5230     if ((!tidx) || g_is_last_thread_block) {
5231       THREAD_RETURN;
5232     }
5233     THREAD_BLOCK_FINISH(tidx);
5234   }
5235 }
5236 
// Computes and writes the --r/--r2 report with a D/D' column for idx1 variants
// [marker_idx1_start, marker_idx1_end) (indices among variants not excluded by
// g_ld_marker_exclude_idx1).
//
// Work is done in idx1 blocks sized to the remaining bigstack memory.  For
// each idx1 block:
//   1. every idx1 variant is loaded from bedfile, split into three genotype
//      bit vectors (g_ld_geno1), and its idx2 window is stored in
//      g_ld_interval1 as a half-open range relative to marker_idx2_base;
//   2. the idx2 variants are loaded in sub-blocks (g_ld_geno2) and
//      ld_dprime_thread() is run on each sub-block, accumulating into
//      g_ld_results;
//   3. the rows are emitted through ld_regular_emitn, gzipped if
//      LD_REPORT_GZ is set.
//
// threads: handles passed to spawn_threads2()/join_threads2().
// ldip: supplies modifier, window_size, window_bp, window_cm, and the
//   snpstr/snps_rl settings that decide whether idx1 is a subset.
// bedfile/bed_offset: genotype file; variant uidx starts at byte
//   bed_offset + uidx * ((unfiltered_sample_ct + 3) / 4).
// marker_reverse, founder_info, founder_include2, founder_male_include2,
//   loadbuf_raw: forwarded to load_and_collapse_incl()/haploid_fix().
// sex_male: published as g_ld_sex_male when the X chromosome has at least one
//   non-excluded variant.
// hh_exists: haploid_fix() is applied to haploid-chromosome variants only
//   when this is nonzero.
//
// Returns 0 on success, otherwise RET_NOMEM, RET_READ_FAIL or
// RET_THREAD_CREATE_FAIL.
int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uintptr_t* sex_male, uintptr_t* founder_include2, uintptr_t* founder_male_include2, uintptr_t* loadbuf_raw, char* outname, uint32_t hh_exists, uintptr_t marker_idx1_start, uintptr_t marker_idx1_end) {
  Chrom_info* chrom_info_ptr = g_ld_chrom_info_ptr;
  uintptr_t* marker_exclude_idx1 = g_ld_marker_exclude_idx1;
  uintptr_t* marker_exclude = g_ld_marker_exclude;
  uint32_t* marker_pos = g_ld_marker_pos;
  double* marker_cms = g_ld_marker_cms;
  uintptr_t marker_ct = g_ld_marker_ct;
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t founder_ct = g_ld_founder_ct;
  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
  uintptr_t founder_ctv3 = BITCT_TO_ALIGNED_WORDCT(founder_ct);
  uintptr_t founder_ctsplit = 3 * founder_ctv3;
  uintptr_t final_mask = get_final_mask(founder_ct);
  uintptr_t orig_marker_ctm8 = g_ld_marker_ctm8;
  uintptr_t marker_idx2_maxw = orig_marker_ctm8;
  uintptr_t marker_idx1 = marker_idx1_start;
  uintptr_t job_size = marker_idx1_end - marker_idx1_start;
  uintptr_t pct_thresh = job_size / 100;
  uintptr_t pct = 1;
  // per-idx1-variant memory: genotype vectors, interval pair, result row
  uintptr_t ulii = founder_ctsplit * sizeof(intptr_t) + 2 * sizeof(int32_t) + marker_idx2_maxw * 2 * sizeof(double);
  uint32_t output_gz = ldip->modifier & LD_REPORT_GZ;
  uint32_t is_inter_chr = g_ld_is_inter_chr;
  uint32_t idx1_subset = (ldip->snpstr || ldip->snps_rl.name_ct);
  uint32_t window_size_m1 = ldip->window_size - 1;
  uint32_t window_bp = ldip->window_bp;
  double window_cm = ldip->window_cm;
  uint32_t thread_ct = g_ld_thread_ct;
  uint32_t chrom_fo_idx = 0;
  uint32_t is_haploid = 0;
  uint32_t is_x = 0;
  uint32_t is_y = 0;
  uint32_t not_first_write = 0;
  uint32_t chrom_last = 0;
  uint32_t marker_uidx2_back = 0;
  uint32_t marker_uidx2_fwd = 0;
  uint32_t marker_uidx2_fwd2 = 0;
  uint32_t window_trail_ct = 0;
  uint32_t window_lead_ct = 0;
  int32_t x_code = chrom_info_ptr->xymt_codes[X_OFFSET];
  uint32_t xstart = 0;
  uint32_t xend = 0;
  int32_t retval = 0;
  uintptr_t* loadbuf;
  uintptr_t* dummy_nm;
  uintptr_t* ulptr;
  uint32_t* uiptr;
  unsigned char* overflow_buf;
  unsigned char* bigstack_mark2;
  uintptr_t cur_bigstack_left;
  uintptr_t thread_workload;
  uintptr_t idx1_block_size;
  uintptr_t idx2_block_size;
  uintptr_t cur_idx2_block_size;
  uintptr_t marker_idx2;
  uintptr_t marker_uidx1;
  uintptr_t marker_uidx1_tmp;
  uintptr_t marker_uidx2_base;
  uintptr_t marker_uidx2;
  uintptr_t marker_idx2_base;
  uintptr_t marker_idx2_end;
  uintptr_t block_idx1;
  uintptr_t block_idx2;
  uintptr_t uljj;
  uint32_t chrom_idx;
  uint32_t chrom_end;
  uint32_t cur_marker_pos;
  double cur_marker_cm;
  uint32_t is_last_block;
  uint32_t uii;
  if (bigstack_alloc_uc(262144, &overflow_buf) ||
      bigstack_alloc_ul(founder_ctl * 2, &loadbuf) ||
      bigstack_alloc_ul(founder_ctl, &dummy_nm)) {
    goto ld_report_dprime_ret_NOMEM;
  }
  loadbuf[founder_ctl * 2 - 2] = 0;
  loadbuf[founder_ctl * 2 - 1] = 0;
  fill_all_bits(founder_ct, dummy_nm);
  g_ld_thread_wkspace = nullptr;
  if ((x_code != -2) && is_set(chrom_info_ptr->chrom_mask, x_code)) {
    uii = get_chrom_start_vidx(chrom_info_ptr, (uint32_t)x_code);
    chrom_end = get_chrom_end_vidx(chrom_info_ptr, (uint32_t)x_code);
    // chrom_end temporarily holds the number of non-excluded X variants
    chrom_end = chrom_end - uii - popcount_bit_idx(marker_exclude, uii, chrom_end);
    if (chrom_end) {
      if (bigstack_alloc_ul(round_up_pow2(founder_ctsplit, CACHELINE_WORD) * thread_ct, &g_ld_thread_wkspace)) {
        goto ld_report_dprime_ret_NOMEM;
      }
      // [xstart, xend) = X chromosome range in filtered variant indices
      xstart = uii - popcount_bit_idx(marker_exclude, 0, uii);
      xend = xstart + chrom_end;
      g_ld_sex_male = sex_male;
    }
  }
  cur_bigstack_left = bigstack_left();
  if (cur_bigstack_left < 2 * CACHELINE) {
    goto ld_report_dprime_ret_NOMEM;
  }
  idx1_block_size = (cur_bigstack_left - 2 * CACHELINE) / (ulii * 2 + 1);
  thread_workload = idx1_block_size / thread_ct;
  if (!thread_workload) {
    goto ld_report_dprime_ret_NOMEM;
  }
  idx1_block_size = thread_workload * thread_ct;
  if (idx1_block_size > job_size) {
    idx1_block_size = job_size;
  }
  if (bigstack_alloc_ul(founder_ctsplit * idx1_block_size, &g_ld_geno1) ||
      bigstack_alloc_ul(BITCT_TO_WORDCT(idx1_block_size), &g_epi_zmiss1) ||
      bigstack_alloc_ui(idx1_block_size * 2, &g_ld_interval1) ||
      // double size since both r/r^2 and dprime are needed
      // (marker_idx2_maxw only needs to be divisible by 4 as a result)
      bigstack_alloc_d(marker_idx2_maxw * 2 * idx1_block_size, &g_ld_results)) {
    goto ld_report_dprime_ret_NOMEM;
  }
  // zero the last word of each of the three genotype vectors per variant
  for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
    g_ld_geno1[block_idx1 * founder_ctsplit + founder_ctv3 - 1] = 0;
    g_ld_geno1[block_idx1 * founder_ctsplit + 2 * founder_ctv3 - 1] = 0;
    g_ld_geno1[block_idx1 * founder_ctsplit + founder_ctsplit - 1] = 0;
  }

  // per-idx2-variant memory: genotype vectors, zmiss bit, three totals
  ulii = founder_ctsplit * sizeof(intptr_t) + 1 + 3 * sizeof(int32_t);
  cur_bigstack_left = bigstack_left();
  if (cur_bigstack_left >= CACHELINE) {
    cur_bigstack_left -= CACHELINE;
  }
  idx2_block_size = (cur_bigstack_left / ulii) & (~(7 * ONELU));
  if (idx2_block_size > marker_ct) {
    idx2_block_size = round_up_pow2(marker_ct, 8);
  }
  bigstack_mark2 = g_bigstack_base;
  // shrink the idx2 block until all three allocations succeed
  while (1) {
    if (!idx2_block_size) {
      goto ld_report_dprime_ret_NOMEM;
    }
    if (!(bigstack_alloc_ul(founder_ctsplit * idx2_block_size, &g_ld_geno2) ||
          bigstack_alloc_ul(BITCT_TO_WORDCT(idx2_block_size), &g_epi_zmiss2) ||
          bigstack_alloc_ui(idx2_block_size * 3, &g_epi_tot2))) {
      break;
    }
    bigstack_reset(bigstack_mark2);
    idx2_block_size -= 4;
  }
  for (block_idx2 = 0; block_idx2 < idx2_block_size; block_idx2++) {
    g_ld_geno2[block_idx2 * founder_ctsplit + founder_ctv3 - 1] = 0;
    g_ld_geno2[block_idx2 * founder_ctsplit + 2 * founder_ctv3 - 1] = 0;
    g_ld_geno2[block_idx2 * founder_ctsplit + founder_ctsplit - 1] = 0;
  }
  marker_uidx1 = next_unset_unsafe(marker_exclude_idx1, 0);
  if (marker_idx1) {
    marker_uidx1 = jump_forward_unset_unsafe(marker_exclude_idx1, marker_uidx1 + 1, marker_idx1);
  }
  LOGPRINTF("--r%s%s%s d%s%s...", g_ld_is_r2? "2" : "", is_inter_chr? " inter-chr" : "", g_ld_marker_allele_ptrs? " in-phase" : "", (g_ld_modifier & LD_D)? "" : ((g_ld_modifier & LD_DPRIME)? "prime" : "prime-signed"), g_ld_set_allele_freqs? " with-freqs" : "");
  fputs(" 0%", stdout);
  while (1) {
    fputs(" [processing]", stdout);
    fflush(stdout);
    if (idx1_block_size > marker_idx1_end - marker_idx1) {
      idx1_block_size = marker_idx1_end - marker_idx1;
      if (idx1_block_size < thread_ct) {
        thread_ct = idx1_block_size;
        g_ld_thread_ct = thread_ct;
      }
    }
    g_ld_idx1_block_size = idx1_block_size;
    marker_uidx1_tmp = marker_uidx1;
    // X chromosome range as [start, end) offsets within this idx1 block;
    // ld_dprime_thread() compares block_idx1 against both values.
    // bugfix: the end used to be computed relative to the range start (i.e.
    // it was a length), and stale values were kept for blocks without any X
    // variant.
    if ((marker_idx1 < xend) && (marker_idx1 + idx1_block_size > xstart)) {
      uii = MAXV(marker_idx1, xstart);
      g_ld_xstart1 = uii - marker_idx1;
      g_ld_xend1 = MINV(xend, marker_idx1 + idx1_block_size) - marker_idx1;
    } else {
      g_ld_xstart1 = 0;
      g_ld_xend1 = 0;
    }

    if (idx1_subset) {
      if (!is_inter_chr) {
        chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx1_tmp);
        marker_uidx2_base = window_back(marker_pos, marker_cms, marker_exclude, next_unset_unsafe(marker_exclude, chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx]), marker_uidx1, window_size_m1, window_bp, window_cm, &uii);
        marker_idx2_base = marker_uidx2_base - popcount_bit_idx(marker_exclude, 0, marker_uidx2_base);
        marker_idx2 = marker_idx2_base + uii;
      } else {
        marker_uidx2_base = next_unset_unsafe(marker_exclude, 0);
        marker_idx2_base = 0;
        marker_idx2 = 0;
      }
    } else {
      marker_idx2_base = marker_uidx1 + 1 - popcount_bit_idx(marker_exclude, 0, marker_uidx1);
      if (marker_idx2_base == marker_ct) {
        // first idx1 variant of this block is the last variant overall:
        // nothing to pair it with
        goto ld_report_dprime_done;
      }
      marker_idx2 = marker_idx2_base - 1;
      marker_uidx2_base = next_unset_unsafe(marker_exclude, marker_uidx1 + 1);
    }
    if (fseeko(bedfile, bed_offset + (marker_uidx1 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto ld_report_dprime_ret_READ_FAIL;
    }
    // forces a chromosome lookup (and uii = 1) on the first idx1 variant
    chrom_end = 0;
    fill_ulong_zero(BITCT_TO_WORDCT(idx1_block_size), g_epi_zmiss1);
    for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx1_tmp++, block_idx1++, marker_idx2++) {
      if (IS_SET(marker_exclude_idx1, marker_uidx1_tmp)) {
        ulii = next_unset_ul_unsafe(marker_exclude_idx1, marker_uidx1_tmp);
        // number of idx2-eligible variants skipped over
        uljj = ulii - marker_uidx1_tmp - popcount_bit_idx(marker_exclude, marker_uidx1_tmp, ulii);
        if (uljj) {
          uii = 1;
          marker_idx2 += uljj;
        }
        marker_uidx1_tmp = ulii;
        if (fseeko(bedfile, bed_offset + (marker_uidx1_tmp * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
          goto ld_report_dprime_ret_READ_FAIL;
        }
      }
      if (marker_uidx1_tmp >= chrom_end) {
        chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx1_tmp);
        chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
        chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
        chrom_last = prev_unset_unsafe(marker_exclude, chrom_end);
        is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
        is_x = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[X_OFFSET]);
        is_y = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[Y_OFFSET]);
        uii = 1;
      }
      if (!is_inter_chr) {
        // uii == 0 if we can perform an incremental update, 1 if we need
        // fully-powered window_back()/window_forward()
        if (uii) {
          if (idx1_subset) {
            marker_uidx2_back = window_back(marker_pos, marker_cms, marker_exclude, next_unset_unsafe(marker_exclude, chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx]), marker_uidx1_tmp, window_size_m1, window_bp, window_cm, &window_trail_ct);
          }
          marker_uidx2_fwd = window_forward(marker_pos, marker_cms, marker_exclude, marker_uidx1_tmp, chrom_last, window_size_m1, window_bp, window_cm, &window_lead_ct);
          marker_uidx2_fwd2 = marker_uidx2_fwd;
          if (marker_uidx2_fwd < chrom_last) {
            marker_uidx2_fwd2++;
            next_unset_unsafe_ck(marker_exclude, &marker_uidx2_fwd2);
          }
          uii = 0;
        } else {
          if (idx1_subset) {
            if (window_trail_ct == window_size_m1) {
              marker_uidx2_back++;
              next_unset_unsafe_ck(marker_exclude, &marker_uidx2_back);
            } else {
              window_trail_ct++;
            }
            cur_marker_pos = marker_pos[marker_uidx1_tmp];
            if (cur_marker_pos > window_bp) {
              cur_marker_pos -= window_bp;
              while (marker_pos[marker_uidx2_back] < cur_marker_pos) {
                window_trail_ct--;
                marker_uidx2_back++;
                next_unset_unsafe_ck(marker_exclude, &marker_uidx2_back);
              }
            }
            if (marker_cms) {
              cur_marker_cm = marker_cms[marker_uidx1_tmp] - window_cm;
              while (marker_cms[marker_uidx2_back] < cur_marker_cm) {
                window_trail_ct--;
                marker_uidx2_back++;
                next_unset_unsafe_ck(marker_exclude, &marker_uidx2_back);
              }
            }
          }
          if (marker_uidx2_fwd < chrom_last) {
            cur_marker_pos = marker_pos[marker_uidx1_tmp] + window_bp;
            if (!marker_cms) {
              while (marker_pos[marker_uidx2_fwd2] <= cur_marker_pos) {
                marker_uidx2_fwd = marker_uidx2_fwd2;
                window_lead_ct++;
                if (marker_uidx2_fwd == chrom_last) {
                  break;
                }
                marker_uidx2_fwd2++;
                next_unset_unsafe_ck(marker_exclude, &marker_uidx2_fwd2);
                if (window_lead_ct > window_size_m1) {
                  break;
                }
              }
            } else {
              cur_marker_cm = marker_cms[marker_uidx1_tmp] + window_cm;
              // bugfix: compare against the upper cM bound for this variant
              // (cur_marker_cm), not the bare window width
              while ((marker_pos[marker_uidx2_fwd2] <= cur_marker_pos) && (marker_cms[marker_uidx2_fwd2] <= cur_marker_cm)) {
                marker_uidx2_fwd = marker_uidx2_fwd2;
                window_lead_ct++;
                if (marker_uidx2_fwd == chrom_last) {
                  break;
                }
                marker_uidx2_fwd2++;
                next_unset_unsafe_ck(marker_exclude, &marker_uidx2_fwd2);
                if (window_lead_ct > window_size_m1) {
                  break;
                }
              }
            }
          }
          // the current variant itself has left the lead window
          window_lead_ct--;
        }
      }
      // store this row's idx2 window as a half-open range relative to
      // marker_idx2_base
      if (!is_inter_chr) {
        if (idx1_subset) {
          g_ld_interval1[block_idx1 * 2] = marker_idx2 - window_trail_ct - marker_idx2_base;
        } else {
          g_ld_interval1[block_idx1 * 2] = marker_idx2 + 1 - marker_idx2_base;
        }
        g_ld_interval1[block_idx1 * 2 + 1] = marker_idx2 + window_lead_ct + 1 - marker_idx2_base;
      } else {
        if (!idx1_subset) {
          g_ld_interval1[block_idx1 * 2] = marker_idx2 + 1 - marker_idx2_base;
        } else {
          g_ld_interval1[block_idx1 * 2] = 0;
        }
        g_ld_interval1[block_idx1 * 2 + 1] = marker_ct - marker_idx2_base;
      }

      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1_tmp), bedfile, loadbuf_raw, loadbuf)) {
        goto ld_report_dprime_ret_READ_FAIL;
      }
      if (is_haploid && hh_exists) {
        haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)loadbuf);
      }
      load_and_split3(nullptr, loadbuf, founder_ct, &(g_ld_geno1[block_idx1 * founder_ctsplit]), dummy_nm, dummy_nm, founder_ctv3, 0, 0, 1, &ulii);
      if (ulii == 3) {
        SET_BIT(block_idx1, g_epi_zmiss1);
      }
    }
    marker_uidx2 = marker_uidx2_base;
    if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto ld_report_dprime_ret_READ_FAIL;
    }

    cur_idx2_block_size = idx2_block_size;
    // end of the last row's window = number of idx2 positions to process
    uljj = g_ld_interval1[2 * idx1_block_size - 1];
    marker_idx2_end = uljj + marker_idx2_base;
    marker_idx2_maxw = round_up_pow2(uljj, 4);
    if (marker_idx2_maxw > orig_marker_ctm8) {
      marker_idx2_maxw = orig_marker_ctm8;
    }
    g_ld_marker_ctm8 = marker_idx2_maxw;
    marker_idx2 = marker_idx2_base;
    // bugfix: chrom_end still described the last idx1 variant's chromosome
    // here, so idx2 variants on an earlier chromosome were loaded with that
    // chromosome's is_haploid/is_x/is_y flags.  Force a fresh lookup for the
    // first idx2 variant and keep chrom_end current inside the loop.
    chrom_end = 0;
    do {
      if (cur_idx2_block_size > marker_idx2_end - marker_idx2) {
        cur_idx2_block_size = marker_idx2_end - marker_idx2;
      }
      // X chromosome range as [start, end) offsets within this idx2 block
      // (same convention and same bugfix as for g_ld_xstart1/g_ld_xend1)
      if ((marker_idx2 < xend) && (marker_idx2 + cur_idx2_block_size > xstart)) {
        uii = MAXV(marker_idx2, xstart);
        g_ld_xstart2 = uii - marker_idx2;
        g_ld_xend2 = MINV(xend, marker_idx2 + cur_idx2_block_size) - marker_idx2;
      } else {
        g_ld_xstart2 = 0;
        g_ld_xend2 = 0;
      }
      fill_ulong_zero(BITCT_TO_WORDCT(cur_idx2_block_size), g_epi_zmiss2);
      for (block_idx2 = 0; block_idx2 < cur_idx2_block_size; marker_uidx2++, block_idx2++) {
        if (IS_SET(marker_exclude, marker_uidx2)) {
          marker_uidx2 = next_unset_ul_unsafe(marker_exclude, marker_uidx2);
          if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
            goto ld_report_dprime_ret_READ_FAIL;
          }
        }
        if (marker_uidx2 >= chrom_end) {
          chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx2);
          chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
          chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
          is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
          is_x = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[X_OFFSET]);
          is_y = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[Y_OFFSET]);
        }
        if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf_raw, loadbuf)) {
          goto ld_report_dprime_ret_READ_FAIL;
        }
        if (is_haploid && hh_exists) {
          haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)loadbuf);
        }
        ulptr = &(g_ld_geno2[block_idx2 * founder_ctsplit]);
        load_and_split3(nullptr, loadbuf, founder_ct, ulptr, dummy_nm, dummy_nm, founder_ctv3, 0, 0, 1, &ulii);
        // per-genotype-class totals, used by the worker to fill in count
        // table cells by subtraction
        uiptr = &(g_epi_tot2[block_idx2 * 3]);
        uiptr[0] = popcount_longs(ulptr, founder_ctv3);
        uiptr[1] = popcount_longs(&(ulptr[founder_ctv3]), founder_ctv3);
        uiptr[2] = popcount_longs(&(ulptr[2 * founder_ctv3]), founder_ctv3);
        if (ulii == 3) {
          SET_BIT(block_idx2, g_epi_zmiss2);
        }
      }
      g_ld_idx2_block_size = cur_idx2_block_size;
      g_ld_idx2_block_start = marker_idx2 - marker_idx2_base;
      marker_idx2 += cur_idx2_block_size;
      is_last_block = (marker_idx2 >= marker_idx2_end);
      if (spawn_threads2(threads, &ld_dprime_thread, thread_ct, is_last_block)) {
        goto ld_report_dprime_ret_THREAD_CREATE_FAIL;
      }
      // the calling thread acts as worker 0
      ld_dprime_thread((void*)0);
      join_threads2(threads, thread_ct, is_last_block);
    } while (!is_last_block);

    fputs("\b\b\b\b\b\b\b\b\b\b\bwriting]   \b\b\b", stdout);
    fflush(stdout);
    // emitter state for ld_regular_emitn
    g_ld_marker_uidx1 = marker_uidx1;
    g_ld_block_idx1 = 0;
    g_ld_uidx2_start = marker_uidx2_base;
    g_ld_idx2_block_start = 0;
    g_ld_block_idx2 = 0;
    if (output_gz) {
      parallel_compress(outname, overflow_buf, not_first_write, ld_regular_emitn);
    } else {
      write_uncompressed(outname, overflow_buf, not_first_write, ld_regular_emitn);
    }
    not_first_write = 1;
    g_ld_is_first_block = 0;
  ld_report_dprime_done:
    marker_idx1 += idx1_block_size;
    fputs("\b\b\b\b\b\b\b\b\b\b          \b\b\b\b\b\b\b\b\b\b", stdout);
    if (marker_idx1 >= pct_thresh) {
      if (pct > 10) {
        putc_unlocked('\b', stdout);
      }
      pct = ((marker_idx1 - marker_idx1_start) * 100LLU) / job_size;
      if (pct < 100) {
        printf("\b\b%" PRIuPTR "%%", pct);
        fflush(stdout);
        pct_thresh = marker_idx1_start + ((++pct) * ((uint64_t)job_size)) / 100;
      }
    }
    if (marker_idx1 == marker_idx1_end) {
      break;
    }
    marker_uidx1 = jump_forward_unset_unsafe(marker_exclude_idx1, marker_uidx1 + 1, idx1_block_size);
  }
  fputs("\b\b\b", stdout);
  logprint(" done.\n");
  LOGPRINTFWW("Results written to %s .\n", outname);

  // error exits; never entered by fallthrough
  while (0) {
  ld_report_dprime_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  ld_report_dprime_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  ld_report_dprime_ret_THREAD_CREATE_FAIL:
    retval = RET_THREAD_CREATE_FAIL;
    break;
  }
  return retval;
}
5669 
// Produces the non-matrix --r/--r2 report (ld_report() dispatches here when
// none of the square/triangle matrix flags are set).
//
// Besides its parameters, this function reads a number of settings from the
// g_ld_* globals (variant exclusion mask, IDs, positions, cM values, founder
// counts, thread count, ...), and publishes its working buffers and block
// bookkeeping through further g_ld_* globals for ld_block_thread() and
// ld_regular_emitn() to consume.
//
// Terminology used below:
//   "idx1" variants are the ones a report row is anchored on; they can be
//          restricted via --ld-snp / --ld-snps / --ld-snp-list.
//   "idx2" variants are the partners each idx1 variant is compared against.
//
// Parameters:
//   threads               - thread handles passed to spawn_threads2() /
//                           join_threads2().
//   ldip                  - LD settings (modifier flags, window parameters,
//                           idx1 restriction).
//   bedfile, bed_offset   - open genotype file and the byte offset of the
//                           first variant record in it.
//   unfiltered_marker_ct  - variant count before filtering.
//   marker_reverse        - per-variant bitmask forwarded to
//                           load_and_collapse_incl().
//   unfiltered_sample_ct, founder_info - sample count before filtering, and
//                           the mask of samples to load.
//   parallel_idx, parallel_tot - 0-based --parallel slice and slice count.
//   sex_male, founder_include2, founder_male_include2, hh_exists - inputs
//                           for haploid/X handling.
//   loadbuf               - raw genotype load buffer supplied by the caller.
//   outname               - output filename.
//
// Returns 0 on success, or a RET_* error code.  Memory taken from the
// bigstack is not released here; the caller is expected to reset it.
int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uint32_t parallel_idx, uint32_t parallel_tot, uintptr_t* sex_male, uintptr_t* founder_include2, uintptr_t* founder_male_include2, uintptr_t* loadbuf, char* outname, uint32_t hh_exists) {
  FILE* infile = nullptr;
  uintptr_t* marker_exclude = g_ld_marker_exclude;
  char* marker_ids = g_ld_marker_ids;
  uintptr_t max_marker_id_len = g_ld_max_marker_id_len;
  uint32_t ld_modifier = ldip->modifier;
  uint32_t output_gz = ld_modifier & LD_REPORT_GZ;
  uint32_t ignore_x = (ld_modifier & LD_IGNORE_X) & 1;
  uint32_t is_inter_chr = ld_modifier & LD_INTER_CHR;
  uint32_t snp_list_file = ld_modifier & LD_SNP_LIST_FILE;
  uintptr_t marker_ct = g_ld_marker_ct;
  // number of idx1 variants; equals marker_ct unless an idx1 subset is given
  uintptr_t marker_ct1 = marker_ct;
  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
  // bytes per variant record in the .bed file (4 samples per byte)
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t founder_ct = g_ld_founder_ct;
  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
  // words reserved per variant in each genotype / missing-mask buffer
  uintptr_t founder_ct_192_long = g_ld_founder_ct_mld_m1 * (MULTIPLEX_LD / BITCT2) + g_ld_founder_ct_mld_rem * (192 / BITCT2);
  uintptr_t final_mask = get_final_mask(founder_ct);
  uintptr_t pct = 1;
  uintptr_t marker_idx2_maxw = 1;
  Chrom_info* chrom_info_ptr = g_ld_chrom_info_ptr;
  // aliases marker_exclude unless an idx1 subset forces a separate mask
  uintptr_t* marker_exclude_idx1 = marker_exclude;
  uint32_t* marker_pos = g_ld_marker_pos;
  double* marker_cms = g_ld_marker_cms;
  uint32_t founder_trail_ct = founder_ct_192_long - founder_ctl * 2;
  uint32_t idx1_subset = (ldip->snpstr || ldip->snps_rl.name_ct);
  uint32_t window_size_m1 = ldip->window_size - 1;
  uint32_t window_bp = ldip->window_bp;
  double window_cm = ldip->window_cm;
  uint32_t thread_ct = g_ld_thread_ct;
  uint32_t chrom_fo_idx = 0;
  uint32_t chrom_fo_idx2 = 0;
  uint32_t is_haploid = 0;
  uint32_t is_x = 0;
  uint32_t is_y = 0;
  uint32_t not_first_write = 0;
  uint32_t marker_uidx2_back = 0;
  uint32_t marker_uidx2_fwd = 0;
  uint32_t marker_uidx2_fwd2 = 0;
  uint32_t window_trail_ct = 0;
  uint32_t window_lead_ct = 0;
  uint32_t chrom_last = 0;
  int32_t retval = 0;
  unsigned char* bigstack_mark2;
  unsigned char* overflow_buf;
  uint32_t* id_map;
  char* sorted_ids;
  char* bufptr;
  uintptr_t thread_workload;
  uintptr_t idx1_block_size;
  uintptr_t idx2_block_size;
  uintptr_t cur_idx2_block_size;
  uintptr_t orig_marker_ctm8;
  uintptr_t marker_idx1_start;
  uintptr_t marker_idx1;
  uintptr_t marker_idx1_end;
  uintptr_t marker_idx2;
  uintptr_t job_size;
  uintptr_t pct_thresh;
  uintptr_t marker_uidx1;
  uintptr_t marker_uidx1_tmp;
  uintptr_t marker_uidx2_base;
  uintptr_t marker_uidx2;
  uintptr_t marker_idx2_base;
  uintptr_t marker_idx2_end;
  uintptr_t block_idx1;
  uintptr_t block_idx2;
  uintptr_t snplist_ct;
  uintptr_t max_snplist_id_len;
  uintptr_t ulii;
  uintptr_t uljj;
  uint32_t window_size_ceil;
  uint32_t chrom_idx;
  uint32_t chrom_end;
  uint32_t chrom_idx2;
  uint32_t chrom_end2;
  uint32_t cur_marker_pos;
  double cur_marker_cm;
  uint32_t is_last_block;
  uint32_t uii;
  int32_t ii;
  // scratch buffer handed to parallel_compress()/write_uncompressed() below
  if (bigstack_alloc_uc(262144, &overflow_buf)) {
    goto ld_report_regular_ret_NOMEM;
  }
  if (idx1_subset) {
    // Build a dedicated idx1 exclusion mask: start with every bit set, then
    // clear the bits of the requested variants.
    if (bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude_idx1)) {
      goto ld_report_regular_ret_NOMEM;
    }
    fill_all_bits(unfiltered_marker_ct, marker_exclude_idx1);
    marker_uidx1 = next_unset_unsafe(marker_exclude, 0);
    if (ldip->snpstr && (!snp_list_file)) {
      // Single variant ID: linear scan over the unexcluded variants.  The
      // comparison length includes the null terminator, so an ID that
      // cannot fit in a marker_ids slot can never match.
      bufptr = ldip->snpstr;
      uii = strlen(bufptr) + 1;
      if (uii > max_marker_id_len) {
        goto ld_report_regular_ret_EMPTY_SET1;
      }
      for (marker_idx1 = 0; marker_idx1 < marker_ct; marker_uidx1++, marker_idx1++) {
        next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx1);
        if (!memcmp(&(marker_ids[marker_uidx1 * max_marker_id_len]), bufptr, uii)) {
          break;
        }
      }
      if (marker_idx1 == marker_ct) {
        goto ld_report_regular_ret_EMPTY_SET1;
      }
      clear_bit_ul(marker_uidx1, marker_exclude_idx1);
      marker_ct1 = 1;
    } else {
      // Multiple IDs (file or range list): sort the variant IDs once so that
      // each requested ID can be looked up by binary search.
      marker_ct1 = 0;
      retval = sort_item_ids(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref, &sorted_ids, &id_map);
      if (retval) {
        goto ld_report_regular_ret_1;
      }
      if (snp_list_file) {
        // Here ldip->snpstr is the name of a file of variant IDs.  The file
        // is scanned once to size the token buffer, then rewound and read.
        if (fopen_checked(ldip->snpstr, FOPEN_RB, &infile)) {
          goto ld_report_regular_ret_OPEN_FAIL;
        }
        snplist_ct = 0;
        max_snplist_id_len = 0;
        retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &snplist_ct, &max_snplist_id_len);
        if (retval) {
          goto ld_report_regular_ret_1;
        }
        if (!snplist_ct) {
          goto ld_report_regular_ret_EMPTY_SET1;
        }
        if (bigstack_alloc_c(snplist_ct * max_snplist_id_len, &bufptr)) {
          goto ld_report_regular_ret_NOMEM;
        }
        rewind(infile);
        retval = read_tokens(MAXLINELEN, snplist_ct, max_snplist_id_len, infile, g_textbuf, bufptr);
        if (retval) {
          goto ld_report_regular_ret_1;
        }
        if (fclose_null(&infile)) {
          goto ld_report_regular_ret_READ_FAIL;
        }
        for (marker_idx1 = 0; marker_idx1 < snplist_ct; marker_idx1++) {
          ii = bsearch_str_nl(&(bufptr[marker_idx1 * max_snplist_id_len]), sorted_ids, max_marker_id_len, marker_ct);
          // IDs that are not found (-1) are skipped without complaint
          if (ii != -1) {
            uii = id_map[(uint32_t)ii];
            // an already-cleared bit means this ID was listed before
            if (!is_set(marker_exclude_idx1, uii)) {
              logerrprint("Error: Duplicate variant ID in --ld-snp-list file.\n");
              goto ld_report_regular_ret_INVALID_FORMAT;
            }
            clear_bit(uii, marker_exclude_idx1);
            marker_ct1++;
          }
        }
      } else {
        retval = string_range_list_to_bitarr2(sorted_ids, id_map, marker_ct, max_marker_id_len, &(ldip->snps_rl), "ld-snps", marker_exclude_idx1);
        if (retval) {
          goto ld_report_regular_ret_1;
        }
        // NOTE(review): presumably merges marker_exclude into
        // marker_exclude_idx1 so filtered-out variants stay excluded --
        // confirm against bitvec_or()'s argument order.
        bitvec_or(marker_exclude, unfiltered_marker_ctl, marker_exclude_idx1);
        // bugfix, 13 Jan 2017
        // another bugfix, 28 Mar 2017: popcounted the wrong array...
        marker_ct1 = unfiltered_marker_ct - popcount_longs(marker_exclude_idx1, unfiltered_marker_ctl);
      }
      if (!marker_ct1) {
        goto ld_report_regular_ret_EMPTY_SET1;
      }
      // the ID lookup tables are no longer needed
      bigstack_reset(id_map);
    }
  }
  // every --parallel slice must receive at least two idx1 variants
  if ((parallel_tot > 1) && (marker_ct1 < 2 * parallel_tot)) {
    LOGERRPRINTF("Error: Too few variants in --r%s run for --parallel %u %u.\n", g_ld_is_r2? "2" : "", parallel_idx + 1, parallel_tot);
    goto ld_report_regular_ret_INVALID_CMDLINE;
  }
  // yeah, this is uneven in the inter-chr case
  marker_idx1_start = (((uint64_t)parallel_idx) * marker_ct1) / parallel_tot;
  marker_idx1 = marker_idx1_start;
  marker_idx1_end = (((uint64_t)(parallel_idx + 1)) * marker_ct1) / parallel_tot;
  job_size = marker_idx1_end - marker_idx1_start;
  // marker_idx1 is an absolute index (it starts at marker_idx1_start), so the
  // first progress threshold must be absolute as well, matching the way it is
  // refreshed inside the main loop.
  pct_thresh = marker_idx1_start + job_size / 100;

  // marker_idx2_maxw becomes the per-idx1-variant width of the results
  // buffer (see the memory cost breakdown below).
  if (is_inter_chr) {
    marker_idx2_maxw = marker_ct + idx1_subset - 1;
  } else {
    window_size_ceil = (idx1_subset + 1) * (window_size_m1 + 1) - 1;
    // for small windows the count-based ceiling is used directly; otherwise
    // each chromosome is scanned with chrom_window_max()
    if ((window_size_m1 < 12) || ((!idx1_subset) && (window_size_m1 <= 16))) {
      marker_idx2_maxw = window_size_ceil;
    } else {
      for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
        marker_idx2_maxw = chrom_window_max(marker_pos, marker_exclude, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], window_size_ceil, window_bp * (idx1_subset + 1), marker_idx2_maxw);
      }
    }
  }

  g_ld_marker_exclude_idx1 = marker_exclude_idx1;
  g_ld_marker_exclude = marker_exclude;
  g_ld_is_inter_chr = is_inter_chr;

  g_ld_is_first_block = (!parallel_idx);
  // the threshold is stored on the r scale when plain r (not r^2) is reported
  if (g_ld_is_r2) {
    g_ld_window_r2 = ldip->window_r2;
  } else {
    g_ld_window_r2 = sqrt(ldip->window_r2);
  }
  if (ld_modifier & LD_DX) {
    // this is more like --fast-epistasis under the hood, since it requires the
    // entire 3x3 table
    g_ld_marker_ctm8 = round_up_pow2(marker_idx2_maxw, 4);
    retval = ld_report_dprime(threads, ldip, bedfile, bed_offset, marker_reverse, unfiltered_sample_ct, founder_info, sex_male, founder_include2, founder_male_include2, loadbuf, outname, hh_exists, marker_idx1_start, marker_idx1_end);
    goto ld_report_regular_ret_1;
  }
  marker_idx2_maxw = round_up_pow2(marker_idx2_maxw, 8);
  orig_marker_ctm8 = marker_idx2_maxw;
  g_ld_marker_ctm8 = marker_idx2_maxw;
  g_ld_keep_sign = 1;
  // each marker costs
  //   founder_ct_192_long * sizeof(intptr_t) for genotype buffer
  // + founder_ct_192_long * sizeof(intptr_t) for missing mask buffer
  // + sizeof(int32_t) for g_ld_missing_cts1 entry
  // + 2 * sizeof(int32_t) for window offset and size
  // + marker_idx2_maxw * sizeof(double) for g_ld_results buffer
  // round down to multiple of thread_ct for better workload distribution
  ulii = founder_ct_192_long * 2 * sizeof(intptr_t) + 3 * sizeof(int32_t) + marker_idx2_maxw * sizeof(double);
  // only half of the remaining memory is budgeted for the idx1 block
  idx1_block_size = bigstack_left() / (ulii * 2);
  thread_workload = idx1_block_size / thread_ct;
  if (!thread_workload) {
    goto ld_report_regular_ret_NOMEM;
  }
  idx1_block_size = thread_workload * thread_ct;
  if (idx1_block_size > job_size) {
    idx1_block_size = job_size;
  }
  // Check every allocation rather than only the last one, so a failure can
  // never leave one of the idx1 buffers unusable.  Allocation order is
  // unchanged.
  if (bigstack_alloc_ul(founder_ct_192_long * idx1_block_size, &g_ld_geno1) ||
      bigstack_alloc_ul(founder_ct_192_long * idx1_block_size, &g_ld_geno_masks1) ||
      bigstack_alloc_ui(idx1_block_size, &g_ld_missing_cts1) ||
      bigstack_alloc_ui(idx1_block_size * 2, &g_ld_interval1) ||
      bigstack_alloc_d(marker_idx2_maxw * idx1_block_size, &g_ld_results)) {
    goto ld_report_regular_ret_NOMEM;
  }

  // idx2 variants need no interval or results entries, so subtract those
  // from the per-variant cost.  The block size is kept a multiple of 8.
  ulii -= 2 * sizeof(int32_t) + marker_idx2_maxw * sizeof(double);
  idx2_block_size = (bigstack_left() / ulii) & (~(7 * ONELU));
  if (idx2_block_size > marker_ct) {
    idx2_block_size = round_up_pow2(marker_ct, 8);
  }
  bigstack_mark2 = g_bigstack_base;
  // shrink the idx2 block by 8 variants at a time until all three buffers fit
  while (1) {
    if (!idx2_block_size) {
      goto ld_report_regular_ret_NOMEM;
    }
    if (!(bigstack_alloc_ul(founder_ct_192_long * idx2_block_size, &g_ld_geno2) ||
          bigstack_alloc_ul(founder_ct_192_long * idx2_block_size, &g_ld_geno_masks2) ||
          bigstack_alloc_ui(idx2_block_size, &g_ld_missing_cts2))) {
      break;
    }
    bigstack_reset(bigstack_mark2);
    idx2_block_size -= 8;
  }
  // zero the last (founder_trail_ct + 2) words of every per-variant slot in
  // the genotype and mask buffers
  uljj = founder_trail_ct + 2;
  for (ulii = 1; ulii <= idx1_block_size; ulii++) {
    fill_ulong_zero(uljj, &(g_ld_geno1[ulii * founder_ct_192_long - uljj]));
    fill_ulong_zero(uljj, &(g_ld_geno_masks1[ulii * founder_ct_192_long - uljj]));
  }
  for (ulii = 1; ulii <= idx2_block_size; ulii++) {
    fill_ulong_zero(uljj, &(g_ld_geno2[ulii * founder_ct_192_long - uljj]));
    fill_ulong_zero(uljj, &(g_ld_geno_masks2[ulii * founder_ct_192_long - uljj]));
  }
  // position marker_uidx1 on the first idx1 variant of this --parallel slice
  marker_uidx1 = next_unset_unsafe(marker_exclude_idx1, 0);
  if (marker_idx1) {
    marker_uidx1 = jump_forward_unset_unsafe(marker_exclude_idx1, marker_uidx1 + 1, marker_idx1);
  }
  sprintf(g_logbuf, "--r%s%s%s%s to %s ... ", g_ld_is_r2? "2" : "", is_inter_chr? " inter-chr" : "", g_ld_marker_allele_ptrs? " in-phase" : "", g_ld_set_allele_freqs? " with-freqs" : "", outname);
  wordwrapb(16); // strlen("99% [processing]")
  logprintb();
  fputs("0%", stdout);
  // Main loop: one iteration per block of idx1 variants.
  while (1) {
    fputs(" [processing]", stdout);
    fflush(stdout);
    // final block may be short; never run more threads than idx1 variants
    if (idx1_block_size > marker_idx1_end - marker_idx1) {
      idx1_block_size = marker_idx1_end - marker_idx1;
      if (idx1_block_size < thread_ct) {
        thread_ct = idx1_block_size;
        g_ld_thread_ct = thread_ct;
      }
    }
    g_ld_idx1_block_size = idx1_block_size;
    marker_uidx1_tmp = marker_uidx1;

    // Determine where this block's idx2 range starts
    // (marker_uidx2_base / marker_idx2_base).
    if (idx1_subset) {
      if (!is_inter_chr) {
        chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx1_tmp);
        marker_uidx2_base = window_back(marker_pos, marker_cms, marker_exclude, next_unset_unsafe(marker_exclude, chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx]), marker_uidx1, window_size_m1, window_bp, window_cm, &uii);
        marker_idx2_base = marker_uidx2_base - popcount_bit_idx(marker_exclude, 0, marker_uidx2_base);
        marker_idx2 = marker_idx2_base + uii;
      } else {
        marker_uidx2_base = next_unset_unsafe(marker_exclude, 0);
        marker_idx2_base = 0;
        marker_idx2 = 0; // ignored
      }
    } else {
      // without an idx1 subset, only variants after the idx1 variant are
      // partners
      marker_idx2_base = marker_uidx1 + 1 - popcount_bit_idx(marker_exclude, 0, marker_uidx1);
      if (marker_idx2_base == marker_ct) {
        // the last variant has nothing after it: skip straight to the
        // progress update
        goto ld_report_regular_done;
      }
      marker_idx2 = marker_idx2_base - 1;
      marker_uidx2_base = next_unset_unsafe(marker_exclude, marker_uidx1 + 1);
    }
    if (fseeko(bedfile, bed_offset + (marker_uidx1 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto ld_report_regular_ret_READ_FAIL;
    }
    // chrom_end == 0 forces a chromosome lookup on the first variant
    chrom_end = 0;
    // Load each idx1 variant of the block and record its idx2 interval.
    for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx1_tmp++, block_idx1++, marker_idx2++) {
      if (IS_SET(marker_exclude_idx1, marker_uidx1_tmp)) {
        ulii = next_unset_ul_unsafe(marker_exclude_idx1, marker_uidx1_tmp);
        // uljj = number of unexcluded (non-idx1) variants being skipped
        uljj = ulii - marker_uidx1_tmp - popcount_bit_idx(marker_exclude, marker_uidx1_tmp, ulii);
        if (uljj) {
          uii = 1; // recalculate window beginning/end from scratch
          marker_idx2 += uljj;
        }
        marker_uidx1_tmp = ulii;
        if (fseeko(bedfile, bed_offset + (marker_uidx1_tmp * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
          goto ld_report_regular_ret_READ_FAIL;
        }
      }
      if (marker_uidx1_tmp >= chrom_end) {
        // crossed into a new chromosome: refresh chromosome properties and
        // force a full window recalculation
        chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx1_tmp);
        chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
        chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
        chrom_last = prev_unset_unsafe(marker_exclude, chrom_end);
        is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
        is_x = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[X_OFFSET]);
        is_y = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[Y_OFFSET]);
        uii = 1;
      }
      if (!is_inter_chr) {
        // uii == 0 if we can perform an incremental update, 1 if we need
        // fully-powered window_back()/window_forward()
        if (uii) {
          if (idx1_subset) {
            marker_uidx2_back = window_back(marker_pos, marker_cms, marker_exclude, next_unset_unsafe(marker_exclude, chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx]), marker_uidx1_tmp, window_size_m1, window_bp, window_cm, &window_trail_ct);
          }
          marker_uidx2_fwd = window_forward(marker_pos, marker_cms, marker_exclude, marker_uidx1_tmp, chrom_last, window_size_m1, window_bp, window_cm, &window_lead_ct);
          // marker_uidx2_fwd2 tracks the first unexcluded variant past the
          // window's leading edge, when there is one on this chromosome
          marker_uidx2_fwd2 = marker_uidx2_fwd;
          if (marker_uidx2_fwd < chrom_last) {
            marker_uidx2_fwd2++;
            next_unset_unsafe_ck(marker_exclude, &marker_uidx2_fwd2);
          }
          uii = 0;
        } else {
          if (idx1_subset) {
            // slide the trailing edge: either drop the oldest variant (the
            // window is already at its count limit) or grow by one
            if (window_trail_ct == window_size_m1) {
              marker_uidx2_back++;
              next_unset_unsafe_ck(marker_exclude, &marker_uidx2_back);
            } else {
              window_trail_ct++;
            }
            // then enforce the bp limit on the trailing edge...
            cur_marker_pos = marker_pos[marker_uidx1_tmp];
            if (cur_marker_pos > window_bp) {
              cur_marker_pos -= window_bp;
              while (marker_pos[marker_uidx2_back] < cur_marker_pos) {
                window_trail_ct--;
                marker_uidx2_back++;
                next_unset_unsafe_ck(marker_exclude, &marker_uidx2_back);
              }
            }
            // ...and the cM limit, when cM values are in use
            if (marker_cms) {
              cur_marker_cm = marker_cms[marker_uidx1_tmp] - window_cm;
              while (marker_cms[marker_uidx2_back] < cur_marker_cm) {
                window_trail_ct--;
                marker_uidx2_back++;
                next_unset_unsafe_ck(marker_exclude, &marker_uidx2_back);
              }
            }
          }
          // extend the leading edge while the next candidate stays within
          // the bp (and cM) limits, stopping at the chromosome end or once
          // the count limit is exceeded
          if (marker_uidx2_fwd < chrom_last) {
            cur_marker_pos = marker_pos[marker_uidx1_tmp] + window_bp;
            if (!marker_cms) {
              while (marker_pos[marker_uidx2_fwd2] <= cur_marker_pos) {
                marker_uidx2_fwd = marker_uidx2_fwd2;
                window_lead_ct++;
                if (marker_uidx2_fwd == chrom_last) {
                  break;
                }
                marker_uidx2_fwd2++;
                next_unset_unsafe_ck(marker_exclude, &marker_uidx2_fwd2);
                if (window_lead_ct > window_size_m1) {
                  break;
                }
              }
            } else {
              cur_marker_cm = marker_cms[marker_uidx1_tmp] + window_cm;
              while ((marker_pos[marker_uidx2_fwd2] <= cur_marker_pos) && (marker_cms[marker_uidx2_fwd2] <= cur_marker_cm)) {
                marker_uidx2_fwd = marker_uidx2_fwd2;
                window_lead_ct++;
                if (marker_uidx2_fwd == chrom_last) {
                  break;
                }
                marker_uidx2_fwd2++;
                next_unset_unsafe_ck(marker_exclude, &marker_uidx2_fwd2);
                if (window_lead_ct > window_size_m1) {
                  break;
                }
              }
            }
          }
          // the idx1 variant itself advanced by one, so one fewer variant
          // lies ahead of it
          window_lead_ct--;
        }
      }
      // Record the [start, end) idx2 interval for this idx1 variant, relative
      // to marker_idx2_base.
      if (!is_inter_chr) {
        if (idx1_subset) {
          g_ld_interval1[block_idx1 * 2] = marker_idx2 - window_trail_ct - marker_idx2_base;
        } else {
          g_ld_interval1[block_idx1 * 2] = marker_idx2 + 1 - marker_idx2_base;
        }
        g_ld_interval1[block_idx1 * 2 + 1] = marker_idx2 + window_lead_ct + 1 - marker_idx2_base;
      } else {
        if (!idx1_subset) {
          g_ld_interval1[block_idx1 * 2] = marker_idx2 + 1 - marker_idx2_base;
        } else {
          g_ld_interval1[block_idx1 * 2] = 0;
        }
        g_ld_interval1[block_idx1 * 2 + 1] = marker_ct - marker_idx2_base;
      }

      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1_tmp), bedfile, loadbuf, &(g_ld_geno1[block_idx1 * founder_ct_192_long]))) {
        goto ld_report_regular_ret_READ_FAIL;
      }
      if (is_haploid && hh_exists) {
        haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)(&(g_ld_geno1[block_idx1 * founder_ct_192_long])));
      }
      ld_process_load2(&(g_ld_geno1[block_idx1 * founder_ct_192_long]), &(g_ld_geno_masks1[block_idx1 * founder_ct_192_long]), &(g_ld_missing_cts1[block_idx1]), founder_ct, is_x && (!ignore_x), founder_male_include2);
    }
    marker_uidx2 = marker_uidx2_base;
    if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto ld_report_regular_ret_READ_FAIL;
    }

    cur_idx2_block_size = idx2_block_size;
    // the end of the last idx1 variant's interval bounds the idx2 range that
    // has to be loaded for this block
    uljj = g_ld_interval1[2 * idx1_block_size - 1];
    marker_idx2_end = uljj + marker_idx2_base;
    marker_idx2_maxw = round_up_pow2(uljj, 8);
    if (marker_idx2_maxw > orig_marker_ctm8) {
      marker_idx2_maxw = orig_marker_ctm8;
    }
    g_ld_marker_ctm8 = marker_idx2_maxw;
    marker_idx2 = marker_idx2_base;
    chrom_end2 = 0;
    // Stream the idx2 range through in chunks of at most idx2_block_size
    // variants, running the worker threads on each chunk.
    do {
      if (cur_idx2_block_size > marker_idx2_end - marker_idx2) {
        cur_idx2_block_size = marker_idx2_end - marker_idx2;
      }

      for (block_idx2 = 0; block_idx2 < cur_idx2_block_size; marker_uidx2++, block_idx2++) {
        // todo: when set has big holes in the middle, do not load everything
        if (IS_SET(marker_exclude, marker_uidx2)) {
          marker_uidx2 = next_unset_ul_unsafe(marker_exclude, marker_uidx2);
          if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
            goto ld_report_regular_ret_READ_FAIL;
          }
        }
        if (marker_uidx2 >= chrom_end2) {
          chrom_fo_idx2 = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx2);
          chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
          chrom_end2 = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx2 + 1];
          is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx2);
          is_x = (((int32_t)chrom_idx2) == chrom_info_ptr->xymt_codes[X_OFFSET]);
          is_y = (((int32_t)chrom_idx2) == chrom_info_ptr->xymt_codes[Y_OFFSET]);
        }
        if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf, &(g_ld_geno2[block_idx2 * founder_ct_192_long]))) {
          goto ld_report_regular_ret_READ_FAIL;
        }
        if (is_haploid && hh_exists) {
          haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)(&(g_ld_geno2[block_idx2 * founder_ct_192_long])));
        }
        ld_process_load2(&(g_ld_geno2[block_idx2 * founder_ct_192_long]), &(g_ld_geno_masks2[block_idx2 * founder_ct_192_long]), &(g_ld_missing_cts2[block_idx2]), founder_ct, is_x && (!ignore_x), founder_male_include2);
      }

      g_ld_idx2_block_size = cur_idx2_block_size;
      g_ld_idx2_block_start = marker_idx2 - marker_idx2_base;
      marker_idx2 += cur_idx2_block_size;
      is_last_block = (marker_idx2 >= marker_idx2_end);
      if (spawn_threads2(threads, &ld_block_thread, thread_ct, is_last_block)) {
        goto ld_report_regular_ret_THREAD_CREATE_FAIL;
      }
      // the calling thread takes part as worker 0
      ld_block_thread((void*)0);
      join_threads2(threads, thread_ct, is_last_block);
    } while (!is_last_block);

    fputs("\b\b\b\b\b\b\b\b\b\b\bwriting]   \b\b\b", stdout);
    fflush(stdout);
    // reset the emitter's cursor state, then flush this block's results
    g_ld_marker_uidx1 = marker_uidx1;
    g_ld_block_idx1 = 0;
    g_ld_uidx2_start = marker_uidx2_base;
    g_ld_idx2_block_start = 0;
    g_ld_block_idx2 = 0;
    // NOTE(review): the writers' return values are not checked here, so a
    // failed write would go unreported -- confirm whether they report errors
    // some other way.
    if (output_gz) {
      parallel_compress(outname, overflow_buf, not_first_write, ld_regular_emitn);
    } else {
      write_uncompressed(outname, overflow_buf, not_first_write, ld_regular_emitn);
    }
    not_first_write = 1;
    g_ld_is_first_block = 0;
  ld_report_regular_done:
    marker_idx1 += idx1_block_size;
    // erase the status text, then refresh the percentage if a threshold was
    // crossed
    fputs("\b\b\b\b\b\b\b\b\b\b          \b\b\b\b\b\b\b\b\b\b", stdout);
    if (marker_idx1 >= pct_thresh) {
      if (pct > 10) {
        putc_unlocked('\b', stdout);
      }
      pct = ((marker_idx1 - marker_idx1_start) * 100LLU) / job_size;
      if (pct < 100) {
        printf("\b\b%" PRIuPTR "%%", pct);
        fflush(stdout);
        pct_thresh = marker_idx1_start + ((++pct) * ((uint64_t)job_size)) / 100;
      }
    }
    if (marker_idx1 == marker_idx1_end) {
      break;
    }
    marker_uidx1 = jump_forward_unset_unsafe(marker_exclude_idx1, marker_uidx1 + 1, idx1_block_size);
  }
  fputs("\b\b", stdout);
  logprint("done.\n");
  // Error exits: this block is only ever entered through the goto labels
  // inside it.
  while (0) {
  ld_report_regular_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  ld_report_regular_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  ld_report_regular_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  ld_report_regular_ret_EMPTY_SET1:
    logerrprint("Error: No valid variants specified by --ld-snp/--ld-snps/--ld-snp-list.\n");
    // deliberate fallthrough: an empty idx1 set is reported as a
    // command-line error
  ld_report_regular_ret_INVALID_CMDLINE:
    retval = RET_INVALID_CMDLINE;
    break;
  ld_report_regular_ret_INVALID_FORMAT:
    retval = RET_INVALID_FORMAT;
    break;
  ld_report_regular_ret_THREAD_CREATE_FAIL:
    retval = RET_THREAD_CREATE_FAIL;
    break;
  }
 ld_report_regular_ret_1:
  fclose_cond(infile);
  // trust parent to free memory
  return retval;
}
6215 
// Top-level driver for --r/--r2.
//
// Counts founders, publishes the shared LD settings through the g_ld_*
// globals, validates the request, allocates the haploid filters and the raw
// load buffer, finishes the output filename (".ld", optionally followed by
// ".bin", the 1-based --parallel index and ".gz"), and then hands off to
// ld_report_matrix() when a square/triangle matrix shape was requested or to
// ld_report_regular() otherwise.
//
// outname_end points at the position in outname where the extension is
// written.  Returns 0 on success (including the "fewer than two founders"
// case, which only produces a warning) or a RET_* error code.  The bigstack
// is restored to its state on entry before returning.
int32_t ld_report(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, double* set_allele_freqs, Chrom_info* chrom_info_ptr, uint32_t* marker_pos, double* marker_cms, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uint32_t parallel_idx, uint32_t parallel_tot, uintptr_t* sex_male, char* outname, char* outname_end, uint32_t hh_exists) {
  // All initialized locals are declared before the first goto so that no
  // jump crosses an initialization.
  unsigned char* stack_on_entry = g_bigstack_base;
  uint32_t flags = ldip->modifier;
  uint32_t binary_out = flags & (LD_MATRIX_BIN | LD_MATRIX_BIN4);
  uint32_t gz_out = flags & LD_REPORT_GZ;
  uintptr_t sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
  uintptr_t founder_ct = popcount_longs(founder_info, sample_ctv2 / 2);
  // number of MULTIPLEX_LD-sized founder batches, rounded up
  uintptr_t mld_batch_ct = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
  uint32_t mld_batch_ct_m1 = ((uint32_t)mld_batch_ct) - 1;
  // number of sub-units (192 founders on 64-bit builds, 48 otherwise) that
  // the final batch occupies
#ifdef __LP64__
  uint32_t mld_last_rem = (MULTIPLEX_LD / 192) - (mld_batch_ct * MULTIPLEX_LD - founder_ct) / 192;
#else
  uint32_t mld_last_rem = (MULTIPLEX_LD / 48) - (mld_batch_ct * MULTIPLEX_LD - founder_ct) / 48;
#endif
  uintptr_t words_per_variant = mld_batch_ct_m1 * (MULTIPLEX_LD / BITCT2) + mld_last_rem * (192 / BITCT2);
  uintptr_t* founder_include2 = nullptr;
  uintptr_t* founder_male_include2 = nullptr;
  // the base extension is written immediately, before any validation
  char* name_iter = memcpyl3a(outname_end, ".ld");
  int32_t retval = 0;
  uintptr_t* loadbuf;

  // Publish the settings shared with the worker and emitter routines.
  g_ld_modifier = flags;
  g_ld_is_r2 = flags & LD_R2;
  g_ld_founder_ct = founder_ct;
  g_ld_founder_ct_192_long = words_per_variant;
  g_ld_founder_ct_mld_m1 = mld_batch_ct_m1;
  g_ld_founder_ct_mld_rem = mld_last_rem;
  g_ld_marker_ct = marker_ct;
  g_ld_chrom_info_ptr = chrom_info_ptr;
  g_ld_thread_ct = g_thread_ct;
  if (flags & LD_WITH_FREQS) {
    g_ld_set_allele_freqs = set_allele_freqs;
  } else {
    g_ld_set_allele_freqs = nullptr;
  }

  // Too few founders is only a warning: skip the computation, return success.
  if (founder_ct < 2) {
    LOGERRPRINTF("Warning: Skipping --r%s since there are less than two founders.\n(--make-founders may come in handy here.)\n", g_ld_is_r2? "2" : "");
    goto ld_report_ret_1;
  }
  if (founder_ct >= 0x20000000) {
    logerrprint("Error: --r/--r2 does not support >= 2^29 samples.\n");
    retval = RET_INVALID_CMDLINE;
    goto ld_report_ret_1;
  }

  // Refuse huge, unfiltered, non-distributed jobs unless 'yes-really' was
  // given.
  if ((marker_ct > 400000) && (parallel_tot == 1) && (!(flags & LD_YES_REALLY))) {
    uint32_t unfiltered_inter_chr = (flags & LD_INTER_CHR) && (!ldip->snpstr) && (!ldip->snps_rl.name_ct) && ((!g_ld_is_r2) || (ldip->window_r2 == 0.0));
    if ((flags & LD_MATRIX_SHAPEMASK) || unfiltered_inter_chr) {
      logerrprint("Error: Gigantic (over 400k loci) --r/--r2 unfiltered, non-distributed\ncomputation.  Rerun with the 'yes-really' modifier if you are SURE you have\nenough hard drive space and want to do this.\n");
      retval = RET_INVALID_CMDLINE;
      goto ld_report_ret_1;
    }
  }

  // The load buffer is only requested when the filter allocation succeeded.
  if (alloc_collapsed_haploid_filters(founder_info, sex_male, unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, &founder_include2, &founder_male_include2) ||
      bigstack_alloc_ul(sample_ctv2, &loadbuf)) {
    retval = RET_NOMEM;
    goto ld_report_ret_1;
  }
  // clear the last two words of the load buffer
  loadbuf[sample_ctv2 - 1] = 0;
  loadbuf[sample_ctv2 - 2] = 0;
  // possible todo: throw out all monomorphic sites (and, in at least the
  // matrix case, dump a list of expelled site IDs)

  // Finish the output filename: [.bin][.<parallel index>][.gz]; the
  // delimiter and the .gz suffix apply to text output only.
  if (binary_out) {
    name_iter = memcpya(name_iter, ".bin", 4);
  } else {
    g_ld_delimiter = (flags & LD_MATRIX_SPACES)? ' ' : '\t';
  }
  if (parallel_tot > 1) {
    *name_iter = '.';
    name_iter = uint32toa(parallel_idx + 1, &(name_iter[1]));
  }
  if (gz_out && (!binary_out)) {
    name_iter = memcpyl3a(name_iter, ".gz");
  }
  *name_iter = '\0';

  if (!(flags & LD_INPHASE)) {
    g_ld_marker_allele_ptrs = nullptr;
  } else {
    // an in-phase output line has to fit within MAXLINELEN
    if (max_marker_allele_len * 4 + plink_maxsnp * 2 + get_max_chrom_slen(chrom_info_ptr) * 2 + 128 > MAXLINELEN) {
      logerrprint("Error: --r/--r2 in-phase does not support very long allele codes.\n");
      retval = RET_INVALID_CMDLINE;
      goto ld_report_ret_1;
    }
    g_ld_marker_allele_ptrs = marker_allele_ptrs;
  }

  if (!(flags & (LD_MATRIX_SQ | LD_MATRIX_SQ0 | LD_MATRIX_TRI))) {
    // The regular report takes its per-variant inputs from globals.
    g_ld_plink_maxsnp = plink_maxsnp;
    g_ld_marker_ids = marker_ids;
    g_ld_marker_pos = marker_pos;
    // cM values are only passed on when ldip->window_cm is not -1
    if (ldip->window_cm == -1) {
      g_ld_marker_cms = nullptr;
    } else {
      g_ld_marker_cms = marker_cms;
    }
    g_ld_marker_exclude = marker_exclude;
    g_ld_max_marker_id_len = max_marker_id_len;
    retval = ld_report_regular(threads, ldip, bedfile, bed_offset, unfiltered_marker_ct, marker_reverse, unfiltered_sample_ct, founder_info, parallel_idx, parallel_tot, sex_male, founder_include2, founder_male_include2, loadbuf, outname, hh_exists);
  } else {
    retval = ld_report_matrix(threads, ldip, bedfile, bed_offset, unfiltered_marker_ct, marker_exclude, marker_reverse, unfiltered_sample_ct, founder_info, parallel_idx, parallel_tot, sex_male, founder_include2, founder_male_include2, loadbuf, outname, hh_exists);
  }
 ld_report_ret_1:
  bigstack_reset(stack_on_entry);
  return retval;
}
6314 
int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uintptr_t* sex_male, char* outname, char* outname_end, uint32_t hh_exists) {
  // Implements --show-tags.  Each chromosome is scanned with a circular
  // window of founder genotypes; for every pair of variants in the window
  // where at least one member is a target, the squared correlation is compared
  // against ldip->show_tags_r2, and passing pairs are recorded symmetrically
  // in a per-window bit matrix.  A variant is flushed from the window (and
  // reported, if it is a target) once its position is more than
  // ldip->show_tags_bp below that of the next variant to be loaded.
  //
  // Targets:
  // * ldip->show_tags_fname set: variant IDs read from that file (with
  //   LD_SHOW_TAGS_MODE2, only lines whose second token is "1" count).
  //   Unrecognized IDs are counted and warned about; duplicates are an error.
  // * otherwise: every variant not masked by marker_exclude.
  //
  // Output:
  // * <outname>.tags.list, one row per target, when no target file was given
  //   or LD_SHOW_TAGS_LIST_ALL is set.
  // * <outname>.tags (targets plus everything tagging them; plain ID list, or
  //   "ID<tab>0/1" for every variant under LD_SHOW_TAGS_MODE2) when a target
  //   file was given.
  //
  // Returns 0 on success (also when the run is skipped because there are
  // fewer than two founders), a nonzero value propagated from sort_item_ids(),
  // or one of RET_NOMEM / RET_OPEN_FAIL / RET_READ_FAIL / RET_WRITE_FAIL /
  // RET_INVALID_FORMAT.  All bigstack allocations are released and both file
  // handles are closed before returning.
  //
  // Similar to ld_prune() and flipscan().
  unsigned char* bigstack_mark = g_bigstack_base;
  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctl);
  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
  uintptr_t final_mask = get_final_mask(founder_ct);
  uintptr_t marker_idx = 0;
  uintptr_t max_window_size = 1;
  uintptr_t pct = 1;
  uintptr_t pct_thresh = marker_ct / 100;
  FILE* infile = nullptr;
  FILE* outfile = nullptr;
  // final_set stays null in "all variants are targets" mode; it doubles as
  // the flag for whether the .tags file is produced.
  uintptr_t* final_set = nullptr;
  uintptr_t* founder_include2 = nullptr;
  uintptr_t* founder_male_include2 = nullptr;
  char* chrom_name_ptr = nullptr;
  // slightly relaxed threshold so pairs sitting exactly on the cutoff are not
  // lost to rounding
  double tag_thresh = ldip->show_tags_r2 * (1 - SMALL_EPSILON);
  uint32_t tags_list = (ldip->modifier & LD_SHOW_TAGS_LIST_ALL) || (!ldip->show_tags_fname);
  uint32_t twocolumn = ldip->modifier & LD_SHOW_TAGS_MODE2;
  uint32_t ignore_x = (ldip->modifier & LD_IGNORE_X) & 1;
  uint32_t window_bp = ldip->show_tags_bp;
  uint32_t target_ct = 0;
  uint32_t chrom_name_len = 0;
  int32_t retval = 0;
  char chrom_name_buf[3 + MAX_CHROM_TEXTNUM_SLEN];
  int32_t dp_result[5];
  uintptr_t founder_ct_192_long;
  uintptr_t founder_ctwd12;
  uintptr_t founder_ctwd12_rem;
  uintptr_t lshift_last;
  uintptr_t line_idx;
  uintptr_t unrecog_ct;
  uintptr_t max_window_ctal;
  uintptr_t max_window_ctl;
  uintptr_t marker_uidx;
  uintptr_t window_cidx;
  uintptr_t window_cidx2;
  uintptr_t window_cidx3;
  uintptr_t marker_uidx2;
  uintptr_t ulii;
  uintptr_t* targets;
  uintptr_t* loadbuf_raw;
  uintptr_t* geno;
  uintptr_t* geno_masks;
  uintptr_t* geno_fixed_vec_ptr;
  uintptr_t* mask_fixed_vec_ptr;
  uintptr_t* geno_var_vec_ptr;
  uintptr_t* mask_var_vec_ptr;
  uintptr_t* cur_targets;
  uintptr_t* tag_matrix;
  uintptr_t* tag_matrix_row_ptr;
  char* sorted_marker_ids;
  char* bufptr;
  char* bufptr2;
  uint32_t* marker_id_map;
  uint32_t* window_uidxs;
  uint32_t* window_cidx_starts;
  uint32_t* missing_cts;
  double non_missing_ctd;
  double cov12;
  double dxx;
  double dyy;
  uint32_t founder_ct_mld_m1;
  uint32_t founder_ct_mld_rem;
  uint32_t chrom_fo_idx;
  uint32_t chrom_idx;
  uint32_t chrom_end;
  uint32_t chrom_marker_ct;
  uint32_t chrom_marker_idx;
  uint32_t is_haploid;
  uint32_t is_x;
  uint32_t is_y;
  uint32_t is_target;
  uint32_t marker_pos_thresh;
  uint32_t fixed_missing_ct;
  uint32_t fixed_non_missing_ct;
  uint32_t non_missing_ct;
  uint32_t slen;
  uint32_t tag_ct;
  uint32_t marker_uidx3;
  uint32_t min_bp;
  uint32_t max_bp;
  uint32_t cur_bp;
  uint32_t uii;
  int32_t ii;
  if (founder_ct < 2) {
    logerrprint("Warning: Skipping --show-tags since there are less than two founders.\n(--make-founders may come in handy here.)\n");
    goto show_tags_ret_1;
  }
  if (bigstack_alloc_ul(unfiltered_marker_ctl, &targets) ||
      bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw)) {
    goto show_tags_ret_NOMEM;
  }
  loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
  if (ldip->show_tags_fname) {
    // ---- load the target list ----
    fill_ulong_zero(unfiltered_marker_ctl, targets);
    retval = sort_item_ids(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref, &sorted_marker_ids, &marker_id_map);
    if (retval) {
      goto show_tags_ret_1;
    }
    if (fopen_checked(ldip->show_tags_fname, "r", &infile)) {
      goto show_tags_ret_OPEN_FAIL;
    }
    // sentinel: fgets() only overwrites this byte with '\0' when a line fills
    // the whole buffer
    g_textbuf[MAXLINELEN - 1] = ' ';
    line_idx = 0;
    unrecog_ct = 0;
    while (fgets(g_textbuf, MAXLINELEN, infile)) {
      line_idx++;
      if (!g_textbuf[MAXLINELEN - 1]) {
        LOGERRPRINTF("Error: Line %" PRIuPTR " of --show-tags file is pathologically long.\n", line_idx);
        goto show_tags_ret_INVALID_FORMAT;
      }
      bufptr = skip_initial_spaces(g_textbuf);
      if (is_eoln_kns(*bufptr)) {
        continue;
      }
      slen = strlen_se(bufptr);
      if (twocolumn) {
        bufptr2 = skip_initial_spaces(&(bufptr[slen]));
        // bugfix: the result of skip_initial_spaces() is a position in the
        // line buffer (it is dereferenced without a null test above), so the
        // former "!bufptr2" test could never detect a missing second column.
        // Check for end-of-line instead.
        if (is_eoln_kns(*bufptr2)) {
          LOGERRPRINTF("Error: Line %" PRIuPTR " of --show-tags file has fewer tokens than expected.\n", line_idx);
          goto show_tags_ret_INVALID_FORMAT;
        }
        // only rows flagged with a lone "1" designate targets
        if ((*bufptr2 != '1') || (!is_space_or_eoln(bufptr2[1]))) {
          continue;
        }
      }
      ii = bsearch_str(bufptr, slen, sorted_marker_ids, max_marker_id_len, marker_ct);
      if (ii == -1) {
        unrecog_ct++;
        continue;
      }
      marker_uidx = marker_id_map[(uint32_t)ii];
      if (IS_SET(targets, marker_uidx)) {
        bufptr[slen] = '\0';
        LOGERRPRINTF("Error: Duplicate variant ID '%s' in --show-tags file.\n", bufptr);
        goto show_tags_ret_INVALID_FORMAT;
      }
      SET_BIT(marker_uidx, targets);
    }
    if (fclose_null(&infile)) {
      goto show_tags_ret_READ_FAIL;
    }
    bigstack_reset((unsigned char*)marker_id_map);
    target_ct = popcount_longs(targets, unfiltered_marker_ctl);
    if (!target_ct) {
      logerrprint("Error: No recognized variant IDs in --show-tags file.\n");
      goto show_tags_ret_INVALID_FORMAT;
    }
    if (bigstack_alloc_ul(unfiltered_marker_ctl, &final_set)) {
      goto show_tags_ret_NOMEM;
    }
    // final_set starts as the targets themselves; tags are OR'd in later
    memcpy(final_set, targets, unfiltered_marker_ctl * sizeof(intptr_t));
    LOGPRINTF("--show-tags: %u target variant%s loaded.\n", target_ct, (target_ct == 1)? "" : "s");
    if (unrecog_ct) {
      LOGERRPRINTF("Warning: %" PRIuPTR " unrecognized variant ID%s in --show-tags file.\n", unrecog_ct, (unrecog_ct == 1)? "" : "s");
    }
  } else {
    // no list given: every included variant is a target
    bitarr_invert_copy(marker_exclude, unfiltered_marker_ct, targets);
  }
  // force founder_male_include2 allocation
  if (alloc_collapsed_haploid_filters(founder_info, sex_male, unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, &founder_include2, &founder_male_include2)) {
    goto show_tags_ret_NOMEM;
  }
  // ---- sizing of the multiplexed genotype vectors ----
  founder_ct_mld_m1 = (founder_ct - 1) / MULTIPLEX_LD;
  ulii = founder_ct_mld_m1 + 1;
#ifdef __LP64__
  founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (ulii * MULTIPLEX_LD - founder_ct) / 192;
#else
  founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (ulii * MULTIPLEX_LD - founder_ct) / 48;
#endif
  founder_ct_192_long = founder_ct_mld_m1 * (MULTIPLEX_LD / BITCT2) + founder_ct_mld_rem * (192 / BITCT2);
  uii = founder_ct / BITCT2;
  founder_ctwd12 = uii / 12;
  founder_ctwd12_rem = uii - (12 * founder_ctwd12);
  lshift_last = 2 * ((0x7fffffc0 - founder_ct) % BITCT2);
  // the window buffers are sized for the densest stretch on any chromosome
  for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
    max_window_size = chrom_window_max(marker_pos, marker_exclude, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], 0x7fffffff, window_bp * 2, max_window_size);
  }
  max_window_ctl = BITCT_TO_WORDCT(max_window_size);
  max_window_ctal = max_window_ctl * BITCT;
  if (bigstack_alloc_ui(max_window_size, &window_uidxs) ||
      bigstack_alloc_ui(max_window_size, &window_cidx_starts) ||
      bigstack_alloc_ui(max_window_size, &missing_cts) ||
      bigstack_alloc_ul(max_window_size * founder_ct_192_long, &geno) ||
      bigstack_alloc_ul(max_window_size * founder_ct_192_long, &geno_masks) ||
      bigstack_alloc_ul(max_window_ctl, &cur_targets) ||
      bigstack_alloc_ul(max_window_size * max_window_ctl, &tag_matrix)) {
    goto show_tags_ret_NOMEM;
  }
  // zero the trailing words of every genotype/mask slot once up front
  uii = 2 + founder_ct_192_long - founder_ctl * 2;
  for (ulii = 1; ulii <= max_window_size; ulii++) {
    fill_ulong_zero(uii, &(geno[ulii * founder_ct_192_long - uii]));
    fill_ulong_zero(uii, &(geno_masks[ulii * founder_ct_192_long - uii]));
  }

  if (tags_list) {
    memcpy(outname_end, ".tags.list", 11);
    // bugfix: failing to open the report is an open failure, matching the
    // handling of the .tags file further down (was reported as a write
    // failure).
    if (fopen_checked(outname, "w", &outfile)) {
      goto show_tags_ret_OPEN_FAIL;
    }
    // build the header format first so the SNP column is plink_maxsnp wide
    sprintf(g_textbuf, "%%%us  CHR         BP NTAG       LEFT      RIGHT   KBSPAN TAGS\n", plink_maxsnp);
    fprintf(outfile, g_textbuf, "SNP");
  }
  printf("--show-tags%s: 0%%", final_set? "" : " all");
  for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
    chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
    chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
    marker_uidx = next_unset(marker_exclude, chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx], chrom_end);
    chrom_marker_ct = chrom_end - marker_uidx - popcount_bit_idx(marker_exclude, marker_uidx, chrom_end);
    if (chrom_marker_ct < 2) {
      // no pair to evaluate; just account for the variant in the progress
      // counter
      marker_idx += chrom_marker_ct;
      continue;
    }
    chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, chrom_idx, &chrom_name_len, chrom_name_buf);
    is_haploid = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
    is_x = (chrom_idx == ((uint32_t)chrom_info_ptr->xymt_codes[X_OFFSET]));
    is_y = (chrom_idx == ((uint32_t)chrom_info_ptr->xymt_codes[Y_OFFSET]));
    if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto show_tags_ret_READ_FAIL;
    }
    chrom_marker_idx = 0;
    // window_cidx: slot of the variant being loaded; window_cidx2: oldest
    // slot still in the window.  Both wrap at max_window_size.
    window_cidx = max_window_size - 1;
    window_cidx2 = 0;
    do {
      if (++window_cidx == max_window_size) {
        window_cidx = 0;
      }
      window_uidxs[window_cidx] = marker_uidx;
      is_target = IS_SET(targets, marker_uidx);
      if (is_target) {
        SET_BIT(window_cidx, cur_targets);
      } else {
        CLEAR_BIT(window_cidx, cur_targets);
      }

      // circular index of beginning of window starting at current marker
      window_cidx_starts[window_cidx] = window_cidx2;
      geno_fixed_vec_ptr = &(geno[window_cidx * founder_ct_192_long]);
      mask_fixed_vec_ptr = &(geno_masks[window_cidx * founder_ct_192_long]);
      fill_ulong_zero(max_window_ctl, &(tag_matrix[window_cidx * max_window_ctl]));
      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, geno_fixed_vec_ptr)) {
        goto show_tags_ret_READ_FAIL;
      }
      if (is_haploid && hh_exists) {
        // NOTE(review): this patches loadbuf_raw, but the founder-collapsed
        // genotypes consumed by ld_process_load2() below are in
        // geno_fixed_vec_ptr.  Confirm whether the fix was meant to be
        // applied to geno_fixed_vec_ptr instead.
        haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)loadbuf_raw);
      }
      ld_process_load2(geno_fixed_vec_ptr, mask_fixed_vec_ptr, &fixed_missing_ct, founder_ct, is_x && (!ignore_x), founder_male_include2);
      fixed_non_missing_ct = founder_ct - fixed_missing_ct;
      missing_cts[window_cidx] = fixed_missing_ct;
      // compare the new variant against everything already in the window
      window_cidx3 = window_cidx2;
      while (window_cidx3 != window_cidx) {
        if (is_target || IS_SET(cur_targets, window_cidx3)) {
          // don't bother computing r^2 if no target variant involved
          geno_var_vec_ptr = &(geno[window_cidx3 * founder_ct_192_long]);
          mask_var_vec_ptr = &(geno_masks[window_cidx3 * founder_ct_192_long]);
          non_missing_ct = fixed_non_missing_ct - missing_cts[window_cidx3];
          if (fixed_missing_ct && missing_cts[window_cidx3]) {
            // samples missing in both were subtracted twice; add them back
            non_missing_ct += ld_missing_ct_intersect(mask_var_vec_ptr, mask_fixed_vec_ptr, founder_ctwd12, founder_ctwd12_rem, lshift_last);
          }
          if (non_missing_ct) {
            dp_result[0] = founder_ct;
            dp_result[1] = -((int32_t)fixed_non_missing_ct);
            dp_result[2] = missing_cts[window_cidx3] - founder_ct;
            dp_result[3] = dp_result[1];
            dp_result[4] = dp_result[2];
            ld_dot_prod(geno_var_vec_ptr, geno_fixed_vec_ptr, mask_var_vec_ptr, mask_fixed_vec_ptr, dp_result, founder_ct_mld_m1, founder_ct_mld_rem);
            non_missing_ctd = (double)((int32_t)non_missing_ct);
            dxx = dp_result[1];
            dyy = dp_result[2];
            cov12 = dp_result[0] * non_missing_ctd - dxx * dyy;
            dxx = (dp_result[3] * non_missing_ctd + dxx * dxx) * (dp_result[4] * non_missing_ctd + dyy * dyy);
            // cov^2 > var1 * var2 * threshold, written without a division
            if (cov12 * cov12 > dxx * tag_thresh) {
              set_bit_ul(window_cidx * max_window_ctal + window_cidx3, tag_matrix);
              set_bit_ul(window_cidx3 * max_window_ctal + window_cidx, tag_matrix);
            }
          }
        }
        if (++window_cidx3 == max_window_size) {
          window_cidx3 = 0;
        }
      }
      if (++chrom_marker_idx < chrom_marker_ct) {
        marker_uidx++;
        if (IS_SET(marker_exclude, marker_uidx)) {
          marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
          if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
            goto show_tags_ret_READ_FAIL;
          }
        }
        // anything positioned below this can no longer pair with the next
        // variant, so it is ready to be reported
        marker_pos_thresh = marker_pos[marker_uidx];
        if (marker_pos_thresh < window_bp) {
          marker_pos_thresh = 0;
        } else {
          marker_pos_thresh -= window_bp;
        }
      } else {
        // close out the chromosome
        marker_pos_thresh = 0x80000000U;
      }
      marker_uidx2 = window_uidxs[window_cidx2];
      while (marker_pos[marker_uidx2] < marker_pos_thresh) {
        if (IS_SET(cur_targets, window_cidx2)) {
          // bugfix: tag_matrix_row_ptr is not always 16-byte aligned.
          tag_ct = popcount_longs_nzbase(tag_matrix, window_cidx2 * max_window_ctl, (window_cidx2 + 1) * max_window_ctl);
          tag_matrix_row_ptr = &(tag_matrix[window_cidx2 * max_window_ctl]);
          min_bp = marker_pos[marker_uidx2];
          max_bp = marker_pos[marker_uidx2];
          // first pass over the tags: update final_set and the bp span
          window_cidx3 = window_cidx_starts[window_cidx2];
          for (uii = 0; uii < tag_ct; uii++, window_cidx3++) {
            next_set_ul_ck(tag_matrix_row_ptr, max_window_size, &window_cidx3);
            if (window_cidx3 == max_window_size) {
              // wrapped around the end of the circular window
              window_cidx3 = next_set_unsafe(tag_matrix_row_ptr, 0);
            }
            marker_uidx3 = window_uidxs[window_cidx3];
            if (final_set) {
              SET_BIT(marker_uidx3, final_set);
            }
            if (tags_list) {
              cur_bp = marker_pos[marker_uidx3];
              if (cur_bp < min_bp) {
                min_bp = cur_bp;
              } else if (cur_bp > max_bp) {
                max_bp = cur_bp;
              }
            }
          }
          if (tags_list) {
            bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), g_textbuf);
            *bufptr++ = ' ';
            bufptr = memcpyax(bufptr, chrom_name_ptr, chrom_name_len, ' ');
            bufptr = uint32toa_w10x(marker_pos[marker_uidx2], ' ', bufptr);
            bufptr = uint32toa_w4x(tag_ct, ' ', bufptr);
            bufptr = uint32toa_w10x(min_bp, ' ', bufptr);
            bufptr = uint32toa_w10x(max_bp, ' ', bufptr);
            bufptr = width_force(8, bufptr, dtoa_g(((int32_t)(max_bp - min_bp + 1)) * 0.001, bufptr));
            *bufptr++ = ' ';
            if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
              goto show_tags_ret_WRITE_FAIL;
            }
            // second pass over the tags: emit the '|'-separated ID list
            window_cidx3 = window_cidx_starts[window_cidx2];
            for (uii = 0; uii < tag_ct; uii++, window_cidx3++) {
              next_set_ul_ck(tag_matrix_row_ptr, max_window_size, &window_cidx3);
              if (window_cidx3 == max_window_size) {
                window_cidx3 = next_set_unsafe(tag_matrix_row_ptr, 0);
              }
              if (uii) {
                putc_unlocked('|', outfile);
              }
              fputs(&(marker_ids[window_uidxs[window_cidx3] * max_marker_id_len]), outfile);
            }
            if (!tag_ct) {
              fputs("NONE", outfile);
            }
            putc_unlocked('\n', outfile);
          }
        }
        if (++marker_idx >= pct_thresh) {
          // erase the extra digit of a two-digit percentage
          if (pct > 10) {
            putc_unlocked('\b', stdout);
          }
          pct = (marker_idx * 100LLU) / marker_ct;
          if (pct < 100) {
            printf("\b\b%" PRIuPTR "%%", pct);
            fflush(stdout);
            pct_thresh = ((++pct) * ((uint64_t)marker_ct)) / 100;
          }
        }
        if (window_cidx2 == window_cidx) {
          // window is now empty (only happens when closing out a chromosome)
          if (++window_cidx2 == max_window_size) {
            window_cidx2 = 0;
          }
          break;
        }
        if (++window_cidx2 == max_window_size) {
          window_cidx2 = 0;
        }
        marker_uidx2 = window_uidxs[window_cidx2];
      }
    } while (chrom_marker_idx < chrom_marker_ct);
  }
  putc_unlocked('\r', stdout);
  if (tags_list) {
    if (fclose_null(&outfile)) {
      goto show_tags_ret_WRITE_FAIL;
    }
    if (!final_set) {
      LOGPRINTFWW("--show-tags all: Report written to %s .\n", outname);
    }
  }
  if (final_set) {
    memcpy(outname_end, ".tags", 6);
    if (fopen_checked(outname, "w", &outfile)) {
      goto show_tags_ret_OPEN_FAIL;
    }
    if (!twocolumn) {
      // one ID per line, targets and tags only
      marker_uidx = next_set(final_set, 0, unfiltered_marker_ct);
      while (marker_uidx < unfiltered_marker_ct) {
        fputs(&(marker_ids[marker_uidx * max_marker_id_len]), outfile);
        putc_unlocked('\n', outfile);
        marker_uidx++;
        next_set_ul_ck(final_set, unfiltered_marker_ct, &marker_uidx);
      }
    } else {
      // every included variant, followed by a 0/1 membership flag
      for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
        next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
        fputs(&(marker_ids[marker_uidx * max_marker_id_len]), outfile);
        putc_unlocked('\t', outfile);
        putc_unlocked('0' + IS_SET(final_set, marker_uidx), outfile);
        putc_unlocked('\n', outfile);
      }
    }
    if (fclose_null(&outfile)) {
      goto show_tags_ret_WRITE_FAIL;
    }
    // number of non-target variants that were pulled in as tags
    uii = popcount_longs(final_set, unfiltered_marker_ctl) - target_ct;
    if (tags_list) {
      LOGPRINTFWW("--show-tags: Main report written to %s.list , and simple tag ID list (%u tag%s added) written to %s .\n", outname, uii, (uii == 1)? "" : "s", outname);
    } else {
      LOGPRINTFWW("--show-tags: Simple tag ID list (%u tag%s added) written to %s .\n", uii, (uii == 1)? "" : "s", outname);
    }
  }
  // error labels live in a never-entered loop so the success path falls
  // straight through to the common cleanup
  while (0) {
  show_tags_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  show_tags_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  show_tags_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  show_tags_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
  show_tags_ret_INVALID_FORMAT:
    retval = RET_INVALID_FORMAT;
    break;
  }
 show_tags_ret_1:
  bigstack_reset(bigstack_mark);
  fclose_cond(infile);
  fclose_cond(outfile);
  return retval;
}
6764 
calc_lnlike_quantile(double known11,double known12,double known21,double known22,double unknown_dh,double freqx1,double freq1x,double freq2x,double freq11_expected,double denom,int32_t quantile)6765 double calc_lnlike_quantile(double known11, double known12, double known21, double known22, double unknown_dh, double freqx1, double freq1x, double freq2x, double freq11_expected, double denom, int32_t quantile) {
6766   // almost identical to calc_lnlike, but we can skip the equal-to-zero checks
6767   // when quantile isn't 100
6768   double tmp11 = quantile * denom + freq11_expected;
6769   double tmp12 = freq1x - tmp11;
6770   double tmp21 = freqx1 - tmp11;
6771   double tmp22 = freq2x - tmp21;
6772   if (quantile == 100) {
6773     // One of these values will be ~zero, and we want to ensure its logarithm
6774     // is treated as a very negative number instead of nan.  May as well do it
6775     // the same way as Haploview.
6776     if (tmp11 < 1e-10) {
6777       tmp11 = 1e-10;
6778     }
6779     if (tmp12 < 1e-10) {
6780       tmp12 = 1e-10;
6781     }
6782     if (tmp21 < 1e-10) {
6783       tmp21 = 1e-10;
6784     }
6785     if (tmp22 < 1e-10) {
6786       tmp22 = 1e-10;
6787     }
6788   }
6789   return known11 * log(tmp11) + known12 * log(tmp12) + known21 * log(tmp21) + known22 * log(tmp22) + unknown_dh * log(tmp11 * tmp22 + tmp12 * tmp21);
6790 }
6791 
uint32_t haploview_blocks_classify(uint32_t* counts, uint32_t lowci_max, uint32_t lowci_min, uint32_t recomb_highci, uint32_t strong_highci, uint32_t strong_lowci, uint32_t strong_lowci_outer, uint32_t is_x, double recomb_fast_ln_thresh) {
  // Classifies the confidence interval of D' for one pair of variants, using
  // a likelihood evaluated on the 101-point grid D' = 0.00, 0.01, ..., 1.00
  // (every term is expressed relative to the likelihood at the grid point
  // nearest the EM estimate, so that entry is exactly 1).
  //
  // Inputs:
  //   counts          genotype-pair counts.  Entries 0..8 are always read
  //                   (counts[4] is handed to em_phase_hethet() as the
  //                   phase-ambiguous cell); entries 9, 11, 12 and 14 are
  //                   subtracted from the known-haplotype totals when is_x is
  //                   nonzero.
  //   lowci_max,      inclusive quantile range for which right-tail partial
  //   lowci_min       sums are saved in right_sum[].  right_sum[] has 83
  //                   entries and is indexed by quantile, so this assumes
  //                   lowci_max <= 82 -- TODO confirm at the call site.
  //   recomb_highci,  quantile (percent) thresholds.  The 0.98 / 0.90 / 0.81 /
  //   strong_highci,  0.71 / 0.70 figures quoted in the comments below match
  //   strong_lowci,   the defaults assigned in ld_epi_init() (89, 97, 72, 71);
  //   strong_lowci_outer
  //                   presumably those ldip fields are what the caller passes.
  //   recomb_fast_ln_thresh
  //                   log relative-likelihood cutoff for the quick exit at the
  //                   start of branch 2.
  //
  // Return codes, as read from the comparisons in this function (confirm
  // against the consumer in haploview_blocks()):
  //   0  less than 5% of the likelihood lies at or above recomb_highci
  //   1  the mass at or above recomb_highci reaches 5%, but the mass above
  //      strong_highci does not; also returned when em_phase_hethet() reports
  //      failure or the estimate sits at quantile 0
  //   2  upper bound qualifies as strong, lowCI below lowci_min
  //   3  ... lowCI in [lowci_min, strong_lowci_outer)
  //   4  ... lowCI in [strong_lowci_outer, strong_lowci)
  //   5  ... lowCI in [strong_lowci, lowci_max)
  //   6  ... lowCI at or above lowci_max
  //
  // See comments in the middle of haploview_blocks().  The key insight is that
  // we only need to classify the D' confidence intervals into a few types, and
  // this almost never requires evaluation of all 101 log likelihoods.

  // Note that lowCI and highCI are *one-sided* 95% confidence bounds, i.e.
  // together, they form a 90% confidence interval.
  double known11 = (double)(2 * counts[0] + counts[1] + counts[3]);
  double known12 = (double)(2 * counts[2] + counts[1] + counts[5]);
  double known21 = (double)(2 * counts[6] + counts[3] + counts[7]);
  double known22 = (double)(2 * counts[8] + counts[5] + counts[7]);
  double total_prob = 0.0;
  double lnsurf_highstrong_thresh = 0.0;
  uint32_t onside_sol_ct = 1;
  // right_sum[q] = sum of relative likelihoods f(q/100)..f(1.00); only
  // written for lowci_min <= q <= lowci_max
  double right_sum[83];
  double freq1x;
  double freq2x;
  double freqx1;
  double freqx2;
  double freq11_expected;
  double unknown_dh;
  double denom;
  double lnlike1;
  double lnsurf_highindiff_thresh;
  double dxx;
  double dyy;
  double dzz;
  uint32_t quantile;
  uint32_t center;
  if (is_x) {
    known11 -= (double)((int32_t)counts[9]);
    known12 -= (double)((int32_t)counts[11]);
    known21 -= (double)((int32_t)counts[12]);
    known22 -= (double)((int32_t)counts[14]);
  }
  if (em_phase_hethet(known11, known12, known21, known22, counts[4], &freq1x, &freq2x, &freqx1, &freqx2, &dzz, &onside_sol_ct)) {
    return 1;
  }
  freq11_expected = freqx1 * freq1x;
  // dxx = D (estimated minus expected frequency of the (1,1) haplotype)
  dxx = dzz - freq11_expected;
  if (dxx < 0.0) {
    // D < 0, flip (1,1)<->(1,2) and (2,1)<->(2,2) to make D positive
    dyy = known11;
    known11 = known12;
    known12 = dyy;
    dyy = known21;
    known21 = known22;
    known22 = dyy;
    freq11_expected = freqx2 * freq1x;
    dyy = freqx1;
    freqx1 = freqx2;
    freqx2 = dyy;
    dxx = -dxx;
  }
  // dyy = largest D attainable given the marginals, so dxx / dyy is D'
  dyy = MINV(freqx1 * freq2x, freqx2 * freq1x);
  // this will always be in a term with a 0.01 multiplier from now on, so may
  // as well premultiply.
  denom = 0.01 * dyy;
  unknown_dh = (double)((int32_t)counts[4]);

  // force this to an actual likelihood array entry, so we know for sure
  // total_prob >= 1.0 and can use that inequality for both early exit and
  // determining the "futility threshold" (terms smaller than 2^{-53} / 19 are
  // too small to matter).
  center = (int32_t)(((dxx / dyy) * 100) + 0.5);

  // reference log-likelihood; everything below is exp(lnlike - lnlike1)
  lnlike1 = calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, center);

  // Previously assumed log likelihood was always concave, and used geometric
  // series bounds... then I realized this was NOT a safe assumption to make.
  // See e.g. rs9435793 and rs7531410 in 1000 Genomes phase 1.
  // So, instead, we only use an aggressive approach when onside_sol_ct == 1
  // (fortunately, that is almost always the case).
  if (onside_sol_ct == 1) {
    // It's not actually necessary to keep the entire likelihood array in
    // memory.  This is similar to the HWE and Fisher's exact test
    // calculations: we can get away with tracking a few partial sums, and
    // exploit unimodality, fixed direction on both sides of the center,
    // knowledge of the center's location, and the fact that we only need to
    // classify the CI rather than fully evaluate it.
    //
    // Specifically, we need to determine the following:
    // 1. Is highCI >= 0.98?  Or < 0.90?
    // 2. If highCI >= 0.98, is lowCI >= 0.81?  In [0.71, 0.81)?  Equal to
    //    0.70?  In [0.51, 0.70)?  In [0.01, 0.51)?  Or < 0.01?
    //    (Crucially, if highCI < 0.98, we don't actually need to determine
    //    lowCI at all.)
    // To make this classification with as few relative likelihood evaluations
    // as possible (5 logs, an exp call, 8 multiplies, 9 adds... that's kinda
    // heavy for an inner loop operation), we distinguish the following cases:
    // a. D' >= 0.41.  We first try to quickly rule out highCI >= 0.98 by
    //    inspection of f(0.97).  Then,
    //    * If it's below the futility threshold, jump to case (b).
    //    * Otherwise, sum f(0.98)..f(1.00), and then sum other likelihoods
    //      from f(0.96) on down.
    // b. D' < 0.41.  highCI >= 0.98 is impossible since f(0.41) >= f(0.42) >=
    //    ...; goal is to quickly establish highCI < 0.90.  A large fraction of
    //    the time, this can be accomplished simply by inspecting f(0.89); if
    //    it's less than 1/220, we're done because we know there's a 1
    //    somewhere in the array, and the sum of the likelihoods between
    //    f(0.89) and whatever that 1 entry is is bounded above by 12 * (1/220)
    //    due to fixed direction.  Otherwise, we sum from the top down.
    // This should be good for a ~10x speedup on the larger datasets where it's
    // most wanted.
    if (100 - center < 20 * (100 - strong_highci)) {
      dxx = calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, strong_highci) - lnlike1;
      // ln(2^{-53} / 19) is just under -39.6812
      if ((center > strong_highci) || (dxx > -39.6812)) {
	total_prob = exp(dxx);
	for (quantile = 100; quantile > strong_highci; quantile--) {
	  total_prob += exp(calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) - lnlike1);
	}
	// the array contains a 1, so the total is >= 1; a right tail no larger
	// than 1/19 can therefore never reach 5% of the total
	if (total_prob > (1.0 / 19.0)) {
	  // branch 1: highCI might be >= 0.98
	  // (thresholds are 20x a right-tail sum: once the running total
	  // reaches one, that tail is known to be <= 5% of the whole)
	  lnsurf_highstrong_thresh = total_prob * 20;
	  for (quantile = strong_highci - 1; quantile >= recomb_highci; quantile--) {
	    total_prob += exp(calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) - lnlike1);
	  }
	  lnsurf_highindiff_thresh = total_prob * 20;
	  while (1) {
	    dxx = exp(calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) - lnlike1);
	    total_prob += dxx;
	    // see comments on branch 2.  this is more complicated because we
	    // still have work to do after resolving whether highCI >= 0.98,
	    // but the reasoning is similar.
	    if (total_prob >= lnsurf_highstrong_thresh) {
	      if (quantile >= center) {
	        goto haploview_blocks_classify_no_highstrong_1;
	      }
	      goto haploview_blocks_classify_no_highstrong_2;
	    }
	    if ((quantile <= lowci_max) && (quantile >= lowci_min)) {
	      // We actually only need the [52..100], [71..100], [72..100], and
	      // [82..100] right sums, but saving a few extra values is
	      // probably more efficient than making this if-statement more
	      // complicated.  The array is indexed directly by quantile (an
	      // earlier version of this comment described a [99 - quantile]
	      // layout, which is not what the code does).
	      right_sum[quantile] = total_prob;
	    }
	    // upper bound on everything still unsummed to the left: at most
	    // `quantile` further terms, each no larger than the current one
	    dxx *= ((int32_t)quantile);
	    if (total_prob + dxx < lnsurf_highstrong_thresh) {
	      while (1) {
		// Now we want to bound lowCI, optimizing for being able to
		// quickly establish lowCI >= 0.71.
		if (dxx * 19 < total_prob) {
		  // less than 5% remaining on left tail
		  if (quantile >= lowci_max) {
		    return 6;
		  }
		  // fill in the remaining saved right sums down to lowci_min
		  while (quantile > lowci_min) {
		    quantile--;
		    total_prob += exp(calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) - lnlike1);
		    if (quantile <= lowci_max) {
		      right_sum[quantile] = total_prob;
		    }
		  }
		  // once the total reaches this, the tail from lowci_min up is
		  // at most 95% of it
		  dyy = right_sum[lowci_min] * (20.0 / 19.0);
		  while (total_prob < dyy) {
		    if ((!quantile) || (dxx <= RECIP_2_53)) {
		      // sum is final (or further terms are negligible):
		      // compare 95% of the total against the saved right sums
		      total_prob *= 0.95;
		      if (total_prob >= right_sum[strong_lowci_outer]) {
			// lowCI < 0.70
			// -> f(0.00) + f(0.01) + ... + f(0.70) > 0.05 * total
			return 3;
		      } else if (total_prob < right_sum[lowci_max]) {
			return 6;
		      } else if ((lowci_max > strong_lowci) && (total_prob < right_sum[strong_lowci])) {
			return 5;
		      }
		      return 4;
		    }
		    quantile--;
		    dxx = exp(calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) - lnlike1);
		    total_prob += dxx;
		  }
		  return 2;
		}
		quantile--;
		dxx = exp(calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) - lnlike1);
		total_prob += dxx;
		if ((quantile <= lowci_max) && (quantile >= lowci_min)) {
		  right_sum[quantile] = total_prob;
		}
		dxx *= ((int32_t)quantile);
	      }
	    }
	    quantile--;
	  }
	}
      }
      // f(strong_highci)..f(1.00) were ruled out above; resume just below
      quantile = strong_highci - 1;
    } else {
      quantile = 100;
    }
    // branch 2: highCI guaranteed less than 0.98.  If D' <= 0.875, try to
    // quickly establish highCI < 0.90.
    dxx = calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, recomb_highci) - lnlike1;
    if ((center < recomb_highci) && (dxx < recomb_fast_ln_thresh)) {
      return 0;
    }
    // okay, we'll sum the whole right tail.  May as well sum from the outside
    // in here for a bit more numerical stability, instead of adding exp(dxx)
    // first.
    do {
      total_prob += exp(calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) - lnlike1);
    } while (--quantile > recomb_highci);
    total_prob += exp(dxx);
    lnsurf_highindiff_thresh = total_prob * 20;
    // also entered from branch 1 (with lnsurf_highindiff_thresh already set
    // there) once highCI >= strong_highci has been ruled out
  haploview_blocks_classify_no_highstrong_1:
    quantile--;
    if (center < recomb_highci) {
      // if we know there's a 1.0 ahead in the likelihood array, may as well
      // take advantage of that
      lnsurf_highstrong_thresh = lnsurf_highindiff_thresh - 1.0;
      while (quantile > center) {
	total_prob += exp(calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) - lnlike1);
	if (total_prob >= lnsurf_highstrong_thresh) {
	  return 0;
	}
	quantile--;
      }
      if (!center) {
	return 1;
      }
      // the center entry is exactly 1 by construction; add it without
      // another log/exp evaluation
      total_prob += 1;
      quantile--;
    }
    // likelihoods are now declining, try to exploit that to exit early
    // (it's okay if the first likelihood does not represent a decline)
    while (1) {
      dxx = exp(calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) - lnlike1);
      total_prob += dxx;
      // branch 1 jumps here with total_prob, dxx and quantile already
      // current, skipping the evaluation above
    haploview_blocks_classify_no_highstrong_2:
      if (total_prob >= lnsurf_highindiff_thresh) {
	return 0;
      }
      if (total_prob + ((int32_t)quantile) * dxx < lnsurf_highindiff_thresh) {
	// guaranteed to catch quantile == 0
	return 1;
      }
      quantile--;
    }
  }
  // onside_sol_ct != 1: the shortcuts above are not used; sum the
  // likelihoods from 1.00 downward and classify from the stored right sums
  for (quantile = 100; quantile >= recomb_highci; quantile--) {
    total_prob += exp(calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) - lnlike1);
    if (quantile == strong_highci) {
      lnsurf_highstrong_thresh = total_prob * 20;
    }
  }
  if (total_prob < (1.0 / 19.0)) {
    return 0;
  }
  lnsurf_highindiff_thresh = total_prob * 20;
  while (1) {
    total_prob += exp(calc_lnlike_quantile(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) - lnlike1);
    if (total_prob >= lnsurf_highindiff_thresh) {
      return 0;
    }
    if (quantile <= lowci_max) {
      if (quantile >= lowci_min) {
        right_sum[quantile] = total_prob;
      } else if (!quantile) {
	break;
      }
    }
    quantile--;
  }
  // total_prob now covers all 101 grid points
  if (total_prob >= lnsurf_highstrong_thresh) {
    return 1;
  }
  // 95% of the total is compared against each stored right sum to locate
  // lowCI
  total_prob *= 0.95;
  if (total_prob < right_sum[strong_lowci]) {
    if ((lowci_max > strong_lowci) && (total_prob >= right_sum[lowci_max])) {
      return 5;
    }
    return 6;
  }
  if (total_prob >= right_sum[strong_lowci_outer]) {
    if ((lowci_min < strong_lowci_outer) && (total_prob >= right_sum[lowci_min])) {
      return 2;
    }
    return 3;
  }
  return 4;
}
7080 
// Haploview-style haplotype block estimation (--blocks).
//
// Sample set: founders, intersected with pheno_nm unless the
// LD_BLOCKS_NO_PHENO_REQ modifier is set.  Variant set: marker_exclude_orig,
// plus everything failing the ldip->blocks_min_maf filter (checked against
// set_allele_freqs[]).  Neither input bitfield is modified; private copies are
// made on the bigstack.
//
// Output: block membership is written to <outname>.blocks and per-block
// details to <outname>.blocks.det.  outname_end must point at the position in
// outname where the extension should be written; on return from the main
// path, outname ends in ".blocks".
//
// Returns 0 on success (this includes the "skipping --blocks" warning cases),
// or RET_NOMEM / RET_OPEN_FAIL / RET_READ_FAIL / RET_WRITE_FAIL (and
// RET_INVALID_CMDLINE on non-__LP64__ builds when a window exceeds 65536
// variants).  All bigstack allocations made here are released, and both
// output files are closed, before returning.
int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uintptr_t* pheno_nm, uintptr_t* sex_male, char* outname, char* outname_end, uint32_t hh_exists) {
  // See Plink::mkBlks() in blox.cpp (which is, in turn, a port of doGabriel()
  // in FindBlocks.java and computeDPrime() in HaploData.java from Haploview).
  // No unwindowed/inter-chr mode, so little point in bothering with
  // multithreading.
  //
  // MAF < 0.05 markers have a minor effect on PLINK 1.07 --blocks's behavior
  // when present, while Haploview completely ignores them.  We replicate
  // Haploview's behavior.
  unsigned char* bigstack_mark = g_bigstack_base;
  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
  FILE* outfile = nullptr;
  FILE* outfile_det = nullptr;
  // circular.  [2n] = numStrong, [2n+1] = numRec
  uintptr_t* strong_rec_cts = nullptr;
  uintptr_t* founder_include2 = nullptr;
  uintptr_t* founder_male_include2 = nullptr;
  uintptr_t marker_uidx = 0;
  uintptr_t block_idx_first = 0;
  uintptr_t block_uidx_first = 0;
  uintptr_t block_pos_first = 0;
  uintptr_t prev_strong = 0;
  uintptr_t prev_rec = 0;
  uintptr_t markers_done = 0;
  uint32_t no_pheno_req = ldip->modifier & LD_BLOCKS_NO_PHENO_REQ;
  uint32_t max_window_bp = ldip->blocks_max_bp;
  // Extra span limits applied to 2-marker and 3-marker candidate blocks
  // respectively; disabled by LD_BLOCKS_NO_SMALL_MAX_SPAN below.
  uint32_t max_window_bp1 = 20000;
  uint32_t max_window_bp2 = 30000;
  uint32_t recomb_highci = ldip->blocks_recomb_highci;
  uint32_t strong_highci = ldip->blocks_strong_highci;
  uint32_t strong_lowci = ldip->blocks_strong_lowci;
  uint32_t strong_lowci_outer = ldip->blocks_strong_lowci_outer;
  uint32_t block_ct = 0;
  uint32_t maxspan = 0;
  uint32_t pct = 0;
  int32_t retval = 0;
  double recomb_fast_ln_thresh = -log((int32_t)((100 - recomb_highci) * 20));
  double inform_frac = ldip->blocks_inform_frac + SMALLISH_EPSILON;
  // Minimum number of informative pairs needed among the 3 pairs of a
  // 3-marker block / the 6 pairs of a 4-marker block.
  uint32_t inform_thresh_two = 1 + ((int32_t)(3 * inform_frac));
  uint32_t inform_thresh_three = (int32_t)(6 * inform_frac);
  uint32_t counts[15];
  // [0]: (m, m-1)
  // [1]: (m, m-2)
  // [2]: (m-1, m-2)
  // [3]: (m-1, m-3)
  // [4]: (m-2, m-3)
  uint32_t recent_ci_types[5];
  uint32_t index_tots[5];
  uintptr_t* founder_pnm;
  uintptr_t* marker_exclude;
  uintptr_t* in_haploblock;
  uintptr_t* loadbuf_raw;
  uintptr_t* index_data;
  uintptr_t* window_data;
  uintptr_t* window_data_ptr;
  unsigned char* bigstack_mark2;
  uint32_t* block_uidxs;
  uint32_t* forward_block_sizes;
  uint32_t* candidate_pairs;
  char* wptr_start;
  char* wptr;
  char* sptr;
  uintptr_t cur_marker_ct;
  uintptr_t max_block_size;
  uintptr_t marker_idx;
  uintptr_t cur_block_size;
  uintptr_t last_block_size;
  uintptr_t founder_ct;
  uintptr_t founder_ctl2;
  uintptr_t founder_ctv2;
  uintptr_t final_mask;
  uintptr_t futility_rec;
  uintptr_t max_candidates;
  uintptr_t candidate_ct;
  uintptr_t candidate_idx;
  uintptr_t delta;
  uintptr_t pct_thresh;
  uintptr_t ulii;
  double min_maf;
  double max_maf;
  double dxx;
  uint32_t chrom_fo_idx;
  uint32_t chrom_idx;
  uint32_t chrom_start;
  uint32_t chrom_end;
  uint32_t is_haploid;
  uint32_t is_x;
  uint32_t is_y;
  uint32_t marker_pos_thresh;
  uint32_t forward_scan_uidx;
  uint32_t block_cidx;
  uint32_t block_cidx2;
  uint32_t cur_strong;
  uint32_t cur_rec;
  uint32_t lowci_max;
  uint32_t lowci_min;
  uint32_t cur_ci_type;
  uint32_t cur_marker_pos;
  uint32_t uii;
  uint32_t ujj;
  // suppress warning
  index_tots[3] = 0;
  index_tots[4] = 0;
  if (ldip->modifier & LD_BLOCKS_NO_SMALL_MAX_SPAN) {
    max_window_bp1 = 0x7fffffff;
    max_window_bp2 = 0x7fffffff;
  }

  // First enforce MAF 0.05 minimum; then, on each chromosome:
  // 1. Determine maximum number of markers that might need to be loaded at
  //    once on current chromosome, and then (re)allocate memory buffers.
  // 2. Find all pairs of markers satisfying the "strong LD" and informative
  //    fraction criteria.  (The original algorithm deferred the informative
  //    fraction calculation; we don't do that because it forces nonsequential
  //    file access.)
  // 3. Sort the pairs in decreasing order primarily by bp distance, and
  //    secondarily by start uidx.
  // 4. Greedily construct blocks from the sorted list (i.e. form largest
  //    blocks first).
  if (bigstack_alloc_ul(unfiltered_sample_ctl, &founder_pnm) ||
      bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude) ||
      bigstack_alloc_ul(unfiltered_marker_ctl, &in_haploblock) ||
      bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw)) {
    goto haploview_blocks_ret_NOMEM;
  }
  // founder_pnm := founders, restricted to nonmissing phenotypes unless
  // 'no-pheno-req' was specified.  This is the sample mask used for all
  // genotype loading below.
  memcpy(founder_pnm, founder_info, unfiltered_sample_ctl * sizeof(intptr_t));
  if (!no_pheno_req) {
    bitvec_and(pheno_nm, unfiltered_sample_ctl, founder_pnm);
  }
  founder_ct = popcount_longs(founder_pnm, unfiltered_sample_ctl);
  if (founder_ct < 2) {
    if ((!no_pheno_req) && (!popcount_longs(pheno_nm, unfiltered_sample_ctl))) {
      logerrprint("Warning: Skipping --blocks, since there are less than two founders with\nnonmissing phenotypes.  (The 'no-pheno-req' modifier removes the phenotype\nrestriction.)\n");
    } else {
      logerrprint("Warning: Skipping --blocks, since there are less than two founders with\nnonmissing phenotypes.  (--make-founders may come in handy here.)\n");
    }
    goto haploview_blocks_ret_1;
  }
  final_mask = get_final_mask(founder_ct);
  // Work on a private copy of the variant exclusion mask so that the MAF
  // filter does not leak back to the caller.
  memcpy(marker_exclude, marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t));
  if (ldip->blocks_min_maf > 0.0) {
    min_maf = ldip->blocks_min_maf * (1 - SMALL_EPSILON);
    max_maf = 1 - min_maf;
    for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
      next_unset_ul_unsafe_ck(marker_exclude_orig, &marker_uidx);
      dxx = set_allele_freqs[marker_uidx];
      if ((dxx < min_maf) || (dxx > max_maf)) {
        set_bit_ul(marker_uidx, marker_exclude);
      }
    }
    marker_ct = unfiltered_marker_ct - popcount_longs(marker_exclude, unfiltered_marker_ctl);
  }
  if (marker_ct < 2) {
    logerrprint("Warning: Skipping --blocks since there are too few variants with MAF >= 0.05.\n");
    goto haploview_blocks_ret_1;
  }
  pct_thresh = marker_ct / 100;
  fill_ulong_zero(unfiltered_marker_ctl, in_haploblock);
  loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
  founder_ctl2 = QUATERCT_TO_WORDCT(founder_ct);
  founder_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(founder_ct);
  // index_data holds up to five masked copies of the current marker's
  // genotype vector (three for all samples, two more male-only ones on chrX).
  if (bigstack_alloc_ul(5 * founder_ctv2, &index_data)) {
    goto haploview_blocks_ret_NOMEM;
  }
  // The collapsed filters must be built from the same sample mask that
  // founder_ct and load_and_collapse_incl() use (founder_pnm), not from the
  // raw founder mask; otherwise the male filter is misaligned with the loaded
  // genotype vectors whenever the phenotype restriction removes a founder.
  if (alloc_collapsed_haploid_filters(founder_pnm, sex_male, unfiltered_sample_ct, founder_ct, Y_FIX_NEEDED, 1, &founder_include2, &founder_male_include2)) {
    goto haploview_blocks_ret_NOMEM;
  }
  memcpy(outname_end, ".blocks.det", 12);
  if (fopen_checked(outname, "w", &outfile_det)) {
    goto haploview_blocks_ret_OPEN_FAIL;
  }
  if (fputs_checked(" CHR          BP1          BP2           KB  NSNPS SNPS\n", outfile_det)) {
    goto haploview_blocks_ret_WRITE_FAIL;
  }
  // Truncate ".blocks.det" to ".blocks" for the main output file.
  outname_end[7] = '\0';
  if (fopen_checked(outname, "w", &outfile)) {
    goto haploview_blocks_ret_OPEN_FAIL;
  }
  // Everything allocated past this point is per-chromosome scratch space.
  bigstack_mark2 = g_bigstack_base;
  fputs("--blocks: 0%", stdout);
  fflush(stdout);
  for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++, markers_done += cur_marker_ct) {
    chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
    chrom_start = next_unset(marker_exclude, chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx], chrom_end);
    cur_marker_ct = chrom_end - chrom_start - popcount_bit_idx(marker_exclude, chrom_start, chrom_end);
    if (cur_marker_ct < 2) {
      continue;
    }
    marker_uidx = chrom_start;
    chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
    max_block_size = chrom_window_max(marker_pos, marker_exclude, chrom_info_ptr, chrom_idx, 0x7fffffff, max_window_bp, 1);
    if (max_block_size < 2) {
      continue;
    }
#ifndef __LP64__
    if (max_block_size > 65536) {
      logprint("\n");
      logerrprint("Error: 32-bit --blocks cannot analyze potential blocks with more than 65536\nvariants.  Use a 64-bit PLINK build or a smaller --blocks-window-kb value.\n");
      goto haploview_blocks_ret_INVALID_CMDLINE;
    }
#endif
    is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
    is_x = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[X_OFFSET]);
    is_y = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[Y_OFFSET]);
    bigstack_reset(bigstack_mark2);
    // Need to compute full 3x3 count tables, but only for a limited window;
    // more similar to --clump than --fast-epistasis, so we don't bother with
    // precomputing 0-only/1-only/2-only bitfields or multithreading for now.

    // For each pair, we just need to know 100x the Haploview lowCI and highCI
    // values; lod and dp are unnecessary since the CI value also tracks bad
    // pairs.  More precisely, there are only seven types of CIs worth
    // distinguishing:
    // 0. non-bad pair, and highCI < recHighCI (0.90)
    // 1. "null" (bad pair, or highCI in [0.90, 0.98))
    // 2. highCI >= 0.98, and lowCI < 0.51
    //    (treated the same as type 1, but it takes no additional effort to
    //    distinguish this case)
    // 3. highCI >= 0.98, and lowCI in [0.51, 0.70)
    // 4. highCI >= 0.98, and lowCI == 0.70
    //    (turns out (double)70 / 100.0 compares exactly equal to 0.70, so
    //    Haploview's use of < cutLowCI in its initial "strong LD" check
    //    actually behaves differently from the later "not > cutLowCI" check)
    // 5. highCI >= 0.98, and lowCI in [0.71, 0.81)
    // 6. highCI >= 0.98, and lowCI in [0.81, 1]
    // And it gets better than that: given block size n, we just need to
    // maintain #(type 0) and #(type 4/5/6) arrays (and a tiny array with more
    // detailed information on the most recent blocks) to find all potentially
    // valid blocks in a single pass.  So we can use practically all our memory
    // to track and sort those blocks by bp length.
    if (bigstack_alloc_ui(max_block_size, &block_uidxs) ||
        bigstack_alloc_ui(max_block_size, &forward_block_sizes) ||
        bigstack_alloc_ul(max_block_size * founder_ctv2, &window_data)) {
      goto haploview_blocks_ret_NOMEM;
    }
    if (max_block_size >= 4) {
      // After marker m is fully processed,
      //   strong_rec_cts[(block_cidx + delta) * 2] = numStrong, and
      //   strong_rec_cts[(block_cidx + delta) * 2 + 1] = numRec
      // for the potential [m - delta, m] block, taking array indices modulo
      // max_block_size * 2.
      if (bigstack_alloc_ul(max_block_size * 2, &strong_rec_cts)) {
        goto haploview_blocks_ret_NOMEM;
      }
    }
    // Zero the last two words of every window_data slot once up front.
    window_data_ptr = &(window_data[founder_ctv2 - 2]);
    for (ulii = 0; ulii < max_block_size; ulii++) {
      window_data_ptr[0] = 0;
      window_data_ptr[1] = 0;
      window_data_ptr = &(window_data_ptr[founder_ctv2]);
    }
    block_idx_first = 0;
    block_uidx_first = chrom_start;
    marker_uidx = chrom_start;
    block_pos_first = marker_pos[chrom_start];
    // All remaining workspace goes to the candidate list.  Each candidate is
    // a uint32 triple: [3k] = bp span, [3k+1] = first marker uidx,
    // [3k+2] = last marker uidx.
    max_candidates = bigstack_left() / (3 * sizeof(int32_t));
    if (bigstack_alloc_ui(max_candidates * 3, &candidate_pairs)) {
      goto haploview_blocks_ret_NOMEM;
    }
    candidate_ct = 0;
    cur_block_size = 0;
    // Only [0..2] need clearing: [3] and [4] are always overwritten from
    // [1] and [2] at the top of the marker loop before they are read.
    fill_uint_zero(3, recent_ci_types);
    // count down instead of up so more memory accesses are sequential
    block_cidx = max_block_size;
    forward_scan_uidx = marker_uidx;
    if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto haploview_blocks_ret_READ_FAIL;
    }
    for (marker_idx = 0; marker_idx < cur_marker_ct; marker_uidx++, marker_idx++) {
      // Advance (downward, with wraparound) to the circular-buffer slot for
      // the current marker.
      if (block_cidx) {
        block_cidx--;
      } else {
        block_cidx = max_block_size - 1;
      }
      window_data_ptr = &(window_data[block_cidx * founder_ctv2]);
      if (IS_SET(marker_exclude, marker_uidx)) {
        marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
        if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
          goto haploview_blocks_ret_READ_FAIL;
        }
      }
      block_uidxs[block_cidx] = marker_uidx;
      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_pnm, final_mask, 0, bedfile, loadbuf_raw, window_data_ptr)) {
        goto haploview_blocks_ret_READ_FAIL;
      }
      if (is_haploid) {
        haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)window_data_ptr);
      }
      // Slide the left edge of the window forward until it is within
      // max_window_bp of the current marker.
      cur_marker_pos = marker_pos[marker_uidx];
      marker_pos_thresh = cur_marker_pos;
      if (marker_pos_thresh < max_window_bp) {
        marker_pos_thresh = 0;
      } else {
        marker_pos_thresh -= max_window_bp;
      }
      if (marker_pos_thresh > block_pos_first) {
        do {
          block_uidx_first++;
          next_unset_ul_unsafe_ck(marker_exclude, &block_uidx_first);
          block_pos_first = marker_pos[block_uidx_first];
          block_idx_first++;
        } while (marker_pos_thresh > block_pos_first);
      }
      last_block_size = cur_block_size;
      cur_block_size = marker_idx - block_idx_first;
      // Age the recent-pair classifications by one marker (see the index
      // legend next to the recent_ci_types declaration).
      recent_ci_types[4] = recent_ci_types[2];
      recent_ci_types[2] = recent_ci_types[0];
      recent_ci_types[3] = recent_ci_types[1];
      // The backward scan can grow by at most one marker per step, since
      // strong_rec_cts entries beyond the previous scan length are stale.
      if (cur_block_size > last_block_size) {
        cur_block_size = last_block_size + 1;
      }
      // now determine maximum local block size, so we can set futility_rec
      // efficiently.
      marker_pos_thresh = cur_marker_pos + max_window_bp;
      if (forward_scan_uidx < marker_uidx) {
        forward_scan_uidx = marker_uidx;
      }
      while (marker_pos_thresh >= marker_pos[forward_scan_uidx]) {
        uii = forward_scan_uidx + 1;
        next_unset_ck(marker_exclude, chrom_end, &uii);
        if (uii == chrom_end) {
          break;
        }
        forward_scan_uidx = uii;
      }
      uii = forward_scan_uidx + 1 - marker_uidx - popcount_bit_idx(marker_exclude, marker_uidx, forward_scan_uidx);
      forward_block_sizes[block_cidx] = uii;
      if (!cur_block_size) {
        continue;
      }
      // uii := largest forward block size among the current marker and the
      // cur_block_size markers preceding it.
      block_cidx2 = block_cidx + 1;
      for (delta = 1; delta <= cur_block_size; delta++, block_cidx2++) {
        if (block_cidx2 == max_block_size) {
          block_cidx2 = 0;
        }
        if (forward_block_sizes[block_cidx2] > uii) {
          uii = forward_block_sizes[block_cidx2];
        }
      }
      ulii = uii;
      // If numRec ever reaches this value, we can just move on to the next
      // marker (even skipping the remaining D' evaluations).
      futility_rec = 1 + (((double)((intptr_t)((ulii * (ulii - 1)) / 2))) * (1.0 - inform_frac));
      block_cidx2 = block_cidx + 1;
      cur_strong = 0;
      cur_rec = 0;
      // Split the current marker's genotypes into three masked vectors (2-bit
      // codes 0, 2 and 3; presumably the nonmissing genotype classes under
      // the .bed encoding) and record how many samples fall into each.
      vec_datamask(founder_ct, 0, window_data_ptr, founder_include2, index_data);
      index_tots[0] = popcount2_longs(index_data, founder_ctl2);
      vec_datamask(founder_ct, 2, window_data_ptr, founder_include2, &(index_data[founder_ctv2]));
      index_tots[1] = popcount2_longs(&(index_data[founder_ctv2]), founder_ctl2);
      vec_datamask(founder_ct, 3, window_data_ptr, founder_include2, &(index_data[2 * founder_ctv2]));
      index_tots[2] = popcount2_longs(&(index_data[2 * founder_ctv2]), founder_ctl2);
      if (is_x) {
        // chrX additionally needs male-only counts for codes 0 and 3.
        vec_datamask(founder_ct, 0, window_data_ptr, founder_male_include2, &(index_data[3 * founder_ctv2]));
        index_tots[3] = popcount2_longs(&(index_data[3 * founder_ctv2]), founder_ctl2);
        vec_datamask(founder_ct, 3, window_data_ptr, founder_male_include2, &(index_data[4 * founder_ctv2]));
        index_tots[4] = popcount2_longs(&(index_data[4 * founder_ctv2]), founder_ctl2);
      }
      // lowci_max / lowci_min bound the lowCI resolution requested from
      // haploview_blocks_classify(); they are tightened below once the finer
      // type distinctions are no longer needed (after delta 1 and delta 3).
      lowci_max = 82;
      lowci_min = 52;
      // Walk backwards from the current marker m; each iteration evaluates
      // the (m, m - delta) pair and the candidate block [m - delta, m].
      for (delta = 1; delta <= cur_block_size; delta++, block_cidx2++) {
        if (block_cidx2 == max_block_size) {
          block_cidx2 = 0;
        }
        if (delta >= 4) {
          prev_rec = strong_rec_cts[block_cidx2 * 2 + 1];
          if (cur_rec + prev_rec >= futility_rec) {
            cur_block_size = delta - 1;
            break;
          }
          prev_strong = strong_rec_cts[block_cidx2 * 2];
        }
        window_data_ptr = &(window_data[block_cidx2 * founder_ctv2]);
        // Fill in the 3x3 genotype count table; genovec_3freq() supplies
        // three of the four cells in each row, and the remaining one is
        // obtained by subtraction from the row total.
        genovec_3freq(window_data_ptr, index_data, founder_ctl2, &(counts[0]), &(counts[1]), &(counts[2]));
        counts[0] = index_tots[0] - counts[0] - counts[1] - counts[2];
        genovec_3freq(window_data_ptr, &(index_data[founder_ctv2]), founder_ctl2, &(counts[3]), &(counts[4]), &(counts[5]));
        counts[3] = index_tots[1] - counts[3] - counts[4] - counts[5];
        genovec_3freq(window_data_ptr, &(index_data[2 * founder_ctv2]), founder_ctl2, &(counts[6]), &(counts[7]), &(counts[8]));
        counts[6] = index_tots[2] - counts[6] - counts[7] - counts[8];
        if (is_x) {
          genovec_3freq(window_data_ptr, &(index_data[3 * founder_ctv2]), founder_ctl2, &(counts[9]), &(counts[10]), &(counts[11]));
          // counts[10] should always be zero
          counts[9] = index_tots[3] - counts[9] - counts[11];
          genovec_3freq(window_data_ptr, &(index_data[4 * founder_ctv2]), founder_ctl2, &(counts[12]), &(counts[13]), &(counts[14]));
          counts[12] = index_tots[4] - counts[12] - counts[14];
        }
        cur_ci_type = haploview_blocks_classify(counts, lowci_max, lowci_min, recomb_highci, strong_highci, strong_lowci, strong_lowci_outer, is_x, recomb_fast_ln_thresh);
        if (cur_ci_type > 4) {
          cur_strong++;
        } else if (!cur_ci_type) {
          cur_rec++;
        }
        if (delta < 4) {
          // Blocks of 2-4 markers have their own acceptance rules.
          if (delta == 1) {
            lowci_max = strong_lowci;
            recent_ci_types[0] = cur_ci_type;
            if ((cur_ci_type == 6) && (cur_marker_pos - marker_pos[block_uidxs[block_cidx2]] <= max_window_bp1)) {
              goto haploview_blocks_save_candidate;
            }
          } else if (delta == 2) {
            recent_ci_types[1] = cur_ci_type;
            if ((cur_ci_type >= 4) && (cur_marker_pos - marker_pos[block_uidxs[block_cidx2]] <= max_window_bp2)) {
              uii = 1;
              if (recent_ci_types[0] >= 3) {
                uii++;
              }
              if (recent_ci_types[2] >= 3) {
                uii++;
              }
              if (uii >= inform_thresh_two) {
                goto haploview_blocks_save_candidate;
              }
            }
          } else {
            // delta == 3: seed strong_rec_cts for this start marker from the
            // five remembered pair types plus the current one.
            lowci_min = strong_lowci_outer;
            prev_strong = 0; // 5+
            uii = 0; // 3+, not counting cur_ci_type
            prev_rec = 0;
            if (cur_ci_type > 4) {
              prev_strong++;
            } else if (!cur_ci_type) {
              prev_rec++;
            }
            for (ujj = 0; ujj < 5; ujj++) {
              if (recent_ci_types[ujj] >= 3) {
                uii++;
                if (recent_ci_types[ujj] > 4) {
                  prev_strong++;
                }
              } else if (!recent_ci_types[ujj]) {
                prev_rec++;
              }
            }
            strong_rec_cts[block_cidx2 * 2] = prev_strong;
            strong_rec_cts[block_cidx2 * 2 + 1] = prev_rec;
            if ((cur_ci_type >= 4) && (uii >= inform_thresh_three)) {
              goto haploview_blocks_save_candidate;
            }
          }
        } else {
          // delta >= 4: extend the stored [m - delta, m - 1] tallies with the
          // pairs involving the current marker.
          prev_strong += cur_strong;
          prev_rec += cur_rec;
          strong_rec_cts[block_cidx2 * 2] = prev_strong;
          strong_rec_cts[block_cidx2 * 2 + 1] = prev_rec;
          ulii = prev_strong + prev_rec;
          if ((cur_ci_type >= 4) && (ulii >= 6) && (((intptr_t)ulii) * inform_frac < ((double)((intptr_t)prev_strong)))) {
          haploview_blocks_save_candidate:
            if (candidate_ct == max_candidates) {
              goto haploview_blocks_ret_NOMEM;
            }
            uii = block_uidxs[block_cidx2];
            candidate_pairs[3 * candidate_ct] = cur_marker_pos - marker_pos[uii];
            candidate_pairs[3 * candidate_ct + 1] = uii;
            candidate_pairs[3 * candidate_ct + 2] = marker_uidx;
            candidate_ct++;
          }
        }
      }
      // Progress indicator; the extra backspace erases the second digit once
      // a two-digit percentage has been printed.
      if (markers_done + marker_idx >= pct_thresh) {
        if (pct > 10) {
          putc_unlocked('\b', stdout);
        }
        pct = ((markers_done + marker_idx) * 100LLU) / marker_ct;
        printf("\b\b%u%%", pct++);
        fflush(stdout);
        pct_thresh = (pct * marker_ct) / 100;
      }
    }
    if (!candidate_ct) {
      continue;
    }
    // 12 == size of one candidate triple.
    qsort(candidate_pairs, candidate_ct, 12, intcmp3_decr);
    if (candidate_pairs[0] > maxspan) {
      maxspan = candidate_pairs[0];
    }
    // Greedy selection: accept a candidate iff neither endpoint already lies
    // in an accepted block.  Accepted (start, end) uidx pairs are compacted
    // in place at the front of candidate_pairs.
    ulii = 0; // final haploblock count
    for (candidate_idx = 0; candidate_idx < candidate_ct; candidate_idx++) {
      block_cidx = candidate_pairs[candidate_idx * 3 + 1];
      if (is_set(in_haploblock, block_cidx)) {
        continue;
      }
      block_cidx2 = candidate_pairs[candidate_idx * 3 + 2];
      if (is_set(in_haploblock, block_cidx2)) {
        continue;
      }
      candidate_pairs[2 * ulii] = block_cidx;
      candidate_pairs[2 * ulii + 1] = block_cidx2;
      fill_bits(block_cidx, block_cidx2 + 1 - block_cidx, in_haploblock);
      ulii++;
    }
    // Re-sort the accepted blocks, treating each (start, end) pair as a
    // single 64-bit key, before writing them out.
#ifdef __cplusplus
    std::sort((int64_t*)candidate_pairs, (int64_t*)(&(candidate_pairs[ulii * 2])));
#else
    qsort(candidate_pairs, ulii, sizeof(int64_t), llcmp);
#endif
    // The chromosome column is shared by every .blocks.det line for this
    // chromosome, so it is rendered once at the front of g_textbuf.
    wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf));
    wptr_start = memseta(wptr_start, 32, 3);
    for (candidate_idx = 0; candidate_idx < ulii; candidate_idx++) {
      putc_unlocked('*', outfile);
      block_cidx = candidate_pairs[2 * candidate_idx];
      block_cidx2 = candidate_pairs[2 * candidate_idx + 1];
      marker_uidx = block_cidx;
      wptr = uint32toa_w10(marker_pos[block_cidx], wptr_start);
      wptr = memseta(wptr, 32, 3);
      wptr = uint32toa_w10x(marker_pos[block_cidx2], ' ', wptr);
      wptr = width_force(12, wptr, dtoa_g(((int32_t)(marker_pos[block_cidx2] + 1 - marker_pos[block_cidx])) * 0.001, wptr));
      *wptr++ = ' ';
      wptr = uint32toa_w6x(block_cidx2 + 1 - block_cidx - popcount_bit_idx(marker_exclude, block_cidx, block_cidx2), ' ', wptr);
      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_det)) {
        goto haploview_blocks_ret_WRITE_FAIL;
      }
      // Variant IDs: space-prefixed in .blocks, '|'-separated in .blocks.det.
      for (marker_uidx = block_cidx; marker_uidx <= block_cidx2; marker_uidx++) {
        next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
        sptr = &(marker_ids[marker_uidx * max_marker_id_len]);
        putc_unlocked(' ', outfile);
        fputs(sptr, outfile);
        if (marker_uidx != block_cidx) {
          putc_unlocked('|', outfile_det);
        }
        fputs(sptr, outfile_det);
      }
      putc_unlocked('\n', outfile);
      putc_unlocked('\n', outfile_det);
    }
    block_ct += ulii;
  }
  if (fclose_null(&outfile)) {
    goto haploview_blocks_ret_WRITE_FAIL;
  }
  if (fclose_null(&outfile_det)) {
    goto haploview_blocks_ret_WRITE_FAIL;
  }
  putc_unlocked('\r', stdout);
  LOGPRINTFWW("--blocks: %u haploblock%s written to %s .\n", block_ct, (block_ct == 1)? "" : "s", outname);
  LOGPRINTFWW("Extra block details written to %s.det .\n", outname);
  if (block_ct) {
    LOGPRINTF("Longest span: %gkb.\n", ((double)(maxspan + 1)) * 0.001);
  }
  // Error exits: the loop body is never entered normally; each label sets
  // retval and breaks out to the shared cleanup below.
  while (0) {
  haploview_blocks_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  haploview_blocks_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  haploview_blocks_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  haploview_blocks_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
#ifndef __LP64__
  haploview_blocks_ret_INVALID_CMDLINE:
    retval = RET_INVALID_CMDLINE;
    break;
#endif
  }
 haploview_blocks_ret_1:
  bigstack_reset(bigstack_mark);
  fclose_cond(outfile);
  fclose_cond(outfile_det);
  return retval;
}
7645 
twolocus_write_table(FILE * outfile,uint32_t * counts,uint32_t plink_maxsnp,char * mkr1,char * mkr2,char * allele00,char * allele01,char * allele10,char * allele11,uint32_t alen00,uint32_t alen01,uint32_t alen10,uint32_t alen11)7646 void twolocus_write_table(FILE* outfile, uint32_t* counts, uint32_t plink_maxsnp, char* mkr1, char* mkr2, char* allele00, char* allele01, char* allele10, char* allele11, uint32_t alen00, uint32_t alen01, uint32_t alen10, uint32_t alen11) {
7647   // PLINK 1.07's print settings for this function don't handle large numbers
7648   // well so we break byte-for-byte compatibility.
7649   char* bufptr = memseta(g_textbuf, 32, plink_maxsnp + 14);
7650   uint32_t* uiptr = counts;
7651   uint32_t total = 0;
7652   uint32_t marg_a[4];
7653   uint32_t marg_b[4];
7654   char spaces[7];
7655   double tot_recip;
7656   uint32_t uii;
7657   uint32_t ujj;
7658   uint32_t ukk;
7659   uint32_t umm;
7660   fill_uint_zero(4, marg_b);
7661   memset(spaces, 32, 7);
7662   for (uii = 0; uii < 4; uii++) {
7663     ukk = 0;
7664     for (ujj = 0; ujj < 4; ujj++) {
7665       umm = *uiptr++;
7666       ukk += umm;
7667       marg_b[ujj] += umm;
7668     }
7669     marg_a[uii] = ukk;
7670     total += ukk;
7671   }
7672   tot_recip = 1.0 / ((double)((int32_t)total));
7673   bufptr = strcpyax(bufptr, mkr2, '\n');
7674   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7675   fwrite(g_textbuf, 1, plink_maxsnp + 7, outfile);
7676   if (alen10 < 4) {
7677     fwrite(spaces, 1, 9 - 2 * alen10, outfile);
7678   }
7679   fputs(allele10, outfile);
7680   putc_unlocked('/', outfile);
7681   fputs(allele10, outfile);
7682   putc_unlocked(' ', outfile);
7683   if (alen10 + alen11 < 7) {
7684     fwrite(spaces, 1, 9 - alen10 - alen11, outfile);
7685   }
7686   fputs(allele10, outfile);
7687   putc_unlocked('/', outfile);
7688   fputs(allele11, outfile);
7689   putc_unlocked(' ', outfile);
7690   if (alen11 < 4) {
7691     fwrite(spaces, 1, 9 - 2 * alen11, outfile);
7692   }
7693   fputs(allele11, outfile);
7694   putc_unlocked('/', outfile);
7695   fputs(allele11, outfile);
7696   fputs("        0/0        */*\n", outfile);
7697 
7698   bufptr = fw_strcpy(plink_maxsnp, mkr1, g_textbuf);
7699   *bufptr++ = ' ';
7700   if (alen00 == 1) {
7701     bufptr = memseta(bufptr, 32, 2);
7702   }
7703   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7704   fputs(allele00, outfile);
7705   putc_unlocked('/', outfile);
7706   fputs(allele00, outfile);
7707   bufptr = g_textbuf;
7708   *bufptr++ = ' ';
7709   bufptr = uint32toa_w10x(counts[0], ' ', bufptr);
7710   bufptr = uint32toa_w10x(counts[2], ' ', bufptr);
7711   bufptr = uint32toa_w10x(counts[3], ' ', bufptr);
7712   bufptr = uint32toa_w10x(counts[1], ' ', bufptr);
7713   bufptr = uint32toa_w10x(marg_a[0], '\n', bufptr);
7714   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7715 
7716   bufptr = memseta(g_textbuf, 32, plink_maxsnp + 1);
7717   if (alen00 + alen01 < 4) {
7718     bufptr = memseta(bufptr, 32, 4 - alen00 - alen01);
7719   }
7720   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7721   fputs(allele00, outfile);
7722   putc_unlocked('/', outfile);
7723   fputs(allele01, outfile);
7724   bufptr = g_textbuf;
7725   *bufptr++ = ' ';
7726   bufptr = uint32toa_w10x(counts[8], ' ', bufptr);
7727   bufptr = uint32toa_w10x(counts[10], ' ', bufptr);
7728   bufptr = uint32toa_w10x(counts[11], ' ', bufptr);
7729   bufptr = uint32toa_w10x(counts[9], ' ', bufptr);
7730   bufptr = uint32toa_w10x(marg_a[2], '\n', bufptr);
7731   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7732 
7733   bufptr = memseta(g_textbuf, 32, plink_maxsnp + 1);
7734   if (alen01 == 1) {
7735     bufptr = memseta(bufptr, 32, 2);
7736   }
7737   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7738   fputs(allele01, outfile);
7739   putc_unlocked('/', outfile);
7740   fputs(allele01, outfile);
7741   bufptr = g_textbuf;
7742   *bufptr++ = ' ';
7743   bufptr = uint32toa_w10x(counts[12], ' ', bufptr);
7744   bufptr = uint32toa_w10x(counts[14], ' ', bufptr);
7745   bufptr = uint32toa_w10x(counts[15], ' ', bufptr);
7746   bufptr = uint32toa_w10x(counts[13], ' ', bufptr);
7747   bufptr = uint32toa_w10x(marg_a[3], '\n', bufptr);
7748   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7749 
7750   bufptr = memseta(g_textbuf, 32, plink_maxsnp + 3);
7751   bufptr = memcpya(bufptr, "0/0 ", 4);
7752   bufptr = uint32toa_w10x(counts[4], ' ', bufptr);
7753   bufptr = uint32toa_w10x(counts[6], ' ', bufptr);
7754   bufptr = uint32toa_w10x(counts[7], ' ', bufptr);
7755   bufptr = uint32toa_w10x(counts[5], ' ', bufptr);
7756   bufptr = uint32toa_w10x(marg_a[1], '\n', bufptr);
7757   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7758 
7759   bufptr = memseta(g_textbuf, 32, plink_maxsnp + 3);
7760   bufptr = memcpya(bufptr, "*/* ", 4);
7761   bufptr = uint32toa_w10x(marg_b[0], ' ', bufptr);
7762   bufptr = uint32toa_w10x(marg_b[2], ' ', bufptr);
7763   bufptr = uint32toa_w10x(marg_b[3], ' ', bufptr);
7764   bufptr = uint32toa_w10x(marg_b[1], ' ', bufptr);
7765   bufptr = uint32toa_w10x(total, '\n', bufptr);
7766   *bufptr++ = '\n';
7767   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7768 
7769   bufptr = memseta(g_textbuf, 32, plink_maxsnp + 14);
7770   bufptr = strcpyax(bufptr, mkr2, '\n');
7771   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7772   fwrite(g_textbuf, 1, plink_maxsnp + 9, outfile);
7773   fputs(allele10, outfile);
7774   putc_unlocked('/', outfile);
7775   fputs(allele10, outfile);
7776   if (alen10 < 4) {
7777     fwrite(spaces, 1, 9 - 2 * alen10, outfile);
7778   }
7779   putc_unlocked(' ', outfile);
7780   fputs(allele10, outfile);
7781   putc_unlocked('/', outfile);
7782   fputs(allele11, outfile);
7783   if (alen10 + alen11 < 7) {
7784     fwrite(spaces, 1, 9 - alen10 - alen11, outfile);
7785   }
7786   putc_unlocked(' ', outfile);
7787   fputs(allele11, outfile);
7788   putc_unlocked('/', outfile);
7789   fputs(allele11, outfile);
7790   if (alen11 < 4) {
7791     fwrite(spaces, 1, 9 - 2 * alen11, outfile);
7792   }
7793   fputs(" 0/0        */*\n", outfile);
7794 
7795   bufptr = fw_strcpy(plink_maxsnp, mkr1, g_textbuf);
7796   *bufptr++ = ' ';
7797   if (alen00 == 1) {
7798     bufptr = memseta(bufptr, 32, 2);
7799   }
7800   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7801   fputs(allele00, outfile);
7802   putc_unlocked('/', outfile);
7803   fputs(allele00, outfile);
7804   bufptr = memseta(g_textbuf, 32, 2);
7805   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[0]) * tot_recip, bufptr);
7806   bufptr = memseta(bufptr, 32, 2);
7807   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[2]) * tot_recip, bufptr);
7808   bufptr = memseta(bufptr, 32, 2);
7809   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[3]) * tot_recip, bufptr);
7810   bufptr = memseta(bufptr, 32, 2);
7811   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[1]) * tot_recip, bufptr);
7812   bufptr = memseta(bufptr, 32, 2);
7813   bufptr = dtoa_f_w9p6_clipped(((int32_t)marg_a[0]) * tot_recip, bufptr);
7814   *bufptr++ = '\n';
7815   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7816 
7817   bufptr = memseta(g_textbuf, 32, plink_maxsnp + 1);
7818   if (alen00 + alen01 < 4) {
7819     bufptr = memseta(bufptr, 32, 4 - alen00 - alen01);
7820   }
7821   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7822   fputs(allele00, outfile);
7823   putc_unlocked('/', outfile);
7824   fputs(allele01, outfile);
7825   bufptr = memseta(g_textbuf, 32, 2);
7826   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[8]) * tot_recip, bufptr);
7827   bufptr = memseta(bufptr, 32, 2);
7828   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[10]) * tot_recip, bufptr);
7829   bufptr = memseta(bufptr, 32, 2);
7830   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[11]) * tot_recip, bufptr);
7831   bufptr = memseta(bufptr, 32, 2);
7832   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[9]) * tot_recip, bufptr);
7833   bufptr = memseta(bufptr, 32, 2);
7834   bufptr = dtoa_f_w9p6_clipped(((int32_t)marg_a[2]) * tot_recip, bufptr);
7835   *bufptr++ = '\n';
7836   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7837 
7838   bufptr = memseta(g_textbuf, 32, plink_maxsnp + 1);
7839   if (alen01 == 1) {
7840     bufptr = memseta(bufptr, 32, 2);
7841   }
7842   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7843   fputs(allele01, outfile);
7844   putc_unlocked('/', outfile);
7845   fputs(allele01, outfile);
7846   bufptr = memseta(g_textbuf, 32, 2);
7847   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[12]) * tot_recip, bufptr);
7848   bufptr = memseta(bufptr, 32, 2);
7849   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[14]) * tot_recip, bufptr);
7850   bufptr = memseta(bufptr, 32, 2);
7851   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[15]) * tot_recip, bufptr);
7852   bufptr = memseta(bufptr, 32, 2);
7853   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[13]) * tot_recip, bufptr);
7854   bufptr = memseta(bufptr, 32, 2);
7855   bufptr = dtoa_f_w9p6_clipped(((int32_t)marg_a[3]) * tot_recip, bufptr);
7856   *bufptr++ = '\n';
7857   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7858 
7859   bufptr = memseta(g_textbuf, 32, plink_maxsnp + 3);
7860   bufptr = memcpya(bufptr, "0/0  ", 5);
7861   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[4]) * tot_recip, bufptr);
7862   bufptr = memseta(bufptr, 32, 2);
7863   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[6]) * tot_recip, bufptr);
7864   bufptr = memseta(bufptr, 32, 2);
7865   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[7]) * tot_recip, bufptr);
7866   bufptr = memseta(bufptr, 32, 2);
7867   bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[5]) * tot_recip, bufptr);
7868   bufptr = memseta(bufptr, 32, 2);
7869   bufptr = dtoa_f_w9p6_clipped(((int32_t)marg_a[1]) * tot_recip, bufptr);
7870   *bufptr++ = '\n';
7871   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7872 
7873   bufptr = memseta(g_textbuf, 32, plink_maxsnp + 3);
7874   bufptr = memcpya(bufptr, "*/*  ", 5);
7875   bufptr = dtoa_f_w9p6_spaced(((int32_t)marg_b[0]) * tot_recip, bufptr);
7876   bufptr = memseta(bufptr, 32, 2);
7877   bufptr = dtoa_f_w9p6_spaced(((int32_t)marg_b[2]) * tot_recip, bufptr);
7878   bufptr = memseta(bufptr, 32, 2);
7879   bufptr = dtoa_f_w9p6_spaced(((int32_t)marg_b[3]) * tot_recip, bufptr);
7880   bufptr = memseta(bufptr, 32, 2);
7881   bufptr = dtoa_f_w9p6_spaced(((int32_t)marg_b[1]) * tot_recip, bufptr);
7882   bufptr = memcpya(bufptr, "   1\n\n", 6);
7883   fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
7884 }
7885 
// Shared driver for the --twolocus report (outname non-null) and the --ld
// two-variant summary (outname null).  It looks up the two variant IDs stored
// in epi_ip, loads both variants' genotypes, tallies the 4x4 joint genotype
// table, and then either writes the tables to <outname>.twolocus or logs
// R-sq, D', and estimated haplotype frequencies for each phasing solution.
// Returns 0 on success (including the "no founders" skip in --ld mode), or a
// RET_NOMEM / RET_OPEN_FAIL / RET_READ_FAIL / RET_WRITE_FAIL /
// RET_INVALID_CMDLINE code.  The bigstack is reset to its entry position
// before returning.
twolocus(Epi_info * epi_ip,FILE * bedfile,uintptr_t bed_offset,uintptr_t marker_ct,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude,uintptr_t * marker_reverse,char * marker_ids,uintptr_t max_marker_id_len,uint32_t plink_maxsnp,char ** marker_allele_ptrs,Chrom_info * chrom_info_ptr,uintptr_t unfiltered_sample_ct,uintptr_t * sample_exclude,uintptr_t sample_ct,uintptr_t * pheno_nm,uint32_t pheno_nm_ct,uint32_t pheno_ctrl_ct,uintptr_t * pheno_c,uintptr_t * sex_male,char * outname,char * outname_end,uint32_t hh_exists)7886 int32_t twolocus(Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, uint32_t pheno_ctrl_ct, uintptr_t* pheno_c, uintptr_t* sex_male, char* outname, char* outname_end, uint32_t hh_exists) {
7887   unsigned char* bigstack_mark = g_bigstack_base;
7888   FILE* outfile = nullptr;
7889   char* mkr1 = outname? epi_ip->twolocus_mkr1 : epi_ip->ld_mkr1;
7890   char* mkr2 = outname? epi_ip->twolocus_mkr2 : epi_ip->ld_mkr2;
7891   uintptr_t* sample_include2 = nullptr;
7892   uintptr_t* sample_male_include2 = nullptr;
7893   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
7894   uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
7895   uintptr_t ulii = strlen(mkr1) + 1;
7896   uintptr_t uljj = strlen(mkr2) + 1;
7897   uint32_t hwe_midp = epi_ip->modifier & EPI_HWE_MIDP;
7898   int32_t retval = 0;
7899   uint32_t counts_all[16];
7900   uint32_t counts_cc[32];
7901   uintptr_t* loadbufs[2];
7902   uintptr_t marker_uidxs[2];
7903   double solutions[3];
7904   uint32_t is_haploid[2];
7905   uint32_t is_x[2];
7906   uintptr_t* loadbuf_raw;
7907   uintptr_t* loadbuf0_ptr;
7908   uintptr_t* loadbuf1_ptr;
7909   uintptr_t* loadbuf0_end;
7910   char* bufptr;
7911   char* bufptr2;
7912   uintptr_t sample_ctl2;
7913   uintptr_t final_mask;
7914   uintptr_t marker_uidx;
7915   uintptr_t marker_idx;
7916   uintptr_t sample_uidx;
7917   uintptr_t sample_idx;
7918   uintptr_t sample_idx_end;
7919   uintptr_t ulkk;
7920   double twice_tot_recip;
7921   double half_hethet_share;
7922   double freq11;
7923   double freq12;
7924   double freq21;
7925   double freq22;
7926   double freq1x;
7927   double freq2x;
7928   double freqx1;
7929   double freqx2;
7930   double dxx;
7931   uint32_t chrom_fo_idx;
7932   uint32_t chrom_idx;
7933   uint32_t is_y;
7934   uint32_t alen00;
7935   uint32_t alen01;
7936   uint32_t alen10;
7937   uint32_t alen11;
7938   uint32_t count_total;
       // --ld mode: the bitmap passed in as sample_exclude is popcounted to get
       // sample_ct and then inverted, i.e. it is treated as an inclusion mask on
       // entry (presumably the founder bitmap, given the warning text below).
7939   if (!outname) {
7940     ulkk = BITCT_TO_WORDCT(unfiltered_sample_ct);
7941     // ulkk = (unfiltered_sample_ctl2 + 1) / 2;
7942     sample_ct = popcount_longs(sample_exclude, ulkk);
7943     if (!sample_ct) {
7944       logerrprint("Warning: Skipping --ld since there are no founders.  (--make-founders may come\nin handy here.)\n");
7945       goto twolocus_ret_1;
7946     }
7947     if (bigstack_alloc_ul(ulkk, &loadbuf_raw)) {
7948       goto twolocus_ret_NOMEM;
7949     }
7950     bitarr_invert_copy(sample_exclude, unfiltered_sample_ct, loadbuf_raw);
7951     sample_exclude = loadbuf_raw;
7952   }
7953   sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
7954   final_mask = get_final_mask(sample_ct);
       // ulii/uljj are the ID lengths including the null terminator; an ID that
       // cannot fit in a marker_ids slot cannot match anything.
7955   if ((ulii > max_marker_id_len) || (uljj > max_marker_id_len)) {
7956     goto twolocus_ret_MARKER_NOT_FOUND;
7957   }
7958   marker_uidxs[0] = 0;
7959   marker_uidxs[1] = 0;
       // Scan the included variants for both IDs.  ulii/uljj double as "still
       // searching" flags: each is zeroed once its variant is found, and the
       // loop breaks as soon as the second one turns up.
7960   for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
7961     next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
7962     bufptr = &(marker_ids[marker_uidx * max_marker_id_len]);
7963     if (ulii && (!memcmp(mkr1, bufptr, ulii))) {
7964       marker_uidxs[0] = marker_uidx;
7965       if (!uljj) {
7966 	break;
7967       }
7968       ulii = 0;
7969     } else if (uljj && (!memcmp(mkr2, bufptr, uljj))) {
7970       marker_uidxs[1] = marker_uidx;
7971       if (!ulii) {
7972 	break;
7973       }
7974       uljj = 0;
7975     }
7976   }
       // No break => the scan finished with at least one ID unmatched.
7977   if (marker_idx == marker_ct) {
7978     goto twolocus_ret_MARKER_NOT_FOUND;
7979   }
7980   if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
7981       bigstack_alloc_ul(sample_ctl2, &loadbufs[0]) ||
7982       bigstack_alloc_ul(sample_ctl2, &loadbufs[1])) {
7983     goto twolocus_ret_NOMEM;
7984   }
7985   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
7986   loadbufs[0][sample_ctl2 - 1] = 0;
7987   loadbufs[1][sample_ctl2 - 1] = 0;
7988   if (alloc_collapsed_haploid_filters(sample_exclude, sex_male, unfiltered_sample_ct, sample_ct, hh_exists, 0, &sample_include2, &sample_male_include2)) {
7989     goto twolocus_ret_NOMEM;
7990   }
7991   is_haploid[0] = 0;
7992   is_haploid[1] = 0;
7993   is_x[0] = 0;
7994   is_x[1] = 0;
       // Load and collapse both variants; haploid_fix() is applied to variants
       // on chromosomes flagged in haploid_mask.
7995   for (marker_idx = 0; marker_idx < 2; marker_idx++) {
7996     marker_uidx = marker_uidxs[marker_idx];
7997     if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
7998       goto twolocus_ret_READ_FAIL;
7999     }
8000     if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbufs[marker_idx])) {
8001       goto twolocus_ret_READ_FAIL;
8002     }
8003     chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx);
8004     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
8005     is_haploid[marker_idx] = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
8006     if (is_haploid[marker_idx]) {
8007       is_x[marker_idx] = (chrom_idx == (uint32_t)chrom_info_ptr->xymt_codes[X_OFFSET]);
8008       is_y = (chrom_idx == (uint32_t)chrom_info_ptr->xymt_codes[Y_OFFSET]);
8009       haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x[marker_idx], is_y, (unsigned char*)(loadbufs[marker_idx]));
8010     }
8011   }
8012   if (!outname) {
8013     // --ld needs X chromosome sex stratification instead of --twolocus's
8014     // case/control stratification
8015     if (is_x[0] || is_x[1]) {
8016       pheno_c = sex_male;
8017       pheno_nm = sex_male;
8018     } else {
8019       pheno_c = nullptr;
8020     }
8021   }
       // Tally joint genotypes.  The table index is (2-bit code at variant 1) * 4
       // + (2-bit code at variant 2).  When pheno_c is non-null, samples set in
       // pheno_nm are additionally tallied into counts_cc[0..15] (pheno_c bit
       // clear) or counts_cc[16..31] (pheno_c bit set).
8022   fill_uint_zero(16, counts_all);
8023   fill_uint_zero(32, counts_cc);
8024   loadbuf0_ptr = loadbufs[0];
8025   loadbuf1_ptr = loadbufs[1];
8026   loadbuf0_end = &(loadbuf0_ptr[sample_ct / BITCT2]);
8027   sample_uidx = 0;
8028   sample_idx = 0;
8029   sample_idx_end = BITCT2;
       // The inner loop first consumes the full words; if samples remain,
       // loadbuf0_end is advanced once so the next pass handles the trailing
       // partial word.
8030   while (1) {
8031     while (loadbuf0_ptr < loadbuf0_end) {
8032       ulii = *loadbuf0_ptr++;
8033       uljj = *loadbuf1_ptr++;
8034       if (pheno_c) {
8035 	for (; sample_idx < sample_idx_end; sample_uidx++, sample_idx++) {
8036           next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
8037 	  ulkk = ((ulii & 3) << 2) | (uljj & 3);
8038 	  ulii >>= 2;
8039 	  uljj >>= 2;
8040 	  counts_all[ulkk] += 1;
8041           if (IS_SET(pheno_nm, sample_uidx)) {
8042             counts_cc[(16 * IS_SET(pheno_c, sample_uidx)) + ulkk] += 1;
8043 	  }
8044 	}
8045       } else {
8046 	for (; sample_idx < sample_idx_end; sample_idx++) {
8047 	  ulkk = ((ulii & 3) << 2) | (uljj & 3);
8048 	  ulii >>= 2;
8049 	  uljj >>= 2;
8050 	  counts_all[ulkk] += 1;
8051 	}
8052       }
8053       sample_idx_end += BITCT2;
8054     }
8055     if (sample_idx == sample_ct) {
8056       break;
8057     }
8058     loadbuf0_end++;
8059     sample_idx_end = sample_ct;
8060   }
8061 
8062   alen00 = strlen(marker_allele_ptrs[2 * marker_uidxs[0]]);
8063   alen01 = strlen(marker_allele_ptrs[2 * marker_uidxs[0] + 1]);
8064   alen10 = strlen(marker_allele_ptrs[2 * marker_uidxs[1]]);
8065   alen11 = strlen(marker_allele_ptrs[2 * marker_uidxs[1] + 1]);
8066   if (outname) {
         // --twolocus: write the "all individuals" table, plus case and control
         // tables when pheno_c is available and the respective group is
         // nonempty according to pheno_nm_ct / pheno_ctrl_ct.
8067     memcpy(outname_end, ".twolocus", 10);
8068     if (fopen_checked(outname, "w", &outfile)) {
8069       goto twolocus_ret_OPEN_FAIL;
8070     }
8071     fputs("\nAll individuals\n===============\n", outfile);
8072     twolocus_write_table(outfile, counts_all, plink_maxsnp, mkr1, mkr2, marker_allele_ptrs[2 * marker_uidxs[0]], marker_allele_ptrs[2 * marker_uidxs[0] + 1], marker_allele_ptrs[2 * marker_uidxs[1]], marker_allele_ptrs[2 * marker_uidxs[1] + 1], alen00, alen01, alen10, alen11);
8073     if (pheno_c) {
8074       if (pheno_nm_ct != pheno_ctrl_ct) {
8075 	fputs("\nCases\n=====\n", outfile);
8076 	twolocus_write_table(outfile, &(counts_cc[16]), plink_maxsnp, mkr1, mkr2, marker_allele_ptrs[2 * marker_uidxs[0]], marker_allele_ptrs[2 * marker_uidxs[0] + 1], marker_allele_ptrs[2 * marker_uidxs[1]], marker_allele_ptrs[2 * marker_uidxs[1] + 1], alen00, alen01, alen10, alen11);
8077       }
8078       if (pheno_ctrl_ct) {
8079 	fputs("\nControls\n========\n", outfile);
8080 	twolocus_write_table(outfile, counts_cc, plink_maxsnp, mkr1, mkr2, marker_allele_ptrs[2 * marker_uidxs[0]], marker_allele_ptrs[2 * marker_uidxs[0] + 1], marker_allele_ptrs[2 * marker_uidxs[1]], marker_allele_ptrs[2 * marker_uidxs[1] + 1], alen00, alen01, alen10, alen11);
8081       }
8082     }
8083     putc_unlocked('\n', outfile);
8084     if (fclose_null(&outfile)) {
8085       goto twolocus_ret_WRITE_FAIL;
8086     }
8087     LOGPRINTFWW("--twolocus: Report written to %s .\n", outname);
8088   } else {
8089     // low counts_cc[] values aren't used, so may as well store marginal counts
8090     // there
8091     counts_cc[0] = counts_all[0] + counts_all[2] + counts_all[3];
8092     counts_cc[2] = counts_all[8] + counts_all[10] + counts_all[11];
8093     counts_cc[3] = counts_all[12] + counts_all[14] + counts_all[15];
8094     counts_cc[4] = counts_all[0] + counts_all[8] + counts_all[12];
8095     counts_cc[6] = counts_all[2] + counts_all[10] + counts_all[14];
8096     counts_cc[7] = counts_all[3] + counts_all[11] + counts_all[15];
8097     count_total = counts_cc[0] + counts_cc[2] + counts_cc[3];
8098     if (!count_total) {
8099       logerrprint("Error: No valid observations for --ld.\n");
8100       goto twolocus_ret_INVALID_CMDLINE;
8101     }
8102     if ((!counts_cc[2]) && ((!counts_cc[0]) || (!counts_cc[3]))) {
8103       LOGPREPRINTFWW("Error: %s is monomorphic across all valid observations.\n", mkr1);
8104       goto twolocus_ret_INVALID_CMDLINE_2;
8105     } else if ((!counts_cc[6]) && ((!counts_cc[4]) || (!counts_cc[7]))) {
8106       LOGPREPRINTFWW("Error: %s is monomorphic across all valid observations.\n", mkr2);
8107       goto twolocus_ret_INVALID_CMDLINE_2;
8108     } else if ((alen00 > (MAXLINELEN / 4) - 16) || (alen01 > (MAXLINELEN / 4) - 16)) {
8109       LOGPREPRINTFWW("Error: %s has a pathologically long allele code.\n", mkr1);
8110       goto twolocus_ret_INVALID_CMDLINE_2;
8111     } else if ((alen10 > (MAXLINELEN / 4) - 16) || (alen11 > (MAXLINELEN / 4) - 16)) {
8112       LOGPREPRINTFWW("Error: %s has a pathologically long allele code.\n", mkr2);
8113       goto twolocus_ret_INVALID_CMDLINE_2;
8114     }
8115     LOGPRINTF("\n--ld %s %s:\n", mkr1, mkr2);
         // From here on, ulii and uljj are reused as the [first, one-past-last)
         // index range of the retained entries of solutions[].
8116     ulii = 0;
8117     // possible todo: factor out redundancy with other D-prime calculations
         // Haplotype counts implied directly by the genotype table.  Samples in
         // counts_all[10] (code 2 at both variants; the half_hethet_share name
         // indicates these are the double heterozygotes) are handled separately.
8118     freq11 = (double)(2 * counts_all[0] + counts_all[2] + counts_all[8]);
8119     freq12 = (double)(2 * counts_all[3] + counts_all[2] + counts_all[11]);
8120     freq21 = (double)(2 * counts_all[12] + counts_all[8] + counts_all[14]);
8121     freq22 = (double)(2 * counts_all[15] + counts_all[11] + counts_all[14]);
         // When an X-chromosome variant is involved, counts_cc[16..31] holds the
         // tallies for samples set in sex_male (see the pheno_c reassignment
         // above), and their contribution is reduced here.
8122     if (is_x[0] || is_x[1]) {
8123       if (is_x[0] && is_x[1]) {
8124         freq11 -= (double)((int32_t)counts_cc[16]);
8125         freq12 -= (double)((int32_t)counts_cc[19]);
8126         freq21 -= (double)((int32_t)counts_cc[28]);
8127         freq22 -= (double)((int32_t)counts_cc[31]);
8128       } else if (is_x[0]) {
8129         freq11 -= ((double)(2 * counts_cc[16] + counts_cc[18])) * (1.0 - SQRT_HALF);
8130         freq12 -= ((double)(2 * counts_cc[19] + counts_cc[18])) * (1.0 - SQRT_HALF);
8131         freq21 -= ((double)(2 * counts_cc[28] + counts_cc[30])) * (1.0 - SQRT_HALF);
8132         freq22 -= ((double)(2 * counts_cc[31] + counts_cc[30])) * (1.0 - SQRT_HALF);
8133       } else {
8134         freq11 -= ((double)(2 * counts_cc[16] + counts_cc[24])) * (1.0 - SQRT_HALF);
8135         freq12 -= ((double)(2 * counts_cc[19] + counts_cc[27])) * (1.0 - SQRT_HALF);
8136         freq21 -= ((double)(2 * counts_cc[28] + counts_cc[24])) * (1.0 - SQRT_HALF);
8137         freq22 -= ((double)(2 * counts_cc[31] + counts_cc[27])) * (1.0 - SQRT_HALF);
8138       }
8139     }
         // Normalize the counts so that freq11 + freq12 + freq21 + freq22 +
         // 2 * half_hethet_share == 1.
8140     twice_tot_recip = 1.0 / (freq11 + freq12 + freq21 + freq22 + 2 * ((int32_t)counts_all[10]));
8141     freq11 *= twice_tot_recip;
8142     freq12 *= twice_tot_recip;
8143     freq21 *= twice_tot_recip;
8144     freq22 *= twice_tot_recip;
8145     half_hethet_share = ((int32_t)counts_all[10]) * twice_tot_recip;
8146     freq1x = freq11 + freq12 + half_hethet_share;
8147     freq2x = 1.0 - freq1x;
8148     freqx1 = freq11 + freq21 + half_hethet_share;
8149     freqx2 = 1.0 - freqx1;
8150     if (counts_all[10]) {
8151       // detect degenerate cases to avoid e-17 ugliness
8152       // possible todo: when there are multiple solutions, compute log
8153       // likelihood for each and mark the EM solution in some manner
8154       if ((freq11 * freq22 != 0.0) || (freq12 * freq21 != 0.0)) {
8155 	// (f11 + x)(f22 + x)(K - x) = x(f12 + K - x)(f21 + K - x)
8156 	// (x - K)(x + f11)(x + f22) + x(x - K - f12)(x - K - f21) = 0
8157 	//   x^3 + (f11 + f22 - K)x^2 + (f11*f22 - K*f11 - K*f22)x
8158 	// - K*f11*f22 + x^3 - (2K + f12 + f21)x^2 + (K + f12)(K + f21)x = 0
8159 	uljj = cubic_real_roots(0.5 * (freq11 + freq22 - freq12 - freq21 - 3 * half_hethet_share), 0.5 * (freq11 * freq22 + freq12 * freq21 + half_hethet_share * (freq12 + freq21 - freq11 - freq22 + half_hethet_share)), -0.5 * half_hethet_share * freq11 * freq22, solutions);
	// With more than one real root, drop roots above K and below 0 (with
	// SMALLISH_EPSILON tolerance) and snap near-boundary roots onto K / 0.
8160 	if (uljj > 1) {
8161 	  while (solutions[uljj - 1] > half_hethet_share + SMALLISH_EPSILON) {
8162 	    uljj--;
8163 	  }
8164 	  if (solutions[uljj - 1] > half_hethet_share - SMALLISH_EPSILON) {
8165 	    solutions[uljj - 1] = half_hethet_share;
8166 	  }
8167 	  while (solutions[ulii] < -SMALLISH_EPSILON) {
8168 	    ulii++;
8169 	  }
8170 	  if (solutions[ulii] < SMALLISH_EPSILON) {
8171 	    solutions[ulii] = 0;
8172 	  }
8173 	}
8174       } else {
8175         // bugfix (6 Oct 2017):
8176         // At least one of {f11, f22} is zero, and one of {f12, f21} is zero.
8177         // Initially suppose that the zero-values are f11 and f12.  Then the
8178         // equality becomes
8179         //   x(f22 + x)(K - x) = x(K - x)(f21 + K - x)
8180         //   x=0 and x=K are always solutions; the rest becomes
8181         //     f22 + x = f21 + K - x
8182         //     2x = K + f21 - f22
8183         //     x = (K + f21 - f22)/2; in-range iff (f21 - f22) in (-K, K).
8184         // So far so good.  However, this code used to *always* check
8185         // (f21 - f22), when it's necessary to use all the nonzero values.
8186         // (this still works if three or all four values are zero)
8187 	solutions[0] = 0;
8188         const double nonzero_freq_xx = freq11 + freq22;
8189         const double nonzero_freq_xy = freq12 + freq21;
8190 	if ((nonzero_freq_xx + SMALLISH_EPSILON < half_hethet_share + nonzero_freq_xy) && (nonzero_freq_xy + SMALLISH_EPSILON < half_hethet_share + nonzero_freq_xx)) {
8191 	  uljj = 3;
8192 	  solutions[1] = (half_hethet_share + nonzero_freq_xy - nonzero_freq_xx) * 0.5;
8193 	  solutions[2] = half_hethet_share;
8194 	} else {
8195 	  uljj = 2;
8196 	  solutions[1] = half_hethet_share;
8197 	}
8198       }
8199       if (uljj > ulii + 1) {
8200 	// not Xchr/haploid-sensitive yet
8201 	logprint("Multiple haplotype phasing solutions; sample size, HWE, or random mating\nassumption may be violated.\n\nHWE exact test p-values\n-----------------------\n");
8202 	if (is_haploid[0] && (!is_x[0])) {
8203           LOGPRINTF("   %s: n/a\n", mkr1);
8204 	} else {
8205 	  LOGPRINTF("   %s: %g\n", mkr1, SNPHWE2(counts_cc[2] + counts_all[9], counts_cc[0] + counts_all[1] - 2 * (counts_cc[16] + counts_cc[19]), counts_cc[3] + counts_all[13] - 2 * (counts_cc[28] + counts_cc[31]), hwe_midp));
8206 	}
8207 	if (is_haploid[1] && (!is_x[1])) {
8208 	  LOGPRINTF("   %s: n/a\n", mkr2);
8209 	} else {
8210 	  LOGPRINTF("   %s: %g\n\n", mkr2, SNPHWE2(counts_cc[6] + counts_all[6], counts_cc[4] + counts_all[4] - 2 * (counts_cc[16] + counts_cc[28]), counts_cc[7] + counts_all[7] - 2 * (counts_cc[19] + counts_cc[31]), hwe_midp));
8211 	}
8212       }
8213     } else {
           // No counts_all[10] samples: nothing to phase, so the only solution
           // is x = 0.
8214       uljj = 1;
8215       solutions[0] = 0.0;
8216     }
8217     if (uljj == ulii + 1) {
8218       logprint("\n");
8219     }
         // Report R-sq, D', and the four haplotype frequencies for every
         // retained solution; the "Solution #" heading is only printed when
         // there is more than one.
8220     for (ulkk = ulii; ulkk < uljj; ulkk++) {
8221       if (uljj - ulii > 1) {
8222         LOGPRINTF("Solution #%" PRIuPTR ":\n", ulkk + 1 - ulii);
8223       }
8224       dxx = freq11 + solutions[ulkk] - freqx1 * freq1x; // D
8225       if (fabs(dxx) < SMALL_EPSILON) {
8226 	dxx = 0;
8227       }
8228       bufptr = memcpya(g_logbuf, "   R-sq = ", 10);
8229       bufptr2 = dtoa_g(dxx * dxx / (freq1x * freqx1 * freq2x * freqx2), bufptr);
8230       // assumes bufptr2 - bufptr < 15
8231       bufptr = memseta(bufptr2, 32, 15 - ((uintptr_t)(bufptr2 - bufptr)));
8232       bufptr = memcpya(bufptr, "D' = ", 5);
8233       if (dxx >= 0) {
8234 	bufptr = dtoa_g(dxx / MINV(freqx1 * freq2x, freqx2 * freq1x), bufptr);
8235       } else {
8236 	bufptr = dtoa_g(-dxx / MINV(freqx1 * freq1x, freqx2 * freq2x), bufptr);
8237       }
           // The copy length of 3 includes the string literal's null terminator.
8238       bufptr = memcpya(bufptr, "\n\n", 3);
8239       logprintb();
8240       logprint("   Haplotype     Frequency    Expectation under LE\n");
8241       logprint("   ---------     ---------    --------------------\n");
8242       bufptr = memseta(g_logbuf, 32, 3);
8243       if (alen00 + alen10 < 9) {
8244 	bufptr = memseta(bufptr, 32, 9 - alen00 - alen10);
8245       }
8246       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0]], alen00);
8247       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[1]], alen10);
8248       bufptr = memseta(bufptr, 32, 5);
8249       bufptr = dtoa_f_w9p6_spaced(freq11 + solutions[ulkk], bufptr);
8250       bufptr = memseta(bufptr, 32, 15);
8251       bufptr = dtoa_f_w9p6_clipped(freqx1 * freq1x, bufptr);
8252       bufptr = memcpya(bufptr, "\n", 2);
8253       logprintb();
           // The three leading spaces written to g_logbuf above are not
           // rewritten by the later rows, which start filling at g_logbuf[3].
8254       bufptr = &(g_logbuf[3]);
8255       if (alen01 + alen10 < 9) {
8256 	bufptr = memseta(bufptr, 32, 9 - alen01 - alen10);
8257       }
8258       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0] + 1], alen01);
8259       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[1]], alen10);
8260       bufptr = memseta(bufptr, 32, 5);
8261       bufptr = dtoa_f_w9p6_spaced(freq21 + half_hethet_share - solutions[ulkk], bufptr);
8262       bufptr = memseta(bufptr, 32, 15);
8263       bufptr = dtoa_f_w9p6_clipped(freqx1 * freq2x, bufptr);
8264       bufptr = memcpya(bufptr, "\n", 2);
8265       logprintb();
8266       bufptr = &(g_logbuf[3]);
8267       if (alen00 + alen11 < 9) {
8268 	bufptr = memseta(bufptr, 32, 9 - alen00 - alen11);
8269       }
8270       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0]], alen00);
8271       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[1] + 1], alen11);
8272       bufptr = memseta(bufptr, 32, 5);
8273       bufptr = dtoa_f_w9p6_spaced(freq12 + half_hethet_share - solutions[ulkk], bufptr);
8274       bufptr = memseta(bufptr, 32, 15);
8275       bufptr = dtoa_f_w9p6_clipped(freqx2 * freq1x, bufptr);
8276       bufptr = memcpya(bufptr, "\n", 2);
8277       logprintb();
8278       bufptr = &(g_logbuf[3]);
8279       if (alen01 + alen11 < 9) {
8280 	bufptr = memseta(bufptr, 32, 9 - alen01 - alen11);
8281       }
8282       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0] + 1], alen01);
8283       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[1] + 1], alen11);
8284       bufptr = memseta(bufptr, 32, 5);
8285       bufptr = dtoa_f_w9p6_spaced(freq22 + solutions[ulkk], bufptr);
8286       bufptr = memseta(bufptr, 32, 15);
8287       bufptr = dtoa_f_w9p6_clipped(freqx2 * freq2x, bufptr);
8288       bufptr = memcpyl3a(bufptr, "\n\n");
8289       logprintb();
8290       bufptr = &(g_logbuf[3]);
8291       bufptr = memcpya(bufptr, "In phase alleles are ", 21);
           // Positive D pairs the first-listed alleles of both variants (and the
           // second-listed ones); otherwise the cross pairing is reported.
8292       if (dxx > 0) {
8293 	bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0]], alen00);
8294 	bufptr = memcpyax(bufptr, marker_allele_ptrs[2 * marker_uidxs[1]], alen10, '/');
8295 	bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0] + 1], alen01);
8296 	bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[1] + 1], alen11);
8297       } else {
8298 	bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0]], alen00);
8299 	bufptr = memcpyax(bufptr, marker_allele_ptrs[2 * marker_uidxs[1] + 1], alen11, '/');
8300 	bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0] + 1], alen01);
8301 	bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[1]], alen10);
8302       }
8303       bufptr = memcpyl3a(bufptr, "\n\n");
8304       logprintb();
8305     }
8306   }
       // Error exits: the while (0) body is never entered from the top and is
       // only reachable through the goto labels inside it.
8307   while (0) {
8308   twolocus_ret_NOMEM:
8309     retval = RET_NOMEM;
8310     break;
8311   twolocus_ret_OPEN_FAIL:
8312     retval = RET_OPEN_FAIL;
8313     break;
8314   twolocus_ret_READ_FAIL:
8315     retval = RET_READ_FAIL;
8316     break;
8317   twolocus_ret_WRITE_FAIL:
8318     retval = RET_WRITE_FAIL;
8319     break;
8320   twolocus_ret_MARKER_NOT_FOUND:
8321     if (outname) {
8322       logerrprint("Error: --twolocus variant name not found.\n");
8323     } else {
8324       logerrprint("Error: --ld variant name not found.\n");
8325     }
8326     retval = RET_INVALID_CMDLINE;
8327     break;
8328   twolocus_ret_INVALID_CMDLINE_2:
8329     logerrprintb();
8330   twolocus_ret_INVALID_CMDLINE:
8331     retval = RET_INVALID_CMDLINE;
8332     break;
8333   }
       // Common cleanup for both the success and failure paths.
8334  twolocus_ret_1:
8335   fclose_cond(outfile);
8336   bigstack_reset(bigstack_mark);
8337   return retval;
8338 }
8339 
rotate_loadbuf_and_compute_phenogeno(uintptr_t * loadbuf,double * pheno_d2,uint32_t pheno_nm_ct,uintptr_t * loadbuf_write,double * phenogeno,uint32_t * genosums)8340 void rotate_loadbuf_and_compute_phenogeno(uintptr_t* loadbuf, double* pheno_d2, uint32_t pheno_nm_ct, uintptr_t* loadbuf_write, double* phenogeno, uint32_t* genosums) {
8341   double cur_phenogeno = 0;
8342   uint32_t geno1_ct = 0;
8343   uint32_t geno2_ct = 0;
8344   uintptr_t cur_word;
8345   uintptr_t ulii;
8346   uintptr_t uljj;
8347   double dxx;
8348   uint32_t sample_idx;
8349   uint32_t sample_idx_base;
8350   uint32_t uii;
8351   for (sample_idx = 0; sample_idx < pheno_nm_ct;) {
8352     // we're interested in hom A1 (non-trailing 00) and het (10) bit
8353     // values here.
8354     cur_word = ~(*loadbuf++);
8355     sample_idx_base = sample_idx;
8356     sample_idx += BITCT2;
8357     if (sample_idx > pheno_nm_ct) {
8358       cur_word &= (ONELU << (2 * (pheno_nm_ct % BITCT2))) - ONELU;
8359     }
8360     // now hom A1 = 11 and het = 01.  Temporarily erase the 10s.
8361     uljj = cur_word & FIVEMASK;
8362     ulii = uljj | (cur_word & (uljj << 1));
8363     while (ulii) {
8364       uii = CTZLU(ulii);
8365       dxx = pheno_d2[sample_idx_base + uii / 2];
8366       if (!((ulii >> (uii + 1)) & 1)) {
8367 	// het
8368 	cur_phenogeno += dxx;
8369 	geno1_ct++;
8370       } else {
8371 	// hom A1
8372 	cur_phenogeno += 2 * dxx;
8373 	geno2_ct++;
8374       }
8375       ulii &= ~((3 * ONELU) << uii);
8376     }
8377     // currently hom A1 = 11, missing = 10, het = 01, hom A2 = 00
8378     // rotate to hom A1 = 10, missing = 11, het = 01, hom A2 = 00
8379     // to allow inner loop to use ordinary multiplication
8380     *loadbuf_write++ = cur_word ^ ((cur_word >> 1) & FIVEMASK);
8381   }
8382   *phenogeno = cur_phenogeno;
8383   genosums[0] = geno1_ct + 2 * geno2_ct;
8384   genosums[1] = geno1_ct + 4 * geno2_ct;
8385 }
8386 
// Quantitative-trait --epistasis scan: for every (first-list, second-list)
// marker pair in this job, a worker thread (epi_linear_thread) fills
// g_epi_all_chisq with a (statistic, beta) pair, and this function writes the
// pairs whose statistic is not the -1 sentinel to "<outname>.epi.qt" (with a
// ".<parallel_idx + 1>" suffix appended when parallel_tot > 1).
//
// Work is tiled: the outer loop walks blocks of first-list markers
// [marker_idx1_start, marker_idx1_end), the inner loop walks blocks of
// second-list markers.  Genotypes for both blocks are loaded from bedfile
// (record offset = bed_offset + marker_uidx * ceil(unfiltered_sample_ct / 4)),
// restricted to samples set in pheno_nm, then handed to the worker threads.
//
// Parameters, as used here:
//   marker_exclude1/2   bitmasks; set bits are skipped when walking lists 1/2.
//   marker_reverse      per-marker flag forwarded to load_and_collapse_incl().
//   is_triangular       if nonzero, first-list marker idx1 is only paired with
//                       second-list indices >= idx1 + 1; otherwise every pair
//                       is tested except a marker against itself (for which
//                       gap_cts[idx1] is set to 1).
//   job_size            upper bound on the first-list block size.
//   tests_expected      denominator for the progress percentage.
//   pheno_d             phenotype values indexed by unfiltered sample index.
//   output_min_p        floor applied to reported p-values.
//   glm_vif_thresh      copied to g_epi_vif_thresh for the worker threads.
//   loadbuf_raw/loadbuf scratch buffers for genotype loading.
//   best_chisq, best_ids, n_sig_cts, fail_cts
//                       per-marker summaries; updated in place by merging the
//                       per-thread results after each second-list block.
//   epi_ip, marker_ct1  not referenced in this function.
//
// Returns 0 on success, or RET_NOMEM / RET_OPEN_FAIL / RET_READ_FAIL /
// RET_WRITE_FAIL / RET_INVALID_CMDLINE (constant phenotype) /
// RET_THREAD_CREATE_FAIL.  Workspace memory is left for the caller to release.
int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t marker_uidx_base, uintptr_t marker_ct1, uintptr_t* marker_exclude1, uintptr_t marker_idx1_start, uintptr_t marker_idx1_end, uintptr_t marker_ct2, uintptr_t* marker_exclude2, uint32_t is_triangular, uintptr_t job_size, uint64_t tests_expected, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, double* pheno_d, uint32_t parallel_idx, uint32_t parallel_tot, char* outname, char* outname_end, double output_min_p, double glm_vif_thresh, uintptr_t* loadbuf_raw, uintptr_t* loadbuf, double* best_chisq, uint32_t* best_ids, uint32_t* n_sig_cts, uint32_t* fail_cts, uint32_t* gap_cts) {
  // We use QT --assoc's strategy for speeding up linear regression, since we
  // do not need to support arbitrary covariates.  It's more complicated here
  // because we have 3 covariates instead of one, but two of them are still
  // restricted to the values {0, 1, 2} and the last is the product of the
  // first two.  So we're able to use variations of the QT --assoc bit hacks.
  FILE* outfile = nullptr;
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t pheno_nm_ctl2 = QUATERCT_TO_WORDCT(pheno_nm_ct);
  uintptr_t final_mask = get_final_mask(pheno_nm_ct);
  uintptr_t marker_uidx = marker_uidx_base;
  uintptr_t pct = 1;
  uintptr_t marker_uidx2 = 0;
  uintptr_t marker_idx1 = marker_idx1_start;
  uintptr_t marker_idx2 = 0;
  uint64_t pct_thresh = tests_expected / 100;
  uint64_t tests_complete = 0;
  uint32_t max_thread_ct = g_epi_thread_ct;
  uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
  uint32_t chrom_end = 0;
  uint32_t chrom_idx = 0;
  uint32_t chrom_idx2 = 0;
  int32_t retval = 0;
  unsigned char* bigstack_mark2;
  char* wptr_start;
  char* wptr_start2;
  char* wptr;
  double* pheno_d2;
  double* dptr;
  double* dptr2;
  uint32_t* uiptr;
  uint32_t* uiptr2;
  uint32_t* uiptr3;
  uint32_t* uiptr4;
  uint32_t* uiptr5;
  uintptr_t cur_bigstack_left;
  uintptr_t cur_workload;
  uintptr_t idx1_block_size;
  uintptr_t idx2_block_size;
  uintptr_t idx2_block_sizea16;
  uintptr_t marker_uidx_tmp;
  uintptr_t block_idx1;
  uintptr_t block_idx2;
  uintptr_t cur_idx2_block_size;
  uintptr_t chrom_end2;
  uintptr_t tidx;
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t ulkk;
  double dxx;
  uint32_t chrom_fo_idx;
  uint32_t chrom_fo_idx2;
  uint32_t is_last_block;
  uint32_t sample_uidx;
  uint32_t sample_idx;
  uint32_t uii;
  uint32_t ujj;

  // Compact the phenotype down to the pheno_nm samples and accumulate its sum
  // and sum of squares for the worker threads.
  if (bigstack_alloc_d(pheno_nm_ct, &pheno_d2)) {
    goto epistasis_linear_regression_ret_NOMEM;
  }
  g_epi_pheno_d2 = pheno_d2;
  g_epi_pheno_nm_ct = pheno_nm_ct;
  dptr = pheno_d2;
  g_epi_pheno_sum = 0;
  g_epi_pheno_ssq = 0;
  for (sample_uidx = 0, sample_idx = 0; sample_idx < pheno_nm_ct; sample_uidx++, sample_idx++) {
    next_set_unsafe_ck(pheno_nm, &sample_uidx);
    dxx = pheno_d[sample_uidx];
    g_epi_pheno_sum += dxx;
    g_epi_pheno_ssq += dxx * dxx;
    pheno_d2[sample_idx] = dxx;
  }
  // could add an epsilon here, but this is good enough to catch the most
  // common case (all phenotypes are the same integer near zero).
  if (g_epi_pheno_ssq * ((double)((int32_t)pheno_nm_ct)) == g_epi_pheno_sum * g_epi_pheno_sum) {
    logerrprint("Error: Phenotype is constant.\n");
    goto epistasis_linear_regression_ret_INVALID_CMDLINE;
  }
  g_epi_vif_thresh = glm_vif_thresh;

  // claim up to half of memory with idx1 bufs; each marker currently costs:
  //   pheno_nm_ctl2 * sizeof(intptr_t) for geno buf
  //   sizeof(double) for precomputed sum(phenotype * genotype) values
  //   2 * sizeof(int32_t) for precomputed sum(genotype) and sum(genotype^2)
  //     values
  //   4 * sizeof(int32_t) + sizeof(double) + marker_ct2 * 2 * sizeof(double)
  //     for other stuff (see epistasis_report() comment, starting from
  //     "offset"; main result buffer must be double-size to store both beta
  //     and chi-square stat)
  cur_bigstack_left = bigstack_left();
  ulii = 6 * CACHELINE + max_thread_ct * (5 * (CACHELINE - 4)) - 5 * sizeof(int32_t) - sizeof(double);
  if (cur_bigstack_left >= ulii) {
    cur_bigstack_left -= ulii;
  }
  ulii = pheno_nm_ctl2 * sizeof(intptr_t) + 6 * sizeof(int32_t) + 2 * sizeof(double) + marker_ct2 * 2 * sizeof(double);
  idx1_block_size = cur_bigstack_left / (ulii * 2 + 1);
  if (!idx1_block_size) {
    goto epistasis_linear_regression_ret_NOMEM;
  }
  if (idx1_block_size > job_size) {
    idx1_block_size = job_size;
  }
  // pad to avoid threads writing to same cacheline
  ulii = (max_thread_ct - 1) * 15 + idx1_block_size;
  // The budget above is only an estimate (the fixed overhead is not subtracted
  // when memory is already very tight), so fail cleanly instead of
  // dereferencing a failed allocation.
  // may be better to just recompute genosums values in inner loop?  can test
  // this later
  if (bigstack_alloc_ui(idx1_block_size * 2, &g_epi_geno1_offsets) ||
      bigstack_alloc_ul(pheno_nm_ctl2 * idx1_block_size, &g_epi_geno1) ||
      bigstack_alloc_d(idx1_block_size, &g_epi_phenogeno1) ||
      bigstack_alloc_ui(idx1_block_size * 2, &g_epi_genosums1) ||
      bigstack_alloc_d(idx1_block_size * marker_ct2 * 2, &g_epi_all_chisq) ||
      bigstack_alloc_d(ulii, &g_epi_best_chisq1) ||
      bigstack_alloc_ui(ulii, &g_epi_best_id1) ||
      bigstack_alloc_ui(ulii, &g_epi_n_sig_ct1) ||
      bigstack_alloc_ui(ulii, &g_epi_fail_ct1)) {
    goto epistasis_linear_regression_ret_NOMEM;
  }
  // Clear the last word of every genotype row up front.
  for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
    g_epi_geno1[block_idx1 * pheno_nm_ctl2 + pheno_nm_ctl2 - 1] = 0;
  }
  if (is_triangular) {
    fill_uint_zero(2 * idx1_block_size, g_epi_geno1_offsets);
  }

  // Size the second-list block from whatever memory remains.
  ulii = pheno_nm_ctl2 * sizeof(intptr_t) + 2 * sizeof(int32_t) + sizeof(double) + max_thread_ct * (3 * sizeof(int32_t) + sizeof(double));
  idx2_block_size = (bigstack_left() - (3 * CACHELINE - sizeof(intptr_t) - 2 * sizeof(int32_t) - sizeof(double)) - max_thread_ct * (3 * (CACHELINE - sizeof(int32_t)) + (CACHELINE - sizeof(double)))) / ulii;
  if (idx2_block_size > marker_ct2) {
    idx2_block_size = marker_ct2;
  }
  idx2_block_size = round_up_pow2(idx2_block_size, 16);

  // Output name: "<prefix>.epi.qt" or "<prefix>.epi.qt.<parallel_idx + 1>".
  memcpy(outname_end, ".epi.qt", 8);
  if (parallel_tot > 1) {
    outname_end[7] = '.';
    uint32toa_x(parallel_idx + 1, '\0', &(outname_end[8]));
  }
  if (fopen_checked(outname, "w", &outfile)) {
    goto epistasis_linear_regression_ret_OPEN_FAIL;
  }
  // Only the first parallel piece gets the header line.
  if (!parallel_idx) {
    wptr = memcpya(g_textbuf, "CHR1 ", 5);
    wptr = fw_strcpyn(plink_maxsnp, 4, "SNP1", wptr);
    wptr = memcpya(wptr, " CHR2 ", 6);
    wptr = fw_strcpyn(plink_maxsnp, 4, "SNP2", wptr);
    wptr = memcpya(wptr, "     BETA_INT         STAT            P \n", 41);
    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
      goto epistasis_linear_regression_ret_WRITE_FAIL;
    }
  }

  // Allocate the second-list buffers, shrinking the block by 16 markers at a
  // time until everything fits.
  bigstack_mark2 = g_bigstack_base;
  while (1) {
    if (!idx2_block_size) {
      goto epistasis_linear_regression_ret_NOMEM;
    }
    if (!(bigstack_alloc_ul(pheno_nm_ctl2 * idx2_block_size, &g_epi_geno2) ||
          bigstack_alloc_d(idx2_block_size, &g_epi_phenogeno2) ||
          bigstack_alloc_ui(idx2_block_size * 2, &g_epi_genosums2) ||
          bigstack_alloc_d(max_thread_ct * idx2_block_size, &g_epi_best_chisq2) ||
          bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_best_id2) ||
          bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_n_sig_ct2) ||
          bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_fail_ct2))) {
      break;
    }
    bigstack_reset(bigstack_mark2);
    idx2_block_size -= 16;
  }
  for (block_idx2 = 0; block_idx2 < idx2_block_size; block_idx2++) {
    g_epi_geno2[block_idx2 * pheno_nm_ctl2 + pheno_nm_ctl2 - 1] = 0;
  }
  // Position marker_uidx on the marker_idx1_start-th included first-list
  // marker.
  marker_uidx = next_unset_ul_unsafe(marker_exclude1, marker_uidx_base);
  if (marker_idx1) {
    marker_uidx = jump_forward_unset_unsafe(marker_exclude1, marker_uidx + 1, marker_idx1);
  }
  wptr = memcpya(g_logbuf, "QT --epistasis to ", 18);
  wptr = strcpya(wptr, outname);
  memcpy(wptr, " ... ", 6);
  wordwrapb(16); // strlen("99% [processing]")
  logprintb();
  fputs("0%", stdout);
  do {
    fputs(" [processing]", stdout);
    fflush(stdout);
    // Final first-list block may be short; never run more threads than there
    // are first-list markers in the block.
    if (idx1_block_size > marker_idx1_end - marker_idx1) {
      idx1_block_size = marker_idx1_end - marker_idx1;
      if (idx1_block_size < max_thread_ct) {
        max_thread_ct = idx1_block_size;
        g_epi_thread_ct = max_thread_ct;
      }
    }
    g_epi_marker_idx1 = marker_idx1;
    // Mark every statistic slot (even indices) as "no result" with -1.
    dptr = g_epi_all_chisq;
    dptr2 = &(g_epi_all_chisq[idx1_block_size * marker_ct2 * 2]);
    do {
      *dptr = -1;
      dptr = &(dptr[2]);
    } while (dptr < dptr2);
    marker_uidx_tmp = marker_uidx;
    if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto epistasis_linear_regression_ret_READ_FAIL;
    }
    cur_workload = idx1_block_size * marker_ct2;
    if (is_triangular) {
      // Row for first-list index i starts at second-list index i + 1.
      for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
        ulii = block_idx1 + marker_idx1 + 1;
        cur_workload -= ulii;
        g_epi_geno1_offsets[2 * block_idx1 + 1] = ulii;
      }
    } else {
      fill_uint_zero(2 * idx1_block_size, g_epi_geno1_offsets);
      marker_uidx2 = marker_uidx_base;
      marker_idx2 = 0;
    }
    tests_complete += cur_workload;
    // Split the first-list block between threads so each gets roughly
    // cur_workload / max_thread_ct tests.  bounds[] holds the unpadded row
    // boundaries; bounds16[] holds the start of each thread's slice in the
    // per-thread result arrays, where every slice is rounded up to a multiple
    // of 16 entries.
    ulii = 0; // total number of tests
    g_epi_idx1_block_bounds[0] = 0;
    g_epi_idx1_block_bounds16[0] = 0;
    block_idx1 = 0;
    for (tidx = 1; tidx < max_thread_ct; tidx++) {
      uljj = (((uint64_t)cur_workload) * tidx) / max_thread_ct;
      if (is_triangular) {
        do {
          ulii += marker_ct2 - g_epi_geno1_offsets[2 * block_idx1 + 1];
          block_idx1++;
        } while (ulii < uljj);
      } else {
        do {
          ulii += marker_ct2;
          block_idx1++;
        } while (ulii < uljj);
      }
      uii = block_idx1 - g_epi_idx1_block_bounds[tidx - 1];
      g_epi_idx1_block_bounds[tidx] = block_idx1;
      g_epi_idx1_block_bounds16[tidx] = g_epi_idx1_block_bounds16[tidx - 1] + round_up_pow2_ui(uii, 16);
    }
    g_epi_idx1_block_bounds[max_thread_ct] = idx1_block_size;
    chrom_end = 0;
    // Load the first-list block.
    for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx_tmp++, block_idx1++) {
      if (IS_SET(marker_exclude1, marker_uidx_tmp)) {
        marker_uidx_tmp = next_unset_ul_unsafe(marker_exclude1, marker_uidx_tmp);
        if (fseeko(bedfile, bed_offset + (marker_uidx_tmp * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
          goto epistasis_linear_regression_ret_READ_FAIL;
        }
      }
      if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx_tmp), bedfile, loadbuf_raw, loadbuf)) {
        goto epistasis_linear_regression_ret_READ_FAIL;
      }
      rotate_loadbuf_and_compute_phenogeno(loadbuf, pheno_d2, pheno_nm_ct, &(g_epi_geno1[block_idx1 * pheno_nm_ctl2]), &(g_epi_phenogeno1[block_idx1]), &(g_epi_genosums1[block_idx1 * 2]));
      if (!is_triangular) {
        if (!IS_SET(marker_exclude2, marker_uidx_tmp)) {
          // do not compare against self
          marker_idx2 += marker_uidx_tmp - marker_uidx2 - popcount_bit_idx(marker_exclude2, marker_uidx2, marker_uidx_tmp);
          marker_uidx2 = marker_uidx_tmp;
          g_epi_geno1_offsets[2 * block_idx1] = marker_idx2;
          g_epi_geno1_offsets[2 * block_idx1 + 1] = marker_idx2 + 1;
          gap_cts[block_idx1 + marker_idx1] = 1;
        }
      }
    }
    // Position on the first second-list marker any row in this block needs.
    marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
    if (is_triangular) {
      marker_idx2 = marker_idx1 + 1;
      marker_uidx2 = jump_forward_unset_unsafe(marker_exclude2, marker_uidx2 + 1, marker_idx2);
    } else {
      marker_idx2 = 0;
    }
    if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto epistasis_linear_regression_ret_READ_FAIL;
    }
    cur_idx2_block_size = idx2_block_size;
    do {
      if (cur_idx2_block_size > marker_ct2 - marker_idx2) {
        cur_idx2_block_size = marker_ct2 - marker_idx2;
      }
      // Load the second-list block.
      for (block_idx2 = 0; block_idx2 < cur_idx2_block_size; marker_uidx2++, block_idx2++) {
        if (IS_SET(marker_exclude2, marker_uidx2)) {
          marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx2);
          if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
            goto epistasis_linear_regression_ret_READ_FAIL;
          }
        }
        if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf_raw, loadbuf)) {
          goto epistasis_linear_regression_ret_READ_FAIL;
        }
        rotate_loadbuf_and_compute_phenogeno(loadbuf, pheno_d2, pheno_nm_ct, &(g_epi_geno2[block_idx2 * pheno_nm_ctl2]), &(g_epi_phenogeno2[block_idx2]), &(g_epi_genosums2[block_idx2 * 2]));
      }
      g_epi_idx2_block_size = cur_idx2_block_size;
      g_epi_idx2_block_start = marker_idx2;
      idx2_block_sizea16 = round_up_pow2(cur_idx2_block_size, 16);
      fill_uint_zero(idx1_block_size + 15 * (max_thread_ct - 1), g_epi_n_sig_ct1);
      fill_uint_zero(idx1_block_size + 15 * (max_thread_ct - 1), g_epi_fail_ct1);
      fill_uint_zero(idx2_block_sizea16 * max_thread_ct, g_epi_n_sig_ct2);
      fill_uint_zero(idx2_block_sizea16 * max_thread_ct, g_epi_fail_ct2);
      // Seed the per-thread best-statistic buffers.
      // NOTE(review): the seed values are read from g_epi_all_chisq at an
      // absolute marker-index offset; confirm against epi_linear_thread that
      // this is the intended source.
      for (tidx = 0; tidx < max_thread_ct; tidx++) {
        ulii = g_epi_idx1_block_bounds[tidx];
        uljj = g_epi_idx1_block_bounds[tidx + 1] - ulii;
        // Each thread's slice starts at its 16-padded offset; this must match
        // the offsets the merge step below reads from.
        dptr = &(g_epi_best_chisq1[g_epi_idx1_block_bounds16[tidx]]);
        dptr2 = &(g_epi_all_chisq[(marker_idx1 + ulii) * 2]);
        for (ulkk = 0; ulkk < uljj; ulkk++) {
          *dptr++ = dptr2[ulkk * 2];
        }
        ulii = g_epi_geno1_offsets[2 * ulii + 1];
        if (ulii < marker_idx2 + cur_idx2_block_size) {
          if (ulii <= marker_idx2) {
            ulii = 0;
          } else {
            ulii -= marker_idx2;
          }
          uljj = cur_idx2_block_size - ulii;
          dptr = &(g_epi_best_chisq2[tidx * idx2_block_sizea16 + ulii]);
          dptr2 = &(g_epi_all_chisq[(marker_idx2 + ulii) * 2]);
          for (ulkk = 0; ulkk < uljj; ulkk++) {
            *dptr++ = dptr2[ulkk * 2];
          }
        }
      }
      is_last_block = (marker_idx2 + cur_idx2_block_size >= marker_ct2);
      if (spawn_threads2(threads, &epi_linear_thread, max_thread_ct, is_last_block)) {
        goto epistasis_linear_regression_ret_THREAD_CREATE_FAIL;
      }
      // The calling thread does thread 0's share of the work.
      epi_linear_thread((void*)0);
      join_threads2(threads, max_thread_ct, is_last_block);
      // merge best_chisq, best_ids, fail_cts
      for (tidx = 0; tidx < max_thread_ct; tidx++) {
        ulii = g_epi_idx1_block_bounds[tidx];
        uljj = g_epi_idx1_block_bounds[tidx + 1] - ulii;
        uii = g_epi_idx1_block_bounds16[tidx];
        dptr = &(g_epi_best_chisq1[uii]);
        uiptr = &(g_epi_best_id1[uii]);
        uiptr2 = &(g_epi_n_sig_ct1[uii]);
        uiptr3 = &(g_epi_fail_ct1[uii]);
        ulii += marker_idx1;
        dptr2 = &(best_chisq[ulii]);
        uiptr4 = &(n_sig_cts[ulii]);
        uiptr5 = &(fail_cts[ulii]);
        for (block_idx1 = 0; block_idx1 < uljj; block_idx1++, dptr2++, uiptr4++, uiptr5++) {
          dxx = *dptr++;
          if (dxx > (*dptr2)) {
            *dptr2 = dxx;
            best_ids[block_idx1 + ulii] = uiptr[block_idx1];
          }
          *uiptr4 += *uiptr2++;
          *uiptr5 += *uiptr3++;
        }
      }
      if (is_triangular) {
        // In the triangular case the second-list side of each pair is
        // summarized too, from the per-thread "2" buffers.
        for (tidx = 0; tidx < max_thread_ct; tidx++) {
          block_idx2 = g_epi_geno1_offsets[2 * g_epi_idx1_block_bounds[tidx] + 1];
          if (block_idx2 <= marker_idx2) {
            block_idx2 = 0;
          } else {
            block_idx2 -= marker_idx2;
          }
          dptr = &(g_epi_best_chisq2[tidx * idx2_block_sizea16 + block_idx2]);
          uiptr = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
          uiptr2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16 + block_idx2]);
          uiptr3 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16 + block_idx2]);
          dptr2 = &(best_chisq[block_idx2 + marker_idx2]);
          uiptr4 = &(n_sig_cts[block_idx2 + marker_idx2]);
          uiptr5 = &(fail_cts[block_idx2 + marker_idx2]);
          for (; block_idx2 < cur_idx2_block_size; block_idx2++, dptr2++, uiptr4++, uiptr5++) {
            dxx = *dptr++;
            if (dxx > (*dptr2)) {
              *dptr2 = dxx;
              best_ids[block_idx2 + marker_idx2] = uiptr[block_idx2];
            }
            *uiptr4 += *uiptr2++;
            *uiptr5 += *uiptr3++;
          }
        }
      }
      marker_idx2 += cur_idx2_block_size;
    } while (marker_idx2 < marker_ct2);
    // Replace "processing]" in the status text with "writing]".
    fputs("\b\b\b\b\b\b\b\b\b\b\bwriting]   \b\b\b", stdout);
    fflush(stdout);
    chrom_end = 0;
    block_idx1 = 0;
    // Write one output line per stored result of this first-list block.
    while (1) {
      next_unset_ul_unsafe_ck(marker_exclude1, &marker_uidx);
      ujj = g_epi_geno1_offsets[2 * block_idx1];
      marker_idx2 = 0;
      dptr = &(g_epi_all_chisq[block_idx1 * 2 * marker_ct2]);
      if (marker_uidx >= chrom_end) {
        chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx);
        chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
        chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
      }
      // Line prefix shared by the whole row: "CHR1 SNP1 ".
      wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf));
      *wptr_start++ = ' ';
      wptr_start = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
      *wptr_start++ = ' ';
      marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
      for (chrom_fo_idx2 = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx2); chrom_fo_idx2 < chrom_ct; chrom_fo_idx2++) {
        chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
        chrom_end2 = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx2 + 1];
        wptr_start2 = width_force(4, wptr_start, chrom_name_write(chrom_info_ptr, chrom_idx2, wptr_start));
        *wptr_start2++ = ' ';
        for (; marker_uidx2 < chrom_end2; ++marker_uidx2, next_unset_ul_ck(marker_exclude2, unfiltered_marker_ct, &marker_uidx2), ++marker_idx2, dptr = &(dptr[2])) {
          if (marker_idx2 == ujj) {
            // Reached this row's gap [offsets[2i], offsets[2i + 1]): skip it.
            marker_idx2 = g_epi_geno1_offsets[2 * block_idx1 + 1];
            if (marker_idx2 == marker_ct2) {
              goto epistasis_linear_regression_write_loop;
            }
            if (marker_idx2 > ujj) {
              marker_uidx2 = jump_forward_unset_unsafe(marker_exclude2, marker_uidx2 + 1, marker_idx2 - ujj);
              dptr = &(dptr[2 * (marker_idx2 - ujj)]);
              if (marker_uidx2 >= chrom_end2) {
                break;
              }
            }
          } else if (marker_idx2 == marker_ct2) {
            goto epistasis_linear_regression_write_loop;
          }
          dxx = *dptr;
          if (dxx != -1) {
            wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr_start2);
            *wptr++ = ' ';
            // beta
            wptr = width_force(12, wptr, dtoa_g(dptr[1], wptr));
            *wptr++ = ' ';
            // test statistic
            wptr = width_force(12, wptr, dtoa_g(dxx, wptr));
            *wptr++ = ' ';
            // p-value: 2 * normdist(-sqrt(stat)), floored at output_min_p
            dxx = normdist(-sqrt(dxx)) * 2;
            wptr = dtoa_g_wxp4x(MAXV(dxx, output_min_p), 12, ' ', wptr);
            *wptr++ = '\n';
            if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
              goto epistasis_linear_regression_ret_WRITE_FAIL;
            }
            // could remove this writeback in --epi1 1 case
            *dptr = -1;
          }
        }
      }
    epistasis_linear_regression_write_loop:
      block_idx1++;
      marker_uidx++;
      if (block_idx1 >= idx1_block_size) {
        break;
      }
    }
    marker_idx1 += idx1_block_size;
    // Erase " [writing]" from the status text.
    fputs("\b\b\b\b\b\b\b\b\b\b          \b\b\b\b\b\b\b\b\b\b", stdout);
    if (tests_complete >= pct_thresh) {
      // pct is one more than the last percentage printed, so pct > 10 means
      // the previous number had two digits and needs an extra backspace.
      if (pct > 10) {
        putc_unlocked('\b', stdout);
      }
      pct = (tests_complete * 100LLU) / tests_expected;
      if (pct < 100) {
        printf("\b\b%" PRIuPTR "%%", pct);
        fflush(stdout);
        pct_thresh = ((++pct) * ((uint64_t)tests_expected)) / 100;
      }
    }
  } while (marker_idx1 < marker_idx1_end);
  if (fclose_null(&outfile)) {
    goto epistasis_linear_regression_ret_WRITE_FAIL;
  }
  // Error exits: each label sets retval and falls out of the never-entered
  // loop into the shared cleanup below.
  while (0) {
  epistasis_linear_regression_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  epistasis_linear_regression_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  epistasis_linear_regression_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  epistasis_linear_regression_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
  epistasis_linear_regression_ret_INVALID_CMDLINE:
    retval = RET_INVALID_CMDLINE;
    break;
  epistasis_linear_regression_ret_THREAD_CREATE_FAIL:
    retval = RET_THREAD_CREATE_FAIL;
    break;
  }
  fclose_cond(outfile);
  // caller will free memory
  return retval;
}
8866 
epistasis_logistic_regression(pthread_t * threads,Epi_info * epi_ip,FILE * bedfile,uintptr_t bed_offset,uintptr_t unfiltered_marker_ct,uintptr_t * marker_reverse,char * marker_ids,uintptr_t max_marker_id_len,uint32_t plink_maxsnp,Chrom_info * chrom_info_ptr,uintptr_t marker_uidx_base,uintptr_t marker_ct1,uintptr_t * marker_exclude1,uintptr_t marker_idx1_start,uintptr_t marker_idx1_end,uintptr_t marker_ct2,uintptr_t * marker_exclude2,uint32_t is_triangular,uintptr_t job_size,uint64_t tests_expected,uintptr_t unfiltered_sample_ct,uintptr_t * pheno_nm,uint32_t pheno_nm_ct,uintptr_t * pheno_c,uint32_t parallel_idx,uint32_t parallel_tot,char * outname,char * outname_end,double output_min_p,uintptr_t * loadbuf_raw,uintptr_t * loadbuf,double * best_chisq,uint32_t * best_ids,uint32_t * n_sig_cts,uint32_t * fail_cts,uint32_t * gap_cts)8867 int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t marker_uidx_base, uintptr_t marker_ct1, uintptr_t* marker_exclude1, uintptr_t marker_idx1_start, uintptr_t marker_idx1_end, uintptr_t marker_ct2, uintptr_t* marker_exclude2, uint32_t is_triangular, uintptr_t job_size, uint64_t tests_expected, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, uintptr_t* pheno_c, uint32_t parallel_idx, uint32_t parallel_tot, char* outname, char* outname_end, double output_min_p, uintptr_t* loadbuf_raw, uintptr_t* loadbuf, double* best_chisq, uint32_t* best_ids, uint32_t* n_sig_cts, uint32_t* fail_cts, uint32_t* gap_cts) {
8868   FILE* outfile = nullptr;
8869   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
8870   uintptr_t pheno_nm_cta4 = round_up_pow2(pheno_nm_ct, 4);
8871   uintptr_t pheno_nm_ctl2 = QUATERCT_TO_WORDCT(pheno_nm_ct);
8872   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
8873   uintptr_t marker_uidx = marker_uidx_base;
8874   uintptr_t pct = 1;
8875   uintptr_t marker_uidx2 = 0;
8876   uintptr_t marker_idx1 = marker_idx1_start;
8877   uintptr_t marker_idx2 = 0;
8878   uint64_t pct_thresh = tests_expected / 100;
8879   uint64_t tests_complete = 0;
8880   uint32_t max_thread_ct = g_epi_thread_ct;
8881   uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
8882   uint32_t chrom_end = 0;
8883   uint32_t chrom_idx = 0;
8884   uint32_t chrom_idx2 = 0;
8885   int32_t retval = 0;
8886   unsigned char* bigstack_mark2;
8887   uintptr_t* ulptr;
8888   char* wptr_start;
8889   char* wptr_start2;
8890   char* wptr;
8891   float* fptr;
8892   float* fptr2;
8893   double* dptr;
8894   uint32_t* uiptr;
8895   uint32_t* uiptr2;
8896   uint32_t* uiptr3;
8897   uint32_t* uiptr4;
8898   uint32_t* uiptr5;
8899   uintptr_t cur_bigstack_left;
8900   uintptr_t cur_workload;
8901   uintptr_t idx1_block_size;
8902   uintptr_t idx2_block_size;
8903   uintptr_t idx2_block_sizea16;
8904   uintptr_t marker_uidx_tmp;
8905   uintptr_t block_idx1;
8906   uintptr_t block_idx2;
8907   uintptr_t cur_idx2_block_size;
8908   uintptr_t chrom_end2;
8909   uintptr_t tidx;
8910   uintptr_t ulii;
8911   uintptr_t uljj;
8912   uintptr_t ulkk;
8913   double dxx;
8914   float fxx;
8915   uint32_t chrom_fo_idx;
8916   uint32_t chrom_fo_idx2;
8917   uint32_t is_last_block;
8918   uint32_t uii;
8919   uint32_t ujj;
8920   if (bigstack_alloc_ul(pheno_nm_ctl2, &g_epi_pheno_c)) {
8921     goto epistasis_logistic_regression_ret_NOMEM;
8922   }
8923   copy_bitarr_subset(pheno_c, pheno_nm, unfiltered_sample_ct, pheno_nm_ct, g_epi_pheno_c);
8924   g_epi_pheno_nm_ct = pheno_nm_ct;
8925   // per-thread buffers
8926   g_epi_logistic_mt = (Epi_logistic_multithread*)bigstack_alloc(max_thread_ct * sizeof(Epi_logistic_multithread));
8927   if (!g_epi_logistic_mt) {
8928     goto epistasis_logistic_regression_ret_NOMEM;
8929   }
8930   // param_ct_max = 4 (intercept, A, B, AB)
8931   for (tidx = 0; tidx < max_thread_ct; tidx++) {
8932     if (bigstack_alloc_f(pheno_nm_cta4 * 4, &(g_epi_logistic_mt[tidx].cur_covars_cov_major)) ||
8933         bigstack_alloc_f(4, &(g_epi_logistic_mt[tidx].coef)) ||
8934         bigstack_alloc_f(pheno_nm_cta4, &(g_epi_logistic_mt[tidx].pp)) ||
8935         bigstack_alloc_f(pheno_nm_ct, &(g_epi_logistic_mt[tidx].sample_1d_buf)) ||
8936         bigstack_alloc_f(pheno_nm_ct, &(g_epi_logistic_mt[tidx].pheno_buf)) ||
8937         bigstack_alloc_f(pheno_nm_ct * 4, &(g_epi_logistic_mt[tidx].param_1d_buf)) ||
8938         bigstack_alloc_f(pheno_nm_ct, &(g_epi_logistic_mt[tidx].param_1d_buf2)) ||
8939 	bigstack_alloc_f(4 * 4, &(g_epi_logistic_mt[tidx].param_2d_buf)) ||
8940         bigstack_alloc_f(4 * 4, &(g_epi_logistic_mt[tidx].param_2d_buf2))) {
8941       goto epistasis_logistic_regression_ret_NOMEM;
8942     }
8943   }
8944 
8945   // claim up to half of memory with idx1 bufs; each marker currently costs:
8946   //   pheno_nm_ctl2 * sizeof(intptr_t) for geno buf
8947   //   4 * sizeof(int32_t) + sizeof(float) + marker_ct2 * 2 * sizeof(float)
8948   //     for other stuff (see epistasis_report() comment, starting from
8949   //     "offset"; main result buffer must be double-size to store both beta
8950   //     and chi-square stat)
8951   cur_bigstack_left = bigstack_left();
8952   ulii = 4 * CACHELINE - 3 * sizeof(int32_t) + max_thread_ct * (5 * (CACHELINE - 4));
8953   if (cur_bigstack_left >= ulii) {
8954     cur_bigstack_left -= ulii;
8955   }
8956   ulii = pheno_nm_ctl2 * sizeof(intptr_t) + 4 * sizeof(int32_t) + sizeof(float) + marker_ct2 * 2 * sizeof(float);
8957   idx1_block_size = cur_bigstack_left / (ulii * 2 + 1);
8958   if (!idx1_block_size) {
8959     goto epistasis_logistic_regression_ret_NOMEM;
8960   }
8961   if (idx1_block_size > job_size) {
8962     idx1_block_size = job_size;
8963   }
8964   // pad to avoid threads writing to same cacheline
8965   ulii = (max_thread_ct - 1) * 15 + idx1_block_size;
8966   bigstack_alloc_ui(idx1_block_size * 2, &g_epi_geno1_offsets);
8967   bigstack_alloc_ul(pheno_nm_ctl2 * idx1_block_size, &g_epi_geno1);
8968   bigstack_alloc_f(idx1_block_size * marker_ct2 * 2, &g_epi_all_chisq_f);
8969   bigstack_alloc_f(ulii, &g_epi_best_chisq_f1);
8970   bigstack_alloc_ui(ulii, &g_epi_best_id1);
8971   bigstack_alloc_ui(ulii, &g_epi_n_sig_ct1);
8972   bigstack_alloc_ui(ulii, &g_epi_fail_ct1);
8973   for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
8974     g_epi_geno1[block_idx1 * pheno_nm_ctl2 + pheno_nm_ctl2 - 1] = 0;
8975   }
8976   if (is_triangular) {
8977     fill_uint_zero(2 * idx1_block_size, g_epi_geno1_offsets);
8978   }
8979 
8980   ulii = pheno_nm_ctl2 * sizeof(intptr_t) + max_thread_ct * (3 * sizeof(int32_t) + sizeof(double));
8981   idx2_block_size = (bigstack_left() - (CACHELINE - sizeof(intptr_t)) - max_thread_ct * (3 * (CACHELINE - sizeof(int32_t)) + (CACHELINE - sizeof(float)))) / ulii;
8982   if (idx2_block_size > marker_ct2) {
8983     idx2_block_size = marker_ct2;
8984   }
8985   idx2_block_size = round_up_pow2(idx2_block_size, 16);
8986 
8987   memcpy(outname_end, ".epi.cc", 8);
8988   if (parallel_tot > 1) {
8989     outname_end[7] = '.';
8990     uint32toa_x(parallel_idx + 1, '\0', &(outname_end[8]));
8991   }
8992   if (fopen_checked(outname, "w", &outfile)) {
8993     goto epistasis_logistic_regression_ret_OPEN_FAIL;
8994   }
8995   if (!parallel_idx) {
8996     wptr = memcpya(g_textbuf, "CHR1 ", 5);
8997     wptr = fw_strcpyn(plink_maxsnp, 4, "SNP1", wptr);
8998     wptr = memcpya(wptr, " CHR2 ", 6);
8999     wptr = fw_strcpyn(plink_maxsnp, 4, "SNP2", wptr);
9000     wptr = memcpya(wptr, "       OR_INT         STAT            P \n", 41);
9001     if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
9002       goto epistasis_logistic_regression_ret_WRITE_FAIL;
9003     }
9004   }
9005 
9006   bigstack_mark2 = g_bigstack_base;
9007   while (1) {
9008     if (!idx2_block_size) {
9009       goto epistasis_logistic_regression_ret_NOMEM;
9010     }
9011     if (!(bigstack_alloc_ul(pheno_nm_ctl2 * idx2_block_size, &g_epi_geno2) ||
9012           bigstack_alloc_f(max_thread_ct * idx2_block_size, &g_epi_best_chisq_f2) ||
9013           bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_best_id2) ||
9014           bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_n_sig_ct2) ||
9015           bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_fail_ct2))) {
9016       break;
9017     }
9018     bigstack_reset(bigstack_mark2);
9019     idx2_block_size -= 16;
9020   }
9021   for (block_idx2 = 0; block_idx2 < idx2_block_size; block_idx2++) {
9022     g_epi_geno2[block_idx2 * pheno_nm_ctl2 + pheno_nm_ctl2 - 1] = 0;
9023   }
9024   marker_uidx = next_unset_ul_unsafe(marker_exclude1, marker_uidx_base);
9025   if (marker_idx1) {
9026     marker_uidx = jump_forward_unset_unsafe(marker_exclude1, marker_uidx + 1, marker_idx1);
9027   }
9028   wptr = memcpya(g_logbuf, "C/C --epistasis to ", 19);
9029   wptr = strcpya(wptr, outname);
9030   memcpy(wptr, " ... ", 6);
9031   wordwrapb(16); // strlen("99% [processing]")
9032   logprintb();
9033   fputs("0%", stdout);
9034   do {
9035     fputs(" [processing]", stdout);
9036     fflush(stdout);
9037     if (idx1_block_size > marker_idx1_end - marker_idx1) {
9038       idx1_block_size = marker_idx1_end - marker_idx1;
9039       if (idx1_block_size < max_thread_ct) {
9040         max_thread_ct = idx1_block_size;
9041         g_epi_thread_ct = max_thread_ct;
9042       }
9043     }
9044     g_epi_marker_idx1 = marker_idx1;
9045     fptr = g_epi_all_chisq_f;
9046     fptr2 = &(g_epi_all_chisq_f[idx1_block_size * marker_ct2 * 2]);
9047     do {
9048       *fptr = -1;
9049       fptr = &(fptr[2]);
9050     } while (fptr < fptr2);
9051     marker_uidx_tmp = marker_uidx;
9052     if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
9053       goto epistasis_logistic_regression_ret_READ_FAIL;
9054     }
9055     cur_workload = idx1_block_size * marker_ct2;
9056     if (is_triangular) {
9057       for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
9058         ulii = block_idx1 + marker_idx1 + 1;
9059         cur_workload -= ulii;
9060         g_epi_geno1_offsets[2 * block_idx1 + 1] = ulii;
9061       }
9062     } else {
9063       fill_uint_zero(2 * idx1_block_size, g_epi_geno1_offsets);
9064       marker_uidx2 = marker_uidx_base;
9065       marker_idx2 = 0;
9066     }
9067     tests_complete += cur_workload;
9068     ulii = 0; // total number of tests
9069     g_epi_idx1_block_bounds[0] = 0;
9070     g_epi_idx1_block_bounds16[0] = 0;
9071     block_idx1 = 0;
9072     for (tidx = 1; tidx < max_thread_ct; tidx++) {
9073       uljj = (((uint64_t)cur_workload) * tidx) / max_thread_ct;
9074       if (is_triangular) {
9075         do {
9076           ulii += marker_ct2 - g_epi_geno1_offsets[2 * block_idx1 + 1];
9077           block_idx1++;
9078 	} while (ulii < uljj);
9079       } else {
9080         do {
9081 	  ulii += marker_ct2;
9082 	  block_idx1++;
9083 	} while (ulii < uljj);
9084       }
9085       uii = block_idx1 - g_epi_idx1_block_bounds[tidx - 1];
9086       g_epi_idx1_block_bounds[tidx] = block_idx1;
9087       g_epi_idx1_block_bounds16[tidx] = g_epi_idx1_block_bounds16[tidx - 1] + round_up_pow2_ui(uii, 16);
9088     }
9089     g_epi_idx1_block_bounds[max_thread_ct] = idx1_block_size;
9090     chrom_end = 0;
9091     for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx_tmp++, block_idx1++) {
9092       if (IS_SET(marker_exclude1, marker_uidx_tmp)) {
9093 	marker_uidx_tmp = next_unset_ul_unsafe(marker_exclude1, marker_uidx_tmp);
9094         if (fseeko(bedfile, bed_offset + (marker_uidx_tmp * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
9095           goto epistasis_logistic_regression_ret_READ_FAIL;
9096 	}
9097       }
9098       // marker_reverse deliberately flipped
9099       if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, !IS_SET(marker_reverse, marker_uidx_tmp), bedfile, loadbuf_raw, loadbuf)) {
9100         goto epistasis_logistic_regression_ret_READ_FAIL;
9101       }
9102       // rotate to hom A1 = 10, het = 01, hom A2 = 00, missing = 11, to allow
9103       // inner loop to use ordinary multiplication
9104       // this is a bit redundant with the forced reverse, but it's not a
9105       // bottleneck
9106       rotate_plink1_to_a2ct_and_copy(loadbuf, &(g_epi_geno1[block_idx1 * pheno_nm_ctl2]), pheno_nm_ctl2);
9107       if (!is_triangular) {
9108 	if (!IS_SET(marker_exclude2, marker_uidx_tmp)) {
9109           // do not compare against self
9110           marker_idx2 += marker_uidx_tmp - marker_uidx2 - popcount_bit_idx(marker_exclude2, marker_uidx2, marker_uidx_tmp);
9111           marker_uidx2 = marker_uidx_tmp;
9112           g_epi_geno1_offsets[2 * block_idx1] = marker_idx2;
9113           g_epi_geno1_offsets[2 * block_idx1 + 1] = marker_idx2 + 1;
9114           gap_cts[block_idx1 + marker_idx1] = 1;
9115 	}
9116       }
9117     }
9118     marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
9119     if (is_triangular) {
9120       marker_idx2 = marker_idx1 + 1;
9121       marker_uidx2 = jump_forward_unset_unsafe(marker_exclude2, marker_uidx2 + 1, marker_idx2);
9122     } else {
9123       marker_idx2 = 0;
9124     }
9125     if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
9126       goto epistasis_logistic_regression_ret_READ_FAIL;
9127     }
9128     cur_idx2_block_size = idx2_block_size;
9129     do {
9130       if (cur_idx2_block_size > marker_ct2 - marker_idx2) {
9131         cur_idx2_block_size = marker_ct2 - marker_idx2;
9132       }
9133       for (block_idx2 = 0; block_idx2 < cur_idx2_block_size; marker_uidx2++, block_idx2++) {
9134         if (IS_SET(marker_exclude2, marker_uidx2)) {
9135           marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx2);
9136           if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
9137             goto epistasis_logistic_regression_ret_READ_FAIL;
9138 	  }
9139 	}
9140         ulptr = &(g_epi_geno2[block_idx2 * pheno_nm_ctl2]);
9141 	// marker_reverse deliberately flipped
9142 	if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, !IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf_raw, loadbuf)) {
9143 	  goto epistasis_logistic_regression_ret_READ_FAIL;
9144 	}
9145 	rotate_plink1_to_a2ct_and_copy(loadbuf, ulptr, pheno_nm_ctl2);
9146       }
9147       g_epi_idx2_block_size = cur_idx2_block_size;
9148       g_epi_idx2_block_start = marker_idx2;
9149       idx2_block_sizea16 = round_up_pow2(cur_idx2_block_size, 16);
9150       fill_uint_zero(idx1_block_size + 15 * (max_thread_ct - 1), g_epi_n_sig_ct1);
9151       fill_uint_zero(idx1_block_size + 15 * (max_thread_ct - 1), g_epi_fail_ct1);
9152       fill_uint_zero(idx2_block_sizea16 * max_thread_ct, g_epi_n_sig_ct2);
9153       fill_uint_zero(idx2_block_sizea16 * max_thread_ct, g_epi_fail_ct2);
9154       for (tidx = 0; tidx < max_thread_ct; tidx++) {
9155         ulii = g_epi_idx1_block_bounds[tidx];
9156         uljj = g_epi_idx1_block_bounds[tidx + 1] - ulii;
9157         fptr = &(g_epi_best_chisq_f1[g_epi_idx1_block_bounds[tidx]]);
9158 	fptr2 = &(g_epi_all_chisq_f[(marker_idx1 + ulii) * 2]);
9159 	for (ulkk = 0; ulkk < uljj; ulkk++) {
9160           *fptr++ = fptr2[ulkk * 2];
9161 	}
9162         ulii = g_epi_geno1_offsets[2 * ulii + 1];
9163         if (ulii < marker_idx2 + cur_idx2_block_size) {
9164           if (ulii <= marker_idx2) {
9165             ulii = 0;
9166 	  } else {
9167             ulii -= marker_idx2;
9168 	  }
9169           uljj = cur_idx2_block_size - ulii;
9170 	  fptr = &(g_epi_best_chisq_f2[tidx * idx2_block_sizea16 + ulii]);
9171 	  fptr2 = &(g_epi_all_chisq_f[(marker_idx2 + ulii) * 2]);
9172           for (ulkk = 0; ulkk < uljj; ulkk++) {
9173             *fptr++ = fptr2[ulkk * 2];
9174 	  }
9175 	}
9176       }
9177       is_last_block = (marker_idx2 + cur_idx2_block_size >= marker_ct2);
9178       if (spawn_threads2(threads, &epi_logistic_thread, max_thread_ct, is_last_block)) {
9179 	goto epistasis_logistic_regression_ret_THREAD_CREATE_FAIL;
9180       }
9181       epi_logistic_thread((void*)0);
9182       join_threads2(threads, max_thread_ct, is_last_block);
9183       // merge best_chisq, best_ids, fail_cts
9184       for (tidx = 0; tidx < max_thread_ct; tidx++) {
9185 	ulii = g_epi_idx1_block_bounds[tidx];
9186 	uljj = g_epi_idx1_block_bounds[tidx + 1] - ulii;
9187 	uii = g_epi_idx1_block_bounds16[tidx];
9188 	fptr = &(g_epi_best_chisq_f1[uii]);
9189 	uiptr = &(g_epi_best_id1[uii]);
9190 	uiptr2 = &(g_epi_n_sig_ct1[uii]);
9191 	uiptr3 = &(g_epi_fail_ct1[uii]);
9192 	ulii += marker_idx1;
9193 	dptr = &(best_chisq[ulii]);
9194 	uiptr4 = &(n_sig_cts[ulii]);
9195 	uiptr5 = &(fail_cts[ulii]);
9196 	for (block_idx1 = 0; block_idx1 < uljj; block_idx1++, dptr++, uiptr4++, uiptr5++) {
9197 	  dxx = (double)(*fptr++);
9198 	  if (dxx > (*dptr)) {
9199 	    *dptr = dxx;
9200 	    best_ids[block_idx1 + ulii] = uiptr[block_idx1];
9201 	  }
9202 	  *uiptr4 += *uiptr2++;
9203 	  *uiptr5 += *uiptr3++;
9204 	}
9205       }
9206       if (is_triangular) {
9207 	for (tidx = 0; tidx < max_thread_ct; tidx++) {
9208 	  block_idx2 = g_epi_geno1_offsets[2 * g_epi_idx1_block_bounds[tidx] + 1];
9209 	  if (block_idx2 <= marker_idx2) {
9210 	    block_idx2 = 0;
9211 	  } else {
9212 	    block_idx2 -= marker_idx2;
9213 	  }
9214 	  fptr = &(g_epi_best_chisq_f2[tidx * idx2_block_sizea16 + block_idx2]);
9215 	  uiptr = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
9216 	  uiptr2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16 + block_idx2]);
9217 	  uiptr3 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16 + block_idx2]);
9218 	  dptr = &(best_chisq[block_idx2 + marker_idx2]);
9219 	  uiptr4 = &(n_sig_cts[block_idx2 + marker_idx2]);
9220 	  uiptr5 = &(fail_cts[block_idx2 + marker_idx2]);
9221 	  for (; block_idx2 < cur_idx2_block_size; block_idx2++, dptr++, uiptr4++, uiptr5++) {
9222 	    dxx = (double)(*fptr++);
9223 	    if (dxx > (*dptr)) {
9224 	      *dptr = dxx;
9225 	      best_ids[block_idx2 + marker_idx2] = uiptr[block_idx2];
9226 	    }
9227 	    *uiptr4 += *uiptr2++;
9228 	    *uiptr5 += *uiptr3++;
9229 	  }
9230 	}
9231       }
9232       marker_idx2 += cur_idx2_block_size;
9233     } while (marker_idx2 < marker_ct2);
9234     fputs("\b\b\b\b\b\b\b\b\b\b\bwriting]   \b\b\b", stdout);
9235     fflush(stdout);
9236     chrom_end = 0;
9237     block_idx1 = 0;
9238     while (1) {
9239       next_unset_ul_unsafe_ck(marker_exclude1, &marker_uidx);
9240       ujj = g_epi_geno1_offsets[2 * block_idx1];
9241       marker_idx2 = 0;
9242       fptr = &(g_epi_all_chisq_f[block_idx1 * 2 * marker_ct2]);
9243       if (marker_uidx >= chrom_end) {
9244 	chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx);
9245 	chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
9246 	chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
9247       }
9248       wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf));
9249       *wptr_start++ = ' ';
9250       wptr_start = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
9251       *wptr_start++ = ' ';
9252       marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
9253       for (chrom_fo_idx2 = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx2); chrom_fo_idx2 < chrom_ct; chrom_fo_idx2++) {
9254 	chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
9255 	chrom_end2 = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx2 + 1];
9256 	wptr_start2 = width_force(4, wptr_start, chrom_name_write(chrom_info_ptr, chrom_idx2, wptr_start));
9257 	*wptr_start2++ = ' ';
9258 	for (; marker_uidx2 < chrom_end2; ++marker_uidx2, next_unset_ul_ck(marker_exclude2, unfiltered_marker_ct, &marker_uidx2), ++marker_idx2, fptr = &(fptr[2])) {
9259 	  if (marker_idx2 == ujj) {
9260 	    marker_idx2 = g_epi_geno1_offsets[2 * block_idx1 + 1];
9261 	    if (marker_idx2 == marker_ct2) {
9262 	      goto epistasis_logistic_regression_write_loop;
9263 	    }
9264 	    if (marker_idx2 > ujj) {
9265 	      marker_uidx2 = jump_forward_unset_unsafe(marker_exclude2, marker_uidx2 + 1, marker_idx2 - ujj);
9266 	      fptr = &(fptr[2 * (marker_idx2 - ujj)]);
9267 	      if (marker_uidx2 >= chrom_end2) {
9268 		break;
9269 	      }
9270 	    }
9271 	  } else if (marker_idx2 == marker_ct2) {
9272 	    goto epistasis_logistic_regression_write_loop;
9273 	  }
9274 	  fxx = *fptr;
9275 	  if (fxx != -1) {
9276 	    dxx = (double)fxx;
9277 	    wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr_start2);
9278 	    *wptr++ = ' ';
9279 	    // odds ratio
9280 	    wptr = width_force(12, wptr, dtoa_g(exp((double)fptr[1]), wptr));
9281             *wptr++ = ' ';
9282 	    wptr = width_force(12, wptr, ftoa_g(fxx, wptr));
9283 	    *wptr++ = ' ';
9284 	    dxx = normdist(-sqrt(dxx)) * 2;
9285 	    wptr = dtoa_g_wxp4x(MAXV(dxx, output_min_p), 12, ' ', wptr);
9286 	    *wptr++ = '\n';
9287 	    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
9288 	      goto epistasis_logistic_regression_ret_WRITE_FAIL;
9289 	    }
9290 	    // could remove this writeback in --epi1 1 case
9291 	    *fptr = -1;
9292 	  }
9293 	}
9294       }
9295     epistasis_logistic_regression_write_loop:
9296       block_idx1++;
9297       marker_uidx++;
9298       if (block_idx1 >= idx1_block_size) {
9299         break;
9300       }
9301     }
9302     marker_idx1 += idx1_block_size;
9303     fputs("\b\b\b\b\b\b\b\b\b\b          \b\b\b\b\b\b\b\b\b\b", stdout);
9304     if (tests_complete >= pct_thresh) {
9305       if (pct > 10) {
9306         putc_unlocked('\b', stdout);
9307       }
9308       pct = (tests_complete * 100LLU) / tests_expected;
9309       if (pct < 100) {
9310         printf("\b\b%" PRIuPTR "%%", pct);
9311         fflush(stdout);
9312         pct_thresh = ((++pct) * ((uint64_t)tests_expected)) / 100;
9313       }
9314     }
9315   } while (marker_idx1 < marker_idx1_end);
9316   if (fclose_null(&outfile)) {
9317     goto epistasis_logistic_regression_ret_WRITE_FAIL;
9318   }
9319   while (0) {
9320   epistasis_logistic_regression_ret_NOMEM:
9321     retval = RET_NOMEM;
9322     break;
9323   epistasis_logistic_regression_ret_OPEN_FAIL:
9324     retval = RET_OPEN_FAIL;
9325     break;
9326   epistasis_logistic_regression_ret_READ_FAIL:
9327     retval = RET_READ_FAIL;
9328     break;
9329   epistasis_logistic_regression_ret_WRITE_FAIL:
9330     retval = RET_WRITE_FAIL;
9331     break;
9332   epistasis_logistic_regression_ret_THREAD_CREATE_FAIL:
9333     retval = RET_THREAD_CREATE_FAIL;
9334     break;
9335   }
9336   fclose_cond(outfile);
9337   // caller will free memory
9338   return retval;
9339 }
9340 
epistasis_report(pthread_t * threads,Epi_info * epi_ip,FILE * bedfile,uintptr_t bed_offset,uintptr_t marker_ct2,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude,uintptr_t * marker_reverse,char * marker_ids,uintptr_t max_marker_id_len,uint32_t * marker_pos,uint32_t plink_maxsnp,Chrom_info * chrom_info_ptr,uintptr_t unfiltered_sample_ct,uintptr_t * pheno_nm,uint32_t pheno_nm_ct,uint32_t ctrl_ct,uintptr_t * pheno_c,double * pheno_d,uint32_t parallel_idx,uint32_t parallel_tot,char * outname,char * outname_end,double output_min_p,double glm_vif_thresh,Set_info * sip)9341 int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct2, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, uint32_t ctrl_ct, uintptr_t* pheno_c, double* pheno_d, uint32_t parallel_idx, uint32_t parallel_tot, char* outname, char* outname_end, double output_min_p, double glm_vif_thresh, Set_info* sip) {
9342   unsigned char* bigstack_mark = g_bigstack_base;
9343   FILE* outfile = nullptr;
9344   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
9345   uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
9346   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
9347   uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
9348   uintptr_t marker_uidx_base = next_unset_unsafe(marker_exclude, 0);
9349   uintptr_t marker_uidx = marker_uidx_base;
9350   uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
9351   uint32_t modifier = epi_ip->modifier;
9352   uint32_t is_fast = modifier & EPI_FAST;
9353   uint32_t is_boost = (modifier / EPI_FAST_BOOST) & 1;
9354   uint32_t do_joint_effects = modifier & EPI_FAST_JOINT_EFFECTS;
9355   uint32_t no_ueki = modifier & EPI_FAST_NO_UEKI;
9356   uint32_t is_case_only = (modifier / EPI_FAST_CASE_ONLY) & 1;
9357   uint32_t is_triangular = 1;
9358   uint32_t is_custom_set1 = modifier & (EPI_SET_BY_SET | EPI_SET_BY_ALL)? 1 : 0;
9359   uint32_t is_set_by_set = modifier & EPI_SET_BY_SET;
9360   uint32_t tot_stride = 6 - 3 * is_case_only;
9361   uint32_t no_p_value = modifier & EPI_FAST_NO_P_VALUE;
9362   uint32_t case_only_gap = epi_ip->case_only_gap;
9363   uint32_t is_case_only_window = (is_case_only && case_only_gap);
9364   uint32_t case_ct = pheno_nm_ct - ctrl_ct;
9365   uint32_t cellminx3 = 0;
9366   uintptr_t case_ctl2 = QUATERCT_TO_WORDCT(case_ct);
9367   uintptr_t case_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(case_ct);
9368   uintptr_t ctrl_ctl2 = QUATERCT_TO_WORDCT(ctrl_ct);
9369   uintptr_t case_ctv3 = BITCT_TO_ALIGNED_WORDCT(case_ct);
9370   uintptr_t ctrl_ctv3 = BITCT_TO_ALIGNED_WORDCT(ctrl_ct);
9371   uintptr_t case_ctsplit = 3 * case_ctv3;
9372   uintptr_t ctrl_ctsplit = 3 * ctrl_ctv3;
9373   uintptr_t pct = 1;
9374   uintptr_t marker_uidx2 = 0;
9375   uintptr_t marker_uidx2_trail = 0;
9376   uintptr_t marker_idx2 = 0;
9377   uintptr_t marker_idx2_trail = 0;
9378   uint64_t tests_thrown_out = 0;
9379   uint64_t tests_complete = 0;
9380   uint32_t max_thread_ct = g_thread_ct;
9381   uint32_t chrom_idx = 0;
9382   uint32_t chrom_end = 0;
9383   uint32_t last_pos = 0;
9384   uint32_t first_pos = 0;
9385   uint32_t uii = 0;
9386   int32_t retval = 0;
9387   uint32_t* gap_cts = nullptr;
9388   uintptr_t* ctrlbuf = nullptr;
9389   uintptr_t* marker_exclude1 = nullptr;
9390   uintptr_t* ulptr = nullptr;
9391   uintptr_t ularr[sizeof(double) / BYTECT];
9392   uintptr_t* casebuf;
9393   uintptr_t* loadbuf;
9394   uintptr_t* marker_exclude2;
9395   double* best_chisq;
9396   uint32_t* best_ids;
9397   uint32_t* n_sig_cts;
9398   uint32_t* fail_cts;
9399   uint32_t* marker_idx_to_uidx;
9400   unsigned char* bigstack_mark2;
9401   unsigned char* bigstack_mark3;
9402   char* wptr_start;
9403   char* wptr_start2;
9404   char* wptr;
9405   double* dptr;
9406   double* dptr2;
9407   uint32_t* uiptr;
9408   uint32_t* uiptr2;
9409   uint32_t* uiptr3;
9410   uint32_t* uiptr4;
9411   uint32_t* uiptr5;
9412   uint64_t tests_expected;
9413   uint64_t pct_thresh;
9414   double dxx;
9415   uintptr_t marker_ct1;
9416   uintptr_t tot_ctsplit;
9417   uintptr_t job_size;
9418   uintptr_t cur_bigstack_left;
9419   uintptr_t cur_workload;
9420   uintptr_t marker_idx1_start;
9421   uintptr_t marker_idx1;
9422   uintptr_t marker_idx1_end;
9423   uintptr_t idx1_block_size;
9424   uintptr_t idx2_block_size;
9425   uintptr_t idx2_block_sizea16;
9426   uintptr_t marker_uidx_tmp;
9427   uintptr_t block_idx1;
9428   uintptr_t block_idx2;
9429   uintptr_t cur_idx2_block_size;
9430   uintptr_t tidx;
9431   uintptr_t ulii;
9432   uintptr_t uljj;
9433   uintptr_t chrom_end2;
9434   uint32_t chrom_fo_idx;
9435   uint32_t chrom_fo_idx2;
9436   uint32_t chrom_idx2;
9437   uint32_t cur_window_end;
9438   uint32_t is_last_block;
9439   uint32_t missing_ct;
9440   uint32_t ujj;
9441 
9442   // common initialization between --epistasis and --fast-epistasis: remove
9443   // monomorphic and non-autosomal diploid sites
9444   if (is_custom_set1) {
9445     if (!sip->ct) {
9446       sprintf(g_logbuf, "Error: --%sepistasis set-by-%s requires a variant set to be loaded.\n", is_fast? "fast-" : "", is_set_by_set? "set" : "all");
9447       goto epistasis_report_ret_INVALID_CMDLINE_2;
9448     } else if (!is_set_by_set) {
9449       if (sip->ct > 1) {
9450 	logerrprint("Error: --{fast-}epistasis set-by-all requires exactly one set.  (--set-names or\n--set-collapse-all may be handy here.\n");
9451 	goto epistasis_report_ret_INVALID_CMDLINE;
9452       }
9453     } else if (sip->ct > 2) {
9454       logerrprint("Error: --{fast-}epistasis set-by-set requires exactly one or two sets.\n(--set-names or --set-collapse-all may be handy here.)\n");
9455       goto epistasis_report_ret_INVALID_CMDLINE;
9456     }
9457     if (bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude1)) {
9458       goto epistasis_report_ret_NOMEM;
9459     }
9460     unpack_set_unfiltered(marker_ct2, unfiltered_marker_ct, marker_exclude, sip->setdefs[0], marker_exclude1);
9461     if (is_set_by_set && (sip->ct == 1)) {
9462       marker_ct2 = unfiltered_marker_ct - popcount_longs(marker_exclude1, unfiltered_marker_ctl);
9463     } else {
9464       is_triangular = 0;
9465     }
9466     // if set-by-set with two sets, wait till after monomorphic sites are
9467     // removed to unpack 2nd set
9468   }
9469   if (pheno_nm_ct >= 0x20000000) {
9470     // may as well document the existence of sub-2b overflow conditions even
9471     // though they'll never come up
9472     logerrprint("Error: --{fast-}epistasis does not support >= 2^29 samples.\n");
9473     goto epistasis_report_ret_INVALID_CMDLINE;
9474   }
9475   if (!pheno_d) {
9476     if ((case_ct < 2) || ((!is_case_only) && (ctrl_ct < 2))) {
9477       sprintf(g_logbuf, "Error: --%sepistasis requires at least two cases%s.\n", is_fast? "fast-" : "", is_case_only? "" : " and two controls");
9478       goto epistasis_report_ret_INVALID_CMDLINE_2;
9479     }
9480     if (bigstack_alloc_ul(case_ctv2 + ctrl_ctl2, &casebuf)) {
9481       goto epistasis_report_ret_NOMEM;
9482     }
9483     ctrlbuf = &(casebuf[case_ctv2]);
9484     ctrlbuf[ctrl_ctl2 - 1] = 0;
9485   } else {
9486     case_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
9487     if (bigstack_alloc_ul(case_ctv2, &casebuf)) {
9488       goto epistasis_report_ret_NOMEM;
9489     }
9490   }
9491   casebuf[case_ctv2 - 2] = 0;
9492   casebuf[case_ctv2 - 1] = 0;
9493   // marker_exclude2 should be on top since we might free it
9494   if (bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf) ||
9495       bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude2)) {
9496     goto epistasis_report_ret_NOMEM;
9497   }
9498   loadbuf[unfiltered_sample_ctv2 - 2] = 0;
9499   loadbuf[unfiltered_sample_ctv2 - 1] = 0;
9500   if ((!is_set_by_set) || (sip->ct == 2)) {
9501     memcpy(marker_exclude2, marker_exclude, unfiltered_marker_ctl * sizeof(intptr_t));
9502   } else {
9503     memcpy(marker_exclude2, marker_exclude1, unfiltered_marker_ctl * sizeof(intptr_t));
9504   }
9505   if (do_joint_effects && epi_ip->je_cellmin) {
9506     cellminx3 = epi_ip->je_cellmin * 3;
9507     if ((case_ct < cellminx3 * 3) || ((!is_case_only) && (ctrl_ct < cellminx3 * 3))) {
9508       sprintf(g_logbuf, "Error: Too few cases or controls for --je-cellmin %u.\n", epi_ip->je_cellmin);
9509       goto epistasis_report_ret_INVALID_CMDLINE_2;
9510     }
9511     ulii = case_ctl2;
9512     if ((!is_case_only) && (ctrl_ctl2 > case_ctl2)) {
9513       ulii = ctrl_ctl2;
9514     }
9515     if (bigstack_alloc_ul(ulii, &ulptr)) {
9516       goto epistasis_report_ret_NOMEM;
9517     }
9518     fill_quatervec_55(ulii * BITCT2, ulptr);
9519   }
9520   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_ct; chrom_fo_idx++) {
9521     chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
9522     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
9523     if (is_set(chrom_info_ptr->haploid_mask, chrom_idx)) {
9524       uii = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx];
9525       fill_bits(uii, chrom_end - uii, marker_exclude2);
9526       marker_uidx = chrom_end;
9527       continue;
9528     }
9529     // may want to keep two window sizes' raw data loaded for marker 1, to
9530     // halve the number of non-sequential seeks?
9531     if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
9532       goto epistasis_report_ret_READ_FAIL;
9533     }
9534     while (marker_uidx < chrom_end) {
9535       if (is_set(marker_exclude2, marker_uidx)) {
9536 	marker_uidx = next_unset(marker_exclude2, marker_uidx, chrom_end);
9537 	if (marker_uidx == chrom_end) {
9538 	  break;
9539 	}
9540 	if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
9541 	  goto epistasis_report_ret_READ_FAIL;
9542 	}
9543       }
9544       if ((!no_ueki) && (!cellminx3)) {
9545 	if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, 0, bedfile, loadbuf, casebuf)) {
9546 	  goto epistasis_report_ret_READ_FAIL;
9547 	}
9548 	if (is_boost) {
9549 	  if (less_than_two_genotypes(casebuf, pheno_nm_ct)) {
9550 	    SET_BIT(marker_uidx, marker_exclude2);
9551 	  }
9552 	} else {
9553 	  if (is_monomorphic(casebuf, pheno_nm_ct)) {
9554 	    SET_BIT(marker_uidx, marker_exclude2);
9555 	  }
9556 	}
9557       } else {
9558         if (load_and_split(unfiltered_sample_ct, pheno_nm, pheno_c, bedfile, loadbuf, casebuf, ctrlbuf)) {
9559           goto epistasis_report_ret_READ_FAIL;
9560 	}
9561 	if (no_ueki) {
9562 	  if (is_monomorphic(casebuf, case_ct) || ((!is_case_only) && is_monomorphic(ctrlbuf, ctrl_ct))) {
9563 	    SET_BIT(marker_uidx, marker_exclude2);
9564 	  }
9565 	} else {
9566 	  genovec_3freq(casebuf, ulptr, case_ctl2, &missing_ct, &uii, &ujj);
9567 	  if ((uii < cellminx3) || (ujj < cellminx3) || (case_ct - uii - ujj - missing_ct < cellminx3)) {
9568 	    SET_BIT(marker_uidx, marker_exclude2);
9569 	  } else if (!is_case_only) {
9570 	    genovec_3freq(ctrlbuf, ulptr, ctrl_ctl2, &missing_ct, &uii, &ujj);
9571 	    if ((uii < cellminx3) || (ujj < cellminx3) || (ctrl_ct - uii - ujj - missing_ct < cellminx3)) {
9572 	      SET_BIT(marker_uidx, marker_exclude2);
9573 	    }
9574 	  }
9575 	}
9576       }
9577       marker_uidx++;
9578     }
9579   }
9580   ulii = unfiltered_marker_ct - popcount_longs(marker_exclude2, unfiltered_marker_ctl);
9581   if ((!ulii) || ((ulii == 1) && is_triangular)) {
9582     goto epistasis_report_ret_TOO_FEW_MARKERS;
9583   }
9584   if (ulii != marker_ct2) {
9585     if (!cellminx3) {
9586       LOGPRINTF("--%sepistasis: Skipping %" PRIuPTR " monomorphic/non-autosomal site%s.\n", is_fast? "fast-" : "", marker_ct2 - ulii, (marker_ct2 - ulii == 1)? "" : "s");
9587     } else {
9588       LOGPRINTF("--%sepistasis: Skipping %" PRIuPTR " site%s due to --je-cellmin setting.\n", is_fast? "fast-" : "", marker_ct2 - ulii, (marker_ct2 - ulii == 1)? "" : "s");
9589       bigstack_reset(ulptr);
9590     }
9591     marker_uidx_base = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
9592   } else if ((!is_custom_set1) || (!is_set_by_set)) {
9593     bigstack_reset(marker_exclude2);
9594     marker_exclude2 = marker_exclude;
9595   }
9596   if (is_triangular) {
9597     if (!marker_exclude1) {
9598       marker_exclude1 = marker_exclude2;
9599     }
9600     marker_ct1 = ulii;
9601     marker_ct2 = ulii;
9602     tests_expected = ((((uint64_t)marker_ct1) * (marker_ct1 - 1)) / 2);
9603   } else {
9604     bitvec_or(marker_exclude2, unfiltered_marker_ctl, marker_exclude1);
9605     marker_ct1 = unfiltered_marker_ct - popcount_longs(marker_exclude1, unfiltered_marker_ctl);
9606     if (sip->ct == 2) {
9607       if (bigstack_alloc_ul(unfiltered_marker_ctl, &ulptr)) {
9608 	goto epistasis_report_ret_NOMEM;
9609       }
9610       memcpy(ulptr, marker_exclude2, unfiltered_marker_ctl * sizeof(intptr_t));
9611       unpack_set_unfiltered(marker_ct2, unfiltered_marker_ct, marker_exclude, sip->setdefs[1], marker_exclude2);
9612       bitvec_or(ulptr, unfiltered_marker_ctl, marker_exclude2);
9613       bigstack_reset(ulptr);
9614       marker_ct2 = unfiltered_marker_ct - popcount_longs(marker_exclude2, unfiltered_marker_ctl);
9615     } else {
9616       marker_ct2 = ulii;
9617     }
9618     tests_expected = ((uint64_t)marker_ct1) * marker_ct2;
9619     if (!tests_expected) {
9620       goto epistasis_report_ret_TOO_FEW_MARKERS;
9621     }
9622   }
9623   if (parallel_tot > 1) {
9624     if (marker_ct1 < (1 + is_triangular) * parallel_tot) {
9625       sprintf(g_logbuf, "Error: Too few loci remaining for --parallel %u %u + --%sepistasis.\n", parallel_idx + 1, parallel_tot, is_fast? "fast-" : "");
9626       goto epistasis_report_ret_INVALID_CMDLINE_2;
9627     }
9628     if (is_triangular) {
9629       // If there are n markers, and we're computing the usual upper right
9630       // triangle, first row has n-1 entries, second row has n-2, etc.
9631       // Total entry count is n(n-1)/2; total entry count starting from row r
9632       // is (n-r)(n-r-1)/2... upside-down triangle_divide() calls produce a
9633       // good partition.
9634       // Divide first to avoid 64-bit integer overflow (!) on really huge jobs.
9635       // (Multiply-by-2 is there because triangle_divide() takes n(n-1) instead
9636       // of n(n-1)/2 as first parameter.)
9637       pct_thresh = (2 * tests_expected) / parallel_tot;
9638       // If parallel_idx == 0, the marker_ct >= 2 * parallel_tot condition
9639       // ensures the precision loss from dividing and remultiplying does not
9640       // cause the first marker to be dropped.
9641       marker_idx1_start = triangle_divide(pct_thresh * (parallel_tot - parallel_idx), -1);
9642       marker_idx1_end = triangle_divide(pct_thresh * (parallel_tot - parallel_idx - 1), -1);
9643       tests_expected = ((((uint64_t)marker_idx1_start) * (marker_idx1_start - 1)) - (((uint64_t)marker_idx1_end) * (marker_idx1_end - 1))) / 2;
9644       marker_idx1_start = marker_ct1 - marker_idx1_start;
9645       marker_idx1_end = marker_ct1 - marker_idx1_end;
9646     } else {
9647       marker_idx1_start = (parallel_idx * ((uint64_t)marker_ct1)) / parallel_tot;
9648       marker_idx1_end = ((parallel_idx + 1) * ((uint64_t)marker_ct1)) / parallel_tot;
9649       tests_expected = (marker_idx1_end - marker_idx1_start) * ((uint64_t)marker_ct2);
9650     }
9651   } else {
9652     marker_idx1_start = 0;
9653     marker_idx1_end = marker_ct1;
9654   }
9655   marker_idx1 = marker_idx1_start;
9656   job_size = marker_idx1_end - marker_idx1_start;
9657   if (max_thread_ct > job_size) {
9658     max_thread_ct = job_size;
9659   }
9660   if (bigstack_calloc_d(marker_ct1, &best_chisq) ||
9661       bigstack_calloc_ui(marker_ct1, &best_ids) ||
9662       bigstack_calloc_ui(marker_ct1, &n_sig_cts) ||
9663       bigstack_calloc_ui(marker_ct1, &fail_cts) ||
9664       bigstack_alloc_ui(max_thread_ct + 1, &g_epi_idx1_block_bounds) ||
9665       bigstack_alloc_ui(max_thread_ct, &g_epi_idx1_block_bounds16)) {
9666     goto epistasis_report_ret_NOMEM;
9667   }
9668   if (is_case_only_window || (!is_triangular)) {
9669     if (bigstack_calloc_ui(marker_ct1, &gap_cts)) {
9670       goto epistasis_report_ret_NOMEM;
9671     }
9672   }
9673   bigstack_mark3 = g_bigstack_base;
9674 
9675   g_epi_thread_ct = max_thread_ct;
9676   g_epi_case_ct = case_ct;
9677   g_epi_flag = modifier;
9678   g_epi_marker_ct = marker_ct2;
9679   g_epi_cellmin = cellminx3 / 3;
9680   // might want to provide a Bonferroni correction interface...
9681   if (is_boost) {
9682     if (epi_ip->epi1 == 0.0) {
9683       dxx = 0.000005;
9684     } else {
9685       dxx = epi_ip->epi1;
9686     }
9687     g_epi_alpha1sq[0] = inverse_chiprob(dxx, 4);
9688     g_epi_alpha1sq[1] = inverse_chiprob(dxx, 2);
9689     g_epi_alpha1sq[2] = inverse_chiprob(dxx, 1);
9690     g_epi_alpha2sq[0] = inverse_chiprob(epi_ip->epi2, 4);
9691     if (g_epi_alpha1sq[0] == g_epi_alpha2sq[0]) {
9692       // count final instead of screening p-value hits
9693       g_epi_alpha2sq[0] *= 1 + SMALL_EPSILON;
9694       g_epi_alpha2sq[1] = g_epi_alpha1sq[1] * (1 + SMALL_EPSILON);
9695       g_epi_alpha2sq[2] = g_epi_alpha1sq[2] * (1 + SMALL_EPSILON);
9696     } else {
9697       g_epi_alpha2sq[1] = inverse_chiprob(epi_ip->epi2, 2);
9698       g_epi_alpha2sq[2] = inverse_chiprob(epi_ip->epi2, 1);
9699     }
9700     if (bigstack_alloc_d(pheno_nm_ct + 1, &g_epi_recip_cache)) {
9701       goto epistasis_report_ret_NOMEM;
9702     }
9703     g_epi_recip_cache[0] = 0.0;
9704     for (uii = 1; uii <= pheno_nm_ct; uii++) {
9705       g_epi_recip_cache[uii] = 1.0 / ((double)((int32_t)uii));
9706     }
9707   } else {
9708     if (epi_ip->epi1 == 0.0) {
9709       dxx = 0.00005;
9710     } else {
9711       dxx = epi_ip->epi1 * 0.5;
9712     }
9713     dxx = ltqnorm(dxx);
9714     g_epi_alpha1sq[0] = dxx * dxx;
9715     dxx = ltqnorm(epi_ip->epi2 / 2);
9716     g_epi_alpha2sq[0] = dxx * dxx;
9717   }
9718   if (!is_fast) {
9719     if (pheno_d) {
9720       retval = epistasis_linear_regression(threads, epi_ip, bedfile, bed_offset, unfiltered_marker_ct, marker_reverse, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, marker_uidx_base, marker_ct1, marker_exclude1, marker_idx1_start, marker_idx1_end, marker_ct2, marker_exclude2, is_triangular, job_size, tests_expected, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, pheno_d, parallel_idx, parallel_tot, outname, outname_end, output_min_p, glm_vif_thresh, loadbuf, casebuf, best_chisq, best_ids, n_sig_cts, fail_cts, gap_cts);
9721     } else {
9722       retval = epistasis_logistic_regression(threads, epi_ip, bedfile, bed_offset, unfiltered_marker_ct, marker_reverse, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, marker_uidx_base, marker_ct1, marker_exclude1, marker_idx1_start, marker_idx1_end, marker_ct2, marker_exclude2, is_triangular, job_size, tests_expected, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, pheno_c, parallel_idx, parallel_tot, outname, outname_end, output_min_p, loadbuf, casebuf, best_chisq, best_ids, n_sig_cts, fail_cts, gap_cts);
9723     }
9724     if (retval) {
9725       goto epistasis_report_ret_1;
9726     }
9727   } else {
9728     pct_thresh = tests_expected / 100;
9729     if (is_case_only) {
9730       g_epi_ctrl_ct = 0;
9731       ctrl_ctv3 = 0;
9732       ctrl_ctsplit = 0;
9733       memcpy(outname_end, ".epi.co", 8);
9734     } else {
9735       g_epi_ctrl_ct = ctrl_ct;
9736       memcpy(outname_end, ".epi.cc", 8);
9737     }
9738     if (parallel_tot > 1) {
9739       outname_end[7] = '.';
9740       uint32toa_x(parallel_idx + 1, '\0', &(outname_end[8]));
9741     }
9742     tot_ctsplit = case_ctsplit + ctrl_ctsplit;
9743     if (fopen_checked(outname, "w", &outfile)) {
9744       goto epistasis_report_ret_OPEN_FAIL;
9745     }
9746     if (!parallel_idx) {
9747       wptr = memcpya(g_textbuf, "CHR1 ", 5);
9748       wptr = fw_strcpyn(plink_maxsnp, 4, "SNP1", wptr);
9749       wptr = memcpya(wptr, " CHR2 ", 6);
9750       wptr = fw_strcpyn(plink_maxsnp, 4, "SNP2", wptr);
9751       wptr = memcpya(wptr, "         STAT ", 14);
9752       if (is_boost) {
9753 	wptr = memcpya(wptr, "  DF ", 5);
9754       }
9755       if (!no_p_value) {
9756         wptr = memcpya(wptr, "           P ", 13);
9757       }
9758       *wptr++ = '\n';
9759       if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
9760 	goto epistasis_report_ret_WRITE_FAIL;
9761       }
9762     }
9763     // claim up to half of memory with idx1 bufs; each marker currently costs:
9764     //   (case_ctsplit + ctrl_ctsplit) * sizeof(intptr_t) for loose geno buf
9765     //   0.25 for missing tracker
9766     //   sizeof(int32_t) for offset (to skip bottom left triangle, and/or
9767     //     too-close pairs for case-only tests; will sometimes need to be
9768     //     larger when sets come into the picture
9769     //   sizeof(double) for best chisq,
9770     //   sizeof(int32_t) for best opposite ID,
9771     //   sizeof(int32_t) for N_SIG count,
9772     //   sizeof(int32_t) for per-site fail counts, and (bleah)
9773     //   marker_ct2 * sizeof(double) for the usually oversized results space
9774     cur_bigstack_left = bigstack_left();
9775     ulii = 4 * CACHELINE - 3 * sizeof(int32_t) + max_thread_ct * (5 * (CACHELINE - 4));
9776     if (cur_bigstack_left >= ulii) {
9777       cur_bigstack_left -= ulii;
9778     }
9779     ulii = tot_ctsplit * sizeof(intptr_t) + 4 * sizeof(int32_t) + sizeof(double) + marker_ct2 * sizeof(double);
9780     idx1_block_size = cur_bigstack_left / (ulii * 2 + 1);
9781     if (!idx1_block_size) {
9782       goto epistasis_report_ret_NOMEM;
9783     }
9784     if (idx1_block_size > job_size) {
9785       idx1_block_size = job_size;
9786     }
9787     // pad to avoid threads writing to same cacheline
9788     ulii = (max_thread_ct - 1) * 15 + idx1_block_size;
9789     // offsets[] isn't really needed, but barely takes any memory
9790     // if 'case-only', want two more offsets columns to store where the "too
9791     // close" variants are
9792     bigstack_alloc_ui(idx1_block_size * 2, &g_epi_geno1_offsets);
9793     bigstack_alloc_ul(tot_ctsplit * idx1_block_size, &g_epi_geno1);
9794     bigstack_alloc_ul(QUATERCT_TO_WORDCT(idx1_block_size), &g_epi_zmiss1);
9795     bigstack_alloc_d(idx1_block_size * marker_ct2, &g_epi_all_chisq);
9796     bigstack_alloc_d(ulii, &g_epi_best_chisq1);
9797     bigstack_alloc_ui(ulii, &g_epi_best_id1);
9798     bigstack_alloc_ui(ulii, &g_epi_n_sig_ct1);
9799     bigstack_alloc_ui(ulii, &g_epi_fail_ct1);
9800     for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
9801       g_epi_geno1[block_idx1 * tot_ctsplit + case_ctv3 - 1] = 0;
9802       g_epi_geno1[block_idx1 * tot_ctsplit + 2 * case_ctv3 - 1] = 0;
9803       g_epi_geno1[block_idx1 * tot_ctsplit + case_ctsplit - 1] = 0;
9804       g_epi_geno1[block_idx1 * tot_ctsplit + case_ctsplit + ctrl_ctv3 - 1] = 0;
9805       g_epi_geno1[block_idx1 * tot_ctsplit + case_ctsplit + 2 * ctrl_ctv3 - 1] = 0;
9806       g_epi_geno1[block_idx1 * tot_ctsplit + tot_ctsplit - 1] = 0;
9807     }
9808     if (is_triangular) {
9809       fill_uint_zero(2 * idx1_block_size, g_epi_geno1_offsets);
9810     }
9811     // don't actually need best_chisq2, best_id2, n_sig_ct2, fail_ct2 if not
9812     // triangular, but rather not complicate/duplicate the common case inner
9813     // loop for now
9814     ulii = tot_ctsplit * sizeof(intptr_t) + 1 + is_boost * 6 * sizeof(double) + tot_stride * sizeof(int32_t) + max_thread_ct * (3 * sizeof(int32_t) + sizeof(double));
9815     idx2_block_size = (bigstack_left() - CACHELINE - is_boost * (CACHELINE - 8) - max_thread_ct * (5 * (CACHELINE - 4))) / ulii;
9816     if (idx2_block_size > marker_ct2) {
9817       idx2_block_size = marker_ct2;
9818     }
9819     idx2_block_size = round_up_pow2(idx2_block_size, 16);
9820     bigstack_mark2 = g_bigstack_base;
9821     while (1) {
9822       if (!idx2_block_size) {
9823 	goto epistasis_report_ret_NOMEM;
9824       }
9825       if (!(bigstack_alloc_ul(tot_ctsplit * idx2_block_size, &g_epi_geno2) ||
9826             bigstack_alloc_ul(QUATERCT_TO_WORDCT(idx2_block_size), &g_epi_zmiss2) ||
9827 	    bigstack_alloc_ui(idx2_block_size * tot_stride, &g_epi_tot2) ||
9828 	    bigstack_alloc_d(max_thread_ct * idx2_block_size, &g_epi_best_chisq2) ||
9829 	    bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_best_id2) ||
9830 	    bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_n_sig_ct2) ||
9831 	    bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_fail_ct2))) {
9832 	if ((!is_boost) || (!bigstack_alloc_d(6 * idx2_block_size, &g_epi_boost_precalc2))) {
9833 	  break;
9834 	}
9835       }
9836       bigstack_reset(bigstack_mark2);
9837       idx2_block_size -= 16;
9838     }
9839     for (block_idx2 = 0; block_idx2 < idx2_block_size; block_idx2++) {
9840       g_epi_geno2[block_idx2 * tot_ctsplit + case_ctv3 - 1] = 0;
9841       g_epi_geno2[block_idx2 * tot_ctsplit + 2 * case_ctv3 - 1] = 0;
9842       g_epi_geno2[block_idx2 * tot_ctsplit + case_ctsplit - 1] = 0;
9843       g_epi_geno2[block_idx2 * tot_ctsplit + case_ctsplit + ctrl_ctv3 - 1] = 0;
9844       g_epi_geno2[block_idx2 * tot_ctsplit + case_ctsplit + 2 * ctrl_ctv3 - 1] = 0;
9845       g_epi_geno2[block_idx2 * tot_ctsplit + tot_ctsplit - 1] = 0;
9846     }
9847     marker_uidx = next_unset_ul_unsafe(marker_exclude1, marker_uidx_base);
9848     if (marker_idx1) {
9849       marker_uidx = jump_forward_unset_unsafe(marker_exclude1, marker_uidx + 1, marker_idx1);
9850     }
9851     wptr = memcpya(g_logbuf, "--fast-epistasis", 16);
9852     if (is_boost) {
9853       wptr = memcpya(wptr, " boost", 6);
9854     } else if (no_ueki) {
9855       wptr = memcpya(wptr, " no-ueki", 8);
9856     } else if (do_joint_effects) {
9857       wptr = memcpya(wptr, " joint-effects", 14);
9858     }
9859     if (is_case_only) {
9860       wptr = memcpya(wptr, " case-only", 10);
9861     }
9862     wptr = memcpya(wptr, " to ", 4);
9863     wptr = strcpya(wptr, outname);
9864     memcpy(wptr, " ... ", 6);
9865     wordwrapb(16); // strlen("99% [processing]")
9866     logprintb();
9867     fputs("0%", stdout);
9868     do {
9869       fputs(" [processing]", stdout);
9870       fflush(stdout);
9871       if (idx1_block_size > marker_idx1_end - marker_idx1) {
9872         idx1_block_size = marker_idx1_end - marker_idx1;
9873         if (idx1_block_size < max_thread_ct) {
9874 	  max_thread_ct = idx1_block_size;
9875 	  g_epi_thread_ct = max_thread_ct;
9876 	}
9877       }
9878       g_epi_marker_idx1 = marker_idx1;
9879       dptr = g_epi_all_chisq;
9880       dptr2 = &(g_epi_all_chisq[idx1_block_size * marker_ct2]);
9881       do {
9882 	*dptr++ = -1;
9883       } while (dptr < dptr2);
9884       marker_uidx_tmp = marker_uidx;
9885       if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
9886 	goto epistasis_report_ret_READ_FAIL;
9887       }
9888       cur_workload = idx1_block_size * marker_ct2;
9889       if (is_triangular) {
9890 	for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
9891 	  ulii = block_idx1 + marker_idx1 + 1;
9892 	  cur_workload -= ulii;
9893 	  // edit this during loading, when we have to know marker_uidx anyway,
9894 	  // if case-only
9895 	  g_epi_geno1_offsets[2 * block_idx1 + 1] = ulii;
9896 	}
9897       } else {
9898         fill_uint_zero(2 * idx1_block_size, g_epi_geno1_offsets);
9899 	marker_uidx2 = marker_uidx_base;
9900 	marker_idx2 = 0;
9901       }
9902       tests_complete += cur_workload;
9903       ulii = 0; // total number of tests
9904       g_epi_idx1_block_bounds[0] = 0;
9905       g_epi_idx1_block_bounds16[0] = 0;
9906       block_idx1 = 0;
9907       for (tidx = 1; tidx < max_thread_ct; tidx++) {
9908 	uljj = (((uint64_t)cur_workload) * tidx) / max_thread_ct;
9909 	if (is_triangular) {
9910 	  do {
9911 	    // slightly inaccurate for case-only due to the way --gap is
9912 	    // supported, but this doesn't affect any calculation results, only
9913 	    // the progress display
9914 	    ulii += marker_ct2 - g_epi_geno1_offsets[2 * block_idx1 + 1];
9915 	    block_idx1++;
9916 	  } while (ulii < uljj);
9917 	} else {
9918 	  do {
9919 	    ulii += marker_ct2;
9920 	    block_idx1++;
9921 	  } while (ulii < uljj);
9922 	}
9923 	uii = block_idx1 - g_epi_idx1_block_bounds[tidx - 1];
9924         g_epi_idx1_block_bounds[tidx] = block_idx1;
9925         g_epi_idx1_block_bounds16[tidx] = g_epi_idx1_block_bounds16[tidx - 1] + round_up_pow2_ui(uii, 16);
9926       }
9927       g_epi_idx1_block_bounds[max_thread_ct] = idx1_block_size;
9928       fill_ulong_zero(QUATERCT_TO_WORDCT(idx1_block_size), g_epi_zmiss1);
9929       chrom_end = 0;
9930       for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx_tmp++, block_idx1++) {
9931         if (IS_SET(marker_exclude1, marker_uidx_tmp)) {
9932 	  marker_uidx_tmp = next_unset_ul_unsafe(marker_exclude1, marker_uidx_tmp);
9933           if (fseeko(bedfile, bed_offset + (marker_uidx_tmp * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
9934 	    goto epistasis_report_ret_READ_FAIL;
9935 	  }
9936 	}
9937 	if (load_and_split3(bedfile, loadbuf, unfiltered_sample_ct, &(g_epi_geno1[block_idx1 * tot_ctsplit]), pheno_nm, pheno_c, case_ctv3, ctrl_ctv3, IS_SET(marker_reverse, marker_uidx_tmp), is_case_only, &ulii)) {
9938 	  goto epistasis_report_ret_READ_FAIL;
9939 	}
9940 	if (ulii) {
9941 	  g_epi_zmiss1[block_idx1 / BITCT2] |= ulii << (2 * (block_idx1 % BITCT2));
9942 	  // g_epi_tot1 doesn't need to exist, better for each thread to
9943 	  // determine those totals on the fly
9944 	}
9945 	if (is_case_only_window) {
9946 	  cur_window_end = marker_pos[marker_uidx_tmp] + case_only_gap;
9947 	  if (marker_uidx_tmp >= chrom_end) {
9948 	    chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx_tmp);
9949 	    chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
9950 	    if (is_triangular) {
9951 	      marker_uidx2 = marker_uidx_tmp;
9952 	      marker_idx2 = block_idx1 + marker_idx1;
9953 	      last_pos = marker_pos[marker_uidx_tmp];
9954 	    } else {
9955 	      uii = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx];
9956 	      if (marker_pos[marker_uidx_tmp] < case_only_gap) {
9957 		ujj = 0;
9958 	      } else {
9959 		ujj = marker_pos[marker_uidx_tmp] + 1 - case_only_gap;
9960 	      }
9961 	      marker_uidx2_trail = next_unset(marker_exclude2, uii + uint32arr_greater_than(&(marker_pos[uii]), marker_uidx_tmp + 1 - uii, ujj), chrom_end);
9962 	      marker_idx2_trail = marker_uidx2_trail - popcount_bit_idx(marker_exclude2, 0, marker_uidx2_trail);
9963 	      if (marker_uidx2_trail < chrom_end) {
9964 		first_pos = marker_pos[marker_uidx2_trail];
9965 		// this could be more efficient, but not a big deal since
9966 		// there aren't many chromosomes
9967 	        marker_uidx2 = next_unset(marker_exclude2, uii + uint32arr_greater_than(&(marker_pos[marker_uidx_tmp]), chrom_end - marker_uidx_tmp, cur_window_end), chrom_end);
9968 	      } else {
9969 		first_pos = 0x7fffffffU;
9970 		marker_uidx2 = chrom_end;
9971 	      }
9972 	      marker_idx2 = marker_idx2_trail + marker_uidx2 - marker_uidx2_trail - popcount_bit_idx(marker_exclude2, marker_uidx2_trail, marker_uidx2);
9973 	      if (marker_uidx2 < chrom_end) {
9974 		last_pos = marker_pos[marker_uidx2];
9975 	      } else {
9976 		last_pos = 0xffffffffU;
9977 	      }
9978 	    }
9979 	  }
9980 	  while (last_pos < cur_window_end) {
9981 	    marker_idx2++;
9982 	    marker_uidx2++;
9983 	    next_unset_ul_ck(marker_exclude2, chrom_end, &marker_uidx2);
9984 	    if (marker_uidx2 != chrom_end) {
9985 	      last_pos = marker_pos[marker_uidx2];
9986 	    } else {
9987 	      last_pos = 0xffffffffU;
9988 	    }
9989 	  }
9990 	  if (is_triangular) {
9991 	    ulii = block_idx1 + marker_idx1;
9992             gap_cts[ulii] += marker_idx2 - ulii - 1;
9993 	    while (++ulii < marker_idx2) {
9994 	      gap_cts[ulii] += 1;
9995 	    }
9996 	    g_epi_geno1_offsets[2 * block_idx1 + 1] = marker_idx2;
9997 	  } else {
9998 	    uii = marker_pos[marker_uidx_tmp];
9999 	    while (first_pos + case_only_gap <= uii) {
10000 	      marker_idx2_trail++;
10001 	      marker_uidx2_trail++;
10002 	      next_unset_ul_ck(marker_exclude2, chrom_end, &marker_uidx2_trail);
10003               if (marker_uidx2_trail != chrom_end) {
10004 		first_pos = marker_pos[marker_uidx2_trail];
10005 	      } else {
10006 		first_pos = 0x7fffffffU;
10007 	      }
10008 	    }
10009 	    if (marker_idx2 > marker_idx2_trail) {
10010 	      g_epi_geno1_offsets[2 * block_idx1] = marker_idx2_trail;
10011 	      g_epi_geno1_offsets[2 * block_idx1 + 1] = marker_idx2;
10012 	      gap_cts[block_idx1 + marker_idx1] = marker_idx2 - marker_idx2_trail;
10013 	    }
10014 	  }
10015 	} else if (!is_triangular) {
10016           if (!IS_SET(marker_exclude2, marker_uidx_tmp)) {
10017 	    // do not compare against self
10018 	    marker_idx2 += marker_uidx_tmp - marker_uidx2 - popcount_bit_idx(marker_exclude2, marker_uidx2, marker_uidx_tmp);
10019 	    marker_uidx2 = marker_uidx_tmp;
10020 	    g_epi_geno1_offsets[2 * block_idx1] = marker_idx2;
10021 	    g_epi_geno1_offsets[2 * block_idx1 + 1] = marker_idx2 + 1;
10022 	    gap_cts[block_idx1 + marker_idx1] = 1;
10023 	  }
10024 	}
10025       }
10026       marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
10027       if (is_triangular) {
10028 	marker_idx2 = marker_idx1 + 1;
10029         marker_uidx2 = jump_forward_unset_unsafe(marker_exclude2, marker_uidx2 + 1, marker_idx2);
10030       } else {
10031         marker_idx2 = 0;
10032       }
10033       if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
10034 	goto epistasis_report_ret_READ_FAIL;
10035       }
10036       cur_idx2_block_size = idx2_block_size;
10037       do {
10038 	if (cur_idx2_block_size > marker_ct2 - marker_idx2) {
10039 	  cur_idx2_block_size = marker_ct2 - marker_idx2;
10040 	}
10041 	fill_ulong_zero(QUATERCT_TO_WORDCT(cur_idx2_block_size), g_epi_zmiss2);
10042         for (block_idx2 = 0; block_idx2 < cur_idx2_block_size; marker_uidx2++, block_idx2++) {
10043           if (IS_SET(marker_exclude2, marker_uidx2)) {
10044 	    marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx2);
10045             if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
10046 	      goto epistasis_report_ret_READ_FAIL;
10047 	    }
10048 	  }
10049 	  ulptr = &(g_epi_geno2[block_idx2 * tot_ctsplit]);
10050 	  if (load_and_split3(bedfile, loadbuf, unfiltered_sample_ct, ulptr, pheno_nm, pheno_c, case_ctv3, ctrl_ctv3, IS_SET(marker_reverse, marker_uidx2), is_case_only, &ulii)) {
10051 	    goto epistasis_report_ret_READ_FAIL;
10052 	  }
10053 	  uiptr = &(g_epi_tot2[block_idx2 * tot_stride]);
10054 	  uiptr[0] = popcount_longs(ulptr, case_ctv3);
10055 	  uiptr[1] = popcount_longs(&(ulptr[case_ctv3]), case_ctv3);
10056 	  uiptr[2] = popcount_longs(&(ulptr[2 * case_ctv3]), case_ctv3);
10057 	  if (!is_case_only) {
10058 	    ulptr = &(ulptr[case_ctv3 * 3]);
10059 	    uiptr[3] = popcount_longs(ulptr, ctrl_ctv3);
10060 	    uiptr[4] = popcount_longs(&(ulptr[ctrl_ctv3]), ctrl_ctv3);
10061 	    uiptr[5] = popcount_longs(&(ulptr[2 * ctrl_ctv3]), ctrl_ctv3);
10062 	    if (is_boost) {
10063 	      boost_calc_p_bc(uiptr[0], uiptr[1], uiptr[2], uiptr[3], uiptr[4], uiptr[5], &(g_epi_boost_precalc2[block_idx2 * 6]));
10064 	    }
10065 	  }
10066 	  if (ulii) {
10067 	    g_epi_zmiss2[block_idx2 / BITCT2] |= ulii << (2 * (block_idx2 % BITCT2));
10068 	  }
10069 	}
10070 	g_epi_idx2_block_size = cur_idx2_block_size;
10071 	g_epi_idx2_block_start = marker_idx2;
10072 	idx2_block_sizea16 = round_up_pow2(cur_idx2_block_size, 16);
10073         fill_uint_zero(idx1_block_size + 15 * (max_thread_ct - 1), g_epi_n_sig_ct1);
10074 	fill_uint_zero(idx1_block_size + 15 * (max_thread_ct - 1), g_epi_fail_ct1);
10075         fill_uint_zero(idx2_block_sizea16 * max_thread_ct, g_epi_n_sig_ct2);
10076 	fill_uint_zero(idx2_block_sizea16 * max_thread_ct, g_epi_fail_ct2);
10077 	for (tidx = 0; tidx < max_thread_ct; tidx++) {
10078 	  ulii = g_epi_idx1_block_bounds[tidx];
10079 	  uljj = g_epi_idx1_block_bounds[tidx + 1];
10080 	  memcpy(&(g_epi_best_chisq1[g_epi_idx1_block_bounds16[tidx]]), &(g_epi_all_chisq[marker_idx1 + ulii]), (uljj - ulii) * sizeof(double));
10081 	  ulii = g_epi_geno1_offsets[2 * ulii + 1];
10082 	  if (ulii < marker_idx2 + cur_idx2_block_size) {
10083 	    if (ulii <= marker_idx2) {
10084 	      ulii = 0;
10085 	    } else {
10086 	      ulii -= marker_idx2;
10087 	    }
10088 	    memcpy(&(g_epi_best_chisq2[tidx * idx2_block_sizea16 + ulii]), &(g_epi_all_chisq[marker_idx2 + ulii]), (cur_idx2_block_size - ulii) * sizeof(double));
10089 	  }
10090 	  // no need to initialize IDs since they are only referenced when a
10091 	  // higher chisq value is present, and when that happens an ID is
10092           // always written
10093 	}
10094 	is_last_block = (marker_idx2 + cur_idx2_block_size >= marker_ct2);
10095 	if (spawn_threads2(threads, &fast_epi_thread, max_thread_ct, is_last_block)) {
10096 	  goto epistasis_report_ret_THREAD_CREATE_FAIL;
10097 	}
10098 	fast_epi_thread((void*)0);
10099 	join_threads2(threads, max_thread_ct, is_last_block);
10100 	// merge best_chisq, best_ids, fail_cts
10101 	for (tidx = 0; tidx < max_thread_ct; tidx++) {
10102 	  ulii = g_epi_idx1_block_bounds[tidx];
10103 	  uljj = g_epi_idx1_block_bounds[tidx + 1] - ulii;
10104 	  uii = g_epi_idx1_block_bounds16[tidx];
10105 	  dptr = &(g_epi_best_chisq1[uii]);
10106 	  uiptr = &(g_epi_best_id1[uii]);
10107 	  uiptr2 = &(g_epi_n_sig_ct1[uii]);
10108 	  uiptr3 = &(g_epi_fail_ct1[uii]);
10109 	  ulii += marker_idx1;
10110           dptr2 = &(best_chisq[ulii]);
10111           uiptr4 = &(n_sig_cts[ulii]);
10112           uiptr5 = &(fail_cts[ulii]);
10113 	  for (block_idx1 = 0; block_idx1 < uljj; block_idx1++, dptr2++, uiptr4++, uiptr5++) {
10114 	    dxx = *dptr++;
10115 	    if (dxx > (*dptr2)) {
10116 	      *dptr2 = dxx;
10117 	      best_ids[block_idx1 + ulii] = uiptr[block_idx1];
10118 	    }
10119             *uiptr4 += *uiptr2++;
10120             *uiptr5 += *uiptr3++;
10121 	  }
10122 	}
10123 	if (is_triangular) {
10124 	  for (tidx = 0; tidx < max_thread_ct; tidx++) {
10125 	    block_idx2 = g_epi_geno1_offsets[2 * g_epi_idx1_block_bounds[tidx] + 1];
10126 	    if (block_idx2 <= marker_idx2) {
10127 	      block_idx2 = 0;
10128 	    } else {
10129 	      block_idx2 -= marker_idx2;
10130 	    }
10131 	    dptr = &(g_epi_best_chisq2[tidx * idx2_block_sizea16 + block_idx2]);
10132 	    uiptr = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
10133 	    uiptr2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16 + block_idx2]);
10134 	    uiptr3 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16 + block_idx2]);
10135 	    dptr2 = &(best_chisq[block_idx2 + marker_idx2]);
10136 	    uiptr4 = &(n_sig_cts[block_idx2 + marker_idx2]);
10137 	    uiptr5 = &(fail_cts[block_idx2 + marker_idx2]);
10138 	    for (; block_idx2 < cur_idx2_block_size; block_idx2++, dptr2++, uiptr4++, uiptr5++) {
10139 	      dxx = *dptr++;
10140 	      if (dxx > (*dptr2)) {
10141 		*dptr2 = dxx;
10142 		best_ids[block_idx2 + marker_idx2] = uiptr[block_idx2];
10143 	      }
10144 	      *uiptr4 += *uiptr2++;
10145 	      *uiptr5 += *uiptr3++;
10146 	    }
10147 	  }
10148 	}
10149         marker_idx2 += cur_idx2_block_size;
10150       } while (marker_idx2 < marker_ct2);
10151       fputs("\b\b\b\b\b\b\b\b\b\b\bwriting]   \b\b\b", stdout);
10152       fflush(stdout);
10153       chrom_end = 0;
10154       block_idx1 = 0;
10155       while (1) {
10156 	next_unset_ul_unsafe_ck(marker_exclude1, &marker_uidx);
10157 	ujj = g_epi_geno1_offsets[2 * block_idx1];
10158 	marker_idx2 = 0;
10159 	dptr = &(g_epi_all_chisq[block_idx1 * marker_ct2]);
10160 	if (marker_uidx >= chrom_end) {
10161 	  chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx);
10162 	  chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
10163 	  chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
10164 	}
10165         wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf));
10166 	*wptr_start++ = ' ';
10167 	wptr_start = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
10168 	*wptr_start++ = ' ';
10169 	marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
10170 	for (chrom_fo_idx2 = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx2); chrom_fo_idx2 < chrom_ct; chrom_fo_idx2++) {
10171 	  chrom_end2 = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx2 + 1];
10172 	  if (marker_uidx2 >= chrom_end2) {
10173 	    continue;
10174 	  }
10175           chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
10176           wptr_start2 = width_force(4, wptr_start, chrom_name_write(chrom_info_ptr, chrom_idx2, wptr_start));
10177 	  *wptr_start2++ = ' ';
10178 	  for (; marker_uidx2 < chrom_end2; ++marker_uidx2, next_unset_ul_ck(marker_exclude2, unfiltered_marker_ct, &marker_uidx2), ++marker_idx2, ++dptr) {
10179 	    if (marker_idx2 == ujj) {
10180 	      marker_idx2 = g_epi_geno1_offsets[2 * block_idx1 + 1];
10181 	      if (marker_idx2 == marker_ct2) {
10182 		goto epistasis_report_write_loop;
10183 	      }
10184 	      if (marker_idx2 > ujj) {
10185 	        marker_uidx2 = jump_forward_unset_unsafe(marker_exclude2, marker_uidx2 + 1, marker_idx2 - ujj);
10186 	        dptr = &(dptr[marker_idx2 - ujj]);
10187 	        if (marker_uidx2 >= chrom_end2) {
10188 		  break;
10189 	        }
10190 	      }
10191 	    } else if (marker_idx2 == marker_ct2) {
10192 	      goto epistasis_report_write_loop;
10193 	    }
10194 	    dxx = *dptr;
10195 	    if (dxx != -1) {
10196 	      wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr_start2);
10197 	      *wptr++ = ' ';
10198 	      if (is_boost) {
10199 		if (dxx == dxx) { // not nan
10200 		  memcpy(ularr, &dxx, sizeof(double));
10201 		  uii = 4 >> (ularr[0] & 3);
10202 		  // don't want ugly e-324s when zero belongs
10203 		  ularr[0] &= ~(3 * ONELU);
10204 		  memcpy(&dxx, ularr, sizeof(double));
10205 		  wptr = width_force(12, wptr, dtoa_g(dxx, wptr));
10206 		  wptr = memseta(wptr, 32, 4);
10207 		  *wptr++ = '0' + uii;
10208 		  *wptr++ = ' ';
10209 		} else {
10210                   wptr = memcpya(wptr, "         nan    0 ", 18);
10211 		  uii = 0;
10212 		}
10213 	      } else if (!no_ueki) {
10214 		wptr = width_force(12, wptr, dtoa_g(dxx, wptr));
10215 		*wptr++ = ' ';
10216 	      } else {
10217 		// lower precision compatibility mode
10218                 wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
10219 	      }
10220 	      if (!no_p_value) {
10221 		if (!is_boost) {
10222 		  dxx = normdist(-sqrt(dxx)) * 2;
10223 		  wptr = dtoa_g_wxp4x(MAXV(dxx, output_min_p), 12, ' ', wptr);
10224 		} else if (uii) {
10225 		  dxx = chiprob_p(dxx, uii);
10226 		  wptr = dtoa_g_wxp4x(MAXV(dxx, output_min_p), 12, ' ', wptr);
10227 		} else {
10228 		  wptr = memcpya(wptr, "          NA ", 13);
10229 		}
10230 	      }
10231 	      *wptr++ = '\n';
10232 	      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
10233 		goto epistasis_report_ret_WRITE_FAIL;
10234 	      }
10235 	      // could remove this writeback in --epi1 1 case
10236 	      *dptr = -1;
10237 	    }
10238 	  }
10239 	}
10240       epistasis_report_write_loop:
10241 	block_idx1++;
10242 	marker_uidx++;
10243 	if (block_idx1 >= idx1_block_size) {
10244 	  break;
10245 	}
10246       }
10247       marker_idx1 += idx1_block_size;
10248       fputs("\b\b\b\b\b\b\b\b\b\b          \b\b\b\b\b\b\b\b\b\b", stdout);
10249       if (tests_complete >= pct_thresh) {
10250 	if (pct > 10) {
10251 	  putc_unlocked('\b', stdout);
10252 	}
10253 	pct = (tests_complete * 100LLU) / tests_expected;
10254 	if (pct < 100) {
10255 	  printf("\b\b%" PRIuPTR "%%", pct);
10256 	  fflush(stdout);
10257 	  pct_thresh = ((++pct) * ((uint64_t)tests_expected)) / 100;
10258 	}
10259       }
10260     } while (marker_idx1 < marker_idx1_end);
10261     if (fclose_null(&outfile)) {
10262       goto epistasis_report_ret_WRITE_FAIL;
10263     }
10264   }
10265   memcpy(&(outname_end[7]), ".summary", 9);
10266   if (parallel_tot > 1) {
10267     outname_end[15] = '.';
10268     uint32toa_x(parallel_idx + 1, '\0', &(outname_end[16]));
10269   }
10270   if (fopen_checked(outname, "w", &outfile)) {
10271     goto epistasis_report_ret_OPEN_FAIL;
10272   }
10273   wptr = memcpya(g_textbuf, " CHR ", 5);
10274   wptr = fw_strcpyn(plink_maxsnp, 3, "SNP", wptr);
10275   if (parallel_tot == 1) {
10276     wptr = strcpya(wptr, "        N_SIG        N_TOT         PROP   BEST_CHISQ BEST_CHR ");
10277   } else {
10278     wptr = strcpya(wptr, "        N_SIG        N_TOT   BEST_CHISQ BEST_CHR ");
10279   }
10280   wptr = fw_strcpyn(plink_maxsnp, 8, "BEST_SNP", wptr);
10281   wptr = memcpya(wptr, " \n", 2);
10282   if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
10283     goto epistasis_report_ret_WRITE_FAIL;
10284   }
10285   bigstack_reset(bigstack_mark3);
10286   if (bigstack_alloc_ui(marker_ct1, &marker_idx_to_uidx)) {
10287     goto epistasis_report_ret_NOMEM;
10288   }
10289   fill_idx_to_uidx(marker_exclude2, unfiltered_marker_ct, marker_ct2, marker_idx_to_uidx);
10290   marker_idx1 = marker_idx1_start;
10291   marker_uidx = next_unset_ul_unsafe(marker_exclude1, marker_uidx_base);
10292   if (marker_idx1) {
10293     marker_uidx = jump_forward_unset_unsafe(marker_exclude1, marker_uidx + 1, marker_idx1);
10294   }
10295   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_ct; chrom_fo_idx++) {
10296     chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
10297     if (marker_uidx >= chrom_end) {
10298       continue;
10299     }
10300     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
10301     wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf));
10302     *wptr_start++ = ' ';
10303     for (; marker_uidx < chrom_end; marker_uidx++, next_unset_ul_ck(marker_exclude1, unfiltered_marker_ct, &marker_uidx), marker_idx1++) {
10304       uii = n_sig_cts[marker_idx1];
10305       ujj = fail_cts[marker_idx1];
10306       if (gap_cts) {
10307 	ujj += gap_cts[marker_idx1];
10308       }
10309       tests_thrown_out += (uint64_t)ujj;
10310       // number of tests attempted in this run:
10311       // * if set1 and set2 are identical, there are
10312       //   marker_ct2 - 1 - marker_idx1_start cells between the row and the
10313       //   same-index column
10314       // * otherwise, gap_cts[] counted the number of skipped cells
10315       if (marker_idx1 < marker_idx1_end) {
10316 	if (is_triangular) {
10317 	  ujj = marker_ct2 - 1 - marker_idx1_start - ujj;
10318 	} else {
10319 	  ujj = marker_ct2 - ujj;
10320 	}
10321       } else {
10322 	// --parallel bugfix
10323 	ujj = job_size - ujj;
10324       }
10325       wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
10326       wptr = memseta(wptr, 32, 3);
10327       wptr = uint32toa_w10(uii, wptr);
10328       wptr = memseta(wptr, 32, 3);
10329       wptr = uint32toa_w10x(ujj, ' ', wptr);
10330       if (parallel_tot == 1) {
10331         wptr = dtoa_g_wxp4x(((double)((int32_t)uii)) / ((double)((int32_t)ujj)), 12, ' ', wptr);
10332       }
10333       if (ujj) {
10334 	if (parallel_tot == 1) {
10335 	  // or cat mode
10336 	  wptr = dtoa_g_wxp4x(best_chisq[marker_idx1], 12, ' ', wptr);
10337 	} else {
10338 	  // greater precision for accurate merges
10339 	  wptr = dtoa_g_wxp8x(best_chisq[marker_idx1], 12, ' ', wptr);
10340 	}
10341 	uii = marker_idx_to_uidx[best_ids[marker_idx1]];
10342 	wptr = width_force(4, wptr, chrom_name_write(chrom_info_ptr, get_variant_chrom(chrom_info_ptr, uii), wptr));
10343 	*wptr++ = ' ';
10344 	wptr = fw_strcpy(plink_maxsnp, &(marker_ids[uii * max_marker_id_len]), wptr);
10345       } else {
10346 	wptr = memcpya(wptr, "          NA   NA", 17);
10347 	wptr = memseta(wptr, 32, plink_maxsnp - 1);
10348 	wptr = memcpya(wptr, "NA", 2);
10349       }
10350       wptr = memcpya(wptr, " \n", 2);
10351       if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
10352 	goto epistasis_report_ret_WRITE_FAIL;
10353       }
10354     }
10355   }
10356   if (is_triangular) {
10357     tests_thrown_out /= 2; // all fails double-counted in triangle case
10358   }
10359   fputs("\b\b", stdout);
10360   LOGPRINTF("done.\n");
10361   LOGPRINTFWW("%" PRIu64 " valid test%s performed, summary written to %s .\n", tests_expected - tests_thrown_out, (tests_expected - tests_thrown_out == 1)? "" : "s", outname);
10362 
10363   while (0) {
10364   epistasis_report_ret_NOMEM:
10365     retval = RET_NOMEM;
10366     break;
10367   epistasis_report_ret_OPEN_FAIL:
10368     retval = RET_OPEN_FAIL;
10369     break;
10370   epistasis_report_ret_READ_FAIL:
10371     retval = RET_READ_FAIL;
10372     break;
10373   epistasis_report_ret_WRITE_FAIL:
10374     retval = RET_WRITE_FAIL;
10375     break;
10376   epistasis_report_ret_TOO_FEW_MARKERS:
10377     if (pheno_d) {
10378       if (is_triangular) {
10379         logerrprint("Error: --epistasis requires 2+ non-monomorphic autosomal diploid loci.\n");
10380       } else {
10381         logerrprint("Error: Each --epistasis set must contain at least one non-monomorphic autosomal\ndiploid site.\n");
10382       }
10383     } else {
10384       if (is_triangular) {
10385         logerrprint("Error: --{fast-}epistasis requires 2+ autosomal diploid loci not monomorphic in\neither cases or controls.\n");
10386       } else {
10387         logerrprint("Error: Each --{fast-}epistasis set must contain at least one autosomal diploid\nlocus not monomorphic in either cases or controls.\n");
10388       }
10389     }
10390     retval = RET_INVALID_CMDLINE;
10391     break;
10392   epistasis_report_ret_INVALID_CMDLINE_2:
10393     logerrprintb();
10394   epistasis_report_ret_INVALID_CMDLINE:
10395     retval = RET_INVALID_CMDLINE;
10396     break;
10397   epistasis_report_ret_THREAD_CREATE_FAIL:
10398     retval = RET_THREAD_CREATE_FAIL;
10399     break;
10400   }
10401  epistasis_report_ret_1:
10402   fclose_cond(outfile);
10403   bigstack_reset(bigstack_mark);
10404   return retval;
10405 }
10406 
// Implements --indep-pairphase: sliding-window LD pruning over the variants
// that are not already excluded by marker_exclude, using founders only.
//
// Like ld_prune(), except that it computes the full 3x3 contingency table,
// and is always in pairwise mode.
//
// For every pair of still-live variants in the current window, a genotype
// count table is built, em_phase_hethet_nobase() turns it into haplotype
// frequency estimates, and r^2 is derived from those.  When r^2 exceeds
// ldip->prune_last_param * (1 + SMALL_EPSILON), one member of the pair is
// flagged in pruned_arr: the first variant if its get_maf() value is lower
// than (1 - SMALL_EPSILON) times the second's, otherwise the second.  The
// window then advances by ldip->prune_window_incr variants.  Results are
// handed to ld_prune_write() at the end.
//
// Returns 0 on success (also when the whole step is skipped because there
// are fewer than two founders), RET_INVALID_FORMAT when fewer than two
// variants remain, RET_NOMEM / RET_READ_FAIL on allocation / file read
// failure, or the nonzero code returned by ld_prune_write().  Every bigstack
// allocation made here is released before returning.
int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uint32_t* marker_pos, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uintptr_t* sex_male, char* outname, char* outname_end, uint32_t hh_exists) {
  // Like ld_prune(), except that it computes the full 3x3 contingency table,
  // and is always in pairwise mode.
  unsigned char* bigstack_mark = g_bigstack_base;
  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
  // bytes per variant record in the .bed file (four samples per byte)
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
  uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctv2 / 2);
  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
  uintptr_t founder_ctv3 = BITCT_TO_ALIGNED_WORDCT(founder_ct);
  // no actual case/control split here, but keep the variables the same to
  // minimize divergence from ld_report_dprime()
  // (each window slot in geno[] holds three bitvectors of founder_ctv3 words)
  uintptr_t founder_ctsplit = 3 * founder_ctv3;
  uintptr_t final_mask = get_final_mask(founder_ct);
  uintptr_t window_max = 1;
  uintptr_t* founder_include2 = nullptr;
  uintptr_t* founder_male_include2 = nullptr;
  uintptr_t* sex_male_collapsed = nullptr;
  uintptr_t* cur_geno1_male = nullptr;
  // threshold is scaled by (1 + SMALL_EPSILON), presumably as a
  // floating-point tolerance
  double prune_ld_thresh = ldip->prune_last_param * (1 + SMALL_EPSILON);
  uint32_t window_is_kb = (ldip->modifier / LD_PRUNE_KB_WINDOW) & 1;
  uint32_t ld_window_size = ldip->prune_window_size;
  uint32_t ld_window_incr = ldip->prune_window_incr;
  uint32_t tot_exclude_ct = 0;
  uint32_t at_least_one_prune = 0;
  uint32_t chrom_code_end = chrom_info_ptr->max_code + 1 + chrom_info_ptr->name_ct;
  int32_t retval = 0;
  // tot1[0..2]: copy of cur_tots for the first variant of a pair;
  // tot1[3..5]: the same three popcounts restricted to males (X only)
  uint32_t tot1[6];
  // counts[0..8]: 3x3 table for the pair; counts[9..17]: male-only table,
  // filled only when is_x is set
  uint32_t counts[18];
  uintptr_t* loadbuf_raw;
  uintptr_t* loadbuf;
  uintptr_t* dummy_nm;
  // copy of marker_exclude with an extra bit set for every pruned variant
  uintptr_t* pruned_arr;
  // window_max slots of founder_ctsplit words each
  uintptr_t* geno;
  // one bit per window slot; set when load_and_split3() reported 3 for it
  uintptr_t* zmiss;
  uintptr_t* cur_geno1;
  uintptr_t* cur_geno2;
  // live_indices[i]: unfiltered variant index occupying window slot i
  uint32_t* live_indices;
  // start_arr[i]: unfiltered index of the first variant which slot i still
  // needs to be compared against
  uint32_t* start_arr;
  // three popcounts per window slot, one per bitvector in geno[]
  uint32_t* cur_tots;
  uint32_t* cur_tot2;
  uintptr_t window_maxl;
  uintptr_t cur_exclude_ct;
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t ulkk;
  double freq1x;
  double freq2x;
  double freqx1;
  double freqx2;
  double freq11;
  double freq11_expected;
  double rsq;
  uint32_t pct_thresh;
  uint32_t window_unfiltered_start;
  uint32_t window_unfiltered_end;
  uint32_t cur_window_size;
  uint32_t cur_chrom;
  uint32_t chrom_start;
  uint32_t chrom_end;
  uint32_t is_haploid;
  uint32_t is_x;
  uint32_t is_y;
  uint32_t prev_end;
  uint32_t nm_fixed;
  uint32_t cur_zmiss2;
  uint32_t pct;
  uint32_t uii;
  if (founder_ct < 2) {
    // not an error: retval stays 0 and nothing is written
    logerrprint("Warning: Skipping --indep-pairphase since there are less than two founders.\n(--make-founders may come in handy here.)\n");
    goto indep_pairphase_ret_1;
  }
  // Chromosome 0 variants are ignored.  When zero_extra_chroms is set, the
  // variants on codes above max_code are counted along with them and
  // chrom_code_end is lowered so that those codes are never visited below.
  if (is_set(chrom_info_ptr->chrom_mask, 0)) {
    ulii = count_chrom_markers(chrom_info_ptr, marker_exclude, 0);
    if (chrom_info_ptr->zero_extra_chroms) {
      for (uii = chrom_info_ptr->max_code + 1; uii < chrom_code_end; uii++) {
	ulii += count_chrom_markers(chrom_info_ptr, marker_exclude, uii);
      }
      chrom_code_end = chrom_info_ptr->max_code + 1;
    }
    marker_ct -= ulii;
    LOGPRINTF("--indep-pairphase: Ignoring %" PRIuPTR " chromosome 0 variant%s.\n", ulii, (ulii == 1)? "" : "s");
  }
  if (marker_ct < 2) {
    logerrprint("Error: Too few variants for --indep-pairphase.\n");
    goto indep_pairphase_ret_INVALID_FORMAT;
  }

  // no need to force founder_male_include2 initialization here
  if (alloc_collapsed_haploid_filters(founder_info, sex_male, unfiltered_sample_ct, founder_ct, hh_exists, 1, &founder_include2, &founder_male_include2)) {
    goto indep_pairphase_ret_NOMEM;
  }

  if (window_is_kb) {
    // determine maximum number of markers that may need to be loaded at once
    // (ld_window_size is in kilobases in this mode, hence the * 1000)
    for (cur_chrom = 1; cur_chrom < chrom_code_end; cur_chrom++) {
      if (is_set(chrom_info_ptr->chrom_mask, cur_chrom)) {
	window_max = chrom_window_max(marker_pos, marker_exclude, chrom_info_ptr, cur_chrom, 0x7fffffff, ld_window_size * 1000, window_max);
      }
    }
  }

  window_unfiltered_start = ld_prune_next_valid_chrom_start(marker_exclude, 0, chrom_info_ptr, chrom_code_end, unfiltered_marker_ct);

  if (bigstack_alloc_ul(unfiltered_marker_ctl, &pruned_arr)) {
    goto indep_pairphase_ret_NOMEM;
  }

  // start from the caller's exclusion mask; pruned variants get added to it
  memcpy(pruned_arr, marker_exclude, unfiltered_marker_ctl * sizeof(intptr_t));

  if (!window_is_kb) {
    // variant-count window: the window size itself bounds the slot count
    window_max = ld_window_size;
  }
  window_maxl = BITCT_TO_WORDCT(window_max);
  if (bigstack_alloc_ui(window_max, &live_indices) ||
      bigstack_alloc_ui(window_max, &start_arr) ||
      bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf_raw) ||
      bigstack_alloc_ul(founder_ctl * 2, &loadbuf) ||
      bigstack_alloc_ul(founder_ctl, &dummy_nm) ||
      bigstack_alloc_ul(founder_ctsplit * window_max, &geno) ||
      bigstack_alloc_ul(window_maxl, &zmiss) ||
      bigstack_alloc_ui(window_max * 3, &cur_tots)) {
    goto indep_pairphase_ret_NOMEM;
  }
  // Zero the trailing words of loadbuf and of each bitvector in every geno[]
  // slot up front -- presumably so that padding past founder_ct never
  // contributes to later popcounts.
  loadbuf[founder_ctl * 2 - 2] = 0;
  loadbuf[founder_ctl * 2 - 1] = 0;
  fill_all_bits(founder_ct, dummy_nm);
  // bugfix: this loop must start at 0, not 1
  for (ulii = 0; ulii < window_max; ulii++) {
    geno[ulii * founder_ctsplit + founder_ctv3 - 1] = 0;
    geno[ulii * founder_ctsplit + 2 * founder_ctv3 - 1] = 0;
    geno[ulii * founder_ctsplit + founder_ctsplit - 1] = 0;
  }
  // The male mask and scratch buffer are only needed when chrX is defined and
  // selected; they are dereferenced below only when is_x is set.
  if ((chrom_info_ptr->xymt_codes[X_OFFSET] != -2) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->xymt_codes[X_OFFSET])) {
    if (bigstack_alloc_ul(founder_ctl, &sex_male_collapsed) ||
        bigstack_alloc_ul(founder_ctsplit, &cur_geno1_male)) {
      goto indep_pairphase_ret_NOMEM;
    }
    copy_bitarr_subset(sex_male, founder_info, unfiltered_sample_ct, founder_ct, sex_male_collapsed);
  }
  // One iteration of this outer loop per chromosome.
  do {
    prev_end = 0;
    ld_prune_start_chrom(window_is_kb, &cur_chrom, &chrom_end, window_unfiltered_start, live_indices, start_arr, &window_unfiltered_end, ld_window_size, &cur_window_size, unfiltered_marker_ct, pruned_arr, chrom_info_ptr, marker_pos, &is_haploid, &is_x, &is_y);
    cur_exclude_ct = 0;
    fill_ulong_zero(window_maxl, zmiss);
    // Load genotype data for every slot of the chromosome's first window.
    if (cur_window_size > 1) {
      for (ulii = 0; ulii < (uintptr_t)cur_window_size; ulii++) {
	uljj = live_indices[ulii];
	if (fseeko(bedfile, bed_offset + (uljj * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
	  goto indep_pairphase_ret_READ_FAIL;
	}
	if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, uljj), bedfile, loadbuf_raw, loadbuf)) {
	  goto indep_pairphase_ret_READ_FAIL;
	}
	if (is_haploid && hh_exists) {
	  haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)loadbuf);
	}
	cur_geno1 = &(geno[ulii * founder_ctsplit]);
	load_and_split3(nullptr, loadbuf, founder_ct, cur_geno1, dummy_nm, dummy_nm, founder_ctv3, 0, 0, 1, &ulkk);
	cur_tots[ulii * 3] = popcount_longs(cur_geno1, founder_ctv3);
	cur_tots[ulii * 3 + 1] = popcount_longs(&(cur_geno1[founder_ctv3]), founder_ctv3);
	cur_tots[ulii * 3 + 2] = popcount_longs(&(cur_geno1[2 * founder_ctv3]), founder_ctv3);
	// Second category empty and at least one outer category empty, i.e. at
	// most one category is populated (presumably a monomorphic variant):
	// prune it right away.
	if ((!cur_tots[ulii * 3 + 1]) && ((!cur_tots[ulii * 3]) || (!cur_tots[ulii * 3 + 2]))) {
	  SET_BIT(uljj, pruned_arr);
	  cur_exclude_ct++;
	} else if (ulkk == 3) {
	  // NOTE(review): ulkk is an output of load_and_split3(); the value 3
	  // presumably signals "no missing calls" -- confirm against that helper.
	  SET_BIT(ulii, zmiss);
	}
      }
    }
    pct = 1;
    chrom_start = get_chrom_start_vidx(chrom_info_ptr, cur_chrom);
    pct_thresh = window_unfiltered_start + ((uint64_t)pct * (chrom_end - chrom_start)) / 100;
    // Slide the window along the chromosome.
    while ((window_unfiltered_start < chrom_end) || (cur_window_size > 1)) {
      if (cur_window_size > 1) {
	// Repeat full passes over the window until a pass prunes nothing.
	do {
	  at_least_one_prune = 0;
	  for (ulii = 0; ulii < cur_window_size - 1; ulii++) {
	    if (IS_SET(pruned_arr, live_indices[ulii])) {
	      continue;
	    }
	    // skip partners that slot ulii has already been compared against
	    uljj = ulii + 1;
	    while (live_indices[uljj] < start_arr[ulii]) {
	      if (++uljj == cur_window_size) {
		goto indep_pairphase_skip_marker;
	      }
	    }
	    cur_geno1 = &(geno[ulii * founder_ctsplit]);
	    memcpy(tot1, &(cur_tots[ulii * 3]), 3 * sizeof(int32_t));
	    nm_fixed = is_set_ul(zmiss, ulii);
	    if (is_x) {
	      // male-only copy of the first variant's three bitvectors, plus
	      // their popcounts in tot1[3..5]
	      memcpy(cur_geno1_male, cur_geno1, founder_ctsplit * sizeof(intptr_t));
              bitvec_and(sex_male_collapsed, founder_ctv3, cur_geno1_male);
	      tot1[3] = popcount_longs(cur_geno1_male, founder_ctv3);
              bitvec_and(sex_male_collapsed, founder_ctv3, &(cur_geno1_male[founder_ctv3]));
	      tot1[4] = popcount_longs(&(cur_geno1_male[founder_ctv3]), founder_ctv3);
              bitvec_and(sex_male_collapsed, founder_ctv3, &(cur_geno1_male[2 * founder_ctv3]));
	      tot1[5] = popcount_longs(&(cur_geno1_male[2 * founder_ctv3]), founder_ctv3);
	    }
	    for (; uljj < cur_window_size; uljj++) {
	      if (IS_SET(pruned_arr, live_indices[uljj])) {
		continue;
	      }
	      cur_geno2 = &(geno[uljj * founder_ctsplit]);
	      cur_tot2 = &(cur_tots[uljj * 3]);
	      cur_zmiss2 = IS_SET(zmiss, uljj);
	      // Build the 3x3 table.  Entries assigned here are derived by
	      // subtraction from the per-variant totals; presumably the
	      // counting helpers leave exactly those entries unfilled when the
	      // corresponding zmiss flag is set.
	      if (nm_fixed) {
		two_locus_count_table_zmiss1(cur_geno1, cur_geno2, counts, founder_ctv3, cur_zmiss2);
		if (cur_zmiss2) {
                  counts[2] = tot1[0] - counts[0] - counts[1];
                  counts[5] = tot1[1] - counts[3] - counts[4];
		}
                counts[6] = cur_tot2[0] - counts[0] - counts[3];
                counts[7] = cur_tot2[1] - counts[1] - counts[4];
                counts[8] = cur_tot2[2] - counts[2] - counts[5];
	      } else {
                two_locus_count_table(cur_geno1, cur_geno2, counts, founder_ctv3, cur_zmiss2);
                if (cur_zmiss2) {
                  counts[2] = tot1[0] - counts[0] - counts[1];
                  counts[5] = tot1[1] - counts[3] - counts[4];
                  counts[8] = tot1[2] - counts[6] - counts[7];
		}
	      }
              if (is_x) {
                // second table, males only, stored in counts[9..17]
                two_locus_count_table(cur_geno1_male, cur_geno2, &(counts[9]), founder_ctv3, cur_zmiss2);
                if (cur_zmiss2) {
                  counts[11] = tot1[3] - counts[9] - counts[10];
                  counts[14] = tot1[4] - counts[12] - counts[13];
                  counts[17] = tot1[5] - counts[15] - counts[16];
		}
	      }
	      // a nonzero return from em_phase_hethet_nobase() means the pair
	      // is skipped
	      if (!em_phase_hethet_nobase(counts, is_x, is_x, &freq1x, &freq2x, &freqx1, &freqx2, &freq11)) {
		// r^2 = (f11 - f1x * fx1)^2 / (f1x * fx1 * f2x * fx2)
		freq11_expected = freqx1 * freq1x;
		rsq = freq11 - freq11_expected;
		rsq = rsq * rsq / (freq11_expected * freq2x * freqx2);
		if (rsq > prune_ld_thresh) {
		  at_least_one_prune = 1;
		  cur_exclude_ct++;
		  // remove marker with lower MAF
		  if (get_maf(set_allele_freqs[live_indices[ulii]]) < (1 - SMALL_EPSILON) * get_maf(set_allele_freqs[live_indices[uljj]])) {
		    SET_BIT(live_indices[ulii], pruned_arr);
		  } else {
		    SET_BIT(live_indices[uljj], pruned_arr);
		    // remember the next unpruned partner so that the next pass
		    // resumes there instead of repeating earlier comparisons
		    uljj++;
		    while (uljj < cur_window_size) {
		      if (!IS_SET(pruned_arr, live_indices[uljj])) {
			break;
		      }
		      uljj++;
		    }
		    if (uljj < cur_window_size) {
		      start_arr[ulii] = live_indices[uljj];
		    }
		  }
		  break;
		}
	      }
	    }
	    // every partner currently in the window has been handled; later
	    // comparisons for this slot start at the window's end
	    if (uljj == cur_window_size) {
	    indep_pairphase_skip_marker:
	      start_arr[ulii] = window_unfiltered_end;
	    }
	  }
	} while (at_least_one_prune);
      }
      // advance the window start by up to ld_window_incr non-excluded variants
      for (uii = 0; uii < ld_window_incr; uii++) {
	if (window_unfiltered_start == chrom_end) {
	  break;
	}
	window_unfiltered_start++;
	next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_start);
      }
      if (window_unfiltered_start == chrom_end) {
	break;
      }
      // progress indicator: percentage of the current chromosome processed
      if (window_unfiltered_start >= pct_thresh) {
	pct = ((window_unfiltered_start - chrom_start) * 100LLU) / (chrom_end - chrom_start);
	printf("\r%u%%", pct++);
	fflush(stdout);
	pct_thresh = chrom_start + (((uint64_t)pct * (chrom_end - chrom_start)) / 100);
      }
      uljj = 0;
      if (window_unfiltered_end < window_unfiltered_start) {
	window_unfiltered_end = window_unfiltered_start;
      }
      // copy back previously loaded/computed results
      // (first skip the slots which fell off the front of the window, then
      // compact the surviving unpruned slots to the start of every array)
      while (live_indices[uljj] < window_unfiltered_start) {
	uljj++;
	if (uljj == cur_window_size) {
	  break;
	}
      }
      for (ulii = 0; uljj < cur_window_size; uljj++) {
	if (IS_SET(pruned_arr, live_indices[uljj])) {
	  continue;
	}
	memcpy(&(geno[ulii * founder_ctsplit]), &(geno[uljj * founder_ctsplit]), founder_ctsplit * sizeof(intptr_t));
	live_indices[ulii] = live_indices[uljj];
	start_arr[ulii] = start_arr[uljj];
	memcpy(&(cur_tots[ulii * 3]), &(cur_tots[uljj * 3]), 3 * sizeof(int32_t));
	// bugfix: forgot to update zmiss
	if (IS_SET(zmiss, uljj)) {
	  SET_BIT(ulii, zmiss);
	} else {
	  CLEAR_BIT(ulii, zmiss);
	}
	ulii++;
      }
      // NOTE(review): clear_bits() is defined outside this view; confirm
      // whether its second argument is an end index or a length, since
      // window_max is passed here rather than (window_max - ulii).
      clear_bits(ulii, window_max, zmiss);

      prev_end = ulii;
      cur_window_size = ulii;
      // uljj := number of new variants to append to the window
      if (window_is_kb) {
	// count the variants within 1000 * ld_window_size bp of the new window
	// start, then restore window_unfiltered_end for the load loop below
	uljj = 0;
	ulkk = window_unfiltered_end;
	while ((window_unfiltered_end < chrom_end) && (marker_pos[window_unfiltered_end] <= marker_pos[window_unfiltered_start] + (1000 * ld_window_size))) {
	  uljj++;
	  window_unfiltered_end++;
	  next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_end);
	}
	window_unfiltered_end = ulkk;
      } else {
	uljj = ld_window_incr;
      }
      // load the new variants into the slots following the retained ones
      for (ulii = 0; ulii < uljj; ulii++) {
	if (window_unfiltered_end == chrom_end) {
	  break;
	}
	live_indices[cur_window_size] = window_unfiltered_end;
	if (cur_window_size > prev_end) {
	  start_arr[cur_window_size - 1] = window_unfiltered_end;
	}
	if (fseeko(bedfile, bed_offset + (window_unfiltered_end * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
	  goto indep_pairphase_ret_READ_FAIL;
	}
	if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, window_unfiltered_end), bedfile, loadbuf_raw, loadbuf)) {
	  goto indep_pairphase_ret_READ_FAIL;
	}
	if (is_haploid && hh_exists) {
	  haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)loadbuf);
	}
	cur_geno1 = &(geno[cur_window_size * founder_ctsplit]);
	load_and_split3(nullptr, loadbuf, founder_ct, cur_geno1, dummy_nm, dummy_nm, founder_ctv3, 0, 0, 1, &ulkk);
	cur_tots[((uintptr_t)cur_window_size) * 3] = popcount_longs(cur_geno1, founder_ctv3);
	cur_tots[((uintptr_t)cur_window_size) * 3 + 1] = popcount_longs(&(cur_geno1[founder_ctv3]), founder_ctv3);
	cur_tots[((uintptr_t)cur_window_size) * 3 + 2] = popcount_longs(&(cur_geno1[2 * founder_ctv3]), founder_ctv3);
	// same immediate-prune and zmiss rules as in the initial window load
	if ((!cur_tots[((uintptr_t)cur_window_size) * 3 + 1]) && ((!cur_tots[((uintptr_t)cur_window_size) * 3]) || (!cur_tots[((uintptr_t)cur_window_size) * 3 + 2]))) {
	  SET_BIT(window_unfiltered_end, pruned_arr);
	  cur_exclude_ct++;
	} else if (ulkk == 3) {
	  SET_BIT(cur_window_size, zmiss);
	}
	cur_window_size++;
	window_unfiltered_end++;
	next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_end);
      }
      if (cur_window_size > prev_end) {
	start_arr[cur_window_size - 1] = window_unfiltered_end;
      }
    }
    // overwrite the progress indicator with the per-chromosome summary
    putc_unlocked('\r', stdout);
    LOGPRINTF("Pruned %" PRIuPTR " variant%s from chromosome %u, leaving %" PRIuPTR ".\n", cur_exclude_ct, (cur_exclude_ct == 1)? "" : "s", cur_chrom, chrom_end - chrom_start - popcount_bit_idx(marker_exclude, chrom_start, chrom_end) - cur_exclude_ct);
    tot_exclude_ct += cur_exclude_ct;

    // advance chromosomes as necessary
    window_unfiltered_start = ld_prune_next_valid_chrom_start(pruned_arr, window_unfiltered_start, chrom_info_ptr, chrom_code_end, unfiltered_marker_ct);
  } while (window_unfiltered_start < unfiltered_marker_ct);

  LOGPRINTF("Pruning complete.  %u of %" PRIuPTR " variants removed.\n", tot_exclude_ct, marker_ct);
  retval = ld_prune_write(outname, outname_end, marker_exclude, pruned_arr, marker_ids, max_marker_id_len, chrom_info_ptr, chrom_code_end);
  if (retval) {
    goto indep_pairphase_ret_1;
  }

  // Error exits: the while (0) body is never entered by fallthrough; each
  // label is reached only via goto, sets retval, and breaks out to the common
  // cleanup below.
  while (0) {
  indep_pairphase_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  indep_pairphase_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  indep_pairphase_ret_INVALID_FORMAT:
    retval = RET_INVALID_FORMAT;
    break;
  }
 indep_pairphase_ret_1:
  bigstack_reset(bigstack_mark);
  return retval;
}
10796 
// Singly-linked list node holding the running per-variant totals while
// --epistasis-summary-merge combines .summary files.  Nodes are created by
// lle_alloc(); the trailing strbuf is sized at allocation time.
typedef struct ll_epi_summary_struct {
  struct ll_epi_summary_struct* next; // next node; nullptr at the list end
  double best_chisq; // largest BEST_CHISQ value merged in so far
  char* best_chr_and_snp; // separate allocation; tab-delimited
  uint32_t n_sig; // running N_SIG sum
  uint32_t n_tot; // running N_TOT sum
  uint32_t id_len; // variant ID NOT null-terminated
  // variant ID (id_len bytes) immediately followed by the null-terminated
  // chromosome name
  char strbuf[];
} Ll_epi_summary;
10806 
10807 // N.B. moves g_bigstack_base in word-size instead of cacheline increments
lle_alloc(char * chrom_id,uint32_t chrom_len,char * marker_id,uint32_t marker_id_len,uint32_t nsig,uint32_t ntot,double chisq)10808 Ll_epi_summary* lle_alloc(char* chrom_id, uint32_t chrom_len, char* marker_id, uint32_t marker_id_len, uint32_t nsig, uint32_t ntot, double chisq) {
10809   uintptr_t alloc_size = (sizeof(Ll_epi_summary) + chrom_len + marker_id_len + sizeof(intptr_t)) & (~(sizeof(intptr_t) - ONELU));
10810   Ll_epi_summary* newptr = (Ll_epi_summary*)g_bigstack_base;
10811   if (bigstack_left() < alloc_size) {
10812     return nullptr;
10813   }
10814   g_bigstack_base = &(g_bigstack_base[alloc_size]);
10815   newptr->next = nullptr;
10816   newptr->best_chisq = chisq;
10817   newptr->n_sig = nsig;
10818   newptr->n_tot = ntot;
10819   newptr->id_len = marker_id_len;
10820   memcpy(newptr->strbuf, marker_id, marker_id_len);
10821   memcpyx(&(newptr->strbuf[marker_id_len]), chrom_id, chrom_len, '\0');
10822   return newptr;
10823 }
10824 
validate_epistasis_summary_header(char * bufptr)10825 int32_t validate_epistasis_summary_header(char* bufptr) {
10826   uint32_t slen = strlen_se(bufptr);
10827   int32_t retval = 0;
10828   if ((slen != 3) || memcmp(bufptr, "CHR", 3)) {
10829     return RET_INVALID_FORMAT;
10830   }
10831   bufptr = skip_initial_spaces(&(bufptr[3]));
10832   slen = strlen_se(bufptr);
10833   if ((slen != 3) || memcmp(bufptr, "SNP", 3)) {
10834     return RET_INVALID_FORMAT;
10835   }
10836   bufptr = skip_initial_spaces(&(bufptr[3]));
10837   slen = strlen_se(bufptr);
10838   if ((slen != 5) || memcmp(bufptr, "N_SIG", 5)) {
10839     return RET_INVALID_FORMAT;
10840   }
10841   bufptr = skip_initial_spaces(&(bufptr[5]));
10842   slen = strlen_se(bufptr);
10843   if ((slen != 5) || memcmp(bufptr, "N_TOT", 5)) {
10844     return RET_INVALID_FORMAT;
10845   }
10846   bufptr = skip_initial_spaces(&(bufptr[5]));
10847   slen = strlen_se(bufptr);
10848   if (slen == 4) {
10849     if (memcmp(bufptr, "PROP", 4)) {
10850       return RET_INVALID_FORMAT;
10851     }
10852     retval = -1;
10853     bufptr = skip_initial_spaces(&(bufptr[4]));
10854     slen = strlen_se(bufptr);
10855   }
10856   if ((slen != 10) || memcmp(bufptr, "BEST_CHISQ", 10)) {
10857     return RET_INVALID_FORMAT;
10858   }
10859   bufptr = skip_initial_spaces(&(bufptr[10]));
10860   slen = strlen_se(bufptr);
10861   if ((slen != 8) || memcmp(bufptr, "BEST_CHR", 8)) {
10862     return RET_INVALID_FORMAT;
10863   }
10864   bufptr = skip_initial_spaces(&(bufptr[8]));
10865   slen = strlen_se(bufptr);
10866   if ((slen != 8) || memcmp(bufptr, "BEST_SNP", 8)) {
10867     return RET_INVALID_FORMAT;
10868   }
10869   bufptr = skip_initial_spaces(&(bufptr[8]));
10870   if (!is_eoln_kns(*bufptr)) {
10871     return RET_INVALID_FORMAT;
10872   }
10873   return retval;
10874 }
10875 
epi_summary_merge(Epi_info * epi_ip,char * outname,char * outname_end)10876 int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
10877   unsigned char* bigstack_mark = g_bigstack_base;
10878   FILE* infile = nullptr;
10879   FILE* outfile = nullptr;
10880   char* inprefix = epi_ip->summary_merge_prefix;
10881   char* inprefix_end = (char*)memchr(inprefix, 0, FNAMESIZE);
10882   Ll_epi_summary* list_start = nullptr;
10883   // first .3 entry is later than first .2 entry, etc., so we can save
10884   // ourselves some linked list traversal time by starting the first-entry scan
10885   // after where the last one left off.
10886   Ll_epi_summary* last_start = nullptr;
10887   Ll_epi_summary** lle_pp = &list_start; // end-of-list pointer for first file
10888   uint32_t file_ct = epi_ip->summary_merge_ct;
10889   int32_t retval = 0;
10890   char* bufptr;
10891   char* bufptr2;
10892   char* bufptr3;
10893   char* bufptr4;
10894   char* id_ptr;
10895   char* nsig_ptr;
10896   char* ntot_ptr;
10897   char* best_chisq_ptr;
10898   char* best_chr_ptr;
10899   char* best_marker_ptr;
10900   Ll_epi_summary* lle_ptr; // traverser for remaining files
10901   uintptr_t line_idx;
10902   uintptr_t ulii;
10903   double cur_chisq;
10904   uint32_t plink_maxsnp;
10905   uint32_t file_idx;
10906   uint32_t chrom_len;
10907   uint32_t id_len;
10908   uint32_t is_first_entry;
10909   int32_t nsig;
10910   int32_t ntot;
10911   if (inprefix_end[-1] == '.') {
10912     inprefix_end--;
10913   }
10914   ulii = (uintptr_t)(inprefix_end - inprefix);
10915   if ((ulii >= 16) && (!memcmp(".summary", &(inprefix[ulii - 8]), 8))) {
10916     inprefix_end -= 8;
10917     ulii -= 8;
10918   }
10919   bufptr = &(inprefix[ulii - 2]);
10920   if (memcmp(".epi.", &(inprefix[ulii - 7]), 5) || (memcmp("cc", bufptr, 2) && memcmp("co", bufptr, 2) && memcmp("qt", bufptr, 2))) {
10921     LOGERRPRINTFWW("Error: Invalid --epistasis-summary-merge filename prefix '%s'. (*.epi.cc, *.epi.co, or *.epi.qt expected.)\n", inprefix);
10922     goto epi_summary_merge_ret_INVALID_CMDLINE;
10923   }
10924   inprefix_end = memcpya(inprefix_end, ".summary.", 9);
10925   memcpyx(outname_end, &(inprefix[ulii - 7]), 15, '\0');
10926   // Started out using a hash table, but on second thought, it's unnecessary
10927   // given the possibilities for distributed .summary files.
10928   // 1. ALL x ALL, SET x SET: First file lists all marker IDs in the final
10929   //    order; first entry in remaining files should match an entry in the
10930   //    middle and the rest match sequentially from there.
10931   // 2. SET1 x ALL, SET1 x SET2: No duplication whatsoever.  Output will be
10932   //    such that cat actually works, but asking users to conditionally use cat
10933   //    would add confusion for little reason; instead we detect the telltale
10934   //    "PROP" in the first file's header line and switch to cat.
10935 
10936   g_textbuf[MAXLINELEN - 1] = ' ';
10937   memcpy(inprefix_end, "1", 2);
10938   if (fopen_checked(inprefix, "r", &infile)) {
10939     goto epi_summary_merge_ret_OPEN_FAIL;
10940   }
10941   retval = load_to_first_token(infile, MAXLINELEN, '\0', "--epistasis-summary-merge file", g_textbuf, &bufptr, &line_idx);
10942   if (retval) {
10943     goto epi_summary_merge_ret_1;
10944   }
10945   retval = validate_epistasis_summary_header(bufptr);
10946   if (retval) {
10947     if (retval == -1) {
10948       // switch to cat mode.  meow.
10949       fclose_null(&infile);
10950       if (fopen_checked(outname, FOPEN_WB, &outfile)) {
10951 	goto epi_summary_merge_ret_OPEN_FAIL;
10952       }
10953       for (file_idx = 1; file_idx <= file_ct; file_idx++) {
10954         uint32toa_x(file_idx, '\0', inprefix_end);
10955 	if (fopen_checked(inprefix, FOPEN_RB, &infile)) {
10956 	  goto epi_summary_merge_ret_OPEN_FAIL;
10957 	}
10958 	while (1) {
10959 	  ulii = fread(g_textbuf, 1, MAXLINELEN, infile);
10960           if (!ulii) {
10961 	    break;
10962 	  }
10963 	  if (fwrite_checked(g_textbuf, ulii, outfile)) {
10964 	    goto epi_summary_merge_ret_WRITE_FAIL;
10965 	  }
10966 	}
10967 	if (fclose_null(&infile)) {
10968 	  goto epi_summary_merge_ret_READ_FAIL;
10969 	}
10970       }
10971       retval = 0;
10972       goto epi_summary_merge_success;
10973     }
10974     goto epi_summary_merge_ret_INVALID_HEADER;
10975   }
10976   bufptr2 = token_endnn(bufptr);
10977   bufptr = skip_initial_spaces(bufptr2);
10978   plink_maxsnp = ((uintptr_t)(token_endnn(bufptr) - bufptr2)) - 1;
10979   while (fgets(g_textbuf, MAXLINELEN, infile)) {
10980     line_idx++;
10981     if (!g_textbuf[MAXLINELEN - 1]) {
10982       goto epi_summary_merge_ret_LONG_LINE;
10983     }
10984     bufptr = skip_initial_spaces(g_textbuf);
10985     if (is_eoln_kns(*bufptr)) {
10986       continue;
10987     }
10988     chrom_len = strlen_se(bufptr);
10989     id_ptr = skip_initial_spaces(&(bufptr[chrom_len]));
10990     id_len = strlen_se(id_ptr);
10991     nsig_ptr = skip_initial_spaces(&(id_ptr[id_len]));
10992     ntot_ptr = next_token(nsig_ptr);
10993     best_chisq_ptr = next_token(ntot_ptr);
10994     best_chr_ptr = next_token(best_chisq_ptr);
10995     if (no_more_tokens_kns(best_chr_ptr)) {
10996       goto epi_summary_merge_ret_MISSING_TOKENS;
10997     }
10998     if (scan_uint_icap(nsig_ptr, (uint32_t*)&nsig)) {
10999       goto epi_summary_merge_ret_INVALID_NSIG;
11000     }
11001     if (scan_uint_icap(ntot_ptr, (uint32_t*)&ntot)) {
11002       goto epi_summary_merge_ret_INVALID_NTOT;
11003     }
11004     if (ntot) {
11005       if (scan_double(best_chisq_ptr, &cur_chisq)) {
11006 	goto epi_summary_merge_ret_INVALID_CHISQ;
11007       }
11008     } else {
11009       cur_chisq = 0;
11010     }
11011     *lle_pp = lle_alloc(bufptr, chrom_len, id_ptr, id_len, nsig, ntot, cur_chisq);
11012     if (!(*lle_pp)) {
11013       goto epi_summary_merge_ret_NOMEM;
11014     }
11015     chrom_len = strlen_se(best_chr_ptr);
11016     best_marker_ptr = skip_initial_spaces(&(best_chr_ptr[chrom_len]));
11017     id_len = strlen_se(best_marker_ptr);
11018     if (!id_len) {
11019       goto epi_summary_merge_ret_MISSING_TOKENS;
11020     }
11021     // throw in an extra word, to reduce the need for reallocation
11022     ulii = (chrom_len + id_len + 1 + 2 * sizeof(intptr_t)) & (~(sizeof(intptr_t) - 1));
11023     if (ulii > bigstack_left()) {
11024       goto epi_summary_merge_ret_NOMEM;
11025     }
11026     bufptr = (char*)g_bigstack_base;
11027     memcpyx(bufptr, best_chr_ptr, chrom_len, '\t');
11028     memcpy(&(bufptr[chrom_len + 1]), best_marker_ptr, id_len);
11029     // pad with nulls then tab-terminate, so we can find the buffer end later
11030     memset(&(bufptr[chrom_len + id_len + 1]), 0, ulii - chrom_len - id_len - 2);
11031     bufptr[ulii - 1] = '\t';
11032     (*lle_pp)->best_chr_and_snp = bufptr;
11033     lle_pp = &((*lle_pp)->next);
11034     g_bigstack_base = &(g_bigstack_base[ulii]);
11035   }
11036   if (fclose_null(&infile)) {
11037     goto epi_summary_merge_ret_READ_FAIL;
11038   }
11039   if (!list_start) {
11040     LOGPREPRINTFWW("Error: %s has no entries.\n", inprefix);
11041     goto epi_summary_merge_ret_INVALID_FORMAT_2;
11042   }
11043   last_start = list_start->next;
11044   for (file_idx = 2; file_idx <= file_ct; file_idx++) {
11045     uint32toa_x(file_idx, '\0', inprefix_end);
11046     if (fopen_checked(inprefix, "r", &infile)) {
11047       goto epi_summary_merge_ret_OPEN_FAIL;
11048     }
11049     retval = load_to_first_token(infile, MAXLINELEN, '\0', "--epistasis-summary-merge file", g_textbuf, &bufptr, &line_idx);
11050     if (retval) {
11051       goto epi_summary_merge_ret_1;
11052     }
11053     retval = validate_epistasis_summary_header(bufptr);
11054     if (retval) {
11055       goto epi_summary_merge_ret_INVALID_HEADER;
11056     }
11057     lle_ptr = last_start;
11058     is_first_entry = 1;
11059     while (fgets(g_textbuf, MAXLINELEN, infile)) {
11060       line_idx++;
11061       if (!g_textbuf[MAXLINELEN - 1]) {
11062 	goto epi_summary_merge_ret_LONG_LINE;
11063       }
11064       bufptr = skip_initial_spaces(g_textbuf);
11065       if (is_eoln_kns(*bufptr)) {
11066 	continue;
11067       }
11068       if (!lle_ptr) {
11069         LOGPREPRINTFWW("Error: More lines than expected in %s.\n", inprefix);
11070 	goto epi_summary_merge_ret_INVALID_FORMAT_2;
11071       }
11072       chrom_len = strlen_se(bufptr);
11073       id_ptr = skip_initial_spaces(&(bufptr[chrom_len]));
11074       id_len = strlen_se(id_ptr);
11075       nsig_ptr = skip_initial_spaces(&(id_ptr[id_len]));
11076       ntot_ptr = next_token(nsig_ptr);
11077       best_chisq_ptr = next_token(ntot_ptr);
11078       best_chr_ptr = next_token(best_chisq_ptr);
11079       if (no_more_tokens_kns(best_chr_ptr)) {
11080 	goto epi_summary_merge_ret_MISSING_TOKENS;
11081       }
11082       if (scan_uint_icap(nsig_ptr, (uint32_t*)&nsig)) {
11083 	goto epi_summary_merge_ret_INVALID_NSIG;
11084       }
11085       if (scan_uint_icap(ntot_ptr, (uint32_t*)&ntot)) {
11086 	goto epi_summary_merge_ret_INVALID_NTOT;
11087       }
11088       if (!is_first_entry) {
11089 	if ((lle_ptr->id_len != id_len) || memcmp(lle_ptr->strbuf, id_ptr, id_len) || (strlen(&(lle_ptr->strbuf[id_len])) != chrom_len) || memcmp(&(lle_ptr->strbuf[id_len]), bufptr, chrom_len)) {
11090 	  goto epi_summary_merge_ret_MISMATCH;
11091 	}
11092       } else {
11093 	while (1) {
11094 	  if (!lle_ptr) {
11095 	    goto epi_summary_merge_ret_MISMATCH;
11096 	  }
11097 	  if ((lle_ptr->id_len == id_len) && (!memcmp(lle_ptr->strbuf, id_ptr, id_len))) {
11098 	    break;
11099 	  }
11100           lle_ptr = lle_ptr->next;
11101 	}
11102         if ((strlen(&(lle_ptr->strbuf[id_len])) != chrom_len) || memcmp(&(lle_ptr->strbuf[id_len]), bufptr, chrom_len)) {
11103 	  goto epi_summary_merge_ret_MISMATCH;
11104 	}
11105 	last_start = lle_ptr->next;
11106 	is_first_entry = 0;
11107       }
11108       if (ntot) {
11109 	if (scan_double(best_chisq_ptr, &cur_chisq)) {
11110 	  goto epi_summary_merge_ret_INVALID_CHISQ;
11111 	}
11112 	lle_ptr->n_sig += nsig;
11113 	lle_ptr->n_tot += ntot;
11114         if (cur_chisq > lle_ptr->best_chisq) {
11115 	  chrom_len = strlen_se(best_chr_ptr);
11116           best_marker_ptr = skip_initial_spaces(&(best_chr_ptr[chrom_len]));
11117           id_len = strlen_se(best_marker_ptr);
11118 	  if (!id_len) {
11119 	    goto epi_summary_merge_ret_MISSING_TOKENS;
11120 	  }
11121           lle_ptr->best_chisq = cur_chisq;
11122 	  bufptr = lle_ptr->best_chr_and_snp;
11123           bufptr2 = (char*)memchr(bufptr, '\t', MAXLINELEN);
11124 	  bufptr3 = (char*)memchr(++bufptr2, 0, MAXLINELEN);
11125 	  bufptr4 = (char*)memchr(bufptr3, '\t', MAXLINELEN);
11126 	  ulii = (uintptr_t)(bufptr4 - bufptr);
11127 	  if (ulii <= chrom_len + id_len + 1) {
11128 	    ulii = (chrom_len + id_len + 1 + sizeof(intptr_t)) & (~(sizeof(intptr_t) - 1));
11129             if (ulii > bigstack_left()) {
11130 	      goto epi_summary_merge_ret_NOMEM;
11131 	    }
11132             bufptr = (char*)g_bigstack_base;
11133 	    bufptr3 = &(bufptr[ulii - 1]);
11134 	    *bufptr3 = '\t';
11135             lle_ptr->best_chr_and_snp = bufptr;
11136             g_bigstack_base = &(g_bigstack_base[ulii]);
11137 	  }
11138 	  bufptr = memcpyax(bufptr, best_chr_ptr, chrom_len, '\t');
11139 	  bufptr = memcpya(bufptr, best_marker_ptr, id_len);
11140 	  if (bufptr < bufptr3) {
11141 	    memset(bufptr, 0, bufptr3 - bufptr);
11142 	  }
11143 	}
11144       }
11145       lle_ptr = lle_ptr->next;
11146     }
11147     if (fclose_null(&infile)) {
11148       goto epi_summary_merge_ret_READ_FAIL;
11149     }
11150   }
11151 
11152   if (fopen_checked(outname, "w", &outfile)) {
11153     goto epi_summary_merge_ret_OPEN_FAIL;
11154   }
11155   bufptr = memcpya(g_textbuf, " CHR ", 5);
11156   bufptr = fw_strcpyn(plink_maxsnp, 3, "SNP", bufptr);
11157   bufptr = strcpya(bufptr, "        N_SIG        N_TOT         PROP   BEST_CHISQ BEST_CHR ");
11158   bufptr = fw_strcpyn(plink_maxsnp, 8, "BEST_SNP", bufptr);
11159   bufptr = memcpya(bufptr, " \n", 2);
11160   if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
11161     goto epi_summary_merge_ret_WRITE_FAIL;
11162   }
11163   lle_ptr = list_start;
11164   do {
11165     bufptr2 = lle_ptr->strbuf;
11166     id_len = lle_ptr->id_len;
11167     bufptr3 = &(bufptr2[id_len]);
11168     bufptr = fw_strcpy(4, bufptr3, g_textbuf);
11169     *bufptr++ = ' ';
11170     bufptr = fw_strcpyn(plink_maxsnp, id_len, bufptr2, bufptr);
11171     nsig = lle_ptr->n_sig;
11172     ntot = lle_ptr->n_tot;
11173     bufptr = memseta(bufptr, 32, 3);
11174     bufptr = uint32toa_w10(nsig, bufptr);
11175     bufptr = memseta(bufptr, 32, 3);
11176     bufptr = uint32toa_w10x(ntot, ' ', bufptr);
11177     bufptr = dtoa_g_wxp4x(((double)((int32_t)nsig)) / ((double)((int32_t)ntot)), 12, ' ', bufptr);
11178     bufptr = dtoa_g_wxp4x(lle_ptr->best_chisq, 12, ' ', bufptr);
11179     // no need to special-case ntot == 0, this code correctly copies 'NA'
11180     bufptr2 = lle_ptr->best_chr_and_snp;
11181     bufptr3 = (char*)memchr(bufptr2, '\t', MAXLINELEN);
11182     ulii = (uintptr_t)(bufptr3 - bufptr2);
11183     bufptr = fw_strcpyn(4, ulii, bufptr2, bufptr);
11184     *bufptr++ = ' ';
11185     bufptr = fw_strcpy(plink_maxsnp, &(bufptr3[1]), bufptr);
11186     bufptr = memcpya(bufptr, " \n", 2);
11187     if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
11188       goto epi_summary_merge_ret_WRITE_FAIL;
11189     }
11190     lle_ptr = lle_ptr->next;
11191   } while (lle_ptr);
11192 
11193  epi_summary_merge_success:
11194   if (fclose_null(&outfile)) {
11195     // just kidding!  no success
11196     goto epi_summary_merge_ret_WRITE_FAIL;
11197   }
11198   LOGPRINTFWW("--epistasis-summary-merge: Merged summary written to %s .\n", outname);
11199   while (0) {
11200   epi_summary_merge_ret_NOMEM:
11201     retval = RET_NOMEM;
11202     break;
11203   epi_summary_merge_ret_OPEN_FAIL:
11204     retval = RET_OPEN_FAIL;
11205     break;
11206   epi_summary_merge_ret_READ_FAIL:
11207     retval = RET_READ_FAIL;
11208     break;
11209   epi_summary_merge_ret_WRITE_FAIL:
11210     retval = RET_WRITE_FAIL;
11211     break;
11212   epi_summary_merge_ret_INVALID_CMDLINE:
11213     retval = RET_INVALID_CMDLINE;
11214     break;
11215   epi_summary_merge_ret_MISMATCH:
11216     logerrprint("Error: --epistasis-summary-merge files were generated from different datasets\nand/or settings.\n");
11217     retval = RET_INVALID_FORMAT;
11218     break;
11219   epi_summary_merge_ret_INVALID_NSIG:
11220     LOGERRPRINTFWW("Error: Invalid N_SIG value on line %" PRIuPTR " of %s .\n", line_idx, inprefix);
11221     retval = RET_INVALID_FORMAT;
11222     break;
11223   epi_summary_merge_ret_INVALID_NTOT:
11224     LOGERRPRINTFWW("Error: Invalid N_SIG value on line %" PRIuPTR " of %s .\n", line_idx, inprefix);
11225     retval = RET_INVALID_FORMAT;
11226     break;
11227   epi_summary_merge_ret_INVALID_CHISQ:
11228     LOGERRPRINTFWW("Error: Invalid BEST_CHISQ value on line %" PRIuPTR " of %s .\n", line_idx, inprefix);
11229     retval = RET_INVALID_FORMAT;
11230     break;
11231   epi_summary_merge_ret_MISSING_TOKENS:
11232     LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, inprefix);
11233     retval = RET_INVALID_FORMAT;
11234     break;
11235   epi_summary_merge_ret_LONG_LINE:
11236     LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, inprefix);
11237     retval = RET_INVALID_FORMAT;
11238     break;
11239   epi_summary_merge_ret_INVALID_HEADER:
11240     LOGPREPRINTFWW(g_logbuf, "Error: Invalid --epistasis-summary-merge header in %s.\n", inprefix);
11241   epi_summary_merge_ret_INVALID_FORMAT_2:
11242     logerrprintb();
11243     retval = RET_INVALID_FORMAT;
11244     break;
11245   }
11246  epi_summary_merge_ret_1:
11247   fclose_cond(infile);
11248   fclose_cond(outfile);
11249   bigstack_reset(bigstack_mark);
11250   return retval;
11251 }
11252 
test_mishap_write_line(FILE * outfile,char * wptr,uint32_t prev_alen,uint32_t next_alen,const char * prev_aptr,const char * next_aptr,double * total_cts,double * curhap_cts,double tot_recip,double output_min_p,char * flankstr,uint32_t flanklen)11253 void test_mishap_write_line(FILE* outfile, char* wptr, uint32_t prev_alen, uint32_t next_alen, const char* prev_aptr, const char* next_aptr, double* total_cts, double* curhap_cts, double tot_recip, double output_min_p, char* flankstr, uint32_t flanklen) {
11254   // total_cts[0] = caseN[0] + caseN[1]
11255   // total_cts[1] = controlN[0] + controlN[1]
11256   char* tbuf_cur = g_textbuf;
11257   double casen_1 = total_cts[0] - curhap_cts[0];
11258   double ctrln_1 = total_cts[1] - curhap_cts[1];
11259   uint32_t uii = prev_alen + next_alen;
11260   char* wptr2;
11261   double row_mult;
11262   double cur_expected;
11263   double dxx;
11264   double chisq;
11265   if (uii <= 10) {
11266     wptr = memseta(wptr, 32, 10 - uii);
11267     if (prev_alen) {
11268       wptr = memcpya(wptr, prev_aptr, prev_alen);
11269     }
11270     if (next_alen) {
11271       wptr = memcpya(wptr, next_aptr, next_alen);
11272     }
11273   } else {
11274     fwrite(g_textbuf, 1, (uintptr_t)(wptr - g_textbuf), outfile);
11275     if (prev_alen) {
11276       fputs(prev_aptr, outfile);
11277     }
11278     if (next_alen) {
11279       fputs(next_aptr, outfile);
11280     }
11281     tbuf_cur = wptr;
11282   }
11283   *wptr++ = ' ';
11284   if (total_cts[0] > 0.0) {
11285     wptr = dtoa_g_wxp3(curhap_cts[0] / total_cts[0], 8, wptr);
11286   } else {
11287     wptr = memcpya(wptr, "      NA", 8);
11288   }
11289   *wptr++ = ' ';
11290   if (total_cts[1] > 0.0) {
11291     wptr = dtoa_g_wxp3(curhap_cts[1] / total_cts[1], 8, wptr);
11292   } else {
11293     wptr = memcpya(wptr, "      NA", 8);
11294   }
11295   *wptr++ = ' ';
11296   wptr2 = dtoa_g(curhap_cts[0], wptr);
11297   *wptr2++ = '/';
11298   wptr2 = dtoa_g(curhap_cts[1], wptr2);
11299   wptr = width_force(20, wptr, wptr2);
11300   *wptr++ = ' ';
11301   wptr2 = dtoa_g(casen_1, wptr);
11302   *wptr2++ = '/';
11303   wptr2 = dtoa_g(ctrln_1, wptr2);
11304   wptr = width_force(20, wptr, wptr2);
11305   *wptr++ = ' ';
11306   if ((curhap_cts[0] > 0.0) && (curhap_cts[1] > 0.0) && (casen_1 > 0.0) && (ctrln_1 > 0.0)) {
11307     row_mult = (curhap_cts[0] + curhap_cts[1]) * tot_recip;
11308     cur_expected = row_mult * total_cts[0];
11309     dxx = curhap_cts[0] - cur_expected;
11310     chisq = dxx * dxx / cur_expected;
11311     cur_expected = row_mult * total_cts[1];
11312     dxx = curhap_cts[1] - cur_expected;
11313     chisq += dxx * dxx / cur_expected;
11314     row_mult = (total_cts[0] + total_cts[1]) * tot_recip - row_mult;
11315     cur_expected = row_mult * total_cts[0];
11316     dxx = casen_1 - cur_expected;
11317     chisq += dxx * dxx / cur_expected;
11318     cur_expected = row_mult * total_cts[1];
11319     dxx = ctrln_1 - cur_expected;
11320     chisq += dxx * dxx / cur_expected;
11321     wptr = dtoa_g_wxp3(chisq, 8, wptr);
11322     *wptr++ = ' ';
11323     dxx = chiprob_p(chisq, 1);
11324     wptr = dtoa_g_wxp3(MAXV(dxx, output_min_p), 8, wptr);
11325   } else {
11326     wptr = memcpya(wptr, "      NA       NA", 17);
11327   }
11328   wptr = memcpya(wptr, flankstr, flanklen);
11329   fwrite(tbuf_cur, 1, (uintptr_t)(wptr - tbuf_cur), outfile);
11330 }
11331 
int32_t test_mishap(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, double min_maf, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct) {
  // --test-mishap driver.
  //
  // For every included variant on a non-haploid chromosome which has at
  // least 5 missing calls, tabulates the genotypes of the immediately
  // preceding and following included variants on the same chromosome,
  // separately for samples whose central call is missing vs. nonmissing.
  // One report line is written (via test_mishap_write_line()) for each
  // flanking haplotype whose count clears min_maf, followed by a HETERO line.
  // Output goes to <outname>.missing.hap (outname_end marks where the
  // extension is appended).
  //
  // Returns 0 on success, or RET_NOMEM / RET_OPEN_FAIL / RET_READ_FAIL /
  // RET_WRITE_FAIL / RET_INVALID_CMDLINE on failure.  All bigstack
  // allocations made here are released before returning.
  unsigned char* bigstack_mark = g_bigstack_base;
  FILE* outfile = nullptr;
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
  uintptr_t sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
  uintptr_t final_mask = get_final_mask(sample_ct);
  // flanking-variant text is assembled in the second half of g_textbuf
  char* tbuf2 = &(g_textbuf[MAXLINELEN]);
  char* wptr2 = nullptr;
  uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
  uint32_t inspected_ct = 0;
  uint32_t missing_ct_next = 0;
  uint32_t prev_a1len = 0;
  uint32_t prev_a2len = 0;
  int32_t retval = 0;
  // need following counts:
  //   all 9 flanking hap combinations | current missing
  //     [0]: prev = 00, next = 00
  //     [1]: prev = 00, next = 10
  //     [2]: prev = 00, next = 11
  //     [3]: prev = 10, next = 00
  //     ...
  //   all 9 flanking hap combinations | current nonmissing [9..17]
  //   [18..26]: elementwise sum of the two groups above (EM phasing input)
  uint32_t counts[27];
  // [0]: central call missing, all haps clearing --maf (caseN[0] + caseN[1])
  // [1]: central call nonmissing, all haps clear --maf (ctrlN[0] + ctrlN[1])
  // [2k], k in 1..4: caseN[0] for current hap
  // [2k+1]: controlN[0] for current hap
  // note that all numbers are actually double raw counts
  double hap_ct_table[10];
  uintptr_t* loadbuf_raw;
  uintptr_t* loadbuf;
  uintptr_t* loadbuf_end;
  uintptr_t* prevsnp_ptr;
  uintptr_t* cursnp_ptr;
  uintptr_t* nextsnp_ptr;
  uintptr_t* maskbuf_mid;
  uintptr_t* maskbuf;
  char* wptr;
  uint32_t* uiptr;
  uintptr_t marker_uidx_prev;
  uintptr_t marker_uidx_cur;
  uintptr_t marker_uidx_next;
  double freq11;
  double tot_recip;
  double dxx;
  double dyy;
  double dzz;
  double dww;
  double orig_cmiss_tot;
  double orig_cnm_tot;
  uint32_t flanklen;
  uint32_t missing_ct_cur;
  uint32_t chrom_fo_idx;
  uint32_t chrom_idx;
  uint32_t chrom_end;
  uint32_t next_a1len;
  uint32_t next_a2len;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  uint32_t umm;
  if (is_set(chrom_info_ptr->haploid_mask, 1)) {
    logerrprint("Error: --test-mishap can only be used on diploid genomes.\n");
    goto test_mishap_ret_INVALID_CMDLINE;
  }
  if (sample_ct >= 0x40000000) {
    // doubled counts are stored through int32_t casts below
    logerrprint("Error: --test-mishap does not support >= 2^30 samples.\n");
    goto test_mishap_ret_INVALID_CMDLINE;
  }
  // loadbuf is a ring of three sample_ctv2-word genotype vectors
  // (previous / current / next variant).
  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
      bigstack_alloc_ul(sample_ctv2 * 3, &loadbuf) ||
      bigstack_alloc_ul(sample_ctv2, &maskbuf_mid) ||
      bigstack_alloc_ul(sample_ctv2, &maskbuf)) {
    goto test_mishap_ret_NOMEM;
  }
  // zero the trailing words of every vector once, up front
  loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
  loadbuf[sample_ctv2 - 2] = 0;
  loadbuf[sample_ctv2 - 1] = 0;
  loadbuf[2 * sample_ctv2 - 2] = 0;
  loadbuf[2 * sample_ctv2 - 1] = 0;
  loadbuf[3 * sample_ctv2 - 2] = 0;
  loadbuf[3 * sample_ctv2 - 1] = 0;
  loadbuf_end = &(loadbuf[sample_ctv2 * 3]);
  tbuf2[0] = ' ';
  memcpy(outname_end, ".missing.hap", 13);
  if (fopen_checked(outname, "w", &outfile)) {
    goto test_mishap_ret_OPEN_FAIL;
  }
  // build the header format string first so the SNP column gets width
  // plink_maxsnp
  sprintf(g_textbuf, "%%%us  HAPLOTYPE      F_0      F_1                 M_H1                 M_H2    CHISQ        P FLANKING\n", plink_maxsnp);
  fprintf(outfile, g_textbuf, "SNP");
  min_maf *= 1 - SMALL_EPSILON;
  for (chrom_fo_idx = 0; chrom_fo_idx < chrom_ct; chrom_fo_idx++) {
    chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
    if (is_set(chrom_info_ptr->haploid_mask, chrom_idx)) {
      continue;
    }
    marker_uidx_cur = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx];
    chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
    marker_uidx_cur = next_unset_ul(marker_exclude, marker_uidx_cur, chrom_end);
    if (marker_uidx_cur == chrom_end) {
      continue;
    }
    // skip chromosomes with fewer than two included variants
    marker_uidx_next = next_unset_ul(marker_exclude, marker_uidx_cur + 1, chrom_end);
    if (marker_uidx_next == chrom_end) {
      continue;
    }
    // first variant on the chromosome has no predecessor: use an all-zero
    // "previous" vector
    prevsnp_ptr = loadbuf;
    fill_ulong_zero(sample_ctl2, prevsnp_ptr);
    cursnp_ptr = &(loadbuf[sample_ctv2]);
    if (fseeko(bedfile, bed_offset + marker_uidx_cur * ((uint64_t)unfiltered_sample_ct4), SEEK_SET)) {
      goto test_mishap_ret_READ_FAIL;
    }
    if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx_cur), bedfile, loadbuf_raw, cursnp_ptr)) {
      goto test_mishap_ret_READ_FAIL;
    }
    missing_ct_cur = count_01(cursnp_ptr, sample_ctl2);
    marker_uidx_prev = ~ZEROLU;
    // The file position is now at record (marker_uidx_cur + 1).  Restart
    // marker_uidx_next there so the loop's own "excluded -> scan forward and
    // fseeko" logic also applies to the first lookahead; otherwise excluded
    // variants right after the first included one cause the wrong record to
    // be read without a seek.  (The check above guarantees
    // marker_uidx_cur + 1 < chrom_end.)
    marker_uidx_next = marker_uidx_cur + 1;
    for (; marker_uidx_cur < chrom_end; marker_uidx_prev = marker_uidx_cur, marker_uidx_cur = marker_uidx_next, prevsnp_ptr = cursnp_ptr, cursnp_ptr = nextsnp_ptr, missing_ct_cur = missing_ct_next, marker_uidx_next++) {
      // advance around the 3-slot ring
      nextsnp_ptr = &(cursnp_ptr[sample_ctv2]);
      if (nextsnp_ptr == loadbuf_end) {
        nextsnp_ptr = loadbuf;
      }
      if (marker_uidx_next < chrom_end) {
        if (IS_SET(marker_exclude, marker_uidx_next)) {
          marker_uidx_next = next_unset_ul(marker_exclude, marker_uidx_next, chrom_end);
          if (marker_uidx_next == chrom_end) {
            goto test_mishap_last_chrom_snp;
          }
          if (fseeko(bedfile, bed_offset + marker_uidx_next * ((uint64_t)unfiltered_sample_ct4), SEEK_SET)) {
            goto test_mishap_ret_READ_FAIL;
          }
        }
        if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx_next), bedfile, loadbuf_raw, nextsnp_ptr)) {
          goto test_mishap_ret_READ_FAIL;
        }
        missing_ct_next = count_01(nextsnp_ptr, sample_ctl2);
      } else {
      test_mishap_last_chrom_snp:
        // last variant on the chromosome: no successor
        fill_ulong_zero(sample_ctl2, nextsnp_ptr);
      }
      if (missing_ct_cur < 5) {
        continue;
      }
      // Pass 0 masks on samples with a missing central call, pass 1 on the
      // complement.  These vectors hold sample_ct collapsed entries
      // (sample_ctv2 words), so sample_ct -- not unfiltered_sample_ct -- is
      // the entry count handed to the mask helpers.
      quatervec_copy_only_01(cursnp_ptr, sample_ct, maskbuf_mid);
      uiptr = counts;
      for (uii = 0; uii < 2; uii++) {
        if (uii) {
          quatervec_01_invert(sample_ct, maskbuf_mid);
        }
        for (ujj = 0; ujj < 3; ujj++) {
          // ujj + (ujj + 1) / 2 maps 0, 1, 2 -> 0, 2, 3, i.e. the 00 / 10 /
          // 11 previous-variant codes listed in the counts[] comment
          vec_datamask(sample_ct, ujj + (ujj + 1) / 2, prevsnp_ptr, maskbuf_mid, maskbuf);
          ukk = popcount01_longs(maskbuf, sample_ctl2);
          genovec_3freq(nextsnp_ptr, maskbuf, sample_ctl2, &umm, &(uiptr[1]), &(uiptr[2]));
          uiptr[0] = ukk - umm - uiptr[1] - uiptr[2];
          uiptr = &(uiptr[3]);
        }
      }
      wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx_cur * max_marker_id_len]), g_textbuf);
      *wptr++ = ' ';
      if (marker_uidx_prev != ~ZEROLU) {
        prev_a1len = strlen(marker_allele_ptrs[2 * marker_uidx_prev]);
        prev_a2len = strlen(marker_allele_ptrs[2 * marker_uidx_prev + 1]);
        wptr2 = strcpya(&(tbuf2[1]), &(marker_ids[marker_uidx_prev * max_marker_id_len]));
      }
      if (marker_uidx_next < chrom_end) {
        next_a1len = strlen(marker_allele_ptrs[2 * marker_uidx_next]);
        next_a2len = strlen(marker_allele_ptrs[2 * marker_uidx_next + 1]);
        if (marker_uidx_prev != ~ZEROLU) {
          // both flanking variants present: four two-variant haplotypes
          hap_ct_table[0] = (int32_t)(2 * (counts[0] + counts[1] + counts[2] + counts[3] + counts[4] + counts[5] + counts[6] + counts[7] + counts[8]));
          hap_ct_table[1] = (int32_t)(2 * (counts[9] + counts[10] + counts[11] + counts[12] + counts[13] + counts[14] + counts[15] + counts[16] + counts[17]));
          tot_recip = hap_ct_table[0] + hap_ct_table[1];
          if (tot_recip == 0.0) {
            // minor change: skip markers with zero observations.  (output
            // wouldn't match PLINK 1.07 anyway, due to EM phasing differences)
            continue;
          }
          orig_cmiss_tot = hap_ct_table[0];
          orig_cnm_tot = hap_ct_table[1];
          *wptr2++ = '|';
          wptr2 = strcpya(wptr2, &(marker_ids[marker_uidx_next * max_marker_id_len]));
          *wptr2++ = '\n';
          flanklen = (uintptr_t)(wptr2 - tbuf2);
          hap_ct_table[2] = (int32_t)(2 * counts[0] + counts[1] + counts[3]);
          hap_ct_table[3] = (int32_t)(2 * counts[9] + counts[10] + counts[12]);
          hap_ct_table[4] = (int32_t)(2 * counts[2] + counts[1] + counts[5]);
          hap_ct_table[5] = (int32_t)(2 * counts[11] + counts[10] + counts[14]);
          hap_ct_table[6] = (int32_t)(2 * counts[6] + counts[3] + counts[7]);
          hap_ct_table[7] = (int32_t)(2 * counts[15] + counts[12] + counts[16]);
          hap_ct_table[8] = (int32_t)(2 * counts[8] + counts[5] + counts[7]);
          hap_ct_table[9] = (int32_t)(2 * counts[17] + counts[14] + counts[16]);
          if (counts[4] + counts[13]) {
            // double heterozygotes exist: phase is ambiguous, so apportion
            // them using the EM haplotype frequency estimate on the pooled
            // counts
            for (uii = 0; uii < 9; uii++) {
              counts[18 + uii] = counts[uii] + counts[9 + uii];
            }
            // no need to check return value
            em_phase_hethet_nobase(&(counts[18]), 0, 0, &dxx, &dyy, &dzz, &dww, &freq11);
            // share of counts[4]/counts[13] which goes to 11 or 22 haplotype
            // (0.5 - dxx) is share which goes to 12/21 haps
            // (conveniently, there's a 0.5 and a 2 which cancel out here)
            dxx = (freq11 * tot_recip - (hap_ct_table[2] + hap_ct_table[3])) / ((double)((int32_t)(counts[4] + counts[13])));
            dyy = ((int32_t)counts[4]) * dxx;
            dzz = ((int32_t)counts[13]) * dxx;
            hap_ct_table[2] += dyy;
            hap_ct_table[3] += dzz;
            hap_ct_table[8] += dyy;
            hap_ct_table[9] += dzz;
            dxx = 1.0 - dxx;
            dyy = ((int32_t)counts[4]) * dxx;
            dzz = ((int32_t)counts[13]) * dxx;
            hap_ct_table[4] += dyy;
            hap_ct_table[5] += dzz;
            hap_ct_table[6] += dyy;
            hap_ct_table[7] += dzz;
          }
          // drop haplotypes below the frequency threshold from the totals
          dxx = min_maf * tot_recip;
          if (hap_ct_table[2] + hap_ct_table[3] < dxx) {
            hap_ct_table[0] -= hap_ct_table[2];
            hap_ct_table[1] -= hap_ct_table[3];
            tot_recip -= hap_ct_table[2] + hap_ct_table[3];
          }
          if (hap_ct_table[4] + hap_ct_table[5] < dxx) {
            hap_ct_table[0] -= hap_ct_table[4];
            hap_ct_table[1] -= hap_ct_table[5];
            tot_recip -= hap_ct_table[4] + hap_ct_table[5];
          }
          if (hap_ct_table[6] + hap_ct_table[7] < dxx) {
            hap_ct_table[0] -= hap_ct_table[6];
            hap_ct_table[1] -= hap_ct_table[7];
            tot_recip -= hap_ct_table[6] + hap_ct_table[7];
          }
          if (hap_ct_table[8] + hap_ct_table[9] < dxx) {
            hap_ct_table[0] -= hap_ct_table[8];
            hap_ct_table[1] -= hap_ct_table[9];
            tot_recip -= hap_ct_table[8] + hap_ct_table[9];
          }
          tot_recip = 1.0 / tot_recip;
          // rows are emitted in the order A1/A1, A2/A1, A1/A2, A2/A2
          // (previous-variant allele / next-variant allele)
          if (hap_ct_table[2] + hap_ct_table[3] >= dxx) {
            test_mishap_write_line(outfile, wptr, prev_a1len, next_a1len, marker_allele_ptrs[2 * marker_uidx_prev], marker_allele_ptrs[2 * marker_uidx_next], hap_ct_table, &(hap_ct_table[2]), tot_recip, output_min_p, tbuf2, flanklen);
          }
          if (hap_ct_table[6] + hap_ct_table[7] >= dxx) {
            test_mishap_write_line(outfile, wptr, prev_a2len, next_a1len, marker_allele_ptrs[2 * marker_uidx_prev + 1], marker_allele_ptrs[2 * marker_uidx_next], hap_ct_table, &(hap_ct_table[6]), tot_recip, output_min_p, tbuf2, flanklen);
          }
          if (hap_ct_table[4] + hap_ct_table[5] >= dxx) {
            test_mishap_write_line(outfile, wptr, prev_a1len, next_a2len, marker_allele_ptrs[2 * marker_uidx_prev], marker_allele_ptrs[2 * marker_uidx_next + 1], hap_ct_table, &(hap_ct_table[4]), tot_recip, output_min_p, tbuf2, flanklen);
          }
          if (hap_ct_table[8] + hap_ct_table[9] >= dxx) {
            test_mishap_write_line(outfile, wptr, prev_a2len, next_a2len, marker_allele_ptrs[2 * marker_uidx_prev + 1], marker_allele_ptrs[2 * marker_uidx_next + 1], hap_ct_table, &(hap_ct_table[8]), tot_recip, output_min_p, tbuf2, flanklen);
          }
        } else {
          // first variant on the chromosome: only the next variant flanks it
          hap_ct_table[0] = (int32_t)(2 * (counts[0] + counts[1] + counts[2]));
          hap_ct_table[1] = (int32_t)(2 * (counts[9] + counts[10] + counts[11]));
          tot_recip = hap_ct_table[0] + hap_ct_table[1];
          if (tot_recip == 0.0) {
            continue;
          }
          orig_cmiss_tot = hap_ct_table[0];
          orig_cnm_tot = hap_ct_table[1];
          wptr2 = strcpya(&(tbuf2[1]), &(marker_ids[marker_uidx_next * max_marker_id_len]));
          *wptr2++ = '\n';
          flanklen = (uintptr_t)(wptr2 - tbuf2);
          dxx = min_maf * tot_recip;
          hap_ct_table[2] = (int32_t)(counts[0] * 2 + counts[1]);
          hap_ct_table[3] = (int32_t)(counts[9] * 2 + counts[10]);
          hap_ct_table[4] = (int32_t)(counts[2] * 2 + counts[1]);
          hap_ct_table[5] = (int32_t)(counts[11] * 2 + counts[10]);
          if (hap_ct_table[4] + hap_ct_table[5] < dxx) {
            hap_ct_table[0] = hap_ct_table[2];
            hap_ct_table[1] = hap_ct_table[3];
            tot_recip = hap_ct_table[2] + hap_ct_table[3];
          } else if (hap_ct_table[2] + hap_ct_table[3] < dxx) {
            hap_ct_table[0] = hap_ct_table[4];
            hap_ct_table[1] = hap_ct_table[5];
            tot_recip = hap_ct_table[4] + hap_ct_table[5];
          }
          tot_recip = 1.0 / tot_recip;
          if (hap_ct_table[2] + hap_ct_table[3] >= dxx) {
            test_mishap_write_line(outfile, wptr, 0, next_a1len, nullptr, marker_allele_ptrs[2 * marker_uidx_next], hap_ct_table, &(hap_ct_table[2]), tot_recip, output_min_p, tbuf2, flanklen);
          }
          if (hap_ct_table[4] + hap_ct_table[5] >= dxx) {
            test_mishap_write_line(outfile, wptr, 0, next_a2len, nullptr, marker_allele_ptrs[2 * marker_uidx_next + 1], hap_ct_table, &(hap_ct_table[4]), tot_recip, output_min_p, tbuf2, flanklen);
          }
        }
      } else {
        // last variant on the chromosome: only the previous variant flanks
        // it.  (A chromosome reaching this loop has at least two included
        // variants, so a previous variant exists and wptr2 was set above.)
        hap_ct_table[0] = (int32_t)(2 * (counts[0] + counts[3] + counts[6]));
        hap_ct_table[1] = (int32_t)(2 * (counts[9] + counts[12] + counts[15]));
        tot_recip = hap_ct_table[0] + hap_ct_table[1];
        if (tot_recip == 0.0) {
          continue;
        }
        orig_cmiss_tot = hap_ct_table[0];
        orig_cnm_tot = hap_ct_table[1];
        *wptr2++ = '\n';
        flanklen = (uintptr_t)(wptr2 - tbuf2);
        dxx = min_maf * tot_recip;
        hap_ct_table[2] = (int32_t)(counts[0] * 2 + counts[3]);
        hap_ct_table[3] = (int32_t)(counts[9] * 2 + counts[12]);
        hap_ct_table[4] = (int32_t)(counts[6] * 2 + counts[3]);
        hap_ct_table[5] = (int32_t)(counts[15] * 2 + counts[12]);
        if (hap_ct_table[4] + hap_ct_table[5] < dxx) {
          hap_ct_table[0] = hap_ct_table[2];
          hap_ct_table[1] = hap_ct_table[3];
          tot_recip = hap_ct_table[2] + hap_ct_table[3];
        } else if (hap_ct_table[2] + hap_ct_table[3] < dxx) {
          hap_ct_table[0] = hap_ct_table[4];
          hap_ct_table[1] = hap_ct_table[5];
          tot_recip = hap_ct_table[4] + hap_ct_table[5];
        }
        tot_recip = 1.0 / tot_recip;
        if (hap_ct_table[2] + hap_ct_table[3] >= dxx) {
          test_mishap_write_line(outfile, wptr, prev_a1len, 0, marker_allele_ptrs[2 * marker_uidx_prev], nullptr, hap_ct_table, &(hap_ct_table[2]), tot_recip, output_min_p, tbuf2, flanklen);
        }
        if (hap_ct_table[4] + hap_ct_table[5] >= dxx) {
          test_mishap_write_line(outfile, wptr, prev_a2len, 0, marker_allele_ptrs[2 * marker_uidx_prev + 1], nullptr, hap_ct_table, &(hap_ct_table[4]), tot_recip, output_min_p, tbuf2, flanklen);
        }
      }
      // HETERO row: per-sample (halved) totals vs. samples heterozygous at
      // one or both flanking variants
      hap_ct_table[0] = orig_cmiss_tot * 0.5;
      hap_ct_table[1] = orig_cnm_tot * 0.5;
      hap_ct_table[2] = (int32_t)(counts[1] + counts[3] + counts[4] + counts[5] + counts[7]);
      hap_ct_table[3] = (int32_t)(counts[10] + counts[12] + counts[13] + counts[14] + counts[16]);
      test_mishap_write_line(outfile, wptr, 6, 0, "HETERO", nullptr, hap_ct_table, &(hap_ct_table[2]), 1.0 / (hap_ct_table[0] + hap_ct_table[1]), output_min_p, tbuf2, flanklen);
      inspected_ct++;
      if (!(inspected_ct % 1000)) {
        printf("\r--test-mishap: %uk loci checked.", inspected_ct / 1000);
        fflush(stdout);
      }
    }
  }

  if (fclose_null(&outfile)) {
    goto test_mishap_ret_WRITE_FAIL;
  }
  putc_unlocked('\r', stdout);
  if (inspected_ct < marker_ct) {
    LOGPRINTF("--test-mishap: %u loc%s checked (%" PRIuPTR " skipped).\n", inspected_ct, (inspected_ct == 1)? "us" : "i", marker_ct - inspected_ct);
    LOGPREPRINTFWW("Report written to %s .\n", outname);
  } else {
    LOGPREPRINTFWW("--test-mishap: %u loc%s checked, report written to %s .\n", inspected_ct, (inspected_ct == 1)? "us" : "i", outname);
  }
  logprintb();

  while (0) {
  test_mishap_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  test_mishap_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  test_mishap_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  test_mishap_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
  test_mishap_ret_INVALID_CMDLINE:
    retval = RET_INVALID_CMDLINE;
    break;
  }
  fclose_cond(outfile);
  bigstack_reset(bigstack_mark);
  return retval;
}
11695 
// Shared state read by ld_map_thread():
// - g_ld_load2_bitfield is handed to popcount_bit_idx() to turn a marker
//   index into an offset within the currently loaded second-marker block.
// - g_ld_result_bitfield holds one marker_ctv-word row per first-block
//   marker; ld_map_thread() clears the bit of each pair that fails the r^2
//   threshold.
static uintptr_t* g_ld_load2_bitfield;
static uintptr_t* g_ld_result_bitfield;
11698 
ld_map_thread(void * arg)11699 THREAD_RET_TYPE ld_map_thread(void* arg) {
11700   uintptr_t tidx = (uintptr_t)arg;
11701   uint32_t thread_ct = g_ld_thread_ct;
11702   // er, this use of "ctv" is nonstandard, probably want to fix this later
11703   uintptr_t marker_ctv = ((g_ld_marker_ct + 127) / 128) * (128 / BITCT);
11704   uintptr_t idx1_offset = g_ld_block_idx1;
11705   uintptr_t block_idx1_start = (tidx * g_ld_idx1_block_size) / thread_ct;
11706   uintptr_t block_idx1_end = ((tidx + 1) * g_ld_idx1_block_size) / thread_ct;
11707   uintptr_t founder_ct = g_ld_founder_ct;
11708   uintptr_t founder_ctwd = founder_ct / BITCT2;
11709   uintptr_t founder_ctwd12 = founder_ctwd / 12;
11710   uintptr_t founder_ctwd12_rem = founder_ctwd - (12 * founder_ctwd12);
11711   uintptr_t lshift_last = 2 * ((0x7fffffc0 - founder_ct) % BITCT2);
11712   uintptr_t founder_ct_192_long = g_ld_founder_ct_192_long;
11713   uintptr_t* geno1 = g_ld_geno1;
11714   uintptr_t* geno_masks1 = g_ld_geno_masks1;
11715   uint32_t* missing_cts1 = g_ld_missing_cts1;
11716   uint32_t founder_ct_mld_m1 = g_ld_founder_ct_mld_m1;
11717   uint32_t founder_ct_mld_rem = g_ld_founder_ct_mld_rem;
11718   uintptr_t* load2_bitfield = g_ld_load2_bitfield;
11719   uintptr_t* result_bitfield = g_ld_result_bitfield;
11720   double r2_thresh = g_ld_window_r2;
11721   int32_t dp_result[5];
11722   uintptr_t* geno_fixed_vec_ptr;
11723   uintptr_t* geno_var_vec_ptr;
11724   uintptr_t* mask_fixed_vec_ptr;
11725   uintptr_t* mask_var_vec_ptr;
11726   uintptr_t* geno2;
11727   uintptr_t* geno_masks2;
11728   uintptr_t* rb_cur;
11729   uint32_t* missing_cts2;
11730   uintptr_t block_idx1;
11731   uintptr_t block_idx2;
11732   double non_missing_ctd;
11733   double cov12;
11734   double dxx;
11735   double dyy;
11736   uint32_t marker_idx2_start;
11737   uint32_t marker_idx2;
11738   uint32_t marker_idx2_end;
11739   uint32_t fixed_missing_ct;
11740   uint32_t fixed_non_missing_ct;
11741   uint32_t non_missing_ct;
11742   uint32_t uii;
11743   while (1) {
11744     marker_idx2_start = g_ld_idx2_block_start;
11745     marker_idx2_end = g_ld_marker_ctm8;
11746     geno2 = g_ld_geno2;
11747     geno_masks2 = g_ld_geno_masks2;
11748     missing_cts2 = g_ld_missing_cts2;
11749     rb_cur = &(result_bitfield[block_idx1_start * marker_ctv]);
11750     for (block_idx1 = block_idx1_start; block_idx1 < block_idx1_end; block_idx1++, rb_cur = &(rb_cur[marker_ctv])) {
11751       marker_idx2 = block_idx1 + idx1_offset + 1;
11752       if (marker_idx2 < marker_idx2_start) {
11753 	marker_idx2 = marker_idx2_start;
11754       } else if (marker_idx2 >= marker_idx2_end) {
11755         break;
11756       }
11757       marker_idx2 = next_set(rb_cur, marker_idx2, marker_idx2_end);
11758       if (marker_idx2 == marker_idx2_end) {
11759 	continue;
11760       }
11761       fixed_missing_ct = missing_cts1[block_idx1];
11762       fixed_non_missing_ct = founder_ct - fixed_missing_ct;
11763       geno_fixed_vec_ptr = &(geno1[block_idx1 * founder_ct_192_long]);
11764       mask_fixed_vec_ptr = &(geno_masks1[block_idx1 * founder_ct_192_long]);
11765       block_idx2 = popcount_bit_idx(load2_bitfield, marker_idx2_start, marker_idx2);
11766       while (1) {
11767         geno_var_vec_ptr = &(geno2[block_idx2 * founder_ct_192_long]);
11768         mask_var_vec_ptr = &(geno_masks2[block_idx2 * founder_ct_192_long]);
11769         non_missing_ct = fixed_non_missing_ct - missing_cts2[block_idx2];
11770         if (fixed_missing_ct && missing_cts2[block_idx2]) {
11771           non_missing_ct += ld_missing_ct_intersect(mask_var_vec_ptr, mask_fixed_vec_ptr, founder_ctwd12, founder_ctwd12_rem, lshift_last);
11772 	}
11773         dp_result[0] = founder_ct;
11774         dp_result[1] = -fixed_non_missing_ct;
11775         dp_result[2] = missing_cts2[block_idx2] - founder_ct;
11776         dp_result[3] = dp_result[1];
11777         dp_result[4] = dp_result[2];
11778 	ld_dot_prod(geno_var_vec_ptr, geno_fixed_vec_ptr, mask_var_vec_ptr, mask_fixed_vec_ptr, dp_result, founder_ct_mld_m1, founder_ct_mld_rem);
11779 	non_missing_ctd = (double)((int32_t)non_missing_ct);
11780         dxx = dp_result[1];
11781         dyy = dp_result[2];
11782         cov12 = dp_result[0] * non_missing_ctd - dxx * dyy;
11783         if (cov12 * cov12 <= r2_thresh * ((dp_result[3] * non_missing_ctd + dxx * dxx) * (dp_result[4] * non_missing_ctd + dyy * dyy))) {
11784           clear_bit(marker_idx2, rb_cur);
11785 	}
11786 	uii = marker_idx2++;
11787 	if (is_set(rb_cur, marker_idx2)) {
11788 	  if (marker_idx2 == marker_idx2_end) {
11789 	    break;
11790 	  }
11791 	  block_idx2++;
11792 	} else {
11793           marker_idx2 = next_set(rb_cur, marker_idx2, marker_idx2_end);
11794 	  if (marker_idx2 == marker_idx2_end) {
11795 	    break;
11796 	  }
11797           block_idx2 += popcount_bit_idx(load2_bitfield, uii, marker_idx2);
11798 	}
11799       }
11800     }
11801     if ((!tidx) || g_is_last_thread_block) {
11802       THREAD_RETURN;
11803     }
11804     THREAD_BLOCK_FINISH(tidx);
11805   }
11806 }
11807 
int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t* marker_reverse, uint32_t* marker_idx_to_uidx, uintptr_t unfiltered_sample_ct, uintptr_t* founder_pnm, Set_info* sip, uintptr_t* set_incl, uintptr_t set_ct, uint32_t** setdefs, char* outname, char* outname_end, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, uint32_t ignore_x, uint32_t hh_exists, uint32_t*** ld_map_ptr) {
  // Takes a bunch of set definitions, and determines which pairs of same-set
  // markers reach/exceed the --set-r2 threshold, saving them (in setdef
  // format) to a newly stack-allocated ld_map[].
  // If --set-r2 write was specified, the map's contents are written to {output
  // prefix}.ldset.
  // Note that, when very large set(s) are present, and there's a moderate
  // amount of "random" long-range LD, the memory requirement may be huge.
  //
  // Returns 0 on success, or one of the RET_* codes assigned at the bottom of
  // this function on failure.
  // *ld_map_ptr is assigned right after the top-level pointer array is
  // allocated, i.e. before any of the later failure points.
  // Everything obtained here through bigstack_end_alloc*() is released by the
  // final bigstack_end_reset(); ld_map itself comes from bigstack_alloc() and
  // is not released by this function.
  // When SET_R2_WRITE is set, the bytes at outname_end are temporarily
  // replaced with ".ldset".  NOTE(review): they are restored only on the
  // success path; the error gotos skip the restoring memcpy().
  FILE* outfile = nullptr;
  unsigned char* bigstack_end_mark = g_bigstack_end;
  uintptr_t marker_ctv = ((marker_ct + 127) / 128) * (128 / BITCT);
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t max_set_id_len = sip->max_name_len;
  uintptr_t founder_ct = popcount_longs(founder_pnm, unfiltered_sample_ctl);
  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
  uintptr_t founder_ctv2 = founder_ctl * 2;
  uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
  uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
#ifdef __LP64__
  uintptr_t founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 192;
#else
  uintptr_t founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 48;
#endif
  // number of words in each per-marker genotype (and mask) vector
  uintptr_t founder_ct_192_long = founder_ct_mld_m1 * (MULTIPLEX_LD / BITCT2) + founder_ct_mld_rem * (192 / BITCT2);
  uintptr_t final_mask = get_final_mask(founder_ct);
  // words in each per-marker vector beyond the first founder_ctl * 2
  uint32_t founder_trail_ct = founder_ct_192_long - founder_ctl * 2;
  uint32_t marker_idx = 0;
  uint32_t chrom_fo_idx = 0;
  uint32_t chrom_idx = 0;
  uint32_t is_haploid = 0;
  uint32_t is_x = 0;
  uint32_t is_y = 0;
  uint32_t range_end = 0;
  int32_t retval = 0;
  char charbuf[8];
  uintptr_t* loadbuf;
  uintptr_t* load2_bitfield;
  uintptr_t* founder_include2;
  uintptr_t* founder_male_include2;
  uintptr_t* tmp_set_bitfield;
  uintptr_t* geno1;
  uintptr_t* geno_masks1;
  uintptr_t* geno2;
  uintptr_t* geno_masks2;
  uintptr_t* result_bitfield;
  uintptr_t* rb_ptr;
  uintptr_t* loadbuf_ptr;
  uint32_t** ld_map;
  uint32_t* cur_setdef;
  uint32_t* cur_setdef2;
  char* sptr;
  char* wptr_start;
  char* wptr;
  uintptr_t memreq1;
  uintptr_t memreq2;
  uintptr_t minmem;
  uintptr_t idx1_block_size;
  uintptr_t idx2_block_size;
  uintptr_t cur_idx2_block_size;
  uintptr_t firstw;
  uintptr_t wlen;
  uintptr_t marker_uidx;
  uintptr_t marker_uidx2;
  uintptr_t ulii;
  uintptr_t uljj;
  uint32_t thread_ct;
  uint32_t chrom_end;
  uint32_t set_idx;
  uint32_t set_uidx;
  uint32_t idx1_block_end;
  uint32_t marker_idx2;
  uint32_t load_idx2_tot;
  uint32_t marker_load_idx2;
  uint32_t block_idx1;
  uint32_t block_idx2;
  uint32_t setdef_incr_aux;
  uint32_t setdef_incr_aux2;
  uint32_t is_last_block;
  uint32_t range_start;
  uint32_t uii;
  if (!founder_ct) {
    logerrprint("Error: Cannot construct LD map, since there are no founders with nonmissing\nphenotypes.  (--make-founders may come in handy here.)\n");
    goto construct_ld_map_ret_INVALID_CMDLINE;
  }
  // one setdef pointer per filtered marker
  ld_map = (uint32_t**)bigstack_alloc(marker_ct * sizeof(intptr_t));
  if (!ld_map) {
    goto construct_ld_map_ret_NOMEM;
  }
  *ld_map_ptr = ld_map;
  // To avoid too much back-and-forth disk seeking for large datasets, we
  // construct the LD map in blocks, using similar logic to the --r/--r2 and
  // --fast-epistasis computations.
  // 1. bigstack_end_alloc space for main window markers' raw data, bitfields
  //    for them listing intersecting markers in front (i.e. we only look at
  //    the upper right triangle of the LD matrix), and another union bitfield.
  //    Break the union into secondary windows, and for each secondary window:
  //    a. bigstack_end_alloc secondary window markers' raw data
  //    b. perform multithreaded LD calculations, saving results via in-place
  //       clearing of the first markers' bitfields
  //    Memory requirement per main window marker is:
  //      96 bytes per 192 founders for raw data (rounded up)
  //      32 bytes per 128 filtered markers (rounded up), for the results (16
  //      working, 16 final)
  //      4 bytes for missing_ct
  //      16 extra bytes to ensure enough setdef compression workspace
  //    Memory req. per secondary window marker is 4 + 96 bytes/192 founders.
  //    To reduce false sharing risk, each thread is assigned at least 4
  //    markers.
  // 2. populate the bottom left triangle of the result matrix by referring to
  //    earlier results
  // 3. save final results for each marker in compressed setdef format at the
  //    current workspace bottom
  // 4. dump .ldset file if necessary
  loadbuf = (uintptr_t*)bigstack_end_alloc(unfiltered_sample_ct4);
  if (!loadbuf) {
    // separate since unfiltered_sample_ct4 is a byte, not word, count
    goto construct_ld_map_ret_NOMEM;
  }
  // These four buffers persist across all main-window rounds;
  // founder_male_include2 is the last one requested, and is used below as the
  // reset point when per-round buffers are released.
  if (bigstack_end_alloc_ul(marker_ctv, &load2_bitfield) ||
      bigstack_end_alloc_ul(marker_ctv, &tmp_set_bitfield) ||
      bigstack_end_alloc_ul(founder_ctv2, &founder_include2) ||
      bigstack_end_alloc_ul(founder_ctv2, &founder_male_include2)) {
    goto construct_ld_map_ret_NOMEM;
  }
  // bugfix: last word might not be initialized by unpack_set().  Also
  // initialize second-to-last word to defend against an unpack_set()
  // implementation change.
#ifndef __LP64__
  // oh, this also matters in 32-bit case
  tmp_set_bitfield[marker_ctv - 4] = 0;
  tmp_set_bitfield[marker_ctv - 3] = 0;
#endif
  tmp_set_bitfield[marker_ctv - 2] = 0;
  tmp_set_bitfield[marker_ctv - 1] = 0;
  g_ld_load2_bitfield = load2_bitfield;
  // NOTE(review): any status returned by this call is ignored here; confirm
  // it cannot fail when both output buffers are preallocated as above.
  alloc_collapsed_haploid_filters(founder_pnm, sex_male, unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, &founder_include2, &founder_male_include2);
  // bytes per secondary-window marker: genotype vector + mask vector + a
  // 4-byte missing count
  memreq2 = founder_ct_192_long * sizeof(intptr_t) * 2 + 4;

  // this guarantees enough room for save_set_bitfield() worst case
  memreq1 = memreq2 + marker_ctv * sizeof(intptr_t) * 2 + 16;

  // Each half of the workspace must hold at least BITCT secondary-window
  // markers and at least 4 main-window markers (the granularities the block
  // sizes are rounded down to below).
  minmem = memreq2 * BITCT;
  if (minmem < memreq1 * 4) {
    minmem = memreq1 * 4;
  }
  // publish loop-invariant parameters for ld_map_thread()
  g_ld_marker_ct = marker_ct;
  g_ld_founder_ct = founder_ct;
  g_ld_founder_ct_192_long = founder_ct_192_long;
  g_ld_founder_ct_mld_m1 = founder_ct_mld_m1;
  g_ld_founder_ct_mld_rem = founder_ct_mld_rem;
  g_ld_window_r2 = sip->set_r2 * (1 - SMALL_EPSILON);
  // One iteration per main window [marker_idx, idx1_block_end).
  do {
    // half of the remaining workspace goes to the main window, half to the
    // secondary window
    ulii = bigstack_left() / 2;
    if (ulii < minmem) {
      goto construct_ld_map_ret_NOMEM;
    }
    // round down to a multiple of 4, then clip to the markers that remain
    idx1_block_size = (ulii / memreq1) & (~(3 * ONELU));
    if (idx1_block_size > marker_ct - marker_idx) {
      idx1_block_size = marker_ct - marker_idx;
    }
    // at least 4 main-window markers per thread, but never fewer than 1
    // thread
    thread_ct = g_thread_ct;
    if (thread_ct > idx1_block_size / 4) {
      thread_ct = idx1_block_size / 4;
      if (!thread_ct) {
	thread_ct = 1;
      }
    }
    g_ld_thread_ct = thread_ct;
    // round down to a multiple of BITCT
    idx2_block_size = (ulii / memreq2) & (~(BITCT - ONELU));
    if (idx2_block_size > marker_ct) {
      idx2_block_size = marker_ct;
    }
    g_ld_block_idx1 = marker_idx;
    g_ld_idx1_block_size = idx1_block_size;
    // NOTE(review): the results of these per-round allocations are not
    // checked.  The block sizes were derived from bigstack_left() / 2 above,
    // so they are presumably expected to always fit; confirm that per-call
    // alignment overhead cannot make one of them fail.
    bigstack_end_alloc_ul(idx1_block_size * founder_ct_192_long, &geno1);
    bigstack_end_alloc_ul(idx1_block_size * founder_ct_192_long, &geno_masks1);
    bigstack_end_alloc_ui(idx1_block_size, &g_ld_missing_cts1);
    bigstack_end_alloc_ul(idx2_block_size * founder_ct_192_long, &geno2);
    bigstack_end_alloc_ul(idx2_block_size * founder_ct_192_long, &geno_masks2);
    bigstack_end_alloc_ui(idx2_block_size, &g_ld_missing_cts2);
    bigstack_end_alloc_ul(idx1_block_size * marker_ctv, &result_bitfield);
    // Zero the last (founder_trail_ct + 2) words of every per-marker vector.
    // NOTE(review): presumably these are padding words the loaders do not
    // fully overwrite; confirm.
    uljj = founder_trail_ct + 2;
    for (ulii = 1; ulii <= idx1_block_size; ulii++) {
      fill_ulong_zero(uljj, &(geno1[ulii * founder_ct_192_long - uljj]));
      fill_ulong_zero(uljj, &(geno_masks1[ulii * founder_ct_192_long - uljj]));
    }
    for (ulii = 1; ulii <= idx2_block_size; ulii++) {
      fill_ulong_zero(uljj, &(geno2[ulii * founder_ct_192_long - uljj]));
      fill_ulong_zero(uljj, &(geno_masks2[ulii * founder_ct_192_long - uljj]));
    }
    fill_ulong_zero(idx1_block_size * marker_ctv, result_bitfield);
    g_ld_geno1 = geno1;
    g_ld_geno_masks1 = geno_masks1;
    g_ld_geno2 = geno2;
    g_ld_geno_masks2 = geno_masks2;
    g_ld_result_bitfield = result_bitfield;
    idx1_block_end = marker_idx + idx1_block_size;
    fill_ulong_zero(marker_ctv, load2_bitfield);
    // NOTE(review): result_bitfield was already zeroed a few lines above and
    // nothing has written to it since, so this second clear is redundant.
    fill_ulong_zero(idx1_block_size * marker_ctv, result_bitfield);
    // For every set with a member inside the current main window: OR the set's
    // bitfield into the result row of each in-window member (candidate pairs),
    // then OR the part of the set after its first in-window member into
    // load2_bitfield (the union of markers that must be loaded for secondary
    // windows).
    for (set_idx = 0; set_idx < set_ct; set_idx++) {
      cur_setdef = setdefs[set_idx];
      setdef_iter_init(cur_setdef, marker_ct, marker_idx, &marker_idx2, &setdef_incr_aux);
      if (setdef_iter(cur_setdef, &marker_idx2, &setdef_incr_aux) && (marker_idx2 < idx1_block_end)) {
	unpack_set(marker_ct, cur_setdef, tmp_set_bitfield);
        get_set_wrange_align(tmp_set_bitfield, marker_ctv, &firstw, &wlen);
	if (wlen) {
	  // remember the first in-window member
	  uii = marker_idx2;
	  do {
	    bitvec_or(&(tmp_set_bitfield[firstw]), wlen, &(result_bitfield[((marker_idx2 - marker_idx) * marker_ctv + firstw)]));
	    marker_idx2++;
	    next_set_ck(tmp_set_bitfield, idx1_block_end, &marker_idx2);
	  } while (marker_idx2 < idx1_block_end);
	  // don't need to load the first intersecting member or anything
	  // before it, since we're only traversing the upper right triangle
	  // wlen now holds the end word index rather than a length
	  wlen += firstw;
#ifdef __LP64__
	  firstw = 2 * (uii / 128);
#else
	  firstw = uii / 32;
#endif
	  clear_bits(0, uii + 1 - firstw * BITCT, &(tmp_set_bitfield[firstw]));
	  bitvec_or(&(tmp_set_bitfield[firstw]), wlen - firstw, &(load2_bitfield[firstw]));
	}
      }
    }
    load_idx2_tot = popcount_longs(load2_bitfield, marker_ctv);
    if (!load_idx2_tot) {
      // no new r^2 computations to make at all!
      goto construct_ld_map_no_new;
    }
    // Find the unfiltered index of the first main-window marker and seek to
    // its record in the .bed file.
    marker_uidx = next_unset_unsafe(marker_exclude, 0);
    if (marker_idx) {
      marker_uidx = jump_forward_unset_unsafe(marker_exclude, marker_uidx + 1, marker_idx);
    }
    marker_uidx2 = marker_uidx;
    if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto construct_ld_map_ret_READ_FAIL;
    }
    // chrom_end == 0 forces the chromosome properties to be looked up for the
    // first marker processed
    chrom_end = 0;
    // Load and preprocess the main-window genotypes.
    for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx++, block_idx1++) {
      if (IS_SET(marker_exclude, marker_uidx)) {
        // skip over excluded markers; a seek is needed since the file
        // position no longer matches
        marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
        if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
          goto construct_ld_map_ret_READ_FAIL;
	}
      }
      if (marker_uidx >= chrom_end) {
        chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx);
        chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
	chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
        is_haploid = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
        is_x = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[X_OFFSET]);
        is_y = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[Y_OFFSET]);
      }
      ulii = block_idx1 * founder_ct_192_long;
      loadbuf_ptr = &(geno1[ulii]);
      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_pnm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf, loadbuf_ptr)) {
	goto construct_ld_map_ret_READ_FAIL;
      }
      if (is_haploid && hh_exists) {
        haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)loadbuf_ptr);
      }
      ld_process_load2(loadbuf_ptr, &(geno_masks1[ulii]), &(g_ld_missing_cts1[block_idx1]), founder_ct, is_x && (!ignore_x), founder_male_include2);
    }
    // Secondary windows: walk the markers flagged in load2_bitfield, at most
    // cur_idx2_block_size at a time.
    chrom_end = 0;
    cur_idx2_block_size = idx2_block_size;
    marker_idx2 = next_set_unsafe(load2_bitfield, 0);
    marker_uidx2 = jump_forward_unset_unsafe(marker_exclude, marker_uidx2 + 1, marker_idx2 - marker_idx);
    marker_load_idx2 = 0;
    if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
      goto construct_ld_map_ret_READ_FAIL;
    }
    do {
      // the final secondary window may be short
      if (cur_idx2_block_size > load_idx2_tot - marker_load_idx2) {
	cur_idx2_block_size = load_idx2_tot - marker_load_idx2;
      }
      g_ld_idx2_block_start = marker_idx2;
      block_idx2 = 0;
      while (1) {
	if (marker_uidx2 >= chrom_end) {
	  chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx2);
	  chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
	  chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
	  is_haploid = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
	  is_x = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[X_OFFSET]);
	  is_y = (((int32_t)chrom_idx) == chrom_info_ptr->xymt_codes[Y_OFFSET]);
	}
	ulii = block_idx2 * founder_ct_192_long;
	loadbuf_ptr = &(geno2[ulii]);
	if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_pnm, final_mask, IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf, loadbuf_ptr)) {
	  goto construct_ld_map_ret_READ_FAIL;
	}
	if (is_haploid && hh_exists) {
	  haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)loadbuf_ptr);
	}
	ld_process_load2(loadbuf_ptr, &(geno_masks2[ulii]), &(g_ld_missing_cts2[block_idx2]), founder_ct, is_x && (!ignore_x), founder_male_include2);
	if (++block_idx2 == cur_idx2_block_size) {
	  // window full; marker_idx2 / marker_uidx2 still refer to the marker
	  // just loaded
	  break;
	}
	// Advance to the next marker flagged in load2_bitfield.  ulii keeps
	// the file-contiguous successor so that a seek is only issued when
	// markers are actually skipped.
        uii = marker_idx2++;
	ulii = ++marker_uidx2;
        if (is_set(load2_bitfield, marker_idx2)) {
	  next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx2);
	} else {
          marker_idx2 = next_set_unsafe(load2_bitfield, marker_idx2);
          marker_uidx2 = jump_forward_unset_unsafe(marker_exclude, marker_uidx2, marker_idx2 - uii);
	}
	if (ulii < marker_uidx2) {
	  if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
	    goto construct_ld_map_ret_READ_FAIL;
	  }
	}
      }
      // one past the filtered index of the last marker loaded into this
      // secondary window
      g_ld_marker_ctm8 = marker_idx2 + 1; // repurposed
      marker_load_idx2 += cur_idx2_block_size;
      is_last_block = (marker_load_idx2 == load_idx2_tot);
      if (spawn_threads2(threads, &ld_map_thread, thread_ct, is_last_block)) {
	goto construct_ld_map_ret_THREAD_CREATE_FAIL;
      }
      // the calling thread runs the tidx == 0 share of the work itself
      ld_map_thread((void*)0);
      join_threads2(threads, thread_ct, is_last_block);
    } while (!is_last_block);
  construct_ld_map_no_new:
    // Steps 2-3: for each main-window marker, fix up the bits below the
    // diagonal using the already-saved ld_map entries of earlier markers,
    // drop the marker's own bit, and save the row in setdef format.
    for (block_idx1 = marker_idx; block_idx1 < idx1_block_end; block_idx1++) {
      rb_ptr = &(result_bitfield[(block_idx1 - marker_idx) * marker_ctv]);
      marker_idx2 = 0;
      while (1) {
	marker_idx2 = next_set(rb_ptr, marker_idx2, block_idx1);
	if (marker_idx2 == block_idx1) {
	  clear_bit(block_idx1, rb_ptr);
	  break;
	}
	// keep an earlier marker only if its own saved entry lists this one
	if (!in_setdef(ld_map[marker_idx2], block_idx1)) {
	  clear_bit(marker_idx2, rb_ptr);
	}
	marker_idx2++;
      }
      // range_end is only refreshed when the row is nonempty; otherwise the
      // value from a previous row (or the initial 0) is passed through
      range_start = next_set(rb_ptr, 0, marker_ct);
      if (range_start != marker_ct) {
	range_end = last_set_bit(rb_ptr, marker_ctv) + 1;
      }
      save_set_bitfield(rb_ptr, marker_ct, range_start, range_end, 0, &(ld_map[block_idx1]));
    }
    // free previous round of allocations
    bigstack_end_reset(founder_male_include2);
    marker_idx = idx1_block_end;
  } while (marker_idx < marker_ct);
  if (sip->modifier & SET_R2_WRITE) {
    // save the bytes about to be overwritten by the ".ldset" extension
    memcpy(charbuf, outname_end, 8);
    memcpy(outname_end, ".ldset", 7);
    if (fopen_checked(outname, "w", &outfile)) {
      goto construct_ld_map_ret_OPEN_FAIL;
    }
    // Output format, one line per (set, member) pair that has at least one
    // in-set LD partner:
    //   <set name> <member ID> <partner ID> <partner ID> ...
    set_uidx = 0;
    for (set_idx = 0; set_idx < set_ct; set_uidx++, set_idx++) {
      next_set_unsafe_ck(set_incl, &set_uidx);
      sptr = &(sip->names[set_uidx * max_set_id_len]);
      uii = strlen(sptr);
      wptr_start = memcpyax(g_textbuf, sptr, uii, ' ');
      cur_setdef = setdefs[set_idx];
      // marker_idx is reused here as the iterator over the set's members
      setdef_iter_init(cur_setdef, marker_ct, 0, &marker_idx, &setdef_incr_aux);

      while (setdef_iter(cur_setdef, &marker_idx, &setdef_incr_aux)) {
        cur_setdef2 = ld_map[marker_idx];
	// cur_setdef2 can contain variants outside of the current set, so we
	// need to look at the intersection.
        setdef_iter_init(cur_setdef2, marker_ct, 0, &marker_idx2, &setdef_incr_aux2);
        uii = 0; // now this tracks whether a first match has been found
	while (setdef_iter(cur_setdef2, &marker_idx2, &setdef_incr_aux2)) {
	  if (in_setdef(cur_setdef, marker_idx2)) {
	    if (!uii) {
	      // line prefix is written lazily, on the first in-set partner
	      uii = 1;
	      wptr = strcpyax(wptr_start, &(marker_ids[marker_idx_to_uidx[marker_idx] * max_marker_id_len]), ' ');
	      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
		goto construct_ld_map_ret_WRITE_FAIL;
	      }
	    }
	    fputs(&(marker_ids[marker_idx_to_uidx[marker_idx2] * max_marker_id_len]), outfile);
	    putc_unlocked(' ', outfile);
	  }
	  marker_idx2++;
	}
	if (uii) {
	  if (putc_checked('\n', outfile)) {
	    goto construct_ld_map_ret_WRITE_FAIL;
	  }
	}
        marker_idx++;
      }
    }
    if (fclose_null(&outfile)) {
      goto construct_ld_map_ret_WRITE_FAIL;
    }
    LOGPRINTFWW("--set-r2 write: LD map written to %s .\n", outname);
    // restore the caller's filename suffix
    memcpy(outname_end, charbuf, 8);
  } else {
    logprint("LD map constructed.\n");
  }
  // The error labels live inside a never-entered loop: normal flow skips the
  // body, and each label's `break` continues at the shared cleanup below.
  while (0) {
  construct_ld_map_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  construct_ld_map_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  construct_ld_map_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  construct_ld_map_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
  construct_ld_map_ret_INVALID_CMDLINE:
    retval = RET_INVALID_CMDLINE;
    break;
  construct_ld_map_ret_THREAD_CREATE_FAIL:
    retval = RET_THREAD_CREATE_FAIL;
    break;
  }
  fclose_cond(outfile);
  bigstack_end_reset(bigstack_end_mark);
  return retval;
}
12231 
set_test_score(uintptr_t marker_ct,double chisq_threshold,uint32_t set_max,double * chisq_arr,uint32_t ** ld_map,uint32_t * cur_setdef,double * sorted_chisq_buf,uint32_t * sorted_marker_idx_buf,uint32_t * proxy_arr,uint32_t * raw_sig_ct_ptr,uint32_t * final_sig_ct_ptr,double * set_score_ptr)12232 void set_test_score(uintptr_t marker_ct, double chisq_threshold, uint32_t set_max, double* chisq_arr, uint32_t** ld_map, uint32_t* cur_setdef, double* sorted_chisq_buf, uint32_t* sorted_marker_idx_buf, uint32_t* proxy_arr, uint32_t* raw_sig_ct_ptr, uint32_t* final_sig_ct_ptr, double* set_score_ptr) {
12233   // set score statistic = mean of chi-square statistics of set
12234   // representatives.  --linear t statistics are converted to same-p-value 1df
12235   // chi-square stats out of necessity; in theory, this hack could be applied
12236   // to e.g. Fisher's exact test and the variable-df genotypic test as well,
12237   // but I'll hold off on that until/unless it's specifically requested.
12238 
12239   // sort variants by p-value, then iterate over setdefs, greedily selecting up
12240   // to sip->set_max significant independent variants from each.
12241   double chi_sum = 0.0;
12242   uint32_t raw_sig_ct = 0;
12243   uint32_t final_sig_ct = 0;
12244   uint32_t marker_idx;
12245   uint32_t setdef_incr_aux;
12246   uint32_t raw_idx;
12247   uint32_t ld_conflict;
12248   uint32_t uii;
12249   setdef_iter_init(cur_setdef, marker_ct, 0, &marker_idx, &setdef_incr_aux);
12250   while (setdef_iter(cur_setdef, &marker_idx, &setdef_incr_aux)) {
12251     if (chisq_arr[marker_idx] >= chisq_threshold) {
12252       sorted_chisq_buf[raw_sig_ct] = chisq_arr[marker_idx];
12253       sorted_marker_idx_buf[raw_sig_ct] = marker_idx;
12254       raw_sig_ct++;
12255     }
12256     marker_idx++;
12257   }
12258   if (!raw_sig_ct) {
12259     // not possible for initial pass, so no need to set raw_sig_ct_ptr, etc.
12260     // bugfix: actually, that comment was incorrect
12261     *set_score_ptr = 0.0;
12262     return;
12263   }
12264   qsort_ext2((char*)sorted_chisq_buf, raw_sig_ct, sizeof(double), double_cmp_deref, (char*)sorted_marker_idx_buf, sizeof(int32_t), (char*)proxy_arr, sizeof(double) + sizeof(int32_t));
12265   raw_idx = raw_sig_ct;
12266   do {
12267     raw_idx--;
12268     ld_conflict = 0;
12269     marker_idx = sorted_marker_idx_buf[raw_idx];
12270     for (uii = 0; uii < final_sig_ct; uii++) {
12271       if (in_setdef(ld_map[proxy_arr[uii]], marker_idx)) {
12272 	ld_conflict = 1;
12273 	break;
12274       }
12275     }
12276     if (!ld_conflict) {
12277       proxy_arr[final_sig_ct] = marker_idx;
12278       chi_sum += sorted_chisq_buf[raw_idx];
12279       if (++final_sig_ct == set_max) {
12280 	break;
12281       }
12282     }
12283   } while (raw_idx);
12284   *set_score_ptr = chi_sum / ((double)((int32_t)final_sig_ct));
12285   if (raw_sig_ct_ptr) {
12286     *raw_sig_ct_ptr = raw_sig_ct;
12287     *final_sig_ct_ptr = final_sig_ct;
12288   }
12289 }
12290 
set_test_common_init(pthread_t * threads,FILE * bedfile,uintptr_t bed_offset,char * outname,char * outname_end,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude_orig,uintptr_t marker_ct_orig,char * marker_ids,uintptr_t max_marker_id_len,uintptr_t * marker_reverse,double * orig_chisq,Set_info * sip,Chrom_info * chrom_info_ptr,uintptr_t unfiltered_sample_ct,uintptr_t * sex_male,uintptr_t * founder_pnm,uint32_t ld_ignore_x,uint32_t hh_exists,const char * flag_descrip,uintptr_t * marker_ct_ptr,uintptr_t ** marker_exclude_ptr,uintptr_t ** set_incl_ptr,uint32_t ** marker_idx_to_uidx_ptr,uint32_t *** setdefs_ptr,uintptr_t * set_ct_ptr,uint32_t * max_sigset_size_ptr,uint32_t *** ld_map_ptr,double * chisq_threshold_ptr,double ** orig_set_scores_ptr,double ** sorted_chisq_buf_ptr,uint32_t ** sorted_marker_idx_buf_ptr,uint32_t ** proxy_arr_ptr,uintptr_t ** perm_adapt_set_unstopped_ptr,uint32_t ** perm_2success_ct_ptr,uint32_t ** perm_attempt_ct_ptr,uintptr_t ** unstopped_markers_ptr)12291 int32_t set_test_common_init(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, double* orig_chisq, Set_info* sip, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sex_male, uintptr_t* founder_pnm, uint32_t ld_ignore_x, uint32_t hh_exists, const char* flag_descrip, uintptr_t* marker_ct_ptr, uintptr_t** marker_exclude_ptr, uintptr_t** set_incl_ptr, uint32_t** marker_idx_to_uidx_ptr, uint32_t*** setdefs_ptr, uintptr_t* set_ct_ptr, uint32_t* max_sigset_size_ptr, uint32_t*** ld_map_ptr, double* chisq_threshold_ptr, double** orig_set_scores_ptr, double** sorted_chisq_buf_ptr, uint32_t** sorted_marker_idx_buf_ptr, uint32_t** proxy_arr_ptr, uintptr_t** perm_adapt_set_unstopped_ptr, uint32_t** perm_2success_ct_ptr, uint32_t** perm_attempt_ct_ptr, uintptr_t** 
unstopped_markers_ptr) {
12292   // Assumes *marker_ct_ptr has value marker_ct_mid, and marker_exclude_ptr
12293   // initially points to marker_exclude_mid.
12294   // Side effect: allocates set_incl, marker_idx_to_uidx, ld_map, and several
12295   // other arrays on stack
12296   uintptr_t marker_ct_mid = *marker_ct_ptr;
12297   uintptr_t marker_ct = marker_ct_mid;
12298   uintptr_t raw_set_ct = sip->ct;
12299   uintptr_t raw_set_ctl = BITCT_TO_WORDCT(raw_set_ct);
12300   uintptr_t set_ct = 0;
12301   uintptr_t* marker_exclude_mid = *marker_exclude_ptr;
12302   double chisq_threshold = inverse_chiprob(sip->set_p, 1);
12303   uint32_t max_sigset_size = 0;
12304   int32_t retval = 0;
12305   uintptr_t marker_midx;
12306   uintptr_t set_uidx;
12307   uintptr_t* set_incl;
12308   uintptr_t* cur_bitfield;
12309   double* chisq_ptr;
12310   double* chisq_end;
12311   double* orig_set_scores;
12312   uint32_t** setdefs;
12313   uint32_t* marker_midx_to_idx;
12314   uint32_t* cur_setdef;
12315   uintptr_t marker_idx;
12316   uintptr_t set_idx;
12317   uint32_t range_ct;
12318   uint32_t range_idx;
12319   uint32_t range_offset;
12320   uint32_t range_stop;
12321   uint32_t include_out_of_bounds;
12322   uint32_t cur_set_size;
12323   uint32_t cur_range_size;
12324   uint32_t uii;
12325   if (bigstack_calloc_ul(raw_set_ctl, set_incl_ptr) ||
12326       bigstack_alloc_ui(marker_ct_orig, &marker_midx_to_idx)) {
12327     goto set_test_common_init_ret_NOMEM;
12328   }
12329   set_incl = *set_incl_ptr;
12330   fill_midx_to_idx(marker_exclude_orig, marker_exclude_mid, marker_ct, marker_midx_to_idx);
12331 
12332   // determine which sets contain at least one significant marker.  do not
12333   // attempt to calculate the sum statistic yet: we need the LD map for that.
12334   for (set_uidx = 0; set_uidx < raw_set_ct; set_uidx++) {
12335     cur_setdef = sip->setdefs[set_uidx];
12336     range_ct = cur_setdef[0];
12337     cur_set_size = 0;
12338     uii = 0; // found a significant marker?
12339     if (range_ct != 0xffffffffU) {
12340       for (range_idx = 0; range_idx < range_ct; range_idx++) {
12341         marker_midx = *(++cur_setdef);
12342         range_stop = *(++cur_setdef);
12343 	cur_range_size = range_stop - marker_midx;
12344         cur_set_size += cur_range_size;
12345         if (!uii) {
12346           chisq_ptr = &(orig_chisq[marker_midx_to_idx[marker_midx]]);
12347           chisq_end = &(chisq_ptr[cur_range_size]);
12348 	  for (; chisq_ptr < chisq_end; chisq_ptr++) {
12349 	    if (*chisq_ptr >= chisq_threshold) {
12350 	      uii = 1;
12351               break;
12352 	    }
12353 	  }
12354 	}
12355       }
12356     } else {
12357       range_offset = cur_setdef[1];
12358       range_stop = cur_setdef[2];
12359       include_out_of_bounds = cur_setdef[3];
12360       cur_bitfield = (uintptr_t*)(&(cur_setdef[4]));
12361       if (include_out_of_bounds && range_offset) {
12362         for (marker_midx = 0; marker_midx < range_offset; marker_midx++) {
12363 	  // all initial markers guaranteed to be in union, no
12364 	  // marker_midx_to_idx lookup needed
12365           if (orig_chisq[marker_midx] >= chisq_threshold) {
12366             uii = 1;
12367             break;
12368 	  }
12369 	}
12370         cur_set_size += range_offset;
12371       }
12372       cur_set_size += popcount_longs(cur_bitfield, ((range_stop + 127) / 128) * (128 / BITCT));
12373       if (!uii) {
12374         for (marker_midx = 0; marker_midx < range_stop; marker_midx++) {
12375           if (IS_SET(cur_bitfield, marker_midx)) {
12376             if (orig_chisq[marker_midx_to_idx[marker_midx + range_offset]] >= chisq_threshold) {
12377               uii = 1;
12378               break;
12379 	    }
12380 	  }
12381 	}
12382       }
12383       if (include_out_of_bounds && (range_offset + range_stop < marker_ct_orig)) {
12384         cur_set_size += marker_ct_orig - range_offset - range_stop;
12385         if (!uii) {
12386           for (marker_idx = marker_midx_to_idx[range_offset + range_stop]; marker_idx < marker_ct; marker_idx++) {
12387 	    // all trailing markers guaranteed to be in union
12388             if (orig_chisq[marker_idx] >= chisq_threshold) {
12389               uii = 1;
12390               break;
12391 	    }
12392 	  }
12393 	}
12394       }
12395     }
12396     if (uii) {
12397       SET_BIT(set_uidx, set_incl);
12398       set_ct++;
12399       if (cur_set_size > max_sigset_size) {
12400 	max_sigset_size = cur_set_size;
12401       }
12402     }
12403   }
12404   if (!set_ct) {
12405     logerrprint("Warning: No significant variants in any set.  Skipping permutation-based set\ntest.\n");
12406     goto set_test_common_init_ret_1;
12407   }
12408   LOGPRINTFWW("%s set test: Testing %" PRIuPTR " set%s with at least one significant variant.\n", flag_descrip, set_ct, (set_ct == 1)? "" : "s");
12409   bigstack_reset((unsigned char*)marker_midx_to_idx);
12410   if (set_ct < raw_set_ct) {
12411     marker_ct = marker_ct_orig;
12412     if (extract_set_union_unfiltered(sip, set_incl, unfiltered_marker_ct, marker_exclude_orig, marker_exclude_ptr, &marker_ct)) {
12413       goto set_test_common_init_ret_NOMEM;
12414     }
12415   }
12416   // Okay, we've pruned all we can, now it's time to suck it up and construct
12417   // the potentially huge LD map
12418   if (bigstack_alloc_ui(marker_ct, marker_idx_to_uidx_ptr)) {
12419     goto set_test_common_init_ret_NOMEM;
12420   }
12421   fill_idx_to_uidx(*marker_exclude_ptr, unfiltered_marker_ct, marker_ct, *marker_idx_to_uidx_ptr);
12422   if (marker_ct < marker_ct_orig) {
12423     if (setdefs_compress(sip, set_incl, set_ct, unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, *marker_exclude_ptr, marker_ct, setdefs_ptr)) {
12424       goto set_test_common_init_ret_NOMEM;
12425     }
12426   } else {
12427     *setdefs_ptr = sip->setdefs;
12428   }
12429   setdefs = *setdefs_ptr;
12430   retval = construct_ld_map(threads, bedfile, bed_offset, *marker_exclude_ptr, marker_ct, marker_reverse, *marker_idx_to_uidx_ptr, unfiltered_sample_ct, founder_pnm, sip, set_incl, set_ct, setdefs, outname, outname_end, marker_ids, max_marker_id_len, sex_male, chrom_info_ptr, ld_ignore_x, hh_exists, ld_map_ptr);
12431   if (retval) {
12432     goto set_test_common_init_ret_1;
12433   }
12434   if (marker_ct_mid != marker_ct) {
12435     // caller needs to collapse other arrays
12436     inplace_delta_collapse_arr((char*)orig_chisq, sizeof(double), marker_ct_mid, marker_ct, marker_exclude_mid, *marker_exclude_ptr);
12437   }
12438   if (bigstack_alloc_d(set_ct, orig_set_scores_ptr) ||
12439       bigstack_alloc_d(max_sigset_size, sorted_chisq_buf_ptr) ||
12440       bigstack_alloc_ui(max_sigset_size, sorted_marker_idx_buf_ptr) ||
12441       // 3 int32s = max(sizeof(double), sizeof(intptr_t)) + sizeof(int32_t)
12442       bigstack_alloc_ui(max_sigset_size * 3LU, proxy_arr_ptr)) {
12443     goto set_test_common_init_ret_NOMEM;
12444   }
12445   orig_set_scores = *orig_set_scores_ptr;
12446   for (set_idx = 0; set_idx < set_ct; set_idx++) {
12447     // we're calling this again during final write anyway, so don't bother
12448     // saving raw_sig_ct or final_sig_ct now
12449     set_test_score(marker_ct, chisq_threshold, sip->set_max, orig_chisq, *ld_map_ptr, setdefs[set_idx], *sorted_chisq_buf_ptr, *sorted_marker_idx_buf_ptr, *proxy_arr_ptr, nullptr, nullptr, &(orig_set_scores[set_idx]));
12450   }
12451   // just treat --mperm as --perm with min_perms == max_perms, since this isn't
12452   // a proper max(T) test
12453   if (bigstack_alloc_ul(BITCT_TO_WORDCT(set_ct), perm_adapt_set_unstopped_ptr) ||
12454       bigstack_calloc_ui(set_ct, perm_2success_ct_ptr) ||
12455       bigstack_alloc_ui(set_ct, perm_attempt_ct_ptr) ||
12456       bigstack_alloc_ul(BITCT_TO_WORDCT(marker_ct), unstopped_markers_ptr)) {
12457     goto set_test_common_init_ret_NOMEM;
12458   }
12459   fill_all_bits(set_ct, *perm_adapt_set_unstopped_ptr);
12460   fill_all_bits(marker_ct, *unstopped_markers_ptr);
12461   while (0) {
12462   set_test_common_init_ret_NOMEM:
12463     retval = RET_NOMEM;
12464     break;
12465   }
12466  set_test_common_init_ret_1:
12467   *marker_ct_ptr = marker_ct;
12468   *set_ct_ptr = set_ct;
12469   *max_sigset_size_ptr = max_sigset_size;
12470   *chisq_threshold_ptr = chisq_threshold;
12471   return retval;
12472 }
12473 
void compute_set_scores(uintptr_t marker_ct, uintptr_t perm_vec_ct, uintptr_t set_ct, double* chisq_matrix, double* orig_set_scores, double* sorted_chisq_buf, uint32_t* sorted_marker_idx_buf, uint32_t* proxy_arr, uint32_t** setdefs, uint32_t** ld_map, Aperm_info* apip, double chisq_threshold, double adaptive_ci_zt, uint32_t first_adapt_check, uint32_t perms_done, uint32_t set_max, uintptr_t* perm_adapt_set_unstopped, uint32_t* perm_2success_ct, uint32_t* perm_attempt_ct) {
  // Scores the batch of perm_vec_ct permutations that just finished against
  // every set whose bit is still set in perm_adapt_set_unstopped.
  //
  // Row p of chisq_matrix (marker_ct doubles per row) holds the statistics
  // for permutation p of the batch.  perm_2success_ct[] is a doubled success
  // counter: a permuted score more than EPSILON above the original adds 2,
  // and one within EPSILON of the original adds 1.
  //
  // Each time the cumulative permutation count reaches an adaptive
  // checkpoint, a confidence interval around the empirical p-value is
  // compared with apip->alpha; if alpha lies outside it, the set's bit is
  // cleared and the checkpoint is stored in perm_attempt_ct[].
  uint32_t prior_perm_ct = perms_done - perm_vec_ct;
  uintptr_t set_idx;
  double score_ceil;
  double score_floor;
  double perm_score;
  double emp_p;
  double ci_halfwidth;
  uint32_t check_at;
  uint32_t success_2ct;
  uint32_t perm_idx;
  for (set_idx = 0; set_idx < set_ct; set_idx++) {
    if (!(IS_SET(perm_adapt_set_unstopped, set_idx))) {
      // this set already hit its stopping criterion
      continue;
    }
    check_at = first_adapt_check;
    success_2ct = perm_2success_ct[set_idx];
    score_ceil = orig_set_scores[set_idx] + EPSILON;
    score_floor = orig_set_scores[set_idx] - EPSILON;
    perm_idx = 0;
    while (perm_idx < perm_vec_ct) {
      set_test_score(marker_ct, chisq_threshold, set_max, &(chisq_matrix[perm_idx * marker_ct]), ld_map, setdefs[set_idx], sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr, nullptr, nullptr, &perm_score);
      if (perm_score > score_ceil) {
        success_2ct += 2;
      } else if (perm_score > score_floor) {
        success_2ct += 1;
      }
      perm_idx++;
      // check_at is a cumulative permutation count, so shift it into
      // within-batch coordinates before comparing
      if (perm_idx != check_at - prior_perm_ct) {
        continue;
      }
      if (success_2ct) {
        emp_p = ((double)((int32_t)success_2ct + 2)) / ((double)(2 * ((int32_t)check_at + 1)));
        ci_halfwidth = adaptive_ci_zt * sqrt(emp_p * (1 - emp_p) / ((int32_t)check_at));
        if ((emp_p - ci_halfwidth > apip->alpha) || (emp_p + ci_halfwidth < apip->alpha)) {
          CLEAR_BIT(set_idx, perm_adapt_set_unstopped);
          perm_attempt_ct[set_idx] = check_at;
          break;
        }
      }
      check_at += (int32_t)(apip->init_interval + ((int32_t)check_at) * apip->interval_slope);
    }
    perm_2success_ct[set_idx] = success_2ct;
  }
}
12516 
int32_t write_set_test_results(char* outname, char* outname_end2, Set_info* sip, uint32_t** ld_map, uint32_t** setdefs, uintptr_t* set_incl, uintptr_t set_ct, uintptr_t marker_ct_orig, uintptr_t marker_ct, uint32_t* marker_idx_to_uidx, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* perm_2success_ct, uint32_t* perm_attempt_ct, uint32_t mtest_adjust, uint32_t perm_count, double pfilter, double output_min_p, double chisq_threshold, double* orig_stats, double* sorted_chisq_buf, uint32_t* sorted_marker_idx_buf, uint32_t* proxy_arr) {
  // Writes the set-test report to outname, one line per set in sip.  Sets
  // flagged in set_incl are re-scored with set_test_score() and printed with
  // their empirical p-value (or, when perm_count is nonzero, the success
  // count and number of permutations); lines with p-value above pfilter are
  // omitted.  Sets not in set_incl get a fixed placeholder line ending in NA.
  // When set_ct and mtest_adjust are both nonzero, the empirical p-values
  // are also passed to multcomp() after *outname_end2 is null-terminated.
  //
  // Returns 0 on success, RET_NOMEM / RET_OPEN_FAIL / RET_WRITE_FAIL on the
  // corresponding failure, or multcomp()'s return value.
  //
  // Workspace allocated here is not released; the caller is expected to
  // reset the stack.
  FILE* report_file = nullptr;
  uintptr_t* nonempty_sets = nullptr;
  double* emp_pvals = nullptr;
  uintptr_t total_set_ct = sip->ct;
  uintptr_t set_name_stride = sip->max_name_len;
  uint32_t nonempty_ct = 0;
  int32_t retval = 0;
  // position among the sets flagged in set_incl (indexes setdefs and the
  // permutation counters)
  uintptr_t tested_idx = 0;
  // write position in emp_pvals
  uint32_t nonempty_idx = 0;
  uintptr_t set_uidx;
  char* write_iter;
  uint32_t* nonempty_idx_to_uidx;
  double set_score;
  double emp_p;
  uint32_t raw_sig_ct;
  uint32_t final_sig_ct;
  uint32_t success_2ct;
  uint32_t attempt_ct;
  uint32_t sig_idx;
  if (set_ct && mtest_adjust) {
    if (alloc_and_populate_nonempty_set_incl(sip, &nonempty_ct, &nonempty_sets) ||
        bigstack_alloc_d(nonempty_ct, &emp_pvals)) {
      goto write_set_test_results_ret_NOMEM;
    }
  }
  if (fopen_checked(outname, "w", &report_file)) {
    goto write_set_test_results_ret_OPEN_FAIL;
  }
  fprintf(report_file, "         SET   NSNP   NSIG   ISIG         EMP1 %sSNPS\n", perm_count? "          NP " : "");
  for (set_uidx = 0; set_uidx < total_set_ct; set_uidx++) {
    // leading columns shared by every row: set name and variant count
    write_iter = fw_strcpy(12, &(sip->names[set_uidx * set_name_stride]), g_textbuf);
    *write_iter++ = ' ';
    write_iter = uint32toa_w6x(setdef_size(sip->setdefs[set_uidx], marker_ct_orig), ' ', write_iter);

    if (!(IS_SET(set_incl, set_uidx))) {
      // set was not tested: emit the fixed placeholder columns
      if (perm_count) {
        write_iter = memcpya(write_iter, "     0      0            0            0 NA\n", 43);
      } else {
        write_iter = memcpya(write_iter, "     0      0            1 NA\n", 30);
      }
      if (fwrite_checked(g_textbuf, write_iter - g_textbuf, report_file)) {
        goto write_set_test_results_ret_WRITE_FAIL;
      }
      if (nonempty_sets && is_set(nonempty_sets, set_uidx)) {
        emp_pvals[nonempty_idx] = 1.0;
        nonempty_idx++;
      }
      continue;
    }

    // this call also fills proxy_arr, which is read below when listing
    // the variant IDs
    set_test_score(marker_ct, chisq_threshold, sip->set_max, orig_stats, ld_map, setdefs[tested_idx], sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr, &raw_sig_ct, &final_sig_ct, &set_score);
    write_iter = uint32toa_w6x(raw_sig_ct, ' ', write_iter);
    write_iter = uint32toa_w6x(final_sig_ct, ' ', write_iter);
    success_2ct = perm_2success_ct[tested_idx];
    attempt_ct = perm_attempt_ct[tested_idx];
    tested_idx++;
    emp_p = ((double)(success_2ct + 2)) / ((double)(2 * (attempt_ct + 1)));
    if (emp_pvals) {
      emp_pvals[nonempty_idx] = emp_p;
    }
    nonempty_idx++;
    if (!(emp_p <= pfilter)) {
      // filtered out of the main report (but still recorded above)
      continue;
    }
    if (perm_count) {
      // success counter is stored doubled
      write_iter = dtoa_g_wxp4(((double)success_2ct) * 0.5, 12, write_iter);
      write_iter = memseta(write_iter, 32, 3);
      write_iter = uint32toa_w10x(attempt_ct, ' ', write_iter);
    } else {
      write_iter = dtoa_g_wxp4x(MAXV(emp_p, output_min_p), 12, ' ', write_iter);
    }
    if (fwrite_checked(g_textbuf, write_iter - g_textbuf, report_file)) {
      goto write_set_test_results_ret_WRITE_FAIL;
    }
    // '|'-separated list of the variant IDs in proxy_arr[0..final_sig_ct)
    fputs(&(marker_ids[marker_idx_to_uidx[proxy_arr[0]] * max_marker_id_len]), report_file);
    sig_idx = 1;
    while (sig_idx < final_sig_ct) {
      putc_unlocked('|', report_file);
      fputs(&(marker_ids[marker_idx_to_uidx[proxy_arr[sig_idx]] * max_marker_id_len]), report_file);
      sig_idx++;
    }
    if (putc_checked('\n', report_file)) {
      goto write_set_test_results_ret_WRITE_FAIL;
    }
  }
  if (fclose_null(&report_file)) {
    goto write_set_test_results_ret_WRITE_FAIL;
  }
  LOGPRINTFWW("Set test results written to %s .\n", outname);
  if (emp_pvals) {
    if (bigstack_alloc_ui(nonempty_ct, &nonempty_idx_to_uidx)) {
      goto write_set_test_results_ret_NOMEM;
    }
    fill_idx_to_uidx_incl(nonempty_sets, total_set_ct, nonempty_ct, nonempty_idx_to_uidx);
    // .qassoc.set.adjusted instead of .set.mperm.adjusted, etc.
    *outname_end2 = '\0';
    retval = multcomp(outname, outname_end2, nonempty_idx_to_uidx, nonempty_ct, sip->names, set_name_stride, 0, nullptr, nullptr, pfilter, output_min_p, mtest_adjust, 1, 0.0, nullptr, emp_pvals);
  }
  while (0) {
  write_set_test_results_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  write_set_test_results_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  write_set_test_results_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
  }
  fclose_cond(report_file);
  return retval;
}
12624 
// Node of a singly linked list of loaded p-value records.  clump_reports()
// keeps an array of Clump_entry* (one slot per variant, initially all null)
// and adds new nodes to the front of the lists.
typedef struct clump_entry_struct {
  // p-value stored for this record
  double pval;
  // next node in the same list
  struct clump_entry_struct* next;
  // presumably the number of the --clump input file this record came from
  // -- TODO confirm against clump_reports()
  uint32_t fidx;
  // flexible array member, so nodes are variable-size; presumably holds the
  // annotation text saved for this record -- TODO confirm
  char annot[];
} Clump_entry;
12631 
// Fixed-size record used by clump_reports() (via cur_clump_base / cc_ptr).
// NOTE(review): the field descriptions below are inferred from the names
// only; confirm against the clump-assembly code in clump_reports().
typedef struct cur_clump_info_struct {
  // presumably the r^2 between this variant and the clump's index variant
  double r2;
  // presumably the filtered (post-exclusion) index of the variant
  uint32_t marker_idx;
  // presumably the input-file number of the matching p-value record
  uint32_t fidx;
} Cur_clump_info;
12637 
// Singly linked list node for a variant ID that was read from a --clump
// input file but is not present in the current fileset.  clump_reports()
// malloc()s each node with room for the ID plus a terminating byte, and
// pushes it onto the front of its not_found_list.
typedef struct clump_missing_id_struct {
  // p-value read alongside the missing variant ID
  double pval;
  // previously recorded missing-ID node (null at the end of the list)
  struct clump_missing_id_struct* next;
  // flexible array member holding the variant ID
  char idstr[];
} Clump_missing_id;
12643 
void update_clump_histo(double pval, uintptr_t* histo) {
  // Increments the one histogram bin that pval falls into.
  //   histo[4]: pval < 0.0001
  //   histo[3]: 0.0001 <= pval < 0.001
  //   histo[2]: 0.001 <= pval < 0.01
  //   histo[1]: 0.01 <= pval < 0.05
  //   histo[0]: everything else
  uint32_t bin_idx = 0;
  if (pval < 0.0001) {
    bin_idx = 4;
  } else if (pval < 0.001) {
    bin_idx = 3;
  } else if (pval < 0.01) {
    bin_idx = 2;
  } else if (pval < 0.05) {
    bin_idx = 1;
  }
  histo[bin_idx] += 1;
}
12659 
clump_reports(FILE * bedfile,uintptr_t bed_offset,char * outname,char * outname_end,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude,uintptr_t marker_ct,char * marker_ids,uintptr_t max_marker_id_len,uint32_t plink_maxsnp,uint32_t * marker_pos,char ** marker_allele_ptrs,uintptr_t * marker_reverse,Chrom_info * chrom_info_ptr,uintptr_t unfiltered_sample_ct,uintptr_t * founder_info,Clump_info * clump_ip,uintptr_t * sex_male,uint32_t hh_exists)12660 int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, Clump_info* clump_ip, uintptr_t* sex_male, uint32_t hh_exists) {
12661   unsigned char* bigstack_mark = g_bigstack_base;
12662   unsigned char* bigstack_end_mark = g_bigstack_end;
12663   gzFile gz_infile = nullptr;
12664   FILE* outfile = nullptr;
12665   FILE* outfile_ranges = nullptr;
12666   FILE* outfile_best = nullptr;
12667   uintptr_t marker_ctl = BITCT_TO_WORDCT(marker_ct);
12668   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
12669   uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
12670   uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
12671   uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctl);
12672   uintptr_t founder_ctl2 = QUATERCT_TO_WORDCT(founder_ct);
12673   uintptr_t founder_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(founder_ct);
12674   uintptr_t final_mask = get_final_mask(founder_ct);
12675   uintptr_t range_group_ct = 0;
12676   uintptr_t max_range_group_id_len = 0;
12677   uintptr_t max_header_len = 2;
12678   uintptr_t snpfield_search_ct = 1;
12679   uintptr_t pfield_search_ct = 1;
12680   uintptr_t annot_ct = 0;
12681   uintptr_t missing_variant_ct = 0;
12682   uintptr_t cur_rg_ct = 0;
12683   uintptr_t range_chrom_max = 0;
12684   uintptr_t unmatched_group_ct = 0;
12685   uintptr_t* haploid_mask = chrom_info_ptr->haploid_mask;
12686   char* range_group_names = nullptr;
12687   char* fname_ptr = nullptr;
12688   char* annot_flattened = clump_ip->annotate_flattened;
12689   char* tbuf2 = &(g_textbuf[MAXLINELEN]);
12690   char* header2_ptr = nullptr;
12691   char* annot_ptr = nullptr;
12692   char* cur_rg_names = nullptr;
12693   uintptr_t* founder_include2 = nullptr;
12694   uintptr_t* founder_male_include2 = nullptr;
12695   uintptr_t* rg_chrom_bounds = nullptr;
12696   uint32_t** rg_setdefs = nullptr;
12697   uint32_t** cur_rg_setdefs = nullptr;
12698   Clump_missing_id* not_found_list = nullptr;
12699   uintptr_t* rangematch_bitfield = nullptr;
12700   double p1_thresh = clump_ip->p1;
12701   double p2_thresh = clump_ip->p2;
12702   double load_pthresh = 0.05;
12703   double r2_thresh = clump_ip->r2;
12704   uint32_t allow_overlap = clump_ip->modifier & CLUMP_ALLOW_OVERLAP;
12705   uint32_t clump_index_first = clump_ip->modifier & CLUMP_INDEX_FIRST;
12706   uint32_t clump_best = clump_ip->modifier & CLUMP_BEST;
12707   uint32_t clump_verbose = clump_ip->modifier & CLUMP_VERBOSE;
12708   uint32_t bp_radius = clump_ip->bp_radius;
12709   uint32_t best_fidx_match = 0xffffffffU;
12710   uint32_t require_multifile = clump_ip->modifier & CLUMP_REPLICATE;
12711   uint32_t index_eligible = 1;
12712   uint32_t header1_len = 0;
12713   uint32_t header2_len = 0;
12714   uint32_t file_ct = 0;
12715   uint32_t final_clump_ct = 0;
12716   uint32_t max_missing_id_len = 0;
12717   int32_t retval = 0;
12718   uintptr_t histo[5]; // NSIG, S05, S01, S001, S0001
12719   uint32_t index_tots[5];
12720   uint32_t counts[18];
12721   Clump_entry** clump_entries;
12722   Clump_entry* clump_entry_ptr;
12723   Clump_entry* best_entry_ptr;
12724   Cur_clump_info* cur_clump_base;
12725   Cur_clump_info* cc_ptr;
12726   uintptr_t* col_bitfield;
12727   uintptr_t* cur_bitfield;
12728   uintptr_t* loadbuf_raw;
12729   uintptr_t* index_data;
12730   uintptr_t* window_data;
12731   uintptr_t* window_data_ptr;
12732   char* sorted_missing_variant_ids;
12733   char* sorted_header_dict;
12734   char* loadbuft; // t is for text
12735   char* cur_a1;
12736   char* cur_a2;
12737   char* bufptr;
12738   char* bufptr2;
12739   char* bufptr3;
12740   char* bufptr4;
12741   uint32_t* header_id_map;
12742   uint32_t* marker_id_htable;
12743   uint32_t* parse_table;
12744   uint32_t* cur_parse_info;
12745   uint32_t* nsig_arr;
12746   uint32_t* pval_map;
12747   uint32_t* marker_uidx_to_idx;
12748   uint32_t* marker_idx_to_uidx;
12749   double* sorted_pvals;
12750   Clump_missing_id* cm_ptr;
12751   uintptr_t header_dict_ct;
12752   uintptr_t extra_annot_space;
12753   uintptr_t cur_bigstack_left;
12754   uintptr_t loadbuft_size;
12755   uintptr_t marker_idx;
12756   uintptr_t last_marker_idx;
12757   uintptr_t max_window_size; // universal bound
12758   uintptr_t cur_window_size;
12759   uintptr_t line_idx;
12760   uintptr_t ulii;
12761   uintptr_t uljj;
12762   uintptr_t ulkk;
12763   uintptr_t ulmm;
12764   double pval;
12765   double freq1x;
12766   double freq2x;
12767   double freqx1;
12768   double freqx2;
12769   double freq11;
12770   double freq11_expected;
12771   double cur_r2;
12772   double max_r2;
12773   double dxx;
12774   uint32_t marker_id_htable_size;
12775   uint32_t annot_ct_p2;
12776   uint32_t annot_ct_p2_ctl;
12777   uint32_t cur_read_ct;
12778   uint32_t index_ct;
12779   uint32_t sp_idx;
12780   uint32_t file_idx;
12781   uint32_t ivar_idx;
12782   uint32_t ivar_uidx;
12783   uint32_t cur_bp;
12784   uint32_t min_bp;
12785   uint32_t max_bp;
12786   uint32_t clump_chrom_idx;
12787   uint32_t clump_uidx_first;
12788   uint32_t clump_uidx_last;
12789   uint32_t index_fidx;
12790   uint32_t marker_uidx;
12791   uint32_t max_r2_uidx;
12792   uint32_t is_haploid;
12793   uint32_t is_x;
12794   uint32_t is_y;
12795   uint32_t a1_len;
12796   uint32_t a2_len;
12797   uint32_t allele_padding;
12798   uint32_t uii;
12799   uint32_t ujj;
12800   uint32_t ukk;
12801   uint32_t umm;
12802   int32_t ii;
12803   // suppress warning
12804   index_tots[3] = 0;
12805   index_tots[4] = 0;
12806 
12807   if (annot_flattened && (!clump_verbose) && (!clump_best)) {
12808     logerrprint("Error: --clump-annotate must be used with --clump-verbose or --clump-best.\n");
12809     goto clump_reports_ret_INVALID_CMDLINE;
12810   }
12811   if (!founder_ct) {
12812     logerrprint("Warning: Skipping --clump since there are no founders.  (--make-founders may\ncome in handy here.)\n");
12813     goto clump_reports_ret_1;
12814   }
12815   if (clump_best) {
12816     load_pthresh = 1.0;
12817   } else {
12818     if (p2_thresh > load_pthresh) {
12819       load_pthresh = p2_thresh;
12820     }
12821     if (p1_thresh >= load_pthresh) {
12822       // may as well maximize backwards compatibility re: which comparisons are
12823       // > vs. >=
12824       load_pthresh = p1_thresh * (1 + SMALL_EPSILON);
12825     }
12826   }
12827   if (clump_ip->range_fname) {
12828     // 1. load range file, sort, etc.
12829     retval = load_range_list_sortpos(clump_ip->range_fname, clump_ip->range_border, 0, nullptr, 0, chrom_info_ptr, &range_group_ct, &range_group_names, &max_range_group_id_len, &rg_chrom_bounds, &rg_setdefs, &range_chrom_max, "--clump-range");
12830     if (retval) {
12831       goto clump_reports_ret_1;
12832     }
12833   }
12834   // 2. create marker ID hash table, allocate index-tracking bitfield
12835   retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable_size, &marker_id_htable);
12836   if (retval) {
12837     goto clump_reports_ret_1;
12838   }
12839   if (bigstack_calloc_ul(marker_ctl, &cur_bitfield) ||
12840       bigstack_alloc_ui(unfiltered_marker_ct, &marker_uidx_to_idx)) {
12841     goto clump_reports_ret_NOMEM;
12842   }
12843   fill_uidx_to_idx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_uidx_to_idx);
12844   if (clump_ip->snpfield_search_order) {
12845     snpfield_search_ct = count_and_measure_multistr(clump_ip->snpfield_search_order, &max_header_len);
12846   } else {
12847     max_header_len = 4; // 'SNP' + null terminator
12848   }
12849   if (clump_ip->pfield_search_order) {
12850     pfield_search_ct = count_and_measure_multistr(clump_ip->pfield_search_order, &max_header_len);
12851   }
12852   if (annot_flattened) {
12853     annot_ct = count_and_measure_multistr(annot_flattened, &max_header_len);
12854   }
12855   header_dict_ct = snpfield_search_ct + pfield_search_ct + annot_ct;
12856   // parse_table[2k + 1] stores the number of additional fields to skip before
12857   // reading that particular entry.  For example, if variant IDs are in the
12858   // second column in the current file, while p-values are in the fifth column,
12859   // parse_table[1] is 1 and parse_table[3] = 2.
12860   // parse_table[2k] stores the type of field contents (0 = variant ID, 1 =
12861   // P-value, 2 or more = annotation).
  // In the main loop, cur_parse_info[2k] stores the in-loadbuft offset of the
  // string with that parse_table[2k] index, and cur_parse_info[2k + 1]
12864   // stores string length.
12865   annot_ct_p2 = 2 + annot_ct;
12866   annot_ct_p2_ctl = (annot_ct + (BITCT + 1)) / BITCT;
12867   if (bigstack_alloc_c(max_header_len * header_dict_ct, &sorted_header_dict) ||
12868       bigstack_alloc_ui(header_dict_ct, &header_id_map) ||
12869       bigstack_alloc_ul(annot_ct_p2_ctl, &col_bitfield) ||
12870       bigstack_alloc_ui(annot_ct_p2 * 2, &parse_table) ||
12871       bigstack_alloc_ui(annot_ct_p2 * 2, &cur_parse_info)) {
12872     goto clump_reports_ret_NOMEM;
12873   }
12874   ulii = 0; // write position
12875   if (clump_ip->snpfield_search_order) {
12876     bufptr = clump_ip->snpfield_search_order;
12877     uii = 0x40000000;
12878     do {
12879       ujj = strlen(bufptr) + 1;
12880       memcpy(&(sorted_header_dict[ulii * max_header_len]), bufptr, ujj);
12881       header_id_map[ulii++] = uii++;
12882       bufptr = &(bufptr[ujj]);
12883     } while (*bufptr);
12884   } else {
12885     memcpy(sorted_header_dict, "SNP", 4);
12886     header_id_map[0] = 0x40000000;
12887     ulii++;
12888   }
12889   if (clump_ip->pfield_search_order) {
12890     bufptr = clump_ip->pfield_search_order;
12891     uii = 0x20000000;
12892     do {
12893       ujj = strlen(bufptr) + 1;
12894       memcpy(&(sorted_header_dict[ulii * max_header_len]), bufptr, ujj);
12895       header_id_map[ulii++] = uii++;
12896       bufptr = &(bufptr[ujj]);
12897     } while (*bufptr);
12898   } else {
12899     memcpy(&(sorted_header_dict[ulii * max_header_len]), "P", 2);
12900     header_id_map[ulii++] = 0x20000000;
12901   }
12902   if (annot_flattened) {
12903     bufptr = annot_flattened;
12904     uii = 2;
12905     do {
12906       ujj = strlen(bufptr) + 1;
12907       memcpy(&(sorted_header_dict[ulii * max_header_len]), bufptr, ujj);
12908       header_id_map[ulii++] = uii++;
12909       bufptr = &(bufptr[ujj]);
12910     } while (*bufptr);
12911   }
12912   if (qsort_ext(sorted_header_dict, header_dict_ct, max_header_len, strcmp_deref, (char*)header_id_map, sizeof(int32_t))) {
12913     goto clump_reports_ret_NOMEM;
12914   }
12915   if (scan_for_duplicate_ids(sorted_header_dict, header_dict_ct, max_header_len)) {
12916     logerrprint("Error: Duplicate --clump-snp-field/--clump-field/--clump-annotate field name.\n");
12917     goto clump_reports_ret_INVALID_CMDLINE;
12918   }
12919 
12920   if (bigstack_calloc_ui(marker_ct, &nsig_arr)) {
12921     goto clump_reports_ret_NOMEM;
12922   }
12923   clump_entries = (Clump_entry**)bigstack_alloc(marker_ct * sizeof(intptr_t));
12924   if (!clump_entries) {
12925     goto clump_reports_ret_NOMEM;
12926   }
12927   fill_ulong_zero(marker_ct, (uintptr_t*)clump_entries);
12928   // 3. load file(s) in sequence.  start with array of null pointers, allocate
12929   //    from bottom of stack (possibly need to save p-val, file number,
12930   //    annotations, and/or pointer to next entry) while updating
12931   //    p-val/reverse-lookup array
12932   bufptr = clump_ip->fnames_flattened;
12933   do {
12934     fname_ptr = bufptr;
12935     bufptr = strchr(bufptr, '\0');
12936     bufptr++;
12937     file_ct++;
12938   } while (*bufptr);
12939   loadbuft = (char*)g_bigstack_base;
12940   if (clump_best) {
12941     if (file_ct == 2) {
12942       if (!clump_index_first) {
12943         logerrprint("Error: --clump-best can no longer be used with two --clump files unless\n--clump-index-first is also specified.  (Contact the developers if this is\nproblematic.)\n");
12944         goto clump_reports_ret_INVALID_CMDLINE;
12945       }
12946     } else if (file_ct > 2) {
12947       logerrprint("Error: --clump-best can no longer be used with more than two --clump files.\n(Contact the developers if this is problematic.)\n");
12948       goto clump_reports_ret_INVALID_CMDLINE;
12949     }
12950     // only draw proxies from this file
12951     best_fidx_match = file_ct;
12952   }
12953   // Suppose the current line has a super-long allele code which must be saved
12954   // (since it will go into the ANNOT field).  Then the new allocation may need
12955   // to be the size of the entire line.  So, to be safe, we require the current
12956   // line to fit in ~half of available workspace.
12957   // To reduce the risk of 32-bit integer overflow bugs, we cap line length at
12958   // a bit under 2^30 instead of 2^31 here.
12959   extra_annot_space = (48 + 2 * annot_ct) & (~(15 * ONELU));
12960   cur_bigstack_left = bigstack_left();
12961   if (cur_bigstack_left <= 2 * MAXLINELEN + extra_annot_space) {
12962     goto clump_reports_ret_NOMEM;
12963   } else if (cur_bigstack_left - extra_annot_space >= MAXLINEBUFLEN) {
12964     loadbuft[(MAXLINEBUFLEN / 2) - 1] = ' ';
12965   }
12966   if (clump_index_first && (file_ct > 1)) {
12967     index_eligible = 0;
12968   }
12969   // load in reverse order since we're adding to the front of the linked lists
12970   for (file_idx = file_ct; file_idx; file_idx--) {
12971     retval = gzopen_read_checked(fname_ptr, &gz_infile);
12972     if (retval) {
12973       goto clump_reports_ret_1;
12974     }
12975     loadbuft_size = bigstack_left();
12976     if (loadbuft_size <= 2 * MAXLINELEN + extra_annot_space) {
12977       goto clump_reports_ret_NOMEM;
12978     }
12979     loadbuft_size = (loadbuft_size - extra_annot_space) / 2;
12980     if (loadbuft_size >= MAXLINEBUFLEN / 2) {
12981       loadbuft_size = MAXLINEBUFLEN / 2;
12982       // no space-termination needed
12983     } else {
12984       loadbuft[loadbuft_size - 1] = ' ';
12985     }
12986     ukk = 0x7fffffff; // highest-precedence variant ID header seen so far
12987     umm = 0x7fffffff; // highest-precedence p-value header seen so far
12988     // load_to_first_token() with potentially gzipped input.  Move this to
12989     // plink_common if anything else needs it.
12990     line_idx = 0;
12991     while (1) {
12992       line_idx++;
12993       if (!gzgets(gz_infile, loadbuft, loadbuft_size)) {
12994 	if (!gzeof(gz_infile)) {
12995 	  goto clump_reports_ret_READ_FAIL;
12996 	} else {
12997           LOGPREPRINTFWW("Error: Empty %s.\n", fname_ptr);
12998 	  goto clump_reports_ret_INVALID_FORMAT_2;
12999 	}
13000       }
13001       if (!(loadbuft[loadbuft_size - 1])) {
13002 	if (loadbuft_size == MAXLINEBUFLEN / 2) {
13003 	  LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname_ptr);
13004 	  goto clump_reports_ret_INVALID_FORMAT_2;
13005 	} else {
13006 	  goto clump_reports_ret_NOMEM;
13007 	}
13008       }
13009       bufptr = skip_initial_spaces(loadbuft);
13010       if (!is_eoln_kns(*bufptr)) {
13011 	break;
13012       }
13013     }
13014     fill_ulong_zero(annot_ct_p2_ctl, col_bitfield);
13015     uii = 0; // current 0-based column number
13016     // We don't know in advance when the highest-precedence SNP/p-val columns
13017     // will appear, so we initially populate parse_table with
13018     //   [2k]: header type index (0 = variant ID, 1 = p-val, 2+ = annot)
13019     //   [2k + 1]: 0-based column number
13020     // and then sort at the end.
13021     cur_read_ct = 2;
13022     parse_table[0] = 0;
13023     parse_table[2] = 1;
13024     do {
13025       bufptr2 = token_endnn(bufptr);
13026       ii = bsearch_str(bufptr, (uintptr_t)(bufptr2 - bufptr), sorted_header_dict, max_header_len, header_dict_ct);
13027       if (ii != -1) {
13028 	ujj = header_id_map[(uint32_t)ii];
13029         if (ujj >= 0x40000000) {
13030           if (ujj < ukk) {
13031 	    // ignore title if higher-precedence title already seen
13032 	    set_bit(0, col_bitfield);
13033 	    ukk = ujj;
13034 	    parse_table[1] = uii; // temporary storage
13035 	  } else if (ujj == ukk) {
13036 	    goto clump_reports_ret_DUPLICATE_HEADER_COL;
13037 	  }
13038 	} else if (ujj >= 0x20000000) {
13039 	  if (ujj < umm) {
13040 	    set_bit(1, col_bitfield);
13041             umm = ujj;
13042 	    parse_table[3] = uii;
13043 	  } else if (ujj == umm) {
13044 	    goto clump_reports_ret_DUPLICATE_HEADER_COL;
13045 	  }
13046 	} else {
13047 	  if (is_set(col_bitfield, ujj)) {
13048 	    goto clump_reports_ret_DUPLICATE_HEADER_COL;
13049 	  }
13050 	  set_bit(ujj, col_bitfield);
13051           parse_table[cur_read_ct * 2 + 1] = uii;
13052 	  parse_table[cur_read_ct * 2] = ujj;
13053 	  cur_read_ct++;
13054 	}
13055       }
13056       bufptr = skip_initial_spaces(bufptr2);
13057       uii++;
13058     } while (!is_eoln_kns(*bufptr));
13059     if (!is_set(col_bitfield, 0)) {
13060       LOGPREPRINTFWW("Error: No variant ID field found in %s.\n", fname_ptr);
13061       goto clump_reports_ret_INVALID_FORMAT_2;
13062     } else if (!is_set(col_bitfield, 1)) {
13063       LOGPREPRINTFWW("Error: No p-value field found in %s.\n", fname_ptr);
13064       goto clump_reports_ret_INVALID_FORMAT_2;
13065     }
13066 #ifdef __cplusplus
13067     std::sort((int64_t*)parse_table, (int64_t*)(&(parse_table[cur_read_ct * 2])));
13068 #else
13069     qsort((int64_t*)parse_table, cur_read_ct, sizeof(int64_t), llcmp);
13070 #endif
13071     for (uii = cur_read_ct - 1; uii; uii--) {
13072       parse_table[uii * 2 + 1] -= parse_table[uii * 2 - 1] + 1;
13073     }
13074   clump_reports_load_loop:
13075     while (1) {
13076       line_idx++;
13077       if (!gzgets(gz_infile, loadbuft, loadbuft_size)) {
13078 	if (!gzeof(gz_infile)) {
13079 	  goto clump_reports_ret_READ_FAIL;
13080 	}
13081 	break;
13082       }
13083       if (!loadbuft[loadbuft_size - 1]) {
13084 	if (loadbuft_size == MAXLINEBUFLEN / 2) {
13085 	  LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname_ptr);
13086 	  goto clump_reports_ret_INVALID_FORMAT_2;
13087 	}
13088 	goto clump_reports_ret_NOMEM;
13089       }
13090       bufptr = skip_initial_spaces(loadbuft);
13091       if (is_eoln_kns(*bufptr)) {
13092 	continue;
13093       }
13094       fill_uint_zero(annot_ct_p2 * 2, cur_parse_info);
13095       uii = 0;
13096       ukk = annot_ct * 2; // annotation string length
13097       for (; uii < cur_read_ct; uii++) {
13098 	bufptr = next_token_multz(bufptr, parse_table[uii * 2 + 1]);
13099         if (no_more_tokens_kns(bufptr)) {
13100 	  // PLINK 1.07 --clump just skips the line in this situation, instead
13101 	  // of erroring out, so we replicate that
13102 	  goto clump_reports_load_loop;
13103 	}
13104 	bufptr2 = token_endnn(bufptr);
13105 	ujj = parse_table[uii * 2] * 2;
13106 	cur_parse_info[ujj] = (uintptr_t)(bufptr - loadbuft);
13107         cur_parse_info[ujj + 1] = (uintptr_t)(bufptr2 - bufptr);
13108 	if (ujj > 2) {
13109 	  ukk += cur_parse_info[ujj + 1];
13110 	}
13111 	bufptr = skip_initial_spaces(bufptr2);
13112       }
13113       if (scan_double(&(loadbuft[cur_parse_info[2]]), &pval)) {
13114 	continue;
13115       }
13116       if (pval < 0.0) {
13117 	LOGPREPRINTFWW("Error: Negative p-value on line %" PRIuPTR " of %s.\n", line_idx, fname_ptr);
13118 	goto clump_reports_ret_INVALID_FORMAT_2;
13119       }
13120       marker_uidx = id_htable_find(&(loadbuft[cur_parse_info[0]]), cur_parse_info[1], marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len);
13121       if (marker_uidx == 0xffffffffU) {
13122 	// variant ID not in current fileset
13123 	if ((pval <= p1_thresh) && index_eligible) {
13124 	  // actually a top variant, track it
13125 	  missing_variant_ct++;
13126 	  // screw it, just allocate these outside the workspace
13127 	  uii = cur_parse_info[1];
13128 	  if (uii >= max_missing_id_len) {
13129 	    max_missing_id_len = uii + 1;
13130 	  }
13131 	  cm_ptr = (Clump_missing_id*)malloc(offsetof(Clump_missing_id, idstr) + uii + 1);
13132 	  cm_ptr->pval = pval;
13133 	  cm_ptr->next = not_found_list;
13134 	  not_found_list = cm_ptr;
13135 	  memcpyx(cm_ptr->idstr, &(loadbuft[cur_parse_info[0]]), uii, '\0');
13136 	}
13137 	continue;
13138       }
13139       marker_idx = marker_uidx_to_idx[marker_uidx];
13140       if (pval > load_pthresh) {
13141 	if (pval >= 0.05) {
13142 	  if (pval > 1) {
13143 	    LOGPREPRINTFWW("Error: p-value > 1 on line %" PRIuPTR " of %s.\n", line_idx, fname_ptr);
13144 	    goto clump_reports_ret_INVALID_FORMAT_2;
13145 	  }
13146 	  nsig_arr[marker_idx] += 1;
13147 	}
13148 	continue;
13149       }
13150       clump_entry_ptr = (Clump_entry*)bigstack_end_alloc(offsetof(Clump_entry, annot) + ukk - 1);
13151       if (!clump_entry_ptr) {
13152 	goto clump_reports_ret_NOMEM;
13153       }
13154       clump_entry_ptr->pval = pval;
13155       clump_entry_ptr->next = clump_entries[marker_idx];
13156       clump_entry_ptr->fidx = file_idx;
13157       if (annot_ct) {
13158 	bufptr = clump_entry_ptr->annot;
13159 	uii = 2;
13160 	while (1) {
13161           bufptr = memcpya(bufptr, &(loadbuft[cur_parse_info[uii * 2]]), cur_parse_info[uii * 2 + 1]);
13162 	  if (++uii == annot_ct_p2) {
13163 	    break;
13164 	  }
13165 	  bufptr = memcpya(bufptr, ", ", 2);
13166 	}
13167 	*bufptr = '\0';
13168       }
13169       clump_entries[marker_idx] = clump_entry_ptr;
13170       if ((pval <= p1_thresh) && index_eligible) {
13171 	set_bit(marker_idx, cur_bitfield);
13172       }
13173       loadbuft_size = bigstack_left();
13174       if (loadbuft_size <= 2 * MAXLINELEN + extra_annot_space) {
13175 	goto clump_reports_ret_NOMEM;
13176       }
13177       loadbuft_size = (loadbuft_size - extra_annot_space) / 2;
13178       if (loadbuft_size >= MAXLINEBUFLEN / 2) {
13179 	loadbuft_size = MAXLINEBUFLEN / 2;
13180 	// no space-termination needed
13181       } else {
13182 	loadbuft[loadbuft_size - 1] = ' ';
13183       }
13184     }
13185     if (gzclose_null(&gz_infile)) {
13186       goto clump_reports_ret_READ_FAIL;
13187     }
13188     if (file_idx > 1) {
13189       fname_ptr = &(fname_ptr[-3]);
13190       while (*fname_ptr) {
13191 	fname_ptr--;
13192       }
13193       fname_ptr++;
13194       if (clump_index_first && (file_idx == 2)) {
13195 	index_eligible = 1;
13196       }
13197     }
13198   }
13199   // 4. sort p-val array, greedily form clumps
13200   index_ct = popcount_longs(cur_bitfield, marker_ctl);
13201   if (!index_ct) {
13202     logerrprint("Warning: No significant --clump results.  Skipping.\n");
13203     goto clump_reports_ret_1;
13204   }
13205   if (bigstack_alloc_d(index_ct, &sorted_pvals) ||
13206       bigstack_alloc_ui(index_ct, &pval_map)) {
13207     goto clump_reports_ret_NOMEM;
13208   }
13209   marker_idx = 0;
13210   for (uii = 0; uii < index_ct; uii++, marker_idx++) {
13211     marker_idx = next_set_unsafe(cur_bitfield, marker_idx);
13212     clump_entry_ptr = clump_entries[marker_idx];
13213     pval = clump_entry_ptr->pval;
13214     if (!clump_index_first) {
13215       while (clump_entry_ptr->next) {
13216 	clump_entry_ptr = clump_entry_ptr->next;
13217 	if (clump_entry_ptr->pval < pval) {
13218 	  pval = clump_entry_ptr->pval;
13219 	}
13220       }
13221     }
13222     sorted_pvals[uii] = pval;
13223     pval_map[uii] = marker_idx;
13224   }
13225   if (qsort_ext((char*)sorted_pvals, index_ct, sizeof(double), double_cmp_deref, (char*)pval_map, sizeof(int32_t))) {
13226     goto clump_reports_ret_NOMEM;
13227   }
13228   if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx) ||
13229       bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
13230       bigstack_alloc_ul(5 * founder_ctv2, &index_data)) {
13231     goto clump_reports_ret_NOMEM;
13232   }
13233   for (uii = 1; uii <= 5; uii++) {
13234     index_data[uii * founder_ctv2 - 2] = 0;
13235     index_data[uii * founder_ctv2 - 1] = 0;
13236   }
13237   if (alloc_collapsed_haploid_filters(founder_info, sex_male, unfiltered_sample_ct, founder_ct, Y_FIX_NEEDED, 1, &founder_include2, &founder_male_include2)) {
13238     goto clump_reports_ret_NOMEM;
13239  }
13240   if (clump_verbose && rg_setdefs) {
13241     if (bigstack_alloc_ul(BITCT_TO_WORDCT(range_chrom_max), &rangematch_bitfield)) {
13242       goto clump_reports_ret_NOMEM;
13243     }
13244   }
13245   window_data = (uintptr_t*)g_bigstack_base;
13246   max_window_size = bigstack_left() / (founder_ctv2 * sizeof(intptr_t) + sizeof(Cur_clump_info));
13247   if (!max_window_size) {
13248     goto clump_reports_ret_NOMEM;
13249   }
13250   fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
13251   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
13252   // now this indicates whether a variant has previously been in a clump
13253   fill_ulong_zero(marker_ctl, cur_bitfield);
13254   // 5. iterate through clumps, calculate r^2 and write output
13255   memcpy(outname_end, ".clumped", 9);
13256   if (fopen_checked(outname, "w", &outfile)) {
13257     goto clump_reports_ret_OPEN_FAIL;
13258   }
13259   bufptr = tbuf2;
13260   if (clump_verbose) {
13261     *bufptr++ = '\n';
13262   }
13263   bufptr = memcpya(bufptr, " CHR    F ", 10);
13264   bufptr = fw_strcpyn(plink_maxsnp, 3, "SNP", bufptr);
13265   // replicate the misaligned non-verbose header for now
13266   bufptr = memcpya(bufptr, "         BP          ", clump_verbose? 21 : 19);
13267   bufptr = strcpya(bufptr, "P    TOTAL   NSIG    S05    S01   S001  S0001");
13268   if (!clump_verbose) {
13269     bufptr = memcpya(bufptr, "    SP2\n", 8);
13270     if (fwrite_checked(tbuf2, bufptr - tbuf2, outfile)) {
13271       goto clump_reports_ret_WRITE_FAIL;
13272     }
13273     if (rg_setdefs) {
13274       memcpy(&(outname_end[8]), ".ranges", 8);
13275       if (fopen_checked(outname, "w", &outfile_ranges)) {
13276 	goto clump_reports_ret_OPEN_FAIL;
13277       }
13278       bufptr = fw_strcpyn(plink_maxsnp, 3, "SNP", &(tbuf2[5]));
13279       bufptr = strcpya(bufptr, "          P      N                          POS         KB RANGES\n");
13280       if (fwrite_checked(tbuf2, bufptr - tbuf2, outfile_ranges)) {
13281 	goto clump_reports_ret_WRITE_FAIL;
13282       }
13283     }
13284   } else {
13285     *bufptr++ = '\n';
13286     header2_ptr = bufptr;
13287     header1_len = (uintptr_t)(header2_ptr - tbuf2);
13288     *bufptr++ = '\n';
13289     bufptr = memseta(bufptr, 32, 19 + plink_maxsnp);
13290     bufptr = strcpya(bufptr, "KB      RSQ  ALLELES    F            P ");
13291     if (annot_flattened) {
13292       bufptr = memcpya(bufptr, "       ANNOT", 12);
13293     }
13294     bufptr = memcpya(bufptr, "\n  (INDEX) ", 11);
13295     header2_len = (uintptr_t)(bufptr - header2_ptr);
13296   }
13297   if (clump_best) {
13298     memcpy(&(outname_end[8]), ".best", 6);
13299     if (fopen_checked(outname, "w", &outfile_best)) {
13300       goto clump_reports_ret_OPEN_FAIL;
13301     }
13302     bufptr = fw_strcpyn(plink_maxsnp, 5, "INDEX", g_textbuf);
13303     *bufptr++ = ' ';
13304     bufptr = fw_strcpyn(plink_maxsnp, 4, "PSNP", bufptr);
13305     bufptr = strcpya(bufptr, "    RSQ       KB        P  ALLELES        F\n");
13306     if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile_best)) {
13307       goto clump_reports_ret_WRITE_FAIL;
13308     }
13309   }
13310   for (sp_idx = 0; sp_idx < index_ct; sp_idx++) {
13311     ivar_idx = pval_map[sp_idx];
13312     if ((!clump_best) && is_set(cur_bitfield, ivar_idx)) {
13313       continue;
13314     }
13315     ivar_uidx = marker_idx_to_uidx[ivar_idx];
13316     cur_bp = marker_pos[ivar_uidx];
13317     uii = get_variant_chrom_fo_idx(chrom_info_ptr, ivar_uidx);
13318     clump_chrom_idx = chrom_info_ptr->chrom_file_order[uii];
13319     ujj = chrom_info_ptr->chrom_fo_vidx_start[uii];
13320     if (cur_bp < bp_radius) {
13321       clump_uidx_first = ujj;
13322     } else {
13323       clump_uidx_first = ujj + uint32arr_greater_than(&(marker_pos[ujj]), ivar_uidx + 1 - ujj, cur_bp - bp_radius);
13324     }
13325     next_unset_unsafe_ck(marker_exclude, &clump_uidx_first);
13326     clump_uidx_last = ivar_uidx + uint32arr_greater_than(&(marker_pos[ivar_uidx]), chrom_info_ptr->chrom_fo_vidx_start[uii + 1] - ivar_uidx, cur_bp + bp_radius + 1);
13327     prev_unset_unsafe_ck(marker_exclude, &clump_uidx_last);
13328     marker_uidx = clump_uidx_first;
13329     marker_idx = ivar_idx + popcount_bit_idx(marker_exclude, clump_uidx_first, ivar_uidx) + clump_uidx_first - ivar_uidx;
13330     // Don't want to seek backwards in the file any more than necessary, so
13331     // 1. load all clump-inclusion candidates before index variant
13332     // 2. load index variant, compute pairwise r^2s
13333     // 3. load one clump-inclusion at a time after index variant, compute r^2
13334     // 4. write main result
13335     cur_window_size = 0;
13336     is_haploid = is_set(haploid_mask, clump_chrom_idx);
13337     is_x = (clump_chrom_idx == (uint32_t)chrom_info_ptr->xymt_codes[X_OFFSET]);
13338     is_y = (clump_chrom_idx == (uint32_t)chrom_info_ptr->xymt_codes[Y_OFFSET]);
13339     window_data_ptr = window_data;
13340     for (; marker_idx < ivar_idx; marker_uidx++, marker_idx++) {
13341       next_unset_unsafe_ck(marker_exclude, &marker_uidx);
13342       if (((!allow_overlap) && is_set(cur_bitfield, marker_idx)) || ((!clump_entries[marker_idx]) && (!nsig_arr[marker_idx]))) {
13343 	continue;
13344       }
13345       if (++cur_window_size == max_window_size) {
13346 	goto clump_reports_ret_NOMEM;
13347       }
13348       if (fseeko(bedfile, bed_offset + marker_uidx * ((uint64_t)unfiltered_sample_ct4), SEEK_SET)) {
13349 	goto clump_reports_ret_READ_FAIL;
13350       }
13351       window_data_ptr[founder_ctv2 - 2] = 0;
13352       window_data_ptr[founder_ctv2 - 1] = 0;
13353       if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, is_set(marker_reverse, marker_uidx), bedfile, loadbuf_raw, window_data_ptr)) {
13354 	goto clump_reports_ret_READ_FAIL;
13355       }
13356       if (is_haploid) {
13357 	haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)window_data_ptr);
13358       }
13359       window_data_ptr = &(window_data_ptr[founder_ctv2]);
13360     }
13361     next_unset_unsafe_ck(marker_exclude, &marker_uidx);
13362     if (fseeko(bedfile, bed_offset + marker_uidx * ((uint64_t)unfiltered_sample_ct4), SEEK_SET)) {
13363       goto clump_reports_ret_READ_FAIL;
13364     }
13365     window_data_ptr[founder_ctv2 - 2] = 0;
13366     window_data_ptr[founder_ctv2 - 1] = 0;
13367     if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, is_set(marker_reverse, marker_uidx), bedfile, loadbuf_raw, window_data_ptr)) {
13368       goto clump_reports_ret_READ_FAIL;
13369     }
13370     if (is_haploid) {
13371       haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)window_data_ptr);
13372     }
13373     vec_datamask(founder_ct, 0, window_data_ptr, founder_include2, index_data);
13374     index_tots[0] = popcount2_longs(index_data, founder_ctl2);
13375     vec_datamask(founder_ct, 2, window_data_ptr, founder_include2, &(index_data[founder_ctv2]));
13376     index_tots[1] = popcount2_longs(&(index_data[founder_ctv2]), founder_ctl2);
13377     vec_datamask(founder_ct, 3, window_data_ptr, founder_include2, &(index_data[2 * founder_ctv2]));
13378     index_tots[2] = popcount2_longs(&(index_data[2 * founder_ctv2]), founder_ctl2);
13379     if (is_x) {
13380       vec_datamask(founder_ct, 0, window_data_ptr, founder_male_include2, &(index_data[3 * founder_ctv2]));
13381       index_tots[3] = popcount2_longs(&(index_data[3 * founder_ctv2]), founder_ctl2);
13382       vec_datamask(founder_ct, 3, window_data_ptr, founder_male_include2, &(index_data[4 * founder_ctv2]));
13383       index_tots[4] = popcount2_longs(&(index_data[4 * founder_ctv2]), founder_ctl2);
13384     }
13385     if (!cur_window_size) {
13386       cur_clump_base = (Cur_clump_info*)(&(window_data[founder_ctv2]));
13387     } else {
13388       cur_clump_base = (Cur_clump_info*)window_data_ptr;
13389     }
13390     cc_ptr = cur_clump_base;
13391     window_data_ptr = window_data;
13392     marker_uidx = clump_uidx_first;
13393     marker_idx = ivar_idx + popcount_bit_idx(marker_exclude, clump_uidx_first, ivar_uidx) + clump_uidx_first - ivar_uidx;
13394     max_r2 = -1;
13395     max_r2_uidx = 0xffffffffU;
13396     fill_ulong_zero(5, histo);
13397     best_entry_ptr = nullptr;
13398     for (; marker_idx < ivar_idx; marker_uidx++, marker_idx++) {
13399       marker_uidx = next_unset_unsafe(marker_exclude, marker_uidx);
13400       clump_entry_ptr = clump_entries[marker_idx];
13401       if (((!allow_overlap) && is_set(cur_bitfield, marker_idx)) || ((!clump_entry_ptr) && (!nsig_arr[marker_idx]))) {
13402 	continue;
13403       }
13404       genovec_3freq(window_data_ptr, index_data, founder_ctl2, &(counts[0]), &(counts[1]), &(counts[2]));
13405       counts[0] = index_tots[0] - counts[0] - counts[1] - counts[2];
13406       genovec_3freq(window_data_ptr, &(index_data[founder_ctv2]), founder_ctl2, &(counts[3]), &(counts[4]), &(counts[5]));
13407       counts[3] = index_tots[1] - counts[3] - counts[4] - counts[5];
13408       genovec_3freq(window_data_ptr, &(index_data[2 * founder_ctv2]), founder_ctl2, &(counts[6]), &(counts[7]), &(counts[8]));
13409       counts[6] = index_tots[2] - counts[6] - counts[7] - counts[8];
13410       if (is_x) {
13411         genovec_3freq(window_data_ptr, &(index_data[3 * founder_ctv2]), founder_ctl2, &(counts[9]), &(counts[10]), &(counts[11]));
13412         counts[9] = index_tots[3] - counts[9] - counts[11];
13413         genovec_3freq(window_data_ptr, &(index_data[4 * founder_ctv2]), founder_ctl2, &(counts[15]), &(counts[16]), &(counts[17]));
13414         counts[15] = index_tots[4] - counts[15] - counts[17];
13415       }
13416       if (!em_phase_hethet_nobase(counts, is_x, is_x, &freq1x, &freq2x, &freqx1, &freqx2, &freq11)) {
13417 	freq11_expected = freqx1 * freq1x;
13418 	dxx = freq11 - freq11_expected;
13419 	cur_r2 = fabs(dxx);
13420         // if r^2 threshold is 0, let everything else through but exclude the
13421         // apparent zeroes.  Zeroes *are* included if r2_thresh is negative,
13422 	// though (only nans are rejected then).
13423         if (cur_r2 >= SMALL_EPSILON) {
13424 	  cur_r2 = cur_r2 * dxx / (freq11_expected * freq2x * freqx2);
13425 	} else {
13426 	  cur_r2 = 0;
13427 	}
13428 	if (fabs(cur_r2) > r2_thresh) {
13429 	  while (clump_entry_ptr) {
13430 	    dxx = clump_entry_ptr->pval;
13431 	    update_clump_histo(dxx, histo);
13432 	    if (dxx < p2_thresh) {
13433 	      if (((unsigned char*)cc_ptr) >= g_bigstack_end) {
13434 		goto clump_reports_ret_NOMEM;
13435 	      }
13436 	      cc_ptr->r2 = cur_r2;
13437 	      cc_ptr->marker_idx = marker_idx;
13438 	      uii = clump_entry_ptr->fidx;
13439 	      cc_ptr->fidx = uii;
13440 	      if ((uii == best_fidx_match) && (fabs(cur_r2) > max_r2)) {
13441 		max_r2 = cur_r2;
13442 		max_r2_uidx = marker_uidx;
13443 		best_entry_ptr = clump_entry_ptr;
13444 	      }
13445 	      cc_ptr++;
13446 	    }
13447 	    clump_entry_ptr = clump_entry_ptr->next;
13448 	  }
13449 	  histo[0] += nsig_arr[marker_idx];
13450 	  set_bit(marker_idx, cur_bitfield);
13451 	}
13452       }
13453       window_data_ptr = &(window_data_ptr[founder_ctv2]);
13454     }
13455     pval = sorted_pvals[sp_idx];
13456     clump_entry_ptr = clump_entries[ivar_idx];
13457     uii = 0;
13458     if (clump_entry_ptr->pval != pval) {
13459       uii = 1;
13460       do {
13461 	dxx = clump_entry_ptr->pval;
13462 	update_clump_histo(dxx, histo);
13463 	if (dxx < p2_thresh) {
13464 	  if (((unsigned char*)cc_ptr) >= g_bigstack_end) {
13465 	    goto clump_reports_ret_NOMEM;
13466 	  }
13467 	  cc_ptr->r2 = 1;
13468 	  cc_ptr->marker_idx = ivar_idx;
13469 	  cc_ptr->fidx = clump_entry_ptr->fidx;
13470 	  // clump_best match should be impossible here
13471 	  cc_ptr++;
13472 	}
13473 	clump_entry_ptr = clump_entry_ptr->next;
13474       } while (clump_entry_ptr->pval != pval);
13475     }
13476     index_fidx = clump_entry_ptr->fidx;
13477     if (annot_flattened) {
13478       annot_ptr = clump_entry_ptr->annot;
13479     }
13480     if ((!clump_best) || allow_overlap || (!is_set(cur_bitfield, ivar_idx))) {
13481       if (clump_entry_ptr->next) {
13482 	uii = 1;
13483 	do {
13484 	  clump_entry_ptr = clump_entry_ptr->next;
13485 	  dxx = clump_entry_ptr->pval;
13486 	  update_clump_histo(dxx, histo);
13487 	  if (dxx < p2_thresh) {
13488 	    if (((unsigned char*)cc_ptr) >= g_bigstack_end) {
13489 	      goto clump_reports_ret_NOMEM;
13490 	    }
13491 	    cc_ptr->r2 = 1;
13492 	    cc_ptr->marker_idx = ivar_idx;
13493 	    cc_ptr->fidx = clump_entry_ptr->fidx;
13494 	    if (clump_best) {
13495 	      max_r2 = 1;
13496 	      max_r2_uidx = ivar_uidx;
13497 	      best_entry_ptr = clump_entry_ptr;
13498 	    }
13499 	    cc_ptr++;
13500 	  }
13501 	} while (clump_entry_ptr->next);
13502       }
13503     }
13504     // include co-located entries in the clump and mark the position as clumped
13505     // iff
13506     //   i. there were co-located entries in the first place, and either
13507     //     ii-a. overlaps are permitted or
13508     //     ii-b. index variant position was not previously clumped
13509     if ((uii || nsig_arr[ivar_idx]) && (allow_overlap || (!is_set(cur_bitfield, ivar_idx)))) {
13510       histo[0] += nsig_arr[ivar_idx];
13511       set_bit(ivar_idx, cur_bitfield);
13512     }
13513     marker_uidx = ivar_uidx;
13514     marker_idx = ivar_idx;
13515     while (marker_uidx < clump_uidx_last) {
13516       marker_uidx++;
13517       next_unset_unsafe_ck(marker_exclude, &marker_uidx);
13518       marker_idx++;
13519       clump_entry_ptr = clump_entries[marker_idx];
13520       if (((!allow_overlap) && is_set(cur_bitfield, marker_idx)) || ((!clump_entry_ptr) && (!nsig_arr[marker_idx]))) {
13521 	continue;
13522       }
13523       if (fseeko(bedfile, bed_offset + marker_uidx * ((uint64_t)unfiltered_sample_ct4), SEEK_SET)) {
13524 	goto clump_reports_ret_READ_FAIL;
13525       }
13526       window_data[founder_ctv2 - 2] = 0;
13527       window_data[founder_ctv2 - 1] = 0;
13528       if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, is_set(marker_reverse, marker_uidx), bedfile, loadbuf_raw, window_data)) {
13529 	goto clump_reports_ret_READ_FAIL;
13530       }
13531       if (is_haploid) {
13532         haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)window_data);
13533       }
13534       genovec_3freq(window_data, index_data, founder_ctl2, &(counts[0]), &(counts[1]), &(counts[2]));
13535       counts[0] = index_tots[0] - counts[0] - counts[1] - counts[2];
13536       genovec_3freq(window_data, &(index_data[founder_ctv2]), founder_ctl2, &(counts[3]), &(counts[4]), &(counts[5]));
13537       counts[3] = index_tots[1] - counts[3] - counts[4] - counts[5];
13538       genovec_3freq(window_data, &(index_data[2 * founder_ctv2]), founder_ctl2, &(counts[6]), &(counts[7]), &(counts[8]));
13539       counts[6] = index_tots[2] - counts[6] - counts[7] - counts[8];
13540       if (is_x) {
13541         genovec_3freq(window_data, &(index_data[3 * founder_ctv2]), founder_ctl2, &(counts[9]), &(counts[10]), &(counts[11]));
13542         counts[9] = index_tots[3] - counts[9] - counts[11];
13543         genovec_3freq(window_data, &(index_data[4 * founder_ctv2]), founder_ctl2, &(counts[15]), &(counts[16]), &(counts[17]));
13544         counts[15] = index_tots[4] - counts[15] - counts[17];
13545       }
13546       if (!em_phase_hethet_nobase(counts, is_x, is_x, &freq1x, &freq2x, &freqx1, &freqx2, &freq11)) {
13547         freq11_expected = freqx1 * freq1x;
13548 	dxx = freq11 - freq11_expected;
13549 	cur_r2 = fabs(dxx);
13550 	if (cur_r2 >= SMALL_EPSILON) {
13551 	  cur_r2 = cur_r2 * dxx / (freq11_expected * freq2x * freqx2);
13552 	} else {
13553 	  cur_r2 = 0;
13554 	}
13555 	if (fabs(cur_r2) > r2_thresh) {
13556 	  while (clump_entry_ptr) {
13557 	    dxx = clump_entry_ptr->pval;
13558             update_clump_histo(dxx, histo);
13559 	    if (dxx < p2_thresh) {
13560 	      if (((unsigned char*)cc_ptr) >= g_bigstack_end) {
13561 		goto clump_reports_ret_NOMEM;
13562 	      }
13563 	      cc_ptr->r2 = cur_r2;
13564 	      cc_ptr->marker_idx = marker_idx;
13565 	      uii = clump_entry_ptr->fidx;
13566 	      cc_ptr->fidx = uii;
13567 	      if ((uii == best_fidx_match) && (fabs(cur_r2) > max_r2)) {
13568 		max_r2 = cur_r2;
13569 		max_r2_uidx = marker_uidx;
13570 		best_entry_ptr = clump_entry_ptr;
13571 	      }
13572 	      cc_ptr++;
13573 	    }
13574 	    clump_entry_ptr = clump_entry_ptr->next;
13575 	  }
13576 	  histo[0] += nsig_arr[marker_idx];
13577 	  set_bit(marker_idx, cur_bitfield);
13578 	}
13579       }
13580     }
13581     cur_window_size = (uintptr_t)(cc_ptr - cur_clump_base);
13582     if (require_multifile) {
13583       if (cur_window_size < 2) {
13584 	continue;
13585       }
13586       uii = cur_clump_base[0].fidx;
13587       for (ulii = 1; ulii < cur_window_size; ulii++) {
13588         if (uii != cur_clump_base[ulii].fidx) {
13589 	  break;
13590 	}
13591       }
13592       if (ulii == cur_window_size) {
13593 	continue;
13594       }
13595     }
13596     if (clump_best) {
13597       bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[ivar_uidx * max_marker_id_len]), g_textbuf);
13598       *bufptr++ = ' ';
13599       if (best_entry_ptr) {
13600 	bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[max_r2_uidx * max_marker_id_len]), bufptr);
13601 	*bufptr++ = ' ';
13602         if (max_r2_uidx == ivar_uidx) {
13603 	  bufptr = memcpya(bufptr, "     *", 6);
13604 	} else {
13605 	  bufptr = dtoa_g_wxp3(fabs(max_r2), 6, bufptr);
13606 	}
13607 	*bufptr++ = ' ';
13608 	bufptr = dtoa_g_wxp3x(((double)((int32_t)(marker_pos[max_r2_uidx] - cur_bp))) * 0.001, 8, ' ', bufptr);
13609 	bufptr = dtoa_g_wxp3x(best_entry_ptr->pval, 8, ' ', bufptr);
13610 	if (max_r2 > 0) {
13611 	  uii = 0;
13612 	} else {
13613 	  uii = 1;
13614 	}
13615         cur_a1 = marker_allele_ptrs[2 * ivar_uidx];
13616         cur_a2 = marker_allele_ptrs[2 * ivar_uidx + 1];
13617         bufptr2 = marker_allele_ptrs[2 * max_r2_uidx + uii];
13618         bufptr3 = marker_allele_ptrs[2 * max_r2_uidx + 1 - uii];
13619 	bufptr4 = cur_a1;
13620 	for (uii = 3; uii; uii--) {
13621 	  if (!(*(++bufptr4))) {
13622 	    bufptr4 = cur_a2;
13623 	    for (; uii; uii--) {
13624 	      if (!(*(++bufptr4))) {
13625 		bufptr4 = bufptr2;
13626 		for (; uii; uii--) {
13627 		  if (!(*(++bufptr4))) {
13628 		    bufptr4 = bufptr3;
13629 		    for (; uii; uii--) {
13630 		      if (!(*(++bufptr4))) {
13631 			bufptr = memseta(bufptr, 32, uii);
13632 			break;
13633 		      }
13634 		    }
13635 		    break;
13636 		  }
13637 		}
13638 		break;
13639 	      }
13640 	    }
13641 	    break;
13642 	  }
13643 	}
13644 	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile_best)) {
13645 	  goto clump_reports_ret_WRITE_FAIL;
13646 	}
13647         fputs(cur_a1, outfile_best);
13648 	fputs(bufptr2, outfile_best);
13649 	putc_unlocked('/', outfile_best);
13650         fputs(cur_a2, outfile_best);
13651         fputs(bufptr3, outfile_best);
13652         g_textbuf[0] = ' ';
13653         bufptr = uint32toa_w8x(best_fidx_match, ' ', &(g_textbuf[1]));
13654 	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile_best)) {
13655 	  goto clump_reports_ret_WRITE_FAIL;
13656 	}
13657 	if (annot_flattened) {
13658           fputs(best_entry_ptr->annot, outfile_best);
13659 	}
13660         putc_unlocked('\n', outfile_best);
13661       } else {
13662 	bufptr = fw_strcpyn(plink_maxsnp, 2, "NA", bufptr);
13663         bufptr = memcpya(bufptr, "     NA       NA       NA       NA       NA \n", 45);
13664 	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile_best)) {
13665 	  goto clump_reports_ret_WRITE_FAIL;
13666 	}
13667       }
13668     }
13669     bufptr = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, clump_chrom_idx, g_textbuf));
13670     *bufptr++ = ' ';
13671     bufptr = uint32toa_w4(index_fidx, bufptr);
13672     *bufptr++ = ' ';
13673     bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[ivar_uidx * max_marker_id_len]), bufptr);
13674     *bufptr++ = ' ';
13675     bufptr = uint32toa_w10x(cur_bp, ' ', bufptr);
13676     bufptr = dtoa_g_wxp3x(pval, 10, ' ', bufptr);
13677 #ifdef __LP64__
13678     // may as well be paranoid
13679     bufptr = width_force(8, bufptr, int64toa((int64_t)(histo[0] + histo[1] + histo[2] + histo[3] + histo[4]), bufptr));
13680     *bufptr++ = ' ';
13681     for (uii = 0; uii < 5; uii++) {
13682       bufptr = width_force(6, bufptr, int64toa((int64_t)((uintptr_t)histo[uii]), bufptr));
13683       *bufptr++ = ' ';
13684     }
13685 #else
13686     bufptr = uint32toa_w8x(histo[0] + histo[1] + histo[2] + histo[3] + histo[4], ' ', bufptr);
13687     for (uii = 0; uii < 5; uii++) {
13688       bufptr = uint32toa_w6x(histo[uii], ' ', bufptr);
13689     }
13690 #endif
13691     final_clump_ct++;
13692     min_bp = cur_bp;
13693     max_bp = cur_bp;
13694     if (cur_window_size) {
13695       marker_idx = cur_clump_base[0].marker_idx;
13696       if (marker_idx < ivar_idx) {
13697 	min_bp = marker_pos[marker_idx_to_uidx[marker_idx]];
13698       }
13699       marker_idx = cur_clump_base[cur_window_size - 1].marker_idx;
13700       if (marker_idx > ivar_idx) {
13701 	max_bp = marker_pos[marker_idx_to_uidx[marker_idx]];
13702       }
13703     }
13704     if (rg_setdefs) {
13705       ulii = rg_chrom_bounds[clump_chrom_idx];
13706       cur_rg_setdefs = &(rg_setdefs[ulii]);
13707       cur_rg_names = &(range_group_names[ulii * max_range_group_id_len + 4]);
13708       cur_rg_ct = rg_chrom_bounds[clump_chrom_idx + 1] - ulii;
13709     }
13710     if (!clump_verbose) {
13711       if (!cur_window_size) {
13712 	bufptr = memcpya(bufptr, "NONE\n", 5);
13713 	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
13714 	  goto clump_reports_ret_WRITE_FAIL;
13715 	}
13716       } else {
13717 	// avoid buffer overflow
13718 	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
13719 	  goto clump_reports_ret_WRITE_FAIL;
13720 	}
13721 	g_textbuf[0] = '(';
13722 	for (ulii = 0; ulii < cur_window_size;) {
13723           fputs(&(marker_ids[marker_idx_to_uidx[cur_clump_base[ulii].marker_idx] * max_marker_id_len]), outfile);
13724 	  bufptr = uint32toa_x(cur_clump_base[ulii].fidx, ')', &(g_textbuf[1]));
13725 	  ulii++;
13726 	  if (ulii != cur_window_size) {
13727 	    *bufptr++ = ',';
13728 	  }
13729 	  fwrite(g_textbuf, 1, (uintptr_t)(bufptr - g_textbuf), outfile);
13730 	}
13731 	if (putc_checked('\n', outfile)) {
13732 	  goto clump_reports_ret_WRITE_FAIL;
13733 	}
13734       }
13735       if (rg_setdefs) {
13736         bufptr = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, clump_chrom_idx, g_textbuf));
13737         *bufptr++ = ' ';
13738         bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[ivar_uidx * max_marker_id_len]), bufptr);
13739         *bufptr++ = ' ';
13740         bufptr = dtoa_g_wxp4x(pval, 10, ' ', bufptr);
13741         bufptr = uint32toa_w6x(cur_window_size + 1, ' ', bufptr);
13742 	if (clump_chrom_idx <= chrom_info_ptr->max_code) {
13743 	  bufptr2 = memcpyl3a(bufptr, "chr");
13744 	  bufptr2 = uint32toa(clump_chrom_idx, bufptr2);
13745 	} else if (chrom_info_ptr->zero_extra_chroms) {
13746 	  bufptr2 = memcpya(bufptr, "chr0", 4);
13747 	} else {
13748 	  bufptr2 = strcpya(bufptr, chrom_info_ptr->nonstd_names[clump_chrom_idx]);
13749 	}
13750         *bufptr2++ = ':';
13751         bufptr2 = uint32toa(min_bp, bufptr2);
13752         bufptr2 = memcpya(bufptr2, "..", 2);
13753         bufptr2 = uint32toa(max_bp, bufptr2);
13754         bufptr = width_force(28, bufptr, bufptr2);
13755         *bufptr++ = ' ';
13756         bufptr = width_force(10, bufptr, dtoa_g(((int32_t)(max_bp - min_bp + 1)) * 0.001, bufptr));
13757 	bufptr = memcpya(bufptr, " [", 2);
13758         if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile_ranges)) {
13759 	  goto clump_reports_ret_WRITE_FAIL;
13760 	}
13761 	uljj = 0;
13762 	for (ulii = 0; ulii < cur_rg_ct; ulii++) {
13763 	  if (interval_in_setdef(cur_rg_setdefs[ulii], min_bp, max_bp)) {
13764             if (uljj) {
13765 	      putc_unlocked(',', outfile_ranges);
13766 	    } else {
13767 	      uljj = 1;
13768 	    }
13769             fputs(&(cur_rg_names[ulii * max_range_group_id_len]), outfile_ranges);
13770 	  }
13771 	}
13772 	fputs("]\n", outfile_ranges);
13773       }
13774     } else {
13775       if (fwrite_checked(tbuf2, header1_len, outfile)) {
13776 	goto clump_reports_ret_WRITE_FAIL;
13777       }
13778       *bufptr++ = '\n';
13779       if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
13780 	goto clump_reports_ret_WRITE_FAIL;
13781       }
13782       if (cur_window_size) {
13783 	if (fwrite_checked(header2_ptr, header2_len, outfile)) {
13784 	  goto clump_reports_ret_WRITE_FAIL;
13785 	}
13786         bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[ivar_uidx * max_marker_id_len]), g_textbuf);
13787 	bufptr = memcpya(bufptr, "          0    1.000 ", 21);
13788 	cur_a1 = marker_allele_ptrs[2 * ivar_uidx];
13789 	a1_len = strlen(cur_a1);
13790 	if (a1_len < 8) {
13791 	  bufptr = memseta(bufptr, 32, 8 - a1_len);
13792 	}
13793 	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
13794 	  goto clump_reports_ret_WRITE_FAIL;
13795 	}
13796 	fwrite(cur_a1, 1, a1_len, outfile);
13797 	cur_a2 = marker_allele_ptrs[2 * ivar_uidx + 1];
13798         a2_len = strlen(cur_a2);
13799 	if (a1_len + a2_len < 5) {
13800 	  allele_padding = 5 - a1_len - a2_len;
13801 	} else {
13802 	  allele_padding = 0;
13803 	}
13804 	g_textbuf[0] = ' ';
13805         bufptr = uint32toa_w4x(index_fidx, ' ', &(g_textbuf[1]));
13806 	bufptr = dtoa_g_wxp3x(pval, 12, ' ', bufptr);
13807 	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
13808 	  goto clump_reports_ret_WRITE_FAIL;
13809 	}
13810 	if (annot_flattened) {
13811 	  bufptr2 = annot_ptr;
13812 	  for (uii = 11; uii; uii--) {
13813             if (!(*(++bufptr2))) {
13814 	      fwrite("           ", 1, uii, outfile);
13815 	      break;
13816 	    }
13817 	  }
13818 	  fputs(annot_ptr, outfile);
13819 	}
13820 	fputs("\n\n", outfile);
13821 	last_marker_idx = ~ZEROLU;
13822 	if (rg_setdefs) {
13823 	  fill_ulong_zero(BITCT_TO_WORDCT(cur_rg_ct), rangematch_bitfield);
13824 	  unmatched_group_ct = cur_rg_ct;
13825 	}
13826 	for (ulii = 0; ulii < cur_window_size; ulii++) {
13827 	  bufptr = memseta(g_textbuf, 32, 10);
13828 	  marker_idx = cur_clump_base[ulii].marker_idx;
13829 	  if (last_marker_idx != marker_idx) {
13830 	    marker_uidx = marker_idx_to_uidx[marker_idx];
13831 	    clump_entry_ptr = clump_entries[marker_idx];
13832 	    if (rg_setdefs) {
13833 	      uii = marker_pos[marker_uidx];
13834 	      uljj = 0; // range group idx
13835 	      ulkk = 0; // number of new matches
13836 	      for (ulmm = 0; ulmm < unmatched_group_ct; uljj++, ulmm++) {
13837 		next_unset_ul_unsafe_ck(rangematch_bitfield, &uljj);
13838 		if (interval_in_setdef(cur_rg_setdefs[uljj], uii, uii + 1)) {
13839 		  set_bit(uljj, rangematch_bitfield);
13840 		  ulkk++;
13841 		}
13842 	      }
13843 	      unmatched_group_ct -= ulkk;
13844 	    }
13845 	  }
13846 	  ukk = cur_clump_base[ulii].fidx;
13847 	  while (clump_entry_ptr->fidx != ukk) {
13848 	    clump_entry_ptr = clump_entry_ptr->next;
13849 	  }
13850 	  bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
13851 	  *bufptr++ = ' ';
13852 	  bufptr = dtoa_g_wxp3x(((double)(((int32_t)marker_pos[marker_uidx]) - ((int32_t)cur_bp))) * 0.001, 10, ' ', bufptr);
13853 	  cur_r2 = cur_clump_base[ulii].r2;
13854 	  if (cur_r2 > 0) {
13855 	    ujj = 0;
13856 	  } else {
13857 	    ujj = 1; // reversed phase
13858 	  }
13859 	  bufptr = dtoa_g_wxp3x(fabs(cur_r2), 8, ' ', bufptr);
13860 	  bufptr2 = marker_allele_ptrs[marker_uidx * 2 + ujj];
13861 	  bufptr3 = marker_allele_ptrs[marker_uidx * 2 + 1 - ujj];
13862 	  if (allele_padding) {
13863 	    bufptr4 = bufptr2;
13864 	    for (uii = allele_padding; uii; uii--) {
13865 	      // fast in common case, don't bother to compute strlen for long
13866 	      // indels
13867 	      if (!(*(++bufptr4))) {
13868 		bufptr4 = bufptr3;
13869 		for (; uii; uii--) {
13870 		  if (!(*(++bufptr4))) {
13871 		    bufptr = memseta(bufptr, 32, uii);
13872 		    break;
13873 		  }
13874 		}
13875 		break;
13876 	      }
13877 	    }
13878 	  }
13879 	  if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
13880 	    goto clump_reports_ret_WRITE_FAIL;
13881 	  }
13882 	  fwrite(cur_a1, 1, a1_len, outfile);
13883 	  fputs(bufptr2, outfile);
13884 	  putc_unlocked('/', outfile);
13885 	  fwrite(cur_a2, 1, a2_len, outfile);
13886 	  fputs(bufptr3, outfile);
13887 	  g_textbuf[0] = ' ';
13888 	  bufptr = uint32toa_w4x(cur_clump_base[ulii].fidx, ' ', &(g_textbuf[1]));
13889 	  bufptr = dtoa_g_wxp3x(clump_entry_ptr->pval, 12, ' ', bufptr);
13890 	  if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
13891 	    goto clump_reports_ret_WRITE_FAIL;
13892 	  }
13893 	  if (annot_flattened) {
13894 	    bufptr2 = clump_entry_ptr->annot;
13895 	    bufptr3 = bufptr2;
13896 	    for (uii = 11; uii; uii--) {
13897 	      if (!(*(++bufptr3))) {
13898 		fwrite("           ", 1, uii, outfile);
13899 		break;
13900 	      }
13901 	    }
13902 	    fputs(bufptr2, outfile);
13903 	  }
13904 	  putc_unlocked('\n', outfile);
13905 	  last_marker_idx = marker_idx;
13906 	}
13907 	bufptr = memcpya(g_textbuf, "\n          RANGE: ", 18);
13908 	if (clump_chrom_idx <= chrom_info_ptr->max_code) {
13909 	  bufptr = memcpyl3a(bufptr, "chr");
13910 	  bufptr = uint32toa(clump_chrom_idx, bufptr);
13911 	} else if (chrom_info_ptr->zero_extra_chroms) {
13912 	  bufptr = memcpya(bufptr, "chr0", 4);
13913 	} else {
13914 	  bufptr = strcpya(bufptr, chrom_info_ptr->nonstd_names[clump_chrom_idx]);
13915 	}
13916 	*bufptr++ = ':';
13917 	bufptr = uint32toa(min_bp, bufptr);
13918 	bufptr = memcpya(bufptr, "..", 2);
13919 	bufptr = uint32toa(max_bp, bufptr);
13920 	bufptr = memcpya(bufptr, "\n           SPAN: ", 18);
13921 	bufptr = uint32toa((max_bp - min_bp + 1) / 1000, bufptr);
13922 	bufptr = memcpyl3a(bufptr, "kb\n");
13923 	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
13924 	  goto clump_reports_ret_WRITE_FAIL;
13925 	}
13926 	if (rg_setdefs) {
13927 	  fputs("     GENES w/SNPs: ", outfile);
13928 	  ulii = 0;
13929 	  uljj = 0;
13930 	  unmatched_group_ct = cur_rg_ct - unmatched_group_ct;
13931 	  if (unmatched_group_ct) {
13932 	    while (1) {
13933 	      uljj = next_set_ul_unsafe(rangematch_bitfield, uljj);
13934 	      fputs(&(cur_rg_names[uljj * max_range_group_id_len]), outfile);
13935 	      if (!(--unmatched_group_ct)) {
13936 		break;
13937 	      }
13938 	      uljj++;
13939 	      putc_unlocked(',', outfile);
13940 	    }
13941 	  }
13942 	  putc_unlocked('\n', outfile);
13943 	}
13944       }
13945       if (rg_setdefs) {
13946 	if (!cur_window_size) {
13947 	  putc_unlocked('\n', outfile);
13948 	}
13949 	fputs("            GENES: ", outfile);
13950 	uljj = 0;
13951 	for (ulii = 0; ulii < cur_rg_ct; ulii++) {
13952 	  if (interval_in_setdef(cur_rg_setdefs[ulii], min_bp, max_bp)) {
13953             if (uljj) {
13954 	      if (uljj & 7) {
13955 		putc_unlocked(',', outfile);
13956 	      } else {
13957 		putc_unlocked('\n', outfile);
13958 	      }
13959 	    }
13960             fputs(&(cur_rg_names[ulii * max_range_group_id_len]), outfile);
13961 	    uljj++;
13962 	  }
13963 	}
13964 	putc_unlocked('\n', outfile);
13965       }
13966       if (fwrite_checked("\n------------------------------------------------------------------\n\n", 69, outfile)) {
13967 	goto clump_reports_ret_WRITE_FAIL;
13968       }
13969     }
13970   }
13971   putc_unlocked('\n', outfile);
13972   if (missing_variant_ct) {
13973     // 1. sort by ID (could switch this to hash table-based too)
13974     // 2. pick smallest pval when duplicates present
13975     // 3. sort by pval
13976     // 4. write results
13977     bigstack_double_reset(bigstack_mark, bigstack_end_mark);
13978     if (bigstack_alloc_c(missing_variant_ct * max_missing_id_len, &sorted_missing_variant_ids) ||
13979 	bigstack_alloc_d(missing_variant_ct, &sorted_pvals)) {
13980       goto clump_reports_ret_NOMEM;
13981     }
13982     for (ulii = 0; ulii < missing_variant_ct; ulii++) {
13983       cm_ptr = not_found_list;
13984       strcpy(&(sorted_missing_variant_ids[ulii * max_missing_id_len]), cm_ptr->idstr);
13985       sorted_pvals[ulii] = cm_ptr->pval;
13986       not_found_list = not_found_list->next;
13987       free(cm_ptr);
13988     }
13989     if (qsort_ext(sorted_missing_variant_ids, missing_variant_ct, max_missing_id_len, strcmp_deref, (char*)sorted_pvals, sizeof(double))) {
13990       goto clump_reports_ret_NOMEM;
13991     }
13992     bufptr = sorted_missing_variant_ids;
13993     uii = strlen(sorted_missing_variant_ids);
13994     for (ulii = 1; ulii < missing_variant_ct; ulii++) {
13995       bufptr2 = &(bufptr[max_missing_id_len]);
13996       ujj = strlen(bufptr2);
13997       if ((uii == ujj) && (!memcmp(bufptr, bufptr2, uii))) {
13998 	uljj = ulii - 1; // write index
13999 	pval = sorted_pvals[uljj];
14000 	if (pval > sorted_pvals[ulii]) {
14001 	  pval = sorted_pvals[ulii];
14002 	}
14003         while (++ulii < missing_variant_ct) {
14004 	  bufptr2 = &(bufptr2[max_missing_id_len]);
14005 	  ujj = strlen(bufptr2);
14006 	  if ((uii == ujj) && (!memcmp(bufptr, bufptr2, uii))) {
14007 	    if (pval > sorted_pvals[ulii]) {
14008 	      pval = sorted_pvals[ulii];
14009 	    }
14010 	  } else {
14011 	    sorted_pvals[uljj++] = pval;
14012 	    bufptr = &(bufptr[max_missing_id_len]);
14013 	    memcpy(bufptr, bufptr2, ujj + 1);
14014 	    pval = sorted_pvals[ulii];
14015 	    uii = ujj;
14016 	  }
14017 	}
14018 	sorted_pvals[uljj] = pval;
14019 	ulii = uljj + 1; // save final array length
14020 	break;
14021       }
14022       bufptr = bufptr2;
14023       uii = ujj;
14024     }
14025     missing_variant_ct = ulii;
14026     if (qsort_ext((char*)sorted_pvals, missing_variant_ct, sizeof(double), double_cmp_deref, sorted_missing_variant_ids, max_missing_id_len)) {
14027       goto clump_reports_ret_NOMEM;
14028     }
14029     if (clump_verbose) {
14030       for (ulii = 0; ulii < missing_variant_ct; ulii++) {
14031 	fputs(&(sorted_missing_variant_ids[ulii * max_missing_id_len]), outfile);
14032 	fputs(" not found in dataset\n", outfile);
14033       }
14034       LOGPRINTF("%" PRIuPTR " top variant ID%s missing; see the end of the .clumped file.\n", missing_variant_ct, (missing_variant_ct == 1)? "" : "s");
14035     } else {
14036       uljj = MINV(missing_variant_ct, 3);
14037       for (ulii = 0; ulii < uljj; ulii++) {
14038 	LOGERRPRINTFWW("Warning: '%s' is missing from the main dataset, and is a top variant.\n", &(sorted_missing_variant_ids[ulii * max_missing_id_len]));
14039       }
14040       if (missing_variant_ct > 3) {
14041         fprintf(stderr, "%" PRIuPTR " more top variant ID%s missing; see log file.\n", missing_variant_ct - 3, (missing_variant_ct == 4)? "" : "s");
14042 	for (ulii = 3; ulii < missing_variant_ct; ulii++) {
14043 	  LOGPREPRINTFWW("Warning: '%s' is missing from the main dataset, and is a top variant.\n", &(sorted_missing_variant_ids[ulii * max_missing_id_len]));
14044 	  logstr(g_logbuf);
14045 	}
14046       }
14047     }
14048   }
14049   putc_unlocked('\n', outfile);
14050   if (fclose_null(&outfile)) {
14051     goto clump_reports_ret_WRITE_FAIL;
14052   }
14053   outname_end[8] = '\0';
14054   LOGPRINTF("--clump: %u clump%s formed from %u top variant%s.\n", final_clump_ct, (final_clump_ct == 1)? "" : "s", index_ct, (index_ct == 1)? "" : "s");
14055   LOGPRINTFWW("Results written to %s .\n", outname);
14056   if (rg_setdefs && (!clump_verbose)) {
14057     memcpy(&(outname_end[8]), ".ranges", 8);
14058     LOGPRINTFWW("--clump-range: Clump/region overlaps reported in %s .\n", outname);
14059   }
14060   if (clump_best) {
14061     memcpy(&(outname_end[8]), ".best", 6);
14062     LOGPRINTFWW("--clump-best: Best proxies written to %s .\n", outname);
14063   }
14064   while (0) {
14065   clump_reports_ret_NOMEM:
14066     retval = RET_NOMEM;
14067     break;
14068   clump_reports_ret_OPEN_FAIL:
14069     retval = RET_OPEN_FAIL;
14070     break;
14071   clump_reports_ret_READ_FAIL:
14072     retval = RET_READ_FAIL;
14073     break;
14074   clump_reports_ret_WRITE_FAIL:
14075     retval = RET_WRITE_FAIL;
14076     break;
14077   clump_reports_ret_INVALID_CMDLINE:
14078     retval = RET_INVALID_CMDLINE;
14079     break;
14080   clump_reports_ret_DUPLICATE_HEADER_COL:
14081     *bufptr2 = '\0';
14082     LOGPREPRINTFWW("Error: Duplicate column header '%s' in %s.\n", bufptr, fname_ptr);
14083   clump_reports_ret_INVALID_FORMAT_2:
14084     logerrprintb();
14085     retval = RET_INVALID_FORMAT;
14086     break;
14087   }
14088  clump_reports_ret_1:
14089   bigstack_double_reset(bigstack_mark, bigstack_end_mark);
14090   gzclose_cond(gz_infile);
14091   fclose_cond(outfile);
14092   fclose_cond(outfile_ranges);
14093   fclose_cond(outfile_best);
14094   while (not_found_list) {
14095     cm_ptr = not_found_list;
14096     not_found_list = not_found_list->next;
14097     free(cm_ptr);
14098   }
14099   return retval;
14100 }
14101