1 // This file is part of PLINK 1.90, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 
17 
18 #include "plink_common.h"
19 
20 #include "plink_assoc.h"
21 #include "plink_cluster.h"
22 #include "plink_ld.h"
23 #include "plink_matrix.h"
24 #include "plink_perm.h"
25 #include "plink_stats.h"
26 
aperm_init(Aperm_info * apip)27 void aperm_init(Aperm_info* apip) {
28   apip->min = 6;
29   apip->max = 1000000;
30   apip->alpha = 0;
31   apip->beta = 0.0001;
32   apip->init_interval = 1;
33   apip->interval_slope = 0.001;
34 }
35 
single_marker_cc_freqs(uintptr_t sample_ctl2,uintptr_t * lptr,uintptr_t * ctrl_include2,uintptr_t * case_include2,uint32_t * ctrl_setp,uint32_t * ctrl_missingp,uint32_t * case_setp,uint32_t * case_missingp)36 void single_marker_cc_freqs(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* ctrl_include2, uintptr_t* case_include2, uint32_t* ctrl_setp, uint32_t* ctrl_missingp, uint32_t* case_setp, uint32_t* case_missingp) {
37   // Counts the number of A2 alleles and missing calls for both cases and
38   // controls, for an autosomal marker.  (The caller is expected to calculate
39   // the A1 allele count.)
40   // See single_marker_freqs_and_hwe() for discussion.
41   //   A := genotype & 0x5555...
42   //   B := (genotype >> 1) & 0x5555...
43   //   C := A & (~B)
44   // missing: popcount(C)
45   // A2: [popcount(A) + popcount(B)] - popcount(C)
46   uint32_t tot_ctrl_ab = 0;
47   uint32_t tot_ctrl_c = 0;
48   uint32_t tot_case_ab = 0;
49   uint32_t tot_case_c = 0;
50   uintptr_t* lptr_end = &(lptr[sample_ctl2]);
51   uintptr_t loader;
52   uintptr_t loader2;
53   uintptr_t loader3;
54   uintptr_t loader4;
55 #ifdef __LP64__
56   uintptr_t cur_decr = 60;
57   uintptr_t* lptr_6x_end;
58   sample_ctl2 -= sample_ctl2 % 6;
59   while (sample_ctl2 >= 60) {
60   single_marker_cc_freqs_loop:
61     lptr_6x_end = &(lptr[cur_decr]);
62     count_2freq_dbl_960b((__m128i*)lptr, (__m128i*)lptr_6x_end, (__m128i*)ctrl_include2, (__m128i*)case_include2, &tot_ctrl_ab, &tot_ctrl_c, &tot_case_ab, &tot_case_c);
63     lptr = lptr_6x_end;
64     ctrl_include2 = &(ctrl_include2[cur_decr]);
65     case_include2 = &(case_include2[cur_decr]);
66     sample_ctl2 -= cur_decr;
67   }
68   if (sample_ctl2) {
69     cur_decr = sample_ctl2;
70     goto single_marker_cc_freqs_loop;
71   }
72 #else
73   uintptr_t* lptr_six_end = &(lptr[sample_ctl2 - (sample_ctl2 % 6)]);
74   while (lptr < lptr_six_end) {
75     count_2freq_dbl_24b(lptr, ctrl_include2, case_include2, &tot_ctrl_ab, &tot_ctrl_c, &tot_case_ab, &tot_case_c);
76     lptr = &(lptr[6]);
77     ctrl_include2 = &(ctrl_include2[6]);
78     case_include2 = &(case_include2[6]);
79   }
80 #endif
81   while (lptr < lptr_end) {
82     loader = *lptr++;
83     loader2 = *ctrl_include2++;
84     loader3 = loader >> 1;
85     loader4 = loader2 & loader;
86     tot_ctrl_ab += popcount2_long(loader4 + (loader3 & loader2));
87     tot_ctrl_c += popcount2_long(loader4 & (~loader3));
88     loader2 = *case_include2++;
89     loader4 = loader2 & loader;
90     tot_case_ab += popcount2_long(loader4 + (loader3 & loader2));
91     tot_case_c += popcount2_long(loader4 & (~loader3));
92   }
93   *ctrl_missingp = tot_ctrl_c;
94   *ctrl_setp = tot_ctrl_ab - tot_ctrl_c;
95   *case_missingp = tot_case_c;
96   *case_setp = tot_case_ab - tot_case_c;
97 }
98 
haploid_single_marker_cc_freqs(uintptr_t sample_ctl2,uintptr_t * lptr,uintptr_t * ctrl_include2,uintptr_t * case_include2,uint32_t * ctrl_setp,uint32_t * ctrl_missingp,uint32_t * case_setp,uint32_t * case_missingp)99 void haploid_single_marker_cc_freqs(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* ctrl_include2, uintptr_t* case_include2, uint32_t* ctrl_setp, uint32_t* ctrl_missingp, uint32_t* case_setp, uint32_t* case_missingp) {
100   // Counts the number of A1 and A2 alleles for both cases and controls, for a
101   // haploid marker.
102   //   A := genotype & 0x5555...
103   //   B := (genotype >> 1) & 0x5555...
104   //   C := B ^ A
105   // missing: popcount(C)
106   // A2: popcount(A & B)
107   uint32_t tot_ctrl_ab = 0;
108   uint32_t tot_ctrl_c = 0;
109   uint32_t tot_case_ab = 0;
110   uint32_t tot_case_c = 0;
111   uintptr_t* lptr_end = &(lptr[sample_ctl2]);
112   uintptr_t loader;
113   uintptr_t loader2;
114   uintptr_t loader3;
115   while (lptr < lptr_end) {
116     loader = *lptr++;
117     loader2 = loader >> 1;
118     loader3 = loader2 ^ loader;
119     loader &= loader2;
120     loader2 = *ctrl_include2++;
121     tot_ctrl_ab += popcount2_long(loader & loader2);
122     tot_ctrl_c += popcount2_long(loader3 & loader2);
123     loader2 = *case_include2++;
124     tot_case_ab += popcount2_long(loader & loader2);
125     tot_case_c += popcount2_long(loader3 & loader2);
126   }
127   *ctrl_setp = tot_ctrl_ab;
128   *ctrl_missingp = tot_ctrl_c;
129   *case_setp = tot_case_ab;
130   *case_missingp = tot_case_c;
131 }
132 
single_marker_cc_3freqs(uintptr_t sample_ctl2,uintptr_t * lptr,uintptr_t * ctrl_include2,uintptr_t * case_include2,uint32_t * ctrl_hom2p,uint32_t * ctrl_hetp,uint32_t * ctrl_missingp,uint32_t * case_hom2p,uint32_t * case_hetp,uint32_t * case_missingp)133 void single_marker_cc_3freqs(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* ctrl_include2, uintptr_t* case_include2, uint32_t* ctrl_hom2p, uint32_t* ctrl_hetp, uint32_t* ctrl_missingp, uint32_t* case_hom2p, uint32_t* case_hetp, uint32_t* case_missingp) {
134   // Counts the number of heterozygotes, A2 homozygotes, and missing calls for
135   // both cases and controls.  Assumes marker is diploid.  The caller is
136   // expected to calculate the A1 allele count.
137   // See single_marker_freqs_and_hwe() for discussion.
138   //   A := genotype & 0x5555...
139   //   B := (genotype >> 1) & 0x5555...
140   //   C := A & B
141   //   popcount(C) = homozyg major ct
142   //   popcount(B) = het ct + homozyg major ct
143   //   popcount(A) = missing_ct + homozyg major ct
144   // hom2: popcount(C)
145   // het: popcount(B) - popcount(C)
146   // missing: popcount(A) - popcount(C)
147   uint32_t tot_ctrl_a = 0;
148   uint32_t tot_ctrl_b = 0;
149   uint32_t tot_ctrl_c = 0;
150   uint32_t tot_case_a = 0;
151   uint32_t tot_case_b = 0;
152   uint32_t tot_case_c = 0;
153   uintptr_t* lptr_end = &(lptr[sample_ctl2]);
154   uintptr_t loader;
155   uintptr_t loader2;
156   uintptr_t loader3;
157 #ifdef __LP64__
158   uintptr_t cur_decr = 120;
159   uintptr_t* lptr_12x_end;
160   sample_ctl2 -= sample_ctl2 % 12;
161   while (sample_ctl2 >= 120) {
162   single_marker_cc_3freqs_loop:
163     lptr_12x_end = &(lptr[cur_decr]);
164     count_3freq_1920b((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)ctrl_include2, &tot_ctrl_a, &tot_ctrl_b, &tot_ctrl_c);
165     count_3freq_1920b((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)case_include2, &tot_case_a, &tot_case_b, &tot_case_c);
166     lptr = lptr_12x_end;
167     ctrl_include2 = &(ctrl_include2[cur_decr]);
168     case_include2 = &(case_include2[cur_decr]);
169     sample_ctl2 -= cur_decr;
170   }
171   if (sample_ctl2) {
172     cur_decr = sample_ctl2;
173     goto single_marker_cc_3freqs_loop;
174   }
175 #else
176   uintptr_t* lptr_twelve_end = &(lptr[sample_ctl2 - (sample_ctl2 % 12)]);
177   while (lptr < lptr_twelve_end) {
178     count_3freq_48b(lptr, ctrl_include2, &tot_ctrl_a, &tot_ctrl_b, &tot_ctrl_c);
179     count_3freq_48b(lptr, case_include2, &tot_case_a, &tot_case_b, &tot_case_c);
180     lptr = &(lptr[12]);
181     ctrl_include2 = &(ctrl_include2[12]);
182     case_include2 = &(case_include2[12]);
183   }
184 #endif
185   while (lptr < lptr_end) {
186     //   A := genotype & 0x5555...
187     //   B := (genotype >> 1) & 0x5555...
188     //   C := A & B
189     //   popcount(C) = homozyg major ct
190     //   popcount(B) = het ct + homozyg major ct
191     //   popcount(A) = missing_ct + homozyg major ct
192     loader = *lptr++;
193     loader2 = *ctrl_include2++;
194     loader3 = (loader >> 1) & loader2;
195     loader2 &= loader;
196     tot_ctrl_a += popcount2_long(loader2);
197     tot_ctrl_b += popcount2_long(loader3);
198     tot_ctrl_c += popcount2_long(loader2 & loader3);
199     loader2 = *case_include2++;
200     loader3 = (loader >> 1) & loader2;
201     loader2 &= loader;
202     tot_case_a += popcount2_long(loader2);
203     tot_case_b += popcount2_long(loader3);
204     tot_case_c += popcount2_long(loader2 & loader3);
205   }
206   *ctrl_hom2p = tot_ctrl_c;
207   *ctrl_hetp = tot_ctrl_b - tot_ctrl_c;
208   *ctrl_missingp = tot_ctrl_a - tot_ctrl_c;
209   *case_hom2p = tot_case_c;
210   *case_hetp = tot_case_b - tot_case_c;
211   *case_missingp = tot_case_a - tot_case_c;
212 }
213 
adjust_print(double pval,double output_min_p,const char * output_min_p_str,uint32_t output_min_p_strlen,char ** bufpp)214 static inline void adjust_print(double pval, double output_min_p, const char* output_min_p_str, uint32_t output_min_p_strlen, char** bufpp) {
215   if (pval < 0) {
216     *bufpp = memcpya(*bufpp, "        NA ", 11);
217   } else if (pval <= output_min_p) {
218     *bufpp = memcpya(*bufpp, output_min_p_str, output_min_p_strlen);
219   } else {
220     *bufpp = dtoa_g_wxp4x(pval, 10, ' ', *bufpp);
221   }
222 }
223 
adjust_print_log10(double pval,double output_min_p,const char * output_min_logp_str,uint32_t output_min_logp_strlen,char ** bufpp)224 static inline void adjust_print_log10(double pval, double output_min_p, const char* output_min_logp_str, uint32_t output_min_logp_strlen, char** bufpp) {
225   if (pval < 0) {
226     *bufpp = memcpya(*bufpp, "        NA ", 11);
227   } else if (pval <= output_min_p) {
228     *bufpp = memcpya(*bufpp, output_min_logp_str, output_min_logp_strlen);
229   } else if (pval < 1) {
230     *bufpp = dtoa_g_wxp4x(-log10(pval), 10, ' ', *bufpp);
231   } else {
232     *bufpp = memcpya(*bufpp, "         0 ", 11);
233   }
234 }
235 
multcomp(char * outname,char * outname_end,uint32_t * marker_uidxs,uintptr_t chi_ct,char * marker_ids,uintptr_t max_marker_id_len,uint32_t plink_maxsnp,Chrom_info * chrom_info_ptr,double * chi,double pfilter,double output_min_p,uint32_t mtest_adjust,uint32_t skip_gc,double adjust_lambda,uint32_t * tcnt,double * pvals)236 int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintptr_t chi_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, double* chi, double pfilter, double output_min_p, uint32_t mtest_adjust, uint32_t skip_gc, double adjust_lambda, uint32_t* tcnt, double* pvals) {
237   // Association statistics can be provided in three ways:
238   // 1. Just p-values (pvals[]).
239   // 2. T statistics (in chi[]) and dfs (in tcnt[]).
240   // 3. 1df chi-square stats (in chi[]).
241   unsigned char* bigstack_mark = g_bigstack_base;
242   uint32_t is_log10 = mtest_adjust & ADJUST_LOG10;
243   uint32_t qq_plot = mtest_adjust & ADJUST_QQ;
244   FILE* outfile = nullptr;
245   double pv_holm = 0.0;
246   double pv_sidak_sd = 0;
247   int32_t retval = 0;
248   uint32_t is_set_test = !plink_maxsnp;
249   uint32_t adjust_gc = (mtest_adjust & ADJUST_GC) && (!skip_gc);
250   uint32_t output_min_p_strlen = 11;
251   uint32_t uii = 0;
252   uint32_t* new_tcnt = nullptr;
253   double* unadj = nullptr;
254   char output_min_p_str[16];
255   uint32_t pct;
256   double* sp;
257   double* schi;
258   double* pv_bh;
259   double* pv_by;
260   uint32_t* new_order;
261   uint32_t cur_idx;
262   uintptr_t marker_uidx;
263   double dxx;
264   double dyy;
265   double dzz;
266   double harmonic_sum;
267   double dct;
268   double pval;
269   double unadj_pval;
270   double* pv_gc;
271   double lambda_recip;
272   double bonf;
273   double pv_sidak_ss;
274   char* bufptr;
275   uint32_t ujj;
276   uint32_t loop_end;
277 
278   if (bigstack_alloc_d(chi_ct, &sp) ||
279       bigstack_alloc_d(chi_ct, &schi) ||
280       bigstack_alloc_ui(chi_ct, &new_order)) {
281     goto multcomp_ret_NOMEM;
282   }
283   if (pvals) {
284     for (cur_idx = 0; cur_idx < chi_ct; cur_idx++) {
285       dxx = pvals[cur_idx];
286       dyy = inverse_chiprob(dxx, 1);
287       if (dyy >= 0) {
288 	sp[uii] = dxx;
289 	new_order[uii] = marker_uidxs[cur_idx];
290 	schi[uii] = dyy;
291 	uii++;
292       }
293     }
294   } else if (tcnt) {
295     if (bigstack_alloc_ui(chi_ct, &new_tcnt)) {
296       goto multcomp_ret_NOMEM;
297     }
298     for (cur_idx = 0; cur_idx < chi_ct; cur_idx++) {
299       ujj = tcnt[cur_idx];
300       if (ujj) {
301 	dxx = chi[cur_idx]; // not actually squared
302 	dyy = calc_tprob(dxx, ujj);
303 	if (dyy > -1) {
304 	  sp[uii] = dyy;
305 	  new_order[uii] = marker_uidxs[cur_idx];
306 	  schi[uii] = dxx * dxx;
307 	  new_tcnt[uii] = ujj;
308 	  uii++;
309 	}
310       }
311     }
312   } else {
313     for (cur_idx = 0; cur_idx < chi_ct; cur_idx++) {
314       dxx = chi[cur_idx];
315       if (dxx >= 0) {
316 	dyy = chiprob_p(dxx, 1);
317 	if (dyy > -1) {
318 	  sp[uii] = dyy;
319 	  new_order[uii] = marker_uidxs[cur_idx];
320 	  schi[uii] = dxx;
321 	  uii++;
322 	}
323       }
324     }
325   }
326   chi_ct = uii;
327   if (!chi_ct) {
328     logprint("Zero valid tests; --adjust skipped.\n");
329     goto multcomp_ret_1;
330   }
331   if (qsort_ext((char*)sp, chi_ct, sizeof(double), double_cmp_deref, (char*)new_order, sizeof(int32_t))) {
332     goto multcomp_ret_NOMEM;
333   }
334   if (tcnt) {
335     if (qsort_ext((char*)schi, chi_ct, sizeof(double), double_cmp_deref, (char*)new_tcnt, sizeof(int32_t))) {
336       goto multcomp_ret_NOMEM;
337     }
338   } else {
339 #ifdef __cplusplus
340     std::sort(schi, &(schi[chi_ct]));
341 #else
342     qsort(schi, chi_ct, sizeof(double), double_cmp);
343 #endif
344   }
345   dct = chi_ct;
346 
347   // get lambda...
348   if (skip_gc) {
349     lambda_recip = 1.0;
350   } else if (mtest_adjust & ADJUST_LAMBDA) {
351     lambda_recip = adjust_lambda;
352   } else {
353     if (chi_ct & 1) {
354       lambda_recip = schi[(chi_ct - 1) / 2];
355     } else {
356       lambda_recip = (schi[chi_ct / 2 - 1] + schi[chi_ct / 2]) * 0.5;
357     }
358     lambda_recip = lambda_recip / 0.456;
359     if (lambda_recip < 1.0) {
360       lambda_recip = 1.0;
361     }
362     LOGPRINTF("--adjust: Genomic inflation est. lambda (based on median chisq) = %g.\n", lambda_recip);
363   }
364   // ...now take the reciprocal (bugfix: forgot to do this with --lambda)
365   if (lambda_recip > 1.0) {
366     lambda_recip = 1.0 / lambda_recip;
367   }
368 
369   // handle reverse-order calculations
370   if (bigstack_alloc_d(chi_ct, &pv_bh) ||
371       bigstack_alloc_d(chi_ct, &pv_by) ||
372       bigstack_alloc_d(chi_ct, &pv_gc)) {
373     goto multcomp_ret_NOMEM;
374   }
375   if (adjust_gc) {
376     unadj = sp;
377     sp = pv_gc;
378   }
379   uii = chi_ct;
380   if (tcnt) {
381     for (cur_idx = 0; cur_idx < chi_ct; cur_idx++) {
382       uii--;
383       pv_gc[cur_idx] = calc_tprob(sqrt(schi[uii] * lambda_recip), new_tcnt[uii]);
384     }
385   } else {
386     for (cur_idx = 0; cur_idx < chi_ct; cur_idx++) {
387       pv_gc[cur_idx] = chiprob_p(schi[--uii] * lambda_recip, 1);
388     }
389   }
390 
391   dyy = sp[chi_ct - 1];
392   pv_bh[chi_ct - 1] = dyy;
393   harmonic_sum = 1.0;
394   for (cur_idx = chi_ct - 1; cur_idx > 0; cur_idx--) {
395     dzz = dct / ((double)((int32_t)cur_idx));
396     harmonic_sum += dzz;
397     dxx = dzz * sp[cur_idx - 1];
398     if (dyy > dxx) {
399       dyy = dxx;
400     }
401     pv_bh[cur_idx - 1] = dyy;
402   }
403 
404   dzz = 1.0 / dct;
405   harmonic_sum *= dzz;
406 
407   dyy = harmonic_sum * sp[chi_ct - 1];
408   if (dyy >= 1) {
409     dyy = 1;
410   }
411   pv_by[chi_ct - 1] = dyy;
412   harmonic_sum *= dct;
413   for (cur_idx = chi_ct - 1; cur_idx > 0; cur_idx--) {
414     dxx = (harmonic_sum / ((double)((int32_t)cur_idx))) * sp[cur_idx - 1];
415     if (dyy > dxx) {
416       dyy = dxx;
417     }
418     pv_by[cur_idx - 1] = dyy;
419   }
420 
421   uii = strlen(outname_end);
422   memcpy(&(outname_end[uii]), ".adjusted", 10);
423   if (fopen_checked(outname, "w", &outfile)) {
424     goto multcomp_ret_OPEN_FAIL;
425   }
426   if (!is_set_test) {
427     sprintf(g_textbuf, " CHR %%%us      UNADJ %s", plink_maxsnp, skip_gc? "" : "        GC ");
428     fprintf(outfile, g_textbuf, "SNP");
429   } else {
430     plink_maxsnp = max_marker_id_len - 1;
431     if (plink_maxsnp < 3) {
432       plink_maxsnp = 3;
433     }
434     sprintf(g_textbuf, " %%%us      UNADJ ", plink_maxsnp);
435     fprintf(outfile, g_textbuf, "SET");
436   }
437   if (qq_plot) {
438     fputs("        QQ ", outfile);
439   }
440   if (fputs_checked("      BONF       HOLM   SIDAK_SS   SIDAK_SD     FDR_BH     FDR_BY\n", outfile)) {
441     goto multcomp_ret_WRITE_FAIL;
442   }
443   fputs("0%", stdout);
444   fflush(stdout);
445   cur_idx = 0;
446   if (!is_log10) {
447     if (output_min_p == 0.0) {
448       memcpy(output_min_p_str, "       INF ", 11);
449     } else {
450       bufptr = dtoa_g_wxp4x(output_min_p, 10, ' ', output_min_p_str);
451       output_min_p_strlen = (uintptr_t)(bufptr - output_min_p_str);
452     }
453   } else {
454     if (output_min_p == 0.0) {
455       memcpy(output_min_p_str, "       INF ", 11);
456     } else {
457       bufptr = dtoa_g_wxp4x(-log10(output_min_p), 10, ' ', output_min_p_str);
458       output_min_p_strlen = (uintptr_t)(bufptr - output_min_p_str);
459     }
460   }
461   for (pct = 1; pct <= 100; pct++) {
462     loop_end = (((uint64_t)pct) * chi_ct) / 100LLU;
463     for (; cur_idx < loop_end; cur_idx++) {
464       pval = sp[cur_idx];
465       // if --pfilter specified, filter out both nan and negative pvals, since
466       // both are currently used by upstream functions
467       if ((pfilter != 2.0) && ((!(pval >= 0.0)) || (pval > pfilter))) {
468 	continue;
469       }
470       if (adjust_gc) {
471         unadj_pval = unadj[cur_idx];
472       } else {
473 	unadj_pval = pval;
474       }
475       marker_uidx = new_order[cur_idx];
476       if (!is_set_test) {
477         bufptr = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, get_variant_chrom(chrom_info_ptr, marker_uidx), g_textbuf));
478       } else {
479         bufptr = g_textbuf;
480       }
481       *bufptr++ = ' ';
482       bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
483       *bufptr++ = ' ';
484       if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
485 	goto multcomp_ret_WRITE_FAIL;
486       }
487       bonf = pval * dct;
488       if (bonf > 1) {
489 	bonf = 1;
490       }
491       if (pv_holm < 1) {
492 	dyy = (chi_ct - cur_idx) * pval;
493 	if (dyy > 1) {
494 	  pv_holm = 1;
495 	} else if (pv_holm < dyy) {
496 	  pv_holm = dyy;
497 	}
498       }
499       // avoid catastrophic cancellation for small p-values
500       // 1 - (1-p)^c = 1 - e^{c log(1-p)}
501       // 2^{-7} threshold is arbitrary
502       if (pval >= 0.0078125) {
503 	pv_sidak_ss = 1 - pow(1 - pval, dct);
504 	dyy = 1 - pow(1 - pval, dct - ((double)((int32_t)cur_idx)));
505       } else {
506 	pv_sidak_ss = 1 - exp(dct * log1p(-pval));
507 	dyy = dct - (double)((int32_t)cur_idx);
508 	dyy = 1 - exp(dyy * log1p(-pval));
509       }
510       if (pv_sidak_sd < dyy) {
511 	pv_sidak_sd = dyy;
512       }
513 
514       bufptr = g_textbuf;
515       if (!is_log10) {
516 	adjust_print(unadj_pval, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
517 	if (!skip_gc) {
518 	  adjust_print(pv_gc[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
519 	}
520 	if (qq_plot) {
521 	  bufptr = dtoa_g_wxp4x((((double)((int32_t)cur_idx)) + 0.5) * dzz, 10, ' ', bufptr);
522 	}
523 	adjust_print(bonf, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
524 	adjust_print(pv_holm, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
525 	adjust_print(pv_sidak_ss, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
526 	adjust_print(pv_sidak_sd, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
527 	adjust_print(pv_bh[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
528 	adjust_print(pv_by[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
529       } else {
530 	adjust_print_log10(pval, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
531 	if (!is_set_test) {
532 	  adjust_print_log10(pv_gc[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
533 	}
534 	if (qq_plot) {
535           // quasi-bugfix (23 Mar 2018): this should be logscale, both for
536           // consistency with plink 1.07 and because it makes more sense
537 	  bufptr = dtoa_g_wxp4x(-log10((((double)((int32_t)cur_idx)) + 0.5) * dzz), 10, ' ', bufptr);
538 	}
539 	adjust_print_log10(bonf, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
540 	adjust_print_log10(pv_holm, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
541 	adjust_print_log10(pv_sidak_ss, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
542 	adjust_print_log10(pv_sidak_sd, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
543 	adjust_print_log10(pv_bh[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
544 	adjust_print_log10(pv_by[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
545       }
546       *bufptr++ = '\n';
547       if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
548 	goto multcomp_ret_WRITE_FAIL;
549       }
550     }
551     if (pct < 100) {
552       if (pct > 10) {
553 	putc_unlocked('\b', stdout);
554       }
555       printf("\b\b%u%%", pct);
556       fflush(stdout);
557     }
558   }
559   fputs("\b\b\b", stdout);
560   LOGPRINTFWW("--adjust values (%" PRIuPTR " %s%s) written to %s .\n", chi_ct, is_set_test? "nonempty set" : "variant", (chi_ct == 1)? "" : "s", outname);
561 
562   while (0) {
563   multcomp_ret_NOMEM:
564     retval = RET_NOMEM;
565     break;
566   multcomp_ret_OPEN_FAIL:
567     retval = RET_OPEN_FAIL;
568     break;
569   multcomp_ret_WRITE_FAIL:
570     retval = RET_WRITE_FAIL;
571     break;
572   }
573  multcomp_ret_1:
574   fclose_cond(outfile);
575   bigstack_reset(bigstack_mark);
576   return retval;
577 }
578 
model_assoc_tna(uint32_t model_fisher,char * wptr)579 char* model_assoc_tna(uint32_t model_fisher, char* wptr) {
580   // write terminal NAs to buffer
581   if (model_fisher) {
582     return memcpya(wptr, "          NA\n", 13);
583   } else {
584     return memcpya(wptr, "          NA   NA           NA\n", 31);
585   }
586 }
587 
calc_git(uint32_t pheno_nm_ct,uint32_t perm_vec_ct,uintptr_t * __restrict__ loadbuf,uint32_t * perm_vecst,uint32_t * results_bufs,uint32_t * thread_wkspace)588 void calc_git(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __restrict__ loadbuf, uint32_t* perm_vecst, uint32_t* results_bufs, uint32_t* thread_wkspace) {
589   // Brian Browning's genotype indexing algorithm for low-MAF (and low missing
590   // frequency) markers.
591   // We accelerate it by using a special interleaved permutation representation
592   // which supports vector addition without occupying extra space: see
593   // generate_cc_perm_vec().  Counting the number of e.g. case heterozygote
594   // genotypes across all permutations then proceeds as follows:
595   // 1. For the first 15 heterozygote samples, just use 4-bit accumulators.
596   //    This allows the inner loop to increment 32 counters simultaneously.
597   // 2. Right before they'd otherwise be at risk of overflowing, we unfold the
598   //    4-bit accumulators into a larger buffer of 8-bit accumulators.  Then we
599   //    zero out the 4-bit accumulators, and restart the inner loop.
600   // 3. This can happen up to 17 times before the 8-bit accumulators risk
601   //    overflow.  Then, they are unfolded into the final output array of
602   //    32-bit ints, zeroed out, and the second loop restarts.
603   // Note that results_bufs[] is assumed to be zeroed out before this function
604   // is called.
605   uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
606   uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
607 #ifdef __LP64__
608   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
609   uint32_t perm_ct128x4 = perm_ct128 * 4;
610   uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
611   uint32_t perm_ct16x4 = 4 * perm_ct16;
612   __m128i* permsv = (__m128i*)perm_vecst;
613   __m128i* gitv[9];
614 #else
615   uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
616   uint32_t perm_ct32x4 = perm_ct32 * 4;
617   uint32_t perm_ct8 = (perm_vec_ct + 7) / 8;
618   uint32_t perm_ct4 = (perm_vec_ct + 3) / 4;
619   uint32_t perm_ct16x16 = 16 * perm_ct16;
620   uintptr_t* permsv = (uintptr_t*)perm_vecst;
621   uintptr_t* gitv[9];
622 #endif
623   uint32_t cur_cts[3];
624   uintptr_t ulii;
625   uint32_t uii;
626   uint32_t ujj;
627   uint32_t ukk;
628   uint32_t sample_type;
629 #ifdef __LP64__
630   // 4- and 8-bit partial counts
631   gitv[0] = (__m128i*)thread_wkspace;
632   gitv[1] = &(((__m128i*)thread_wkspace)[perm_ct128x4]);
633   gitv[2] = &(((__m128i*)thread_wkspace)[2 * perm_ct128x4]);
634   gitv[3] = &(((__m128i*)thread_wkspace)[3 * perm_ct128x4]);
635   gitv[4] = &(((__m128i*)thread_wkspace)[3 * perm_ct128x4 + 2 * perm_ct32]);
636   gitv[5] = &(((__m128i*)thread_wkspace)[3 * perm_ct128x4 + 4 * perm_ct32]);
637   gitv[6] = &(((__m128i*)results_bufs)[2 * perm_ct16x4]);
638   gitv[7] = &(((__m128i*)results_bufs)[perm_ct16x4]);
639   gitv[8] = (__m128i*)results_bufs;
640 #else
641   gitv[0] = (uintptr_t*)thread_wkspace;
642   gitv[1] = (uintptr_t*)(&(thread_wkspace[perm_ct32x4]));
643   gitv[2] = (uintptr_t*)(&(thread_wkspace[2 * perm_ct32x4]));
644   gitv[3] = (uintptr_t*)(&(thread_wkspace[3 * perm_ct32x4]));
645   gitv[4] = (uintptr_t*)(&(thread_wkspace[3 * perm_ct32x4 + 2 * perm_ct8]));
646   gitv[5] = (uintptr_t*)(&(thread_wkspace[3 * perm_ct32x4 + 4 * perm_ct8]));
647   gitv[6] = (uintptr_t*)(&(results_bufs[2 * perm_ct16x16]));
648   gitv[7] = (uintptr_t*)(&(results_bufs[perm_ct16x16]));
649   gitv[8] = (uintptr_t*)results_bufs;
650 #endif
651   cur_cts[0] = 0;
652   cur_cts[1] = 0;
653   cur_cts[2] = 0;
654   for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
655     ulii = ~(*loadbuf++);
656     if (uii + 1 == pheno_nm_ctl2x) {
657       ujj = pheno_nm_ct & (BITCT2 - 1);
658       if (ujj) {
659 	ulii &= (ONELU << (ujj * 2)) - ONELU;
660       }
661     }
662     while (ulii) {
663       ujj = CTZLU(ulii) & (BITCT - 2); // get pos of next non-[hom A2] sample
664       sample_type = ((ulii >> ujj) & 3) - 1;
665       ukk = cur_cts[sample_type] + 1;
666       cur_cts[sample_type] = ukk;
667 #ifdef __LP64__
668       unroll_incr_1_4(&(permsv[(ujj / 2) * perm_ct128]), gitv[sample_type], perm_ct128);
669       if (!(ukk % 15)) {
670 	unroll_zero_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct32);
671 	if (!(ukk % 255)) {
672 	  unroll_zero_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct16);
673 	}
674       }
675 #else
676       unroll_incr_1_4(&(permsv[(ujj / 2) * perm_ct32]), gitv[sample_type], perm_ct32);
677       if (!(ukk % 15)) {
678 	unroll_zero_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct8);
679 	if (!(ukk % 255)) {
680 	  unroll_zero_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct4);
681 	}
682       }
683 #endif
684       ulii &= ~((3 * ONELU) << ujj);
685     }
686 #ifdef __LP64__
687     permsv = &(permsv[BITCT2 * perm_ct128]);
688 #else
689     permsv = &(permsv[BITCT2 * perm_ct32]);
690 #endif
691   }
692   for (sample_type = 0; sample_type < 3; sample_type++) {
693     uii = cur_cts[sample_type];
694 #ifdef __LP64__
695     if (uii % 15) {
696       unroll_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct32);
697     }
698     if (uii % 255) {
699       unroll_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct16);
700     }
701 #else
702     if (uii % 15) {
703       unroll_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct8);
704     }
705     if (uii % 255) {
706       unroll_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct4);
707     }
708 #endif
709   }
710 }
711 
calc_qgit(uint32_t pheno_nm_ct,uintptr_t perm_vec_ctcl8m,uint32_t num_perms_now,uintptr_t * __restrict__ loadbuf,double * perm_vecstd,double * thread_bufs)712 void calc_qgit(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
713   uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
714 #ifdef __LP64__
715   // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
716   uint32_t row_mult = perm_vec_ctcl8m / 4;
717 
718   uint32_t loop_len = (num_perms_now + 1) / 2;
719   __m128d* permsv = (__m128d*)perm_vecstd;
720   __m128d* __restrict__ perm_readv;
721   __m128d* __restrict__ git_writev;
722   __m128d* __restrict__ git_write2v;
723   __m128d vxx;
724 #else
725   uint32_t row_mult = perm_vec_ctcl8m / 2;
726   double* __restrict__ perm_read;
727   double* __restrict__ git_write;
728   double* __restrict__ git_write2;
729   double dxx;
730 #endif
731   uintptr_t ulii;
732   uint32_t sample_type;
733   uint32_t uii;
734   uint32_t ujj;
735   uint32_t ukk;
736   for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
737     ulii = ~(*loadbuf++);
738     if (uii + 1 == pheno_nm_ctl2x) {
739       ujj = pheno_nm_ct & (BITCT2 - 1);
740       if (ujj) {
741 	ulii &= (ONELU << (ujj * 2)) - ONELU;
742       }
743     }
744     while (ulii) {
745       ujj = CTZLU(ulii) & (BITCT - 2);
746       sample_type = (ulii >> ujj) & 3;
747 #ifdef __LP64__
748       // note that the gain from using SSE2 for double-precision arithmetic is
749       // typically minimal because modern cores tend to have two FPUs, so we
750       // should only use it opportunistically.  it's painless here, though.
751       perm_readv = &(permsv[ujj * row_mult]);
752       if (sample_type == 1) {
753 	git_writev = (__m128d*)thread_bufs;
754 	for (ukk = 0; ukk < loop_len; ukk++) {
755 	  *git_writev = _mm_add_pd(*git_writev, *perm_readv++);
756 	  git_writev++;
757 	}
758       } else if (sample_type == 3) {
759 	// hom rare
760 	git_writev = (__m128d*)thread_bufs;
761 	for (ukk = 0; ukk < loop_len; ukk++) {
762 	  vxx = *perm_readv++;
763 	  *git_writev = _mm_add_pd(*git_writev, _mm_add_pd(vxx, vxx));
764 	  git_writev++;
765 	}
766       } else {
767 	// missing
768 	git_writev = (__m128d*)(&(thread_bufs[perm_vec_ctcl8m]));
769 	git_write2v = (__m128d*)(&(thread_bufs[2 * perm_vec_ctcl8m]));
770 	for (ukk = 0; ukk < loop_len; ukk++) {
771 	  vxx = *perm_readv++;
772 	  *git_writev = _mm_add_pd(*git_writev, vxx);
773 	  git_writev++;
774 	  *git_write2v = _mm_add_pd(*git_write2v, _mm_mul_pd(vxx, vxx));
775 	  git_write2v++;
776 	}
777       }
778 #else
779       perm_read = &(perm_vecstd[ujj * row_mult]);
780       if (sample_type == 1) {
781 	git_write = thread_bufs;
782 	for (ukk = 0; ukk < num_perms_now; ukk++) {
783 	  *git_write += *perm_read++;
784 	  git_write++;
785 	}
786       } else if (sample_type == 3) {
787 	git_write = thread_bufs;
788 	for (ukk = 0; ukk < num_perms_now; ukk++) {
789 	  dxx = *perm_read++;
790 	  *git_write += dxx * 2;
791 	  git_write++;
792 	}
793       } else {
794 	git_write = &(thread_bufs[perm_vec_ctcl8m]);
795 	git_write2 = &(thread_bufs[2 * perm_vec_ctcl8m]);
796 	for (ukk = 0; ukk < num_perms_now; ukk++) {
797 	  dxx = *perm_read++;
798 	  *git_write += dxx;
799 	  git_write++;
800 	  *git_write2 += dxx * dxx;
801 	  git_write2++;
802 	}
803       }
804 #endif
805       ulii &= ~((3 * ONELU) << ujj);
806     }
807 #ifdef __LP64__
808     permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
809 #else
810     perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
811 #endif
812   }
813 }
814 
calc_qgit_lin(uint32_t pheno_nm_ct,uintptr_t perm_vec_ctcl8m,uint32_t num_perms_now,uintptr_t * __restrict__ loadbuf,double * perm_vecstd,double * thread_bufs)815 void calc_qgit_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
816   uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
817 #ifdef __LP64__
818   // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
819   uint32_t row_mult = perm_vec_ctcl8m / 4;
820 
821   uint32_t loop_len = (num_perms_now + 1) / 2;
822   __m128d* permsv = (__m128d*)perm_vecstd;
823   __m128d* __restrict__ perm_readv;
824   __m128d* __restrict__ git_writev;
825   __m128d* __restrict__ git_write2v;
826   __m128d vxx;
827 #else
828   uint32_t row_mult = perm_vec_ctcl8m / 2;
829   double* __restrict__ perm_read;
830   double* __restrict__ git_write;
831   double* __restrict__ git_write2;
832   double dxx;
833 #endif
834   uintptr_t ulii;
835   uint32_t sample_type;
836   uint32_t uii;
837   uint32_t ujj;
838   uint32_t ukk;
839   for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
840     ulii = ~(*loadbuf++);
841     if (uii + 1 == pheno_nm_ctl2x) {
842       ujj = pheno_nm_ct & (BITCT2 - 1);
843       if (ujj) {
844 	ulii &= (ONELU << (ujj * 2)) - ONELU;
845       }
846     }
847     while (ulii) {
848       ujj = CTZLU(ulii) & (BITCT - 2);
849       sample_type = (ulii >> ujj) & 3;
850 #ifdef __LP64__
851       perm_readv = &(permsv[ujj * row_mult]);
852       if (sample_type == 1) {
853 	git_writev = (__m128d*)thread_bufs;
854 	git_write2v = (__m128d*)(&(thread_bufs[perm_vec_ctcl8m]));
855       } else if (sample_type == 3) {
856 	// hom rare
857 	git_writev = (__m128d*)(&(thread_bufs[2 * perm_vec_ctcl8m]));
858 	git_write2v = (__m128d*)(&(thread_bufs[3 * perm_vec_ctcl8m]));
859       } else {
860 	// missing
861 	git_writev = (__m128d*)(&(thread_bufs[4 * perm_vec_ctcl8m]));
862 	git_write2v = (__m128d*)(&(thread_bufs[5 * perm_vec_ctcl8m]));
863       }
864       for (ukk = 0; ukk < loop_len; ukk++) {
865 	vxx = *perm_readv++;
866 	*git_writev = _mm_add_pd(*git_writev, vxx);
867 	git_writev++;
868 	*git_write2v = _mm_add_pd(*git_write2v, _mm_mul_pd(vxx, vxx));
869 	git_write2v++;
870       }
871 #else
872       perm_read = &(perm_vecstd[ujj * row_mult]);
873       if (sample_type == 1) {
874 	git_write = thread_bufs;
875 	git_write2 = &(thread_bufs[perm_vec_ctcl8m]);
876       } else if (sample_type == 3) {
877 	git_write = &(thread_bufs[2 * perm_vec_ctcl8m]);
878 	git_write2 = &(thread_bufs[3 * perm_vec_ctcl8m]);
879       } else {
880 	git_write = &(thread_bufs[4 * perm_vec_ctcl8m]);
881 	git_write2 = &(thread_bufs[5 * perm_vec_ctcl8m]);
882       }
883       for (ukk = 0; ukk < num_perms_now; ukk++) {
884 	dxx = *perm_read++;
885 	*git_write += dxx;
886 	git_write++;
887 	*git_write2 += dxx * dxx;
888 	git_write2++;
889       }
890 #endif
891       ulii &= ~((3 * ONELU) << ujj);
892     }
893 #ifdef __LP64__
894     permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
895 #else
896     perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
897 #endif
898   }
899 }
900 
901 #ifdef __LP64__
rem_cost_60v(__m128i * vec1,__m128i * vend,__m128i * vec2)902 uintptr_t rem_cost_60v(__m128i* vec1, __m128i* vend, __m128i* vec2) {
903   const __m128i m1 = {FIVEMASK, FIVEMASK};
904   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
905   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
906   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
907   __m128i loader;
908   __m128i loader2;
909   __m128i xor_vec;
910   __m128i detect_homcom;
911   __m128i result_a;
912   __m128i acc_a;
913   __m128i acc_b;
914   __univec acc;
915   acc.vi = _mm_setzero_si128();
916   do {
917     loader = *vec1++;
918     loader2 = *vec2++;
919     xor_vec = _mm_xor_si128(loader, loader2);
920     detect_homcom = _mm_or_si128(_mm_and_si128(_mm_srli_epi64(loader, 1), loader), _mm_and_si128(_mm_srli_epi64(loader2, 1), loader2));
921     acc_a = _mm_and_si128(_mm_or_si128(xor_vec, _mm_srli_epi64(xor_vec, 1)), m1);
922     acc_b = _mm_andnot_si128(detect_homcom, acc_a);
923 
924     loader = *vec1++;
925     loader2 = *vec2++;
926     xor_vec = _mm_xor_si128(loader, loader2);
927     detect_homcom = _mm_or_si128(_mm_and_si128(_mm_srli_epi64(loader, 1), loader), _mm_and_si128(_mm_srli_epi64(loader2, 1), loader2));
928     result_a = _mm_and_si128(_mm_or_si128(xor_vec, _mm_srli_epi64(xor_vec, 1)), m1);
929     acc_a = _mm_add_epi64(acc_a, result_a);
930     acc_b = _mm_add_epi64(acc_b, _mm_andnot_si128(detect_homcom, result_a));
931 
932     loader = *vec1++;
933     loader2 = *vec2++;
934     xor_vec = _mm_xor_si128(loader, loader2);
935     detect_homcom = _mm_or_si128(_mm_and_si128(_mm_srli_epi64(loader, 1), loader), _mm_and_si128(_mm_srli_epi64(loader2, 1), loader2));
936     result_a = _mm_and_si128(_mm_or_si128(xor_vec, _mm_srli_epi64(xor_vec, 1)), m1);
937     acc_a = _mm_add_epi64(acc_a, result_a);
938     acc_b = _mm_add_epi64(acc_b, _mm_andnot_si128(detect_homcom, result_a));
939     acc_a = _mm_add_epi64(_mm_and_si128(acc_a, m2), _mm_and_si128(_mm_srli_epi64(acc_a, 2), m2));
940     acc_a = _mm_add_epi64(acc_a, _mm_add_epi64(_mm_and_si128(acc_b, m2), _mm_and_si128(_mm_srli_epi64(acc_b, 2), m2)));
941     acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(acc_a, m4), _mm_and_si128(_mm_srli_epi64(acc_a, 4), m4)));
942   } while (vec1 < vend);
943   acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
944   return ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
945 }
946 
qrem_cost2_40v(__m128i * vec1,__m128i * vend,__m128i * vec2)947 uintptr_t qrem_cost2_40v(__m128i* vec1, __m128i* vend, __m128i* vec2) {
948   const __m128i m1 = {FIVEMASK, FIVEMASK};
949   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
950   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
951   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
952   __m128i loader;
953   __m128i loader2;
954   __m128i xor_vec;
955   __m128i detect_missing;
956   __m128i result_a;
957   __m128i result_b;
958   __m128i result_c;
959   __m128i inner_acc;
960   __univec acc;
961   acc.vi = _mm_setzero_si128();
962   do {
963     loader = *vec1++;
964     loader2 = *vec2++;
965     xor_vec = _mm_xor_si128(loader, loader2);
966     detect_missing = _mm_or_si128(_mm_andnot_si128(_mm_srli_epi64(loader, 1), loader), _mm_andnot_si128(_mm_srli_epi64(loader2, 1), loader2));
967     result_a = _mm_and_si128(_mm_or_si128(xor_vec, _mm_srli_epi64(xor_vec, 1)), m1);
968     result_b = _mm_and_si128(result_a, detect_missing);
969     inner_acc = _mm_and_si128(result_b, xor_vec);
970     inner_acc = _mm_add_epi64(_mm_add_epi64(result_a, result_b), inner_acc);
971     inner_acc = _mm_add_epi64(_mm_and_si128(inner_acc, m2), _mm_and_si128(_mm_srli_epi64(inner_acc, 2), m2));
972     loader = *vec1++;
973     loader2 = *vec2++;
974     xor_vec = _mm_xor_si128(loader, loader2);
975     detect_missing = _mm_or_si128(_mm_andnot_si128(_mm_srli_epi64(loader, 1), loader), _mm_andnot_si128(_mm_srli_epi64(loader2, 1), loader2));
976     result_a = _mm_and_si128(_mm_or_si128(xor_vec, _mm_srli_epi64(xor_vec, 1)), m1);
977     result_b = _mm_and_si128(result_a, detect_missing);
978     result_c = _mm_and_si128(result_b, xor_vec);
979     result_c = _mm_add_epi64(_mm_add_epi64(result_a, result_b), result_c);
980     inner_acc = _mm_add_epi64(inner_acc, _mm_add_epi64(_mm_and_si128(result_c, m2), _mm_and_si128(_mm_srli_epi64(result_c, 2), m2)));
981     acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(inner_acc, m4), _mm_and_si128(_mm_srli_epi64(inner_acc, 4), m4)));
982   } while (vec1 < vend);
983   acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
984   return ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
985 }
986 #else
rem_cost_6(uintptr_t * loadbuf1,uintptr_t * loadbuf2)987 uintptr_t rem_cost_6(uintptr_t* loadbuf1, uintptr_t* loadbuf2) {
988   uintptr_t loader = *loadbuf1++;
989   uintptr_t loader2 = *loadbuf2++;
990   uintptr_t xor_word = loader ^ loader2;
991   uintptr_t detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
992   uintptr_t acc_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
993   uintptr_t acc_b = acc_a & (~detect_homcom);
994   uintptr_t result_a;
995   uintptr_t acc;
996 
997   loader = *loadbuf1++;
998   loader2 = *loadbuf2++;
999   xor_word = loader & loader2;
1000   detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
1001   result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1002   acc_a += result_a;
1003   acc_b += result_a & (~detect_homcom);
1004 
1005   loader = *loadbuf1++;
1006   loader2 = *loadbuf2++;
1007   xor_word = loader & loader2;
1008   detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
1009   result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1010   acc_a += result_a;
1011   acc_b += result_a & (~detect_homcom);
1012   acc_a = (acc_a & 0x33333333) + ((acc_a >> 2) & 0x33333333);
1013   acc_a += (acc_b & 0x33333333) + ((acc_b >> 2) & 0x33333333);
1014   acc = (acc_a & 0x0f0f0f0f) + ((acc_a >> 4) & 0x0f0f0f0f);
1015 
1016   loader = *loadbuf1++;
1017   loader2 = *loadbuf2++;
1018   xor_word = loader & loader2;
1019   detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
1020   acc_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1021   acc_b = acc_a & (~detect_homcom);
1022 
1023   loader = *loadbuf1++;
1024   loader2 = *loadbuf2++;
1025   xor_word = loader & loader2;
1026   detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
1027   result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1028   acc_a += result_a;
1029   acc_b += result_a & (~detect_homcom);
1030 
1031   loader = *loadbuf1++;
1032   loader2 = *loadbuf2++;
1033   xor_word = loader & loader2;
1034   detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
1035   result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1036   acc_a += result_a;
1037   acc_b += result_a & (~detect_homcom);
1038   acc_a = (acc_a & 0x33333333) + ((acc_a >> 2) & 0x33333333);
1039   acc_a += (acc_b & 0x33333333) + ((acc_b >> 2) & 0x33333333);
1040   acc += (acc_a & 0x0f0f0f0f) + ((acc_a >> 4) & 0x0f0f0f0f);
1041   return (acc * 0x01010101) >> 24;
1042 }
1043 
qrem_cost2_4(uintptr_t * loadbuf1,uintptr_t * loadbuf2)1044 uintptr_t qrem_cost2_4(uintptr_t* loadbuf1, uintptr_t* loadbuf2) {
1045   uintptr_t loader = *loadbuf1++;
1046   uintptr_t loader2 = *loadbuf2++;
1047   uintptr_t xor_word = loader ^ loader2;
1048   uintptr_t detect_missing = (loader & (~(loader >> 1))) | (loader2 & (~(loader2 >> 1)));
1049   uintptr_t result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1050   uintptr_t result_b = result_a & detect_missing;
1051   uintptr_t inner_acc = result_b & xor_word;
1052   uintptr_t result_c;
1053   uintptr_t acc;
1054   inner_acc += result_a + result_b;
1055   inner_acc = (inner_acc & 0x33333333) + ((inner_acc >> 2) & 0x33333333);
1056 
1057   loader = *loadbuf1++;
1058   loader2 = *loadbuf2++;
1059   xor_word = loader & loader2;
1060   detect_missing = (loader & (~(loader >> 1))) | (loader2 & (~(loader2 >> 1)));
1061   result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1062   result_b = result_a & detect_missing;
1063   result_c = result_b & xor_word;
1064   result_c += result_a + result_b;
1065   inner_acc += (result_c & 0x33333333) + ((result_c >> 2) & 0x33333333);
1066   acc = (inner_acc & 0x0f0f0f0f) + ((inner_acc >> 4) & 0x0f0f0f0f);
1067 
1068   loader = *loadbuf1++;
1069   loader2 = *loadbuf2++;
1070   xor_word = loader & loader2;
1071   detect_missing = (loader & (~(loader >> 1))) | (loader2 & (~(loader2 >> 1)));
1072   result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1073   result_b = result_a & detect_missing;
1074   inner_acc = result_b & xor_word;
1075   inner_acc += result_a + result_b;
1076   inner_acc = (inner_acc & 0x33333333) + ((inner_acc >> 2) & 0x33333333);
1077 
1078   loader = *loadbuf1++;
1079   loader2 = *loadbuf2++;
1080   xor_word = loader & loader2;
1081   detect_missing = (loader & (~(loader >> 1))) | (loader2 & (~(loader2 >> 1)));
1082   result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1083   result_b = result_a & detect_missing;
1084   result_c = result_b & xor_word;
1085   result_c += result_a + result_b;
1086   inner_acc += (result_c & 0x33333333) + ((result_c >> 2) & 0x33333333);
1087   acc += (inner_acc & 0x0f0f0f0f) + ((inner_acc >> 4) & 0x0f0f0f0f);
1088   return (acc * 0x01010101) >> 24;
1089 }
1090 #endif
1091 
rem_cost(uintptr_t sample_ctv2,uintptr_t * loadbuf1,uintptr_t * loadbuf2)1092 uintptr_t rem_cost(uintptr_t sample_ctv2, uintptr_t* loadbuf1, uintptr_t* loadbuf2) {
1093   // Cost: 2 * (<-> neither side homcom) + (<-> homcom)
1094   //
1095   // We can efficiently calculate this as follows:
1096   //   xor = vec1 ^ vec2
1097   //   detect_homcom = (vec1 & (vec1 >> 1)) | (vec2 & (vec2 >> 1))
1098   //   A := (xor | (xor >> 1)) & 0x5555...
1099   //   B := A & (~detect_homcom)
1100   //   cost += popcount2(A + B)
1101   uintptr_t* lptr_end = &(loadbuf1[sample_ctv2]);
1102   uintptr_t cost = 0;
1103   uintptr_t loader;
1104   uintptr_t loader2;
1105   uintptr_t xor_word;
1106   uintptr_t detect_homcom;
1107   uintptr_t result_a;
1108   uintptr_t result_b;
1109 #ifdef __LP64__
1110   uintptr_t cur_decr = 60;
1111   uintptr_t* lptr_6x_end;
1112   sample_ctv2 -= sample_ctv2 % 6;
1113   while (sample_ctv2 >= 60) {
1114   rem_cost_loop:
1115     lptr_6x_end = &(loadbuf1[cur_decr]);
1116     cost += rem_cost_60v((__m128i*)loadbuf1, (__m128i*)lptr_6x_end, (__m128i*)loadbuf2);
1117     loadbuf1 = lptr_6x_end;
1118     loadbuf2 = &(loadbuf2[cur_decr]);
1119     sample_ctv2 -= cur_decr;
1120   }
1121   if (sample_ctv2) {
1122     cur_decr = sample_ctv2;
1123     goto rem_cost_loop;
1124   }
1125 #else
1126   uintptr_t* lptr_six_end = &(loadbuf1[sample_ctv2 - (sample_ctv2 % 6)]);
1127   while (loadbuf1 < lptr_six_end) {
1128     cost += rem_cost_6(loadbuf1, loadbuf2);
1129     loadbuf1 = &(loadbuf1[6]);
1130     loadbuf2 = &(loadbuf2[6]);
1131   }
1132 #endif
1133   while (loadbuf1 < lptr_end) {
1134     loader = *loadbuf1++;
1135     loader2 = *loadbuf2++;
1136     xor_word = loader ^ loader2;
1137     detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
1138     result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1139     result_b = result_a & (~detect_homcom);
1140     cost += popcount2_long(result_a + result_b);
1141   }
1142   return cost;
1143 }
1144 
qrem_cost2(uintptr_t sample_ctl2,uintptr_t * loadbuf1,uintptr_t * loadbuf2)1145 uintptr_t qrem_cost2(uintptr_t sample_ctl2, uintptr_t* loadbuf1, uintptr_t* loadbuf2) {
1146   // Cost: 3 + 3 * (missing <-> homrar/het) + 2 * (missing <-> homcom) +
1147   //       (homrar <-> het/homcom) + (het <-> homcom)
1148   //
1149   // xor 01: 3 if 00-01, 1 of 10-11
1150   // xor 10: 2 if 01-11, 1 if 00-10
1151   // xor 11: 3 if 01-10, 1 if 00-11
1152   //
1153   // We can efficiently calculate this as follows:
1154   //   xor = vec1 ^ vec2
1155   //   detect_missing = (vec1 & (~(vec1 >> 1))) | (vec2 & (~(vec2 >> 1)))
1156   //   A := (xor | (xor >> 1)) & 0x5555...
1157   //   B := A & detect_missing
1158   //   C := B & xor
1159   //   cost += popcount2(A + B + C)
1160   // (I would not be surprised if a few operations could be shaved from this.)
1161   uintptr_t* lptr_end = &(loadbuf1[sample_ctl2]);
1162   uintptr_t cost = 3;
1163   uintptr_t loader;
1164   uintptr_t loader2;
1165   uintptr_t xor_word;
1166   uintptr_t detect_missing;
1167   uintptr_t result_a;
1168   uintptr_t result_b;
1169   uintptr_t result_c;
1170 #ifdef __LP64__
1171   uintptr_t cur_decr = 40;
1172   uintptr_t* lptr_4x_end;
1173   sample_ctl2 &= ~3LLU;
1174   while (sample_ctl2 >= 40) {
1175   qrem_cost2_loop:
1176     lptr_4x_end = &(loadbuf1[cur_decr]);
1177     cost += qrem_cost2_40v((__m128i*)loadbuf1, (__m128i*)lptr_4x_end, (__m128i*)loadbuf2);
1178     loadbuf1 = lptr_4x_end;
1179     loadbuf2 = &(loadbuf2[cur_decr]);
1180     sample_ctl2 -= cur_decr;
1181   }
1182   if (sample_ctl2) {
1183     cur_decr = sample_ctl2;
1184     goto qrem_cost2_loop;
1185   }
1186 #else
1187   uintptr_t* lptr_four_end = &(loadbuf1[sample_ctl2 & (~3)]);
1188   while (loadbuf1 < lptr_four_end) {
1189     cost += qrem_cost2_4(loadbuf1, loadbuf2);
1190     loadbuf1 = &(loadbuf1[4]);
1191     loadbuf2 = &(loadbuf2[4]);
1192   }
1193 #endif
1194   while (loadbuf1 < lptr_end) {
1195     loader = *loadbuf1++;
1196     loader2 = *loadbuf2++;
1197     xor_word = loader ^ loader2;
1198     detect_missing = (loader & (~(loader >> 1))) | (loader2 & (~(loader2 >> 1)));
1199     result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1200     result_b = result_a & detect_missing;
1201     result_c = result_b & xor_word;
1202     cost += popcount2_long(result_a + result_b + result_c);
1203   }
1204   return cost;
1205 }
1206 
1207 #ifdef __LP64__
calc_rem_merge4_two(uint32_t perm_ct128,__m128i * __restrict__ perm_ptr,__m128i * __restrict__ rem_merge4a,__m128i * __restrict__ rem_merge4b)1208 static inline void calc_rem_merge4_two(uint32_t perm_ct128, __m128i* __restrict__ perm_ptr, __m128i* __restrict__ rem_merge4a, __m128i* __restrict__ rem_merge4b) {
1209   const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
1210   __m128i loader;
1211   __m128i loader2;
1212   uint32_t pbidx;
1213   for (pbidx = 0; pbidx < perm_ct128; pbidx++) {
1214     loader = *perm_ptr++;
1215     loader2 = _mm_and_si128(loader, m1x4);
1216     rem_merge4a[0] = _mm_add_epi64(rem_merge4a[0], loader2);
1217     rem_merge4b[0] = _mm_add_epi64(rem_merge4b[0], loader2);
1218     loader = _mm_srli_epi64(loader, 1);
1219     loader2 = _mm_and_si128(loader, m1x4);
1220     rem_merge4a[1] = _mm_add_epi64(rem_merge4a[1], loader2);
1221     rem_merge4b[1] = _mm_add_epi64(rem_merge4b[1], loader2);
1222     loader = _mm_srli_epi64(loader, 1);
1223     loader2 = _mm_and_si128(loader, m1x4);
1224     rem_merge4a[2] = _mm_add_epi64(rem_merge4a[2], loader2);
1225     rem_merge4b[2] = _mm_add_epi64(rem_merge4b[2], loader2);
1226     loader = _mm_srli_epi64(loader, 1);
1227     loader2 = _mm_and_si128(loader, m1x4);
1228     rem_merge4a[3] = _mm_add_epi64(rem_merge4a[3], loader2);
1229     rem_merge4b[3] = _mm_add_epi64(rem_merge4b[3], loader2);
1230     rem_merge4a = &(rem_merge4a[4]);
1231     rem_merge4b = &(rem_merge4b[4]);
1232   }
1233 }
1234 
calc_rem_merge32_minus(uint32_t perm_ct16,__m128i * __restrict__ rem_merge8,__m128i * rem_write)1235 static inline void calc_rem_merge32_minus(uint32_t perm_ct16, __m128i* __restrict__ rem_merge8, __m128i* rem_write) {
1236   // temporary integer underflow is possible here, but by the end of the
1237   // calculation it should be reversed
1238   const __m128i m8x32 = {0x000000ff000000ffLLU, 0x000000ff000000ffLLU};
1239   __m128i loader;
1240   uint32_t pbidx;
1241   for (pbidx = 0; pbidx < perm_ct16; pbidx++) {
1242     loader = *rem_merge8;
1243     rem_write[0] = _mm_sub_epi64(rem_write[0], _mm_and_si128(loader, m8x32));
1244     loader = _mm_srli_epi64(loader, 8);
1245     rem_write[1] = _mm_sub_epi64(rem_write[1], _mm_and_si128(loader, m8x32));
1246     loader = _mm_srli_epi64(loader, 8);
1247     rem_write[2] = _mm_sub_epi64(rem_write[2], _mm_and_si128(loader, m8x32));
1248     loader = _mm_srli_epi64(loader, 8);
1249     rem_write[3] = _mm_sub_epi64(rem_write[3], _mm_and_si128(loader, m8x32));
1250     rem_write = &(rem_write[4]);
1251     *rem_merge8++ = _mm_setzero_si128();
1252   }
1253 }
1254 #else
calc_rem_merge4_two(uint32_t perm_ct32,uintptr_t * __restrict__ perm_ptr,uintptr_t * __restrict__ rem_merge4a,uintptr_t * __restrict__ rem_merge4b)1255 static inline void calc_rem_merge4_two(uint32_t perm_ct32, uintptr_t* __restrict__ perm_ptr, uintptr_t* __restrict__ rem_merge4a, uintptr_t* __restrict__ rem_merge4b) {
1256   uintptr_t loader;
1257   uintptr_t loader2;
1258   uint32_t pbidx;
1259   for (pbidx = 0; pbidx < perm_ct32; pbidx++) {
1260     loader = *perm_ptr++;
1261     loader2 = loader & 0x11111111;
1262     rem_merge4a[0] += loader2;
1263     rem_merge4b[0] += loader2;
1264     loader2 = (loader >> 1) & 0x11111111;
1265     rem_merge4a[1] += loader2;
1266     rem_merge4b[1] += loader2;
1267     loader2 = (loader >> 2) & 0x11111111;
1268     rem_merge4a[2] += loader2;
1269     rem_merge4b[2] += loader2;
1270     loader2 = (loader >> 3) & 0x11111111;
1271     rem_merge4a[3] += loader2;
1272     rem_merge4b[3] += loader2;
1273     rem_merge4a = &(rem_merge4a[4]);
1274     rem_merge4b = &(rem_merge4b[4]);
1275   }
1276 }
1277 
calc_rem_merge32_minus(uint32_t perm_ct4,uintptr_t * __restrict__ rem_merge8,uintptr_t * __restrict__ rem_write)1278 static inline void calc_rem_merge32_minus(uint32_t perm_ct4, uintptr_t* __restrict__ rem_merge8, uintptr_t* __restrict__ rem_write) {
1279   uintptr_t loader;
1280   uint32_t pbidx;
1281   for (pbidx = 0; pbidx < perm_ct4; pbidx++) {
1282     loader = *rem_merge8;
1283     rem_write[0] -= (uint8_t)loader;
1284     loader >>= 8;
1285     rem_write[1] -= (uint8_t)loader;
1286     loader >>= 8;
1287     rem_write[2] -= (uint8_t)loader;
1288     loader >>= 8;
1289     rem_write[3] -= loader;
1290     rem_write = &(rem_write[4]);
1291     *rem_merge8++ = 0;
1292   }
1293 }
1294 #endif
1295 
calc_rem(uint32_t pheno_nm_ct,uintptr_t perm_vec_ct,uintptr_t * loadbuf,uintptr_t * loadbuf_ref,uint32_t * perm_vecst,uint32_t * results_bufs,uint32_t * thread_wkspace)1296 void calc_rem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, uint32_t* perm_vecst, uint32_t* results_bufs, uint32_t* thread_wkspace) {
1297   uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
1298   uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
1299   // [cur_xor - 1][cur_raw]
1300   // low 8 bits give index of first remv[] array to increment; next 8 bits give
1301   // second index if nonzero, or indicate its absence
1302   const uint32_t idx_table[3][4] = {{0x300, 0x102, 4, 5}, {0x500, 2, 0x104, 3}, {0, 0x502, 0x304, 1}};
1303 #ifdef __LP64__
1304   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
1305   uint32_t perm_ct128x4 = perm_ct128 * 4;
1306   uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
1307   uint32_t perm_ct16x4 = 4 * perm_ct16;
1308   __m128i* permsv = (__m128i*)perm_vecst;
1309   // 0, 2, 4: homrar, missing, het ct increment
1310   // 1, 3, 5: homrar, missing, het ct decrement
1311   __m128i* remv[15];
1312   __m128i* __restrict__ perm_ptr;
1313 #else
1314   uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
1315   uint32_t perm_ct32x4 = perm_ct32 * 4;
1316   uint32_t perm_ct8 = (perm_vec_ct + 7) / 8;
1317   uint32_t perm_ct4 = (perm_vec_ct + 3) / 4;
1318   uint32_t perm_ct16x16 = 16 * perm_ct16;
1319   uintptr_t* permsv = (uintptr_t*)perm_vecst;
1320   uintptr_t* remv[15];
1321   uintptr_t* perm_ptr;
1322 #endif
1323 
1324   uint32_t cur_cts[6];
1325   uintptr_t ulraw1;
1326   uintptr_t ulxor;
1327   uint32_t cur_xor;
1328   uint32_t cur_raw;
1329   uint32_t idx1;
1330   uint32_t idx2;
1331   uint32_t uii;
1332   uint32_t ujj;
1333   uint32_t ukk;
1334 #ifdef __LP64__
1335   for (uii = 0; uii < 6; uii++) {
1336     remv[uii] = &(((__m128i*)thread_wkspace)[uii * perm_ct128x4]);
1337   }
1338   for (uii = 0; uii < 6; uii++) {
1339     remv[uii + 6] = &(((__m128i*)thread_wkspace)[6 * perm_ct128x4 + 2 * uii * perm_ct32]);
1340   }
1341   remv[12] = (__m128i*)results_bufs;
1342   remv[13] = &(((__m128i*)results_bufs)[perm_ct16x4]);
1343   remv[14] = &(((__m128i*)results_bufs)[2 * perm_ct16x4]);
1344 #else
1345   for (uii = 0; uii < 6; uii++) {
1346     remv[uii] = (uintptr_t*)(&(thread_wkspace[uii * perm_ct32x4]));
1347   }
1348   for (uii = 0; uii < 6; uii++) {
1349     remv[uii + 6] = (uintptr_t*)(&(thread_wkspace[6 * perm_ct32x4 + 2 * uii * perm_ct8]));
1350   }
1351   remv[12] = (uintptr_t*)results_bufs;
1352   remv[13] = (uintptr_t*)(&(results_bufs[perm_ct16x16]));
1353   remv[14] = (uintptr_t*)(&(results_bufs[2 * perm_ct16x16]));
1354 #endif
1355 
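  // Overflow-avoidance scheme: remv[0..5] are packed 4-bit per-permutation
  // counters, remv[6..11] are packed 8-bit counters, and remv[12..14] (in
  // results_bufs) are full 32-bit counters.  The 4-bit tier is flushed into
  // the 8-bit tier after every 15 updates of a given counter, and the 8-bit
  // tier into the 32-bit tier after every 255 updates, so no field can wrap.
  // Decrement tiers (odd remv[] indices) are folded in by subtraction via
  // calc_rem_merge32_minus().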
1356   for (uii = 0; uii < 6; uii++) {
1357     cur_cts[uii] = 0;
1358   }
1359   for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
1360     ulraw1 = *loadbuf++;
1361     ulxor = ulraw1 ^ (*loadbuf_ref++);
1362     if (uii + 1 == pheno_nm_ctl2x) {
1363       ujj = pheno_nm_ct & (BITCT2 - 1);
1364       if (ujj) {
1365 	ulxor &= (ONELU << (ujj * 2)) - ONELU;
1366       }
1367     }
1368     while (ulxor) {
1369       ujj = CTZLU(ulxor) & (BITCT - 2);
1370       cur_xor = (ulxor >> ujj) & 3;
1371       cur_raw = (ulraw1 >> ujj) & 3;
1372       idx1 = idx_table[cur_xor - 1][cur_raw];
1373       idx2 = idx1 >> 8;
1374       idx1 &= 255;
1375 #ifdef __LP64__
1376       perm_ptr = &(permsv[(ujj / 2) * perm_ct128]);
1377       if (!idx2) {
1378 	unroll_incr_1_4(perm_ptr, remv[idx1], perm_ct128);
1379       } else {
1380 	calc_rem_merge4_two(perm_ct128, perm_ptr, remv[idx1], remv[idx2]);
1381 	ukk = cur_cts[idx2] + 1;
1382 	cur_cts[idx2] = ukk;
1383 	if (!(ukk % 15)) {
1384 	  unroll_zero_incr_4_8(remv[idx2], remv[idx2 + 6], perm_ct32);
1385 	  if (!(ukk % 255)) {
1386 	    calc_rem_merge32_minus(perm_ct16, remv[idx2 + 6], remv[(idx2 / 2) + 12]);
1387 	  }
1388 	}
1389       }
1390       ukk = cur_cts[idx1] + 1;
1391       cur_cts[idx1] = ukk;
1392       if (!(ukk % 15)) {
1393 	unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct32);
1394 	if (!(ukk % 255)) {
1395 	  if (!(idx1 & 1)) {
1396 	    unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct16);
1397 	  } else {
1398 	    calc_rem_merge32_minus(perm_ct16, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
1399 	  }
1400 	}
1401       }
1402 #else
1403       perm_ptr = &(permsv[(ujj / 2) * perm_ct32]);
1404       if (!idx2) {
1405 	unroll_incr_1_4(perm_ptr, remv[idx1], perm_ct32);
1406       } else {
1407 	calc_rem_merge4_two(perm_ct32, perm_ptr, remv[idx1], remv[idx2]);
1408 	ukk = cur_cts[idx2] + 1;
1409 	cur_cts[idx2] = ukk;
1410 	if (!(ukk % 15)) {
1411 	  unroll_zero_incr_4_8(remv[idx2], remv[idx2 + 6], perm_ct8);
1412 	  if (!(ukk % 255)) {
1413 	    calc_rem_merge32_minus(perm_ct4, remv[idx2 + 6], remv[(idx2 / 2) + 12]);
1414 	  }
1415 	}
1416       }
1417       ukk = cur_cts[idx1] + 1;
1418       cur_cts[idx1] = ukk;
1419       if (!(ukk % 15)) {
1420 	unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct8);
1421 	if (!(ukk % 255)) {
1422 	  if (!(idx1 & 1)) {
1423 	    unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct4);
1424 	  } else {
1425 	    calc_rem_merge32_minus(perm_ct4, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
1426 	  }
1427 	}
1428       }
1429 #endif
1430       ulxor &= ~((3 * ONELU) << ujj);
1431     }
1432 #ifdef __LP64__
1433     permsv = &(permsv[BITCT2 * perm_ct128]);
1434 #else
1435     permsv = &(permsv[BITCT2 * perm_ct32]);
1436 #endif
1437   }
1438   for (idx1 = 0; idx1 < 6; idx1++) {
1439     uii = cur_cts[idx1];
1440 #ifdef __LP64__
1441     if (uii % 15) {
1442       // todo: check if zeroing needed
1443       unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct32);
1444     }
1445     if (uii % 255) {
1446       if (!(idx1 & 1)) {
1447 	unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct16);
1448       } else {
1449 	calc_rem_merge32_minus(perm_ct16, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
1450       }
1451     }
1452 #else
1453     if (uii % 15) {
1454       unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct8);
1455     }
1456     if (uii % 255) {
1457       if (!(idx1 & 1)) {
1458 	unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct4);
1459       } else {
1460 	calc_rem_merge32_minus(perm_ct4, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
1461       }
1462     }
1463 #endif
1464   }
1465 }
1466 
1467 void calc_qrem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
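  // Quantitative-trait analogue of calc_rem(): visits only the positions where
  // loadbuf and loadbuf_ref differ, and for each permuted phenotype adjusts
  // three accumulator rows in outbufs relative to the reference marker:
  //   row 0: sum of (genotype dosage x phenotype), with homrar counted as 2,
  //          het as 1, homcom as 0
  //   rows 1-2: phenotype sum and sum of squares over missing-genotype samples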
1468   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
1469   uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
1470 #ifdef __LP64__
1471   // sample row stride is perm_vec_ctcl8m doubles; halve for __m128d units, and halve again since ujj is twice the sample index
1472   uint32_t row_mult = perm_vec_ctcl8m / 4;
1473 
1474   uint32_t loop_len = (perm_vec_ct + 1) / 2;
1475   __m128d* permsv = (__m128d*)perm_vecstd;
1476   __m128d* __restrict__ perm_readv;
1477   __m128d* __restrict__ rem_writev;
1478   __m128d* __restrict__ rem_write2v;
1479   __m128d* __restrict__ rem_write3v;
1480   __m128d vxx;
1481 #else
1482   uint32_t row_mult = perm_vec_ctcl8m / 2;
1483   double* __restrict__ perm_read;
1484   double* __restrict__ rem_write;
1485   double* __restrict__ rem_write2;
1486   double* __restrict__ rem_write3;
1487   double dxx;
1488 #endif
1489   uintptr_t ulraw1;
1490   uintptr_t ulxor;
1491   uint32_t cur_xor;
1492   uint32_t cur_raw;
1493   uint32_t uii;
1494   uint32_t ujj;
1495   uint32_t ukk;
1496   for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
1497     ulraw1 = *loadbuf++;
1498     ulxor = ulraw1 ^ (*loadbuf_ref++);
1499     if (uii + 1 == pheno_nm_ctl2x) {
1500       ujj = pheno_nm_ct & (BITCT2 - 1);
1501       if (ujj) {
1502 	ulxor &= (ONELU << (ujj * 2)) - ONELU;
1503       }
1504     }
1505     while (ulxor) {
1506       ujj = CTZLU(ulxor) & (BITCT - 2);
1507       cur_xor = (ulxor >> ujj) & 3;
1508       cur_raw = (ulraw1 >> ujj) & 3;
1509 #ifdef __LP64__
1510       perm_readv = &(permsv[ujj * row_mult]);
1511       rem_writev = (__m128d*)outbufs;
1512       rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
1513       rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
1514       if (cur_raw == 3) {
1515 	if (cur_xor == 1) {
1516 	  for (ukk = 0; ukk < loop_len; ukk++) {
1517 	    vxx = *perm_readv++;
1518 	    *rem_writev = _mm_sub_pd(*rem_writev, vxx);
1519 	    rem_writev++;
1520 	  }
1521 	} else if (cur_xor == 3) {
1522 	  for (ukk = 0; ukk < loop_len; ukk++) {
1523 	    vxx = *perm_readv++;
1524 	    *rem_writev = _mm_sub_pd(*rem_writev, _mm_add_pd(vxx, vxx));
1525 	    rem_writev++;
1526 	  }
1527         } else {
1528 	  for (ukk = 0; ukk < loop_len; ukk++) {
1529 	    vxx = *perm_readv++;
1530 	    *rem_write2v = _mm_sub_pd(*rem_write2v, vxx);
1531 	    rem_write2v++;
1532 	    *rem_write3v = _mm_sub_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
1533 	    rem_write3v++;
1534 	  }
1535         }
1536       } else if (cur_raw == 2) {
1537 	if (cur_xor == 1) {
1538 	  for (ukk = 0; ukk < loop_len; ukk++) {
1539 	    vxx = *perm_readv++;
1540 	    *rem_writev = _mm_add_pd(*rem_writev, vxx);
1541 	    rem_writev++;
1542 	  }
1543 	} else if (cur_xor == 2) {
1544 	  for (ukk = 0; ukk < loop_len; ukk++) {
1545 	    vxx = *perm_readv++;
1546 	    *rem_writev = _mm_sub_pd(*rem_writev, vxx);
1547 	    rem_writev++;
1548 	  }
1549 	} else {
1550 	  for (ukk = 0; ukk < loop_len; ukk++) {
1551 	    vxx = *perm_readv++;
1552 	    *rem_writev = _mm_add_pd(*rem_writev, vxx);
1553 	    rem_writev++;
1554 	    *rem_write2v = _mm_sub_pd(*rem_write2v, vxx);
1555 	    rem_write2v++;
1556 	    *rem_write3v = _mm_sub_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
1557 	    rem_write3v++;
1558 	  }
1559 	}
1560       } else if (!cur_raw) {
1561 	if (cur_xor == 3) {
1562 	  for (ukk = 0; ukk < loop_len; ukk++) {
1563 	    vxx = *perm_readv++;
1564 	    *rem_writev = _mm_add_pd(*rem_writev, _mm_add_pd(vxx, vxx));
1565 	    rem_writev++;
1566 	  }
1567 	} else if (cur_xor == 2) {
1568 	  for (ukk = 0; ukk < loop_len; ukk++) {
1569 	    vxx = *perm_readv++;
1570 	    *rem_writev = _mm_add_pd(*rem_writev, vxx);
1571 	    rem_writev++;
1572 	  }
1573 	} else {
1574 	  for (ukk = 0; ukk < loop_len; ukk++) {
1575 	    vxx = *perm_readv++;
1576 	    *rem_writev = _mm_add_pd(*rem_writev, _mm_add_pd(vxx, vxx));
1577 	    rem_writev++;
1578 	    *rem_write2v = _mm_sub_pd(*rem_write2v, vxx);
1579 	    rem_write2v++;
1580 	    *rem_write3v = _mm_sub_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
1581 	    rem_write3v++;
1582 	  }
1583 	}
1584       } else {
1585 	if (cur_xor == 2) {
1586 	  for (ukk = 0; ukk < loop_len; ukk++) {
1587 	    vxx = *perm_readv++;
1588 	    *rem_write2v = _mm_add_pd(*rem_write2v, vxx);
1589 	    rem_write2v++;
1590 	    *rem_write3v = _mm_add_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
1591 	    rem_write3v++;
1592 	  }
1593 	} else if (cur_xor == 3) {
1594 	  for (ukk = 0; ukk < loop_len; ukk++) {
1595 	    vxx = *perm_readv++;
1596 	    *rem_writev = _mm_sub_pd(*rem_writev, vxx);
1597 	    rem_writev++;
1598 	    *rem_write2v = _mm_add_pd(*rem_write2v, vxx);
1599 	    rem_write2v++;
1600 	    *rem_write3v = _mm_add_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
1601 	    rem_write3v++;
1602 	  }
1603 	} else {
1604 	  for (ukk = 0; ukk < loop_len; ukk++) {
1605 	    vxx = *perm_readv++;
1606 	    *rem_writev = _mm_sub_pd(*rem_writev, _mm_add_pd(vxx, vxx));
1607 	    rem_writev++;
1608 	    *rem_write2v = _mm_add_pd(*rem_write2v, vxx);
1609 	    rem_write2v++;
1610 	    *rem_write3v = _mm_add_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
1611 	    rem_write3v++;
1612 	  }
1613 	}
1614       }
1615 #else
1616       perm_read = &(perm_vecstd[ujj * row_mult]);
1617       rem_write = outbufs;
1618       rem_write2 = &(outbufs[perm_vec_ctcl8m]);
1619       rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
1620       if (cur_raw == 3) {
1621 	if (cur_xor == 1) {
1622 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1623 	    dxx = *perm_read++;
1624 	    *rem_write -= dxx;
1625 	    rem_write++;
1626 	  }
1627 	} else if (cur_xor == 3) {
1628 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1629 	    dxx = *perm_read++;
1630 	    *rem_write -= 2 * dxx;
1631 	    rem_write++;
1632 	  }
1633 	} else {
1634 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1635 	    dxx = *perm_read++;
1636 	    *rem_write2 -= dxx;
1637 	    rem_write2++;
1638 	    *rem_write3 -= dxx * dxx;
1639 	    rem_write3++;
1640 	  }
1641 	}
1642       } else if (cur_raw == 2) {
1643 	if (cur_xor == 1) {
1644 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1645 	    dxx = *perm_read++;
1646 	    *rem_write += dxx;
1647 	    rem_write++;
1648 	  }
1649 	} else if (cur_xor == 2) {
1650 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1651 	    dxx = *perm_read++;
1652 	    *rem_write -= dxx;
1653 	    rem_write++;
1654 	  }
1655 	} else {
1656 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1657 	    dxx = *perm_read++;
1658 	    *rem_write += dxx;
1659 	    rem_write++;
1660 	    *rem_write2 -= dxx;
1661 	    rem_write2++;
1662 	    *rem_write3 -= dxx * dxx;
1663 	    rem_write3++;
1664 	  }
1665 	}
1666       } else if (!cur_raw) {
1667 	if (cur_xor == 3) {
1668 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1669 	    dxx = *perm_read++;
1670 	    *rem_write += 2 * dxx;
1671 	    rem_write++;
1672 	  }
1673 	} else if (cur_xor == 2) {
1674 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1675 	    dxx = *perm_read++;
1676 	    *rem_write += dxx;
1677 	    rem_write++;
1678 	  }
1679 	} else {
1680 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1681 	    dxx = *perm_read++;
1682 	    *rem_write += 2 * dxx;
1683 	    rem_write++;
1684 	    *rem_write2 -= dxx;
1685 	    rem_write2++;
1686 	    *rem_write3 -= dxx * dxx;
1687 	    rem_write3++;
1688 	  }
1689 	}
1690       } else {
1691 	if (cur_xor == 2) {
1692 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1693 	    dxx = *perm_read++;
1694 	    *rem_write2 += dxx;
1695 	    rem_write2++;
1696 	    *rem_write3 += dxx * dxx;
1697 	    rem_write3++;
1698 	  }
1699 	} else if (cur_xor == 3) {
1700 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1701 	    dxx = *perm_read++;
1702 	    *rem_write -= dxx;
1703 	    rem_write++;
1704 	    *rem_write2 += dxx;
1705 	    rem_write2++;
1706 	    *rem_write3 += dxx * dxx;
1707 	    rem_write3++;
1708 	  }
1709 	} else {
1710 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1711 	    dxx = *perm_read++;
1712 	    *rem_write -= 2 * dxx;
1713 	    rem_write++;
1714 	    *rem_write2 += dxx;
1715 	    rem_write2++;
1716 	    *rem_write3 += dxx * dxx;
1717 	    rem_write3++;
1718 	  }
1719 	}
1720       }
1721 #endif
1722       ulxor &= ~((3 * ONELU) << ujj);
1723     }
1724 #ifdef __LP64__
1725     permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
1726 #else
1727     perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
1728 #endif
1729   }
1730 }
1731 
1732 void calc_qrem_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
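  // Variant of calc_qrem() for the Lin-statistic code path (see g_orig_linsq):
  // instead of a single dosage-weighted sum, it maintains separate (phenotype
  // sum, phenotype sum of squares) pairs per genotype class, adjusted relative
  // to the reference marker at positions where loadbuf and loadbuf_ref differ:
  //   rows 0-1: het samples, rows 2-3: homrar samples, rows 4-5: missing
  //   samples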
1733   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
1734   uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
1735 #ifdef __LP64__
1736   // sample row stride is perm_vec_ctcl8m doubles; halve for __m128d units, and halve again since ujj is twice the sample index
1737   uint32_t row_mult = perm_vec_ctcl8m / 4;
1738 
1739   uint32_t loop_len = (perm_vec_ct + 1) / 2;
1740   __m128d* permsv = (__m128d*)perm_vecstd;
1741   __m128d* __restrict__ perm_readv;
1742   __m128d* __restrict__ rem_writev = (__m128d*)outbufs;
1743   __m128d* __restrict__ rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
1744   __m128d* __restrict__ rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
1745   __m128d* __restrict__ rem_write4v = (__m128d*)(&(outbufs[3 * perm_vec_ctcl8m]));
1746   __m128d* __restrict__ rem_write5v = (__m128d*)(&(outbufs[4 * perm_vec_ctcl8m]));
1747   __m128d* __restrict__ rem_write6v = (__m128d*)(&(outbufs[5 * perm_vec_ctcl8m]));
1748   __m128d vxx;
1749 #else
1750   uint32_t row_mult = perm_vec_ctcl8m / 2;
1751   double* __restrict__ perm_read;
1752   double* __restrict__ rem_write = outbufs;
1753   double* __restrict__ rem_write2 = &(outbufs[perm_vec_ctcl8m]);
1754   double* __restrict__ rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
1755   double* __restrict__ rem_write4 = &(outbufs[3 * perm_vec_ctcl8m]);
1756   double* __restrict__ rem_write5 = &(outbufs[4 * perm_vec_ctcl8m]);
1757   double* __restrict__ rem_write6 = &(outbufs[5 * perm_vec_ctcl8m]);
1758   double dxx;
1759 #endif
1760   uintptr_t ulraw1;
1761   uintptr_t ulxor;
1762   uint32_t cur_xor;
1763   uint32_t cur_raw;
1764   uint32_t uii;
1765   uint32_t ujj;
1766   uint32_t ukk;
1767   for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
1768     ulraw1 = *loadbuf++;
1769     ulxor = ulraw1 ^ (*loadbuf_ref++);
1770     if (uii + 1 == pheno_nm_ctl2x) {
1771       ujj = pheno_nm_ct & (BITCT2 - 1);
1772       if (ujj) {
1773 	ulxor &= (ONELU << (ujj * 2)) - ONELU;
1774       }
1775     }
1776     while (ulxor) {
1777       ujj = CTZLU(ulxor) & (BITCT - 2);
1778       cur_xor = (ulxor >> ujj) & 3;
1779       cur_raw = (ulraw1 >> ujj) & 3;
1780 #ifdef __LP64__
1781       perm_readv = &(permsv[ujj * row_mult]);
1782       if (cur_raw == 3) {
1783 	if (cur_xor == 1) {
1784 	  for (ukk = 0; ukk < loop_len; ukk++) {
1785 	    vxx = *perm_readv++;
1786 	    *rem_writev = _mm_sub_pd(*rem_writev, vxx);
1787 	    rem_writev++;
1788 	    *rem_write2v = _mm_sub_pd(*rem_write2v, _mm_mul_pd(vxx, vxx));
1789 	    rem_write2v++;
1790 	  }
1791 	  rem_writev = (__m128d*)outbufs;
1792 	  rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
1793 	} else if (cur_xor == 3) {
1794 	  for (ukk = 0; ukk < loop_len; ukk++) {
1795 	    vxx = *perm_readv++;
1796 	    *rem_write3v = _mm_sub_pd(*rem_write3v, vxx);
1797 	    rem_write3v++;
1798 	    *rem_write4v = _mm_sub_pd(*rem_write4v, _mm_mul_pd(vxx, vxx));
1799 	    rem_write4v++;
1800 	  }
1801 	  rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
1802 	  rem_write4v = (__m128d*)(&(outbufs[3 * perm_vec_ctcl8m]));
1803         } else {
1804 	  for (ukk = 0; ukk < loop_len; ukk++) {
1805 	    vxx = *perm_readv++;
1806 	    *rem_write5v = _mm_sub_pd(*rem_write5v, vxx);
1807 	    rem_write5v++;
1808 	    *rem_write6v = _mm_sub_pd(*rem_write6v, _mm_mul_pd(vxx, vxx));
1809 	    rem_write6v++;
1810 	  }
1811 	  rem_write5v = (__m128d*)(&(outbufs[4 * perm_vec_ctcl8m]));
1812 	  rem_write6v = (__m128d*)(&(outbufs[5 * perm_vec_ctcl8m]));
1813         }
1814       } else if (cur_raw == 2) {
1815 	if (cur_xor == 1) {
1816 	  for (ukk = 0; ukk < loop_len; ukk++) {
1817 	    vxx = *perm_readv++;
1818 	    *rem_writev = _mm_add_pd(*rem_writev, vxx);
1819 	    rem_writev++;
1820 	    *rem_write2v = _mm_add_pd(*rem_write2v, _mm_mul_pd(vxx, vxx));
1821 	    rem_write2v++;
1822 	  }
1823 	} else if (cur_xor == 2) {
1824 	  for (ukk = 0; ukk < loop_len; ukk++) {
1825 	    vxx = *perm_readv++;
1826 	    *rem_writev = _mm_add_pd(*rem_writev, vxx);
1827 	    rem_writev++;
1828 	    *rem_write3v = _mm_sub_pd(*rem_write3v, vxx);
1829 	    rem_write3v++;
1830 	    vxx = _mm_mul_pd(vxx, vxx);
1831 	    *rem_write2v = _mm_add_pd(*rem_write2v, vxx);
1832 	    rem_write2v++;
1833 	    *rem_write4v = _mm_sub_pd(*rem_write4v, vxx);
1834 	    rem_write4v++;
1835 	  }
1836 	  rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
1837 	  rem_write4v = (__m128d*)(&(outbufs[3 * perm_vec_ctcl8m]));
1838 	} else {
1839 	  for (ukk = 0; ukk < loop_len; ukk++) {
1840 	    vxx = *perm_readv++;
1841 	    *rem_writev = _mm_add_pd(*rem_writev, vxx);
1842 	    rem_writev++;
1843 	    *rem_write5v = _mm_sub_pd(*rem_write5v, vxx);
1844 	    rem_write5v++;
1845 	    vxx = _mm_mul_pd(vxx, vxx);
1846 	    *rem_write2v = _mm_add_pd(*rem_write2v, vxx);
1847 	    rem_write2v++;
1848 	    *rem_write6v = _mm_sub_pd(*rem_write6v, vxx);
1849 	    rem_write6v++;
1850 	  }
1851 	  rem_write5v = (__m128d*)(&(outbufs[4 * perm_vec_ctcl8m]));
1852 	  rem_write6v = (__m128d*)(&(outbufs[5 * perm_vec_ctcl8m]));
1853 	}
1854 	rem_writev = (__m128d*)outbufs;
1855 	rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
1856       } else if (!cur_raw) {
1857 	if (cur_xor == 3) {
1858 	  for (ukk = 0; ukk < loop_len; ukk++) {
1859 	    vxx = *perm_readv++;
1860 	    *rem_write3v = _mm_add_pd(*rem_write3v, vxx);
1861 	    rem_write3v++;
1862 	    *rem_write4v = _mm_add_pd(*rem_write4v, _mm_mul_pd(vxx, vxx));
1863 	    rem_write4v++;
1864 	  }
1865 	} else if (cur_xor == 2) {
1866 	  for (ukk = 0; ukk < loop_len; ukk++) {
1867 	    vxx = *perm_readv++;
1868 	    *rem_writev = _mm_sub_pd(*rem_writev, vxx);
1869 	    rem_writev++;
1870 	    *rem_write3v = _mm_add_pd(*rem_write3v, vxx);
1871 	    rem_write3v++;
1872 	    vxx = _mm_mul_pd(vxx, vxx);
1873 	    *rem_write2v = _mm_sub_pd(*rem_write2v, vxx);
1874 	    rem_write2v++;
1875 	    *rem_write4v = _mm_add_pd(*rem_write4v, vxx);
1876 	    rem_write4v++;
1877 	  }
1878 	  rem_writev = (__m128d*)outbufs;
1879 	  rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
1880 	} else {
1881 	  for (ukk = 0; ukk < loop_len; ukk++) {
1882 	    vxx = *perm_readv++;
1883 	    *rem_write3v = _mm_add_pd(*rem_write3v, vxx);
1884 	    rem_write3v++;
1885 	    *rem_write5v = _mm_sub_pd(*rem_write5v, vxx);
1886 	    rem_write5v++;
1887 	    vxx = _mm_mul_pd(vxx, vxx);
1888 	    *rem_write4v = _mm_add_pd(*rem_write4v, vxx);
1889 	    rem_write4v++;
1890 	    *rem_write6v = _mm_sub_pd(*rem_write6v, vxx);
1891 	    rem_write6v++;
1892 	  }
1893 	  rem_write5v = (__m128d*)(&(outbufs[4 * perm_vec_ctcl8m]));
1894 	  rem_write6v = (__m128d*)(&(outbufs[5 * perm_vec_ctcl8m]));
1895 	}
1896 	rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
1897 	rem_write4v = (__m128d*)(&(outbufs[3 * perm_vec_ctcl8m]));
1898       } else {
1899 	if (cur_xor == 2) {
1900 	  for (ukk = 0; ukk < loop_len; ukk++) {
1901 	    vxx = *perm_readv++;
1902 	    *rem_write5v = _mm_add_pd(*rem_write5v, vxx);
1903 	    rem_write5v++;
1904 	    *rem_write6v = _mm_add_pd(*rem_write6v, _mm_mul_pd(vxx, vxx));
1905 	    rem_write6v++;
1906 	  }
1907 	} else if (cur_xor == 3) {
1908 	  for (ukk = 0; ukk < loop_len; ukk++) {
1909 	    vxx = *perm_readv++;
1910 	    *rem_writev = _mm_sub_pd(*rem_writev, vxx);
1911 	    rem_writev++;
1912 	    *rem_write5v = _mm_add_pd(*rem_write5v, vxx);
1913 	    rem_write5v++;
1914 	    vxx = _mm_mul_pd(vxx, vxx);
1915 	    *rem_write2v = _mm_sub_pd(*rem_write2v, vxx);
1916 	    rem_write2v++;
1917 	    *rem_write6v = _mm_add_pd(*rem_write6v, vxx);
1918 	    rem_write6v++;
1919 	  }
1920 	  rem_writev = (__m128d*)outbufs;
1921 	  rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
1922 	} else {
1923 	  for (ukk = 0; ukk < loop_len; ukk++) {
1924 	    vxx = *perm_readv++;
1925 	    *rem_write3v = _mm_sub_pd(*rem_write3v, vxx);
1926 	    rem_write3v++;
1927 	    *rem_write5v = _mm_add_pd(*rem_write5v, vxx);
1928 	    rem_write5v++;
1929 	    vxx = _mm_mul_pd(vxx, vxx);
1930 	    *rem_write4v = _mm_sub_pd(*rem_write4v, vxx);
1931 	    rem_write4v++;
1932 	    *rem_write6v = _mm_add_pd(*rem_write6v, vxx);
1933 	    rem_write6v++;
1934 	  }
1935 	  rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
1936 	  rem_write4v = (__m128d*)(&(outbufs[3 * perm_vec_ctcl8m]));
1937 	}
1938 	rem_write5v = (__m128d*)(&(outbufs[4 * perm_vec_ctcl8m]));
1939 	rem_write6v = (__m128d*)(&(outbufs[5 * perm_vec_ctcl8m]));
1940       }
1941 #else
1942       perm_read = &(perm_vecstd[ujj * row_mult]);
1943       if (cur_raw == 3) {
1944 	if (cur_xor == 1) {
1945 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1946 	    dxx = *perm_read++;
1947 	    *rem_write -= dxx;
1948 	    rem_write++;
1949 	    *rem_write2 -= dxx * dxx;
1950 	    rem_write2++;
1951 	  }
1952           rem_write = outbufs;
1953           rem_write2 = &(outbufs[perm_vec_ctcl8m]);
1954 	} else if (cur_xor == 3) {
1955 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1956 	    dxx = *perm_read++;
1957 	    *rem_write3 -= dxx;
1958 	    rem_write3++;
1959 	    *rem_write4 -= dxx * dxx;
1960 	    rem_write4++;
1961 	  }
1962           rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
1963           rem_write4 = &(outbufs[3 * perm_vec_ctcl8m]);
1964 	} else {
1965 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1966 	    dxx = *perm_read++;
1967 	    *rem_write5 -= dxx;
1968 	    rem_write5++;
1969 	    *rem_write6 -= dxx * dxx;
1970 	    rem_write6++;
1971 	  }
1972           rem_write5 = &(outbufs[4 * perm_vec_ctcl8m]);
1973           rem_write6 = &(outbufs[5 * perm_vec_ctcl8m]);
1974 	}
1975       } else if (cur_raw == 2) {
1976 	if (cur_xor == 1) {
1977 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1978 	    dxx = *perm_read++;
1979 	    *rem_write += dxx;
1980 	    rem_write++;
1981 	    *rem_write2 += dxx * dxx;
1982 	    rem_write2++;
1983 	  }
1984 	} else if (cur_xor == 2) {
1985 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
1986 	    dxx = *perm_read++;
1987 	    *rem_write += dxx;
1988 	    rem_write++;
1989 	    *rem_write3 -= dxx;
1990 	    rem_write3++;
1991 	    dxx *= dxx;
1992 	    *rem_write2 += dxx;
1993 	    rem_write2++;
1994 	    *rem_write4 -= dxx;
1995 	    rem_write4++;
1996 	  }
1997           rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
1998           rem_write4 = &(outbufs[3 * perm_vec_ctcl8m]);
1999 	} else {
2000 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
2001 	    dxx = *perm_read++;
2002 	    *rem_write += dxx;
2003 	    rem_write++;
2004 	    *rem_write5 -= dxx;
2005 	    rem_write5++;
2006 	    dxx *= dxx;
2007 	    *rem_write2 += dxx;
2008 	    rem_write2++;
2009 	    *rem_write6 -= dxx;
2010 	    rem_write6++;
2011 	  }
2012           rem_write5 = &(outbufs[4 * perm_vec_ctcl8m]);
2013           rem_write6 = &(outbufs[5 * perm_vec_ctcl8m]);
2014 	}
2015 	rem_write = outbufs;
2016 	rem_write2 = &(outbufs[perm_vec_ctcl8m]);
2017       } else if (!cur_raw) {
2018 	if (cur_xor == 3) {
2019 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
2020 	    dxx = *perm_read++;
2021 	    *rem_write3 += dxx;
2022 	    rem_write3++;
2023 	    *rem_write4 += dxx * dxx;
2024 	    rem_write4++;
2025 	  }
2026 	} else if (cur_xor == 2) {
2027 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
2028 	    dxx = *perm_read++;
2029 	    *rem_write -= dxx;
2030 	    rem_write++;
2031 	    *rem_write3 += dxx;
2032 	    rem_write3++;
2033 	    dxx *= dxx;
2034 	    *rem_write2 -= dxx;
2035 	    rem_write2++;
2036 	    *rem_write4 += dxx;
2037 	    rem_write4++;
2038 	  }
2039 	  rem_write = outbufs;
2040 	  rem_write2 = &(outbufs[perm_vec_ctcl8m]);
2041 	} else {
2042 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
2043 	    dxx = *perm_read++;
2044 	    *rem_write3 += dxx;
2045 	    rem_write3++;
2046 	    *rem_write5 -= dxx;
2047 	    rem_write5++;
2048 	    dxx *= dxx;
2049 	    *rem_write4 += dxx;
2050 	    rem_write4++;
2051 	    *rem_write6 -= dxx;
2052 	    rem_write6++;
2053 	  }
2054           rem_write5 = &(outbufs[4 * perm_vec_ctcl8m]);
2055           rem_write6 = &(outbufs[5 * perm_vec_ctcl8m]);
2056 	}
2057 	rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
2058 	rem_write4 = &(outbufs[3 * perm_vec_ctcl8m]);
2059       } else {
2060 	if (cur_xor == 2) {
2061 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
2062 	    dxx = *perm_read++;
2063 	    *rem_write5 += dxx;
2064 	    rem_write5++;
2065 	    *rem_write6 += dxx * dxx;
2066 	    rem_write6++;
2067 	  }
2068 	} else if (cur_xor == 3) {
2069 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
2070 	    dxx = *perm_read++;
2071 	    *rem_write -= dxx;
2072 	    rem_write++;
2073 	    *rem_write5 += dxx;
2074 	    rem_write5++;
2075 	    dxx *= dxx;
2076 	    *rem_write2 -= dxx;
2077 	    rem_write2++;
2078 	    *rem_write6 += dxx;
2079 	    rem_write6++;
2080 	  }
2081 	  rem_write = outbufs;
2082 	  rem_write2 = &(outbufs[perm_vec_ctcl8m]);
2083 	} else {
2084 	  for (ukk = 0; ukk < perm_vec_ct; ukk++) {
2085 	    dxx = *perm_read++;
2086 	    *rem_write3 -= dxx;
2087 	    rem_write3++;
2088 	    *rem_write5 += dxx;
2089 	    rem_write5++;
2090 	    dxx *= dxx;
2091 	    *rem_write4 -= dxx;
2092 	    rem_write4++;
2093 	    *rem_write6 += dxx;
2094 	    rem_write6++;
2095 	  }
2096 	  rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
2097 	  rem_write4 = &(outbufs[3 * perm_vec_ctcl8m]);
2098 	}
2099 	rem_write5 = &(outbufs[4 * perm_vec_ctcl8m]);
2100 	rem_write6 = &(outbufs[5 * perm_vec_ctcl8m]);
2101       }
2102 #endif
2103       ulxor &= ~((3 * ONELU) << ujj);
2104     }
2105 #ifdef __LP64__
2106     permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
2107 #else
2108     perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
2109 #endif
2110   }
2111 }
2112 
2113 void check_for_better_rem_cost(uintptr_t best_cost, uint32_t maxt_block_base, uint32_t maxt_block_base2, uint32_t maxt_block_base3, uintptr_t marker_idx, uint32_t* __restrict__ missing_cts, uint32_t* __restrict__ homcom_cts, uint32_t* __restrict__ het_cts, uint16_t* ldrefs, uint32_t pheno_nm_ct, int32_t missing_ct, int32_t het_ct, int32_t homcom_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_cur, uint32_t* ldrefp) {
2114   // Check if PERMORY-style LD exploitation is better than genotype indexing
2115   // algorithm.
2116   //
2117   // Effective inner loop iterations required for LD exploitation:
2118   //   2 * (<-> neither side homcom) + (<-> homcom) + constant
2119   // Simple lower bound:
2120   //   max(delta(homcom), delta(non-homcom)) + constant
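  //
  // The cheap lower bound above is evaluated first for each candidate
  // reference marker; the exact rem_cost() scan is only performed when that
  // bound already beats best_cost, and *ldrefp is updated (and best_cost
  // tightened) only when the exact cost beats it as well.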
2121   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
2122   uint32_t marker_idx_tmp = maxt_block_base;
2123   uint32_t loop_ceil = maxt_block_base2;
2124   int32_t homrar_ct = pheno_nm_ct - missing_ct - het_ct - homcom_ct;
2125   int32_t missing_ct_tmp;
2126   int32_t het_ct_tmp;
2127   int32_t homcom_ct_tmp;
2128   int32_t homrar_ct_tmp;
2129   uint32_t marker_bidx2;
2130   uintptr_t homcom_delta;
2131   uintptr_t cur_cost;
2132   do {
2133     if (marker_idx_tmp == maxt_block_base2) {
2134       marker_idx_tmp = maxt_block_base3;
2135       loop_ceil = marker_idx;
2136     }
2137     for (; marker_idx_tmp < loop_ceil; marker_idx_tmp++) {
2138       if (ldrefs[marker_idx_tmp] != 65535) {
2139 	missing_ct_tmp = missing_cts[marker_idx_tmp];
2140 	homcom_ct_tmp = homcom_cts[marker_idx_tmp];
2141 	het_ct_tmp = het_cts[marker_idx_tmp];
2142 	homrar_ct_tmp = pheno_nm_ct - missing_ct_tmp - het_ct_tmp - homcom_ct_tmp;
2143 	homcom_delta = labs(((int32_t)homcom_ct) - homcom_ct_tmp);
2144 	cur_cost = labs(((int32_t)missing_ct) - missing_ct_tmp) + labs(((int32_t)homrar_ct) - homrar_ct_tmp) + labs(((int32_t)het_ct) - het_ct_tmp);
2145 	cur_cost = MAXV(homcom_delta, cur_cost);
2146 	if (cur_cost < best_cost) {
2147 	  marker_bidx2 = marker_idx_tmp - maxt_block_base;
2148 	  cur_cost = rem_cost(pheno_nm_ctv2, &(loadbuf[marker_bidx2 * pheno_nm_ctv2]), loadbuf_cur);
2149 	  if (cur_cost < best_cost) {
2150 	    *ldrefp = marker_bidx2;
2151 	    best_cost = cur_cost;
2152 	  }
2153 	}
2154       }
2155     }
2156   } while (marker_idx_tmp < marker_idx);
2157 }
2158 
2159 // multithread globals
2160 static double* g_orig_pvals;
2161 static double* g_orig_chisq;
2162 static double* g_mperm_save_all;
2163 
2164 // A separated-low-and-high-bit format was tried, and found to not really be
2165 // any better than the usual PLINK 2-bit format.
2166 static uintptr_t* g_loadbuf;
2167 
2168 static uint32_t* g_perm_vecst; // genotype indexing support
2169 static uint32_t* g_thread_git_wkspace;
2170 static uint32_t* g_resultbuf;
2171 
2172 // always use genotype indexing for QT --assoc
2173 static double* g_thread_git_qbufs;
2174 static double* g_qresultbuf;
2175 static double g_pheno_sum;
2176 static double g_pheno_ssq;
2177 static uint16_t* g_ldrefs;
2178 static double* g_orig_linsq; // square of Lin t-statistic
2179 
2180 // maximum number of precomputed table entries per marker
2181 static uint32_t g_precomp_width;
2182 // precomputed table contains entries for missing_cts ranging from
2183 //   g_precomp_start[marker_bidx] to
2184 //   (g_precomp_start[marker_bidx] + g_precomp_width - 1).
2185 static uint32_t g_precomp_start[MODEL_BLOCKSIZE];
2186 
2187 // Space for precomputed tables to accelerate permutation p-value computations.
2188 // The sizing and usage of this space varies depending on the permutation
2189 // analysis requested.  (The main objective here is to bring Fisher 2x2 exact
2190 // p-values to the masses.  There's a very minor chi-square speedup as well;
2191 // it's really only present because it allowed for simpler debugging of parts
2192 // of the Fisher logic.)
2193 //
2194 // In what follows,
2195 //   n := (g_precomp_width * marker_bidx) + missing_ct -
2196 //        g_precomp_start[marker_bidx].
2197 //
2198 // For --assoc perm/--model {dom|rec|trend} perm:
2199 //   g_precomp_ui[4n] and [4n + 1] define the interval with less extreme
2200 //     p-values than the original.  [4n + 2] and [4n + 3] define the
2201 //     interval with less or equally extreme p-values.
2202 //
2203 // For --assoc mperm fisher/--model {dom|rec} fisher:
2204 //   g_precomp_ui[6n]...[6n + 3] is as in --assoc perm.
2205 //   g_precomp_ui[6n + 4] and [6n + 5] are the floor and offset for the
2206 //     range of case_set_cts where Fisher p-value calculation is unnecessary.
2207 //   g_precomp_d[2n] and [2n + 1] are tot_prob and right_prob for
2208 //     fisher22_tail_pval().  (This is almost irrelevant.)
2209 //
2210 // For --assoc mperm/--model {dom|rec|trend} mperm:
2211 //   g_precomp_ui is as in --assoc mperm fisher.
2212 //   g_precomp_d[2n] and [2n + 1] are expm11 and recip_sum from
2213 //     chi22_get_coeffs()/ca_trend_get_coeffs().
2214 //
2215 // For --model perm-gen:
2216 //   No precomputation at all.
2217 //
2218 // For regular --model perm:
2219 //   g_precomp_ui[12n] to [12n + 3] cover the allelic test, [12n + 4] to
2220 //     [12n + 7] cover the dom test, and [12n + 8] to [12n + 11] cover rec.
2221 //     [12n + 4] is 0xffffffff if the dom and rec tests should be skipped.
2222 //
2223 // For regular --model mperm fisher:
2224 //   g_precomp_ui[18n] to [18n + 5] cover the allelic test, etc.
2225 //   g_precomp_d[6n] to [6n + 1] are fisher22_tail_pval() coefficients for the
2226 //     allelic test, etc.
2227 //
2228 // For regular --model mperm:
2229 //   g_precomp_ui as in --model mperm fisher.
2230 //   g_precomp_d[6n] and [6n + 1] are expm11 and recip_sum for the allelic
2231 //     test, etc.
2232 //
2233 static uint32_t* g_precomp_ui;
2234 static double* g_precomp_d;
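// Usage example (see assoc_adapt_thread() below): once a permutation yields
// case_set_ct and case_missing_ct, and case_missing_ct lies within the
// precomputed range, the permutation scores +2 (strictly more extreme than the
// original) when case_set_ct falls outside [g_precomp_ui[4n + 2],
// g_precomp_ui[4n + 3]), +1 (tie) when it falls outside
// [g_precomp_ui[4n], g_precomp_ui[4n + 1]) but inside the wider interval, and
// 0 otherwise; no p-value evaluation is needed in any of these cases.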
2235 
2236 // X-chromosome: number of missing allele observations per marker relative to
2237 //   the *all-female* case (so every male automatically contributes at least 1)
2238 // elsewhere: number of missing samples for each marker
2239 static uint32_t* g_missing_cts;
2240 
2241 static uint32_t* g_set_cts;
2242 static uint32_t* g_het_cts;
2243 static uint32_t* g_homcom_cts;
2244 
2245 // This is *twice* the number of successes, because PLINK 1.07 counts a tie as
2246 // 0.5.  (Actually, it randomizes instead of deterministically adding 0.5; that
2247 // randomization just adds noise, so we don't replicate it.)
2248 static uint32_t* g_perm_2success_ct;
2249 static uint32_t* g_perm_attempt_ct;
2250 static double* g_maxt_extreme_stat;
2251 static double* g_maxt_thread_results;
2252 
2253 // to avoid pathological multithreading issues, this is not a bitset
2254 static unsigned char* g_perm_adapt_stop;
2255 
2256 static uint32_t g_adapt_m_table[MODEL_BLOCKSIZE];
2257 static uintptr_t* g_sample_nonmale_include2;
2258 static uintptr_t* g_sample_male_include2;
2259 static uintptr_t* g_is_invalid_bitfield;
2260 static uint32_t g_model_fisher;
2261 static uint32_t g_fisher_midp;
2262 static uint32_t g_assoc_thread_ct;
2263 static uint32_t g_maxt_block_base;
2264 static uint32_t g_block_start;
2265 static uint32_t g_qblock_start;
2266 static uint32_t g_block_diff;
2267 static uint32_t g_perms_done;
2268 static uint32_t g_first_adapt_check;
2269 static uint32_t g_male_ct;
2270 static double g_adaptive_intercept;
2271 static double g_adaptive_slope;
2272 static double g_aperm_alpha;
2273 static double g_adaptive_ci_zt;
2274 static uint32_t g_is_x;
2275 static uint32_t g_is_y;
2276 
2277 // X, Y, MT.  note that X, and now MT as well, have max ploidy 2
2278 static uint32_t g_min_ploidy_1;
2279 
2280 static int32_t g_is_model_prec;
2281 
2282 static uint32_t* g_male_case_cts;
2283 
2284 THREAD_RET_TYPE assoc_adapt_thread(void* arg) {
2285   uintptr_t tidx = (uintptr_t)arg;
2286   uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
2287   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
2288   uintptr_t perm_vec_ct = g_perm_vec_ct;
2289   uint32_t assoc_thread_ct = g_assoc_thread_ct;
2290   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
2291   uint32_t model_fisher = g_model_fisher;
2292   uint32_t fisher_midp = g_fisher_midp;
2293   uint32_t precomp_width = g_precomp_width;
2294   uint32_t first_adapt_check = g_first_adapt_check;
2295   uint32_t case_ct = g_perm_case_ct;
2296   uintptr_t* __restrict__ male_vec = g_sample_male_include2;
2297   uintptr_t* __restrict__ nonmale_vec = g_sample_nonmale_include2;
2298   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
2299   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
2300   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
2301   unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
2302   double adaptive_intercept = g_adaptive_intercept;
2303   double adaptive_slope = g_adaptive_slope;
2304   double adaptive_ci_zt = g_adaptive_ci_zt;
2305   double aperm_alpha = g_aperm_alpha;
2306   uintptr_t* __restrict__ loadbuf;
2307   double* __restrict__ orig_pvals;
2308   double* __restrict__ orig_chisq;
2309   uint32_t* __restrict__ missing_cts;
2310   uint32_t* __restrict__ set_cts;
2311   uint32_t* __restrict__ precomp_start;
2312   uint32_t* __restrict__ precomp_ui;
2313   uint32_t* gpui;
2314   uintptr_t marker_idx;
2315   uintptr_t pidx;
2316   uint32_t marker_bidx;
2317   uint32_t marker_bceil;
2318   uint32_t min_ploidy_1;
2319   uint32_t is_x;
2320   uint32_t is_y;
2321   uint32_t success_2start;
2322   uint32_t success_2incr;
2323   uint32_t next_adapt_check;
2324   uint32_t min_ploidy;
2325   intptr_t row1x_sum;
2326   intptr_t col1_sum;
2327   intptr_t col2_sum;
2328   intptr_t tot_obs;
2329   uint32_t missing_start;
2330   uint32_t case_set_ct;
2331   uint32_t case_missing_ct;
2332   uint32_t uii;
2333   double stat_high;
2334   double stat_low;
2335   double pval;
2336   double dxx;
2337   double dyy;
2338   double dzz;
2339   while (1) {
2340     if (g_block_diff <= assoc_thread_ct) {
2341       if (g_block_diff <= tidx) {
2342         goto assoc_adapt_thread_skip_all;
2343       }
2344       marker_bidx = g_block_start + tidx;
2345       marker_bceil = marker_bidx + 1;
2346     } else {
2347       marker_bidx = g_block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
2348       marker_bceil = g_block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
2349     }
2350     min_ploidy_1 = g_min_ploidy_1;
2351     loadbuf = g_loadbuf;
2352     orig_pvals = g_orig_pvals;
2353     orig_chisq = g_orig_chisq;
2354     missing_cts = g_missing_cts;
2355     set_cts = g_set_cts;
2356     precomp_start = g_precomp_start;
2357     precomp_ui = g_precomp_ui;
2358     is_x = g_is_x;
2359     is_y = g_is_y;
2360     if (min_ploidy_1) {
2361       min_ploidy = 1;
2362     } else {
2363       min_ploidy = 2;
2364     }
2365     for (; marker_bidx < marker_bceil; marker_bidx++) {
2366       // guaranteed during loading that g_perm_adapt_stop[] is not set yet
2367       marker_idx = g_adapt_m_table[marker_bidx];
2368       next_adapt_check = first_adapt_check;
2369       col1_sum = set_cts[marker_idx];
2370       if (is_x) {
2371 	row1x_sum = 2 * case_ct;
2372 	tot_obs = 2 * pheno_nm_ct - missing_cts[marker_idx];
2373       } else {
2374 	row1x_sum = min_ploidy * case_ct;
2375 	tot_obs = min_ploidy * (pheno_nm_ct - missing_cts[marker_idx]);
2376       }
2377       col2_sum = tot_obs - col1_sum;
2378       missing_start = precomp_start[marker_bidx];
2379       gpui = &(precomp_ui[4 * precomp_width * marker_bidx]);
2380       success_2start = perm_2success_ct[marker_idx];
2381       success_2incr = 0;
2382       if (orig_pvals[marker_idx] == -9) {
2383         perm_adapt_stop[marker_idx] = 1;
2384         perm_attempt_ct[marker_idx] = next_adapt_check;
2385         perm_2success_ct[marker_idx] = next_adapt_check;
2386         continue;
2387       }
2388       if (model_fisher) {
2389 	stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
2390 	stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
2391       } else {
2392 	stat_high = orig_chisq[marker_idx] + EPSILON;
2393 	stat_low = orig_chisq[marker_idx] - EPSILON;
2394       }
2395       for (pidx = 0; pidx < perm_vec_ct;) {
2396 	if (!min_ploidy_1) {
2397 	  genovec_set_freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2398 	} else if (is_x) {
2399 	  genovec_set_freq_x(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), male_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2400 	} else if (!is_y) {
2401 	  genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &uii, &case_set_ct);
2402 	  case_missing_ct += uii;
2403 	} else {
2404 	  genovec_set_freq_y(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), nonmale_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2405 	}
2406 	// deliberate underflow
2407 	uii = (uint32_t)(case_missing_ct - missing_start);
2408 	if (uii < precomp_width) {
2409 	  if (case_set_ct < gpui[4 * uii]) {
2410 	    if (case_set_ct < gpui[4 * uii + 2]) {
2411 	      success_2incr += 2;
2412 	    } else {
2413 	      success_2incr++;
2414 	    }
2415 	  } else {
2416 	    if (case_set_ct >= gpui[4 * uii + 1]) {
2417 	      if (case_set_ct >= gpui[4 * uii + 3]) {
2418 		success_2incr += 2;
2419 	      } else {
2420 		success_2incr++;
2421 	      }
2422 	    }
2423 	  }
2424 	} else {
2425 	  uii = row1x_sum - case_missing_ct * min_ploidy; // row1_sum
2426 	  if (model_fisher) {
2427 	    dxx = fisher22(case_set_ct, uii - case_set_ct, col1_sum - case_set_ct, col2_sum + case_set_ct - uii, fisher_midp);
2428 	    if (dxx < stat_low) {
2429 	      success_2incr += 2;
2430 	    } else if (dxx <= stat_high) {
2431 	      success_2incr++;
2432 	    }
2433 	  } else {
2434 	    dxx = chi22_eval(case_set_ct, uii, col1_sum, tot_obs);
2435 	    if (dxx > stat_high) {
2436 	      success_2incr += 2;
2437 	    } else if (dxx > stat_low) {
2438 	      success_2incr++;
2439 	    }
2440 	  }
2441 	}
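	// Adaptive stopping check: every next_adapt_check permutations, estimate
	// the permutation p-value with a small pseudocount, form a
	// normal-approximation confidence interval of half-width
	// adaptive_ci_zt * sqrt(pval * (1 - pval) / n), and permanently stop
	// permuting this marker once the interval excludes aperm_alpha.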
2442 	if (++pidx == next_adapt_check - pidx_offset) {
2443 	  uii = success_2start + success_2incr;
2444 	  if (uii) {
2445 	    pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
2446 	    dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
2447 	    dyy = pval - dxx; // lower bound
2448 	    dzz = pval + dxx; // upper bound
2449 	    if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
2450 	      perm_adapt_stop[marker_idx] = 1;
2451 	      perm_attempt_ct[marker_idx] = next_adapt_check;
2452 	      break;
2453 	    }
2454 	  }
2455 	  next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
2456 	}
2457       }
2458       perm_2success_ct[marker_idx] += success_2incr;
2459     }
2460   assoc_adapt_thread_skip_all:
2461     if ((!tidx) || g_is_last_thread_block) {
2462       THREAD_RETURN;
2463     }
2464     THREAD_BLOCK_FINISH(tidx);
2465   }
2466 }
2467 
2468 THREAD_RET_TYPE assoc_maxt_thread(void* arg) {
2469   uintptr_t tidx = (uintptr_t)arg;
2470   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
2471   uintptr_t perm_vec_ct = g_perm_vec_ct;
2472   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
2473   uint32_t assoc_thread_ct = g_assoc_thread_ct;
2474   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
2475   uint32_t model_fisher = g_model_fisher;
2476   uint32_t fisher_midp = g_fisher_midp;
2477 
2478   // currently safe for this to be uint32_t since perm_vec_ct < 2^30
2479   uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
2480   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
2481   uint32_t* git_homrar_cts = nullptr;
2482   uint32_t* git_missing_cts = nullptr;
2483   uint32_t* git_het_cts = nullptr;
2484   uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
2485   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
2486   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
2487   uint32_t precomp_width = g_precomp_width;
2488   uint32_t case_ct = g_perm_case_ct;
2489   uintptr_t* __restrict__ male_vec = g_sample_male_include2;
2490   uintptr_t* __restrict__ nonmale_vec = g_sample_nonmale_include2;
2491   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
2492   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
2493   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
2494   double* __restrict__ mperm_save_all = g_mperm_save_all;
2495   double* msa_ptr = nullptr;
2496   uintptr_t* __restrict__ loadbuf;
2497   uint32_t* __restrict__ missing_cts;
2498   uint32_t* __restrict__ set_cts;
2499   uint32_t* __restrict__ het_cts;
2500   uint32_t* __restrict__ homcom_cts;
2501   uint32_t* __restrict__ precomp_start;
2502   uint32_t* __restrict__ precomp_ui;
2503   double* __restrict__ precomp_d;
2504   double* __restrict__ orig_pvals;
2505   double* __restrict__ orig_chisq;
2506   uint16_t* ldrefs;
2507   uintptr_t* loadbuf_cur;
2508   uint32_t* resultbuf;
2509   uint32_t* gpui;
2510   double* gpd;
2511   uintptr_t pidx;
2512   uintptr_t marker_idx;
2513   intptr_t row1x_sum;
2514   intptr_t col1_sum;
2515   intptr_t col2_sum;
2516   intptr_t tot_obs;
2517   uint32_t block_start;
2518   uint32_t maxt_block_base;
2519   uint32_t maxt_block_base2;
2520   uint32_t marker_bidx_start;
2521   uint32_t maxt_block_base3;
2522   uint32_t marker_bidx;
2523   uint32_t marker_bceil;
2524   uint32_t is_x;
2525   uint32_t is_x_or_y;
2526   uint32_t min_ploidy_1;
2527   uint32_t min_ploidy;
2528   uint32_t success_2incr;
2529   uint32_t missing_start;
2530   uint32_t case_set_ct;
2531   uint32_t case_missing_ct;
2532   uint32_t uii;
2533   uint32_t ujj;
2534   uint32_t ukk;
2535   double stat_high;
2536   double stat_low;
2537   double sval;
2538   uint32_t missing_ct;
2539   uint32_t het_ct;
2540   uint32_t homcom_ct;
2541   uint32_t ldref;
2542   while (1) {
2543     block_start = g_block_start;
2544     if (g_block_diff <= assoc_thread_ct) {
2545       if (g_block_diff <= tidx) {
2546         goto assoc_maxt_thread_skip_all;
2547       }
2548       marker_bidx_start = block_start + tidx;
2549       marker_bceil = marker_bidx_start + 1;
2550     } else {
2551       marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
2552       marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
2553     }
2554     maxt_block_base = g_maxt_block_base;
2555     maxt_block_base2 = maxt_block_base + block_start;
2556     maxt_block_base3 = maxt_block_base + marker_bidx_start;
2557     marker_bidx = marker_bidx_start;
2558     marker_idx = maxt_block_base3;
2559     is_x = g_is_x;
2560     is_x_or_y = is_x || g_is_y;
2561     min_ploidy_1 = g_min_ploidy_1;
2562     memcpy(results, &(g_maxt_extreme_stat[pidx_offset]), perm_vec_ct * sizeof(double));
2563     if (min_ploidy_1) {
2564       min_ploidy = 1;
2565     } else {
2566       min_ploidy = 2;
2567     }
2568     loadbuf = g_loadbuf;
2569     missing_cts = g_missing_cts;
2570     set_cts = g_set_cts;
2571     het_cts = g_het_cts;
2572     homcom_cts = g_homcom_cts;
2573     precomp_start = g_precomp_start;
2574     precomp_ui = g_precomp_ui;
2575     precomp_d = g_precomp_d;
2576     orig_pvals = g_orig_pvals;
2577     orig_chisq = g_orig_chisq;
2578     resultbuf = g_resultbuf;
2579     ldrefs = g_ldrefs;
2580 
2581     if (mperm_save_all) {
2582       msa_ptr = &(mperm_save_all[marker_idx * perm_vec_ct]);
2583     }
2584     for (; marker_bidx < marker_bceil; marker_bidx++) {
2585       if (orig_pvals[marker_idx] == -9) {
2586         if (msa_ptr) {
2587           for (pidx = 0; pidx < perm_vec_ct; pidx++) {
2588             *msa_ptr++ = -9;
2589           }
2590         }
2591         perm_2success_ct[marker_idx++] += perm_vec_ct;
2592         continue;
2593       }
2594       if (model_fisher) {
2595 	stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
2596 	stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
2597       } else {
2598 	stat_high = orig_chisq[marker_idx] + EPSILON;
2599 	stat_low = orig_chisq[marker_idx] - EPSILON;
2600       }
2601       gpd = &(precomp_d[2 * precomp_width * marker_bidx]);
2602       col1_sum = set_cts[marker_idx];
2603       missing_ct = missing_cts[marker_idx];
2604       if (is_x) {
2605 	row1x_sum = 2 * case_ct;
2606 	tot_obs = 2 * pheno_nm_ct - missing_ct;
2607       } else {
2608 	row1x_sum = min_ploidy * case_ct;
2609 	tot_obs = min_ploidy * (pheno_nm_ct - missing_ct);
2610       }
2611       col2_sum = tot_obs - col1_sum;
2612       gpui = &(precomp_ui[6 * precomp_width * marker_bidx]);
2613       missing_start = precomp_start[marker_bidx];
2614       success_2incr = 0;
2615       loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
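      // If no LD reference has been chosen for this marker yet (ldref ==
      // 65535), look for a nearby marker in the same block whose rem_cost()
      // relative to this one is below (pheno_nm_ct - homcom_ct - 50).  When
      // the marker remains its own reference, per-permutation counts are
      // computed from scratch with calc_git(); otherwise the reference
      // marker's counts are copied and patched with calc_rem(), which only
      // visits genotype differences.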
2616       if (!is_x_or_y) {
2617 	ldref = ldrefs[marker_idx];
2618 	if (!min_ploidy_1) {
2619 	  het_ct = het_cts[marker_idx];
2620 	  homcom_ct = (col1_sum - het_ct) / 2;
2621 	} else {
2622 	  het_ct = 0;
2623 	  homcom_ct = col1_sum;
2624 	}
2625 	git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
2626 	git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
2627 	git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
2628 	if (ldref == 65535) {
2629 	  ldref = marker_bidx;
2630 	  if (pheno_nm_ct - homcom_ct > 50) {
2631 	    check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 50, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
2632 	  }
2633 	  ldrefs[marker_idx] = ldref;
2634 	}
2635 	if (ldref == marker_bidx) {
2636 	  fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
2637 	  calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
2638 	  fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
2639 	} else {
2640 	  memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
2641 	  calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
2642 	}
2643       }
2644       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
2645 	if (!is_x_or_y) {
2646 	  if (!min_ploidy_1) {
2647 	    case_missing_ct = git_missing_cts[pidx];
2648 	    case_set_ct = row1x_sum - (git_het_cts[pidx] + 2 * (case_missing_ct + git_homrar_cts[pidx]));
2649 	  } else {
2650 	    case_missing_ct = git_missing_cts[pidx] + git_het_cts[pidx];
2651 	    case_set_ct = row1x_sum - case_missing_ct - git_homrar_cts[pidx];
2652 	  }
2653 	} else {
2654 	  if (is_x) {
2655 	    genovec_set_freq_x(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), male_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2656 	  } else {
2657 	    genovec_set_freq_y(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), nonmale_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2658 	  }
2659 	}
2660 	// deliberate underflow
2661 	uii = (uint32_t)(case_missing_ct - missing_start);
2662 	if (uii < precomp_width) {
2663 	  if (case_set_ct < gpui[6 * uii]) {
2664 	    if (case_set_ct < gpui[6 * uii + 2]) {
2665 	      success_2incr += 2;
2666 	    } else {
2667 	      success_2incr++;
2668 	    }
2669 	  } else {
2670 	    if (case_set_ct >= gpui[6 * uii + 1]) {
2671 	      if (case_set_ct >= gpui[6 * uii + 3]) {
2672 		success_2incr += 2;
2673 	      } else {
2674 		success_2incr++;
2675 	      }
2676 	    }
2677 	  }
2678 	  ukk = gpui[6 * uii + 4];
2679 	  ujj = (uint32_t)(case_set_ct - ukk); // deliberate underflow
2680 	  if (ujj >= gpui[6 * uii + 5]) {
2681 	    if (model_fisher) {
2682 	      ujj = row1x_sum - case_missing_ct * min_ploidy;
2683 	      // sval = fisher22(case_set_ct, ujj - case_set_ct, col1_sum - case_set_ct, col2_sum + case_set_ct - ujj);
2684 	      sval = fisher22_tail_pval(ukk, ujj - ukk, col1_sum - ukk, col2_sum + ukk - ujj, gpui[6 * uii + 5] - 1, gpd[2 * uii], gpd[2 * uii + 1], fisher_midp, case_set_ct);
2685 	      if (results[pidx] > sval) {
2686 		results[pidx] = sval;
2687 	      }
2688 	    } else {
2689 	      sval = ((double)((intptr_t)case_set_ct)) - gpd[2 * uii];
2690 	      sval = sval * sval * gpd[2 * uii + 1];
2691 	      if (results[pidx] < sval) {
2692 		results[pidx] = sval;
2693 	      }
2694 	    }
2695 	  }
2696 	} else {
2697 	  uii = row1x_sum - case_missing_ct * min_ploidy;
2698 	  if (model_fisher) {
2699 	    sval = fisher22(case_set_ct, uii - case_set_ct, col1_sum - case_set_ct, col2_sum + case_set_ct - uii, fisher_midp);
2700 	    if (sval < stat_low) {
2701 	      success_2incr += 2;
2702 	    } else if (sval <= stat_high) {
2703 	      success_2incr++;
2704 	    }
2705 	    if (results[pidx] > sval) {
2706 	      results[pidx] = sval;
2707 	    }
2708 	  } else {
2709 	    sval = chi22_eval(case_set_ct, uii, col1_sum, tot_obs);
2710 	    if (sval > stat_high) {
2711 	      success_2incr += 2;
2712 	    } else if (sval > stat_low) {
2713 	      success_2incr++;
2714 	    }
2715 	    if (results[pidx] < sval) {
2716 	      results[pidx] = sval;
2717 	    }
2718 	  }
2719 	  if (msa_ptr) {
2720 	    *msa_ptr++ = sval;
2721 	  }
2722 	}
2723       }
2724       perm_2success_ct[marker_idx++] += success_2incr;
2725     }
2726   assoc_maxt_thread_skip_all:
2727     if ((!tidx) || g_is_last_thread_block) {
2728       THREAD_RETURN;
2729     }
2730     THREAD_BLOCK_FINISH(tidx);
2731   }
2732 }
2733 
2734 THREAD_RET_TYPE assoc_set_thread(void* arg) {
2735   // Basically a simplified version of what assoc_maxt_thread() does; we save
2736   // chi-square stats for the given number of permutations for all still-active
2737   // variants.  Adaptive pruning, if applicable, happens outside this loop.
2738   //
2739   // LD-exploitation should be added if this sees significant usage.
2740   // (possible todo: permit Fisher test, converting p-values into equivalent
2741   // chi-square stats?)
2742   uintptr_t tidx = (uintptr_t)arg;
2743   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
2744   uint32_t assoc_thread_ct = g_assoc_thread_ct;
2745   uintptr_t perm_vec_ct = g_perm_vec_ct;
2746   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
2747   uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
2748   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
2749   uint32_t* git_homrar_cts = nullptr;
2750   uint32_t* git_missing_cts = nullptr;
2751   uint32_t* git_het_cts = nullptr;
2752   uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
2753   uint32_t* resultbuf = g_resultbuf;
2754   uint32_t case_ct = g_perm_case_ct;
2755   uintptr_t* __restrict__ male_vec = g_sample_male_include2;
2756   uintptr_t* __restrict__ nonmale_vec = g_sample_nonmale_include2;
2757   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
2758   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
2759   double* msa_ptr = nullptr;
2760   uintptr_t* loadbuf;
2761   uintptr_t* loadbuf_cur;
2762   uint32_t* __restrict__ missing_cts;
2763   uint32_t* __restrict__ set_allele_cts;
2764   uintptr_t pidx;
2765   uintptr_t marker_idx;
2766   intptr_t row1x_sum;
2767   intptr_t col1_sum;
2768   intptr_t tot_obs;
2769   uint32_t block_start;
2770   uint32_t marker_bidx_start;
2771   uint32_t marker_bidx;
2772   uint32_t marker_bceil;
2773   uint32_t is_x;
2774   uint32_t is_x_or_y;
2775   uint32_t min_ploidy_1;
2776   uint32_t min_ploidy;
2777   uint32_t case_set_ct;
2778   uint32_t case_missing_ct;
2779   uint32_t missing_ct;
2780   while (1) {
2781     block_start = g_block_start;
2782     if (g_block_diff <= assoc_thread_ct) {
2783       if (g_block_diff <= tidx) {
2784 	goto assoc_set_thread_skip_all;
2785       }
2786       marker_bidx_start = block_start + tidx;
2787       marker_bceil = marker_bidx_start + 1;
2788     } else {
2789       marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
2790       marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
2791     }
2792     marker_bidx = marker_bidx_start;
2793     is_x = g_is_x;
2794     is_x_or_y = is_x || g_is_y;
2795     min_ploidy_1 = g_min_ploidy_1;
2796     min_ploidy = 2;
2797     if (min_ploidy_1) {
2798       min_ploidy = 1;
2799     }
2800     loadbuf = g_loadbuf;
2801     missing_cts = g_missing_cts;
2802     set_allele_cts = g_set_cts;
2803     for (; marker_bidx < marker_bceil; marker_bidx++) {
2804       marker_idx = g_adapt_m_table[marker_bidx];
2805       msa_ptr = &(g_mperm_save_all[marker_bidx * perm_vec_ct]);
2806       col1_sum = set_allele_cts[marker_idx];
2807       missing_ct = missing_cts[marker_idx];
2808       if (is_x) {
2809 	row1x_sum = 2 * case_ct;
2810 	tot_obs = 2 * pheno_nm_ct - missing_ct;
2811       } else {
2812 	row1x_sum = min_ploidy * case_ct;
2813 	tot_obs = min_ploidy * (pheno_nm_ct - missing_ct);
2814       }
2815       loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
2816       if (!is_x_or_y) {
2817 	git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
2818 	git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
2819 	git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
2820 	fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
2821 	calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
2822 	fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
2823       }
2824       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
2825 	if (!is_x_or_y) {
2826 	  if (!min_ploidy_1) {
2827 	    case_missing_ct = git_missing_cts[pidx];
2828 	    case_set_ct = row1x_sum - (git_het_cts[pidx] + 2 * (case_missing_ct + git_homrar_cts[pidx]));
2829 	  } else {
2830 	    case_missing_ct = git_missing_cts[pidx] + git_het_cts[pidx];
2831 	    case_set_ct = row1x_sum - case_missing_ct - git_homrar_cts[pidx];
2832 	  }
2833 	} else {
2834 	  if (is_x) {
2835 	    genovec_set_freq_x(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), male_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2836 	  } else {
2837 	    genovec_set_freq_y(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), nonmale_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2838 	  }
2839 	}
	// Fisher's exact test is not supported here since we are summing raw
	// chi-square stats, so there is little to gain from precomputation.
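	// chi22_eval() receives the observed case set-allele count together
	// with its row total (observed case alleles), column total (overall
	// set-allele count), and the total observed allele count, i.e. the
	// standard 2x2 allelic chi-square.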
2842 	*msa_ptr++ = chi22_eval(case_set_ct, row1x_sum - case_missing_ct * min_ploidy, col1_sum, tot_obs);
2843       }
2844     }
2845   assoc_set_thread_skip_all:
2846     if ((!tidx) || g_is_last_thread_block) {
2847       THREAD_RETURN;
2848     }
2849     THREAD_BLOCK_FINISH(tidx);
2850   }
2851 }
2852 
THREAD_RET_TYPE qassoc_adapt_thread(void* arg) {
2854   uintptr_t tidx = (uintptr_t)arg;
2855   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
2856   uintptr_t perm_vec_ct = g_perm_vec_ct;
2857   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
2858   uint32_t first_adapt_check = g_first_adapt_check;
2859   uint32_t max_thread_ct = g_assoc_thread_ct;
2860   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
2861   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
2862   double* git_qt_g_prod = &(g_thread_git_qbufs[perm_vec_ctcl8m * tidx * 3]);
2863   double* git_qt_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 1)]);
2864   double* git_qt_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 2)]);
2865   double* __restrict__ perm_vecstd = g_perm_vecstd;
2866   unsigned char* perm_adapt_stop = g_perm_adapt_stop;
2867   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
2868   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
2869   double adaptive_intercept = g_adaptive_intercept;
2870   double adaptive_slope = g_adaptive_slope;
2871   double adaptive_ci_zt = g_adaptive_ci_zt;
2872   double aperm_alpha = g_aperm_alpha;
2873   double pheno_sum = g_pheno_sum;
2874   double pheno_ssq = g_pheno_ssq;
2875   uint32_t* __restrict__ missing_cts;
2876   uint32_t* __restrict__ het_cts;
2877   uint32_t* __restrict__ homcom_cts;
2878   uintptr_t* __restrict__ loadbuf;
2879   double* __restrict__ orig_chiabs;
2880   uintptr_t next_cqg;
2881   uintptr_t marker_idx;
2882   uintptr_t pidx;
2883   uintptr_t ulii;
2884   intptr_t geno_sum;
2885   intptr_t geno_ssq;
2886   uint32_t marker_bidx;
2887   uint32_t marker_bceil;
2888   uint32_t missing_ct;
2889   uint32_t het_ct;
2890   uint32_t homcom_ct;
2891   uint32_t homrar_ct;
2892   uint32_t nanal;
2893   uint32_t cur_thread_ct;
2894   uint32_t next_adapt_check;
2895   uint32_t success_2start;
2896   uint32_t success_2incr;
2897   uint32_t uii;
2898   double nanal_recip;
2899   double nanal_m1_recip;
2900   double geno_mean;
2901   double geno_var;
2902   double qt_sum;
2903   double qt_ssq;
2904   double qt_g_prod;
2905   double qt_mean;
2906   double qt_var;
2907   double qt_g_covar;
2908   double beta;
2909   double betasq;
2910   double dxx;
2911   double dyy;
2912   double dzz;
2913   double stat_high;
2914   double stat_low;
2915   double sval;
2916   while (1) {
2917     cur_thread_ct = g_block_diff / CACHELINE_DBL;
2918     if (cur_thread_ct > max_thread_ct) {
2919       cur_thread_ct = max_thread_ct;
2920     } else if (!cur_thread_ct) {
2921       cur_thread_ct = 1;
2922     }
2923     if (cur_thread_ct <= tidx) {
2924       goto qassoc_adapt_thread_skip_all;
2925     }
2926     marker_bidx = g_qblock_start + (((uint64_t)tidx) * g_block_diff) / cur_thread_ct;
2927     marker_bceil = g_qblock_start + (((uint64_t)tidx + 1) * g_block_diff) / cur_thread_ct;
2928     loadbuf = g_loadbuf;
2929     missing_cts = g_missing_cts;
2930     het_cts = g_het_cts;
2931     homcom_cts = g_homcom_cts;
2932     orig_chiabs = g_orig_chisq;
2933     for (; marker_bidx < marker_bceil; marker_bidx++) {
2934       marker_idx = g_adapt_m_table[marker_bidx];
2935       next_adapt_check = first_adapt_check;
2936       missing_ct = missing_cts[marker_idx];
2937       nanal = pheno_nm_ct - missing_ct;
2938       homcom_ct = homcom_cts[marker_idx];
2939       het_ct = het_cts[marker_idx];
2940       homrar_ct = nanal - het_ct - homcom_ct;
2941       if ((nanal < 3) || (homcom_ct == nanal) || (het_ct == nanal) || (homrar_ct == nanal)) {
2942 	// the current code might otherwise report a spurious association if
2943 	// geno_var is zero, so we explicitly check for it here.
2944 	perm_adapt_stop[marker_idx] = 1;
2945 	perm_attempt_ct[marker_idx] = 0;
2946 	continue;
2947       }
2948       sval = orig_chiabs[marker_idx];
2949       // tstat = beta / vbeta_sqrt
2950       // tstat^2 = beta * beta / vbeta;
2951       //         = beta^2 * (nanal - 2) / ((qt_var / geno_var) - beta^2)
2952       // [stop here for max(T) since nanal varies across markers]
2953       // tstat^2 / (nanal - 2) = beta^2 / ((qt_var / geno_var) - beta^2)
2954       //                       = beta^2 * geno_var / (qt_var - beta^2 * geno_var)
2955       // Larger values of this last statistic monotonically result in smaller
2956       // P-values, so this is what we use for comparison (this saves a few
2957       // floating point operations at the end).
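      // The stored |tstat| is converted here to tstat^2 / (nanal - 2); the
      // permutation loop below forms the equivalent quantity as
      // betasq / (qt_var / geno_var - betasq), so the two can be compared
      // directly without recovering a t-statistic per permutation.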
2958       sval = sval * sval / ((double)(((int32_t)nanal) - 2));
2959       stat_high = sval + EPSILON;
2960       stat_low = sval - EPSILON;
2961       geno_sum = 2 * homrar_ct + het_ct;
2962       geno_ssq = 4 * homrar_ct + het_ct;
2963       nanal_recip = 1.0 / ((double)((int32_t)nanal));
2964       nanal_m1_recip = 1.0 / ((double)(((int32_t)nanal) - 1));
2965       geno_mean = ((double)geno_sum) * nanal_recip;
2966       geno_var = (((double)geno_ssq) - geno_sum * geno_mean) * nanal_m1_recip;
2967       success_2start = perm_2success_ct[marker_idx];
2968       success_2incr = 0;
2969       next_cqg = 0;
2970       for (pidx = 0; pidx < perm_vec_ct;) {
2971 	if (pidx == next_cqg) {
2972 	  next_cqg = next_adapt_check;
2973 	  ulii = pidx + pidx_offset;
2974 	  if (next_cqg < ulii + (ulii >> 2)) {
2975 	    // increase ~25% at a time
2976 	    next_cqg = ulii + (ulii >> 2);
2977 	  }
2978 	  next_cqg -= pidx_offset;
2979 	  next_cqg = round_up_pow2(next_cqg, CACHELINE_DBL);
2980 	  if (next_cqg > perm_vec_ct) {
2981 	    next_cqg = perm_vec_ct;
2982 	  }
2983 	  calc_qgit(pheno_nm_ct, perm_vec_ctcl8m, next_cqg - pidx, &(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecstd[pidx]), &(git_qt_g_prod[pidx]));
2984 	}
2985 	qt_sum = pheno_sum - git_qt_sum[pidx];
2986 	qt_ssq = pheno_ssq - git_qt_ssq[pidx];
2987 	qt_g_prod = git_qt_g_prod[pidx];
2988 	qt_mean = qt_sum * nanal_recip;
2989 	qt_var = (qt_ssq - qt_sum * qt_mean) * nanal_m1_recip;
2990 	qt_g_covar = (qt_g_prod - qt_sum * geno_mean) * nanal_m1_recip;
2991 	dxx = 1.0 / geno_var;
2992 	beta = qt_g_covar * dxx;
2993 	betasq = beta * beta;
2994 	sval = betasq / (qt_var * dxx - betasq);
2995 	if (sval > stat_high) {
2996 	  success_2incr += 2;
2997 	} else if (sval > stat_low) {
2998 	  success_2incr++;
2999 	}
3000 	if (++pidx == next_adapt_check - pidx_offset) {
3001 	  uii = success_2start + success_2incr;
3002 	  if (uii) {
3003 	    sval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
3004 	    dxx = adaptive_ci_zt * sqrt(sval * (1 - sval) / ((int32_t)next_adapt_check));
3005 	    dyy = sval - dxx; // lower bound
3006 	    dzz = sval + dxx; // upper bound
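	    // Permutation for this marker stops as soon as aperm_alpha falls
	    // outside [dyy, dzz]; see the standalone sketch just before
	    // qassoc_adapt_lin_thread() below.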
3007 	    if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
3008 	      perm_adapt_stop[marker_idx] = 1;
3009 	      perm_attempt_ct[marker_idx] = next_adapt_check;
3010 	      fill_double_zero(next_cqg, git_qt_g_prod);
3011 	      fill_double_zero(next_cqg, git_qt_sum);
3012 	      fill_double_zero(next_cqg, git_qt_ssq);
3013 	      goto qassoc_adapt_thread_lesszero;
3014 	    }
3015 	  }
3016 	  next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
3017 	}
3018       }
3019       fill_double_zero(perm_vec_ctcl8m * 3, git_qt_g_prod);
3020     qassoc_adapt_thread_lesszero:
3021       perm_2success_ct[marker_idx] += success_2incr;
3022     }
3023   qassoc_adapt_thread_skip_all:
3024     if ((!tidx) || g_is_last_thread_block) {
3025       THREAD_RETURN;
3026     }
3027     THREAD_BLOCK_FINISH(tidx);
3028   }
3029 }
3030 
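// Illustrative sketch, not part of the original source: the adaptive
// early-termination rule that qassoc_adapt_thread() and the other *_adapt_*
// threads inline above, factored into a standalone helper for exposition.
// The helper name and signature are hypothetical; it relies only on sqrt()
// from <math.h>, which this file already uses.  Returns 1 when aperm_alpha
// lies outside an approximate confidence interval for the permutation
// p-value (success counts use the file's 2x encoding: +2 per clear
// exceedance, +1 per tie).
static inline uint32_t aperm_early_stop_sketch(uint32_t success_2ct, uint32_t perms_done, double ci_zt, double alpha) {
  // point estimate of the permutation p-value, with a small-sample offset
  double pval = ((double)((int32_t)success_2ct + 2)) / ((double)(2 * ((int32_t)perms_done + 1)));
  // half-width of the normal-approximation confidence interval
  double halfwidth = ci_zt * sqrt(pval * (1 - pval) / ((int32_t)perms_done));
  // prune when alpha is clearly above or below the interval
  return ((pval - halfwidth) > alpha) || ((pval + halfwidth) < alpha);
}
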
THREAD_RET_TYPE qassoc_adapt_lin_thread(void* arg) {
3032   uintptr_t tidx = (uintptr_t)arg;
3033   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
3034   uintptr_t perm_vec_ct = g_perm_vec_ct;
3035   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
3036   uint32_t first_adapt_check = g_first_adapt_check;
3037   uint32_t max_thread_ct = g_assoc_thread_ct;
3038   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
3039   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
3040   double* git_qt_het_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * tidx * 6]);
3041   double* git_qt_het_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 1)]);
3042   double* git_qt_homrar_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 2)]);
3043   double* git_qt_homrar_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 3)]);
3044   double* git_qt_missing_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 4)]);
3045   double* git_qt_missing_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 5)]);
3046   double* __restrict__ perm_vecstd = g_perm_vecstd;
3047   unsigned char* perm_adapt_stop = g_perm_adapt_stop;
3048   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
3049   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
3050   double adaptive_intercept = g_adaptive_intercept;
3051   double adaptive_slope = g_adaptive_slope;
3052   double adaptive_ci_zt = g_adaptive_ci_zt;
3053   double aperm_alpha = g_aperm_alpha;
3054   double pheno_sum = g_pheno_sum;
3055   double pheno_ssq = g_pheno_ssq;
3056   uint32_t* __restrict__ missing_cts;
3057   uint32_t* __restrict__ het_cts;
3058   uint32_t* __restrict__ homcom_cts;
3059   uintptr_t* __restrict__ loadbuf;
3060   double* __restrict__ orig_linsq;
3061   uintptr_t next_cqg;
3062   uintptr_t marker_idx;
3063   uintptr_t pidx;
3064   uintptr_t ulii;
3065   intptr_t geno_sum;
3066   uint32_t marker_bidx;
3067   uint32_t marker_bceil;
3068   uint32_t missing_ct;
3069   uint32_t het_ct;
3070   uint32_t homcom_ct;
3071   uint32_t homrar_ct;
3072   uint32_t nanal;
3073   uint32_t cur_thread_ct;
3074   double het_ctd;
3075   double homrar_ctd;
3076   double nanal_recip;
3077   double geno_mean;
3078   double geno_mean_sq;
3079   double geno_mean_coeff2;
3080   double geno_mean_coeff3;
3081   double qt_sum;
3082   double qt_ssq;
3083   double qt_het_sum;
3084   double qt_het_ssq;
3085   double qt_homrar_sum;
3086   double qt_homrar_ssq;
3087   double qt_g_prod;
3088   double qt_mean;
3089   double qt_g_prod_centered;
3090   double dxx;
3091   double dyy;
3092   double dzz;
3093   uint32_t next_adapt_check;
3094   uint32_t success_2start;
3095   uint32_t success_2incr;
3096   uint32_t uii;
3097   double stat_high;
3098   double stat_low;
3099   double sval;
3100   while (1) {
3101     cur_thread_ct = g_block_diff / CACHELINE_DBL;
3102     if (cur_thread_ct > max_thread_ct) {
3103       cur_thread_ct = max_thread_ct;
3104     } else if (!cur_thread_ct) {
3105       cur_thread_ct = 1;
3106     }
3107     if (cur_thread_ct <= tidx) {
3108       goto qassoc_adapt_lin_thread_skip_all;
3109     }
3110     marker_bidx = g_qblock_start + (((uint64_t)tidx) * g_block_diff) / cur_thread_ct;
3111     marker_bceil = g_qblock_start + (((uint64_t)tidx + 1) * g_block_diff) / cur_thread_ct;
3112     loadbuf = g_loadbuf;
3113     missing_cts = g_missing_cts;
3114     het_cts = g_het_cts;
3115     homcom_cts = g_homcom_cts;
3116     orig_linsq = g_orig_linsq;
3117     for (; marker_bidx < marker_bceil; marker_bidx++) {
3118       marker_idx = g_adapt_m_table[marker_bidx];
3119       next_adapt_check = first_adapt_check;
3120       missing_ct = missing_cts[marker_idx];
3121       nanal = pheno_nm_ct - missing_ct;
3122       homcom_ct = homcom_cts[marker_idx];
3123       het_ct = het_cts[marker_idx];
3124       if ((nanal < 3) || (homcom_ct == nanal) || (het_ct == nanal)) {
3125 	perm_adapt_stop[marker_idx] = 1;
3126 	perm_attempt_ct[marker_idx] = 0;
3127 	continue;
3128       }
3129       homrar_ct = nanal - het_ct - homcom_ct;
3130       sval = orig_linsq[marker_idx];
3131       stat_high = sval + EPSILON;
3132       stat_low = sval - EPSILON;
3133       geno_sum = 2 * homrar_ct + het_ct;
3134       nanal_recip = 1.0 / ((double)((int32_t)nanal));
3135       het_ctd = het_ct;
3136       homrar_ctd = homrar_ct;
3137       geno_mean = ((double)geno_sum) * nanal_recip;
3138       geno_mean_sq = geno_mean * geno_mean;
3139       geno_mean_coeff2 = 1 - 2 * geno_mean;
3140       geno_mean_coeff3 = 4 - 4 * geno_mean;
3141       success_2start = perm_2success_ct[marker_idx];
3142       success_2incr = 0;
3143       next_cqg = 0;
3144       for (pidx = 0; pidx < perm_vec_ct;) {
3145 	if (pidx == next_cqg) {
3146 	  next_cqg = next_adapt_check;
3147 	  ulii = pidx + pidx_offset;
3148 	  if (next_cqg < ulii + (ulii >> 2)) {
3149 	    // increase ~25% at a time
3150 	    next_cqg = ulii + (ulii >> 2);
3151 	  }
3152 	  next_cqg -= pidx_offset;
3153 	  next_cqg = round_up_pow2(next_cqg, CACHELINE_DBL);
3154 	  if (next_cqg > perm_vec_ct) {
3155 	    next_cqg = perm_vec_ct;
3156 	  }
3157 	  calc_qgit_lin(pheno_nm_ct, perm_vec_ctcl8m, next_cqg - pidx, &(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecstd[pidx]), &(git_qt_het_sum[pidx]));
3158 	}
3159 	qt_sum = pheno_sum - git_qt_missing_sum[pidx];
3160 	qt_ssq = pheno_ssq - git_qt_missing_ssq[pidx];
3161 	qt_het_sum = git_qt_het_sum[pidx];
3162 	qt_het_ssq = git_qt_het_ssq[pidx];
3163 	qt_homrar_sum = git_qt_homrar_sum[pidx];
3164 	qt_homrar_ssq = git_qt_homrar_ssq[pidx];
3165 	qt_g_prod = qt_het_sum + 2 * qt_homrar_sum;
3166 	qt_mean = qt_sum * nanal_recip;
3167 	qt_g_prod_centered = qt_g_prod - qt_sum * geno_mean;
3168 	sval = qt_g_prod_centered * qt_g_prod_centered / (geno_mean_sq * (qt_ssq + (qt_mean - 2) * qt_sum) + geno_mean_coeff2 * (qt_het_ssq + qt_mean * (qt_mean * het_ctd - 2 * qt_het_sum)) + geno_mean_coeff3 * (qt_homrar_ssq + qt_mean * (qt_mean * homrar_ctd - 2 * qt_homrar_sum)));
3169 	if (sval > stat_high) {
3170 	  success_2incr += 2;
3171 	} else if (sval > stat_low) {
3172 	  success_2incr++;
3173 	}
3174 	if (++pidx == next_adapt_check - pidx_offset) {
3175 	  uii = success_2start + success_2incr;
3176 	  if (uii) {
3177 	    sval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
3178 	    dxx = adaptive_ci_zt * sqrt(sval * (1 - sval) / ((int32_t)next_adapt_check));
3179 	    dyy = sval - dxx;
3180 	    dzz = sval + dxx;
3181 	    if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
3182 	      perm_adapt_stop[marker_idx] = 1;
3183 	      perm_attempt_ct[marker_idx] = next_adapt_check;
3184 	      fill_double_zero(next_cqg, git_qt_het_sum);
3185 	      fill_double_zero(next_cqg, git_qt_het_ssq);
3186 	      fill_double_zero(next_cqg, git_qt_homrar_sum);
3187 	      fill_double_zero(next_cqg, git_qt_homrar_ssq);
3188 	      fill_double_zero(next_cqg, git_qt_missing_sum);
3189 	      fill_double_zero(next_cqg, git_qt_missing_ssq);
3190 	      goto qassoc_adapt_lin_thread_lesszero;
3191 	    }
3192 	  }
3193 	  next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
3194 	}
3195       }
3196       fill_double_zero(perm_vec_ctcl8m * 6, git_qt_het_sum);
3197     qassoc_adapt_lin_thread_lesszero:
3198       perm_2success_ct[marker_idx] += success_2incr;
3199     }
3200   qassoc_adapt_lin_thread_skip_all:
3201     if ((!tidx) || g_is_last_thread_block) {
3202       THREAD_RETURN;
3203     }
3204     THREAD_BLOCK_FINISH(tidx);
3205   }
3206 }
3207 
THREAD_RET_TYPE qassoc_maxt_thread(void* arg) {
3209   uintptr_t tidx = (uintptr_t)arg;
3210   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
3211   uintptr_t perm_vec_ct = g_perm_vec_ct;
3212   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
3213   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
3214   uint32_t max_thread_ct = g_assoc_thread_ct;
3215   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
3216   double* __restrict__ perm_vecstd = g_perm_vecstd;
3217   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
3218   double* msa_ptr = nullptr;
3219   double pheno_sum = g_pheno_sum;
3220   double pheno_ssq = g_pheno_ssq;
3221   double* git_qt_g_prod;
3222   double* git_qt_sum;
3223   double* git_qt_ssq;
3224   double* qresultbuf;
3225   double* __restrict__ orig_chiabs;
3226   uintptr_t* loadbuf;
3227   uint32_t* __restrict__ missing_cts;
3228   uint32_t* __restrict__ het_cts;
3229   uint32_t* __restrict__ homcom_cts;
3230   uint16_t* ldrefs;
3231   uintptr_t* loadbuf_cur;
3232   uintptr_t pidx;
3233   uintptr_t marker_idx;
3234   uint32_t qblock_start;
3235   uint32_t maxt_block_base;
3236   uint32_t maxt_block_base2;
3237   uint32_t marker_bidx_start;
3238   uint32_t maxt_block_base3;
3239   uint32_t marker_bidx;
3240   uint32_t marker_bceil;
3241   uint32_t marker_bidx2;
3242   uint32_t missing_ct;
3243   uint32_t het_ct;
3244   uint32_t homcom_ct;
3245   uint32_t homrar_ct;
3246   intptr_t geno_sum;
3247   intptr_t geno_ssq;
3248   uint32_t nanal;
3249   double nanal_recip;
3250   double nanal_m1_recip;
3251   double geno_mean;
3252   double geno_var;
3253   double qt_sum;
3254   double qt_ssq;
3255   double qt_g_prod;
3256   double qt_mean;
3257   double qt_var;
3258   double qt_g_covar;
3259   double nanal_m2d;
3260   double beta;
3261   double betasq;
3262   double dxx;
3263   uint32_t success_2incr;
3264   double stat_high;
3265   double stat_low;
3266   double sval;
3267   uintptr_t best_cost;
3268   uint32_t cur_thread_ct;
3269   uint32_t marker_idx_tmp;
3270   int32_t missing_ct_tmp;
3271   int32_t het_ct_tmp;
3272   int32_t homcom_ct_tmp;
3273   int32_t homrar_ct_tmp;
3274   uint32_t loop_ceil;
3275   uintptr_t cur_cost;
3276   uint32_t ldref;
3277   while (1) {
3278     cur_thread_ct = g_block_diff / CACHELINE_DBL;
3279     if (cur_thread_ct > max_thread_ct) {
3280       cur_thread_ct = max_thread_ct;
3281     } else if (!cur_thread_ct) {
3282       cur_thread_ct = 1;
3283     }
3284     if (cur_thread_ct <= tidx) {
3285       goto qassoc_maxt_thread_skip_all;
3286     }
3287     qblock_start = g_qblock_start;
3288     maxt_block_base = g_maxt_block_base;
3289     maxt_block_base2 = maxt_block_base + qblock_start;
3290     marker_bidx_start = qblock_start + (((uint64_t)tidx) * g_block_diff) / cur_thread_ct;
3291     maxt_block_base3 = maxt_block_base + marker_bidx_start;
3292     marker_bidx = marker_bidx_start;
3293     marker_idx = maxt_block_base3;
3294     marker_bceil = qblock_start + (((uint64_t)tidx + 1) * g_block_diff) / cur_thread_ct;
3295     memcpy(results, &(g_maxt_extreme_stat[g_perms_done - perm_vec_ct]), perm_vec_ct * sizeof(double));
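    // Each thread updates a private copy of the running max(T) statistics;
    // the per-thread maxima are presumably folded back into
    // g_maxt_extreme_stat by the caller once the block completes.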
3296     if (g_mperm_save_all) {
3297       msa_ptr = &(g_mperm_save_all[marker_idx * perm_vec_ct]);
3298     }
3299     loadbuf = g_loadbuf;
3300     qresultbuf = g_qresultbuf;
3301     orig_chiabs = g_orig_chisq;
3302     missing_cts = g_missing_cts;
3303     het_cts = g_het_cts;
3304     homcom_cts = g_homcom_cts;
3305     ldrefs = g_ldrefs;
3306     for (; marker_bidx < marker_bceil; marker_bidx++) {
3307       missing_ct = missing_cts[marker_idx];
3308       nanal = pheno_nm_ct - missing_ct;
3309       homcom_ct = homcom_cts[marker_idx];
3310       het_ct = het_cts[marker_idx];
3311       if ((nanal < 3) || (homcom_ct == nanal) || (het_ct == nanal)) {
3312 	perm_2success_ct[marker_idx++] += perm_vec_ct;
3313 	if (msa_ptr) {
3314 	  for (pidx = 0; pidx < perm_vec_ct; pidx++) {
3315 	    *msa_ptr++ = -9;
3316 	  }
3317 	}
3318 	continue;
3319       }
3320       homrar_ct = nanal - het_ct - homcom_ct;
3321       sval = orig_chiabs[marker_idx];
3322       sval = sval * sval;
3323       stat_high = sval + EPSILON;
3324       stat_low = sval - EPSILON;
3325       geno_sum = 2 * homrar_ct + het_ct;
3326       geno_ssq = 4 * homrar_ct + het_ct;
3327       nanal_recip = 1.0 / ((double)((int32_t)nanal));
3328       nanal_m1_recip = 1.0 / ((double)(((int32_t)nanal) - 1));
3329       nanal_m2d = nanal - 2;
3330       geno_mean = ((double)geno_sum) * nanal_recip;
3331       geno_var = (((double)geno_ssq) - geno_sum * geno_mean) * nanal_m1_recip;
3332       success_2incr = 0;
3333       git_qt_g_prod = &(qresultbuf[3 * marker_bidx * perm_vec_ctcl8m]);
3334       git_qt_sum = &(git_qt_g_prod[perm_vec_ctcl8m]);
3335       git_qt_ssq = &(git_qt_g_prod[2 * perm_vec_ctcl8m]);
3336       loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
3337       ldref = ldrefs[marker_idx];
3338       if (ldref == 65535) {
3339 	// Addition loops required for genotype indexing:
3340 	//   het_ct + homrar_ct + 2 * missing_ct
3341 	//
3342 	// Addition/initial copy loops required for LD exploitation:
3343 	//   3 + 3 * (missing <-> homrar/het) + 2 * (missing <-> homcom) +
3344 	//   (homrar <-> het/homcom) + (het <-> homcom)
3345 	// Simple lower bound (may allow us to skip full LD cost calculation):
3346 	//   (delta(homrar) + 2*delta(missing) + delta(het) + delta(homcom)) / 2
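	// The cur_cost expression below implements this bound: |delta(missing)|
	// plus half the remaining genotype-count deltas, with the +7 apparently
	// covering the fixed initial-copy loops and rounding up; only when the
	// bound beats best_cost is the exact qrem_cost2() comparison run.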
3347 	best_cost = het_ct + homrar_ct + 2 * missing_ct;
3348 	ldref = marker_bidx;
3349 	marker_idx_tmp = maxt_block_base;
3350 	loop_ceil = maxt_block_base2;
3351 	do {
3352 	  if (marker_idx_tmp == maxt_block_base2) {
3353 	    marker_idx_tmp = maxt_block_base3;
3354 	    loop_ceil = marker_idx;
3355 	  }
3356 	  for (; marker_idx_tmp < loop_ceil; marker_idx_tmp++) {
3357 	    if (ldrefs[marker_idx_tmp] != 65535) {
3358 	      missing_ct_tmp = missing_cts[marker_idx_tmp];
3359 	      homcom_ct_tmp = homcom_cts[marker_idx_tmp];
3360 	      het_ct_tmp = het_cts[marker_idx_tmp];
3361 	      homrar_ct_tmp = pheno_nm_ct - missing_ct_tmp - het_ct_tmp - homcom_ct_tmp;
3362 	      cur_cost = labs(((int32_t)missing_ct) - missing_ct_tmp) + (labs(((int32_t)homrar_ct) - homrar_ct_tmp) + labs(((int32_t)het_ct) - het_ct_tmp) + labs(((int32_t)homcom_ct) - homcom_ct_tmp) + 7) / 2;
3363 	      if (cur_cost < best_cost) {
3364 		marker_bidx2 = marker_idx_tmp - maxt_block_base;
3365 		cur_cost = qrem_cost2(pheno_nm_ctv2, &(loadbuf[marker_bidx2 * pheno_nm_ctv2]), loadbuf_cur);
3366 		if (cur_cost < best_cost) {
3367 		  ldref = marker_bidx2;
3368 		  best_cost = cur_cost;
3369 		}
3370 	      }
3371 	    }
3372 	  }
3373 	} while (marker_idx_tmp < marker_idx);
3374 	ldrefs[marker_idx] = ldref;
3375       }
3376       if (ldref == marker_bidx) {
3377 	fill_double_zero(perm_vec_ctcl8m * 3, git_qt_g_prod);
3378 	calc_qgit(pheno_nm_ct, perm_vec_ctcl8m, perm_vec_ct, loadbuf_cur, perm_vecstd, git_qt_g_prod);
3379       } else {
3380 	memcpy(git_qt_g_prod, &(qresultbuf[3 * ldref * perm_vec_ctcl8m]), 3 * perm_vec_ctcl8m * sizeof(double));
3381 	calc_qrem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecstd, git_qt_g_prod);
3382       }
3383       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
3384 	qt_sum = pheno_sum - git_qt_sum[pidx];
3385 	qt_ssq = pheno_ssq - git_qt_ssq[pidx];
3386 	qt_g_prod = git_qt_g_prod[pidx];
3387 	qt_mean = qt_sum * nanal_recip;
3388 	qt_var = (qt_ssq - qt_sum * qt_mean) * nanal_m1_recip;
3389 	qt_g_covar = (qt_g_prod - qt_sum * geno_mean) * nanal_m1_recip;
3390 	dxx = 1.0 / geno_var;
3391 	beta = qt_g_covar * dxx;
3392 	betasq = beta * beta;
3393 	sval = betasq * nanal_m2d / (qt_var * dxx - betasq);
3394 	if (sval > stat_high) {
3395 	  success_2incr += 2;
3396 	} else if (sval > stat_low) {
3397 	  success_2incr++;
3398 	}
3399 	if (results[pidx] < sval) {
3400 	  results[pidx] = sval;
3401 	}
3402 	if (msa_ptr) {
3403 	  *msa_ptr++ = sval;
3404 	}
3405       }
3406       perm_2success_ct[marker_idx++] += success_2incr;
3407     }
3408   qassoc_maxt_thread_skip_all:
3409     if ((!tidx) || g_is_last_thread_block) {
3410       THREAD_RETURN;
3411     }
3412     THREAD_BLOCK_FINISH(tidx);
3413   }
3414 }
3415 
THREAD_RET_TYPE qassoc_maxt_lin_thread(void* arg) {
3417   uintptr_t tidx = (uintptr_t)arg;
3418   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
3419   uintptr_t perm_vec_ct = g_perm_vec_ct;
3420   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
3421   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
3422   uint32_t max_thread_ct = g_assoc_thread_ct;
3423   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
3424   double* __restrict__ perm_vecstd = g_perm_vecstd;
3425   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
3426   double* msa_ptr = nullptr;
3427   double pheno_sum = g_pheno_sum;
3428   double pheno_ssq = g_pheno_ssq;
3429   double* git_qt_het_sum;
3430   double* git_qt_het_ssq;
3431   double* git_qt_homrar_sum;
3432   double* git_qt_homrar_ssq;
3433   double* git_qt_missing_sum;
3434   double* git_qt_missing_ssq;
3435   uintptr_t* loadbuf;
3436   double* qresultbuf;
3437   uint32_t* __restrict__ missing_cts;
3438   uint32_t* __restrict__ het_cts;
3439   uint32_t* __restrict__ homcom_cts;
3440   uint16_t* ldrefs;
3441   double* __restrict__ orig_linsq;
3442   uintptr_t* loadbuf_cur;
3443   uintptr_t pidx;
3444   uintptr_t marker_idx;
3445   uint32_t qblock_start;
3446   uint32_t maxt_block_base;
3447   uint32_t maxt_block_base2;
3448   uint32_t marker_bidx_start;
3449   uint32_t maxt_block_base3;
3450   uint32_t marker_bidx;
3451   uint32_t marker_bceil;
3452   uint32_t missing_ct;
3453   uint32_t het_ct;
3454   uint32_t homcom_ct;
3455   uint32_t homrar_ct;
3456   intptr_t geno_sum;
3457   uint32_t nanal;
3458   uint32_t success_2incr;
3459   uint32_t cur_thread_ct;
3460   double het_ctd;
3461   double homrar_ctd;
3462   double nanal_recip;
3463   double geno_mean;
3464   double geno_mean_sq;
3465   double geno_mean_coeff2;
3466   double geno_mean_coeff3;
3467   double qt_sum;
3468   double qt_ssq;
3469   double qt_het_sum;
3470   double qt_het_ssq;
3471   double qt_homrar_sum;
3472   double qt_homrar_ssq;
3473   double qt_g_prod;
3474   double qt_mean;
3475   double qt_g_prod_centered;
3476   double stat_high;
3477   double stat_low;
3478   double sval;
3479   uint32_t ldref;
3480   while (1) {
3481     cur_thread_ct = g_block_diff / CACHELINE_DBL;
3482     if (cur_thread_ct > max_thread_ct) {
3483       cur_thread_ct = max_thread_ct;
3484     } else if (!cur_thread_ct) {
3485       cur_thread_ct = 1;
3486     }
3487     if (cur_thread_ct <= tidx) {
3488       goto qassoc_maxt_lin_thread_skip_all;
3489     }
3490     qblock_start = g_qblock_start;
3491     maxt_block_base = g_maxt_block_base;
3492     maxt_block_base2 = maxt_block_base + qblock_start;
3493     marker_bidx_start = qblock_start + (((uint64_t)tidx) * g_block_diff) / cur_thread_ct;
3494     maxt_block_base3 = maxt_block_base + marker_bidx_start;
3495     marker_bidx = marker_bidx_start;
3496     marker_idx = maxt_block_base3;
3497     marker_bceil = qblock_start + (((uint64_t)tidx + 1) * g_block_diff) / cur_thread_ct;
3498     memcpy(results, &(g_maxt_extreme_stat[g_perms_done - perm_vec_ct]), perm_vec_ct * sizeof(double));
3499     if (g_mperm_save_all) {
3500       msa_ptr = &(g_mperm_save_all[marker_idx * perm_vec_ct]);
3501     }
3502     loadbuf = g_loadbuf;
3503     qresultbuf = g_qresultbuf;
3504     missing_cts = g_missing_cts;
3505     het_cts = g_het_cts;
3506     homcom_cts = g_homcom_cts;
3507     ldrefs = g_ldrefs;
3508     orig_linsq = g_orig_linsq;
3509 
3510     for (; marker_bidx < marker_bceil; marker_bidx++) {
3511       missing_ct = missing_cts[marker_idx];
3512       nanal = pheno_nm_ct - missing_ct;
3513       homcom_ct = homcom_cts[marker_idx];
3514       het_ct = het_cts[marker_idx];
3515       if ((nanal < 3) || (homcom_ct == nanal) || (het_ct == nanal)) {
3516 	perm_2success_ct[marker_idx++] += perm_vec_ct;
3517 	if (msa_ptr) {
3518 	  for (pidx = 0; pidx < perm_vec_ct; pidx++) {
3519 	    *msa_ptr++ = -9;
3520 	  }
3521 	}
3522 	continue;
3523       }
3524       homrar_ct = nanal - het_ct - homcom_ct;
3525       sval = orig_linsq[marker_idx];
3526       stat_high = sval + EPSILON;
3527       stat_low = sval - EPSILON;
3528       geno_sum = 2 * homrar_ct + het_ct;
3529       nanal_recip = 1.0 / ((double)((int32_t)nanal));
3530       het_ctd = het_ct;
3531       homrar_ctd = homrar_ct;
3532       geno_mean = ((double)geno_sum) * nanal_recip;
3533       geno_mean_sq = geno_mean * geno_mean;
3534       geno_mean_coeff2 = 1 - 2 * geno_mean;
3535       geno_mean_coeff3 = 4 - 4 * geno_mean;
3536       success_2incr = 0;
3537       git_qt_het_sum = &(qresultbuf[6 * marker_bidx * perm_vec_ctcl8m]);
3538       git_qt_het_ssq = &(git_qt_het_sum[perm_vec_ctcl8m]);
3539       git_qt_homrar_sum = &(git_qt_het_sum[2 * perm_vec_ctcl8m]);
3540       git_qt_homrar_ssq = &(git_qt_het_sum[3 * perm_vec_ctcl8m]);
3541       git_qt_missing_sum = &(git_qt_het_sum[4 * perm_vec_ctcl8m]);
3542       git_qt_missing_ssq = &(git_qt_het_sum[5 * perm_vec_ctcl8m]);
3543       loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
3544       ldref = ldrefs[marker_idx];
3545       if (ldref == 65535) {
3546 	// 2x addition loops required for genotype indexing:
3547 	//   het_ct + homrar_ct + missing_ct
3548 	//
3549 	// 2x addition/initial copy loops required for LD exploitation:
3550 	//   3 + 2 * (<-> neither side homcom) + (<-> homcom)
3551 	// Simple lower bound (may allow us to skip full LD cost calculation):
3552 	//   3 + delta(homcom) if delta(homcom) >= sum of other deltas
3553 	//   3 + delta(non-homcom) otherwise
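	// Genotype indexing costs het_ct + homrar_ct + missing_ct =
	// pheno_nm_ct - homcom_ct loops, so the LD-reference search below is
	// only attempted when that exceeds the fixed overhead of 3.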
3554 	ldref = marker_bidx;
3555 	if (pheno_nm_ct - homcom_ct > 3) {
3556 	  check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 3, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
3557 	}
3558 	ldrefs[marker_idx] = ldref;
3559       }
3560       if (ldref == marker_bidx) {
3561 	fill_double_zero(perm_vec_ctcl8m * 6, git_qt_het_sum);
3562 	calc_qgit_lin(pheno_nm_ct, perm_vec_ctcl8m, perm_vec_ct, loadbuf_cur, perm_vecstd, git_qt_het_sum);
3563       } else {
3564 	memcpy(git_qt_het_sum, &(qresultbuf[6 * ldref * perm_vec_ctcl8m]), 6 * perm_vec_ctcl8m * sizeof(double));
3565 	calc_qrem_lin(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecstd, git_qt_het_sum);
3566       }
3567       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
3568 	qt_sum = pheno_sum - git_qt_missing_sum[pidx];
3569 	qt_ssq = pheno_ssq - git_qt_missing_ssq[pidx];
3570 	qt_het_sum = git_qt_het_sum[pidx];
3571 	qt_het_ssq = git_qt_het_ssq[pidx];
3572 	qt_homrar_sum = git_qt_homrar_sum[pidx];
3573 	qt_homrar_ssq = git_qt_homrar_ssq[pidx];
3574 	qt_g_prod = qt_het_sum + 2 * qt_homrar_sum;
3575 	qt_mean = qt_sum * nanal_recip;
3576 	qt_g_prod_centered = qt_g_prod - qt_sum * geno_mean;
3577 	sval = qt_g_prod_centered * qt_g_prod_centered / (geno_mean_sq * (qt_ssq + (qt_mean - 2) * qt_sum) + geno_mean_coeff2 * (qt_het_ssq + qt_mean * (qt_mean * het_ctd - 2 * qt_het_sum)) + geno_mean_coeff3 * (qt_homrar_ssq + qt_mean * (qt_mean * homrar_ctd - 2 * qt_homrar_sum)));
3578 	if (sval > stat_high) {
3579 	  success_2incr += 2;
3580 	} else if (sval > stat_low) {
3581 	  success_2incr++;
3582 	}
3583 	if (results[pidx] < sval) {
3584 	  results[pidx] = sval;
3585 	}
3586 	if (msa_ptr) {
3587 	  *msa_ptr++ = sval;
3588 	}
3589       }
3590       perm_2success_ct[marker_idx++] += success_2incr;
3591     }
3592   qassoc_maxt_lin_thread_skip_all:
3593     if ((!tidx) || g_is_last_thread_block) {
3594       THREAD_RETURN;
3595     }
3596     THREAD_BLOCK_FINISH(tidx);
3597   }
3598 }
3599 
THREAD_RET_TYPE qassoc_set_thread(void* arg) {
3601   // Simplified version of qassoc_adapt/maxt_thread(), except we need to save
3602   // actual t-statistics.
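  // For each permutation, the loop below regresses the permuted phenotype on
  // genotype and stores |t| = |beta| / sqrt((qt_var / geno_var - beta^2) /
  // (nanal - 2)), consistent with the derivation in qassoc_adapt_thread().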
3603   uintptr_t tidx = (uintptr_t)arg;
3604   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
3605   uintptr_t perm_vec_ct = g_perm_vec_ct;
3606   uint32_t max_thread_ct = g_assoc_thread_ct;
3607   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
3608   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
3609   double* git_qt_g_prod = &(g_thread_git_qbufs[perm_vec_ctcl8m * tidx * 3]);
3610   double* git_qt_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 1)]);
3611   double* git_qt_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 2)]);
3612   double* __restrict__ perm_vecstd = g_perm_vecstd;
3613   double pheno_sum = g_pheno_sum;
3614   double pheno_ssq = g_pheno_ssq;
3615   uint32_t* __restrict__ missing_cts;
3616   uint32_t* __restrict__ het_cts;
3617   uint32_t* __restrict__ homcom_cts;
3618   uintptr_t* __restrict__ loadbuf;
3619   uintptr_t marker_idx;
3620   uintptr_t pidx;
3621   intptr_t geno_sum;
3622   intptr_t geno_ssq;
3623   double* msa_ptr;
3624   uint32_t marker_bidx;
3625   uint32_t marker_bceil;
3626   uint32_t missing_ct;
3627   uint32_t het_ct;
3628   uint32_t homcom_ct;
3629   uint32_t homrar_ct;
3630   uint32_t nanal;
3631   uint32_t cur_thread_ct;
3632   double nanal_recip;
3633   double nanal_m1_recip;
3634   double nanal_m2_recip;
3635   double geno_mean;
3636   double geno_var_recip;
3637   double qt_sum;
3638   double qt_ssq;
3639   double qt_g_prod;
3640   double qt_mean;
3641   double qt_var;
3642   double qt_g_covar;
3643   double beta;
3644   double vbeta_sqrt;
3645   while (1) {
3646     cur_thread_ct = g_block_diff / CACHELINE_DBL;
3647     if (cur_thread_ct > max_thread_ct) {
3648       cur_thread_ct = max_thread_ct;
3649     } else if (!cur_thread_ct) {
3650       cur_thread_ct = 1;
3651     }
3652     if (cur_thread_ct <= tidx) {
3653       goto qassoc_set_thread_skip_all;
3654     }
3655     marker_bidx = (((uint64_t)tidx) * g_block_diff) / cur_thread_ct;
3656     marker_bceil = (((uint64_t)tidx + 1) * g_block_diff) / cur_thread_ct;
3657     loadbuf = g_loadbuf;
3658     missing_cts = g_missing_cts;
3659     het_cts = g_het_cts;
3660     homcom_cts = g_homcom_cts;
3661     for (; marker_bidx < marker_bceil; marker_bidx++) {
3662       marker_idx = g_adapt_m_table[marker_bidx];
3663       msa_ptr = &(g_mperm_save_all[marker_bidx * perm_vec_ct]);
3664       missing_ct = missing_cts[marker_idx];
3665       nanal = pheno_nm_ct - missing_ct;
3666       homcom_ct = homcom_cts[marker_idx];
3667       het_ct = het_cts[marker_idx];
3668       homrar_ct = nanal - het_ct - homcom_ct;
3669       geno_sum = 2 * homrar_ct + het_ct;
3670       geno_ssq = 4 * homrar_ct + het_ct;
3671       nanal_recip = 1.0 / ((double)((int32_t)nanal));
3672       nanal_m1_recip = 1.0 / ((double)(((int32_t)nanal) - 1));
3673       nanal_m2_recip = 1.0 / ((double)(((int32_t)nanal) - 2));
3674       geno_mean = ((double)geno_sum) * nanal_recip;
3675       geno_var_recip = 1.0 / ((((double)geno_ssq) - geno_sum * geno_mean) * nanal_m1_recip);
3676       calc_qgit(pheno_nm_ct, perm_vec_ctcl8m, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecstd, git_qt_g_prod);
3677       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
3678 	qt_sum = pheno_sum - git_qt_sum[pidx];
3679 	qt_ssq = pheno_ssq - git_qt_ssq[pidx];
3680 	qt_g_prod = git_qt_g_prod[pidx];
3681 	qt_mean = qt_sum * nanal_recip;
3682 	qt_var = (qt_ssq - qt_sum * qt_mean) * nanal_m1_recip;
3683 	qt_g_covar = (qt_g_prod - qt_sum * geno_mean) * nanal_m1_recip;
3684 	beta = qt_g_covar * geno_var_recip;
3685 	vbeta_sqrt = sqrt((qt_var * geno_var_recip - beta * beta) * nanal_m2_recip);
3686 	*msa_ptr++ = fabs(beta / vbeta_sqrt);
3687       }
3688       fill_double_zero(perm_vec_ctcl8m * 3, git_qt_g_prod);
3689     }
3690   qassoc_set_thread_skip_all:
3691     if ((!tidx) || g_is_last_thread_block) {
3692       THREAD_RETURN;
3693     }
3694     THREAD_BLOCK_FINISH(tidx);
3695   }
3696 }
3697 
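// Illustrative sketch, not part of the original source: the per-permutation
// t-statistic that qassoc_set_thread() stores, written as a standalone helper
// for exposition.  The helper name and standalone form are hypothetical; the
// thread inlines this arithmetic against the genotype-indexed partial sums
// produced by calc_qgit(), and sqrt()/fabs() come from <math.h> as elsewhere
// in this file.
static inline double qassoc_perm_tstat_sketch(double qt_sum, double qt_ssq, double qt_g_prod, double geno_sum, double geno_ssq, uint32_t nanal) {
  double nanal_recip = 1.0 / ((double)((int32_t)nanal));
  double nanal_m1_recip = 1.0 / ((double)(((int32_t)nanal) - 1));
  double geno_mean = geno_sum * nanal_recip;
  // reciprocal of the sample genotype variance
  double geno_var_recip = 1.0 / ((geno_ssq - geno_sum * geno_mean) * nanal_m1_recip);
  double qt_mean = qt_sum * nanal_recip;
  double qt_var = (qt_ssq - qt_sum * qt_mean) * nanal_m1_recip;
  double qt_g_covar = (qt_g_prod - qt_sum * geno_mean) * nanal_m1_recip;
  // regression slope and its standard error
  double beta = qt_g_covar * geno_var_recip;
  double vbeta_sqrt = sqrt((qt_var * geno_var_recip - beta * beta) / ((double)(((int32_t)nanal) - 2)));
  return fabs(beta / vbeta_sqrt);
}
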
THREAD_RET_TYPE model_adapt_domrec_thread(void* arg) {
3699   uintptr_t tidx = (uintptr_t)arg;
3700   uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
3701   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
3702   uintptr_t perm_vec_ct = g_perm_vec_ct;
3703   uint32_t assoc_thread_ct = g_assoc_thread_ct;
3704   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
3705   uint32_t model_fisher = g_model_fisher;
3706   uint32_t fisher_midp = g_fisher_midp;
3707   uint32_t precomp_width = g_precomp_width;
3708   uint32_t first_adapt_check = g_first_adapt_check;
3709   uint32_t case_ct = g_perm_case_ct;
3710   int32_t is_model_prec = g_is_model_prec;
3711   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
3712   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
3713   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
3714   unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
3715   double adaptive_intercept = g_adaptive_intercept;
3716   double adaptive_slope = g_adaptive_slope;
3717   double adaptive_ci_zt = g_adaptive_ci_zt;
3718   double aperm_alpha = g_aperm_alpha;
3719   uintptr_t* __restrict__ loadbuf;
3720   double* __restrict__ orig_pvals;
3721   double* __restrict__ orig_chisq;
3722   uint32_t* __restrict__ missing_cts;
3723   uint32_t* __restrict__ het_cts;
3724   uint32_t* __restrict__ homcom_cts;
3725   uint32_t* __restrict__ precomp_start;
3726   uint32_t* __restrict__ precomp_ui;
3727   uint32_t* gpui;
3728   uintptr_t marker_idx;
3729   uintptr_t pidx;
3730   uint32_t marker_bidx;
3731   uint32_t marker_bceil;
3732   uint32_t success_2start;
3733   uint32_t success_2incr;
3734   uint32_t next_adapt_check;
3735   intptr_t col1_sum;
3736   intptr_t col2_sum;
3737   intptr_t tot_obs;
3738   uint32_t missing_start;
3739   uint32_t case_homx_ct;
3740   uint32_t case_missing_ct;
3741   uint32_t uii;
3742   double stat_high;
3743   double stat_low;
3744   double pval;
3745   double dxx;
3746   double dyy;
3747   double dzz;
3748   while (1) {
3749     if (g_block_diff <= assoc_thread_ct) {
3750       if (g_block_diff <= tidx) {
3751         goto model_adapt_domrec_thread_skip_all;
3752       }
3753       marker_bidx = g_block_start + tidx;
3754       marker_bceil = marker_bidx + 1;
3755     } else {
3756       marker_bidx = g_block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
3757       marker_bceil = g_block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
3758     }
3759     loadbuf = g_loadbuf;
3760     orig_pvals = g_orig_pvals;
3761     orig_chisq = g_orig_chisq;
3762     missing_cts = g_missing_cts;
3763     het_cts = g_het_cts;
3764     homcom_cts = g_homcom_cts;
3765     precomp_start = g_precomp_start;
3766     precomp_ui = g_precomp_ui;
3767     for (; marker_bidx < marker_bceil; marker_bidx++) {
3768       marker_idx = g_adapt_m_table[marker_bidx];
3769       if (model_fisher) {
3770         if (orig_pvals[marker_idx] == -9) {
3771           perm_adapt_stop[marker_idx] = 1;
3772           perm_attempt_ct[marker_idx] = 0;
3773           continue;
3774         }
3775 	stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
3776 	stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
3777       } else {
3778         if (orig_chisq[marker_idx] == -9) {
3779           perm_adapt_stop[marker_idx] = 1;
3780           perm_attempt_ct[marker_idx] = 0;
3781           continue;
3782         }
3783 	stat_high = orig_chisq[marker_idx] + EPSILON;
3784 	stat_low = orig_chisq[marker_idx] - EPSILON;
3785       }
3786       next_adapt_check = first_adapt_check;
3787       tot_obs = pheno_nm_ct - missing_cts[marker_idx];
3788       if (is_model_prec) {
3789 	col2_sum = homcom_cts[marker_idx] + het_cts[marker_idx];
3790 	col1_sum = tot_obs - col2_sum;
3791       } else {
3792 	col1_sum = homcom_cts[marker_idx];
3793 	col2_sum = tot_obs - col1_sum;
3794       }
3795       missing_start = precomp_start[marker_bidx];
3796       gpui = &(precomp_ui[4 * precomp_width * marker_bidx]);
3797       success_2start = perm_2success_ct[marker_idx];
3798       success_2incr = 0;
3799       for (pidx = 0; pidx < perm_vec_ct;) {
3800 	genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &uii, &case_homx_ct);
3801 	if (is_model_prec) {
3802 	  case_homx_ct = case_ct - case_homx_ct - case_missing_ct - uii;
3803 	}
3804 	// deliberate underflow
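	// i.e. unsigned wraparound turns the two-sided test
	// missing_start <= case_missing_ct < missing_start + precomp_width
	// into a single comparison, since out-of-range values wrap to
	// something >= precomp_width.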
3805 	uii = (uint32_t)(case_missing_ct - missing_start);
3806 	if (uii < precomp_width) {
3807 	  if (case_homx_ct < gpui[4 * uii]) {
3808 	    if (case_homx_ct < gpui[4 * uii + 2]) {
3809 	      success_2incr += 2;
3810 	    } else {
3811 	      success_2incr++;
3812 	    }
3813 	  } else {
3814 	    if (case_homx_ct >= gpui[4 * uii + 1]) {
3815 	      if (case_homx_ct >= gpui[4 * uii + 3]) {
3816 		success_2incr += 2;
3817 	      } else {
3818 		success_2incr++;
3819 	      }
3820 	    }
3821 	  }
3822 	} else {
3823 	  uii = case_ct - case_missing_ct;
3824 	  if (model_fisher) {
3825 	    dxx = fisher22(case_homx_ct, uii - case_homx_ct, col1_sum - case_homx_ct, col2_sum + case_homx_ct - uii, fisher_midp);
3826 	    if (dxx < stat_low) {
3827 	      success_2incr += 2;
3828 	    } else if (dxx <= stat_high) {
3829 	      success_2incr++;
3830 	    }
3831 	  } else {
3832 	    dxx = chi22_eval(case_homx_ct, uii, col1_sum, tot_obs);
3833 	    if (dxx > stat_high) {
3834 	      success_2incr += 2;
3835 	    } else if (dxx > stat_low) {
3836 	      success_2incr++;
3837 	    }
3838 	  }
3839 	}
3840 	if (++pidx == next_adapt_check - pidx_offset) {
3841 	  uii = success_2start + success_2incr;
3842 	  if (uii) {
3843 	    pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
3844 	    dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
3845 	    dyy = pval - dxx; // lower bound
3846 	    dzz = pval + dxx; // upper bound
3847 	    if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
3848 	      perm_adapt_stop[marker_idx] = 1;
3849 	      perm_attempt_ct[marker_idx] = next_adapt_check;
3850 	      break;
3851 	    }
3852 	  }
3853 	  next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
3854 	}
3855       }
3856       perm_2success_ct[marker_idx] += success_2incr;
3857     }
3858   model_adapt_domrec_thread_skip_all:
3859     if ((!tidx) || g_is_last_thread_block) {
3860       THREAD_RETURN;
3861     }
3862     THREAD_BLOCK_FINISH(tidx);
3863   }
3864 }
3865 
THREAD_RET_TYPE model_maxt_domrec_thread(void* arg) {
3867   uintptr_t tidx = (uintptr_t)arg;
3868   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
3869   uintptr_t perm_vec_ct = g_perm_vec_ct;
3870   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
3871   uint32_t assoc_thread_ct = g_assoc_thread_ct;
3872   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
3873   uint32_t model_fisher = g_model_fisher;
3874   uint32_t fisher_midp = g_fisher_midp;
3875   uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
3876   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
3877   uint32_t* git_homrar_cts = nullptr;
3878   uint32_t* git_missing_cts = nullptr;
3879   uint32_t* git_het_cts = nullptr;
3880   uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
3881   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
3882   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
3883   uint32_t precomp_width = g_precomp_width;
3884   uint32_t case_ct = g_perm_case_ct;
3885   int32_t is_model_prec = g_is_model_prec;
3886   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
3887   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
3888   double* __restrict__ mperm_save_all = g_mperm_save_all;
3889   double* msa_ptr = nullptr;
3890   uintptr_t* __restrict__ loadbuf;
3891   uint32_t* __restrict__ missing_cts;
3892   uint32_t* __restrict__ het_cts;
3893   uint32_t* __restrict__ homcom_cts;
3894   uint32_t* __restrict__ precomp_start;
3895   uint32_t* __restrict__ precomp_ui;
3896   double* __restrict__ precomp_d;
3897   double* __restrict__ orig_pvals;
3898   double* __restrict__ orig_chisq;
3899   uint16_t* ldrefs;
3900   uintptr_t* loadbuf_cur;
3901   uint32_t* resultbuf;
3902   uint32_t* gpui;
3903   double* gpd;
3904   uintptr_t pidx;
3905   uintptr_t marker_idx;
3906   intptr_t col1_sum;
3907   intptr_t col2_sum;
3908   intptr_t tot_obs;
3909   uint32_t block_start;
3910   uint32_t maxt_block_base;
3911   uint32_t maxt_block_base2;
3912   uint32_t marker_bidx_start;
3913   uint32_t maxt_block_base3;
3914   uint32_t marker_bidx;
3915   uint32_t marker_bceil;
3916   uint32_t success_2incr;
3917   uint32_t missing_start;
3918   uint32_t case_homx_ct;
3919   uint32_t case_missing_ct;
3920   uint32_t uii;
3921   uint32_t ujj;
3922   uint32_t ukk;
3923   double stat_high;
3924   double stat_low;
3925   double sval;
3926   uint32_t missing_ct;
3927   uint32_t het_ct;
3928   uint32_t homcom_ct;
3929   uint32_t ldref;
3930   while (1) {
3931     block_start = g_block_start;
3932     if (g_block_diff <= assoc_thread_ct) {
3933       if (g_block_diff <= tidx) {
3934         goto model_maxt_domrec_thread_skip_all;
3935       }
3936       marker_bidx_start = block_start + tidx;
3937       marker_bceil = marker_bidx_start + 1;
3938     } else {
3939       marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
3940       marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
3941     }
3942     maxt_block_base = g_maxt_block_base;
3943     maxt_block_base2 = maxt_block_base + block_start;
3944     maxt_block_base3 = maxt_block_base + marker_bidx_start;
3945     marker_bidx = marker_bidx_start;
3946     marker_idx = maxt_block_base3;
3947     loadbuf = g_loadbuf;
3948     missing_cts = g_missing_cts;
3949     het_cts = g_het_cts;
3950     homcom_cts = g_homcom_cts;
3951     precomp_start = g_precomp_start;
3952     precomp_ui = g_precomp_ui;
3953     precomp_d = g_precomp_d;
3954     orig_pvals = g_orig_pvals;
3955     orig_chisq = g_orig_chisq;
3956     resultbuf = g_resultbuf;
3957     ldrefs = g_ldrefs;
3958     memcpy(results, &(g_maxt_extreme_stat[pidx_offset]), perm_vec_ct * sizeof(double));
3959     if (mperm_save_all) {
3960       msa_ptr = &(mperm_save_all[marker_idx * perm_vec_ct]);
3961     }
3962     for (; marker_bidx < marker_bceil; marker_bidx++) {
3963       if (model_fisher) {
3964 	if (orig_pvals[marker_idx] == -9) {
3965 	model_maxt_domrec_thread_skip_marker:
3966 	  marker_idx++;
3967 	  if (msa_ptr) {
3968 	    for (pidx = 0; pidx < perm_vec_ct; pidx++) {
3969 	      *msa_ptr++ = -9;
3970 	    }
3971 	  }
3972 	  continue;
3973 	}
3974 	stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
3975 	stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
3976       } else {
3977 	if (orig_chisq[marker_idx] == -9) {
3978 	  goto model_maxt_domrec_thread_skip_marker;
3979 	}
3980 	stat_high = orig_chisq[marker_idx] + EPSILON;
3981 	stat_low = orig_chisq[marker_idx] - EPSILON;
3982       }
3983       gpd = &(precomp_d[2 * precomp_width * marker_bidx]);
3984       missing_ct = missing_cts[marker_idx];
3985       het_ct = het_cts[marker_idx];
3986       homcom_ct = homcom_cts[marker_idx];
3987       tot_obs = pheno_nm_ct - missing_ct;
3988       if (is_model_prec) {
3989 	col2_sum = homcom_ct + het_ct;
3990 	col1_sum = tot_obs - col2_sum;
3991       } else {
3992 	col1_sum = homcom_ct;
3993 	col2_sum = tot_obs - col1_sum;
3994       }
3995       missing_start = precomp_start[marker_bidx];
3996       gpui = &(precomp_ui[6 * precomp_width * marker_bidx]);
3997       success_2incr = 0;
3998       loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
3999       ldref = ldrefs[marker_idx];
4000       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
4001       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
4002       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
4003       if (ldref == 65535) {
4004 	ldref = marker_bidx;
4005 	if (pheno_nm_ct - homcom_ct > 50) {
4006 	  check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 50, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
4007 	}
4008 	ldrefs[marker_idx] = ldref;
4009       }
4010       if (ldref == marker_bidx) {
4011 	fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
4012 	calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
4013 	fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
4014       } else {
4015 	memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
4016 	calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
4017       }
4018       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
4019 	case_missing_ct = git_missing_cts[pidx];
4020 	if (is_model_prec) {
4021 	  case_homx_ct = git_homrar_cts[pidx];
4022 	} else {
4023 	  case_homx_ct = case_ct - case_missing_ct - git_homrar_cts[pidx] - git_het_cts[pidx];
4024 	}
4025 	// deliberate underflow
4026 	uii = (uint32_t)(case_missing_ct - missing_start);
4027 	if (uii < precomp_width) {
4028 	  if (case_homx_ct < gpui[6 * uii]) {
4029 	    if (case_homx_ct < gpui[6 * uii + 2]) {
4030 	      success_2incr += 2;
4031 	    } else {
4032 	      success_2incr++;
4033 	    }
4034 	  } else {
4035 	    if (case_homx_ct >= gpui[6 * uii + 1]) {
4036 	      if (case_homx_ct >= gpui[6 * uii + 3]) {
4037 		success_2incr += 2;
4038 	      } else {
4039 		success_2incr++;
4040 	      }
4041 	    }
4042 	  }
4043 	  ukk = gpui[6 * uii + 4];
4044 	  ujj = (uint32_t)(case_homx_ct - ukk); // deliberate underflow
4045 	  if (ujj >= gpui[6 * uii + 5]) {
4046 	    if (model_fisher) {
4047 	      ujj = case_ct - case_missing_ct;
4048 	      sval = fisher22_tail_pval(ukk, ujj - ukk, col1_sum - ukk, col2_sum + ukk - ujj, gpui[6 * uii + 5] - 1, gpd[2 * uii], gpd[2 * uii + 1], fisher_midp, case_homx_ct);
4049 	      if (results[pidx] > sval) {
4050 		results[pidx] = sval;
4051 	      }
4052 	    } else {
4053 	      sval = ((double)((intptr_t)case_homx_ct)) - gpd[2 * uii];
4054 	      sval = sval * sval * gpd[2 * uii + 1];
4055 	      if (results[pidx] < sval) {
4056 		results[pidx] = sval;
4057 	      }
4058 	    }
4059 	  }
4060 	} else {
4061 	  uii = case_ct - case_missing_ct;
4062 	  if (model_fisher) {
4063 	    sval = fisher22(case_homx_ct, uii - case_homx_ct, col1_sum - case_homx_ct, col2_sum + case_homx_ct - uii, fisher_midp);
4064 	    if (sval < stat_low) {
4065 	      success_2incr += 2;
4066 	    } else if (sval <= stat_high) {
4067 	      success_2incr++;
4068 	    }
4069 	    if (results[pidx] > sval) {
4070 	      results[pidx] = sval;
4071 	    }
4072 	  } else {
4073 	    sval = chi22_eval(case_homx_ct, uii, col1_sum, tot_obs);
4074 	    if (sval > stat_high) {
4075 	      success_2incr += 2;
4076 	    } else if (sval > stat_low) {
4077 	      success_2incr++;
4078 	    }
4079 	    if (results[pidx] < sval) {
4080 	      results[pidx] = sval;
4081 	    }
4082 	  }
4083 	  if (msa_ptr) {
4084 	    *msa_ptr++ = sval;
4085 	  }
4086 	}
4087       }
4088       perm_2success_ct[marker_idx++] += success_2incr;
4089     }
4090   model_maxt_domrec_thread_skip_all:
4091     if ((!tidx) || g_is_last_thread_block) {
4092       THREAD_RETURN;
4093     }
4094     THREAD_BLOCK_FINISH(tidx);
4095   }
4096 }
4097 
THREAD_RET_TYPE model_set_domrec_thread(void* arg) {
4099   // Similar to assoc_set_thread().
4100   uintptr_t tidx = (uintptr_t)arg;
4101   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
4102   uint32_t assoc_thread_ct = g_assoc_thread_ct;
4103   uintptr_t perm_vec_ct = g_perm_vec_ct;
4104   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
4105   uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
4106   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
4107   uint32_t* git_homrar_cts = nullptr;
4108   uint32_t* git_missing_cts = nullptr;
4109   uint32_t* git_het_cts = nullptr;
4110   uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
4111   uint32_t* resultbuf = g_resultbuf;
4112   uint32_t case_ct = g_perm_case_ct;
4113   int32_t is_model_prec = g_is_model_prec;
4114   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
4115   double* msa_ptr = nullptr;
4116   uintptr_t* loadbuf;
4117   uintptr_t* loadbuf_cur;
4118   uint32_t* __restrict__ missing_cts;
4119   uint32_t* __restrict__ het_cts;
4120   uint32_t* __restrict__ homcom_cts;
4121   uintptr_t pidx;
4122   uintptr_t marker_idx;
4123   intptr_t col1_sum;
4124   intptr_t tot_obs;
4125   uint32_t block_start;
4126   uint32_t marker_bidx_start;
4127   uint32_t marker_bidx;
4128   uint32_t marker_bceil;
4129   uint32_t case_homx_ct;
4130   uint32_t case_missing_ct;
4131   uint32_t missing_ct;
4132   uint32_t het_ct;
4133   uint32_t homcom_ct;
4134   while (1) {
4135     block_start = g_block_start;
4136     if (g_block_diff <= assoc_thread_ct) {
4137       if (g_block_diff <= tidx) {
4138 	goto model_set_domrec_thread_skip_all;
4139       }
4140       marker_bidx_start = block_start + tidx;
4141       marker_bceil = marker_bidx_start + 1;
4142     } else {
4143       marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
4144       marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
4145     }
4146     marker_bidx = marker_bidx_start;
4147     loadbuf = g_loadbuf;
4148     missing_cts = g_missing_cts;
4149     het_cts = g_het_cts;
4150     homcom_cts = g_homcom_cts;
4151     for (; marker_bidx < marker_bceil; marker_bidx++) {
4152       marker_idx = g_adapt_m_table[marker_bidx];
4153       msa_ptr = &(g_mperm_save_all[marker_bidx * perm_vec_ct]);
4154       missing_ct = missing_cts[marker_idx];
4155       het_ct = het_cts[marker_idx];
4156       homcom_ct = homcom_cts[marker_idx];
4157       tot_obs = pheno_nm_ct - missing_ct;
4158       if (is_model_prec) {
4159 	col1_sum = tot_obs - homcom_ct - het_ct;
4160       } else {
4161         col1_sum = homcom_ct;
4162       }
4163       loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
4164       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
4165       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
4166       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
4167       fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
4168       calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
4169       fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
4170       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
4171 	case_missing_ct = git_missing_cts[pidx];
4172         if (is_model_prec) {
4173 	  case_homx_ct = git_homrar_cts[pidx];
4174 	} else {
4175 	  case_homx_ct = case_ct - case_missing_ct - git_homrar_cts[pidx] - git_het_cts[pidx];
4176 	}
4177 	*msa_ptr++ = chi22_eval(case_homx_ct, case_ct - case_missing_ct, col1_sum, tot_obs);
4178       }
4179     }
4180   model_set_domrec_thread_skip_all:
4181     if ((!tidx) || g_is_last_thread_block) {
4182       THREAD_RETURN;
4183     }
4184     THREAD_BLOCK_FINISH(tidx);
4185   }
4186 }
4187 
THREAD_RET_TYPE model_adapt_trend_thread(void* arg) {
4189   uintptr_t tidx = (uintptr_t)arg;
4190   uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
4191   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
4192   uintptr_t perm_vec_ct = g_perm_vec_ct;
4193   uint32_t assoc_thread_ct = g_assoc_thread_ct;
4194   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
4195   uint32_t precomp_width = g_precomp_width;
4196   uint32_t first_adapt_check = g_first_adapt_check;
4197   uint32_t case_ct = g_perm_case_ct;
4198   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
4199   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
4200   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
4201   unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
4202   double adaptive_intercept = g_adaptive_intercept;
4203   double adaptive_slope = g_adaptive_slope;
4204   double adaptive_ci_zt = g_adaptive_ci_zt;
4205   double aperm_alpha = g_aperm_alpha;
4206   uintptr_t* __restrict__ loadbuf;
4207   double* __restrict__ orig_pvals;
4208   double* __restrict__ orig_chisq;
4209   uint32_t* __restrict__ missing_cts;
4210   uint32_t* __restrict__ het_cts;
4211   uint32_t* __restrict__ homcom_cts;
4212   uint32_t* __restrict__ precomp_start;
4213   uint32_t* __restrict__ precomp_ui;
4214   uint32_t* gpui;
4215   uintptr_t marker_idx;
4216   uintptr_t pidx;
4217   uint32_t marker_bidx;
4218   uint32_t marker_bceil;
4219   uint32_t success_2start;
4220   uint32_t success_2incr;
4221   uint32_t next_adapt_check;
4222   intptr_t tot_obs;
4223   uint32_t missing_start;
4224   uint32_t het_ct;
4225   uint32_t homcom_ct;
4226   uint32_t case_com_ct;
4227   uint32_t case_missing_ct;
4228   uint32_t uii;
4229   double chisq_high;
4230   double chisq_low;
4231   double pval;
4232   double dxx;
4233   double dyy;
4234   double dzz;
4235   while (1) {
4236     if (g_block_diff <= assoc_thread_ct) {
4237       if (g_block_diff <= tidx) {
4238         goto model_adapt_trend_thread_skip_all;
4239       }
4240       marker_bidx = g_block_start + tidx;
4241       marker_bceil = marker_bidx + 1;
4242     } else {
4243       marker_bidx = g_block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
4244       marker_bceil = g_block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
4245     }
4246     loadbuf = g_loadbuf;
4247     orig_pvals = g_orig_pvals;
4248     orig_chisq = g_orig_chisq;
4249     missing_cts = g_missing_cts;
4250     het_cts = g_het_cts;
4251     homcom_cts = g_homcom_cts;
4252     precomp_start = g_precomp_start;
4253     precomp_ui = g_precomp_ui;
4254     for (; marker_bidx < marker_bceil; marker_bidx++) {
4255       marker_idx = g_adapt_m_table[marker_bidx];
4256       next_adapt_check = first_adapt_check;
4257       if (orig_pvals[marker_idx] == -9) {
4258 	perm_adapt_stop[marker_idx] = 1;
4259 	perm_attempt_ct[marker_idx] = next_adapt_check;
4260 	perm_2success_ct[marker_idx] = next_adapt_check;
4261 	continue;
4262       }
4263       tot_obs = pheno_nm_ct - missing_cts[marker_idx];
4264       het_ct = het_cts[marker_idx];
4265       homcom_ct = homcom_cts[marker_idx];
4266       missing_start = precomp_start[marker_bidx];
4267       gpui = &(precomp_ui[4 * precomp_width * marker_bidx]);
4268       success_2start = perm_2success_ct[marker_idx];
4269       success_2incr = 0;
4270       chisq_high = orig_chisq[marker_idx] + EPSILON;
4271       chisq_low = orig_chisq[marker_idx] - EPSILON;
4272       for (pidx = 0; pidx < perm_vec_ct;) {
4273 	genovec_set_freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_com_ct, &case_missing_ct);
4274 	// deliberate underflow
4275 	uii = (uint32_t)(case_missing_ct - missing_start);
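	// When the permuted case missing count lands in the precomputed window,
	// gpui[] supplies per-missing-count thresholds on case_com_ct that
	// determine the success increment (0, 1, or 2) without evaluating the
	// trend statistic; otherwise fall back to ca_trend_eval() below.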
4276 	if (uii < precomp_width) {
4277 	  if (case_com_ct < gpui[4 * uii]) {
4278 	    if (case_com_ct < gpui[4 * uii + 2]) {
4279 	      success_2incr += 2;
4280 	    } else {
4281 	      success_2incr++;
4282 	    }
4283 	  } else {
4284 	    if (case_com_ct >= gpui[4 * uii + 1]) {
4285 	      if (case_com_ct >= gpui[4 * uii + 3]) {
4286 		success_2incr += 2;
4287 	      } else {
4288 		success_2incr++;
4289 	      }
4290 	    }
4291 	  }
4292 	} else {
4293 	  uii = case_ct - case_missing_ct;
4294 	  dxx = ca_trend_eval(case_com_ct, uii, het_ct, homcom_ct, tot_obs);
4295 	  if (dxx > chisq_high) {
4296 	    success_2incr += 2;
4297 	  } else if (dxx > chisq_low) {
4298 	    success_2incr++;
4299 	  }
4300 	}
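	// Adaptive stopping check: after next_adapt_check permutations, form a
	// normal-approximation confidence interval around the current empirical
	// p-value (success counts are doubled, with ties counting once, hence
	// the division by 2) and stop permuting this marker if aperm_alpha falls
	// outside the interval.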
4301 	if (++pidx == next_adapt_check - pidx_offset) {
4302 	  uii = success_2start + success_2incr;
4303 	  if (uii) {
4304 	    pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
4305 	    dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
4306 	    dyy = pval - dxx; // lower bound
4307 	    dzz = pval + dxx; // upper bound
4308 	    if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
4309 	      perm_adapt_stop[marker_idx] = 1;
4310 	      perm_attempt_ct[marker_idx] = next_adapt_check;
4311 	      break;
4312 	    }
4313 	  }
4314 	  next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
4315 	}
4316       }
4317       perm_2success_ct[marker_idx] += success_2incr;
4318     }
4319   model_adapt_trend_thread_skip_all:
4320     if ((!tidx) || g_is_last_thread_block) {
4321       THREAD_RETURN;
4322     }
4323     THREAD_BLOCK_FINISH(tidx);
4324   }
4325 }
4326 
4327 THREAD_RET_TYPE model_maxt_trend_thread(void* arg) {
4328   uintptr_t tidx = (uintptr_t)arg;
4329   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
4330   uintptr_t perm_vec_ct = g_perm_vec_ct;
4331   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
4332   uint32_t assoc_thread_ct = g_assoc_thread_ct;
4333   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
4334   uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
4335   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
4336   uint32_t* git_homrar_cts = nullptr;
4337   uint32_t* git_missing_cts = nullptr;
4338   uint32_t* git_het_cts = nullptr;
4339   uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
4340   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
4341   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
4342   uint32_t precomp_width = g_precomp_width;
4343   uint32_t case_ct = g_perm_case_ct;
4344   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
4345   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
4346   double* __restrict__ mperm_save_all = g_mperm_save_all;
4347   double* msa_ptr = nullptr;
4348   uintptr_t* __restrict__ loadbuf;
4349   uint32_t* __restrict__ missing_cts;
4350   uint32_t* __restrict__ het_cts;
4351   uint32_t* __restrict__ homcom_cts;
4352   uint32_t* __restrict__ precomp_start;
4353   uint32_t* __restrict__ precomp_ui;
4354   double* __restrict__ precomp_d;
4355   double* __restrict__ orig_pvals;
4356   double* __restrict__ orig_chisq;
4357   uint16_t* ldrefs;
4358   uintptr_t* loadbuf_cur;
4359   uint32_t* resultbuf;
4360   uint32_t* gpui;
4361   double* gpd;
4362   uint32_t block_start;
4363   uint32_t maxt_block_base;
4364   uint32_t maxt_block_base2;
4365   uint32_t marker_bidx_start;
4366   uint32_t maxt_block_base3;
4367   uint32_t marker_bidx;
4368   uintptr_t marker_idx;
4369   uint32_t marker_bceil;
4370   uintptr_t pidx;
4371   intptr_t tot_obs;
4372   uint32_t success_2incr;
4373   uint32_t missing_start;
4374   uint32_t missing_ct;
4375   uint32_t het_ct;
4376   uint32_t homcom_ct;
4377   uint32_t ldref;
4378   uint32_t case_com_ct;
4379   uint32_t case_missing_ct;
4380   uint32_t uii;
4381   uint32_t ujj;
4382   uint32_t ukk;
4383   double chisq_high;
4384   double chisq_low;
4385   double chisq;
4386   while (1) {
4387     block_start = g_block_start;
4388     if (g_block_diff <= assoc_thread_ct) {
4389       if (g_block_diff <= tidx) {
4390         goto model_maxt_trend_thread_skip_all;
4391       }
4392       marker_bidx_start = block_start + tidx;
4393       marker_bceil = marker_bidx_start + 1;
4394     } else {
4395       marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
4396       marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
4397     }
4398     maxt_block_base = g_maxt_block_base;
4399     maxt_block_base2 = maxt_block_base + block_start;
4400     maxt_block_base3 = maxt_block_base + marker_bidx_start;
4401     marker_bidx = marker_bidx_start;
4402     marker_idx = maxt_block_base3;
4403     loadbuf = g_loadbuf;
4404     missing_cts = g_missing_cts;
4405     het_cts = g_het_cts;
4406     homcom_cts = g_homcom_cts;
4407     precomp_start = g_precomp_start;
4408     precomp_ui = g_precomp_ui;
4409     precomp_d = g_precomp_d;
4410     orig_pvals = g_orig_pvals;
4411     orig_chisq = g_orig_chisq;
4412     resultbuf = g_resultbuf;
4413     ldrefs = g_ldrefs;
4414     memcpy(results, &(g_maxt_extreme_stat[pidx_offset]), perm_vec_ct * sizeof(double));
4415     if (mperm_save_all) {
4416       msa_ptr = &(mperm_save_all[marker_idx * perm_vec_ct]);
4417     }
4418     for (; marker_bidx < marker_bceil; marker_bidx++) {
4419       if (orig_pvals[marker_idx] == -9) {
4420 	perm_2success_ct[marker_idx++] += perm_vec_ct;
4421 	continue;
4422       }
4423       missing_ct = missing_cts[marker_idx];
4424       tot_obs = pheno_nm_ct - missing_ct;
4425       het_ct = het_cts[marker_idx];
4426       homcom_ct = homcom_cts[marker_idx];
4427       missing_start = precomp_start[marker_bidx];
4428       gpui = &(precomp_ui[6 * precomp_width * marker_bidx]);
4429       gpd = &(precomp_d[2 * precomp_width * marker_bidx]);
4430       chisq_high = orig_chisq[marker_idx] + EPSILON;
4431       chisq_low = orig_chisq[marker_idx] - EPSILON;
4432       success_2incr = 0;
4433       loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
4434       ldref = ldrefs[marker_idx];
4435       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
4436       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
4437       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
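      // ldrefs[] caches an LD reference marker per site (65535 = none chosen
      // yet).  When a suitable reference exists, the per-permutation counts are
      // copied from it and adjusted via calc_rem() instead of being recomputed
      // from scratch by calc_git().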
4438       if (ldref == 65535) {
4439 	ldref = marker_bidx;
4440 	if (pheno_nm_ct - homcom_ct > 50) {
4441 	  check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 50, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
4442 	}
4443 	ldrefs[marker_idx] = ldref;
4444       }
4445       if (ldref == marker_bidx) {
4446 	fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
4447 	calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
4448 	fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
4449       } else {
4450 	memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
4451 	calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
4452       }
4453       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
4454 	case_missing_ct = git_missing_cts[pidx];
4455 	case_com_ct = 2 * (case_ct - case_missing_ct - git_homrar_cts[pidx]) - git_het_cts[pidx];
4456 	// deliberate underflow
4457 	uii = (uint32_t)(case_missing_ct - missing_start);
4458 	if (uii < precomp_width) {
4459 	  if (case_com_ct < gpui[6 * uii]) {
4460 	    if (case_com_ct < gpui[6 * uii + 2]) {
4461 	      success_2incr += 2;
4462 	    } else {
4463 	      success_2incr++;
4464 	    }
4465 	  } else {
4466 	    if (case_com_ct >= gpui[6 * uii + 1]) {
4467 	      if (case_com_ct >= gpui[6 * uii + 3]) {
4468 		success_2incr += 2;
4469 	      } else {
4470 		success_2incr++;
4471 	      }
4472 	    }
4473 	  }
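	  // gpui[6u+4] and gpui[6u+5] delimit the case_com_ct range for which
	  // the max(T) update can be skipped; outside it, the chi-square is
	  // rebuilt from the precomputed centering/scaling pair in gpd[] and
	  // used to update results[].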
4474 	  ukk = gpui[6 * uii + 4];
4475 	  ujj = (uint32_t)(case_com_ct - ukk); // deliberate underflow
4476 	  if (ujj >= gpui[6 * uii + 5]) {
4477 	    chisq = ((double)((intptr_t)case_com_ct)) - gpd[2 * uii];
4478 	    chisq = chisq * chisq * gpd[2 * uii + 1];
4479 	    if (results[pidx] < chisq) {
4480 	      results[pidx] = chisq;
4481 	    }
4482 	  }
4483 	} else {
4484 	  chisq = ca_trend_eval(case_com_ct, case_ct - case_missing_ct, het_ct, homcom_ct, tot_obs);
4485 	  if (chisq > chisq_high) {
4486 	    success_2incr += 2;
4487 	  } else if (chisq > chisq_low) {
4488 	    success_2incr++;
4489 	  }
4490 	  if (results[pidx] < chisq) {
4491 	    results[pidx] = chisq;
4492 	  }
4493 	  if (msa_ptr) {
4494 	    *msa_ptr++ = chisq;
4495 	  }
4496 	}
4497       }
4498       perm_2success_ct[marker_idx++] += success_2incr;
4499     }
4500   model_maxt_trend_thread_skip_all:
4501     if ((!tidx) || g_is_last_thread_block) {
4502       THREAD_RETURN;
4503     }
4504     THREAD_BLOCK_FINISH(tidx);
4505   }
4506 }
4507 
4508 THREAD_RET_TYPE model_set_trend_thread(void* arg) {
4509   // Similar to model_set_domrec_thread().  (In fact, it's so similar that it
4510   // may be appropriate to merge the functions.)
4511   uintptr_t tidx = (uintptr_t)arg;
4512   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
4513   uint32_t assoc_thread_ct = g_assoc_thread_ct;
4514   uintptr_t perm_vec_ct = g_perm_vec_ct;
4515   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
4516   uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
4517   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
4518   uint32_t* git_homrar_cts = nullptr;
4519   uint32_t* git_missing_cts = nullptr;
4520   uint32_t* git_het_cts = nullptr;
4521   uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
4522   uint32_t* resultbuf = g_resultbuf;
4523   uint32_t case_ct = g_perm_case_ct;
4524   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
4525   double* msa_ptr = nullptr;
4526   uintptr_t* loadbuf;
4527   uintptr_t* loadbuf_cur;
4528   uint32_t* __restrict__ missing_cts;
4529   uint32_t* __restrict__ het_cts;
4530   uint32_t* __restrict__ homcom_cts;
4531   uintptr_t pidx;
4532   uintptr_t marker_idx;
4533   intptr_t tot_obs;
4534   uint32_t block_start;
4535   uint32_t marker_bidx_start;
4536   uint32_t marker_bidx;
4537   uint32_t marker_bceil;
4538   uint32_t case_com_ct;
4539   uint32_t case_missing_ct;
4540   uint32_t missing_ct;
4541   uint32_t het_ct;
4542   uint32_t homcom_ct;
4543   while (1) {
4544     block_start = g_block_start;
4545     if (g_block_diff <= assoc_thread_ct) {
4546       if (g_block_diff <= tidx) {
4547 	goto model_set_trend_thread_skip_all;
4548       }
4549       marker_bidx_start = block_start + tidx;
4550       marker_bceil = marker_bidx_start + 1;
4551     } else {
4552       marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
4553       marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
4554     }
4555     marker_bidx = marker_bidx_start;
4556     loadbuf = g_loadbuf;
4557     missing_cts = g_missing_cts;
4558     het_cts = g_het_cts;
4559     homcom_cts = g_homcom_cts;
4560     for (; marker_bidx < marker_bceil; marker_bidx++) {
4561       marker_idx = g_adapt_m_table[marker_bidx];
4562       msa_ptr = &(g_mperm_save_all[marker_bidx * perm_vec_ct]);
4563       missing_ct = missing_cts[marker_idx];
4564       tot_obs = pheno_nm_ct - missing_ct;
4565       het_ct = het_cts[marker_idx];
4566       homcom_ct = homcom_cts[marker_idx];
4567       loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
4568       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
4569       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
4570       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
4571       fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
4572       calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
4573       fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
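      // For each permutation, rebuild the case common-allele count from the
      // git tallies (2 * homcom + het) and record the Cochran-Armitage trend
      // statistic for the set test.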
4574       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
4575 	case_missing_ct = git_missing_cts[pidx];
4576 	case_com_ct = 2 * (case_ct - case_missing_ct - git_homrar_cts[pidx]) - git_het_cts[pidx];
4577 	*msa_ptr++ = ca_trend_eval(case_com_ct, case_ct - case_missing_ct, het_ct, homcom_ct, tot_obs);
4578       }
4579     }
4580   model_set_trend_thread_skip_all:
4581     if ((!tidx) || g_is_last_thread_block) {
4582       THREAD_RETURN;
4583     }
4584     THREAD_BLOCK_FINISH(tidx);
4585   }
4586 }
4587 
4588 THREAD_RET_TYPE model_adapt_gen_thread(void* arg) {
4589   uintptr_t tidx = (uintptr_t)arg;
4590   uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
4591   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
4592   uintptr_t perm_vec_ct = g_perm_vec_ct;
4593   uint32_t assoc_thread_ct = g_assoc_thread_ct;
4594   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
4595   uint32_t model_fisher = g_model_fisher;
4596   uint32_t fisher_midp = g_fisher_midp;
4597   uint32_t first_adapt_check = g_first_adapt_check;
4598   uint32_t case_ct = g_perm_case_ct;
4599   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
4600   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
4601   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
4602   unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
4603   double adaptive_intercept = g_adaptive_intercept;
4604   double adaptive_slope = g_adaptive_slope;
4605   double adaptive_ci_zt = g_adaptive_ci_zt;
4606   double aperm_alpha = g_aperm_alpha;
4607   uintptr_t* __restrict__ loadbuf;
4608   double* __restrict__ orig_pvals;
4609   double* __restrict__ orig_chisq;
4610   uint32_t* __restrict__ missing_cts;
4611   uint32_t* __restrict__ het_cts;
4612   uint32_t* __restrict__ homcom_cts;
4613   uintptr_t marker_idx;
4614   uintptr_t pidx;
4615   uint32_t marker_bidx;
4616   uint32_t marker_bceil;
4617   uint32_t success_2start;
4618   uint32_t success_2incr;
4619   uint32_t next_adapt_check;
4620   uint32_t missing_col;
4621   intptr_t tot_obs;
4622   intptr_t homcom_ct;
4623   intptr_t homrar_ct;
4624   intptr_t het_ct;
4625   uint32_t case_missing_ct;
4626   uint32_t case_het_ct;
4627   uint32_t case_homcom_ct;
4628   uint32_t uii;
4629   double stat_high;
4630   double stat_low;
4631   double pval;
4632   double dxx;
4633   double dyy;
4634   double dzz;
4635   while (1) {
4636     if (g_block_diff <= assoc_thread_ct) {
4637       if (g_block_diff <= tidx) {
4638         goto model_adapt_gen_thread_skip_all;
4639       }
4640       marker_bidx = g_block_start + tidx;
4641       marker_bceil = marker_bidx + 1;
4642     } else {
4643       marker_bidx = g_block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
4644       marker_bceil = g_block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
4645     }
4646     loadbuf = g_loadbuf;
4647     orig_pvals = g_orig_pvals;
4648     orig_chisq = g_orig_chisq;
4649     missing_cts = g_missing_cts;
4650     het_cts = g_het_cts;
4651     homcom_cts = g_homcom_cts;
4652     for (; marker_bidx < marker_bceil; marker_bidx++) {
4653       marker_idx = g_adapt_m_table[marker_bidx];
4654       if (model_fisher) {
4655         if (orig_pvals[marker_idx] == -9) {
4656           perm_adapt_stop[marker_idx] = 1;
4657           perm_attempt_ct[marker_idx] = 0;
4658           continue;
4659         }
4660 	stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
4661 	stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
4662       } else {
4663         if (orig_chisq[marker_idx] == -9) {
4664           perm_adapt_stop[marker_idx] = 1;
4665           perm_attempt_ct[marker_idx] = 0;
4666           continue;
4667         }
4668 	stat_high = orig_chisq[marker_idx] + EPSILON;
4669 	stat_low = orig_chisq[marker_idx] - EPSILON;
4670       }
4671       next_adapt_check = first_adapt_check;
4672       het_ct = het_cts[marker_idx];
4673       tot_obs = pheno_nm_ct - missing_cts[marker_idx];
4674       homcom_ct = homcom_cts[marker_idx];
4675       homrar_ct = tot_obs - het_ct - homcom_ct;
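      // missing_col flags degenerate 2x3 tables: when a genotype class is
      // absent from the full sample, the genotypic chi-square test collapses
      // to the corresponding 2x2 test below.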
4676       if (!homcom_ct) {
4677 	missing_col = 3;
4678       } else if ((het_ct + homcom_ct == tot_obs) || (!het_ct)) {
4679 	missing_col = 2; // either no hom A1s or no hets (no need to distinguish)
4680       } else {
4681 	missing_col = 0;
4682       }
4683       success_2start = perm_2success_ct[marker_idx];
4684       success_2incr = 0;
4685       for (pidx = 0; pidx < perm_vec_ct;) {
4686 	genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &case_het_ct, &case_homcom_ct);
4687 	if (model_fisher) {
4688 	  uii = case_ct - case_het_ct - case_homcom_ct - case_missing_ct;
4689 	  // This is very slow.  A precomputed 2-dimensional table could
4690 	  // improve matters, but I doubt it's worth the effort for now.
4691 	  dxx = fisher23(case_homcom_ct, case_het_ct, uii, homcom_ct - case_homcom_ct, het_ct - case_het_ct, homrar_ct - uii, fisher_midp);
4692 	  if (dxx < stat_low) {
4693 	    success_2incr += 2;
4694 	  } else if (dxx <= stat_high) {
4695 	    success_2incr++;
4696 	  }
4697 	} else {
4698 	  if (!missing_col) {
4699 	    dxx = chi23_eval(case_homcom_ct, case_het_ct, case_ct - case_missing_ct, homcom_ct, het_ct, tot_obs);
4700 	  } else if (missing_col == 3) {
4701 	    dxx = chi22_eval(case_het_ct, case_ct - case_missing_ct, het_ct, tot_obs);
4702 	  } else {
4703 	    dxx = chi22_eval(case_homcom_ct, case_ct - case_missing_ct, homcom_ct, tot_obs);
4704 	  }
4705 	  if (dxx > stat_high) {
4706 	    success_2incr += 2;
4707 	  } else if (dxx > stat_low) {
4708 	    success_2incr++;
4709 	  }
4710 	}
4711 	if (++pidx == next_adapt_check - pidx_offset) {
4712 	  uii = success_2start + success_2incr;
4713 	  if (uii) {
4714 	    pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
4715 	    dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
4716 	    dyy = pval - dxx; // lower bound
4717 	    dzz = pval + dxx; // upper bound
4718 	    if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
4719 	      perm_adapt_stop[marker_idx] = 1;
4720 	      perm_attempt_ct[marker_idx] = next_adapt_check;
4721 	      break;
4722 	    }
4723 	  }
4724 	  next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
4725 	}
4726       }
4727       perm_2success_ct[marker_idx] += success_2incr;
4728     }
4729   model_adapt_gen_thread_skip_all:
4730     if ((!tidx) || g_is_last_thread_block) {
4731       THREAD_RETURN;
4732     }
4733     THREAD_BLOCK_FINISH(tidx);
4734   }
4735 }
4736 
4737 THREAD_RET_TYPE model_maxt_gen_thread(void* arg) {
4738   uintptr_t tidx = (uintptr_t)arg;
4739   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
4740   uintptr_t perm_vec_ct = g_perm_vec_ct;
4741   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
4742   uint32_t assoc_thread_ct = g_assoc_thread_ct;
4743   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
4744   uint32_t model_fisher = g_model_fisher;
4745   uint32_t fisher_midp = g_fisher_midp;
4746   uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
4747   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
4748   uint32_t* git_homrar_cts = nullptr;
4749   uint32_t* git_missing_cts = nullptr;
4750   uint32_t* git_het_cts = nullptr;
4751   uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
4752   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
4753   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
4754   uint32_t case_ct = g_perm_case_ct;
4755   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
4756   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
4757   double* __restrict__ mperm_save_all = g_mperm_save_all;
4758   double* msa_ptr = nullptr;
4759   uintptr_t* __restrict__ loadbuf;
4760   uint32_t* __restrict__ missing_cts;
4761   uint32_t* __restrict__ het_cts;
4762   uint32_t* __restrict__ homcom_cts;
4763   double* __restrict__ orig_pvals;
4764   double* __restrict__ orig_chisq;
4765   uint16_t* ldrefs;
4766   uintptr_t* loadbuf_cur;
4767   uint32_t* resultbuf;
4768   uintptr_t pidx;
4769   uint32_t missing_col;
4770   intptr_t tot_obs;
4771   uintptr_t marker_idx;
4772   uint32_t block_start;
4773   uint32_t maxt_block_base;
4774   uint32_t maxt_block_base2;
4775   uint32_t marker_bidx_start;
4776   uint32_t maxt_block_base3;
4777   uint32_t marker_bidx;
4778   uint32_t marker_bceil;
4779   int32_t missing_ct;
4780   intptr_t homcom_ct;
4781   intptr_t homrar_ct;
4782   intptr_t het_ct;
4783   uint32_t ldref;
4784   uint32_t success_2incr;
4785   uint32_t case_missing_ct;
4786   uint32_t case_het_ct;
4787   uint32_t case_homcom_ct;
4788   uint32_t uii;
4789   double stat_high;
4790   double stat_low;
4791   double sval;
4792   while (1) {
4793     block_start = g_block_start;
4794     if (g_block_diff <= assoc_thread_ct) {
4795       if (g_block_diff <= tidx) {
4796         goto model_maxt_gen_thread_skip_all;
4797       }
4798       marker_bidx_start = block_start + tidx;
4799       marker_bceil = marker_bidx_start + 1;
4800     } else {
4801       marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
4802       marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
4803     }
4804     maxt_block_base = g_maxt_block_base;
4805     maxt_block_base2 = maxt_block_base + block_start;
4806     maxt_block_base3 = maxt_block_base + marker_bidx_start;
4807     marker_bidx = marker_bidx_start;
4808     marker_idx = maxt_block_base3;
4809     loadbuf = g_loadbuf;
4810     missing_cts = g_missing_cts;
4811     het_cts = g_het_cts;
4812     homcom_cts = g_homcom_cts;
4813     orig_pvals = g_orig_pvals;
4814     orig_chisq = g_orig_chisq;
4815     resultbuf = g_resultbuf;
4816     ldrefs = g_ldrefs;
4817 
4818     memcpy(results, &(g_maxt_extreme_stat[pidx_offset]), perm_vec_ct * sizeof(double));
4819     if (mperm_save_all) {
4820       msa_ptr = &(mperm_save_all[marker_idx * perm_vec_ct]);
4821     }
4822     for (; marker_bidx < marker_bceil; marker_bidx++) {
4823       if (model_fisher) {
4824 	if (orig_pvals[marker_idx] == -9) {
4825 	model_maxt_gen_thread_skip_marker:
4826 	  marker_idx++;
4827 	  if (msa_ptr) {
4828 	    for (pidx = 0; pidx < perm_vec_ct; ++pidx) {
4829 	      *msa_ptr++ = -9;
4830 	    }
4831 	  }
4832 	  continue;
4833 	}
4834 	stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
4835 	stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
4836       } else {
4837 	if (orig_chisq[marker_idx] == -9) {
4838 	  goto model_maxt_gen_thread_skip_marker;
4839 	}
4840 	stat_high = orig_chisq[marker_idx] + EPSILON;
4841 	stat_low = orig_chisq[marker_idx] - EPSILON;
4842       }
4843       missing_ct = missing_cts[marker_idx];
4844       het_ct = het_cts[marker_idx];
4845       tot_obs = pheno_nm_ct - missing_ct;
4846       homcom_ct = homcom_cts[marker_idx];
4847       homrar_ct = tot_obs - het_ct - homcom_ct;
4848       if (!homcom_ct) {
4849 	missing_col = 3;
4850       } else if ((het_ct + homcom_ct == tot_obs) || (!het_ct)) {
4851 	missing_col = 2;
4852       } else {
4853 	missing_col = 0;
4854       }
4855       success_2incr = 0;
4856       loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
4857       ldref = ldrefs[marker_idx];
4858       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
4859       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
4860       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
4861       if (ldref == 65535) {
4862 	ldref = marker_bidx;
4863 	if (pheno_nm_ct - homcom_ct > 50) {
4864 	  check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 50, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
4865 	}
4866 	ldrefs[marker_idx] = ldref;
4867       }
4868       if (ldref == marker_bidx) {
4869 	fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
4870 	calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
4871 	fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
4872       } else {
4873 	memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
4874 	calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
4875       }
4876       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
4877 	case_missing_ct = git_missing_cts[pidx];
4878 	case_het_ct = git_het_cts[pidx];
4879 	case_homcom_ct = case_ct - case_missing_ct - case_het_ct - git_homrar_cts[pidx];
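	// For Fisher tests the extreme statistic is the smallest p-value, so
	// results[] is minimized; for chi-square it is the largest statistic.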
4880 	if (model_fisher) {
4881 	  uii = case_ct - case_het_ct - case_homcom_ct - case_missing_ct;
4882 	  sval = fisher23(case_homcom_ct, case_het_ct, uii, homcom_ct - case_homcom_ct, het_ct - case_het_ct, homrar_ct - uii, fisher_midp);
4883 	  if (sval < stat_low) {
4884 	    success_2incr += 2;
4885 	  } else if (sval <= stat_high) {
4886 	    success_2incr++;
4887 	  }
4888 	  if (results[pidx] > sval) {
4889 	    results[pidx] = sval;
4890 	  }
4891 	} else {
4892 	  if (!missing_col) {
4893 	    sval = chi23_eval(case_homcom_ct, case_het_ct, case_ct - case_missing_ct, homcom_ct, het_ct, tot_obs);
4894 	  } else if (missing_col == 3) {
4895 	    sval = chi22_eval(case_het_ct, case_ct - case_missing_ct, het_ct, tot_obs);
4896 	  } else {
4897 	    sval = chi22_eval(case_homcom_ct, case_ct - case_missing_ct, homcom_ct, tot_obs);
4898 	  }
4899 	  if (sval > stat_high) {
4900 	    success_2incr += 2;
4901 	  } else if (sval > stat_low) {
4902 	    success_2incr++;
4903 	  }
4904 	  if (results[pidx] < sval) {
4905 	    results[pidx] = sval;
4906 	  }
4907 	}
4908 	if (msa_ptr) {
4909 	  *msa_ptr++ = sval;
4910 	}
4911       }
4912       perm_2success_ct[marker_idx++] += success_2incr;
4913     }
4914   model_maxt_gen_thread_skip_all:
4915     if ((!tidx) || g_is_last_thread_block) {
4916       THREAD_RETURN;
4917     }
4918     THREAD_BLOCK_FINISH(tidx);
4919   }
4920 }
4921 
4922 THREAD_RET_TYPE model_adapt_best_thread(void* arg) {
4923   uintptr_t tidx = (uintptr_t)arg;
4924   uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
4925   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
4926   uintptr_t perm_vec_ct = g_perm_vec_ct;
4927   uint32_t assoc_thread_ct = g_assoc_thread_ct;
4928   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
4929   uint32_t model_fisher = g_model_fisher;
4930   uint32_t fisher_midp = g_fisher_midp;
4931   uint32_t precomp_width = g_precomp_width;
4932   uint32_t first_adapt_check = g_first_adapt_check;
4933   uint32_t case_ct = g_perm_case_ct;
4934   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
4935   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
4936   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
4937   unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
4938   double adaptive_intercept = g_adaptive_intercept;
4939   double adaptive_slope = g_adaptive_slope;
4940   double adaptive_ci_zt = g_adaptive_ci_zt;
4941   double aperm_alpha = g_aperm_alpha;
4942   uintptr_t* __restrict__ loadbuf;
4943   uintptr_t* is_invalid;
4944   double* __restrict__ orig_pvals;
4945   double* __restrict__ orig_chisq;
4946   uint32_t* __restrict__ missing_cts;
4947   uint32_t* __restrict__ het_cts;
4948   uint32_t* __restrict__ homcom_cts;
4949   uint32_t* __restrict__ precomp_start;
4950   uint32_t* __restrict__ precomp_ui;
4951   uint32_t* gpui;
4952   uintptr_t marker_idx;
4953   uintptr_t pidx;
4954   uint32_t marker_bidx;
4955   uint32_t marker_bceil;
4956   uint32_t success_2start;
4957   uint32_t success_2incr;
4958   uint32_t next_adapt_check;
4959   intptr_t tot_obs;
4960   intptr_t com_ct;
4961   intptr_t het_ct;
4962   intptr_t homrar_ct;
4963   intptr_t homcom_ct;
4964   uint32_t missing_start;
4965   uint32_t case_homrar_ct;
4966   uint32_t case_homcom_ct;
4967   uint32_t case_het_ct;
4968   uint32_t case_missing_ct;
4969   uint32_t case_com_ct;
4970   uint32_t skip_domrec;
4971   uint32_t uii;
4972   uint32_t ujj;
4973   uint32_t ukk;
4974   double stat_high;
4975   double stat_low;
4976   double pval;
4977   double dxx;
4978   double dyy;
4979   double dzz;
4980   while (1) {
4981     if (g_block_diff <= assoc_thread_ct) {
4982       if (g_block_diff <= tidx) {
4983         goto model_adapt_best_thread_skip_all;
4984       }
4985       marker_bidx = g_block_start + tidx;
4986       marker_bceil = marker_bidx + 1;
4987     } else {
4988       marker_bidx = g_block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
4989       marker_bceil = g_block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
4990     }
4991     loadbuf = g_loadbuf;
4992     is_invalid = g_is_invalid_bitfield;
4993     orig_pvals = g_orig_pvals;
4994     orig_chisq = g_orig_chisq;
4995     missing_cts = g_missing_cts;
4996     het_cts = g_het_cts;
4997     homcom_cts = g_homcom_cts;
4998     precomp_start = g_precomp_start;
4999     precomp_ui = g_precomp_ui;
5000 
5001     for (; marker_bidx < marker_bceil; marker_bidx++) {
5002       marker_idx = g_adapt_m_table[marker_bidx];
5003       if (model_fisher) {
5004         if (orig_pvals[marker_idx] == -9) {
5005 	  perm_adapt_stop[marker_idx] = 1;
5006 	  perm_attempt_ct[marker_idx] = 0;
5007 	  continue;
5008         }
5009 	stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
5010 	stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
5011       } else {
5012 	if (orig_chisq[marker_idx] == -9) {
5013 	  perm_adapt_stop[marker_idx] = 1;
5014 	  perm_attempt_ct[marker_idx] = 0;
5015 	  continue;
5016 	}
5017 	stat_high = orig_chisq[marker_idx] + EPSILON;
5018 	stat_low = orig_chisq[marker_idx] - EPSILON;
5019       }
5020       next_adapt_check = first_adapt_check;
5021       tot_obs = pheno_nm_ct - missing_cts[marker_idx];
5022       het_ct = het_cts[marker_idx];
5023       homcom_ct = homcom_cts[marker_idx];
5024       com_ct = homcom_ct * 2 + het_ct;
5025       homrar_ct = tot_obs - het_ct - homcom_ct;
5026       missing_start = precomp_start[marker_bidx];
5027       skip_domrec = IS_SET(is_invalid, marker_idx);
5028       gpui = &(precomp_ui[12 * precomp_width * marker_bidx]);
5029       success_2start = perm_2success_ct[marker_idx];
5030       success_2incr = 0;
5031       for (pidx = 0; pidx < perm_vec_ct;) {
5032 	genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &case_het_ct, &case_homcom_ct);
5033 	case_homrar_ct = case_ct - case_missing_ct - case_het_ct - case_homcom_ct;
5034 	case_com_ct = case_het_ct + 2 * case_homcom_ct;
5035 	ujj = 0; // best increment so far
5036 	// deliberate underflow
5037 	uii = (uint32_t)(case_missing_ct - missing_start);
5038 	if (uii < precomp_width) {
5039 	  if (case_com_ct < gpui[12 * uii]) {
5040 	    if (case_com_ct < gpui[12 * uii + 2]) {
5041 	      goto model_adapt_best_thread_betterstat;
5042 	    } else {
5043 	      ujj = 1;
5044 	    }
5045 	  } else {
5046 	    if (case_com_ct >= gpui[12 * uii + 1]) {
5047 	      if (case_com_ct >= gpui[12 * uii + 3]) {
5048 		goto model_adapt_best_thread_betterstat;
5049 	      } else {
5050 		ujj = 1;
5051 	      }
5052 	    }
5053 	  }
5054 	  if (!skip_domrec) {
5055 	    if (case_homcom_ct < gpui[12 * uii + 4]) {
5056 	      if (case_homcom_ct < gpui[12 * uii + 6]) {
5057 		goto model_adapt_best_thread_betterstat;
5058 	      } else {
5059 		ujj = 1;
5060 	      }
5061 	    } else {
5062 	      if (case_homcom_ct >= gpui[12 * uii + 5]) {
5063 		if (case_homcom_ct >= gpui[12 * uii + 7]) {
5064 		  goto model_adapt_best_thread_betterstat;
5065 		} else {
5066 		  ujj = 1;
5067 		}
5068 	      }
5069 	    }
5070 	    if (case_homrar_ct < gpui[12 * uii + 8]) {
5071 	      if (case_homrar_ct < gpui[12 * uii + 10]) {
5072 		goto model_adapt_best_thread_betterstat;
5073 	      } else {
5074 		ujj = 1;
5075 	      }
5076 	    } else {
5077 	      if (case_homrar_ct >= gpui[12 * uii + 9]) {
5078 		if (case_homrar_ct >= gpui[12 * uii + 11]) {
5079 		  goto model_adapt_best_thread_betterstat;
5080 		} else {
5081 		  ujj = 1;
5082 		}
5083 	      }
5084 	    }
5085 	  }
5086 	} else if (1) {
5087 	  uii = case_ct - case_missing_ct; // nonmissing cases
5088 	  if (model_fisher) {
5089 	    ukk = tot_obs - uii; // nonmissing controls
5090 	    dxx = fisher22(case_com_ct, 2 * uii - case_com_ct, com_ct - case_com_ct, 2 * ukk + case_com_ct - com_ct, fisher_midp);
5091 	    if (dxx < stat_low) {
5092 	      goto model_adapt_best_thread_betterstat;
5093 	    } else if (dxx <= stat_high) {
5094 	      ujj = 1;
5095 	    }
5096 	    if (!skip_domrec) {
5097 	      dxx = fisher22(case_homcom_ct, uii - case_homcom_ct, homcom_ct - case_homcom_ct, ukk + case_homcom_ct - homcom_ct, fisher_midp);
5098 	      if (dxx < stat_low) {
5099 		goto model_adapt_best_thread_betterstat;
5100 	      } else if (dxx <= stat_high) {
5101 		ujj = 1;
5102 	      }
5103 	      dxx = fisher22(case_homrar_ct, uii - case_homrar_ct, homrar_ct - case_homrar_ct, ukk + case_homrar_ct - homrar_ct, fisher_midp);
5104 	      if (dxx < stat_low) {
5105 		goto model_adapt_best_thread_betterstat;
5106 	      } else if (dxx <= stat_high) {
5107 		ujj = 1;
5108 	      }
5109 	    }
5110 	  } else {
5111 	    dxx = chi22_eval(case_com_ct, 2 * uii, com_ct, 2 * tot_obs);
5112 	    if (dxx > stat_high) {
5113 	      goto model_adapt_best_thread_betterstat;
5114 	    } else if (dxx > stat_low) {
5115 	      ujj = 1;
5116 	    }
5117 	    if (!skip_domrec) {
5118 	      dxx = chi22_eval(case_homcom_ct, uii, homcom_ct, tot_obs);
5119 	      if (dxx > stat_high) {
5120 		goto model_adapt_best_thread_betterstat;
5121 	      } else if (dxx > stat_low) {
5122 		ujj = 1;
5123 	      }
5124 	      dxx = chi22_eval(case_homrar_ct, uii, homrar_ct, tot_obs);
5125 	      if (dxx > stat_high) {
5126 		goto model_adapt_best_thread_betterstat;
5127 	      } else if (dxx > stat_low) {
5128 		ujj = 1;
5129 	      }
5130 	    }
5131 	  }
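	  // (The else arm below is reachable only via the gotos above; it gives
	  // the "beats the original statistic" paths a shared epilogue.)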
5132 	} else {
5133 	model_adapt_best_thread_betterstat:
5134 	  ujj = 2;
5135 	}
5136 	success_2incr += ujj;
5137 	if (++pidx == next_adapt_check - pidx_offset) {
5138 	  uii = success_2start + success_2incr;
5139 	  if (uii) {
5140 	    pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
5141 	    dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
5142 	    dyy = pval - dxx; // lower bound
5143 	    dzz = pval + dxx; // upper bound
5144 	    if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
5145 	      perm_adapt_stop[marker_idx] = 1;
5146 	      perm_attempt_ct[marker_idx] = next_adapt_check;
5147 	      break;
5148 	    }
5149 	  }
5150 	  next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
5151 	}
5152       }
5153       perm_2success_ct[marker_idx] += success_2incr;
5154     }
5155   model_adapt_best_thread_skip_all:
5156     if ((!tidx) || g_is_last_thread_block) {
5157       THREAD_RETURN;
5158     }
5159     THREAD_BLOCK_FINISH(tidx);
5160   }
5161 }
5162 
5163 THREAD_RET_TYPE model_maxt_best_thread(void* arg) {
5164   uintptr_t tidx = (uintptr_t)arg;
5165   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
5166   uintptr_t perm_vec_ct = g_perm_vec_ct;
5167   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
5168   uint32_t assoc_thread_ct = g_assoc_thread_ct;
5169   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
5170   uint32_t model_fisher = g_model_fisher;
5171   uint32_t fisher_midp = g_fisher_midp;
5172   uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
5173   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
5174   uint32_t* git_homrar_cts = nullptr;
5175   uint32_t* git_missing_cts = nullptr;
5176   uint32_t* git_het_cts = nullptr;
5177   uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
5178   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
5179   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
5180   uint32_t precomp_width = g_precomp_width;
5181   uint32_t case_ct = g_perm_case_ct;
5182   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
5183   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
5184   double* __restrict__ mperm_save_all = g_mperm_save_all;
5185   double* msa_ptr = nullptr;
5186   uintptr_t* __restrict__ loadbuf;
5187   uintptr_t* is_invalid;
5188   uint32_t* __restrict__ missing_cts;
5189   uint32_t* __restrict__ het_cts;
5190   uint32_t* __restrict__ homcom_cts;
5191   uint32_t* __restrict__ precomp_start;
5192   uint32_t* __restrict__ precomp_ui;
5193   double* __restrict__ precomp_d;
5194   double* __restrict__ orig_pvals;
5195   double* __restrict__ orig_chisq;
5196   uint16_t* ldrefs;
5197   uintptr_t* loadbuf_cur;
5198   uint32_t* resultbuf;
5199   uint32_t* gpui;
5200   double* gpd;
5201   uintptr_t pidx;
5202   uintptr_t marker_idx;
5203   int32_t missing_ct;
5204   intptr_t tot_obs;
5205   intptr_t com_ct;
5206   intptr_t rar_ct;
5207   intptr_t het_ct;
5208   intptr_t homrar_ct;
5209   intptr_t homcom_ct;
5210   uint32_t block_start;
5211   uint32_t maxt_block_base;
5212   uint32_t maxt_block_base2;
5213   uint32_t marker_bidx_start;
5214   uint32_t maxt_block_base3;
5215   uint32_t marker_bidx;
5216   uint32_t marker_bceil;
5217   uint32_t ldref;
5218   uint32_t success_2incr;
5219   uint32_t missing_start;
5220   uint32_t case_homrar_ct;
5221   uint32_t case_homcom_ct;
5222   uint32_t case_het_ct;
5223   uint32_t case_missing_ct;
5224   uint32_t case_com_ct;
5225   uint32_t skip_domrec;
5226   uint32_t uii;
5227   uint32_t ujj;
5228   uint32_t ukk;
5229   uint32_t cur_add;
5230   double stat_high;
5231   double stat_low;
5232   double sval;
5233   double best_stat;
5234   double default_best_stat;
5235   while (1) {
5236     block_start = g_block_start;
5237     if (g_block_diff <= assoc_thread_ct) {
5238       if (g_block_diff <= tidx) {
5239         goto model_maxt_best_thread_skip_all;
5240       }
5241       marker_bidx_start = block_start + tidx;
5242       marker_bceil = marker_bidx_start + 1;
5243     } else {
5244       marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
5245       marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
5246     }
5247     maxt_block_base = g_maxt_block_base;
5248     maxt_block_base2 = maxt_block_base + block_start;
5249     maxt_block_base3 = maxt_block_base + marker_bidx_start;
5250     marker_bidx = marker_bidx_start;
5251     marker_idx = maxt_block_base3;
5252     loadbuf = g_loadbuf;
5253     is_invalid = g_is_invalid_bitfield;
5254     missing_cts = g_missing_cts;
5255     het_cts = g_het_cts;
5256     homcom_cts = g_homcom_cts;
5257     precomp_start = g_precomp_start;
5258     precomp_ui = g_precomp_ui;
5259     precomp_d = g_precomp_d;
5260     orig_pvals = g_orig_pvals;
5261     orig_chisq = g_orig_chisq;
5262     resultbuf = g_resultbuf;
5263     ldrefs = g_ldrefs;
5264 
5265     memcpy(results, &(g_maxt_extreme_stat[pidx_offset]), perm_vec_ct * sizeof(double));
5266     if (mperm_save_all) {
5267       msa_ptr = &(mperm_save_all[marker_idx * perm_vec_ct]);
5268     }
5269     for (; marker_bidx < marker_bceil; marker_bidx++) {
5270       if (model_fisher) {
5271 	if (orig_pvals[marker_idx] == -9) {
5272 	  marker_idx++;
5273 	  continue;
5274 	}
5275 	stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
5276 	stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
5277 	default_best_stat = 1;
5278       } else {
5279 	if (orig_chisq[marker_idx] == -9) {
5280 	  marker_idx++;
5281 	  continue;
5282 	}
5283 	stat_high = orig_chisq[marker_idx] + EPSILON;
5284 	stat_low = orig_chisq[marker_idx] - EPSILON;
5285 	default_best_stat = 0;
5286       }
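      // default_best_stat is the neutral starting value for the per-permutation
      // best statistic: p-value 1 when Fisher results are minimized, chi-square
      // 0 when they are maximized.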
5287       gpd = &(precomp_d[6 * precomp_width * marker_bidx]);
5288       missing_ct = missing_cts[marker_idx];
5289       tot_obs = pheno_nm_ct - missing_ct;
5290       het_ct = het_cts[marker_idx];
5291       homcom_ct = homcom_cts[marker_idx];
5292       com_ct = 2 * homcom_ct + het_ct;
5293       rar_ct = tot_obs * 2 - com_ct;
5294       homrar_ct = tot_obs - homcom_ct - het_ct;
5295       missing_start = precomp_start[marker_bidx];
5296       skip_domrec = IS_SET(is_invalid, marker_idx);
5297       gpui = &(precomp_ui[18 * precomp_width * marker_bidx]);
5298       success_2incr = 0;
5299       loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
5300       ldref = ldrefs[marker_idx];
5301       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
5302       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
5303       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
5304       if (ldref == 65535) {
5305 	ldref = marker_bidx;
5306 	if (pheno_nm_ct - homcom_ct > 50) {
5307 	  check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 50, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
5308 	}
5309 	ldrefs[marker_idx] = ldref;
5310       }
5311       if (ldref == marker_bidx) {
5312 	fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
5313 	calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
5314 	fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
5315       } else {
5316 	memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
5317 	calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
5318       }
5319       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
5320 	case_missing_ct = git_missing_cts[pidx];
5321 	case_het_ct = git_het_cts[pidx];
5322 	case_homrar_ct = git_homrar_cts[pidx];
5323 	case_homcom_ct = case_ct - case_missing_ct - case_het_ct - case_homrar_ct;
5324 	case_com_ct = case_het_ct + 2 * case_homcom_ct;
5325 	cur_add = 0;
5326 	// deliberate underflow
5327 	uii = (uint32_t)(case_missing_ct - missing_start);
5328 	if (uii < precomp_width) {
5329 	  best_stat = default_best_stat;
5330 	  if (case_com_ct < gpui[18 * uii]) {
5331 	    if (case_com_ct < gpui[18 * uii + 2]) {
5332 	      cur_add = 2;
5333 	    } else {
5334 	      cur_add = 1;
5335 	    }
5336 	  } else {
5337 	    if (case_com_ct >= gpui[18 * uii + 1]) {
5338 	      if (case_com_ct >= gpui[18 * uii + 3]) {
5339 		cur_add = 2;
5340 	      } else {
5341 		cur_add = 1;
5342 	      }
5343 	    }
5344 	  }
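	  // As in the trend test, within the window given by gpui[18u+4..5] the
	  // allelic component is left at default_best_stat; outside it, the
	  // exact tail p-value or chi-square is rebuilt from the precomputed
	  // gpd[] entries.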
5345 	  ukk = gpui[18 * uii + 4];
5346 	  ujj = (uint32_t)(case_com_ct - ukk); // deliberate underflow
5347 	  if (ujj >= gpui[18 * uii + 5]) {
5348 	    if (model_fisher) {
5349 	      ujj = 2 * (case_ct - case_missing_ct);
5350 	      best_stat = fisher22_tail_pval(ukk, ujj - ukk, com_ct - ukk, rar_ct + ukk - ujj, gpui[18 * uii + 5] - 1, gpd[6 * uii], gpd[6 * uii + 1], fisher_midp, case_com_ct);
5351 	    } else {
5352 	      best_stat = ((double)((intptr_t)case_com_ct)) - gpd[6 * uii];
5353 	      best_stat = best_stat * best_stat * gpd[6 * uii + 1];
5354 	    }
5355 	  }
5356 	  if (!skip_domrec) {
5357 	    if (cur_add != 2) {
5358 	      if (case_homcom_ct < gpui[18 * uii + 6]) {
5359 		if (case_homcom_ct < gpui[18 * uii + 8]) {
5360 		  goto model_maxt_best_thread_domrec2;
5361 		} else {
5362 		  cur_add = 1;
5363 		}
5364 	      } else {
5365 		if (case_homcom_ct >= gpui[18 * uii + 7]) {
5366 		  if (case_homcom_ct >= gpui[18 * uii + 9]) {
5367 		    goto model_maxt_best_thread_domrec2;
5368 		  } else {
5369 		    cur_add = 1;
5370 		  }
5371 		}
5372 	      }
5373 	      if (1) {
5374 		if (case_homrar_ct < gpui[18 * uii + 12]) {
5375 		  if (case_homrar_ct < gpui[18 * uii + 14]) {
5376 		    goto model_maxt_best_thread_domrec2;
5377 		  } else {
5378 		    cur_add = 1;
5379 		  }
5380 		} else {
5381 		  if (case_homrar_ct >= gpui[18 * uii + 13]) {
5382 		    if (case_homrar_ct >= gpui[18 * uii + 15]) {
5383 		      goto model_maxt_best_thread_domrec2;
5384 		    } else {
5385 		      cur_add = 1;
5386 		    }
5387 		  }
5388 		}
5389 	      } else {
5390 	      model_maxt_best_thread_domrec2:
5391 		cur_add = 2;
5392 	      }
5393 	    }
5394 	    ukk = gpui[18 * uii + 10];
5395 	    ujj = (uint32_t)(case_homcom_ct - ukk); // deliberate underflow
5396 	    if (ujj >= gpui[18 * uii + 11]) {
5397 	      if (model_fisher) {
5398 		ujj = case_ct - case_missing_ct;
5399 		sval = fisher22_tail_pval(ukk, ujj - ukk, homcom_ct - ukk, homrar_ct + het_ct + ukk - ujj, gpui[18 * uii + 11] - 1, gpd[6 * uii + 2], gpd[6 * uii + 3], fisher_midp, case_homcom_ct);
5400 		if (sval < best_stat) {
5401 		  best_stat = sval;
5402 		}
5403 	      } else {
5404 		sval = ((double)((intptr_t)case_homcom_ct)) - gpd[6 * uii + 2];
5405 		sval = sval * sval * gpd[6 * uii + 3];
5406 		if (sval > best_stat) {
5407 		  best_stat = sval;
5408 		}
5409 	      }
5410 	    }
5411 	    ukk = gpui[18 * uii + 16];
5412 	    ujj = (uint32_t)(case_homrar_ct - ukk); // deliberate underflow
5413 	    if (ujj >= gpui[18 * uii + 17]) {
5414 	      if (model_fisher) {
5415 		ujj = case_ct - case_missing_ct;
5416 		sval = fisher22_tail_pval(ukk, ujj - ukk, homrar_ct - ukk, homcom_ct + het_ct + ukk - ujj, gpui[18 * uii + 17] - 1, gpd[6 * uii + 4], gpd[6 * uii + 5], fisher_midp, case_homrar_ct);
5417 		if (sval < best_stat) {
5418 		  best_stat = sval;
5419 		}
5420 	      } else {
5421 		sval = ((double)((intptr_t)case_homrar_ct)) - gpd[6 * uii + 4];
5422 		sval = sval * sval * gpd[6 * uii + 5];
5423 		if (sval > best_stat) {
5424 		  best_stat = sval;
5425 		}
5426 	      }
5427 	    }
5428 	  }
5429 	} else {
5430 	  uii = case_ct - case_missing_ct;
5431 	  if (model_fisher) {
5432 	    ukk = tot_obs - uii;
5433 	    best_stat = fisher22(case_com_ct, 2 * uii - case_com_ct, com_ct - case_com_ct, 2 * ukk + case_com_ct - com_ct, fisher_midp);
5434 	    if (!skip_domrec) {
5435 	      sval = fisher22(case_homcom_ct, uii - case_homcom_ct, homcom_ct - case_homcom_ct, ukk + case_homcom_ct - homcom_ct, fisher_midp);
5436 	      if (sval < best_stat) {
5437 		best_stat = sval;
5438 	      }
5439 	      sval = fisher22(case_homrar_ct, uii - case_homrar_ct, homrar_ct - case_homrar_ct, ukk + case_homrar_ct - homrar_ct, fisher_midp);
5440 	      if (sval < best_stat) {
5441 		best_stat = sval;
5442 	      }
5443 	    }
5444 	    if (best_stat < stat_low) {
5445 	      cur_add = 2;
5446 	    } else if (best_stat <= stat_high) {
5447 	      cur_add = 1;
5448 	    }
5449 	  } else {
5450 	    best_stat = chi22_eval(case_com_ct, 2 * uii, com_ct, 2 * tot_obs);
5451 	    if (!skip_domrec) {
5452 	      sval = chi22_eval(case_homcom_ct, uii, homcom_ct, tot_obs);
5453 	      if (sval > best_stat) {
5454 		best_stat = sval;
5455 	      }
5456 	      sval = chi22_eval(case_homrar_ct, uii, homrar_ct, tot_obs);
5457 	      if (sval > best_stat) {
5458 		best_stat = sval;
5459 	      }
5460 	    }
5461 	    if (best_stat > stat_high) {
5462 	      cur_add = 2;
5463 	    } else if (best_stat > stat_low) {
5464 	      cur_add = 1;
5465 	    }
5466 	  }
5467 	  if (msa_ptr) {
5468 	    *msa_ptr++ = best_stat;
5469 	  }
5470 	}
5471 	success_2incr += cur_add;
5472 	if (model_fisher) {
5473 	  if (results[pidx] > best_stat) {
5474 	    results[pidx] = best_stat;
5475 	  }
5476 	} else {
5477 	  if (results[pidx] < best_stat) {
5478 	    results[pidx] = best_stat;
5479 	  }
5480 	}
5481       }
5482       perm_2success_ct[marker_idx++] += success_2incr;
5483     }
5484   model_maxt_best_thread_skip_all:
5485     if ((!tidx) || g_is_last_thread_block) {
5486       THREAD_RETURN;
5487     }
5488     THREAD_BLOCK_FINISH(tidx);
5489   }
5490 }
5491 
5492 THREAD_RET_TYPE model_set_best_thread(void* arg) {
5493   // Similar to model_set_domrec_thread().
5494   uintptr_t tidx = (uintptr_t)arg;
5495   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
5496   uint32_t assoc_thread_ct = g_assoc_thread_ct;
5497   uintptr_t perm_vec_ct = g_perm_vec_ct;
5498   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
5499   uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
5500   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
5501   uint32_t* git_homrar_cts = nullptr;
5502   uint32_t* git_missing_cts = nullptr;
5503   uint32_t* git_het_cts = nullptr;
5504   uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
5505   uint32_t* resultbuf = g_resultbuf;
5506   uint32_t case_ct = g_perm_case_ct;
5507   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
5508   double* msa_ptr = nullptr;
5509   uintptr_t* loadbuf;
5510   uintptr_t* loadbuf_cur;
5511   uintptr_t* is_invalid;
5512   uint32_t* __restrict__ missing_cts;
5513   uint32_t* __restrict__ het_cts;
5514   uint32_t* __restrict__ homcom_cts;
5515   uintptr_t pidx;
5516   uintptr_t marker_idx;
5517   intptr_t tot_obs;
5518   intptr_t com_ct;
5519   intptr_t het_ct;
5520   intptr_t homrar_ct;
5521   intptr_t homcom_ct;
5522   double best_stat;
5523   double sval;
5524   uint32_t block_start;
5525   uint32_t marker_bidx_start;
5526   uint32_t marker_bidx;
5527   uint32_t marker_bceil;
5528   uint32_t case_homrar_ct;
5529   uint32_t case_homcom_ct;
5530   uint32_t case_het_ct;
5531   uint32_t case_missing_ct;
5532   uint32_t case_com_ct;
5533   uint32_t skip_domrec;
5534   uint32_t uii;
5535   int32_t missing_ct;
5536   while (1) {
5537     block_start = g_block_start;
5538     if (g_block_diff <= assoc_thread_ct) {
5539       if (g_block_diff <= tidx) {
5540 	goto model_set_best_thread_skip_all;
5541       }
5542       marker_bidx_start = block_start + tidx;
5543       marker_bceil = marker_bidx_start + 1;
5544     } else {
5545       marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
5546       marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
5547     }
5548     marker_bidx = marker_bidx_start;
5549     loadbuf = g_loadbuf;
5550     is_invalid = g_is_invalid_bitfield;
5551     missing_cts = g_missing_cts;
5552     het_cts = g_het_cts;
5553     homcom_cts = g_homcom_cts;
5554     for (; marker_bidx < marker_bceil; marker_bidx++) {
5555       marker_idx = g_adapt_m_table[marker_bidx];
5556       msa_ptr = &(g_mperm_save_all[marker_bidx * perm_vec_ct]);
5557       missing_ct = missing_cts[marker_idx];
5558       tot_obs = pheno_nm_ct - missing_ct;
5559       het_ct = het_cts[marker_idx];
5560       homcom_ct = homcom_cts[marker_idx];
5561       com_ct = 2 * homcom_ct + het_ct;
5562       homrar_ct = tot_obs - homcom_ct - het_ct;
5563       skip_domrec = IS_SET(is_invalid, marker_idx);
5564       loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
5565       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
5566       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
5567       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
5568       fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
5569       calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
5570       fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
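      // For each permutation, keep the largest of the allelic, homozygous-common,
      // and homozygous-rare 2x2 chi-squares (the latter two skipped when the
      // marker is flagged invalid for dom/rec tests) and record it for the set
      // test.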
5571       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
5572 	case_missing_ct = git_missing_cts[pidx];
5573 	case_het_ct = git_het_cts[pidx];
5574 	case_homrar_ct = git_homrar_cts[pidx];
5575 	case_homcom_ct = case_ct - case_missing_ct - case_het_ct - case_homrar_ct;
5576 	case_com_ct = case_het_ct + 2 * case_homcom_ct;
5577 	uii = case_ct - case_missing_ct;
5578 	best_stat = chi22_eval(case_com_ct, 2 * uii, com_ct, 2 * tot_obs);
5579 	if (!skip_domrec) {
5580           sval = chi22_eval(case_homcom_ct, uii, homcom_ct, tot_obs);
5581 	  if (sval > best_stat) {
5582 	    best_stat = sval;
5583 	  }
5584 	  sval = chi22_eval(case_homrar_ct, uii, homrar_ct, tot_obs);
5585           if (sval > best_stat) {
5586             best_stat = sval;
5587 	  }
5588 	}
5589 	*msa_ptr++ = best_stat;
5590       }
5591     }
5592   model_set_best_thread_skip_all:
5593     if ((!tidx) || g_is_last_thread_block) {
5594       THREAD_RETURN;
5595     }
5596     THREAD_BLOCK_FINISH(tidx);
5597   }
5598 }
5599 
5600 int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* outname_end2, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, uintptr_t* marker_exclude_mid, uintptr_t marker_ct_mid, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sex_male, Aperm_info* apip, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* founder_pnm, uint32_t gender_req, uint32_t ld_ignore_x, uint32_t hh_exists, uint32_t perm_batch_size, Set_info* sip, uintptr_t* loadbuf_raw) {
5601   // Could reuse more of the code in model_assoc() since there's considerable
5602   // overlap, but there are enough differences between the regular and set
5603   // permutation tests that separating this out and doing a fair bit of
5604   // cut-and-paste is justifiable (especially for the first version of this
5605   // function).
5606 
5607   // There are three levels of marker subsets here.
5608   // 1. marker_exclude_orig refers to all markers which passed QC filters, etc.
5609   //    This is needed to interpret the main set data structure.
5610   // 2. marker_exclude_mid refers to all markers contained in at least one set.
5611   //    This is a subset of marker_exclude_orig.  (They are identical if
5612   //    --gene-all was specified.)  It was used during the single-marker
5613   //    association test phase, and describes which markers orig_chisq[],
5614   //    g_missing_cts[], etc. elements initially refer to.
5615   // 3. Finally, the marker_exclude used for set-based permutation testing
5616   //    refers to all markers contained in at least one *significant* set.
5617   //    orig_chisq is collapsed before permutation to be congruent to this
5618   //    marker_exclude.
5619   unsigned char* bigstack_mark = g_bigstack_base;
5620   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
5621   uintptr_t* marker_exclude = marker_exclude_mid;
5622   uintptr_t* unstopped_markers = nullptr;
5623   uintptr_t* loadbuf = g_loadbuf;
5624   uintptr_t* sample_male_include2 = g_sample_male_include2;
5625   uintptr_t* perm_adapt_set_unstopped = nullptr;
5626   char* tbuf2 = &(g_textbuf[MAXLINELEN]);
5627   double* orig_chisq = g_orig_chisq;
5628   double* sorted_chisq_buf = nullptr;
5629   uint32_t* marker_idx_to_uidx = nullptr;
5630   uint32_t* sorted_marker_idx_buf = nullptr;
5631   uint32_t* proxy_arr = nullptr;
5632   uint32_t* perm_2success_ct = nullptr;
5633   uint32_t* perm_attempt_ct = nullptr;
5634   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
5635   uintptr_t marker_ct = marker_ct_mid;
5636   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
5637   uintptr_t ulii = 0;
5638   double adaptive_ci_zt = 0.0;
5639   uint32_t model_assoc = model_modifier & MODEL_ASSOC;
5640   uint32_t perm_count = model_modifier & MODEL_PERM_COUNT;
5641   uint32_t model_perm_best = !(model_modifier & MODEL_PMASK);
5642   uint32_t max_thread_ct = g_thread_ct;
5643   uint32_t perms_done = 0;
5644   int32_t x_code = chrom_info_ptr->xymt_codes[X_OFFSET];
5645   int32_t retval = 0;
5646   uintptr_t* set_incl;
5647   uintptr_t* loadbuf_ptr;
5648   double* orig_set_scores;
5649   double* chisq_pmajor;
5650   double* chisq_ptr;
5651   double* read_dptr;
5652   double* write_dptr;
5653   unsigned char* bigstack_mark2;
5654   uint32_t** setdefs;
5655   uint32_t** ld_map;
5656   uintptr_t marker_uidx;
5657   uintptr_t marker_midx;
5658   uintptr_t marker_idx;
5659   uintptr_t marker_idx2;
5660   uintptr_t set_ct;
5661   uintptr_t set_idx;
5662   uintptr_t perm_vec_ct;
5663   uintptr_t perm_vec_ctcl4m;
5664   uintptr_t pidx;
5665   double chisq_threshold;
5666   double dxx;
5667   uint32_t perms_total;
5668   uint32_t block_size;
5669   uint32_t block_end;
5670   uint32_t assoc_thread_ct;
5671   uint32_t chrom_fo_idx;
5672   uint32_t chrom_end;
5673   uint32_t is_x;
5674   uint32_t is_y;
5675   uint32_t min_ploidy_1;
5676   uint32_t marker_unstopped_ct;
5677   uint32_t is_last_block;
5678   uint32_t first_adapt_check;
5679   uint32_t max_sigset_size;
5680   uint32_t marker_bidx;
5681   uint32_t uii;
5682   if (sip->set_test_lambda > 1.0) {
5683     dxx = 1.0 / sip->set_test_lambda;
5684     chisq_ptr = orig_chisq;
5685     for (marker_midx = 0; marker_midx < marker_ct; marker_midx++) {
5686       *chisq_ptr *= dxx;
5687       chisq_ptr++;
5688     }
5689   }
5690   ulii = (uintptr_t)(outname_end - outname);
5691   // don't want to overwrite .assoc extension, etc.
5692   memcpy(tbuf2, outname, ulii);
5693   retval = set_test_common_init(threads, bedfile, bed_offset, tbuf2, &(tbuf2[ulii]), unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_ids, max_marker_id_len, marker_reverse, orig_chisq, sip, chrom_info_ptr, unfiltered_sample_ct, sex_male, founder_pnm, ld_ignore_x, hh_exists, "--assoc/--model", &marker_ct, &marker_exclude, &set_incl, &marker_idx_to_uidx, &setdefs, &set_ct, &max_sigset_size, &ld_map, &chisq_threshold, &orig_set_scores, &sorted_chisq_buf, &sorted_marker_idx_buf, &proxy_arr, &perm_adapt_set_unstopped, &perm_2success_ct, &perm_attempt_ct, &unstopped_markers);
5694   if (retval) {
5695     goto model_assoc_set_test_ret_1;
5696   }
5697   if (!set_ct) {
5698     goto model_assoc_set_test_write;
5699   }
5700   if (marker_ct_mid != marker_ct) {
5701     // collapse these arrays so the permutation inner loop is faster
5702     inplace_delta_collapse_arr((char*)g_missing_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
5703     if (model_assoc) {
5704       inplace_delta_collapse_arr((char*)g_set_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
5705     } else {
5706       inplace_delta_collapse_arr((char*)g_het_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
5707       inplace_delta_collapse_arr((char*)g_homcom_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
5708       if (model_perm_best) {
5709 	inplace_delta_collapse_bitfield(g_is_invalid_bitfield, marker_ct, marker_exclude_mid, marker_exclude);
5710       }
5711     }
5712   }
5713 
5714   if (model_modifier & MODEL_PERM) {
5715     perms_total = apip->max;
5716     first_adapt_check = (apip->min < apip->init_interval)? ((int32_t)apip->init_interval) : apip->min;
5717     adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)set_ct)));
5718   } else {
5719     perms_total = model_mperm_val;
5720     first_adapt_check = perms_total + 1;
5721   }
5722   for (uii = 0; uii < set_ct; uii++) {
5723     perm_attempt_ct[uii] = perms_total;
5724   }
5725   if (max_thread_ct > perms_total) {
5726     max_thread_ct = perms_total;
5727   }
5728   if (bigstack_init_sfmtp(max_thread_ct)) {
5729     goto model_assoc_set_test_ret_NOMEM;
5730   }
5731   marker_unstopped_ct = marker_ct;
5732   g_block_start = 0; // will be nonzero sometimes after LD-exploitation added
5733 
5734   // generate a permutation batch, efficiently compute chi-square stats for all
5735   // variants in at least one tested set, compute set score, compare to base
5736   // set score.
5737   bigstack_mark2 = g_bigstack_base;
5738  model_assoc_set_test_more_perms:
5739   if (perms_done) {
5740     uii = apip->init_interval;
5741     while (first_adapt_check <= perms_done) {
5742       first_adapt_check += (int32_t)(uii + ((int32_t)first_adapt_check) * apip->interval_slope);
5743     }
5744   }
5745   // perm_vec_ct memory allocation dependencies:
5746   //   g_perm_vecst: 16 * ((perm_vec_ct + 127) / 128) * pheno_nm_ct
5747   //   g_thread_git_wkspace: ((perm_vec_ct + 127) / 128) * 1152 * thread_ct
5748   //   g_resultbuf: MODEL_BLOCKSIZE * (4 * perm_vec_ct, CL-aligned) * 3
5749   //   g_perm_vecs: pheno_nm_ctv2 * sizeof(intptr_t) * perm_vec_ct
5750   //   g_mperm_save_all: MODEL_BLOCKSIZE * 8 * perm_vec_ct
5751   //   chisq_pmajor: marker_ct * 8 * perm_vec_ct
5752   // If we force perm_vec_ct to be a multiple of 128, then we have
5753   //   perm_vec_ct * (9 * max_thread_ct + 20 * MODEL_BLOCKSIZE +
5754   //                    pheno_nm_ct / 8 + sizeof(intptr_t) * pheno_nm_ctv2
5755   //                    + marker_ct * sizeof(double))
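  // Coefficient check for the expression below: 9 * 128 = 1152 (git
  // workspace per thread), (12 + 8) * 128 = 2560 per MODEL_BLOCKSIZE slot
  // (resultbuf + mperm_save_all), and 128 * (pheno_nm_ct / 8) =
  // 16 * pheno_nm_ct (transposed permutation matrix).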
5756   perm_vec_ct = 128 * (bigstack_left() / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 1152LL * max_thread_ct + 2560LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct + 128LL * sizeof(double) * marker_ct));
5757   if (perm_vec_ct > perm_batch_size) {
5758     perm_vec_ct = perm_batch_size;
5759   }
5760   if (perm_vec_ct > perms_total - perms_done) {
5761     perm_vec_ct = perms_total - perms_done;
5762   } else if (!perm_vec_ct) {
5763     goto model_assoc_set_test_ret_NOMEM;
5764   }
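  // round up so each per-permutation-type uint32 count array in resultbuf
  // spans a whole number of cache lines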
5765   perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
5766   perms_done += perm_vec_ct;
5767   g_perms_done = perms_done;
5768   g_perm_vec_ct = perm_vec_ct;
5769   bigstack_alloc_ul(perm_vec_ct * pheno_nm_ctv2, &g_perm_vecs);
5770   g_perm_generation_thread_ct = MINV(max_thread_ct, perm_vec_ct);
5771   ulii = 0;
5772   if (!g_perm_cluster_starts) {
5773     if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
5774       goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
5775     }
5776     generate_cc_perms_thread((void*)ulii);
5777   } else {
5778     if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
5779       goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
5780     }
5781     generate_cc_cluster_perms_thread((void*)ulii);
5782   }
5783   join_threads(threads, g_perm_generation_thread_ct);
5784   g_assoc_thread_ct = max_thread_ct;
5785   bigstack_alloc_ui(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE, &g_resultbuf);
5786 #ifdef __LP64__
5787   ulii = ((perm_vec_ct + 127) / 128) * 4;
5788   bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
5789 #else
5790   ulii = (perm_vec_ct + 31) / 32;
5791   bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
5792   ulii = ((perm_vec_ct + 63) / 64) * 2;
5793 #endif
5794   bigstack_calloc_ui(ulii * 72 * max_thread_ct, &g_thread_git_wkspace);
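  // transposed, sample-major copy of the case/control permutation assignments
  // (one bit per permutation), consumed by calc_git()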
5795   transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
5796   bigstack_alloc_d(MODEL_BLOCKSIZE * perm_vec_ct, &g_mperm_save_all);
5797   bigstack_alloc_d(marker_ct * perm_vec_ct, &chisq_pmajor);
5798   chrom_fo_idx = 0xffffffffU;
5799   marker_uidx = next_unset_unsafe(marker_exclude, 0);
5800   if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
5801     goto model_assoc_set_test_ret_READ_FAIL;
5802   }
5803   marker_idx = 0;
5804   marker_idx2 = 0;
5805   chrom_end = 0;
5806   do {
5807     if (marker_uidx >= chrom_end) {
5808       if (model_assoc) {
5809 	// exploit overflow
5810 	chrom_fo_idx++;
5811 	refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &uii, &min_ploidy_1);
5812 	min_ploidy_1 |= uii; // treat MT as haploid
5813 	g_min_ploidy_1 = min_ploidy_1;
5814 	uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
5815 	g_is_y = is_y;
5816       } else {
5817 	// no need to skip MT/haploid here, since we error out on that case
5818 	// earlier
5819 	do {
5820 	  chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
5821 	} while (marker_uidx >= chrom_end);
5822 	uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
5823 	is_x = (uii == (uint32_t)x_code);
5824       }
5825       g_is_x = is_x;
5826     }
5827     block_size = g_block_start;
5828     block_end = marker_unstopped_ct - marker_idx;
5829     if (block_end > MODEL_BLOCKSIZE) {
5830       block_end = MODEL_BLOCKSIZE;
5831     }
5832     do {
5833       if (!IS_SET(unstopped_markers, marker_idx2)) {
5834 	do {
5835 	  marker_uidx++;
5836 	  next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
5837 	  marker_idx2++;
5838 	} while ((marker_uidx < chrom_end) && (!IS_SET(unstopped_markers, marker_idx2)));
5839 	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
5840 	  goto model_assoc_set_test_ret_READ_FAIL;
5841 	}
5842 	if (marker_uidx >= chrom_end) {
5843 	  break;
5844 	}
5845       }
5846       loadbuf_ptr = &(loadbuf[block_size * pheno_nm_ctv2]);
5847       if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
5848 	goto model_assoc_set_test_ret_READ_FAIL;
5849       }
5850       g_adapt_m_table[block_size] = marker_idx2++;
5851       if (is_x && (!model_assoc)) {
5852 	force_missing((unsigned char*)(&(loadbuf[block_size * pheno_nm_ctv2])), sample_male_include2, pheno_nm_ct);
5853       }
5854       block_size++;
5855       if (marker_idx + block_size == marker_unstopped_ct) {
5856 	break;
5857       }
5858       marker_uidx++;
5859       if (IS_SET(marker_exclude, marker_uidx)) {
5860 	marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
5861 	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
5862 	  goto model_assoc_set_test_ret_READ_FAIL;
5863 	}
5864       }
5865     } while ((block_size < block_end) && (marker_uidx < chrom_end));
5866     if (!block_size) {
5867       continue;
5868     }
5869     g_block_diff = block_size;
5870     assoc_thread_ct = g_block_diff;
5871     if (assoc_thread_ct > max_thread_ct) {
5872       assoc_thread_ct = max_thread_ct;
5873     }
5874     is_last_block = (marker_idx + block_size == marker_unstopped_ct);
5875     ulii = 0;
5876     if (model_assoc) {
5877       if (spawn_threads2(threads, &assoc_set_thread, max_thread_ct, is_last_block)) {
5878 	goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
5879       }
5880       assoc_set_thread((void*)ulii);
5881     } else if (model_modifier & (MODEL_PDOM | MODEL_PREC)) {
5882       if (spawn_threads2(threads, &model_set_domrec_thread, max_thread_ct, is_last_block)) {
5883 	goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
5884       }
5885       model_set_domrec_thread((void*)ulii);
5886     } else if (model_modifier & MODEL_PTREND) {
5887       if (spawn_threads2(threads, &model_set_trend_thread, max_thread_ct, is_last_block)) {
5888 	goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
5889       }
5890       model_set_trend_thread((void*)ulii);
5891     } else {
5892       if (spawn_threads2(threads, &model_set_best_thread, max_thread_ct, is_last_block)) {
5893 	goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
5894       }
5895       model_set_best_thread((void*)ulii);
5896     }
5897     join_threads2(threads, max_thread_ct, is_last_block);
5898     for (pidx = 0; pidx < perm_vec_ct; pidx++) {
5899       // transpose marker-major g_mperm_save_all into permutation-major chisq_pmajor
5900       read_dptr = &(g_mperm_save_all[pidx]);
5901       write_dptr = &(chisq_pmajor[pidx * marker_ct]);
5902       for (marker_bidx = 0; marker_bidx < block_size; marker_bidx++) {
5903 	write_dptr[g_adapt_m_table[marker_bidx]] = read_dptr[marker_bidx * perm_vec_ct];
5904       }
5905     }
5906     marker_idx += block_size;
5907   } while (marker_idx < marker_unstopped_ct);
5908   compute_set_scores(marker_ct, perm_vec_ct, set_ct, chisq_pmajor, orig_set_scores, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr, setdefs, ld_map, apip, chisq_threshold, adaptive_ci_zt, first_adapt_check, perms_done, sip->set_max, perm_adapt_set_unstopped, perm_2success_ct, perm_attempt_ct);
5909   bigstack_reset(bigstack_mark2);
5910   if (perms_done < perms_total) {
5911     if (model_modifier & MODEL_PERM) {
5912       if (!extract_set_union(setdefs, set_ct, perm_adapt_set_unstopped, unstopped_markers, marker_ct)) {
5913 	perms_done = 0;
5914 	for (set_idx = 0; set_idx < set_ct; set_idx++) {
5915 	  if (perms_done < perm_attempt_ct[set_idx]) {
5916 	    perms_done = perm_attempt_ct[set_idx];
5917 	  }
5918 	}
5919 	goto model_assoc_set_test_perms_done;
5920       }
5921       // bugfix (7 Aug 2018): forgot to update marker_unstopped_ct
5922       marker_unstopped_ct = popcount_longs(unstopped_markers, (marker_ct + BITCT - 1) / BITCT);
5923     }
5924     printf("\r%u permutation%s complete.", perms_done, (perms_done != 1)? "s" : "");
5925     fflush(stdout);
5926     goto model_assoc_set_test_more_perms;
5927   }
5928  model_assoc_set_test_perms_done:
5929   putc_unlocked('\r', stdout);
5930   LOGPRINTF("%u permutation%s complete.\n", perms_done, (perms_done != 1)? "s" : "");
5931  model_assoc_set_test_write:
5932   if (model_modifier & MODEL_PERM) {
5933     memcpy(outname_end2, ".set.perm", 10);
5934   } else {
5935     memcpy(outname_end2, ".set.mperm", 11);
5936   }
5937   retval = write_set_test_results(outname, &(outname_end2[4]), sip, ld_map, setdefs, set_incl, set_ct, marker_ct_orig, marker_ct, marker_idx_to_uidx, marker_ids, max_marker_id_len, perm_2success_ct, perm_attempt_ct, mtest_adjust, perm_count, pfilter, output_min_p, chisq_threshold, orig_chisq, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr);
5938   while (0) {
5939   model_assoc_set_test_ret_NOMEM:
5940     retval = RET_NOMEM;
5941     break;
5942   model_assoc_set_test_ret_READ_FAIL:
5943     retval = RET_READ_FAIL;
5944     break;
5945   model_assoc_set_test_ret_THREAD_CREATE_FAIL:
5946     retval = RET_THREAD_CREATE_FAIL;
5947     break;
5948   }
5949  model_assoc_set_test_ret_1:
5950   bigstack_reset(bigstack_mark);
5951   return retval;
5952 }
5953 
5954 void get_model_assoc_precomp_bounds(uint32_t missing_ct, uint32_t is_model, uint32_t* minp, uint32_t* ctp) {
5955   // Estimate which case missing counts are most common.
5956   // Expected value = (g_perm_case_ct * missing_ct / g_perm_pheno_nm_ct)
5957   // If X-chromosome and (!is_model):
5958   //   Lower bound = max(0, missing_ct - 2 * (g_perm_pheno_nm_ct -
5959   //                 g_perm_case_ct))
5960   //   Upper bound = min(g_perm_case_ct * 2, missing_ct)
5961   //   (Could be a bit more precise if we tracked missing male and female
5962   //    counts separately, but whatever)
5963   //   Each male automatically contributes 1 to initial missing_ct!
5964   // Otherwise:
5965   //   Lower bound = max(0, missing_ct - (g_perm_pheno_nm_ct - g_perm_case_ct))
5966   //   Upper bound = min(g_perm_case_ct, missing_ct)
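  // Example (autosomal): with g_perm_case_ct = 500, g_perm_pheno_nm_ct = 1000,
  // and missing_ct = 10, the expected case missing count is 5, and the window
  // is clipped to [max(0, 10 - 500), min(500, 10)] = [0, 10].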
5967   double xval = ((double)(g_perm_case_ct * ((int64_t)missing_ct))) / ((double)((intptr_t)g_perm_pheno_nm_ct));
5968   intptr_t lbound = (intptr_t)(xval + EPSILON + 1 - ((double)((intptr_t)g_precomp_width)) * 0.5);
5969   intptr_t ctrl_ct = g_perm_pheno_nm_ct - g_perm_case_ct;
5970   intptr_t ubound = missing_ct;
5971   intptr_t lii;
5972   if (lbound < 0) {
5973     lbound = 0;
5974   }
5975   if (g_is_x && (!is_model)) {
5976     lii = missing_ct - (2 * ctrl_ct);
5977     if (((uintptr_t)ubound) > g_perm_case_ct * 2) {
5978       ubound = g_perm_case_ct * 2;
5979     }
5980   } else {
5981     lii = missing_ct - ctrl_ct;
5982     if (((uintptr_t)ubound) > g_perm_case_ct) {
5983       ubound = g_perm_case_ct;
5984     }
5985   }
5986   if (lii > lbound) {
5987     lbound = lii;
5988   }
5989   *minp = lbound;
5990   if ((intptr_t)(lbound + g_precomp_width) > ubound) {
5991     *ctp = ubound + 1 - lbound;
5992   } else {
5993     *ctp = g_precomp_width;
5994   }
5995 }
5996 
5997 int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t model_modifier, uint32_t model_cell_ct, uint32_t model_mperm_val, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, Aperm_info* apip, uint32_t mperm_save, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* founder_info, uintptr_t* sex_male, uint32_t hh_exists, uint32_t ld_ignore_x, uint32_t perm_batch_size, Set_info* sip) {
5998   unsigned char* bigstack_mark = g_bigstack_base;
5999   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
6000   uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
6001   uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
6002   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
6003   int32_t retval = 0;
6004   FILE* outfile = nullptr;
6005   FILE* outfile_msa = nullptr;
6006   uintptr_t* marker_exclude = marker_exclude_orig;
6007   uintptr_t* haploid_mask = chrom_info_ptr->haploid_mask;
6008   uintptr_t marker_ct = marker_ct_orig;
6009   uintptr_t perm_vec_ct = 0;
6010   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
6011   uint32_t model_assoc = model_modifier & MODEL_ASSOC;
6012   uint32_t model_perms = model_modifier & (MODEL_PERM | MODEL_MPERM);
6013   uint32_t is_set_test = model_modifier & MODEL_SET_TEST;
6014   uint32_t model_adapt_nst = (model_modifier & MODEL_PERM) && (!is_set_test);
6015   uint32_t model_maxt_nst = (model_modifier & MODEL_MPERM) && (!is_set_test);
6016   uint32_t model_perms_nst = model_perms && (!is_set_test);
6017   uint32_t model_trendonly = model_modifier & MODEL_TRENDONLY;
6018   uint32_t model_perm_best = !(model_modifier & MODEL_PMASK);
6019   uint32_t model_perm_count = model_modifier & MODEL_PERM_COUNT;
6020   uint32_t assoc_counts = model_modifier & MODEL_ASSOC_COUNTS;
6021   uint32_t display_ci = (ci_size > 0);
6022   uint32_t perms_total = 0;
6023   uint32_t male_ct = 0;
6024   uint32_t nonmale_ct = 0;
6025   uint32_t ctrl_male_ct = 0;
6026   uint32_t case_male_ct = 0;
6027   uint32_t ctrl_nonmale_ct = 0;
6028   uint32_t case_nonmale_ct = 0;
6029   uint32_t load_ctrl_ct = 0;
6030   uint32_t load_case_ct = 0;
6031   uint32_t precomp_width = 0;
6032   uint32_t is_y = 0;
6033   int32_t x_code = chrom_info_ptr->xymt_codes[X_OFFSET];
6034   int32_t y_code = chrom_info_ptr->xymt_codes[Y_OFFSET];
6035   int32_t mt_code = chrom_info_ptr->xymt_codes[MT_OFFSET];
6036   uintptr_t* sample_nonmale_ctrl_include2 = nullptr;
6037   uintptr_t* sample_nonmale_case_include2 = nullptr;
6038   uintptr_t* sample_male_ctrl_include2 = nullptr;
6039   uintptr_t* sample_male_case_include2 = nullptr;
6040   uintptr_t* sample_male_include2 = nullptr;
6041   uintptr_t* cur_ctrl_include2 = nullptr;
6042   uintptr_t* cur_case_include2 = nullptr;
6043   uintptr_t* is_invalid_bitfield = nullptr;
6044   uintptr_t* founder_pnm = nullptr;
6045   uint32_t* perm_2success_ct = nullptr;
6046   uint32_t* perm_attempt_ct = nullptr;
6047   uint32_t* set_cts = nullptr;
6048   uint32_t* het_cts = nullptr;
6049   uint32_t* homcom_cts = nullptr;
6050   uint32_t* precomp_ui = nullptr;
6051   double* orig_chisq = nullptr;
6052   double* maxt_extreme_stat = nullptr;
6053   double* orig_odds = nullptr;
6054   double* precomp_d = nullptr;
6055   unsigned char* perm_adapt_stop = nullptr;
6056   double dxx = 0.0;
6057   double dww = 0.0;
6058   double dvv = 0.0;
6059   double mult_p = 0.0;
6060   double gen_p = 0.0;
6061   double dom_p = 0.0;
6062   double rec_p = 0.0;
6063   double ca_chisq = 0.0;
6064   double maxt_cur_extreme_stat = 0;
6065   uint32_t pct = 0;
6066   uint32_t max_thread_ct = g_thread_ct;
6067   uint32_t perm_pass_idx = 0;
6068   uintptr_t perm_vec_ctcl4m = 0;
6069   uint32_t model_fisher = model_modifier & MODEL_FISHER;
6070   uint32_t model_fisherx = model_fisher && (!(model_modifier & MODEL_PTREND));
6071   uint32_t fisher_midp = model_modifier & MODEL_FISHER_MIDP;
6072   char* writebuf = g_textbuf;
6073   char* chrom_name_ptr = nullptr;
6074   uint32_t chrom_name_len = 0;
6075   char chrom_name_buf[3 + MAX_CHROM_TEXTNUM_SLEN];
6076   uint32_t mu_table[MODEL_BLOCKSIZE];
6077   uint32_t uibuf[4];
6078   char wbuf[48];
6079   char* wptr_start;
6080   char* wptr;
6081   char* wptr2;
6082   char* wptr_mid;
6083   char* wptr_mid2;
6084   char* outname_end2;
6085   uint32_t assoc_thread_ct;
6086   uint32_t fill_orig_chisq;
6087   uint32_t marker_unstopped_ct;
6088   uint32_t gender_req;
6089   uint32_t case_ct;
6090   uint32_t ctrl_ct;
6091   uint32_t chrom_fo_idx;
6092   uint32_t chrom_end;
6093   uint32_t marker_bidx;
6094   uint32_t block_size;
6095   uint32_t block_end;
6096   uint32_t perms_done;
6097   uintptr_t marker_uidx; // loading
6098   uintptr_t marker_uidx2; // writing
6099   uintptr_t marker_idx;
6100   uintptr_t marker_idx2;
6101   uint32_t* marker_idx_to_uidx;
6102   uint32_t* missp;
6103   uint32_t* setp;
6104   uint32_t* hetp;
6105   uint32_t* missing_cts;
6106   double* orig_pvals;
6107   double* orig_pvals_ptr;
6108   double* ooptr;
6109   uintptr_t* loadbuf_raw;
6110   uintptr_t* loadbuf;
6111   uintptr_t* loadbuf_ptr;
6112   uintptr_t* sample_ctrl_include2;
6113   uintptr_t* sample_case_include2;
6114   uint32_t load_sample_ct;
6115   uintptr_t ulii;
6116   uint32_t min_ploidy_1;
6117   uint32_t is_x;
6118   uint32_t is_last_block;
6119   uint32_t uii;
6120   uint32_t ujj;
6121   uint32_t ukk;
6122   uint32_t umm;
6123   uint32_t unn;
6124   uint32_t uoo;
6125   uint32_t upp;
6126   uint32_t uqq;
6127   uint32_t urr;
6128   uint32_t uss;
6129   uint32_t is_invalid;
6130   uint32_t marker_ctl;
6131   double pval;
6132   double dyy;
6133   double dzz;
6134   double da1;
6135   double da2;
6136   double du1;
6137   double du2;
6138   double ca_p;
6139   char* a1ptr;
6140   char* a2ptr;
6141   uint32_t loop_end;
6142   if (pheno_nm_ct < 2) {
6143     logerrprint("Warning: Skipping --assoc/--model since less than two phenotypes are present.\n");
6144     goto model_assoc_ret_1;
6145   }
6146   if (max_marker_allele_len > MAXLINELEN) {
6147     if (bigstack_alloc_c(2 * max_marker_allele_len + MAXLINELEN, &writebuf)) {
6148       goto model_assoc_ret_NOMEM;
6149     }
6150   }
6151   g_model_fisher = model_fisher;
6152   g_fisher_midp = fisher_midp;
6153   g_perm_pheno_nm_ct = pheno_nm_ct;
6154   perms_done = 0;
6155   g_is_model_prec = model_modifier / MODEL_PREC;
6156   g_perm_is_1bit = 0;
6157   g_mperm_save_all = nullptr;
6158   g_sample_male_include2 = nullptr;
6159   if (is_set_test) {
6160     if (bigstack_alloc_ul(unfiltered_sample_ctl, &founder_pnm)) {
6161       goto model_assoc_ret_NOMEM;
6162     }
6163     memcpy(founder_pnm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
6164     bitvec_and(founder_info, unfiltered_sample_ctl, founder_pnm);
6165     if (extract_set_union_unfiltered(sip, nullptr, unfiltered_marker_ct, marker_exclude_orig, &marker_exclude, &marker_ct)) {
6166       goto model_assoc_ret_NOMEM;
6167     }
6168   }
6169   if (model_maxt_nst) {
6170     perms_total = model_mperm_val;
6171     if (bigstack_alloc_d(perms_total, &maxt_extreme_stat)) {
6172       goto model_assoc_ret_NOMEM;
6173     }
6174     g_maxt_extreme_stat = maxt_extreme_stat;
6175     if (model_fisherx) {
6176       for (uii = 0; uii < perms_total; uii++) {
6177 	maxt_extreme_stat[uii] = 1;
6178       }
6179     } else {
6180       fill_double_zero(perms_total, maxt_extreme_stat);
6181     }
6182     if (mperm_save & MPERM_DUMP_ALL) {
6183       memcpy(outname_end, ".mperm.dump.all", 16);
6184       if (fopen_checked(outname, "w", &outfile_msa)) {
6185 	goto model_assoc_ret_OPEN_FAIL;
6186       }
6187       LOGPRINTFWW("Dumping all permutation %svalues to %s .\n", model_fisherx? "p-" : "chi-square ", outname);
6188     }
6189   } else {
6190     mperm_save = 0;
6191     if (model_adapt_nst) {
6192       g_aperm_alpha = apip->alpha;
6193       perms_total = apip->max;
6194       if (apip->min < apip->init_interval) {
6195 	g_first_adapt_check = (int32_t)(apip->init_interval);
6196       } else {
6197 	g_first_adapt_check = apip->min;
6198       }
6199       g_adaptive_intercept = apip->init_interval;
6200       g_adaptive_slope = apip->interval_slope;
6201     }
6202   }
6203   if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw)) {
6204     goto model_assoc_ret_NOMEM;
6205   }
6206   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
6207   if (model_assoc) {
6208     if (model_fisher) {
6209       outname_end2 = memcpyb(outname_end, ".assoc.fisher", 14);
6210     } else {
6211       outname_end2 = memcpyb(outname_end, ".assoc", 7);
6212     }
6213     if (fopen_checked(outname, "w", &outfile)) {
6214       goto model_assoc_ret_OPEN_FAIL;
6215     }
6216     sprintf(g_logbuf, "Writing C/C --assoc report to %s ... ", outname);
6217     wordwrapb(25); // strlen("[generating permutations]")
6218     logprintb();
6219     fflush(stdout);
6220     sprintf(g_textbuf, " CHR %%%us         BP   A1 ", plink_maxsnp);
6221     fprintf(outfile, g_textbuf, "SNP");
6222     if (assoc_counts) {
6223       fputs("     C_A      C_U   A2 ", outfile);
6224     } else {
6225       fputs("     F_A      F_U   A2 ", outfile);
6226     }
6227     if (!model_fisher) {
6228       fputs("       CHISQ ", outfile);
6229     }
6230     if (fputs_checked("           P           OR ", outfile)) {
6231       goto model_assoc_ret_WRITE_FAIL;
6232     }
6233     if (display_ci) {
6234       uii = (uint32_t)((int32_t)(ci_size * (100 + EPSILON)));
6235       if (uii >= 10) {
6236 	fprintf(outfile, "          SE          L%u          U%u ", uii, uii);
6237       } else {
6238 	fprintf(outfile, "          SE           L%u           U%u ", uii, uii);
6239       }
6240     }
6241     if (putc_checked('\n', outfile)) {
6242       goto model_assoc_ret_WRITE_FAIL;
6243     }
6244   } else {
6245     if (is_set(chrom_info_ptr->haploid_mask, 0)) {
6246       logerrprint("Error: --model cannot be used on haploid genomes.\n");
6247       goto model_assoc_ret_INVALID_CMDLINE;
6248     }
6249     uii = count_non_autosomal_markers(chrom_info_ptr, marker_exclude, 0, 1);
6250     if (uii) {
6251       if (is_set_test) {
6252 	// given how the data structures are currently designed, and how easy
6253 	// the command-line fix is, this is not worth the trouble of supporting
6254 	// (this problem illustrates why core data structures should use
6255 	// unfiltered indexes when possible, though)
6256 	logerrprint("Error: --model set-test cannot be used with sets containing MT/haploid\nvariants.  (You can use e.g. '--not-chr y, mt' to exclude them.)\n");
6257 	goto model_assoc_ret_INVALID_CMDLINE;
6258       }
6259       LOGPRINTF("Excluding %u MT/haploid variant%s from --model analysis.\n", uii, (uii == 1)? "" : "s");
6260       marker_ct -= uii;
6261       if (!marker_ct) {
6262 	logerrprint("Error: No variants remaining for --model analysis.\n");
6263 	goto model_assoc_ret_INVALID_CMDLINE;
6264       }
6265     }
6266     outname_end2 = memcpyb(outname_end, ".model", 7);
6267     if (fopen_checked(outname, "w", &outfile)) {
6268       goto model_assoc_ret_OPEN_FAIL;
6269     }
6270     sprintf(g_logbuf, "Writing --model report to %s ... ", outname);
6271     wordwrapb(25);
6272     logprintb();
6273     fflush(stdout);
6274     if (model_perm_best && model_perms) {
6275       outname_end2 = memcpyb(outname_end2, ".best", 6);
6276     } else if ((model_modifier & MODEL_PGEN) && model_perms) {
6277       outname_end2 = memcpyb(outname_end2, ".gen", 5);
6278     } else if (model_modifier & MODEL_PDOM) {
6279       outname_end2 = memcpyb(outname_end2, ".dom", 5);
6280     } else if (model_modifier & MODEL_PREC) {
6281       outname_end2 = memcpyb(outname_end2, ".rec", 5);
6282     } else if (model_modifier & MODEL_PTREND) {
6283       outname_end2 = memcpyb(outname_end2, ".trend", 7);
6284     }
6285     sprintf(g_textbuf, " CHR %%%us   A1   A2     TEST            AFF          UNAFF ", plink_maxsnp);
6286     fprintf(outfile, g_textbuf, "SNP");
6287     if (!model_fisher) {
6288       fputs("       CHISQ   DF ", outfile);
6289     } else {
6290       outname_end2 = memcpyb(outname_end2, ".fisher", 8);
6291     }
6292     if (fputs_checked("           P\n", outfile)) {
6293       goto model_assoc_ret_WRITE_FAIL;
6294     }
6295   }
6296   marker_ctl = BITCT_TO_WORDCT(marker_ct);
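  // two-sided normal quantile at beta, Bonferroni-adjusted across the
  // marker_ct tests; used by the adaptive permutation early-stopping check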
6297   g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
6298   if (bigstack_alloc_ul(MODEL_BLOCKSIZE * pheno_nm_ctv2, &loadbuf) ||
6299       bigstack_alloc_d(marker_ct, &orig_pvals) ||
6300       bigstack_alloc_ui(marker_ct, &missing_cts)) {
6301     goto model_assoc_ret_NOMEM;
6302   }
6303   g_loadbuf = loadbuf;
6304   g_orig_pvals = orig_pvals;
6305   g_missing_cts = missing_cts;
6306   if (model_assoc) {
6307     if (bigstack_alloc_d(marker_ct, &orig_odds) ||
6308         bigstack_alloc_ui(marker_ct, &set_cts)) {
6309       goto model_assoc_ret_NOMEM;
6310     }
6311     g_set_cts = set_cts;
6312   }
6313   if ((!model_assoc) || model_maxt_nst) {
6314     if (bigstack_alloc_ui(marker_ct, &het_cts) ||
6315         bigstack_alloc_ui(marker_ct, &homcom_cts)) {
6316       goto model_assoc_ret_NOMEM;
6317     }
6318     g_het_cts = het_cts;
6319     g_homcom_cts = homcom_cts;
6320   }
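  // sex info is needed whenever chrX is being tested, or chrY under --assoc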
6321   gender_req = ((x_code != -2) && is_set(chrom_info_ptr->chrom_mask, x_code)) || (model_assoc && (((y_code != -2) && is_set(chrom_info_ptr->chrom_mask, y_code))));
6322   if (gender_req) {
6323     if (bigstack_alloc_ul(pheno_nm_ctv2, &g_sample_nonmale_include2) ||
6324 	bigstack_alloc_ul(pheno_nm_ctv2, &sample_male_include2)) {
6325       goto model_assoc_ret_NOMEM;
6326     }
6327     g_sample_male_include2 = sample_male_include2;
6328     quaterarr_collapse_init(sex_male, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_male_include2);
6329     male_ct = popcount01_longs(sample_male_include2, pheno_nm_ctv2);
6330     quatervec_01_init_invert(sample_male_include2, pheno_nm_ct, g_sample_nonmale_include2);
6331     nonmale_ct = pheno_nm_ct - male_ct;
6332   }
6333   // Set test does not support Fisher stats, so currently guaranteed to be
6334   // true there.  Will need to modify this expression if we ever support
6335   // generation of synthetic chi-square stats from Fisher p-values.
6336   fill_orig_chisq = (!model_fisherx) || (mtest_adjust && (!model_fisher));
6337   if (fill_orig_chisq) {
6338     if (bigstack_calloc_d(marker_ct, &orig_chisq)) {
6339       goto model_assoc_ret_NOMEM;
6340     }
6341   }
6342   g_orig_chisq = orig_chisq;
6343 
6344   if (model_perms) {
6345     if (cluster_starts) {
6346       retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, &g_perm_cluster_case_cts, &g_perm_cluster_cc_preimage);
6347       if (retval) {
6348 	goto model_assoc_ret_1;
6349       }
6350       if (!g_perm_cluster_ct) {
6351         logerrprint("Error: No size 2+ clusters for permutation test.\n");
6352 	goto model_assoc_ret_INVALID_CMDLINE;
6353       }
6354       retval = cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs);
6355       if (retval) {
6356         goto model_assoc_ret_1;
6357       }
6358     } else {
6359       g_perm_cluster_starts = nullptr;
6360     }
6361     if (!is_set_test) {
6362       if (max_thread_ct > perms_total) {
6363 	max_thread_ct = perms_total;
6364       }
6365       if (bigstack_init_sfmtp(max_thread_ct)) {
6366 	goto model_assoc_ret_NOMEM;
6367       }
6368     }
6369     if (model_perm_best) {
6370       if (bigstack_calloc_ul(marker_ctl, &is_invalid_bitfield)) {
6371 	goto model_assoc_ret_NOMEM;
6372       }
6373       g_is_invalid_bitfield = is_invalid_bitfield;
6374     }
6375 
6376     if (!is_set_test) {
6377       g_ldrefs = (uint16_t*)bigstack_alloc(marker_ct * sizeof(int16_t));
6378       if (!g_ldrefs) {
6379 	goto model_assoc_ret_NOMEM;
6380       }
6381 #ifdef __LP64__
6382       fill_ulong_one((marker_ct + 3) / 4, (uintptr_t*)g_ldrefs);
6383 #else
6384       fill_ulong_one((marker_ct + 1) / 2, (uintptr_t*)g_ldrefs);
6385 #endif
6386       if (!(mperm_save & MPERM_DUMP_ALL)) {
6387 	// 5.65686 = roughly 4 * sqrt(2), corresponding to 4 stdevs.  this is
6388 	// a somewhat arbitrary choice.
6389 	// currently just need this to never exceed (2^32 - 1) / (12 * 1024),
6390 	// to avoid uint32_t overflow.
6391 	precomp_width = (1 + (int32_t)(sqrt(pheno_nm_ct) * EXPECTED_MISSING_FREQ * 5.65686));
6392       } else {
6393 	precomp_width = 0;
6394       }
6395       g_precomp_width = precomp_width;
6396       if (bigstack_calloc_ui(marker_ct, &perm_2success_ct)) {
6397 	goto model_assoc_ret_NOMEM;
6398       }
6399       if (model_maxt_nst) {
6400 	if (model_fisherx) {
6401 	  if (model_assoc || (model_modifier & (MODEL_PDOM | MODEL_PREC))) {
6402 	    if (bigstack_alloc_ui(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_ui) ||
6403 		bigstack_alloc_d(precomp_width * 2 * MODEL_BLOCKSIZE, &precomp_d)) {
6404 	      goto model_assoc_ret_NOMEM;
6405 	    }
6406 	  } else if (model_perm_best) {
6407 	    if (bigstack_alloc_ui(precomp_width * 18 * MODEL_BLOCKSIZE, &precomp_ui) ||
6408 		bigstack_alloc_d(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_d)) {
6409 	      goto model_assoc_ret_NOMEM;
6410 	    }
6411 	  }
6412 	} else if (model_assoc || (model_modifier & (MODEL_PDOM | MODEL_PREC | MODEL_PTREND))) {
6413 	  if (bigstack_alloc_ui(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_ui) ||
6414 	      bigstack_alloc_d(precomp_width * 2 * MODEL_BLOCKSIZE, &precomp_d)) {
6415 	    goto model_assoc_ret_NOMEM;
6416 	  }
6417 	} else if (model_perm_best) {
6418 	  if (bigstack_alloc_ui(precomp_width * 18 * MODEL_BLOCKSIZE, &precomp_ui) ||
6419 	      bigstack_alloc_d(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_d)) {
6420 	    goto model_assoc_ret_NOMEM;
6421 	  }
6422 	}
6423       } else if (model_assoc || (model_modifier & (MODEL_PDOM | MODEL_PREC | MODEL_PTREND))) {
6424 	if (bigstack_alloc_ui(precomp_width * 4 * MODEL_BLOCKSIZE, &precomp_ui)) {
6425 	  goto model_assoc_ret_NOMEM;
6426 	}
6427       } else if (model_perm_best) {
6428 	if (bigstack_alloc_ui(precomp_width * 12 * MODEL_BLOCKSIZE, &precomp_ui)) {
6429 	  goto model_assoc_ret_NOMEM;
6430 	}
6431       }
6432       g_perm_2success_ct = perm_2success_ct;
6433       if (model_adapt_nst) {
6434 	if (bigstack_alloc_ui(marker_ct, &perm_attempt_ct) ||
6435 
6436 	    // we need to zero out trailing bytes of the last word
6437 	    bigstack_calloc_uc(round_up_pow2(marker_ct, BYTECT), &perm_adapt_stop)) {
6438 	  goto model_assoc_ret_NOMEM;
6439 	}
6440 	g_perm_attempt_ct = perm_attempt_ct;
6441 	g_perm_adapt_stop = perm_adapt_stop;
6442 	ujj = apip->max;
6443 	for (uii = 0; uii < marker_ct; uii++) {
6444 	  perm_attempt_ct[uii] = ujj;
6445 	}
6446       }
6447     }
6448     if (!cluster_starts) {
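      // precompute multiply/shift ("magic number") constants so the
      // permutation generator can avoid hardware division by pheno_nm_ct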
6449       g_perm_tot_quotient = 0x100000000LLU / pheno_nm_ct;
6450       magic_num(g_perm_tot_quotient, &g_perm_totq_magic, &g_perm_totq_preshift, &g_perm_totq_postshift, &g_perm_totq_incr);
6451     }
6452   }
6453   g_precomp_ui = precomp_ui;
6454   g_precomp_d = precomp_d;
6455   if (bigstack_alloc_ul(pheno_nm_ctv2, &sample_ctrl_include2) ||
6456       bigstack_alloc_ul(pheno_nm_ctv2, &sample_case_include2)) {
6457     goto model_assoc_ret_NOMEM;
6458   }
6459   quaterarr_collapse_init(pheno_c, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_case_include2);
6460   case_ct = popcount01_longs(sample_case_include2, pheno_nm_ctv2);
6461   g_perm_case_ct = case_ct;
6462   quatervec_01_init_invert(sample_case_include2, pheno_nm_ct, sample_ctrl_include2);
6463   ctrl_ct = pheno_nm_ct - case_ct;
6464   if (gender_req) {
6465     // todo: get rid of these and just use the functions called by the
6466     // permutation tests
6467     if (bigstack_alloc_ul(pheno_nm_ctv2, &sample_nonmale_ctrl_include2) ||
6468 	bigstack_alloc_ul(pheno_nm_ctv2, &sample_nonmale_case_include2) ||
6469 	bigstack_alloc_ul(pheno_nm_ctv2, &sample_male_ctrl_include2) ||
6470 	bigstack_alloc_ul(pheno_nm_ctv2, &sample_male_case_include2)) {
6471       goto model_assoc_ret_NOMEM;
6472     }
6473     quaterarr_collapse_init(sex_male, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_male_case_include2);
6474     bitvec_and(sample_case_include2, pheno_nm_ctv2, sample_male_case_include2);
6475     case_male_ct = popcount01_longs(sample_male_case_include2, pheno_nm_ctv2);
6476     bitvec_andnot_copy(sample_male_include2, sample_male_case_include2, pheno_nm_ctv2, sample_male_ctrl_include2);
6477     bitvec_andnot_copy(sample_case_include2, sample_male_case_include2, pheno_nm_ctv2, sample_nonmale_case_include2);
6478     bitvec_andnot_copy(sample_ctrl_include2, sample_male_ctrl_include2, pheno_nm_ctv2, sample_nonmale_ctrl_include2);
6479     ctrl_male_ct = male_ct - case_male_ct;
6480     case_nonmale_ct = case_ct - case_male_ct;
6481     ctrl_nonmale_ct = ctrl_ct - ctrl_male_ct;
6482   }
6483 
6484   for (uii = 1; uii <= MODEL_BLOCKSIZE; uii++) {
6485     loadbuf[uii * pheno_nm_ctv2 - 2] = 0;
6486     loadbuf[uii * pheno_nm_ctv2 - 1] = 0;
6487   }
6488   if (model_perms) {
6489     if (bigstack_left() < pheno_nm_ctv2 * sizeof(intptr_t)) {
6490       goto model_assoc_ret_NOMEM;
6491     }
6492   }
6493   marker_unstopped_ct = marker_ct;
6494 
6495   // ----- begin main loop -----
6496  model_assoc_more_perms:
6497   if (model_perms_nst) {
6498     if (!perm_pass_idx) {
6499       fputs("[generating permutations]", stdout);
6500       fflush(stdout);
6501     }
6502     if (model_adapt_nst) {
6503       if (perm_pass_idx) {
6504 	uii = g_first_adapt_check;
6505 	ujj = apip->init_interval;
6506 	while (uii <= perms_done) {
6507 	  // APERM_MAX prevents infinite loop here
6508 	  uii += (int32_t)(ujj + ((int32_t)uii) * apip->interval_slope);
6509 	}
6510 	g_first_adapt_check = uii;
6511       }
6512       perm_vec_ct = bigstack_left() / (pheno_nm_ctv2 * sizeof(intptr_t));
6513     } else {
6514       // perm_vec_ct memory allocation dependencies:
6515       //   g_maxt_thread_results: (8 * perm_vec_ct, cacheline-aligned) *
6516       //     max_thread_ct
6517       //   g_perm_vecst: 16 * ((perm_vec_ct + 127) / 128) * pheno_nm_ct
6518       //   g_thread_git_wkspace: ((perm_vec_ct + 127) / 128) * 1152 * thread_ct
6519       //   g_resultbuf: MODEL_BLOCKSIZE * (4 * perm_vec_ct, CL-aligned) * 3
6520       //   g_perm_vecs: pheno_nm_ctv2 * sizeof(intptr_t) * perm_vec_ct
6521       //   g_mperm_save_all (if needed): marker_ct * 8 * perm_vec_ct
6522       // If we force perm_vec_ct to be a multiple of 128, then we have
6523       //   perm_vec_ct * (17 * max_thread_ct + 12 * MODEL_BLOCKSIZE +
6524       //                    pheno_nm_ct / 8 + sizeof(intptr_t) * pheno_nm_ctv2
6525       //                    [+ marker_ct * sizeof(double) * mperm_save_all])
6526       //
6527       // Each max(T) thread has six buffers to support rapid execution of the
6528       // genotype indexing and LD exploiter algorithms:
6529       //   six with 4-bit accumulators, each has size perm_vec_ct / 2 bytes
6530       //   six with 8-bit accumulators, each has size perm_vec_ct bytes
6531       // The initial 6 multiplier is to allow heterozygote, homozygote minor,
6532       // and missing genotype increments and decrements to be counted
6533       // simultaneously.
6534       // Adding all this up, we have 9 * perm_vec_ct bytes, and multiplying
6535       // by 128 yields 1152.  The other thread_ct dependence contributes
6536       // 8 * perm_vec_ct bytes, multiplying by 128 yields 1024, and
6537       // 1152 + 1024 = 2176.
6538       if (mperm_save & MPERM_DUMP_ALL) {
6539         perm_vec_ct = 128 * (bigstack_left() / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 2176LL * max_thread_ct + 1536LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct + 128LL * sizeof(double) * marker_ct));
6540       } else {
6541         perm_vec_ct = 128 * (bigstack_left() / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 2176LL * max_thread_ct + 1536LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct));
6542       }
6543     }
6544     if (perm_vec_ct > perms_total - perms_done) {
6545       perm_vec_ct = perms_total - perms_done;
6546     } else if (!perm_vec_ct) {
6547       goto model_assoc_ret_NOMEM;
6548     }
6549     perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
6550     perms_done += perm_vec_ct;
6551     g_perms_done = perms_done;
6552     g_perm_vec_ct = perm_vec_ct;
6553     bigstack_alloc_ul(perm_vec_ct * pheno_nm_ctv2, &g_perm_vecs);
6554     g_perm_generation_thread_ct = MINV(max_thread_ct, perm_vec_ct);
6555     ulii = 0;
6556     if (!cluster_starts) {
6557       if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
6558 	goto model_assoc_ret_THREAD_CREATE_FAIL;
6559       }
6560       generate_cc_perms_thread((void*)ulii);
6561     } else {
6562       if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
6563 	goto model_assoc_ret_THREAD_CREATE_FAIL;
6564       }
6565       generate_cc_cluster_perms_thread((void*)ulii);
6566     }
6567     join_threads(threads, g_perm_generation_thread_ct);
6568     g_assoc_thread_ct = max_thread_ct;
6569     if (!model_adapt_nst) {
6570       bigstack_alloc_d(max_thread_ct * round_up_pow2(perm_vec_ct, CACHELINE_DBL), &g_maxt_thread_results);
6571       bigstack_alloc_ui(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE, &g_resultbuf);
6572 #ifdef __LP64__
6573       ulii = ((perm_vec_ct + 127) / 128) * 4;
6574       bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
6575 #else
6576       ulii = (perm_vec_ct + 31) / 32;
6577       bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
6578       ulii = ((perm_vec_ct + 63) / 64) * 2;
6579 #endif
6580       bigstack_calloc_ui(ulii * 72 * max_thread_ct, &g_thread_git_wkspace);
6581       transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
6582       if (mperm_save & MPERM_DUMP_ALL) {
6583 	bigstack_alloc_d(marker_ct * perm_vec_ct, &g_mperm_save_all);
6584       }
6585     }
6586     if (!perm_pass_idx) {
6587       fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b                         \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", stdout);
6588     }
6589   }
6590   if (!perm_pass_idx) {
6591     fputs("0%", stdout);
6592     fflush(stdout);
6593   }
6594   chrom_fo_idx = 0xffffffffU;
6595   marker_uidx = next_unset_unsafe(marker_exclude, 0);
6596   if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
6597     goto model_assoc_ret_READ_FAIL;
6598   }
6599   marker_idx = 0;
6600   marker_idx2 = 0;
6601   chrom_end = 0;
6602   loop_end = marker_ct / 100;
6603   do {
6604     if (marker_uidx >= chrom_end) {
6605       g_block_start = 0;
6606       if (model_assoc) {
6607 	// exploit overflow
6608 	chrom_fo_idx++;
6609 	refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &uii, &min_ploidy_1);
6610 	min_ploidy_1 |= uii; // treat MT as haploid
6611 	uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
6612 	if (min_ploidy_1 && (!is_x)) {
6613 	  if (is_y) {
6614 	    cur_ctrl_include2 = sample_male_ctrl_include2;
6615 	    cur_case_include2 = sample_male_case_include2;
6616 	    load_sample_ct = male_ct;
6617 	    load_case_ct = case_male_ct;
6618 	  } else {
6619 	    cur_ctrl_include2 = sample_ctrl_include2;
6620 	    cur_case_include2 = sample_case_include2;
6621 	    load_sample_ct = pheno_nm_ct;
6622 	    load_case_ct = case_ct;
6623 	  }
6624 	  load_ctrl_ct = load_sample_ct - load_case_ct;
6625 	}
6626 	g_min_ploidy_1 = min_ploidy_1;
6627 	g_is_y = is_y;
6628       } else {
6629 	while (1) {
6630 	  do {
6631 	    chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
6632 	  } while (marker_uidx >= chrom_end);
6633 	  uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
6634 	  is_x = (uii == (uint32_t)x_code);
6635 	  if (((!IS_SET(haploid_mask, uii)) && (uii != (uint32_t)mt_code)) || is_x) {
6636 	    break;
6637 	  }
6638 	  marker_uidx = next_unset_unsafe(marker_exclude, chrom_end);
6639 	}
6640 	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
6641 	  goto model_assoc_ret_READ_FAIL;
6642 	}
6643       }
6644       g_is_x = is_x;
6645       chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, uii, &chrom_name_len, chrom_name_buf);
6646     } else if (model_maxt_nst) {
6647       marker_idx -= MODEL_BLOCKKEEP;
6648       if (marker_idx) { // max(T) initial block special case, see below
6649         memcpy(loadbuf, &(loadbuf[(MODEL_BLOCKSIZE - MODEL_BLOCKKEEP) * pheno_nm_ctv2]), MODEL_BLOCKKEEP * pheno_nm_ctv2 * sizeof(intptr_t));
6650         memcpy(g_resultbuf, &(g_resultbuf[3 * (MODEL_BLOCKSIZE - MODEL_BLOCKKEEP) * perm_vec_ctcl4m]), MODEL_BLOCKKEEP * perm_vec_ctcl4m * 3 * sizeof(int32_t));
6651       }
6652       g_block_start = MODEL_BLOCKKEEP;
6653     } else {
6654       g_block_start = 0;
6655     }
6656     block_size = g_block_start;
6657     block_end = marker_unstopped_ct - marker_idx;
6658     if ((!marker_idx) && (!block_size) && model_maxt_nst) {
6659       // For max(T) permutation tests, minimize how long we have to work with
6660       // crappy precomputed values.  Most important when using Fisher exact
6661       // test p-values.
6662       if (block_end > MODEL_BLOCKKEEP) {
6663 	block_end = MODEL_BLOCKKEEP;
6664       }
6665     } else if (block_end > MODEL_BLOCKSIZE) {
6666       block_end = MODEL_BLOCKSIZE;
6667     }
6668     do {
6669       if (model_adapt_nst && perm_adapt_stop[marker_idx2]) {
6670 	do {
6671 	  marker_uidx++;
6672 	  next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
6673 	  marker_idx2++;
6674 	} while ((marker_uidx < chrom_end) && perm_adapt_stop[marker_idx2]);
6675 	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
6676 	  goto model_assoc_ret_READ_FAIL;
6677 	}
6678 	if (marker_uidx >= chrom_end) {
6679 	  break;
6680 	}
6681       }
6682       loadbuf_ptr = &(loadbuf[block_size * pheno_nm_ctv2]);
6683       if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
6684 	goto model_assoc_ret_READ_FAIL;
6685       }
6686       if (model_adapt_nst) {
6687 	g_adapt_m_table[block_size] = marker_idx2++;
6688       }
6689       if (is_x && (!model_assoc)) {
6690 	force_missing((unsigned char*)(&(loadbuf[block_size * pheno_nm_ctv2])), sample_male_include2, pheno_nm_ct);
6691       }
6692       // no need for usual haploid_fix since the popcount routines here
6693       // interpret het. haploids as missing anyway
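      // mu_table maps the within-block index back to the marker's unfiltered
      // index for the report-writing pass below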
6694       mu_table[block_size++] = marker_uidx;
6695       if (marker_idx + block_size == marker_unstopped_ct) {
6696 	break;
6697       }
6698       marker_uidx++;
6699       if (IS_SET(marker_exclude, marker_uidx)) {
6700 	marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
6701 	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
6702 	  goto model_assoc_ret_READ_FAIL;
6703 	}
6704       }
6705     } while ((block_size < block_end) && (marker_uidx < chrom_end));
6706     if (block_size == g_block_start) {
6707       continue;
6708     }
6709     if (!perm_pass_idx) {
6710       // basic --assoc/--model
6711       orig_pvals_ptr = &(orig_pvals[marker_idx + g_block_start]);
6712       missp = &(missing_cts[marker_idx + g_block_start]);
6713       if (model_assoc) {
6714 	setp = &(set_cts[marker_idx + g_block_start]);
6715 	ooptr = &(orig_odds[marker_idx + g_block_start]);
6716 	for (marker_bidx = g_block_start; marker_bidx < block_size; marker_bidx++) {
6717 	  marker_uidx2 = mu_table[marker_bidx];
6718 	  if (!min_ploidy_1) {
6719 	    if (model_maxt_nst) {
6720 	      single_marker_cc_3freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), sample_ctrl_include2, sample_case_include2, &unn, &uoo, &ujj, &upp, &uqq, &umm);
6721 	      het_cts[marker_idx + marker_bidx] = uoo + uqq;
6722 	      homcom_cts[marker_idx + marker_bidx] = unn + upp;
6723 	      uii = 2 * unn + uoo;
6724 	      ukk = 2 * upp + uqq;
6725 	    } else {
6726 	      single_marker_cc_freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), sample_ctrl_include2, sample_case_include2, &uii, &ujj, &ukk, &umm);
6727 	    }
6728 	    *missp = ujj + umm;
6729 	    *setp = uii + ukk;
6730 	    ujj = 2 * (ctrl_ct - ujj) - uii;
6731 	    umm = 2 * (case_ct - umm) - ukk;
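	    // uii/ukk now hold control/case A2 allele counts, and ujj/umm the
	    // corresponding A1 allele counts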
6732 	  } else if (is_x) {
6733 	    single_marker_cc_freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), sample_nonmale_ctrl_include2, sample_nonmale_case_include2, &uii, &ujj, &ukk, &umm);
6734 	    *missp = 2 * (ujj + umm);
6735 	    *setp = uii + ukk;
6736 	    ujj = 2 * (ctrl_nonmale_ct - ujj) - uii;
6737 	    umm = 2 * (case_nonmale_ct - umm) - ukk;
6738 	    haploid_single_marker_cc_freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), sample_male_ctrl_include2, sample_male_case_include2, &unn, &uoo, &upp, &uqq);
6739 	    *missp += uoo + uqq + male_ct;
6740 	    *setp += unn + upp;
6741 	    uoo = ctrl_male_ct - uoo - unn;
6742 	    uqq = case_male_ct - uqq - upp;
6743 	    uii += unn;
6744 	    ujj += uoo;
6745 	    ukk += upp;
6746 	    umm += uqq;
6747 	  } else {
6748 	    haploid_single_marker_cc_freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), cur_ctrl_include2, cur_case_include2, &uii, &ujj, &ukk, &umm);
6749 	    *missp = ujj + umm;
6750 	    *setp = uii + ukk;
6751 	    ujj = load_ctrl_ct - ujj - uii;
6752 	    umm = load_case_ct - umm - ukk;
6753 	    if (is_y) {
6754 	      *missp += nonmale_ct;
6755 	    } else if (model_maxt_nst) {
6756 	      het_cts[marker_idx + marker_bidx] = 0;
6757 	      homcom_cts[marker_idx + marker_bidx] = *setp;
6758 	    }
6759 	  }
6760 	  da1 = umm;
6761 	  da2 = ukk;
6762 	  du1 = ujj;
6763 	  du2 = uii;
6764 	  if (model_fisher) {
6765             // bugfix (12 Jun 2018): If MAF is zero, test should not be
6766             // considered valid for --adjust or permutation testing purposes.
6767             // plink 1.07 got this right, but in a wrong way: it considered
6768             // *all* Fisher's-exact-test p-values of 1 to be invalid tests.  So
6769             // we don't generally want to match its output (even before
6770             // considering the problems with its fisher22 routine).
6771             if ((umm + ujj) && (ukk + uii)) {
6772               pval = fisher22(uii, ujj, ukk, umm, fisher_midp);
6773             } else {
6774               pval = -9;
6775             }
6776 	    *orig_pvals_ptr = pval;
6777 	  } else {
6778 	    if ((umm + ujj) && (ukk + uii)) {
6779 	      dxx = chi22_eval(ukk, ukk + umm, uii + ukk, uii + ujj + ukk + umm);
6780 	      pval = chiprob_p(dxx, 1);
6781 	      *orig_pvals_ptr = pval;
6782 	      if (fill_orig_chisq) {
6783 		orig_chisq[marker_idx + marker_bidx] = dxx;
6784 	      }
6785 	    } else {
6786 	      *orig_pvals_ptr = -9;
6787 	      pval = -1;
6788 	      dxx = 0;
6789 	      if (fill_orig_chisq) {
6790 		orig_chisq[marker_idx + marker_bidx] = -9;
6791 	      }
6792             }
6793 	  }
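	  // allelic odds ratio: (case A1 * control A2) / (control A1 * case A2)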
6794 	  *ooptr = (da1 * du2) / (du1 * da2);
6795 	  if ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0))) {
6796 	    a1ptr = marker_allele_ptrs[2 * marker_uidx2];
6797 	    a2ptr = marker_allele_ptrs[2 * marker_uidx2 + 1];
6798 	    wptr = memcpyax(writebuf, chrom_name_ptr, chrom_name_len, ' ');
6799 	    wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr);
6800 	    *wptr++ = ' ';
6801 	    wptr = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr);
6802 	    wptr = fw_strcpy(4, a1ptr, wptr);
6803 	    *wptr++ = ' ';
6804 	    if (umm + ukk) {
6805 	      if (assoc_counts) {
6806 		wptr = uint32toa_w8(umm, wptr);
6807 	      } else {
6808 		wptr = dtoa_g_wxp4(da1 / (da1 + da2), 8, wptr);
6809 	      }
6810 	      *wptr++ = ' ';
6811 	    } else {
6812 	      wptr = memcpya(wptr, "      NA ", 9);
6813 	    }
6814 	    if (ujj + uii) {
6815 	      if (assoc_counts) {
6816 		wptr = uint32toa_w8(ujj, wptr);
6817 	      } else {
6818 		wptr = dtoa_g_wxp4(du1 / (du1 + du2), 8, wptr);
6819 	      }
6820 	    } else {
6821 	      wptr = memcpya(wptr, "      NA", 8);
6822 	    }
6823 	    *wptr = ' ';
6824 	    wptr = fw_strcpy(4, a2ptr, &(wptr[1]));
6825 	    *wptr++ = ' ';
6826 	    if (model_fisher) {
6827               if (pval == -9) {
6828                 wptr = memcpya(wptr, "           1", 12);
6829               } else {
6830                 wptr = dtoa_g_wxp4(MAXV(pval, output_min_p), 12, wptr);
6831               }
6832 	    } else {
6833 	      if (pval > -1) {
6834 		wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
6835 		wptr = dtoa_g_wxp4(MAXV(pval, output_min_p), 12, wptr);
6836 	      } else {
6837 		wptr = memcpya(wptr, "          NA           NA", 25);
6838 	      }
6839 	    }
6840 	    *wptr++ = ' ';
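	    // The odds ratio was computed above as (da1 * du2) / (du1 * da2);
	    // when that denominator is zero the OR and its CI are printed as
	    // NA.  Otherwise the CI columns use the standard log-OR normal
	    // approximation on the four allele counts:
	    //   SE = sqrt(1/da1 + 1/da2 + 1/du1 + 1/du2)
	    //   bounds = exp(ln(OR) +/- ci_zt * SE)
	    // and the printed columns are SE, L95, U95.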
6841 	    if (du1 * da2 == 0.0) {
6842 	      wptr = memcpya(wptr, "          NA", 12);
6843 	      if (display_ci) {
6844 		wptr = memcpya(wptr, "           NA           NA           NA", 39);
6845 	      }
6846 	    } else {
6847 	      wptr = dtoa_g_wxp4(*ooptr, 12, wptr);
6848 	      if (display_ci) {
6849 		dxx = log(*ooptr);
6850 		dyy = sqrt(1 / da1 + 1 / da2 + 1 / du1 + 1 / du2);
6851 		dzz = ci_zt * dyy;
6852 		dww = exp(dxx - dzz);
6853 		dvv = exp(dxx + dzz);
6854 		*wptr++ = ' ';
6855 		wptr = dtoa_g_wxp4x(dyy, 12, ' ', wptr);
6856 		wptr = dtoa_g_wxp4x(dww, 12, ' ', wptr);
6857 		wptr = dtoa_g_wxp4(dvv, 12, wptr);
6858 	      }
6859 	    }
6860 	    wptr = memcpya(wptr, " \n", 2);
6861 	    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
6862 	      goto model_assoc_ret_WRITE_FAIL;
6863 	    }
6864 	  }
6865 	  missp++;
6866 	  setp++;
6867 	  orig_pvals_ptr++;
6868 	  ooptr++;
6869 	}
6870       } else {
6871 	// repurpose setp as homcom_cts pointer
6872 	setp = &(homcom_cts[marker_idx + g_block_start]);
6873 	hetp = &(het_cts[marker_idx + g_block_start]);
6874 	for (marker_bidx = g_block_start; marker_bidx < block_size; marker_bidx++) {
6875 	  marker_uidx2 = mu_table[marker_bidx];
6876 	  single_marker_cc_3freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), sample_ctrl_include2, sample_case_include2, &uii, &ujj, &ukk, &umm, &unn, &uoo);
6877 	  *missp = ukk + uoo;
6878 	  *setp = uii + umm;
6879 	  ukk = pheno_nm_ct - case_ct - uii - ujj - ukk;
6880 	  uoo = case_ct - umm - unn - uoo;
6881 	  *hetp = ujj + unn;
6882 	  is_invalid = (uoo < model_cell_ct) || (unn < model_cell_ct) || (umm < model_cell_ct) || (ukk < model_cell_ct) || (ujj < model_cell_ct) || (uii < model_cell_ct);
6883 	  a1ptr = marker_allele_ptrs[2 * marker_uidx2];
6884 	  a2ptr = marker_allele_ptrs[2 * marker_uidx2 + 1];
6885 	  wptr = memcpya(writebuf, chrom_name_ptr, chrom_name_len);
6886 	  *wptr++ = ' ';
6887 	  wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr);
6888 	  *wptr++ = ' ';
6889 	  wptr = fw_strcpy(4, a1ptr, wptr);
6890 	  *wptr++ = ' ';
6891 	  wptr = fw_strcpy(4, a2ptr, wptr);
6892 	  memset(wptr, 32, 2);
6893 	  wptr = &(wptr[2]);
6894 	  wptr_mid = wptr;
6895 	  if (!model_trendonly) {
6896 	    memcpy(wptr, "   GENO ", 8);
6897 	    wptr2 = uint32toa_x(uoo, '/', wbuf);
6898 	    wptr2 = uint32toa_x(unn, '/', wptr2);
6899 	    wptr2 = uint32toa(umm, wptr2);
6900 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr[8]));
6901 	    *wptr++ = ' ';
6902 	    wptr2 = uint32toa_x(ukk, '/', wbuf);
6903 	    wptr2 = uint32toa_x(ujj, '/', wptr2);
6904 	    wptr2 = uint32toa(uii, wptr2);
6905 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
6906 	    *wptr++ = ' ';
6907 	    if (is_invalid) {
6908 	      gen_p = -9;
6909 	      if (fill_orig_chisq && (model_modifier & MODEL_PGEN)) {
6910 		orig_chisq[marker_idx + marker_bidx] = -9;
6911 	      }
6912 	    } else {
6913 	      if (model_fisher) {
6914 		gen_p = fisher23(uii, ujj, ukk, umm, unn, uoo, fisher_midp);
6915 	      } else {
6916 		chi23_evalx(uii, ujj, ukk, umm, unn, uoo, &dvv, &upp);
6917 		gen_p = chiprob_px(dvv, upp);
6918 		if (fill_orig_chisq && (model_modifier & MODEL_PGEN)) {
6919 		  if (dvv != -9) {
6920 		    orig_chisq[marker_idx + marker_bidx] = dvv;
6921 		  } else {
6922 		    orig_chisq[marker_idx + marker_bidx] = 0;
6923 		  }
6924 		}
6925 	      }
6926 	    }
6927 	    if (gen_p < -1) {
6928 	      wptr = model_assoc_tna(model_fisher, wptr);
6929 	    } else {
6930 	      if (!model_fisher) {
6931 		wptr = dtoa_g_wxp4(dvv, 12, wptr);
6932 		wptr = memcpya(wptr, "    ", 4);
6933 		*wptr++ = '0' + upp;
6934 		*wptr++ = ' ';
6935 	      }
6936 	      wptr = dtoa_g_wxp4x(MAXV(gen_p, output_min_p), 12, '\n', wptr);
6937 	    }
6938 	    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
6939 	      goto model_assoc_ret_WRITE_FAIL;
6940 	    }
6941 	  }
6942 	  memcpy(wptr_mid, "  TREND ", 8);
6943 	  wptr2 = uint32toa_x(uoo * 2 + unn, '/', wbuf);
6944 	  wptr2 = uint32toa(umm * 2 + unn, wptr2);
6945 	  wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr_mid[8]));
6946 	  *wptr++ = ' ';
6947 	  wptr2 = uint32toa_x(ukk * 2 + ujj, '/', wbuf);
6948 	  wptr2 = uint32toa(uii * 2 + ujj, wptr2);
6949 	  wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
6950 	  *wptr++ = ' ';
6951 	  wptr_mid2 = wptr; // save this for next line
6952 	  ca_chisq = ca_trend_evalx(umm * 2 + unn, umm + unn + uoo, ujj + unn, uii + umm, uii + ujj + ukk + umm + unn + uoo);
6953 	  ca_p = chiprob_px(ca_chisq, 1);
6954 	  if (fill_orig_chisq && (model_modifier & MODEL_PTREND)) {
6955 	    if (ca_chisq != -9) {
6956 	      orig_chisq[marker_idx + marker_bidx] = ca_chisq;
6957 	    } else {
6958 	      orig_chisq[marker_idx + marker_bidx] = 0;
6959 	    }
6960 	  }
6961 	  if (ca_p > -1) {
6962 	    if (!model_fisher) {
6963 	      wptr = dtoa_g_wxp4(ca_chisq, 12, wptr);
6964 	      wptr = memcpya(wptr, "    1 ", 6);
6965 	    }
6966 	    wptr = dtoa_g_wxp4x(MAXV(ca_p, output_min_p), 12, '\n', wptr);
6967 	  } else {
6968 	    wptr = model_assoc_tna(model_fisher, wptr);
6969 	  }
6970 	  if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
6971 	    goto model_assoc_ret_WRITE_FAIL;
6972 	  }
6973 	  if (!model_trendonly) {
6974 	    memcpy(wptr_mid, "ALLELIC", 7);
6975 	    wptr = wptr_mid2;
6976 	    if (model_fisher) {
6977 	      mult_p = fisher22(2 * uoo + unn, 2 * umm + unn, 2 * ukk + ujj, 2 * uii + ujj, fisher_midp);
6978 	    } else {
6979 	      dww = chi22_evalx(2 * uoo + unn, 2 * (uoo + unn + umm), 2 * (uoo + ukk) + unn + ujj, 2 * (uoo + unn + umm + ukk + ujj + uii));
6980 	      mult_p = chiprob_px(dww, 1);
6981 	    }
6982 	    if (mult_p > -1) {
6983 	      if (!model_fisher) {
6984 		wptr = dtoa_g_wxp4(dww, 12, wptr);
6985 		wptr = memcpya(wptr, "    1 ", 6);
6986 	      }
6987 	      wptr = dtoa_g_wxp4x(MAXV(mult_p, output_min_p), 12, '\n', wptr);
6988 	    } else {
6989 	      wptr = model_assoc_tna(model_fisher, wptr);
6990 	    }
6991 	    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
6992 	      goto model_assoc_ret_WRITE_FAIL;
6993 	    }
6994 	    memcpy(wptr_mid, "    DOM", 7);
6995 	    wptr2 = uint32toa_x(uoo + unn, '/', wbuf);
6996 	    wptr2 = uint32toa(umm, wptr2);
6997 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr_mid[8]));
6998 	    *wptr++ = ' ';
6999 	    wptr2 = uint32toa_x(ukk + ujj, '/', wbuf);
7000 	    wptr2 = uint32toa(uii, wptr2);
7001 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
7002 	    *wptr++ = ' ';
7003 	    if (is_invalid) {
7004 	      dom_p = -9;
7005 	      if (fill_orig_chisq && (model_modifier & MODEL_PDOM)) {
7006 		orig_chisq[marker_idx + marker_bidx] = -9;
7007 	      }
7008 	    } else {
7009 	      if (model_fisher) {
7010 		dom_p = fisher22(uoo + unn, umm, ukk + ujj, uii, fisher_midp);
7011 	      } else {
7012 		dww = chi22_evalx(uoo + unn, uoo + unn + umm, uoo + unn + ukk + ujj, uoo + unn + umm + ukk + ujj + uii);
7013 		dom_p = chiprob_px(dww, 1);
7014 		if (fill_orig_chisq && (model_modifier & MODEL_PDOM)) {
7015 		  if (dww != -9) {
7016 		    orig_chisq[marker_idx + marker_bidx] = dww;
7017 		  } else {
7018 		    orig_chisq[marker_idx + marker_bidx] = 0;
7019 		  }
7020 		}
7021 	      }
7022 	    }
7023 	    if (dom_p < -1) {
7024 	      wptr = model_assoc_tna(model_fisher, wptr);
7025 	    } else {
7026 	      if (!model_fisher) {
7027 		wptr = dtoa_g_wxp4(dww, 12, wptr);
7028 		wptr = memcpya(wptr, "    1 ", 6);
7029 	      }
7030 	      wptr = dtoa_g_wxp4x(MAXV(dom_p, output_min_p), 12, '\n', wptr);
7031 	    }
7032 	    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
7033 	      goto model_assoc_ret_WRITE_FAIL;
7034 	    }
7035 	    memcpy(&(wptr_mid[4]), "REC", 3);
7036 	    wptr2 = uint32toa_x(uoo, '/', wbuf);
7037 	    wptr2 = uint32toa(unn + umm, wptr2);
7038 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr_mid[8]));
7039 	    *wptr++ = ' ';
7040 	    wptr2 = uint32toa_x(ukk, '/', wbuf);
7041 	    wptr2 = uint32toa(ujj + uii, wptr2);
7042 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
7043 	    *wptr++ = ' ';
7044 	    if (is_invalid) {
7045 	      rec_p = -9;
7046 	      if (fill_orig_chisq && (model_modifier & MODEL_PREC)) {
7047 		orig_chisq[marker_idx + marker_bidx] = -9;
7048 	      }
7049 	    } else {
7050 	      if (model_fisher) {
7051 		rec_p = fisher22(uoo, unn + umm, ukk, ujj + uii, fisher_midp);
7052 	      } else {
7053 		dww = chi22_evalx(uoo, uoo + unn + umm, uoo + ukk, uoo + unn + umm + ukk + ujj + uii);
7054 		rec_p = chiprob_px(dww, 1);
7055 		if (fill_orig_chisq && (model_modifier & MODEL_PREC)) {
7056 		  if (dww != -9) {
7057 		    orig_chisq[marker_idx + marker_bidx] = dww;
7058 		  } else {
7059 		    orig_chisq[marker_idx + marker_bidx] = 0;
7060 		  }
7061 		}
7062 	      }
7063 	    }
7064 	    if (rec_p < -1) {
7065 	      wptr = model_assoc_tna(model_fisher, wptr);
7066 	    } else {
7067 	      if (!model_fisher) {
7068 		wptr = dtoa_g_wxp4(dww, 12, wptr);
7069 		wptr = memcpya(wptr, "    1 ", 6);
7070 	      }
7071 	      wptr = dtoa_g_wxp4x(MAXV(rec_p, output_min_p), 12, '\n', wptr);
7072 	    }
7073 	    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
7074 	      goto model_assoc_ret_WRITE_FAIL;
7075 	    }
7076 	  }
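	  // Select the p-value used for permutation/adjustment: under 'best',
	  // the smallest valid p among the ALLELIC/DOM/REC tests (and, when
	  // orig_chisq is being filled, its 1df chi-square equivalent);
	  // otherwise the p-value of the single requested test.  -9 marks
	  // markers where the test was invalid.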
7077 	  if (model_perm_best) {
7078 	    dxx = mult_p;
7079 	    if (!is_invalid) {
7080 	      if ((dom_p < dxx) && (dom_p >= 0)) {
7081 		dxx = dom_p;
7082 	      }
7083 	      if ((rec_p < dxx) && (rec_p >= 0)) {
7084 		dxx = rec_p;
7085 	      }
7086 	    }
7087 	    if (model_perms && is_invalid) {
7088 	      set_bit_ul(marker_idx + marker_bidx, is_invalid_bitfield);
7089 	    }
7090 	    if (fill_orig_chisq) {
7091 	      if (dxx != -9) {
7092 		orig_chisq[marker_idx + marker_bidx] = inverse_chiprob(dxx, 1);
7093 	      } else {
7094 		orig_chisq[marker_idx + marker_bidx] = -9;
7095 	      }
7096 	    }
7097 	  } else if (model_modifier & MODEL_PGEN) {
7098 	    dxx = (gen_p >= 0)? gen_p : -9;
7099 	  } else if (model_modifier & MODEL_PDOM) {
7100 	    dxx = (dom_p >= 0)? dom_p : -9;
7101 	  } else if (model_modifier & MODEL_PREC) {
7102 	    dxx = (rec_p >= 0)? rec_p : -9;
7103 	  } else if (model_modifier & MODEL_PTREND) {
7104 	    dxx = (ca_p >= 0)? ca_p : -9;
7105 	  }
7106 	  missp++;
7107 	  setp++;
7108 	  hetp++;
7109 	  *orig_pvals_ptr++ = dxx;
7110 	}
7111       }
7112     }
7113     if (model_perms_nst) {
7114       g_block_diff = block_size - g_block_start;
7115       assoc_thread_ct = g_block_diff;
7116       if (assoc_thread_ct > max_thread_ct) {
7117 	assoc_thread_ct = max_thread_ct;
7118       }
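      // For max(T), maxt_extreme_stat[] holds each permutation's most extreme
      // statistic so far (minimum p-value under Fisher, maximum chi-square
      // otherwise).  The loop below picks the *least* extreme of these, so
      // the classification bounds precomputed from it can safely be applied
      // to every permutation.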
7119       if (model_maxt_nst) {
7120 	if (model_fisherx) {
7121 	  maxt_cur_extreme_stat = maxt_extreme_stat[0];
7122 	  for (uii = 1; uii < perm_vec_ct; uii++) {
7123 	    dxx = maxt_extreme_stat[uii];
7124 	    if (dxx > maxt_cur_extreme_stat) {
7125 	      maxt_cur_extreme_stat = dxx;
7126 	    }
7127 	  }
7128 	} else {
7129 	  maxt_cur_extreme_stat = maxt_extreme_stat[0];
7130 	  for (uii = 1; uii < perm_vec_ct; uii++) {
7131 	    dxx = maxt_extreme_stat[uii];
7132 	    if (dxx < maxt_cur_extreme_stat) {
7133 	      maxt_cur_extreme_stat = dxx;
7134 	    }
7135 	  }
7136 	}
7137       }
7138       if (model_assoc) {
7139 	if (min_ploidy_1) {
7140 	  uqq = 1;
7141 	} else {
7142 	  uqq = 2;
7143 	}
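	// uqq = alleles contributed per non-missing sample: 1 on haploid
	// chromosomes, 2 otherwise; chrX is special-cased inside the loop.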
7144 	for (uii = g_block_start; uii < block_size; uii++) {
7145 	  if (model_adapt_nst) {
7146 	    urr = g_adapt_m_table[uii];
7147 	  } else {
7148 	    urr = marker_idx + uii;
7149 	  }
7150 	  upp = missing_cts[urr];
7151 	  get_model_assoc_precomp_bounds(upp, 0, &ujj, &ukk);
7152 	  g_precomp_start[uii] = ujj;
7153 	  uoo = set_cts[urr];
7154 	  if (is_x) {
7155 	    unn = 2 * case_ct;
7156 	    upp = 2 * pheno_nm_ct - upp;
7157 	  } else {
7158 	    unn = uqq * case_ct;
7159 	    upp = uqq * (pheno_nm_ct - upp);
7160 	  }
7161 	  ujj *= uqq;
7162 	  ukk += uii * precomp_width;
7163 	  if (model_fisher) {
7164 	    dxx = orig_pvals[urr];
7165 	    if (model_adapt_nst) {
7166 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7167 		fisher22_precomp_pval_bounds(dxx, fisher_midp, unn - ujj, uoo, upp, &(precomp_ui[umm * 4]), nullptr);
7168 		ujj += uqq;
7169 	      }
7170 	    } else {
7171 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7172 		fisher22_precomp_pval_bounds(dxx, fisher_midp, unn - ujj, uoo, upp, &(precomp_ui[umm * 6]), nullptr);
7173 		fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, fisher_midp, unn - ujj, uoo, upp, uibuf, &(precomp_d[umm * 2]));
7174 		precomp_ui[umm * 6 + 4] = uibuf[2];
7175 		precomp_ui[umm * 6 + 5] = uibuf[3] - uibuf[2];
7176 		ujj += uqq;
7177 	      }
7178 	    }
7179 	  } else {
7180 	    dxx = orig_chisq[urr];
7181 	    if (model_adapt_nst) {
7182 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7183 		chi22_precomp_val_bounds(dxx, unn - ujj, uoo, upp, &(precomp_ui[umm * 4]), nullptr);
7184 		ujj += uqq;
7185 	      }
7186 	    } else {
7187 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7188 		chi22_precomp_val_bounds(dxx, unn - ujj, uoo, upp, &(precomp_ui[umm * 6]), nullptr);
7189 		chi22_precomp_val_bounds(maxt_cur_extreme_stat, unn - ujj, uoo, upp, uibuf, &(precomp_d[umm * 2]));
7190 		precomp_ui[umm * 6 + 4] = uibuf[2];
7191 		precomp_ui[umm * 6 + 5] = uibuf[3] - uibuf[2];
7192 		ujj += uqq;
7193 	      }
7194 	    }
7195 	  }
7196 	}
7197       } else if (model_perm_best) {
7198 	for (uii = g_block_start; uii < block_size; uii++) {
7199 	  if (model_adapt_nst) {
7200 	    urr = g_adapt_m_table[uii];
7201 	  } else {
7202 	    urr = marker_idx + uii;
7203 	  }
7204 	  upp = missing_cts[urr];
7205 	  get_model_assoc_precomp_bounds(upp, 1, &ujj, &ukk);
7206 	  g_precomp_start[uii] = ujj;
7207 	  unn = 2 * case_ct;
7208 	  uqq = 2 * (pheno_nm_ct - upp);
7209 	  uoo = 2 * homcom_cts[urr] + het_cts[urr];
7210 	  ukk += uii * precomp_width;
7211 	  uss = 2 * ujj;
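	  // 'best' model: precompute classification bounds for the allelic
	  // 2x2 test first (allele counts: unn/uoo/uqq), then, for markers
	  // where the genotype-based tests are valid, for the DOM and REC
	  // tests (sample counts).  Each precomp_width cell uses 12
	  // (adaptive) or 18 (max(T)) uint32 slots to hold the three tests'
	  // bounds.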
7212 	  if (model_fisher) {
7213 	    dxx = orig_pvals[urr];
7214 	    if (model_adapt_nst) {
7215 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7216 	        fisher22_precomp_pval_bounds(dxx, fisher_midp, unn - uss, uoo, uqq, &(precomp_ui[umm * 12]), nullptr);
7217 		uss += 2;
7218 	      }
7219 	    } else {
7220 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7221 	        fisher22_precomp_pval_bounds(dxx, fisher_midp, unn - uss, uoo, uqq, &(precomp_ui[umm * 18]), nullptr);
7222 	        fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, fisher_midp, 2 * case_ct - uss, uoo, uqq, uibuf, &(precomp_d[umm * 6]));
7223 		precomp_ui[umm * 18 + 4] = uibuf[2];
7224 		precomp_ui[umm * 18 + 5] = uibuf[3] - uibuf[2];
7225 		uss += 2;
7226 	      }
7227 	    }
7228 	    if (!IS_SET(is_invalid_bitfield, urr)) {
7229 	      upp = pheno_nm_ct - upp;
7230 	      uoo = homcom_cts[urr];
7231 	      uqq = upp - uoo - het_cts[urr];
7232 	      ujj = case_ct - ujj;
7233 	      if (model_adapt_nst) {
7234 		for (umm = uii * precomp_width; umm < ukk; umm++) {
7235 		  fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj, uoo, upp, &(precomp_ui[umm * 12 + 4]), nullptr);
7236 		  fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj, uqq, upp, &(precomp_ui[umm * 12 + 8]), nullptr);
7237 		  ujj--;
7238 		}
7239 	      } else {
7240 		for (umm = uii * precomp_width; umm < ukk; umm++) {
7241 		  fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj, uoo, upp, &(precomp_ui[umm * 18 + 6]), nullptr);
7242 		  fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, fisher_midp, ujj, uoo, upp, uibuf, &(precomp_d[umm * 6 + 2]));
7243 		  precomp_ui[umm * 18 + 10] = uibuf[2];
7244 		  precomp_ui[umm * 18 + 11] = uibuf[3] - uibuf[2];
7245 		  fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj, uqq, upp, &(precomp_ui[umm * 18 + 12]), nullptr);
7246 		  fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, fisher_midp, ujj, uqq, upp, uibuf, &(precomp_d[umm * 6 + 4]));
7247 		  precomp_ui[umm * 18 + 16] = uibuf[2];
7248 		  precomp_ui[umm * 18 + 17] = uibuf[3] - uibuf[2];
7249 		  ujj--;
7250 		}
7251 	      }
7252 	    }
7253 	  } else {
7254 	    dxx = orig_chisq[urr];
7255 	    if (model_adapt_nst) {
7256 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7257 		chi22_precomp_val_bounds(dxx, unn - uss, uoo, uqq, &(precomp_ui[umm * 12]), nullptr);
7258 		uss += 2;
7259 	      }
7260 	    } else {
7261 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7262 		chi22_precomp_val_bounds(dxx, unn - uss, uoo, uqq, &(precomp_ui[umm * 18]), nullptr);
7263 		chi22_precomp_val_bounds(maxt_cur_extreme_stat, unn - uss, uoo, uqq, uibuf, &(precomp_d[umm * 6]));
7264 		precomp_ui[umm * 18 + 4] = uibuf[2];
7265 		precomp_ui[umm * 18 + 5] = uibuf[3] - uibuf[2];
7266 		uss += 2;
7267 	      }
7268 	    }
7269 	    if (!IS_SET(is_invalid_bitfield, urr)) {
7270 	      upp = pheno_nm_ct - upp;
7271 	      uoo = homcom_cts[urr];
7272 	      uqq = upp - uoo - het_cts[urr];
7273 	      ujj = case_ct - ujj;
7274 	      if (model_adapt_nst) {
7275 		for (umm = uii * precomp_width; umm < ukk; umm++) {
7276 		  chi22_precomp_val_bounds(dxx, ujj, uoo, upp, &(precomp_ui[umm * 12 + 4]), nullptr);
7277 		  chi22_precomp_val_bounds(dxx, ujj, uqq, upp, &(precomp_ui[umm * 12 + 8]), nullptr);
7278 		  ujj--;
7279 		}
7280 	      } else {
7281 		for (umm = uii * precomp_width; umm < ukk; umm++) {
7282 		  chi22_precomp_val_bounds(dxx, ujj, uoo, upp, &(precomp_ui[umm * 18 + 6]), nullptr);
7283 		  chi22_precomp_val_bounds(maxt_cur_extreme_stat, ujj, uoo, upp, uibuf, &(precomp_d[umm * 6 + 2]));
7284 		  precomp_ui[umm * 18 + 10] = uibuf[2];
7285 		  precomp_ui[umm * 18 + 11] = uibuf[3] - uibuf[2];
7286 		  chi22_precomp_val_bounds(dxx, ujj, uqq, upp, &(precomp_ui[umm * 18 + 12]), nullptr);
7287 		  chi22_precomp_val_bounds(maxt_cur_extreme_stat, ujj, uqq, upp, uibuf, &(precomp_d[umm * 6 + 4]));
7288 		  precomp_ui[umm * 18 + 16] = uibuf[2];
7289 		  precomp_ui[umm * 18 + 17] = uibuf[3] - uibuf[2];
7290 		  ujj--;
7291 		}
7292 	      }
7293 	    }
7294 	  }
7295 	}
7296       } else if (model_modifier & MODEL_PTREND) {
7297 	for (uii = g_block_start; uii < block_size; uii++) {
7298 	  if (model_adapt_nst) {
7299 	    urr = g_adapt_m_table[uii];
7300 	  } else {
7301 	    urr = marker_idx + uii;
7302 	  }
7303 	  upp = missing_cts[urr];
7304 	  get_model_assoc_precomp_bounds(upp, 1, &ujj, &ukk);
7305 	  g_precomp_start[uii] = ujj;
7306 	  unn = het_cts[urr];
7307 	  upp = pheno_nm_ct - upp; // tot_obs
7308 	  uoo = homcom_cts[urr];
7309 	  ukk += uii * precomp_width;
7310 	  ujj = case_ct - ujj;
7311 	  dxx = orig_chisq[urr];
7312 	  if (model_adapt_nst) {
7313 	    for (umm = uii * precomp_width; umm < ukk; umm++) {
7314 	      ca_trend_precomp_val_bounds(dxx, ujj--, unn, uoo, upp, &(precomp_ui[umm * 4]), nullptr);
7315 	    }
7316 	  } else {
7317 	    for (umm = uii * precomp_width; umm < ukk; umm++) {
7318 	      ca_trend_precomp_val_bounds(dxx, ujj, unn, uoo, upp, &(precomp_ui[umm * 6]), nullptr);
7319               ca_trend_precomp_val_bounds(maxt_cur_extreme_stat, ujj--, unn, uoo, upp, uibuf, &(precomp_d[umm * 2]));
7320 	      precomp_ui[umm * 6 + 4] = uibuf[2];
7321 	      precomp_ui[umm * 6 + 5] = uibuf[3] - uibuf[2];
7322 	    }
7323 	  }
7324 	}
7325       } else if (model_modifier & (MODEL_PDOM | MODEL_PREC)) {
7326 	for (uii = g_block_start; uii < block_size; uii++) {
7327 	  if (model_adapt_nst) {
7328 	    urr = g_adapt_m_table[uii];
7329 	  } else {
7330 	    urr = marker_idx + uii;
7331 	  }
7332 	  upp = missing_cts[urr];
7333 	  get_model_assoc_precomp_bounds(upp, 1, &ujj, &ukk);
7334 	  g_precomp_start[uii] = ujj;
7335 	  upp = pheno_nm_ct - upp; // tot_obs
7336 	  if (model_modifier & MODEL_PREC) {
7337 	    uoo = upp - homcom_cts[urr] - het_cts[urr]; // col1_sum
7338 	  } else {
7339 	    uoo = homcom_cts[urr];
7340 	  }
7341 	  ukk += uii * precomp_width;
7342 	  ujj = case_ct - ujj;
7343 	  if (model_fisher) {
7344 	    dxx = orig_pvals[urr];
7345 	    if (model_adapt_nst) {
7346 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7347 	        fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj--, uoo, upp, &(precomp_ui[umm * 4]), nullptr);
7348 	      }
7349 	    } else {
7350 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7351 	        fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj, uoo, upp, &(precomp_ui[umm * 6]), nullptr);
7352 	        fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, fisher_midp, ujj--, uoo, upp, uibuf, &(precomp_d[umm * 2]));
7353 		precomp_ui[umm * 6 + 4] = uibuf[2];
7354 		precomp_ui[umm * 6 + 5] = uibuf[3] - uibuf[2];
7355 	      }
7356 	    }
7357 	  } else {
7358 	    dxx = orig_chisq[urr];
7359 	    if (model_adapt_nst) {
7360 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7361 		chi22_precomp_val_bounds(dxx, ujj--, uoo, upp, &(precomp_ui[umm * 4]), nullptr);
7362 	      }
7363 	    } else {
7364 	      for (umm = uii * precomp_width; umm < ukk; umm++) {
7365 		chi22_precomp_val_bounds(dxx, ujj, uoo, upp, &(precomp_ui[umm * 6]), nullptr);
7366 		chi22_precomp_val_bounds(maxt_cur_extreme_stat, ujj--, uoo, upp, uibuf, &(precomp_d[umm * 2]));
7367 		precomp_ui[umm * 6 + 4] = uibuf[2];
7368 		precomp_ui[umm * 6 + 5] = uibuf[3] - uibuf[2];
7369 	      }
7370 	    }
7371 	  }
7372 	}
7373       }
7374       is_last_block = (marker_idx + block_size == marker_unstopped_ct);
7375       ulii = 0;
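      // Thread dispatch pattern used throughout: spawn_threads2() launches
      // workers for the remaining thread indices, while the main thread
      // calls the same routine directly with tid 0 (ulii == 0) and then
      // joins.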
7376       if (model_adapt_nst) {
7377 	if (model_assoc) {
7378 	  if (spawn_threads2(threads, &assoc_adapt_thread, max_thread_ct, is_last_block)) {
7379 	    goto model_assoc_ret_THREAD_CREATE_FAIL;
7380 	  }
7381 	  assoc_adapt_thread((void*)ulii);
7382 	} else if (model_modifier & (MODEL_PDOM | MODEL_PREC)) {
7383 	  if (spawn_threads2(threads, &model_adapt_domrec_thread, max_thread_ct, is_last_block)) {
7384 	    goto model_assoc_ret_THREAD_CREATE_FAIL;
7385 	  }
7386 	  model_adapt_domrec_thread((void*)ulii);
7387 	} else if (model_modifier & MODEL_PTREND) {
7388 	  if (spawn_threads2(threads, &model_adapt_trend_thread, max_thread_ct, is_last_block)) {
7389 	    goto model_assoc_ret_THREAD_CREATE_FAIL;
7390 	  }
7391 	  model_adapt_trend_thread((void*)ulii);
7392 	} else if (model_modifier & MODEL_PGEN) {
7393 	  if (spawn_threads2(threads, &model_adapt_gen_thread, max_thread_ct, is_last_block)) {
7394 	    goto model_assoc_ret_THREAD_CREATE_FAIL;
7395 	  }
7396 	  model_adapt_gen_thread((void*)ulii);
7397 	} else {
7398 	  if (spawn_threads2(threads, &model_adapt_best_thread, max_thread_ct, is_last_block)) {
7399 	    goto model_assoc_ret_THREAD_CREATE_FAIL;
7400 	  }
7401 	  model_adapt_best_thread((void*)ulii);
7402 	}
7403 	join_threads2(threads, max_thread_ct, is_last_block);
7404       } else {
7405 	g_maxt_block_base = marker_idx;
7406 	if (model_assoc) {
7407 	  if (spawn_threads2(threads, &assoc_maxt_thread, max_thread_ct, is_last_block)) {
7408 	    goto model_assoc_ret_THREAD_CREATE_FAIL;
7409 	  }
7410 	  assoc_maxt_thread((void*)ulii);
7411 	} else if (model_modifier & (MODEL_PDOM | MODEL_PREC)) {
7412 	  if (spawn_threads2(threads, &model_maxt_domrec_thread, max_thread_ct, is_last_block)) {
7413 	    goto model_assoc_ret_THREAD_CREATE_FAIL;
7414 	  }
7415 	  model_maxt_domrec_thread((void*)ulii);
7416 	} else if (model_modifier & MODEL_PTREND) {
7417 	  if (spawn_threads2(threads, &model_maxt_trend_thread, max_thread_ct, is_last_block)) {
7418 	    goto model_assoc_ret_THREAD_CREATE_FAIL;
7419 	  }
7420 	  model_maxt_trend_thread((void*)ulii);
7421 	} else if (model_modifier & MODEL_PGEN) {
7422 	  if (spawn_threads2(threads, &model_maxt_gen_thread, max_thread_ct, is_last_block)) {
7423 	    goto model_assoc_ret_THREAD_CREATE_FAIL;
7424 	  }
7425 	  model_maxt_gen_thread((void*)ulii);
7426 	} else {
7427 	  if (spawn_threads2(threads, &model_maxt_best_thread, max_thread_ct, is_last_block)) {
7428 	    goto model_assoc_ret_THREAD_CREATE_FAIL;
7429 	  }
7430 	  model_maxt_best_thread((void*)ulii);
7431 	}
7432 	join_threads2(threads, max_thread_ct, is_last_block);
7433 	ulii = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
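	// Each max(T) worker thread wrote its per-permutation extreme
	// statistic into g_maxt_thread_results with a cacheline-aligned
	// stride of ulii doubles; fold them into the global
	// maxt_extreme_stat[] entries for this batch (keeping the minimum
	// p-value under Fisher, the maximum chi-square otherwise).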
7434         if (model_fisherx) {
7435 	  for (uii = 0; uii < assoc_thread_ct; uii++) {
7436 	    ooptr = &(g_maxt_thread_results[uii * ulii]);
7437 	    for (ujj = perms_done - perm_vec_ct; ujj < perms_done; ujj++) {
7438 	      dxx = *ooptr++;
7439 	      if (dxx < maxt_extreme_stat[ujj]) {
7440 		maxt_extreme_stat[ujj] = dxx;
7441 	      }
7442 	    }
7443 	  }
7444 	} else {
7445 	  for (uii = 0; uii < assoc_thread_ct; uii++) {
7446 	    ooptr = &(g_maxt_thread_results[uii * ulii]);
7447 	    for (ujj = perms_done - perm_vec_ct; ujj < perms_done; ujj++) {
7448 	      dxx = *ooptr++;
7449 	      if (dxx > maxt_extreme_stat[ujj]) {
7450 		maxt_extreme_stat[ujj] = dxx;
7451 	      }
7452 	    }
7453 	  }
7454 	}
7455       }
7456     }
7457     marker_idx += block_size;
7458     if ((!perm_pass_idx) && (marker_idx >= loop_end)) {
7459       if (marker_idx < marker_unstopped_ct) {
7460 	if (pct >= 10) {
7461 	  putc_unlocked('\b', stdout);
7462 	}
7463 	pct = (marker_idx * 100LLU) / marker_unstopped_ct;
7464 	printf("\b\b%u%%", pct);
7465 	fflush(stdout);
7466 	loop_end = (((uint64_t)pct + 1LLU) * marker_unstopped_ct) / 100;
7467       }
7468     }
7469   } while (marker_idx < marker_unstopped_ct);
7470   if (!perm_pass_idx) {
7471     if (pct >= 10) {
7472       putc_unlocked('\b', stdout);
7473     }
7474     fputs("\b\b", stdout);
7475     logprint("done.\n");
7476     if (model_perms_nst) {
7477       bigstack_reset(g_perm_vecs);
7478     }
7479     if (fclose_null(&outfile)) {
7480       goto model_assoc_ret_WRITE_FAIL;
7481     }
7482     if (!is_set_test) {
7483       if (mtest_adjust) {
7484         if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
7485 	  goto model_assoc_ret_NOMEM;
7486         }
7487         fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
7488         retval = multcomp(outname, outname_end, marker_idx_to_uidx, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, model_fisher? nullptr : orig_chisq, pfilter, output_min_p, mtest_adjust, (!model_assoc) && (!(model_modifier & MODEL_PTREND)), adjust_lambda, nullptr, model_fisher? orig_pvals : nullptr);
7489         if (retval) {
7490 	  goto model_assoc_ret_1;
7491         }
7492         bigstack_reset(marker_idx_to_uidx);
7493       }
7494       if (mperm_save & MPERM_DUMP_ALL) {
7495 	g_textbuf[0] = '0';
7496 	wptr = &(g_textbuf[1]);
7497 	a1ptr = &(g_textbuf[MAXLINELEN]);
7498 	if (model_fisherx) {
7499 	  for (uii = 0; uii < marker_ct; uii++) {
7500 	    *wptr++ = ' ';
7501 	    dxx = orig_pvals[uii];
7502 	    if (dxx >= 0) {
7503 	      wptr = dtoa_g(dxx, wptr);
7504 	    } else {
7505 	      wptr = memcpya(wptr, "NA", 2);
7506 	    }
7507 	    if (wptr >= a1ptr) {
7508 	      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
7509 		goto model_assoc_ret_WRITE_FAIL;
7510 	      }
7511 	      wptr = g_textbuf;
7512 	    }
7513 	  }
7514 	} else {
7515 	  for (uii = 0; uii < marker_ct; uii++) {
7516 	    *wptr++ = ' ';
7517 	    dxx = orig_chisq[uii];
7518 	    if (dxx >= 0) {
7519 	      wptr = dtoa_g(dxx, wptr);
7520 	    } else {
7521 	      wptr = memcpya(wptr, "NA", 2);
7522 	    }
7523 	    if (wptr >= a1ptr) {
7524 	      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
7525 		goto model_assoc_ret_WRITE_FAIL;
7526 	      }
7527 	      wptr = g_textbuf;
7528 	    }
7529 	  }
7530 	}
7531 	*wptr++ = '\n';
7532 	if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
7533 	  goto model_assoc_ret_WRITE_FAIL;
7534 	}
7535       }
7536     } else {
7537       retval = model_assoc_set_test(threads, bedfile, bed_offset, outname, outname_end, outname_end2, model_modifier, model_mperm_val, pfilter, output_min_p, mtest_adjust, unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, sex_male, apip, pheno_nm_ct, pheno_nm, founder_pnm, gender_req, ld_ignore_x, hh_exists, perm_batch_size, sip, loadbuf_raw);
7538       if (retval) {
7539         goto model_assoc_ret_1;
7540       }
7541     }
7542   }
7543   if (model_perms_nst) {
7544     if (mperm_save & MPERM_DUMP_ALL) {
7545       if (perm_pass_idx) {
7546 	putc_unlocked(' ', stdout);
7547       }
7548       fputs("[dumping stats]", stdout);
7549       fflush(stdout);
7550       ulii = perm_vec_ct;
7551       ujj = 1 + perms_done - ulii;
7552       wptr = g_textbuf;
7553       a1ptr = &(g_textbuf[MAXLINELEN]);
7554       for (uii = 0; uii < ulii; uii++) {
7555 	wptr = uint32toa(uii + ujj, wptr);
7556         orig_pvals_ptr = &(g_mperm_save_all[uii]);
7557 	for (ukk = 0; ukk < marker_ct; ukk++) {
7558 	  *wptr++ = ' ';
7559 	  dxx = orig_pvals_ptr[ukk * ulii];
7560 	  if (dxx >= 0) {
7561 	    wptr = dtoa_g(dxx, wptr);
7562 	  } else {
7563 	    wptr = memcpya(wptr, "NA", 2);
7564 	  }
7565 	  if (wptr >= a1ptr) {
7566 	    if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
7567 	      goto model_assoc_ret_WRITE_FAIL;
7568 	    }
7569 	    wptr = g_textbuf;
7570 	  }
7571 	}
7572 	*wptr++ = '\n';
7573       }
7574       if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
7575 	goto model_assoc_ret_WRITE_FAIL;
7576       }
7577       fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b               ", stdout);
7578     }
7579     bigstack_reset(g_perm_vecs);
7580     if (perms_done < perms_total) {
7581       if (model_adapt_nst) {
7582 	marker_unstopped_ct = marker_ct - popcount01_longs((uintptr_t*)perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
7583 	if (!marker_unstopped_ct) {
7584 	  goto model_assoc_adapt_perm_count;
7585 	}
7586       }
7587       printf("\r%u permutation%s complete.", perms_done, (perms_done != 1)? "s" : "");
7588       fflush(stdout);
7589       perm_pass_idx++;
7590       goto model_assoc_more_perms;
7591     }
7592     if (model_adapt_nst) {
7593     model_assoc_adapt_perm_count:
7594       perms_done = 0;
7595       for (uii = 0; uii < marker_ct; uii++) {
7596 	if (perm_attempt_ct[uii] > perms_done) {
7597 	  perms_done = perm_attempt_ct[uii];
7598 	  if (perms_done == perms_total) {
7599 	    break;
7600 	  }
7601 	}
7602       }
7603     }
7604     putc_unlocked('\r', stdout);
7605     LOGPRINTF("%u %s permutation%s complete.\n", perms_done, model_maxt_nst? "max(T)" : "(adaptive)", (perms_done != 1)? "s" : "");
7606     if (model_fisher && (model_modifier & MODEL_PTREND)) {
7607       outname_end2 -= 7; // remove ".fisher"
7608     }
7609     if (model_adapt_nst) {
7610       memcpy(outname_end2, ".perm", 6);
7611     } else {
7612       if (mperm_save & MPERM_DUMP_BEST) {
7613 	if (bigstack_alloc_c(FNAMESIZE, &a1ptr)) {
7614 	  goto model_assoc_ret_NOMEM;
7615 	}
7616 	ulii = outname_end - outname;
7617 	memcpy(a1ptr, outname, ulii);
7618 	memcpy(&(a1ptr[ulii]), ".mperm.dump.best", 17);
7619 	LOGPRINTFWW("Dumping best permutation %svalues to %s .\n", model_fisherx? "p-" : "chi-square ", a1ptr);
7620 	if (fopen_checked(a1ptr, "w", &outfile)) {
7621 	  goto model_assoc_ret_OPEN_FAIL;
7622 	}
7623 	dxx = 0;
7624 	if (model_fisherx) {
7625 	  for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
7626 	    if ((orig_pvals[marker_idx] != -9) && (orig_pvals[marker_idx] < dxx)) {
7627 	      dxx = orig_pvals[marker_idx];
7628 	    }
7629 	  }
7630 	  dxx = 1 - dxx;
7631 	} else {
7632 	  for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
7633 	    if (orig_chisq[marker_idx] > dxx) {
7634 	      dxx = orig_chisq[marker_idx];
7635 	    }
7636 	  }
7637 	}
7638         memcpy(g_textbuf, "0 ", 2);
7639 	wptr = dtoa_gx(dxx, '\n', &(g_textbuf[2]));
7640 	if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
7641 	  goto model_assoc_ret_WRITE_FAIL;
7642 	}
7643 	for (uii = 0; uii < perms_total; uii++) {
7644 	  wptr = uint32toa_x(uii + 1, ' ', g_textbuf);
7645 	  wptr = dtoa_gx(maxt_extreme_stat[uii], '\n', wptr);
7646 	  if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
7647 	    goto model_assoc_ret_WRITE_FAIL;
7648 	  }
7649 	}
7650 	if (fclose_null(&outfile)) {
7651 	  goto model_assoc_ret_WRITE_FAIL;
7652 	}
7653       }
7654       memcpy(outname_end2, ".mperm", 7);
7655     }
7656     if (fopen_checked(outname, "w", &outfile)) {
7657       goto model_assoc_ret_OPEN_FAIL;
7658     }
7659     if (model_adapt_nst) {
7660       sprintf(g_textbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
7661     } else {
7662       sprintf(g_textbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
7663 #ifdef __cplusplus
7664       std::sort(maxt_extreme_stat, &(maxt_extreme_stat[perms_total]));
7665 #else
7666       qsort(maxt_extreme_stat, perms_total, sizeof(double), double_cmp);
7667 #endif
7668     }
7669     /*
7670     if (model_maxt_nst) {
7671       printf("extreme stats: %g %g\n", maxt_extreme_stat[0], maxt_extreme_stat[perms_total - 1]);
7672     }
7673     */
7674     fprintf(outfile, g_textbuf, "SNP");
7675     chrom_fo_idx = 0xffffffffU;
7676     marker_uidx = next_unset_unsafe(marker_exclude, 0);
7677     marker_idx = 0;
7678     dyy = 1.0 / ((double)((int32_t)perms_total + 1));
7679     dxx = 0.5 * dyy;
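    // Empirical p-value scale factors: perm_2success_ct[] accumulates two
    // units per permutation at least as extreme as the observed statistic
    // (the doubling lets ties count as half a success), so
    // (perm_2success_ct + 2) / (2 * (N + 1)) is the usual (r + 1) / (N + 1)
    // estimator.  dyy = 1 / (T + 1) is reused below for EMP2.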
7680     while (1) {
7681       while (1) {
7682 	do {
7683           chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
7684 	} while (marker_uidx >= chrom_end);
7685 	uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
7686 	is_x = (uii == (uint32_t)x_code);
7687 	if (model_assoc || (((!IS_SET(haploid_mask, uii)) && (uii != (uint32_t)mt_code)) || is_x)) {
7688 	  break;
7689 	}
7690 	marker_uidx = next_unset_unsafe(marker_exclude, chrom_end);
7691       }
7692       wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
7693       *wptr_start++ = ' ';
7694       wptr_start[plink_maxsnp] = ' ';
7695       for (; marker_uidx < chrom_end;) {
7696 	if (model_adapt_nst) {
7697 	  pval = ((double)(perm_2success_ct[marker_idx] + 2)) / ((double)(2 * (perm_attempt_ct[marker_idx] + 1)));
7698 	} else {
7699 	  pval = ((double)(perm_2success_ct[marker_idx] + 2)) * dxx;
7700 	}
7701         if (pval <= pfilter) {
7702 	  fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
7703 	  wptr = &(wptr_start[1 + plink_maxsnp]);
7704 	  if ((!model_assoc) && ((model_adapt_nst && (!perm_attempt_ct[marker_idx])) || ((!model_adapt_nst) && ((model_fisherx && (orig_pvals[marker_idx] == -9)) || ((!model_fisherx) && (orig_chisq[marker_idx] == -9)))))) {
7705 	    // invalid
7706             wptr = memcpya(wptr, "          NA           NA", 25);
7707 	  } else {
7708 	    if (!model_perm_count) {
7709 	      wptr = dtoa_g_wxp4x(pval, 12, ' ', wptr);
7710 	    } else {
7711 	      wptr = dtoa_g_wxp4x(((double)perm_2success_ct[marker_idx]) * 0.5, 12, ' ', wptr);
7712 	    }
7713 	    if (model_adapt_nst) {
7714 	      wptr = memseta(wptr, 32, 2);
7715 	      wptr = uint32toa_w10(perm_attempt_ct[marker_idx], wptr);
7716 	    } else {
7717 	      if (model_fisherx) {
7718 		// minimum p-value
7719 		dzz = (int32_t)(doublearr_greater_than(maxt_extreme_stat, perms_total, orig_pvals[marker_idx] * (1.0 + EPSILON)) + 1);
7720 	      } else {
7721 		// maximum chisq
7722 		dzz = (int32_t)(perms_total - doublearr_greater_than(maxt_extreme_stat, perms_total, orig_chisq[marker_idx] - EPSILON) + 1);
7723 	      }
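	      // dzz = 1 + the number of permutation extreme statistics at
	      // least as extreme as this marker's observed statistic (the
	      // EPSILON fudge absorbs floating-point ties); EMP2 is then
	      // dzz / (T + 1), or the raw count dzz - 1 under 'perm-count'.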
7724 	      if (!model_perm_count) {
7725 		wptr = dtoa_g_wxp4(dzz * dyy, 12, wptr);
7726 	      } else {
7727 		wptr = dtoa_g_wxp4(dzz - 1, 12, wptr);
7728 	      }
7729 	    }
7730 	  }
7731 	  wptr = memcpya(wptr, " \n", 2);
7732 	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
7733 	    goto model_assoc_ret_WRITE_FAIL;
7734 	  }
7735 	}
7736 	if (++marker_idx == marker_ct) {
7737 	  goto model_assoc_loop_end;
7738 	}
7739 	marker_uidx++;
7740         next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
7741       }
7742     }
7743   model_assoc_loop_end:
7744     if (fclose_null(&outfile)) {
7745       goto model_assoc_ret_WRITE_FAIL;
7746     }
7747     LOGPRINTFWW("Permutation test report written to %s .\n", outname);
7748   }
7749 
7750   while (0) {
7751   model_assoc_ret_NOMEM:
7752     retval = RET_NOMEM;
7753     break;
7754   model_assoc_ret_OPEN_FAIL:
7755     retval = RET_OPEN_FAIL;
7756     break;
7757   model_assoc_ret_READ_FAIL:
7758     retval = RET_READ_FAIL;
7759     break;
7760   model_assoc_ret_WRITE_FAIL:
7761     retval = RET_WRITE_FAIL;
7762     break;
7763   model_assoc_ret_INVALID_CMDLINE:
7764     retval = RET_INVALID_CMDLINE;
7765     break;
7766   model_assoc_ret_THREAD_CREATE_FAIL:
7767     retval = RET_THREAD_CREATE_FAIL;
7768     break;
7769   }
7770  model_assoc_ret_1:
7771   bigstack_reset(bigstack_mark);
7772   fclose_cond(outfile);
7773   fclose_cond(outfile_msa);
7774   return retval;
7775 }
7776 
7777 int32_t qassoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, uintptr_t* marker_exclude_mid, uintptr_t marker_ct_mid, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sex_male, Aperm_info* apip, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* founder_pnm, uintptr_t* sample_include2, uintptr_t* sample_male_include2, uint32_t ld_ignore_x, uint32_t hh_exists, uint32_t hh_or_mt_exists, uint32_t perm_batch_size, Set_info* sip, uint32_t* tcnt, uintptr_t* loadbuf_raw) {
7778   // Similar to glm_linear_assoc_set_test().
7779   // Side effect: t-statistics in g_orig_chisq[] are clobbered and replaced
7780   // with same-p-value 1df chi-square statistics.
7781   unsigned char* bigstack_mark = g_bigstack_base;
7782   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
7783   uintptr_t* marker_exclude = marker_exclude_mid;
7784   uintptr_t* unstopped_markers = nullptr;
7785   uintptr_t* loadbuf = g_loadbuf;
7786   uintptr_t* perm_adapt_set_unstopped = nullptr;
7787   uintptr_t* regression_skip = nullptr;
7788   double* orig_stats = g_orig_chisq; // initially contains t-statistics
7789   double* sorted_chisq_buf = nullptr;
7790   uint32_t* marker_idx_to_uidx = nullptr;
7791   uint32_t* sorted_marker_idx_buf = nullptr;
7792   uint32_t* proxy_arr = nullptr;
7793   uint32_t* perm_2success_ct = nullptr;
7794   uint32_t* perm_attempt_ct = nullptr;
7795   uintptr_t marker_ct = marker_ct_mid;
7796   uintptr_t set_ct = 0;
7797   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
7798   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
7799   double adaptive_ci_zt = 0.0;
7800   uint32_t max_thread_ct = g_thread_ct;
7801   uint32_t perm_count = model_modifier & MODEL_PERM_COUNT;
7802   uint32_t perms_done = 0;
7803   int32_t retval = 0;
7804   unsigned char* bigstack_mark2;
7805   uintptr_t* set_incl;
7806   uintptr_t* loadbuf_ptr;
7807   double* orig_set_scores;
7808   double* chisq_pmajor;
7809   double* read_dptr;
7810   double* write_dptr;
7811   uint32_t** setdefs;
7812   uint32_t** ld_map;
7813   uintptr_t marker_ctl;
7814   uintptr_t marker_midx;
7815   uintptr_t set_idx;
7816   uintptr_t perm_vec_ct;
7817   uintptr_t perm_vec_ctcl8m;
7818   uintptr_t pidx;
7819   uintptr_t ulii;
7820   double chisq_threshold;
7821   double dxx;
7822   double dyy;
7823   uint32_t perms_total;
7824   uint32_t max_sigset_size;
7825   uint32_t marker_unstopped_ct;
7826   uint32_t is_last_block;
7827   uint32_t chrom_fo_idx;
7828   uint32_t chrom_end;
7829   uint32_t block_size;
7830   uint32_t block_end;
7831   uint32_t first_adapt_check;
7832   uint32_t marker_uidx;
7833   uint32_t marker_idx;
7834   uint32_t marker_idx2;
7835   uint32_t marker_bidx;
7836   uint32_t skip_ct;
7837   uint32_t uii;
7838   if (sip->set_test_lambda > 1.0) {
7839     dxx = 1.0 / sip->set_test_lambda;
7840   } else {
7841     dxx = 1.0;
7842   }
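  // Convert each marker's t-statistic into a same-p-value 1df chi-square
  // (p-value from calc_tprob, then inverse_chiprob), scaled by 1/lambda when
  // a set-test genomic-control lambda > 1 was supplied; p-values that
  // underflow to zero are clamped via MAX_INVERSE_CHIPROB_1DF.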
7843   for (marker_midx = 0; marker_midx < marker_ct; marker_midx++) {
7844     dyy = calc_tprob(orig_stats[marker_midx], tcnt[marker_midx]);
7845     if (dyy == 0.0) {
7846 	      orig_stats[marker_midx] = MAX_INVERSE_CHIPROB_1DF * dxx;
7847     } else {
7848       orig_stats[marker_midx] = inverse_chiprob(dyy, 1) * dxx;
7849     }
7850   }
7851   retval = set_test_common_init(threads, bedfile, bed_offset, outname, outname_end, unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_ids, max_marker_id_len, marker_reverse, orig_stats, sip, chrom_info_ptr, unfiltered_sample_ct, sex_male, founder_pnm, ld_ignore_x, hh_exists, "QT --assoc", &marker_ct, &marker_exclude, &set_incl, &marker_idx_to_uidx, &setdefs, &set_ct, &max_sigset_size, &ld_map, &chisq_threshold, &orig_set_scores, &sorted_chisq_buf, &sorted_marker_idx_buf, &proxy_arr, &perm_adapt_set_unstopped, &perm_2success_ct, &perm_attempt_ct, &unstopped_markers);
7852   if (retval) {
7853     goto qassoc_set_test_ret_1;
7854   }
7855   if (!set_ct) {
7856     goto qassoc_set_test_write;
7857   }
7858   marker_ctl = BITCT_TO_WORDCT(marker_ct);
7859   if (marker_ct_mid != marker_ct) {
7860     inplace_delta_collapse_arr((char*)tcnt, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
7861     inplace_delta_collapse_arr((char*)g_missing_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
7862     inplace_delta_collapse_arr((char*)g_het_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
7863     inplace_delta_collapse_arr((char*)g_homcom_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
7864   }
7865   if (bigstack_calloc_ul(marker_ctl, &regression_skip)) {
7866     goto qassoc_set_test_ret_NOMEM;
7867   }
7868   for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
7869     // nanal
7870     uii = tcnt[marker_idx] + 2;
7871     if ((uii == 2) || (g_homcom_cts[marker_idx] == uii) || (g_het_cts[marker_idx] == uii) || (g_het_cts[marker_idx] + g_homcom_cts[marker_idx] == 0)) {
7872       // 0 df or no genotype variation, regression always fails
7873       SET_BIT(marker_idx, regression_skip);
7874     }
7875   }
7876   if (model_modifier & MODEL_PERM) {
7877     perms_total = apip->max;
7878     first_adapt_check = (apip->min < apip->init_interval)? ((int32_t)apip->init_interval) : apip->min;
7879     adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)set_ct)));
7880   } else {
7881     perms_total = model_mperm_val;
7882     first_adapt_check = perms_total + 1;
7883   }
7884   for (uii = 0; uii < set_ct; uii++) {
7885     perm_attempt_ct[uii] = perms_total;
7886   }
7887   if (max_thread_ct > perms_total) {
7888     max_thread_ct = perms_total;
7889   }
7890   if (bigstack_init_sfmtp(max_thread_ct)) {
7891     goto qassoc_set_test_ret_NOMEM;
7892   }
7893 
7894   bigstack_mark2 = g_bigstack_base;
7895  qassoc_set_test_more_perms:
7896   bitvec_and(unstopped_markers, marker_ctl, regression_skip);
7897   bitvec_andnot(regression_skip, marker_ctl, unstopped_markers);
7898   skip_ct = popcount_longs(regression_skip, marker_ctl);
7899   marker_unstopped_ct = popcount_longs(unstopped_markers, marker_ctl);
7900 
7901   if (perms_done) {
7902     uii = apip->init_interval;
7903     while (first_adapt_check <= perms_done) {
7904       first_adapt_check += (int32_t)(uii + ((int32_t)first_adapt_check) * apip->interval_slope);
7905     }
7906   }
7907   perm_vec_ct = perm_batch_size;
7908   // possible todo: split first batch to reduce adaptive overshoot
7909   if (perm_vec_ct > perms_total - perms_done) {
7910     perm_vec_ct = perms_total - perms_done;
7911   }
7912   g_perm_vec_ct = perm_vec_ct;
7913   if (perm_vec_ct >= CACHELINE_INT32 * max_thread_ct) {
7914     g_perm_generation_thread_ct = max_thread_ct;
7915   } else {
7916     g_perm_generation_thread_ct = MAXV(perm_vec_ct / CACHELINE_INT32, 1);
7917   }
7918   perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
7919   if (bigstack_alloc_d(perm_vec_ctcl8m * pheno_nm_ct, &g_perm_vecstd) ||
7920       bigstack_calloc_d(perm_vec_ctcl8m * 3 * max_thread_ct, &g_thread_git_qbufs)) {
7921     goto qassoc_set_test_ret_NOMEM;
7922   }
7923 
7924   ulii = 0;
7925   if (!g_perm_cluster_ct) {
7926     if (spawn_threads(threads, &generate_qt_perms_smajor_thread, g_perm_generation_thread_ct)) {
7927       goto qassoc_set_test_ret_THREAD_CREATE_FAIL;
7928     }
7929     generate_qt_perms_smajor_thread((void*)ulii);
7930   } else {
7931     if (spawn_threads(threads, &generate_qt_cluster_perms_smajor_thread, g_perm_generation_thread_ct)) {
7932       goto qassoc_set_test_ret_THREAD_CREATE_FAIL;
7933     }
7934     generate_qt_cluster_perms_smajor_thread((void*)ulii);
7935   }
7936   join_threads(threads, g_perm_generation_thread_ct);
7937   if (bigstack_alloc_d(MODEL_BLOCKSIZE * perm_vec_ct, &g_mperm_save_all) ||
7938       bigstack_alloc_d(marker_ct * perm_vec_ct, &chisq_pmajor)) {
7939     goto qassoc_set_test_ret_NOMEM;
7940   }
7941   for (pidx = 0; pidx < perm_vec_ct; pidx++) {
7942     write_dptr = &(chisq_pmajor[pidx * marker_ct]);
7943     for (marker_idx = 0, marker_idx2 = 0; marker_idx < skip_ct; marker_idx++, marker_idx2++) {
7944       next_set_unsafe_ck(regression_skip, &marker_idx2);
7945       write_dptr[marker_idx2] = -9;
7946     }
7947   }
7948   chrom_fo_idx = 0xffffffffU;
7949   marker_uidx = next_unset_unsafe(marker_exclude, 0);
7950   if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
7951     goto qassoc_set_test_ret_READ_FAIL;
7952   }
7953   marker_idx = 0;
7954   marker_idx2 = 0;
7955   chrom_end = 0;
7956   do {
7957     if (marker_uidx >= chrom_end) {
7958       // exploit overflow
7959       chrom_fo_idx++;
7960       refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &g_is_x, &g_is_y, &uii, &g_min_ploidy_1);
7961       g_min_ploidy_1 |= uii; // treat MT as haploid
7962     }
7963     block_size = 0;
7964     block_end = marker_unstopped_ct - marker_idx;
7965     if (block_end > MODEL_BLOCKSIZE) {
7966       block_end = MODEL_BLOCKSIZE;
7967     }
7968     do {
7969       if (!IS_SET(unstopped_markers, marker_idx2)) {
7970         do {
7971 	  marker_uidx++;
7972 	  next_unset_unsafe_ck(marker_exclude, &marker_uidx);
7973 	  marker_idx2++;
7974         } while ((marker_uidx < chrom_end) && (!IS_SET(unstopped_markers, marker_idx2)));
7975 	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
7976 	  goto qassoc_set_test_ret_READ_FAIL;
7977 	}
7978 	if (marker_uidx >= chrom_end) {
7979 	  break;
7980 	}
7981       }
7982       loadbuf_ptr = &(loadbuf[block_size * pheno_nm_ctv2]);
7983       if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
7984 	goto qassoc_set_test_ret_READ_FAIL;
7985       }
7986       if (g_min_ploidy_1 && hh_or_mt_exists) {
7987 	haploid_fix(hh_or_mt_exists, sample_include2, sample_male_include2, pheno_nm_ct, g_is_x, g_is_y, (unsigned char*)loadbuf_ptr);
7988       }
7989       g_adapt_m_table[block_size] = marker_idx2++;
7990       block_size++;
7991       if (marker_idx + block_size == marker_unstopped_ct) {
7992 	break;
7993       }
7994       marker_uidx++;
7995       if (IS_SET(marker_exclude, marker_uidx)) {
7996 	marker_uidx = next_unset_unsafe(marker_exclude, marker_uidx);
7997 	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
7998 	  goto qassoc_set_test_ret_READ_FAIL;
7999 	}
8000       }
8001     } while ((block_size < block_end) && (marker_uidx < chrom_end));
8002     if (!block_size) {
8003       continue;
8004     }
8005     is_last_block = (marker_idx + block_size >= marker_unstopped_ct);
8006     g_block_diff = block_size;
8007     ulii = 0;
8008     if (spawn_threads2(threads, &qassoc_set_thread, max_thread_ct, is_last_block)) {
8009       goto qassoc_set_test_ret_THREAD_CREATE_FAIL;
8010     }
8011     qassoc_set_thread((void*)ulii);
8012     join_threads2(threads, max_thread_ct, is_last_block);
8013 
8014     // convert to equivalent chi-square stats and transpose
8015     // (conversion has to be done here since dcdflib is not thread-safe)
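    // Permuted |t| statistics below the inverse_tprob(sip->set_p, df)
    // threshold cannot contribute to any set score, so they are stored as
    // the -9 sentinel instead of being converted.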
8016     read_dptr = g_mperm_save_all;
8017     for (marker_bidx = 0; marker_bidx < block_size; marker_bidx++) {
8018       uii = g_adapt_m_table[marker_bidx];
8019       write_dptr = &(chisq_pmajor[uii]);
8020       uii = tcnt[uii];
8021       dyy = inverse_tprob(sip->set_p, uii);
8022       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
8023 	dxx = *read_dptr++;
8024 	if (dxx < dyy) {
8025 	  dxx = -9;
8026 	} else {
8027 	  dxx = calc_tprob(dxx, uii);
8028 	  if (dxx == 0.0) {
8029 	    dxx = MAX_INVERSE_CHIPROB_1DF;
8030 	  } else {
8031 	    dxx = inverse_chiprob(dxx, 1);
8032 	  }
8033 	}
8034 	// this is cache-unfriendly, may want to update in-place instead and
8035 	// separate out the transpose
8036 	write_dptr[pidx * marker_ct] = dxx;
8037       }
8038     }
8039     marker_idx += block_size;
8040   } while (marker_idx < marker_unstopped_ct);
8041   perms_done += perm_vec_ct;
8042   compute_set_scores(marker_ct, perm_vec_ct, set_ct, chisq_pmajor, orig_set_scores, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr, setdefs, ld_map, apip, chisq_threshold, adaptive_ci_zt, first_adapt_check, perms_done, sip->set_max, perm_adapt_set_unstopped, perm_2success_ct, perm_attempt_ct);
8043   bigstack_reset(bigstack_mark2);
8044   if (perms_done < perms_total) {
8045     if (model_modifier & MODEL_PERM) {
8046       if (!extract_set_union(setdefs, set_ct, perm_adapt_set_unstopped, unstopped_markers, marker_ct)) {
8047 	perms_done = 0;
8048 	for (set_idx = 0; set_idx < set_ct; set_idx++) {
8049           if (perms_done < perm_attempt_ct[set_idx]) {
8050 	    perms_done = perm_attempt_ct[set_idx];
8051 	  }
8052 	}
8053 	goto qassoc_set_test_perms_done;
8054       }
8055     }
8056     printf("\r%u permutation%s complete.", perms_done, (perms_done != 1)? "s" : "");
8057     fflush(stdout);
8058     goto qassoc_set_test_more_perms;
8059   }
8060  qassoc_set_test_perms_done:
8061   putc_unlocked('\r', stdout);
8062   LOGPRINTF("%u permutation%s complete.\n", perms_done, (perms_done != 1)? "s" : "");
8063  qassoc_set_test_write:
8064   if (model_modifier & MODEL_PERM) {
8065     memcpy(outname_end, ".qassoc.set.perm", 17);
8066   } else {
8067     memcpy(outname_end, ".qassoc.set.mperm", 18);
8068   }
8069   retval = write_set_test_results(outname, &(outname_end[11]), sip, ld_map, setdefs, set_incl, set_ct, marker_ct_orig, marker_ct, marker_idx_to_uidx, marker_ids, max_marker_id_len, perm_2success_ct, perm_attempt_ct, mtest_adjust, perm_count, pfilter, output_min_p, chisq_threshold, orig_stats, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr);
8070   while (0) {
8071   qassoc_set_test_ret_NOMEM:
8072     retval = RET_NOMEM;
8073     break;
8074   qassoc_set_test_ret_READ_FAIL:
8075     retval = RET_READ_FAIL;
8076     break;
8077   qassoc_set_test_ret_THREAD_CREATE_FAIL:
8078     retval = RET_THREAD_CREATE_FAIL;
8079     break;
8080   }
8081  qassoc_set_test_ret_1:
8082   bigstack_reset(bigstack_mark);
8083   return retval;
8084 }
8085 
8086 int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, Aperm_info* apip, uint32_t mperm_save, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, double* pheno_d, uintptr_t* founder_info, uintptr_t* sex_male, uint32_t hh_exists, uint32_t ld_ignore_x, uint32_t perm_batch_size, Set_info* sip) {
8087   unsigned char* bigstack_mark = g_bigstack_base;
8088   uintptr_t marker_ct = marker_ct_orig;
8089   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
8090   uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
8091   uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
8092   uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
8093   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
8094   uintptr_t perm_vec_ctcl8m = 0;
8095   FILE* outfile = nullptr;
8096   FILE* outfile_qtm = nullptr;
8097   FILE* outfile_msa = nullptr;
8098   uint32_t is_set_test = model_modifier & MODEL_SET_TEST;
8099   uint32_t perm_adapt_nst = (model_modifier & MODEL_PERM) && (!is_set_test);
8100   uint32_t perm_maxt_nst = (model_modifier & MODEL_MPERM) && (!is_set_test);
8101   uint32_t do_perms = model_modifier & (MODEL_PERM | MODEL_MPERM);
8102   uint32_t do_perms_nst = do_perms && (!is_set_test);
8103   uint32_t qt_means = model_modifier & MODEL_QT_MEANS;
8104   uint32_t do_lin = model_modifier & MODEL_LIN;
8105   uint32_t qt_means_or_lin = qt_means || do_lin;
8106   uint32_t perm_count = model_modifier & MODEL_PERM_COUNT;
8107   uint32_t fill_orig_chiabs = do_perms || mtest_adjust;
8108   uint32_t perms_total = 0;
8109   uint32_t pct = 0;
8110   uint32_t max_thread_ct = g_thread_ct;
8111   uint32_t perm_pass_idx = 0;
8112   uint32_t mt_exists = (chrom_info_ptr->xymt_codes[MT_OFFSET] != -2) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->xymt_codes[MT_OFFSET]);
8113   uint32_t hh_or_mt_exists = hh_exists | (mt_exists * NXMHH_EXISTS);
8114   int32_t retval = 0;
8115   double x11 = 0;
8116   double x12 = 0;
8117   double x22 = 0;
8118   uintptr_t* marker_exclude = marker_exclude_orig;
8119   uintptr_t* founder_pnm = nullptr;
8120   uintptr_t* sample_male_include2 = nullptr;
8121   uint32_t* tcnt = nullptr;
8122   char* chrom_name_ptr = nullptr;
8123   uint32_t chrom_name_len = 0;
8124   char chrom_name_buf[5];
8125   uint32_t mu_table[MODEL_BLOCKSIZE];
8126   char numbuf[16]; // ' -1.23456e-200\0' fits, barely
8127   char spacebuf[8];
8128   char* outname_end2;
8129   char* wptr_start;
8130   char* wptr;
8131   char* wptr_restart;
8132   uintptr_t* loadbuf_raw;
8133   uintptr_t* loadbuf_ptr;
8134   uintptr_t* lbptr2;
8135   uintptr_t* sample_include2;
8136   double* ooptr;
8137   double* dptr;
8138   double* dptr2;
8139   double* dptr3;
8140   uint32_t* marker_idx_to_uidx;
8141   uint32_t marker_unstopped_ct;
8142   uint32_t chrom_fo_idx;
8143   uint32_t chrom_end;
8144   uint32_t block_size;
8145   uint32_t block_end;
8146   uint32_t marker_bidx;
8147   uintptr_t marker_uidx; // loading
8148   uintptr_t marker_uidx2; // writing
8149   uintptr_t marker_idx;
8150   uintptr_t marker_idx2;
8151   uintptr_t sample_uidx;
8152   uintptr_t sample_uidx_stop;
8153   uintptr_t sample_idx;
8154   uintptr_t ulii;
8155   intptr_t geno_sum;
8156   intptr_t nanal;
8157   intptr_t geno_ssq;
8158   double nanal_recip;
8159   double qt_sum;
8160   double qt_ssq;
8161   double qt_g_prod;
8162   double qt_g_prod_centered;
8163   double qt_mean;
8164   double geno_mean;
8165   double qt_var;
8166   double geno_var;
8167   double qt_g_covar;
8168   double beta;
8169   double vbeta_sqrt;
8170   double tstat;
8171   double tp;
8172   double rsq;
8173   double qt_het_sum;
8174   double qt_het_ssq;
8175   double qt_homrar_sum;
8176   double qt_homrar_ssq;
8177   double qt_homcom_sum;
8178   double dxx;
8179   double dyy;
8180   double dzz;
8181   double pval;
8182   uint32_t homrar_ct;
8183   uint32_t missing_ct;
8184   uint32_t het_ct;
8185   uint32_t homcom_ct;
8186   uint32_t is_last_block;
8187   uint32_t loop_end;
8188   uint32_t uii;
8189   uint32_t ujj;
8190   uint32_t ukk;
8191   char* a1ptr;
8192   char* a2ptr;
8193   if (pheno_nm_ct < 2) {
8194     logerrprint("Warning: Skipping QT --assoc since fewer than two phenotypes are present.\n");
8195     goto qassoc_ret_1;
8196   }
8197   if (is_set_test) {
8198     if (bigstack_alloc_ul(unfiltered_sample_ctl, &founder_pnm)) {
8199       goto qassoc_ret_NOMEM;
8200     }
8201     memcpy(founder_pnm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
8202     bitvec_and(founder_info, unfiltered_sample_ctl, founder_pnm);
8203     if (extract_set_union_unfiltered(sip, nullptr, unfiltered_marker_ct, marker_exclude_orig, &marker_exclude, &marker_ct)) {
8204       goto qassoc_ret_NOMEM;
8205     }
8206   }
8207   memset(spacebuf, 32, 8);
8208   g_perm_pheno_nm_ct = pheno_nm_ct;
8209   g_perms_done = 0;
8210   g_mperm_save_all = nullptr;
8211   numbuf[0] = ' ';
8212   if (perm_maxt_nst) {
8213     perms_total = model_mperm_val;
8214     // square of t-stat
8215     if (bigstack_calloc_d(perms_total, &g_maxt_extreme_stat)) {
8216       goto qassoc_ret_NOMEM;
8217     }
8218     g_ldrefs = (uint16_t*)bigstack_alloc(marker_ct * sizeof(int16_t));
8219     if (!g_ldrefs) {
8220       goto qassoc_ret_NOMEM;
8221     }
8222 #ifdef __LP64__
8223     fill_ulong_one((marker_ct + 3) / 4, (uintptr_t*)g_ldrefs);
8224 #else
8225     fill_ulong_one((marker_ct + 1) / 2, (uintptr_t*)g_ldrefs);
8226 #endif
8227     if (mperm_save & MPERM_DUMP_ALL) {
8228       memcpy(outname_end, ".mperm.dump.all", 16);
8229       if (fopen_checked(outname, "w", &outfile_msa)) {
8230 	goto qassoc_ret_OPEN_FAIL;
8231       }
8232       if (putc_checked('0', outfile_msa)) {
8233 	goto qassoc_ret_WRITE_FAIL;
8234       }
8235       LOGPRINTFWW("Dumping all permutation squared %sstats to %s .\n", do_lin? "Lin " : "Wald t-", outname);
8236     }
8237   } else {
8238     mperm_save = 0;
8239     if (perm_adapt_nst) {
8240       g_aperm_alpha = apip->alpha;
8241       perms_total = apip->max;
8242       if (bigstack_alloc_ui(marker_ct, &g_perm_attempt_ct) ||
8243 	  bigstack_calloc_uc(round_up_pow2(marker_ct, BYTECT), &g_perm_adapt_stop)) {
8244 	goto qassoc_ret_NOMEM;
8245       }
8246       ujj = apip->max;
8247       for (uii = 0; uii < marker_ct; uii++) {
8248 	g_perm_attempt_ct[uii] = ujj;
8249       }
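           // ltqnorm() is the inverse standard normal CDF; this is the z-score
           // half-width for the adaptive permutation pruning confidence interval,
           // with beta split across two tails and spread over all markers
           // (Bonferroni-style).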
8250       g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
8251       if (apip->min < apip->init_interval) {
8252 	g_first_adapt_check = (int32_t)(apip->init_interval);
8253       } else {
8254 	g_first_adapt_check = apip->min;
8255       }
8256       g_adaptive_intercept = apip->init_interval;
8257       g_adaptive_slope = apip->interval_slope;
8258     }
8259   }
8260   outname_end2 = memcpyb(outname_end, ".qassoc", 8);
8261   if (bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf_raw)) {
8262     goto qassoc_ret_NOMEM;
8263   }
8264   loadbuf_raw[unfiltered_sample_ctv2 - 2] = 0;
8265   loadbuf_raw[unfiltered_sample_ctv2 - 1] = 0;
8266   if (fill_orig_chiabs) {
8267     if (bigstack_alloc_d(marker_ct, &g_orig_chisq)) {
8268       goto qassoc_ret_NOMEM;
8269     }
8270     if (mtest_adjust || is_set_test) {
8271       if (bigstack_alloc_ui(marker_ct, &tcnt)) {
8272 	goto qassoc_ret_NOMEM;
8273       }
8274     }
8275   }
8276   if (fopen_checked(outname, "w", &outfile)) {
8277     goto qassoc_ret_OPEN_FAIL;
8278   }
8279   if (qt_means) {
8280     memcpy(outname_end2, ".means", 7);
8281     if (fopen_checked(outname, "w", &outfile_qtm)) {
8282       goto qassoc_ret_OPEN_FAIL;
8283     }
8284     sprintf(g_textbuf, " CHR %%%us  VALUE      G11      G12      G22\n", plink_maxsnp);
8285     fprintf(outfile_qtm, g_textbuf, "SNP");
8286     *outname_end2 = '\0';
8287   }
8288   if (haploid_chrom_present(chrom_info_ptr) || mt_exists) {
8289     logerrprint("Warning: QT --assoc doesn't handle X/Y/MT/haploid variants normally (try\n--linear).\n");
8290   }
8291   LOGPRINTFWW5("Writing QT --assoc report to %s ... ", outname);
8292   fflush(stdout);
8293   sprintf(g_textbuf, " CHR %%%us         BP    NMISS       BETA         SE         R2        T            P ", plink_maxsnp);
8294   fprintf(outfile, g_textbuf, "SNP");
8295   if (do_lin) {
8296     fputs("         LIN        LIN_P ", outfile);
8297   }
8298   if (putc_checked('\n', outfile)) {
8299     goto qassoc_ret_WRITE_FAIL;
8300   }
8301   if (do_perms) {
8302     if (model_modifier & MODEL_PERM) {
8303       if (perm_batch_size > apip->max) {
8304 	perm_batch_size = apip->max;
8305       }
8306     } else {
8307       if (perm_batch_size > model_mperm_val) {
8308 	perm_batch_size = model_mperm_val;
8309       }
8310     }
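         // Cap the worker thread count so that each thread is responsible for at
         // least one cache line's worth (CACHELINE_DBL doubles) of permutation
         // columns.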
8311     uii = MINV(perm_batch_size, perms_total) / CACHELINE_DBL;
8312     if (max_thread_ct > uii) {
8313       max_thread_ct = MAXV(uii, 1);
8314     }
8315     if (cluster_starts) {
8316       retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, nullptr, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, nullptr, nullptr);
8317       if (retval) {
8318 	goto qassoc_ret_1;
8319       }
8320       if (!g_perm_cluster_ct) {
8321         logerrprint("Error: No size 2+ clusters for permutation test.\n");
8322         goto qassoc_ret_INVALID_CMDLINE;
8323       }
8324       if (bigstack_alloc_ui(pheno_nm_ct, &g_perm_sample_to_cluster) ||
8325           bigstack_alloc_ui(max_thread_ct * round_up_pow2(g_perm_cluster_ct, CACHELINE_INT32), &g_perm_qt_cluster_thread_wkspace)) {
8326 	goto qassoc_ret_NOMEM;
8327       }
8328       fill_unfiltered_sample_to_cluster(pheno_nm_ct, g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, g_perm_sample_to_cluster);
8329     }
8330     if (bigstack_alloc_ui(marker_ct, &g_missing_cts) ||
8331 	bigstack_alloc_ui(marker_ct, &g_het_cts) ||
8332 	bigstack_alloc_ui(marker_ct, &g_homcom_cts)) {
8333       goto qassoc_ret_NOMEM;
8334     }
8335     if (!is_set_test) {
8336       if (bigstack_init_sfmtp(max_thread_ct)) {
8337 	goto qassoc_ret_NOMEM;
8338       }
8339       if (bigstack_calloc_ui(marker_ct, &g_perm_2success_ct)) {
8340 	goto qassoc_ret_NOMEM;
8341       }
8342     }
8343   }
8344   if (do_lin) {
8345     if (bigstack_alloc_d(marker_ct, &g_orig_linsq)) {
8346       goto qassoc_ret_NOMEM;
8347     }
8348   }
8349   if (bigstack_alloc_ul(MODEL_BLOCKSIZE * pheno_nm_ctv2, &g_loadbuf) ||
8350       bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx) ||
8351       bigstack_alloc_ul(pheno_nm_ctv2, &sample_include2)) {
8352     goto qassoc_ret_NOMEM;
8353   }
8354   fill_quatervec_55(pheno_nm_ct, sample_include2);
8355   if (alloc_collapsed_haploid_filters(pheno_nm, sex_male, unfiltered_sample_ct, pheno_nm_ct, hh_or_mt_exists, 1, &sample_include2, &sample_male_include2)) {
8356     goto qassoc_ret_NOMEM;
8357   }
8358   marker_unstopped_ct = marker_ct;
8359   if (bigstack_alloc_d(pheno_nm_ct, &g_perm_pheno_d2)) {
8360     goto qassoc_ret_NOMEM;
8361   }
8362   g_pheno_sum = 0;
8363   g_pheno_ssq = 0;
8364   sample_uidx = 0;
8365   sample_idx = 0;
8366   dptr = g_perm_pheno_d2;
8367   do {
8368     sample_uidx = next_set_ul_unsafe(pheno_nm, sample_uidx);
8369     sample_uidx_stop = next_unset_ul(pheno_nm, sample_uidx, unfiltered_sample_ct);
8370     sample_idx += sample_uidx_stop - sample_uidx;
8371     dptr2 = &(pheno_d[sample_uidx]);
8372     sample_uidx = sample_uidx_stop;
8373     dptr3 = &(pheno_d[sample_uidx_stop]);
8374     do {
8375       dxx = *dptr2++;
8376       *dptr++ = dxx;
8377       g_pheno_sum += dxx;
8378       g_pheno_ssq += dxx * dxx;
8379     } while (dptr2 < dptr3);
8380   } while (sample_idx < pheno_nm_ct);
8381   fputs("0%", stdout);
8382   fflush(stdout);
8383 
8384   // ----- begin main loop -----
8385  qassoc_more_perms:
8386   if (do_perms_nst) {
8387     if (perm_adapt_nst && perm_pass_idx) {
8388       while (g_first_adapt_check <= g_perms_done) {
8389 	// APERM_MAX prevents infinite loop here
8390 	g_first_adapt_check += (int32_t)(apip->init_interval + ((int32_t)g_first_adapt_check) * apip->interval_slope);
8391       }
8392     }
8393     // g_perm_vec_ct memory allocation dependencies:
8394     //   g_maxt_thread_results: (8 * perm_vec_ct, CL-aligned) * thread_ct
8395     //   g_perm_vecstd: (8 * perm_vec_ct, CL-aligned) * pheno_nm_ct
8396     //   g_mperm_save_all (if needed): marker_ct * 8 * perm_vec_ct
8397     //   adaptive, Wald:
8398     //     g_thread_git_qbufs: (8 * perm_vec_ct, CL-aligned) * 3 * thread_ct
8399     //   adaptive, Lin:
8400     //     g_thread_git_qbufs: (8 * perm_vec_ct, CL-aligned) * 6 * thread_ct
8401     //   max(T), Wald:
8402     //     g_qresultbuf: MODEL_BLOCKSIZE * (8 * perm_vec_ct, CL-aligned) * 3
8403     //   max(T), Lin:
8404     //     g_qresultbuf: MODEL_BLOCKSIZE * (8 * perm_vec_ct, CL-aligned) * 6
8405     g_perm_vec_ct = perm_batch_size;
8406     if (g_perm_vec_ct > perms_total - g_perms_done) {
8407       g_perm_vec_ct = perms_total - g_perms_done;
8408     }
8409     perm_vec_ctcl8m = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
8410     if (bigstack_alloc_d(perm_vec_ctcl8m * pheno_nm_ct, &g_perm_vecstd)) {
8411       goto qassoc_ret_NOMEM;
8412     }
8413     ulii = do_lin? 6 : 3;
8414     if (perm_maxt_nst) {
8415       if (bigstack_alloc_d(max_thread_ct * perm_vec_ctcl8m, &g_maxt_thread_results) ||
8416 	  bigstack_alloc_d(ulii * MODEL_BLOCKSIZE * perm_vec_ctcl8m, &g_qresultbuf)) {
8417 	goto qassoc_ret_NOMEM;
8418       }
8419       if (mperm_save & MPERM_DUMP_ALL) {
8420 	if (bigstack_alloc_d(marker_ct * g_perm_vec_ct, &g_mperm_save_all)) {
8421 	  goto qassoc_ret_NOMEM;
8422 	}
8423       }
8424     } else {
8425       if (bigstack_calloc_d(perm_vec_ctcl8m * ulii * max_thread_ct, &g_thread_git_qbufs)) {
8426 	goto qassoc_ret_NOMEM;
8427       }
8428     }
8429     g_perms_done += g_perm_vec_ct;
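         // Use fewer permutation-generation threads when the batch is small, so
         // each thread still owns at least CACHELINE_DBL permutations; presumably
         // this keeps adjacent threads from writing to the same cache lines of
         // g_perm_vecstd.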
8430     if (g_perm_vec_ct >= CACHELINE_DBL * max_thread_ct) {
8431       g_perm_generation_thread_ct = max_thread_ct;
8432     } else {
8433       g_perm_generation_thread_ct = MAXV(g_perm_vec_ct / CACHELINE_DBL, 1);
8434     }
8435     ulii = 0;
8436     if (!cluster_starts) {
8437       if (spawn_threads(threads, &generate_qt_perms_smajor_thread, g_perm_generation_thread_ct)) {
8438 	goto qassoc_ret_THREAD_CREATE_FAIL;
8439       }
8440       generate_qt_perms_smajor_thread((void*)ulii);
8441     } else {
8442       if (spawn_threads(threads, &generate_qt_cluster_perms_smajor_thread, g_perm_generation_thread_ct)) {
8443 	goto qassoc_ret_THREAD_CREATE_FAIL;
8444       }
8445       generate_qt_cluster_perms_smajor_thread((void*)ulii);
8446     }
8447     join_threads(threads, g_perm_generation_thread_ct);
8448     g_assoc_thread_ct = max_thread_ct;
8449   }
8450   chrom_fo_idx = 0xffffffffU;
8451   marker_uidx = next_unset_unsafe(marker_exclude, 0);
8452   if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
8453     goto qassoc_ret_READ_FAIL;
8454   }
8455   marker_idx = 0;
8456   marker_idx2 = 0;
8457   chrom_end = 0;
8458   loop_end = marker_ct / 100;
8459   do {
8460     if (marker_uidx >= chrom_end) {
8461       g_qblock_start = 0;
8462       // exploit unsigned wraparound: chrom_fo_idx starts at 0xffffffffU, so the first increment yields 0
8463       chrom_fo_idx++;
8464       refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &g_is_x, &g_is_y, &uii, &g_min_ploidy_1);
8465       g_min_ploidy_1 |= uii; // treat MT as haploid
8466       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
8467       chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, uii, &chrom_name_len, chrom_name_buf);
8468     } else if (perm_maxt_nst) {
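           // max(T): carry the last MODEL_BLOCKKEEP markers (and their
           // per-permutation results) over from the previous block, so the
           // permutation threads can keep using them as LD references.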
8469       marker_idx -= MODEL_BLOCKKEEP;
8470       memcpy(g_loadbuf, &(g_loadbuf[(MODEL_BLOCKSIZE - MODEL_BLOCKKEEP) * pheno_nm_ctv2]), MODEL_BLOCKKEEP * pheno_nm_ctv2 * sizeof(intptr_t));
8471       if (!do_lin) {
8472 	memcpy(g_qresultbuf, &(g_qresultbuf[3 * (MODEL_BLOCKSIZE - MODEL_BLOCKKEEP) * perm_vec_ctcl8m]), MODEL_BLOCKKEEP * perm_vec_ctcl8m * 3 * sizeof(double));
8473       } else {
8474 	memcpy(g_qresultbuf, &(g_qresultbuf[6 * (MODEL_BLOCKSIZE - MODEL_BLOCKKEEP) * perm_vec_ctcl8m]), MODEL_BLOCKKEEP * perm_vec_ctcl8m * 6 * sizeof(double));
8475       }
8476       g_qblock_start = MODEL_BLOCKKEEP;
8477     } else {
8478       g_qblock_start = 0;
8479     }
8480     block_size = g_qblock_start;
8481     block_end = marker_unstopped_ct - marker_idx;
8482     if (block_end > MODEL_BLOCKSIZE) {
8483       block_end = MODEL_BLOCKSIZE;
8484     }
8485     do {
8486       if (perm_adapt_nst && g_perm_adapt_stop[marker_idx2]) {
8487 	do {
8488 	  marker_uidx++;
8489 	  next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
8490 	  marker_idx2++;
8491 	} while ((marker_uidx < chrom_end) && g_perm_adapt_stop[marker_idx2]);
8492 	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
8493 	  goto qassoc_ret_READ_FAIL;
8494 	}
8495 	if (marker_uidx >= chrom_end) {
8496 	  break;
8497 	}
8498       }
8499       loadbuf_ptr = &(g_loadbuf[block_size * pheno_nm_ctv2]);
8500       if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
8501 	goto qassoc_ret_READ_FAIL;
8502       }
8503       if (g_min_ploidy_1 && hh_or_mt_exists) {
8504 	haploid_fix(hh_or_mt_exists, sample_include2, sample_male_include2, pheno_nm_ct, g_is_x, g_is_y, (unsigned char*)loadbuf_ptr);
8505       }
8506       if (perm_adapt_nst) {
8507 	g_adapt_m_table[block_size] = marker_idx2++;
8508       }
8509       mu_table[block_size++] = marker_uidx;
8510       if (marker_idx + block_size == marker_unstopped_ct) {
8511 	break;
8512       }
8513       marker_uidx++;
8514       if (IS_SET(marker_exclude, marker_uidx)) {
8515 	marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
8516 	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
8517 	  goto qassoc_ret_READ_FAIL;
8518 	}
8519       }
8520     } while ((block_size < block_end) && (marker_uidx < chrom_end));
8521     if (block_size == g_qblock_start) {
8522       continue;
8523     }
8524     if (!perm_pass_idx) {
8525       for (marker_bidx = g_qblock_start; marker_bidx < block_size; marker_bidx++) {
8526 	marker_uidx2 = mu_table[marker_bidx];
8527         marker_idx_to_uidx[marker_idx + marker_bidx] = marker_uidx2;
8528 	loadbuf_ptr = &(g_loadbuf[marker_bidx * pheno_nm_ctv2]);
8529 	genovec_3freq(loadbuf_ptr, sample_include2, pheno_nm_ctv2, &missing_ct, &het_ct, &homcom_ct);
8530 	nanal = pheno_nm_ct - missing_ct;
8531 	wptr = memcpya(g_textbuf, chrom_name_ptr, chrom_name_len);
8532 	*wptr++ = ' ';
8533         wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr);
8534 	*wptr++ = ' ';
8535 	wptr = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr);
8536 	wptr = uint32toa_w8x(nanal, ' ', wptr);
8537 	homrar_ct = nanal - het_ct - homcom_ct;
8538 	if (do_perms) {
8539 	  g_missing_cts[marker_idx + marker_bidx] = missing_ct;
8540 	  g_homcom_cts[marker_idx + marker_bidx] = homcom_ct;
8541 	  g_het_cts[marker_idx + marker_bidx] = het_ct;
8542 	}
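             // Genotype dosage = number of copies of the rarer allele, so over the
             // nonmissing samples: sum(g) = 2*homrar + het and
             // sum(g^2) = 4*homrar + het.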
8543 	geno_sum = 2 * homrar_ct + het_ct;
8544 	geno_ssq = 4 * homrar_ct + het_ct;
8545 	qt_sum = g_pheno_sum;
8546 	qt_g_prod = 0;
8547 	qt_ssq = g_pheno_ssq;
8548 	lbptr2 = loadbuf_ptr;
8549 	uii = 0;
8550 	qt_het_sum = 0;
8551 	qt_het_ssq = 0;
8552 	qt_homrar_sum = 0;
8553 	qt_homrar_ssq = 0;
8554 	do {
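               // Bitwise-invert the 2-bit genotypes so hom-common samples
               // (dosage 0) have no set bits and are skipped; after inversion
               // 01 = het, 11 = hom-rare, 10 = missing.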
8555 	  ulii = ~(*lbptr2++);
8556 	  if (uii + BITCT2 > pheno_nm_ct) {
8557 	    ulii &= (ONELU << ((pheno_nm_ct & (BITCT2 - 1)) * 2)) - ONELU;
8558 	  }
8559 	  while (ulii) {
8560 	    ujj = CTZLU(ulii) & (BITCT - 2);
8561 	    ukk = (ulii >> ujj) & 3;
8562 	    sample_idx = uii + (ujj / 2);
8563 	    dxx = g_perm_pheno_d2[sample_idx];
8564 	    if (ukk == 1) {
8565 	      qt_g_prod += dxx;
8566 	      if (qt_means_or_lin) {
8567 		qt_het_sum += dxx;
8568 		qt_het_ssq += dxx * dxx;
8569 	      }
8570 	    } else if (ukk == 3) {
8571 	      qt_g_prod += 2 * dxx;
8572 	      if (qt_means_or_lin) {
8573 		qt_homrar_sum += dxx;
8574 		qt_homrar_ssq += dxx * dxx;
8575 	      }
8576 	    } else {
8577 	      qt_sum -= dxx;
8578 	      qt_ssq -= dxx * dxx;
8579 	    }
8580 	    ulii &= ~((3 * ONELU) << ujj);
8581 	  }
8582 	  uii += BITCT2;
8583 	} while (uii < pheno_nm_ct);
8584 	nanal_recip = 1.0 / ((double)nanal);
8585 	qt_mean = qt_sum * nanal_recip;
8586 	geno_mean = ((double)geno_sum) * nanal_recip;
8587 	dxx = 1.0 / ((double)(nanal - 1));
8588 	qt_var = (qt_ssq - qt_sum * qt_mean) * dxx;
8589 	geno_var = (((double)geno_ssq) - geno_sum * geno_mean) * dxx;
8590 	qt_g_prod_centered = qt_g_prod - qt_sum * geno_mean;
8591 	qt_g_covar = qt_g_prod_centered * dxx;
8592 
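             // Simple linear regression of phenotype on genotype dosage:
             //   beta = cov(g,y) / var(g)
             //   SE(beta)^2 = (var(y)/var(g) - beta^2) / (nanal - 2)
             //   T = beta / SE(beta), referred to a t distribution with nanal - 2 df.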
8593 	dxx = 1.0 / geno_var;
8594 	beta = qt_g_covar * dxx;
8595 	vbeta_sqrt = sqrt((qt_var * dxx - beta * beta) / ((double)(nanal - 2)));
8596 	tstat = beta / vbeta_sqrt;
8597 	if (fill_orig_chiabs) {
8598 	  g_orig_chisq[marker_idx + marker_bidx] = tstat;
8599 	  if (tcnt) {
8600 	    tcnt[marker_idx + marker_bidx] = (nanal > 2)? (nanal - 2) : 0;
8601 	  }
8602 	}
8603 	if (do_lin) {
8604 	  // Square of Lin statistic:
8605 	  //   \frac{(\sum_{i=1}^nU_{ji})^2}{\sum_{i=1}^nU_{ji}^2}
8606 	  // where U_{ji} = (Y_i - \bar{Y_{\dot}})(X_{ji} - \bar{X_{j\dot}}),
8607 	  // Y_{\dot}s are phenotypes, and X_{\dot\dot}s are genotypes.
8608 	  //
8609 	  // We evaluate the denominator by separating the sum into three
8610 	  // components (one for each possible genotype value), each of which
8611 	  // can be computed from the partial sums/sums-of-squares we already
8612 	  // have.
8613 	  g_orig_linsq[marker_idx + marker_bidx] = qt_g_prod_centered * qt_g_prod_centered / (geno_mean * geno_mean * (qt_ssq - 2 * qt_sum + qt_mean * qt_sum) + (1 - 2 * geno_mean) * (qt_het_ssq - 2 * qt_het_sum * qt_mean + qt_mean * qt_mean * ((intptr_t)het_ct)) + (4 - 4 * geno_mean) * (qt_homrar_ssq - 2 * qt_homrar_sum * qt_mean + qt_mean * qt_mean * ((intptr_t)homrar_ct)));
8614 	}
8615 	if (nanal > 1) {
8616 	  tp = calc_tprob(tstat, nanal - 2);
8617 	  rsq = (qt_g_covar * qt_g_covar) / (qt_var * geno_var);
8618 	  if (mperm_save & MPERM_DUMP_ALL) {
8619 	    if (!do_lin) {
8620 	      if (tp >= 0) {
8621 		dtoa_gx(tstat * tstat, '\0', &(numbuf[1]));
8622 		fputs(numbuf, outfile_msa);
8623 	      } else {
8624 		fputs(" NA", outfile_msa);
8625 	      }
8626 	    } else {
8627 	      dxx = g_orig_linsq[marker_idx + marker_bidx];
8628 	      if ((nanal > 2) && realnum(dxx)) {
8629 		dtoa_gx(dxx, '\0', &(numbuf[1]));
8630 		fputs(numbuf, outfile_msa);
8631 	      } else {
8632 		fputs(" NA", outfile_msa);
8633 	      }
8634 	    }
8635 	  }
8636 	  if ((pfilter != 2.0) && ((tp > pfilter) || (tp == -9))) {
8637 	    continue;
8638 	  }
8639 	  if (!realnum(beta)) {
8640 	    wptr = memcpya(wptr, "        NA         NA         NA ", 33);
8641 	  } else {
8642 	    wptr = dtoa_g_wxp4x(beta, 10, ' ', wptr);
8643 	    wptr = dtoa_g_wxp4x(vbeta_sqrt, 10, ' ', wptr);
8644 	    wptr = dtoa_g_wxp4x(rsq, 10, ' ', wptr);
8645 	  }
8646 	  if (tp >= 0) {
8647 	    wptr = dtoa_g_wxp4x(tstat, 8, ' ', wptr);
8648 	    wptr = dtoa_g_wxp4(MAXV(tp, output_min_p), 12, wptr);
8649 	  } else {
8650 	    wptr = memcpya(wptr, "      NA           NA", 21);
8651 	  }
8652 	  if (do_lin && (nanal > 2)) {
8653 	    dxx = g_orig_linsq[marker_idx + marker_bidx];
8654 	    if (realnum(dxx)) {
8655 	      *wptr++ = ' ';
8656 	      dxx = sqrt(dxx);
8657 	      wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
8658 	      dxx = calc_tprob(dxx, nanal - 2);
8659 	      wptr = dtoa_g_wxp4(MAXV(dxx, output_min_p), 12, wptr);
8660 	    } else {
8661 	      wptr = memcpya(wptr, "           NA           NA", 26);
8662 	    }
8663 	  }
8664 	  wptr = memcpya(wptr, " \n", 2);
8665 	} else if (pfilter != 2.0) {
8666 	  continue;
8667 	} else {
8668 	  wptr = memcpya(wptr, "        NA         NA         NA       NA           NA ", 55);
8669 	  if (mperm_save & MPERM_DUMP_ALL) {
8670 	    fputs(" NA", outfile_msa);
8671 	  }
8672 	  if (do_lin) {
8673 	    wptr = memcpya(wptr, "          NA           NA ", 26);
8674 	  }
8675 	  *wptr++ = '\n';
8676 	}
8677 	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
8678 	  goto qassoc_ret_WRITE_FAIL;
8679 	}
8680 	if (qt_means) {
8681 	  wptr_restart = &(g_textbuf[2 + chrom_name_len + plink_maxsnp]);
8682 	  wptr = memcpya(wptr_restart, "  GENO ", 7);
8683 	  a1ptr = marker_allele_ptrs[2 * marker_uidx2];
8684 	  a2ptr = marker_allele_ptrs[2 * marker_uidx2 + 1];
8685 	  uii = strlen(a1ptr);
8686 	  ujj = strlen(a2ptr);
8687 	  if (uii < 4) {
8688 	    wptr = memseta(wptr, 32, 7 - 2 * uii);
8689 	  }
8690 	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
8691 	    goto qassoc_ret_WRITE_FAIL;
8692 	  }
8693 	  fputs(a1ptr, outfile_qtm);
8694 	  putc_unlocked('/', outfile_qtm);
8695 	  fputs(a1ptr, outfile_qtm);
8696 	  putc_unlocked(' ', outfile_qtm);
8697 	  if (uii + ujj < 7) {
8698 	    fwrite(spacebuf, 1, 7 - uii - ujj, outfile_qtm);
8699 	  }
8700 	  fputs(a1ptr, outfile_qtm);
8701 	  putc_unlocked('/', outfile_qtm);
8702 	  fputs(a2ptr, outfile_qtm);
8703 	  putc_unlocked(' ', outfile_qtm);
8704 	  if (ujj < 4) {
8705 	    fwrite(spacebuf, 1, 7 - 2 * ujj, outfile_qtm);
8706 	  }
8707           fputs(a2ptr, outfile_qtm);
8708 	  putc_unlocked('/', outfile_qtm);
8709           fputs(a2ptr, outfile_qtm);
8710 	  putc_unlocked('\n', outfile_qtm);
8711 	  wptr = memcpya(wptr_restart, "COUNTS ", 7);
8712 	  wptr = uint32toa_w8x(homrar_ct, ' ', wptr);
8713 	  wptr = uint32toa_w8x(het_ct, ' ', wptr);
8714 	  wptr = uint32toa_w8x(homcom_ct, '\n', wptr);
8715 	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
8716 	    goto qassoc_ret_WRITE_FAIL;
8717 	  }
8718 	  wptr = memcpya(wptr_restart, "  FREQ ", 7);
8719 	  wptr = dtoa_g_wxp4x(nanal_recip * ((intptr_t)homrar_ct), 8, ' ', wptr);
8720 	  wptr = dtoa_g_wxp4x(nanal_recip * ((intptr_t)het_ct), 8, ' ', wptr);
8721 	  wptr = dtoa_g_wxp4x(nanal_recip * ((intptr_t)homcom_ct), 8, '\n', wptr);
8722 	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
8723 	    goto qassoc_ret_WRITE_FAIL;
8724 	  }
8725 	  wptr = memcpya(wptr_restart, "  MEAN ", 7);
8726 	  qt_homcom_sum = qt_sum - qt_homrar_sum - qt_het_sum;
8727 	  if (homrar_ct) {
8728 	    x11 = qt_homrar_sum / ((double)homrar_ct);
8729 	    wptr = dtoa_g_wxp4(x11, 8, wptr);
8730 	  } else {
8731 	    wptr = memcpya(wptr, "      NA", 8);
8732 	  }
8733 	  *wptr++ = ' ';
8734 	  if (het_ct) {
8735 	    x12 = qt_het_sum / ((double)het_ct);
8736 	    wptr = dtoa_g_wxp4(x12, 8, wptr);
8737 	  } else {
8738 	    wptr = memcpya(wptr, "      NA", 8);
8739 	  }
8740 	  *wptr++ = ' ';
8741 	  if (homcom_ct) {
8742 	    x22 = qt_homcom_sum / ((double)homcom_ct);
8743 	    wptr = dtoa_g_wxp4(x22, 8, wptr);
8744 	  } else {
8745 	    wptr = memcpya(wptr, "      NA", 8);
8746 	  }
8747 	  *wptr++ = '\n';
8748 	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
8749 	    goto qassoc_ret_WRITE_FAIL;
8750 	  }
8751 	  wptr = memcpya(wptr_restart, "    SD ", 7);
8752 	  if (homrar_ct > 1) {
8753             dxx = sqrt((qt_homrar_ssq - qt_homrar_sum * x11) / ((double)((intptr_t)homrar_ct - 1)));
8754 	    wptr = dtoa_g_wxp4(dxx, 8, wptr);
8755 	  } else if (homrar_ct == 1) {
8756 	    wptr = memcpya(wptr, "       0", 8);
8757 	  } else {
8758 	    wptr = memcpya(wptr, "      NA", 8);
8759 	  }
8760 	  *wptr++ = ' ';
8761 	  if (het_ct > 1) {
8762             dxx = sqrt((qt_het_ssq - qt_het_sum * x12) / ((double)((intptr_t)het_ct - 1)));
8763 	    wptr = dtoa_g_wxp4(dxx, 8, wptr);
8764 	  } else if (het_ct == 1) {
8765 	    wptr = memcpya(wptr, "       0", 8);
8766 	  } else {
8767 	    wptr = memcpya(wptr, "      NA", 8);
8768 	  }
8769 	  *wptr++ = ' ';
8770 	  if (homcom_ct > 1) {
8771             dxx = sqrt((qt_ssq - qt_het_ssq - qt_homrar_ssq - qt_homcom_sum * x22) / ((double)((intptr_t)homcom_ct - 1)));
8772 	    wptr = dtoa_g_wxp4(dxx, 8, wptr);
8773 	  } else if (homcom_ct == 1) {
8774 	    wptr = memcpya(wptr, "       0", 8);
8775 	  } else {
8776 	    wptr = memcpya(wptr, "      NA", 8);
8777 	  }
8778 	  *wptr++ = '\n';
8779 	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
8780 	    goto qassoc_ret_WRITE_FAIL;
8781 	  }
8782 	}
8783       }
8784     }
8785     if (do_perms_nst) {
8786       is_last_block = (marker_idx + block_size >= marker_unstopped_ct);
8787       g_block_diff = block_size - g_qblock_start;
8788       ulii = 0;
8789       if (perm_maxt_nst) {
8790 	g_maxt_block_base = marker_idx;
8791 	// don't actually use maxt_cur_extreme_stat here?...
8792 	if (!do_lin) {
8793 	  if (spawn_threads2(threads, &qassoc_maxt_thread, max_thread_ct, is_last_block)) {
8794 	    goto qassoc_ret_THREAD_CREATE_FAIL;
8795 	  }
8796 	  qassoc_maxt_thread((void*)ulii);
8797 	} else {
8798 	  if (spawn_threads2(threads, &qassoc_maxt_lin_thread, max_thread_ct, is_last_block)) {
8799 	    goto qassoc_ret_THREAD_CREATE_FAIL;
8800 	  }
8801 	  qassoc_maxt_lin_thread((void*)ulii);
8802 	}
8803         join_threads2(threads, max_thread_ct, is_last_block);
8804 	ukk = g_block_diff / CACHELINE_DBL;
8805 	if (ukk > max_thread_ct) {
8806 	  ukk = max_thread_ct;
8807 	} else if (!ukk) {
8808 	  ukk = 1;
8809 	}
8810 	ulii = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
8811 	for (uii = 0; uii < ukk; uii++) {
8812 	  ooptr = &(g_maxt_thread_results[uii * ulii]);
8813 	  for (ujj = g_perms_done - g_perm_vec_ct; ujj < g_perms_done; ujj++) {
8814 	    dxx = *ooptr++;
8815 	    if (dxx > g_maxt_extreme_stat[ujj]) {
8816 	      g_maxt_extreme_stat[ujj] = dxx;
8817 	    }
8818 	  }
8819 	}
8820       } else {
8821 	if (!do_lin) {
8822 	  if (spawn_threads2(threads, &qassoc_adapt_thread, max_thread_ct, is_last_block)) {
8823 	    goto qassoc_ret_THREAD_CREATE_FAIL;
8824 	  }
8825 	  qassoc_adapt_thread((void*)ulii);
8826 	} else {
8827 	  if (spawn_threads2(threads, &qassoc_adapt_lin_thread, max_thread_ct, is_last_block)) {
8828 	    goto qassoc_ret_THREAD_CREATE_FAIL;
8829 	  }
8830 	  qassoc_adapt_lin_thread((void*)ulii);
8831 	}
8832         join_threads2(threads, max_thread_ct, is_last_block);
8833       }
8834     }
8835     marker_idx += block_size;
8836     if ((!perm_pass_idx) && (marker_idx >= loop_end)) {
8837       if (marker_idx < marker_unstopped_ct) {
8838 	if (pct >= 10) {
8839 	  putc_unlocked('\b', stdout);
8840 	}
8841 	pct = (marker_idx * 100LLU) / marker_unstopped_ct;
8842 	printf("\b\b%u%%", pct);
8843 	fflush(stdout);
8844 	loop_end = (((uint64_t)pct + 1LLU) * marker_unstopped_ct) / 100;
8845       }
8846     }
8847   } while (marker_idx < marker_unstopped_ct);
8848   if (!perm_pass_idx) {
8849     if (pct >= 10) {
8850       putc_unlocked('\b', stdout);
8851     }
8852     fputs("\b\b", stdout);
8853     logprint("done.\n");
8854     if (qt_means) {
8855       LOGPRINTFWW("QT means report saved to %s.means .\n", outname);
8856       if (fclose_null(&outfile_qtm)) {
8857 	goto qassoc_ret_WRITE_FAIL;
8858       }
8859     }
8860     if (fclose_null(&outfile)) {
8861       goto qassoc_ret_WRITE_FAIL;
8862     }
8863     if (!is_set_test) {
8864       if (do_perms_nst) {
8865 	bigstack_reset(g_perm_vecstd);
8866       }
8867       if (mtest_adjust) {
8868 	if (do_lin) {
8869 	  for (uii = 0; uii < marker_ct; uii++) {
8870 	    g_orig_chisq[uii] = sqrt(g_orig_linsq[uii]);
8871 	  }
8872 	}
8873 	retval = multcomp(outname, outname_end, marker_idx_to_uidx, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, g_orig_chisq, pfilter, output_min_p, mtest_adjust, 0, adjust_lambda, tcnt, nullptr);
8874 	if (retval) {
8875 	  goto qassoc_ret_1;
8876 	}
8877       }
8878       if (mperm_save & MPERM_DUMP_ALL) {
8879 	if (putc_checked('\n', outfile_msa)) {
8880 	  goto qassoc_ret_WRITE_FAIL;
8881 	}
8882       }
8883     } else {
8884       retval = qassoc_set_test(threads, bedfile, bed_offset, outname, outname_end, model_modifier, model_mperm_val, pfilter, output_min_p, mtest_adjust, unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, sex_male, apip, pheno_nm_ct, pheno_nm, founder_pnm, sample_include2, sample_male_include2, ld_ignore_x, hh_exists, hh_or_mt_exists, perm_batch_size, sip, tcnt, loadbuf_raw);
8885       if (retval) {
8886 	goto qassoc_ret_1;
8887       }
8888     }
8889   }
8890   if (do_perms_nst) {
8891     if (mperm_save & MPERM_DUMP_ALL) {
8892       if (perm_pass_idx) {
8893 	putc_unlocked(' ', stdout);
8894       }
8895       fputs("[dumping stats]", stdout);
8896       fflush(stdout);
8897       ulii = g_perm_vec_ct;
8898       ujj = 1 + g_perms_done - ulii;
8899       wptr = g_textbuf;
8900       a1ptr = &(g_textbuf[MAXLINELEN]);
8901       for (uii = 0; uii < ulii; uii++) {
8902 	wptr = uint32toa(uii + ujj, wptr);
8903 	ooptr = &(g_mperm_save_all[uii]);
8904 	for (ukk = 0; ukk < marker_ct; ukk++) {
8905 	  *wptr++ = ' ';
8906 	  dxx = ooptr[ukk * ulii];
8907 	  if (dxx >= 0) {
8908 	    wptr = dtoa_g(dxx, wptr);
8909 	  } else {
8910 	    wptr = memcpya(wptr, "NA", 2);
8911 	  }
8912 	  if (wptr >= a1ptr) {
8913 	    if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
8914 	      goto qassoc_ret_WRITE_FAIL;
8915 	    }
8916 	    wptr = g_textbuf;
8917 	  }
8918 	}
8919 	*wptr++ = '\n';
8920       }
8921       if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
8922 	goto qassoc_ret_WRITE_FAIL;
8923       }
8924       fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b               ", stdout);
8925     }
8926     bigstack_reset(g_perm_vecstd);
8927     if (g_perms_done < perms_total) {
8928       if (perm_adapt_nst) {
8929 	marker_unstopped_ct = marker_ct - popcount01_longs((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
8930 	if (!marker_unstopped_ct) {
8931 	  goto qassoc_adapt_perm_count;
8932 	}
8933       }
8934       printf("\r%u permutation%s complete.", g_perms_done, (g_perms_done != 1)? "s" : "");
8935       fflush(stdout);
8936       perm_pass_idx++;
8937       goto qassoc_more_perms;
8938     }
8939     if (perm_adapt_nst) {
8940     qassoc_adapt_perm_count:
8941       g_perms_done = 0;
8942       for (uii = 0; uii < marker_ct; uii++) {
8943 	if (g_perm_attempt_ct[uii] > g_perms_done) {
8944 	  g_perms_done = g_perm_attempt_ct[uii];
8945 	  if (g_perms_done == perms_total) {
8946 	    break;
8947 	  }
8948 	}
8949       }
8950     }
8951     putc_unlocked('\r', stdout);
8952     LOGPRINTF("%u %s permutation%s complete.\n", g_perms_done, perm_maxt_nst? "max(T)" : "(adaptive)", (g_perms_done != 1)? "s" : "");
8953 
8954     if (perm_adapt_nst) {
8955       memcpy(outname_end2, ".perm", 6);
8956     } else {
8957       if (mperm_save & MPERM_DUMP_BEST) {
8958 	memcpy(outname_end, ".mperm.dump.best", 17);
8959 	LOGPRINTFWW("Dumping best permutation squared %sstats to %s .\n", do_lin? "Lin " : "Wald t-", outname);
8960 	if (fopen_checked(outname, "w", &outfile)) {
8961 	  goto qassoc_ret_OPEN_FAIL;
8962 	}
8963 	dxx = 0;
8964 	if (!do_lin) {
8965 	  for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
8966 	    if (fabs(g_orig_chisq[marker_idx]) > dxx) {
8967 	      dxx = fabs(g_orig_chisq[marker_idx]);
8968 	    }
8969 	  }
8970 	  dxx = dxx * dxx;
8971 	} else {
8972 	  for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
8973 	    if (g_orig_linsq[marker_idx] > dxx) {
8974 	      dxx = g_orig_linsq[marker_idx];
8975 	    }
8976 	  }
8977 	}
8978         memcpy(g_textbuf, "0 ", 2);
8979 	wptr = dtoa_gx(dxx, '\n', &(g_textbuf[2]));
8980 	if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
8981 	  goto qassoc_ret_WRITE_FAIL;
8982 	}
8983 	for (uii = 0; uii < perms_total; uii++) {
8984 	  wptr = uint32toa_x(uii + 1, ' ', g_textbuf);
8985 	  wptr = dtoa_gx(g_maxt_extreme_stat[uii], '\n', wptr);
8986 	  if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
8987 	    goto qassoc_ret_WRITE_FAIL;
8988 	  }
8989 	}
8990 	if (fclose_null(&outfile)) {
8991 	  goto qassoc_ret_WRITE_FAIL;
8992 	}
8993 	memcpy(outname_end, ".qassoc", 7); // deliberately not null-terminated
8994       }
8995       memcpy(outname_end2, ".mperm", 7);
8996     }
8997     if (fopen_checked(outname, "w", &outfile)) {
8998       goto qassoc_ret_OPEN_FAIL;
8999     }
9000     if (perm_adapt_nst) {
9001       sprintf(g_textbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
9002     } else {
9003       sprintf(g_textbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
9004 #ifdef __cplusplus
9005       std::sort(g_maxt_extreme_stat, &(g_maxt_extreme_stat[perms_total]));
9006 #else
9007       qsort(g_maxt_extreme_stat, perms_total, sizeof(double), double_cmp);
9008 #endif
9009     }
9010     // (debugging)
9011     // if (perm_maxt) {
9012     //   printf("extreme stats: %g %g %g\n", g_maxt_extreme_stat[0], g_maxt_extreme_stat[(perms_total - 1) / 2], g_maxt_extreme_stat[perms_total - 1]);
9013     // }
9014     fprintf(outfile, g_textbuf, "SNP");
9015     chrom_fo_idx = 0xffffffffU;
9016     marker_uidx = next_unset_unsafe(marker_exclude, 0);
9017     marker_idx = 0;
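         // Empirical p-values follow the (r+1)/(T+1) convention.
         // g_perm_2success_ct stores twice the per-marker success count (a tie with
         // the observed statistic counts as half a success), hence the extra 0.5
         // folded into dxx.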
9018     dyy = 1.0 / ((double)((int32_t)perms_total + 1));
9019     dxx = 0.5 * dyy;
9020     while (1) {
9021       do {
9022 	chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
9023       } while (marker_uidx >= chrom_end);
9024       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
9025       wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
9026       *wptr_start++ = ' ';
9027       wptr_start[plink_maxsnp] = ' ';
9028       for (; marker_uidx < chrom_end;) {
9029 	if (perm_adapt_nst) {
9030 	  pval = ((double)(g_perm_2success_ct[marker_idx] + 2)) / ((double)(2 * (g_perm_attempt_ct[marker_idx] + 1)));
9031 	} else {
9032 	  pval = ((double)(g_perm_2success_ct[marker_idx] + 2)) * dxx;
9033 	}
9034         if (pval <= pfilter) {
9035 	  fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
9036 	  wptr = &(wptr_start[1 + plink_maxsnp]);
9037 	  if (perm_adapt_nst && (!g_perm_attempt_ct[marker_idx])) {
9038 	    // invalid
9039             wptr = memcpya(wptr, "          NA           NA", 25);
9040 	  } else {
9041 	    if (!perm_count) {
9042 	      wptr = dtoa_g_wxp4x(pval, 12, ' ', wptr);
9043 	    } else {
9044 	      wptr = dtoa_g_wxp4x(((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ', wptr);
9045 	    }
9046 	    if (perm_adapt_nst) {
9047 	      wptr = memseta(wptr, 32, 2);
9048 	      wptr = uint32toa_w10(g_perm_attempt_ct[marker_idx], wptr);
9049 	    } else {
9050 	      // maximum chisq
9051 	      // N.B. numbers in maxt_extreme_stat[] have been pre-squared
9052 	      // while orig_chisq[] has not been
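                   // EMP2: count the permutation-wide maxima that are >= the
                   // observed squared statistic (within EPSILON), add 1, and
                   // divide by perms_total + 1.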
9053 	      if (do_lin) {
9054 		dzz = g_orig_linsq[marker_idx];
9055 	      } else {
9056 		dzz = g_orig_chisq[marker_idx] * g_orig_chisq[marker_idx];
9057 	      }
9058 	      dzz = (int32_t)(perms_total - doublearr_greater_than(g_maxt_extreme_stat, perms_total, dzz - EPSILON) + 1);
9059 	      if (!perm_count) {
9060 		wptr = dtoa_g_wxp4(dzz * dyy, 12, wptr);
9061 	      } else {
9062 		wptr = dtoa_g_wxp4(dzz - 1, 12, wptr);
9063 	      }
9064 	    }
9065 	  }
9066 	  wptr = memcpya(wptr, " \n", 2);
9067 	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
9068 	    goto qassoc_ret_WRITE_FAIL;
9069 	  }
9070 	}
9071 	if (++marker_idx == marker_ct) {
9072 	  goto qassoc_loop_end;
9073 	}
9074 	marker_uidx++;
9075         next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
9076       }
9077     }
9078   qassoc_loop_end:
9079     if (fclose_null(&outfile)) {
9080       goto qassoc_ret_WRITE_FAIL;
9081     }
9082     LOGPRINTFWW("Permutation test report written to %s .\n", outname);
9083   }
9084 
9085   while (0) {
9086   qassoc_ret_NOMEM:
9087     retval = RET_NOMEM;
9088     break;
9089   qassoc_ret_OPEN_FAIL:
9090     retval = RET_OPEN_FAIL;
9091     break;
9092   qassoc_ret_READ_FAIL:
9093     retval = RET_READ_FAIL;
9094     break;
9095   qassoc_ret_WRITE_FAIL:
9096     retval = RET_WRITE_FAIL;
9097     break;
9098   qassoc_ret_INVALID_CMDLINE:
9099     retval = RET_INVALID_CMDLINE;
9100     break;
9101   qassoc_ret_THREAD_CREATE_FAIL:
9102     retval = RET_THREAD_CREATE_FAIL;
9103     break;
9104   }
9105  qassoc_ret_1:
9106   bigstack_reset(bigstack_mark);
9107   fclose_cond(outfile);
9108   fclose_cond(outfile_qtm);
9109   fclose_cond(outfile_msa);
9110   return retval;
9111 }
9112 
9113 int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, double* pheno_d, uintptr_t* gxe_covar_nm, uintptr_t* gxe_covar_c, uintptr_t* sex_male, uint32_t hh_or_mt_exists) {
9114   unsigned char* bigstack_mark = g_bigstack_base;
9115   FILE* outfile = nullptr;
9116   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
9117   uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
9118   uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
9119   uintptr_t covar_nm_ct = popcount_longs(gxe_covar_nm, sample_ctl);
9120   uintptr_t covar_nm_ctl = BITCT_TO_WORDCT(covar_nm_ct);
9121   // gxe_covar_c has opposite truth value from ->bcovar in PLINK 1.07 gxe.cpp;
9122   // see lines 50-58 in gxe.cpp
9123   uintptr_t group2_size = popcount_longs(gxe_covar_c, sample_ctl);
9124   uintptr_t group1_size = covar_nm_ct - group2_size;
9125   uintptr_t male_ct = 0;
9126   uintptr_t male_ctl = 0;
9127   uintptr_t group1_size_male = 0;
9128   uintptr_t group2_size_male = 0;
9129   uintptr_t marker_uidx = 0;
9130   uintptr_t final_mask = 0;
9131   uintptr_t* sample_include2 = nullptr;
9132   uintptr_t* sample_male_include2 = nullptr;
9133   uintptr_t* sample_male_all_include2 = nullptr;
9134   uintptr_t* group1_include2 = nullptr;
9135   uintptr_t* group2_include2 = nullptr;
9136   uintptr_t* group1_male_include2 = nullptr;
9137   uintptr_t* group2_male_include2 = nullptr;
9138   uintptr_t* covar_nm_raw = nullptr;
9139   uintptr_t* covar_nm_male_raw = nullptr;
9140   uintptr_t* cur_sample_i2 = nullptr;
9141   uintptr_t* cur_sample_male_i2 = nullptr;
9142   uintptr_t* cur_group1_i2 = nullptr;
9143   uintptr_t* cur_group2_i2 = nullptr;
9144   uintptr_t* cur_covar_nm_raw = nullptr;
9145   double* pheno_d_collapsed = nullptr;
9146   double* pheno_d_male_collapsed = nullptr;
9147   double* cur_pheno_d = nullptr;
9148   char* wptr_start = nullptr;
9149   uintptr_t cur_sample_ct = 0;
9150   uintptr_t cur_sample_ctv2 = 0;
9151   uintptr_t cur_group1_size = 0;
9152   uintptr_t cur_group2_size = 0;
9153   uint32_t y_exists = (chrom_info_ptr->xymt_codes[Y_OFFSET] != -2) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->xymt_codes[Y_OFFSET]);
9154   uint32_t mt_exists = (chrom_info_ptr->xymt_codes[MT_OFFSET] != -2) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->xymt_codes[MT_OFFSET]);
9155   uint32_t skip_y = 0;
9156   double pheno_sum_g1 = 0;
9157   double pheno_ssq_g1 = 0;
9158   double pheno_sum_g2 = 0;
9159   double pheno_ssq_g2 = 0;
9160   double pheno_sum_male_g1 = 0;
9161   double pheno_ssq_male_g1 = 0;
9162   double pheno_sum_male_g2 = 0;
9163   double pheno_ssq_male_g2 = 0;
9164   double base_pheno_sum_g1 = 0;
9165   double base_pheno_ssq_g1 = 0;
9166   double base_pheno_sum_g2 = 0;
9167   double base_pheno_ssq_g2 = 0;
9168   int32_t retval = 0;
9169   uintptr_t* loadbuf_raw;
9170   uintptr_t* loadbuf;
9171   uintptr_t* loadbuf_ptr;
9172   uintptr_t* cgr_ptr;
9173   char* wptr;
9174   uint32_t chrom_fo_idx;
9175   uint32_t chrom_end;
9176   uintptr_t loop_end;
9177   uintptr_t marker_idx;
9178   uintptr_t sample_uidx;
9179   uintptr_t sample_uidx_stop;
9180   uintptr_t sample_idx;
9181   uintptr_t sample_idx2;
9182   uintptr_t sample_idx2_offset;
9183   uintptr_t ulii;
9184   uintptr_t uljj;
9185   uintptr_t ulkk;
9186   uintptr_t ulmm;
9187   uintptr_t ulnn;
9188   double dxx;
9189   double qt_sum1;
9190   double qt_ssq1;
9191   double qt_g_prod1;
9192   double nanal_recip1;
9193   double nanal_m1_recip1;
9194   double geno_mean1;
9195   double g_var1;
9196   double qt_var1;
9197   double qt_g_covar1;
9198   double beta1;
9199   double vbeta1;
9200 
9201   double qt_sum2;
9202   double qt_ssq2;
9203   double qt_g_prod2;
9204   double nanal_recip2;
9205   double nanal_m1_recip2;
9206   double geno_mean2;
9207   double g_var2;
9208   double qt_var2;
9209   double qt_g_covar2;
9210   double beta2;
9211   double vbeta2;
9212 
9213   double zval;
9214 
9215   uint32_t is_x;
9216   uint32_t is_y;
9217   uint32_t is_mt;
9218   uint32_t min_ploidy_1;
9219   uint32_t pct;
9220 
9221   uint32_t missing_ct1;
9222   uint32_t het_ct1;
9223   uint32_t homcom_ct1;
9224   uint32_t homrar_ct1;
9225   uint32_t nanal1;
9226   uint32_t geno_sum1;
9227   uint32_t geno_ssq1;
9228 
9229   uint32_t missing_ct2;
9230   uint32_t het_ct2;
9231   uint32_t homcom_ct2;
9232   uint32_t homrar_ct2;
9233   uint32_t nanal2;
9234   uint32_t geno_sum2;
9235   uint32_t geno_ssq2;
9236 
9237   if (group1_size < 3) {
9238     logerrprint("Error: First --gxe group has fewer than three members.\n");
9239     goto gxe_assoc_ret_INVALID_CMDLINE;
9240   } else if (group2_size < 3) {
9241     logerrprint("Error: Second --gxe group has fewer than three members.\n");
9242     goto gxe_assoc_ret_INVALID_CMDLINE;
9243   }
9244   if (bigstack_alloc_ul(unfiltered_sample_ctl * 2, &loadbuf_raw) ||
9245       bigstack_alloc_ul(covar_nm_ctl * 2, &loadbuf) ||
9246       bigstack_calloc_ul(unfiltered_sample_ctl, &covar_nm_raw) ||
9247       bigstack_alloc_d(covar_nm_ct, &pheno_d_collapsed)) {
9248     goto gxe_assoc_ret_NOMEM;
9249   }
9250   loadbuf_raw[unfiltered_sample_ctl * 2 - 1] = 0;
9251 
9252   sample_uidx = 0;
9253   sample_idx = 0;
9254   sample_idx2 = 0;
9255   do {
9256     sample_uidx = next_unset_ul_unsafe(sample_exclude, sample_uidx);
9257     sample_uidx_stop = next_set_ul(sample_exclude, sample_uidx, unfiltered_sample_ct);
9258     do {
9259       if (IS_SET(gxe_covar_nm, sample_idx)) {
9260         SET_BIT(sample_uidx, covar_nm_raw);
9261         dxx = pheno_d[sample_uidx];
9262         if (IS_SET(gxe_covar_c, sample_idx)) {
9263 	  pheno_sum_g2 += dxx;
9264 	  pheno_ssq_g2 += dxx * dxx;
9265 	} else {
9266 	  pheno_sum_g1 += dxx;
9267 	  pheno_ssq_g1 += dxx * dxx;
9268 	}
9269 	pheno_d_collapsed[sample_idx2++] = dxx;
9270       }
9271       sample_idx++;
9272     } while (++sample_uidx < sample_uidx_stop);
9273   } while (sample_idx < sample_ct);
9274 
9275   if (bigstack_alloc_ul(covar_nm_ctl * 2, &group1_include2) ||
9276       bigstack_calloc_ul(covar_nm_ctl * 2, &group2_include2)) {
9277     goto gxe_assoc_ret_NOMEM;
9278   }
9279   fill_quatervec_55(covar_nm_ct, group1_include2);
9280   sample_idx = 0;
9281   sample_idx2 = 0;
9282   do {
9283     sample_idx = next_set_ul_unsafe(gxe_covar_nm, sample_idx);
9284     sample_uidx_stop = next_unset_ul(gxe_covar_nm, sample_idx, sample_ct);
9285     do {
9286       if (IS_SET(gxe_covar_c, sample_idx)) {
9287 	SET_BIT_DBL(sample_idx2, group2_include2);
9288       }
9289       sample_idx2++;
9290     } while (++sample_idx < sample_uidx_stop);
9291   } while (sample_idx2 < covar_nm_ct);
9292   bitvec_andnot(group2_include2, covar_nm_ctl * 2, group1_include2);
9293 
9294   hh_or_mt_exists |= mt_exists * NXMHH_EXISTS;
9295   if ((hh_or_mt_exists & NXMHH_EXISTS) || y_exists) {
9296     if (bigstack_alloc_ul(covar_nm_ctl * 2, &sample_include2)) {
9297       goto gxe_assoc_ret_NOMEM;
9298     }
9299     fill_quatervec_55(covar_nm_ct, sample_include2);
9300   }
9301   if ((hh_or_mt_exists & XMHH_EXISTS) || y_exists) {
9302     if (bigstack_calloc_ul(covar_nm_ctl * 2, &sample_male_include2)) {
9303       goto gxe_assoc_ret_NOMEM;
9304     }
9305     sample_uidx = 0;
9306     sample_idx = 0;
9307     sample_idx2 = 0;
9308     do {
9309       sample_uidx = next_unset_ul_unsafe(sample_exclude, sample_uidx);
9310       sample_uidx_stop = next_set_ul(sample_exclude, sample_uidx, unfiltered_sample_ct);
9311       do {
9312         if (IS_SET(gxe_covar_nm, sample_idx)) {
9313           if (IS_SET(sex_male, sample_uidx)) {
9314 	    SET_BIT_DBL(sample_idx2, sample_male_include2);
9315 	    male_ct++;
9316 	  }
9317 	  sample_idx2++;
9318 	}
9319 	sample_idx++;
9320       } while (++sample_uidx < sample_uidx_stop);
9321     } while (sample_idx < sample_ct);
9322     male_ctl = BITCT_TO_WORDCT(male_ct);
9323     if (y_exists) {
9324       group1_size_male = popcount_longs_exclude(sample_male_include2, group2_include2, covar_nm_ctl * 2);
9325       group2_size_male = male_ct - group1_size_male;
9326       if ((group1_size_male < 3) || (group2_size_male < 3)) {
9327         logerrprint("Warning: Skipping Y chromosome for --gxe since a group has fewer than 3 males.\n");
9328 	skip_y = 1;
9329       }
9330       // currently still need to initialize covar_nm_male_raw even on skip_y
9331       if (bigstack_alloc_ul(male_ctl * 2, &sample_male_all_include2) ||
9332           bigstack_alloc_ul(male_ctl * 2, &group1_male_include2) ||
9333 	  bigstack_calloc_ul(male_ctl * 2, &group2_male_include2) ||
9334 	  bigstack_alloc_d(male_ct, &pheno_d_male_collapsed) ||
9335 	  bigstack_alloc_ul(unfiltered_sample_ctl, &covar_nm_male_raw)) {
9336 	goto gxe_assoc_ret_NOMEM;
9337       }
9338       fill_quatervec_55(male_ct, sample_male_all_include2);
9339       fill_quatervec_55(male_ct, group1_male_include2);
9340       sample_idx = 0;
9341       for (sample_idx2 = 0; sample_idx2 < covar_nm_ct; sample_idx2++) {
9342 	if (IS_SET_DBL(sample_male_include2, sample_idx2)) {
9343 	  dxx = pheno_d_collapsed[sample_idx2];
9344 	  if (IS_SET_DBL(group2_include2, sample_idx2)) {
9345 	    SET_BIT_DBL(sample_idx, group2_male_include2);
9346 	    pheno_sum_male_g2 += dxx;
9347 	    pheno_ssq_male_g2 += dxx * dxx;
9348 	  } else {
9349 	    pheno_sum_male_g1 += dxx;
9350             pheno_ssq_male_g1 += dxx * dxx;
9351 	  }
9352 	  pheno_d_male_collapsed[sample_idx++] = dxx;
9353 	}
9354       }
9355       bitvec_andnot(group2_male_include2, male_ctl * 2, group1_male_include2);
9356       for (ulii = 0; ulii < unfiltered_sample_ctl; ulii++) {
9357 	covar_nm_male_raw[ulii] = covar_nm_raw[ulii] & sex_male[ulii];
9358       }
9359     }
9360   }
9361 
9362   memcpy(outname_end, ".qassoc.gxe", 12);
9363   if (fopen_checked(outname, "w", &outfile)) {
9364     goto gxe_assoc_ret_OPEN_FAIL;
9365   }
9366   if (haploid_chrom_present(chrom_info_ptr) || mt_exists) {
9367     logerrprint("Warning: --gxe doesn't currently handle X/Y/MT/haploid variants properly.\n");
9368   }
9369   LOGPRINTFWW5("Writing --gxe report to %s ... ", outname);
9370   fputs("0%", stdout);
9371   fflush(stdout);
9372   sprintf(g_textbuf, " CHR %%%us   NMISS1      BETA1        SE1   NMISS2      BETA2        SE2    Z_GXE        P_GXE \n", plink_maxsnp);
9373   fprintf(outfile, g_textbuf, "SNP");
9374 
9375   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
9376     goto gxe_assoc_ret_READ_FAIL;
9377   }
9378   // exploit overflow for initialization
9379   chrom_fo_idx = 0xffffffffU;
9380   marker_uidx = 0;
9381   marker_idx = 0;
9382   chrom_end = 0;
9383   for (pct = 1; pct <= 100; pct++) {
9384     loop_end = (((uint64_t)pct) * marker_ct) / 100;
9385     for (; marker_idx < loop_end; marker_idx++) {
9386       if (IS_SET(marker_exclude, marker_uidx)) {
9387 	marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
9388 	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
9389 	  goto gxe_assoc_ret_READ_FAIL;
9390 	}
9391       }
9392       if (marker_uidx >= chrom_end) {
9393 	chrom_fo_idx++;
9394 	refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &min_ploidy_1);
9395 	min_ploidy_1 |= is_mt;
9396 	if (!is_y) {
9397 	  cur_sample_ct = covar_nm_ct;
9398 	  cur_group1_size = group1_size;
9399           cur_group2_size = group2_size;
9400 	  base_pheno_sum_g1 = pheno_sum_g1;
9401 	  base_pheno_ssq_g1 = pheno_ssq_g1;
9402           base_pheno_sum_g2 = pheno_sum_g2;
9403           base_pheno_ssq_g2 = pheno_ssq_g2;
9404           cur_sample_i2 = sample_include2;
9405           cur_sample_male_i2 = sample_male_include2;
9406 	  cur_group1_i2 = group1_include2;
9407           cur_group2_i2 = group2_include2;
9408           cur_pheno_d = pheno_d_collapsed;
9409 	  cur_covar_nm_raw = covar_nm_raw;
9410 	} else {
9411 	  cur_sample_ct = male_ct;
9412 	  cur_group1_size = group1_size_male;
9413           cur_group2_size = group2_size_male;
9414           base_pheno_sum_g1 = pheno_sum_male_g1;
9415 	  base_pheno_ssq_g1 = pheno_ssq_male_g1;
9416           base_pheno_sum_g2 = pheno_sum_male_g2;
9417 	  base_pheno_ssq_g2 = pheno_ssq_male_g2;
9418           cur_sample_i2 = sample_male_all_include2;
9419           cur_sample_male_i2 = sample_male_all_include2;
9420           cur_group1_i2 = group1_male_include2;
9421           cur_group2_i2 = group2_male_include2;
9422           cur_pheno_d = pheno_d_male_collapsed;
9423 	  cur_covar_nm_raw = covar_nm_male_raw;
9424 	}
9425 	wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], g_textbuf));
9426 	*wptr_start++ = ' ';
9427 	cur_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(cur_sample_ct);
9428         loadbuf[cur_sample_ctv2 - 1] = 0;
9429 	final_mask = get_final_mask(cur_sample_ct);
9430       }
9431 
9432       if (load_and_collapse_incl(unfiltered_sample_ct, cur_sample_ct, cur_covar_nm_raw, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf)) {
9433 	goto gxe_assoc_ret_READ_FAIL;
9434       }
9435       if (is_y && skip_y) {
9436 	marker_uidx++;
9437 	continue;
9438       }
9439       if (min_ploidy_1) {
9440 	haploid_fix(hh_or_mt_exists, cur_sample_i2, cur_sample_male_i2, cur_sample_ct, is_x, is_y, (unsigned char*)loadbuf);
9441       }
9442 
9443       wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
9444       *wptr++ = ' ';
9445 
9446       // We are interested in the following quantities:
9447       //   qt_var{1,2}: (qt_ssq - (qt_sum^2 / N)) / (N-1)
9448       //   g_var{1,2}: (geno_ssq - (geno_sum^2 / N)) / (N-1)
9449       //   qt_g_covar{1,2}: (qt_g_prod - ((qt_sum * geno_sum) / N)) / (N-1)
9450 
9451       single_marker_cc_3freqs(cur_sample_ctv2, loadbuf, cur_group1_i2, cur_group2_i2, &homcom_ct1, &het_ct1, &missing_ct1, &homcom_ct2, &het_ct2, &missing_ct2);
9452       nanal1 = ((uint32_t)cur_group1_size) - missing_ct1;
9453       nanal2 = ((uint32_t)cur_group2_size) - missing_ct2;
9454       homrar_ct1 = nanal1 - (het_ct1 + homcom_ct1);
9455       homrar_ct2 = nanal2 - (het_ct2 + homcom_ct2);
9456       geno_sum1 = 2 * homrar_ct1 + het_ct1;
9457       geno_sum2 = 2 * homrar_ct2 + het_ct2;
9458       geno_ssq1 = 4 * homrar_ct1 + het_ct1;
9459       geno_ssq2 = 4 * homrar_ct2 + het_ct2;
9460 
9461       if ((nanal1 > 2) && (nanal2 > 2)) {
9462 	nanal_recip1 = 1.0 / ((int32_t)nanal1);
9463 	nanal_recip2 = 1.0 / ((int32_t)nanal2);
9464 	nanal_m1_recip1 = 1.0 / ((int32_t)(nanal1 - 1));
9465 	nanal_m1_recip2 = 1.0 / ((int32_t)(nanal2 - 1));
9466 	geno_mean1 = geno_sum1 * nanal_recip1;
9467 	g_var1 = (geno_ssq1 - geno_sum1 * geno_mean1) * nanal_m1_recip1;
9468         geno_mean2 = geno_sum2 * nanal_recip2;
9469         g_var2 = (geno_ssq2 - geno_sum2 * geno_mean2) * nanal_m1_recip2;
9470 	if ((g_var1 == 0) || (g_var2 == 0)) {
9471 	  goto gxe_assoc_nan_line;
9472 	}
9473 	qt_sum1 = base_pheno_sum_g1;
9474 	qt_ssq1 = base_pheno_ssq_g1;
9475 	qt_sum2 = base_pheno_sum_g2;
9476 	qt_ssq2 = base_pheno_ssq_g2;
9477 	qt_g_prod1 = 0;
9478 	qt_g_prod2 = 0;
9479 	sample_idx2_offset = 0;
9480 	loadbuf_ptr = loadbuf;
9481 	cgr_ptr = cur_group2_i2;
9482 	do {
9483 	  ulmm = ~(*loadbuf_ptr++);
9484 	  if (sample_idx2_offset + BITCT2 > cur_sample_ct) {
9485 	    ulmm &= (ONELU << ((cur_sample_ct & (BITCT2 - 1)) * 2)) - ONELU;
9486 	  }
9487 	  if (ulmm) {
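                 // cur_group2_i2 stores a 01 pattern for each group-2 sample;
                 // multiplying by 3 expands it to a full 2-bit mask, so
                 // (ulmm & ~ulnn) walks group 1 and (ulmm & ulnn) walks group 2.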
9488 	    ulnn = (*cgr_ptr) * 3;
9489             ulii = ulmm & (~ulnn);
9490             while (ulii) {
9491 	      uljj = CTZLU(ulii) & (BITCT - 2);
9492 	      ulkk = (ulii >> uljj) & 3;
9493 	      sample_idx2 = sample_idx2_offset + (uljj / 2);
9494 	      dxx = cur_pheno_d[sample_idx2];
9495 	      if (ulkk == 1) {
9496 		// het
9497 		qt_g_prod1 += dxx;
9498 	      } else if (ulkk == 3) {
9499 		// hom rare
9500 		qt_g_prod1 += 2 * dxx;
9501 	      } else {
9502 		// missing
9503 		qt_sum1 -= dxx;
9504 		qt_ssq1 -= dxx * dxx;
9505 	      }
9506 	      ulii &= ~((3 * ONELU) << uljj);
9507 	    }
9508 	    ulii = ulmm & ulnn;
9509 	    while (ulii) {
9510 	      uljj = CTZLU(ulii) & (BITCT - 2);
9511 	      ulkk = (ulii >> uljj) & 3;
9512 	      sample_idx2 = sample_idx2_offset + (uljj / 2);
9513 	      dxx = cur_pheno_d[sample_idx2];
9514 	      if (ulkk == 1) {
9515 		qt_g_prod2 += dxx;
9516 	      } else if (ulkk == 3) {
9517 		qt_g_prod2 += 2 * dxx;
9518 	      } else {
9519 		qt_sum2 -= dxx;
9520 		qt_ssq2 -= dxx * dxx;
9521 	      }
9522 	      ulii &= ~((3 * ONELU) << uljj);
9523 	    }
9524 	  }
9525 	  cgr_ptr++;
9526 	  sample_idx2_offset += BITCT2;
9527 	} while (sample_idx2_offset < cur_sample_ct);
9528         qt_var1 = (qt_ssq1 - (qt_sum1 * qt_sum1 * nanal_recip1)) * nanal_m1_recip1;
9529         qt_var2 = (qt_ssq2 - (qt_sum2 * qt_sum2 * nanal_recip2)) * nanal_m1_recip2;
9530 	qt_g_covar1 = (qt_g_prod1 - (qt_sum1 * geno_mean1)) * nanal_m1_recip1;
9531         qt_g_covar2 = (qt_g_prod2 - (qt_sum2 * geno_mean2)) * nanal_m1_recip2;
9532 	beta1 = qt_g_covar1 / g_var1;
9533         beta2 = qt_g_covar2 / g_var2;
9534         vbeta1 = (qt_var1 / g_var1 - (qt_g_covar1 * qt_g_covar1) / (g_var1 * g_var1)) / ((double)(((int32_t)nanal1) - 2));
9535 
9536         vbeta2 = (qt_var2 / g_var2 - (qt_g_covar2 * qt_g_covar2) / (g_var2 * g_var2)) / ((double)(((int32_t)nanal2) - 2));
9537         if (vbeta1 + vbeta2 <= 0) {
9538 	  goto gxe_assoc_nan_line;
9539 	}
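             // Two-sample test for equality of regression slopes:
             //   Z = (beta1 - beta2) / sqrt(Var(beta1) + Var(beta2)),
             // with Z^2 compared against a 1-df chi-square below.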
9540         zval = (beta1 - beta2) / sqrt(vbeta1 + vbeta2);
9541         wptr = uint32toa_w8x(nanal1, ' ', wptr);
9542         wptr = dtoa_g_wxp4x(beta1, 10, ' ', wptr);
9543         wptr = dtoa_g_wxp4x(sqrt(vbeta1), 10, ' ', wptr);
9544         wptr = uint32toa_w8x(nanal2, ' ', wptr);
9545         wptr = dtoa_g_wxp4x(beta2, 10, ' ', wptr);
9546         wptr = dtoa_g_wxp4x(sqrt(vbeta2), 10, ' ', wptr);
9547         wptr = dtoa_g_wxp4x(zval, 8, ' ', wptr);
9548 	dxx = chiprob_p(zval * zval, 1);
9549         wptr = dtoa_g_wxp4x(MAXV(dxx, output_min_p), 12, '\n', wptr);
9550       } else {
9551       gxe_assoc_nan_line:
9552         wptr = memcpya(wptr, "      NA         NA         NA       NA         NA         NA       NA           NA\n", 84);
9553       }
9554       if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
9555 	goto gxe_assoc_ret_WRITE_FAIL;
9556       }
9557       marker_uidx++;
9558     }
9559     if (pct < 100) {
9560       if (pct > 10) {
9561         putc_unlocked('\b', stdout);
9562       }
9563       printf("\b\b%u%%", pct);
9564       fflush(stdout);
9565     }
9566   }
9567   if (fclose_null(&outfile)) {
9568     goto gxe_assoc_ret_WRITE_FAIL;
9569   }
9570   if (pct >= 10) {
9571     putc_unlocked('\b', stdout);
9572   }
9573   fputs("\b\b", stdout);
9574   logprint("done.\n");
9575 
9576   while (0) {
9577   gxe_assoc_ret_NOMEM:
9578     retval = RET_NOMEM;
9579     break;
9580   gxe_assoc_ret_OPEN_FAIL:
9581     retval = RET_OPEN_FAIL;
9582     break;
9583   gxe_assoc_ret_READ_FAIL:
9584     retval = RET_READ_FAIL;
9585     break;
9586   gxe_assoc_ret_WRITE_FAIL:
9587     retval = RET_WRITE_FAIL;
9588     break;
9589   gxe_assoc_ret_INVALID_CMDLINE:
9590     retval = RET_INVALID_CMDLINE;
9591     break;
9592   }
9593   bigstack_reset(bigstack_mark);
9594   fclose_cond(outfile);
9595   return retval;
9596 }
9597 
9598 void calc_git_missing(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __restrict__ loadbuf, uint32_t* perm_vecst, uint32_t* thread_wkspace) {
9599   // Simplified calc_git() for when we only need to distinguish between missing
9600   // and nonmissing.
9601   // thread_wkspace[] is assumed to be zeroed out before this function is
9602   // called.
9603   uint32_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
9604 #ifdef __LP64__
9605   uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
9606   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
9607   uint32_t perm_ct128x4 = perm_ct128 * 4;
9608   uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
9609   __m128i* permsv = (__m128i*)perm_vecst;
9610   __m128i* gitv[3];
9611 #else
9612   uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
9613   uint32_t perm_ct32x4 = perm_ct32 * 4;
9614   uint32_t perm_ct8 = (perm_vec_ct + 7) / 8;
9615   uint32_t perm_ct4 = (perm_vec_ct + 3) / 4;
9616   uintptr_t* permsv = (uintptr_t*)perm_vecst;
9617   uintptr_t* gitv[3];
9618 #endif
9619   uint32_t cur_ct;
9620   uintptr_t ulii;
9621   uint32_t uii;
9622   uint32_t ujj;
9623 #ifdef __LP64__
9624   // 4- and 8-bit partial counts
9625   gitv[0] = &(((__m128i*)thread_wkspace)[8 * perm_ct128x4]);
9626   gitv[1] = &(((__m128i*)thread_wkspace)[9 * perm_ct128x4]);
9627   gitv[2] = (__m128i*)thread_wkspace;
9628 #else
9629   gitv[0] = (uintptr_t*)(&(thread_wkspace[8 * perm_ct32x4]));
9630   gitv[1] = (uintptr_t*)(&(thread_wkspace[9 * perm_ct32x4]));
9631   gitv[2] = (uintptr_t*)thread_wkspace;
9632 #endif
9633   cur_ct = 0;
9634   for (uii = 0; uii < pheno_nm_ctl; uii++) {
9635     ulii = *loadbuf++;
9636     if (uii + 1 == pheno_nm_ctl) {
9637       ujj = pheno_nm_ct & (BITCT2 - 1);
9638       if (ujj) {
9639 	ulii &= (ONELU << ujj) - ONELU;
9640       }
9641     }
9642     while (ulii) {
9643       ujj = CTZLU(ulii);
9644       cur_ct++;
9645 #ifdef __LP64__
9646       unroll_incr_1_4(&(permsv[ujj * perm_ct128]), gitv[0], perm_ct128);
9647       if (!(cur_ct % 15)) {
9648 	unroll_zero_incr_4_8(gitv[0], gitv[1], perm_ct32);
9649 	if (!(cur_ct % 255)) {
9650 	  unroll_zero_incr_8_32(gitv[1], gitv[2], perm_ct16);
9651 	}
9652       }
9653 #else
9654       unroll_incr_1_4(&(permsv[ujj * perm_ct32]), gitv[0], perm_ct32);
9655       if (!(cur_ct % 15)) {
9656 	unroll_zero_incr_4_8(gitv[0], gitv[1], perm_ct8);
9657 	if (!(cur_ct % 255)) {
9658 	  unroll_zero_incr_8_32(gitv[1], gitv[2], perm_ct4);
9659 	}
9660       }
9661 #endif
9662       ulii &= ulii - 1;
9663     }
9664 #ifdef __LP64__
9665     permsv = &(permsv[BITCT * perm_ct128]);
9666 #else
9667     permsv = &(permsv[BITCT * perm_ct32]);
9668 #endif
9669   }
9670 #ifdef __LP64__
9671   if (cur_ct % 15) {
9672     unroll_incr_4_8(gitv[0], gitv[1], perm_ct32);
9673   }
9674   if (cur_ct % 255) {
9675     unroll_incr_8_32(gitv[1], gitv[2], perm_ct16);
9676   }
9677 #else
9678   if (cur_ct % 15) {
9679     unroll_incr_4_8(gitv[0], gitv[1], perm_ct8);
9680   }
9681   if (cur_ct % 255) {
9682     unroll_incr_8_32(gitv[1], gitv[2], perm_ct4);
9683   }
9684 #endif
9685 }
9686 
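// A minimal scalar sketch (not referenced by the surrounding code) of the
// tiered-counter scheme which calc_git_missing() above implements with packed
// vector counters: a narrow counter is bumped on every hit and spilled into
// progressively wider accumulators every 15 and every 255 hits, so the packed
// 4-bit and 8-bit fields in the real code can never overflow.  The function
// name and plain-array interface are assumptions for illustration only.
static uint32_t tiered_popcount_sketch(const unsigned char* hits, uint32_t ct) {
  uint32_t narrow = 0;  // stands in for a packed 4-bit partial count
  uint32_t medium = 0;  // stands in for a packed 8-bit partial count
  uint32_t total = 0;   // full-width accumulator
  uint32_t cur_ct = 0;
  uint32_t uii;
  for (uii = 0; uii < ct; uii++) {
    if (hits[uii]) {
      narrow++;
      cur_ct++;
      if (!(cur_ct % 15)) {
	// analogous to unroll_zero_incr_4_8()
	medium += narrow;
	narrow = 0;
	if (!(cur_ct % 255)) {
	  // analogous to unroll_zero_incr_8_32()
	  total += medium;
	  medium = 0;
	}
      }
    }
  }
  // final flush, mirroring the unroll_incr_4_8()/unroll_incr_8_32() calls at
  // the end of calc_git_missing()
  return total + medium + narrow;
}
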
9687 THREAD_RET_TYPE testmiss_adapt_thread(void* arg) {
9688   uintptr_t tidx = (uintptr_t)arg;
9689   uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
9690   uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
9691   uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS);
9692   uintptr_t perm_vec_ct = g_perm_vec_ct;
9693   uint32_t max_thread_ct = g_assoc_thread_ct;
9694   uint32_t pidx_offset = g_perms_done;
9695   uint32_t is_midp = g_fisher_midp;
9696   uint32_t first_adapt_check = g_first_adapt_check;
9697   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
9698   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
9699   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
9700   unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
9701   // this can be cached since testmiss() computes all raw p-values before
9702   // starting permutation test
9703   double* __restrict__ orig_pvals = g_orig_pvals;
9704   double adaptive_intercept = g_adaptive_intercept;
9705   double adaptive_slope = g_adaptive_slope;
9706   double adaptive_ci_zt = g_adaptive_ci_zt;
9707   double aperm_alpha = g_aperm_alpha;
9708   double stat_high = 0;
9709   double stat_low = 0;
9710   uint32_t missing_sum = 0;
9711   uint32_t nm_sum = 0;
9712   uint32_t* male_case_cts = nullptr;
9713   uintptr_t* __restrict__ loadbuf;
9714   uintptr_t* loadbuf_ptr;
9715   uint32_t* __restrict__ precomp_ui;
9716   uint32_t* __restrict__ missing_cts;
9717   uint32_t* gpui;
9718   uintptr_t marker_idx;
9719   uintptr_t pidx;
9720   uint32_t marker_bidx;
9721   uint32_t marker_bceil;
9722   uint32_t is_y;
9723   uint32_t valid_obs_ct;
9724   uint32_t success_2start;
9725   uint32_t success_2incr;
9726   uint32_t next_adapt_check;
9727   uint32_t missing_case_ct;
9728   uint32_t case_ct;
9729   uint32_t uii;
9730   double pval;
9731   double dxx;
9732   double dyy;
9733   double dzz;
9734   while (1) {
9735     if (g_block_diff <= max_thread_ct) {
9736       if (g_block_diff <= tidx) {
9737 	goto testmiss_adapt_thread_skip_all;
9738       }
9739       marker_bidx = tidx;
9740       marker_bceil = tidx + 1;
9741     } else {
9742       marker_bidx = (((uint64_t)tidx) * g_block_diff) / max_thread_ct;
9743       marker_bceil = (((uint64_t)tidx + 1) * g_block_diff) / max_thread_ct;
9744     }
9745     is_y = 0;
9746     if (g_is_y) {
9747       valid_obs_ct = g_male_ct;
9748       if (valid_obs_ct != pheno_nm_ct) {
9749 	is_y = 1; // if all male, can pretend as if this isn't Ychr
9750 	male_case_cts = g_male_case_cts;
9751       }
9752     } else {
9753       valid_obs_ct = pheno_nm_ct;
9754     }
9755     loadbuf = g_loadbuf;
9756     precomp_ui = g_precomp_ui;
9757     missing_cts = g_missing_cts;
9758     for (; marker_bidx < marker_bceil; marker_bidx++) {
9759       marker_idx = g_adapt_m_table[marker_bidx];
9760       next_adapt_check = first_adapt_check;
9761       gpui = &(precomp_ui[4 * marker_bidx]);
9762       if (is_y) {
9763 	missing_sum = missing_cts[marker_idx];
9764 	nm_sum = valid_obs_ct - missing_sum;
9765 	stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
9766 	stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
9767       }
9768       success_2start = perm_2success_ct[marker_idx];
9769       success_2incr = 0;
9770       loadbuf_ptr = &(loadbuf[marker_bidx * pheno_nm_ctv]);
9771       for (pidx = 0; pidx < perm_vec_ct;) {
9772 	missing_case_ct = popcount_longs_intersect(loadbuf_ptr, &(perm_vecs[pidx * pheno_nm_ctv]), pheno_nm_ctl);
9773 	if (!is_y) {
9774 	  if (missing_case_ct < gpui[0]) {
9775 	    if (missing_case_ct < gpui[2]) {
9776 	      success_2incr += 2;
9777 	    } else {
9778 	      success_2incr++;
9779 	    }
9780 	  } else {
9781 	    if (missing_case_ct >= gpui[1]) {
9782 	      if (missing_case_ct >= gpui[3]) {
9783 		success_2incr += 2;
9784 	      } else {
9785 		success_2incr++;
9786 	      }
9787 	    }
9788 	  }
9789 	} else {
9790 	  case_ct = male_case_cts[pidx];
9791 	  pval = fisher22(missing_case_ct, case_ct - missing_case_ct, missing_sum - missing_case_ct, nm_sum + missing_case_ct - case_ct, is_midp);
9792 	  if (pval < stat_low) {
9793 	    success_2incr += 2;
9794 	  } else if (pval <= stat_high) {
9795 	    success_2incr++;
9796 	  }
9797 	}
9798 	if (++pidx == next_adapt_check - pidx_offset) {
9799 	  uii = success_2start + success_2incr;
9800 	  if (uii) {
9801 	    pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
9802 	    dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
9803 	    dyy = pval - dxx;
9804 	    dzz = pval + dxx;
9805 	    if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
9806 	      perm_adapt_stop[marker_idx] = 1;
9807 	      perm_attempt_ct[marker_idx] = next_adapt_check;
9808 	      break;
9809 	    }
9810 	  }
9811 	  next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
9812 	}
9813       }
9814       perm_2success_ct[marker_idx] += success_2incr;
9815     }
9816   testmiss_adapt_thread_skip_all:
9817     if ((!tidx) || g_is_last_thread_block) {
9818       THREAD_RETURN;
9819     }
9820     THREAD_BLOCK_FINISH(tidx);
9821   }
9822 }
9823 
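// Minimal standalone sketch (not referenced elsewhere) of the adaptive
// early-stopping rule applied in testmiss_adapt_thread() above: at each
// checkpoint, a normal-approximation confidence interval around the current
// empirical p-value is compared with the --aperm alpha threshold, and
// permutation stops for the marker once the interval clearly excludes alpha.
// The function name and scalar interface are assumptions for illustration;
// the thread only performs this check once at least one success has been
// observed.
static inline uint32_t aperm_stop_check_sketch(uint32_t success_2ct, uint32_t perms_done, double ci_zt, double alpha) {
  // empirical p-value with pseudocounts, matching the thread's expression
  double pval = ((double)((int32_t)success_2ct + 2)) / ((double)(2 * ((int32_t)perms_done + 1)));
  double halfwidth = ci_zt * sqrt(pval * (1 - pval) / ((int32_t)perms_done));
  return (pval - halfwidth > alpha) || (pval + halfwidth < alpha);
}
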
9824 THREAD_RET_TYPE testmiss_maxt_thread(void* arg) {
9825   uintptr_t tidx = (uintptr_t)arg;
9826   uintptr_t perm_vec_ct = g_perm_vec_ct;
9827   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
9828   uint32_t is_midp = g_fisher_midp;
9829   uint32_t max_thread_ct = g_assoc_thread_ct;
9830   uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
9831   uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS);
9832   uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
9833   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
9834   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 176]);
9835   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
9836   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
9837   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
9838   double* __restrict__ orig_pvals = g_orig_pvals;
9839   double* msa_ptr = nullptr;
9840   uint32_t* male_case_cts = nullptr;
9841   uint32_t* gpui = nullptr;
9842   double* gpd = nullptr;
9843   double stat_high = 0;
9844   double stat_low = 0;
9845   uint32_t case_ct = g_perm_case_ct;
9846   uint32_t cur_case_ct = case_ct;
9847   uintptr_t* loadbuf;
9848   uintptr_t* loadbuf_ptr;
9849   uint32_t* precomp_ui;
9850   uint32_t* __restrict__ missing_cts;
9851   double* __restrict__ precomp_d;
9852   uintptr_t pidx;
9853   uintptr_t marker_idx;
9854   double pval;
9855   uint32_t marker_bidx_start;
9856   uint32_t marker_bidx;
9857   uint32_t marker_bceil;
9858   uint32_t is_y;
9859   uint32_t valid_obs_ct;
9860   uint32_t missing_sum;
9861   uint32_t nm_sum;
9862   uint32_t success_2incr;
9863   uint32_t missing_case_ct;
9864   uint32_t uii;
9865   uint32_t ujj;
9866   while (1) {
9867     if (g_block_diff <= max_thread_ct) {
9868       if (g_block_diff <= tidx) {
9869 	goto testmiss_maxt_thread_skip_all;
9870       }
9871       marker_bidx_start = tidx;
9872       marker_bceil = tidx + 1;
9873     } else {
9874       marker_bidx_start = (((uint64_t)tidx) * g_block_diff) / max_thread_ct;
9875       marker_bceil = (((uint64_t)tidx + 1) * g_block_diff) / max_thread_ct;
9876     }
9877     marker_bidx = marker_bidx_start;
9878     marker_idx = g_maxt_block_base + marker_bidx_start;
9879     memcpy(results, &(g_maxt_extreme_stat[g_perms_done]), perm_vec_ct * sizeof(double));
9880     is_y = 0;
9881     if (g_is_y) {
9882       valid_obs_ct = g_male_ct;
9883       if (valid_obs_ct != pheno_nm_ct) {
9884 	is_y = 1;
9885 	male_case_cts = g_male_case_cts;
9886 	precomp_ui = nullptr;
9887       }
9888     } else {
9889       valid_obs_ct = pheno_nm_ct;
9890     }
9891     loadbuf = g_loadbuf;
9892     missing_cts = g_missing_cts;
9893     precomp_d = g_precomp_d;
9894     if (g_mperm_save_all) {
9895       msa_ptr = &(g_mperm_save_all[marker_idx * perm_vec_ct]);
9896       precomp_ui = nullptr;
9897     } else if (!is_y) {
      // g_precomp_ui is not filled for Y-chromosome blocks (their case/control
      // split varies across permutations), so in that case keep the nullptr
      // assigned above and fall back on direct Fisher p-value comparisons.
9898       precomp_ui = g_precomp_ui;
9899     }
9900     for (; marker_bidx < marker_bceil; marker_bidx++) {
9901       missing_sum = missing_cts[marker_idx];
9902       nm_sum = valid_obs_ct - missing_sum;
9903       if (precomp_ui) {
9904 	gpui = &(precomp_ui[6 * marker_bidx]);
9905 	gpd = &(precomp_d[2 * marker_bidx]);
9906       } else {
9907 	stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
9908 	stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
9909       }
9910       loadbuf_ptr = &(loadbuf[marker_bidx * pheno_nm_ctv]);
9911       success_2incr = 0;
9912       fill_uint_zero(perm_ct128 * 176, thread_git_wkspace);
9913       calc_git_missing(pheno_nm_ct, perm_vec_ct, loadbuf_ptr, perm_vecst, thread_git_wkspace);
9914       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
9915 	missing_case_ct = thread_git_wkspace[pidx];
9916 	if (precomp_ui) {
9917 	  if (missing_case_ct < gpui[0]) {
9918 	    if (missing_case_ct < gpui[2]) {
9919 	      success_2incr += 2;
9920 	    } else {
9921 	      success_2incr++;
9922 	    }
9923 	  } else {
9924 	    if (missing_case_ct >= gpui[1]) {
9925 	      if (missing_case_ct >= gpui[3]) {
9926 		success_2incr += 2;
9927 	      } else {
9928 		success_2incr++;
9929 	      }
9930 	    }
9931 	  }
9932 	  ujj = gpui[4];
9933 	  uii = (uint32_t)(missing_case_ct - ujj); // deliberate underflow
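	  // single unsigned comparison: true iff missing_case_ct falls outside
	  // the precomputed window [gpui[4], gpui[4] + gpui[5]), in which case
	  // the exact tail p-value is evaluated below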
9934 	  if (uii >= gpui[5]) {
9935 	    pval = fisher22_tail_pval(ujj, missing_sum - ujj, case_ct - ujj, nm_sum + ujj - case_ct, gpui[5] - 1, gpd[0], gpd[1], is_midp, missing_case_ct);
9936 	    if (results[pidx] > pval) {
9937 	      results[pidx] = pval;
9938 	    }
9939 	  }
9940 	} else {
9941 	  if (is_y) {
9942 	    cur_case_ct = male_case_cts[pidx];
9943 	  }
9944 	  pval = fisher22(missing_case_ct, missing_sum - missing_case_ct, cur_case_ct - missing_case_ct, nm_sum + missing_case_ct - cur_case_ct, is_midp);
9945 	  if (pval < stat_low) {
9946 	    success_2incr += 2;
9947 	  } else if (pval <= stat_high) {
9948 	    success_2incr++;
9949 	  }
9950 	  if (results[pidx] > pval) {
9951 	    results[pidx] = pval;
9952 	  }
9953 	  if (msa_ptr) {
9954 	    *msa_ptr++ = pval;
9955 	  }
9956 	}
9957       }
9958       perm_2success_ct[marker_idx++] += success_2incr;
9959     }
9960   testmiss_maxt_thread_skip_all:
9961     if ((!tidx) || g_is_last_thread_block) {
9962       THREAD_RETURN;
9963     }
9964     THREAD_BLOCK_FINISH(tidx);
9965   }
9966 }
9967 
9968 int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t testmiss_mperm_val, uint32_t testmiss_modifier, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, Aperm_info* apip, uint32_t mperm_save, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t hh_exists) {
9969   // Simple variant of model_assoc().
9970   unsigned char* bigstack_mark = g_bigstack_base;
9971   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
9972   uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
9973   uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
9974   uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
9975   uintptr_t cur_sample_ctl = pheno_nm_ctl;
9976   uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS);
9977   uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
9978   uintptr_t marker_uidx = next_unset_unsafe(marker_exclude_orig, 0);
9979   double maxt_cur_extreme_stat = 0;
9980   FILE* outfile = nullptr;
9981   FILE* outfile_msa = nullptr;
9982   uintptr_t* sample_hh_include2 = nullptr;
9983   uintptr_t* sample_hh_male_include2 = nullptr;
9984   uintptr_t* pheno_male_nm2 = nullptr;
9985   uintptr_t* pheno_c_collapsed_male = nullptr;
9986   uintptr_t* sex_male_collapsed = nullptr;
9987   char* wptr_start = nullptr;
9988   char* tbuf2 = &(g_textbuf[MAXLINELEN]);
9989   uint32_t perm_adapt = testmiss_modifier & TESTMISS_PERM;
9990   uint32_t perm_maxt = testmiss_modifier & TESTMISS_MPERM;
9991   uint32_t perm_count = testmiss_modifier & TESTMISS_PERM_COUNT;
9992   uint32_t midp = testmiss_modifier & TESTMISS_MIDP;
9993   uint32_t do_perms = perm_adapt | perm_maxt;
9994   uint32_t perms_total = 0;
9995   uint32_t chrom_fo_idx = 0xffffffffU;
9996   uint32_t is_x = 0;
9997   // don't treat MT heterozygous call as missing
9998   uint32_t is_haploid = 0;
9999   uint32_t skip_y = 0;
10000   uint32_t cur_pheno_nm_ct = pheno_nm_ct;
10001   uint32_t case_ct = popcount_longs(pheno_c, unfiltered_sample_ctl);
10002   uint32_t ctrl_ct = pheno_nm_ct - case_ct;
10003   uint32_t male_ct = popcount_longs_intersect(sex_male, pheno_nm, unfiltered_sample_ctl);
10004   uint32_t case_ct_y = popcount_longs_intersect(sex_male, pheno_c, unfiltered_sample_ctl);
10005   uint32_t ctrl_ct_y = male_ct - case_ct_y;
10006   uint32_t cur_case_ct = case_ct;
10007   uint32_t cur_ctrl_ct = ctrl_ct;
10008   uint32_t chrom_end = 0;
10009   uint32_t mperm_dump_all = 0;
10010   uint32_t max_thread_ct = g_thread_ct;
10011   uintptr_t pheno_male_nm_ctl = BITCT_TO_WORDCT(male_ct);
10012   int32_t y_code = chrom_info_ptr->xymt_codes[Y_OFFSET];
10013   int32_t retval = 0;
10014   uint32_t uibuf[4];
10015   uintptr_t* loadbuf_raw;
10016   uintptr_t* pheno_nm2;
10017   uintptr_t* cur_pheno_nm2;
10018   uintptr_t* pheno_c_collapsed;
10019   uintptr_t* cur_pheno_c_collapsed;
10020   uintptr_t* missing_bitfield;
10021   uintptr_t* marker_exclude;
10022   uintptr_t* loadbuf_ptr;
10023   double* dptr;
10024   uint32_t* marker_idx_to_uidx;
10025   char* outname_end2;
10026   char* wptr;
10027   uintptr_t marker_uidx_end;
10028   uintptr_t marker_ct;
10029   uintptr_t marker_unstopped_ct;
10030   uintptr_t marker_idx;
10031   uintptr_t marker_idx2;
10032   uintptr_t block_size;
10033   uintptr_t block_end;
10034   uintptr_t perm_idx;
10035   uintptr_t ulii;
10036   double pval;
10037   double cur_case_ct_recip;
10038   double cur_ctrl_ct_recip;
10039   double dxx;
10040   double dyy;
10041   double dzz;
10042   uint32_t missing_ct;
10043   uint32_t marker_cidx;
10044   uint32_t is_last_block;
10045   uint32_t uii;
10046   uint32_t ujj;
10047   uint32_t ukk;
10048   uint32_t umm;
10049   if ((!case_ct) || (!ctrl_ct)) {
10050     logerrprint("Warning: Skipping --test-missing since at least one case and one control are\nrequired.\n");
10051     goto testmiss_ret_1;
10052   }
10053   cur_case_ct_recip = 1.0 / ((double)((int32_t)case_ct));
10054   cur_ctrl_ct_recip = 1.0 / ((double)((int32_t)ctrl_ct));
10055   // Y chromosome requires special handling--only male genotypes should be
10056   // considered.
10057   if ((y_code == -2) || (!is_set(chrom_info_ptr->chrom_mask, y_code))) {
10058     skip_y = 1;
10059   } else if ((!case_ct_y) || (!ctrl_ct_y)) {
10060     logerrprint("Warning: --test-missing is skipping the Y chromosome since at least one male\ncase and one male control are necessary.\n");
10061     skip_y = 1;
10062   }
10063   if (perm_maxt) {
10064     mperm_dump_all = mperm_save & MPERM_DUMP_ALL;
10065     perms_total = testmiss_mperm_val;
10066     if (bigstack_alloc_d(perms_total, &g_maxt_extreme_stat)) {
10067       goto testmiss_ret_NOMEM;
10068     }
10069     for (uii = 0; uii < perms_total; uii++) {
10070       g_maxt_extreme_stat[uii] = 1;
10071     }
10072     if (mperm_dump_all) {
10073       memcpy(outname_end, ".mperm.dump.all", 16);
10074       if (fopen_checked(outname, "w", &outfile_msa)) {
10075         goto testmiss_ret_OPEN_FAIL;
10076       }
10077       LOGPRINTFWW("Dumping all permutation p-values to %s .\n", outname);
10078     }
10079   } else {
10080     mperm_save = 0;
10081     if (perm_adapt) {
10082       g_aperm_alpha = apip->alpha;
10083       perms_total = apip->max;
10084     }
10085   }
10086   // Sites with no (or all) missing calls are now excluded from the permutation
10087   // test.  Since it's likely that many such sites exist, we postpone the
10088   // associated memory allocations until after the basic .missing report is
10089   // generated.
10090   if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
10091       bigstack_alloc_ul(unfiltered_sample_ctl2, &pheno_nm2) ||
10092       bigstack_alloc_ul(pheno_nm_ctl, &pheno_c_collapsed) ||
10093       bigstack_alloc_ul(pheno_nm_ctl, &missing_bitfield) ||
10094       bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude)) {
10095     goto testmiss_ret_NOMEM;
10096   }
10097   memcpy(marker_exclude, marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t));
10098   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
10099   init_quaterarr_from_bitarr(pheno_nm, unfiltered_sample_ct, pheno_nm2);
10100   cur_pheno_nm2 = pheno_nm2;
10101   copy_bitarr_subset(pheno_c, pheno_nm, unfiltered_sample_ct, pheno_nm_ct, pheno_c_collapsed);
10102   cur_pheno_c_collapsed = pheno_c_collapsed;
10103   if (!skip_y) {
10104     if (bigstack_alloc_ul(unfiltered_sample_ctl2, &pheno_male_nm2) ||
10105         bigstack_alloc_ul(pheno_male_nm_ctl, &pheno_c_collapsed_male)) {
10106       goto testmiss_ret_NOMEM;
10107     }
10108     // temporary non-excluded male bitfield
10109     memcpy(pheno_male_nm2, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
10110     bitvec_and(sex_male, unfiltered_sample_ctl, pheno_male_nm2);
10111     copy_bitarr_subset(pheno_c, pheno_male_nm2, unfiltered_sample_ct, male_ct, pheno_c_collapsed_male);
10112     memcpy(pheno_male_nm2, pheno_nm2, unfiltered_sample_ctl2 * sizeof(intptr_t));
10113     apply_bitarr_mask_to_quaterarr_01(sex_male, unfiltered_sample_ct, pheno_male_nm2);
10114   }
10115   outname_end2 = memcpyb(outname_end, ".missing", 9);
10116   if (fopen_checked(outname, "w", &outfile)) {
10117     goto testmiss_ret_OPEN_FAIL;
10118   }
10119   LOGPRINTFWW5("Writing --test-missing report to %s ... ", outname);
10120   fflush(stdout);
10121   sprintf(g_textbuf, " CHR %%%us     F_MISS_A     F_MISS_U            P \n", plink_maxsnp);
10122   fprintf(outfile, g_textbuf, "SNP");
10123   if (ferror(outfile)) {
10124     goto testmiss_ret_WRITE_FAIL;
10125   }
10126   // technically this part could be even faster with some custom code, but not
10127   // worth the additional maintenance
10128   if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_exists, 1, pheno_nm, sex_male, &sample_hh_include2, &sample_hh_male_include2)) {
10129     goto testmiss_ret_NOMEM;
10130   }
10131   if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10132     goto testmiss_ret_READ_FAIL;
10133   }
10134   chrom_end = 0;
10135   // must be last allocation
10136   if (bigstack_alloc_d(marker_ct_orig, &g_orig_pvals)) {
10137     goto testmiss_ret_NOMEM;
10138   }
10139   dptr = g_orig_pvals;
10140   for (marker_idx = 0; marker_idx < marker_ct_orig; marker_uidx++, marker_idx++) {
10141     if (IS_SET(marker_exclude_orig, marker_uidx)) {
10142       marker_uidx = next_unset_ul_unsafe(marker_exclude_orig, marker_uidx);
10143       if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10144         goto testmiss_ret_READ_FAIL;
10145       }
10146     }
10147     if (marker_uidx >= chrom_end) {
10148       // exploit overflow
10149       chrom_fo_idx++;
10150       refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &g_is_y, &uii, &is_haploid);
10151       if (!skip_y) {
10152         if (!g_is_y) {
10153           cur_pheno_nm2 = pheno_nm2;
10154           cur_pheno_nm_ct = pheno_nm_ct;
10155           cur_sample_ctl = pheno_nm_ctl;
10156           cur_case_ct = case_ct;
10157           cur_ctrl_ct = ctrl_ct;
10158 	  cur_pheno_c_collapsed = pheno_c_collapsed;
10159 	} else {
10160           cur_pheno_nm2 = pheno_male_nm2;
10161           cur_pheno_nm_ct = male_ct;
10162           cur_sample_ctl = pheno_male_nm_ctl;
10163           cur_case_ct = case_ct_y;
10164           cur_ctrl_ct = ctrl_ct_y;
10165 	  cur_pheno_c_collapsed = pheno_c_collapsed_male;
10166 	}
10167         cur_case_ct_recip = 1.0 / ((double)((int32_t)cur_case_ct));
10168         cur_ctrl_ct_recip = 1.0 / ((double)((int32_t)cur_ctrl_ct));
10169       } else if (g_is_y) {
10170         fill_bits(marker_uidx, chrom_end - marker_uidx, marker_exclude);
10171 	marker_idx += chrom_end - marker_uidx - 1 - popcount_bit_idx(marker_exclude_orig, marker_uidx, chrom_end);
10172 	marker_uidx = chrom_end - 1;
10173 	continue;
10174       }
10175       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
10176       wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
10177       *wptr_start++ = ' ';
10178     }
10179     if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
10180       goto testmiss_ret_READ_FAIL;
10181     }
10182     if (is_haploid && hh_exists) {
10183       haploid_fix(hh_exists, sample_hh_include2, sample_hh_male_include2, unfiltered_sample_ct, is_x, g_is_y, (unsigned char*)loadbuf_raw);
10184     }
10185     extract_collapsed_missing_bitfield(loadbuf_raw, unfiltered_sample_ct, cur_pheno_nm2, cur_pheno_nm_ct, missing_bitfield);
10186     missing_ct = popcount_longs(missing_bitfield, cur_sample_ctl);
10187     if ((!missing_ct) || (missing_ct == cur_pheno_nm_ct)) {
10188       SET_BIT(marker_uidx, marker_exclude);
10189       continue;
10190     }
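    // 2x2 Fisher's exact test on missingness vs. case/control status:
    //                    case               control
    //   missing          uii                ujj
    //   genotyped        cur_case_ct - uii  cur_ctrl_ct - ujj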
10191     uii = popcount_longs_intersect(missing_bitfield, cur_pheno_c_collapsed, cur_sample_ctl);
10192     ujj = missing_ct - uii;
10193     pval = fisher22(uii, ujj, cur_case_ct - uii, cur_ctrl_ct - ujj, midp);
10194     *dptr++ = pval;
10195     if (!(pval <= pfilter)) {
10196       continue;
10197     }
10198     wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
10199     *wptr++ = ' ';
10200     wptr = dtoa_g_wxp4x(((int32_t)uii) * cur_case_ct_recip, 12, ' ', wptr);
10201     wptr = dtoa_g_wxp4x(((int32_t)ujj) * cur_ctrl_ct_recip, 12, ' ', wptr);
10202     wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, '\n', wptr);
10203     if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
10204       goto testmiss_ret_WRITE_FAIL;
10205     }
10206   }
10207   if (fclose_null(&outfile)) {
10208     goto testmiss_ret_WRITE_FAIL;
10209   }
10210   logprint("done.\n");
10211   marker_ct = (uintptr_t)(dptr - g_orig_pvals);
10212   bigstack_shrink_top(g_orig_pvals, marker_ct * sizeof(double));
10213   if (mtest_adjust) {
10214     if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
10215       goto testmiss_ret_NOMEM;
10216     }
10217     fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
10218     retval = multcomp(outname, outname_end, marker_idx_to_uidx, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, nullptr, pfilter, output_min_p, mtest_adjust, 1, 0.0, nullptr, g_orig_pvals);
10219     if (retval) {
10220       goto testmiss_ret_1;
10221     }
10222   }
10223   if (do_perms) {
10224     if (!marker_ct) {
10225       logprint("Skipping --test-missing permutation test since all loci are degenerate.\n");
10226       goto testmiss_ret_1;
10227     }
10228     LOGPRINTF("Including %" PRIuPTR " loc%s in --test-missing permutation test.\n", marker_ct, (marker_ct == 1)? "us" : "i");
10229     if (mperm_dump_all) {
10230       g_textbuf[0] = '0';
10231       wptr = &(g_textbuf[1]);
10232       for (uii = 0; uii < marker_ct; uii++) {
10233         *wptr++ = ' ';
10234         dxx = g_orig_pvals[uii];
10235 	if (dxx >= 0) {
10236 	  wptr = dtoa_g(dxx, wptr);
10237 	} else {
10238 	  wptr = memcpya(wptr, "NA", 2);
10239 	}
10240 	if (wptr >= tbuf2) {
10241 	  if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
10242 	    goto testmiss_ret_WRITE_FAIL;
10243 	  }
10244 	  wptr = g_textbuf;
10245 	}
10246       }
10247       *wptr++ = '\n';
10248       if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
10249 	goto testmiss_ret_WRITE_FAIL;
10250       }
10251     }
10252 
10253     if (!skip_y) {
10254       // maybe all Y chromosome markers had no missing calls?
10255       uii = get_chrom_start_vidx(chrom_info_ptr, (uint32_t)y_code);
10256       ujj = get_chrom_end_vidx(chrom_info_ptr, (uint32_t)y_code);
10257       if (popcount_bit_idx(marker_exclude, uii, ujj) == ujj - uii) {
10258 	skip_y = 1;
10259       } else {
10260 	if (bigstack_alloc_ul(pheno_nm_ctl, &sex_male_collapsed)) {
10261 	  goto testmiss_ret_NOMEM;
10262 	}
10263 	copy_bitarr_subset(sex_male, pheno_nm, unfiltered_sample_ct, pheno_nm_ct, sex_male_collapsed);
10264       }
10265     }
10266 
10267     if (cluster_starts) {
10268       retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 1, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, &g_perm_cluster_case_cts, &g_perm_cluster_cc_preimage);
10269       if (retval) {
10270 	goto testmiss_ret_1;
10271       }
10272       if (!g_perm_cluster_ct) {
10273 	logerrprint("Error: No size 2+ clusters for permutation test.\n");
10274 	goto testmiss_ret_INVALID_CMDLINE;
10275       }
10276       retval = cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs);
10277       if (retval) {
10278 	goto testmiss_ret_1;
10279       }
10280     } else {
10281       g_perm_cluster_starts = nullptr;
10282     }
10283     if (max_thread_ct > perms_total) {
10284       max_thread_ct = perms_total;
10285     }
10286     if (bigstack_init_sfmtp(max_thread_ct)) {
10287       goto testmiss_ret_NOMEM;
10288     }
10289     if (bigstack_alloc_ul(MODEL_BLOCKSIZE * pheno_nm_ctv, &g_loadbuf) ||
10290 	bigstack_calloc_ui(marker_ct, &g_perm_2success_ct) ||
10291 	bigstack_alloc_ui(marker_ct, &g_missing_cts)) {
10292       goto testmiss_ret_NOMEM;
10293     }
10294     for (uii = 1; uii <= MODEL_BLOCKSIZE; uii++) {
10295       g_loadbuf[uii * pheno_nm_ctv - 2] = 0;
10296       g_loadbuf[uii * pheno_nm_ctv - 1] = 0;
10297     }
10298     uii = marker_ct;
10299     if (perm_maxt) {
10300       if (!mperm_dump_all) {
10301 	if (bigstack_alloc_ui(6 * MODEL_BLOCKSIZE, &g_precomp_ui) ||
10302 	    bigstack_alloc_d(2 * MODEL_BLOCKSIZE, &g_precomp_d)) {
10303 	  goto testmiss_ret_NOMEM;
10304 	}
10305       }
10306     } else {
10307       if (bigstack_alloc_ui(marker_ct, &g_perm_attempt_ct) ||
10308 	  bigstack_calloc_uc(round_up_pow2(marker_ct, BYTECT), &g_perm_adapt_stop) ||
10309 	  bigstack_alloc_ui(4 * MODEL_BLOCKSIZE, &g_precomp_ui)) {
10310 	goto testmiss_ret_NOMEM;
10311       }
10312       for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
10313 	g_perm_attempt_ct[marker_idx] = perms_total;
10314       }
10315       g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
10316     }
10317     if (!cluster_starts) {
10318       g_perm_tot_quotient = 0x100000000LLU / pheno_nm_ct;
10319       magic_num(g_perm_tot_quotient, &g_perm_totq_magic, &g_perm_totq_preshift, &g_perm_totq_postshift, &g_perm_totq_incr);
10320     }
10321     marker_unstopped_ct = marker_ct;
10322     g_perm_is_1bit = 1;
10323     g_perms_done = 0;
10324     g_perm_pheno_nm_ct = pheno_nm_ct;
10325     g_perm_case_ct = case_ct;
10326     g_male_ct = male_ct;
10327     g_fisher_midp = midp;
10328     g_mperm_save_all = nullptr;
10329     // ----- begin main loop -----
10330   testmiss_more_perms:
10331     if (perm_adapt) {
10332       if (g_perms_done) {
10333 	while (g_first_adapt_check <= g_perms_done) {
10334 	  g_first_adapt_check += (int32_t)(apip->init_interval + ((int32_t)g_first_adapt_check) * apip->interval_slope);
10335 	}
10336       } else {
10337 	if (apip->min < apip->init_interval) {
10338 	  g_first_adapt_check = (int32_t)(apip->init_interval);
10339 	} else {
10340 	  g_first_adapt_check = apip->min;
10341 	}
10342 	g_adaptive_intercept = apip->init_interval;
10343 	g_adaptive_slope = apip->interval_slope;
10344       }
10345       g_perm_vec_ct = (bigstack_left() - CACHELINE + sizeof(int32_t)) / (pheno_nm_ctv * sizeof(intptr_t) + (1 - skip_y) * sizeof(int32_t));
10346     } else {
10347       // g_perm_vec_ct memory allocation dependencies:
10348       //   g_maxt_thread_results: (8 * g_perm_vec_ct, cacheline-aligned) *
10349       //     max_thread_ct
10350       //   g_perm_vecst: 16 * ((g_perm_vec_ct + 127) / 128) * pheno_nm_ct
10351       //   g_thread_git_wkspace: ((perm_vec_ct + 127) / 128) * 704 *
10352       //     max_thread_ct
10353       //   g_perm_vecs: pheno_nm_ctv * sizeof(intptr_t) * g_perm_vec_ct
10354       //   g_male_case_cts (if needed): sizeof(int32_t) * g_perm_vec_ct
10355       //   g_mperm_save_all (if needed): marker_ct * 8 * g_perm_vec_ct
10356       // Forcing g_perm_vec_ct to be a multiple of 128, total is
10357       //   g_perm_vec_ct * (13.5 * max_thread_ct + pheno_nm_ct / 8 + 4 +
10358       //                    sizeof(intptr_t) * pheno_nm_ctv
10359       //                    [+ marker_ct * sizeof(double) * mperm_save_all])
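      // For a rough sense of scale (hypothetical numbers): with
      // pheno_nm_ct = 1000 and max_thread_ct = 4 on a 64-bit build,
      // pheno_nm_ctv = 16, so the per-permutation cost without
      // --mperm-save-all is roughly 54 + 125 + 4 + 128 = 311 bytes, i.e. a
      // little over 300 MB per million permutations held in one batch.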
10360       if (mperm_dump_all) {
10361 	g_perm_vec_ct = 128 * (bigstack_left() / (128 * sizeof(intptr_t) * pheno_nm_ctv + 1728LL * max_thread_ct + 16LL * pheno_nm_ct + 512 * (1 - skip_y) + 128LL * sizeof(double) * marker_ct));
10362       } else {
10363 	g_perm_vec_ct = 128 * (bigstack_left() / (128 * sizeof(intptr_t) * pheno_nm_ctv + 1728LL * max_thread_ct + 16LL * pheno_nm_ct + 512 * (1 - skip_y)));
10364       }
10365     }
10366     if (g_perm_vec_ct > perms_total - g_perms_done) {
10367       g_perm_vec_ct = perms_total - g_perms_done;
10368     } else if (!g_perm_vec_ct) {
10369       goto testmiss_ret_NOMEM;
10370     }
10371     bigstack_alloc_ul(g_perm_vec_ct * pheno_nm_ctv, &g_perm_vecs);
10372     g_perm_generation_thread_ct = MINV(max_thread_ct, g_perm_vec_ct);
10373     ulii = 0;
10374     if (!cluster_starts) {
10375       if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
10376 	goto testmiss_ret_THREAD_CREATE_FAIL;
10377       }
10378       generate_cc_perms_thread((void*)ulii);
10379     } else {
10380       if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
10381 	goto testmiss_ret_THREAD_CREATE_FAIL;
10382       }
10383       generate_cc_cluster_perms_thread((void*)ulii);
10384     }
10385     join_threads(threads, g_perm_generation_thread_ct);
10386     g_assoc_thread_ct = max_thread_ct;
10387     if (perm_maxt) {
10388       bigstack_alloc_d(max_thread_ct * round_up_pow2(g_perm_vec_ct, CACHELINE_DBL), &g_maxt_thread_results);
10389 #ifdef __LP64__
10390       ulii = ((g_perm_vec_ct + 127) / 128) * 4;
10391       bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
10392 #else
10393       ulii = (g_perm_vec_ct + 31) / 32;
10394       bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
10395       ulii = ((g_perm_vec_ct + 127) / 128) * 4; // force 64-byte align
10396 #endif
10397       bigstack_calloc_ui(ulii * 44 * max_thread_ct, &g_thread_git_wkspace);
10398       transpose_perm1s(g_perm_vecs, g_perm_vec_ct, pheno_nm_ct, g_perm_vecst);
10399       if (mperm_dump_all) {
10400 	bigstack_alloc_d(marker_ct * g_perm_vec_ct, &g_mperm_save_all);
10401       }
10402     }
10403     if (!skip_y) {
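      // On chrY the case/control split varies across permutations (only male
      // genotypes count), so cache each permutation's male case count here
      // for the worker threads.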
10404       bigstack_alloc_ui(g_perm_vec_ct, &g_male_case_cts);
10405       for (perm_idx = 0; perm_idx < g_perm_vec_ct; perm_idx++) {
10406 	g_male_case_cts[perm_idx] = popcount_longs_intersect(sex_male_collapsed, &(g_perm_vecs[perm_idx * pheno_nm_ctv]), pheno_nm_ctl);
10407       }
10408     }
10409     chrom_fo_idx = 0xffffffffU;
10410     marker_uidx = next_unset_unsafe(marker_exclude, 0);
10411     if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10412       goto testmiss_ret_READ_FAIL;
10413     }
10414     marker_idx = 0;
10415     marker_idx2 = 0;
10416     chrom_end = 0;
10417     // only forced to terminate block at Y chromosome boundaries
10418     if (!skip_y) {
10419       marker_uidx_end = get_chrom_start_vidx(chrom_info_ptr, (uint32_t)y_code);
10420       pheno_male_nm_ctl = round_up_pow2(pheno_male_nm_ctl, 2);
10421     } else {
10422       marker_uidx_end = unfiltered_marker_ct;
10423     }
10424     do {
10425       block_size = 0;
10426       block_end = marker_unstopped_ct - marker_idx;
10427       if ((!marker_idx) && perm_maxt) {
10428 	if (block_end > MODEL_BLOCKKEEP) {
10429 	  block_end = MODEL_BLOCKKEEP;
10430 	}
10431       } else if (block_end > MODEL_BLOCKSIZE) {
10432 	block_end = MODEL_BLOCKSIZE;
10433       }
10434       if (marker_uidx >= marker_uidx_end) {
10435 	marker_uidx_end = get_chrom_end_vidx(chrom_info_ptr, (uint32_t)y_code);
10436 	if (marker_uidx >= marker_uidx_end) {
10437 	  marker_uidx_end = unfiltered_marker_ct;
10438 	}
10439       }
10440       do {
10441 	if (perm_adapt && g_perm_adapt_stop[marker_idx2]) {
10442 	  do {
10443 	    marker_uidx++;
10444 	    next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
10445 	    marker_idx2++;
10446 	  } while ((marker_uidx < marker_uidx_end) && g_perm_adapt_stop[marker_idx2]);
10447 	  if (marker_uidx >= marker_uidx_end) {
10448 	    break;
10449 	  }
10450 	  if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10451 	    goto testmiss_ret_READ_FAIL;
10452 	  }
10453 	}
10454 	if (marker_uidx >= chrom_end) {
10455 	  // exploit overflow
10456 	  chrom_fo_idx++;
10457 	  refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &g_is_x, &g_is_y, &uii, &is_haploid);
10458 	  if (!g_is_y) {
10459 	    g_perm_case_ct = case_ct;
10460 	  } else {
10461 	    g_perm_case_ct = case_ct_y;
10462 	  }
10463 	}
10464 	if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
10465 	  goto testmiss_ret_READ_FAIL;
10466 	}
10467 	if (is_haploid && hh_exists) {
10468 	  haploid_fix(hh_exists, sample_hh_include2, sample_hh_male_include2, unfiltered_sample_ct, is_x, g_is_y, (unsigned char*)loadbuf_raw);
10469 	}
10470 	loadbuf_ptr = &(g_loadbuf[block_size * pheno_nm_ctv]);
10471 	extract_collapsed_missing_bitfield(loadbuf_raw, unfiltered_sample_ct, pheno_nm2, pheno_nm_ct, loadbuf_ptr);
10472 	if (g_is_y) {
10473 	  bitvec_and(sex_male_collapsed, pheno_nm_ctl, loadbuf_ptr);
10474 	}
10475 	if (!g_perms_done) {
10476 	  missing_ct = popcount_longs(loadbuf_ptr, pheno_nm_ctl);
10477 	  if (perm_adapt) {
10478 	    g_missing_cts[marker_idx2] = missing_ct;
10479 	  } else {
10480 	    g_missing_cts[marker_idx + block_size] = missing_ct;
10481 	  }
10482 	}
10483 	if (perm_adapt) {
10484 	  g_adapt_m_table[block_size] = marker_idx2++;
10485 	}
10486 	block_size++;
10487 	if (marker_idx + block_size == marker_unstopped_ct) {
10488 	  break;
10489 	}
10490 	marker_uidx++;
10491 	if (IS_SET(marker_exclude, marker_uidx)) {
10492 	  marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
10493 	  if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10494 	    goto testmiss_ret_READ_FAIL;
10495 	  }
10496 	}
10497       } while ((block_size < block_end) && (marker_uidx < marker_uidx_end));
10498       g_block_diff = block_size;
10499       if ((!mperm_dump_all) && ((!g_is_y) || (male_ct == pheno_nm_ct))) {
10500 	if (perm_maxt) {
10501 	  maxt_cur_extreme_stat = g_maxt_extreme_stat[0];
10502 	  for (uii = 1; uii < g_perm_vec_ct; uii++) {
10503 	    dxx = g_maxt_extreme_stat[uii];
10504 	    if (dxx > maxt_cur_extreme_stat) {
10505 	      maxt_cur_extreme_stat = dxx;
10506 	    }
10507 	  }
10508 	}
10509 	// need raw p-values for --mperm-save-all
10510 	// valid case/control counts differ between permutations on Y
10511 	// chromosome, and I won't bother with g_precomp_width just for that
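	// g_precomp_ui holds per-marker count thresholds from
	// fisher22_precomp_pval_bounds(): permuted missing-case counts inside
	// [gpui[0], gpui[1]) contribute nothing (less extreme than the
	// observed table), counts outside [gpui[2], gpui[3]) contribute 2
	// (strictly more extreme), and the remainder contribute 1 (ties), so
	// perm_2success_ct accumulates twice the effective success count.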
10512 	for (uii = 0; uii < block_size; uii++) {
10513 	  if (perm_adapt) {
10514 	    marker_cidx = g_adapt_m_table[uii];
10515 	  } else {
10516 	    marker_cidx = marker_idx + uii;
10517 	  }
10518 	  pval = g_orig_pvals[marker_cidx];
10519 	  missing_ct = g_missing_cts[marker_cidx];
10520 	  if (perm_adapt) {
10521 	    fisher22_precomp_pval_bounds(pval, midp, case_ct, missing_ct, pheno_nm_ct, &(g_precomp_ui[uii * 4]), nullptr);
10522 	  } else {
10523 	    fisher22_precomp_pval_bounds(pval, midp, case_ct, missing_ct, pheno_nm_ct, &(g_precomp_ui[uii * 6]), nullptr);
10524 	    fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, midp, case_ct, missing_ct, pheno_nm_ct, uibuf, &(g_precomp_d[uii * 2]));
10525 	    g_precomp_ui[uii * 6 + 4] = uibuf[2];
10526 	    g_precomp_ui[uii * 6 + 5] = uibuf[3] - uibuf[2];
10527 	  }
10528 	}
10529       }
10530       ulii = 0;
10531       is_last_block = (marker_idx + block_size >= marker_unstopped_ct);
10532       if (perm_adapt) {
10533 	if (spawn_threads2(threads, &testmiss_adapt_thread, max_thread_ct, is_last_block)) {
10534 	  goto testmiss_ret_THREAD_CREATE_FAIL;
10535 	}
10536 	testmiss_adapt_thread((void*)ulii);
10537 	join_threads2(threads, max_thread_ct, is_last_block);
10538       } else {
10539 	g_maxt_block_base = marker_idx;
10540 	if (spawn_threads2(threads, &testmiss_maxt_thread, max_thread_ct, is_last_block)) {
10541 	  goto testmiss_ret_THREAD_CREATE_FAIL;
10542 	}
10543 	testmiss_maxt_thread((void*)ulii);
10544 	join_threads2(threads, max_thread_ct, is_last_block);
10545 	ulii = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
10546 	umm = block_size;
10547 	if (umm > max_thread_ct) {
10548 	  umm = max_thread_ct;
10549 	}
10550 	for (uii = 0; uii < max_thread_ct; uii++) {
10551 	  dptr = &(g_maxt_thread_results[uii * ulii]);
10552 	  ujj = g_perms_done;
10553 	  ukk = ujj + g_perm_vec_ct;
10554 	  for (; ujj < ukk; ujj++) {
10555 	    dxx = *dptr++;
10556 	    if (dxx < g_maxt_extreme_stat[ujj]) {
10557 	      g_maxt_extreme_stat[ujj] = dxx;
10558 	    }
10559 	  }
10560 	}
10561       }
10562       marker_idx += block_size;
10563     } while (marker_idx < marker_unstopped_ct);
10564     if (mperm_dump_all) {
10565       if (g_perms_done) {
10566 	putc_unlocked(' ', stdout);
10567       }
10568       fputs("[dumping stats]", stdout);
10569       fflush(stdout);
10570       ulii = g_perm_vec_ct;
10571       ujj = 1 + g_perms_done;
10572       wptr = g_textbuf;
10573       tbuf2 = &(g_textbuf[MAXLINELEN]);
10574       for (uii = 0; uii < ulii; uii++) {
10575 	wptr = uint32toa(uii + ujj, wptr);
10576 	dptr = &(g_mperm_save_all[uii]);
10577 	for (ukk = 0; ukk < marker_ct; ukk++) {
10578 	  *wptr++ = ' ';
10579 	  dxx = dptr[ukk * ulii];
10580 	  if (dxx >= 0) {
10581 	    wptr = dtoa_g(dxx, wptr);
10582 	  } else {
10583 	    wptr = memcpya(wptr, "NA", 2);
10584 	  }
10585 	  if (wptr >= tbuf2) {
10586 	    if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
10587 	      goto testmiss_ret_WRITE_FAIL;
10588 	    }
10589 	    wptr = g_textbuf;
10590 	  }
10591 	}
10592 	*wptr++ = '\n';
10593       }
10594       if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
10595 	goto testmiss_ret_WRITE_FAIL;
10596       }
10597       fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b               ", stdout);
10598     }
10599     // really should postpone this for --assoc/--model too
10600     g_perms_done += g_perm_vec_ct;
10601     bigstack_reset(g_perm_vecs);
10602     if (g_perms_done < perms_total) {
10603       if (perm_adapt) {
10604 	marker_unstopped_ct = marker_ct - popcount01_longs((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
10605 	if (!marker_unstopped_ct) {
10606 	  goto testmiss_adapt_perm_count;
10607 	}
10608       }
10609       printf("\r%u permutation%s complete.", g_perms_done, (g_perms_done != 1)? "s" : "");
10610       fflush(stdout);
10611       goto testmiss_more_perms;
10612     }
10613     if (perm_adapt) {
10614     testmiss_adapt_perm_count:
10615       g_perms_done = 0;
10616       for (ulii = 0; ulii < marker_ct; ulii++) {
10617 	if (g_perm_attempt_ct[ulii] > g_perms_done) {
10618 	  g_perms_done = g_perm_attempt_ct[ulii];
10619 	  if (g_perms_done == perms_total) {
10620 	    break;
10621 	  }
10622 	}
10623       }
10624     }
10625     putc_unlocked('\r', stdout);
10626     LOGPRINTF("%u %s permutation%s complete.\n", g_perms_done, perm_maxt? "max(T)" : "(adaptive)", (g_perms_done != 1)? "s" : "");
10627     if (perm_adapt) {
10628       memcpy(outname_end2, ".perm", 6);
10629     } else {
10630       if (mperm_save & MPERM_DUMP_BEST) {
10631 	ulii = outname_end - outname;
10632 	memcpy(outname_end, ".mperm.dump.best", 17);
10633 	LOGPRINTFWW("Dumping best permutation p-values to %s .\n", outname);
10634 	if (fopen_checked(outname, "w", &outfile)) {
10635 	  goto testmiss_ret_OPEN_FAIL;
10636 	}
10637 	dxx = 1.0;
10638 	for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
10639 	  if (g_orig_pvals[marker_idx] < dxx) {
10640 	    dxx = g_orig_pvals[marker_idx];
10641 	  }
10642 	}
10643 	memcpy(g_textbuf, "0 ", 2);
10644 	wptr = dtoa_gx(dxx, '\n', &(g_textbuf[2]));
10645 	if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
10646 	  goto testmiss_ret_WRITE_FAIL;
10647 	}
10648 	for (uii = 0; uii < perms_total; uii++) {
10649 	  wptr = uint32toa_x(uii + 1, ' ', g_textbuf);
10650 	  wptr = dtoa_gx(g_maxt_extreme_stat[uii], '\n', wptr);
10651 	  if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
10652 	    goto testmiss_ret_WRITE_FAIL;
10653 	  }
10654 	}
10655 	if (fclose_null(&outfile)) {
10656 	  goto testmiss_ret_WRITE_FAIL;
10657 	}
10658 	memcpy(outname_end, ".missing", 8);
10659       }
10660       memcpy(outname_end2, ".mperm", 7);
10661     }
10662     if (fopen_checked(outname, "w", &outfile)) {
10663       goto testmiss_ret_OPEN_FAIL;
10664     }
10665     if (perm_adapt) {
10666       sprintf(g_textbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
10667     } else {
10668       sprintf(g_textbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
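      // sort the per-permutation minimum p-values so the EMP2 column can be
      // computed below by a rank lookup against each observed p-value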
10669 #ifdef __cplusplus
10670       std::sort(g_maxt_extreme_stat, &(g_maxt_extreme_stat[perms_total]));
10671 #else
10672       qsort(g_maxt_extreme_stat, perms_total, sizeof(double), double_cmp);
10673 #endif
10674     }
10675     /*
10676     if (perm_maxt) {
10677       printf("extreme stats: %g %g\n", g_maxt_extreme_stat[0], g_maxt_extreme_stat[perms_total - 1]);
10678     }
10679     */
10680     fprintf(outfile, g_textbuf, "SNP");
10681     chrom_fo_idx = 0xffffffffU;
10682     marker_uidx = next_unset_unsafe(marker_exclude, 0);
10683     marker_idx = 0;
10684     dyy = 1.0 / ((double)((int32_t)perms_total + 1));
10685     dxx = 0.5 * dyy;
10686     while (1) {
10687       do {
10688 	chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
10689       } while (marker_uidx >= chrom_end);
10690       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
10691       wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
10692       *wptr_start++ = ' ';
10693       wptr_start[plink_maxsnp] = ' ';
10694       for (; marker_uidx < chrom_end;) {
10695 	if (perm_adapt) {
10696 	  pval = ((double)(g_perm_2success_ct[marker_idx] + 2)) / ((double)(2 * (g_perm_attempt_ct[marker_idx] + 1)));
10697 	} else {
10698 	  pval = ((double)(g_perm_2success_ct[marker_idx] + 2)) * dxx;
10699 	}
10700 	if (pval <= pfilter) {
10701 	  fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
10702 	  wptr = &(wptr_start[1 + plink_maxsnp]);
10703 	  if (!perm_count) {
10704 	    wptr = dtoa_g_wxp4x(pval, 12, ' ', wptr);
10705 	  } else {
10706 	    wptr = dtoa_g_wxp4x(((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ', wptr);
10707 	  }
10708 	  if (perm_adapt) {
10709 	    wptr = memseta(wptr, 32, 2);
10710 	    wptr = uint32toa_w10(g_perm_attempt_ct[marker_idx], wptr);
10711 	  } else {
10712 	    // EMP2: rank the observed p-value against the sorted per-permutation minima
10713 	    dzz = (int32_t)(doublearr_greater_than(g_maxt_extreme_stat, perms_total, g_orig_pvals[marker_idx] * (1.0 + EPSILON)) + 1);
10714 	    if (!perm_count) {
10715 	      wptr = dtoa_g_wxp4(dzz * dyy, 12, wptr);
10716 	    } else {
10717 	      wptr = dtoa_g_wxp4(dzz - 1, 12, wptr);
10718 	    }
10719 	  }
10720 	  wptr = memcpya(wptr, " \n", 2);
10721 	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
10722 	    goto testmiss_ret_WRITE_FAIL;
10723 	  }
10724 	}
10725 	if (++marker_idx == marker_ct) {
10726 	  goto testmiss_loop_end;
10727 	}
10728 	marker_uidx++;
10729 	next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
10730       }
10731     }
10732   testmiss_loop_end:
10733     if (fclose_null(&outfile)) {
10734       goto testmiss_ret_WRITE_FAIL;
10735     }
10736     LOGPRINTFWW("Permutation test report written to %s .\n", outname);
10737   }
10738 
10739   while (0) {
10740   testmiss_ret_NOMEM:
10741     retval = RET_NOMEM;
10742     break;
10743   testmiss_ret_OPEN_FAIL:
10744     retval = RET_OPEN_FAIL;
10745     break;
10746   testmiss_ret_READ_FAIL:
10747     retval = RET_READ_FAIL;
10748     break;
10749   testmiss_ret_WRITE_FAIL:
10750     retval = RET_WRITE_FAIL;
10751     break;
10752   testmiss_ret_INVALID_CMDLINE:
10753     retval = RET_INVALID_CMDLINE;
10754     break;
10755   testmiss_ret_THREAD_CREATE_FAIL:
10756     retval = RET_THREAD_CREATE_FAIL;
10757     break;
10758   }
10759  testmiss_ret_1:
10760   bigstack_reset(bigstack_mark);
10761   fclose_cond(outfile);
10762   fclose_cond(outfile_msa);
10763   return retval;
10764 }
10765 
10766 int32_t cluster_assoc_init(const char* flag_name, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uintptr_t* cluster_bitfield, uintptr_t** pheno_nm_11_ptr, uintptr_t** pheno_nm_nonmale_11_ptr, uintptr_t** pheno_nm_male_11_ptr, uint32_t** sample_to_cluster_pheno_ptr, uint32_t** cluster_pheno_gtots_ptr, uint32_t** cur_cluster_pheno_gtots_ptr, uint32_t** cluster_geno_cts_ptr, uintptr_t** loadbuf_raw_ptr, uint32_t* cluster_ct2_ptr) {
10767   uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
10768   uint32_t cluster_ct2 = 0;
10769   uint32_t sample_ct = 0;
10770   uint32_t cluster_end = 0;
10771   uint32_t case_ct_total = 0;
10772   uint32_t is_mh2 = (flag_name[4] == '2'); // yeah, this is a hack
10773   uintptr_t* pheno_nm_nonmale_11 = nullptr;
10774   uintptr_t* pheno_nm_male_11 = nullptr;
10775   uintptr_t* pheno_nm_11;
10776   uint32_t* sample_to_cluster_pheno;
10777   uint32_t* cluster_pheno_gtots;
10778   uint32_t cluster_idx;
10779   uint32_t sample_uidx;
10780   uint32_t ctrl_ct;
10781   uint32_t case_ct;
10782   uint32_t ctrl_male_ct;
10783   uint32_t case_male_ct;
10784   uint32_t uii;
10785   uint32_t ujj;
10786   uint32_t ukk;
10787   if (cluster_ct < 2) {
10788     LOGERRPRINTF("Error: %s requires at least two valid clusters.\n", flag_name);
10789     return RET_INVALID_CMDLINE;
10790   }
10791   // 1. Identify clusters with at least one case and one control, and create
10792   //    new cluster data structures describing only these.
10793   // 2. Main loop efficiently skips homozygous A2s via use of CTZLU, and skips
10794   //    samples not in a valid cluster via application of the pheno_nm_11
10795   //    bitmask.  sample_to_cluster_pheno[] maps sample_uidx to (valid) cluster
10796   //    index (high 31 bits) and case/control status (low bit).
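  //    A hypothetical decode of one entry:
  //      cluster_idx = sample_to_cluster_pheno[sample_uidx] >> 1;
  //      is_case     = sample_to_cluster_pheno[sample_uidx] & 1;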
10797   if (bigstack_calloc_ul(unfiltered_sample_ctl2, pheno_nm_11_ptr) ||
10798       bigstack_alloc_ul(unfiltered_sample_ctl2, pheno_nm_nonmale_11_ptr) ||
10799       bigstack_calloc_ul(unfiltered_sample_ctl2, pheno_nm_male_11_ptr) ||
10800       bigstack_alloc_ui(unfiltered_sample_ct, sample_to_cluster_pheno_ptr) ||
10801       bigstack_alloc_ui(cluster_ct * 4, cluster_pheno_gtots_ptr)) {
10802     return RET_NOMEM;
10803   }
10804   pheno_nm_11 = *pheno_nm_11_ptr;
10805   pheno_nm_nonmale_11 = *pheno_nm_nonmale_11_ptr;
10806   pheno_nm_male_11 = *pheno_nm_male_11_ptr;
10807   sample_to_cluster_pheno = *sample_to_cluster_pheno_ptr;
10808   cluster_pheno_gtots = *cluster_pheno_gtots_ptr;
10809   for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
10810     uii = cluster_end;
10811     cluster_end = cluster_starts[cluster_idx + 1];
10812     for (; uii < cluster_end; uii++) {
10813       sample_uidx = cluster_map[uii];
10814       if (is_set(pheno_nm, sample_uidx)) {
10815 	if (is_mh2) {
10816 	  goto cluster_assoc_init_valid;
10817 	}
10818 	if (is_set(pheno_c, sample_uidx)) {
10819 	  // we have a case, check for a control
10820 	  while (++uii < cluster_end) {
10821 	    sample_uidx = cluster_map[uii];
10822 	    if (is_set(pheno_nm, sample_uidx) && (!is_set(pheno_c, sample_uidx))) {
10823 	      goto cluster_assoc_init_valid;
10824 	    }
10825 	  }
10826 	  continue;
10827 	} else {
10828 	  // we have a control, check for a case
10829 	  while (++uii < cluster_end) {
10830 	    sample_uidx = cluster_map[uii];
10831 	    if (is_set(pheno_c, sample_uidx)) {
10832 	      goto cluster_assoc_init_valid;
10833 	    }
10834 	  }
10835 	  continue;
10836 	}
10837       }
10838     }
10839     continue;
10840   cluster_assoc_init_valid:
10841     for (uii = cluster_starts[cluster_idx], ctrl_ct = 0, ctrl_male_ct = 0, case_ct = 0, case_male_ct = 0; uii < cluster_end; uii++) {
10842       sample_uidx = cluster_map[uii];
10843       if (is_set(pheno_nm, sample_uidx)) {
10844         pheno_nm_11[sample_uidx / BITCT2] |= (3 * ONELU) << (2 * (sample_uidx % BITCT2));
10845 	ukk = is_set(sex_male, sample_uidx);
10846 	if (ukk) {
10847 	  pheno_nm_male_11[sample_uidx / BITCT2] |= (3 * ONELU) << (2 * (sample_uidx % BITCT2));
10848 	}
10849 	ujj = is_set(pheno_c, sample_uidx);
10850 	sample_to_cluster_pheno[sample_uidx] = 2 * cluster_ct2 + ujj;
10851 	if (ujj) {
10852 	  case_ct++;
10853 	  case_male_ct += ukk;
10854 	} else {
10855 	  ctrl_ct++;
10856 	  ctrl_male_ct += ukk;
10857 	}
10858       }
10859     }
10860     cluster_pheno_gtots[4 * cluster_ct2] = ctrl_ct;
10861     cluster_pheno_gtots[4 * cluster_ct2 + 1] = ctrl_male_ct;
10862     cluster_pheno_gtots[4 * cluster_ct2 + 2] = case_ct;
10863     cluster_pheno_gtots[4 * cluster_ct2 + 3] = case_male_ct;
10864     sample_ct += ctrl_ct + case_ct;
10865     case_ct_total += case_ct;
10866     if (cluster_bitfield) {
10867       SET_BIT(cluster_idx, cluster_bitfield);
10868     }
10869     cluster_ct2++;
10870   }
10871   bitvec_andnot_copy(pheno_nm_11, pheno_nm_male_11, unfiltered_sample_ctl2, pheno_nm_nonmale_11);
10872   bigstack_shrink_top(cluster_pheno_gtots, cluster_ct2 * 4 * sizeof(int32_t));
10873   if (bigstack_alloc_ui(cluster_ct2 * 2, cur_cluster_pheno_gtots_ptr) ||
10874       bigstack_alloc_ui(cluster_ct2 * 4, cluster_geno_cts_ptr) ||
10875       bigstack_alloc_ul(unfiltered_sample_ctl2, loadbuf_raw_ptr)) {
10876     return RET_NOMEM;
10877   }
10878   if (cluster_ct2 < 2) {
10879     LOGERRPRINTF("Error: %s requires at least two valid clusters.\n", flag_name);
10880     return RET_INVALID_CMDLINE;
10881   } else if (sample_ct >= 0x40000000) {
10882     // silly, but I'll document this
10883     LOGERRPRINTF("Error: %s does not support >= 2^30 samples.\n", flag_name);
10884     return RET_INVALID_CMDLINE;
10885   }
10886   LOGPRINTF("%s: %u valid clusters, with a total of %u cases and %u controls.\n", flag_name, cluster_ct2, case_ct_total, sample_ct - case_ct_total);
10887   (*loadbuf_raw_ptr)[unfiltered_sample_ctl2 - 1] = 0;
10888   *cluster_ct2_ptr = cluster_ct2;
10889   return 0;
10890 }
10891 
10892 int32_t cluster_assoc_load_one(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_exclude, uintptr_t unfiltered_sample_ct, uintptr_t* sample_hh_include2, uintptr_t* sample_hh_male_include2, uintptr_t* loadbuf_raw, uintptr_t* pheno_nm_11, uintptr_t* pheno_nm_nonmale_11, uintptr_t* pheno_nm_male_11, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uint32_t hh_or_mt_exists, char* chrom_name_buf, uint32_t cluster_ct2, uint32_t* sample_to_cluster_pheno, uint32_t* cluster_pheno_gtots, uint32_t* cur_cluster_pheno_gtots, uint32_t* cluster_geno_cts, uintptr_t* marker_uidx_ptr, uint32_t* chrom_end_ptr, uint32_t* chrom_fo_idx_ptr, uint32_t* min_ploidy_1_ptr, uint32_t* is_x_ptr, uint32_t* is_y_ptr, char** chrom_name_pp, uint32_t* chrom_name_len_ptr) {
10893   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
10894   uintptr_t marker_uidx = *marker_uidx_ptr;
10895   uint32_t min_ploidy_1 = *min_ploidy_1_ptr;
10896   uintptr_t cur_word;
10897   uintptr_t* ulptr;
10898   uintptr_t* ulptr2;
10899   uint32_t chrom_fo_idx;
10900   uint32_t chrom_end;
10901   uint32_t chrom_idx;
10902   uint32_t cpidx;
10903   uint32_t sample_uidx_base;
10904   uint32_t sample_uidx;
10905   uint32_t uii;
10906   uint32_t ujj;
10907   if (IS_SET(marker_exclude, marker_uidx)) {
10908     marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
10909     *marker_uidx_ptr = marker_uidx;
10910     if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10911       return RET_READ_FAIL;
10912     }
10913   }
10914   if (marker_uidx >= (*chrom_end_ptr)) {
10915     chrom_fo_idx = *chrom_fo_idx_ptr;
10916     do {
10917       chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
10918     } while (marker_uidx >= chrom_end);
10919     *chrom_end_ptr = chrom_end;
10920     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
10921     min_ploidy_1 = is_set(chrom_info_ptr->haploid_mask, chrom_idx) || (chrom_idx == (uint32_t)chrom_info_ptr->xymt_codes[MT_OFFSET]);
10922     *chrom_fo_idx_ptr = chrom_fo_idx;
10923     *min_ploidy_1_ptr = min_ploidy_1;
10924     *is_x_ptr = (chrom_idx == (uint32_t)chrom_info_ptr->xymt_codes[X_OFFSET]);
10925     *is_y_ptr = (chrom_idx == (uint32_t)chrom_info_ptr->xymt_codes[Y_OFFSET]);
10926     if (!min_ploidy_1) {
10927       for (cpidx = 0; cpidx < 2 * cluster_ct2; cpidx++) {
10928 	cur_cluster_pheno_gtots[cpidx] = 2 * cluster_pheno_gtots[cpidx * 2];
10929       }
10930     } else if (*is_x_ptr) {
10931       for (cpidx = 0; cpidx < 2 * cluster_ct2; cpidx++) {
10932 	cur_cluster_pheno_gtots[cpidx] = 2 * cluster_pheno_gtots[cpidx * 2] - cluster_pheno_gtots[cpidx * 2 + 1];
10933       }
10934     } else if (*is_y_ptr) {
10935       for (cpidx = 0; cpidx < 2 * cluster_ct2; cpidx++) {
10936 	cur_cluster_pheno_gtots[cpidx] = cluster_pheno_gtots[cpidx * 2 + 1];
10937       }
10938     } else {
10939       for (cpidx = 0; cpidx < 2 * cluster_ct2; cpidx++) {
10940 	cur_cluster_pheno_gtots[cpidx] = cluster_pheno_gtots[cpidx * 2];
10941       }
10942     }
10943     if (chrom_name_len_ptr) {
10944       *chrom_name_pp = chrom_name_buf5w4write(chrom_info_ptr, chrom_idx, chrom_name_len_ptr, chrom_name_buf);
10945     } else {
10946       // --mh2
10947       // chrom_name_buf = g_textbuf in this case, and we return wptr_start
10948       *chrom_name_pp = chrom_name_write(chrom_info_ptr, chrom_idx, chrom_name_buf);
10949       *(*chrom_name_pp)++ = '\t';
10950     }
10951   }
10952   if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
10953     return RET_READ_FAIL;
10954   }
10955   if (IS_SET(marker_reverse, marker_uidx)) {
10956     reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf_raw);
10957   }
10958   if (min_ploidy_1 && hh_or_mt_exists) {
10959     haploid_fix(hh_or_mt_exists, sample_hh_include2, sample_hh_male_include2, unfiltered_sample_ct, *is_x_ptr, *is_y_ptr, (unsigned char*)loadbuf_raw);
10960   }
10961   fill_uint_zero(4 * cluster_ct2, cluster_geno_cts);
10962   ulptr = loadbuf_raw;
10963   ulptr2 = pheno_nm_11;
10964   if ((!min_ploidy_1) || (*is_x_ptr)) {
10965     if (*is_x_ptr) {
10966       ulptr2 = pheno_nm_nonmale_11;
10967     }
10968     for (sample_uidx_base = 0; sample_uidx_base < unfiltered_sample_ct; sample_uidx_base += BITCT2) {
10969       cur_word = (~(*ulptr++)) & (*ulptr2++);
10970       while (cur_word) {
10971 	uii = CTZLU(cur_word) & (BITCT - 2);
10972 	ujj = (cur_word >> uii) & 3;
10973 	sample_uidx = sample_uidx_base + (uii / 2);
10974 	cpidx = sample_to_cluster_pheno[sample_uidx];
10975 	// this does the following branchlessly:
10976 	// 1. increment A1 count by one for heterozygous calls (ujj == 1)
10977 	// 2. increment missing count by two when ujj == 2
10978 	// 3. increment A1 count by two when ujj == 3
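      	// i.e. in the expression below: ujj == 1 -> slot offset 0, add 1;
      	// ujj == 2 -> slot offset 1, add 2; ujj == 3 -> slot offset 0, add 2.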
10979 	cluster_geno_cts[cpidx * 2 + (ujj == 2)] += 2 - (ujj == 1);
10980 	cur_word &= ~((3 * ONELU) << uii);
10981       }
10982     }
10983   }
10984   if (min_ploidy_1) {
10985     ulptr = loadbuf_raw;
10986     if ((*is_x_ptr) || (*is_y_ptr)) {
10987       ulptr2 = pheno_nm_male_11;
10988     }
10989     for (sample_uidx_base = 0; sample_uidx_base < unfiltered_sample_ct; sample_uidx_base += BITCT2) {
10990       cur_word = (~(*ulptr++)) & (*ulptr2++);
10991       while (cur_word) {
10992 	uii = CTZLU(cur_word) & (BITCT - 2);
10993 	ujj = (cur_word >> uii) & 3;
10994 	sample_uidx = sample_uidx_base + (uii / 2);
10995 	cpidx = sample_to_cluster_pheno[sample_uidx];
10996 	// increments A1 count by one, or missing count by one
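      	// (ujj == 3 -> A1 slot, ujj == 2 -> missing slot; heterozygous calls
      	// should already have been cleared by haploid_fix() above)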
10997 	cluster_geno_cts[cpidx * 2 + 3 - ujj] += 1;
10998 	cur_word &= ~((3 * ONELU) << uii);
10999       }
11000     }
11001   }
11002   return 0;
11003 }
11004 
11005 int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t cmh_mperm_val, uint32_t cmh_modifier, double ci_size, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, Aperm_info* apip, uint32_t mperm_save, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t hh_or_mt_exists, Set_info* sip) {
11006   unsigned char* bigstack_mark = g_bigstack_base;
11007   FILE* outfile = nullptr;
11008   FILE* outfile_msa = nullptr;
11009   uintptr_t* sample_hh_include2 = nullptr;
11010   uintptr_t* sample_hh_male_include2 = nullptr;
11011   uint32_t* orig_df = nullptr;
11012   char* chrom_name_ptr = nullptr;
11013   uint32_t breslow_day = cmh_modifier & CLUSTER_CMH_BD;
11014   uint32_t perm_bd = cmh_modifier & CLUSTER_CMH_PERM_BD;
11015   uint32_t chrom_fo_idx = 0xffffffffU; // deliberate overflow
11016   uint32_t chrom_end = 0;
11017   uint32_t chrom_name_len = 0;
11018   uint32_t pct = 0;
11019   uint32_t min_ploidy_1 = 0;
11020   uint32_t is_x = 0;
11021   uint32_t is_y = 0;
11022   int32_t retval = 0;
11023   char chrom_name_buf[3 + MAX_CHROM_TEXTNUM_SLEN];
11024   uintptr_t* pheno_nm_11;
11025   uintptr_t* pheno_nm_nonmale_11;
11026   uintptr_t* pheno_nm_male_11;
11027   uintptr_t* loadbuf_raw;
11028   double* orig_chisq;
11029   double* dptr;
11030   char* wptr;
11031   uint32_t* sample_to_cluster_pheno;
11032   uint32_t* cluster_pheno_gtots;
11033   uint32_t* cur_cluster_pheno_gtots;
11034   uint32_t* cluster_geno_cts;
11035   uint32_t* marker_idx_to_uidx;
11036   uint32_t* uiptr;
11037   uintptr_t marker_uidx;
11038   uintptr_t marker_idx;
11039   double ci_zt;
11040   double allele_ct_recip;
11041   double allele_ctm1_recip;
11042   double ctrl_ctd;
11043   double case_ctd;
11044   double ctrl_a1_ctd;
11045   double ctrl_a2_ctd;
11046   double case_a1_ctd;
11047   double case_a2_ctd;
11048   double a1_ctd;
11049   double a2_ctd;
11050   double mean_case_a1d;
11051   double var_case_a1d;
11052   double cmh_stat;
11053   double cmh_denom;
11054   double r2;
11055   double s2;
11056   double rtot;
11057   double stot;
11058   double v1;
11059   double v2;
11060   double v3;
11061   double odds_ratio;
11062   double se;
11063   double log_or;
11064   double pval;
11065   double one_minus_odds_ratio;
11066   double double_1mor_recip;
11067   double bdx2;
11068   double amax;
11069   double bb;
11070   double discrim;
11071   double as_plus;
11072   double as_minus;
11073   double a_star;
11074   double b_star;
11075   double c_star;
11076   double d_star;
11077   double dxx;
11078   double dyy;
11079   uint32_t cluster_idx;
11080   uint32_t loop_end;
11081   uint32_t ctrl_ct;
11082   uint32_t case_ct;
11083   uint32_t cluster_ct2;
11084   uint32_t allele_ct;
11085   uint32_t uii;
11086   int32_t cur_df;
11087 
11088   // The best data structures for permutation testing are somewhat different
11089   // from those for the single-pass computation, so we separate the logic.
11090 
11091   retval = cluster_assoc_init("--mh/--bd", unfiltered_sample_ct, pheno_nm, pheno_c, sex_male, cluster_ct, cluster_map, cluster_starts, nullptr, &pheno_nm_11, &pheno_nm_nonmale_11, &pheno_nm_male_11, &sample_to_cluster_pheno, &cluster_pheno_gtots, &cur_cluster_pheno_gtots, &cluster_geno_cts, &loadbuf_raw, &cluster_ct2);
11092   if (retval) {
11093     goto cmh_assoc_ret_1;
11094   }
11095   if (breslow_day && (cluster_ct2 > 10) && (!perm_bd)) {
11096     logerrprint("Warning: Breslow-Day statistics are unreliable with a large number of small\nclusters.  You may want to look at empirical p-values from the 'perm-bd'\nadaptive permutation test.\n");
11097   }
11098 
11099   memcpy(outname_end, ".cmh", 5);
11100   if (fopen_checked(outname, "w", &outfile)) {
11101     goto cmh_assoc_ret_OPEN_FAIL;
11102   }
11103   if (ci_size == 0.0) {
11104     ci_size = 0.95;
11105   }
11106   ci_zt = ltqnorm(1 - (1 - ci_size) / 2);
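        // e.g. ci_size = 0.95 gives ltqnorm(0.975) ~= 1.96, the two-sided normal
        // quantile used for the L95/U95 confidence bounds below.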
11107   LOGPRINTFWW5("Writing report to %s ... ", outname);
11108   fputs("0%", stdout);
11109   fflush(stdout);
11110   sprintf(g_textbuf, " CHR %%%us         BP   A1      MAF   A2      CHISQ          P         OR         SE        ", plink_maxsnp);
11111   fprintf(outfile, g_textbuf, "SNP");
11112   uii = (uint32_t)((int32_t)(ci_size * (100 + EPSILON)));
11113   if (uii >= 10) {
11114     fprintf(outfile, "L%u        U%u ", uii, uii);
11115   } else {
11116     fprintf(outfile, " L%u         U%u ", uii, uii);
11117   }
11118   if (breslow_day) {
11119     fputs("  CHISQ_BD       P_BD ", outfile);
11120   }
11121   if (putc_checked('\n', outfile)) {
11122     goto cmh_assoc_ret_WRITE_FAIL;
11123   }
11124   if ((chrom_info_ptr->xymt_codes[MT_OFFSET] != -2) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->xymt_codes[MT_OFFSET])) {
11125     hh_or_mt_exists |= NXMHH_EXISTS;
11126   }
11127   if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_or_mt_exists, 1, pheno_nm, sex_male, &sample_hh_include2, &sample_hh_male_include2)) {
11128     goto cmh_assoc_ret_NOMEM;
11129   }
11130   if (bigstack_alloc_d(marker_ct, &orig_chisq)) {
11131     goto cmh_assoc_ret_NOMEM;
11132   }
11133   if (perm_bd) {
11134     if (bigstack_alloc_ui(marker_ct, &orig_df)) {
11135       goto cmh_assoc_ret_NOMEM;
11136     }
11137   }
11138   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
11139     goto cmh_assoc_ret_READ_FAIL;
11140   }
11141   dptr = orig_chisq;
11142   loop_end = marker_ct / 100;
11143   for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
11144     if (cluster_assoc_load_one(bedfile, bed_offset, marker_exclude, unfiltered_sample_ct, sample_hh_include2, sample_hh_male_include2, loadbuf_raw, pheno_nm_11, pheno_nm_nonmale_11, pheno_nm_male_11, marker_reverse, chrom_info_ptr, hh_or_mt_exists, chrom_name_buf, cluster_ct2, sample_to_cluster_pheno, cluster_pheno_gtots, cur_cluster_pheno_gtots, cluster_geno_cts, &marker_uidx, &chrom_end, &chrom_fo_idx, &min_ploidy_1, &is_x, &is_y, &chrom_name_ptr, &chrom_name_len)) {
11145       goto cmh_assoc_ret_READ_FAIL;
11146     }
11147     cmh_stat = 0.0;
11148     cmh_denom = 0.0;
11149     rtot = 0.0;
11150     stot = 0.0;
11151     v1 = 0.0;
11152     v2 = 0.0;
11153     v3 = 0.0;
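          // The loop below accumulates the standard Cochran-Mantel-Haenszel
          // quantities over per-cluster 2x2 allele-count tables
          // (a = case A1, b = case A2, c = ctrl A1, d = ctrl A2, n = a+b+c+d):
          //   CMH chi-square = [sum_k (a_k - E[a_k])]^2 / sum_k Var[a_k], 1 df,
          //     with E[a_k] = case_k * A1_k / n_k and
          //     Var[a_k] = case_k * ctrl_k * A1_k * A2_k / (n_k^2 * (n_k - 1));
          //   Mantel-Haenszel OR = rtot / stot, r_k = a_k*d_k/n_k, s_k = b_k*c_k/n_k;
          //   v1, v2, v3 feed the Robins-Breslow-Greenland estimate of var(ln OR),
          //   so se = sqrt(v1/(2*rtot^2) + v2/(2*stot^2) + v3/(2*rtot*stot)).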
11154     for (cluster_idx = 0, uiptr = cluster_geno_cts; cluster_idx < cluster_ct2; cluster_idx++, uiptr = &(uiptr[4])) {
11155       ctrl_ct = cur_cluster_pheno_gtots[cluster_idx * 2] - uiptr[1];
11156       case_ct = cur_cluster_pheno_gtots[cluster_idx * 2 + 1] - uiptr[3];
11157       // skip cluster if all controls missing, or all cases missing
11158       if (ctrl_ct && case_ct) {
11159 	allele_ct = ctrl_ct + case_ct;
11160 	allele_ct_recip = 1.0 / ((double)((int32_t)allele_ct));
11161 	allele_ctm1_recip = 1.0 / ((double)((int32_t)(allele_ct - 1)));
11162 	ctrl_ctd = (double)((int32_t)ctrl_ct);
11163 	case_ctd = (double)((int32_t)case_ct);
11164 	ctrl_a1_ctd = (double)((int32_t)uiptr[0]);
11165 	ctrl_a2_ctd = ctrl_ctd - ctrl_a1_ctd;
11166 	case_a1_ctd = (double)((int32_t)uiptr[2]);
11167 	case_a2_ctd = case_ctd - case_a1_ctd;
11168 	a1_ctd = ctrl_a1_ctd + case_a1_ctd;
11169 	a2_ctd = ctrl_a2_ctd + case_a2_ctd;
11170         mean_case_a1d = case_ctd * a1_ctd * allele_ct_recip;
11171 	var_case_a1d = ctrl_ctd * case_ctd * a1_ctd * a2_ctd * allele_ct_recip * allele_ct_recip * allele_ctm1_recip;
11172 	cmh_stat += case_a1_ctd - mean_case_a1d;
11173         cmh_denom += var_case_a1d;
11174 	r2 = case_a1_ctd * ctrl_a2_ctd * allele_ct_recip;
11175 	s2 = case_a2_ctd * ctrl_a1_ctd * allele_ct_recip;
11176         rtot += r2;
11177         stot += s2;
11178 	v1 += allele_ct_recip * r2 * (case_a1_ctd + ctrl_a2_ctd);
11179 	v2 += allele_ct_recip * s2 * (case_a2_ctd + ctrl_a1_ctd);
11180         v3 += allele_ct_recip * ((case_a1_ctd + ctrl_a2_ctd) * s2 + (case_a2_ctd + ctrl_a1_ctd) * r2);
11181       }
11182     }
11183     cmh_stat *= cmh_stat / cmh_denom;
11184     odds_ratio = rtot / stot;
11185     se = sqrt(v1 / (2 * rtot * rtot) + v2 / (2 * stot * stot) + v3 / (2 * rtot * stot));
11186     log_or = log(odds_ratio);
11187     pval = chiprob_p(cmh_stat, 1);
11188     if (cmh_stat >= 0.0) {
11189       *dptr++ = cmh_stat;
11190     } else {
11191       *dptr++ = -9;
11192     }
11193     if ((pfilter == 2.0) || ((pval <= pfilter) && (pval != -9))) {
11194       wptr = memcpyax(g_textbuf, chrom_name_ptr, chrom_name_len, ' ');
11195       wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr);
11196       *wptr++ = ' ';
11197       wptr = uint32toa_w10x(marker_pos[marker_uidx], ' ', wptr);
11198       if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
11199 	goto cmh_assoc_ret_WRITE_FAIL;
11200       }
11201       fputs_w4(marker_allele_ptrs[marker_uidx * 2], outfile);
11202       g_textbuf[0] = ' ';
11203       wptr = dtoa_g_wxp4x(1.0 - set_allele_freqs[marker_uidx], 8, ' ', &(g_textbuf[1]));
11204       if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
11205 	goto cmh_assoc_ret_WRITE_FAIL;
11206       }
11207       fputs_w4(marker_allele_ptrs[marker_uidx * 2 + 1], outfile);
11208       if (realnum(cmh_stat)) {
11209 	g_textbuf[0] = ' ';
11210 	wptr = dtoa_g_wxp4x(cmh_stat, 10, ' ', &(g_textbuf[1]));
11211 	wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 10, ' ', wptr);
11212       } else {
11213         wptr = memcpya(g_textbuf, "         NA         NA ", 23);
11214       }
11215       if (realnum(odds_ratio)) {
11216         wptr = dtoa_g_wxp4x(odds_ratio, 10, ' ', wptr);
11217       } else {
11218 	wptr = memcpya(wptr, "        NA ", 11);
11219       }
11220       if (realnum(se)) {
11221         wptr = dtoa_g_wxp4x(se, 10, ' ', wptr);
11222 	dxx = ci_zt * se;
11223 	dyy = exp(log_or - dxx);
11224 	if (realnum(dyy)) {
11225           wptr = dtoa_g_wxp4x(dyy, 10, ' ', wptr);
11226 	} else {
11227 	  wptr = memcpya(wptr, "        NA ", 11);
11228 	}
11229 	dyy = exp(log_or + dxx);
11230         if (realnum(dyy)) {
11231           wptr = dtoa_g_wxp4x(dyy, 10, ' ', wptr);
11232 	} else {
11233 	  wptr = memcpya(wptr, "        NA ", 11);
11234 	}
11235       } else {
11236 	wptr = memcpya(wptr, "        NA         NA         NA ", 33);
11237       }
11238       if (breslow_day) {
11239 	if (realnum(odds_ratio) && (odds_ratio != 1.0)) {
11240 	  one_minus_odds_ratio = 1.0 - odds_ratio;
11241           double_1mor_recip = 0.5 / one_minus_odds_ratio;
11242 	  bdx2 = 0.0;
11243 	  cur_df = -1;
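            // Per informative cluster, the loop below computes the Breslow-Day
            // fitted case-A1 count a* under the common odds ratio OR = OR_MH, i.e.
            // the admissible root of
            //   (1 - OR)*a*^2 + [ctrl + OR*case - A1*(1 - OR)]*a* - OR*case*A1 = 0,
            // fills in the other cells from the fixed margins, and adds
            // (observed - a*)^2 * (1/a* + 1/b* + 1/c* + 1/d*) to bdx2.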
11244 	  for (cluster_idx = 0, uiptr = cluster_geno_cts; cluster_idx < cluster_ct2; cluster_idx++, uiptr = &(uiptr[4])) {
11245 	    ctrl_ct = cur_cluster_pheno_gtots[cluster_idx * 2] - uiptr[1];
11246 	    case_ct = cur_cluster_pheno_gtots[cluster_idx * 2 + 1] - uiptr[3];
11247 	    if (ctrl_ct && case_ct) {
11248 	      cur_df++;
11249 	      ctrl_ctd = (double)((int32_t)ctrl_ct);
11250 	      case_ctd = (double)((int32_t)case_ct);
11251 	      ctrl_a1_ctd = (double)((int32_t)uiptr[0]);
11252 	      case_a1_ctd = (double)((int32_t)uiptr[2]);
11253 	      a1_ctd = ctrl_a1_ctd + case_a1_ctd;
11254 	      amax = MINV(case_ctd, a1_ctd);
11255 	      bb = ctrl_ctd + case_ctd * odds_ratio - a1_ctd * one_minus_odds_ratio;
11256 	      discrim = sqrt(bb * bb + 4 * one_minus_odds_ratio * odds_ratio * case_ctd * a1_ctd);
11257 	      as_plus = (-bb + discrim) * double_1mor_recip;
11258 	      as_minus = (-bb - discrim) * double_1mor_recip;
11259 	      a_star = ((as_minus <= amax) && (as_minus >= 0))? as_minus : as_plus;
11260               b_star = case_ctd - a_star;
11261               c_star = a1_ctd - a_star;
11262               d_star = ctrl_ctd - a1_ctd + a_star;
11263 
11264               // concordance fix (25 May 2018): print NA,NA instead of inf,0
11265               if ((a_star == 0.0) || (b_star == 0.0) || (c_star == 0.0) || (d_star == 0.0)) {
11266                 goto cmh_assoc_bd_fail;
11267               }
11268 
11269 	      // inverse variance
11270               dxx = 1.0 / a_star + 1.0 / b_star + 1.0 / c_star + 1.0 / d_star;
11271 
11272 	      dyy = case_a1_ctd - a_star;
11273 	      bdx2 += dyy * dyy * dxx;
11274 	    }
11275 	  }
11276 	  pval = chiprob_p(bdx2, cur_df);
11277 	  if (pval > -1) {
11278 	    wptr = dtoa_g_wxp4x(bdx2, 10, ' ', wptr);
11279 	    wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 10, ' ', wptr);
11280 	  } else {
11281 	    goto cmh_assoc_bd_fail;
11282 	  }
11283 	} else {
11284 	cmh_assoc_bd_fail:
11285 	  wptr = memcpya(wptr, "        NA         NA ", 22);
11286 	}
11287       }
11288       *wptr++ = '\n';
11289       if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
11290 	goto cmh_assoc_ret_WRITE_FAIL;
11291       }
11292     }
11293     if (marker_idx >= loop_end) {
11294       if (marker_idx < marker_ct) {
11295 	if (pct >= 10) {
11296 	  putc_unlocked('\b', stdout);
11297 	}
11298 	pct = (marker_idx * 100LLU) / marker_ct;
11299         printf("\b\b%u%%", pct);
11300         fflush(stdout);
11301         loop_end = (((uint64_t)pct + 1LLU) * marker_ct) / 100;
11302       }
11303     }
11304   }
11305   if (fclose_null(&outfile)) {
11306     goto cmh_assoc_ret_WRITE_FAIL;
11307   }
11308   if (pct >= 10) {
11309     putc_unlocked('\b', stdout);
11310   }
11311   fputs("\b\b", stdout);
11312   logprint("done.\n");
11313   if (mtest_adjust) {
11314     if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
11315       goto cmh_assoc_ret_NOMEM;
11316     }
11317     fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
11318     retval = multcomp(outname, outname_end, marker_idx_to_uidx, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, orig_chisq, pfilter, output_min_p, mtest_adjust, 0, adjust_lambda, nullptr, nullptr);
11319   }
11320 
11321   if (cmh_modifier & (CLUSTER_CMH_PERM | CLUSTER_CMH_MPERM)) {
11322     logerrprint("Error: --mh/--bd permutation tests are currently under development.\n");
11323     goto cmh_assoc_ret_INVALID_CMDLINE;
11324   }
11325 
11326   // Given the genotypes at a marker, the following quantities are invariant
11327   // through permutations:
11328   // * set of possibly valid clusters (2+ nonmissing genotypes)
11329   // * allele counts in each cluster
11330   // while the following quantities need to be recomputed:
11331   // * [case x A1] and [case x A2] counts in each cluster (control counts can
11332   //   then be determined via subtraction; but note that [case x A2] CANNOT
11333   //   generally be determined from [case x A1] because the number of cases
11334   //   with missing genotypes may vary, though we could special-case
11335   //   no-missing-genotypes if this ever becomes popular enough to justify the
11336   //   complexity)
11337   //
11338   // To handle both large and small clusters efficiently without too much
11339   // special-casing, we preprocess the raw data so that each cluster's
11340   // genotypes occupy separate words.  (Exception: on 64-bit systems, clusters
11341   // of size <= 16 are stuffed into 4 bytes, to improve memory efficiency.)
11342   // This allows the inner loops to be based on bitwise operations and
11343   // sequential memory accesses.  We also scan for clusters containing only a
11344   // single genotype, or fewer than two nonmissing genotypes, and exclude them
11345   // from the main loop.
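        // (For scale: 16 genotypes x 2 bits = 32 bits, hence the 4-byte packing for
        // clusters of 16 or fewer samples.)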
11346 
11347   // ...
11348 
11349   while (0) {
11350   cmh_assoc_ret_NOMEM:
11351     retval = RET_NOMEM;
11352     break;
11353   cmh_assoc_ret_OPEN_FAIL:
11354     retval = RET_OPEN_FAIL;
11355     break;
11356   cmh_assoc_ret_READ_FAIL:
11357     retval = RET_READ_FAIL;
11358     break;
11359   cmh_assoc_ret_WRITE_FAIL:
11360     retval = RET_WRITE_FAIL;
11361     break;
11362   cmh_assoc_ret_INVALID_CMDLINE:
11363     retval = RET_INVALID_CMDLINE;
11364     break;
11365   }
11366  cmh_assoc_ret_1:
11367   bigstack_reset(bigstack_mark);
11368   fclose_cond(outfile);
11369   fclose_cond(outfile_msa);
11370   return retval;
11371 }
11372 
11373 int32_t cmh2_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t hh_or_mt_exists) {
11374   unsigned char* bigstack_mark = g_bigstack_base;
11375   FILE* outfile = nullptr;
11376   uintptr_t* sample_hh_include2 = nullptr;
11377   uintptr_t* sample_hh_male_include2 = nullptr;
11378   char* wptr_start = nullptr;
11379   uint32_t chrom_fo_idx = 0xffffffffU;
11380   uint32_t chrom_end = 0;
11381   uint32_t pct = 0;
11382   uint32_t min_ploidy_1 = 0;
11383   uint32_t is_x = 0;
11384   uint32_t is_y = 0;
11385   uint32_t cluster_ct1 = 0;
11386   uint32_t ctrl_ct = 0;
11387   uint32_t case_ct = 0;
11388   int32_t retval = 0;
11389   uintptr_t* pheno_nm_11;
11390   uintptr_t* pheno_nm_nonmale_11;
11391   uintptr_t* pheno_nm_male_11;
11392   uintptr_t* loadbuf_raw;
11393   char* wptr;
11394   MATRIX_INVERT_BUF1_TYPE* mi_buf;
11395   double* ty_ctrl;
11396   double* ty_case;
11397   double* n0;
11398   double* u0;
11399   double* v0;
11400   double* dbl_2d_buf;
11401   double* dptr;
11402   double* dptr2;
11403   uint32_t* sample_to_cluster_pheno;
11404   uint32_t* cluster_pheno_gtots;
11405   uint32_t* cur_cluster_pheno_gtots;
11406   uint32_t* cluster_geno_cts;
11407   uint32_t* uiptr;
11408   uintptr_t marker_uidx;
11409   uintptr_t marker_idx;
11410   double ctrl_a1_ctd; // Tx[.][]
11411   double case_a1_ctd;
11412   double ctrl_ctd; // T[.]
11413   double case_ctd;
11414   double cur_ty_ctrl;
11415   double cur_ty_case;
11416   double ctrl_umult;
11417   double case_umult;
11418   double ctrl_vmult; // (Tx[] * (T[] - Tx[])) / (T[] * T[] * (T[]-1))
11419   double case_vmult;
11420   double cur_ctrl_vmult;
11421   double cur_case_vmult;
11422   double chisq;
11423   double dxx;
11424   uint32_t cur_ctrl_ct;
11425   uint32_t cur_case_ct;
11426   uint32_t cluster_ctrl_ct;
11427   uint32_t cluster_case_ct;
11428   uint32_t ctrl_a1_ct;
11429   uint32_t case_a1_ct;
11430   uint32_t cur_cluster_ct;
11431   uint32_t cur_cluster_ctm1;
11432   uint32_t cluster_idx;
11433   uint32_t loop_end;
11434   uint32_t uii;
11435   // no reason to keep X/Y/MT/haploid restriction
11436   retval = cluster_assoc_init("--mh2", unfiltered_sample_ct, pheno_nm, pheno_c, sex_male, cluster_ct, cluster_map, cluster_starts, nullptr, &pheno_nm_11, &pheno_nm_nonmale_11, &pheno_nm_male_11, &sample_to_cluster_pheno, &cluster_pheno_gtots, &cur_cluster_pheno_gtots, &cluster_geno_cts, &loadbuf_raw, &cluster_ct1);
        if (retval) {
          goto cmh2_assoc_ret_1;
        }
11437   for (cluster_idx = 0; cluster_idx < cluster_ct1; cluster_idx++) {
11438     ctrl_ct += cluster_pheno_gtots[4 * cluster_idx];
11439     case_ct += cluster_pheno_gtots[4 * cluster_idx + 2];
11440   }
11441   if ((ctrl_ct < 2) || (case_ct < 2)) {
11442     logerrprint("Error: --mh2 requires at least two cases and two controls.\n");
11443     goto cmh2_assoc_ret_INVALID_CMDLINE;
11444   }
11445 #ifdef __LP64__
11446   if (cluster_ct1 > 46341) {
11447     // might actually be ok, but play it safe in case the LAPACK matrix inversion
11448     // routine has a 32-bit integer overflow here (46341 * 46341 > 2^31 - 1)
11449     // (if/when we do permit this, will need to switch a few variables to type
11450     // uintptr_t)
11451     logerrprint("Error: --mh2 does not currently support more than 46341 clusters.\n");
11452     goto cmh2_assoc_ret_INVALID_CMDLINE;
11453   }
11454 #endif
11455   if (bigstack_alloc_d(cluster_ct1, &ty_ctrl) ||
11456       bigstack_alloc_d(cluster_ct1, &ty_case) ||
11457       bigstack_alloc_d(cluster_ct1 - 1, &n0) ||
11458       bigstack_alloc_d(cluster_ct1 - 1, &u0) ||
11459       bigstack_alloc_d((cluster_ct1 - 1) * (cluster_ct1 - 1), &v0) ||
11460       bigstack_alloc_d((cluster_ct1 - 1) * (cluster_ct1 - 1), &dbl_2d_buf)) {
11461     goto cmh2_assoc_ret_NOMEM;
11462   }
11463   mi_buf = (MATRIX_INVERT_BUF1_TYPE*)bigstack_alloc((cluster_ct1 - 1) * MATRIX_INVERT_BUF1_ELEM_ALLOC);
11464   if (!mi_buf) {
11465     goto cmh2_assoc_ret_NOMEM;
11466   }
11467   if ((chrom_info_ptr->xymt_codes[MT_OFFSET] != -2) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->xymt_codes[MT_OFFSET])) {
11468     hh_or_mt_exists |= NXMHH_EXISTS;
11469   }
11470   if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_or_mt_exists, 1, pheno_nm, sex_male, &sample_hh_include2, &sample_hh_male_include2)) {
11471     goto cmh2_assoc_ret_NOMEM;
11472   }
11473   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
11474     goto cmh2_assoc_ret_READ_FAIL;
11475   }
11476   memcpy(outname_end, ".cmh2", 6);
11477   if (fopen_checked(outname, "w", &outfile)) {
11478     goto cmh2_assoc_ret_OPEN_FAIL;
11479   }
11480   LOGPRINTFWW5("Writing report to %s ... ", outname);
11481   fputs("0%", stdout);
11482   fflush(stdout);
11483   if (fputs_checked("CHR\tSNP\tCHISQ\tDF\tP\n", outfile)) {
11484     goto cmh2_assoc_ret_WRITE_FAIL;
11485   }
11486   loop_end = marker_ct / 100;
11487   for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
11488     if (cluster_assoc_load_one(bedfile, bed_offset, marker_exclude, unfiltered_sample_ct, sample_hh_include2, sample_hh_male_include2, loadbuf_raw, pheno_nm_11, pheno_nm_nonmale_11, pheno_nm_male_11, marker_reverse, chrom_info_ptr, hh_or_mt_exists, g_textbuf, cluster_ct1, sample_to_cluster_pheno, cluster_pheno_gtots, cur_cluster_pheno_gtots, cluster_geno_cts, &marker_uidx, &chrom_end, &chrom_fo_idx, &min_ploidy_1, &is_x, &is_y, &wptr_start, nullptr)) {
11489       goto cmh2_assoc_ret_READ_FAIL;
11490     }
11491     wptr = strcpyax(wptr_start, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
11492     cur_ctrl_ct = 0;
11493     cur_case_ct = 0;
11494     ctrl_a1_ct = 0;
11495     case_a1_ct = 0;
11496     cur_cluster_ct = 0;
11497     for (cluster_idx = 0, uiptr = cluster_geno_cts; cluster_idx < cluster_ct1; cluster_idx++, uiptr = &(uiptr[4])) {
11498       cluster_ctrl_ct = cur_cluster_pheno_gtots[cluster_idx * 2] - uiptr[1];
11499       cluster_case_ct = cur_cluster_pheno_gtots[cluster_idx * 2 + 1] - uiptr[3];
11500       uii = cluster_ctrl_ct + cluster_case_ct;
11501       if (uii) {
11502 	// don't count toward cur_cluster_ct if all observations are missing
11503         n0[cur_cluster_ct] = (double)((int32_t)(uiptr[0] + uiptr[2]));
11504 	ctrl_a1_ct += uiptr[0];
11505         case_a1_ct += uiptr[2];
11506 	cur_ctrl_ct += cluster_ctrl_ct;
11507         cur_case_ct += cluster_case_ct;
11508 	ty_ctrl[cur_cluster_ct] = (double)((int32_t)cluster_ctrl_ct);
11509 	ty_case[cur_cluster_ct] = (double)((int32_t)cluster_case_ct);
11510 	cur_cluster_ct++;
11511       }
11512     }
11513 
11514     // This is always a 2xJx2 test (where J = cluster ct), so we can omit the
11515     // PLINK 1.07 calcMantelHaenszel_IxJxK code, which only comes into play for
11516     // larger I/K values.
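          // Concretely, with J informative clusters: n0[j] (filled above) holds the
          // observed A1 allele count in cluster j, u0[j] (filled below) the expected
          // count Ty_ctrl[j]*(ctrl_A1/ctrl) + Ty_case[j]*(case_A1/case), and v0 the
          // (J-1)x(J-1) null covariance matrix of n0; the reported statistic is
          //   Q = (n0 - u0)' * v0^{-1} * (n0 - u0),  chi-square with J-1 df
          // (one cluster is dropped since its count is determined by the totals).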
11517     if (((!cur_ctrl_ct) && cur_case_ct) || ((!cur_case_ct) && cur_ctrl_ct) || (cur_cluster_ct == 1)) {
11518       // may as well distinguish 0df from other problems
11519       wptr = memcpya(wptr, "0\t0\tNA\n", 7);
11520       goto cmh2_assoc_fail2;
11521     } else if ((cur_ctrl_ct < 2) || (cur_case_ct < 2) || (!cur_cluster_ct)) {
11522       goto cmh2_assoc_fail;
11523     }
11524     cur_cluster_ctm1 = cur_cluster_ct - 1;
11525     ctrl_ctd = (double)((int32_t)cur_ctrl_ct);
11526     case_ctd = (double)((int32_t)cur_case_ct);
11527     ctrl_a1_ctd = (double)((int32_t)ctrl_a1_ct);
11528     case_a1_ctd = (double)((int32_t)case_a1_ct);
11529     ctrl_umult = ctrl_a1_ctd / ctrl_ctd;
11530     case_umult = case_a1_ctd / case_ctd;
11531     ctrl_vmult = ctrl_umult * (ctrl_ctd - ctrl_a1_ctd) / (ctrl_ctd * (ctrl_ctd - 1));
11532     case_vmult = case_umult * (case_ctd - case_a1_ctd) / (case_ctd * (case_ctd - 1));
11533     for (cluster_idx = 0; cluster_idx < cur_cluster_ctm1; cluster_idx++) {
11534       // instead of a two-step process where e.g. U[][] is filled first, and
11535       // then columnwise sums are saved to U0, we just fill U0 directly.
11536       cur_ty_ctrl = ty_ctrl[cluster_idx];
11537       cur_ty_case = ty_case[cluster_idx];
11538       u0[cluster_idx] = cur_ty_ctrl * ctrl_umult + cur_ty_case * case_umult;
11539       cur_ctrl_vmult = -cur_ty_ctrl * ctrl_vmult;
11540       cur_case_vmult = -cur_ty_case * case_vmult;
11541       dptr = &(v0[cluster_idx * cur_cluster_ct]);
11542       // should be guaranteed to be nonnegative, no need for fabs()?
11543       *dptr++ = (cur_ty_ctrl - ctrl_ctd) * cur_ctrl_vmult + (cur_ty_case - case_ctd) * cur_case_vmult;
11544       for (uii = cluster_idx + 1; uii < cur_cluster_ctm1; uii++) {
11545 	*dptr++ = ty_ctrl[uii] * cur_ctrl_vmult + ty_case[uii] * cur_case_vmult;
11546       }
11547     }
11548     for (cluster_idx = 0; cluster_idx < cur_cluster_ctm1; cluster_idx++) {
11549       dptr = &(v0[cluster_idx * cur_cluster_ctm1]);
11550       dptr2 = &(v0[cluster_idx]);
11551       for (uii = 0; uii < cluster_idx; uii++) {
11552 	*dptr++ = dptr2[uii * cur_cluster_ctm1];
11553       }
11554     }
11555 
11556     if (!invert_matrix(cur_cluster_ctm1, v0, mi_buf, dbl_2d_buf)) {
11557       // Q = G'V{-1}G
11558       chisq = 0.0;
11559       for (cluster_idx = 0; cluster_idx < cur_cluster_ctm1; cluster_idx++) {
11560 	dbl_2d_buf[cluster_idx] = n0[cluster_idx] - u0[cluster_idx];
11561       }
11562       dptr = v0;
11563       for (cluster_idx = 0; cluster_idx < cur_cluster_ctm1; cluster_idx++) {
11564 	dxx = 0.0;
11565 	dptr2 = dbl_2d_buf;
11566 	for (uii = 0; uii < cur_cluster_ctm1; uii++) {
11567 	  dxx += (*dptr++) * (*dptr2++);
11568 	}
11569 	chisq += dxx * (dbl_2d_buf[cluster_idx]);
11570       }
11571       wptr = dtoa_gx(chisq, '\t', wptr);
11572       wptr = uint32toa_x(cur_cluster_ctm1, '\t', wptr);
11573       dxx = chiprob_p(chisq, (int32_t)cur_cluster_ctm1);
11574       wptr = dtoa_gx(MAXV(dxx, output_min_p), '\n', wptr);
11575     } else {
11576     cmh2_assoc_fail:
11577       wptr = memcpya(wptr, "NA\tNA\tNA\n", 9);
11578     }
11579   cmh2_assoc_fail2:
11580     if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
11581       goto cmh2_assoc_ret_WRITE_FAIL;
11582     }
11583     if (marker_idx >= loop_end) {
11584       if (marker_idx < marker_ct) {
11585 	if (pct >= 10) {
11586 	  putc_unlocked('\b', stdout);
11587 	}
11588         pct = (marker_idx * 100LLU) / marker_ct;
11589         printf("\b\b%u%%", pct);
11590         fflush(stdout);
11591         loop_end = (((uint64_t)pct + 1LLU) * marker_ct) / 100;
11592       }
11593     }
11594   }
11595   if (fclose_null(&outfile)) {
11596     goto cmh2_assoc_ret_WRITE_FAIL;
11597   }
11598   if (pct >= 10) {
11599     putc_unlocked('\b', stdout);
11600   }
11601   fputs("\b\b", stdout);
11602   logprint("done.\n");
11603   while (0) {
11604   cmh2_assoc_ret_NOMEM:
11605     retval = RET_NOMEM;
11606     break;
11607   cmh2_assoc_ret_OPEN_FAIL:
11608     retval = RET_OPEN_FAIL;
11609     break;
11610   cmh2_assoc_ret_READ_FAIL:
11611     retval = RET_READ_FAIL;
11612     break;
11613   cmh2_assoc_ret_WRITE_FAIL:
11614     retval = RET_WRITE_FAIL;
11615     break;
11616   cmh2_assoc_ret_INVALID_CMDLINE:
11617     retval = RET_INVALID_CMDLINE;
11618     break;
11619   }
       cmh2_assoc_ret_1:
11620   bigstack_reset(bigstack_mark);
11621   fclose_cond(outfile);
11622   return retval;
11623 }
11624 
11625 int32_t homog_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, char* cluster_ids, uintptr_t max_cluster_id_len, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t hh_or_mt_exists) {
11626   unsigned char* bigstack_mark = g_bigstack_base;
11627   unsigned char* bigstack_end_mark = g_bigstack_end;
11628   FILE* outfile = nullptr;
11629   uintptr_t* sample_hh_include2 = nullptr;
11630   uintptr_t* sample_hh_male_include2 = nullptr;
11631   char* writebuf = g_textbuf;
11632   char* chrom_name_ptr = nullptr;
11633   uint32_t cluster_ct2 = 0;
11634   uint32_t chrom_fo_idx = 0xffffffffU;
11635   uint32_t chrom_end = 0;
11636   uint32_t chrom_name_len = 0;
11637   uint32_t pct = 0;
11638   uint32_t min_ploidy_1 = 0;
11639   uint32_t is_x = 0;
11640   uint32_t is_y = 0;
11641   int32_t retval = 0;
11642   char chrom_name_buf[3 + MAX_CHROM_TEXTNUM_SLEN];
11643   uintptr_t* cluster_bitfield;
11644   uintptr_t* pheno_nm_11;
11645   uintptr_t* pheno_nm_nonmale_11;
11646   uintptr_t* pheno_nm_male_11;
11647   uintptr_t* loadbuf_raw;
11648   double* cluster_tables;
11649   double* cluster_chisq;
11650   double* cluster_or;
11651   double* dptr;
11652   char* cluster_ids_collapsed;
11653   char* wptr_start;
11654   char* wptr;
11655   uint32_t* sample_to_cluster_pheno;
11656   uint32_t* cluster_pheno_gtots;
11657   uint32_t* cur_cluster_pheno_gtots;
11658   uint32_t* cluster_geno_cts;
11659   uint32_t* uiptr;
11660   uintptr_t marker_uidx;
11661   uintptr_t marker_idx;
11662   uintptr_t ulii;
11663   double cluster_ct2d;
11664   double cluster_ct2m1d;
11665   double case_ctd;
11666   double ctrl_ctd;
11667   double case_a1_ctd;
11668   double case_a2_ctd;
11669   double ctrl_a1_ctd;
11670   double ctrl_a2_ctd;
11671   double case_a2_recip;
11672   double ctrl_a1_recip;
11673   double ln_or;
11674   double se_sq_recip;
11675   double x_total;
11676   double x_assoc1;
11677   double x_assoc2;
11678   double x_assoc;
11679   double dxx;
11680   uint32_t cluster_idx;
11681   uint32_t loop_end;
11682   ulii = 2 * max_marker_allele_len + MAX_ID_SLEN + max_marker_id_len + max_cluster_id_len + 256;
11683   if (ulii > MAXLINELEN) {
11684     if (bigstack_alloc_c(ulii, &writebuf)) {
11685       goto homog_assoc_ret_NOMEM;
11686     }
11687   }
11688   if (bigstack_end_calloc_ul(BITCT_TO_WORDCT(cluster_ct), &cluster_bitfield)) {
11689     goto homog_assoc_ret_NOMEM;
11690   }
11691   // Common initialization is shared with cmh_assoc() via cluster_assoc_init().
11692   retval = cluster_assoc_init("--homog", unfiltered_sample_ct, pheno_nm, pheno_c, sex_male, cluster_ct, cluster_map, cluster_starts, cluster_bitfield, &pheno_nm_11, &pheno_nm_nonmale_11, &pheno_nm_male_11, &sample_to_cluster_pheno, &cluster_pheno_gtots, &cur_cluster_pheno_gtots, &cluster_geno_cts, &loadbuf_raw, &cluster_ct2);
11693   if (retval) {
11694     goto homog_assoc_ret_1;
11695   }
11696   if (cluster_ct == cluster_ct2) {
11697     cluster_ids_collapsed = cluster_ids;
11698   } else {
11699     if (bigstack_alloc_c(cluster_ct2 * max_cluster_id_len, &cluster_ids_collapsed)) {
11700       goto homog_assoc_ret_NOMEM;
11701     }
11702     for (ulii = 0, cluster_idx = 0; cluster_idx < cluster_ct2; ulii++, cluster_idx++) {
11703       next_set_ul_unsafe_ck(cluster_bitfield, &ulii);
11704       memcpy(&(cluster_ids_collapsed[cluster_idx * max_cluster_id_len]), &(cluster_ids[ulii * max_cluster_id_len]), max_cluster_id_len);
11705     }
11706   }
11707   bigstack_end_reset(bigstack_end_mark);
11708   cluster_ct2d = (double)((int32_t)cluster_ct2);
11709   cluster_ct2m1d = (double)((int32_t)cluster_ct2 - 1);
11710   if (bigstack_alloc_d(cluster_ct2 * 4, &cluster_tables) ||
11711       bigstack_alloc_d(cluster_ct2, &cluster_or) ||
11712       bigstack_alloc_d(cluster_ct2, &cluster_chisq)) {
11713     goto homog_assoc_ret_NOMEM;
11714   }
11715   if (cluster_ct2 > 10) {
11716     logerrprint("Warning: --homog statistics can be unreliable with small clusters.\n");
11717   }
11718 
11719   memcpy(outname_end, ".homog", 7);
11720   if (fopen_checked(outname, "w", &outfile)) {
11721     goto homog_assoc_ret_OPEN_FAIL;
11722   }
11723   LOGPRINTFWW5("Writing report to %s ... ", outname);
11724   fputs("0%", stdout);
11725   fflush(stdout);
11726   // misaligned for backward compatibility
11727   sprintf(g_textbuf, " CHR %%%us   A1   A2      F_A      F_U      N_A      N_U     TEST      CHISQ   DF          P         OR\n", plink_maxsnp);
11728   fprintf(outfile, g_textbuf, "SNP");
11729   if (chrom_info_ptr->xymt_codes[MT_OFFSET] != -2) {
11730     hh_or_mt_exists |= NXMHH_EXISTS;
11731   }
11732   if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_or_mt_exists, 1, pheno_nm, sex_male, &sample_hh_include2, &sample_hh_male_include2)) {
11733     goto homog_assoc_ret_NOMEM;
11734   }
11735   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
11736     goto homog_assoc_ret_READ_FAIL;
11737   }
11738   loop_end = marker_ct / 100;
11739   for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
11740     if (cluster_assoc_load_one(bedfile, bed_offset, marker_exclude, unfiltered_sample_ct, sample_hh_include2, sample_hh_male_include2, loadbuf_raw, pheno_nm_11, pheno_nm_nonmale_11, pheno_nm_male_11, marker_reverse, chrom_info_ptr, hh_or_mt_exists, chrom_name_buf, cluster_ct2, sample_to_cluster_pheno, cluster_pheno_gtots, cur_cluster_pheno_gtots, cluster_geno_cts, &marker_uidx, &chrom_end, &chrom_fo_idx, &min_ploidy_1, &is_x, &is_y, &chrom_name_ptr, &chrom_name_len)) {
11741       goto homog_assoc_ret_READ_FAIL;
11742     }
11743     dptr = cluster_tables;
11744     x_total = 0.0;
11745     x_assoc1 = 0.0;
11746     x_assoc2 = 0.0;
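          // The loop below applies the usual 0.5 (Haldane-Anscombe) continuity
          // correction to each allele-count cell, then computes per-cluster
          //   ln(OR_k) and w_k = 1 / (1/a + 1/b + 1/c + 1/d),
          // accumulating x_total = sum_k w_k*ln(OR_k)^2 ("TOTAL") and
          // x_assoc = (sum_k w_k*ln OR_k)^2 / sum_k w_k ("ASSOC", 1 df); the
          // "HOMOG" statistic reported later is x_total - x_assoc (Woolf's test
          // of homogeneity) with J-1 df.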
11747     for (cluster_idx = 0, uiptr = cluster_geno_cts; cluster_idx < cluster_ct2; cluster_idx++, uiptr = &(uiptr[4])) {
11748       ctrl_ctd = (double)((int32_t)(1 + cur_cluster_pheno_gtots[cluster_idx * 2] - uiptr[1]));
11749       case_ctd = (double)((int32_t)(1 + cur_cluster_pheno_gtots[cluster_idx * 2 + 1] - uiptr[3]));
11750       ctrl_a1_ctd = (double)((int32_t)uiptr[0]) + 0.5;
11751       ctrl_a2_ctd = ctrl_ctd - ctrl_a1_ctd;
11752       case_a1_ctd = (double)((int32_t)uiptr[2]) + 0.5;
11753       case_a2_ctd = case_ctd - case_a1_ctd;
11754       *dptr++ = case_a1_ctd;
11755       *dptr++ = case_a2_ctd;
11756       *dptr++ = ctrl_a1_ctd;
11757       *dptr++ = ctrl_a2_ctd;
11758       case_a2_recip = 1.0 / case_a2_ctd;
11759       ctrl_a1_recip = 1.0 / ctrl_a1_ctd;
11760       dxx = case_a1_ctd * ctrl_a2_ctd * case_a2_recip * ctrl_a1_recip;
11761       cluster_or[cluster_idx] = dxx;
11762       ln_or = log(dxx);
11763       se_sq_recip = 1.0 / ((1.0 / case_a1_ctd) + (1.0 / ctrl_a2_ctd) + case_a2_recip + ctrl_a1_recip);
11764       x_assoc2 += se_sq_recip;
11765       dxx = ln_or * se_sq_recip;
11766       x_assoc1 += dxx;
11767       dxx *= ln_or;
11768       cluster_chisq[cluster_idx] = dxx;
11769       x_total += dxx;
11770     }
11771     x_assoc = x_assoc1 * x_assoc1 / x_assoc2;
11772     wptr_start = memcpyax(writebuf, chrom_name_ptr, chrom_name_len, ' ');
11773     wptr_start = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
11774     *wptr_start++ = ' ';
11775     wptr_start = fw_strcpy(4, marker_allele_ptrs[marker_uidx * 2], wptr_start);
11776     *wptr_start++ = ' ';
11777     wptr_start = fw_strcpy(4, marker_allele_ptrs[marker_uidx * 2 + 1], wptr_start);
11778     *wptr_start++ = ' ';
11779     wptr_start = memcpya(wptr_start, "      NA       NA       NA       NA ", 36);
11780     wptr = memcpya(wptr_start, " TOTAL ", 7);
11781     wptr = dtoa_g_wxp4x(x_total, 10, ' ', wptr);
11782     wptr = uint32toa_w4x(cluster_ct2, ' ', wptr);
11783     wptr = dtoa_g_wxp4x(chiprob_p(x_total, cluster_ct2d), 10, ' ', wptr);
11784     wptr = memcpya(wptr, "        NA\n", 11);
11785     if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
11786       goto homog_assoc_ret_WRITE_FAIL;
11787     }
11788     wptr = memcpya(wptr_start, " ASSOC ", 7);
11789     wptr = dtoa_g_wxp4(x_assoc, 10, wptr);
11790     wptr = memcpya(wptr, "    1 ", 6);
11791     wptr = dtoa_g_wxp4x(chiprob_p(x_assoc, 1), 10, ' ', wptr);
11792     wptr = memcpya(wptr, "        NA\n", 11);
11793     if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
11794       goto homog_assoc_ret_WRITE_FAIL;
11795     }
11796     dxx = x_total - x_assoc;
11797     wptr = memcpya(wptr_start, " HOMOG ", 7);
11798     wptr = dtoa_g_wxp4x(dxx, 10, ' ', wptr);
11799     wptr = uint32toa_w4x(cluster_ct2 - 1, ' ', wptr);
11800     wptr = dtoa_g_wxp4x(chiprob_p(dxx, cluster_ct2m1d), 10, ' ', wptr);
11801     wptr = memcpya(wptr, "        NA\n", 11);
11802     if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
11803       goto homog_assoc_ret_WRITE_FAIL;
11804     }
11805     wptr_start = &(wptr_start[-36]);
11806     for (cluster_idx = 0, dptr = cluster_tables; cluster_idx < cluster_ct2; cluster_idx++, dptr = &(dptr[4])) {
11807       case_ctd = dptr[0] + dptr[1];
11808       ctrl_ctd = dptr[2] + dptr[3];
11809       if ((case_ctd < 1.5) || (ctrl_ctd < 1.5)) {
11810 	wptr = memcpya(wptr_start, "      NA       NA ", 18);
11811 	wptr = dtoa_g_wxp4x(case_ctd - 1, 8, ' ', wptr);
11812 	wptr = dtoa_g_wxp4x(ctrl_ctd - 1, 8, ' ', wptr);
11813 	wptr = fw_strcpy(6, &(cluster_ids_collapsed[cluster_idx * max_cluster_id_len]), wptr);
11814         wptr = memcpya(wptr, "         NA   NA         NA         NA\n", 39);
11815       } else {
11816         wptr = dtoa_g_wxp4x(dptr[0] / case_ctd, 8, ' ', wptr_start);
11817         wptr = dtoa_g_wxp4x(dptr[2] / ctrl_ctd, 8, ' ', wptr);
11818 	wptr = dtoa_g_wxp4x(case_ctd - 1, 8, ' ', wptr);
11819 	wptr = dtoa_g_wxp4x(ctrl_ctd - 1, 8, ' ', wptr);
11820 	wptr = fw_strcpy(6, &(cluster_ids_collapsed[cluster_idx * max_cluster_id_len]), wptr);
11821 	*wptr++ = ' ';
11822 	dxx = cluster_chisq[cluster_idx];
11823 	if (dxx < SMALL_EPSILON * SMALL_EPSILON) {
11824 	  // probably rounding error
11825 	  dxx = 0;
11826 	}
11827         wptr = dtoa_g_wxp4(dxx, 10, wptr);
11828         wptr = memcpya(wptr, "    1 ", 6);
11829 	wptr = dtoa_g_wxp4x(MAXV(chiprob_p(dxx, 1), output_min_p), 10, ' ', wptr);
11830 	dxx = cluster_or[cluster_idx];
11831         if (realnum(dxx)) {
11832           wptr = dtoa_g_wxp4x(dxx, 10, '\n', wptr);
11833 	} else {
11834 	  wptr = memcpya(wptr, "        NA\n", 11);
11835 	}
11836       }
11837       if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
11838 	goto homog_assoc_ret_WRITE_FAIL;
11839       }
11840     }
11841     if (marker_idx >= loop_end) {
11842       if (marker_idx < marker_ct) {
11843 	if (pct >= 10) {
11844 	  putc_unlocked('\b', stdout);
11845 	}
11846         pct = (marker_idx * 100LLU) / marker_ct;
11847         printf("\b\b%u%%", pct);
11848         fflush(stdout);
11849         loop_end = (((uint64_t)pct + 1LLU) * marker_ct) / 100;
11850       }
11851     }
11852   }
11853   if (fclose_null(&outfile)) {
11854     goto homog_assoc_ret_WRITE_FAIL;
11855   }
11856   if (pct >= 10) {
11857     putc_unlocked('\b', stdout);
11858   }
11859   fputs("\b\b", stdout);
11860   logprint("done.\n");
11861   while (0) {
11862   homog_assoc_ret_NOMEM:
11863     retval = RET_NOMEM;
11864     break;
11865   homog_assoc_ret_OPEN_FAIL:
11866     retval = RET_OPEN_FAIL;
11867     break;
11868   homog_assoc_ret_READ_FAIL:
11869     retval = RET_READ_FAIL;
11870     break;
11871   homog_assoc_ret_WRITE_FAIL:
11872     retval = RET_WRITE_FAIL;
11873     break;
11874   }
11875  homog_assoc_ret_1:
11876   bigstack_double_reset(bigstack_mark, bigstack_end_mark);
11877   fclose_cond(outfile);
11878   return retval;
11879 }
11880