1 // This file is part of PLINK 1.90, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "plink_common.h"
19
20 #include "plink_assoc.h"
21 #include "plink_cluster.h"
22 #include "plink_ld.h"
23 #include "plink_matrix.h"
24 #include "plink_perm.h"
25 #include "plink_stats.h"
26
void aperm_init(Aperm_info* apip) {
  // Populate *apip with the default --aperm adaptive-permutation settings.
  apip->interval_slope = 0.001;
  apip->init_interval = 1;
  apip->beta = 0.0001;
  apip->alpha = 0;
  apip->max = 1000000;
  apip->min = 6;
}
35
void single_marker_cc_freqs(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* ctrl_include2, uintptr_t* case_include2, uint32_t* ctrl_setp, uint32_t* ctrl_missingp, uint32_t* case_setp, uint32_t* case_missingp) {
  // Counts the number of A2 alleles and missing calls for both cases and
  // controls, for an autosomal marker.  (The caller is expected to calculate
  // the A1 allele count.)
  // See single_marker_freqs_and_hwe() for discussion.
  //   A := genotype & 0x5555...
  //   B := (genotype >> 1) & 0x5555...
  //   C := A & (~B)
  // missing: popcount(C)
  // A2: [popcount(A) + popcount(B)] - popcount(C)
  uint32_t tot_ctrl_ab = 0;  // running popcount(A) + popcount(B), controls
  uint32_t tot_ctrl_c = 0;   // running popcount(C) (missing), controls
  uint32_t tot_case_ab = 0;
  uint32_t tot_case_c = 0;
  uintptr_t* lptr_end = &(lptr[sample_ctl2]);
  uintptr_t loader;
  uintptr_t loader2;
  uintptr_t loader3;
  uintptr_t loader4;
#ifdef __LP64__
  // Vectorized main loop: up to 60 words (960 bytes) per
  // count_2freq_dbl_960b() call.  The goto re-enters the loop body once with
  // a reduced stride to handle the final partial (still multiple-of-6) chunk.
  uintptr_t cur_decr = 60;
  uintptr_t* lptr_6x_end;
  sample_ctl2 -= sample_ctl2 % 6;  // leftover (<6 words) handled by the scalar loop below
  while (sample_ctl2 >= 60) {
  single_marker_cc_freqs_loop:
    lptr_6x_end = &(lptr[cur_decr]);
    count_2freq_dbl_960b((__m128i*)lptr, (__m128i*)lptr_6x_end, (__m128i*)ctrl_include2, (__m128i*)case_include2, &tot_ctrl_ab, &tot_ctrl_c, &tot_case_ab, &tot_case_c);
    lptr = lptr_6x_end;
    ctrl_include2 = &(ctrl_include2[cur_decr]);
    case_include2 = &(case_include2[cur_decr]);
    sample_ctl2 -= cur_decr;
  }
  if (sample_ctl2) {
    cur_decr = sample_ctl2;
    goto single_marker_cc_freqs_loop;
  }
#else
  // 32-bit build: process 6 words (24 bytes) at a time.
  uintptr_t* lptr_six_end = &(lptr[sample_ctl2 - (sample_ctl2 % 6)]);
  while (lptr < lptr_six_end) {
    count_2freq_dbl_24b(lptr, ctrl_include2, case_include2, &tot_ctrl_ab, &tot_ctrl_c, &tot_case_ab, &tot_case_c);
    lptr = &(lptr[6]);
    ctrl_include2 = &(ctrl_include2[6]);
    case_include2 = &(case_include2[6]);
  }
#endif
  // Scalar cleanup loop for the remaining (<6) words.
  while (lptr < lptr_end) {
    loader = *lptr++;
    loader2 = *ctrl_include2++;  // 01-interleaved control-inclusion mask
    loader3 = loader >> 1;
    loader4 = loader2 & loader;  // A, restricted to controls
    // loader4 + (loader3 & loader2) packs popcount(A)+popcount(B) into 2-bit
    // fields (each field sums to at most 2), so one popcount2_long() suffices.
    tot_ctrl_ab += popcount2_long(loader4 + (loader3 & loader2));
    tot_ctrl_c += popcount2_long(loader4 & (~loader3));  // A & ~B == missing
    loader2 = *case_include2++;
    loader4 = loader2 & loader;
    tot_case_ab += popcount2_long(loader4 + (loader3 & loader2));
    tot_case_c += popcount2_long(loader4 & (~loader3));
  }
  *ctrl_missingp = tot_ctrl_c;
  *ctrl_setp = tot_ctrl_ab - tot_ctrl_c;  // A2 count = (A+B) - missing
  *case_missingp = tot_case_c;
  *case_setp = tot_case_ab - tot_case_c;
}
98
void haploid_single_marker_cc_freqs(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* ctrl_include2, uintptr_t* case_include2, uint32_t* ctrl_setp, uint32_t* ctrl_missingp, uint32_t* case_setp, uint32_t* case_missingp) {
  // Counts the number of A1 and A2 alleles for both cases and controls, for a
  // haploid marker.
  //   A := genotype & 0x5555...
  //   B := (genotype >> 1) & 0x5555...
  //   C := B ^ A
  // missing: popcount(C)
  // A2: popcount(A & B)
  const uintptr_t* geno_stop = &(lptr[sample_ctl2]);
  uint32_t ctrl_a2_ct = 0;
  uint32_t ctrl_miss_ct = 0;
  uint32_t case_a2_ct = 0;
  uint32_t case_miss_ct = 0;
  while (lptr < geno_stop) {
    const uintptr_t geno_word = *lptr++;
    const uintptr_t shifted = geno_word >> 1;
    // genotypes with unequal low/high bits are treated as missing here
    const uintptr_t miss_word = shifted ^ geno_word;
    // 11-genotypes are A2 calls
    const uintptr_t a2_word = geno_word & shifted;
    uintptr_t mask = *ctrl_include2++;  // 01-interleaved control mask
    ctrl_a2_ct += popcount2_long(a2_word & mask);
    ctrl_miss_ct += popcount2_long(miss_word & mask);
    mask = *case_include2++;  // 01-interleaved case mask
    case_a2_ct += popcount2_long(a2_word & mask);
    case_miss_ct += popcount2_long(miss_word & mask);
  }
  *ctrl_setp = ctrl_a2_ct;
  *ctrl_missingp = ctrl_miss_ct;
  *case_setp = case_a2_ct;
  *case_missingp = case_miss_ct;
}
132
void single_marker_cc_3freqs(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* ctrl_include2, uintptr_t* case_include2, uint32_t* ctrl_hom2p, uint32_t* ctrl_hetp, uint32_t* ctrl_missingp, uint32_t* case_hom2p, uint32_t* case_hetp, uint32_t* case_missingp) {
  // Counts the number of heterozygotes, A2 homozygotes, and missing calls for
  // both cases and controls.  Assumes marker is diploid.  The caller is
  // expected to calculate the A1 allele count.
  // See single_marker_freqs_and_hwe() for discussion.
  //   A := genotype & 0x5555...
  //   B := (genotype >> 1) & 0x5555...
  //   C := A & B
  //   popcount(C) = homozyg major ct
  //   popcount(B) = het ct + homozyg major ct
  //   popcount(A) = missing_ct + homozyg major ct
  // hom2: popcount(C)
  // het: popcount(B) - popcount(C)
  // missing: popcount(A) - popcount(C)
  uint32_t tot_ctrl_a = 0;
  uint32_t tot_ctrl_b = 0;
  uint32_t tot_ctrl_c = 0;
  uint32_t tot_case_a = 0;
  uint32_t tot_case_b = 0;
  uint32_t tot_case_c = 0;
  uintptr_t* lptr_end = &(lptr[sample_ctl2]);
  uintptr_t loader;
  uintptr_t loader2;
  uintptr_t loader3;
#ifdef __LP64__
  // Vectorized main loop: up to 120 words (1920 bytes) per
  // count_3freq_1920b() call; the goto re-enters once with a reduced stride
  // for the final partial (still multiple-of-12) chunk.
  uintptr_t cur_decr = 120;
  uintptr_t* lptr_12x_end;
  sample_ctl2 -= sample_ctl2 % 12;  // leftover handled by the scalar loop below
  while (sample_ctl2 >= 120) {
  single_marker_cc_3freqs_loop:
    lptr_12x_end = &(lptr[cur_decr]);
    count_3freq_1920b((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)ctrl_include2, &tot_ctrl_a, &tot_ctrl_b, &tot_ctrl_c);
    count_3freq_1920b((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)case_include2, &tot_case_a, &tot_case_b, &tot_case_c);
    lptr = lptr_12x_end;
    ctrl_include2 = &(ctrl_include2[cur_decr]);
    case_include2 = &(case_include2[cur_decr]);
    sample_ctl2 -= cur_decr;
  }
  if (sample_ctl2) {
    cur_decr = sample_ctl2;
    goto single_marker_cc_3freqs_loop;
  }
#else
  // 32-bit build: process 12 words (48 bytes) at a time.
  uintptr_t* lptr_twelve_end = &(lptr[sample_ctl2 - (sample_ctl2 % 12)]);
  while (lptr < lptr_twelve_end) {
    count_3freq_48b(lptr, ctrl_include2, &tot_ctrl_a, &tot_ctrl_b, &tot_ctrl_c);
    count_3freq_48b(lptr, case_include2, &tot_case_a, &tot_case_b, &tot_case_c);
    lptr = &(lptr[12]);
    ctrl_include2 = &(ctrl_include2[12]);
    case_include2 = &(case_include2[12]);
  }
#endif
  // Scalar cleanup loop for the remaining (<12) words.
  while (lptr < lptr_end) {
    // A := genotype & 0x5555...
    // B := (genotype >> 1) & 0x5555...
    // C := A & B
    // popcount(C) = homozyg major ct
    // popcount(B) = het ct + homozyg major ct
    // popcount(A) = missing_ct + homozyg major ct
    loader = *lptr++;
    loader2 = *ctrl_include2++;  // 01-interleaved control mask
    loader3 = (loader >> 1) & loader2;  // B, controls only
    loader2 &= loader;  // A, controls only
    tot_ctrl_a += popcount2_long(loader2);
    tot_ctrl_b += popcount2_long(loader3);
    tot_ctrl_c += popcount2_long(loader2 & loader3);
    loader2 = *case_include2++;
    loader3 = (loader >> 1) & loader2;
    loader2 &= loader;
    tot_case_a += popcount2_long(loader2);
    tot_case_b += popcount2_long(loader3);
    tot_case_c += popcount2_long(loader2 & loader3);
  }
  *ctrl_hom2p = tot_ctrl_c;
  *ctrl_hetp = tot_ctrl_b - tot_ctrl_c;
  *ctrl_missingp = tot_ctrl_a - tot_ctrl_c;
  *case_hom2p = tot_case_c;
  *case_hetp = tot_case_b - tot_case_c;
  *case_missingp = tot_case_a - tot_case_c;
}
213
static inline void adjust_print(double pval, double output_min_p, const char* output_min_p_str, uint32_t output_min_p_strlen, char** bufpp) {
  // Append one 11-character report field (10-char right-justified value plus
  // one trailing space) to *bufpp, advancing the write pointer.
  // pval < 0 is the upstream "not available" sentinel.
  if (pval < 0) {
    // bugfix: the NA literal must be exactly 11 characters to match the
    // memcpya byte count; a shorter literal causes an out-of-bounds read.
    *bufpp = memcpya(*bufpp, "        NA ", 11);
  } else if (pval <= output_min_p) {
    // clamp to the precomputed minimum-p string
    *bufpp = memcpya(*bufpp, output_min_p_str, output_min_p_strlen);
  } else {
    *bufpp = dtoa_g_wxp4x(pval, 10, ' ', *bufpp);
  }
}
223
static inline void adjust_print_log10(double pval, double output_min_p, const char* output_min_logp_str, uint32_t output_min_logp_strlen, char** bufpp) {
  // Append one 11-character -log10(p) report field to *bufpp, advancing the
  // write pointer.  pval < 0 is the upstream "not available" sentinel; p >= 1
  // is printed as 0 (since -log10 would be <= 0).
  if (pval < 0) {
    // bugfix: literals must be exactly 11 characters to match the memcpya
    // byte counts; shorter literals cause out-of-bounds reads.
    *bufpp = memcpya(*bufpp, "        NA ", 11);
  } else if (pval <= output_min_p) {
    *bufpp = memcpya(*bufpp, output_min_logp_str, output_min_logp_strlen);
  } else if (pval < 1) {
    *bufpp = dtoa_g_wxp4x(-log10(pval), 10, ' ', *bufpp);
  } else {
    *bufpp = memcpya(*bufpp, "         0 ", 11);
  }
}
235
int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintptr_t chi_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, double* chi, double pfilter, double output_min_p, uint32_t mtest_adjust, uint32_t skip_gc, double adjust_lambda, uint32_t* tcnt, double* pvals) {
  // Writes the --adjust report (<outname>.adjusted): for each valid test, the
  // unadjusted p-value plus genomic-control, Bonferroni, Holm, Sidak
  // single-step/step-down, and Benjamini-Hochberg/Yekutieli adjusted values.
  // Association statistics can be provided in three ways:
  // 1. Just p-values (pvals[]).
  // 2. T statistics (in chi[]) and dfs (in tcnt[]).
  // 3. 1df chi-square stats (in chi[]).
  // Returns 0 on success, or RET_NOMEM / RET_OPEN_FAIL / RET_WRITE_FAIL.
  unsigned char* bigstack_mark = g_bigstack_base;  // arena mark; restored on exit
  uint32_t is_log10 = mtest_adjust & ADJUST_LOG10;
  uint32_t qq_plot = mtest_adjust & ADJUST_QQ;
  FILE* outfile = nullptr;
  double pv_holm = 0.0;      // running Holm step-down maximum
  double pv_sidak_sd = 0;    // running Sidak step-down maximum
  int32_t retval = 0;
  uint32_t is_set_test = !plink_maxsnp;  // set-based test: no CHR column, no GC
  uint32_t adjust_gc = (mtest_adjust & ADJUST_GC) && (!skip_gc);
  uint32_t output_min_p_strlen = 11;
  uint32_t uii = 0;  // valid-test counter during input scan; reused later
  uint32_t* new_tcnt = nullptr;
  double* unadj = nullptr;   // original p-values when GC replaces sp
  char output_min_p_str[16];
  uint32_t pct;
  double* sp;      // sorted p-values (ascending after qsort)
  double* schi;    // matching chi-square stats (sorted ascending separately)
  double* pv_bh;   // Benjamini-Hochberg adjusted values
  double* pv_by;   // Benjamini-Yekutieli adjusted values
  uint32_t* new_order;  // marker uidx for each retained test, sorted with sp
  uint32_t cur_idx;
  uintptr_t marker_uidx;
  double dxx;
  double dyy;
  double dzz;
  double harmonic_sum;  // sum_{k=1..n} n/k, for the BY correction
  double dct;           // (double)chi_ct
  double pval;
  double unadj_pval;
  double* pv_gc;   // genomic-control adjusted p-values
  double lambda_recip;
  double bonf;
  double pv_sidak_ss;
  char* bufptr;
  uint32_t ujj;
  uint32_t loop_end;

  if (bigstack_alloc_d(chi_ct, &sp) ||
      bigstack_alloc_d(chi_ct, &schi) ||
      bigstack_alloc_ui(chi_ct, &new_order)) {
    goto multcomp_ret_NOMEM;
  }
  // Input scan: convert whichever representation was supplied into parallel
  // (p-value, chi-square, marker uidx) arrays, dropping invalid entries.
  if (pvals) {
    // mode 1: p-values given; derive 1df chi-square equivalents.
    for (cur_idx = 0; cur_idx < chi_ct; cur_idx++) {
      dxx = pvals[cur_idx];
      dyy = inverse_chiprob(dxx, 1);
      if (dyy >= 0) {
	sp[uii] = dxx;
	new_order[uii] = marker_uidxs[cur_idx];
	schi[uii] = dyy;
	uii++;
      }
    }
  } else if (tcnt) {
    // mode 2: t statistics + degrees of freedom.
    if (bigstack_alloc_ui(chi_ct, &new_tcnt)) {
      goto multcomp_ret_NOMEM;
    }
    for (cur_idx = 0; cur_idx < chi_ct; cur_idx++) {
      ujj = tcnt[cur_idx];
      if (ujj) {
	dxx = chi[cur_idx]; // not actually squared
	dyy = calc_tprob(dxx, ujj);
	if (dyy > -1) {
	  sp[uii] = dyy;
	  new_order[uii] = marker_uidxs[cur_idx];
	  schi[uii] = dxx * dxx;
	  new_tcnt[uii] = ujj;
	  uii++;
	}
      }
    }
  } else {
    // mode 3: 1df chi-square stats.
    for (cur_idx = 0; cur_idx < chi_ct; cur_idx++) {
      dxx = chi[cur_idx];
      if (dxx >= 0) {
	dyy = chiprob_p(dxx, 1);
	if (dyy > -1) {
	  sp[uii] = dyy;
	  new_order[uii] = marker_uidxs[cur_idx];
	  schi[uii] = dxx;
	  uii++;
	}
      }
    }
  }
  chi_ct = uii;  // from here on, chi_ct is the number of VALID tests
  if (!chi_ct) {
    logprint("Zero valid tests; --adjust skipped.\n");
    goto multcomp_ret_1;
  }
  // Sort p-values ascending, dragging marker indices (and dfs) along.
  if (qsort_ext((char*)sp, chi_ct, sizeof(double), double_cmp_deref, (char*)new_order, sizeof(int32_t))) {
    goto multcomp_ret_NOMEM;
  }
  if (tcnt) {
    if (qsort_ext((char*)schi, chi_ct, sizeof(double), double_cmp_deref, (char*)new_tcnt, sizeof(int32_t))) {
      goto multcomp_ret_NOMEM;
    }
  } else {
#ifdef __cplusplus
    std::sort(schi, &(schi[chi_ct]));
#else
    qsort(schi, chi_ct, sizeof(double), double_cmp);
#endif
  }
  dct = chi_ct;

  // get lambda...
  if (skip_gc) {
    lambda_recip = 1.0;
  } else if (mtest_adjust & ADJUST_LAMBDA) {
    lambda_recip = adjust_lambda;
  } else {
    // lambda = median observed chi-square / 0.456 (median of the 1df
    // chi-square distribution), clamped below at 1.
    if (chi_ct & 1) {
      lambda_recip = schi[(chi_ct - 1) / 2];
    } else {
      lambda_recip = (schi[chi_ct / 2 - 1] + schi[chi_ct / 2]) * 0.5;
    }
    lambda_recip = lambda_recip / 0.456;
    if (lambda_recip < 1.0) {
      lambda_recip = 1.0;
    }
    LOGPRINTF("--adjust: Genomic inflation est. lambda (based on median chisq) = %g.\n", lambda_recip);
  }
  // ...now take the reciprocal (bugfix: forgot to do this with --lambda)
  if (lambda_recip > 1.0) {
    lambda_recip = 1.0 / lambda_recip;
  }

  // handle reverse-order calculations
  if (bigstack_alloc_d(chi_ct, &pv_bh) ||
      bigstack_alloc_d(chi_ct, &pv_by) ||
      bigstack_alloc_d(chi_ct, &pv_gc)) {
    goto multcomp_ret_NOMEM;
  }
  if (adjust_gc) {
    // GC-corrected p-values become the primary column; keep originals in
    // unadj for the UNADJ column.
    unadj = sp;
    sp = pv_gc;
  }
  // GC pass: schi is sorted ascending, so walking it backwards yields
  // GC p-values in ascending order, matching sp's sort order.
  uii = chi_ct;
  if (tcnt) {
    for (cur_idx = 0; cur_idx < chi_ct; cur_idx++) {
      uii--;
      pv_gc[cur_idx] = calc_tprob(sqrt(schi[uii] * lambda_recip), new_tcnt[uii]);
    }
  } else {
    for (cur_idx = 0; cur_idx < chi_ct; cur_idx++) {
      pv_gc[cur_idx] = chiprob_p(schi[--uii] * lambda_recip, 1);
    }
  }

  // Benjamini-Hochberg: pv_bh[i] = min_{j>=i} (n/(j+1)) * p_(j+1), computed
  // with a backwards running-minimum pass.  The same pass accumulates the
  // harmonic sum needed for Benjamini-Yekutieli below.
  dyy = sp[chi_ct - 1];
  pv_bh[chi_ct - 1] = dyy;
  harmonic_sum = 1.0;
  for (cur_idx = chi_ct - 1; cur_idx > 0; cur_idx--) {
    dzz = dct / ((double)((int32_t)cur_idx));
    harmonic_sum += dzz;
    dxx = dzz * sp[cur_idx - 1];
    if (dyy > dxx) {
      dyy = dxx;
    }
    pv_bh[cur_idx - 1] = dyy;
  }

  dzz = 1.0 / dct;  // reused later as the QQ-plot quantile step
  harmonic_sum *= dzz;

  // Benjamini-Yekutieli: like BH but scaled by the harmonic sum, again via a
  // backwards running minimum, capped at 1.
  dyy = harmonic_sum * sp[chi_ct - 1];
  if (dyy >= 1) {
    dyy = 1;
  }
  pv_by[chi_ct - 1] = dyy;
  harmonic_sum *= dct;
  for (cur_idx = chi_ct - 1; cur_idx > 0; cur_idx--) {
    dxx = (harmonic_sum / ((double)((int32_t)cur_idx))) * sp[cur_idx - 1];
    if (dyy > dxx) {
      dyy = dxx;
    }
    pv_by[cur_idx - 1] = dyy;
  }

  uii = strlen(outname_end);
  memcpy(&(outname_end[uii]), ".adjusted", 10);  // 10 bytes: includes the NUL
  if (fopen_checked(outname, "w", &outfile)) {
    goto multcomp_ret_OPEN_FAIL;
  }
  // Header line.
  // NOTE(review): the format/literal spacing below looks whitespace-collapsed
  // relative to the 11-char output fields written later (e.g. "        GC ");
  // confirm column widths against the upstream source before shipping.
  if (!is_set_test) {
    sprintf(g_textbuf, " CHR %%%us UNADJ %s", plink_maxsnp, skip_gc? "" : " GC ");
    fprintf(outfile, g_textbuf, "SNP");
  } else {
    plink_maxsnp = max_marker_id_len - 1;
    if (plink_maxsnp < 3) {
      plink_maxsnp = 3;
    }
    sprintf(g_textbuf, " %%%us UNADJ ", plink_maxsnp);
    fprintf(outfile, g_textbuf, "SET");
  }
  if (qq_plot) {
    fputs(" QQ ", outfile);
  }
  if (fputs_checked(" BONF HOLM SIDAK_SS SIDAK_SD FDR_BH FDR_BY\n", outfile)) {
    goto multcomp_ret_WRITE_FAIL;
  }
  fputs("0%", stdout);
  fflush(stdout);
  cur_idx = 0;
  // Precompute the string used when a value underflows output_min_p.
  // NOTE(review): " INF " appears shorter than the 11-byte memcpy; the
  // literal has presumably lost padding spaces ("       INF ") -- confirm.
  if (!is_log10) {
    if (output_min_p == 0.0) {
      memcpy(output_min_p_str, " INF ", 11);
    } else {
      bufptr = dtoa_g_wxp4x(output_min_p, 10, ' ', output_min_p_str);
      output_min_p_strlen = (uintptr_t)(bufptr - output_min_p_str);
    }
  } else {
    if (output_min_p == 0.0) {
      memcpy(output_min_p_str, " INF ", 11);
    } else {
      bufptr = dtoa_g_wxp4x(-log10(output_min_p), 10, ' ', output_min_p_str);
      output_min_p_strlen = (uintptr_t)(bufptr - output_min_p_str);
    }
  }
  // Main output loop, in ascending p-value order, with a percentage display.
  for (pct = 1; pct <= 100; pct++) {
    loop_end = (((uint64_t)pct) * chi_ct) / 100LLU;
    for (; cur_idx < loop_end; cur_idx++) {
      pval = sp[cur_idx];
      // if --pfilter specified, filter out both nan and negative pvals, since
      // both are currently used by upstream functions
      if ((pfilter != 2.0) && ((!(pval >= 0.0)) || (pval > pfilter))) {
	continue;
      }
      if (adjust_gc) {
	unadj_pval = unadj[cur_idx];
      } else {
	unadj_pval = pval;
      }
      marker_uidx = new_order[cur_idx];
      if (!is_set_test) {
	bufptr = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, get_variant_chrom(chrom_info_ptr, marker_uidx), g_textbuf));
      } else {
	bufptr = g_textbuf;
      }
      *bufptr++ = ' ';
      bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
      *bufptr++ = ' ';
      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
	goto multcomp_ret_WRITE_FAIL;
      }
      // Bonferroni: n * p, capped at 1.
      bonf = pval * dct;
      if (bonf > 1) {
	bonf = 1;
      }
      // Holm step-down: running maximum of (n - rank) * p, saturating at 1.
      if (pv_holm < 1) {
	dyy = (chi_ct - cur_idx) * pval;
	if (dyy > 1) {
	  pv_holm = 1;
	} else if (pv_holm < dyy) {
	  pv_holm = dyy;
	}
      }
      // avoid catastrophic cancellation for small p-values
      // 1 - (1-p)^c = 1 - e^{c log(1-p)}
      // 2^{-7} threshold is arbitrary
      if (pval >= 0.0078125) {
	pv_sidak_ss = 1 - pow(1 - pval, dct);
	dyy = 1 - pow(1 - pval, dct - ((double)((int32_t)cur_idx)));
      } else {
	pv_sidak_ss = 1 - exp(dct * log1p(-pval));
	dyy = dct - (double)((int32_t)cur_idx);
	dyy = 1 - exp(dyy * log1p(-pval));
      }
      // Sidak step-down is the running maximum over ranks.
      if (pv_sidak_sd < dyy) {
	pv_sidak_sd = dyy;
      }

      bufptr = g_textbuf;
      if (!is_log10) {
	adjust_print(unadj_pval, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	if (!skip_gc) {
	  adjust_print(pv_gc[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	}
	if (qq_plot) {
	  // expected quantile: (rank + 0.5) / n
	  bufptr = dtoa_g_wxp4x((((double)((int32_t)cur_idx)) + 0.5) * dzz, 10, ' ', bufptr);
	}
	adjust_print(bonf, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	adjust_print(pv_holm, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	adjust_print(pv_sidak_ss, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	adjust_print(pv_sidak_sd, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	adjust_print(pv_bh[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	adjust_print(pv_by[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
      } else {
	adjust_print_log10(pval, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	if (!is_set_test) {
	  adjust_print_log10(pv_gc[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	}
	if (qq_plot) {
	  // quasi-bugfix (23 Mar 2018): this should be logscale, both for
	  // consistency with plink 1.07 and because it makes more sense
	  bufptr = dtoa_g_wxp4x(-log10((((double)((int32_t)cur_idx)) + 0.5) * dzz), 10, ' ', bufptr);
	}
	adjust_print_log10(bonf, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	adjust_print_log10(pv_holm, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	adjust_print_log10(pv_sidak_ss, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	adjust_print_log10(pv_sidak_sd, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	adjust_print_log10(pv_bh[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
	adjust_print_log10(pv_by[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
      }
      *bufptr++ = '\n';
      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
	goto multcomp_ret_WRITE_FAIL;
      }
    }
    if (pct < 100) {
      if (pct > 10) {
	putc_unlocked('\b', stdout);
      }
      printf("\b\b%u%%", pct);
      fflush(stdout);
    }
  }
  fputs("\b\b\b", stdout);
  LOGPRINTFWW("--adjust values (%" PRIuPTR " %s%s) written to %s .\n", chi_ct, is_set_test? "nonempty set" : "variant", (chi_ct == 1)? "" : "s", outname);

  while (0) {
  multcomp_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  multcomp_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  multcomp_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
  }
 multcomp_ret_1:
  fclose_cond(outfile);
  bigstack_reset(bigstack_mark);  // release all arena allocations made here
  return retval;
}
578
char* model_assoc_tna(uint32_t model_fisher, char* wptr) {
  // write terminal NAs to buffer; returns the advanced write pointer.
  // bugfix: the literals must contain exactly the number of bytes the
  // memcpya() calls copy (13 and 31); shorter literals cause out-of-bounds
  // reads of the string constants.
  if (model_fisher) {
    // single right-justified P column + newline
    return memcpya(wptr, "          NA\n", 13);
  } else {
    // CHISQ, DF, and P columns + newline
    return memcpya(wptr, "          NA   NA           NA\n", 31);
  }
}
587
void calc_git(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __restrict__ loadbuf, uint32_t* perm_vecst, uint32_t* results_bufs, uint32_t* thread_wkspace) {
  // Brian Browning's genotype indexing algorithm for low-MAF (and low missing
  // frequency) markers.
  // We accelerate it by using a special interleaved permutation representation
  // which supports vector addition without occupying extra space: see
  // generate_cc_perm_vec().  Counting the number of e.g. case heterozygote
  // genotypes across all permutations then proceeds as follows:
  // 1. For the first 15 heterozygote samples, just use 4-bit accumulators.
  //    This allows the inner loop to increment 32 counters simultaneously.
  // 2. Right before they'd otherwise be at risk of overflowing, we unfold the
  //    4-bit accumulators into a larger buffer of 8-bit accumulators.  Then we
  //    zero out the 4-bit accumulators, and restart the inner loop.
  // 3. This can happen up to 17 times before the 8-bit accumulators risk
  //    overflow.  Then, they are unfolded into the final output array of
  //    32-bit ints, zeroed out, and the second loop restarts.
  // Note that results_bufs[] is assumed to be zeroed out before this function
  // is called.
  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
  uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
#ifdef __LP64__
  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
  uint32_t perm_ct128x4 = perm_ct128 * 4;
  uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
  uint32_t perm_ct16x4 = 4 * perm_ct16;
  __m128i* permsv = (__m128i*)perm_vecst;
  __m128i* gitv[9];
#else
  uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
  uint32_t perm_ct32x4 = perm_ct32 * 4;
  uint32_t perm_ct8 = (perm_vec_ct + 7) / 8;
  uint32_t perm_ct4 = (perm_vec_ct + 3) / 4;
  uint32_t perm_ct16x16 = 16 * perm_ct16;
  uintptr_t* permsv = (uintptr_t*)perm_vecst;
  uintptr_t* gitv[9];
#endif
  uint32_t cur_cts[3];  // per-class sample counts, drive the unfold cadence
  uintptr_t ulii;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  uint32_t sample_type;
  // gitv[] layout: slots 0-2 are the 4-bit accumulators for the three
  // non-hom-A2 genotype classes, slots 3-5 the matching 8-bit accumulators
  // (all in thread_wkspace), and slots 6-8 the final 32-bit outputs in
  // results_bufs (note the reversed row order: class 0 writes to the last
  // output row, class 2 to the first).
#ifdef __LP64__
  // 4- and 8-bit partial counts
  gitv[0] = (__m128i*)thread_wkspace;
  gitv[1] = &(((__m128i*)thread_wkspace)[perm_ct128x4]);
  gitv[2] = &(((__m128i*)thread_wkspace)[2 * perm_ct128x4]);
  gitv[3] = &(((__m128i*)thread_wkspace)[3 * perm_ct128x4]);
  gitv[4] = &(((__m128i*)thread_wkspace)[3 * perm_ct128x4 + 2 * perm_ct32]);
  gitv[5] = &(((__m128i*)thread_wkspace)[3 * perm_ct128x4 + 4 * perm_ct32]);
  gitv[6] = &(((__m128i*)results_bufs)[2 * perm_ct16x4]);
  gitv[7] = &(((__m128i*)results_bufs)[perm_ct16x4]);
  gitv[8] = (__m128i*)results_bufs;
#else
  gitv[0] = (uintptr_t*)thread_wkspace;
  gitv[1] = (uintptr_t*)(&(thread_wkspace[perm_ct32x4]));
  gitv[2] = (uintptr_t*)(&(thread_wkspace[2 * perm_ct32x4]));
  gitv[3] = (uintptr_t*)(&(thread_wkspace[3 * perm_ct32x4]));
  gitv[4] = (uintptr_t*)(&(thread_wkspace[3 * perm_ct32x4 + 2 * perm_ct8]));
  gitv[5] = (uintptr_t*)(&(thread_wkspace[3 * perm_ct32x4 + 4 * perm_ct8]));
  gitv[6] = (uintptr_t*)(&(results_bufs[2 * perm_ct16x16]));
  gitv[7] = (uintptr_t*)(&(results_bufs[perm_ct16x16]));
  gitv[8] = (uintptr_t*)results_bufs;
#endif
  cur_cts[0] = 0;
  cur_cts[1] = 0;
  cur_cts[2] = 0;
  for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
    // invert so hom-A2 (11) genotypes become 00 and are skipped entirely
    ulii = ~(*loadbuf++);
    if (uii + 1 == pheno_nm_ctl2x) {
      // mask off the unused high bits of the final word
      ujj = pheno_nm_ct & (BITCT2 - 1);
      if (ujj) {
	ulii &= (ONELU << (ujj * 2)) - ONELU;
      }
    }
    while (ulii) {
      ujj = CTZLU(ulii) & (BITCT - 2); // get pos of next non-[hom A2] sample
      sample_type = ((ulii >> ujj) & 3) - 1;  // 0..2 class index
      ukk = cur_cts[sample_type] + 1;
      cur_cts[sample_type] = ukk;
#ifdef __LP64__
      // add this sample's interleaved permutation bits into the 4-bit tier
      unroll_incr_1_4(&(permsv[(ujj / 2) * perm_ct128]), gitv[sample_type], perm_ct128);
      if (!(ukk % 15)) {
	// 4-bit counters hit 15: fold into 8-bit tier and clear
	unroll_zero_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct32);
	if (!(ukk % 255)) {
	  // 8-bit counters hit 255 (= 15 * 17): fold into 32-bit outputs
	  unroll_zero_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct16);
	}
      }
#else
      unroll_incr_1_4(&(permsv[(ujj / 2) * perm_ct32]), gitv[sample_type], perm_ct32);
      if (!(ukk % 15)) {
	unroll_zero_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct8);
	if (!(ukk % 255)) {
	  unroll_zero_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct4);
	}
      }
#endif
      ulii &= ~((3 * ONELU) << ujj);  // clear the 2-bit field just handled
    }
    // advance to the permutation rows for the next word's samples
#ifdef __LP64__
    permsv = &(permsv[BITCT2 * perm_ct128]);
#else
    permsv = &(permsv[BITCT2 * perm_ct32]);
#endif
  }
  // Final flush: fold any partially-filled 4-bit and 8-bit accumulators into
  // the 32-bit outputs (skipped only when a tier was flushed on its last
  // increment, i.e. the count is an exact multiple of 15 / 255).
  for (sample_type = 0; sample_type < 3; sample_type++) {
    uii = cur_cts[sample_type];
#ifdef __LP64__
    if (uii % 15) {
      unroll_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct32);
    }
    if (uii % 255) {
      unroll_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct16);
    }
#else
    if (uii % 15) {
      unroll_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct8);
    }
    if (uii % 255) {
      unroll_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct4);
    }
#endif
  }
}
711
void calc_qgit(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
  // Quantitative-trait analogue of calc_git(): for every non-hom-A2 sample,
  // fold its permuted phenotype values into per-permutation accumulators.
  //   genotype class 1: adds the phenotype value once (thread_bufs row 0)
  //   genotype class 3: adds it twice (same row 0)
  //   other (class 2):  adds the value to row perm_vec_ctcl8m and its square
  //                     to row 2*perm_vec_ctcl8m
  // thread_bufs is assumed to be zeroed by the caller (per the calc_git
  // convention) -- TODO confirm against call sites.
  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
#ifdef __LP64__
  // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
  uint32_t row_mult = perm_vec_ctcl8m / 4;

  uint32_t loop_len = (num_perms_now + 1) / 2;  // 2 doubles per __m128d
  __m128d* permsv = (__m128d*)perm_vecstd;
  __m128d* __restrict__ perm_readv;
  __m128d* __restrict__ git_writev;
  __m128d* __restrict__ git_write2v;
  __m128d vxx;
#else
  uint32_t row_mult = perm_vec_ctcl8m / 2;
  double* __restrict__ perm_read;
  double* __restrict__ git_write;
  double* __restrict__ git_write2;
  double dxx;
#endif
  uintptr_t ulii;
  uint32_t sample_type;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
    // invert so hom-A2 (11) genotypes become 00 and are skipped
    ulii = ~(*loadbuf++);
    if (uii + 1 == pheno_nm_ctl2x) {
      // mask off unused high bits of the final word
      ujj = pheno_nm_ct & (BITCT2 - 1);
      if (ujj) {
	ulii &= (ONELU << (ujj * 2)) - ONELU;
      }
    }
    while (ulii) {
      ujj = CTZLU(ulii) & (BITCT - 2);  // bit offset of next nonzero 2-bit field
      sample_type = (ulii >> ujj) & 3;  // inverted genotype value (1/2/3)
#ifdef __LP64__
      // note that the gain from using SSE2 for double-precision arithmetic is
      // typically minimal because modern cores tend to have two FPUs, so we
      // should only use it opportunistically.  it's painless here, though.
      perm_readv = &(permsv[ujj * row_mult]);
      if (sample_type == 1) {
	git_writev = (__m128d*)thread_bufs;
	for (ukk = 0; ukk < loop_len; ukk++) {
	  *git_writev = _mm_add_pd(*git_writev, *perm_readv++);
	  git_writev++;
	}
      } else if (sample_type == 3) {
	// hom rare
	git_writev = (__m128d*)thread_bufs;
	for (ukk = 0; ukk < loop_len; ukk++) {
	  vxx = *perm_readv++;
	  *git_writev = _mm_add_pd(*git_writev, _mm_add_pd(vxx, vxx));
	  git_writev++;
	}
      } else {
	// missing
	git_writev = (__m128d*)(&(thread_bufs[perm_vec_ctcl8m]));
	git_write2v = (__m128d*)(&(thread_bufs[2 * perm_vec_ctcl8m]));
	for (ukk = 0; ukk < loop_len; ukk++) {
	  vxx = *perm_readv++;
	  *git_writev = _mm_add_pd(*git_writev, vxx);
	  git_writev++;
	  *git_write2v = _mm_add_pd(*git_write2v, _mm_mul_pd(vxx, vxx));
	  git_write2v++;
	}
      }
#else
      perm_read = &(perm_vecstd[ujj * row_mult]);
      if (sample_type == 1) {
	git_write = thread_bufs;
	for (ukk = 0; ukk < num_perms_now; ukk++) {
	  *git_write += *perm_read++;
	  git_write++;
	}
      } else if (sample_type == 3) {
	git_write = thread_bufs;
	for (ukk = 0; ukk < num_perms_now; ukk++) {
	  dxx = *perm_read++;
	  *git_write += dxx * 2;
	  git_write++;
	}
      } else {
	git_write = &(thread_bufs[perm_vec_ctcl8m]);
	git_write2 = &(thread_bufs[2 * perm_vec_ctcl8m]);
	for (ukk = 0; ukk < num_perms_now; ukk++) {
	  dxx = *perm_read++;
	  *git_write += dxx;
	  git_write++;
	  *git_write2 += dxx * dxx;
	  git_write2++;
	}
      }
#endif
      ulii &= ~((3 * ONELU) << ujj);  // clear the 2-bit field just handled
    }
    // advance the permuted-phenotype base to the next word's samples
#ifdef __LP64__
    permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
#else
    perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
#endif
  }
}
814
void calc_qgit_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
  // Lin-statistic variant of calc_qgit(): for every non-hom-A2 sample, adds
  // both the permuted phenotype value and its square into a pair of
  // accumulator rows selected by genotype class:
  //   class 1: rows 0 and perm_vec_ctcl8m
  //   class 3: rows 2*perm_vec_ctcl8m and 3*perm_vec_ctcl8m
  //   other:   rows 4*perm_vec_ctcl8m and 5*perm_vec_ctcl8m
  // thread_bufs is assumed to be zeroed by the caller -- TODO confirm.
  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
#ifdef __LP64__
  // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
  uint32_t row_mult = perm_vec_ctcl8m / 4;

  uint32_t loop_len = (num_perms_now + 1) / 2;  // 2 doubles per __m128d
  __m128d* permsv = (__m128d*)perm_vecstd;
  __m128d* __restrict__ perm_readv;
  __m128d* __restrict__ git_writev;
  __m128d* __restrict__ git_write2v;
  __m128d vxx;
#else
  uint32_t row_mult = perm_vec_ctcl8m / 2;
  double* __restrict__ perm_read;
  double* __restrict__ git_write;
  double* __restrict__ git_write2;
  double dxx;
#endif
  uintptr_t ulii;
  uint32_t sample_type;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
    // invert so hom-A2 (11) genotypes become 00 and are skipped
    ulii = ~(*loadbuf++);
    if (uii + 1 == pheno_nm_ctl2x) {
      // mask off unused high bits of the final word
      ujj = pheno_nm_ct & (BITCT2 - 1);
      if (ujj) {
	ulii &= (ONELU << (ujj * 2)) - ONELU;
      }
    }
    while (ulii) {
      ujj = CTZLU(ulii) & (BITCT - 2);  // bit offset of next nonzero 2-bit field
      sample_type = (ulii >> ujj) & 3;  // inverted genotype value (1/2/3)
#ifdef __LP64__
      perm_readv = &(permsv[ujj * row_mult]);
      if (sample_type == 1) {
	git_writev = (__m128d*)thread_bufs;
	git_write2v = (__m128d*)(&(thread_bufs[perm_vec_ctcl8m]));
      } else if (sample_type == 3) {
	// hom rare
	git_writev = (__m128d*)(&(thread_bufs[2 * perm_vec_ctcl8m]));
	git_write2v = (__m128d*)(&(thread_bufs[3 * perm_vec_ctcl8m]));
      } else {
	// missing
	git_writev = (__m128d*)(&(thread_bufs[4 * perm_vec_ctcl8m]));
	git_write2v = (__m128d*)(&(thread_bufs[5 * perm_vec_ctcl8m]));
      }
      // accumulate sum and sum-of-squares for this sample's class
      for (ukk = 0; ukk < loop_len; ukk++) {
	vxx = *perm_readv++;
	*git_writev = _mm_add_pd(*git_writev, vxx);
	git_writev++;
	*git_write2v = _mm_add_pd(*git_write2v, _mm_mul_pd(vxx, vxx));
	git_write2v++;
      }
#else
      perm_read = &(perm_vecstd[ujj * row_mult]);
      if (sample_type == 1) {
	git_write = thread_bufs;
	git_write2 = &(thread_bufs[perm_vec_ctcl8m]);
      } else if (sample_type == 3) {
	git_write = &(thread_bufs[2 * perm_vec_ctcl8m]);
	git_write2 = &(thread_bufs[3 * perm_vec_ctcl8m]);
      } else {
	git_write = &(thread_bufs[4 * perm_vec_ctcl8m]);
	git_write2 = &(thread_bufs[5 * perm_vec_ctcl8m]);
      }
      for (ukk = 0; ukk < num_perms_now; ukk++) {
	dxx = *perm_read++;
	*git_write += dxx;
	git_write++;
	*git_write2 += dxx * dxx;
	git_write2++;
      }
#endif
      ulii &= ~((3 * ONELU) << ujj);  // clear the 2-bit field just handled
    }
    // advance the permuted-phenotype base to the next word's samples
#ifdef __LP64__
    permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
#else
    perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
#endif
  }
}
900
901 #ifdef __LP64__
uintptr_t rem_cost_60v(__m128i* vec1, __m128i* vend, __m128i* vec2) {
  // Vectorized kernel for rem_cost(): accumulates popcount2(A) + popcount2(B)
  // over the vectors in [vec1, vend), where
  //   xor := vec1 ^ vec2
  //   detect_homcom := (vec1 & (vec1 >> 1)) | (vec2 & (vec2 >> 1))
  //   A := (xor | (xor >> 1)) & 0x5555...
  //   B := A & ~detect_homcom
  // The vector count must be a multiple of 3; the caller limits each batch to
  // 60 data words (30 vectors) so the byte-wide partial sums in acc.vi cannot
  // overflow before the final m8 reduction.
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  __m128i loader;
  __m128i loader2;
  __m128i xor_vec;
  __m128i detect_homcom;
  __m128i result_a;
  __m128i acc_a;
  __m128i acc_b;
  __univec acc;
  acc.vi = _mm_setzero_si128();
  do {
    // first vector of the group of 3: initialize the 2-bit accumulators
    loader = *vec1++;
    loader2 = *vec2++;
    xor_vec = _mm_xor_si128(loader, loader2);
    detect_homcom = _mm_or_si128(_mm_and_si128(_mm_srli_epi64(loader, 1), loader), _mm_and_si128(_mm_srli_epi64(loader2, 1), loader2));
    acc_a = _mm_and_si128(_mm_or_si128(xor_vec, _mm_srli_epi64(xor_vec, 1)), m1);
    acc_b = _mm_andnot_si128(detect_homcom, acc_a);

    // second vector
    loader = *vec1++;
    loader2 = *vec2++;
    xor_vec = _mm_xor_si128(loader, loader2);
    detect_homcom = _mm_or_si128(_mm_and_si128(_mm_srli_epi64(loader, 1), loader), _mm_and_si128(_mm_srli_epi64(loader2, 1), loader2));
    result_a = _mm_and_si128(_mm_or_si128(xor_vec, _mm_srli_epi64(xor_vec, 1)), m1);
    acc_a = _mm_add_epi64(acc_a, result_a);
    acc_b = _mm_add_epi64(acc_b, _mm_andnot_si128(detect_homcom, result_a));

    // third vector, then reduce 2-bit -> 4-bit -> byte partial counts
    loader = *vec1++;
    loader2 = *vec2++;
    xor_vec = _mm_xor_si128(loader, loader2);
    detect_homcom = _mm_or_si128(_mm_and_si128(_mm_srli_epi64(loader, 1), loader), _mm_and_si128(_mm_srli_epi64(loader2, 1), loader2));
    result_a = _mm_and_si128(_mm_or_si128(xor_vec, _mm_srli_epi64(xor_vec, 1)), m1);
    acc_a = _mm_add_epi64(acc_a, result_a);
    acc_b = _mm_add_epi64(acc_b, _mm_andnot_si128(detect_homcom, result_a));
    acc_a = _mm_add_epi64(_mm_and_si128(acc_a, m2), _mm_and_si128(_mm_srli_epi64(acc_a, 2), m2));
    acc_a = _mm_add_epi64(acc_a, _mm_add_epi64(_mm_and_si128(acc_b, m2), _mm_and_si128(_mm_srli_epi64(acc_b, 2), m2)));
    acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(acc_a, m4), _mm_and_si128(_mm_srli_epi64(acc_a, 4), m4)));
  } while (vec1 < vend);
  // final horizontal reduction: bytes -> 16-bit lanes, then sum via multiply
  acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
  return ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
}
946
uintptr_t qrem_cost2_40v(__m128i* vec1, __m128i* vend, __m128i* vec2) {
  // Vectorized kernel for qrem_cost2(): accumulates popcount2(A + B + C) over
  // the vectors in [vec1, vend), where
  //   xor := vec1 ^ vec2
  //   detect_missing := (vec1 & ~(vec1 >> 1)) | (vec2 & ~(vec2 >> 1))
  //   A := (xor | (xor >> 1)) & 0x5555...
  //   B := A & detect_missing
  //   C := B & xor
  // The vector count must be even; the caller limits each batch to 40 data
  // words (20 vectors) so the byte-wide partial sums in acc.vi cannot
  // overflow before the final m8 reduction.
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  __m128i loader;
  __m128i loader2;
  __m128i xor_vec;
  __m128i detect_missing;
  __m128i result_a;
  __m128i result_b;
  __m128i result_c;
  __m128i inner_acc;
  __univec acc;
  acc.vi = _mm_setzero_si128();
  do {
    // first vector of the pair: compute A+B+C and reduce to 4-bit fields
    loader = *vec1++;
    loader2 = *vec2++;
    xor_vec = _mm_xor_si128(loader, loader2);
    detect_missing = _mm_or_si128(_mm_andnot_si128(_mm_srli_epi64(loader, 1), loader), _mm_andnot_si128(_mm_srli_epi64(loader2, 1), loader2));
    result_a = _mm_and_si128(_mm_or_si128(xor_vec, _mm_srli_epi64(xor_vec, 1)), m1);
    result_b = _mm_and_si128(result_a, detect_missing);
    inner_acc = _mm_and_si128(result_b, xor_vec);
    inner_acc = _mm_add_epi64(_mm_add_epi64(result_a, result_b), inner_acc);
    inner_acc = _mm_add_epi64(_mm_and_si128(inner_acc, m2), _mm_and_si128(_mm_srli_epi64(inner_acc, 2), m2));
    // second vector, then merge and reduce into byte partial counts
    loader = *vec1++;
    loader2 = *vec2++;
    xor_vec = _mm_xor_si128(loader, loader2);
    detect_missing = _mm_or_si128(_mm_andnot_si128(_mm_srli_epi64(loader, 1), loader), _mm_andnot_si128(_mm_srli_epi64(loader2, 1), loader2));
    result_a = _mm_and_si128(_mm_or_si128(xor_vec, _mm_srli_epi64(xor_vec, 1)), m1);
    result_b = _mm_and_si128(result_a, detect_missing);
    result_c = _mm_and_si128(result_b, xor_vec);
    result_c = _mm_add_epi64(_mm_add_epi64(result_a, result_b), result_c);
    inner_acc = _mm_add_epi64(inner_acc, _mm_add_epi64(_mm_and_si128(result_c, m2), _mm_and_si128(_mm_srli_epi64(result_c, 2), m2)));
    acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(inner_acc, m4), _mm_and_si128(_mm_srli_epi64(inner_acc, 4), m4)));
  } while (vec1 < vend);
  // final horizontal reduction: bytes -> 16-bit lanes, then sum via multiply
  acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
  return ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
}
986 #else
rem_cost_6(uintptr_t * loadbuf1,uintptr_t * loadbuf2)987 uintptr_t rem_cost_6(uintptr_t* loadbuf1, uintptr_t* loadbuf2) {
988 uintptr_t loader = *loadbuf1++;
989 uintptr_t loader2 = *loadbuf2++;
990 uintptr_t xor_word = loader ^ loader2;
991 uintptr_t detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
992 uintptr_t acc_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
993 uintptr_t acc_b = acc_a & (~detect_homcom);
994 uintptr_t result_a;
995 uintptr_t acc;
996
997 loader = *loadbuf1++;
998 loader2 = *loadbuf2++;
999 xor_word = loader & loader2;
1000 detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
1001 result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1002 acc_a += result_a;
1003 acc_b += result_a & (~detect_homcom);
1004
1005 loader = *loadbuf1++;
1006 loader2 = *loadbuf2++;
1007 xor_word = loader & loader2;
1008 detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
1009 result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1010 acc_a += result_a;
1011 acc_b += result_a & (~detect_homcom);
1012 acc_a = (acc_a & 0x33333333) + ((acc_a >> 2) & 0x33333333);
1013 acc_a += (acc_b & 0x33333333) + ((acc_b >> 2) & 0x33333333);
1014 acc = (acc_a & 0x0f0f0f0f) + ((acc_a >> 4) & 0x0f0f0f0f);
1015
1016 loader = *loadbuf1++;
1017 loader2 = *loadbuf2++;
1018 xor_word = loader & loader2;
1019 detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
1020 acc_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1021 acc_b = acc_a & (~detect_homcom);
1022
1023 loader = *loadbuf1++;
1024 loader2 = *loadbuf2++;
1025 xor_word = loader & loader2;
1026 detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
1027 result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1028 acc_a += result_a;
1029 acc_b += result_a & (~detect_homcom);
1030
1031 loader = *loadbuf1++;
1032 loader2 = *loadbuf2++;
1033 xor_word = loader & loader2;
1034 detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
1035 result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1036 acc_a += result_a;
1037 acc_b += result_a & (~detect_homcom);
1038 acc_a = (acc_a & 0x33333333) + ((acc_a >> 2) & 0x33333333);
1039 acc_a += (acc_b & 0x33333333) + ((acc_b >> 2) & 0x33333333);
1040 acc += (acc_a & 0x0f0f0f0f) + ((acc_a >> 4) & 0x0f0f0f0f);
1041 return (acc * 0x01010101) >> 24;
1042 }
1043
qrem_cost2_4(uintptr_t * loadbuf1,uintptr_t * loadbuf2)1044 uintptr_t qrem_cost2_4(uintptr_t* loadbuf1, uintptr_t* loadbuf2) {
1045 uintptr_t loader = *loadbuf1++;
1046 uintptr_t loader2 = *loadbuf2++;
1047 uintptr_t xor_word = loader ^ loader2;
1048 uintptr_t detect_missing = (loader & (~(loader >> 1))) | (loader2 & (~(loader2 >> 1)));
1049 uintptr_t result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1050 uintptr_t result_b = result_a & detect_missing;
1051 uintptr_t inner_acc = result_b & xor_word;
1052 uintptr_t result_c;
1053 uintptr_t acc;
1054 inner_acc += result_a + result_b;
1055 inner_acc = (inner_acc & 0x33333333) + ((inner_acc >> 2) & 0x33333333);
1056
1057 loader = *loadbuf1++;
1058 loader2 = *loadbuf2++;
1059 xor_word = loader & loader2;
1060 detect_missing = (loader & (~(loader >> 1))) | (loader2 & (~(loader2 >> 1)));
1061 result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1062 result_b = result_a & detect_missing;
1063 result_c = result_b & xor_word;
1064 result_c += result_a + result_b;
1065 inner_acc += (result_c & 0x33333333) + ((result_c >> 2) & 0x33333333);
1066 acc = (inner_acc & 0x0f0f0f0f) + ((inner_acc >> 4) & 0x0f0f0f0f);
1067
1068 loader = *loadbuf1++;
1069 loader2 = *loadbuf2++;
1070 xor_word = loader & loader2;
1071 detect_missing = (loader & (~(loader >> 1))) | (loader2 & (~(loader2 >> 1)));
1072 result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1073 result_b = result_a & detect_missing;
1074 inner_acc = result_b & xor_word;
1075 inner_acc += result_a + result_b;
1076 inner_acc = (inner_acc & 0x33333333) + ((inner_acc >> 2) & 0x33333333);
1077
1078 loader = *loadbuf1++;
1079 loader2 = *loadbuf2++;
1080 xor_word = loader & loader2;
1081 detect_missing = (loader & (~(loader >> 1))) | (loader2 & (~(loader2 >> 1)));
1082 result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
1083 result_b = result_a & detect_missing;
1084 result_c = result_b & xor_word;
1085 result_c += result_a + result_b;
1086 inner_acc += (result_c & 0x33333333) + ((result_c >> 2) & 0x33333333);
1087 acc += (inner_acc & 0x0f0f0f0f) + ((inner_acc >> 4) & 0x0f0f0f0f);
1088 return (acc * 0x01010101) >> 24;
1089 }
1090 #endif
1091
uintptr_t rem_cost(uintptr_t sample_ctv2, uintptr_t* loadbuf1, uintptr_t* loadbuf2) {
  // Estimates the cost of updating permutation counts between two genotype
  // vectors (used to decide whether incremental updating is worthwhile).
  // Cost: 2 * (<-> neither side homcom) + (<-> homcom)
  //
  // We can efficiently calculate this as follows:
  //   xor = vec1 ^ vec2
  //   detect_homcom = (vec1 & (vec1 >> 1)) | (vec2 & (vec2 >> 1))
  //   A := (xor | (xor >> 1)) & 0x5555...
  //   B := A & (~detect_homcom)
  //   cost += popcount2(A + B)
  uintptr_t* lptr_end = &(loadbuf1[sample_ctv2]);
  uintptr_t cost = 0;
  uintptr_t loader;
  uintptr_t loader2;
  uintptr_t xor_word;
  uintptr_t detect_homcom;
  uintptr_t result_a;
  uintptr_t result_b;
#ifdef __LP64__
  // Vector path: handle the largest multiple-of-6 prefix in batches of up to
  // 60 words; the goto re-enters the loop body once for the final partial
  // batch (cur_decr is then the remaining multiple-of-6 word count).
  uintptr_t cur_decr = 60;
  uintptr_t* lptr_6x_end;
  sample_ctv2 -= sample_ctv2 % 6;
  while (sample_ctv2 >= 60) {
  rem_cost_loop:
    lptr_6x_end = &(loadbuf1[cur_decr]);
    cost += rem_cost_60v((__m128i*)loadbuf1, (__m128i*)lptr_6x_end, (__m128i*)loadbuf2);
    loadbuf1 = lptr_6x_end;
    loadbuf2 = &(loadbuf2[cur_decr]);
    sample_ctv2 -= cur_decr;
  }
  if (sample_ctv2) {
    cur_decr = sample_ctv2;
    goto rem_cost_loop;
  }
#else
  // 32-bit path: handle the multiple-of-6 prefix in fixed 6-word groups
  uintptr_t* lptr_six_end = &(loadbuf1[sample_ctv2 - (sample_ctv2 % 6)]);
  while (loadbuf1 < lptr_six_end) {
    cost += rem_cost_6(loadbuf1, loadbuf2);
    loadbuf1 = &(loadbuf1[6]);
    loadbuf2 = &(loadbuf2[6]);
  }
#endif
  // scalar tail: remaining words, one at a time
  while (loadbuf1 < lptr_end) {
    loader = *loadbuf1++;
    loader2 = *loadbuf2++;
    xor_word = loader ^ loader2;
    detect_homcom = (loader & (loader >> 1)) | (loader2 & (loader2 >> 1));
    result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
    result_b = result_a & (~detect_homcom);
    cost += popcount2_long(result_a + result_b);
  }
  return cost;
}
1144
uintptr_t qrem_cost2(uintptr_t sample_ctl2, uintptr_t* loadbuf1, uintptr_t* loadbuf2) {
  // QT analogue of rem_cost(); cost starts at a fixed baseline of 3.
  // Cost: 3 + 3 * (missing <-> homrar/het) + 2 * (missing <-> homcom) +
  //       (homrar <-> het/homcom) + (het <-> homcom)
  //
  // xor 01: 3 if 00-01, 1 of 10-11
  // xor 10: 2 if 01-11, 1 if 00-10
  // xor 11: 3 if 01-10, 1 if 00-11
  //
  // We can efficiently calculate this as follows:
  //   xor = vec1 ^ vec2
  //   detect_missing = (vec1 & (~(vec1 >> 1))) | (vec2 & (~(vec2 >> 1)))
  //   A := (xor | (xor >> 1)) & 0x5555...
  //   B := A & detect_missing
  //   C := B & xor
  //   cost += popcount2(A + B + C)
  // (I would not be surprised if a few operations could be shaved from this.)
  uintptr_t* lptr_end = &(loadbuf1[sample_ctl2]);
  uintptr_t cost = 3;
  uintptr_t loader;
  uintptr_t loader2;
  uintptr_t xor_word;
  uintptr_t detect_missing;
  uintptr_t result_a;
  uintptr_t result_b;
  uintptr_t result_c;
#ifdef __LP64__
  // Vector path: handle the largest multiple-of-4 prefix in batches of up to
  // 40 words; the goto re-enters the loop body once for the final partial
  // batch (cur_decr is then the remaining multiple-of-4 word count).
  uintptr_t cur_decr = 40;
  uintptr_t* lptr_4x_end;
  sample_ctl2 &= ~3LLU;
  while (sample_ctl2 >= 40) {
  qrem_cost2_loop:
    lptr_4x_end = &(loadbuf1[cur_decr]);
    cost += qrem_cost2_40v((__m128i*)loadbuf1, (__m128i*)lptr_4x_end, (__m128i*)loadbuf2);
    loadbuf1 = lptr_4x_end;
    loadbuf2 = &(loadbuf2[cur_decr]);
    sample_ctl2 -= cur_decr;
  }
  if (sample_ctl2) {
    cur_decr = sample_ctl2;
    goto qrem_cost2_loop;
  }
#else
  // 32-bit path: handle the multiple-of-4 prefix in fixed 4-word groups
  uintptr_t* lptr_four_end = &(loadbuf1[sample_ctl2 & (~3)]);
  while (loadbuf1 < lptr_four_end) {
    cost += qrem_cost2_4(loadbuf1, loadbuf2);
    loadbuf1 = &(loadbuf1[4]);
    loadbuf2 = &(loadbuf2[4]);
  }
#endif
  // scalar tail: remaining words, one at a time
  while (loadbuf1 < lptr_end) {
    loader = *loadbuf1++;
    loader2 = *loadbuf2++;
    xor_word = loader ^ loader2;
    detect_missing = (loader & (~(loader >> 1))) | (loader2 & (~(loader2 >> 1)));
    result_a = (xor_word | (xor_word >> 1)) & FIVEMASK;
    result_b = result_a & detect_missing;
    result_c = result_b & xor_word;
    cost += popcount2_long(result_a + result_b + result_c);
  }
  return cost;
}
1206
1207 #ifdef __LP64__
calc_rem_merge4_two(uint32_t perm_ct128,__m128i * __restrict__ perm_ptr,__m128i * __restrict__ rem_merge4a,__m128i * __restrict__ rem_merge4b)1208 static inline void calc_rem_merge4_two(uint32_t perm_ct128, __m128i* __restrict__ perm_ptr, __m128i* __restrict__ rem_merge4a, __m128i* __restrict__ rem_merge4b) {
1209 const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
1210 __m128i loader;
1211 __m128i loader2;
1212 uint32_t pbidx;
1213 for (pbidx = 0; pbidx < perm_ct128; pbidx++) {
1214 loader = *perm_ptr++;
1215 loader2 = _mm_and_si128(loader, m1x4);
1216 rem_merge4a[0] = _mm_add_epi64(rem_merge4a[0], loader2);
1217 rem_merge4b[0] = _mm_add_epi64(rem_merge4b[0], loader2);
1218 loader = _mm_srli_epi64(loader, 1);
1219 loader2 = _mm_and_si128(loader, m1x4);
1220 rem_merge4a[1] = _mm_add_epi64(rem_merge4a[1], loader2);
1221 rem_merge4b[1] = _mm_add_epi64(rem_merge4b[1], loader2);
1222 loader = _mm_srli_epi64(loader, 1);
1223 loader2 = _mm_and_si128(loader, m1x4);
1224 rem_merge4a[2] = _mm_add_epi64(rem_merge4a[2], loader2);
1225 rem_merge4b[2] = _mm_add_epi64(rem_merge4b[2], loader2);
1226 loader = _mm_srli_epi64(loader, 1);
1227 loader2 = _mm_and_si128(loader, m1x4);
1228 rem_merge4a[3] = _mm_add_epi64(rem_merge4a[3], loader2);
1229 rem_merge4b[3] = _mm_add_epi64(rem_merge4b[3], loader2);
1230 rem_merge4a = &(rem_merge4a[4]);
1231 rem_merge4b = &(rem_merge4b[4]);
1232 }
1233 }
1234
calc_rem_merge32_minus(uint32_t perm_ct16,__m128i * __restrict__ rem_merge8,__m128i * rem_write)1235 static inline void calc_rem_merge32_minus(uint32_t perm_ct16, __m128i* __restrict__ rem_merge8, __m128i* rem_write) {
1236 // temporary integer underflow is possible here, but by the end of the
1237 // calculation it should be reversed
1238 const __m128i m8x32 = {0x000000ff000000ffLLU, 0x000000ff000000ffLLU};
1239 __m128i loader;
1240 uint32_t pbidx;
1241 for (pbidx = 0; pbidx < perm_ct16; pbidx++) {
1242 loader = *rem_merge8;
1243 rem_write[0] = _mm_sub_epi64(rem_write[0], _mm_and_si128(loader, m8x32));
1244 loader = _mm_srli_epi64(loader, 8);
1245 rem_write[1] = _mm_sub_epi64(rem_write[1], _mm_and_si128(loader, m8x32));
1246 loader = _mm_srli_epi64(loader, 8);
1247 rem_write[2] = _mm_sub_epi64(rem_write[2], _mm_and_si128(loader, m8x32));
1248 loader = _mm_srli_epi64(loader, 8);
1249 rem_write[3] = _mm_sub_epi64(rem_write[3], _mm_and_si128(loader, m8x32));
1250 rem_write = &(rem_write[4]);
1251 *rem_merge8++ = _mm_setzero_si128();
1252 }
1253 }
1254 #else
calc_rem_merge4_two(uint32_t perm_ct32,uintptr_t * __restrict__ perm_ptr,uintptr_t * __restrict__ rem_merge4a,uintptr_t * __restrict__ rem_merge4b)1255 static inline void calc_rem_merge4_two(uint32_t perm_ct32, uintptr_t* __restrict__ perm_ptr, uintptr_t* __restrict__ rem_merge4a, uintptr_t* __restrict__ rem_merge4b) {
1256 uintptr_t loader;
1257 uintptr_t loader2;
1258 uint32_t pbidx;
1259 for (pbidx = 0; pbidx < perm_ct32; pbidx++) {
1260 loader = *perm_ptr++;
1261 loader2 = loader & 0x11111111;
1262 rem_merge4a[0] += loader2;
1263 rem_merge4b[0] += loader2;
1264 loader2 = (loader >> 1) & 0x11111111;
1265 rem_merge4a[1] += loader2;
1266 rem_merge4b[1] += loader2;
1267 loader2 = (loader >> 2) & 0x11111111;
1268 rem_merge4a[2] += loader2;
1269 rem_merge4b[2] += loader2;
1270 loader2 = (loader >> 3) & 0x11111111;
1271 rem_merge4a[3] += loader2;
1272 rem_merge4b[3] += loader2;
1273 rem_merge4a = &(rem_merge4a[4]);
1274 rem_merge4b = &(rem_merge4b[4]);
1275 }
1276 }
1277
calc_rem_merge32_minus(uint32_t perm_ct4,uintptr_t * __restrict__ rem_merge8,uintptr_t * __restrict__ rem_write)1278 static inline void calc_rem_merge32_minus(uint32_t perm_ct4, uintptr_t* __restrict__ rem_merge8, uintptr_t* __restrict__ rem_write) {
1279 uintptr_t loader;
1280 uint32_t pbidx;
1281 for (pbidx = 0; pbidx < perm_ct4; pbidx++) {
1282 loader = *rem_merge8;
1283 rem_write[0] -= (uint8_t)loader;
1284 loader >>= 8;
1285 rem_write[1] -= (uint8_t)loader;
1286 loader >>= 8;
1287 rem_write[2] -= (uint8_t)loader;
1288 loader >>= 8;
1289 rem_write[3] -= loader;
1290 rem_write = &(rem_write[4]);
1291 *rem_merge8++ = 0;
1292 }
1293 }
1294 #endif
1295
void calc_rem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, uint32_t* perm_vecst, uint32_t* results_bufs, uint32_t* thread_wkspace) {
  // Case/control permutation testing: scans the genotype differences between
  // loadbuf and loadbuf_ref, and for each differing sample updates
  // per-permutation homrar/missing/het counts through a three-tier
  // (4-bit -> 8-bit -> 32-bit) accumulator hierarchy; the narrow tiers are
  // flushed upward before they can overflow (every 15 and 255 updates
  // respectively).
  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
  uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
  // [cur_xor - 1][cur_raw]
  // low 8 bits give index of first remv[] array to increment; next 8 bits give
  // second index if nonzero, or indicate its absence
  const uint32_t idx_table[3][4] = {{0x300, 0x102, 4, 5}, {0x500, 2, 0x104, 3}, {0, 0x502, 0x304, 1}};
#ifdef __LP64__
  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
  uint32_t perm_ct128x4 = perm_ct128 * 4;
  uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
  uint32_t perm_ct16x4 = 4 * perm_ct16;
  __m128i* permsv = (__m128i*)perm_vecst;
  // 0, 2, 4: homrar, missing, het ct increment
  // 1, 3, 5: homrar, missing, het ct decrement
  __m128i* remv[15];
  __m128i* __restrict__ perm_ptr;
#else
  uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
  uint32_t perm_ct32x4 = perm_ct32 * 4;
  uint32_t perm_ct8 = (perm_vec_ct + 7) / 8;
  uint32_t perm_ct4 = (perm_vec_ct + 3) / 4;
  uint32_t perm_ct16x16 = 16 * perm_ct16;
  uintptr_t* permsv = (uintptr_t*)perm_vecst;
  uintptr_t* remv[15];
  uintptr_t* perm_ptr;
#endif

  // cur_cts[k] tracks the number of updates applied to 4-bit tier k since the
  // last full flush, driving the 15/255 flush cadence below
  uint32_t cur_cts[6];
  uintptr_t ulraw1;
  uintptr_t ulxor;
  uint32_t cur_xor;
  uint32_t cur_raw;
  uint32_t idx1;
  uint32_t idx2;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
#ifdef __LP64__
  // remv[0..5]: 4-bit tier, remv[6..11]: 8-bit tier, remv[12..14]: 32-bit
  // results (increment/decrement pairs share a results row)
  for (uii = 0; uii < 6; uii++) {
    remv[uii] = &(((__m128i*)thread_wkspace)[uii * perm_ct128x4]);
  }
  for (uii = 0; uii < 6; uii++) {
    remv[uii + 6] = &(((__m128i*)thread_wkspace)[6 * perm_ct128x4 + 2 * uii * perm_ct32]);
  }
  remv[12] = (__m128i*)results_bufs;
  remv[13] = &(((__m128i*)results_bufs)[perm_ct16x4]);
  remv[14] = &(((__m128i*)results_bufs)[2 * perm_ct16x4]);
#else
  for (uii = 0; uii < 6; uii++) {
    remv[uii] = (uintptr_t*)(&(thread_wkspace[uii * perm_ct32x4]));
  }
  for (uii = 0; uii < 6; uii++) {
    remv[uii + 6] = (uintptr_t*)(&(thread_wkspace[6 * perm_ct32x4 + 2 * uii * perm_ct8]));
  }
  remv[12] = (uintptr_t*)results_bufs;
  remv[13] = (uintptr_t*)(&(results_bufs[perm_ct16x16]));
  remv[14] = (uintptr_t*)(&(results_bufs[2 * perm_ct16x16]));
#endif

  for (uii = 0; uii < 6; uii++) {
    cur_cts[uii] = 0;
  }
  for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
    ulraw1 = *loadbuf++;
    // nonzero 2-bit fields of ulxor mark samples whose genotype differs from
    // the reference
    ulxor = ulraw1 ^ (*loadbuf_ref++);
    if (uii + 1 == pheno_nm_ctl2x) {
      // mask out padding bits past the last real sample in the final word
      ujj = pheno_nm_ct & (BITCT2 - 1);
      if (ujj) {
        ulxor &= (ONELU << (ujj * 2)) - ONELU;
      }
    }
    while (ulxor) {
      // ujj = 2 * (sample index within this word), aligned to genotype start
      ujj = CTZLU(ulxor) & (BITCT - 2);
      cur_xor = (ulxor >> ujj) & 3;
      cur_raw = (ulraw1 >> ujj) & 3;
      // map (changed bits, new genotype) to one or two remv[] targets
      idx1 = idx_table[cur_xor - 1][cur_raw];
      idx2 = idx1 >> 8;
      idx1 &= 255;
#ifdef __LP64__
      perm_ptr = &(permsv[(ujj / 2) * perm_ct128]);
      if (!idx2) {
        unroll_incr_1_4(perm_ptr, remv[idx1], perm_ct128);
      } else {
        calc_rem_merge4_two(perm_ct128, perm_ptr, remv[idx1], remv[idx2]);
        ukk = cur_cts[idx2] + 1;
        cur_cts[idx2] = ukk;
        // flush 4-bit tier into 8-bit tier every 15 updates
        if (!(ukk % 15)) {
          unroll_zero_incr_4_8(remv[idx2], remv[idx2 + 6], perm_ct32);
          // flush 8-bit tier into 32-bit results every 255 updates
          if (!(ukk % 255)) {
            calc_rem_merge32_minus(perm_ct16, remv[idx2 + 6], remv[(idx2 / 2) + 12]);
          }
        }
      }
      ukk = cur_cts[idx1] + 1;
      cur_cts[idx1] = ukk;
      if (!(ukk % 15)) {
        unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct32);
        if (!(ukk % 255)) {
          // even indices are increment counters, odd are decrement counters
          if (!(idx1 & 1)) {
            unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct16);
          } else {
            calc_rem_merge32_minus(perm_ct16, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
          }
        }
      }
#else
      perm_ptr = &(permsv[(ujj / 2) * perm_ct32]);
      if (!idx2) {
        unroll_incr_1_4(perm_ptr, remv[idx1], perm_ct32);
      } else {
        calc_rem_merge4_two(perm_ct32, perm_ptr, remv[idx1], remv[idx2]);
        ukk = cur_cts[idx2] + 1;
        cur_cts[idx2] = ukk;
        // flush 4-bit tier into 8-bit tier every 15 updates
        if (!(ukk % 15)) {
          unroll_zero_incr_4_8(remv[idx2], remv[idx2 + 6], perm_ct8);
          // flush 8-bit tier into 32-bit results every 255 updates
          if (!(ukk % 255)) {
            calc_rem_merge32_minus(perm_ct4, remv[idx2 + 6], remv[(idx2 / 2) + 12]);
          }
        }
      }
      ukk = cur_cts[idx1] + 1;
      cur_cts[idx1] = ukk;
      if (!(ukk % 15)) {
        unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct8);
        if (!(ukk % 255)) {
          // even indices are increment counters, odd are decrement counters
          if (!(idx1 & 1)) {
            unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct4);
          } else {
            calc_rem_merge32_minus(perm_ct4, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
          }
        }
      }
#endif
      // clear this sample's bits and continue the scan
      ulxor &= ~((3 * ONELU) << ujj);
    }
#ifdef __LP64__
    permsv = &(permsv[BITCT2 * perm_ct128]);
#else
    permsv = &(permsv[BITCT2 * perm_ct32]);
#endif
  }
  // final pass: flush any remaining partial accumulations in both tiers
  for (idx1 = 0; idx1 < 6; idx1++) {
    uii = cur_cts[idx1];
#ifdef __LP64__
    if (uii % 15) {
      // todo: check if zeroing needed
      unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct32);
    }
    if (uii % 255) {
      if (!(idx1 & 1)) {
        unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct16);
      } else {
        calc_rem_merge32_minus(perm_ct16, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
      }
    }
#else
    if (uii % 15) {
      unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct8);
    }
    if (uii % 255) {
      if (!(idx1 & 1)) {
        unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct4);
      } else {
        calc_rem_merge32_minus(perm_ct4, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
      }
    }
#endif
  }
}
1466
void calc_qrem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
  // QT permutation testing: for each sample whose genotype differs between
  // loadbuf and loadbuf_ref, adjusts three per-permutation accumulator rows in
  // outbufs (each perm_vec_ctcl8m doubles wide):
  //   row 0: phenotype sum delta, weighted +/- dxx or +/- 2*dxx by the dosage
  //          change
  //   rows 1/2: sum and sum-of-squares deltas for samples entering/leaving
  //          the missing state (NOTE(review): inferred from the paired
  //          write2/write3 updates -- confirm against callers)
  // The branch structure below enumerates every (new genotype, changed bits)
  // transition; cur_raw is the new genotype, cur_xor the changed bits.
  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
#ifdef __LP64__
  // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
  uint32_t row_mult = perm_vec_ctcl8m / 4;

  // __m128d holds two doubles, so round the permutation count up to even
  uint32_t loop_len = (perm_vec_ct + 1) / 2;
  __m128d* permsv = (__m128d*)perm_vecstd;
  __m128d* __restrict__ perm_readv;
  __m128d* __restrict__ rem_writev;
  __m128d* __restrict__ rem_write2v;
  __m128d* __restrict__ rem_write3v;
  __m128d vxx;
#else
  uint32_t row_mult = perm_vec_ctcl8m / 2;
  double* __restrict__ perm_read;
  double* __restrict__ rem_write;
  double* __restrict__ rem_write2;
  double* __restrict__ rem_write3;
  double dxx;
#endif
  uintptr_t ulraw1;
  uintptr_t ulxor;
  uint32_t cur_xor;
  uint32_t cur_raw;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
    ulraw1 = *loadbuf++;
    // nonzero 2-bit fields of ulxor mark samples whose genotype differs from
    // the reference
    ulxor = ulraw1 ^ (*loadbuf_ref++);
    if (uii + 1 == pheno_nm_ctl2x) {
      // mask out padding bits past the last real sample in the final word
      ujj = pheno_nm_ct & (BITCT2 - 1);
      if (ujj) {
        ulxor &= (ONELU << (ujj * 2)) - ONELU;
      }
    }
    while (ulxor) {
      // ujj = 2 * (sample index within this word), aligned to genotype start
      ujj = CTZLU(ulxor) & (BITCT - 2);
      cur_xor = (ulxor >> ujj) & 3;
      cur_raw = (ulraw1 >> ujj) & 3;
#ifdef __LP64__
      perm_readv = &(permsv[ujj * row_mult]);
      rem_writev = (__m128d*)outbufs;
      rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
      rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
      if (cur_raw == 3) {
        if (cur_xor == 1) {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_sub_pd(*rem_writev, vxx);
            rem_writev++;
          }
        } else if (cur_xor == 3) {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_sub_pd(*rem_writev, _mm_add_pd(vxx, vxx));
            rem_writev++;
          }
        } else {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_write2v = _mm_sub_pd(*rem_write2v, vxx);
            rem_write2v++;
            *rem_write3v = _mm_sub_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
            rem_write3v++;
          }
        }
      } else if (cur_raw == 2) {
        if (cur_xor == 1) {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_add_pd(*rem_writev, vxx);
            rem_writev++;
          }
        } else if (cur_xor == 2) {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_sub_pd(*rem_writev, vxx);
            rem_writev++;
          }
        } else {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_add_pd(*rem_writev, vxx);
            rem_writev++;
            *rem_write2v = _mm_sub_pd(*rem_write2v, vxx);
            rem_write2v++;
            *rem_write3v = _mm_sub_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
            rem_write3v++;
          }
        }
      } else if (!cur_raw) {
        if (cur_xor == 3) {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_add_pd(*rem_writev, _mm_add_pd(vxx, vxx));
            rem_writev++;
          }
        } else if (cur_xor == 2) {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_add_pd(*rem_writev, vxx);
            rem_writev++;
          }
        } else {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_add_pd(*rem_writev, _mm_add_pd(vxx, vxx));
            rem_writev++;
            *rem_write2v = _mm_sub_pd(*rem_write2v, vxx);
            rem_write2v++;
            *rem_write3v = _mm_sub_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
            rem_write3v++;
          }
        }
      } else {
        // cur_raw == 1
        if (cur_xor == 2) {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_write2v = _mm_add_pd(*rem_write2v, vxx);
            rem_write2v++;
            *rem_write3v = _mm_add_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
            rem_write3v++;
          }
        } else if (cur_xor == 3) {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_sub_pd(*rem_writev, vxx);
            rem_writev++;
            *rem_write2v = _mm_add_pd(*rem_write2v, vxx);
            rem_write2v++;
            *rem_write3v = _mm_add_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
            rem_write3v++;
          }
        } else {
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_sub_pd(*rem_writev, _mm_add_pd(vxx, vxx));
            rem_writev++;
            *rem_write2v = _mm_add_pd(*rem_write2v, vxx);
            rem_write2v++;
            *rem_write3v = _mm_add_pd(*rem_write3v, _mm_mul_pd(vxx, vxx));
            rem_write3v++;
          }
        }
      }
#else
      perm_read = &(perm_vecstd[ujj * row_mult]);
      rem_write = outbufs;
      rem_write2 = &(outbufs[perm_vec_ctcl8m]);
      rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
      if (cur_raw == 3) {
        if (cur_xor == 1) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write -= dxx;
            rem_write++;
          }
        } else if (cur_xor == 3) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write -= 2 * dxx;
            rem_write++;
          }
        } else {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write2 -= dxx;
            rem_write2++;
            *rem_write3 -= dxx * dxx;
            rem_write3++;
          }
        }
      } else if (cur_raw == 2) {
        if (cur_xor == 1) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write += dxx;
            rem_write++;
          }
        } else if (cur_xor == 2) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write -= dxx;
            rem_write++;
          }
        } else {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write += dxx;
            rem_write++;
            *rem_write2 -= dxx;
            rem_write2++;
            *rem_write3 -= dxx * dxx;
            rem_write3++;
          }
        }
      } else if (!cur_raw) {
        if (cur_xor == 3) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write += 2 * dxx;
            rem_write++;
          }
        } else if (cur_xor == 2) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write += dxx;
            rem_write++;
          }
        } else {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write += 2 * dxx;
            rem_write++;
            *rem_write2 -= dxx;
            rem_write2++;
            *rem_write3 -= dxx * dxx;
            rem_write3++;
          }
        }
      } else {
        // cur_raw == 1
        if (cur_xor == 2) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write2 += dxx;
            rem_write2++;
            *rem_write3 += dxx * dxx;
            rem_write3++;
          }
        } else if (cur_xor == 3) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write -= dxx;
            rem_write++;
            *rem_write2 += dxx;
            rem_write2++;
            *rem_write3 += dxx * dxx;
            rem_write3++;
          }
        } else {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write -= 2 * dxx;
            rem_write++;
            *rem_write2 += dxx;
            rem_write2++;
            *rem_write3 += dxx * dxx;
            rem_write3++;
          }
        }
      }
#endif
      // clear this sample's bits and continue the scan
      ulxor &= ~((3 * ONELU) << ujj);
    }
    // advance to the next word's permutation rows
#ifdef __LP64__
    permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
#else
    perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
#endif
  }
}
1731
// Incremental update kernel for the Lin-statistic QT permutation test.
// Rather than recomputing per-permutation sums from scratch, this walks the
// 2-bit genotype *differences* between the current marker (loadbuf) and an
// LD reference marker (loadbuf_ref), and adjusts six accumulator rows in
// outbufs (each row is perm_vec_ctcl8m doubles, one entry per permutation):
//   rows 0,1: (sum, sum of squares) of permuted phenotype values over
//             het samples
//   rows 2,3: same over homozygous-rare samples
//   rows 4,5: same over missing-genotype samples
// The genotype-code interpretation (0 = hom rare, 1 = missing, 2 = het,
// 3 = hom common) is inferred from the branch structure below -- confirm
// against calc_qrem() before modifying.
// perm_vecstd is sample-major: one row of perm_vec_ctcl8m doubles per
// sample, advanced one genotype-word's worth of samples per outer loop.
void calc_qrem_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
#ifdef __LP64__
  // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
  uint32_t row_mult = perm_vec_ctcl8m / 4;

  uint32_t loop_len = (perm_vec_ct + 1) / 2;
  __m128d* permsv = (__m128d*)perm_vecstd;
  __m128d* __restrict__ perm_readv;
  // six write cursors, one per accumulator row; each is rewound to its row
  // start after every inner loop that advances it
  __m128d* __restrict__ rem_writev = (__m128d*)outbufs;
  __m128d* __restrict__ rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
  __m128d* __restrict__ rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
  __m128d* __restrict__ rem_write4v = (__m128d*)(&(outbufs[3 * perm_vec_ctcl8m]));
  __m128d* __restrict__ rem_write5v = (__m128d*)(&(outbufs[4 * perm_vec_ctcl8m]));
  __m128d* __restrict__ rem_write6v = (__m128d*)(&(outbufs[5 * perm_vec_ctcl8m]));
  __m128d vxx;
#else
  // scalar path: one double per permutation; ujj is still 2x the sample idx
  uint32_t row_mult = perm_vec_ctcl8m / 2;
  double* __restrict__ perm_read;
  double* __restrict__ rem_write = outbufs;
  double* __restrict__ rem_write2 = &(outbufs[perm_vec_ctcl8m]);
  double* __restrict__ rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
  double* __restrict__ rem_write4 = &(outbufs[3 * perm_vec_ctcl8m]);
  double* __restrict__ rem_write5 = &(outbufs[4 * perm_vec_ctcl8m]);
  double* __restrict__ rem_write6 = &(outbufs[5 * perm_vec_ctcl8m]);
  double dxx;
#endif
  uintptr_t ulraw1;
  uintptr_t ulxor;
  uint32_t cur_xor;
  uint32_t cur_raw;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  for (uii = 0; uii < pheno_nm_ctl2x; uii++) {
    ulraw1 = *loadbuf++;
    ulxor = ulraw1 ^ (*loadbuf_ref++);
    if (uii + 1 == pheno_nm_ctl2x) {
      // mask out garbage bits past the last real sample in the final word
      ujj = pheno_nm_ct & (BITCT2 - 1);
      if (ujj) {
        ulxor &= (ONELU << (ujj * 2)) - ONELU;
      }
    }
    while (ulxor) {
      // ujj = even bit offset (2x within-word sample index) of the next
      // genotype that differs from the reference marker
      ujj = CTZLU(ulxor) & (BITCT - 2);
      cur_xor = (ulxor >> ujj) & 3;
      cur_raw = (ulraw1 >> ujj) & 3;
      // reference genotype for this sample is (cur_raw ^ cur_xor)
#ifdef __LP64__
      perm_readv = &(permsv[ujj * row_mult]);
      if (cur_raw == 3) {
        // current: hom common (contributes to no row)
        if (cur_xor == 1) {
          // ref was het: back out its (d, d^2) contribution
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_sub_pd(*rem_writev, vxx);
            rem_writev++;
            *rem_write2v = _mm_sub_pd(*rem_write2v, _mm_mul_pd(vxx, vxx));
            rem_write2v++;
          }
          rem_writev = (__m128d*)outbufs;
          rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
        } else if (cur_xor == 3) {
          // ref was hom rare
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_write3v = _mm_sub_pd(*rem_write3v, vxx);
            rem_write3v++;
            *rem_write4v = _mm_sub_pd(*rem_write4v, _mm_mul_pd(vxx, vxx));
            rem_write4v++;
          }
          rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
          rem_write4v = (__m128d*)(&(outbufs[3 * perm_vec_ctcl8m]));
        } else {
          // ref was missing
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_write5v = _mm_sub_pd(*rem_write5v, vxx);
            rem_write5v++;
            *rem_write6v = _mm_sub_pd(*rem_write6v, _mm_mul_pd(vxx, vxx));
            rem_write6v++;
          }
          rem_write5v = (__m128d*)(&(outbufs[4 * perm_vec_ctcl8m]));
          rem_write6v = (__m128d*)(&(outbufs[5 * perm_vec_ctcl8m]));
        }
      } else if (cur_raw == 2) {
        // current: het (always add to rows 0/1)
        if (cur_xor == 1) {
          // ref was hom common
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_add_pd(*rem_writev, vxx);
            rem_writev++;
            *rem_write2v = _mm_add_pd(*rem_write2v, _mm_mul_pd(vxx, vxx));
            rem_write2v++;
          }
        } else if (cur_xor == 2) {
          // ref was hom rare: also back out rows 2/3
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_add_pd(*rem_writev, vxx);
            rem_writev++;
            *rem_write3v = _mm_sub_pd(*rem_write3v, vxx);
            rem_write3v++;
            vxx = _mm_mul_pd(vxx, vxx);
            *rem_write2v = _mm_add_pd(*rem_write2v, vxx);
            rem_write2v++;
            *rem_write4v = _mm_sub_pd(*rem_write4v, vxx);
            rem_write4v++;
          }
          rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
          rem_write4v = (__m128d*)(&(outbufs[3 * perm_vec_ctcl8m]));
        } else {
          // ref was missing: also back out rows 4/5
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_add_pd(*rem_writev, vxx);
            rem_writev++;
            *rem_write5v = _mm_sub_pd(*rem_write5v, vxx);
            rem_write5v++;
            vxx = _mm_mul_pd(vxx, vxx);
            *rem_write2v = _mm_add_pd(*rem_write2v, vxx);
            rem_write2v++;
            *rem_write6v = _mm_sub_pd(*rem_write6v, vxx);
            rem_write6v++;
          }
          rem_write5v = (__m128d*)(&(outbufs[4 * perm_vec_ctcl8m]));
          rem_write6v = (__m128d*)(&(outbufs[5 * perm_vec_ctcl8m]));
        }
        rem_writev = (__m128d*)outbufs;
        rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
      } else if (!cur_raw) {
        // current: hom rare (always add to rows 2/3)
        if (cur_xor == 3) {
          // ref was hom common
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_write3v = _mm_add_pd(*rem_write3v, vxx);
            rem_write3v++;
            *rem_write4v = _mm_add_pd(*rem_write4v, _mm_mul_pd(vxx, vxx));
            rem_write4v++;
          }
        } else if (cur_xor == 2) {
          // ref was het: also back out rows 0/1
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_sub_pd(*rem_writev, vxx);
            rem_writev++;
            *rem_write3v = _mm_add_pd(*rem_write3v, vxx);
            rem_write3v++;
            vxx = _mm_mul_pd(vxx, vxx);
            *rem_write2v = _mm_sub_pd(*rem_write2v, vxx);
            rem_write2v++;
            *rem_write4v = _mm_add_pd(*rem_write4v, vxx);
            rem_write4v++;
          }
          rem_writev = (__m128d*)outbufs;
          rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
        } else {
          // ref was missing: also back out rows 4/5
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_write3v = _mm_add_pd(*rem_write3v, vxx);
            rem_write3v++;
            *rem_write5v = _mm_sub_pd(*rem_write5v, vxx);
            rem_write5v++;
            vxx = _mm_mul_pd(vxx, vxx);
            *rem_write4v = _mm_add_pd(*rem_write4v, vxx);
            rem_write4v++;
            *rem_write6v = _mm_sub_pd(*rem_write6v, vxx);
            rem_write6v++;
          }
          rem_write5v = (__m128d*)(&(outbufs[4 * perm_vec_ctcl8m]));
          rem_write6v = (__m128d*)(&(outbufs[5 * perm_vec_ctcl8m]));
        }
        rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
        rem_write4v = (__m128d*)(&(outbufs[3 * perm_vec_ctcl8m]));
      } else {
        // current: missing (always add to rows 4/5)
        if (cur_xor == 2) {
          // ref was hom common
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_write5v = _mm_add_pd(*rem_write5v, vxx);
            rem_write5v++;
            *rem_write6v = _mm_add_pd(*rem_write6v, _mm_mul_pd(vxx, vxx));
            rem_write6v++;
          }
        } else if (cur_xor == 3) {
          // ref was het: also back out rows 0/1
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_writev = _mm_sub_pd(*rem_writev, vxx);
            rem_writev++;
            *rem_write5v = _mm_add_pd(*rem_write5v, vxx);
            rem_write5v++;
            vxx = _mm_mul_pd(vxx, vxx);
            *rem_write2v = _mm_sub_pd(*rem_write2v, vxx);
            rem_write2v++;
            *rem_write6v = _mm_add_pd(*rem_write6v, vxx);
            rem_write6v++;
          }
          rem_writev = (__m128d*)outbufs;
          rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
        } else {
          // ref was hom rare: also back out rows 2/3
          for (ukk = 0; ukk < loop_len; ukk++) {
            vxx = *perm_readv++;
            *rem_write3v = _mm_sub_pd(*rem_write3v, vxx);
            rem_write3v++;
            *rem_write5v = _mm_add_pd(*rem_write5v, vxx);
            rem_write5v++;
            vxx = _mm_mul_pd(vxx, vxx);
            *rem_write4v = _mm_sub_pd(*rem_write4v, vxx);
            rem_write4v++;
            *rem_write6v = _mm_add_pd(*rem_write6v, vxx);
            rem_write6v++;
          }
          rem_write3v = (__m128d*)(&(outbufs[2 * perm_vec_ctcl8m]));
          rem_write4v = (__m128d*)(&(outbufs[3 * perm_vec_ctcl8m]));
        }
        rem_write5v = (__m128d*)(&(outbufs[4 * perm_vec_ctcl8m]));
        rem_write6v = (__m128d*)(&(outbufs[5 * perm_vec_ctcl8m]));
      }
#else
      // 32-bit scalar mirror of the SSE2 branch tree above
      perm_read = &(perm_vecstd[ujj * row_mult]);
      if (cur_raw == 3) {
        if (cur_xor == 1) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write -= dxx;
            rem_write++;
            *rem_write2 -= dxx * dxx;
            rem_write2++;
          }
          rem_write = outbufs;
          rem_write2 = &(outbufs[perm_vec_ctcl8m]);
        } else if (cur_xor == 3) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write3 -= dxx;
            rem_write3++;
            *rem_write4 -= dxx * dxx;
            rem_write4++;
          }
          rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
          rem_write4 = &(outbufs[3 * perm_vec_ctcl8m]);
        } else {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write5 -= dxx;
            rem_write5++;
            *rem_write6 -= dxx * dxx;
            rem_write6++;
          }
          rem_write5 = &(outbufs[4 * perm_vec_ctcl8m]);
          rem_write6 = &(outbufs[5 * perm_vec_ctcl8m]);
        }
      } else if (cur_raw == 2) {
        if (cur_xor == 1) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write += dxx;
            rem_write++;
            *rem_write2 += dxx * dxx;
            rem_write2++;
          }
        } else if (cur_xor == 2) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write += dxx;
            rem_write++;
            *rem_write3 -= dxx;
            rem_write3++;
            dxx *= dxx;
            *rem_write2 += dxx;
            rem_write2++;
            *rem_write4 -= dxx;
            rem_write4++;
          }
          rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
          rem_write4 = &(outbufs[3 * perm_vec_ctcl8m]);
        } else {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write += dxx;
            rem_write++;
            *rem_write5 -= dxx;
            rem_write5++;
            dxx *= dxx;
            *rem_write2 += dxx;
            rem_write2++;
            *rem_write6 -= dxx;
            rem_write6++;
          }
          rem_write5 = &(outbufs[4 * perm_vec_ctcl8m]);
          rem_write6 = &(outbufs[5 * perm_vec_ctcl8m]);
        }
        rem_write = outbufs;
        rem_write2 = &(outbufs[perm_vec_ctcl8m]);
      } else if (!cur_raw) {
        if (cur_xor == 3) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write3 += dxx;
            rem_write3++;
            *rem_write4 += dxx * dxx;
            rem_write4++;
          }
        } else if (cur_xor == 2) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write -= dxx;
            rem_write++;
            *rem_write3 += dxx;
            rem_write3++;
            dxx *= dxx;
            *rem_write2 -= dxx;
            rem_write2++;
            *rem_write4 += dxx;
            rem_write4++;
          }
          rem_write = outbufs;
          rem_write2 = &(outbufs[perm_vec_ctcl8m]);
        } else {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write3 += dxx;
            rem_write3++;
            *rem_write5 -= dxx;
            rem_write5++;
            dxx *= dxx;
            *rem_write4 += dxx;
            rem_write4++;
            *rem_write6 -= dxx;
            rem_write6++;
          }
          rem_write5 = &(outbufs[4 * perm_vec_ctcl8m]);
          rem_write6 = &(outbufs[5 * perm_vec_ctcl8m]);
        }
        rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
        rem_write4 = &(outbufs[3 * perm_vec_ctcl8m]);
      } else {
        if (cur_xor == 2) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write5 += dxx;
            rem_write5++;
            *rem_write6 += dxx * dxx;
            rem_write6++;
          }
        } else if (cur_xor == 3) {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write -= dxx;
            rem_write++;
            *rem_write5 += dxx;
            rem_write5++;
            dxx *= dxx;
            *rem_write2 -= dxx;
            rem_write2++;
            *rem_write6 += dxx;
            rem_write6++;
          }
          rem_write = outbufs;
          rem_write2 = &(outbufs[perm_vec_ctcl8m]);
        } else {
          for (ukk = 0; ukk < perm_vec_ct; ukk++) {
            dxx = *perm_read++;
            *rem_write3 -= dxx;
            rem_write3++;
            *rem_write5 += dxx;
            rem_write5++;
            dxx *= dxx;
            *rem_write4 -= dxx;
            rem_write4++;
            *rem_write6 += dxx;
            rem_write6++;
          }
          rem_write3 = &(outbufs[2 * perm_vec_ctcl8m]);
          rem_write4 = &(outbufs[3 * perm_vec_ctcl8m]);
        }
        rem_write5 = &(outbufs[4 * perm_vec_ctcl8m]);
        rem_write6 = &(outbufs[5 * perm_vec_ctcl8m]);
      }
#endif
      // clear the 2-bit difference we just processed
      ulxor &= ~((3 * ONELU) << ujj);
    }
    // advance to the phenotype rows for the next genotype-word of samples
#ifdef __LP64__
    permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
#else
    perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
#endif
  }
}
2112
void check_for_better_rem_cost(uintptr_t best_cost, uint32_t maxt_block_base, uint32_t maxt_block_base2, uint32_t maxt_block_base3, uintptr_t marker_idx, uint32_t* __restrict__ missing_cts, uint32_t* __restrict__ homcom_cts, uint32_t* __restrict__ het_cts, uint16_t* ldrefs, uint32_t pheno_nm_ct, int32_t missing_ct, int32_t het_ct, int32_t homcom_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_cur, uint32_t* ldrefp) {
  // Scan previously-processed markers in the current block for a PERMORY-style
  // LD-difference update cheaper than plain genotype indexing.
  //
  // Effective inner loop iterations for LD exploitation are roughly
  //   2 * (<-> neither side homcom) + (<-> homcom) + constant,
  // so max(delta(homcom), delta(non-homcom)) is a cheap lower bound; the
  // exact rem_cost() is only computed when that bound beats best_cost.
  // On success, *ldrefp receives the winning marker's within-block index.
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  int32_t homrar_ct = pheno_nm_ct - missing_ct - het_ct - homcom_ct;
  uint32_t scan_idx = maxt_block_base;
  uint32_t scan_ceil = maxt_block_base2;
  while (1) {
    if (scan_idx == maxt_block_base2) {
      // first segment exhausted; continue with this thread's own segment
      scan_idx = maxt_block_base3;
      scan_ceil = marker_idx;
    }
    for (; scan_idx < scan_ceil; scan_idx++) {
      if (ldrefs[scan_idx] == 65535) {
	// candidate not processed yet; nothing cached to diff against
	continue;
      }
      int32_t missing_ct2 = missing_cts[scan_idx];
      int32_t homcom_ct2 = homcom_cts[scan_idx];
      int32_t het_ct2 = het_cts[scan_idx];
      int32_t homrar_ct2 = pheno_nm_ct - missing_ct2 - het_ct2 - homcom_ct2;
      uintptr_t homcom_delta = labs(homcom_ct - homcom_ct2);
      uintptr_t other_delta = labs(missing_ct - missing_ct2) + labs(homrar_ct - homrar_ct2) + labs(het_ct - het_ct2);
      uintptr_t lower_bound = MAXV(homcom_delta, other_delta);
      if (lower_bound >= best_cost) {
	continue;
      }
      // lower bound beats the incumbent; pay for the exact cost computation
      uint32_t candidate_bidx = scan_idx - maxt_block_base;
      uintptr_t exact_cost = rem_cost(pheno_nm_ctv2, &(loadbuf[candidate_bidx * pheno_nm_ctv2]), loadbuf_cur);
      if (exact_cost < best_cost) {
	*ldrefp = candidate_bidx;
	best_cost = exact_cost;
      }
    }
    if (scan_idx >= marker_idx) {
      return;
    }
  }
}
2158
// multithread globals
// per-marker p-values from the original (unpermuted) test; -9 marks
// invalid/skipped markers
static double* g_orig_pvals;
// per-marker chi-square statistics from the original test (used when not in
// Fisher mode)
static double* g_orig_chisq;
// when non-null, every per-permutation statistic is dumped here
// (presumably --mperm-save-all output; verify against caller)
static double* g_mperm_save_all;

// A separated-low-and-high-bit format was tried, and found to not really be
// any better than the usual PLINK 2-bit format.
static uintptr_t* g_loadbuf;

static uint32_t* g_perm_vecst; // genotype indexing support
static uint32_t* g_thread_git_wkspace;
static uint32_t* g_resultbuf;

// always use genotype indexing for QT --assoc
static double* g_thread_git_qbufs;
static double* g_qresultbuf;
static double g_pheno_sum;
static double g_pheno_ssq;
// per-marker within-block LD reference indices; 65535 = not yet chosen
static uint16_t* g_ldrefs;
static double* g_orig_linsq; // square of Lin t-statistic

// maximum number of precomputed table entries per marker
static uint32_t g_precomp_width;
// precomputed table contains entries for missing_cts ranging from
// g_precomp_start[marker_bidx] to
// (g_precomp_start[marker_bidx] + g_precomp_width - 1).
static uint32_t g_precomp_start[MODEL_BLOCKSIZE];

// Space for precomputed tables to accelerate permutation p-value computations.
// The sizing and usage of this space varies depending on the permutation
// analysis requested.  (The main objective here is to bring Fisher 2x2 exact
// p-values to the masses.  There's a very minor chi-square speedup as well;
// it's really only present because it allowed for simpler debugging of parts
// of the Fisher logic.)
//
// In what follows,
//   n := (g_precomp_width * marker_bidx) + missing_ct -
//        g_precomp_start[marker_bidx].
//
// For --assoc perm/--model {dom|rec|trend} perm:
//   g_precomp_ui[4n] and [4n + 1] define the interval with less extreme
//   p-values than the original.  [4n + 2] and [4n + 3] define the
//   interval with less or equally extreme p-values.
//
// For --assoc mperm fisher/--model {dom|rec} fisher:
//   g_precomp_ui[6n]...[6n + 3] is as in --assoc perm.
//   g_precomp_ui[6n + 4] and [6n + 5] are the floor and offset for the
//   range of case_set_cts where Fisher p-value calculation is unnecessary.
//   g_precomp_d[2n] and [2n + 1] are tot_prob and right_prob for
//   fisher22_tail_pval().  (This is almost irrelevant.)
//
// For --assoc mperm/--model {dom|rec|trend} mperm:
//   g_precomp_ui is as in --assoc mperm fisher.
//   g_precomp_d[2n] and [2n + 1] are expm11 and recip_sum from
//   chi22_get_coeffs()/ca_trend_get_coeffs().
//
// For --model perm-gen:
//   No precomputation at all.
//
// For regular --model perm:
//   g_precomp_ui[12n] to [12n + 3] cover the allelic test, [12n + 4] to
//   [12n + 7] cover the dom test, and [12n + 8] to [12n + 11] cover rec.
//   [12n + 4] is 0xffffffff if the dom and rec tests should be skipped.
//
// For regular --model mperm fisher:
//   g_precomp_ui[18n] to [18n + 5] cover the allelic test, etc.
//   g_precomp_d[6n] to [6n + 1] are fisher22_tail_pval() coefficients for the
//   allelic test, etc.
//
// For regular --model mperm:
//   g_precomp_ui as in --model mperm fisher.
//   g_precomp_d[6n] and [6n + 1] are expm11 and recip_sum for the allelic
//   test, etc.
//
static uint32_t* g_precomp_ui;
static double* g_precomp_d;

// X-chromosome: number of missing allele observations per marker relative to
//   *all female* case (so all males automatically contribute at least 1)
// elsewhere: number of missing samples for each marker
static uint32_t* g_missing_cts;

static uint32_t* g_set_cts;
static uint32_t* g_het_cts;
static uint32_t* g_homcom_cts;

// This is *twice* the number of successes, because PLINK 1.07 counts tie as
// 0.5.  (Actually, it randomizes instead of deterministically adding 0.5; this
// randomization just adds noise so we don't replicate it.)
static uint32_t* g_perm_2success_ct;
static uint32_t* g_perm_attempt_ct;
// most extreme statistic observed in each permutation, across all markers
static double* g_maxt_extreme_stat;
// per-thread scratch rows of g_maxt_extreme_stat (one row per worker thread)
static double* g_maxt_thread_results;

// to avoid pathological multithreading issues, this is not a bitset
static unsigned char* g_perm_adapt_stop;

// maps within-block marker indices to global marker indices for the adaptive
// permutation workers
static uint32_t g_adapt_m_table[MODEL_BLOCKSIZE];
static uintptr_t* g_sample_nonmale_include2;
static uintptr_t* g_sample_male_include2;
static uintptr_t* g_is_invalid_bitfield;
static uint32_t g_model_fisher;
static uint32_t g_fisher_midp;
static uint32_t g_assoc_thread_ct;
static uint32_t g_maxt_block_base;
static uint32_t g_block_start;
static uint32_t g_qblock_start;
static uint32_t g_block_diff;
static uint32_t g_perms_done;
static uint32_t g_first_adapt_check;
static uint32_t g_male_ct;
static double g_adaptive_intercept;
static double g_adaptive_slope;
static double g_aperm_alpha;
static double g_adaptive_ci_zt;
static uint32_t g_is_x;
static uint32_t g_is_y;

// X, Y, MT.  note that X, and now MT as well, have max ploidy 2
static uint32_t g_min_ploidy_1;

static int32_t g_is_model_prec;

static uint32_t* g_male_case_cts;
2283
assoc_adapt_thread(void * arg)2284 THREAD_RET_TYPE assoc_adapt_thread(void* arg) {
2285 uintptr_t tidx = (uintptr_t)arg;
2286 uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
2287 uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
2288 uintptr_t perm_vec_ct = g_perm_vec_ct;
2289 uint32_t assoc_thread_ct = g_assoc_thread_ct;
2290 uint32_t pidx_offset = g_perms_done - perm_vec_ct;
2291 uint32_t model_fisher = g_model_fisher;
2292 uint32_t fisher_midp = g_fisher_midp;
2293 uint32_t precomp_width = g_precomp_width;
2294 uint32_t first_adapt_check = g_first_adapt_check;
2295 uint32_t case_ct = g_perm_case_ct;
2296 uintptr_t* __restrict__ male_vec = g_sample_male_include2;
2297 uintptr_t* __restrict__ nonmale_vec = g_sample_nonmale_include2;
2298 uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
2299 uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
2300 uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
2301 unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
2302 double adaptive_intercept = g_adaptive_intercept;
2303 double adaptive_slope = g_adaptive_slope;
2304 double adaptive_ci_zt = g_adaptive_ci_zt;
2305 double aperm_alpha = g_aperm_alpha;
2306 uintptr_t* __restrict__ loadbuf;
2307 double* __restrict__ orig_pvals;
2308 double* __restrict__ orig_chisq;
2309 uint32_t* __restrict__ missing_cts;
2310 uint32_t* __restrict__ set_cts;
2311 uint32_t* __restrict__ precomp_start;
2312 uint32_t* __restrict__ precomp_ui;
2313 uint32_t* gpui;
2314 uintptr_t marker_idx;
2315 uintptr_t pidx;
2316 uint32_t marker_bidx;
2317 uint32_t marker_bceil;
2318 uint32_t min_ploidy_1;
2319 uint32_t is_x;
2320 uint32_t is_y;
2321 uint32_t success_2start;
2322 uint32_t success_2incr;
2323 uint32_t next_adapt_check;
2324 uint32_t min_ploidy;
2325 intptr_t row1x_sum;
2326 intptr_t col1_sum;
2327 intptr_t col2_sum;
2328 intptr_t tot_obs;
2329 uint32_t missing_start;
2330 uint32_t case_set_ct;
2331 uint32_t case_missing_ct;
2332 uint32_t uii;
2333 double stat_high;
2334 double stat_low;
2335 double pval;
2336 double dxx;
2337 double dyy;
2338 double dzz;
2339 while (1) {
2340 if (g_block_diff <= assoc_thread_ct) {
2341 if (g_block_diff <= tidx) {
2342 goto assoc_adapt_thread_skip_all;
2343 }
2344 marker_bidx = g_block_start + tidx;
2345 marker_bceil = marker_bidx + 1;
2346 } else {
2347 marker_bidx = g_block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
2348 marker_bceil = g_block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
2349 }
2350 min_ploidy_1 = g_min_ploidy_1;
2351 loadbuf = g_loadbuf;
2352 orig_pvals = g_orig_pvals;
2353 orig_chisq = g_orig_chisq;
2354 missing_cts = g_missing_cts;
2355 set_cts = g_set_cts;
2356 precomp_start = g_precomp_start;
2357 precomp_ui = g_precomp_ui;
2358 is_x = g_is_x;
2359 is_y = g_is_y;
2360 if (min_ploidy_1) {
2361 min_ploidy = 1;
2362 } else {
2363 min_ploidy = 2;
2364 }
2365 for (; marker_bidx < marker_bceil; marker_bidx++) {
2366 // guaranteed during loading that g_perm_adapt_stop[] is not set yet
2367 marker_idx = g_adapt_m_table[marker_bidx];
2368 next_adapt_check = first_adapt_check;
2369 col1_sum = set_cts[marker_idx];
2370 if (is_x) {
2371 row1x_sum = 2 * case_ct;
2372 tot_obs = 2 * pheno_nm_ct - missing_cts[marker_idx];
2373 } else {
2374 row1x_sum = min_ploidy * case_ct;
2375 tot_obs = min_ploidy * (pheno_nm_ct - missing_cts[marker_idx]);
2376 }
2377 col2_sum = tot_obs - col1_sum;
2378 missing_start = precomp_start[marker_bidx];
2379 gpui = &(precomp_ui[4 * precomp_width * marker_bidx]);
2380 success_2start = perm_2success_ct[marker_idx];
2381 success_2incr = 0;
2382 if (orig_pvals[marker_idx] == -9) {
2383 perm_adapt_stop[marker_idx] = 1;
2384 perm_attempt_ct[marker_idx] = next_adapt_check;
2385 perm_2success_ct[marker_idx] = next_adapt_check;
2386 continue;
2387 }
2388 if (model_fisher) {
2389 stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
2390 stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
2391 } else {
2392 stat_high = orig_chisq[marker_idx] + EPSILON;
2393 stat_low = orig_chisq[marker_idx] - EPSILON;
2394 }
2395 for (pidx = 0; pidx < perm_vec_ct;) {
2396 if (!min_ploidy_1) {
2397 genovec_set_freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2398 } else if (is_x) {
2399 genovec_set_freq_x(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), male_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2400 } else if (!is_y) {
2401 genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &uii, &case_set_ct);
2402 case_missing_ct += uii;
2403 } else {
2404 genovec_set_freq_y(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), nonmale_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2405 }
2406 // deliberate underflow
2407 uii = (uint32_t)(case_missing_ct - missing_start);
2408 if (uii < precomp_width) {
2409 if (case_set_ct < gpui[4 * uii]) {
2410 if (case_set_ct < gpui[4 * uii + 2]) {
2411 success_2incr += 2;
2412 } else {
2413 success_2incr++;
2414 }
2415 } else {
2416 if (case_set_ct >= gpui[4 * uii + 1]) {
2417 if (case_set_ct >= gpui[4 * uii + 3]) {
2418 success_2incr += 2;
2419 } else {
2420 success_2incr++;
2421 }
2422 }
2423 }
2424 } else {
2425 uii = row1x_sum - case_missing_ct * min_ploidy; // row1_sum
2426 if (model_fisher) {
2427 dxx = fisher22(case_set_ct, uii - case_set_ct, col1_sum - case_set_ct, col2_sum + case_set_ct - uii, fisher_midp);
2428 if (dxx < stat_low) {
2429 success_2incr += 2;
2430 } else if (dxx <= stat_high) {
2431 success_2incr++;
2432 }
2433 } else {
2434 dxx = chi22_eval(case_set_ct, uii, col1_sum, tot_obs);
2435 if (dxx > stat_high) {
2436 success_2incr += 2;
2437 } else {
2438 success_2incr++;
2439 }
2440 }
2441 }
2442 if (++pidx == next_adapt_check - pidx_offset) {
2443 uii = success_2start + success_2incr;
2444 if (uii) {
2445 pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
2446 dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
2447 dyy = pval - dxx; // lower bound
2448 dzz = pval + dxx; // upper bound
2449 if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
2450 perm_adapt_stop[marker_idx] = 1;
2451 perm_attempt_ct[marker_idx] = next_adapt_check;
2452 break;
2453 }
2454 }
2455 next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
2456 }
2457 }
2458 perm_2success_ct[marker_idx] += success_2incr;
2459 }
2460 assoc_adapt_thread_skip_all:
2461 if ((!tidx) || g_is_last_thread_block) {
2462 THREAD_RETURN;
2463 }
2464 THREAD_BLOCK_FINISH(tidx);
2465 }
2466 }
2467
assoc_maxt_thread(void * arg)2468 THREAD_RET_TYPE assoc_maxt_thread(void* arg) {
2469 uintptr_t tidx = (uintptr_t)arg;
2470 uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
2471 uintptr_t perm_vec_ct = g_perm_vec_ct;
2472 uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
2473 uint32_t assoc_thread_ct = g_assoc_thread_ct;
2474 uint32_t pidx_offset = g_perms_done - perm_vec_ct;
2475 uint32_t model_fisher = g_model_fisher;
2476 uint32_t fisher_midp = g_fisher_midp;
2477
2478 // currently safe for this to be uint32_t since perm_vec_ct < 2^30
2479 uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
2480 uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
2481 uint32_t* git_homrar_cts = nullptr;
2482 uint32_t* git_missing_cts = nullptr;
2483 uint32_t* git_het_cts = nullptr;
2484 uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
2485 uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
2486 double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
2487 uint32_t precomp_width = g_precomp_width;
2488 uint32_t case_ct = g_perm_case_ct;
2489 uintptr_t* __restrict__ male_vec = g_sample_male_include2;
2490 uintptr_t* __restrict__ nonmale_vec = g_sample_nonmale_include2;
2491 uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
2492 uint32_t* __restrict__ perm_vecst = g_perm_vecst;
2493 uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
2494 double* __restrict__ mperm_save_all = g_mperm_save_all;
2495 double* msa_ptr = nullptr;
2496 uintptr_t* __restrict__ loadbuf;
2497 uint32_t* __restrict__ missing_cts;
2498 uint32_t* __restrict__ set_cts;
2499 uint32_t* __restrict__ het_cts;
2500 uint32_t* __restrict__ homcom_cts;
2501 uint32_t* __restrict__ precomp_start;
2502 uint32_t* __restrict__ precomp_ui;
2503 double* __restrict__ precomp_d;
2504 double* __restrict__ orig_pvals;
2505 double* __restrict__ orig_chisq;
2506 uint16_t* ldrefs;
2507 uintptr_t* loadbuf_cur;
2508 uint32_t* resultbuf;
2509 uint32_t* gpui;
2510 double* gpd;
2511 uintptr_t pidx;
2512 uintptr_t marker_idx;
2513 intptr_t row1x_sum;
2514 intptr_t col1_sum;
2515 intptr_t col2_sum;
2516 intptr_t tot_obs;
2517 uint32_t block_start;
2518 uint32_t maxt_block_base;
2519 uint32_t maxt_block_base2;
2520 uint32_t marker_bidx_start;
2521 uint32_t maxt_block_base3;
2522 uint32_t marker_bidx;
2523 uint32_t marker_bceil;
2524 uint32_t is_x;
2525 uint32_t is_x_or_y;
2526 uint32_t min_ploidy_1;
2527 uint32_t min_ploidy;
2528 uint32_t success_2incr;
2529 uint32_t missing_start;
2530 uint32_t case_set_ct;
2531 uint32_t case_missing_ct;
2532 uint32_t uii;
2533 uint32_t ujj;
2534 uint32_t ukk;
2535 double stat_high;
2536 double stat_low;
2537 double sval;
2538 uint32_t missing_ct;
2539 uint32_t het_ct;
2540 uint32_t homcom_ct;
2541 uint32_t ldref;
2542 while (1) {
2543 block_start = g_block_start;
2544 if (g_block_diff <= assoc_thread_ct) {
2545 if (g_block_diff <= tidx) {
2546 goto assoc_maxt_thread_skip_all;
2547 }
2548 marker_bidx_start = block_start + tidx;
2549 marker_bceil = marker_bidx_start + 1;
2550 } else {
2551 marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
2552 marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
2553 }
2554 maxt_block_base = g_maxt_block_base;
2555 maxt_block_base2 = maxt_block_base + block_start;
2556 maxt_block_base3 = maxt_block_base + marker_bidx_start;
2557 marker_bidx = marker_bidx_start;
2558 marker_idx = maxt_block_base3;
2559 is_x = g_is_x;
2560 is_x_or_y = is_x || g_is_y;
2561 min_ploidy_1 = g_min_ploidy_1;
2562 memcpy(results, &(g_maxt_extreme_stat[pidx_offset]), perm_vec_ct * sizeof(double));
2563 if (min_ploidy_1) {
2564 min_ploidy = 1;
2565 } else {
2566 min_ploidy = 2;
2567 }
2568 loadbuf = g_loadbuf;
2569 missing_cts = g_missing_cts;
2570 set_cts = g_set_cts;
2571 het_cts = g_het_cts;
2572 homcom_cts = g_homcom_cts;
2573 precomp_start = g_precomp_start;
2574 precomp_ui = g_precomp_ui;
2575 precomp_d = g_precomp_d;
2576 orig_pvals = g_orig_pvals;
2577 orig_chisq = g_orig_chisq;
2578 resultbuf = g_resultbuf;
2579 ldrefs = g_ldrefs;
2580
2581 if (mperm_save_all) {
2582 msa_ptr = &(mperm_save_all[marker_idx * perm_vec_ct]);
2583 }
2584 for (; marker_bidx < marker_bceil; marker_bidx++) {
2585 if (orig_pvals[marker_idx] == -9) {
2586 if (msa_ptr) {
2587 for (pidx = 0; pidx < perm_vec_ct; pidx++) {
2588 *msa_ptr++ = -9;
2589 }
2590 }
2591 perm_2success_ct[marker_idx++] += perm_vec_ct;
2592 continue;
2593 }
2594 if (model_fisher) {
2595 stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
2596 stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
2597 } else {
2598 stat_high = orig_chisq[marker_idx] + EPSILON;
2599 stat_low = orig_chisq[marker_idx] - EPSILON;
2600 }
2601 gpd = &(precomp_d[2 * precomp_width * marker_bidx]);
2602 col1_sum = set_cts[marker_idx];
2603 missing_ct = missing_cts[marker_idx];
2604 if (is_x) {
2605 row1x_sum = 2 * case_ct;
2606 tot_obs = 2 * pheno_nm_ct - missing_ct;
2607 } else {
2608 row1x_sum = min_ploidy * case_ct;
2609 tot_obs = min_ploidy * (pheno_nm_ct - missing_ct);
2610 }
2611 col2_sum = tot_obs - col1_sum;
2612 gpui = &(precomp_ui[6 * precomp_width * marker_bidx]);
2613 missing_start = precomp_start[marker_bidx];
2614 success_2incr = 0;
2615 loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
2616 if (!is_x_or_y) {
2617 ldref = ldrefs[marker_idx];
2618 if (!min_ploidy_1) {
2619 het_ct = het_cts[marker_idx];
2620 homcom_ct = (col1_sum - het_ct) / 2;
2621 } else {
2622 het_ct = 0;
2623 homcom_ct = col1_sum;
2624 }
2625 git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
2626 git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
2627 git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
2628 if (ldref == 65535) {
2629 ldref = marker_bidx;
2630 if (pheno_nm_ct - homcom_ct > 50) {
2631 check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 50, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
2632 }
2633 ldrefs[marker_idx] = ldref;
2634 }
2635 if (ldref == marker_bidx) {
2636 fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
2637 calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
2638 fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
2639 } else {
2640 memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
2641 calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
2642 }
2643 }
2644 for (pidx = 0; pidx < perm_vec_ct; pidx++) {
2645 if (!is_x_or_y) {
2646 if (!min_ploidy_1) {
2647 case_missing_ct = git_missing_cts[pidx];
2648 case_set_ct = row1x_sum - (git_het_cts[pidx] + 2 * (case_missing_ct + git_homrar_cts[pidx]));
2649 } else {
2650 case_missing_ct = git_missing_cts[pidx] + git_het_cts[pidx];
2651 case_set_ct = row1x_sum - case_missing_ct - git_homrar_cts[pidx];
2652 }
2653 } else {
2654 if (is_x) {
2655 genovec_set_freq_x(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), male_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2656 } else {
2657 genovec_set_freq_y(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), nonmale_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
2658 }
2659 }
2660 // deliberate underflow
2661 uii = (uint32_t)(case_missing_ct - missing_start);
2662 if (uii < precomp_width) {
2663 if (case_set_ct < gpui[6 * uii]) {
2664 if (case_set_ct < gpui[6 * uii + 2]) {
2665 success_2incr += 2;
2666 } else {
2667 success_2incr++;
2668 }
2669 } else {
2670 if (case_set_ct >= gpui[6 * uii + 1]) {
2671 if (case_set_ct >= gpui[6 * uii + 3]) {
2672 success_2incr += 2;
2673 } else {
2674 success_2incr++;
2675 }
2676 }
2677 }
2678 ukk = gpui[6 * uii + 4];
2679 ujj = (uint32_t)(case_set_ct - ukk); // deliberate underflow
2680 if (ujj >= gpui[6 * uii + 5]) {
2681 if (model_fisher) {
2682 ujj = row1x_sum - case_missing_ct * min_ploidy;
2683 // sval = fisher22(case_set_ct, ujj - case_set_ct, col1_sum - case_set_ct, col2_sum + case_set_ct - ujj);
2684 sval = fisher22_tail_pval(ukk, ujj - ukk, col1_sum - ukk, col2_sum + ukk - ujj, gpui[6 * uii + 5] - 1, gpd[2 * uii], gpd[2 * uii + 1], fisher_midp, case_set_ct);
2685 if (results[pidx] > sval) {
2686 results[pidx] = sval;
2687 }
2688 } else {
2689 sval = ((double)((intptr_t)case_set_ct)) - gpd[2 * uii];
2690 sval = sval * sval * gpd[2 * uii + 1];
2691 if (results[pidx] < sval) {
2692 results[pidx] = sval;
2693 }
2694 }
2695 }
2696 } else {
2697 uii = row1x_sum - case_missing_ct * min_ploidy;
2698 if (model_fisher) {
2699 sval = fisher22(case_set_ct, uii - case_set_ct, col1_sum - case_set_ct, col2_sum + case_set_ct - uii, fisher_midp);
2700 if (sval < stat_low) {
2701 success_2incr += 2;
2702 } else if (sval <= stat_high) {
2703 success_2incr++;
2704 }
2705 if (results[pidx] > sval) {
2706 results[pidx] = sval;
2707 }
2708 } else {
2709 sval = chi22_eval(case_set_ct, uii, col1_sum, tot_obs);
2710 if (sval > stat_high) {
2711 success_2incr += 2;
2712 } else if (sval > stat_low) {
2713 success_2incr++;
2714 }
2715 if (results[pidx] < sval) {
2716 results[pidx] = sval;
2717 }
2718 }
2719 if (msa_ptr) {
2720 *msa_ptr++ = sval;
2721 }
2722 }
2723 }
2724 perm_2success_ct[marker_idx++] += success_2incr;
2725 }
2726 assoc_maxt_thread_skip_all:
2727 if ((!tidx) || g_is_last_thread_block) {
2728 THREAD_RETURN;
2729 }
2730 THREAD_BLOCK_FINISH(tidx);
2731 }
2732 }
2733
THREAD_RET_TYPE assoc_set_thread(void* arg) {
  // Basically a simplified version of what assoc_maxt_thread() does; we save
  // chi-square stats for the given number of permutations for all still-active
  // variants. Adaptive pruning, if applicable, happens outside this loop.
  //
  // LD-exploitation should be added if this sees significant usage.
  // (possible todo: permit Fisher test, converting p-values into equivalent
  // chi-square stats?)
  uintptr_t tidx = (uintptr_t)arg;  // this worker's 0-based thread index
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;  // samples with nonmissing phenotype
  uint32_t assoc_thread_ct = g_assoc_thread_ct;
  uintptr_t perm_vec_ct = g_perm_vec_ct;  // permutations in the current batch
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
  // per-thread scratch buffer used (and rezeroed below) by calc_git()
  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
  // per-permutation case genotype counts produced by calc_git(); three
  // contiguous arrays of perm_vec_ctcl4m entries each
  uint32_t* git_homrar_cts = nullptr;
  uint32_t* git_missing_cts = nullptr;
  uint32_t* git_het_cts = nullptr;
  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
  uint32_t* resultbuf = g_resultbuf;
  uint32_t case_ct = g_perm_case_ct;
  uintptr_t* __restrict__ male_vec = g_sample_male_include2;
  uintptr_t* __restrict__ nonmale_vec = g_sample_nonmale_include2;
  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
  uint32_t* __restrict__ perm_vecst = g_perm_vecst;
  double* msa_ptr = nullptr;  // output cursor into g_mperm_save_all
  uintptr_t* loadbuf;
  uintptr_t* loadbuf_cur;  // current variant's 2-bit genotype vector
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ set_allele_cts;
  uintptr_t pidx;
  uintptr_t marker_idx;
  intptr_t row1x_sum;  // case row total of the 2x2 allele-count table
  intptr_t col1_sum;   // A2 allele column total
  intptr_t tot_obs;    // total observed allele count (nonmissing)
  uint32_t block_start;
  uint32_t marker_bidx_start;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t is_x;
  uint32_t is_x_or_y;
  uint32_t min_ploidy_1;
  uint32_t min_ploidy;
  uint32_t case_set_ct;
  uint32_t case_missing_ct;
  uint32_t missing_ct;
  while (1) {
    // Partition the current variant block among worker threads.
    block_start = g_block_start;
    if (g_block_diff <= assoc_thread_ct) {
      // Fewer variants than threads: one variant per thread; surplus threads
      // skip straight to the end-of-block synchronization.
      if (g_block_diff <= tidx) {
        goto assoc_set_thread_skip_all;
      }
      marker_bidx_start = block_start + tidx;
      marker_bceil = marker_bidx_start + 1;
    } else {
      // Split the block evenly across all threads.
      marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
      marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
    }
    marker_bidx = marker_bidx_start;
    is_x = g_is_x;
    is_x_or_y = is_x || g_is_y;
    // min_ploidy == 1 when the current chromosome's minimum ploidy is 1
    // (haploid handling); 2 for ordinary diploid autosomes.
    min_ploidy_1 = g_min_ploidy_1;
    min_ploidy = 2;
    if (min_ploidy_1) {
      min_ploidy = 1;
    }
    loadbuf = g_loadbuf;
    missing_cts = g_missing_cts;
    set_allele_cts = g_set_cts;
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      marker_idx = g_adapt_m_table[marker_bidx];
      msa_ptr = &(g_mperm_save_all[marker_bidx * perm_vec_ct]);
      // 2x2 contingency-table margins shared by all permutations of this
      // variant (only the case/control split varies across permutations).
      col1_sum = set_allele_cts[marker_idx];
      missing_ct = missing_cts[marker_idx];
      if (is_x) {
        row1x_sum = 2 * case_ct;
        tot_obs = 2 * pheno_nm_ct - missing_ct;
      } else {
        row1x_sum = min_ploidy * case_ct;
        tot_obs = min_ploidy * (pheno_nm_ct - missing_ct);
      }
      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
      if (!is_x_or_y) {
        // Non-X/Y fast path: count case genotypes for all permutations at
        // once via the genotype-indexing routine.
        git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
        git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
        git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
        fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
        calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
        // calc_git() dirties the workspace; rezero it for the next variant.
        fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
      }
      for (pidx = 0; pidx < perm_vec_ct; pidx++) {
        if (!is_x_or_y) {
          if (!min_ploidy_1) {
            // Diploid: recover case A2 allele count from the per-permutation
            // homrar/het/missing genotype counts.
            case_missing_ct = git_missing_cts[pidx];
            case_set_ct = row1x_sum - (git_het_cts[pidx] + 2 * (case_missing_ct + git_homrar_cts[pidx]));
          } else {
            // Haploid: het calls are folded into the missing count.
            case_missing_ct = git_missing_cts[pidx] + git_het_cts[pidx];
            case_set_ct = row1x_sum - case_missing_ct - git_homrar_cts[pidx];
          }
        } else {
          // X/Y: recount directly from the genotype vector per permutation,
          // since male/nonmale handling prevents plain genotype indexing.
          if (is_x) {
            genovec_set_freq_x(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), male_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
          } else {
            genovec_set_freq_y(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), nonmale_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
          }
        }
        // Fisher's exact test not supported since we are adding raw chi-square
        // stats, so little to gain from precomputation
        *msa_ptr++ = chi22_eval(case_set_ct, row1x_sum - case_missing_ct * min_ploidy, col1_sum, tot_obs);
      }
    }
  assoc_set_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    // Wait for the main thread to load the next variant block.
    THREAD_BLOCK_FINISH(tidx);
  }
}
2852
THREAD_RET_TYPE qassoc_adapt_thread(void* arg) {
  // Adaptive permutation worker for quantitative-trait association.  Each
  // thread processes a slice of the current variant block; for each variant it
  // tallies how many permuted statistics meet or beat the original, and stops
  // permuting early once the confidence interval around the empirical p-value
  // excludes aperm_alpha.
  uintptr_t tidx = (uintptr_t)arg;  // this worker's 0-based thread index
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  // number of permutations completed in earlier batches; used to convert
  // batch-local pidx values to global permutation counts
  uint32_t pidx_offset = g_perms_done - perm_vec_ct;
  uint32_t first_adapt_check = g_first_adapt_check;
  uint32_t max_thread_ct = g_assoc_thread_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  // Per-thread genotype-indexed phenotype accumulators: three contiguous
  // arrays of perm_vec_ctcl8m doubles (genotype*pheno product, pheno sum over
  // missing-genotype samples, and the corresponding sum of squares).
  double* git_qt_g_prod = &(g_thread_git_qbufs[perm_vec_ctcl8m * tidx * 3]);
  double* git_qt_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 1)]);
  double* git_qt_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 2)]);
  double* __restrict__ perm_vecstd = g_perm_vecstd;
  unsigned char* perm_adapt_stop = g_perm_adapt_stop;
  uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  double adaptive_intercept = g_adaptive_intercept;
  double adaptive_slope = g_adaptive_slope;
  double adaptive_ci_zt = g_adaptive_ci_zt;
  double aperm_alpha = g_aperm_alpha;
  double pheno_sum = g_pheno_sum;  // phenotype sum over all nonmissing samples
  double pheno_ssq = g_pheno_ssq;  // phenotype sum of squares, same samples
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uintptr_t* __restrict__ loadbuf;
  double* __restrict__ orig_chiabs;
  uintptr_t next_cqg;  // first permutation index not yet covered by calc_qgit()
  uintptr_t marker_idx;
  uintptr_t pidx;
  uintptr_t ulii;
  intptr_t geno_sum;
  intptr_t geno_ssq;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t missing_ct;
  uint32_t het_ct;
  uint32_t homcom_ct;
  uint32_t homrar_ct;
  uint32_t nanal;  // number of samples analyzed (nonmissing genotype)
  uint32_t cur_thread_ct;
  uint32_t next_adapt_check;
  uint32_t success_2start;
  uint32_t success_2incr;
  uint32_t uii;
  double nanal_recip;
  double nanal_m1_recip;
  double geno_mean;
  double geno_var;
  double qt_sum;
  double qt_ssq;
  double qt_g_prod;
  double qt_mean;
  double qt_var;
  double qt_g_covar;
  double beta;
  double betasq;
  double dxx;
  double dyy;
  double dzz;
  double stat_high;
  double stat_low;
  double sval;
  while (1) {
    // Use fewer threads when the block is small, so each active thread gets
    // at least CACHELINE_DBL variants.
    cur_thread_ct = g_block_diff / CACHELINE_DBL;
    if (cur_thread_ct > max_thread_ct) {
      cur_thread_ct = max_thread_ct;
    } else if (!cur_thread_ct) {
      cur_thread_ct = 1;
    }
    if (cur_thread_ct <= tidx) {
      // surplus thread for this block; go straight to synchronization
      goto qassoc_adapt_thread_skip_all;
    }
    marker_bidx = g_qblock_start + (((uint64_t)tidx) * g_block_diff) / cur_thread_ct;
    marker_bceil = g_qblock_start + (((uint64_t)tidx + 1) * g_block_diff) / cur_thread_ct;
    loadbuf = g_loadbuf;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    orig_chiabs = g_orig_chisq;
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      marker_idx = g_adapt_m_table[marker_bidx];
      next_adapt_check = first_adapt_check;
      missing_ct = missing_cts[marker_idx];
      nanal = pheno_nm_ct - missing_ct;
      homcom_ct = homcom_cts[marker_idx];
      het_ct = het_cts[marker_idx];
      homrar_ct = nanal - het_ct - homcom_ct;
      if ((nanal < 3) || (homcom_ct == nanal) || (het_ct == nanal) || (homrar_ct == nanal)) {
        // the current code might otherwise report a spurious association if
        // geno_var is zero, so we explicitly check for it here.
        perm_adapt_stop[marker_idx] = 1;
        perm_attempt_ct[marker_idx] = 0;
        continue;
      }
      sval = orig_chiabs[marker_idx];
      // tstat = beta / vbeta_sqrt
      // tstat^2 = beta * beta / vbeta;
      //         = beta^2 * (nanal - 2) / ((qt_var / geno_var) - beta^2)
      // [stop here for max(T) since nanal varies across markers]
      // tstat^2 / (nanal - 2) = beta^2 / ((qt_var / geno_var) - beta^2)
      //                       = beta^2 * geno_var / (qt_var - beta^2 * geno_var)
      // Larger values of this last statistic monotonically result in smaller
      // P-values, so this is what we use for comparison (this saves a few
      // floating point operations at the end).
      sval = sval * sval / ((double)(((int32_t)nanal) - 2));
      stat_high = sval + EPSILON;
      stat_low = sval - EPSILON;
      // genotype first and second moments (fixed across permutations)
      geno_sum = 2 * homrar_ct + het_ct;
      geno_ssq = 4 * homrar_ct + het_ct;
      nanal_recip = 1.0 / ((double)((int32_t)nanal));
      nanal_m1_recip = 1.0 / ((double)(((int32_t)nanal) - 1));
      geno_mean = ((double)geno_sum) * nanal_recip;
      geno_var = (((double)geno_ssq) - geno_sum * geno_mean) * nanal_m1_recip;
      success_2start = perm_2success_ct[marker_idx];
      success_2incr = 0;
      next_cqg = 0;
      for (pidx = 0; pidx < perm_vec_ct;) {
        if (pidx == next_cqg) {
          // Extend the genotype-indexed accumulators in chunks, only as far
          // as the next adaptive check could need them (grown ~25% at a time
          // so an early stop wastes little work).
          next_cqg = next_adapt_check;
          ulii = pidx + pidx_offset;
          if (next_cqg < ulii + (ulii >> 2)) {
            // increase ~25% at a time
            next_cqg = ulii + (ulii >> 2);
          }
          next_cqg -= pidx_offset;
          next_cqg = round_up_pow2(next_cqg, CACHELINE_DBL);
          if (next_cqg > perm_vec_ct) {
            next_cqg = perm_vec_ct;
          }
          calc_qgit(pheno_nm_ct, perm_vec_ctcl8m, next_cqg - pidx, &(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecstd[pidx]), &(git_qt_g_prod[pidx]));
        }
        // phenotype moments restricted to nonmissing-genotype samples
        qt_sum = pheno_sum - git_qt_sum[pidx];
        qt_ssq = pheno_ssq - git_qt_ssq[pidx];
        qt_g_prod = git_qt_g_prod[pidx];
        qt_mean = qt_sum * nanal_recip;
        qt_var = (qt_ssq - qt_sum * qt_mean) * nanal_m1_recip;
        qt_g_covar = (qt_g_prod - qt_sum * geno_mean) * nanal_m1_recip;
        dxx = 1.0 / geno_var;
        beta = qt_g_covar * dxx;
        betasq = beta * beta;
        // monotone transform of tstat^2 derived above
        sval = betasq / (qt_var * dxx - betasq);
        if (sval > stat_high) {
          success_2incr += 2;
        } else if (sval > stat_low) {
          // within EPSILON of the original statistic: counts as a half-success
          success_2incr++;
        }
        if (++pidx == next_adapt_check - pidx_offset) {
          // Adaptive stopping check: normal-approximation CI for the
          // empirical p-value; stop if aperm_alpha lies outside it.
          uii = success_2start + success_2incr;
          if (uii) {
            sval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
            dxx = adaptive_ci_zt * sqrt(sval * (1 - sval) / ((int32_t)next_adapt_check));
            dyy = sval - dxx; // lower bound
            dzz = sval + dxx; // upper bound
            if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
              perm_adapt_stop[marker_idx] = 1;
              perm_attempt_ct[marker_idx] = next_adapt_check;
              // only the [0, next_cqg) prefix was written; zero just that
              fill_double_zero(next_cqg, git_qt_g_prod);
              fill_double_zero(next_cqg, git_qt_sum);
              fill_double_zero(next_cqg, git_qt_ssq);
              goto qassoc_adapt_thread_lesszero;
            }
          }
          next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
        }
      }
      // all permutations evaluated: reset the full accumulator buffers
      fill_double_zero(perm_vec_ctcl8m * 3, git_qt_g_prod);
    qassoc_adapt_thread_lesszero:
      perm_2success_ct[marker_idx] += success_2incr;
    }
  qassoc_adapt_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    // wait for the main thread to load the next variant block
    THREAD_BLOCK_FINISH(tidx);
  }
}
3030
THREAD_RET_TYPE qassoc_adapt_lin_thread(void* arg) {
  // Adaptive permutation worker for the Lin-statistic variant of the
  // quantitative-trait association test (original stats in g_orig_linsq).
  // Same adaptive-stopping structure as qassoc_adapt_thread(), but the test
  // statistic requires six genotype-indexed accumulators: phenotype sum and
  // sum-of-squares over het, hom-rare, and missing-genotype samples.
  uintptr_t tidx = (uintptr_t)arg;  // this worker's 0-based thread index
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  // permutations completed in earlier batches (converts batch-local pidx to
  // a global permutation count)
  uint32_t pidx_offset = g_perms_done - perm_vec_ct;
  uint32_t first_adapt_check = g_first_adapt_check;
  uint32_t max_thread_ct = g_assoc_thread_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  // Six contiguous per-thread accumulator arrays, filled by calc_qgit_lin().
  double* git_qt_het_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * tidx * 6]);
  double* git_qt_het_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 1)]);
  double* git_qt_homrar_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 2)]);
  double* git_qt_homrar_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 3)]);
  double* git_qt_missing_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 4)]);
  double* git_qt_missing_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 5)]);
  double* __restrict__ perm_vecstd = g_perm_vecstd;
  unsigned char* perm_adapt_stop = g_perm_adapt_stop;
  uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  double adaptive_intercept = g_adaptive_intercept;
  double adaptive_slope = g_adaptive_slope;
  double adaptive_ci_zt = g_adaptive_ci_zt;
  double aperm_alpha = g_aperm_alpha;
  double pheno_sum = g_pheno_sum;
  double pheno_ssq = g_pheno_ssq;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uintptr_t* __restrict__ loadbuf;
  double* __restrict__ orig_linsq;
  uintptr_t next_cqg;  // first permutation index not yet covered by calc_qgit_lin()
  uintptr_t marker_idx;
  uintptr_t pidx;
  uintptr_t ulii;
  intptr_t geno_sum;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t missing_ct;
  uint32_t het_ct;
  uint32_t homcom_ct;
  uint32_t homrar_ct;
  uint32_t nanal;  // number of samples analyzed (nonmissing genotype)
  uint32_t cur_thread_ct;
  double het_ctd;
  double homrar_ctd;
  double nanal_recip;
  double geno_mean;
  double geno_mean_sq;
  double geno_mean_coeff2;
  double geno_mean_coeff3;
  double qt_sum;
  double qt_ssq;
  double qt_het_sum;
  double qt_het_ssq;
  double qt_homrar_sum;
  double qt_homrar_ssq;
  double qt_g_prod;
  double qt_mean;
  double qt_g_prod_centered;
  double dxx;
  double dyy;
  double dzz;
  uint32_t next_adapt_check;
  uint32_t success_2start;
  uint32_t success_2incr;
  uint32_t uii;
  double stat_high;
  double stat_low;
  double sval;
  while (1) {
    // Use fewer threads when the block is small, so each active thread gets
    // at least CACHELINE_DBL variants.
    cur_thread_ct = g_block_diff / CACHELINE_DBL;
    if (cur_thread_ct > max_thread_ct) {
      cur_thread_ct = max_thread_ct;
    } else if (!cur_thread_ct) {
      cur_thread_ct = 1;
    }
    if (cur_thread_ct <= tidx) {
      // surplus thread for this block; go straight to synchronization
      goto qassoc_adapt_lin_thread_skip_all;
    }
    marker_bidx = g_qblock_start + (((uint64_t)tidx) * g_block_diff) / cur_thread_ct;
    marker_bceil = g_qblock_start + (((uint64_t)tidx + 1) * g_block_diff) / cur_thread_ct;
    loadbuf = g_loadbuf;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    orig_linsq = g_orig_linsq;
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      marker_idx = g_adapt_m_table[marker_bidx];
      next_adapt_check = first_adapt_check;
      missing_ct = missing_cts[marker_idx];
      nanal = pheno_nm_ct - missing_ct;
      homcom_ct = homcom_cts[marker_idx];
      het_ct = het_cts[marker_idx];
      if ((nanal < 3) || (homcom_ct == nanal) || (het_ct == nanal)) {
        // degenerate variant: stop immediately, record zero attempts
        perm_adapt_stop[marker_idx] = 1;
        perm_attempt_ct[marker_idx] = 0;
        continue;
      }
      homrar_ct = nanal - het_ct - homcom_ct;
      sval = orig_linsq[marker_idx];
      stat_high = sval + EPSILON;
      stat_low = sval - EPSILON;
      geno_sum = 2 * homrar_ct + het_ct;
      nanal_recip = 1.0 / ((double)((int32_t)nanal));
      het_ctd = het_ct;
      homrar_ctd = homrar_ct;
      // genotype-dependent coefficients, fixed across permutations
      geno_mean = ((double)geno_sum) * nanal_recip;
      geno_mean_sq = geno_mean * geno_mean;
      geno_mean_coeff2 = 1 - 2 * geno_mean;
      geno_mean_coeff3 = 4 - 4 * geno_mean;
      success_2start = perm_2success_ct[marker_idx];
      success_2incr = 0;
      next_cqg = 0;
      for (pidx = 0; pidx < perm_vec_ct;) {
        if (pidx == next_cqg) {
          // Extend all six accumulators in chunks, only as far as the next
          // adaptive check could need them.
          next_cqg = next_adapt_check;
          ulii = pidx + pidx_offset;
          if (next_cqg < ulii + (ulii >> 2)) {
            // increase ~25% at a time
            next_cqg = ulii + (ulii >> 2);
          }
          next_cqg -= pidx_offset;
          next_cqg = round_up_pow2(next_cqg, CACHELINE_DBL);
          if (next_cqg > perm_vec_ct) {
            next_cqg = perm_vec_ct;
          }
          calc_qgit_lin(pheno_nm_ct, perm_vec_ctcl8m, next_cqg - pidx, &(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecstd[pidx]), &(git_qt_het_sum[pidx]));
        }
        // phenotype moments restricted to nonmissing-genotype samples
        qt_sum = pheno_sum - git_qt_missing_sum[pidx];
        qt_ssq = pheno_ssq - git_qt_missing_ssq[pidx];
        qt_het_sum = git_qt_het_sum[pidx];
        qt_het_ssq = git_qt_het_ssq[pidx];
        qt_homrar_sum = git_qt_homrar_sum[pidx];
        qt_homrar_ssq = git_qt_homrar_ssq[pidx];
        qt_g_prod = qt_het_sum + 2 * qt_homrar_sum;
        qt_mean = qt_sum * nanal_recip;
        qt_g_prod_centered = qt_g_prod - qt_sum * geno_mean;
        // Lin statistic: squared score divided by its robust variance
        // estimate, expanded in terms of the six accumulators.
        sval = qt_g_prod_centered * qt_g_prod_centered / (geno_mean_sq * (qt_ssq + (qt_mean - 2) * qt_sum) + geno_mean_coeff2 * (qt_het_ssq + qt_mean * (qt_mean * het_ctd - 2 * qt_het_sum)) + geno_mean_coeff3 * (qt_homrar_ssq + qt_mean * (qt_mean * homrar_ctd - 2 * qt_homrar_sum)));
        if (sval > stat_high) {
          success_2incr += 2;
        } else if (sval > stat_low) {
          // within EPSILON of the original statistic: half-success
          success_2incr++;
        }
        if (++pidx == next_adapt_check - pidx_offset) {
          // Adaptive stopping check: normal-approximation CI for the
          // empirical p-value; stop if aperm_alpha lies outside it.
          uii = success_2start + success_2incr;
          if (uii) {
            sval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
            dxx = adaptive_ci_zt * sqrt(sval * (1 - sval) / ((int32_t)next_adapt_check));
            dyy = sval - dxx;  // CI lower bound
            dzz = sval + dxx;  // CI upper bound
            if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
              perm_adapt_stop[marker_idx] = 1;
              perm_attempt_ct[marker_idx] = next_adapt_check;
              // only the [0, next_cqg) prefix was written; zero just that
              fill_double_zero(next_cqg, git_qt_het_sum);
              fill_double_zero(next_cqg, git_qt_het_ssq);
              fill_double_zero(next_cqg, git_qt_homrar_sum);
              fill_double_zero(next_cqg, git_qt_homrar_ssq);
              fill_double_zero(next_cqg, git_qt_missing_sum);
              fill_double_zero(next_cqg, git_qt_missing_ssq);
              goto qassoc_adapt_lin_thread_lesszero;
            }
          }
          next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
        }
      }
      // all permutations evaluated: reset the full accumulator buffers
      fill_double_zero(perm_vec_ctcl8m * 6, git_qt_het_sum);
    qassoc_adapt_lin_thread_lesszero:
      perm_2success_ct[marker_idx] += success_2incr;
    }
  qassoc_adapt_lin_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    // wait for the main thread to load the next variant block
    THREAD_BLOCK_FINISH(tidx);
  }
}
3207
THREAD_RET_TYPE qassoc_maxt_thread(void* arg) {
  // max(T) permutation worker for quantitative-trait association.  Each
  // thread maintains its own copy of the per-permutation extreme statistics
  // in results[] (seeded from g_maxt_extreme_stat; presumably merged by the
  // caller afterwards).  To reduce genotype-indexing work, a variant may
  // reuse the accumulators of a previously processed "LD reference" variant
  // with similar genotypes, applying only the delta via calc_qrem().
  uintptr_t tidx = (uintptr_t)arg;  // this worker's 0-based thread index
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  uint32_t max_thread_ct = g_assoc_thread_ct;
  // this thread's private running-maximum array, one entry per permutation
  double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
  double* __restrict__ perm_vecstd = g_perm_vecstd;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  double* msa_ptr = nullptr;  // output cursor into g_mperm_save_all
  double pheno_sum = g_pheno_sum;
  double pheno_ssq = g_pheno_ssq;
  // genotype-indexed phenotype accumulators, stored per-variant in
  // qresultbuf so later variants can reuse them via LD exploitation
  double* git_qt_g_prod;
  double* git_qt_sum;
  double* git_qt_ssq;
  double* qresultbuf;
  double* __restrict__ orig_chiabs;
  uintptr_t* loadbuf;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uint16_t* ldrefs;  // per-variant LD reference indices; 65535 = unset
  uintptr_t* loadbuf_cur;
  uintptr_t pidx;
  uintptr_t marker_idx;
  uint32_t qblock_start;
  uint32_t maxt_block_base;
  uint32_t maxt_block_base2;
  uint32_t marker_bidx_start;
  uint32_t maxt_block_base3;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t marker_bidx2;
  uint32_t missing_ct;
  uint32_t het_ct;
  uint32_t homcom_ct;
  uint32_t homrar_ct;
  intptr_t geno_sum;
  intptr_t geno_ssq;
  uint32_t nanal;  // number of samples analyzed (nonmissing genotype)
  double nanal_recip;
  double nanal_m1_recip;
  double geno_mean;
  double geno_var;
  double qt_sum;
  double qt_ssq;
  double qt_g_prod;
  double qt_mean;
  double qt_var;
  double qt_g_covar;
  double nanal_m2d;
  double beta;
  double betasq;
  double dxx;
  uint32_t success_2incr;
  double stat_high;
  double stat_low;
  double sval;
  uintptr_t best_cost;
  uint32_t cur_thread_ct;
  uint32_t marker_idx_tmp;
  int32_t missing_ct_tmp;
  int32_t het_ct_tmp;
  int32_t homcom_ct_tmp;
  int32_t homrar_ct_tmp;
  uint32_t loop_ceil;
  uintptr_t cur_cost;
  uint32_t ldref;
  while (1) {
    // Use fewer threads when the block is small, so each active thread gets
    // at least CACHELINE_DBL variants.
    cur_thread_ct = g_block_diff / CACHELINE_DBL;
    if (cur_thread_ct > max_thread_ct) {
      cur_thread_ct = max_thread_ct;
    } else if (!cur_thread_ct) {
      cur_thread_ct = 1;
    }
    if (cur_thread_ct <= tidx) {
      // surplus thread for this block; go straight to synchronization
      goto qassoc_maxt_thread_skip_all;
    }
    qblock_start = g_qblock_start;
    maxt_block_base = g_maxt_block_base;
    maxt_block_base2 = maxt_block_base + qblock_start;
    marker_bidx_start = qblock_start + (((uint64_t)tidx) * g_block_diff) / cur_thread_ct;
    maxt_block_base3 = maxt_block_base + marker_bidx_start;
    marker_bidx = marker_bidx_start;
    marker_idx = maxt_block_base3;
    marker_bceil = qblock_start + (((uint64_t)tidx + 1) * g_block_diff) / cur_thread_ct;
    // seed this thread's running maxima with the current global extremes
    memcpy(results, &(g_maxt_extreme_stat[g_perms_done - perm_vec_ct]), perm_vec_ct * sizeof(double));
    if (g_mperm_save_all) {
      msa_ptr = &(g_mperm_save_all[marker_idx * perm_vec_ct]);
    }
    loadbuf = g_loadbuf;
    qresultbuf = g_qresultbuf;
    orig_chiabs = g_orig_chisq;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    ldrefs = g_ldrefs;
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      missing_ct = missing_cts[marker_idx];
      nanal = pheno_nm_ct - missing_ct;
      homcom_ct = homcom_cts[marker_idx];
      het_ct = het_cts[marker_idx];
      if ((nanal < 3) || (homcom_ct == nanal) || (het_ct == nanal)) {
        // degenerate variant: count every permutation as a success and emit
        // -9 sentinels to the save-all buffer
        perm_2success_ct[marker_idx++] += perm_vec_ct;
        if (msa_ptr) {
          for (pidx = 0; pidx < perm_vec_ct; pidx++) {
            *msa_ptr++ = -9;
          }
        }
        continue;
      }
      homrar_ct = nanal - het_ct - homcom_ct;
      sval = orig_chiabs[marker_idx];
      // square the original |t| so it is comparable to the permuted t^2
      // stats computed below
      sval = sval * sval;
      stat_high = sval + EPSILON;
      stat_low = sval - EPSILON;
      // genotype first and second moments (fixed across permutations)
      geno_sum = 2 * homrar_ct + het_ct;
      geno_ssq = 4 * homrar_ct + het_ct;
      nanal_recip = 1.0 / ((double)((int32_t)nanal));
      nanal_m1_recip = 1.0 / ((double)(((int32_t)nanal) - 1));
      nanal_m2d = nanal - 2;
      geno_mean = ((double)geno_sum) * nanal_recip;
      geno_var = (((double)geno_ssq) - geno_sum * geno_mean) * nanal_m1_recip;
      success_2incr = 0;
      git_qt_g_prod = &(qresultbuf[3 * marker_bidx * perm_vec_ctcl8m]);
      git_qt_sum = &(git_qt_g_prod[perm_vec_ctcl8m]);
      git_qt_ssq = &(git_qt_g_prod[2 * perm_vec_ctcl8m]);
      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
      ldref = ldrefs[marker_idx];
      if (ldref == 65535) {
        // No LD reference chosen yet: scan already-processed variants for
        // the one whose genotypes differ least from this variant's.
        //
        // Addition loops required for genotype indexing:
        //   het_ct + homrar_ct + 2 * missing_ct
        //
        // Addition/initial copy loops required for LD exploitation:
        //   3 + 3 * (missing <-> homrar/het) + 2 * (missing <-> homcom) +
        //   (homrar <-> het/homcom) + (het <-> homcom)
        // Simple lower bound (may allow us to skip full LD cost calculation):
        //   (delta(homrar) + 2*delta(missing) + delta(het) + delta(homcom)) / 2
        best_cost = het_ct + homrar_ct + 2 * missing_ct;
        ldref = marker_bidx;
        marker_idx_tmp = maxt_block_base;
        loop_ceil = maxt_block_base2;
        do {
          // two scan segments: variants before this thread's range, then
          // this thread's own already-processed variants
          if (marker_idx_tmp == maxt_block_base2) {
            marker_idx_tmp = maxt_block_base3;
            loop_ceil = marker_idx;
          }
          for (; marker_idx_tmp < loop_ceil; marker_idx_tmp++) {
            if (ldrefs[marker_idx_tmp] != 65535) {
              missing_ct_tmp = missing_cts[marker_idx_tmp];
              homcom_ct_tmp = homcom_cts[marker_idx_tmp];
              het_ct_tmp = het_cts[marker_idx_tmp];
              homrar_ct_tmp = pheno_nm_ct - missing_ct_tmp - het_ct_tmp - homcom_ct_tmp;
              // cheap lower bound on the true rem-cost
              cur_cost = labs(((int32_t)missing_ct) - missing_ct_tmp) + (labs(((int32_t)homrar_ct) - homrar_ct_tmp) + labs(((int32_t)het_ct) - het_ct_tmp) + labs(((int32_t)homcom_ct) - homcom_ct_tmp) + 7) / 2;
              if (cur_cost < best_cost) {
                // lower bound beats current best; compute the exact cost
                marker_bidx2 = marker_idx_tmp - maxt_block_base;
                cur_cost = qrem_cost2(pheno_nm_ctv2, &(loadbuf[marker_bidx2 * pheno_nm_ctv2]), loadbuf_cur);
                if (cur_cost < best_cost) {
                  ldref = marker_bidx2;
                  best_cost = cur_cost;
                }
              }
            }
          }
        } while (marker_idx_tmp < marker_idx);
        ldrefs[marker_idx] = ldref;
      }
      if (ldref == marker_bidx) {
        // no cheaper reference found: compute accumulators from scratch
        fill_double_zero(perm_vec_ctcl8m * 3, git_qt_g_prod);
        calc_qgit(pheno_nm_ct, perm_vec_ctcl8m, perm_vec_ct, loadbuf_cur, perm_vecstd, git_qt_g_prod);
      } else {
        // copy the reference variant's accumulators and patch in only the
        // genotype differences
        memcpy(git_qt_g_prod, &(qresultbuf[3 * ldref * perm_vec_ctcl8m]), 3 * perm_vec_ctcl8m * sizeof(double));
        calc_qrem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecstd, git_qt_g_prod);
      }
      for (pidx = 0; pidx < perm_vec_ct; pidx++) {
        // phenotype moments restricted to nonmissing-genotype samples
        qt_sum = pheno_sum - git_qt_sum[pidx];
        qt_ssq = pheno_ssq - git_qt_ssq[pidx];
        qt_g_prod = git_qt_g_prod[pidx];
        qt_mean = qt_sum * nanal_recip;
        qt_var = (qt_ssq - qt_sum * qt_mean) * nanal_m1_recip;
        qt_g_covar = (qt_g_prod - qt_sum * geno_mean) * nanal_m1_recip;
        dxx = 1.0 / geno_var;
        beta = qt_g_covar * dxx;
        betasq = beta * beta;
        // t^2 for this permutation (see derivation in qassoc_adapt_thread())
        sval = betasq * nanal_m2d / (qt_var * dxx - betasq);
        if (sval > stat_high) {
          success_2incr += 2;
        } else if (sval > stat_low) {
          // within EPSILON of the original statistic: half-success
          success_2incr++;
        }
        if (results[pidx] < sval) {
          // update this permutation's running maximum
          results[pidx] = sval;
        }
        if (msa_ptr) {
          *msa_ptr++ = sval;
        }
      }
      perm_2success_ct[marker_idx++] += success_2incr;
    }
  qassoc_maxt_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    // wait for the main thread to load the next variant block
    THREAD_BLOCK_FINISH(tidx);
  }
}
3415
THREAD_RET_TYPE qassoc_maxt_lin_thread(void* arg) {
  // max(T) permutation worker for the quantitative-trait Lin statistic.
  // For each marker in this thread's slice of the current block, evaluates
  // the statistic for every permuted phenotype, tracks the per-permutation
  // maximum in results[], and adds 2 to perm_2success_ct[] per permutation
  // beating the original statistic (1 per tie, within an EPSILON band).
  // Loops until the master thread signals the last block; synchronization is
  // via THREAD_BLOCK_FINISH()/g_is_last_thread_block.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  // perm_vec_ct rounded up so each permutation track starts cacheline-aligned
  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  uint32_t max_thread_ct = g_assoc_thread_ct;
  // this thread's private slice of the per-permutation running maxima
  double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
  double* __restrict__ perm_vecstd = g_perm_vecstd;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  double* msa_ptr = nullptr;
  double pheno_sum = g_pheno_sum;
  double pheno_ssq = g_pheno_ssq;
  // six consecutive tracks per marker in qresultbuf (see assignments below):
  // het sum/ssq, homrar sum/ssq, missing sum/ssq of permuted phenotypes
  double* git_qt_het_sum;
  double* git_qt_het_ssq;
  double* git_qt_homrar_sum;
  double* git_qt_homrar_ssq;
  double* git_qt_missing_sum;
  double* git_qt_missing_ssq;
  uintptr_t* loadbuf;
  double* qresultbuf;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uint16_t* ldrefs;
  double* __restrict__ orig_linsq;
  uintptr_t* loadbuf_cur;
  uintptr_t pidx;
  uintptr_t marker_idx;
  uint32_t qblock_start;
  uint32_t maxt_block_base;
  uint32_t maxt_block_base2;
  uint32_t marker_bidx_start;
  uint32_t maxt_block_base3;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t missing_ct;
  uint32_t het_ct;
  uint32_t homcom_ct;
  uint32_t homrar_ct;
  intptr_t geno_sum;
  uint32_t nanal;
  uint32_t success_2incr;
  uint32_t cur_thread_ct;
  double het_ctd;
  double homrar_ctd;
  double nanal_recip;
  double geno_mean;
  double geno_mean_sq;
  double geno_mean_coeff2;
  double geno_mean_coeff3;
  double qt_sum;
  double qt_ssq;
  double qt_het_sum;
  double qt_het_ssq;
  double qt_homrar_sum;
  double qt_homrar_ssq;
  double qt_g_prod;
  double qt_mean;
  double qt_g_prod_centered;
  double stat_high;
  double stat_low;
  double sval;
  uint32_t ldref;
  while (1) {
    // don't spread a small block across more threads than there is work for
    cur_thread_ct = g_block_diff / CACHELINE_DBL;
    if (cur_thread_ct > max_thread_ct) {
      cur_thread_ct = max_thread_ct;
    } else if (!cur_thread_ct) {
      cur_thread_ct = 1;
    }
    if (cur_thread_ct <= tidx) {
      // no work for this thread in the current block; still must hit the
      // barrier at the bottom of the loop
      goto qassoc_maxt_lin_thread_skip_all;
    }
    qblock_start = g_qblock_start;
    maxt_block_base = g_maxt_block_base;
    maxt_block_base2 = maxt_block_base + qblock_start;
    // split [qblock_start, qblock_start + g_block_diff) evenly across threads
    marker_bidx_start = qblock_start + (((uint64_t)tidx) * g_block_diff) / cur_thread_ct;
    maxt_block_base3 = maxt_block_base + marker_bidx_start;
    marker_bidx = marker_bidx_start;
    marker_idx = maxt_block_base3;
    marker_bceil = qblock_start + (((uint64_t)tidx + 1) * g_block_diff) / cur_thread_ct;
    // seed local maxima with the extreme stats carried over from prior blocks
    memcpy(results, &(g_maxt_extreme_stat[g_perms_done - perm_vec_ct]), perm_vec_ct * sizeof(double));
    if (g_mperm_save_all) {
      msa_ptr = &(g_mperm_save_all[marker_idx * perm_vec_ct]);
    }
    loadbuf = g_loadbuf;
    qresultbuf = g_qresultbuf;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    ldrefs = g_ldrefs;
    orig_linsq = g_orig_linsq;

    for (; marker_bidx < marker_bceil; marker_bidx++) {
      missing_ct = missing_cts[marker_idx];
      nanal = pheno_nm_ct - missing_ct;
      homcom_ct = homcom_cts[marker_idx];
      het_ct = het_cts[marker_idx];
      if ((nanal < 3) || (homcom_ct == nanal) || (het_ct == nanal)) {
        // degenerate marker (monomorphic or too few observations): count
        // every permutation as a "success" and record -9 sentinels
        perm_2success_ct[marker_idx++] += perm_vec_ct;
        if (msa_ptr) {
          for (pidx = 0; pidx < perm_vec_ct; pidx++) {
            *msa_ptr++ = -9;
          }
        }
        continue;
      }
      homrar_ct = nanal - het_ct - homcom_ct;
      sval = orig_linsq[marker_idx];
      // EPSILON band so float roundoff in recomputed stats counts as a tie
      stat_high = sval + EPSILON;
      stat_low = sval - EPSILON;
      geno_sum = 2 * homrar_ct + het_ct;
      nanal_recip = 1.0 / ((double)((int32_t)nanal));
      het_ctd = het_ct;
      homrar_ctd = homrar_ct;
      geno_mean = ((double)geno_sum) * nanal_recip;
      geno_mean_sq = geno_mean * geno_mean;
      geno_mean_coeff2 = 1 - 2 * geno_mean;
      geno_mean_coeff3 = 4 - 4 * geno_mean;
      success_2incr = 0;
      git_qt_het_sum = &(qresultbuf[6 * marker_bidx * perm_vec_ctcl8m]);
      git_qt_het_ssq = &(git_qt_het_sum[perm_vec_ctcl8m]);
      git_qt_homrar_sum = &(git_qt_het_sum[2 * perm_vec_ctcl8m]);
      git_qt_homrar_ssq = &(git_qt_het_sum[3 * perm_vec_ctcl8m]);
      git_qt_missing_sum = &(git_qt_het_sum[4 * perm_vec_ctcl8m]);
      git_qt_missing_ssq = &(git_qt_het_sum[5 * perm_vec_ctcl8m]);
      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
      ldref = ldrefs[marker_idx];
      if (ldref == 65535) {
        // 65535 = "no LD reference chosen yet" sentinel; pick one now.
        // 2x addition loops required for genotype indexing:
        //   het_ct + homrar_ct + missing_ct
        //
        // 2x addition/initial copy loops required for LD exploitation:
        //   3 + 2 * (<-> neither side homcom) + (<-> homcom)
        // Simple lower bound (may allow us to skip full LD cost calculation):
        //   3 + delta(homcom) if delta(homcom) >= sum of other deltas
        //   3 + delta(non-homcom) otherwise
        ldref = marker_bidx;
        if (pheno_nm_ct - homcom_ct > 3) {
          check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 3, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
        }
        ldrefs[marker_idx] = ldref;
      }
      if (ldref == marker_bidx) {
        // no cheaper LD reference: compute the six tracks from scratch
        fill_double_zero(perm_vec_ctcl8m * 6, git_qt_het_sum);
        calc_qgit_lin(pheno_nm_ct, perm_vec_ctcl8m, perm_vec_ct, loadbuf_cur, perm_vecstd, git_qt_het_sum);
      } else {
        // start from the reference marker's tracks and patch the differences
        memcpy(git_qt_het_sum, &(qresultbuf[6 * ldref * perm_vec_ctcl8m]), 6 * perm_vec_ctcl8m * sizeof(double));
        calc_qrem_lin(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecstd, git_qt_het_sum);
      }
      for (pidx = 0; pidx < perm_vec_ct; pidx++) {
        // phenotype sums over nonmissing samples = total minus missing track
        qt_sum = pheno_sum - git_qt_missing_sum[pidx];
        qt_ssq = pheno_ssq - git_qt_missing_ssq[pidx];
        qt_het_sum = git_qt_het_sum[pidx];
        qt_het_ssq = git_qt_het_ssq[pidx];
        qt_homrar_sum = git_qt_homrar_sum[pidx];
        qt_homrar_ssq = git_qt_homrar_ssq[pidx];
        qt_g_prod = qt_het_sum + 2 * qt_homrar_sum;
        qt_mean = qt_sum * nanal_recip;
        qt_g_prod_centered = qt_g_prod - qt_sum * geno_mean;
        sval = qt_g_prod_centered * qt_g_prod_centered / (geno_mean_sq * (qt_ssq + (qt_mean - 2) * qt_sum) + geno_mean_coeff2 * (qt_het_ssq + qt_mean * (qt_mean * het_ctd - 2 * qt_het_sum)) + geno_mean_coeff3 * (qt_homrar_ssq + qt_mean * (qt_mean * homrar_ctd - 2 * qt_homrar_sum)));
        if (sval > stat_high) {
          success_2incr += 2;
        } else if (sval > stat_low) {
          success_2incr++;
        }
        if (results[pidx] < sval) {
          results[pidx] = sval;
        }
        if (msa_ptr) {
          *msa_ptr++ = sval;
        }
      }
      perm_2success_ct[marker_idx++] += success_2incr;
    }
  qassoc_maxt_lin_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    THREAD_BLOCK_FINISH(tidx);
  }
}
3599
THREAD_RET_TYPE qassoc_set_thread(void* arg) {
  // Simplified version of qassoc_adapt/maxt_thread(), except we need to save
  // actual t-statistics.
  // For each marker in this thread's slice of the current block, writes
  // |t| = |beta / sqrt(var(beta))| for every permuted phenotype into
  // g_mperm_save_all (indexed by block position, not global marker index).
  // No early termination and no max(T) tracking here; downstream set-based
  // code consumes the full matrix.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uint32_t max_thread_ct = g_assoc_thread_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  // three cacheline-aligned per-thread scratch tracks: genotype*pheno
  // products, phenotype sums, phenotype sums-of-squares (over missing calls)
  double* git_qt_g_prod = &(g_thread_git_qbufs[perm_vec_ctcl8m * tidx * 3]);
  double* git_qt_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 1)]);
  double* git_qt_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 2)]);
  double* __restrict__ perm_vecstd = g_perm_vecstd;
  double pheno_sum = g_pheno_sum;
  double pheno_ssq = g_pheno_ssq;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uintptr_t* __restrict__ loadbuf;
  uintptr_t marker_idx;
  uintptr_t pidx;
  intptr_t geno_sum;
  intptr_t geno_ssq;
  double* msa_ptr;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t missing_ct;
  uint32_t het_ct;
  uint32_t homcom_ct;
  uint32_t homrar_ct;
  uint32_t nanal;
  uint32_t cur_thread_ct;
  double nanal_recip;
  double nanal_m1_recip;
  double nanal_m2_recip;
  double geno_mean;
  double geno_var_recip;
  double qt_sum;
  double qt_ssq;
  double qt_g_prod;
  double qt_mean;
  double qt_var;
  double qt_g_covar;
  double beta;
  double vbeta_sqrt;
  while (1) {
    // cap the worker count by the amount of work in this block
    cur_thread_ct = g_block_diff / CACHELINE_DBL;
    if (cur_thread_ct > max_thread_ct) {
      cur_thread_ct = max_thread_ct;
    } else if (!cur_thread_ct) {
      cur_thread_ct = 1;
    }
    if (cur_thread_ct <= tidx) {
      // nothing for this thread; still rendezvous at the barrier below
      goto qassoc_set_thread_skip_all;
    }
    marker_bidx = (((uint64_t)tidx) * g_block_diff) / cur_thread_ct;
    marker_bceil = (((uint64_t)tidx + 1) * g_block_diff) / cur_thread_ct;
    loadbuf = g_loadbuf;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      marker_idx = g_adapt_m_table[marker_bidx];
      msa_ptr = &(g_mperm_save_all[marker_bidx * perm_vec_ct]);
      missing_ct = missing_cts[marker_idx];
      nanal = pheno_nm_ct - missing_ct;
      homcom_ct = homcom_cts[marker_idx];
      het_ct = het_cts[marker_idx];
      homrar_ct = nanal - het_ct - homcom_ct;
      // genotype coded 0/1/2 (A1 dosage): sum = 2*homrar + het,
      // ssq = 4*homrar + het
      geno_sum = 2 * homrar_ct + het_ct;
      geno_ssq = 4 * homrar_ct + het_ct;
      nanal_recip = 1.0 / ((double)((int32_t)nanal));
      nanal_m1_recip = 1.0 / ((double)(((int32_t)nanal) - 1));
      nanal_m2_recip = 1.0 / ((double)(((int32_t)nanal) - 2));
      geno_mean = ((double)geno_sum) * nanal_recip;
      geno_var_recip = 1.0 / ((((double)geno_ssq) - geno_sum * geno_mean) * nanal_m1_recip);
      // accumulate per-permutation missing-sample pheno sums/ssqs and
      // genotype-phenotype products into the three git_ tracks
      calc_qgit(pheno_nm_ct, perm_vec_ctcl8m, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecstd, git_qt_g_prod);
      for (pidx = 0; pidx < perm_vec_ct; pidx++) {
        qt_sum = pheno_sum - git_qt_sum[pidx];
        qt_ssq = pheno_ssq - git_qt_ssq[pidx];
        qt_g_prod = git_qt_g_prod[pidx];
        qt_mean = qt_sum * nanal_recip;
        qt_var = (qt_ssq - qt_sum * qt_mean) * nanal_m1_recip;
        qt_g_covar = (qt_g_prod - qt_sum * geno_mean) * nanal_m1_recip;
        beta = qt_g_covar * geno_var_recip;
        vbeta_sqrt = sqrt((qt_var * geno_var_recip - beta * beta) * nanal_m2_recip);
        *msa_ptr++ = fabs(beta / vbeta_sqrt);
      }
      // calc_qgit() accumulates, so scratch must be rezeroed per marker
      fill_double_zero(perm_vec_ctcl8m * 3, git_qt_g_prod);
    }
  qassoc_set_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    THREAD_BLOCK_FINISH(tidx);
  }
}
3697
THREAD_RET_TYPE model_adapt_domrec_thread(void* arg) {
  // Adaptive permutation worker for case/control dominant/recessive model
  // tests (g_is_model_prec selects recessive).  For each marker, runs
  // permutations until either all are done or the adaptive confidence
  // interval around the empirical p-value excludes aperm_alpha, in which
  // case perm_adapt_stop[] is set and the marker is abandoned early.
  uintptr_t tidx = (uintptr_t)arg;
  uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uint32_t assoc_thread_ct = g_assoc_thread_ct;
  uint32_t pidx_offset = g_perms_done - perm_vec_ct;
  uint32_t model_fisher = g_model_fisher;
  uint32_t fisher_midp = g_fisher_midp;
  uint32_t precomp_width = g_precomp_width;
  uint32_t first_adapt_check = g_first_adapt_check;
  uint32_t case_ct = g_perm_case_ct;
  int32_t is_model_prec = g_is_model_prec;
  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
  uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
  double adaptive_intercept = g_adaptive_intercept;
  double adaptive_slope = g_adaptive_slope;
  double adaptive_ci_zt = g_adaptive_ci_zt;
  double aperm_alpha = g_aperm_alpha;
  uintptr_t* __restrict__ loadbuf;
  double* __restrict__ orig_pvals;
  double* __restrict__ orig_chisq;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uint32_t* __restrict__ precomp_start;
  uint32_t* __restrict__ precomp_ui;
  uint32_t* gpui;
  uintptr_t marker_idx;
  uintptr_t pidx;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t success_2start;
  uint32_t success_2incr;
  uint32_t next_adapt_check;
  intptr_t col1_sum;
  intptr_t col2_sum;
  intptr_t tot_obs;
  uint32_t missing_start;
  uint32_t case_homx_ct;
  uint32_t case_missing_ct;
  uint32_t uii;
  double stat_high;
  double stat_low;
  double pval;
  double dxx;
  double dyy;
  double dzz;
  while (1) {
    if (g_block_diff <= assoc_thread_ct) {
      // fewer markers than threads: one marker per participating thread
      if (g_block_diff <= tidx) {
        goto model_adapt_domrec_thread_skip_all;
      }
      marker_bidx = g_block_start + tidx;
      marker_bceil = marker_bidx + 1;
    } else {
      marker_bidx = g_block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
      marker_bceil = g_block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
    }
    loadbuf = g_loadbuf;
    orig_pvals = g_orig_pvals;
    orig_chisq = g_orig_chisq;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    precomp_start = g_precomp_start;
    precomp_ui = g_precomp_ui;
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      marker_idx = g_adapt_m_table[marker_bidx];
      if (model_fisher) {
        // -9 marks a marker the original test could not be computed for
        if (orig_pvals[marker_idx] == -9) {
          perm_adapt_stop[marker_idx] = 1;
          perm_attempt_ct[marker_idx] = 0;
          continue;
        }
        // relative EPSILON band: Fisher p-values are compared low-is-extreme
        stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
        stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
      } else {
        if (orig_chisq[marker_idx] == -9) {
          perm_adapt_stop[marker_idx] = 1;
          perm_attempt_ct[marker_idx] = 0;
          continue;
        }
        stat_high = orig_chisq[marker_idx] + EPSILON;
        stat_low = orig_chisq[marker_idx] - EPSILON;
      }
      next_adapt_check = first_adapt_check;
      tot_obs = pheno_nm_ct - missing_cts[marker_idx];
      if (is_model_prec) {
        // recessive: col1 = homozygous-rare, col2 = carrier of common allele
        col2_sum = homcom_cts[marker_idx] + het_cts[marker_idx];
        col1_sum = tot_obs - col2_sum;
      } else {
        // dominant: col1 = homozygous-common, col2 = carrier of rare allele
        col1_sum = homcom_cts[marker_idx];
        col2_sum = tot_obs - col1_sum;
      }
      missing_start = precomp_start[marker_bidx];
      // 4 precomputed thresholds per case-missing count (see usage below)
      gpui = &(precomp_ui[4 * precomp_width * marker_bidx]);
      success_2start = perm_2success_ct[marker_idx];
      success_2incr = 0;
      for (pidx = 0; pidx < perm_vec_ct;) {
        genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &uii, &case_homx_ct);
        if (is_model_prec) {
          case_homx_ct = case_ct - case_homx_ct - case_missing_ct - uii;
        }
        // deliberate underflow
        uii = (uint32_t)(case_missing_ct - missing_start);
        if (uii < precomp_width) {
          // fast path: compare against precomputed success/tie thresholds
          // instead of evaluating the full test statistic
          if (case_homx_ct < gpui[4 * uii]) {
            if (case_homx_ct < gpui[4 * uii + 2]) {
              success_2incr += 2;
            } else {
              success_2incr++;
            }
          } else {
            if (case_homx_ct >= gpui[4 * uii + 1]) {
              if (case_homx_ct >= gpui[4 * uii + 3]) {
                success_2incr += 2;
              } else {
                success_2incr++;
              }
            }
          }
        } else {
          // slow path: evaluate the 2x2 table test directly
          uii = case_ct - case_missing_ct;
          if (model_fisher) {
            dxx = fisher22(case_homx_ct, uii - case_homx_ct, col1_sum - case_homx_ct, col2_sum + case_homx_ct - uii, fisher_midp);
            if (dxx < stat_low) {
              success_2incr += 2;
            } else if (dxx <= stat_high) {
              success_2incr++;
            }
          } else {
            dxx = chi22_eval(case_homx_ct, uii, col1_sum, tot_obs);
            if (dxx > stat_high) {
              success_2incr += 2;
            } else if (dxx > stat_low) {
              success_2incr++;
            }
          }
        }
        if (++pidx == next_adapt_check - pidx_offset) {
          // adaptive check: stop when the CI around the empirical p-value
          // clearly excludes aperm_alpha
          uii = success_2start + success_2incr;
          if (uii) {
            pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
            dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
            dyy = pval - dxx; // lower bound
            dzz = pval + dxx; // upper bound
            if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
              perm_adapt_stop[marker_idx] = 1;
              perm_attempt_ct[marker_idx] = next_adapt_check;
              break;
            }
          }
          next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
        }
      }
      perm_2success_ct[marker_idx] += success_2incr;
    }
  model_adapt_domrec_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    THREAD_BLOCK_FINISH(tidx);
  }
}
3865
THREAD_RET_TYPE model_maxt_domrec_thread(void* arg) {
  // max(T) permutation worker for case/control dominant/recessive model
  // tests.  Uses genotype-indexing (calc_git) with an LD-reference cache
  // (ldrefs/calc_rem) to get per-permutation case counts cheaply, then
  // updates both perm_2success_ct[] and the per-permutation extreme
  // statistic in results[] (min for Fisher p-values, max for chi-square).
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uint32_t assoc_thread_ct = g_assoc_thread_ct;
  uint32_t pidx_offset = g_perms_done - perm_vec_ct;
  uint32_t model_fisher = g_model_fisher;
  uint32_t fisher_midp = g_fisher_midp;
  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
  // per-thread scratch for calc_git()/calc_rem(); sized in vector units
  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
  uint32_t* git_homrar_cts = nullptr;
  uint32_t* git_missing_cts = nullptr;
  uint32_t* git_het_cts = nullptr;
  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
  uint32_t precomp_width = g_precomp_width;
  uint32_t case_ct = g_perm_case_ct;
  int32_t is_model_prec = g_is_model_prec;
  uint32_t* __restrict__ perm_vecst = g_perm_vecst;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  double* __restrict__ mperm_save_all = g_mperm_save_all;
  double* msa_ptr = nullptr;
  uintptr_t* __restrict__ loadbuf;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uint32_t* __restrict__ precomp_start;
  uint32_t* __restrict__ precomp_ui;
  double* __restrict__ precomp_d;
  double* __restrict__ orig_pvals;
  double* __restrict__ orig_chisq;
  uint16_t* ldrefs;
  uintptr_t* loadbuf_cur;
  uint32_t* resultbuf;
  uint32_t* gpui;
  double* gpd;
  uintptr_t pidx;
  uintptr_t marker_idx;
  intptr_t col1_sum;
  intptr_t col2_sum;
  intptr_t tot_obs;
  uint32_t block_start;
  uint32_t maxt_block_base;
  uint32_t maxt_block_base2;
  uint32_t marker_bidx_start;
  uint32_t maxt_block_base3;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t success_2incr;
  uint32_t missing_start;
  uint32_t case_homx_ct;
  uint32_t case_missing_ct;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  double stat_high;
  double stat_low;
  double sval;
  uint32_t missing_ct;
  uint32_t het_ct;
  uint32_t homcom_ct;
  uint32_t ldref;
  while (1) {
    block_start = g_block_start;
    if (g_block_diff <= assoc_thread_ct) {
      // fewer markers than threads: one marker per participating thread
      if (g_block_diff <= tidx) {
        goto model_maxt_domrec_thread_skip_all;
      }
      marker_bidx_start = block_start + tidx;
      marker_bceil = marker_bidx_start + 1;
    } else {
      marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
      marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
    }
    maxt_block_base = g_maxt_block_base;
    maxt_block_base2 = maxt_block_base + block_start;
    maxt_block_base3 = maxt_block_base + marker_bidx_start;
    marker_bidx = marker_bidx_start;
    marker_idx = maxt_block_base3;
    loadbuf = g_loadbuf;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    precomp_start = g_precomp_start;
    precomp_ui = g_precomp_ui;
    precomp_d = g_precomp_d;
    orig_pvals = g_orig_pvals;
    orig_chisq = g_orig_chisq;
    resultbuf = g_resultbuf;
    ldrefs = g_ldrefs;
    // seed local extrema with the carried-over extreme stats
    memcpy(results, &(g_maxt_extreme_stat[pidx_offset]), perm_vec_ct * sizeof(double));
    if (mperm_save_all) {
      msa_ptr = &(mperm_save_all[marker_idx * perm_vec_ct]);
    }
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      if (model_fisher) {
        if (orig_pvals[marker_idx] == -9) {
          // shared skip path; also jumped to from the chi-square branch below
        model_maxt_domrec_thread_skip_marker:
          marker_idx++;
          if (msa_ptr) {
            for (pidx = 0; pidx < perm_vec_ct; pidx++) {
              *msa_ptr++ = -9;
            }
          }
          continue;
        }
        stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
        stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
      } else {
        if (orig_chisq[marker_idx] == -9) {
          goto model_maxt_domrec_thread_skip_marker;
        }
        stat_high = orig_chisq[marker_idx] + EPSILON;
        stat_low = orig_chisq[marker_idx] - EPSILON;
      }
      gpd = &(precomp_d[2 * precomp_width * marker_bidx]);
      missing_ct = missing_cts[marker_idx];
      het_ct = het_cts[marker_idx];
      homcom_ct = homcom_cts[marker_idx];
      tot_obs = pheno_nm_ct - missing_ct;
      if (is_model_prec) {
        // recessive: col1 = homozygous-rare, col2 = common-allele carriers
        col2_sum = homcom_ct + het_ct;
        col1_sum = tot_obs - col2_sum;
      } else {
        // dominant: col1 = homozygous-common, col2 = rare-allele carriers
        col1_sum = homcom_ct;
        col2_sum = tot_obs - col1_sum;
      }
      missing_start = precomp_start[marker_bidx];
      // 6 precomputed values per case-missing count: 4 success/tie
      // thresholds plus a base count and width for the max(T) shortcut
      gpui = &(precomp_ui[6 * precomp_width * marker_bidx]);
      success_2incr = 0;
      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
      ldref = ldrefs[marker_idx];
      // three per-permutation count tracks for this marker
      git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
      git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
      git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
      if (ldref == 65535) {
        // 65535 = "no LD reference chosen yet" sentinel; pick one now
        ldref = marker_bidx;
        if (pheno_nm_ct - homcom_ct > 50) {
          check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 50, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
        }
        ldrefs[marker_idx] = ldref;
      }
      if (ldref == marker_bidx) {
        // no cheaper LD reference: full genotype-indexing pass
        fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
        calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
        // calc_git() leaves partial sums in the workspace; clear for reuse
        fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
      } else {
        // start from the reference marker's counts and patch the differences
        memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
        calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
      }
      for (pidx = 0; pidx < perm_vec_ct; pidx++) {
        case_missing_ct = git_missing_cts[pidx];
        if (is_model_prec) {
          case_homx_ct = git_homrar_cts[pidx];
        } else {
          case_homx_ct = case_ct - case_missing_ct - git_homrar_cts[pidx] - git_het_cts[pidx];
        }
        // deliberate underflow
        uii = (uint32_t)(case_missing_ct - missing_start);
        if (uii < precomp_width) {
          // fast path: precomputed thresholds decide success/tie counting
          if (case_homx_ct < gpui[6 * uii]) {
            if (case_homx_ct < gpui[6 * uii + 2]) {
              success_2incr += 2;
            } else {
              success_2incr++;
            }
          } else {
            if (case_homx_ct >= gpui[6 * uii + 1]) {
              if (case_homx_ct >= gpui[6 * uii + 3]) {
                success_2incr += 2;
              } else {
                success_2incr++;
              }
            }
          }
          ukk = gpui[6 * uii + 4];
          ujj = (uint32_t)(case_homx_ct - ukk); // deliberate underflow
          if (ujj >= gpui[6 * uii + 5]) {
            // outside the precomputed "cannot affect extreme stat" window:
            // evaluate the statistic and update the running extreme
            if (model_fisher) {
              ujj = case_ct - case_missing_ct;
              sval = fisher22_tail_pval(ukk, ujj - ukk, col1_sum - ukk, col2_sum + ukk - ujj, gpui[6 * uii + 5] - 1, gpd[2 * uii], gpd[2 * uii + 1], fisher_midp, case_homx_ct);
              if (results[pidx] > sval) {
                results[pidx] = sval;
              }
            } else {
              sval = ((double)((intptr_t)case_homx_ct)) - gpd[2 * uii];
              sval = sval * sval * gpd[2 * uii + 1];
              if (results[pidx] < sval) {
                results[pidx] = sval;
              }
            }
          }
        } else {
          // slow path: evaluate the 2x2 table test directly
          uii = case_ct - case_missing_ct;
          if (model_fisher) {
            sval = fisher22(case_homx_ct, uii - case_homx_ct, col1_sum - case_homx_ct, col2_sum + case_homx_ct - uii, fisher_midp);
            if (sval < stat_low) {
              success_2incr += 2;
            } else if (sval <= stat_high) {
              success_2incr++;
            }
            if (results[pidx] > sval) {
              results[pidx] = sval;
            }
          } else {
            sval = chi22_eval(case_homx_ct, uii, col1_sum, tot_obs);
            if (sval > stat_high) {
              success_2incr += 2;
            } else if (sval > stat_low) {
              success_2incr++;
            }
            if (results[pidx] < sval) {
              results[pidx] = sval;
            }
          }
          if (msa_ptr) {
            *msa_ptr++ = sval;
          }
        }
      }
      perm_2success_ct[marker_idx++] += success_2incr;
    }
  model_maxt_domrec_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    THREAD_BLOCK_FINISH(tidx);
  }
}
4097
THREAD_RET_TYPE model_set_domrec_thread(void* arg) {
  // Similar to assoc_set_thread().
  // Set-based permutation worker for dominant/recessive model tests: for
  // each marker in this thread's slice, writes the chi-square statistic for
  // every permuted phenotype into g_mperm_save_all (indexed by block
  // position).  No adaptive stopping or max(T) tracking here.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uint32_t assoc_thread_ct = g_assoc_thread_ct;
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
  // per-thread scratch for calc_git()
  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
  uint32_t* git_homrar_cts = nullptr;
  uint32_t* git_missing_cts = nullptr;
  uint32_t* git_het_cts = nullptr;
  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
  uint32_t* resultbuf = g_resultbuf;
  uint32_t case_ct = g_perm_case_ct;
  int32_t is_model_prec = g_is_model_prec;
  uint32_t* __restrict__ perm_vecst = g_perm_vecst;
  double* msa_ptr = nullptr;
  uintptr_t* loadbuf;
  uintptr_t* loadbuf_cur;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uintptr_t pidx;
  uintptr_t marker_idx;
  intptr_t col1_sum;
  intptr_t tot_obs;
  uint32_t block_start;
  uint32_t marker_bidx_start;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t case_homx_ct;
  uint32_t case_missing_ct;
  uint32_t missing_ct;
  uint32_t het_ct;
  uint32_t homcom_ct;
  while (1) {
    block_start = g_block_start;
    if (g_block_diff <= assoc_thread_ct) {
      // fewer markers than threads: one marker per participating thread
      if (g_block_diff <= tidx) {
        goto model_set_domrec_thread_skip_all;
      }
      marker_bidx_start = block_start + tidx;
      marker_bceil = marker_bidx_start + 1;
    } else {
      marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
      marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
    }
    marker_bidx = marker_bidx_start;
    loadbuf = g_loadbuf;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      marker_idx = g_adapt_m_table[marker_bidx];
      msa_ptr = &(g_mperm_save_all[marker_bidx * perm_vec_ct]);
      missing_ct = missing_cts[marker_idx];
      het_ct = het_cts[marker_idx];
      homcom_ct = homcom_cts[marker_idx];
      tot_obs = pheno_nm_ct - missing_ct;
      if (is_model_prec) {
        // recessive: col1 = homozygous-rare count
        col1_sum = tot_obs - homcom_ct - het_ct;
      } else {
        // dominant: col1 = homozygous-common count
        col1_sum = homcom_ct;
      }
      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
      // three per-permutation count tracks (homrar/missing/het) per marker
      git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
      git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
      git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
      fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
      calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
      // calc_git() leaves partial sums in the workspace; clear for reuse
      fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
      for (pidx = 0; pidx < perm_vec_ct; pidx++) {
        case_missing_ct = git_missing_cts[pidx];
        if (is_model_prec) {
          case_homx_ct = git_homrar_cts[pidx];
        } else {
          case_homx_ct = case_ct - case_missing_ct - git_homrar_cts[pidx] - git_het_cts[pidx];
        }
        *msa_ptr++ = chi22_eval(case_homx_ct, case_ct - case_missing_ct, col1_sum, tot_obs);
      }
    }
  model_set_domrec_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    THREAD_BLOCK_FINISH(tidx);
  }
}
4187
THREAD_RET_TYPE model_adapt_trend_thread(void* arg) {
  // Adaptive permutation worker for the Cochran-Armitage trend test.
  // Same adaptive early-stop structure as model_adapt_domrec_thread(),
  // but the permuted statistic is ca_trend_eval() on the case A2-allele
  // count, and only a chi-square (no Fisher) path exists.
  uintptr_t tidx = (uintptr_t)arg;
  uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uint32_t assoc_thread_ct = g_assoc_thread_ct;
  uint32_t pidx_offset = g_perms_done - perm_vec_ct;
  uint32_t precomp_width = g_precomp_width;
  uint32_t first_adapt_check = g_first_adapt_check;
  uint32_t case_ct = g_perm_case_ct;
  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
  uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
  double adaptive_intercept = g_adaptive_intercept;
  double adaptive_slope = g_adaptive_slope;
  double adaptive_ci_zt = g_adaptive_ci_zt;
  double aperm_alpha = g_aperm_alpha;
  uintptr_t* __restrict__ loadbuf;
  double* __restrict__ orig_pvals;
  double* __restrict__ orig_chisq;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uint32_t* __restrict__ precomp_start;
  uint32_t* __restrict__ precomp_ui;
  uint32_t* gpui;
  uintptr_t marker_idx;
  uintptr_t pidx;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t success_2start;
  uint32_t success_2incr;
  uint32_t next_adapt_check;
  intptr_t tot_obs;
  uint32_t missing_start;
  uint32_t het_ct;
  uint32_t homcom_ct;
  uint32_t case_com_ct;
  uint32_t case_missing_ct;
  uint32_t uii;
  double chisq_high;
  double chisq_low;
  double pval;
  double dxx;
  double dyy;
  double dzz;
  while (1) {
    if (g_block_diff <= assoc_thread_ct) {
      // fewer markers than threads: one marker per participating thread
      if (g_block_diff <= tidx) {
        goto model_adapt_trend_thread_skip_all;
      }
      marker_bidx = g_block_start + tidx;
      marker_bceil = marker_bidx + 1;
    } else {
      marker_bidx = g_block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
      marker_bceil = g_block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
    }
    loadbuf = g_loadbuf;
    orig_pvals = g_orig_pvals;
    orig_chisq = g_orig_chisq;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    precomp_start = g_precomp_start;
    precomp_ui = g_precomp_ui;
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      marker_idx = g_adapt_m_table[marker_bidx];
      next_adapt_check = first_adapt_check;
      if (orig_pvals[marker_idx] == -9) {
        // -9 = original test undefined; mark done with all-"success" so the
        // final empirical p-value is 1
        perm_adapt_stop[marker_idx] = 1;
        perm_attempt_ct[marker_idx] = next_adapt_check;
        perm_2success_ct[marker_idx] = next_adapt_check;
        continue;
      }
      tot_obs = pheno_nm_ct - missing_cts[marker_idx];
      het_ct = het_cts[marker_idx];
      homcom_ct = homcom_cts[marker_idx];
      missing_start = precomp_start[marker_bidx];
      // 4 precomputed success/tie thresholds per case-missing count
      gpui = &(precomp_ui[4 * precomp_width * marker_bidx]);
      success_2start = perm_2success_ct[marker_idx];
      success_2incr = 0;
      // EPSILON band so float roundoff counts as a tie
      chisq_high = orig_chisq[marker_idx] + EPSILON;
      chisq_low = orig_chisq[marker_idx] - EPSILON;
      for (pidx = 0; pidx < perm_vec_ct;) {
        genovec_set_freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_com_ct, &case_missing_ct);
        // deliberate underflow
        uii = (uint32_t)(case_missing_ct - missing_start);
        if (uii < precomp_width) {
          // fast path: precomputed thresholds decide success/tie counting
          if (case_com_ct < gpui[4 * uii]) {
            if (case_com_ct < gpui[4 * uii + 2]) {
              success_2incr += 2;
            } else {
              success_2incr++;
            }
          } else {
            if (case_com_ct >= gpui[4 * uii + 1]) {
              if (case_com_ct >= gpui[4 * uii + 3]) {
                success_2incr += 2;
              } else {
                success_2incr++;
              }
            }
          }
        } else {
          // slow path: evaluate the trend statistic directly
          uii = case_ct - case_missing_ct;
          dxx = ca_trend_eval(case_com_ct, uii, het_ct, homcom_ct, tot_obs);
          if (dxx > chisq_high) {
            success_2incr += 2;
          } else if (dxx > chisq_low) {
            success_2incr++;
          }
        }
        if (++pidx == next_adapt_check - pidx_offset) {
          // adaptive check: stop when the CI around the empirical p-value
          // clearly excludes aperm_alpha
          uii = success_2start + success_2incr;
          if (uii) {
            pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
            dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
            dyy = pval - dxx; // lower bound
            dzz = pval + dxx; // upper bound
            if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
              perm_adapt_stop[marker_idx] = 1;
              perm_attempt_ct[marker_idx] = next_adapt_check;
              break;
            }
          }
          next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
        }
      }
      perm_2success_ct[marker_idx] += success_2incr;
    }
  model_adapt_trend_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    THREAD_BLOCK_FINISH(tidx);
  }
}
4326
THREAD_RET_TYPE model_maxt_trend_thread(void* arg) {
  // max(T) permutation worker for the trend test (statistic computed by
  // ca_trend_eval()).  Each thread processes a slice of the current marker
  // block; for every permutation, results[] tracks the largest statistic
  // seen so far, and perm_2success_ct[] gains 2 per permutation statistic
  // clearly beating a marker's original chi-square, 1 per near-tie.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uint32_t assoc_thread_ct = g_assoc_thread_ct;
  // Number of permutations completed in earlier batches.
  uint32_t pidx_offset = g_perms_done - perm_vec_ct;
  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
  // Per-thread scratch space for calc_git()/calc_rem().
  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
  uint32_t* git_homrar_cts = nullptr;
  uint32_t* git_missing_cts = nullptr;
  uint32_t* git_het_cts = nullptr;
  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  // This thread's private running per-permutation maxima.
  double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
  uint32_t precomp_width = g_precomp_width;
  uint32_t case_ct = g_perm_case_ct;
  uint32_t* __restrict__ perm_vecst = g_perm_vecst;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  double* __restrict__ mperm_save_all = g_mperm_save_all;
  double* msa_ptr = nullptr;
  uintptr_t* __restrict__ loadbuf;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uint32_t* __restrict__ precomp_start;
  uint32_t* __restrict__ precomp_ui;
  double* __restrict__ precomp_d;
  double* __restrict__ orig_pvals;
  double* __restrict__ orig_chisq;
  uint16_t* ldrefs;
  uintptr_t* loadbuf_cur;
  uint32_t* resultbuf;
  uint32_t* gpui;
  double* gpd;
  uint32_t block_start;
  uint32_t maxt_block_base;
  uint32_t maxt_block_base2;
  uint32_t marker_bidx_start;
  uint32_t maxt_block_base3;
  uint32_t marker_bidx;
  uintptr_t marker_idx;
  uint32_t marker_bceil;
  uintptr_t pidx;
  intptr_t tot_obs;
  uint32_t success_2incr;
  uint32_t missing_start;
  uint32_t missing_ct;
  uint32_t het_ct;
  uint32_t homcom_ct;
  uint32_t ldref;
  uint32_t case_com_ct;
  uint32_t case_missing_ct;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  double chisq_high;
  double chisq_low;
  double chisq;
  while (1) {
    block_start = g_block_start;
    // Partition the block's markers across threads: one marker apiece when
    // threads >= markers (extras idle), otherwise contiguous slices.
    if (g_block_diff <= assoc_thread_ct) {
      if (g_block_diff <= tidx) {
        goto model_maxt_trend_thread_skip_all;
      }
      marker_bidx_start = block_start + tidx;
      marker_bceil = marker_bidx_start + 1;
    } else {
      marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
      marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
    }
    maxt_block_base = g_maxt_block_base;
    maxt_block_base2 = maxt_block_base + block_start;
    maxt_block_base3 = maxt_block_base + marker_bidx_start;
    marker_bidx = marker_bidx_start;
    marker_idx = maxt_block_base3;
    loadbuf = g_loadbuf;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    precomp_start = g_precomp_start;
    precomp_ui = g_precomp_ui;
    precomp_d = g_precomp_d;
    orig_pvals = g_orig_pvals;
    orig_chisq = g_orig_chisq;
    resultbuf = g_resultbuf;
    ldrefs = g_ldrefs;
    // Seed the per-permutation maxima with the extremes from earlier blocks.
    memcpy(results, &(g_maxt_extreme_stat[pidx_offset]), perm_vec_ct * sizeof(double));
    if (mperm_save_all) {
      msa_ptr = &(mperm_save_all[marker_idx * perm_vec_ct]);
    }
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      // -9 marks a marker whose original statistic was undefined; credit one
      // tie-weight per permutation and move on.
      if (orig_pvals[marker_idx] == -9) {
        perm_2success_ct[marker_idx++] += perm_vec_ct;
        continue;
      }
      missing_ct = missing_cts[marker_idx];
      tot_obs = pheno_nm_ct - missing_ct;
      het_ct = het_cts[marker_idx];
      homcom_ct = homcom_cts[marker_idx];
      missing_start = precomp_start[marker_bidx];
      // Precomputed per-marker tables, indexed by (case_missing_ct -
      // missing_start): 6 uint32s and 2 doubles per bucket.
      gpui = &(precomp_ui[6 * precomp_width * marker_bidx]);
      gpd = &(precomp_d[2 * precomp_width * marker_bidx]);
      // EPSILON band turns exact comparison into tie detection.
      chisq_high = orig_chisq[marker_idx] + EPSILON;
      chisq_low = orig_chisq[marker_idx] - EPSILON;
      success_2incr = 0;
      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
      ldref = ldrefs[marker_idx];
      git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
      git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
      git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
      // 65535 = sentinel: no LD reference marker chosen yet for delta-based
      // genotype counting.
      if (ldref == 65535) {
        ldref = marker_bidx;
        if (pheno_nm_ct - homcom_ct > 50) {
          check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 50, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
        }
        ldrefs[marker_idx] = ldref;
      }
      if (ldref == marker_bidx) {
        // No cheaper reference: count case genotypes from scratch.
        fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
        calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
        fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
      } else {
        // Reuse the reference marker's counts and apply genotype deltas only.
        memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
        calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
      }
      for (pidx = 0; pidx < perm_vec_ct; pidx++) {
        case_missing_ct = git_missing_cts[pidx];
        case_com_ct = 2 * (case_ct - case_missing_ct - git_homrar_cts[pidx]) - git_het_cts[pidx];
        // deliberate underflow
        uii = (uint32_t)(case_missing_ct - missing_start);
        if (uii < precomp_width) {
          // Fast path: compare the common-allele count against precomputed
          // thresholds instead of evaluating the statistic.
          if (case_com_ct < gpui[4 * uii]) {
            if (case_com_ct < gpui[4 * uii + 2]) {
              success_2incr += 2;
            } else {
              success_2incr++;
            }
          } else {
            if (case_com_ct >= gpui[4 * uii + 1]) {
              if (case_com_ct >= gpui[4 * uii + 3]) {
                success_2incr += 2;
              } else {
                success_2incr++;
              }
            }
          }
          // gpui[4]/gpui[5] bound the count region where the statistic
          // cannot exceed the current maximum; outside it, compute exactly.
          ukk = gpui[6 * uii + 4];
          ujj = (uint32_t)(case_com_ct - ukk); // deliberate underflow
          if (ujj >= gpui[6 * uii + 5]) {
            // chisq = (count - center)^2 * scale, from the precomputed pair.
            chisq = ((double)((intptr_t)case_com_ct)) - gpd[2 * uii];
            chisq = chisq * chisq * gpd[2 * uii + 1];
            if (results[pidx] < chisq) {
              results[pidx] = chisq;
            }
          }
        } else {
          // Missing-count outside the precomputation window: evaluate the
          // trend statistic directly.
          // NOTE(review): mperm_save_all entries are only written on this
          // path; presumably precomp_width is 0 when --mperm-save-all is
          // active -- confirm against the caller.
          chisq = ca_trend_eval(case_com_ct, case_ct - case_missing_ct, het_ct, homcom_ct, tot_obs);
          if (chisq > chisq_high) {
            success_2incr += 2;
          } else if (chisq > chisq_low) {
            success_2incr++;
          }
          if (results[pidx] < chisq) {
            results[pidx] = chisq;
          }
          if (msa_ptr) {
            *msa_ptr++ = chisq;
          }
        }
      }
      perm_2success_ct[marker_idx++] += success_2incr;
    }
  model_maxt_trend_thread_skip_all:
    // Thread 0 always returns to the master loop after a block; worker
    // threads persist until the final block.
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    THREAD_BLOCK_FINISH(tidx);
  }
}
4507
model_set_trend_thread(void * arg)4508 THREAD_RET_TYPE model_set_trend_thread(void* arg) {
4509 // Similar to model_set_domrec_thread(). (In fact, it's so similar that it
4510 // may be appropriate to merge the functions.)
4511 uintptr_t tidx = (uintptr_t)arg;
4512 uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
4513 uint32_t assoc_thread_ct = g_assoc_thread_ct;
4514 uintptr_t perm_vec_ct = g_perm_vec_ct;
4515 uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
4516 uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
4517 uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
4518 uint32_t* git_homrar_cts = nullptr;
4519 uint32_t* git_missing_cts = nullptr;
4520 uint32_t* git_het_cts = nullptr;
4521 uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
4522 uint32_t* resultbuf = g_resultbuf;
4523 uint32_t case_ct = g_perm_case_ct;
4524 uint32_t* __restrict__ perm_vecst = g_perm_vecst;
4525 double* msa_ptr = nullptr;
4526 uintptr_t* loadbuf;
4527 uintptr_t* loadbuf_cur;
4528 uint32_t* __restrict__ missing_cts;
4529 uint32_t* __restrict__ het_cts;
4530 uint32_t* __restrict__ homcom_cts;
4531 uintptr_t pidx;
4532 uintptr_t marker_idx;
4533 intptr_t tot_obs;
4534 uint32_t block_start;
4535 uint32_t marker_bidx_start;
4536 uint32_t marker_bidx;
4537 uint32_t marker_bceil;
4538 uint32_t case_com_ct;
4539 uint32_t case_missing_ct;
4540 uint32_t missing_ct;
4541 uint32_t het_ct;
4542 uint32_t homcom_ct;
4543 while (1) {
4544 block_start = g_block_start;
4545 if (g_block_diff <= assoc_thread_ct) {
4546 if (g_block_diff <= tidx) {
4547 goto model_set_trend_thread_skip_all;
4548 }
4549 marker_bidx_start = block_start + tidx;
4550 marker_bceil = marker_bidx_start + 1;
4551 } else {
4552 marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
4553 marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
4554 }
4555 marker_bidx = marker_bidx_start;
4556 loadbuf = g_loadbuf;
4557 missing_cts = g_missing_cts;
4558 het_cts = g_het_cts;
4559 homcom_cts = g_homcom_cts;
4560 for (; marker_bidx < marker_bceil; marker_bidx++) {
4561 marker_idx = g_adapt_m_table[marker_bidx];
4562 msa_ptr = &(g_mperm_save_all[marker_bidx * perm_vec_ct]);
4563 missing_ct = missing_cts[marker_idx];
4564 tot_obs = pheno_nm_ct - missing_ct;
4565 het_ct = het_cts[marker_idx];
4566 homcom_ct = homcom_cts[marker_idx];
4567 loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
4568 git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
4569 git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
4570 git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
4571 fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
4572 calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
4573 fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
4574 for (pidx = 0; pidx < perm_vec_ct; pidx++) {
4575 case_missing_ct = git_missing_cts[pidx];
4576 case_com_ct = 2 * (case_ct - case_missing_ct - git_homrar_cts[pidx]) - git_het_cts[pidx];
4577 *msa_ptr++ = ca_trend_eval(case_com_ct, case_ct - case_missing_ct, het_ct, homcom_ct, tot_obs);
4578 }
4579 }
4580 model_set_trend_thread_skip_all:
4581 if ((!tidx) || g_is_last_thread_block) {
4582 THREAD_RETURN;
4583 }
4584 THREAD_BLOCK_FINISH(tidx);
4585 }
4586 }
4587
THREAD_RET_TYPE model_adapt_gen_thread(void* arg) {
  // Adaptive-permutation worker for the genotypic (--model gen) test.
  // Markers are permuted until the confidence interval around the empirical
  // p-value excludes aperm_alpha, at which point the marker is frozen via
  // perm_adapt_stop[].  Statistic: fisher23()/chi23_eval() on the 2x3
  // case-control x genotype table, collapsing to 2x2 when a genotype
  // column is empty.
  uintptr_t tidx = (uintptr_t)arg;
  uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uint32_t assoc_thread_ct = g_assoc_thread_ct;
  // Permutations completed in earlier batches.
  uint32_t pidx_offset = g_perms_done - perm_vec_ct;
  uint32_t model_fisher = g_model_fisher;
  uint32_t fisher_midp = g_fisher_midp;
  uint32_t first_adapt_check = g_first_adapt_check;
  uint32_t case_ct = g_perm_case_ct;
  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
  uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
  double adaptive_intercept = g_adaptive_intercept;
  double adaptive_slope = g_adaptive_slope;
  double adaptive_ci_zt = g_adaptive_ci_zt;
  double aperm_alpha = g_aperm_alpha;
  uintptr_t* __restrict__ loadbuf;
  double* __restrict__ orig_pvals;
  double* __restrict__ orig_chisq;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uintptr_t marker_idx;
  uintptr_t pidx;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t success_2start;
  uint32_t success_2incr;
  uint32_t next_adapt_check;
  uint32_t missing_col;
  intptr_t tot_obs;
  intptr_t homcom_ct;
  intptr_t homrar_ct;
  intptr_t het_ct;
  uint32_t case_missing_ct;
  uint32_t case_het_ct;
  uint32_t case_homcom_ct;
  uint32_t uii;
  double stat_high;
  double stat_low;
  double pval;
  double dxx;
  double dyy;
  double dzz;
  while (1) {
    // Partition the block's markers across threads.
    if (g_block_diff <= assoc_thread_ct) {
      if (g_block_diff <= tidx) {
        goto model_adapt_gen_thread_skip_all;
      }
      marker_bidx = g_block_start + tidx;
      marker_bceil = marker_bidx + 1;
    } else {
      marker_bidx = g_block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
      marker_bceil = g_block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
    }
    loadbuf = g_loadbuf;
    orig_pvals = g_orig_pvals;
    orig_chisq = g_orig_chisq;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      marker_idx = g_adapt_m_table[marker_bidx];
      if (model_fisher) {
        // -9 = original test undefined; permanently skip this marker.
        if (orig_pvals[marker_idx] == -9) {
          perm_adapt_stop[marker_idx] = 1;
          perm_attempt_ct[marker_idx] = 0;
          continue;
        }
        // Relative EPSILON band for tie detection on the p-value scale.
        stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
        stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
      } else {
        if (orig_chisq[marker_idx] == -9) {
          perm_adapt_stop[marker_idx] = 1;
          perm_attempt_ct[marker_idx] = 0;
          continue;
        }
        // Absolute EPSILON band on the chi-square scale.
        stat_high = orig_chisq[marker_idx] + EPSILON;
        stat_low = orig_chisq[marker_idx] - EPSILON;
      }
      next_adapt_check = first_adapt_check;
      het_ct = het_cts[marker_idx];
      tot_obs = pheno_nm_ct - missing_cts[marker_idx];
      homcom_ct = homcom_cts[marker_idx];
      homrar_ct = tot_obs - het_ct - homcom_ct;
      // Degenerate tables: an all-zero genotype column reduces the chi-square
      // test to 2x2.  missing_col == 3: no common homozygotes (test the het
      // column); missing_col == 2: no hets or no rare homozygotes (test the
      // common-homozygote column); 0: full 2x3 table.
      if (!homcom_ct) {
        missing_col = 3;
      } else if ((het_ct + homcom_ct == tot_obs) || (!het_ct)) {
        missing_col = 2; // either no hom A1s or no hets (no need to distinguish)
      } else {
        missing_col = 0;
      }
      success_2start = perm_2success_ct[marker_idx];
      success_2incr = 0;
      for (pidx = 0; pidx < perm_vec_ct;) {
        genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &case_het_ct, &case_homcom_ct);
        if (model_fisher) {
          // uii = rare-homozygote count among nonmissing cases.
          uii = case_ct - case_het_ct - case_homcom_ct - case_missing_ct;
          // this is very slow. a precomputed 2-dimensional table could
          // improve matters, but I doubt it's worth the effort for now.
          dxx = fisher23(case_homcom_ct, case_het_ct, uii, homcom_ct - case_homcom_ct, het_ct - case_het_ct, homrar_ct - uii, fisher_midp);
          // Fisher: smaller p-value = more extreme, so comparisons invert.
          if (dxx < stat_low) {
            success_2incr += 2;
          } else if (dxx <= stat_high) {
            success_2incr++;
          }
        } else {
          if (!missing_col) {
            dxx = chi23_eval(case_homcom_ct, case_het_ct, case_ct - case_missing_ct, homcom_ct, het_ct, tot_obs);
          } else if (missing_col == 3) {
            dxx = chi22_eval(case_het_ct, case_ct - case_missing_ct, het_ct, tot_obs);
          } else {
            dxx = chi22_eval(case_homcom_ct, case_ct - case_missing_ct, homcom_ct, tot_obs);
          }
          if (dxx > stat_high) {
            success_2incr += 2;
          } else if (dxx > stat_low) {
            success_2incr++;
          }
        }
        if (++pidx == next_adapt_check - pidx_offset) {
          // Adaptive pruning: stop once the CI around the empirical p-value
          // excludes aperm_alpha on either side.
          uii = success_2start + success_2incr;
          if (uii) {
            // Empirical p estimate: (successes/2 + 1) / (attempts + 1),
            // expressed here in doubled units.
            pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
            dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
            dyy = pval - dxx; // lower bound
            dzz = pval + dxx; // upper bound
            if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
              perm_adapt_stop[marker_idx] = 1;
              perm_attempt_ct[marker_idx] = next_adapt_check;
              break;
            }
          }
          // Check interval grows per --aperm intercept/slope settings.
          next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
        }
      }
      perm_2success_ct[marker_idx] += success_2incr;
    }
  model_adapt_gen_thread_skip_all:
    // Thread 0 always returns after a block; workers persist to last block.
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    THREAD_BLOCK_FINISH(tidx);
  }
}
4736
THREAD_RET_TYPE model_maxt_gen_thread(void* arg) {
  // max(T) permutation worker for the genotypic (--model gen) test.  For
  // each permutation, results[] tracks the most extreme statistic across
  // this thread's slice (minimum p-value under Fisher, maximum chi-square
  // otherwise); perm_2success_ct[] gains 2 per clear beat of a marker's
  // original statistic and 1 per near-tie.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uint32_t assoc_thread_ct = g_assoc_thread_ct;
  // Permutations completed in earlier batches.
  uint32_t pidx_offset = g_perms_done - perm_vec_ct;
  uint32_t model_fisher = g_model_fisher;
  uint32_t fisher_midp = g_fisher_midp;
  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
  // Per-thread scratch space for calc_git()/calc_rem().
  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
  uint32_t* git_homrar_cts = nullptr;
  uint32_t* git_missing_cts = nullptr;
  uint32_t* git_het_cts = nullptr;
  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  // This thread's private running per-permutation extremes.
  double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
  uint32_t case_ct = g_perm_case_ct;
  uint32_t* __restrict__ perm_vecst = g_perm_vecst;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  double* __restrict__ mperm_save_all = g_mperm_save_all;
  double* msa_ptr = nullptr;
  uintptr_t* __restrict__ loadbuf;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  double* __restrict__ orig_pvals;
  double* __restrict__ orig_chisq;
  uint16_t* ldrefs;
  uintptr_t* loadbuf_cur;
  uint32_t* resultbuf;
  uintptr_t pidx;
  uint32_t missing_col;
  intptr_t tot_obs;
  uintptr_t marker_idx;
  uint32_t block_start;
  uint32_t maxt_block_base;
  uint32_t maxt_block_base2;
  uint32_t marker_bidx_start;
  uint32_t maxt_block_base3;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  int32_t missing_ct;
  intptr_t homcom_ct;
  intptr_t homrar_ct;
  intptr_t het_ct;
  uint32_t ldref;
  uint32_t success_2incr;
  uint32_t case_missing_ct;
  uint32_t case_het_ct;
  uint32_t case_homcom_ct;
  uint32_t uii;
  double stat_high;
  double stat_low;
  double sval;
  while (1) {
    block_start = g_block_start;
    // Partition the block's markers across threads.
    if (g_block_diff <= assoc_thread_ct) {
      if (g_block_diff <= tidx) {
        goto model_maxt_gen_thread_skip_all;
      }
      marker_bidx_start = block_start + tidx;
      marker_bceil = marker_bidx_start + 1;
    } else {
      marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
      marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
    }
    maxt_block_base = g_maxt_block_base;
    maxt_block_base2 = maxt_block_base + block_start;
    maxt_block_base3 = maxt_block_base + marker_bidx_start;
    marker_bidx = marker_bidx_start;
    marker_idx = maxt_block_base3;
    loadbuf = g_loadbuf;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    orig_pvals = g_orig_pvals;
    orig_chisq = g_orig_chisq;
    resultbuf = g_resultbuf;
    ldrefs = g_ldrefs;

    // Seed the per-permutation extremes with values from earlier blocks.
    memcpy(results, &(g_maxt_extreme_stat[pidx_offset]), perm_vec_ct * sizeof(double));
    if (mperm_save_all) {
      msa_ptr = &(mperm_save_all[marker_idx * perm_vec_ct]);
    }
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      if (model_fisher) {
        // -9 = original test undefined for this marker.
        if (orig_pvals[marker_idx] == -9) {
        // Shared skip path (also reached by goto from the chi-square
        // branch): advance the marker and, when --mperm-save-all is
        // active, emit -9 sentinels for all its permutation slots.
        model_maxt_gen_thread_skip_marker:
          marker_idx++;
          if (msa_ptr) {
            for (pidx = 0; pidx < perm_vec_ct; ++pidx) {
              *msa_ptr++ = -9;
            }
          }
          continue;
        }
        // Relative EPSILON band for tie detection on the p-value scale.
        stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
        stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
      } else {
        if (orig_chisq[marker_idx] == -9) {
          goto model_maxt_gen_thread_skip_marker;
        }
        stat_high = orig_chisq[marker_idx] + EPSILON;
        stat_low = orig_chisq[marker_idx] - EPSILON;
      }
      missing_ct = missing_cts[marker_idx];
      het_ct = het_cts[marker_idx];
      tot_obs = pheno_nm_ct - missing_ct;
      homcom_ct = homcom_cts[marker_idx];
      homrar_ct = tot_obs - het_ct - homcom_ct;
      // Degenerate tables: see model_adapt_gen_thread().  3 = no common
      // homozygotes, 2 = no hets or no rare homozygotes, 0 = full 2x3.
      if (!homcom_ct) {
        missing_col = 3;
      } else if ((het_ct + homcom_ct == tot_obs) || (!het_ct)) {
        missing_col = 2;
      } else {
        missing_col = 0;
      }
      success_2incr = 0;
      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
      ldref = ldrefs[marker_idx];
      git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
      git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
      git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
      // 65535 = sentinel: no LD reference marker chosen yet.
      if (ldref == 65535) {
        ldref = marker_bidx;
        if (pheno_nm_ct - homcom_ct > 50) {
          check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 50, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
        }
        ldrefs[marker_idx] = ldref;
      }
      if (ldref == marker_bidx) {
        // No cheaper reference: count case genotypes from scratch.
        fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
        calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
        fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
      } else {
        // Reuse the reference marker's counts, applying genotype deltas.
        memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
        calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
      }
      for (pidx = 0; pidx < perm_vec_ct; pidx++) {
        case_missing_ct = git_missing_cts[pidx];
        case_het_ct = git_het_cts[pidx];
        case_homcom_ct = case_ct - case_missing_ct - case_het_ct - git_homrar_cts[pidx];
        if (model_fisher) {
          // uii = rare-homozygote count among nonmissing cases.
          uii = case_ct - case_het_ct - case_homcom_ct - case_missing_ct;
          sval = fisher23(case_homcom_ct, case_het_ct, uii, homcom_ct - case_homcom_ct, het_ct - case_het_ct, homrar_ct - uii, fisher_midp);
          // Fisher: smaller p-value = more extreme; track the minimum.
          if (sval < stat_low) {
            success_2incr += 2;
          } else if (sval <= stat_high) {
            success_2incr++;
          }
          if (results[pidx] > sval) {
            results[pidx] = sval;
          }
        } else {
          if (!missing_col) {
            sval = chi23_eval(case_homcom_ct, case_het_ct, case_ct - case_missing_ct, homcom_ct, het_ct, tot_obs);
          } else if (missing_col == 3) {
            sval = chi22_eval(case_het_ct, case_ct - case_missing_ct, het_ct, tot_obs);
          } else {
            sval = chi22_eval(case_homcom_ct, case_ct - case_missing_ct, homcom_ct, tot_obs);
          }
          if (sval > stat_high) {
            success_2incr += 2;
          } else if (sval > stat_low) {
            success_2incr++;
          }
          // Chi-square: larger = more extreme; track the maximum.
          if (results[pidx] < sval) {
            results[pidx] = sval;
          }
        }
        if (msa_ptr) {
          *msa_ptr++ = sval;
        }
      }
      perm_2success_ct[marker_idx++] += success_2incr;
    }
  model_maxt_gen_thread_skip_all:
    // Thread 0 always returns after a block; workers persist to last block.
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    THREAD_BLOCK_FINISH(tidx);
  }
}
4921
THREAD_RET_TYPE model_adapt_best_thread(void* arg) {
  // Adaptive-permutation worker for --model's "best" test: per permutation,
  // evaluates the allelic 2x2 test (common-allele count) plus, unless
  // skip_domrec is set, the two homozygote-column 2x2 tests, and scores the
  // best of the evaluated statistics against the marker's original best.
  // Markers are frozen (perm_adapt_stop[]) once the confidence interval
  // around the empirical p-value excludes aperm_alpha.
  uintptr_t tidx = (uintptr_t)arg;
  uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uint32_t assoc_thread_ct = g_assoc_thread_ct;
  // Permutations completed in earlier batches.
  uint32_t pidx_offset = g_perms_done - perm_vec_ct;
  uint32_t model_fisher = g_model_fisher;
  uint32_t fisher_midp = g_fisher_midp;
  uint32_t precomp_width = g_precomp_width;
  uint32_t first_adapt_check = g_first_adapt_check;
  uint32_t case_ct = g_perm_case_ct;
  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
  uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
  double adaptive_intercept = g_adaptive_intercept;
  double adaptive_slope = g_adaptive_slope;
  double adaptive_ci_zt = g_adaptive_ci_zt;
  double aperm_alpha = g_aperm_alpha;
  uintptr_t* __restrict__ loadbuf;
  uintptr_t* is_invalid;
  double* __restrict__ orig_pvals;
  double* __restrict__ orig_chisq;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uint32_t* __restrict__ precomp_start;
  uint32_t* __restrict__ precomp_ui;
  uint32_t* gpui;
  uintptr_t marker_idx;
  uintptr_t pidx;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t success_2start;
  uint32_t success_2incr;
  uint32_t next_adapt_check;
  intptr_t tot_obs;
  intptr_t com_ct;
  intptr_t het_ct;
  intptr_t homrar_ct;
  intptr_t homcom_ct;
  uint32_t missing_start;
  uint32_t case_homrar_ct;
  uint32_t case_homcom_ct;
  uint32_t case_het_ct;
  uint32_t case_missing_ct;
  uint32_t case_com_ct;
  uint32_t skip_domrec;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  double stat_high;
  double stat_low;
  double pval;
  double dxx;
  double dyy;
  double dzz;
  while (1) {
    // Partition the block's markers across threads.
    if (g_block_diff <= assoc_thread_ct) {
      if (g_block_diff <= tidx) {
        goto model_adapt_best_thread_skip_all;
      }
      marker_bidx = g_block_start + tidx;
      marker_bceil = marker_bidx + 1;
    } else {
      marker_bidx = g_block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
      marker_bceil = g_block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
    }
    loadbuf = g_loadbuf;
    is_invalid = g_is_invalid_bitfield;
    orig_pvals = g_orig_pvals;
    orig_chisq = g_orig_chisq;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    precomp_start = g_precomp_start;
    precomp_ui = g_precomp_ui;

    for (; marker_bidx < marker_bceil; marker_bidx++) {
      marker_idx = g_adapt_m_table[marker_bidx];
      if (model_fisher) {
        // -9 = original test undefined; permanently skip this marker.
        if (orig_pvals[marker_idx] == -9) {
          perm_adapt_stop[marker_idx] = 1;
          perm_attempt_ct[marker_idx] = 0;
          continue;
        }
        // Relative EPSILON band for tie detection on the p-value scale.
        stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
        stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
      } else {
        if (orig_chisq[marker_idx] == -9) {
          perm_adapt_stop[marker_idx] = 1;
          perm_attempt_ct[marker_idx] = 0;
          continue;
        }
        stat_high = orig_chisq[marker_idx] + EPSILON;
        stat_low = orig_chisq[marker_idx] - EPSILON;
      }
      next_adapt_check = first_adapt_check;
      tot_obs = pheno_nm_ct - missing_cts[marker_idx];
      het_ct = het_cts[marker_idx];
      homcom_ct = homcom_cts[marker_idx];
      // Total common-allele count among nonmissing samples.
      com_ct = homcom_ct * 2 + het_ct;
      homrar_ct = tot_obs - het_ct - homcom_ct;
      missing_start = precomp_start[marker_bidx];
      // Markers flagged in the is_invalid bitfield skip the two
      // homozygote-column tests.
      skip_domrec = IS_SET(is_invalid, marker_idx);
      // precomp_ui layout: 12 uint32s per missing-count bucket; [0..3]
      // bound case_com_ct, [4..7] bound case_homcom_ct, [8..11] bound
      // case_homrar_ct (each quad: tie/beat thresholds in both directions).
      gpui = &(precomp_ui[12 * precomp_width * marker_bidx]);
      success_2start = perm_2success_ct[marker_idx];
      success_2incr = 0;
      for (pidx = 0; pidx < perm_vec_ct;) {
        genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &case_het_ct, &case_homcom_ct);
        case_homrar_ct = case_ct - case_missing_ct - case_het_ct - case_homcom_ct;
        case_com_ct = case_het_ct + 2 * case_homcom_ct;
        ujj = 0; // best increment so far
        // deliberate underflow
        uii = (uint32_t)(case_missing_ct - missing_start);
        if (uii < precomp_width) {
          // Fast path: compare the three counts against precomputed
          // thresholds; jumping to betterstat records a full (2x) success.
          if (case_com_ct < gpui[12 * uii]) {
            if (case_com_ct < gpui[12 * uii + 2]) {
              goto model_adapt_best_thread_betterstat;
            } else {
              ujj = 1;
            }
          } else {
            if (case_com_ct >= gpui[12 * uii + 1]) {
              if (case_com_ct >= gpui[12 * uii + 3]) {
                goto model_adapt_best_thread_betterstat;
              } else {
                ujj = 1;
              }
            }
          }
          if (!skip_domrec) {
            if (case_homcom_ct < gpui[12 * uii + 4]) {
              if (case_homcom_ct < gpui[12 * uii + 6]) {
                goto model_adapt_best_thread_betterstat;
              } else {
                ujj = 1;
              }
            } else {
              if (case_homcom_ct >= gpui[12 * uii + 5]) {
                if (case_homcom_ct >= gpui[12 * uii + 7]) {
                  goto model_adapt_best_thread_betterstat;
                } else {
                  ujj = 1;
                }
              }
            }
            if (case_homrar_ct < gpui[12 * uii + 8]) {
              if (case_homrar_ct < gpui[12 * uii + 10]) {
                goto model_adapt_best_thread_betterstat;
              } else {
                ujj = 1;
              }
            } else {
              if (case_homrar_ct >= gpui[12 * uii + 9]) {
                if (case_homrar_ct >= gpui[12 * uii + 11]) {
                  goto model_adapt_best_thread_betterstat;
                } else {
                  ujj = 1;
                }
              }
            }
          }
        } else if (1) {
          // Slow path: evaluate each 2x2 statistic directly.  The
          // "else if (1) ... else" arrangement makes the labeled branch
          // below unreachable except via the gotos above.
          uii = case_ct - case_missing_ct; // nonmissing cases
          if (model_fisher) {
            ukk = tot_obs - uii; // nonmissing controls
            dxx = fisher22(case_com_ct, 2 * uii - case_com_ct, com_ct - case_com_ct, 2 * ukk + case_com_ct - com_ct, fisher_midp);
            if (dxx < stat_low) {
              goto model_adapt_best_thread_betterstat;
            } else if (dxx <= stat_high) {
              ujj = 1;
            }
            if (!skip_domrec) {
              dxx = fisher22(case_homcom_ct, uii - case_homcom_ct, homcom_ct - case_homcom_ct, ukk + case_homcom_ct - homcom_ct, fisher_midp);
              if (dxx < stat_low) {
                goto model_adapt_best_thread_betterstat;
              } else if (dxx <= stat_high) {
                ujj = 1;
              }
              dxx = fisher22(case_homrar_ct, uii - case_homrar_ct, homrar_ct - case_homrar_ct, ukk + case_homrar_ct - homrar_ct, fisher_midp);
              if (dxx < stat_low) {
                goto model_adapt_best_thread_betterstat;
              } else if (dxx <= stat_high) {
                ujj = 1;
              }
            }
          } else {
            dxx = chi22_eval(case_com_ct, 2 * uii, com_ct, 2 * tot_obs);
            if (dxx > stat_high) {
              goto model_adapt_best_thread_betterstat;
            } else if (dxx > stat_low) {
              ujj = 1;
            }
            if (!skip_domrec) {
              dxx = chi22_eval(case_homcom_ct, uii, homcom_ct, tot_obs);
              if (dxx > stat_high) {
                goto model_adapt_best_thread_betterstat;
              } else if (dxx > stat_low) {
                ujj = 1;
              }
              dxx = chi22_eval(case_homrar_ct, uii, homrar_ct, tot_obs);
              if (dxx > stat_high) {
                goto model_adapt_best_thread_betterstat;
              } else if (dxx > stat_low) {
                ujj = 1;
              }
            }
          }
        } else {
        model_adapt_best_thread_betterstat:
          ujj = 2;
        }
        success_2incr += ujj;
        if (++pidx == next_adapt_check - pidx_offset) {
          // Adaptive pruning: stop once the CI around the empirical p-value
          // excludes aperm_alpha on either side.
          uii = success_2start + success_2incr;
          if (uii) {
            // Empirical p estimate: (successes/2 + 1) / (attempts + 1).
            pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
            dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
            dyy = pval - dxx; // lower bound
            dzz = pval + dxx; // upper bound
            if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
              perm_adapt_stop[marker_idx] = 1;
              perm_attempt_ct[marker_idx] = next_adapt_check;
              break;
            }
          }
          // Check interval grows per --aperm intercept/slope settings.
          next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
        }
      }
      perm_2success_ct[marker_idx] += success_2incr;
    }
  model_adapt_best_thread_skip_all:
    // Thread 0 always returns after a block; workers persist to last block.
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    THREAD_BLOCK_FINISH(tidx);
  }
}
5162
THREAD_RET_TYPE model_maxt_best_thread(void* arg) {
  // max(T) permutation worker for --model's "best" statistic: for each
  // assigned marker and each permuted case/control labeling, the strongest
  // of the allelic, dominant, and recessive 2x2 tests is computed (smallest
  // Fisher p-value when model_fisher is set, largest chi-square otherwise).
  // Each permutation's extreme statistic across all markers is accumulated
  // in results[], and per-marker "permutation matched/beat the original"
  // counts go to perm_2success_ct (2 units per clear win, 1 per tie).
  uintptr_t tidx = (uintptr_t)arg;  // this thread's index
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uint32_t assoc_thread_ct = g_assoc_thread_ct;
  uint32_t pidx_offset = g_perms_done - perm_vec_ct;  // first permutation index of this batch
  uint32_t model_fisher = g_model_fisher;
  uint32_t fisher_midp = g_fisher_midp;
  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
  // per-thread scratch area for the genotype-indexing counters
  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
  uint32_t* git_homrar_cts = nullptr;
  uint32_t* git_missing_cts = nullptr;
  uint32_t* git_het_cts = nullptr;
  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  // this thread's cacheline-aligned slice of the per-permutation extreme stats
  double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
  uint32_t precomp_width = g_precomp_width;
  uint32_t case_ct = g_perm_case_ct;
  uint32_t* __restrict__ perm_vecst = g_perm_vecst;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  double* __restrict__ mperm_save_all = g_mperm_save_all;  // non-null iff per-permutation stats are being saved
  double* msa_ptr = nullptr;
  uintptr_t* __restrict__ loadbuf;
  uintptr_t* is_invalid;
  uint32_t* __restrict__ missing_cts;
  uint32_t* __restrict__ het_cts;
  uint32_t* __restrict__ homcom_cts;
  uint32_t* __restrict__ precomp_start;
  uint32_t* __restrict__ precomp_ui;
  double* __restrict__ precomp_d;
  double* __restrict__ orig_pvals;
  double* __restrict__ orig_chisq;
  uint16_t* ldrefs;
  uintptr_t* loadbuf_cur;
  uint32_t* resultbuf;
  uint32_t* gpui;
  double* gpd;
  uintptr_t pidx;
  uintptr_t marker_idx;
  int32_t missing_ct;
  intptr_t tot_obs;
  intptr_t com_ct;
  intptr_t rar_ct;
  intptr_t het_ct;
  intptr_t homrar_ct;
  intptr_t homcom_ct;
  uint32_t block_start;
  uint32_t maxt_block_base;
  uint32_t maxt_block_base2;
  uint32_t marker_bidx_start;
  uint32_t maxt_block_base3;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t ldref;
  uint32_t success_2incr;
  uint32_t missing_start;
  uint32_t case_homrar_ct;
  uint32_t case_homcom_ct;
  uint32_t case_het_ct;
  uint32_t case_missing_ct;
  uint32_t case_com_ct;
  uint32_t skip_domrec;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
  uint32_t cur_add;
  double stat_high;
  double stat_low;
  double sval;
  double best_stat;
  double default_best_stat;
  while (1) {
    // Split the current marker block across threads; when there are fewer
    // markers than threads, only the low-index threads get one marker each.
    block_start = g_block_start;
    if (g_block_diff <= assoc_thread_ct) {
      if (g_block_diff <= tidx) {
        goto model_maxt_best_thread_skip_all;
      }
      marker_bidx_start = block_start + tidx;
      marker_bceil = marker_bidx_start + 1;
    } else {
      marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
      marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
    }
    maxt_block_base = g_maxt_block_base;
    maxt_block_base2 = maxt_block_base + block_start;
    maxt_block_base3 = maxt_block_base + marker_bidx_start;
    marker_bidx = marker_bidx_start;
    marker_idx = maxt_block_base3;
    loadbuf = g_loadbuf;
    is_invalid = g_is_invalid_bitfield;
    missing_cts = g_missing_cts;
    het_cts = g_het_cts;
    homcom_cts = g_homcom_cts;
    precomp_start = g_precomp_start;
    precomp_ui = g_precomp_ui;
    precomp_d = g_precomp_d;
    orig_pvals = g_orig_pvals;
    orig_chisq = g_orig_chisq;
    resultbuf = g_resultbuf;
    ldrefs = g_ldrefs;

    // seed this batch's extreme statistics with the values carried over from
    // earlier permutation batches
    memcpy(results, &(g_maxt_extreme_stat[pidx_offset]), perm_vec_ct * sizeof(double));
    if (mperm_save_all) {
      msa_ptr = &(mperm_save_all[marker_idx * perm_vec_ct]);
    }
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      if (model_fisher) {
        // -9 marks a marker whose original test was not computable; skip it
        if (orig_pvals[marker_idx] == -9) {
          marker_idx++;
          continue;
        }
        // EPSILON band separates "clearly more extreme" from "tied"
        stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
        stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
        default_best_stat = 1;
      } else {
        if (orig_chisq[marker_idx] == -9) {
          marker_idx++;
          continue;
        }
        stat_high = orig_chisq[marker_idx] + EPSILON;
        stat_low = orig_chisq[marker_idx] - EPSILON;
        default_best_stat = 0;
      }
      // Precomputed per-marker tables, one row per candidate case_missing_ct
      // value: 6 doubles (gpd) and 18 uint32 thresholds (gpui); the layout
      // mirrors the other model_maxt threads' precomp usage.
      gpd = &(precomp_d[6 * precomp_width * marker_bidx]);
      missing_ct = missing_cts[marker_idx];
      tot_obs = pheno_nm_ct - missing_ct;
      het_ct = het_cts[marker_idx];
      homcom_ct = homcom_cts[marker_idx];
      com_ct = 2 * homcom_ct + het_ct;  // common allele count
      rar_ct = tot_obs * 2 - com_ct;    // rare allele count
      homrar_ct = tot_obs - homcom_ct - het_ct;
      missing_start = precomp_start[marker_bidx];
      skip_domrec = IS_SET(is_invalid, marker_idx);
      gpui = &(precomp_ui[18 * precomp_width * marker_bidx]);
      success_2incr = 0;
      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
      ldref = ldrefs[marker_idx];
      git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
      git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
      git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
      if (ldref == 65535) {
        // 65535 = sentinel for "no LD reference chosen yet"; look for an
        // already-counted marker usable as a cheap delta base
        ldref = marker_bidx;
        if (pheno_nm_ct - homcom_ct > 50) {
          check_for_better_rem_cost(pheno_nm_ct - homcom_ct - 50, maxt_block_base, maxt_block_base2, maxt_block_base3, marker_idx, missing_cts, homcom_cts, het_cts, ldrefs, pheno_nm_ct, missing_ct, het_ct, homcom_ct, loadbuf, loadbuf_cur, &ldref);
        }
        ldrefs[marker_idx] = ldref;
      }
      if (ldref == marker_bidx) {
        // count case genotypes from scratch for all permutations at once
        fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
        calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
        fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
      } else {
        // start from the reference marker's counts and apply only the
        // genotype differences (cheaper when the markers are in strong LD)
        memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
        calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
      }
      for (pidx = 0; pidx < perm_vec_ct; pidx++) {
        case_missing_ct = git_missing_cts[pidx];
        case_het_ct = git_het_cts[pidx];
        case_homrar_ct = git_homrar_cts[pidx];
        case_homcom_ct = case_ct - case_missing_ct - case_het_ct - case_homrar_ct;
        case_com_ct = case_het_ct + 2 * case_homcom_ct;
        cur_add = 0;
        // deliberate underflow
        uii = (uint32_t)(case_missing_ct - missing_start);
        if (uii < precomp_width) {
          // Fast path: case_missing_ct falls inside the precomputed range.
          // gpui rows hold [win/tie threshold pairs] for the allelic test
          // (+0..+3), then its "must recompute" window (+4,+5), and the
          // analogous sextets for the dominant (+6..+11) and recessive
          // (+12..+17) tests.
          best_stat = default_best_stat;
          if (case_com_ct < gpui[18 * uii]) {
            if (case_com_ct < gpui[18 * uii + 2]) {
              cur_add = 2;
            } else {
              cur_add = 1;
            }
          } else {
            if (case_com_ct >= gpui[18 * uii + 1]) {
              if (case_com_ct >= gpui[18 * uii + 3]) {
                cur_add = 2;
              } else {
                cur_add = 1;
              }
            }
          }
          ukk = gpui[18 * uii + 4];
          ujj = (uint32_t)(case_com_ct - ukk); // deliberate underflow
          if (ujj >= gpui[18 * uii + 5]) {
            // outside the "cannot affect the max(T) extreme" window:
            // evaluate the allelic statistic exactly
            if (model_fisher) {
              ujj = 2 * (case_ct - case_missing_ct);
              best_stat = fisher22_tail_pval(ukk, ujj - ukk, com_ct - ukk, rar_ct + ukk - ujj, gpui[18 * uii + 5] - 1, gpd[6 * uii], gpd[6 * uii + 1], fisher_midp, case_com_ct);
            } else {
              best_stat = ((double)((intptr_t)case_com_ct)) - gpd[6 * uii];
              best_stat = best_stat * best_stat * gpd[6 * uii + 1];
            }
          }
          if (!skip_domrec) {
            if (cur_add != 2) {
              // threshold ladders for the dominant and recessive tests; the
              // "if (1)" + labeled else arm below lets both ladders jump
              // straight to cur_add = 2 via goto
              if (case_homcom_ct < gpui[18 * uii + 6]) {
                if (case_homcom_ct < gpui[18 * uii + 8]) {
                  goto model_maxt_best_thread_domrec2;
                } else {
                  cur_add = 1;
                }
              } else {
                if (case_homcom_ct >= gpui[18 * uii + 7]) {
                  if (case_homcom_ct >= gpui[18 * uii + 9]) {
                    goto model_maxt_best_thread_domrec2;
                  } else {
                    cur_add = 1;
                  }
                }
              }
              if (1) {
                if (case_homrar_ct < gpui[18 * uii + 12]) {
                  if (case_homrar_ct < gpui[18 * uii + 14]) {
                    goto model_maxt_best_thread_domrec2;
                  } else {
                    cur_add = 1;
                  }
                } else {
                  if (case_homrar_ct >= gpui[18 * uii + 13]) {
                    if (case_homrar_ct >= gpui[18 * uii + 15]) {
                      goto model_maxt_best_thread_domrec2;
                    } else {
                      cur_add = 1;
                    }
                  }
                }
              } else {
              model_maxt_best_thread_domrec2:
                cur_add = 2;
              }
            }
            // dominant test: exact evaluation when outside its window
            ukk = gpui[18 * uii + 10];
            ujj = (uint32_t)(case_homcom_ct - ukk); // deliberate underflow
            if (ujj >= gpui[18 * uii + 11]) {
              if (model_fisher) {
                ujj = case_ct - case_missing_ct;
                sval = fisher22_tail_pval(ukk, ujj - ukk, homcom_ct - ukk, homrar_ct + het_ct + ukk - ujj, gpui[18 * uii + 11] - 1, gpd[6 * uii + 2], gpd[6 * uii + 3], fisher_midp, case_homcom_ct);
                if (sval < best_stat) {
                  best_stat = sval;
                }
              } else {
                sval = ((double)((intptr_t)case_homcom_ct)) - gpd[6 * uii + 2];
                sval = sval * sval * gpd[6 * uii + 3];
                if (sval > best_stat) {
                  best_stat = sval;
                }
              }
            }
            // recessive test: exact evaluation when outside its window
            ukk = gpui[18 * uii + 16];
            ujj = (uint32_t)(case_homrar_ct - ukk); // deliberate underflow
            if (ujj >= gpui[18 * uii + 17]) {
              if (model_fisher) {
                ujj = case_ct - case_missing_ct;
                sval = fisher22_tail_pval(ukk, ujj - ukk, homrar_ct - ukk, homcom_ct + het_ct + ukk - ujj, gpui[18 * uii + 17] - 1, gpd[6 * uii + 4], gpd[6 * uii + 5], fisher_midp, case_homrar_ct);
                if (sval < best_stat) {
                  best_stat = sval;
                }
              } else {
                sval = ((double)((intptr_t)case_homrar_ct)) - gpd[6 * uii + 4];
                sval = sval * sval * gpd[6 * uii + 5];
                if (sval > best_stat) {
                  best_stat = sval;
                }
              }
            }
          }
        } else {
          // Slow path: case_missing_ct outside the precomputed range, so all
          // three tests are evaluated directly.  (Presumably precomp_width is
          // zero whenever mperm_save_all is active, making this the branch
          // that fills msa_ptr — TODO confirm against the precomp setup code.)
          uii = case_ct - case_missing_ct;  // nonmissing cases
          if (model_fisher) {
            ukk = tot_obs - uii;  // nonmissing controls
            best_stat = fisher22(case_com_ct, 2 * uii - case_com_ct, com_ct - case_com_ct, 2 * ukk + case_com_ct - com_ct, fisher_midp);
            if (!skip_domrec) {
              sval = fisher22(case_homcom_ct, uii - case_homcom_ct, homcom_ct - case_homcom_ct, ukk + case_homcom_ct - homcom_ct, fisher_midp);
              if (sval < best_stat) {
                best_stat = sval;
              }
              sval = fisher22(case_homrar_ct, uii - case_homrar_ct, homrar_ct - case_homrar_ct, ukk + case_homrar_ct - homrar_ct, fisher_midp);
              if (sval < best_stat) {
                best_stat = sval;
              }
            }
            if (best_stat < stat_low) {
              cur_add = 2;  // clearly more significant than the original
            } else if (best_stat <= stat_high) {
              cur_add = 1;  // tie
            }
          } else {
            best_stat = chi22_eval(case_com_ct, 2 * uii, com_ct, 2 * tot_obs);
            if (!skip_domrec) {
              sval = chi22_eval(case_homcom_ct, uii, homcom_ct, tot_obs);
              if (sval > best_stat) {
                best_stat = sval;
              }
              sval = chi22_eval(case_homrar_ct, uii, homrar_ct, tot_obs);
              if (sval > best_stat) {
                best_stat = sval;
              }
            }
            if (best_stat > stat_high) {
              cur_add = 2;
            } else if (best_stat > stat_low) {
              cur_add = 1;
            }
          }
          if (msa_ptr) {
            *msa_ptr++ = best_stat;
          }
        }
        success_2incr += cur_add;
        // update this permutation's extreme statistic (min p-value under
        // Fisher, max chi-square otherwise)
        if (model_fisher) {
          if (results[pidx] > best_stat) {
            results[pidx] = best_stat;
          }
        } else {
          if (results[pidx] < best_stat) {
            results[pidx] = best_stat;
          }
        }
      }
      perm_2success_ct[marker_idx++] += success_2incr;
    }
  model_maxt_best_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    // wait for the main thread to load the next marker block
    THREAD_BLOCK_FINISH(tidx);
  }
}
5491
model_set_best_thread(void * arg)5492 THREAD_RET_TYPE model_set_best_thread(void* arg) {
5493 // Similar to model_set_domrec_thread().
5494 uintptr_t tidx = (uintptr_t)arg;
5495 uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
5496 uint32_t assoc_thread_ct = g_assoc_thread_ct;
5497 uintptr_t perm_vec_ct = g_perm_vec_ct;
5498 uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
5499 uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
5500 uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
5501 uint32_t* git_homrar_cts = nullptr;
5502 uint32_t* git_missing_cts = nullptr;
5503 uint32_t* git_het_cts = nullptr;
5504 uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
5505 uint32_t* resultbuf = g_resultbuf;
5506 uint32_t case_ct = g_perm_case_ct;
5507 uint32_t* __restrict__ perm_vecst = g_perm_vecst;
5508 double* msa_ptr = nullptr;
5509 uintptr_t* loadbuf;
5510 uintptr_t* loadbuf_cur;
5511 uintptr_t* is_invalid;
5512 uint32_t* __restrict__ missing_cts;
5513 uint32_t* __restrict__ het_cts;
5514 uint32_t* __restrict__ homcom_cts;
5515 uintptr_t pidx;
5516 uintptr_t marker_idx;
5517 intptr_t tot_obs;
5518 intptr_t com_ct;
5519 intptr_t het_ct;
5520 intptr_t homrar_ct;
5521 intptr_t homcom_ct;
5522 double best_stat;
5523 double sval;
5524 uint32_t block_start;
5525 uint32_t marker_bidx_start;
5526 uint32_t marker_bidx;
5527 uint32_t marker_bceil;
5528 uint32_t case_homrar_ct;
5529 uint32_t case_homcom_ct;
5530 uint32_t case_het_ct;
5531 uint32_t case_missing_ct;
5532 uint32_t case_com_ct;
5533 uint32_t skip_domrec;
5534 uint32_t uii;
5535 int32_t missing_ct;
5536 while (1) {
5537 block_start = g_block_start;
5538 if (g_block_diff <= assoc_thread_ct) {
5539 if (g_block_diff <= tidx) {
5540 goto model_set_best_thread_skip_all;
5541 }
5542 marker_bidx_start = block_start + tidx;
5543 marker_bceil = marker_bidx_start + 1;
5544 } else {
5545 marker_bidx_start = block_start + (((uint64_t)tidx) * g_block_diff) / assoc_thread_ct;
5546 marker_bceil = block_start + (((uint64_t)tidx + 1) * g_block_diff) / assoc_thread_ct;
5547 }
5548 marker_bidx = marker_bidx_start;
5549 loadbuf = g_loadbuf;
5550 is_invalid = g_is_invalid_bitfield;
5551 missing_cts = g_missing_cts;
5552 het_cts = g_het_cts;
5553 homcom_cts = g_homcom_cts;
5554 for (; marker_bidx < marker_bceil; marker_bidx++) {
5555 marker_idx = g_adapt_m_table[marker_bidx];
5556 msa_ptr = &(g_mperm_save_all[marker_bidx * perm_vec_ct]);
5557 missing_ct = missing_cts[marker_idx];
5558 tot_obs = pheno_nm_ct - missing_ct;
5559 het_ct = het_cts[marker_idx];
5560 homcom_ct = homcom_cts[marker_idx];
5561 com_ct = 2 * homcom_ct + het_ct;
5562 homrar_ct = tot_obs - homcom_ct - het_ct;
5563 skip_domrec = IS_SET(is_invalid, marker_idx);
5564 loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
5565 git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
5566 git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
5567 git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
5568 fill_uint_zero(3 * perm_vec_ctcl4m, git_homrar_cts);
5569 calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
5570 fill_uint_zero(perm_ctvc * 72 * BYTECT4, thread_git_wkspace);
5571 for (pidx = 0; pidx < perm_vec_ct; pidx++) {
5572 case_missing_ct = git_missing_cts[pidx];
5573 case_het_ct = git_het_cts[pidx];
5574 case_homrar_ct = git_homrar_cts[pidx];
5575 case_homcom_ct = case_ct - case_missing_ct - case_het_ct - case_homrar_ct;
5576 case_com_ct = case_het_ct + 2 * case_homcom_ct;
5577 uii = case_ct - case_missing_ct;
5578 best_stat = chi22_eval(case_com_ct, 2 * uii, com_ct, 2 * tot_obs);
5579 if (!skip_domrec) {
5580 sval = chi22_eval(case_homcom_ct, uii, homcom_ct, tot_obs);
5581 if (sval > best_stat) {
5582 best_stat = sval;
5583 }
5584 sval = chi22_eval(case_homrar_ct, uii, homrar_ct, tot_obs);
5585 if (sval > best_stat) {
5586 best_stat = sval;
5587 }
5588 }
5589 *msa_ptr++ = best_stat;
5590 }
5591 }
5592 model_set_best_thread_skip_all:
5593 if ((!tidx) || g_is_last_thread_block) {
5594 THREAD_RETURN;
5595 }
5596 THREAD_BLOCK_FINISH(tidx);
5597 }
5598 }
5599
int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* outname_end2, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, uintptr_t* marker_exclude_mid, uintptr_t marker_ct_mid, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sex_male, Aperm_info* apip, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* founder_pnm, uint32_t gender_req, uint32_t ld_ignore_x, uint32_t hh_exists, uint32_t perm_batch_size, Set_info* sip, uintptr_t* loadbuf_raw) {
  // Set-based permutation test driver for --assoc/--model.
  //
  // Returns 0 on success, or RET_NOMEM / RET_READ_FAIL /
  // RET_THREAD_CREATE_FAIL on failure.  Results are written to
  // <outname>.set.perm or <outname>.set.mperm.
  //
  // Could reuse more of the code in model_assoc() since there's considerable
  // overlap, but there are enough differences between the regular and set
  // permutation tests that separating this out and doing a fair bit of
  // cut-and-paste is justifiable (especially for the first version of this
  // function).

  // There are three levels of marker subsets here.
  // 1. marker_exclude_orig refers to all markers which passed QC filters, etc.
  //    This is needed to interpret the main set data structure.
  // 2. marker_exclude_mid refers to all markers contained in at least one set.
  //    This is a subset of marker_exclude_orig.  (They are identical if
  //    --gene-all was specified.)  It was used during the single-marker
  //    association test phase, and describes which markers orig_chisq[],
  //    g_missing_cts[], etc. elements initially refer to.
  // 3. Finally, the marker_exclude used for set-based permutation testing
  //    refers to all markers contained in at least one *significant* set.
  //    orig_chisq is collapsed before permutation to be congruent to this
  //    marker_exclude.
  unsigned char* bigstack_mark = g_bigstack_base;
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;  // .bed bytes per marker
  uintptr_t* marker_exclude = marker_exclude_mid;
  uintptr_t* unstopped_markers = nullptr;
  uintptr_t* loadbuf = g_loadbuf;
  uintptr_t* sample_male_include2 = g_sample_male_include2;
  uintptr_t* perm_adapt_set_unstopped = nullptr;
  char* tbuf2 = &(g_textbuf[MAXLINELEN]);
  double* orig_chisq = g_orig_chisq;
  double* sorted_chisq_buf = nullptr;
  uint32_t* marker_idx_to_uidx = nullptr;
  uint32_t* sorted_marker_idx_buf = nullptr;
  uint32_t* proxy_arr = nullptr;
  uint32_t* perm_2success_ct = nullptr;
  uint32_t* perm_attempt_ct = nullptr;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  uintptr_t marker_ct = marker_ct_mid;
  uintptr_t final_mask = get_final_mask(pheno_nm_ct);
  uintptr_t ulii = 0;
  double adaptive_ci_zt = 0.0;
  uint32_t model_assoc = model_modifier & MODEL_ASSOC;
  uint32_t perm_count = model_modifier & MODEL_PERM_COUNT;
  uint32_t model_perm_best = !(model_modifier & MODEL_PMASK);
  uint32_t max_thread_ct = g_thread_ct;
  uint32_t perms_done = 0;
  int32_t x_code = chrom_info_ptr->xymt_codes[X_OFFSET];
  int32_t retval = 0;
  uintptr_t* set_incl;
  uintptr_t* loadbuf_ptr;
  double* orig_set_scores;
  double* chisq_pmajor;
  double* chisq_ptr;
  double* read_dptr;
  double* write_dptr;
  unsigned char* bigstack_mark2;
  uint32_t** setdefs;
  uint32_t** ld_map;
  uintptr_t marker_uidx;
  uintptr_t marker_midx;
  uintptr_t marker_idx;
  uintptr_t marker_idx2;
  uintptr_t set_ct;
  uintptr_t set_idx;
  uintptr_t perm_vec_ct;
  uintptr_t perm_vec_ctcl4m;
  uintptr_t pidx;
  double chisq_threshold;
  double dxx;
  uint32_t perms_total;
  uint32_t block_size;
  uint32_t block_end;
  uint32_t assoc_thread_ct;
  uint32_t chrom_fo_idx;
  uint32_t chrom_end;
  uint32_t is_x;
  uint32_t is_y;
  uint32_t min_ploidy_1;
  uint32_t marker_unstopped_ct;
  uint32_t is_last_block;
  uint32_t first_adapt_check;
  uint32_t max_sigset_size;
  uint32_t uii;
  if (sip->set_test_lambda > 1.0) {
    // genomic-control-style deflation of the observed chi-square statistics
    dxx = 1.0 / sip->set_test_lambda;
    chisq_ptr = orig_chisq;
    for (marker_midx = 0; marker_midx < marker_ct; marker_midx++) {
      *chisq_ptr *= dxx;
      chisq_ptr++;
    }
  }
  ulii = (uintptr_t)(outname_end - outname);
  // don't want to overwrite .assoc extension, etc.
  memcpy(tbuf2, outname, ulii);
  // identifies significant sets, collapses marker arrays down to the markers
  // they contain, and allocates all the per-set permutation tracking buffers
  retval = set_test_common_init(threads, bedfile, bed_offset, tbuf2, &(tbuf2[ulii]), unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_ids, max_marker_id_len, marker_reverse, orig_chisq, sip, chrom_info_ptr, unfiltered_sample_ct, sex_male, founder_pnm, ld_ignore_x, hh_exists, "--assoc/--model", &marker_ct, &marker_exclude, &set_incl, &marker_idx_to_uidx, &setdefs, &set_ct, &max_sigset_size, &ld_map, &chisq_threshold, &orig_set_scores, &sorted_chisq_buf, &sorted_marker_idx_buf, &proxy_arr, &perm_adapt_set_unstopped, &perm_2success_ct, &perm_attempt_ct, &unstopped_markers);
  if (retval) {
    goto model_assoc_set_test_ret_1;
  }
  if (!set_ct) {
    // no significant sets: skip straight to report generation
    goto model_assoc_set_test_write;
  }
  if (marker_ct_mid != marker_ct) {
    // collapse these arrays so the permutation inner loop is faster
    inplace_delta_collapse_arr((char*)g_missing_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
    if (model_assoc) {
      inplace_delta_collapse_arr((char*)g_set_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
    } else {
      inplace_delta_collapse_arr((char*)g_het_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
      inplace_delta_collapse_arr((char*)g_homcom_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
      if (model_perm_best) {
        inplace_delta_collapse_bitfield(g_is_invalid_bitfield, marker_ct, marker_exclude_mid, marker_exclude);
      }
    }
  }

  if (model_modifier & MODEL_PERM) {
    // adaptive permutation: sets can stop early once their empirical p-value
    // confidence interval excludes aperm_alpha
    perms_total = apip->max;
    first_adapt_check = (apip->min < apip->init_interval)? ((int32_t)apip->init_interval) : apip->min;
    adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)set_ct)));
  } else {
    // max(T): fixed permutation count, no early stopping
    perms_total = model_mperm_val;
    first_adapt_check = perms_total + 1;
  }
  for (uii = 0; uii < set_ct; uii++) {
    perm_attempt_ct[uii] = perms_total;
  }
  if (max_thread_ct > perms_total) {
    // no point in more threads than permutations
    max_thread_ct = perms_total;
  }
  if (bigstack_init_sfmtp(max_thread_ct)) {
    goto model_assoc_set_test_ret_NOMEM;
  }
  marker_unstopped_ct = marker_ct;
  g_block_start = 0; // will be nonzero sometimes after LD-exploitation added

  // generate a permutation batch, efficiently compute chi-square stats for all
  // variants in at least one tested set, compute set score, compare to base
  // set score.
  bigstack_mark2 = g_bigstack_base;
 model_assoc_set_test_more_perms:
  if (perms_done) {
    // resuming with a new batch: advance first_adapt_check past the
    // permutations already performed
    uii = apip->init_interval;
    while (first_adapt_check <= perms_done) {
      first_adapt_check += (int32_t)(uii + ((int32_t)first_adapt_check) * apip->interval_slope);
    }
  }
  // perm_vec_ct memory allocation dependencies:
  //   g_perm_vecst: 16 * ((perm_vec_ct + 127) / 128) * pheno_nm_ct
  //   g_thread_git_wkspace: ((perm_vec_ct + 127) / 128) * 1152 * thread_ct
  //   g_resultbuf: MODEL_BLOCKSIZE * (4 * perm_vec_ct, CL-aligned) * 3
  //   g_perm_vecs: pheno_nm_ctv2 * sizeof(intptr_t) * perm_vec_ct
  //   g_mperm_save_all: MODEL_BLOCKSIZE * 8 * perm_vec_ct
  //   chisq_pmajor: marker_ct * 8 * perm_vec_ct
  // If we force perm_vec_ct to be a multiple of 128, then we have
  //   perm_vec_ct * (9 * max_thread_ct + 20 * MODEL_BLOCKSIZE +
  //                  pheno_nm_ct / 8 + sizeof(intptr_t) * pheno_nm_ctv2
  //                  + marker_ct * sizeof(double))
  perm_vec_ct = 128 * (bigstack_left() / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 1152LL * max_thread_ct + 2560LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct + 128LL * sizeof(double) * marker_ct));
  if (perm_vec_ct > perm_batch_size) {
    perm_vec_ct = perm_batch_size;
  }
  if (perm_vec_ct > perms_total - perms_done) {
    perm_vec_ct = perms_total - perms_done;
  } else if (!perm_vec_ct) {
    // not even one 128-permutation batch fits in remaining workspace
    goto model_assoc_set_test_ret_NOMEM;
  }
  perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
  perms_done += perm_vec_ct;
  g_perms_done = perms_done;
  g_perm_vec_ct = perm_vec_ct;
  // allocations below are expected to succeed since perm_vec_ct was sized
  // against bigstack_left() above
  bigstack_alloc_ul(perm_vec_ct * pheno_nm_ctv2, &g_perm_vecs);
  g_perm_generation_thread_ct = MINV(max_thread_ct, perm_vec_ct);
  ulii = 0;
  if (!g_perm_cluster_starts) {
    // unconstrained case/control permutations; main thread doubles as worker 0
    if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
      goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
    }
    generate_cc_perms_thread((void*)ulii);
  } else {
    // permute within clusters to respect --within structure
    if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
      goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
    }
    generate_cc_cluster_perms_thread((void*)ulii);
  }
  join_threads(threads, g_perm_generation_thread_ct);
  g_assoc_thread_ct = max_thread_ct;
  bigstack_alloc_ui(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE, &g_resultbuf);
#ifdef __LP64__
  ulii = ((perm_vec_ct + 127) / 128) * 4;
  bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
#else
  ulii = (perm_vec_ct + 31) / 32;
  bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
  ulii = ((perm_vec_ct + 63) / 64) * 2;
#endif
  bigstack_calloc_ui(ulii * 72 * max_thread_ct, &g_thread_git_wkspace);
  // sample-major -> permutation-major bit transpose for the git counters
  transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
  bigstack_alloc_d(MODEL_BLOCKSIZE * perm_vec_ct, &g_mperm_save_all);
  bigstack_alloc_d(marker_ct * perm_vec_ct, &chisq_pmajor);
  chrom_fo_idx = 0xffffffffU;
  marker_uidx = next_unset_unsafe(marker_exclude, 0);
  if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
    goto model_assoc_set_test_ret_READ_FAIL;
  }
  marker_idx = 0;
  marker_idx2 = 0;
  chrom_end = 0;
  do {
    if (marker_uidx >= chrom_end) {
      if (model_assoc) {
        // exploit overflow
        chrom_fo_idx++;
        refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &uii, &min_ploidy_1);
        min_ploidy_1 |= uii; // treat MT as haploid
        g_min_ploidy_1 = min_ploidy_1;
        uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
        g_is_y = is_y;
      } else {
        // no need to skip MT/haploid here, since we error out on that case
        // earlier
        do {
          chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
        } while (marker_uidx >= chrom_end);
        uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
        is_x = (uii == (uint32_t)x_code);
      }
      g_is_x = is_x;
    }
    // assemble the next block of up to MODEL_BLOCKSIZE unstopped markers
    block_size = g_block_start;
    block_end = marker_unstopped_ct - marker_idx;
    if (block_end > MODEL_BLOCKSIZE) {
      block_end = MODEL_BLOCKSIZE;
    }
    do {
      if (!IS_SET(unstopped_markers, marker_idx2)) {
        // skip over markers whose sets have all reached an adaptive stop
        do {
          marker_uidx++;
          next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
          marker_idx2++;
        } while ((marker_uidx < chrom_end) && (!IS_SET(unstopped_markers, marker_idx2)));
        if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
          goto model_assoc_set_test_ret_READ_FAIL;
        }
        if (marker_uidx >= chrom_end) {
          break;
        }
      }
      loadbuf_ptr = &(loadbuf[block_size * pheno_nm_ctv2]);
      if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
        goto model_assoc_set_test_ret_READ_FAIL;
      }
      g_adapt_m_table[block_size] = marker_idx2++;
      if (is_x && (!model_assoc)) {
        // male X genotypes are excluded from the genotypic test
        force_missing((unsigned char*)(&(loadbuf[block_size * pheno_nm_ctv2])), sample_male_include2, pheno_nm_ct);
      }
      block_size++;
      if (marker_idx + block_size == marker_unstopped_ct) {
        break;
      }
      marker_uidx++;
      if (IS_SET(marker_exclude, marker_uidx)) {
        marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
        if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
          goto model_assoc_set_test_ret_READ_FAIL;
        }
      }
    } while ((block_size < block_end) && (marker_uidx < chrom_end));
    if (!block_size) {
      continue;
    }
    g_block_diff = block_size;
    assoc_thread_ct = g_block_diff;
    if (assoc_thread_ct > max_thread_ct) {
      assoc_thread_ct = max_thread_ct;
    }
    is_last_block = (marker_idx + block_size == marker_unstopped_ct);
    ulii = 0;
    // dispatch the per-permutation statistic computation to the worker that
    // matches the requested test; main thread doubles as worker 0
    if (model_assoc) {
      if (spawn_threads2(threads, &assoc_set_thread, max_thread_ct, is_last_block)) {
        goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
      }
      assoc_set_thread((void*)ulii);
    } else if (model_modifier & (MODEL_PDOM | MODEL_PREC)) {
      if (spawn_threads2(threads, &model_set_domrec_thread, max_thread_ct, is_last_block)) {
        goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
      }
      model_set_domrec_thread((void*)ulii);
    } else if (model_modifier & MODEL_PTREND) {
      if (spawn_threads2(threads, &model_set_trend_thread, max_thread_ct, is_last_block)) {
        goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
      }
      model_set_trend_thread((void*)ulii);
    } else {
      if (spawn_threads2(threads, &model_set_best_thread, max_thread_ct, is_last_block)) {
        goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
      }
      model_set_best_thread((void*)ulii);
    }
    join_threads2(threads, max_thread_ct, is_last_block);
    for (pidx = 0; pidx < perm_vec_ct; pidx++) {
      // transpose
      read_dptr = &(g_mperm_save_all[pidx]);
      write_dptr = &(chisq_pmajor[pidx * marker_ct]);
      for (marker_bidx = 0; marker_bidx < block_size; marker_bidx++) {
        write_dptr[g_adapt_m_table[marker_bidx]] = read_dptr[marker_bidx * perm_vec_ct];
      }
    }
    marker_idx += block_size;
  } while (marker_idx < marker_unstopped_ct);
  // score each set for every permutation in this batch, update success and
  // adaptive-stop bookkeeping
  compute_set_scores(marker_ct, perm_vec_ct, set_ct, chisq_pmajor, orig_set_scores, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr, setdefs, ld_map, apip, chisq_threshold, adaptive_ci_zt, first_adapt_check, perms_done, sip->set_max, perm_adapt_set_unstopped, perm_2success_ct, perm_attempt_ct);
  bigstack_reset(bigstack_mark2);
  if (perms_done < perms_total) {
    if (model_modifier & MODEL_PERM) {
      if (!extract_set_union(setdefs, set_ct, perm_adapt_set_unstopped, unstopped_markers, marker_ct)) {
        // all sets have reached an adaptive stopping point; report the
        // largest attempt count as the number of permutations performed
        perms_done = 0;
        for (set_idx = 0; set_idx < set_ct; set_idx++) {
          if (perms_done < perm_attempt_ct[set_idx]) {
            perms_done = perm_attempt_ct[set_idx];
          }
        }
        goto model_assoc_set_test_perms_done;
      }
      // bugfix (7 Aug 2018): forgot to update marker_unstopped_ct
      marker_unstopped_ct = popcount_longs(unstopped_markers, (marker_ct + BITCT - 1) / BITCT);
    }
    printf("\r%u permutation%s complete.", perms_done, (perms_done != 1)? "s" : "");
    fflush(stdout);
    goto model_assoc_set_test_more_perms;
  }
 model_assoc_set_test_perms_done:
  putc_unlocked('\r', stdout);
  LOGPRINTF("%u permutation%s complete.\n", perms_done, (perms_done != 1)? "s" : "");
 model_assoc_set_test_write:
  if (model_modifier & MODEL_PERM) {
    memcpy(outname_end2, ".set.perm", 10);
  } else {
    memcpy(outname_end2, ".set.mperm", 11);
  }
  retval = write_set_test_results(outname, &(outname_end2[4]), sip, ld_map, setdefs, set_incl, set_ct, marker_ct_orig, marker_ct, marker_idx_to_uidx, marker_ids, max_marker_id_len, perm_2success_ct, perm_attempt_ct, mtest_adjust, perm_count, pfilter, output_min_p, chisq_threshold, orig_chisq, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr);
  while (0) {
  model_assoc_set_test_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  model_assoc_set_test_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  model_assoc_set_test_ret_THREAD_CREATE_FAIL:
    retval = RET_THREAD_CREATE_FAIL;
    break;
  }
 model_assoc_set_test_ret_1:
  bigstack_reset(bigstack_mark);
  return retval;
}
5953
get_model_assoc_precomp_bounds(uint32_t missing_ct,uint32_t is_model,uint32_t * minp,uint32_t * ctp)5954 void get_model_assoc_precomp_bounds(uint32_t missing_ct, uint32_t is_model, uint32_t* minp, uint32_t* ctp) {
5955 // Estimate which case missing counts are most common.
5956 // Expected value = (g_perm_case_ct * missing_ct / g_perm_pheno_nm_ct)
5957 // If X-chromosome and (!is_model):
5958 // Lower bound = max(0, missing_ct - 2 * (g_perm_pheno_nm_ct -
5959 // g_perm_case_ct))
5960 // Upper bound = min(g_perm_case_ct * 2, missing_ct)
5961 // (Could be a bit more precise if we tracked missing male and female
5962 // counts separately, but whatever)
5963 // Each male automatically contributes 1 to initial missing_ct!
5964 // Otherwise:
5965 // Lower bound = max(0, missing_ct - (g_perm_pheno_nm_ct - g_perm_case_ct))
5966 // Upper bound = min(g_perm_case_ct, missing_ct)
5967 double xval = ((double)(g_perm_case_ct * ((int64_t)missing_ct))) / ((double)((intptr_t)g_perm_pheno_nm_ct));
5968 intptr_t lbound = (intptr_t)(xval + EPSILON + 1 - ((double)((intptr_t)g_precomp_width)) * 0.5);
5969 intptr_t ctrl_ct = g_perm_pheno_nm_ct - g_perm_case_ct;
5970 intptr_t ubound = missing_ct;
5971 intptr_t lii;
5972 if (lbound < 0) {
5973 lbound = 0;
5974 }
5975 if (g_is_x && (!is_model)) {
5976 lii = missing_ct - (2 * ctrl_ct);
5977 if (((uintptr_t)ubound) > g_perm_case_ct * 2) {
5978 ubound = g_perm_case_ct * 2;
5979 }
5980 } else {
5981 lii = missing_ct - ctrl_ct;
5982 if (((uintptr_t)ubound) > g_perm_case_ct) {
5983 ubound = g_perm_case_ct;
5984 }
5985 }
5986 if (lii > lbound) {
5987 lbound = lii;
5988 }
5989 *minp = lbound;
5990 if ((intptr_t)(lbound + g_precomp_width) > ubound) {
5991 *ctp = ubound + 1 - lbound;
5992 } else {
5993 *ctp = g_precomp_width;
5994 }
5995 }
5996
model_assoc(pthread_t * threads,FILE * bedfile,uintptr_t bed_offset,char * outname,char * outname_end,uint32_t model_modifier,uint32_t model_cell_ct,uint32_t model_mperm_val,double ci_size,double ci_zt,double pfilter,double output_min_p,uint32_t mtest_adjust,double adjust_lambda,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude_orig,uintptr_t marker_ct_orig,char * marker_ids,uintptr_t max_marker_id_len,uint32_t plink_maxsnp,uint32_t * marker_pos,char ** marker_allele_ptrs,uintptr_t max_marker_allele_len,uintptr_t * marker_reverse,Chrom_info * chrom_info_ptr,uintptr_t unfiltered_sample_ct,uint32_t cluster_ct,uint32_t * cluster_map,uint32_t * cluster_starts,Aperm_info * apip,uint32_t mperm_save,uint32_t pheno_nm_ct,uintptr_t * pheno_nm,uintptr_t * pheno_c,uintptr_t * founder_info,uintptr_t * sex_male,uint32_t hh_exists,uint32_t ld_ignore_x,uint32_t perm_batch_size,Set_info * sip)5997 int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t model_modifier, uint32_t model_cell_ct, uint32_t model_mperm_val, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, Aperm_info* apip, uint32_t mperm_save, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* founder_info, uintptr_t* sex_male, uint32_t hh_exists, uint32_t ld_ignore_x, uint32_t perm_batch_size, Set_info* sip) {
5998 unsigned char* bigstack_mark = g_bigstack_base;
5999 uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
6000 uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
6001 uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
6002 uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
6003 int32_t retval = 0;
6004 FILE* outfile = nullptr;
6005 FILE* outfile_msa = nullptr;
6006 uintptr_t* marker_exclude = marker_exclude_orig;
6007 uintptr_t* haploid_mask = chrom_info_ptr->haploid_mask;
6008 uintptr_t marker_ct = marker_ct_orig;
6009 uintptr_t perm_vec_ct = 0;
6010 uintptr_t final_mask = get_final_mask(pheno_nm_ct);
6011 uint32_t model_assoc = model_modifier & MODEL_ASSOC;
6012 uint32_t model_perms = model_modifier & (MODEL_PERM | MODEL_MPERM);
6013 uint32_t is_set_test = model_modifier & MODEL_SET_TEST;
6014 uint32_t model_adapt_nst = (model_modifier & MODEL_PERM) && (!is_set_test);
6015 uint32_t model_maxt_nst = (model_modifier & MODEL_MPERM) && (!is_set_test);
6016 uint32_t model_perms_nst = model_perms && (!is_set_test);
6017 uint32_t model_trendonly = model_modifier & MODEL_TRENDONLY;
6018 uint32_t model_perm_best = !(model_modifier & MODEL_PMASK);
6019 uint32_t model_perm_count = model_modifier & MODEL_PERM_COUNT;
6020 uint32_t assoc_counts = model_modifier & MODEL_ASSOC_COUNTS;
6021 uint32_t display_ci = (ci_size > 0);
6022 uint32_t perms_total = 0;
6023 uint32_t male_ct = 0;
6024 uint32_t nonmale_ct = 0;
6025 uint32_t ctrl_male_ct = 0;
6026 uint32_t case_male_ct = 0;
6027 uint32_t ctrl_nonmale_ct = 0;
6028 uint32_t case_nonmale_ct = 0;
6029 uint32_t load_ctrl_ct = 0;
6030 uint32_t load_case_ct = 0;
6031 uint32_t precomp_width = 0;
6032 uint32_t is_y = 0;
6033 int32_t x_code = chrom_info_ptr->xymt_codes[X_OFFSET];
6034 int32_t y_code = chrom_info_ptr->xymt_codes[Y_OFFSET];
6035 int32_t mt_code = chrom_info_ptr->xymt_codes[MT_OFFSET];
6036 uintptr_t* sample_nonmale_ctrl_include2 = nullptr;
6037 uintptr_t* sample_nonmale_case_include2 = nullptr;
6038 uintptr_t* sample_male_ctrl_include2 = nullptr;
6039 uintptr_t* sample_male_case_include2 = nullptr;
6040 uintptr_t* sample_male_include2 = nullptr;
6041 uintptr_t* cur_ctrl_include2 = nullptr;
6042 uintptr_t* cur_case_include2 = nullptr;
6043 uintptr_t* is_invalid_bitfield = nullptr;
6044 uintptr_t* founder_pnm = nullptr;
6045 uint32_t* perm_2success_ct = nullptr;
6046 uint32_t* perm_attempt_ct = nullptr;
6047 uint32_t* set_cts = nullptr;
6048 uint32_t* het_cts = nullptr;
6049 uint32_t* homcom_cts = nullptr;
6050 uint32_t* precomp_ui = nullptr;
6051 double* orig_chisq = nullptr;
6052 double* maxt_extreme_stat = nullptr;
6053 double* orig_odds = nullptr;
6054 double* precomp_d = nullptr;
6055 unsigned char* perm_adapt_stop = nullptr;
6056 double dxx = 0.0;
6057 double dww = 0.0;
6058 double dvv = 0.0;
6059 double mult_p = 0.0;
6060 double gen_p = 0.0;
6061 double dom_p = 0.0;
6062 double rec_p = 0.0;
6063 double ca_chisq = 0.0;
6064 double maxt_cur_extreme_stat = 0;
6065 uint32_t pct = 0;
6066 uint32_t max_thread_ct = g_thread_ct;
6067 uint32_t perm_pass_idx = 0;
6068 uintptr_t perm_vec_ctcl4m = 0;
6069 uint32_t model_fisher = model_modifier & MODEL_FISHER;
6070 uint32_t model_fisherx = model_fisher && (!(model_modifier & MODEL_PTREND));
6071 uint32_t fisher_midp = model_modifier & MODEL_FISHER_MIDP;
6072 char* writebuf = g_textbuf;
6073 char* chrom_name_ptr = nullptr;
6074 uint32_t chrom_name_len = 0;
6075 char chrom_name_buf[3 + MAX_CHROM_TEXTNUM_SLEN];
6076 uint32_t mu_table[MODEL_BLOCKSIZE];
6077 uint32_t uibuf[4];
6078 char wbuf[48];
6079 char* wptr_start;
6080 char* wptr;
6081 char* wptr2;
6082 char* wptr_mid;
6083 char* wptr_mid2;
6084 char* outname_end2;
6085 uint32_t assoc_thread_ct;
6086 uint32_t fill_orig_chisq;
6087 uint32_t marker_unstopped_ct;
6088 uint32_t gender_req;
6089 uint32_t case_ct;
6090 uint32_t ctrl_ct;
6091 uint32_t chrom_fo_idx;
6092 uint32_t chrom_end;
6093 uint32_t marker_bidx;
6094 uint32_t block_size;
6095 uint32_t block_end;
6096 uint32_t perms_done;
6097 uintptr_t marker_uidx; // loading
6098 uintptr_t marker_uidx2; // writing
6099 uintptr_t marker_idx;
6100 uintptr_t marker_idx2;
6101 uint32_t* marker_idx_to_uidx;
6102 uint32_t* missp;
6103 uint32_t* setp;
6104 uint32_t* hetp;
6105 uint32_t* missing_cts;
6106 double* orig_pvals;
6107 double* orig_pvals_ptr;
6108 double* ooptr;
6109 uintptr_t* loadbuf_raw;
6110 uintptr_t* loadbuf;
6111 uintptr_t* loadbuf_ptr;
6112 uintptr_t* sample_ctrl_include2;
6113 uintptr_t* sample_case_include2;
6114 uint32_t load_sample_ct;
6115 uintptr_t ulii;
6116 uint32_t min_ploidy_1;
6117 uint32_t is_x;
6118 uint32_t is_last_block;
6119 uint32_t uii;
6120 uint32_t ujj;
6121 uint32_t ukk;
6122 uint32_t umm;
6123 uint32_t unn;
6124 uint32_t uoo;
6125 uint32_t upp;
6126 uint32_t uqq;
6127 uint32_t urr;
6128 uint32_t uss;
6129 uint32_t is_invalid;
6130 uint32_t marker_ctl;
6131 double pval;
6132 double dyy;
6133 double dzz;
6134 double da1;
6135 double da2;
6136 double du1;
6137 double du2;
6138 double ca_p;
6139 char* a1ptr;
6140 char* a2ptr;
6141 uint32_t loop_end;
6142 if (pheno_nm_ct < 2) {
6143 logerrprint("Warning: Skipping --assoc/--model since less than two phenotypes are present.\n");
6144 goto model_assoc_ret_1;
6145 }
6146 if (max_marker_allele_len > MAXLINELEN) {
6147 if (bigstack_alloc_c(2 * max_marker_allele_len + MAXLINELEN, &writebuf)) {
6148 goto model_assoc_ret_NOMEM;
6149 }
6150 }
6151 g_model_fisher = model_fisher;
6152 g_fisher_midp = fisher_midp;
6153 g_perm_pheno_nm_ct = pheno_nm_ct;
6154 perms_done = 0;
6155 g_is_model_prec = model_modifier / MODEL_PREC;
6156 g_perm_is_1bit = 0;
6157 g_mperm_save_all = nullptr;
6158 g_sample_male_include2 = nullptr;
6159 if (is_set_test) {
6160 if (bigstack_alloc_ul(unfiltered_sample_ctl, &founder_pnm)) {
6161 goto model_assoc_ret_NOMEM;
6162 }
6163 memcpy(founder_pnm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
6164 bitvec_and(founder_info, unfiltered_sample_ctl, founder_pnm);
6165 if (extract_set_union_unfiltered(sip, nullptr, unfiltered_marker_ct, marker_exclude_orig, &marker_exclude, &marker_ct)) {
6166 goto model_assoc_ret_NOMEM;
6167 }
6168 }
6169 if (model_maxt_nst) {
6170 perms_total = model_mperm_val;
6171 if (bigstack_alloc_d(perms_total, &maxt_extreme_stat)) {
6172 goto model_assoc_ret_NOMEM;
6173 }
6174 g_maxt_extreme_stat = maxt_extreme_stat;
6175 if (model_fisherx) {
6176 for (uii = 0; uii < perms_total; uii++) {
6177 maxt_extreme_stat[uii] = 1;
6178 }
6179 } else {
6180 fill_double_zero(perms_total, maxt_extreme_stat);
6181 }
6182 if (mperm_save & MPERM_DUMP_ALL) {
6183 memcpy(outname_end, ".mperm.dump.all", 16);
6184 if (fopen_checked(outname, "w", &outfile_msa)) {
6185 goto model_assoc_ret_OPEN_FAIL;
6186 }
6187 LOGPRINTFWW("Dumping all permutation %svalues to %s .\n", model_fisherx? "p-" : "chi-square ", outname);
6188 }
6189 } else {
6190 mperm_save = 0;
6191 if (model_adapt_nst) {
6192 g_aperm_alpha = apip->alpha;
6193 perms_total = apip->max;
6194 if (apip->min < apip->init_interval) {
6195 g_first_adapt_check = (int32_t)(apip->init_interval);
6196 } else {
6197 g_first_adapt_check = apip->min;
6198 }
6199 g_adaptive_intercept = apip->init_interval;
6200 g_adaptive_slope = apip->interval_slope;
6201 }
6202 }
6203 if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw)) {
6204 goto model_assoc_ret_NOMEM;
6205 }
6206 loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
6207 if (model_assoc) {
6208 if (model_fisher) {
6209 outname_end2 = memcpyb(outname_end, ".assoc.fisher", 14);
6210 } else {
6211 outname_end2 = memcpyb(outname_end, ".assoc", 7);
6212 }
6213 if (fopen_checked(outname, "w", &outfile)) {
6214 goto model_assoc_ret_OPEN_FAIL;
6215 }
6216 sprintf(g_logbuf, "Writing C/C --assoc report to %s ... ", outname);
6217 wordwrapb(25); // strlen("[generating permutations]")
6218 logprintb();
6219 fflush(stdout);
6220 sprintf(g_textbuf, " CHR %%%us BP A1 ", plink_maxsnp);
6221 fprintf(outfile, g_textbuf, "SNP");
6222 if (assoc_counts) {
6223 fputs(" C_A C_U A2 ", outfile);
6224 } else {
6225 fputs(" F_A F_U A2 ", outfile);
6226 }
6227 if (!model_fisher) {
6228 fputs(" CHISQ ", outfile);
6229 }
6230 if (fputs_checked(" P OR ", outfile)) {
6231 goto model_assoc_ret_WRITE_FAIL;
6232 }
6233 if (display_ci) {
6234 uii = (uint32_t)((int32_t)(ci_size * (100 + EPSILON)));
6235 if (uii >= 10) {
6236 fprintf(outfile, " SE L%u U%u ", uii, uii);
6237 } else {
6238 fprintf(outfile, " SE L%u U%u ", uii, uii);
6239 }
6240 }
6241 if (putc_checked('\n', outfile)) {
6242 goto model_assoc_ret_WRITE_FAIL;
6243 }
6244 } else {
6245 if (is_set(chrom_info_ptr->haploid_mask, 0)) {
6246 logerrprint("Error: --model cannot be used on haploid genomes.\n");
6247 goto model_assoc_ret_INVALID_CMDLINE;
6248 }
6249 uii = count_non_autosomal_markers(chrom_info_ptr, marker_exclude, 0, 1);
6250 if (uii) {
6251 if (is_set_test) {
6252 // given how the data structures are currently designed, and how easy
6253 // the command-line fix is, this is not worth the trouble of supporting
6254 // (this problem illustrates why core data structures should use
6255 // unfiltered indexes when possible, though)
6256 logerrprint("Error: --model set-test cannot be used with sets containing MT/haploid\nvariants. (You can use e.g. '--not-chr y, mt' to exclude them.)\n");
6257 goto model_assoc_ret_INVALID_CMDLINE;
6258 }
6259 LOGPRINTF("Excluding %u MT/haploid variant%s from --model analysis.\n", uii, (uii == 1)? "" : "s");
6260 marker_ct -= uii;
6261 if (!marker_ct) {
6262 logerrprint("Error: No variants remaining for --model analysis.\n");
6263 goto model_assoc_ret_INVALID_CMDLINE;
6264 }
6265 }
6266 outname_end2 = memcpyb(outname_end, ".model", 7);
6267 if (fopen_checked(outname, "w", &outfile)) {
6268 goto model_assoc_ret_OPEN_FAIL;
6269 }
6270 sprintf(g_logbuf, "Writing --model report to %s ... ", outname);
6271 wordwrapb(25);
6272 logprintb();
6273 fflush(stdout);
6274 if (model_perm_best && model_perms) {
6275 outname_end2 = memcpyb(outname_end2, ".best", 6);
6276 } else if ((model_modifier & MODEL_PGEN) && model_perms) {
6277 outname_end2 = memcpyb(outname_end2, ".gen", 5);
6278 } else if (model_modifier & MODEL_PDOM) {
6279 outname_end2 = memcpyb(outname_end2, ".dom", 5);
6280 } else if (model_modifier & MODEL_PREC) {
6281 outname_end2 = memcpyb(outname_end2, ".rec", 5);
6282 } else if (model_modifier & MODEL_PTREND) {
6283 outname_end2 = memcpyb(outname_end2, ".trend", 7);
6284 }
6285 sprintf(g_textbuf, " CHR %%%us A1 A2 TEST AFF UNAFF ", plink_maxsnp);
6286 fprintf(outfile, g_textbuf, "SNP");
6287 if (!model_fisher) {
6288 fputs(" CHISQ DF ", outfile);
6289 } else {
6290 outname_end2 = memcpyb(outname_end2, ".fisher", 8);
6291 }
6292 if (fputs_checked(" P\n", outfile)) {
6293 goto model_assoc_ret_WRITE_FAIL;
6294 }
6295 }
6296 marker_ctl = BITCT_TO_WORDCT(marker_ct);
6297 g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
6298 if (bigstack_alloc_ul(MODEL_BLOCKSIZE * pheno_nm_ctv2, &loadbuf) ||
6299 bigstack_alloc_d(marker_ct, &orig_pvals) ||
6300 bigstack_alloc_ui(marker_ct, &missing_cts)) {
6301 goto model_assoc_ret_NOMEM;
6302 }
6303 g_loadbuf = loadbuf;
6304 g_orig_pvals = orig_pvals;
6305 g_missing_cts = missing_cts;
6306 if (model_assoc) {
6307 if (bigstack_alloc_d(marker_ct, &orig_odds) ||
6308 bigstack_alloc_ui(marker_ct, &set_cts)) {
6309 goto model_assoc_ret_NOMEM;
6310 }
6311 g_set_cts = set_cts;
6312 }
6313 if ((!model_assoc) || model_maxt_nst) {
6314 if (bigstack_alloc_ui(marker_ct, &het_cts) ||
6315 bigstack_alloc_ui(marker_ct, &homcom_cts)) {
6316 goto model_assoc_ret_NOMEM;
6317 }
6318 g_het_cts = het_cts;
6319 g_homcom_cts = homcom_cts;
6320 }
6321 gender_req = ((x_code != -2) && is_set(chrom_info_ptr->chrom_mask, x_code)) || (model_assoc && (((y_code != -2) && is_set(chrom_info_ptr->chrom_mask, y_code))));
6322 if (gender_req) {
6323 if (bigstack_alloc_ul(pheno_nm_ctv2, &g_sample_nonmale_include2) ||
6324 bigstack_alloc_ul(pheno_nm_ctv2, &sample_male_include2)) {
6325 goto model_assoc_ret_NOMEM;
6326 }
6327 g_sample_male_include2 = sample_male_include2;
6328 quaterarr_collapse_init(sex_male, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_male_include2);
6329 male_ct = popcount01_longs(sample_male_include2, pheno_nm_ctv2);
6330 quatervec_01_init_invert(sample_male_include2, pheno_nm_ct, g_sample_nonmale_include2);
6331 nonmale_ct = pheno_nm_ct - male_ct;
6332 }
6333 // Set test does not support Fisher stats, so currently guaranteed to be
6334 // true there. Will need to modify this expression if we ever support
6335 // generation of synthetic chi-square stats from Fisher p-values.
6336 fill_orig_chisq = (!model_fisherx) || (mtest_adjust && (!model_fisher));
6337 if (fill_orig_chisq) {
6338 if (bigstack_calloc_d(marker_ct, &orig_chisq)) {
6339 goto model_assoc_ret_NOMEM;
6340 }
6341 }
6342 g_orig_chisq = orig_chisq;
6343
6344 if (model_perms) {
6345 if (cluster_starts) {
6346 retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, &g_perm_cluster_case_cts, &g_perm_cluster_cc_preimage);
6347 if (retval) {
6348 goto model_assoc_ret_1;
6349 }
6350 if (!g_perm_cluster_ct) {
6351 logerrprint("Error: No size 2+ clusters for permutation test.\n");
6352 goto model_assoc_ret_INVALID_CMDLINE;
6353 }
6354 retval = cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs);
6355 if (retval) {
6356 goto model_assoc_ret_1;
6357 }
6358 } else {
6359 g_perm_cluster_starts = nullptr;
6360 }
6361 if (!is_set_test) {
6362 if (max_thread_ct > perms_total) {
6363 max_thread_ct = perms_total;
6364 }
6365 if (bigstack_init_sfmtp(max_thread_ct)) {
6366 goto model_assoc_ret_NOMEM;
6367 }
6368 }
6369 if (model_perm_best) {
6370 if (bigstack_calloc_ul(marker_ctl, &is_invalid_bitfield)) {
6371 goto model_assoc_ret_NOMEM;
6372 }
6373 g_is_invalid_bitfield = is_invalid_bitfield;
6374 }
6375
6376 if (!is_set_test) {
6377 g_ldrefs = (uint16_t*)bigstack_alloc(marker_ct * sizeof(int16_t));
6378 if (!g_ldrefs) {
6379 goto model_assoc_ret_NOMEM;
6380 }
6381 #ifdef __LP64__
6382 fill_ulong_one((marker_ct + 3) / 4, (uintptr_t*)g_ldrefs);
6383 #else
6384 fill_ulong_one((marker_ct + 1) / 2, (uintptr_t*)g_ldrefs);
6385 #endif
6386 if (!(mperm_save & MPERM_DUMP_ALL)) {
6387 // 5.65686 = roughly 4 * sqrt(2), corresponding to 4 stdevs. this is
6388 // a somewhat arbitrary choice.
6389 // currently just need this to never exceed (2^32 - 1) / (12 * 1024),
6390 // to avoid uint32_t overflow.
6391 precomp_width = (1 + (int32_t)(sqrt(pheno_nm_ct) * EXPECTED_MISSING_FREQ * 5.65686));
6392 } else {
6393 precomp_width = 0;
6394 }
6395 g_precomp_width = precomp_width;
6396 if (bigstack_calloc_ui(marker_ct, &perm_2success_ct)) {
6397 goto model_assoc_ret_NOMEM;
6398 }
6399 if (model_maxt_nst) {
6400 if (model_fisherx) {
6401 if (model_assoc || (model_modifier & (MODEL_PDOM | MODEL_PREC))) {
6402 if (bigstack_alloc_ui(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_ui) ||
6403 bigstack_alloc_d(precomp_width * 2 * MODEL_BLOCKSIZE, &precomp_d)) {
6404 goto model_assoc_ret_NOMEM;
6405 }
6406 } else if (model_perm_best) {
6407 if (bigstack_alloc_ui(precomp_width * 18 * MODEL_BLOCKSIZE, &precomp_ui) ||
6408 bigstack_alloc_d(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_d)) {
6409 goto model_assoc_ret_NOMEM;
6410 }
6411 }
6412 } else if (model_assoc || (model_modifier & (MODEL_PDOM | MODEL_PREC | MODEL_PTREND))) {
6413 if (bigstack_alloc_ui(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_ui) ||
6414 bigstack_alloc_d(precomp_width * 2 * MODEL_BLOCKSIZE, &precomp_d)) {
6415 goto model_assoc_ret_NOMEM;
6416 }
6417 } else if (model_perm_best) {
6418 if (bigstack_alloc_ui(precomp_width * 18 * MODEL_BLOCKSIZE, &precomp_ui) ||
6419 bigstack_alloc_d(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_d)) {
6420 goto model_assoc_ret_NOMEM;
6421 }
6422 }
6423 } else if (model_assoc || (model_modifier & (MODEL_PDOM | MODEL_PREC | MODEL_PTREND))) {
6424 if (bigstack_alloc_ui(precomp_width * 4 * MODEL_BLOCKSIZE, &precomp_ui)) {
6425 goto model_assoc_ret_NOMEM;
6426 }
6427 } else if (model_perm_best) {
6428 if (bigstack_alloc_ui(precomp_width * 12 * MODEL_BLOCKSIZE, &precomp_ui)) {
6429 goto model_assoc_ret_NOMEM;
6430 }
6431 }
6432 g_perm_2success_ct = perm_2success_ct;
6433 if (model_adapt_nst) {
6434 if (bigstack_alloc_ui(marker_ct, &perm_attempt_ct) ||
6435
6436 // we need to zero out trailing bytes of the last word
6437 bigstack_calloc_uc(round_up_pow2(marker_ct, BYTECT), &perm_adapt_stop)) {
6438 goto model_assoc_ret_NOMEM;
6439 }
6440 g_perm_attempt_ct = perm_attempt_ct;
6441 g_perm_adapt_stop = perm_adapt_stop;
6442 ujj = apip->max;
6443 for (uii = 0; uii < marker_ct; uii++) {
6444 perm_attempt_ct[uii] = ujj;
6445 }
6446 }
6447 }
6448 if (!cluster_starts) {
6449 g_perm_tot_quotient = 0x100000000LLU / pheno_nm_ct;
6450 magic_num(g_perm_tot_quotient, &g_perm_totq_magic, &g_perm_totq_preshift, &g_perm_totq_postshift, &g_perm_totq_incr);
6451 }
6452 }
6453 g_precomp_ui = precomp_ui;
6454 g_precomp_d = precomp_d;
6455 if (bigstack_alloc_ul(pheno_nm_ctv2, &sample_ctrl_include2) ||
6456 bigstack_alloc_ul(pheno_nm_ctv2, &sample_case_include2)) {
6457 goto model_assoc_ret_NOMEM;
6458 }
6459 quaterarr_collapse_init(pheno_c, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_case_include2);
6460 case_ct = popcount01_longs(sample_case_include2, pheno_nm_ctv2);
6461 g_perm_case_ct = case_ct;
6462 quatervec_01_init_invert(sample_case_include2, pheno_nm_ct, sample_ctrl_include2);
6463 ctrl_ct = pheno_nm_ct - case_ct;
6464 if (gender_req) {
6465 // todo: get rid of these and just use the functions called by the
6466 // permutation tests
6467 if (bigstack_alloc_ul(pheno_nm_ctv2, &sample_nonmale_ctrl_include2) ||
6468 bigstack_alloc_ul(pheno_nm_ctv2, &sample_nonmale_case_include2) ||
6469 bigstack_alloc_ul(pheno_nm_ctv2, &sample_male_ctrl_include2) ||
6470 bigstack_alloc_ul(pheno_nm_ctv2, &sample_male_case_include2)) {
6471 goto model_assoc_ret_NOMEM;
6472 }
6473 quaterarr_collapse_init(sex_male, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_male_case_include2);
6474 bitvec_and(sample_case_include2, pheno_nm_ctv2, sample_male_case_include2);
6475 case_male_ct = popcount01_longs(sample_male_case_include2, pheno_nm_ctv2);
6476 bitvec_andnot_copy(sample_male_include2, sample_male_case_include2, pheno_nm_ctv2, sample_male_ctrl_include2);
6477 bitvec_andnot_copy(sample_case_include2, sample_male_case_include2, pheno_nm_ctv2, sample_nonmale_case_include2);
6478 bitvec_andnot_copy(sample_ctrl_include2, sample_male_ctrl_include2, pheno_nm_ctv2, sample_nonmale_ctrl_include2);
6479 ctrl_male_ct = male_ct - case_male_ct;
6480 case_nonmale_ct = case_ct - case_male_ct;
6481 ctrl_nonmale_ct = ctrl_ct - ctrl_male_ct;
6482 }
6483
6484 for (uii = 1; uii <= MODEL_BLOCKSIZE; uii++) {
6485 loadbuf[uii * pheno_nm_ctv2 - 2] = 0;
6486 loadbuf[uii * pheno_nm_ctv2 - 1] = 0;
6487 }
6488 if (model_perms) {
6489 if (bigstack_left() < pheno_nm_ctv2 * sizeof(intptr_t)) {
6490 goto model_assoc_ret_NOMEM;
6491 }
6492 }
6493 marker_unstopped_ct = marker_ct;
6494
6495 // ----- begin main loop -----
6496 model_assoc_more_perms:
6497 if (model_perms_nst) {
6498 if (!perm_pass_idx) {
6499 fputs("[generating permutations]", stdout);
6500 fflush(stdout);
6501 }
6502 if (model_adapt_nst) {
6503 if (perm_pass_idx) {
6504 uii = g_first_adapt_check;
6505 ujj = apip->init_interval;
6506 while (uii <= perms_done) {
6507 // APERM_MAX prevents infinite loop here
6508 uii += (int32_t)(ujj + ((int32_t)uii) * apip->interval_slope);
6509 }
6510 g_first_adapt_check = uii;
6511 }
6512 perm_vec_ct = bigstack_left() / (pheno_nm_ctv2 * sizeof(intptr_t));
6513 } else {
6514 // perm_vec_ct memory allocation dependencies:
6515 // g_maxt_thread_results: (8 * perm_vec_ct, cacheline-aligned) *
6516 // max_thread_ct
6517 // g_perm_vecst: 16 * ((perm_vec_ct + 127) / 128) * pheno_nm_ct
6518 // g_thread_git_wkspace: ((perm_vec_ct + 127) / 128) * 1152 * thread_ct
6519 // g_resultbuf: MODEL_BLOCKSIZE * (4 * perm_vec_ct, CL-aligned) * 3
6520 // g_perm_vecs: pheno_nm_ctv2 * sizeof(intptr_t) * perm_vec_ct
6521 // g_mperm_save_all (if needed): marker_ct * 8 * perm_vec_ct
6522 // If we force perm_vec_ct to be a multiple of 128, then we have
6523 // perm_vec_ct * (17 * max_thread_ct + 12 * MODEL_BLOCKSIZE +
6524 // pheno_nm_ct / 8 + sizeof(intptr_t) * pheno_nm_ctv2
6525 // [+ marker_ct * sizeof(double) * mperm_save_all])
6526 //
6527 // Each max(T) thread has six buffers to support rapid execution of the
6528 // genotype indexing and LD exploiter algorithms:
6529 // six with 4-bit accumulators, each has size perm_vec_ct / 2 bytes
6530 // six with 8-bit accumulators, each has size perm_vec_ct bytes
6531 // The initial 6 multiplier is to allow heterozygote, homozygote minor,
6532 // and missing genotype increments and decrements to be counted
6533 // simultaneously.
6534 // Adding all this up, we have 9 * perm_vec_ct bytes, and multiplying
6535 // by 128 yields 1152. The other thread_ct dependence contributes
6536 // 8 * perm_vec_ct bytes, multiplying by 128 yields 1024, and
6537 // 1152 + 1024 = 2176.
6538 if (mperm_save & MPERM_DUMP_ALL) {
6539 perm_vec_ct = 128 * (bigstack_left() / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 2176LL * max_thread_ct + 1536LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct + 128LL * sizeof(double) * marker_ct));
6540 } else {
6541 perm_vec_ct = 128 * (bigstack_left() / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 2176LL * max_thread_ct + 1536LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct));
6542 }
6543 }
6544 if (perm_vec_ct > perms_total - perms_done) {
6545 perm_vec_ct = perms_total - perms_done;
6546 } else if (!perm_vec_ct) {
6547 goto model_assoc_ret_NOMEM;
6548 }
6549 perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
6550 perms_done += perm_vec_ct;
6551 g_perms_done = perms_done;
6552 g_perm_vec_ct = perm_vec_ct;
6553 bigstack_alloc_ul(perm_vec_ct * pheno_nm_ctv2, &g_perm_vecs);
6554 g_perm_generation_thread_ct = MINV(max_thread_ct, perm_vec_ct);
6555 ulii = 0;
6556 if (!cluster_starts) {
6557 if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
6558 goto model_assoc_ret_THREAD_CREATE_FAIL;
6559 }
6560 generate_cc_perms_thread((void*)ulii);
6561 } else {
6562 if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
6563 goto model_assoc_ret_THREAD_CREATE_FAIL;
6564 }
6565 generate_cc_cluster_perms_thread((void*)ulii);
6566 }
6567 join_threads(threads, g_perm_generation_thread_ct);
6568 g_assoc_thread_ct = max_thread_ct;
6569 if (!model_adapt_nst) {
6570 bigstack_alloc_d(max_thread_ct * round_up_pow2(perm_vec_ct, CACHELINE_DBL), &g_maxt_thread_results);
6571 bigstack_alloc_ui(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE, &g_resultbuf);
6572 #ifdef __LP64__
6573 ulii = ((perm_vec_ct + 127) / 128) * 4;
6574 bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
6575 #else
6576 ulii = (perm_vec_ct + 31) / 32;
6577 bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
6578 ulii = ((perm_vec_ct + 63) / 64) * 2;
6579 #endif
6580 bigstack_calloc_ui(ulii * 72 * max_thread_ct, &g_thread_git_wkspace);
6581 transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
6582 if (mperm_save & MPERM_DUMP_ALL) {
6583 bigstack_alloc_d(marker_ct * perm_vec_ct, &g_mperm_save_all);
6584 }
6585 }
6586 if (!perm_pass_idx) {
6587 fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", stdout);
6588 }
6589 }
6590 if (!perm_pass_idx) {
6591 fputs("0%", stdout);
6592 fflush(stdout);
6593 }
6594 chrom_fo_idx = 0xffffffffU;
6595 marker_uidx = next_unset_unsafe(marker_exclude, 0);
6596 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
6597 goto model_assoc_ret_READ_FAIL;
6598 }
6599 marker_idx = 0;
6600 marker_idx2 = 0;
6601 chrom_end = 0;
6602 loop_end = marker_ct / 100;
6603 do {
6604 if (marker_uidx >= chrom_end) {
6605 g_block_start = 0;
6606 if (model_assoc) {
6607 // exploit overflow
6608 chrom_fo_idx++;
6609 refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &uii, &min_ploidy_1);
6610 min_ploidy_1 |= uii; // treat MT as haploid
6611 uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
6612 if (min_ploidy_1 && (!is_x)) {
6613 if (is_y) {
6614 cur_ctrl_include2 = sample_male_ctrl_include2;
6615 cur_case_include2 = sample_male_case_include2;
6616 load_sample_ct = male_ct;
6617 load_case_ct = case_male_ct;
6618 } else {
6619 cur_ctrl_include2 = sample_ctrl_include2;
6620 cur_case_include2 = sample_case_include2;
6621 load_sample_ct = pheno_nm_ct;
6622 load_case_ct = case_ct;
6623 }
6624 load_ctrl_ct = load_sample_ct - load_case_ct;
6625 }
6626 g_min_ploidy_1 = min_ploidy_1;
6627 g_is_y = is_y;
6628 } else {
6629 while (1) {
6630 do {
6631 chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
6632 } while (marker_uidx >= chrom_end);
6633 uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
6634 is_x = (uii == (uint32_t)x_code);
6635 if (((!IS_SET(haploid_mask, uii)) && (uii != (uint32_t)mt_code)) || is_x) {
6636 break;
6637 }
6638 marker_uidx = next_unset_unsafe(marker_exclude, chrom_end);
6639 }
6640 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
6641 goto model_assoc_ret_READ_FAIL;
6642 }
6643 }
6644 g_is_x = is_x;
6645 chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, uii, &chrom_name_len, chrom_name_buf);
6646 } else if (model_maxt_nst) {
6647 marker_idx -= MODEL_BLOCKKEEP;
6648 if (marker_idx) { // max(T) initial block special case, see below
6649 memcpy(loadbuf, &(loadbuf[(MODEL_BLOCKSIZE - MODEL_BLOCKKEEP) * pheno_nm_ctv2]), MODEL_BLOCKKEEP * pheno_nm_ctv2 * sizeof(intptr_t));
6650 memcpy(g_resultbuf, &(g_resultbuf[3 * (MODEL_BLOCKSIZE - MODEL_BLOCKKEEP) * perm_vec_ctcl4m]), MODEL_BLOCKKEEP * perm_vec_ctcl4m * 3 * sizeof(int32_t));
6651 }
6652 g_block_start = MODEL_BLOCKKEEP;
6653 } else {
6654 g_block_start = 0;
6655 }
6656 block_size = g_block_start;
6657 block_end = marker_unstopped_ct - marker_idx;
6658 if ((!marker_idx) && (!block_size) && model_maxt_nst) {
6659 // For max(T) permutation tests, minimize how long we have to work with
6660 // crappy precomputed values. Most important when using Fisher exact
6661 // test p-values.
6662 if (block_end > MODEL_BLOCKKEEP) {
6663 block_end = MODEL_BLOCKKEEP;
6664 }
6665 } else if (block_end > MODEL_BLOCKSIZE) {
6666 block_end = MODEL_BLOCKSIZE;
6667 }
6668 do {
6669 if (model_adapt_nst && perm_adapt_stop[marker_idx2]) {
6670 do {
6671 marker_uidx++;
6672 next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
6673 marker_idx2++;
6674 } while ((marker_uidx < chrom_end) && perm_adapt_stop[marker_idx2]);
6675 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
6676 goto model_assoc_ret_READ_FAIL;
6677 }
6678 if (marker_uidx >= chrom_end) {
6679 break;
6680 }
6681 }
6682 loadbuf_ptr = &(loadbuf[block_size * pheno_nm_ctv2]);
6683 if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
6684 goto model_assoc_ret_READ_FAIL;
6685 }
6686 if (model_adapt_nst) {
6687 g_adapt_m_table[block_size] = marker_idx2++;
6688 }
6689 if (is_x && (!model_assoc)) {
6690 force_missing((unsigned char*)(&(loadbuf[block_size * pheno_nm_ctv2])), sample_male_include2, pheno_nm_ct);
6691 }
6692 // no need for usual haploid_fix since the popcount routines here
6693 // interpret het. haploids as missing anyway
6694 mu_table[block_size++] = marker_uidx;
6695 if (marker_idx + block_size == marker_unstopped_ct) {
6696 break;
6697 }
6698 marker_uidx++;
6699 if (IS_SET(marker_exclude, marker_uidx)) {
6700 marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
6701 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
6702 goto model_assoc_ret_READ_FAIL;
6703 }
6704 }
6705 } while ((block_size < block_end) && (marker_uidx < chrom_end));
6706 if (block_size == g_block_start) {
6707 continue;
6708 }
6709 if (!perm_pass_idx) {
6710 // basic --assoc/--model
6711 orig_pvals_ptr = &(orig_pvals[marker_idx + g_block_start]);
6712 missp = &(missing_cts[marker_idx + g_block_start]);
6713 if (model_assoc) {
6714 setp = &(set_cts[marker_idx + g_block_start]);
6715 ooptr = &(orig_odds[marker_idx + g_block_start]);
6716 for (marker_bidx = g_block_start; marker_bidx < block_size; marker_bidx++) {
6717 marker_uidx2 = mu_table[marker_bidx];
6718 if (!min_ploidy_1) {
6719 if (model_maxt_nst) {
6720 single_marker_cc_3freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), sample_ctrl_include2, sample_case_include2, &unn, &uoo, &ujj, &upp, &uqq, &umm);
6721 het_cts[marker_idx + marker_bidx] = uoo + uqq;
6722 homcom_cts[marker_idx + marker_bidx] = unn + upp;
6723 uii = 2 * unn + uoo;
6724 ukk = 2 * upp + uqq;
6725 } else {
6726 single_marker_cc_freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), sample_ctrl_include2, sample_case_include2, &uii, &ujj, &ukk, &umm);
6727 }
6728 *missp = ujj + umm;
6729 *setp = uii + ukk;
6730 ujj = 2 * (ctrl_ct - ujj) - uii;
6731 umm = 2 * (case_ct - umm) - ukk;
6732 } else if (is_x) {
6733 single_marker_cc_freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), sample_nonmale_ctrl_include2, sample_nonmale_case_include2, &uii, &ujj, &ukk, &umm);
6734 *missp = 2 * (ujj + umm);
6735 *setp = uii + ukk;
6736 ujj = 2 * (ctrl_nonmale_ct - ujj) - uii;
6737 umm = 2 * (case_nonmale_ct - umm) - ukk;
6738 haploid_single_marker_cc_freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), sample_male_ctrl_include2, sample_male_case_include2, &unn, &uoo, &upp, &uqq);
6739 *missp += uoo + uqq + male_ct;
6740 *setp += unn + upp;
6741 uoo = ctrl_male_ct - uoo - unn;
6742 uqq = case_male_ct - uqq - upp;
6743 uii += unn;
6744 ujj += uoo;
6745 ukk += upp;
6746 umm += uqq;
6747 } else {
6748 haploid_single_marker_cc_freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), cur_ctrl_include2, cur_case_include2, &uii, &ujj, &ukk, &umm);
6749 *missp = ujj + umm;
6750 *setp = uii + ukk;
6751 ujj = load_ctrl_ct - ujj - uii;
6752 umm = load_case_ct - umm - ukk;
6753 if (is_y) {
6754 *missp += nonmale_ct;
6755 } else if (model_maxt_nst) {
6756 het_cts[marker_idx + marker_bidx] = 0;
6757 homcom_cts[marker_idx + marker_bidx] = *setp;
6758 }
6759 }
6760 da1 = umm;
6761 da2 = ukk;
6762 du1 = ujj;
6763 du2 = uii;
6764 if (model_fisher) {
6765 // bugfix (12 Jun 2018): If MAF is zero, test should not be
6766 // considered valid for --adjust or permutation testing purposes.
6767 // plink 1.07 got this right, but in a wrong way: it considered
6768 // *all* Fisher's-exact-test p-values of 1 to be invalid tests. So
6769 // we don't generally want to match its output (even before
6770 // considering the problems with its fisher22 routine).
6771 if ((umm + ujj) && (ukk + uii)) {
6772 pval = fisher22(uii, ujj, ukk, umm, fisher_midp);
6773 } else {
6774 pval = -9;
6775 }
6776 *orig_pvals_ptr = pval;
6777 } else {
6778 if ((umm + ujj) && (ukk + uii)) {
6779 dxx = chi22_eval(ukk, ukk + umm, uii + ukk, uii + ujj + ukk + umm);
6780 pval = chiprob_p(dxx, 1);
6781 *orig_pvals_ptr = pval;
6782 if (fill_orig_chisq) {
6783 orig_chisq[marker_idx + marker_bidx] = dxx;
6784 }
6785 } else {
6786 *orig_pvals_ptr = -9;
6787 pval = -1;
6788 dxx = 0;
6789 if (fill_orig_chisq) {
6790 orig_chisq[marker_idx + marker_bidx] = -9;
6791 }
6792 }
6793 }
6794 *ooptr = (da1 * du2) / (du1 * da2);
6795 if ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0))) {
6796 a1ptr = marker_allele_ptrs[2 * marker_uidx2];
6797 a2ptr = marker_allele_ptrs[2 * marker_uidx2 + 1];
6798 wptr = memcpyax(writebuf, chrom_name_ptr, chrom_name_len, ' ');
6799 wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr);
6800 *wptr++ = ' ';
6801 wptr = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr);
6802 wptr = fw_strcpy(4, a1ptr, wptr);
6803 *wptr++ = ' ';
6804 if (umm + ukk) {
6805 if (assoc_counts) {
6806 wptr = uint32toa_w8(umm, wptr);
6807 } else {
6808 wptr = dtoa_g_wxp4(da1 / (da1 + da2), 8, wptr);
6809 }
6810 *wptr++ = ' ';
6811 } else {
6812 wptr = memcpya(wptr, " NA ", 9);
6813 }
6814 if (ujj + uii) {
6815 if (assoc_counts) {
6816 wptr = uint32toa_w8(ujj, wptr);
6817 } else {
6818 wptr = dtoa_g_wxp4(du1 / (du1 + du2), 8, wptr);
6819 }
6820 } else {
6821 wptr = memcpya(wptr, " NA", 8);
6822 }
6823 *wptr = ' ';
6824 wptr = fw_strcpy(4, a2ptr, &(wptr[1]));
6825 *wptr++ = ' ';
6826 if (model_fisher) {
6827 if (pval == -9) {
6828 wptr = memcpya(wptr, " 1", 12);
6829 } else {
6830 wptr = dtoa_g_wxp4(MAXV(pval, output_min_p), 12, wptr);
6831 }
6832 } else {
6833 if (pval > -1) {
6834 wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
6835 wptr = dtoa_g_wxp4(MAXV(pval, output_min_p), 12, wptr);
6836 } else {
6837 wptr = memcpya(wptr, " NA NA", 25);
6838 }
6839 }
6840 *wptr++ = ' ';
6841 if (du1 * da2 == 0.0) {
6842 wptr = memcpya(wptr, " NA", 12);
6843 if (display_ci) {
6844 wptr = memcpya(wptr, " NA NA NA", 39);
6845 }
6846 } else {
6847 wptr = dtoa_g_wxp4(*ooptr, 12, wptr);
6848 if (display_ci) {
6849 dxx = log(*ooptr);
6850 dyy = sqrt(1 / da1 + 1 / da2 + 1 / du1 + 1 / du2);
6851 dzz = ci_zt * dyy;
6852 dww = exp(dxx - dzz);
6853 dvv = exp(dxx + dzz);
6854 *wptr++ = ' ';
6855 wptr = dtoa_g_wxp4x(dyy, 12, ' ', wptr);
6856 wptr = dtoa_g_wxp4x(dww, 12, ' ', wptr);
6857 wptr = dtoa_g_wxp4(dvv, 12, wptr);
6858 }
6859 }
6860 wptr = memcpya(wptr, " \n", 2);
6861 if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
6862 goto model_assoc_ret_WRITE_FAIL;
6863 }
6864 }
6865 missp++;
6866 setp++;
6867 orig_pvals_ptr++;
6868 ooptr++;
6869 }
6870 } else {
6871 // repurpose setp as homcom_cts pointer
6872 setp = &(homcom_cts[marker_idx + g_block_start]);
6873 hetp = &(het_cts[marker_idx + g_block_start]);
6874 for (marker_bidx = g_block_start; marker_bidx < block_size; marker_bidx++) {
6875 marker_uidx2 = mu_table[marker_bidx];
6876 single_marker_cc_3freqs(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), sample_ctrl_include2, sample_case_include2, &uii, &ujj, &ukk, &umm, &unn, &uoo);
6877 *missp = ukk + uoo;
6878 *setp = uii + umm;
6879 ukk = pheno_nm_ct - case_ct - uii - ujj - ukk;
6880 uoo = case_ct - umm - unn - uoo;
6881 *hetp = ujj + unn;
6882 is_invalid = (uoo < model_cell_ct) || (unn < model_cell_ct) || (umm < model_cell_ct) || (ukk < model_cell_ct) || (ujj < model_cell_ct) || (uii < model_cell_ct);
6883 a1ptr = marker_allele_ptrs[2 * marker_uidx2];
6884 a2ptr = marker_allele_ptrs[2 * marker_uidx2 + 1];
6885 wptr = memcpya(writebuf, chrom_name_ptr, chrom_name_len);
6886 *wptr++ = ' ';
6887 wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr);
6888 *wptr++ = ' ';
6889 wptr = fw_strcpy(4, a1ptr, wptr);
6890 *wptr++ = ' ';
6891 wptr = fw_strcpy(4, a2ptr, wptr);
6892 memset(wptr, 32, 2);
6893 wptr = &(wptr[2]);
6894 wptr_mid = wptr;
6895 if (!model_trendonly) {
6896 memcpy(wptr, " GENO ", 8);
6897 wptr2 = uint32toa_x(uoo, '/', wbuf);
6898 wptr2 = uint32toa_x(unn, '/', wptr2);
6899 wptr2 = uint32toa(umm, wptr2);
6900 wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr[8]));
6901 *wptr++ = ' ';
6902 wptr2 = uint32toa_x(ukk, '/', wbuf);
6903 wptr2 = uint32toa_x(ujj, '/', wptr2);
6904 wptr2 = uint32toa(uii, wptr2);
6905 wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
6906 *wptr++ = ' ';
6907 if (is_invalid) {
6908 gen_p = -9;
6909 if (fill_orig_chisq && (model_modifier & MODEL_PGEN)) {
6910 orig_chisq[marker_idx + marker_bidx] = -9;
6911 }
6912 } else {
6913 if (model_fisher) {
6914 gen_p = fisher23(uii, ujj, ukk, umm, unn, uoo, fisher_midp);
6915 } else {
6916 chi23_evalx(uii, ujj, ukk, umm, unn, uoo, &dvv, &upp);
6917 gen_p = chiprob_px(dvv, upp);
6918 if (fill_orig_chisq && (model_modifier & MODEL_PGEN)) {
6919 if (dvv != -9) {
6920 orig_chisq[marker_idx + marker_bidx] = dvv;
6921 } else {
6922 orig_chisq[marker_idx + marker_bidx] = 0;
6923 }
6924 }
6925 }
6926 }
6927 if (gen_p < -1) {
6928 wptr = model_assoc_tna(model_fisher, wptr);
6929 } else {
6930 if (!model_fisher) {
6931 wptr = dtoa_g_wxp4(dvv, 12, wptr);
6932 wptr = memcpya(wptr, " ", 4);
6933 *wptr++ = '0' + upp;
6934 *wptr++ = ' ';
6935 }
6936 wptr = dtoa_g_wxp4x(MAXV(gen_p, output_min_p), 12, '\n', wptr);
6937 }
6938 if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
6939 goto model_assoc_ret_WRITE_FAIL;
6940 }
6941 }
6942 memcpy(wptr_mid, " TREND ", 8);
6943 wptr2 = uint32toa_x(uoo * 2 + unn, '/', wbuf);
6944 wptr2 = uint32toa(umm * 2 + unn, wptr2);
6945 wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr_mid[8]));
6946 *wptr++ = ' ';
6947 wptr2 = uint32toa_x(ukk * 2 + ujj, '/', wbuf);
6948 wptr2 = uint32toa(uii * 2 + ujj, wptr2);
6949 wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
6950 *wptr++ = ' ';
6951 wptr_mid2 = wptr; // save this for next line
6952 ca_chisq = ca_trend_evalx(umm * 2 + unn, umm + unn + uoo, ujj + unn, uii + umm, uii + ujj + ukk + umm + unn + uoo);
6953 ca_p = chiprob_px(ca_chisq, 1);
6954 if (fill_orig_chisq && (model_modifier & MODEL_PTREND)) {
6955 if (ca_chisq != -9) {
6956 orig_chisq[marker_idx + marker_bidx] = ca_chisq;
6957 } else {
6958 orig_chisq[marker_idx + marker_bidx] = 0;
6959 }
6960 }
6961 if (ca_p > -1) {
6962 if (!model_fisher) {
6963 wptr = dtoa_g_wxp4(ca_chisq, 12, wptr);
6964 wptr = memcpya(wptr, " 1 ", 6);
6965 }
6966 wptr = dtoa_g_wxp4x(MAXV(ca_p, output_min_p), 12, '\n', wptr);
6967 } else {
6968 wptr = model_assoc_tna(model_fisher, wptr);
6969 }
6970 if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
6971 goto model_assoc_ret_WRITE_FAIL;
6972 }
6973 if (!model_trendonly) {
6974 memcpy(wptr_mid, "ALLELIC", 7);
6975 wptr = wptr_mid2;
6976 if (model_fisher) {
6977 mult_p = fisher22(2 * uoo + unn, 2 * umm + unn, 2 * ukk + ujj, 2 * uii + ujj, fisher_midp);
6978 } else {
6979 dww = chi22_evalx(2 * uoo + unn, 2 * (uoo + unn + umm), 2 * (uoo + ukk) + unn + ujj, 2 * (uoo + unn + umm + ukk + ujj + uii));
6980 mult_p = chiprob_px(dww, 1);
6981 }
6982 if (mult_p > -1) {
6983 if (!model_fisher) {
6984 wptr = dtoa_g_wxp4(dww, 12, wptr);
6985 wptr = memcpya(wptr, " 1 ", 6);
6986 }
6987 wptr = dtoa_g_wxp4x(MAXV(mult_p, output_min_p), 12, '\n', wptr);
6988 } else {
6989 wptr = model_assoc_tna(model_fisher, wptr);
6990 }
6991 if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
6992 goto model_assoc_ret_WRITE_FAIL;
6993 }
6994 memcpy(wptr_mid, " DOM", 7);
6995 wptr2 = uint32toa_x(uoo + unn, '/', wbuf);
6996 wptr2 = uint32toa(umm, wptr2);
6997 wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr_mid[8]));
6998 *wptr++ = ' ';
6999 wptr2 = uint32toa_x(ukk + ujj, '/', wbuf);
7000 wptr2 = uint32toa(uii, wptr2);
7001 wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
7002 *wptr++ = ' ';
7003 if (is_invalid) {
7004 dom_p = -9;
7005 if (fill_orig_chisq && (model_modifier & MODEL_PDOM)) {
7006 orig_chisq[marker_idx + marker_bidx] = -9;
7007 }
7008 } else {
7009 if (model_fisher) {
7010 dom_p = fisher22(uoo + unn, umm, ukk + ujj, uii, fisher_midp);
7011 } else {
7012 dww = chi22_evalx(uoo + unn, uoo + unn + umm, uoo + unn + ukk + ujj, uoo + unn + umm + ukk + ujj + uii);
7013 dom_p = chiprob_px(dww, 1);
7014 if (fill_orig_chisq && (model_modifier & MODEL_PDOM)) {
7015 if (dww != -9) {
7016 orig_chisq[marker_idx + marker_bidx] = dww;
7017 } else {
7018 orig_chisq[marker_idx + marker_bidx] = 0;
7019 }
7020 }
7021 }
7022 }
7023 if (dom_p < -1) {
7024 wptr = model_assoc_tna(model_fisher, wptr);
7025 } else {
7026 if (!model_fisher) {
7027 wptr = dtoa_g_wxp4(dww, 12, wptr);
7028 wptr = memcpya(wptr, " 1 ", 6);
7029 }
7030 wptr = dtoa_g_wxp4x(MAXV(dom_p, output_min_p), 12, '\n', wptr);
7031 }
7032 if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
7033 goto model_assoc_ret_WRITE_FAIL;
7034 }
7035 memcpy(&(wptr_mid[4]), "REC", 3);
7036 wptr2 = uint32toa_x(uoo, '/', wbuf);
7037 wptr2 = uint32toa(unn + umm, wptr2);
7038 wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr_mid[8]));
7039 *wptr++ = ' ';
7040 wptr2 = uint32toa_x(ukk, '/', wbuf);
7041 wptr2 = uint32toa(ujj + uii, wptr2);
7042 wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
7043 *wptr++ = ' ';
7044 if (is_invalid) {
7045 rec_p = -9;
7046 if (fill_orig_chisq && (model_modifier & MODEL_PREC)) {
7047 orig_chisq[marker_idx + marker_bidx] = -9;
7048 }
7049 } else {
7050 if (model_fisher) {
7051 rec_p = fisher22(uoo, unn + umm, ukk, ujj + uii, fisher_midp);
7052 } else {
7053 dww = chi22_evalx(uoo, uoo + unn + umm, uoo + ukk, uoo + unn + umm + ukk + ujj + uii);
7054 rec_p = chiprob_px(dww, 1);
7055 if (fill_orig_chisq && (model_modifier & MODEL_PREC)) {
7056 if (dww != -9) {
7057 orig_chisq[marker_idx + marker_bidx] = dww;
7058 } else {
7059 orig_chisq[marker_idx + marker_bidx] = 0;
7060 }
7061 }
7062 }
7063 }
7064 if (rec_p < -1) {
7065 wptr = model_assoc_tna(model_fisher, wptr);
7066 } else {
7067 if (!model_fisher) {
7068 wptr = dtoa_g_wxp4(dww, 12, wptr);
7069 wptr = memcpya(wptr, " 1 ", 6);
7070 }
7071 wptr = dtoa_g_wxp4x(MAXV(rec_p, output_min_p), 12, '\n', wptr);
7072 }
7073 if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
7074 goto model_assoc_ret_WRITE_FAIL;
7075 }
7076 }
7077 if (model_perm_best) {
7078 dxx = mult_p;
7079 if (!is_invalid) {
7080 if ((dom_p < dxx) && (dom_p >= 0)) {
7081 dxx = dom_p;
7082 }
7083 if ((rec_p < dxx) && (rec_p >= 0)) {
7084 dxx = rec_p;
7085 }
7086 }
7087 if (model_perms && is_invalid) {
7088 set_bit_ul(marker_idx + marker_bidx, is_invalid_bitfield);
7089 }
7090 if (fill_orig_chisq) {
7091 if (dxx != -9) {
7092 orig_chisq[marker_idx + marker_bidx] = inverse_chiprob(dxx, 1);
7093 } else {
7094 orig_chisq[marker_idx + marker_bidx] = -9;
7095 }
7096 }
7097 } else if (model_modifier & MODEL_PGEN) {
7098 dxx = (gen_p >= 0)? gen_p : -9;
7099 } else if (model_modifier & MODEL_PDOM) {
7100 dxx = (dom_p >= 0)? dom_p : -9;
7101 } else if (model_modifier & MODEL_PREC) {
7102 dxx = (rec_p >= 0)? rec_p : -9;
7103 } else if (model_modifier & MODEL_PTREND) {
7104 dxx = (ca_p >= 0)? ca_p : -9;
7105 }
7106 missp++;
7107 setp++;
7108 hetp++;
7109 *orig_pvals_ptr++ = dxx;
7110 }
7111 }
7112 }
7113 if (model_perms_nst) {
7114 g_block_diff = block_size - g_block_start;
7115 assoc_thread_ct = g_block_diff;
7116 if (assoc_thread_ct > max_thread_ct) {
7117 assoc_thread_ct = max_thread_ct;
7118 }
7119 if (model_maxt_nst) {
7120 if (model_fisherx) {
7121 maxt_cur_extreme_stat = maxt_extreme_stat[0];
7122 for (uii = 1; uii < perm_vec_ct; uii++) {
7123 dxx = maxt_extreme_stat[uii];
7124 if (dxx > maxt_cur_extreme_stat) {
7125 maxt_cur_extreme_stat = dxx;
7126 }
7127 }
7128 } else {
7129 maxt_cur_extreme_stat = maxt_extreme_stat[0];
7130 for (uii = 1; uii < perm_vec_ct; uii++) {
7131 dxx = maxt_extreme_stat[uii];
7132 if (dxx < maxt_cur_extreme_stat) {
7133 maxt_cur_extreme_stat = dxx;
7134 }
7135 }
7136 }
7137 }
7138 if (model_assoc) {
7139 if (min_ploidy_1) {
7140 uqq = 1;
7141 } else {
7142 uqq = 2;
7143 }
7144 for (uii = g_block_start; uii < block_size; uii++) {
7145 if (model_adapt_nst) {
7146 urr = g_adapt_m_table[uii];
7147 } else {
7148 urr = marker_idx + uii;
7149 }
7150 upp = missing_cts[urr];
7151 get_model_assoc_precomp_bounds(upp, 0, &ujj, &ukk);
7152 g_precomp_start[uii] = ujj;
7153 uoo = set_cts[urr];
7154 if (is_x) {
7155 unn = 2 * case_ct;
7156 upp = 2 * pheno_nm_ct - upp;
7157 } else {
7158 unn = uqq * case_ct;
7159 upp = uqq * (pheno_nm_ct - upp);
7160 }
7161 ujj *= uqq;
7162 ukk += uii * precomp_width;
7163 if (model_fisher) {
7164 dxx = orig_pvals[urr];
7165 if (model_adapt_nst) {
7166 for (umm = uii * precomp_width; umm < ukk; umm++) {
7167 fisher22_precomp_pval_bounds(dxx, fisher_midp, unn - ujj, uoo, upp, &(precomp_ui[umm * 4]), nullptr);
7168 ujj += uqq;
7169 }
7170 } else {
7171 for (umm = uii * precomp_width; umm < ukk; umm++) {
7172 fisher22_precomp_pval_bounds(dxx, fisher_midp, unn - ujj, uoo, upp, &(precomp_ui[umm * 6]), nullptr);
7173 fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, fisher_midp, unn - ujj, uoo, upp, uibuf, &(precomp_d[umm * 2]));
7174 precomp_ui[umm * 6 + 4] = uibuf[2];
7175 precomp_ui[umm * 6 + 5] = uibuf[3] - uibuf[2];
7176 ujj += uqq;
7177 }
7178 }
7179 } else {
7180 dxx = orig_chisq[urr];
7181 if (model_adapt_nst) {
7182 for (umm = uii * precomp_width; umm < ukk; umm++) {
7183 chi22_precomp_val_bounds(dxx, unn - ujj, uoo, upp, &(precomp_ui[umm * 4]), nullptr);
7184 ujj += uqq;
7185 }
7186 } else {
7187 for (umm = uii * precomp_width; umm < ukk; umm++) {
7188 chi22_precomp_val_bounds(dxx, unn - ujj, uoo, upp, &(precomp_ui[umm * 6]), nullptr);
7189 chi22_precomp_val_bounds(maxt_cur_extreme_stat, unn - ujj, uoo, upp, uibuf, &(precomp_d[umm * 2]));
7190 precomp_ui[umm * 6 + 4] = uibuf[2];
7191 precomp_ui[umm * 6 + 5] = uibuf[3] - uibuf[2];
7192 ujj += uqq;
7193 }
7194 }
7195 }
7196 }
7197 } else if (model_perm_best) {
7198 for (uii = g_block_start; uii < block_size; uii++) {
7199 if (model_adapt_nst) {
7200 urr = g_adapt_m_table[uii];
7201 } else {
7202 urr = marker_idx + uii;
7203 }
7204 upp = missing_cts[urr];
7205 get_model_assoc_precomp_bounds(upp, 1, &ujj, &ukk);
7206 g_precomp_start[uii] = ujj;
7207 unn = 2 * case_ct;
7208 uqq = 2 * (pheno_nm_ct - upp);
7209 uoo = 2 * homcom_cts[urr] + het_cts[urr];
7210 ukk += uii * precomp_width;
7211 uss = 2 * ujj;
7212 if (model_fisher) {
7213 dxx = orig_pvals[urr];
7214 if (model_adapt_nst) {
7215 for (umm = uii * precomp_width; umm < ukk; umm++) {
7216 fisher22_precomp_pval_bounds(dxx, fisher_midp, unn - uss, uoo, uqq, &(precomp_ui[umm * 12]), nullptr);
7217 uss += 2;
7218 }
7219 } else {
7220 for (umm = uii * precomp_width; umm < ukk; umm++) {
7221 fisher22_precomp_pval_bounds(dxx, fisher_midp, unn - uss, uoo, uqq, &(precomp_ui[umm * 18]), nullptr);
7222 fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, fisher_midp, 2 * case_ct - uss, uoo, uqq, uibuf, &(precomp_d[umm * 6]));
7223 precomp_ui[umm * 18 + 4] = uibuf[2];
7224 precomp_ui[umm * 18 + 5] = uibuf[3] - uibuf[2];
7225 uss += 2;
7226 }
7227 }
7228 if (!IS_SET(is_invalid_bitfield, urr)) {
7229 upp = pheno_nm_ct - upp;
7230 uoo = homcom_cts[urr];
7231 uqq = upp - uoo - het_cts[urr];
7232 ujj = case_ct - ujj;
7233 if (model_adapt_nst) {
7234 for (umm = uii * precomp_width; umm < ukk; umm++) {
7235 fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj, uoo, upp, &(precomp_ui[umm * 12 + 4]), nullptr);
7236 fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj, uqq, upp, &(precomp_ui[umm * 12 + 8]), nullptr);
7237 ujj--;
7238 }
7239 } else {
7240 for (umm = uii * precomp_width; umm < ukk; umm++) {
7241 fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj, uoo, upp, &(precomp_ui[umm * 18 + 6]), nullptr);
7242 fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, fisher_midp, ujj, uoo, upp, uibuf, &(precomp_d[umm * 6 + 2]));
7243 precomp_ui[umm * 18 + 10] = uibuf[2];
7244 precomp_ui[umm * 18 + 11] = uibuf[3] - uibuf[2];
7245 fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj, uqq, upp, &(precomp_ui[umm * 18 + 12]), nullptr);
7246 fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, fisher_midp, ujj, uqq, upp, uibuf, &(precomp_d[umm * 6 + 4]));
7247 precomp_ui[umm * 18 + 16] = uibuf[2];
7248 precomp_ui[umm * 18 + 17] = uibuf[3] - uibuf[2];
7249 ujj--;
7250 }
7251 }
7252 }
7253 } else {
7254 dxx = orig_chisq[urr];
7255 if (model_adapt_nst) {
7256 for (umm = uii * precomp_width; umm < ukk; umm++) {
7257 chi22_precomp_val_bounds(dxx, unn - uss, uoo, uqq, &(precomp_ui[umm * 12]), nullptr);
7258 uss += 2;
7259 }
7260 } else {
7261 for (umm = uii * precomp_width; umm < ukk; umm++) {
7262 chi22_precomp_val_bounds(dxx, unn - uss, uoo, uqq, &(precomp_ui[umm * 18]), nullptr);
7263 chi22_precomp_val_bounds(maxt_cur_extreme_stat, unn - uss, uoo, uqq, uibuf, &(precomp_d[umm * 6]));
7264 precomp_ui[umm * 18 + 4] = uibuf[2];
7265 precomp_ui[umm * 18 + 5] = uibuf[3] - uibuf[2];
7266 uss += 2;
7267 }
7268 }
7269 if (!IS_SET(is_invalid_bitfield, urr)) {
7270 upp = pheno_nm_ct - upp;
7271 uoo = homcom_cts[urr];
7272 uqq = upp - uoo - het_cts[urr];
7273 ujj = case_ct - ujj;
7274 if (model_adapt_nst) {
7275 for (umm = uii * precomp_width; umm < ukk; umm++) {
7276 chi22_precomp_val_bounds(dxx, ujj, uoo, upp, &(precomp_ui[umm * 12 + 4]), nullptr);
7277 chi22_precomp_val_bounds(dxx, ujj, uqq, upp, &(precomp_ui[umm * 12 + 8]), nullptr);
7278 ujj--;
7279 }
7280 } else {
7281 for (umm = uii * precomp_width; umm < ukk; umm++) {
7282 chi22_precomp_val_bounds(dxx, ujj, uoo, upp, &(precomp_ui[umm * 18 + 6]), nullptr);
7283 chi22_precomp_val_bounds(maxt_cur_extreme_stat, ujj, uoo, upp, uibuf, &(precomp_d[umm * 6 + 2]));
7284 precomp_ui[umm * 18 + 10] = uibuf[2];
7285 precomp_ui[umm * 18 + 11] = uibuf[3] - uibuf[2];
7286 chi22_precomp_val_bounds(dxx, ujj, uqq, upp, &(precomp_ui[umm * 18 + 12]), nullptr);
7287 chi22_precomp_val_bounds(maxt_cur_extreme_stat, ujj, uqq, upp, uibuf, &(precomp_d[umm * 6 + 4]));
7288 precomp_ui[umm * 18 + 16] = uibuf[2];
7289 precomp_ui[umm * 18 + 17] = uibuf[3] - uibuf[2];
7290 ujj--;
7291 }
7292 }
7293 }
7294 }
7295 }
7296 } else if (model_modifier & MODEL_PTREND) {
7297 for (uii = g_block_start; uii < block_size; uii++) {
7298 if (model_adapt_nst) {
7299 urr = g_adapt_m_table[uii];
7300 } else {
7301 urr = marker_idx + uii;
7302 }
7303 upp = missing_cts[urr];
7304 get_model_assoc_precomp_bounds(upp, 1, &ujj, &ukk);
7305 g_precomp_start[uii] = ujj;
7306 unn = het_cts[urr];
7307 upp = pheno_nm_ct - upp; // tot_obs
7308 uoo = homcom_cts[urr];
7309 ukk += uii * precomp_width;
7310 ujj = case_ct - ujj;
7311 dxx = orig_chisq[urr];
7312 if (model_adapt_nst) {
7313 for (umm = uii * precomp_width; umm < ukk; umm++) {
7314 ca_trend_precomp_val_bounds(dxx, ujj--, unn, uoo, upp, &(precomp_ui[umm * 4]), nullptr);
7315 }
7316 } else {
7317 for (umm = uii * precomp_width; umm < ukk; umm++) {
7318 ca_trend_precomp_val_bounds(dxx, ujj, unn, uoo, upp, &(precomp_ui[umm * 6]), nullptr);
7319 ca_trend_precomp_val_bounds(maxt_cur_extreme_stat, ujj--, unn, uoo, upp, uibuf, &(precomp_d[umm * 2]));
7320 precomp_ui[umm * 6 + 4] = uibuf[2];
7321 precomp_ui[umm * 6 + 5] = uibuf[3] - uibuf[2];
7322 }
7323 }
7324 }
7325 } else if (model_modifier & (MODEL_PDOM | MODEL_PREC)) {
7326 for (uii = g_block_start; uii < block_size; uii++) {
7327 if (model_adapt_nst) {
7328 urr = g_adapt_m_table[uii];
7329 } else {
7330 urr = marker_idx + uii;
7331 }
7332 upp = missing_cts[urr];
7333 get_model_assoc_precomp_bounds(upp, 1, &ujj, &ukk);
7334 g_precomp_start[uii] = ujj;
7335 upp = pheno_nm_ct - upp; // tot_obs
7336 if (model_modifier & MODEL_PREC) {
7337 uoo = upp - homcom_cts[urr] - het_cts[urr]; // col1_sum
7338 } else {
7339 uoo = homcom_cts[urr];
7340 }
7341 ukk += uii * precomp_width;
7342 ujj = case_ct - ujj;
7343 if (model_fisher) {
7344 dxx = orig_pvals[urr];
7345 if (model_adapt_nst) {
7346 for (umm = uii * precomp_width; umm < ukk; umm++) {
7347 fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj--, uoo, upp, &(precomp_ui[umm * 4]), nullptr);
7348 }
7349 } else {
7350 for (umm = uii * precomp_width; umm < ukk; umm++) {
7351 fisher22_precomp_pval_bounds(dxx, fisher_midp, ujj, uoo, upp, &(precomp_ui[umm * 6]), nullptr);
7352 fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, fisher_midp, ujj--, uoo, upp, uibuf, &(precomp_d[umm * 2]));
7353 precomp_ui[umm * 6 + 4] = uibuf[2];
7354 precomp_ui[umm * 6 + 5] = uibuf[3] - uibuf[2];
7355 }
7356 }
7357 } else {
7358 dxx = orig_chisq[urr];
7359 if (model_adapt_nst) {
7360 for (umm = uii * precomp_width; umm < ukk; umm++) {
7361 chi22_precomp_val_bounds(dxx, ujj--, uoo, upp, &(precomp_ui[umm * 4]), nullptr);
7362 }
7363 } else {
7364 for (umm = uii * precomp_width; umm < ukk; umm++) {
7365 chi22_precomp_val_bounds(dxx, ujj, uoo, upp, &(precomp_ui[umm * 6]), nullptr);
7366 chi22_precomp_val_bounds(maxt_cur_extreme_stat, ujj--, uoo, upp, uibuf, &(precomp_d[umm * 2]));
7367 precomp_ui[umm * 6 + 4] = uibuf[2];
7368 precomp_ui[umm * 6 + 5] = uibuf[3] - uibuf[2];
7369 }
7370 }
7371 }
7372 }
7373 }
7374 is_last_block = (marker_idx + block_size == marker_unstopped_ct);
7375 ulii = 0;
7376 if (model_adapt_nst) {
7377 if (model_assoc) {
7378 if (spawn_threads2(threads, &assoc_adapt_thread, max_thread_ct, is_last_block)) {
7379 goto model_assoc_ret_THREAD_CREATE_FAIL;
7380 }
7381 assoc_adapt_thread((void*)ulii);
7382 } else if (model_modifier & (MODEL_PDOM | MODEL_PREC)) {
7383 if (spawn_threads2(threads, &model_adapt_domrec_thread, max_thread_ct, is_last_block)) {
7384 goto model_assoc_ret_THREAD_CREATE_FAIL;
7385 }
7386 model_adapt_domrec_thread((void*)ulii);
7387 } else if (model_modifier & MODEL_PTREND) {
7388 if (spawn_threads2(threads, &model_adapt_trend_thread, max_thread_ct, is_last_block)) {
7389 goto model_assoc_ret_THREAD_CREATE_FAIL;
7390 }
7391 model_adapt_trend_thread((void*)ulii);
7392 } else if (model_modifier & MODEL_PGEN) {
7393 if (spawn_threads2(threads, &model_adapt_gen_thread, max_thread_ct, is_last_block)) {
7394 goto model_assoc_ret_THREAD_CREATE_FAIL;
7395 }
7396 model_adapt_gen_thread((void*)ulii);
7397 } else {
7398 if (spawn_threads2(threads, &model_adapt_best_thread, max_thread_ct, is_last_block)) {
7399 goto model_assoc_ret_THREAD_CREATE_FAIL;
7400 }
7401 model_adapt_best_thread((void*)ulii);
7402 }
7403 join_threads2(threads, max_thread_ct, is_last_block);
7404 } else {
7405 g_maxt_block_base = marker_idx;
7406 if (model_assoc) {
7407 if (spawn_threads2(threads, &assoc_maxt_thread, max_thread_ct, is_last_block)) {
7408 goto model_assoc_ret_THREAD_CREATE_FAIL;
7409 }
7410 assoc_maxt_thread((void*)ulii);
7411 } else if (model_modifier & (MODEL_PDOM | MODEL_PREC)) {
7412 if (spawn_threads2(threads, &model_maxt_domrec_thread, max_thread_ct, is_last_block)) {
7413 goto model_assoc_ret_THREAD_CREATE_FAIL;
7414 }
7415 model_maxt_domrec_thread((void*)ulii);
7416 } else if (model_modifier & MODEL_PTREND) {
7417 if (spawn_threads2(threads, &model_maxt_trend_thread, max_thread_ct, is_last_block)) {
7418 goto model_assoc_ret_THREAD_CREATE_FAIL;
7419 }
7420 model_maxt_trend_thread((void*)ulii);
7421 } else if (model_modifier & MODEL_PGEN) {
7422 if (spawn_threads2(threads, &model_maxt_gen_thread, max_thread_ct, is_last_block)) {
7423 goto model_assoc_ret_THREAD_CREATE_FAIL;
7424 }
7425 model_maxt_gen_thread((void*)ulii);
7426 } else {
7427 if (spawn_threads2(threads, &model_maxt_best_thread, max_thread_ct, is_last_block)) {
7428 goto model_assoc_ret_THREAD_CREATE_FAIL;
7429 }
7430 model_maxt_best_thread((void*)ulii);
7431 }
7432 join_threads2(threads, max_thread_ct, is_last_block);
7433 ulii = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
7434 if (model_fisherx) {
7435 for (uii = 0; uii < assoc_thread_ct; uii++) {
7436 ooptr = &(g_maxt_thread_results[uii * ulii]);
7437 for (ujj = perms_done - perm_vec_ct; ujj < perms_done; ujj++) {
7438 dxx = *ooptr++;
7439 if (dxx < maxt_extreme_stat[ujj]) {
7440 maxt_extreme_stat[ujj] = dxx;
7441 }
7442 }
7443 }
7444 } else {
7445 for (uii = 0; uii < assoc_thread_ct; uii++) {
7446 ooptr = &(g_maxt_thread_results[uii * ulii]);
7447 for (ujj = perms_done - perm_vec_ct; ujj < perms_done; ujj++) {
7448 dxx = *ooptr++;
7449 if (dxx > maxt_extreme_stat[ujj]) {
7450 maxt_extreme_stat[ujj] = dxx;
7451 }
7452 }
7453 }
7454 }
7455 }
7456 }
7457 marker_idx += block_size;
7458 if ((!perm_pass_idx) && (marker_idx >= loop_end)) {
7459 if (marker_idx < marker_unstopped_ct) {
7460 if (pct >= 10) {
7461 putc_unlocked('\b', stdout);
7462 }
7463 pct = (marker_idx * 100LLU) / marker_unstopped_ct;
7464 printf("\b\b%u%%", pct);
7465 fflush(stdout);
7466 loop_end = (((uint64_t)pct + 1LLU) * marker_unstopped_ct) / 100;
7467 }
7468 }
7469 } while (marker_idx < marker_unstopped_ct);
7470 if (!perm_pass_idx) {
7471 if (pct >= 10) {
7472 putc_unlocked('\b', stdout);
7473 }
7474 fputs("\b\b", stdout);
7475 logprint("done.\n");
7476 if (model_perms_nst) {
7477 bigstack_reset(g_perm_vecs);
7478 }
7479 if (fclose_null(&outfile)) {
7480 goto model_assoc_ret_WRITE_FAIL;
7481 }
7482 if (!is_set_test) {
7483 if (mtest_adjust) {
7484 if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
7485 goto model_assoc_ret_NOMEM;
7486 }
7487 fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
7488 retval = multcomp(outname, outname_end, marker_idx_to_uidx, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, model_fisher? nullptr : orig_chisq, pfilter, output_min_p, mtest_adjust, (!model_assoc) && (!(model_modifier & MODEL_PTREND)), adjust_lambda, nullptr, model_fisher? orig_pvals : nullptr);
7489 if (retval) {
7490 goto model_assoc_ret_1;
7491 }
7492 bigstack_reset(marker_idx_to_uidx);
7493 }
7494 if (mperm_save & MPERM_DUMP_ALL) {
7495 g_textbuf[0] = '0';
7496 wptr = &(g_textbuf[1]);
7497 a1ptr = &(g_textbuf[MAXLINELEN]);
7498 if (model_fisherx) {
7499 for (uii = 0; uii < marker_ct; uii++) {
7500 *wptr++ = ' ';
7501 dxx = orig_pvals[uii];
7502 if (dxx >= 0) {
7503 wptr = dtoa_g(dxx, wptr);
7504 } else {
7505 wptr = memcpya(wptr, "NA", 2);
7506 }
7507 if (wptr >= a1ptr) {
7508 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
7509 goto model_assoc_ret_WRITE_FAIL;
7510 }
7511 wptr = g_textbuf;
7512 }
7513 }
7514 } else {
7515 for (uii = 0; uii < marker_ct; uii++) {
7516 *wptr++ = ' ';
7517 dxx = orig_chisq[uii];
7518 if (dxx >= 0) {
7519 wptr = dtoa_g(dxx, wptr);
7520 } else {
7521 wptr = memcpya(wptr, "NA", 2);
7522 }
7523 if (wptr >= a1ptr) {
7524 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
7525 goto model_assoc_ret_WRITE_FAIL;
7526 }
7527 wptr = g_textbuf;
7528 }
7529 }
7530 }
7531 *wptr++ = '\n';
7532 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
7533 goto model_assoc_ret_WRITE_FAIL;
7534 }
7535 }
7536 } else {
7537 retval = model_assoc_set_test(threads, bedfile, bed_offset, outname, outname_end, outname_end2, model_modifier, model_mperm_val, pfilter, output_min_p, mtest_adjust, unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, sex_male, apip, pheno_nm_ct, pheno_nm, founder_pnm, gender_req, ld_ignore_x, hh_exists, perm_batch_size, sip, loadbuf_raw);
7538 if (retval) {
7539 goto model_assoc_ret_1;
7540 }
7541 }
7542 }
7543 if (model_perms_nst) {
7544 if (mperm_save & MPERM_DUMP_ALL) {
7545 if (perm_pass_idx) {
7546 putc_unlocked(' ', stdout);
7547 }
7548 fputs("[dumping stats]", stdout);
7549 fflush(stdout);
7550 ulii = perm_vec_ct;
7551 ujj = 1 + perms_done - ulii;
7552 wptr = g_textbuf;
7553 a1ptr = &(g_textbuf[MAXLINELEN]);
7554 for (uii = 0; uii < ulii; uii++) {
7555 wptr = uint32toa(uii + ujj, wptr);
7556 orig_pvals_ptr = &(g_mperm_save_all[uii]);
7557 for (ukk = 0; ukk < marker_ct; ukk++) {
7558 *wptr++ = ' ';
7559 dxx = orig_pvals_ptr[ukk * ulii];
7560 if (dxx >= 0) {
7561 wptr = dtoa_g(dxx, wptr);
7562 } else {
7563 wptr = memcpya(wptr, "NA", 2);
7564 }
7565 if (wptr >= a1ptr) {
7566 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
7567 goto model_assoc_ret_WRITE_FAIL;
7568 }
7569 wptr = g_textbuf;
7570 }
7571 }
7572 *wptr++ = '\n';
7573 }
7574 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
7575 goto model_assoc_ret_WRITE_FAIL;
7576 }
7577 fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b ", stdout);
7578 }
7579 bigstack_reset(g_perm_vecs);
7580 if (perms_done < perms_total) {
7581 if (model_adapt_nst) {
7582 marker_unstopped_ct = marker_ct - popcount01_longs((uintptr_t*)perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
7583 if (!marker_unstopped_ct) {
7584 goto model_assoc_adapt_perm_count;
7585 }
7586 }
7587 printf("\r%u permutation%s complete.", perms_done, (perms_done != 1)? "s" : "");
7588 fflush(stdout);
7589 perm_pass_idx++;
7590 goto model_assoc_more_perms;
7591 }
7592 if (model_adapt_nst) {
7593 model_assoc_adapt_perm_count:
7594 perms_done = 0;
7595 for (uii = 0; uii < marker_ct; uii++) {
7596 if (perm_attempt_ct[uii] > perms_done) {
7597 perms_done = perm_attempt_ct[uii];
7598 if (perms_done == perms_total) {
7599 break;
7600 }
7601 }
7602 }
7603 }
7604 putc_unlocked('\r', stdout);
7605 LOGPRINTF("%u %s permutation%s complete.\n", perms_done, model_maxt_nst? "max(T)" : "(adaptive)", (perms_done != 1)? "s" : "");
7606 if (model_fisher && (model_modifier & MODEL_PTREND)) {
7607 outname_end2 -= 7; // remove ".fisher"
7608 }
7609 if (model_adapt_nst) {
7610 memcpy(outname_end2, ".perm", 6);
7611 } else {
7612 if (mperm_save & MPERM_DUMP_BEST) {
7613 if (bigstack_alloc_c(FNAMESIZE, &a1ptr)) {
7614 goto model_assoc_ret_NOMEM;
7615 }
7616 ulii = outname_end - outname;
7617 memcpy(a1ptr, outname, ulii);
7618 memcpy(&(a1ptr[ulii]), ".mperm.dump.best", 17);
7619 LOGPRINTFWW("Dumping best permutation %svalues to %s .\n", model_fisherx? "p-" : "chi-square ", a1ptr);
7620 if (fopen_checked(a1ptr, "w", &outfile)) {
7621 goto model_assoc_ret_OPEN_FAIL;
7622 }
7623 dxx = 0;
7624 if (model_fisherx) {
7625 for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
7626 if ((orig_pvals[marker_idx] != -9) && (orig_pvals[marker_idx] < dxx)) {
7627 dxx = orig_pvals[marker_idx];
7628 }
7629 }
7630 dxx = 1 - dxx;
7631 } else {
7632 for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
7633 if (orig_chisq[marker_idx] > dxx) {
7634 dxx = orig_chisq[marker_idx];
7635 }
7636 }
7637 }
7638 memcpy(g_textbuf, "0 ", 2);
7639 wptr = dtoa_gx(dxx, '\n', &(g_textbuf[2]));
7640 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
7641 goto model_assoc_ret_WRITE_FAIL;
7642 }
7643 for (uii = 0; uii < perms_total; uii++) {
7644 wptr = uint32toa_x(uii + 1, ' ', g_textbuf);
7645 wptr = dtoa_gx(maxt_extreme_stat[uii], '\n', wptr);
7646 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
7647 goto model_assoc_ret_WRITE_FAIL;
7648 }
7649 }
7650 if (fclose_null(&outfile)) {
7651 goto model_assoc_ret_WRITE_FAIL;
7652 }
7653 }
7654 memcpy(outname_end2, ".mperm", 7);
7655 }
7656 if (fopen_checked(outname, "w", &outfile)) {
7657 goto model_assoc_ret_OPEN_FAIL;
7658 }
7659 if (model_adapt_nst) {
7660 sprintf(g_textbuf, " CHR %%%us EMP1 NP \n", plink_maxsnp);
7661 } else {
7662 sprintf(g_textbuf, " CHR %%%us EMP1 EMP2 \n", plink_maxsnp);
7663 #ifdef __cplusplus
7664 std::sort(maxt_extreme_stat, &(maxt_extreme_stat[perms_total]));
7665 #else
7666 qsort(maxt_extreme_stat, perms_total, sizeof(double), double_cmp);
7667 #endif
7668 }
7669 /*
7670 if (model_maxt_nst) {
7671 printf("extreme stats: %g %g\n", maxt_extreme_stat[0], maxt_extreme_stat[perms_total - 1]);
7672 }
7673 */
7674 fprintf(outfile, g_textbuf, "SNP");
7675 chrom_fo_idx = 0xffffffffU;
7676 marker_uidx = next_unset_unsafe(marker_exclude, 0);
7677 marker_idx = 0;
7678 dyy = 1.0 / ((double)((int32_t)perms_total + 1));
7679 dxx = 0.5 * dyy;
7680 while (1) {
7681 while (1) {
7682 do {
7683 chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
7684 } while (marker_uidx >= chrom_end);
7685 uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
7686 is_x = (uii == (uint32_t)x_code);
7687 if (model_assoc || (((!IS_SET(haploid_mask, uii)) && (uii != (uint32_t)mt_code)) || is_x)) {
7688 break;
7689 }
7690 marker_uidx = next_unset_unsafe(marker_exclude, chrom_end);
7691 }
7692 wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
7693 *wptr_start++ = ' ';
7694 wptr_start[plink_maxsnp] = ' ';
7695 for (; marker_uidx < chrom_end;) {
7696 if (model_adapt_nst) {
7697 pval = ((double)(perm_2success_ct[marker_idx] + 2)) / ((double)(2 * (perm_attempt_ct[marker_idx] + 1)));
7698 } else {
7699 pval = ((double)(perm_2success_ct[marker_idx] + 2)) * dxx;
7700 }
7701 if (pval <= pfilter) {
7702 fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
7703 wptr = &(wptr_start[1 + plink_maxsnp]);
7704 if ((!model_assoc) && ((model_adapt_nst && (!perm_attempt_ct[marker_idx])) || ((!model_adapt_nst) && ((model_fisherx && (orig_pvals[marker_idx] == -9)) || ((!model_fisherx) && (orig_chisq[marker_idx] == -9)))))) {
7705 // invalid
7706 wptr = memcpya(wptr, " NA NA", 25);
7707 } else {
7708 if (!model_perm_count) {
7709 wptr = dtoa_g_wxp4x(pval, 12, ' ', wptr);
7710 } else {
7711 wptr = dtoa_g_wxp4x(((double)perm_2success_ct[marker_idx]) * 0.5, 12, ' ', wptr);
7712 }
7713 if (model_adapt_nst) {
7714 wptr = memseta(wptr, 32, 2);
7715 wptr = uint32toa_w10(perm_attempt_ct[marker_idx], wptr);
7716 } else {
7717 if (model_fisherx) {
7718 // minimum p-value
7719 dzz = (int32_t)(doublearr_greater_than(maxt_extreme_stat, perms_total, orig_pvals[marker_idx] * (1.0 + EPSILON)) + 1);
7720 } else {
7721 // maximum chisq
7722 dzz = (int32_t)(perms_total - doublearr_greater_than(maxt_extreme_stat, perms_total, orig_chisq[marker_idx] - EPSILON) + 1);
7723 }
7724 if (!model_perm_count) {
7725 wptr = dtoa_g_wxp4(dzz * dyy, 12, wptr);
7726 } else {
7727 wptr = dtoa_g_wxp4(dzz - 1, 12, wptr);
7728 }
7729 }
7730 }
7731 wptr = memcpya(wptr, " \n", 2);
7732 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
7733 goto model_assoc_ret_WRITE_FAIL;
7734 }
7735 }
7736 if (++marker_idx == marker_ct) {
7737 goto model_assoc_loop_end;
7738 }
7739 marker_uidx++;
7740 next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
7741 }
7742 }
7743 model_assoc_loop_end:
7744 if (fclose_null(&outfile)) {
7745 goto model_assoc_ret_WRITE_FAIL;
7746 }
7747 LOGPRINTFWW("Permutation test report written to %s .\n", outname);
7748 }
7749
7750 while (0) {
7751 model_assoc_ret_NOMEM:
7752 retval = RET_NOMEM;
7753 break;
7754 model_assoc_ret_OPEN_FAIL:
7755 retval = RET_OPEN_FAIL;
7756 break;
7757 model_assoc_ret_READ_FAIL:
7758 retval = RET_READ_FAIL;
7759 break;
7760 model_assoc_ret_WRITE_FAIL:
7761 retval = RET_WRITE_FAIL;
7762 break;
7763 model_assoc_ret_INVALID_CMDLINE:
7764 retval = RET_INVALID_CMDLINE;
7765 break;
7766 model_assoc_ret_THREAD_CREATE_FAIL:
7767 retval = RET_THREAD_CREATE_FAIL;
7768 break;
7769 }
7770 model_assoc_ret_1:
7771 bigstack_reset(bigstack_mark);
7772 fclose_cond(outfile);
7773 fclose_cond(outfile_msa);
7774 return retval;
7775 }
7776
int32_t qassoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, uintptr_t* marker_exclude_mid, uintptr_t marker_ct_mid, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sex_male, Aperm_info* apip, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* founder_pnm, uintptr_t* sample_include2, uintptr_t* sample_male_include2, uint32_t ld_ignore_x, uint32_t hh_exists, uint32_t hh_or_mt_exists, uint32_t perm_batch_size, Set_info* sip, uint32_t* tcnt, uintptr_t* loadbuf_raw) {
  // Set-based permutation test for QT --assoc; similar to
  // glm_linear_assoc_set_test().
  // Side effect: t-statistics in g_orig_chisq[] are clobbered and replaced
  // with same-p-value 1df chi-square statistics.
  unsigned char* bigstack_mark = g_bigstack_base;
  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uintptr_t* marker_exclude = marker_exclude_mid;
  uintptr_t* unstopped_markers = nullptr;
  uintptr_t* loadbuf = g_loadbuf;
  uintptr_t* perm_adapt_set_unstopped = nullptr;
  uintptr_t* regression_skip = nullptr;
  double* orig_stats = g_orig_chisq; // initially contains t-statistics
  double* sorted_chisq_buf = nullptr;
  uint32_t* marker_idx_to_uidx = nullptr;
  uint32_t* sorted_marker_idx_buf = nullptr;
  uint32_t* proxy_arr = nullptr;
  uint32_t* perm_2success_ct = nullptr;
  uint32_t* perm_attempt_ct = nullptr;
  uintptr_t marker_ct = marker_ct_mid;
  uintptr_t set_ct = 0;
  uintptr_t final_mask = get_final_mask(pheno_nm_ct);
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
  double adaptive_ci_zt = 0.0;
  uint32_t max_thread_ct = g_thread_ct;
  uint32_t perm_count = model_modifier & MODEL_PERM_COUNT;
  uint32_t perms_done = 0;
  int32_t retval = 0;
  unsigned char* bigstack_mark2;
  uintptr_t* set_incl;
  uintptr_t* loadbuf_ptr;
  double* orig_set_scores;
  double* chisq_pmajor;
  double* read_dptr;
  double* write_dptr;
  uint32_t** setdefs;
  uint32_t** ld_map;
  uintptr_t marker_ctl;
  uintptr_t marker_midx;
  uintptr_t set_idx;
  uintptr_t perm_vec_ct;
  uintptr_t perm_vec_ctcl8m;
  uintptr_t pidx;
  uintptr_t ulii;
  double chisq_threshold;
  double dxx;
  double dyy;
  uint32_t perms_total;
  uint32_t max_sigset_size;
  uint32_t marker_unstopped_ct;
  uint32_t is_last_block;
  uint32_t chrom_fo_idx;
  uint32_t chrom_end;
  uint32_t block_size;
  uint32_t block_end;
  uint32_t first_adapt_check;
  uint32_t marker_uidx;
  uint32_t marker_idx;
  uint32_t marker_idx2;
  uint32_t marker_bidx;
  uint32_t skip_ct;
  uint32_t uii;
  // Genomic-control-style deflation factor for the converted chi-square
  // statistics; only applied when lambda > 1.
  if (sip->set_test_lambda > 1.0) {
    dxx = 1.0 / sip->set_test_lambda;
  } else {
    dxx = 1.0;
  }
  // Convert each t-statistic into the 1df chi-square statistic with the same
  // p-value, so downstream set-test code can treat everything uniformly.
  for (marker_midx = 0; marker_midx < marker_ct; marker_midx++) {
    dyy = calc_tprob(orig_stats[marker_midx], tcnt[marker_midx]);
    if (dyy == 0.0) {
      // bugfix: the capped statistic must be stored in orig_stats[]; the old
      // code assigned it to the dead local dyy, leaving a raw t-statistic in
      // place to be misinterpreted as a chi-square value downstream.
      orig_stats[marker_midx] = MAX_INVERSE_CHIPROB_1DF * dxx;
    } else {
      orig_stats[marker_midx] = inverse_chiprob(dyy, 1) * dxx;
    }
  }
  retval = set_test_common_init(threads, bedfile, bed_offset, outname, outname_end, unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_ids, max_marker_id_len, marker_reverse, orig_stats, sip, chrom_info_ptr, unfiltered_sample_ct, sex_male, founder_pnm, ld_ignore_x, hh_exists, "QT --assoc", &marker_ct, &marker_exclude, &set_incl, &marker_idx_to_uidx, &setdefs, &set_ct, &max_sigset_size, &ld_map, &chisq_threshold, &orig_set_scores, &sorted_chisq_buf, &sorted_marker_idx_buf, &proxy_arr, &perm_adapt_set_unstopped, &perm_2success_ct, &perm_attempt_ct, &unstopped_markers);
  if (retval) {
    goto qassoc_set_test_ret_1;
  }
  if (!set_ct) {
    goto qassoc_set_test_write;
  }
  marker_ctl = BITCT_TO_WORDCT(marker_ct);
  if (marker_ct_mid != marker_ct) {
    // set_test_common_init() dropped more markers; collapse the per-marker
    // arrays to match the new marker_exclude.
    inplace_delta_collapse_arr((char*)tcnt, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
    inplace_delta_collapse_arr((char*)g_missing_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
    inplace_delta_collapse_arr((char*)g_het_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
    inplace_delta_collapse_arr((char*)g_homcom_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
  }
  // bugfix: restore '&regression_skip' (was garbled into an HTML &reg;
  // entity in this copy, which does not compile).
  if (bigstack_calloc_ul(marker_ctl, &regression_skip)) {
    goto qassoc_set_test_ret_NOMEM;
  }
  for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
    // nanal
    uii = tcnt[marker_idx] + 2;
    if ((uii == 2) || (g_homcom_cts[marker_idx] == uii) || (g_het_cts[marker_idx] == uii) || (g_het_cts[marker_idx] + g_homcom_cts[marker_idx] == 0)) {
      // 0 df or no genotype variation, regression always fails
      SET_BIT(marker_idx, regression_skip);
    }
  }
  if (model_modifier & MODEL_PERM) {
    // adaptive permutation
    perms_total = apip->max;
    first_adapt_check = (apip->min < apip->init_interval)? ((int32_t)apip->init_interval) : apip->min;
    adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)set_ct)));
  } else {
    // max(T): fixed permutation count, no early stopping
    perms_total = model_mperm_val;
    first_adapt_check = perms_total + 1;
  }
  for (uii = 0; uii < set_ct; uii++) {
    perm_attempt_ct[uii] = perms_total;
  }
  if (max_thread_ct > perms_total) {
    max_thread_ct = perms_total;
  }
  if (bigstack_init_sfmtp(max_thread_ct)) {
    goto qassoc_set_test_ret_NOMEM;
  }

  bigstack_mark2 = g_bigstack_base;
 qassoc_set_test_more_perms:
  // Markers whose regression always fails are handled by pre-filling -9;
  // exclude them from the per-pass work list.
  bitvec_and(unstopped_markers, marker_ctl, regression_skip);
  bitvec_andnot(regression_skip, marker_ctl, unstopped_markers);
  skip_ct = popcount_longs(regression_skip, marker_ctl);
  marker_unstopped_ct = popcount_longs(unstopped_markers, marker_ctl);

  if (perms_done) {
    uii = apip->init_interval;
    while (first_adapt_check <= perms_done) {
      first_adapt_check += (int32_t)(uii + ((int32_t)first_adapt_check) * apip->interval_slope);
    }
  }
  perm_vec_ct = perm_batch_size;
  // possible todo: split first batch to reduce adaptive overshoot
  if (perm_vec_ct > perms_total - perms_done) {
    perm_vec_ct = perms_total - perms_done;
  }
  g_perm_vec_ct = perm_vec_ct;
  if (perm_vec_ct >= CACHELINE_INT32 * max_thread_ct) {
    g_perm_generation_thread_ct = max_thread_ct;
  } else {
    g_perm_generation_thread_ct = MAXV(perm_vec_ct / CACHELINE_INT32, 1);
  }
  perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
  if (bigstack_alloc_d(perm_vec_ctcl8m * pheno_nm_ct, &g_perm_vecstd) ||
      bigstack_calloc_d(perm_vec_ctcl8m * 3 * max_thread_ct, &g_thread_git_qbufs)) {
    goto qassoc_set_test_ret_NOMEM;
  }

  // Generate the permuted phenotype vectors; the main thread doubles as
  // worker 0 (tid passed via the ulii cast).
  ulii = 0;
  if (!g_perm_cluster_ct) {
    if (spawn_threads(threads, &generate_qt_perms_smajor_thread, g_perm_generation_thread_ct)) {
      goto qassoc_set_test_ret_THREAD_CREATE_FAIL;
    }
    generate_qt_perms_smajor_thread((void*)ulii);
  } else {
    // within-cluster permutation
    if (spawn_threads(threads, &generate_qt_cluster_perms_smajor_thread, g_perm_generation_thread_ct)) {
      goto qassoc_set_test_ret_THREAD_CREATE_FAIL;
    }
    generate_qt_cluster_perms_smajor_thread((void*)ulii);
  }
  join_threads(threads, g_perm_generation_thread_ct);
  if (bigstack_alloc_d(MODEL_BLOCKSIZE * perm_vec_ct, &g_mperm_save_all) ||
      bigstack_alloc_d(marker_ct * perm_vec_ct, &chisq_pmajor)) {
    goto qassoc_set_test_ret_NOMEM;
  }
  // Pre-fill always-failing markers with the -9 sentinel in every
  // permutation row.
  for (pidx = 0; pidx < perm_vec_ct; pidx++) {
    write_dptr = &(chisq_pmajor[pidx * marker_ct]);
    for (marker_idx = 0, marker_idx2 = 0; marker_idx < skip_ct; marker_idx++, marker_idx2++) {
      next_set_unsafe_ck(regression_skip, &marker_idx2);
      write_dptr[marker_idx2] = -9;
    }
  }
  chrom_fo_idx = 0xffffffffU;
  marker_uidx = next_unset_unsafe(marker_exclude, 0);
  if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
    goto qassoc_set_test_ret_READ_FAIL;
  }
  marker_idx = 0;
  marker_idx2 = 0;
  chrom_end = 0;
  do {
    if (marker_uidx >= chrom_end) {
      // exploit overflow
      chrom_fo_idx++;
      refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &g_is_x, &g_is_y, &uii, &g_min_ploidy_1);
      g_min_ploidy_1 |= uii; // treat MT as haploid
    }
    block_size = 0;
    block_end = marker_unstopped_ct - marker_idx;
    if (block_end > MODEL_BLOCKSIZE) {
      block_end = MODEL_BLOCKSIZE;
    }
    // Load up to MODEL_BLOCKSIZE unstopped markers from the current
    // chromosome into g_loadbuf.
    do {
      if (!IS_SET(unstopped_markers, marker_idx2)) {
	do {
	  marker_uidx++;
	  next_unset_unsafe_ck(marker_exclude, &marker_uidx);
	  marker_idx2++;
	} while ((marker_uidx < chrom_end) && (!IS_SET(unstopped_markers, marker_idx2)));
	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
	  goto qassoc_set_test_ret_READ_FAIL;
	}
	if (marker_uidx >= chrom_end) {
	  break;
	}
      }
      loadbuf_ptr = &(loadbuf[block_size * pheno_nm_ctv2]);
      if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
	goto qassoc_set_test_ret_READ_FAIL;
      }
      if (g_min_ploidy_1 && hh_or_mt_exists) {
	haploid_fix(hh_or_mt_exists, sample_include2, sample_male_include2, pheno_nm_ct, g_is_x, g_is_y, (unsigned char*)loadbuf_ptr);
      }
      g_adapt_m_table[block_size] = marker_idx2++;
      block_size++;
      if (marker_idx + block_size == marker_unstopped_ct) {
	break;
      }
      marker_uidx++;
      if (IS_SET(marker_exclude, marker_uidx)) {
	marker_uidx = next_unset_unsafe(marker_exclude, marker_uidx);
	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
	  goto qassoc_set_test_ret_READ_FAIL;
	}
      }
    } while ((block_size < block_end) && (marker_uidx < chrom_end));
    if (!block_size) {
      continue;
    }
    is_last_block = (marker_idx + block_size >= marker_unstopped_ct);
    g_block_diff = block_size;
    ulii = 0;
    if (spawn_threads2(threads, &qassoc_set_thread, max_thread_ct, is_last_block)) {
      goto qassoc_set_test_ret_THREAD_CREATE_FAIL;
    }
    qassoc_set_thread((void*)ulii);
    join_threads2(threads, max_thread_ct, is_last_block);

    // convert to equivalent chi-square stats and transpose
    // (conversion has to be done here since dcdflib is not thread-safe)
    read_dptr = g_mperm_save_all;
    for (marker_bidx = 0; marker_bidx < block_size; marker_bidx++) {
      uii = g_adapt_m_table[marker_bidx];
      write_dptr = &(chisq_pmajor[uii]);
      uii = tcnt[uii];
      dyy = inverse_tprob(sip->set_p, uii);
      for (pidx = 0; pidx < perm_vec_ct; pidx++) {
	dxx = *read_dptr++;
	if (dxx < dyy) {
	  // below the set-test significance threshold; sentinel
	  dxx = -9;
	} else {
	  dxx = calc_tprob(dxx, uii);
	  if (dxx == 0.0) {
	    dxx = MAX_INVERSE_CHIPROB_1DF;
	  } else {
	    dxx = inverse_chiprob(dxx, 1);
	  }
	}
	// this is cache-unfriendly, may want to update in-place instead and
	// separate out the transpose
	write_dptr[pidx * marker_ct] = dxx;
      }
    }
    marker_idx += block_size;
  } while (marker_idx < marker_unstopped_ct);
  perms_done += perm_vec_ct;
  compute_set_scores(marker_ct, perm_vec_ct, set_ct, chisq_pmajor, orig_set_scores, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr, setdefs, ld_map, apip, chisq_threshold, adaptive_ci_zt, first_adapt_check, perms_done, sip->set_max, perm_adapt_set_unstopped, perm_2success_ct, perm_attempt_ct);
  bigstack_reset(bigstack_mark2);
  if (perms_done < perms_total) {
    if (model_modifier & MODEL_PERM) {
      // Adaptive mode: stop early once every set has been resolved.
      if (!extract_set_union(setdefs, set_ct, perm_adapt_set_unstopped, unstopped_markers, marker_ct)) {
	perms_done = 0;
	for (set_idx = 0; set_idx < set_ct; set_idx++) {
	  if (perms_done < perm_attempt_ct[set_idx]) {
	    perms_done = perm_attempt_ct[set_idx];
	  }
	}
	goto qassoc_set_test_perms_done;
      }
    }
    printf("\r%u permutation%s complete.", perms_done, (perms_done != 1)? "s" : "");
    fflush(stdout);
    goto qassoc_set_test_more_perms;
  }
 qassoc_set_test_perms_done:
  putc_unlocked('\r', stdout);
  LOGPRINTF("%u permutation%s complete.\n", perms_done, (perms_done != 1)? "s" : "");
 qassoc_set_test_write:
  if (model_modifier & MODEL_PERM) {
    memcpy(outname_end, ".qassoc.set.perm", 17);
  } else {
    memcpy(outname_end, ".qassoc.set.mperm", 18);
  }
  retval = write_set_test_results(outname, &(outname_end[11]), sip, ld_map, setdefs, set_incl, set_ct, marker_ct_orig, marker_ct, marker_idx_to_uidx, marker_ids, max_marker_id_len, perm_2success_ct, perm_attempt_ct, mtest_adjust, perm_count, pfilter, output_min_p, chisq_threshold, orig_stats, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr);
  while (0) {
  qassoc_set_test_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  qassoc_set_test_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  qassoc_set_test_ret_THREAD_CREATE_FAIL:
    retval = RET_THREAD_CREATE_FAIL;
    break;
  }
 qassoc_set_test_ret_1:
  bigstack_reset(bigstack_mark);
  return retval;
}
8085
qassoc(pthread_t * threads,FILE * bedfile,uintptr_t bed_offset,char * outname,char * outname_end,uint32_t model_modifier,uint32_t model_mperm_val,double pfilter,double output_min_p,uint32_t mtest_adjust,double adjust_lambda,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude_orig,uintptr_t marker_ct_orig,char * marker_ids,uintptr_t max_marker_id_len,uint32_t plink_maxsnp,uint32_t * marker_pos,char ** marker_allele_ptrs,uintptr_t * marker_reverse,Chrom_info * chrom_info_ptr,uintptr_t unfiltered_sample_ct,uint32_t cluster_ct,uint32_t * cluster_map,uint32_t * cluster_starts,Aperm_info * apip,uint32_t mperm_save,uint32_t pheno_nm_ct,uintptr_t * pheno_nm,double * pheno_d,uintptr_t * founder_info,uintptr_t * sex_male,uint32_t hh_exists,uint32_t ld_ignore_x,uint32_t perm_batch_size,Set_info * sip)8086 int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, Aperm_info* apip, uint32_t mperm_save, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, double* pheno_d, uintptr_t* founder_info, uintptr_t* sex_male, uint32_t hh_exists, uint32_t ld_ignore_x, uint32_t perm_batch_size, Set_info* sip) {
8087 unsigned char* bigstack_mark = g_bigstack_base;
8088 uintptr_t marker_ct = marker_ct_orig;
8089 uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
8090 uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
8091 uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
8092 uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
8093 uintptr_t final_mask = get_final_mask(pheno_nm_ct);
8094 uintptr_t perm_vec_ctcl8m = 0;
8095 FILE* outfile = nullptr;
8096 FILE* outfile_qtm = nullptr;
8097 FILE* outfile_msa = nullptr;
8098 uint32_t is_set_test = model_modifier & MODEL_SET_TEST;
8099 uint32_t perm_adapt_nst = (model_modifier & MODEL_PERM) && (!is_set_test);
8100 uint32_t perm_maxt_nst = (model_modifier & MODEL_MPERM) && (!is_set_test);
8101 uint32_t do_perms = model_modifier & (MODEL_PERM | MODEL_MPERM);
8102 uint32_t do_perms_nst = do_perms && (!is_set_test);
8103 uint32_t qt_means = model_modifier & MODEL_QT_MEANS;
8104 uint32_t do_lin = model_modifier & MODEL_LIN;
8105 uint32_t qt_means_or_lin = qt_means || do_lin;
8106 uint32_t perm_count = model_modifier & MODEL_PERM_COUNT;
8107 uint32_t fill_orig_chiabs = do_perms || mtest_adjust;
8108 uint32_t perms_total = 0;
8109 uint32_t pct = 0;
8110 uint32_t max_thread_ct = g_thread_ct;
8111 uint32_t perm_pass_idx = 0;
8112 uint32_t mt_exists = (chrom_info_ptr->xymt_codes[MT_OFFSET] != -2) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->xymt_codes[MT_OFFSET]);
8113 uint32_t hh_or_mt_exists = hh_exists | (mt_exists * NXMHH_EXISTS);
8114 int32_t retval = 0;
8115 double x11 = 0;
8116 double x12 = 0;
8117 double x22 = 0;
8118 uintptr_t* marker_exclude = marker_exclude_orig;
8119 uintptr_t* founder_pnm = nullptr;
8120 uintptr_t* sample_male_include2 = nullptr;
8121 uint32_t* tcnt = nullptr;
8122 char* chrom_name_ptr = nullptr;
8123 uint32_t chrom_name_len = 0;
8124 char chrom_name_buf[5];
8125 uint32_t mu_table[MODEL_BLOCKSIZE];
8126 char numbuf[16]; // ' -1.23456e-200\0' fits, barely
8127 char spacebuf[8];
8128 char* outname_end2;
8129 char* wptr_start;
8130 char* wptr;
8131 char* wptr_restart;
8132 uintptr_t* loadbuf_raw;
8133 uintptr_t* loadbuf_ptr;
8134 uintptr_t* lbptr2;
8135 uintptr_t* sample_include2;
8136 double* ooptr;
8137 double* dptr;
8138 double* dptr2;
8139 double* dptr3;
8140 uint32_t* marker_idx_to_uidx;
8141 uint32_t marker_unstopped_ct;
8142 uint32_t chrom_fo_idx;
8143 uint32_t chrom_end;
8144 uint32_t block_size;
8145 uint32_t block_end;
8146 uint32_t marker_bidx;
8147 uintptr_t marker_uidx; // loading
8148 uintptr_t marker_uidx2; // writing
8149 uintptr_t marker_idx;
8150 uintptr_t marker_idx2;
8151 uintptr_t sample_uidx;
8152 uintptr_t sample_uidx_stop;
8153 uintptr_t sample_idx;
8154 uintptr_t ulii;
8155 intptr_t geno_sum;
8156 intptr_t nanal;
8157 intptr_t geno_ssq;
8158 double nanal_recip;
8159 double qt_sum;
8160 double qt_ssq;
8161 double qt_g_prod;
8162 double qt_g_prod_centered;
8163 double qt_mean;
8164 double geno_mean;
8165 double qt_var;
8166 double geno_var;
8167 double qt_g_covar;
8168 double beta;
8169 double vbeta_sqrt;
8170 double tstat;
8171 double tp;
8172 double rsq;
8173 double qt_het_sum;
8174 double qt_het_ssq;
8175 double qt_homrar_sum;
8176 double qt_homrar_ssq;
8177 double qt_homcom_sum;
8178 double dxx;
8179 double dyy;
8180 double dzz;
8181 double pval;
8182 uint32_t homrar_ct;
8183 uint32_t missing_ct;
8184 uint32_t het_ct;
8185 uint32_t homcom_ct;
8186 uint32_t is_last_block;
8187 uint32_t loop_end;
8188 uint32_t uii;
8189 uint32_t ujj;
8190 uint32_t ukk;
8191 char* a1ptr;
8192 char* a2ptr;
8193 if (pheno_nm_ct < 2) {
8194 logerrprint("Warning: Skipping QT --assoc since less than two phenotypes are present.\n");
8195 goto qassoc_ret_1;
8196 }
8197 if (is_set_test) {
8198 if (bigstack_alloc_ul(unfiltered_sample_ctl, &founder_pnm)) {
8199 goto qassoc_ret_NOMEM;
8200 }
8201 memcpy(founder_pnm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
8202 bitvec_and(founder_info, unfiltered_sample_ctl, founder_pnm);
8203 if (extract_set_union_unfiltered(sip, nullptr, unfiltered_marker_ct, marker_exclude_orig, &marker_exclude, &marker_ct)) {
8204 goto qassoc_ret_NOMEM;
8205 }
8206 }
8207 memset(spacebuf, 32, 8);
8208 g_perm_pheno_nm_ct = pheno_nm_ct;
8209 g_perms_done = 0;
8210 g_mperm_save_all = nullptr;
8211 numbuf[0] = ' ';
8212 if (perm_maxt_nst) {
8213 perms_total = model_mperm_val;
8214 // square of t-stat
8215 if (bigstack_calloc_d(perms_total, &g_maxt_extreme_stat)) {
8216 goto qassoc_ret_NOMEM;
8217 }
8218 g_ldrefs = (uint16_t*)bigstack_alloc(marker_ct * sizeof(int16_t));
8219 if (!g_ldrefs) {
8220 goto qassoc_ret_NOMEM;
8221 }
8222 #ifdef __LP64__
8223 fill_ulong_one((marker_ct + 3) / 4, (uintptr_t*)g_ldrefs);
8224 #else
8225 fill_ulong_one((marker_ct + 1) / 2, (uintptr_t*)g_ldrefs);
8226 #endif
8227 if (mperm_save & MPERM_DUMP_ALL) {
8228 memcpy(outname_end, ".mperm.dump.all", 16);
8229 if (fopen_checked(outname, "w", &outfile_msa)) {
8230 goto qassoc_ret_OPEN_FAIL;
8231 }
8232 if (putc_checked('0', outfile_msa)) {
8233 goto qassoc_ret_WRITE_FAIL;
8234 }
8235 LOGPRINTFWW("Dumping all permutation squared %sstats to %s .\n", do_lin? "Lin " : "Wald t-", outname);
8236 }
8237 } else {
8238 mperm_save = 0;
8239 if (perm_adapt_nst) {
8240 g_aperm_alpha = apip->alpha;
8241 perms_total = apip->max;
8242 if (bigstack_alloc_ui(marker_ct, &g_perm_attempt_ct) ||
8243 bigstack_calloc_uc(round_up_pow2(marker_ct, BYTECT), &g_perm_adapt_stop)) {
8244 goto qassoc_ret_NOMEM;
8245 }
8246 ujj = apip->max;
8247 for (uii = 0; uii < marker_ct; uii++) {
8248 g_perm_attempt_ct[uii] = ujj;
8249 }
8250 g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
8251 if (apip->min < apip->init_interval) {
8252 g_first_adapt_check = (int32_t)(apip->init_interval);
8253 } else {
8254 g_first_adapt_check = apip->min;
8255 }
8256 g_adaptive_intercept = apip->init_interval;
8257 g_adaptive_slope = apip->interval_slope;
8258 }
8259 }
8260 outname_end2 = memcpyb(outname_end, ".qassoc", 8);
8261 if (bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf_raw)) {
8262 goto qassoc_ret_NOMEM;
8263 }
8264 loadbuf_raw[unfiltered_sample_ctv2 - 2] = 0;
8265 loadbuf_raw[unfiltered_sample_ctv2 - 1] = 0;
8266 if (fill_orig_chiabs) {
8267 if (bigstack_alloc_d(marker_ct, &g_orig_chisq)) {
8268 goto qassoc_ret_NOMEM;
8269 }
8270 if (mtest_adjust || is_set_test) {
8271 if (bigstack_alloc_ui(marker_ct, &tcnt)) {
8272 goto qassoc_ret_NOMEM;
8273 }
8274 }
8275 }
8276 if (fopen_checked(outname, "w", &outfile)) {
8277 goto qassoc_ret_OPEN_FAIL;
8278 }
8279 if (qt_means) {
8280 memcpy(outname_end2, ".means", 7);
8281 if (fopen_checked(outname, "w", &outfile_qtm)) {
8282 goto qassoc_ret_OPEN_FAIL;
8283 }
8284 sprintf(g_textbuf, " CHR %%%us VALUE G11 G12 G22\n", plink_maxsnp);
8285 fprintf(outfile_qtm, g_textbuf, "SNP");
8286 *outname_end2 = '\0';
8287 }
8288 if (haploid_chrom_present(chrom_info_ptr) || mt_exists) {
8289 logerrprint("Warning: QT --assoc doesn't handle X/Y/MT/haploid variants normally (try\n--linear).\n");
8290 }
8291 LOGPRINTFWW5("Writing QT --assoc report to %s ... ", outname);
8292 fflush(stdout);
8293 sprintf(g_textbuf, " CHR %%%us BP NMISS BETA SE R2 T P ", plink_maxsnp);
8294 fprintf(outfile, g_textbuf, "SNP");
8295 if (do_lin) {
8296 fputs(" LIN LIN_P ", outfile);
8297 }
8298 if (putc_checked('\n', outfile)) {
8299 goto qassoc_ret_WRITE_FAIL;
8300 }
8301 if (do_perms) {
8302 if (model_modifier & MODEL_PERM) {
8303 if (perm_batch_size > apip->max) {
8304 perm_batch_size = apip->max;
8305 }
8306 } else {
8307 if (perm_batch_size > model_mperm_val) {
8308 perm_batch_size = model_mperm_val;
8309 }
8310 }
8311 uii = MINV(perm_batch_size, perms_total) / CACHELINE_DBL;
8312 if (max_thread_ct > uii) {
8313 max_thread_ct = MAXV(uii, 1);
8314 }
8315 if (cluster_starts) {
8316 retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, nullptr, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, nullptr, nullptr);
8317 if (retval) {
8318 goto qassoc_ret_1;
8319 }
8320 if (!g_perm_cluster_ct) {
8321 logerrprint("Error: No size 2+ clusters for permutation test.\n");
8322 goto qassoc_ret_INVALID_CMDLINE;
8323 }
8324 if (bigstack_alloc_ui(pheno_nm_ct, &g_perm_sample_to_cluster) ||
8325 bigstack_alloc_ui(max_thread_ct * round_up_pow2(g_perm_cluster_ct, CACHELINE_INT32), &g_perm_qt_cluster_thread_wkspace)) {
8326 goto qassoc_ret_NOMEM;
8327 }
8328 fill_unfiltered_sample_to_cluster(pheno_nm_ct, g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, g_perm_sample_to_cluster);
8329 }
8330 if (bigstack_alloc_ui(marker_ct, &g_missing_cts) ||
8331 bigstack_alloc_ui(marker_ct, &g_het_cts) ||
8332 bigstack_alloc_ui(marker_ct, &g_homcom_cts)) {
8333 goto qassoc_ret_NOMEM;
8334 }
8335 if (!is_set_test) {
8336 if (bigstack_init_sfmtp(max_thread_ct)) {
8337 goto qassoc_ret_NOMEM;
8338 }
8339 if (bigstack_calloc_ui(marker_ct, &g_perm_2success_ct)) {
8340 goto qassoc_ret_NOMEM;
8341 }
8342 }
8343 }
8344 if (do_lin) {
8345 if (bigstack_alloc_d(marker_ct, &g_orig_linsq)) {
8346 goto qassoc_ret_NOMEM;
8347 }
8348 }
8349 if (bigstack_alloc_ul(MODEL_BLOCKSIZE * pheno_nm_ctv2, &g_loadbuf) ||
8350 bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx) ||
8351 bigstack_alloc_ul(pheno_nm_ctv2, &sample_include2)) {
8352 goto qassoc_ret_NOMEM;
8353 }
8354 fill_quatervec_55(pheno_nm_ct, sample_include2);
8355 if (alloc_collapsed_haploid_filters(pheno_nm, sex_male, unfiltered_sample_ct, pheno_nm_ct, hh_or_mt_exists, 1, &sample_include2, &sample_male_include2)) {
8356 goto qassoc_ret_NOMEM;
8357 }
8358 marker_unstopped_ct = marker_ct;
8359 if (bigstack_alloc_d(pheno_nm_ct, &g_perm_pheno_d2)) {
8360 goto qassoc_ret_NOMEM;
8361 }
8362 g_pheno_sum = 0;
8363 g_pheno_ssq = 0;
8364 sample_uidx = 0;
8365 sample_idx = 0;
8366 dptr = g_perm_pheno_d2;
8367 do {
8368 sample_uidx = next_set_ul_unsafe(pheno_nm, sample_uidx);
8369 sample_uidx_stop = next_unset_ul(pheno_nm, sample_uidx, unfiltered_sample_ct);
8370 sample_idx += sample_uidx_stop - sample_uidx;
8371 dptr2 = &(pheno_d[sample_uidx]);
8372 sample_uidx = sample_uidx_stop;
8373 dptr3 = &(pheno_d[sample_uidx_stop]);
8374 do {
8375 dxx = *dptr2++;
8376 *dptr++ = dxx;
8377 g_pheno_sum += dxx;
8378 g_pheno_ssq += dxx * dxx;
8379 } while (dptr2 < dptr3);
8380 } while (sample_idx < pheno_nm_ct);
8381 fputs("0%", stdout);
8382 fflush(stdout);
8383
8384 // ----- begin main loop -----
8385 qassoc_more_perms:
8386 if (do_perms_nst) {
8387 if (perm_adapt_nst && perm_pass_idx) {
8388 while (g_first_adapt_check <= g_perms_done) {
8389 // APERM_MAX prevents infinite loop here
8390 g_first_adapt_check += (int32_t)(apip->init_interval + ((int32_t)g_first_adapt_check) * apip->interval_slope);
8391 }
8392 }
8393 // g_perm_vec_ct memory allocation dependencies:
8394 // g_maxt_thread_results: (8 * perm_vec_ct, CL-aligned) * thread_ct
8395 // g_perm_vecstd: (8 * perm_vec_ct, CL-aligned) * pheno_nm_ct
8396 // g_mperm_save_all (if needed): marker_ct * 8 * perm_vec_ct
8397 // adaptive, Wald:
8398 // g_thread_git_qbufs: (8 * perm_vec_ct, CL-aligned) * 3 * thread_ct
8399 // adaptive, Lin:
8400 // g_thread_git_qbufs: (8 * perm_vec_ct, CL-aligned) * 6 * thread_ct
8401 // max(T), Wald:
8402 // g_qresultbuf: MODEL_BLOCKSIZE * (8 * perm_vec_ct, CL-aligned) * 3
8403 // max(T), Lin:
8404 // g_qresultbuf: MODEL_BLOCKSIZE * (8 * perm_vec_ct, CL-aligned) * 6
8405 g_perm_vec_ct = perm_batch_size;
8406 if (g_perm_vec_ct > perms_total - g_perms_done) {
8407 g_perm_vec_ct = perms_total - g_perms_done;
8408 }
8409 perm_vec_ctcl8m = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
8410 if (bigstack_alloc_d(perm_vec_ctcl8m * pheno_nm_ct, &g_perm_vecstd)) {
8411 goto qassoc_ret_NOMEM;
8412 }
8413 ulii = do_lin? 6 : 3;
8414 if (perm_maxt_nst) {
8415 if (bigstack_alloc_d(max_thread_ct * perm_vec_ctcl8m, &g_maxt_thread_results) ||
8416 bigstack_alloc_d(ulii * MODEL_BLOCKSIZE * perm_vec_ctcl8m, &g_qresultbuf)) {
8417 goto qassoc_ret_NOMEM;
8418 }
8419 if (mperm_save & MPERM_DUMP_ALL) {
8420 if (bigstack_alloc_d(marker_ct * g_perm_vec_ct, &g_mperm_save_all)) {
8421 goto qassoc_ret_NOMEM;
8422 }
8423 }
8424 } else {
8425 if (bigstack_calloc_d(perm_vec_ctcl8m * ulii * max_thread_ct, &g_thread_git_qbufs)) {
8426 goto qassoc_ret_NOMEM;
8427 }
8428 }
8429 g_perms_done += g_perm_vec_ct;
8430 if (g_perm_vec_ct >= CACHELINE_DBL * max_thread_ct) {
8431 g_perm_generation_thread_ct = max_thread_ct;
8432 } else {
8433 g_perm_generation_thread_ct = MAXV(g_perm_vec_ct / CACHELINE_DBL, 1);
8434 }
8435 ulii = 0;
8436 if (!cluster_starts) {
8437 if (spawn_threads(threads, &generate_qt_perms_smajor_thread, g_perm_generation_thread_ct)) {
8438 goto qassoc_ret_THREAD_CREATE_FAIL;
8439 }
8440 generate_qt_perms_smajor_thread((void*)ulii);
8441 } else {
8442 if (spawn_threads(threads, &generate_qt_cluster_perms_smajor_thread, g_perm_generation_thread_ct)) {
8443 goto qassoc_ret_THREAD_CREATE_FAIL;
8444 }
8445 generate_qt_cluster_perms_smajor_thread((void*)ulii);
8446 }
8447 join_threads(threads, g_perm_generation_thread_ct);
8448 g_assoc_thread_ct = max_thread_ct;
8449 }
8450 chrom_fo_idx = 0xffffffffU;
8451 marker_uidx = next_unset_unsafe(marker_exclude, 0);
8452 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
8453 goto qassoc_ret_READ_FAIL;
8454 }
8455 marker_idx = 0;
8456 marker_idx2 = 0;
8457 chrom_end = 0;
8458 loop_end = marker_ct / 100;
8459 do {
8460 if (marker_uidx >= chrom_end) {
8461 g_qblock_start = 0;
8462 // exploit overflow
8463 chrom_fo_idx++;
8464 refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &g_is_x, &g_is_y, &uii, &g_min_ploidy_1);
8465 g_min_ploidy_1 |= uii; // treat MT as haploid
8466 uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
8467 chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, uii, &chrom_name_len, chrom_name_buf);
8468 } else if (perm_maxt_nst) {
8469 marker_idx -= MODEL_BLOCKKEEP;
8470 memcpy(g_loadbuf, &(g_loadbuf[(MODEL_BLOCKSIZE - MODEL_BLOCKKEEP) * pheno_nm_ctv2]), MODEL_BLOCKKEEP * pheno_nm_ctv2 * sizeof(intptr_t));
8471 if (!do_lin) {
8472 memcpy(g_qresultbuf, &(g_qresultbuf[3 * (MODEL_BLOCKSIZE - MODEL_BLOCKKEEP) * perm_vec_ctcl8m]), MODEL_BLOCKKEEP * perm_vec_ctcl8m * 3 * sizeof(double));
8473 } else {
8474 memcpy(g_qresultbuf, &(g_qresultbuf[6 * (MODEL_BLOCKSIZE - MODEL_BLOCKKEEP) * perm_vec_ctcl8m]), MODEL_BLOCKKEEP * perm_vec_ctcl8m * 6 * sizeof(double));
8475 }
8476 g_qblock_start = MODEL_BLOCKKEEP;
8477 } else {
8478 g_qblock_start = 0;
8479 }
8480 block_size = g_qblock_start;
8481 block_end = marker_unstopped_ct - marker_idx;
8482 if (block_end > MODEL_BLOCKSIZE) {
8483 block_end = MODEL_BLOCKSIZE;
8484 }
8485 do {
8486 if (perm_adapt_nst && g_perm_adapt_stop[marker_idx2]) {
8487 do {
8488 marker_uidx++;
8489 next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
8490 marker_idx2++;
8491 } while ((marker_uidx < chrom_end) && g_perm_adapt_stop[marker_idx2]);
8492 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
8493 goto qassoc_ret_READ_FAIL;
8494 }
8495 if (marker_uidx >= chrom_end) {
8496 break;
8497 }
8498 }
8499 loadbuf_ptr = &(g_loadbuf[block_size * pheno_nm_ctv2]);
8500 if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
8501 goto qassoc_ret_READ_FAIL;
8502 }
8503 if (g_min_ploidy_1 && hh_or_mt_exists) {
8504 haploid_fix(hh_or_mt_exists, sample_include2, sample_male_include2, pheno_nm_ct, g_is_x, g_is_y, (unsigned char*)loadbuf_ptr);
8505 }
8506 if (perm_adapt_nst) {
8507 g_adapt_m_table[block_size] = marker_idx2++;
8508 }
8509 mu_table[block_size++] = marker_uidx;
8510 if (marker_idx + block_size == marker_unstopped_ct) {
8511 break;
8512 }
8513 marker_uidx++;
8514 if (IS_SET(marker_exclude, marker_uidx)) {
8515 marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
8516 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
8517 goto qassoc_ret_READ_FAIL;
8518 }
8519 }
8520 } while ((block_size < block_end) && (marker_uidx < chrom_end));
8521 if (block_size == g_qblock_start) {
8522 continue;
8523 }
8524 if (!perm_pass_idx) {
8525 for (marker_bidx = g_qblock_start; marker_bidx < block_size; marker_bidx++) {
8526 marker_uidx2 = mu_table[marker_bidx];
8527 marker_idx_to_uidx[marker_idx + marker_bidx] = marker_uidx2;
8528 loadbuf_ptr = &(g_loadbuf[marker_bidx * pheno_nm_ctv2]);
8529 genovec_3freq(loadbuf_ptr, sample_include2, pheno_nm_ctv2, &missing_ct, &het_ct, &homcom_ct);
8530 nanal = pheno_nm_ct - missing_ct;
8531 wptr = memcpya(g_textbuf, chrom_name_ptr, chrom_name_len);
8532 *wptr++ = ' ';
8533 wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr);
8534 *wptr++ = ' ';
8535 wptr = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr);
8536 wptr = uint32toa_w8x(nanal, ' ', wptr);
8537 homrar_ct = nanal - het_ct - homcom_ct;
8538 if (do_perms) {
8539 g_missing_cts[marker_idx + marker_bidx] = missing_ct;
8540 g_homcom_cts[marker_idx + marker_bidx] = homcom_ct;
8541 g_het_cts[marker_idx + marker_bidx] = het_ct;
8542 }
8543 geno_sum = 2 * homrar_ct + het_ct;
8544 geno_ssq = 4 * homrar_ct + het_ct;
8545 qt_sum = g_pheno_sum;
8546 qt_g_prod = 0;
8547 qt_ssq = g_pheno_ssq;
8548 lbptr2 = loadbuf_ptr;
8549 uii = 0;
8550 qt_het_sum = 0;
8551 qt_het_ssq = 0;
8552 qt_homrar_sum = 0;
8553 qt_homrar_ssq = 0;
8554 do {
8555 ulii = ~(*lbptr2++);
8556 if (uii + BITCT2 > pheno_nm_ct) {
8557 ulii &= (ONELU << ((pheno_nm_ct & (BITCT2 - 1)) * 2)) - ONELU;
8558 }
8559 while (ulii) {
8560 ujj = CTZLU(ulii) & (BITCT - 2);
8561 ukk = (ulii >> ujj) & 3;
8562 sample_idx = uii + (ujj / 2);
8563 dxx = g_perm_pheno_d2[sample_idx];
8564 if (ukk == 1) {
8565 qt_g_prod += dxx;
8566 if (qt_means_or_lin) {
8567 qt_het_sum += dxx;
8568 qt_het_ssq += dxx * dxx;
8569 }
8570 } else if (ukk == 3) {
8571 qt_g_prod += 2 * dxx;
8572 if (qt_means_or_lin) {
8573 qt_homrar_sum += dxx;
8574 qt_homrar_ssq += dxx * dxx;
8575 }
8576 } else {
8577 qt_sum -= dxx;
8578 qt_ssq -= dxx * dxx;
8579 }
8580 ulii &= ~((3 * ONELU) << ujj);
8581 }
8582 uii += BITCT2;
8583 } while (uii < pheno_nm_ct);
8584 nanal_recip = 1.0 / ((double)nanal);
8585 qt_mean = qt_sum * nanal_recip;
8586 geno_mean = ((double)geno_sum) * nanal_recip;
8587 dxx = 1.0 / ((double)(nanal - 1));
8588 qt_var = (qt_ssq - qt_sum * qt_mean) * dxx;
8589 geno_var = (((double)geno_ssq) - geno_sum * geno_mean) * dxx;
8590 qt_g_prod_centered = qt_g_prod - qt_sum * geno_mean;
8591 qt_g_covar = qt_g_prod_centered * dxx;
8592
8593 dxx = 1.0 / geno_var;
8594 beta = qt_g_covar * dxx;
8595 vbeta_sqrt = sqrt((qt_var * dxx - beta * beta) / ((double)(nanal - 2)));
8596 tstat = beta / vbeta_sqrt;
8597 if (fill_orig_chiabs) {
8598 g_orig_chisq[marker_idx + marker_bidx] = tstat;
8599 if (tcnt) {
8600 tcnt[marker_idx + marker_bidx] = (nanal > 2)? (nanal - 2) : 0;
8601 }
8602 }
8603 if (do_lin) {
8604 // Square of Lin statistic:
8605 // \frac{(\sum_{i=1}^nU_{ji})^2}{\sum_{i=1}^nU_{ji}^2}
8606 // where U_{ji} = (Y_i - \bar{Y_{\dot}})(X_{ji} - \bar{X_{j\dot}}),
8607 // Y_{\dot}s are phenotypes, and X_{\dot\dot}s are genotypes.
8608 //
8609 // We evaluate the denominator by separating the sum into three
8610 // components (one for each possible genotype value), each of which
8611 // can be computed from the partial sums/sums-of-squares we already
8612 // have.
8613 g_orig_linsq[marker_idx + marker_bidx] = qt_g_prod_centered * qt_g_prod_centered / (geno_mean * geno_mean * (qt_ssq - 2 * qt_sum + qt_mean * qt_sum) + (1 - 2 * geno_mean) * (qt_het_ssq - 2 * qt_het_sum * qt_mean + qt_mean * qt_mean * ((intptr_t)het_ct)) + (4 - 4 * geno_mean) * (qt_homrar_ssq - 2 * qt_homrar_sum * qt_mean + qt_mean * qt_mean * ((intptr_t)homrar_ct)));
8614 }
8615 if (nanal > 1) {
8616 tp = calc_tprob(tstat, nanal - 2);
8617 rsq = (qt_g_covar * qt_g_covar) / (qt_var * geno_var);
8618 if (mperm_save & MPERM_DUMP_ALL) {
8619 if (!do_lin) {
8620 if (tp >= 0) {
8621 dtoa_gx(tstat * tstat, '\0', &(numbuf[1]));
8622 fputs(numbuf, outfile_msa);
8623 } else {
8624 fputs(" NA", outfile_msa);
8625 }
8626 } else {
8627 dxx = g_orig_linsq[marker_idx + marker_bidx];
8628 if ((nanal > 2) && realnum(dxx)) {
8629 dtoa_gx(dxx, '\0', &(numbuf[1]));
8630 fputs(numbuf, outfile_msa);
8631 } else {
8632 fputs(" NA", outfile_msa);
8633 }
8634 }
8635 }
8636 if ((pfilter != 2.0) && ((tp > pfilter) || (tp == -9))) {
8637 continue;
8638 }
8639 if (!realnum(beta)) {
8640 wptr = memcpya(wptr, " NA NA NA ", 33);
8641 } else {
8642 wptr = dtoa_g_wxp4x(beta, 10, ' ', wptr);
8643 wptr = dtoa_g_wxp4x(vbeta_sqrt, 10, ' ', wptr);
8644 wptr = dtoa_g_wxp4x(rsq, 10, ' ', wptr);
8645 }
8646 if (tp >= 0) {
8647 wptr = dtoa_g_wxp4x(tstat, 8, ' ', wptr);
8648 wptr = dtoa_g_wxp4(MAXV(tp, output_min_p), 12, wptr);
8649 } else {
8650 wptr = memcpya(wptr, " NA NA", 21);
8651 }
8652 if (do_lin && (nanal > 2)) {
8653 dxx = g_orig_linsq[marker_idx + marker_bidx];
8654 if (realnum(dxx)) {
8655 *wptr++ = ' ';
8656 dxx = sqrt(dxx);
8657 wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
8658 dxx = calc_tprob(dxx, nanal - 2);
8659 wptr = dtoa_g_wxp4(MAXV(dxx, output_min_p), 12, wptr);
8660 } else {
8661 wptr = memcpya(wptr, " NA NA", 26);
8662 }
8663 }
8664 wptr = memcpya(wptr, " \n", 2);
8665 } else if (pfilter != 2.0) {
8666 continue;
8667 } else {
8668 wptr = memcpya(wptr, " NA NA NA NA NA ", 55);
8669 if (mperm_save & MPERM_DUMP_ALL) {
8670 fputs(" NA", outfile_msa);
8671 }
8672 if (do_lin) {
8673 wptr = memcpya(wptr, " NA NA ", 26);
8674 }
8675 *wptr++ = '\n';
8676 }
8677 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
8678 goto qassoc_ret_WRITE_FAIL;
8679 }
8680 if (qt_means) {
8681 wptr_restart = &(g_textbuf[2 + chrom_name_len + plink_maxsnp]);
8682 wptr = memcpya(wptr_restart, " GENO ", 7);
8683 a1ptr = marker_allele_ptrs[2 * marker_uidx2];
8684 a2ptr = marker_allele_ptrs[2 * marker_uidx2 + 1];
8685 uii = strlen(a1ptr);
8686 ujj = strlen(a2ptr);
8687 if (uii < 4) {
8688 wptr = memseta(wptr, 32, 7 - 2 * uii);
8689 }
8690 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
8691 goto qassoc_ret_WRITE_FAIL;
8692 }
8693 fputs(a1ptr, outfile_qtm);
8694 putc_unlocked('/', outfile_qtm);
8695 fputs(a1ptr, outfile_qtm);
8696 putc_unlocked(' ', outfile_qtm);
8697 if (uii + ujj < 7) {
8698 fwrite(spacebuf, 1, 7 - uii - ujj, outfile_qtm);
8699 }
8700 fputs(a1ptr, outfile_qtm);
8701 putc_unlocked('/', outfile_qtm);
8702 fputs(a2ptr, outfile_qtm);
8703 putc_unlocked(' ', outfile_qtm);
8704 if (ujj < 4) {
8705 fwrite(spacebuf, 1, 7 - 2 * ujj, outfile_qtm);
8706 }
8707 fputs(a2ptr, outfile_qtm);
8708 putc_unlocked('/', outfile_qtm);
8709 fputs(a2ptr, outfile_qtm);
8710 putc_unlocked('\n', outfile_qtm);
8711 wptr = memcpya(wptr_restart, "COUNTS ", 7);
8712 wptr = uint32toa_w8x(homrar_ct, ' ', wptr);
8713 wptr = uint32toa_w8x(het_ct, ' ', wptr);
8714 wptr = uint32toa_w8x(homcom_ct, '\n', wptr);
8715 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
8716 goto qassoc_ret_WRITE_FAIL;
8717 }
8718 wptr = memcpya(wptr_restart, " FREQ ", 7);
8719 wptr = dtoa_g_wxp4x(nanal_recip * ((intptr_t)homrar_ct), 8, ' ', wptr);
8720 wptr = dtoa_g_wxp4x(nanal_recip * ((intptr_t)het_ct), 8, ' ', wptr);
8721 wptr = dtoa_g_wxp4x(nanal_recip * ((intptr_t)homcom_ct), 8, '\n', wptr);
8722 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
8723 goto qassoc_ret_WRITE_FAIL;
8724 }
8725 wptr = memcpya(wptr_restart, " MEAN ", 7);
8726 qt_homcom_sum = qt_sum - qt_homrar_sum - qt_het_sum;
8727 if (homrar_ct) {
8728 x11 = qt_homrar_sum / ((double)homrar_ct);
8729 wptr = dtoa_g_wxp4(x11, 8, wptr);
8730 } else {
8731 wptr = memcpya(wptr, " NA", 8);
8732 }
8733 *wptr++ = ' ';
8734 if (het_ct) {
8735 x12 = qt_het_sum / ((double)het_ct);
8736 wptr = dtoa_g_wxp4(x12, 8, wptr);
8737 } else {
8738 wptr = memcpya(wptr, " NA", 8);
8739 }
8740 *wptr++ = ' ';
8741 if (homcom_ct) {
8742 x22 = qt_homcom_sum / ((double)homcom_ct);
8743 wptr = dtoa_g_wxp4(x22, 8, wptr);
8744 } else {
8745 wptr = memcpya(wptr, " NA", 8);
8746 }
8747 *wptr++ = '\n';
8748 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
8749 goto qassoc_ret_WRITE_FAIL;
8750 }
8751 wptr = memcpya(wptr_restart, " SD ", 7);
8752 if (homrar_ct > 1) {
8753 dxx = sqrt((qt_homrar_ssq - qt_homrar_sum * x11) / ((double)((intptr_t)homrar_ct - 1)));
8754 wptr = dtoa_g_wxp4(dxx, 8, wptr);
8755 } else if (homrar_ct == 1) {
8756 wptr = memcpya(wptr, " 0", 8);
8757 } else {
8758 wptr = memcpya(wptr, " NA", 8);
8759 }
8760 *wptr++ = ' ';
8761 if (het_ct > 1) {
8762 dxx = sqrt((qt_het_ssq - qt_het_sum * x12) / ((double)((intptr_t)het_ct - 1)));
8763 wptr = dtoa_g_wxp4(dxx, 8, wptr);
8764 } else if (het_ct == 1) {
8765 wptr = memcpya(wptr, " 0", 8);
8766 } else {
8767 wptr = memcpya(wptr, " NA", 8);
8768 }
8769 *wptr++ = ' ';
8770 if (homcom_ct > 1) {
8771 dxx = sqrt((qt_ssq - qt_het_ssq - qt_homrar_ssq - qt_homcom_sum * x22) / ((double)((intptr_t)homcom_ct - 1)));
8772 wptr = dtoa_g_wxp4(dxx, 8, wptr);
8773 } else if (homcom_ct == 1) {
8774 wptr = memcpya(wptr, " 0", 8);
8775 } else {
8776 wptr = memcpya(wptr, " NA", 8);
8777 }
8778 *wptr++ = '\n';
8779 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
8780 goto qassoc_ret_WRITE_FAIL;
8781 }
8782 }
8783 }
8784 }
8785 if (do_perms_nst) {
8786 is_last_block = (marker_idx + block_size >= marker_unstopped_ct);
8787 g_block_diff = block_size - g_qblock_start;
8788 ulii = 0;
8789 if (perm_maxt_nst) {
8790 g_maxt_block_base = marker_idx;
8791 // don't actually use maxt_cur_extreme_stat here?...
8792 if (!do_lin) {
8793 if (spawn_threads2(threads, &qassoc_maxt_thread, max_thread_ct, is_last_block)) {
8794 goto qassoc_ret_THREAD_CREATE_FAIL;
8795 }
8796 qassoc_maxt_thread((void*)ulii);
8797 } else {
8798 if (spawn_threads2(threads, &qassoc_maxt_lin_thread, max_thread_ct, is_last_block)) {
8799 goto qassoc_ret_THREAD_CREATE_FAIL;
8800 }
8801 qassoc_maxt_lin_thread((void*)ulii);
8802 }
8803 join_threads2(threads, max_thread_ct, is_last_block);
8804 ukk = g_block_diff / CACHELINE_DBL;
8805 if (ukk > max_thread_ct) {
8806 ukk = max_thread_ct;
8807 } else if (!ukk) {
8808 ukk = 1;
8809 }
8810 ulii = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
8811 for (uii = 0; uii < ukk; uii++) {
8812 ooptr = &(g_maxt_thread_results[uii * ulii]);
8813 for (ujj = g_perms_done - g_perm_vec_ct; ujj < g_perms_done; ujj++) {
8814 dxx = *ooptr++;
8815 if (dxx > g_maxt_extreme_stat[ujj]) {
8816 g_maxt_extreme_stat[ujj] = dxx;
8817 }
8818 }
8819 }
8820 } else {
8821 if (!do_lin) {
8822 if (spawn_threads2(threads, &qassoc_adapt_thread, max_thread_ct, is_last_block)) {
8823 goto qassoc_ret_THREAD_CREATE_FAIL;
8824 }
8825 qassoc_adapt_thread((void*)ulii);
8826 } else {
8827 if (spawn_threads2(threads, &qassoc_adapt_lin_thread, max_thread_ct, is_last_block)) {
8828 goto qassoc_ret_THREAD_CREATE_FAIL;
8829 }
8830 qassoc_adapt_lin_thread((void*)ulii);
8831 }
8832 join_threads2(threads, max_thread_ct, is_last_block);
8833 }
8834 }
8835 marker_idx += block_size;
8836 if ((!perm_pass_idx) && (marker_idx >= loop_end)) {
8837 if (marker_idx < marker_unstopped_ct) {
8838 if (pct >= 10) {
8839 putc_unlocked('\b', stdout);
8840 }
8841 pct = (marker_idx * 100LLU) / marker_unstopped_ct;
8842 printf("\b\b%u%%", pct);
8843 fflush(stdout);
8844 loop_end = (((uint64_t)pct + 1LLU) * marker_unstopped_ct) / 100;
8845 }
8846 }
8847 } while (marker_idx < marker_unstopped_ct);
8848 if (!perm_pass_idx) {
8849 if (pct >= 10) {
8850 putc_unlocked('\b', stdout);
8851 }
8852 fputs("\b\b", stdout);
8853 logprint("done.\n");
8854 if (qt_means) {
8855 LOGPRINTFWW("QT means report saved to %s.means .\n", outname);
8856 if (fclose_null(&outfile_qtm)) {
8857 goto qassoc_ret_WRITE_FAIL;
8858 }
8859 }
8860 if (fclose_null(&outfile)) {
8861 goto qassoc_ret_WRITE_FAIL;
8862 }
8863 if (!is_set_test) {
8864 if (do_perms_nst) {
8865 bigstack_reset(g_perm_vecstd);
8866 }
8867 if (mtest_adjust) {
8868 if (do_lin) {
8869 for (uii = 0; uii < marker_ct; uii++) {
8870 g_orig_chisq[uii] = sqrt(g_orig_linsq[uii]);
8871 }
8872 }
8873 retval = multcomp(outname, outname_end, marker_idx_to_uidx, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, g_orig_chisq, pfilter, output_min_p, mtest_adjust, 0, adjust_lambda, tcnt, nullptr);
8874 if (retval) {
8875 goto qassoc_ret_1;
8876 }
8877 }
8878 if (mperm_save & MPERM_DUMP_ALL) {
8879 if (putc_checked('\n', outfile_msa)) {
8880 goto qassoc_ret_WRITE_FAIL;
8881 }
8882 }
8883 } else {
8884 retval = qassoc_set_test(threads, bedfile, bed_offset, outname, outname_end, model_modifier, model_mperm_val, pfilter, output_min_p, mtest_adjust, unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, sex_male, apip, pheno_nm_ct, pheno_nm, founder_pnm, sample_include2, sample_male_include2, ld_ignore_x, hh_exists, hh_or_mt_exists, perm_batch_size, sip, tcnt, loadbuf_raw);
8885 if (retval) {
8886 goto qassoc_ret_1;
8887 }
8888 }
8889 }
8890 if (do_perms_nst) {
8891 if (mperm_save & MPERM_DUMP_ALL) {
8892 if (perm_pass_idx) {
8893 putc_unlocked(' ', stdout);
8894 }
8895 fputs("[dumping stats]", stdout);
8896 fflush(stdout);
8897 ulii = g_perm_vec_ct;
8898 ujj = 1 + g_perms_done - ulii;
8899 wptr = g_textbuf;
8900 a1ptr = &(g_textbuf[MAXLINELEN]);
8901 for (uii = 0; uii < ulii; uii++) {
8902 wptr = uint32toa(uii + ujj, wptr);
8903 ooptr = &(g_mperm_save_all[uii]);
8904 for (ukk = 0; ukk < marker_ct; ukk++) {
8905 *wptr++ = ' ';
8906 dxx = ooptr[ukk * ulii];
8907 if (dxx >= 0) {
8908 wptr = dtoa_g(dxx, wptr);
8909 } else {
8910 wptr = memcpya(wptr, "NA", 2);
8911 }
8912 if (wptr >= a1ptr) {
8913 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
8914 goto qassoc_ret_WRITE_FAIL;
8915 }
8916 wptr = g_textbuf;
8917 }
8918 }
8919 *wptr++ = '\n';
8920 }
8921 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
8922 goto qassoc_ret_WRITE_FAIL;
8923 }
8924 fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b ", stdout);
8925 }
8926 bigstack_reset(g_perm_vecstd);
8927 if (g_perms_done < perms_total) {
8928 if (perm_adapt_nst) {
8929 marker_unstopped_ct = marker_ct - popcount01_longs((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
8930 if (!marker_unstopped_ct) {
8931 goto qassoc_adapt_perm_count;
8932 }
8933 }
8934 printf("\r%u permutation%s complete.", g_perms_done, (g_perms_done != 1)? "s" : "");
8935 fflush(stdout);
8936 perm_pass_idx++;
8937 goto qassoc_more_perms;
8938 }
8939 if (perm_adapt_nst) {
8940 qassoc_adapt_perm_count:
8941 g_perms_done = 0;
8942 for (uii = 0; uii < marker_ct; uii++) {
8943 if (g_perm_attempt_ct[uii] > g_perms_done) {
8944 g_perms_done = g_perm_attempt_ct[uii];
8945 if (g_perms_done == perms_total) {
8946 break;
8947 }
8948 }
8949 }
8950 }
8951 putc_unlocked('\r', stdout);
8952 LOGPRINTF("%u %s permutation%s complete.\n", g_perms_done, perm_maxt_nst? "max(T)" : "(adaptive)", (g_perms_done != 1)? "s" : "");
8953
8954 if (perm_adapt_nst) {
8955 memcpy(outname_end2, ".perm", 6);
8956 } else {
8957 if (mperm_save & MPERM_DUMP_BEST) {
8958 memcpy(outname_end, ".mperm.dump.best", 17);
8959 LOGPRINTFWW("Dumping best permutation squared %sstats to %s .\n", do_lin? "Lin " : "Wald t-", outname);
8960 if (fopen_checked(outname, "w", &outfile)) {
8961 goto qassoc_ret_OPEN_FAIL;
8962 }
8963 dxx = 0;
8964 if (!do_lin) {
8965 for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
8966 if (fabs(g_orig_chisq[marker_idx]) > dxx) {
8967 dxx = fabs(g_orig_chisq[marker_idx]);
8968 }
8969 }
8970 dxx = dxx * dxx;
8971 } else {
8972 for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
8973 if (g_orig_linsq[marker_idx] > dxx) {
8974 dxx = g_orig_linsq[marker_idx];
8975 }
8976 }
8977 }
8978 memcpy(g_textbuf, "0 ", 2);
8979 wptr = dtoa_gx(dxx, '\n', &(g_textbuf[2]));
8980 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
8981 goto qassoc_ret_WRITE_FAIL;
8982 }
8983 for (uii = 0; uii < perms_total; uii++) {
8984 wptr = uint32toa_x(uii + 1, ' ', g_textbuf);
8985 wptr = dtoa_gx(g_maxt_extreme_stat[uii], '\n', wptr);
8986 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
8987 goto qassoc_ret_WRITE_FAIL;
8988 }
8989 }
8990 if (fclose_null(&outfile)) {
8991 goto qassoc_ret_WRITE_FAIL;
8992 }
8993 memcpy(outname_end, ".qassoc", 7); // deliberately not null-terminated
8994 }
8995 memcpy(outname_end2, ".mperm", 7);
8996 }
8997 if (fopen_checked(outname, "w", &outfile)) {
8998 goto qassoc_ret_OPEN_FAIL;
8999 }
9000 if (perm_adapt_nst) {
9001 sprintf(g_textbuf, " CHR %%%us EMP1 NP \n", plink_maxsnp);
9002 } else {
9003 sprintf(g_textbuf, " CHR %%%us EMP1 EMP2 \n", plink_maxsnp);
9004 #ifdef __cplusplus
9005 std::sort(g_maxt_extreme_stat, &(g_maxt_extreme_stat[perms_total]));
9006 #else
9007 qsort(g_maxt_extreme_stat, perms_total, sizeof(double), double_cmp);
9008 #endif
9009 }
9010 // (debugging)
9011 // if (perm_maxt) {
9012 // printf("extreme stats: %g %g %g\n", g_maxt_extreme_stat[0], g_maxt_extreme_stat[(perms_total - 1) / 2], g_maxt_extreme_stat[perms_total - 1]);
9013 // }
9014 fprintf(outfile, g_textbuf, "SNP");
9015 chrom_fo_idx = 0xffffffffU;
9016 marker_uidx = next_unset_unsafe(marker_exclude, 0);
9017 marker_idx = 0;
9018 dyy = 1.0 / ((double)((int32_t)perms_total + 1));
9019 dxx = 0.5 * dyy;
9020 while (1) {
9021 do {
9022 chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
9023 } while (marker_uidx >= chrom_end);
9024 uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
9025 wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
9026 *wptr_start++ = ' ';
9027 wptr_start[plink_maxsnp] = ' ';
9028 for (; marker_uidx < chrom_end;) {
9029 if (perm_adapt_nst) {
9030 pval = ((double)(g_perm_2success_ct[marker_idx] + 2)) / ((double)(2 * (g_perm_attempt_ct[marker_idx] + 1)));
9031 } else {
9032 pval = ((double)(g_perm_2success_ct[marker_idx] + 2)) * dxx;
9033 }
9034 if (pval <= pfilter) {
9035 fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
9036 wptr = &(wptr_start[1 + plink_maxsnp]);
9037 if (perm_adapt_nst && (!g_perm_attempt_ct[marker_idx])) {
9038 // invalid
9039 wptr = memcpya(wptr, " NA NA", 25);
9040 } else {
9041 if (!perm_count) {
9042 wptr = dtoa_g_wxp4x(pval, 12, ' ', wptr);
9043 } else {
9044 wptr = dtoa_g_wxp4x(((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ', wptr);
9045 }
9046 if (perm_adapt_nst) {
9047 wptr = memseta(wptr, 32, 2);
9048 wptr = uint32toa_w10(g_perm_attempt_ct[marker_idx], wptr);
9049 } else {
9050 // maximum chisq
9051 // N.B. numbers in maxt_extreme_stat[] have been pre-squared
9052 // while orig_chisq[] has not been
9053 if (do_lin) {
9054 dzz = g_orig_linsq[marker_idx];
9055 } else {
9056 dzz = g_orig_chisq[marker_idx] * g_orig_chisq[marker_idx];
9057 }
9058 dzz = (int32_t)(perms_total - doublearr_greater_than(g_maxt_extreme_stat, perms_total, dzz - EPSILON) + 1);
9059 if (!perm_count) {
9060 wptr = dtoa_g_wxp4(dzz * dyy, 12, wptr);
9061 } else {
9062 wptr = dtoa_g_wxp4(dzz - 1, 12, wptr);
9063 }
9064 }
9065 }
9066 wptr = memcpya(wptr, " \n", 2);
9067 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
9068 goto qassoc_ret_WRITE_FAIL;
9069 }
9070 }
9071 if (++marker_idx == marker_ct) {
9072 goto qassoc_loop_end;
9073 }
9074 marker_uidx++;
9075 next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
9076 }
9077 }
9078 qassoc_loop_end:
9079 if (fclose_null(&outfile)) {
9080 goto qassoc_ret_WRITE_FAIL;
9081 }
9082 LOGPRINTFWW("Permutation test report written to %s .\n", outname);
9083 }
9084
9085 while (0) {
9086 qassoc_ret_NOMEM:
9087 retval = RET_NOMEM;
9088 break;
9089 qassoc_ret_OPEN_FAIL:
9090 retval = RET_OPEN_FAIL;
9091 break;
9092 qassoc_ret_READ_FAIL:
9093 retval = RET_READ_FAIL;
9094 break;
9095 qassoc_ret_WRITE_FAIL:
9096 retval = RET_WRITE_FAIL;
9097 break;
9098 qassoc_ret_INVALID_CMDLINE:
9099 retval = RET_INVALID_CMDLINE;
9100 break;
9101 qassoc_ret_THREAD_CREATE_FAIL:
9102 retval = RET_THREAD_CREATE_FAIL;
9103 break;
9104 }
9105 qassoc_ret_1:
9106 bigstack_reset(bigstack_mark);
9107 fclose_cond(outfile);
9108 fclose_cond(outfile_qtm);
9109 fclose_cond(outfile_msa);
9110 return retval;
9111 }
9112
gxe_assoc(FILE * bedfile,uintptr_t bed_offset,char * outname,char * outname_end,double output_min_p,uintptr_t * marker_exclude,uintptr_t marker_ct,char * marker_ids,uintptr_t max_marker_id_len,uint32_t plink_maxsnp,uintptr_t * marker_reverse,Chrom_info * chrom_info_ptr,uintptr_t unfiltered_sample_ct,uintptr_t sample_ct,uintptr_t * sample_exclude,uintptr_t * pheno_nm,double * pheno_d,uintptr_t * gxe_covar_nm,uintptr_t * gxe_covar_c,uintptr_t * sex_male,uint32_t hh_or_mt_exists)9113 int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, double* pheno_d, uintptr_t* gxe_covar_nm, uintptr_t* gxe_covar_c, uintptr_t* sex_male, uint32_t hh_or_mt_exists) {
9114 unsigned char* bigstack_mark = g_bigstack_base;
9115 FILE* outfile = nullptr;
9116 uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
9117 uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
9118 uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
9119 uintptr_t covar_nm_ct = popcount_longs(gxe_covar_nm, sample_ctl);
9120 uintptr_t covar_nm_ctl = BITCT_TO_WORDCT(covar_nm_ct);
9121 // gxe_covar_c has opposite truth value from ->bcovar in PLINK 1.07 gxe.cpp;
9122 // see lines 50-58 in gxe.cpp
9123 uintptr_t group2_size = popcount_longs(gxe_covar_c, sample_ctl);
9124 uintptr_t group1_size = covar_nm_ct - group2_size;
9125 uintptr_t male_ct = 0;
9126 uintptr_t male_ctl = 0;
9127 uintptr_t group1_size_male = 0;
9128 uintptr_t group2_size_male = 0;
9129 uintptr_t marker_uidx = 0;
9130 uintptr_t final_mask = 0;
9131 uintptr_t* sample_include2 = nullptr;
9132 uintptr_t* sample_male_include2 = nullptr;
9133 uintptr_t* sample_male_all_include2 = nullptr;
9134 uintptr_t* group1_include2 = nullptr;
9135 uintptr_t* group2_include2 = nullptr;
9136 uintptr_t* group1_male_include2 = nullptr;
9137 uintptr_t* group2_male_include2 = nullptr;
9138 uintptr_t* covar_nm_raw = nullptr;
9139 uintptr_t* covar_nm_male_raw = nullptr;
9140 uintptr_t* cur_sample_i2 = nullptr;
9141 uintptr_t* cur_sample_male_i2 = nullptr;
9142 uintptr_t* cur_group1_i2 = nullptr;
9143 uintptr_t* cur_group2_i2 = nullptr;
9144 uintptr_t* cur_covar_nm_raw = nullptr;
9145 double* pheno_d_collapsed = nullptr;
9146 double* pheno_d_male_collapsed = nullptr;
9147 double* cur_pheno_d = nullptr;
9148 char* wptr_start = nullptr;
9149 uintptr_t cur_sample_ct = 0;
9150 uintptr_t cur_sample_ctv2 = 0;
9151 uintptr_t cur_group1_size = 0;
9152 uintptr_t cur_group2_size = 0;
9153 uint32_t y_exists = (chrom_info_ptr->xymt_codes[Y_OFFSET] != -2) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->xymt_codes[Y_OFFSET]);
9154 uint32_t mt_exists = (chrom_info_ptr->xymt_codes[MT_OFFSET] != -2) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->xymt_codes[MT_OFFSET]);
9155 uint32_t skip_y = 0;
9156 double pheno_sum_g1 = 0;
9157 double pheno_ssq_g1 = 0;
9158 double pheno_sum_g2 = 0;
9159 double pheno_ssq_g2 = 0;
9160 double pheno_sum_male_g1 = 0;
9161 double pheno_ssq_male_g1 = 0;
9162 double pheno_sum_male_g2 = 0;
9163 double pheno_ssq_male_g2 = 0;
9164 double base_pheno_sum_g1 = 0;
9165 double base_pheno_ssq_g1 = 0;
9166 double base_pheno_sum_g2 = 0;
9167 double base_pheno_ssq_g2 = 0;
9168 int32_t retval = 0;
9169 uintptr_t* loadbuf_raw;
9170 uintptr_t* loadbuf;
9171 uintptr_t* loadbuf_ptr;
9172 uintptr_t* cgr_ptr;
9173 char* wptr;
9174 uint32_t chrom_fo_idx;
9175 uint32_t chrom_end;
9176 uintptr_t loop_end;
9177 uintptr_t marker_idx;
9178 uintptr_t sample_uidx;
9179 uintptr_t sample_uidx_stop;
9180 uintptr_t sample_idx;
9181 uintptr_t sample_idx2;
9182 uintptr_t sample_idx2_offset;
9183 uintptr_t ulii;
9184 uintptr_t uljj;
9185 uintptr_t ulkk;
9186 uintptr_t ulmm;
9187 uintptr_t ulnn;
9188 double dxx;
9189 double qt_sum1;
9190 double qt_ssq1;
9191 double qt_g_prod1;
9192 double nanal_recip1;
9193 double nanal_m1_recip1;
9194 double geno_mean1;
9195 double g_var1;
9196 double qt_var1;
9197 double qt_g_covar1;
9198 double beta1;
9199 double vbeta1;
9200
9201 double qt_sum2;
9202 double qt_ssq2;
9203 double qt_g_prod2;
9204 double nanal_recip2;
9205 double nanal_m1_recip2;
9206 double geno_mean2;
9207 double g_var2;
9208 double qt_var2;
9209 double qt_g_covar2;
9210 double beta2;
9211 double vbeta2;
9212
9213 double zval;
9214
9215 uint32_t is_x;
9216 uint32_t is_y;
9217 uint32_t is_mt;
9218 uint32_t min_ploidy_1;
9219 uint32_t pct;
9220
9221 uint32_t missing_ct1;
9222 uint32_t het_ct1;
9223 uint32_t homcom_ct1;
9224 uint32_t homrar_ct1;
9225 uint32_t nanal1;
9226 uint32_t geno_sum1;
9227 uint32_t geno_ssq1;
9228
9229 uint32_t missing_ct2;
9230 uint32_t het_ct2;
9231 uint32_t homcom_ct2;
9232 uint32_t homrar_ct2;
9233 uint32_t nanal2;
9234 uint32_t geno_sum2;
9235 uint32_t geno_ssq2;
9236
9237 if (group1_size < 3) {
9238 logerrprint("Error: First --gxe group has fewer than three members.\n");
9239 goto gxe_assoc_ret_INVALID_CMDLINE;
9240 } else if (group2_size < 3) {
9241 logerrprint("Error: Second --gxe group has fewer than three members.\n");
9242 goto gxe_assoc_ret_INVALID_CMDLINE;
9243 }
9244 if (bigstack_alloc_ul(unfiltered_sample_ctl * 2, &loadbuf_raw) ||
9245 bigstack_alloc_ul(covar_nm_ctl * 2, &loadbuf) ||
9246 bigstack_calloc_ul(unfiltered_sample_ctl, &covar_nm_raw) ||
9247 bigstack_alloc_d(covar_nm_ct, &pheno_d_collapsed)) {
9248 goto gxe_assoc_ret_NOMEM;
9249 }
9250 loadbuf_raw[unfiltered_sample_ctl * 2 - 1] = 0;
9251
9252 sample_uidx = 0;
9253 sample_idx = 0;
9254 sample_idx2 = 0;
9255 do {
9256 sample_uidx = next_unset_ul_unsafe(sample_exclude, sample_uidx);
9257 sample_uidx_stop = next_set_ul(sample_exclude, sample_uidx, unfiltered_sample_ct);
9258 do {
9259 if (IS_SET(gxe_covar_nm, sample_idx)) {
9260 SET_BIT(sample_uidx, covar_nm_raw);
9261 dxx = pheno_d[sample_uidx];
9262 if (IS_SET(gxe_covar_c, sample_idx)) {
9263 pheno_sum_g2 += dxx;
9264 pheno_ssq_g2 += dxx * dxx;
9265 } else {
9266 pheno_sum_g1 += dxx;
9267 pheno_ssq_g1 += dxx * dxx;
9268 }
9269 pheno_d_collapsed[sample_idx2++] = dxx;
9270 }
9271 sample_idx++;
9272 } while (++sample_uidx < sample_uidx_stop);
9273 } while (sample_idx < sample_ct);
9274
9275 if (bigstack_alloc_ul(covar_nm_ctl * 2, &group1_include2) ||
9276 bigstack_calloc_ul(covar_nm_ctl * 2, &group2_include2)) {
9277 goto gxe_assoc_ret_NOMEM;
9278 }
9279 fill_quatervec_55(covar_nm_ct, group1_include2);
9280 sample_idx = 0;
9281 sample_idx2 = 0;
9282 do {
9283 sample_idx = next_set_ul_unsafe(gxe_covar_nm, sample_idx);
9284 sample_uidx_stop = next_unset_ul(gxe_covar_nm, sample_idx, sample_ct);
9285 do {
9286 if (IS_SET(gxe_covar_c, sample_idx)) {
9287 SET_BIT_DBL(sample_idx2, group2_include2);
9288 }
9289 sample_idx2++;
9290 } while (++sample_idx < sample_uidx_stop);
9291 } while (sample_idx2 < covar_nm_ct);
9292 bitvec_andnot(group2_include2, covar_nm_ctl * 2, group1_include2);
9293
9294 hh_or_mt_exists |= mt_exists * NXMHH_EXISTS;
9295 if ((hh_or_mt_exists & NXMHH_EXISTS) || y_exists) {
9296 if (bigstack_alloc_ul(covar_nm_ctl * 2, &sample_include2)) {
9297 goto gxe_assoc_ret_NOMEM;
9298 }
9299 fill_quatervec_55(covar_nm_ct, sample_include2);
9300 }
9301 if ((hh_or_mt_exists & XMHH_EXISTS) || y_exists) {
9302 if (bigstack_calloc_ul(covar_nm_ctl * 2, &sample_male_include2)) {
9303 goto gxe_assoc_ret_NOMEM;
9304 }
9305 sample_uidx = 0;
9306 sample_idx = 0;
9307 sample_idx2 = 0;
9308 do {
9309 sample_uidx = next_unset_ul_unsafe(sample_exclude, sample_uidx);
9310 sample_uidx_stop = next_set_ul(sample_exclude, sample_uidx, unfiltered_sample_ct);
9311 do {
9312 if (IS_SET(gxe_covar_nm, sample_idx)) {
9313 if (IS_SET(sex_male, sample_uidx)) {
9314 SET_BIT_DBL(sample_idx2, sample_male_include2);
9315 male_ct++;
9316 }
9317 sample_idx2++;
9318 }
9319 sample_idx++;
9320 } while (++sample_uidx < sample_uidx_stop);
9321 } while (sample_idx < sample_ct);
9322 male_ctl = BITCT_TO_WORDCT(male_ct);
9323 if (y_exists) {
9324 group1_size_male = popcount_longs_exclude(sample_male_include2, group2_include2, covar_nm_ctl * 2);
9325 group2_size_male = male_ct - group1_size_male;
9326 if ((group1_size_male < 3) || (group2_size_male < 3)) {
9327 logerrprint("Warning: Skipping Y chromosome for --gxe since a group has less than 3 males.\n");
9328 skip_y = 1;
9329 }
9330 // currently still need to initialize covar_nm_male_raw even on skip_y
9331 if (bigstack_alloc_ul(male_ctl * 2, &sample_male_all_include2) ||
9332 bigstack_alloc_ul(male_ctl * 2, &group1_male_include2) ||
9333 bigstack_calloc_ul(male_ctl * 2, &group2_male_include2) ||
9334 bigstack_alloc_d(male_ct, &pheno_d_male_collapsed) ||
9335 bigstack_alloc_ul(unfiltered_sample_ctl, &covar_nm_male_raw)) {
9336 goto gxe_assoc_ret_NOMEM;
9337 }
9338 fill_quatervec_55(male_ct, sample_male_all_include2);
9339 fill_quatervec_55(male_ct, group1_male_include2);
9340 sample_idx = 0;
9341 for (sample_idx2 = 0; sample_idx2 < covar_nm_ct; sample_idx2++) {
9342 if (IS_SET_DBL(sample_male_include2, sample_idx2)) {
9343 dxx = pheno_d_collapsed[sample_idx2];
9344 if (IS_SET_DBL(group2_include2, sample_idx2)) {
9345 SET_BIT_DBL(sample_idx, group2_male_include2);
9346 pheno_sum_male_g2 += dxx;
9347 pheno_ssq_male_g2 += dxx * dxx;
9348 } else {
9349 pheno_sum_male_g1 += dxx;
9350 pheno_ssq_male_g1 += dxx * dxx;
9351 }
9352 pheno_d_male_collapsed[sample_idx++] = dxx;
9353 }
9354 }
9355 bitvec_andnot(group2_male_include2, male_ctl * 2, group1_male_include2);
9356 for (ulii = 0; ulii < unfiltered_sample_ctl; ulii++) {
9357 covar_nm_male_raw[ulii] = covar_nm_raw[ulii] & sex_male[ulii];
9358 }
9359 }
9360 }
9361
9362 memcpy(outname_end, ".qassoc.gxe", 12);
9363 if (fopen_checked(outname, "w", &outfile)) {
9364 goto gxe_assoc_ret_OPEN_FAIL;
9365 }
9366 if (haploid_chrom_present(chrom_info_ptr) || mt_exists) {
9367 logerrprint("Warning: --gxe doesn't currently handle X/Y/MT/haploid variants properly.\n");
9368 }
9369 LOGPRINTFWW5("Writing --gxe report to %s ... ", outname);
9370 fputs("0%", stdout);
9371 fflush(stdout);
9372 sprintf(g_textbuf, " CHR %%%us NMISS1 BETA1 SE1 NMISS2 BETA2 SE2 Z_GXE P_GXE \n", plink_maxsnp);
9373 fprintf(outfile, g_textbuf, "SNP");
9374
9375 if (fseeko(bedfile, bed_offset, SEEK_SET)) {
9376 goto gxe_assoc_ret_READ_FAIL;
9377 }
9378 // exploit overflow for initialization
9379 chrom_fo_idx = 0xffffffffU;
9380 marker_uidx = 0;
9381 marker_idx = 0;
9382 chrom_end = 0;
9383 for (pct = 1; pct <= 100; pct++) {
9384 loop_end = (((uint64_t)pct) * marker_ct) / 100;
9385 for (; marker_idx < loop_end; marker_idx++) {
9386 if (IS_SET(marker_exclude, marker_uidx)) {
9387 marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
9388 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
9389 goto gxe_assoc_ret_READ_FAIL;
9390 }
9391 }
9392 if (marker_uidx >= chrom_end) {
9393 chrom_fo_idx++;
9394 refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &min_ploidy_1);
9395 min_ploidy_1 |= is_mt;
9396 if (!is_y) {
9397 cur_sample_ct = covar_nm_ct;
9398 cur_group1_size = group1_size;
9399 cur_group2_size = group2_size;
9400 base_pheno_sum_g1 = pheno_sum_g1;
9401 base_pheno_ssq_g1 = pheno_ssq_g1;
9402 base_pheno_sum_g2 = pheno_sum_g2;
9403 base_pheno_ssq_g2 = pheno_ssq_g2;
9404 cur_sample_i2 = sample_include2;
9405 cur_sample_male_i2 = sample_male_include2;
9406 cur_group1_i2 = group1_include2;
9407 cur_group2_i2 = group2_include2;
9408 cur_pheno_d = pheno_d_collapsed;
9409 cur_covar_nm_raw = covar_nm_raw;
9410 } else {
9411 cur_sample_ct = male_ct;
9412 cur_group1_size = group1_size_male;
9413 cur_group2_size = group2_size_male;
9414 base_pheno_sum_g1 = pheno_sum_male_g1;
9415 base_pheno_ssq_g1 = pheno_ssq_male_g1;
9416 base_pheno_sum_g2 = pheno_sum_male_g2;
9417 base_pheno_ssq_g2 = pheno_ssq_male_g2;
9418 cur_sample_i2 = sample_male_all_include2;
9419 cur_sample_male_i2 = sample_male_all_include2;
9420 cur_group1_i2 = group1_male_include2;
9421 cur_group2_i2 = group2_male_include2;
9422 cur_pheno_d = pheno_d_male_collapsed;
9423 cur_covar_nm_raw = covar_nm_male_raw;
9424 }
9425 wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], g_textbuf));
9426 *wptr_start++ = ' ';
9427 cur_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(cur_sample_ct);
9428 loadbuf[cur_sample_ctv2 - 1] = 0;
9429 final_mask = get_final_mask(cur_sample_ct);
9430 }
9431
9432 if (load_and_collapse_incl(unfiltered_sample_ct, cur_sample_ct, cur_covar_nm_raw, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf)) {
9433 goto gxe_assoc_ret_READ_FAIL;
9434 }
9435 if (is_y && skip_y) {
9436 marker_uidx++;
9437 continue;
9438 }
9439 if (min_ploidy_1) {
9440 haploid_fix(hh_or_mt_exists, cur_sample_i2, cur_sample_male_i2, cur_sample_ct, is_x, is_y, (unsigned char*)loadbuf);
9441 }
9442
9443 wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
9444 *wptr++ = ' ';
9445
9446 // We are interested in the following quantities:
9447 // qt_var{1,2}: (qt_ssq - (qt_sum^2 / N)) / (N-1)
9448 // g_var{1,2}: (geno_ssq - (geno_sum^2 / N)) / (N-1)
9449 // qt_g_covar{1,2}: (qt_g_prod - ((qt_sum * geno_sum) / N)) / (N-1)
9450
9451 single_marker_cc_3freqs(cur_sample_ctv2, loadbuf, cur_group1_i2, cur_group2_i2, &homcom_ct1, &het_ct1, &missing_ct1, &homcom_ct2, &het_ct2, &missing_ct2);
9452 nanal1 = ((uint32_t)cur_group1_size) - missing_ct1;
9453 nanal2 = ((uint32_t)cur_group2_size) - missing_ct2;
9454 homrar_ct1 = nanal1 - (het_ct1 + homcom_ct1);
9455 homrar_ct2 = nanal2 - (het_ct2 + homcom_ct2);
9456 geno_sum1 = 2 * homrar_ct1 + het_ct1;
9457 geno_sum2 = 2 * homrar_ct2 + het_ct2;
9458 geno_ssq1 = 4 * homrar_ct1 + het_ct1;
9459 geno_ssq2 = 4 * homrar_ct2 + het_ct2;
9460
9461 if ((nanal1 > 2) && (nanal2 > 2)) {
9462 nanal_recip1 = 1.0 / ((int32_t)nanal1);
9463 nanal_recip2 = 1.0 / ((int32_t)nanal2);
9464 nanal_m1_recip1 = 1.0 / ((int32_t)(nanal1 - 1));
9465 nanal_m1_recip2 = 1.0 / ((int32_t)(nanal2 - 1));
9466 geno_mean1 = geno_sum1 * nanal_recip1;
9467 g_var1 = (geno_ssq1 - geno_sum1 * geno_mean1) * nanal_m1_recip1;
9468 geno_mean2 = geno_sum2 * nanal_recip2;
9469 g_var2 = (geno_ssq2 - geno_sum2 * geno_mean2) * nanal_m1_recip2;
9470 if ((g_var1 == 0) || (g_var2 == 0)) {
9471 goto gxe_assoc_nan_line;
9472 }
9473 qt_sum1 = base_pheno_sum_g1;
9474 qt_ssq1 = base_pheno_ssq_g1;
9475 qt_sum2 = base_pheno_sum_g2;
9476 qt_ssq2 = base_pheno_ssq_g2;
9477 qt_g_prod1 = 0;
9478 qt_g_prod2 = 0;
9479 sample_idx2_offset = 0;
9480 loadbuf_ptr = loadbuf;
9481 cgr_ptr = cur_group2_i2;
9482 do {
9483 ulmm = ~(*loadbuf_ptr++);
9484 if (sample_idx2_offset + BITCT2 > cur_sample_ct) {
9485 ulmm &= (ONELU << ((cur_sample_ct & (BITCT2 - 1)) * 2)) - ONELU;
9486 }
9487 if (ulmm) {
9488 ulnn = (*cgr_ptr) * 3;
9489 ulii = ulmm & (~ulnn);
9490 while (ulii) {
9491 uljj = CTZLU(ulii) & (BITCT - 2);
9492 ulkk = (ulii >> uljj) & 3;
9493 sample_idx2 = sample_idx2_offset + (uljj / 2);
9494 dxx = cur_pheno_d[sample_idx2];
9495 if (ulkk == 1) {
9496 // het
9497 qt_g_prod1 += dxx;
9498 } else if (ulkk == 3) {
9499 // hom rare
9500 qt_g_prod1 += 2 * dxx;
9501 } else {
9502 // missing
9503 qt_sum1 -= dxx;
9504 qt_ssq1 -= dxx * dxx;
9505 }
9506 ulii &= ~((3 * ONELU) << uljj);
9507 }
9508 ulii = ulmm & ulnn;
9509 while (ulii) {
9510 uljj = CTZLU(ulii) & (BITCT - 2);
9511 ulkk = (ulii >> uljj) & 3;
9512 sample_idx2 = sample_idx2_offset + (uljj / 2);
9513 dxx = cur_pheno_d[sample_idx2];
9514 if (ulkk == 1) {
9515 qt_g_prod2 += dxx;
9516 } else if (ulkk == 3) {
9517 qt_g_prod2 += 2 * dxx;
9518 } else {
9519 qt_sum2 -= dxx;
9520 qt_ssq2 -= dxx * dxx;
9521 }
9522 ulii &= ~((3 * ONELU) << uljj);
9523 }
9524 }
9525 cgr_ptr++;
9526 sample_idx2_offset += BITCT2;
9527 } while (sample_idx2_offset < cur_sample_ct);
9528 qt_var1 = (qt_ssq1 - (qt_sum1 * qt_sum1 * nanal_recip1)) * nanal_m1_recip1;
9529 qt_var2 = (qt_ssq2 - (qt_sum2 * qt_sum2 * nanal_recip2)) * nanal_m1_recip2;
9530 qt_g_covar1 = (qt_g_prod1 - (qt_sum1 * geno_mean1)) * nanal_m1_recip1;
9531 qt_g_covar2 = (qt_g_prod2 - (qt_sum2 * geno_mean2)) * nanal_m1_recip2;
9532 beta1 = qt_g_covar1 / g_var1;
9533 beta2 = qt_g_covar2 / g_var2;
9534 vbeta1 = (qt_var1 / g_var1 - (qt_g_covar1 * qt_g_covar1) / (g_var1 * g_var1)) / ((double)(((int32_t)nanal1) - 2));
9535
9536 vbeta2 = (qt_var2 / g_var2 - (qt_g_covar2 * qt_g_covar2) / (g_var2 * g_var2)) / ((double)(((int32_t)nanal2) - 2));
9537 if (vbeta1 + vbeta2 <= 0) {
9538 goto gxe_assoc_nan_line;
9539 }
9540 zval = (beta1 - beta2) / sqrt(vbeta1 + vbeta2);
9541 wptr = uint32toa_w8x(nanal1, ' ', wptr);
9542 wptr = dtoa_g_wxp4x(beta1, 10, ' ', wptr);
9543 wptr = dtoa_g_wxp4x(sqrt(vbeta1), 10, ' ', wptr);
9544 wptr = uint32toa_w8x(nanal2, ' ', wptr);
9545 wptr = dtoa_g_wxp4x(beta2, 10, ' ', wptr);
9546 wptr = dtoa_g_wxp4x(sqrt(vbeta2), 10, ' ', wptr);
9547 wptr = dtoa_g_wxp4x(zval, 8, ' ', wptr);
9548 dxx = chiprob_p(zval * zval, 1);
9549 wptr = dtoa_g_wxp4x(MAXV(dxx, output_min_p), 12, '\n', wptr);
9550 } else {
9551 gxe_assoc_nan_line:
9552 wptr = memcpya(wptr, " NA NA NA NA NA NA NA NA\n", 84);
9553 }
9554 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
9555 goto gxe_assoc_ret_WRITE_FAIL;
9556 }
9557 marker_uidx++;
9558 }
9559 if (pct < 100) {
9560 if (pct > 10) {
9561 putc_unlocked('\b', stdout);
9562 }
9563 printf("\b\b%u%%", pct);
9564 fflush(stdout);
9565 }
9566 }
9567 if (fclose_null(&outfile)) {
9568 goto gxe_assoc_ret_WRITE_FAIL;
9569 }
9570 if (pct >= 10) {
9571 putc_unlocked('\b', stdout);
9572 }
9573 fputs("\b\b", stdout);
9574 logprint("done.\n");
9575
9576 while (0) {
9577 gxe_assoc_ret_NOMEM:
9578 retval = RET_NOMEM;
9579 break;
9580 gxe_assoc_ret_OPEN_FAIL:
9581 retval = RET_OPEN_FAIL;
9582 break;
9583 gxe_assoc_ret_READ_FAIL:
9584 retval = RET_READ_FAIL;
9585 break;
9586 gxe_assoc_ret_WRITE_FAIL:
9587 retval = RET_WRITE_FAIL;
9588 break;
9589 gxe_assoc_ret_INVALID_CMDLINE:
9590 retval = RET_INVALID_CMDLINE;
9591 break;
9592 }
9593 bigstack_reset(bigstack_mark);
9594 fclose_cond(outfile);
9595 return retval;
9596 }
9597
void calc_git_missing(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __restrict__ loadbuf, uint32_t* perm_vecst, uint32_t* thread_wkspace) {
  // Simplified calc_git() for when we only need to distinguish between missing
  // and nonmissing.
  // loadbuf is a plain bitfield (one bit per sample, BITCT samples per word)
  // flagging missing genotypes; for each set bit, the corresponding sample's
  // permutation-membership column of perm_vecst is accumulated into tiered
  // 4-bit -> 8-bit -> 32-bit partial counters (gitv[0]/gitv[1]/gitv[2]),
  // flushed upward every 15 resp. 255 increments so no tier can saturate.
  // thread_wkspace[] is assumed to be zeroed out before this function is
  // called.
  uint32_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
#ifdef __LP64__
  uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
  uint32_t perm_ct128x4 = perm_ct128 * 4;
  uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
  __m128i* permsv = (__m128i*)perm_vecst;
  __m128i* gitv[3];
#else
  uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
  uint32_t perm_ct32x4 = perm_ct32 * 4;
  uint32_t perm_ct8 = (perm_vec_ct + 7) / 8;
  uint32_t perm_ct4 = (perm_vec_ct + 3) / 4;
  uintptr_t* permsv = (uintptr_t*)perm_vecst;
  uintptr_t* gitv[3];
#endif
  uint32_t cur_ct;
  uintptr_t ulii;
  uint32_t uii;
  uint32_t ujj;
#ifdef __LP64__
  // 4- and 8-bit partial counts
  gitv[0] = &(((__m128i*)thread_wkspace)[8 * perm_ct128x4]);
  gitv[1] = &(((__m128i*)thread_wkspace)[9 * perm_ct128x4]);
  gitv[2] = (__m128i*)thread_wkspace;
#else
  gitv[0] = (uintptr_t*)(&(thread_wkspace[8 * perm_ct32x4]));
  gitv[1] = (uintptr_t*)(&(thread_wkspace[9 * perm_ct32x4]));
  gitv[2] = (uintptr_t*)thread_wkspace;
#endif
  cur_ct = 0;
  for (uii = 0; uii < pheno_nm_ctl; uii++) {
    ulii = *loadbuf++;
    if (uii + 1 == pheno_nm_ctl) {
      // Mask off trailing garbage bits in the final word.  Each word of
      // loadbuf holds BITCT one-bit sample flags (see the
      // "permsv = &(permsv[BITCT * perm_ct128])" advance below), so the
      // partial-word bit count is pheno_nm_ct mod BITCT.  (Bugfix: this
      // previously masked with (BITCT2 - 1), silently discarding valid
      // samples whenever pheno_nm_ct % BITCT >= BITCT2.)
      ujj = pheno_nm_ct & (BITCT - 1);
      if (ujj) {
	ulii &= (ONELU << ujj) - ONELU;
      }
    }
    while (ulii) {
      ujj = CTZLU(ulii);
      cur_ct++;
#ifdef __LP64__
      unroll_incr_1_4(&(permsv[ujj * perm_ct128]), gitv[0], perm_ct128);
      if (!(cur_ct % 15)) {
	// flush 4-bit tier into 8-bit tier before it can overflow
	unroll_zero_incr_4_8(gitv[0], gitv[1], perm_ct32);
	if (!(cur_ct % 255)) {
	  unroll_zero_incr_8_32(gitv[1], gitv[2], perm_ct16);
	}
      }
#else
      unroll_incr_1_4(&(permsv[ujj * perm_ct32]), gitv[0], perm_ct32);
      if (!(cur_ct % 15)) {
	unroll_zero_incr_4_8(gitv[0], gitv[1], perm_ct8);
	if (!(cur_ct % 255)) {
	  unroll_zero_incr_8_32(gitv[1], gitv[2], perm_ct4);
	}
      }
#endif
      // clear lowest set bit
      ulii &= ulii - 1;
    }
#ifdef __LP64__
    permsv = &(permsv[BITCT * perm_ct128]);
#else
    permsv = &(permsv[BITCT * perm_ct32]);
#endif
  }
  // final flush of any residual partial counts into the 32-bit totals
#ifdef __LP64__
  if (cur_ct % 15) {
    unroll_incr_4_8(gitv[0], gitv[1], perm_ct32);
  }
  if (cur_ct % 255) {
    unroll_incr_8_32(gitv[1], gitv[2], perm_ct16);
  }
#else
  if (cur_ct % 15) {
    unroll_incr_4_8(gitv[0], gitv[1], perm_ct8);
  }
  if (cur_ct % 255) {
    unroll_incr_8_32(gitv[1], gitv[2], perm_ct4);
  }
#endif
}
9686
THREAD_RET_TYPE testmiss_adapt_thread(void* arg) {
  // Adaptive-permutation worker for --test-missing.  Each thread scores its
  // slice of the current marker block: for every permuted case/control
  // labeling it counts missing genotypes among permuted cases, accumulates a
  // doubled "as or more extreme than the original" success count, and
  // periodically applies the adaptive stopping rule (retire a marker once the
  // confidence interval around its empirical p-value excludes aperm_alpha).
  uintptr_t tidx = (uintptr_t)arg;
  uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
  uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS);
  uintptr_t perm_vec_ct = g_perm_vec_ct;
  uint32_t max_thread_ct = g_assoc_thread_ct;
  uint32_t pidx_offset = g_perms_done;
  uint32_t is_midp = g_fisher_midp;
  uint32_t first_adapt_check = g_first_adapt_check;
  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
  uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
  uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
  unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
  // this can be cached since testmiss() computes all raw p-values before
  // starting permutation test
  double* __restrict__ orig_pvals = g_orig_pvals;
  double adaptive_intercept = g_adaptive_intercept;
  double adaptive_slope = g_adaptive_slope;
  double adaptive_ci_zt = g_adaptive_ci_zt;
  double aperm_alpha = g_aperm_alpha;
  double stat_high = 0;
  double stat_low = 0;
  uint32_t missing_sum = 0;
  uint32_t nm_sum = 0;
  uint32_t* male_case_cts = nullptr;
  uintptr_t* __restrict__ loadbuf;
  uintptr_t* loadbuf_ptr;
  uint32_t* __restrict__ precomp_ui;
  uint32_t* __restrict__ missing_cts;
  uint32_t* gpui;
  uintptr_t marker_idx;
  uintptr_t pidx;
  uint32_t marker_bidx;
  uint32_t marker_bceil;
  uint32_t is_y;
  uint32_t valid_obs_ct;
  uint32_t success_2start;
  uint32_t success_2incr;
  uint32_t next_adapt_check;
  uint32_t missing_case_ct;
  uint32_t uii;
  double pval;
  double dxx;
  double dyy;
  double dzz;
  while (1) {
    // divide the markers of the current block evenly across worker threads
    if (g_block_diff <= max_thread_ct) {
      if (g_block_diff <= tidx) {
        goto testmiss_adapt_thread_skip_all;
      }
      marker_bidx = tidx;
      marker_bceil = tidx + 1;
    } else {
      marker_bidx = (((uint64_t)tidx) * g_block_diff) / max_thread_ct;
      marker_bceil = (((uint64_t)tidx + 1) * g_block_diff) / max_thread_ct;
    }
    is_y = 0;
    if (g_is_y) {
      valid_obs_ct = g_male_ct;
      if (valid_obs_ct != pheno_nm_ct) {
        is_y = 1; // if all male, can pretend as if this isn't Ychr
        male_case_cts = g_male_case_cts;
      }
    } else {
      valid_obs_ct = pheno_nm_ct;
    }
    loadbuf = g_loadbuf;
    precomp_ui = g_precomp_ui;
    missing_cts = g_missing_cts;
    for (; marker_bidx < marker_bceil; marker_bidx++) {
      marker_idx = g_adapt_m_table[marker_bidx];
      next_adapt_check = first_adapt_check;
      // 4 precomputed missing-case-count thresholds per marker
      gpui = &(precomp_ui[4 * marker_bidx]);
      if (is_y) {
        // chrY: case counts differ per permutation, so the precomputed
        // thresholds don't apply; compare Fisher p-values directly instead,
        // with epsilon slop around the original p-value for tie detection
        missing_sum = missing_cts[marker_idx];
        nm_sum = valid_obs_ct - missing_sum;
        stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
        stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
      }
      success_2start = perm_2success_ct[marker_idx];
      success_2incr = 0;
      loadbuf_ptr = &(loadbuf[marker_bidx * pheno_nm_ctv]);
      for (pidx = 0; pidx < perm_vec_ct;) {
        // number of missing calls falling on this permutation's cases
        missing_case_ct = popcount_longs_intersect(loadbuf_ptr, &(perm_vecs[pidx * pheno_nm_ctv]), pheno_nm_ctl);
        if (!is_y) {
          // gpui[0]/gpui[1]: strictly-more-extreme bounds (+2);
          // gpui[2]/gpui[3]: at-least-as-extreme (tie) bounds (+1)
          if (missing_case_ct < gpui[0]) {
            if (missing_case_ct < gpui[2]) {
              success_2incr += 2;
            } else {
              success_2incr++;
            }
          } else {
            if (missing_case_ct >= gpui[1]) {
              if (missing_case_ct >= gpui[3]) {
                success_2incr += 2;
              } else {
                success_2incr++;
              }
            }
          }
        } else {
          case_ct = male_case_cts[pidx];
          pval = fisher22(missing_case_ct, case_ct - missing_case_ct, missing_sum - missing_case_ct, nm_sum + missing_case_ct - case_ct, is_midp);
          if (pval < stat_low) {
            success_2incr += 2;
          } else if (pval <= stat_high) {
            success_2incr++;
          }
        }
        if (++pidx == next_adapt_check - pidx_offset) {
          // adaptive stopping check: estimate the empirical p-value so far
          // and retire this marker if its CI clearly excludes aperm_alpha
          uii = success_2start + success_2incr;
          if (uii) {
            pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
            dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
            dyy = pval - dxx; // CI lower bound
            dzz = pval + dxx; // CI upper bound
            if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
              perm_adapt_stop[marker_idx] = 1;
              perm_attempt_ct[marker_idx] = next_adapt_check;
              break;
            }
          }
          // schedule the next check further out as confidence grows
          next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
        }
      }
      perm_2success_ct[marker_idx] += success_2incr;
    }
  testmiss_adapt_thread_skip_all:
    if ((!tidx) || g_is_last_thread_block) {
      THREAD_RETURN;
    }
    THREAD_BLOCK_FINISH(tidx);
  }
}
9823
testmiss_maxt_thread(void * arg)9824 THREAD_RET_TYPE testmiss_maxt_thread(void* arg) {
9825 uintptr_t tidx = (uintptr_t)arg;
9826 uintptr_t perm_vec_ct = g_perm_vec_ct;
9827 uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
9828 uint32_t is_midp = g_fisher_midp;
9829 uint32_t max_thread_ct = g_assoc_thread_ct;
9830 uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
9831 uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS);
9832 uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
9833 uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
9834 uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 176]);
9835 uint32_t* __restrict__ perm_vecst = g_perm_vecst;
9836 uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
9837 double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
9838 double* __restrict__ orig_pvals = g_orig_pvals;
9839 double* msa_ptr = nullptr;
9840 uint32_t* male_case_cts = nullptr;
9841 uint32_t* gpui = nullptr;
9842 double* gpd = nullptr;
9843 double stat_high = 0;
9844 double stat_low = 0;
9845 uint32_t case_ct = g_perm_case_ct;
9846 uint32_t cur_case_ct = case_ct;
9847 uintptr_t* loadbuf;
9848 uintptr_t* loadbuf_ptr;
9849 uint32_t* precomp_ui;
9850 uint32_t* __restrict__ missing_cts;
9851 double* __restrict__ precomp_d;
9852 uintptr_t pidx;
9853 uintptr_t marker_idx;
9854 double pval;
9855 uint32_t marker_bidx_start;
9856 uint32_t marker_bidx;
9857 uint32_t marker_bceil;
9858 uint32_t is_y;
9859 uint32_t valid_obs_ct;
9860 uint32_t missing_sum;
9861 uint32_t nm_sum;
9862 uint32_t success_2incr;
9863 uint32_t missing_case_ct;
9864 uint32_t uii;
9865 uint32_t ujj;
9866 while (1) {
9867 if (g_block_diff <= max_thread_ct) {
9868 if (g_block_diff <= tidx) {
9869 goto testmiss_maxt_thread_skip_all;
9870 }
9871 marker_bidx_start = tidx;
9872 marker_bceil = tidx + 1;
9873 } else {
9874 marker_bidx_start = (((uint64_t)tidx) * g_block_diff) / max_thread_ct;
9875 marker_bceil = (((uint64_t)tidx + 1) * g_block_diff) / max_thread_ct;
9876 }
9877 marker_bidx = marker_bidx_start;
9878 marker_idx = g_maxt_block_base + marker_bidx_start;
9879 memcpy(results, &(g_maxt_extreme_stat[g_perms_done]), perm_vec_ct * sizeof(double));
9880 is_y = 0;
9881 if (g_is_y) {
9882 valid_obs_ct = g_male_ct;
9883 if (valid_obs_ct != pheno_nm_ct) {
9884 is_y = 1;
9885 male_case_cts = g_male_case_cts;
9886 precomp_ui = nullptr;
9887 }
9888 } else {
9889 valid_obs_ct = pheno_nm_ct;
9890 }
9891 loadbuf = g_loadbuf;
9892 missing_cts = g_missing_cts;
9893 precomp_d = g_precomp_d;
9894 if (g_mperm_save_all) {
9895 msa_ptr = &(g_mperm_save_all[marker_idx * perm_vec_ct]);
9896 precomp_ui = nullptr;
9897 } else {
9898 precomp_ui = g_precomp_ui;
9899 }
9900 for (; marker_bidx < marker_bceil; marker_bidx++) {
9901 missing_sum = missing_cts[marker_idx];
9902 nm_sum = valid_obs_ct - missing_sum;
9903 if (precomp_ui) {
9904 gpui = &(precomp_ui[6 * marker_bidx]);
9905 gpd = &(precomp_d[2 * marker_bidx]);
9906 } else {
9907 stat_high = orig_pvals[marker_idx] * (1.0 + EPSILON);
9908 stat_low = orig_pvals[marker_idx] * (1.0 - EPSILON);
9909 }
9910 loadbuf_ptr = &(loadbuf[marker_bidx * pheno_nm_ctv]);
9911 success_2incr = 0;
9912 fill_uint_zero(perm_ct128 * 176, thread_git_wkspace);
9913 calc_git_missing(pheno_nm_ct, perm_vec_ct, loadbuf_ptr, perm_vecst, thread_git_wkspace);
9914 for (pidx = 0; pidx < perm_vec_ct; pidx++) {
9915 missing_case_ct = thread_git_wkspace[pidx];
9916 if (precomp_ui) {
9917 if (missing_case_ct < gpui[0]) {
9918 if (missing_case_ct < gpui[2]) {
9919 success_2incr += 2;
9920 } else {
9921 success_2incr++;
9922 }
9923 } else {
9924 if (missing_case_ct >= gpui[1]) {
9925 if (missing_case_ct >= gpui[3]) {
9926 success_2incr += 2;
9927 } else {
9928 success_2incr++;
9929 }
9930 }
9931 }
9932 ujj = gpui[4];
9933 uii = (uint32_t)(missing_case_ct - ujj); // deliberate underflow
9934 if (uii >= gpui[5]) {
9935 pval = fisher22_tail_pval(ujj, missing_sum - ujj, case_ct - ujj, nm_sum + ujj - case_ct, gpui[5] - 1, gpd[0], gpd[1], is_midp, missing_case_ct);
9936 if (results[pidx] > pval) {
9937 results[pidx] = pval;
9938 }
9939 }
9940 } else {
9941 if (is_y) {
9942 cur_case_ct = male_case_cts[pidx];
9943 }
9944 pval = fisher22(missing_case_ct, missing_sum - missing_case_ct, cur_case_ct - missing_case_ct, nm_sum + missing_case_ct - cur_case_ct, is_midp);
9945 if (pval < stat_low) {
9946 success_2incr += 2;
9947 } else if (pval <= stat_high) {
9948 success_2incr++;
9949 }
9950 if (results[pidx] > pval) {
9951 results[pidx] = pval;
9952 }
9953 if (msa_ptr) {
9954 *msa_ptr++ = pval;
9955 }
9956 }
9957 }
9958 perm_2success_ct[marker_idx++] += success_2incr;
9959 }
9960 testmiss_maxt_thread_skip_all:
9961 if ((!tidx) || g_is_last_thread_block) {
9962 THREAD_RETURN;
9963 }
9964 THREAD_BLOCK_FINISH(tidx);
9965 }
9966 }
9967
testmiss(pthread_t * threads,FILE * bedfile,uintptr_t bed_offset,char * outname,char * outname_end,uint32_t testmiss_mperm_val,uint32_t testmiss_modifier,double pfilter,double output_min_p,uint32_t mtest_adjust,double adjust_lambda,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude_orig,uintptr_t marker_ct_orig,char * marker_ids,uintptr_t max_marker_id_len,uint32_t plink_maxsnp,Chrom_info * chrom_info_ptr,uintptr_t unfiltered_sample_ct,uint32_t cluster_ct,uint32_t * cluster_map,uint32_t * cluster_starts,Aperm_info * apip,uint32_t mperm_save,uint32_t pheno_nm_ct,uintptr_t * pheno_nm,uintptr_t * pheno_c,uintptr_t * sex_male,uint32_t hh_exists)9968 int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t testmiss_mperm_val, uint32_t testmiss_modifier, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, Aperm_info* apip, uint32_t mperm_save, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t hh_exists) {
9969 // Simple variant of model_assoc().
9970 unsigned char* bigstack_mark = g_bigstack_base;
9971 uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
9972 uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
9973 uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
9974 uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
9975 uintptr_t cur_sample_ctl = pheno_nm_ctl;
9976 uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS);
9977 uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
9978 uintptr_t marker_uidx = next_unset_unsafe(marker_exclude_orig, 0);
9979 double maxt_cur_extreme_stat = 0;
9980 FILE* outfile = nullptr;
9981 FILE* outfile_msa = nullptr;
9982 uintptr_t* sample_hh_include2 = nullptr;
9983 uintptr_t* sample_hh_male_include2 = nullptr;
9984 uintptr_t* pheno_male_nm2 = nullptr;
9985 uintptr_t* pheno_c_collapsed_male = nullptr;
9986 uintptr_t* sex_male_collapsed = nullptr;
9987 char* wptr_start = nullptr;
9988 char* tbuf2 = &(g_textbuf[MAXLINELEN]);
9989 uint32_t perm_adapt = testmiss_modifier & TESTMISS_PERM;
9990 uint32_t perm_maxt = testmiss_modifier & TESTMISS_MPERM;
9991 uint32_t perm_count = testmiss_modifier & TESTMISS_PERM_COUNT;
9992 uint32_t midp = testmiss_modifier & TESTMISS_MIDP;
9993 uint32_t do_perms = perm_adapt | perm_maxt;
9994 uint32_t perms_total = 0;
9995 uint32_t chrom_fo_idx = 0xffffffffU;
9996 uint32_t is_x = 0;
9997 // don't treat MT heterozygous call as missing
9998 uint32_t is_haploid = 0;
9999 uint32_t skip_y = 0;
10000 uint32_t cur_pheno_nm_ct = pheno_nm_ct;
10001 uint32_t case_ct = popcount_longs(pheno_c, unfiltered_sample_ctl);
10002 uint32_t ctrl_ct = pheno_nm_ct - case_ct;
10003 uint32_t male_ct = popcount_longs_intersect(sex_male, pheno_nm, unfiltered_sample_ctl);
10004 uint32_t case_ct_y = popcount_longs_intersect(sex_male, pheno_c, unfiltered_sample_ctl);
10005 uint32_t ctrl_ct_y = male_ct - case_ct_y;
10006 uint32_t cur_case_ct = case_ct;
10007 uint32_t cur_ctrl_ct = ctrl_ct;
10008 uint32_t chrom_end = 0;
10009 uint32_t mperm_dump_all = 0;
10010 uint32_t max_thread_ct = g_thread_ct;
10011 uintptr_t pheno_male_nm_ctl = BITCT_TO_WORDCT(male_ct);
10012 int32_t y_code = chrom_info_ptr->xymt_codes[Y_OFFSET];
10013 int32_t retval = 0;
10014 uint32_t uibuf[4];
10015 uintptr_t* loadbuf_raw;
10016 uintptr_t* pheno_nm2;
10017 uintptr_t* cur_pheno_nm2;
10018 uintptr_t* pheno_c_collapsed;
10019 uintptr_t* cur_pheno_c_collapsed;
10020 uintptr_t* missing_bitfield;
10021 uintptr_t* marker_exclude;
10022 uintptr_t* loadbuf_ptr;
10023 double* dptr;
10024 uint32_t* marker_idx_to_uidx;
10025 char* outname_end2;
10026 char* wptr;
10027 uintptr_t marker_uidx_end;
10028 uintptr_t marker_ct;
10029 uintptr_t marker_unstopped_ct;
10030 uintptr_t marker_idx;
10031 uintptr_t marker_idx2;
10032 uintptr_t block_size;
10033 uintptr_t block_end;
10034 uintptr_t perm_idx;
10035 uintptr_t ulii;
10036 double pval;
10037 double cur_case_ct_recip;
10038 double cur_ctrl_ct_recip;
10039 double dxx;
10040 double dyy;
10041 double dzz;
10042 uint32_t missing_ct;
10043 uint32_t marker_cidx;
10044 uint32_t is_last_block;
10045 uint32_t uii;
10046 uint32_t ujj;
10047 uint32_t ukk;
10048 uint32_t umm;
10049 if ((!case_ct) || (!ctrl_ct)) {
10050 logerrprint("Warning: Skipping --test-missing since at least one case and one control is\nrequired.\n");
10051 goto testmiss_ret_1;
10052 }
10053 cur_case_ct_recip = 1.0 / ((double)((int32_t)case_ct));
10054 cur_ctrl_ct_recip = 1.0 / ((double)((int32_t)ctrl_ct));
10055 // Y chromosome requires special handling--only male genotypes should be
10056 // considered.
10057 if ((y_code == -2) || (!is_set(chrom_info_ptr->chrom_mask, y_code))) {
10058 skip_y = 1;
10059 } else if ((!case_ct_y) || (!ctrl_ct_y)) {
10060 logerrprint("Warning: --test-missing is skipping Y chromosome since at least one male case\nand one male control are necessary.\n");
10061 skip_y = 1;
10062 }
10063 if (perm_maxt) {
10064 mperm_dump_all = mperm_save & MPERM_DUMP_ALL;
10065 perms_total = testmiss_mperm_val;
10066 if (bigstack_alloc_d(perms_total, &g_maxt_extreme_stat)) {
10067 goto testmiss_ret_NOMEM;
10068 }
10069 for (uii = 0; uii < perms_total; uii++) {
10070 g_maxt_extreme_stat[uii] = 1;
10071 }
10072 if (mperm_dump_all) {
10073 memcpy(outname_end, ".mperm.dump.all", 16);
10074 if (fopen_checked(outname, "w", &outfile_msa)) {
10075 goto testmiss_ret_OPEN_FAIL;
10076 }
10077 LOGPRINTFWW("Dumping all permutation p-values to %s .\n", outname);
10078 }
10079 } else {
10080 mperm_save = 0;
10081 if (perm_adapt) {
10082 g_aperm_alpha = apip->alpha;
10083 perms_total = apip->max;
10084 }
10085 }
10086 // Sites with no (or all) missing calls are now excluded from the permutation
10087 // test. Since it's likely that many such sites exist, we postpone the
10088 // associated memory allocations until after the basic .missing report is
10089 // generated.
10090 if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
10091 bigstack_alloc_ul(unfiltered_sample_ctl2, &pheno_nm2) ||
10092 bigstack_alloc_ul(pheno_nm_ctl, &pheno_c_collapsed) ||
10093 bigstack_alloc_ul(pheno_nm_ctl, &missing_bitfield) ||
10094 bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude)) {
10095 goto testmiss_ret_NOMEM;
10096 }
10097 memcpy(marker_exclude, marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t));
10098 loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
10099 init_quaterarr_from_bitarr(pheno_nm, unfiltered_sample_ct, pheno_nm2);
10100 cur_pheno_nm2 = pheno_nm2;
10101 copy_bitarr_subset(pheno_c, pheno_nm, unfiltered_sample_ct, pheno_nm_ct, pheno_c_collapsed);
10102 cur_pheno_c_collapsed = pheno_c_collapsed;
10103 if (!skip_y) {
10104 if (bigstack_alloc_ul(unfiltered_sample_ctl2, &pheno_male_nm2) ||
10105 bigstack_alloc_ul(pheno_male_nm_ctl, &pheno_c_collapsed_male)) {
10106 goto testmiss_ret_NOMEM;
10107 }
10108 // temporary non-excluded male bitfield
10109 memcpy(pheno_male_nm2, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
10110 bitvec_and(sex_male, unfiltered_sample_ctl, pheno_male_nm2);
10111 copy_bitarr_subset(pheno_c, pheno_male_nm2, unfiltered_sample_ct, male_ct, pheno_c_collapsed_male);
10112 memcpy(pheno_male_nm2, pheno_nm2, unfiltered_sample_ctl2 * sizeof(intptr_t));
10113 apply_bitarr_mask_to_quaterarr_01(sex_male, unfiltered_sample_ct, pheno_male_nm2);
10114 }
10115 outname_end2 = memcpyb(outname_end, ".missing", 9);
10116 if (fopen_checked(outname, "w", &outfile)) {
10117 goto testmiss_ret_OPEN_FAIL;
10118 }
10119 LOGPRINTFWW5("Writing --test-missing report to %s ... ", outname);
10120 fflush(stdout);
10121 sprintf(g_textbuf, " CHR %%%us F_MISS_A F_MISS_U P \n", plink_maxsnp);
10122 fprintf(outfile, g_textbuf, "SNP");
10123 if (ferror(outfile)) {
10124 goto testmiss_ret_WRITE_FAIL;
10125 }
10126 // technically this part could be even faster with some custom code, but not
10127 // worth the additional maintenance
10128 if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_exists, 1, pheno_nm, sex_male, &sample_hh_include2, &sample_hh_male_include2)) {
10129 goto testmiss_ret_NOMEM;
10130 }
10131 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10132 goto testmiss_ret_READ_FAIL;
10133 }
10134 chrom_end = 0;
10135 // must be last allocation
10136 if (bigstack_alloc_d(marker_ct_orig, &g_orig_pvals)) {
10137 goto testmiss_ret_NOMEM;
10138 }
10139 dptr = g_orig_pvals;
10140 for (marker_idx = 0; marker_idx < marker_ct_orig; marker_uidx++, marker_idx++) {
10141 if (IS_SET(marker_exclude_orig, marker_uidx)) {
10142 marker_uidx = next_unset_ul_unsafe(marker_exclude_orig, marker_uidx);
10143 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10144 goto testmiss_ret_NOMEM;
10145 }
10146 }
10147 if (marker_uidx >= chrom_end) {
10148 // exploit overflow
10149 chrom_fo_idx++;
10150 refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &g_is_y, &uii, &is_haploid);
10151 if (!skip_y) {
10152 if (!g_is_y) {
10153 cur_pheno_nm2 = pheno_nm2;
10154 cur_pheno_nm_ct = pheno_nm_ct;
10155 cur_sample_ctl = pheno_nm_ctl;
10156 cur_case_ct = case_ct;
10157 cur_ctrl_ct = ctrl_ct;
10158 cur_pheno_c_collapsed = pheno_c_collapsed;
10159 } else {
10160 cur_pheno_nm2 = pheno_male_nm2;
10161 cur_pheno_nm_ct = male_ct;
10162 cur_sample_ctl = pheno_male_nm_ctl;
10163 cur_case_ct = case_ct_y;
10164 cur_ctrl_ct = ctrl_ct_y;
10165 cur_pheno_c_collapsed = pheno_c_collapsed_male;
10166 }
10167 cur_case_ct_recip = 1.0 / ((double)((int32_t)cur_case_ct));
10168 cur_ctrl_ct_recip = 1.0 / ((double)((int32_t)cur_ctrl_ct));
10169 } else if (g_is_y) {
10170 fill_bits(marker_uidx, chrom_end - marker_uidx, marker_exclude);
10171 marker_idx += chrom_end - marker_uidx - 1 - popcount_bit_idx(marker_exclude_orig, marker_uidx, chrom_end);
10172 marker_uidx = chrom_end - 1;
10173 continue;
10174 }
10175 uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
10176 wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
10177 *wptr_start++ = ' ';
10178 }
10179 if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
10180 goto testmiss_ret_READ_FAIL;
10181 }
10182 if (is_haploid && hh_exists) {
10183 haploid_fix(hh_exists, sample_hh_include2, sample_hh_male_include2, unfiltered_sample_ct, is_x, g_is_y, (unsigned char*)loadbuf_raw);
10184 }
10185 extract_collapsed_missing_bitfield(loadbuf_raw, unfiltered_sample_ct, cur_pheno_nm2, cur_pheno_nm_ct, missing_bitfield);
10186 missing_ct = popcount_longs(missing_bitfield, cur_sample_ctl);
10187 if ((!missing_ct) || (missing_ct == cur_pheno_nm_ct)) {
10188 SET_BIT(marker_uidx, marker_exclude);
10189 continue;
10190 }
10191 uii = popcount_longs_intersect(missing_bitfield, cur_pheno_c_collapsed, cur_sample_ctl);
10192 ujj = missing_ct - uii;
10193 pval = fisher22(uii, ujj, cur_case_ct - uii, cur_ctrl_ct - ujj, midp);
10194 *dptr++ = pval;
10195 if (!(pval <= pfilter)) {
10196 continue;
10197 }
10198 wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
10199 *wptr++ = ' ';
10200 wptr = dtoa_g_wxp4x(((int32_t)uii) * cur_case_ct_recip, 12, ' ', wptr);
10201 wptr = dtoa_g_wxp4x(((int32_t)ujj) * cur_ctrl_ct_recip, 12, ' ', wptr);
10202 wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, '\n', wptr);
10203 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
10204 goto testmiss_ret_WRITE_FAIL;
10205 }
10206 }
10207 if (fclose_null(&outfile)) {
10208 goto testmiss_ret_WRITE_FAIL;
10209 }
10210 logprint("done.\n");
10211 marker_ct = (uintptr_t)(dptr - g_orig_pvals);
10212 bigstack_shrink_top(g_orig_pvals, marker_ct * sizeof(double));
10213 if (mtest_adjust) {
10214 if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
10215 goto testmiss_ret_NOMEM;
10216 }
10217 fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
10218 retval = multcomp(outname, outname_end, marker_idx_to_uidx, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, nullptr, pfilter, output_min_p, mtest_adjust, 1, 0.0, nullptr, g_orig_pvals);
10219 if (retval) {
10220 goto testmiss_ret_1;
10221 }
10222 }
10223 if (do_perms) {
10224 if (!marker_ct) {
10225 logprint("Skipping --test-missing permutation test since all loci are degenerate.\n");
10226 goto testmiss_ret_1;
10227 }
10228 LOGPRINTF("Including %" PRIuPTR " loc%s in --test-missing permutation test.\n", marker_ct, (marker_ct == 1)? "us" : "i");
10229 if (mperm_dump_all) {
10230 g_textbuf[0] = '0';
10231 wptr = &(g_textbuf[1]);
10232 for (uii = 0; uii < marker_ct; uii++) {
10233 *wptr++ = ' ';
10234 dxx = g_orig_pvals[uii];
10235 if (dxx >= 0) {
10236 wptr = dtoa_g(dxx, wptr);
10237 } else {
10238 wptr = memcpya(wptr, "NA", 2);
10239 }
10240 if (wptr >= tbuf2) {
10241 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
10242 goto testmiss_ret_WRITE_FAIL;
10243 }
10244 wptr = g_textbuf;
10245 }
10246 }
10247 *wptr++ = '\n';
10248 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
10249 goto testmiss_ret_WRITE_FAIL;
10250 }
10251 }
10252
10253 if (!skip_y) {
10254 // maybe all Y chromosome markers had no missing calls?
10255 uii = get_chrom_start_vidx(chrom_info_ptr, (uint32_t)y_code);
10256 ujj = get_chrom_end_vidx(chrom_info_ptr, (uint32_t)y_code);
10257 if (popcount_bit_idx(marker_exclude, uii, ujj) == ujj - uii) {
10258 skip_y = 1;
10259 } else {
10260 if (bigstack_alloc_ul(pheno_nm_ctl, &sex_male_collapsed)) {
10261 goto testmiss_ret_NOMEM;
10262 }
10263 copy_bitarr_subset(sex_male, pheno_nm, unfiltered_sample_ct, pheno_nm_ct, sex_male_collapsed);
10264 }
10265 }
10266
10267 if (cluster_starts) {
10268 retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 1, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, &g_perm_cluster_case_cts, &g_perm_cluster_cc_preimage);
10269 if (retval) {
10270 goto testmiss_ret_1;
10271 }
10272 if (!g_perm_cluster_ct) {
10273 logerrprint("Error: No size 2+ clusters for permutation test.\n");
10274 goto testmiss_ret_INVALID_CMDLINE;
10275 }
10276 retval = cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs);
10277 if (retval) {
10278 goto testmiss_ret_1;
10279 }
10280 } else {
10281 g_perm_cluster_starts = nullptr;
10282 }
10283 if (max_thread_ct > perms_total) {
10284 max_thread_ct = perms_total;
10285 }
10286 if (bigstack_init_sfmtp(max_thread_ct)) {
10287 goto testmiss_ret_NOMEM;
10288 }
10289 if (bigstack_alloc_ul(MODEL_BLOCKSIZE * pheno_nm_ctv, &g_loadbuf) ||
10290 bigstack_calloc_ui(marker_ct, &g_perm_2success_ct) ||
10291 bigstack_alloc_ui(marker_ct, &g_missing_cts)) {
10292 goto testmiss_ret_NOMEM;
10293 }
10294 for (uii = 1; uii <= MODEL_BLOCKSIZE; uii++) {
10295 g_loadbuf[uii * pheno_nm_ctv - 2] = 0;
10296 g_loadbuf[uii * pheno_nm_ctv - 1] = 0;
10297 }
10298 uii = marker_ct;
10299 if (perm_maxt) {
10300 if (!mperm_dump_all) {
10301 if (bigstack_alloc_ui(6 * MODEL_BLOCKSIZE, &g_precomp_ui) ||
10302 bigstack_alloc_d(2 * MODEL_BLOCKSIZE, &g_precomp_d)) {
10303 goto testmiss_ret_NOMEM;
10304 }
10305 }
10306 } else {
10307 if (bigstack_alloc_ui(marker_ct, &g_perm_attempt_ct) ||
10308 bigstack_calloc_uc(round_up_pow2(marker_ct, BYTECT), &g_perm_adapt_stop) ||
10309 bigstack_alloc_ui(4 * MODEL_BLOCKSIZE, &g_precomp_ui)) {
10310 goto testmiss_ret_NOMEM;
10311 }
10312 for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
10313 g_perm_attempt_ct[marker_idx] = perms_total;
10314 }
10315 g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
10316 }
10317 if (!cluster_starts) {
10318 g_perm_tot_quotient = 0x100000000LLU / pheno_nm_ct;
10319 magic_num(g_perm_tot_quotient, &g_perm_totq_magic, &g_perm_totq_preshift, &g_perm_totq_postshift, &g_perm_totq_incr);
10320 }
10321 marker_unstopped_ct = marker_ct;
10322 g_perm_is_1bit = 1;
10323 g_perms_done = 0;
10324 g_perm_pheno_nm_ct = pheno_nm_ct;
10325 g_perm_case_ct = case_ct;
10326 g_male_ct = male_ct;
10327 g_fisher_midp = midp;
10328 g_mperm_save_all = nullptr;
10329 // ----- begin main loop -----
10330 testmiss_more_perms:
10331 if (perm_adapt) {
10332 if (g_perms_done) {
10333 while (g_first_adapt_check <= g_perms_done) {
10334 g_first_adapt_check += (int32_t)(apip->init_interval + ((int32_t)g_first_adapt_check) * apip->interval_slope);
10335 }
10336 } else {
10337 if (apip->min < apip->init_interval) {
10338 g_first_adapt_check = (int32_t)(apip->init_interval);
10339 } else {
10340 g_first_adapt_check = apip->min;
10341 }
10342 g_adaptive_intercept = apip->init_interval;
10343 g_adaptive_slope = apip->interval_slope;
10344 }
10345 g_perm_vec_ct = (bigstack_left() - CACHELINE + sizeof(int32_t)) / (pheno_nm_ctv * sizeof(intptr_t) + (1 - skip_y) * sizeof(int32_t));
10346 } else {
10347 // g_perm_vec_ct memory allocation dependencies:
10348 // g_maxt_thread_results: (8 * g_perm_vec_ct, cacheline-aligned) *
10349 // max_thread_ct
10350 // g_perm_vecst: 16 * ((g_perm_vec_ct + 127) / 128) * pheno_nm_ct
10351 // g_thread_git_wkspace: ((perm_vec_ct + 127) / 128) * 704 *
10352 // max_thread_ct
10353 // g_perm_vecs: pheno_nm_ctv * sizeof(intptr_t) * g_perm_vec_ct
10354 // g_male_case_cts (if needed): sizeof(int32_t) * g_perm_vec_ct
10355 // g_mperm_save_all (if needed): marker_ct * 8 * g_perm_vec_ct
10356 // Forcing g_perm_vec_ct to be a multiple of 128, total is
10357 // g_perm_vec_ct * (13.5 * max_thread_ct + pheno_nm_ct / 8 + 4 +
10358 // sizeof(intptr_t) * pheno_nm_ctv
10359 // [+ marker_ct * sizeof(double) * mperm_save_all])
10360 if (mperm_dump_all) {
10361 g_perm_vec_ct = 128 * (bigstack_left() / (128 * sizeof(intptr_t) * pheno_nm_ctv + 1728LL * max_thread_ct + 16LL * pheno_nm_ct + 512 * (1 - skip_y) + 128LL * sizeof(double) * marker_ct));
10362 } else {
10363 g_perm_vec_ct = 128 * (bigstack_left() / (128 * sizeof(intptr_t) * pheno_nm_ctv + 1728LL * max_thread_ct + 16LL * pheno_nm_ct + 512 * (1 - skip_y)));
10364 }
10365 }
10366 if (g_perm_vec_ct > perms_total - g_perms_done) {
10367 g_perm_vec_ct = perms_total - g_perms_done;
10368 } else if (!g_perm_vec_ct) {
10369 goto testmiss_ret_NOMEM;
10370 }
10371 bigstack_alloc_ul(g_perm_vec_ct * pheno_nm_ctv, &g_perm_vecs);
10372 g_perm_generation_thread_ct = MINV(max_thread_ct, g_perm_vec_ct);
10373 ulii = 0;
10374 if (!cluster_starts) {
10375 if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
10376 goto testmiss_ret_THREAD_CREATE_FAIL;
10377 }
10378 generate_cc_perms_thread((void*)ulii);
10379 } else {
10380 if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
10381 goto testmiss_ret_THREAD_CREATE_FAIL;
10382 }
10383 generate_cc_cluster_perms_thread((void*)ulii);
10384 }
10385 join_threads(threads, g_perm_generation_thread_ct);
10386 g_assoc_thread_ct = max_thread_ct;
10387 if (perm_maxt) {
10388 bigstack_alloc_d(max_thread_ct * round_up_pow2(g_perm_vec_ct, CACHELINE_DBL), &g_maxt_thread_results);
10389 #ifdef __LP64__
10390 ulii = ((g_perm_vec_ct + 127) / 128) * 4;
10391 bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
10392 #else
10393 ulii = (g_perm_vec_ct + 31) / 32;
10394 bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
10395 ulii = ((g_perm_vec_ct + 127) / 128) * 4; // force 64-byte align
10396 #endif
10397 bigstack_calloc_ui(ulii * 44 * max_thread_ct, &g_thread_git_wkspace);
10398 transpose_perm1s(g_perm_vecs, g_perm_vec_ct, pheno_nm_ct, g_perm_vecst);
10399 if (mperm_dump_all) {
10400 bigstack_alloc_d(marker_ct * g_perm_vec_ct, &g_mperm_save_all);
10401 }
10402 }
10403 if (!skip_y) {
10404 bigstack_alloc_ui(g_perm_vec_ct, &g_male_case_cts);
10405 for (perm_idx = 0; perm_idx < g_perm_vec_ct; perm_idx++) {
10406 g_male_case_cts[perm_idx] = popcount_longs_intersect(sex_male_collapsed, &(g_perm_vecs[perm_idx * pheno_nm_ctv]), pheno_nm_ctl);
10407 }
10408 }
10409 chrom_fo_idx = 0xffffffffU;
10410 marker_uidx = next_unset_unsafe(marker_exclude, 0);
10411 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10412 goto testmiss_ret_READ_FAIL;
10413 }
10414 marker_idx = 0;
10415 marker_idx2 = 0;
10416 chrom_end = 0;
10417 // only forced to terminate block at Y chromosome boundaries
10418 if (!skip_y) {
10419 marker_uidx_end = get_chrom_start_vidx(chrom_info_ptr, (uint32_t)y_code);
10420 pheno_male_nm_ctl = round_up_pow2(pheno_male_nm_ctl, 2);
10421 } else {
10422 marker_uidx_end = unfiltered_marker_ct;
10423 }
10424 do {
10425 block_size = 0;
10426 block_end = marker_unstopped_ct - marker_idx;
10427 if ((!marker_idx) && perm_maxt) {
10428 if (block_end > MODEL_BLOCKKEEP) {
10429 block_end = MODEL_BLOCKKEEP;
10430 }
10431 } else if (block_end > MODEL_BLOCKSIZE) {
10432 block_end = MODEL_BLOCKSIZE;
10433 }
10434 if (marker_uidx >= marker_uidx_end) {
10435 marker_uidx_end = get_chrom_end_vidx(chrom_info_ptr, (uint32_t)y_code);
10436 if (marker_uidx >= marker_uidx_end) {
10437 marker_uidx_end = unfiltered_marker_ct;
10438 }
10439 }
10440 do {
10441 if (perm_adapt && g_perm_adapt_stop[marker_idx2]) {
10442 do {
10443 marker_uidx++;
10444 next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
10445 marker_idx2++;
10446 } while ((marker_uidx < marker_uidx_end) && g_perm_adapt_stop[marker_idx2]);
10447 if (marker_uidx >= marker_uidx_end) {
10448 break;
10449 }
10450 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10451 goto testmiss_ret_READ_FAIL;
10452 }
10453 }
10454 if (marker_uidx >= chrom_end) {
10455 // exploit overflow
10456 chrom_fo_idx++;
10457 refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &g_is_x, &g_is_y, &uii, &is_haploid);
10458 if (!g_is_y) {
10459 g_perm_case_ct = case_ct;
10460 } else {
10461 g_perm_case_ct = case_ct_y;
10462 }
10463 }
10464 if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
10465 goto testmiss_ret_READ_FAIL;
10466 }
10467 if (is_haploid && hh_exists) {
10468 haploid_fix(hh_exists, sample_hh_include2, sample_hh_male_include2, unfiltered_sample_ct, is_x, g_is_y, (unsigned char*)loadbuf_raw);
10469 }
10470 loadbuf_ptr = &(g_loadbuf[block_size * pheno_nm_ctv]);
10471 extract_collapsed_missing_bitfield(loadbuf_raw, unfiltered_sample_ct, pheno_nm2, pheno_nm_ct, loadbuf_ptr);
10472 if (g_is_y) {
10473 bitvec_and(sex_male_collapsed, pheno_nm_ctl, loadbuf_ptr);
10474 }
10475 if (!g_perms_done) {
10476 missing_ct = popcount_longs(loadbuf_ptr, pheno_nm_ctl);
10477 if (perm_adapt) {
10478 g_missing_cts[marker_idx2] = missing_ct;
10479 } else {
10480 g_missing_cts[marker_idx + block_size] = missing_ct;
10481 }
10482 }
10483 if (perm_adapt) {
10484 g_adapt_m_table[block_size] = marker_idx2++;
10485 }
10486 block_size++;
10487 if (marker_idx + block_size == marker_unstopped_ct) {
10488 break;
10489 }
10490 marker_uidx++;
10491 if (IS_SET(marker_exclude, marker_uidx)) {
10492 marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
10493 if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10494 goto testmiss_ret_READ_FAIL;
10495 }
10496 }
10497 } while ((block_size < block_end) && (marker_uidx < marker_uidx_end));
10498 g_block_diff = block_size;
10499 if ((!mperm_dump_all) && ((!g_is_y) || (male_ct == pheno_nm_ct))) {
10500 if (perm_maxt) {
10501 maxt_cur_extreme_stat = g_maxt_extreme_stat[0];
10502 for (uii = 1; uii < g_perm_vec_ct; uii++) {
10503 dxx = g_maxt_extreme_stat[uii];
10504 if (dxx > maxt_cur_extreme_stat) {
10505 maxt_cur_extreme_stat = dxx;
10506 }
10507 }
10508 }
10509 // need raw p-values for --mperm-save-all
10510 // valid case/control counts differ between permutations on Y
10511 // chromosome, and I won't bother with g_precomp_width just for that
10512 for (uii = 0; uii < block_size; uii++) {
10513 if (perm_adapt) {
10514 marker_cidx = g_adapt_m_table[uii];
10515 } else {
10516 marker_cidx = marker_idx + uii;
10517 }
10518 pval = g_orig_pvals[marker_cidx];
10519 missing_ct = g_missing_cts[marker_cidx];
10520 if (perm_adapt) {
10521 fisher22_precomp_pval_bounds(pval, midp, case_ct, missing_ct, pheno_nm_ct, &(g_precomp_ui[uii * 4]), nullptr);
10522 } else {
10523 fisher22_precomp_pval_bounds(pval, midp, case_ct, missing_ct, pheno_nm_ct, &(g_precomp_ui[uii * 6]), nullptr);
10524 fisher22_precomp_pval_bounds(maxt_cur_extreme_stat, midp, case_ct, missing_ct, pheno_nm_ct, uibuf, &(g_precomp_d[uii * 2]));
10525 g_precomp_ui[uii * 6 + 4] = uibuf[2];
10526 g_precomp_ui[uii * 6 + 5] = uibuf[3] - uibuf[2];
10527 }
10528 }
10529 }
10530 ulii = 0;
10531 is_last_block = (marker_idx + block_size >= marker_unstopped_ct);
10532 if (perm_adapt) {
10533 if (spawn_threads2(threads, &testmiss_adapt_thread, max_thread_ct, is_last_block)) {
10534 goto testmiss_ret_THREAD_CREATE_FAIL;
10535 }
10536 testmiss_adapt_thread((void*)ulii);
10537 join_threads2(threads, max_thread_ct, is_last_block);
10538 } else {
10539 g_maxt_block_base = marker_idx;
10540 if (spawn_threads2(threads, &testmiss_maxt_thread, max_thread_ct, is_last_block)) {
10541 goto testmiss_ret_THREAD_CREATE_FAIL;
10542 }
10543 testmiss_maxt_thread((void*)ulii);
10544 join_threads2(threads, max_thread_ct, is_last_block);
10545 ulii = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
10546 umm = block_size;
10547 if (umm > max_thread_ct) {
10548 umm = max_thread_ct;
10549 }
10550 for (uii = 0; uii < max_thread_ct; uii++) {
10551 dptr = &(g_maxt_thread_results[uii * ulii]);
10552 ujj = g_perms_done;
10553 ukk = ujj + g_perm_vec_ct;
10554 for (; ujj < ukk; ujj++) {
10555 dxx = *dptr++;
10556 if (dxx < g_maxt_extreme_stat[ujj]) {
10557 g_maxt_extreme_stat[ujj] = dxx;
10558 }
10559 }
10560 }
10561 }
10562 marker_idx += block_size;
10563 } while (marker_idx < marker_unstopped_ct);
10564 if (mperm_dump_all) {
10565 if (g_perms_done) {
10566 putc_unlocked(' ', stdout);
10567 }
10568 fputs("[dumping stats]", stdout);
10569 fflush(stdout);
10570 ulii = g_perm_vec_ct;
10571 ujj = 1 + g_perms_done;
10572 wptr = g_textbuf;
10573 tbuf2 = &(g_textbuf[MAXLINELEN]);
10574 for (uii = 0; uii < ulii; uii++) {
10575 wptr = uint32toa(uii + ujj, wptr);
10576 dptr = &(g_mperm_save_all[uii]);
10577 for (ukk = 0; ukk < marker_ct; ukk++) {
10578 *wptr++ = ' ';
10579 dxx = dptr[ukk * ulii];
10580 if (dxx >= 0) {
10581 wptr = dtoa_g(dxx, wptr);
10582 } else {
10583 wptr = memcpya(wptr, "NA", 2);
10584 }
10585 if (wptr >= tbuf2) {
10586 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
10587 goto testmiss_ret_WRITE_FAIL;
10588 }
10589 wptr = g_textbuf;
10590 }
10591 }
10592 *wptr++ = '\n';
10593 }
10594 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
10595 goto testmiss_ret_WRITE_FAIL;
10596 }
10597 fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b ", stdout);
10598 }
10599 // really should postpone this for --assoc/--model too
10600 g_perms_done += g_perm_vec_ct;
10601 bigstack_reset(g_perm_vecs);
10602 if (g_perms_done < perms_total) {
10603 if (perm_adapt) {
10604 marker_unstopped_ct = marker_ct - popcount01_longs((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
10605 if (!marker_unstopped_ct) {
10606 goto testmiss_adapt_perm_count;
10607 }
10608 }
10609 printf("\r%u permutation%s complete.", g_perms_done, (g_perms_done != 1)? "s" : "");
10610 fflush(stdout);
10611 goto testmiss_more_perms;
10612 }
10613 if (perm_adapt) {
10614 testmiss_adapt_perm_count:
10615 g_perms_done = 0;
10616 for (ulii = 0; ulii < marker_ct; ulii++) {
10617 if (g_perm_attempt_ct[ulii] > g_perms_done) {
10618 g_perms_done = g_perm_attempt_ct[ulii];
10619 if (g_perms_done == perms_total) {
10620 break;
10621 }
10622 }
10623 }
10624 }
10625 putc_unlocked('\r', stdout);
10626 LOGPRINTF("%u %s permutation%s complete.\n", g_perms_done, perm_maxt? "max(T)" : "(adaptive)", (g_perms_done != 1)? "s" : "");
10627 if (perm_adapt) {
10628 memcpy(outname_end2, ".perm", 6);
10629 } else {
10630 if (mperm_save & MPERM_DUMP_BEST) {
10631 ulii = outname_end - outname;
10632 memcpy(outname_end, ".mperm.dump.best", 17);
10633 LOGPRINTFWW("Dumping best permutation p-values to %s .\n", outname);
10634 if (fopen_checked(outname, "w", &outfile)) {
10635 goto testmiss_ret_OPEN_FAIL;
10636 }
10637 dxx = 1.0;
10638 for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
10639 if (g_orig_pvals[marker_idx] < dxx) {
10640 dxx = g_orig_pvals[marker_idx];
10641 }
10642 }
10643 memcpy(g_textbuf, "0 ", 2);
10644 wptr = dtoa_gx(dxx, '\n', &(g_textbuf[2]));
10645 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
10646 goto testmiss_ret_WRITE_FAIL;
10647 }
10648 for (uii = 0; uii < perms_total; uii++) {
10649 wptr = uint32toa_x(uii + 1, ' ', g_textbuf);
10650 wptr = dtoa_gx(g_maxt_extreme_stat[uii], '\n', wptr);
10651 if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
10652 goto testmiss_ret_WRITE_FAIL;
10653 }
10654 }
10655 if (fclose_null(&outfile)) {
10656 goto testmiss_ret_WRITE_FAIL;
10657 }
10658 memcpy(outname_end, ".missing", 8);
10659 }
10660 memcpy(outname_end2, ".mperm", 7);
10661 }
10662 if (fopen_checked(outname, "w", &outfile)) {
10663 goto testmiss_ret_OPEN_FAIL;
10664 }
10665 if (perm_adapt) {
10666 sprintf(g_textbuf, " CHR %%%us EMP1 NP \n", plink_maxsnp);
10667 } else {
10668 sprintf(g_textbuf, " CHR %%%us EMP1 EMP2 \n", plink_maxsnp);
10669 #ifdef __cplusplus
10670 std::sort(g_maxt_extreme_stat, &(g_maxt_extreme_stat[perms_total]));
10671 #else
10672 qsort(g_maxt_extreme_stat, perms_total, sizeof(double), double_cmp);
10673 #endif
10674 }
10675 /*
10676 if (perm_maxt) {
10677 printf("extreme stats: %g %g\n", g_maxt_extreme_stat[0], g_maxt_extreme_stat[perms_total - 1]);
10678 }
10679 */
10680 fprintf(outfile, g_textbuf, "SNP");
10681 chrom_fo_idx = 0xffffffffU;
10682 marker_uidx = next_unset_unsafe(marker_exclude, 0);
10683 marker_idx = 0;
10684 dyy = 1.0 / ((double)((int32_t)perms_total + 1));
10685 dxx = 0.5 * dyy;
10686 while (1) {
10687 do {
10688 chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
10689 } while (marker_uidx >= chrom_end);
10690 uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
10691 wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
10692 *wptr_start++ = ' ';
10693 wptr_start[plink_maxsnp] = ' ';
10694 for (; marker_uidx < chrom_end;) {
10695 if (perm_adapt) {
10696 pval = ((double)(g_perm_2success_ct[marker_idx] + 2)) / ((double)(2 * (g_perm_attempt_ct[marker_idx] + 1)));
10697 } else {
10698 pval = ((double)(g_perm_2success_ct[marker_idx] + 2)) * dxx;
10699 }
10700 if (pval <= pfilter) {
10701 fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
10702 wptr = &(wptr_start[1 + plink_maxsnp]);
10703 if (!perm_count) {
10704 wptr = dtoa_g_wxp4x(pval, 12, ' ', wptr);
10705 } else {
10706 wptr = dtoa_g_wxp4x(((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ', wptr);
10707 }
10708 if (perm_adapt) {
10709 wptr = memseta(wptr, 32, 2);
10710 wptr = uint32toa_w10(g_perm_attempt_ct[marker_idx], wptr);
10711 } else {
10712 // minimum p-value
10713 dzz = (int32_t)(doublearr_greater_than(g_maxt_extreme_stat, perms_total, g_orig_pvals[marker_idx] * (1.0 + EPSILON)) + 1);
10714 if (!perm_count) {
10715 wptr = dtoa_g_wxp4(dzz * dyy, 12, wptr);
10716 } else {
10717 wptr = dtoa_g_wxp4(dzz - 1, 12, wptr);
10718 }
10719 }
10720 wptr = memcpya(wptr, " \n", 2);
10721 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
10722 goto testmiss_ret_WRITE_FAIL;
10723 }
10724 }
10725 if (++marker_idx == marker_ct) {
10726 goto testmiss_loop_end;
10727 }
10728 marker_uidx++;
10729 next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
10730 }
10731 }
10732 testmiss_loop_end:
10733 if (fclose_null(&outfile)) {
10734 goto testmiss_ret_WRITE_FAIL;
10735 }
10736 LOGPRINTFWW("Permutation test report written to %s .\n", outname);
10737 }
10738
10739 while (0) {
10740 testmiss_ret_NOMEM:
10741 retval = RET_NOMEM;
10742 break;
10743 testmiss_ret_OPEN_FAIL:
10744 retval = RET_OPEN_FAIL;
10745 break;
10746 testmiss_ret_READ_FAIL:
10747 retval = RET_READ_FAIL;
10748 break;
10749 testmiss_ret_WRITE_FAIL:
10750 retval = RET_WRITE_FAIL;
10751 break;
10752 testmiss_ret_INVALID_CMDLINE:
10753 retval = RET_INVALID_CMDLINE;
10754 break;
10755 testmiss_ret_THREAD_CREATE_FAIL:
10756 retval = RET_THREAD_CREATE_FAIL;
10757 break;
10758 }
10759 testmiss_ret_1:
10760 bigstack_reset(bigstack_mark);
10761 fclose_cond(outfile);
10762 fclose_cond(outfile_msa);
10763 return retval;
10764 }
10765
cluster_assoc_init(const char * flag_name,uintptr_t unfiltered_sample_ct,uintptr_t * pheno_nm,uintptr_t * pheno_c,uintptr_t * sex_male,uint32_t cluster_ct,uint32_t * cluster_map,uint32_t * cluster_starts,uintptr_t * cluster_bitfield,uintptr_t ** pheno_nm_11_ptr,uintptr_t ** pheno_nm_nonmale_11_ptr,uintptr_t ** pheno_nm_male_11_ptr,uint32_t ** sample_to_cluster_pheno_ptr,uint32_t ** cluster_pheno_gtots_ptr,uint32_t ** cur_cluster_pheno_gtots_ptr,uint32_t ** cluster_geno_cts_ptr,uintptr_t ** loadbuf_raw_ptr,uint32_t * cluster_ct2_ptr)10766 int32_t cluster_assoc_init(const char* flag_name, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uintptr_t* cluster_bitfield, uintptr_t** pheno_nm_11_ptr, uintptr_t** pheno_nm_nonmale_11_ptr, uintptr_t** pheno_nm_male_11_ptr, uint32_t** sample_to_cluster_pheno_ptr, uint32_t** cluster_pheno_gtots_ptr, uint32_t** cur_cluster_pheno_gtots_ptr, uint32_t** cluster_geno_cts_ptr, uintptr_t** loadbuf_raw_ptr, uint32_t* cluster_ct2_ptr) {
10767 uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
10768 uint32_t cluster_ct2 = 0;
10769 uint32_t sample_ct = 0;
10770 uint32_t cluster_end = 0;
10771 uint32_t case_ct_total = 0;
10772 uint32_t is_mh2 = (flag_name[4] == '2'); // yeah, this is a hack
10773 uintptr_t* pheno_nm_nonmale_11 = nullptr;
10774 uintptr_t* pheno_nm_male_11 = nullptr;
10775 uintptr_t* pheno_nm_11;
10776 uint32_t* sample_to_cluster_pheno;
10777 uint32_t* cluster_pheno_gtots;
10778 uint32_t cluster_idx;
10779 uint32_t sample_uidx;
10780 uint32_t ctrl_ct;
10781 uint32_t case_ct;
10782 uint32_t ctrl_male_ct;
10783 uint32_t case_male_ct;
10784 uint32_t uii;
10785 uint32_t ujj;
10786 uint32_t ukk;
10787 if (cluster_ct < 2) {
10788 LOGERRPRINTF("Error: %s requires at least two valid clusters.\n", flag_name);
10789 return RET_INVALID_CMDLINE;
10790 }
10791 // 1. Identify clusters with at least one case and one control, and create
10792 // new cluster data structures describing only these.
10793 // 2. Main loop efficiently skips homozygous A2s via use of CTZLU, and skips
10794 // samples not in a valid cluster via application of the pheno_nm_11
10795 // bitmask. sample_to_cluster_pheno[] maps sample_uidx to (valid) cluster
10796 // index (high 31 bits) and case/control status (low bit).
10797 if (bigstack_calloc_ul(unfiltered_sample_ctl2, pheno_nm_11_ptr) ||
10798 bigstack_alloc_ul(unfiltered_sample_ctl2, pheno_nm_nonmale_11_ptr) ||
10799 bigstack_calloc_ul(unfiltered_sample_ctl2, pheno_nm_male_11_ptr) ||
10800 bigstack_alloc_ui(unfiltered_sample_ct, sample_to_cluster_pheno_ptr) ||
10801 bigstack_alloc_ui(cluster_ct * 4, cluster_pheno_gtots_ptr)) {
10802 return RET_NOMEM;
10803 }
10804 pheno_nm_11 = *pheno_nm_11_ptr;
10805 pheno_nm_nonmale_11 = *pheno_nm_nonmale_11_ptr;
10806 pheno_nm_male_11 = *pheno_nm_male_11_ptr;
10807 sample_to_cluster_pheno = *sample_to_cluster_pheno_ptr;
10808 cluster_pheno_gtots = *cluster_pheno_gtots_ptr;
10809 for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
10810 uii = cluster_end;
10811 cluster_end = cluster_starts[cluster_idx + 1];
10812 for (; uii < cluster_end; uii++) {
10813 sample_uidx = cluster_map[uii];
10814 if (is_set(pheno_nm, sample_uidx)) {
10815 if (is_mh2) {
10816 goto cluster_assoc_init_valid;
10817 }
10818 if (is_set(pheno_c, sample_uidx)) {
10819 // we have a case, check for a control
10820 while (++uii < cluster_end) {
10821 sample_uidx = cluster_map[uii];
10822 if (is_set(pheno_nm, sample_uidx) && (!is_set(pheno_c, sample_uidx))) {
10823 goto cluster_assoc_init_valid;
10824 }
10825 }
10826 continue;
10827 } else {
10828 // we have a control, check for a case
10829 while (++uii < cluster_end) {
10830 sample_uidx = cluster_map[uii];
10831 if (is_set(pheno_c, sample_uidx)) {
10832 goto cluster_assoc_init_valid;
10833 }
10834 }
10835 continue;
10836 }
10837 }
10838 }
10839 continue;
10840 cluster_assoc_init_valid:
10841 for (uii = cluster_starts[cluster_idx], ctrl_ct = 0, ctrl_male_ct = 0, case_ct = 0, case_male_ct = 0; uii < cluster_end; uii++) {
10842 sample_uidx = cluster_map[uii];
10843 if (is_set(pheno_nm, sample_uidx)) {
10844 pheno_nm_11[sample_uidx / BITCT2] |= (3 * ONELU) << (2 * (sample_uidx % BITCT2));
10845 ukk = is_set(sex_male, sample_uidx);
10846 if (ukk) {
10847 pheno_nm_male_11[sample_uidx / BITCT2] |= (3 * ONELU) << (2 * (sample_uidx % BITCT2));
10848 }
10849 ujj = is_set(pheno_c, sample_uidx);
10850 sample_to_cluster_pheno[sample_uidx] = 2 * cluster_ct2 + ujj;
10851 if (ujj) {
10852 case_ct++;
10853 case_male_ct += ukk;
10854 } else {
10855 ctrl_ct++;
10856 ctrl_male_ct += ukk;
10857 }
10858 }
10859 }
10860 cluster_pheno_gtots[4 * cluster_ct2] = ctrl_ct;
10861 cluster_pheno_gtots[4 * cluster_ct2 + 1] = ctrl_male_ct;
10862 cluster_pheno_gtots[4 * cluster_ct2 + 2] = case_ct;
10863 cluster_pheno_gtots[4 * cluster_ct2 + 3] = case_male_ct;
10864 sample_ct += ctrl_ct + case_ct;
10865 case_ct_total += case_ct;
10866 if (cluster_bitfield) {
10867 SET_BIT(cluster_idx, cluster_bitfield);
10868 }
10869 cluster_ct2++;
10870 }
10871 bitvec_andnot_copy(pheno_nm_11, pheno_nm_male_11, unfiltered_sample_ctl2, pheno_nm_nonmale_11);
10872 bigstack_shrink_top(cluster_pheno_gtots, cluster_ct2 * 4 * sizeof(int32_t));
10873 if (bigstack_alloc_ui(cluster_ct2 * 2, cur_cluster_pheno_gtots_ptr) ||
10874 bigstack_alloc_ui(cluster_ct2 * 4, cluster_geno_cts_ptr) ||
10875 bigstack_alloc_ul(unfiltered_sample_ctl2, loadbuf_raw_ptr)) {
10876 return RET_NOMEM;
10877 }
10878 if (cluster_ct2 < 2) {
10879 LOGERRPRINTF("Error: %s requires at least two valid clusters.\n", flag_name);
10880 return RET_INVALID_CMDLINE;
10881 } else if (sample_ct >= 0x40000000) {
10882 // silly, but I'll document this
10883 LOGERRPRINTF("Error: %s does not support >= 2^30 samples.\n", flag_name);
10884 return RET_INVALID_CMDLINE;
10885 }
10886 LOGPRINTF("%s: %u valid clusters, with a total of %u cases and %u controls.\n", flag_name, cluster_ct2, case_ct_total, sample_ct - case_ct_total);
10887 (*loadbuf_raw_ptr)[unfiltered_sample_ctl2 - 1] = 0;
10888 *cluster_ct2_ptr = cluster_ct2;
10889 return 0;
10890 }
10891
// Loads the genotype row for the next non-excluded marker from the .bed file
// and tallies per-cluster, per-phenotype A1-allele and missing-genotype
// counts into cluster_geno_cts (4 entries per cluster:
// {ctrl A1 obs, ctrl missing obs, case A1 obs, case missing obs}, in
// "observation" units: 2 per diploid genotype, 1 per haploid genotype).
// When marker_uidx crosses into a new chromosome, this also refreshes
// *chrom_end_ptr / *chrom_fo_idx_ptr and the ploidy flags, rewrites
// cur_cluster_pheno_gtots[] with ploidy-appropriate per-cluster
// {ctrl, case} observation totals, and regenerates the printable chromosome
// name (fixed-width form when chrom_name_len_ptr is non-null; tab-terminated
// form for --mh2 otherwise).
// Returns 0 on success, RET_READ_FAIL on any seek/read failure.
cluster_assoc_load_one(FILE * bedfile,uintptr_t bed_offset,uintptr_t * marker_exclude,uintptr_t unfiltered_sample_ct,uintptr_t * sample_hh_include2,uintptr_t * sample_hh_male_include2,uintptr_t * loadbuf_raw,uintptr_t * pheno_nm_11,uintptr_t * pheno_nm_nonmale_11,uintptr_t * pheno_nm_male_11,uintptr_t * marker_reverse,Chrom_info * chrom_info_ptr,uint32_t hh_or_mt_exists,char * chrom_name_buf,uint32_t cluster_ct2,uint32_t * sample_to_cluster_pheno,uint32_t * cluster_pheno_gtots,uint32_t * cur_cluster_pheno_gtots,uint32_t * cluster_geno_cts,uintptr_t * marker_uidx_ptr,uint32_t * chrom_end_ptr,uint32_t * chrom_fo_idx_ptr,uint32_t * min_ploidy_1_ptr,uint32_t * is_x_ptr,uint32_t * is_y_ptr,char ** chrom_name_pp,uint32_t * chrom_name_len_ptr)10892 int32_t cluster_assoc_load_one(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_exclude, uintptr_t unfiltered_sample_ct, uintptr_t* sample_hh_include2, uintptr_t* sample_hh_male_include2, uintptr_t* loadbuf_raw, uintptr_t* pheno_nm_11, uintptr_t* pheno_nm_nonmale_11, uintptr_t* pheno_nm_male_11, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uint32_t hh_or_mt_exists, char* chrom_name_buf, uint32_t cluster_ct2, uint32_t* sample_to_cluster_pheno, uint32_t* cluster_pheno_gtots, uint32_t* cur_cluster_pheno_gtots, uint32_t* cluster_geno_cts, uintptr_t* marker_uidx_ptr, uint32_t* chrom_end_ptr, uint32_t* chrom_fo_idx_ptr, uint32_t* min_ploidy_1_ptr, uint32_t* is_x_ptr, uint32_t* is_y_ptr, char** chrom_name_pp, uint32_t* chrom_name_len_ptr) {
10893   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
10894   uintptr_t marker_uidx = *marker_uidx_ptr;
10895   uint32_t min_ploidy_1 = *min_ploidy_1_ptr;
10896   uintptr_t cur_word;
10897   uintptr_t* ulptr;
10898   uintptr_t* ulptr2;
10899   uint32_t chrom_fo_idx;
10900   uint32_t chrom_end;
10901   uint32_t chrom_idx;
10902   uint32_t cpidx;
10903   uint32_t sample_uidx_base;
10904   uint32_t sample_uidx;
10905   uint32_t uii;
10906   uint32_t ujj;
// Skip past excluded markers; a skip requires an explicit seek since the
// caller otherwise reads rows sequentially.
10907   if (IS_SET(marker_exclude, marker_uidx)) {
10908     marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
10909     *marker_uidx_ptr = marker_uidx;
10910     if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
10911       return RET_READ_FAIL;
10912     }
10913   }
// Chromosome transition: update ploidy/sex-chromosome flags and recompute
// the per-cluster {ctrl, case} observation totals for this ploidy regime.
10914   if (marker_uidx >= (*chrom_end_ptr)) {
10915     chrom_fo_idx = *chrom_fo_idx_ptr;
10916     do {
10917       chrom_end = chrom_info_ptr->chrom_fo_vidx_start[(++chrom_fo_idx) + 1U];
10918     } while (marker_uidx >= chrom_end);
10919     *chrom_end_ptr = chrom_end;
10920     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
// MT is treated like the haploid chromosomes here.
10921     min_ploidy_1 = is_set(chrom_info_ptr->haploid_mask, chrom_idx) || (chrom_idx == (uint32_t)chrom_info_ptr->xymt_codes[MT_OFFSET]);
10922     *chrom_fo_idx_ptr = chrom_fo_idx;
10923     *min_ploidy_1_ptr = min_ploidy_1;
10924     *is_x_ptr = (chrom_idx == (uint32_t)chrom_info_ptr->xymt_codes[X_OFFSET]);
10925     *is_y_ptr = (chrom_idx == (uint32_t)chrom_info_ptr->xymt_codes[Y_OFFSET]);
// cluster_pheno_gtots stores {ctrl ct, ctrl male ct, case ct, case male ct}
// per cluster; cur_cluster_pheno_gtots gets the interleaved {ctrl, case}
// observation totals appropriate for this chromosome's ploidy:
//   autosome: 2 per sample; X: 2 per nonmale + 1 per male (2n - male);
//   Y: males only, 1 each; other haploid/MT: 1 per sample.
10926     if (!min_ploidy_1) {
10927       for (cpidx = 0; cpidx < 2 * cluster_ct2; cpidx++) {
10928         cur_cluster_pheno_gtots[cpidx] = 2 * cluster_pheno_gtots[cpidx * 2];
10929       }
10930     } else if (*is_x_ptr) {
10931       for (cpidx = 0; cpidx < 2 * cluster_ct2; cpidx++) {
10932         cur_cluster_pheno_gtots[cpidx] = 2 * cluster_pheno_gtots[cpidx * 2] - cluster_pheno_gtots[cpidx * 2 + 1];
10933       }
10934     } else if (*is_y_ptr) {
10935       for (cpidx = 0; cpidx < 2 * cluster_ct2; cpidx++) {
10936         cur_cluster_pheno_gtots[cpidx] = cluster_pheno_gtots[cpidx * 2 + 1];
10937       }
10938     } else {
10939       for (cpidx = 0; cpidx < 2 * cluster_ct2; cpidx++) {
10940         cur_cluster_pheno_gtots[cpidx] = cluster_pheno_gtots[cpidx * 2];
10941       }
10942     }
10943     if (chrom_name_len_ptr) {
10944       *chrom_name_pp = chrom_name_buf5w4write(chrom_info_ptr, chrom_idx, chrom_name_len_ptr, chrom_name_buf);
10945     } else {
10946       // --mh2
10947       // chrom_name_buf = g_textbuf in this case, and we return wptr_start
10948       *chrom_name_pp = chrom_name_write(chrom_info_ptr, chrom_idx, chrom_name_buf);
10949       *(*chrom_name_pp)++ = '\t';
10950     }
10951   }
10952   if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
10953     return RET_READ_FAIL;
10954   }
// Flip the genotype encoding when A1/A2 are reversed for this marker, so the
// counts below are always in terms of the reported A1 allele.
10955   if (IS_SET(marker_reverse, marker_uidx)) {
10956     reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf_raw);
10957   }
// On haploid/MT chromosomes, heterozygous calls are invalid; haploid_fix()
// rewrites them (e.g. to missing) before counting.
10958   if (min_ploidy_1 && hh_or_mt_exists) {
10959     haploid_fix(hh_or_mt_exists, sample_hh_include2, sample_hh_male_include2, unfiltered_sample_ct, *is_x_ptr, *is_y_ptr, (unsigned char*)loadbuf_raw);
10960   }
10961   fill_uint_zero(4 * cluster_ct2, cluster_geno_cts);
10962   ulptr = loadbuf_raw;
10963   ulptr2 = pheno_nm_11;
// Diploid pass (autosomes, and nonmales on X).  The ..._11 masks hold 11 in
// each included sample's 2-bit slot, so
//   cur_word = (~genotype) & mask
// is nonzero exactly at non-hom-A2 included genotypes:
//   ujj == 3 -> hom A1, ujj == 2 -> missing, ujj == 1 -> het.
10964   if ((!min_ploidy_1) || (*is_x_ptr)) {
10965     if (*is_x_ptr) {
10966       ulptr2 = pheno_nm_nonmale_11;
10967     }
10968     for (sample_uidx_base = 0; sample_uidx_base < unfiltered_sample_ct; sample_uidx_base += BITCT2) {
10969       cur_word = (~(*ulptr++)) & (*ulptr2++);
10970       while (cur_word) {
// Round the trailing-zero count down to an even bit offset (2-bit genotypes).
10971         uii = CTZLU(cur_word) & (BITCT - 2);
10972         ujj = (cur_word >> uii) & 3;
10973         sample_uidx = sample_uidx_base + (uii / 2);
// cpidx = 2 * cluster_idx + case_bit (see cluster_assoc_init), so counts land
// in cluster_geno_cts[cluster*4 + case*2 + {0 = A1, 1 = missing}].
10974         cpidx = sample_to_cluster_pheno[sample_uidx];
10975         // this does the following branchlessly:
10976         // 1. increment A1 count by one for heterozygous calls (ujj == 1)
10977         // 2. increment missing count by two when ujj == 2
10978         // 3. increment A1 count by two when ujj == 3
10979         cluster_geno_cts[cpidx * 2 + (ujj == 2)] += 2 - (ujj == 1);
// Clear this genotype's 2-bit slot and continue scanning the word.
10980         cur_word &= ~((3 * ONELU) << uii);
10981       }
10982     }
10983   }
// Haploid pass: one observation per sample.  On X/Y only males are counted
// here (nonmales on X were handled by the diploid pass above); other haploid
// chromosomes/MT use the full pheno_nm_11 mask.  After haploid_fix, only
// hom A1 (ujj == 3) and missing (ujj == 2) remain.
10984   if (min_ploidy_1) {
10985     ulptr = loadbuf_raw;
10986     if ((*is_x_ptr) || (*is_y_ptr)) {
10987       ulptr2 = pheno_nm_male_11;
10988     }
10989     for (sample_uidx_base = 0; sample_uidx_base < unfiltered_sample_ct; sample_uidx_base += BITCT2) {
10990       cur_word = (~(*ulptr++)) & (*ulptr2++);
10991       while (cur_word) {
10992         uii = CTZLU(cur_word) & (BITCT - 2);
10993         ujj = (cur_word >> uii) & 3;
10994         sample_uidx = sample_uidx_base + (uii / 2);
10995         cpidx = sample_to_cluster_pheno[sample_uidx];
10996         // increments A1 count by one, or missing count by one
10997         cluster_geno_cts[cpidx * 2 + 3 - ujj] += 1;
10998         cur_word &= ~((3 * ONELU) << uii);
10999       }
11000     }
11001   }
11002   return 0;
11003 }
11004
// --mh/--bd: Cochran-Mantel-Haenszel 2x2xK association test, stratified over
// the K valid clusters, with optional Breslow-Day test of odds-ratio
// homogeneity ('bd' modifier).  Writes one row per marker to {outname}.cmh:
// CHR, SNP, BP, A1, MAF, A2, the 1-df CMH chi-square and p-value, the
// Mantel-Haenszel pooled odds ratio with its standard error
// (Robins-Breslow-Greenland estimator) and confidence interval, and, with
// 'bd', the Breslow-Day chi-square and p-value.  Supports --adjust via
// multcomp().  Permutation testing ('perm'/'mperm') is not yet implemented
// and returns RET_INVALID_CMDLINE.
// Returns 0 on success, or a RET_* error code.
cmh_assoc(pthread_t * threads,FILE * bedfile,uintptr_t bed_offset,char * outname,char * outname_end,uint32_t cmh_mperm_val,uint32_t cmh_modifier,double ci_size,double pfilter,double output_min_p,uint32_t mtest_adjust,double adjust_lambda,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude,uintptr_t marker_ct,char * marker_ids,uintptr_t max_marker_id_len,uint32_t plink_maxsnp,uint32_t * marker_pos,char ** marker_allele_ptrs,uintptr_t * marker_reverse,Chrom_info * chrom_info_ptr,double * set_allele_freqs,uintptr_t unfiltered_sample_ct,uint32_t cluster_ct,uint32_t * cluster_map,uint32_t * cluster_starts,Aperm_info * apip,uint32_t mperm_save,uint32_t pheno_nm_ct,uintptr_t * pheno_nm,uintptr_t * pheno_c,uintptr_t * sex_male,uint32_t hh_or_mt_exists,Set_info * sip)11005 int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t cmh_mperm_val, uint32_t cmh_modifier, double ci_size, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, Aperm_info* apip, uint32_t mperm_save, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t hh_or_mt_exists, Set_info* sip) {
11006   unsigned char* bigstack_mark = g_bigstack_base;
11007   FILE* outfile = nullptr;
11008   FILE* outfile_msa = nullptr;
11009   uintptr_t* sample_hh_include2 = nullptr;
11010   uintptr_t* sample_hh_male_include2 = nullptr;
11011   uint32_t* orig_df = nullptr;
11012   char* chrom_name_ptr = nullptr;
11013   uint32_t breslow_day = cmh_modifier & CLUSTER_CMH_BD;
11014   uint32_t perm_bd = cmh_modifier & CLUSTER_CMH_PERM_BD;
11015   uint32_t chrom_fo_idx = 0xffffffffU; // deliberate overflow
11016   uint32_t chrom_end = 0;
11017   uint32_t chrom_name_len = 0;
11018   uint32_t pct = 0;
11019   uint32_t min_ploidy_1 = 0;
11020   uint32_t is_x = 0;
11021   uint32_t is_y = 0;
11022   int32_t retval = 0;
11023   char chrom_name_buf[3 + MAX_CHROM_TEXTNUM_SLEN];
11024   uintptr_t* pheno_nm_11;
11025   uintptr_t* pheno_nm_nonmale_11;
11026   uintptr_t* pheno_nm_male_11;
11027   uintptr_t* loadbuf_raw;
11028   double* orig_chisq;
11029   double* dptr;
11030   char* wptr;
11031   uint32_t* sample_to_cluster_pheno;
11032   uint32_t* cluster_pheno_gtots;
11033   uint32_t* cur_cluster_pheno_gtots;
11034   uint32_t* cluster_geno_cts;
11035   uint32_t* marker_idx_to_uidx;
11036   uint32_t* uiptr;
11037   uintptr_t marker_uidx;
11038   uintptr_t marker_idx;
11039   double ci_zt;
11040   double allele_ct_recip;
11041   double allele_ctm1_recip;
11042   double ctrl_ctd;
11043   double case_ctd;
11044   double ctrl_a1_ctd;
11045   double ctrl_a2_ctd;
11046   double case_a1_ctd;
11047   double case_a2_ctd;
11048   double a1_ctd;
11049   double a2_ctd;
11050   double mean_case_a1d;
11051   double var_case_a1d;
11052   double cmh_stat;
11053   double cmh_denom;
11054   double r2;
11055   double s2;
11056   double rtot;
11057   double stot;
11058   double v1;
11059   double v2;
11060   double v3;
11061   double odds_ratio;
11062   double se;
11063   double log_or;
11064   double pval;
11065   double one_minus_odds_ratio;
11066   double double_1mor_recip;
11067   double bdx2;
11068   double amax;
11069   double bb;
11070   double discrim;
11071   double as_plus;
11072   double as_minus;
11073   double a_star;
11074   double b_star;
11075   double c_star;
11076   double d_star;
11077   double dxx;
11078   double dyy;
11079   uint32_t cluster_idx;
11080   uint32_t loop_end;
11081   uint32_t ctrl_ct;
11082   uint32_t case_ct;
11083   uint32_t cluster_ct2;
11084   uint32_t allele_ct;
11085   uint32_t uii;
11086   int32_t cur_df;
11087
11088   // The best data structures for permutation testing are somewhat different
11089   // from those for the single-pass computation, so we separate the logic.
11090
// Build the per-cluster bitmasks and {ctrl, case, male} totals shared by all
// cluster-stratified association tests.
11091   retval = cluster_assoc_init("--mh/--bd", unfiltered_sample_ct, pheno_nm, pheno_c, sex_male, cluster_ct, cluster_map, cluster_starts, nullptr, &pheno_nm_11, &pheno_nm_nonmale_11, &pheno_nm_male_11, &sample_to_cluster_pheno, &cluster_pheno_gtots, &cur_cluster_pheno_gtots, &cluster_geno_cts, &loadbuf_raw, &cluster_ct2);
11092   if (retval) {
11093     goto cmh_assoc_ret_1;
11094   }
11095   if (breslow_day && (cluster_ct2 > 10) && (!perm_bd)) {
11096     logerrprint("Warning: Breslow-Day statistics are unreliable with a large number of small\nclusters. You may want to look at empirical p-values from the 'perm-bd'\nadaptive permutation test.\n");
11097   }
11098
11099   memcpy(outname_end, ".cmh", 5);
11100   if (fopen_checked(outname, "w", &outfile)) {
11101     goto cmh_assoc_ret_OPEN_FAIL;
11102   }
11103   if (ci_size == 0.0) {
11104     ci_size = 0.95;
11105   }
// Two-sided normal quantile for the requested CI width.
11106   ci_zt = ltqnorm(1 - (1 - ci_size) / 2);
11107   LOGPRINTFWW5("Writing report to %s ... ", outname);
11108   fputs("0%", stdout);
11109   fflush(stdout);
// Header line; %%%us reserves plink_maxsnp columns for the SNP ID field.
11110   sprintf(g_textbuf, " CHR %%%us BP A1 MAF A2 CHISQ P OR SE ", plink_maxsnp);
11111   fprintf(outfile, g_textbuf, "SNP");
// CI bound column labels, e.g. L95/U95; EPSILON guards the double->int cast.
11112   uii = (uint32_t)((int32_t)(ci_size * (100 + EPSILON)));
11113   if (uii >= 10) {
11114     fprintf(outfile, "L%u U%u ", uii, uii);
11115   } else {
11116     fprintf(outfile, " L%u U%u ", uii, uii);
11117   }
11118   if (breslow_day) {
11119     fputs(" CHISQ_BD P_BD ", outfile);
11120   }
11121   if (putc_checked('\n', outfile)) {
11122     goto cmh_assoc_ret_WRITE_FAIL;
11123   }
11124   if ((chrom_info_ptr->xymt_codes[MT_OFFSET] != -2) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->xymt_codes[MT_OFFSET])) {
11125     hh_or_mt_exists |= NXMHH_EXISTS;
11126   }
11127   if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_or_mt_exists, 1, pheno_nm, sex_male, &sample_hh_include2, &sample_hh_male_include2)) {
11128     goto cmh_assoc_ret_NOMEM;
11129   }
// orig_chisq stores per-marker CMH statistics for --adjust (and, eventually,
// permutation testing); -9 marks an invalid statistic.
11130   if (bigstack_alloc_d(marker_ct, &orig_chisq)) {
11131     goto cmh_assoc_ret_NOMEM;
11132   }
11133   if (perm_bd) {
11134     if (bigstack_alloc_ui(marker_ct, &orig_df)) {
11135       goto cmh_assoc_ret_NOMEM;
11136     }
11137   }
11138   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
11139     goto cmh_assoc_ret_READ_FAIL;
11140   }
11141   dptr = orig_chisq;
11142   loop_end = marker_ct / 100;
// Main per-marker loop: load counts, accumulate CMH / MH-OR / RBG-variance
// terms across clusters, then format the output row.
11143   for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
11144     if (cluster_assoc_load_one(bedfile, bed_offset, marker_exclude, unfiltered_sample_ct, sample_hh_include2, sample_hh_male_include2, loadbuf_raw, pheno_nm_11, pheno_nm_nonmale_11, pheno_nm_male_11, marker_reverse, chrom_info_ptr, hh_or_mt_exists, chrom_name_buf, cluster_ct2, sample_to_cluster_pheno, cluster_pheno_gtots, cur_cluster_pheno_gtots, cluster_geno_cts, &marker_uidx, &chrom_end, &chrom_fo_idx, &min_ploidy_1, &is_x, &is_y, &chrom_name_ptr, &chrom_name_len)) {
11145       goto cmh_assoc_ret_READ_FAIL;
11146     }
11147     cmh_stat = 0.0;
11148     cmh_denom = 0.0;
11149     rtot = 0.0;
11150     stot = 0.0;
11151     v1 = 0.0;
11152     v2 = 0.0;
11153     v3 = 0.0;
// Each cluster contributes a 2x2 table of nonmissing observation counts.
// ctrl_ct/case_ct here are nonmissing observations (totals minus missing).
11154     for (cluster_idx = 0, uiptr = cluster_geno_cts; cluster_idx < cluster_ct2; cluster_idx++, uiptr = &(uiptr[4])) {
11155       ctrl_ct = cur_cluster_pheno_gtots[cluster_idx * 2] - uiptr[1];
11156       case_ct = cur_cluster_pheno_gtots[cluster_idx * 2 + 1] - uiptr[3];
11157       // skip cluster if all controls missing, or all cases missing
11158       if (ctrl_ct && case_ct) {
11159         allele_ct = ctrl_ct + case_ct;
11160         allele_ct_recip = 1.0 / ((double)((int32_t)allele_ct));
11161         allele_ctm1_recip = 1.0 / ((double)((int32_t)(allele_ct - 1)));
11162         ctrl_ctd = (double)((int32_t)ctrl_ct);
11163         case_ctd = (double)((int32_t)case_ct);
11164         ctrl_a1_ctd = (double)((int32_t)uiptr[0]);
11165         ctrl_a2_ctd = ctrl_ctd - ctrl_a1_ctd;
11166         case_a1_ctd = (double)((int32_t)uiptr[2]);
11167         case_a2_ctd = case_ctd - case_a1_ctd;
11168         a1_ctd = ctrl_a1_ctd + case_a1_ctd;
11169         a2_ctd = ctrl_a2_ctd + case_a2_ctd;
// Expected case-A1 count and its hypergeometric variance under H0.
11170         mean_case_a1d = case_ctd * a1_ctd * allele_ct_recip;
11171         var_case_a1d = ctrl_ctd * case_ctd * a1_ctd * a2_ctd * allele_ct_recip * allele_ct_recip * allele_ctm1_recip;
11172         cmh_stat += case_a1_ctd - mean_case_a1d;
11173         cmh_denom += var_case_a1d;
// r2/s2: Mantel-Haenszel odds-ratio numerator/denominator terms (ad/n, bc/n);
// v1/v2/v3: Robins-Breslow-Greenland variance components for log(OR_MH).
11174         r2 = case_a1_ctd * ctrl_a2_ctd * allele_ct_recip;
11175         s2 = case_a2_ctd * ctrl_a1_ctd * allele_ct_recip;
11176         rtot += r2;
11177         stot += s2;
11178         v1 += allele_ct_recip * r2 * (case_a1_ctd + ctrl_a2_ctd);
11179         v2 += allele_ct_recip * s2 * (case_a2_ctd + ctrl_a1_ctd);
11180         v3 += allele_ct_recip * ((case_a1_ctd + ctrl_a2_ctd) * s2 + (case_a2_ctd + ctrl_a1_ctd) * r2);
11181       }
11182     }
// CMH statistic: [sum(observed - expected)]^2 / sum(variance), 1 df,
// no continuity correction.
11183     cmh_stat *= cmh_stat / cmh_denom;
11184     odds_ratio = rtot / stot;
// Robins-Breslow-Greenland SE of log(OR_MH).
11185     se = sqrt(v1 / (2 * rtot * rtot) + v2 / (2 * stot * stot) + v3 / (2 * rtot * stot));
11186     log_or = log(odds_ratio);
11187     pval = chiprob_p(cmh_stat, 1);
// Store -9 as the "invalid statistic" sentinel for --adjust/permutation.
11188     if (cmh_stat >= 0.0) {
11189       *dptr++ = cmh_stat;
11190     } else {
11191       *dptr++ = -9;
11192     }
11193     if ((pfilter == 2.0) || ((pval <= pfilter) && (pval != -9))) {
11194       wptr = memcpyax(g_textbuf, chrom_name_ptr, chrom_name_len, ' ');
11195       wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr);
11196       *wptr++ = ' ';
11197       wptr = uint32toa_w10x(marker_pos[marker_uidx], ' ', wptr);
11198       if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
11199         goto cmh_assoc_ret_WRITE_FAIL;
11200       }
11201       fputs_w4(marker_allele_ptrs[marker_uidx * 2], outfile);
11202       g_textbuf[0] = ' ';
// MAF column: set_allele_freqs stores the A2 frequency, so print 1 - freq.
11203       wptr = dtoa_g_wxp4x(1.0 - set_allele_freqs[marker_uidx], 8, ' ', &(g_textbuf[1]));
11204       if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
11205         goto cmh_assoc_ret_WRITE_FAIL;
11206       }
11207       fputs_w4(marker_allele_ptrs[marker_uidx * 2 + 1], outfile);
// realnum() filters NaN/inf from degenerate tables (e.g. cmh_denom == 0).
11208       if (realnum(cmh_stat)) {
11209         g_textbuf[0] = ' ';
11210         wptr = dtoa_g_wxp4x(cmh_stat, 10, ' ', &(g_textbuf[1]));
11211         wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 10, ' ', wptr);
11212       } else {
11213         wptr = memcpya(g_textbuf, " NA NA ", 23);
11214       }
11215       if (realnum(odds_ratio)) {
11216         wptr = dtoa_g_wxp4x(odds_ratio, 10, ' ', wptr);
11217       } else {
11218         wptr = memcpya(wptr, " NA ", 11);
11219       }
11220       if (realnum(se)) {
11221         wptr = dtoa_g_wxp4x(se, 10, ' ', wptr);
// CI bounds: exp(log(OR) -/+ z * SE).
11222         dxx = ci_zt * se;
11223         dyy = exp(log_or - dxx);
11224         if (realnum(dyy)) {
11225           wptr = dtoa_g_wxp4x(dyy, 10, ' ', wptr);
11226         } else {
11227           wptr = memcpya(wptr, " NA ", 11);
11228         }
11229         dyy = exp(log_or + dxx);
11230         if (realnum(dyy)) {
11231           wptr = dtoa_g_wxp4x(dyy, 10, ' ', wptr);
11232         } else {
11233           wptr = memcpya(wptr, " NA ", 11);
11234         }
11235       } else {
11236         wptr = memcpya(wptr, " NA NA NA ", 33);
11237       }
// Breslow-Day homogeneity test (only meaningful when OR_MH is finite and not
// exactly 1): for each informative cluster, fit the expected case-A1 count
// a* under the common odds ratio by solving its quadratic equation, then
// accumulate (observed - a*)^2 / Var (Var via the 1/a*+1/b*+1/c*+1/d* form).
11238       if (breslow_day) {
11239         if (realnum(odds_ratio) && (odds_ratio != 1.0)) {
11240           one_minus_odds_ratio = 1.0 - odds_ratio;
11241           double_1mor_recip = 0.5 / one_minus_odds_ratio;
11242           bdx2 = 0.0;
// starts at -1 so final value is (#informative clusters) - 1 = BD df.
11243           cur_df = -1;
11244           for (cluster_idx = 0, uiptr = cluster_geno_cts; cluster_idx < cluster_ct2; cluster_idx++, uiptr = &(uiptr[4])) {
11245             ctrl_ct = cur_cluster_pheno_gtots[cluster_idx * 2] - uiptr[1];
11246             case_ct = cur_cluster_pheno_gtots[cluster_idx * 2 + 1] - uiptr[3];
11247             if (ctrl_ct && case_ct) {
11248               cur_df++;
11249               ctrl_ctd = (double)((int32_t)ctrl_ct);
11250               case_ctd = (double)((int32_t)case_ct);
11251               ctrl_a1_ctd = (double)((int32_t)uiptr[0]);
11252               case_a1_ctd = (double)((int32_t)uiptr[2]);
11253               a1_ctd = ctrl_a1_ctd + case_a1_ctd;
11254               amax = MINV(case_ctd, a1_ctd);
// Quadratic (in a*) from OR = a*d*/(b*c*); pick the root within [0, amax].
11255               bb = ctrl_ctd + case_ctd * odds_ratio - a1_ctd * one_minus_odds_ratio;
11256               discrim = sqrt(bb * bb + 4 * one_minus_odds_ratio * odds_ratio * case_ctd * a1_ctd);
11257               as_plus = (-bb + discrim) * double_1mor_recip;
11258               as_minus = (-bb - discrim) * double_1mor_recip;
11259               a_star = ((as_minus <= amax) && (as_minus >= 0))? as_minus : as_plus;
11260               b_star = case_ctd - a_star;
11261               c_star = a1_ctd - a_star;
11262               d_star = ctrl_ctd - a1_ctd + a_star;
11263
11264               // concordance fix (25 May 2018): print NA,NA instead of inf,0
11265               if ((a_star == 0.0) || (b_star == 0.0) || (c_star == 0.0) || (d_star == 0.0)) {
11266                 goto cmh_assoc_bd_fail;
11267               }
11268
11269               // inverse variance
11270               dxx = 1.0 / a_star + 1.0 / b_star + 1.0 / c_star + 1.0 / d_star;
11271
11272               dyy = case_a1_ctd - a_star;
11273               bdx2 += dyy * dyy * dxx;
11274             }
11275           }
11276           pval = chiprob_p(bdx2, cur_df);
// chiprob_p returns -9 on invalid input (e.g. nonpositive df).
11277           if (pval > -1) {
11278             wptr = dtoa_g_wxp4x(bdx2, 10, ' ', wptr);
11279             wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 10, ' ', wptr);
11280           } else {
11281             goto cmh_assoc_bd_fail;
11282           }
11283         } else {
11284         cmh_assoc_bd_fail:
11285           wptr = memcpya(wptr, " NA NA ", 22);
11286         }
11287       }
11288       *wptr++ = '\n';
11289       if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
11290         goto cmh_assoc_ret_WRITE_FAIL;
11291       }
11292     }
// Console percentage-progress display.
11293     if (marker_idx >= loop_end) {
11294       if (marker_idx < marker_ct) {
11295         if (pct >= 10) {
11296           putc_unlocked('\b', stdout);
11297         }
11298         pct = (marker_idx * 100LLU) / marker_ct;
11299         printf("\b\b%u%%", pct);
11300         fflush(stdout);
11301         loop_end = (((uint64_t)pct + 1LLU) * marker_ct) / 100;
11302       }
11303     }
11304   }
11305   if (fclose_null(&outfile)) {
11306     goto cmh_assoc_ret_WRITE_FAIL;
11307   }
11308   if (pct >= 10) {
11309     putc_unlocked('\b', stdout);
11310   }
11311   fputs("\b\b", stdout);
11312   logprint("done.\n");
// --adjust: multiple-testing correction over the stored chi-square values.
11313   if (mtest_adjust) {
11314     if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
11315       goto cmh_assoc_ret_NOMEM;
11316     }
11317     fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
11318     retval = multcomp(outname, outname_end, marker_idx_to_uidx, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, orig_chisq, pfilter, output_min_p, mtest_adjust, 0, adjust_lambda, nullptr, nullptr);
11319   }
11320
11321   if (cmh_modifier & (CLUSTER_CMH_PERM | CLUSTER_CMH_MPERM)) {
11322     logerrprint("Error: --mh/--bd permutation tests are currently under development.\n");
11323     goto cmh_assoc_ret_INVALID_CMDLINE;
11324   }
11325
11326   // Given the genotypes at a marker, the following quantities are invariant
11327   // through permutations:
11328   // * set of possibly valid clusters (2+ nonmissing genotypes)
11329   // * allele counts in each cluster
11330   // while the following quantities need to be recomputed:
11331   // * [case x A1] and [case x A2] counts in each cluster (control counts can
11332   //   then be determined via subtraction; but note that [case x A2] CANNOT
11333   //   generally be determined from [case x A1] because the number of cases
11334   //   with missing genotypes may vary, though we could special-case
11335   //   no-missing-genotypes if this ever becomes popular enough to justify the
11336   //   complexity)
11337   //
11338   // To handle both large and small clusters efficiently without too much
11339   // special-casing, we preprocess the raw data so that each cluster's
11340   // genotypes occupy separate words.  (Exception: on 64-bit systems, clusters
11341   // of size <= 16 are stuffed into 4 bytes, to improve memory efficiency.)
11342   // This allows the inner loops to be based on bitwise operations and
11343   // sequential memory accessses.  We also scan for clusters containing only a
11344   // single genotype, or less than two nonmissing genotypes, and exclude them
11345   // from the main loop.
11346
11347   // ...
11348
// Standard plink error-exit idiom: the while(0) body is only entered via the
// goto labels above, each of which sets retval before falling through.
11349   while (0) {
11350   cmh_assoc_ret_NOMEM:
11351     retval = RET_NOMEM;
11352     break;
11353   cmh_assoc_ret_OPEN_FAIL:
11354     retval = RET_OPEN_FAIL;
11355     break;
11356   cmh_assoc_ret_READ_FAIL:
11357     retval = RET_READ_FAIL;
11358     break;
11359   cmh_assoc_ret_WRITE_FAIL:
11360     retval = RET_WRITE_FAIL;
11361     break;
11362   cmh_assoc_ret_INVALID_CMDLINE:
11363     retval = RET_INVALID_CMDLINE;
11364     break;
11365   }
11366  cmh_assoc_ret_1:
11367   bigstack_reset(bigstack_mark);
11368   fclose_cond(outfile);
11369   fclose_cond(outfile_msa);
11370   return retval;
11371 }
11372
cmh2_assoc(FILE * bedfile,uintptr_t bed_offset,char * outname,char * outname_end,double output_min_p,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude,uintptr_t marker_ct,char * marker_ids,uintptr_t max_marker_id_len,uintptr_t * marker_reverse,Chrom_info * chrom_info_ptr,uintptr_t unfiltered_sample_ct,uint32_t cluster_ct,uint32_t * cluster_map,uint32_t * cluster_starts,uint32_t pheno_nm_ct,uintptr_t * pheno_nm,uintptr_t * pheno_c,uintptr_t * sex_male,uint32_t hh_or_mt_exists)11373 int32_t cmh2_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t hh_or_mt_exists) {
11374 unsigned char* bigstack_mark = g_bigstack_base;
11375 FILE* outfile = nullptr;
11376 uintptr_t* sample_hh_include2 = nullptr;
11377 uintptr_t* sample_hh_male_include2 = nullptr;
11378 char* wptr_start = nullptr;
11379 uint32_t chrom_fo_idx = 0xffffffffU;
11380 uint32_t chrom_end = 0;
11381 uint32_t pct = 0;
11382 uint32_t min_ploidy_1 = 0;
11383 uint32_t is_x = 0;
11384 uint32_t is_y = 0;
11385 uint32_t cluster_ct1 = 0;
11386 uint32_t ctrl_ct = 0;
11387 uint32_t case_ct = 0;
11388 int32_t retval = 0;
11389 uintptr_t* pheno_nm_11;
11390 uintptr_t* pheno_nm_nonmale_11;
11391 uintptr_t* pheno_nm_male_11;
11392 uintptr_t* loadbuf_raw;
11393 char* wptr;
11394 MATRIX_INVERT_BUF1_TYPE* mi_buf;
11395 double* ty_ctrl;
11396 double* ty_case;
11397 double* n0;
11398 double* u0;
11399 double* v0;
11400 double* dbl_2d_buf;
11401 double* dptr;
11402 double* dptr2;
11403 uint32_t* sample_to_cluster_pheno;
11404 uint32_t* cluster_pheno_gtots;
11405 uint32_t* cur_cluster_pheno_gtots;
11406 uint32_t* cluster_geno_cts;
11407 uint32_t* uiptr;
11408 uintptr_t marker_uidx;
11409 uintptr_t marker_idx;
11410 double ctrl_a1_ctd; // Tx[.][]
11411 double case_a1_ctd;
11412 double ctrl_ctd; // T[.]
11413 double case_ctd;
11414 double cur_ty_ctrl;
11415 double cur_ty_case;
11416 double ctrl_umult;
11417 double case_umult;
11418 double ctrl_vmult; // (Tx[] * (T[] - Tx[])) / (T[] * T[] * (T[]-1))
11419 double case_vmult;
11420 double cur_ctrl_vmult;
11421 double cur_case_vmult;
11422 double chisq;
11423 double dxx;
11424 uint32_t cur_ctrl_ct;
11425 uint32_t cur_case_ct;
11426 uint32_t cluster_ctrl_ct;
11427 uint32_t cluster_case_ct;
11428 uint32_t ctrl_a1_ct;
11429 uint32_t case_a1_ct;
11430 uint32_t cur_cluster_ct;
11431 uint32_t cur_cluster_ctm1;
11432 uint32_t cluster_idx;
11433 uint32_t loop_end;
11434 uint32_t uii;
11435 // no reason to keep X/Y/MT/haploid restriction
11436 retval = cluster_assoc_init("--mh2", unfiltered_sample_ct, pheno_nm, pheno_c, sex_male, cluster_ct, cluster_map, cluster_starts, nullptr, &pheno_nm_11, &pheno_nm_nonmale_11, &pheno_nm_male_11, &sample_to_cluster_pheno, &cluster_pheno_gtots, &cur_cluster_pheno_gtots, &cluster_geno_cts, &loadbuf_raw, &cluster_ct1);
11437 for (cluster_idx = 0; cluster_idx < cluster_ct1; cluster_idx++) {
11438 ctrl_ct += cluster_pheno_gtots[4 * cluster_idx];
11439 case_ct += cluster_pheno_gtots[4 * cluster_idx + 2];
11440 }
11441 if ((ctrl_ct < 2) || (case_ct < 2)) {
11442 logerrprint("Error: --mh2 requires at least two cases and two controls.\n");
11443 goto cmh2_assoc_ret_INVALID_CMDLINE;
11444 }
11445 #ifdef __LP64__
11446 if (cluster_ct1 > 46341) {
11447 // might actually be ok, but play it safe in case LAPACK matrix inversion
11448 // routine has an integer overflow here
11449 // (if/when we do permit this, will need to switch a few variables to type
11450 // uintptr_t)
11451 logerrprint("Error: --mh2 does not currently support more than 46341 clusters.\n");
11452 goto cmh2_assoc_ret_INVALID_CMDLINE;
11453 }
11454 #endif
11455 if (bigstack_alloc_d(cluster_ct1, &ty_ctrl) ||
11456 bigstack_alloc_d(cluster_ct1, &ty_case) ||
11457 bigstack_alloc_d(cluster_ct1 - 1, &n0) ||
11458 bigstack_alloc_d(cluster_ct1 - 1, &u0) ||
11459 bigstack_alloc_d((cluster_ct1 - 1) * (cluster_ct1 - 1), &v0) ||
11460 bigstack_alloc_d((cluster_ct1 - 1) * (cluster_ct1 - 1), &dbl_2d_buf)) {
11461 goto cmh2_assoc_ret_NOMEM;
11462 }
11463 mi_buf = (MATRIX_INVERT_BUF1_TYPE*)bigstack_alloc((cluster_ct1 - 1) * MATRIX_INVERT_BUF1_ELEM_ALLOC);
11464 if (!mi_buf) {
11465 goto cmh2_assoc_ret_NOMEM;
11466 }
11467 if ((chrom_info_ptr->xymt_codes[MT_OFFSET] != -2) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->xymt_codes[MT_OFFSET])) {
11468 hh_or_mt_exists |= NXMHH_EXISTS;
11469 }
11470 if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_or_mt_exists, 1, pheno_nm, sex_male, &sample_hh_include2, &sample_hh_male_include2)) {
11471 goto cmh2_assoc_ret_NOMEM;
11472 }
11473 if (fseeko(bedfile, bed_offset, SEEK_SET)) {
11474 goto cmh2_assoc_ret_READ_FAIL;
11475 }
11476 memcpy(outname_end, ".cmh2", 6);
11477 if (fopen_checked(outname, "w", &outfile)) {
11478 goto cmh2_assoc_ret_OPEN_FAIL;
11479 }
11480 LOGPRINTFWW5("Writing report to %s ... ", outname);
11481 fputs("0%", stdout);
11482 fflush(stdout);
11483 if (fputs_checked("CHR\tSNP\tCHISQ\tDF\tP\n", outfile)) {
11484 goto cmh2_assoc_ret_WRITE_FAIL;
11485 }
11486 loop_end = marker_ct / 100;
11487 for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
11488 if (cluster_assoc_load_one(bedfile, bed_offset, marker_exclude, unfiltered_sample_ct, sample_hh_include2, sample_hh_male_include2, loadbuf_raw, pheno_nm_11, pheno_nm_nonmale_11, pheno_nm_male_11, marker_reverse, chrom_info_ptr, hh_or_mt_exists, g_textbuf, cluster_ct1, sample_to_cluster_pheno, cluster_pheno_gtots, cur_cluster_pheno_gtots, cluster_geno_cts, &marker_uidx, &chrom_end, &chrom_fo_idx, &min_ploidy_1, &is_x, &is_y, &wptr_start, nullptr)) {
11489 goto cmh2_assoc_ret_READ_FAIL;
11490 }
11491 wptr = strcpyax(wptr_start, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
11492 cur_ctrl_ct = 0;
11493 cur_case_ct = 0;
11494 ctrl_a1_ct = 0;
11495 case_a1_ct = 0;
11496 cur_cluster_ct = 0;
11497 for (cluster_idx = 0, uiptr = cluster_geno_cts; cluster_idx < cluster_ct1; cluster_idx++, uiptr = &(uiptr[4])) {
11498 cluster_ctrl_ct = cur_cluster_pheno_gtots[cluster_idx * 2] - uiptr[1];
11499 cluster_case_ct = cur_cluster_pheno_gtots[cluster_idx * 2 + 1] - uiptr[3];
11500 uii = cluster_ctrl_ct + cluster_case_ct;
11501 if (uii) {
11502 // don't count toward cur_cluster_ct if all observations are missing
11503 n0[cur_cluster_ct] = (double)((int32_t)(uiptr[0] + uiptr[2]));
11504 ctrl_a1_ct += uiptr[0];
11505 case_a1_ct += uiptr[2];
11506 cur_ctrl_ct += cluster_ctrl_ct;
11507 cur_case_ct += cluster_case_ct;
11508 ty_ctrl[cur_cluster_ct] = (double)((int32_t)cluster_ctrl_ct);
11509 ty_case[cur_cluster_ct] = (double)((int32_t)cluster_case_ct);
11510 cur_cluster_ct++;
11511 }
11512 }
11513
11514 // This is always a 2xJx2 test (where J = cluster ct), so we can omit PLINK
11515 // 1.07 calcMantelHaenszel_IxJxK code which only comes into play for larger
11516 // I/K values.
11517 if (((!cur_ctrl_ct) && cur_case_ct) || ((!cur_case_ct) && cur_ctrl_ct) || (cur_cluster_ct == 1)) {
11518 // may as well distinguish 0df from other problems
11519 wptr = memcpya(wptr, "0\t0\tNA\n", 7);
11520 goto cmh2_assoc_fail2;
11521 } else if ((cur_ctrl_ct < 2) || (cur_case_ct < 2) || (!cur_cluster_ct)) {
11522 goto cmh2_assoc_fail;
11523 }
11524 cur_cluster_ctm1 = cur_cluster_ct - 1;
11525 ctrl_ctd = (double)((int32_t)cur_ctrl_ct);
11526 case_ctd = (double)((int32_t)cur_case_ct);
11527 ctrl_a1_ctd = (double)((int32_t)ctrl_a1_ct);
11528 case_a1_ctd = (double)((int32_t)case_a1_ct);
11529 ctrl_umult = ctrl_a1_ctd / ctrl_ctd;
11530 case_umult = case_a1_ctd / case_ctd;
11531 ctrl_vmult = ctrl_umult * (ctrl_ctd - ctrl_a1_ctd) / (ctrl_ctd * (ctrl_ctd - 1));
11532 case_vmult = case_umult * (case_ctd - case_a1_ctd) / (case_ctd * (case_ctd - 1));
11533 for (cluster_idx = 0; cluster_idx < cur_cluster_ctm1; cluster_idx++) {
11534 // instead of a two-step process where e.g. U[][] is filled first, and
11535 // then columnwise sums are saved to U0, we just fill U0 directly.
11536 cur_ty_ctrl = ty_ctrl[cluster_idx];
11537 cur_ty_case = ty_case[cluster_idx];
11538 u0[cluster_idx] = cur_ty_ctrl * ctrl_umult + cur_ty_case * case_umult;
11539 cur_ctrl_vmult = -cur_ty_ctrl * ctrl_vmult;
11540 cur_case_vmult = -cur_ty_case * case_vmult;
11541 dptr = &(v0[cluster_idx * cur_cluster_ct]);
11542 // should be guaranteed to be nonnegative, no need for fabs()?
11543 *dptr++ = (cur_ty_ctrl - ctrl_ctd) * cur_ctrl_vmult + (cur_ty_case - case_ctd) * cur_case_vmult;
11544 for (uii = cluster_idx + 1; uii < cur_cluster_ctm1; uii++) {
11545 *dptr++ = ty_ctrl[uii] * cur_ctrl_vmult + ty_case[uii] * cur_case_vmult;
11546 }
11547 }
11548 for (cluster_idx = 0; cluster_idx < cur_cluster_ctm1; cluster_idx++) {
11549 dptr = &(v0[cluster_idx * cur_cluster_ctm1]);
11550 dptr2 = &(v0[cluster_idx]);
11551 for (uii = 0; uii < cluster_idx; uii++) {
11552 *dptr++ = dptr2[uii * cur_cluster_ctm1];
11553 }
11554 }
11555
11556 if (!invert_matrix(cur_cluster_ctm1, v0, mi_buf, dbl_2d_buf)) {
11557 // Q = G'V{-1}G
11558 chisq = 0.0;
11559 for (cluster_idx = 0; cluster_idx < cur_cluster_ctm1; cluster_idx++) {
11560 dbl_2d_buf[cluster_idx] = n0[cluster_idx] - u0[cluster_idx];
11561 }
11562 dptr = v0;
11563 for (cluster_idx = 0; cluster_idx < cur_cluster_ctm1; cluster_idx++) {
11564 dxx = 0.0;
11565 dptr2 = dbl_2d_buf;
11566 for (uii = 0; uii < cur_cluster_ctm1; uii++) {
11567 dxx += (*dptr++) * (*dptr2++);
11568 }
11569 chisq += dxx * (dbl_2d_buf[cluster_idx]);
11570 }
11571 wptr = dtoa_gx(chisq, '\t', wptr);
11572 wptr = uint32toa_x(cur_cluster_ctm1, '\t', wptr);
11573 dxx = chiprob_p(chisq, (int32_t)cur_cluster_ctm1);
11574 wptr = dtoa_gx(MAXV(dxx, output_min_p), '\n', wptr);
11575 } else {
11576 cmh2_assoc_fail:
11577 wptr = memcpya(wptr, "NA\tNA\tNA\n", 9);
11578 }
11579 cmh2_assoc_fail2:
11580 if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
11581 goto cmh2_assoc_ret_WRITE_FAIL;
11582 }
11583 if (marker_idx >= loop_end) {
11584 if (marker_idx < marker_ct) {
11585 if (pct >= 10) {
11586 putc_unlocked('\b', stdout);
11587 }
11588 pct = (marker_idx * 100LLU) / marker_ct;
11589 printf("\b\b%u%%", pct);
11590 fflush(stdout);
11591 loop_end = (((uint64_t)pct + 1LLU) * marker_ct) / 100;
11592 }
11593 }
11594 }
11595 if (fclose_null(&outfile)) {
11596 goto cmh2_assoc_ret_WRITE_FAIL;
11597 }
11598 if (pct >= 10) {
11599 putc_unlocked('\b', stdout);
11600 }
11601 fputs("\b\b", stdout);
11602 logprint("done.\n");
11603 while (0) {
11604 cmh2_assoc_ret_NOMEM:
11605 retval = RET_NOMEM;
11606 break;
11607 cmh2_assoc_ret_OPEN_FAIL:
11608 retval = RET_OPEN_FAIL;
11609 break;
11610 cmh2_assoc_ret_READ_FAIL:
11611 retval = RET_READ_FAIL;
11612 break;
11613 cmh2_assoc_ret_WRITE_FAIL:
11614 retval = RET_WRITE_FAIL;
11615 break;
11616 cmh2_assoc_ret_INVALID_CMDLINE:
11617 retval = RET_INVALID_CMDLINE;
11618 break;
11619 }
11620 bigstack_reset(bigstack_mark);
11621 fclose_cond(outfile);
11622 return retval;
11623 }
11624
int32_t homog_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, char* cluster_ids, uintptr_t max_cluster_id_len, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t hh_or_mt_exists) {
  // --homog: for each marker, build one 2x2 allele x case/control table per
  // cluster (with a +0.5 continuity correction in every cell), compute each
  // cluster's odds ratio and 1df chi-square via inverse-variance weighting
  // of ln(OR), and partition the summed chi-square into an ASSOC component
  // (1 df) and a HOMOG component (cluster_ct2 - 1 df) testing odds-ratio
  // homogeneity across clusters.  One TOTAL/ASSOC/HOMOG line plus one line
  // per cluster is written per marker to {outname}.homog.
  // Returns 0 on success, or a RET_{NOMEM,OPEN_FAIL,READ_FAIL,WRITE_FAIL}
  // code on failure.
  unsigned char* bigstack_mark = g_bigstack_base;
  unsigned char* bigstack_end_mark = g_bigstack_end;
  FILE* outfile = nullptr;
  uintptr_t* sample_hh_include2 = nullptr;
  uintptr_t* sample_hh_male_include2 = nullptr;
  char* writebuf = g_textbuf;
  char* chrom_name_ptr = nullptr;
  uint32_t cluster_ct2 = 0;
  uint32_t chrom_fo_idx = 0xffffffffU;
  uint32_t chrom_end = 0;
  uint32_t chrom_name_len = 0;
  uint32_t pct = 0;
  uint32_t min_ploidy_1 = 0;
  uint32_t is_x = 0;
  uint32_t is_y = 0;
  int32_t retval = 0;
  char chrom_name_buf[3 + MAX_CHROM_TEXTNUM_SLEN];
  uintptr_t* cluster_bitfield;
  uintptr_t* pheno_nm_11;
  uintptr_t* pheno_nm_nonmale_11;
  uintptr_t* pheno_nm_male_11;
  uintptr_t* loadbuf_raw;
  double* cluster_tables;
  double* cluster_chisq;
  double* cluster_or;
  double* dptr;
  char* cluster_ids_collapsed;
  char* wptr_start;
  char* wptr;
  uint32_t* sample_to_cluster_pheno;
  uint32_t* cluster_pheno_gtots;
  uint32_t* cur_cluster_pheno_gtots;
  uint32_t* cluster_geno_cts;
  uint32_t* uiptr;
  uintptr_t marker_uidx;
  uintptr_t marker_idx;
  uintptr_t ulii;
  double cluster_ct2d;
  double cluster_ct2m1d;
  double case_ctd;
  double ctrl_ctd;
  double case_a1_ctd;
  double case_a2_ctd;
  double ctrl_a1_ctd;
  double ctrl_a2_ctd;
  double case_a2_recip;
  double ctrl_a1_recip;
  double ln_or;
  double se_sq_recip;
  double x_total;
  double x_assoc1;
  double x_assoc2;
  double x_assoc;
  double dxx;
  uint32_t cluster_idx;
  uint32_t loop_end;
  // Upper bound on one output line; fall back to a bigstack buffer when the
  // line can exceed the static g_textbuf.
  ulii = 2 * max_marker_allele_len + MAX_ID_SLEN + max_marker_id_len + max_cluster_id_len + 256;
  if (ulii > MAXLINELEN) {
    if (bigstack_alloc_c(ulii, &writebuf)) {
      goto homog_assoc_ret_NOMEM;
    }
  }
  if (bigstack_end_calloc_ul(BITCT_TO_WORDCT(cluster_ct), &cluster_bitfield)) {
    goto homog_assoc_ret_NOMEM;
  }
  // Factor out common initialization with cmh_assoc().
  retval = cluster_assoc_init("--homog", unfiltered_sample_ct, pheno_nm, pheno_c, sex_male, cluster_ct, cluster_map, cluster_starts, cluster_bitfield, &pheno_nm_11, &pheno_nm_nonmale_11, &pheno_nm_male_11, &sample_to_cluster_pheno, &cluster_pheno_gtots, &cur_cluster_pheno_gtots, &cluster_geno_cts, &loadbuf_raw, &cluster_ct2);
  if (retval) {
    goto homog_assoc_ret_1;
  }
  if (cluster_ct == cluster_ct2) {
    cluster_ids_collapsed = cluster_ids;
  } else {
    // Some clusters were dropped by cluster_assoc_init(); build a dense ID
    // array covering only the cluster_ct2 survivors flagged in
    // cluster_bitfield.
    if (bigstack_alloc_c(cluster_ct2 * max_cluster_id_len, &cluster_ids_collapsed)) {
      goto homog_assoc_ret_NOMEM;
    }
    for (ulii = 0, cluster_idx = 0; cluster_idx < cluster_ct2; ulii++, cluster_idx++) {
      next_set_ul_unsafe_ck(cluster_bitfield, &ulii);
      memcpy(&(cluster_ids_collapsed[cluster_idx * max_cluster_id_len]), &(cluster_ids[ulii * max_cluster_id_len]), max_cluster_id_len);
    }
  }
  bigstack_end_reset(bigstack_end_mark);
  cluster_ct2d = (double)((int32_t)cluster_ct2);
  cluster_ct2m1d = (double)((int32_t)cluster_ct2 - 1);
  if (bigstack_alloc_d(cluster_ct2 * 4, &cluster_tables) ||
      bigstack_alloc_d(cluster_ct2, &cluster_or) ||
      bigstack_alloc_d(cluster_ct2, &cluster_chisq)) {
    goto homog_assoc_ret_NOMEM;
  }
  if (cluster_ct2 > 10) {
    // many clusters -> few samples per cluster -> noisy per-cluster ORs
    logerrprint("Warning: --homog statistics can be unreliable with small clusters.\n");
  }

  memcpy(outname_end, ".homog", 7);
  if (fopen_checked(outname, "w", &outfile)) {
    goto homog_assoc_ret_OPEN_FAIL;
  }
  LOGPRINTFWW5("Writing report to %s ... ", outname);
  fputs("0%", stdout);
  fflush(stdout);
  // misaligned for backward compatibility
  sprintf(g_textbuf, " CHR %%%us  A1  A2      F_A      F_U      N_A      N_U  TEST         CHISQ  DF             P            OR\n", plink_maxsnp);
  fprintf(outfile, g_textbuf, "SNP");
  if (chrom_info_ptr->xymt_codes[MT_OFFSET] != -2) {
    hh_or_mt_exists |= NXMHH_EXISTS;
  }
  if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_or_mt_exists, 1, pheno_nm, sex_male, &sample_hh_include2, &sample_hh_male_include2)) {
    goto homog_assoc_ret_NOMEM;
  }
  if (fseeko(bedfile, bed_offset, SEEK_SET)) {
    goto homog_assoc_ret_READ_FAIL;
  }
  loop_end = marker_ct / 100;
  for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
    // Loads the next marker's genotypes and fills cluster_geno_cts with
    // per-cluster {ctrl A1, ctrl missing, case A1, case missing} counts.
    if (cluster_assoc_load_one(bedfile, bed_offset, marker_exclude, unfiltered_sample_ct, sample_hh_include2, sample_hh_male_include2, loadbuf_raw, pheno_nm_11, pheno_nm_nonmale_11, pheno_nm_male_11, marker_reverse, chrom_info_ptr, hh_or_mt_exists, chrom_name_buf, cluster_ct2, sample_to_cluster_pheno, cluster_pheno_gtots, cur_cluster_pheno_gtots, cluster_geno_cts, &marker_uidx, &chrom_end, &chrom_fo_idx, &min_ploidy_1, &is_x, &is_y, &chrom_name_ptr, &chrom_name_len)) {
      goto homog_assoc_ret_READ_FAIL;
    }
    dptr = cluster_tables;
    x_total = 0.0;
    x_assoc1 = 0.0;
    x_assoc2 = 0.0;
    for (cluster_idx = 0, uiptr = cluster_geno_cts; cluster_idx < cluster_ct2; cluster_idx++, uiptr = &(uiptr[4])) {
      // The "+ 1" / "+ 0.5" terms apply a continuity correction: 0.5 is
      // added to each of the four table cells, so each margin grows by 1.
      ctrl_ctd = (double)((int32_t)(1 + cur_cluster_pheno_gtots[cluster_idx * 2] - uiptr[1]));
      case_ctd = (double)((int32_t)(1 + cur_cluster_pheno_gtots[cluster_idx * 2 + 1] - uiptr[3]));
      ctrl_a1_ctd = (double)((int32_t)uiptr[0]) + 0.5;
      ctrl_a2_ctd = ctrl_ctd - ctrl_a1_ctd;
      case_a1_ctd = (double)((int32_t)uiptr[2]) + 0.5;
      case_a2_ctd = case_ctd - case_a1_ctd;
      *dptr++ = case_a1_ctd;
      *dptr++ = case_a2_ctd;
      *dptr++ = ctrl_a1_ctd;
      *dptr++ = ctrl_a2_ctd;
      case_a2_recip = 1.0 / case_a2_ctd;
      ctrl_a1_recip = 1.0 / ctrl_a1_ctd;
      dxx = case_a1_ctd * ctrl_a2_ctd * case_a2_recip * ctrl_a1_recip;
      cluster_or[cluster_idx] = dxx;
      ln_or = log(dxx);
      // inverse variance of ln(OR): 1 / (sum of reciprocal cell counts)
      se_sq_recip = 1.0 / ((1.0 / case_a1_ctd) + (1.0 / ctrl_a2_ctd) + case_a2_recip + ctrl_a1_recip);
      x_assoc2 += se_sq_recip;
      dxx = ln_or * se_sq_recip;
      x_assoc1 += dxx;
      dxx *= ln_or;
      cluster_chisq[cluster_idx] = dxx;
      x_total += dxx;
    }
    // ASSOC = (weighted mean ln(OR))^2 / (variance of the weighted mean);
    // HOMOG (printed below) = TOTAL - ASSOC.
    x_assoc = x_assoc1 * x_assoc1 / x_assoc2;
    wptr_start = memcpyax(writebuf, chrom_name_ptr, chrom_name_len, ' ');
    wptr_start = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
    *wptr_start++ = ' ';
    wptr_start = fw_strcpy(4, marker_allele_ptrs[marker_uidx * 2], wptr_start);
    *wptr_start++ = ' ';
    wptr_start = fw_strcpy(4, marker_allele_ptrs[marker_uidx * 2 + 1], wptr_start);
    *wptr_start++ = ' ';
    // F_A/F_U/N_A/N_U are NA on the three summary lines; the per-cluster
    // loop below rewinds over this 36-char filler and fills the columns in.
    wptr_start = memcpya(wptr_start, "      NA       NA       NA       NA ", 36);
    wptr = memcpya(wptr_start, " TOTAL ", 7);
    wptr = dtoa_g_wxp4x(x_total, 10, ' ', wptr);
    wptr = uint32toa_w4x(cluster_ct2, ' ', wptr);
    wptr = dtoa_g_wxp4x(chiprob_p(x_total, cluster_ct2d), 10, ' ', wptr);
    wptr = memcpya(wptr, "        NA\n", 11);
    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
      goto homog_assoc_ret_WRITE_FAIL;
    }
    wptr = memcpya(wptr_start, " ASSOC ", 7);
    wptr = dtoa_g_wxp4(x_assoc, 10, wptr);
    wptr = memcpya(wptr, "    1 ", 6);
    wptr = dtoa_g_wxp4x(chiprob_p(x_assoc, 1), 10, ' ', wptr);
    wptr = memcpya(wptr, "        NA\n", 11);
    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
      goto homog_assoc_ret_WRITE_FAIL;
    }
    dxx = x_total - x_assoc;
    wptr = memcpya(wptr_start, " HOMOG ", 7);
    wptr = dtoa_g_wxp4x(dxx, 10, ' ', wptr);
    wptr = uint32toa_w4x(cluster_ct2 - 1, ' ', wptr);
    wptr = dtoa_g_wxp4x(chiprob_p(dxx, cluster_ct2m1d), 10, ' ', wptr);
    wptr = memcpya(wptr, "        NA\n", 11);
    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
      goto homog_assoc_ret_WRITE_FAIL;
    }
    // rewind over the NA filler so cluster rows can fill in F_A..N_U
    wptr_start = &(wptr_start[-36]);
    for (cluster_idx = 0, dptr = cluster_tables; cluster_idx < cluster_ct2; cluster_idx++, dptr = &(dptr[4])) {
      case_ctd = dptr[0] + dptr[1];
      ctrl_ctd = dptr[2] + dptr[3];
      if ((case_ctd < 1.5) || (ctrl_ctd < 1.5)) {
        // Degenerate cluster: a margin of < 1.5 means zero observed alleles
        // before the continuity correction, so frequencies/OR are undefined.
        wptr = memcpya(wptr_start, "      NA       NA ", 18);
        wptr = dtoa_g_wxp4x(case_ctd - 1, 8, ' ', wptr);
        wptr = dtoa_g_wxp4x(ctrl_ctd - 1, 8, ' ', wptr);
        wptr = fw_strcpy(6, &(cluster_ids_collapsed[cluster_idx * max_cluster_id_len]), wptr);
        wptr = memcpya(wptr, "         NA   NA         NA         NA\n", 39);
      } else {
        wptr = dtoa_g_wxp4x(dptr[0] / case_ctd, 8, ' ', wptr_start);
        wptr = dtoa_g_wxp4x(dptr[2] / ctrl_ctd, 8, ' ', wptr);
        // subtract the correction (2 * 0.5) when reporting observed totals
        wptr = dtoa_g_wxp4x(case_ctd - 1, 8, ' ', wptr);
        wptr = dtoa_g_wxp4x(ctrl_ctd - 1, 8, ' ', wptr);
        wptr = fw_strcpy(6, &(cluster_ids_collapsed[cluster_idx * max_cluster_id_len]), wptr);
        *wptr++ = ' ';
        dxx = cluster_chisq[cluster_idx];
        if (dxx < SMALL_EPSILON * SMALL_EPSILON) {
          // probably rounding error
          dxx = 0;
        }
        wptr = dtoa_g_wxp4(dxx, 10, wptr);
        wptr = memcpya(wptr, "    1 ", 6);
        wptr = dtoa_g_wxp4x(MAXV(chiprob_p(dxx, 1), output_min_p), 10, ' ', wptr);
        dxx = cluster_or[cluster_idx];
        if (realnum(dxx)) {
          wptr = dtoa_g_wxp4x(dxx, 10, '\n', wptr);
        } else {
          wptr = memcpya(wptr, "        NA\n", 11);
        }
      }
      // Bugfix: this write was previously inside the else-branch above, so
      // rows built for degenerate clusters (the NA branch) were assembled in
      // writebuf but never flushed to the output file.  It must run for both
      // branches.
      if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
        goto homog_assoc_ret_WRITE_FAIL;
      }
    }
    if (marker_idx >= loop_end) {
      if (marker_idx < marker_ct) {
        if (pct >= 10) {
          putc_unlocked('\b', stdout);
        }
        pct = (marker_idx * 100LLU) / marker_ct;
        printf("\b\b%u%%", pct);
        fflush(stdout);
        loop_end = (((uint64_t)pct + 1LLU) * marker_ct) / 100;
      }
    }
  }
  if (fclose_null(&outfile)) {
    goto homog_assoc_ret_WRITE_FAIL;
  }
  if (pct >= 10) {
    putc_unlocked('\b', stdout);
  }
  fputs("\b\b", stdout);
  logprint("done.\n");
  while (0) {
  homog_assoc_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  homog_assoc_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  homog_assoc_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  homog_assoc_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
  }
 homog_assoc_ret_1:
  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
  fclose_cond(outfile);
  return retval;
}
11880