1 // This file is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License as published by the Free
6 // Software Foundation, either version 3 of the License, or (at your option)
7 // any later version.
8 //
9 // This program is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12 // more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 
17 #include "include/plink2_stats.h"
18 #include "plink2_adjust.h"
19 #include "plink2_compress_stream.h"
20 
21 #ifdef __cplusplus
22 namespace plink2 {
23 #endif
24 
InitAdjust(AdjustInfo * adjust_info_ptr,AdjustFileInfo * adjust_file_info_ptr)25 void InitAdjust(AdjustInfo* adjust_info_ptr, AdjustFileInfo* adjust_file_info_ptr) {
26   adjust_info_ptr->flags = kfAdjust0;
27   adjust_info_ptr->lambda = 0.0;
28   adjust_file_info_ptr->base.flags = kfAdjust0;
29   adjust_file_info_ptr->base.lambda = 0.0;
30   adjust_file_info_ptr->fname = nullptr;
31   adjust_file_info_ptr->test_name = nullptr;
32   adjust_file_info_ptr->chr_field = nullptr;
33   adjust_file_info_ptr->pos_field = nullptr;
34   adjust_file_info_ptr->id_field = nullptr;
35   adjust_file_info_ptr->ref_field = nullptr;
36   adjust_file_info_ptr->alt_field = nullptr;
37   adjust_file_info_ptr->a1_field = nullptr;
38   adjust_file_info_ptr->test_field = nullptr;
39   adjust_file_info_ptr->p_field = nullptr;
40 }
41 
CleanupAdjust(AdjustFileInfo * adjust_file_info_ptr)42 void CleanupAdjust(AdjustFileInfo* adjust_file_info_ptr) {
43   free_cond(adjust_file_info_ptr->a1_field);
44   free_cond(adjust_file_info_ptr->alt_field);
45   free_cond(adjust_file_info_ptr->chr_field);
46   if (adjust_file_info_ptr->fname) {
47     free(adjust_file_info_ptr->fname);
48     free_cond(adjust_file_info_ptr->pos_field);
49     free_cond(adjust_file_info_ptr->id_field);
50     free_cond(adjust_file_info_ptr->ref_field);
51     free_cond(adjust_file_info_ptr->test_field);
52     free_cond(adjust_file_info_ptr->p_field);
53   }
54 }
55 
56 typedef struct AdjAssocResultStruct {
57   double ln_pval;
58   double chisq;  // do we really need this?...
59   uint32_t variant_uidx;
60   uint32_t allele_idx;
61 #ifdef __cplusplus
operator <plink2::AdjAssocResultStruct62   bool operator<(const struct AdjAssocResultStruct& rhs) const {
63     return ln_pval < rhs.ln_pval;
64   }
65 #endif
66 } AdjAssocResult;
67 
adjust_print_ln(const char * output_min_p_str,double ln_pval,double output_min_ln,uint32_t output_min_p_slen,uint32_t is_neglog10,char ** bufpp)68 static inline void adjust_print_ln(const char* output_min_p_str, double ln_pval, double output_min_ln, uint32_t output_min_p_slen, uint32_t is_neglog10, char** bufpp) {
69   **bufpp = '\t';
70   *bufpp += 1;
71   if (ln_pval <= output_min_ln) {
72     *bufpp = memcpya(*bufpp, output_min_p_str, output_min_p_slen);
73   } else {
74     if (!is_neglog10) {
75       *bufpp = lntoa_g(ln_pval, *bufpp);
76     } else {
77       *bufpp = dtoa_g(ln_pval * (-1.0 / kLn10), *bufpp);
78     }
79   }
80 }
81 
82 // Now based around ln_pvals, to allow useful comparisons < 2.23e-308.
Multcomp(const uintptr_t * variant_include,const ChrInfo * cip,const char * const * chr_ids,const uint32_t * variant_bps,const char * const * variant_ids,const uintptr_t * allele_include,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,const char * const * loaded_a1,const AdjustInfo * adjust_info_ptr,const double * ln_pvals,const double * chisqs,uintptr_t orig_allele_ct,uint32_t max_allele_slen,double ln_pfilter,double output_min_ln,uint32_t skip_gc,uint32_t max_thread_ct,char * outname,char * outname_end)83 PglErr Multcomp(const uintptr_t* variant_include, const ChrInfo* cip, const char* const* chr_ids, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_include, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const char* const* loaded_a1, const AdjustInfo* adjust_info_ptr, const double* ln_pvals, const double* chisqs, uintptr_t orig_allele_ct, uint32_t max_allele_slen, double ln_pfilter, double output_min_ln, uint32_t skip_gc, uint32_t max_thread_ct, char* outname, char* outname_end) {
84   unsigned char* bigstack_mark = g_bigstack_base;
85   char* cswritep = nullptr;
86   CompressStreamState css;
87   PglErr reterr = kPglRetSuccess;
88   PreinitCstream(&css);
89   {
90     AdjAssocResult* sortbuf;
91     if (unlikely(BIGSTACK_ALLOC_X(AdjAssocResult, orig_allele_ct, &sortbuf))) {
92       goto Multcomp_ret_NOMEM;
93     }
94     uintptr_t valid_allele_ct = 0;
95     uintptr_t allele_uidx_base = 0;
96     uintptr_t allele_include_bits = allele_include[0];
97     if (!allele_idx_offsets) {
98       if (chisqs) {
99         if (ln_pvals) {
100           for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
101             const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
102             const double cur_chisq = chisqs[aidx];
103             if (cur_chisq >= 0.0) {
104               sortbuf[valid_allele_ct].chisq = cur_chisq;
105               sortbuf[valid_allele_ct].ln_pval = ln_pvals[aidx];
106               sortbuf[valid_allele_ct].variant_uidx = allele_uidx / 2;
107               sortbuf[valid_allele_ct].allele_idx = allele_uidx % 2;
108               ++valid_allele_ct;
109             }
110           }
111         } else {
112           for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
113             const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
114             const double cur_chisq = chisqs[aidx];
115             if (cur_chisq >= 0.0) {
116               sortbuf[valid_allele_ct].chisq = cur_chisq;
117               sortbuf[valid_allele_ct].ln_pval = ChisqToLnP(cur_chisq, 1);
118               sortbuf[valid_allele_ct].variant_uidx = allele_uidx / 2;
119               sortbuf[valid_allele_ct].allele_idx = allele_uidx % 2;
120               ++valid_allele_ct;
121             }
122           }
123         }
124       } else {
125         for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
126           const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
127           const double cur_ln_pval = ln_pvals[aidx];
128           if (cur_ln_pval <= 0.0) {
129             sortbuf[valid_allele_ct].chisq = LnPToChisq(cur_ln_pval);
130             sortbuf[valid_allele_ct].ln_pval = cur_ln_pval;
131             sortbuf[valid_allele_ct].variant_uidx = allele_uidx / 2;
132             sortbuf[valid_allele_ct].allele_idx = allele_uidx % 2;
133             ++valid_allele_ct;
134           }
135         }
136       }
137     } else {
138       uintptr_t variant_uidx_base = 0;
139       uintptr_t variant_include_bits = variant_include[0];
140       uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &variant_include_bits);
141       uintptr_t allele_idx_offset_start = allele_idx_offsets[variant_uidx];
142       uintptr_t allele_idx_offset_end = allele_idx_offsets[variant_uidx + 1];
143       if (chisqs) {
144         if (ln_pvals) {
145           for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
146             const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
147             if (allele_uidx >= allele_idx_offset_end) {
148               variant_uidx = BitIter1(variant_include, &variant_uidx_base, &variant_include_bits);
149               allele_idx_offset_start = allele_idx_offsets[variant_uidx];
150               allele_idx_offset_end = allele_idx_offsets[variant_uidx + 1];
151             }
152             const double cur_chisq = chisqs[aidx];
153             if (cur_chisq >= 0.0) {
154               sortbuf[valid_allele_ct].chisq = cur_chisq;
155               sortbuf[valid_allele_ct].ln_pval = ln_pvals[aidx];
156               sortbuf[valid_allele_ct].variant_uidx = variant_uidx;
157               sortbuf[valid_allele_ct].allele_idx = allele_uidx - allele_idx_offset_start;
158               ++valid_allele_ct;
159             }
160           }
161         } else {
162           for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
163             const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
164             if (allele_uidx >= allele_idx_offset_end) {
165               variant_uidx = BitIter1(variant_include, &variant_uidx_base, &variant_include_bits);
166               allele_idx_offset_start = allele_idx_offsets[variant_uidx];
167               allele_idx_offset_end = allele_idx_offsets[variant_uidx + 1];
168             }
169             const double cur_chisq = chisqs[aidx];
170             if (cur_chisq >= 0.0) {
171               sortbuf[valid_allele_ct].chisq = cur_chisq;
172               sortbuf[valid_allele_ct].ln_pval = ChisqToLnP(cur_chisq, 1);
173               sortbuf[valid_allele_ct].variant_uidx = variant_uidx;
174               sortbuf[valid_allele_ct].allele_idx = allele_uidx - allele_idx_offset_start;
175               ++valid_allele_ct;
176             }
177           }
178         }
179       } else {
180         for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
181           const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
182           if (allele_uidx >= allele_idx_offset_end) {
183             variant_uidx = BitIter1(variant_include, &variant_uidx_base, &variant_include_bits);
184             allele_idx_offset_start = allele_idx_offsets[variant_uidx];
185             allele_idx_offset_end = allele_idx_offsets[variant_uidx + 1];
186           }
187           const double cur_ln_pval = ln_pvals[aidx];
188           if (cur_ln_pval <= 0.0) {
189             sortbuf[valid_allele_ct].chisq = LnPToChisq(cur_ln_pval);
190             sortbuf[valid_allele_ct].ln_pval = cur_ln_pval;
191             sortbuf[valid_allele_ct].variant_uidx = variant_uidx;
192             sortbuf[valid_allele_ct].allele_idx = allele_uidx - allele_idx_offset_start;
193             ++valid_allele_ct;
194           }
195         }
196       }
197     }
198     if (!valid_allele_ct) {
199       logputs("Zero valid tests; --adjust skipped.\n");
200       goto Multcomp_ret_1;
201     }
202     BigstackShrinkTop(sortbuf, valid_allele_ct * sizeof(AdjAssocResult));
203 
204     const uintptr_t overflow_buf_size = kCompressStreamBlock + 2 * kMaxIdSlen + 512 + 2 * max_allele_slen;
205     const AdjustFlags flags = adjust_info_ptr->flags;
206     const uint32_t output_zst = flags & kfAdjustZs;
207     OutnameZstSet(".adjusted", output_zst, outname_end);
208     reterr = InitCstreamAlloc(outname, 0, output_zst, max_thread_ct, overflow_buf_size, &css, &cswritep);
209     if (unlikely(reterr)) {
210       goto Multcomp_ret_1;
211     }
212     *cswritep++ = '#';
213     const uint32_t chr_col = flags & kfAdjustColChrom;
214     if (chr_col) {
215       cswritep = strcpya_k(cswritep, "CHROM\t");
216     }
217     if (flags & kfAdjustColPos) {
218       cswritep = strcpya_k(cswritep, "POS\t");
219     } else {
220       variant_bps = nullptr;
221     }
222     cswritep = strcpya_k(cswritep, "ID\t");
223     const uint32_t ref_col = flags & kfAdjustColRef;
224     if (ref_col) {
225       cswritep = strcpya_k(cswritep, "REF\t");
226     }
227     const uint32_t alt1_col = flags & kfAdjustColAlt1;
228     if (alt1_col) {
229       cswritep = strcpya_k(cswritep, "ALT1\t");
230     }
231     const uint32_t alt_col = flags & kfAdjustColAlt;
232     if (alt_col) {
233       cswritep = strcpya_k(cswritep, "ALT\t");
234     }
235     const uint32_t a1_col = (flags & kfAdjustColA1) && (loaded_a1 || cip);
236     if (a1_col) {
237       cswritep = strcpya_k(cswritep, "A1\t");
238     }
239     const uint32_t is_neglog10 = flags & kfAdjustLog10;
240     const uint32_t unadj_col = flags & kfAdjustColUnadj;
241     if (unadj_col) {
242       if (is_neglog10) {
243         cswritep = strcpya_k(cswritep, "LOG10_");
244       }
245       cswritep = strcpya_k(cswritep, "UNADJ\t");
246     }
247     const uint32_t gc_col = (flags & kfAdjustColGc) && (!skip_gc);
248     if (gc_col) {
249       if (is_neglog10) {
250         cswritep = strcpya_k(cswritep, "LOG10_");
251       }
252       cswritep = strcpya_k(cswritep, "GC\t");
253     }
254     const uint32_t qq_col = flags & kfAdjustColQq;
255     if (qq_col) {
256       if (is_neglog10) {
257         cswritep = strcpya_k(cswritep, "LOG10_");
258       }
259       cswritep = strcpya_k(cswritep, "QQ\t");
260     }
261     const uint32_t bonf_col = flags & kfAdjustColBonf;
262     if (bonf_col) {
263       if (is_neglog10) {
264         cswritep = strcpya_k(cswritep, "LOG10_");
265       }
266       cswritep = strcpya_k(cswritep, "BONF\t");
267     }
268     const uint32_t holm_col = flags & kfAdjustColHolm;
269     if (holm_col) {
270       if (is_neglog10) {
271         cswritep = strcpya_k(cswritep, "LOG10_");
272       }
273       cswritep = strcpya_k(cswritep, "HOLM\t");
274     }
275     const uint32_t sidakss_col = flags & kfAdjustColSidakss;
276     if (sidakss_col) {
277       if (is_neglog10) {
278         cswritep = strcpya_k(cswritep, "LOG10_");
279       }
280       cswritep = strcpya_k(cswritep, "SIDAK_SS\t");
281     }
282     const uint32_t sidaksd_col = flags & kfAdjustColSidaksd;
283     if (sidaksd_col) {
284       if (is_neglog10) {
285         cswritep = strcpya_k(cswritep, "LOG10_");
286       }
287       cswritep = strcpya_k(cswritep, "SIDAK_SD\t");
288     }
289     const uint32_t fdrbh_col = flags & kfAdjustColFdrbh;
290     if (fdrbh_col) {
291       if (is_neglog10) {
292         cswritep = strcpya_k(cswritep, "LOG10_");
293       }
294       cswritep = strcpya_k(cswritep, "FDR_BH\t");
295     }
296     double* ln_pv_by = nullptr;
297     if (flags & kfAdjustColFdrby) {
298       if (unlikely(bigstack_alloc_d(valid_allele_ct, &ln_pv_by))) {
299         goto Multcomp_ret_NOMEM;
300       }
301       if (is_neglog10) {
302         cswritep = strcpya_k(cswritep, "LOG10_");
303       }
304       cswritep = strcpya_k(cswritep, "FDR_BY\t");
305     }
306     DecrAppendBinaryEoln(&cswritep);
307 
308     // reverse-order calculations
309     double* ln_pv_bh;
310     double* ln_pv_gc;
311     double* unadj_sorted_ln_pvals;
312     if (unlikely(
313             bigstack_alloc_d(valid_allele_ct, &ln_pv_bh) ||
314             bigstack_alloc_d(valid_allele_ct, &ln_pv_gc) ||
315             bigstack_alloc_d(valid_allele_ct, &unadj_sorted_ln_pvals))) {
316       goto Multcomp_ret_NOMEM;
317     }
318 
319     STD_SORT_PAR_UNSEQ(valid_allele_ct, double_cmp, sortbuf);
320 
321     double lambda_recip = 1.0;
322     if (!skip_gc) {
323       if (adjust_info_ptr->lambda != 0.0) {
324         lambda_recip = 1.0 / adjust_info_ptr->lambda;
325       } else {
326         const uintptr_t valid_allele_ct_d2 = valid_allele_ct / 2;
327         double lambda = sortbuf[valid_allele_ct_d2].chisq;
328         if (!(valid_allele_ct % 2)) {
329           lambda = (lambda + sortbuf[valid_allele_ct_d2 - 1].chisq) * 0.5;
330         }
331         lambda = lambda / 0.456;
332         if (lambda < 1.0) {
333           lambda = 1.0;
334         }
335         logprintf("--adjust: Genomic inflation est. lambda (based on median chisq) = %g.\n", lambda);
336         lambda_recip = 1.0 / lambda;
337       }
338     }
339     double* sorted_ln_pvals = unadj_sorted_ln_pvals;
340     for (uintptr_t aidx = 0; aidx != valid_allele_ct; ++aidx) {
341       ln_pv_gc[aidx] = ChisqToLnP(sortbuf[aidx].chisq * lambda_recip, 1);
342       unadj_sorted_ln_pvals[aidx] = sortbuf[aidx].ln_pval;
343     }
344     if ((flags & kfAdjustGc) && (!skip_gc)) {
345       sorted_ln_pvals = ln_pv_gc;
346     }
347 
348     const uint32_t valid_allele_ct_m1 = valid_allele_ct - 1;
349     const double valid_allele_ctd = swtod(valid_allele_ct);
350     const double ln_valid_allele_ct = log(valid_allele_ctd);
351     double bh_ln_pval_min = sorted_ln_pvals[valid_allele_ct_m1];
352     ln_pv_bh[valid_allele_ct_m1] = bh_ln_pval_min;
353     double harmonic_sum = 1.0;
354     for (uint32_t aidx = valid_allele_ct_m1; aidx; --aidx) {
355       const double harmonic_term = valid_allele_ctd / u31tod(aidx);
356       harmonic_sum += harmonic_term;
357       const double bh_ln_pval = sorted_ln_pvals[aidx - 1] + log(harmonic_term);
358       if (bh_ln_pval_min > bh_ln_pval) {
359         bh_ln_pval_min = bh_ln_pval;
360       }
361       ln_pv_bh[aidx - 1] = bh_ln_pval_min;
362     }
363 
364     if (ln_pv_by) {
365       const double ln_harmonic_sum = log(harmonic_sum);
366       double by_ln_pval_min = sorted_ln_pvals[valid_allele_ct_m1] - ln_valid_allele_ct + ln_harmonic_sum;
367       if (by_ln_pval_min > 0.0) {
368         by_ln_pval_min = 0.0;
369       }
370       ln_pv_by[valid_allele_ct_m1] = by_ln_pval_min;
371       for (uint32_t aidx = valid_allele_ct_m1; aidx; --aidx) {
372         const double by_ln_pval = sorted_ln_pvals[aidx - 1] - log(u31tod(aidx)) + ln_harmonic_sum;
373         if (by_ln_pval_min > by_ln_pval) {
374           by_ln_pval_min = by_ln_pval;
375         }
376         ln_pv_by[aidx - 1] = by_ln_pval_min;
377       }
378     }
379 
380     char output_min_p_buf[24];
381     uint32_t output_min_p_slen;
382     if (!is_neglog10) {
383       char* str_end = lntoa_g(output_min_ln, output_min_p_buf);
384       output_min_p_slen = str_end - output_min_p_buf;
385     } else {
386       // -log10(p) output ignores --output-min-p now.
387       // instead, set to maximum int32 to distinguish from plink 1.x's
388       // much-less-extreme 'inf'.
389       strcpy_k(output_min_p_buf, "2147483647");
390       output_min_p_slen = 10;
391     }
392     const double valid_allele_ct_recip = 1.0 / valid_allele_ctd;
393     double ln_pv_sidak_sd = -DBL_MAX;
394     double ln_pv_holm = -DBL_MAX;
395     uint32_t cur_allele_ct = 2;
396     uint32_t aidx = 0;
397     for (; aidx < valid_allele_ct; ++aidx) {
398       double ln_pval = sorted_ln_pvals[aidx];
399       if (ln_pval > ln_pfilter) {
400         break;
401       }
402       const uint32_t variant_uidx = sortbuf[aidx].variant_uidx;
403       if (chr_col) {
404         if (cip) {
405           cswritep = chrtoa(cip, GetVariantChr(cip, variant_uidx), cswritep);
406         } else {
407           cswritep = strcpya(cswritep, chr_ids[variant_uidx]);
408         }
409         *cswritep++ = '\t';
410       }
411       if (variant_bps) {
412         cswritep = u32toa_x(variant_bps[variant_uidx], '\t', cswritep);
413       }
414       cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
415       uintptr_t allele_idx_offset_base = variant_uidx * 2;
416       if (allele_idx_offsets) {
417         allele_idx_offset_base = allele_idx_offsets[variant_uidx];
418         cur_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
419       }
420       const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
421       if (ref_col) {
422         *cswritep++ = '\t';
423         cswritep = strcpya(cswritep, cur_alleles[0]);
424       }
425       if (alt1_col) {
426         *cswritep++ = '\t';
427         cswritep = strcpya(cswritep, cur_alleles[1]);
428       }
429       if (alt_col) {
430         *cswritep++ = '\t';
431         for (uint32_t allele_idx = 1; allele_idx != cur_allele_ct; ++allele_idx) {
432           if (unlikely(Cswrite(&css, &cswritep))) {
433             goto Multcomp_ret_WRITE_FAIL;
434           }
435           cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
436         }
437         --cswritep;
438       }
439       if (a1_col) {
440         *cswritep++ = '\t';
441         const char* cur_allele;
442         if (loaded_a1) {
443           // --adjust-file hack
444           cur_allele = loaded_a1[variant_uidx];
445         } else {
446           cur_allele = cur_alleles[sortbuf[aidx].allele_idx];
447         }
448         cswritep = strcpya(cswritep, cur_allele);
449       }
450       if (unadj_col) {
451         adjust_print_ln(output_min_p_buf, unadj_sorted_ln_pvals[aidx], output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
452       }
453       if (gc_col) {
454         adjust_print_ln(output_min_p_buf, ln_pv_gc[aidx], output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
455       }
456       const double aidx_d = swtod(aidx);
457       if (qq_col) {
458         *cswritep++ = '\t';
459         double qq_val = (aidx_d + 0.5) * valid_allele_ct_recip;
460         if (is_neglog10) {
461           qq_val = -log10(qq_val);
462         }
463         cswritep = dtoa_g(qq_val, cswritep);
464       }
465       if (bonf_col) {
466         double bonf_ln_pval = ln_pval + ln_valid_allele_ct;
467         if (bonf_ln_pval > 0.0) {
468           bonf_ln_pval = 0.0;
469         }
470         adjust_print_ln(output_min_p_buf, bonf_ln_pval, output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
471       }
472       if (holm_col) {
473         if (ln_pv_holm < 0.0) {
474           const double ln_pv_holm_new = ln_pval + log(u31tod(valid_allele_ct - aidx));
475           if (ln_pv_holm_new > 0.0) {
476             ln_pv_holm = 0.0;
477           } else if (ln_pv_holm < ln_pv_holm_new) {
478             ln_pv_holm = ln_pv_holm_new;
479           }
480         }
481         adjust_print_ln(output_min_p_buf, ln_pv_holm, output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
482       }
483       if (sidakss_col) {
484         // avoid catastrophic cancellation for small p-values
485         // 1 - (1-p)^c = 1 - e^{c log(1-p)}
486         // 2^{-7} threshold is arbitrary
487         // 2^{-90} corresponds to cp + (cp)^2/2! == cp in double-precision
488         // arithmetic, with several bits to spare
489         double ln_pv_sidak_ss;
490         if (ln_pval > -90 * kLn2) {
491           const double pval = exp(ln_pval);
492           double pv_sidak_ss;
493           if (ln_pval >= -7 * kLn2) {
494             pv_sidak_ss = 1 - pow(1 - pval, valid_allele_ctd);
495           } else {
496             pv_sidak_ss = 1 - exp(valid_allele_ctd * log1p(-pval));
497           }
498           ln_pv_sidak_ss = log(pv_sidak_ss);
499         } else {
500           // log(1-x) = -x - x^2/2 - x^3/3 + ...
501           // 1 - exp(x) = -x - x^2/2! - x^3/3! - ...
502           // if p <= 2^{-90},
503           //   log(1-p) is -p
504           //   1 - e^{-cp} is cp
505           ln_pv_sidak_ss = ln_pval + ln_valid_allele_ct;
506         }
507         adjust_print_ln(output_min_p_buf, ln_pv_sidak_ss, output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
508       }
509       if (sidaksd_col) {
510         double ln_pv_sidak_sd_new;
511         if (ln_pval > -90 * kLn2) {
512           const double pval = exp(ln_pval);
513           double pv_sidak_sd_new;
514           if (ln_pval >= -7 * kLn2) {
515             pv_sidak_sd_new = 1 - pow(1 - pval, valid_allele_ctd - aidx_d);
516           } else {
517             const double cur_exp = valid_allele_ctd - aidx_d;
518             pv_sidak_sd_new = 1 - exp(cur_exp * log1p(-pval));
519           }
520           ln_pv_sidak_sd_new = log(pv_sidak_sd_new);
521         } else {
522           ln_pv_sidak_sd_new = ln_pval + log(valid_allele_ctd - aidx_d);
523         }
524         if (ln_pv_sidak_sd < ln_pv_sidak_sd_new) {
525           ln_pv_sidak_sd = ln_pv_sidak_sd_new;
526         }
527         adjust_print_ln(output_min_p_buf, ln_pv_sidak_sd, output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
528       }
529       if (fdrbh_col) {
530         adjust_print_ln(output_min_p_buf, ln_pv_bh[aidx], output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
531       }
532       if (ln_pv_by) {
533         adjust_print_ln(output_min_p_buf, ln_pv_by[aidx], output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
534       }
535       AppendBinaryEoln(&cswritep);
536       if (unlikely(Cswrite(&css, &cswritep))) {
537         goto Multcomp_ret_WRITE_FAIL;
538       }
539     }
540     if (unlikely(CswriteCloseNull(&css, cswritep))) {
541       goto Multcomp_ret_WRITE_FAIL;
542     }
543     // don't use valid_allele_ct due to --pfilter
544     logprintfww("--adjust%s values (%" PRIuPTR " test%s) written to %s .\n", cip? "" : "-file", aidx, (aidx == 1)? "" : "s", outname);
545   }
546   while (0) {
547   Multcomp_ret_NOMEM:
548     reterr = kPglRetNomem;
549     break;
550   Multcomp_ret_WRITE_FAIL:
551     reterr = kPglRetWriteFail;
552     break;
553   }
554  Multcomp_ret_1:
555   CswriteCloseCond(&css, cswritep);
556   BigstackReset(bigstack_mark);
557   return reterr;
558 }
559 
AdjustFile(const AdjustFileInfo * afip,double ln_pfilter,double output_min_ln,uint32_t max_thread_ct,char * outname,char * outname_end)560 PglErr AdjustFile(const AdjustFileInfo* afip, double ln_pfilter, double output_min_ln, uint32_t max_thread_ct, char* outname, char* outname_end) {
561   unsigned char* bigstack_mark = g_bigstack_base;
562   unsigned char* bigstack_end_mark = g_bigstack_end;
563   const char* in_fname = afip->fname;
564   uintptr_t line_idx = 0;
565   PglErr reterr = kPglRetSuccess;
566   TextStream adjust_txs;
567   PreinitTextStream(&adjust_txs);
568   {
569     // Two-pass load.
570     // 1. Parse header line, count # of variants.
571     // intermission. Allocate top-level arrays.
572     // 2. Rewind and fill arrays.
573     // (some overlap with LoadPvar(), though that's one-pass.)
574     reterr = SizeAndInitTextStream(in_fname, bigstack_left() / 4, max_thread_ct, &adjust_txs);
575     if (unlikely(reterr)) {
576       goto AdjustFile_ret_1;
577     }
578 
579     const char* header_start;
580     do {
581       ++line_idx;
582       header_start = TextGet(&adjust_txs);
583       if (unlikely(!header_start)) {
584         reterr = TextStreamRawErrcode(&adjust_txs);
585         if (reterr == kPglRetEof) {
586           snprintf(g_logbuf, kLogbufSize, "Error: %s is empty.\n", in_fname);
587           goto AdjustFile_ret_MALFORMED_INPUT_WW;
588         }
589         goto AdjustFile_ret_TSTREAM_FAIL;
590       }
591     } while (strequal_k_unsafe(header_start, "##"));
592     if (*header_start == '#') {
593       ++header_start;
594     }
595 
596     const AdjustFlags flags = afip->base.flags;
597     // [0] = CHROM
598     // [1] = POS
599     // [2] = ID (required)
600     // [3] = REF
601     // [4] = ALT
602     // [5] = A1
603     // [6] = TEST (always scan)
604     // [7] = P (required)
605     const char* col_search_order[8];
606     const uint32_t need_chr = (flags & kfAdjustColChrom);
607     const uint32_t need_pos = (flags & kfAdjustColPos);
608     const uint32_t need_ref = (flags & kfAdjustColRef);
609     const uint32_t need_alt = (flags & (kfAdjustColAlt1 | kfAdjustColAlt));
610     uint32_t check_a1 = (flags & kfAdjustColA1);
611     const uint32_t alt_comma_truncate = (need_alt == kfAdjustColAlt1);
612     if (unlikely(need_alt == (kfAdjustColAlt1 | kfAdjustColAlt))) {
613       // Could theoretically support this later (allocate allele_idx_offsets
614       // and count # of multiallelic variants in first pass, etc.), but
615       // unlikely to be relevant.
616       // (For now, we abuse allele_storage by storing all comma-separated alt
617       // alleles in a single string, since Multcomp() is ok with that.)
618       logerrputs("Error: --adjust-file does not currently support simultaneous alt1 and alt\ncolumn output.\n");
619       goto AdjustFile_ret_INVALID_CMDLINE;
620     }
621     col_search_order[0] = need_chr? (afip->chr_field? afip->chr_field : "CHROM\0CHR\0") : "";
622     col_search_order[1] = need_pos? (afip->pos_field? afip->pos_field : "POS\0BP\0") : "";
623     col_search_order[2] = afip->id_field? afip->id_field : "ID\0SNP\0";
624     col_search_order[3] = need_ref? (afip->ref_field? afip->ref_field : "REF\0A2\0") : "";
625     col_search_order[4] = need_alt? (afip->alt_field? afip->alt_field : "ALT\0ALT1\0") : "";
626     col_search_order[5] = check_a1? (afip->a1_field? afip->a1_field : "A1\0") : "";
627     col_search_order[6] = afip->test_field? afip->test_field : "TEST\0";
628     const uint32_t input_log10 = (flags & kfAdjustInputLog10);
629     col_search_order[7] = afip->p_field? afip->p_field : (input_log10? "LOG10_P\0LOG10_UNADJ\0P\0UNADJ\0" : "P\0UNADJ\0");
630 
631     uint32_t col_skips[8];
632     uint32_t col_types[8];
633     uint32_t relevant_col_ct;
634     uint32_t found_type_bitset;
635     reterr = SearchHeaderLine(header_start, col_search_order, "adjust-file", 8, &relevant_col_ct, &found_type_bitset, col_skips, col_types);
636     if (unlikely(reterr)) {
637       goto AdjustFile_ret_1;
638     }
639     if (unlikely((found_type_bitset & 0x44) != 0x44)) {
640       logerrputs("Error: --adjust-file requires ID and P columns.\n");
641       goto AdjustFile_ret_INCONSISTENT_INPUT;
642     }
643     const char* test_name = afip->test_name;
644     uint32_t test_name_slen = 0;
645     uint32_t test_col_idx = 0;
646     if (test_name) {
647       test_name_slen = strlen(test_name);
648       // this duplicates a bit of work done in SearchHeaderLine(), but not a
649       // big deal
650       for (uint32_t relevant_col_idx = 0; ; ++relevant_col_idx) {
651         test_col_idx += col_skips[relevant_col_idx];
652         if (col_types[relevant_col_idx] == 6) {
653           break;
654         }
655       }
656     } else if (unlikely(found_type_bitset & 0x40)) {
657       snprintf(g_logbuf, kLogbufSize, "Error: TEST column present in %s, but no test= parameter was provided to --adjust-file.\n", in_fname);
658       goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
659     }
660     if (unlikely(need_chr && (!(found_type_bitset & 0x1)))) {
661       snprintf(g_logbuf, kLogbufSize, "Error: No chromosome column in %s.\n", in_fname);
662       goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
663     }
664     if (unlikely(need_pos && (!(found_type_bitset & 0x2)))) {
665       snprintf(g_logbuf, kLogbufSize, "Error: No bp coordinate column in %s.\n", in_fname);
666       goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
667     }
668     if (unlikely(need_ref && (!(found_type_bitset & 0x8)))) {
669       snprintf(g_logbuf, kLogbufSize, "Error: No REF column in %s.\n", in_fname);
670       goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
671     }
672     if (unlikely(need_alt && (!(found_type_bitset & 0x10)))) {
673       snprintf(g_logbuf, kLogbufSize, "Error: No ALT column in %s.\n", in_fname);
674       goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
675     }
676     if (unlikely(check_a1 && (!(found_type_bitset & 0x20)))) {
677       snprintf(g_logbuf, kLogbufSize, "Warning: No A1 column in %s. Omitting from output.\n", in_fname);
678       check_a1 = 0;
679     }
680 
681     uintptr_t entry_ct = 0;
682     while (1) {
683       ++line_idx;
684       const char* line_start = TextGet(&adjust_txs);
685       if (!line_start) {
686         if (likely(!TextStreamErrcode2(&adjust_txs, &reterr))) {
687           break;
688         }
689         goto AdjustFile_ret_TSTREAM_FAIL;
690       }
691       if (test_name) {
692         // Don't count different-test entries.
693         const char* test_name_start = NextTokenMult0(line_start, test_col_idx);
694         if (unlikely(!test_name_start)) {
695           goto AdjustFile_ret_MISSING_TOKENS;
696         }
697         const uint32_t cur_test_slen = strlen_se(test_name_start);
698         if ((cur_test_slen != test_name_slen) || (!memequal(test_name_start, test_name, test_name_slen))) {
699           continue;
700         }
701       }
702       ++entry_ct;
703     }
704 #ifdef __LP64__
705     // probably want to permit this soon
706     if (entry_ct > 0xffffffffU) {
707       logerrputs("Error: Too many entries for --adjust-file.\n");
708       reterr = kPglRetNotYetSupported;
709       goto AdjustFile_ret_1;
710     }
711 #endif
712 
713     reterr = TextRewind(&adjust_txs);
714     if (unlikely(reterr)) {
715       goto AdjustFile_ret_TSTREAM_FAIL;
716     }
717     const uintptr_t line_ct = line_idx - 1;
718     line_idx = 0;
719     do {
720       ++line_idx;
721       reterr = TextNextLineLstripK(&adjust_txs, &header_start);
722       if (unlikely(reterr)) {
723         goto AdjustFile_ret_TSTREAM_REWIND_FAIL;
724       }
725     } while (strequal_k_unsafe(header_start, "##"));
726 
727     const uintptr_t entry_ctl = BitCtToWordCt(entry_ct);
728     const uintptr_t entry_ctl2 = NypCtToWordCt(entry_ct);
729     uintptr_t* variant_include_dummy;
730     uintptr_t* allele_include_dummy;
731     if (unlikely(
732             bigstack_alloc_w(entry_ctl, &variant_include_dummy) ||
733             bigstack_alloc_w(entry_ctl2, &allele_include_dummy))) {
734       goto AdjustFile_ret_NOMEM;
735     }
736     SetAllBits(entry_ct, variant_include_dummy);
737     for (uintptr_t ulii = 0; ulii != entry_ct / kBitsPerWordD2; ++ulii) {
738       allele_include_dummy[ulii] = kMaskAAAA;
739     }
740     const uint32_t remainder = entry_ct % kBitsPerWordD2;
741     if (remainder) {
742       allele_include_dummy[entry_ct / kBitsPerWordD2] = kMaskAAAA >> (2 * (kBitsPerWordD2 - remainder));
743     }
744     char** chr_ids;
745     if (need_chr) {
746       if (unlikely(bigstack_alloc_cp(entry_ct, &chr_ids))) {
747         goto AdjustFile_ret_NOMEM;
748       }
749     } else {
750       chr_ids = nullptr;
751     }
752     uint32_t* variant_bps;
753     if (need_pos) {
754       if (unlikely(bigstack_alloc_u32(entry_ct, &variant_bps))) {
755         goto AdjustFile_ret_NOMEM;
756       }
757     } else {
758       variant_bps = nullptr;
759     }
760     char** variant_ids;
761     double* ln_pvals;
762     if (unlikely(
763             bigstack_alloc_cp(entry_ct, &variant_ids) ||
764             bigstack_alloc_d(entry_ct, &ln_pvals))) {
765       goto AdjustFile_ret_NOMEM;
766     }
767     char** allele_storage;
768     if (need_ref || need_alt) {
769       if (unlikely(bigstack_alloc_cp(entry_ct * 2, &allele_storage))) {
770         goto AdjustFile_ret_NOMEM;
771       }
772     } else {
773       allele_storage = nullptr;
774     }
775     char** a1_storage;
776     if (check_a1) {
777       if (unlikely(bigstack_alloc_cp(entry_ct, &a1_storage))) {
778         goto AdjustFile_ret_NOMEM;
779       }
780     } else {
781       a1_storage = nullptr;
782     }
783     unsigned char* tmp_alloc_base = g_bigstack_base;
784     unsigned char* tmp_alloc_end = g_bigstack_end;
785     uint32_t max_allele_slen = 1;
786     uintptr_t variant_idx = 0;
787     while (line_idx < line_ct) {
788       ++line_idx;
789       const char* line_start;
790       reterr = TextNextLineLstripK(&adjust_txs, &line_start);
791       if (unlikely(reterr)) {
792         goto AdjustFile_ret_TSTREAM_REWIND_FAIL;
793       }
794       const char* token_ptrs[8];
795       uint32_t token_slens[8];
796       if (unlikely(!TokenLexK0(line_start, col_types, col_skips, relevant_col_ct, token_ptrs, token_slens))) {
797         goto AdjustFile_ret_MISSING_TOKENS;
798       }
799       if (test_name) {
800         if ((token_slens[6] != test_name_slen) || (!memequal(token_ptrs[6], test_name, test_name_slen))) {
801           continue;
802         }
803       }
804       if (chr_ids) {
805         const uint32_t cur_slen = token_slens[0];
806         if (StoreStringAtBase(tmp_alloc_end, token_ptrs[0], cur_slen, &tmp_alloc_base, &(chr_ids[variant_idx]))) {
807           goto AdjustFile_ret_NOMEM;
808         }
809       }
810       if (variant_bps) {
811         if (unlikely(ScanUintDefcap(token_ptrs[1], &(variant_bps[variant_idx])))) {
812           snprintf(g_logbuf, kLogbufSize, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, in_fname);
813           goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
814         }
815       }
816       const uint32_t id_slen = token_slens[2];
817       if (StoreStringAtBase(tmp_alloc_end, token_ptrs[2], id_slen, &tmp_alloc_base, &(variant_ids[variant_idx]))) {
818         goto AdjustFile_ret_NOMEM;
819       }
820       if (need_ref) {
821         const uint32_t cur_slen = token_slens[3];
822         if (StoreStringAtBase(tmp_alloc_end, token_ptrs[3], cur_slen, &tmp_alloc_base, &(allele_storage[2 * variant_idx]))) {
823           goto AdjustFile_ret_NOMEM;
824         }
825       }
826       if (need_alt) {
827         const char* alt_str = token_ptrs[4];
828         uint32_t cur_slen = token_slens[4];
829         if (alt_comma_truncate) {
830           const char* alt_comma = S_CAST(const char*, memchr(alt_str, ',', cur_slen));
831           if (alt_comma) {
832             cur_slen = alt_comma - alt_str;
833           }
834         }
835         if (StoreStringAtBase(tmp_alloc_end, alt_str, cur_slen, &tmp_alloc_base, &(allele_storage[2 * variant_idx + 1]))) {
836           goto AdjustFile_ret_NOMEM;
837         }
838       }
839       if (check_a1) {
840         const uint32_t cur_slen = token_slens[5];
841         if (StoreStringAtBase(tmp_alloc_end, token_ptrs[5], cur_slen, &tmp_alloc_base, &(a1_storage[variant_idx]))) {
842           goto AdjustFile_ret_NOMEM;
843         }
844       }
845       const char* pval_str = token_ptrs[7];
846       double ln_pval;
847       if (!input_log10) {
848         if (!ScantokLn(pval_str, &ln_pval)) {
849           uint32_t cur_slen;
850         AdjustFile_alphabetic_pval:
851           cur_slen = token_slens[7];
852           if (IsNanStr(pval_str, cur_slen)) {
853             ln_pval = kLnPvalError;
854           } else if (likely(
855                        strequal_k(pval_str, "INF", cur_slen) ||
856                        (input_log10 && strequal_k(pval_str, "inf", cur_slen)))) {
857             // From plink 1.x, could be anything smaller than log(5e-324).
858             // Just fill with log(2.23e-308) for now.
859             ln_pval = kLnNormalMin;
860           } else {
861             goto AdjustFile_ret_INVALID_PVAL;
862           }
863         }
864       } else {
865         double neglog10_pval;
866         if (!ScantokDouble(pval_str, &neglog10_pval)) {
867           goto AdjustFile_alphabetic_pval;
868         }
869         ln_pval = neglog10_pval * (-kLn10);
870         if (unlikely(ln_pval > 0.0)) {
871           goto AdjustFile_ret_INVALID_PVAL;
872         }
873       }
874       ln_pvals[variant_idx] = ln_pval;
875       ++variant_idx;
876     }
877     BigstackEndReset(bigstack_end_mark);
878     BigstackBaseSet(tmp_alloc_base);
879     reterr = Multcomp(variant_include_dummy, nullptr, TO_CONSTCPCONSTP(chr_ids), variant_bps, TO_CONSTCPCONSTP(variant_ids), allele_include_dummy, nullptr, TO_CONSTCPCONSTP(allele_storage), TO_CONSTCPCONSTP(a1_storage), &(afip->base), ln_pvals, nullptr, entry_ct, max_allele_slen, ln_pfilter, output_min_ln, 0, max_thread_ct, outname, outname_end);
880     if (unlikely(reterr)) {
881       goto AdjustFile_ret_1;
882     }
883   }
884   while (0) {
885   AdjustFile_ret_NOMEM:
886     reterr = kPglRetNomem;
887     break;
888   AdjustFile_ret_TSTREAM_FAIL:
889     TextStreamErrPrint(in_fname, &adjust_txs);
890     break;
891   AdjustFile_ret_TSTREAM_REWIND_FAIL:
892     TextStreamErrPrintRewind(in_fname, &adjust_txs, &reterr);
893     break;
894   AdjustFile_ret_INVALID_CMDLINE:
895     reterr = kPglRetInvalidCmdline;
896     break;
897   AdjustFile_ret_MALFORMED_INPUT_WW:
898     WordWrapB(0);
899     logerrputsb();
900     reterr = kPglRetMalformedInput;
901     break;
902   AdjustFile_ret_MISSING_TOKENS:
903     snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, in_fname);
904   AdjustFile_ret_INCONSISTENT_INPUT_WW:
905     WordWrapB(0);
906     logerrputsb();
907   AdjustFile_ret_INCONSISTENT_INPUT:
908     reterr = kPglRetInconsistentInput;
909     break;
910   AdjustFile_ret_INVALID_PVAL:
911     logerrprintfww("Error: Invalid p-value on line %" PRIuPTR " of %s.\n", line_idx, in_fname);
912     reterr = kPglRetInconsistentInput;
913     break;
914   }
915  AdjustFile_ret_1:
916   CleanupTextStream2(in_fname, &adjust_txs, &reterr);
917   BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
918   return reterr;
919 }
920 
921 #ifdef __cplusplus
922 }  // namespace plink2
923 #endif
924