1 // This file is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License as published by the Free
6 // Software Foundation, either version 3 of the License, or (at your option)
7 // any later version.
8 //
9 // This program is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 // more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 #include "include/plink2_stats.h"
18 #include "plink2_adjust.h"
19 #include "plink2_compress_stream.h"
20
21 #ifdef __cplusplus
22 namespace plink2 {
23 #endif
24
InitAdjust(AdjustInfo * adjust_info_ptr,AdjustFileInfo * adjust_file_info_ptr)25 void InitAdjust(AdjustInfo* adjust_info_ptr, AdjustFileInfo* adjust_file_info_ptr) {
26 adjust_info_ptr->flags = kfAdjust0;
27 adjust_info_ptr->lambda = 0.0;
28 adjust_file_info_ptr->base.flags = kfAdjust0;
29 adjust_file_info_ptr->base.lambda = 0.0;
30 adjust_file_info_ptr->fname = nullptr;
31 adjust_file_info_ptr->test_name = nullptr;
32 adjust_file_info_ptr->chr_field = nullptr;
33 adjust_file_info_ptr->pos_field = nullptr;
34 adjust_file_info_ptr->id_field = nullptr;
35 adjust_file_info_ptr->ref_field = nullptr;
36 adjust_file_info_ptr->alt_field = nullptr;
37 adjust_file_info_ptr->a1_field = nullptr;
38 adjust_file_info_ptr->test_field = nullptr;
39 adjust_file_info_ptr->p_field = nullptr;
40 }
41
CleanupAdjust(AdjustFileInfo * adjust_file_info_ptr)42 void CleanupAdjust(AdjustFileInfo* adjust_file_info_ptr) {
43 free_cond(adjust_file_info_ptr->a1_field);
44 free_cond(adjust_file_info_ptr->alt_field);
45 free_cond(adjust_file_info_ptr->chr_field);
46 if (adjust_file_info_ptr->fname) {
47 free(adjust_file_info_ptr->fname);
48 free_cond(adjust_file_info_ptr->pos_field);
49 free_cond(adjust_file_info_ptr->id_field);
50 free_cond(adjust_file_info_ptr->ref_field);
51 free_cond(adjust_file_info_ptr->test_field);
52 free_cond(adjust_file_info_ptr->p_field);
53 }
54 }
55
56 typedef struct AdjAssocResultStruct {
57 double ln_pval;
58 double chisq; // do we really need this?...
59 uint32_t variant_uidx;
60 uint32_t allele_idx;
61 #ifdef __cplusplus
operator <plink2::AdjAssocResultStruct62 bool operator<(const struct AdjAssocResultStruct& rhs) const {
63 return ln_pval < rhs.ln_pval;
64 }
65 #endif
66 } AdjAssocResult;
67
adjust_print_ln(const char * output_min_p_str,double ln_pval,double output_min_ln,uint32_t output_min_p_slen,uint32_t is_neglog10,char ** bufpp)68 static inline void adjust_print_ln(const char* output_min_p_str, double ln_pval, double output_min_ln, uint32_t output_min_p_slen, uint32_t is_neglog10, char** bufpp) {
69 **bufpp = '\t';
70 *bufpp += 1;
71 if (ln_pval <= output_min_ln) {
72 *bufpp = memcpya(*bufpp, output_min_p_str, output_min_p_slen);
73 } else {
74 if (!is_neglog10) {
75 *bufpp = lntoa_g(ln_pval, *bufpp);
76 } else {
77 *bufpp = dtoa_g(ln_pval * (-1.0 / kLn10), *bufpp);
78 }
79 }
80 }
81
82 // Now based around ln_pvals, to allow useful comparisons < 2.23e-308.
Multcomp(const uintptr_t * variant_include,const ChrInfo * cip,const char * const * chr_ids,const uint32_t * variant_bps,const char * const * variant_ids,const uintptr_t * allele_include,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,const char * const * loaded_a1,const AdjustInfo * adjust_info_ptr,const double * ln_pvals,const double * chisqs,uintptr_t orig_allele_ct,uint32_t max_allele_slen,double ln_pfilter,double output_min_ln,uint32_t skip_gc,uint32_t max_thread_ct,char * outname,char * outname_end)83 PglErr Multcomp(const uintptr_t* variant_include, const ChrInfo* cip, const char* const* chr_ids, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_include, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const char* const* loaded_a1, const AdjustInfo* adjust_info_ptr, const double* ln_pvals, const double* chisqs, uintptr_t orig_allele_ct, uint32_t max_allele_slen, double ln_pfilter, double output_min_ln, uint32_t skip_gc, uint32_t max_thread_ct, char* outname, char* outname_end) {
84 unsigned char* bigstack_mark = g_bigstack_base;
85 char* cswritep = nullptr;
86 CompressStreamState css;
87 PglErr reterr = kPglRetSuccess;
88 PreinitCstream(&css);
89 {
90 AdjAssocResult* sortbuf;
91 if (unlikely(BIGSTACK_ALLOC_X(AdjAssocResult, orig_allele_ct, &sortbuf))) {
92 goto Multcomp_ret_NOMEM;
93 }
94 uintptr_t valid_allele_ct = 0;
95 uintptr_t allele_uidx_base = 0;
96 uintptr_t allele_include_bits = allele_include[0];
97 if (!allele_idx_offsets) {
98 if (chisqs) {
99 if (ln_pvals) {
100 for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
101 const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
102 const double cur_chisq = chisqs[aidx];
103 if (cur_chisq >= 0.0) {
104 sortbuf[valid_allele_ct].chisq = cur_chisq;
105 sortbuf[valid_allele_ct].ln_pval = ln_pvals[aidx];
106 sortbuf[valid_allele_ct].variant_uidx = allele_uidx / 2;
107 sortbuf[valid_allele_ct].allele_idx = allele_uidx % 2;
108 ++valid_allele_ct;
109 }
110 }
111 } else {
112 for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
113 const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
114 const double cur_chisq = chisqs[aidx];
115 if (cur_chisq >= 0.0) {
116 sortbuf[valid_allele_ct].chisq = cur_chisq;
117 sortbuf[valid_allele_ct].ln_pval = ChisqToLnP(cur_chisq, 1);
118 sortbuf[valid_allele_ct].variant_uidx = allele_uidx / 2;
119 sortbuf[valid_allele_ct].allele_idx = allele_uidx % 2;
120 ++valid_allele_ct;
121 }
122 }
123 }
124 } else {
125 for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
126 const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
127 const double cur_ln_pval = ln_pvals[aidx];
128 if (cur_ln_pval <= 0.0) {
129 sortbuf[valid_allele_ct].chisq = LnPToChisq(cur_ln_pval);
130 sortbuf[valid_allele_ct].ln_pval = cur_ln_pval;
131 sortbuf[valid_allele_ct].variant_uidx = allele_uidx / 2;
132 sortbuf[valid_allele_ct].allele_idx = allele_uidx % 2;
133 ++valid_allele_ct;
134 }
135 }
136 }
137 } else {
138 uintptr_t variant_uidx_base = 0;
139 uintptr_t variant_include_bits = variant_include[0];
140 uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &variant_include_bits);
141 uintptr_t allele_idx_offset_start = allele_idx_offsets[variant_uidx];
142 uintptr_t allele_idx_offset_end = allele_idx_offsets[variant_uidx + 1];
143 if (chisqs) {
144 if (ln_pvals) {
145 for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
146 const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
147 if (allele_uidx >= allele_idx_offset_end) {
148 variant_uidx = BitIter1(variant_include, &variant_uidx_base, &variant_include_bits);
149 allele_idx_offset_start = allele_idx_offsets[variant_uidx];
150 allele_idx_offset_end = allele_idx_offsets[variant_uidx + 1];
151 }
152 const double cur_chisq = chisqs[aidx];
153 if (cur_chisq >= 0.0) {
154 sortbuf[valid_allele_ct].chisq = cur_chisq;
155 sortbuf[valid_allele_ct].ln_pval = ln_pvals[aidx];
156 sortbuf[valid_allele_ct].variant_uidx = variant_uidx;
157 sortbuf[valid_allele_ct].allele_idx = allele_uidx - allele_idx_offset_start;
158 ++valid_allele_ct;
159 }
160 }
161 } else {
162 for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
163 const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
164 if (allele_uidx >= allele_idx_offset_end) {
165 variant_uidx = BitIter1(variant_include, &variant_uidx_base, &variant_include_bits);
166 allele_idx_offset_start = allele_idx_offsets[variant_uidx];
167 allele_idx_offset_end = allele_idx_offsets[variant_uidx + 1];
168 }
169 const double cur_chisq = chisqs[aidx];
170 if (cur_chisq >= 0.0) {
171 sortbuf[valid_allele_ct].chisq = cur_chisq;
172 sortbuf[valid_allele_ct].ln_pval = ChisqToLnP(cur_chisq, 1);
173 sortbuf[valid_allele_ct].variant_uidx = variant_uidx;
174 sortbuf[valid_allele_ct].allele_idx = allele_uidx - allele_idx_offset_start;
175 ++valid_allele_ct;
176 }
177 }
178 }
179 } else {
180 for (uintptr_t aidx = 0; aidx != orig_allele_ct; ++aidx) {
181 const uintptr_t allele_uidx = BitIter1(allele_include, &allele_uidx_base, &allele_include_bits);
182 if (allele_uidx >= allele_idx_offset_end) {
183 variant_uidx = BitIter1(variant_include, &variant_uidx_base, &variant_include_bits);
184 allele_idx_offset_start = allele_idx_offsets[variant_uidx];
185 allele_idx_offset_end = allele_idx_offsets[variant_uidx + 1];
186 }
187 const double cur_ln_pval = ln_pvals[aidx];
188 if (cur_ln_pval <= 0.0) {
189 sortbuf[valid_allele_ct].chisq = LnPToChisq(cur_ln_pval);
190 sortbuf[valid_allele_ct].ln_pval = cur_ln_pval;
191 sortbuf[valid_allele_ct].variant_uidx = variant_uidx;
192 sortbuf[valid_allele_ct].allele_idx = allele_uidx - allele_idx_offset_start;
193 ++valid_allele_ct;
194 }
195 }
196 }
197 }
198 if (!valid_allele_ct) {
199 logputs("Zero valid tests; --adjust skipped.\n");
200 goto Multcomp_ret_1;
201 }
202 BigstackShrinkTop(sortbuf, valid_allele_ct * sizeof(AdjAssocResult));
203
204 const uintptr_t overflow_buf_size = kCompressStreamBlock + 2 * kMaxIdSlen + 512 + 2 * max_allele_slen;
205 const AdjustFlags flags = adjust_info_ptr->flags;
206 const uint32_t output_zst = flags & kfAdjustZs;
207 OutnameZstSet(".adjusted", output_zst, outname_end);
208 reterr = InitCstreamAlloc(outname, 0, output_zst, max_thread_ct, overflow_buf_size, &css, &cswritep);
209 if (unlikely(reterr)) {
210 goto Multcomp_ret_1;
211 }
212 *cswritep++ = '#';
213 const uint32_t chr_col = flags & kfAdjustColChrom;
214 if (chr_col) {
215 cswritep = strcpya_k(cswritep, "CHROM\t");
216 }
217 if (flags & kfAdjustColPos) {
218 cswritep = strcpya_k(cswritep, "POS\t");
219 } else {
220 variant_bps = nullptr;
221 }
222 cswritep = strcpya_k(cswritep, "ID\t");
223 const uint32_t ref_col = flags & kfAdjustColRef;
224 if (ref_col) {
225 cswritep = strcpya_k(cswritep, "REF\t");
226 }
227 const uint32_t alt1_col = flags & kfAdjustColAlt1;
228 if (alt1_col) {
229 cswritep = strcpya_k(cswritep, "ALT1\t");
230 }
231 const uint32_t alt_col = flags & kfAdjustColAlt;
232 if (alt_col) {
233 cswritep = strcpya_k(cswritep, "ALT\t");
234 }
235 const uint32_t a1_col = (flags & kfAdjustColA1) && (loaded_a1 || cip);
236 if (a1_col) {
237 cswritep = strcpya_k(cswritep, "A1\t");
238 }
239 const uint32_t is_neglog10 = flags & kfAdjustLog10;
240 const uint32_t unadj_col = flags & kfAdjustColUnadj;
241 if (unadj_col) {
242 if (is_neglog10) {
243 cswritep = strcpya_k(cswritep, "LOG10_");
244 }
245 cswritep = strcpya_k(cswritep, "UNADJ\t");
246 }
247 const uint32_t gc_col = (flags & kfAdjustColGc) && (!skip_gc);
248 if (gc_col) {
249 if (is_neglog10) {
250 cswritep = strcpya_k(cswritep, "LOG10_");
251 }
252 cswritep = strcpya_k(cswritep, "GC\t");
253 }
254 const uint32_t qq_col = flags & kfAdjustColQq;
255 if (qq_col) {
256 if (is_neglog10) {
257 cswritep = strcpya_k(cswritep, "LOG10_");
258 }
259 cswritep = strcpya_k(cswritep, "QQ\t");
260 }
261 const uint32_t bonf_col = flags & kfAdjustColBonf;
262 if (bonf_col) {
263 if (is_neglog10) {
264 cswritep = strcpya_k(cswritep, "LOG10_");
265 }
266 cswritep = strcpya_k(cswritep, "BONF\t");
267 }
268 const uint32_t holm_col = flags & kfAdjustColHolm;
269 if (holm_col) {
270 if (is_neglog10) {
271 cswritep = strcpya_k(cswritep, "LOG10_");
272 }
273 cswritep = strcpya_k(cswritep, "HOLM\t");
274 }
275 const uint32_t sidakss_col = flags & kfAdjustColSidakss;
276 if (sidakss_col) {
277 if (is_neglog10) {
278 cswritep = strcpya_k(cswritep, "LOG10_");
279 }
280 cswritep = strcpya_k(cswritep, "SIDAK_SS\t");
281 }
282 const uint32_t sidaksd_col = flags & kfAdjustColSidaksd;
283 if (sidaksd_col) {
284 if (is_neglog10) {
285 cswritep = strcpya_k(cswritep, "LOG10_");
286 }
287 cswritep = strcpya_k(cswritep, "SIDAK_SD\t");
288 }
289 const uint32_t fdrbh_col = flags & kfAdjustColFdrbh;
290 if (fdrbh_col) {
291 if (is_neglog10) {
292 cswritep = strcpya_k(cswritep, "LOG10_");
293 }
294 cswritep = strcpya_k(cswritep, "FDR_BH\t");
295 }
296 double* ln_pv_by = nullptr;
297 if (flags & kfAdjustColFdrby) {
298 if (unlikely(bigstack_alloc_d(valid_allele_ct, &ln_pv_by))) {
299 goto Multcomp_ret_NOMEM;
300 }
301 if (is_neglog10) {
302 cswritep = strcpya_k(cswritep, "LOG10_");
303 }
304 cswritep = strcpya_k(cswritep, "FDR_BY\t");
305 }
306 DecrAppendBinaryEoln(&cswritep);
307
308 // reverse-order calculations
309 double* ln_pv_bh;
310 double* ln_pv_gc;
311 double* unadj_sorted_ln_pvals;
312 if (unlikely(
313 bigstack_alloc_d(valid_allele_ct, &ln_pv_bh) ||
314 bigstack_alloc_d(valid_allele_ct, &ln_pv_gc) ||
315 bigstack_alloc_d(valid_allele_ct, &unadj_sorted_ln_pvals))) {
316 goto Multcomp_ret_NOMEM;
317 }
318
319 STD_SORT_PAR_UNSEQ(valid_allele_ct, double_cmp, sortbuf);
320
321 double lambda_recip = 1.0;
322 if (!skip_gc) {
323 if (adjust_info_ptr->lambda != 0.0) {
324 lambda_recip = 1.0 / adjust_info_ptr->lambda;
325 } else {
326 const uintptr_t valid_allele_ct_d2 = valid_allele_ct / 2;
327 double lambda = sortbuf[valid_allele_ct_d2].chisq;
328 if (!(valid_allele_ct % 2)) {
329 lambda = (lambda + sortbuf[valid_allele_ct_d2 - 1].chisq) * 0.5;
330 }
331 lambda = lambda / 0.456;
332 if (lambda < 1.0) {
333 lambda = 1.0;
334 }
335 logprintf("--adjust: Genomic inflation est. lambda (based on median chisq) = %g.\n", lambda);
336 lambda_recip = 1.0 / lambda;
337 }
338 }
339 double* sorted_ln_pvals = unadj_sorted_ln_pvals;
340 for (uintptr_t aidx = 0; aidx != valid_allele_ct; ++aidx) {
341 ln_pv_gc[aidx] = ChisqToLnP(sortbuf[aidx].chisq * lambda_recip, 1);
342 unadj_sorted_ln_pvals[aidx] = sortbuf[aidx].ln_pval;
343 }
344 if ((flags & kfAdjustGc) && (!skip_gc)) {
345 sorted_ln_pvals = ln_pv_gc;
346 }
347
348 const uint32_t valid_allele_ct_m1 = valid_allele_ct - 1;
349 const double valid_allele_ctd = swtod(valid_allele_ct);
350 const double ln_valid_allele_ct = log(valid_allele_ctd);
351 double bh_ln_pval_min = sorted_ln_pvals[valid_allele_ct_m1];
352 ln_pv_bh[valid_allele_ct_m1] = bh_ln_pval_min;
353 double harmonic_sum = 1.0;
354 for (uint32_t aidx = valid_allele_ct_m1; aidx; --aidx) {
355 const double harmonic_term = valid_allele_ctd / u31tod(aidx);
356 harmonic_sum += harmonic_term;
357 const double bh_ln_pval = sorted_ln_pvals[aidx - 1] + log(harmonic_term);
358 if (bh_ln_pval_min > bh_ln_pval) {
359 bh_ln_pval_min = bh_ln_pval;
360 }
361 ln_pv_bh[aidx - 1] = bh_ln_pval_min;
362 }
363
364 if (ln_pv_by) {
365 const double ln_harmonic_sum = log(harmonic_sum);
366 double by_ln_pval_min = sorted_ln_pvals[valid_allele_ct_m1] - ln_valid_allele_ct + ln_harmonic_sum;
367 if (by_ln_pval_min > 0.0) {
368 by_ln_pval_min = 0.0;
369 }
370 ln_pv_by[valid_allele_ct_m1] = by_ln_pval_min;
371 for (uint32_t aidx = valid_allele_ct_m1; aidx; --aidx) {
372 const double by_ln_pval = sorted_ln_pvals[aidx - 1] - log(u31tod(aidx)) + ln_harmonic_sum;
373 if (by_ln_pval_min > by_ln_pval) {
374 by_ln_pval_min = by_ln_pval;
375 }
376 ln_pv_by[aidx - 1] = by_ln_pval_min;
377 }
378 }
379
380 char output_min_p_buf[24];
381 uint32_t output_min_p_slen;
382 if (!is_neglog10) {
383 char* str_end = lntoa_g(output_min_ln, output_min_p_buf);
384 output_min_p_slen = str_end - output_min_p_buf;
385 } else {
386 // -log10(p) output ignores --output-min-p now.
387 // instead, set to maximum int32 to distinguish from plink 1.x's
388 // much-less-extreme 'inf'.
389 strcpy_k(output_min_p_buf, "2147483647");
390 output_min_p_slen = 10;
391 }
392 const double valid_allele_ct_recip = 1.0 / valid_allele_ctd;
393 double ln_pv_sidak_sd = -DBL_MAX;
394 double ln_pv_holm = -DBL_MAX;
395 uint32_t cur_allele_ct = 2;
396 uint32_t aidx = 0;
397 for (; aidx < valid_allele_ct; ++aidx) {
398 double ln_pval = sorted_ln_pvals[aidx];
399 if (ln_pval > ln_pfilter) {
400 break;
401 }
402 const uint32_t variant_uidx = sortbuf[aidx].variant_uidx;
403 if (chr_col) {
404 if (cip) {
405 cswritep = chrtoa(cip, GetVariantChr(cip, variant_uidx), cswritep);
406 } else {
407 cswritep = strcpya(cswritep, chr_ids[variant_uidx]);
408 }
409 *cswritep++ = '\t';
410 }
411 if (variant_bps) {
412 cswritep = u32toa_x(variant_bps[variant_uidx], '\t', cswritep);
413 }
414 cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
415 uintptr_t allele_idx_offset_base = variant_uidx * 2;
416 if (allele_idx_offsets) {
417 allele_idx_offset_base = allele_idx_offsets[variant_uidx];
418 cur_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
419 }
420 const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
421 if (ref_col) {
422 *cswritep++ = '\t';
423 cswritep = strcpya(cswritep, cur_alleles[0]);
424 }
425 if (alt1_col) {
426 *cswritep++ = '\t';
427 cswritep = strcpya(cswritep, cur_alleles[1]);
428 }
429 if (alt_col) {
430 *cswritep++ = '\t';
431 for (uint32_t allele_idx = 1; allele_idx != cur_allele_ct; ++allele_idx) {
432 if (unlikely(Cswrite(&css, &cswritep))) {
433 goto Multcomp_ret_WRITE_FAIL;
434 }
435 cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
436 }
437 --cswritep;
438 }
439 if (a1_col) {
440 *cswritep++ = '\t';
441 const char* cur_allele;
442 if (loaded_a1) {
443 // --adjust-file hack
444 cur_allele = loaded_a1[variant_uidx];
445 } else {
446 cur_allele = cur_alleles[sortbuf[aidx].allele_idx];
447 }
448 cswritep = strcpya(cswritep, cur_allele);
449 }
450 if (unadj_col) {
451 adjust_print_ln(output_min_p_buf, unadj_sorted_ln_pvals[aidx], output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
452 }
453 if (gc_col) {
454 adjust_print_ln(output_min_p_buf, ln_pv_gc[aidx], output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
455 }
456 const double aidx_d = swtod(aidx);
457 if (qq_col) {
458 *cswritep++ = '\t';
459 double qq_val = (aidx_d + 0.5) * valid_allele_ct_recip;
460 if (is_neglog10) {
461 qq_val = -log10(qq_val);
462 }
463 cswritep = dtoa_g(qq_val, cswritep);
464 }
465 if (bonf_col) {
466 double bonf_ln_pval = ln_pval + ln_valid_allele_ct;
467 if (bonf_ln_pval > 0.0) {
468 bonf_ln_pval = 0.0;
469 }
470 adjust_print_ln(output_min_p_buf, bonf_ln_pval, output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
471 }
472 if (holm_col) {
473 if (ln_pv_holm < 0.0) {
474 const double ln_pv_holm_new = ln_pval + log(u31tod(valid_allele_ct - aidx));
475 if (ln_pv_holm_new > 0.0) {
476 ln_pv_holm = 0.0;
477 } else if (ln_pv_holm < ln_pv_holm_new) {
478 ln_pv_holm = ln_pv_holm_new;
479 }
480 }
481 adjust_print_ln(output_min_p_buf, ln_pv_holm, output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
482 }
483 if (sidakss_col) {
484 // avoid catastrophic cancellation for small p-values
485 // 1 - (1-p)^c = 1 - e^{c log(1-p)}
486 // 2^{-7} threshold is arbitrary
487 // 2^{-90} corresponds to cp + (cp)^2/2! == cp in double-precision
488 // arithmetic, with several bits to spare
489 double ln_pv_sidak_ss;
490 if (ln_pval > -90 * kLn2) {
491 const double pval = exp(ln_pval);
492 double pv_sidak_ss;
493 if (ln_pval >= -7 * kLn2) {
494 pv_sidak_ss = 1 - pow(1 - pval, valid_allele_ctd);
495 } else {
496 pv_sidak_ss = 1 - exp(valid_allele_ctd * log1p(-pval));
497 }
498 ln_pv_sidak_ss = log(pv_sidak_ss);
499 } else {
500 // log(1-x) = -x - x^2/2 - x^3/3 + ...
501 // 1 - exp(x) = -x - x^2/2! - x^3/3! - ...
502 // if p <= 2^{-90},
503 // log(1-p) is -p
504 // 1 - e^{-cp} is cp
505 ln_pv_sidak_ss = ln_pval + ln_valid_allele_ct;
506 }
507 adjust_print_ln(output_min_p_buf, ln_pv_sidak_ss, output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
508 }
509 if (sidaksd_col) {
510 double ln_pv_sidak_sd_new;
511 if (ln_pval > -90 * kLn2) {
512 const double pval = exp(ln_pval);
513 double pv_sidak_sd_new;
514 if (ln_pval >= -7 * kLn2) {
515 pv_sidak_sd_new = 1 - pow(1 - pval, valid_allele_ctd - aidx_d);
516 } else {
517 const double cur_exp = valid_allele_ctd - aidx_d;
518 pv_sidak_sd_new = 1 - exp(cur_exp * log1p(-pval));
519 }
520 ln_pv_sidak_sd_new = log(pv_sidak_sd_new);
521 } else {
522 ln_pv_sidak_sd_new = ln_pval + log(valid_allele_ctd - aidx_d);
523 }
524 if (ln_pv_sidak_sd < ln_pv_sidak_sd_new) {
525 ln_pv_sidak_sd = ln_pv_sidak_sd_new;
526 }
527 adjust_print_ln(output_min_p_buf, ln_pv_sidak_sd, output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
528 }
529 if (fdrbh_col) {
530 adjust_print_ln(output_min_p_buf, ln_pv_bh[aidx], output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
531 }
532 if (ln_pv_by) {
533 adjust_print_ln(output_min_p_buf, ln_pv_by[aidx], output_min_ln, output_min_p_slen, is_neglog10, &cswritep);
534 }
535 AppendBinaryEoln(&cswritep);
536 if (unlikely(Cswrite(&css, &cswritep))) {
537 goto Multcomp_ret_WRITE_FAIL;
538 }
539 }
540 if (unlikely(CswriteCloseNull(&css, cswritep))) {
541 goto Multcomp_ret_WRITE_FAIL;
542 }
543 // don't use valid_allele_ct due to --pfilter
544 logprintfww("--adjust%s values (%" PRIuPTR " test%s) written to %s .\n", cip? "" : "-file", aidx, (aidx == 1)? "" : "s", outname);
545 }
546 while (0) {
547 Multcomp_ret_NOMEM:
548 reterr = kPglRetNomem;
549 break;
550 Multcomp_ret_WRITE_FAIL:
551 reterr = kPglRetWriteFail;
552 break;
553 }
554 Multcomp_ret_1:
555 CswriteCloseCond(&css, cswritep);
556 BigstackReset(bigstack_mark);
557 return reterr;
558 }
559
AdjustFile(const AdjustFileInfo * afip,double ln_pfilter,double output_min_ln,uint32_t max_thread_ct,char * outname,char * outname_end)560 PglErr AdjustFile(const AdjustFileInfo* afip, double ln_pfilter, double output_min_ln, uint32_t max_thread_ct, char* outname, char* outname_end) {
561 unsigned char* bigstack_mark = g_bigstack_base;
562 unsigned char* bigstack_end_mark = g_bigstack_end;
563 const char* in_fname = afip->fname;
564 uintptr_t line_idx = 0;
565 PglErr reterr = kPglRetSuccess;
566 TextStream adjust_txs;
567 PreinitTextStream(&adjust_txs);
568 {
569 // Two-pass load.
570 // 1. Parse header line, count # of variants.
571 // intermission. Allocate top-level arrays.
572 // 2. Rewind and fill arrays.
573 // (some overlap with LoadPvar(), though that's one-pass.)
574 reterr = SizeAndInitTextStream(in_fname, bigstack_left() / 4, max_thread_ct, &adjust_txs);
575 if (unlikely(reterr)) {
576 goto AdjustFile_ret_1;
577 }
578
579 const char* header_start;
580 do {
581 ++line_idx;
582 header_start = TextGet(&adjust_txs);
583 if (unlikely(!header_start)) {
584 reterr = TextStreamRawErrcode(&adjust_txs);
585 if (reterr == kPglRetEof) {
586 snprintf(g_logbuf, kLogbufSize, "Error: %s is empty.\n", in_fname);
587 goto AdjustFile_ret_MALFORMED_INPUT_WW;
588 }
589 goto AdjustFile_ret_TSTREAM_FAIL;
590 }
591 } while (strequal_k_unsafe(header_start, "##"));
592 if (*header_start == '#') {
593 ++header_start;
594 }
595
596 const AdjustFlags flags = afip->base.flags;
597 // [0] = CHROM
598 // [1] = POS
599 // [2] = ID (required)
600 // [3] = REF
601 // [4] = ALT
602 // [5] = A1
603 // [6] = TEST (always scan)
604 // [7] = P (required)
605 const char* col_search_order[8];
606 const uint32_t need_chr = (flags & kfAdjustColChrom);
607 const uint32_t need_pos = (flags & kfAdjustColPos);
608 const uint32_t need_ref = (flags & kfAdjustColRef);
609 const uint32_t need_alt = (flags & (kfAdjustColAlt1 | kfAdjustColAlt));
610 uint32_t check_a1 = (flags & kfAdjustColA1);
611 const uint32_t alt_comma_truncate = (need_alt == kfAdjustColAlt1);
612 if (unlikely(need_alt == (kfAdjustColAlt1 | kfAdjustColAlt))) {
613 // Could theoretically support this later (allocate allele_idx_offsets
614 // and count # of multiallelic variants in first pass, etc.), but
615 // unlikely to be relevant.
616 // (For now, we abuse allele_storage by storing all comma-separated alt
617 // alleles in a single string, since Multcomp() is ok with that.)
618 logerrputs("Error: --adjust-file does not currently support simultaneous alt1 and alt\ncolumn output.\n");
619 goto AdjustFile_ret_INVALID_CMDLINE;
620 }
621 col_search_order[0] = need_chr? (afip->chr_field? afip->chr_field : "CHROM\0CHR\0") : "";
622 col_search_order[1] = need_pos? (afip->pos_field? afip->pos_field : "POS\0BP\0") : "";
623 col_search_order[2] = afip->id_field? afip->id_field : "ID\0SNP\0";
624 col_search_order[3] = need_ref? (afip->ref_field? afip->ref_field : "REF\0A2\0") : "";
625 col_search_order[4] = need_alt? (afip->alt_field? afip->alt_field : "ALT\0ALT1\0") : "";
626 col_search_order[5] = check_a1? (afip->a1_field? afip->a1_field : "A1\0") : "";
627 col_search_order[6] = afip->test_field? afip->test_field : "TEST\0";
628 const uint32_t input_log10 = (flags & kfAdjustInputLog10);
629 col_search_order[7] = afip->p_field? afip->p_field : (input_log10? "LOG10_P\0LOG10_UNADJ\0P\0UNADJ\0" : "P\0UNADJ\0");
630
631 uint32_t col_skips[8];
632 uint32_t col_types[8];
633 uint32_t relevant_col_ct;
634 uint32_t found_type_bitset;
635 reterr = SearchHeaderLine(header_start, col_search_order, "adjust-file", 8, &relevant_col_ct, &found_type_bitset, col_skips, col_types);
636 if (unlikely(reterr)) {
637 goto AdjustFile_ret_1;
638 }
639 if (unlikely((found_type_bitset & 0x44) != 0x44)) {
640 logerrputs("Error: --adjust-file requires ID and P columns.\n");
641 goto AdjustFile_ret_INCONSISTENT_INPUT;
642 }
643 const char* test_name = afip->test_name;
644 uint32_t test_name_slen = 0;
645 uint32_t test_col_idx = 0;
646 if (test_name) {
647 test_name_slen = strlen(test_name);
648 // this duplicates a bit of work done in SearchHeaderLine(), but not a
649 // big deal
650 for (uint32_t relevant_col_idx = 0; ; ++relevant_col_idx) {
651 test_col_idx += col_skips[relevant_col_idx];
652 if (col_types[relevant_col_idx] == 6) {
653 break;
654 }
655 }
656 } else if (unlikely(found_type_bitset & 0x40)) {
657 snprintf(g_logbuf, kLogbufSize, "Error: TEST column present in %s, but no test= parameter was provided to --adjust-file.\n", in_fname);
658 goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
659 }
660 if (unlikely(need_chr && (!(found_type_bitset & 0x1)))) {
661 snprintf(g_logbuf, kLogbufSize, "Error: No chromosome column in %s.\n", in_fname);
662 goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
663 }
664 if (unlikely(need_pos && (!(found_type_bitset & 0x2)))) {
665 snprintf(g_logbuf, kLogbufSize, "Error: No bp coordinate column in %s.\n", in_fname);
666 goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
667 }
668 if (unlikely(need_ref && (!(found_type_bitset & 0x8)))) {
669 snprintf(g_logbuf, kLogbufSize, "Error: No REF column in %s.\n", in_fname);
670 goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
671 }
672 if (unlikely(need_alt && (!(found_type_bitset & 0x10)))) {
673 snprintf(g_logbuf, kLogbufSize, "Error: No ALT column in %s.\n", in_fname);
674 goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
675 }
676 if (unlikely(check_a1 && (!(found_type_bitset & 0x20)))) {
677 snprintf(g_logbuf, kLogbufSize, "Warning: No A1 column in %s. Omitting from output.\n", in_fname);
678 check_a1 = 0;
679 }
680
681 uintptr_t entry_ct = 0;
682 while (1) {
683 ++line_idx;
684 const char* line_start = TextGet(&adjust_txs);
685 if (!line_start) {
686 if (likely(!TextStreamErrcode2(&adjust_txs, &reterr))) {
687 break;
688 }
689 goto AdjustFile_ret_TSTREAM_FAIL;
690 }
691 if (test_name) {
692 // Don't count different-test entries.
693 const char* test_name_start = NextTokenMult0(line_start, test_col_idx);
694 if (unlikely(!test_name_start)) {
695 goto AdjustFile_ret_MISSING_TOKENS;
696 }
697 const uint32_t cur_test_slen = strlen_se(test_name_start);
698 if ((cur_test_slen != test_name_slen) || (!memequal(test_name_start, test_name, test_name_slen))) {
699 continue;
700 }
701 }
702 ++entry_ct;
703 }
704 #ifdef __LP64__
705 // probably want to permit this soon
706 if (entry_ct > 0xffffffffU) {
707 logerrputs("Error: Too many entries for --adjust-file.\n");
708 reterr = kPglRetNotYetSupported;
709 goto AdjustFile_ret_1;
710 }
711 #endif
712
713 reterr = TextRewind(&adjust_txs);
714 if (unlikely(reterr)) {
715 goto AdjustFile_ret_TSTREAM_FAIL;
716 }
717 const uintptr_t line_ct = line_idx - 1;
718 line_idx = 0;
719 do {
720 ++line_idx;
721 reterr = TextNextLineLstripK(&adjust_txs, &header_start);
722 if (unlikely(reterr)) {
723 goto AdjustFile_ret_TSTREAM_REWIND_FAIL;
724 }
725 } while (strequal_k_unsafe(header_start, "##"));
726
727 const uintptr_t entry_ctl = BitCtToWordCt(entry_ct);
728 const uintptr_t entry_ctl2 = NypCtToWordCt(entry_ct);
729 uintptr_t* variant_include_dummy;
730 uintptr_t* allele_include_dummy;
731 if (unlikely(
732 bigstack_alloc_w(entry_ctl, &variant_include_dummy) ||
733 bigstack_alloc_w(entry_ctl2, &allele_include_dummy))) {
734 goto AdjustFile_ret_NOMEM;
735 }
736 SetAllBits(entry_ct, variant_include_dummy);
737 for (uintptr_t ulii = 0; ulii != entry_ct / kBitsPerWordD2; ++ulii) {
738 allele_include_dummy[ulii] = kMaskAAAA;
739 }
740 const uint32_t remainder = entry_ct % kBitsPerWordD2;
741 if (remainder) {
742 allele_include_dummy[entry_ct / kBitsPerWordD2] = kMaskAAAA >> (2 * (kBitsPerWordD2 - remainder));
743 }
744 char** chr_ids;
745 if (need_chr) {
746 if (unlikely(bigstack_alloc_cp(entry_ct, &chr_ids))) {
747 goto AdjustFile_ret_NOMEM;
748 }
749 } else {
750 chr_ids = nullptr;
751 }
752 uint32_t* variant_bps;
753 if (need_pos) {
754 if (unlikely(bigstack_alloc_u32(entry_ct, &variant_bps))) {
755 goto AdjustFile_ret_NOMEM;
756 }
757 } else {
758 variant_bps = nullptr;
759 }
760 char** variant_ids;
761 double* ln_pvals;
762 if (unlikely(
763 bigstack_alloc_cp(entry_ct, &variant_ids) ||
764 bigstack_alloc_d(entry_ct, &ln_pvals))) {
765 goto AdjustFile_ret_NOMEM;
766 }
767 char** allele_storage;
768 if (need_ref || need_alt) {
769 if (unlikely(bigstack_alloc_cp(entry_ct * 2, &allele_storage))) {
770 goto AdjustFile_ret_NOMEM;
771 }
772 } else {
773 allele_storage = nullptr;
774 }
775 char** a1_storage;
776 if (check_a1) {
777 if (unlikely(bigstack_alloc_cp(entry_ct, &a1_storage))) {
778 goto AdjustFile_ret_NOMEM;
779 }
780 } else {
781 a1_storage = nullptr;
782 }
783 unsigned char* tmp_alloc_base = g_bigstack_base;
784 unsigned char* tmp_alloc_end = g_bigstack_end;
785 uint32_t max_allele_slen = 1;
786 uintptr_t variant_idx = 0;
787 while (line_idx < line_ct) {
788 ++line_idx;
789 const char* line_start;
790 reterr = TextNextLineLstripK(&adjust_txs, &line_start);
791 if (unlikely(reterr)) {
792 goto AdjustFile_ret_TSTREAM_REWIND_FAIL;
793 }
794 const char* token_ptrs[8];
795 uint32_t token_slens[8];
796 if (unlikely(!TokenLexK0(line_start, col_types, col_skips, relevant_col_ct, token_ptrs, token_slens))) {
797 goto AdjustFile_ret_MISSING_TOKENS;
798 }
799 if (test_name) {
800 if ((token_slens[6] != test_name_slen) || (!memequal(token_ptrs[6], test_name, test_name_slen))) {
801 continue;
802 }
803 }
804 if (chr_ids) {
805 const uint32_t cur_slen = token_slens[0];
806 if (StoreStringAtBase(tmp_alloc_end, token_ptrs[0], cur_slen, &tmp_alloc_base, &(chr_ids[variant_idx]))) {
807 goto AdjustFile_ret_NOMEM;
808 }
809 }
810 if (variant_bps) {
811 if (unlikely(ScanUintDefcap(token_ptrs[1], &(variant_bps[variant_idx])))) {
812 snprintf(g_logbuf, kLogbufSize, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, in_fname);
813 goto AdjustFile_ret_INCONSISTENT_INPUT_WW;
814 }
815 }
816 const uint32_t id_slen = token_slens[2];
817 if (StoreStringAtBase(tmp_alloc_end, token_ptrs[2], id_slen, &tmp_alloc_base, &(variant_ids[variant_idx]))) {
818 goto AdjustFile_ret_NOMEM;
819 }
820 if (need_ref) {
821 const uint32_t cur_slen = token_slens[3];
822 if (StoreStringAtBase(tmp_alloc_end, token_ptrs[3], cur_slen, &tmp_alloc_base, &(allele_storage[2 * variant_idx]))) {
823 goto AdjustFile_ret_NOMEM;
824 }
825 }
826 if (need_alt) {
827 const char* alt_str = token_ptrs[4];
828 uint32_t cur_slen = token_slens[4];
829 if (alt_comma_truncate) {
830 const char* alt_comma = S_CAST(const char*, memchr(alt_str, ',', cur_slen));
831 if (alt_comma) {
832 cur_slen = alt_comma - alt_str;
833 }
834 }
835 if (StoreStringAtBase(tmp_alloc_end, alt_str, cur_slen, &tmp_alloc_base, &(allele_storage[2 * variant_idx + 1]))) {
836 goto AdjustFile_ret_NOMEM;
837 }
838 }
839 if (check_a1) {
840 const uint32_t cur_slen = token_slens[5];
841 if (StoreStringAtBase(tmp_alloc_end, token_ptrs[5], cur_slen, &tmp_alloc_base, &(a1_storage[variant_idx]))) {
842 goto AdjustFile_ret_NOMEM;
843 }
844 }
845 const char* pval_str = token_ptrs[7];
846 double ln_pval;
847 if (!input_log10) {
848 if (!ScantokLn(pval_str, &ln_pval)) {
849 uint32_t cur_slen;
850 AdjustFile_alphabetic_pval:
851 cur_slen = token_slens[7];
852 if (IsNanStr(pval_str, cur_slen)) {
853 ln_pval = kLnPvalError;
854 } else if (likely(
855 strequal_k(pval_str, "INF", cur_slen) ||
856 (input_log10 && strequal_k(pval_str, "inf", cur_slen)))) {
857 // From plink 1.x, could be anything smaller than log(5e-324).
858 // Just fill with log(2.23e-308) for now.
859 ln_pval = kLnNormalMin;
860 } else {
861 goto AdjustFile_ret_INVALID_PVAL;
862 }
863 }
864 } else {
865 double neglog10_pval;
866 if (!ScantokDouble(pval_str, &neglog10_pval)) {
867 goto AdjustFile_alphabetic_pval;
868 }
869 ln_pval = neglog10_pval * (-kLn10);
870 if (unlikely(ln_pval > 0.0)) {
871 goto AdjustFile_ret_INVALID_PVAL;
872 }
873 }
874 ln_pvals[variant_idx] = ln_pval;
875 ++variant_idx;
876 }
877 BigstackEndReset(bigstack_end_mark);
878 BigstackBaseSet(tmp_alloc_base);
879 reterr = Multcomp(variant_include_dummy, nullptr, TO_CONSTCPCONSTP(chr_ids), variant_bps, TO_CONSTCPCONSTP(variant_ids), allele_include_dummy, nullptr, TO_CONSTCPCONSTP(allele_storage), TO_CONSTCPCONSTP(a1_storage), &(afip->base), ln_pvals, nullptr, entry_ct, max_allele_slen, ln_pfilter, output_min_ln, 0, max_thread_ct, outname, outname_end);
880 if (unlikely(reterr)) {
881 goto AdjustFile_ret_1;
882 }
883 }
884 while (0) {
885 AdjustFile_ret_NOMEM:
886 reterr = kPglRetNomem;
887 break;
888 AdjustFile_ret_TSTREAM_FAIL:
889 TextStreamErrPrint(in_fname, &adjust_txs);
890 break;
891 AdjustFile_ret_TSTREAM_REWIND_FAIL:
892 TextStreamErrPrintRewind(in_fname, &adjust_txs, &reterr);
893 break;
894 AdjustFile_ret_INVALID_CMDLINE:
895 reterr = kPglRetInvalidCmdline;
896 break;
897 AdjustFile_ret_MALFORMED_INPUT_WW:
898 WordWrapB(0);
899 logerrputsb();
900 reterr = kPglRetMalformedInput;
901 break;
902 AdjustFile_ret_MISSING_TOKENS:
903 snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, in_fname);
904 AdjustFile_ret_INCONSISTENT_INPUT_WW:
905 WordWrapB(0);
906 logerrputsb();
907 AdjustFile_ret_INCONSISTENT_INPUT:
908 reterr = kPglRetInconsistentInput;
909 break;
910 AdjustFile_ret_INVALID_PVAL:
911 logerrprintfww("Error: Invalid p-value on line %" PRIuPTR " of %s.\n", line_idx, in_fname);
912 reterr = kPglRetInconsistentInput;
913 break;
914 }
915 AdjustFile_ret_1:
916 CleanupTextStream2(in_fname, &adjust_txs, &reterr);
917 BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
918 return reterr;
919 }
920
921 #ifdef __cplusplus
922 } // namespace plink2
923 #endif
924