1 // This file is part of PLINK 1.90, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "plink_common.h"
19
20 #include "plink_cluster.h"
21
// Inputs/outputs for multithreaded permutation generators.
uint32_t g_perm_pheno_nm_ct;  // number of samples with nonmissing phenotypes
uint32_t g_perm_case_ct;  // total case count, for case/control permutation
uint32_t g_perm_tot_quotient;  // 2^32 / g_perm_pheno_nm_ct
// magic-number division constants precomputed by magic_num(); these let the
// generators map raw 32-bit random draws to sample indices without a slow
// hardware divide
uint64_t g_perm_totq_magic;
uint32_t g_perm_totq_preshift;
uint32_t g_perm_totq_postshift;
uint32_t g_perm_totq_incr;
uint32_t g_perm_is_1bit;  // nonzero = 1-bit-per-sample packing, else 2-bit
uint32_t g_perm_generation_thread_ct;  // number of worker threads
uintptr_t g_perm_vec_ct;  // number of permutations to generate

// cluster-restricted (within-cluster) permutation inputs; the totq arrays
// hold one set of magic-division constants per cluster
uint32_t g_perm_cluster_ct;
uint32_t* g_perm_cluster_map;
uint32_t* g_perm_cluster_starts;
uint32_t* g_perm_cluster_case_cts;
uintptr_t* g_perm_cluster_cc_preimage;
uint32_t* g_perm_tot_quotients;
uint64_t* g_perm_totq_magics;
uint32_t* g_perm_totq_preshifts;
uint32_t* g_perm_totq_postshifts;
uint32_t* g_perm_totq_incrs;

// output buffer for case/control permutations (packed bit vectors)
uintptr_t* g_perm_vecs;

// always use genotype indexing for QT --assoc
double* g_perm_vecstd;  // sample-major quantitative permutation matrix
double* g_perm_pheno_d2;  // collapsed nonmissing quantitative phenotype values
uint32_t* g_perm_sample_to_cluster;
uint32_t* g_perm_qt_cluster_thread_wkspace;  // per-thread in-cluster counters

// permutation-major instead of sample-major order for --linear (PERMORY
// speedups do not apply)
double* g_perm_pmajor;
uint32_t* g_perm_precomputed_mods; // [n] = 2^32 mod (n-2)
57
void generate_cc_perm_vec(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp) {
  // Draws a uniformly random assignment with exactly set_ct marked samples
  // out of tot_ct, written to perm_vec in 2-bit-per-sample encoding.
  // Assumes tot_quotient is 2^32 / tot_ct, and that
  // totq_magic/totq_preshift/totq_postshift/totq_incr have been precomputed
  // from magic_num() (multiplication-based division mapping a raw 32-bit
  // draw to a sample index in [0, tot_ct)).
  const uint32_t rand_max = tot_ct * tot_quotient - 1;
  uintptr_t word_idx;
  uintptr_t cur_bit;
  uintptr_t cur_word;
  uint32_t remaining;
  uint32_t rand32;
  uint32_t sample_idx;
  if (set_ct * 2 < tot_ct) {
    // Fewer than half the samples are marked: start all-clear, then set
    // set_ct distinct random positions.
    fill_ulong_zero(QUATERCT_TO_ALIGNED_WORDCT(tot_ct), perm_vec);
    for (remaining = set_ct; remaining; remaining--) {
      do {
        // rejection sampling: discard draws above the unbiased cutoff
        do {
          rand32 = sfmt_genrand_uint32(sfmtp);
        } while (rand32 > rand_max);
        sample_idx = (totq_magic * ((rand32 >> totq_preshift) + totq_incr)) >> totq_postshift;
        word_idx = sample_idx / BITCT2;
        cur_bit = ONELU << (2 * (sample_idx % BITCT2));
        cur_word = perm_vec[word_idx];
        // retry if this sample was already marked
      } while (cur_word & cur_bit);
      perm_vec[word_idx] = cur_word | cur_bit;
    }
  } else {
    // Majority marked: cheaper to start all-set and clear the complement.
    fill_quatervec_55(tot_ct, perm_vec);
    for (remaining = tot_ct - set_ct; remaining; remaining--) {
      do {
        do {
          rand32 = sfmt_genrand_uint32(sfmtp);
        } while (rand32 > rand_max);
        sample_idx = (totq_magic * ((rand32 >> totq_preshift) + totq_incr)) >> totq_postshift;
        word_idx = sample_idx / BITCT2;
        cur_bit = ONELU << (2 * (sample_idx % BITCT2));
        cur_word = perm_vec[word_idx];
        // retry if this sample was already cleared
      } while (!(cur_word & cur_bit));
      perm_vec[word_idx] = cur_word - cur_bit;
    }
  }
}
100
void generate_cc_perm1(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp) {
  // generate_cc_perm_vec() variant which uses 1-bit packing instead of 2:
  // draws exactly set_ct distinct marked samples out of tot_ct, one bit per
  // sample.  Same magic_num()-based unbiased index mapping.
  const uint32_t rand_max = tot_ct * tot_quotient - 1;
  uintptr_t word_idx;
  uintptr_t cur_bit;
  uintptr_t cur_word;
  uint32_t remaining;
  uint32_t rand32;
  uint32_t sample_idx;
  if (set_ct * 2 < tot_ct) {
    // minority marked: start from all-zero and set random distinct bits
    fill_ulong_zero(BITCT_TO_WORDCT(tot_ct), perm_vec);
    for (remaining = set_ct; remaining; remaining--) {
      do {
        // rejection sampling keeps the index distribution uniform
        do {
          rand32 = sfmt_genrand_uint32(sfmtp);
        } while (rand32 > rand_max);
        sample_idx = (totq_magic * ((rand32 >> totq_preshift) + totq_incr)) >> totq_postshift;
        word_idx = sample_idx / BITCT;
        cur_bit = ONELU << (sample_idx % BITCT);
        cur_word = perm_vec[word_idx];
        // collision: position already set, draw again
      } while (cur_word & cur_bit);
      perm_vec[word_idx] = cur_word | cur_bit;
    }
  } else {
    // majority marked: start from all-ones and clear the complement
    fill_all_bits(tot_ct, perm_vec);
    for (remaining = tot_ct - set_ct; remaining; remaining--) {
      do {
        do {
          rand32 = sfmt_genrand_uint32(sfmtp);
        } while (rand32 > rand_max);
        sample_idx = (totq_magic * ((rand32 >> totq_preshift) + totq_incr)) >> totq_postshift;
        word_idx = sample_idx / BITCT;
        cur_bit = ONELU << (sample_idx % BITCT);
        cur_word = perm_vec[word_idx];
        // collision: position already cleared, draw again
      } while (!(cur_word & cur_bit));
      perm_vec[word_idx] = cur_word - cur_bit;
    }
  }
}
141
// Clustered variant of generate_cc_perm_vec(): case/control status is
// shuffled only within clusters.  perm_vec starts as a copy of preimage
// (2-bit encoding covering all tot_ct samples); then, for every cluster that
// is neither all-case nor all-control, cluster_case_cts[cluster_idx] member
// slots are redrawn uniformly at random among that cluster's members.
// tot_quotients[]/totq_*[] hold per-cluster magic-division constants
// (presumably from cluster_alloc_and_populate_magic_nums() — see callers).
void generate_cc_cluster_perm_vec(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp) {
  uint32_t cluster_idx;
  uint32_t target_ct;
  uint32_t cluster_end;
  uint32_t* map_ptr;
  uint32_t num_swapped;
  uint32_t cluster_size;
  uint32_t upper_bound;
  uint64_t totq_magic;
  uint32_t totq_preshift;
  uint32_t totq_postshift;
  uint32_t totq_incr;
  uintptr_t widx;
  uintptr_t wcomp;
  uintptr_t pv_val;
  uint32_t urand;
  uint32_t uii;
  memcpy(perm_vec, preimage, QUATERCT_TO_ALIGNED_WORDCT(tot_ct) * sizeof(intptr_t));
  for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
    target_ct = cluster_case_cts[cluster_idx];
    cluster_end = cluster_starts[cluster_idx + 1];
    cluster_size = cluster_end - cluster_starts[cluster_idx];
    // skip clusters where shuffling is a no-op (all-case or all-control)
    if (target_ct && (target_ct != cluster_size)) {
      upper_bound = cluster_size * tot_quotients[cluster_idx] - 1;
      totq_magic = totq_magics[cluster_idx];
      totq_preshift = totq_preshifts[cluster_idx];
      totq_postshift = totq_postshifts[cluster_idx];
      totq_incr = totq_incrs[cluster_idx];
      map_ptr = &(cluster_map[cluster_starts[cluster_idx]]);
      if (target_ct * 2 < cluster_size) {
        // set target_ct distinct random members; the retry condition below
        // assumes preimage leaves this cluster's bits clear in this branch
        // (TODO(review): confirm against preimage construction)
        for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
          do {
            // rejection sampling for an unbiased in-cluster index
            do {
              urand = sfmt_genrand_uint32(sfmtp);
            } while (urand > upper_bound);
            // magic-number division maps the draw to an in-cluster offset,
            // then cluster_map translates it to a global sample index
            uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
            widx = uii / BITCT2;
            wcomp = ONELU << (2 * (uii % BITCT2));
            pv_val = perm_vec[widx];
          } while (pv_val & wcomp);
          perm_vec[widx] = pv_val | wcomp;
        }
      } else {
        // majority case: cheaper to clear (cluster_size - target_ct) members
        target_ct = cluster_size - target_ct;
        for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
          do {
            do {
              urand = sfmt_genrand_uint32(sfmtp);
            } while (urand > upper_bound);
            uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
            widx = uii / BITCT2;
            wcomp = ONELU << (2 * (uii % BITCT2));
            pv_val = perm_vec[widx];
          } while (!(pv_val & wcomp));
          perm_vec[widx] = pv_val - wcomp;
        }
      }
    }
  }
}
202
// 1-bit-packed variant of generate_cc_cluster_perm_vec(): within-cluster
// case/control shuffling with one bit per sample.  perm_vec starts as a
// copy of preimage; each cluster with 0 < case count < cluster size has its
// case bits redrawn uniformly among the cluster's members.
void generate_cc_cluster_perm1(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp) {
  uint32_t tot_ctl = BITCT_TO_WORDCT(tot_ct);
  uint32_t cluster_idx;
  uint32_t target_ct;
  uint32_t cluster_end;
  uint32_t cluster_size;
  uint32_t* map_ptr;
  uint32_t num_swapped;
  uint32_t upper_bound;
  uint64_t totq_magic;
  uint32_t totq_preshift;
  uint32_t totq_postshift;
  uint32_t totq_incr;
  uintptr_t widx;
  uintptr_t wcomp;
  uintptr_t pv_val;
  uint32_t urand;
  uint32_t uii;
  memcpy(perm_vec, preimage, tot_ctl * sizeof(intptr_t));
  for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
    target_ct = cluster_case_cts[cluster_idx];
    cluster_end = cluster_starts[cluster_idx + 1];
    cluster_size = cluster_end - cluster_starts[cluster_idx];
    // all-case and all-control clusters are left as the preimage
    if (target_ct && (target_ct != cluster_size)) {
      upper_bound = cluster_size * tot_quotients[cluster_idx] - 1;
      totq_magic = totq_magics[cluster_idx];
      totq_preshift = totq_preshifts[cluster_idx];
      totq_postshift = totq_postshifts[cluster_idx];
      totq_incr = totq_incrs[cluster_idx];
      map_ptr = &(cluster_map[cluster_starts[cluster_idx]]);
      if (target_ct * 2 < cluster_size) {
        // set target_ct distinct random member bits
        for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
          do {
            // rejection sampling for an unbiased in-cluster index
            do {
              urand = sfmt_genrand_uint32(sfmtp);
            } while (urand > upper_bound);
            // magic-number division -> in-cluster offset -> global index
            uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
            widx = uii / BITCT;
            wcomp = ONELU << (uii % BITCT);
            pv_val = perm_vec[widx];
          } while (pv_val & wcomp);
          perm_vec[widx] = pv_val | wcomp;
        }
      } else {
        // majority case: clear (cluster_size - target_ct) member bits instead
        target_ct = cluster_size - target_ct;
        for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
          do {
            do {
              urand = sfmt_genrand_uint32(sfmtp);
            } while (urand > upper_bound);
            uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
            widx = uii / BITCT;
            wcomp = ONELU << (uii % BITCT);
            pv_val = perm_vec[widx];
          } while (!(pv_val & wcomp));
          perm_vec[widx] = pv_val - wcomp;
        }
      }
    }
  }
}
264
generate_cc_perms_thread(void * arg)265 THREAD_RET_TYPE generate_cc_perms_thread(void* arg) {
266 intptr_t tidx = (intptr_t)arg;
267 uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
268 uint32_t case_ct = g_perm_case_ct;
269 uint32_t tot_quotient = g_perm_tot_quotient;
270 uint64_t totq_magic = g_perm_totq_magic;
271 uint32_t totq_preshift = g_perm_totq_preshift;
272 uint32_t totq_postshift = g_perm_totq_postshift;
273 uint32_t totq_incr = g_perm_totq_incr;
274 uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
275 sfmt_t* __restrict__ sfmtp = g_sfmtp_arr[tidx];
276 uintptr_t pheno_nm_ctv = BITCT_TO_WORDCT(pheno_nm_ct);
277 uint32_t pidx = (((uint64_t)tidx) * g_perm_vec_ct) / g_perm_generation_thread_ct;
278 uint32_t pmax = (((uint64_t)tidx + 1) * g_perm_vec_ct) / g_perm_generation_thread_ct;
279 if (!g_perm_is_1bit) {
280 pheno_nm_ctv *= 2;
281 for (; pidx < pmax; pidx++) {
282 generate_cc_perm_vec(pheno_nm_ct, case_ct, tot_quotient, totq_magic, totq_preshift, totq_postshift, totq_incr, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
283 }
284 } else {
285 // 16-byte alignment currently isn't needed; but it might be useful in the
286 // future, and the cost is low enough that I won't bother with writing the
287 // tiny-bit-more-efficient-half-the-time 8-byte alignment version for now.
288 pheno_nm_ctv = round_up_pow2(pheno_nm_ctv, 2);
289 for (; pidx < pmax; pidx++) {
290 generate_cc_perm1(pheno_nm_ct, case_ct, tot_quotient, totq_magic, totq_preshift, totq_postshift, totq_incr, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
291 }
292 }
293 THREAD_RETURN;
294 }
295
generate_cc_cluster_perms_thread(void * arg)296 THREAD_RET_TYPE generate_cc_cluster_perms_thread(void* arg) {
297 intptr_t tidx = (intptr_t)arg;
298 uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
299 uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
300 sfmt_t* __restrict__ sfmtp = g_sfmtp_arr[tidx];
301 uintptr_t pheno_nm_ctv = BITCT_TO_WORDCT(pheno_nm_ct);
302 uint32_t pidx = (((uint64_t)tidx) * g_perm_vec_ct) / g_perm_generation_thread_ct;
303 uint32_t pmax = (((uint64_t)tidx + 1) * g_perm_vec_ct) / g_perm_generation_thread_ct;
304 uint32_t cluster_ct = g_perm_cluster_ct;
305 uint32_t* cluster_map = g_perm_cluster_map;
306 uint32_t* cluster_starts = g_perm_cluster_starts;
307 uint32_t* cluster_case_cts = g_perm_cluster_case_cts;
308 uintptr_t* perm_cluster_cc_preimage = g_perm_cluster_cc_preimage;
309 uint32_t* tot_quotients = g_perm_tot_quotients;
310 uint64_t* totq_magics = g_perm_totq_magics;
311 uint32_t* totq_preshifts = g_perm_totq_preshifts;
312 uint32_t* totq_postshifts = g_perm_totq_postshifts;
313 uint32_t* totq_incrs = g_perm_totq_incrs;
314 if (!g_perm_is_1bit) {
315 pheno_nm_ctv *= 2;
316 for (; pidx < pmax; pidx++) {
317 generate_cc_cluster_perm_vec(pheno_nm_ct, perm_cluster_cc_preimage, cluster_ct, cluster_map, cluster_starts, cluster_case_cts, tot_quotients, totq_magics, totq_preshifts, totq_postshifts, totq_incrs, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
318 }
319 } else {
320 pheno_nm_ctv = round_up_pow2(pheno_nm_ctv, 2);
321 for (; pidx < pmax; pidx++) {
322 generate_cc_cluster_perm1(pheno_nm_ct, perm_cluster_cc_preimage, cluster_ct, cluster_map, cluster_starts, cluster_case_cts, tot_quotients, totq_magics, totq_preshifts, totq_postshifts, totq_incrs, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
323 }
324 }
325 THREAD_RETURN;
326 }
327
THREAD_RET_TYPE generate_qt_perms_smajor_thread(void* arg) {
  // Used by QT --assoc and --make-perm-pheno.
  //
  // Takes an array of phenotype values in g_perm_pheno_d2 of length
  // g_perm_pheno_nm_ct, and populates g_perm_vecstd[] with permutations of
  // those values. Also requires g_sfmtp_arr[] and
  // g_perm_generation_thread_ct to be initialized.
  //
  // g_perm_vecstd is sample-major. The nth permutation is stored across
  //   g_perm_vecstd[n]
  //   g_perm_vecstd[n + perm_vec_ctcl8m]
  //   g_perm_vecstd[n + 2 * perm_vec_ctcl8m]
  //   ...
  //
  // Each permutation column is built with an incremental Fisher-Yates
  // shuffle: as each new phenotype value arrives, it is inserted at a
  // uniformly random earlier slot, and the displaced value moves to the end.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ctcl8 = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
  uintptr_t perm_vec_ctcl8m = perm_vec_ctcl8 * CACHELINE_DBL;
  double* pheno_d2 = g_perm_pheno_d2;
  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
  // this thread owns permutation columns [pmin, pmax), rounded to cacheline
  // boundaries so threads never write to the same cacheline
  uint32_t pmin = CACHELINE_DBL * ((((uint64_t)tidx) * perm_vec_ctcl8) / g_perm_generation_thread_ct);
  uint32_t pmax = CACHELINE_DBL * ((((uint64_t)tidx + 1) * perm_vec_ctcl8) / g_perm_generation_thread_ct);
  double* perm_vecstd = &(g_perm_vecstd[pmin]);
  uint32_t poffset = 0;
  uint32_t sample_idx = 1;
  uint32_t pdiff;
  uint32_t tot_quotient;
  uint32_t upper_bound;
  uint64_t totq_magic;
  uint32_t totq_preshift;
  uint32_t totq_postshift;
  uint32_t totq_incr;
  uint32_t urand;
  uint32_t uii;
  double* wptr;
  double* wptr2;
  double* wptr3;
  double cur_source;
  if (tidx + 1 == g_perm_generation_thread_ct) {
    // last thread takes the (possibly unrounded) tail
    pmax = g_perm_vec_ct;
  }
  pdiff = pmax - pmin;
  // sample 0: every permutation column starts with the first phenotype value
  cur_source = *pheno_d2++;
  wptr = perm_vecstd;
  for (; poffset < pdiff; poffset++) {
    *wptr++ = cur_source;
  }
  for (; sample_idx < pheno_nm_ct; sample_idx++) {
    // 2^32 / (sample_idx + 1), for unbiased draws from [0, sample_idx]
    tot_quotient = 0x100000000LLU / (sample_idx + 1);
    upper_bound = (sample_idx + 1) * tot_quotient - 1;
    magic_num(tot_quotient, &totq_magic, &totq_preshift, &totq_postshift, &totq_incr);
    cur_source = *pheno_d2++;
    wptr = &(perm_vecstd[sample_idx * perm_vec_ctcl8m]);
    wptr2 = perm_vecstd;
    for (poffset = 0; poffset < pdiff; poffset++) {
      // rejection sampling so the mapped index is uniform
      do {
        urand = sfmt_genrand_uint32(sfmtp);
      } while (urand > upper_bound);
      uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
      // Fisher-Yates step: place cur_source at random row uii of this
      // column, moving the displaced value to row sample_idx
      wptr3 = &(wptr2[uii * perm_vec_ctcl8m]);
      *wptr++ = *wptr3;
      *wptr3 = cur_source;
      wptr2++;
    }
  }
  THREAD_RETURN;
}
394
THREAD_RET_TYPE generate_qt_cluster_perms_smajor_thread(void* arg) {
  // Variant of generate_qt_perms_smajor_thread() which restricts permutations
  // to be within-cluster.
  // On top of the generate_qt_perms_smajor_thread requirements, this also
  // needs g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts,
  // g_perm_qt_cluster_thread_wkspace, and g_perm_sample_to_cluster to be
  // initialized.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ctcl8 = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
  uintptr_t perm_vec_ctcl8m = perm_vec_ctcl8 * CACHELINE_DBL;
  double* pheno_d2 = g_perm_pheno_d2;
  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
  // this thread owns permutation columns [pmin, pmax), cacheline-aligned
  uint32_t pmin = CACHELINE_DBL * ((((uint64_t)tidx) * perm_vec_ctcl8) / g_perm_generation_thread_ct);
  uint32_t pmax = CACHELINE_DBL * ((((uint64_t)tidx + 1) * perm_vec_ctcl8) / g_perm_generation_thread_ct);
  double* perm_vecstd = &(g_perm_vecstd[pmin]);
  uint32_t cluster_ct = g_perm_cluster_ct;
  uint32_t cluster_ctcl = (cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
  uint32_t* cluster_map = g_perm_cluster_map;
  uint32_t* cluster_starts = g_perm_cluster_starts;
  // per-thread scratch: how many members of each cluster we've seen so far
  uint32_t* in_cluster_positions = &(g_perm_qt_cluster_thread_wkspace[tidx * cluster_ctcl * CACHELINE_INT32]);
  uint32_t* sample_to_cluster = g_perm_sample_to_cluster;
  uint32_t poffset = 0;
  uint32_t sample_idx = 0;
  uint32_t* cur_map_start;
  uint32_t pdiff;
  uint32_t cluster_idx;
  uint32_t cur_in_cluster_pos;
  uint32_t tot_quotient;
  uint32_t upper_bound;
  uint64_t totq_magic;
  uint32_t totq_preshift;
  uint32_t totq_postshift;
  uint32_t totq_incr;
  uint32_t urand;
  uint32_t uii;
  double* wptr;
  double* wptr2;
  double* wptr3;
  double cur_source;
  if (tidx + 1 == g_perm_generation_thread_ct) {
    // last thread takes the tail
    pmax = g_perm_vec_ct;
  }
  pdiff = pmax - pmin;
  fill_uint_zero(cluster_ct, in_cluster_positions);
  for (; sample_idx < pheno_nm_ct; sample_idx++) {
    cur_source = *pheno_d2++;
    cluster_idx = sample_to_cluster[sample_idx];
    if (cluster_idx == 0xffffffffU) {
      // sample belongs to no cluster: treated like a first-in-cluster
      // member below, i.e. its value is never moved
      cur_in_cluster_pos = 0;
    } else {
      cur_in_cluster_pos = in_cluster_positions[cluster_idx];
      in_cluster_positions[cluster_idx] += 1;
    }
    wptr = &(perm_vecstd[sample_idx * perm_vec_ctcl8m]);
    if (!cur_in_cluster_pos) {
      // first member seen from this cluster: copy value into every column
      for (poffset = 0; poffset < pdiff; poffset++) {
        *wptr++ = cur_source;
      }
    } else {
      // within-cluster incremental Fisher-Yates, mirroring the unclustered
      // version but drawing among this cluster's previously-seen members
      cur_map_start = &(cluster_map[cluster_starts[cluster_idx]]);
      // 2^32 / (members seen so far + 1), for unbiased in-cluster draws
      tot_quotient = 0x100000000LLU / (cur_in_cluster_pos + 1);
      upper_bound = (cur_in_cluster_pos + 1) * tot_quotient - 1;
      magic_num(tot_quotient, &totq_magic, &totq_preshift, &totq_postshift, &totq_incr);
      wptr2 = perm_vecstd;
      for (poffset = 0; poffset < pdiff; poffset++) {
        do {
          urand = sfmt_genrand_uint32(sfmtp);
        } while (urand > upper_bound);
        uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
        // swap cur_source into the chosen cluster member's row
        wptr3 = &(wptr2[cur_map_start[uii] * perm_vec_ctcl8m]);
        *wptr++ = *wptr3;
        *wptr3 = cur_source;
        wptr2++;
      }
    }
  }
  THREAD_RETURN;
}
474
THREAD_RET_TYPE generate_qt_perms_pmajor_thread(void* arg) {
  // Used by --linear. Requires g_perm_pheno_nm_ct, g_perm_pheno_d2,
  // g_sfmtp_arr, g_perm_generation_thread_ct, and g_perm_vec_ct to be
  // initialized, and space must be allocated for g_perm_pmajor. The nth
  // permutation (0-based) is stored in g_perm_pmajor indices
  //   [n * sample_valid_ct] to [(n + 1) * sample_valid_ct - 1]
  // inclusive.
  //
  // Unlike the sample-major generators, each permutation here is a complete
  // contiguous Fisher-Yates shuffle, one permutation at a time.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t sample_valid_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ctcl = (g_perm_vec_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
  // this thread owns permutations [pmin, pmax), cacheline-aligned
  uintptr_t pmin = CACHELINE_INT32 * ((((uint64_t)tidx) * perm_vec_ctcl) / g_perm_generation_thread_ct);
  uintptr_t pmax = CACHELINE_INT32 * ((((uint64_t)tidx + 1) * perm_vec_ctcl) / g_perm_generation_thread_ct);
  double* perm_pmajor = &(g_perm_pmajor[pmin * sample_valid_ct]);
  double* pheno_d2 = g_perm_pheno_d2;
  uint32_t* precomputed_mods = g_perm_precomputed_mods;
  uint32_t* lbound_ptr;
  double* pheno_ptr;
  uint32_t poffset;
  uint32_t pdiff;
  uint32_t sample_idx;
  uint32_t urand;
  uint32_t lbound;
  if (tidx + 1 == g_perm_generation_thread_ct) {
    // last thread takes the tail
    pmax = g_perm_vec_ct;
  }
  pdiff = pmax - pmin;
  for (poffset = 0; poffset < pdiff; poffset++) {
    lbound_ptr = precomputed_mods;
    pheno_ptr = pheno_d2;
    perm_pmajor[0] = *pheno_ptr++;
    for (sample_idx = 1; sample_idx < sample_valid_ct; sample_idx++) {
      // reject draws below 2^32 mod (sample_idx + 1) so the modulus below
      // yields an unbiased index in [0, sample_idx]
      lbound = *lbound_ptr++;
      do {
        urand = sfmt_genrand_uint32(sfmtp);
      } while (urand < lbound);
      // er, this modulus operation is slow. but doesn't seem to be worthwhile
      // to use magic numbers here.
      urand %= sample_idx + 1;
      // Fisher-Yates step: insert the next phenotype value at slot urand,
      // moving the displaced value to slot sample_idx
      perm_pmajor[sample_idx] = perm_pmajor[urand];
      perm_pmajor[urand] = *pheno_ptr++;
    }
    perm_pmajor = &(perm_pmajor[sample_valid_ct]);
  }
  THREAD_RETURN;
}
521
THREAD_RET_TYPE generate_qt_cluster_perms_pmajor_thread(void* arg) {
  // On top of the linear_gen_perms_thread requirements, this also needs
  // g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts,
  // g_perm_qt_cluster_thread_wkspace, and g_perm_sample_to_cluster to be
  // initialized.
  //
  // Permutation-major, within-cluster variant: each permutation is built in
  // one pass over the samples, Fisher-Yates-shuffling each cluster's values
  // among its own members only.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t sample_valid_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ctcl = (g_perm_vec_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
  // this thread owns permutations [pmin, pmax), cacheline-aligned
  uintptr_t pmin = CACHELINE_INT32 * ((((uint64_t)tidx) * perm_vec_ctcl) / g_perm_generation_thread_ct);
  uintptr_t pmax = CACHELINE_INT32 * ((((uint64_t)tidx + 1) * perm_vec_ctcl) / g_perm_generation_thread_ct);
  double* perm_pmajor = &(g_perm_pmajor[pmin * sample_valid_ct]);
  double* pheno_d2 = g_perm_pheno_d2;
  // offset by -1 so precomputed_mods[k] below is the rejection bound for
  // modulus (k + 1); k is always >= 1 when dereferenced.  NOTE(review):
  // forming a pointer one element before the array is technically undefined
  // behavior, though it works on the platforms PLINK targets.
  uint32_t* precomputed_mods = &(g_perm_precomputed_mods[-1]);
  uint32_t cluster_ct = g_perm_cluster_ct;
  uint32_t cluster_ctcl = (cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
  uint32_t* cluster_map = g_perm_cluster_map;
  uint32_t* cluster_starts = g_perm_cluster_starts;
  // per-thread scratch: members of each cluster seen so far this permutation
  uint32_t* in_cluster_positions = &(g_perm_qt_cluster_thread_wkspace[tidx * cluster_ctcl * CACHELINE_INT32]);
  uint32_t* sample_to_cluster = g_perm_sample_to_cluster;
  double* pheno_ptr;
  uint32_t poffset;
  uint32_t pdiff;
  uint32_t cluster_idx;
  uint32_t cur_in_cluster_pos;
  uint32_t sample_idx;
  uint32_t urand;
  uint32_t lbound;
  uint32_t uii;
  if (tidx + 1 == g_perm_generation_thread_ct) {
    // last thread takes the tail
    pmax = g_perm_vec_ct;
  }
  pdiff = pmax - pmin;
  for (poffset = 0; poffset < pdiff; poffset++) {
    fill_uint_zero(cluster_ct, in_cluster_positions);
    pheno_ptr = pheno_d2;
    for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
      cluster_idx = sample_to_cluster[sample_idx];
      if (cluster_idx == 0xffffffffU) {
        // unclustered sample: never permuted (treated as first-in-cluster)
        cur_in_cluster_pos = 0;
      } else {
        cur_in_cluster_pos = in_cluster_positions[cluster_idx];
        in_cluster_positions[cluster_idx] += 1;
      }
      if (!cur_in_cluster_pos) {
        // first member seen from this cluster: value stays in place
        perm_pmajor[sample_idx] = *pheno_ptr++;
      } else {
        // within-cluster Fisher-Yates step: reject low draws so the modulus
        // is unbiased, then insert the new value at a random member slot
        lbound = precomputed_mods[cur_in_cluster_pos];
        do {
          urand = sfmt_genrand_uint32(sfmtp);
        } while (urand < lbound);
        urand %= (cur_in_cluster_pos + 1);
        uii = cluster_map[cluster_starts[cluster_idx] + urand];
        perm_pmajor[sample_idx] = perm_pmajor[uii];
        perm_pmajor[uii] = *pheno_ptr++;
      }
    }
    perm_pmajor = &(perm_pmajor[sample_valid_ct]);
  }
  THREAD_RETURN;
}
583
584
transpose_perms(uintptr_t * perm_vecs,uint32_t perm_vec_ct,uint32_t pheno_nm_ct,uint32_t * perm_vecst)585 void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
586 // Transpose permutations so PRESTO/PERMORY-style genotype indexing can work.
587 //
588 // We used a 32-ply interleaved format, to allow counts up to the uint32_t
589 // limit without giving up highly parallel adds in the calc_git() inner loop
590 // (performed with a combination of unroll_incr_1_4, unroll_incr_4_8, and
591 // unroll_incr_8_32). The index order is:
592 // 64-bit build:
593 // first 16 bytes: 0 32 64 96 16 48 80 112 4 36 68 100 20 52 84 116
594 // 8 40 72 104 24 56 88 120 12 44 76 108 28 60 92 124 1...
595 // next 16 bytes: 128 160 192...
596 //
597 // 32-bit build:
598 // first 4 bytes: 0 8 16 24 4 12 20 28 1 9 17 25 5 13 21 29 2 10 18...
599 // next 4 bytes: 32 40 48...
600 uintptr_t sample_idx = 0;
601 uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
602 #ifdef __LP64__
603 uint32_t wbuf[4];
604 uint32_t* wbptr;
605 #else
606 uint32_t wval;
607 #endif
608 uint32_t rshift;
609 uint32_t wshift;
610 uintptr_t* pvptr;
611 uintptr_t perm_idx;
612 for (; sample_idx < pheno_nm_ct; sample_idx++) {
613 perm_idx = 0;
614 pvptr = &(perm_vecs[sample_idx / BITCT2]);
615 rshift = 2 * (sample_idx % BITCT2);
616 goto transpose_perms_loop_start;
617 #ifdef __LP64__
618 do {
619 if (!(perm_idx % 4)) {
620 if (perm_idx % 128) {
621 wshift = ((perm_idx & 96) >> 5) | ((perm_idx & 16) >> 2) | ((perm_idx & 12) << 1);
622 } else {
623 memcpy(perm_vecst, wbuf, 16);
624 perm_vecst = &(perm_vecst[4]);
625 transpose_perms_loop_start:
626 fill_uint_zero(4, wbuf);
627 wshift = 0;
628 }
629 wbptr = wbuf;
630 }
631 *wbptr |= ((pvptr[perm_idx * pheno_nm_ctv2] >> rshift) & 1) << wshift;
632 wbptr++;
633 } while (++perm_idx < perm_vec_ct);
634 memcpy(perm_vecst, wbuf, 16);
635 perm_vecst = &(perm_vecst[4]);
636 #else
637 do {
638 if (perm_idx % 32) {
639 wshift = ((perm_idx & 24) >> 3) | (perm_idx & 4) | ((perm_idx & 3) << 3);
640 } else {
641 *perm_vecst++ = wval;
642 transpose_perms_loop_start:
643 wval = 0;
644 wshift = 0;
645 }
646 wval |= ((pvptr[perm_idx * pheno_nm_ctv2] >> rshift) & 1) << wshift;
647 } while (++perm_idx < perm_vec_ct);
648 *perm_vecst++ = wval;
649 #endif
650 }
651 }
652
transpose_perm1s(uintptr_t * perm_vecs,uint32_t perm_vec_ct,uint32_t pheno_nm_ct,uint32_t * perm_vecst)653 void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
654 uintptr_t sample_idx = 0;
655 uintptr_t pheno_nm_ctv = BITCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
656 #ifdef __LP64__
657 uint32_t wbuf[4];
658 uint32_t* wbptr;
659 #else
660 uint32_t wval;
661 #endif
662 uint32_t rshift;
663 uint32_t wshift;
664 uintptr_t* pvptr;
665 uintptr_t perm_idx;
666 for (; sample_idx < pheno_nm_ct; sample_idx++) {
667 perm_idx = 0;
668 pvptr = &(perm_vecs[sample_idx / BITCT]);
669 rshift = sample_idx % BITCT;
670 goto transpose_perm1s_loop_start;
671 #ifdef __LP64__
672 do {
673 if (!(perm_idx % 4)) {
674 if (perm_idx % 128) {
675 wshift = ((perm_idx & 96) >> 5) | ((perm_idx & 16) >> 2) | ((perm_idx & 12) << 1);
676 } else {
677 memcpy(perm_vecst, wbuf, 16);
678 perm_vecst = &(perm_vecst[4]);
679 transpose_perm1s_loop_start:
680 fill_uint_zero(2, wbuf);
681 wshift = 0;
682 }
683 wbptr = wbuf;
684 }
685 *wbptr |= ((pvptr[perm_idx * pheno_nm_ctv] >> rshift) & 1) << wshift;
686 wbptr++;
687 } while (++perm_idx < perm_vec_ct);
688 memcpy(perm_vecst, wbuf, 16);
689 perm_vecst = &(perm_vecst[4]);
690 #else
691 do {
692 if (perm_idx % 32) {
693 wshift = ((perm_idx & 24) >> 3) | (perm_idx & 4) | ((perm_idx & 3) << 3);
694 } else {
695 *perm_vecst++ = wval;
696 transpose_perm1s_loop_start:
697 wval = 0;
698 wshift = 0;
699 }
700 wval |= ((pvptr[perm_idx * pheno_nm_ctv] >> rshift) & 1) << wshift;
701 } while (++perm_idx < perm_vec_ct);
702 *perm_vecst++ = wval;
703 #endif
704 }
705 }
706
707
make_perm_pheno(pthread_t * threads,char * outname,char * outname_end,uintptr_t unfiltered_sample_ct,uintptr_t * sample_exclude,uintptr_t sample_ct,char * sample_ids,uintptr_t max_sample_id_len,uint32_t cluster_ct,uint32_t * cluster_map,uint32_t * cluster_starts,uint32_t pheno_nm_ct,uintptr_t * pheno_nm,uintptr_t * pheno_c,double * pheno_d,char * output_missing_pheno,uint32_t permphe_ct)708 int32_t make_perm_pheno(pthread_t* threads, char* outname, char* outname_end, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, double* pheno_d, char* output_missing_pheno, uint32_t permphe_ct) {
709 unsigned char* bigstack_mark = g_bigstack_base;
710 FILE* outfile = nullptr;
711 uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
712 uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
713 uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS);
714 uintptr_t perm_vec_ctcl8m = 0;
715 char* writebuf = nullptr;
716 int32_t retval = 0;
717 uintptr_t* ulptr;
718 double* dptr;
719 char* wptr;
720 uintptr_t sample_uidx;
721 uintptr_t sample_idx;
722 uintptr_t perm_idx;
723 uintptr_t ulii;
724 uint32_t sample_nmidx;
725 uint32_t rshift;
726 if (!pheno_nm_ct) {
727 logerrprint("Error: --make-perm-pheno requires phenotype data.\n");
728 goto make_perm_pheno_ret_INVALID_CMDLINE;
729 }
  // --- tail of make_perm_pheno(): generate permuted phenotypes and write the
  // --- .pphe file.  (Function entry point / local declarations are above this
  // --- chunk.)  Thread communication happens through the g_perm_* globals
  // --- declared at the top of this file.

  // One permutation per "job"; no point spawning more threads than
  // permutations.  Each generator thread needs its own SFMT PRNG state.
  g_perm_generation_thread_ct = MINV(g_thread_ct, permphe_ct);
  if (bigstack_init_sfmtp(g_perm_generation_thread_ct)) {
    goto make_perm_pheno_ret_NOMEM;
  }
  g_perm_pheno_nm_ct = pheno_nm_ct;
  g_perm_vec_ct = permphe_ct;
  // ulii == 0 is reused below as the tid argument when the main thread runs
  // the thread function itself (it acts as thread 0).
  ulii = 0;
  if (pheno_c) {
    // Case/control phenotype: permutations are stored as 1-bit-per-sample
    // vectors (as opposed to the 2-bit encoding some association tests use).
    g_perm_is_1bit = 1;
    g_perm_case_ct = popcount_longs(pheno_c, unfiltered_sample_ctl);
    // could seamlessly support multipass by using different permutation logic,
    // but pointless in practice; better to just generate multiple files
    if (bigstack_alloc_ul(permphe_ct * pheno_nm_ctv, &g_perm_vecs)) {
      goto make_perm_pheno_ret_NOMEM;
    }
    if (cluster_starts) {
      // most similar to testmiss()
      // Permute within clusters: reindex clusters to the pheno_nm subset and
      // record per-cluster case counts for the generators.
      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 1, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, &g_perm_cluster_case_cts, &g_perm_cluster_cc_preimage);
      if (retval) {
	goto make_perm_pheno_ret_1;
      }
      if (!g_perm_cluster_ct) {
	logerrprint("Error: Degenerate --make-perm-pheno invocation (no size 2+ clusters).\n");
	goto make_perm_pheno_ret_INVALID_CMDLINE;
      }
      // Precompute per-cluster magic-number division constants (see the
      // "[n] = 2^32 mod (n-2)" style tables declared at the top of this file)
      // so the permutation threads can avoid hardware division.
      retval = cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs);
      if (retval) {
	goto make_perm_pheno_ret_1;
      }
      // not actually much of a point to multithreading since this is I/O
      // bound, but what the hell, the permutation generators already support
      // it
      if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
      }
      // Main thread doubles as worker 0 (ulii == 0 is the tid).
      generate_cc_cluster_perms_thread((void*)ulii);
    } else {
      // Unclustered case/control: single magic-number divisor for the whole
      // sample set.  2^32 / pheno_nm_ct feeds the modulo-free random index
      // trick used by the generators.
      g_perm_cluster_starts = nullptr;
      g_perm_tot_quotient = 0x100000000LLU / pheno_nm_ct;
      magic_num(g_perm_tot_quotient, &g_perm_totq_magic, &g_perm_totq_preshift, &g_perm_totq_postshift, &g_perm_totq_incr);
      if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
      }
      generate_cc_perms_thread((void*)ulii);
    }
  } else {
    // Quantitative phenotype: copy the phenotype values for the pheno_nm
    // subset into a collapsed array, then permute those doubles.
    g_perm_pheno_d2 = (double*)alloc_and_init_collapsed_arr_incl((char*)pheno_d, sizeof(double), unfiltered_sample_ct, pheno_nm, pheno_nm_ct, 1);
    if (!g_perm_pheno_d2) {
      goto make_perm_pheno_ret_NOMEM;
    }
    // Sample-major layout: row stride padded to a cacheline multiple so
    // threads writing adjacent permutations don't false-share.
    perm_vec_ctcl8m = round_up_pow2(permphe_ct, CACHELINE_DBL);
    if (bigstack_alloc_d(perm_vec_ctcl8m * pheno_nm_ct, &g_perm_vecstd)) {
      goto make_perm_pheno_ret_NOMEM;
    }
    if (cluster_starts) {
      // Within-cluster QT permutation; no case counts needed here.
      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, nullptr, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, nullptr, nullptr);
      if (retval) {
	goto make_perm_pheno_ret_1;
      }
      if (!g_perm_cluster_ct) {
	logerrprint("Error: Degenerate --make-perm-pheno invocation (no size 2+ clusters).\n");
	goto make_perm_pheno_ret_INVALID_CMDLINE;
      }
      // Per-thread scratch: one cacheline-padded cluster-size array each.
      if (bigstack_alloc_ui(pheno_nm_ct, &g_perm_sample_to_cluster) ||
          bigstack_alloc_ui(g_perm_generation_thread_ct * round_up_pow2(g_perm_cluster_ct, CACHELINE_INT32), &g_perm_qt_cluster_thread_wkspace)) {
	goto make_perm_pheno_ret_NOMEM;
      }
      fill_unfiltered_sample_to_cluster(pheno_nm_ct, g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, g_perm_sample_to_cluster);
      if (spawn_threads(threads, &generate_qt_cluster_perms_smajor_thread, g_perm_generation_thread_ct)) {
	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
      }
      generate_qt_cluster_perms_smajor_thread((void*)ulii);
    } else {
      if (spawn_threads(threads, &generate_qt_perms_smajor_thread, g_perm_generation_thread_ct)) {
	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
      }
      generate_qt_perms_smajor_thread((void*)ulii);
    }
    // Text buffer for one output row: 16 bytes per permutation ('\t' plus the
    // dtoa_g() rendering of one double; 16 bytes is assumed sufficient for
    // dtoa_g's maximum output -- consistent with its use elsewhere in plink).
    if (bigstack_alloc_c(permphe_ct * 16LU, &writebuf)) {
      goto make_perm_pheno_ret_NOMEM;
    }
  }
  // Wait for the generator workers; g_perm_vecs / g_perm_vecstd are complete
  // after this point.
  join_threads(threads, g_perm_generation_thread_ct);
  // 6 bytes copies ".pphe" plus its null terminator.
  memcpy(outname_end, ".pphe", 6);
  if (fopen_checked(outname, "w", &outfile)) {
    goto make_perm_pheno_ret_OPEN_FAIL;
  }
  // Write one line per (non-excluded) sample: sample ID, then one phenotype
  // column per permutation.  sample_nmidx indexes the collapsed pheno_nm
  // subset and only advances for samples with nonmissing phenotypes.
  sample_nmidx = 0;
  for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
    next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
    fputs(&(sample_ids[sample_uidx * max_sample_id_len]), outfile);
    if (!IS_SET(pheno_nm, sample_uidx)) {
      // Missing phenotype stays missing in every permutation.
      for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) {
	putc_unlocked('\t', outfile);
	fputs(output_missing_pheno, outfile);
      }
    } else if (pheno_c) {
      // Case/control: extract this sample's bit from each permutation vector
      // (permutation-major stride pheno_nm_ctv words); emit '1' (control) or
      // '2' (case), matching plink's standard 1/2 phenotype coding.
      ulptr = &(g_perm_vecs[sample_nmidx / BITCT]);
      rshift = sample_nmidx % BITCT;
      for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) {
	putc_unlocked('\t', outfile);
	putc_unlocked('1' + ((ulptr[perm_idx * pheno_nm_ctv] >> rshift) & 1), outfile);
      }
      sample_nmidx++;
    } else {
      // Quantitative: this sample's row of permuted doubles is contiguous
      // (sample-major, cacheline-padded stride); format the whole row into
      // writebuf, then flush it with a single fwrite.
      wptr = writebuf;
      dptr = &(g_perm_vecstd[sample_nmidx * perm_vec_ctcl8m]);
      for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) {
	*wptr++ = '\t';
	wptr = dtoa_g(*dptr++, wptr);
      }
      if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
	goto make_perm_pheno_ret_WRITE_FAIL;
      }
      sample_nmidx++;
    }
    if (putc_checked('\n', outfile)) {
      goto make_perm_pheno_ret_WRITE_FAIL;
    }
  }
  // fclose_null() checks the final flush, so write errors surface here too.
  if (fclose_null(&outfile)) {
    goto make_perm_pheno_ret_WRITE_FAIL;
  }
  LOGPRINTFWW("--make-perm-pheno: Permuted phenotypes written to %s .\n", outname);
  // Standard plink error-exit ladder: map each failure mode to its retval.
  while (0) {
  make_perm_pheno_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  make_perm_pheno_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  make_perm_pheno_ret_WRITE_FAIL:
    retval = RET_WRITE_FAIL;
    break;
  make_perm_pheno_ret_INVALID_CMDLINE:
    retval = RET_INVALID_CMDLINE;
    break;
  make_perm_pheno_ret_THREAD_CREATE_FAIL:
    retval = RET_THREAD_CREATE_FAIL;
    break;
  }
 make_perm_pheno_ret_1:
  // Shared cleanup: release all bigstack allocations made above and close the
  // output file if it's still open (fclose_cond tolerates nullptr).
  bigstack_reset(bigstack_mark);
  fclose_cond(outfile);
  return retval;
}
876