1 // This file is part of PLINK 1.90, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 
17 
18 #include "plink_common.h"
19 
20 #include "plink_cluster.h"
21 
22 // Inputs/outputs for multithreaded permutation generators.
23 uint32_t g_perm_pheno_nm_ct;
24 uint32_t g_perm_case_ct;
25 uint32_t g_perm_tot_quotient;
26 uint64_t g_perm_totq_magic;
27 uint32_t g_perm_totq_preshift;
28 uint32_t g_perm_totq_postshift;
29 uint32_t g_perm_totq_incr;
30 uint32_t g_perm_is_1bit;
31 uint32_t g_perm_generation_thread_ct;
32 uintptr_t g_perm_vec_ct;
33 
34 uint32_t g_perm_cluster_ct;
35 uint32_t* g_perm_cluster_map;
36 uint32_t* g_perm_cluster_starts;
37 uint32_t* g_perm_cluster_case_cts;
38 uintptr_t* g_perm_cluster_cc_preimage;
39 uint32_t* g_perm_tot_quotients;
40 uint64_t* g_perm_totq_magics;
41 uint32_t* g_perm_totq_preshifts;
42 uint32_t* g_perm_totq_postshifts;
43 uint32_t* g_perm_totq_incrs;
44 
45 uintptr_t* g_perm_vecs;
46 
47 // always use genotype indexing for QT --assoc
48 double* g_perm_vecstd;
49 double* g_perm_pheno_d2;
50 uint32_t* g_perm_sample_to_cluster;
51 uint32_t* g_perm_qt_cluster_thread_wkspace;
52 
53 // permutation-major instead of sample-major order for --linear (PERMORY
54 // speedups do not apply)
55 double* g_perm_pmajor;
56 uint32_t* g_perm_precomputed_mods; // [n] = 2^32 mod (n+2)
57 
void generate_cc_perm_vec(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp) {
  // Fill the 2-bit-per-sample vector perm_vec with a random case/control
  // assignment: exactly set_ct of the tot_ct entries end up set, the rest
  // clear.  Assumes tot_quotient is 2^32 / tot_ct, and
  // totq_magic/totq_preshift/totq_postshift/totq_incr have been precomputed
  // from magic_num().
  //
  // Rejection sampling: raw 32-bit draws above rand_thresh are discarded so
  // the magic-number division maps survivors uniformly onto [0, tot_ct);
  // draws hitting an already-toggled sample are also redrawn.
  const uint32_t rand_thresh = tot_ct * tot_quotient - 1;
  uint32_t remaining;
  uintptr_t word_idx;
  uintptr_t bit_mask;
  uintptr_t cur_word;
  uint32_t rand32;
  uint32_t sample_idx;
  if (set_ct * 2 < tot_ct) {
    // Cases are the minority: start all-zero, set set_ct random entries.
    fill_ulong_zero(QUATERCT_TO_ALIGNED_WORDCT(tot_ct), perm_vec);
    for (remaining = set_ct; remaining; remaining--) {
      do {
        do {
          rand32 = sfmt_genrand_uint32(sfmtp);
        } while (rand32 > rand_thresh);
        sample_idx = (totq_magic * ((rand32 >> totq_preshift) + totq_incr)) >> totq_postshift;
        word_idx = sample_idx / BITCT2;
        bit_mask = ONELU << (2 * (sample_idx % BITCT2));
        cur_word = perm_vec[word_idx];
      } while (cur_word & bit_mask);
      perm_vec[word_idx] = cur_word | bit_mask;
    }
  } else {
    // Cases are the majority: start all-one, clear tot_ct - set_ct random
    // entries instead (fewer iterations on average).
    fill_quatervec_55(tot_ct, perm_vec);
    for (remaining = tot_ct - set_ct; remaining; remaining--) {
      do {
        do {
          rand32 = sfmt_genrand_uint32(sfmtp);
        } while (rand32 > rand_thresh);
        sample_idx = (totq_magic * ((rand32 >> totq_preshift) + totq_incr)) >> totq_postshift;
        word_idx = sample_idx / BITCT2;
        bit_mask = ONELU << (2 * (sample_idx % BITCT2));
        cur_word = perm_vec[word_idx];
      } while (!(cur_word & bit_mask));
      perm_vec[word_idx] = cur_word - bit_mask;
    }
  }
}
100 
void generate_cc_perm1(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp) {
  // generate_cc_perm_vec() variant which uses 1-bit packing instead of 2:
  // fills perm_vec with a random subset of exactly set_ct set bits out of
  // tot_ct, via the same rejection-sampling scheme (tot_quotient must be
  // 2^32 / tot_ct; the totq_* constants come from magic_num()).
  const uint32_t rand_thresh = tot_ct * tot_quotient - 1;
  uint32_t remaining;
  uintptr_t word_idx;
  uintptr_t bit_mask;
  uintptr_t cur_word;
  uint32_t rand32;
  uint32_t sample_idx;
  if (set_ct * 2 < tot_ct) {
    // Minority set: start with an all-zero bitvector and set set_ct bits.
    fill_ulong_zero(BITCT_TO_WORDCT(tot_ct), perm_vec);
    for (remaining = set_ct; remaining; remaining--) {
      do {
        do {
          rand32 = sfmt_genrand_uint32(sfmtp);
        } while (rand32 > rand_thresh);
        sample_idx = (totq_magic * ((rand32 >> totq_preshift) + totq_incr)) >> totq_postshift;
        word_idx = sample_idx / BITCT;
        bit_mask = ONELU << (sample_idx % BITCT);
        cur_word = perm_vec[word_idx];
      } while (cur_word & bit_mask);
      perm_vec[word_idx] = cur_word | bit_mask;
    }
  } else {
    // Majority set: start all-one and clear tot_ct - set_ct random bits.
    fill_all_bits(tot_ct, perm_vec);
    for (remaining = tot_ct - set_ct; remaining; remaining--) {
      do {
        do {
          rand32 = sfmt_genrand_uint32(sfmtp);
        } while (rand32 > rand_thresh);
        sample_idx = (totq_magic * ((rand32 >> totq_preshift) + totq_incr)) >> totq_postshift;
        word_idx = sample_idx / BITCT;
        bit_mask = ONELU << (sample_idx % BITCT);
        cur_word = perm_vec[word_idx];
      } while (!(cur_word & bit_mask));
      perm_vec[word_idx] = cur_word - bit_mask;
    }
  }
}
141 
void generate_cc_cluster_perm_vec(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp) {
  // Cluster-stratified case/control permutation, 2-bit encoding.
  //
  // Copies preimage into perm_vec, then independently rescrambles case
  // assignments within each cluster so that cluster cluster_idx ends up with
  // exactly cluster_case_cts[cluster_idx] cases.  Clusters with a constant
  // phenotype (0 or cluster_size cases) are skipped, so preimage must
  // already hold the correct pattern for those samples.
  //
  // tot_quotients/totq_* hold per-cluster magic-number division constants
  // (tot_quotients[i] = 2^32 / cluster_size, etc.).
  uint32_t cluster_idx;
  uint32_t target_ct;    // number of bits to toggle in the current cluster
  uint32_t cluster_end;
  uint32_t* map_ptr;     // within-cluster index -> sample index translation
  uint32_t num_swapped;
  uint32_t cluster_size;
  uint32_t upper_bound;  // largest raw draw accepted by the rejection step
  uint64_t totq_magic;
  uint32_t totq_preshift;
  uint32_t totq_postshift;
  uint32_t totq_incr;
  uintptr_t widx;
  uintptr_t wcomp;
  uintptr_t pv_val;
  uint32_t urand;
  uint32_t uii;
  memcpy(perm_vec, preimage, QUATERCT_TO_ALIGNED_WORDCT(tot_ct) * sizeof(intptr_t));
  for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
    target_ct = cluster_case_cts[cluster_idx];
    cluster_end = cluster_starts[cluster_idx + 1];
    cluster_size = cluster_end - cluster_starts[cluster_idx];
    if (target_ct && (target_ct != cluster_size)) {
      upper_bound = cluster_size * tot_quotients[cluster_idx] - 1;
      totq_magic = totq_magics[cluster_idx];
      totq_preshift = totq_preshifts[cluster_idx];
      totq_postshift = totq_postshifts[cluster_idx];
      totq_incr = totq_incrs[cluster_idx];
      map_ptr = &(cluster_map[cluster_starts[cluster_idx]]);
      if (target_ct * 2 < cluster_size) {
	// Cases are the minority in this cluster (preimage is expected to
	// have the cluster all-control): set target_ct random member bits.
	for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
	  do {
	    do {
	      // rejection sampling: discard draws above upper_bound so the
	      // magic-number division below is uniform over the cluster
	      urand = sfmt_genrand_uint32(sfmtp);
	    } while (urand > upper_bound);
	    uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
	    widx = uii / BITCT2;
	    wcomp = ONELU << (2 * (uii % BITCT2));
	    pv_val = perm_vec[widx];
	  } while (pv_val & wcomp);  // redraw if sample already marked
	  perm_vec[widx] = pv_val | wcomp;
	}
      } else {
	// Cases are the majority (preimage expected all-case here): clear
	// cluster_size - target_ct random member bits instead.
	target_ct = cluster_size - target_ct;
	for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
	  do {
	    do {
	      urand = sfmt_genrand_uint32(sfmtp);
	    } while (urand > upper_bound);
	    uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
	    widx = uii / BITCT2;
	    wcomp = ONELU << (2 * (uii % BITCT2));
	    pv_val = perm_vec[widx];
	  } while (!(pv_val & wcomp));  // redraw if sample already cleared
	  perm_vec[widx] = pv_val - wcomp;
	}
      }
    }
  }
}
202 
void generate_cc_cluster_perm1(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp) {
  // generate_cc_cluster_perm_vec() variant using 1-bit packing instead of 2:
  // copies preimage into perm_vec, then rescrambles case bits within each
  // cluster so cluster cluster_idx keeps exactly cluster_case_cts[cluster_idx]
  // cases.  Clusters with a constant phenotype are left untouched.
  uint32_t tot_ctl = BITCT_TO_WORDCT(tot_ct);
  uint32_t cluster_idx;
  uint32_t target_ct;    // number of bits to toggle in the current cluster
  uint32_t cluster_end;
  uint32_t cluster_size;
  uint32_t* map_ptr;     // within-cluster index -> sample index translation
  uint32_t num_swapped;
  uint32_t upper_bound;  // largest raw draw accepted by the rejection step
  uint64_t totq_magic;
  uint32_t totq_preshift;
  uint32_t totq_postshift;
  uint32_t totq_incr;
  uintptr_t widx;
  uintptr_t wcomp;
  uintptr_t pv_val;
  uint32_t urand;
  uint32_t uii;
  memcpy(perm_vec, preimage, tot_ctl * sizeof(intptr_t));
  for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
    target_ct = cluster_case_cts[cluster_idx];
    cluster_end = cluster_starts[cluster_idx + 1];
    cluster_size = cluster_end - cluster_starts[cluster_idx];
    if (target_ct && (target_ct != cluster_size)) {
      upper_bound = cluster_size * tot_quotients[cluster_idx] - 1;
      totq_magic = totq_magics[cluster_idx];
      totq_preshift = totq_preshifts[cluster_idx];
      totq_postshift = totq_postshifts[cluster_idx];
      totq_incr = totq_incrs[cluster_idx];
      map_ptr = &(cluster_map[cluster_starts[cluster_idx]]);
      if (target_ct * 2 < cluster_size) {
	// Minority cases: set target_ct random member bits.
	for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
	  do {
	    do {
	      // rejection sampling keeps the magic-number division uniform
	      urand = sfmt_genrand_uint32(sfmtp);
	    } while (urand > upper_bound);
	    uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
	    widx = uii / BITCT;
	    wcomp = ONELU << (uii % BITCT);
	    pv_val = perm_vec[widx];
	  } while (pv_val & wcomp);  // redraw if sample already marked
	  perm_vec[widx] = pv_val | wcomp;
	}
      } else {
	// Majority cases: clear cluster_size - target_ct random member bits.
	target_ct = cluster_size - target_ct;
	for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
	  do {
	    do {
	      urand = sfmt_genrand_uint32(sfmtp);
	    } while (urand > upper_bound);
	    uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
	    widx = uii / BITCT;
	    wcomp = ONELU << (uii % BITCT);
	    pv_val = perm_vec[widx];
	  } while (!(pv_val & wcomp));  // redraw if sample already cleared
	  perm_vec[widx] = pv_val - wcomp;
	}
      }
    }
  }
}
264 
generate_cc_perms_thread(void * arg)265 THREAD_RET_TYPE generate_cc_perms_thread(void* arg) {
266   intptr_t tidx = (intptr_t)arg;
267   uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
268   uint32_t case_ct = g_perm_case_ct;
269   uint32_t tot_quotient = g_perm_tot_quotient;
270   uint64_t totq_magic = g_perm_totq_magic;
271   uint32_t totq_preshift = g_perm_totq_preshift;
272   uint32_t totq_postshift = g_perm_totq_postshift;
273   uint32_t totq_incr = g_perm_totq_incr;
274   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
275   sfmt_t* __restrict__ sfmtp = g_sfmtp_arr[tidx];
276   uintptr_t pheno_nm_ctv = BITCT_TO_WORDCT(pheno_nm_ct);
277   uint32_t pidx = (((uint64_t)tidx) * g_perm_vec_ct) / g_perm_generation_thread_ct;
278   uint32_t pmax = (((uint64_t)tidx + 1) * g_perm_vec_ct) / g_perm_generation_thread_ct;
279   if (!g_perm_is_1bit) {
280     pheno_nm_ctv *= 2;
281     for (; pidx < pmax; pidx++) {
282       generate_cc_perm_vec(pheno_nm_ct, case_ct, tot_quotient, totq_magic, totq_preshift, totq_postshift, totq_incr, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
283     }
284   } else {
285     // 16-byte alignment currently isn't needed; but it might be useful in the
286     // future, and the cost is low enough that I won't bother with writing the
287     // tiny-bit-more-efficient-half-the-time 8-byte alignment version for now.
288     pheno_nm_ctv = round_up_pow2(pheno_nm_ctv, 2);
289     for (; pidx < pmax; pidx++) {
290       generate_cc_perm1(pheno_nm_ct, case_ct, tot_quotient, totq_magic, totq_preshift, totq_postshift, totq_incr, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
291     }
292   }
293   THREAD_RETURN;
294 }
295 
THREAD_RET_TYPE generate_cc_cluster_perms_thread(void* arg) {
  // Thread body for cluster-stratified case/control permutation generation.
  // Thread tidx generates permutations [tidx * perm_vec_ct / thread_ct,
  // (tidx + 1) * perm_vec_ct / thread_ct), using its own sfmt RNG state so
  // threads never contend.  All g_perm_cluster_* inputs must be initialized
  // by the caller.
  intptr_t tidx = (intptr_t)arg;
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
  sfmt_t* __restrict__ sfmtp = g_sfmtp_arr[tidx];
  uintptr_t pheno_nm_ctv = BITCT_TO_WORDCT(pheno_nm_ct);
  uint32_t pidx = (((uint64_t)tidx) * g_perm_vec_ct) / g_perm_generation_thread_ct;
  uint32_t pmax = (((uint64_t)tidx + 1) * g_perm_vec_ct) / g_perm_generation_thread_ct;
  uint32_t cluster_ct = g_perm_cluster_ct;
  uint32_t* cluster_map = g_perm_cluster_map;
  uint32_t* cluster_starts = g_perm_cluster_starts;
  uint32_t* cluster_case_cts = g_perm_cluster_case_cts;
  uintptr_t* perm_cluster_cc_preimage = g_perm_cluster_cc_preimage;
  uint32_t* tot_quotients = g_perm_tot_quotients;
  uint64_t* totq_magics = g_perm_totq_magics;
  uint32_t* totq_preshifts = g_perm_totq_preshifts;
  uint32_t* totq_postshifts = g_perm_totq_postshifts;
  uint32_t* totq_incrs = g_perm_totq_incrs;
  if (!g_perm_is_1bit) {
    // 2-bit encoding: twice the per-permutation word stride.
    pheno_nm_ctv *= 2;
    for (; pidx < pmax; pidx++) {
      generate_cc_cluster_perm_vec(pheno_nm_ct, perm_cluster_cc_preimage, cluster_ct, cluster_map, cluster_starts, cluster_case_cts, tot_quotients, totq_magics, totq_preshifts, totq_postshifts, totq_incrs, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
    }
  } else {
    // Round stride up to an even word count to keep 16-byte alignment.
    pheno_nm_ctv = round_up_pow2(pheno_nm_ctv, 2);
    for (; pidx < pmax; pidx++) {
      generate_cc_cluster_perm1(pheno_nm_ct, perm_cluster_cc_preimage, cluster_ct, cluster_map, cluster_starts, cluster_case_cts, tot_quotients, totq_magics, totq_preshifts, totq_postshifts, totq_incrs, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
    }
  }
  THREAD_RETURN;
}
327 
THREAD_RET_TYPE generate_qt_perms_smajor_thread(void* arg) {
  // Used by QT --assoc and --make-perm-pheno.
  //
  // Takes an array of phenotype values in g_perm_pheno_d2 of length
  // g_perm_pheno_nm_ct, and populates g_perm_vecstd[] with permutations of
  // those values.  Also requires g_sfmtp_arr[] and
  // g_perm_generation_thread_ct to be initialized.
  //
  // g_perm_vecstd is sample-major.  The nth permutation is stored across
  //   g_perm_vecstd[n]
  //   g_perm_vecstd[n + perm_vec_ctcl8m]
  //   g_perm_vecstd[n + 2 * perm_vec_ctcl8m]
  //   ...
  //
  // Each thread builds the cacheline-aligned column slice [pmin, pmax) of
  // all permutations simultaneously, via an inside-out Fisher-Yates shuffle:
  // when phenotype #sample_idx is inserted, it goes to a uniformly random
  // row among the first sample_idx + 1, and the displaced value moves to row
  // sample_idx.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ctcl8 = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
  uintptr_t perm_vec_ctcl8m = perm_vec_ctcl8 * CACHELINE_DBL;  // row stride
  double* pheno_d2 = g_perm_pheno_d2;
  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
  uint32_t pmin = CACHELINE_DBL * ((((uint64_t)tidx) * perm_vec_ctcl8) / g_perm_generation_thread_ct);
  uint32_t pmax = CACHELINE_DBL * ((((uint64_t)tidx + 1) * perm_vec_ctcl8) / g_perm_generation_thread_ct);
  double* perm_vecstd = &(g_perm_vecstd[pmin]);
  uint32_t poffset = 0;
  uint32_t sample_idx = 1;
  uint32_t pdiff;  // number of permutation columns owned by this thread
  uint32_t tot_quotient;
  uint32_t upper_bound;  // largest raw draw accepted by the rejection step
  uint64_t totq_magic;
  uint32_t totq_preshift;
  uint32_t totq_postshift;
  uint32_t totq_incr;
  uint32_t urand;
  uint32_t uii;
  double* wptr;
  double* wptr2;
  double* wptr3;
  double cur_source;
  if (tidx + 1 == g_perm_generation_thread_ct) {
    // last thread also covers the unaligned remainder
    pmax = g_perm_vec_ct;
  }
  pdiff = pmax - pmin;
  // Sample 0 is trivially "permuted" into row 0 of every column.
  cur_source = *pheno_d2++;
  wptr = perm_vecstd;
  for (; poffset < pdiff; poffset++) {
    *wptr++ = cur_source;
  }
  for (; sample_idx < pheno_nm_ct; sample_idx++) {
    // Magic-number constants for unbiased draws from [0, sample_idx + 1).
    tot_quotient = 0x100000000LLU / (sample_idx + 1);
    upper_bound = (sample_idx + 1) * tot_quotient - 1;
    magic_num(tot_quotient, &totq_magic, &totq_preshift, &totq_postshift, &totq_incr);
    cur_source = *pheno_d2++;
    wptr = &(perm_vecstd[sample_idx * perm_vec_ctcl8m]);
    wptr2 = perm_vecstd;
    for (poffset = 0; poffset < pdiff; poffset++) {
      do {
	// rejection sampling keeps the division below uniform
	urand = sfmt_genrand_uint32(sfmtp);
      } while (urand > upper_bound);
      uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
      // Swap: displaced row-uii value goes to row sample_idx, new phenotype
      // value takes its place.
      wptr3 = &(wptr2[uii * perm_vec_ctcl8m]);
      *wptr++ = *wptr3;
      *wptr3 = cur_source;
      wptr2++;
    }
  }
  THREAD_RETURN;
}
394 
THREAD_RET_TYPE generate_qt_cluster_perms_smajor_thread(void* arg) {
  // Variant of generate_qt_perms_smajor_thread() which restricts permutations
  // to be within-cluster.
  // On top of the generate_qt_perms_smajor_thread requirements, this also
  // needs g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts,
  // g_perm_qt_cluster_thread_wkspace, and g_perm_sample_to_cluster to be
  // initialized.
  //
  // Same inside-out Fisher-Yates idea, but each sample is shuffled only
  // among the earlier members of its own cluster; samples not in any cluster
  // (sample_to_cluster[] == 0xffffffffU) keep their own phenotype value.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ctcl8 = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
  uintptr_t perm_vec_ctcl8m = perm_vec_ctcl8 * CACHELINE_DBL;  // row stride
  double* pheno_d2 = g_perm_pheno_d2;
  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
  uint32_t pmin = CACHELINE_DBL * ((((uint64_t)tidx) * perm_vec_ctcl8) / g_perm_generation_thread_ct);
  uint32_t pmax = CACHELINE_DBL * ((((uint64_t)tidx + 1) * perm_vec_ctcl8) / g_perm_generation_thread_ct);
  double* perm_vecstd = &(g_perm_vecstd[pmin]);
  uint32_t cluster_ct = g_perm_cluster_ct;
  uint32_t cluster_ctcl = (cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
  uint32_t* cluster_map = g_perm_cluster_map;
  uint32_t* cluster_starts = g_perm_cluster_starts;
  // per-thread scratch: how many members of each cluster seen so far
  uint32_t* in_cluster_positions = &(g_perm_qt_cluster_thread_wkspace[tidx * cluster_ctcl * CACHELINE_INT32]);
  uint32_t* sample_to_cluster = g_perm_sample_to_cluster;
  uint32_t poffset = 0;
  uint32_t sample_idx = 0;
  uint32_t* cur_map_start;
  uint32_t pdiff;  // number of permutation columns owned by this thread
  uint32_t cluster_idx;
  uint32_t cur_in_cluster_pos;
  uint32_t tot_quotient;
  uint32_t upper_bound;  // largest raw draw accepted by the rejection step
  uint64_t totq_magic;
  uint32_t totq_preshift;
  uint32_t totq_postshift;
  uint32_t totq_incr;
  uint32_t urand;
  uint32_t uii;
  double* wptr;
  double* wptr2;
  double* wptr3;
  double cur_source;
  if (tidx + 1 == g_perm_generation_thread_ct) {
    // last thread also covers the unaligned remainder
    pmax = g_perm_vec_ct;
  }
  pdiff = pmax - pmin;
  fill_uint_zero(cluster_ct, in_cluster_positions);
  for (; sample_idx < pheno_nm_ct; sample_idx++) {
    cur_source = *pheno_d2++;
    cluster_idx = sample_to_cluster[sample_idx];
    if (cluster_idx == 0xffffffffU) {
      // unclustered sample: treated like a cluster of size 1
      cur_in_cluster_pos = 0;
    } else {
      cur_in_cluster_pos = in_cluster_positions[cluster_idx];
      in_cluster_positions[cluster_idx] += 1;
    }
    wptr = &(perm_vecstd[sample_idx * perm_vec_ctcl8m]);
    if (!cur_in_cluster_pos) {
      // First member of its cluster (or unclustered): value stays in place
      // in every permutation column.
      for (poffset = 0; poffset < pdiff; poffset++) {
        *wptr++ = cur_source;
      }
    } else {
      cur_map_start = &(cluster_map[cluster_starts[cluster_idx]]);
      // Magic-number constants for unbiased draws from
      // [0, cur_in_cluster_pos + 1).
      tot_quotient = 0x100000000LLU / (cur_in_cluster_pos + 1);
      upper_bound = (cur_in_cluster_pos + 1) * tot_quotient - 1;
      magic_num(tot_quotient, &totq_magic, &totq_preshift, &totq_postshift, &totq_incr);
      wptr2 = perm_vecstd;
      for (poffset = 0; poffset < pdiff; poffset++) {
	do {
	  // rejection sampling keeps the division below uniform
	  urand = sfmt_genrand_uint32(sfmtp);
	} while (urand > upper_bound);
	uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
	// Swap with the chosen earlier cluster member's row.
	wptr3 = &(wptr2[cur_map_start[uii] * perm_vec_ctcl8m]);
	*wptr++ = *wptr3;
	*wptr3 = cur_source;
	wptr2++;
      }
    }
  }
  THREAD_RETURN;
}
474 
THREAD_RET_TYPE generate_qt_perms_pmajor_thread(void* arg) {
  // Used by --linear.  Requires g_perm_pheno_nm_ct, g_perm_pheno_d2,
  // g_sfmtp_arr, g_perm_generation_thread_ct, and g_perm_vec_ct to be
  // initialized, and space must be allocated for g_perm_pmajor.  The nth
  // permutation (0-based) is stored in g_perm_pmajor indices
  //   [n * sample_valid_ct] to [(n + 1) * sample_valid_ct - 1]
  // inclusive.
  //
  // Each permutation is generated independently with an inside-out
  // Fisher-Yates shuffle.  g_perm_precomputed_mods[sample_idx - 1] caches
  // the rejection lower bound (2^32 mod (sample_idx + 1)), so discarding
  // draws below it makes the modulus step unbiased.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t sample_valid_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ctcl = (g_perm_vec_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
  uintptr_t pmin = CACHELINE_INT32 * ((((uint64_t)tidx) * perm_vec_ctcl) / g_perm_generation_thread_ct);
  uintptr_t pmax = CACHELINE_INT32 * ((((uint64_t)tidx + 1) * perm_vec_ctcl) / g_perm_generation_thread_ct);
  double* perm_pmajor = &(g_perm_pmajor[pmin * sample_valid_ct]);
  double* pheno_d2 = g_perm_pheno_d2;
  uint32_t* precomputed_mods = g_perm_precomputed_mods;
  uint32_t* lbound_ptr;
  double* pheno_ptr;
  uint32_t poffset;
  uint32_t pdiff;  // number of permutations owned by this thread
  uint32_t sample_idx;
  uint32_t urand;
  uint32_t lbound;
  if (tidx + 1 == g_perm_generation_thread_ct) {
    // last thread also covers the unaligned remainder
    pmax = g_perm_vec_ct;
  }
  pdiff = pmax - pmin;
  for (poffset = 0; poffset < pdiff; poffset++) {
    lbound_ptr = precomputed_mods;
    pheno_ptr = pheno_d2;
    perm_pmajor[0] = *pheno_ptr++;
    for (sample_idx = 1; sample_idx < sample_valid_ct; sample_idx++) {
      lbound = *lbound_ptr++;
      do {
        // reject draws below lbound so urand % (sample_idx + 1) is uniform
        urand = sfmt_genrand_uint32(sfmtp);
      } while (urand < lbound);
      // er, this modulus operation is slow.  but doesn't seem to be worthwhile
      // to use magic numbers here.
      urand %= sample_idx + 1;
      // Inside-out swap: displaced value moves to slot sample_idx.
      perm_pmajor[sample_idx] = perm_pmajor[urand];
      perm_pmajor[urand] = *pheno_ptr++;
    }
    perm_pmajor = &(perm_pmajor[sample_valid_ct]);
  }
  THREAD_RETURN;
}
521 
THREAD_RET_TYPE generate_qt_cluster_perms_pmajor_thread(void* arg) {
  // On top of the linear_gen_perms_thread requirements, this also needs
  // g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts,
  // g_perm_qt_cluster_thread_wkspace, and g_perm_sample_to_cluster to be
  // initialized.
  //
  // Within-cluster variant of generate_qt_perms_pmajor_thread(): each sample
  // is shuffled only among the earlier members of its own cluster, and
  // unclustered samples (sample_to_cluster[] == 0xffffffffU) keep their own
  // phenotype value.
  uintptr_t tidx = (uintptr_t)arg;
  uint32_t sample_valid_ct = g_perm_pheno_nm_ct;
  uintptr_t perm_vec_ctcl = (g_perm_vec_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
  uintptr_t pmin = CACHELINE_INT32 * ((((uint64_t)tidx) * perm_vec_ctcl) / g_perm_generation_thread_ct);
  uintptr_t pmax = CACHELINE_INT32 * ((((uint64_t)tidx + 1) * perm_vec_ctcl) / g_perm_generation_thread_ct);
  double* perm_pmajor = &(g_perm_pmajor[pmin * sample_valid_ct]);
  double* pheno_d2 = g_perm_pheno_d2;
  // Deliberate -1 offset: the array is only read at cur_in_cluster_pos >= 1
  // below, so precomputed_mods[cur_in_cluster_pos] maps to
  // g_perm_precomputed_mods[cur_in_cluster_pos - 1], the rejection bound for
  // modulus (cur_in_cluster_pos + 1).
  uint32_t* precomputed_mods = &(g_perm_precomputed_mods[-1]);
  uint32_t cluster_ct = g_perm_cluster_ct;
  uint32_t cluster_ctcl = (cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
  uint32_t* cluster_map = g_perm_cluster_map;
  uint32_t* cluster_starts = g_perm_cluster_starts;
  // per-thread scratch: how many members of each cluster seen so far
  uint32_t* in_cluster_positions = &(g_perm_qt_cluster_thread_wkspace[tidx * cluster_ctcl * CACHELINE_INT32]);
  uint32_t* sample_to_cluster = g_perm_sample_to_cluster;
  double* pheno_ptr;
  uint32_t poffset;
  uint32_t pdiff;  // number of permutations owned by this thread
  uint32_t cluster_idx;
  uint32_t cur_in_cluster_pos;
  uint32_t sample_idx;
  uint32_t urand;
  uint32_t lbound;
  uint32_t uii;
  if (tidx + 1 == g_perm_generation_thread_ct) {
    // last thread also covers the unaligned remainder
    pmax = g_perm_vec_ct;
  }
  pdiff = pmax - pmin;
  for (poffset = 0; poffset < pdiff; poffset++) {
    fill_uint_zero(cluster_ct, in_cluster_positions);
    pheno_ptr = pheno_d2;
    for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
      cluster_idx = sample_to_cluster[sample_idx];
      if (cluster_idx == 0xffffffffU) {
	// unclustered sample: treated like a cluster of size 1
	cur_in_cluster_pos = 0;
      } else {
	cur_in_cluster_pos = in_cluster_positions[cluster_idx];
	in_cluster_positions[cluster_idx] += 1;
      }
      if (!cur_in_cluster_pos) {
	// First member of its cluster (or unclustered): stays in place.
        perm_pmajor[sample_idx] = *pheno_ptr++;
      } else {
        lbound = precomputed_mods[cur_in_cluster_pos];
        do {
	  // reject low draws so the modulus below is uniform
	  urand = sfmt_genrand_uint32(sfmtp);
	} while (urand < lbound);
	urand %= (cur_in_cluster_pos + 1);
	// Swap with the chosen earlier cluster member's slot.
	uii = cluster_map[cluster_starts[cluster_idx] + urand];
        perm_pmajor[sample_idx] = perm_pmajor[uii];
	perm_pmajor[uii] = *pheno_ptr++;
      }
    }
    perm_pmajor = &(perm_pmajor[sample_valid_ct]);
  }
  THREAD_RETURN;
}
583 
584 
void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
  // Transpose permutations so PRESTO/PERMORY-style genotype indexing can work.
  //
  // We used a 32-ply interleaved format, to allow counts up to the uint32_t
  // limit without giving up highly parallel adds in the calc_git() inner loop
  // (performed with a combination of unroll_incr_1_4, unroll_incr_4_8, and
  // unroll_incr_8_32).  The index order is:
  // 64-bit build:
  //   first 16 bytes: 0 32 64 96 16 48 80 112 4 36 68 100 20 52 84 116
  //     8 40 72 104 24 56 88 120 12 44 76 108 28 60 92 124 1...
  //   next 16 bytes: 128 160 192...
  //
  // 32-bit build:
  //   first 4 bytes: 0 8 16 24 4 12 20 28 1 9 17 25 5 13 21 29 2 10 18...
  //   next 4 bytes: 32 40 48...
  //
  // Implementation note: the goto jumps into the middle of the do-while, so
  // the first iteration runs the accumulator-initialization half of the
  // flush branch without writing out a (not yet populated) buffer.
  uintptr_t sample_idx = 0;
  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);  // per-perm word stride
#ifdef __LP64__
  uint32_t wbuf[4];  // accumulates one 128-permutation interleaved group
  uint32_t* wbptr;
#else
  uint32_t wval;  // accumulates one 32-permutation interleaved group
#endif
  uint32_t rshift;  // bit position of this sample within its word
  uint32_t wshift;  // destination bit position in the interleaved output
  uintptr_t* pvptr;
  uintptr_t perm_idx;
  for (; sample_idx < pheno_nm_ct; sample_idx++) {
    perm_idx = 0;
    pvptr = &(perm_vecs[sample_idx / BITCT2]);
    rshift = 2 * (sample_idx % BITCT2);
    goto transpose_perms_loop_start;
#ifdef __LP64__
    do {
      if (!(perm_idx % 4)) {
	if (perm_idx % 128) {
	  // advance to next interleave slot within the current group
	  wshift = ((perm_idx & 96) >> 5) | ((perm_idx & 16) >> 2) | ((perm_idx & 12) << 1);
	} else {
	  // group of 128 complete: flush and restart the accumulator
	  memcpy(perm_vecst, wbuf, 16);
	  perm_vecst = &(perm_vecst[4]);
	transpose_perms_loop_start:
	  fill_uint_zero(4, wbuf);
	  wshift = 0;
	}
	wbptr = wbuf;
      }
      *wbptr |= ((pvptr[perm_idx * pheno_nm_ctv2] >> rshift) & 1) << wshift;
      wbptr++;
    } while (++perm_idx < perm_vec_ct);
    // flush the final (possibly partial) group
    memcpy(perm_vecst, wbuf, 16);
    perm_vecst = &(perm_vecst[4]);
#else
    do {
      if (perm_idx % 32) {
	wshift = ((perm_idx & 24) >> 3) | (perm_idx & 4) | ((perm_idx & 3) << 3);
      } else {
	// group of 32 complete: flush and restart the accumulator
	*perm_vecst++ = wval;
      transpose_perms_loop_start:
	wval = 0;
	wshift = 0;
      }
      wval |= ((pvptr[perm_idx * pheno_nm_ctv2] >> rshift) & 1) << wshift;
    } while (++perm_idx < perm_vec_ct);
    *perm_vecst++ = wval;
#endif
  }
}
652 
transpose_perm1s(uintptr_t * perm_vecs,uint32_t perm_vec_ct,uint32_t pheno_nm_ct,uint32_t * perm_vecst)653 void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
654   uintptr_t sample_idx = 0;
655   uintptr_t pheno_nm_ctv = BITCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
656 #ifdef __LP64__
657   uint32_t wbuf[4];
658   uint32_t* wbptr;
659 #else
660   uint32_t wval;
661 #endif
662   uint32_t rshift;
663   uint32_t wshift;
664   uintptr_t* pvptr;
665   uintptr_t perm_idx;
666   for (; sample_idx < pheno_nm_ct; sample_idx++) {
667     perm_idx = 0;
668     pvptr = &(perm_vecs[sample_idx / BITCT]);
669     rshift = sample_idx % BITCT;
670     goto transpose_perm1s_loop_start;
671 #ifdef __LP64__
672     do {
673       if (!(perm_idx % 4)) {
674 	if (perm_idx % 128) {
675 	  wshift = ((perm_idx & 96) >> 5) | ((perm_idx & 16) >> 2) | ((perm_idx & 12) << 1);
676 	} else {
677 	  memcpy(perm_vecst, wbuf, 16);
678 	  perm_vecst = &(perm_vecst[4]);
679 	transpose_perm1s_loop_start:
680 	  fill_uint_zero(2, wbuf);
681 	  wshift = 0;
682 	}
683 	wbptr = wbuf;
684       }
685       *wbptr |= ((pvptr[perm_idx * pheno_nm_ctv] >> rshift) & 1) << wshift;
686       wbptr++;
687     } while (++perm_idx < perm_vec_ct);
688     memcpy(perm_vecst, wbuf, 16);
689     perm_vecst = &(perm_vecst[4]);
690 #else
691     do {
692       if (perm_idx % 32) {
693 	wshift = ((perm_idx & 24) >> 3) | (perm_idx & 4) | ((perm_idx & 3) << 3);
694       } else {
695 	*perm_vecst++ = wval;
696       transpose_perm1s_loop_start:
697 	wval = 0;
698 	wshift = 0;
699       }
700       wval |= ((pvptr[perm_idx * pheno_nm_ctv] >> rshift) & 1) << wshift;
701     } while (++perm_idx < perm_vec_ct);
702     *perm_vecst++ = wval;
703 #endif
704   }
705 }
706 
707 
make_perm_pheno(pthread_t * threads,char * outname,char * outname_end,uintptr_t unfiltered_sample_ct,uintptr_t * sample_exclude,uintptr_t sample_ct,char * sample_ids,uintptr_t max_sample_id_len,uint32_t cluster_ct,uint32_t * cluster_map,uint32_t * cluster_starts,uint32_t pheno_nm_ct,uintptr_t * pheno_nm,uintptr_t * pheno_c,double * pheno_d,char * output_missing_pheno,uint32_t permphe_ct)708 int32_t make_perm_pheno(pthread_t* threads, char* outname, char* outname_end, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, double* pheno_d, char* output_missing_pheno, uint32_t permphe_ct) {
709   unsigned char* bigstack_mark = g_bigstack_base;
710   FILE* outfile = nullptr;
711   uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
712   uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
713   uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS);
714   uintptr_t perm_vec_ctcl8m = 0;
715   char* writebuf = nullptr;
716   int32_t retval = 0;
717   uintptr_t* ulptr;
718   double* dptr;
719   char* wptr;
720   uintptr_t sample_uidx;
721   uintptr_t sample_idx;
722   uintptr_t perm_idx;
723   uintptr_t ulii;
724   uint32_t sample_nmidx;
725   uint32_t rshift;
726   if (!pheno_nm_ct) {
727     logerrprint("Error: --make-perm-pheno requires phenotype data.\n");
728     goto make_perm_pheno_ret_INVALID_CMDLINE;
729   }
730   g_perm_generation_thread_ct = MINV(g_thread_ct, permphe_ct);
731   if (bigstack_init_sfmtp(g_perm_generation_thread_ct)) {
732     goto make_perm_pheno_ret_NOMEM;
733   }
734   g_perm_pheno_nm_ct = pheno_nm_ct;
735   g_perm_vec_ct = permphe_ct;
736   ulii = 0;
737   if (pheno_c) {
738     g_perm_is_1bit = 1;
739     g_perm_case_ct = popcount_longs(pheno_c, unfiltered_sample_ctl);
740     // could seamlessly support multipass by using different permutation logic,
741     // but pointless in practice; better to just generate multiple files
742     if (bigstack_alloc_ul(permphe_ct * pheno_nm_ctv, &g_perm_vecs)) {
743       goto make_perm_pheno_ret_NOMEM;
744     }
745     if (cluster_starts) {
746       // most similar to testmiss()
747       retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 1, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, &g_perm_cluster_case_cts, &g_perm_cluster_cc_preimage);
748       if (retval) {
749 	goto make_perm_pheno_ret_1;
750       }
751       if (!g_perm_cluster_ct) {
752         logerrprint("Error: Degenerate --make-perm-pheno invocation (no size 2+ clusters).\n");
753         goto make_perm_pheno_ret_INVALID_CMDLINE;
754       }
755       retval = cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs);
756       if (retval) {
757         goto make_perm_pheno_ret_1;
758       }
759       // not actually much of a point to multithreading since this is I/O
760       // bound, but what the hell, the permutation generators already support
761       // it
762       if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
763 	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
764       }
765       generate_cc_cluster_perms_thread((void*)ulii);
766     } else {
767       g_perm_cluster_starts = nullptr;
768       g_perm_tot_quotient = 0x100000000LLU / pheno_nm_ct;
769       magic_num(g_perm_tot_quotient, &g_perm_totq_magic, &g_perm_totq_preshift, &g_perm_totq_postshift, &g_perm_totq_incr);
770       if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
771 	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
772       }
773       generate_cc_perms_thread((void*)ulii);
774     }
775   } else {
776     g_perm_pheno_d2 = (double*)alloc_and_init_collapsed_arr_incl((char*)pheno_d, sizeof(double), unfiltered_sample_ct, pheno_nm, pheno_nm_ct, 1);
777     if (!g_perm_pheno_d2) {
778       goto make_perm_pheno_ret_NOMEM;
779     }
780     perm_vec_ctcl8m = round_up_pow2(permphe_ct, CACHELINE_DBL);
781     if (bigstack_alloc_d(perm_vec_ctcl8m * pheno_nm_ct, &g_perm_vecstd)) {
782       goto make_perm_pheno_ret_NOMEM;
783     }
784     if (cluster_starts) {
785       retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, nullptr, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, nullptr, nullptr);
786       if (retval) {
787 	goto make_perm_pheno_ret_1;
788       }
789       if (!g_perm_cluster_ct) {
790         logerrprint("Error: Degenerate --make-perm-pheno invocation (no size 2+ clusters).\n");
791         goto make_perm_pheno_ret_INVALID_CMDLINE;
792       }
793       if (bigstack_alloc_ui(pheno_nm_ct, &g_perm_sample_to_cluster) ||
794           bigstack_alloc_ui(g_perm_generation_thread_ct * round_up_pow2(g_perm_cluster_ct, CACHELINE_INT32), &g_perm_qt_cluster_thread_wkspace)) {
795 	goto make_perm_pheno_ret_NOMEM;
796       }
797       fill_unfiltered_sample_to_cluster(pheno_nm_ct, g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, g_perm_sample_to_cluster);
798       if (spawn_threads(threads, &generate_qt_cluster_perms_smajor_thread, g_perm_generation_thread_ct)) {
799 	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
800       }
801       generate_qt_cluster_perms_smajor_thread((void*)ulii);
802     } else {
803       if (spawn_threads(threads, &generate_qt_perms_smajor_thread, g_perm_generation_thread_ct)) {
804 	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
805       }
806       generate_qt_perms_smajor_thread((void*)ulii);
807     }
808     if (bigstack_alloc_c(permphe_ct * 16LU, &writebuf)) {
809       goto make_perm_pheno_ret_NOMEM;
810     }
811   }
812   join_threads(threads, g_perm_generation_thread_ct);
813   memcpy(outname_end, ".pphe", 6);
814   if (fopen_checked(outname, "w", &outfile)) {
815     goto make_perm_pheno_ret_OPEN_FAIL;
816   }
817   sample_nmidx = 0;
818   for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
819     next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
820     fputs(&(sample_ids[sample_uidx * max_sample_id_len]), outfile);
821     if (!IS_SET(pheno_nm, sample_uidx)) {
822       for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) {
823 	putc_unlocked('\t', outfile);
824 	fputs(output_missing_pheno, outfile);
825       }
826     } else if (pheno_c) {
827       ulptr = &(g_perm_vecs[sample_nmidx / BITCT]);
828       rshift = sample_nmidx % BITCT;
829       for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) {
830 	putc_unlocked('\t', outfile);
831         putc_unlocked('1' + ((ulptr[perm_idx * pheno_nm_ctv] >> rshift) & 1), outfile);
832       }
833       sample_nmidx++;
834     } else {
835       wptr = writebuf;
836       dptr = &(g_perm_vecstd[sample_nmidx * perm_vec_ctcl8m]);
837       for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) {
838 	*wptr++ = '\t';
839         wptr = dtoa_g(*dptr++, wptr);
840       }
841       if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
842 	goto make_perm_pheno_ret_WRITE_FAIL;
843       }
844       sample_nmidx++;
845     }
846     if (putc_checked('\n', outfile)) {
847       goto make_perm_pheno_ret_WRITE_FAIL;
848     }
849   }
850   if (fclose_null(&outfile)) {
851     goto make_perm_pheno_ret_WRITE_FAIL;
852   }
853   LOGPRINTFWW("--make-perm-pheno: Permuted phenotypes written to %s .\n", outname);
854   while (0) {
855   make_perm_pheno_ret_NOMEM:
856     retval = RET_NOMEM;
857     break;
858   make_perm_pheno_ret_OPEN_FAIL:
859     retval = RET_OPEN_FAIL;
860     break;
861   make_perm_pheno_ret_WRITE_FAIL:
862     retval = RET_WRITE_FAIL;
863     break;
864   make_perm_pheno_ret_INVALID_CMDLINE:
865     retval = RET_INVALID_CMDLINE;
866     break;
867   make_perm_pheno_ret_THREAD_CREATE_FAIL:
868     retval = RET_THREAD_CREATE_FAIL;
869     break;
870   }
871  make_perm_pheno_ret_1:
872   bigstack_reset(bigstack_mark);
873   fclose_cond(outfile);
874   return retval;
875 }
876