1 // This file is part of PLINK 1.90, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "plink_common.h"
19
20 #include "plink_set.h"
21
set_init(Set_info * sip,Annot_info * aip)22 void set_init(Set_info* sip, Annot_info* aip) {
23 sip->fname = nullptr;
24 sip->setnames_flattened = nullptr;
25 sip->subset_fname = nullptr;
26 sip->merged_set_name = nullptr;
27 sip->genekeep_flattened = nullptr;
28 sip->modifier = 0;
29 // bugfix (17 Jul 2018): make_set_border not zero-initialized
30 sip->make_set_border = 0;
31 sip->set_r2 = 0.5;
32 sip->set_p = 0.05;
33 sip->set_test_lambda = 0.0;
34 sip->set_max = 5;
35 sip->ct = 0;
36 // sip->names, sip->setdefs not accessed if ct == 0
37 aip->fname = nullptr;
38 aip->attrib_fname = nullptr;
39 aip->ranges_fname = nullptr;
40 aip->filter_fname = nullptr;
41 aip->snps_fname = nullptr;
42 aip->subset_fname = nullptr;
43 aip->snpfield = nullptr;
44 aip->modifier = 0;
45 aip->border = 0;
46 }
47
set_cleanup(Set_info * sip,Annot_info * aip)48 void set_cleanup(Set_info* sip, Annot_info* aip) {
49 free_cond(sip->fname);
50 free_cond(sip->setnames_flattened);
51 free_cond(sip->subset_fname);
52 free_cond(sip->merged_set_name);
53 free_cond(sip->genekeep_flattened);
54 free_cond(aip->fname);
55 free_cond(aip->attrib_fname);
56 free_cond(aip->ranges_fname);
57 free_cond(aip->filter_fname);
58 free_cond(aip->snps_fname);
59 free_cond(aip->subset_fname);
60 free_cond(aip->snpfield);
61 }
62
in_setdef(uint32_t * setdef,uint32_t marker_idx)63 uint32_t in_setdef(uint32_t* setdef, uint32_t marker_idx) {
64 uint32_t range_ct = setdef[0];
65 uint32_t idx_base;
66 if (range_ct != 0xffffffffU) {
67 if (!range_ct) {
68 return 0;
69 }
70 return (uint32arr_greater_than(&(setdef[1]), range_ct * 2, marker_idx + 1) & 1);
71 } else {
72 idx_base = setdef[1];
73 if ((marker_idx < idx_base) || (marker_idx >= idx_base + setdef[2])) {
74 return setdef[3];
75 }
76 return is_set((uintptr_t*)(&(setdef[4])), marker_idx - idx_base);
77 }
78 }
79
in_setdef_dist(uint32_t * setdef,uint32_t pos,uint32_t border,int32_t * dist_ptr)80 uint32_t in_setdef_dist(uint32_t* setdef, uint32_t pos, uint32_t border, int32_t* dist_ptr) {
81 uint32_t range_ct = setdef[0];
82 uint32_t uii;
83 int32_t ii;
84 uii = uint32arr_greater_than(&(setdef[1]), range_ct * 2, pos + 1);
85 if (uii & 1) {
86 *dist_ptr = 0;
87 return 1;
88 }
89 // check distance from two nearest interval boundaries
90 if (!uii) {
91 if (pos + border >= setdef[1]) {
92 *dist_ptr = ((int32_t)pos) - ((int32_t)setdef[1]);
93 return 1;
94 }
95 return 0;
96 } else if (uii == range_ct * 2) {
97 if (setdef[uii] + border > pos) {
98 *dist_ptr = pos + 1 - setdef[uii];
99 return 1;
100 }
101 return 0;
102 } else {
103 // this is actually the uncommon case, since range_ct is usually 1
104 // ties broken in favor of negative numbers, to match PLINK 1.07 annot.cpp
105 if (setdef[uii] + border > pos) {
106 ii = pos + 1 - setdef[uii];
107 if (pos + ((uint32_t)ii) > setdef[uii + 1]) {
108 ii = ((int32_t)pos) - ((int32_t)setdef[uii + 1]);
109 }
110 *dist_ptr = ii;
111 return 1;
112 } else if (pos + border >= setdef[uii + 1]) {
113 *dist_ptr = ((int32_t)pos) - ((int32_t)setdef[uii + 1]);
114 return 1;
115 }
116 return 0;
117 }
118 }
119
interval_in_setdef(uint32_t * setdef,uint32_t marker_idx_start,uint32_t marker_idx_end)120 uint32_t interval_in_setdef(uint32_t* setdef, uint32_t marker_idx_start, uint32_t marker_idx_end) {
121 // expects half-open interval coordinates as input
122 // assumes interval is nonempty
123 uint32_t range_ct = setdef[0];
124 uint32_t idx_base;
125 uint32_t uii;
126 if (range_ct != 0xffffffffU) {
127 if (!range_ct) {
128 return 0;
129 }
130 uii = uint32arr_greater_than(&(setdef[1]), range_ct * 2, marker_idx_start + 1);
131 if (uii & 1) {
132 return 1;
133 } else if (uii == range_ct * 2) {
134 return 0;
135 }
136 return uint32arr_greater_than(&(setdef[1 + uii]), range_ct * 2 - uii, marker_idx_end);
137 } else {
138 idx_base = setdef[1];
139 if ((marker_idx_end <= idx_base) || (marker_idx_start >= idx_base + setdef[2])) {
140 return setdef[3];
141 }
142 if (marker_idx_start < idx_base) {
143 if (setdef[3]) {
144 return 1;
145 }
146 marker_idx_start = 0;
147 } else {
148 marker_idx_start -= idx_base;
149 }
150 if (marker_idx_end > idx_base + setdef[2]) {
151 if (setdef[3]) {
152 return 1;
153 }
154 marker_idx_end = setdef[2];
155 } else {
156 marker_idx_end -= idx_base;
157 }
158 uii = next_set((uintptr_t*)(&(setdef[4])), marker_idx_start, marker_idx_end);
159 return (uii < marker_idx_end);
160 }
161 }
162
setdef_size(uint32_t * setdef,uint32_t marker_ct)163 uint32_t setdef_size(uint32_t* setdef, uint32_t marker_ct) {
164 uint32_t range_ct = setdef[0];
165 uint32_t total_size = 0;
166 uint32_t uii;
167 if (range_ct != 0xffffffffU) {
168 for (uii = 0; uii < range_ct; uii++) {
169 total_size += setdef[uii * 2 + 2] - setdef[uii * 2 + 1];
170 }
171 } else {
172 if (setdef[3]) {
173 total_size = marker_ct - setdef[2];
174 }
175 total_size += popcount_bit_idx((uintptr_t*)(&(setdef[4])), 0, setdef[2]);
176 }
177 return total_size;
178 }
179
setdef_iter_init(uint32_t * setdef,uint32_t marker_ct,uint32_t start_idx,uint32_t * cur_idx_ptr,uint32_t * aux_ptr)180 void setdef_iter_init(uint32_t* setdef, uint32_t marker_ct, uint32_t start_idx, uint32_t* cur_idx_ptr, uint32_t* aux_ptr) {
181 uint32_t range_ct = setdef[0];
182 uint32_t uii;
183 if (range_ct != 0xffffffffU) {
184 if (!range_ct) {
185 *cur_idx_ptr = 0;
186 *aux_ptr = 0;
187 return;
188 }
189 uii = uint32arr_greater_than(&(setdef[1]), range_ct * 2, start_idx + 1);
190 if (uii % 2) {
191 *cur_idx_ptr = start_idx;
192 *aux_ptr = uii + 1;
193 } else if (uii < range_ct * 2) {
194 *cur_idx_ptr = setdef[uii + 1];
195 *aux_ptr = uii + 2;
196 } else {
197 *cur_idx_ptr = setdef[uii];
198 *aux_ptr = uii;
199 }
200 } else {
201 // aux value may be redefined; just needs to be compatible with
202 // setdef_iter()
203 if (setdef[3]) {
204 *cur_idx_ptr = start_idx;
205 *aux_ptr = marker_ct - setdef[1];
206 } else {
207 if (start_idx <= setdef[1]) {
208 *cur_idx_ptr = setdef[1];
209 } else {
210 *cur_idx_ptr = start_idx;
211 }
212 *aux_ptr = setdef[2];
213 }
214 }
215 }
216
setdef_iter(uint32_t * setdef,uint32_t * cur_idx_ptr,uint32_t * aux_ptr)217 uint32_t setdef_iter(uint32_t* setdef, uint32_t* cur_idx_ptr, uint32_t* aux_ptr) {
218 // Iterator. Returns 0 if end of set, 1 if *not* end.
219 // Assumes cur_idx and aux were initialized with setdef_iter_init(), and
220 // (after the first call) cur_idx is incremented right before this is called.
221 uint32_t range_ct = setdef[0];
222 uint32_t cur_idx = *cur_idx_ptr;
223 uint32_t aux = *aux_ptr;
224 if (range_ct != 0xffffffffU) {
225 if (cur_idx < setdef[aux]) {
226 return 1;
227 } else if (aux < range_ct * 2) {
228 *cur_idx_ptr = setdef[aux + 1];
229 *aux_ptr = aux + 2;
230 return 1;
231 } else {
232 return 0;
233 }
234 } else if (cur_idx < setdef[1]) {
235 return 1; // only possible if setdef[3] set
236 } else {
237 cur_idx -= setdef[1];
238 if (cur_idx < setdef[2]) {
239 if (is_set((uintptr_t*)(&(setdef[4])), cur_idx)) {
240 return 1;
241 }
242 cur_idx = next_set((uintptr_t*)(&(setdef[4])), cur_idx, setdef[2]);
243 *cur_idx_ptr = cur_idx + setdef[1];
244 if (cur_idx < setdef[2]) {
245 return 1;
246 }
247 }
248 return (cur_idx < aux)? 1 : 0;
249 }
250 }
251
alloc_and_populate_nonempty_set_incl(Set_info * sip,uint32_t * nonempty_set_ct_ptr,uintptr_t ** nonempty_set_incl_ptr)252 uint32_t alloc_and_populate_nonempty_set_incl(Set_info* sip, uint32_t* nonempty_set_ct_ptr, uintptr_t** nonempty_set_incl_ptr) {
253 uint32_t raw_set_ct = sip->ct;
254 uint32_t raw_set_ctl = BITCT_TO_WORDCT(raw_set_ct);
255 uint32_t nonempty_set_ct = 0;
256 uintptr_t* nonempty_set_incl;
257 uint32_t set_uidx;
258 if (bigstack_calloc_ul(raw_set_ctl, nonempty_set_incl_ptr)) {
259 return 1;
260 }
261 nonempty_set_incl = *nonempty_set_incl_ptr;
262 for (set_uidx = 0; set_uidx < raw_set_ct; set_uidx++) {
263 if (sip->setdefs[set_uidx][0]) {
264 set_bit(set_uidx, nonempty_set_incl);
265 nonempty_set_ct++;
266 }
267 }
268 *nonempty_set_ct_ptr = nonempty_set_ct;
269 return 0;
270 }
271
load_range_list(FILE * infile,uint32_t track_set_names,uint32_t border_extend,uint32_t collapse_group,uint32_t fail_on_no_sets,uint32_t c_prefix,uint32_t allow_no_variants,uintptr_t subset_ct,char * sorted_subset_ids,uintptr_t max_subset_id_len,uint32_t * marker_pos,Chrom_info * chrom_info_ptr,uintptr_t * set_ct_ptr,char ** set_names_ptr,uintptr_t * max_set_id_len_ptr,Make_set_range *** make_set_range_arr_ptr,uint64_t ** range_sort_buf_ptr,const char * file_descrip)272 int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_extend, uint32_t collapse_group, uint32_t fail_on_no_sets, uint32_t c_prefix, uint32_t allow_no_variants, uintptr_t subset_ct, char* sorted_subset_ids, uintptr_t max_subset_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, uintptr_t* set_ct_ptr, char** set_names_ptr, uintptr_t* max_set_id_len_ptr, Make_set_range*** make_set_range_arr_ptr, uint64_t** range_sort_buf_ptr, const char* file_descrip) {
273 // Called directly by extract_exclude_range(), define_sets(), and indirectly
274 // by annotate(), gene_report(), and clump_reports().
275 // Assumes caller will reset g_bigstack_end later.
276 Ll_str* make_set_ll = nullptr;
277 char* set_names = nullptr;
278 uintptr_t set_ct = 0;
279 uintptr_t max_set_id_len = 0;
280 uintptr_t line_idx = 0;
281 uint32_t chrom_start = 0;
282 uint32_t chrom_end = 0;
283 int32_t retval = 0;
284 Make_set_range** make_set_range_arr;
285 Make_set_range* msr_tmp;
286 Ll_str* ll_tmp;
287 char* bufptr;
288 char* bufptr2;
289 char* bufptr3;
290 uintptr_t set_idx;
291 uintptr_t ulii;
292 uint32_t chrom_idx;
293 uint32_t range_first;
294 uint32_t range_last;
295 uint32_t uii;
296 uint32_t ujj;
297 {
298 g_textbuf[MAXLINELEN - 1] = ' ';
299 // if we need to track set names, put together a sorted list
300 if (track_set_names) {
301 while (fgets(g_textbuf, MAXLINELEN, infile)) {
302 line_idx++;
303 if (!g_textbuf[MAXLINELEN - 1]) {
304 sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file is pathologically long.\n", line_idx, file_descrip);
305 goto load_range_list_ret_INVALID_FORMAT_2;
306 }
307 char* textbuf_first_token = skip_initial_spaces(g_textbuf);
308 if (is_eoln_kns(*textbuf_first_token)) {
309 continue;
310 }
311 char* first_token_end = token_endnn(textbuf_first_token);
312 bufptr2 = next_token_mult(first_token_end, 3);
313 if (!collapse_group) {
314 bufptr3 = bufptr2;
315 } else {
316 bufptr3 = next_token(bufptr2);
317 }
318 if (no_more_tokens_kns(bufptr3)) {
319 sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file has fewer tokens than expected.\n", line_idx, file_descrip);
320 goto load_range_list_ret_INVALID_FORMAT_2;
321 }
322 const uint32_t chrom_name_slen = (uintptr_t)(first_token_end - textbuf_first_token);
323 *first_token_end = '\0';
324 int32_t cur_chrom_code = get_chrom_code(textbuf_first_token, chrom_info_ptr, chrom_name_slen);
325 if (cur_chrom_code < 0) {
326 // kludge (21 Jan 2020): in the --extract/exclude range case, we
327 // should just skip lines that mention a chromosome absent from the
328 // dataset instead of erroring out, if --allow-extra-chr is
329 // specified. (And it's not obvious whether there's any value in
330 // erroring out when --allow-extra-chr isn't specified, for that
331 // matter; I won't bother for now since it would require adding
332 // another function argument.)
333 if ((!set_ct_ptr) && (cur_chrom_code == -1)) {
334 continue;
335 }
336 sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
337 goto load_range_list_ret_INVALID_FORMAT_2;
338 }
339 // chrom_mask check removed, we want to track empty sets
340 uii = strlen_se(bufptr2);
341 bufptr2[uii] = '\0';
342 if (subset_ct) {
343 if (bsearch_str(bufptr2, uii, sorted_subset_ids, max_subset_id_len, subset_ct) == -1) {
344 continue;
345 }
346 }
347 if (collapse_group) {
348 uii = strlen_se(bufptr3);
349 bufptr3[uii] = '\0';
350 }
351 // when there are repeats, they are likely to be next to each other
352 if (make_set_ll && (!strcmp(make_set_ll->ss, bufptr3))) {
353 continue;
354 }
355 uii++;
356 // argh, --clump counts positional overlaps which don't include any
357 // variants in the dataset. So we prefix set IDs with a chromosome
358 // index in that case (with leading zeroes) and treat cross-chromosome
359 // sets as distinct.
360 if (!marker_pos) {
361 uii += 4;
362 }
363 if (uii > max_set_id_len) {
364 max_set_id_len = uii;
365 }
366 if (bigstack_end_alloc_llstr(uii, &ll_tmp)) {
367 goto load_range_list_ret_NOMEM;
368 }
369 ll_tmp->next = make_set_ll;
370 if (marker_pos) {
371 memcpy(ll_tmp->ss, bufptr3, uii);
372 } else {
373 // quasi-bugfix (7 Oct 2017): forgot to check for this
374 if (cur_chrom_code > 9999) {
375 logerrprint("Error: This command does not support 10000+ contigs.\n");
376 goto load_range_list_ret_INVALID_FORMAT;
377 }
378 uitoa_z4((uint32_t)cur_chrom_code, ll_tmp->ss);
379 // if first character of gene name is a digit, natural sort has
380 // strange effects unless we force [3] to be nonnumeric...
381 ll_tmp->ss[3] -= 15;
382 memcpy(&(ll_tmp->ss[4]), bufptr3, uii - 4);
383 }
384 make_set_ll = ll_tmp;
385 set_ct++;
386 }
387 if (!set_ct) {
388 if (fail_on_no_sets) {
389 if (marker_pos) {
390 if (!allow_no_variants) {
391 // okay, this is a kludge
392 logerrprint("Error: All variants excluded by --gene{-all}, since no sets were defined from\n--make-set file.\n");
393 retval = RET_ALL_MARKERS_EXCLUDED;
394 goto load_range_list_ret_1;
395 }
396 } else {
397 if (subset_ct) {
398 logerrprint("Error: No --gene-subset genes present in --gene-report file.\n");
399 } else {
400 logerrprint("Error: Empty --gene-report file.\n");
401 }
402 goto load_range_list_ret_INVALID_FORMAT;
403 }
404 }
405 LOGERRPRINTF("Warning: No valid ranges in %s file.\n", file_descrip);
406 goto load_range_list_ret_1;
407 }
408 max_set_id_len += c_prefix;
409 if (max_set_id_len > MAX_ID_BLEN) {
410 logerrprint("Error: Set IDs are limited to " MAX_ID_SLEN_STR " characters.\n");
411 goto load_range_list_ret_INVALID_FORMAT;
412 }
413 if (bigstack_alloc_c(set_ct * max_set_id_len, set_names_ptr)) {
414 goto load_range_list_ret_NOMEM;
415 }
416 set_names = *set_names_ptr;
417 if (!c_prefix) {
418 for (ulii = 0; ulii < set_ct; ulii++) {
419 strcpy(&(set_names[ulii * max_set_id_len]), make_set_ll->ss);
420 make_set_ll = make_set_ll->next;
421 }
422 } else {
423 for (ulii = 0; ulii < set_ct; ulii++) {
424 memcpy(&(set_names[ulii * max_set_id_len]), "C_", 2);
425 strcpy(&(set_names[ulii * max_set_id_len + 2]), make_set_ll->ss);
426 make_set_ll = make_set_ll->next;
427 }
428 }
429 qsort(set_names, set_ct, max_set_id_len, strcmp_natural);
430 set_ct = collapse_duplicate_ids(set_names, set_ct, max_set_id_len, nullptr);
431 bigstack_shrink_top(set_names, set_ct * max_set_id_len);
432 rewind(infile);
433 } else {
434 set_ct = 1;
435 }
436 make_set_range_arr = (Make_set_range**)bigstack_end_alloc(set_ct * sizeof(intptr_t));
437 if (!make_set_range_arr) {
438 goto load_range_list_ret_NOMEM;
439 }
440 for (set_idx = 0; set_idx < set_ct; set_idx++) {
441 make_set_range_arr[set_idx] = nullptr;
442 }
443 line_idx = 0;
444 while (fgets(g_textbuf, MAXLINELEN, infile)) {
445 line_idx++;
446 if (!g_textbuf[MAXLINELEN - 1]) {
447 sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file is pathologically long.\n", line_idx, file_descrip);
448 goto load_range_list_ret_INVALID_FORMAT_2;
449 }
450 char* textbuf_first_token = skip_initial_spaces(g_textbuf);
451 if (is_eoln_kns(*textbuf_first_token)) {
452 continue;
453 }
454 char* first_token_end = token_endnn(textbuf_first_token);
455 bufptr2 = next_token_mult(first_token_end, 3);
456 if (!collapse_group) {
457 bufptr3 = bufptr2;
458 } else {
459 bufptr3 = next_token(bufptr2);
460 }
461 if (no_more_tokens_kns(bufptr3)) {
462 sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file has fewer tokens than expected.\n", line_idx, file_descrip);
463 goto load_range_list_ret_INVALID_FORMAT_2;
464 }
465 const uint32_t chrom_name_slen = (uintptr_t)(first_token_end - textbuf_first_token);
466 *first_token_end = '\0';
467 int32_t cur_chrom_code = get_chrom_code(textbuf_first_token, chrom_info_ptr, chrom_name_slen);
468 if (cur_chrom_code < 0) {
469 if ((!set_ct_ptr) && (cur_chrom_code == -1)) {
470 continue;
471 }
472 sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
473 goto load_range_list_ret_INVALID_FORMAT_2;
474 }
475 if (!is_set(chrom_info_ptr->chrom_mask, cur_chrom_code)) {
476 continue;
477 }
478 chrom_idx = cur_chrom_code;
479 if (marker_pos) {
480 chrom_start = get_chrom_start_vidx(chrom_info_ptr, chrom_idx);
481 chrom_end = get_chrom_end_vidx(chrom_info_ptr, chrom_idx);
482 if (chrom_end == chrom_start) {
483 continue;
484 }
485 // might need to move this outside the if-statement later
486 if (subset_ct && (bsearch_str(bufptr2, strlen_se(bufptr2), sorted_subset_ids, max_subset_id_len, subset_ct) == -1)) {
487 continue;
488 }
489 }
490 bufptr = skip_initial_spaces(&(first_token_end[1]));
491 if (scan_uint_defcap(bufptr, &range_first)) {
492 sprintf(g_logbuf, "Error: Invalid range start position on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
493 goto load_range_list_ret_INVALID_FORMAT_2;
494 }
495 bufptr = next_token(bufptr);
496 if (scan_uint_defcap(bufptr, &range_last)) {
497 sprintf(g_logbuf, "Error: Invalid range end position on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
498 goto load_range_list_ret_INVALID_FORMAT_2;
499 }
500 if (range_last < range_first) {
501 sprintf(g_logbuf, "Error: Range end position smaller than range start on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
502 wordwrapb(0);
503 goto load_range_list_ret_INVALID_FORMAT_2;
504 }
505 if (border_extend > range_first) {
506 range_first = 0;
507 } else {
508 range_first -= border_extend;
509 }
510 range_last += border_extend;
511 if (set_ct > 1) {
512 // bugfix: bsearch_str_natural requires null-terminated string
513 uii = strlen_se(bufptr3);
514 bufptr3[uii] = '\0';
515 if (c_prefix) {
516 bufptr3 = &(bufptr3[-2]);
517 memcpy(bufptr3, "C_", 2);
518 } else if (!marker_pos) {
519 bufptr3 = &(bufptr3[-4]);
520 uitoa_z4(chrom_idx, bufptr3);
521 bufptr3[3] -= 15;
522 }
523 // this should never fail
524 set_idx = (uint32_t)bsearch_str_natural(bufptr3, set_names, max_set_id_len, set_ct);
525 } else {
526 set_idx = 0;
527 }
528 if (marker_pos) {
529 // translate to within-chromosome uidx
530 range_first = uint32arr_greater_than(&(marker_pos[chrom_start]), chrom_end - chrom_start, range_first);
531 range_last = uint32arr_greater_than(&(marker_pos[chrom_start]), chrom_end - chrom_start, range_last + 1);
532 if (range_last > range_first) {
533 msr_tmp = (Make_set_range*)bigstack_end_alloc(sizeof(Make_set_range));
534 msr_tmp->next = make_set_range_arr[set_idx];
535 // normally, I'd keep chrom_idx here since that enables by-chromosome
536 // sorting, but that's probably not worth bloating Make_set_range from
537 // 16 to 32 bytes
538 msr_tmp->uidx_start = chrom_start + range_first;
539 msr_tmp->uidx_end = chrom_start + range_last;
540 make_set_range_arr[set_idx] = msr_tmp;
541 }
542 } else {
543 msr_tmp = (Make_set_range*)bigstack_end_alloc(sizeof(Make_set_range));
544 msr_tmp->next = make_set_range_arr[set_idx];
545 msr_tmp->uidx_start = range_first;
546 msr_tmp->uidx_end = range_last + 1;
547 make_set_range_arr[set_idx] = msr_tmp;
548 }
549 }
550 // allocate buffer for sorting ranges later
551 uii = 0;
552 for (set_idx = 0; set_idx < set_ct; set_idx++) {
553 ujj = 0;
554 msr_tmp = make_set_range_arr[set_idx];
555 while (msr_tmp) {
556 ujj++;
557 msr_tmp = msr_tmp->next;
558 }
559 if (ujj > uii) {
560 uii = ujj;
561 }
562 }
563 if (range_sort_buf_ptr) {
564 bigstack_end_alloc_ull(uii, range_sort_buf_ptr);
565 }
566 if (set_ct_ptr) {
567 *set_ct_ptr = set_ct;
568 }
569 if (max_set_id_len_ptr) {
570 *max_set_id_len_ptr = max_set_id_len;
571 }
572 *make_set_range_arr_ptr = make_set_range_arr;
573 }
574 while (0) {
575 load_range_list_ret_NOMEM:
576 retval = RET_NOMEM;
577 break;
578 load_range_list_ret_INVALID_FORMAT_2:
579 logerrprintb();
580 load_range_list_ret_INVALID_FORMAT:
581 retval = RET_INVALID_FORMAT;
582 break;
583 }
584 load_range_list_ret_1:
585 return retval;
586 }
587
extract_exclude_range(char * fname,uint32_t * marker_pos,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude,uintptr_t * marker_exclude_ct_ptr,uint32_t is_exclude,uint32_t allow_no_variants,Chrom_info * chrom_info_ptr)588 int32_t extract_exclude_range(char* fname, uint32_t* marker_pos, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uint32_t is_exclude, uint32_t allow_no_variants, Chrom_info* chrom_info_ptr) {
589 if (unfiltered_marker_ct == *marker_exclude_ct_ptr) {
590 return 0;
591 }
592 unsigned char* bigstack_mark = g_bigstack_base;
593 unsigned char* bigstack_end_mark = g_bigstack_end;
594 uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
595 FILE* infile = nullptr;
596 uintptr_t orig_marker_exclude_ct = *marker_exclude_ct_ptr;
597 Make_set_range** range_arr = nullptr;
598 int32_t retval = 0;
599 Make_set_range* msr_tmp;
600 uintptr_t* marker_exclude_new;
601 if (fopen_checked(fname, "r", &infile)) {
602 goto extract_exclude_range_ret_OPEN_FAIL;
603 }
604 retval = load_range_list(infile, 0, 0, 0, 0, 0, allow_no_variants, 0, nullptr, 0, marker_pos, chrom_info_ptr, nullptr, nullptr, nullptr, &range_arr, nullptr, is_exclude? "--exclude range" : "--extract range");
605 if (retval) {
606 goto extract_exclude_range_ret_1;
607 }
608 if (fclose_null(&infile)) {
609 goto extract_exclude_range_ret_READ_FAIL;
610 }
611 msr_tmp = range_arr[0];
612 if (is_exclude) {
613 while (msr_tmp) {
614 fill_bits(msr_tmp->uidx_start, msr_tmp->uidx_end - msr_tmp->uidx_start, marker_exclude);
615 msr_tmp = msr_tmp->next;
616 }
617 } else {
618 bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude_new);
619 if (!marker_exclude_new) {
620 goto extract_exclude_range_ret_NOMEM;
621 }
622 fill_all_bits(unfiltered_marker_ct, marker_exclude_new);
623 while (msr_tmp) {
624 clear_bits(msr_tmp->uidx_start, msr_tmp->uidx_end - msr_tmp->uidx_start, marker_exclude_new);
625 msr_tmp = msr_tmp->next;
626 }
627 bitvec_or(marker_exclude_new, unfiltered_marker_ctl, marker_exclude);
628 }
629 *marker_exclude_ct_ptr = popcount_longs(marker_exclude, unfiltered_marker_ctl);
630 if ((*marker_exclude_ct_ptr == unfiltered_marker_ct) && (!allow_no_variants)) {
631 LOGERRPRINTF("Error: All variants excluded by '--%s range'.\n", is_exclude? "exclude" : "extract");
632 retval = RET_ALL_MARKERS_EXCLUDED;
633 } else if (*marker_exclude_ct_ptr == orig_marker_exclude_ct) {
634 LOGERRPRINTF("Warning: No variants excluded by '--%s range'.\n", is_exclude? "exclude" : "extract");
635 } else {
636 orig_marker_exclude_ct = *marker_exclude_ct_ptr - orig_marker_exclude_ct;
637 LOGPRINTF("--%s range: %" PRIuPTR " variant%s excluded.\n", is_exclude? "exclude" : "extract", orig_marker_exclude_ct, (orig_marker_exclude_ct == 1)? "" : "s");
638 }
639 while (0) {
640 extract_exclude_range_ret_NOMEM:
641 retval = RET_NOMEM;
642 break;
643 extract_exclude_range_ret_OPEN_FAIL:
644 retval = RET_OPEN_FAIL;
645 break;
646 extract_exclude_range_ret_READ_FAIL:
647 retval = RET_READ_FAIL;
648 break;
649 }
650 extract_exclude_range_ret_1:
651 fclose_cond(infile);
652 bigstack_double_reset(bigstack_mark, bigstack_end_mark);
653 return retval;
654 }
655
save_set_bitfield(uintptr_t * marker_bitfield_tmp,uint32_t marker_ct,uint32_t range_start,uint32_t range_end,uint32_t complement_sets,uint32_t ** set_range_pp)656 uint32_t save_set_bitfield(uintptr_t* marker_bitfield_tmp, uint32_t marker_ct, uint32_t range_start, uint32_t range_end, uint32_t complement_sets, uint32_t** set_range_pp) {
657 uintptr_t mem_req = ((marker_ct + 255) / 128) * 16;
658 uint32_t bound_bottom_d128 = range_start / 128;
659 uint32_t bound_top_d128 = (range_end - 1) / 128;
660 uint32_t set_bits_outer = complement_sets;
661 uint32_t do_flip = 0;
662 uint32_t range_idx = 0;
663 uint32_t* uiptr;
664 uint32_t range_ct_ceil;
665 uint32_t bit_idx;
666 uint32_t uii;
667 uint32_t ujj;
668 uint32_t ukk;
669 if (bigstack_left() < mem_req) {
670 return 1;
671 }
672 *set_range_pp = (uint32_t*)g_bigstack_base;
673 if (range_start == marker_ct) {
674 // empty or full set
675 save_set_bitfield_degen:
676 g_bigstack_base = &(g_bigstack_base[16]);
677 if (complement_sets) {
678 (*set_range_pp)[0] = 1;
679 (*set_range_pp)[1] = 0;
680 (*set_range_pp)[2] = marker_ct;
681 } else {
682 (*set_range_pp)[0] = 0;
683 }
684 return 0;
685 }
686 // profitable to invert?
687 ukk = bound_top_d128 - bound_bottom_d128;
688 if (!range_start) {
689 uii = next_unset(marker_bitfield_tmp, 0, range_end);
690 if (range_end == marker_ct) {
691 if (uii != marker_ct) {
692 ujj = prev_unset_unsafe(marker_bitfield_tmp, range_end - 1);
693 if ((ujj / 128) - (uii / 128) < ukk) {
694 do_flip = 1;
695 range_start = uii;
696 range_end = ujj + 1;
697 }
698 } else {
699 complement_sets = 1 - complement_sets;
700 goto save_set_bitfield_degen;
701 }
702 } else if (uii != range_end) {
703 // bugfix (2 Dec 2018): in the uii == range_end case, flipping would
704 // cause no bits to be set in the new [range_start, range_end), and this
705 // could cause a segfault. Simplest fix is to never flip in that case
706 // since we know the set can be represented as a single range.
707 if (((marker_ct - 1) / 128) - (uii / 128) < ukk) {
708 do_flip = 1;
709 range_start = uii;
710 range_end = marker_ct;
711 }
712 }
713 } else {
714 if (range_end == marker_ct) {
715 uii = prev_unset_unsafe(marker_bitfield_tmp, range_end - 1);
716 if ((uii / 128) < ukk) {
717 do_flip = 1;
718 range_start = 0;
719 range_end = uii + 1;
720 }
721 }
722 }
723 if (do_flip) {
724 set_bits_outer = 1 - set_bits_outer;
725 bound_bottom_d128 = range_start / 128;
726 bound_top_d128 = (range_end - 1) / 128;
727 }
728 bound_top_d128++;
729 // equal or greater than this -> use bitfield
730 range_ct_ceil = 2 * (1 + bound_top_d128 - bound_bottom_d128);
731 mem_req = range_ct_ceil * 8;
732 // try to compress as sequence of ranges
733 uiptr = &((*set_range_pp)[1]);
734 bit_idx = bound_bottom_d128 * 128;
735 if (set_bits_outer && bit_idx) {
736 if (!complement_sets) {
737 bit_idx = next_unset_unsafe(marker_bitfield_tmp, bit_idx);
738 } else {
739 bit_idx = next_set_unsafe(marker_bitfield_tmp, bit_idx);
740 }
741 *uiptr++ = 0;
742 *uiptr++ = bit_idx;
743 range_idx++;
744 }
745 ujj = bound_top_d128 * 128;
746 if (!complement_sets) {
747 do {
748 if (++range_idx == range_ct_ceil) {
749 goto save_set_bitfield_standard;
750 }
751 uii = next_set_unsafe(marker_bitfield_tmp, bit_idx);
752 bit_idx = uii + 1;
753 next_unset_unsafe_ck(marker_bitfield_tmp, &bit_idx);
754 *uiptr++ = uii;
755 *uiptr++ = bit_idx;
756 } while (bit_idx < range_end);
757 } else {
758 set_bit(ujj, marker_bitfield_tmp);
759 clear_bit(ujj + 1, marker_bitfield_tmp);
760 ukk = ujj;
761 if (ukk > marker_ct) {
762 ukk = marker_ct;
763 }
764 while (1) {
765 uii = bit_idx + 1;
766 next_unset_unsafe_ck(marker_bitfield_tmp, &uii);
767 if (uii >= ukk) {
768 break;
769 }
770 if (++range_idx == range_ct_ceil) {
771 goto save_set_bitfield_standard;
772 }
773 bit_idx = next_set_unsafe(marker_bitfield_tmp, uii);
774 *uiptr++ = uii;
775 *uiptr++ = bit_idx;
776 }
777 if (uiptr[-1] > marker_ct) {
778 uiptr[-1] = marker_ct;
779 }
780 }
781 if (set_bits_outer && (ujj < marker_ct)) {
782 if (uiptr[-1] == ujj) {
783 uiptr[-1] = marker_ct;
784 } else {
785 if (++range_idx == range_ct_ceil) {
786 goto save_set_bitfield_standard;
787 }
788 *uiptr++ = ujj;
789 *uiptr++ = marker_ct;
790 }
791 }
792 (*set_range_pp)[0] = range_idx;
793 mem_req = (1 + (range_idx / 2)) * 16;
794 while (0) {
795 save_set_bitfield_standard:
796 bound_bottom_d128 *= 128;
797 bound_top_d128 *= 128;
798 // bugfix
799 if (bound_top_d128 > marker_ct) {
800 bound_top_d128 = marker_ct;
801 }
802 (*set_range_pp)[0] = 0xffffffffU;
803 (*set_range_pp)[1] = bound_bottom_d128;
804 (*set_range_pp)[2] = bound_top_d128 - bound_bottom_d128;
805 (*set_range_pp)[3] = set_bits_outer;
806 memcpy(&((*set_range_pp)[4]), &(marker_bitfield_tmp[bound_bottom_d128 / BITCT]), mem_req - 16);
807 if (complement_sets) {
808 bitarr_invert(bound_top_d128 - bound_bottom_d128, (uintptr_t*)(&((*set_range_pp)[4])));
809 }
810 }
811 g_bigstack_base = &(g_bigstack_base[mem_req]);
812 return 0;
813 }
814
save_set_range(uint64_t * range_sort_buf,uint32_t marker_ct,uint32_t rsb_last_idx,uint32_t complement_sets,uint32_t ** set_range_pp)815 uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t rsb_last_idx, uint32_t complement_sets, uint32_t** set_range_pp) {
816 uint32_t* uiptr = (uint32_t*)g_bigstack_base;
817 uint32_t range_start = (uint32_t)(range_sort_buf[0] >> 32);
818 uint32_t range_end = (uint32_t)(range_sort_buf[rsb_last_idx]);
819 uint32_t bound_bottom_d128 = range_start / 128;
820 uint32_t bound_top_d128 = (range_end - 1) / 128;
821 uint32_t range_ct = bound_top_d128 - bound_bottom_d128;
822 uint32_t set_bits_outer = complement_sets;
823 uint32_t do_flip = 0; // flip set_bits_outer since that's more compact?
824 uint32_t rsb_idx = 0;
825 uintptr_t* bitfield_ptr = (uintptr_t*)(&(uiptr[4]));
826 uint64_t ullii;
827 uintptr_t mem_req;
828 uintptr_t ulii;
829 uint32_t uii;
830 uint32_t ujj;
831 if (bigstack_left() < (rsb_last_idx / 2) * 16 + 32) {
832 return 1;
833 }
834 *set_range_pp = uiptr;
835 if (!range_start) {
836 uii = (uint32_t)range_sort_buf[0];
837 if (range_end == marker_ct) {
838 if (uii != marker_ct) {
839 do_flip = 1;
840 range_start = uii;
841 range_end = (uint32_t)(range_sort_buf[rsb_last_idx] >> 32);
842 } else {
843 g_bigstack_base = &(g_bigstack_base[16]);
844 if (!complement_sets) {
845 uiptr[0] = 1;
846 uiptr[1] = 0;
847 uiptr[2] = marker_ct;
848 } else {
849 uiptr[0] = 0;
850 }
851 return 0;
852 }
853 } else {
854 if (((marker_ct - 1) / 128) - (uii / 128) < range_ct) {
855 do_flip = 1;
856 range_start = uii;
857 range_end = marker_ct;
858 }
859 }
860 } else {
861 if (range_end == marker_ct) {
862 if ((((uint32_t)(range_sort_buf[rsb_last_idx] - 1)) / 128) < range_ct) {
863 do_flip = 1;
864 range_start = 0;
865 range_end = range_sort_buf[rsb_last_idx];
866 }
867 }
868 }
869 if (do_flip) {
870 set_bits_outer = 1 - set_bits_outer;
871 bound_bottom_d128 = range_start / 128;
872 bound_top_d128 = (range_end - 1) / 128;
873 }
874 bound_top_d128++;
875 range_end = bound_top_d128 * 128;
876 if (range_end > marker_ct) {
877 range_end = marker_ct;
878 }
879 mem_req = 16 * (1 + bound_top_d128 - bound_bottom_d128);
880 if (!complement_sets) {
881 ulii = ((rsb_last_idx + 1) / 2) + 1;
882 ulii *= 16;
883 if (ulii > mem_req) {
884 fill_ulong_zero((bound_top_d128 - bound_bottom_d128) * (128 / BITCT), bitfield_ptr);
885 range_start = bound_bottom_d128 * 128;
886 if (do_flip) {
887 rsb_last_idx--;
888 if (range_start) {
889 // first range must begin at bit 0
890 uii = range_start;
891 ujj = (uint32_t)(range_sort_buf[0]);
892 goto save_set_range_late_start_1;
893 }
894 }
895 for (; rsb_idx <= rsb_last_idx; rsb_idx++) {
896 ullii = range_sort_buf[rsb_idx];
897 uii = (uint32_t)(ullii >> 32);
898 ujj = (uint32_t)ullii;
899 save_set_range_late_start_1:
900 fill_bits(uii - range_start, ujj - uii, bitfield_ptr);
901 }
902 if (do_flip) {
903 // last range may go past bitfield end
904 ullii = range_sort_buf[rsb_idx];
905 uii = (uint32_t)(ullii >> 32);
906 ujj = (uint32_t)ullii;
907 if (ujj > range_end) {
908 ujj = range_end;
909 }
910 fill_bits(uii - range_start, ujj - uii, bitfield_ptr);
911 }
912 goto save_set_range_bitfield_finish_encode;
913 }
914 g_bigstack_base = &(g_bigstack_base[ulii]);
915 *uiptr++ = rsb_last_idx + 1;
916 for (; rsb_idx <= rsb_last_idx; rsb_idx++) {
917 ullii = range_sort_buf[rsb_idx];
918 *uiptr++ = (uint32_t)(ullii >> 32);
919 *uiptr++ = (uint32_t)ullii;
920 }
921 } else {
922 range_ct = rsb_last_idx + 2;
923 if (((uint32_t)range_sort_buf[rsb_last_idx]) == marker_ct) {
924 range_ct--;
925 }
926 ullii = range_sort_buf[0];
927 range_start = (uint32_t)(ullii >> 32);
928 if (range_start) {
929 ulii = (range_ct / 2) + 1;
930 } else {
931 ulii = ((range_ct - 1) / 2) + 1;
932 }
933 ulii *= 16;
934 if (ulii > mem_req) {
935 range_start = bound_bottom_d128 * 128;
936 fill_all_bits(range_end - range_start, bitfield_ptr);
937 if (do_flip) {
938 rsb_last_idx--;
939 if (range_start) {
940 // first raw range must begin at bit 0, so complemented range must
941 // begin later
942 uii = range_start;
943 ujj = (uint32_t)(range_sort_buf[0]);
944 goto save_set_range_late_start_2;
945 }
946 }
947 for (; rsb_idx <= rsb_last_idx; rsb_idx++) {
948 ullii = range_sort_buf[rsb_idx];
949 uii = (uint32_t)(ullii >> 32);
950 ujj = (uint32_t)ullii;
951 save_set_range_late_start_2:
952 clear_bits(uii - range_start, ujj - uii, bitfield_ptr);
953 }
954 if (do_flip) {
955 ullii = range_sort_buf[rsb_idx];
956 uii = (uint32_t)(ullii >> 32);
957 ujj = (uint32_t)ullii;
958 if (ujj > range_end) {
959 ujj = range_end;
960 }
961 clear_bits(uii - range_start, ujj - uii, bitfield_ptr);
962 }
963 save_set_range_bitfield_finish_encode:
964 g_bigstack_base = &(g_bigstack_base[mem_req]);
965 uiptr[0] = 0xffffffffU;
966 uiptr[1] = range_start;
967 uiptr[2] = range_end - range_start;
968 uiptr[3] = set_bits_outer;
969 } else {
970 g_bigstack_base = &(g_bigstack_base[ulii]);
971 if (range_start) {
972 *uiptr++ = range_ct;
973 *uiptr++ = 0;
974 *uiptr++ = range_start;
975 } else {
976 *uiptr++ = range_ct - 1;
977 }
978 for (rsb_idx = 1; rsb_idx <= rsb_last_idx; rsb_idx++) {
979 *uiptr++ = (uint32_t)ullii;
980 ullii = range_sort_buf[rsb_idx];
981 *uiptr++ = (uint32_t)(ullii >> 32);
982 }
983 if (range_ct == rsb_last_idx + 2) {
984 *uiptr++ = (uint32_t)ullii;
985 *uiptr++ = marker_ct;
986 }
987 }
988 }
989 return 0;
990 }
991
define_sets(Set_info * sip,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude,uint32_t * marker_pos,uintptr_t * marker_exclude_ct_ptr,char * marker_ids,uintptr_t max_marker_id_len,Chrom_info * chrom_info_ptr,uint32_t allow_no_variants)992 int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t* marker_pos, uintptr_t* marker_exclude_ct_ptr, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, uint32_t allow_no_variants) {
993 unsigned char* bigstack_end_mark = g_bigstack_end;
994 FILE* infile = nullptr;
995 char* sorted_marker_ids = nullptr;
996 char* sorted_genekeep_ids = nullptr;
997 uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
998 uintptr_t marker_exclude_ct = *marker_exclude_ct_ptr;
999 uintptr_t marker_ct = unfiltered_marker_ct - marker_exclude_ct;
1000 uintptr_t set_ct = 0;
1001 uint32_t make_set = sip->modifier & SET_MAKE_FROM_RANGES;
1002 uint32_t complement_sets = (sip->modifier / SET_COMPLEMENTS) & 1;
1003 uint32_t c_prefix = 2 * ((sip->modifier / SET_C_PREFIX) & 1);
1004 uint32_t gene_all = sip->modifier & SET_GENE_ALL;
1005 uint32_t curtoklen = 0;
1006 uint32_t in_set = 0;
1007 int32_t retval = 0;
1008 uintptr_t subset_ct = 0;
1009 uintptr_t max_subset_id_len = 0;
1010 uintptr_t genekeep_ct = 0;
1011 uintptr_t max_genekeep_len = 0;
1012 uintptr_t max_set_id_len = 0;
1013 Make_set_range** make_set_range_arr = nullptr;
1014 char* midbuf = &(g_textbuf[MAXLINELEN]);
1015 char* sorted_subset_ids = nullptr;
1016 char* set_names = nullptr;
1017 char* bufptr = nullptr;
1018 uint64_t* range_sort_buf = nullptr;
1019 char* bufptr2;
1020 char* bufptr3;
1021 char* buf_end;
1022 Make_set_range* msr_tmp;
1023 unsigned char* bigstack_end_mark2;
1024 uint32_t* marker_id_map;
1025 uint32_t* marker_uidx_to_idx;
1026 uint32_t** all_setdefs;
1027 uintptr_t* marker_exclude_new;
1028 uintptr_t* marker_bitfield_tmp;
1029 uintptr_t set_idx;
1030 uintptr_t bufsize;
1031 uintptr_t marker_ctp2l;
1032 uintptr_t ulii;
1033 uint64_t ullii;
1034 uint32_t range_first;
1035 uint32_t range_last;
1036 uint32_t slen;
1037 uint32_t uii;
1038 uint32_t ujj;
1039 uint32_t ukk;
1040 uint32_t umm;
1041 int32_t ii;
1042 // 1. validate and sort --gene parameter(s)
1043 if (sip->genekeep_flattened) {
1044 bufptr = sip->genekeep_flattened;
1045 if (sip->merged_set_name) {
1046 // degenerate case: --gene-all if merged set name present, fail
1047 // otherwise
1048 uii = strlen(sip->merged_set_name);
1049 while (1) {
1050 slen = strlen(bufptr);
1051 if ((slen == uii) && (!memcmp(bufptr, sip->merged_set_name, uii))) {
1052 break;
1053 }
1054 bufptr = &(bufptr[slen + 1]);
1055 if (!(*bufptr)) {
1056 if (!allow_no_variants) {
1057 goto define_sets_ret_ALL_MARKERS_EXCLUDED;
1058 } else {
1059 goto define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED;
1060 }
1061 }
1062 }
1063 free(sip->genekeep_flattened);
1064 sip->genekeep_flattened = nullptr;
1065 gene_all = 1;
1066 } else {
1067 do {
1068 slen = strlen(bufptr) + 1;
1069 if ((!c_prefix) || (!memcmp(bufptr, "C_", 2))) {
1070 if (slen > max_genekeep_len) {
1071 max_genekeep_len = slen;
1072 }
1073 genekeep_ct++;
1074 }
1075 bufptr = &(bufptr[slen]);
1076 } while (*bufptr);
1077 if (!genekeep_ct) {
1078 if (!allow_no_variants) {
1079 logerrprint("Error: All variants excluded by --gene.\n");
1080 goto define_sets_ret_ALL_MARKERS_EXCLUDED_2;
1081 } else {
1082 goto define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED;
1083 }
1084 }
1085 if (bigstack_end_alloc_c(genekeep_ct * max_genekeep_len, &sorted_genekeep_ids)) {
1086 goto define_sets_ret_NOMEM;
1087 }
1088 bufptr = sip->genekeep_flattened;
1089 ulii = 0;
1090 do {
1091 slen = strlen(bufptr) + 1;
1092 if ((!c_prefix) || (!memcmp(bufptr, "C_", 2))) {
1093 memcpy(&(sorted_genekeep_ids[ulii * max_genekeep_len]), bufptr, slen);
1094 ulii++;
1095 }
1096 bufptr = &(bufptr[slen]);
1097 } while (*bufptr);
1098 qsort(sorted_genekeep_ids, genekeep_ct, max_genekeep_len, strcmp_casted);
1099 }
1100 }
1101 // 2. if --set-names and/or --subset is present, (load and) sort those lists
1102 if (sip->setnames_flattened || sip->subset_fname) {
1103 if (sip->subset_fname) {
1104 if (fopen_checked(sip->subset_fname, FOPEN_RB, &infile)) {
1105 goto define_sets_ret_OPEN_FAIL;
1106 }
1107 retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &subset_ct, &max_subset_id_len);
1108 if (retval) {
1109 if (retval == RET_INVALID_FORMAT) {
1110 logerrprint("Error: Pathologically long token in --subset file.\n");
1111 }
1112 goto define_sets_ret_1;
1113 }
1114 }
1115 ulii = subset_ct;
1116 if (sip->setnames_flattened) {
1117 subset_ct += count_and_measure_multistr(sip->setnames_flattened, &max_subset_id_len);
1118 }
1119 if (!subset_ct) {
1120 if ((gene_all || sip->genekeep_flattened) && ((!sip->merged_set_name) || (!complement_sets))) {
1121 if (!allow_no_variants) {
1122 if (sip->subset_fname) {
1123 logerrprint("Error: All variants excluded, since --subset file is empty.\n");
1124 } else {
1125 logerrprint("Error: All variants excluded, since --set-names was given no parameters.\n");
1126 }
1127 goto define_sets_ret_ALL_MARKERS_EXCLUDED_2;
1128 } else {
1129 goto define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED;
1130 }
1131 }
1132 if (sip->merged_set_name) {
1133 goto define_sets_merge_nothing;
1134 } else {
1135 if (sip->subset_fname) {
1136 logerrprint("Warning: Empty --subset file; no sets defined.\n");
1137 } else {
1138 logerrprint("Warning: No sets defined since --set-names was given no parameters.\n");
1139 }
1140 goto define_sets_ret_1;
1141 }
1142 }
1143 if (max_subset_id_len > MAX_ID_BLEN) {
1144 logerrprint("Error: Subset IDs are limited to " MAX_ID_SLEN_STR " characters.\n");
1145 goto define_sets_ret_INVALID_FORMAT;
1146 }
1147 if (bigstack_end_alloc_c(subset_ct * max_subset_id_len, &sorted_subset_ids)) {
1148 goto define_sets_ret_NOMEM;
1149 }
1150 if (sip->subset_fname) {
1151 if (ulii) {
1152 rewind(infile);
1153 retval = read_tokens(MAXLINELEN, ulii, max_subset_id_len, infile, g_textbuf, sorted_subset_ids);
1154 if (retval) {
1155 goto define_sets_ret_1;
1156 }
1157 }
1158 if (fclose_null(&infile)) {
1159 goto define_sets_ret_READ_FAIL;
1160 }
1161 }
1162 if (sip->setnames_flattened) {
1163 bufptr = sip->setnames_flattened;
1164 while (*bufptr) {
1165 slen = strlen(bufptr) + 1;
1166 memcpy(&(sorted_subset_ids[ulii * max_subset_id_len]), bufptr, slen);
1167 ulii++;
1168 bufptr = &(bufptr[slen]);
1169 }
1170 }
1171 qsort(sorted_subset_ids, subset_ct, max_subset_id_len, strcmp_casted);
1172 subset_ct = collapse_duplicate_ids(sorted_subset_ids, subset_ct, max_subset_id_len, nullptr);
1173 }
1174 if (fopen_checked(sip->fname, make_set? "r" : FOPEN_RB, &infile)) {
1175 goto define_sets_ret_OPEN_FAIL;
1176 }
1177 // 3. load --make-set range list
1178 if (make_set) {
1179 retval = load_range_list(infile, !sip->merged_set_name, sip->make_set_border, sip->modifier & SET_MAKE_COLLAPSE_GROUP, gene_all || sip->genekeep_flattened, c_prefix, allow_no_variants, subset_ct, sorted_subset_ids, max_subset_id_len, marker_pos, chrom_info_ptr, &set_ct, &set_names, &max_set_id_len, &make_set_range_arr, &range_sort_buf, "--make-set");
1180 if (retval) {
1181 goto define_sets_ret_1;
1182 }
1183 }
1184
1185 // 4. if --gene or --gene-all is present, pre-filter variants.
1186 if (gene_all || sip->genekeep_flattened) {
1187 bigstack_end_mark2 = g_bigstack_end;
1188 if (bigstack_end_calloc_ul(unfiltered_marker_ctl, &marker_bitfield_tmp) ||
1189 bigstack_end_alloc_ul(unfiltered_marker_ctl, &marker_exclude_new)) {
1190 goto define_sets_ret_NOMEM;
1191 }
1192 fill_all_bits(unfiltered_marker_ct, marker_exclude_new);
1193 // then include every variant that appears, or include every variant that
1194 // fails to appear in a fully loaded set in the complement case
1195 if (make_set) {
1196 for (set_idx = 0; set_idx < set_ct; set_idx++) {
1197 if (gene_all || (bsearch_str_nl(&(set_names[set_idx * max_set_id_len]), sorted_genekeep_ids, max_genekeep_len, genekeep_ct) != -1)) {
1198 msr_tmp = make_set_range_arr[set_idx];
1199 while (msr_tmp) {
1200 fill_bits(msr_tmp->uidx_start, msr_tmp->uidx_end - msr_tmp->uidx_start, marker_bitfield_tmp);
1201 msr_tmp = msr_tmp->next;
1202 }
1203 }
1204 if (complement_sets) {
1205 bitvec_and(marker_bitfield_tmp, unfiltered_marker_ctl, marker_exclude_new);
1206 fill_ulong_zero(unfiltered_marker_ctl, marker_bitfield_tmp);
1207 }
1208 }
1209 } else {
1210 if (bigstack_end_alloc_c(marker_ct * max_marker_id_len, &sorted_marker_ids) ||
1211 bigstack_end_alloc_ui(marker_ct, &marker_id_map)) {
1212 goto define_sets_ret_NOMEM;
1213 }
1214 retval = sort_item_ids_noalloc(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref, sorted_marker_ids, marker_id_map);
1215 if (retval) {
1216 goto define_sets_ret_NOMEM;
1217 }
1218 // similar to read_tokens(), since it may be important to support very
1219 // long lines.
1220 while (1) {
1221 if (fread_checked(midbuf, MAXLINELEN, infile, &bufsize)) {
1222 goto define_sets_ret_READ_FAIL;
1223 }
1224 if (!bufsize) {
1225 if (curtoklen) {
1226 if ((curtoklen != 3) || memcmp(bufptr, "END", 3)) {
1227 goto define_sets_ret_INVALID_FORMAT_NO_END;
1228 } else if (!in_set) {
1229 goto define_sets_ret_INVALID_FORMAT_EXTRA_END;
1230 }
1231 } else if (in_set) {
1232 goto define_sets_ret_INVALID_FORMAT_NO_END;
1233 }
1234 break;
1235 }
1236 buf_end = &(midbuf[bufsize]);
1237 *buf_end = ' ';
1238 buf_end[1] = '0';
1239 bufptr = &(g_textbuf[MAXLINELEN - curtoklen]);
1240 bufptr2 = midbuf;
1241 if (curtoklen) {
1242 goto define_sets_tok_start_1;
1243 }
1244 while (1) {
1245 while (*bufptr <= ' ') {
1246 bufptr++;
1247 }
1248 if (bufptr >= buf_end) {
1249 curtoklen = 0;
1250 break;
1251 }
1252 bufptr2 = &(bufptr[1]);
1253 define_sets_tok_start_1:
1254 while (*bufptr2 > ' ') {
1255 bufptr2++;
1256 }
1257 curtoklen = (uintptr_t)(bufptr2 - bufptr);
1258 if (bufptr2 == &(g_textbuf[MAXLINELEN * 2])) {
1259 if (curtoklen > MAXLINELEN) {
1260 logerrprint("Error: Excessively long token in --set file.\n");
1261 goto define_sets_ret_INVALID_FORMAT;
1262 }
1263 bufptr3 = &(g_textbuf[MAXLINELEN - curtoklen]);
1264 memcpy(bufptr3, bufptr, curtoklen);
1265 bufptr = bufptr3;
1266 break;
1267 }
1268 if ((curtoklen == 3) && (!memcmp(bufptr, "END", 3))) {
1269 if (!in_set) {
1270 goto define_sets_ret_INVALID_FORMAT_EXTRA_END;
1271 }
1272 if (complement_sets) {
1273 bitvec_and(marker_bitfield_tmp, unfiltered_marker_ctl, marker_exclude_new);
1274 fill_ulong_zero(unfiltered_marker_ctl, marker_bitfield_tmp);
1275 }
1276 in_set = 0;
1277 } else if (!in_set) {
1278 // bugfix: forgot to apply --gene here
1279 // if (gene_all || (bsearch_str_nl(&(set_names[set_idx * max_set_id_len]), sorted_genekeep_ids, max_genekeep_len, genekeep_ct) != -1)) {
1280
1281 if ((subset_ct && (bsearch_str(bufptr, (uintptr_t)(bufptr2 - bufptr), sorted_subset_ids, max_subset_id_len, subset_ct) == -1)) || (sorted_genekeep_ids && (bsearch_str(bufptr, (uintptr_t)(bufptr2 - bufptr), sorted_genekeep_ids, max_genekeep_len, genekeep_ct) == -1))) {
1282 in_set = 2; // ignore this set
1283 bufptr = &(bufptr2[1]);
1284 continue;
1285 }
1286 if (curtoklen >= max_set_id_len) {
1287 max_set_id_len = curtoklen + 1;
1288 }
1289 set_ct++;
1290 in_set = 1;
1291 } else if (in_set == 1) {
1292 ii = bsearch_str(bufptr, (uintptr_t)(bufptr2 - bufptr), sorted_marker_ids, max_marker_id_len, marker_ct);
1293 if (ii != -1) {
1294 set_bit(marker_id_map[(uint32_t)ii], marker_bitfield_tmp);
1295 }
1296 }
1297 bufptr = &(bufptr2[1]);
1298 }
1299 }
1300 if (!feof(infile)) {
1301 goto define_sets_ret_READ_FAIL;
1302 }
1303 if (!set_ct) {
1304 if (!complement_sets) {
1305 if (!allow_no_variants) {
1306 logerrprint("Error: All variants excluded by --gene{-all}, since no sets were defined from\n--set file.\n");
1307 goto define_sets_ret_ALL_MARKERS_EXCLUDED_2;
1308 } else {
1309 goto define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED;
1310 }
1311 } else {
1312 logerrprint("Warning: No sets defined from --set file.\n");
1313 goto define_sets_ret_1;
1314 }
1315 }
1316 }
1317 if (!complement_sets) {
1318 bitvec_andnot(marker_bitfield_tmp, unfiltered_marker_ctl, marker_exclude_new);
1319 }
1320 bitvec_or(marker_exclude_new, unfiltered_marker_ctl, marker_exclude);
1321 marker_exclude_ct = popcount_longs(marker_exclude, unfiltered_marker_ctl);
1322 *marker_exclude_ct_ptr = marker_exclude_ct;
1323 if (marker_exclude_ct == unfiltered_marker_ct) {
1324 if (!allow_no_variants) {
1325 goto define_sets_ret_ALL_MARKERS_EXCLUDED;
1326 } else {
1327 goto define_sets_ret_1;
1328 }
1329 }
1330 marker_ct = unfiltered_marker_ct - marker_exclude_ct;
1331 rewind(infile);
1332 bigstack_end_reset(bigstack_end_mark2);
1333 } else if ((!make_set) && (!sip->merged_set_name)) {
1334 // 5. otherwise, with --set and no --set-collapse-all, count number of sets
1335 // and max_name_len.
1336 while (1) {
1337 if (fread_checked(midbuf, MAXLINELEN, infile, &bufsize)) {
1338 goto define_sets_ret_READ_FAIL;
1339 }
1340 if (!bufsize) {
1341 if (curtoklen) {
1342 if ((curtoklen != 3) || (memcmp(bufptr, "END", 3))) {
1343 goto define_sets_ret_INVALID_FORMAT_NO_END;
1344 } else if (!in_set) {
1345 goto define_sets_ret_INVALID_FORMAT_EXTRA_END;
1346 }
1347 } else if (in_set) {
1348 goto define_sets_ret_INVALID_FORMAT_NO_END;
1349 }
1350 break;
1351 }
1352 buf_end = &(midbuf[bufsize]);
1353 *buf_end = ' ';
1354 buf_end[1] = '0';
1355 bufptr = &(g_textbuf[MAXLINELEN - curtoklen]);
1356 bufptr2 = midbuf;
1357 if (curtoklen) {
1358 goto define_sets_tok_start_2;
1359 }
1360 while (1) {
1361 while (*bufptr <= ' ') {
1362 bufptr++;
1363 }
1364 if (bufptr >= buf_end) {
1365 curtoklen = 0;
1366 break;
1367 }
1368 bufptr2 = &(bufptr[1]);
1369 define_sets_tok_start_2:
1370 while (*bufptr2 > ' ') {
1371 bufptr2++;
1372 }
1373 curtoklen = (uintptr_t)(bufptr2 - bufptr);
1374 if (bufptr2 == &(g_textbuf[MAXLINELEN * 2])) {
1375 bufptr3 = &(g_textbuf[MAXLINELEN - curtoklen]);
1376 memcpy(bufptr3, bufptr, curtoklen);
1377 bufptr = bufptr3;
1378 break;
1379 }
1380 if ((curtoklen == 3) && (!memcmp(bufptr, "END", 3))) {
1381 if (!in_set) {
1382 goto define_sets_ret_INVALID_FORMAT_EXTRA_END;
1383 }
1384 in_set = 0;
1385 } else if (!in_set) {
1386 in_set = 1;
1387 if (subset_ct && (bsearch_str(bufptr, (uintptr_t)(bufptr2 - bufptr), sorted_subset_ids, max_subset_id_len, subset_ct) == -1)) {
1388 // no need for in_set = 2, just don't adjust set_ct/id_len
1389 bufptr = &(bufptr2[1]);
1390 continue;
1391 }
1392 if (curtoklen >= max_set_id_len) {
1393 max_set_id_len = curtoklen + 1;
1394 }
1395 set_ct++;
1396 }
1397 bufptr = &(bufptr2[1]);
1398 }
1399 }
1400 if (!set_ct) {
1401 logerrprint("Warning: No sets defined from --set file.\n");
1402 goto define_sets_ret_1;
1403 }
1404 rewind(infile);
1405 }
1406 // 6. allocate sip->names[], setdefs[] on stack
1407 if (bigstack_end_alloc_ui(unfiltered_marker_ct, &marker_uidx_to_idx)) {
1408 goto define_sets_ret_NOMEM;
1409 }
1410 fill_uidx_to_idx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_uidx_to_idx);
1411 if (!set_names) {
1412 if (sip->merged_set_name) {
1413 set_ct = 1;
1414 max_set_id_len = strlen(sip->merged_set_name) + 1;
1415 if (max_set_id_len > MAX_ID_BLEN) {
1416 logerrprint("Error: Set IDs are limited to " MAX_ID_SLEN_STR " characters.\n");
1417 goto define_sets_ret_INVALID_FORMAT;
1418 }
1419 if (bigstack_alloc_c(max_set_id_len, &set_names)) {
1420 goto define_sets_ret_NOMEM;
1421 }
1422 memcpy(set_names, sip->merged_set_name, max_set_id_len);
1423 } else {
1424 if (max_set_id_len > MAX_ID_BLEN) {
1425 logerrprint("Error: Set IDs are limited to " MAX_ID_SLEN_STR " characters.\n");
1426 goto define_sets_ret_INVALID_FORMAT;
1427 }
1428 if (bigstack_alloc_c(set_ct * max_set_id_len, &set_names)) {
1429 goto define_sets_ret_NOMEM;
1430 }
1431 }
1432 }
1433 all_setdefs = (uint32_t**)bigstack_alloc(set_ct * sizeof(intptr_t));
1434 if (!all_setdefs) {
1435 goto define_sets_ret_NOMEM;
1436 }
1437 if (make_set) {
1438 // 7. If --make-set, allocate entries on stack
1439 for (set_idx = 0; set_idx < set_ct; set_idx++) {
1440 msr_tmp = make_set_range_arr[set_idx];
1441 // sort and merge intervals in O(n log n) instead of O(n^2) time
1442 uii = 0;
1443 while (msr_tmp) {
1444 range_first = msr_tmp->uidx_start;
1445 range_last = msr_tmp->uidx_end;
1446 msr_tmp = msr_tmp->next;
1447 if (IS_SET(marker_exclude, range_first)) {
1448 range_first = next_unset(marker_exclude, range_first, unfiltered_marker_ct);
1449 if (range_first == unfiltered_marker_ct) {
1450 continue;
1451 }
1452 }
1453 range_first = marker_uidx_to_idx[range_first];
1454 if (IS_SET(marker_exclude, range_last)) {
1455 range_last = next_unset(marker_exclude, range_last, unfiltered_marker_ct);
1456 }
1457 if (range_last == unfiltered_marker_ct) {
1458 range_last = marker_ct;
1459 } else {
1460 range_last = marker_uidx_to_idx[range_last];
1461 if (range_last == range_first) {
1462 continue;
1463 }
1464 }
1465 range_sort_buf[uii++] = (((uint64_t)range_first) << 32) | ((uint64_t)range_last);
1466 }
1467 if (!uii) {
1468 // special case: empty set
1469 if (bigstack_left() < 16) {
1470 goto define_sets_ret_NOMEM;
1471 }
1472 all_setdefs[set_idx] = (uint32_t*)g_bigstack_base;
1473 g_bigstack_base = &(g_bigstack_base[16]);
1474 if (!complement_sets) {
1475 all_setdefs[set_idx][0] = 0;
1476 } else {
1477 all_setdefs[set_idx][0] = 1;
1478 all_setdefs[set_idx][1] = 0;
1479 all_setdefs[set_idx][2] = marker_ct;
1480 }
1481 continue;
1482 }
1483 #ifdef __cplusplus
1484 std::sort((int64_t*)range_sort_buf, (int64_t*)(&(range_sort_buf[uii])));
1485 #else
1486 qsort((int64_t*)range_sort_buf, uii, sizeof(int64_t), llcmp);
1487 #endif
1488 ukk = 0; // current end of sorted interval list
1489 range_last = (uint32_t)range_sort_buf[0];
1490 for (ujj = 1; ujj < uii; ujj++) {
1491 ullii = range_sort_buf[ujj];
1492 range_first = (uint32_t)(ullii >> 32);
1493 if (range_first <= range_last) {
1494 umm = (uint32_t)ullii;
1495 if (umm > range_last) {
1496 range_last = umm;
1497 range_sort_buf[ukk] = (range_sort_buf[ukk] & 0xffffffff00000000LLU) | (ullii & 0xffffffffLLU);
1498 }
1499 } else {
1500 if (++ukk < ujj) {
1501 range_sort_buf[ukk] = ullii;
1502 }
1503 range_last = (uint32_t)ullii;
1504 }
1505 }
1506 if (save_set_range(range_sort_buf, marker_ct, ukk, complement_sets, &(all_setdefs[set_idx]))) {
1507 goto define_sets_ret_NOMEM;
1508 }
1509 }
1510 } else {
1511 // 8. If --set, load sets and allocate on stack
1512 set_idx = 0;
1513 in_set = 0;
1514 curtoklen = 0;
1515 range_first = marker_ct;
1516 range_last = 0;
1517 // guarantee two free bits at end to simplify loop termination checks (may
1518 // want to default to doing this...)
1519 marker_ctp2l = (marker_ct + (BITCT + 1)) / BITCT;
1520 if (bigstack_end_alloc_ul(marker_ctp2l, &marker_bitfield_tmp) ||
1521 bigstack_end_alloc_c(marker_ct * max_marker_id_len, &sorted_marker_ids) ||
1522 bigstack_end_alloc_ui(marker_ct, &marker_id_map)) {
1523 goto define_sets_ret_NOMEM;
1524 }
1525 retval = sort_item_ids_noalloc(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, 1, strcmp_deref, sorted_marker_ids, marker_id_map);
1526 if (retval) {
1527 goto define_sets_ret_1;
1528 }
1529 #ifdef __LP64__
1530 fill_ulong_zero(round_up_pow2(marker_ctp2l, 2), marker_bitfield_tmp);
1531 #else
1532 fill_ulong_zero(round_up_pow2(marker_ctp2l, 4), marker_bitfield_tmp);
1533 #endif
1534 while (1) {
1535 if (fread_checked(midbuf, MAXLINELEN, infile, &bufsize)) {
1536 goto define_sets_ret_READ_FAIL;
1537 }
1538 if (!bufsize) {
1539 if (curtoklen) {
1540 if ((curtoklen != 3) || (memcmp(bufptr, "END", 3))) {
1541 goto define_sets_ret_INVALID_FORMAT_NO_END;
1542 } else if (!in_set) {
1543 goto define_sets_ret_INVALID_FORMAT_EXTRA_END;
1544 }
1545 } else if (in_set) {
1546 goto define_sets_ret_INVALID_FORMAT_NO_END;
1547 }
1548 break;
1549 }
1550 buf_end = &(midbuf[bufsize]);
1551 *buf_end = ' ';
1552 buf_end[1] = '0';
1553 bufptr = &(g_textbuf[MAXLINELEN - curtoklen]);
1554 bufptr2 = midbuf;
1555 if (curtoklen) {
1556 goto define_sets_tok_start_3;
1557 }
1558 while (1) {
1559 while (*bufptr <= ' ') {
1560 bufptr++;
1561 }
1562 if (bufptr >= buf_end) {
1563 curtoklen = 0;
1564 break;
1565 }
1566 bufptr2 = &(bufptr[1]);
1567 define_sets_tok_start_3:
1568 while (*bufptr2 > ' ') {
1569 bufptr2++;
1570 }
1571 curtoklen = (uintptr_t)(bufptr2 - bufptr);
1572 if ((bufptr2 == buf_end) && (buf_end == &(g_textbuf[MAXLINELEN * 2]))) {
1573 bufptr3 = &(g_textbuf[MAXLINELEN - curtoklen]);
1574 memcpy(bufptr3, bufptr, curtoklen);
1575 bufptr = bufptr3;
1576 break;
1577 }
1578 if ((curtoklen == 3) && (!memcmp(bufptr, "END", 3))) {
1579 if (!in_set) {
1580 goto define_sets_ret_INVALID_FORMAT_EXTRA_END;
1581 }
1582 if ((!sip->merged_set_name) && (in_set == 1)) {
1583 if (save_set_bitfield(marker_bitfield_tmp, marker_ct, range_first, range_last + 1, complement_sets, &(all_setdefs[set_idx]))) {
1584 goto define_sets_ret_NOMEM;
1585 }
1586 set_idx++;
1587 fill_ulong_zero(marker_ctp2l, marker_bitfield_tmp);
1588 range_first = marker_ct;
1589 range_last = 0;
1590 }
1591 in_set = 0;
1592 } else if (!in_set) {
1593 in_set = 1;
1594 if (subset_ct && (bsearch_str(bufptr, (uintptr_t)(bufptr2 - bufptr), sorted_subset_ids, max_subset_id_len, subset_ct) == -1)) {
1595 in_set = 2;
1596 bufptr = &(bufptr2[1]);
1597 continue;
1598 }
1599 if (!sip->merged_set_name) {
1600 memcpyx(&(set_names[set_idx * max_set_id_len]), bufptr, bufptr2 - bufptr, '\0');
1601 }
1602 } else if (in_set == 1) {
1603 ii = bsearch_str(bufptr, (uintptr_t)(bufptr2 - bufptr), sorted_marker_ids, max_marker_id_len, marker_ct);
1604 if (ii != -1) {
1605 uii = marker_id_map[(uint32_t)ii];
1606 if (uii < range_first) {
1607 range_first = uii;
1608 }
1609 if (uii > range_last) {
1610 range_last = uii;
1611 }
1612 set_bit(uii, marker_bitfield_tmp);
1613 }
1614 }
1615 bufptr = &(bufptr2[1]);
1616 }
1617 }
1618 if (sip->merged_set_name) {
1619 if (save_set_bitfield(marker_bitfield_tmp, marker_ct, range_first, range_last + 1, complement_sets, all_setdefs)) {
1620 goto define_sets_ret_NOMEM;
1621 }
1622 }
1623 }
1624 if (fclose_null(&infile)) {
1625 goto define_sets_ret_READ_FAIL;
1626 }
1627 sip->ct = set_ct;
1628 sip->names = set_names;
1629 sip->max_name_len = max_set_id_len;
1630 sip->setdefs = all_setdefs;
1631 LOGPRINTF("--%sset: %" PRIuPTR " set%s defined.\n", make_set? "make-" : "", set_ct, (set_ct == 1)? "" : "s");
1632 while (0) {
1633 define_sets_merge_nothing:
1634 sip->ct = 1;
1635 uii = strlen(sip->merged_set_name) + 1;
1636 bigstack_end_reset(bigstack_end_mark);
1637 sip->setdefs = (uint32_t**)bigstack_alloc(sizeof(intptr_t));
1638 if (!sip->setdefs) {
1639 goto define_sets_ret_NOMEM;
1640 }
1641 if (bigstack_alloc_c(uii, &sip->names) ||
1642 bigstack_alloc_ui(1 + 2 * complement_sets, &(sip->setdefs[0]))) {
1643 goto define_sets_ret_NOMEM;
1644 }
1645 memcpy(sip->names, sip->merged_set_name, uii);
1646 sip->max_name_len = uii;
1647 if (complement_sets) {
1648 sip->setdefs[0][0] = 1;
1649 sip->setdefs[0][1] = 0;
1650 sip->setdefs[0][2] = marker_ct;
1651 } else {
1652 sip->setdefs[0][0] = 0;
1653 }
1654 LOGPRINTF("--%sset: 1 set defined.\n", make_set? "make-" : "");
1655 break;
1656 define_sets_ret_NOMEM:
1657 retval = RET_NOMEM;
1658 break;
1659 define_sets_ret_OPEN_FAIL:
1660 retval = RET_OPEN_FAIL;
1661 break;
1662 define_sets_ret_READ_FAIL:
1663 retval = RET_READ_FAIL;
1664 break;
1665 define_sets_ret_ALL_MARKERS_EXCLUDED:
1666 logerrprint("Error: All variants excluded by --gene/--gene-all.\n");
1667 define_sets_ret_ALL_MARKERS_EXCLUDED_2:
1668 retval = RET_ALL_MARKERS_EXCLUDED;
1669 break;
1670 define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED:
1671 fill_all_bits(unfiltered_marker_ct, marker_exclude);
1672 *marker_exclude_ct_ptr = unfiltered_marker_ct;
1673 break;
1674 define_sets_ret_INVALID_FORMAT_EXTRA_END:
1675 logerrprint("Error: Extra 'END' token in --set file.\n");
1676 retval = RET_INVALID_FORMAT;
1677 break;
1678 define_sets_ret_INVALID_FORMAT_NO_END:
1679 logerrprint("Error: Last token in --set file isn't 'END'.\n");
1680 define_sets_ret_INVALID_FORMAT:
1681 retval = RET_INVALID_FORMAT;
1682 break;
1683 }
1684 define_sets_ret_1:
1685 bigstack_end_reset(bigstack_end_mark);
1686 fclose_cond(infile);
1687 return retval;
1688 }
1689
write_set(Set_info * sip,char * outname,char * outname_end,uint32_t marker_ct,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude,char * marker_ids,uintptr_t max_marker_id_len,uint32_t * marker_pos,Chrom_info * chrom_info_ptr)1690 int32_t write_set(Set_info* sip, char* outname, char* outname_end, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr) {
1691 unsigned char* bigstack_mark = g_bigstack_base;
1692 FILE* outfile = nullptr;
1693 uintptr_t set_ct = sip->ct;
1694 uintptr_t max_set_name_len = sip->max_name_len;
1695 uintptr_t set_idx = 0;
1696 uint32_t chrom_idx = 0;
1697 int32_t retval = 0;
1698 uintptr_t* ulptr;
1699 uint32_t* marker_idx_to_uidx;
1700 uint32_t* last_idx;
1701 uint32_t* next_adj;
1702 uint32_t* cur_set_ptr;
1703 char* cur_setting;
1704 char* writebuf;
1705 char* bufptr;
1706 char* cptr;
1707 uintptr_t marker_uidx;
1708 uint32_t marker_idx;
1709 uint32_t chrom_end;
1710 uint32_t range_ct;
1711 uint32_t range_start;
1712 uint32_t uii;
1713 uint32_t ujj;
1714 uint32_t ukk;
1715 if (sip->modifier & SET_WRITE_TABLE) {
1716 memcpy(outname_end, ".set.table", 11);
1717 if (fopen_checked(outname, "w", &outfile)) {
1718 goto write_set_ret_OPEN_FAIL;
1719 }
1720 fputs("SNP\tCHR\tBP", outfile);
1721 for (set_idx = 0; set_idx < set_ct; set_idx++) {
1722 putc_unlocked('\t', outfile);
1723 fputs(&(sip->names[set_idx * max_set_name_len]), outfile);
1724 }
1725 if (putc_checked('\n', outfile)) {
1726 goto write_set_ret_WRITE_FAIL;
1727 }
1728 if (bigstack_calloc_ui(set_ct, &last_idx) ||
1729 bigstack_calloc_ui(set_ct, &next_adj) ||
1730 bigstack_alloc_c(set_ct, &cur_setting) ||
1731 bigstack_alloc_c(2 * set_ct, &writebuf)) {
1732 goto write_set_ret_NOMEM;
1733 }
1734 marker_uidx = 0;
1735 g_textbuf[0] = '\t';
1736 chrom_end = 0;
1737 for (set_idx = 1; set_idx < set_ct; set_idx++) {
1738 writebuf[2 * set_idx - 1] = '\t';
1739 }
1740 writebuf[2 * set_ct - 1] = '\n';
1741 for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
1742 next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
1743 if (marker_uidx >= chrom_end) {
1744 uii = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx);
1745 chrom_idx = chrom_info_ptr->chrom_file_order[uii];
1746 chrom_end = chrom_info_ptr->chrom_fo_vidx_start[uii];
1747 }
1748 fputs(&(marker_ids[marker_uidx * max_marker_id_len]), outfile);
1749 bufptr = chrom_name_write(chrom_info_ptr, chrom_idx, &(g_textbuf[1]));
1750 *bufptr++ = '\t';
1751 bufptr = uint32toa_x(marker_pos[marker_uidx], '\t', bufptr);
1752 // do not keep double-tab (if it was intentional, it should have been in
1753 // the header line too...)
1754 fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
1755 bufptr = writebuf;
1756 cptr = cur_setting;
1757 for (set_idx = 0; set_idx < set_ct; set_idx++) {
1758 if (next_adj[set_idx] <= marker_idx) {
1759 cur_set_ptr = sip->setdefs[set_idx];
1760 range_ct = cur_set_ptr[0];
1761 if (range_ct != 0xffffffffU) {
1762 uii = last_idx[set_idx];
1763 if (uii < range_ct) {
1764 range_start = sip->setdefs[set_idx][2 * uii + 1];
1765 if (range_start > marker_idx) {
1766 *cptr = '0';
1767 next_adj[set_idx] = range_start;
1768 } else {
1769 *cptr = '1';
1770 next_adj[set_idx] = sip->setdefs[set_idx][2 * uii + 2];
1771 last_idx[set_idx] = uii + 1;
1772 }
1773 } else {
1774 *cptr = '0';
1775 next_adj[set_idx] = marker_ct;
1776 }
1777 } else {
1778 range_start = cur_set_ptr[1];
1779 uii = cur_set_ptr[3];
1780 if (marker_idx >= range_start) {
1781 ujj = cur_set_ptr[2];
1782 if (marker_idx < ujj + range_start) {
1783 ulptr = (uintptr_t*)(&(cur_set_ptr[4]));
1784 ukk = marker_idx - range_start;
1785 if (IS_SET(ulptr, ukk)) {
1786 *cptr = '1';
1787 ukk++;
1788 next_unset_ck(ulptr, ujj, &ukk);
1789 } else {
1790 *cptr = '0';
1791 ukk = next_set(ulptr, ukk, ujj);
1792 }
1793 next_adj[set_idx] = range_start + ukk;
1794 } else {
1795 *cptr = '0' + uii;
1796 next_adj[set_idx] = marker_ct;
1797 }
1798 } else {
1799 *cptr = '0' + uii;
1800 next_adj[set_idx] = range_start;
1801 }
1802 }
1803 }
1804 *bufptr = *cptr++;
1805 bufptr = &(bufptr[2]);
1806 }
1807 if (fwrite_checked(writebuf, 2 * set_ct, outfile)) {
1808 goto write_set_ret_WRITE_FAIL;
1809 }
1810 }
1811 if (fclose_null(&outfile)) {
1812 goto write_set_ret_WRITE_FAIL;
1813 }
1814 LOGPRINTFWW("--set-table: %s written.\n", outname);
1815 }
1816 if (sip->modifier & SET_WRITE_LIST) {
1817 memcpy(outname_end, ".set", 5);
1818 if (fopen_checked(outname, "w", &outfile)) {
1819 goto write_set_ret_OPEN_FAIL;
1820 }
1821 if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
1822 goto write_set_ret_NOMEM;
1823 }
1824 fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
1825 for (set_idx = 0; set_idx < set_ct; set_idx++) {
1826 fputs(&(sip->names[set_idx * max_set_name_len]), outfile);
1827 putc_unlocked('\n', outfile);
1828 cur_set_ptr = sip->setdefs[set_idx];
1829 range_ct = cur_set_ptr[0];
1830 if (range_ct != 0xffffffffU) {
1831 for (uii = 0; uii < range_ct; uii++) {
1832 ujj = cur_set_ptr[uii * 2 + 2];
1833 for (marker_idx = cur_set_ptr[uii * 2 + 1]; marker_idx < ujj; marker_idx++) {
1834 fputs(&(marker_ids[marker_idx_to_uidx[marker_idx] * max_marker_id_len]), outfile);
1835 putc_unlocked('\n', outfile);
1836 }
1837 }
1838 } else {
1839 range_start = cur_set_ptr[1];
1840 uii = cur_set_ptr[2];
1841 ulptr = (uintptr_t*)(&(cur_set_ptr[4]));
1842 if (cur_set_ptr[3]) {
1843 for (marker_idx = 0; marker_idx < range_start; marker_idx++) {
1844 fputs(&(marker_ids[marker_idx_to_uidx[marker_idx] * max_marker_id_len]), outfile);
1845 putc_unlocked('\n', outfile);
1846 }
1847 }
1848 marker_idx = 0;
1849 while (1) {
1850 next_set_ck(ulptr, uii, &marker_idx);
1851 if (marker_idx == uii) {
1852 break;
1853 }
1854 fputs(&(marker_ids[marker_idx_to_uidx[marker_idx] * max_marker_id_len]), outfile);
1855 putc_unlocked('\n', outfile);
1856 marker_idx++;
1857 }
1858 if ((range_start + uii < marker_ct) && cur_set_ptr[3]) {
1859 for (marker_idx = range_start + uii; marker_idx < marker_ct; marker_idx++) {
1860 fputs(&(marker_ids[marker_idx_to_uidx[marker_idx] * max_marker_id_len]), outfile);
1861 putc_unlocked('\n', outfile);
1862 }
1863 }
1864 }
1865 if (fputs_checked("END\n\n", outfile)) {
1866 goto write_set_ret_WRITE_FAIL;
1867 }
1868 }
1869 if (fclose_null(&outfile)) {
1870 goto write_set_ret_WRITE_FAIL;
1871 }
1872 LOGPRINTFWW("--write-set: %s written.\n", outname);
1873 }
1874 while (0) {
1875 write_set_ret_NOMEM:
1876 retval = RET_NOMEM;
1877 break;
1878 write_set_ret_OPEN_FAIL:
1879 retval = RET_OPEN_FAIL;
1880 break;
1881 write_set_ret_WRITE_FAIL:
1882 retval = RET_WRITE_FAIL;
1883 break;
1884 }
1885 fclose_cond(outfile);
1886 bigstack_reset(bigstack_mark);
1887 return retval;
1888 }
1889
unpack_set(uintptr_t marker_ct,uint32_t * setdef,uintptr_t * include_bitfield)1890 void unpack_set(uintptr_t marker_ct, uint32_t* setdef, uintptr_t* include_bitfield) {
1891 uintptr_t marker_ctl = BITCT_TO_WORDCT(marker_ct);
1892 uint32_t range_ct = setdef[0];
1893 uint32_t keep_outer;
1894 uint32_t range_start;
1895 uint32_t uii;
1896 if (range_ct == 0xffffffffU) {
1897 range_start = setdef[1];
1898 range_ct = setdef[2];
1899 keep_outer = setdef[3];
1900 if (range_start) {
1901 if (keep_outer) {
1902 fill_ulong_one(range_start / BITCT, include_bitfield);
1903 } else {
1904 fill_ulong_zero(range_start / BITCT, include_bitfield);
1905 }
1906 }
1907 memcpy(&(include_bitfield[range_start / BITCT]), (uintptr_t*)(&(setdef[4])), ((range_ct + 127) / 128) * 16);
1908 uii = range_start + range_ct;
1909 if (uii < marker_ct) {
1910 if (keep_outer) {
1911 fill_bits(uii, marker_ct - uii, include_bitfield);
1912 } else {
1913 fill_ulong_zero(marker_ctl - uii / BITCT, &(include_bitfield[uii / BITCT]));
1914 }
1915 }
1916 } else {
1917 fill_ulong_zero(marker_ctl, include_bitfield);
1918 for (uii = 0; uii < range_ct; uii++) {
1919 range_start = setdef[uii * 2 + 1];
1920 fill_bits(range_start, setdef[uii * 2 + 2] - range_start, include_bitfield);
1921 }
1922 }
1923 }
1924
unpack_set_unfiltered(uintptr_t marker_ct,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude,uint32_t * setdef,uintptr_t * new_exclude)1925 void unpack_set_unfiltered(uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t* setdef, uintptr_t* new_exclude) {
1926 uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
1927 uintptr_t last_uidx = next_unset_unsafe(marker_exclude, 0);
1928 uintptr_t marker_uidx = last_uidx;
1929 uint32_t range_ct = setdef[0];
1930 uint32_t range_end = 0;
1931 uintptr_t* bitfield_ptr;
1932 uint32_t* uiptr;
1933 uint32_t keep_outer;
1934 uint32_t range_start;
1935 uint32_t range_idx;
1936 memcpy(new_exclude, marker_exclude, unfiltered_marker_ctl * sizeof(intptr_t));
1937 if (range_ct == 0xffffffffU) {
1938 range_start = setdef[1];
1939 range_ct = setdef[2];
1940 keep_outer = setdef[3];
1941 bitfield_ptr = (uintptr_t*)(&(setdef[4]));
1942 if (range_start) {
1943 // if nonzero, range_start also must be greater than 1
1944 marker_uidx = jump_forward_unset_unsafe(marker_exclude, last_uidx + 1, range_start);
1945 if (!keep_outer) {
1946 fill_bits(last_uidx, marker_uidx - last_uidx, new_exclude);
1947 }
1948 }
1949 for (range_idx = 0; range_idx < range_ct; range_idx++, marker_uidx++) {
1950 next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
1951 // we know that range representation is not more compact, so probably not
1952 // worthwhile to use next_unset/next_set/fill_bits() here
1953 if (!IS_SET(bitfield_ptr, range_idx)) {
1954 SET_BIT(marker_uidx, new_exclude);
1955 }
1956 }
1957 if ((!keep_outer) && (range_start + range_ct < marker_ct)) {
1958 fill_bits(marker_uidx, unfiltered_marker_ct - marker_uidx, new_exclude);
1959 }
1960 } else {
1961 uiptr = &(setdef[1]);
1962 range_idx = 0;
1963 if ((!setdef[1]) && range_ct) {
1964 range_start = *uiptr++;
1965 goto unpack_set_unfiltered_late_start;
1966 }
1967 for (; range_idx < range_ct; range_idx++) {
1968 range_start = *uiptr++;
1969 if (range_start > range_end) {
1970 marker_uidx = jump_forward_unset_unsafe(marker_exclude, last_uidx + 1, range_start - range_end);
1971 }
1972 fill_bits(last_uidx, marker_uidx - last_uidx, new_exclude);
1973 next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
1974 unpack_set_unfiltered_late_start:
1975 range_end = *uiptr++;
1976 if (range_end == marker_ct) {
1977 last_uidx = unfiltered_marker_ct;
1978 break;
1979 }
1980 last_uidx = jump_forward_unset_unsafe(marker_exclude, marker_uidx + 1, range_end - range_start);
1981 }
1982 fill_bits(last_uidx, unfiltered_marker_ct - last_uidx, new_exclude);
1983 }
1984 }
1985
extract_set_union(uint32_t ** setdefs,uintptr_t set_ct,uintptr_t * set_incl,uintptr_t * filtered_union,uintptr_t marker_ct)1986 uint32_t extract_set_union(uint32_t** setdefs, uintptr_t set_ct, uintptr_t* set_incl, uintptr_t* filtered_union, uintptr_t marker_ct) {
1987 uintptr_t marker_ctl = BITCT_TO_WORDCT(marker_ct);
1988
1989 // these track known filled words at the beginning and end. (just intended
1990 // to detect early exit opportunities; doesn't need to be perfect.)
1991 uint32_t unset_startw = 0;
1992 uint32_t unset_endw = marker_ctl;
1993
1994 uint32_t* cur_setdef;
1995 uintptr_t set_idx;
1996 uint32_t range_ct;
1997 uint32_t range_idx;
1998 uint32_t range_start;
1999 uint32_t range_end;
2000 uint32_t keep_outer;
2001 uint32_t read_offset;
2002 fill_ulong_zero(marker_ctl, filtered_union);
2003 for (set_idx = 0; set_idx < set_ct; set_idx++) {
2004 if (set_incl && (!IS_SET(set_incl, set_idx))) {
2005 continue;
2006 }
2007 cur_setdef = setdefs[set_idx];
2008 range_ct = cur_setdef[0];
2009 if (range_ct == 0xffffffffU) {
2010 range_start = cur_setdef[1] / BITCT;
2011 range_end = range_start + (cur_setdef[2] / BITCT);
2012 keep_outer = cur_setdef[3];
2013 if (range_end > unset_startw) {
2014 read_offset = 0;
2015 if (range_start > unset_startw) {
2016 if (keep_outer) {
2017 fill_ulong_one(range_start, filtered_union);
2018 unset_startw = range_start;
2019 }
2020 } else {
2021 read_offset = unset_startw - range_start;
2022 range_start = unset_startw;
2023 }
2024 if (range_end > unset_endw) {
2025 range_end = unset_endw;
2026 }
2027 if (range_start < range_end) {
2028 bitvec_or((uintptr_t*)(&(cur_setdef[4 + (BITCT / 32) * read_offset])), range_end - range_start, &(filtered_union[range_start]));
2029 }
2030 }
2031 if (keep_outer && (range_end < unset_endw)) {
2032 // may overfill end
2033 fill_ulong_one(unset_endw - range_end, &(filtered_union[range_end]));
2034 unset_endw = range_end;
2035 }
2036 } else if (range_ct) {
2037 cur_setdef++;
2038 if (unset_startw) {
2039 // skip all ranges with end <= unset_startw * BITCT
2040 read_offset = uint32arr_greater_than(cur_setdef, range_ct * 2, unset_startw * BITCT + 1) / 2;
2041 if (read_offset) {
2042 if (range_ct == read_offset) {
2043 continue;
2044 }
2045 cur_setdef = &(cur_setdef[read_offset * 2]);
2046 range_ct -= read_offset;
2047 }
2048 }
2049 if (unset_endw < marker_ctl) {
2050 // and skip all ranges with start >= unset_endw * BITCT
2051 range_ct = (uint32arr_greater_than(cur_setdef, range_ct * 2, unset_endw * BITCT) + 1) / 2;
2052 }
2053 if (range_ct) {
2054 range_start = *(cur_setdef++);
2055 range_end = *(cur_setdef++);
2056 if (range_start < unset_startw * BITCT) {
2057 range_start = unset_startw * BITCT;
2058 }
2059 if (range_ct > 1) {
2060 fill_bits(range_start, range_end - range_start, filtered_union);
2061 for (range_idx = 2; range_idx < range_ct; range_idx++) {
2062 range_start = *(cur_setdef++);
2063 range_end = *(cur_setdef++);
2064 fill_bits(range_start, range_end - range_start, filtered_union);
2065 }
2066 range_start = *(cur_setdef++);
2067 range_end = *(cur_setdef++);
2068 }
2069 if (range_end > unset_endw * BITCT) {
2070 range_end = unset_endw * BITCT;
2071 }
2072 fill_bits(range_start, range_end - range_start, filtered_union);
2073 }
2074 }
2075 while (1) {
2076 if (unset_startw >= unset_endw) {
2077 goto extract_set_union_exit_early;
2078 }
2079 if (~(filtered_union[unset_startw])) {
2080 // guaranteed to terminate
2081 while (!(~(filtered_union[unset_endw - 1]))) {
2082 unset_endw--;
2083 }
2084 break;
2085 }
2086 unset_startw++;
2087 }
2088 }
2089 extract_set_union_exit_early:
2090 zero_trailing_bits(marker_ct, filtered_union);
2091 return popcount_longs(filtered_union, marker_ctl);
2092 }
2093
extract_set_union_unfiltered(Set_info * sip,uintptr_t * set_incl,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude,uintptr_t ** union_marker_exclude_ptr,uintptr_t * union_marker_ct_ptr)2094 uint32_t extract_set_union_unfiltered(Set_info* sip, uintptr_t* set_incl, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t** union_marker_exclude_ptr, uintptr_t* union_marker_ct_ptr) {
2095 // If union = all remaining markers, simply makes union_marker_exclude_ptr
2096 // point to marker_exclude. Otherwise, allocates union_marker_exclude on the
2097 // "stack".
2098 // Assumes marker_ct is initial value of *union_marker_ct_ptr.
2099 uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
2100 uintptr_t orig_marker_ct = *union_marker_ct_ptr;
2101 uintptr_t orig_marker_ctl = BITCT_TO_WORDCT(orig_marker_ct);
2102 uintptr_t* union_marker_exclude;
2103 uintptr_t* filtered_union;
2104 if (bigstack_alloc_ul(unfiltered_marker_ctl, &union_marker_exclude) ||
2105 bigstack_alloc_ul(orig_marker_ctl, &filtered_union)) {
2106 return 1;
2107 }
2108 *union_marker_ct_ptr = extract_set_union(sip->setdefs, sip->ct, set_incl, filtered_union, orig_marker_ct);
2109 if ((*union_marker_ct_ptr) == orig_marker_ct) {
2110 bigstack_reset(union_marker_exclude);
2111 *union_marker_exclude_ptr = marker_exclude;
2112 } else {
2113 uncollapse_copy_flip_include_arr(filtered_union, unfiltered_marker_ct, marker_exclude, union_marker_exclude);
2114 bigstack_reset(filtered_union);
2115 *union_marker_exclude_ptr = union_marker_exclude;
2116 }
2117 return 0;
2118 }
2119
setdefs_compress(Set_info * sip,uintptr_t * set_incl,uintptr_t set_ct,uintptr_t unfiltered_marker_ct,uintptr_t * marker_exclude_orig,uintptr_t marker_ct_orig,uintptr_t * marker_exclude,uintptr_t marker_ct,uint32_t *** new_setdefs_ptr)2120 uint32_t setdefs_compress(Set_info* sip, uintptr_t* set_incl, uintptr_t set_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, uintptr_t* marker_exclude, uintptr_t marker_ct, uint32_t*** new_setdefs_ptr) {
2121 printf("entering\n");
2122 // currently assumes marker_exclude does not exclude anything in the union of
2123 // the remaining sets
2124 unsigned char* bigstack_end_mark = g_bigstack_end;
2125 uintptr_t marker_ctlv = ((marker_ct + 127) / 128) * (128 / BITCT);
2126 uint32_t set_uidx = 0;
2127 uintptr_t* cur_bitfield;
2128 uintptr_t* read_bitfield;
2129 uint32_t** new_setdefs;
2130 uint32_t* marker_midx_to_idx;
2131 uint32_t* cur_setdef;
2132 uintptr_t set_idx;
2133 uint32_t range_ct;
2134 uint32_t range_idx;
2135 uint32_t range_offset;
2136 uint32_t range_stop;
2137 uint32_t range_start;
2138 uint32_t range_end;
2139 uint32_t include_out_of_bounds;
2140 uint32_t marker_midx;
2141 new_setdefs = (uint32_t**)bigstack_alloc(set_ct * sizeof(intptr_t));
2142 if (!new_setdefs) {
2143 return 1;
2144 }
2145 if (bigstack_end_alloc_ul(marker_ctlv, &cur_bitfield) ||
2146 bigstack_end_alloc_ui(marker_ct_orig, &marker_midx_to_idx)) {
2147 bigstack_end_reset(bigstack_end_mark);
2148 return 1;
2149 }
2150 fill_midx_to_idx(marker_exclude_orig, marker_exclude, marker_ct, marker_midx_to_idx);
2151 for (set_idx = 0; set_idx < set_ct; set_uidx++, set_idx++) {
2152 if (set_incl) {
2153 next_set_unsafe_ck(set_incl, &set_uidx);
2154 }
2155 cur_setdef = sip->setdefs[set_uidx];
2156 fill_ulong_zero(marker_ctlv, cur_bitfield);
2157 range_ct = cur_setdef[0];
2158 range_start = marker_ct;
2159 range_end = 0;
2160 if (range_ct != 0xffffffffU) {
2161 if (range_ct) {
2162 range_start = marker_midx_to_idx[cur_setdef[1]];
2163 for (range_idx = 0; range_idx < range_ct; range_idx++) {
2164 range_offset = *(++cur_setdef);
2165 range_stop = *(++cur_setdef);
2166 fill_bits(marker_midx_to_idx[range_offset], range_stop - range_offset, cur_bitfield);
2167 }
2168 range_end = marker_midx_to_idx[range_offset] + range_stop - range_offset;
2169 }
2170 } else {
2171 range_offset = cur_setdef[1];
2172 range_stop = cur_setdef[2];
2173 include_out_of_bounds = cur_setdef[3];
2174 read_bitfield = (uintptr_t*)(&(cur_setdef[4]));
2175 if (include_out_of_bounds && range_offset) {
2176 fill_ulong_one(range_offset / BITCT, cur_bitfield);
2177 range_start = 0;
2178 }
2179 for (marker_midx = 0; marker_midx < range_stop; marker_midx++) {
2180 if (IS_SET(read_bitfield, marker_midx)) {
2181 set_bit(marker_midx_to_idx[marker_midx + range_offset], cur_bitfield);
2182 }
2183 }
2184 if (include_out_of_bounds && (range_offset + range_stop < marker_ct_orig)) {
2185 fill_bits(marker_midx_to_idx[range_offset + range_stop], marker_ct_orig - range_offset - range_stop, cur_bitfield);
2186 range_end = marker_ct;
2187 } else {
2188 range_end = 1 + last_set_bit(cur_bitfield, BITCT_TO_WORDCT(marker_ct));
2189 }
2190 if (range_start) {
2191 range_start = marker_midx_to_idx[next_set_unsafe(read_bitfield, 0) + range_offset];
2192 }
2193 }
2194 if (save_set_bitfield(cur_bitfield, marker_ct, range_start, range_end, 0, &(new_setdefs[set_idx]))) {
2195 bigstack_end_reset(bigstack_end_mark);
2196 return 1;
2197 }
2198 }
2199 *new_setdefs_ptr = new_setdefs;
2200 bigstack_end_reset(bigstack_end_mark);
2201 return 0;
2202 }
2203
load_range_list_sortpos(char * fname,uint32_t border_extend,uintptr_t subset_ct,char * sorted_subset_ids,uintptr_t max_subset_id_len,Chrom_info * chrom_info_ptr,uintptr_t * gene_ct_ptr,char ** gene_names_ptr,uintptr_t * max_gene_id_len_ptr,uintptr_t ** chrom_bounds_ptr,uint32_t *** genedefs_ptr,uintptr_t * chrom_max_gene_ct_ptr,const char * file_descrip)2204 int32_t load_range_list_sortpos(char* fname, uint32_t border_extend, uintptr_t subset_ct, char* sorted_subset_ids, uintptr_t max_subset_id_len, Chrom_info* chrom_info_ptr, uintptr_t* gene_ct_ptr, char** gene_names_ptr, uintptr_t* max_gene_id_len_ptr, uintptr_t** chrom_bounds_ptr, uint32_t*** genedefs_ptr, uintptr_t* chrom_max_gene_ct_ptr, const char* file_descrip) {
2205 // --annotate, --clump-range, --gene-report
2206 unsigned char* bigstack_end_mark = g_bigstack_end;
2207 FILE* infile = nullptr;
2208 uintptr_t gene_ct = 0;
2209 uintptr_t max_gene_id_len = 0;
2210 uintptr_t chrom_max_gene_ct = 0;
2211 uint32_t chrom_code_end = chrom_info_ptr->max_code + 1 + chrom_info_ptr->name_ct;
2212 uint32_t chrom_idx = 0;
2213 Make_set_range** gene_arr;
2214 Make_set_range* msr_tmp;
2215 uint64_t* range_sort_buf;
2216 uintptr_t* chrom_bounds;
2217 uint32_t** genedefs;
2218 char* gene_names;
2219 char* bufptr;
2220 uint32_t* uiptr;
2221 uint64_t ullii;
2222 uintptr_t gene_idx;
2223 uintptr_t ulii;
2224 uint32_t range_first;
2225 uint32_t range_last;
2226 uint32_t uii;
2227 uint32_t ujj;
2228 uint32_t ukk;
2229 uint32_t umm;
2230 int32_t retval;
2231 if (fopen_checked(fname, "r", &infile)) {
2232 goto load_range_list_sortpos_ret_OPEN_FAIL;
2233 }
2234 retval = load_range_list(infile, 1, border_extend, 0, 0, 0, 0, subset_ct, sorted_subset_ids, 0, nullptr, chrom_info_ptr, &gene_ct, gene_names_ptr, &max_gene_id_len, &gene_arr, &range_sort_buf, file_descrip);
2235 if (retval) {
2236 goto load_range_list_sortpos_ret_1;
2237 }
2238 gene_names = *gene_names_ptr;
2239 if (bigstack_alloc_ul(chrom_code_end + 1, chrom_bounds_ptr)) {
2240 goto load_range_list_sortpos_ret_NOMEM;
2241 }
2242 chrom_bounds = *chrom_bounds_ptr;
2243 chrom_bounds[0] = 0;
2244 genedefs = (uint32_t**)bigstack_alloc(gene_ct * sizeof(intptr_t));
2245 if (!genedefs) {
2246 goto load_range_list_sortpos_ret_NOMEM;
2247 }
2248 for (gene_idx = 0; gene_idx < gene_ct; gene_idx++) {
2249 bufptr = &(gene_names[gene_idx * max_gene_id_len]);
2250 // instead of subtracting 33/48 separately from each character, just
2251 // subtract
2252 // 48 * (1000 + 100 + 10) + 33 once at the end.
2253 // (last prefix character must be nonnumeric to prevent weird natural sort
2254 // interaction)
2255 uii = (((unsigned char)bufptr[0]) * 1000) + (((unsigned char)bufptr[1]) * 100) + (((unsigned char)bufptr[2]) * 10) + ((unsigned char)bufptr[3]) - 53313;
2256 if (chrom_idx < uii) {
2257 ulii = gene_idx - chrom_bounds[chrom_idx];
2258 if (ulii > chrom_max_gene_ct) {
2259 chrom_max_gene_ct = ulii;
2260 }
2261 do {
2262 chrom_bounds[++chrom_idx] = gene_idx;
2263 } while (chrom_idx < uii);
2264 }
2265 msr_tmp = gene_arr[gene_idx];
2266 uii = 0;
2267 while (msr_tmp) {
2268 range_sort_buf[uii++] = (((uint64_t)(msr_tmp->uidx_start)) << 32) | ((uint64_t)(msr_tmp->uidx_end));
2269 msr_tmp = msr_tmp->next;
2270 }
2271 if (!uii) {
2272 if (bigstack_left() < 16) {
2273 goto load_range_list_sortpos_ret_NOMEM;
2274 }
2275 genedefs[gene_idx] = (uint32_t*)g_bigstack_base;
2276 g_bigstack_base = &(g_bigstack_base[16]);
2277 genedefs[gene_idx][0] = 0;
2278 continue;
2279 }
2280 #ifdef __cplusplus
2281 std::sort((int64_t*)range_sort_buf, (int64_t*)(&(range_sort_buf[uii])));
2282 #else
2283 qsort(range_sort_buf, uii, sizeof(int64_t), llcmp);
2284 #endif
2285 ukk = 0; // current end of sorted interval list
2286 range_last = (uint32_t)range_sort_buf[0];
2287 for (ujj = 1; ujj < uii; ujj++) {
2288 ullii = range_sort_buf[ujj];
2289 range_first = (uint32_t)(ullii >> 32);
2290 if (range_first <= range_last) {
2291 umm = (uint32_t)ullii;
2292 if (umm > range_last) {
2293 range_last = umm;
2294 range_sort_buf[ukk] = (range_sort_buf[ukk] & 0xffffffff00000000LLU) | (ullii & 0xffffffffLLU);
2295 }
2296 } else {
2297 if (++ukk < ujj) {
2298 range_sort_buf[ukk] = ullii;
2299 }
2300 range_last = (uint32_t)ullii;
2301 }
2302 }
2303
2304 // this boilerplate can be removed once a 16-instead-of-64-byte-aligned
2305 // bigstack_alloc() exists
2306 ulii = (((++ukk) * 2 + 4) * sizeof(int32_t)) & (~(15 * ONELU));
2307 if (bigstack_left() < ulii) {
2308 goto load_range_list_sortpos_ret_NOMEM;
2309 }
2310 genedefs[gene_idx] = (uint32_t*)g_bigstack_base;
2311 g_bigstack_base = &(g_bigstack_base[ulii]);
2312 uiptr = genedefs[gene_idx];
2313 *uiptr++ = ukk;
2314 for (uii = 0; uii < ukk; uii++) {
2315 ullii = range_sort_buf[uii];
2316 *uiptr++ = (uint32_t)(ullii >> 32);
2317 *uiptr++ = (uint32_t)ullii;
2318 }
2319 }
2320 ulii = gene_ct - chrom_bounds[chrom_idx];
2321 if (ulii > chrom_max_gene_ct) {
2322 chrom_max_gene_ct = ulii;
2323 }
2324 while (chrom_idx < chrom_code_end) {
2325 chrom_bounds[++chrom_idx] = gene_ct;
2326 }
2327 if (fclose_null(&infile)) {
2328 goto load_range_list_sortpos_ret_READ_FAIL;
2329 }
2330 *gene_ct_ptr = gene_ct;
2331 *max_gene_id_len_ptr = max_gene_id_len;
2332 *chrom_max_gene_ct_ptr = chrom_max_gene_ct;
2333 *genedefs_ptr = genedefs;
2334 while (0) {
2335 load_range_list_sortpos_ret_NOMEM:
2336 retval = RET_NOMEM;
2337 break;
2338 load_range_list_sortpos_ret_OPEN_FAIL:
2339 retval = RET_OPEN_FAIL;
2340 break;
2341 load_range_list_sortpos_ret_READ_FAIL:
2342 retval = RET_READ_FAIL;
2343 break;
2344 }
2345 load_range_list_sortpos_ret_1:
2346 bigstack_end_reset(bigstack_end_mark);
2347 fclose_cond(infile);
2348 return retval;
2349 }
2350
scrape_extra_chroms(const char * fname,const char * file_descrip,Chrom_info * chrom_info_ptr)2351 int32_t scrape_extra_chroms(const char* fname, const char* file_descrip, Chrom_info* chrom_info_ptr) {
2352 // scan first column of file, add these to chromosome names.
2353 // may want to add an option for this to search for "CHR"/"#CHROM" column
2354 uintptr_t line_idx = 0;
2355 FILE* infile = nullptr;
2356 int32_t retval = 0;
2357 {
2358 if (fopen_checked(fname, "r", &infile)) {
2359 goto scrape_extra_chroms_ret_OPEN_FAIL;
2360 }
2361 g_textbuf[MAXLINELEN - 1] = ' ';
2362 while (fgets(g_textbuf, MAXLINELEN, infile)) {
2363 ++line_idx;
2364 if (!g_textbuf[MAXLINELEN - 1]) {
2365 sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, file_descrip);
2366 goto scrape_extra_chroms_ret_INVALID_FORMAT_2;
2367 }
2368 char* first_token = skip_initial_spaces(g_textbuf);
2369 if (is_eoln_kns(*first_token)) {
2370 continue;
2371 }
2372 char* first_token_end = token_endnn(first_token);
2373 const uint32_t chrom_name_slen = (uintptr_t)(first_token_end - first_token);
2374 *first_token_end = '\0';
2375 int32_t dummy;
2376 retval = get_or_add_chrom_code(first_token, file_descrip, line_idx, chrom_name_slen, 1, chrom_info_ptr, &dummy);
2377 if (retval) {
2378 goto scrape_extra_chroms_ret_1;
2379 }
2380 }
2381 if (fclose_null(&infile)) {
2382 goto scrape_extra_chroms_ret_READ_FAIL;
2383 }
2384 }
2385 while (0) {
2386 scrape_extra_chroms_ret_OPEN_FAIL:
2387 retval = RET_OPEN_FAIL;
2388 break;
2389 scrape_extra_chroms_ret_READ_FAIL:
2390 retval = RET_READ_FAIL;
2391 break;
2392 scrape_extra_chroms_ret_INVALID_FORMAT_2:
2393 logerrprintb();
2394 retval = RET_INVALID_FORMAT;
2395 break;
2396 }
2397 scrape_extra_chroms_ret_1:
2398 fclose_cond(infile);
2399 return retval;
2400 }
2401
annotate(const Annot_info * aip,uint32_t allow_extra_chroms,char * outname,char * outname_end,double pfilter,Chrom_info * chrom_info_ptr)2402 int32_t annotate(const Annot_info* aip, uint32_t allow_extra_chroms, char* outname, char* outname_end, double pfilter, Chrom_info* chrom_info_ptr) {
2403 unsigned char* bigstack_mark = g_bigstack_base;
2404 unsigned char* bigstack_end_mark = g_bigstack_end;
2405 gzFile gz_attribfile = nullptr;
2406 FILE* infile = nullptr;
2407 FILE* outfile = nullptr;
2408 char* sorted_snplist = nullptr;
2409 char* sorted_attr_ids = nullptr; // natural-sorted
2410 char* sorted_snplist_attr_ids = nullptr;
2411 char* sorted_subset_ids = nullptr;
2412 char* range_names = nullptr;
2413 char* filter_range_names = nullptr;
2414 char* wptr = nullptr;
2415 const char* snp_field = nullptr;
2416 uintptr_t* attr_bitfields = nullptr;
2417 uintptr_t* chrom_bounds = nullptr;
2418 uintptr_t* chrom_filter_bounds = nullptr;
2419 uint32_t** rangedefs = nullptr;
2420 uint32_t** filter_rangedefs = nullptr;
2421 uint32_t* range_idx_lookup = nullptr;
2422 uint32_t* attr_id_remap = nullptr;
2423 uint32_t* merged_attr_idx_buf = nullptr;
2424 const char constsnpstr[] = "SNP";
2425 const char constdotstr[] = ".";
2426 const char constnastr[] = "NA";
2427 const char constdot4str[] = " .";
2428 const char constna4str[] = " NA";
2429 const char* no_annot_str = (aip->modifier & ANNOT_NA)? constnastr : constdotstr;
2430 const char* no_sign_str = (aip->modifier & ANNOT_NA)? constna4str : constdot4str;
2431 uintptr_t snplist_ct = 0;
2432 uintptr_t max_snplist_id_len = 0;
2433 uintptr_t snplist_attr_ct = 0;
2434 uintptr_t max_snplist_attr_id_len = 0;
2435 uintptr_t attr_id_ct = 0;
2436 uintptr_t attr_id_ctl = 0;
2437 uintptr_t max_attr_id_len = 0;
2438 uintptr_t subset_ct = 0;
2439 uintptr_t max_subset_id_len = 0;
2440 uintptr_t range_ct = 0;
2441 uintptr_t max_range_name_len = 0;
2442 uintptr_t filter_range_ct = 0;
2443 uintptr_t max_filter_range_name_len = 0;
2444 uintptr_t chrom_max_range_ct = 0;
2445 uintptr_t chrom_max_filter_range_ct = 0;
2446 uintptr_t annot_row_ct = 0;
2447 uintptr_t total_row_ct = 0;
2448 uint32_t max_onevar_attr_ct = 0;
2449 uint32_t snp_field_len = 0;
2450 uint32_t border = aip->border;
2451 uint32_t need_var_id = (aip->attrib_fname || aip->snps_fname);
2452 uint32_t need_pos = (aip->ranges_fname || aip->filter_fname);
2453 uint32_t do_pfilter = (pfilter != 2.0);
2454 uint32_t token_ct = need_var_id + 2 * need_pos + do_pfilter;
2455 uint32_t block01 = (aip->modifier & ANNOT_BLOCK);
2456 uint32_t prune = (aip->modifier & ANNOT_PRUNE);
2457 uint32_t range_dist = !(aip->modifier & ANNOT_MINIMAL);
2458 uint32_t track_distance = (aip->modifier & ANNOT_DISTANCE);
2459 uint32_t col_idx = 0;
2460 uint32_t seq_idx = 0;
2461 uint32_t max_header_len = 3;
2462 uint32_t unique_annot_ct = 0;
2463 uint32_t unique_annot_ctlw = 0;
2464 int32_t chrom_idx = 0;
2465 int32_t min_dist = 0;
2466 int32_t retval = 0;
2467
2468 // col_skips[0..(token_ct - 1)] stores deltas between adjacent column indices
2469 // ([0] = 0-based index of first column), and token_ptrs[0..(token_ct - 1)]
2470 // points to those token start positions in the current line.
2471 // Since the order of the columns may vary, col_sequence[0] = col_skips
2472 // position of CHR index, [1] = BP index, [2] = SNP index, and [3] = P index.
2473 char* token_ptrs[4];
2474 uint32_t col_skips[4];
2475 uint32_t col_sequence[4];
2476
2477 Ll_str** attr_id_htable;
2478 Ll_str** ll_pptr;
2479 Ll_str* ll_ptr;
2480 char* loadbuf;
2481 char* merged_attr_ids;
2482 char* writebuf;
2483 char* bufptr;
2484 char* bufptr2;
2485 uintptr_t* ulptr;
2486 uint32_t* uiptr;
2487 uintptr_t loadbuf_size;
2488 uintptr_t line_idx;
2489 uintptr_t range_idx;
2490 uintptr_t ulii;
2491 uintptr_t uljj;
2492 double pval;
2493 uint32_t slen;
2494 uint32_t write_idx;
2495 uint32_t cur_bp;
2496 uint32_t abs_min_dist;
2497 uint32_t at_least_one_annot;
2498 uint32_t uii;
2499 uint32_t ujj;
2500 int32_t sorted_idx;
2501 int32_t ii;
2502 if (need_var_id) {
2503 if (aip->snpfield) {
2504 snp_field = aip->snpfield;
2505 snp_field_len = strlen(snp_field);
2506 if (snp_field_len > 3) {
2507 max_header_len = 3;
2508 }
2509 } else {
2510 snp_field = constsnpstr;
2511 snp_field_len = 3;
2512 }
2513 if (aip->snps_fname) {
2514 if (fopen_checked(aip->snps_fname, FOPEN_RB, &infile)) {
2515 goto annotate_ret_OPEN_FAIL;
2516 }
2517 retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &snplist_ct, &max_snplist_id_len);
2518 if (retval) {
2519 if (retval == RET_INVALID_FORMAT) {
2520 logerrprint("Error: Pathologically long token in --annotate snps file.\n");
2521 }
2522 goto annotate_ret_1;
2523 }
2524 if (!snplist_ct) {
2525 sprintf(g_logbuf, "Error: %s is empty.\n", aip->snps_fname);
2526 goto annotate_ret_INVALID_FORMAT_WW;
2527 }
2528 if (bigstack_alloc_c(snplist_ct * max_snplist_id_len, &sorted_snplist)) {
2529 goto annotate_ret_NOMEM;
2530 }
2531 rewind(infile);
2532 retval = read_tokens(MAXLINELEN, snplist_ct, max_snplist_id_len, infile, g_textbuf, sorted_snplist);
2533 if (retval) {
2534 goto annotate_ret_1;
2535 }
2536 if (fclose_null(&infile)) {
2537 goto annotate_ret_READ_FAIL;
2538 }
2539 qsort(sorted_snplist, snplist_ct, max_snplist_id_len, strcmp_casted);
2540 ulii = collapse_duplicate_ids(sorted_snplist, snplist_ct, max_snplist_id_len, nullptr);
2541 if (ulii < snplist_ct) {
2542 snplist_ct = ulii;
2543 bigstack_shrink_top(sorted_snplist, snplist_ct * max_snplist_id_len);
2544 }
2545 }
2546 if (aip->attrib_fname) {
2547 retval = gzopen_read_checked(aip->attrib_fname, &gz_attribfile);
2548 if (retval) {
2549 goto annotate_ret_1;
2550 }
2551 line_idx = 0;
2552 g_textbuf[MAXLINELEN - 1] = ' ';
2553 // two-pass load.
2554 // 1. determine attribute set, as well as relevant variant ID count and
2555 // max length
2556 // intermission. extract attribute names from hash table, natural sort,
2557 // deallocate hash table
2558 // 2. save relevant variant IDs and attribute bitfields, then qsort_ext()
2559 attr_id_htable = (Ll_str**)bigstack_end_alloc(HASHMEM);
2560 if (!attr_id_htable) {
2561 goto annotate_ret_NOMEM;
2562 }
2563 for (uii = 0; uii < HASHSIZE; uii++) {
2564 attr_id_htable[uii] = nullptr;
2565 }
2566 while (1) {
2567 line_idx++;
2568 if (!gzgets(gz_attribfile, g_textbuf, MAXLINELEN)) {
2569 if (!gzeof(gz_attribfile)) {
2570 goto annotate_ret_READ_FAIL;
2571 }
2572 break;
2573 }
2574 if (!g_textbuf[MAXLINELEN - 1]) {
2575 sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, aip->attrib_fname);
2576 goto annotate_ret_INVALID_FORMAT_WW;
2577 }
2578 bufptr = skip_initial_spaces(g_textbuf);
2579 if (is_eoln_kns(*bufptr)) {
2580 continue;
2581 }
2582 bufptr2 = token_endnn(bufptr);
2583 slen = (uintptr_t)(bufptr2 - bufptr);
2584 bufptr2 = skip_initial_spaces(bufptr2);
2585 if (is_eoln_kns(*bufptr2) || (sorted_snplist && (bsearch_str(bufptr, slen, sorted_snplist, max_snplist_id_len, snplist_ct) == -1))) {
2586 continue;
2587 }
2588 snplist_attr_ct++;
2589 if (slen >= max_snplist_attr_id_len) {
2590 max_snplist_attr_id_len = slen + 1;
2591 }
2592 ujj = 0;
2593 do {
2594 ujj++;
2595 bufptr = token_endnn(bufptr2);
2596 slen = (uintptr_t)(bufptr - bufptr2);
2597 bufptr = skip_initial_spaces(bufptr);
2598 bufptr2[slen] = '\0';
2599 uii = hashval2(bufptr2, slen++);
2600 ll_pptr = &(attr_id_htable[uii]);
2601 while (1) {
2602 ll_ptr = *ll_pptr;
2603 if (!ll_ptr) {
2604 #ifdef __LP64__
2605 // we'll run out of memory way earlier in 32-bit mode
2606 if (attr_id_ct == 0x80000000LLU) {
2607 sprintf(g_logbuf, "Error: Too many unique attributes in %s (max 2147483648).\n", aip->attrib_fname);
2608 goto annotate_ret_INVALID_FORMAT_WW;
2609 }
2610 #endif
2611 attr_id_ct++;
2612 if (bigstack_end_alloc_llstr(slen, &ll_ptr)) {
2613 goto annotate_ret_NOMEM;
2614 }
2615 ll_ptr->next = nullptr;
2616 memcpy(ll_ptr->ss, bufptr2, slen);
2617 if (slen > max_attr_id_len) {
2618 max_attr_id_len = slen;
2619 }
2620 *ll_pptr = ll_ptr;
2621 break;
2622 }
2623 if (!strcmp(ll_ptr->ss, bufptr2)) {
2624 break;
2625 }
2626 ll_pptr = &(ll_ptr->next);
2627 }
2628 bufptr2 = bufptr;
2629 } while (!is_eoln_kns(*bufptr2));
2630 if (ujj > max_onevar_attr_ct) {
2631 max_onevar_attr_ct = ujj;
2632 }
2633 }
2634 if (!attr_id_ct) {
2635 sprintf(g_logbuf, "Error: No attributes in %s.\n", aip->attrib_fname);
2636 goto annotate_ret_INVALID_FORMAT_WW;
2637 }
2638 if (max_onevar_attr_ct > attr_id_ct) {
2639 // pathological case: a line of the attribute file has the same
2640 // attribute repeated over and over again for some reason
2641 max_onevar_attr_ct = attr_id_ct;
2642 }
2643 if (bigstack_alloc_c(attr_id_ct * max_attr_id_len, &sorted_attr_ids)) {
2644 goto annotate_ret_NOMEM;
2645 }
2646 ulii = 0;
2647 for (uii = 0; uii < HASHSIZE; uii++) {
2648 ll_ptr = attr_id_htable[uii];
2649 while (ll_ptr) {
2650 strcpy(&(sorted_attr_ids[ulii * max_attr_id_len]), ll_ptr->ss);
2651 ulii++;
2652 ll_ptr = ll_ptr->next;
2653 }
2654 }
2655 qsort(sorted_attr_ids, attr_id_ct, max_attr_id_len, strcmp_natural);
2656 bigstack_end_reset(bigstack_end_mark);
2657 gzrewind(gz_attribfile);
2658 attr_id_ctl = BITCT_TO_WORDCT(attr_id_ct);
2659 if (bigstack_alloc_c(snplist_attr_ct * max_snplist_attr_id_len, &sorted_snplist_attr_ids) ||
2660 bigstack_calloc_ul(snplist_attr_ct * attr_id_ctl, &attr_bitfields)) {
2661 goto annotate_ret_NOMEM;
2662 }
2663 for (ulii = 0; ulii < snplist_attr_ct; ulii++) {
2664 annotate_skip_line:
2665 if (!gzgets(gz_attribfile, g_textbuf, MAXLINELEN)) {
2666 goto annotate_ret_READ_FAIL;
2667 }
2668 bufptr = skip_initial_spaces(g_textbuf);
2669 if (is_eoln_kns(*bufptr)) {
2670 goto annotate_skip_line;
2671 }
2672 bufptr2 = token_endnn(bufptr);
2673 slen = (uintptr_t)(bufptr2 - bufptr);
2674 bufptr2 = skip_initial_spaces(bufptr2);
2675 if (is_eoln_kns(*bufptr2) || (sorted_snplist && (bsearch_str(bufptr, slen, sorted_snplist, max_snplist_id_len, snplist_ct) == -1))) {
2676 goto annotate_skip_line;
2677 }
2678 memcpyx(&(sorted_snplist_attr_ids[ulii * max_snplist_attr_id_len]), bufptr, slen, '\0');
2679 ulptr = &(attr_bitfields[ulii * attr_id_ctl]);
2680 do {
2681 bufptr = token_endnn(bufptr2);
2682 slen = (uintptr_t)(bufptr - bufptr2);
2683 bufptr = skip_initial_spaces(bufptr);
2684 bufptr2[slen] = '\0';
2685 sorted_idx = bsearch_str_natural(bufptr2, sorted_attr_ids, max_attr_id_len, attr_id_ct);
2686 set_bit(sorted_idx, ulptr);
2687 bufptr2 = bufptr;
2688 } while (!is_eoln_kns(*bufptr2));
2689 }
2690 gzclose(gz_attribfile);
2691 gz_attribfile = nullptr;
2692 if (qsort_ext(sorted_snplist_attr_ids, snplist_attr_ct, max_snplist_attr_id_len, strcmp_deref, (char*)attr_bitfields, attr_id_ctl * sizeof(intptr_t))) {
2693 goto annotate_ret_NOMEM;
2694 }
2695 LOGPRINTFWW("--annotate attrib: %" PRIuPTR " variant ID%s and %" PRIuPTR " unique attribute%s loaded from %s.\n", snplist_attr_ct, (snplist_attr_ct == 1)? "" : "s", attr_id_ct, (attr_id_ct == 1)? "" : "s", aip->attrib_fname);
2696 }
2697 }
2698 if (need_pos) {
2699 if (aip->ranges_fname) {
2700 if (allow_extra_chroms) {
2701 retval = scrape_extra_chroms(aip->ranges_fname, "--annotate ranges file", chrom_info_ptr);
2702 if (retval) {
2703 goto annotate_ret_1;
2704 }
2705 }
2706 if (aip->subset_fname) {
2707 if (fopen_checked(aip->subset_fname, FOPEN_RB, &infile)) {
2708 goto annotate_ret_OPEN_FAIL;
2709 }
2710 retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &subset_ct, &max_subset_id_len);
2711 if (retval) {
2712 if (retval == RET_INVALID_FORMAT) {
2713 logerrprint("Error: Pathologically long token in --annotate subset file.\n");
2714 }
2715 goto annotate_ret_1;
2716 }
2717 if (!subset_ct) {
2718 logerrprint("Error: --annotate subset file is empty.\n");
2719 goto annotate_ret_INVALID_FORMAT;
2720 }
2721 if (max_subset_id_len > MAX_ID_BLEN) {
2722 logerrprint("Error: --annotate subset IDs are limited to " MAX_ID_SLEN_STR " characters.\n");
2723 goto annotate_ret_INVALID_FORMAT;
2724 }
2725 if (bigstack_end_alloc_c(subset_ct * max_subset_id_len, &sorted_subset_ids)) {
2726 goto annotate_ret_NOMEM;
2727 }
2728 rewind(infile);
2729 retval = read_tokens(MAXLINELEN, subset_ct, max_subset_id_len, infile, g_textbuf, sorted_subset_ids);
2730 if (retval) {
2731 goto annotate_ret_1;
2732 }
2733 if (fclose_null(&infile)) {
2734 goto annotate_ret_READ_FAIL;
2735 }
2736 qsort(sorted_subset_ids, subset_ct, max_subset_id_len, strcmp_casted);
2737 subset_ct = collapse_duplicate_ids(sorted_subset_ids, subset_ct, max_subset_id_len, nullptr);
2738 }
2739 // normally can't use border here because we need nearest distance
2740 retval = load_range_list_sortpos(aip->ranges_fname, (block01 && (!track_distance))? border : 0, subset_ct, sorted_subset_ids, max_subset_id_len, chrom_info_ptr, &range_ct, &range_names, &max_range_name_len, &chrom_bounds, &rangedefs, &chrom_max_range_ct, "--annotate ranges");
2741 if (retval) {
2742 goto annotate_ret_1;
2743 }
2744 #ifdef __LP64__
2745 if (range_ct > 0x80000000LLU) {
2746 sprintf(g_logbuf, "Error: Too many annotations in %s (max 2147483648, counting multi-chromosome annotations once per spanned chromosome).\n", aip->ranges_fname);
2747 goto annotate_ret_INVALID_FORMAT_WW;
2748 }
2749 #endif
2750 if (range_ct != 1) {
2751 LOGPRINTFWW("--annotate ranges: %" PRIuPTR " annotations loaded from %s (counting multi-chromosome annotations once per spanned chromosome).\n", range_ct, aip->ranges_fname);
2752 } else {
2753 LOGPRINTFWW("--annotate ranges: 1 annotation loaded from %s.\n", aip->ranges_fname);
2754 }
2755 }
2756 bigstack_end_reset(bigstack_end_mark);
2757 if (aip->filter_fname) {
2758 if (allow_extra_chroms) {
2759 retval = scrape_extra_chroms(aip->filter_fname, "--annotate filter file", chrom_info_ptr);
2760 if (retval) {
2761 goto annotate_ret_1;
2762 }
2763 }
2764 retval = load_range_list_sortpos(aip->filter_fname, border, 0, nullptr, 0, chrom_info_ptr, &filter_range_ct, &filter_range_names, &max_filter_range_name_len, &chrom_filter_bounds, &filter_rangedefs, &chrom_max_filter_range_ct, "--annotate filter");
2765 if (retval) {
2766 goto annotate_ret_1;
2767 }
2768 }
2769 }
2770 if (block01) {
2771 ulii = attr_id_ct + range_ct;
2772 if (range_names) {
2773 // need [range_names idx -> merged natural sort order] and
2774 // [attribute idx -> merged natural sort order] lookup tables
2775 if (ulii > 0x3fffffff) {
2776 logerrprint("Error: Too many unique annotations for '--annotate block' (max 1073741823).\n");
2777 goto annotate_ret_INVALID_FORMAT;
2778 }
2779 if (bigstack_alloc_ui(ulii, &range_idx_lookup)) {
2780 goto annotate_ret_NOMEM;
2781 }
2782 if (attr_id_ct) {
2783 attr_id_remap = &(range_idx_lookup[range_ct]);
2784 }
2785 // create a master natural-sorted annotation ID list
2786
2787 uii = MAXV((max_range_name_len - 4), max_attr_id_len);
2788 // this must persist until the output header line has been written
2789 if (bigstack_end_alloc_ui(ulii, &merged_attr_idx_buf) ||
2790 // this is larger and doesn't need to persist
2791 bigstack_end_alloc_c(ulii * uii, &merged_attr_ids)) {
2792 goto annotate_ret_NOMEM;
2793 }
2794 uiptr = merged_attr_idx_buf;
2795 for (uljj = 0; uljj < range_ct; uljj++) {
2796 strcpy(&(merged_attr_ids[uljj * uii]), &(range_names[uljj * max_range_name_len + 4]));
2797 *uiptr++ = uljj;
2798 }
2799 if (attr_id_ct) {
2800 for (ujj = 0, uljj = range_ct; uljj < ulii; uljj++, ujj++) {
2801 strcpy(&(merged_attr_ids[uljj * uii]), &(sorted_attr_ids[ujj * max_attr_id_len]));
2802 *uiptr++ = ujj + 0x80000000U;
2803 }
2804 }
2805 if (qsort_ext(merged_attr_ids, ulii, uii, strcmp_natural_deref, (char*)merged_attr_idx_buf, sizeof(int32_t))) {
2806 goto annotate_ret_NOMEM;
2807 }
2808 bigstack_end_reset(merged_attr_idx_buf);
2809
2810 // similar to collapse_duplicate_ids(), except we need to save lookup
2811 // info
2812 // uljj = read idx
2813 uljj = 0;
2814 write_idx = 0;
2815 bufptr = merged_attr_ids; // write pointer
2816 ujj = merged_attr_idx_buf[0];
2817 while (1) {
2818 if (ujj < 0x80000000U) {
2819 range_idx_lookup[ujj] = 2 * write_idx + 1;
2820 } else {
2821 attr_id_remap[ujj & 0x7fffffff] = 2 * write_idx + 1;
2822 }
2823 if (++uljj == ulii) {
2824 break;
2825 }
2826 ujj = merged_attr_idx_buf[uljj];
2827 if (strcmp(bufptr, &(merged_attr_ids[uljj * uii]))) {
2828 bufptr = &(bufptr[uii]);
2829 if (++write_idx != uljj) {
2830 merged_attr_idx_buf[write_idx] = ujj;
2831 strcpy(bufptr, &(merged_attr_ids[uljj * uii]));
2832 }
2833 }
2834 }
2835 unique_annot_ct = write_idx + 1;
2836 } else {
2837 unique_annot_ct = attr_id_ct;
2838 }
2839 #ifdef __LP64__
2840 unique_annot_ctlw = (unique_annot_ct + 3) / 4;
2841 #else
2842 unique_annot_ctlw = (unique_annot_ct + 1) / 2;
2843 #endif
2844 LOGPRINTF("--annotate block: %u unique annotation%s present.\n", unique_annot_ct, (unique_annot_ct == 1)? "" : "s");
2845 if (unique_annot_ct > 1000) {
2846 logerrprint("Warning: Output file may be very large. Are you sure you want >1000 additional\ncolumns per line? If not, restart without 'block'.\n");
2847 }
2848 if (bigstack_alloc_c(unique_annot_ctlw * sizeof(intptr_t), &writebuf)) {
2849 goto annotate_ret_NOMEM;
2850 }
2851 ulptr = (uintptr_t*)writebuf;
2852 for (ulii = 0; ulii < unique_annot_ctlw; ulii++) {
2853 // fill with repeated " 0"
2854 #ifdef __LP64__
2855 *ulptr++ = 0x3020302030203020LLU;
2856 #else
2857 *ulptr++ = 0x30203020;
2858 #endif
2859 }
2860 wptr = &(writebuf[unique_annot_ct * 2]);
2861 } else {
2862 // worst case: max_onevar_attr_ct attributes and chrom_max_range_ct range
2863 // annotations
2864 if (bigstack_alloc_c((max_onevar_attr_ct * max_attr_id_len) + (chrom_max_range_ct * (max_range_name_len + (3 + 16 * (border != 0)) * range_dist)), &writebuf)) {
2865 goto annotate_ret_NOMEM;
2866 }
2867 }
2868 loadbuf = (char*)g_bigstack_base;
2869 loadbuf_size = bigstack_left();
2870 if (loadbuf_size > MAXLINEBUFLEN) {
2871 loadbuf_size = MAXLINEBUFLEN;
2872 } else if (loadbuf_size <= MAXLINELEN) {
2873 goto annotate_ret_NOMEM;
2874 }
2875 // drop undocumented support for gzipped PLINK report input files, since it
2876 // came with conditional gzipping of output, and that's a pain to do right if
2877 // we want to support lines longer than 128K
2878 retval = open_and_load_to_first_token(&infile, aip->fname, loadbuf_size, '\0', "--annotate file", loadbuf, &bufptr, &line_idx);
2879 if (retval) {
2880 goto annotate_ret_1;
2881 }
2882 ujj = 0; // bitfield tracking which columns have already been found
2883 do {
2884 bufptr2 = token_endnn(bufptr);
2885 slen = (uintptr_t)(bufptr2 - bufptr);
2886 if (slen <= max_header_len) {
2887 if (need_pos && (slen == 3) && (!memcmp(bufptr, "CHR", 3))) {
2888 uii = 0;
2889 } else if (need_pos && (slen == 2) && (!memcmp(bufptr, "BP", 2))) {
2890 uii = 1;
2891 } else if (need_var_id && (slen == snp_field_len) && (!memcmp(bufptr, snp_field, snp_field_len))) {
2892 uii = 2;
2893 } else if ((slen == do_pfilter) && (*bufptr == 'P')) {
2894 uii = 3;
2895 } else {
2896 uii = 4;
2897 }
2898 if (uii != 4) {
2899 if ((ujj >> uii) & 1) {
2900 *bufptr2 = '\0';
2901 sprintf(g_logbuf, "Error: Duplicate column header '%s' in %s.\n", bufptr, aip->fname);
2902 goto annotate_ret_INVALID_FORMAT_WW;
2903 }
2904 ujj |= 1 << uii;
2905 col_skips[seq_idx] = col_idx;
2906 col_sequence[seq_idx++] = uii;
2907 }
2908 }
2909 bufptr = skip_initial_spaces(bufptr2);
2910 col_idx++;
2911 } while (!is_eoln_kns(*bufptr));
2912 if (seq_idx != token_ct) {
2913 sprintf(g_logbuf, "Error: Missing column header%s in %s.\n", (seq_idx + 1 == token_ct)? "" : "s", aip->fname);
2914 goto annotate_ret_INVALID_FORMAT_WW;
2915 }
2916 // bugfix: must go backwards
2917 for (ujj = seq_idx - 1; ujj; --ujj) {
2918 col_skips[ujj] -= col_skips[ujj - 1];
2919 }
2920 memcpy(outname_end, ".annot", 7);
2921 if (fopen_checked(outname, "w", &outfile)) {
2922 goto annotate_ret_OPEN_FAIL;
2923 }
2924 if (fwrite_checked(loadbuf, (uintptr_t)(bufptr - loadbuf), outfile)) {
2925 goto annotate_ret_WRITE_FAIL;
2926 }
2927 if (track_distance) {
2928 fputs(" DIST SGN", outfile);
2929 }
2930 if (block01) {
2931 if (!range_ct) {
2932 for (ulii = 0; ulii < attr_id_ct; ulii++) {
2933 putc_unlocked(' ', outfile);
2934 fputs(&(sorted_attr_ids[ulii * max_attr_id_len]), outfile);
2935 }
2936 } else {
2937 for (uii = 0; uii < unique_annot_ct; uii++) {
2938 putc_unlocked(' ', outfile);
2939 ujj = merged_attr_idx_buf[uii];
2940 if (ujj < 0x80000000U) {
2941 fputs(&(range_names[ujj * max_range_name_len + 4]), outfile);
2942 } else {
2943 fputs(&(sorted_attr_ids[(ujj & 0x7fffffff) * max_attr_id_len]), outfile);
2944 }
2945 }
2946 bigstack_end_reset(bigstack_end_mark);
2947 loadbuf_size = bigstack_left();
2948 if (loadbuf_size > MAXLINEBUFLEN) {
2949 loadbuf_size = MAXLINEBUFLEN;
2950 }
2951 loadbuf[loadbuf_size - 1] = ' ';
2952 }
2953 } else {
2954 fputs(" ANNOT", outfile);
2955 }
2956 putc_unlocked('\n', outfile);
2957 while (fgets(loadbuf, loadbuf_size, infile)) {
2958 line_idx++;
2959 if (!loadbuf[loadbuf_size - 1]) {
2960 if (loadbuf_size == MAXLINEBUFLEN) {
2961 sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, aip->fname);
2962 goto annotate_ret_INVALID_FORMAT_WW;
2963 } else {
2964 goto annotate_ret_NOMEM;
2965 }
2966 }
2967 bufptr = skip_initial_spaces(loadbuf);
2968 if (is_eoln_kns(*bufptr)) {
2969 continue;
2970 }
2971 bufptr = next_token_multz(bufptr, col_skips[0]);
2972 token_ptrs[col_sequence[0]] = bufptr;
2973 for (seq_idx = 1; seq_idx < token_ct; seq_idx++) {
2974 bufptr = next_token_mult(bufptr, col_skips[seq_idx]);
2975 token_ptrs[col_sequence[seq_idx]] = bufptr;
2976 }
2977 if (!bufptr) {
2978 continue;
2979 }
2980
2981 if (need_pos) {
2982 // CHR
2983 // can't use get_chrom_code_destructive() due to later
2984 // strchr(bufptr, '\0') call
2985 chrom_idx = get_chrom_code_counted(chrom_info_ptr, strlen_se(token_ptrs[0]), token_ptrs[0]);
2986 if (chrom_idx < 0) {
2987 continue;
2988 }
2989
2990 // BP
2991 if (scan_uint_defcap(token_ptrs[1], &cur_bp)) {
2992 continue;
2993 }
2994
2995 if (chrom_filter_bounds) {
2996 ulii = chrom_filter_bounds[((uint32_t)chrom_idx) + 1];
2997 for (range_idx = chrom_filter_bounds[(uint32_t)chrom_idx]; range_idx < ulii; range_idx++) {
2998 if (in_setdef(filter_rangedefs[range_idx], cur_bp)) {
2999 break;
3000 }
3001 }
3002 if (range_idx == ulii) {
3003 continue;
3004 }
3005 }
3006 }
3007
3008 if (sorted_snplist) {
3009 if (bsearch_str(token_ptrs[2], strlen_se(token_ptrs[2]), sorted_snplist, max_snplist_id_len, snplist_ct) == -1) {
3010 continue;
3011 }
3012 }
3013
3014 // P
3015 if (do_pfilter) {
3016 if (scan_double(token_ptrs[3], &pval) || (!(pval <= pfilter))) {
3017 continue;
3018 }
3019 }
3020
3021 abs_min_dist = 0x80000000U;
3022 if (!block01) {
3023 wptr = writebuf;
3024 if (chrom_bounds) {
3025 ulii = chrom_bounds[((uint32_t)chrom_idx) + 1];
3026 if (!border) {
3027 for (range_idx = chrom_bounds[(uint32_t)chrom_idx]; range_idx < ulii; range_idx++) {
3028 if (in_setdef(rangedefs[range_idx], cur_bp)) {
3029 wptr = strcpya(wptr, &(range_names[range_idx * max_range_name_len + 4]));
3030 if (range_dist) {
3031 wptr = memcpya(wptr, "(0)|", 4);
3032 } else {
3033 *wptr++ = '|';
3034 }
3035 }
3036 }
3037 if (wptr != writebuf) {
3038 abs_min_dist = 0;
3039 }
3040 } else {
3041 for (range_idx = chrom_bounds[(uint32_t)chrom_idx]; range_idx < ulii; range_idx++) {
3042 if (in_setdef_dist(rangedefs[range_idx], cur_bp, border, &ii)) {
3043 ujj = abs(ii);
3044 if (ujj < abs_min_dist) {
3045 abs_min_dist = ujj;
3046 min_dist = ii;
3047 }
3048 wptr = strcpya(wptr, &(range_names[range_idx * max_range_name_len + 4]));
3049 if (range_dist) {
3050 if (ii == 0) {
3051 wptr = memcpya(wptr, "(0)|", 4);
3052 } else {
3053 *wptr++ = '(';
3054 if (ii > 0) {
3055 *wptr++ = '+';
3056 }
3057 wptr = dtoa_g_wxp4(((double)ii) * 0.001, 1, wptr);
3058 wptr = memcpya(wptr, "kb)|", 4);
3059 }
3060 } else {
3061 *wptr++ = '|';
3062 }
3063 }
3064 }
3065 }
3066 }
3067 if (sorted_snplist_attr_ids) {
3068 bufptr2 = token_ptrs[2];
3069 slen = (uintptr_t)(token_endnn(bufptr2) - bufptr2);
3070 sorted_idx = bsearch_str(bufptr2, slen, sorted_snplist_attr_ids, max_snplist_attr_id_len, snplist_attr_ct);
3071 if (sorted_idx != -1) {
3072 ulptr = &(attr_bitfields[((uint32_t)sorted_idx) * attr_id_ctl]);
3073 for (uii = 0; uii < attr_id_ct; uii += BITCT) {
3074 ulii = *ulptr++;
3075 while (ulii) {
3076 ujj = CTZLU(ulii);
3077 wptr = strcpyax(wptr, &(sorted_attr_ids[(uii + ujj) * max_attr_id_len]), '|');
3078 ulii &= ulii - 1;
3079 }
3080 }
3081 }
3082 }
3083 at_least_one_annot = (wptr != writebuf);
3084 if (at_least_one_annot) {
3085 wptr--;
3086 annot_row_ct++;
3087 } else {
3088 if (prune) {
3089 continue;
3090 }
3091 wptr = strcpya(wptr, no_annot_str);
3092 }
3093 } else {
3094 at_least_one_annot = 0;
3095 if (chrom_bounds) {
3096 ulii = chrom_bounds[((uint32_t)chrom_idx) + 1];
3097 if (!border) {
3098 for (range_idx = chrom_bounds[(uint32_t)chrom_idx]; range_idx < ulii; range_idx++) {
3099 if (in_setdef(rangedefs[range_idx], cur_bp)) {
3100 writebuf[range_idx_lookup[range_idx]] = '1';
3101 at_least_one_annot = 1;
3102 }
3103 }
3104 if (at_least_one_annot) {
3105 abs_min_dist = 0;
3106 }
3107 } else if (!track_distance) {
3108 for (range_idx = chrom_bounds[(uint32_t)chrom_idx]; range_idx < ulii; range_idx++) {
3109 if (in_setdef(rangedefs[range_idx], cur_bp)) {
3110 writebuf[range_idx_lookup[range_idx]] = '1';
3111 at_least_one_annot = 1;
3112 }
3113 }
3114 } else {
3115 for (range_idx = chrom_bounds[(uint32_t)chrom_idx]; range_idx < ulii; range_idx++) {
3116 if (in_setdef_dist(rangedefs[range_idx], cur_bp, border, &ii)) {
3117 ujj = abs(ii);
3118 if (ujj < abs_min_dist) {
3119 abs_min_dist = ujj;
3120 min_dist = ii;
3121 }
3122 writebuf[range_idx_lookup[range_idx]] = '1';
3123 at_least_one_annot = 1;
3124 }
3125 }
3126 }
3127 }
3128 if (sorted_snplist_attr_ids) {
3129 bufptr2 = token_ptrs[2];
3130 slen = (uintptr_t)(token_endnn(bufptr2) - bufptr2);
3131 sorted_idx = bsearch_str(bufptr2, slen, sorted_snplist_attr_ids, max_snplist_attr_id_len, snplist_attr_ct);
3132 if (sorted_idx != -1) {
3133 ulptr = &(attr_bitfields[((uint32_t)sorted_idx) * attr_id_ctl]);
3134 for (uii = 0; uii < attr_id_ct; uii += BITCT) {
3135 ulii = *ulptr++;
3136 if (ulii) {
3137 do {
3138 ujj = CTZLU(ulii);
3139 writebuf[attr_id_remap[uii + ujj]] = '1';
3140 ulii &= ulii - 1;
3141 } while (ulii);
3142 at_least_one_annot = 1;
3143 }
3144 }
3145 }
3146 }
3147
3148 if (at_least_one_annot) {
3149 annot_row_ct++;
3150 } else if (prune) {
3151 continue;
3152 }
3153 }
3154 bufptr = strchr(bufptr, '\0');
3155 if (bufptr[-1] == '\n') {
3156 bufptr--;
3157 if (bufptr[-1] == '\r') {
3158 bufptr--;
3159 }
3160 }
3161 total_row_ct++;
3162 if (fwrite_checked(loadbuf, bufptr - loadbuf, outfile)) {
3163 goto annotate_ret_WRITE_FAIL;
3164 }
3165 if (track_distance) {
3166 if (abs_min_dist != 0x80000000U) {
3167 bufptr2 = width_force(12, g_textbuf, dtoa_g(((double)((int32_t)abs_min_dist)) * 0.001, g_textbuf));
3168 if (!abs_min_dist) {
3169 bufptr2 = memcpya(bufptr2, no_sign_str, 4);
3170 } else {
3171 bufptr2 = memseta(bufptr2, 32, 3);
3172 *bufptr2++ = (min_dist > 0)? '+' : '-';
3173 }
3174 } else {
3175 bufptr2 = memseta(g_textbuf, 32, 8);
3176 bufptr2 = memcpya(bufptr2, no_sign_str, 4);
3177 bufptr2 = memcpya(bufptr2, no_sign_str, 4);
3178 }
3179 if (fwrite_checked(g_textbuf, bufptr2 - g_textbuf, outfile)) {
3180 goto annotate_ret_WRITE_FAIL;
3181 }
3182 }
3183 putc_unlocked(' ', outfile);
3184 if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
3185 goto annotate_ret_WRITE_FAIL;
3186 }
3187
3188 putc_unlocked('\n', outfile);
3189 if (block01 && at_least_one_annot) {
3190 // reinitialize
3191 ulptr = (uintptr_t*)writebuf;
3192 for (ulii = 0; ulii < unique_annot_ctlw; ulii++) {
3193 #ifdef __LP64__
3194 *ulptr++ = 0x3020302030203020LLU;
3195 #else
3196 *ulptr++ = 0x30203020;
3197 #endif
3198 }
3199 }
3200 }
3201 if (fclose_null(&infile)) {
3202 goto annotate_ret_READ_FAIL;
3203 }
3204 if (fclose_null(&outfile)) {
3205 goto annotate_ret_WRITE_FAIL;
3206 }
3207 if (!prune) {
3208 LOGPRINTFWW("--annotate: %" PRIuPTR " out of %" PRIuPTR " row%s annotated; new report written to %s .\n", annot_row_ct, total_row_ct, (total_row_ct == 1)? "" : "s", outname);
3209 } else {
3210 LOGPRINTFWW("--annotate: %" PRIuPTR " row%s annotated; new report written to %s .\n", total_row_ct, (total_row_ct == 1)? "" : "s", outname);
3211 }
3212 while (0) {
3213 annotate_ret_NOMEM:
3214 retval = RET_NOMEM;
3215 break;
3216 annotate_ret_OPEN_FAIL:
3217 retval = RET_OPEN_FAIL;
3218 break;
3219 annotate_ret_READ_FAIL:
3220 retval = RET_READ_FAIL;
3221 break;
3222 annotate_ret_WRITE_FAIL:
3223 retval = RET_WRITE_FAIL;
3224 break;
3225 annotate_ret_INVALID_FORMAT_WW:
3226 wordwrapb(0);
3227 logerrprintb();
3228 annotate_ret_INVALID_FORMAT:
3229 retval = RET_INVALID_FORMAT;
3230 break;
3231 }
3232 annotate_ret_1:
3233 bigstack_double_reset(bigstack_mark, bigstack_end_mark);
3234 fclose_cond(infile);
3235 gzclose_cond(gz_attribfile);
3236 fclose_cond(outfile);
3237 return retval;
3238 }
3239
gene_report(char * fname,char * glist,char * subset_fname,uint32_t border,uint32_t allow_extra_chroms,char * extractname,const char * snp_field,char * outname,char * outname_end,double pfilter,Chrom_info * chrom_info_ptr)3240 int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t border, uint32_t allow_extra_chroms, char* extractname, const char* snp_field, char* outname, char* outname_end, double pfilter, Chrom_info* chrom_info_ptr) {
3241 // similar to define_sets() and --clump
3242 unsigned char* bigstack_mark = g_bigstack_base;
3243 unsigned char* bigstack_end_mark = g_bigstack_end;
3244 FILE* infile = nullptr;
3245 FILE* outfile = nullptr;
3246 uintptr_t subset_ct = 0;
3247 uintptr_t max_subset_id_len = 0;
3248 uintptr_t extract_ct = 0;
3249 uintptr_t max_extract_id_len = 0;
3250 const char constsnpstr[] = "SNP";
3251 char* sorted_subset_ids = nullptr;
3252 char* sorted_extract_ids = nullptr;
3253 uintptr_t* chrom_bounds = nullptr;
3254 uint32_t** genedefs = nullptr;
3255 uint64_t saved_line_ct = 0;
3256 uint32_t do_pfilter = (pfilter != 2.0);
3257 uint32_t token_ct = 2 + (extractname != nullptr) + do_pfilter;
3258 uint32_t snp_field_len = 0;
3259 uint32_t col_idx = 0;
3260 uint32_t seq_idx = 0;
3261 uint32_t cur_bp = 0;
3262 uint32_t found_header_bitfield = 0;
3263 int32_t retval = 0;
3264
3265 // see --annotate comment on col_skips.
3266 char* token_ptrs[4];
3267 uint32_t col_skips[4];
3268 uint32_t col_sequence[4];
3269 uint64_t* gene_match_list_end;
3270 uint64_t* gene_match_list;
3271 uint64_t* gene_match_list_ptr;
3272 uint32_t* gene_chridx_to_nameidx;
3273 uint32_t* gene_nameidx_to_chridx;
3274 uint32_t* uiptr;
3275 char** line_lookup;
3276 char* gene_names;
3277 char* loadbuf;
3278 char* header_ptr;
3279 char* first_line_ptr;
3280 char* linebuf_top;
3281 char* bufptr;
3282 char* bufptr2;
3283 uintptr_t gene_ct;
3284 uintptr_t max_gene_name_len;
3285 uintptr_t chrom_max_gene_ct;
3286 uintptr_t loadbuf_size;
3287 uintptr_t gene_idx;
3288 uintptr_t linebuf_left;
3289 uintptr_t line_idx;
3290 uintptr_t ulii;
3291 uint64_t ullii;
3292 double pval;
3293 uint32_t slen;
3294 uint32_t header_len;
3295 uint32_t range_idx;
3296 uint32_t range_ct;
3297 uint32_t uii;
3298 uint32_t ujj;
3299 int32_t chrom_idx;
3300 if (subset_fname) {
3301 if (fopen_checked(subset_fname, FOPEN_RB, &infile)) {
3302 goto gene_report_ret_OPEN_FAIL;
3303 }
3304 retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &subset_ct, &max_subset_id_len);
3305 if (retval) {
3306 if (retval == RET_INVALID_FORMAT) {
3307 logerrprint("Error: Pathologically long token in --gene-subset file.\n");
3308 }
3309 goto gene_report_ret_1;
3310 }
3311 if (!subset_ct) {
3312 logerrprint("Error: --gene-subset file is empty.\n");
3313 goto gene_report_ret_INVALID_FORMAT;
3314 }
3315 if (max_subset_id_len > MAX_ID_BLEN) {
3316 logerrprint("Error: --gene-subset IDs are limited to " MAX_ID_SLEN_STR " characters.\n");
3317 goto gene_report_ret_INVALID_FORMAT;
3318 }
3319 if (bigstack_end_alloc_c(subset_ct * max_subset_id_len, &sorted_subset_ids)) {
3320 goto gene_report_ret_NOMEM;
3321 }
3322 rewind(infile);
3323 retval = read_tokens(MAXLINELEN, subset_ct, max_subset_id_len, infile, g_textbuf, sorted_subset_ids);
3324 if (retval) {
3325 goto gene_report_ret_1;
3326 }
3327 if (fclose_null(&infile)) {
3328 goto gene_report_ret_READ_FAIL;
3329 }
3330 qsort(sorted_subset_ids, subset_ct, max_subset_id_len, strcmp_casted);
3331 subset_ct = collapse_duplicate_ids(sorted_subset_ids, subset_ct, max_subset_id_len, nullptr);
3332 }
3333 if (extractname) {
3334 if (fopen_checked(extractname, FOPEN_RB, &infile)) {
3335 goto gene_report_ret_OPEN_FAIL;
3336 }
3337 retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &extract_ct, &max_extract_id_len);
3338 if (retval) {
3339 goto gene_report_ret_1;
3340 }
3341 if (!extract_ct) {
3342 logerrprint("Error: Empty --extract file.\n");
3343 goto gene_report_ret_INVALID_FORMAT;
3344 }
3345 if (max_extract_id_len > MAX_ID_BLEN) {
3346 logerrprint("Error: --extract IDs are limited to " MAX_ID_SLEN_STR " characters.\n");
3347 goto gene_report_ret_INVALID_FORMAT;
3348 }
3349 if (bigstack_alloc_c(extract_ct * max_extract_id_len, &sorted_extract_ids)) {
3350 goto gene_report_ret_NOMEM;
3351 }
3352 rewind(infile);
3353 // todo: switch to hash table to avoid sort
3354 retval = read_tokens(MAXLINELEN, extract_ct, max_extract_id_len, infile, g_textbuf, sorted_extract_ids);
3355 if (retval) {
3356 goto gene_report_ret_1;
3357 }
3358 if (fclose_null(&infile)) {
3359 goto gene_report_ret_READ_FAIL;
3360 }
3361 qsort(sorted_extract_ids, extract_ct, max_extract_id_len, strcmp_casted);
3362 ulii = collapse_duplicate_ids(sorted_extract_ids, extract_ct, max_extract_id_len, nullptr);
3363 if (ulii < extract_ct) {
3364 extract_ct = ulii;
3365 bigstack_shrink_top(sorted_extract_ids, extract_ct * max_extract_id_len);
3366 }
3367 }
3368 if (allow_extra_chroms) {
3369 // ugh, silly for this to require 'file' at the end while load_range_list()
3370 // does not
3371 retval = scrape_extra_chroms(glist, "--gene-report file", chrom_info_ptr);
3372 if (retval) {
3373 goto gene_report_ret_1;
3374 }
3375 }
3376 retval = load_range_list_sortpos(glist, 0, subset_ct, sorted_subset_ids, max_subset_id_len, chrom_info_ptr, &gene_ct, &gene_names, &max_gene_name_len, &chrom_bounds, &genedefs, &chrom_max_gene_ct, "--gene-report");
3377 if (retval) {
3378 goto gene_report_ret_1;
3379 }
3380 #ifdef __LP64__
3381 if (gene_ct > 0x80000000LLU) {
3382 sprintf(g_logbuf, "Error: Too many genes in %s (max 2147483648).\n", glist);
3383 goto gene_report_ret_INVALID_FORMAT_WW;
3384 }
3385 #endif
3386
3387 bigstack_end_reset(bigstack_end_mark);
3388 // gene_names is sorted primarily by chromosome index, and secondarily by
3389 // gene name. Final output will be the other way around, so we need a
3390 // remapping table.
3391 // This logic needs to change a bit if support for unplaced contigs is added
3392 // or MAX_CHROM_TEXTNUM_SLEN changes.
3393 if (bigstack_alloc_ui(gene_ct, &gene_chridx_to_nameidx) ||
3394 bigstack_alloc_ui(gene_ct, &gene_nameidx_to_chridx) ||
3395 bigstack_alloc_c(gene_ct * max_gene_name_len, &loadbuf)) {
3396 goto gene_report_ret_NOMEM;
3397 }
3398 for (gene_idx = 0; gene_idx < gene_ct; gene_idx++) {
3399 gene_nameidx_to_chridx[gene_idx] = gene_idx;
3400 }
3401 for (gene_idx = 0; gene_idx < gene_ct; gene_idx++) {
3402 bufptr = &(gene_names[gene_idx * max_gene_name_len]);
3403 slen = strlen(&(bufptr[4]));
3404 bufptr2 = memcpyax(&(loadbuf[gene_idx * max_gene_name_len]), &(bufptr[4]), slen, ' ');
3405 memcpyx(bufptr2, &(bufptr[2]), 2, '\0');
3406 }
3407 if (qsort_ext(loadbuf, gene_ct, max_gene_name_len, strcmp_natural_deref, (char*)gene_nameidx_to_chridx, sizeof(int32_t))) {
3408 goto gene_report_ret_NOMEM;
3409 }
3410 // We also need to perform the reverse lookup.
3411 for (gene_idx = 0; gene_idx < gene_ct; gene_idx++) {
3412 gene_chridx_to_nameidx[gene_nameidx_to_chridx[gene_idx]] = gene_idx;
3413 }
3414 bigstack_reset((unsigned char*)loadbuf);
3415
3416 linebuf_left = bigstack_left();
3417 if (linebuf_left < MAXLINELEN + 64) {
3418 goto gene_report_ret_NOMEM;
3419 }
3420 // mirror g_bigstack_base/g_bigstack_end since we'll be doing
3421 // nonstandard-size allocations
3422 linebuf_top = (char*)g_bigstack_base;
3423 gene_match_list_end = (uint64_t*)g_bigstack_end;
3424 gene_match_list = gene_match_list_end;
3425
3426 header_ptr = linebuf_top;
3427 loadbuf = memcpya(header_ptr, "kb ) ", 5);
3428 if (border) {
3429 loadbuf = memcpya(loadbuf, " plus ", 6);
3430 loadbuf = dtoa_g(((double)((int32_t)border)) * 0.001, loadbuf);
3431 loadbuf = memcpya(loadbuf, "kb border ", 10);
3432 }
3433 loadbuf = memcpya(loadbuf, "\n\n DIST ", 15);
3434 linebuf_left -= (loadbuf - header_ptr);
3435 loadbuf_size = linebuf_left;
3436 if (loadbuf_size > MAXLINEBUFLEN) {
3437 loadbuf_size = MAXLINEBUFLEN;
3438 }
3439 uii = 3; // maximum relevant header length
3440 col_idx = 0;
3441 if (extractname) {
3442 if (snp_field) {
3443 snp_field_len = strlen(snp_field);
3444 if (snp_field_len > uii) {
3445 uii = snp_field_len;
3446 }
3447 } else {
3448 snp_field = constsnpstr;
3449 snp_field_len = 3;
3450 }
3451 }
3452 retval = open_and_load_to_first_token(&infile, fname, loadbuf_size, '\0', "--gene-report file", loadbuf, &bufptr, &line_idx);
3453 if (retval) {
3454 goto gene_report_ret_1;
3455 }
3456 do {
3457 bufptr2 = token_endnn(bufptr);
3458 slen = (uintptr_t)(bufptr2 - bufptr);
3459 if (slen <= uii) {
3460 if ((slen == 3) && (!memcmp(bufptr, "CHR", 3))) {
3461 ujj = 0;
3462 } else if ((slen == 2) && (!memcmp(bufptr, "BP", 2))) {
3463 ujj = 1;
3464 } else if ((slen == snp_field_len) && (!memcmp(bufptr, snp_field, snp_field_len))) {
3465 ujj = 2;
3466 } else if ((slen == do_pfilter) && (*bufptr == 'P')) {
3467 ujj = 3;
3468 } else {
3469 ujj = 4;
3470 }
3471 if (ujj != 4) {
3472 if ((found_header_bitfield >> ujj) & 1) {
3473 *bufptr2 = '\0';
3474 sprintf(g_logbuf, "Error: Duplicate column header '%s' in %s.\n", bufptr, fname);
3475 goto gene_report_ret_INVALID_FORMAT_WW;
3476 }
3477 found_header_bitfield |= 1 << ujj;
3478 col_skips[seq_idx] = col_idx;
3479 col_sequence[seq_idx++] = ujj;
3480 }
3481 }
3482 bufptr = skip_initial_spaces(bufptr2);
3483 col_idx++;
3484 } while (!is_eoln_kns(*bufptr));
3485 if (seq_idx != token_ct) {
3486 sprintf(g_logbuf, "Error: Missing column header%s in %s.\n", (seq_idx + 1 == token_ct)? "" : "s", fname);
3487 goto gene_report_ret_INVALID_FORMAT_WW;
3488 }
3489 // bugfix
3490 for (uii = seq_idx - 1; uii; --uii) {
3491 col_skips[uii] -= col_skips[uii - 1];
3492 }
3493 // assume *bufptr is now \n (if it isn't, header line is never written to
3494 // output anyway)
3495 header_len = 1 + (uintptr_t)(bufptr - header_ptr);
3496 linebuf_left -= header_len;
3497 linebuf_top = &(linebuf_top[header_len]);
3498 first_line_ptr = linebuf_top;
3499 while (1) {
3500 // store raw line contents at bottom of stack (growing upwards), and gene
3501 // matches at top of stack (8 byte entries, growing downwards). Worst
3502 // case, a line matches chrom_max_gene_ct genes, so we prevent the text
3503 // loading buffer from using the last chrom_max_gene_ct * 8 bytes of
3504 // workspace memory.
3505 if (linebuf_left <= chrom_max_gene_ct * 8 + 8 + (uintptr_t)(((char*)gene_match_list_end) - ((char*)gene_match_list)) + MAXLINELEN) {
3506 goto gene_report_ret_NOMEM;
3507 }
3508 loadbuf_size = linebuf_left - (chrom_max_gene_ct * 8) - 8 - (uintptr_t)(((char*)gene_match_list_end) - ((char*)gene_match_list));
3509 if (loadbuf_size > MAXLINEBUFLEN) {
3510 loadbuf_size = MAXLINEBUFLEN;
3511 }
3512 // leave 8 bytes to save line length and parsed position
3513 loadbuf = &(linebuf_top[8]);
3514 loadbuf[loadbuf_size - 1] = ' ';
3515 gene_report_load_loop:
3516 if (!fgets(loadbuf, loadbuf_size, infile)) {
3517 break;
3518 }
3519 line_idx++;
3520 if (!loadbuf[loadbuf_size - 1]) {
3521 goto gene_report_ret_LONG_LINE;
3522 }
3523 bufptr = skip_initial_spaces(loadbuf);
3524 if (is_eoln_kns(*bufptr)) {
3525 goto gene_report_load_loop;
3526 }
3527 bufptr = next_token_multz(bufptr, col_skips[0]);
3528 token_ptrs[col_sequence[0]] = bufptr;
3529 for (seq_idx = 1; seq_idx < token_ct; seq_idx++) {
3530 bufptr = next_token_mult(bufptr, col_skips[seq_idx]);
3531 token_ptrs[col_sequence[seq_idx]] = bufptr;
3532 }
3533 if (!bufptr) {
3534 goto gene_report_load_loop;
3535 }
3536
3537 // CHR
3538 chrom_idx = get_chrom_code_counted(chrom_info_ptr, strlen_se(token_ptrs[0]), token_ptrs[0]);
3539 if (chrom_idx < 0) {
3540 // todo: log warning?
3541 goto gene_report_load_loop;
3542 }
3543
3544 // BP
3545 if (scan_uint_defcap(token_ptrs[1], &cur_bp)) {
3546 goto gene_report_load_loop;
3547 }
3548
3549 // variant ID
3550 if (sorted_extract_ids) {
3551 bufptr2 = token_ptrs[2];
3552 if (bsearch_str(bufptr2, strlen_se(bufptr2), sorted_extract_ids, max_extract_id_len, extract_ct) == -1) {
3553 goto gene_report_load_loop;
3554 }
3555 }
3556
3557 // P
3558 if (do_pfilter) {
3559 if (scan_double(token_ptrs[3], &pval) || (!(pval <= pfilter))) {
3560 goto gene_report_load_loop;
3561 }
3562 }
3563
3564 ulii = chrom_bounds[((uint32_t)chrom_idx) + 1];
3565 gene_match_list_ptr = gene_match_list;
3566 if (cur_bp > border) {
3567 uii = cur_bp - border;
3568 } else {
3569 uii = 0;
3570 }
3571 ujj = cur_bp + border;
3572 for (gene_idx = chrom_bounds[(uint32_t)chrom_idx]; gene_idx < ulii; gene_idx++) {
3573 if (interval_in_setdef(genedefs[gene_idx], uii, ujj)) {
3574 *(--gene_match_list) = (((uint64_t)gene_chridx_to_nameidx[gene_idx]) << 32) | saved_line_ct;
3575 }
3576 }
3577 if (gene_match_list_ptr == gene_match_list) {
3578 goto gene_report_load_loop;
3579 }
3580
3581 slen = strlen(bufptr);
3582 // this could be a non-\n-terminated last line...
3583 if (bufptr[slen - 1] != '\n') {
3584 bufptr[slen++] = '\n';
3585 }
3586 slen += (uintptr_t)(bufptr - loadbuf);
3587 *((uint32_t*)linebuf_top) = slen;
3588 ((uint32_t*)linebuf_top)[1] = cur_bp;
3589 linebuf_left -= slen + 8;
3590 linebuf_top = &(linebuf_top[slen + 8]);
3591 #ifdef __LP64__
3592 if (saved_line_ct == 0x100000000LLU) {
3593 sprintf(g_logbuf, "Error: Too many valid lines in %s (--gene-report can only handle 4294967296).\n", fname);
3594 goto gene_report_ret_INVALID_FORMAT_WW;
3595 }
3596 #endif
3597 saved_line_ct++;
3598 }
3599 if (fclose_null(&infile)) {
3600 goto gene_report_ret_READ_FAIL;
3601 }
3602 // saved line index -> line contents lookup
3603 // allocate off far end since linebuf_top is not aligned at all
3604 if (linebuf_left < saved_line_ct * sizeof(intptr_t) + (uintptr_t)(((char*)gene_match_list_end) - ((char*)gene_match_list))) {
3605 goto gene_report_ret_NOMEM;
3606 }
3607 line_lookup = (char**)(&(((uintptr_t*)gene_match_list)[-((intptr_t)((uintptr_t)saved_line_ct))]));
3608 bufptr = first_line_ptr;
3609 for (uii = 0; uii < saved_line_ct; uii++) {
3610 line_lookup[uii] = bufptr;
3611 bufptr = &(bufptr[(*((uint32_t*)bufptr)) + 8]);
3612 }
3613 #ifdef __cplusplus
3614 std::sort((int64_t*)gene_match_list, (int64_t*)gene_match_list_end);
3615 #else
3616 qsort((int64_t*)gene_match_list, (uintptr_t)(gene_match_list_end - gene_match_list), sizeof(int64_t), llcmp);
3617 #endif
3618
3619 memcpy(outname_end, ".range.report", 14);
3620 if (fopen_checked(outname, "w", &outfile)) {
3621 goto gene_report_ret_OPEN_FAIL;
3622 }
3623 ulii = ~ZEROLU; // current gene index
3624 while (gene_match_list < gene_match_list_end) {
3625 ullii = *gene_match_list++;
3626 if ((uintptr_t)(ullii >> 32) != ulii) {
3627 // new gene
3628 if (ulii != ~ZEROLU) {
3629 fputs("\n\n", outfile);
3630 }
3631 ulii = (uintptr_t)(ullii >> 32);
3632 gene_idx = gene_nameidx_to_chridx[ulii];
3633 bufptr = &(gene_names[gene_idx * max_gene_name_len]);
3634 fputs(&(bufptr[4]), outfile);
3635 // 53313 = ('0' * (1000 + 100 + 10)) + ('0' - 15)
3636 chrom_idx = ((unsigned char)bufptr[0]) * 1000 + ((unsigned char)bufptr[1]) * 100 + ((unsigned char)bufptr[2]) * 10 + ((unsigned char)bufptr[3]) - 53313;
3637 strcpy(g_textbuf, " -- ");
3638 // plink 1.07 explicitly precedes chromosome codes with "chr" here.
3639 // obviously "chrchr1" doesn't look right, and neither does
3640 // chr[contig name], so make the chr prefix conditional.
3641 bufptr = &(g_textbuf[4]);
3642 if ((!(chrom_info_ptr->output_encoding & CHR_OUTPUT_PREFIX)) && ((chrom_idx <= ((int32_t)chrom_info_ptr->max_code)) || chrom_info_ptr->zero_extra_chroms)) {
3643 bufptr = memcpyl3a(bufptr, "chr");
3644 }
3645 bufptr = chrom_name_write(chrom_info_ptr, chrom_idx, bufptr);
3646 *bufptr++ = ':';
3647 if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
3648 goto gene_report_ret_WRITE_FAIL;
3649 }
3650 uiptr = genedefs[gene_idx];
3651 range_ct = *uiptr++;
3652 ujj = 0; // gene length
3653 for (range_idx = 0; range_idx < range_ct; range_idx++) {
3654 if (range_idx) {
3655 fputs(", ", outfile);
3656 }
3657 cur_bp = *uiptr++;
3658 bufptr = uint32toa(cur_bp, g_textbuf);
3659 bufptr = memcpya(bufptr, "..", 2);
3660 uii = *uiptr++;
3661 bufptr = uint32toa(uii - 1, bufptr);
3662 fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
3663 ujj += uii - cur_bp;
3664 }
3665 bufptr = memcpyl3a(g_textbuf, " ( ");
3666 bufptr = dtoa_g(((double)((int32_t)ujj)) * 0.001, bufptr);
3667 fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
3668 if (fwrite_checked(header_ptr, header_len, outfile)) {
3669 goto gene_report_ret_WRITE_FAIL;
3670 }
3671 cur_bp = genedefs[gene_idx][1];
3672 }
3673 bufptr = line_lookup[(uint32_t)ullii];
3674 uii = *((uint32_t*)bufptr); // line length
3675 ujj = ((uint32_t*)bufptr)[1]; // bp
3676 bufptr2 = dtoa_g_wxp4(((double)((int32_t)(ujj - cur_bp))) * 0.001, 10, g_textbuf);
3677 bufptr2 = memcpyl3a(bufptr2, "kb ");
3678 fwrite(g_textbuf, 1, bufptr2 - g_textbuf, outfile);
3679 fwrite(&(bufptr[8]), 1, uii, outfile);
3680 }
3681 if (ulii != ~ZEROLU) {
3682 fputs("\n\n", outfile);
3683 }
3684 if (fclose_null(&outfile)) {
3685 goto gene_report_ret_WRITE_FAIL;
3686 }
3687 LOGPRINTFWW("--gene-report: gene-based report written to %s .\n", outname);
3688 while (0) {
3689 gene_report_ret_LONG_LINE:
3690 if (loadbuf_size == MAXLINEBUFLEN) {
3691 LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname);
3692 retval = RET_INVALID_FORMAT;
3693 break;
3694 }
3695 gene_report_ret_NOMEM:
3696 retval = RET_NOMEM;
3697 break;
3698 gene_report_ret_OPEN_FAIL:
3699 retval = RET_OPEN_FAIL;
3700 break;
3701 gene_report_ret_READ_FAIL:
3702 retval = RET_READ_FAIL;
3703 break;
3704 gene_report_ret_WRITE_FAIL:
3705 retval = RET_WRITE_FAIL;
3706 break;
3707 gene_report_ret_INVALID_FORMAT_WW:
3708 wordwrapb(0);
3709 logerrprintb();
3710 gene_report_ret_INVALID_FORMAT:
3711 retval = RET_INVALID_FORMAT;
3712 break;
3713 }
3714 gene_report_ret_1:
3715 bigstack_double_reset(bigstack_mark, bigstack_end_mark);
3716 fclose_cond(infile);
3717 fclose_cond(outfile);
3718 return retval;
3719 }
3720