1 // This file is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License as published by the Free
6 // Software Foundation, either version 3 of the License, or (at your option)
7 // any later version.
8 //
9 // This program is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12 // more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 
17 
18 #include "plink2_set.h"
19 
20 #ifdef __cplusplus
21 namespace plink2 {
22 #endif
23 
24 typedef struct MakeSetRangeStruct {
25   NONCOPYABLE(MakeSetRangeStruct);
26   struct MakeSetRangeStruct* next;
27   uint32_t uidx_start;
28   uint32_t uidx_end;
29 } MakeSetRange;
30 
31 static_assert(kMaxChrCodeDigits == 5, "LoadIntervalBed() must be updated.");
LoadIntervalBed(const ChrInfo * cip,const uint32_t * variant_bps,const char * sorted_subset_ids,const char * file_descrip,uint32_t zero_based,uint32_t track_set_names,uint32_t border_extend,uint32_t fail_on_no_sets,uint32_t c_prefix,uintptr_t subset_ct,uintptr_t max_subset_id_blen,TextStream * txsp,uintptr_t * set_ct_ptr,char ** set_names_ptr,uintptr_t * max_set_id_blen_ptr,uint64_t ** range_sort_buf_ptr,MakeSetRange *** make_set_range_arr_ptr)32 PglErr LoadIntervalBed(const ChrInfo* cip, const uint32_t* variant_bps, const char* sorted_subset_ids, const char* file_descrip, uint32_t zero_based, uint32_t track_set_names, uint32_t border_extend, uint32_t fail_on_no_sets, uint32_t c_prefix, uintptr_t subset_ct, uintptr_t max_subset_id_blen, TextStream* txsp, uintptr_t* set_ct_ptr, char** set_names_ptr, uintptr_t* max_set_id_blen_ptr, uint64_t** range_sort_buf_ptr, MakeSetRange*** make_set_range_arr_ptr) {
33   // In plink 1.9, this was named load_range_list() and called directly by
34   // ExtractExcludeRange(), define_sets(), and indirectly by annotate(),
35   // gene_report(), and clump_reports().  That function required set IDs in
36   // column 4, and interpreted column 5 as a set "group label".
37   // However, column 5 wasn't used very often, and without it, we're free to
38   // generalize this function to point at UCSC interval-BED files.
39   //
40   // Assumes caller will reset g_bigstack_end later.
41   PglErr reterr = kPglRetSuccess;
42   {
43     LlStr* make_set_ll = nullptr;
44     char* set_names = nullptr;
45     uintptr_t set_ct = 0;
46     uintptr_t max_set_id_blen = 0;
47     // if we need to track set names, put together a sorted list
48     if (track_set_names) {
49       uintptr_t line_idx = 1;
50       for (char* line_iter = TextLineEnd(txsp); TextGetUnsafe(txsp, &line_iter); ++line_idx) {
51         char* line_start = line_iter;
52         char* first_token_end = CurTokenEnd(line_start);
53         char* cur_set_id = NextTokenMult(first_token_end, 3);
54         char* last_token = cur_set_id;
55         if (unlikely(NoMoreTokensKns(last_token))) {
56           snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, file_descrip);
57           goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
58         }
59         const uint32_t chr_name_slen = first_token_end - line_start;
60         *first_token_end = '\0';
61         const uint32_t cur_chr_code = GetChrCode(line_start, cip, chr_name_slen);
62         if (IsI32Neg(cur_chr_code)) {
63           // kludge (21 Jan 2020): --extract/exclude range should not error out
64           // if a line mentions a chromosome code not in the dataset.
65           // TODO: better condition for skipping this line.  (not totally
66           // trivial since some future callers may need to track empty sets)
67           if (likely((!set_ct_ptr) && (cur_chr_code == UINT32_MAX))) {
68             continue;
69           }
70           snprintf(g_logbuf, kLogbufSize, "Error: Invalid chromosome code on line %" PRIuPTR " of %s.\n", line_idx, file_descrip);
71           goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
72         }
73         // chr_mask check removed, we want to track empty sets
74         uint32_t set_id_slen = strlen_se(cur_set_id);
75         // we're about to possibly clobber \n, so advance line_iter now
76         line_iter = AdvPastDelim(&(cur_set_id[set_id_slen]), '\n');
77         cur_set_id[set_id_slen] = '\0';
78         if (subset_ct) {
79           if (bsearch_str(cur_set_id, sorted_subset_ids, set_id_slen, max_subset_id_blen, subset_ct) == -1) {
80             continue;
81           }
82         }
83         // when there are repeats, they are likely to be next to each other
84         if (make_set_ll && strequal_overread(make_set_ll->str, last_token)) {
85           continue;
86         }
87         uint32_t set_id_blen = set_id_slen + 1;
88         // argh, --clump counts positional overlaps which don't include any
89         // variants in the dataset.  So we prefix set IDs with a chromosome
90         // index in that case (with leading zeroes) and treat cross-chromosome
91         // sets as distinct.
92         if (!variant_bps) {
93           set_id_blen += kMaxChrCodeDigits;
94         }
95         if (set_id_blen > max_set_id_blen) {
96           max_set_id_blen = set_id_blen;
97         }
98         LlStr* ll_tmp;
99         if (unlikely(bigstack_end_alloc_llstr(set_id_blen, &ll_tmp))) {
100           goto LoadIntervalBed_ret_NOMEM;
101         }
102         ll_tmp->next = make_set_ll;
103         if (variant_bps) {
104           memcpy(ll_tmp->str, last_token, set_id_blen);
105         } else {
106           u32toa_z5(cur_chr_code, ll_tmp->str);
107           // if first character of gene name is a digit, natural sort has
108           // strange effects unless we force [3] to be nonnumeric...
109           ll_tmp->str[kMaxChrCodeDigits - 1] -= 15;
110           memcpy(&(ll_tmp->str[kMaxChrCodeDigits]), last_token, set_id_blen - kMaxChrCodeDigits);
111         }
112         make_set_ll = ll_tmp;
113         ++set_ct;
114       }
115       if (unlikely(TextStreamErrcode2(txsp, &reterr))) {
116         goto LoadIntervalBed_ret_TSTREAM_FAIL;
117       }
118       if (!set_ct) {
119         if (unlikely(fail_on_no_sets)) {
120           if (variant_bps) {
121             logerrputs("Error: All variants excluded by --gene[-all], since no sets were defined from\n--make-set file.\n");
122             reterr = kPglRetMalformedInput;
123             goto LoadIntervalBed_ret_1;
124           } else {
125             if (subset_ct) {
126               logerrputs("Error: No --gene-subset genes present in --gene-report file.\n");
127               reterr = kPglRetInconsistentInput;
128             } else {
129               logerrputs("Error: Empty --gene-report file.\n");
130               reterr = kPglRetMalformedInput;
131             }
132             goto LoadIntervalBed_ret_1;
133           }
134         }
135         logerrprintfww("Warning: No valid ranges in %s.\n", file_descrip);
136         goto LoadIntervalBed_ret_1;
137       }
138       // c_prefix is 0 or 2
139       max_set_id_blen += c_prefix;
140       if (unlikely(max_set_id_blen > kMaxIdBlen)) {
141         logerrputs("Error: Set IDs are limited to " MAX_ID_SLEN_STR " characters.\n");
142         goto LoadIntervalBed_ret_MALFORMED_INPUT;
143       }
144       const char** strptr_arr;
145       if (unlikely(
146               bigstack_alloc_c(set_ct * max_set_id_blen, set_names_ptr) ||
147               bigstack_alloc_kcp(set_ct, &strptr_arr))) {
148         goto LoadIntervalBed_ret_NOMEM;
149       }
150       set_names = *set_names_ptr;
151       for (uintptr_t set_idx = 0; set_idx != set_ct; ++set_idx) {
152         strptr_arr[set_idx] = make_set_ll->str;
153         make_set_ll = make_set_ll->next;
154       }
155       StrptrArrNsort(set_ct, strptr_arr);
156       set_ct = CopyAndDedupSortedStrptrsToStrbox(strptr_arr, set_ct, max_set_id_blen, &(set_names[c_prefix]));
157       if (c_prefix) {
158         for (uintptr_t set_idx = 0; set_idx != set_ct; ++set_idx) {
159           memcpy_k(&(set_names[set_idx * max_set_id_blen]), "C_", 2);
160         }
161       }
162       BigstackShrinkTop(set_names, set_ct * max_set_id_blen);
163       reterr = TextRewind(txsp);
164       if (unlikely(reterr)) {
165         goto LoadIntervalBed_ret_TSTREAM_FAIL;
166       }
167     } else {
168       set_ct = 1;
169     }
170     MakeSetRange** make_set_range_arr = S_CAST(MakeSetRange**, bigstack_end_alloc(set_ct * sizeof(intptr_t)));
171     if (unlikely(!make_set_range_arr)) {
172       goto LoadIntervalBed_ret_NOMEM;
173     }
174     ZeroPtrArr(set_ct, make_set_range_arr);
175     uintptr_t line_idx = 0;
176     uint32_t chr_start = 0;
177     uint32_t chr_end = 0;
178     for (char* line_iter = &(TextLineEnd(txsp)[-1]); ; line_iter = AdvToDelim(line_iter, '\n')) {
179     LoadIntervalBed_LINE_ITER_ALREADY_ADVANCED:
180       ++line_iter;
181       ++line_idx;
182       reterr = TextGetUnsafe(txsp, &line_iter);
183       if (reterr) {
184         if (likely(reterr == kPglRetEof)) {
185           if (unlikely(track_set_names && (line_idx == 1))) {
186             goto LoadIntervalBed_ret_REWIND_FAIL;
187           }
188           reterr = kPglRetSuccess;
189           break;
190         }
191         goto LoadIntervalBed_ret_TSTREAM_FAIL;
192       }
193       char* line_start = line_iter;
194       char* first_token_end = CurTokenEnd(line_start);
195       char* last_token = NextTokenMult(first_token_end, 2 + track_set_names);
196       if (unlikely(NoMoreTokensKns(last_token))) {
197         snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, file_descrip);
198         goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
199       }
200       const uint32_t chr_name_slen = first_token_end - line_start;
201       *first_token_end = '\0';
202       const uint32_t cur_chr_code = GetChrCode(line_start, cip, chr_name_slen);
203       if (IsI32Neg(cur_chr_code)) {
204         if (likely((!set_ct_ptr) && (cur_chr_code == UINT32_MAX))) {
205           continue;
206         }
207         snprintf(g_logbuf, kLogbufSize, "Error: Invalid chromosome code on line %" PRIuPTR " of %s.\n", line_idx, file_descrip);
208         goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
209       }
210       line_iter = CurTokenEnd(last_token);
211       if (!IsSet(cip->chr_mask, cur_chr_code)) {
212         continue;
213       }
214       if (variant_bps) {
215         const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[cur_chr_code];
216         chr_start = cip->chr_fo_vidx_start[chr_fo_idx];
217         chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
218         if (chr_end == chr_start) {
219           continue;
220         }
221         // (track_set_names must be true if subset_ct is nonzero)
222         // might need to move this outside the if-statement later
223         if (subset_ct && (bsearch_str(last_token, sorted_subset_ids, strlen_se(last_token), max_subset_id_blen, subset_ct) == -1)) {
224           continue;
225         }
226       }
227       const char* linebuf_iter = FirstNonTspace(&(first_token_end[1]));
228       uint32_t range_first;
229       if (unlikely(ScanmovUintDefcap(&linebuf_iter, &range_first))) {
230         snprintf(g_logbuf, kLogbufSize, "Error: Invalid range start position on line %" PRIuPTR " of %s.\n", line_idx, file_descrip);
231         goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
232       }
233       range_first += zero_based;
234       linebuf_iter = NextToken(linebuf_iter);
235       uint32_t range_last;
236       if (unlikely(ScanmovUintDefcap(&linebuf_iter, &range_last))) {
237         snprintf(g_logbuf, kLogbufSize, "Error: Invalid range end position on line %" PRIuPTR " of %s.\n", line_idx, file_descrip);
238         goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
239       }
240       if (unlikely(range_last < range_first)) {
241         snprintf(g_logbuf, kLogbufSize, "Error: Range end position smaller than range start on line %" PRIuPTR " of %s.\n", line_idx, file_descrip);
242         goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
243       }
244       if (border_extend > range_first) {
245         range_first = 0;
246       } else {
247         range_first -= border_extend;
248       }
249       range_last += border_extend;
250       const uint32_t last_token_slen = line_iter - last_token;
251       // about to potentially clobber \n, advance now
252       line_iter = AdvToDelim(line_iter, '\n');
253       uint32_t cur_set_idx = 0;
254       if (set_ct > 1) {
255         // bugfix: bsearch_str_natural requires null-terminated string
256         last_token[last_token_slen] = '\0';
257         if (c_prefix) {
258           last_token = &(last_token[-2]);
259           memcpy_k(last_token, "C_", 2);
260         } else if (!variant_bps) {
261           last_token = &(last_token[-S_CAST(int32_t, kMaxChrCodeDigits)]);
262           u32toa_z5(cur_chr_code, last_token);
263           last_token[kMaxChrCodeDigits - 1] -= 15;
264         }
265         // this should never fail
266         cur_set_idx = bsearch_str_natural(last_token, set_names, max_set_id_blen, set_ct);
267       }
268       if (variant_bps) {
269         // translate to within-chromosome uidx
270         range_first = CountSortedSmallerU32(&(variant_bps[chr_start]), chr_end - chr_start, range_first);
271         range_last = CountSortedSmallerU32(&(variant_bps[chr_start]), chr_end - chr_start, range_last + 1);
272         if (range_last > range_first) {
273           MakeSetRange* msr_tmp = S_CAST(MakeSetRange*, bigstack_end_alloc(sizeof(MakeSetRange)));
274           if (unlikely(!msr_tmp)) {
275             goto LoadIntervalBed_ret_NOMEM;
276           }
277           msr_tmp->next = make_set_range_arr[cur_set_idx];
278           // normally, I'd keep chr_idx here since that enables by-chromosome
279           // sorting, but that's probably not worth bloating MakeSetRange
280           // from 16 to 32 bytes
281           msr_tmp->uidx_start = chr_start + range_first;
282           msr_tmp->uidx_end = chr_start + range_last;
283           make_set_range_arr[cur_set_idx] = msr_tmp;
284         }
285       } else {
286         MakeSetRange* msr_tmp = S_CAST(MakeSetRange*, bigstack_end_alloc(sizeof(MakeSetRange)));
287         if (unlikely(!msr_tmp)) {
288           goto LoadIntervalBed_ret_NOMEM;
289         }
290         msr_tmp->next = make_set_range_arr[cur_set_idx];
291         msr_tmp->uidx_start = range_first;
292         msr_tmp->uidx_end = range_last + 1;
293         make_set_range_arr[cur_set_idx] = msr_tmp;
294       }
295       goto LoadIntervalBed_LINE_ITER_ALREADY_ADVANCED;
296     }
297     // allocate buffer for sorting ranges later
298     uint32_t max_set_range_ct = 0;
299     for (uint32_t set_idx = 0; set_idx != set_ct; ++set_idx) {
300       uint32_t cur_set_range_ct = 0;
301       MakeSetRange* msr_tmp = make_set_range_arr[set_idx];
302       while (msr_tmp) {
303         ++cur_set_range_ct;
304         msr_tmp = msr_tmp->next;
305       }
306       if (cur_set_range_ct > max_set_range_ct) {
307         max_set_range_ct = cur_set_range_ct;
308       }
309     }
310     if (range_sort_buf_ptr) {
311       if (unlikely(bigstack_end_alloc_u64(max_set_range_ct, range_sort_buf_ptr))) {
312         goto LoadIntervalBed_ret_NOMEM;
313       }
314     }
315     if (set_ct_ptr) {
316       *set_ct_ptr = set_ct;
317     }
318     if (max_set_id_blen_ptr) {
319       *max_set_id_blen_ptr = max_set_id_blen;
320     }
321     *make_set_range_arr_ptr = make_set_range_arr;
322   }
323   while (0) {
324   LoadIntervalBed_ret_NOMEM:
325     reterr = kPglRetNomem;
326     break;
327   LoadIntervalBed_ret_REWIND_FAIL:
328     logerrprintfww(kErrprintfRewind, file_descrip);
329     reterr = kPglRetRewindFail;
330     break;
331   LoadIntervalBed_ret_TSTREAM_FAIL:
332     TextStreamErrPrint(file_descrip, txsp);
333     break;
334   LoadIntervalBed_ret_MALFORMED_INPUT_WW:
335     WordWrapB(0);
336     logerrputsb();
337   LoadIntervalBed_ret_MALFORMED_INPUT:
338     reterr = kPglRetMalformedInput;
339     break;
340   }
341  LoadIntervalBed_ret_1:
342   return reterr;
343 }
344 
ExtractExcludeRange(const char * fnames,const ChrInfo * cip,const uint32_t * variant_bps,uint32_t raw_variant_ct,VfilterType vft,uint32_t zero_based,uint32_t bed_border_bp,uint32_t max_thread_ct,uintptr_t * variant_include,uint32_t * variant_ct_ptr)345 PglErr ExtractExcludeRange(const char* fnames, const ChrInfo* cip, const uint32_t* variant_bps, uint32_t raw_variant_ct, VfilterType vft, uint32_t zero_based, uint32_t bed_border_bp, uint32_t max_thread_ct, uintptr_t* variant_include, uint32_t* variant_ct_ptr) {
346   const uint32_t orig_variant_ct = *variant_ct_ptr;
347   if (!orig_variant_ct) {
348     return kPglRetSuccess;
349   }
350   unsigned char* bigstack_mark = g_bigstack_base;
351   unsigned char* bigstack_end_mark = g_bigstack_end;
352   PglErr reterr = kPglRetSuccess;
353   const char* fname_txs = nullptr;
354   TextStream txs;
355   PreinitTextStream(&txs);
356   {
357     const uintptr_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
358     uintptr_t* variant_include_mask = nullptr;
359     if (vft != kVfilterExclude) {
360       if (unlikely(bigstack_calloc_w(raw_variant_ctl, &variant_include_mask))) {
361         goto ExtractExcludeRange_ret_NOMEM;
362       }
363     }
364     const char* fnames_iter = fnames;
365     do {
366       if (fnames_iter == fnames) {
367         fname_txs = fnames_iter;
368         reterr = InitTextStream(fnames_iter, kTextStreamBlenFast, MAXV(max_thread_ct - 1, 1), &txs);
369         if (unlikely(reterr)) {
370           goto ExtractExcludeRange_ret_TSTREAM_FAIL;
371         }
372       } else {
373         reterr = TextRetarget(fnames_iter, &txs);
374         if (unlikely(reterr)) {
375           goto ExtractExcludeRange_ret_TSTREAM_FAIL;
376         }
377         fname_txs = fnames_iter;
378       }
379       MakeSetRange** range_arr = nullptr;
380       reterr = LoadIntervalBed(cip, variant_bps, nullptr, fname_txs, zero_based, 0, bed_border_bp, 0, 0, 0, 0, &txs, nullptr, nullptr, nullptr, nullptr, &range_arr);
381       if (unlikely(reterr)) {
382         goto ExtractExcludeRange_ret_1;
383       }
384       MakeSetRange* msr_tmp = range_arr[0];
385       if (vft == kVfilterExclude) {
386         while (msr_tmp) {
387           ClearBitsNz(msr_tmp->uidx_start, msr_tmp->uidx_end, variant_include);
388           msr_tmp = msr_tmp->next;
389         }
390       } else {
391         while (msr_tmp) {
392           FillBitsNz(msr_tmp->uidx_start, msr_tmp->uidx_end, variant_include_mask);
393           msr_tmp = msr_tmp->next;
394         }
395         if (vft == kVfilterExtractIntersect) {
396           BitvecAnd(variant_include_mask, raw_variant_ctl, variant_include);
397           ZeroWArr(raw_variant_ctl, variant_include_mask);
398         }
399       }
400       fnames_iter = strnul(fnames_iter);
401       ++fnames_iter;
402     } while (*fnames_iter);
403     if (vft == kVfilterExtract) {
404       BitvecAnd(variant_include_mask, raw_variant_ctl, variant_include);
405     }
406     *variant_ct_ptr = PopcountWords(variant_include, raw_variant_ctl);
407     const char* vft_name = g_vft_names[vft];
408     if (*variant_ct_ptr == orig_variant_ct) {
409       logerrprintf("Warning: No variants excluded by '--%s bed%c'.\n", vft_name, '1' - zero_based);
410     } else {
411       const uint32_t excluded_ct = orig_variant_ct - (*variant_ct_ptr);
412       logprintf("--%s bed%c: %u variant%s excluded.\n", vft_name, '1' - zero_based, excluded_ct, (excluded_ct == 1)? "" : "s");
413     }
414   }
415   while (0) {
416   ExtractExcludeRange_ret_NOMEM:
417     reterr = kPglRetNomem;
418     break;
419   ExtractExcludeRange_ret_TSTREAM_FAIL:
420     TextStreamErrPrint(fname_txs, &txs);
421     break;
422   }
423  ExtractExcludeRange_ret_1:
424   if (fname_txs) {
425     CleanupTextStream2(fname_txs, &txs, &reterr);
426   }
427   BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
428   return reterr;
429 }
430 
431 #ifdef __cplusplus
432 }
433 #endif
434