1 // This file is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License as published by the Free
6 // Software Foundation, either version 3 of the License, or (at your option)
7 // any later version.
8 //
9 // This program is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 // more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "plink2_set.h"
19
20 #ifdef __cplusplus
21 namespace plink2 {
22 #endif
23
24 typedef struct MakeSetRangeStruct {
25 NONCOPYABLE(MakeSetRangeStruct);
26 struct MakeSetRangeStruct* next;
27 uint32_t uidx_start;
28 uint32_t uidx_end;
29 } MakeSetRange;
30
31 static_assert(kMaxChrCodeDigits == 5, "LoadIntervalBed() must be updated.");
LoadIntervalBed(const ChrInfo * cip,const uint32_t * variant_bps,const char * sorted_subset_ids,const char * file_descrip,uint32_t zero_based,uint32_t track_set_names,uint32_t border_extend,uint32_t fail_on_no_sets,uint32_t c_prefix,uintptr_t subset_ct,uintptr_t max_subset_id_blen,TextStream * txsp,uintptr_t * set_ct_ptr,char ** set_names_ptr,uintptr_t * max_set_id_blen_ptr,uint64_t ** range_sort_buf_ptr,MakeSetRange *** make_set_range_arr_ptr)32 PglErr LoadIntervalBed(const ChrInfo* cip, const uint32_t* variant_bps, const char* sorted_subset_ids, const char* file_descrip, uint32_t zero_based, uint32_t track_set_names, uint32_t border_extend, uint32_t fail_on_no_sets, uint32_t c_prefix, uintptr_t subset_ct, uintptr_t max_subset_id_blen, TextStream* txsp, uintptr_t* set_ct_ptr, char** set_names_ptr, uintptr_t* max_set_id_blen_ptr, uint64_t** range_sort_buf_ptr, MakeSetRange*** make_set_range_arr_ptr) {
33 // In plink 1.9, this was named load_range_list() and called directly by
34 // ExtractExcludeRange(), define_sets(), and indirectly by annotate(),
35 // gene_report(), and clump_reports(). That function required set IDs in
36 // column 4, and interpreted column 5 as a set "group label".
37 // However, column 5 wasn't used very often, and without it, we're free to
38 // generalize this function to point at UCSC interval-BED files.
39 //
40 // Assumes caller will reset g_bigstack_end later.
41 PglErr reterr = kPglRetSuccess;
42 {
43 LlStr* make_set_ll = nullptr;
44 char* set_names = nullptr;
45 uintptr_t set_ct = 0;
46 uintptr_t max_set_id_blen = 0;
47 // if we need to track set names, put together a sorted list
48 if (track_set_names) {
49 uintptr_t line_idx = 1;
50 for (char* line_iter = TextLineEnd(txsp); TextGetUnsafe(txsp, &line_iter); ++line_idx) {
51 char* line_start = line_iter;
52 char* first_token_end = CurTokenEnd(line_start);
53 char* cur_set_id = NextTokenMult(first_token_end, 3);
54 char* last_token = cur_set_id;
55 if (unlikely(NoMoreTokensKns(last_token))) {
56 snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, file_descrip);
57 goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
58 }
59 const uint32_t chr_name_slen = first_token_end - line_start;
60 *first_token_end = '\0';
61 const uint32_t cur_chr_code = GetChrCode(line_start, cip, chr_name_slen);
62 if (IsI32Neg(cur_chr_code)) {
63 // kludge (21 Jan 2020): --extract/exclude range should not error out
64 // if a line mentions a chromosome code not in the dataset.
65 // TODO: better condition for skipping this line. (not totally
66 // trivial since some future callers may need to track empty sets)
67 if (likely((!set_ct_ptr) && (cur_chr_code == UINT32_MAX))) {
68 continue;
69 }
70 snprintf(g_logbuf, kLogbufSize, "Error: Invalid chromosome code on line %" PRIuPTR " of %s.\n", line_idx, file_descrip);
71 goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
72 }
73 // chr_mask check removed, we want to track empty sets
74 uint32_t set_id_slen = strlen_se(cur_set_id);
75 // we're about to possibly clobber \n, so advance line_iter now
76 line_iter = AdvPastDelim(&(cur_set_id[set_id_slen]), '\n');
77 cur_set_id[set_id_slen] = '\0';
78 if (subset_ct) {
79 if (bsearch_str(cur_set_id, sorted_subset_ids, set_id_slen, max_subset_id_blen, subset_ct) == -1) {
80 continue;
81 }
82 }
83 // when there are repeats, they are likely to be next to each other
84 if (make_set_ll && strequal_overread(make_set_ll->str, last_token)) {
85 continue;
86 }
87 uint32_t set_id_blen = set_id_slen + 1;
88 // argh, --clump counts positional overlaps which don't include any
89 // variants in the dataset. So we prefix set IDs with a chromosome
90 // index in that case (with leading zeroes) and treat cross-chromosome
91 // sets as distinct.
92 if (!variant_bps) {
93 set_id_blen += kMaxChrCodeDigits;
94 }
95 if (set_id_blen > max_set_id_blen) {
96 max_set_id_blen = set_id_blen;
97 }
98 LlStr* ll_tmp;
99 if (unlikely(bigstack_end_alloc_llstr(set_id_blen, &ll_tmp))) {
100 goto LoadIntervalBed_ret_NOMEM;
101 }
102 ll_tmp->next = make_set_ll;
103 if (variant_bps) {
104 memcpy(ll_tmp->str, last_token, set_id_blen);
105 } else {
106 u32toa_z5(cur_chr_code, ll_tmp->str);
107 // if first character of gene name is a digit, natural sort has
108 // strange effects unless we force [3] to be nonnumeric...
109 ll_tmp->str[kMaxChrCodeDigits - 1] -= 15;
110 memcpy(&(ll_tmp->str[kMaxChrCodeDigits]), last_token, set_id_blen - kMaxChrCodeDigits);
111 }
112 make_set_ll = ll_tmp;
113 ++set_ct;
114 }
115 if (unlikely(TextStreamErrcode2(txsp, &reterr))) {
116 goto LoadIntervalBed_ret_TSTREAM_FAIL;
117 }
118 if (!set_ct) {
119 if (unlikely(fail_on_no_sets)) {
120 if (variant_bps) {
121 logerrputs("Error: All variants excluded by --gene[-all], since no sets were defined from\n--make-set file.\n");
122 reterr = kPglRetMalformedInput;
123 goto LoadIntervalBed_ret_1;
124 } else {
125 if (subset_ct) {
126 logerrputs("Error: No --gene-subset genes present in --gene-report file.\n");
127 reterr = kPglRetInconsistentInput;
128 } else {
129 logerrputs("Error: Empty --gene-report file.\n");
130 reterr = kPglRetMalformedInput;
131 }
132 goto LoadIntervalBed_ret_1;
133 }
134 }
135 logerrprintfww("Warning: No valid ranges in %s.\n", file_descrip);
136 goto LoadIntervalBed_ret_1;
137 }
138 // c_prefix is 0 or 2
139 max_set_id_blen += c_prefix;
140 if (unlikely(max_set_id_blen > kMaxIdBlen)) {
141 logerrputs("Error: Set IDs are limited to " MAX_ID_SLEN_STR " characters.\n");
142 goto LoadIntervalBed_ret_MALFORMED_INPUT;
143 }
144 const char** strptr_arr;
145 if (unlikely(
146 bigstack_alloc_c(set_ct * max_set_id_blen, set_names_ptr) ||
147 bigstack_alloc_kcp(set_ct, &strptr_arr))) {
148 goto LoadIntervalBed_ret_NOMEM;
149 }
150 set_names = *set_names_ptr;
151 for (uintptr_t set_idx = 0; set_idx != set_ct; ++set_idx) {
152 strptr_arr[set_idx] = make_set_ll->str;
153 make_set_ll = make_set_ll->next;
154 }
155 StrptrArrNsort(set_ct, strptr_arr);
156 set_ct = CopyAndDedupSortedStrptrsToStrbox(strptr_arr, set_ct, max_set_id_blen, &(set_names[c_prefix]));
157 if (c_prefix) {
158 for (uintptr_t set_idx = 0; set_idx != set_ct; ++set_idx) {
159 memcpy_k(&(set_names[set_idx * max_set_id_blen]), "C_", 2);
160 }
161 }
162 BigstackShrinkTop(set_names, set_ct * max_set_id_blen);
163 reterr = TextRewind(txsp);
164 if (unlikely(reterr)) {
165 goto LoadIntervalBed_ret_TSTREAM_FAIL;
166 }
167 } else {
168 set_ct = 1;
169 }
170 MakeSetRange** make_set_range_arr = S_CAST(MakeSetRange**, bigstack_end_alloc(set_ct * sizeof(intptr_t)));
171 if (unlikely(!make_set_range_arr)) {
172 goto LoadIntervalBed_ret_NOMEM;
173 }
174 ZeroPtrArr(set_ct, make_set_range_arr);
175 uintptr_t line_idx = 0;
176 uint32_t chr_start = 0;
177 uint32_t chr_end = 0;
178 for (char* line_iter = &(TextLineEnd(txsp)[-1]); ; line_iter = AdvToDelim(line_iter, '\n')) {
179 LoadIntervalBed_LINE_ITER_ALREADY_ADVANCED:
180 ++line_iter;
181 ++line_idx;
182 reterr = TextGetUnsafe(txsp, &line_iter);
183 if (reterr) {
184 if (likely(reterr == kPglRetEof)) {
185 if (unlikely(track_set_names && (line_idx == 1))) {
186 goto LoadIntervalBed_ret_REWIND_FAIL;
187 }
188 reterr = kPglRetSuccess;
189 break;
190 }
191 goto LoadIntervalBed_ret_TSTREAM_FAIL;
192 }
193 char* line_start = line_iter;
194 char* first_token_end = CurTokenEnd(line_start);
195 char* last_token = NextTokenMult(first_token_end, 2 + track_set_names);
196 if (unlikely(NoMoreTokensKns(last_token))) {
197 snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, file_descrip);
198 goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
199 }
200 const uint32_t chr_name_slen = first_token_end - line_start;
201 *first_token_end = '\0';
202 const uint32_t cur_chr_code = GetChrCode(line_start, cip, chr_name_slen);
203 if (IsI32Neg(cur_chr_code)) {
204 if (likely((!set_ct_ptr) && (cur_chr_code == UINT32_MAX))) {
205 continue;
206 }
207 snprintf(g_logbuf, kLogbufSize, "Error: Invalid chromosome code on line %" PRIuPTR " of %s.\n", line_idx, file_descrip);
208 goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
209 }
210 line_iter = CurTokenEnd(last_token);
211 if (!IsSet(cip->chr_mask, cur_chr_code)) {
212 continue;
213 }
214 if (variant_bps) {
215 const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[cur_chr_code];
216 chr_start = cip->chr_fo_vidx_start[chr_fo_idx];
217 chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
218 if (chr_end == chr_start) {
219 continue;
220 }
221 // (track_set_names must be true if subset_ct is nonzero)
222 // might need to move this outside the if-statement later
223 if (subset_ct && (bsearch_str(last_token, sorted_subset_ids, strlen_se(last_token), max_subset_id_blen, subset_ct) == -1)) {
224 continue;
225 }
226 }
227 const char* linebuf_iter = FirstNonTspace(&(first_token_end[1]));
228 uint32_t range_first;
229 if (unlikely(ScanmovUintDefcap(&linebuf_iter, &range_first))) {
230 snprintf(g_logbuf, kLogbufSize, "Error: Invalid range start position on line %" PRIuPTR " of %s.\n", line_idx, file_descrip);
231 goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
232 }
233 range_first += zero_based;
234 linebuf_iter = NextToken(linebuf_iter);
235 uint32_t range_last;
236 if (unlikely(ScanmovUintDefcap(&linebuf_iter, &range_last))) {
237 snprintf(g_logbuf, kLogbufSize, "Error: Invalid range end position on line %" PRIuPTR " of %s.\n", line_idx, file_descrip);
238 goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
239 }
240 if (unlikely(range_last < range_first)) {
241 snprintf(g_logbuf, kLogbufSize, "Error: Range end position smaller than range start on line %" PRIuPTR " of %s.\n", line_idx, file_descrip);
242 goto LoadIntervalBed_ret_MALFORMED_INPUT_WW;
243 }
244 if (border_extend > range_first) {
245 range_first = 0;
246 } else {
247 range_first -= border_extend;
248 }
249 range_last += border_extend;
250 const uint32_t last_token_slen = line_iter - last_token;
251 // about to potentially clobber \n, advance now
252 line_iter = AdvToDelim(line_iter, '\n');
253 uint32_t cur_set_idx = 0;
254 if (set_ct > 1) {
255 // bugfix: bsearch_str_natural requires null-terminated string
256 last_token[last_token_slen] = '\0';
257 if (c_prefix) {
258 last_token = &(last_token[-2]);
259 memcpy_k(last_token, "C_", 2);
260 } else if (!variant_bps) {
261 last_token = &(last_token[-S_CAST(int32_t, kMaxChrCodeDigits)]);
262 u32toa_z5(cur_chr_code, last_token);
263 last_token[kMaxChrCodeDigits - 1] -= 15;
264 }
265 // this should never fail
266 cur_set_idx = bsearch_str_natural(last_token, set_names, max_set_id_blen, set_ct);
267 }
268 if (variant_bps) {
269 // translate to within-chromosome uidx
270 range_first = CountSortedSmallerU32(&(variant_bps[chr_start]), chr_end - chr_start, range_first);
271 range_last = CountSortedSmallerU32(&(variant_bps[chr_start]), chr_end - chr_start, range_last + 1);
272 if (range_last > range_first) {
273 MakeSetRange* msr_tmp = S_CAST(MakeSetRange*, bigstack_end_alloc(sizeof(MakeSetRange)));
274 if (unlikely(!msr_tmp)) {
275 goto LoadIntervalBed_ret_NOMEM;
276 }
277 msr_tmp->next = make_set_range_arr[cur_set_idx];
278 // normally, I'd keep chr_idx here since that enables by-chromosome
279 // sorting, but that's probably not worth bloating MakeSetRange
280 // from 16 to 32 bytes
281 msr_tmp->uidx_start = chr_start + range_first;
282 msr_tmp->uidx_end = chr_start + range_last;
283 make_set_range_arr[cur_set_idx] = msr_tmp;
284 }
285 } else {
286 MakeSetRange* msr_tmp = S_CAST(MakeSetRange*, bigstack_end_alloc(sizeof(MakeSetRange)));
287 if (unlikely(!msr_tmp)) {
288 goto LoadIntervalBed_ret_NOMEM;
289 }
290 msr_tmp->next = make_set_range_arr[cur_set_idx];
291 msr_tmp->uidx_start = range_first;
292 msr_tmp->uidx_end = range_last + 1;
293 make_set_range_arr[cur_set_idx] = msr_tmp;
294 }
295 goto LoadIntervalBed_LINE_ITER_ALREADY_ADVANCED;
296 }
297 // allocate buffer for sorting ranges later
298 uint32_t max_set_range_ct = 0;
299 for (uint32_t set_idx = 0; set_idx != set_ct; ++set_idx) {
300 uint32_t cur_set_range_ct = 0;
301 MakeSetRange* msr_tmp = make_set_range_arr[set_idx];
302 while (msr_tmp) {
303 ++cur_set_range_ct;
304 msr_tmp = msr_tmp->next;
305 }
306 if (cur_set_range_ct > max_set_range_ct) {
307 max_set_range_ct = cur_set_range_ct;
308 }
309 }
310 if (range_sort_buf_ptr) {
311 if (unlikely(bigstack_end_alloc_u64(max_set_range_ct, range_sort_buf_ptr))) {
312 goto LoadIntervalBed_ret_NOMEM;
313 }
314 }
315 if (set_ct_ptr) {
316 *set_ct_ptr = set_ct;
317 }
318 if (max_set_id_blen_ptr) {
319 *max_set_id_blen_ptr = max_set_id_blen;
320 }
321 *make_set_range_arr_ptr = make_set_range_arr;
322 }
323 while (0) {
324 LoadIntervalBed_ret_NOMEM:
325 reterr = kPglRetNomem;
326 break;
327 LoadIntervalBed_ret_REWIND_FAIL:
328 logerrprintfww(kErrprintfRewind, file_descrip);
329 reterr = kPglRetRewindFail;
330 break;
331 LoadIntervalBed_ret_TSTREAM_FAIL:
332 TextStreamErrPrint(file_descrip, txsp);
333 break;
334 LoadIntervalBed_ret_MALFORMED_INPUT_WW:
335 WordWrapB(0);
336 logerrputsb();
337 LoadIntervalBed_ret_MALFORMED_INPUT:
338 reterr = kPglRetMalformedInput;
339 break;
340 }
341 LoadIntervalBed_ret_1:
342 return reterr;
343 }
344
ExtractExcludeRange(const char * fnames,const ChrInfo * cip,const uint32_t * variant_bps,uint32_t raw_variant_ct,VfilterType vft,uint32_t zero_based,uint32_t bed_border_bp,uint32_t max_thread_ct,uintptr_t * variant_include,uint32_t * variant_ct_ptr)345 PglErr ExtractExcludeRange(const char* fnames, const ChrInfo* cip, const uint32_t* variant_bps, uint32_t raw_variant_ct, VfilterType vft, uint32_t zero_based, uint32_t bed_border_bp, uint32_t max_thread_ct, uintptr_t* variant_include, uint32_t* variant_ct_ptr) {
346 const uint32_t orig_variant_ct = *variant_ct_ptr;
347 if (!orig_variant_ct) {
348 return kPglRetSuccess;
349 }
350 unsigned char* bigstack_mark = g_bigstack_base;
351 unsigned char* bigstack_end_mark = g_bigstack_end;
352 PglErr reterr = kPglRetSuccess;
353 const char* fname_txs = nullptr;
354 TextStream txs;
355 PreinitTextStream(&txs);
356 {
357 const uintptr_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
358 uintptr_t* variant_include_mask = nullptr;
359 if (vft != kVfilterExclude) {
360 if (unlikely(bigstack_calloc_w(raw_variant_ctl, &variant_include_mask))) {
361 goto ExtractExcludeRange_ret_NOMEM;
362 }
363 }
364 const char* fnames_iter = fnames;
365 do {
366 if (fnames_iter == fnames) {
367 fname_txs = fnames_iter;
368 reterr = InitTextStream(fnames_iter, kTextStreamBlenFast, MAXV(max_thread_ct - 1, 1), &txs);
369 if (unlikely(reterr)) {
370 goto ExtractExcludeRange_ret_TSTREAM_FAIL;
371 }
372 } else {
373 reterr = TextRetarget(fnames_iter, &txs);
374 if (unlikely(reterr)) {
375 goto ExtractExcludeRange_ret_TSTREAM_FAIL;
376 }
377 fname_txs = fnames_iter;
378 }
379 MakeSetRange** range_arr = nullptr;
380 reterr = LoadIntervalBed(cip, variant_bps, nullptr, fname_txs, zero_based, 0, bed_border_bp, 0, 0, 0, 0, &txs, nullptr, nullptr, nullptr, nullptr, &range_arr);
381 if (unlikely(reterr)) {
382 goto ExtractExcludeRange_ret_1;
383 }
384 MakeSetRange* msr_tmp = range_arr[0];
385 if (vft == kVfilterExclude) {
386 while (msr_tmp) {
387 ClearBitsNz(msr_tmp->uidx_start, msr_tmp->uidx_end, variant_include);
388 msr_tmp = msr_tmp->next;
389 }
390 } else {
391 while (msr_tmp) {
392 FillBitsNz(msr_tmp->uidx_start, msr_tmp->uidx_end, variant_include_mask);
393 msr_tmp = msr_tmp->next;
394 }
395 if (vft == kVfilterExtractIntersect) {
396 BitvecAnd(variant_include_mask, raw_variant_ctl, variant_include);
397 ZeroWArr(raw_variant_ctl, variant_include_mask);
398 }
399 }
400 fnames_iter = strnul(fnames_iter);
401 ++fnames_iter;
402 } while (*fnames_iter);
403 if (vft == kVfilterExtract) {
404 BitvecAnd(variant_include_mask, raw_variant_ctl, variant_include);
405 }
406 *variant_ct_ptr = PopcountWords(variant_include, raw_variant_ctl);
407 const char* vft_name = g_vft_names[vft];
408 if (*variant_ct_ptr == orig_variant_ct) {
409 logerrprintf("Warning: No variants excluded by '--%s bed%c'.\n", vft_name, '1' - zero_based);
410 } else {
411 const uint32_t excluded_ct = orig_variant_ct - (*variant_ct_ptr);
412 logprintf("--%s bed%c: %u variant%s excluded.\n", vft_name, '1' - zero_based, excluded_ct, (excluded_ct == 1)? "" : "s");
413 }
414 }
415 while (0) {
416 ExtractExcludeRange_ret_NOMEM:
417 reterr = kPglRetNomem;
418 break;
419 ExtractExcludeRange_ret_TSTREAM_FAIL:
420 TextStreamErrPrint(fname_txs, &txs);
421 break;
422 }
423 ExtractExcludeRange_ret_1:
424 if (fname_txs) {
425 CleanupTextStream2(fname_txs, &txs, &reterr);
426 }
427 BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
428 return reterr;
429 }
430
431 #ifdef __cplusplus
432 }
433 #endif
434