1 /*
2 Copyright (C) 2014 Genome Research Ltd.
3
4 Author: Petr Danecek <pd3@sanger.ac.uk>
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23 */
24
25 #include <config.h>
26 #include <strings.h>
27
28 #include "htslib/hts.h"
29 #include "htslib/kstring.h"
30 #include "htslib/kseq.h"
31 #include "htslib/khash_str2int.h"
32 #include "htslib/regidx.h"
33 #include "hts_internal.h"
34
35 #define LIDX_SHIFT 13 // number of insignificant index bits
36
37 // List of regions for one chromosome
38 typedef struct
39 {
40 int *idx, nidx;
41 int nregs, mregs; // n:used, m:alloced
42 reg_t *regs;
43 void *payload;
44 }
45 reglist_t;
46
47 // Container of all sequences
48 struct _regidx_t
49 {
50 int nseq, mseq; // n:used, m:alloced
51 reglist_t *seq; // regions for each sequence
52 void *seq2regs; // hash for fast lookup from chr name to regions
53 char **seq_names;
54 regidx_free_f free; // function to free any data allocated by regidx_parse_f
55 regidx_parse_f parse; // parse one input line
56 void *usr; // user data to pass to regidx_parse_f
57
58 // temporary data for index initialization
59 kstring_t str;
60 int rid_prev, start_prev, end_prev;
61 int payload_size;
62 void *payload;
63 };
64
regidx_seq_nregs(regidx_t * idx,const char * seq)65 int regidx_seq_nregs(regidx_t *idx, const char *seq)
66 {
67 int iseq;
68 if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence
69 return idx->seq[iseq].nregs;
70 }
71
regidx_nregs(regidx_t * idx)72 int regidx_nregs(regidx_t *idx)
73 {
74 int i, nregs = 0;
75 for (i=0; i<idx->nseq; i++) nregs += idx->seq[i].nregs;
76 return nregs;
77 }
78
regidx_seq_names(regidx_t * idx,int * n)79 char **regidx_seq_names(regidx_t *idx, int *n)
80 {
81 *n = idx->nseq;
82 return idx->seq_names;
83 }
84
_regidx_build_index(regidx_t * idx)85 int _regidx_build_index(regidx_t *idx)
86 {
87 int iseq;
88 for (iseq=0; iseq<idx->nseq; iseq++)
89 {
90 reglist_t *list = &idx->seq[iseq];
91 int j,k, imax = 0; // max index bin
92 for (j=0; j<list->nregs; j++)
93 {
94 int ibeg = list->regs[j].start >> LIDX_SHIFT;
95 int iend = list->regs[j].end >> LIDX_SHIFT;
96 if ( imax < iend + 1 )
97 {
98 int old_imax = imax;
99 imax = iend + 1;
100 kroundup32(imax);
101 list->idx = (int*) realloc(list->idx, imax*sizeof(int));
102 for (k=old_imax; k<imax; k++) list->idx[k] = -1;
103 }
104 if ( ibeg==iend )
105 {
106 if ( list->idx[ibeg]<0 ) list->idx[ibeg] = j;
107 }
108 else
109 {
110 for (k=ibeg; k<=iend; k++)
111 if ( list->idx[k]<0 ) list->idx[k] = j;
112 }
113 list->nidx = iend + 1;
114 }
115 }
116 return 0;
117 }
118
regidx_insert(regidx_t * idx,char * line)119 int regidx_insert(regidx_t *idx, char *line)
120 {
121 if ( !line )
122 return _regidx_build_index(idx);
123
124 char *chr_from, *chr_to;
125 reg_t reg;
126 int ret = idx->parse(line,&chr_from,&chr_to,®,idx->payload,idx->usr);
127 if ( ret==-2 ) return -1; // error
128 if ( ret==-1 ) return 0; // skip the line
129
130 int rid;
131 idx->str.l = 0;
132 kputsn(chr_from, chr_to-chr_from+1, &idx->str);
133 if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
134 {
135 idx->nseq++;
136 int m_prev = idx->mseq;
137 hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
138 hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
139 idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
140 rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
141 }
142
143 reglist_t *list = &idx->seq[rid];
144 list->nregs++;
145 int m_prev = list->mregs;
146 hts_expand(reg_t,list->nregs,list->mregs,list->regs);
147 list->regs[list->nregs-1] = reg;
148 if ( idx->payload_size )
149 {
150 if ( m_prev < list->mregs ) list->payload = realloc(list->payload,idx->payload_size*list->mregs);
151 memcpy((char*)list->payload + idx->payload_size*(list->nregs-1), idx->payload, idx->payload_size);
152 }
153
154 if ( idx->rid_prev==rid )
155 {
156 if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) )
157 {
158 hts_log_error("The regions are not sorted: %s:%d-%d is before %s:%d-%d",
159 idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1);
160 return -1;
161 }
162 }
163 idx->rid_prev = rid;
164 idx->start_prev = reg.start;
165 idx->end_prev = reg.end;
166 return 0;
167 }
168
regidx_init(const char * fname,regidx_parse_f parser,regidx_free_f free_f,size_t payload_size,void * usr_dat)169 regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
170 {
171 if ( !parser )
172 {
173 if ( !fname ) parser = regidx_parse_tab;
174 else
175 {
176 int len = strlen(fname);
177 if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) )
178 parser = regidx_parse_bed;
179 else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) )
180 parser = regidx_parse_bed;
181 else if ( len>=4 && !strcasecmp(".bed",fname+len-4) )
182 parser = regidx_parse_bed;
183 else
184 parser = regidx_parse_tab;
185 }
186 }
187
188 regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
189 idx->free = free_f;
190 idx->parse = parser;
191 idx->usr = usr_dat;
192 idx->seq2regs = khash_str2int_init();
193 idx->rid_prev = -1;
194 idx->start_prev = -1;
195 idx->end_prev = -1;
196 idx->payload_size = payload_size;
197 if ( payload_size ) idx->payload = malloc(payload_size);
198
199 if ( !fname ) return idx;
200
201 kstring_t str = {0,0,0};
202
203 htsFile *fp = hts_open(fname,"r");
204 if ( !fp ) goto error;
205
206 while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
207 {
208 if ( regidx_insert(idx, str.s) ) goto error;
209 }
210 regidx_insert(idx, NULL);
211
212 free(str.s);
213 hts_close(fp);
214 return idx;
215
216 error:
217 free(str.s);
218 if ( fp ) hts_close(fp);
219 regidx_destroy(idx);
220 return NULL;
221 }
222
regidx_destroy(regidx_t * idx)223 void regidx_destroy(regidx_t *idx)
224 {
225 int i, j;
226 for (i=0; i<idx->nseq; i++)
227 {
228 reglist_t *list = &idx->seq[i];
229 if ( idx->free )
230 {
231 for (j=0; j<list->nregs; j++)
232 idx->free((char*)list->payload + idx->payload_size*j);
233 }
234 free(list->payload);
235 free(list->regs);
236 free(list->idx);
237 }
238 free(idx->seq_names);
239 free(idx->seq);
240 free(idx->str.s);
241 free(idx->payload);
242 khash_str2int_destroy_free(idx->seq2regs);
243 free(idx);
244 }
245
regidx_overlap(regidx_t * idx,const char * chr,uint32_t from,uint32_t to,regitr_t * itr)246 int regidx_overlap(regidx_t *idx, const char *chr, uint32_t from, uint32_t to, regitr_t *itr)
247 {
248 if ( itr ) itr->i = itr->n = 0;
249
250 int iseq;
251 if ( khash_str2int_get(idx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence
252
253 reglist_t *list = &idx->seq[iseq];
254 if ( !list->nregs ) return 0;
255
256 int i, ibeg = from>>LIDX_SHIFT;
257 int ireg = ibeg < list->nidx ? list->idx[ibeg] : list->idx[ list->nidx - 1 ];
258 if ( ireg < 0 )
259 {
260 // linear search; if slow, replace with binary search
261 if ( ibeg > list->nidx ) ibeg = list->nidx;
262 for (i=ibeg - 1; i>=0; i--)
263 if ( list->idx[i] >=0 ) break;
264 ireg = i>=0 ? list->idx[i] : 0;
265 }
266 for (i=ireg; i<list->nregs; i++)
267 {
268 if ( list->regs[i].start > to ) return 0; // no match
269 if ( list->regs[i].end >= from && list->regs[i].start <= to ) break; // found
270 }
271
272 if ( i>=list->nregs ) return 0; // no match
273
274 if ( !itr ) return 1;
275
276 itr->i = 0;
277 itr->n = list->nregs - i;
278 itr->reg = &idx->seq[iseq].regs[i];
279 if ( idx->payload_size )
280 itr->payload = (char*)idx->seq[iseq].payload + i*idx->payload_size;
281 else
282 itr->payload = NULL;
283
284 return 1;
285 }
286
regidx_parse_bed(const char * line,char ** chr_beg,char ** chr_end,reg_t * reg,void * payload,void * usr)287 int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
288 {
289 char *ss = (char*) line;
290 while ( *ss && isspace_c(*ss) ) ss++;
291 if ( !*ss ) return -1; // skip blank lines
292 if ( *ss=='#' ) return -1; // skip comments
293
294 char *se = ss;
295 while ( *se && !isspace_c(*se) ) se++;
296 if ( !*se ) { hts_log_error("Could not parse bed line: %s", line); return -2; }
297
298 *chr_beg = ss;
299 *chr_end = se-1;
300
301 ss = se+1;
302 reg->start = hts_parse_decimal(ss, &se, 0);
303 if ( ss==se ) { hts_log_error("Could not parse bed line: %s", line); return -2; }
304
305 ss = se+1;
306 reg->end = hts_parse_decimal(ss, &se, 0) - 1;
307 if ( ss==se ) { hts_log_error("Could not parse bed line: %s", line); return -2; }
308
309 return 0;
310 }
311
regidx_parse_tab(const char * line,char ** chr_beg,char ** chr_end,reg_t * reg,void * payload,void * usr)312 int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
313 {
314 char *ss = (char*) line;
315 while ( *ss && isspace_c(*ss) ) ss++;
316 if ( !*ss ) return -1; // skip blank lines
317 if ( *ss=='#' ) return -1; // skip comments
318
319 char *se = ss;
320 while ( *se && !isspace_c(*se) ) se++;
321 if ( !*se ) { hts_log_error("Could not parse bed line: %s", line); return -2; }
322
323 *chr_beg = ss;
324 *chr_end = se-1;
325
326 ss = se+1;
327 reg->start = hts_parse_decimal(ss, &se, 0) - 1;
328 if ( ss==se ) { hts_log_error("Could not parse bed line: %s", line); return -2; }
329
330 if ( !se[0] || !se[1] )
331 reg->end = reg->start;
332 else
333 {
334 ss = se+1;
335 reg->end = hts_parse_decimal(ss, &se, 0);
336 if ( ss==se ) reg->end = reg->start;
337 else reg->end--;
338 }
339
340 return 0;
341 }
342
343