1 /*
2     Copyright (C) 2014 Genome Research Ltd.
3 
4     Author: Petr Danecek <pd3@sanger.ac.uk>
5 
6     Permission is hereby granted, free of charge, to any person obtaining a copy
7     of this software and associated documentation files (the "Software"), to deal
8     in the Software without restriction, including without limitation the rights
9     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10     copies of the Software, and to permit persons to whom the Software is
11     furnished to do so, subject to the following conditions:
12 
13     The above copyright notice and this permission notice shall be included in
14     all copies or substantial portions of the Software.
15 
16     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22     THE SOFTWARE.
23 */
24 
25 #include <config.h>
26 #include <strings.h>
27 
28 #include "htslib/hts.h"
29 #include "htslib/kstring.h"
30 #include "htslib/kseq.h"
31 #include "htslib/khash_str2int.h"
32 #include "htslib/regidx.h"
33 #include "hts_internal.h"
34 
35 #define LIDX_SHIFT 13   // number of insignificant index bits
36 
37 // List of regions for one chromosome
38 typedef struct
39 {
40     int *idx, nidx;
41     int nregs, mregs;   // n:used, m:alloced
42     reg_t *regs;
43     void *payload;
44 }
45 reglist_t;
46 
47 // Container of all sequences
48 struct _regidx_t
49 {
50     int nseq, mseq;     // n:used, m:alloced
51     reglist_t *seq;     // regions for each sequence
52     void *seq2regs;     // hash for fast lookup from chr name to regions
53     char **seq_names;
54     regidx_free_f free;     // function to free any data allocated by regidx_parse_f
55     regidx_parse_f parse;   // parse one input line
56     void *usr;              // user data to pass to regidx_parse_f
57 
58     // temporary data for index initialization
59     kstring_t str;
60     int rid_prev, start_prev, end_prev;
61     int payload_size;
62     void *payload;
63 };
64 
regidx_seq_nregs(regidx_t * idx,const char * seq)65 int regidx_seq_nregs(regidx_t *idx, const char *seq)
66 {
67     int iseq;
68     if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence
69     return idx->seq[iseq].nregs;
70 }
71 
regidx_nregs(regidx_t * idx)72 int regidx_nregs(regidx_t *idx)
73 {
74     int i, nregs = 0;
75     for (i=0; i<idx->nseq; i++) nregs += idx->seq[i].nregs;
76     return nregs;
77 }
78 
regidx_seq_names(regidx_t * idx,int * n)79 char **regidx_seq_names(regidx_t *idx, int *n)
80 {
81     *n = idx->nseq;
82     return idx->seq_names;
83 }
84 
_regidx_build_index(regidx_t * idx)85 int _regidx_build_index(regidx_t *idx)
86 {
87     int iseq;
88     for (iseq=0; iseq<idx->nseq; iseq++)
89     {
90         reglist_t *list = &idx->seq[iseq];
91         int j,k, imax = 0;   // max index bin
92         for (j=0; j<list->nregs; j++)
93         {
94             int ibeg = list->regs[j].start >> LIDX_SHIFT;
95             int iend = list->regs[j].end >> LIDX_SHIFT;
96             if ( imax < iend + 1 )
97             {
98                 int old_imax = imax;
99                 imax = iend + 1;
100                 kroundup32(imax);
101                 list->idx = (int*) realloc(list->idx, imax*sizeof(int));
102                 for (k=old_imax; k<imax; k++) list->idx[k] = -1;
103             }
104             if ( ibeg==iend )
105             {
106                 if ( list->idx[ibeg]<0 ) list->idx[ibeg] = j;
107             }
108             else
109             {
110                 for (k=ibeg; k<=iend; k++)
111                     if ( list->idx[k]<0 ) list->idx[k] = j;
112             }
113             list->nidx = iend + 1;
114         }
115     }
116     return 0;
117 }
118 
regidx_insert(regidx_t * idx,char * line)119 int regidx_insert(regidx_t *idx, char *line)
120 {
121     if ( !line )
122         return _regidx_build_index(idx);
123 
124     char *chr_from, *chr_to;
125     reg_t reg;
126     int ret = idx->parse(line,&chr_from,&chr_to,&reg,idx->payload,idx->usr);
127     if ( ret==-2 ) return -1;   // error
128     if ( ret==-1 ) return 0;    // skip the line
129 
130     int rid;
131     idx->str.l = 0;
132     kputsn(chr_from, chr_to-chr_from+1, &idx->str);
133     if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
134     {
135         idx->nseq++;
136         int m_prev = idx->mseq;
137         hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
138         hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
139         idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
140         rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
141     }
142 
143     reglist_t *list = &idx->seq[rid];
144     list->nregs++;
145     int m_prev = list->mregs;
146     hts_expand(reg_t,list->nregs,list->mregs,list->regs);
147     list->regs[list->nregs-1] = reg;
148     if ( idx->payload_size )
149     {
150         if ( m_prev < list->mregs ) list->payload = realloc(list->payload,idx->payload_size*list->mregs);
151         memcpy((char*)list->payload + idx->payload_size*(list->nregs-1), idx->payload, idx->payload_size);
152     }
153 
154     if ( idx->rid_prev==rid )
155     {
156         if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) )
157         {
158             hts_log_error("The regions are not sorted: %s:%d-%d is before %s:%d-%d",
159                 idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1);
160             return -1;
161         }
162     }
163     idx->rid_prev = rid;
164     idx->start_prev = reg.start;
165     idx->end_prev = reg.end;
166     return 0;
167 }
168 
regidx_init(const char * fname,regidx_parse_f parser,regidx_free_f free_f,size_t payload_size,void * usr_dat)169 regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
170 {
171     if ( !parser )
172     {
173         if ( !fname ) parser = regidx_parse_tab;
174         else
175         {
176             int len = strlen(fname);
177             if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) )
178                 parser = regidx_parse_bed;
179             else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) )
180                 parser = regidx_parse_bed;
181             else if ( len>=4 && !strcasecmp(".bed",fname+len-4) )
182                 parser = regidx_parse_bed;
183             else
184                 parser = regidx_parse_tab;
185         }
186     }
187 
188     regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
189     idx->free  = free_f;
190     idx->parse = parser;
191     idx->usr   = usr_dat;
192     idx->seq2regs = khash_str2int_init();
193     idx->rid_prev   = -1;
194     idx->start_prev = -1;
195     idx->end_prev   = -1;
196     idx->payload_size = payload_size;
197     if ( payload_size ) idx->payload = malloc(payload_size);
198 
199     if ( !fname ) return idx;
200 
201     kstring_t str = {0,0,0};
202 
203     htsFile *fp = hts_open(fname,"r");
204     if ( !fp ) goto error;
205 
206     while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
207     {
208         if ( regidx_insert(idx, str.s) ) goto error;
209     }
210     regidx_insert(idx, NULL);
211 
212     free(str.s);
213     hts_close(fp);
214     return idx;
215 
216 error:
217     free(str.s);
218     if ( fp ) hts_close(fp);
219     regidx_destroy(idx);
220     return NULL;
221 }
222 
regidx_destroy(regidx_t * idx)223 void regidx_destroy(regidx_t *idx)
224 {
225     int i, j;
226     for (i=0; i<idx->nseq; i++)
227     {
228         reglist_t *list = &idx->seq[i];
229         if ( idx->free )
230         {
231             for (j=0; j<list->nregs; j++)
232                 idx->free((char*)list->payload + idx->payload_size*j);
233         }
234         free(list->payload);
235         free(list->regs);
236         free(list->idx);
237     }
238     free(idx->seq_names);
239     free(idx->seq);
240     free(idx->str.s);
241     free(idx->payload);
242     khash_str2int_destroy_free(idx->seq2regs);
243     free(idx);
244 }
245 
regidx_overlap(regidx_t * idx,const char * chr,uint32_t from,uint32_t to,regitr_t * itr)246 int regidx_overlap(regidx_t *idx, const char *chr, uint32_t from, uint32_t to, regitr_t *itr)
247 {
248     if ( itr ) itr->i = itr->n = 0;
249 
250     int iseq;
251     if ( khash_str2int_get(idx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence
252 
253     reglist_t *list = &idx->seq[iseq];
254     if ( !list->nregs ) return 0;
255 
256     int i, ibeg = from>>LIDX_SHIFT;
257     int ireg = ibeg < list->nidx ? list->idx[ibeg] : list->idx[ list->nidx - 1 ];
258     if ( ireg < 0 )
259     {
260         // linear search; if slow, replace with binary search
261         if ( ibeg > list->nidx ) ibeg = list->nidx;
262         for (i=ibeg - 1; i>=0; i--)
263             if ( list->idx[i] >=0 ) break;
264         ireg = i>=0 ? list->idx[i] : 0;
265     }
266     for (i=ireg; i<list->nregs; i++)
267     {
268         if ( list->regs[i].start > to ) return 0;   // no match
269         if ( list->regs[i].end >= from && list->regs[i].start <= to ) break; // found
270     }
271 
272     if ( i>=list->nregs ) return 0;   // no match
273 
274     if ( !itr ) return 1;
275 
276     itr->i = 0;
277     itr->n = list->nregs - i;
278     itr->reg = &idx->seq[iseq].regs[i];
279     if ( idx->payload_size )
280         itr->payload = (char*)idx->seq[iseq].payload + i*idx->payload_size;
281     else
282         itr->payload = NULL;
283 
284     return 1;
285 }
286 
regidx_parse_bed(const char * line,char ** chr_beg,char ** chr_end,reg_t * reg,void * payload,void * usr)287 int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
288 {
289     char *ss = (char*) line;
290     while ( *ss && isspace_c(*ss) ) ss++;
291     if ( !*ss ) return -1;      // skip blank lines
292     if ( *ss=='#' ) return -1;  // skip comments
293 
294     char *se = ss;
295     while ( *se && !isspace_c(*se) ) se++;
296     if ( !*se ) { hts_log_error("Could not parse bed line: %s", line); return -2; }
297 
298     *chr_beg = ss;
299     *chr_end = se-1;
300 
301     ss = se+1;
302     reg->start = hts_parse_decimal(ss, &se, 0);
303     if ( ss==se ) { hts_log_error("Could not parse bed line: %s", line); return -2; }
304 
305     ss = se+1;
306     reg->end = hts_parse_decimal(ss, &se, 0) - 1;
307     if ( ss==se ) { hts_log_error("Could not parse bed line: %s", line); return -2; }
308 
309     return 0;
310 }
311 
regidx_parse_tab(const char * line,char ** chr_beg,char ** chr_end,reg_t * reg,void * payload,void * usr)312 int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
313 {
314     char *ss = (char*) line;
315     while ( *ss && isspace_c(*ss) ) ss++;
316     if ( !*ss ) return -1;      // skip blank lines
317     if ( *ss=='#' ) return -1;  // skip comments
318 
319     char *se = ss;
320     while ( *se && !isspace_c(*se) ) se++;
321     if ( !*se ) { hts_log_error("Could not parse bed line: %s", line); return -2; }
322 
323     *chr_beg = ss;
324     *chr_end = se-1;
325 
326     ss = se+1;
327     reg->start = hts_parse_decimal(ss, &se, 0) - 1;
328     if ( ss==se ) { hts_log_error("Could not parse bed line: %s", line); return -2; }
329 
330     if ( !se[0] || !se[1] )
331         reg->end = reg->start;
332     else
333     {
334         ss = se+1;
335         reg->end = hts_parse_decimal(ss, &se, 0);
336         if ( ss==se ) reg->end = reg->start;
337         else reg->end--;
338     }
339 
340     return 0;
341 }
342 
343