1 #include "bcftools.pysam.h"
2 
3 /*
4     Copyright (C) 2014-2016 Genome Research Ltd.
5 
6     Author: Petr Danecek <pd3@sanger.ac.uk>
7 
8     Permission is hereby granted, free of charge, to any person obtaining a copy
9     of this software and associated documentation files (the "Software"), to deal
10     in the Software without restriction, including without limitation the rights
11     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12     copies of the Software, and to permit persons to whom the Software is
13     furnished to do so, subject to the following conditions:
14 
15     The above copyright notice and this permission notice shall be included in
16     all copies or substantial portions of the Software.
17 
18     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24     THE SOFTWARE.
25 */
26 
27 #include <htslib/khash_str2int.h>
28 #include <htslib/kseq.h>
29 #include <htslib/hts.h>
30 #include "bcftools.h"
31 #include "ploidy.h"
32 
33 struct _ploidy_t
34 {
35     int nsex, msex;     // number of genders, m:number of allocated elements in id2sex array
36     int dflt, min, max; // ploidy: default, min and max (only explicitly listed)
37     int *sex2dflt;
38     regidx_t *idx;
39     regitr_t *itr;
40     void *sex2id;
41     char **id2sex;
42     kstring_t tmp_str;
43 };
44 
45 typedef struct
46 {
47     int sex, ploidy;
48 }
49 sex_ploidy_t;
50 
51 
ploidy_regions(ploidy_t * ploidy)52 regidx_t *ploidy_regions(ploidy_t *ploidy)
53 {
54     return ploidy->idx;
55 }
56 
ploidy_parse(const char * line,char ** chr_beg,char ** chr_end,uint32_t * beg,uint32_t * end,void * payload,void * usr)57 int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
58 {
59     int i, ret;
60     ploidy_t *ploidy = (ploidy_t*) usr;
61     void *sex2id = ploidy->sex2id;
62 
63     // Check for special case of default ploidy "* * * <sex> <ploidy>"
64     int default_ploidy_def = 0;
65 
66     char *ss = (char*) line;
67     while ( *ss && isspace(*ss) ) ss++;
68     if ( ss[0]=='*' && (!ss[1] || isspace(ss[1])) )
69         default_ploidy_def = 1; // definition of default ploidy, chr="*"
70     else
71     {
72         // Fill CHR,FROM,TO
73         ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL);
74         if ( ret!=0 ) return ret;
75     }
76 
77     // Skip the fields already parsed by regidx_parse_tab
78     ss = (char*) line;
79     while ( *ss && isspace(*ss) ) ss++;
80     for (i=0; i<3; i++)
81     {
82         while ( *ss && !isspace(*ss) ) ss++;
83         if ( !*ss ) return -2;  // wrong number of fields
84         while ( *ss && isspace(*ss) ) ss++;
85     }
86     if ( !*ss ) return -2;
87 
88     // Parse the payload
89     char *se = ss;
90     while ( *se && !isspace(*se) ) se++;
91     if ( !*se || se==ss ) error("Could not parse: %s\n", line);
92     ploidy->tmp_str.l = 0;
93     kputsn(ss,se-ss,&ploidy->tmp_str);
94 
95     sex_ploidy_t *sp = (sex_ploidy_t*) payload;
96     if ( khash_str2int_get(sex2id, ploidy->tmp_str.s, &sp->sex) != 0 )
97     {
98         ploidy->nsex++;
99         hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
100         ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s);
101         sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
102         ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
103         ploidy->sex2dflt[ploidy->nsex-1] = -1;
104     }
105 
106     ss = se;
107     while ( *se && isspace(*se) ) se++;
108     if ( !*se ) error("Could not parse: %s\n", line);
109     sp->ploidy = strtol(ss,&se,10);
110     if ( ss==se ) error("Could not parse: %s\n", line);
111     if ( ploidy->min<0 || sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
112     if ( ploidy->max<0 || sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
113 
114     // Special case, chr="*" stands for a default value
115     if ( default_ploidy_def )
116     {
117         ploidy->sex2dflt[ploidy->nsex-1] = sp->ploidy;
118         return -1;
119     }
120 
121     return 0;
122 }
123 
_set_defaults(ploidy_t * ploidy,int dflt)124 static void _set_defaults(ploidy_t *ploidy, int dflt)
125 {
126     int i;
127     if ( khash_str2int_get(ploidy->sex2id, "*", &i) == 0 ) dflt = ploidy->sex2dflt[i];
128     for (i=0; i<ploidy->nsex; i++)
129         if ( ploidy->sex2dflt[i]==-1 ) ploidy->sex2dflt[i] = dflt;
130 
131     ploidy->dflt = dflt;
132     if ( ploidy->min<0 || dflt < ploidy->min ) ploidy->min = dflt;
133     if ( ploidy->max<0 || dflt > ploidy->max ) ploidy->max = dflt;
134 }
135 
ploidy_init(const char * fname,int dflt)136 ploidy_t *ploidy_init(const char *fname, int dflt)
137 {
138     ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
139     if ( !pld ) return NULL;
140 
141     pld->min = pld->max = -1;
142     pld->sex2id = khash_str2int_init();
143     pld->idx = regidx_init(fname,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
144     if ( !pld->idx )
145     {
146         ploidy_destroy(pld);
147         return NULL;
148     }
149     pld->itr = regitr_init(pld->idx);
150     _set_defaults(pld,dflt);
151     return pld;
152 }
153 
ploidy_init_string(const char * str,int dflt)154 ploidy_t *ploidy_init_string(const char *str, int dflt)
155 {
156     ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
157     if ( !pld ) return NULL;
158 
159     pld->min = pld->max = -1;
160     pld->sex2id = khash_str2int_init();
161     pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
162     pld->itr = regitr_init(pld->idx);
163 
164     kstring_t tmp = {0,0,0};
165     const char *ss = str;
166     while ( *ss )
167     {
168         while ( *ss && isspace(*ss) ) ss++;
169         const char *se = ss;
170         while ( *se && *se!='\r' && *se!='\n' ) se++;
171         tmp.l = 0;
172         kputsn(ss, se-ss, &tmp);
173         regidx_insert(pld->idx,tmp.s);
174         while ( *se && isspace(*se) ) se++;
175         ss = se;
176     }
177     free(tmp.s);
178 
179     _set_defaults(pld,dflt);
180     return pld;
181 }
182 
ploidy_destroy(ploidy_t * ploidy)183 void ploidy_destroy(ploidy_t *ploidy)
184 {
185     if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id);
186     if ( ploidy->itr ) regitr_destroy(ploidy->itr);
187     if ( ploidy->idx ) regidx_destroy(ploidy->idx);
188     free(ploidy->id2sex);
189     free(ploidy->tmp_str.s);
190     free(ploidy->sex2dflt);
191     free(ploidy);
192 }
193 
ploidy_query(ploidy_t * ploidy,char * seq,int pos,int * sex2ploidy,int * min,int * max)194 int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max)
195 {
196     int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, ploidy->itr);
197 
198     if ( !sex2ploidy && !min && !max ) return ret;
199 
200     if ( !ret )
201     {
202         // no overlap
203         if ( min ) *min = ploidy->dflt;
204         if ( max ) *max = ploidy->dflt;
205         if ( sex2ploidy )
206             for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->sex2dflt[i];
207         return 0;
208     }
209 
210     int _min = INT_MAX, _max = -1;
211     if ( sex2ploidy ) for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->dflt;
212 
213     while ( regitr_overlap(ploidy->itr) )
214     {
215         int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex;
216         int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy;
217         if ( pld!=ploidy->dflt )
218         {
219             if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
220             if ( _min > pld ) _min = pld;
221             if ( _max < pld ) _max = pld;
222         }
223     }
224     if ( _max==-1 ) _max = _min = ploidy->dflt;
225     if ( max ) *max = _max;
226     if ( min ) *min = _min;
227 
228     return 1;
229 }
230 
ploidy_nsex(ploidy_t * ploidy)231 int ploidy_nsex(ploidy_t *ploidy)
232 {
233     return ploidy->nsex;
234 }
235 
ploidy_id2sex(ploidy_t * ploidy,int id)236 char *ploidy_id2sex(ploidy_t *ploidy, int id)
237 {
238     if ( id<0 || id>=ploidy->nsex ) return NULL;
239     return ploidy->id2sex[id];
240 }
241 
ploidy_sex2id(ploidy_t * ploidy,char * sex)242 int ploidy_sex2id(ploidy_t *ploidy, char *sex)
243 {
244     int id;
245     if ( khash_str2int_get(ploidy->sex2id,sex,&id)!=0 ) return -1;
246     return id;
247 }
248 
ploidy_add_sex(ploidy_t * ploidy,const char * sex)249 int ploidy_add_sex(ploidy_t *ploidy, const char *sex)
250 {
251     int id;
252     if ( khash_str2int_get(ploidy->sex2id, sex, &id)==0 ) return id;
253     ploidy->nsex++;
254     hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
255     ploidy->id2sex[ploidy->nsex-1] = strdup(sex);
256     ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
257     ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt;
258     return khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
259 }
260 
ploidy_max(ploidy_t * ploidy)261 int ploidy_max(ploidy_t *ploidy)
262 {
263     return ploidy->dflt > ploidy->max ? ploidy->dflt : ploidy->max;
264 }
265 
ploidy_min(ploidy_t * ploidy)266 int ploidy_min(ploidy_t *ploidy)
267 {
268     return ploidy->dflt < ploidy->min ? ploidy->dflt : ploidy->min;
269 }
270 
271