1 /*
2     Copyright (C) 2014-2016 Genome Research Ltd.
3 
4     Author: Petr Danecek <pd3@sanger.ac.uk>
5 
6     Permission is hereby granted, free of charge, to any person obtaining a copy
7     of this software and associated documentation files (the "Software"), to deal
8     in the Software without restriction, including without limitation the rights
9     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10     copies of the Software, and to permit persons to whom the Software is
11     furnished to do so, subject to the following conditions:
12 
13     The above copyright notice and this permission notice shall be included in
14     all copies or substantial portions of the Software.
15 
16     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22     THE SOFTWARE.
23 */
24 
25 #include <htslib/khash_str2int.h>
26 #include <htslib/kseq.h>
27 #include <htslib/hts.h>
28 #include "bcftools.h"
29 #include "ploidy.h"
30 
31 struct _ploidy_t
32 {
33     int nsex, msex;     // number of genders, m:number of allocated elements in id2sex array
34     int dflt, min, max; // ploidy: default, min and max (only explicitly listed)
35     int *sex2dflt;
36     regidx_t *idx;
37     regitr_t *itr;
38     void *sex2id;
39     char **id2sex;
40     kstring_t tmp_str;
41 };
42 
43 typedef struct
44 {
45     int sex, ploidy;
46 }
47 sex_ploidy_t;
48 
49 
ploidy_regions(ploidy_t * ploidy)50 regidx_t *ploidy_regions(ploidy_t *ploidy)
51 {
52     return ploidy->idx;
53 }
54 
ploidy_parse(const char * line,char ** chr_beg,char ** chr_end,uint32_t * beg,uint32_t * end,void * payload,void * usr)55 int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
56 {
57     int i, ret;
58     ploidy_t *ploidy = (ploidy_t*) usr;
59     void *sex2id = ploidy->sex2id;
60 
61     // Check for special case of default ploidy "* * * <sex> <ploidy>"
62     int default_ploidy_def = 0;
63 
64     char *ss = (char*) line;
65     while ( *ss && isspace(*ss) ) ss++;
66     if ( ss[0]=='*' && (!ss[1] || isspace(ss[1])) )
67         default_ploidy_def = 1; // definition of default ploidy, chr="*"
68     else
69     {
70         // Fill CHR,FROM,TO
71         ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL);
72         if ( ret!=0 ) return ret;
73     }
74 
75     // Skip the fields already parsed by regidx_parse_tab
76     ss = (char*) line;
77     while ( *ss && isspace(*ss) ) ss++;
78     for (i=0; i<3; i++)
79     {
80         while ( *ss && !isspace(*ss) ) ss++;
81         if ( !*ss ) return -2;  // wrong number of fields
82         while ( *ss && isspace(*ss) ) ss++;
83     }
84     if ( !*ss ) return -2;
85 
86     // Parse the payload
87     char *se = ss;
88     while ( *se && !isspace(*se) ) se++;
89     if ( !*se || se==ss ) error("Could not parse: %s\n", line);
90     ploidy->tmp_str.l = 0;
91     kputsn(ss,se-ss,&ploidy->tmp_str);
92 
93     sex_ploidy_t *sp = (sex_ploidy_t*) payload;
94     if ( khash_str2int_get(sex2id, ploidy->tmp_str.s, &sp->sex) != 0 )
95     {
96         ploidy->nsex++;
97         hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
98         ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s);
99         sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
100         ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
101         ploidy->sex2dflt[ploidy->nsex-1] = -1;
102     }
103 
104     ss = se;
105     while ( *se && isspace(*se) ) se++;
106     if ( !*se ) error("Could not parse: %s\n", line);
107     sp->ploidy = strtol(ss,&se,10);
108     if ( ss==se ) error("Could not parse: %s\n", line);
109     if ( ploidy->min<0 || sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
110     if ( ploidy->max<0 || sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
111 
112     // Special case, chr="*" stands for a default value
113     if ( default_ploidy_def )
114     {
115         ploidy->sex2dflt[ploidy->nsex-1] = sp->ploidy;
116         return -1;
117     }
118 
119     return 0;
120 }
121 
_set_defaults(ploidy_t * ploidy,int dflt)122 static void _set_defaults(ploidy_t *ploidy, int dflt)
123 {
124     int i;
125     if ( khash_str2int_get(ploidy->sex2id, "*", &i) == 0 ) dflt = ploidy->sex2dflt[i];
126     for (i=0; i<ploidy->nsex; i++)
127         if ( ploidy->sex2dflt[i]==-1 ) ploidy->sex2dflt[i] = dflt;
128 
129     ploidy->dflt = dflt;
130     if ( ploidy->min<0 || dflt < ploidy->min ) ploidy->min = dflt;
131     if ( ploidy->max<0 || dflt > ploidy->max ) ploidy->max = dflt;
132 }
133 
ploidy_init(const char * fname,int dflt)134 ploidy_t *ploidy_init(const char *fname, int dflt)
135 {
136     ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
137     if ( !pld ) return NULL;
138 
139     pld->min = pld->max = -1;
140     pld->sex2id = khash_str2int_init();
141     pld->idx = regidx_init(fname,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
142     if ( !pld->idx )
143     {
144         ploidy_destroy(pld);
145         return NULL;
146     }
147     pld->itr = regitr_init(pld->idx);
148     _set_defaults(pld,dflt);
149     return pld;
150 }
151 
ploidy_init_string(const char * str,int dflt)152 ploidy_t *ploidy_init_string(const char *str, int dflt)
153 {
154     ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
155     if ( !pld ) return NULL;
156 
157     pld->min = pld->max = -1;
158     pld->sex2id = khash_str2int_init();
159     pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
160     pld->itr = regitr_init(pld->idx);
161 
162     kstring_t tmp = {0,0,0};
163     const char *ss = str;
164     while ( *ss )
165     {
166         while ( *ss && isspace(*ss) ) ss++;
167         const char *se = ss;
168         while ( *se && *se!='\r' && *se!='\n' ) se++;
169         tmp.l = 0;
170         kputsn(ss, se-ss, &tmp);
171         regidx_insert(pld->idx,tmp.s);
172         while ( *se && isspace(*se) ) se++;
173         ss = se;
174     }
175     free(tmp.s);
176 
177     _set_defaults(pld,dflt);
178     return pld;
179 }
180 
ploidy_destroy(ploidy_t * ploidy)181 void ploidy_destroy(ploidy_t *ploidy)
182 {
183     if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id);
184     if ( ploidy->itr ) regitr_destroy(ploidy->itr);
185     if ( ploidy->idx ) regidx_destroy(ploidy->idx);
186     free(ploidy->id2sex);
187     free(ploidy->tmp_str.s);
188     free(ploidy->sex2dflt);
189     free(ploidy);
190 }
191 
ploidy_query(ploidy_t * ploidy,char * seq,int pos,int * sex2ploidy,int * min,int * max)192 int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max)
193 {
194     int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, ploidy->itr);
195 
196     if ( !sex2ploidy && !min && !max ) return ret;
197 
198     if ( !ret )
199     {
200         // no overlap
201         if ( min ) *min = ploidy->dflt;
202         if ( max ) *max = ploidy->dflt;
203         if ( sex2ploidy )
204             for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->sex2dflt[i];
205         return 0;
206     }
207 
208     int _min = INT_MAX, _max = -1;
209     if ( sex2ploidy ) for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->dflt;
210 
211     while ( regitr_overlap(ploidy->itr) )
212     {
213         int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex;
214         int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy;
215         if ( pld!=ploidy->dflt )
216         {
217             if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
218             if ( _min > pld ) _min = pld;
219             if ( _max < pld ) _max = pld;
220         }
221     }
222     if ( _max==-1 ) _max = _min = ploidy->dflt;
223     if ( max ) *max = _max;
224     if ( min ) *min = _min;
225 
226     return 1;
227 }
228 
ploidy_nsex(ploidy_t * ploidy)229 int ploidy_nsex(ploidy_t *ploidy)
230 {
231     return ploidy->nsex;
232 }
233 
ploidy_id2sex(ploidy_t * ploidy,int id)234 char *ploidy_id2sex(ploidy_t *ploidy, int id)
235 {
236     if ( id<0 || id>=ploidy->nsex ) return NULL;
237     return ploidy->id2sex[id];
238 }
239 
ploidy_sex2id(ploidy_t * ploidy,char * sex)240 int ploidy_sex2id(ploidy_t *ploidy, char *sex)
241 {
242     int id;
243     if ( khash_str2int_get(ploidy->sex2id,sex,&id)!=0 ) return -1;
244     return id;
245 }
246 
ploidy_add_sex(ploidy_t * ploidy,const char * sex)247 int ploidy_add_sex(ploidy_t *ploidy, const char *sex)
248 {
249     int id;
250     if ( khash_str2int_get(ploidy->sex2id, sex, &id)==0 ) return id;
251     ploidy->nsex++;
252     hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
253     ploidy->id2sex[ploidy->nsex-1] = strdup(sex);
254     ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
255     ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt;
256     return khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
257 }
258 
ploidy_max(ploidy_t * ploidy)259 int ploidy_max(ploidy_t *ploidy)
260 {
261     return ploidy->dflt > ploidy->max ? ploidy->dflt : ploidy->max;
262 }
263 
ploidy_min(ploidy_t * ploidy)264 int ploidy_min(ploidy_t *ploidy)
265 {
266     return ploidy->dflt < ploidy->min ? ploidy->dflt : ploidy->min;
267 }
268 
269