1 #include "bcftools.pysam.h"
2
3 /*
4 Copyright (C) 2014-2016 Genome Research Ltd.
5
6 Author: Petr Danecek <pd3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 THE SOFTWARE.
25 */
26
27 #include <htslib/khash_str2int.h>
28 #include <htslib/kseq.h>
29 #include <htslib/hts.h>
30 #include "bcftools.h"
31 #include "ploidy.h"
32
33 struct _ploidy_t
34 {
35 int nsex, msex; // number of genders, m:number of allocated elements in id2sex array
36 int dflt, min, max; // ploidy: default, min and max (only explicitly listed)
37 int *sex2dflt;
38 regidx_t *idx;
39 regitr_t *itr;
40 void *sex2id;
41 char **id2sex;
42 kstring_t tmp_str;
43 };
44
45 typedef struct
46 {
47 int sex, ploidy;
48 }
49 sex_ploidy_t;
50
51
ploidy_regions(ploidy_t * ploidy)52 regidx_t *ploidy_regions(ploidy_t *ploidy)
53 {
54 return ploidy->idx;
55 }
56
ploidy_parse(const char * line,char ** chr_beg,char ** chr_end,uint32_t * beg,uint32_t * end,void * payload,void * usr)57 int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
58 {
59 int i, ret;
60 ploidy_t *ploidy = (ploidy_t*) usr;
61 void *sex2id = ploidy->sex2id;
62
63 // Check for special case of default ploidy "* * * <sex> <ploidy>"
64 int default_ploidy_def = 0;
65
66 char *ss = (char*) line;
67 while ( *ss && isspace(*ss) ) ss++;
68 if ( ss[0]=='*' && (!ss[1] || isspace(ss[1])) )
69 default_ploidy_def = 1; // definition of default ploidy, chr="*"
70 else
71 {
72 // Fill CHR,FROM,TO
73 ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL);
74 if ( ret!=0 ) return ret;
75 }
76
77 // Skip the fields already parsed by regidx_parse_tab
78 ss = (char*) line;
79 while ( *ss && isspace(*ss) ) ss++;
80 for (i=0; i<3; i++)
81 {
82 while ( *ss && !isspace(*ss) ) ss++;
83 if ( !*ss ) return -2; // wrong number of fields
84 while ( *ss && isspace(*ss) ) ss++;
85 }
86 if ( !*ss ) return -2;
87
88 // Parse the payload
89 char *se = ss;
90 while ( *se && !isspace(*se) ) se++;
91 if ( !*se || se==ss ) error("Could not parse: %s\n", line);
92 ploidy->tmp_str.l = 0;
93 kputsn(ss,se-ss,&ploidy->tmp_str);
94
95 sex_ploidy_t *sp = (sex_ploidy_t*) payload;
96 if ( khash_str2int_get(sex2id, ploidy->tmp_str.s, &sp->sex) != 0 )
97 {
98 ploidy->nsex++;
99 hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
100 ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s);
101 sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
102 ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
103 ploidy->sex2dflt[ploidy->nsex-1] = -1;
104 }
105
106 ss = se;
107 while ( *se && isspace(*se) ) se++;
108 if ( !*se ) error("Could not parse: %s\n", line);
109 sp->ploidy = strtol(ss,&se,10);
110 if ( ss==se ) error("Could not parse: %s\n", line);
111 if ( ploidy->min<0 || sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
112 if ( ploidy->max<0 || sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
113
114 // Special case, chr="*" stands for a default value
115 if ( default_ploidy_def )
116 {
117 ploidy->sex2dflt[ploidy->nsex-1] = sp->ploidy;
118 return -1;
119 }
120
121 return 0;
122 }
123
_set_defaults(ploidy_t * ploidy,int dflt)124 static void _set_defaults(ploidy_t *ploidy, int dflt)
125 {
126 int i;
127 if ( khash_str2int_get(ploidy->sex2id, "*", &i) == 0 ) dflt = ploidy->sex2dflt[i];
128 for (i=0; i<ploidy->nsex; i++)
129 if ( ploidy->sex2dflt[i]==-1 ) ploidy->sex2dflt[i] = dflt;
130
131 ploidy->dflt = dflt;
132 if ( ploidy->min<0 || dflt < ploidy->min ) ploidy->min = dflt;
133 if ( ploidy->max<0 || dflt > ploidy->max ) ploidy->max = dflt;
134 }
135
ploidy_init(const char * fname,int dflt)136 ploidy_t *ploidy_init(const char *fname, int dflt)
137 {
138 ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
139 if ( !pld ) return NULL;
140
141 pld->min = pld->max = -1;
142 pld->sex2id = khash_str2int_init();
143 pld->idx = regidx_init(fname,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
144 if ( !pld->idx )
145 {
146 ploidy_destroy(pld);
147 return NULL;
148 }
149 pld->itr = regitr_init(pld->idx);
150 _set_defaults(pld,dflt);
151 return pld;
152 }
153
ploidy_init_string(const char * str,int dflt)154 ploidy_t *ploidy_init_string(const char *str, int dflt)
155 {
156 ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
157 if ( !pld ) return NULL;
158
159 pld->min = pld->max = -1;
160 pld->sex2id = khash_str2int_init();
161 pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
162 pld->itr = regitr_init(pld->idx);
163
164 kstring_t tmp = {0,0,0};
165 const char *ss = str;
166 while ( *ss )
167 {
168 while ( *ss && isspace(*ss) ) ss++;
169 const char *se = ss;
170 while ( *se && *se!='\r' && *se!='\n' ) se++;
171 tmp.l = 0;
172 kputsn(ss, se-ss, &tmp);
173 regidx_insert(pld->idx,tmp.s);
174 while ( *se && isspace(*se) ) se++;
175 ss = se;
176 }
177 free(tmp.s);
178
179 _set_defaults(pld,dflt);
180 return pld;
181 }
182
ploidy_destroy(ploidy_t * ploidy)183 void ploidy_destroy(ploidy_t *ploidy)
184 {
185 if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id);
186 if ( ploidy->itr ) regitr_destroy(ploidy->itr);
187 if ( ploidy->idx ) regidx_destroy(ploidy->idx);
188 free(ploidy->id2sex);
189 free(ploidy->tmp_str.s);
190 free(ploidy->sex2dflt);
191 free(ploidy);
192 }
193
ploidy_query(ploidy_t * ploidy,char * seq,int pos,int * sex2ploidy,int * min,int * max)194 int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max)
195 {
196 int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, ploidy->itr);
197
198 if ( !sex2ploidy && !min && !max ) return ret;
199
200 if ( !ret )
201 {
202 // no overlap
203 if ( min ) *min = ploidy->dflt;
204 if ( max ) *max = ploidy->dflt;
205 if ( sex2ploidy )
206 for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->sex2dflt[i];
207 return 0;
208 }
209
210 int _min = INT_MAX, _max = -1;
211 if ( sex2ploidy ) for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->dflt;
212
213 while ( regitr_overlap(ploidy->itr) )
214 {
215 int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex;
216 int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy;
217 if ( pld!=ploidy->dflt )
218 {
219 if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
220 if ( _min > pld ) _min = pld;
221 if ( _max < pld ) _max = pld;
222 }
223 }
224 if ( _max==-1 ) _max = _min = ploidy->dflt;
225 if ( max ) *max = _max;
226 if ( min ) *min = _min;
227
228 return 1;
229 }
230
ploidy_nsex(ploidy_t * ploidy)231 int ploidy_nsex(ploidy_t *ploidy)
232 {
233 return ploidy->nsex;
234 }
235
ploidy_id2sex(ploidy_t * ploidy,int id)236 char *ploidy_id2sex(ploidy_t *ploidy, int id)
237 {
238 if ( id<0 || id>=ploidy->nsex ) return NULL;
239 return ploidy->id2sex[id];
240 }
241
ploidy_sex2id(ploidy_t * ploidy,char * sex)242 int ploidy_sex2id(ploidy_t *ploidy, char *sex)
243 {
244 int id;
245 if ( khash_str2int_get(ploidy->sex2id,sex,&id)!=0 ) return -1;
246 return id;
247 }
248
ploidy_add_sex(ploidy_t * ploidy,const char * sex)249 int ploidy_add_sex(ploidy_t *ploidy, const char *sex)
250 {
251 int id;
252 if ( khash_str2int_get(ploidy->sex2id, sex, &id)==0 ) return id;
253 ploidy->nsex++;
254 hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
255 ploidy->id2sex[ploidy->nsex-1] = strdup(sex);
256 ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
257 ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt;
258 return khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
259 }
260
ploidy_max(ploidy_t * ploidy)261 int ploidy_max(ploidy_t *ploidy)
262 {
263 return ploidy->dflt > ploidy->max ? ploidy->dflt : ploidy->max;
264 }
265
ploidy_min(ploidy_t * ploidy)266 int ploidy_min(ploidy_t *ploidy)
267 {
268 return ploidy->dflt < ploidy->min ? ploidy->dflt : ploidy->min;
269 }
270
271