1 /*
2 Copyright (C) 2014-2016 Genome Research Ltd.
3
4 Author: Petr Danecek <pd3@sanger.ac.uk>
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23 */
24
25 #include <htslib/khash_str2int.h>
26 #include <htslib/kseq.h>
27 #include <htslib/hts.h>
28 #include "bcftools.h"
29 #include "ploidy.h"
30
31 struct _ploidy_t
32 {
33 int nsex, msex; // number of genders, m:number of allocated elements in id2sex array
34 int dflt, min, max; // ploidy: default, min and max (only explicitly listed)
35 int *sex2dflt;
36 regidx_t *idx;
37 regitr_t *itr;
38 void *sex2id;
39 char **id2sex;
40 kstring_t tmp_str;
41 };
42
43 typedef struct
44 {
45 int sex, ploidy;
46 }
47 sex_ploidy_t;
48
49
ploidy_regions(ploidy_t * ploidy)50 regidx_t *ploidy_regions(ploidy_t *ploidy)
51 {
52 return ploidy->idx;
53 }
54
ploidy_parse(const char * line,char ** chr_beg,char ** chr_end,uint32_t * beg,uint32_t * end,void * payload,void * usr)55 int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
56 {
57 int i, ret;
58 ploidy_t *ploidy = (ploidy_t*) usr;
59 void *sex2id = ploidy->sex2id;
60
61 // Check for special case of default ploidy "* * * <sex> <ploidy>"
62 int default_ploidy_def = 0;
63
64 char *ss = (char*) line;
65 while ( *ss && isspace(*ss) ) ss++;
66 if ( ss[0]=='*' && (!ss[1] || isspace(ss[1])) )
67 default_ploidy_def = 1; // definition of default ploidy, chr="*"
68 else
69 {
70 // Fill CHR,FROM,TO
71 ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL);
72 if ( ret!=0 ) return ret;
73 }
74
75 // Skip the fields already parsed by regidx_parse_tab
76 ss = (char*) line;
77 while ( *ss && isspace(*ss) ) ss++;
78 for (i=0; i<3; i++)
79 {
80 while ( *ss && !isspace(*ss) ) ss++;
81 if ( !*ss ) return -2; // wrong number of fields
82 while ( *ss && isspace(*ss) ) ss++;
83 }
84 if ( !*ss ) return -2;
85
86 // Parse the payload
87 char *se = ss;
88 while ( *se && !isspace(*se) ) se++;
89 if ( !*se || se==ss ) error("Could not parse: %s\n", line);
90 ploidy->tmp_str.l = 0;
91 kputsn(ss,se-ss,&ploidy->tmp_str);
92
93 sex_ploidy_t *sp = (sex_ploidy_t*) payload;
94 if ( khash_str2int_get(sex2id, ploidy->tmp_str.s, &sp->sex) != 0 )
95 {
96 ploidy->nsex++;
97 hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
98 ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s);
99 sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
100 ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
101 ploidy->sex2dflt[ploidy->nsex-1] = -1;
102 }
103
104 ss = se;
105 while ( *se && isspace(*se) ) se++;
106 if ( !*se ) error("Could not parse: %s\n", line);
107 sp->ploidy = strtol(ss,&se,10);
108 if ( ss==se ) error("Could not parse: %s\n", line);
109 if ( ploidy->min<0 || sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
110 if ( ploidy->max<0 || sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
111
112 // Special case, chr="*" stands for a default value
113 if ( default_ploidy_def )
114 {
115 ploidy->sex2dflt[ploidy->nsex-1] = sp->ploidy;
116 return -1;
117 }
118
119 return 0;
120 }
121
_set_defaults(ploidy_t * ploidy,int dflt)122 static void _set_defaults(ploidy_t *ploidy, int dflt)
123 {
124 int i;
125 if ( khash_str2int_get(ploidy->sex2id, "*", &i) == 0 ) dflt = ploidy->sex2dflt[i];
126 for (i=0; i<ploidy->nsex; i++)
127 if ( ploidy->sex2dflt[i]==-1 ) ploidy->sex2dflt[i] = dflt;
128
129 ploidy->dflt = dflt;
130 if ( ploidy->min<0 || dflt < ploidy->min ) ploidy->min = dflt;
131 if ( ploidy->max<0 || dflt > ploidy->max ) ploidy->max = dflt;
132 }
133
ploidy_init(const char * fname,int dflt)134 ploidy_t *ploidy_init(const char *fname, int dflt)
135 {
136 ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
137 if ( !pld ) return NULL;
138
139 pld->min = pld->max = -1;
140 pld->sex2id = khash_str2int_init();
141 pld->idx = regidx_init(fname,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
142 if ( !pld->idx )
143 {
144 ploidy_destroy(pld);
145 return NULL;
146 }
147 pld->itr = regitr_init(pld->idx);
148 _set_defaults(pld,dflt);
149 return pld;
150 }
151
ploidy_init_string(const char * str,int dflt)152 ploidy_t *ploidy_init_string(const char *str, int dflt)
153 {
154 ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
155 if ( !pld ) return NULL;
156
157 pld->min = pld->max = -1;
158 pld->sex2id = khash_str2int_init();
159 pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
160 pld->itr = regitr_init(pld->idx);
161
162 kstring_t tmp = {0,0,0};
163 const char *ss = str;
164 while ( *ss )
165 {
166 while ( *ss && isspace(*ss) ) ss++;
167 const char *se = ss;
168 while ( *se && *se!='\r' && *se!='\n' ) se++;
169 tmp.l = 0;
170 kputsn(ss, se-ss, &tmp);
171 regidx_insert(pld->idx,tmp.s);
172 while ( *se && isspace(*se) ) se++;
173 ss = se;
174 }
175 free(tmp.s);
176
177 _set_defaults(pld,dflt);
178 return pld;
179 }
180
ploidy_destroy(ploidy_t * ploidy)181 void ploidy_destroy(ploidy_t *ploidy)
182 {
183 if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id);
184 if ( ploidy->itr ) regitr_destroy(ploidy->itr);
185 if ( ploidy->idx ) regidx_destroy(ploidy->idx);
186 free(ploidy->id2sex);
187 free(ploidy->tmp_str.s);
188 free(ploidy->sex2dflt);
189 free(ploidy);
190 }
191
ploidy_query(ploidy_t * ploidy,char * seq,int pos,int * sex2ploidy,int * min,int * max)192 int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max)
193 {
194 int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, ploidy->itr);
195
196 if ( !sex2ploidy && !min && !max ) return ret;
197
198 if ( !ret )
199 {
200 // no overlap
201 if ( min ) *min = ploidy->dflt;
202 if ( max ) *max = ploidy->dflt;
203 if ( sex2ploidy )
204 for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->sex2dflt[i];
205 return 0;
206 }
207
208 int _min = INT_MAX, _max = -1;
209 if ( sex2ploidy ) for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->dflt;
210
211 while ( regitr_overlap(ploidy->itr) )
212 {
213 int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex;
214 int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy;
215 if ( pld!=ploidy->dflt )
216 {
217 if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
218 if ( _min > pld ) _min = pld;
219 if ( _max < pld ) _max = pld;
220 }
221 }
222 if ( _max==-1 ) _max = _min = ploidy->dflt;
223 if ( max ) *max = _max;
224 if ( min ) *min = _min;
225
226 return 1;
227 }
228
ploidy_nsex(ploidy_t * ploidy)229 int ploidy_nsex(ploidy_t *ploidy)
230 {
231 return ploidy->nsex;
232 }
233
ploidy_id2sex(ploidy_t * ploidy,int id)234 char *ploidy_id2sex(ploidy_t *ploidy, int id)
235 {
236 if ( id<0 || id>=ploidy->nsex ) return NULL;
237 return ploidy->id2sex[id];
238 }
239
ploidy_sex2id(ploidy_t * ploidy,char * sex)240 int ploidy_sex2id(ploidy_t *ploidy, char *sex)
241 {
242 int id;
243 if ( khash_str2int_get(ploidy->sex2id,sex,&id)!=0 ) return -1;
244 return id;
245 }
246
ploidy_add_sex(ploidy_t * ploidy,const char * sex)247 int ploidy_add_sex(ploidy_t *ploidy, const char *sex)
248 {
249 int id;
250 if ( khash_str2int_get(ploidy->sex2id, sex, &id)==0 ) return id;
251 ploidy->nsex++;
252 hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex);
253 ploidy->id2sex[ploidy->nsex-1] = strdup(sex);
254 ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
255 ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt;
256 return khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
257 }
258
ploidy_max(ploidy_t * ploidy)259 int ploidy_max(ploidy_t *ploidy)
260 {
261 return ploidy->dflt > ploidy->max ? ploidy->dflt : ploidy->max;
262 }
263
ploidy_min(ploidy_t * ploidy)264 int ploidy_min(ploidy_t *ploidy)
265 {
266 return ploidy->dflt < ploidy->min ? ploidy->dflt : ploidy->min;
267 }
268
269