1 /*
2 Copyright (C) 2017 Genome Research Ltd.
3
4 Author: Petr Danecek <pd3@sanger.ac.uk>
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23 */
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <strings.h>
28 #include <getopt.h>
29 #include <stdarg.h>
30 #include <stdint.h>
31 #include <htslib/vcf.h>
32 #include <htslib/synced_bcf_reader.h>
33 #include <htslib/vcfutils.h>
34 #include <htslib/kseq.h>
35 #include <inttypes.h>
36 #include <unistd.h>
37 #include "bcftools.h"
38
39 typedef struct
40 {
41 int argc;
42 char **argv, *fname, *region, **regs;
43 int region_is_file, nregs, regs_free;
44 int *smpl, nsmpl, *nsites, min_sites, gt_id;
45 kstring_t tmps;
46 bcf1_t *rec;
47 tbx_t *tbx;
48 hts_idx_t *idx;
49 hts_itr_t *itr;
50 htsFile *fp;
51 bcf_hdr_t *hdr;
52 }
53 args_t;
54
about(void)55 const char *about(void)
56 {
57 return "Print samples without genotypes in a region or chromosome\n";
58 }
59
usage_text(void)60 static const char *usage_text(void)
61 {
62 return
63 "\n"
64 "About: Print samples without genotypess in a region (-r/-R) or chromosome (the default)\n"
65 "\n"
66 "Usage: bcftools +check-sparsity <file.vcf.gz> [Plugin Options]\n"
67 "Plugin options:\n"
68 " -n, --n-markers <int> minimum number of required markers [1]\n"
69 " -r, --regions <chr:beg-end> restrict to comma-separated list of regions\n"
70 " -R, --regions-file <file> restrict to regions listed in a file\n"
71 "\n";
72 }
73
init_data(args_t * args)74 static void init_data(args_t *args)
75 {
76 args->fp = hts_open(args->fname,"r");
77 if ( !args->fp ) error("Could not read %s\n", args->fname);
78 args->hdr = bcf_hdr_read(args->fp);
79 if ( !args->hdr ) error("Could not read the header: %s\n", args->fname);
80
81 args->rec = bcf_init1();
82 args->gt_id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"GT");
83 if ( args->gt_id<0 ) error("Error: GT field is not present\n");
84
85 int i;
86 args->nsmpl = bcf_hdr_nsamples(args->hdr);
87 args->nsites = (int*) calloc(args->nsmpl, sizeof(int));
88 args->smpl = (int*) malloc(sizeof(int)*args->nsmpl);
89 for (i=0; i<args->nsmpl; i++) args->smpl[i] = i;
90
91 if ( strcmp("-",args->fname) ) // not reading from stdin
92 {
93 if ( hts_get_format(args->fp)->format==vcf )
94 {
95 args->tbx = tbx_index_load(args->fname);
96 if ( !args->tbx && args->region ) error("Could not load the VCF index, please drop the -r/-R option\n");
97 }
98 else if ( hts_get_format(args->fp)->format==bcf )
99 {
100 args->idx = bcf_index_load(args->fname);
101 if ( !args->idx && args->region ) error("Could not load the BCF index, please drop the -r/-R option\n");
102 }
103 }
104 else if ( args->region ) error("Cannot use index with this file, please drop the -r/-R option\n");
105
106 if ( args->tbx || args->idx )
107 {
108 if ( args->region )
109 {
110 args->regs = hts_readlist(args->region, args->region_is_file, &args->nregs);
111 if ( !args->regs ) error("Could not parse regions: %s\n", args->region);
112 args->regs_free = 1;
113 }
114 else
115 args->regs = (char**) (args->tbx ? tbx_seqnames(args->tbx, &args->nregs) : bcf_index_seqnames(args->idx, args->hdr, &args->nregs));
116 }
117 }
destroy_data(args_t * args)118 static void destroy_data(args_t *args)
119 {
120 int i;
121 if ( args->regs_free )
122 for (i=0; i<args->nregs; i++) free(args->regs[i]);
123 free(args->regs);
124 bcf_hdr_destroy(args->hdr);
125 bcf_destroy(args->rec);
126 free(args->tmps.s);
127 free(args->smpl);
128 free(args->nsites);
129 if ( args->itr ) hts_itr_destroy(args->itr);
130 if ( args->tbx ) tbx_destroy(args->tbx);
131 if ( args->idx ) hts_idx_destroy(args->idx);
132 if ( hts_close(args->fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fname);
133 }
134
report(args_t * args,const char * reg)135 static void report(args_t *args, const char *reg)
136 {
137 int i;
138 for (i=0; i<args->nsmpl; i++)
139 printf("%s\t%s\n", reg, args->hdr->samples[args->smpl[i]]);
140 args->nsmpl = bcf_hdr_nsamples(args->hdr);
141 for (i=0; i<args->nsmpl; i++) args->smpl[i] = i;
142 memset(args->nsites, 0, sizeof(int)*args->nsmpl);
143 }
test_region(args_t * args,char * reg)144 static void test_region(args_t *args, char *reg)
145 {
146 if ( args->tbx )
147 {
148 args->itr = tbx_itr_querys(args->tbx,reg);
149 if ( !args->itr ) return;
150 }
151 else if ( args->idx )
152 {
153 args->itr = bcf_itr_querys(args->idx,args->hdr,reg);
154 if ( !args->itr ) return;
155 }
156
157 int ret,i, rid = -1, nread = 0;
158 while (1)
159 {
160 if ( args->tbx )
161 {
162 if ( (ret=tbx_itr_next(args->fp, args->tbx, args->itr, &args->tmps)) < 0 ) break; // no more lines
163 ret = vcf_parse1(&args->tmps, args->hdr, args->rec);
164 if ( ret<0 ) error("Could not parse the line: %s\n", args->tmps.s);
165 }
166 else if ( args->idx )
167 {
168 ret = bcf_itr_next(args->fp, args->itr, args->rec);
169 if ( ret < -1 ) error("Could not parse a line from %s\n", reg);
170 if ( ret < 0 ) break; // no more lines or an error
171 }
172 else
173 {
174 if ( args->fp->format.format==vcf )
175 {
176 if ( (ret=hts_getline(args->fp, KS_SEP_LINE, &args->tmps)) < 0 ) break; // no more lines
177 ret = vcf_parse1(&args->tmps, args->hdr, args->rec);
178 if ( ret<0 ) error("Could not parse the line: %s\n", args->tmps.s);
179 }
180 else if ( args->fp->format.format==bcf )
181 {
182 ret = bcf_read1(args->fp, args->hdr, args->rec);
183 if ( ret < -1 ) error("Could not parse %s\n", args->fname);
184 if ( ret < 0 ) break; // no more lines or an error
185 }
186 if ( rid!=-1 && rid!=args->rec->rid )
187 {
188 report(args, bcf_hdr_id2name(args->hdr,rid));
189 nread = 0;
190 }
191 rid = args->rec->rid;
192 }
193
194 bcf_unpack(args->rec, BCF_UN_FMT);
195 bcf_fmt_t *fmt_gt = NULL;
196 for (i=0; i<args->rec->n_fmt; i++)
197 if ( args->rec->d.fmt[i].id==args->gt_id ) { fmt_gt = &args->rec->d.fmt[i]; break; }
198 if ( !fmt_gt ) continue; // no GT tag
199 if ( fmt_gt->n==0 ) continue; // empty?!
200 if ( fmt_gt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8!\n");
201
202 // update the array of missing samples
203 for (i=0; i<args->nsmpl; i++)
204 {
205 int8_t *ptr = (int8_t*) (fmt_gt->p + args->smpl[i]*fmt_gt->size);
206 int ial = 0;
207 for (ial=0; ial<fmt_gt->n; ial++)
208 if ( ptr[ial]==bcf_gt_missing || ptr[ial]==bcf_int8_vector_end ) break;
209 if ( ial==0 ) continue; // missing
210 if ( ++args->nsites[i] < args->min_sites ) continue;
211 if ( i+1<args->nsmpl )
212 {
213 memmove(args->smpl+i, args->smpl+i+1, sizeof(int)*(args->nsmpl-i-1));
214 memmove(args->nsites+i, args->nsites+i+1, sizeof(int)*(args->nsmpl-i-1));
215 }
216 args->nsmpl--;
217 i--;
218 }
219 nread = 1;
220 if ( !args->nsmpl ) break;
221 }
222 if ( nread ) report(args, rid==-1 ? reg : bcf_hdr_id2name(args->hdr,rid));
223
224 tbx_itr_destroy(args->itr);
225 args->itr = NULL;
226 }
227
run(int argc,char ** argv)228 int run(int argc, char **argv)
229 {
230 args_t *args = (args_t*) calloc(1,sizeof(args_t));
231 args->argc = argc; args->argv = argv;
232 args->min_sites = 1;
233 static struct option loptions[] =
234 {
235 {"n-markers",required_argument,NULL,'n'},
236 {"regions",required_argument,NULL,'r'},
237 {"regions-file",required_argument,NULL,'R'},
238 {NULL,0,NULL,0}
239 };
240 int c,i;
241 char *tmp;
242 while ((c = getopt_long(argc, argv, "vr:R:n:",loptions,NULL)) >= 0)
243 {
244 switch (c)
245 {
246 case 'n':
247 args->min_sites = strtol(optarg,&tmp,10);
248 if ( *tmp ) error("Could not parse: -n %s\n", optarg);
249 break;
250 case 'R': args->region_is_file = 1; // fall-through
251 case 'r': args->region = optarg; break;
252 case 'h':
253 case '?':
254 default: error("%s", usage_text()); break;
255 }
256 }
257
258 if ( optind>=argc )
259 {
260 if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin
261 else error("%s",usage_text());
262 }
263 else args->fname = argv[optind];
264 init_data(args);
265
266 for (i=0; i<args->nregs; i++) test_region(args, args->regs[i]);
267 if ( !args->nregs ) test_region(args, NULL);
268
269 destroy_data(args);
270 free(args);
271 return 0;
272 }
273
274