1 /* The MIT License
2
3 Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "profile_mendelian.h"
25
26 namespace
27 {
28
29 class HetStats
30 {
31 public:
32
33 std::string individual_id;
34 int32_t individual_index;
35 int32_t no_non_pseudoautosomal_variant_sites;
36 int32_t no_het;
37 int32_t no_hom_ref;
38 int32_t no_hom_alt;
39 float heterozygosity;
40 };
41
42 class Igor : Program
43 {
44 public:
45
46 std::string version;
47
48 ///////////
49 //options//
50 ///////////
51 std::string input_vcf_file;
52 std::string input_ped_file;
53 std::string output_vcf_file;
54 std::string output_tabulate_dir;
55 std::string output_pdf_file;
56 std::vector<GenomeInterval> intervals;
57 std::string interval_list;
58 int32_t min_depth;
59 float min_gq;
60 bool ignore_non_variants;
61 bool output_sites;
62
63 ///////
64 //i/o//
65 ///////
66 BCFOrderedReader *odr;
67 BCFOrderedWriter *odw;
68
69 ///////////////
70 //general use//
71 ///////////////
72 kstring_t variant;
73
74 //////////
75 //filter//
76 //////////
77 std::string fexp;
78 Filter filter;
79 bool filter_exists;
80
81 /////////
82 //stats//
83 /////////
84 uint32_t no_trios;
85 uint32_t no_dups;
86 uint32_t no_biallelic_variants;
87 uint32_t no_biallelic_variants_dups;
88 uint32_t no_multiallelic_variants;
89 uint32_t duplicate_biallelic_vntr_genotypes_concordant;
90 uint32_t duplicate_biallelic_vntr_genotypes_disconcordant;
91 uint32_t no_failed_min_depth;
92
93 //for biallelics
94 int32_t trio_genotypes[3][3][3];
95 int32_t trio_multiallelic_genotypes[4][2]; //number of distinct parental alleles (4) vs mendelian error or not
96 int32_t duplicate_genotypes[3][3];
97
98 //for multiallelics
99 std::vector<std::vector<std::vector<int32_t> > > trios_multiallelic_genotypes;
100
101 //for non pseudoautosomal variants
102 std::vector<HetStats> males;
103 std::vector<HetStats> females;
104
105 /////////
106 //tools//
107 /////////
108 Pedigree *pedigree;
109 VariantManip *vm;
110
Igor(int argc,char ** argv)111 Igor(int argc, char ** argv)
112 {
113 //////////////////////////
114 //options initialization//
115 //////////////////////////
116 try
117 {
118 std::string desc = "Profile Mendelian Errors.";
119
120 version = "0.5";
121 TCLAP::CmdLine cmd(desc, ' ', version);
122 VTOutput my; cmd.setOutput(&my);
123 TCLAP::ValueArg<std::string> arg_intervals("i", "i", "intervals", false, "", "str", cmd);
124 TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "str", cmd);
125 TCLAP::ValueArg<std::string> arg_input_ped_file("p", "p", "pedigree file", true, "", "str", cmd);
126 TCLAP::ValueArg<std::string> arg_fexp("f", "f", "filter expression []", false, "", "str", cmd);
127
128 TCLAP::ValueArg<std::string> arg_output_tabulate_dir("x", "x", "output latex directory [tabulate_mendelian]", false, "tabulate_mendelian", "str", cmd);
129 TCLAP::ValueArg<std::string> arg_output_pdf_file("y", "y", "output pdf file [mendelian.pdf]", false, "mendelian.pdf", "str", cmd);
130
131 TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF file annotated with site error statistics [\"\"]", false, "", "str", cmd);
132 TCLAP::ValueArg<int32_t> arg_min_depth("d", "d", "minimum depth", false, 0, "str", cmd);
133 TCLAP::ValueArg<float> arg_min_gq("q", "q", "minimum genotype quality", false, 2, "str", cmd);
134 TCLAP::SwitchArg arg_ignore_non_variants("n", "n", "ignore non variants", cmd, false);
135 TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "","file", cmd);
136
137 cmd.parse(argc, argv);
138
139 input_vcf_file = arg_input_vcf_file.getValue();
140 input_ped_file = arg_input_ped_file.getValue();
141 output_vcf_file = arg_output_vcf_file.getValue();
142 output_sites = output_vcf_file=="" ? false : true;
143 fexp = arg_fexp.getValue();
144 output_tabulate_dir = arg_output_tabulate_dir.getValue();
145 output_pdf_file = arg_output_pdf_file.getValue();
146 min_depth = arg_min_depth.getValue();
147 min_gq = arg_min_gq.getValue();
148 ignore_non_variants = arg_ignore_non_variants.getValue();
149 parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
150 }
151 catch (TCLAP::ArgException &e)
152 {
153 std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
154 abort();
155 }
156 };
157
initialize()158 void initialize()
159 {
160 //////////////////////
161 //i/o initialization//
162 //////////////////////
163 odr = new BCFOrderedReader(input_vcf_file, intervals);
164
165 if (bcf_hdr_nsamples(odr->hdr)==0)
166 {
167 fprintf(stderr, "[%s:%d %s] No samples in VCF file: %s\n", __FILE__, __LINE__, __FUNCTION__, input_vcf_file.c_str());
168 exit(1);
169 }
170
171 ///////////////////////////
172 //output VCF file //
173 ///////////////////////////
174 if (output_sites)
175 {
176 odw = new BCFOrderedWriter(output_vcf_file, 0);
177 bcf_hdr_t *hdr = bcf_hdr_subset(odr->hdr, 0, 0, 0);
178 odw->set_hdr(hdr);
179 bcf_hdr_destroy(hdr);
180 bcf_hdr_append(odw->hdr, "##INFO=<ID=MEN_DISC,Number=1,Type=Integer,Description=\"No. of sites with mendelian discordance.\">");
181 bcf_hdr_append(odw->hdr, "##INFO=<ID=MEN_INF,Number=1,Type=Integer,Description=\"No. of mendelian informative sites. For biallelics, excludes HOMREF trios and HET parents.\">");
182 bcf_hdr_append(odw->hdr, "##INFO=<ID=MEN_TOT,Number=1,Type=Integer,Description=\"No. of mendelian sample-site pairs.\">");
183 bcf_hdr_append(odw->hdr, "##INFO=<ID=DUP_DISC,Number=1,Type=Integer,Description=\"No. of sites with duplicate discordance.\">");
184 bcf_hdr_append(odw->hdr, "##INFO=<ID=DUP_TOT,Number=1,Type=Integer,Description=\"No. of duplicates sample-site pairs.\">");
185 bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_M_HOMREF,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with homozygous reference genotypes for males.\">");
186 bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_M_HET,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with heterozygous genotypes for males.\">");
187 bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_M_HOMALT,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with homozygous alternate genotypes for males.\">");
188 bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_F_HOMREF,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with homozygous reference genotypes for females.\">");
189 bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_F_HET,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with heterozygous genotypes for females.\">");
190 bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_F_HOMALT,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with homozygous alternate genotypes for females.\">");
191 bcf_hdr_sync(odw->hdr);
192 odw->write_hdr();
193 }
194
195 ///////////////////////////
196 //ped file initialization//
197 ///////////////////////////
198 pedigree = new Pedigree(input_ped_file);
199
200 /////////////////////////
201 //filter initialization//
202 /////////////////////////
203 filter.parse(fexp.c_str());
204 filter_exists = fexp!="";
205
206 ///////////////
207 //general use//
208 ///////////////
209 variant = {0,0,0};
210
211 ////////////////////////
212 //stats initialization//
213 ////////////////////////
214 no_trios = 0;
215 no_biallelic_variants = 0;
216 no_biallelic_variants_dups = 0;
217 no_multiallelic_variants = 0;
218 no_failed_min_depth = 0;
219 duplicate_biallelic_vntr_genotypes_concordant = 0;
220 duplicate_biallelic_vntr_genotypes_disconcordant = 0;
221
222 for (int32_t i=0; i<3; ++i)
223 {
224 for (int32_t j=0; j<3; ++j)
225 {
226 for (int32_t k=0; k<3; ++k)
227 {
228 trio_genotypes[i][j][k] = 0;
229 }
230 }
231 }
232
233 for (int32_t a=0; a<3; ++a)
234 {
235 for (int32_t b=0; b<3; ++b)
236 {
237 duplicate_genotypes[a][b] = 0;
238 }
239 }
240
241 for (int32_t a=0; a<4; ++a)
242 {
243 for (int32_t b=0; b<2; ++b)
244 {
245 trio_multiallelic_genotypes[a][b] = 0;
246 }
247 }
248
249 /////////
250 //tools//
251 /////////
252 vm = new VariantManip();
253 }
254
profile_mendelian()255 void profile_mendelian()
256 {
257 bcf_hdr_t *h = odr->hdr;
258 bcf1_t *v = bcf_init1();
259
260 Variant variant;
261
262 ///////////////////////
263 //initializing pedigree
264 ///////////////////////
265 std::vector<Trio> trios;
266 std::vector<Duplicate> dups;
267
268 std::vector<PEDRecord>& recs = pedigree->recs;
269
270 for (size_t i=0; i<recs.size(); ++i)
271 {
272 if (recs[i].is_trio())
273 {
274 int32_t individual_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, recs[i].individual[0].c_str());
275 int32_t father_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, recs[i].father.c_str());
276 int32_t mother_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, recs[i].mother.c_str());
277
278 if (individual_index>=0 && father_index>=0 && mother_index>=0)
279 {
280 Trio trio(individual_index, father_index, mother_index, recs[i].individual_sex);
281 trios.push_back(trio);
282 }
283 }
284
285 if (recs[i].is_duplicated())
286 {
287 PEDRecord& rec = recs[i];
288 int32_t no_pairs_added = 0;
289
290 int32_t individual_sex = rec.individual_sex;
291
292 for (size_t a=0; a<rec.individual.size()-1; ++a)
293 {
294 int32_t individual_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, rec.individual[a].c_str());
295
296 for (size_t b=a+1; b<rec.individual.size(); ++b)
297 {
298 int32_t duplicate_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, rec.individual[b].c_str());
299
300 Duplicate dup(individual_index, duplicate_index, individual_sex);
301 dups.push_back(dup);
302 ++no_pairs_added;
303 }
304 }
305 }
306
307 HetStats stats;
308 stats.individual_id = recs[i].individual[0];
309 stats.individual_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, recs[i].individual[0].c_str());;
310 stats.no_non_pseudoautosomal_variant_sites = 0;
311 stats.no_het = 0;
312 stats.no_hom_ref = 0;
313 stats.no_hom_alt = 0;
314 stats.heterozygosity = 0;
315
316 if (recs[i].is_female())
317 {
318 females.push_back(stats);
319 }
320 else
321 {
322 males.push_back(stats);
323 }
324 }
325
326 no_trios = trios.size();
327 no_dups = dups.size();
328
329 std::cerr << "No. of trios detected: " << trios.size() << "\n";
330 std::cerr << "No. of duplicates detected: " << dups.size() << "\n";
331 std::cerr << "No. of males detected: " << males.size() << "\n";
332 std::cerr << "No. of females detected: " << females.size() << "\n";
333
334 int32_t missing = 0;
335 int32_t mendel_homalt_err = 0;
336
337 int32_t nsample = bcf_hdr_get_n_sample(h);
338 int32_t *gts = NULL;
339 int32_t *dps = (int32_t *) malloc(nsample*sizeof(int32_t));
340 int32_t n = 0;
341 int32_t n_dp = nsample;
342
343 float *cgs = NULL;
344 int32_t n_cg = 0;
345
346 while(odr->read(v))
347 {
348 bcf_unpack(v, BCF_UN_IND);
349 int32_t vtype = vm->classify_variant(odr->hdr, v, variant);
350
351 if (filter_exists)
352 {
353 vm->classify_variant(odr->hdr, v, variant);
354 if (!filter.apply(odr->hdr, v, &variant))
355 {
356 continue;
357 }
358 }
359
360 int32_t no_alleles = bcf_get_n_allele(v);
361
362 //biallelic
363 if (no_alleles==2)
364 {
365 if (vtype!=VT_VNTR)
366 {
367 int k = bcf_get_genotypes(h, v, >s, &n);
368 int r = bcf_get_format_int32(h, v, "DP", &dps, &n_dp);
369
370 if (r==-1)
371 {
372 r = bcf_get_format_int32(h, v, "NR", &dps, &n_dp);
373
374 if (r==-1)
375 {
376 for (uint32_t i=0; i<nsample; ++i)
377 {
378 dps[i] = min_depth;
379 }
380 }
381 }
382
383 bool variant_used = false;
384
385 ///////////////////////
386 //mendelian concordance
387 ///////////////////////
388 int32_t no_mendelian_discordance = 0;
389 int32_t no_mendelian_informative_sites = 0;
390 int32_t no_mendelian_sample_site_pairs = 0;
391 for (int32_t i=0; i<trios.size(); ++i)
392 {
393 int32_t j = trios[i].father_index;
394 int32_t f1 = bcf_gt_allele(gts[(j<<1)]);
395 int32_t f2 = bcf_gt_allele(gts[(j<<1)+1]);
396 int32_t min_dp = dps[j];
397
398 j = trios[i].mother_index;
399 int32_t m1 = bcf_gt_allele(gts[(j<<1)]);
400 int32_t m2 = bcf_gt_allele(gts[(j<<1)+1]);
401 min_dp = dps[j]<min_dp ? dps[j] : min_dp;
402
403 j = trios[i].child_index;
404 int32_t c1 = bcf_gt_allele(gts[(j<<1)]);
405 int32_t c2 = bcf_gt_allele(gts[(j<<1)+1]);
406 min_dp = dps[j]<min_dp ? dps[j] : min_dp;
407
408 if (min_dp<min_depth)
409 {
410 ++no_failed_min_depth;
411 continue;
412 }
413
414 if (!(c1<0 || c2<0 || f1<0 || f2<0 || m1<0 || m2<0))
415 {
416 if (!ignore_non_variants || (c1+c2+f1+f2+m1+m2!=0))
417 {
418 ++trio_genotypes[f1+f2][m1+m2][c1+c2];
419 variant_used = true;
420
421 if (is_mendelian_discordant(c1, c2, f1, f2, m1, m2))
422 {
423 ++no_mendelian_discordance;
424 }
425
426 if (!(c1+c2+f1+f2+m1+m2==0) && !(f1==1&&f2==1&&m1==1&&m2==1))
427 {
428 ++no_mendelian_informative_sites;
429 }
430 ++no_mendelian_sample_site_pairs;
431 }
432 }
433 }
434 if (variant_used)
435 {
436 ++no_biallelic_variants;
437 }
438
439 if (output_sites)
440 {
441 // bcf_set_n_sample(v, 0);
442 bcf_update_info_int32(odw->hdr, v, "MEN_DISC", &no_mendelian_discordance, 1);
443 bcf_update_info_int32(odw->hdr, v, "MEN_INF", &no_mendelian_informative_sites, 1);
444 bcf_update_info_int32(odw->hdr, v, "MEN_TOT", &no_mendelian_sample_site_pairs, 1);
445 }
446
447 ///////////////////////
448 //duplicate concordance
449 ///////////////////////
450 if (dups.size()>0)
451 {
452 int32_t no_duplicate_discordance = 0;
453 int32_t no_duplicate_sample_site_pairs = 0;
454
455 for (int32_t i=0; i<dups.size(); ++i)
456 {
457 int32_t j = dups[i].individual_index;
458 int32_t a1 = bcf_gt_allele(gts[(j<<1)]);
459 int32_t a2 = bcf_gt_allele(gts[(j<<1)+1]);
460 int32_t min_dp = dps[j];
461
462 j = dups[i].duplicate_index;
463 int32_t b1 = bcf_gt_allele(gts[(j<<1)]);
464 int32_t b2 = bcf_gt_allele(gts[(j<<1)+1]);
465 min_dp = dps[j]<min_dp ? dps[j] : min_dp;
466
467 if (min_dp<min_depth)
468 {
469 ++no_failed_min_depth;
470 continue;
471 }
472
473 if (!(a1<0 || a2<0 || b1<0 || b2<0))
474 {
475 ++duplicate_genotypes[a1+a2][b1+b2];
476 variant_used = true;
477
478 if (is_duplicate_discordant(a1, a2, b1, b2))
479 {
480 ++no_duplicate_discordance;
481 }
482 ++no_duplicate_sample_site_pairs;
483 }
484 }
485 ++no_biallelic_variants_dups;
486
487 if (output_sites)
488 {
489 bcf_update_info_int32(odw->hdr, v, "DUP_DISC", &no_duplicate_discordance, 1);
490 bcf_update_info_int32(odw->hdr, v, "DUP_TOT", &no_duplicate_sample_site_pairs, 1);
491 }
492 }// end of iterating through dups
493 }
494 else //VNTR
495 {
496 int k = bcf_get_format_float(h, v, "CG", &cgs, &n_cg);
497 int r = bcf_get_format_int32(h, v, "DP", &dps, &n_dp);
498
499 if (r==-1)
500 {
501 r = bcf_get_format_int32(h, v, "NR", &dps, &n_dp);
502
503 if (r==-1)
504 {
505 for (uint32_t i=0; i<nsample; ++i)
506 {
507 dps[i] = min_depth;
508 }
509 }
510 }
511
512 bool variant_used = false;
513
514 ///////////////////////
515 //mendelian concordance
516 ///////////////////////
517 for (int32_t i=0; i<trios.size(); ++i)
518 {
519 int32_t j = trios[i].father_index;
520 float f1 = cgs[j<<1];
521 float f2 = cgs[(j<<1)+1];
522
523 j = trios[i].mother_index;
524 float m1 = cgs[j<<1];
525 float m2 = cgs[(j<<1)+1];
526
527 j = trios[i].child_index;
528 float c1 = cgs[j<<1];
529 float c2 = cgs[(j<<1)+1];
530
531 if (!(f1<0 || f2<0 || m1<0 || m2<0 || c1<0 || c2<0))
532 {
533 if (!ignore_non_variants || (f1+f2+m1+m2+c1+c2!=0))
534 {
535 variant_used = true;
536
537 //count number of distinct parental alleles
538 std::map<int32_t, int32_t> distinct_alleles;
539 distinct_alleles[f1] = 0;
540 distinct_alleles[f2] = 0;
541 distinct_alleles[m1] = 0;
542 distinct_alleles[m2] = 0;
543 int32_t no_distinct_parental_alleles = distinct_alleles.size();
544
545 //check if transmission is possible
546 if (((c1==f1||c1==f2) && (c2==m1||c2==m2)) ||
547 ((c2==f1||c2==f2) && (c1==m1||c1==m2)))
548 {
549 ++trio_multiallelic_genotypes[no_distinct_parental_alleles-1][0];
550 // std::cerr << "*******************************************\n";
551 // std::cerr << f1 << " " << f2 << " " << m1 << " " << m2 << "\n";
552 // std::cerr << c1 << " " << c2 << "\n";
553 }
554 else
555 {
556 ++trio_multiallelic_genotypes[no_distinct_parental_alleles-1][1];
557
558 // if (no_distinct_parental_alleles==4)
559 // {
560 // std::cerr << "==========================================\n";
561 // std::cerr << f1 << " " << f2 << " " << m1 << " " << m2 << "\n";
562 // std::cerr << c1 << " " << c2 << "\n";
563
564
565 // }
566 }
567 }
568 }
569 }
570 if (variant_used) ++no_multiallelic_variants;
571
572 ///////////////////////
573 //duplicate concordance
574 ///////////////////////
575 for (int32_t i=0; i<dups.size(); ++i)
576 {
577 int32_t j = dups[i].individual_index;
578 float a1 = cgs[j<<1];
579 float a2 = cgs[(j<<1)+1];
580
581 j = dups[i].duplicate_index;
582 float b1 = cgs[j<<1];
583 float b2 = cgs[(j<<1)+1];
584
585 if (!(a1<0 || a2<0 || b1<0 || b2<0))
586 {
587 variant_used = true;
588
589 //check if equal genotypes regardless of order
590 if (((a1==b1&&a2==b2) || (a1==b2||a2==b1)))
591 {
592 ++duplicate_biallelic_vntr_genotypes_concordant;
593 }
594 else
595 {
596 ++duplicate_biallelic_vntr_genotypes_disconcordant;
597 }
598 }
599 }
600 }
601 }
602 //multiallelics
603 else
604 {
605 int k = bcf_get_genotypes(h, v, >s, &n);
606 int r = bcf_get_format_int32(h, v, "DP", &dps, &n_dp);
607
608 if (r==-1)
609 {
610 //support platypus calls
611 r = bcf_get_format_int32(h, v, "NR", &dps, &n_dp);
612
613 if (r==-1)
614 {
615 for (uint32_t i=0; i<nsample; ++i)
616 {
617 dps[i] = min_depth;
618 }
619 }
620 }
621
622 bool variant_used = false;
623
624 for (int32_t i=0; i<trios.size(); ++i)
625 {
626 int32_t j = trios[i].father_index;
627 int32_t f1 = bcf_gt_allele(gts[(j<<1)]);
628 int32_t f2 = bcf_gt_allele(gts[(j<<1)+1]);
629 int32_t min_dp = dps[j];
630
631 j = trios[i].mother_index;
632 int32_t m1 = bcf_gt_allele(gts[(j<<1)]);
633 int32_t m2 = bcf_gt_allele(gts[(j<<1)+1]);
634 min_dp = dps[j]<min_dp ? dps[j] : min_dp;
635
636 j = trios[i].child_index;
637 int32_t c1 = bcf_gt_allele(gts[(j<<1)]);
638 int32_t c2 = bcf_gt_allele(gts[(j<<1)+1]);
639 min_dp = dps[j]<min_dp ? dps[j] : min_dp;
640
641 if (min_dp<min_depth)
642 {
643 ++no_failed_min_depth;
644 continue;
645 }
646
647 if (!(f1<0 || f2<0 || m1<0 || m2<0 || c1<0 || c2<0))
648 {
649 if (!ignore_non_variants || (f1+f2+m1+m2+c1+c2!=0))
650 {
651 variant_used = true;
652
653 //count number of distinct parental alleles
654 std::map<int32_t, int32_t> distinct_alleles;
655 distinct_alleles[f1] = 0;
656 distinct_alleles[f2] = 0;
657 distinct_alleles[m1] = 0;
658 distinct_alleles[m2] = 0;
659 int32_t no_distinct_parental_alleles = distinct_alleles.size();
660
661
662 //check if transmission is possible
663 if (((c1==f1||c1==f2) && (c2==m1||c2==m2)) ||
664 ((c2==f1||c2==f2) && (c1==m1||c1==m2)))
665 {
666 ++trio_multiallelic_genotypes[no_distinct_parental_alleles-1][0];
667 // std::cerr << "*******************************************\n";
668 // std::cerr << f1 << " " << f2 << " " << m1 << " " << m2 << "\n";
669 // std::cerr << c1 << " " << c2 << "\n";
670 }
671 else
672 {
673 ++trio_multiallelic_genotypes[no_distinct_parental_alleles-1][1];
674
675 // if (no_distinct_parental_alleles==4)
676 // {
677 // std::cerr << "==========================================\n";
678 // std::cerr << f1 << " " << f2 << " " << m1 << " " << m2 << "\n";
679 // std::cerr << c1 << " " << c2 << "\n";
680
681
682 // }
683 }
684
685 //implement 2 versions
686 //1. based on fixed genotypes
687
688 //mendelian error estimates based on hard counts.
689 //HOM HOM
690 //AA BB => AB
691 //BB CC => BC
692 //CC DD => CD
693
694 //HET HET
695 //AB AB => AA AB BB - can have errors
696 //AC AD => AA AC AD CD = can have
697 //AB CD => AC AD BC BD
698
699 //HOM HET
700 //AA AB => AA AB
701 //AA BC => AB AC
702
703 }
704 }
705 }//end procesing trios
706 if (variant_used) ++no_multiallelic_variants;
707
708 }
709
710 //ignores actual ploidy
711 //this analysis assumes that the
712 //sites are of ploidy one and that
713 //the genotyper for that site assumes
714 //that it is a diploid.
715 //we want to cosider multiallelics here too.
716 if (bcf_get_info_flag(odr->hdr, v, "NPAR", 0, 0))
717 {
718 //extract genotype array
719 int k = bcf_get_genotypes(h, v, >s, &n);
720 int r = bcf_get_format_int32(h, v, "DP", &dps, &n_dp);
721
722 if (r==-1)
723 {
724 r = bcf_get_format_int32(h, v, "NR", &dps, &n_dp);
725
726 if (r==-1)
727 {
728 for (uint32_t i=0; i<nsample; ++i)
729 {
730 dps[i] = min_depth;
731 }
732 }
733 }
734
735 //site statistics
736 int32_t no_npar_male_homref = 0;
737 int32_t no_npar_male_het = 0;
738 int32_t no_npar_male_homalt = 0;
739 int32_t no_npar_female_homref = 0;
740 int32_t no_npar_female_het = 0;
741 int32_t no_npar_female_homalt = 0;
742
743 for (int32_t i=0; i<males.size(); ++i)
744 {
745 int32_t j = males[i].individual_index;
746 int32_t j1 = bcf_gt_allele(gts[(j<<1)]);
747 int32_t j2 = bcf_gt_allele(gts[(j<<1)+1]);
748 int32_t min_dp = dps[j]<min_dp ? dps[j] : min_dp;
749
750 if (j1!=-1 && j2!=-1)
751 {
752 if (j1+j2==0)
753 {
754 ++males[i].no_hom_ref;
755 ++no_npar_male_homref;
756 }
757 else if (j1!=j2)
758 {
759 ++males[i].no_het;
760 ++no_npar_male_het;
761 }
762 else if (j1==j2)
763 {
764 ++males[i].no_hom_alt;
765 ++no_npar_male_homalt;
766 }
767 }
768 }
769
770 for (int32_t i=0; i<females.size(); ++i)
771 {
772 int32_t j = females[i].individual_index;
773 int32_t j1 = bcf_gt_allele(gts[(j<<1)]);
774 int32_t j2 = bcf_gt_allele(gts[(j<<1)+1]);
775 int32_t min_dp = dps[j]<min_dp ? dps[j] : min_dp;
776
777 if (j1!=-1 && j2!=-1)
778 {
779 if (j1+j2==0)
780 {
781 ++females[i].no_hom_ref;
782 ++no_npar_female_homref;
783 }
784 else if (j1!=j2)
785 {
786 ++females[i].no_het;
787 ++no_npar_female_het;
788 }
789 else if (j1==j2)
790 {
791 ++females[i].no_hom_alt;
792 ++no_npar_female_homalt;
793 }
794 }
795 }
796
797 // std::cerr << "======================== \n";
798 // std::cerr << "m_homref " << no_npar_male_homref << "\n";
799 // std::cerr << "m_het " << no_npar_male_het << "\n";
800 // std::cerr << "m_homalt " << no_npar_male_homalt << "\n";
801 // std::cerr << "f_homref " << no_npar_female_homref << "\n";
802 // std::cerr << "f_het " << no_npar_female_het << "\n";
803 // std::cerr << "f_homalt " << no_npar_female_homalt << "\n";
804
805
806 if (output_sites)
807 {
808 bcf_update_info_int32(odw->hdr, v, "NPAR_M_HOMREF", &no_npar_male_homref, 1);
809 bcf_update_info_int32(odw->hdr, v, "NPAR_M_HET", &no_npar_male_het, 1);
810 bcf_update_info_int32(odw->hdr, v, "NPAR_M_HOMALT", &no_npar_male_homalt, 1);
811 bcf_update_info_int32(odw->hdr, v, "NPAR_F_HOMREF", &no_npar_female_homref, 1);
812 bcf_update_info_int32(odw->hdr, v, "NPAR_F_HET", &no_npar_female_het, 1);
813 bcf_update_info_int32(odw->hdr, v, "NPAR_F_HOMALT", &no_npar_female_homalt, 1);
814
815 // bcf_subset(odw->hdr, v, 0, 0);
816 // bcf_print(odw->hdr, v);
817 }
818
819
820 }
821
822 if (output_sites)
823 {
824 bcf_subset(odw->hdr, v, 0, 0);
825 odw->write(v);
826 }
827 }
828
829 free(gts);
830 if (dps) free(dps);
831 odr->close();
832 if (output_sites)
833 {
834 odw->close();
835 }
836
837 };
838
print_options()839 void print_options()
840 {
841 std::clog << "profile_mendelian v" << version << "\n\n";
842 std::clog << "options: input VCF file " << input_vcf_file << "\n";
843 std::clog << " [p] input PED file " << input_ped_file << "\n";
844 std::clog << " [o] output VCF file " << output_vcf_file << "\n";
845 std::clog << " [d] minimum depth " << min_depth << "\n";
846 print_str_op(" [f] filter ", fexp);
847 print_str_op(" [x] output tabulate directory ", output_tabulate_dir);
848 print_str_op(" [y] output pdf file ", output_pdf_file);
849 print_int_op(" [i] intervals ", intervals);
850 std::clog << "\n";
851 }
852
is_mendelian_discordant(int32_t c1,int32_t c2,int32_t f1,int32_t f2,int32_t m1,int32_t m2)853 bool is_mendelian_discordant(int32_t c1, int32_t c2, int32_t f1, int32_t f2, int32_t m1, int32_t m2)
854 {
855 return !(((c1==f1||c1==f2) && (c2==m1||c2==m2)) ||
856 ((c2==f1||c2==f2) && (c1==m1||c1==m2)));
857 }
858
is_duplicate_discordant(int32_t a1,int32_t a2,int32_t b1,int32_t b2)859 bool is_duplicate_discordant(int32_t a1, int32_t a2, int32_t b1, int32_t b2)
860 {
861 return !((a1==b1&&a2==b2) || (a1==b2||a2==b1));
862 }
863
get_error_rate(int32_t gt[3][3][3],int32_t f,int32_t m,int32_t collapse)864 float get_error_rate(int32_t gt[3][3][3], int32_t f, int32_t m, int32_t collapse)
865 {
866 if (collapse==-1) //ALL
867 {
868 float total = 0;
869 float error_count = 0;
870
871 for (int32_t i=0; i<3; ++i)
872 {
873 for (int32_t j=0; j<3; ++j)
874 {
875 f = i;
876 m = j;
877 total += gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
878
879 if (f==m && f!=1)//0/0,2/2
880 {
881 error_count += gt[f][m][1] + gt[f][m][2-f];
882 }
883 else if (abs(f-m)==2) //0/2,2/0
884 {
885 error_count += gt[f][m][0] + gt[f][m][2];
886 }
887 else if (abs(f-m)==1)//1/2,2/1,1/0,0/1
888 {
889 error_count += gt[f][m][f==1?2-m:2-f];
890 }
891 else if (f==1 && m==1)//1/1
892 {
893 error_count += 0;
894 }
895 }
896 }
897
898 return error_count/total*100;
899 }
900 else if (collapse==0) //no collapse
901 {
902 float total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
903 float error_count = 0;
904 if (f==m && f!=1)//0/0,2/2
905 {
906 error_count = gt[f][m][1] + gt[f][m][2-f];
907 }
908 else if (abs(f-m)==2) //0/2,2/0
909 {
910 error_count = gt[f][m][0] + gt[f][m][2];
911 }
912 else if (abs(f-m)==1)//1/2,2/1,1/0,0/1
913 {
914 error_count = gt[f][m][f==1?2-m:2-f];
915 }
916 else if (f==1 && m==1)//1/1
917 {
918 error_count = 0;
919 }
920
921 return error_count/total*100;
922 }
923 else if (collapse==1) //ignoring father/mother
924 {
925 float total = 0;
926 float error_count = 0;
927 if (f==m && f!=1)//0/0,2/2
928 {
929 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
930 error_count = gt[f][m][1] + gt[f][m][2-f];
931 }
932 else if (abs(f-m)==2) //0/2,2/0
933 {
934 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
935 total += gt[m][f][0] + gt[m][f][1] + gt[m][f][2];
936 error_count = gt[f][m][0] + gt[f][m][2];
937 error_count += gt[m][f][0] + gt[m][f][2];
938 }
939 else if (abs(f-m)==1)//1/2,2/1,1/0,0/1
940 {
941 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
942 total += gt[m][f][0] + gt[m][f][1] + gt[m][f][2];
943 error_count = gt[f][m][f==1?2-m:2-f];
944 error_count += gt[m][f][f==1?2-m:2-f];
945 }
946 else if (f==1 && m==1)//1/1
947 {
948 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
949 error_count = 0;
950 }
951
952 return error_count/total*100;
953 }
954 else if (collapse==2) //ignoring allele and father/mother
955 {
956 float total = 0;
957 float error_count = 0;
958 if (f==m && f!=1)//0/0,2/2
959 {
960 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
961 total += gt[2-f][2-m][0] + gt[2-f][2-m][1] + gt[2-f][2-m][2];
962 error_count = gt[f][m][1] + gt[f][m][2-f];
963 error_count += gt[2-f][2-m][1] + gt[2-f][2-m][f];
964 }
965 else if (abs(f-m)==2) //0/2,2/0
966 {
967 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
968 total += gt[m][f][0] + gt[m][f][1] + gt[m][f][2];
969 error_count = gt[f][m][0] + gt[f][m][2];
970 error_count += gt[m][f][0] + gt[m][f][2];
971 }
972 else if (abs(f-m)==1)// 1/2,2/1,1/0,0/1
973 {
974 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
975 total += gt[m][f][0] + gt[m][f][1] + gt[m][f][2];
976 total += gt[2-f][2-m][0] + gt[2-f][2-m][1] + gt[2-f][2-m][2];
977 total += gt[2-m][2-f][0] + gt[2-m][2-f][1] + gt[2-m][2-f][2];
978
979 error_count = gt[f][m][f==1?2-m:2-f];
980 error_count += gt[m][f][f==1?2-m:2-f];
981 error_count += gt[2-f][2-m][f==1?m:f];
982 error_count += gt[2-m][2-f][f==1?m:f];
983 }
984 else if (f==1 && m==1)//1/1
985 {
986 //total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
987 error_count = 0;
988 }
989
990 return error_count/total*100;
991 }
992
993 return (0.0/0.0);
994 };
995
get_homhet_ratio(int32_t gt[3][3][3],int32_t f,int32_t m,int32_t collapse)996 float get_homhet_ratio(int32_t gt[3][3][3], int32_t f, int32_t m, int32_t collapse)
997 {
998 float hom = 0;
999 float het = 0;
1000 if (collapse==0) //ALL
1001 {
1002 if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1003 {
1004 hom = gt[f][m][f==1?m:f];
1005 het = gt[f][m][1];
1006 }
1007 else if (f==1 && m==1)//1/1
1008 {
1009 hom = gt[f][m][0]+gt[f][m][2];
1010 het = gt[f][m][1];
1011 }
1012 }
1013 else if (collapse==1) //no collapse
1014 {
1015 if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1016 {
1017 hom = gt[f][m][f==1?m:f];
1018 hom += gt[m][f][f==1?m:f];
1019 het = gt[f][m][1];
1020 het += gt[m][f][1];
1021 }
1022 else if (f==1 && m==1)//1/1
1023 {
1024 hom = gt[f][m][0]+gt[f][m][2];
1025 het = gt[f][m][1];
1026 }
1027 }
1028 else if (collapse==2) //no collapse
1029 {
1030 if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1031 {
1032 hom = gt[f][m][f==1?m:f];
1033 hom += gt[m][f][f==1?m:f];
1034 hom += gt[2-f][2-m][f==1?2-m:2-f];
1035 hom += gt[2-m][2-f][f==1?2-m:2-f];
1036 het = gt[f][m][1];
1037 het += gt[m][f][1];
1038 het += gt[2-f][2-m][1];
1039 het += gt[2-m][2-f][1];
1040 }
1041 else if (f==1 && m==1)//1/1
1042 {
1043 hom = gt[f][m][0]+gt[f][m][2];
1044 het = gt[f][m][1];
1045 }
1046 }
1047
1048 return (het==0 ? (0.0/0.0) : hom/het);
1049 };
1050
get_homhet_proportion(int32_t gt[3][3][3],int32_t f,int32_t m,int32_t collapse)1051 float get_homhet_proportion(int32_t gt[3][3][3], int32_t f, int32_t m, int32_t collapse)
1052 {
1053 float hom = 0;
1054 float het = 0;
1055 if (collapse==0) //ALL
1056 {
1057 if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1058 {
1059 hom = gt[f][m][f==1?m:f];
1060 het = gt[f][m][1];
1061 }
1062 else if (f==1 && m==1)//1/1
1063 {
1064 hom = gt[f][m][0]+gt[f][m][2];
1065 het = gt[f][m][1];
1066 }
1067 }
1068 else if (collapse==1) //no collapse
1069 {
1070 if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1071 {
1072 hom = gt[f][m][f==1?m:f];
1073 hom += gt[m][f][f==1?m:f];
1074 het = gt[f][m][1];
1075 het += gt[m][f][1];
1076 }
1077 else if (f==1 && m==1)//1/1
1078 {
1079 hom = gt[f][m][0]+gt[f][m][2];
1080 het = gt[f][m][1];
1081 }
1082 }
1083 else if (collapse==2) //no collapse
1084 {
1085 if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1086 {
1087 hom = gt[f][m][f==1?m:f];
1088 hom += gt[m][f][f==1?m:f];
1089 hom += gt[2-f][2-m][f==1?2-m:2-f];
1090 hom += gt[2-m][2-f][f==1?2-m:2-f];
1091 het = gt[f][m][1];
1092 het += gt[m][f][1];
1093 het += gt[2-f][2-m][1];
1094 het += gt[2-m][2-f][1];
1095 }
1096 else if (f==1 && m==1)//1/1
1097 {
1098 hom = gt[f][m][0]+gt[f][m][2];
1099 het = gt[f][m][1];
1100 }
1101 }
1102
1103 return ((het+hom)==0 ? (0.0/0.0) : het/(hom+het)*100);
1104 };
1105
get_dups_error_rate(int32_t gt[3][3],int32_t a,int32_t b)1106 float get_dups_error_rate(int32_t gt[3][3], int32_t a, int32_t b)
1107 {
1108 float total = 0;
1109 float error_count = 0;
1110
1111 //total error
1112 if (a==-1 && b==-1)
1113 {
1114 total += gt[a][0] + gt[a][1] + gt[a][2];
1115 for (a=0; a<3; ++a)
1116 {
1117 for (int32_t b=0; b<3; ++b)
1118 {
1119 if (a!=b)
1120 {
1121 error_count += gt[a][b];
1122 }
1123
1124 total += gt[a][b];
1125 }
1126 }
1127 }
1128 else if (b==-1)
1129 {
1130 total += gt[a][0] + gt[a][1] + gt[a][2];
1131 for (int32_t b=0; b<3; ++b)
1132 {
1133 if (a!=b)
1134 {
1135 error_count += gt[a][b];
1136 }
1137 }
1138 }
1139 else if (a==-1)
1140 {
1141 total += gt[0][b] + gt[1][b] + gt[2][b];
1142 for (int32_t a=0; a<3; ++a)
1143 {
1144 if (a!=b)
1145 {
1146 error_count += gt[a][b];
1147 }
1148 }
1149 }
1150
1151 if (total!=0)
1152 {
1153 return error_count/total*100;
1154 }
1155 else
1156 {
1157 return (0.0/0.0);
1158 }
1159 };
1160
print_pdf()1161 void print_pdf()
1162 {
1163 append_cwd(output_tabulate_dir);
1164
1165 //generate file
1166 int32_t ret = mkdir(output_tabulate_dir.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
1167
1168 std::string filepath = output_tabulate_dir + "/tabulate.tex";
1169 FILE *out = fopen(filepath.c_str(), "w");
1170
1171 std::string g2s[3] = {"R/R","R/A","A/A"};
1172
1173 fprintf(out, "\\PassOptionsToPackage{table}{xcolor}\n");
1174 fprintf(out, "\\documentclass{beamer}\n");
1175 fprintf(out, "\\begin{document}\n");
1176 fprintf(out, "\\begin{frame}{Mendel Error (All parental genotypes)}\n");
1177 fprintf(out, "\\resizebox{\\linewidth}{!}{\n");
1178 fprintf(out, "\\rowcolors{2}{blue!25}{blue!10}\n");
1179 fprintf(out, "\\begin{tabular}{ccrrrrrr}\n");
1180 fprintf(out, "\\rowcolor{blue!50}\n");
1181 fprintf(out, "father& mother & R/R & R/A & A/A & Error \\%% & HomHet & Het \\%%\\\\ \n");
1182 for (int32_t i=0; i<3; ++i)
1183 {
1184 for (int32_t j=0; j<3; ++j)
1185 {
1186 fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", g2s[i].c_str(), g2s[j].c_str(), trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 0), get_homhet_ratio(trio_genotypes, i, j, 0), get_homhet_proportion(trio_genotypes, i, j, 0));
1187 }
1188 }
1189 fprintf(out, "\\end{tabular}}\n");
1190 fprintf(out, "\\end{frame}\n");
1191
1192 fprintf(out, "\\begin{frame}{Mendel Error (Collapsed genotypes)}\n");
1193 fprintf(out, "\\resizebox{\\linewidth}{!}{\n");
1194 fprintf(out, "\\rowcolors{2}{blue!25}{blue!10}\n");
1195 fprintf(out, "\\begin{tabular}{ccrrrrrr}\n");
1196 fprintf(out, "\\rowcolor{blue!50}\n");
1197 fprintf(out, "\\multicolumn{2}{c}{Parental} & R/R & R/A & A/A & Error \\%% & HomHet & Het \\%%\\\\ \n");
1198 for (int32_t i=0; i<3; ++i)
1199 {
1200 for (int32_t j=0; j<3; ++j)
1201 {
1202 if (i!=j)
1203 {
1204 if (i<j)
1205 {
1206 int32_t rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0];
1207 int32_t ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1];
1208 int32_t aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2];
1209 fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", g2s[i].c_str(), g2s[j].c_str(), rr, ra, aa, get_error_rate(trio_genotypes, i, j, 1), get_homhet_ratio(trio_genotypes, i, j, 1), get_homhet_proportion(trio_genotypes, i, j, 1));
1210 }
1211 }
1212 else
1213 {
1214 fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", g2s[i].c_str(), g2s[j].c_str(), trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 1), get_homhet_ratio(trio_genotypes, i, j, 1), get_homhet_proportion(trio_genotypes, i, j, 1));
1215 }
1216 }
1217 }
1218 fprintf(out, "\n");
1219 fprintf(out, "\\end{tabular}}\n");
1220 fprintf(out, "\\end{frame}\n");
1221
1222 fprintf(out, "\\begin{frame}{Mendel Error (Collapse parental allelotypes)}\n");
1223 fprintf(out, "\\resizebox{\\linewidth}{!}{\n");
1224 fprintf(out, "\\rowcolors{2}{blue!25}{blue!10}\n");
1225 fprintf(out, "\\begin{tabular}{ccrrrrrr}\n");
1226 fprintf(out, "\\rowcolor{blue!50}\n");
1227 fprintf(out, "\\multicolumn{2}{c}{Parental} & R/R & R/A & A/A & Error \\%% & HomHet & Het \\%%\\\\ \n");
1228 int32_t i,j, rr, ra, aa;
1229 i=0; j=0;
1230 rr = trio_genotypes[i][j][0] + trio_genotypes[2][2][0];
1231 ra = trio_genotypes[i][j][1] + trio_genotypes[2][2][1];
1232 aa = trio_genotypes[i][j][2] + trio_genotypes[2][2][2];
1233 fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", "HOM", "HOM", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1234 i=0; j=1;
1235 rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0] + trio_genotypes[2-i][j][0] + trio_genotypes[j][2-i][0];
1236 ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1] + trio_genotypes[2-i][j][1] + trio_genotypes[j][2-i][1];
1237 aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2] + trio_genotypes[2-i][j][2] + trio_genotypes[j][2-i][2];
1238 fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", "HOM", "HET", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1239 i=1; j=1;
1240 fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", "HET", "HET", trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1241 i=0; j=2;
1242 rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0];
1243 ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1];
1244 aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2];
1245 fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", "HOMREF", "HOMALT", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1246 fprintf(out, "\n");
1247 fprintf(out, "\\end{tabular}}\n");
1248 fprintf(out, "\\end{frame}\n");
1249
1250 fprintf(out, "\\begin{frame}{Mendel Error Overall Summary}\n");
1251 fprintf(out, "\\resizebox{\\linewidth}{!}{\n");
1252 fprintf(out, "\\begin{tabular}{lr}\n");
1253 fprintf(out, "total mendelian error & %7.3f \\%% \\\\ \n", get_error_rate(trio_genotypes, -1, -1, -1));
1254 fprintf(out, "no. of trios & %d \\\\ \n", no_trios);
1255 fprintf(out, "no. of variants & %d \\\\ \n", no_biallelic_variants);
1256 fprintf(out, "\n");
1257 fprintf(out, "\\end{tabular}}\n");
1258 fprintf(out, "\\end{frame}\n");
1259
1260 fprintf(out, "\\end{document}\n");
1261 fprintf(out, "\n");
1262
1263 fclose(out);
1264
1265 std::string cmd = "cd " + output_tabulate_dir + "; pdflatex tabulate.tex > run.log; mv tabulate.pdf " + output_pdf_file;
1266
1267 int32_t sys_ret = system(cmd.c_str());
1268 };
1269
print_stats()1270 void print_stats()
1271 {
1272 print_biallelic_trio_stats();
1273 print_multiallelic_trio_stats();
1274 print_biallelic_duplicate_stats();
1275 print_biallelic_vntr_duplicate_stats();
1276 }
1277
print_biallelic_trio_stats()1278 void print_biallelic_trio_stats()
1279 {
1280 if (no_trios==0 || no_biallelic_variants==0) return;
1281
1282 std::string g2s[3] = {"R/R","R/A","A/A"};
1283
1284 fprintf(stderr, "\n");
1285 fprintf(stderr, " Mendelian Errors (Biallelics)");
1286 fprintf(stderr, "\n");
1287 fprintf(stderr, "\n");
1288 fprintf(stderr, " Father Mother R/R R/A A/A Error(%%) HomHet Het(%%)\n");
1289 for (int32_t i=0; i<3; ++i)
1290 {
1291 for (int32_t j=0; j<3; ++j)
1292 {
1293 fprintf(stderr, " %s %s %10d %10d %10d %6.2f %4.2f %5.2f\n", g2s[i].c_str(), g2s[j].c_str(), trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 0), get_homhet_ratio(trio_genotypes, i, j, 0), get_homhet_proportion(trio_genotypes, i, j, 0));
1294 }
1295 }
1296 fprintf(stderr, "\n");
1297 fprintf(stderr, " Parental R/R R/A A/A Error(%%) HomHet Het(%%)\n");
1298 for (int32_t i=0; i<3; ++i)
1299 {
1300 for (int32_t j=0; j<3; ++j)
1301 {
1302 if (i!=j)
1303 {
1304 if (i<j)
1305 {
1306 int32_t rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0];
1307 int32_t ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1];
1308 int32_t aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2];
1309 fprintf(stderr, " %s %s %10d %10d %10d %6.2f %4.2f %5.2f\n", g2s[i].c_str(), g2s[j].c_str(), rr, ra, aa, get_error_rate(trio_genotypes, i, j, 1), get_homhet_ratio(trio_genotypes, i, j, 1), get_homhet_proportion(trio_genotypes, i, j, 1));
1310 }
1311 }
1312 else
1313 {
1314 fprintf(stderr, " %s %s %10d %10d %10d %6.2f %4.2f %5.2f\n", g2s[i].c_str(), g2s[j].c_str(), trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 1), get_homhet_ratio(trio_genotypes, i, j, 1), get_homhet_proportion(trio_genotypes, i, j, 1));
1315 }
1316 }
1317 }
1318 fprintf(stderr, "\n");
1319 fprintf(stderr, " Parental R/R R/A A/A Error(%%) HomHet Het(%%)\n");
1320 int32_t i,j, rr, ra, aa;
1321 i=0; j=0;
1322 rr = trio_genotypes[i][j][0] + trio_genotypes[2][2][0];
1323 ra = trio_genotypes[i][j][1] + trio_genotypes[2][2][1];
1324 aa = trio_genotypes[i][j][2] + trio_genotypes[2][2][2];
1325 fprintf(stderr, " %s %s %10d %10d %10d %6.2f %4.2f %5.2f\n", "HOM", "HOM", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1326 i=0; j=1;
1327 rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0] + trio_genotypes[2-i][j][0] + trio_genotypes[j][2-i][0];
1328 ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1] + trio_genotypes[2-i][j][1] + trio_genotypes[j][2-i][1];
1329 aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2] + trio_genotypes[2-i][j][2] + trio_genotypes[j][2-i][2];
1330 fprintf(stderr, " %s %s %10d %10d %10d %6.2f %4.2f %5.2f\n", "HOM", "HET", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1331 i=1; j=1;
1332 fprintf(stderr, " %s %s %10d %10d %10d %6.2f %4.2f %5.2f\n", "HET", "HET", trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1333 i=0; j=2;
1334 rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0];
1335 ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1];
1336 aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2];
1337 fprintf(stderr, " %s %s%10d %10d %10d %6.2f %4.2f %5.2f\n", "HOMREF", "HOMALT", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1338 fprintf(stderr, "\n");
1339 fprintf(stderr, " total mendelian error : %7.3f%%\n", get_error_rate(trio_genotypes, -1, -1, -1));
1340 fprintf(stderr, "\n");
1341 fprintf(stderr, " no. of trios : %d\n", no_trios);
1342 fprintf(stderr, " no. of biallelic variants : %d\n", no_biallelic_variants);
1343 fprintf(stderr, "\n");
1344 fprintf(stderr, " no. of trio-sites that fail min depth : %d\n", no_failed_min_depth);
1345 fprintf(stderr, "\n");
1346 };
1347
print_multiallelic_trio_stats()1348 void print_multiallelic_trio_stats()
1349 {
1350 if (no_trios==0 || no_multiallelic_variants==0) return;
1351
1352 fprintf(stderr, "\n");
1353 fprintf(stderr, " Mendelian Errors (Multiallelics)");
1354 fprintf(stderr, "\n");
1355 fprintf(stderr, "\n");
1356 fprintf(stderr, " No. of distinct Transmitted Not transmitted Error(%%)\n");
1357 fprintf(stderr, " parental alleles \n");
1358 fprintf(stderr, " 1 %10d %10d %6.2f\n",
1359 trio_multiallelic_genotypes[0][0],
1360 trio_multiallelic_genotypes[0][1],
1361 (100*(float) trio_multiallelic_genotypes[0][1] /((float)trio_multiallelic_genotypes[0][0]+trio_multiallelic_genotypes[0][1])));
1362 fprintf(stderr, " 2 %10d %10d %6.2f\n",
1363 trio_multiallelic_genotypes[1][0],
1364 trio_multiallelic_genotypes[1][1],
1365 (100*(float) trio_multiallelic_genotypes[1][1] /((float)trio_multiallelic_genotypes[1][0]+trio_multiallelic_genotypes[1][1])));
1366 fprintf(stderr, " 3 %10d %10d %6.2f\n",
1367 trio_multiallelic_genotypes[2][0],
1368 trio_multiallelic_genotypes[2][1],
1369 (100*(float) trio_multiallelic_genotypes[2][1] /((float)trio_multiallelic_genotypes[2][0]+trio_multiallelic_genotypes[2][1])));
1370 fprintf(stderr, " 4 %10d %10d %6.2f\n",
1371 trio_multiallelic_genotypes[3][0],
1372 trio_multiallelic_genotypes[3][1],
1373 (100*(float) trio_multiallelic_genotypes[3][1] /((float)trio_multiallelic_genotypes[3][0]+trio_multiallelic_genotypes[3][1])));
1374
1375 fprintf(stderr, "\n");
1376
1377 int32_t errors = trio_multiallelic_genotypes[0][1] + trio_multiallelic_genotypes[1][1] + trio_multiallelic_genotypes[2][1] + trio_multiallelic_genotypes[3][1];
1378 int32_t nonerrors = trio_multiallelic_genotypes[0][0] + trio_multiallelic_genotypes[1][0] + trio_multiallelic_genotypes[2][0] + trio_multiallelic_genotypes[3][0];
1379 fprintf(stderr, " total mendelian error : %7.3f%%\n", (100*(float)errors/((float)errors+nonerrors)));
1380 fprintf(stderr, "\n");
1381 fprintf(stderr, " no. of trios : %d\n", no_trios);
1382 fprintf(stderr, " no. of multiallelic variants : %d\n", no_multiallelic_variants);
1383 fprintf(stderr, "\n");
1384 fprintf(stderr, " no. of trio-sites that fail min depth : %d\n", no_failed_min_depth);
1385 fprintf(stderr, "\n");
1386 };
1387
print_biallelic_vntr_duplicate_stats()1388 void print_biallelic_vntr_duplicate_stats()
1389 {
1390 if (no_dups==0 || (duplicate_biallelic_vntr_genotypes_concordant+duplicate_biallelic_vntr_genotypes_disconcordant)==0) return;
1391
1392 fprintf(stderr, "\n");
1393 fprintf(stderr, " Duplicate Errors (Biallelics VNTRs)");
1394 fprintf(stderr, "\n");
1395 uint32_t total = duplicate_biallelic_vntr_genotypes_concordant + duplicate_biallelic_vntr_genotypes_disconcordant;
1396 float err_rate = 100* (float) duplicate_biallelic_vntr_genotypes_disconcordant / total;
1397 fprintf(stderr, " total duplicate error : %10u\n", duplicate_biallelic_vntr_genotypes_disconcordant);
1398 fprintf(stderr, " total sites : %10u\n", total);
1399 fprintf(stderr, "\n");
1400
1401 fprintf(stderr, " duplicate error : %7.3f%%\n", err_rate);
1402 fprintf(stderr, "\n");
1403 };
1404
print_biallelic_duplicate_stats()1405 void print_biallelic_duplicate_stats()
1406 {
1407 if (no_dups==0 || no_biallelic_variants==0) return;
1408
1409 fprintf(stderr, "\n");
1410 fprintf(stderr, " Duplicate Errors (Biallelics)");
1411 fprintf(stderr, "\n");
1412 fprintf(stderr, "\n");
1413 fprintf(stderr, " Duplicate\n");
1414 fprintf(stderr, " Individual R/R R/A A/A Error(%%)\n");
1415 fprintf(stderr, " R/R %10d %10d %10d %6.2f \n",
1416 duplicate_genotypes[0][0],
1417 duplicate_genotypes[0][1],
1418 duplicate_genotypes[0][2],
1419 get_dups_error_rate(duplicate_genotypes, 0, -1));
1420 fprintf(stderr, " R/A %10d %10d %10d %6.2f \n",
1421 duplicate_genotypes[1][0],
1422 duplicate_genotypes[1][1],
1423 duplicate_genotypes[1][2],
1424 get_dups_error_rate(duplicate_genotypes, 1, -1));
1425 fprintf(stderr, " A/A %10d %10d %10d %6.2f \n",
1426 duplicate_genotypes[2][0],
1427 duplicate_genotypes[2][1],
1428 duplicate_genotypes[2][2],
1429 get_dups_error_rate(duplicate_genotypes, 2, -1));
1430 int32_t rr = duplicate_genotypes[0][0] + duplicate_genotypes[1][0] + duplicate_genotypes[2][0];
1431 int32_t ra = duplicate_genotypes[0][1] + duplicate_genotypes[1][1] + duplicate_genotypes[2][1];
1432 int32_t aa = duplicate_genotypes[0][2] + duplicate_genotypes[1][2] + duplicate_genotypes[2][2];
1433 fprintf(stderr, " Error(%%) %6.2f %6.2f %6.2f %6.2f \n",
1434 get_dups_error_rate(duplicate_genotypes, -1, 0),
1435 get_dups_error_rate(duplicate_genotypes, -1, 1),
1436 get_dups_error_rate(duplicate_genotypes, -1, 2),
1437 get_dups_error_rate(duplicate_genotypes, -1, -1));
1438
1439 fprintf(stderr, "\n");
1440 fprintf(stderr, " total duplicate error : %7.3f%%\n", get_dups_error_rate(duplicate_genotypes, -1, -1));
1441 fprintf(stderr, "\n");
1442 fprintf(stderr, " no. of duplicate comparisons : %d\n", no_dups);
1443 fprintf(stderr, " no. of biallelic variants : %d\n", no_biallelic_variants_dups);
1444 fprintf(stderr, "\n");
1445 fprintf(stderr, " no. of duplicate-site pairs that fail min depth : %d\n", no_failed_min_depth);
1446 fprintf(stderr, "\n");
1447 fprintf(stderr, "\n");
1448 };
1449
print_nonpseudoautosomal_sites_stats()1450 void print_nonpseudoautosomal_sites_stats()
1451 {
1452 if (no_dups==0 || no_biallelic_variants==0) return;
1453
1454 //average site heterozygosty for males
1455 //average site het for females
1456 //hom ref stats
1457 //hom alt stats
1458
1459 fprintf(stderr, "\n");
1460 fprintf(stderr, " Non pseudo autosomal (Biallelics)");
1461 fprintf(stderr, "\n");
1462 fprintf(stderr, " Duplicate\n");
1463 fprintf(stderr, " Individual Average Standard deviation Max Min\n");
1464 fprintf(stderr, " Male Heterozygosity %10d %10d %10d %6.2f \n",
1465 duplicate_genotypes[0][0],
1466 duplicate_genotypes[0][1],
1467 duplicate_genotypes[0][2],
1468 get_dups_error_rate(duplicate_genotypes, 0, -1));
1469 fprintf(stderr, "\n");
1470 fprintf(stderr, "\n");
1471 };
1472
~Igor()1473 ~Igor()
1474 {
1475 };
1476
1477 private:
1478 };
1479
1480 }
1481
profile_mendelian(int argc,char ** argv)1482 void profile_mendelian(int argc, char ** argv)
1483 {
1484 Igor igor(argc, argv);
1485 igor.print_options();
1486 igor.initialize();
1487 igor.profile_mendelian();
1488 igor.print_stats();
1489 // igor.print_pdf();
1490 }
1491
1492