1 /* The MIT License
2 
3    Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "profile_mendelian.h"
25 
26 namespace
27 {
28 
29 class HetStats
30 {
31     public:
32 
33     std::string individual_id;
34     int32_t individual_index;
35     int32_t no_non_pseudoautosomal_variant_sites;
36     int32_t no_het;
37     int32_t no_hom_ref;
38     int32_t no_hom_alt;
39     float heterozygosity;
40 };
41 
42 class Igor : Program
43 {
44     public:
45 
46     std::string version;
47 
48     ///////////
49     //options//
50     ///////////
51     std::string input_vcf_file;
52     std::string input_ped_file;
53     std::string output_vcf_file;
54     std::string output_tabulate_dir;
55     std::string output_pdf_file;
56     std::vector<GenomeInterval> intervals;
57     std::string interval_list;
58     int32_t min_depth;
59     float min_gq;
60     bool ignore_non_variants;
61     bool output_sites;
62 
63     ///////
64     //i/o//
65     ///////
66     BCFOrderedReader *odr;
67     BCFOrderedWriter *odw;
68 
69     ///////////////
70     //general use//
71     ///////////////
72     kstring_t variant;
73 
74     //////////
75     //filter//
76     //////////
77     std::string fexp;
78     Filter filter;
79     bool filter_exists;
80 
81     /////////
82     //stats//
83     /////////
84     uint32_t no_trios;
85     uint32_t no_dups;
86     uint32_t no_biallelic_variants;
87     uint32_t no_biallelic_variants_dups;
88     uint32_t no_multiallelic_variants;
89     uint32_t duplicate_biallelic_vntr_genotypes_concordant;
90     uint32_t duplicate_biallelic_vntr_genotypes_disconcordant;
91     uint32_t no_failed_min_depth;
92 
93     //for biallelics
94     int32_t trio_genotypes[3][3][3];
95     int32_t trio_multiallelic_genotypes[4][2];  //number of distinct parental alleles (4) vs mendelian error or not
96     int32_t duplicate_genotypes[3][3];
97 
98     //for multiallelics
99     std::vector<std::vector<std::vector<int32_t> > > trios_multiallelic_genotypes;
100 
101     //for non pseudoautosomal variants
102     std::vector<HetStats> males;
103     std::vector<HetStats> females;
104 
105     /////////
106     //tools//
107     /////////
108     Pedigree *pedigree;
109     VariantManip *vm;
110 
Igor(int argc,char ** argv)111     Igor(int argc, char ** argv)
112     {
113         //////////////////////////
114         //options initialization//
115         //////////////////////////
116         try
117         {
118             std::string desc = "Profile Mendelian Errors.";
119 
120             version = "0.5";
121             TCLAP::CmdLine cmd(desc, ' ', version);
122             VTOutput my; cmd.setOutput(&my);
123             TCLAP::ValueArg<std::string> arg_intervals("i", "i", "intervals", false, "", "str", cmd);
124             TCLAP::ValueArg<std::string> arg_interval_list("I", "I", "file containing list of intervals []", false, "", "str", cmd);
125             TCLAP::ValueArg<std::string> arg_input_ped_file("p", "p", "pedigree file", true, "", "str", cmd);
126             TCLAP::ValueArg<std::string> arg_fexp("f", "f", "filter expression []", false, "", "str", cmd);
127 
128             TCLAP::ValueArg<std::string> arg_output_tabulate_dir("x", "x", "output latex directory [tabulate_mendelian]", false, "tabulate_mendelian", "str", cmd);
129             TCLAP::ValueArg<std::string> arg_output_pdf_file("y", "y", "output pdf file [mendelian.pdf]", false, "mendelian.pdf", "str", cmd);
130 
131             TCLAP::ValueArg<std::string> arg_output_vcf_file("o", "o", "output VCF file annotated with site error statistics [\"\"]", false, "", "str", cmd);
132             TCLAP::ValueArg<int32_t> arg_min_depth("d", "d", "minimum depth", false, 0, "str", cmd);
133             TCLAP::ValueArg<float> arg_min_gq("q", "q", "minimum genotype quality", false, 2, "str", cmd);
134             TCLAP::SwitchArg arg_ignore_non_variants("n", "n", "ignore non variants", cmd, false);
135             TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "","file", cmd);
136 
137             cmd.parse(argc, argv);
138 
139             input_vcf_file = arg_input_vcf_file.getValue();
140             input_ped_file = arg_input_ped_file.getValue();
141             output_vcf_file = arg_output_vcf_file.getValue();
142             output_sites = output_vcf_file=="" ? false : true;
143             fexp = arg_fexp.getValue();
144             output_tabulate_dir = arg_output_tabulate_dir.getValue();
145             output_pdf_file = arg_output_pdf_file.getValue();
146             min_depth = arg_min_depth.getValue();
147             min_gq = arg_min_gq.getValue();
148             ignore_non_variants = arg_ignore_non_variants.getValue();
149             parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue());
150         }
151         catch (TCLAP::ArgException &e)
152         {
153             std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
154             abort();
155         }
156     };
157 
initialize()158     void initialize()
159     {
160         //////////////////////
161         //i/o initialization//
162         //////////////////////
163         odr = new BCFOrderedReader(input_vcf_file, intervals);
164 
165         if (bcf_hdr_nsamples(odr->hdr)==0)
166         {
167             fprintf(stderr, "[%s:%d %s] No samples in VCF file: %s\n", __FILE__, __LINE__, __FUNCTION__, input_vcf_file.c_str());
168             exit(1);
169         }
170 
171         ///////////////////////////
172         //output VCF file        //
173         ///////////////////////////
174         if (output_sites)
175         {
176             odw = new BCFOrderedWriter(output_vcf_file, 0);
177             bcf_hdr_t *hdr = bcf_hdr_subset(odr->hdr, 0, 0, 0);
178             odw->set_hdr(hdr);
179             bcf_hdr_destroy(hdr);
180             bcf_hdr_append(odw->hdr, "##INFO=<ID=MEN_DISC,Number=1,Type=Integer,Description=\"No. of sites with mendelian discordance.\">");
181             bcf_hdr_append(odw->hdr, "##INFO=<ID=MEN_INF,Number=1,Type=Integer,Description=\"No. of mendelian informative sites. For biallelics, excludes HOMREF trios and HET parents.\">");
182             bcf_hdr_append(odw->hdr, "##INFO=<ID=MEN_TOT,Number=1,Type=Integer,Description=\"No. of mendelian sample-site pairs.\">");
183             bcf_hdr_append(odw->hdr, "##INFO=<ID=DUP_DISC,Number=1,Type=Integer,Description=\"No. of sites with duplicate discordance.\">");
184             bcf_hdr_append(odw->hdr, "##INFO=<ID=DUP_TOT,Number=1,Type=Integer,Description=\"No. of duplicates sample-site pairs.\">");
185             bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_M_HOMREF,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with homozygous reference genotypes for males.\">");
186             bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_M_HET,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with heterozygous genotypes for males.\">");
187             bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_M_HOMALT,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with homozygous alternate genotypes for males.\">");
188             bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_F_HOMREF,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with homozygous reference genotypes for females.\">");
189             bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_F_HET,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with heterozygous genotypes for females.\">");
190             bcf_hdr_append(odw->hdr, "##INFO=<ID=NPAR_F_HOMALT,Number=1,Type=Integer,Description=\"No. of non pseudoautosomal region variants with homozygous alternate genotypes for females.\">");
191             bcf_hdr_sync(odw->hdr);
192             odw->write_hdr();
193         }
194 
195         ///////////////////////////
196         //ped file initialization//
197         ///////////////////////////
198         pedigree = new Pedigree(input_ped_file);
199 
200         /////////////////////////
201         //filter initialization//
202         /////////////////////////
203         filter.parse(fexp.c_str());
204         filter_exists = fexp!="";
205 
206         ///////////////
207         //general use//
208         ///////////////
209         variant = {0,0,0};
210 
211         ////////////////////////
212         //stats initialization//
213         ////////////////////////
214         no_trios = 0;
215         no_biallelic_variants = 0;
216         no_biallelic_variants_dups = 0;
217         no_multiallelic_variants = 0;
218         no_failed_min_depth = 0;
219         duplicate_biallelic_vntr_genotypes_concordant = 0;
220         duplicate_biallelic_vntr_genotypes_disconcordant = 0;
221 
222         for (int32_t i=0; i<3; ++i)
223         {
224             for (int32_t j=0; j<3; ++j)
225             {
226                 for (int32_t k=0; k<3; ++k)
227                 {
228                     trio_genotypes[i][j][k] = 0;
229                 }
230             }
231         }
232 
233         for (int32_t a=0; a<3; ++a)
234         {
235             for (int32_t b=0; b<3; ++b)
236             {
237                 duplicate_genotypes[a][b] = 0;
238             }
239         }
240 
241         for (int32_t a=0; a<4; ++a)
242         {
243             for (int32_t b=0; b<2; ++b)
244             {
245                 trio_multiallelic_genotypes[a][b] = 0;
246             }
247         }
248 
249         /////////
250         //tools//
251         /////////
252         vm = new VariantManip();
253     }
254 
profile_mendelian()255     void profile_mendelian()
256     {
257         bcf_hdr_t *h = odr->hdr;
258         bcf1_t *v = bcf_init1();
259 
260         Variant variant;
261 
262         ///////////////////////
263         //initializing pedigree
264         ///////////////////////
265         std::vector<Trio> trios;
266         std::vector<Duplicate> dups;
267 
268         std::vector<PEDRecord>& recs =  pedigree->recs;
269 
270         for (size_t i=0; i<recs.size(); ++i)
271         {
272             if (recs[i].is_trio())
273             {
274                 int32_t individual_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, recs[i].individual[0].c_str());
275                 int32_t father_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, recs[i].father.c_str());
276                 int32_t mother_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, recs[i].mother.c_str());
277 
278                 if (individual_index>=0 && father_index>=0 && mother_index>=0)
279                 {
280                     Trio trio(individual_index, father_index, mother_index, recs[i].individual_sex);
281                     trios.push_back(trio);
282                 }
283             }
284 
285             if (recs[i].is_duplicated())
286             {
287                 PEDRecord& rec = recs[i];
288                 int32_t no_pairs_added = 0;
289 
290                 int32_t individual_sex = rec.individual_sex;
291 
292                 for (size_t a=0; a<rec.individual.size()-1; ++a)
293                 {
294                     int32_t individual_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, rec.individual[a].c_str());
295 
296                     for (size_t b=a+1; b<rec.individual.size(); ++b)
297                     {
298                         int32_t duplicate_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, rec.individual[b].c_str());
299 
300                         Duplicate dup(individual_index, duplicate_index, individual_sex);
301                         dups.push_back(dup);
302                         ++no_pairs_added;
303                     }
304                 }
305             }
306 
307             HetStats stats;
308             stats.individual_id = recs[i].individual[0];
309             stats.individual_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, recs[i].individual[0].c_str());;
310             stats.no_non_pseudoautosomal_variant_sites = 0;
311             stats.no_het = 0;
312             stats.no_hom_ref = 0;
313             stats.no_hom_alt = 0;
314             stats.heterozygosity = 0;
315 
316             if (recs[i].is_female())
317             {
318                 females.push_back(stats);
319             }
320             else
321             {
322                 males.push_back(stats);
323             }
324         }
325 
326         no_trios = trios.size();
327         no_dups = dups.size();
328 
329         std::cerr << "No. of trios detected: " << trios.size() << "\n";
330         std::cerr << "No. of duplicates detected: " << dups.size() << "\n";
331         std::cerr << "No. of males detected: " << males.size() << "\n";
332         std::cerr << "No. of females detected: " << females.size() << "\n";
333 
334         int32_t missing = 0;
335         int32_t mendel_homalt_err = 0;
336 
337         int32_t nsample = bcf_hdr_get_n_sample(h);
338         int32_t *gts = NULL;
339         int32_t *dps = (int32_t *) malloc(nsample*sizeof(int32_t));
340         int32_t n = 0;
341         int32_t n_dp = nsample;
342 
343         float *cgs = NULL;
344         int32_t n_cg = 0;
345 
346         while(odr->read(v))
347         {
348             bcf_unpack(v, BCF_UN_IND);
349             int32_t vtype = vm->classify_variant(odr->hdr, v, variant);
350 
351             if (filter_exists)
352             {
353                 vm->classify_variant(odr->hdr, v, variant);
354                 if (!filter.apply(odr->hdr, v, &variant))
355                 {
356                     continue;
357                 }
358             }
359 
360             int32_t no_alleles = bcf_get_n_allele(v);
361 
362             //biallelic
363             if (no_alleles==2)
364             {
365                 if (vtype!=VT_VNTR)
366                 {
367                     int k = bcf_get_genotypes(h, v, &gts, &n);
368                     int r = bcf_get_format_int32(h, v, "DP", &dps, &n_dp);
369 
370                     if (r==-1)
371                     {
372                         r = bcf_get_format_int32(h, v, "NR", &dps, &n_dp);
373 
374                         if (r==-1)
375                         {
376                             for (uint32_t i=0; i<nsample; ++i)
377                             {
378                                 dps[i] = min_depth;
379                             }
380                         }
381                     }
382 
383                     bool variant_used = false;
384 
385                     ///////////////////////
386                     //mendelian concordance
387                     ///////////////////////
388                     int32_t no_mendelian_discordance = 0;
389                     int32_t no_mendelian_informative_sites = 0;
390                     int32_t no_mendelian_sample_site_pairs = 0;
391                     for (int32_t i=0; i<trios.size(); ++i)
392                     {
393                         int32_t j = trios[i].father_index;
394                         int32_t f1 = bcf_gt_allele(gts[(j<<1)]);
395                         int32_t f2 = bcf_gt_allele(gts[(j<<1)+1]);
396                         int32_t min_dp = dps[j];
397 
398                         j = trios[i].mother_index;
399                         int32_t m1 = bcf_gt_allele(gts[(j<<1)]);
400                         int32_t m2 = bcf_gt_allele(gts[(j<<1)+1]);
401                         min_dp = dps[j]<min_dp ? dps[j] : min_dp;
402 
403                         j = trios[i].child_index;
404                         int32_t c1 = bcf_gt_allele(gts[(j<<1)]);
405                         int32_t c2 = bcf_gt_allele(gts[(j<<1)+1]);
406                         min_dp = dps[j]<min_dp ? dps[j] : min_dp;
407 
408                         if (min_dp<min_depth)
409                         {
410                             ++no_failed_min_depth;
411                             continue;
412                         }
413 
414                         if (!(c1<0 || c2<0 || f1<0 || f2<0 || m1<0 || m2<0))
415                         {
416                             if (!ignore_non_variants || (c1+c2+f1+f2+m1+m2!=0))
417                             {
418                                 ++trio_genotypes[f1+f2][m1+m2][c1+c2];
419                                 variant_used = true;
420 
421                                 if (is_mendelian_discordant(c1, c2, f1, f2, m1, m2))
422                                 {
423                                     ++no_mendelian_discordance;
424                                 }
425 
426                                 if (!(c1+c2+f1+f2+m1+m2==0) && !(f1==1&&f2==1&&m1==1&&m2==1))
427                                 {
428                                     ++no_mendelian_informative_sites;
429                                 }
430                                 ++no_mendelian_sample_site_pairs;
431                             }
432                         }
433                     }
434                     if (variant_used)
435                     {
436                         ++no_biallelic_variants;
437                     }
438 
439                     if (output_sites)
440                     {
441 //                        bcf_set_n_sample(v, 0);
442                         bcf_update_info_int32(odw->hdr, v, "MEN_DISC", &no_mendelian_discordance, 1);
443                         bcf_update_info_int32(odw->hdr, v, "MEN_INF", &no_mendelian_informative_sites, 1);
444                         bcf_update_info_int32(odw->hdr, v, "MEN_TOT", &no_mendelian_sample_site_pairs, 1);
445                     }
446 
447                     ///////////////////////
448                     //duplicate concordance
449                     ///////////////////////
450                     if (dups.size()>0)
451                     {
452                         int32_t no_duplicate_discordance = 0;
453                         int32_t no_duplicate_sample_site_pairs = 0;
454 
455                         for (int32_t i=0; i<dups.size(); ++i)
456                         {
457                             int32_t j = dups[i].individual_index;
458                             int32_t a1 = bcf_gt_allele(gts[(j<<1)]);
459                             int32_t a2 = bcf_gt_allele(gts[(j<<1)+1]);
460                             int32_t min_dp = dps[j];
461 
462                             j = dups[i].duplicate_index;
463                             int32_t b1 = bcf_gt_allele(gts[(j<<1)]);
464                             int32_t b2 = bcf_gt_allele(gts[(j<<1)+1]);
465                             min_dp = dps[j]<min_dp ? dps[j] : min_dp;
466 
467                             if (min_dp<min_depth)
468                             {
469                                 ++no_failed_min_depth;
470                                 continue;
471                             }
472 
473                             if (!(a1<0 || a2<0 || b1<0 || b2<0))
474                             {
475                                 ++duplicate_genotypes[a1+a2][b1+b2];
476                                 variant_used = true;
477 
478                                 if (is_duplicate_discordant(a1, a2, b1, b2))
479                                 {
480                                     ++no_duplicate_discordance;
481                                 }
482                                 ++no_duplicate_sample_site_pairs;
483                             }
484                         }
485                         ++no_biallelic_variants_dups;
486 
487                         if (output_sites)
488                         {
489                             bcf_update_info_int32(odw->hdr, v, "DUP_DISC", &no_duplicate_discordance, 1);
490                             bcf_update_info_int32(odw->hdr, v, "DUP_TOT", &no_duplicate_sample_site_pairs, 1);
491                         }
492                     }// end of iterating through dups
493                 }
494                 else //VNTR
495                 {
496                     int k = bcf_get_format_float(h, v, "CG", &cgs, &n_cg);
497                     int r = bcf_get_format_int32(h, v, "DP", &dps, &n_dp);
498 
499                     if (r==-1)
500                     {
501                         r = bcf_get_format_int32(h, v, "NR", &dps, &n_dp);
502 
503                         if (r==-1)
504                         {
505                             for (uint32_t i=0; i<nsample; ++i)
506                             {
507                                 dps[i] = min_depth;
508                             }
509                         }
510                     }
511 
512                     bool variant_used = false;
513 
514                     ///////////////////////
515                     //mendelian concordance
516                     ///////////////////////
517                     for (int32_t i=0; i<trios.size(); ++i)
518                     {
519                         int32_t j = trios[i].father_index;
520                         float f1 = cgs[j<<1];
521                         float f2 = cgs[(j<<1)+1];
522 
523                         j = trios[i].mother_index;
524                         float m1 = cgs[j<<1];
525                         float m2 = cgs[(j<<1)+1];
526 
527                         j = trios[i].child_index;
528                         float c1 = cgs[j<<1];
529                         float c2 = cgs[(j<<1)+1];
530 
531                         if (!(f1<0 || f2<0 || m1<0 || m2<0 || c1<0 || c2<0))
532                         {
533                             if (!ignore_non_variants || (f1+f2+m1+m2+c1+c2!=0))
534                             {
535                                 variant_used = true;
536 
537                                 //count number of distinct parental alleles
538                                 std::map<int32_t, int32_t> distinct_alleles;
539                                 distinct_alleles[f1] = 0;
540                                 distinct_alleles[f2] = 0;
541                                 distinct_alleles[m1] = 0;
542                                 distinct_alleles[m2] = 0;
543                                 int32_t no_distinct_parental_alleles = distinct_alleles.size();
544 
545                                 //check if transmission is possible
546                                 if (((c1==f1||c1==f2) && (c2==m1||c2==m2)) ||
547                                     ((c2==f1||c2==f2) && (c1==m1||c1==m2)))
548                                 {
549                                     ++trio_multiallelic_genotypes[no_distinct_parental_alleles-1][0];
550     //                                    std::cerr << "*******************************************\n";
551     //                                    std::cerr << f1 << " " << f2 << " " << m1 << " " << m2 << "\n";
552     //                                    std::cerr << c1 << " " << c2 << "\n";
553                                 }
554                                 else
555                                 {
556                                     ++trio_multiallelic_genotypes[no_distinct_parental_alleles-1][1];
557 
558     //                                if (no_distinct_parental_alleles==4)
559     //                                {
560     //                                    std::cerr << "==========================================\n";
561     //                                    std::cerr << f1 << " " << f2 << " " << m1 << " " << m2 << "\n";
562     //                                    std::cerr << c1 << " " << c2 << "\n";
563 
564 
565     //                                }
566                                 }
567                             }
568                         }
569                     }
570                     if (variant_used) ++no_multiallelic_variants;
571 
572                     ///////////////////////
573                     //duplicate concordance
574                     ///////////////////////
575                     for (int32_t i=0; i<dups.size(); ++i)
576                     {
577                         int32_t j = dups[i].individual_index;
578                         float a1 = cgs[j<<1];
579                         float a2 = cgs[(j<<1)+1];
580 
581                         j = dups[i].duplicate_index;
582                         float b1 = cgs[j<<1];
583                         float b2 = cgs[(j<<1)+1];
584 
585                         if (!(a1<0 || a2<0 || b1<0 || b2<0))
586                         {
587                             variant_used = true;
588 
589                             //check if equal genotypes regardless of order
590                             if (((a1==b1&&a2==b2) || (a1==b2||a2==b1)))
591                             {
592                                 ++duplicate_biallelic_vntr_genotypes_concordant;
593                             }
594                             else
595                             {
596                                 ++duplicate_biallelic_vntr_genotypes_disconcordant;
597                             }
598                         }
599                     }
600                 }
601             }
602             //multiallelics
603             else
604             {
605                 int k = bcf_get_genotypes(h, v, &gts, &n);
606                 int r = bcf_get_format_int32(h, v, "DP", &dps, &n_dp);
607 
608                 if (r==-1)
609                 {
610                     //support platypus calls
611                     r = bcf_get_format_int32(h, v, "NR", &dps, &n_dp);
612 
613                     if (r==-1)
614                     {
615                         for (uint32_t i=0; i<nsample; ++i)
616                         {
617                             dps[i] = min_depth;
618                         }
619                     }
620                 }
621 
622                 bool variant_used = false;
623 
624                 for (int32_t i=0; i<trios.size(); ++i)
625                 {
626                     int32_t j = trios[i].father_index;
627                     int32_t f1 = bcf_gt_allele(gts[(j<<1)]);
628                     int32_t f2 = bcf_gt_allele(gts[(j<<1)+1]);
629                     int32_t min_dp = dps[j];
630 
631                     j = trios[i].mother_index;
632                     int32_t m1 = bcf_gt_allele(gts[(j<<1)]);
633                     int32_t m2 = bcf_gt_allele(gts[(j<<1)+1]);
634                     min_dp = dps[j]<min_dp ? dps[j] : min_dp;
635 
636                     j = trios[i].child_index;
637                     int32_t c1 = bcf_gt_allele(gts[(j<<1)]);
638                     int32_t c2 = bcf_gt_allele(gts[(j<<1)+1]);
639                     min_dp = dps[j]<min_dp ? dps[j] : min_dp;
640 
641                     if (min_dp<min_depth)
642                     {
643                         ++no_failed_min_depth;
644                         continue;
645                     }
646 
647                     if (!(f1<0 || f2<0 || m1<0 || m2<0 || c1<0 || c2<0))
648                     {
649                         if (!ignore_non_variants || (f1+f2+m1+m2+c1+c2!=0))
650                         {
651                             variant_used = true;
652 
653                             //count number of distinct parental alleles
654                             std::map<int32_t, int32_t> distinct_alleles;
655                             distinct_alleles[f1] = 0;
656                             distinct_alleles[f2] = 0;
657                             distinct_alleles[m1] = 0;
658                             distinct_alleles[m2] = 0;
659                             int32_t no_distinct_parental_alleles = distinct_alleles.size();
660 
661 
662                             //check if transmission is possible
663                             if (((c1==f1||c1==f2) && (c2==m1||c2==m2)) ||
664                                 ((c2==f1||c2==f2) && (c1==m1||c1==m2)))
665                             {
666                                 ++trio_multiallelic_genotypes[no_distinct_parental_alleles-1][0];
667 //                                    std::cerr << "*******************************************\n";
668 //                                    std::cerr << f1 << " " << f2 << " " << m1 << " " << m2 << "\n";
669 //                                    std::cerr << c1 << " " << c2 << "\n";
670                             }
671                             else
672                             {
673                                 ++trio_multiallelic_genotypes[no_distinct_parental_alleles-1][1];
674 
675 //                                if (no_distinct_parental_alleles==4)
676 //                                {
677 //                                    std::cerr << "==========================================\n";
678 //                                    std::cerr << f1 << " " << f2 << " " << m1 << " " << m2 << "\n";
679 //                                    std::cerr << c1 << " " << c2 << "\n";
680 
681 
682 //                                }
683                             }
684 
685                         //implement 2 versions
686                         //1. based on fixed genotypes
687 
688                         //mendelian error estimates based on hard counts.
689                         //HOM HOM
690                         //AA BB => AB
691                         //BB CC => BC
692                         //CC DD => CD
693 
694                         //HET HET
695                         //AB AB => AA AB BB  - can have errors
696                         //AC AD => AA AC AD CD = can have
697                         //AB CD => AC AD BC BD
698 
699                         //HOM HET
700                         //AA AB => AA AB
701                         //AA BC => AB AC
702 
703                         }
704                     }
705                 }//end procesing trios
706                 if (variant_used) ++no_multiallelic_variants;
707 
708             }
709 
710             //ignores actual ploidy
711             //this analysis assumes that the
712             //sites are of ploidy one and that
713             //the genotyper for that site assumes
714             //that it is a diploid.
715             //we want to cosider multiallelics here too.
716             if (bcf_get_info_flag(odr->hdr, v, "NPAR", 0, 0))
717             {
718                 //extract genotype array
719                 int k = bcf_get_genotypes(h, v, &gts, &n);
720                 int r = bcf_get_format_int32(h, v, "DP", &dps, &n_dp);
721 
722                 if (r==-1)
723                 {
724                     r = bcf_get_format_int32(h, v, "NR", &dps, &n_dp);
725 
726                     if (r==-1)
727                     {
728                         for (uint32_t i=0; i<nsample; ++i)
729                         {
730                             dps[i] = min_depth;
731                         }
732                     }
733                 }
734 
735                 //site statistics
736                 int32_t no_npar_male_homref = 0;
737                 int32_t no_npar_male_het = 0;
738                 int32_t no_npar_male_homalt = 0;
739                 int32_t no_npar_female_homref = 0;
740                 int32_t no_npar_female_het = 0;
741                 int32_t no_npar_female_homalt = 0;
742 
743                 for (int32_t i=0; i<males.size(); ++i)
744                 {
745                     int32_t j = males[i].individual_index;
746                     int32_t j1 = bcf_gt_allele(gts[(j<<1)]);
747                     int32_t j2 = bcf_gt_allele(gts[(j<<1)+1]);
748                     int32_t min_dp = dps[j]<min_dp ? dps[j] : min_dp;
749 
750                     if (j1!=-1 && j2!=-1)
751                     {
752                         if (j1+j2==0)
753                         {
754                             ++males[i].no_hom_ref;
755                             ++no_npar_male_homref;
756                         }
757                         else if (j1!=j2)
758                         {
759                             ++males[i].no_het;
760                             ++no_npar_male_het;
761                         }
762                         else if (j1==j2)
763                         {
764                             ++males[i].no_hom_alt;
765                             ++no_npar_male_homalt;
766                         }
767                     }
768                 }
769 
770                 for (int32_t i=0; i<females.size(); ++i)
771                 {
772                     int32_t j = females[i].individual_index;
773                     int32_t j1 = bcf_gt_allele(gts[(j<<1)]);
774                     int32_t j2 = bcf_gt_allele(gts[(j<<1)+1]);
775                     int32_t min_dp = dps[j]<min_dp ? dps[j] : min_dp;
776 
777                     if (j1!=-1 && j2!=-1)
778                     {
779                         if (j1+j2==0)
780                         {
781                             ++females[i].no_hom_ref;
782                             ++no_npar_female_homref;
783                         }
784                         else if (j1!=j2)
785                         {
786                             ++females[i].no_het;
787                             ++no_npar_female_het;
788                         }
789                         else if (j1==j2)
790                         {
791                             ++females[i].no_hom_alt;
792                             ++no_npar_female_homalt;
793                         }
794                     }
795                 }
796 
797 //                std::cerr << "======================== \n";
798 //                std::cerr << "m_homref " << no_npar_male_homref << "\n";
799 //                std::cerr << "m_het " << no_npar_male_het << "\n";
800 //                std::cerr << "m_homalt " << no_npar_male_homalt << "\n";
801 //                std::cerr << "f_homref " << no_npar_female_homref << "\n";
802 //                std::cerr << "f_het " << no_npar_female_het << "\n";
803 //                std::cerr << "f_homalt " << no_npar_female_homalt << "\n";
804 
805 
806                 if (output_sites)
807                 {
808                     bcf_update_info_int32(odw->hdr, v, "NPAR_M_HOMREF", &no_npar_male_homref, 1);
809                     bcf_update_info_int32(odw->hdr, v, "NPAR_M_HET", &no_npar_male_het, 1);
810                     bcf_update_info_int32(odw->hdr, v, "NPAR_M_HOMALT", &no_npar_male_homalt, 1);
811                     bcf_update_info_int32(odw->hdr, v, "NPAR_F_HOMREF", &no_npar_female_homref, 1);
812                     bcf_update_info_int32(odw->hdr, v, "NPAR_F_HET", &no_npar_female_het, 1);
813                     bcf_update_info_int32(odw->hdr, v, "NPAR_F_HOMALT", &no_npar_female_homalt, 1);
814 
815 //                bcf_subset(odw->hdr, v, 0, 0);
816 //                bcf_print(odw->hdr, v);
817                 }
818 
819 
820             }
821 
822             if (output_sites)
823             {
824                 bcf_subset(odw->hdr, v, 0, 0);
825                 odw->write(v);
826             }
827         }
828 
829         free(gts);
830         if (dps) free(dps);
831         odr->close();
832         if (output_sites)
833         {
834             odw->close();
835         }
836 
837     };
838 
print_options()839     void print_options()
840     {
841         std::clog << "profile_mendelian v" << version << "\n\n";
842         std::clog << "options:     input VCF file            " << input_vcf_file << "\n";
843         std::clog << "         [p] input PED file            " << input_ped_file << "\n";
844         std::clog << "         [o] output VCF file           " << output_vcf_file << "\n";
845         std::clog << "         [d] minimum depth             " << min_depth << "\n";
846         print_str_op("         [f] filter                    ", fexp);
847         print_str_op("         [x] output tabulate directory ", output_tabulate_dir);
848         print_str_op("         [y] output pdf file           ", output_pdf_file);
849         print_int_op("         [i] intervals                 ", intervals);
850         std::clog << "\n";
851     }
852 
is_mendelian_discordant(int32_t c1,int32_t c2,int32_t f1,int32_t f2,int32_t m1,int32_t m2)853     bool is_mendelian_discordant(int32_t c1, int32_t c2, int32_t f1, int32_t f2, int32_t m1, int32_t m2)
854     {
855         return !(((c1==f1||c1==f2) && (c2==m1||c2==m2)) ||
856                  ((c2==f1||c2==f2) && (c1==m1||c1==m2)));
857     }
858 
is_duplicate_discordant(int32_t a1,int32_t a2,int32_t b1,int32_t b2)859     bool is_duplicate_discordant(int32_t a1, int32_t a2, int32_t b1, int32_t b2)
860     {
861         return !((a1==b1&&a2==b2) || (a1==b2||a2==b1));
862     }
863 
get_error_rate(int32_t gt[3][3][3],int32_t f,int32_t m,int32_t collapse)864     float get_error_rate(int32_t gt[3][3][3], int32_t f, int32_t m, int32_t collapse)
865     {
866         if (collapse==-1) //ALL
867         {
868             float total = 0;
869             float error_count = 0;
870 
871             for (int32_t i=0; i<3; ++i)
872             {
873                 for (int32_t j=0; j<3; ++j)
874                 {
875                     f = i;
876                     m = j;
877                     total += gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
878 
879                     if (f==m && f!=1)//0/0,2/2
880                     {
881                         error_count += gt[f][m][1] + gt[f][m][2-f];
882                     }
883                     else if (abs(f-m)==2) //0/2,2/0
884                     {
885                         error_count += gt[f][m][0] + gt[f][m][2];
886                     }
887                     else if (abs(f-m)==1)//1/2,2/1,1/0,0/1
888                     {
889                         error_count += gt[f][m][f==1?2-m:2-f];
890                     }
891                     else if (f==1 && m==1)//1/1
892                     {
893                         error_count += 0;
894                     }
895                 }
896             }
897 
898             return error_count/total*100;
899         }
900         else if (collapse==0) //no collapse
901         {
902             float total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
903             float error_count = 0;
904             if (f==m && f!=1)//0/0,2/2
905             {
906                 error_count = gt[f][m][1] + gt[f][m][2-f];
907             }
908             else if (abs(f-m)==2) //0/2,2/0
909             {
910                 error_count = gt[f][m][0] + gt[f][m][2];
911             }
912             else if (abs(f-m)==1)//1/2,2/1,1/0,0/1
913             {
914                 error_count = gt[f][m][f==1?2-m:2-f];
915             }
916             else if (f==1 && m==1)//1/1
917             {
918                 error_count = 0;
919             }
920 
921             return error_count/total*100;
922         }
923         else if (collapse==1) //ignoring father/mother
924         {
925             float total = 0;
926             float error_count = 0;
927             if (f==m && f!=1)//0/0,2/2
928             {
929                 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
930                 error_count = gt[f][m][1] + gt[f][m][2-f];
931             }
932             else if (abs(f-m)==2) //0/2,2/0
933             {
934                 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
935                 total += gt[m][f][0] + gt[m][f][1] + gt[m][f][2];
936                 error_count = gt[f][m][0] + gt[f][m][2];
937                 error_count += gt[m][f][0] + gt[m][f][2];
938             }
939             else if (abs(f-m)==1)//1/2,2/1,1/0,0/1
940             {
941                 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
942                 total += gt[m][f][0] + gt[m][f][1] + gt[m][f][2];
943                 error_count = gt[f][m][f==1?2-m:2-f];
944                 error_count += gt[m][f][f==1?2-m:2-f];
945             }
946             else if (f==1 && m==1)//1/1
947             {
948                 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
949                 error_count = 0;
950             }
951 
952             return error_count/total*100;
953         }
954         else if (collapse==2) //ignoring allele and father/mother
955         {
956             float total = 0;
957             float error_count = 0;
958             if (f==m && f!=1)//0/0,2/2
959             {
960                 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
961                 total += gt[2-f][2-m][0] + gt[2-f][2-m][1] + gt[2-f][2-m][2];
962                 error_count = gt[f][m][1] + gt[f][m][2-f];
963                 error_count += gt[2-f][2-m][1] + gt[2-f][2-m][f];
964             }
965             else if (abs(f-m)==2) //0/2,2/0
966             {
967                 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
968                 total += gt[m][f][0] + gt[m][f][1] + gt[m][f][2];
969                 error_count = gt[f][m][0] + gt[f][m][2];
970                 error_count += gt[m][f][0] + gt[m][f][2];
971             }
972             else if (abs(f-m)==1)// 1/2,2/1,1/0,0/1
973             {
974                 total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
975                 total += gt[m][f][0] + gt[m][f][1] + gt[m][f][2];
976                 total += gt[2-f][2-m][0] + gt[2-f][2-m][1] + gt[2-f][2-m][2];
977                 total += gt[2-m][2-f][0] + gt[2-m][2-f][1] + gt[2-m][2-f][2];
978 
979                 error_count = gt[f][m][f==1?2-m:2-f];
980                 error_count += gt[m][f][f==1?2-m:2-f];
981                 error_count += gt[2-f][2-m][f==1?m:f];
982                 error_count += gt[2-m][2-f][f==1?m:f];
983             }
984             else if (f==1 && m==1)//1/1
985             {
986                 //total = gt[f][m][0] + gt[f][m][1] + gt[f][m][2];
987                 error_count = 0;
988             }
989 
990             return error_count/total*100;
991         }
992 
993         return (0.0/0.0);
994     };
995 
get_homhet_ratio(int32_t gt[3][3][3],int32_t f,int32_t m,int32_t collapse)996     float get_homhet_ratio(int32_t gt[3][3][3], int32_t f, int32_t m, int32_t collapse)
997     {
998         float hom = 0;
999         float het = 0;
1000         if (collapse==0) //ALL
1001         {
1002             if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1003             {
1004                 hom = gt[f][m][f==1?m:f];
1005                 het = gt[f][m][1];
1006             }
1007             else if (f==1 && m==1)//1/1
1008             {
1009                 hom = gt[f][m][0]+gt[f][m][2];
1010                 het = gt[f][m][1];
1011             }
1012         }
1013         else if (collapse==1) //no collapse
1014         {
1015             if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1016             {
1017                 hom = gt[f][m][f==1?m:f];
1018                 hom += gt[m][f][f==1?m:f];
1019                 het = gt[f][m][1];
1020                 het += gt[m][f][1];
1021             }
1022             else if (f==1 && m==1)//1/1
1023             {
1024                 hom = gt[f][m][0]+gt[f][m][2];
1025                 het = gt[f][m][1];
1026             }
1027         }
1028         else if (collapse==2) //no collapse
1029         {
1030             if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1031             {
1032                 hom = gt[f][m][f==1?m:f];
1033                 hom += gt[m][f][f==1?m:f];
1034                 hom += gt[2-f][2-m][f==1?2-m:2-f];
1035                 hom += gt[2-m][2-f][f==1?2-m:2-f];
1036                 het = gt[f][m][1];
1037                 het += gt[m][f][1];
1038                 het += gt[2-f][2-m][1];
1039                 het += gt[2-m][2-f][1];
1040             }
1041             else if (f==1 && m==1)//1/1
1042             {
1043                 hom = gt[f][m][0]+gt[f][m][2];
1044                 het = gt[f][m][1];
1045             }
1046         }
1047 
1048         return (het==0 ? (0.0/0.0) : hom/het);
1049     };
1050 
get_homhet_proportion(int32_t gt[3][3][3],int32_t f,int32_t m,int32_t collapse)1051     float get_homhet_proportion(int32_t gt[3][3][3], int32_t f, int32_t m, int32_t collapse)
1052     {
1053         float hom = 0;
1054         float het = 0;
1055         if (collapse==0) //ALL
1056         {
1057             if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1058             {
1059                 hom = gt[f][m][f==1?m:f];
1060                 het = gt[f][m][1];
1061             }
1062             else if (f==1 && m==1)//1/1
1063             {
1064                 hom = gt[f][m][0]+gt[f][m][2];
1065                 het = gt[f][m][1];
1066             }
1067         }
1068         else if (collapse==1) //no collapse
1069         {
1070             if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1071             {
1072                 hom = gt[f][m][f==1?m:f];
1073                 hom += gt[m][f][f==1?m:f];
1074                 het = gt[f][m][1];
1075                 het += gt[m][f][1];
1076             }
1077             else if (f==1 && m==1)//1/1
1078             {
1079                 hom = gt[f][m][0]+gt[f][m][2];
1080                 het = gt[f][m][1];
1081             }
1082         }
1083         else if (collapse==2) //no collapse
1084         {
1085             if (abs(f-m)==1)//1/2,2/1,1/0,0/1
1086             {
1087                 hom = gt[f][m][f==1?m:f];
1088                 hom += gt[m][f][f==1?m:f];
1089                 hom += gt[2-f][2-m][f==1?2-m:2-f];
1090                 hom += gt[2-m][2-f][f==1?2-m:2-f];
1091                 het = gt[f][m][1];
1092                 het += gt[m][f][1];
1093                 het += gt[2-f][2-m][1];
1094                 het += gt[2-m][2-f][1];
1095             }
1096             else if (f==1 && m==1)//1/1
1097             {
1098                 hom = gt[f][m][0]+gt[f][m][2];
1099                 het = gt[f][m][1];
1100             }
1101         }
1102 
1103         return ((het+hom)==0 ? (0.0/0.0) : het/(hom+het)*100);
1104     };
1105 
get_dups_error_rate(int32_t gt[3][3],int32_t a,int32_t b)1106     float get_dups_error_rate(int32_t gt[3][3], int32_t a, int32_t b)
1107     {
1108         float total = 0;
1109         float error_count = 0;
1110 
1111         //total error
1112         if (a==-1 && b==-1)
1113         {
1114             total += gt[a][0] + gt[a][1] + gt[a][2];
1115             for (a=0; a<3; ++a)
1116             {
1117                for (int32_t b=0; b<3; ++b)
1118                 {
1119                     if (a!=b)
1120                     {
1121                         error_count += gt[a][b];
1122                     }
1123 
1124                     total += gt[a][b];
1125                 }
1126             }
1127         }
1128         else if (b==-1)
1129         {
1130             total += gt[a][0] + gt[a][1] + gt[a][2];
1131             for (int32_t b=0; b<3; ++b)
1132             {
1133                 if (a!=b)
1134                 {
1135                     error_count += gt[a][b];
1136                 }
1137             }
1138         }
1139         else if (a==-1)
1140         {
1141             total += gt[0][b] + gt[1][b] + gt[2][b];
1142             for (int32_t a=0; a<3; ++a)
1143             {
1144                 if (a!=b)
1145                 {
1146                     error_count += gt[a][b];
1147                 }
1148             }
1149         }
1150 
1151         if (total!=0)
1152         {
1153             return error_count/total*100;
1154         }
1155         else
1156         {
1157             return (0.0/0.0);
1158         }
1159     };
1160 
print_pdf()1161     void print_pdf()
1162     {
1163         append_cwd(output_tabulate_dir);
1164 
1165         //generate file
1166         int32_t ret = mkdir(output_tabulate_dir.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
1167 
1168         std::string filepath = output_tabulate_dir + "/tabulate.tex";
1169         FILE *out = fopen(filepath.c_str(), "w");
1170 
1171         std::string g2s[3] = {"R/R","R/A","A/A"};
1172 
1173         fprintf(out, "\\PassOptionsToPackage{table}{xcolor}\n");
1174         fprintf(out, "\\documentclass{beamer}\n");
1175         fprintf(out, "\\begin{document}\n");
1176         fprintf(out, "\\begin{frame}{Mendel Error (All parental genotypes)}\n");
1177         fprintf(out, "\\resizebox{\\linewidth}{!}{\n");
1178         fprintf(out, "\\rowcolors{2}{blue!25}{blue!10}\n");
1179         fprintf(out, "\\begin{tabular}{ccrrrrrr}\n");
1180         fprintf(out, "\\rowcolor{blue!50}\n");
1181         fprintf(out, "father& mother & R/R & R/A & A/A & Error \\%% & HomHet & Het \\%%\\\\ \n");
1182         for (int32_t i=0; i<3; ++i)
1183         {
1184             for (int32_t j=0; j<3; ++j)
1185             {
1186                 fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", g2s[i].c_str(), g2s[j].c_str(), trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 0), get_homhet_ratio(trio_genotypes, i, j, 0), get_homhet_proportion(trio_genotypes, i, j, 0));
1187             }
1188         }
1189         fprintf(out, "\\end{tabular}}\n");
1190         fprintf(out, "\\end{frame}\n");
1191 
1192         fprintf(out, "\\begin{frame}{Mendel Error (Collapsed genotypes)}\n");
1193         fprintf(out, "\\resizebox{\\linewidth}{!}{\n");
1194         fprintf(out, "\\rowcolors{2}{blue!25}{blue!10}\n");
1195         fprintf(out, "\\begin{tabular}{ccrrrrrr}\n");
1196         fprintf(out, "\\rowcolor{blue!50}\n");
1197         fprintf(out, "\\multicolumn{2}{c}{Parental}  & R/R & R/A & A/A & Error \\%% & HomHet & Het \\%%\\\\ \n");
1198         for (int32_t i=0; i<3; ++i)
1199         {
1200             for (int32_t j=0; j<3; ++j)
1201             {
1202                 if (i!=j)
1203                 {
1204                     if (i<j)
1205                     {
1206                         int32_t rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0];
1207                         int32_t ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1];
1208                         int32_t aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2];
1209                         fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", g2s[i].c_str(), g2s[j].c_str(), rr, ra, aa, get_error_rate(trio_genotypes, i, j, 1), get_homhet_ratio(trio_genotypes, i, j, 1), get_homhet_proportion(trio_genotypes, i, j, 1));
1210                     }
1211                 }
1212                 else
1213                 {
1214                     fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", g2s[i].c_str(), g2s[j].c_str(), trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 1), get_homhet_ratio(trio_genotypes, i, j, 1), get_homhet_proportion(trio_genotypes, i, j, 1));
1215                 }
1216             }
1217         }
1218         fprintf(out, "\n");
1219         fprintf(out, "\\end{tabular}}\n");
1220         fprintf(out, "\\end{frame}\n");
1221 
1222         fprintf(out, "\\begin{frame}{Mendel Error (Collapse parental allelotypes)}\n");
1223         fprintf(out, "\\resizebox{\\linewidth}{!}{\n");
1224         fprintf(out, "\\rowcolors{2}{blue!25}{blue!10}\n");
1225         fprintf(out, "\\begin{tabular}{ccrrrrrr}\n");
1226         fprintf(out, "\\rowcolor{blue!50}\n");
1227         fprintf(out, "\\multicolumn{2}{c}{Parental}  & R/R & R/A & A/A & Error \\%% & HomHet & Het \\%%\\\\ \n");
1228         int32_t i,j, rr, ra, aa;
1229         i=0; j=0;
1230         rr = trio_genotypes[i][j][0] + trio_genotypes[2][2][0];
1231         ra = trio_genotypes[i][j][1] + trio_genotypes[2][2][1];
1232         aa = trio_genotypes[i][j][2] + trio_genotypes[2][2][2];
1233         fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", "HOM", "HOM", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1234         i=0; j=1;
1235         rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0] + trio_genotypes[2-i][j][0] + trio_genotypes[j][2-i][0];
1236         ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1] + trio_genotypes[2-i][j][1] + trio_genotypes[j][2-i][1];
1237         aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2] + trio_genotypes[2-i][j][2] + trio_genotypes[j][2-i][2];
1238         fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", "HOM", "HET", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1239         i=1; j=1;
1240         fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", "HET", "HET", trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1241         i=0; j=2;
1242         rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0];
1243         ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1];
1244         aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2];
1245         fprintf(out, "%s&%s&%10d&%10d&%10d&%6.2f&%4.2f&%5.2f \\\\ \n", "HOMREF", "HOMALT", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1246         fprintf(out, "\n");
1247         fprintf(out, "\\end{tabular}}\n");
1248         fprintf(out, "\\end{frame}\n");
1249 
1250         fprintf(out, "\\begin{frame}{Mendel Error Overall Summary}\n");
1251         fprintf(out, "\\resizebox{\\linewidth}{!}{\n");
1252         fprintf(out, "\\begin{tabular}{lr}\n");
1253         fprintf(out, "total mendelian error & %7.3f \\%% \\\\ \n", get_error_rate(trio_genotypes, -1, -1, -1));
1254         fprintf(out, "no. of trios     & %d \\\\ \n", no_trios);
1255         fprintf(out, "no. of variants  & %d \\\\ \n", no_biallelic_variants);
1256         fprintf(out, "\n");
1257         fprintf(out, "\\end{tabular}}\n");
1258         fprintf(out, "\\end{frame}\n");
1259 
1260         fprintf(out, "\\end{document}\n");
1261         fprintf(out, "\n");
1262 
1263         fclose(out);
1264 
1265         std::string cmd = "cd "  + output_tabulate_dir + "; pdflatex tabulate.tex > run.log; mv tabulate.pdf " + output_pdf_file;
1266 
1267         int32_t sys_ret = system(cmd.c_str());
1268     };
1269 
print_stats()1270     void print_stats()
1271     {
1272         print_biallelic_trio_stats();
1273         print_multiallelic_trio_stats();
1274         print_biallelic_duplicate_stats();
1275         print_biallelic_vntr_duplicate_stats();
1276     }
1277 
print_biallelic_trio_stats()1278     void print_biallelic_trio_stats()
1279     {
1280         if (no_trios==0 || no_biallelic_variants==0) return;
1281 
1282         std::string g2s[3] = {"R/R","R/A","A/A"};
1283 
1284         fprintf(stderr, "\n");
1285         fprintf(stderr, "     Mendelian Errors (Biallelics)");
1286         fprintf(stderr, "\n");
1287         fprintf(stderr, "\n");
1288         fprintf(stderr, "     Father Mother       R/R          R/A          A/A    Error(%%) HomHet    Het(%%)\n");
1289         for (int32_t i=0; i<3; ++i)
1290         {
1291             for (int32_t j=0; j<3; ++j)
1292             {
1293                 fprintf(stderr, "     %s    %s   %10d   %10d   %10d   %6.2f      %4.2f  %5.2f\n", g2s[i].c_str(), g2s[j].c_str(), trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 0), get_homhet_ratio(trio_genotypes, i, j, 0), get_homhet_proportion(trio_genotypes, i, j, 0));
1294             }
1295         }
1296         fprintf(stderr, "\n");
1297         fprintf(stderr, "     Parental            R/R          R/A          A/A    Error(%%) HomHet    Het(%%)\n");
1298         for (int32_t i=0; i<3; ++i)
1299         {
1300             for (int32_t j=0; j<3; ++j)
1301             {
1302                 if (i!=j)
1303                 {
1304                     if (i<j)
1305                     {
1306                         int32_t rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0];
1307                         int32_t ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1];
1308                         int32_t aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2];
1309                         fprintf(stderr, "     %s    %s   %10d   %10d   %10d   %6.2f      %4.2f  %5.2f\n", g2s[i].c_str(), g2s[j].c_str(), rr, ra, aa, get_error_rate(trio_genotypes, i, j, 1), get_homhet_ratio(trio_genotypes, i, j, 1), get_homhet_proportion(trio_genotypes, i, j, 1));
1310                     }
1311                 }
1312                 else
1313                 {
1314                     fprintf(stderr, "     %s    %s   %10d   %10d   %10d   %6.2f      %4.2f  %5.2f\n", g2s[i].c_str(), g2s[j].c_str(), trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 1), get_homhet_ratio(trio_genotypes, i, j, 1), get_homhet_proportion(trio_genotypes, i, j, 1));
1315                 }
1316             }
1317         }
1318         fprintf(stderr, "\n");
1319         fprintf(stderr, "     Parental            R/R          R/A          A/A    Error(%%) HomHet    Het(%%)\n");
1320         int32_t i,j, rr, ra, aa;
1321         i=0; j=0;
1322         rr = trio_genotypes[i][j][0] + trio_genotypes[2][2][0];
1323         ra = trio_genotypes[i][j][1] + trio_genotypes[2][2][1];
1324         aa = trio_genotypes[i][j][2] + trio_genotypes[2][2][2];
1325         fprintf(stderr, "     %s    %s   %10d   %10d   %10d   %6.2f      %4.2f  %5.2f\n", "HOM", "HOM", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1326         i=0; j=1;
1327         rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0] + trio_genotypes[2-i][j][0] + trio_genotypes[j][2-i][0];
1328         ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1] + trio_genotypes[2-i][j][1] + trio_genotypes[j][2-i][1];
1329         aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2] + trio_genotypes[2-i][j][2] + trio_genotypes[j][2-i][2];
1330         fprintf(stderr, "     %s    %s   %10d   %10d   %10d   %6.2f      %4.2f  %5.2f\n", "HOM", "HET", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1331         i=1; j=1;
1332         fprintf(stderr, "     %s    %s   %10d   %10d   %10d   %6.2f      %4.2f  %5.2f\n", "HET", "HET", trio_genotypes[i][j][0], trio_genotypes[i][j][1], trio_genotypes[i][j][2], get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1333         i=0; j=2;
1334         rr = trio_genotypes[i][j][0] + trio_genotypes[j][i][0];
1335         ra = trio_genotypes[i][j][1] + trio_genotypes[j][i][1];
1336         aa = trio_genotypes[i][j][2] + trio_genotypes[j][i][2];
1337         fprintf(stderr, "     %s %s%10d   %10d   %10d   %6.2f      %4.2f  %5.2f\n", "HOMREF", "HOMALT", rr, ra, aa, get_error_rate(trio_genotypes, i, j, 2), get_homhet_ratio(trio_genotypes, i, j, 2), get_homhet_proportion(trio_genotypes, i, j, 2));
1338         fprintf(stderr, "\n");
1339         fprintf(stderr, "     total mendelian error : %7.3f%%\n", get_error_rate(trio_genotypes, -1, -1, -1));
1340         fprintf(stderr, "\n");
1341         fprintf(stderr, "     no. of trios               : %d\n", no_trios);
1342         fprintf(stderr, "     no. of biallelic variants  : %d\n", no_biallelic_variants);
1343         fprintf(stderr, "\n");
1344         fprintf(stderr, "     no. of trio-sites that fail min depth  : %d\n", no_failed_min_depth);
1345         fprintf(stderr, "\n");
1346     };
1347 
print_multiallelic_trio_stats()1348     void print_multiallelic_trio_stats()
1349     {
1350         if (no_trios==0 || no_multiallelic_variants==0) return;
1351 
1352         fprintf(stderr, "\n");
1353         fprintf(stderr, "     Mendelian Errors (Multiallelics)");
1354         fprintf(stderr, "\n");
1355         fprintf(stderr, "\n");
1356         fprintf(stderr, "     No. of distinct               Transmitted      Not transmitted    Error(%%)\n");
1357         fprintf(stderr, "     parental alleles                                   \n");
1358         fprintf(stderr, "            1                       %10d           %10d   %6.2f\n",
1359                                 trio_multiallelic_genotypes[0][0],
1360                                 trio_multiallelic_genotypes[0][1],
1361                                 (100*(float) trio_multiallelic_genotypes[0][1] /((float)trio_multiallelic_genotypes[0][0]+trio_multiallelic_genotypes[0][1])));
1362         fprintf(stderr, "            2                       %10d           %10d   %6.2f\n",
1363                                 trio_multiallelic_genotypes[1][0],
1364                                 trio_multiallelic_genotypes[1][1],
1365                                 (100*(float) trio_multiallelic_genotypes[1][1] /((float)trio_multiallelic_genotypes[1][0]+trio_multiallelic_genotypes[1][1])));
1366         fprintf(stderr, "            3                       %10d           %10d   %6.2f\n",
1367                                 trio_multiallelic_genotypes[2][0],
1368                                 trio_multiallelic_genotypes[2][1],
1369                                 (100*(float) trio_multiallelic_genotypes[2][1] /((float)trio_multiallelic_genotypes[2][0]+trio_multiallelic_genotypes[2][1])));
1370         fprintf(stderr, "            4                       %10d           %10d   %6.2f\n",
1371                                 trio_multiallelic_genotypes[3][0],
1372                                 trio_multiallelic_genotypes[3][1],
1373                                 (100*(float) trio_multiallelic_genotypes[3][1] /((float)trio_multiallelic_genotypes[3][0]+trio_multiallelic_genotypes[3][1])));
1374 
1375         fprintf(stderr, "\n");
1376 
1377         int32_t errors = trio_multiallelic_genotypes[0][1] + trio_multiallelic_genotypes[1][1] + trio_multiallelic_genotypes[2][1] + trio_multiallelic_genotypes[3][1];
1378         int32_t nonerrors = trio_multiallelic_genotypes[0][0] + trio_multiallelic_genotypes[1][0] + trio_multiallelic_genotypes[2][0] + trio_multiallelic_genotypes[3][0];
1379         fprintf(stderr, "     total mendelian error : %7.3f%%\n", (100*(float)errors/((float)errors+nonerrors)));
1380         fprintf(stderr, "\n");
1381         fprintf(stderr, "     no. of trios                  : %d\n", no_trios);
1382         fprintf(stderr, "     no. of multiallelic variants  : %d\n", no_multiallelic_variants);
1383         fprintf(stderr, "\n");
1384         fprintf(stderr, "     no. of trio-sites that fail min depth  : %d\n", no_failed_min_depth);
1385         fprintf(stderr, "\n");
1386     };
1387 
print_biallelic_vntr_duplicate_stats()1388     void print_biallelic_vntr_duplicate_stats()
1389     {
1390         if (no_dups==0 || (duplicate_biallelic_vntr_genotypes_concordant+duplicate_biallelic_vntr_genotypes_disconcordant)==0) return;
1391 
1392         fprintf(stderr, "\n");
1393         fprintf(stderr, "     Duplicate Errors (Biallelics VNTRs)");
1394         fprintf(stderr, "\n");
1395         uint32_t total = duplicate_biallelic_vntr_genotypes_concordant + duplicate_biallelic_vntr_genotypes_disconcordant;
1396         float err_rate =  100* (float) duplicate_biallelic_vntr_genotypes_disconcordant / total;
1397         fprintf(stderr, "     total duplicate error : %10u\n", duplicate_biallelic_vntr_genotypes_disconcordant);
1398         fprintf(stderr, "     total sites           : %10u\n", total);
1399         fprintf(stderr, "\n");
1400 
1401         fprintf(stderr, "     duplicate error      : %7.3f%%\n", err_rate);
1402         fprintf(stderr, "\n");
1403     };
1404 
print_biallelic_duplicate_stats()1405     void print_biallelic_duplicate_stats()
1406     {
1407         if (no_dups==0 || no_biallelic_variants==0) return;
1408 
1409         fprintf(stderr, "\n");
1410         fprintf(stderr, "     Duplicate Errors (Biallelics)");
1411         fprintf(stderr, "\n");
1412         fprintf(stderr, "\n");
1413         fprintf(stderr, "                                     Duplicate\n");
1414         fprintf(stderr, "     Individual            R/R          R/A          A/A    Error(%%)\n");
1415         fprintf(stderr, "     R/R             %10d  %10d   %10d   %6.2f     \n",
1416                                 duplicate_genotypes[0][0],
1417                                 duplicate_genotypes[0][1],
1418                                 duplicate_genotypes[0][2],
1419                                 get_dups_error_rate(duplicate_genotypes, 0, -1));
1420         fprintf(stderr, "     R/A             %10d  %10d   %10d   %6.2f     \n",
1421                                 duplicate_genotypes[1][0],
1422                                 duplicate_genotypes[1][1],
1423                                 duplicate_genotypes[1][2],
1424                                 get_dups_error_rate(duplicate_genotypes, 1, -1));
1425         fprintf(stderr, "     A/A             %10d  %10d   %10d   %6.2f     \n",
1426                                 duplicate_genotypes[2][0],
1427                                 duplicate_genotypes[2][1],
1428                                 duplicate_genotypes[2][2],
1429                                 get_dups_error_rate(duplicate_genotypes, 2, -1));
1430         int32_t rr = duplicate_genotypes[0][0] + duplicate_genotypes[1][0] + duplicate_genotypes[2][0];
1431         int32_t ra = duplicate_genotypes[0][1] + duplicate_genotypes[1][1] + duplicate_genotypes[2][1];
1432         int32_t aa = duplicate_genotypes[0][2] + duplicate_genotypes[1][2] + duplicate_genotypes[2][2];
1433         fprintf(stderr, "   Error(%%)              %6.2f      %6.2f       %6.2f   %6.2f     \n",
1434                                 get_dups_error_rate(duplicate_genotypes, -1, 0),
1435                                 get_dups_error_rate(duplicate_genotypes, -1, 1),
1436                                 get_dups_error_rate(duplicate_genotypes, -1, 2),
1437                                 get_dups_error_rate(duplicate_genotypes, -1, -1));
1438 
1439         fprintf(stderr, "\n");
1440         fprintf(stderr, "     total duplicate error : %7.3f%%\n", get_dups_error_rate(duplicate_genotypes, -1, -1));
1441         fprintf(stderr, "\n");
1442         fprintf(stderr, "     no. of duplicate comparisons    : %d\n", no_dups);
1443         fprintf(stderr, "     no. of biallelic variants       : %d\n", no_biallelic_variants_dups);
1444         fprintf(stderr, "\n");
1445         fprintf(stderr, "     no. of duplicate-site pairs that fail min depth  : %d\n", no_failed_min_depth);
1446         fprintf(stderr, "\n");
1447         fprintf(stderr, "\n");
1448     };
1449 
print_nonpseudoautosomal_sites_stats()1450     void print_nonpseudoautosomal_sites_stats()
1451     {
1452         if (no_dups==0 || no_biallelic_variants==0) return;
1453 
1454         //average site heterozygosty for males
1455         //average site het for females
1456         //hom ref stats
1457         //hom alt stats
1458 
1459         fprintf(stderr, "\n");
1460         fprintf(stderr, "     Non pseudo autosomal (Biallelics)");
1461         fprintf(stderr, "\n");
1462         fprintf(stderr, "                                     Duplicate\n");
1463         fprintf(stderr, "     Individual            Average   Standard deviation  Max Min\n");
1464         fprintf(stderr, "     Male Heterozygosity  %10d  %10d   %10d   %6.2f     \n",
1465                                 duplicate_genotypes[0][0],
1466                                 duplicate_genotypes[0][1],
1467                                 duplicate_genotypes[0][2],
1468                                 get_dups_error_rate(duplicate_genotypes, 0, -1));
1469         fprintf(stderr, "\n");
1470         fprintf(stderr, "\n");
1471     };
1472 
~Igor()1473     ~Igor()
1474     {
1475     };
1476 
1477     private:
1478 };
1479 
1480 }
1481 
profile_mendelian(int argc,char ** argv)1482 void profile_mendelian(int argc, char ** argv)
1483 {
1484     Igor igor(argc, argv);
1485     igor.print_options();
1486     igor.initialize();
1487     igor.profile_mendelian();
1488     igor.print_stats();
1489 //    igor.print_pdf();
1490 }
1491 
1492