1 /* The MIT License
2
3 Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "align.h"
25 #include "tbx_ordered_reader.h"
26 #include "bed.h"
27 #include "pcre2.h"
28 #include "pregex.h"
29 #include "reference_sequence.h"
30
31 namespace
32 {
33
print_time(float t)34 void print_time(float t)
35 {
36 if (t<60)
37 {
38 fprintf(stderr, "Alignment time elapsed: %fs\n\n", t);
39 }
40 else if (t<60*60) //less than an hour
41 {
42 fprintf(stderr, "Time elapsed: %dm %ds\n\n", ((int32_t)(t/60)), ((int32_t)fmod(t, 60)));
43 }
44 else if (t<60*60*24) //less than a day
45 {
46 double m = fmod(t, 60*60); //remaining minutes
47 fprintf(stderr, "Time elapsed: %dh %dm %ds\n\n", ((int32_t)(t/(60*60))), ((int32_t)(m/60)), ((int32_t)fmod(m, 60)));
48 }
49 else if (t<60*60*24*365) //less than a year
50 {
51 double h = fmod(t, 60*60*24); //remaining hours
52 double m = fmod(h, 60*60); //remaining minutes
53 fprintf(stderr, "Alignment time elapsed: %dd %dh %dm %.6fs\n\n", ((int32_t)(t/(60*60*24))), ((int32_t)(h/(60*60))), ((int32_t)(m/60)), (fmod(m, 60)));
54 }
55 };
56
57 class Igor : Program
58 {
59 public:
60
61 ///////////
62 //options//
63 ///////////
64 std::string method;
65 std::vector<std::string> x;
66 std::string sequence;
67
68 bool debug;
69 uint32_t no;
70
71
72
Igor(int argc,char ** argv)73 Igor(int argc, char **argv)
74 {
75
76 };
77
78 // /**
79 // * n choose r.
80 // */
81 // uint32_t choose(uint32_t n, uint32_t r)
82 // {
83 // if (r>n)
84 // {
85 // return 0;
86 // }
87 // else if (r==n)
88 // {
89 // return 1;
90 // }
91 // else if (r==0)
92 // {
93 // return 1;
94 // }
95 // else
96 // {
97 // if (r>(n>>1))
98 // {
99 // r = n-r;
100 // }
101 //
102 // uint32_t num = n;
103 // uint32_t denum = 1;
104 //
105 // for (uint32_t i=1; i<r; ++i)
106 // {
107 // num *= n-i;
108 // denum *= i+1;
109 // }
110 //
111 // return num/denum;
112 // }
113 // }
114 //
115 //
116 // /**
117 // * Gets number of genotypes from number of alleles and ploidy.
118 // */
119 // uint32_t bcf_ap2g(uint32_t no_allele, uint32_t no_ploidy)
120 // {
121 // return choose(no_ploidy+no_allele-1, no_allele-1);
122 // }
123 //
124 // /**
125 // * Gets number of genotypes from number of alleles and ploidy.
126 // */
127 // uint32_t bcf_g2i(std::string genotype)
128 // {
129 // uint32_t index = 0;
130 // for (uint32_t i=0; i<genotype.size(); ++i)
131 // {
132 // uint32_t allele = genotype.at(i)-65;
133 // index += bcf_ap2g(allele, i+1);
134 // }
135 // return index;
136 // }
137 //
138 // void print_genotypes(uint32_t A, uint32_t P, std::string genotype)
139 // {
140 // if (genotype.size()==P)
141 // {
142 // std::cerr << no << ") " << genotype << " " << bcf_g2i(genotype) << "\n";
143 // ++no;
144 // }
145 // else
146 // {
147 // for (uint32_t a=0; a<A; ++a)
148 // {
149 // std::string s(1,(char)(a+65));
150 // s.append(genotype);
151 // print_genotypes(a+1, P, s);
152 // }
153 // }
154 // }
155
156 /**
157 * Computes composition and entropy ofrepeat tract.
158 */
compute_composition_and_entropy(std::string & repeat_tract)159 void compute_composition_and_entropy(std::string& repeat_tract)
160 {
161 int32_t comp[4];
162 float entropy;
163 float kl_divergence;
164 float entropy2;
165 float kl_divergence2;
166
167 int32_t aux_comp[4] = {0,0,0,0};
168 int32_t aux_comp2[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
169 int32_t b2i[10] = {0,1,0,2,0,0,0,0,0,3};
170
171 // ACGT x ACGT
172 // AA - 0 = 0*0 + 0
173 // AC - 1
174 // AG - 2
175 // AT - 3
176 // CA - 4 = 1*4 + 0 = 1<<2
177 // CC - 5
178 // CG - 6
179 // CT - 7
180 // GA - 8 = 2*4 + 0 = 2<<1 = 10=>100
181 // GC - 9
182 // GG - 10
183 // GT - 11
184 // TA - 12 = 3*4 + 0 = 3<<1 11=>110=>1100
185 // TC - 13
186 // TG - 14
187 // TT - 15 = 3*4+3 = 15
188
189 int32_t n = repeat_tract.size();
190
191 for (uint32_t i=0; i<n; ++i)
192 {
193 uint32_t b1 = b2i[(repeat_tract.at(i)-65)>>1];
194
195 ++aux_comp[b1];
196 if (i<n-1)
197 {
198 uint32_t b2 = b2i[(repeat_tract.at(i+1)-65)>>1];
199 uint32_t bb = (b1<<2) + b2;
200 ++aux_comp2[bb];
201 }
202 }
203
204 float p[4] = {(float) aux_comp[0]/n, (float) aux_comp[1]/n, (float) aux_comp[2]/n, (float) aux_comp[3]/n};
205
206 entropy = 0;
207 if (p[0]) entropy += p[0] * std::log2(p[0]);
208 if (p[1]) entropy += p[1] * std::log2(p[1]);
209 if (p[2]) entropy += p[2] * std::log2(p[2]);
210 if (p[3]) entropy += p[3] * std::log2(p[3]);
211 kl_divergence = p[0] + p[1] + p[2] + p[3];
212 kl_divergence = entropy + 2*kl_divergence;
213 entropy = -entropy;
214 entropy = std::round(100*entropy)/100;
215
216 comp[0] = std::round(p[0] * 100);
217 comp[1] = std::round(p[1] * 100);
218 comp[2] = std::round(p[2] * 100);
219 comp[3] = std::round(p[3] * 100);
220
221 std::cerr << "tract: " << repeat_tract << "\n";
222 // std::cerr << "A: " << comp[0] << " " << aux_comp[0] << "\n";
223 // std::cerr << "C: " << comp[1] << " " << aux_comp[1] << "\n";
224 // std::cerr << "G: " << comp[2] << " " << aux_comp[2] << "\n";
225 // std::cerr << "T: " << comp[3] << " " << aux_comp[3] << "\n";
226 // std::cerr << "\n";
227 std::cerr << "entropy : " << entropy << "\n";
228 std::cerr << "kl_divergence : " << kl_divergence << "\n";
229
230 entropy2 = 0;
231 kl_divergence2 = 0;
232 float log2q = -4;
233 if (n!=1)
234 {
235 float p2[16];
236 for (uint32_t i=0; i<16; ++i)
237 {
238 p2[i] = (float)aux_comp2[i]/(n-1);
239 }
240
241 for (uint32_t i=0; i<16; ++i)
242 {
243 if (p2[i])
244 {
245 entropy2 += p2[i]* std::log2(p2[i]);
246 kl_divergence2 += p2[i];
247 }
248 }
249 kl_divergence2 = entropy2 + 4*kl_divergence2;
250 entropy2 = -entropy2;
251
252 entropy2 = std::round(100*entropy2)/100;
253
254 // std::cerr << "tract: " << repeat_tract << "\n";
255 // std::cerr << "AA: " << aux_comp2[0] << " " << p2[0] << "\n";
256 // std::cerr << "AC: " << aux_comp2[1] << " " << p2[1] << "\n";
257 // std::cerr << "AG: " << aux_comp2[2] << " " << p2[2] << "\n";
258 // std::cerr << "AT: " << aux_comp2[3] << " " << p2[3] << "\n";
259 // std::cerr << "CA: " << aux_comp2[4] << " " << p2[4] << "\n";
260 // std::cerr << "CC: " << aux_comp2[5] << " " << p2[5] << "\n";
261 // std::cerr << "CG: " << aux_comp2[6] << " " << p2[6] << "\n";
262 // std::cerr << "CT: " << aux_comp2[7] << " " << p2[7] << "\n";
263 // std::cerr << "GA: " << aux_comp2[8] << " " << p2[8] << "\n";
264 // std::cerr << "GC: " << aux_comp2[9] << " " << p2[9] << "\n";
265 // std::cerr << "GG: " << aux_comp2[10] << " " << p2[10] << "\n";
266 // std::cerr << "GT: " << aux_comp2[11] << " " << p2[11] << "\n";
267 // std::cerr << "TA: " << aux_comp2[12] << " " << p2[12] << "\n";
268 // std::cerr << "TC: " << aux_comp2[13] << " " << p2[13] << "\n";
269 // std::cerr << "TG: " << aux_comp2[14] << " " << p2[14] << "\n";
270 // std::cerr << "TT: " << aux_comp2[15] << " " << p2[15] << "\n";
271 // std::cerr << "\n";
272 std::cerr << "entropy2 : " << entropy2 << "\n";
273 std::cerr << "kl_divergence2 : " << kl_divergence2 << "\n";
274 }
275
276 }
277
test(int argc,char ** argv)278 void test(int argc, char ** argv)
279 {
280 version = "0.5";
281
282 //////////////////////////
283 //options initialization//
284 //////////////////////////
285 try
286 {
287 std::string desc = "vt test -m detect_motif -s ACTGACT \n";
288
289 std::string version = "0.5";
290 TCLAP::CmdLine cmd(desc, ' ', version);
291 VTOutput my; cmd.setOutput(&my);
292 TCLAP::UnlabeledMultiArg<std::string> arg_x("ap", "#ploidy #alleles", true, "", cmd);
293
294 cmd.parse(argc, argv);
295
296 x = arg_x.getValue();
297 }
298 catch (TCLAP::ArgException &e)
299 {
300 std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
301 abort();
302 }
303
304 no = 1;
305 for (uint32_t i=0; i<x.size(); ++i)
306 {
307 compute_composition_and_entropy(x[i]);
308 }
309 // print_genotypes(x[0], x[1], "");
310 // uint32_t g = bcf_ap2g(x[0], x[1]);
311 //
312 // std::cerr << "A: " << x[0] << " P: " << x[1] << " G: " << g << "\n";
313
314 // vcfFile *vcf = bcf_open(x[0].c_str(), "rb");
315 // bcf_hdr_t *h = bcf_hdr_read(vcf);
316 // bcf1_t *v = bcf_init();
317 //
318 //
319 // std::cerr << "writing to " << x[1] << "\n";
320 // vcfFile *ovcf = bcf_open(x[1].c_str(), "wu");
321 // bcf_hdr_write(ovcf, h);
322 //
323 // while (bcf_read(vcf, h, v)>=0)
324 // {
325 // //std::cerr << "test\n";
326 // bcf_write(ovcf, h, v);
327 // }
328 //
329 // bcf_close(ovcf);
330 // bcf_close(vcf);
331 };
332
print_stats()333 void print_stats()
334 {
335 std::clog << "\n";
336 };
337
338 /**
339 * Get a sequence. User have to free the char* returned.
340 */
get_sequence(std::string & chrom,uint32_t pos1,uint32_t len,faidx_t * fai)341 char* get_sequence(std::string& chrom, uint32_t pos1, uint32_t len, faidx_t* fai)
342 {
343 int ref_len = 0;
344
345 char* seq = faidx_fetch_uc_seq(fai, chrom.c_str(), pos1-1, pos1+len-2, &ref_len);
346
347 if (!seq || ref_len!=len)
348 {
349 fprintf(stderr, "[%s:%d %s] failure to extract sequence from fasta file: %s:%d: >\n", __FILE__, __LINE__, __FUNCTION__, chrom.c_str(), pos1-1);
350 exit(1);
351 }
352
353 return seq;
354 };
355
356 /**
357 * Analyse an mdust file
358 * Compute how much of genome it is off.
359 */
analyse_mdust(int argc,char ** argv)360 void analyse_mdust(int argc, char ** argv)
361 {
362 std::string bed_file;
363 std::string ref_fasta_file;
364 faidx_t *fai;
365
366 try
367 {
368 std::string desc = "analyse_mdust -m detect_motif -s ACTGACT \n";
369
370 std::string version = "0.5";
371 TCLAP::CmdLine cmd(desc, ' ', version);
372 VTOutput my; cmd.setOutput(&my);
373 TCLAP::ValueArg<std::string> arg_bed_file("b", "b", "BED file", true, "", "string", cmd);
374 TCLAP::ValueArg<std::string> arg_ref_fasta_file("r", "r", "reference FASTA file", true, "", "string", cmd);
375
376 cmd.parse(argc, argv);
377
378 bed_file = arg_bed_file.getValue();
379 ref_fasta_file = arg_ref_fasta_file.getValue();
380 }
381 catch (TCLAP::ArgException &e)
382 {
383 std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
384 abort();
385 }
386
387 TBXOrderedReader todr(bed_file);
388
389 if (ref_fasta_file!="")
390 {
391 fai = fai_load(ref_fasta_file.c_str());
392 if (fai==NULL)
393 {
394 fprintf(stderr, "[%s:%d %s] Cannot load genome index: %s\n", __FILE__, __LINE__, __FUNCTION__, ref_fasta_file.c_str());
395 exit(1);
396 }
397 }
398
399 kstring_t s = {0,0,0};
400
401 uint32_t no_non_n = 0;
402 uint32_t no = 0;
403
404 std::string chrom = "";
405
406 while (todr.read(&s))
407 {
408 BEDRecord br(&s);
409 //std::cerr << br.chrom << " " << br.end1 << "-" << br.start1 << " (" << (br.end1-br.start1+1)<< ")\n";
410
411 if (br.chrom!=chrom)
412 {
413 chrom = br.chrom;
414 if (chrom=="GL000207.1")
415 {
416 break;
417 }
418 std::cerr << chrom << "\n";
419 }
420
421 int ref_len = 0;
422 char* seq = faidx_fetch_uc_seq(fai, br.chrom.c_str(), br.beg1-1, br.end1-1, &ref_len);
423 //std::cerr << seq << " (" << ref_len <<")\n";
424
425 for (uint32_t i=0; i<ref_len; ++i)
426 {
427 if (seq[i]!='N')
428 {
429 ++no_non_n;
430 }
431
432 ++no;
433 }
434
435 free(seq);
436 }
437
438 std::cerr << no_non_n << "/" << no << "\n";
439 };
440
441 /**
442 * Analyse an mdust file
443 * Compute how much of genome it is off.
444 */
analyse_str(int argc,char ** argv)445 void analyse_str(int argc, char ** argv)
446 {
447 std::string bed_file;
448 std::string ref_fasta_file;
449 faidx_t *fai;
450
451 try
452 {
453 std::string desc = "analyse_mdust -m detect_motif -s ACTGACT \n";
454
455 std::string version = "0.5";
456 TCLAP::CmdLine cmd(desc, ' ', version);
457 VTOutput my; cmd.setOutput(&my);
458 TCLAP::ValueArg<std::string> arg_ref("v", "v", "reference", true, "", "string", cmd);
459 TCLAP::ValueArg<std::string> arg_alt("a", "a", "alternative", true, "", "string", cmd);
460 TCLAP::ValueArg<std::string> arg_ref_fasta_file("r", "r", "reference FASTA file", true, "", "string", cmd);
461
462 cmd.parse(argc, argv);
463
464 ref_fasta_file = arg_ref_fasta_file.getValue();
465 }
466 catch (TCLAP::ArgException &e)
467 {
468 std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
469 abort();
470 }
471
472 TBXOrderedReader todr(bed_file);
473
474 if (ref_fasta_file!="")
475 {
476 fai = fai_load(ref_fasta_file.c_str());
477 if (fai==NULL)
478 {
479 fprintf(stderr, "[%s:%d %s] Cannot load genome index: %s\n", __FILE__, __LINE__, __FUNCTION__, ref_fasta_file.c_str());
480 exit(1);
481 }
482 }
483 };
484
485 /**
486 * Analyse an mdust file
487 * Compute how much of genome it is off.
488 */
test_pcre2(int argc,char ** argv)489 void test_pcre2(int argc, char ** argv)
490 {
491 std::string regex;
492 std::string text;
493 faidx_t *fai;
494
495 try
496 {
497 std::string desc = "analyse_mdust -m detect_motif -s ACTGACT \n";
498
499 std::string version = "0.5";
500 TCLAP::CmdLine cmd(desc, ' ', version);
501 VTOutput my; cmd.setOutput(&my);
502 TCLAP::ValueArg<std::string> arg_regex("r", "r", "regular expression", true, "", "string", cmd);
503 TCLAP::ValueArg<std::string> arg_text("t", "t", "text", true, "", "string", cmd);
504
505 cmd.parse(argc, argv);
506
507 regex = arg_regex.getValue();
508 text = arg_text.getValue();
509 }
510 catch (TCLAP::ArgException &e)
511 {
512 std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
513 abort();
514 }
515
516 printf("regex = %s\n", regex.c_str());
517 printf("text = %s\n", text.c_str());
518
519 PERLregex pregex;
520
521 pregex.set(regex);
522 std::cerr << "matched: " << pregex.match(text) << "\n";
523
524
525 // pcre2_code *re;
526 // PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
527 // PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
528 // PCRE2_SPTR name_table;
529 //
530 // int crlf_is_newline;
531 // int errornumber;
532 // int find_all;
533 // int i;
534 // int namecount;
535 // int name_entry_size;
536 // int rc;
537 // int utf8;
538 //
539 // uint32_t option_bits;
540 // uint32_t newline;
541 //
542 // PCRE2_SIZE erroroffset;
543 // PCRE2_SIZE *ovector;
544 //
545 // size_t subject_length;
546 // pcre2_match_data *match_data;
547 //
548 // pattern = (PCRE2_SPTR)regex.c_str();
549 // subject = (PCRE2_SPTR)text.c_str();
550 // subject_length = strlen((char *)subject);
551 //
552 // re = pcre2_compile(
553 // pattern, /* the pattern */
554 // PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
555 // 0, /* default options */
556 // &errornumber, /* for error number */
557 // &erroroffset, /* for error offset */
558 // NULL);
559 //
560 // if (re == NULL)
561 // {
562 // PCRE2_UCHAR buffer[256];
563 // pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
564 // printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,
565 // buffer);
566 //
567 // }
568 //
569 // match_data = pcre2_match_data_create_from_pattern(re, NULL);
570 //
571 // rc = pcre2_match(
572 // re, /* the compiled pattern */
573 // subject, /* the subject string */
574 // subject_length, /* the length of the subject */
575 // 0, /* start at offset 0 in the subject */
576 // 0, /* default options */
577 // match_data, /* block for storing the result */
578 // NULL); /* use default match context */
579 //
580 // if (rc < 0)
581 // {
582 // switch(rc)
583 // {
584 // case PCRE2_ERROR_NOMATCH: printf("No match\n"); break;
585 // /*
586 // Handle other special cases if you like
587 // */
588 // default: printf("Matching error %d\n", rc); break;
589 // }
590 // pcre2_match_data_free(match_data); /* Release memory used for the match */
591 // pcre2_code_free(re); /* data and the compiled pattern. */
592 // }
593 //
594 // if (rc>0)
595 // {
596 // printf("matched!\n");
597 // }
598
599 };
600
601
test_ip2g(int argc,char ** argv)602 void test_ip2g(int argc, char ** argv)
603 {
604 printf("allele\tploidy\tindex\tverified\n");
605
606 for (int32_t no_allele=2; no_allele<=6; ++no_allele)
607 {
608 for (int32_t no_ploidy=2; no_ploidy<=6; ++no_ploidy)
609 {
610 int32_t max_index = bcf_ap2g(no_allele, no_ploidy)-1;
611
612 for (int32_t index=0; index<=max_index; ++index)
613 {
614 fprintf(stderr, "%d\t%d\t%d\t", no_allele, no_ploidy, index);
615
616
617 std::vector<int32_t> genotypes = bcf_ip2g(index, no_ploidy);
618
619 for (int32_t j=0; j<no_ploidy; ++j)
620 {
621 if (j) printf("/");
622 printf("%d", genotypes[j]);
623 }
624
625 uint32_t i = bcf_g2i(genotypes);
626
627 std::string v = (i==index) ? "correct" : "wrong";
628
629 printf("\t%s\n", v.c_str());
630
631 }
632 }
633 }
634 };
~Igor()635 ~Igor() {};
636
637 private:
638 };
639
640 }
641
test(int argc,char ** argv)642 void test(int argc, char ** argv)
643 {
644 Igor igor(argc, argv);
645
646
647 igor.test_ip2g(argc, argv);
648 // printf ("std::isnan(0.0) : %d\n",std::isnan(0.0));
649 // printf ("std::isnan(1.0/0.0) : %d\n",std::isnan(1.0/0.0));
650 // printf ("std::isnan(-1.0/0.0) : %d\n",std::isnan(-1.0/0.0));
651 // printf ("std::isnan(sqrt(-1.0)): %d\n",std::isnan(sqrt(-1.0)));
652 // printf ("std::isnan((double)sqrt(-1.0)): %d\n",std::isnan((double)sqrt(-1.0)));
653 // printf ("std::isnan((float)sqrt(-1.0)): %d\n",std::isnan((float)sqrt(-1.0)));
654 // printf ("std::isnanf((double)sqrt(-1.0)): %d\n",std::isnanf((double)sqrt(-1.0)));
655 // printf ("std::isnanf((float)sqrt(-1.0)): %d\n",std::isnanf((float)sqrt(-1.0)));
656 //igor.analyse_mdust(argc, argv);
657
658 //igor.test_pcre2(argc, argv);
659
660 //igor.test();
661 };
662