1 /* The MIT License
2
3 Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "lhmm1.h"
25
26 #include "lhmm1.h"
27
LHMM1()28 LHMM1::LHMM1()
29 {
30 initialize(500);
31 };
32
initialize(int32_t maxLength)33 void LHMM1::initialize(int32_t maxLength)
34 {
35 this->maxLength = maxLength;
36 // delta = 0.01;
37 // epsilon = 0.1;
38 // tau = 0.1;
39 // eta = 0.001;
40
41 // delta = 0.01;
42 // epsilon = 0.1;
43 // tau = 0.1;
44 // eta = 0.001;
45
46 // last
47 // delta = 0.001;
48 // epsilon = 0.5;
49 // tau = 0.1;
50 // eta = 0.01;
51
52 // delta = 0.01;
53 // epsilon = 0.5;
54 // tau = 0.1;
55 // eta = 0.001;
56
57 delta = 0.001;
58 epsilon = 0.5;
59 tau = 0.1;
60 eta = 0.001;
61
62 logOneSixteenth = log10(1.0/16.0);
63
64 //for matched portion
65 tMM = log10(1-2*delta-tau);
66 tMI = log10(delta);
67 tMD = log10(delta);
68 tII = log10(epsilon);
69 tDD = log10(epsilon);
70 tDM = log10(1-epsilon);
71 tIM = log10(1-epsilon);
72
73 txx = log10((1-eta)/(1-eta));
74 txy = log10((eta*(1-eta))/(eta*(1-eta)));
75 tyy = log10((1-eta)/(1-eta));
76
77 txm = log10((eta*eta)/(eta*eta*(1-eta)*(1-eta)));
78 tym = log10((eta)/(eta*(1-eta)*(1-eta)));
79 tsm = log10((eta*eta)/(eta*eta*eta*(1-eta)*(1-eta)));
80 tmm = log10((1-2*delta-tau)/((1-eta)*(1-eta)));
81 tim = log10((1-epsilon)/(1-eta));
82 tdm = log10((1-epsilon)/(1-eta));
83
84 tmd = log10(delta/(1-eta));
85 tdd = log10(epsilon/(1-eta));
86
87 tmi = log10(delta/(1-eta));
88 tii = log10(epsilon/(1-eta));
89
90 tmw = log10((tau*(1-eta))/(eta*(1-eta)));
91 tww = log10((1-eta)/(1-eta));
92
93 tmz = log10((tau*eta*(1-eta))/(eta*eta*(1-eta)));
94 twz = log10((eta*(1-eta))/(eta*(1-eta)));
95 tzz = log10((1-eta)/(1-eta));
96
97 //assume alignments can't possibly be maxLength bases or more
98 //todo:: allow for resizing if you do encounter fragments which are really long
99 std::vector<double> temp(maxLength+1, -DBL_MAX);
100 std::vector<char> tempX(maxLength+1, 'Y');
101 tempX[0] = 'X';
102 std::vector<char> tempY(maxLength+1, 'Y');
103 tempY[0] = 'X';
104
105 X.clear();
106 Y.clear();
107 M.clear();
108 I.clear();
109 D.clear();
110 W.clear();
111 Z.clear();
112 pathX.clear();
113 pathY.clear();
114 pathM.clear();
115 pathD.clear();
116 pathI.clear();
117 pathW.clear();
118 pathZ.clear();
119
120 for (uint32_t i=0; i<maxLength+1; ++i)
121 {
122 X.push_back(temp);
123 Y.push_back(temp);
124 M.push_back(temp);
125 I.push_back(temp);
126 D.push_back(temp);
127 W.push_back(temp);
128 Z.push_back(temp);
129 pathX.push_back(tempX);
130 pathY.push_back(tempY);
131 pathM.push_back(tempX);
132 pathD.push_back(tempX);
133 pathI.push_back(tempX);
134 pathW.push_back(tempX);
135 pathZ.push_back(tempX);
136 }
137
138 logEta = log10(eta);
139 logTau = log10(tau);
140
141 X[0][0] = 0;
142 Y[0][0] = 0;
143 M[0][0] = 0;
144 W[0][0] = 0;
145 Z[0][0] = 0;
146 pathX[0][0] = 'N';
147 pathX[1][0] = 'S';
148 pathY[0][0] = 'N';
149 pathY[0][1] = 'S';
150 pathM[0][0] = 'N';
151 pathM[1][1] = 'S';
152
153 for (uint32_t k=1; k<maxLength+1; ++k)
154 {
155 X[k][0] = X[k-1][0] + txx;
156 X[0][k] = -DBL_MAX;
157 Y[k][0] = -DBL_MAX;
158 Y[0][k] = Y[0][k-1] + tyy;
159 W[k][0] = W[k-1][0] + tww;
160 W[0][k] = -DBL_MAX;
161 Z[k][0] = -DBL_MAX;
162 Z[0][k] = Z[0][k-1] + tzz;
163 }
164
165 X[0][0] = -DBL_MAX;
166 Y[0][0] = -DBL_MAX;
167 W[0][0] = -DBL_MAX;
168 Z[0][0] = -DBL_MAX;
169
170 };
171
containsIndel()172 bool LHMM1::containsIndel()
173 {
174 return (indelStatusInPath.size()!=0);
175 }
176
177 /**
178 convert PLs to probabilities.
179 */
pl2prob(uint32_t PL)180 double LHMM1::pl2prob(uint32_t PL)
181 {
182 if (PL>=PLs.size())
183 {
184 if (PL > 3236)
185 PL = 3236;
186
187 for (uint32_t i=PLs.size(); i<=PL; ++i)
188 {
189 PLs.push_back(pow(10, -((double) i)/10.0));
190 }
191 }
192
193 return PLs[PL];
194 }
195
196 /**
197 *Updates matchStart, matchEnd, globalMaxPath and path.
198 */
tracePath()199 void LHMM1::tracePath()
200 {
201 double globalMax = M[xlen][ylen];
202 char globalMaxPath = 'M';
203 if (W[xlen][ylen]>globalMax)
204 {
205 globalMax = W[xlen][ylen];
206 globalMaxPath = 'W';
207 }
208 if (Z[xlen][ylen]>globalMax)
209 {
210 globalMax = Z[xlen][ylen];
211 globalMaxPath = 'Z';
212 }
213
214 matchStartX = -1;
215 matchEndX = -1;
216 matchStartY = -1;
217 matchEndY = -1;
218 noBasesAligned = 0;
219 if (globalMaxPath == 'M')
220 {
221 matchEndX = xlen;
222 matchEndY = ylen;
223 ++noBasesAligned;
224 }
225
226 maxLogOdds = globalMax;
227
228 //std::cerr << "\tGlobal max Path : " << globalMaxPath << "\n";
229 //std::cerr << "\tGlobal max log odds : " << globalMax << "\n";
230
231 std::stringstream ss;
232 ss << globalMaxPath;
233
234 //recursively trace path
235 tracePath(ss, globalMaxPath, xlen, ylen);
236
237 path = reverse(ss.str());
238
239 bool inI = false;
240 bool inD = false;
241 //std::cerr << "*************************************\n";
242 //std::cerr << "PATH :" << path << "\n";
243
244 ////////////////////////////////////////////////
245 //records appearance of Indels in probe and read
246 ////////////////////////////////////////////////
247 indelStartsInX.clear();
248 indelEndsInX.clear();
249 indelStartsInY.clear();
250 indelEndsInY.clear();
251 indelStartsInPath.clear();
252 indelEndsInPath.clear();
253 indelStatusInPath.clear();
254
255 uint32_t x=0, y=0;
256 for (uint32_t i=0; i<path.size(); ++i)
257 {
258 if (!inI && path[i]=='I')
259 {
260 inI = true;
261 indelStartsInPath.push_back(i);
262 indelStatusInPath.push_back('I');
263 indelStartsInX.push_back(x);
264 indelStartsInY.push_back(y);
265 }
266
267 if (inI && path[i]!='I')
268 {
269 inI = false;
270 indelEndsInPath.push_back(i);
271 indelEndsInX.push_back(x);
272 indelEndsInY.push_back(y);
273 }
274
275 if (!inD && path[i]=='D')
276 {
277 inD = true;
278 indelStartsInPath.push_back(i);
279 indelStatusInPath.push_back('D');
280 indelStartsInX.push_back(x);
281 indelStartsInY.push_back(y);
282 }
283
284 if (inD && path[i]!='D')
285 {
286 inD = false;
287 indelEndsInPath.push_back(i);
288 indelEndsInX.push_back(x);
289 indelEndsInY.push_back(y);
290 }
291
292 if (path[i]=='M')
293 {
294 ++x;
295 ++y;
296 }
297
298 if (path[i]=='I' || path[i]=='Y' || path[i]=='Z')
299 {
300 ++y;
301 }
302
303 if (path[i]=='D' || path[i]=='X' || path[i]=='W')
304 {
305 ++x;
306 }
307 }
308
309 // std::cerr << "\t# INS in Path : ";
310 // for (uint32_t i=0; i<insertionStartsInPath.size(); ++i)
311 // {
312 // std::cerr << "(" << insertionStartsInPath[i] << "," << insertionEndsInPath[i] << ") ";
313 // }
314 // std::cerr << "\n";
315 //
316 // std::cerr << "\t# DEL in Path : ";
317 // for (uint32_t i=0; i<deletionStartsInPath.size(); ++i)
318 // {
319 // std::cerr << "(" << deletionStartsInPath[i] << "," << deletionEndsInPath[i] << ") ";
320 // }
321 // std::cerr << "\n";
322 // std::cerr << "*************************************\n";
323 //left align path
324 left_align();
325
326 // std::cerr << "*************************************\n";
327 // printAlignment();
328 // std::cerr << "*************************************\n";
329
330 // std::cerr << "\tPath: " << path << "\n";
331 }
332
tracePath(std::stringstream & ss,char state,uint32_t i,uint32_t j)333 void LHMM1::tracePath(std::stringstream& ss, char state, uint32_t i, uint32_t j)
334 {
335 //std::cout << state << "\n";
336 if (i>0 && j>0)
337 {
338 if (state=='X')
339 {
340 //std::cerr << pathX[i][j] << " " << i << " " << j << " " << matchStartY << "\n";
341 ss << pathX[i][j];
342 tracePath(ss, pathX[i][j], i-1, j);
343 }
344 else if (state=='Y')
345 {
346 //std::cerr << pathY[i][j] << " " << i << " " << j << " " << matchStartY << "\n";
347 ss << pathY[i][j];
348 tracePath(ss, pathY[i][j], i, j-1);
349 }
350 else if (state=='M')
351 {
352 // if (matchEndX==-1)
353 // {
354 // matchEndX = i;
355 // matchEndY = j;
356 //
357 // //std::cerr << "set matchEndX " << matchEndX << " " << matchEndY << "\n";
358 // }
359 //
360 if (matchStartX==-1 && (pathM[i][j] =='X' || pathM[i][j]=='Y'))
361 {
362 matchStartX = i;
363 matchStartY = j;
364
365 //std::cerr << "set match starts " << matchStartX << " " << matchStartY << "\n";
366 }
367
368
369 //std::cout << pathM[i][j] << " " << i << " " << j << " " << matchEndY << "\n";
370 ss << pathM[i][j];
371 tracePath(ss, pathM[i][j], i-1, j-1);
372 ++noBasesAligned;
373 }
374 else if (state=='I')
375 {
376 //std::cout << pathI[i][j] << " " << i << " " << j << "\n";
377 ss << pathI[i][j];
378 tracePath(ss, pathI[i][j], i, j-1);
379 }
380 else if (state=='D')
381 {
382 //std::cout << pathD[i][j] << " " << i << " " << j << "\n";
383 ss << pathD[i][j];
384 tracePath(ss, pathD[i][j], i-1, j);
385 ++noBasesAligned;
386 }
387 else if (state=='W')
388 {
389 if (matchEndX==-1 && pathW[i][j] =='M')
390 {
391 matchEndX = i-1;
392 matchEndY = j;
393 }
394
395
396 //std::cout << pathW[i][j] << " " << i << " " << j << "\n";
397 ss << pathW[i][j];
398 tracePath(ss, pathW[i][j], i-1, j);
399 }
400 else if (state=='Z')
401 {
402 if (matchEndX==-1 && pathZ[i][j] =='M')
403 {
404 matchEndX = i;
405 matchEndY = j-1;
406 }
407
408 //std::cout << pathZ[i][j] << " " << i << " " << j << "\n";
409 ss << pathZ[i][j];
410 tracePath(ss, pathZ[i][j], i, j-1);
411 }
412 else if (state=='S')
413 {
414 //std::cout << "\n";
415 if (matchStartX==-1)
416 {
417 matchStartX = i+1;
418 }
419
420 if (matchStartY==-1)
421 {
422 matchStartY = j+1;
423 }
424
425 }
426 }
427 else if (i==0 && j>0)
428 {
429 if (matchStartY==-1)
430 {
431 matchStartY = j+1;
432 }
433
434 // std::cout << pathY[i][j] << " " << i << " " << j << "\n";
435 ss << pathY[i][j];
436 tracePath(ss, pathY[i][j], i, j-1);
437 }
438 else if (i>0 && j==0)
439 {
440 if (matchStartY==-1)
441 {
442 matchStartY = j+1;
443 }
444 // std::cout << pathX[i][j] << " " << i << " " << j << "\n";
445 ss << pathX[i][j];
446 tracePath(ss, pathX[i][j], i-1, j);
447 }
448 else
449 {
450 //std::cout << "\n";
451 //ss << pathX[i][j];
452 }
453 }
454
455 /**
456 *Left align indels in an alignment
457 */
left_align()458 void LHMM1::left_align()
459 {
460 //.......... GATAAGTGAG GAGGAAAAGC GA---GGAGA TCTTATTTGACAACTGCE
461 //YYYYYYYYYY MMMMMMMMMM MMMMMMMMMM MMIIIMMMMM MMMMMWWWWWWWWWWWWE
462 //ACTGCTTCTA GATAAGTGAG GAGGAAAAGC GATAAGGAGA TCTTA............E
463 //# INS in Probe : (22,22)
464 //# INS in Read : (32,35)
465
466 for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
467 {
468 if (indelStatusInPath[i]=='I')
469 {
470 while (indelStartsInX[i]>1 && indelStartsInY[i]>1)
471 {
472 // printAlignment();
473 //
474 // std::cerr << "X[" << indelStartsInX[i]-1 << "] " << x[indelStartsInX[i]-1] << "\n";
475 // std::cerr << "Y[" << indelStartsInY[i]-1 << "] " << y[indelStartsInY[i]-1] << "\n";
476 // std::cerr << "Y[" << indelEndsInY[i]-1 << "] " << y[indelEndsInY[i]-1] << "\n";
477 //
478 if (//(indelStartsInY[i]!=indelEndsInY[i]-1) && //single indel need not be shifted
479 x[indelStartsInX[i]-1]==y[indelStartsInY[i]-1] &&
480 y[indelStartsInY[i]-1]==y[indelEndsInY[i]-1])
481 {
482 // std::cerr << "CHECK BEFORE:" << path << "\n";
483 // std::cerr << "P[" << indelStartsInPath[i]-1 << "] " << path[indelStartsInPath[i]-1] << "\n";
484 // std::cerr << "P[" << indelEndsInPath[i]-1 << "] " << path[indelEndsInPath[i]-1] << "\n";
485
486 path[indelStartsInPath[i]-1] = 'I';
487 path[indelEndsInPath[i]-1] = 'M';
488
489 --indelStartsInX[i];
490 --indelEndsInX[i];
491 --indelStartsInY[i];
492 --indelEndsInY[i];
493 --indelStartsInPath[i];
494 --indelEndsInPath[i];
495
496 // std::cerr << "SHIFT 1 to the left\n";
497
498 }
499 else
500 {
501 break;
502 }
503 }
504 }
505 else
506 {
507 // ==================
508 // 2) 6:81319178:AATTT:A:-4
509 // ==================
510 // ref probe ATCTATTAAAGATATATGTCAATTTATTAGTGCATATCAAAGAGCAAGT (20/49)
511 // read sequence GATATATGTCAATTAGTGCATATCAAAGAGCAAGTTGACATGTTTTTTCAATATATTCATTTCTCTAATTTATCTC
512 // ==================
513 // X SATCTATTAAAGATATATGTCAATTTATTAGTGCATATCAAAGAGCAAGT.........................................E
514 // Path SXXXXXXXXXXMMMMMMMMMMMMMDDDDMMMMMMMMMMMMMMMMMMMMMMZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZE
515 // Y S..........GATATATGTCAAT----TAGTGCATATCAAAGAGCAAGTTGACATGTTTTTTCAATATATTCATTTCTCTAATTTATCTCE
516 // Match Start : 0
517 // Match End : 35
518 // # Aligned Bases : 39
519 // # Matched Bases : 35
520 // # Mismatched Bases : 0
521 // # INS in Probe :
522 // # INS in Read :
523 // # INS in Path :
524 // # DEL in Probe : (23,27)
525 // # DEL in Read : (13,13)
526 // # DEL in Path : (24,28)
527
528 while (indelStartsInX[i]>1 && indelStartsInY[i]>1)
529 {
530 // printAlignment();
531 // std::cerr << "X[" << indelStartsInX[i]-1 << "] " << x[indelStartsInX[i]-1] << "\n";
532 // std::cerr << "Y[" << indelStartsInY[i]-1 << "] " << y[indelStartsInY[i]-1] << "\n";
533 // std::cerr << "Y[" << indelEndsInY[i]-1 << "] " << y[indelEndsInY[i]-1] << "\n";
534
535 if (//(indelStartsInX[i]!=indelEndsInX[i]-1) && //single indel need not be shifted
536 x[indelStartsInX[i]-1]==y[indelStartsInY[i]-1] &&
537 x[indelStartsInX[i]-1]==x[indelEndsInX[i]-1])
538 {
539 // std::cerr << "CHECK BEFORE:" << path << "\n";
540 // std::cerr << "P[" << indelStartsInPath[i]-1 << "] " << path[indelStartsInPath[i]-1] << "\n";
541 // std::cerr << "P[" << indelEndsInPath[i]-1 << "] " << path[indelEndsInPath[i]-1] << "\n";
542
543 path[indelStartsInPath[i]-1] = 'D';
544 path[indelEndsInPath[i]-1] = 'M';
545
546 --indelStartsInX[i];
547 --indelEndsInX[i];
548 --indelStartsInY[i];
549 --indelEndsInY[i];
550 --indelStartsInPath[i];
551 --indelEndsInPath[i];
552
553 // std::cerr << "SHIFT 1 to the left\n";
554
555 }
556 else
557 {
558 break;
559 }
560 }
561
562 }
563 }
564 }
565
566 /**
567 Align and compute genotype likelihood.
568 */
align(double & llk,const char * _x,const char * _y,const char * _qual,bool debug)569 void LHMM1::align(double& llk, const char* _x, const char* _y, const char* _qual, bool debug)
570 {
571 x = _x;
572 y = _y;
573 qual = _qual;
574
575 //adds a starting character at the fron of each string that must be matched
576 xlen = strlen(x);
577 ylen = strlen(y);
578
579 int32_t mlen = xlen>ylen?xlen:ylen;
580 if (mlen>this->maxLength)
581 {
582 initialize(mlen+10);
583 }
584
585 double max = 0;
586 char maxPath = 'X';
587
588 //std::cerr << "x: " << x << " " << xlen << "\n" ;
589 //std::cerr << "y: " << y << " " << ylen << "\n" ;
590
591 //construct possible solutions
592 for (uint32_t i=1; i<=xlen; ++i)
593 {
594 for (uint32_t j=1; j<=ylen; ++j)
595 {
596 //std::cerr << i << " " << j << "\n" ;
597
598 //X
599 double xx = X[i-1][j] + txx;
600
601 max = xx;
602 maxPath = 'X';
603
604 X[i][j] = max;
605 pathX[i][j] = maxPath;
606
607 //Y
608 double xy = X[i][j-1] + txy;
609 double yy = Y[i][j-1] + tyy;
610
611 max = xy;
612 maxPath = 'X';
613
614 if (yy>max)
615 {
616 max = yy;
617 maxPath = 'Y';
618 }
619
620 Y[i][j] = max;
621 pathY[i][j] = maxPath;
622
623 //M
624 double xm = X[i-1][j-1] + txm;
625 double ym = Y[i-1][j-1] + tym;
626 double mm = M[i-1][j-1] + ((i==1&&j==1) ? tsm : tmm);
627 double im = I[i-1][j-1] + tim;
628 double dm = D[i-1][j-1] + tdm;
629
630 max = xm;
631 maxPath = 'X';
632
633 if (ym>max) //special case
634 {
635 max = ym;
636 maxPath = 'Y';
637 }
638 if (mm>max)
639 {
640 max = mm;
641 maxPath = (i==1&&j==1) ? 'S' : 'M';
642 }
643 if (im>max)
644 {
645 max = im;
646 maxPath = 'I';
647 }
648 if (dm>max)
649 {
650 max = dm;
651 maxPath = 'D';
652 }
653
654 M[i][j] = max + logEmissionOdds(x[i-1], y[j-1], pl2prob((uint32_t) qual[j-1]-33));
655 pathM[i][j] = maxPath;
656
657 //D
658 double md = M[i-1][j] + tmd;
659 double dd = D[i-1][j] + tdd;
660
661 max = md;
662 maxPath = 'M';
663
664 if (dd>max)
665 {
666 max = dd;
667 maxPath = 'D';
668 }
669
670 D[i][j] = max;
671 pathD[i][j] = maxPath;
672
673 //I
674 double mi = M[i][j-1] + tmi;
675 double ii = I[i][j-1] + tii;
676
677 max = mi;
678 maxPath = 'M';
679
680 if (ii>max)
681 {
682 max = ii;
683 maxPath = 'I';
684 }
685
686 I[i][j] = max;
687 pathI[i][j] = maxPath;
688
689 //W
690 double mw = M[i-1][j] + tmw;
691 double ww = W[i-1][j] + tww;
692
693 max = mw;
694 maxPath = 'M';
695
696 if (ww>max)
697 {
698 max = ww;
699 maxPath = 'W';
700 }
701
702 W[i][j] = max;
703 pathW[i][j] = maxPath;
704
705 //Z
706 double mz = M[i][j-1] + tmz;
707 double wz = W[i][j-1] + twz;
708 double zz = Z[i][j-1] + tzz;
709
710 max = mz;
711 maxPath = 'M';
712
713 if (wz>max)
714 {
715 max = wz;
716 maxPath = 'W';
717 }
718 if (zz>max)
719 {
720 max = zz;
721 maxPath = 'Z';
722 }
723
724 Z[i][j] = max;
725 pathZ[i][j] = maxPath;
726 }
727
728 M[xlen][ylen] += logTau-logEta;
729 }
730
731 if (debug)
732 {
733 std::cerr << "\n=X=\n";
734 printVector(X, xlen+1, ylen+1);
735 std::cerr << "\n=Y=\n";
736 printVector(Y, xlen+1, ylen+1);
737 std::cerr << "\n=M=\n";
738 printVector(M, xlen+1, ylen+1);
739 std::cerr << "\n=D=\n";
740 printVector(D, xlen+1, ylen+1);
741 std::cerr << "\n=I=\n";
742 printVector(I, xlen+1, ylen+1);
743 std::cerr << "\n=W=\n";
744 printVector(W, xlen+1, ylen+1);
745 std::cerr << "\n=Z=\n";
746 printVector(Z, xlen+1, ylen+1);
747 std::cerr << "\n=Path X=\n";
748 printVector(pathX, xlen+1, ylen+1);
749 std::cerr << "\n=Path Y=\n";
750 printVector(pathY, xlen+1, ylen+1);
751 std::cerr << "\n=Path M=\n";
752 printVector(pathM, xlen+1, ylen+1);
753 std::cerr << "\n=Path D=\n";
754 printVector(pathD, xlen+1, ylen+1);
755 std::cerr << "\n=Path I=\n";
756 printVector(pathI, xlen+1, ylen+1);
757 std::cerr << "\n=Path W=\n";
758 printVector(pathW, xlen+1, ylen+1);
759 std::cerr << "\n=Path Z=\n";
760 printVector(pathZ, xlen+1, ylen+1);
761 }
762
763 tracePath();
764 };
765
766 /**
767 *Compute log likelihood of matched portion of alignment
768 */
computeLogLikelihood(double & llk,std::string & _path,const char * qual)769 void LHMM1::computeLogLikelihood(double& llk, std::string& _path, const char* qual)
770 {
771
772 // std::cerr <<" basequals " << qual << "\n";
773 // std::cerr <<" read " << y << "\n";
774 // std::cerr <<" path " << path << "\n";
775 matchedBases = 0;
776 mismatchedBases = 0;
777
778 llk = 0;
779 double q = 0;
780 char lastState = 'S';
781 uint32_t xIndex=0, yIndex=0;
782 for (uint32_t i=0; i<_path.size(); ++i)
783 {
784 char state = _path.at(i);
785 //std::cerr << state << " " << xIndex << " " << yIndex << "\n";
786
787 if (state=='M')
788 {
789 q = pl2prob((uint32_t) qual[yIndex]-33);
790 llk = LogTool::log10prod(llk, logEmission(x[xIndex], y[yIndex], q));
791 //compute for perfect fit
792
793 if (x[xIndex]== y[yIndex])
794 {
795 ++matchedBases;
796 }
797 else
798 {
799 ++mismatchedBases;
800 }
801
802 if (lastState=='M')
803 {
804 llk = LogTool::log10prod(llk, tMM);
805 }
806 else if (lastState=='D')
807 {
808 llk = LogTool::log10prod(llk, tDM);
809 }
810 else if (lastState=='I')
811 {
812 llk = LogTool::log10prod(llk, tIM);
813 }
814
815 //std::cerr << "M) "<< lastState << " " << llk << " " << x[xIndex] << " " << y[yIndex] << " " << pl2prob((uint32_t) qual[yIndex]-33) << " "<< qual[yIndex-1]-33 << "\n";
816
817 ++xIndex;
818 ++yIndex;
819 }
820 else if (state=='D')
821 {
822 if (lastState=='M')
823 {
824 llk = LogTool::log10prod(llk, tMD);
825 }
826 else if (lastState=='D')
827 {
828 llk = LogTool::log10prod(llk, tDD);
829 }
830
831 //std::cerr << "D) " << lastState << " " << llk << " " << x[xIndex] << " " << y[yIndex] << "\n";
832
833 ++xIndex;
834 }
835 else if (state=='I')
836 {
837 if (lastState=='M')
838 {
839 llk = LogTool::log10prod(llk, tMI);
840 }
841 else if (lastState=='I')
842 {
843 llk = LogTool::log10prod(llk, tII);
844 }
845
846 //std::cerr << "I) " << lastState << " " << llk << " " << x[xIndex] << " " << y[yIndex] << "\n";
847
848 ++yIndex;
849 }
850 else if (state=='X')
851 {
852 ++xIndex;
853 }
854 else if (state=='Y')
855 {
856 ++yIndex;
857 }
858 else if (state=='W')
859 {
860 ++xIndex;
861 }
862 else if (state=='Z')
863 {
864 ++yIndex;
865 }
866
867 lastState = state;
868 }
869 };
870
871 /**
872 * Compute log likelihood of matched portion of alignment
873 */
computeLogLikelihood(double & llk,double & perfectllk,std::string & _path,const char * qual)874 void LHMM1::computeLogLikelihood(double& llk, double& perfectllk, std::string& _path, const char* qual)
875 {
876
877 // std::cerr <<" basequals " << qual << "\n";
878 // std::cerr <<" read " << y << "\n";
879 // std::cerr <<" path " << path << "\n";
880 matchedBases = 0;
881 mismatchedBases = 0;
882
883 llk = 0;
884 perfectllk = 0;
885 double q = 0;
886 char lastState = 'S';
887 uint32_t xIndex=0, yIndex=0;
888 for (uint32_t i=0; i<_path.size(); ++i)
889 {
890
891 char state = _path.at(i);
892 //std::cerr << state << " " << xIndex << " " << yIndex << "\n";
893
894 if (state=='M')
895 {
896 q = pl2prob((uint32_t) qual[yIndex]-33);
897 llk = LogTool::log10prod(llk, logEmission(x[xIndex], y[yIndex], q));
898 //compute for perfect fit
899 perfectllk = LogTool::log10prod(perfectllk, logEmission(x[xIndex], x[xIndex], q));
900
901 if (x[xIndex]== y[yIndex])
902 {
903 ++matchedBases;
904 }
905 else
906 {
907 ++mismatchedBases;
908 }
909
910 if (lastState=='M')
911 {
912 llk = LogTool::log10prod(llk, tMM);
913 }
914 else if (lastState=='D')
915 {
916 llk = LogTool::log10prod(llk, tDM);
917 }
918 else if (lastState=='I')
919 {
920 llk = LogTool::log10prod(llk, tIM);
921 }
922
923 perfectllk = LogTool::log10prod(llk, tMM);
924
925 //std::cerr << "M) "<< lastState << " " << llk << " " << x[xIndex] << " " << y[yIndex] << " " << pl2prob((uint32_t) qual[yIndex]-33) << " "<< qual[yIndex-1]-33 << "\n";
926
927 ++xIndex;
928 ++yIndex;
929 }
930 else if (state=='D')
931 {
932 if (lastState=='M')
933 {
934 llk = LogTool::log10prod(llk, tMD);
935 }
936 else if (lastState=='D')
937 {
938 llk = LogTool::log10prod(llk, tDD);
939 }
940
941 // std::cerr << "D) " << lastState << " " << llk << " " << x[xIndex] << " " << y[yIndex] << "\n";
942
943 ++xIndex;
944 }
945 else if (state=='I')
946 {
947 if (lastState=='M')
948 {
949 llk = LogTool::log10prod(llk, tMI);
950 }
951 else if (lastState=='I')
952 {
953 llk = LogTool::log10prod(llk, tII);
954 }
955
956 // std::cerr << "I) " << lastState << " " << llk << " " << x[xIndex] << " " << y[yIndex] << "\n";
957
958 ++yIndex;
959 }
960 else if (state=='X')
961 {
962 ++xIndex;
963 }
964 else if (state=='Y')
965 {
966 ++yIndex;
967 }
968 else if (state=='W')
969 {
970 ++xIndex;
971 }
972 else if (state=='Z')
973 {
974 ++yIndex;
975 }
976
977 lastState = state;
978 }
979 };
980
981 /**
982 *Compute log likelihood of matched portion of alignment
983 */
computeLogLikelihood(double & llk,const char * qual)984 void LHMM1::computeLogLikelihood(double& llk, const char* qual)
985 {
986 tracePath();
987 llk = 0;
988 char lastState = 'S';
989 uint32_t xIndex=0, yIndex=0;
990
991 for (uint32_t i=0; i<path.size(); ++i)
992 {
993 //std::cerr << xIndex << " " << yIndex << "\n";
994
995 char state = path.at(i);
996
997 if (state=='M')
998 {
999 llk = LogTool::log10prod(llk, logEmission(x[xIndex], y[yIndex], pl2prob((uint32_t) qual[yIndex-1]-33)));
1000 if (lastState=='M')
1001 {
1002 llk = LogTool::log10prod(llk, tMM);
1003 }
1004 else if (lastState=='D')
1005 {
1006 llk = LogTool::log10prod(llk, tDM);
1007 }
1008 else if (lastState=='I')
1009 {
1010 llk = LogTool::log10prod(llk, tIM);
1011 }
1012
1013 ++xIndex;
1014 ++yIndex;
1015 }
1016 else if (state=='D')
1017 {
1018 if (lastState=='M')
1019 {
1020 llk = LogTool::log10prod(llk, tMD);
1021 }
1022 else if (lastState=='D')
1023 {
1024 llk = LogTool::log10prod(llk, tDD);
1025 }
1026 ++xIndex;
1027 }
1028 else if (state=='I')
1029 {
1030 if (lastState=='M')
1031 {
1032 llk = LogTool::log10prod(llk, tMI);
1033 }
1034 else if (lastState=='I')
1035 {
1036 llk = LogTool::log10prod(llk, tII);
1037 }
1038 ++yIndex;
1039 }
1040 else if (state=='X')
1041 {
1042 ++xIndex;
1043 }
1044 else if (state=='Y')
1045 {
1046 ++yIndex;
1047 }
1048 else if (state=='W')
1049 {
1050 ++xIndex;
1051 }
1052 else if (state=='Z')
1053 {
1054 ++yIndex;
1055 }
1056
1057 lastState = state;
1058 }
1059 }
1060
getPath()1061 std::string& LHMM1::getPath()
1062 {
1063 return path;
1064 }
1065
logEmissionOdds(char readBase,char probeBase,double e)1066 double LHMM1::logEmissionOdds(char readBase, char probeBase, double e)
1067 {
1068 //4 encodes for N
1069 if (readBase=='N' || probeBase=='N')
1070 {
1071 //silent match
1072 return 0;
1073 }
1074
1075 if (readBase!=probeBase)
1076 {
1077 return log10(e/3)-logOneSixteenth;
1078 }
1079 else
1080 {
1081 return log10(1-e)-logOneSixteenth;
1082 }
1083 };
1084
logEmission(char readBase,char probeBase,double e)1085 double LHMM1::logEmission(char readBase, char probeBase, double e)
1086 {
1087 //4 encodes for N
1088 if (readBase=='N' || probeBase=='N')
1089 {
1090 //silent match
1091 return 0;
1092 }
1093
1094 if (readBase!=probeBase)
1095 {
1096 return log10(e/3);
1097 }
1098 else
1099 {
1100 return log10(1-e);
1101 }
1102 };
1103
emission(char readBase,char probeBase,double e)1104 double LHMM1::emission(char readBase, char probeBase, double e)
1105 {
1106 //4 encodes for N
1107 if (readBase=='N' || probeBase=='N')
1108 {
1109 //silent match
1110 return 0;
1111 }
1112
1113 if (readBase!=probeBase)
1114 {
1115 return e/3;
1116 }
1117 else
1118 {
1119 return 1-e;
1120 }
1121 };
1122
reverse(std::string s)1123 std::string LHMM1::reverse(std::string s)
1124 {
1125 std::string rs;
1126 for (std::string::reverse_iterator rit=s.rbegin() ; rit < s.rend(); rit++ )
1127 {
1128 rs.push_back(*rit);
1129 }
1130
1131 return rs;
1132 };
1133
printVector(std::vector<std::vector<double>> & v,uint32_t xLen,uint32_t yLen)1134 void LHMM1::printVector(std::vector<std::vector<double> >& v, uint32_t xLen, uint32_t yLen)
1135 {
1136 for (uint32_t i=0; i<xLen; ++i)
1137 {
1138 for (uint32_t j=0; j<yLen; ++j)
1139 {
1140 std::cerr << (v[i][j]==-DBL_MAX?-1000:v[i][j]) << "\t";
1141 }
1142
1143 std::cerr << "\n";
1144 }
1145 };
1146
printVector(std::vector<std::vector<char>> & v,uint32_t xLen,uint32_t yLen)1147 void LHMM1::printVector(std::vector<std::vector<char> >& v, uint32_t xLen, uint32_t yLen)
1148 {
1149 for (uint32_t i=0; i<xLen; ++i)
1150 {
1151 for (uint32_t j=0; j<yLen; ++j)
1152 {
1153 std::cerr << v[i][j] << "\t";
1154 }
1155
1156 std::cerr << "\n";
1157 }
1158 };
1159
printVector(std::vector<std::vector<double>> & v)1160 void LHMM1::printVector(std::vector<std::vector<double> >& v)
1161 {
1162 for (uint32_t i=0; i<v.size(); ++i)
1163 {
1164 for (uint32_t j=0; j<v[i].size(); ++j)
1165 {
1166 std::cerr << v[i][j] << "\t";
1167 }
1168
1169 std::cerr << "\n";
1170 }
1171 };
1172
printAlignment(std::string & pad,std::stringstream & log)1173 void LHMM1::printAlignment(std::string& pad, std::stringstream& log)
1174 {
1175 std::stringstream xAligned;
1176 std::stringstream yAligned;
1177 std::stringstream qualAligned;
1178 uint32_t xIndex=0, yIndex=0;
1179 for (uint32_t i=0; i<path.size(); ++i)
1180 {
1181 char state = path.at(i);
1182 if (state=='S' || state=='E')
1183 {
1184 xAligned << state;
1185 yAligned << state;
1186 qualAligned << state;
1187 }
1188 else if (state=='X'||state=='W')
1189 {
1190 xAligned << x[xIndex++];
1191 yAligned << '.';
1192 qualAligned << '.';
1193 }
1194 else if (state=='M')
1195 {
1196 xAligned << x[xIndex++];
1197 yAligned << y[yIndex];
1198 qualAligned << qual[yIndex++];
1199 }
1200 else if (state=='D')
1201 {
1202 xAligned << x[xIndex++];
1203 yAligned << '-';
1204 qualAligned << '-';
1205 }
1206 else if (state=='I')
1207 {
1208 xAligned << '-';
1209 yAligned << y[yIndex];
1210 qualAligned << qual[yIndex++];
1211 }
1212 else if (state=='Y'||state=='Z')
1213 {
1214 xAligned << '.';
1215 yAligned << y[yIndex];
1216 qualAligned << qual[yIndex++];
1217 }
1218 }
1219
1220 log << pad << "X " << xAligned.str() << "E\n";
1221 log << pad << "Path " << path << "E\n";
1222 log << pad << "Y " << yAligned.str() << "E\n";
1223 log << pad << "Qual " << qualAligned.str() << "E\n\n";
1224
1225 log << pad << "Alignment in Probe : [" << matchStartX << "," << matchEndX<< "]\n";
1226 log << pad << "Alignment in Read : [" << matchStartY << "," << matchEndY<< "]\n";
1227 log << pad << "# Aligned Bases : " << noBasesAligned << "\n";
1228 log << pad << "# Matched Bases : " << matchedBases << "\n";
1229 log << pad << "# Mismatched Bases : " << mismatchedBases << "\n";
1230
1231 log << pad << "# INS in Probe : ";
1232 for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1233 {
1234 if (indelStatusInPath[i]=='I')
1235 {
1236 log << "[" << indelStartsInX[i] << "," << indelEndsInX[i] <<"] ";
1237 }
1238 }
1239 log << "\n";
1240
1241 log << pad << "# DELS in Probe : ";
1242 for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1243 {
1244 if (indelStatusInPath[i]=='D')
1245 {
1246 log << "[" << indelStartsInX[i] << "," << indelEndsInX[i] <<"] ";
1247 }
1248 }
1249 log << "\n";
1250
1251 log << pad << "# INS in Read : ";
1252 for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1253 {
1254 if (indelStatusInPath[i]=='I')
1255 {
1256 log << "[" << indelStartsInY[i] << "," << indelEndsInY[i] <<"] ";
1257 }
1258 }
1259 log << "\n";
1260
1261 log << pad << "# DELS in Read : ";
1262 for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1263 {
1264 if (indelStatusInPath[i]=='D')
1265 {
1266 log << "[" << indelStartsInY[i] << "," << indelEndsInY[i] <<"] ";
1267 }
1268 }
1269 log << "\n";
1270
1271 log << pad << "Max Log odds : " << maxLogOdds << "\n";
1272 };
1273
printAlignment(std::string & pad)1274 void LHMM1::printAlignment(std::string& pad)
1275 {
1276 //std::cerr << "\tAlignments\n" ;
1277
1278 //trace back
1279 //std::cerr << "\tBefore alignment" << "\n";
1280 //std::cerr << "\tX " << x << "\n";
1281 //std::cerr << "\tY " << y << "\n";
1282 //std::cerr << "\n";
1283 //std::cerr << "\tAfter alignment" << "\n";
1284 //use path to align x and y
1285 std::stringstream xAligned;
1286 std::stringstream yAligned;
1287 std::stringstream qualAligned;
1288 uint32_t xIndex=0, yIndex=0;
1289 for (uint32_t i=0; i<path.size(); ++i)
1290 {
1291 char state = path.at(i);
1292 if (state=='S' || state=='E')
1293 {
1294 xAligned << state;
1295 yAligned << state;
1296 qualAligned << state;
1297 }
1298 else if (state=='X'||state=='W')
1299 {
1300 xAligned << x[xIndex++];
1301 yAligned << '.';
1302 qualAligned << '.';
1303 }
1304 else if (state=='M')
1305 {
1306 xAligned << x[xIndex++];
1307 yAligned << y[yIndex];
1308 qualAligned << qual[yIndex++];
1309 }
1310 else if (state=='D')
1311 {
1312 xAligned << x[xIndex++];
1313 yAligned << '-';
1314 qualAligned << '-';
1315 }
1316 else if (state=='I')
1317 {
1318 xAligned << '-';
1319 yAligned << y[yIndex];
1320 qualAligned << qual[yIndex++];
1321 }
1322 else if (state=='Y'||state=='Z')
1323 {
1324 xAligned << '.';
1325 yAligned << y[yIndex];
1326 qualAligned << qual[yIndex++];
1327 }
1328 }
1329
1330 std::cerr << pad << "X " << xAligned.str() << "E\n";
1331 std::cerr << pad << "Path " << path << "E\n";
1332 std::cerr << pad << "Y " << yAligned.str() << "E\n";
1333 std::cerr << pad << "Qual " << qualAligned.str() << "E\n\n";
1334
1335 std::cerr << pad << "Alignment in Probe : [" << matchStartX << "," << matchEndX<< "]\n";
1336 std::cerr << pad << "Alignment in Read : [" << matchStartY << "," << matchEndY<< "]\n";
1337 std::cerr << pad << "# Aligned Bases : " << noBasesAligned << "\n";
1338 std::cerr << pad << "# Matched Bases : " << matchedBases << "\n";
1339 std::cerr << pad << "# Mismatched Bases : " << mismatchedBases << "\n";
1340
1341 std::cerr << pad << "# INS in Probe : ";
1342 for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1343 {
1344 if (indelStatusInPath[i]=='I')
1345 {
1346 std::cerr << "[" << indelStartsInX[i] << "," << indelEndsInX[i] <<"] ";
1347 }
1348 }
1349 std::cerr << "\n";
1350
1351 std::cerr << pad << "# DELS in Probe : ";
1352 for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1353 {
1354 if (indelStatusInPath[i]=='D')
1355 {
1356 std::cerr << "[" << indelStartsInX[i] << "," << indelEndsInX[i] <<"] ";
1357 }
1358 }
1359 std::cerr << "\n";
1360
1361 std::cerr << pad << "# INS in Read : ";
1362 for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1363 {
1364 if (indelStatusInPath[i]=='I')
1365 {
1366 std::cerr << "[" << indelStartsInY[i] << "," << indelEndsInY[i] <<"] ";
1367 }
1368 }
1369 std::cerr << "\n";
1370
1371 std::cerr << pad << "# DELS in Read : ";
1372 for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1373 {
1374 if (indelStatusInPath[i]=='D')
1375 {
1376 std::cerr << "[" << indelStartsInY[i] << "," << indelEndsInY[i] <<"] ";
1377 }
1378 }
1379 std::cerr << "\n";
1380
1381 std::cerr << pad << "Max Log odds : " << maxLogOdds << "\n";
1382 };
1383
printAlignment()1384 void LHMM1::printAlignment()
1385 {
1386 std::string pad = "\t";
1387 printAlignment(pad);
1388 };
1389
score(char a,char b)1390 double LHMM1::score(char a, char b)
1391 {
1392 if (a==b)
1393 return 0;
1394 else
1395 return -1;
1396 };
1397
1398 /**
1399 * Checks if deletion exists in alignment.
1400 */
deletion_start_exists(uint32_t pos,uint32_t & rpos)1401 bool LHMM1::deletion_start_exists(uint32_t pos, uint32_t& rpos)
1402 {
1403 rpos = 0;
1404 for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1405 {
1406 if (indelStatusInPath[i]=='D' &&
1407 indelStartsInX[i]==pos)
1408 {
1409 rpos = indelStartsInY[i];
1410 return true;
1411 }
1412 }
1413
1414 return false;
1415 }
1416
1417 /**
1418 * Checks if insertion exists in alignment.
1419 */
insertion_start_exists(uint32_t pos,uint32_t & rpos)1420 bool LHMM1::insertion_start_exists(uint32_t pos, uint32_t& rpos)
1421 {
1422 rpos = 0;
1423 for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1424 {
1425 if (indelStatusInPath[i]=='I' &&
1426 indelStartsInX[i]==pos)
1427 {
1428 rpos = indelStartsInY[i];
1429 return true;
1430 }
1431 }
1432
1433 return false;
1434 }