1 /* The MIT License
2 
3    Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "lhmm1.h"
25 
26 #include "lhmm1.h"
27 
LHMM1()28 LHMM1::LHMM1()
29 {
30     initialize(500);
31 };
32 
initialize(int32_t maxLength)33 void LHMM1::initialize(int32_t maxLength)
34 {
35     this->maxLength = maxLength;
36 //  delta = 0.01;
37 //  epsilon = 0.1;
38 //  tau = 0.1;
39 //  eta = 0.001;
40 
41 //  delta = 0.01;
42 //  epsilon = 0.1;
43 //  tau = 0.1;
44 //  eta = 0.001;
45 
46 //  last
47 //  delta = 0.001;
48 //  epsilon = 0.5;
49 //  tau = 0.1;
50 //  eta = 0.01;
51 
52 //  delta = 0.01;
53 //  epsilon = 0.5;
54 //  tau = 0.1;
55 //  eta = 0.001;
56 
57     delta = 0.001;
58     epsilon = 0.5;
59     tau = 0.1;
60     eta = 0.001;
61 
62     logOneSixteenth = log10(1.0/16.0);
63 
64     //for matched portion
65     tMM = log10(1-2*delta-tau);
66     tMI = log10(delta);
67     tMD = log10(delta);
68     tII = log10(epsilon);
69     tDD = log10(epsilon);
70     tDM = log10(1-epsilon);
71     tIM = log10(1-epsilon);
72 
73     txx = log10((1-eta)/(1-eta));
74     txy = log10((eta*(1-eta))/(eta*(1-eta)));
75     tyy = log10((1-eta)/(1-eta));
76 
77     txm = log10((eta*eta)/(eta*eta*(1-eta)*(1-eta)));
78     tym = log10((eta)/(eta*(1-eta)*(1-eta)));
79     tsm = log10((eta*eta)/(eta*eta*eta*(1-eta)*(1-eta)));
80     tmm = log10((1-2*delta-tau)/((1-eta)*(1-eta)));
81     tim = log10((1-epsilon)/(1-eta));
82     tdm = log10((1-epsilon)/(1-eta));
83 
84     tmd = log10(delta/(1-eta));
85     tdd = log10(epsilon/(1-eta));
86 
87     tmi = log10(delta/(1-eta));
88     tii = log10(epsilon/(1-eta));
89 
90     tmw = log10((tau*(1-eta))/(eta*(1-eta)));
91     tww = log10((1-eta)/(1-eta));
92 
93     tmz = log10((tau*eta*(1-eta))/(eta*eta*(1-eta)));
94     twz = log10((eta*(1-eta))/(eta*(1-eta)));
95     tzz = log10((1-eta)/(1-eta));
96 
97     //assume alignments can't possibly be maxLength bases or more
98     //todo:: allow for resizing if you do encounter fragments which are really long
99     std::vector<double> temp(maxLength+1, -DBL_MAX);
100     std::vector<char> tempX(maxLength+1, 'Y');
101     tempX[0] = 'X';
102     std::vector<char> tempY(maxLength+1, 'Y');
103     tempY[0] = 'X';
104 
105     X.clear();
106     Y.clear();
107     M.clear();
108     I.clear();
109     D.clear();
110     W.clear();
111     Z.clear();
112     pathX.clear();
113     pathY.clear();
114     pathM.clear();
115     pathD.clear();
116     pathI.clear();
117     pathW.clear();
118     pathZ.clear();
119 
120     for (uint32_t i=0; i<maxLength+1; ++i)
121     {
122         X.push_back(temp);
123         Y.push_back(temp);
124         M.push_back(temp);
125         I.push_back(temp);
126         D.push_back(temp);
127         W.push_back(temp);
128         Z.push_back(temp);
129         pathX.push_back(tempX);
130         pathY.push_back(tempY);
131         pathM.push_back(tempX);
132         pathD.push_back(tempX);
133         pathI.push_back(tempX);
134         pathW.push_back(tempX);
135         pathZ.push_back(tempX);
136     }
137 
138     logEta = log10(eta);
139     logTau = log10(tau);
140 
141     X[0][0] = 0;
142     Y[0][0] = 0;
143     M[0][0] = 0;
144     W[0][0] = 0;
145     Z[0][0] = 0;
146     pathX[0][0] = 'N';
147     pathX[1][0] = 'S';
148     pathY[0][0] = 'N';
149     pathY[0][1] = 'S';
150     pathM[0][0] = 'N';
151     pathM[1][1] = 'S';
152 
153     for (uint32_t k=1; k<maxLength+1; ++k)
154     {
155         X[k][0] = X[k-1][0] + txx;
156         X[0][k] = -DBL_MAX;
157         Y[k][0] = -DBL_MAX;
158         Y[0][k] = Y[0][k-1] + tyy;
159         W[k][0] = W[k-1][0] + tww;
160         W[0][k] = -DBL_MAX;
161         Z[k][0] = -DBL_MAX;
162         Z[0][k] = Z[0][k-1] + tzz;
163     }
164 
165     X[0][0] = -DBL_MAX;
166     Y[0][0] = -DBL_MAX;
167     W[0][0] = -DBL_MAX;
168     Z[0][0] = -DBL_MAX;
169 
170 };
171 
containsIndel()172 bool LHMM1::containsIndel()
173 {
174     return (indelStatusInPath.size()!=0);
175 }
176 
177 /**
178 convert PLs to probabilities.
179 */
pl2prob(uint32_t PL)180 double LHMM1::pl2prob(uint32_t PL)
181 {
182     if (PL>=PLs.size())
183     {
184         if (PL > 3236)
185             PL = 3236;
186 
187         for (uint32_t i=PLs.size(); i<=PL; ++i)
188         {
189             PLs.push_back(pow(10, -((double) i)/10.0));
190         }
191     }
192 
193     return PLs[PL];
194 }
195 
196 /**
197  *Updates matchStart, matchEnd, globalMaxPath and path.
198  */
tracePath()199 void LHMM1::tracePath()
200 {
201     double globalMax = M[xlen][ylen];
202     char globalMaxPath = 'M';
203     if (W[xlen][ylen]>globalMax)
204     {
205         globalMax = W[xlen][ylen];
206         globalMaxPath = 'W';
207     }
208     if (Z[xlen][ylen]>globalMax)
209     {
210         globalMax = Z[xlen][ylen];
211         globalMaxPath = 'Z';
212     }
213 
214     matchStartX = -1;
215     matchEndX = -1;
216     matchStartY = -1;
217     matchEndY = -1;
218     noBasesAligned = 0;
219     if (globalMaxPath == 'M')
220     {
221         matchEndX = xlen;
222         matchEndY = ylen;
223         ++noBasesAligned;
224     }
225 
226     maxLogOdds = globalMax;
227 
228     //std::cerr << "\tGlobal max Path : " << globalMaxPath << "\n";
229     //std::cerr << "\tGlobal max log odds : " << globalMax << "\n";
230 
231     std::stringstream ss;
232     ss << globalMaxPath;
233 
234     //recursively trace path
235     tracePath(ss, globalMaxPath, xlen, ylen);
236 
237     path = reverse(ss.str());
238 
239     bool inI = false;
240     bool inD = false;
241     //std::cerr << "*************************************\n";
242     //std::cerr << "PATH :" << path << "\n";
243 
244     ////////////////////////////////////////////////
245     //records appearance of Indels in probe and read
246     ////////////////////////////////////////////////
247     indelStartsInX.clear();
248     indelEndsInX.clear();
249     indelStartsInY.clear();
250     indelEndsInY.clear();
251     indelStartsInPath.clear();
252     indelEndsInPath.clear();
253     indelStatusInPath.clear();
254 
255     uint32_t x=0, y=0;
256     for (uint32_t i=0; i<path.size(); ++i)
257     {
258         if (!inI && path[i]=='I')
259         {
260             inI = true;
261             indelStartsInPath.push_back(i);
262             indelStatusInPath.push_back('I');
263             indelStartsInX.push_back(x);
264             indelStartsInY.push_back(y);
265         }
266 
267         if (inI && path[i]!='I')
268         {
269             inI = false;
270             indelEndsInPath.push_back(i);
271             indelEndsInX.push_back(x);
272             indelEndsInY.push_back(y);
273         }
274 
275         if (!inD && path[i]=='D')
276         {
277             inD = true;
278             indelStartsInPath.push_back(i);
279             indelStatusInPath.push_back('D');
280             indelStartsInX.push_back(x);
281             indelStartsInY.push_back(y);
282         }
283 
284         if (inD && path[i]!='D')
285         {
286             inD = false;
287             indelEndsInPath.push_back(i);
288             indelEndsInX.push_back(x);
289             indelEndsInY.push_back(y);
290         }
291 
292         if (path[i]=='M')
293         {
294             ++x;
295             ++y;
296         }
297 
298         if (path[i]=='I' || path[i]=='Y' || path[i]=='Z')
299         {
300             ++y;
301         }
302 
303         if (path[i]=='D' || path[i]=='X' || path[i]=='W')
304         {
305             ++x;
306         }
307     }
308 
309 //    std::cerr << "\t# INS in Path : ";
310 //    for (uint32_t i=0; i<insertionStartsInPath.size(); ++i)
311 //    {
312 //         std::cerr << "(" << insertionStartsInPath[i] << "," << insertionEndsInPath[i] << ") ";
313 //    }
314 //    std::cerr << "\n";
315 //
316 //    std::cerr << "\t# DEL in Path : ";
317 //    for (uint32_t i=0; i<deletionStartsInPath.size(); ++i)
318 //    {
319 //         std::cerr << "(" << deletionStartsInPath[i] << "," << deletionEndsInPath[i] << ") ";
320 //    }
321 //    std::cerr << "\n";
322 //    std::cerr << "*************************************\n";
323     //left align path
324     left_align();
325 
326 //    std::cerr << "*************************************\n";
327 //    printAlignment();
328 //    std::cerr << "*************************************\n";
329 
330     // std::cerr << "\tPath: " << path << "\n";
331 }
332 
tracePath(std::stringstream & ss,char state,uint32_t i,uint32_t j)333 void LHMM1::tracePath(std::stringstream& ss, char state, uint32_t i, uint32_t j)
334 {
335     //std::cout << state << "\n";
336     if (i>0 && j>0)
337     {
338         if (state=='X')
339         {
340             //std::cerr << pathX[i][j] << " " << i << " " << j << " " << matchStartY << "\n";
341             ss << pathX[i][j];
342             tracePath(ss, pathX[i][j], i-1, j);
343         }
344         else if (state=='Y')
345         {
346             //std::cerr << pathY[i][j] << " " << i << " " << j << " " << matchStartY << "\n";
347             ss << pathY[i][j];
348             tracePath(ss, pathY[i][j], i, j-1);
349         }
350         else if (state=='M')
351         {
352 //            if (matchEndX==-1)
353 //            {
354 //                matchEndX = i;
355 //                matchEndY = j;
356 //
357 //                //std::cerr << "set matchEndX "  << matchEndX  << " " << matchEndY  <<  "\n";
358 //            }
359 //
360             if (matchStartX==-1 && (pathM[i][j] =='X' || pathM[i][j]=='Y'))
361             {
362                matchStartX = i;
363                matchStartY = j;
364 
365                //std::cerr << "set match starts "  << matchStartX  << " " << matchStartY  <<  "\n";
366             }
367 
368 
369             //std::cout << pathM[i][j] << " " << i << " " << j << " " << matchEndY << "\n";
370             ss << pathM[i][j];
371             tracePath(ss, pathM[i][j], i-1, j-1);
372             ++noBasesAligned;
373         }
374         else if (state=='I')
375         {
376             //std::cout << pathI[i][j] << " " << i << " " << j << "\n";
377             ss << pathI[i][j];
378             tracePath(ss, pathI[i][j], i, j-1);
379         }
380         else if (state=='D')
381         {
382             //std::cout << pathD[i][j] << " " << i << " " << j << "\n";
383             ss << pathD[i][j];
384             tracePath(ss, pathD[i][j], i-1, j);
385             ++noBasesAligned;
386         }
387         else if (state=='W')
388         {
389             if (matchEndX==-1 && pathW[i][j] =='M')
390             {
391                 matchEndX = i-1;
392                 matchEndY = j;
393             }
394 
395 
396             //std::cout << pathW[i][j] << " " << i << " " << j << "\n";
397             ss << pathW[i][j];
398             tracePath(ss, pathW[i][j], i-1, j);
399         }
400         else if (state=='Z')
401         {
402             if (matchEndX==-1 && pathZ[i][j] =='M')
403             {
404                 matchEndX = i;
405                 matchEndY = j-1;
406             }
407 
408             //std::cout << pathZ[i][j] << " " << i << " " << j << "\n";
409             ss << pathZ[i][j];
410             tracePath(ss, pathZ[i][j], i, j-1);
411         }
412         else if (state=='S')
413         {
414             //std::cout << "\n";
415             if (matchStartX==-1)
416             {
417                matchStartX = i+1;
418             }
419 
420             if (matchStartY==-1)
421             {
422                matchStartY = j+1;
423             }
424 
425         }
426     }
427     else if (i==0 && j>0)
428     {
429         if (matchStartY==-1)
430         {
431            matchStartY = j+1;
432         }
433 
434         // std::cout << pathY[i][j] << " " << i << " " << j << "\n";
435         ss << pathY[i][j];
436         tracePath(ss, pathY[i][j], i, j-1);
437     }
438     else if (i>0 && j==0)
439     {
440         if (matchStartY==-1)
441         {
442            matchStartY = j+1;
443         }
444         // std::cout << pathX[i][j] << " " << i << " " << j << "\n";
445         ss << pathX[i][j];
446         tracePath(ss, pathX[i][j], i-1, j);
447     }
448     else
449     {
450         //std::cout << "\n";
451         //ss << pathX[i][j];
452     }
453 }
454 
455 /**
456  *Left align indels in an alignment
457  */
left_align()458 void LHMM1::left_align()
459 {
460 //.......... GATAAGTGAG GAGGAAAAGC GA---GGAGA TCTTATTTGACAACTGCE
461 //YYYYYYYYYY MMMMMMMMMM MMMMMMMMMM MMIIIMMMMM MMMMMWWWWWWWWWWWWE
462 //ACTGCTTCTA GATAAGTGAG GAGGAAAAGC GATAAGGAGA TCTTA............E
463 //# INS in Probe : (22,22)
464 //# INS in Read : (32,35)
465 
466     for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
467     {
468         if (indelStatusInPath[i]=='I')
469         {
470             while (indelStartsInX[i]>1 && indelStartsInY[i]>1)
471             {
472 //                printAlignment();
473 //
474 //                std::cerr << "X[" << indelStartsInX[i]-1 << "] "  << x[indelStartsInX[i]-1] << "\n";
475 //                std::cerr << "Y[" << indelStartsInY[i]-1 << "] "  << y[indelStartsInY[i]-1] << "\n";
476 //                std::cerr << "Y[" << indelEndsInY[i]-1 << "] "  << y[indelEndsInY[i]-1] << "\n";
477 //
478                 if (//(indelStartsInY[i]!=indelEndsInY[i]-1) && //single indel need not be shifted
479                     x[indelStartsInX[i]-1]==y[indelStartsInY[i]-1] &&
480                     y[indelStartsInY[i]-1]==y[indelEndsInY[i]-1])
481                 {
482 //                    std::cerr << "CHECK BEFORE:" << path << "\n";
483 //                    std::cerr << "P[" << indelStartsInPath[i]-1 << "] "  << path[indelStartsInPath[i]-1] << "\n";
484 //                    std::cerr << "P[" << indelEndsInPath[i]-1 << "] "  << path[indelEndsInPath[i]-1] << "\n";
485 
486                     path[indelStartsInPath[i]-1] = 'I';
487                     path[indelEndsInPath[i]-1] = 'M';
488 
489                     --indelStartsInX[i];
490                     --indelEndsInX[i];
491                     --indelStartsInY[i];
492                     --indelEndsInY[i];
493                     --indelStartsInPath[i];
494                     --indelEndsInPath[i];
495 
496 //                    std::cerr << "SHIFT 1 to the left\n";
497 
498                 }
499                 else
500                 {
501                     break;
502                 }
503             }
504         }
505         else
506         {
507             //  ==================
508             //  2) 6:81319178:AATTT:A:-4
509             //  ==================
510             //  ref probe     ATCTATTAAAGATATATGTCAATTTATTAGTGCATATCAAAGAGCAAGT (20/49)
511             //  read sequence GATATATGTCAATTAGTGCATATCAAAGAGCAAGTTGACATGTTTTTTCAATATATTCATTTCTCTAATTTATCTC
512             //  ==================
513             //  X    SATCTATTAAAGATATATGTCAATTTATTAGTGCATATCAAAGAGCAAGT.........................................E
514             //  Path SXXXXXXXXXXMMMMMMMMMMMMMDDDDMMMMMMMMMMMMMMMMMMMMMMZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZE
515             //  Y    S..........GATATATGTCAAT----TAGTGCATATCAAAGAGCAAGTTGACATGTTTTTTCAATATATTCATTTCTCTAATTTATCTCE
516             //  Match Start     : 0
517             //  Match End       : 35
518             //  # Aligned Bases : 39
519             //  # Matched Bases : 35
520             //  # Mismatched Bases : 0
521             //  # INS in Probe :
522             //  # INS in Read :
523             //  # INS in Path :
524             //  # DEL in Probe : (23,27)
525             //  # DEL in Read : (13,13)
526             //  # DEL in Path : (24,28)
527 
528             while (indelStartsInX[i]>1 && indelStartsInY[i]>1)
529             {
530 //                printAlignment();
531 //                std::cerr << "X[" << indelStartsInX[i]-1 << "] "  << x[indelStartsInX[i]-1] << "\n";
532 //                std::cerr << "Y[" << indelStartsInY[i]-1 << "] "  << y[indelStartsInY[i]-1] << "\n";
533 //                std::cerr << "Y[" << indelEndsInY[i]-1 << "] "  << y[indelEndsInY[i]-1] << "\n";
534 
535                 if (//(indelStartsInX[i]!=indelEndsInX[i]-1) && //single indel need not be shifted
536                     x[indelStartsInX[i]-1]==y[indelStartsInY[i]-1] &&
537                     x[indelStartsInX[i]-1]==x[indelEndsInX[i]-1])
538                 {
539 //                    std::cerr << "CHECK BEFORE:" << path << "\n";
540 //                    std::cerr << "P[" << indelStartsInPath[i]-1 << "] "  << path[indelStartsInPath[i]-1] << "\n";
541 //                    std::cerr << "P[" << indelEndsInPath[i]-1 << "] "  << path[indelEndsInPath[i]-1] << "\n";
542 
543                     path[indelStartsInPath[i]-1] = 'D';
544                     path[indelEndsInPath[i]-1] = 'M';
545 
546                     --indelStartsInX[i];
547                     --indelEndsInX[i];
548                     --indelStartsInY[i];
549                     --indelEndsInY[i];
550                     --indelStartsInPath[i];
551                     --indelEndsInPath[i];
552 
553 //                    std::cerr << "SHIFT 1 to the left\n";
554 
555                 }
556                 else
557                 {
558                     break;
559                 }
560             }
561 
562         }
563     }
564 }
565 
566 /**
567 Align and compute genotype likelihood.
568 */
align(double & llk,const char * _x,const char * _y,const char * _qual,bool debug)569 void LHMM1::align(double& llk, const char* _x, const char* _y, const char* _qual, bool debug)
570 {
571     x = _x;
572     y = _y;
573     qual = _qual;
574 
575     //adds a starting character at the fron of each string that must be matched
576     xlen = strlen(x);
577     ylen = strlen(y);
578 
579     int32_t mlen = xlen>ylen?xlen:ylen;
580     if (mlen>this->maxLength)
581     {
582         initialize(mlen+10);
583     }
584 
585     double max = 0;
586     char maxPath = 'X';
587 
588     //std::cerr << "x: " << x << " " << xlen << "\n" ;
589     //std::cerr << "y: " << y << " " << ylen << "\n" ;
590 
591     //construct possible solutions
592     for (uint32_t i=1; i<=xlen; ++i)
593     {
594         for (uint32_t j=1; j<=ylen; ++j)
595         {
596             //std::cerr << i << " " << j  << "\n" ;
597 
598             //X
599             double xx = X[i-1][j] + txx;
600 
601             max = xx;
602             maxPath = 'X';
603 
604             X[i][j] = max;
605             pathX[i][j] = maxPath;
606 
607             //Y
608             double xy = X[i][j-1] + txy;
609             double yy = Y[i][j-1] + tyy;
610 
611             max = xy;
612             maxPath = 'X';
613 
614             if (yy>max)
615             {
616                 max = yy;
617                 maxPath = 'Y';
618             }
619 
620             Y[i][j] = max;
621             pathY[i][j] = maxPath;
622 
623             //M
624             double xm = X[i-1][j-1] + txm;
625             double ym = Y[i-1][j-1] + tym;
626             double mm = M[i-1][j-1] + ((i==1&&j==1) ? tsm : tmm);
627             double im = I[i-1][j-1] + tim;
628             double dm = D[i-1][j-1] + tdm;
629 
630             max = xm;
631             maxPath = 'X';
632 
633             if (ym>max) //special case
634             {
635                 max = ym;
636                 maxPath = 'Y';
637             }
638             if (mm>max)
639             {
640                 max = mm;
641                 maxPath = (i==1&&j==1) ? 'S' : 'M';
642             }
643             if (im>max)
644             {
645                 max = im;
646                 maxPath = 'I';
647             }
648             if (dm>max)
649             {
650                 max = dm;
651                 maxPath = 'D';
652             }
653 
654             M[i][j] = max + logEmissionOdds(x[i-1], y[j-1], pl2prob((uint32_t) qual[j-1]-33));
655             pathM[i][j] = maxPath;
656 
657             //D
658             double md = M[i-1][j] + tmd;
659             double dd = D[i-1][j] + tdd;
660 
661             max = md;
662             maxPath = 'M';
663 
664             if (dd>max)
665             {
666                 max = dd;
667                 maxPath = 'D';
668             }
669 
670             D[i][j] = max;
671             pathD[i][j] = maxPath;
672 
673             //I
674             double mi = M[i][j-1] + tmi;
675             double ii = I[i][j-1] + tii;
676 
677             max = mi;
678             maxPath = 'M';
679 
680             if (ii>max)
681             {
682                 max = ii;
683                 maxPath = 'I';
684             }
685 
686             I[i][j] = max;
687             pathI[i][j] = maxPath;
688 
689             //W
690             double mw = M[i-1][j] + tmw;
691             double ww = W[i-1][j] + tww;
692 
693             max = mw;
694             maxPath = 'M';
695 
696             if (ww>max)
697             {
698                 max = ww;
699                 maxPath = 'W';
700             }
701 
702             W[i][j] = max;
703             pathW[i][j] = maxPath;
704 
705             //Z
706             double mz = M[i][j-1] + tmz;
707             double wz = W[i][j-1] + twz;
708             double zz = Z[i][j-1] + tzz;
709 
710             max = mz;
711             maxPath = 'M';
712 
713             if (wz>max)
714             {
715                 max = wz;
716                 maxPath = 'W';
717             }
718             if (zz>max)
719             {
720                 max = zz;
721                 maxPath = 'Z';
722             }
723 
724             Z[i][j] = max;
725             pathZ[i][j] = maxPath;
726         }
727 
728         M[xlen][ylen] += logTau-logEta;
729     }
730 
731     if (debug)
732     {
733         std::cerr << "\n=X=\n";
734         printVector(X, xlen+1, ylen+1);
735         std::cerr << "\n=Y=\n";
736         printVector(Y, xlen+1, ylen+1);
737         std::cerr << "\n=M=\n";
738         printVector(M, xlen+1, ylen+1);
739         std::cerr << "\n=D=\n";
740         printVector(D, xlen+1, ylen+1);
741         std::cerr << "\n=I=\n";
742         printVector(I, xlen+1, ylen+1);
743         std::cerr << "\n=W=\n";
744         printVector(W, xlen+1, ylen+1);
745         std::cerr << "\n=Z=\n";
746         printVector(Z, xlen+1, ylen+1);
747         std::cerr << "\n=Path X=\n";
748         printVector(pathX, xlen+1, ylen+1);
749         std::cerr << "\n=Path Y=\n";
750         printVector(pathY, xlen+1, ylen+1);
751         std::cerr << "\n=Path M=\n";
752         printVector(pathM, xlen+1, ylen+1);
753         std::cerr << "\n=Path D=\n";
754         printVector(pathD, xlen+1, ylen+1);
755         std::cerr << "\n=Path I=\n";
756         printVector(pathI, xlen+1, ylen+1);
757         std::cerr << "\n=Path W=\n";
758         printVector(pathW, xlen+1, ylen+1);
759         std::cerr << "\n=Path Z=\n";
760         printVector(pathZ, xlen+1, ylen+1);
761     }
762 
763     tracePath();
764 };
765 
766 /**
767 *Compute log likelihood of matched portion of alignment
768 */
computeLogLikelihood(double & llk,std::string & _path,const char * qual)769 void LHMM1::computeLogLikelihood(double& llk, std::string& _path, const char* qual)
770 {
771 
772 //  std::cerr <<" basequals " << qual << "\n";
773 //  std::cerr <<" read      " << y << "\n";
774 //  std::cerr <<" path      " << path << "\n";
775     matchedBases = 0;
776     mismatchedBases = 0;
777 
778     llk = 0;
779     double q = 0;
780     char lastState = 'S';
781     uint32_t xIndex=0, yIndex=0;
782     for (uint32_t i=0; i<_path.size(); ++i)
783     {
784         char state = _path.at(i);
785         //std::cerr << state << " " << xIndex << " " << yIndex << "\n";
786 
787         if (state=='M')
788         {
789             q = pl2prob((uint32_t) qual[yIndex]-33);
790             llk = LogTool::log10prod(llk, logEmission(x[xIndex], y[yIndex], q));
791             //compute for perfect fit
792 
793             if (x[xIndex]== y[yIndex])
794             {
795                 ++matchedBases;
796             }
797             else
798             {
799                 ++mismatchedBases;
800             }
801 
802             if (lastState=='M')
803             {
804                 llk = LogTool::log10prod(llk, tMM);
805             }
806             else if (lastState=='D')
807             {
808                 llk = LogTool::log10prod(llk, tDM);
809             }
810             else if (lastState=='I')
811             {
812                 llk = LogTool::log10prod(llk, tIM);
813             }
814 
815             //std::cerr << "M) "<< lastState << " "  << llk << " " << x[xIndex] << " " <<  y[yIndex] << " " << pl2prob((uint32_t) qual[yIndex]-33) << " "<< qual[yIndex-1]-33 << "\n";
816 
817             ++xIndex;
818             ++yIndex;
819         }
820         else if (state=='D')
821         {
822             if (lastState=='M')
823             {
824                 llk = LogTool::log10prod(llk, tMD);
825             }
826             else if (lastState=='D')
827             {
828                 llk = LogTool::log10prod(llk, tDD);
829             }
830 
831             //std::cerr << "D) " << lastState << " " << llk << " " << x[xIndex] << " " <<  y[yIndex] << "\n";
832 
833             ++xIndex;
834         }
835         else if (state=='I')
836         {
837             if (lastState=='M')
838             {
839                 llk = LogTool::log10prod(llk, tMI);
840             }
841             else if (lastState=='I')
842             {
843                 llk = LogTool::log10prod(llk, tII);
844             }
845 
846             //std::cerr << "I) " << lastState << " " << llk << " " << x[xIndex] << " " <<  y[yIndex] << "\n";
847 
848             ++yIndex;
849         }
850         else if (state=='X')
851         {
852             ++xIndex;
853         }
854         else if (state=='Y')
855         {
856             ++yIndex;
857         }
858         else if (state=='W')
859         {
860             ++xIndex;
861         }
862         else if (state=='Z')
863         {
864             ++yIndex;
865         }
866 
867         lastState = state;
868     }
869 };
870 
871 /**
872  * Compute log likelihood of matched portion of alignment
873  */
computeLogLikelihood(double & llk,double & perfectllk,std::string & _path,const char * qual)874 void LHMM1::computeLogLikelihood(double& llk, double& perfectllk, std::string& _path, const char* qual)
875 {
876 
877 //  std::cerr <<" basequals " << qual << "\n";
878 //  std::cerr <<" read      " << y << "\n";
879 //  std::cerr <<" path      " << path << "\n";
880     matchedBases = 0;
881     mismatchedBases = 0;
882 
883     llk = 0;
884     perfectllk = 0;
885     double q = 0;
886     char lastState = 'S';
887     uint32_t xIndex=0, yIndex=0;
888     for (uint32_t i=0; i<_path.size(); ++i)
889     {
890 
891         char state = _path.at(i);
892         //std::cerr << state << " " << xIndex << " " << yIndex << "\n";
893 
894         if (state=='M')
895         {
896             q = pl2prob((uint32_t) qual[yIndex]-33);
897             llk = LogTool::log10prod(llk, logEmission(x[xIndex], y[yIndex], q));
898             //compute for perfect fit
899             perfectllk = LogTool::log10prod(perfectllk, logEmission(x[xIndex], x[xIndex], q));
900 
901             if (x[xIndex]== y[yIndex])
902             {
903                 ++matchedBases;
904             }
905             else
906             {
907                 ++mismatchedBases;
908             }
909 
910             if (lastState=='M')
911             {
912                 llk = LogTool::log10prod(llk, tMM);
913             }
914             else if (lastState=='D')
915             {
916                 llk = LogTool::log10prod(llk, tDM);
917             }
918             else if (lastState=='I')
919             {
920                 llk = LogTool::log10prod(llk, tIM);
921             }
922 
923             perfectllk = LogTool::log10prod(llk, tMM);
924 
925             //std::cerr << "M) "<< lastState << " "  << llk << " " << x[xIndex] << " " <<  y[yIndex] << " " << pl2prob((uint32_t) qual[yIndex]-33) << " "<< qual[yIndex-1]-33 << "\n";
926 
927             ++xIndex;
928             ++yIndex;
929         }
930         else if (state=='D')
931         {
932             if (lastState=='M')
933             {
934                 llk = LogTool::log10prod(llk, tMD);
935             }
936             else if (lastState=='D')
937             {
938                 llk = LogTool::log10prod(llk, tDD);
939             }
940 
941            // std::cerr << "D) " << lastState << " " << llk << " " << x[xIndex] << " " <<  y[yIndex] << "\n";
942 
943             ++xIndex;
944         }
945         else if (state=='I')
946         {
947             if (lastState=='M')
948             {
949                 llk = LogTool::log10prod(llk, tMI);
950             }
951             else if (lastState=='I')
952             {
953                 llk = LogTool::log10prod(llk, tII);
954             }
955 
956             // std::cerr << "I) " << lastState << " " << llk << " " << x[xIndex] << " " <<  y[yIndex] << "\n";
957 
958             ++yIndex;
959         }
960         else if (state=='X')
961         {
962             ++xIndex;
963         }
964         else if (state=='Y')
965         {
966             ++yIndex;
967         }
968         else if (state=='W')
969         {
970             ++xIndex;
971         }
972         else if (state=='Z')
973         {
974             ++yIndex;
975         }
976 
977         lastState = state;
978     }
979 };
980 
981 /**
982 *Compute log likelihood of matched portion of alignment
983 */
computeLogLikelihood(double & llk,const char * qual)984 void LHMM1::computeLogLikelihood(double& llk, const char* qual)
985 {
986     tracePath();
987     llk = 0;
988     char lastState = 'S';
989     uint32_t xIndex=0, yIndex=0;
990 
991     for (uint32_t i=0; i<path.size(); ++i)
992     {
993         //std::cerr << xIndex << " " << yIndex << "\n";
994 
995         char state = path.at(i);
996 
997         if (state=='M')
998         {
999             llk = LogTool::log10prod(llk, logEmission(x[xIndex], y[yIndex], pl2prob((uint32_t) qual[yIndex-1]-33)));
1000             if (lastState=='M')
1001             {
1002                 llk = LogTool::log10prod(llk, tMM);
1003             }
1004             else if (lastState=='D')
1005             {
1006                 llk = LogTool::log10prod(llk, tDM);
1007             }
1008             else if (lastState=='I')
1009             {
1010                 llk = LogTool::log10prod(llk, tIM);
1011             }
1012 
1013             ++xIndex;
1014             ++yIndex;
1015         }
1016         else if (state=='D')
1017         {
1018             if (lastState=='M')
1019             {
1020                 llk = LogTool::log10prod(llk, tMD);
1021             }
1022             else if (lastState=='D')
1023             {
1024                 llk = LogTool::log10prod(llk, tDD);
1025             }
1026             ++xIndex;
1027         }
1028         else if (state=='I')
1029         {
1030             if (lastState=='M')
1031             {
1032                 llk = LogTool::log10prod(llk, tMI);
1033             }
1034             else if (lastState=='I')
1035             {
1036                 llk = LogTool::log10prod(llk, tII);
1037             }
1038             ++yIndex;
1039         }
1040         else if (state=='X')
1041         {
1042             ++xIndex;
1043         }
1044         else if (state=='Y')
1045         {
1046             ++yIndex;
1047         }
1048         else if (state=='W')
1049         {
1050             ++xIndex;
1051         }
1052         else if (state=='Z')
1053         {
1054             ++yIndex;
1055         }
1056 
1057         lastState = state;
1058     }
1059 }
1060 
getPath()1061 std::string& LHMM1::getPath()
1062 {
1063     return path;
1064 }
1065 
logEmissionOdds(char readBase,char probeBase,double e)1066 double LHMM1::logEmissionOdds(char readBase, char probeBase, double e)
1067 {
1068     //4 encodes for N
1069     if (readBase=='N' || probeBase=='N')
1070     {
1071         //silent match
1072         return 0;
1073     }
1074 
1075     if (readBase!=probeBase)
1076     {
1077         return log10(e/3)-logOneSixteenth;
1078     }
1079     else
1080     {
1081         return log10(1-e)-logOneSixteenth;
1082     }
1083 };
1084 
logEmission(char readBase,char probeBase,double e)1085 double LHMM1::logEmission(char readBase, char probeBase, double e)
1086 {
1087     //4 encodes for N
1088     if (readBase=='N' || probeBase=='N')
1089     {
1090         //silent match
1091         return 0;
1092     }
1093 
1094     if (readBase!=probeBase)
1095     {
1096         return log10(e/3);
1097     }
1098     else
1099     {
1100         return log10(1-e);
1101     }
1102 };
1103 
emission(char readBase,char probeBase,double e)1104 double LHMM1::emission(char readBase, char probeBase, double e)
1105 {
1106     //4 encodes for N
1107     if (readBase=='N' || probeBase=='N')
1108     {
1109         //silent match
1110         return 0;
1111     }
1112 
1113     if (readBase!=probeBase)
1114     {
1115         return e/3;
1116     }
1117     else
1118     {
1119         return 1-e;
1120     }
1121 };
1122 
reverse(std::string s)1123 std::string LHMM1::reverse(std::string s)
1124 {
1125     std::string rs;
1126     for (std::string::reverse_iterator rit=s.rbegin() ; rit < s.rend(); rit++ )
1127     {
1128         rs.push_back(*rit);
1129     }
1130 
1131     return rs;
1132 };
1133 
printVector(std::vector<std::vector<double>> & v,uint32_t xLen,uint32_t yLen)1134 void LHMM1::printVector(std::vector<std::vector<double> >& v, uint32_t xLen, uint32_t yLen)
1135 {
1136     for (uint32_t i=0; i<xLen; ++i)
1137     {
1138         for (uint32_t j=0; j<yLen; ++j)
1139         {
1140           std::cerr << (v[i][j]==-DBL_MAX?-1000:v[i][j]) << "\t";
1141         }
1142 
1143         std::cerr << "\n";
1144     }
1145 };
1146 
printVector(std::vector<std::vector<char>> & v,uint32_t xLen,uint32_t yLen)1147 void LHMM1::printVector(std::vector<std::vector<char> >& v, uint32_t xLen, uint32_t yLen)
1148 {
1149     for (uint32_t i=0; i<xLen; ++i)
1150     {
1151         for (uint32_t j=0; j<yLen; ++j)
1152         {
1153           std::cerr << v[i][j] << "\t";
1154         }
1155 
1156         std::cerr << "\n";
1157     }
1158 };
1159 
printVector(std::vector<std::vector<double>> & v)1160 void LHMM1::printVector(std::vector<std::vector<double> >& v)
1161 {
1162     for (uint32_t i=0; i<v.size(); ++i)
1163     {
1164         for (uint32_t j=0; j<v[i].size(); ++j)
1165         {
1166           std::cerr << v[i][j] << "\t";
1167         }
1168 
1169         std::cerr << "\n";
1170     }
1171 };
1172 
printAlignment(std::string & pad,std::stringstream & log)1173 void LHMM1::printAlignment(std::string& pad, std::stringstream& log)
1174 {
1175     std::stringstream xAligned;
1176     std::stringstream yAligned;
1177     std::stringstream qualAligned;
1178     uint32_t xIndex=0, yIndex=0;
1179     for (uint32_t i=0; i<path.size(); ++i)
1180     {
1181         char state = path.at(i);
1182         if (state=='S' || state=='E')
1183         {
1184             xAligned << state;
1185             yAligned << state;
1186             qualAligned << state;
1187         }
1188         else if (state=='X'||state=='W')
1189         {
1190             xAligned << x[xIndex++];
1191             yAligned << '.';
1192             qualAligned << '.';
1193         }
1194         else if (state=='M')
1195         {
1196             xAligned << x[xIndex++];
1197             yAligned << y[yIndex];
1198             qualAligned << qual[yIndex++];
1199         }
1200         else if (state=='D')
1201         {
1202             xAligned << x[xIndex++];
1203             yAligned << '-';
1204             qualAligned << '-';
1205         }
1206         else if (state=='I')
1207         {
1208             xAligned << '-';
1209             yAligned << y[yIndex];
1210             qualAligned << qual[yIndex++];
1211         }
1212         else if (state=='Y'||state=='Z')
1213         {
1214             xAligned << '.';
1215             yAligned << y[yIndex];
1216             qualAligned << qual[yIndex++];
1217         }
1218     }
1219 
1220     log << pad << "X    " << xAligned.str() << "E\n";
1221     log << pad << "Path " << path << "E\n";
1222     log << pad << "Y    " << yAligned.str() << "E\n";
1223     log << pad << "Qual " << qualAligned.str() << "E\n\n";
1224 
1225     log << pad << "Alignment in Probe : [" << matchStartX << "," << matchEndX<< "]\n";
1226     log << pad << "Alignment in Read  : [" << matchStartY << "," << matchEndY<< "]\n";
1227     log << pad << "# Aligned Bases : " << noBasesAligned << "\n";
1228     log << pad << "# Matched Bases : " << matchedBases << "\n";
1229     log << pad << "# Mismatched Bases : " << mismatchedBases << "\n";
1230 
1231     log << pad << "# INS in Probe : ";
1232     for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1233     {
1234         if (indelStatusInPath[i]=='I')
1235         {
1236             log << "[" << indelStartsInX[i] << "," << indelEndsInX[i] <<"] ";
1237         }
1238     }
1239     log << "\n";
1240 
1241     log << pad << "# DELS in Probe : ";
1242     for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1243     {
1244         if (indelStatusInPath[i]=='D')
1245         {
1246             log << "[" << indelStartsInX[i] << "," << indelEndsInX[i] <<"] ";
1247         }
1248     }
1249     log << "\n";
1250 
1251     log << pad << "# INS in Read : ";
1252     for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1253     {
1254         if (indelStatusInPath[i]=='I')
1255         {
1256             log << "[" << indelStartsInY[i] << "," << indelEndsInY[i] <<"] ";
1257         }
1258     }
1259     log << "\n";
1260 
1261     log << pad << "# DELS in Read : ";
1262     for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1263     {
1264         if (indelStatusInPath[i]=='D')
1265         {
1266             log << "[" << indelStartsInY[i] << "," << indelEndsInY[i] <<"] ";
1267         }
1268     }
1269     log << "\n";
1270 
1271     log << pad << "Max Log odds    : " << maxLogOdds << "\n";
1272 };
1273 
printAlignment(std::string & pad)1274 void LHMM1::printAlignment(std::string& pad)
1275 {
1276     //std::cerr << "\tAlignments\n" ;
1277 
1278     //trace back
1279     //std::cerr << "\tBefore alignment" << "\n";
1280     //std::cerr << "\tX    " << x << "\n";
1281     //std::cerr << "\tY    " << y << "\n";
1282     //std::cerr << "\n";
1283     //std::cerr << "\tAfter alignment" << "\n";
1284     //use path to align x and y
1285     std::stringstream xAligned;
1286     std::stringstream yAligned;
1287     std::stringstream qualAligned;
1288     uint32_t xIndex=0, yIndex=0;
1289     for (uint32_t i=0; i<path.size(); ++i)
1290     {
1291         char state = path.at(i);
1292         if (state=='S' || state=='E')
1293         {
1294             xAligned << state;
1295             yAligned << state;
1296             qualAligned << state;
1297         }
1298         else if (state=='X'||state=='W')
1299         {
1300             xAligned << x[xIndex++];
1301             yAligned << '.';
1302             qualAligned << '.';
1303         }
1304         else if (state=='M')
1305         {
1306             xAligned << x[xIndex++];
1307             yAligned << y[yIndex];
1308             qualAligned << qual[yIndex++];
1309         }
1310         else if (state=='D')
1311         {
1312             xAligned << x[xIndex++];
1313             yAligned << '-';
1314             qualAligned << '-';
1315         }
1316         else if (state=='I')
1317         {
1318             xAligned << '-';
1319             yAligned << y[yIndex];
1320             qualAligned << qual[yIndex++];
1321         }
1322         else if (state=='Y'||state=='Z')
1323         {
1324             xAligned << '.';
1325             yAligned << y[yIndex];
1326             qualAligned << qual[yIndex++];
1327         }
1328     }
1329 
1330     std::cerr << pad << "X    " << xAligned.str() << "E\n";
1331     std::cerr << pad << "Path " << path << "E\n";
1332     std::cerr << pad << "Y    " << yAligned.str() << "E\n";
1333     std::cerr << pad << "Qual " << qualAligned.str() << "E\n\n";
1334 
1335     std::cerr << pad << "Alignment in Probe : [" << matchStartX << "," << matchEndX<< "]\n";
1336     std::cerr << pad << "Alignment in Read  : [" << matchStartY << "," << matchEndY<< "]\n";
1337     std::cerr << pad << "# Aligned Bases : " << noBasesAligned << "\n";
1338     std::cerr << pad << "# Matched Bases : " << matchedBases << "\n";
1339     std::cerr << pad << "# Mismatched Bases : " << mismatchedBases << "\n";
1340 
1341     std::cerr << pad << "# INS in Probe : ";
1342     for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1343     {
1344         if (indelStatusInPath[i]=='I')
1345         {
1346             std::cerr << "[" << indelStartsInX[i] << "," << indelEndsInX[i] <<"] ";
1347         }
1348     }
1349     std::cerr << "\n";
1350 
1351     std::cerr << pad << "# DELS in Probe : ";
1352     for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1353     {
1354         if (indelStatusInPath[i]=='D')
1355         {
1356             std::cerr << "[" << indelStartsInX[i] << "," << indelEndsInX[i] <<"] ";
1357         }
1358     }
1359     std::cerr << "\n";
1360 
1361     std::cerr << pad << "# INS in Read : ";
1362     for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1363     {
1364         if (indelStatusInPath[i]=='I')
1365         {
1366             std::cerr << "[" << indelStartsInY[i] << "," << indelEndsInY[i] <<"] ";
1367         }
1368     }
1369     std::cerr << "\n";
1370 
1371     std::cerr << pad << "# DELS in Read : ";
1372     for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1373     {
1374         if (indelStatusInPath[i]=='D')
1375         {
1376             std::cerr << "[" << indelStartsInY[i] << "," << indelEndsInY[i] <<"] ";
1377         }
1378     }
1379     std::cerr << "\n";
1380 
1381     std::cerr << pad << "Max Log odds    : " << maxLogOdds << "\n";
1382 };
1383 
printAlignment()1384 void LHMM1::printAlignment()
1385 {
1386     std::string pad = "\t";
1387     printAlignment(pad);
1388 };
1389 
score(char a,char b)1390 double LHMM1::score(char a, char b)
1391 {
1392     if (a==b)
1393         return 0;
1394     else
1395         return -1;
1396 };
1397 
1398 /**
1399  * Checks if deletion exists in alignment.
1400  */
deletion_start_exists(uint32_t pos,uint32_t & rpos)1401 bool LHMM1::deletion_start_exists(uint32_t pos, uint32_t& rpos)
1402 {
1403     rpos = 0;
1404     for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1405     {
1406         if (indelStatusInPath[i]=='D' &&
1407             indelStartsInX[i]==pos)
1408         {
1409             rpos = indelStartsInY[i];
1410             return true;
1411         }
1412     }
1413 
1414     return false;
1415 }
1416 
1417 /**
1418  * Checks if insertion exists in alignment.
1419  */
insertion_start_exists(uint32_t pos,uint32_t & rpos)1420 bool LHMM1::insertion_start_exists(uint32_t pos, uint32_t& rpos)
1421 {
1422     rpos = 0;
1423     for (uint32_t i=0; i<indelStatusInPath.size(); ++i)
1424     {
1425         if (indelStatusInPath[i]=='I' &&
1426             indelStartsInX[i]==pos)
1427         {
1428             rpos = indelStartsInY[i];
1429             return true;
1430         }
1431     }
1432 
1433     return false;
1434 }