1 /* The MIT License
2 
3    Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "ahmm.h"
25 
26 #define MAXLEN 1024
27 #define MAXLEN_NBITS 10
28 
29 #define S       0
30 #define M       1
31 #define D       2
32 #define I       3
33 #define Z       4
34 #define E       5
35 #define N       6
36 #define TBD     7
37 #define NSTATES 6
38 
39 #define LFLANK    0
40 #define MOTIF     1
41 #define RFLANK    2
42 #define UNMODELED 3
43 #define UNCERTAIN 4
44 
45 //match type
46 #define MODEL 0
47 #define READ  1
48 #define MATCH 2
49 
50 /*for indexing single array*/
51 #define index(i,j) (((i)<<MAXLEN_NBITS)+(j))
52 
53 /*functions for getting trace back information*/
54 #define track_get_u(t)      (((t)&0xF8000000)>>27)
55 #define track_get_d(t)      (((t)&0x07000000)>>24)
56 #define track_get_c(t)      (((t)&0x00FFF000)>>12)
57 #define track_get_p(t)      (((t)&0x00000FFF))
58 #define track_get_base(t)   (model[track_get_d(t)][track_get_p(t)-1])
59 #define track_valid(t)      ((track_get_d(t)==RFLANK||track_get_d(t)==MOTIF||track_get_d(t)==LFLANK)&&track_get_p(t)!=0)
60 #define track_set_u(t,u)    (((t)&0x07FFFFFF)|((u)<<27))
61 #define track_set_d(t,d)    (((t)&0xF8FFFFFF)|((d)<<24))
62 #define track_set_c(t,c)    (((t)&0xFF000FFF)|((c)<<12))
63 #define track_set_p(t,p)    (((t)&0xFFFFF000)|(p))
64 #define make_track(u,d,c,p) (((u)<<27)|((d)<<24)|((c)<<12)|(p))
65 
66 //[N|!|0|0]
67 #define NULL_TRACK  0x7C000000
68 //[N|l|0|0]
69 #define START_TRACK 0x78000000
70 
71 /**
72  * Constructor.
73  */
AHMM(bool debug)74 AHMM::AHMM(bool debug)
75 {
76     this->debug = debug;
77     initialize();
78 };
79 
80 /**
81  * Destructor.
82  */
~AHMM()83 AHMM::~AHMM()
84 {
85     delete optimal_path;
86 
87     for (size_t state=S; state<=E; ++state)
88     {
89         delete V[state];
90         delete U[state];
91     }
92 
93     delete V;
94     delete U;
95 };
96 
97 /**
98  * Initializes objects; helper function for constructor.
99  */
initialize()100 void AHMM::initialize()
101 {
102     initialize_structures();
103     initialize_T();
104     initialize_UV();
105 };
106 
107 /**
108  * Initializes objects for constructor.
109  */
initialize_structures()110 void AHMM::initialize_structures()
111 {
112     max_len = MAXLEN;
113 
114     model = new char*[3];
115     model[LFLANK] = NULL;
116     model[MOTIF] = NULL;
117     model[RFLANK] = NULL;
118 
119     lflen = 0;
120     mlen = 0;
121 
122     motif_discordance = new int32_t[MAXLEN];
123     optimal_path = new int32_t[MAXLEN<<2];
124     optimal_path_traced = false;
125 
126     typedef int32_t (AHMM::*move) (int32_t t, int32_t j);
127     V = new float*[NSTATES];
128     U = new int32_t*[NSTATES];
129     moves = new move*[NSTATES];
130     for (size_t state=S; state<=E; ++state)
131     {
132         V[state] = new float[MAXLEN*MAXLEN];
133         U[state] = new int32_t[MAXLEN*MAXLEN];
134         moves[state] = new move[NSTATES];
135     }
136 
137     for (size_t state=S; state<=E; ++state)
138     {
139         moves[state] = new move[NSTATES];
140     }
141 
142     moves[S][M]  = &AHMM::move_S_M;
143     moves[M][M]  = &AHMM::move_M_M;
144     moves[D][M]  = &AHMM::move_D_M;
145     moves[I][M]  = &AHMM::move_I_M;
146     moves[S][D]  = &AHMM::move_S_D;
147     moves[M][D]  = &AHMM::move_M_D;
148     moves[D][D]  = &AHMM::move_D_D;
149     moves[S][I]  = &AHMM::move_S_I;
150     moves[M][I]  = &AHMM::move_M_I;
151     moves[I][I]  = &AHMM::move_I_I;
152 };
153 
154 /**
155  * Initialize transition matrix based on parameters.
156  */
initialize_T()157 void AHMM::initialize_T()
158 {
159     float delta = par.delta;
160     float epsilon = par.epsilon;
161     float tau = par.tau;
162     float eta = par.eta;
163 
164     for (size_t i=S; i<=E; ++i)
165     {
166         for (size_t j=S; j<=E; ++j)
167         {
168             T[i][j] = -INFINITY;
169         }
170     }
171 
172     T[S][M] = log10(((1-delta))/((1-eta)*(1-eta)));
173     T[M][M] = log10(((1-2*delta-tau))/((1-eta)*(1-eta)));
174     T[D][M] = log10(((1-epsilon-tau))/((1-eta)*(1-eta)));
175     T[I][M] = T[D][M];
176 
177     T[S][D] = log10(delta/(1-eta));
178     T[M][D] = log10(delta/(1-eta));
179     T[D][D] = log10(epsilon/(1-eta));;
180 
181     T[M][I] = T[M][D];
182     T[I][I] = T[D][D];
183 };
184 
185 /**
186  * Initializes U and V.
187  */
initialize_UV()188 void AHMM::initialize_UV()
189 {
190     for (size_t i=0; i<MAXLEN; ++i)
191     {
192         for (size_t j=0; j<MAXLEN; ++j)
193         {
194             size_t c = index(i,j);
195 
196             V[S][c] = -INFINITY;
197             U[S][c] = NULL_TRACK;
198 
199             V[M][c] = -INFINITY;
200             if (!i || !j)
201             {
202                 U[M][c] = make_track(N,UNMODELED,0,0);
203             }
204             else
205             {
206                 U[M][c] = make_track(TBD,UNCERTAIN,0,0);
207             }
208 
209             V[D][c] = -INFINITY;
210             if (!i || !j)
211             {
212                 U[D][c] = make_track(N,UNMODELED,0,0);
213             }
214             else
215             {
216                 U[D][c] = make_track(TBD,UNCERTAIN,0,0);
217             }
218 
219             V[I][c] = -INFINITY;
220             if (!i || !j)
221             {
222                 U[I][c] = make_track(N,UNMODELED,0,0);
223             }
224             else
225             {
226                 U[I][c] = make_track(TBD,UNCERTAIN,0,0);
227             }
228         }
229     }
230 
231     V[S][index(0,0)] = 0;
232     U[S][index(0,0)] = START_TRACK;
233 
234     V[M][index(0,0)] = -INFINITY;
235 };
236 
237 /**
238  * Sets a model.
239  */
set_model(const char * motif)240 void AHMM::set_model(const char* motif)
241 {
242     if (model[MOTIF]) free(model[MOTIF]);
243 
244     model[MOTIF] = strdup(motif);
245 
246     mlen = strlen(model[MOTIF]);
247 }
248 
249 /**
250  * Sets delta.
251  */
set_delta(float delta)252 void AHMM::set_delta(float delta)
253 {
254     par.delta = delta;
255 }
256 
257 /**
258  * Sets epsilon.
259  */
set_epsilon(float epsilon)260 void AHMM::set_epsilon(float epsilon)
261 {
262     par.epsilon = epsilon;
263 }
264 
265 /**
266  * Sets tau.
267  */
set_tau(float tau)268 void AHMM::set_tau(float tau)
269 {
270     par.tau = tau;
271 }
272 
273 /**
274  * Sets eta.
275  */
set_eta(float eta)276 void AHMM::set_eta(float eta)
277 {
278     par.eta = eta;
279 }
280 
281 /**
282  * Sets mismatch penalty.
283  */
set_mismatch_penalty(float mismatch_penalty)284 void AHMM::set_mismatch_penalty(float mismatch_penalty)
285 {
286     par.mismatch_penalty = mismatch_penalty;
287 }
288 
289 /**
290  * Sets debug.
291  */
set_debug(bool debug)292 void AHMM::set_debug(bool debug)
293 {
294     this->debug = debug;
295 }
296 
297 /**
298  * Get left flank start position for model.
299  */
get_lflank_model_spos1()300 int32_t AHMM::get_lflank_model_spos1()
301 {
302     return lflank_start[MODEL];
303 };
304 
305 /**
306  * Get left flank end position for model.
307  */
get_lflank_model_epos1()308 int32_t AHMM::get_lflank_model_epos1()
309 {
310     return lflank_end[MODEL];
311 };
312 
313 /**
314  * Get motif start position for model.
315  */
get_motif_model_spos1()316 int32_t AHMM::get_motif_model_spos1()
317 {
318     return motif_start[MODEL];
319 };
320 
321 /**
322  * Get motif end position for model.
323  */
get_motif_model_epos1()324 int32_t AHMM::get_motif_model_epos1()
325 {
326     return motif_end[MODEL];
327 };
328 
329 /**
330  * Get right flank start position for model.
331  */
get_rflank_model_spos1()332 int32_t AHMM::get_rflank_model_spos1()
333 {
334     return rflank_start[MODEL];
335 };
336 
337 /**
338  * Get right flank end position for model.
339  */
get_rflank_model_epos1()340 int32_t AHMM::get_rflank_model_epos1()
341 {
342     return rflank_end[MODEL];
343 };
344 
345 /**
346  * Get left flank start position for read.
347  */
get_lflank_read_spos1()348 int32_t AHMM::get_lflank_read_spos1()
349 {
350     return lflank_start[READ];
351 };
352 
353 /**
354  * Get left flank end position for read.
355  */
get_lflank_read_epos1()356 int32_t AHMM::get_lflank_read_epos1()
357 {
358     return lflank_end[READ];
359 };
360 
361 /**
362  * Get motif start position for read.
363  */
get_motif_read_spos1()364 int32_t AHMM::get_motif_read_spos1()
365 {
366     return motif_start[READ];
367 };
368 
369 /**
370  * Get motif end position for read.
371  */
get_motif_read_epos1()372 int32_t AHMM::get_motif_read_epos1()
373 {
374     return motif_end[READ];
375 };
376 
377 /**
378  * Get right flank start position for read.
379  */
get_rflank_read_spos1()380 int32_t AHMM::get_rflank_read_spos1()
381 {
382     return rflank_start[READ];
383 };
384 
385 /**
386  * Get right flank end position for read.
387  */
get_rflank_read_epos1()388 int32_t AHMM::get_rflank_read_epos1()
389 {
390     return rflank_end[READ];
391 };
392 
393 /**
394  * Get motif concordance.
395  */
get_motif_concordance()396 float AHMM::get_motif_concordance()
397 {
398     return motif_concordance;
399 };
400 
401 /**
402  * Get exact motif count.
403  */
get_exact_motif_count()404 uint32_t AHMM::get_exact_motif_count()
405 {
406     return exact_motif_count;
407 };
408 
409 /**
410  * Get motif count.
411  */
get_motif_count()412 uint32_t AHMM::get_motif_count()
413 {
414     return motif_count;
415 };
416 
417 /**
418  * Computes the score associated with the move from A to B
419  * Updates the max_score and associated max_track.
420  *
421  * @A      - start state
422  * @B      - end state
423  * @index1 - flattened index of the one dimensional array of start state
424  * @j      - 1 based position of read of start state
425  * @m      - base match required (MATCH, MODEL, READ)
426  */
proc_comp(int32_t A,int32_t B,int32_t index1,int32_t j,int32_t match_type)427 void AHMM::proc_comp(int32_t A, int32_t B, int32_t index1, int32_t j, int32_t match_type)
428 {
429     float emission = 0, score = 0, valid = 0;
430 
431     //t is the new track
432     int32_t  t =  (this->*(moves[A][B]))(U[A][index1],j);
433 
434     if (t==NULL_TRACK)
435     {
436         valid = -INFINITY;
437     }
438     else if (match_type==MATCH)
439     {
440         emission = log10_emission_odds(track_get_base(t), read[j], qual[j]-33);
441     }
442 
443     score = V[A][index1] + T[A][B] + emission + valid;
444 
445     if (score>max_score)
446     {
447         max_score = score;
448         max_track = t;
449     }
450 
451     if (debug)
452     {
453         std::cerr << "\t" << state2string(A) << "=>" << state2string(B);
454         std::cerr << " (" << ((index1-j)>>MAXLEN_NBITS) << "," << j << ") ";
455         std::cerr << track2string(U[A][index1]) << "=>";
456         std::cerr << track2string(t) << " ";
457         std::cerr << emission << " (e: " << (track_get_d(t)<=MOTIF?track_get_base(t):'N') << " vs " << (j!=rlen?read[j]:'N')  << ") + ";
458         std::cerr << T[A][B] << " (t) + ";
459         std::cerr << V[A][index1] << " (p) + ";
460         std::cerr << valid << " (v) = ";
461         std::cerr << score << "\n";
462     }
463 }
464 
465 /**
466  * Align read against model.
467  */
align(const char * read,const char * qual)468 void AHMM::align(const char* read, const char* qual)
469 {
470     clear_statistics();
471     optimal_path_traced = false;
472     this->read = read;
473     this->qual = qual;
474     rlen = strlen(read);
475 
476     if (rlen>=MAXLEN)
477     {
478         fprintf(stderr, "[%s:%d %s] Sequence to be aligned is greater than %d currently supported, subsetting string to first 1023 characters: %d\n", __FILE__, __LINE__, __FUNCTION__, MAXLEN, rlen);
479         rlen = 1023;
480     }
481     plen = rlen;
482 
483     float max = 0;
484     char maxPath = 'X';
485 
486     size_t c,d,u,l;
487 
488     //alignment
489     //take into consideration
490     for (size_t i=1; i<=plen; ++i)
491     {
492         //break;
493 
494         for (size_t j=1; j<=rlen; ++j)
495         {
496             c = index(i,j);
497             d = index(i-1,j-1);
498             u = index(i-1,j);
499             l = index(i,j-1);
500 
501             /////
502             //M//
503             /////
504             //only need to update this i>rflen
505             max_score = -INFINITY;
506             max_track = NULL_TRACK;
507             proc_comp(S, M, d, j-1, MATCH);
508             proc_comp(M, M, d, j-1, MATCH);
509             proc_comp(D, M, d, j-1, MATCH);
510             proc_comp(I, M, d, j-1, MATCH);
511             V[M][c] = max_score;
512             U[M][c] = max_track;
513             if (debug) std::cerr << "\tset M " << max_score << " - " << track2string(max_track) << "\n";
514 
515             /////
516             //D//
517             /////
518             max_score = -INFINITY;
519             max_track = NULL_TRACK;
520             proc_comp(S, D, u, j, MODEL);
521             proc_comp(M, D, u, j, MODEL);
522             proc_comp(D, D, u, j, MODEL);
523             V[D][c] = max_score;
524             U[D][c] = max_track;
525             if (debug) std::cerr << "\tset D " << max_score << " - " << track2string(max_track) << "\n";
526 
527             /////
528             //I//
529             /////
530             max_score = -INFINITY;
531             max_track = NULL_TRACK;
532             proc_comp(S, I, l, j-1, READ);
533             proc_comp(M, I, l, j-1, READ);
534             proc_comp(I, I, l, j-1, READ);
535             V[I][c] = max_score;
536             U[I][c] = max_track;
537             if (debug) std::cerr << "\tset I " << max_score << " - " << track2string(max_track) << "\n";
538         }
539     }
540 
541     if (debug)
542     {
543         std::cerr << "\n   =V[S]=\n";
544         print(V[S], plen+1, rlen+1);
545         std::cerr << "\n   =U[S]=\n";
546         print_U(U[S], plen+1, rlen+1);
547 
548         std::cerr << "\n   =V[M]=\n";
549         print(V[M], plen+1, rlen+1);
550         std::cerr << "\n   =U[M]=\n";
551         print_U(U[M], plen+1, rlen+1);
552         std::cerr << "\n   =V[D]=\n";
553         print(V[D], plen+1, rlen+1);
554         std::cerr << "\n   =U[D]=\n";
555         print_U(U[D], plen+1, rlen+1);
556         std::cerr << "\n   =V[I]=\n";
557         print(V[I], plen+1, rlen+1);
558         std::cerr << "\n   =U[I]=\n";
559         print_U(U[I], plen+1, rlen+1);
560 
561         std::cerr << "\n";
562         std::cerr << "\n";
563 
564         print_T();
565     }
566 
567     trace_path();
568 
569     exact_motif_count = motif_count;
570     motif_concordance = 0;
571     for (int32_t k=1; k<=motif_count; ++k)
572     {
573         if (motif_discordance[k])
574         {
575             --exact_motif_count;
576         }
577 
578         if (mlen>=motif_discordance[k])
579         {
580             motif_concordance += (float)(mlen-motif_discordance[k]) / mlen;
581         }
582     }
583     motif_concordance /= motif_count;
584 
585 
586 
587 };
588 
589 /**
590  * Trace path after alignment.
591  */
trace_path()592 void AHMM::trace_path()
593 {
594     //search for a complete path in M
595     size_t c;
596     optimal_score = -INFINITY;
597     optimal_track = NULL_TRACK;
598     optimal_state = TBD;
599     optimal_probe_len = 0;
600     trf_score = 0;
601     for (size_t i=lflen; i<=plen; ++i)
602     {
603         c = index(i,rlen);
604 
605         if (V[M][c]>=optimal_score)
606         {
607             optimal_score = V[M][c];
608             optimal_track = U[M][c];
609             optimal_state = M;
610             optimal_probe_len = i;
611         }
612 
613         if (V[D][c]>=optimal_score)
614         {
615             optimal_score = V[D][c];
616             optimal_track = U[D][c];
617             optimal_state = D;
618             optimal_probe_len = i;
619         }
620 
621 //        if (V[I][c]>=optimal_score)
622 //        {
623 //            optimal_score = V[I][c];
624 //            optimal_track = U[I][c];
625 //            optimal_state = I;
626 //            optimal_probe_len = i;
627 //        }
628     }
629 
630     //trace path
631     optimal_path_ptr = optimal_path+(MAXLEN<<2)-1;
632     int32_t i = optimal_probe_len, j = rlen;
633     int32_t last_t = make_track(optimal_state, UNMODELED, 0, 0); //dummy end track for E
634     optimal_path_len = 0;
635     int32_t u;
636     int32_t des_t, src_t = make_track(E, UNMODELED, 0, 0);
637 
638     do
639     {
640         u = track_get_u(last_t);
641         last_t = U[u][index(i,j)];
642         *optimal_path_ptr = track_set_u(last_t, u);
643 
644         des_t = *optimal_path_ptr;
645         collect_statistics(src_t, des_t, j);
646         if (debug) std::cerr << track2string(src_t) << " (" << i << "," << j << ") => " << track2string(des_t) << " :  " << track2string(last_t) << "\n";
647         src_t = des_t;
648 
649         if (u==M)
650         {
651             --i; --j;
652         }
653         else if (u==D)
654         {
655             --i;
656         }
657         else if (u==I || u==Z)
658         {
659             --j;
660         }
661 
662         --optimal_path_ptr;
663         ++optimal_path_len;
664 
665     } while (track_get_u(last_t)!=S);
666 
667     collect_statistics(src_t, last_t, j);
668 
669     ++optimal_path_ptr;
670     optimal_path_traced = true;
671 };
672 
673 /**
674  * Collect alignment summary statistics.
675  */
collect_statistics(int32_t src_t,int32_t des_t,int32_t j)676 void AHMM::collect_statistics(int32_t src_t, int32_t des_t, int32_t j)
677 {
678     //std::cerr << "\t " << track2string(src_t) << " (" << j << ") => " << track2string(des_t) << "\n";
679 
680     int32_t src_u = track_get_u(src_t);
681     int32_t des_u = track_get_u(des_t);
682 
683     if (src_u==E)
684     {
685         if (des_u==M || des_u==D || des_u==I)
686         {
687             rflank_start[MODEL] = NAN;
688             rflank_start[READ] = NAN;
689             rflank_end[MODEL] = INFINITY;
690             rflank_end[READ] = INFINITY;
691 
692             motif_end[MODEL] = track_get_c(des_t);
693             motif_count = track_get_c(des_t);
694             last_motif_pos = track_get_p(des_t);
695             motif_end[READ] = j;
696 
697             //initialize array for tracking inexact repeats
698             for (int32_t k=1; k<=motif_count; ++k)
699             {
700                 motif_discordance[k] = 0;
701             }
702 
703             if (des_u==D || track_get_base(des_t)!=read[j-1])
704             {
705                 ++motif_discordance[motif_count];
706             }
707         }
708     }
709 
710     if (des_u==M)
711     {
712         if (track_get_base(des_t)!=read[j-1])
713         {
714             trf_score -= 7;
715             ++motif_discordance[track_get_c(des_t)];
716         }
717         else
718         {
719             trf_score += 2;
720         }
721     }
722 
723     if (des_u==D || des_u==I)
724     {
725         trf_score -= 7;
726         ++motif_discordance[track_get_c(des_t)];
727     }
728 
729     frac_no_repeats = motif_count - (mlen-last_motif_pos)/((float)mlen);
730 };
731 
732 /**
733  * Clear alignment statistics.
734  */
clear_statistics()735 void AHMM::clear_statistics()
736 {
737     lflank_start[MODEL] = NAN;
738     lflank_start[READ] = NAN;
739     lflank_end[MODEL] = NAN;
740     lflank_end[READ] = NAN;
741     motif_start[MODEL] = NAN;
742     motif_start[READ] = NAN;
743     motif_end[MODEL] = NAN;
744     motif_end[READ] = NAN;
745     motif_count = NAN;
746     exact_motif_count = NAN;
747     motif_m = NAN;
748     motif_xid = NAN;
749     motif_concordance = NAN;
750     rflank_start[MODEL] = NAN;
751     rflank_start[READ] = NAN;
752     rflank_end[MODEL] = NAN;
753     rflank_end[READ] = NAN;
754 }
755 
756 /**
757  * Update alignment statistics after collection.
758  */
update_statistics()759 void AHMM::update_statistics()
760 {
761     motif_concordance = (float)motif_m/(motif_m+motif_xid);
762 }
763 
764 /**
765  * Returns true if flanks are mapped.
766  */
flanks_are_mapped()767 bool AHMM::flanks_are_mapped()
768 {
769     return lflank_end[MODEL]==lflen;
770 }
771 
772 
773 /**
774  * Compute log10 emission odds based on equal error probability distribution.
775  */
log10_emission_odds(char probe_base,char read_base,uint32_t pl,float mismatch_penalty)776 float AHMM::log10_emission_odds(char probe_base, char read_base, uint32_t pl, float mismatch_penalty)
777 {
778 //    if (read_base=='N' || probe_base=='N')
779 //    {
780 //        return -INFINITY;  //is this appropriate in this case?
781 //    }
782 
783     if (read_base!=probe_base)
784     {
785         return LogTool::pl2log10_varp(pl);
786     }
787     else //match
788     {
789         return -(LogTool::pl2log10_varp(pl)-mismatch_penalty);
790     }
791 };
792 
793 /**
794  * Compute log10 emission odds based on equal error probability distribution.
795  */
log10_emission_odds(char probe_base,char read_base,uint32_t pl)796 float AHMM::log10_emission_odds(char probe_base, char read_base, uint32_t pl)
797 {
798 //    if (read_base=='N' || probe_base=='N')
799 //    {
800 //        return -INFINITY;  //is this appropriate in this case?
801 //    }
802 
803     if (read_base!=probe_base)
804     {
805         return LogTool::pl2log10_varp(pl)-par.mismatch_penalty;
806     }
807     else //match
808     {
809         return -(LogTool::pl2log10_varp(pl));
810     }
811 };
812 
813 /**
814  * Converts state to string representation.
815  */
state2string(int32_t state)816 std::string AHMM::state2string(int32_t state)
817 {
818     if (state==S)
819     {
820         return "S";
821     }
822     else if (state==M)
823     {
824         return "M";
825     }
826     else if (state==D)
827     {
828         return "D";
829     }
830     else if (state==I)
831     {
832         return "I";
833     }
834     else if (state==E)
835     {
836         return "E";
837     }
838     else if (state==N)
839     {
840         return "N";
841     }
842     else if (state==TBD)
843     {
844         return "*";
845     }
846     else
847     {
848         return "!";
849     }
850 }
851 
852 /**
853  * Converts state to cigar string representation.
854  */
state2cigarstring(int32_t state)855 std::string AHMM::state2cigarstring(int32_t state)
856 {
857     if (state==S)
858     {
859         return "S";
860     }
861     else if (state==M)
862     {
863         return "M";
864     }
865     else if (state==D)
866     {
867         return "D";
868     }
869     else if (state==I)
870     {
871         return "I";
872     }
873     else if (state==E)
874     {
875         return "E";
876     }
877     else if (state==N)
878     {
879         return "N";
880     }
881     else if (state==TBD)
882     {
883         return "*";
884     }
885     else
886     {
887         return "!";
888     }
889 }
890 
891 /**
892  * Converts state to cigar string representation.
893  */
track2cigarstring1(int32_t t,int32_t j)894 std::string AHMM::track2cigarstring1(int32_t t, int32_t j)
895 {
896     int32_t state = track_get_u(t);
897 
898     if (state==S)
899     {
900         return "S";
901     }
902     else if (state==M)
903     {
904         if (track_get_base(t)==read[j-1])
905         {
906             return "M";
907         }
908         else
909         {
910             return "*";
911         }
912     }
913     else if (state==D)
914     {
915         return "D";
916     }
917     else if (state==I)
918     {
919         return "I";
920     }
921     else if (state==Z)
922     {
923         return "Z";
924     }
925     else if (state==E)
926     {
927         return "E";
928     }
929     else if (state==N)
930     {
931         return "N";
932     }
933     else if (state==TBD)
934     {
935         return "*";
936     }
937     else
938     {
939         return "!";
940     }
941 }
942 
943 /**
944  * Converts track to cigar string representation.
945  */
track2cigarstring2(int32_t t)946 std::string AHMM::track2cigarstring2(int32_t t)
947 {
948     int32_t state = track_get_u(t);
949 
950     if (state==M)
951     {
952         return (track_get_c(t)%2==0?"+":"o");
953     }
954     else if (state==D)
955     {
956         return (track_get_c(t)%2==0?"+":"o");
957     }
958     else if (state==I)
959     {
960         return (track_get_c(t)%2==0?"+":"o");
961     }
962     else
963     {
964         return " ";
965     }
966 }
967 
968 /**
969  * Converts model component to string representation.
970  */
component2string(int32_t component)971 std::string AHMM::component2string(int32_t component)
972 {
973     if (component==LFLANK)
974     {
975         return "l";
976     }
977     else if (component==MOTIF)
978     {
979         return "m";
980     }
981     else if (component==RFLANK)
982     {
983         return "r";
984     }
985     else if (component==UNMODELED)
986     {
987         return "!";
988     }
989     else if (component==READ)
990     {
991         return "s";
992     }
993     else if (component==UNCERTAIN)
994     {
995         return "?";
996     }
997     else
998     {
999         return "!";
1000     }
1001 }
1002 
1003 /**
1004  * Prints an alignment.
1005  */
print_alignment()1006 void AHMM::print_alignment()
1007 {
1008     std::string pad = "\t";
1009     print_alignment(pad);
1010 };
1011 
1012 /**
1013  * Prints an alignment with padding.
1014  */
print_alignment(std::string & pad)1015 void AHMM::print_alignment(std::string& pad)
1016 {
1017     if (!optimal_path_traced)
1018     {
1019         std::cerr << "path not traced\n";
1020     }
1021 
1022     std::cerr << "=================================\n";
1023     std::cerr << "AHMM\n";
1024     std::cerr << "*********************************\n";
1025     std::cerr << "repeat motif : " << model[MOTIF] << "\n";
1026     std::cerr << "lflen        : " << lflen << "\n";
1027     std::cerr << "mlen         : " << mlen << "\n";
1028     std::cerr << "plen         : " << plen << "\n";
1029     std::cerr << "\n";
1030     std::cerr << "read         : " << read << "\n";
1031     std::cerr << "rlen         : " << rlen << "\n";
1032     std::cerr << "\n";
1033     std::cerr << "optimal score: " << optimal_score << "\n";
1034     std::cerr << "optimal state: " << state2string(optimal_state) << "\n";
1035     std::cerr << "optimal track: " << track2string(optimal_track) << "\n";
1036     std::cerr << "optimal probe len: " << optimal_probe_len << "\n";
1037     std::cerr << "optimal path length : " << optimal_path_len << "\n";
1038     std::cerr << "max j: " << rlen << "\n";
1039     std::cerr << "mismatch penalty: " << par.mismatch_penalty << "\n";
1040     std::cerr << "\n";
1041 
1042     std::cerr << "model: " << "(" << lflank_start[MODEL] << "~" << lflank_end[MODEL] << ") "
1043                           << "[" << motif_start[MODEL] << "~" << motif_end[MODEL] << "]\n";
1044     std::cerr << "read : " << "(" << lflank_start[READ] << "~" << lflank_end[READ] << ") "
1045                           << "[" << motif_start[READ] << "~" << motif_end[READ] << "]"
1046                           << "[" << rflank_start[READ] << "~" << rflank_end[READ] << "]\n";
1047     std::cerr << "\n";
1048     std::cerr << "motif #                     : " << motif_count << " [" << motif_start[READ] << "," << motif_end[READ] << "]\n";
1049     std::cerr << "motif concordance           : " << motif_concordance << "% (" << exact_motif_count << "/" << motif_count << ")\n";
1050     std::cerr << "last motif position         : " << last_motif_pos << "\n";
1051     std::cerr << "motif discordance           : ";
1052     for (int32_t k=1; k<=motif_count; ++k)
1053     {
1054         std::cerr << motif_discordance[k] << (k==motif_count?"\n":"|");
1055     }
1056     std::cerr << "fractional no. repeat units : " << frac_no_repeats << "\n";
1057     std::cerr << "repeat tract length         : " << rlen << "\n";
1058     std::cerr << "TRF Score                   : " << trf_score << "\n";
1059 
1060     std::cerr << "\n";
1061 
1062     //print path
1063     int32_t* path;
1064     path = optimal_path_ptr;
1065     std::cerr << "Model:  ";
1066     int32_t t = NULL_TRACK;
1067     int32_t j = 0;
1068     while (path<optimal_path+(MAXLEN<<2))
1069     {
1070         int32_t u = track_get_u(*path);
1071         if (u==M || u==D)
1072         {
1073             std::cerr << track_get_base(*path);
1074         }
1075         else
1076         {
1077             std::cerr << '-';
1078         }
1079         ++path;
1080     }
1081     std::cerr << " \n";
1082 
1083     std::cerr << "       S";
1084     path = optimal_path_ptr;
1085     j=1;
1086     while (path<optimal_path+(MAXLEN<<2))
1087     {
1088         std::cerr << track2cigarstring1(*path,j);
1089         int32_t u = track_get_u(*path);
1090         if (u==M || u==I || u==Z)
1091         {
1092             ++j;
1093         }
1094         ++path;
1095     }
1096     std::cerr << "E\n";
1097 
1098     path = optimal_path_ptr;
1099     std::cerr << "        ";
1100     while (path<optimal_path+(MAXLEN<<2))
1101     {
1102         std::cerr << track2cigarstring2(*path);
1103         ++path;
1104     }
1105     std::cerr << " \n";
1106 
1107     path = optimal_path_ptr;
1108     j=1;
1109     std::cerr << "Read:   ";
1110     while (path<optimal_path+(MAXLEN<<2))
1111     {
1112         int32_t u = track_get_u(*path);
1113         if (u==M || u==I || u==Z)
1114         {
1115             std::cerr << read[j-1];
1116             ++j;
1117         }
1118         else
1119         {
1120             std::cerr << '-';
1121         }
1122         ++path;
1123     }
1124     std::cerr << " \n";
1125     std::cerr << "=================================\n";
1126 };
1127 
1128 /**
1129  * Prints a float matrix.
1130  */
print(float * v,size_t plen,size_t rlen)1131 void AHMM::print(float *v, size_t plen, size_t rlen)
1132 {
1133     float val;
1134     std::cerr << std::setprecision(1) << std::fixed;
1135     for (size_t i=0; i<plen; ++i)
1136     {
1137         for (size_t j=0; j<rlen; ++j)
1138         {
1139             val =  v[index(i,j)];
1140             std::cerr << (val<0?"  ":"   ") << val;
1141         }
1142 
1143         std::cerr << "\n";
1144     }
1145 };
1146 
1147 /**
1148  * Prints a char matrix.
1149  */
print(int32_t * v,size_t plen,size_t rlen)1150 void AHMM::print(int32_t *v, size_t plen, size_t rlen)
1151 {
1152     float val;
1153     std::cerr << std::setprecision(1) << std::fixed << std::setw(6);
1154     for (size_t i=0; i<plen; ++i)
1155     {
1156         for (size_t j=0; j<rlen; ++j)
1157         {
1158           val =  v[index(i,j)];
1159           std::cerr << (val<0?"  ":"   ") << val;
1160         }
1161 
1162         std::cerr << "\n";
1163     }
1164 };
1165 
1166 /**
1167  * Prints the transition matrix.
1168  */
print_T()1169 void AHMM::print_T()
1170 {
1171     std::cerr << "\t";
1172     for (size_t j=S; j<=Z; ++j)
1173     {
1174         std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << state2string(j);
1175     }
1176     std::cerr << "\n";
1177 
1178     for (size_t i=S; i<=Z; ++i)
1179     {
1180         std::cerr << "\t";
1181         for (size_t j=S; j<=Z; ++j)
1182         {
1183             if (j)
1184             {
1185                 std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1186             }
1187             else
1188             {
1189                 std::cerr << state2string(i) << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1190             }
1191         }
1192         std::cerr << "\n";
1193     }
1194     std::cerr << "\n";
1195 };
1196 
1197 /**
1198  * Prints U.
1199  */
print_U(int32_t * U,size_t plen,size_t rlen)1200 void AHMM::print_U(int32_t *U, size_t plen, size_t rlen)
1201 {
1202     std::cerr << std::setprecision(1) << std::fixed;
1203     std::string state;
1204     for (size_t i=0; i<plen; ++i)
1205     {
1206         for (size_t j=0; j<rlen; ++j)
1207         {
1208             int32_t t = U[index(i,j)];
1209             state = state2string(track_get_u(t));
1210             std::cerr << (state.size()==1 ? "   " : "  ")
1211                       << state << "|"
1212                       << component2string(track_get_d(t)) << "|"
1213                       << track_get_c(t) << "|"
1214                       << track_get_p(t) << (j==rlen-1?"\n":"   ");
1215         }
1216     }
1217 };
1218 
1219 /**
1220  * Prints U and V.
1221  */
print_trace(int32_t state,size_t plen,size_t rlen)1222 void AHMM::print_trace(int32_t state, size_t plen, size_t rlen)
1223 {
1224     std::cerr << std::setprecision(1) << std::fixed;
1225     int32_t *u = U[state];
1226     float *v = V[state];
1227     std::string s;
1228     for (size_t i=0; i<plen; ++i)
1229     {
1230         for (size_t j=0; j<rlen; ++j)
1231         {
1232             int32_t t = u[index(i,j)];
1233             s = state2string(track_get_u(t));
1234             std::cerr << (s.size()==1 ? "   " : "  ")
1235                       << s << "|"
1236                       << component2string(track_get_d(t)) << "|"
1237                       << track_get_c(t) << "|"
1238                       << track_get_p(t) << "|"
1239                       << v[index(i,j)];
1240         }
1241 
1242         std::cerr << "\n";
1243     }
1244 };
1245 
1246 /**
1247  * Returns a string representation of track.
1248  */
track2string(int32_t t)1249 std::string AHMM::track2string(int32_t t)
1250 {
1251     kstring_t str = {0,0,0};
1252 
1253     kputs(state2string(track_get_u(t)).c_str(), &str);
1254     kputc('|', &str);
1255     kputs(component2string(track_get_d(t)).c_str(), &str);
1256     kputc('|', &str);
1257     kputw(track_get_c(t), &str);
1258     kputc('|', &str);
1259     kputw(track_get_p(t), &str);
1260 
1261     std::string s(str.s);
1262     if (str.m) free(str.s);
1263 
1264     return s;
1265 }
1266 
1267 /**
1268  * Prints track.
1269  */
print_track(int32_t t)1270 void AHMM::print_track(int32_t t)
1271 {
1272     std::cerr << track2string(t) << "\n";
1273 }
1274 
1275 #undef MAXLEN
1276 #undef MAXLEN_NBITS
1277 #undef S
1278 #undef ML
1279 #undef M
1280 #undef I
1281 #undef D
1282 #undef Z
1283 #undef E
1284 #undef N
1285 #undef TBD
1286 #undef NSTATES
1287 
1288 #undef LFLANK
1289 #undef MOTIF
1290 #undef RFLANK
1291 #undef UNMODELED
1292 #undef UNCERTAIN
1293 
1294 #undef READ
1295 #undef MODEL
1296 #undef MATCH
1297 
1298 #undef index
1299 #undef track_get_u
1300 #undef track_get_d
1301 #undef track_get_d
1302 #undef track_get_c
1303 #undef track_get_p
1304 #undef track_get_base
1305 #undef track_valid
1306 #undef track_set_u
1307 #undef track_set_d
1308 #undef track_set_c
1309 #undef track_set_p
1310 #undef make_track
1311 
1312 #undef NULL_TRACK
1313 #undef START_TRACK