1 /* The MIT License
2 
3    Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "rfhmm.h"
25 
26 #define MAXLEN 1024
27 #define MAXLEN_NBITS 10
28 
29 //model states
30 #define S       0
31 #define Y       1
32 #define M       2
33 #define D       3
34 #define I       4
35 #define MR      5
36 #define E       6
37 #define N       7
38 #define TBD     8
39 
40 #define NSTATES 7
41 
42 //model components
43 #define LFLANK    0
44 #define MOTIF     1
45 #define RFLANK    2
46 #define UNMODELED 3
47 #define UNCERTAIN 4
48 
49 //match type
50 #define MODEL 0
51 #define READ  1
52 #define MATCH 2
53 
54 /*for indexing single array*/
55 #define index(i,j) (((i)<<MAXLEN_NBITS)+(j))
56 
57 /*functions for getting trace back information*/
58 #define track_get_u(t)      (((t)&0xF8000000)>>27)
59 #define track_get_d(t)      (((t)&0x07000000)>>24)
60 #define track_get_c(t)      (((t)&0x00FFF000)>>12)
61 #define track_get_p(t)      (((t)&0x00000FFF))
62 #define track_get_base(t)   (model[track_get_d(t)][track_get_p(t)-1])
63 #define track_valid(t)      ((track_get_d(t)==RFLANK||track_get_d(t)==MOTIF||track_get_d(t)==LFLANK)&&track_get_p(t)!=0)
64 #define track_set_u(t,u)    (((t)&0x07FFFFFF)|((u)<<27))
65 #define track_set_d(t,d)    (((t)&0xF8FFFFFF)|((d)<<24))
66 #define track_set_c(t,c)    (((t)&0xFF000FFF)|((c)<<12))
67 #define track_set_p(t,p)    (((t)&0xFFFFF000)|(p))
68 #define make_track(u,d,c,p) (((u)<<27)|((d)<<24)|((c)<<12)|(p))
69 
70 //[N|!|0|0]
71 #define NULL_TRACK  0x7C000000
72 //[N|l|0|0]
73 #define START_TRACK 0x78000000
74 
75 /**
76  * Constructor.
77  */
RFHMM(bool debug)78 RFHMM::RFHMM(bool debug)
79 {
80     this-> debug = debug;
81     initialize();
82 };
83 
84 /**
85  * Destructor.
86  */
~RFHMM()87 RFHMM::~RFHMM()
88 {
89     delete optimal_path;
90 
91     //the best alignment V_ for subsequence (i,j)
92     for (size_t state=S; state<=E; ++state)
93     {
94         delete V[state];
95         delete U[state];
96     }
97 
98     delete V;
99     delete U;
100 };
101 
102 /**
103  * Initializes objects; helper function for constructor.
104  */
initialize()105 void RFHMM::initialize()
106 {
107     initialize_structures();
108     initialize_T();
109     initialize_UV();
110 };
111 
112 /**
113  * Initializes objects for constructor.
114  */
initialize_structures()115 void RFHMM::initialize_structures()
116 {
117     model = new char*[3];
118     model[LFLANK] = NULL;
119     model[MOTIF] = NULL;
120     model[RFLANK] = NULL;
121 
122     rflen = 0;
123     mlen = 0;
124 
125     motif_discordance = new int32_t[MAXLEN];
126     optimal_path = new int32_t[MAXLEN<<2];
127     optimal_path_traced = false;
128 
129     typedef int32_t (RFHMM::*move) (int32_t t, int32_t j);
130     V = new float*[NSTATES];
131     U = new int32_t*[NSTATES];
132     moves = new move*[NSTATES];
133     for (size_t state=S; state<=E; ++state)
134     {
135         V[state] = new float[MAXLEN*MAXLEN];
136         U[state] = new int32_t[MAXLEN*MAXLEN];
137         moves[state] = new move[NSTATES];
138     }
139 
140     for (size_t state=S; state<=E; ++state)
141     {
142         moves[state] = new move[NSTATES];
143     }
144 
145     moves[S][Y] = &RFHMM::move_S_Y;
146     moves[Y][Y] = &RFHMM::move_Y_Y;
147 
148     moves[S][M] = &RFHMM::move_S_M;
149     moves[Y][M] = &RFHMM::move_Y_M;
150     moves[M][M] = &RFHMM::move_M_M;
151     moves[D][M] = &RFHMM::move_D_M;
152     moves[I][M] = &RFHMM::move_I_M;
153 
154     moves[S][D] = &RFHMM::move_S_D;
155     moves[Y][D] = &RFHMM::move_Y_D;
156     moves[M][D] = &RFHMM::move_M_D;
157     moves[D][D] = &RFHMM::move_D_D;
158 
159     moves[S][I] = &RFHMM::move_S_I;
160     moves[Y][I] = &RFHMM::move_Y_I;
161     moves[M][I] = &RFHMM::move_M_I;
162     moves[I][I] = &RFHMM::move_I_I;
163 
164     moves[S][MR] = &RFHMM::move_S_MR;
165     moves[Y][MR] = &RFHMM::move_Y_MR;
166     moves[M][MR] = &RFHMM::move_M_MR;
167     moves[D][MR] = &RFHMM::move_D_MR;
168     moves[I][MR] = &RFHMM::move_I_MR;
169     moves[MR][MR] = &RFHMM::move_MR_MR;
170 };
171 
172 /**
173  * Initialize transition matrix based on parameters.
174  */
initialize_T()175 void RFHMM::initialize_T()
176 {
177     float delta = par.delta;
178     float epsilon = par.epsilon;
179     float tau = par.tau;
180     float eta = par.eta;
181 
182     for (size_t i=S; i<=E; ++i)
183     {
184         for (size_t j=S; j<=E; ++j)
185         {
186             T[i][j] = -INFINITY;
187         }
188     }
189 
190     T[S][Y] = 0;
191     T[Y][Y] = 0;
192 
193     T[S][M] = log10((tau*(1-2*delta-tau))/(tau*eta*(1-eta)*(1-eta)));
194     T[Y][M] = T[S][M];
195     T[M][M] = log10(((1-2*delta-tau))/((1-eta)*(1-eta)));
196     T[D][M] = log10(((1-epsilon-tau))/((1-eta)*(1-eta)));
197     T[I][M] = T[I][M];
198 
199     T[S][D] = log10((tau*delta)/(eta*(1-eta)));
200     T[Y][D] = T[S][D];
201     T[M][D] = log10(delta/(1-eta));
202     T[D][D] = log10(epsilon/(1-eta));
203 
204     T[S][I] = log10((tau*delta)/(eta*(1-eta)));
205     T[Y][I] = T[S][I];
206     T[M][I] = log10(delta/(1-eta));
207     T[I][I] = log10(epsilon/(1-eta));
208 
209     T[S][MR] = log10((tau*tau*(1-tau))/(tau*eta*eta*eta*(1-eta)*(1-eta)));
210     T[Y][MR] = log10((eta*tau)/(eta*eta*eta));
211     T[M][MR] = log10((tau*(1-tau))/(eta*eta*(1-eta)*(1-eta)));
212     T[D][MR] = T[M][MR];
213     T[I][MR] = T[D][MR];
214     T[MR][MR] = log10(((1-2*delta-tau))/((1-eta)*(1-eta)));
215 };
216 
217 /**
218  * Initializes U and V.
219  */
initialize_UV()220 void RFHMM::initialize_UV()
221 {
222     int32_t t=0;
223     for (size_t i=0; i<MAXLEN; ++i)
224     {
225         for (size_t j=0; j<MAXLEN; ++j)
226         {
227             size_t c = index(i,j);
228 
229             V[S][c] = -INFINITY;
230             U[S][c] = NULL_TRACK;
231 
232             //Y
233             V[Y][c] = 0;
234             if (i>=1)
235             {
236                 V[Y][c] = -INFINITY;
237                 U[Y][c] = NULL_TRACK;
238             }
239             else
240             {
241                 if (j==1)
242                 {
243                     U[Y][c] = make_track(S,LFLANK,0,j);
244                 }
245                 else
246                 {
247                     U[Y][c] =  make_track(Y,LFLANK,0,j);
248                 }
249             }
250 
251             //M
252             V[M][c] = -INFINITY;
253             if (!i || !j)
254             {
255                 U[M][c] = make_track(N,UNMODELED,0,0);
256             }
257             else
258             {
259                 U[M][c] = make_track(TBD,UNCERTAIN,0,0);
260             }
261 
262             //D
263             V[D][c] = -INFINITY;
264             if (!i || !j)
265             {
266                 U[D][c] = make_track(N,UNMODELED,0,0);
267             }
268             else
269             {
270                 U[D][c] = make_track(TBD,UNCERTAIN,0,0);
271             }
272 
273             //I
274             V[I][c] = -INFINITY;
275             if (!i || !j)
276             {
277                 U[I][c] = make_track(N,UNMODELED,0,0);
278             }
279             else
280             {
281                 U[I][c] = make_track(TBD,UNCERTAIN,0,0);
282             }
283 
284             //MR
285             V[MR][c] = -INFINITY;
286             if (!i || !j)
287             {
288                 U[MR][c] = make_track(N,UNMODELED,0,0);
289             }
290             else
291             {
292                 U[MR][c] = make_track(TBD,UNCERTAIN,0,0);
293             }
294         }
295     }
296 
297     V[S][index(0,0)] = 0;
298     U[S][index(0,0)] = START_TRACK;
299 
300     V[M][index(0,0)] = -INFINITY;
301     V[MR][index(0,0)] = -INFINITY;
302 };
303 
304 /**
305  * Sets a model.
306  */
set_model(const char * motif,const char * rflank)307 void RFHMM::set_model(const char* motif, const char* rflank)
308 {
309     if (model[LFLANK]) free(model[LFLANK]);
310     if (model[MOTIF]) free(model[MOTIF]);
311     if (model[RFLANK]) free(model[RFLANK]);
312 
313     model[MOTIF] = strdup(motif);
314     model[RFLANK] = strdup(rflank);
315 
316     mlen = strlen(model[MOTIF]);
317     rflen = strlen(model[RFLANK]);
318 }
319 
320 /**
321  * Sets delta.
322  */
set_delta(float delta)323 void RFHMM::set_delta(float delta)
324 {
325     par.delta = delta;
326 }
327 
328 /**
329  * Sets epsilon.
330  */
set_epsilon(float epsilon)331 void RFHMM::set_epsilon(float epsilon)
332 {
333     par.epsilon = epsilon;
334 }
335 
336 /**
337  * Sets tau.
338  */
set_tau(float tau)339 void RFHMM::set_tau(float tau)
340 {
341     par.tau = tau;
342 }
343 
344 /**
345  * Sets eta.
346  */
set_eta(float eta)347 void RFHMM::set_eta(float eta)
348 {
349     par.eta = eta;
350 }
351 
352 /**
353  * Sets mismatch penalty.
354  */
set_mismatch_penalty(float mismatch_penalty)355 void RFHMM::set_mismatch_penalty(float mismatch_penalty)
356 {
357     par.mismatch_penalty = mismatch_penalty;
358 }
359 
360 /**
361  * Sets debug.
362  */
set_debug(bool debug)363 void RFHMM::set_debug(bool debug)
364 {
365     this->debug = debug;
366 }
367 
368 
369 /**
370  * Get left flank start position for model.
371  */
get_lflank_model_spos1()372 int32_t RFHMM::get_lflank_model_spos1()
373 {
374     return lflank_start[MODEL];
375 };
376 
377 /**
378  * Get left flank end position for model.
379  */
get_lflank_model_epos1()380 int32_t RFHMM::get_lflank_model_epos1()
381 {
382     return lflank_end[MODEL];
383 };
384 
385 /**
386  * Get motif start position for model.
387  */
get_motif_model_spos1()388 int32_t RFHMM::get_motif_model_spos1()
389 {
390     return motif_start[MODEL];
391 };
392 
393 /**
394  * Get motif end position for model.
395  */
get_motif_model_epos1()396 int32_t RFHMM::get_motif_model_epos1()
397 {
398     return motif_end[MODEL];
399 };
400 
401 /**
402  * Get right flank start position for model.
403  */
get_rflank_model_spos1()404 int32_t RFHMM::get_rflank_model_spos1()
405 {
406     return rflank_start[MODEL];
407 };
408 
409 /**
410  * Get right flank end position for model.
411  */
get_rflank_model_epos1()412 int32_t RFHMM::get_rflank_model_epos1()
413 {
414     return rflank_end[MODEL];
415 };
416 
417 /**
418  * Get left flank start position for read.
419  */
get_lflank_read_spos1()420 int32_t RFHMM::get_lflank_read_spos1()
421 {
422     return lflank_start[READ];
423 };
424 
425 /**
426  * Get left flank end position for read.
427  */
get_lflank_read_epos1()428 int32_t RFHMM::get_lflank_read_epos1()
429 {
430     return lflank_end[READ];
431 };
432 
433 /**
434  * Get motif start position for read.
435  */
get_motif_read_spos1()436 int32_t RFHMM::get_motif_read_spos1()
437 {
438     return motif_start[READ];
439 };
440 
441 /**
442  * Get motif end position for read.
443  */
get_motif_read_epos1()444 int32_t RFHMM::get_motif_read_epos1()
445 {
446     return motif_end[READ];
447 };
448 
449 /**
450  * Get right flank start position for read.
451  */
get_rflank_read_spos1()452 int32_t RFHMM::get_rflank_read_spos1()
453 {
454     return rflank_start[READ];
455 };
456 
457 /**
458  * Get right flank end position for read.
459  */
get_rflank_read_epos1()460 int32_t RFHMM::get_rflank_read_epos1()
461 {
462     return rflank_end[READ];
463 };
464 
465 /**
466  * Computes the score associated with the move from A to B
467  * Updates the max_score and associated max_track.
468  *
469  * @A      - start state
470  * @B      - end state
471  * @index1 - flattened index of the one dimensional array of start state
472  * @j      - 1 based position of read of start state
473  * @m      - base match required (MATCH, MODEL, READ)
474  */
proc_comp(int32_t A,int32_t B,int32_t index1,int32_t j,int32_t match_type)475 void RFHMM::proc_comp(int32_t A, int32_t B, int32_t index1, int32_t j, int32_t match_type)
476 {
477     float emission = 0, score = 0, valid = 0;
478 
479     //t is the new track
480     int32_t  t =  (this->*(moves[A][B]))(U[A][index1],j);
481 
482     if (t==NULL_TRACK)
483     {
484         valid = -INFINITY;
485     }
486     else if (match_type==MATCH)
487     {
488         emission = log10_emission_odds(track_get_base(t), read[j], qual[j]-33);
489     }
490 
491     score = V[A][index1] + T[A][B] + emission + valid;
492 
493     if (score>max_score)
494     {
495         max_score = score;
496         max_track = t;
497     }
498 
499     if (debug)
500     {
501         std::cerr << "\t" << state2string(A) << "=>" << state2string(B);
502         std::cerr << " (" << ((index1-j)>>MAXLEN_NBITS) << "," << j << ") ";
503         std::cerr << track2string(U[A][index1]) << "=>";
504         std::cerr << track2string(t) << " ";
505         std::cerr << emission << " (e: " << (track_get_d(t)<=RFLANK?track_get_base(t):'N') << " vs " << (j!=rlen?read[j]:'N')  << ") + ";
506         std::cerr << T[A][B] << " (t) + ";
507         std::cerr << V[A][index1] << " (p) + ";
508         std::cerr << valid << " (v) = ";
509         std::cerr << score << "\n";
510     }
511 }
512 
513 /**
514  * Align read against model.
515  */
align(const char * read,const char * qual)516 void RFHMM::align(const char* read, const char* qual)
517 {
518     clear_statistics();
519     optimal_path_traced = false;
520     this->read = read;
521     this->qual = qual;
522     rlen = strlen(read);
523     plen = rlen + rflen;
524 
525     if (rlen>MAXLEN)
526     {
527         fprintf(stderr, "[%s:%d %s] Sequence to be aligned is greater than %d currently supported: %d\n", __FILE__, __LINE__, __FUNCTION__, MAXLEN, rlen);
528         exit(1);
529     }
530 
531     float max = 0;
532     char maxPath = 'Y';
533 
534     size_t c,d,u,l;
535 
536     //alignment
537     //take into consideration
538     for (size_t i=1; i<=plen; ++i)
539     {
540         for (size_t j=1; j<=rlen; ++j)
541         {
542             size_t c = index(i,j);
543             size_t d = index(i-1,j-1);
544             size_t u = index(i-1,j);
545             size_t l = index(i,j-1);
546 
547             /////
548             //X//
549             /////
550             //invariant
551 
552             /////
553             //Y//
554             /////
555             //invariant
556 
557             /////
558             //M//
559             /////
560             //only need to update this i>rflen
561             if (debug) std::cerr << "(" << i << "," << j << ")\n";
562             max_score = -INFINITY;
563             max_track = NULL_TRACK;
564             proc_comp(S, M, d, j-1, MATCH);
565             proc_comp(Y, M, d, j-1, MATCH);
566             proc_comp(M, M, d, j-1, MATCH);
567             proc_comp(D, M, d, j-1, MATCH);
568             proc_comp(I, M, d, j-1, MATCH);
569             V[M][c] = max_score;
570             U[M][c] = max_track;
571             if (debug) std::cerr << "\tset M " << max_score << " - " << track2string(max_track) << "\n";
572 
573             /////
574             //D//
575             /////
576             max_score = -INFINITY;
577             max_track = NULL_TRACK;
578             proc_comp(S, D, u, j, MODEL);
579             proc_comp(Y, D, u, j, MODEL);
580             proc_comp(M, D, u, j, MODEL);
581             proc_comp(D, D, u, j, MODEL);
582             V[D][c] = max_score;
583             U[D][c] = max_track;
584             if (debug) std::cerr << "\tset D " << max_score << " - " << track2string(max_track) << "\n";
585 
586             /////
587             //I//
588             /////
589             max_score = -INFINITY;
590             max_track = NULL_TRACK;
591             proc_comp(S, I, l, j-1, READ);
592             proc_comp(Y, I, l, j-1, READ);
593             proc_comp(M, I, l, j-1, READ);
594             proc_comp(I, I, l, j-1, READ);
595             V[I][c] = max_score;
596             U[I][c] = max_track;
597             if (debug) std::cerr << "\tset I " << max_score << " - " << track2string(max_track) << "\n";
598 
599             //////
600             //MR//
601             //////
602             max_score = -INFINITY;
603             max_track = NULL_TRACK;
604             proc_comp(S, MR, d, j-1, MATCH);
605             proc_comp(Y, MR, d, j-1, MATCH);
606             proc_comp(M, MR, d, j-1, MATCH);
607             proc_comp(D, MR, d, j-1, MATCH);
608             proc_comp(I, MR, d, j-1, MATCH);
609             proc_comp(MR, MR, d, j-1, MATCH);
610             V[MR][c] = max_score;
611             U[MR][c] = max_track;
612             if (debug) std::cerr << "\tset MR " << max_score << " - " << track2string(max_track) << "\n";
613 
614         }
615     }
616 
617     if (debug)
618     {
619         print(V[Y], plen+1, rlen+1);
620         std::cerr << "\n   =U[Y]=\n";
621         print_U(U[Y], plen+1, rlen+1);
622 
623         std::cerr << "\n   =V[M]=\n";
624         print(V[M], plen+1, rlen+1);
625         std::cerr << "\n   =U[M]=\n";
626         print_U(U[M], plen+1, rlen+1);
627         std::cerr << "\n   =V[D]=\n";
628         print(V[D], plen+1, rlen+1);
629         std::cerr << "\n   =U[D]=\n";
630         print_U(U[D], plen+1, rlen+1);
631         std::cerr << "\n   =V[I]=\n";
632         print(V[I], plen+1, rlen+1);
633         std::cerr << "\n   =U[I]=\n";
634         print_U(U[I], plen+1, rlen+1);
635 
636         std::cerr << "\n   =V[MR]=\n";
637         print(V[MR], plen+1, rlen+1);
638         std::cerr << "\n   =U[MR]=\n";
639         print_U(U[MR], plen+1, rlen+1);
640 
641         std::cerr << "\n";
642     }
643 
644     trace_path();
645 };
646 
647 /**
648  * Trace path after alignment.
649  */
trace_path()650 void RFHMM::trace_path()
651 {
652     //search for a complete path in MR or W or Z
653     size_t c;
654     optimal_score = -INFINITY;
655     optimal_track = NULL_TRACK;
656     optimal_state = TBD;
657     optimal_probe_len = 0;
658     for (size_t i=0; i<=plen; ++i)
659     {
660         c = index(i,rlen);
661         if (V[MR][c]>optimal_score)
662         {
663             optimal_score = V[MR][c];
664             optimal_track = U[MR][c];
665             optimal_state = MR;
666             optimal_probe_len = i;
667         }
668     }
669 
670     //trace path
671     optimal_path_ptr = optimal_path+(MAXLEN<<2)-1;
672     int32_t i = optimal_probe_len, j = rlen;
673     int32_t last_t = make_track(optimal_state, RFLANK, 0, rflen+1);
674     optimal_path_len = 0;
675     int32_t u;
676     int32_t des_t, src_t = make_track(E, RFLANK, 0, rflen+1);
677 
678     if (debug) std::cerr << "src_t (i,j) => dest_t : last_t\n";
679 
680     do
681     {
682         u = track_get_u(last_t);
683         last_t = U[u][index(i,j)];
684         des_t = track_set_u(last_t, u);
685         *optimal_path_ptr = des_t;
686 
687         collect_statistics(src_t, des_t, j);
688         if (debug) std::cerr << track2string(src_t) << " (" << i << "," << j << ") => " << track2string(des_t) << " :  " << track2string(last_t) << "\n";
689 
690         if (track_get_u(src_t)==TBD)
691         {
692             exit(1);
693         }
694         src_t = des_t;
695 
696         if (u==M || u==MR)
697         {
698             --i; --j;
699         }
700         else if (u==D)
701         {
702             --i;
703         }
704         else if (u==Y || u==I)
705         {
706             --j;
707         }
708 
709         --optimal_path_ptr;
710         ++optimal_path_len;
711 
712     } while (track_get_u(last_t)!=S);
713 
714     if (debug) std::cerr << track2string(src_t) << " (" << i << "," << j << ") => " << track2string(des_t) << " :  " << track2string(last_t) << "\n";
715     if (debug) std::cerr << track2string(last_t) << "\n";
716     if (debug) std::cerr << "\n";
717 
718     collect_statistics(src_t, last_t, j);
719 
720     ++optimal_path_ptr;
721     optimal_path_traced = true;
722 };
723 
724 /**
725  * Collect alignment summary statistics.
726  */
collect_statistics(int32_t src_t,int32_t des_t,int32_t j)727 void RFHMM::collect_statistics(int32_t src_t, int32_t des_t, int32_t j)
728 {
729     if (debug) std::cerr << "\t " << track2string(src_t) << " (" << j << ") => " << track2string(des_t) << "\n";
730 
731     int32_t src_u = track_get_u(src_t);
732     int32_t des_u = track_get_u(des_t);
733 
734     if (src_u==E)
735     {
736         if (des_u==MR)
737         {
738             rflank_end[MODEL] = track_get_p(des_t);
739             rflank_end[READ] = j;
740         }
741     }
742     else if (src_u==MR)
743     {
744         if (des_u==M)
745         {
746             rflank_start[MODEL] = track_get_p(src_t);
747             rflank_start[READ] = j+1;
748 
749             motif_end[MODEL] = track_get_c(des_t);
750             motif_count = track_get_c(des_t);
751             motif_end[READ] = j;
752 
753             //initialize array for tracking inexact repeats
754             for (size_t k=1; k<=motif_count; ++k)
755             {
756                 motif_discordance[k] = 0;
757             }
758 
759             if (track_get_base(des_t)!=read[j-1])
760             {
761                 ++motif_discordance[motif_count];
762             }
763         }
764         else if (des_u==D)
765         {
766             rflank_start[MODEL] = track_get_p(src_t);
767             rflank_start[READ] = j+1;
768 
769             motif_end[MODEL] = track_get_c(des_t);
770             motif_count = track_get_c(des_t);
771             motif_end[READ] = j;
772 
773             //initialize array for tracking inexact repeats
774             for (size_t k=1; k<=motif_count; ++k)
775             {
776                 motif_discordance[k] = 0;
777             }
778 
779             ++motif_discordance[motif_count];
780 
781         }
782         else if (des_u==I)
783         {
784         }
785         else if (des_u==Y)
786         {
787             rflank_start[MODEL] = track_get_p(src_t);
788             rflank_start[READ] = j+1;
789 
790             lflank_end[MODEL] = track_get_p(des_t);
791             lflank_end[READ] = j;
792         }
793     }
794     else if (src_u==M)
795     {
796         if (des_u==Y)
797         {
798             motif_start[MODEL] = track_get_c(src_t);
799             motif_start[READ] = j+1;
800             lflank_end[MODEL] = track_get_p(des_t);
801             lflank_end[READ] = j;
802         }
803     }
804     else if (src_u==Y)
805     {
806         if (des_u==S)
807         {
808             lflank_start[MODEL] = track_get_p(des_t);
809             lflank_start[READ] = j+1;
810         }
811 
812     }
813 
814     if (des_u==M)
815     {
816         if (track_get_base(des_t)!=read[j-1])
817         {
818             ++motif_discordance[track_get_c(des_t)];
819         }
820     }
821 
822     if (des_u==D || des_u==I)
823     {
824         ++motif_discordance[track_get_c(des_t)];
825     }
826 };
827 
828 /**
829  * Clear alignment statistics.
830  */
clear_statistics()831 void RFHMM::clear_statistics()
832 {
833     lflank_start[MODEL] = NAN;
834     lflank_start[READ] = NAN;
835     lflank_end[MODEL] = NAN;
836     lflank_end[READ] = NAN;
837     motif_start[MODEL] = NAN;
838     motif_start[READ] = NAN;
839     motif_end[MODEL] = NAN;
840     motif_end[READ] = NAN;
841     rflank_start[MODEL] = NAN;
842     rflank_start[READ] = NAN;
843     rflank_end[MODEL] = NAN;
844     rflank_end[READ] = NAN;
845     motif_count = NAN;
846     exact_motif_count = NAN;
847     motif_m = NAN;
848     motif_xid = NAN;
849     motif_concordance = NAN;
850     maxLogOdds = NAN;
851 }
852 
853 
854 /**
855  * Update alignment statistics after collection.
856  */
update_statistics()857 void RFHMM::update_statistics()
858 {
859     motif_concordance = (float)motif_m/(motif_m+motif_xid);
860 }
861 
862 /**
863  * Returns true if flanks are mapped.
864  */
flanks_are_mapped()865 bool RFHMM::flanks_are_mapped()
866 {
867     return rflank_start[MODEL]==rflen;
868 }
869 
870 /**
871  * Compute log10 emission odds based on equal error probability distribution.
872  */
log10_emission_odds(char probe_base,char read_base,uint32_t pl)873 float RFHMM::log10_emission_odds(char probe_base, char read_base, uint32_t pl)
874 {
875     //4 encodes for N
876 //    if (read_base=='N' || probe_base=='N')
877 //    {
878 //        //silent match
879 //        return -INFINITY;
880 //    }
881 
882     if (read_base!=probe_base)
883     {
884         return LogTool::pl2log10_varp(pl) - par.mismatch_penalty;
885     }
886     else
887     {
888         return -(LogTool::pl2log10_varp(pl));
889     }
890 };
891 
892 /**
893  * Converts state to string representation.
894  */
state2string(int32_t state)895 std::string RFHMM::state2string(int32_t state)
896 {
897     if (state==S)
898     {
899         return "S";
900     }
901     else if (state==Y)
902     {
903         return "Y";
904     }
905     else if (state==M)
906     {
907         return "M";
908     }
909     else if (state==D)
910     {
911         return "D";
912     }
913     else if (state==I)
914     {
915         return "I";
916     }
917     else if (state==MR)
918     {
919         return "MR";
920     }
921     else if (state==E)
922     {
923         return "E";
924     }
925     else if (state==N)
926     {
927         return "N";
928     }
929     else if (state==TBD)
930     {
931         return "*";
932     }
933     else
934     {
935         return "!";
936     }
937 }
938 
939 /**
940  * Converts state to cigar string representation.
941  */
state2cigarstring(int32_t state)942 std::string RFHMM::state2cigarstring(int32_t state)
943 {
944     if (state==S)
945     {
946         return "S";
947     }
948     else if (state==Y)
949     {
950         return "Y";
951     }
952     else if (state==M)
953     {
954         return "M";
955     }
956     else if (state==D)
957     {
958         return "D";
959     }
960     else if (state==I)
961     {
962         return "I";
963     }
964     else if (state==MR)
965     {
966         return "R";
967     }
968     else if (state==E)
969     {
970         return "E";
971     }
972     else if (state==N)
973     {
974         return "N";
975     }
976     else if (state==TBD)
977     {
978         return "*";
979     }
980     else
981     {
982         return "!";
983     }
984 }
985 
986 /**
987  * Converts state to cigar string representation.
988  */
track2cigarstring1(int32_t t,int32_t j)989 std::string RFHMM::track2cigarstring1(int32_t t, int32_t j)
990 {
991     int32_t state = track_get_u(t);
992 
993     if (state==S)
994     {
995         return "S";
996     }
997     else if (state==Y)
998     {
999         return "Y";
1000     }
1001     else if (state==M || state==MR)
1002     {
1003         if (track_get_base(t)==read[j-1])
1004         {
1005             return "M";
1006         }
1007         else
1008         {
1009             return "*";
1010         }
1011     }
1012     else if (state==D)
1013     {
1014         return "D";
1015     }
1016     else if (state==I)
1017     {
1018         return "I";
1019     }
1020     else if (state==E)
1021     {
1022         return "E";
1023     }
1024     else if (state==N)
1025     {
1026         return "N";
1027     }
1028     else if (state==TBD)
1029     {
1030         return "*";
1031     }
1032     else
1033     {
1034         return "!";
1035     }
1036 }
1037 
1038 /**
1039  * Converts track to cigar string representation.
1040  */
track2cigarstring2(int32_t t)1041 std::string RFHMM::track2cigarstring2(int32_t t)
1042 {
1043     int32_t state = track_get_u(t);
1044 
1045     if (state==M)
1046     {
1047         return (track_get_c(t)%2==0?"+":"o");
1048     }
1049     else if (state==D)
1050     {
1051         return (track_get_c(t)%2==0?"+":"o");
1052     }
1053     else if (state==I)
1054     {
1055         return (track_get_c(t)%2==0?"+":"o");
1056     }
1057     else if (state==MR)
1058     {
1059         return "R";
1060     }
1061     else
1062     {
1063         return " ";
1064     }
1065 }
1066 /**
1067  * Converts model component to string representation.
1068  */
component2string(int32_t component)1069 std::string RFHMM::component2string(int32_t component)
1070 {
1071     if (component==LFLANK)
1072     {
1073         return "l";
1074     }
1075     else if (component==MOTIF)
1076     {
1077         return "m";
1078     }
1079     else if (component==RFLANK)
1080     {
1081         return "r";
1082     }
1083     else if (component==UNMODELED)
1084     {
1085         return "!";
1086     }
1087     else if (component==READ)
1088     {
1089         return "s";
1090     }
1091     else if (component==UNCERTAIN)
1092     {
1093         return "?";
1094     }
1095     else
1096     {
1097         return "!";
1098     }
1099 }
1100 
1101 /**
1102  * Prints an alignment.
1103  */
print_alignment()1104 void RFHMM::print_alignment()
1105 {
1106     std::string pad = "\t";
1107     print_alignment(pad);
1108 };
1109 
1110 /**
1111  * Prints an alignment with padding.
1112  */
print_alignment(std::string & pad)1113 void RFHMM::print_alignment(std::string& pad)
1114 {
1115     std::cerr << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
1116 
1117     if (!optimal_path_traced)
1118     {
1119         std::cerr << "path not traced\n";
1120     }
1121 
1122     print_T();
1123     std::cerr << "\n";
1124     par.print();
1125     std::cerr << "\n";
1126 
1127     std::cerr << "repeat motif : " << model[MOTIF] << "\n";
1128     std::cerr << "rflank       : " << model[RFLANK] << "\n";
1129     std::cerr << "mlen         : " << mlen << "\n";
1130     std::cerr << "rflen        : " << rflen << "\n";
1131     std::cerr << "plen         : " << plen << "\n";
1132     std::cerr << "\n";
1133     std::cerr << "read         : " << read << "\n";
1134     std::cerr << "rlen         : " << rlen << "\n";
1135     std::cerr << "\n";
1136     std::cerr << "optimal score: " << optimal_score << "\n";
1137     std::cerr << "optimal state: " << state2string(optimal_state) << "\n";
1138     std::cerr << "optimal track: " << track2string(optimal_track) << "\n";
1139     std::cerr << "optimal probe len: " << optimal_probe_len << "\n";
1140     std::cerr << "optimal path length : " << optimal_path_len << "\n";
1141     std::cerr << "max j: " << rlen << "\n";
1142 
1143     std::cerr << "probe: " << "(" << lflank_start[MODEL] << "~" << lflank_end[MODEL] << ") "
1144                           << "[" << motif_start[MODEL] << "~" << motif_end[MODEL] << "] "
1145                           << "(" << rflank_start[MODEL] << "~" << rflank_end[MODEL] << ")\n";
1146     std::cerr << "read : " << "(" << lflank_start[READ] << "~" << lflank_end[READ] << ") "
1147                           << "[" << motif_start[READ] << "~" << motif_end[READ] << "] "
1148                           << "(" << rflank_start[READ] << "~" << rflank_end[READ] << ")\n";
1149     std::cerr << "\n";
1150     std::cerr << "motif #           : " << motif_count << " [" << motif_start[READ] << "," << motif_end[READ] << "]\n";
1151 
1152     exact_motif_count = motif_count;
1153     motif_concordance = 0;
1154     for (int32_t k=1; k<=motif_count; ++k)
1155     {
1156         if (motif_discordance[k])
1157         {
1158             --exact_motif_count;
1159         }
1160 
1161         if (mlen>=motif_discordance[k])
1162         {
1163             motif_concordance += (float)(mlen-motif_discordance[k]) / mlen;
1164         }
1165     }
1166     motif_concordance *= 1.0/motif_count;
1167     std::cerr << "motif concordance : " << (motif_concordance*100) << "% (" << exact_motif_count << "/" << motif_count << ")\n";
1168     std::cerr << "motif discordance : ";
1169     for (int32_t k=1; k<=motif_count; ++k)
1170     {
1171         std::cerr << motif_discordance[k] << (k==motif_count?"\n":"|");
1172     }
1173     std::cerr << "\n";
1174 
1175     //print path
1176     int32_t* path;
1177     path = optimal_path_ptr;
1178     std::cerr << "Model:  ";
1179     int32_t t = NULL_TRACK;
1180     int32_t j = 0;
1181     while (path<optimal_path+(MAXLEN<<2))
1182     {
1183         int32_t u = track_get_u(*path);
1184         if (u==M || u==D || u==MR)
1185         {
1186             std::cerr << track_get_base(*path);
1187         }
1188         else
1189         {
1190             std::cerr << '-';
1191         }
1192         ++path;
1193     }
1194     std::cerr << " \n";
1195 
1196     std::cerr << "       S";
1197     path = optimal_path_ptr;
1198     j=1;
1199     while (path<optimal_path+(MAXLEN<<2))
1200     {
1201         std::cerr << track2cigarstring1(*path,j);
1202         int32_t u = track_get_u(*path);
1203         if (u==Y || u==M || u==I || u==MR)
1204         {
1205             ++j;
1206         }
1207         ++path;
1208     }
1209     std::cerr << "E\n";
1210 
1211     path = optimal_path_ptr;
1212     std::cerr << "        ";
1213     while (path<optimal_path+(MAXLEN<<2))
1214     {
1215         std::cerr << track2cigarstring2(*path);
1216         ++path;
1217     }
1218     std::cerr << " \n";
1219 
1220     path = optimal_path_ptr;
1221     j=1;
1222     std::cerr << "Read:   ";
1223     while (path<optimal_path+(MAXLEN<<2))
1224     {
1225         int32_t u = track_get_u(*path);
1226         if (u==Y || u==M || u==I || u==MR)
1227         {
1228             std::cerr << read[j-1];
1229             ++j;
1230         }
1231         else
1232         {
1233             std::cerr << '-';
1234         }
1235         ++path;
1236     }
1237     std::cerr << " \n";
1238     std::cerr << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
1239 };
1240 
1241 /**
1242  * Prints a float matrix.
1243  */
print(float * v,size_t plen,size_t rlen)1244 void RFHMM::print(float *v, size_t plen, size_t rlen)
1245 {
1246     float val;
1247     std::cerr << std::setprecision(1) << std::fixed;
1248     for (size_t i=0; i<plen; ++i)
1249     {
1250         for (size_t j=0; j<rlen; ++j)
1251         {
1252             val =  v[index(i,j)];
1253             std::cerr << (val<0?"  ":"   ") << val;
1254         }
1255 
1256         std::cerr << "\n";
1257     }
1258 };
1259 
1260 /**
1261  * Prints a char matrix.
1262  */
print(int32_t * v,size_t plen,size_t rlen)1263 void RFHMM::print(int32_t *v, size_t plen, size_t rlen)
1264 {
1265     float val;
1266     std::cerr << std::setprecision(1) << std::fixed << std::setw(6);
1267     for (size_t i=0; i<plen; ++i)
1268     {
1269         for (size_t j=0; j<rlen; ++j)
1270         {
1271             val =  v[index(i,j)];
1272             std::cerr << (val<0?"  ":"   ") << val;
1273         }
1274 
1275         std::cerr << "\n";
1276     }
1277 };
1278 
1279 /**
1280  * Prints the transition matrix.
1281  */
print_T()1282 void RFHMM::print_T()
1283 {
1284     for (size_t j=S; j<=E; ++j)
1285     {
1286         std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << state2string(j);
1287     }
1288     std::cerr << "\n";
1289 
1290     for (size_t i=S; i<=E; ++i)
1291     {
1292         for (size_t j=S; j<=E; ++j)
1293         {
1294             if (j)
1295             {
1296                 std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1297             }
1298             else
1299             {
1300                 std::cerr << state2string(i) << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1301             }
1302         }
1303         std::cerr << "\n";
1304     }
1305 };
1306 
1307 /**
1308  * Prints U.
1309  */
print_U(int32_t * U,size_t plen,size_t rlen)1310 void RFHMM::print_U(int32_t *U, size_t plen, size_t rlen)
1311 {
1312     std::cerr << std::setprecision(1) << std::fixed;
1313     std::string state;
1314     for (size_t i=0; i<plen; ++i)
1315     {
1316         for (size_t j=0; j<rlen; ++j)
1317         {
1318             int32_t t = U[index(i,j)];
1319             state = state2string(track_get_u(t));
1320             std::cerr << (state.size()==1 ? "   " : "  ")
1321                       << state << "|"
1322                       << component2string(track_get_d(t)) << "|"
1323                       << track_get_c(t) << "|"
1324                       << track_get_p(t) << (j==rlen-1?"\n":"   ");
1325         }
1326     }
1327 };
1328 
1329 /**
1330  * Prints U and V.
1331  */
print_trace(int32_t state,size_t plen,size_t rlen)1332 void RFHMM::print_trace(int32_t state, size_t plen, size_t rlen)
1333 {
1334     std::cerr << std::setprecision(1) << std::fixed;
1335     int32_t *u = U[state];
1336     float *v = V[state];
1337     std::string s;
1338     for (size_t i=0; i<plen; ++i)
1339     {
1340         for (size_t j=0; j<rlen; ++j)
1341         {
1342             int32_t t = u[index(i,j)];
1343             s = state2string(track_get_u(t));
1344             std::cerr << (s.size()==1 ? "   " : "  ")
1345                       << s << "|"
1346                       << component2string(track_get_d(t)) << "|"
1347                       << track_get_c(t) << "|"
1348                       << track_get_p(t) << "|"
1349                       << v[index(i,j)];
1350         }
1351 
1352         std::cerr << "\n";
1353     }
1354 };
1355 
1356 /**
1357  * Returns a string representation of track.
1358  */
track2string(int32_t t)1359 std::string RFHMM::track2string(int32_t t)
1360 {
1361     kstring_t str = {0,0,0};
1362 
1363     kputs(state2string(track_get_u(t)).c_str(), &str);
1364     kputc('|', &str);
1365     kputs(component2string(track_get_d(t)).c_str(), &str);
1366     kputc('|', &str);
1367     kputw(track_get_c(t), &str);
1368     kputc('|', &str);
1369     kputw(track_get_p(t), &str);
1370 
1371     std::string s(str.s);
1372     if (str.m) free(str.s);
1373 
1374     return s;
1375 }
1376 
1377 /**
1378  * Prints track.
1379  */
print_track(int32_t t)1380 void RFHMM::print_track(int32_t t)
1381 {
1382     std::cerr << track2string(t) << "\n";
1383 }
1384 
1385 #undef MAXLEN
1386 #undef MAXLEN_NBITS
1387 #undef S
1388 #undef Y
1389 #undef M
1390 #undef I
1391 #undef D
1392 #undef MR
1393 #undef E
1394 #undef N
1395 #undef TBD
1396 #undef NSTATES
1397 
1398 #undef LFLANK
1399 #undef MOTIF
1400 #undef RFLANK
1401 #undef UNMODELED
1402 #undef UNCERTAIN
1403 
1404 #undef READ
1405 #undef MODEL
1406 #undef MATCH
1407 
1408 #undef index
1409 #undef track_get_u
1410 #undef track_get_d
1411 #undef track_get_d
1412 #undef track_get_c
1413 #undef track_get_p
1414 #undef track_get_base
1415 #undef track_valid
1416 #undef track_set_u
1417 #undef track_set_d
1418 #undef track_set_c
1419 #undef track_set_p
1420 #undef make_track
1421 
1422 #undef NULL_TRACK
1423 #undef START_TRACK