1 /* The MIT License
2 
3    Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #include "rfhmm_x.h"
25 
26 #define MAXLEN 1024
27 #define MAXLEN_NBITS 10
28 
29 //model states
30 #define S       0
31 #define Y       1
32 #define M       2
33 #define D       3
34 #define I       4
35 #define MR      5
36 #define E       6
37 #define N       7
38 #define TBD     8
39 
40 #define NSTATES 7
41 
42 //model components
43 #define LFLANK    0
44 #define MOTIF     1
45 #define RFLANK    2
46 #define UNMODELED 3
47 #define UNCERTAIN 4
48 
49 //match type
50 #define MODEL 0
51 #define READ  1
52 #define MATCH 2
53 
54 /*for indexing single array*/
55 #define index(i,j) (((i)<<MAXLEN_NBITS)+(j))
56 
57 /*functions for getting trace back information*/
58 #define track_get_u(t)      (((t)&0xF8000000)>>27)
59 #define track_get_d(t)      (((t)&0x07000000)>>24)
60 #define track_get_c(t)      (((t)&0x00FFF000)>>12)
61 #define track_get_p(t)      (((t)&0x00000FFF))
62 #define track_get_base(t)   (model[track_get_d(t)][track_get_p(t)-1])
63 #define track_valid(t)      ((track_get_d(t)==RFLANK||track_get_d(t)==MOTIF||track_get_d(t)==LFLANK)&&track_get_p(t)!=0)
64 #define track_set_u(t,u)    (((t)&0x07FFFFFF)|((u)<<27))
65 #define track_set_d(t,d)    (((t)&0xF8FFFFFF)|((d)<<24))
66 #define track_set_c(t,c)    (((t)&0xFF000FFF)|((c)<<12))
67 #define track_set_p(t,p)    (((t)&0xFFFFF000)|(p))
68 #define make_track(u,d,c,p) (((u)<<27)|((d)<<24)|((c)<<12)|(p))
69 
70 //[N|!|0|0]
71 #define NULL_TRACK  0x7C000000
72 //[N|l|0|0]
73 #define START_TRACK 0x78000000
74 
75 /**
76  * Constructor.
77  */
RFHMM_X(bool debug)78 RFHMM_X::RFHMM_X(bool debug)
79 {
80     this-> debug = debug;
81     initialize();
82 };
83 
84 /**
85  * Destructor.
86  */
~RFHMM_X()87 RFHMM_X::~RFHMM_X()
88 {
89     delete optimal_path;
90 
91     //the best alignment V_ for subsequence (i,j)
92     for (size_t state=S; state<=E; ++state)
93     {
94         delete V[state];
95         delete U[state];
96     }
97 
98     delete V;
99     delete U;
100 };
101 
102 /**
103  * Initializes objects; helper function for constructor.
104  */
initialize()105 void RFHMM_X::initialize()
106 {
107     initialize_structures();
108     initialize_T();
109     initialize_UV();
110 };
111 
112 /**
113  * Initializes objects for constructor.
114  */
initialize_structures()115 void RFHMM_X::initialize_structures()
116 {
117     model = new char*[3];
118     model[LFLANK] = NULL;
119     model[MOTIF] = NULL;
120     model[RFLANK] = NULL;
121 
122     rflen = 0;
123     mlen = 0;
124 
125     motif_discordance = new int32_t[MAXLEN];
126     optimal_path = new int32_t[MAXLEN<<2];
127     optimal_path_traced = false;
128 
129     typedef int32_t (RFHMM_X::*move) (int32_t t, int32_t j);
130     V = new float*[NSTATES];
131     U = new int32_t*[NSTATES];
132     moves = new move*[NSTATES];
133     for (size_t state=S; state<=E; ++state)
134     {
135         V[state] = new float[MAXLEN*MAXLEN];
136         U[state] = new int32_t[MAXLEN*MAXLEN];
137         moves[state] = new move[NSTATES];
138     }
139 
140     for (size_t state=S; state<=E; ++state)
141     {
142         moves[state] = new move[NSTATES];
143     }
144 
145     moves[S][Y] = &RFHMM_X::move_S_Y;
146     moves[Y][Y] = &RFHMM_X::move_Y_Y;
147 
148     moves[S][M] = &RFHMM_X::move_S_M;
149     moves[Y][M] = &RFHMM_X::move_Y_M;
150     moves[M][M] = &RFHMM_X::move_M_M;
151     moves[D][M] = &RFHMM_X::move_D_M;
152     moves[I][M] = &RFHMM_X::move_I_M;
153 
154     moves[S][D] = &RFHMM_X::move_S_D;
155     moves[Y][D] = &RFHMM_X::move_Y_D;
156     moves[M][D] = &RFHMM_X::move_M_D;
157     moves[D][D] = &RFHMM_X::move_D_D;
158 
159     moves[S][I] = &RFHMM_X::move_S_I;
160     moves[Y][I] = &RFHMM_X::move_Y_I;
161     moves[M][I] = &RFHMM_X::move_M_I;
162     moves[I][I] = &RFHMM_X::move_I_I;
163 
164     moves[S][MR] = &RFHMM_X::move_S_MR;
165     moves[Y][MR] = &RFHMM_X::move_Y_MR;
166     moves[M][MR] = &RFHMM_X::move_M_MR;
167     moves[D][MR] = &RFHMM_X::move_D_MR;
168     moves[I][MR] = &RFHMM_X::move_I_MR;
169     moves[MR][MR] = &RFHMM_X::move_MR_MR;
170 };
171 
172 /**
173  * Initialize transition matrix based on parameters.
174  */
initialize_T()175 void RFHMM_X::initialize_T()
176 {
177     float delta = par.delta;
178     float epsilon = par.epsilon;
179     float tau = par.tau;
180     float eta = par.eta;
181 
182     for (size_t i=S; i<=E; ++i)
183     {
184         for (size_t j=S; j<=E; ++j)
185         {
186             T[i][j] = -INFINITY;
187         }
188     }
189 
190     T[S][Y] = 0;
191     T[Y][Y] = 0;
192 
193     T[S][M] = log10((tau*(1-2*delta-tau))/(tau*eta*(1-eta)*(1-eta)));
194     T[Y][M] = T[S][M];
195     T[M][M] = log10(((1-2*delta-tau))/((1-eta)*(1-eta)));
196     T[D][M] = log10(((1-epsilon-tau))/((1-eta)*(1-eta)));
197     T[I][M] = T[I][M];
198 
199     T[S][D] = log10((tau*delta)/(eta*(1-eta)));
200     T[Y][D] = T[S][D];
201     T[M][D] = log10(delta/(1-eta));
202     T[D][D] = log10(epsilon/(1-eta));
203 
204     T[S][I] = log10((tau*delta)/(eta*(1-eta)));
205     T[Y][I] = T[S][I];
206     T[M][I] = log10(delta/(1-eta));
207     T[I][I] = log10(epsilon/(1-eta));
208 
209     T[S][MR] = log10((tau*tau*(1-tau))/(tau*eta*eta*eta*(1-eta)*(1-eta)));
210     T[Y][MR] = log10((eta*tau)/(eta*eta*eta));
211     T[M][MR] = log10((tau*(1-tau))/(eta*eta*(1-eta)*(1-eta)));
212     T[D][MR] = T[M][MR];
213     T[I][MR] = T[D][MR];
214     T[MR][MR] = log10(((1-2*delta-tau))/((1-eta)*(1-eta)));
215 };
216 
217 /**
218  * Initializes U and V.
219  */
initialize_UV()220 void RFHMM_X::initialize_UV()
221 {
222     int32_t t=0;
223     for (size_t i=0; i<MAXLEN; ++i)
224     {
225         for (size_t j=0; j<MAXLEN; ++j)
226         {
227             size_t c = index(i,j);
228 
229             V[S][c] = -INFINITY;
230             U[S][c] = NULL_TRACK;
231 
232             //Y
233             V[Y][c] = 0;
234             if (i>=1)
235             {
236                 V[Y][c] = -INFINITY;
237                 U[Y][c] = NULL_TRACK;
238             }
239             else
240             {
241                 if (j==1)
242                 {
243                     U[Y][c] = make_track(S,LFLANK,0,j);
244                 }
245                 else
246                 {
247                     U[Y][c] =  make_track(Y,LFLANK,0,j);
248                 }
249             }
250 
251             //M
252             V[M][c] = -INFINITY;
253             if (!i || !j)
254             {
255                 U[M][c] = make_track(N,UNMODELED,0,0);
256             }
257             else
258             {
259                 U[M][c] = make_track(TBD,UNCERTAIN,0,0);
260             }
261 
262             //D
263             V[D][c] = -INFINITY;
264             if (!i || !j)
265             {
266                 U[D][c] = make_track(N,UNMODELED,0,0);
267             }
268             else
269             {
270                 U[D][c] = make_track(TBD,UNCERTAIN,0,0);
271             }
272 
273             //I
274             V[I][c] = -INFINITY;
275             if (!i || !j)
276             {
277                 U[I][c] = make_track(N,UNMODELED,0,0);
278             }
279             else
280             {
281                 U[I][c] = make_track(TBD,UNCERTAIN,0,0);
282             }
283 
284             //MR
285             V[MR][c] = -INFINITY;
286             if (!i || !j)
287             {
288                 U[MR][c] = make_track(N,UNMODELED,0,0);
289             }
290             else
291             {
292                 U[MR][c] = make_track(TBD,UNCERTAIN,0,0);
293             }
294         }
295     }
296 
297     V[S][index(0,0)] = 0;
298     U[S][index(0,0)] = START_TRACK;
299 
300     V[M][index(0,0)] = -INFINITY;
301     V[MR][index(0,0)] = -INFINITY;
302 };
303 
304 /**
305  * Sets a model.
306  */
set_model(const char * motif,const char * rflank)307 void RFHMM_X::set_model(const char* motif, const char* rflank)
308 {
309     if (model[LFLANK]) free(model[LFLANK]);
310     if (model[MOTIF]) free(model[MOTIF]);
311     if (model[RFLANK]) free(model[RFLANK]);
312 
313     model[MOTIF] = strdup(motif);
314     model[RFLANK] = strdup(rflank);
315 
316     mlen = strlen(model[MOTIF]);
317     rflen = strlen(model[RFLANK]);
318 }
319 
320 /**
321  * Sets delta.
322  */
set_delta(float delta)323 void RFHMM_X::set_delta(float delta)
324 {
325     par.delta = delta;
326 }
327 
328 /**
329  * Sets epsilon.
330  */
set_epsilon(float epsilon)331 void RFHMM_X::set_epsilon(float epsilon)
332 {
333     par.epsilon = epsilon;
334 }
335 
336 /**
337  * Sets tau.
338  */
set_tau(float tau)339 void RFHMM_X::set_tau(float tau)
340 {
341     par.tau = tau;
342 }
343 
344 /**
345  * Sets eta.
346  */
set_eta(float eta)347 void RFHMM_X::set_eta(float eta)
348 {
349     par.eta = eta;
350 }
351 
352 /**
353  * Sets mismatch penalty.
354  */
set_mismatch_penalty(float mismatch_penalty)355 void RFHMM_X::set_mismatch_penalty(float mismatch_penalty)
356 {
357     par.mismatch_penalty = mismatch_penalty;
358 }
359 
360 /**
361  * Sets debug.
362  */
set_debug(bool debug)363 void RFHMM_X::set_debug(bool debug)
364 {
365     this->debug = debug;
366 }
367 
368 
369 /**
370  * Get left flank start position for model.
371  */
get_lflank_model_spos1()372 int32_t RFHMM_X::get_lflank_model_spos1()
373 {
374     return lflank_start[MODEL];
375 };
376 
377 /**
378  * Get left flank end position for model.
379  */
get_lflank_model_epos1()380 int32_t RFHMM_X::get_lflank_model_epos1()
381 {
382     return lflank_end[MODEL];
383 };
384 
385 /**
386  * Get motif start position for model.
387  */
get_motif_model_spos1()388 int32_t RFHMM_X::get_motif_model_spos1()
389 {
390     return motif_start[MODEL];
391 };
392 
393 /**
394  * Get motif end position for model.
395  */
get_motif_model_epos1()396 int32_t RFHMM_X::get_motif_model_epos1()
397 {
398     return motif_end[MODEL];
399 };
400 
401 /**
402  * Get right flank start position for model.
403  */
get_rflank_model_spos1()404 int32_t RFHMM_X::get_rflank_model_spos1()
405 {
406     return rflank_start[MODEL];
407 };
408 
409 /**
410  * Get right flank end position for model.
411  */
get_rflank_model_epos1()412 int32_t RFHMM_X::get_rflank_model_epos1()
413 {
414     return rflank_end[MODEL];
415 };
416 
417 /**
418  * Get left flank start position for read.
419  */
get_lflank_read_spos1()420 int32_t RFHMM_X::get_lflank_read_spos1()
421 {
422     return lflank_start[READ];
423 };
424 
425 /**
426  * Get left flank end position for read.
427  */
get_lflank_read_epos1()428 int32_t RFHMM_X::get_lflank_read_epos1()
429 {
430     return lflank_end[READ];
431 };
432 
433 /**
434  * Get motif start position for read.
435  */
get_motif_read_spos1()436 int32_t RFHMM_X::get_motif_read_spos1()
437 {
438     return motif_start[READ];
439 };
440 
441 /**
442  * Get motif end position for read.
443  */
get_motif_read_epos1()444 int32_t RFHMM_X::get_motif_read_epos1()
445 {
446     return motif_end[READ];
447 };
448 
449 /**
450  * Get right flank start position for read.
451  */
get_rflank_read_spos1()452 int32_t RFHMM_X::get_rflank_read_spos1()
453 {
454     return rflank_start[READ];
455 };
456 
457 /**
458  * Get right flank end position for read.
459  */
get_rflank_read_epos1()460 int32_t RFHMM_X::get_rflank_read_epos1()
461 {
462     return rflank_end[READ];
463 };
464 
465 /**
466  * Computes the score associated with the move from A to B
467  * Updates the max_score and associated max_track.
468  *
469  * @A      - start state
470  * @B      - end state
471  * @index1 - flattened index of the one dimensional array of start state
472  * @j      - 1 based position of read of start state
473  * @m      - base match required (MATCH, MODEL, READ)
474  */
proc_comp(int32_t A,int32_t B,int32_t index1,int32_t j,int32_t match_type)475 void RFHMM_X::proc_comp(int32_t A, int32_t B, int32_t index1, int32_t j, int32_t match_type)
476 {
477     float emission = 0, score = 0, valid = 0;
478 
479     //t is the new track
480     int32_t  t =  (this->*(moves[A][B]))(U[A][index1],j);
481 
482     if (t==NULL_TRACK)
483     {
484         valid = -INFINITY;
485     }
486     else if (match_type==MATCH)
487     {
488         emission = log10_emission_odds(track_get_base(t), read[j], qual[j]-33);
489     }
490 
491     score = V[A][index1] + T[A][B] + emission + valid;
492 
493     if (score>max_score)
494     {
495         max_score = score;
496         max_track = t;
497     }
498 
499     if (debug)
500     {
501         std::cerr << "\t" << state2string(A) << "=>" << state2string(B);
502         std::cerr << " (" << ((index1-j)>>MAXLEN_NBITS) << "," << j << ") ";
503         std::cerr << track2string(U[A][index1]) << "=>";
504         std::cerr << track2string(t) << " ";
505         std::cerr << emission << " (e: " << (track_get_d(t)<=RFLANK?track_get_base(t):'N') << " vs " << (j!=rlen?read[j]:'N')  << ") + ";
506         std::cerr << T[A][B] << " (t) + ";
507         std::cerr << V[A][index1] << " (p) + ";
508         std::cerr << valid << " (v) = ";
509         std::cerr << score << "\n";
510     }
511 }
512 
513 /**
514  * Align model against read.
515  *
516  * This implements the dynamic programming algorithm in a
517  * right to left fashion, this allows the repeat units to
518  * end naturally in partial units when trailing on the left
519  * hand side.
520  *
521  *   R E A D
522  * M
523  * O
524  * D
525  * E
526  * L
527  */
align(const char * read,const char * qual)528 void RFHMM_X::align(const char* read, const char* qual)
529 {
530     clear_statistics();
531     optimal_path_traced = false;
532     this->read = read;
533     this->qual = qual;
534     rlen = strlen(read);
535     plen = rlen + rflen;
536 
537     if (rlen>MAXLEN)
538     {
539         fprintf(stderr, "[%s:%d %s] Sequence to be aligned is greater than %d currently supported: %d\n", __FILE__, __LINE__, __FUNCTION__, MAXLEN, rlen);
540         exit(1);
541     }
542 
543     float max = 0;
544     char maxPath = 'Y';
545 
546     size_t c,d,u,l;
547 
548     //right to left alignment
549     for (size_t i=0; i>=plen; ++i)
550     {
551         for (size_t j=MAXLEN; j<=MAXLEN-rlen; --j)
552         {
553             size_t c = index(i,j);
554             size_t d = index(i-1,j+1);
555             size_t u = index(i-1,j);
556             size_t l = index(i,j+1);
557 
558             /////
559             //X//
560             /////
561             //invariant
562 
563             /////
564             //Y//
565             /////
566             //invariant
567 
568             /////
569             //M//
570             /////
571             //only need to update this i>rflen
572             if (debug) std::cerr << "(" << i << "," << j << ")\n";
573             max_score = -INFINITY;
574             max_track = NULL_TRACK;
575             proc_comp(S, M, d, j+1, MATCH);
576             proc_comp(Y, M, d, j+1, MATCH);
577             proc_comp(M, M, d, j+1, MATCH);
578             proc_comp(D, M, d, j+1, MATCH);
579             proc_comp(I, M, d, j+1, MATCH);
580             V[M][c] = max_score;
581             U[M][c] = max_track;
582             if (debug) std::cerr << "\tset M " << max_score << " - " << track2string(max_track) << "\n";
583 
584             /////
585             //D//
586             /////
587             max_score = -INFINITY;
588             max_track = NULL_TRACK;
589             proc_comp(S, D, u, j, MODEL);
590             proc_comp(Y, D, u, j, MODEL);
591             proc_comp(M, D, u, j, MODEL);
592             proc_comp(D, D, u, j, MODEL);
593             V[D][c] = max_score;
594             U[D][c] = max_track;
595             if (debug) std::cerr << "\tset D " << max_score << " - " << track2string(max_track) << "\n";
596 
597             /////
598             //I//
599             /////
600             max_score = -INFINITY;
601             max_track = NULL_TRACK;
602             proc_comp(S, I, l, j+1, READ);
603             proc_comp(Y, I, l, j+1, READ);
604             proc_comp(M, I, l, j+1, READ);
605             proc_comp(I, I, l, j+1, READ);
606             V[I][c] = max_score;
607             U[I][c] = max_track;
608             if (debug) std::cerr << "\tset I " << max_score << " - " << track2string(max_track) << "\n";
609 
610             //////
611             //MR//
612             //////
613             max_score = -INFINITY;
614             max_track = NULL_TRACK;
615             proc_comp(S, MR, d, j+1, MATCH);
616             proc_comp(Y, MR, d, j+1, MATCH);
617             proc_comp(M, MR, d, j+1, MATCH);
618             proc_comp(D, MR, d, j+1, MATCH);
619             proc_comp(I, MR, d, j+1, MATCH);
620             proc_comp(MR, MR, d, j+1, MATCH);
621             V[MR][c] = max_score;
622             U[MR][c] = max_track;
623             if (debug) std::cerr << "\tset MR " << max_score << " - " << track2string(max_track) << "\n";
624 
625         }
626     }
627 
628     if (0)
629 {
630     //alignment
631     //take into consideration
632     for (size_t i=1; i<=plen; ++i)
633     {
634         for (size_t j=1; j<=rlen; ++j)
635         {
636             size_t c = index(i,j);
637             size_t d = index(i-1,j-1);
638             size_t u = index(i-1,j);
639             size_t l = index(i,j-1);
640 
641             /////
642             //X//
643             /////
644             //invariant
645 
646             /////
647             //Y//
648             /////
649             //invariant
650 
651             /////
652             //M//
653             /////
654             //only need to update this i>rflen
655             if (debug) std::cerr << "(" << i << "," << j << ")\n";
656             max_score = -INFINITY;
657             max_track = NULL_TRACK;
658             proc_comp(S, M, d, j-1, MATCH);
659             proc_comp(Y, M, d, j-1, MATCH);
660             proc_comp(M, M, d, j-1, MATCH);
661             proc_comp(D, M, d, j-1, MATCH);
662             proc_comp(I, M, d, j-1, MATCH);
663             V[M][c] = max_score;
664             U[M][c] = max_track;
665             if (debug) std::cerr << "\tset M " << max_score << " - " << track2string(max_track) << "\n";
666 
667             /////
668             //D//
669             /////
670             max_score = -INFINITY;
671             max_track = NULL_TRACK;
672             proc_comp(S, D, u, j, MODEL);
673             proc_comp(Y, D, u, j, MODEL);
674             proc_comp(M, D, u, j, MODEL);
675             proc_comp(D, D, u, j, MODEL);
676             V[D][c] = max_score;
677             U[D][c] = max_track;
678             if (debug) std::cerr << "\tset D " << max_score << " - " << track2string(max_track) << "\n";
679 
680             /////
681             //I//
682             /////
683             max_score = -INFINITY;
684             max_track = NULL_TRACK;
685             proc_comp(S, I, l, j-1, READ);
686             proc_comp(Y, I, l, j-1, READ);
687             proc_comp(M, I, l, j-1, READ);
688             proc_comp(I, I, l, j-1, READ);
689             V[I][c] = max_score;
690             U[I][c] = max_track;
691             if (debug) std::cerr << "\tset I " << max_score << " - " << track2string(max_track) << "\n";
692 
693             //////
694             //MR//
695             //////
696             max_score = -INFINITY;
697             max_track = NULL_TRACK;
698             proc_comp(S, MR, d, j-1, MATCH);
699             proc_comp(Y, MR, d, j-1, MATCH);
700             proc_comp(M, MR, d, j-1, MATCH);
701             proc_comp(D, MR, d, j-1, MATCH);
702             proc_comp(I, MR, d, j-1, MATCH);
703             proc_comp(MR, MR, d, j-1, MATCH);
704             V[MR][c] = max_score;
705             U[MR][c] = max_track;
706             if (debug) std::cerr << "\tset MR " << max_score << " - " << track2string(max_track) << "\n";
707 
708         }
709     }
710 }
711 
712     if (debug)
713     {
714         print(V[Y], plen+1, rlen+1);
715         std::cerr << "\n   =U[Y]=\n";
716         print_U(U[Y], plen+1, rlen+1);
717 
718         std::cerr << "\n   =V[M]=\n";
719         print(V[M], plen+1, rlen+1);
720         std::cerr << "\n   =U[M]=\n";
721         print_U(U[M], plen+1, rlen+1);
722         std::cerr << "\n   =V[D]=\n";
723         print(V[D], plen+1, rlen+1);
724         std::cerr << "\n   =U[D]=\n";
725         print_U(U[D], plen+1, rlen+1);
726         std::cerr << "\n   =V[I]=\n";
727         print(V[I], plen+1, rlen+1);
728         std::cerr << "\n   =U[I]=\n";
729         print_U(U[I], plen+1, rlen+1);
730 
731         std::cerr << "\n   =V[MR]=\n";
732         print(V[MR], plen+1, rlen+1);
733         std::cerr << "\n   =U[MR]=\n";
734         print_U(U[MR], plen+1, rlen+1);
735 
736         std::cerr << "\n";
737     }
738 
739     trace_path();
740 };
741 
742 /**
743  * Trace path after alignment.
744  */
trace_path()745 void RFHMM_X::trace_path()
746 {
747     //search for a complete path in MR or W or Z
748     size_t c;
749     optimal_score = -INFINITY;
750     optimal_track = NULL_TRACK;
751     optimal_state = TBD;
752     optimal_probe_len = 0;
753     for (size_t i=0; i<=plen; ++i)
754     {
755         c = index(i,rlen);
756         if (V[MR][c]>optimal_score)
757         {
758             optimal_score = V[MR][c];
759             optimal_track = U[MR][c];
760             optimal_state = MR;
761             optimal_probe_len = i;
762         }
763     }
764 
765     //trace path
766     optimal_path_ptr = optimal_path+(MAXLEN<<2)-1;
767     int32_t i = optimal_probe_len, j = rlen;
768     int32_t last_t = make_track(optimal_state, RFLANK, 0, rflen+1);
769     optimal_path_len = 0;
770     int32_t u;
771     int32_t des_t, src_t = make_track(E, RFLANK, 0, rflen+1);
772 
773     if (debug) std::cerr << "src_t (i,j) => dest_t : last_t\n";
774 
775     do
776     {
777         u = track_get_u(last_t);
778         last_t = U[u][index(i,j)];
779         des_t = track_set_u(last_t, u);
780         *optimal_path_ptr = des_t;
781 
782         collect_statistics(src_t, des_t, j);
783         if (debug) std::cerr << track2string(src_t) << " (" << i << "," << j << ") => " << track2string(des_t) << " :  " << track2string(last_t) << "\n";
784 
785         if (track_get_u(src_t)==TBD)
786         {
787             exit(1);
788         }
789         src_t = des_t;
790 
791         if (u==M || u==MR)
792         {
793             --i; --j;
794         }
795         else if (u==D)
796         {
797             --i;
798         }
799         else if (u==Y || u==I)
800         {
801             --j;
802         }
803 
804         --optimal_path_ptr;
805         ++optimal_path_len;
806 
807     } while (track_get_u(last_t)!=S);
808 
809     if (debug) std::cerr << track2string(src_t) << " (" << i << "," << j << ") => " << track2string(des_t) << " :  " << track2string(last_t) << "\n";
810     if (debug) std::cerr << track2string(last_t) << "\n";
811     if (debug) std::cerr << "\n";
812 
813     collect_statistics(src_t, last_t, j);
814 
815     ++optimal_path_ptr;
816     optimal_path_traced = true;
817 };
818 
819 /**
820  * Collect alignment summary statistics.
821  */
collect_statistics(int32_t src_t,int32_t des_t,int32_t j)822 void RFHMM_X::collect_statistics(int32_t src_t, int32_t des_t, int32_t j)
823 {
824     if (debug) std::cerr << "\t " << track2string(src_t) << " (" << j << ") => " << track2string(des_t) << "\n";
825 
826     int32_t src_u = track_get_u(src_t);
827     int32_t des_u = track_get_u(des_t);
828 
829     if (src_u==E)
830     {
831         if (des_u==MR)
832         {
833             rflank_end[MODEL] = track_get_p(des_t);
834             rflank_end[READ] = j;
835         }
836     }
837     else if (src_u==MR)
838     {
839         if (des_u==M)
840         {
841             rflank_start[MODEL] = track_get_p(src_t);
842             rflank_start[READ] = j+1;
843 
844             motif_end[MODEL] = track_get_c(des_t);
845             motif_count = track_get_c(des_t);
846             motif_end[READ] = j;
847 
848             //initialize array for tracking inexact repeats
849             for (size_t k=1; k<=motif_count; ++k)
850             {
851                 motif_discordance[k] = 0;
852             }
853 
854             if (track_get_base(des_t)!=read[j-1])
855             {
856                 ++motif_discordance[motif_count];
857             }
858         }
859         else if (des_u==D)
860         {
861             rflank_start[MODEL] = track_get_p(src_t);
862             rflank_start[READ] = j+1;
863 
864             motif_end[MODEL] = track_get_c(des_t);
865             motif_count = track_get_c(des_t);
866             motif_end[READ] = j;
867 
868             //initialize array for tracking inexact repeats
869             for (size_t k=1; k<=motif_count; ++k)
870             {
871                 motif_discordance[k] = 0;
872             }
873 
874             ++motif_discordance[motif_count];
875 
876         }
877         else if (des_u==I)
878         {
879         }
880         else if (des_u==Y)
881         {
882             rflank_start[MODEL] = track_get_p(src_t);
883             rflank_start[READ] = j+1;
884 
885             lflank_end[MODEL] = track_get_p(des_t);
886             lflank_end[READ] = j;
887         }
888     }
889     else if (src_u==M)
890     {
891         if (des_u==Y)
892         {
893             motif_start[MODEL] = track_get_c(src_t);
894             motif_start[READ] = j+1;
895             lflank_end[MODEL] = track_get_p(des_t);
896             lflank_end[READ] = j;
897         }
898     }
899     else if (src_u==Y)
900     {
901         if (des_u==S)
902         {
903             lflank_start[MODEL] = track_get_p(des_t);
904             lflank_start[READ] = j+1;
905         }
906 
907     }
908 
909     if (des_u==M)
910     {
911         if (track_get_base(des_t)!=read[j-1])
912         {
913             ++motif_discordance[track_get_c(des_t)];
914         }
915     }
916 
917     if (des_u==D || des_u==I)
918     {
919         ++motif_discordance[track_get_c(des_t)];
920     }
921 };
922 
923 /**
924  * Clear alignment statistics.
925  */
clear_statistics()926 void RFHMM_X::clear_statistics()
927 {
928     lflank_start[MODEL] = NAN;
929     lflank_start[READ] = NAN;
930     lflank_end[MODEL] = NAN;
931     lflank_end[READ] = NAN;
932     motif_start[MODEL] = NAN;
933     motif_start[READ] = NAN;
934     motif_end[MODEL] = NAN;
935     motif_end[READ] = NAN;
936     rflank_start[MODEL] = NAN;
937     rflank_start[READ] = NAN;
938     rflank_end[MODEL] = NAN;
939     rflank_end[READ] = NAN;
940     motif_count = NAN;
941     exact_motif_count = NAN;
942     motif_m = NAN;
943     motif_xid = NAN;
944     motif_concordance = NAN;
945     maxLogOdds = NAN;
946 }
947 
948 
949 /**
950  * Update alignment statistics after collection.
951  */
update_statistics()952 void RFHMM_X::update_statistics()
953 {
954     motif_concordance = (float)motif_m/(motif_m+motif_xid);
955 }
956 
957 /**
958  * Returns true if flanks are mapped.
959  */
flanks_are_mapped()960 bool RFHMM_X::flanks_are_mapped()
961 {
962     return rflank_start[MODEL]==rflen;
963 }
964 
965 /**
966  * Compute log10 emission odds based on equal error probability distribution.
967  */
log10_emission_odds(char probe_base,char read_base,uint32_t pl)968 float RFHMM_X::log10_emission_odds(char probe_base, char read_base, uint32_t pl)
969 {
970     //4 encodes for N
971 //    if (read_base=='N' || probe_base=='N')
972 //    {
973 //        //silent match
974 //        return -INFINITY;
975 //    }
976 
977     if (read_base!=probe_base)
978     {
979         return LogTool::pl2log10_varp(pl) - par.mismatch_penalty;
980     }
981     else
982     {
983         return -(LogTool::pl2log10_varp(pl));
984     }
985 };
986 
987 /**
988  * Converts state to string representation.
989  */
state2string(int32_t state)990 std::string RFHMM_X::state2string(int32_t state)
991 {
992     if (state==S)
993     {
994         return "S";
995     }
996     else if (state==Y)
997     {
998         return "Y";
999     }
1000     else if (state==M)
1001     {
1002         return "M";
1003     }
1004     else if (state==D)
1005     {
1006         return "D";
1007     }
1008     else if (state==I)
1009     {
1010         return "I";
1011     }
1012     else if (state==MR)
1013     {
1014         return "MR";
1015     }
1016     else if (state==E)
1017     {
1018         return "E";
1019     }
1020     else if (state==N)
1021     {
1022         return "N";
1023     }
1024     else if (state==TBD)
1025     {
1026         return "*";
1027     }
1028     else
1029     {
1030         return "!";
1031     }
1032 }
1033 
1034 /**
1035  * Converts state to cigar string representation.
1036  */
state2cigarstring(int32_t state)1037 std::string RFHMM_X::state2cigarstring(int32_t state)
1038 {
1039     if (state==S)
1040     {
1041         return "S";
1042     }
1043     else if (state==Y)
1044     {
1045         return "Y";
1046     }
1047     else if (state==M)
1048     {
1049         return "M";
1050     }
1051     else if (state==D)
1052     {
1053         return "D";
1054     }
1055     else if (state==I)
1056     {
1057         return "I";
1058     }
1059     else if (state==MR)
1060     {
1061         return "R";
1062     }
1063     else if (state==E)
1064     {
1065         return "E";
1066     }
1067     else if (state==N)
1068     {
1069         return "N";
1070     }
1071     else if (state==TBD)
1072     {
1073         return "*";
1074     }
1075     else
1076     {
1077         return "!";
1078     }
1079 }
1080 
1081 /**
1082  * Converts state to cigar string representation.
1083  */
track2cigarstring1(int32_t t,int32_t j)1084 std::string RFHMM_X::track2cigarstring1(int32_t t, int32_t j)
1085 {
1086     int32_t state = track_get_u(t);
1087 
1088     if (state==S)
1089     {
1090         return "S";
1091     }
1092     else if (state==Y)
1093     {
1094         return "Y";
1095     }
1096     else if (state==M || state==MR)
1097     {
1098         if (track_get_base(t)==read[j-1])
1099         {
1100             return "M";
1101         }
1102         else
1103         {
1104             return "*";
1105         }
1106     }
1107     else if (state==D)
1108     {
1109         return "D";
1110     }
1111     else if (state==I)
1112     {
1113         return "I";
1114     }
1115     else if (state==E)
1116     {
1117         return "E";
1118     }
1119     else if (state==N)
1120     {
1121         return "N";
1122     }
1123     else if (state==TBD)
1124     {
1125         return "*";
1126     }
1127     else
1128     {
1129         return "!";
1130     }
1131 }
1132 
1133 /**
1134  * Converts track to cigar string representation.
1135  */
track2cigarstring2(int32_t t)1136 std::string RFHMM_X::track2cigarstring2(int32_t t)
1137 {
1138     int32_t state = track_get_u(t);
1139 
1140     if (state==M)
1141     {
1142         return (track_get_c(t)%2==0?"+":"o");
1143     }
1144     else if (state==D)
1145     {
1146         return (track_get_c(t)%2==0?"+":"o");
1147     }
1148     else if (state==I)
1149     {
1150         return (track_get_c(t)%2==0?"+":"o");
1151     }
1152     else if (state==MR)
1153     {
1154         return "R";
1155     }
1156     else
1157     {
1158         return " ";
1159     }
1160 }
1161 /**
1162  * Converts model component to string representation.
1163  */
component2string(int32_t component)1164 std::string RFHMM_X::component2string(int32_t component)
1165 {
1166     if (component==LFLANK)
1167     {
1168         return "l";
1169     }
1170     else if (component==MOTIF)
1171     {
1172         return "m";
1173     }
1174     else if (component==RFLANK)
1175     {
1176         return "r";
1177     }
1178     else if (component==UNMODELED)
1179     {
1180         return "!";
1181     }
1182     else if (component==READ)
1183     {
1184         return "s";
1185     }
1186     else if (component==UNCERTAIN)
1187     {
1188         return "?";
1189     }
1190     else
1191     {
1192         return "!";
1193     }
1194 }
1195 
1196 /**
1197  * Prints an alignment.
1198  */
print_alignment()1199 void RFHMM_X::print_alignment()
1200 {
1201     std::string pad = "\t";
1202     print_alignment(pad);
1203 };
1204 
1205 /**
1206  * Prints an alignment with padding.
1207  */
print_alignment(std::string & pad)1208 void RFHMM_X::print_alignment(std::string& pad)
1209 {
1210     std::cerr << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
1211 
1212     if (!optimal_path_traced)
1213     {
1214         std::cerr << "path not traced\n";
1215     }
1216 
1217     print_T();
1218     std::cerr << "\n";
1219     par.print();
1220     std::cerr << "\n";
1221 
1222     std::cerr << "repeat motif : " << model[MOTIF] << "\n";
1223     std::cerr << "rflank       : " << model[RFLANK] << "\n";
1224     std::cerr << "mlen         : " << mlen << "\n";
1225     std::cerr << "rflen        : " << rflen << "\n";
1226     std::cerr << "plen         : " << plen << "\n";
1227     std::cerr << "\n";
1228     std::cerr << "read         : " << read << "\n";
1229     std::cerr << "rlen         : " << rlen << "\n";
1230     std::cerr << "\n";
1231     std::cerr << "optimal score: " << optimal_score << "\n";
1232     std::cerr << "optimal state: " << state2string(optimal_state) << "\n";
1233     std::cerr << "optimal track: " << track2string(optimal_track) << "\n";
1234     std::cerr << "optimal probe len: " << optimal_probe_len << "\n";
1235     std::cerr << "optimal path length : " << optimal_path_len << "\n";
1236     std::cerr << "max j: " << rlen << "\n";
1237 
1238     std::cerr << "probe: " << "(" << lflank_start[MODEL] << "~" << lflank_end[MODEL] << ") "
1239                           << "[" << motif_start[MODEL] << "~" << motif_end[MODEL] << "] "
1240                           << "(" << rflank_start[MODEL] << "~" << rflank_end[MODEL] << ")\n";
1241     std::cerr << "read : " << "(" << lflank_start[READ] << "~" << lflank_end[READ] << ") "
1242                           << "[" << motif_start[READ] << "~" << motif_end[READ] << "] "
1243                           << "(" << rflank_start[READ] << "~" << rflank_end[READ] << ")\n";
1244     std::cerr << "\n";
1245     std::cerr << "motif #           : " << motif_count << " [" << motif_start[READ] << "," << motif_end[READ] << "]\n";
1246 
1247     exact_motif_count = motif_count;
1248     motif_concordance = 0;
1249     for (int32_t k=1; k<=motif_count; ++k)
1250     {
1251         if (motif_discordance[k])
1252         {
1253             --exact_motif_count;
1254         }
1255 
1256         if (mlen>=motif_discordance[k])
1257         {
1258             motif_concordance += (float)(mlen-motif_discordance[k]) / mlen;
1259         }
1260     }
1261     motif_concordance *= 1.0/motif_count;
1262     std::cerr << "motif concordance : " << (motif_concordance*100) << "% (" << exact_motif_count << "/" << motif_count << ")\n";
1263     std::cerr << "motif discordance : ";
1264     for (int32_t k=1; k<=motif_count; ++k)
1265     {
1266         std::cerr << motif_discordance[k] << (k==motif_count?"\n":"|");
1267     }
1268     std::cerr << "\n";
1269 
1270     //print path
1271     int32_t* path;
1272     path = optimal_path_ptr;
1273     std::cerr << "Model:  ";
1274     int32_t t = NULL_TRACK;
1275     int32_t j = 0;
1276     while (path<optimal_path+(MAXLEN<<2))
1277     {
1278         int32_t u = track_get_u(*path);
1279         if (u==M || u==D || u==MR)
1280         {
1281             std::cerr << track_get_base(*path);
1282         }
1283         else
1284         {
1285             std::cerr << '-';
1286         }
1287         ++path;
1288     }
1289     std::cerr << " \n";
1290 
1291     std::cerr << "       S";
1292     path = optimal_path_ptr;
1293     j=1;
1294     while (path<optimal_path+(MAXLEN<<2))
1295     {
1296         std::cerr << track2cigarstring1(*path,j);
1297         int32_t u = track_get_u(*path);
1298         if (u==Y || u==M || u==I || u==MR)
1299         {
1300             ++j;
1301         }
1302         ++path;
1303     }
1304     std::cerr << "E\n";
1305 
1306     path = optimal_path_ptr;
1307     std::cerr << "        ";
1308     while (path<optimal_path+(MAXLEN<<2))
1309     {
1310         std::cerr << track2cigarstring2(*path);
1311         ++path;
1312     }
1313     std::cerr << " \n";
1314 
1315     path = optimal_path_ptr;
1316     j=1;
1317     std::cerr << "Read:   ";
1318     while (path<optimal_path+(MAXLEN<<2))
1319     {
1320         int32_t u = track_get_u(*path);
1321         if (u==Y || u==M || u==I || u==MR)
1322         {
1323             std::cerr << read[j-1];
1324             ++j;
1325         }
1326         else
1327         {
1328             std::cerr << '-';
1329         }
1330         ++path;
1331     }
1332     std::cerr << " \n";
1333     std::cerr << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
1334 };
1335 
1336 /**
1337  * Prints a float matrix.
1338  */
print(float * v,size_t plen,size_t rlen)1339 void RFHMM_X::print(float *v, size_t plen, size_t rlen)
1340 {
1341     float val;
1342     std::cerr << std::setprecision(1) << std::fixed;
1343     for (size_t i=0; i<plen; ++i)
1344     {
1345         for (size_t j=0; j<rlen; ++j)
1346         {
1347             val =  v[index(i,j)];
1348             std::cerr << (val<0?"  ":"   ") << val;
1349         }
1350 
1351         std::cerr << "\n";
1352     }
1353 };
1354 
1355 /**
1356  * Prints a char matrix.
1357  */
print(int32_t * v,size_t plen,size_t rlen)1358 void RFHMM_X::print(int32_t *v, size_t plen, size_t rlen)
1359 {
1360     float val;
1361     std::cerr << std::setprecision(1) << std::fixed << std::setw(6);
1362     for (size_t i=0; i<plen; ++i)
1363     {
1364         for (size_t j=0; j<rlen; ++j)
1365         {
1366             val =  v[index(i,j)];
1367             std::cerr << (val<0?"  ":"   ") << val;
1368         }
1369 
1370         std::cerr << "\n";
1371     }
1372 };
1373 
1374 /**
1375  * Prints the transition matrix.
1376  */
print_T()1377 void RFHMM_X::print_T()
1378 {
1379     for (size_t j=S; j<=E; ++j)
1380     {
1381         std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << state2string(j);
1382     }
1383     std::cerr << "\n";
1384 
1385     for (size_t i=S; i<=E; ++i)
1386     {
1387         for (size_t j=S; j<=E; ++j)
1388         {
1389             if (j)
1390             {
1391                 std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1392             }
1393             else
1394             {
1395                 std::cerr << state2string(i) << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1396             }
1397         }
1398         std::cerr << "\n";
1399     }
1400 };
1401 
1402 /**
1403  * Prints U.
1404  */
print_U(int32_t * U,size_t plen,size_t rlen)1405 void RFHMM_X::print_U(int32_t *U, size_t plen, size_t rlen)
1406 {
1407     std::cerr << std::setprecision(1) << std::fixed;
1408     std::string state;
1409     for (size_t i=0; i<plen; ++i)
1410     {
1411         for (size_t j=0; j<rlen; ++j)
1412         {
1413             int32_t t = U[index(i,j)];
1414             state = state2string(track_get_u(t));
1415             std::cerr << (state.size()==1 ? "   " : "  ")
1416                       << state << "|"
1417                       << component2string(track_get_d(t)) << "|"
1418                       << track_get_c(t) << "|"
1419                       << track_get_p(t) << (j==rlen-1?"\n":"   ");
1420         }
1421     }
1422 };
1423 
1424 /**
1425  * Prints U and V.
1426  */
print_trace(int32_t state,size_t plen,size_t rlen)1427 void RFHMM_X::print_trace(int32_t state, size_t plen, size_t rlen)
1428 {
1429     std::cerr << std::setprecision(1) << std::fixed;
1430     int32_t *u = U[state];
1431     float *v = V[state];
1432     std::string s;
1433     for (size_t i=0; i<plen; ++i)
1434     {
1435         for (size_t j=0; j<rlen; ++j)
1436         {
1437             int32_t t = u[index(i,j)];
1438             s = state2string(track_get_u(t));
1439             std::cerr << (s.size()==1 ? "   " : "  ")
1440                       << s << "|"
1441                       << component2string(track_get_d(t)) << "|"
1442                       << track_get_c(t) << "|"
1443                       << track_get_p(t) << "|"
1444                       << v[index(i,j)];
1445         }
1446 
1447         std::cerr << "\n";
1448     }
1449 };
1450 
1451 /**
1452  * Returns a string representation of track.
1453  */
track2string(int32_t t)1454 std::string RFHMM_X::track2string(int32_t t)
1455 {
1456     kstring_t str = {0,0,0};
1457 
1458     kputs(state2string(track_get_u(t)).c_str(), &str);
1459     kputc('|', &str);
1460     kputs(component2string(track_get_d(t)).c_str(), &str);
1461     kputc('|', &str);
1462     kputw(track_get_c(t), &str);
1463     kputc('|', &str);
1464     kputw(track_get_p(t), &str);
1465 
1466     std::string s(str.s);
1467     if (str.m) free(str.s);
1468 
1469     return s;
1470 }
1471 
1472 /**
1473  * Prints track.
1474  */
print_track(int32_t t)1475 void RFHMM_X::print_track(int32_t t)
1476 {
1477     std::cerr << track2string(t) << "\n";
1478 }
1479 
1480 #undef MAXLEN
1481 #undef MAXLEN_NBITS
1482 #undef S
1483 #undef Y
1484 #undef M
1485 #undef I
1486 #undef D
1487 #undef MR
1488 #undef E
1489 #undef N
1490 #undef TBD
1491 #undef NSTATES
1492 
1493 #undef LFLANK
1494 #undef MOTIF
1495 #undef RFLANK
1496 #undef UNMODELED
1497 #undef UNCERTAIN
1498 
1499 #undef READ
1500 #undef MODEL
1501 #undef MATCH
1502 
1503 #undef index
1504 #undef track_get_u
1505 #undef track_get_d
1506 #undef track_get_d
1507 #undef track_get_c
1508 #undef track_get_p
1509 #undef track_get_base
1510 #undef track_valid
1511 #undef track_set_u
1512 #undef track_set_d
1513 #undef track_set_c
1514 #undef track_set_p
1515 #undef make_track
1516 
1517 #undef NULL_TRACK
1518 #undef START_TRACK