1 /* The MIT License
2
3 Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "rfhmm_x.h"
25
26 #define MAXLEN 1024
27 #define MAXLEN_NBITS 10
28
29 //model states
30 #define S 0
31 #define Y 1
32 #define M 2
33 #define D 3
34 #define I 4
35 #define MR 5
36 #define E 6
37 #define N 7
38 #define TBD 8
39
40 #define NSTATES 7
41
42 //model components
43 #define LFLANK 0
44 #define MOTIF 1
45 #define RFLANK 2
46 #define UNMODELED 3
47 #define UNCERTAIN 4
48
49 //match type
50 #define MODEL 0
51 #define READ 1
52 #define MATCH 2
53
54 /*for indexing single array*/
55 #define index(i,j) (((i)<<MAXLEN_NBITS)+(j))
56
57 /*functions for getting trace back information*/
58 #define track_get_u(t) (((t)&0xF8000000)>>27)
59 #define track_get_d(t) (((t)&0x07000000)>>24)
60 #define track_get_c(t) (((t)&0x00FFF000)>>12)
61 #define track_get_p(t) (((t)&0x00000FFF))
62 #define track_get_base(t) (model[track_get_d(t)][track_get_p(t)-1])
63 #define track_valid(t) ((track_get_d(t)==RFLANK||track_get_d(t)==MOTIF||track_get_d(t)==LFLANK)&&track_get_p(t)!=0)
64 #define track_set_u(t,u) (((t)&0x07FFFFFF)|((u)<<27))
65 #define track_set_d(t,d) (((t)&0xF8FFFFFF)|((d)<<24))
66 #define track_set_c(t,c) (((t)&0xFF000FFF)|((c)<<12))
67 #define track_set_p(t,p) (((t)&0xFFFFF000)|(p))
68 #define make_track(u,d,c,p) (((u)<<27)|((d)<<24)|((c)<<12)|(p))
69
70 //[N|!|0|0]
71 #define NULL_TRACK 0x7C000000
72 //[N|l|0|0]
73 #define START_TRACK 0x78000000
74
75 /**
76 * Constructor.
77 */
RFHMM_X(bool debug)78 RFHMM_X::RFHMM_X(bool debug)
79 {
80 this-> debug = debug;
81 initialize();
82 };
83
84 /**
85 * Destructor.
86 */
~RFHMM_X()87 RFHMM_X::~RFHMM_X()
88 {
89 delete optimal_path;
90
91 //the best alignment V_ for subsequence (i,j)
92 for (size_t state=S; state<=E; ++state)
93 {
94 delete V[state];
95 delete U[state];
96 }
97
98 delete V;
99 delete U;
100 };
101
102 /**
103 * Initializes objects; helper function for constructor.
104 */
initialize()105 void RFHMM_X::initialize()
106 {
107 initialize_structures();
108 initialize_T();
109 initialize_UV();
110 };
111
112 /**
113 * Initializes objects for constructor.
114 */
initialize_structures()115 void RFHMM_X::initialize_structures()
116 {
117 model = new char*[3];
118 model[LFLANK] = NULL;
119 model[MOTIF] = NULL;
120 model[RFLANK] = NULL;
121
122 rflen = 0;
123 mlen = 0;
124
125 motif_discordance = new int32_t[MAXLEN];
126 optimal_path = new int32_t[MAXLEN<<2];
127 optimal_path_traced = false;
128
129 typedef int32_t (RFHMM_X::*move) (int32_t t, int32_t j);
130 V = new float*[NSTATES];
131 U = new int32_t*[NSTATES];
132 moves = new move*[NSTATES];
133 for (size_t state=S; state<=E; ++state)
134 {
135 V[state] = new float[MAXLEN*MAXLEN];
136 U[state] = new int32_t[MAXLEN*MAXLEN];
137 moves[state] = new move[NSTATES];
138 }
139
140 for (size_t state=S; state<=E; ++state)
141 {
142 moves[state] = new move[NSTATES];
143 }
144
145 moves[S][Y] = &RFHMM_X::move_S_Y;
146 moves[Y][Y] = &RFHMM_X::move_Y_Y;
147
148 moves[S][M] = &RFHMM_X::move_S_M;
149 moves[Y][M] = &RFHMM_X::move_Y_M;
150 moves[M][M] = &RFHMM_X::move_M_M;
151 moves[D][M] = &RFHMM_X::move_D_M;
152 moves[I][M] = &RFHMM_X::move_I_M;
153
154 moves[S][D] = &RFHMM_X::move_S_D;
155 moves[Y][D] = &RFHMM_X::move_Y_D;
156 moves[M][D] = &RFHMM_X::move_M_D;
157 moves[D][D] = &RFHMM_X::move_D_D;
158
159 moves[S][I] = &RFHMM_X::move_S_I;
160 moves[Y][I] = &RFHMM_X::move_Y_I;
161 moves[M][I] = &RFHMM_X::move_M_I;
162 moves[I][I] = &RFHMM_X::move_I_I;
163
164 moves[S][MR] = &RFHMM_X::move_S_MR;
165 moves[Y][MR] = &RFHMM_X::move_Y_MR;
166 moves[M][MR] = &RFHMM_X::move_M_MR;
167 moves[D][MR] = &RFHMM_X::move_D_MR;
168 moves[I][MR] = &RFHMM_X::move_I_MR;
169 moves[MR][MR] = &RFHMM_X::move_MR_MR;
170 };
171
172 /**
173 * Initialize transition matrix based on parameters.
174 */
initialize_T()175 void RFHMM_X::initialize_T()
176 {
177 float delta = par.delta;
178 float epsilon = par.epsilon;
179 float tau = par.tau;
180 float eta = par.eta;
181
182 for (size_t i=S; i<=E; ++i)
183 {
184 for (size_t j=S; j<=E; ++j)
185 {
186 T[i][j] = -INFINITY;
187 }
188 }
189
190 T[S][Y] = 0;
191 T[Y][Y] = 0;
192
193 T[S][M] = log10((tau*(1-2*delta-tau))/(tau*eta*(1-eta)*(1-eta)));
194 T[Y][M] = T[S][M];
195 T[M][M] = log10(((1-2*delta-tau))/((1-eta)*(1-eta)));
196 T[D][M] = log10(((1-epsilon-tau))/((1-eta)*(1-eta)));
197 T[I][M] = T[I][M];
198
199 T[S][D] = log10((tau*delta)/(eta*(1-eta)));
200 T[Y][D] = T[S][D];
201 T[M][D] = log10(delta/(1-eta));
202 T[D][D] = log10(epsilon/(1-eta));
203
204 T[S][I] = log10((tau*delta)/(eta*(1-eta)));
205 T[Y][I] = T[S][I];
206 T[M][I] = log10(delta/(1-eta));
207 T[I][I] = log10(epsilon/(1-eta));
208
209 T[S][MR] = log10((tau*tau*(1-tau))/(tau*eta*eta*eta*(1-eta)*(1-eta)));
210 T[Y][MR] = log10((eta*tau)/(eta*eta*eta));
211 T[M][MR] = log10((tau*(1-tau))/(eta*eta*(1-eta)*(1-eta)));
212 T[D][MR] = T[M][MR];
213 T[I][MR] = T[D][MR];
214 T[MR][MR] = log10(((1-2*delta-tau))/((1-eta)*(1-eta)));
215 };
216
217 /**
218 * Initializes U and V.
219 */
initialize_UV()220 void RFHMM_X::initialize_UV()
221 {
222 int32_t t=0;
223 for (size_t i=0; i<MAXLEN; ++i)
224 {
225 for (size_t j=0; j<MAXLEN; ++j)
226 {
227 size_t c = index(i,j);
228
229 V[S][c] = -INFINITY;
230 U[S][c] = NULL_TRACK;
231
232 //Y
233 V[Y][c] = 0;
234 if (i>=1)
235 {
236 V[Y][c] = -INFINITY;
237 U[Y][c] = NULL_TRACK;
238 }
239 else
240 {
241 if (j==1)
242 {
243 U[Y][c] = make_track(S,LFLANK,0,j);
244 }
245 else
246 {
247 U[Y][c] = make_track(Y,LFLANK,0,j);
248 }
249 }
250
251 //M
252 V[M][c] = -INFINITY;
253 if (!i || !j)
254 {
255 U[M][c] = make_track(N,UNMODELED,0,0);
256 }
257 else
258 {
259 U[M][c] = make_track(TBD,UNCERTAIN,0,0);
260 }
261
262 //D
263 V[D][c] = -INFINITY;
264 if (!i || !j)
265 {
266 U[D][c] = make_track(N,UNMODELED,0,0);
267 }
268 else
269 {
270 U[D][c] = make_track(TBD,UNCERTAIN,0,0);
271 }
272
273 //I
274 V[I][c] = -INFINITY;
275 if (!i || !j)
276 {
277 U[I][c] = make_track(N,UNMODELED,0,0);
278 }
279 else
280 {
281 U[I][c] = make_track(TBD,UNCERTAIN,0,0);
282 }
283
284 //MR
285 V[MR][c] = -INFINITY;
286 if (!i || !j)
287 {
288 U[MR][c] = make_track(N,UNMODELED,0,0);
289 }
290 else
291 {
292 U[MR][c] = make_track(TBD,UNCERTAIN,0,0);
293 }
294 }
295 }
296
297 V[S][index(0,0)] = 0;
298 U[S][index(0,0)] = START_TRACK;
299
300 V[M][index(0,0)] = -INFINITY;
301 V[MR][index(0,0)] = -INFINITY;
302 };
303
304 /**
305 * Sets a model.
306 */
set_model(const char * motif,const char * rflank)307 void RFHMM_X::set_model(const char* motif, const char* rflank)
308 {
309 if (model[LFLANK]) free(model[LFLANK]);
310 if (model[MOTIF]) free(model[MOTIF]);
311 if (model[RFLANK]) free(model[RFLANK]);
312
313 model[MOTIF] = strdup(motif);
314 model[RFLANK] = strdup(rflank);
315
316 mlen = strlen(model[MOTIF]);
317 rflen = strlen(model[RFLANK]);
318 }
319
320 /**
321 * Sets delta.
322 */
set_delta(float delta)323 void RFHMM_X::set_delta(float delta)
324 {
325 par.delta = delta;
326 }
327
328 /**
329 * Sets epsilon.
330 */
set_epsilon(float epsilon)331 void RFHMM_X::set_epsilon(float epsilon)
332 {
333 par.epsilon = epsilon;
334 }
335
336 /**
337 * Sets tau.
338 */
set_tau(float tau)339 void RFHMM_X::set_tau(float tau)
340 {
341 par.tau = tau;
342 }
343
344 /**
345 * Sets eta.
346 */
set_eta(float eta)347 void RFHMM_X::set_eta(float eta)
348 {
349 par.eta = eta;
350 }
351
352 /**
353 * Sets mismatch penalty.
354 */
set_mismatch_penalty(float mismatch_penalty)355 void RFHMM_X::set_mismatch_penalty(float mismatch_penalty)
356 {
357 par.mismatch_penalty = mismatch_penalty;
358 }
359
360 /**
361 * Sets debug.
362 */
set_debug(bool debug)363 void RFHMM_X::set_debug(bool debug)
364 {
365 this->debug = debug;
366 }
367
368
369 /**
370 * Get left flank start position for model.
371 */
get_lflank_model_spos1()372 int32_t RFHMM_X::get_lflank_model_spos1()
373 {
374 return lflank_start[MODEL];
375 };
376
377 /**
378 * Get left flank end position for model.
379 */
get_lflank_model_epos1()380 int32_t RFHMM_X::get_lflank_model_epos1()
381 {
382 return lflank_end[MODEL];
383 };
384
385 /**
386 * Get motif start position for model.
387 */
get_motif_model_spos1()388 int32_t RFHMM_X::get_motif_model_spos1()
389 {
390 return motif_start[MODEL];
391 };
392
393 /**
394 * Get motif end position for model.
395 */
get_motif_model_epos1()396 int32_t RFHMM_X::get_motif_model_epos1()
397 {
398 return motif_end[MODEL];
399 };
400
401 /**
402 * Get right flank start position for model.
403 */
get_rflank_model_spos1()404 int32_t RFHMM_X::get_rflank_model_spos1()
405 {
406 return rflank_start[MODEL];
407 };
408
409 /**
410 * Get right flank end position for model.
411 */
get_rflank_model_epos1()412 int32_t RFHMM_X::get_rflank_model_epos1()
413 {
414 return rflank_end[MODEL];
415 };
416
417 /**
418 * Get left flank start position for read.
419 */
get_lflank_read_spos1()420 int32_t RFHMM_X::get_lflank_read_spos1()
421 {
422 return lflank_start[READ];
423 };
424
425 /**
426 * Get left flank end position for read.
427 */
get_lflank_read_epos1()428 int32_t RFHMM_X::get_lflank_read_epos1()
429 {
430 return lflank_end[READ];
431 };
432
433 /**
434 * Get motif start position for read.
435 */
get_motif_read_spos1()436 int32_t RFHMM_X::get_motif_read_spos1()
437 {
438 return motif_start[READ];
439 };
440
441 /**
442 * Get motif end position for read.
443 */
get_motif_read_epos1()444 int32_t RFHMM_X::get_motif_read_epos1()
445 {
446 return motif_end[READ];
447 };
448
449 /**
450 * Get right flank start position for read.
451 */
get_rflank_read_spos1()452 int32_t RFHMM_X::get_rflank_read_spos1()
453 {
454 return rflank_start[READ];
455 };
456
457 /**
458 * Get right flank end position for read.
459 */
get_rflank_read_epos1()460 int32_t RFHMM_X::get_rflank_read_epos1()
461 {
462 return rflank_end[READ];
463 };
464
465 /**
466 * Computes the score associated with the move from A to B
467 * Updates the max_score and associated max_track.
468 *
469 * @A - start state
470 * @B - end state
471 * @index1 - flattened index of the one dimensional array of start state
472 * @j - 1 based position of read of start state
473 * @m - base match required (MATCH, MODEL, READ)
474 */
proc_comp(int32_t A,int32_t B,int32_t index1,int32_t j,int32_t match_type)475 void RFHMM_X::proc_comp(int32_t A, int32_t B, int32_t index1, int32_t j, int32_t match_type)
476 {
477 float emission = 0, score = 0, valid = 0;
478
479 //t is the new track
480 int32_t t = (this->*(moves[A][B]))(U[A][index1],j);
481
482 if (t==NULL_TRACK)
483 {
484 valid = -INFINITY;
485 }
486 else if (match_type==MATCH)
487 {
488 emission = log10_emission_odds(track_get_base(t), read[j], qual[j]-33);
489 }
490
491 score = V[A][index1] + T[A][B] + emission + valid;
492
493 if (score>max_score)
494 {
495 max_score = score;
496 max_track = t;
497 }
498
499 if (debug)
500 {
501 std::cerr << "\t" << state2string(A) << "=>" << state2string(B);
502 std::cerr << " (" << ((index1-j)>>MAXLEN_NBITS) << "," << j << ") ";
503 std::cerr << track2string(U[A][index1]) << "=>";
504 std::cerr << track2string(t) << " ";
505 std::cerr << emission << " (e: " << (track_get_d(t)<=RFLANK?track_get_base(t):'N') << " vs " << (j!=rlen?read[j]:'N') << ") + ";
506 std::cerr << T[A][B] << " (t) + ";
507 std::cerr << V[A][index1] << " (p) + ";
508 std::cerr << valid << " (v) = ";
509 std::cerr << score << "\n";
510 }
511 }
512
513 /**
514 * Align model against read.
515 *
516 * This implements the dynamic programming algorithm in a
517 * right to left fashion, this allows the repeat units to
518 * end naturally in partial units when trailing on the left
519 * hand side.
520 *
521 * R E A D
522 * M
523 * O
524 * D
525 * E
526 * L
527 */
align(const char * read,const char * qual)528 void RFHMM_X::align(const char* read, const char* qual)
529 {
530 clear_statistics();
531 optimal_path_traced = false;
532 this->read = read;
533 this->qual = qual;
534 rlen = strlen(read);
535 plen = rlen + rflen;
536
537 if (rlen>MAXLEN)
538 {
539 fprintf(stderr, "[%s:%d %s] Sequence to be aligned is greater than %d currently supported: %d\n", __FILE__, __LINE__, __FUNCTION__, MAXLEN, rlen);
540 exit(1);
541 }
542
543 float max = 0;
544 char maxPath = 'Y';
545
546 size_t c,d,u,l;
547
548 //right to left alignment
549 for (size_t i=0; i>=plen; ++i)
550 {
551 for (size_t j=MAXLEN; j<=MAXLEN-rlen; --j)
552 {
553 size_t c = index(i,j);
554 size_t d = index(i-1,j+1);
555 size_t u = index(i-1,j);
556 size_t l = index(i,j+1);
557
558 /////
559 //X//
560 /////
561 //invariant
562
563 /////
564 //Y//
565 /////
566 //invariant
567
568 /////
569 //M//
570 /////
571 //only need to update this i>rflen
572 if (debug) std::cerr << "(" << i << "," << j << ")\n";
573 max_score = -INFINITY;
574 max_track = NULL_TRACK;
575 proc_comp(S, M, d, j+1, MATCH);
576 proc_comp(Y, M, d, j+1, MATCH);
577 proc_comp(M, M, d, j+1, MATCH);
578 proc_comp(D, M, d, j+1, MATCH);
579 proc_comp(I, M, d, j+1, MATCH);
580 V[M][c] = max_score;
581 U[M][c] = max_track;
582 if (debug) std::cerr << "\tset M " << max_score << " - " << track2string(max_track) << "\n";
583
584 /////
585 //D//
586 /////
587 max_score = -INFINITY;
588 max_track = NULL_TRACK;
589 proc_comp(S, D, u, j, MODEL);
590 proc_comp(Y, D, u, j, MODEL);
591 proc_comp(M, D, u, j, MODEL);
592 proc_comp(D, D, u, j, MODEL);
593 V[D][c] = max_score;
594 U[D][c] = max_track;
595 if (debug) std::cerr << "\tset D " << max_score << " - " << track2string(max_track) << "\n";
596
597 /////
598 //I//
599 /////
600 max_score = -INFINITY;
601 max_track = NULL_TRACK;
602 proc_comp(S, I, l, j+1, READ);
603 proc_comp(Y, I, l, j+1, READ);
604 proc_comp(M, I, l, j+1, READ);
605 proc_comp(I, I, l, j+1, READ);
606 V[I][c] = max_score;
607 U[I][c] = max_track;
608 if (debug) std::cerr << "\tset I " << max_score << " - " << track2string(max_track) << "\n";
609
610 //////
611 //MR//
612 //////
613 max_score = -INFINITY;
614 max_track = NULL_TRACK;
615 proc_comp(S, MR, d, j+1, MATCH);
616 proc_comp(Y, MR, d, j+1, MATCH);
617 proc_comp(M, MR, d, j+1, MATCH);
618 proc_comp(D, MR, d, j+1, MATCH);
619 proc_comp(I, MR, d, j+1, MATCH);
620 proc_comp(MR, MR, d, j+1, MATCH);
621 V[MR][c] = max_score;
622 U[MR][c] = max_track;
623 if (debug) std::cerr << "\tset MR " << max_score << " - " << track2string(max_track) << "\n";
624
625 }
626 }
627
628 if (0)
629 {
630 //alignment
631 //take into consideration
632 for (size_t i=1; i<=plen; ++i)
633 {
634 for (size_t j=1; j<=rlen; ++j)
635 {
636 size_t c = index(i,j);
637 size_t d = index(i-1,j-1);
638 size_t u = index(i-1,j);
639 size_t l = index(i,j-1);
640
641 /////
642 //X//
643 /////
644 //invariant
645
646 /////
647 //Y//
648 /////
649 //invariant
650
651 /////
652 //M//
653 /////
654 //only need to update this i>rflen
655 if (debug) std::cerr << "(" << i << "," << j << ")\n";
656 max_score = -INFINITY;
657 max_track = NULL_TRACK;
658 proc_comp(S, M, d, j-1, MATCH);
659 proc_comp(Y, M, d, j-1, MATCH);
660 proc_comp(M, M, d, j-1, MATCH);
661 proc_comp(D, M, d, j-1, MATCH);
662 proc_comp(I, M, d, j-1, MATCH);
663 V[M][c] = max_score;
664 U[M][c] = max_track;
665 if (debug) std::cerr << "\tset M " << max_score << " - " << track2string(max_track) << "\n";
666
667 /////
668 //D//
669 /////
670 max_score = -INFINITY;
671 max_track = NULL_TRACK;
672 proc_comp(S, D, u, j, MODEL);
673 proc_comp(Y, D, u, j, MODEL);
674 proc_comp(M, D, u, j, MODEL);
675 proc_comp(D, D, u, j, MODEL);
676 V[D][c] = max_score;
677 U[D][c] = max_track;
678 if (debug) std::cerr << "\tset D " << max_score << " - " << track2string(max_track) << "\n";
679
680 /////
681 //I//
682 /////
683 max_score = -INFINITY;
684 max_track = NULL_TRACK;
685 proc_comp(S, I, l, j-1, READ);
686 proc_comp(Y, I, l, j-1, READ);
687 proc_comp(M, I, l, j-1, READ);
688 proc_comp(I, I, l, j-1, READ);
689 V[I][c] = max_score;
690 U[I][c] = max_track;
691 if (debug) std::cerr << "\tset I " << max_score << " - " << track2string(max_track) << "\n";
692
693 //////
694 //MR//
695 //////
696 max_score = -INFINITY;
697 max_track = NULL_TRACK;
698 proc_comp(S, MR, d, j-1, MATCH);
699 proc_comp(Y, MR, d, j-1, MATCH);
700 proc_comp(M, MR, d, j-1, MATCH);
701 proc_comp(D, MR, d, j-1, MATCH);
702 proc_comp(I, MR, d, j-1, MATCH);
703 proc_comp(MR, MR, d, j-1, MATCH);
704 V[MR][c] = max_score;
705 U[MR][c] = max_track;
706 if (debug) std::cerr << "\tset MR " << max_score << " - " << track2string(max_track) << "\n";
707
708 }
709 }
710 }
711
712 if (debug)
713 {
714 print(V[Y], plen+1, rlen+1);
715 std::cerr << "\n =U[Y]=\n";
716 print_U(U[Y], plen+1, rlen+1);
717
718 std::cerr << "\n =V[M]=\n";
719 print(V[M], plen+1, rlen+1);
720 std::cerr << "\n =U[M]=\n";
721 print_U(U[M], plen+1, rlen+1);
722 std::cerr << "\n =V[D]=\n";
723 print(V[D], plen+1, rlen+1);
724 std::cerr << "\n =U[D]=\n";
725 print_U(U[D], plen+1, rlen+1);
726 std::cerr << "\n =V[I]=\n";
727 print(V[I], plen+1, rlen+1);
728 std::cerr << "\n =U[I]=\n";
729 print_U(U[I], plen+1, rlen+1);
730
731 std::cerr << "\n =V[MR]=\n";
732 print(V[MR], plen+1, rlen+1);
733 std::cerr << "\n =U[MR]=\n";
734 print_U(U[MR], plen+1, rlen+1);
735
736 std::cerr << "\n";
737 }
738
739 trace_path();
740 };
741
742 /**
743 * Trace path after alignment.
744 */
trace_path()745 void RFHMM_X::trace_path()
746 {
747 //search for a complete path in MR or W or Z
748 size_t c;
749 optimal_score = -INFINITY;
750 optimal_track = NULL_TRACK;
751 optimal_state = TBD;
752 optimal_probe_len = 0;
753 for (size_t i=0; i<=plen; ++i)
754 {
755 c = index(i,rlen);
756 if (V[MR][c]>optimal_score)
757 {
758 optimal_score = V[MR][c];
759 optimal_track = U[MR][c];
760 optimal_state = MR;
761 optimal_probe_len = i;
762 }
763 }
764
765 //trace path
766 optimal_path_ptr = optimal_path+(MAXLEN<<2)-1;
767 int32_t i = optimal_probe_len, j = rlen;
768 int32_t last_t = make_track(optimal_state, RFLANK, 0, rflen+1);
769 optimal_path_len = 0;
770 int32_t u;
771 int32_t des_t, src_t = make_track(E, RFLANK, 0, rflen+1);
772
773 if (debug) std::cerr << "src_t (i,j) => dest_t : last_t\n";
774
775 do
776 {
777 u = track_get_u(last_t);
778 last_t = U[u][index(i,j)];
779 des_t = track_set_u(last_t, u);
780 *optimal_path_ptr = des_t;
781
782 collect_statistics(src_t, des_t, j);
783 if (debug) std::cerr << track2string(src_t) << " (" << i << "," << j << ") => " << track2string(des_t) << " : " << track2string(last_t) << "\n";
784
785 if (track_get_u(src_t)==TBD)
786 {
787 exit(1);
788 }
789 src_t = des_t;
790
791 if (u==M || u==MR)
792 {
793 --i; --j;
794 }
795 else if (u==D)
796 {
797 --i;
798 }
799 else if (u==Y || u==I)
800 {
801 --j;
802 }
803
804 --optimal_path_ptr;
805 ++optimal_path_len;
806
807 } while (track_get_u(last_t)!=S);
808
809 if (debug) std::cerr << track2string(src_t) << " (" << i << "," << j << ") => " << track2string(des_t) << " : " << track2string(last_t) << "\n";
810 if (debug) std::cerr << track2string(last_t) << "\n";
811 if (debug) std::cerr << "\n";
812
813 collect_statistics(src_t, last_t, j);
814
815 ++optimal_path_ptr;
816 optimal_path_traced = true;
817 };
818
819 /**
820 * Collect alignment summary statistics.
821 */
collect_statistics(int32_t src_t,int32_t des_t,int32_t j)822 void RFHMM_X::collect_statistics(int32_t src_t, int32_t des_t, int32_t j)
823 {
824 if (debug) std::cerr << "\t " << track2string(src_t) << " (" << j << ") => " << track2string(des_t) << "\n";
825
826 int32_t src_u = track_get_u(src_t);
827 int32_t des_u = track_get_u(des_t);
828
829 if (src_u==E)
830 {
831 if (des_u==MR)
832 {
833 rflank_end[MODEL] = track_get_p(des_t);
834 rflank_end[READ] = j;
835 }
836 }
837 else if (src_u==MR)
838 {
839 if (des_u==M)
840 {
841 rflank_start[MODEL] = track_get_p(src_t);
842 rflank_start[READ] = j+1;
843
844 motif_end[MODEL] = track_get_c(des_t);
845 motif_count = track_get_c(des_t);
846 motif_end[READ] = j;
847
848 //initialize array for tracking inexact repeats
849 for (size_t k=1; k<=motif_count; ++k)
850 {
851 motif_discordance[k] = 0;
852 }
853
854 if (track_get_base(des_t)!=read[j-1])
855 {
856 ++motif_discordance[motif_count];
857 }
858 }
859 else if (des_u==D)
860 {
861 rflank_start[MODEL] = track_get_p(src_t);
862 rflank_start[READ] = j+1;
863
864 motif_end[MODEL] = track_get_c(des_t);
865 motif_count = track_get_c(des_t);
866 motif_end[READ] = j;
867
868 //initialize array for tracking inexact repeats
869 for (size_t k=1; k<=motif_count; ++k)
870 {
871 motif_discordance[k] = 0;
872 }
873
874 ++motif_discordance[motif_count];
875
876 }
877 else if (des_u==I)
878 {
879 }
880 else if (des_u==Y)
881 {
882 rflank_start[MODEL] = track_get_p(src_t);
883 rflank_start[READ] = j+1;
884
885 lflank_end[MODEL] = track_get_p(des_t);
886 lflank_end[READ] = j;
887 }
888 }
889 else if (src_u==M)
890 {
891 if (des_u==Y)
892 {
893 motif_start[MODEL] = track_get_c(src_t);
894 motif_start[READ] = j+1;
895 lflank_end[MODEL] = track_get_p(des_t);
896 lflank_end[READ] = j;
897 }
898 }
899 else if (src_u==Y)
900 {
901 if (des_u==S)
902 {
903 lflank_start[MODEL] = track_get_p(des_t);
904 lflank_start[READ] = j+1;
905 }
906
907 }
908
909 if (des_u==M)
910 {
911 if (track_get_base(des_t)!=read[j-1])
912 {
913 ++motif_discordance[track_get_c(des_t)];
914 }
915 }
916
917 if (des_u==D || des_u==I)
918 {
919 ++motif_discordance[track_get_c(des_t)];
920 }
921 };
922
923 /**
924 * Clear alignment statistics.
925 */
clear_statistics()926 void RFHMM_X::clear_statistics()
927 {
928 lflank_start[MODEL] = NAN;
929 lflank_start[READ] = NAN;
930 lflank_end[MODEL] = NAN;
931 lflank_end[READ] = NAN;
932 motif_start[MODEL] = NAN;
933 motif_start[READ] = NAN;
934 motif_end[MODEL] = NAN;
935 motif_end[READ] = NAN;
936 rflank_start[MODEL] = NAN;
937 rflank_start[READ] = NAN;
938 rflank_end[MODEL] = NAN;
939 rflank_end[READ] = NAN;
940 motif_count = NAN;
941 exact_motif_count = NAN;
942 motif_m = NAN;
943 motif_xid = NAN;
944 motif_concordance = NAN;
945 maxLogOdds = NAN;
946 }
947
948
949 /**
950 * Update alignment statistics after collection.
951 */
update_statistics()952 void RFHMM_X::update_statistics()
953 {
954 motif_concordance = (float)motif_m/(motif_m+motif_xid);
955 }
956
957 /**
958 * Returns true if flanks are mapped.
959 */
flanks_are_mapped()960 bool RFHMM_X::flanks_are_mapped()
961 {
962 return rflank_start[MODEL]==rflen;
963 }
964
965 /**
966 * Compute log10 emission odds based on equal error probability distribution.
967 */
log10_emission_odds(char probe_base,char read_base,uint32_t pl)968 float RFHMM_X::log10_emission_odds(char probe_base, char read_base, uint32_t pl)
969 {
970 //4 encodes for N
971 // if (read_base=='N' || probe_base=='N')
972 // {
973 // //silent match
974 // return -INFINITY;
975 // }
976
977 if (read_base!=probe_base)
978 {
979 return LogTool::pl2log10_varp(pl) - par.mismatch_penalty;
980 }
981 else
982 {
983 return -(LogTool::pl2log10_varp(pl));
984 }
985 };
986
987 /**
988 * Converts state to string representation.
989 */
state2string(int32_t state)990 std::string RFHMM_X::state2string(int32_t state)
991 {
992 if (state==S)
993 {
994 return "S";
995 }
996 else if (state==Y)
997 {
998 return "Y";
999 }
1000 else if (state==M)
1001 {
1002 return "M";
1003 }
1004 else if (state==D)
1005 {
1006 return "D";
1007 }
1008 else if (state==I)
1009 {
1010 return "I";
1011 }
1012 else if (state==MR)
1013 {
1014 return "MR";
1015 }
1016 else if (state==E)
1017 {
1018 return "E";
1019 }
1020 else if (state==N)
1021 {
1022 return "N";
1023 }
1024 else if (state==TBD)
1025 {
1026 return "*";
1027 }
1028 else
1029 {
1030 return "!";
1031 }
1032 }
1033
1034 /**
1035 * Converts state to cigar string representation.
1036 */
state2cigarstring(int32_t state)1037 std::string RFHMM_X::state2cigarstring(int32_t state)
1038 {
1039 if (state==S)
1040 {
1041 return "S";
1042 }
1043 else if (state==Y)
1044 {
1045 return "Y";
1046 }
1047 else if (state==M)
1048 {
1049 return "M";
1050 }
1051 else if (state==D)
1052 {
1053 return "D";
1054 }
1055 else if (state==I)
1056 {
1057 return "I";
1058 }
1059 else if (state==MR)
1060 {
1061 return "R";
1062 }
1063 else if (state==E)
1064 {
1065 return "E";
1066 }
1067 else if (state==N)
1068 {
1069 return "N";
1070 }
1071 else if (state==TBD)
1072 {
1073 return "*";
1074 }
1075 else
1076 {
1077 return "!";
1078 }
1079 }
1080
1081 /**
1082 * Converts state to cigar string representation.
1083 */
track2cigarstring1(int32_t t,int32_t j)1084 std::string RFHMM_X::track2cigarstring1(int32_t t, int32_t j)
1085 {
1086 int32_t state = track_get_u(t);
1087
1088 if (state==S)
1089 {
1090 return "S";
1091 }
1092 else if (state==Y)
1093 {
1094 return "Y";
1095 }
1096 else if (state==M || state==MR)
1097 {
1098 if (track_get_base(t)==read[j-1])
1099 {
1100 return "M";
1101 }
1102 else
1103 {
1104 return "*";
1105 }
1106 }
1107 else if (state==D)
1108 {
1109 return "D";
1110 }
1111 else if (state==I)
1112 {
1113 return "I";
1114 }
1115 else if (state==E)
1116 {
1117 return "E";
1118 }
1119 else if (state==N)
1120 {
1121 return "N";
1122 }
1123 else if (state==TBD)
1124 {
1125 return "*";
1126 }
1127 else
1128 {
1129 return "!";
1130 }
1131 }
1132
1133 /**
1134 * Converts track to cigar string representation.
1135 */
track2cigarstring2(int32_t t)1136 std::string RFHMM_X::track2cigarstring2(int32_t t)
1137 {
1138 int32_t state = track_get_u(t);
1139
1140 if (state==M)
1141 {
1142 return (track_get_c(t)%2==0?"+":"o");
1143 }
1144 else if (state==D)
1145 {
1146 return (track_get_c(t)%2==0?"+":"o");
1147 }
1148 else if (state==I)
1149 {
1150 return (track_get_c(t)%2==0?"+":"o");
1151 }
1152 else if (state==MR)
1153 {
1154 return "R";
1155 }
1156 else
1157 {
1158 return " ";
1159 }
1160 }
1161 /**
1162 * Converts model component to string representation.
1163 */
component2string(int32_t component)1164 std::string RFHMM_X::component2string(int32_t component)
1165 {
1166 if (component==LFLANK)
1167 {
1168 return "l";
1169 }
1170 else if (component==MOTIF)
1171 {
1172 return "m";
1173 }
1174 else if (component==RFLANK)
1175 {
1176 return "r";
1177 }
1178 else if (component==UNMODELED)
1179 {
1180 return "!";
1181 }
1182 else if (component==READ)
1183 {
1184 return "s";
1185 }
1186 else if (component==UNCERTAIN)
1187 {
1188 return "?";
1189 }
1190 else
1191 {
1192 return "!";
1193 }
1194 }
1195
1196 /**
1197 * Prints an alignment.
1198 */
print_alignment()1199 void RFHMM_X::print_alignment()
1200 {
1201 std::string pad = "\t";
1202 print_alignment(pad);
1203 };
1204
1205 /**
1206 * Prints an alignment with padding.
1207 */
print_alignment(std::string & pad)1208 void RFHMM_X::print_alignment(std::string& pad)
1209 {
1210 std::cerr << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
1211
1212 if (!optimal_path_traced)
1213 {
1214 std::cerr << "path not traced\n";
1215 }
1216
1217 print_T();
1218 std::cerr << "\n";
1219 par.print();
1220 std::cerr << "\n";
1221
1222 std::cerr << "repeat motif : " << model[MOTIF] << "\n";
1223 std::cerr << "rflank : " << model[RFLANK] << "\n";
1224 std::cerr << "mlen : " << mlen << "\n";
1225 std::cerr << "rflen : " << rflen << "\n";
1226 std::cerr << "plen : " << plen << "\n";
1227 std::cerr << "\n";
1228 std::cerr << "read : " << read << "\n";
1229 std::cerr << "rlen : " << rlen << "\n";
1230 std::cerr << "\n";
1231 std::cerr << "optimal score: " << optimal_score << "\n";
1232 std::cerr << "optimal state: " << state2string(optimal_state) << "\n";
1233 std::cerr << "optimal track: " << track2string(optimal_track) << "\n";
1234 std::cerr << "optimal probe len: " << optimal_probe_len << "\n";
1235 std::cerr << "optimal path length : " << optimal_path_len << "\n";
1236 std::cerr << "max j: " << rlen << "\n";
1237
1238 std::cerr << "probe: " << "(" << lflank_start[MODEL] << "~" << lflank_end[MODEL] << ") "
1239 << "[" << motif_start[MODEL] << "~" << motif_end[MODEL] << "] "
1240 << "(" << rflank_start[MODEL] << "~" << rflank_end[MODEL] << ")\n";
1241 std::cerr << "read : " << "(" << lflank_start[READ] << "~" << lflank_end[READ] << ") "
1242 << "[" << motif_start[READ] << "~" << motif_end[READ] << "] "
1243 << "(" << rflank_start[READ] << "~" << rflank_end[READ] << ")\n";
1244 std::cerr << "\n";
1245 std::cerr << "motif # : " << motif_count << " [" << motif_start[READ] << "," << motif_end[READ] << "]\n";
1246
1247 exact_motif_count = motif_count;
1248 motif_concordance = 0;
1249 for (int32_t k=1; k<=motif_count; ++k)
1250 {
1251 if (motif_discordance[k])
1252 {
1253 --exact_motif_count;
1254 }
1255
1256 if (mlen>=motif_discordance[k])
1257 {
1258 motif_concordance += (float)(mlen-motif_discordance[k]) / mlen;
1259 }
1260 }
1261 motif_concordance *= 1.0/motif_count;
1262 std::cerr << "motif concordance : " << (motif_concordance*100) << "% (" << exact_motif_count << "/" << motif_count << ")\n";
1263 std::cerr << "motif discordance : ";
1264 for (int32_t k=1; k<=motif_count; ++k)
1265 {
1266 std::cerr << motif_discordance[k] << (k==motif_count?"\n":"|");
1267 }
1268 std::cerr << "\n";
1269
1270 //print path
1271 int32_t* path;
1272 path = optimal_path_ptr;
1273 std::cerr << "Model: ";
1274 int32_t t = NULL_TRACK;
1275 int32_t j = 0;
1276 while (path<optimal_path+(MAXLEN<<2))
1277 {
1278 int32_t u = track_get_u(*path);
1279 if (u==M || u==D || u==MR)
1280 {
1281 std::cerr << track_get_base(*path);
1282 }
1283 else
1284 {
1285 std::cerr << '-';
1286 }
1287 ++path;
1288 }
1289 std::cerr << " \n";
1290
1291 std::cerr << " S";
1292 path = optimal_path_ptr;
1293 j=1;
1294 while (path<optimal_path+(MAXLEN<<2))
1295 {
1296 std::cerr << track2cigarstring1(*path,j);
1297 int32_t u = track_get_u(*path);
1298 if (u==Y || u==M || u==I || u==MR)
1299 {
1300 ++j;
1301 }
1302 ++path;
1303 }
1304 std::cerr << "E\n";
1305
1306 path = optimal_path_ptr;
1307 std::cerr << " ";
1308 while (path<optimal_path+(MAXLEN<<2))
1309 {
1310 std::cerr << track2cigarstring2(*path);
1311 ++path;
1312 }
1313 std::cerr << " \n";
1314
1315 path = optimal_path_ptr;
1316 j=1;
1317 std::cerr << "Read: ";
1318 while (path<optimal_path+(MAXLEN<<2))
1319 {
1320 int32_t u = track_get_u(*path);
1321 if (u==Y || u==M || u==I || u==MR)
1322 {
1323 std::cerr << read[j-1];
1324 ++j;
1325 }
1326 else
1327 {
1328 std::cerr << '-';
1329 }
1330 ++path;
1331 }
1332 std::cerr << " \n";
1333 std::cerr << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
1334 };
1335
1336 /**
1337 * Prints a float matrix.
1338 */
print(float * v,size_t plen,size_t rlen)1339 void RFHMM_X::print(float *v, size_t plen, size_t rlen)
1340 {
1341 float val;
1342 std::cerr << std::setprecision(1) << std::fixed;
1343 for (size_t i=0; i<plen; ++i)
1344 {
1345 for (size_t j=0; j<rlen; ++j)
1346 {
1347 val = v[index(i,j)];
1348 std::cerr << (val<0?" ":" ") << val;
1349 }
1350
1351 std::cerr << "\n";
1352 }
1353 };
1354
1355 /**
1356 * Prints a char matrix.
1357 */
print(int32_t * v,size_t plen,size_t rlen)1358 void RFHMM_X::print(int32_t *v, size_t plen, size_t rlen)
1359 {
1360 float val;
1361 std::cerr << std::setprecision(1) << std::fixed << std::setw(6);
1362 for (size_t i=0; i<plen; ++i)
1363 {
1364 for (size_t j=0; j<rlen; ++j)
1365 {
1366 val = v[index(i,j)];
1367 std::cerr << (val<0?" ":" ") << val;
1368 }
1369
1370 std::cerr << "\n";
1371 }
1372 };
1373
1374 /**
1375 * Prints the transition matrix.
1376 */
print_T()1377 void RFHMM_X::print_T()
1378 {
1379 for (size_t j=S; j<=E; ++j)
1380 {
1381 std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << state2string(j);
1382 }
1383 std::cerr << "\n";
1384
1385 for (size_t i=S; i<=E; ++i)
1386 {
1387 for (size_t j=S; j<=E; ++j)
1388 {
1389 if (j)
1390 {
1391 std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1392 }
1393 else
1394 {
1395 std::cerr << state2string(i) << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1396 }
1397 }
1398 std::cerr << "\n";
1399 }
1400 };
1401
1402 /**
1403 * Prints U.
1404 */
print_U(int32_t * U,size_t plen,size_t rlen)1405 void RFHMM_X::print_U(int32_t *U, size_t plen, size_t rlen)
1406 {
1407 std::cerr << std::setprecision(1) << std::fixed;
1408 std::string state;
1409 for (size_t i=0; i<plen; ++i)
1410 {
1411 for (size_t j=0; j<rlen; ++j)
1412 {
1413 int32_t t = U[index(i,j)];
1414 state = state2string(track_get_u(t));
1415 std::cerr << (state.size()==1 ? " " : " ")
1416 << state << "|"
1417 << component2string(track_get_d(t)) << "|"
1418 << track_get_c(t) << "|"
1419 << track_get_p(t) << (j==rlen-1?"\n":" ");
1420 }
1421 }
1422 };
1423
1424 /**
1425 * Prints U and V.
1426 */
print_trace(int32_t state,size_t plen,size_t rlen)1427 void RFHMM_X::print_trace(int32_t state, size_t plen, size_t rlen)
1428 {
1429 std::cerr << std::setprecision(1) << std::fixed;
1430 int32_t *u = U[state];
1431 float *v = V[state];
1432 std::string s;
1433 for (size_t i=0; i<plen; ++i)
1434 {
1435 for (size_t j=0; j<rlen; ++j)
1436 {
1437 int32_t t = u[index(i,j)];
1438 s = state2string(track_get_u(t));
1439 std::cerr << (s.size()==1 ? " " : " ")
1440 << s << "|"
1441 << component2string(track_get_d(t)) << "|"
1442 << track_get_c(t) << "|"
1443 << track_get_p(t) << "|"
1444 << v[index(i,j)];
1445 }
1446
1447 std::cerr << "\n";
1448 }
1449 };
1450
1451 /**
1452 * Returns a string representation of track.
1453 */
track2string(int32_t t)1454 std::string RFHMM_X::track2string(int32_t t)
1455 {
1456 kstring_t str = {0,0,0};
1457
1458 kputs(state2string(track_get_u(t)).c_str(), &str);
1459 kputc('|', &str);
1460 kputs(component2string(track_get_d(t)).c_str(), &str);
1461 kputc('|', &str);
1462 kputw(track_get_c(t), &str);
1463 kputc('|', &str);
1464 kputw(track_get_p(t), &str);
1465
1466 std::string s(str.s);
1467 if (str.m) free(str.s);
1468
1469 return s;
1470 }
1471
1472 /**
1473 * Prints track.
1474 */
print_track(int32_t t)1475 void RFHMM_X::print_track(int32_t t)
1476 {
1477 std::cerr << track2string(t) << "\n";
1478 }
1479
1480 #undef MAXLEN
1481 #undef MAXLEN_NBITS
1482 #undef S
1483 #undef Y
1484 #undef M
1485 #undef I
1486 #undef D
1487 #undef MR
1488 #undef E
1489 #undef N
1490 #undef TBD
1491 #undef NSTATES
1492
1493 #undef LFLANK
1494 #undef MOTIF
1495 #undef RFLANK
1496 #undef UNMODELED
1497 #undef UNCERTAIN
1498
1499 #undef READ
1500 #undef MODEL
1501 #undef MATCH
1502
1503 #undef index
1504 #undef track_get_u
1505 #undef track_get_d
1506 #undef track_get_d
1507 #undef track_get_c
1508 #undef track_get_p
1509 #undef track_get_base
1510 #undef track_valid
1511 #undef track_set_u
1512 #undef track_set_d
1513 #undef track_set_c
1514 #undef track_set_p
1515 #undef make_track
1516
1517 #undef NULL_TRACK
1518 #undef START_TRACK