1 /* The MIT License
2
3 Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "ahmm.h"
25
26 #define MAXLEN 1024
27 #define MAXLEN_NBITS 10
28
29 #define S 0
30 #define M 1
31 #define D 2
32 #define I 3
33 #define Z 4
34 #define E 5
35 #define N 6
36 #define TBD 7
37 #define NSTATES 6
38
39 #define LFLANK 0
40 #define MOTIF 1
41 #define RFLANK 2
42 #define UNMODELED 3
43 #define UNCERTAIN 4
44
45 //match type
46 #define MODEL 0
47 #define READ 1
48 #define MATCH 2
49
50 /*for indexing single array*/
51 #define index(i,j) (((i)<<MAXLEN_NBITS)+(j))
52
53 /*functions for getting trace back information*/
54 #define track_get_u(t) (((t)&0xF8000000)>>27)
55 #define track_get_d(t) (((t)&0x07000000)>>24)
56 #define track_get_c(t) (((t)&0x00FFF000)>>12)
57 #define track_get_p(t) (((t)&0x00000FFF))
58 #define track_get_base(t) (model[track_get_d(t)][track_get_p(t)-1])
59 #define track_valid(t) ((track_get_d(t)==RFLANK||track_get_d(t)==MOTIF||track_get_d(t)==LFLANK)&&track_get_p(t)!=0)
60 #define track_set_u(t,u) (((t)&0x07FFFFFF)|((u)<<27))
61 #define track_set_d(t,d) (((t)&0xF8FFFFFF)|((d)<<24))
62 #define track_set_c(t,c) (((t)&0xFF000FFF)|((c)<<12))
63 #define track_set_p(t,p) (((t)&0xFFFFF000)|(p))
64 #define make_track(u,d,c,p) (((u)<<27)|((d)<<24)|((c)<<12)|(p))
65
66 //[N|!|0|0]
67 #define NULL_TRACK 0x7C000000
68 //[N|l|0|0]
69 #define START_TRACK 0x78000000
70
71 /**
72 * Constructor.
73 */
AHMM(bool debug)74 AHMM::AHMM(bool debug)
75 {
76 this->debug = debug;
77 initialize();
78 };
79
80 /**
81 * Destructor.
82 */
~AHMM()83 AHMM::~AHMM()
84 {
85 delete optimal_path;
86
87 for (size_t state=S; state<=E; ++state)
88 {
89 delete V[state];
90 delete U[state];
91 }
92
93 delete V;
94 delete U;
95 };
96
97 /**
98 * Initializes objects; helper function for constructor.
99 */
initialize()100 void AHMM::initialize()
101 {
102 initialize_structures();
103 initialize_T();
104 initialize_UV();
105 };
106
107 /**
108 * Initializes objects for constructor.
109 */
initialize_structures()110 void AHMM::initialize_structures()
111 {
112 max_len = MAXLEN;
113
114 model = new char*[3];
115 model[LFLANK] = NULL;
116 model[MOTIF] = NULL;
117 model[RFLANK] = NULL;
118
119 lflen = 0;
120 mlen = 0;
121
122 motif_discordance = new int32_t[MAXLEN];
123 optimal_path = new int32_t[MAXLEN<<2];
124 optimal_path_traced = false;
125
126 typedef int32_t (AHMM::*move) (int32_t t, int32_t j);
127 V = new float*[NSTATES];
128 U = new int32_t*[NSTATES];
129 moves = new move*[NSTATES];
130 for (size_t state=S; state<=E; ++state)
131 {
132 V[state] = new float[MAXLEN*MAXLEN];
133 U[state] = new int32_t[MAXLEN*MAXLEN];
134 moves[state] = new move[NSTATES];
135 }
136
137 for (size_t state=S; state<=E; ++state)
138 {
139 moves[state] = new move[NSTATES];
140 }
141
142 moves[S][M] = &AHMM::move_S_M;
143 moves[M][M] = &AHMM::move_M_M;
144 moves[D][M] = &AHMM::move_D_M;
145 moves[I][M] = &AHMM::move_I_M;
146 moves[S][D] = &AHMM::move_S_D;
147 moves[M][D] = &AHMM::move_M_D;
148 moves[D][D] = &AHMM::move_D_D;
149 moves[S][I] = &AHMM::move_S_I;
150 moves[M][I] = &AHMM::move_M_I;
151 moves[I][I] = &AHMM::move_I_I;
152 };
153
154 /**
155 * Initialize transition matrix based on parameters.
156 */
initialize_T()157 void AHMM::initialize_T()
158 {
159 float delta = par.delta;
160 float epsilon = par.epsilon;
161 float tau = par.tau;
162 float eta = par.eta;
163
164 for (size_t i=S; i<=E; ++i)
165 {
166 for (size_t j=S; j<=E; ++j)
167 {
168 T[i][j] = -INFINITY;
169 }
170 }
171
172 T[S][M] = log10(((1-delta))/((1-eta)*(1-eta)));
173 T[M][M] = log10(((1-2*delta-tau))/((1-eta)*(1-eta)));
174 T[D][M] = log10(((1-epsilon-tau))/((1-eta)*(1-eta)));
175 T[I][M] = T[D][M];
176
177 T[S][D] = log10(delta/(1-eta));
178 T[M][D] = log10(delta/(1-eta));
179 T[D][D] = log10(epsilon/(1-eta));;
180
181 T[M][I] = T[M][D];
182 T[I][I] = T[D][D];
183 };
184
185 /**
186 * Initializes U and V.
187 */
initialize_UV()188 void AHMM::initialize_UV()
189 {
190 for (size_t i=0; i<MAXLEN; ++i)
191 {
192 for (size_t j=0; j<MAXLEN; ++j)
193 {
194 size_t c = index(i,j);
195
196 V[S][c] = -INFINITY;
197 U[S][c] = NULL_TRACK;
198
199 V[M][c] = -INFINITY;
200 if (!i || !j)
201 {
202 U[M][c] = make_track(N,UNMODELED,0,0);
203 }
204 else
205 {
206 U[M][c] = make_track(TBD,UNCERTAIN,0,0);
207 }
208
209 V[D][c] = -INFINITY;
210 if (!i || !j)
211 {
212 U[D][c] = make_track(N,UNMODELED,0,0);
213 }
214 else
215 {
216 U[D][c] = make_track(TBD,UNCERTAIN,0,0);
217 }
218
219 V[I][c] = -INFINITY;
220 if (!i || !j)
221 {
222 U[I][c] = make_track(N,UNMODELED,0,0);
223 }
224 else
225 {
226 U[I][c] = make_track(TBD,UNCERTAIN,0,0);
227 }
228 }
229 }
230
231 V[S][index(0,0)] = 0;
232 U[S][index(0,0)] = START_TRACK;
233
234 V[M][index(0,0)] = -INFINITY;
235 };
236
237 /**
238 * Sets a model.
239 */
set_model(const char * motif)240 void AHMM::set_model(const char* motif)
241 {
242 if (model[MOTIF]) free(model[MOTIF]);
243
244 model[MOTIF] = strdup(motif);
245
246 mlen = strlen(model[MOTIF]);
247 }
248
249 /**
250 * Sets delta.
251 */
set_delta(float delta)252 void AHMM::set_delta(float delta)
253 {
254 par.delta = delta;
255 }
256
257 /**
258 * Sets epsilon.
259 */
set_epsilon(float epsilon)260 void AHMM::set_epsilon(float epsilon)
261 {
262 par.epsilon = epsilon;
263 }
264
265 /**
266 * Sets tau.
267 */
set_tau(float tau)268 void AHMM::set_tau(float tau)
269 {
270 par.tau = tau;
271 }
272
273 /**
274 * Sets eta.
275 */
set_eta(float eta)276 void AHMM::set_eta(float eta)
277 {
278 par.eta = eta;
279 }
280
281 /**
282 * Sets mismatch penalty.
283 */
set_mismatch_penalty(float mismatch_penalty)284 void AHMM::set_mismatch_penalty(float mismatch_penalty)
285 {
286 par.mismatch_penalty = mismatch_penalty;
287 }
288
289 /**
290 * Sets debug.
291 */
set_debug(bool debug)292 void AHMM::set_debug(bool debug)
293 {
294 this->debug = debug;
295 }
296
297 /**
298 * Get left flank start position for model.
299 */
get_lflank_model_spos1()300 int32_t AHMM::get_lflank_model_spos1()
301 {
302 return lflank_start[MODEL];
303 };
304
305 /**
306 * Get left flank end position for model.
307 */
get_lflank_model_epos1()308 int32_t AHMM::get_lflank_model_epos1()
309 {
310 return lflank_end[MODEL];
311 };
312
313 /**
314 * Get motif start position for model.
315 */
get_motif_model_spos1()316 int32_t AHMM::get_motif_model_spos1()
317 {
318 return motif_start[MODEL];
319 };
320
321 /**
322 * Get motif end position for model.
323 */
get_motif_model_epos1()324 int32_t AHMM::get_motif_model_epos1()
325 {
326 return motif_end[MODEL];
327 };
328
329 /**
330 * Get right flank start position for model.
331 */
get_rflank_model_spos1()332 int32_t AHMM::get_rflank_model_spos1()
333 {
334 return rflank_start[MODEL];
335 };
336
337 /**
338 * Get right flank end position for model.
339 */
get_rflank_model_epos1()340 int32_t AHMM::get_rflank_model_epos1()
341 {
342 return rflank_end[MODEL];
343 };
344
345 /**
346 * Get left flank start position for read.
347 */
get_lflank_read_spos1()348 int32_t AHMM::get_lflank_read_spos1()
349 {
350 return lflank_start[READ];
351 };
352
353 /**
354 * Get left flank end position for read.
355 */
get_lflank_read_epos1()356 int32_t AHMM::get_lflank_read_epos1()
357 {
358 return lflank_end[READ];
359 };
360
361 /**
362 * Get motif start position for read.
363 */
get_motif_read_spos1()364 int32_t AHMM::get_motif_read_spos1()
365 {
366 return motif_start[READ];
367 };
368
369 /**
370 * Get motif end position for read.
371 */
get_motif_read_epos1()372 int32_t AHMM::get_motif_read_epos1()
373 {
374 return motif_end[READ];
375 };
376
377 /**
378 * Get right flank start position for read.
379 */
get_rflank_read_spos1()380 int32_t AHMM::get_rflank_read_spos1()
381 {
382 return rflank_start[READ];
383 };
384
385 /**
386 * Get right flank end position for read.
387 */
get_rflank_read_epos1()388 int32_t AHMM::get_rflank_read_epos1()
389 {
390 return rflank_end[READ];
391 };
392
393 /**
394 * Get motif concordance.
395 */
get_motif_concordance()396 float AHMM::get_motif_concordance()
397 {
398 return motif_concordance;
399 };
400
401 /**
402 * Get exact motif count.
403 */
get_exact_motif_count()404 uint32_t AHMM::get_exact_motif_count()
405 {
406 return exact_motif_count;
407 };
408
409 /**
410 * Get motif count.
411 */
get_motif_count()412 uint32_t AHMM::get_motif_count()
413 {
414 return motif_count;
415 };
416
417 /**
418 * Computes the score associated with the move from A to B
419 * Updates the max_score and associated max_track.
420 *
421 * @A - start state
422 * @B - end state
423 * @index1 - flattened index of the one dimensional array of start state
424 * @j - 1 based position of read of start state
425 * @m - base match required (MATCH, MODEL, READ)
426 */
proc_comp(int32_t A,int32_t B,int32_t index1,int32_t j,int32_t match_type)427 void AHMM::proc_comp(int32_t A, int32_t B, int32_t index1, int32_t j, int32_t match_type)
428 {
429 float emission = 0, score = 0, valid = 0;
430
431 //t is the new track
432 int32_t t = (this->*(moves[A][B]))(U[A][index1],j);
433
434 if (t==NULL_TRACK)
435 {
436 valid = -INFINITY;
437 }
438 else if (match_type==MATCH)
439 {
440 emission = log10_emission_odds(track_get_base(t), read[j], qual[j]-33);
441 }
442
443 score = V[A][index1] + T[A][B] + emission + valid;
444
445 if (score>max_score)
446 {
447 max_score = score;
448 max_track = t;
449 }
450
451 if (debug)
452 {
453 std::cerr << "\t" << state2string(A) << "=>" << state2string(B);
454 std::cerr << " (" << ((index1-j)>>MAXLEN_NBITS) << "," << j << ") ";
455 std::cerr << track2string(U[A][index1]) << "=>";
456 std::cerr << track2string(t) << " ";
457 std::cerr << emission << " (e: " << (track_get_d(t)<=MOTIF?track_get_base(t):'N') << " vs " << (j!=rlen?read[j]:'N') << ") + ";
458 std::cerr << T[A][B] << " (t) + ";
459 std::cerr << V[A][index1] << " (p) + ";
460 std::cerr << valid << " (v) = ";
461 std::cerr << score << "\n";
462 }
463 }
464
465 /**
466 * Align read against model.
467 */
align(const char * read,const char * qual)468 void AHMM::align(const char* read, const char* qual)
469 {
470 clear_statistics();
471 optimal_path_traced = false;
472 this->read = read;
473 this->qual = qual;
474 rlen = strlen(read);
475
476 if (rlen>=MAXLEN)
477 {
478 fprintf(stderr, "[%s:%d %s] Sequence to be aligned is greater than %d currently supported, subsetting string to first 1023 characters: %d\n", __FILE__, __LINE__, __FUNCTION__, MAXLEN, rlen);
479 rlen = 1023;
480 }
481 plen = rlen;
482
483 float max = 0;
484 char maxPath = 'X';
485
486 size_t c,d,u,l;
487
488 //alignment
489 //take into consideration
490 for (size_t i=1; i<=plen; ++i)
491 {
492 //break;
493
494 for (size_t j=1; j<=rlen; ++j)
495 {
496 c = index(i,j);
497 d = index(i-1,j-1);
498 u = index(i-1,j);
499 l = index(i,j-1);
500
501 /////
502 //M//
503 /////
504 //only need to update this i>rflen
505 max_score = -INFINITY;
506 max_track = NULL_TRACK;
507 proc_comp(S, M, d, j-1, MATCH);
508 proc_comp(M, M, d, j-1, MATCH);
509 proc_comp(D, M, d, j-1, MATCH);
510 proc_comp(I, M, d, j-1, MATCH);
511 V[M][c] = max_score;
512 U[M][c] = max_track;
513 if (debug) std::cerr << "\tset M " << max_score << " - " << track2string(max_track) << "\n";
514
515 /////
516 //D//
517 /////
518 max_score = -INFINITY;
519 max_track = NULL_TRACK;
520 proc_comp(S, D, u, j, MODEL);
521 proc_comp(M, D, u, j, MODEL);
522 proc_comp(D, D, u, j, MODEL);
523 V[D][c] = max_score;
524 U[D][c] = max_track;
525 if (debug) std::cerr << "\tset D " << max_score << " - " << track2string(max_track) << "\n";
526
527 /////
528 //I//
529 /////
530 max_score = -INFINITY;
531 max_track = NULL_TRACK;
532 proc_comp(S, I, l, j-1, READ);
533 proc_comp(M, I, l, j-1, READ);
534 proc_comp(I, I, l, j-1, READ);
535 V[I][c] = max_score;
536 U[I][c] = max_track;
537 if (debug) std::cerr << "\tset I " << max_score << " - " << track2string(max_track) << "\n";
538 }
539 }
540
541 if (debug)
542 {
543 std::cerr << "\n =V[S]=\n";
544 print(V[S], plen+1, rlen+1);
545 std::cerr << "\n =U[S]=\n";
546 print_U(U[S], plen+1, rlen+1);
547
548 std::cerr << "\n =V[M]=\n";
549 print(V[M], plen+1, rlen+1);
550 std::cerr << "\n =U[M]=\n";
551 print_U(U[M], plen+1, rlen+1);
552 std::cerr << "\n =V[D]=\n";
553 print(V[D], plen+1, rlen+1);
554 std::cerr << "\n =U[D]=\n";
555 print_U(U[D], plen+1, rlen+1);
556 std::cerr << "\n =V[I]=\n";
557 print(V[I], plen+1, rlen+1);
558 std::cerr << "\n =U[I]=\n";
559 print_U(U[I], plen+1, rlen+1);
560
561 std::cerr << "\n";
562 std::cerr << "\n";
563
564 print_T();
565 }
566
567 trace_path();
568
569 exact_motif_count = motif_count;
570 motif_concordance = 0;
571 for (int32_t k=1; k<=motif_count; ++k)
572 {
573 if (motif_discordance[k])
574 {
575 --exact_motif_count;
576 }
577
578 if (mlen>=motif_discordance[k])
579 {
580 motif_concordance += (float)(mlen-motif_discordance[k]) / mlen;
581 }
582 }
583 motif_concordance /= motif_count;
584
585
586
587 };
588
589 /**
590 * Trace path after alignment.
591 */
trace_path()592 void AHMM::trace_path()
593 {
594 //search for a complete path in M
595 size_t c;
596 optimal_score = -INFINITY;
597 optimal_track = NULL_TRACK;
598 optimal_state = TBD;
599 optimal_probe_len = 0;
600 trf_score = 0;
601 for (size_t i=lflen; i<=plen; ++i)
602 {
603 c = index(i,rlen);
604
605 if (V[M][c]>=optimal_score)
606 {
607 optimal_score = V[M][c];
608 optimal_track = U[M][c];
609 optimal_state = M;
610 optimal_probe_len = i;
611 }
612
613 if (V[D][c]>=optimal_score)
614 {
615 optimal_score = V[D][c];
616 optimal_track = U[D][c];
617 optimal_state = D;
618 optimal_probe_len = i;
619 }
620
621 // if (V[I][c]>=optimal_score)
622 // {
623 // optimal_score = V[I][c];
624 // optimal_track = U[I][c];
625 // optimal_state = I;
626 // optimal_probe_len = i;
627 // }
628 }
629
630 //trace path
631 optimal_path_ptr = optimal_path+(MAXLEN<<2)-1;
632 int32_t i = optimal_probe_len, j = rlen;
633 int32_t last_t = make_track(optimal_state, UNMODELED, 0, 0); //dummy end track for E
634 optimal_path_len = 0;
635 int32_t u;
636 int32_t des_t, src_t = make_track(E, UNMODELED, 0, 0);
637
638 do
639 {
640 u = track_get_u(last_t);
641 last_t = U[u][index(i,j)];
642 *optimal_path_ptr = track_set_u(last_t, u);
643
644 des_t = *optimal_path_ptr;
645 collect_statistics(src_t, des_t, j);
646 if (debug) std::cerr << track2string(src_t) << " (" << i << "," << j << ") => " << track2string(des_t) << " : " << track2string(last_t) << "\n";
647 src_t = des_t;
648
649 if (u==M)
650 {
651 --i; --j;
652 }
653 else if (u==D)
654 {
655 --i;
656 }
657 else if (u==I || u==Z)
658 {
659 --j;
660 }
661
662 --optimal_path_ptr;
663 ++optimal_path_len;
664
665 } while (track_get_u(last_t)!=S);
666
667 collect_statistics(src_t, last_t, j);
668
669 ++optimal_path_ptr;
670 optimal_path_traced = true;
671 };
672
673 /**
674 * Collect alignment summary statistics.
675 */
collect_statistics(int32_t src_t,int32_t des_t,int32_t j)676 void AHMM::collect_statistics(int32_t src_t, int32_t des_t, int32_t j)
677 {
678 //std::cerr << "\t " << track2string(src_t) << " (" << j << ") => " << track2string(des_t) << "\n";
679
680 int32_t src_u = track_get_u(src_t);
681 int32_t des_u = track_get_u(des_t);
682
683 if (src_u==E)
684 {
685 if (des_u==M || des_u==D || des_u==I)
686 {
687 rflank_start[MODEL] = NAN;
688 rflank_start[READ] = NAN;
689 rflank_end[MODEL] = INFINITY;
690 rflank_end[READ] = INFINITY;
691
692 motif_end[MODEL] = track_get_c(des_t);
693 motif_count = track_get_c(des_t);
694 last_motif_pos = track_get_p(des_t);
695 motif_end[READ] = j;
696
697 //initialize array for tracking inexact repeats
698 for (int32_t k=1; k<=motif_count; ++k)
699 {
700 motif_discordance[k] = 0;
701 }
702
703 if (des_u==D || track_get_base(des_t)!=read[j-1])
704 {
705 ++motif_discordance[motif_count];
706 }
707 }
708 }
709
710 if (des_u==M)
711 {
712 if (track_get_base(des_t)!=read[j-1])
713 {
714 trf_score -= 7;
715 ++motif_discordance[track_get_c(des_t)];
716 }
717 else
718 {
719 trf_score += 2;
720 }
721 }
722
723 if (des_u==D || des_u==I)
724 {
725 trf_score -= 7;
726 ++motif_discordance[track_get_c(des_t)];
727 }
728
729 frac_no_repeats = motif_count - (mlen-last_motif_pos)/((float)mlen);
730 };
731
732 /**
733 * Clear alignment statistics.
734 */
clear_statistics()735 void AHMM::clear_statistics()
736 {
737 lflank_start[MODEL] = NAN;
738 lflank_start[READ] = NAN;
739 lflank_end[MODEL] = NAN;
740 lflank_end[READ] = NAN;
741 motif_start[MODEL] = NAN;
742 motif_start[READ] = NAN;
743 motif_end[MODEL] = NAN;
744 motif_end[READ] = NAN;
745 motif_count = NAN;
746 exact_motif_count = NAN;
747 motif_m = NAN;
748 motif_xid = NAN;
749 motif_concordance = NAN;
750 rflank_start[MODEL] = NAN;
751 rflank_start[READ] = NAN;
752 rflank_end[MODEL] = NAN;
753 rflank_end[READ] = NAN;
754 }
755
756 /**
757 * Update alignment statistics after collection.
758 */
update_statistics()759 void AHMM::update_statistics()
760 {
761 motif_concordance = (float)motif_m/(motif_m+motif_xid);
762 }
763
764 /**
765 * Returns true if flanks are mapped.
766 */
flanks_are_mapped()767 bool AHMM::flanks_are_mapped()
768 {
769 return lflank_end[MODEL]==lflen;
770 }
771
772
773 /**
774 * Compute log10 emission odds based on equal error probability distribution.
775 */
log10_emission_odds(char probe_base,char read_base,uint32_t pl,float mismatch_penalty)776 float AHMM::log10_emission_odds(char probe_base, char read_base, uint32_t pl, float mismatch_penalty)
777 {
778 // if (read_base=='N' || probe_base=='N')
779 // {
780 // return -INFINITY; //is this appropriate in this case?
781 // }
782
783 if (read_base!=probe_base)
784 {
785 return LogTool::pl2log10_varp(pl);
786 }
787 else //match
788 {
789 return -(LogTool::pl2log10_varp(pl)-mismatch_penalty);
790 }
791 };
792
793 /**
794 * Compute log10 emission odds based on equal error probability distribution.
795 */
log10_emission_odds(char probe_base,char read_base,uint32_t pl)796 float AHMM::log10_emission_odds(char probe_base, char read_base, uint32_t pl)
797 {
798 // if (read_base=='N' || probe_base=='N')
799 // {
800 // return -INFINITY; //is this appropriate in this case?
801 // }
802
803 if (read_base!=probe_base)
804 {
805 return LogTool::pl2log10_varp(pl)-par.mismatch_penalty;
806 }
807 else //match
808 {
809 return -(LogTool::pl2log10_varp(pl));
810 }
811 };
812
813 /**
814 * Converts state to string representation.
815 */
state2string(int32_t state)816 std::string AHMM::state2string(int32_t state)
817 {
818 if (state==S)
819 {
820 return "S";
821 }
822 else if (state==M)
823 {
824 return "M";
825 }
826 else if (state==D)
827 {
828 return "D";
829 }
830 else if (state==I)
831 {
832 return "I";
833 }
834 else if (state==E)
835 {
836 return "E";
837 }
838 else if (state==N)
839 {
840 return "N";
841 }
842 else if (state==TBD)
843 {
844 return "*";
845 }
846 else
847 {
848 return "!";
849 }
850 }
851
852 /**
853 * Converts state to cigar string representation.
854 */
state2cigarstring(int32_t state)855 std::string AHMM::state2cigarstring(int32_t state)
856 {
857 if (state==S)
858 {
859 return "S";
860 }
861 else if (state==M)
862 {
863 return "M";
864 }
865 else if (state==D)
866 {
867 return "D";
868 }
869 else if (state==I)
870 {
871 return "I";
872 }
873 else if (state==E)
874 {
875 return "E";
876 }
877 else if (state==N)
878 {
879 return "N";
880 }
881 else if (state==TBD)
882 {
883 return "*";
884 }
885 else
886 {
887 return "!";
888 }
889 }
890
891 /**
892 * Converts state to cigar string representation.
893 */
track2cigarstring1(int32_t t,int32_t j)894 std::string AHMM::track2cigarstring1(int32_t t, int32_t j)
895 {
896 int32_t state = track_get_u(t);
897
898 if (state==S)
899 {
900 return "S";
901 }
902 else if (state==M)
903 {
904 if (track_get_base(t)==read[j-1])
905 {
906 return "M";
907 }
908 else
909 {
910 return "*";
911 }
912 }
913 else if (state==D)
914 {
915 return "D";
916 }
917 else if (state==I)
918 {
919 return "I";
920 }
921 else if (state==Z)
922 {
923 return "Z";
924 }
925 else if (state==E)
926 {
927 return "E";
928 }
929 else if (state==N)
930 {
931 return "N";
932 }
933 else if (state==TBD)
934 {
935 return "*";
936 }
937 else
938 {
939 return "!";
940 }
941 }
942
943 /**
944 * Converts track to cigar string representation.
945 */
track2cigarstring2(int32_t t)946 std::string AHMM::track2cigarstring2(int32_t t)
947 {
948 int32_t state = track_get_u(t);
949
950 if (state==M)
951 {
952 return (track_get_c(t)%2==0?"+":"o");
953 }
954 else if (state==D)
955 {
956 return (track_get_c(t)%2==0?"+":"o");
957 }
958 else if (state==I)
959 {
960 return (track_get_c(t)%2==0?"+":"o");
961 }
962 else
963 {
964 return " ";
965 }
966 }
967
968 /**
969 * Converts model component to string representation.
970 */
component2string(int32_t component)971 std::string AHMM::component2string(int32_t component)
972 {
973 if (component==LFLANK)
974 {
975 return "l";
976 }
977 else if (component==MOTIF)
978 {
979 return "m";
980 }
981 else if (component==RFLANK)
982 {
983 return "r";
984 }
985 else if (component==UNMODELED)
986 {
987 return "!";
988 }
989 else if (component==READ)
990 {
991 return "s";
992 }
993 else if (component==UNCERTAIN)
994 {
995 return "?";
996 }
997 else
998 {
999 return "!";
1000 }
1001 }
1002
1003 /**
1004 * Prints an alignment.
1005 */
print_alignment()1006 void AHMM::print_alignment()
1007 {
1008 std::string pad = "\t";
1009 print_alignment(pad);
1010 };
1011
1012 /**
1013 * Prints an alignment with padding.
1014 */
print_alignment(std::string & pad)1015 void AHMM::print_alignment(std::string& pad)
1016 {
1017 if (!optimal_path_traced)
1018 {
1019 std::cerr << "path not traced\n";
1020 }
1021
1022 std::cerr << "=================================\n";
1023 std::cerr << "AHMM\n";
1024 std::cerr << "*********************************\n";
1025 std::cerr << "repeat motif : " << model[MOTIF] << "\n";
1026 std::cerr << "lflen : " << lflen << "\n";
1027 std::cerr << "mlen : " << mlen << "\n";
1028 std::cerr << "plen : " << plen << "\n";
1029 std::cerr << "\n";
1030 std::cerr << "read : " << read << "\n";
1031 std::cerr << "rlen : " << rlen << "\n";
1032 std::cerr << "\n";
1033 std::cerr << "optimal score: " << optimal_score << "\n";
1034 std::cerr << "optimal state: " << state2string(optimal_state) << "\n";
1035 std::cerr << "optimal track: " << track2string(optimal_track) << "\n";
1036 std::cerr << "optimal probe len: " << optimal_probe_len << "\n";
1037 std::cerr << "optimal path length : " << optimal_path_len << "\n";
1038 std::cerr << "max j: " << rlen << "\n";
1039 std::cerr << "mismatch penalty: " << par.mismatch_penalty << "\n";
1040 std::cerr << "\n";
1041
1042 std::cerr << "model: " << "(" << lflank_start[MODEL] << "~" << lflank_end[MODEL] << ") "
1043 << "[" << motif_start[MODEL] << "~" << motif_end[MODEL] << "]\n";
1044 std::cerr << "read : " << "(" << lflank_start[READ] << "~" << lflank_end[READ] << ") "
1045 << "[" << motif_start[READ] << "~" << motif_end[READ] << "]"
1046 << "[" << rflank_start[READ] << "~" << rflank_end[READ] << "]\n";
1047 std::cerr << "\n";
1048 std::cerr << "motif # : " << motif_count << " [" << motif_start[READ] << "," << motif_end[READ] << "]\n";
1049 std::cerr << "motif concordance : " << motif_concordance << "% (" << exact_motif_count << "/" << motif_count << ")\n";
1050 std::cerr << "last motif position : " << last_motif_pos << "\n";
1051 std::cerr << "motif discordance : ";
1052 for (int32_t k=1; k<=motif_count; ++k)
1053 {
1054 std::cerr << motif_discordance[k] << (k==motif_count?"\n":"|");
1055 }
1056 std::cerr << "fractional no. repeat units : " << frac_no_repeats << "\n";
1057 std::cerr << "repeat tract length : " << rlen << "\n";
1058 std::cerr << "TRF Score : " << trf_score << "\n";
1059
1060 std::cerr << "\n";
1061
1062 //print path
1063 int32_t* path;
1064 path = optimal_path_ptr;
1065 std::cerr << "Model: ";
1066 int32_t t = NULL_TRACK;
1067 int32_t j = 0;
1068 while (path<optimal_path+(MAXLEN<<2))
1069 {
1070 int32_t u = track_get_u(*path);
1071 if (u==M || u==D)
1072 {
1073 std::cerr << track_get_base(*path);
1074 }
1075 else
1076 {
1077 std::cerr << '-';
1078 }
1079 ++path;
1080 }
1081 std::cerr << " \n";
1082
1083 std::cerr << " S";
1084 path = optimal_path_ptr;
1085 j=1;
1086 while (path<optimal_path+(MAXLEN<<2))
1087 {
1088 std::cerr << track2cigarstring1(*path,j);
1089 int32_t u = track_get_u(*path);
1090 if (u==M || u==I || u==Z)
1091 {
1092 ++j;
1093 }
1094 ++path;
1095 }
1096 std::cerr << "E\n";
1097
1098 path = optimal_path_ptr;
1099 std::cerr << " ";
1100 while (path<optimal_path+(MAXLEN<<2))
1101 {
1102 std::cerr << track2cigarstring2(*path);
1103 ++path;
1104 }
1105 std::cerr << " \n";
1106
1107 path = optimal_path_ptr;
1108 j=1;
1109 std::cerr << "Read: ";
1110 while (path<optimal_path+(MAXLEN<<2))
1111 {
1112 int32_t u = track_get_u(*path);
1113 if (u==M || u==I || u==Z)
1114 {
1115 std::cerr << read[j-1];
1116 ++j;
1117 }
1118 else
1119 {
1120 std::cerr << '-';
1121 }
1122 ++path;
1123 }
1124 std::cerr << " \n";
1125 std::cerr << "=================================\n";
1126 };
1127
1128 /**
1129 * Prints a float matrix.
1130 */
print(float * v,size_t plen,size_t rlen)1131 void AHMM::print(float *v, size_t plen, size_t rlen)
1132 {
1133 float val;
1134 std::cerr << std::setprecision(1) << std::fixed;
1135 for (size_t i=0; i<plen; ++i)
1136 {
1137 for (size_t j=0; j<rlen; ++j)
1138 {
1139 val = v[index(i,j)];
1140 std::cerr << (val<0?" ":" ") << val;
1141 }
1142
1143 std::cerr << "\n";
1144 }
1145 };
1146
1147 /**
1148 * Prints a char matrix.
1149 */
print(int32_t * v,size_t plen,size_t rlen)1150 void AHMM::print(int32_t *v, size_t plen, size_t rlen)
1151 {
1152 float val;
1153 std::cerr << std::setprecision(1) << std::fixed << std::setw(6);
1154 for (size_t i=0; i<plen; ++i)
1155 {
1156 for (size_t j=0; j<rlen; ++j)
1157 {
1158 val = v[index(i,j)];
1159 std::cerr << (val<0?" ":" ") << val;
1160 }
1161
1162 std::cerr << "\n";
1163 }
1164 };
1165
1166 /**
1167 * Prints the transition matrix.
1168 */
print_T()1169 void AHMM::print_T()
1170 {
1171 std::cerr << "\t";
1172 for (size_t j=S; j<=Z; ++j)
1173 {
1174 std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << state2string(j);
1175 }
1176 std::cerr << "\n";
1177
1178 for (size_t i=S; i<=Z; ++i)
1179 {
1180 std::cerr << "\t";
1181 for (size_t j=S; j<=Z; ++j)
1182 {
1183 if (j)
1184 {
1185 std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1186 }
1187 else
1188 {
1189 std::cerr << state2string(i) << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1190 }
1191 }
1192 std::cerr << "\n";
1193 }
1194 std::cerr << "\n";
1195 };
1196
1197 /**
1198 * Prints U.
1199 */
print_U(int32_t * U,size_t plen,size_t rlen)1200 void AHMM::print_U(int32_t *U, size_t plen, size_t rlen)
1201 {
1202 std::cerr << std::setprecision(1) << std::fixed;
1203 std::string state;
1204 for (size_t i=0; i<plen; ++i)
1205 {
1206 for (size_t j=0; j<rlen; ++j)
1207 {
1208 int32_t t = U[index(i,j)];
1209 state = state2string(track_get_u(t));
1210 std::cerr << (state.size()==1 ? " " : " ")
1211 << state << "|"
1212 << component2string(track_get_d(t)) << "|"
1213 << track_get_c(t) << "|"
1214 << track_get_p(t) << (j==rlen-1?"\n":" ");
1215 }
1216 }
1217 };
1218
1219 /**
1220 * Prints U and V.
1221 */
print_trace(int32_t state,size_t plen,size_t rlen)1222 void AHMM::print_trace(int32_t state, size_t plen, size_t rlen)
1223 {
1224 std::cerr << std::setprecision(1) << std::fixed;
1225 int32_t *u = U[state];
1226 float *v = V[state];
1227 std::string s;
1228 for (size_t i=0; i<plen; ++i)
1229 {
1230 for (size_t j=0; j<rlen; ++j)
1231 {
1232 int32_t t = u[index(i,j)];
1233 s = state2string(track_get_u(t));
1234 std::cerr << (s.size()==1 ? " " : " ")
1235 << s << "|"
1236 << component2string(track_get_d(t)) << "|"
1237 << track_get_c(t) << "|"
1238 << track_get_p(t) << "|"
1239 << v[index(i,j)];
1240 }
1241
1242 std::cerr << "\n";
1243 }
1244 };
1245
1246 /**
1247 * Returns a string representation of track.
1248 */
track2string(int32_t t)1249 std::string AHMM::track2string(int32_t t)
1250 {
1251 kstring_t str = {0,0,0};
1252
1253 kputs(state2string(track_get_u(t)).c_str(), &str);
1254 kputc('|', &str);
1255 kputs(component2string(track_get_d(t)).c_str(), &str);
1256 kputc('|', &str);
1257 kputw(track_get_c(t), &str);
1258 kputc('|', &str);
1259 kputw(track_get_p(t), &str);
1260
1261 std::string s(str.s);
1262 if (str.m) free(str.s);
1263
1264 return s;
1265 }
1266
1267 /**
1268 * Prints track.
1269 */
print_track(int32_t t)1270 void AHMM::print_track(int32_t t)
1271 {
1272 std::cerr << track2string(t) << "\n";
1273 }
1274
1275 #undef MAXLEN
1276 #undef MAXLEN_NBITS
1277 #undef S
1278 #undef ML
1279 #undef M
1280 #undef I
1281 #undef D
1282 #undef Z
1283 #undef E
1284 #undef N
1285 #undef TBD
1286 #undef NSTATES
1287
1288 #undef LFLANK
1289 #undef MOTIF
1290 #undef RFLANK
1291 #undef UNMODELED
1292 #undef UNCERTAIN
1293
1294 #undef READ
1295 #undef MODEL
1296 #undef MATCH
1297
1298 #undef index
1299 #undef track_get_u
1300 #undef track_get_d
1301 #undef track_get_d
1302 #undef track_get_c
1303 #undef track_get_p
1304 #undef track_get_base
1305 #undef track_valid
1306 #undef track_set_u
1307 #undef track_set_d
1308 #undef track_set_c
1309 #undef track_set_p
1310 #undef make_track
1311
1312 #undef NULL_TRACK
1313 #undef START_TRACK