1 /* The MIT License
2
3 Copyright (c) 2014 Adrian Tan <atks@umich.edu>
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include "rfhmm.h"
25
26 #define MAXLEN 1024
27 #define MAXLEN_NBITS 10
28
29 //model states
30 #define S 0
31 #define Y 1
32 #define M 2
33 #define D 3
34 #define I 4
35 #define MR 5
36 #define E 6
37 #define N 7
38 #define TBD 8
39
40 #define NSTATES 7
41
42 //model components
43 #define LFLANK 0
44 #define MOTIF 1
45 #define RFLANK 2
46 #define UNMODELED 3
47 #define UNCERTAIN 4
48
49 //match type
50 #define MODEL 0
51 #define READ 1
52 #define MATCH 2
53
54 /*for indexing single array*/
55 #define index(i,j) (((i)<<MAXLEN_NBITS)+(j))
56
57 /*functions for getting trace back information*/
58 #define track_get_u(t) (((t)&0xF8000000)>>27)
59 #define track_get_d(t) (((t)&0x07000000)>>24)
60 #define track_get_c(t) (((t)&0x00FFF000)>>12)
61 #define track_get_p(t) (((t)&0x00000FFF))
62 #define track_get_base(t) (model[track_get_d(t)][track_get_p(t)-1])
63 #define track_valid(t) ((track_get_d(t)==RFLANK||track_get_d(t)==MOTIF||track_get_d(t)==LFLANK)&&track_get_p(t)!=0)
64 #define track_set_u(t,u) (((t)&0x07FFFFFF)|((u)<<27))
65 #define track_set_d(t,d) (((t)&0xF8FFFFFF)|((d)<<24))
66 #define track_set_c(t,c) (((t)&0xFF000FFF)|((c)<<12))
67 #define track_set_p(t,p) (((t)&0xFFFFF000)|(p))
68 #define make_track(u,d,c,p) (((u)<<27)|((d)<<24)|((c)<<12)|(p))
69
70 //[N|!|0|0]
71 #define NULL_TRACK 0x7C000000
72 //[N|l|0|0]
73 #define START_TRACK 0x78000000
74
75 /**
76 * Constructor.
77 */
RFHMM(bool debug)78 RFHMM::RFHMM(bool debug)
79 {
80 this-> debug = debug;
81 initialize();
82 };
83
84 /**
85 * Destructor.
86 */
~RFHMM()87 RFHMM::~RFHMM()
88 {
89 delete optimal_path;
90
91 //the best alignment V_ for subsequence (i,j)
92 for (size_t state=S; state<=E; ++state)
93 {
94 delete V[state];
95 delete U[state];
96 }
97
98 delete V;
99 delete U;
100 };
101
102 /**
103 * Initializes objects; helper function for constructor.
104 */
initialize()105 void RFHMM::initialize()
106 {
107 initialize_structures();
108 initialize_T();
109 initialize_UV();
110 };
111
112 /**
113 * Initializes objects for constructor.
114 */
initialize_structures()115 void RFHMM::initialize_structures()
116 {
117 model = new char*[3];
118 model[LFLANK] = NULL;
119 model[MOTIF] = NULL;
120 model[RFLANK] = NULL;
121
122 rflen = 0;
123 mlen = 0;
124
125 motif_discordance = new int32_t[MAXLEN];
126 optimal_path = new int32_t[MAXLEN<<2];
127 optimal_path_traced = false;
128
129 typedef int32_t (RFHMM::*move) (int32_t t, int32_t j);
130 V = new float*[NSTATES];
131 U = new int32_t*[NSTATES];
132 moves = new move*[NSTATES];
133 for (size_t state=S; state<=E; ++state)
134 {
135 V[state] = new float[MAXLEN*MAXLEN];
136 U[state] = new int32_t[MAXLEN*MAXLEN];
137 moves[state] = new move[NSTATES];
138 }
139
140 for (size_t state=S; state<=E; ++state)
141 {
142 moves[state] = new move[NSTATES];
143 }
144
145 moves[S][Y] = &RFHMM::move_S_Y;
146 moves[Y][Y] = &RFHMM::move_Y_Y;
147
148 moves[S][M] = &RFHMM::move_S_M;
149 moves[Y][M] = &RFHMM::move_Y_M;
150 moves[M][M] = &RFHMM::move_M_M;
151 moves[D][M] = &RFHMM::move_D_M;
152 moves[I][M] = &RFHMM::move_I_M;
153
154 moves[S][D] = &RFHMM::move_S_D;
155 moves[Y][D] = &RFHMM::move_Y_D;
156 moves[M][D] = &RFHMM::move_M_D;
157 moves[D][D] = &RFHMM::move_D_D;
158
159 moves[S][I] = &RFHMM::move_S_I;
160 moves[Y][I] = &RFHMM::move_Y_I;
161 moves[M][I] = &RFHMM::move_M_I;
162 moves[I][I] = &RFHMM::move_I_I;
163
164 moves[S][MR] = &RFHMM::move_S_MR;
165 moves[Y][MR] = &RFHMM::move_Y_MR;
166 moves[M][MR] = &RFHMM::move_M_MR;
167 moves[D][MR] = &RFHMM::move_D_MR;
168 moves[I][MR] = &RFHMM::move_I_MR;
169 moves[MR][MR] = &RFHMM::move_MR_MR;
170 };
171
172 /**
173 * Initialize transition matrix based on parameters.
174 */
initialize_T()175 void RFHMM::initialize_T()
176 {
177 float delta = par.delta;
178 float epsilon = par.epsilon;
179 float tau = par.tau;
180 float eta = par.eta;
181
182 for (size_t i=S; i<=E; ++i)
183 {
184 for (size_t j=S; j<=E; ++j)
185 {
186 T[i][j] = -INFINITY;
187 }
188 }
189
190 T[S][Y] = 0;
191 T[Y][Y] = 0;
192
193 T[S][M] = log10((tau*(1-2*delta-tau))/(tau*eta*(1-eta)*(1-eta)));
194 T[Y][M] = T[S][M];
195 T[M][M] = log10(((1-2*delta-tau))/((1-eta)*(1-eta)));
196 T[D][M] = log10(((1-epsilon-tau))/((1-eta)*(1-eta)));
197 T[I][M] = T[I][M];
198
199 T[S][D] = log10((tau*delta)/(eta*(1-eta)));
200 T[Y][D] = T[S][D];
201 T[M][D] = log10(delta/(1-eta));
202 T[D][D] = log10(epsilon/(1-eta));
203
204 T[S][I] = log10((tau*delta)/(eta*(1-eta)));
205 T[Y][I] = T[S][I];
206 T[M][I] = log10(delta/(1-eta));
207 T[I][I] = log10(epsilon/(1-eta));
208
209 T[S][MR] = log10((tau*tau*(1-tau))/(tau*eta*eta*eta*(1-eta)*(1-eta)));
210 T[Y][MR] = log10((eta*tau)/(eta*eta*eta));
211 T[M][MR] = log10((tau*(1-tau))/(eta*eta*(1-eta)*(1-eta)));
212 T[D][MR] = T[M][MR];
213 T[I][MR] = T[D][MR];
214 T[MR][MR] = log10(((1-2*delta-tau))/((1-eta)*(1-eta)));
215 };
216
217 /**
218 * Initializes U and V.
219 */
initialize_UV()220 void RFHMM::initialize_UV()
221 {
222 int32_t t=0;
223 for (size_t i=0; i<MAXLEN; ++i)
224 {
225 for (size_t j=0; j<MAXLEN; ++j)
226 {
227 size_t c = index(i,j);
228
229 V[S][c] = -INFINITY;
230 U[S][c] = NULL_TRACK;
231
232 //Y
233 V[Y][c] = 0;
234 if (i>=1)
235 {
236 V[Y][c] = -INFINITY;
237 U[Y][c] = NULL_TRACK;
238 }
239 else
240 {
241 if (j==1)
242 {
243 U[Y][c] = make_track(S,LFLANK,0,j);
244 }
245 else
246 {
247 U[Y][c] = make_track(Y,LFLANK,0,j);
248 }
249 }
250
251 //M
252 V[M][c] = -INFINITY;
253 if (!i || !j)
254 {
255 U[M][c] = make_track(N,UNMODELED,0,0);
256 }
257 else
258 {
259 U[M][c] = make_track(TBD,UNCERTAIN,0,0);
260 }
261
262 //D
263 V[D][c] = -INFINITY;
264 if (!i || !j)
265 {
266 U[D][c] = make_track(N,UNMODELED,0,0);
267 }
268 else
269 {
270 U[D][c] = make_track(TBD,UNCERTAIN,0,0);
271 }
272
273 //I
274 V[I][c] = -INFINITY;
275 if (!i || !j)
276 {
277 U[I][c] = make_track(N,UNMODELED,0,0);
278 }
279 else
280 {
281 U[I][c] = make_track(TBD,UNCERTAIN,0,0);
282 }
283
284 //MR
285 V[MR][c] = -INFINITY;
286 if (!i || !j)
287 {
288 U[MR][c] = make_track(N,UNMODELED,0,0);
289 }
290 else
291 {
292 U[MR][c] = make_track(TBD,UNCERTAIN,0,0);
293 }
294 }
295 }
296
297 V[S][index(0,0)] = 0;
298 U[S][index(0,0)] = START_TRACK;
299
300 V[M][index(0,0)] = -INFINITY;
301 V[MR][index(0,0)] = -INFINITY;
302 };
303
304 /**
305 * Sets a model.
306 */
set_model(const char * motif,const char * rflank)307 void RFHMM::set_model(const char* motif, const char* rflank)
308 {
309 if (model[LFLANK]) free(model[LFLANK]);
310 if (model[MOTIF]) free(model[MOTIF]);
311 if (model[RFLANK]) free(model[RFLANK]);
312
313 model[MOTIF] = strdup(motif);
314 model[RFLANK] = strdup(rflank);
315
316 mlen = strlen(model[MOTIF]);
317 rflen = strlen(model[RFLANK]);
318 }
319
320 /**
321 * Sets delta.
322 */
set_delta(float delta)323 void RFHMM::set_delta(float delta)
324 {
325 par.delta = delta;
326 }
327
328 /**
329 * Sets epsilon.
330 */
set_epsilon(float epsilon)331 void RFHMM::set_epsilon(float epsilon)
332 {
333 par.epsilon = epsilon;
334 }
335
336 /**
337 * Sets tau.
338 */
set_tau(float tau)339 void RFHMM::set_tau(float tau)
340 {
341 par.tau = tau;
342 }
343
344 /**
345 * Sets eta.
346 */
set_eta(float eta)347 void RFHMM::set_eta(float eta)
348 {
349 par.eta = eta;
350 }
351
352 /**
353 * Sets mismatch penalty.
354 */
set_mismatch_penalty(float mismatch_penalty)355 void RFHMM::set_mismatch_penalty(float mismatch_penalty)
356 {
357 par.mismatch_penalty = mismatch_penalty;
358 }
359
360 /**
361 * Sets debug.
362 */
set_debug(bool debug)363 void RFHMM::set_debug(bool debug)
364 {
365 this->debug = debug;
366 }
367
368
369 /**
370 * Get left flank start position for model.
371 */
get_lflank_model_spos1()372 int32_t RFHMM::get_lflank_model_spos1()
373 {
374 return lflank_start[MODEL];
375 };
376
377 /**
378 * Get left flank end position for model.
379 */
get_lflank_model_epos1()380 int32_t RFHMM::get_lflank_model_epos1()
381 {
382 return lflank_end[MODEL];
383 };
384
385 /**
386 * Get motif start position for model.
387 */
get_motif_model_spos1()388 int32_t RFHMM::get_motif_model_spos1()
389 {
390 return motif_start[MODEL];
391 };
392
393 /**
394 * Get motif end position for model.
395 */
get_motif_model_epos1()396 int32_t RFHMM::get_motif_model_epos1()
397 {
398 return motif_end[MODEL];
399 };
400
401 /**
402 * Get right flank start position for model.
403 */
get_rflank_model_spos1()404 int32_t RFHMM::get_rflank_model_spos1()
405 {
406 return rflank_start[MODEL];
407 };
408
409 /**
410 * Get right flank end position for model.
411 */
get_rflank_model_epos1()412 int32_t RFHMM::get_rflank_model_epos1()
413 {
414 return rflank_end[MODEL];
415 };
416
417 /**
418 * Get left flank start position for read.
419 */
get_lflank_read_spos1()420 int32_t RFHMM::get_lflank_read_spos1()
421 {
422 return lflank_start[READ];
423 };
424
425 /**
426 * Get left flank end position for read.
427 */
get_lflank_read_epos1()428 int32_t RFHMM::get_lflank_read_epos1()
429 {
430 return lflank_end[READ];
431 };
432
433 /**
434 * Get motif start position for read.
435 */
get_motif_read_spos1()436 int32_t RFHMM::get_motif_read_spos1()
437 {
438 return motif_start[READ];
439 };
440
441 /**
442 * Get motif end position for read.
443 */
get_motif_read_epos1()444 int32_t RFHMM::get_motif_read_epos1()
445 {
446 return motif_end[READ];
447 };
448
449 /**
450 * Get right flank start position for read.
451 */
get_rflank_read_spos1()452 int32_t RFHMM::get_rflank_read_spos1()
453 {
454 return rflank_start[READ];
455 };
456
457 /**
458 * Get right flank end position for read.
459 */
get_rflank_read_epos1()460 int32_t RFHMM::get_rflank_read_epos1()
461 {
462 return rflank_end[READ];
463 };
464
465 /**
466 * Computes the score associated with the move from A to B
467 * Updates the max_score and associated max_track.
468 *
469 * @A - start state
470 * @B - end state
471 * @index1 - flattened index of the one dimensional array of start state
472 * @j - 1 based position of read of start state
473 * @m - base match required (MATCH, MODEL, READ)
474 */
proc_comp(int32_t A,int32_t B,int32_t index1,int32_t j,int32_t match_type)475 void RFHMM::proc_comp(int32_t A, int32_t B, int32_t index1, int32_t j, int32_t match_type)
476 {
477 float emission = 0, score = 0, valid = 0;
478
479 //t is the new track
480 int32_t t = (this->*(moves[A][B]))(U[A][index1],j);
481
482 if (t==NULL_TRACK)
483 {
484 valid = -INFINITY;
485 }
486 else if (match_type==MATCH)
487 {
488 emission = log10_emission_odds(track_get_base(t), read[j], qual[j]-33);
489 }
490
491 score = V[A][index1] + T[A][B] + emission + valid;
492
493 if (score>max_score)
494 {
495 max_score = score;
496 max_track = t;
497 }
498
499 if (debug)
500 {
501 std::cerr << "\t" << state2string(A) << "=>" << state2string(B);
502 std::cerr << " (" << ((index1-j)>>MAXLEN_NBITS) << "," << j << ") ";
503 std::cerr << track2string(U[A][index1]) << "=>";
504 std::cerr << track2string(t) << " ";
505 std::cerr << emission << " (e: " << (track_get_d(t)<=RFLANK?track_get_base(t):'N') << " vs " << (j!=rlen?read[j]:'N') << ") + ";
506 std::cerr << T[A][B] << " (t) + ";
507 std::cerr << V[A][index1] << " (p) + ";
508 std::cerr << valid << " (v) = ";
509 std::cerr << score << "\n";
510 }
511 }
512
513 /**
514 * Align read against model.
515 */
align(const char * read,const char * qual)516 void RFHMM::align(const char* read, const char* qual)
517 {
518 clear_statistics();
519 optimal_path_traced = false;
520 this->read = read;
521 this->qual = qual;
522 rlen = strlen(read);
523 plen = rlen + rflen;
524
525 if (rlen>MAXLEN)
526 {
527 fprintf(stderr, "[%s:%d %s] Sequence to be aligned is greater than %d currently supported: %d\n", __FILE__, __LINE__, __FUNCTION__, MAXLEN, rlen);
528 exit(1);
529 }
530
531 float max = 0;
532 char maxPath = 'Y';
533
534 size_t c,d,u,l;
535
536 //alignment
537 //take into consideration
538 for (size_t i=1; i<=plen; ++i)
539 {
540 for (size_t j=1; j<=rlen; ++j)
541 {
542 size_t c = index(i,j);
543 size_t d = index(i-1,j-1);
544 size_t u = index(i-1,j);
545 size_t l = index(i,j-1);
546
547 /////
548 //X//
549 /////
550 //invariant
551
552 /////
553 //Y//
554 /////
555 //invariant
556
557 /////
558 //M//
559 /////
560 //only need to update this i>rflen
561 if (debug) std::cerr << "(" << i << "," << j << ")\n";
562 max_score = -INFINITY;
563 max_track = NULL_TRACK;
564 proc_comp(S, M, d, j-1, MATCH);
565 proc_comp(Y, M, d, j-1, MATCH);
566 proc_comp(M, M, d, j-1, MATCH);
567 proc_comp(D, M, d, j-1, MATCH);
568 proc_comp(I, M, d, j-1, MATCH);
569 V[M][c] = max_score;
570 U[M][c] = max_track;
571 if (debug) std::cerr << "\tset M " << max_score << " - " << track2string(max_track) << "\n";
572
573 /////
574 //D//
575 /////
576 max_score = -INFINITY;
577 max_track = NULL_TRACK;
578 proc_comp(S, D, u, j, MODEL);
579 proc_comp(Y, D, u, j, MODEL);
580 proc_comp(M, D, u, j, MODEL);
581 proc_comp(D, D, u, j, MODEL);
582 V[D][c] = max_score;
583 U[D][c] = max_track;
584 if (debug) std::cerr << "\tset D " << max_score << " - " << track2string(max_track) << "\n";
585
586 /////
587 //I//
588 /////
589 max_score = -INFINITY;
590 max_track = NULL_TRACK;
591 proc_comp(S, I, l, j-1, READ);
592 proc_comp(Y, I, l, j-1, READ);
593 proc_comp(M, I, l, j-1, READ);
594 proc_comp(I, I, l, j-1, READ);
595 V[I][c] = max_score;
596 U[I][c] = max_track;
597 if (debug) std::cerr << "\tset I " << max_score << " - " << track2string(max_track) << "\n";
598
599 //////
600 //MR//
601 //////
602 max_score = -INFINITY;
603 max_track = NULL_TRACK;
604 proc_comp(S, MR, d, j-1, MATCH);
605 proc_comp(Y, MR, d, j-1, MATCH);
606 proc_comp(M, MR, d, j-1, MATCH);
607 proc_comp(D, MR, d, j-1, MATCH);
608 proc_comp(I, MR, d, j-1, MATCH);
609 proc_comp(MR, MR, d, j-1, MATCH);
610 V[MR][c] = max_score;
611 U[MR][c] = max_track;
612 if (debug) std::cerr << "\tset MR " << max_score << " - " << track2string(max_track) << "\n";
613
614 }
615 }
616
617 if (debug)
618 {
619 print(V[Y], plen+1, rlen+1);
620 std::cerr << "\n =U[Y]=\n";
621 print_U(U[Y], plen+1, rlen+1);
622
623 std::cerr << "\n =V[M]=\n";
624 print(V[M], plen+1, rlen+1);
625 std::cerr << "\n =U[M]=\n";
626 print_U(U[M], plen+1, rlen+1);
627 std::cerr << "\n =V[D]=\n";
628 print(V[D], plen+1, rlen+1);
629 std::cerr << "\n =U[D]=\n";
630 print_U(U[D], plen+1, rlen+1);
631 std::cerr << "\n =V[I]=\n";
632 print(V[I], plen+1, rlen+1);
633 std::cerr << "\n =U[I]=\n";
634 print_U(U[I], plen+1, rlen+1);
635
636 std::cerr << "\n =V[MR]=\n";
637 print(V[MR], plen+1, rlen+1);
638 std::cerr << "\n =U[MR]=\n";
639 print_U(U[MR], plen+1, rlen+1);
640
641 std::cerr << "\n";
642 }
643
644 trace_path();
645 };
646
647 /**
648 * Trace path after alignment.
649 */
trace_path()650 void RFHMM::trace_path()
651 {
652 //search for a complete path in MR or W or Z
653 size_t c;
654 optimal_score = -INFINITY;
655 optimal_track = NULL_TRACK;
656 optimal_state = TBD;
657 optimal_probe_len = 0;
658 for (size_t i=0; i<=plen; ++i)
659 {
660 c = index(i,rlen);
661 if (V[MR][c]>optimal_score)
662 {
663 optimal_score = V[MR][c];
664 optimal_track = U[MR][c];
665 optimal_state = MR;
666 optimal_probe_len = i;
667 }
668 }
669
670 //trace path
671 optimal_path_ptr = optimal_path+(MAXLEN<<2)-1;
672 int32_t i = optimal_probe_len, j = rlen;
673 int32_t last_t = make_track(optimal_state, RFLANK, 0, rflen+1);
674 optimal_path_len = 0;
675 int32_t u;
676 int32_t des_t, src_t = make_track(E, RFLANK, 0, rflen+1);
677
678 if (debug) std::cerr << "src_t (i,j) => dest_t : last_t\n";
679
680 do
681 {
682 u = track_get_u(last_t);
683 last_t = U[u][index(i,j)];
684 des_t = track_set_u(last_t, u);
685 *optimal_path_ptr = des_t;
686
687 collect_statistics(src_t, des_t, j);
688 if (debug) std::cerr << track2string(src_t) << " (" << i << "," << j << ") => " << track2string(des_t) << " : " << track2string(last_t) << "\n";
689
690 if (track_get_u(src_t)==TBD)
691 {
692 exit(1);
693 }
694 src_t = des_t;
695
696 if (u==M || u==MR)
697 {
698 --i; --j;
699 }
700 else if (u==D)
701 {
702 --i;
703 }
704 else if (u==Y || u==I)
705 {
706 --j;
707 }
708
709 --optimal_path_ptr;
710 ++optimal_path_len;
711
712 } while (track_get_u(last_t)!=S);
713
714 if (debug) std::cerr << track2string(src_t) << " (" << i << "," << j << ") => " << track2string(des_t) << " : " << track2string(last_t) << "\n";
715 if (debug) std::cerr << track2string(last_t) << "\n";
716 if (debug) std::cerr << "\n";
717
718 collect_statistics(src_t, last_t, j);
719
720 ++optimal_path_ptr;
721 optimal_path_traced = true;
722 };
723
724 /**
725 * Collect alignment summary statistics.
726 */
collect_statistics(int32_t src_t,int32_t des_t,int32_t j)727 void RFHMM::collect_statistics(int32_t src_t, int32_t des_t, int32_t j)
728 {
729 if (debug) std::cerr << "\t " << track2string(src_t) << " (" << j << ") => " << track2string(des_t) << "\n";
730
731 int32_t src_u = track_get_u(src_t);
732 int32_t des_u = track_get_u(des_t);
733
734 if (src_u==E)
735 {
736 if (des_u==MR)
737 {
738 rflank_end[MODEL] = track_get_p(des_t);
739 rflank_end[READ] = j;
740 }
741 }
742 else if (src_u==MR)
743 {
744 if (des_u==M)
745 {
746 rflank_start[MODEL] = track_get_p(src_t);
747 rflank_start[READ] = j+1;
748
749 motif_end[MODEL] = track_get_c(des_t);
750 motif_count = track_get_c(des_t);
751 motif_end[READ] = j;
752
753 //initialize array for tracking inexact repeats
754 for (size_t k=1; k<=motif_count; ++k)
755 {
756 motif_discordance[k] = 0;
757 }
758
759 if (track_get_base(des_t)!=read[j-1])
760 {
761 ++motif_discordance[motif_count];
762 }
763 }
764 else if (des_u==D)
765 {
766 rflank_start[MODEL] = track_get_p(src_t);
767 rflank_start[READ] = j+1;
768
769 motif_end[MODEL] = track_get_c(des_t);
770 motif_count = track_get_c(des_t);
771 motif_end[READ] = j;
772
773 //initialize array for tracking inexact repeats
774 for (size_t k=1; k<=motif_count; ++k)
775 {
776 motif_discordance[k] = 0;
777 }
778
779 ++motif_discordance[motif_count];
780
781 }
782 else if (des_u==I)
783 {
784 }
785 else if (des_u==Y)
786 {
787 rflank_start[MODEL] = track_get_p(src_t);
788 rflank_start[READ] = j+1;
789
790 lflank_end[MODEL] = track_get_p(des_t);
791 lflank_end[READ] = j;
792 }
793 }
794 else if (src_u==M)
795 {
796 if (des_u==Y)
797 {
798 motif_start[MODEL] = track_get_c(src_t);
799 motif_start[READ] = j+1;
800 lflank_end[MODEL] = track_get_p(des_t);
801 lflank_end[READ] = j;
802 }
803 }
804 else if (src_u==Y)
805 {
806 if (des_u==S)
807 {
808 lflank_start[MODEL] = track_get_p(des_t);
809 lflank_start[READ] = j+1;
810 }
811
812 }
813
814 if (des_u==M)
815 {
816 if (track_get_base(des_t)!=read[j-1])
817 {
818 ++motif_discordance[track_get_c(des_t)];
819 }
820 }
821
822 if (des_u==D || des_u==I)
823 {
824 ++motif_discordance[track_get_c(des_t)];
825 }
826 };
827
828 /**
829 * Clear alignment statistics.
830 */
clear_statistics()831 void RFHMM::clear_statistics()
832 {
833 lflank_start[MODEL] = NAN;
834 lflank_start[READ] = NAN;
835 lflank_end[MODEL] = NAN;
836 lflank_end[READ] = NAN;
837 motif_start[MODEL] = NAN;
838 motif_start[READ] = NAN;
839 motif_end[MODEL] = NAN;
840 motif_end[READ] = NAN;
841 rflank_start[MODEL] = NAN;
842 rflank_start[READ] = NAN;
843 rflank_end[MODEL] = NAN;
844 rflank_end[READ] = NAN;
845 motif_count = NAN;
846 exact_motif_count = NAN;
847 motif_m = NAN;
848 motif_xid = NAN;
849 motif_concordance = NAN;
850 maxLogOdds = NAN;
851 }
852
853
854 /**
855 * Update alignment statistics after collection.
856 */
update_statistics()857 void RFHMM::update_statistics()
858 {
859 motif_concordance = (float)motif_m/(motif_m+motif_xid);
860 }
861
862 /**
863 * Returns true if flanks are mapped.
864 */
flanks_are_mapped()865 bool RFHMM::flanks_are_mapped()
866 {
867 return rflank_start[MODEL]==rflen;
868 }
869
870 /**
871 * Compute log10 emission odds based on equal error probability distribution.
872 */
log10_emission_odds(char probe_base,char read_base,uint32_t pl)873 float RFHMM::log10_emission_odds(char probe_base, char read_base, uint32_t pl)
874 {
875 //4 encodes for N
876 // if (read_base=='N' || probe_base=='N')
877 // {
878 // //silent match
879 // return -INFINITY;
880 // }
881
882 if (read_base!=probe_base)
883 {
884 return LogTool::pl2log10_varp(pl) - par.mismatch_penalty;
885 }
886 else
887 {
888 return -(LogTool::pl2log10_varp(pl));
889 }
890 };
891
892 /**
893 * Converts state to string representation.
894 */
state2string(int32_t state)895 std::string RFHMM::state2string(int32_t state)
896 {
897 if (state==S)
898 {
899 return "S";
900 }
901 else if (state==Y)
902 {
903 return "Y";
904 }
905 else if (state==M)
906 {
907 return "M";
908 }
909 else if (state==D)
910 {
911 return "D";
912 }
913 else if (state==I)
914 {
915 return "I";
916 }
917 else if (state==MR)
918 {
919 return "MR";
920 }
921 else if (state==E)
922 {
923 return "E";
924 }
925 else if (state==N)
926 {
927 return "N";
928 }
929 else if (state==TBD)
930 {
931 return "*";
932 }
933 else
934 {
935 return "!";
936 }
937 }
938
939 /**
940 * Converts state to cigar string representation.
941 */
state2cigarstring(int32_t state)942 std::string RFHMM::state2cigarstring(int32_t state)
943 {
944 if (state==S)
945 {
946 return "S";
947 }
948 else if (state==Y)
949 {
950 return "Y";
951 }
952 else if (state==M)
953 {
954 return "M";
955 }
956 else if (state==D)
957 {
958 return "D";
959 }
960 else if (state==I)
961 {
962 return "I";
963 }
964 else if (state==MR)
965 {
966 return "R";
967 }
968 else if (state==E)
969 {
970 return "E";
971 }
972 else if (state==N)
973 {
974 return "N";
975 }
976 else if (state==TBD)
977 {
978 return "*";
979 }
980 else
981 {
982 return "!";
983 }
984 }
985
986 /**
987 * Converts state to cigar string representation.
988 */
track2cigarstring1(int32_t t,int32_t j)989 std::string RFHMM::track2cigarstring1(int32_t t, int32_t j)
990 {
991 int32_t state = track_get_u(t);
992
993 if (state==S)
994 {
995 return "S";
996 }
997 else if (state==Y)
998 {
999 return "Y";
1000 }
1001 else if (state==M || state==MR)
1002 {
1003 if (track_get_base(t)==read[j-1])
1004 {
1005 return "M";
1006 }
1007 else
1008 {
1009 return "*";
1010 }
1011 }
1012 else if (state==D)
1013 {
1014 return "D";
1015 }
1016 else if (state==I)
1017 {
1018 return "I";
1019 }
1020 else if (state==E)
1021 {
1022 return "E";
1023 }
1024 else if (state==N)
1025 {
1026 return "N";
1027 }
1028 else if (state==TBD)
1029 {
1030 return "*";
1031 }
1032 else
1033 {
1034 return "!";
1035 }
1036 }
1037
1038 /**
1039 * Converts track to cigar string representation.
1040 */
track2cigarstring2(int32_t t)1041 std::string RFHMM::track2cigarstring2(int32_t t)
1042 {
1043 int32_t state = track_get_u(t);
1044
1045 if (state==M)
1046 {
1047 return (track_get_c(t)%2==0?"+":"o");
1048 }
1049 else if (state==D)
1050 {
1051 return (track_get_c(t)%2==0?"+":"o");
1052 }
1053 else if (state==I)
1054 {
1055 return (track_get_c(t)%2==0?"+":"o");
1056 }
1057 else if (state==MR)
1058 {
1059 return "R";
1060 }
1061 else
1062 {
1063 return " ";
1064 }
1065 }
1066 /**
1067 * Converts model component to string representation.
1068 */
component2string(int32_t component)1069 std::string RFHMM::component2string(int32_t component)
1070 {
1071 if (component==LFLANK)
1072 {
1073 return "l";
1074 }
1075 else if (component==MOTIF)
1076 {
1077 return "m";
1078 }
1079 else if (component==RFLANK)
1080 {
1081 return "r";
1082 }
1083 else if (component==UNMODELED)
1084 {
1085 return "!";
1086 }
1087 else if (component==READ)
1088 {
1089 return "s";
1090 }
1091 else if (component==UNCERTAIN)
1092 {
1093 return "?";
1094 }
1095 else
1096 {
1097 return "!";
1098 }
1099 }
1100
1101 /**
1102 * Prints an alignment.
1103 */
print_alignment()1104 void RFHMM::print_alignment()
1105 {
1106 std::string pad = "\t";
1107 print_alignment(pad);
1108 };
1109
1110 /**
1111 * Prints an alignment with padding.
1112 */
print_alignment(std::string & pad)1113 void RFHMM::print_alignment(std::string& pad)
1114 {
1115 std::cerr << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
1116
1117 if (!optimal_path_traced)
1118 {
1119 std::cerr << "path not traced\n";
1120 }
1121
1122 print_T();
1123 std::cerr << "\n";
1124 par.print();
1125 std::cerr << "\n";
1126
1127 std::cerr << "repeat motif : " << model[MOTIF] << "\n";
1128 std::cerr << "rflank : " << model[RFLANK] << "\n";
1129 std::cerr << "mlen : " << mlen << "\n";
1130 std::cerr << "rflen : " << rflen << "\n";
1131 std::cerr << "plen : " << plen << "\n";
1132 std::cerr << "\n";
1133 std::cerr << "read : " << read << "\n";
1134 std::cerr << "rlen : " << rlen << "\n";
1135 std::cerr << "\n";
1136 std::cerr << "optimal score: " << optimal_score << "\n";
1137 std::cerr << "optimal state: " << state2string(optimal_state) << "\n";
1138 std::cerr << "optimal track: " << track2string(optimal_track) << "\n";
1139 std::cerr << "optimal probe len: " << optimal_probe_len << "\n";
1140 std::cerr << "optimal path length : " << optimal_path_len << "\n";
1141 std::cerr << "max j: " << rlen << "\n";
1142
1143 std::cerr << "probe: " << "(" << lflank_start[MODEL] << "~" << lflank_end[MODEL] << ") "
1144 << "[" << motif_start[MODEL] << "~" << motif_end[MODEL] << "] "
1145 << "(" << rflank_start[MODEL] << "~" << rflank_end[MODEL] << ")\n";
1146 std::cerr << "read : " << "(" << lflank_start[READ] << "~" << lflank_end[READ] << ") "
1147 << "[" << motif_start[READ] << "~" << motif_end[READ] << "] "
1148 << "(" << rflank_start[READ] << "~" << rflank_end[READ] << ")\n";
1149 std::cerr << "\n";
1150 std::cerr << "motif # : " << motif_count << " [" << motif_start[READ] << "," << motif_end[READ] << "]\n";
1151
1152 exact_motif_count = motif_count;
1153 motif_concordance = 0;
1154 for (int32_t k=1; k<=motif_count; ++k)
1155 {
1156 if (motif_discordance[k])
1157 {
1158 --exact_motif_count;
1159 }
1160
1161 if (mlen>=motif_discordance[k])
1162 {
1163 motif_concordance += (float)(mlen-motif_discordance[k]) / mlen;
1164 }
1165 }
1166 motif_concordance *= 1.0/motif_count;
1167 std::cerr << "motif concordance : " << (motif_concordance*100) << "% (" << exact_motif_count << "/" << motif_count << ")\n";
1168 std::cerr << "motif discordance : ";
1169 for (int32_t k=1; k<=motif_count; ++k)
1170 {
1171 std::cerr << motif_discordance[k] << (k==motif_count?"\n":"|");
1172 }
1173 std::cerr << "\n";
1174
1175 //print path
1176 int32_t* path;
1177 path = optimal_path_ptr;
1178 std::cerr << "Model: ";
1179 int32_t t = NULL_TRACK;
1180 int32_t j = 0;
1181 while (path<optimal_path+(MAXLEN<<2))
1182 {
1183 int32_t u = track_get_u(*path);
1184 if (u==M || u==D || u==MR)
1185 {
1186 std::cerr << track_get_base(*path);
1187 }
1188 else
1189 {
1190 std::cerr << '-';
1191 }
1192 ++path;
1193 }
1194 std::cerr << " \n";
1195
1196 std::cerr << " S";
1197 path = optimal_path_ptr;
1198 j=1;
1199 while (path<optimal_path+(MAXLEN<<2))
1200 {
1201 std::cerr << track2cigarstring1(*path,j);
1202 int32_t u = track_get_u(*path);
1203 if (u==Y || u==M || u==I || u==MR)
1204 {
1205 ++j;
1206 }
1207 ++path;
1208 }
1209 std::cerr << "E\n";
1210
1211 path = optimal_path_ptr;
1212 std::cerr << " ";
1213 while (path<optimal_path+(MAXLEN<<2))
1214 {
1215 std::cerr << track2cigarstring2(*path);
1216 ++path;
1217 }
1218 std::cerr << " \n";
1219
1220 path = optimal_path_ptr;
1221 j=1;
1222 std::cerr << "Read: ";
1223 while (path<optimal_path+(MAXLEN<<2))
1224 {
1225 int32_t u = track_get_u(*path);
1226 if (u==Y || u==M || u==I || u==MR)
1227 {
1228 std::cerr << read[j-1];
1229 ++j;
1230 }
1231 else
1232 {
1233 std::cerr << '-';
1234 }
1235 ++path;
1236 }
1237 std::cerr << " \n";
1238 std::cerr << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
1239 };
1240
1241 /**
1242 * Prints a float matrix.
1243 */
print(float * v,size_t plen,size_t rlen)1244 void RFHMM::print(float *v, size_t plen, size_t rlen)
1245 {
1246 float val;
1247 std::cerr << std::setprecision(1) << std::fixed;
1248 for (size_t i=0; i<plen; ++i)
1249 {
1250 for (size_t j=0; j<rlen; ++j)
1251 {
1252 val = v[index(i,j)];
1253 std::cerr << (val<0?" ":" ") << val;
1254 }
1255
1256 std::cerr << "\n";
1257 }
1258 };
1259
1260 /**
1261 * Prints a char matrix.
1262 */
print(int32_t * v,size_t plen,size_t rlen)1263 void RFHMM::print(int32_t *v, size_t plen, size_t rlen)
1264 {
1265 float val;
1266 std::cerr << std::setprecision(1) << std::fixed << std::setw(6);
1267 for (size_t i=0; i<plen; ++i)
1268 {
1269 for (size_t j=0; j<rlen; ++j)
1270 {
1271 val = v[index(i,j)];
1272 std::cerr << (val<0?" ":" ") << val;
1273 }
1274
1275 std::cerr << "\n";
1276 }
1277 };
1278
1279 /**
1280 * Prints the transition matrix.
1281 */
print_T()1282 void RFHMM::print_T()
1283 {
1284 for (size_t j=S; j<=E; ++j)
1285 {
1286 std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << state2string(j);
1287 }
1288 std::cerr << "\n";
1289
1290 for (size_t i=S; i<=E; ++i)
1291 {
1292 for (size_t j=S; j<=E; ++j)
1293 {
1294 if (j)
1295 {
1296 std::cerr << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1297 }
1298 else
1299 {
1300 std::cerr << state2string(i) << std::setw(8) << std::setprecision(2) << std::fixed << T[i][j];
1301 }
1302 }
1303 std::cerr << "\n";
1304 }
1305 };
1306
1307 /**
1308 * Prints U.
1309 */
print_U(int32_t * U,size_t plen,size_t rlen)1310 void RFHMM::print_U(int32_t *U, size_t plen, size_t rlen)
1311 {
1312 std::cerr << std::setprecision(1) << std::fixed;
1313 std::string state;
1314 for (size_t i=0; i<plen; ++i)
1315 {
1316 for (size_t j=0; j<rlen; ++j)
1317 {
1318 int32_t t = U[index(i,j)];
1319 state = state2string(track_get_u(t));
1320 std::cerr << (state.size()==1 ? " " : " ")
1321 << state << "|"
1322 << component2string(track_get_d(t)) << "|"
1323 << track_get_c(t) << "|"
1324 << track_get_p(t) << (j==rlen-1?"\n":" ");
1325 }
1326 }
1327 };
1328
1329 /**
1330 * Prints U and V.
1331 */
print_trace(int32_t state,size_t plen,size_t rlen)1332 void RFHMM::print_trace(int32_t state, size_t plen, size_t rlen)
1333 {
1334 std::cerr << std::setprecision(1) << std::fixed;
1335 int32_t *u = U[state];
1336 float *v = V[state];
1337 std::string s;
1338 for (size_t i=0; i<plen; ++i)
1339 {
1340 for (size_t j=0; j<rlen; ++j)
1341 {
1342 int32_t t = u[index(i,j)];
1343 s = state2string(track_get_u(t));
1344 std::cerr << (s.size()==1 ? " " : " ")
1345 << s << "|"
1346 << component2string(track_get_d(t)) << "|"
1347 << track_get_c(t) << "|"
1348 << track_get_p(t) << "|"
1349 << v[index(i,j)];
1350 }
1351
1352 std::cerr << "\n";
1353 }
1354 };
1355
1356 /**
1357 * Returns a string representation of track.
1358 */
track2string(int32_t t)1359 std::string RFHMM::track2string(int32_t t)
1360 {
1361 kstring_t str = {0,0,0};
1362
1363 kputs(state2string(track_get_u(t)).c_str(), &str);
1364 kputc('|', &str);
1365 kputs(component2string(track_get_d(t)).c_str(), &str);
1366 kputc('|', &str);
1367 kputw(track_get_c(t), &str);
1368 kputc('|', &str);
1369 kputw(track_get_p(t), &str);
1370
1371 std::string s(str.s);
1372 if (str.m) free(str.s);
1373
1374 return s;
1375 }
1376
1377 /**
1378 * Prints track.
1379 */
print_track(int32_t t)1380 void RFHMM::print_track(int32_t t)
1381 {
1382 std::cerr << track2string(t) << "\n";
1383 }
1384
1385 #undef MAXLEN
1386 #undef MAXLEN_NBITS
1387 #undef S
1388 #undef Y
1389 #undef M
1390 #undef I
1391 #undef D
1392 #undef MR
1393 #undef E
1394 #undef N
1395 #undef TBD
1396 #undef NSTATES
1397
1398 #undef LFLANK
1399 #undef MOTIF
1400 #undef RFLANK
1401 #undef UNMODELED
1402 #undef UNCERTAIN
1403
1404 #undef READ
1405 #undef MODEL
1406 #undef MATCH
1407
1408 #undef index
1409 #undef track_get_u
1410 #undef track_get_d
1411 #undef track_get_d
1412 #undef track_get_c
1413 #undef track_get_p
1414 #undef track_get_base
1415 #undef track_valid
1416 #undef track_set_u
1417 #undef track_set_d
1418 #undef track_set_c
1419 #undef track_set_p
1420 #undef make_track
1421
1422 #undef NULL_TRACK
1423 #undef START_TRACK