1 /* $Id: hspfilter_mapper.c,v 1.9 2017/01/05 18:34:13 fukanchi Exp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Greg Boratyn
27 *
28 */
29
30 /** @file hspfilter_mapper.c
31 * Implementation of the BlastHSPWriter interface to save only the best scoring * chains of HSPs for aligning RNA-Seq sequences to a genome.
32 */
33
34 #include <algo/blast/core/hspfilter_mapper.h>
35 #include <algo/blast/core/blast_util.h>
36 #include <algo/blast/core/blast_hits.h>
37 #include "jumper.h"
38
39 /* Pair configurations, in the order of prefernece */
40 #define PAIR_CONVERGENT 0
41 #define PAIR_DIVERGENT 1
42 #define PAIR_PARALLEL 2
43 #define PAIR_NONE 3
44
45 /* A container to create a list of HSPs */
46 typedef struct HSPContainer
47 {
48 BlastHSP* hsp;
49 struct HSPContainer* next;
50 } HSPContainer;
51
52
53 /* Create HSPContainer and take ownership of the HSP */
HSPContainerNew(BlastHSP ** hsp)54 static HSPContainer* HSPContainerNew(BlastHSP** hsp)
55 {
56 HSPContainer* retval = calloc(1, sizeof(HSPContainer));
57 if (!retval) {
58 return NULL;
59 }
60 retval->hsp = *hsp;
61 /* take ownership of this HSP */
62 *hsp = NULL;
63
64 return retval;
65 }
66
67 /* Free the list of HSPs, along with the stored HSPs */
HSPContainerFree(HSPContainer * hc)68 static HSPContainer* HSPContainerFree(HSPContainer* hc)
69 {
70 HSPContainer* h = hc;
71 while (h) {
72 HSPContainer* next = h->next;
73 if (h->hsp) {
74 Blast_HSPFree(h->hsp);
75 }
76
77 sfree(h);
78 h = next;
79 }
80
81 return NULL;
82 }
83
84 /* Clone a list of HSP containers */
HSPContainerDup(HSPContainer * inh)85 static HSPContainer* HSPContainerDup(HSPContainer* inh)
86 {
87 HSPContainer* retval = NULL;
88 HSPContainer* hi = NULL, *ho = NULL;
89 BlastHSP* new_hsp = NULL;
90
91 if (!inh || !inh->hsp) {
92 return NULL;
93 }
94
95 new_hsp = Blast_HSPClone(inh->hsp);
96 if (!new_hsp) {
97 return NULL;
98 }
99 retval = HSPContainerNew(&new_hsp);
100 if (!retval) {
101 Blast_HSPFree(new_hsp);
102 return NULL;
103 }
104
105 hi = inh->next;
106 ho = retval;
107 for (; hi; hi = hi->next, ho = ho->next) {
108 new_hsp = Blast_HSPClone(hi->hsp);
109 if (!new_hsp) {
110 Blast_HSPFree(new_hsp);
111 HSPContainerFree(retval);
112 return NULL;
113 }
114 ho->next = HSPContainerNew(&new_hsp);
115 if (!ho->next) {
116 Blast_HSPFree(new_hsp);
117 HSPContainerFree(retval);
118 return NULL;
119 }
120 }
121
122 return retval;
123 }
124
125 /* A chain of HSPs: gene transcript sequence aligned to exons */
126 typedef struct HSPChain
127 {
128 Int4 context; /* query context */
129 Int4 oid; /* subject oid */
130 Int4 score; /* score for the whole chain */
131 HSPContainer* hsps; /* list of HSPs that belong to this chain */
132 Int4 compartment; /* compartment number for the chain (needed
133 for reporting results) */
134
135 Int4 count; /* number of chains with the same or larger score found
136 for the same query */
137 struct HSPChain* pair; /* pointer to the pair (for paired reads) */
138 Uint1 pair_conf; /* pair configuration */
139
140 Int4 adapter; /* adapter start position */
141 Int4 polyA; /* start of polyA tail */
142 struct HSPChain* next;
143 } HSPChain;
144
145
HSPChainFree(HSPChain * chain_list)146 static HSPChain* HSPChainFree(HSPChain* chain_list)
147 {
148 HSPChain* chain = chain_list;
149 while (chain) {
150 HSPChain* next = chain->next;
151 if (chain->pair) {
152 chain->pair->pair = NULL;
153 }
154 ASSERT(chain->hsps);
155 chain->hsps = HSPContainerFree(chain->hsps);
156 sfree(chain);
157 chain = next;
158 }
159
160 return NULL;
161 }
162
HSPChainNew(Int4 context)163 static HSPChain* HSPChainNew(Int4 context)
164 {
165 HSPChain* retval = calloc(1, sizeof(HSPChain));
166 if (!retval) {
167 return NULL;
168 }
169 retval->context = context;
170 retval->compartment = -1;
171 retval->adapter = -1;
172
173 return retval;
174 }
175
176
177 /* Insert a single chain into the list so that the list is sorted in decending
178 order of chain scores. */
s_HSPChainListInsertOne(HSPChain ** list,HSPChain * chain,Boolean check_for_duplicates)179 static Int4 s_HSPChainListInsertOne(HSPChain** list, HSPChain* chain,
180 Boolean check_for_duplicates)
181 {
182 HSPChain* ch = NULL;
183
184 if (!list || !chain) {
185 return -1;
186 }
187 ASSERT(!chain->next);
188
189 if (!*list) {
190 *list = chain;
191 return 0;
192 }
193
194 ch = *list;
195 if (ch->score < chain->score) {
196 chain->next = ch;
197 *list = chain;
198 return 0;
199 }
200
201 /* check for duplicate chain: the new chains may be the same as already
202 saved; this may come from writing HSPs to HSP stream for each chunk */
203 if (check_for_duplicates &&
204 ch->oid == chain->oid && ch->score == chain->score &&
205 ch->hsps->hsp->query.frame == chain->hsps->hsp->query.frame &&
206 ch->hsps->hsp->subject.offset ==
207 chain->hsps->hsp->subject.offset) {
208
209 chain->next = NULL;
210 chain = HSPChainFree(chain);
211 return 0;
212 }
213
214 while (ch->next && ch->next->score >= chain->score){
215
216 /* check for duplicate chain: the new chains may be the same as already
217 saved; this may come from writing HSPs to HSP stream for each chunk */
218 if (check_for_duplicates &&
219 ch->next->oid == chain->oid && ch->next->score == chain->score &&
220 ch->next->hsps->hsp->query.frame == chain->hsps->hsp->query.frame &&
221 ch->next->hsps->hsp->subject.offset ==
222 chain->hsps->hsp->subject.offset) {
223
224 chain->next = NULL;
225 chain = HSPChainFree(chain);
226 return 0;
227 }
228
229 ch = ch->next;
230 }
231 ASSERT(ch);
232 chain->next = ch->next;
233 ch->next = chain;
234
235 return 0;
236 }
237
238 /* Insert chains into the list so that the list is sorted in descending order
239 of chain scores. If chain is a list, each element is added separately. The
240 list must be sorted before adding chain */
HSPChainListInsert(HSPChain ** list,HSPChain * chain,Boolean check_for_duplicates)241 static Int4 HSPChainListInsert(HSPChain** list, HSPChain* chain,
242 Boolean check_for_duplicates)
243 {
244 HSPChain* ch = NULL;
245 Int4 status = 0;
246
247 if (!list || !chain) {
248 return -1;
249 }
250
251 ch = chain;
252 while (ch) {
253 HSPChain* next = ch->next;
254 ch->next = NULL;
255 status = s_HSPChainListInsertOne(list, ch, check_for_duplicates);
256 if (status) {
257 return status;
258 }
259 ch = next;
260
261 }
262
263 return 0;
264 }
265
266 /* Remove from the list chains with scores lower than the best one by at least
267 the given margin. The list must be sorted in descending order of scores. */
HSPChainListTrim(HSPChain * list,Int4 margin)268 static Int4 HSPChainListTrim(HSPChain* list, Int4 margin)
269 {
270 HSPChain* ch = NULL;
271 Int4 best_score;
272
273 if (!list) {
274 return -1;
275 }
276
277 best_score = list->score;
278 ch = list;
279 while (ch->next && best_score - ch->next->score <= margin) {
280 ASSERT(best_score - ch->next->score >= 0);
281 ch = ch->next;
282 }
283 ASSERT(ch);
284
285 ch->next = HSPChainFree(ch->next);
286
287 return 0;
288 }
289
290
291 /* Clone a single HSP chain */
s_CloneChain(const HSPChain * chain)292 static HSPChain* s_CloneChain(const HSPChain* chain)
293 {
294 HSPChain* retval = NULL;
295
296 if (!chain) {
297 return NULL;
298 }
299
300 retval = HSPChainNew(chain->context);
301 if (!retval) {
302 return NULL;
303 }
304 retval->hsps = HSPContainerDup(chain->hsps);
305 if (!retval->hsps) {
306 HSPChainFree(retval);
307 return NULL;
308 }
309 retval->oid = chain->oid;
310 retval->score = chain->score;
311 retval->adapter = chain->adapter;
312 retval->polyA = chain->polyA;
313
314 return retval;
315 }
316
317 /* Test that all pointers in the chain are set */
s_TestChains(HSPChain * chain)318 static Boolean s_TestChains(HSPChain* chain)
319 {
320 HSPContainer* hc;
321
322 ASSERT(chain);
323 hc = chain->hsps;
324 ASSERT(hc);
325 ASSERT(hc->hsp);
326 ASSERT(hc->hsp->context == chain->context);
327
328 ASSERT(hc->hsp->gap_info->size > 1 ||
329 hc->hsp->query.end - hc->hsp->query.offset ==
330 hc->hsp->subject.end - hc->hsp->subject.offset);
331
332 hc = hc->next;
333 while (hc) {
334 ASSERT(hc->hsp);
335 ASSERT(hc->hsp->context == chain->context);
336
337 ASSERT(hc->hsp->gap_info->size > 1 ||
338 hc->hsp->query.end - hc->hsp->query.offset ==
339 hc->hsp->subject.end - hc->hsp->subject.offset);
340
341 if (hc->next) {
342 ASSERT(hc->hsp->query.offset < hc->next->hsp->query.offset);
343 ASSERT(hc->hsp->subject.offset < hc->next->hsp->subject.offset);
344 }
345 hc = hc->next;
346 }
347
348 chain = chain->next;
349 while (chain) {
350 hc = chain->hsps;
351 ASSERT(hc);
352 ASSERT(hc->hsp);
353 ASSERT(hc->hsp->context == chain->context);
354
355 ASSERT(hc->hsp->gap_info->size > 1 ||
356 hc->hsp->query.end - hc->hsp->query.offset ==
357 hc->hsp->subject.end - hc->hsp->subject.offset);
358
359 hc = hc->next;
360 while (hc) {
361 ASSERT(hc);
362 ASSERT(hc->hsp);
363 ASSERT(hc->hsp->context == chain->context);
364
365 ASSERT(hc->hsp->gap_info->size > 1 ||
366 hc->hsp->query.end - hc->hsp->query.offset ==
367 hc->hsp->subject.end - hc->hsp->subject.offset);
368
369 if (hc->next) {
370 ASSERT(hc->hsp->query.offset < hc->next->hsp->query.offset);
371 ASSERT(hc->hsp->subject.offset < hc->next->hsp->subject.offset);
372 }
373
374 hc = hc->next;
375 }
376 chain = chain->next;
377 }
378
379 return TRUE;
380 }
381
382 #if _DEBUG
s_TestChainsSorted(HSPChain * chain)383 static Boolean s_TestChainsSorted(HSPChain* chain)
384 {
385 HSPChain* prev;
386
387 s_TestChains(chain);
388
389 prev = chain;
390 chain = chain->next;
391 for (; chain; chain = chain->next, prev = prev->next) {
392 ASSERT(prev->score >= chain->score);
393 }
394
395 return TRUE;
396 }
397 #endif
398
399 /** Data structure used by the writer */
400 typedef struct BlastHSPMapperData {
401 BlastHSPMapperParams* params; /**< how many hits to save */
402 BLAST_SequenceBlk* query; /**< query sequence */
403 BlastQueryInfo* query_info; /**< information about queries */
404 HSPChain** saved_chains; /**< HSP chains are stored here */
405 } BlastHSPMapperData;
406
407 /*************************************************************/
408 /** The following are implementations for BlastHSPWriter ADT */
409
410 /** Perform pre-run stage-specific initialization
411 * @param data The internal data structure [in][out]
412 * @param results The HSP results to operate on [in]
413 */
414 static int
s_BlastHSPMapperPairedInit(void * data,void * results)415 s_BlastHSPMapperPairedInit(void* data, void* results)
416 {
417 BlastHSPMapperData * spl_data = data;
418 BlastHSPResults* r = (BlastHSPResults*)results;
419 spl_data->saved_chains = calloc(r->num_queries, sizeof(HSPChain*));
420
421 return 0;
422 }
423
424
425 /* Get subject start position */
s_FindFragmentStart(HSPChain * chain)426 static Int4 s_FindFragmentStart(HSPChain* chain)
427 {
428 Int2 frame;
429 HSPContainer* hc = NULL;
430 ASSERT(chain);
431 ASSERT(chain->hsps);
432 ASSERT(chain->hsps->hsp);
433
434 frame = chain->hsps->hsp->query.frame;
435 if (frame > 0) {
436 return chain->hsps->hsp->subject.offset;
437 }
438
439 hc = chain->hsps;
440 while (hc->next) {
441 hc = hc->next;
442 }
443 ASSERT(hc);
444
445 return hc->hsp->subject.end - 1;
446 }
447
448
449 /* Get subject end position */
450 /* Not used
451 static Int4 s_FindFragmentEnd(HSPChain* chain)
452 {
453 Int2 frame;
454 HSPContainer* hc = NULL;
455 ASSERT(chain);
456 ASSERT(chain->hsps);
457 ASSERT(chain->hsps->hsp);
458
459 frame = chain->hsps->hsp->query.frame;
460 if (frame < 0) {
461 return chain->hsps->hsp->subject.offset;
462 }
463
464 hc = chain->hsps;
465 while (hc->next) {
466 hc = hc->next;
467 }
468 ASSERT(hc);
469
470 return hc->hsp->subject.end - 1;
471 }
472 */
473
474 /* Compute HSP alignment score from Jumper edit script */
s_ComputeAlignmentScore(BlastHSP * hsp,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score)475 static Int4 s_ComputeAlignmentScore(BlastHSP* hsp, Int4 mismatch_score,
476 Int4 gap_open_score, Int4 gap_extend_score)
477 {
478 Int4 i;
479 Int4 last_pos = hsp->query.offset;
480 Int4 score = 0;
481 const Int4 kGap = 15;
482
483 for (i = 0;i < hsp->map_info->edits->num_edits;i++) {
484 JumperEdit* e = &(hsp->map_info->edits->edits[i]);
485 Int4 num_matches = e->query_pos - last_pos;
486 ASSERT(num_matches >= 0);
487 last_pos = e->query_pos;
488 score += num_matches;
489
490 if (e->query_base == kGap) {
491 ASSERT(e->subject_base != kGap);
492 if (num_matches > 0 ||
493 (i > 0 && hsp->map_info->edits->edits[i - 1].query_base !=
494 kGap)) {
495
496 score += gap_open_score;
497 }
498 score += gap_extend_score;
499 }
500 else if (e->subject_base == kGap) {
501 if (num_matches > 0 ||
502 (i > 0 && hsp->map_info->edits->edits[i - 1].subject_base !=
503 kGap)) {
504
505 score += gap_open_score;
506 }
507 score += gap_extend_score;
508 last_pos++;
509 }
510 else {
511 score += mismatch_score;
512 last_pos++;
513 }
514 }
515
516 score += hsp->query.end - last_pos;
517 return score;
518 }
519
520
521 /* Find the cost of chain HSPs overlapping on the query, as the smaller score
522 of the overlapping region */
s_GetOverlapCost(const BlastHSP * a,const BlastHSP * b,Int4 edit_penalty)523 static Int4 s_GetOverlapCost(const BlastHSP* a, const BlastHSP* b,
524 Int4 edit_penalty)
525 {
526 Int4 i;
527 Int4 overlap_f, overlap_s;
528 const BlastHSP* f;
529 const BlastHSP* s;
530
531 /* if one HSP in contained within the other on the query, return the
532 smaller score */
533 if ((a->query.offset <= b->query.offset && a->query.end >= b->query.end) ||
534 (a->query.offset >= b->query.offset && a->query.end <= b->query.end)) {
535
536 return MIN(a->score, b->score);
537 }
538
539 /* if the two HSPs are mutually exclusive on the query, there is no cost */
540 if ((a->query.end < b->query.offset && a->query.offset < b->query.end) ||
541 (b->query.end < a->query.offset && b->query.offset < a->query.end)) {
542
543 return 0;
544 }
545
546 /* otherwise the HSPs partially overlap; reurn the the smaller score for
547 the overlap region */
548
549 /* find which HSP precedes which on the query */
550 if (a->query.offset <= b->query.offset) {
551 f = a;
552 s = b;
553 }
554 else {
555 f = b;
556 s = a;
557 }
558
559 /* this is the overlap score assuming perfect matches in the overlap */
560 overlap_f = overlap_s = f->query.end - s->query.offset;
561 ASSERT(overlap_f >= 0 && overlap_s >= 0);
562 /* subtract penalties for mismatches and gaps in the overlap region */
563 for (i = 0;i < f->map_info->edits->num_edits;i++) {
564 if (f->map_info->edits->edits[i].query_pos >= s->query.offset) {
565 overlap_f -= edit_penalty;
566 }
567 }
568 for (i = 0;i < s->map_info->edits->num_edits;i++) {
569 if (s->map_info->edits->edits[i].query_pos < f->query.end) {
570 overlap_s -= edit_penalty;
571 }
572 }
573
574 return MIN(overlap_f, overlap_s);
575 }
576
577
578 /* Compute chain score */
s_ComputeChainScore(HSPChain * chain,const ScoringOptions * score_options,Int4 query_len,Boolean comp_hsp_score)579 static Int4 s_ComputeChainScore(HSPChain* chain,
580 const ScoringOptions* score_options,
581 Int4 query_len,
582 Boolean comp_hsp_score)
583 {
584 Int4 retval;
585 HSPContainer* h = NULL;
586 HSPContainer* prev = NULL;
587
588 if (!chain || !score_options) {
589 return -1;
590 }
591
592 h = chain->hsps;
593 if (comp_hsp_score) {
594 h->hsp->score = s_ComputeAlignmentScore(h->hsp, score_options->penalty,
595 score_options->gap_open,
596 score_options->gap_extend);
597 }
598 retval = h->hsp->score;
599 if (h->hsp->query.offset > 0 &&
600 (h->hsp->map_info->left_edge & MAPPER_SPLICE_SIGNAL) == 0) {
601
602 retval += score_options->no_splice_signal;
603 }
604 if (h->hsp->query.end < query_len &&
605 (h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) == 0) {
606
607 retval += score_options->no_splice_signal;
608 }
609
610 prev = h;
611 h = h->next;
612
613 for (; h; h = h->next, prev = prev->next) {
614 /* HSPs must be sorted by query positon */
615 ASSERT(h->hsp->query.offset >= prev->hsp->query.offset);
616
617 if (comp_hsp_score) {
618
619 h->hsp->score = s_ComputeAlignmentScore(h->hsp,
620 score_options->penalty,
621 score_options->gap_open,
622 score_options->gap_extend);
623 }
624 retval += h->hsp->score;
625
626 if (h->hsp->query.offset > 0 &&
627 (h->hsp->map_info->left_edge & MAPPER_SPLICE_SIGNAL) == 0) {
628
629 retval += score_options->no_splice_signal;
630 }
631 if (h->hsp->query.end < query_len &&
632 (h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) == 0) {
633
634 retval += score_options->no_splice_signal;
635 }
636 }
637
638 return retval;
639 }
640
641 #if _DEBUG
s_TestHSPRanges(const BlastHSP * hsp)642 static Boolean s_TestHSPRanges(const BlastHSP* hsp)
643 {
644 Int4 i;
645 Int4 d = 0;
646 for (i=0;i < hsp->gap_info->size;i++) {
647 switch (hsp->gap_info->op_type[i]) {
648 case eGapAlignIns:
649 d -= hsp->gap_info->num[i];
650 break;
651
652 case eGapAlignDel:
653 d += hsp->gap_info->num[i];
654 break;
655
656 default:
657 break;
658 }
659 }
660 if (hsp->query.end - hsp->query.offset + d !=
661 hsp->subject.end - hsp->subject.offset) {
662
663 return FALSE;
664 }
665
666 return TRUE;
667 }
668 #endif
669
670 /* Trim HSP by a number of bases on query or subject, either from the start or
671 from the end */
s_TrimHSP(BlastHSP * hsp,Int4 num,Boolean is_query,Boolean is_start,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score)672 static Int4 s_TrimHSP(BlastHSP* hsp, Int4 num, Boolean is_query,
673 Boolean is_start, Int4 mismatch_score,
674 Int4 gap_open_score, Int4 gap_extend_score)
675 {
676 Int4 num_left = num;
677 Int4 i = is_start ? 0 : hsp->gap_info->size - 1;
678 Int4 d = is_start ? 1 : -1;
679 Int4 end = is_start ? hsp->gap_info->size : -1;
680 Int4 delta_query = 0, delta_subject = 0;
681 Boolean is_subject = !is_query;
682 Boolean is_end = !is_start;
683 Int4 k;
684
685 ASSERT(hsp /* */ && num > 0 /* */);
686 ASSERT((is_query && num <= hsp->query.end - hsp->query.offset) ||
687 (!is_query && num <= hsp->subject.end - hsp->subject.offset));
688
689 #if _DEBUG
690 ASSERT(s_TestHSPRanges(hsp));
691 #endif
692
693 if (num == 0) {
694 return 0;
695 }
696
697
698 /* itereate over the edit script and remove the required number
699 of subject bases */
700 while (i != end && num_left > 0) {
701 switch (hsp->gap_info->op_type[i]) {
702 case eGapAlignSub:
703 if (hsp->gap_info->num[i] > num_left) {
704 hsp->gap_info->num[i] -= num_left;
705 delta_query += num_left;
706 delta_subject += num_left;
707 num_left = 0;
708 }
709 else {
710 Int4 n = hsp->gap_info->num[i];
711
712 delta_query += n;
713 delta_subject += n;
714 num_left -= n;
715 i += d;
716 }
717 break;
718
719 case eGapAlignDel:
720 if (is_subject) {
721 if (hsp->gap_info->num[i] > num_left) {
722 hsp->gap_info->num[i] -= num_left;
723 delta_subject += num_left;
724 num_left = 0;
725 }
726 else {
727 delta_subject += hsp->gap_info->num[i];
728 num_left -= hsp->gap_info->num[i];
729 i += d;
730 }
731 }
732 else {
733 delta_subject += hsp->gap_info->num[i];
734 i += d;
735 }
736 break;
737
738 case eGapAlignIns:
739 if (is_query) {
740 if (hsp->gap_info->num[i] > num_left) {
741 hsp->gap_info->num[i] -= num_left;
742 delta_query += num_left;
743 num_left = 0;
744 }
745 else {
746 delta_query += hsp->gap_info->num[i];
747 num_left -= hsp->gap_info->num[i];
748 i += d;
749 }
750 }
751 else {
752 delta_query += hsp->gap_info->num[i];
753 i += d;
754 }
755 break;
756
757 default:
758 ASSERT(0);
759 break;
760 }
761 }
762
763 /* shift edit script elements to fill the removed ones */
764 if (is_start && i > 0) {
765 for (k = 0;k < hsp->gap_info->size - i;k++) {
766 hsp->gap_info->op_type[k] = hsp->gap_info->op_type[k + i];
767 hsp->gap_info->num[k] = hsp->gap_info->num[k + i];
768 }
769 hsp->gap_info->size -= i;
770 }
771
772 if (is_end) {
773 hsp->gap_info->size = i + 1;
774 }
775
776 /* update the Jumper edit script */
777 if (is_start) {
778 Int4 k = hsp->map_info->edits->num_edits - 1;
779 Int4 p = hsp->query.end - 1;
780 const Uint1 kGap = 15;
781 for (i = hsp->gap_info->size - 1;i >= 0;i--) {
782 if (hsp->gap_info->op_type[i] != eGapAlignDel) {
783
784 p -= hsp->gap_info->num[i];
785 while (k >= 0 &&
786 hsp->map_info->edits->edits[k].query_pos > p &&
787 hsp->map_info->edits->edits[k].query_base != kGap) {
788
789 k--;
790 }
791 }
792 else {
793 Int4 j;
794 for (j = 0;j < hsp->gap_info->num[i];j++) {
795 ASSERT(k >= 0);
796 ASSERT(hsp->map_info->edits->edits[k].query_base == kGap);
797 k--;
798 }
799 }
800 }
801 k++;
802 if (k > 0) {
803 for (i = 0;i < hsp->map_info->edits->num_edits - k;i++) {
804 hsp->map_info->edits->edits[i] =
805 hsp->map_info->edits->edits[i + k];
806 }
807 hsp->map_info->edits->num_edits -= k;
808 ASSERT(hsp->map_info->edits->num_edits >= 0);
809 }
810 }
811 else {
812 Int4 k = 0;
813 Int4 p = hsp->query.offset;
814 const Uint1 kGap = 15;
815
816 for (i = 0;i < hsp->gap_info->size;i++) {
817 if (hsp->gap_info->op_type[i] != eGapAlignDel) {
818
819 p += hsp->gap_info->num[i];
820 while (k < hsp->map_info->edits->num_edits &&
821 hsp->map_info->edits->edits[k].query_pos < p &&
822 hsp->map_info->edits->edits[k].query_base != kGap) {
823
824 k++;
825 }
826 }
827 else {
828 Int4 j;
829 for (j = 0;j < hsp->gap_info->num[i];j++) {
830 ASSERT(hsp->map_info->edits->edits[k].query_base == kGap);
831 k++;
832 }
833 }
834 }
835 hsp->map_info->edits->num_edits = k;
836 }
837
838 /* update HSP start positions */
839 if (is_start) {
840 hsp->query.offset += delta_query;
841 hsp->subject.offset += delta_subject;
842 }
843 else {
844 hsp->query.end -= delta_query;
845 hsp->subject.end -= delta_subject;
846 }
847
848 /* update HSP score */
849 hsp->score = s_ComputeAlignmentScore(hsp, mismatch_score, gap_open_score,
850 gap_extend_score);
851
852 #if _DEBUG
853 ASSERT(s_TestHSPRanges(hsp));
854 #endif
855
856 return 0;
857 }
858
859
860 #define NUM_ADAPTERS 4
861 #define MAX_ADAPTER_LEN 20
862
863 /* Find adapeter on the 5' end in a single sequence. The search will start
864 towards the end of the last HSP (from, to) */
s_FindAdapterInSequence(Int4 hsp_from,Int4 hsp_to,Uint1 * query,Int4 query_len)865 static Int4 s_FindAdapterInSequence(Int4 hsp_from, Int4 hsp_to, Uint1* query,
866 Int4 query_len)
867 {
868 Uint1 adapters_tab[NUM_ADAPTERS][MAX_ADAPTER_LEN] = {
869 /* Contaminants from FastQC config file:
870 http://www.bioinformatics.babraham.ac.uk/projects/fastqc/ */
871 /* Illumina universal adapter: AGATCGGAAGAG */
872 {0, 2, 0, 3, 1, 2, 2, 0, 0, 2, 0, 2},
873 /* Illumina small RNA adapter: ATGGAATTCTCG */
874 {0, 3, 2, 2, 0, 0, 3, 3, 1, 3, 1, 2},
875 /* Nextera transosase sequence: CTGTCTCTTATA */
876 {1, 3, 2, 3, 1, 3, 1, 3, 3, 0, 3, 0},
877 /* Illumina multiplexing adapter 1: GATCGGAAGAGCACACGTCT */
878 {2, 0, 3, 1, 2, 2, 0, 0, 2, 0, 2, 1, 0, 1, 0, 1, 2, 3, 1, 3}};
879
880 int lengths[NUM_ADAPTERS] = {12, 12, 12, 20};
881 Boolean found = FALSE;
882 Int4 adapter_start = -1;
883 int adptr_idx;
884 Int4 from = hsp_from, to = hsp_to;
885 const Int4 kMaxErrors = 1; /* max number of mismaches allowed for matching
886 the adapter sequence + 1*/
887
888
889 if (!query) {
890 return -1;
891 }
892
893 for (adptr_idx = 0;!found && adptr_idx < NUM_ADAPTERS;adptr_idx++) {
894 Uint1* adapter = adapters_tab[adptr_idx];
895 Uint4 word = *(Uint4*)adapter;
896 Int4 q = MAX(to - lengths[adptr_idx], from);
897 int i = 0;
898 while (q < query_len - 4 && q + i < query_len && i < lengths[adptr_idx]) {
899 while (q < query_len - 4 && *(Uint4*)(query + q) != word) {
900 q++;
901 }
902 if (q < query_len - 4) {
903 Int4 errors = kMaxErrors + 1;
904 i = 0;
905 while (q + i < query_len && i < lengths[adptr_idx] &&
906 errors) {
907
908 if (query[q + i] != adapter[i]) {
909 errors--;
910 }
911 i++;
912 }
913 if (q + i == query_len || i == lengths[adptr_idx]) {
914 adapter_start = q;
915 found = TRUE;
916 break;
917 }
918
919 q++;
920 }
921 }
922 }
923
924 ASSERT(adapter_start <= query_len);
925 return adapter_start;
926 }
927
928
929 /* Set adapter position in the chain and trim alignments that span beyond
930 adapter start position */
s_SetAdapter(HSPChain ** chains_ptr,Int4 adapter_pos,Int4 query_len,const ScoringOptions * scores)931 static Int2 s_SetAdapter(HSPChain** chains_ptr, Int4 adapter_pos,
932 Int4 query_len, const ScoringOptions* scores)
933 {
934 HSPChain* head = NULL;
935 HSPChain* chain = NULL;
936 const Int4 kMinAdapterLen = 3;
937
938 if (!chains_ptr || !*chains_ptr || adapter_pos < 0) {
939 return -1;
940 }
941
942 head = *chains_ptr;
943 chain = *chains_ptr;
944 /* iterate over chains */
945 while (chain) {
946 BlastHSP* hsp = chain->hsps->hsp;
947
948 /* check the 3' alignment extent on the query and skip this chain
949 if the query is covered almost to the end */
950 if (hsp->query.frame > 0) {
951 /* for positive strand, check the last query position */
952 HSPContainer* h = chain->hsps;
953 while (h->next) {
954 h = h->next;
955 }
956 ASSERT(h);
957 if (query_len - h->hsp->query.end < kMinAdapterLen) {
958 chain = chain->next;
959 continue;
960 }
961 }
962 else {
963 /* for negative strand, check the first query position */
964 ASSERT(hsp);
965 if (hsp->query.offset < kMinAdapterLen) {
966 chain = chain->next;
967 continue;
968 }
969 }
970
971 /* set adapter start position in the chain */
972 chain->adapter = adapter_pos;
973
974 /* Trim alignment that covers part of the adapter */
975 /* for query on the positive strand */
976 if (hsp->query.frame > 0) {
977 HSPContainer* h = chain->hsps;
978
979 /* if all HSPs are behind the adapter, delete this chain;
980 we require that HSP without adapter is at least 5 bases long */
981 if (h->hsp->query.offset >= adapter_pos - 5) {
982 HSPChain* prev = head;
983 HSPChain* next = chain->next;
984 while (prev && prev->next != chain) {
985 prev =prev->next;
986 }
987
988 chain->next = NULL;
989 HSPChainFree(chain);
990 chain = next;
991 if (prev) {
992 prev->next = next;
993 }
994 else {
995 head = chain;
996 }
997
998 continue;
999 }
1000
1001 /* find the first HSP with end position after adapter */
1002 while (h && h->hsp->query.end <= adapter_pos) {
1003 h = h->next;
1004 }
1005
1006 /* if one is found */
1007 if (h) {
1008 HSPContainer* hh = NULL;
1009 hsp = h->hsp;
1010
1011 /* if adapter is with in the HSP trim the HSP */
1012 if (hsp->query.offset < adapter_pos ) {
1013 Int4 old_score = hsp->score;
1014 ASSERT(hsp->query.end - adapter_pos > 0);
1015 s_TrimHSP(hsp, hsp->query.end - adapter_pos, TRUE,
1016 FALSE, scores->penalty, scores->gap_open,
1017 scores->gap_extend);
1018 chain->score += hsp->score - old_score;
1019
1020 /* remove HSPs after h as they are past the adapter */
1021 if (h->next) {
1022 h->next = HSPContainerFree(h->next);
1023 }
1024 }
1025 else {
1026 /* otherwise delete h and everything after it */
1027 hh = chain->hsps;
1028 while (hh && hh->next != h) {
1029 hh = hh->next;
1030 }
1031 ASSERT(hh);
1032 hh->next = HSPContainerFree(hh->next);
1033 }
1034
1035 /* compute the new chain score */
1036 chain->score = s_ComputeChainScore(chain, scores, query_len,
1037 FALSE);
1038 }
1039
1040 /* mark adapter in the HSPs */
1041 h = chain->hsps;
1042 while (h->next) {
1043 h = h->next;
1044 }
1045 h->hsp->map_info->right_edge |= MAPPER_ADAPTER;
1046 h->hsp->map_info->right_edge |= MAPPER_EXON;
1047 }
1048 else {
1049 /* negative strand: adapter is in the beginning of the chain */
1050 Int4 pos_minus = query_len - adapter_pos - 1;
1051 HSPContainer* h = chain->hsps;
1052
1053 /* find the first HSP with end position after adapter;
1054 we require that HSP without adapter is at least 5 bases long */
1055 while (h && h->hsp->query.end <= pos_minus + 5) {
1056 h = h->next;
1057 }
1058
1059 /* if all HSPs are before the adapter, delete this chain */
1060 if (!h) {
1061 /* remove chain */
1062 HSPChain* prev = head;
1063 HSPChain* next = chain->next;
1064 while (prev && prev->next != chain) {
1065 prev =prev->next;
1066 }
1067
1068 chain->next = NULL;
1069 HSPChainFree(chain);
1070 chain = next;
1071 if (prev) {
1072 prev->next = next;
1073 }
1074 else {
1075 head = chain;
1076 }
1077
1078 continue;
1079 }
1080
1081 /* if h is not the first HSP in the chain, remove all HSPs before
1082 h */
1083 if (h != chain->hsps) {
1084 HSPContainer* hh = chain->hsps;
1085 while (hh && hh->next != h) {
1086 hh = hh->next;
1087 }
1088 ASSERT(hh);
1089 hh->next = NULL;
1090 HSPContainerFree(chain->hsps);
1091 chain->hsps = h;
1092
1093 /* compute new chain score */
1094 chain->score = s_ComputeChainScore(chain, scores, query_len,
1095 FALSE);
1096 }
1097 ASSERT(h);
1098
1099 /* set adapter information */
1100 hsp = chain->hsps->hsp;
1101 hsp->map_info->left_edge |= MAPPER_ADAPTER;
1102 hsp->map_info->left_edge |= MAPPER_EXON;
1103
1104 /* trim HSP if necessary */
1105 if (pos_minus >= hsp->query.offset) {
1106 Int4 old_score = hsp->score;
1107 ASSERT(pos_minus - hsp->query.offset + 1 > 0);
1108 s_TrimHSP(hsp, pos_minus - hsp->query.offset + 1,
1109 TRUE, TRUE, scores->penalty, scores->gap_open,
1110 scores->gap_extend);
1111 chain->score += hsp->score - old_score;
1112 }
1113 }
1114
1115 chain = chain->next;
1116 }
1117
1118 ASSERT(!head || s_TestChains(head));
1119 *chains_ptr = head;
1120
1121 return 0;
1122 }
1123
1124
s_FindAdapters(HSPChain ** saved,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * score_opts)1125 static Int4 s_FindAdapters(HSPChain** saved,
1126 const BLAST_SequenceBlk* query_blk,
1127 const BlastQueryInfo* query_info,
1128 const ScoringOptions* score_opts)
1129 {
1130 Int4 query_idx;
1131
1132 for (query_idx = 0;query_idx < query_info->num_queries;query_idx++) {
1133 HSPChain* chain = NULL;
1134 HSPChain* ch = NULL;
1135 Uint1* query = NULL;
1136 Int4 query_len;
1137 Int4 from = -1, to = -1;
1138 BlastHSP* hsp = NULL;
1139 Int4 overhang = 0;
1140 Int4 adapter_pos = -1;
1141
1142 if (!saved[query_idx]) {
1143 continue;
1144 }
1145
1146 /* we search for adapters and only in the plus strand of the query */
1147 query = query_blk->sequence +
1148 query_info->contexts[query_idx * NUM_STRANDS].query_offset;
1149 ASSERT(query);
1150 query_len = query_info->contexts[query_idx * NUM_STRANDS].query_length;
1151
1152 /* find the best scoring chain */
1153 chain = saved[query_idx];
1154 ch = chain->next;
1155 for (; ch; ch = ch->next) {
1156 if (ch->score > chain->score) {
1157 chain = ch;
1158 }
1159 }
1160 ASSERT(chain);
1161
1162 /* find query coverage by the whole chain */
1163 if (chain->hsps->hsp->query.frame > 0) {
1164 HSPContainer* h = chain->hsps;
1165 from = h->hsp->query.offset;
1166 while (h->next) {
1167 h = h->next;
1168 }
1169 to = h->hsp->query.end;
1170 }
1171 else {
1172 HSPContainer* h = chain->hsps;
1173 to = query_len - h->hsp->query.offset;
1174 while (h->next) {
1175 h = h->next;
1176 }
1177 from = query_len - h->hsp->query.end;
1178 }
1179 ASSERT(from >= 0 && to >= 0);
1180
1181 /* do not search for adapters if the query is mostly covered by the
1182 best scoring chain */
1183 if (from < 20 && to > query_len - 3) {
1184 continue;
1185 }
1186
1187 /* find the chain with the longest overhang */
1188 chain = saved[query_idx];
1189 ch = chain->next;
1190 for (; ch; ch = ch->next) {
1191 HSPContainer* h = ch->hsps;
1192 if (h->hsp->query.frame > 0) {
1193 while (h->next) {
1194 h = h->next;
1195 }
1196 if (query_len - h->hsp->query.end > overhang) {
1197 overhang = query_len - h->hsp->query.end;
1198 chain = ch;
1199 }
1200 }
1201 else {
1202 if (h->hsp->query.offset + 1 > overhang) {
1203 overhang = h->hsp->query.offset + 1;
1204 chain = ch;
1205 }
1206 }
1207 }
1208 ASSERT(chain);
1209
1210 /* find location from where to search for the adapter */
1211 hsp = chain->hsps->hsp;
1212 if (hsp->query.frame > 0) {
1213 HSPContainer* h = chain->hsps;
1214 while (h->next) {
1215 h = h->next;
1216 }
1217 hsp = h->hsp;
1218 }
1219 ASSERT(hsp);
1220
1221 if (hsp->query.frame > 0) {
1222 from = hsp->query.offset;
1223 to = hsp->query.end;
1224 }
1225 else {
1226 from = query_len - hsp->query.end;
1227 to = query_len - hsp->query.offset;
1228 }
1229
1230 /* do not search for adapters if the read aligns almost to the end */
1231 if (to >= query_len - 3) {
1232 continue;
1233 }
1234
1235 /* search for adapters */
1236 adapter_pos = s_FindAdapterInSequence(from, to, query, query_len);
1237
1238 /* set adapter information in all chains and trim HSPs that extend
1239 into the adapter */
1240 if (adapter_pos >= 0) {
1241 s_SetAdapter(&(saved[query_idx]), adapter_pos, query_len,
1242 score_opts);
1243 }
1244 }
1245
1246 return 0;
1247 }
1248
1249
1250 /* Search for polyA tail at the end of a sequence and return the start position */
s_FindPolyAInSequence(Uint1 * sequence,Int4 length)1251 static Int4 s_FindPolyAInSequence(Uint1* sequence, Int4 length)
1252 {
1253 Int4 i;
1254 const Uint1 kBaseA = 0;
1255 Int4 num_a = 0;
1256 Int4 err = 0;
1257 const Int4 kMaxErrors = 3;
1258
1259 if (!sequence) {
1260 return -1;
1261 }
1262
1263 /* iterate over positions untile kMaxErrors non A bases are found */
1264 i = length - 1;
1265 while (i >= 0 && err < kMaxErrors) {
1266 if (sequence[i] != kBaseA) {
1267 err++;
1268 }
1269
1270 i--;
1271 }
1272 i++;
1273
1274 /* find the beginnig of the string of As */
1275 while (i < length - 1 &&
1276 (sequence[i] != kBaseA || sequence[i + 1] != kBaseA)) {
1277
1278 if (sequence[i] != kBaseA) {
1279 err--;
1280 }
1281 i++;
1282 }
1283
1284 num_a = length - i - err;
1285
1286 /* short tails must have no errors */
1287 if (num_a < 3 || (num_a < 5 && err > 0)) {
1288 return -1;
1289 }
1290
1291 return i;
1292 }
1293
1294
1295 /* Set PolyA information in HSP chains */
s_SetPolyATail(HSPChain * chains,Int4 positive_start,Int4 negative_start,Int4 query_len)1296 static Int4 s_SetPolyATail(HSPChain* chains, Int4 positive_start,
1297 Int4 negative_start, Int4 query_len)
1298 {
1299 HSPChain* ch = NULL;
1300
1301 if (!chains) {
1302 return -1;
1303 }
1304
1305 for (ch = chains; ch; ch = ch->next) {
1306 HSPContainer* h = ch->hsps;
1307 while (h->next) {
1308 h = h->next;
1309 }
1310
1311 if (query_len - h->hsp->query.end < 5) {
1312 continue;
1313 }
1314
1315 if ((h->hsp->query.frame < 0 && negative_start >= 0) ||
1316 (h->hsp->query.frame > 0 && positive_start >= 0)) {
1317
1318 h->hsp->map_info->right_edge |= MAPPER_POLY_A;
1319 h->hsp->map_info->right_edge |= MAPPER_EXON;
1320
1321 if (h->hsp->query.frame > 0) {
1322 ch->polyA = MAX(positive_start, h->hsp->query.end);
1323 }
1324 else {
1325 ch->polyA = MAX(negative_start, h->hsp->query.end);
1326 }
1327 }
1328 }
1329
1330 return 0;
1331 }
1332
1333
s_FindPolyATails(HSPChain ** saved,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info)1334 static Int4 s_FindPolyATails(HSPChain** saved,
1335 const BLAST_SequenceBlk* query_blk,
1336 const BlastQueryInfo* query_info)
1337 {
1338 Int4 query_idx;
1339
1340 /* for each query */
1341 for (query_idx = 0;query_idx < query_info->num_queries;query_idx++) {
1342 HSPChain* chain = NULL;
1343 HSPChain* ch = NULL;
1344 Uint1* query = NULL;
1345 Int4 query_len;
1346 Int4 from = -1, to = -1;
1347 Int4 positive_start, negative_start;
1348
1349 /* skip queries with no results and those with adapters */
1350 if (!saved[query_idx] || saved[query_idx]->adapter >= 0) {
1351 continue;
1352 }
1353
1354 query_len = query_info->contexts[query_idx * NUM_STRANDS].query_length;
1355
1356 /* find the best scoring chain */
1357 chain = saved[query_idx];
1358 ch = chain->next;
1359 for (; ch; ch = ch->next) {
1360 if (ch->score > chain->score) {
1361 chain = ch;
1362 }
1363 }
1364 ASSERT(chain);
1365
1366 /* find query coverage by the whole chain */
1367 if (chain->hsps->hsp->query.frame > 0) {
1368 HSPContainer* h = chain->hsps;
1369 from = h->hsp->query.offset;
1370 while (h->next) {
1371 h = h->next;
1372 }
1373 to = h->hsp->query.end;
1374 }
1375 else {
1376 HSPContainer* h = chain->hsps;
1377 to = query_len - h->hsp->query.offset;
1378 while (h->next) {
1379 h = h->next;
1380 }
1381 from = query_len - h->hsp->query.end;
1382 }
1383 ASSERT(from >= 0 && to >= 0);
1384
1385 /* do not search for polyA tails if the query is mostly covered by the
1386 best scoring chain */
1387 if (from < 4 && to > query_len - 3) {
1388 continue;
1389 }
1390
1391 /* search for polyA tail on the positive strand */
1392 query = query_blk->sequence +
1393 query_info->contexts[query_idx * NUM_STRANDS].query_offset;
1394 positive_start = s_FindPolyAInSequence(query, query_len);
1395
1396 /* search for the polyA tail on the negative strand */
1397 query = query_blk->sequence +
1398 query_info->contexts[query_idx * NUM_STRANDS + 1].query_offset;
1399 negative_start = s_FindPolyAInSequence(query, query_len);
1400
1401 /* set polyA start positions and HSP flags in the saved chains */
1402 if (positive_start >= 0 || negative_start >= 0) {
1403 s_SetPolyATail(saved[query_idx], positive_start, negative_start,
1404 query_len);
1405 }
1406 }
1407
1408 return 0;
1409 }
1410
1411
1412 /* Merge two HSPs into one. The two HSPs may only be separated by at most
1413 one gap or mismatch. Function returns NULL is the HSPs cannon be merged */
s_MergeHSPs(const BlastHSP * first,const BlastHSP * second,const Uint1 * query,const ScoringOptions * score_opts)1414 static BlastHSP* s_MergeHSPs(const BlastHSP* first, const BlastHSP* second,
1415 const Uint1* query,
1416 const ScoringOptions* score_opts)
1417 {
1418 BlastHSP* merged_hsp = NULL; /* this will be the result */
1419 const BlastHSP* hsp = second;
1420 Int4 query_gap;
1421 Int4 subject_gap;
1422 Int4 mismatches = 0;
1423 Int4 gap_info_size;
1424 Int4 edits_size;
1425 Int4 k;
1426
1427 if (!first || !second || !query || !score_opts) {
1428 return NULL;
1429 }
1430
1431 merged_hsp = Blast_HSPClone(first);
1432 if (!merged_hsp) {
1433 return NULL;
1434 }
1435
1436 query_gap = second->query.offset - first->query.end;
1437 subject_gap = second->subject.offset - first->subject.end;
1438
1439 if (query_gap < 0 || subject_gap < 0 || query_gap > 1 ||
1440 query_gap != subject_gap) {
1441
1442 return NULL;
1443 }
1444
1445 gap_info_size = first->gap_info->size + second->gap_info->size +
1446 MAX(query_gap, subject_gap);
1447
1448 edits_size = first->map_info->edits->num_edits +
1449 second->map_info->edits->num_edits + MAX(query_gap, subject_gap);
1450
1451 /* FIXME: should be done through an API */
1452 /* reallocate memory for edit scripts */
1453 merged_hsp->gap_info->op_type = realloc(merged_hsp->gap_info->op_type,
1454 gap_info_size *
1455 sizeof(EGapAlignOpType));
1456
1457 merged_hsp->gap_info->num = realloc(merged_hsp->gap_info->num,
1458 gap_info_size *
1459 sizeof(Int4));
1460
1461 merged_hsp->map_info->edits->edits = realloc(
1462 merged_hsp->map_info->edits->edits,
1463 edits_size * sizeof(JumperEdit));
1464
1465 if (!merged_hsp->gap_info->op_type ||
1466 !merged_hsp->gap_info->num ||
1467 !merged_hsp->map_info->edits->edits) {
1468
1469 Blast_HSPFree(merged_hsp);
1470 return NULL;
1471 }
1472
1473 if (query_gap == subject_gap) {
1474 mismatches = query_gap;
1475 }
1476
1477 /* add mismatches to gap_info */
1478 if (mismatches > 0) {
1479 if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1480 == eGapAlignSub) {
1481 merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1482 mismatches;
1483 }
1484 else {
1485 merged_hsp->gap_info->op_type[merged_hsp->gap_info->size] =
1486 eGapAlignSub;
1487 merged_hsp->gap_info->num[merged_hsp->gap_info->size] =
1488 mismatches;
1489 merged_hsp->gap_info->size++;
1490 }
1491 ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1492 }
1493
1494 /* merge gap_info */
1495 for (k = 0;k < hsp->gap_info->size;k++) {
1496
1497 if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1498 == hsp->gap_info->op_type[k]) {
1499
1500 merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1501 hsp->gap_info->num[k];
1502 }
1503 else {
1504 merged_hsp->gap_info->op_type[merged_hsp->gap_info->size]
1505 = hsp->gap_info->op_type[k];
1506
1507 merged_hsp->gap_info->num[merged_hsp->gap_info->size++]
1508 = hsp->gap_info->num[k];
1509 }
1510 ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1511 }
1512
1513 /* add mismatches to jumper edits */
1514 if (mismatches > 0) {
1515 JumperEdit* edit = merged_hsp->map_info->edits->edits +
1516 merged_hsp->map_info->edits->num_edits++;
1517 edit->query_pos = merged_hsp->query.end;
1518 /* FIXME: Mismatch bases cannot be currently set because there is
1519 no access to query or subject sequence in this function. */
1520 edit->query_base = query[edit->query_pos];
1521 edit->subject_base = edit->query_base;
1522
1523 ASSERT(merged_hsp->map_info->edits->num_edits <= edits_size);
1524 }
1525
1526 /* merge jumper edits */
1527 if (hsp->map_info->edits->num_edits) {
1528 JumperEdit* edit = merged_hsp->map_info->edits->edits +
1529 merged_hsp->map_info->edits->num_edits;
1530
1531 ASSERT(merged_hsp->map_info->edits->num_edits +
1532 hsp->map_info->edits->num_edits <= edits_size);
1533
1534 memcpy(edit, hsp->map_info->edits->edits,
1535 hsp->map_info->edits->num_edits * sizeof(JumperEdit));
1536
1537 merged_hsp->map_info->edits->num_edits +=
1538 hsp->map_info->edits->num_edits;
1539 }
1540
1541 /* update alignment extents, scores, etc */
1542 merged_hsp->query.end = hsp->query.end;
1543 merged_hsp->subject.end = hsp->subject.end;
1544 merged_hsp->score = s_ComputeAlignmentScore(merged_hsp,
1545 score_opts->penalty,
1546 score_opts->gap_open,
1547 score_opts->gap_extend);
1548 merged_hsp->num_ident += hsp->num_ident;
1549
1550 merged_hsp->map_info->right_edge = hsp->map_info->right_edge;
1551 if (!merged_hsp->map_info->subject_overhangs) {
1552 return merged_hsp;
1553 }
1554
1555 if (!hsp->map_info->subject_overhangs ||
1556 !hsp->map_info->subject_overhangs->right_len ||
1557 !hsp->map_info->subject_overhangs->right) {
1558
1559 merged_hsp->map_info->subject_overhangs->right_len = 0;
1560 if (merged_hsp->map_info->subject_overhangs->right) {
1561 sfree(merged_hsp->map_info->subject_overhangs->right);
1562 merged_hsp->map_info->subject_overhangs->right = NULL;
1563 }
1564 }
1565 else {
1566 Int4 new_right_len = hsp->map_info->subject_overhangs->right_len;
1567 if (merged_hsp->map_info->subject_overhangs->right_len <
1568 new_right_len) {
1569
1570 merged_hsp->map_info->subject_overhangs->right =
1571 realloc(merged_hsp->map_info->subject_overhangs->right,
1572 new_right_len * sizeof(Uint1));
1573 }
1574 memcpy(merged_hsp->map_info->subject_overhangs->right,
1575 hsp->map_info->subject_overhangs->right,
1576 new_right_len * sizeof(Uint1));
1577 }
1578
1579 return merged_hsp;
1580 }
1581
1582
1583 /* Find paired reads mapped to different database sequences */
s_FindRearrangedPairs(HSPChain ** saved,const BlastQueryInfo * query_info)1584 static int s_FindRearrangedPairs(HSPChain** saved,
1585 const BlastQueryInfo* query_info)
1586 {
1587 Int4 query_idx;
1588
1589 /* iterate over single reads from each pair */
1590 for (query_idx = 0;query_idx + 1 < query_info->num_queries; query_idx++) {
1591 HSPChain* chain = saved[query_idx];
1592 HSPChain* thepair = saved[query_idx + 1];
1593
1594 /* skip queries with no results */
1595 if (!chain || !thepair) {
1596 continue;
1597 }
1598
1599 /* skip queries that do not have pairs */
1600 if (query_info->contexts[query_idx * NUM_STRANDS].segment_flags !=
1601 eFirstSegment) {
1602
1603 continue;
1604 }
1605
1606 /* skip queries with more than one saved chain and ones that already
1607 have a pair */
1608 if (chain->next || chain->pair || thepair->next) {
1609 continue;
1610 }
1611
1612 /* skip chains aligned to the same subject; these are taken care of
1613 elsewhere */
1614 if (chain->oid == thepair->oid) {
1615 continue;
1616 }
1617
1618 /* skip HSP chains aligned on the same strands */
1619 if (SIGN(chain->hsps->hsp->query.frame) ==
1620 SIGN(thepair->hsps->hsp->query.frame)) {
1621 continue;
1622 }
1623
1624 /* mark the pair */
1625 chain->pair = thepair;
1626 thepair->pair = chain;
1627
1628 chain->pair_conf = PAIR_PARALLEL;
1629 thepair->pair_conf = PAIR_PARALLEL;
1630 }
1631
1632 return 0;
1633 }
1634
1635
1636 /* Sort chains by score in descending order */
s_CompareChainsByScore(const void * cha,const void * chb)1637 static int s_CompareChainsByScore(const void* cha, const void* chb)
1638 {
1639 const HSPChain* a = *((HSPChain**)cha);
1640 const HSPChain* b = *((HSPChain**)chb);
1641
1642 if (a->score < b->score) {
1643 return 1;
1644 }
1645 else if (a->score > b->score) {
1646 return -1;
1647 }
1648 else {
1649 return 0;
1650 }
1651 }
1652
1653 /* Remove chains with scores below the these of the best ones considering
1654 score bonus for pairs and count the number of chains with same of better
1655 score for each chain */
s_PruneChains(HSPChain ** saved,Int4 num_queries,Int4 pair_bonus)1656 static int s_PruneChains(HSPChain** saved, Int4 num_queries, Int4 pair_bonus)
1657
1658 {
1659 Int4 query_idx;
1660 Int4 array_size = 10;
1661 HSPChain** array = calloc(array_size, sizeof(HSPChain*));
1662
1663 /* Prunning is done in two passes: first considering single chains,
1664 then considering pairs. The first pass breaks pairs where there is much
1665 better chain with no pair. The second pass selects the best scoring
1666 chains giving preference to pairs. Pairs are preferable, so they get
1667 additional score. Two passes are requires, because we compare single
1668 chains with single chains and pairs with pairs. This is to avoid
1669 breaking pairs in situations with two pairs with one high scoring
1670 chain. */
1671
1672 /* first pass: for each query remove poor scoring chains that may split
1673 pairs */
1674 for (query_idx = 0; query_idx < num_queries; query_idx++) {
1675 HSPChain* chain = saved[query_idx];
1676 HSPChain* prev = NULL;
1677 Int4 best_score = 0;
1678
1679 if (!chain || !chain->next) {
1680 continue;
1681 }
1682
1683 /* find the best single chain score */
1684 for (chain = saved[query_idx]; chain; chain = chain->next) {
1685 if (chain->score > best_score) {
1686 best_score = chain->score;
1687 }
1688 }
1689
1690 /* remove chains with poor scores, considering pair bonus */
1691 chain = saved[query_idx];
1692 while (chain) {
1693 Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1694
1695 if (score < best_score) {
1696 HSPChain* next = chain->next;
1697
1698 chain->next = NULL;
1699 HSPChainFree(chain);
1700 chain = next;
1701
1702 if (prev) {
1703 prev->next = next;
1704 }
1705 else {
1706 saved[query_idx] = next;
1707 }
1708 }
1709 else {
1710 prev = chain;
1711 chain = chain->next;
1712 }
1713 }
1714
1715 }
1716
1717 /* second pass: for each query remove chains scoring poorer than the best
1718 ones considering pair bonus */
1719 for (query_idx = 0; query_idx < num_queries; query_idx++) {
1720 HSPChain* chain = saved[query_idx];
1721 HSPChain* prev = NULL;
1722 Int4 best_score = 0;
1723 Int4 best_pair_score = 0;
1724 Int4 i;
1725 Int4 count = 0;
1726 Int4 num_chains = 0;
1727
1728 if (!chain) {
1729 continue;
1730 }
1731
1732 if (!chain->next) {
1733 chain->count = 1;
1734 continue;
1735 }
1736
1737 /* find the best scores for single chains and pairs */
1738 for (chain = saved[query_idx]; chain; chain = chain->next) {
1739 Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1740 if (score >= best_score) {
1741 best_score = score;
1742 }
1743
1744 if (chain->pair &&
1745 chain->score + chain->pair->score >= best_pair_score) {
1746
1747 best_pair_score = chain->score + chain->pair->score;
1748 }
1749
1750 /* collect chains in the array */
1751 if (num_chains >= array_size) {
1752 array_size *= 2;
1753 array = realloc(array, array_size * sizeof(HSPChain*));
1754 if (!array) {
1755 return -1;
1756 }
1757 }
1758 array[num_chains++] = chain;
1759 }
1760
1761 /* sort chains by scores in descending order */
1762 ASSERT(num_chains > 1);
1763 qsort(array, num_chains, sizeof(HSPChain*), s_CompareChainsByScore);
1764
1765 /* find multiplicity for each chain: number of chains with same or
1766 larger score */
1767 /* Note that multiplicity is currently not reported. Only the number
1768 of returned hits for each query is reported */
1769 for (i = 0;i < num_chains; i++) {
1770 Int4 k;
1771
1772 count++;
1773 /* check for chains with the same score */
1774 k = i + 1;
1775 while (k < num_chains && array[k]->score == array[i]->score) {
1776 k++;
1777 count++;
1778 }
1779
1780 /* save multiplicity */
1781 for (;i < k;i++) {
1782 array[i]->count = count;
1783 }
1784 i--;
1785 }
1786
1787
1788 /* remove chains with poor score and count chains with equal or better
1789 score to the minimum pair one */
1790 chain = saved[query_idx];
1791 while (chain) {
1792 Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1793
1794 /* We compare single chains with single chains and pairs with
1795 pairs. There may be pairs with one strong and one weak hit.
1796 Comparing single chain scores we can be left with single chains
1797 from different pairs. This comparison aviods that. The first
1798 pass ensures that pairs with poor scoring chains are removed,
1799 so we do not have to compare singles with pairs. */
1800 if ((!chain->pair && score < best_score) ||
1801 (chain->pair && chain->score + chain->pair->score <
1802 best_pair_score)) {
1803
1804 HSPChain* next = chain->next;
1805
1806 chain->next = NULL;
1807 HSPChainFree(chain);
1808 chain = next;
1809
1810 if (prev) {
1811 prev->next = next;
1812 }
1813 else {
1814 saved[query_idx] = next;
1815 }
1816 }
1817 else {
1818 prev = chain;
1819 chain = chain->next;
1820 }
1821 }
1822 }
1823
1824 if (array) {
1825 sfree(array);
1826 }
1827
1828 return 0;
1829 }
1830
1831
1832 /* Compare chains by subject oid and offfset */
s_CompareChainsByOid(const void * cha,const void * chb)1833 static int s_CompareChainsByOid(const void* cha, const void* chb)
1834 {
1835 const HSPChain* a = *((HSPChain**)cha);
1836 const HSPChain* b = *((HSPChain**)chb);
1837
1838 if (a->oid > b->oid) {
1839 return 1;
1840 }
1841 else if (a->oid < b->oid) {
1842 return -1;
1843 }
1844 else if (a->hsps->hsp->subject.offset > b->hsps->hsp->subject.offset) {
1845 return 1;
1846 }
1847 else if (a->hsps->hsp->subject.offset < b->hsps->hsp->subject.offset) {
1848 return -1;
1849 }
1850
1851 return 0;
1852 }
1853
1854 /* Sort chains by subject oid and position and HSPs in the chains by query and
1855 subject positions */
s_SortChains(HSPChain ** saved,Int4 num_queries,int (* comp)(const void *,const void *))1856 static Int4 s_SortChains(HSPChain** saved, Int4 num_queries,
1857 int(*comp)(const void*, const void*))
1858 {
1859 Int4 i;
1860 Int4 array_size = 50;
1861 HSPChain** chain_array = NULL;
1862
1863 if (!saved || num_queries < 0) {
1864 return -1;
1865 }
1866
1867 chain_array = calloc(array_size, sizeof(HSPChain*));
1868 if (!chain_array) {
1869 return -1;
1870 }
1871
1872 /* for each query */
1873 for (i = 0;i < num_queries;i++) {
1874 HSPChain* chain = saved[i];
1875 Int4 num_chains = 0;
1876 Int4 k;
1877
1878 if (!chain) {
1879 continue;
1880 }
1881
1882 /* create an array of pointer to chains */
1883 for (; chain; chain = chain->next) {
1884 if (num_chains >= array_size) {
1885 array_size *= 2;
1886 chain_array = realloc(chain_array, array_size *
1887 sizeof(HSPChain*));
1888
1889 ASSERT(chain_array);
1890 if (!chain_array) {
1891 return -1;
1892 }
1893 }
1894 chain_array[num_chains++] = chain;
1895 }
1896
1897 if (num_chains > 1) {
1898 /* sort chains */
1899 qsort(chain_array, num_chains, sizeof(HSPChain*), comp);
1900
1901 /* create the list of chains in the new order */
1902 for (k = 0;k < num_chains - 1;k++) {
1903 chain_array[k]->next = chain_array[k + 1];
1904 }
1905 chain_array[num_chains - 1]->next = NULL;
1906
1907 saved[i] = chain_array[0];
1908 ASSERT(saved[i]);
1909 }
1910 }
1911
1912 if (chain_array) {
1913 free(chain_array);
1914 }
1915
1916 return 0;
1917 }
1918
1919 /* Convert internal HSPChain structure to BlastHSPChain used to report results */
s_HSPChainToBlastHSPChain(HSPChain * chain)1920 static BlastHSPChain* s_HSPChainToBlastHSPChain(HSPChain* chain)
1921 {
1922 BlastHSPChain* new_chain = NULL;
1923 HSPContainer* h = NULL;
1924 Int4 num_hsps = 0;
1925 Int4 index = 0;
1926
1927 if (!chain || !chain->hsps) {
1928 return NULL;
1929 }
1930
1931 new_chain = Blast_HSPChainNew();
1932 if (!new_chain) {
1933 return NULL;
1934 }
1935
1936 new_chain->query_index = chain->context / NUM_STRANDS;
1937 new_chain->oid = chain->oid;
1938 new_chain->score = chain->score;
1939 new_chain->adapter = chain->adapter;
1940 new_chain->polyA = chain->polyA;
1941
1942 /* move hsps */
1943 for (h = chain->hsps; h; h = h->next) {
1944 num_hsps++;
1945 }
1946 new_chain->num_hsps = num_hsps;
1947 new_chain->hsp_array = calloc(num_hsps, sizeof(BlastHSP*));
1948 if (!new_chain->hsp_array) {
1949 return NULL;
1950 }
1951 for (h = chain->hsps; h; h = h->next) {
1952 new_chain->hsp_array[index++] = h->hsp;
1953 h->hsp = NULL;
1954 }
1955
1956 return new_chain;
1957 }
1958
1959 /* Separate HSPs that overlap on the query to different chains (most output
1960 formats do not support query overlaps) */
s_RemoveOverlaps(HSPChain * chain,const ScoringOptions * score_opts,Int4 query_len)1961 static int s_RemoveOverlaps(HSPChain* chain, const ScoringOptions* score_opts,
1962 Int4 query_len)
1963 {
1964 HSPContainer* h = NULL;
1965 HSPContainer* prev = NULL;
1966 HSPChain* tail = chain->next;
1967 HSPChain* head = chain;
1968 Boolean rescore = FALSE;
1969
1970 if (!chain) {
1971 return -1;
1972 }
1973
1974 /* current and previous HSP */
1975 h = chain->hsps->next;
1976 prev = chain->hsps;
1977
1978 for (; h; h = h->next) {
1979
1980 /* if HSPs overlap on the query */
1981 if (prev->hsp->query.end > h->hsp->query.offset) {
1982 /* do a flat copy of the chain */
1983 HSPChain* new_chain = HSPChainNew(chain->context);
1984 memcpy(new_chain, chain, sizeof(HSPChain));
1985 new_chain->pair = NULL;
1986 new_chain->next = NULL;
1987
1988 /* new chain starts with h */
1989 new_chain->hsps = h;
1990
1991 /* the original chain ends on the previous HSP */
1992 prev->next = NULL;
1993
1994 /* add new chain to the list */
1995 head->next = new_chain;
1996 head = new_chain;
1997
1998 rescore = TRUE;
1999 }
2000 prev = h;
2001 }
2002
2003 /* compute chains scores */
2004 if (rescore) {
2005 for (; chain; chain = chain->next) {
2006 chain->score = s_ComputeChainScore(chain, score_opts, query_len,
2007 FALSE);
2008 }
2009 }
2010
2011 /* re-attach other chains to the list */
2012 head->next = tail;
2013
2014 return 0;
2015 }
2016
2017 /* Populate HSP data and save final results */
s_Finalize(HSPChain ** saved,BlastMappingResults * results,const BlastQueryInfo * query_info,const BLAST_SequenceBlk * query_blk,const ScoringOptions * score_opts,Boolean is_paired)2018 static int s_Finalize(HSPChain** saved, BlastMappingResults* results,
2019 const BlastQueryInfo* query_info,
2020 const BLAST_SequenceBlk* query_blk,
2021 const ScoringOptions* score_opts,
2022 Boolean is_paired)
2023 {
2024 Int4 query_idx;
2025 Int4 num_results = 0;
2026 Int4 num = 0;
2027 const Int4 kPairBonus = 21;
2028 Int4* num_unique_chains = NULL;
2029
2030 /* remove poor scoring chains and find chain multiplicity */
2031 if (!getenv("MAPPER_NO_PRUNNING")) {
2032 s_PruneChains(saved, query_info->num_queries, kPairBonus);
2033 }
2034
2035 /* find adapters in query sequences */
2036 s_FindAdapters(saved, query_blk, query_info, score_opts);
2037
2038 /* find polyA tails in query sequences */
2039 s_FindPolyATails(saved, query_blk, query_info);
2040
2041 /* find pairs of chains aligned to different subjects */
2042 s_FindRearrangedPairs(saved, query_info);
2043
2044 /* sort chains by subject oid to ensure stable results for multithreaded
2045 runs; sorting must be done here so that compartment numbers are stable */
2046 /* FIXME: this may not be needed */
2047 s_SortChains(saved, query_info->num_queries, s_CompareChainsByOid);
2048
2049 /* Compute number of results to store, and number of unique mappings for
2050 each query (some chains may be copied to create pairs) */
2051
2052 num_unique_chains = calloc(query_info->num_queries, sizeof(Int4));
2053 if (!num_unique_chains) {
2054 return -1;
2055 }
2056
2057 /* count number of unique mappings */
2058 for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2059 HSPChain* chain = saved[query_idx];
2060 HSPChain* prev = chain;
2061
2062 if (!chain) {
2063 continue;
2064 }
2065
2066 num_unique_chains[query_idx]++;
2067
2068 chain = chain->next;
2069 for (; chain; chain = chain->next, prev = prev->next) {
2070 if (prev->oid != chain->oid ||
2071 s_FindFragmentStart(prev) != s_FindFragmentStart(chain)) {
2072
2073 num_unique_chains[query_idx]++;
2074 }
2075 }
2076 }
2077
2078 /* separate HSPs that overlap on the query into different chains */
2079 if (getenv("MAPPER_NO_OVERLAPPED_HSP_MERGE")) {
2080 for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2081 HSPChain* chain = saved[query_idx];
2082 Int4 query_len;
2083
2084 if (!chain) {
2085 continue;
2086 }
2087
2088 query_len = query_info->contexts[chain->context].query_length;
2089 for (; chain; chain = chain->next) {
2090 s_RemoveOverlaps(chain, score_opts, query_len);
2091 }
2092
2093 }
2094 }
2095
2096 /* count number of chains to report */
2097 for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2098 HSPChain* chain = saved[query_idx];
2099
2100 for (; chain; chain = chain->next) {
2101 num_results++;
2102 }
2103 }
2104
2105 results->chain_array = calloc(num_results, sizeof(BlastHSPChain*));
2106 if (!results->chain_array) {
2107 if (num_unique_chains) {
2108 free(num_unique_chains);
2109 }
2110 return -1;
2111 }
2112 results->num_results = num_results;
2113
2114 /* iterate over queries */
2115 for (query_idx = 0;query_idx < query_info->num_queries;query_idx++) {
2116 HSPChain* chain = saved[query_idx];
2117
2118 /* skip queries for which there are no saved results */
2119 if (!chain) {
2120 continue;
2121 }
2122
2123 for (; chain; chain = chain->next) {
2124 BlastHSPChain* new_chain = NULL;
2125 BlastHSPChain* pair = NULL;
2126
2127 /* pairs are processed together so that we can store their pointers
2128 so a chain whose pair's context is smaller was already processed
2129 */
2130 if (chain->pair && chain->context > chain->pair->context) {
2131 continue;
2132 }
2133
2134 new_chain = s_HSPChainToBlastHSPChain(chain);
2135 ASSERT(new_chain);
2136 if (!new_chain) {
2137 return -1;
2138 }
2139 new_chain->multiplicity = num_unique_chains[new_chain->query_index];
2140
2141 results->chain_array[num++] = new_chain;
2142
2143 if (chain->pair) {
2144 pair = s_HSPChainToBlastHSPChain(chain->pair);
2145 chain->pair->compartment = -1;
2146
2147 new_chain->pair = pair;
2148 pair->pair = new_chain;
2149 pair->multiplicity = num_unique_chains[pair->query_index];
2150 results->chain_array[num++] = pair;
2151 }
2152 }
2153
2154 }
2155 ASSERT(num == results->num_results);
2156
2157 for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2158 saved[query_idx] = HSPChainFree(saved[query_idx]);
2159 }
2160
2161 if (num_unique_chains) {
2162 sfree(num_unique_chains);
2163 }
2164
2165 return 0;
2166 }
2167
2168 /** Perform post-run clean-ups
2169 * @param data The buffered data structure [in]
2170 * @param results The HSP results to propagate [in][out]
2171 */
2172 static int
s_BlastHSPMapperFinal(void * data,void * mapping_results)2173 s_BlastHSPMapperFinal(void* data, void* mapping_results)
2174 {
2175 BlastHSPMapperData* spl_data = data;
2176 BlastHSPMapperParams* params = spl_data->params;
2177 BlastMappingResults* results = (BlastMappingResults*)mapping_results;
2178
2179 if (spl_data->saved_chains) {
2180 s_Finalize(spl_data->saved_chains, results, spl_data->query_info,
2181 spl_data->query, ¶ms->scoring_options, params->paired);
2182 }
2183
2184 if (spl_data->saved_chains) {
2185 sfree(spl_data->saved_chains);
2186 }
2187
2188 return 0;
2189 }
2190
2191
2192 /* A node representing an HSP for finding the best chain of HSPs for RNA-seq */
2193 typedef struct HSPNode {
2194 BlastHSP** hsp; /* pointer to the entry in hsp_list */
2195 Int4 best_score; /* score for path that starts at this node */
2196 struct HSPNode* path_next; /* next node in the path */
2197 } HSPNode;
2198
2199
2200 /* Copy a array of HSP nodes */
s_HSPNodeArrayCopy(HSPNode * dest,HSPNode * source,Int4 num)2201 static int s_HSPNodeArrayCopy(HSPNode* dest, HSPNode* source, Int4 num)
2202 {
2203 int i;
2204
2205 if (!dest || !source) {
2206 return -1;
2207 }
2208
2209 for (i = 0;i < num;i++) {
2210 dest[i] = source[i];
2211 if (source[i].path_next) {
2212 dest[i].path_next = dest + (source[i].path_next - source);
2213 }
2214 }
2215
2216 return 0;
2217 }
2218
2219
2220 /* Maximum number of top scoring HSP chains to report for a single read and
2221 strand */
2222 #define MAX_NUM_HSP_PATHS 40
2223
2224 /* A chain of HSPs aligning a spliced RNA-Seq read to a genome */
2225 typedef struct HSPPath {
2226 HSPNode** start; /* array of chain starting nodes */
2227 int num_paths; /* number of chains */
2228 Int4 score; /* all chains in start have this cumulative score */
2229 } HSPPath;
2230
2231
HSPPathFree(HSPPath * path)2232 static HSPPath* HSPPathFree(HSPPath* path)
2233 {
2234 if (path) {
2235 if (path->start) {
2236 free(path->start);
2237 }
2238 free(path);
2239 }
2240
2241 return NULL;
2242 }
2243
HSPPathNew(void)2244 static HSPPath* HSPPathNew(void)
2245 {
2246 HSPPath* retval = calloc(1, sizeof(HSPPath));
2247 if (!retval) {
2248 return NULL;
2249 }
2250
2251 retval->start = calloc(MAX_NUM_HSP_PATHS, sizeof(HSPNode*));
2252 if (!retval->start) {
2253 free(retval);
2254 return NULL;
2255 }
2256
2257 return retval;
2258 }
2259
2260
2261 #define NUM_SIGNALS 23
2262
2263 /* Find a split for HSPs overlapping on the query by finding splice signals in
2264 the subject sequence. The first HSP must have smaller query offset than the
2265 second one. Subject is recreated from the query sequence and information
2266 in an HSP.
2267 @param first HSP with smaller query offset [in|out]
2268 @param second HSP with larger query offset [in|out]
2269 @param query Query sequence [in]
2270 @return Status
2271 */
2272 static Int4
s_FindSpliceJunctionsForOverlaps(BlastHSP * first,BlastHSP * second,Uint1 * query,Int4 query_len)2273 s_FindSpliceJunctionsForOverlaps(BlastHSP* first, BlastHSP* second,
2274 Uint1* query, Int4 query_len)
2275 {
2276 Int4 i, k;
2277 /* splice signal pairs from include/algo/sequence/consensus_splice.hpp */
2278 Uint1 signals[NUM_SIGNALS] = {0xb2, /* GTAG */
2279 0x71, /* CTAC (reverse complement) */
2280 0x92, /* GCAG */
2281 0x79, /* CTGC */
2282 0x31, /* ATAC */
2283 0xb3, /* GTAT */
2284 0xbe, /* GTTG */
2285 0x41, /* CAAC */
2286 0xba, /* GTGG */
2287 0x51, /* CCAC */
2288 0xb0, /* GTAA */
2289 0xf1, /* TTAC */
2290 0x82, /* GAAG */
2291 0x7d, /* CTTC */
2292 0xf2, /* TTAG */
2293 0x70, /* CTAA */
2294 0x32, /* ATAG */
2295 0x73, /* CTAT */
2296 0xa2, /* GGAG */
2297 0x75, /* CTCC */
2298 0x33, /* ATAT (revese complemented is self) */
2299 0x30, /* ATAA */
2300 0xf3 /* TTAT */};
2301
2302
2303 Boolean found = FALSE;
2304 Uint1* subject = NULL;
2305 Int4 overlap_len;
2306 const Uint1 kGap = 15;
2307
2308 /* HSPs must be sorted and must overlap only on the query */
2309 ASSERT(first->query.offset < second->query.offset);
2310 ASSERT(second->query.offset <= first->query.end);
2311 ASSERT(first->subject.offset < second->subject.offset);
2312
2313
2314 /* if there are edits within the overlap region, then try to maximize
2315 the chain score by assigning a part of the overlap region to the
2316 HSP that has fewer edits in the overlap */
2317 if ((first->map_info->edits->num_edits > 0 &&
2318 first->map_info->edits->edits[
2319 first->map_info->edits->num_edits - 1].query_pos >=
2320 second->query.offset)
2321 || (second->map_info->edits->num_edits > 0 &&
2322 second->map_info->edits->edits[0].query_pos <= first->query.end)) {
2323
2324 /* find nummer of edits in both HSPs within the overlap and assign
2325 overlap to the HSP that has fewer edits */
2326 Int4 num_first = 0;
2327 Int4 num_second = 0;
2328
2329 /* find number of edits */
2330 for (i = first->map_info->edits->num_edits - 1;i >= 0;i--) {
2331 JumperEdit* edits = first->map_info->edits->edits;
2332 if (edits[i].query_pos < second->query.offset) {
2333 break;
2334 }
2335 num_first++;
2336 }
2337
2338 for (i = 0;i < second->map_info->edits->num_edits;i++) {
2339 JumperEdit* edits = second->map_info->edits->edits;
2340 if (edits[i].query_pos >= first->query.end) {
2341 break;
2342 }
2343 num_second++;
2344 }
2345
2346 if (num_first > num_second) {
2347 while (first->map_info->edits->num_edits > 0 &&
2348 first->map_info->edits->edits[
2349 first->map_info->edits->num_edits - 1].query_pos >=
2350 second->query.offset) {
2351
2352 JumperEdit* edits = first->map_info->edits->edits;
2353 Uint1 edge = first->map_info->right_edge;
2354 Int4 num_edits = first->map_info->edits->num_edits;
2355 Int4 trim_by;
2356
2357 if (edits[num_edits - 1].query_pos >= first->query.end - 1) {
2358 if (edits[num_edits - 1].subject_base != kGap) {
2359 edge >>= 2;
2360 edge |= edits[num_edits - 1].subject_base << 2;
2361 }
2362 }
2363 else if (edits[num_edits - 1].query_pos == query_len - 2 &&
2364 edits[num_edits - 1].subject_base == kGap) {
2365
2366 edge = (edge << 2) | query[query_len - 1];
2367 }
2368 else {
2369 if (edits[num_edits - 1].subject_base != kGap &&
2370 edits[num_edits - 1].query_base != kGap) {
2371
2372 edge = (edits[num_edits - 1].subject_base << 2) |
2373 query[edits[num_edits - 1].query_pos + 1];
2374 }
2375 else if (edits[num_edits - 1].subject_base == kGap) {
2376 edge = (query[edits[num_edits - 1].query_pos + 1] << 2 ) |
2377 query[edits[num_edits - 1].query_pos + 2];
2378 }
2379 else {
2380 edge = (edits[num_edits - 1].subject_base << 2) |
2381 query[edits[num_edits - 1].query_pos];
2382 }
2383 }
2384
2385 trim_by = first->query.end - edits[
2386 first->map_info->edits->num_edits - 1].query_pos;
2387 /* if the edit is an insert in the genome, count genome bases
2388 when trimming the HSP */
2389 if (edits[num_edits - 1].query_base == kGap) {
2390 trim_by++;
2391 /* FIXME: use proper scores here */
2392 s_TrimHSP(first, trim_by, FALSE, FALSE, -4, -4, -4);
2393 }
2394 else {
2395 s_TrimHSP(first, trim_by, TRUE, FALSE, -4, -4, -4);
2396 }
2397 first->map_info->right_edge = edge;
2398 }
2399 }
2400 else if (num_second > num_first) {
2401 while (second->map_info->edits->num_edits > 0 &&
2402 second->map_info->edits->edits[0].query_pos <
2403 first->query.end) {
2404
2405 JumperEdit* edits = second->map_info->edits->edits;
2406 Uint1 edge = second->map_info->left_edge;
2407 Int4 trim_by;
2408
2409 if (edits[0].query_pos == 0) {
2410 if (edits[0].subject_base != kGap) {
2411 edge <<= 2;
2412 edge |= edits[0].subject_base;
2413 }
2414 }
2415 else if (edits[0].query_pos == 1 &&
2416 edits[0].subject_base == kGap) {
2417
2418 edge = (edge << 2) | query[0];
2419 }
2420 else {
2421
2422 edge = (query[edits[0].query_pos - 1] << 2) |
2423 edits[0].subject_base;
2424
2425 if (edits[0].subject_base == kGap) {
2426 edge = (query[edits[0].query_pos - 2] << 2) |
2427 query[edits[0].query_pos - 1];
2428 }
2429 }
2430
2431 trim_by = edits[0].query_pos - second->query.offset + 1;
2432 /* if the edit is an insert in the genome, count genome bases
2433 when trimming the HSP */
2434 if (edits[0].query_base == kGap) {
2435 trim_by++;
2436 /* FIXME: use proper scores here */
2437 s_TrimHSP(second,
2438 edits[0].query_pos - second->query.offset + 1,
2439 FALSE, TRUE, -4, -4, -4);
2440 }
2441 else {
2442 s_TrimHSP(second,
2443 edits[0].query_pos - second->query.offset + 1,
2444 TRUE, TRUE, -4, -4, -4);
2445 }
2446 second->map_info->left_edge = edge;
2447 }
2448 }
2449 else {
2450 /* if the number of edits is the same we do not know how to
2451 divide the overlap */
2452 return 0;
2453 }
2454 }
2455
2456 /* give up if there are gaps in the alignments or mismatches still left in
2457 the overlapping parts */
2458 if ((first->map_info->edits->num_edits > 0 &&
2459 first->map_info->edits->edits[
2460 first->map_info->edits->num_edits - 1].query_pos >=
2461 second->query.offset)
2462 || (second->map_info->edits->num_edits > 0 &&
2463 second->map_info->edits->edits[0].query_pos <= first->query.end)) {
2464
2465 return 0;
2466 }
2467
2468 /* recreate the subject sequence plus two bases on each side */
2469 overlap_len = first->query.end - second->query.offset;
2470 subject = calloc(overlap_len + 4, sizeof(Uint1));
2471 if (!subject) {
2472 return -1;
2473 }
2474
2475 /* FIXME: lef_edge and right_edge should be changed to subject_overhangs */
2476 subject[0] = (second->map_info->left_edge & 0xf) >> 2;
2477 subject[1] = second->map_info->left_edge & 3;
2478 memcpy(subject + 2, query + second->query.offset,
2479 overlap_len * sizeof(Uint1));
2480
2481 subject[2 + overlap_len] = (first->map_info->right_edge & 0xf) >> 2;
2482 subject[2 + overlap_len + 1] = first->map_info->right_edge & 3;
2483
2484 /* search for matching splice signals in the subject sequence */
2485 for (k = 0;k < NUM_SIGNALS;k++) {
2486
2487 for (i = 0;i <= overlap_len && !found;i++) {
2488
2489 Uint1 seq = (subject[i] << 2) | subject[i + 1] |
2490 (subject[i + 2] << 6) | (subject[i + 3] << 4);
2491
2492 /* if a splice signals are found, update HSPs */
2493 if (seq == signals[k]) {
2494 Int4 d = first->query.end - (second->query.offset + i);
2495 first->query.end -= d;
2496 first->subject.end -= d;
2497 first->gap_info->num[first->gap_info->size - 1] -= d;
2498 first->score -= d;
2499 first->map_info->right_edge = (seq & 0xf0) >> 4;
2500 first->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
2501
2502 d = i;
2503 second->query.offset += d;
2504 second->subject.offset += d;
2505 second->gap_info->num[0] -= d;
2506 second->score -= d;
2507 second->map_info->left_edge = seq & 0xf;
2508 second->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
2509
2510 ASSERT(first->gap_info->size > 1 ||
2511 first->query.end - first->query.offset ==
2512 first->subject.end - first->subject.offset);
2513
2514 ASSERT(second->gap_info->size > 1 ||
2515 second->query.end - second->query.offset ==
2516 second->subject.end - second->subject.offset);
2517
2518 found = TRUE;
2519 break;
2520 }
2521 }
2522 }
2523
2524 if (!found) {
2525 first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
2526 second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
2527 }
2528
2529 if (subject) {
2530 sfree(subject);
2531 }
2532
2533 ASSERT(first->query.offset < first->query.end);
2534 ASSERT(second->query.offset < second->query.end);
2535
2536 return 0;
2537 }
2538
s_ExtendAlignmentCleanup(Uint1 * subject,BlastGapAlignStruct * gap_align,GapEditScript * edit_script,JumperEditsBlock * edits)2539 static void s_ExtendAlignmentCleanup(Uint1* subject,
2540 BlastGapAlignStruct* gap_align,
2541 GapEditScript* edit_script,
2542 JumperEditsBlock* edits)
2543 {
2544 if (subject) {
2545 free(subject);
2546 }
2547
2548 BLAST_GapAlignStructFree(gap_align);
2549 GapEditScriptDelete(edit_script);
2550 JumperEditsBlockFree(edits);
2551 }
2552
2553 /* Extend alignment on one side of an HSP up to a given point (a splice site)
2554 */
s_ExtendAlignment(BlastHSP * hsp,const Uint1 * query,Int4 query_from,Int4 query_to,Int4 subject_from,Int4 subject_to,const ScoringOptions * score_options,Boolean is_left)2555 static Int4 s_ExtendAlignment(BlastHSP* hsp, const Uint1* query,
2556 Int4 query_from, Int4 query_to,
2557 Int4 subject_from, Int4 subject_to,
2558 const ScoringOptions* score_options,
2559 Boolean is_left)
2560 {
2561 Int4 o_len;
2562 Uint1* o_seq = NULL;
2563 Uint1* subject = NULL;
2564 BlastGapAlignStruct* gap_align = NULL;
2565 GapEditScript* edit_script = NULL;
2566 JumperEditsBlock* edits = NULL;
2567 Int4 i, k, s;
2568 /* number of query bases to align */
2569 Int4 query_gap = query_to - query_from + 1;
2570 /* number of subject bases to align */
2571 Int4 subject_gap = subject_to - subject_from + 1;
2572 Int4 num_identical;
2573 Int4 query_ext_len, subject_ext_len;
2574 Int4 ungapped_ext_len;
2575
2576 JUMP* jumper_table = NULL;
2577
2578 JUMP jumper_mismatch[] = {{1, 1, 0, 0}};
2579
2580 JUMP jumper_insertion[] = {{1, 1, 2, 0},
2581 {1, 0, 1, 0},
2582 {1, 1, 0, 0}};
2583
2584 JUMP jumper_deletion[] = {{1, 1, 2, 0},
2585 {0, 1, 1, 0},
2586 {1, 1, 0, 0}};
2587
2588 if (!hsp || !query || !hsp->map_info ||
2589 !hsp->map_info->subject_overhangs) {
2590 return -1;
2591 }
2592
2593 /* length of the subject overgang sequence saved in the HSP */
2594 o_len = is_left ? hsp->map_info->subject_overhangs->left_len :
2595 hsp->map_info->subject_overhangs->right_len;
2596 /* subject sequence */
2597 o_seq = is_left ? hsp->map_info->subject_overhangs->left :
2598 hsp->map_info->subject_overhangs->right;
2599 ASSERT(o_seq);
2600
2601 /* compressed subject sequence (needed for computig jumper edits) */
2602 subject = calloc(o_len / 4 + 1, sizeof(Uint1));
2603 if (!subject) {
2604 return -1;
2605 }
2606
2607 /* compress subject sequence */
2608 k = 6;
2609 s = 0;
2610 for (i = 0;i < o_len;i++, k-=2) {
2611 subject[s] |= o_seq[i] << k;
2612 if (k == 0) {
2613 s++;
2614 k = 8;
2615 }
2616 }
2617
2618 /* gap_align is needed in jumper alignment API calls, we only allocate
2619 fields that will be needed in these calls */
2620 gap_align = calloc(1, sizeof(BlastGapAlignStruct));
2621 if (!gap_align) {
2622 s_ExtendAlignmentCleanup(subject, NULL, edit_script, edits);
2623 return -1;
2624 }
2625 gap_align->jumper = JumperGapAlignNew(o_len * 2);
2626 if (!gap_align->jumper) {
2627 s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2628 return -1;
2629 }
2630
2631 /* select jumper table based on the number of expected indels; we allow
2632 only insertions or only deletions */
2633 switch (SIGN(query_gap - subject_gap)) {
2634 case 0: jumper_table = jumper_mismatch;
2635 break;
2636
2637 case -1: jumper_table = jumper_deletion;
2638 break;
2639
2640 case 1: jumper_table = jumper_insertion;
2641 break;
2642
2643 default:
2644 ASSERT(0);
2645 s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2646 return -1;
2647 };
2648
2649 /* Compute alignment. We always do right extension as this is typically
2650 a short alignment. This is global alignment, hence the large max number
2651 of errors and no penalties for mismatches or gaps. The jumper table
2652 specifies costs for alignment errors. The penalty socres specify
2653 required number of macthes at the ends of the alignment. */
2654 JumperExtendRightWithTraceback(query + query_from, o_seq + subject_from,
2655 query_gap, subject_gap,
2656 1, 0, 0, 0,
2657 20, 20,
2658 &query_ext_len, &subject_ext_len,
2659 gap_align->jumper->right_prelim_block,
2660 &num_identical,
2661 FALSE,
2662 &ungapped_ext_len,
2663 jumper_table);
2664
2665 ASSERT(query_ext_len <= query_gap);
2666 ASSERT(subject_ext_len <= subject_gap);
2667
2668 /* Because this is a global alignment a gap may exist after the last query
2669 or subject base which causes jumper extension to stop prematurely. If
2670 this is the case, add the missing indel */
2671 while (query_ext_len < query_gap) {
2672 JumperPrelimEditBlockAdd(gap_align->jumper->right_prelim_block,
2673 JUMPER_INSERTION);
2674
2675 query_ext_len++;
2676 }
2677
2678 while (subject_ext_len < subject_gap) {
2679 JumperPrelimEditBlockAdd(gap_align->jumper->right_prelim_block,
2680 JUMPER_DELETION);
2681
2682 subject_ext_len++;
2683 }
2684 ASSERT(query_ext_len == query_gap);
2685 ASSERT(subject_ext_len == subject_gap);
2686
2687 /* update gap info in the HSP */
2688 edit_script = JumperPrelimEditBlockToGapEditScript(
2689 gap_align->jumper->left_prelim_block,
2690 gap_align->jumper->right_prelim_block);
2691 if (!edit_script) {
2692 s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2693 return -1;
2694 }
2695
2696 if (is_left) {
2697 hsp->gap_info = GapEditScriptCombine(&edit_script, &hsp->gap_info);
2698 }
2699 else {
2700 hsp->gap_info = GapEditScriptCombine(&hsp->gap_info, &edit_script);
2701 ASSERT(!edit_script);
2702 }
2703 ASSERT(hsp->gap_info);
2704 if (!hsp->gap_info) {
2705 s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2706 return -1;
2707 }
2708
2709 /* update jumper edits */
2710 gap_align->query_start = query_from;
2711 gap_align->query_stop = query_from + query_ext_len;
2712 gap_align->subject_start = subject_from;
2713 gap_align->subject_stop = subject_from + subject_ext_len;
2714 edits = JumperFindEdits(query, subject, gap_align);
2715 ASSERT(edits);
2716 if (!edits) {
2717 s_ExtendAlignmentCleanup(subject, gap_align, NULL, edits);
2718 return -1;
2719 }
2720
2721 if (is_left) {
2722 hsp->map_info->edits = JumperEditsBlockCombine(&edits,
2723 &hsp->map_info->edits);
2724 }
2725 else {
2726 hsp->map_info->edits = JumperEditsBlockCombine(
2727 &hsp->map_info->edits,
2728 &edits);
2729 ASSERT(!edits);
2730 }
2731 ASSERT(hsp->map_info->edits);
2732 if (!hsp->map_info->edits) {
2733 s_ExtendAlignmentCleanup(subject, gap_align, NULL, edits);
2734 return -1;
2735 }
2736
2737 if (is_left) {
2738 hsp->query.offset -= query_ext_len;
2739 hsp->subject.offset -= subject_ext_len;
2740 }
2741 else {
2742 hsp->query.end += query_ext_len;
2743 hsp->subject.end += subject_ext_len;
2744 }
2745
2746 /* compute new HSP score */
2747 hsp->score = s_ComputeAlignmentScore(hsp, score_options->penalty,
2748 score_options->gap_open,
2749 score_options->gap_extend);
2750
2751 ASSERT(hsp->gap_info);
2752 ASSERT(hsp->map_info->edits);
2753
2754 ASSERT(subject);
2755
2756
2757 if (subject) {
2758 free(subject);
2759 }
2760 BLAST_GapAlignStructFree(gap_align);
2761
2762 return 0;
2763 }
2764
2765 #define NUM_SIGNALS_CONSENSUS 2
2766
2767 /* Find splcie junctions and extend HSP alignment if there are unaligned query
2768 bases in the middle of a chain.
2769 */
2770 static Int4
s_FindSpliceJunctionsForGap(BlastHSP * first,BlastHSP * second,Uint1 * query,Int4 query_len,const ScoringOptions * score_options)2771 s_FindSpliceJunctionsForGap(BlastHSP* first, BlastHSP* second,
2772 Uint1* query, Int4 query_len,
2773 const ScoringOptions* score_options)
2774 {
2775 Int4 query_gap;
2776 Int4 first_len, second_len;
2777 Boolean found = FALSE;
2778 Int4 k;
2779 Int4 q;
2780 Uint1 signal = 0;
2781 /* use only cannonical splice signals here */
2782 /* The procedure does not maximize score for unaligned bases of the query.
2783 It only finds splice signals and then the best alignment it can find
2784 for the unaligned bases. For limited number of unaligned query bases,
2785 we can be sure of the splice site vs a continuous alignment, because a
2786 continuous alignment would score better. But we do not compare
2787 alignment scores between different splice sites, so we do not know
2788 which splice site and signal is better; and many splice signals here
2789 would lead to finding wrong splice sites. */
2790 Uint1 signals[NUM_SIGNALS_CONSENSUS] = {0xb2, /* GTAG */
2791 0x71 /* CTAC */};
2792
2793 if (!first || !second) {
2794 return -1;
2795 }
2796
2797 if (!first->map_info->subject_overhangs ||
2798 !first->map_info->subject_overhangs->right ||
2799 !second->map_info->subject_overhangs ||
2800 !second->map_info->subject_overhangs->left) {
2801 return -1;
2802 }
2803
2804 /* number of query bases that fall between the HSPs */
2805 query_gap = second->query.offset - first->query.end;
2806
2807 /* we do not have enough subject sequence saved */
2808 if (query_gap > first->map_info->subject_overhangs->right_len ||
2809 query_gap > second->map_info->subject_overhangs->left_len) {
2810 return 0;
2811 }
2812
2813 first_len = first->map_info->subject_overhangs->right_len;
2814 second_len = second->map_info->subject_overhangs->left_len;
2815
2816 /* assume that the begining of intron was found, search for the splice
2817 signal at the end of the first HSP */
2818 for (q = 0; !found && q < 4;q++) {
2819
2820 if (first->query.end - q <= first->query.offset) {
2821 break;
2822 }
2823
2824 for (k = 0;k < NUM_SIGNALS_CONSENSUS && !found;k++) {
2825 Int4 i;
2826 Int4 status = 0;
2827 Int4 start;
2828 Uint1 seq;
2829
2830 if (q == 0) {
2831 seq = (first->map_info->subject_overhangs->right[0] << 6) |
2832 (first->map_info->subject_overhangs->right[1] << 4);
2833 }
2834 else if (q == 1) {
2835 seq = (query[first->query.end - 1] << 6) |
2836 (first->map_info->subject_overhangs->right[0] << 4);
2837 }
2838 else if (q > 1) {
2839 seq = (query[first->query.end - q] << 6) |
2840 (query[first->query.end - q + 1] << 4);
2841 }
2842 else {
2843 return -1;
2844 }
2845
2846 if (seq != (signals[k] & 0xf0)) {
2847 continue;
2848 }
2849
2850 /* location of the splice signa in subject overhang if there are no
2851 indels */
2852 start = second->map_info->subject_overhangs->left_len - 1 -
2853 query_gap - 2 - q;
2854
2855 /* search for the splice signal at the end of intron;
2856 allow for up to 1 indel */
2857 for (i = MAX(start - 1, 0);i <= MIN(start + 1, second_len);i++) {
2858 seq &= 0xf0;
2859 seq |= (second->map_info->subject_overhangs->left[i] << 2) |
2860 second->map_info->subject_overhangs->left[i + 1];
2861
2862 /* if the splice signal found extend the alignment */
2863 if (seq == signals[k]) {
2864 Int4 subject_gap = second_len - (i + 2) + q;
2865 if (query_gap - subject_gap < -1 ||
2866 query_gap - subject_gap > 1) {
2867
2868 continue;
2869 }
2870
2871 found = TRUE;
2872 if (q > 0) {
2873 s_TrimHSP(first, q, TRUE, FALSE,
2874 score_options->penalty,
2875 score_options->gap_open,
2876 score_options->gap_extend);
2877 }
2878
2879 status = s_ExtendAlignment(second, query,
2880 first->query.end,
2881 second->query.offset - 1,
2882 i + 2,
2883 second->map_info->subject_overhangs->left_len - 1,
2884 score_options,
2885 TRUE);
2886
2887 ASSERT(status == 0);
2888 if (status) {
2889 return status;
2890 }
2891
2892 signal = seq;
2893 break;
2894 }
2895 }
2896 }
2897 }
2898
2899 /* assume that end of the intron was found, search for the splice signal
2900 at the beginning of the intron */
2901 for (q = 0; !found && q < 4;q++) {
2902
2903 if (second->query.offset + q >= second->query.end) {
2904 break;
2905 }
2906
2907 for (k = 0;k < NUM_SIGNALS_CONSENSUS && !found;k++) {
2908 Int4 i;
2909 Int4 end;
2910 Uint1 seq;
2911 if (q == 0) {
2912 seq =
2913 (second->map_info->subject_overhangs->left[second_len - 2] << 2) |
2914 second->map_info->subject_overhangs->left[second_len - 1];
2915 }
2916 else if (q == 1) {
2917 seq = (second->map_info->subject_overhangs->left[second_len - 1] << 2) | query[second->query.offset];
2918
2919
2920 }
2921 else if (q > 1) {
2922
2923 if (second->map_info->edits->num_edits > 0 &&
2924 second->map_info->edits->edits[0].query_pos < q - 2) {
2925
2926 break;
2927 }
2928
2929 seq = (query[second->query.offset + q - 2] << 2) |
2930 query[second->query.offset + q + 1 - 2];
2931 }
2932 else {
2933 return -1;
2934 }
2935
2936 if (seq != (signals[k] & 0xf)) {
2937 continue;
2938 }
2939
2940 /* location of the splice signal in the subject overhang if there are
2941 no indels */
2942 end = query_gap + q;
2943
2944 /* allow for up to 1 indel */
2945 for (i = MAX(end - 1, 0);i <= MIN(end + 1, first_len);i++) {
2946 seq &= 0xf;
2947 seq |= (first->map_info->subject_overhangs->right[i] << 6) |
2948 (first->map_info->subject_overhangs->right[i + 1] << 4);
2949
2950 if (seq == signals[k]) {
2951 Int4 status = 0;
2952 Int4 subject_gap = i - q;
2953 if (query_gap - subject_gap < -1 ||
2954 query_gap - subject_gap > 1) {
2955
2956 continue;
2957 }
2958
2959 found = TRUE;
2960 if (q > 0) {
2961 s_TrimHSP(second, q, TRUE, TRUE,
2962 score_options->penalty,
2963 score_options->gap_open,
2964 score_options->gap_extend);
2965 }
2966
2967 status = s_ExtendAlignment(first, query,
2968 first->query.end,
2969 second->query.offset - 1,
2970 0, i - 1,
2971 score_options,
2972 FALSE);
2973 ASSERT(status == 0);
2974 if (status) {
2975 return status;
2976 }
2977
2978 signal = seq;
2979 break;
2980 }
2981 }
2982 }
2983 }
2984
2985 if (found) {
2986 first->map_info->right_edge = (signal & 0xf0) >> 4;
2987 first->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
2988 second->map_info->left_edge = signal & 0xf;
2989 second->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
2990
2991 ASSERT(first->gap_info->size > 1 ||
2992 first->query.end - first->query.offset ==
2993 first->subject.end - first->subject.offset);
2994
2995 ASSERT(second->gap_info->size > 1 ||
2996 second->query.end - second->query.offset ==
2997 second->subject.end - second->subject.offset);
2998 }
2999 else {
3000 first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
3001 second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
3002 }
3003
3004 return 0;
3005 }
3006
3007
3008 #define NUM_SINGLE_SIGNALS 8
3009
3010 /* Record subject bases pre and post alignment in HSP as possible splice
3011 signals and set a flag for recognized signals */
s_FindSpliceSignals(BlastHSP * hsp,Uint1 * query,Int4 query_len)3012 static Int4 s_FindSpliceSignals(BlastHSP* hsp, Uint1* query, Int4 query_len)
3013 {
3014 SequenceOverhangs* overhangs = NULL;
3015 /* splice signals in the order of preference */
3016 Uint1 signals[NUM_SINGLE_SIGNALS] = {2, /* AG */
3017 11, /* GT */
3018 13, /* TC */
3019 7, /* CT */
3020 1, /* AC */
3021 4, /* CA */
3022 8, /* GA */
3023 14 /* TG */};
3024
3025 if (!hsp || !query) {
3026 return -1;
3027 }
3028
3029 overhangs = hsp->map_info->subject_overhangs;
3030 ASSERT(overhangs);
3031
3032 if (!overhangs) {
3033 return -1;
3034 }
3035
3036 /* FIXME: check for polyA and adapters */
3037
3038 /* search for a splice signal on the left edge, unless the query is aligned
3039 from the beginning */
3040 if (hsp->query.offset == 0) {
3041 hsp->map_info->left_edge = MAPPER_EXON;
3042 }
3043 else {
3044 int k, i;
3045 Boolean found = FALSE;
3046
3047 if (overhangs->left_len >= 2) {
3048 Uint1 signal = (overhangs->left[overhangs->left_len - 2] << 2) |
3049 overhangs->left[overhangs->left_len - 1];
3050
3051 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3052 if (signal == signals[k]) {
3053 hsp->map_info->left_edge = signal;
3054 hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3055
3056 found = TRUE;
3057 break;
3058 }
3059 }
3060 }
3061
3062 if (!found && overhangs->left_len >= 1) {
3063 Uint1 signal = (overhangs->left[overhangs->left_len - 1] << 2) |
3064 query[hsp->query.offset];
3065
3066 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3067 if (signal == signals[k]) {
3068 hsp->map_info->left_edge = signal;
3069 hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3070
3071 /* trim alignment */
3072 s_TrimHSP(hsp, 1, TRUE, TRUE, -8, 0, -8);
3073
3074 /* update score at some point */
3075 found = TRUE;
3076 break;
3077 }
3078 }
3079 }
3080
3081 for (i = hsp->query.offset;!found && i < 4;i++) {
3082 Uint1 signal = (query[i] << 2) | query[i + 1];
3083 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3084 if (signal == signals[k]) {
3085 hsp->map_info->left_edge = signal;
3086 hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3087
3088 /* trim alignment */
3089 s_TrimHSP(hsp, i + 2, TRUE, TRUE, -8, 0, -8);
3090
3091 found = TRUE;
3092 break;
3093 }
3094 }
3095 }
3096 }
3097
3098 /* search for the splice signal on the right edge, unless the query is
3099 aligned to the end */
3100 if (hsp->query.end == query_len) {
3101 hsp->map_info->right_edge = MAPPER_EXON;
3102 }
3103 else {
3104 int k, i;
3105 Boolean found = FALSE;
3106
3107 if (overhangs->right_len >= 2) {
3108 Uint1 signal = (overhangs->right[0] << 2) | overhangs->right[1];
3109 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3110 if (signal == signals[k]) {
3111 hsp->map_info->right_edge = signal;
3112 hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3113
3114 found = TRUE;
3115 break;
3116 }
3117 }
3118
3119 }
3120
3121 if (!found && overhangs->right_len >= 1) {
3122 Uint1 signal = (query[hsp->query.end - 1] << 2) |
3123 overhangs->right[0];
3124
3125 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3126 if (signal == signals[k]) {
3127 hsp->map_info->right_edge = signal;
3128 hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3129
3130 /* update HSP */
3131 s_TrimHSP(hsp, 1, TRUE, FALSE, -8, 0, -8);
3132
3133 /* update score at some point */
3134 found = TRUE;
3135 break;
3136 }
3137 }
3138
3139 }
3140
3141 for (i = hsp->query.end - 2;!found && i > hsp->query.end - 2 - 4;i--) {
3142 Uint1 signal = (query[i] << 2) | query[i + 1];
3143 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3144 if (signal == signals[k]) {
3145 hsp->map_info->right_edge = signal;
3146 hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3147
3148 /* update HSP */
3149 s_TrimHSP(hsp, hsp->query.end - i, TRUE, FALSE, -8, 0, -8);
3150
3151 found = TRUE;
3152 break;
3153 }
3154 }
3155 }
3156 }
3157
3158 return 0;
3159 }
3160
3161
3162 /* Search for splice signals between two HSPs in a chain. The HSPs in the
3163 chain must be sorted by query position in asceding order.
3164 */
3165 static Int4
s_FindSpliceJunctions(HSPChain * chains,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * scoring_opts)3166 s_FindSpliceJunctions(HSPChain* chains,
3167 const BLAST_SequenceBlk* query_blk,
3168 const BlastQueryInfo* query_info,
3169 const ScoringOptions* scoring_opts)
3170 {
3171 HSPChain* ch = NULL;
3172
3173 if (!chains || !query_blk || !query_info || !scoring_opts) {
3174 return -1;
3175 }
3176
3177 /* iterate over HSPs in the chain */
3178 for (ch = chains; ch; ch = ch->next) {
3179 HSPContainer* h = ch->hsps;
3180 Boolean searched = FALSE;
3181 Uint1* query = NULL;
3182 Int4 context;
3183 Int4 query_len;
3184 ASSERT(h);
3185
3186 context = h->hsp->context;
3187 query = query_blk->sequence +
3188 query_info->contexts[context].query_offset;
3189 query_len = query_info->contexts[context].query_length;
3190
3191
3192 /* FIXME: check the overhangs of the first and last HSP and try finding
3193 splice signal within a few bases in the HSP */
3194 if (!h->next) {
3195
3196 s_FindSpliceSignals(h->hsp, query, query_len);
3197 searched = TRUE;
3198 }
3199
3200 while (h->next) {
3201 HSPContainer* next = h->next;
3202 ASSERT(next);
3203
3204 /* process overlap if found */
3205 if (next->hsp->query.offset <= h->hsp->query.end &&
3206 next->hsp->query.offset > h->hsp->query.offset) {
3207
3208 s_FindSpliceJunctionsForOverlaps(h->hsp, next->hsp, query,
3209 query_len);
3210 searched = TRUE;
3211 h = h->next;
3212 }
3213 /* if not a spliced alignment */
3214 else if (next->hsp->query.offset - h->hsp->query.end < 10 &&
3215 next->hsp->query.offset - h->hsp->query.end ==
3216 next->hsp->subject.offset - h->hsp->subject.end) {
3217
3218 /* save pointer to hsps after next */
3219 HSPContainer* following = h->next->next;
3220 BlastHSP* new_hsp = NULL;
3221
3222 /* duplicate HSPContainer with the two HSPs */
3223 h->next->next = NULL;
3224
3225 /* extend the first HSP to cover the gap between HSPs */
3226 if (next->hsp->query.offset - h->hsp->query.end > 1) {
3227 BlastHSP* first = h->hsp;
3228 BlastHSP* second = h->next->hsp;
3229
3230 s_ExtendAlignment(first, query, first->query.end,
3231 second->query.offset - 1, 0,
3232 second->subject.offset - 1 - first->subject.end,
3233 scoring_opts, FALSE);
3234 }
3235
3236 /* merge HSPs */
3237 new_hsp = s_MergeHSPs(h->hsp, h->next->hsp, query,
3238 scoring_opts);
3239
3240 if (new_hsp) {
3241
3242 /* replace the two processed HSPs with the combined one */
3243 Blast_HSPFree(h->hsp);
3244 HSPContainerFree(h->next);
3245 h->hsp = new_hsp;
3246 h->next = following;
3247 }
3248 else {
3249 /* something went wrong with merging, use the initial
3250 HSPs */
3251 h->next->next = following;
3252 h = h->next;
3253 }
3254 }
3255 else if (next->hsp->query.offset - h->hsp->query.end > 0) {
3256 s_FindSpliceJunctionsForGap(h->hsp, next->hsp, query,
3257 query_len, scoring_opts);
3258 searched = TRUE;
3259 h = h->next;
3260 }
3261 else {
3262 h = h->next;
3263 }
3264
3265 /* FIXME: if a splice junction cannot be found, we can try looking
3266 for the split between perfect matches */
3267 }
3268
3269 /* Remove HSPs that have the same start position on the query or
3270 subject within a chain. They may arise from modified HSP extents. */
3271 h = ch->hsps;
3272 while (h->next) {
3273 if (h->hsp->query.offset >= h->next->hsp->query.offset ||
3274 h->hsp->subject.offset >= h->next->hsp->subject.offset) {
3275
3276 if (h->hsp->score >= h->next->hsp->score) {
3277 HSPContainer* remove = h->next;
3278 h->next = h->next->next;
3279 remove->next = NULL;
3280 HSPContainerFree(remove);
3281 }
3282 else {
3283 HSPContainer* remove = h;
3284 HSPContainer* prev = ch->hsps;
3285 if (remove == ch->hsps) {
3286 ch->hsps = remove->next;
3287 h = ch->hsps;
3288 }
3289 else {
3290 while (prev->next && prev->next != h) {
3291 prev = prev->next;
3292 }
3293 ASSERT(prev->next && prev->next == remove);
3294
3295 prev->next = remove->next;
3296 h = prev;
3297 }
3298 remove->next = NULL;
3299 HSPContainerFree(remove);
3300 }
3301
3302 continue;
3303
3304 }
3305 h = h->next;
3306 }
3307
3308 /* recalculated chain score if splice sites were searched */
3309 if (searched) {
3310 ch->score = s_ComputeChainScore(ch, scoring_opts, query_len, FALSE);
3311 }
3312 }
3313
3314 s_TestChains(chains);
3315
3316 return 0;
3317 }
3318
3319
3320 /* Find the best scoring chain of HSPs for aligning a single RNA-Seq read to
3321 a genome on a single strand, returns all top scoring chains */
s_FindBestPath(HSPNode * nodes,Int4 num,HSPPath * path,Int4 oid,Boolean is_spliced,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * scoring_opts)3322 static HSPChain* s_FindBestPath(HSPNode* nodes, Int4 num, HSPPath* path,
3323 Int4 oid, Boolean is_spliced,
3324 const BLAST_SequenceBlk* query_blk,
3325 const BlastQueryInfo* query_info,
3326 const ScoringOptions* scoring_opts)
3327 {
3328 int i, k;
3329 Int4 best_score = 0;
3330 const Int4 kMaxIntronLength = 900000;
3331 /* FIXME: use mismatch and/or gap extend penalty here */
3332 HSPChain* retval = NULL;
3333
3334 if (!path || !nodes || !num) {
3335 return NULL;
3336 }
3337 path->num_paths = 0;
3338 path->score = 0;
3339
3340 for (i = num - 1;i >= 0;i--) {
3341
3342 BlastHSP* hsp = *(nodes[i].hsp);
3343 int self_score = nodes[i].best_score;
3344 /* FIXME: the is_spliced condition is a hack */
3345 for (k = i + 1;k < num && is_spliced;k++) {
3346
3347 BlastHSP* newhsp = *(nodes[k].hsp);
3348 Int4 new_score = nodes[k].best_score + self_score -
3349 s_GetOverlapCost(newhsp, *(nodes[i].hsp), 4);
3350
3351 /* FIXME: some of the conditions double others */
3352 const Int4 hsp_len = hsp->query.end - hsp->query.offset;
3353 const Int4 newhsp_len = newhsp->query.end - newhsp->query.offset;
3354 const Int4 overlap_len = MAX(MIN(hsp->query.end, newhsp->query.end) -
3355 MAX(hsp->query.offset, newhsp->query.offset), 0);
3356
3357 /* add next HSP to the path only if there is fewer than 10 query
3358 bases unaligned between HSPs, newhsp is not contained within
3359 hsp, newhsp aligns to the subject behind hsp, and score improves */
3360 /* FIXME: there should be a penalty if new hsp->query.offset >
3361 hsp->query.end */
3362 if (newhsp->query.offset - hsp->query.end < 10 &&
3363 newhsp->query.offset > hsp->query.offset &&
3364 newhsp->query.end > hsp->query.end &&
3365 newhsp->subject.offset - hsp->subject.end >=
3366 /* the difference on query may be smaller than zero,
3367 this will let as combine HSPs of which extension stopped
3368 too soon (not real introns) */
3369 newhsp->query.offset - hsp->query.end &&
3370 newhsp->subject.offset - hsp->subject.end < kMaxIntronLength &&
3371 (double)overlap_len / hsp_len < 0.75 &&
3372 (double)overlap_len / newhsp_len < 0.75 &&
3373 new_score > nodes[i].best_score) {
3374
3375 /* prefer paths with identified splice sites to those without
3376 ones */
3377 /* FIXME: add min intron length to the condition below */
3378 if (newhsp->subject.offset - hsp->subject.end > 1) {
3379 /* FIXME: The function that finds splice signals modifies
3380 HSPs, so we need to clone HSPs here. */
3381 BlastHSP* hsp_copy = Blast_HSPClone(hsp);
3382 BlastHSP* newhsp_copy = Blast_HSPClone(newhsp);
3383
3384 HSPChain* chain = HSPChainNew(hsp->context);
3385 chain->hsps = HSPContainerNew(&hsp_copy);
3386 chain->hsps->next = HSPContainerNew(&newhsp_copy);
3387
3388 s_FindSpliceJunctions(chain, query_blk, query_info,
3389 scoring_opts);
3390
3391 if ((chain->hsps->hsp->map_info->right_edge &
3392 MAPPER_SPLICE_SIGNAL) == 0) {
3393
3394 /* new_score += scoring_opts->no_splice_signal; */
3395 /* FIXME: temporarely, do not create chains if splice
3396 signals are not found */
3397 new_score = 0;
3398 }
3399
3400 chain = HSPChainFree(chain);
3401 }
3402 else if (newhsp->query.offset - hsp->query.end == 1) {
3403 /* FIXME: this is a temporary hack, we need to increase
3404 the score of combining two HSP into one with a
3405 mismatch so that it is scored the same as the one with
3406 splice signal */
3407 new_score++;
3408 }
3409 else {
3410 new_score = 0;
3411 }
3412
3413 if (new_score > nodes[i].best_score) {
3414 nodes[i].best_score = new_score;
3415 nodes[i].path_next = &(nodes[k]);
3416 }
3417 }
3418 }
3419
3420 /* we want to return all top scoring hits, so we save all paths with
3421 top score */
3422 if (nodes[i].best_score == best_score) {
3423 if (path->num_paths < MAX_NUM_HSP_PATHS) {
3424 path->start[path->num_paths++] = &(nodes[i]);
3425 }
3426 }
3427 else if (nodes[i].best_score > best_score) {
3428 best_score = nodes[i].best_score;
3429 path->start[0] = &(nodes[i]);
3430 path->num_paths = 1;
3431 }
3432 }
3433 ASSERT(path->num_paths);
3434 path->score = path->start[0]->best_score;
3435
3436 /* Find if any paths share an HSP and remove the one with the larger index.
3437 Doing this at the end ensures that we find optimal path in terms of
3438 score and/or HSP positions. */
3439 for (i = 0;i < path->num_paths - 1;i++) {
3440 if (!path->start[i]) {
3441 continue;
3442 }
3443 for (k = i + 1;k < path->num_paths;k++) {
3444 HSPNode* node_i = path->start[i];
3445 if (!path->start[k]) {
3446 continue;
3447 }
3448
3449 while (node_i) {
3450 HSPNode* node_k = path->start[k];
3451 while (node_k) {
3452 if (node_i->hsp[0] == node_k->hsp[0]) {
3453 path->start[k] = NULL;
3454 break;
3455 }
3456 node_k = node_k->path_next;
3457 }
3458 node_i = node_i->path_next;
3459 }
3460
3461 }
3462 }
3463
3464 ASSERT(path->num_paths > 0);
3465 ASSERT(path->start[0]);
3466
3467 /* conver from HSPPath to HSPChain structure */
3468 if (path->num_paths > 0) {
3469 HSPChain* ch = NULL;
3470 HSPContainer* hc = NULL;
3471 HSPNode* node = path->start[0];
3472
3473 retval = HSPChainNew((*node->hsp)->context);
3474 if (!retval) {
3475 return NULL;
3476 }
3477 retval->oid = oid;
3478 retval->score = path->score;
3479 retval->hsps = hc = HSPContainerNew(node->hsp);
3480 node = node->path_next;
3481 while (node) {
3482 hc->next = HSPContainerNew(node->hsp);
3483 if (!hc->next) {
3484 HSPChainFree(retval);
3485 return NULL;
3486 }
3487 hc = hc->next;
3488 node = node->path_next;
3489 }
3490 ASSERT(retval->hsps);
3491
3492 ch = retval;
3493 for (i = 1;i < path->num_paths;i++) {
3494 node = path->start[i];
3495 if (!node) {
3496 continue;
3497 }
3498 ch->next = HSPChainNew((*node->hsp)->context);
3499 ASSERT(ch->next);
3500 if (!ch->next) {
3501 HSPChainFree(retval);
3502 return NULL;
3503 }
3504 ch->next->oid = oid;
3505 ch->next->score = path->score;
3506 ch->next->hsps = hc = HSPContainerNew(node->hsp);
3507 node = node->path_next;
3508 while (node) {
3509 hc->next = HSPContainerNew(node->hsp);
3510 ASSERT(hc->next);
3511 if (!hc->next) {
3512 HSPChainFree(retval);
3513 return NULL;
3514 }
3515 hc = hc->next;
3516 node = node->path_next;
3517 }
3518 ch = ch->next;
3519 ASSERT(ch->hsps);
3520 }
3521 }
3522
3523 ASSERT(path->num_paths == 0 || s_TestChains(retval));
3524
3525 return retval;
3526 }
3527
3528 /* Compare HSPs by context (accending), and score (descending). */
3529 static int
s_CompareHSPsByContextScore(const void * v1,const void * v2)3530 s_CompareHSPsByContextScore(const void* v1, const void* v2)
3531 {
3532 BlastHSP* h1,* h2;
3533 BlastHSP** hp1,** hp2;
3534
3535 hp1 = (BlastHSP**) v1;
3536 hp2 = (BlastHSP**) v2;
3537 h1 = *hp1;
3538 h2 = *hp2;
3539
3540 if (!h1 && !h2)
3541 return 0;
3542 else if (!h1)
3543 return -1;
3544 else if (!h2)
3545 return 1;
3546
3547 if (h1->context < h2->context)
3548 return -1;
3549 if (h1->context > h2->context)
3550 return 1;
3551
3552 if (h1->score < h2->score)
3553 return 1;
3554 if (h1->score > h2->score)
3555 return -1;
3556
3557 return 0;
3558 }
3559
3560 static int
s_CompareHSPsByContextSubjectOffset(const void * v1,const void * v2)3561 s_CompareHSPsByContextSubjectOffset(const void* v1, const void* v2)
3562 {
3563 BlastHSP* h1,* h2;
3564 BlastHSP** hp1,** hp2;
3565
3566 hp1 = (BlastHSP**) v1;
3567 hp2 = (BlastHSP**) v2;
3568 h1 = *hp1;
3569 h2 = *hp2;
3570
3571 if (!h1 && !h2)
3572 return 0;
3573 else if (!h1)
3574 return 1;
3575 else if (!h2)
3576 return -1;
3577
3578 /* If these are from different contexts, don't compare offsets */
3579 if (h1->context < h2->context)
3580 return -1;
3581 if (h1->context > h2->context)
3582 return 1;
3583
3584 if (h1->subject.offset < h2->subject.offset)
3585 return -1;
3586 if (h1->subject.offset > h2->subject.offset)
3587 return 1;
3588
3589 if (h1->query.offset < h2->query.offset)
3590 return -1;
3591 if (h1->query.offset > h2->query.offset)
3592 return 1;
3593
3594 /* tie breakers: sort by decreasing score, then
3595 by increasing size of query range, then by
3596 increasing subject range. */
3597
3598 if (h1->score < h2->score)
3599 return 1;
3600 if (h1->score > h2->score)
3601 return -1;
3602
3603 if (h1->query.end < h2->query.end)
3604 return 1;
3605 if (h1->query.end > h2->query.end)
3606 return -1;
3607
3608 if (h1->subject.end < h2->subject.end)
3609 return 1;
3610 if (h1->subject.end > h2->subject.end)
3611 return -1;
3612
3613 return 0;
3614 }
3615
3616
3617 /* Pair data, used for selecting optimal pairs of aligned chains */
3618 typedef struct Pairinfo
3619 {
3620 HSPChain* first;
3621 HSPChain* second;
3622 Int4 distance;
3623 Int4 conf;
3624 Int4 score;
3625 Int4 trim_first; // when a chain overextends past beginning of the pair
3626 Int4 trim_second; // it needs to be trimmed
3627 Boolean valid_pair;
3628 } Pairinfo;
3629
3630
s_BlastHSPMapperSplicedRunCleanUp(HSPPath * path,HSPNode * nodes,Pairinfo * pair_data)3631 static void s_BlastHSPMapperSplicedRunCleanUp(HSPPath* path,
3632 HSPNode* nodes,
3633 Pairinfo* pair_data)
3634 {
3635 HSPPathFree(path);
3636
3637 if (nodes) {
3638 sfree(nodes);
3639 }
3640
3641 if (pair_data) {
3642 sfree(pair_data);
3643 }
3644 }
3645
3646
3647 /* Compare two HSP chain pairs aligned for the same query and subject
3648 by score and distance */
3649 static int
s_ComparePairs(const void * v1,const void * v2)3650 s_ComparePairs(const void* v1, const void* v2)
3651 {
3652 Pairinfo* a = (Pairinfo*)v1;
3653 Pairinfo* b = (Pairinfo*)v2;
3654
3655 if (a->score > b->score) {
3656 return -1;
3657 }
3658 if (a->score < b->score) {
3659 return 1;
3660 }
3661
3662 if (a->conf < b->conf) {
3663 return -1;
3664 }
3665 if (a->conf > b->conf) {
3666 return 1;
3667 }
3668
3669 if (a->distance < b->distance) {
3670 return -1;
3671 }
3672 if (a->distance > b->distance) {
3673 return 1;
3674 }
3675
3676 return 0;
3677 }
3678
3679
3680 /* Trim beginning of a chain alignment up to the given subject position */
s_TrimChainStartToSubjPos(HSPChain * chain,Int4 subj_pos,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score)3681 static Int4 s_TrimChainStartToSubjPos(HSPChain* chain, Int4 subj_pos,
3682 Int4 mismatch_score,
3683 Int4 gap_open_score,
3684 Int4 gap_extend_score)
3685 {
3686 HSPContainer* h = NULL;
3687 BlastHSP* hsp = NULL;
3688 BlastHSP* next_hsp = NULL;
3689 Int4 num_left;
3690 Int4 old_score;
3691
3692 ASSERT(chain);
3693 ASSERT(subj_pos >= 0);
3694 if (!chain || subj_pos < 0) {
3695 return 0;
3696 }
3697
3698 /* first remove HSPs that are fully below the subject position */
3699 h = chain->hsps;
3700 while (h && h->hsp->subject.end < subj_pos) {
3701 HSPContainer* tmp = h;
3702 ASSERT(tmp);
3703 h = h->next;
3704 tmp->next = NULL;
3705 chain->score -= tmp->hsp->score;
3706 HSPContainerFree(tmp);
3707 }
3708 ASSERT(h);
3709 chain->hsps = h;
3710 if (!h) {
3711 return -1;
3712 }
3713
3714 /* if all remaning HSPs are above the subject position, then exit */
3715 if (h->hsp->subject.offset >= subj_pos) {
3716 return 0;
3717 }
3718
3719 /* otherwise move HSP start positions */
3720 hsp = h->hsp;
3721 num_left = subj_pos - hsp->subject.offset;
3722 ASSERT(num_left > 0 && num_left <= hsp->subject.end - hsp->subject.offset);
3723
3724 old_score = hsp->score;
3725 s_TrimHSP(hsp, num_left, FALSE, TRUE, mismatch_score, gap_open_score,
3726 gap_extend_score);
3727
3728 /* update chain score */
3729 chain->score -= old_score - hsp->score;
3730
3731 /* remove splice signals and exon flags */
3732 hsp->map_info->left_edge &= 0xff ^ MAPPER_SPLICE_SIGNAL;
3733 hsp->map_info->left_edge &= 0xff ^ MAPPER_EXON;
3734
3735 /* check whether the HSP is contained within the next one and remove it
3736 if it is */
3737 next_hsp = NULL;
3738 if (chain->hsps->next) {
3739 next_hsp = chain->hsps->next->hsp;
3740 }
3741 if (next_hsp && hsp->query.offset >= next_hsp->query.offset) {
3742 chain->hsps = h->next;
3743 h->next = NULL;
3744 HSPContainerFree(h);
3745 }
3746
3747 return 0;
3748 }
3749
3750 /* Trim chain alignment end up the the given subject position */
s_TrimChainEndToSubjPos(HSPChain * chain,Int4 subj_pos,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score)3751 static Int4 s_TrimChainEndToSubjPos(HSPChain* chain, Int4 subj_pos,
3752 Int4 mismatch_score,
3753 Int4 gap_open_score, Int4 gap_extend_score)
3754 {
3755 HSPContainer* h = NULL;
3756 BlastHSP* hsp = NULL;
3757 Int4 num_left;
3758 Int4 old_score;
3759
3760 ASSERT(chain);
3761 ASSERT(subj_pos >= 0);
3762 if (!chain || subj_pos <= 0) {
3763 return -1;
3764 }
3765
3766 /* first remove HSPs that are full above the subject position */
3767 h = chain->hsps;
3768 while (h->next && h->next->hsp->subject.end < subj_pos) {
3769 h = h->next;
3770 }
3771
3772 if (h->next) {
3773 HSPContainer* hc = NULL;
3774 if (h->next->hsp->subject.offset < subj_pos) {
3775 h = h->next;
3776 }
3777
3778 /* update chain score */
3779 for (hc = h->next;hc != NULL;hc = hc->next) {
3780 chain->score -= hc->hsp->score;
3781 }
3782
3783 HSPContainerFree(h->next);
3784 h->next = NULL;
3785 }
3786
3787 /* if all remaning HSPs are below the subject position, exit */
3788 if (h->hsp->subject.end <= subj_pos) {
3789 return 0;
3790 }
3791
3792 /* otheriwse move HSP end positions */
3793 hsp = h->hsp;
3794 num_left = hsp->subject.end - subj_pos;
3795 ASSERT(num_left > 0 && num_left <= hsp->subject.end - hsp->subject.offset);
3796
3797 old_score = hsp->score;
3798 s_TrimHSP(hsp, num_left, FALSE, FALSE, mismatch_score, gap_open_score,
3799 gap_extend_score);
3800
3801 /* update chain score */
3802 chain->score -= old_score - hsp->score;
3803
3804 /* remove splice signals and exon flags */
3805 hsp->map_info->right_edge &= 0xff ^ MAPPER_SPLICE_SIGNAL;
3806 hsp->map_info->right_edge &= 0xff ^ MAPPER_EXON;
3807
3808 /* check whether the HSP is contained within the next one and remove it
3809 if it is */
3810 if (chain->hsps != h) {
3811 HSPContainer* hc = chain->hsps;
3812 while (hc && hc->next != h) {
3813 hc = hc->next;
3814 }
3815 ASSERT(hc && hc->next == h);
3816 if (hc->hsp->query.end >= h->hsp->query.end) {
3817 chain->score -= h->hsp->score;
3818 HSPContainerFree(h);
3819 hc->next = NULL;
3820 }
3821 }
3822
3823 return 0;
3824 }
3825
3826
3827 /* Find optimal pairs */
s_FindBestPairs(HSPChain ** first_list,HSPChain ** second_list,Int4 min_score,Pairinfo ** pair_info_ptr,Int4 * max_num_pairs,const ScoringOptions * scoring_options)3828 static Boolean s_FindBestPairs(HSPChain** first_list,
3829 HSPChain** second_list,
3830 Int4 min_score,
3831 Pairinfo** pair_info_ptr,
3832 Int4* max_num_pairs,
3833 const ScoringOptions* scoring_options)
3834 {
3835 HSPChain* first;
3836 HSPChain* second;
3837 Pairinfo* pair_info = *pair_info_ptr;
3838 Int4 conv_bonus = 0;
3839 Int4 num_pairs = 0;
3840 Boolean found = FALSE;
3841
3842 /* iterate over all pairs of HSP chains for the first and second read of
3843 the pair and collect pair information */
3844 for (first = *first_list; first; first = first->next) {
3845 for (second = *second_list; second; second = second->next) {
3846 Int2 first_frame = first->hsps->hsp->query.frame;
3847 Int2 second_frame = second->hsps->hsp->query.frame;
3848
3849 /* reallocate pair info array if necessary */
3850 if (num_pairs >= *max_num_pairs) {
3851 *max_num_pairs *= 2;
3852 Pairinfo* new_pair_info =
3853 realloc(pair_info, *max_num_pairs * sizeof(Pairinfo));
3854 if (!new_pair_info) {
3855 return -1;
3856 }
3857 pair_info = new_pair_info;
3858 *pair_info_ptr = new_pair_info;
3859 }
3860
3861 /* collect pair information */
3862 pair_info[num_pairs].first = first;
3863 pair_info[num_pairs].second = second;
3864 pair_info[num_pairs].score = first->score + second->score;
3865 pair_info[num_pairs].trim_first = 0;
3866 pair_info[num_pairs].trim_second = 0;
3867 pair_info[num_pairs].valid_pair = 0;
3868
3869 /* if the chains align on the opposite strands */
3870 ASSERT(first_frame != 0 && second_frame != 0);
3871 if (SIGN(first_frame) != SIGN(second_frame)) {
3872 HSPChain* plus = NULL, *minus = NULL;
3873 Int4 plus_start, minus_start;
3874 HSPContainer* hsp = NULL;
3875 Int4 distance = 0;
3876
3877 /* find which query aligns to plus and which to minus strand
3878 on the subject */
3879 if (first_frame > 0) {
3880 plus = first;
3881 }
3882 else {
3883 minus = first;
3884 }
3885 if (second_frame > 0) {
3886 plus = second;
3887 }
3888 else {
3889 minus = second;
3890 }
3891 ASSERT(plus && minus);
3892
3893 /* find start positions on the subject, for the minus strand we
3894 are interested subject start position on the minus strand,
3895 but the HSP has subject on plus and query on the minus
3896 strand, so we need to flip the query */
3897 plus_start = plus->hsps->hsp->subject.offset;
3898 hsp = minus->hsps;
3899 while (hsp->next) {
3900 hsp = hsp->next;
3901 }
3902 ASSERT(hsp);
3903 minus_start = hsp->hsp->subject.end;
3904
3905 /* this is also fragment length */
3906 distance = minus_start - plus_start;
3907 pair_info[num_pairs].distance = distance;
3908
3909 /* distance > 0 indicates a convergent pair (typical) */
3910 if (distance > 0) {
3911 Int4 plus_end, minus_end;
3912 hsp = plus->hsps;
3913 while (hsp->next) {
3914 hsp = hsp->next;
3915 }
3916 ASSERT(hsp);
3917 plus_end = hsp->hsp->subject.end;
3918 minus_end = minus->hsps->hsp->subject.offset;
3919
3920 /* HSP chain end going past the beginning of the chain
3921 for the pair, indicates that the this end of the
3922 query may be aligned to an adapater instead of a real
3923 sequence and needs to be trimmed */
3924 if (plus_end > minus_start || minus_end < plus_start) {
3925 ASSERT(plus && minus);
3926
3927 /* check the 3' (right) side of the fragment */
3928 if (plus_end > minus_start) {
3929 ASSERT(plus_end - minus_start > 0);
3930
3931 /* decrease the score by the trimmed portion; we
3932 assume here that the trimmed part aligns
3933 perfectly, which may not be the case, but this
3934 score is only used for comparison with different
3935 pairs */
3936 pair_info[num_pairs].score -=
3937 plus_end - minus_start;
3938
3939 /* Mark which chain and up to what subject location
3940 needs to be trimmed. We cannot trim here,
3941 because pair_info holds poiners to chains that
3942 may be part of better pairs */
3943 if (plus == pair_info[num_pairs].first) {
3944 pair_info[num_pairs].trim_first = minus_start;
3945 }
3946 else {
3947 pair_info[num_pairs].trim_second = minus_start;
3948 }
3949 }
3950
3951 /* check the 5' (left) side doe the fragment */
3952 if (minus_end < plus_start) {
3953 ASSERT(plus_start - minus_end > 0);
3954
3955 pair_info[num_pairs].score -=
3956 plus_start - minus_end;
3957
3958 if (minus == pair_info[num_pairs].first) {
3959 pair_info[num_pairs].trim_first = plus_start;
3960 }
3961 else {
3962 pair_info[num_pairs].trim_second = plus_start;
3963 }
3964 }
3965 }
3966
3967 pair_info[num_pairs].conf = PAIR_CONVERGENT;
3968
3969 /* add a bonus for convergent pairs; because they are more
3970 common we prefer them over different configurations
3971 with a sligntly better score */
3972 pair_info[num_pairs].score += conv_bonus;
3973 }
3974 else {
3975 pair_info[num_pairs].conf = PAIR_DIVERGENT;
3976 }
3977
3978 /* skip pairs with a score smaller than one found for a
3979 different subject */
3980 if (pair_info[num_pairs].score < min_score) {
3981 continue;
3982 }
3983
3984 num_pairs++;
3985 }
3986 else {
3987 /* for read pairs aligned in the same direction */
3988 pair_info[num_pairs].conf = PAIR_PARALLEL;
3989 pair_info[num_pairs].score -= 1;
3990 num_pairs++;
3991 }
3992 }
3993 }
3994
3995 if (num_pairs > 0) {
3996 Int4 i;
3997 Int4 best_score = pair_info[0].score;
3998 Int4 margin = 5;
3999 HSPChain* ch = NULL;
4000
4001 /* sort pair information by score, configuration distance */
4002 qsort(pair_info, num_pairs, sizeof(Pairinfo), s_ComparePairs);
4003
4004 /* iterate over pair information and collect pairs */
4005 for (i=0;i < num_pairs;i++) {
4006 HSPChain* f = pair_info[i].first;
4007 HSPChain* s = pair_info[i].second;
4008
4009 /* skip pairs with chains already used in other pairs */
4010 if (pair_info[i].first->pair || pair_info[i].second->pair) {
4011 continue;
4012 }
4013
4014 /* skip pairs with scores significantly smaller than the best one
4015 found */
4016 if (best_score - pair_info[i].score > margin) {
4017 break;
4018 }
4019
4020 /* link chains as a pair */
4021 f->pair = s;
4022 s->pair = f;
4023 f->pair_conf = s->pair_conf = pair_info[i].conf;
4024 pair_info[i].valid_pair = TRUE;
4025 found = TRUE;
4026 }
4027
4028 /* for chains that pair with already paired chains, clone the paired
4029 chain */
4030 for (ch = *first_list; ch; ch = ch->next) {
4031 HSPChain* pair = NULL;
4032
4033 if (ch->pair) {
4034 continue;
4035 }
4036
4037 i = 0;
4038 while (i < num_pairs &&
4039 (pair_info[i].first != ch || !pair_info[i].second->pair ||
4040 pair_info[i].conf != PAIR_CONVERGENT ||
4041 best_score - pair_info[i].score > margin)) {
4042 i++;
4043 }
4044 if (i >= num_pairs) {
4045 continue;
4046 }
4047 ASSERT(pair_info[i].second->pair);
4048
4049 pair = s_CloneChain(pair_info[i].second);
4050 ASSERT(pair);
4051 ASSERT(s_TestChains(pair));
4052 pair_info[i].second = pair;
4053 ch->pair = pair;
4054 pair->pair = ch;
4055 ch->pair_conf = pair->pair_conf = pair_info[i].conf;
4056 pair_info[i].valid_pair = TRUE;
4057 HSPChainListInsert(second_list, pair, FALSE);
4058 }
4059
4060 for (ch = *second_list; ch; ch = ch->next) {
4061 HSPChain* pair = NULL;
4062
4063 if (ch->pair) {
4064 continue;
4065 }
4066
4067 i = 0;
4068 while (i < num_pairs &&
4069 (pair_info[i].second != ch || !pair_info[i].first->pair ||
4070 pair_info[i].conf != PAIR_CONVERGENT ||
4071 best_score - pair_info[i].score > margin)) {
4072 i++;
4073 }
4074 if (i >= num_pairs) {
4075 continue;
4076 }
4077 ASSERT(pair_info[i].first->pair);
4078
4079 pair = s_CloneChain(pair_info[i].first);
4080 ASSERT(pair);
4081 ASSERT(s_TestChains(pair));
4082 pair_info[i].first = pair;
4083 ch->pair = pair;
4084 pair->pair = ch;
4085 ch->pair_conf = pair->pair_conf = pair_info[i].conf;
4086 pair_info[i].valid_pair = TRUE;
4087 HSPChainListInsert(first_list, pair, FALSE);
4088 }
4089
4090 /* trim the alignments that overextend beyond the pair */
4091 for (i = 0;i < num_pairs;i++) {
4092 if (!pair_info[i].valid_pair ||
4093 !pair_info[i].first->pair || !pair_info[i].second->pair) {
4094 continue;
4095 }
4096
4097 /* trim the first chain if its end goes beyond the start of
4098 second on the subject */
4099 if (pair_info[i].trim_first > 0) {
4100 if (pair_info[i].first->hsps->hsp->query.frame > 0) {
4101 s_TrimChainEndToSubjPos(pair_info[i].first,
4102 pair_info[i].trim_first,
4103 scoring_options->penalty,
4104 scoring_options->gap_open,
4105 scoring_options->gap_extend);
4106 }
4107 else {
4108 /* minus strand of the qurey is represented by another
4109 (reverse complemented) sequence aligned like the plus
4110 strand, so the end of the chain is really the beginnig
4111 by query coordinates */
4112 s_TrimChainStartToSubjPos(pair_info[i].first,
4113 pair_info[i].trim_first,
4114 scoring_options->penalty,
4115 scoring_options->gap_open,
4116 scoring_options->gap_extend);
4117 }
4118 }
4119
4120 /* trim the second chain if its end goes beyond the start of the
4121 first on the subject */
4122 if (pair_info[i].trim_second > 0) {
4123 if (pair_info[i].second->hsps->hsp->query.frame > 0) {
4124 s_TrimChainEndToSubjPos(pair_info[i].second,
4125 pair_info[i].trim_second,
4126 scoring_options->penalty,
4127 scoring_options->gap_open,
4128 scoring_options->gap_extend);
4129 }
4130 else {
4131 s_TrimChainStartToSubjPos(pair_info[i].second,
4132 pair_info[i].trim_second,
4133 scoring_options->penalty,
4134 scoring_options->gap_open,
4135 scoring_options->gap_extend);
4136 }
4137 }
4138
4139 }
4140 }
4141
4142 return found;
4143 }
4144
4145
4146 /** Perform writing task for paired reads
4147 * ownership of the HSP list and sets the dereferenced pointer to NULL.
4148 * @param data To store results to [in][out]
4149 * @param hsp_list Pointer to the HSP list to save in the collector. [in]
4150 */
4151 static int
s_BlastHSPMapperSplicedPairedRun(void * data,BlastHSPList * hsp_list)4152 s_BlastHSPMapperSplicedPairedRun(void* data, BlastHSPList* hsp_list)
4153 {
4154 BlastHSPMapperData* spl_data = data;
4155 BlastHSPMapperParams* params = spl_data->params;
4156 const ScoringOptions* scoring_opts = ¶ms->scoring_options;
4157 Boolean is_spliced = params->splice;
4158 BLAST_SequenceBlk* query_blk = spl_data->query;
4159 BlastQueryInfo* query_info = spl_data->query_info;
4160 const Int4 kDefaultMaxHsps = 1000;
4161 Int4 max_hsps = kDefaultMaxHsps;
4162 HSPNode* nodes = calloc(max_hsps, sizeof(HSPNode));
4163 HSPPath* path = NULL;
4164 BlastHSP** hsp_array;
4165 Int4 num_hsps = 0;
4166 Int4 i;
4167
4168 HSPChain** saved_chains = spl_data->saved_chains;
4169 HSPChain* chain_array[2];
4170 HSPChain* first = NULL, *second = NULL;
4171 Int4 workspace_size = 40;
4172 Pairinfo* pair_workspace = calloc(workspace_size, sizeof(Pairinfo));
4173 /* the score bonus needs to include exon pars that were to short to be
4174 aligned (at either end of the chain) */
4175 const Int4 kPairBonus = is_spliced ? 21 : 5;
4176
4177 if (!hsp_list) {
4178 s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4179 return 0;
4180 }
4181
4182 if (!params || !nodes) {
4183 s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4184 return -1;
4185 }
4186
4187 hsp_array = hsp_list->hsp_array;
4188
4189 path = HSPPathNew();
4190
4191 /* sort HSPs by query context and subject position for spliced aligments */
4192 if (is_spliced) {
4193 qsort(hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
4194 s_CompareHSPsByContextSubjectOffset);
4195 }
4196 else {
4197 /* for non spliced alignments, sort HSPs by query context and score */
4198 qsort(hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
4199 s_CompareHSPsByContextScore);
4200 }
4201
4202 i = 0;
4203 chain_array[0] = NULL;
4204 chain_array[1] = NULL;
4205 while (i < hsp_list->hspcnt) {
4206
4207 Int4 context = hsp_array[i]->context;
4208 /* is this query the first segment of a pair */
4209 Boolean has_pair = (query_info->contexts[context].segment_flags ==
4210 eFirstSegment);
4211 Int4 query_idx = context / NUM_STRANDS;
4212 Int4 first_context = query_idx * NUM_STRANDS;
4213 Int4 context_next_fragment = has_pair ? (query_idx + 2) * NUM_STRANDS :
4214 (query_idx + 1) * NUM_STRANDS;
4215
4216 HSPChain* new_chains = NULL;
4217
4218 /* iterate over contexts that belong to a read pair */
4219 while (i < hsp_list->hspcnt &&
4220 hsp_array[i]->context < context_next_fragment) {
4221
4222 memset(nodes, 0, num_hsps * sizeof(HSPNode));
4223 num_hsps = 0;
4224 context = hsp_array[i]->context;
4225
4226 /* collect HSPs for the current context */
4227 while (i < hsp_list->hspcnt && hsp_array[i]->context == context) {
4228
4229 /* Overlapping portions of two consecutive subject chunks
4230 may produce two HSPs representing the same alignment. We
4231 drop one of them here (assuming the HSPs for the same
4232 context are sorted by score or subject position). */
4233 BlastHSP* h = (num_hsps == 0 ? NULL : *nodes[num_hsps - 1].hsp);
4234 if (!h || hsp_array[i]->context != h->context ||
4235 hsp_array[i]->subject.offset != h->subject.offset ||
4236 hsp_array[i]->score != h->score) {
4237
4238 /* reallocate node array if necessary */
4239 if (num_hsps >= max_hsps) {
4240 HSPNode* new_nodes = NULL;
4241
4242 /* For non-spliced mapping we are only interested in
4243 the best scoring whole HSPs. HSPs are sorted by
4244 score and those that map to many places are not good
4245 so the node array does not need to be extended. */
4246 if (!is_spliced) {
4247 break;
4248 }
4249
4250 max_hsps *= 2;
4251 new_nodes = calloc(max_hsps, sizeof(HSPNode));
4252 if (!nodes) {
4253 s_BlastHSPMapperSplicedRunCleanUp(path, nodes,
4254 pair_workspace);
4255 return -1;
4256 }
4257 s_HSPNodeArrayCopy(new_nodes, nodes, num_hsps);
4258 free(nodes);
4259 nodes = new_nodes;
4260 }
4261
4262 nodes[num_hsps].hsp = &(hsp_array[i]);
4263 nodes[num_hsps].best_score = hsp_array[i]->score;
4264
4265 num_hsps++;
4266 }
4267 i++;
4268 }
4269
4270 /* find the best scoring chain of HSPs that align to exons
4271 parts of the query and the subject */
4272 new_chains = s_FindBestPath(nodes, num_hsps, path,
4273 hsp_list->oid, is_spliced,
4274 query_blk, query_info, scoring_opts);
4275
4276 ASSERT(new_chains);
4277 HSPChainListInsert(&chain_array[(context - first_context) /
4278 NUM_STRANDS],
4279 new_chains, FALSE);
4280
4281 /* skip HSPs for the same context if there are more then max
4282 allowed per context */
4283 while (i < hsp_list->hspcnt && hsp_array[i]->context == context) {
4284 i++;
4285 }
4286 }
4287
4288 #if _DEBUG
4289 ASSERT(!chain_array[0] || s_TestChainsSorted(chain_array[0]));
4290 ASSERT(!chain_array[1] || s_TestChainsSorted(chain_array[1]));
4291 #endif
4292
4293 if (is_spliced && chain_array[0]) {
4294 s_FindSpliceJunctions(chain_array[0], query_blk, query_info,
4295 scoring_opts);
4296 }
4297
4298 if (is_spliced && chain_array[1]) {
4299 s_FindSpliceJunctions(chain_array[1], query_blk, query_info,
4300 scoring_opts);
4301 }
4302
4303 first = chain_array[0];
4304 second = chain_array[1];
4305
4306 /* If the chains provide sufficient score check for pairs.
4307 The score bonus is added to prefer a pair with a slightly smaller
4308 score to single chains with a better one. */
4309 if (first && second) {
4310
4311 s_FindBestPairs(&first, &second, 0, &pair_workspace,
4312 &workspace_size, scoring_opts);
4313
4314 ASSERT(s_TestChains(first));
4315 ASSERT(s_TestChains(second));
4316 }
4317
4318 /* save all chains and remove ones with scores lower than
4319 best score - kPairBonus */
4320 if (first) {
4321 HSPChainListInsert(&saved_chains[query_idx], first, TRUE);
4322 HSPChainListTrim(saved_chains[query_idx], kPairBonus);
4323 }
4324 if (second) {
4325 HSPChainListInsert(&saved_chains[query_idx + 1], second, TRUE);
4326 HSPChainListTrim(saved_chains[query_idx + 1], kPairBonus);
4327 }
4328
4329 #if _DEBUG
4330 ASSERT(!chain_array[0] || s_TestChainsSorted(saved_chains[query_idx]));
4331 ASSERT(!chain_array[1] || s_TestChainsSorted(saved_chains[query_idx + 1]));
4332 #endif
4333
4334 /* make temporary lists empty */
4335 chain_array[0] = chain_array[1] = NULL;
4336 }
4337
4338 /* delete HSPs that were not saved in results */
4339 hsp_list = Blast_HSPListFree(hsp_list);
4340
4341 s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4342
4343 return 0;
4344 }
4345
4346
4347 /** Free the writer
4348 * @param writer The writer to free [in]
4349 * @return NULL.
4350 */
4351 static
4352 BlastHSPWriter*
s_BlastHSPMapperFree(BlastHSPWriter * writer)4353 s_BlastHSPMapperFree(BlastHSPWriter* writer)
4354 {
4355 BlastHSPMapperData *data = writer->data;
4356 sfree(data->params);
4357 sfree(writer->data);
4358 sfree(writer);
4359 return NULL;
4360 }
4361
4362 /** create the writer
4363 * @param params Pointer to the hit paramters [in]
4364 * @param query_info BlastQueryInfo (not used) [in]
4365 * @return writer
4366 */
4367 static
4368 BlastHSPWriter*
s_BlastHSPMapperPairedNew(void * params,BlastQueryInfo * query_info,BLAST_SequenceBlk * query)4369 s_BlastHSPMapperPairedNew(void* params, BlastQueryInfo* query_info,
4370 BLAST_SequenceBlk* query)
4371 {
4372 BlastHSPWriter* writer = NULL;
4373 BlastHSPMapperData* data = NULL;
4374 /* BlastHSPMapperParams * spl_param = params; */
4375
4376 /* allocate space for writer */
4377 writer = malloc(sizeof(BlastHSPWriter));
4378
4379 /* fill up the function pointers */
4380 writer->InitFnPtr = &s_BlastHSPMapperPairedInit;
4381 writer->FinalFnPtr = &s_BlastHSPMapperFinal;
4382 writer->FreeFnPtr = &s_BlastHSPMapperFree;
4383 writer->RunFnPtr = &s_BlastHSPMapperSplicedPairedRun;
4384
4385 /* allocate for data structure */
4386 writer->data = calloc(1, sizeof(BlastHSPMapperData));
4387 data = writer->data;
4388 data->params = params;
4389 data->query_info = query_info;
4390 data->query = query;
4391
4392 return writer;
4393 }
4394
4395
4396 /*************************************************************/
4397 /** The following are exported functions to be used by APP */
4398
4399 BlastHSPMapperParams*
BlastHSPMapperParamsNew(const BlastHitSavingOptions * hit_options,const BlastScoringOptions * scoring_options)4400 BlastHSPMapperParamsNew(const BlastHitSavingOptions* hit_options,
4401 const BlastScoringOptions* scoring_options)
4402 {
4403 BlastHSPMapperParams* retval = NULL;
4404
4405 if (hit_options == NULL)
4406 return NULL;
4407
4408 retval = (BlastHSPMapperParams*)malloc(sizeof(BlastHSPMapperParams));
4409 retval->hitlist_size = MAX(hit_options->hitlist_size, 10);
4410 retval->paired = hit_options->paired;
4411 retval->splice = hit_options->splice;
4412 retval->program = hit_options->program_number;
4413 retval->scoring_options.reward = scoring_options->reward;
4414 retval->scoring_options.penalty = scoring_options->penalty;
4415 retval->scoring_options.gap_open = -scoring_options->gap_open;
4416 retval->scoring_options.gap_extend = -scoring_options->gap_extend;
4417 retval->scoring_options.no_splice_signal = -2;
4418 return retval;
4419 }
4420
4421 BlastHSPMapperParams*
BlastHSPMapperParamsFree(BlastHSPMapperParams * opts)4422 BlastHSPMapperParamsFree(BlastHSPMapperParams* opts)
4423 {
4424 if ( !opts )
4425 return NULL;
4426 sfree(opts);
4427 return NULL;
4428 }
4429
4430 BlastHSPWriterInfo*
BlastHSPMapperInfoNew(BlastHSPMapperParams * params)4431 BlastHSPMapperInfoNew(BlastHSPMapperParams* params) {
4432 BlastHSPWriterInfo * writer_info =
4433 malloc(sizeof(BlastHSPWriterInfo));
4434
4435 writer_info->NewFnPtr = &s_BlastHSPMapperPairedNew;
4436
4437 writer_info->params = params;
4438 return writer_info;
4439 }
4440