1 /* $Id: hspfilter_mapper.c 587429 2019-06-04 17:18:58Z boratyng $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Greg Boratyn
27 *
28 */
29
30 /** @file hspfilter_mapper.c
31 * Implementation of the BlastHSPWriter interface to save only the best scoring * chains of HSPs for aligning RNA-Seq sequences to a genome.
32 */
33
34 #include <algo/blast/core/hspfilter_mapper.h>
35 #include <algo/blast/core/blast_util.h>
36 #include <algo/blast/core/blast_hits.h>
37 #include <algo/blast/core/spliced_hits.h>
38 #include "jumper.h"
39
40 /* Pair configurations, in the order of prefernece */
41 #define PAIR_CONVERGENT 0
42 #define PAIR_DIVERGENT 1
43 #define PAIR_PARALLEL 2
44 #define PAIR_NONE 3
45
46 /* Insert a single chain into the list so that the list is sorted in decending
47 order of chain scores. */
s_HSPChainListInsertOne(HSPChain ** list,HSPChain * chain,Boolean check_for_duplicates)48 static Int4 s_HSPChainListInsertOne(HSPChain** list, HSPChain* chain,
49 Boolean check_for_duplicates)
50 {
51 HSPChain* ch = NULL;
52
53 if (!list || !chain) {
54 return -1;
55 }
56 ASSERT(!chain->next);
57
58 if (!*list) {
59 *list = chain;
60 return 0;
61 }
62
63 ch = *list;
64 if (ch->score < chain->score) {
65 chain->next = ch;
66 *list = chain;
67 return 0;
68 }
69
70 /* check for duplicate chain: the new chains may be the same as already
71 saved; this may come from writing HSPs to HSP stream for each chunk */
72 if (check_for_duplicates &&
73 ch->oid == chain->oid && ch->score == chain->score &&
74 ch->hsps->hsp->query.frame == chain->hsps->hsp->query.frame &&
75 ch->hsps->hsp->subject.offset ==
76 chain->hsps->hsp->subject.offset) {
77
78 chain->next = NULL;
79 chain = HSPChainFree(chain);
80 return 0;
81 }
82
83 while (ch->next && ch->next->score >= chain->score){
84
85 /* check for duplicate chain: the new chains may be the same as already
86 saved; this may come from writing HSPs to HSP stream for each chunk */
87 if (check_for_duplicates &&
88 ch->next->oid == chain->oid && ch->next->score == chain->score &&
89 ch->next->hsps->hsp->query.frame == chain->hsps->hsp->query.frame &&
90 ch->next->hsps->hsp->subject.offset ==
91 chain->hsps->hsp->subject.offset) {
92
93 chain->next = NULL;
94 chain = HSPChainFree(chain);
95 return 0;
96 }
97
98 ch = ch->next;
99 }
100 ASSERT(ch);
101 chain->next = ch->next;
102 ch->next = chain;
103
104 return 0;
105 }
106
107
s_TestCutoffs(HSPChain * chain,Int4 cutoff_score,Int4 cutoff_edit_dist)108 static Boolean s_TestCutoffs(HSPChain* chain, Int4 cutoff_score,
109 Int4 cutoff_edit_dist)
110 {
111 if (!chain) {
112 return FALSE;
113 }
114
115 if (chain->score >= cutoff_score) {
116
117 Int4 align_len = 0;
118 Int4 num_identical = 0;
119 HSPContainer* h = chain->hsps;
120
121 if (cutoff_edit_dist < 0) {
122 return TRUE;
123 }
124
125 for (; h; h = h->next) {
126 align_len += MAX(h->hsp->query.end - h->hsp->query.offset,
127 h->hsp->subject.end - h->hsp->subject.offset);
128
129 num_identical += h->hsp->num_ident;
130
131 ASSERT(h->hsp->num_ident <=
132 MAX(h->hsp->query.end - h->hsp->query.offset,
133 h->hsp->subject.end - h->hsp->subject.offset));
134 }
135
136 if (align_len - num_identical <= cutoff_edit_dist) {
137 return TRUE;
138 }
139 }
140
141 return FALSE;
142 }
143
144 /* Insert chains into the list so that the list is sorted in descending order
145 of chain scores. If chain is a list, each element is added separately. The
146 list must be sorted before adding chain */
HSPChainListInsert(HSPChain ** list,HSPChain ** chain,Int4 cutoff_score,Int4 cutoff_edit_dist,Boolean check_for_duplicates)147 static Int4 HSPChainListInsert(HSPChain** list, HSPChain** chain,
148 Int4 cutoff_score, Int4 cutoff_edit_dist,
149 Boolean check_for_duplicates)
150 {
151 HSPChain* ch = NULL;
152 Int4 status = 0;
153
154 if (!list || !chain) {
155 return -1;
156 }
157
158 ch = *chain;
159 while (ch) {
160 HSPChain* next = ch->next;
161 ch->next = NULL;
162 if (ch->score >= cutoff_score) {
163 status = s_HSPChainListInsertOne(list, ch, check_for_duplicates);
164 }
165 else {
166 HSPChainFree(ch);
167 }
168 if (status) {
169 return status;
170 }
171 ch = next;
172
173 }
174
175 *chain = NULL;
176 return 0;
177 }
178
179 /* Remove from the list chains with scores lower than the best one by at least
180 the given margin. The list must be sorted in descending order of scores. */
HSPChainListTrim(HSPChain * list,Int4 margin)181 static Int4 HSPChainListTrim(HSPChain* list, Int4 margin)
182 {
183 HSPChain* ch = NULL;
184 Int4 best_score;
185
186 if (!list) {
187 return -1;
188 }
189
190 best_score = list->score;
191 ch = list;
192 while (ch->next && best_score - ch->next->score <= margin) {
193 ASSERT(best_score - ch->next->score >= 0);
194 ch = ch->next;
195 }
196 ASSERT(ch);
197
198 ch->next = HSPChainFree(ch->next);
199
200 return 0;
201 }
202
203
204 /* Test that all pointers in the chain are set */
s_TestChains(HSPChain * chain)205 static Boolean s_TestChains(HSPChain* chain)
206 {
207 HSPContainer* hc;
208
209 ASSERT(chain);
210 hc = chain->hsps;
211 ASSERT(hc);
212 ASSERT(hc->hsp);
213 ASSERT(hc->hsp->context == chain->context);
214
215 ASSERT(hc->hsp->gap_info->size > 1 ||
216 hc->hsp->query.end - hc->hsp->query.offset ==
217 hc->hsp->subject.end - hc->hsp->subject.offset);
218
219 hc = hc->next;
220 while (hc) {
221 ASSERT(hc->hsp);
222 ASSERT(hc->hsp->context == chain->context);
223
224 ASSERT(hc->hsp->gap_info->size > 1 ||
225 hc->hsp->query.end - hc->hsp->query.offset ==
226 hc->hsp->subject.end - hc->hsp->subject.offset);
227
228 if (hc->next) {
229 ASSERT(hc->hsp->query.offset < hc->next->hsp->query.offset);
230 ASSERT(hc->hsp->subject.offset < hc->next->hsp->subject.offset);
231 }
232 hc = hc->next;
233 }
234
235 chain = chain->next;
236 while (chain) {
237 hc = chain->hsps;
238 ASSERT(hc);
239 ASSERT(hc->hsp);
240 ASSERT(hc->hsp->context == chain->context);
241
242 ASSERT(hc->hsp->gap_info->size > 1 ||
243 hc->hsp->query.end - hc->hsp->query.offset ==
244 hc->hsp->subject.end - hc->hsp->subject.offset);
245
246 hc = hc->next;
247 while (hc) {
248 ASSERT(hc);
249 ASSERT(hc->hsp);
250 ASSERT(hc->hsp->context == chain->context);
251
252 ASSERT(hc->hsp->gap_info->size > 1 ||
253 hc->hsp->query.end - hc->hsp->query.offset ==
254 hc->hsp->subject.end - hc->hsp->subject.offset);
255
256 if (hc->next) {
257 ASSERT(hc->hsp->query.offset < hc->next->hsp->query.offset);
258 ASSERT(hc->hsp->subject.offset < hc->next->hsp->subject.offset);
259 }
260
261 hc = hc->next;
262 }
263 chain = chain->next;
264 }
265
266 return TRUE;
267 }
268
269 #if _DEBUG
s_TestChainsSorted(HSPChain * chain)270 static Boolean s_TestChainsSorted(HSPChain* chain)
271 {
272 HSPChain* prev;
273
274 s_TestChains(chain);
275
276 prev = chain;
277 chain = chain->next;
278 for (; chain; chain = chain->next, prev = prev->next) {
279 ASSERT(prev->score >= chain->score);
280 }
281
282 return TRUE;
283 }
284 #endif
285
286 /** Data structure used by the writer */
287 typedef struct BlastHSPMapperData {
288 BlastHSPMapperParams* params; /**< how many hits to save */
289 BLAST_SequenceBlk* query; /**< query sequence */
290 BlastQueryInfo* query_info; /**< information about queries */
291 HSPChain** saved_chains; /**< HSP chains are stored here */
292 } BlastHSPMapperData;
293
294 /*************************************************************/
295 /** The following are implementations for BlastHSPWriter ADT */
296
297 /** Perform pre-run stage-specific initialization
298 * @param data The internal data structure [in][out]
299 * @param results The HSP results to operate on [in]
300 */
301 static int
s_BlastHSPMapperPairedInit(void * data,void * results)302 s_BlastHSPMapperPairedInit(void* data, void* results)
303 {
304 BlastHSPMapperData * spl_data = data;
305 BlastHSPResults* r = (BlastHSPResults*)results;
306 spl_data->saved_chains = calloc(r->num_queries, sizeof(HSPChain*));
307
308 return 0;
309 }
310
311
312 /* Get subject start position */
s_FindFragmentStart(HSPChain * chain)313 static Int4 s_FindFragmentStart(HSPChain* chain)
314 {
315 Int2 frame;
316 HSPContainer* hc = NULL;
317 ASSERT(chain);
318 ASSERT(chain->hsps);
319 ASSERT(chain->hsps->hsp);
320
321 frame = chain->hsps->hsp->query.frame;
322 if (frame > 0) {
323 return chain->hsps->hsp->subject.offset;
324 }
325
326 hc = chain->hsps;
327 while (hc->next) {
328 hc = hc->next;
329 }
330 ASSERT(hc);
331
332 return hc->hsp->subject.end - 1;
333 }
334
335
336 /* Get subject end position */
337 /* Not used
338 static Int4 s_FindFragmentEnd(HSPChain* chain)
339 {
340 Int2 frame;
341 HSPContainer* hc = NULL;
342 ASSERT(chain);
343 ASSERT(chain->hsps);
344 ASSERT(chain->hsps->hsp);
345
346 frame = chain->hsps->hsp->query.frame;
347 if (frame < 0) {
348 return chain->hsps->hsp->subject.offset;
349 }
350
351 hc = chain->hsps;
352 while (hc->next) {
353 hc = hc->next;
354 }
355 ASSERT(hc);
356
357 return hc->hsp->subject.end - 1;
358 }
359 */
360
s_ComputeGapScore(Int4 length,Int4 open_score,Int4 extend_score,Int4 seq_error)361 static Int4 s_ComputeGapScore(Int4 length, Int4 open_score, Int4 extend_score,
362 Int4 seq_error)
363 {
364 if (length < 4) {
365 return length * seq_error;
366 }
367
368 return open_score + MIN(length, 4) * extend_score;
369 }
370
371 /* Compute HSP alignment score from Jumper edit script */
s_ComputeAlignmentScore(BlastHSP * hsp,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score)372 static Int4 s_ComputeAlignmentScore(BlastHSP* hsp, Int4 mismatch_score,
373 Int4 gap_open_score, Int4 gap_extend_score)
374 {
375 Int4 i;
376 Int4 last_pos = hsp->query.offset;
377 Int4 score = 0;
378 const Int4 kGap = 15;
379 Int4 num_identical = 0;
380 Int4 query_gap = 0;
381 Int4 subject_gap = 0;
382
383 for (i = 0;i < hsp->map_info->edits->num_edits;i++) {
384 JumperEdit* e = &(hsp->map_info->edits->edits[i]);
385 Int4 num_matches = e->query_pos - last_pos;
386 ASSERT(num_matches >= 0);
387 last_pos = e->query_pos;
388 score += num_matches;
389 num_identical += num_matches;
390
391 if (e->query_base == kGap) {
392 ASSERT(e->subject_base != kGap);
393 query_gap++;
394
395 if (subject_gap > 0) {
396 score += s_ComputeGapScore(subject_gap, -12, -1, -4);
397 subject_gap = 0;
398 }
399 }
400 else if (e->subject_base == kGap) {
401 subject_gap++;
402 last_pos++;
403
404 if (query_gap > 0) {
405 score += s_ComputeGapScore(query_gap, -12, -1, -4);
406 query_gap = 0;
407 }
408 }
409 else {
410 score += mismatch_score;
411 last_pos++;
412
413 if (subject_gap > 0) {
414 score += s_ComputeGapScore(subject_gap, -12, -1, -4);
415 subject_gap = 0;
416 }
417 if (query_gap > 0) {
418 score += s_ComputeGapScore(query_gap, -12, -1, -4);
419 query_gap = 0;
420 }
421 }
422 }
423
424 if (subject_gap > 0) {
425 score += s_ComputeGapScore(subject_gap, -12, -1, -4);
426 subject_gap = 0;
427 }
428 if (query_gap > 0) {
429 score += s_ComputeGapScore(query_gap, -12, -1, -4);
430 query_gap = 0;
431 }
432
433 score += hsp->query.end - last_pos;
434 num_identical += hsp->query.end - last_pos;
435 hsp->num_ident = num_identical;
436 return score;
437 }
438
439
440 /* Find the cost of chain HSPs overlapping on the query, as the smaller score
441 of the overlapping region */
s_GetOverlapCost(const BlastHSP * a,const BlastHSP * b,Int4 edit_penalty)442 static Int4 s_GetOverlapCost(const BlastHSP* a, const BlastHSP* b,
443 Int4 edit_penalty)
444 {
445 Int4 i;
446 Int4 overlap_f, overlap_s;
447 const BlastHSP* f;
448 const BlastHSP* s;
449
450 /* if one HSP in contained within the other on the query, return the
451 smaller score */
452 if ((a->query.offset <= b->query.offset && a->query.end >= b->query.end) ||
453 (a->query.offset >= b->query.offset && a->query.end <= b->query.end)) {
454
455 return MIN(a->score, b->score);
456 }
457
458 /* if the two HSPs are mutually exclusive on the query, there is no cost */
459 if ((a->query.end < b->query.offset && a->query.offset < b->query.end) ||
460 (b->query.end < a->query.offset && b->query.offset < a->query.end)) {
461
462 return 0;
463 }
464
465 /* otherwise the HSPs partially overlap; reurn the the smaller score for
466 the overlap region */
467
468 /* find which HSP precedes which on the query */
469 if (a->query.offset <= b->query.offset) {
470 f = a;
471 s = b;
472 }
473 else {
474 f = b;
475 s = a;
476 }
477
478 /* this is the overlap score assuming perfect matches in the overlap */
479 overlap_f = overlap_s = f->query.end - s->query.offset;
480 ASSERT(overlap_f >= 0 && overlap_s >= 0);
481 /* subtract penalties for mismatches and gaps in the overlap region */
482 for (i = 0;i < f->map_info->edits->num_edits;i++) {
483 if (f->map_info->edits->edits[i].query_pos >= s->query.offset) {
484 overlap_f -= edit_penalty;
485 }
486 }
487 for (i = 0;i < s->map_info->edits->num_edits;i++) {
488 if (s->map_info->edits->edits[i].query_pos < f->query.end) {
489 overlap_s -= edit_penalty;
490 }
491 }
492
493 return MIN(overlap_f, overlap_s);
494 }
495
496
497 /* Compute chain score */
s_ComputeChainScore(HSPChain * chain,const ScoringOptions * score_options,Int4 query_len,Boolean comp_hsp_score)498 static Int4 s_ComputeChainScore(HSPChain* chain,
499 const ScoringOptions* score_options,
500 Int4 query_len,
501 Boolean comp_hsp_score)
502 {
503 Int4 retval;
504 HSPContainer* h = NULL;
505 HSPContainer* prev = NULL;
506
507 if (!chain || !score_options) {
508 return -1;
509 }
510
511 h = chain->hsps;
512 if (comp_hsp_score) {
513 h->hsp->score = s_ComputeAlignmentScore(h->hsp, score_options->penalty,
514 score_options->gap_open,
515 score_options->gap_extend);
516 }
517 retval = h->hsp->score;
518
519 prev = h;
520 h = h->next;
521
522 for (; h; h = h->next, prev = prev->next) {
523 /* HSPs must be sorted by query positon */
524 ASSERT(h->hsp->query.offset >= prev->hsp->query.offset);
525
526 if (comp_hsp_score) {
527
528 h->hsp->score = s_ComputeAlignmentScore(h->hsp,
529 score_options->penalty,
530 score_options->gap_open,
531 score_options->gap_extend);
532 }
533 retval += h->hsp->score;
534
535 if ((h->hsp->map_info->left_edge & MAPPER_SPLICE_SIGNAL) == 0 ||
536 (prev->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) == 0) {
537
538 Int4 query_gap =
539 MAX(h->hsp->query.offset - prev->hsp->query.end, 0);
540
541 Int4 subject_gap =
542 MAX(h->hsp->subject.offset - prev->hsp->subject.end, 0);
543
544 retval += s_ComputeGapScore(query_gap, -12, -1, -4);
545 retval += s_ComputeGapScore(subject_gap, -12, -1, -4);
546 }
547
548 }
549
550 return retval;
551 }
552
553 #if _DEBUG
s_TestHSPRanges(const BlastHSP * hsp)554 static Boolean s_TestHSPRanges(const BlastHSP* hsp)
555 {
556 Int4 i;
557 Int4 d = 0;
558 Int4 q = 0, s = 0;
559 Int4 num_matches;
560 Int4 last_pos;
561 const Int4 kGap = 15;
562 for (i=0;i < hsp->gap_info->size;i++) {
563 switch (hsp->gap_info->op_type[i]) {
564 case eGapAlignIns:
565 d -= hsp->gap_info->num[i];
566 q += hsp->gap_info->num[i];
567 break;
568
569 case eGapAlignDel:
570 d += hsp->gap_info->num[i];
571 s += hsp->gap_info->num[i];
572 break;
573
574 case eGapAlignSub:
575 q += hsp->gap_info->num[i];
576 s += hsp->gap_info->num[i];
577
578 default:
579 break;
580 }
581 }
582 if (hsp->query.end - hsp->query.offset + d !=
583 hsp->subject.end - hsp->subject.offset) {
584
585 return FALSE;
586 }
587
588 ASSERT(hsp->query.end - hsp->query.offset == q);
589 ASSERT(hsp->subject.end - hsp->subject.offset == s);
590
591 d = 0;
592 q = 0;
593 s = 0;
594 last_pos = hsp->query.offset;
595 for (i=0;i < hsp->map_info->edits->num_edits;i++) {
596
597 num_matches = hsp->map_info->edits->edits[i].query_pos - last_pos - 1;
598 if (i == 0 ||
599 (i > 0 && hsp->map_info->edits->edits[i - 1].query_base == kGap)) {
600 num_matches++;
601 }
602 q += num_matches;
603 s += num_matches;
604
605 ASSERT(hsp->query.offset + q ==
606 hsp->map_info->edits->edits[i].query_pos);
607
608 if (hsp->map_info->edits->edits[i].query_base == kGap) {
609 d++;
610 s++;
611 }
612 else if (hsp->map_info->edits->edits[i].subject_base == kGap) {
613 d--;
614 q++;
615 }
616 else {
617 q++;
618 s++;
619 }
620
621 last_pos = hsp->map_info->edits->edits[i].query_pos;
622 }
623 num_matches = hsp->query.end - last_pos - 1;
624 if (hsp->map_info->edits->num_edits == 0 ||
625 (hsp->map_info->edits->num_edits > 0 &&
626 hsp->map_info->edits->edits[hsp->map_info->edits->num_edits - 1].query_base == kGap)) {
627
628 num_matches++;
629 }
630 q += num_matches;
631 s += num_matches;
632
633 ASSERT(hsp->query.end - hsp->query.offset + d ==
634 hsp->subject.end - hsp->subject.offset);
635
636 ASSERT(hsp->query.end - hsp->query.offset == q);
637 ASSERT(hsp->subject.end - hsp->subject.offset == s);
638
639 return TRUE;
640 }
641 #endif
642
643 /* Trim HSP by a number of bases on query or subject, either from the start or
644 from the end */
s_TrimHSP(BlastHSP * hsp,Int4 num,Boolean is_query,Boolean is_start,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score,const Uint1 * query_seq)645 static Int4 s_TrimHSP(BlastHSP* hsp, Int4 num, Boolean is_query,
646 Boolean is_start, Int4 mismatch_score,
647 Int4 gap_open_score, Int4 gap_extend_score,
648 const Uint1* query_seq)
649 {
650 Int4 num_left = num;
651 Int4 i = is_start ? 0 : hsp->gap_info->size - 1;
652 Int4 d = is_start ? 1 : -1;
653 Int4 end = is_start ? hsp->gap_info->size : -1;
654 Int4 delta_query = 0, delta_subject = 0;
655 Boolean is_subject = !is_query;
656 Boolean is_end = !is_start;
657 Int4 k;
658 const Uint1 kGap = 15;
659
660 ASSERT(hsp /* */ && num > 0 /* */);
661 ASSERT((is_query && num <= hsp->query.end - hsp->query.offset) ||
662 (!is_query && num <= hsp->subject.end - hsp->subject.offset));
663
664 #if _DEBUG
665 ASSERT(s_TestHSPRanges(hsp));
666 #endif
667
668 if (num == 0) {
669 return 0;
670 }
671
672
673 /* itereate over the edit script and remove the required number
674 of subject bases */
675 while (i != end && num_left > 0) {
676 switch (hsp->gap_info->op_type[i]) {
677 case eGapAlignSub:
678 if (hsp->gap_info->num[i] > num_left) {
679 hsp->gap_info->num[i] -= num_left;
680 delta_query += num_left;
681 delta_subject += num_left;
682 num_left = 0;
683 }
684 else {
685 Int4 n = hsp->gap_info->num[i];
686
687 delta_query += n;
688 delta_subject += n;
689 num_left -= n;
690 i += d;
691 }
692 break;
693
694 case eGapAlignDel:
695 if (is_subject) {
696 if (hsp->gap_info->num[i] > num_left) {
697 hsp->gap_info->num[i] -= num_left;
698 delta_subject += num_left;
699 num_left = 0;
700 }
701 else {
702 delta_subject += hsp->gap_info->num[i];
703 num_left -= hsp->gap_info->num[i];
704 i += d;
705 }
706 }
707 else {
708 delta_subject += hsp->gap_info->num[i];
709 i += d;
710 }
711 break;
712
713 case eGapAlignIns:
714 if (is_query) {
715 if (hsp->gap_info->num[i] > num_left) {
716 hsp->gap_info->num[i] -= num_left;
717 delta_query += num_left;
718 num_left = 0;
719 }
720 else {
721 delta_query += hsp->gap_info->num[i];
722 num_left -= hsp->gap_info->num[i];
723 i += d;
724 }
725 }
726 else {
727 delta_query += hsp->gap_info->num[i];
728 i += d;
729 }
730 break;
731
732 default:
733 ASSERT(0);
734 break;
735 }
736 }
737
738 /* shift edit script elements to fill the removed ones */
739 if (is_start && i > 0) {
740 for (k = 0;k < hsp->gap_info->size - i;k++) {
741 hsp->gap_info->op_type[k] = hsp->gap_info->op_type[k + i];
742 hsp->gap_info->num[k] = hsp->gap_info->num[k + i];
743 }
744 hsp->gap_info->size -= i;
745 }
746
747 if (is_end) {
748 hsp->gap_info->size = i + 1;
749 }
750
751
752 if (!hsp->map_info->subject_overhangs) {
753 hsp->map_info->subject_overhangs = calloc(1, sizeof(SequenceOverhangs));
754 if (!hsp->map_info->subject_overhangs) {
755 return -1;
756 }
757 }
758
759
760 /* update subject overhangs */
761 if (is_start) {
762 SequenceOverhangs* overhangs = hsp->map_info->subject_overhangs;
763 Uint1* subject_bases = NULL;
764 Int4 offset = 0; /* query pos -> subject pos */
765
766 ASSERT((overhangs->left && overhangs->left_len > 0) ||
767 (!overhangs->left && overhangs->left_len == 0));
768
769 overhangs->left = realloc(overhangs->left,
770 (overhangs->left_len + delta_subject) *
771 sizeof(Uint1));
772 if (!overhangs->left) {
773 return -1;
774 }
775 subject_bases = overhangs->left + overhangs->left_len;
776 overhangs->left_len += delta_subject;
777
778 memcpy(subject_bases, query_seq + hsp->query.offset, delta_subject);
779 for (k = 0;k < hsp->map_info->edits->num_edits &&
780 hsp->map_info->edits->edits[k].query_pos <
781 hsp->query.offset; k++) {
782
783 JumperEdit* edits = hsp->map_info->edits->edits + k;
784 if (edits->subject_base == kGap) {
785 offset--;
786 }
787 else {
788 ASSERT(edits->query_pos + offset >= 0);
789 ASSERT(edits->query_pos + offset < delta_subject);
790 subject_bases[edits->query_pos + offset] = edits->subject_base;
791 }
792 }
793 }
794 else {
795 SequenceOverhangs* overhangs = hsp->map_info->subject_overhangs;
796 Uint1* subject_bases = NULL;
797 Int4 offset /*= hsp->subject.offset - hsp->query.offset*/;
798 Uint1* tmp = NULL;
799
800 ASSERT((overhangs->right && overhangs->right_len > 0) ||
801 (!overhangs->right && overhangs->right_len == 0));
802
803 tmp = malloc((overhangs->right_len + delta_subject) * sizeof(Uint1));
804 if (!tmp) {
805 return -1;
806 }
807 memcpy(tmp + delta_subject, overhangs->right, overhangs->right_len);
808 free(overhangs->right);
809 overhangs->right = tmp;
810 subject_bases = overhangs->right;
811
812 memcpy(subject_bases, query_seq + hsp->query.end - delta_subject,
813 delta_subject);
814
815
816 k = 0;
817 while (k < hsp->map_info->edits->num_edits &&
818 hsp->map_info->edits->edits[k].query_pos <
819 hsp->query.end - delta_query) {
820 k++;
821 }
822
823 offset = -(hsp->query.end - delta_query);
824 for (;k < hsp->map_info->edits->num_edits; k++) {
825
826 JumperEdit* edits = hsp->map_info->edits->edits + k;
827
828 if (edits->query_pos >= hsp->query.end) {
829 ASSERT(edits->query_base == kGap);
830 continue;
831 }
832
833 if (edits->subject_base == kGap) {
834 offset--;
835 }
836 else {
837 ASSERT(edits->query_pos + offset >= 0);
838 ASSERT(edits->query_pos + offset < delta_subject);
839 subject_bases[edits->query_pos + offset] = edits->subject_base;
840 }
841 }
842
843 }
844
845
846 /* update the Jumper edit script */
847 if (is_start) {
848 Int4 k = hsp->map_info->edits->num_edits - 1;
849 Int4 p = hsp->query.end - 1;
850 for (i = hsp->gap_info->size - 1;i >= 0;i--) {
851 if (hsp->gap_info->op_type[i] != eGapAlignDel) {
852
853 p -= hsp->gap_info->num[i];
854 while (k >= 0 &&
855 hsp->map_info->edits->edits[k].query_pos > p &&
856 hsp->map_info->edits->edits[k].query_base != kGap) {
857
858 k--;
859 }
860 }
861 else {
862 Int4 j;
863 for (j = 0;j < hsp->gap_info->num[i];j++) {
864 ASSERT(k >= 0);
865 ASSERT(hsp->map_info->edits->edits[k].query_base == kGap);
866 k--;
867 }
868 }
869 }
870 k++;
871 if (k > 0) {
872 for (i = 0;i < hsp->map_info->edits->num_edits - k;i++) {
873 hsp->map_info->edits->edits[i] =
874 hsp->map_info->edits->edits[i + k];
875 }
876 hsp->map_info->edits->num_edits -= k;
877 ASSERT(hsp->map_info->edits->num_edits >= 0);
878 }
879 }
880 else {
881 Int4 k = 0;
882 Int4 p = hsp->query.offset;
883 const Uint1 kGap = 15;
884
885 for (i = 0;i < hsp->gap_info->size;i++) {
886 if (hsp->gap_info->op_type[i] != eGapAlignDel) {
887
888 p += hsp->gap_info->num[i];
889 while (k < hsp->map_info->edits->num_edits &&
890 hsp->map_info->edits->edits[k].query_pos < p &&
891 hsp->map_info->edits->edits[k].query_base != kGap) {
892
893 k++;
894 }
895 }
896 else {
897 Int4 j;
898 for (j = 0;j < hsp->gap_info->num[i];j++) {
899 ASSERT(hsp->map_info->edits->edits[k].query_base == kGap);
900 k++;
901 }
902 }
903 }
904 hsp->map_info->edits->num_edits = k;
905 }
906
907 /* update HSP start positions */
908 if (is_start) {
909 hsp->query.offset += delta_query;
910 hsp->subject.offset += delta_subject;
911 }
912 else {
913 hsp->query.end -= delta_query;
914 hsp->subject.end -= delta_subject;
915 }
916
917 /* update HSP score */
918 hsp->score = s_ComputeAlignmentScore(hsp, mismatch_score, gap_open_score,
919 gap_extend_score);
920
921 #if _DEBUG
922 ASSERT(s_TestHSPRanges(hsp));
923 #endif
924
925 return 0;
926 }
927
928
929 #define NUM_ADAPTERS 4
930 #define MAX_ADAPTER_LEN 20
931
932 /* Find adapeter on the 5' end in a single sequence. The search will start
933 towards the end of the last HSP (from, to) */
s_FindAdapterInSequence(Int4 hsp_from,Int4 hsp_to,Uint1 * query,Int4 query_len)934 static Int4 s_FindAdapterInSequence(Int4 hsp_from, Int4 hsp_to, Uint1* query,
935 Int4 query_len)
936 {
937 Uint1 adapters_tab[NUM_ADAPTERS][MAX_ADAPTER_LEN] = {
938 /* Contaminants from FastQC config file:
939 http://www.bioinformatics.babraham.ac.uk/projects/fastqc/ */
940 /* Illumina universal adapter: AGATCGGAAGAG */
941 {0, 2, 0, 3, 1, 2, 2, 0, 0, 2, 0, 2},
942 /* Illumina small RNA adapter: ATGGAATTCTCG */
943 {0, 3, 2, 2, 0, 0, 3, 3, 1, 3, 1, 2},
944 /* Nextera transosase sequence: CTGTCTCTTATA */
945 {1, 3, 2, 3, 1, 3, 1, 3, 3, 0, 3, 0},
946 /* Illumina multiplexing adapter 1: GATCGGAAGAGCACACGTCT */
947 {2, 0, 3, 1, 2, 2, 0, 0, 2, 0, 2, 1, 0, 1, 0, 1, 2, 3, 1, 3}};
948
949 int lengths[NUM_ADAPTERS] = {12, 12, 12, 20};
950 Boolean found = FALSE;
951 Int4 adapter_start = -1;
952 int adptr_idx;
953 Int4 from = hsp_from, to = hsp_to;
954 const Int4 kMaxErrors = 1; /* max number of mismaches allowed for matching
955 the adapter sequence + 1*/
956
957
958 if (!query) {
959 return -1;
960 }
961
962 for (adptr_idx = 0;!found && adptr_idx < NUM_ADAPTERS;adptr_idx++) {
963 Uint1* adapter = adapters_tab[adptr_idx];
964 Uint4 word = *(Uint4*)adapter;
965 Int4 q = MAX(to - lengths[adptr_idx], from);
966 int i = 0;
967 while (q < query_len - 4 && q + i < query_len && i < lengths[adptr_idx]) {
968 while (q < query_len - 4 && *(Uint4*)(query + q) != word) {
969 q++;
970 }
971 if (q < query_len - 4) {
972 Int4 errors = kMaxErrors + 1;
973 i = 0;
974 while (q + i < query_len && i < lengths[adptr_idx] &&
975 errors) {
976
977 if (query[q + i] != adapter[i]) {
978 errors--;
979 }
980 i++;
981 }
982 if (q + i == query_len || i == lengths[adptr_idx]) {
983 adapter_start = q;
984 found = TRUE;
985 break;
986 }
987
988 q++;
989 }
990 }
991 }
992
993 ASSERT(adapter_start <= query_len);
994 return adapter_start;
995 }
996
997
998 /* Set adapter position in the chain and trim alignments that span beyond
999 adapter start position */
s_SetAdapter(HSPChain ** chains_ptr,Int4 adapter_pos,const Uint1 * query,Int4 query_len,const ScoringOptions * scores)1000 static Int2 s_SetAdapter(HSPChain** chains_ptr, Int4 adapter_pos,
1001 const Uint1* query, Int4 query_len,
1002 const ScoringOptions* scores)
1003 {
1004 HSPChain* head = NULL;
1005 HSPChain* chain = NULL;
1006 const Int4 kMinAdapterLen = 3;
1007
1008 if (!chains_ptr || !*chains_ptr || adapter_pos < 0) {
1009 return -1;
1010 }
1011
1012 head = *chains_ptr;
1013 chain = *chains_ptr;
1014 /* iterate over chains */
1015 while (chain) {
1016 BlastHSP* hsp = chain->hsps->hsp;
1017
1018 /* check the 3' alignment extent on the query and skip this chain
1019 if the query is covered almost to the end */
1020 if (hsp->query.frame > 0) {
1021 /* for positive strand, check the last query position */
1022 HSPContainer* h = chain->hsps;
1023 while (h->next) {
1024 h = h->next;
1025 }
1026 ASSERT(h);
1027 if (query_len - h->hsp->query.end < kMinAdapterLen) {
1028 chain = chain->next;
1029 continue;
1030 }
1031 }
1032 else {
1033 /* for negative strand, check the first query position */
1034 ASSERT(hsp);
1035 if (hsp->query.offset < kMinAdapterLen) {
1036 chain = chain->next;
1037 continue;
1038 }
1039 }
1040
1041 /* set adapter start position in the chain */
1042 chain->adapter = adapter_pos;
1043
1044 /* Trim alignment that covers part of the adapter */
1045 /* for query on the positive strand */
1046 if (hsp->query.frame > 0) {
1047 HSPContainer* h = chain->hsps;
1048
1049 /* if all HSPs are behind the adapter, delete this chain;
1050 we require that HSP without adapter is at least 5 bases long */
1051 if (h->hsp->query.offset >= adapter_pos - 5) {
1052 HSPChain* prev = head;
1053 HSPChain* next = chain->next;
1054 while (prev && prev->next != chain) {
1055 prev =prev->next;
1056 }
1057
1058 chain->next = NULL;
1059 HSPChainFree(chain);
1060 chain = next;
1061 if (prev) {
1062 prev->next = next;
1063 }
1064 else {
1065 head = chain;
1066 }
1067
1068 continue;
1069 }
1070
1071 /* find the first HSP with end position after adapter */
1072 while (h && h->hsp->query.end <= adapter_pos) {
1073 h = h->next;
1074 }
1075
1076 /* if one is found */
1077 if (h) {
1078 HSPContainer* hh = NULL;
1079 hsp = h->hsp;
1080
1081 /* if adapter is with in the HSP trim the HSP */
1082 if (hsp->query.offset < adapter_pos ) {
1083 Int4 old_score = hsp->score;
1084 ASSERT(hsp->query.end - adapter_pos > 0);
1085 s_TrimHSP(hsp, hsp->query.end - adapter_pos, TRUE,
1086 FALSE, scores->penalty, scores->gap_open,
1087 scores->gap_extend, query);
1088 chain->score += hsp->score - old_score;
1089
1090 /* remove HSPs after h as they are past the adapter */
1091 if (h->next) {
1092 h->next = HSPContainerFree(h->next);
1093 }
1094 }
1095 else {
1096 /* otherwise delete h and everything after it */
1097 hh = chain->hsps;
1098 while (hh && hh->next != h) {
1099 hh = hh->next;
1100 }
1101 ASSERT(hh);
1102 hh->next = HSPContainerFree(hh->next);
1103 }
1104
1105 /* compute the new chain score */
1106 chain->score = s_ComputeChainScore(chain, scores, query_len,
1107 FALSE);
1108 }
1109
1110 /* mark adapter in the HSPs */
1111 h = chain->hsps;
1112 while (h->next) {
1113 h = h->next;
1114 }
1115 h->hsp->map_info->right_edge |= MAPPER_ADAPTER;
1116 h->hsp->map_info->right_edge |= MAPPER_EXON;
1117 }
1118 else {
1119 /* negative strand: adapter is in the beginning of the chain */
1120 Int4 pos_minus = query_len - adapter_pos - 1;
1121 HSPContainer* h = chain->hsps;
1122
1123 /* find the first HSP with end position after adapter;
1124 we require that HSP without adapter is at least 5 bases long */
1125 while (h && h->hsp->query.end <= pos_minus + 5) {
1126 h = h->next;
1127 }
1128
1129 /* if all HSPs are before the adapter, delete this chain */
1130 if (!h) {
1131 /* remove chain */
1132 HSPChain* prev = head;
1133 HSPChain* next = chain->next;
1134 while (prev && prev->next != chain) {
1135 prev =prev->next;
1136 }
1137
1138 chain->next = NULL;
1139 HSPChainFree(chain);
1140 chain = next;
1141 if (prev) {
1142 prev->next = next;
1143 }
1144 else {
1145 head = chain;
1146 }
1147
1148 continue;
1149 }
1150
1151 /* if h is not the first HSP in the chain, remove all HSPs before
1152 h */
1153 if (h != chain->hsps) {
1154 HSPContainer* hh = chain->hsps;
1155 while (hh && hh->next != h) {
1156 hh = hh->next;
1157 }
1158 ASSERT(hh);
1159 hh->next = NULL;
1160 HSPContainerFree(chain->hsps);
1161 chain->hsps = h;
1162
1163 /* compute new chain score */
1164 chain->score = s_ComputeChainScore(chain, scores, query_len,
1165 FALSE);
1166 }
1167 ASSERT(h);
1168
1169 /* set adapter information */
1170 hsp = chain->hsps->hsp;
1171 hsp->map_info->left_edge |= MAPPER_ADAPTER;
1172 hsp->map_info->left_edge |= MAPPER_EXON;
1173
1174 /* trim HSP if necessary */
1175 if (pos_minus >= hsp->query.offset) {
1176 Int4 old_score = hsp->score;
1177 ASSERT(pos_minus - hsp->query.offset + 1 > 0);
1178 s_TrimHSP(hsp, pos_minus - hsp->query.offset + 1,
1179 TRUE, TRUE, scores->penalty, scores->gap_open,
1180 scores->gap_extend, query);
1181 chain->score += hsp->score - old_score;
1182 }
1183 }
1184
1185 chain = chain->next;
1186 }
1187
1188 ASSERT(!head || s_TestChains(head));
1189 *chains_ptr = head;
1190
1191 return 0;
1192 }
1193
1194
s_FindAdapters(HSPChain ** saved,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * score_opts)1195 static Int4 s_FindAdapters(HSPChain** saved,
1196 const BLAST_SequenceBlk* query_blk,
1197 const BlastQueryInfo* query_info,
1198 const ScoringOptions* score_opts)
1199 {
1200 Int4 query_idx;
1201
1202 for (query_idx = 0;query_idx < query_info->num_queries;query_idx++) {
1203 HSPChain* chain = NULL;
1204 HSPChain* ch = NULL;
1205 Uint1* query = NULL;
1206 Int4 query_len;
1207 Int4 from = -1, to = -1;
1208 BlastHSP* hsp = NULL;
1209 Int4 overhang = 0;
1210 Int4 adapter_pos = -1;
1211
1212 if (!saved[query_idx]) {
1213 continue;
1214 }
1215
1216 /* we search for adapters and only in the plus strand of the query */
1217 query = query_blk->sequence +
1218 query_info->contexts[query_idx * NUM_STRANDS].query_offset;
1219 ASSERT(query);
1220 query_len = query_info->contexts[query_idx * NUM_STRANDS].query_length;
1221
1222 /* find the best scoring chain */
1223 chain = saved[query_idx];
1224 ch = chain->next;
1225 for (; ch; ch = ch->next) {
1226 if (ch->score > chain->score) {
1227 chain = ch;
1228 }
1229 }
1230 ASSERT(chain);
1231
1232 /* find query coverage by the whole chain */
1233 if (chain->hsps->hsp->query.frame > 0) {
1234 HSPContainer* h = chain->hsps;
1235 from = h->hsp->query.offset;
1236 while (h->next) {
1237 h = h->next;
1238 }
1239 to = h->hsp->query.end;
1240 }
1241 else {
1242 HSPContainer* h = chain->hsps;
1243 to = query_len - h->hsp->query.offset;
1244 while (h->next) {
1245 h = h->next;
1246 }
1247 from = query_len - h->hsp->query.end;
1248 }
1249 ASSERT(from >= 0 && to >= 0);
1250
1251 /* do not search for adapters if the query is mostly covered by the
1252 best scoring chain */
1253 if (from < 20 && to > query_len - 3) {
1254 continue;
1255 }
1256
1257 /* find the chain with the longest overhang */
1258 chain = saved[query_idx];
1259 ch = chain->next;
1260 for (; ch; ch = ch->next) {
1261 HSPContainer* h = ch->hsps;
1262 if (h->hsp->query.frame > 0) {
1263 while (h->next) {
1264 h = h->next;
1265 }
1266 if (query_len - h->hsp->query.end > overhang) {
1267 overhang = query_len - h->hsp->query.end;
1268 chain = ch;
1269 }
1270 }
1271 else {
1272 if (h->hsp->query.offset + 1 > overhang) {
1273 overhang = h->hsp->query.offset + 1;
1274 chain = ch;
1275 }
1276 }
1277 }
1278 ASSERT(chain);
1279
1280 /* find location from where to search for the adapter */
1281 hsp = chain->hsps->hsp;
1282 if (hsp->query.frame > 0) {
1283 HSPContainer* h = chain->hsps;
1284 while (h->next) {
1285 h = h->next;
1286 }
1287 hsp = h->hsp;
1288 }
1289 ASSERT(hsp);
1290
1291 if (hsp->query.frame > 0) {
1292 from = hsp->query.offset;
1293 to = hsp->query.end;
1294 }
1295 else {
1296 from = query_len - hsp->query.end;
1297 to = query_len - hsp->query.offset;
1298 }
1299
1300 /* do not search for adapters if the read aligns almost to the end */
1301 if (to >= query_len - 3) {
1302 continue;
1303 }
1304
1305 /* search for adapters */
1306 adapter_pos = s_FindAdapterInSequence(from, to, query, query_len);
1307
1308 /* set adapter information in all chains and trim HSPs that extend
1309 into the adapter */
1310 if (adapter_pos >= 0) {
1311 s_SetAdapter(&(saved[query_idx]), adapter_pos, query, query_len,
1312 score_opts);
1313 }
1314 }
1315
1316 return 0;
1317 }
1318
1319
1320 /* Search for polyA tail at the end of a sequence and return the start position */
s_FindPolyAInSequence(Uint1 * sequence,Int4 length)1321 static Int4 s_FindPolyAInSequence(Uint1* sequence, Int4 length)
1322 {
1323 Int4 i;
1324 const Uint1 kBaseA = 0;
1325 Int4 num_a = 0;
1326 Int4 err = 0;
1327 const Int4 kMaxErrors = 3;
1328
1329 if (!sequence) {
1330 return -1;
1331 }
1332
1333 /* iterate over positions untile kMaxErrors non A bases are found */
1334 i = length - 1;
1335 while (i >= 0 && err < kMaxErrors) {
1336 if (sequence[i] != kBaseA) {
1337 err++;
1338 }
1339
1340 i--;
1341 }
1342 i++;
1343
1344 /* find the beginnig of the string of As */
1345 while (i < length - 1 &&
1346 (sequence[i] != kBaseA || sequence[i + 1] != kBaseA)) {
1347
1348 if (sequence[i] != kBaseA) {
1349 err--;
1350 }
1351 i++;
1352 }
1353
1354 num_a = length - i - err;
1355
1356 /* short tails must have no errors */
1357 if (num_a < 3 || (num_a < 5 && err > 0)) {
1358 return -1;
1359 }
1360
1361 return i;
1362 }
1363
1364
1365 /* Set PolyA information in HSP chains */
s_SetPolyATail(HSPChain * chains,Int4 positive_start,Int4 negative_start,Int4 query_len)1366 static Int4 s_SetPolyATail(HSPChain* chains, Int4 positive_start,
1367 Int4 negative_start, Int4 query_len)
1368 {
1369 HSPChain* ch = NULL;
1370
1371 if (!chains) {
1372 return -1;
1373 }
1374
1375 for (ch = chains; ch; ch = ch->next) {
1376 HSPContainer* h = ch->hsps;
1377 while (h->next) {
1378 h = h->next;
1379 }
1380
1381 if (query_len - h->hsp->query.end < 5) {
1382 continue;
1383 }
1384
1385 if ((h->hsp->query.frame < 0 && negative_start >= 0) ||
1386 (h->hsp->query.frame > 0 && positive_start >= 0)) {
1387
1388 h->hsp->map_info->right_edge |= MAPPER_POLY_A;
1389 h->hsp->map_info->right_edge |= MAPPER_EXON;
1390
1391 if (h->hsp->query.frame > 0) {
1392 ch->polyA = MAX(positive_start, h->hsp->query.end);
1393 }
1394 else {
1395 ch->polyA = MAX(negative_start, h->hsp->query.end);
1396 }
1397 }
1398 }
1399
1400 return 0;
1401 }
1402
1403
s_FindPolyATails(HSPChain ** saved,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info)1404 static Int4 s_FindPolyATails(HSPChain** saved,
1405 const BLAST_SequenceBlk* query_blk,
1406 const BlastQueryInfo* query_info)
1407 {
1408 Int4 query_idx;
1409
1410 /* for each query */
1411 for (query_idx = 0;query_idx < query_info->num_queries;query_idx++) {
1412 HSPChain* chain = NULL;
1413 HSPChain* ch = NULL;
1414 Uint1* query = NULL;
1415 Int4 query_len;
1416 Int4 from = -1, to = -1;
1417 Int4 positive_start, negative_start;
1418
1419 /* skip queries with no results and those with adapters */
1420 if (!saved[query_idx] || saved[query_idx]->adapter >= 0) {
1421 continue;
1422 }
1423
1424 query_len = query_info->contexts[query_idx * NUM_STRANDS].query_length;
1425
1426 /* find the best scoring chain */
1427 chain = saved[query_idx];
1428 ch = chain->next;
1429 for (; ch; ch = ch->next) {
1430 if (ch->score > chain->score) {
1431 chain = ch;
1432 }
1433 }
1434 ASSERT(chain);
1435
1436 /* find query coverage by the whole chain */
1437 if (chain->hsps->hsp->query.frame > 0) {
1438 HSPContainer* h = chain->hsps;
1439 from = h->hsp->query.offset;
1440 while (h->next) {
1441 h = h->next;
1442 }
1443 to = h->hsp->query.end;
1444 }
1445 else {
1446 HSPContainer* h = chain->hsps;
1447 to = query_len - h->hsp->query.offset;
1448 while (h->next) {
1449 h = h->next;
1450 }
1451 from = query_len - h->hsp->query.end;
1452 }
1453 ASSERT(from >= 0 && to >= 0);
1454
1455 /* do not search for polyA tails if the query is mostly covered by the
1456 best scoring chain */
1457 if (from < 4 && to > query_len - 3) {
1458 continue;
1459 }
1460
1461 /* search for polyA tail on the positive strand */
1462 query = query_blk->sequence +
1463 query_info->contexts[query_idx * NUM_STRANDS].query_offset;
1464 positive_start = s_FindPolyAInSequence(query, query_len);
1465
1466 /* search for the polyA tail on the negative strand */
1467 query = query_blk->sequence +
1468 query_info->contexts[query_idx * NUM_STRANDS + 1].query_offset;
1469 negative_start = s_FindPolyAInSequence(query, query_len);
1470
1471 /* set polyA start positions and HSP flags in the saved chains */
1472 if (positive_start >= 0 || negative_start >= 0) {
1473 s_SetPolyATail(saved[query_idx], positive_start, negative_start,
1474 query_len);
1475 }
1476 }
1477
1478 return 0;
1479 }
1480
1481
1482 /* Merge two HSPs into one. The two HSPs may only be separated by at most
1483 one gap or mismatch. Function returns NULL is the HSPs cannon be merged */
s_MergeHSPs(const BlastHSP * first,const BlastHSP * second,const Uint1 * query,const ScoringOptions * score_opts)1484 static BlastHSP* s_MergeHSPs(const BlastHSP* first, const BlastHSP* second,
1485 const Uint1* query,
1486 const ScoringOptions* score_opts)
1487 {
1488 BlastHSP* merged_hsp = NULL; /* this will be the result */
1489 const BlastHSP* hsp = second;
1490 Int4 query_gap;
1491 Int4 subject_gap;
1492 Int4 mismatches = 0;
1493 Int4 gap_info_size;
1494 Int4 edits_size;
1495 Int4 k;
1496 Int4 offset;
1497 const Uint1 kGap = 15;
1498
1499 if (!first || !second || !query || !score_opts) {
1500 return NULL;
1501 }
1502
1503 merged_hsp = Blast_HSPClone(first);
1504 if (!merged_hsp) {
1505 return NULL;
1506 }
1507
1508 query_gap = second->subject.offset - first->subject.end;
1509 subject_gap = second->query.offset - first->query.end;
1510
1511 if (query_gap < 0 || subject_gap < 0) {
1512
1513 return NULL;
1514 }
1515
1516 if (MAX(query_gap, subject_gap) < 4) {
1517 mismatches = MIN(query_gap, subject_gap);
1518 query_gap -= mismatches;
1519 subject_gap -= mismatches;
1520 }
1521
1522 gap_info_size = first->gap_info->size + second->gap_info->size + 3;
1523
1524 edits_size = first->map_info->edits->num_edits +
1525 second->map_info->edits->num_edits +
1526 mismatches + query_gap + subject_gap;
1527
1528 /* FIXME: should be done through an API */
1529 /* reallocate memory for edit scripts */
1530 merged_hsp->gap_info->op_type = realloc(merged_hsp->gap_info->op_type,
1531 gap_info_size *
1532 sizeof(EGapAlignOpType));
1533
1534 merged_hsp->gap_info->num = realloc(merged_hsp->gap_info->num,
1535 gap_info_size *
1536 sizeof(Int4));
1537
1538 merged_hsp->map_info->edits->edits = realloc(
1539 merged_hsp->map_info->edits->edits,
1540 edits_size * sizeof(JumperEdit));
1541
1542 if (!merged_hsp->gap_info->op_type ||
1543 !merged_hsp->gap_info->num ||
1544 !merged_hsp->map_info->edits->edits) {
1545
1546 Blast_HSPFree(merged_hsp);
1547 return NULL;
1548 }
1549
1550 /* add mismatches to gap_info */
1551 if (mismatches > 0) {
1552 if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1553 == eGapAlignSub) {
1554 merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1555 mismatches;
1556 }
1557 else {
1558 merged_hsp->gap_info->op_type[merged_hsp->gap_info->size] =
1559 eGapAlignSub;
1560 merged_hsp->gap_info->num[merged_hsp->gap_info->size] =
1561 mismatches;
1562 merged_hsp->gap_info->size++;
1563 }
1564 ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1565 }
1566
1567
1568 /* add query gap to gap info */
1569 if (query_gap > 0) {
1570 if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1571 == eGapAlignDel) {
1572
1573 merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1574 query_gap;
1575 }
1576 else {
1577 merged_hsp->gap_info->op_type[merged_hsp->gap_info->size] =
1578 eGapAlignDel;
1579 merged_hsp->gap_info->num[merged_hsp->gap_info->size] =
1580 query_gap;
1581 merged_hsp->gap_info->size++;
1582 }
1583 ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1584 }
1585
1586 /* add subject gap to gap info */
1587 if (subject_gap > 0) {
1588 if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1589 == eGapAlignIns) {
1590
1591 merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1592 subject_gap;
1593 }
1594 else {
1595 merged_hsp->gap_info->op_type[merged_hsp->gap_info->size] =
1596 eGapAlignIns;
1597 merged_hsp->gap_info->num[merged_hsp->gap_info->size] =
1598 subject_gap;
1599 merged_hsp->gap_info->size++;
1600 }
1601 ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1602 }
1603
1604 /* merge gap_info */
1605 for (k = 0;k < hsp->gap_info->size;k++) {
1606
1607 if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1608 == hsp->gap_info->op_type[k]) {
1609
1610 merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1611 hsp->gap_info->num[k];
1612 }
1613 else {
1614 merged_hsp->gap_info->op_type[merged_hsp->gap_info->size]
1615 = hsp->gap_info->op_type[k];
1616
1617 merged_hsp->gap_info->num[merged_hsp->gap_info->size++]
1618 = hsp->gap_info->num[k];
1619 }
1620 ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1621 }
1622
1623 /* find translation of a query into the subject position:
1624 query pos + offset = subject pos */
1625 offset = merged_hsp->subject.offset - merged_hsp->query.offset;
1626 for (k = 0;k < merged_hsp->map_info->edits->num_edits; k++) {
1627 if (merged_hsp->map_info->edits->edits[k].query_base == kGap) {
1628 offset++;
1629 }
1630 else if (merged_hsp->map_info->edits->edits[k].subject_base == kGap) {
1631 offset--;
1632 }
1633 }
1634
1635 /* add mismatches to jumper edits */
1636 if (mismatches > 0) {
1637 for (k = 0;k < mismatches;k++) {
1638 Int4 subject_pos;
1639 JumperEdit* edit = merged_hsp->map_info->edits->edits +
1640 merged_hsp->map_info->edits->num_edits++;
1641
1642 edit->query_pos = merged_hsp->query.end + k;
1643 /* FIXME: Mismatch bases cannot be currently set because there is
1644 no access to query or subject sequence in this function. */
1645 edit->query_base = query[edit->query_pos];
1646
1647
1648 subject_pos = edit->query_pos + offset - merged_hsp->subject.end;
1649 if (merged_hsp->map_info->subject_overhangs &&
1650 merged_hsp->map_info->subject_overhangs->right &&
1651 subject_pos <
1652 merged_hsp->map_info->subject_overhangs->right_len) {
1653
1654 edit->subject_base =
1655 merged_hsp->map_info->subject_overhangs->right[subject_pos];
1656 }
1657 else {
1658 edit->subject_base = edit->query_base;
1659 }
1660
1661
1662 ASSERT(merged_hsp->map_info->edits->num_edits <= edits_size);
1663 }
1664 }
1665
1666 /* add query gap to jumper edits */
1667 if (query_gap > 0) {
1668 for (k = 0;k < query_gap;k++) {
1669 Int4 subject_pos;
1670 JumperEdit* edit = merged_hsp->map_info->edits->edits +
1671 merged_hsp->map_info->edits->num_edits++;
1672
1673 edit->query_pos = merged_hsp->query.end + mismatches;
1674 /* FIXME: Mismatch bases cannot be currently set because there is
1675 no access to query or subject sequence in this function. */
1676 edit->query_base = kGap;
1677
1678 subject_pos = edit->query_pos + offset - merged_hsp->subject.end;
1679 if (merged_hsp->map_info->subject_overhangs &&
1680 merged_hsp->map_info->subject_overhangs->right &&
1681 subject_pos <
1682 merged_hsp->map_info->subject_overhangs->right_len) {
1683
1684 edit->subject_base =
1685 merged_hsp->map_info->subject_overhangs->right[subject_pos];
1686 }
1687 else {
1688 edit->subject_base = 0;
1689 }
1690 offset++;
1691
1692 ASSERT(merged_hsp->map_info->edits->num_edits <= edits_size);
1693 }
1694 }
1695
1696 /* add subject gap to jumper edits */
1697 if (subject_gap > 0) {
1698 for (k = 0;k < subject_gap;k++) {
1699 JumperEdit* edit = merged_hsp->map_info->edits->edits +
1700 merged_hsp->map_info->edits->num_edits++;
1701
1702 edit->query_pos = merged_hsp->query.end + mismatches + k;
1703 /* FIXME: Mismatch bases cannot be currently set because there is
1704 no access to query or subject sequence in this function. */
1705 edit->query_base = query[edit->query_pos];
1706 edit->subject_base = kGap;
1707
1708 ASSERT(merged_hsp->map_info->edits->num_edits <= edits_size);
1709 }
1710 }
1711
1712 /* merge jumper edits */
1713 if (hsp->map_info->edits->num_edits) {
1714 JumperEdit* edit = merged_hsp->map_info->edits->edits +
1715 merged_hsp->map_info->edits->num_edits;
1716
1717 ASSERT(merged_hsp->map_info->edits->num_edits +
1718 hsp->map_info->edits->num_edits <= edits_size);
1719
1720 memcpy(edit, hsp->map_info->edits->edits,
1721 hsp->map_info->edits->num_edits * sizeof(JumperEdit));
1722
1723 merged_hsp->map_info->edits->num_edits +=
1724 hsp->map_info->edits->num_edits;
1725 }
1726
1727 /* update alignment extents, scores, etc */
1728 merged_hsp->query.end = hsp->query.end;
1729 merged_hsp->subject.end = hsp->subject.end;
1730 merged_hsp->score = s_ComputeAlignmentScore(merged_hsp,
1731 score_opts->penalty,
1732 score_opts->gap_open,
1733 score_opts->gap_extend);
1734
1735 merged_hsp->map_info->right_edge = hsp->map_info->right_edge;
1736 if (!merged_hsp->map_info->subject_overhangs) {
1737 return merged_hsp;
1738 }
1739
1740 if (!hsp->map_info->subject_overhangs ||
1741 !hsp->map_info->subject_overhangs->right_len ||
1742 !hsp->map_info->subject_overhangs->right) {
1743
1744 merged_hsp->map_info->subject_overhangs->right_len = 0;
1745 if (merged_hsp->map_info->subject_overhangs->right) {
1746 sfree(merged_hsp->map_info->subject_overhangs->right);
1747 merged_hsp->map_info->subject_overhangs->right = NULL;
1748 }
1749 }
1750 else {
1751 Int4 new_right_len = hsp->map_info->subject_overhangs->right_len;
1752 if (merged_hsp->map_info->subject_overhangs->right_len <
1753 new_right_len) {
1754
1755 merged_hsp->map_info->subject_overhangs->right =
1756 realloc(merged_hsp->map_info->subject_overhangs->right,
1757 new_right_len * sizeof(Uint1));
1758 }
1759 memcpy(merged_hsp->map_info->subject_overhangs->right,
1760 hsp->map_info->subject_overhangs->right,
1761 new_right_len * sizeof(Uint1));
1762 }
1763
1764 return merged_hsp;
1765 }
1766
1767
1768 /* Find paired reads mapped to different database sequences */
s_FindRearrangedPairs(HSPChain ** saved,const BlastQueryInfo * query_info)1769 static int s_FindRearrangedPairs(HSPChain** saved,
1770 const BlastQueryInfo* query_info)
1771 {
1772 Int4 query_idx;
1773
1774 /* iterate over single reads from each pair */
1775 for (query_idx = 0;query_idx + 1 < query_info->num_queries; query_idx++) {
1776 HSPChain* chain = saved[query_idx];
1777 HSPChain* thepair = saved[query_idx + 1];
1778
1779 /* skip queries with no results */
1780 if (!chain || !thepair) {
1781 continue;
1782 }
1783
1784 /* skip queries that do not have pairs */
1785 if (query_info->contexts[query_idx * NUM_STRANDS].segment_flags !=
1786 eFirstSegment) {
1787
1788 continue;
1789 }
1790
1791 /* skip queries with more than one saved chain and ones that already
1792 have a pair */
1793 if (chain->next || chain->pair || thepair->next) {
1794 continue;
1795 }
1796
1797 /* skip chains aligned to the same subject; these are taken care of
1798 elsewhere */
1799 if (chain->oid == thepair->oid) {
1800 continue;
1801 }
1802
1803 /* skip HSP chains aligned on the same strands */
1804 if (SIGN(chain->hsps->hsp->query.frame) ==
1805 SIGN(thepair->hsps->hsp->query.frame)) {
1806 continue;
1807 }
1808
1809 /* mark the pair */
1810 chain->pair = thepair;
1811 thepair->pair = chain;
1812
1813 chain->pair_conf = PAIR_PARALLEL;
1814 thepair->pair_conf = PAIR_PARALLEL;
1815 }
1816
1817 return 0;
1818 }
1819
1820
1821 /* Sort chains by score in descending order */
s_CompareChainsByScore(const void * cha,const void * chb)1822 static int s_CompareChainsByScore(const void* cha, const void* chb)
1823 {
1824 const HSPChain* a = *((HSPChain**)cha);
1825 const HSPChain* b = *((HSPChain**)chb);
1826
1827 if (a->score < b->score) {
1828 return 1;
1829 }
1830 else if (a->score > b->score) {
1831 return -1;
1832 }
1833 else {
1834 return 0;
1835 }
1836 }
1837
1838 /* Remove chains with scores below the these of the best ones considering
1839 score bonus for pairs and count the number of chains with same of better
1840 score for each chain */
s_PruneChains(HSPChain ** saved,Int4 num_queries,Int4 pair_bonus)1841 static int s_PruneChains(HSPChain** saved, Int4 num_queries, Int4 pair_bonus)
1842
1843 {
1844 Int4 query_idx;
1845 Int4 array_size = 10;
1846 HSPChain** array = calloc(array_size, sizeof(HSPChain*));
1847
1848 /* Prunning is done in two passes: first considering single chains,
1849 then considering pairs. The first pass breaks pairs where there is much
1850 better chain with no pair. The second pass selects the best scoring
1851 chains giving preference to pairs. Pairs are preferable, so they get
1852 additional score. Two passes are requires, because we compare single
1853 chains with single chains and pairs with pairs. This is to avoid
1854 breaking pairs in situations with two pairs with one high scoring
1855 chain. */
1856
1857 /* first pass: for each query remove poor scoring chains that may split
1858 pairs */
1859 for (query_idx = 0; query_idx < num_queries; query_idx++) {
1860 HSPChain* chain = saved[query_idx];
1861 HSPChain* prev = NULL;
1862 Int4 best_score = 0;
1863
1864 if (!chain || !chain->next) {
1865 continue;
1866 }
1867
1868 /* find the best single chain score */
1869 for (chain = saved[query_idx]; chain; chain = chain->next) {
1870 if (chain->score > best_score) {
1871 best_score = chain->score;
1872 }
1873 }
1874
1875 /* remove chains with poor scores, considering pair bonus */
1876 chain = saved[query_idx];
1877 while (chain) {
1878 Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1879
1880 if (score < best_score) {
1881 HSPChain* next = chain->next;
1882
1883 chain->next = NULL;
1884 HSPChainFree(chain);
1885 chain = next;
1886
1887 if (prev) {
1888 prev->next = next;
1889 }
1890 else {
1891 saved[query_idx] = next;
1892 }
1893 }
1894 else {
1895 prev = chain;
1896 chain = chain->next;
1897 }
1898 }
1899
1900 }
1901
1902 /* second pass: for each query remove chains scoring poorer than the best
1903 ones considering pair bonus */
1904 for (query_idx = 0; query_idx < num_queries; query_idx++) {
1905 HSPChain* chain = saved[query_idx];
1906 HSPChain* prev = NULL;
1907 Int4 best_score = 0;
1908 Int4 best_pair_score = 0;
1909 Int4 i;
1910 Int4 count = 0;
1911 Int4 num_chains = 0;
1912
1913 if (!chain) {
1914 continue;
1915 }
1916
1917 if (!chain->next) {
1918 chain->count = 1;
1919 continue;
1920 }
1921
1922 /* find the best scores for single chains and pairs */
1923 for (chain = saved[query_idx]; chain; chain = chain->next) {
1924 Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1925 if (score >= best_score) {
1926 best_score = score;
1927 }
1928
1929 if (chain->pair &&
1930 chain->score + chain->pair->score >= best_pair_score) {
1931
1932 best_pair_score = chain->score + chain->pair->score;
1933 }
1934
1935 /* collect chains in the array */
1936 if (num_chains >= array_size) {
1937 array_size *= 2;
1938 array = realloc(array, array_size * sizeof(HSPChain*));
1939 if (!array) {
1940 return -1;
1941 }
1942 }
1943 array[num_chains++] = chain;
1944 }
1945
1946 /* sort chains by scores in descending order */
1947 ASSERT(num_chains > 1);
1948 qsort(array, num_chains, sizeof(HSPChain*), s_CompareChainsByScore);
1949
1950 /* find multiplicity for each chain: number of chains with same or
1951 larger score */
1952 /* Note that multiplicity is currently not reported. Only the number
1953 of returned hits for each query is reported */
1954 for (i = 0;i < num_chains; i++) {
1955 Int4 k;
1956
1957 count++;
1958 /* check for chains with the same score */
1959 k = i + 1;
1960 while (k < num_chains && array[k]->score == array[i]->score) {
1961 k++;
1962 count++;
1963 }
1964
1965 /* save multiplicity */
1966 for (;i < k;i++) {
1967 array[i]->count = count;
1968 }
1969 i--;
1970 }
1971
1972
1973 /* remove chains with poor score and count chains with equal or better
1974 score to the minimum pair one */
1975 chain = saved[query_idx];
1976 while (chain) {
1977 Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1978
1979 /* We compare single chains with single chains and pairs with
1980 pairs. There may be pairs with one strong and one weak hit.
1981 Comparing single chain scores we can be left with single chains
1982 from different pairs. This comparison aviods that. The first
1983 pass ensures that pairs with poor scoring chains are removed,
1984 so we do not have to compare singles with pairs. */
1985 if ((!chain->pair && score < best_score) ||
1986 (chain->pair && chain->score + chain->pair->score <
1987 best_pair_score)) {
1988
1989 HSPChain* next = chain->next;
1990
1991 chain->next = NULL;
1992 HSPChainFree(chain);
1993 chain = next;
1994
1995 if (prev) {
1996 prev->next = next;
1997 }
1998 else {
1999 saved[query_idx] = next;
2000 }
2001 }
2002 else {
2003 prev = chain;
2004 chain = chain->next;
2005 }
2006 }
2007 }
2008
2009 if (array) {
2010 sfree(array);
2011 }
2012
2013 return 0;
2014 }
2015
2016
2017 /* Compare chains by subject oid and offfset */
s_CompareChainsByOid(const void * cha,const void * chb)2018 static int s_CompareChainsByOid(const void* cha, const void* chb)
2019 {
2020 const HSPChain* a = *((HSPChain**)cha);
2021 const HSPChain* b = *((HSPChain**)chb);
2022
2023 if (a->oid > b->oid) {
2024 return 1;
2025 }
2026 else if (a->oid < b->oid) {
2027 return -1;
2028 }
2029 else if (a->hsps->hsp->subject.offset > b->hsps->hsp->subject.offset) {
2030 return 1;
2031 }
2032 else if (a->hsps->hsp->subject.offset < b->hsps->hsp->subject.offset) {
2033 return -1;
2034 }
2035
2036 return 0;
2037 }
2038
2039 /* Sort chains by subject oid and position and HSPs in the chains by query and
2040 subject positions */
s_SortChains(HSPChain ** saved,Int4 num_queries,int (* comp)(const void *,const void *))2041 static Int4 s_SortChains(HSPChain** saved, Int4 num_queries,
2042 int(*comp)(const void*, const void*))
2043 {
2044 Int4 i;
2045 Int4 array_size = 50;
2046 HSPChain** chain_array = NULL;
2047
2048 if (!saved || num_queries < 0) {
2049 return -1;
2050 }
2051
2052 chain_array = calloc(array_size, sizeof(HSPChain*));
2053 if (!chain_array) {
2054 return -1;
2055 }
2056
2057 /* for each query */
2058 for (i = 0;i < num_queries;i++) {
2059 HSPChain* chain = saved[i];
2060 Int4 num_chains = 0;
2061 Int4 k;
2062
2063 if (!chain) {
2064 continue;
2065 }
2066
2067 /* create an array of pointer to chains */
2068 for (; chain; chain = chain->next) {
2069 if (num_chains >= array_size) {
2070 array_size *= 2;
2071 chain_array = realloc(chain_array, array_size *
2072 sizeof(HSPChain*));
2073
2074 ASSERT(chain_array);
2075 if (!chain_array) {
2076 return -1;
2077 }
2078 }
2079 chain_array[num_chains++] = chain;
2080 }
2081
2082 if (num_chains > 1) {
2083 /* sort chains */
2084 qsort(chain_array, num_chains, sizeof(HSPChain*), comp);
2085
2086 /* create the list of chains in the new order */
2087 for (k = 0;k < num_chains - 1;k++) {
2088 chain_array[k]->next = chain_array[k + 1];
2089 }
2090 chain_array[num_chains - 1]->next = NULL;
2091
2092 saved[i] = chain_array[0];
2093 ASSERT(saved[i]);
2094 }
2095 }
2096
2097 if (chain_array) {
2098 free(chain_array);
2099 }
2100
2101 return 0;
2102 }
2103
2104
2105 /* Separate HSPs that overlap on the query to different chains (most output
2106 formats do not support query overlaps) */
s_RemoveOverlaps(HSPChain * chain,const ScoringOptions * score_opts,Int4 query_len)2107 static int s_RemoveOverlaps(HSPChain* chain, const ScoringOptions* score_opts,
2108 Int4 query_len)
2109 {
2110 HSPContainer* h = NULL;
2111 HSPContainer* prev = NULL;
2112 HSPChain* tail = chain->next;
2113 HSPChain* head = chain;
2114 Boolean rescore = FALSE;
2115
2116 if (!chain) {
2117 return -1;
2118 }
2119
2120 /* current and previous HSP */
2121 h = chain->hsps->next;
2122 prev = chain->hsps;
2123
2124 for (; h; h = h->next) {
2125
2126 /* if HSPs overlap on the query */
2127 if (prev->hsp->query.end > h->hsp->query.offset) {
2128 /* do a flat copy of the chain */
2129 HSPChain* new_chain = HSPChainNew(chain->context);
2130 memcpy(new_chain, chain, sizeof(HSPChain));
2131 new_chain->pair = NULL;
2132 new_chain->next = NULL;
2133
2134 /* new chain starts with h */
2135 new_chain->hsps = h;
2136
2137 /* the original chain ends on the previous HSP */
2138 prev->next = NULL;
2139
2140 /* add new chain to the list */
2141 head->next = new_chain;
2142 head = new_chain;
2143
2144 rescore = TRUE;
2145 }
2146 prev = h;
2147 }
2148
2149 /* compute chains scores */
2150 if (rescore) {
2151 for (; chain; chain = chain->next) {
2152 chain->score = s_ComputeChainScore(chain, score_opts, query_len,
2153 FALSE);
2154 }
2155 }
2156
2157 /* re-attach other chains to the list */
2158 head->next = tail;
2159
2160 return 0;
2161 }
2162
2163
2164 /* Removes chains with scores below the cutoff */
s_FilterChains(HSPChain ** chains_ptr,Int4 cutoff_score,Int4 cutoff_edit_distance)2165 static int s_FilterChains(HSPChain** chains_ptr, Int4 cutoff_score,
2166 Int4 cutoff_edit_distance)
2167 {
2168 HSPChain* chain = *chains_ptr;
2169
2170 while (chain && !s_TestCutoffs(chain, cutoff_score, cutoff_edit_distance)
2171 /*chain->score < cutoff_score*/) {
2172 HSPChain* next = chain->next;
2173 chain->next = NULL;
2174 HSPChainFree(chain);
2175 chain = next;
2176 }
2177 *chains_ptr = chain;
2178
2179 if (chain) {
2180 while (chain && chain->next) {
2181 if (!s_TestCutoffs(chain->next, cutoff_score, cutoff_edit_distance)
2182 /*chain->next->score < cutoff_score*/) {
2183 HSPChain* next = chain->next->next;
2184 chain->next->next = NULL;
2185 HSPChainFree(chain->next);
2186 chain->next = next;
2187 }
2188 else {
2189 chain = chain->next;
2190 }
2191 }
2192 }
2193
2194 return 0;
2195 }
2196
2197
2198 /* Populate HSP data and save final results */
s_Finalize(HSPChain ** saved,BlastMappingResults * results,const BlastQueryInfo * query_info,const BLAST_SequenceBlk * query_blk,const ScoringOptions * score_opts,Boolean is_paired,Int4 cutoff_score,Int4 cutoff_edit_dist)2199 static int s_Finalize(HSPChain** saved, BlastMappingResults* results,
2200 const BlastQueryInfo* query_info,
2201 const BLAST_SequenceBlk* query_blk,
2202 const ScoringOptions* score_opts,
2203 Boolean is_paired,
2204 Int4 cutoff_score,
2205 Int4 cutoff_edit_dist)
2206 {
2207 Int4 query_idx;
2208 const Int4 kPairBonus = 21;
2209
2210 /* remove poor scoring chains and find chain multiplicity */
2211 if (!getenv("MAPPER_NO_PRUNNING")) {
2212 s_PruneChains(saved, query_info->num_queries, kPairBonus);
2213 }
2214
2215 /* find adapters in query sequences */
2216 s_FindAdapters(saved, query_blk, query_info, score_opts);
2217
2218 /* find polyA tails in query sequences */
2219 s_FindPolyATails(saved, query_blk, query_info);
2220
2221 /* find pairs of chains aligned to different subjects */
2222 s_FindRearrangedPairs(saved, query_info);
2223
2224 /* sort chains by subject oid to ensure stable results for multithreaded
2225 runs; sorting must be done here so that compartment numbers are stable */
2226 /* FIXME: this may not be needed */
2227 s_SortChains(saved, query_info->num_queries, s_CompareChainsByOid);
2228
2229
2230 /* separate HSPs that overlap on the query into different chains */
2231 if (getenv("MAPPER_NO_OVERLAPPED_HSP_MERGE")) {
2232 for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2233 HSPChain* chain = saved[query_idx];
2234 Int4 query_len;
2235
2236 if (!chain) {
2237 continue;
2238 }
2239
2240 query_len = query_info->contexts[chain->context].query_length;
2241 for (; chain; chain = chain->next) {
2242 s_RemoveOverlaps(chain, score_opts, query_len);
2243 }
2244
2245 }
2246 }
2247
2248
2249 /* remove chains with score below cutoff */
2250 for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2251 s_FilterChains(&saved[query_idx], cutoff_score, cutoff_edit_dist);
2252 }
2253
2254 /* Compute number of results to store, and number of unique mappings for
2255 each query (some chains may be copied to create pairs) */
2256
2257 /* count number of unique mappings */
2258 for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2259 HSPChain* chain = saved[query_idx];
2260 HSPChain* prev = chain;
2261 Int4 num_unique = 1;
2262
2263 if (!chain) {
2264 continue;
2265 }
2266
2267 chain = chain->next;
2268 for (; chain; chain = chain->next, prev = prev->next) {
2269 if (prev->oid != chain->oid ||
2270 s_FindFragmentStart(prev) != s_FindFragmentStart(chain)) {
2271
2272 num_unique++;
2273 }
2274 }
2275
2276 /* FIXME: for tabular output check count computed earlier */
2277 for (chain = saved[query_idx]; chain; chain = chain->next) {
2278 chain->count = num_unique;
2279 }
2280 }
2281
2282 #if _DEBUG
2283 for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2284 HSPChain* chain = saved[query_idx];
2285 for (; chain; chain = chain->next) {
2286 HSPContainer* h = chain->hsps;
2287 for (; h; h = h->next) {
2288 s_TestHSPRanges(h->hsp);
2289 }
2290 }
2291
2292 }
2293 #endif
2294
2295 results->chain_array = saved;
2296 results->num_queries = query_info->num_queries;
2297
2298 return 0;
2299 }
2300
2301 /** Perform post-run clean-ups
2302 * @param data The buffered data structure [in]
2303 * @param results The HSP results to propagate [in][out]
2304 */
2305 static int
s_BlastHSPMapperFinal(void * data,void * mapping_results)2306 s_BlastHSPMapperFinal(void* data, void* mapping_results)
2307 {
2308 BlastHSPMapperData* spl_data = data;
2309 BlastHSPMapperParams* params = spl_data->params;
2310 BlastMappingResults* results = (BlastMappingResults*)mapping_results;
2311
2312 ASSERT(spl_data->saved_chains);
2313
2314 if (spl_data->saved_chains != NULL) {
2315 s_Finalize(spl_data->saved_chains, results, spl_data->query_info,
2316 spl_data->query, ¶ms->scoring_options, params->paired,
2317 params->cutoff_score, params->cutoff_edit_dist);
2318 }
2319
2320 spl_data->saved_chains = NULL;
2321
2322 return 0;
2323 }
2324
2325
2326 /* A node representing an HSP for finding the best chain of HSPs for RNA-seq */
2327 typedef struct HSPNode {
2328 BlastHSP** hsp; /* pointer to the entry in hsp_list */
2329 Int4 best_score; /* score for path that starts at this node */
2330 struct HSPNode* path_next; /* next node in the path */
2331 } HSPNode;
2332
2333
2334 /* Copy a array of HSP nodes */
s_HSPNodeArrayCopy(HSPNode * dest,HSPNode * source,Int4 num)2335 static int s_HSPNodeArrayCopy(HSPNode* dest, HSPNode* source, Int4 num)
2336 {
2337 int i;
2338
2339 if (!dest || !source) {
2340 return -1;
2341 }
2342
2343 for (i = 0;i < num;i++) {
2344 dest[i] = source[i];
2345 if (source[i].path_next) {
2346 dest[i].path_next = dest + (source[i].path_next - source);
2347 }
2348 }
2349
2350 return 0;
2351 }
2352
2353
2354 /* Maximum number of top scoring HSP chains to report for a single read and
2355 strand */
2356 #define MAX_NUM_HSP_PATHS 40
2357
2358 /* A chain of HSPs aligning a spliced RNA-Seq read to a genome */
2359 typedef struct HSPPath {
2360 HSPNode** start; /* array of chain starting nodes */
2361 int num_paths; /* number of chains */
2362 Int4 score; /* all chains in start have this cumulative score */
2363 } HSPPath;
2364
2365
HSPPathFree(HSPPath * path)2366 static HSPPath* HSPPathFree(HSPPath* path)
2367 {
2368 if (path) {
2369 if (path->start) {
2370 free(path->start);
2371 }
2372 free(path);
2373 }
2374
2375 return NULL;
2376 }
2377
HSPPathNew(void)2378 static HSPPath* HSPPathNew(void)
2379 {
2380 HSPPath* retval = calloc(1, sizeof(HSPPath));
2381 if (!retval) {
2382 return NULL;
2383 }
2384
2385 retval->start = calloc(MAX_NUM_HSP_PATHS, sizeof(HSPNode*));
2386 if (!retval->start) {
2387 free(retval);
2388 return NULL;
2389 }
2390
2391 return retval;
2392 }
2393
2394
2395 #define NUM_SIGNALS 23
2396 #define NUM_SIGNALS_CONSENSUS 2
2397
2398 /* Find a split for HSPs overlapping on the query by finding splice signals in
2399 the subject sequence. The first HSP must have smaller query offset than the
2400 second one. Subject is recreated from the query sequence and information
2401 in an HSP.
2402 @param first HSP with smaller query offset [in|out]
2403 @param second HSP with larger query offset [in|out]
2404 @param query Query sequence [in]
2405 @return Status
2406 */
2407 static Int4
s_FindSpliceJunctionsForOverlaps(BlastHSP * first,BlastHSP * second,Uint1 * query,Int4 query_len,Boolean consensus_only)2408 s_FindSpliceJunctionsForOverlaps(BlastHSP* first, BlastHSP* second,
2409 Uint1* query, Int4 query_len,
2410 Boolean consensus_only)
2411 {
2412 Int4 i, k;
2413 /* splice signal pairs from include/algo/sequence/consensus_splice.hpp */
2414 Uint1 signals[NUM_SIGNALS] = {0xb2, /* GTAG */
2415 0x71, /* CTAC (reverse complement) */
2416 0x92, /* GCAG */
2417 0x79, /* CTGC */
2418 0x31, /* ATAC */
2419 0xb3, /* GTAT */
2420 0xbe, /* GTTG */
2421 0x41, /* CAAC */
2422 0xba, /* GTGG */
2423 0x51, /* CCAC */
2424 0xb0, /* GTAA */
2425 0xf1, /* TTAC */
2426 0x82, /* GAAG */
2427 0x7d, /* CTTC */
2428 0xf2, /* TTAG */
2429 0x70, /* CTAA */
2430 0x32, /* ATAG */
2431 0x73, /* CTAT */
2432 0xa2, /* GGAG */
2433 0x75, /* CTCC */
2434 0x33, /* ATAT (revese complemented is self) */
2435 0x30, /* ATAA */
2436 0xf3 /* TTAT */};
2437
2438
2439 Boolean found = FALSE;
2440 Uint1* subject = NULL;
2441 Int4 overlap_len;
2442 const Uint1 kGap = 15;
2443 Int4 num_signals = consensus_only ? NUM_SIGNALS_CONSENSUS : NUM_SIGNALS;
2444
2445 /* HSPs must be sorted and must overlap only on the query */
2446 ASSERT(first->query.offset < second->query.offset);
2447 ASSERT(second->query.offset <= first->query.end);
2448 ASSERT(first->subject.offset < second->subject.offset);
2449
2450
2451 /* if there are edits within the overlap region, then try to maximize
2452 the chain score by assigning a part of the overlap region to the
2453 HSP that has fewer edits in the overlap */
2454 if ((first->map_info->edits->num_edits > 0 &&
2455 first->map_info->edits->edits[
2456 first->map_info->edits->num_edits - 1].query_pos >=
2457 second->query.offset)
2458 || (second->map_info->edits->num_edits > 0 &&
2459 second->map_info->edits->edits[0].query_pos <= first->query.end)) {
2460
2461 /* find nummer of edits in both HSPs within the overlap and assign
2462 overlap to the HSP that has fewer edits */
2463 Int4 num_first = 0;
2464 Int4 num_second = 0;
2465
2466 /* find number of edits */
2467 for (i = first->map_info->edits->num_edits - 1;i >= 0;i--) {
2468 JumperEdit* edits = first->map_info->edits->edits;
2469 if (edits[i].query_pos < second->query.offset) {
2470 break;
2471 }
2472 num_first++;
2473 }
2474
2475 for (i = 0;i < second->map_info->edits->num_edits;i++) {
2476 JumperEdit* edits = second->map_info->edits->edits;
2477 if (edits[i].query_pos >= first->query.end) {
2478 break;
2479 }
2480 num_second++;
2481 }
2482
2483 if (num_first > num_second) {
2484 while (first->map_info->edits->num_edits > 0 &&
2485 first->map_info->edits->edits[
2486 first->map_info->edits->num_edits - 1].query_pos >=
2487 second->query.offset) {
2488
2489 JumperEdit* edits = first->map_info->edits->edits;
2490 Uint1 edge = first->map_info->right_edge;
2491 Int4 num_edits = first->map_info->edits->num_edits;
2492 Int4 trim_by;
2493
2494 if (edits[num_edits - 1].query_pos >= first->query.end - 1) {
2495 if (edits[num_edits - 1].subject_base != kGap) {
2496 edge >>= 2;
2497 edge |= edits[num_edits - 1].subject_base << 2;
2498 }
2499 }
2500 else if (edits[num_edits - 1].query_pos == query_len - 2 &&
2501 edits[num_edits - 1].subject_base == kGap) {
2502
2503 edge = (edge << 2) | query[query_len - 1];
2504 }
2505 else {
2506 if (edits[num_edits - 1].subject_base != kGap &&
2507 edits[num_edits - 1].query_base != kGap) {
2508
2509 edge = (edits[num_edits - 1].subject_base << 2) |
2510 query[edits[num_edits - 1].query_pos + 1];
2511 }
2512 else if (edits[num_edits - 1].subject_base == kGap) {
2513 edge = (query[edits[num_edits - 1].query_pos + 1] << 2 ) |
2514 query[edits[num_edits - 1].query_pos + 2];
2515 }
2516 else {
2517 edge = (edits[num_edits - 1].subject_base << 2) |
2518 query[edits[num_edits - 1].query_pos];
2519 }
2520 }
2521
2522 trim_by = first->query.end - edits[
2523 first->map_info->edits->num_edits - 1].query_pos;
2524 /* if the edit is an insert in the genome, count genome bases
2525 when trimming the HSP */
2526 if (edits[num_edits - 1].query_base == kGap) {
2527 trim_by++;
2528 /* FIXME: use proper scores here */
2529 s_TrimHSP(first, trim_by, FALSE, FALSE, -4, -4, -4, query);
2530 }
2531 else {
2532 s_TrimHSP(first, trim_by, TRUE, FALSE, -4, -4, -4, query);
2533 }
2534 first->map_info->right_edge = edge;
2535 }
2536 }
2537 else if (num_second > num_first) {
2538 while (second->map_info->edits->num_edits > 0 &&
2539 second->map_info->edits->edits[0].query_pos <
2540 first->query.end) {
2541
2542 JumperEdit* edits = second->map_info->edits->edits;
2543 Uint1 edge = second->map_info->left_edge;
2544 Int4 trim_by;
2545
2546 if (edits[0].query_pos == 0) {
2547 if (edits[0].subject_base != kGap) {
2548 edge <<= 2;
2549 edge |= edits[0].subject_base;
2550 }
2551 }
2552 else if (edits[0].query_pos == 1 &&
2553 edits[0].subject_base == kGap) {
2554
2555 edge = (edge << 2) | query[0];
2556 }
2557 else {
2558
2559 edge = (query[edits[0].query_pos - 1] << 2) |
2560 edits[0].subject_base;
2561
2562 if (edits[0].subject_base == kGap) {
2563 edge = (query[edits[0].query_pos - 2] << 2) |
2564 query[edits[0].query_pos - 1];
2565 }
2566 }
2567
2568 trim_by = edits[0].query_pos - second->query.offset + 1;
2569 /* if the edit is an insert in the genome, count genome bases
2570 when trimming the HSP */
2571 if (edits[0].query_base == kGap) {
2572 trim_by++;
2573 /* FIXME: use proper scores here */
2574 s_TrimHSP(second,
2575 edits[0].query_pos - second->query.offset + 1,
2576 FALSE, TRUE, -4, -4, -4, query);
2577 }
2578 else {
2579 s_TrimHSP(second,
2580 edits[0].query_pos - second->query.offset + 1,
2581 TRUE, TRUE, -4, -4, -4, query);
2582 }
2583 second->map_info->left_edge = edge;
2584 }
2585 }
2586 else {
2587 /* if the number of edits is the same we do not know how to
2588 divide the overlap */
2589
2590 first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
2591 second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
2592 return 0;
2593 }
2594 }
2595
2596 /* give up if there are gaps in the alignments or mismatches still left in
2597 the overlapping parts */
2598 if ((first->map_info->edits->num_edits > 0 &&
2599 first->map_info->edits->edits[
2600 first->map_info->edits->num_edits - 1].query_pos >=
2601 second->query.offset)
2602 || (second->map_info->edits->num_edits > 0 &&
2603 second->map_info->edits->edits[0].query_pos <= first->query.end)) {
2604
2605 first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
2606 second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
2607 return 0;
2608 }
2609
2610 /* recreate the subject sequence plus two bases on each side */
2611 overlap_len = first->query.end - second->query.offset;
2612 subject = calloc(overlap_len + 4, sizeof(Uint1));
2613 if (!subject) {
2614 return -1;
2615 }
2616
2617 /* FIXME: lef_edge and right_edge should be changed to subject_overhangs */
2618 subject[0] = (second->map_info->left_edge & 0xf) >> 2;
2619 subject[1] = second->map_info->left_edge & 3;
2620 memcpy(subject + 2, query + second->query.offset,
2621 overlap_len * sizeof(Uint1));
2622
2623 subject[2 + overlap_len] = (first->map_info->right_edge & 0xf) >> 2;
2624 subject[2 + overlap_len + 1] = first->map_info->right_edge & 3;
2625
2626 /* search for matching splice signals in the subject sequence */
2627 for (k = 0;k < num_signals;k++) {
2628
2629 for (i = 0;i <= overlap_len && !found;i++) {
2630
2631 Uint1 seq = (subject[i] << 2) | subject[i + 1] |
2632 (subject[i + 2] << 6) | (subject[i + 3] << 4);
2633
2634 /* if splice signals are found, update HSPs */
2635 if (seq == signals[k]) {
2636 Int4 d = first->query.end - (second->query.offset + i);
2637 first->query.end -= d;
2638 first->subject.end -= d;
2639 first->gap_info->num[first->gap_info->size - 1] -= d;
2640 first->score -= d;
2641 first->num_ident -= d;
2642 first->map_info->right_edge = (seq & 0xf0) >> 4;
2643 first->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
2644
2645 d = i;
2646 second->query.offset += d;
2647 second->subject.offset += d;
2648 second->gap_info->num[0] -= d;
2649 second->score -= d;
2650 second->num_ident -= d;
2651 second->map_info->left_edge = seq & 0xf;
2652 second->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
2653
2654 ASSERT(first->gap_info->size > 1 ||
2655 first->query.end - first->query.offset ==
2656 first->subject.end - first->subject.offset);
2657
2658 ASSERT(second->gap_info->size > 1 ||
2659 second->query.end - second->query.offset ==
2660 second->subject.end - second->subject.offset);
2661
2662 found = TRUE;
2663 break;
2664 }
2665 }
2666 }
2667
2668 if (!found) {
2669 first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
2670 second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
2671 }
2672
2673 if (subject) {
2674 sfree(subject);
2675 }
2676
2677 ASSERT(first->query.offset < first->query.end);
2678 ASSERT(second->query.offset < second->query.end);
2679
2680 return 0;
2681 }
2682
s_ExtendAlignmentCleanup(Uint1 * subject,BlastGapAlignStruct * gap_align,GapEditScript * edit_script,JumperEditsBlock * edits)2683 static void s_ExtendAlignmentCleanup(Uint1* subject,
2684 BlastGapAlignStruct* gap_align,
2685 GapEditScript* edit_script,
2686 JumperEditsBlock* edits)
2687 {
2688 if (subject) {
2689 free(subject);
2690 }
2691
2692 BLAST_GapAlignStructFree(gap_align);
2693 GapEditScriptDelete(edit_script);
2694 JumperEditsBlockFree(edits);
2695 }
2696
2697 /* Extend alignment on one side of an HSP up to a given point (a splice site)
2698 */
s_ExtendAlignment(BlastHSP * hsp,const Uint1 * query,Int4 query_from,Int4 query_to,Int4 subject_from,Int4 subject_to,const ScoringOptions * score_options,Boolean is_left)2699 static Int4 s_ExtendAlignment(BlastHSP* hsp, const Uint1* query,
2700 Int4 query_from, Int4 query_to,
2701 Int4 subject_from, Int4 subject_to,
2702 const ScoringOptions* score_options,
2703 Boolean is_left)
2704 {
2705 Int4 o_len;
2706 Uint1* o_seq = NULL;
2707 Uint1* subject = NULL;
2708 BlastGapAlignStruct* gap_align = NULL;
2709 GapEditScript* edit_script = NULL;
2710 JumperEditsBlock* edits = NULL;
2711 Int4 i, k, s;
2712 /* number of query bases to align */
2713 Int4 query_gap = query_to - query_from + 1;
2714 /* number of subject bases to align */
2715 Int4 subject_gap = subject_to - subject_from + 1;
2716 Int4 num_identical;
2717 Int4 query_ext_len, subject_ext_len;
2718 Int4 ungapped_ext_len;
2719
2720 JUMP* jumper_table = NULL;
2721
2722 JUMP jumper_mismatch[] = {{1, 1, 0, 0}};
2723
2724 JUMP jumper_insertion[] = {{1, 1, 2, 0},
2725 {1, 0, 1, 0},
2726 {1, 1, 0, 0}};
2727
2728 JUMP jumper_deletion[] = {{1, 1, 2, 0},
2729 {0, 1, 1, 0},
2730 {1, 1, 0, 0}};
2731
2732 if (!hsp || !query || !hsp->map_info ||
2733 !hsp->map_info->subject_overhangs) {
2734 return -1;
2735 }
2736
2737 /* length of the subject overgang sequence saved in the HSP */
2738 o_len = is_left ? hsp->map_info->subject_overhangs->left_len :
2739 hsp->map_info->subject_overhangs->right_len;
2740 /* subject sequence */
2741 o_seq = is_left ? hsp->map_info->subject_overhangs->left :
2742 hsp->map_info->subject_overhangs->right;
2743 ASSERT(o_seq);
2744
2745 /* compressed subject sequence (needed for computig jumper edits) */
2746 subject = calloc(o_len / 4 + 1, sizeof(Uint1));
2747 if (!subject) {
2748 return -1;
2749 }
2750
2751 /* compress subject sequence */
2752 k = 6;
2753 s = 0;
2754 for (i = 0;i < o_len;i++, k-=2) {
2755 subject[s] |= o_seq[i] << k;
2756 if (k == 0) {
2757 s++;
2758 k = 8;
2759 }
2760 }
2761
2762 /* gap_align is needed in jumper alignment API calls, we only allocate
2763 fields that will be needed in these calls */
2764 gap_align = calloc(1, sizeof(BlastGapAlignStruct));
2765 if (!gap_align) {
2766 s_ExtendAlignmentCleanup(subject, NULL, edit_script, edits);
2767 return -1;
2768 }
2769 gap_align->jumper = JumperGapAlignNew(o_len * 2);
2770 if (!gap_align->jumper) {
2771 s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2772 return -1;
2773 }
2774
2775 /* select jumper table based on the number of expected indels; we allow
2776 only insertions or only deletions */
2777 switch (SIGN(query_gap - subject_gap)) {
2778 case 0: jumper_table = jumper_mismatch;
2779 break;
2780
2781 case -1: jumper_table = jumper_deletion;
2782 break;
2783
2784 case 1: jumper_table = jumper_insertion;
2785 break;
2786
2787 default:
2788 ASSERT(0);
2789 s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2790 return -1;
2791 };
2792
2793 /* Compute alignment. We always do right extension as this is typically
2794 a short alignment. This is global alignment, hence the large max number
2795 of errors and no penalties for mismatches or gaps. The jumper table
2796 specifies costs for alignment errors. The penalty socres specify
2797 required number of macthes at the ends of the alignment. */
2798 JumperExtendRightWithTraceback(query + query_from, o_seq + subject_from,
2799 query_gap, subject_gap,
2800 1, 0, 0, 0,
2801 20, 20,
2802 &query_ext_len, &subject_ext_len,
2803 gap_align->jumper->right_prelim_block,
2804 &num_identical,
2805 FALSE,
2806 &ungapped_ext_len,
2807 jumper_table);
2808
2809 ASSERT(query_ext_len <= query_gap);
2810 ASSERT(subject_ext_len <= subject_gap);
2811
2812 /* Because this is a global alignment a gap may exist after the last query
2813 or subject base which causes jumper extension to stop prematurely. If
2814 this is the case, add the missing indel */
2815 while (query_ext_len < query_gap) {
2816 JumperPrelimEditBlockAdd(gap_align->jumper->right_prelim_block,
2817 JUMPER_INSERTION);
2818
2819 query_ext_len++;
2820 }
2821
2822 while (subject_ext_len < subject_gap) {
2823 JumperPrelimEditBlockAdd(gap_align->jumper->right_prelim_block,
2824 JUMPER_DELETION);
2825
2826 subject_ext_len++;
2827 }
2828 ASSERT(query_ext_len == query_gap);
2829 ASSERT(subject_ext_len == subject_gap);
2830
2831 /* update gap info in the HSP */
2832 edit_script = JumperPrelimEditBlockToGapEditScript(
2833 gap_align->jumper->left_prelim_block,
2834 gap_align->jumper->right_prelim_block);
2835 if (!edit_script) {
2836 s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2837 return -1;
2838 }
2839
2840 if (is_left) {
2841 hsp->gap_info = GapEditScriptCombine(&edit_script, &hsp->gap_info);
2842 }
2843 else {
2844 hsp->gap_info = GapEditScriptCombine(&hsp->gap_info, &edit_script);
2845 ASSERT(!edit_script);
2846 }
2847 ASSERT(hsp->gap_info);
2848 if (!hsp->gap_info) {
2849 s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2850 return -1;
2851 }
2852
2853 /* update jumper edits */
2854 gap_align->query_start = query_from;
2855 gap_align->query_stop = query_from + query_ext_len;
2856 gap_align->subject_start = subject_from;
2857 gap_align->subject_stop = subject_from + subject_ext_len;
2858 edits = JumperFindEdits(query, subject, gap_align);
2859 ASSERT(edits);
2860 if (!edits) {
2861 s_ExtendAlignmentCleanup(subject, gap_align, NULL, edits);
2862 return -1;
2863 }
2864
2865 if (is_left) {
2866 hsp->map_info->edits = JumperEditsBlockCombine(&edits,
2867 &hsp->map_info->edits);
2868 }
2869 else {
2870 hsp->map_info->edits = JumperEditsBlockCombine(
2871 &hsp->map_info->edits,
2872 &edits);
2873 ASSERT(!edits);
2874 }
2875 ASSERT(hsp->map_info->edits);
2876 if (!hsp->map_info->edits) {
2877 s_ExtendAlignmentCleanup(subject, gap_align, NULL, edits);
2878 return -1;
2879 }
2880
2881 if (is_left) {
2882 hsp->query.offset -= query_ext_len;
2883 hsp->subject.offset -= subject_ext_len;
2884 }
2885 else {
2886 hsp->query.end += query_ext_len;
2887 hsp->subject.end += subject_ext_len;
2888 }
2889
2890 /* compute new HSP score */
2891 hsp->score = s_ComputeAlignmentScore(hsp, score_options->penalty,
2892 score_options->gap_open,
2893 score_options->gap_extend);
2894
2895 ASSERT(hsp->gap_info);
2896 ASSERT(hsp->map_info->edits);
2897
2898 ASSERT(subject);
2899
2900
2901 if (subject) {
2902 free(subject);
2903 }
2904 BLAST_GapAlignStructFree(gap_align);
2905
2906 return 0;
2907 }
2908
2909
2910 /* Find splcie junctions and extend HSP alignment if there are unaligned query
2911 bases in the middle of a chain.
2912 */
2913 static Int4
s_FindSpliceJunctionsForGap(BlastHSP * first,BlastHSP * second,Uint1 * query,Int4 query_len,const ScoringOptions * score_options)2914 s_FindSpliceJunctionsForGap(BlastHSP* first, BlastHSP* second,
2915 Uint1* query, Int4 query_len,
2916 const ScoringOptions* score_options)
2917 {
2918 Int4 query_gap;
2919 Int4 first_len, second_len;
2920 Boolean found = FALSE;
2921 Int4 k;
2922 Int4 q;
2923 Uint1 signal = 0;
2924 /* use only cannonical splice signals here */
2925 /* The procedure does not maximize score for unaligned bases of the query.
2926 It only finds splice signals and then the best alignment it can find
2927 for the unaligned bases. For limited number of unaligned query bases,
2928 we can be sure of the splice site vs a continuous alignment, because a
2929 continuous alignment would score better. But we do not compare
2930 alignment scores between different splice sites, so we do not know
2931 which splice site and signal is better; and many splice signals here
2932 would lead to finding wrong splice sites. */
2933 Uint1 signals[NUM_SIGNALS_CONSENSUS] = {0xb2, /* GTAG */
2934 0x71 /* CTAC */};
2935
2936 if (!first || !second) {
2937 return -1;
2938 }
2939
2940 if (!first->map_info->subject_overhangs ||
2941 !first->map_info->subject_overhangs->right ||
2942 !second->map_info->subject_overhangs ||
2943 !second->map_info->subject_overhangs->left) {
2944 return -1;
2945 }
2946
2947 /* number of query bases that fall between the HSPs */
2948 query_gap = second->query.offset - first->query.end;
2949
2950 /* we do not have enough subject sequence saved (-1 because we allow up to
2951 one indel) */
2952 if (query_gap > first->map_info->subject_overhangs->right_len - 2 ||
2953 query_gap > second->map_info->subject_overhangs->left_len - 2) {
2954 return 0;
2955 }
2956
2957 first_len = first->map_info->subject_overhangs->right_len;
2958 second_len = second->map_info->subject_overhangs->left_len;
2959
2960 /* assume that the begining of intron was found, search for the splice
2961 signal at the end of the first HSP */
2962 for (q = 0; !found && q < 4;q++) {
2963
2964 if (first->query.end - q <= first->query.offset) {
2965 break;
2966 }
2967
2968 for (k = 0;k < NUM_SIGNALS_CONSENSUS && !found;k++) {
2969 Int4 i;
2970 Int4 status = 0;
2971 Int4 start;
2972 Uint1 seq;
2973
2974 if (q == 0) {
2975 seq = (first->map_info->subject_overhangs->right[0] << 6) |
2976 (first->map_info->subject_overhangs->right[1] << 4);
2977 }
2978 else if (q == 1) {
2979 seq = (query[first->query.end - 1] << 6) |
2980 (first->map_info->subject_overhangs->right[0] << 4);
2981 }
2982 else if (q > 1) {
2983 seq = (query[first->query.end - q] << 6) |
2984 (query[first->query.end - q + 1] << 4);
2985 }
2986 else {
2987 return -1;
2988 }
2989
2990 if (seq != (signals[k] & 0xf0)) {
2991 continue;
2992 }
2993
2994 /* location of the splice signa in subject overhang if there are no
2995 indels */
2996 start = second->map_info->subject_overhangs->left_len - 1 -
2997 query_gap - 2 - q;
2998
2999 /* search for the splice signal at the end of intron;
3000 allow for up to 1 indel */
3001 for (i = MAX(start - 1, 0);i <= MIN(start + 1, second_len - 2);i++) {
3002 seq &= 0xf0;
3003 seq |= (second->map_info->subject_overhangs->left[i] << 2) |
3004 second->map_info->subject_overhangs->left[i + 1];
3005
3006 /* if the splice signal found extend the alignment */
3007 if (seq == signals[k]) {
3008 Int4 subject_gap = second_len - (i + 2) + q;
3009 if (query_gap - subject_gap < -1 ||
3010 query_gap - subject_gap > 1) {
3011
3012 continue;
3013 }
3014
3015 found = TRUE;
3016 if (q > 0) {
3017 s_TrimHSP(first, q, TRUE, FALSE,
3018 score_options->penalty,
3019 score_options->gap_open,
3020 score_options->gap_extend,
3021 query);
3022 }
3023
3024 status = s_ExtendAlignment(second, query,
3025 first->query.end,
3026 second->query.offset - 1,
3027 i + 2,
3028 second->map_info->subject_overhangs->left_len - 1,
3029 score_options,
3030 TRUE);
3031
3032 ASSERT(status == 0);
3033 if (status) {
3034 return status;
3035 }
3036
3037 signal = seq;
3038 break;
3039 }
3040 }
3041 }
3042 }
3043
3044 /* assume that end of the intron was found, search for the splice signal
3045 at the beginning of the intron */
3046 for (q = 0; !found && q < 4;q++) {
3047
3048 if (second->query.offset + q >= second->query.end) {
3049 break;
3050 }
3051
3052 for (k = 0;k < NUM_SIGNALS_CONSENSUS && !found;k++) {
3053 Int4 i;
3054 Int4 end;
3055 Uint1 seq;
3056 if (q == 0) {
3057 seq =
3058 (second->map_info->subject_overhangs->left[second_len - 2] << 2) |
3059 second->map_info->subject_overhangs->left[second_len - 1];
3060 }
3061 else if (q == 1) {
3062 seq = (second->map_info->subject_overhangs->left[second_len - 1] << 2) | query[second->query.offset];
3063
3064
3065 }
3066 else if (q > 1) {
3067
3068 if (second->map_info->edits->num_edits > 0 &&
3069 second->map_info->edits->edits[0].query_pos < q - 2) {
3070
3071 break;
3072 }
3073
3074 seq = (query[second->query.offset + q - 2] << 2) |
3075 query[second->query.offset + q + 1 - 2];
3076 }
3077 else {
3078 return -1;
3079 }
3080
3081 if (seq != (signals[k] & 0xf)) {
3082 continue;
3083 }
3084
3085 /* location of the splice signal in the subject overhang if there are
3086 no indels */
3087 end = query_gap + q;
3088
3089 /* allow for up to 1 indel */
3090 for (i = MAX(end - 1, 0);i <= MIN(end + 1, first_len - 2);i++) {
3091 seq &= 0xf;
3092 seq |= (first->map_info->subject_overhangs->right[i] << 6) |
3093 (first->map_info->subject_overhangs->right[i + 1] << 4);
3094
3095 if (seq == signals[k]) {
3096 Int4 status = 0;
3097 Int4 subject_gap = i - q;
3098 if (query_gap - subject_gap < -1 ||
3099 query_gap - subject_gap > 1) {
3100
3101 continue;
3102 }
3103
3104 found = TRUE;
3105 if (q > 0) {
3106 s_TrimHSP(second, q, TRUE, TRUE,
3107 score_options->penalty,
3108 score_options->gap_open,
3109 score_options->gap_extend,
3110 query);
3111 }
3112
3113 status = s_ExtendAlignment(first, query,
3114 first->query.end,
3115 second->query.offset - 1,
3116 0, i - 1,
3117 score_options,
3118 FALSE);
3119 ASSERT(status == 0);
3120 if (status) {
3121 return status;
3122 }
3123
3124 signal = seq;
3125 break;
3126 }
3127 }
3128 }
3129 }
3130
3131 if (found) {
3132 first->map_info->right_edge = (signal & 0xf0) >> 4;
3133 first->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3134 second->map_info->left_edge = signal & 0xf;
3135 second->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3136
3137 ASSERT(first->gap_info->size > 1 ||
3138 first->query.end - first->query.offset ==
3139 first->subject.end - first->subject.offset);
3140
3141 ASSERT(second->gap_info->size > 1 ||
3142 second->query.end - second->query.offset ==
3143 second->subject.end - second->subject.offset);
3144 }
3145 else {
3146 first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
3147 second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
3148 }
3149
3150 return 0;
3151 }
3152
3153
3154 #define NUM_SINGLE_SIGNALS 8
3155
3156 /* Record subject bases pre and post alignment in HSP as possible splice
3157 signals and set a flag for recognized signals */
3158 /* Not used
3159 static Int4 s_FindSpliceSignals(BlastHSP* hsp, Uint1* query, Int4 query_len)
3160 {
3161 SequenceOverhangs* overhangs = NULL;
3162 /--* splice signals in the order of preference *--/
3163 Uint1 signals[NUM_SINGLE_SIGNALS] = {2, /--* AG *--/
3164 11, /--* GT *--/
3165 13, /--* TC *--/
3166 7, /--* CT *--/
3167 1, /--* AC *--/
3168 4, /--* CA *--/
3169 8, /--* GA *--/
3170 14 /--* TG *--/};
3171
3172 if (!hsp || !query) {
3173 return -1;
3174 }
3175
3176 overhangs = hsp->map_info->subject_overhangs;
3177 ASSERT(overhangs);
3178
3179 if (!overhangs) {
3180 return -1;
3181 }
3182
3183 /--* FIXME: check for polyA and adapters *--/
3184
3185 /--* search for a splice signal on the left edge, unless the query is aligned
3186 from the beginning *--/
3187 if (hsp->query.offset == 0) {
3188 hsp->map_info->left_edge = MAPPER_EXON;
3189 }
3190 else {
3191 int k, i;
3192 Boolean found = FALSE;
3193
3194 if (overhangs->left_len >= 2) {
3195 Uint1 signal = (overhangs->left[overhangs->left_len - 2] << 2) |
3196 overhangs->left[overhangs->left_len - 1];
3197
3198 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3199 if (signal == signals[k]) {
3200 hsp->map_info->left_edge = signal;
3201 hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3202
3203 found = TRUE;
3204 break;
3205 }
3206 }
3207 }
3208
3209 if (!found && overhangs->left_len >= 1) {
3210 Uint1 signal = (overhangs->left[overhangs->left_len - 1] << 2) |
3211 query[hsp->query.offset];
3212
3213 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3214 if (signal == signals[k]) {
3215 hsp->map_info->left_edge = signal;
3216 hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3217
3218 /--* trim alignment *--/
3219 s_TrimHSP(hsp, 1, TRUE, TRUE, -8, 0, -8);
3220
3221 /--* update score at some point *--/
3222 found = TRUE;
3223 break;
3224 }
3225 }
3226 }
3227
3228 for (i = hsp->query.offset;!found && i < 4;i++) {
3229 Uint1 signal = (query[i] << 2) | query[i + 1];
3230 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3231 if (signal == signals[k]) {
3232 hsp->map_info->left_edge = signal;
3233 hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3234
3235 /--* trim alignment *--/
3236 s_TrimHSP(hsp, i + 2, TRUE, TRUE, -8, 0, -8);
3237
3238 found = TRUE;
3239 break;
3240 }
3241 }
3242 }
3243 }
3244
3245 /--* search for the splice signal on the right edge, unless the query is
3246 aligned to the end *--/
3247 if (hsp->query.end == query_len) {
3248 hsp->map_info->right_edge = MAPPER_EXON;
3249 }
3250 else {
3251 int k, i;
3252 Boolean found = FALSE;
3253
3254 if (overhangs->right_len >= 2) {
3255 Uint1 signal = (overhangs->right[0] << 2) | overhangs->right[1];
3256 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3257 if (signal == signals[k]) {
3258 hsp->map_info->right_edge = signal;
3259 hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3260
3261 found = TRUE;
3262 break;
3263 }
3264 }
3265
3266 }
3267
3268 if (!found && overhangs->right_len >= 1) {
3269 Uint1 signal = (query[hsp->query.end - 1] << 2) |
3270 overhangs->right[0];
3271
3272 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3273 if (signal == signals[k]) {
3274 hsp->map_info->right_edge = signal;
3275 hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3276
3277 /--* update HSP *--/
3278 s_TrimHSP(hsp, 1, TRUE, FALSE, -8, 0, -8);
3279
3280 /--* update score at some point *--/
3281 found = TRUE;
3282 break;
3283 }
3284 }
3285
3286 }
3287
3288 for (i = hsp->query.end - 2;!found && i > hsp->query.end - 2 - 4;i--) {
3289 Uint1 signal = (query[i] << 2) | query[i + 1];
3290 for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3291 if (signal == signals[k]) {
3292 hsp->map_info->right_edge = signal;
3293 hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3294
3295 /--* update HSP *--/
3296 s_TrimHSP(hsp, hsp->query.end - i, TRUE, FALSE, -8, 0, -8);
3297
3298 found = TRUE;
3299 break;
3300 }
3301 }
3302 }
3303 }
3304
3305 return 0;
3306 }
3307 */
3308
3309
3310 static Int4
s_TrimOverlap(BlastHSP * first,BlastHSP * second,const Uint1 * query)3311 s_TrimOverlap(BlastHSP* first, BlastHSP* second, const Uint1* query)
3312 {
3313 if (second->query.offset - first->query.end < 0) {
3314 Int4 overlap = first->query.end - second->query.offset;
3315 ASSERT(overlap >= 0);
3316
3317 if (second->query.end - second->query.offset > overlap) {
3318
3319 s_TrimHSP(second, overlap, TRUE, TRUE, -4, -4, -4, query);
3320 }
3321 else {
3322 s_TrimHSP(first, overlap, TRUE, FALSE, -4, -4, -4, query);
3323 }
3324
3325
3326 ASSERT(first->query.end == second->query.offset);
3327
3328 #if _DEBUG
3329 s_TestHSPRanges(second);
3330 #endif
3331 }
3332
3333 if (second->subject.offset - first->subject.end < 0) {
3334 Int4 overlap = first->subject.end - second->subject.offset;
3335 ASSERT(overlap >= 0);
3336
3337 if (second->subject.end - second->subject.offset > overlap) {
3338
3339 s_TrimHSP(second, overlap, FALSE, TRUE, -4, -4, -4, query);
3340 }
3341 else {
3342 s_TrimHSP(first, overlap, FALSE, FALSE, -4, -4, -4, query);
3343 }
3344
3345 ASSERT(first->subject.end == second->subject.offset);
3346 #if _DEBUG
3347 s_TestHSPRanges(second);
3348 #endif
3349 }
3350
3351 ASSERT(first->query.end <= second->query.offset);
3352 ASSERT(first->subject.end <= second->subject.offset);
3353
3354 return 0;
3355 }
3356
3357
3358 /* Relpace an intron with a gap */
3359 static Int4
s_IntronToGap(HSPContainer * h,HSPContainer * next,const Uint1 * query,const ScoringOptions * scoring_opts)3360 s_IntronToGap(HSPContainer* h, HSPContainer* next, const Uint1* query,
3361 const ScoringOptions* scoring_opts)
3362 {
3363 BlastHSP* new_hsp = NULL;
3364 HSPContainer* following = NULL;
3365
3366 ASSERT(h);
3367 ASSERT(next);
3368 ASSERT(query);
3369 ASSERT(scoring_opts);
3370
3371 following = h->next->next;
3372 h->next->next = NULL;
3373
3374 s_TrimOverlap(h->hsp, next->hsp, query);
3375
3376 #if _DEBUG
3377 s_TestHSPRanges(h->hsp);
3378 #endif
3379
3380
3381 /* merge HSPs */
3382 new_hsp = s_MergeHSPs(h->hsp, h->next->hsp, query,
3383 scoring_opts);
3384
3385
3386 #if _DEBUG
3387 s_TestHSPRanges(new_hsp);
3388 #endif
3389
3390
3391 if (new_hsp) {
3392
3393 /* replace the two processed HSPs with the combined one */
3394 Blast_HSPFree(h->hsp);
3395 HSPContainerFree(h->next);
3396 h->hsp = new_hsp;
3397 h->next = following;
3398 }
3399 else {
3400 /* something went wrong with merging, use the initial HSPs */
3401
3402 /* remove splice signal */
3403 h->hsp->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
3404 next->hsp->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
3405
3406 h->next->next = following;
3407 h = h->next;
3408
3409 ASSERT(!h->next ||
3410 (h->hsp->query.end <= h->next->hsp->query.offset &&
3411 h->hsp->subject.end <= h->next->hsp->subject.offset));
3412 }
3413
3414 return 0;
3415 }
3416
3417
3418 /* Search for splice signals between two HSPs in a chain. The HSPs in the
3419 chain must be sorted by query position in asceding order.
3420 */
3421 static Int4
s_FindSpliceJunctions(HSPChain * chains,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * scoring_opts)3422 s_FindSpliceJunctions(HSPChain* chains,
3423 const BLAST_SequenceBlk* query_blk,
3424 const BlastQueryInfo* query_info,
3425 const ScoringOptions* scoring_opts)
3426 {
3427 HSPChain* ch = NULL;
3428
3429 if (!chains || !query_blk || !query_info || !scoring_opts) {
3430 return -1;
3431 }
3432
3433 /* iterate over HSPs in the chain */
3434 for (ch = chains; ch; ch = ch->next) {
3435 HSPContainer* h = ch->hsps;
3436 Uint1* query = NULL;
3437 Int4 context;
3438 Int4 query_len;
3439 ASSERT(h);
3440
3441 context = h->hsp->context;
3442 query = query_blk->sequence +
3443 query_info->contexts[context].query_offset;
3444 query_len = query_info->contexts[context].query_length;
3445
3446 while (h->next) {
3447 HSPContainer* next = h->next;
3448 ASSERT(next);
3449
3450 /* if not a spliced alignment, try merging HSPs into one */
3451 /* Introns are typically at least 30 bases long, and there can be
3452 a few unalined query bases. */
3453 if ((next->hsp->subject.offset - h->hsp->subject.end -
3454 (next->hsp->query.offset - h->hsp->query.end) < 30) &&
3455
3456 /* this condition is needed to align unaligned query bases */
3457 next->hsp->subject.offset - h->hsp->subject.end <
3458 h->hsp->map_info->subject_overhangs->right_len) {
3459
3460 /* save pointer to hsps after next */
3461 HSPContainer* following = h->next->next;
3462 BlastHSP* new_hsp = NULL;
3463
3464 /* duplicate HSPContainer with the two HSPs */
3465 h->next->next = NULL;
3466
3467 s_TrimOverlap(h->hsp, next->hsp, query);
3468
3469 /* extend the first HSP to cover the gap between HSPs */
3470 if (next->hsp->query.offset - h->hsp->query.end > 1) {
3471 BlastHSP* first = h->hsp;
3472 BlastHSP* second = h->next->hsp;
3473
3474 s_ExtendAlignment(first, query, first->query.end,
3475 second->query.offset - 1, 0,
3476 second->subject.offset - 1 - first->subject.end,
3477 scoring_opts, FALSE);
3478
3479 #if _DEBUG
3480 s_TestHSPRanges(first);
3481 #endif
3482 }
3483
3484
3485 /* merge HSPs */
3486 new_hsp = s_MergeHSPs(h->hsp, h->next->hsp, query,
3487 scoring_opts);
3488
3489
3490 #if _DEBUG
3491 s_TestHSPRanges(new_hsp);
3492 #endif
3493
3494
3495 if (new_hsp) {
3496
3497 /* replace the two processed HSPs with the combined one */
3498 Blast_HSPFree(h->hsp);
3499 HSPContainerFree(h->next);
3500 h->hsp = new_hsp;
3501 h->next = following;
3502 }
3503 else {
3504 /* something went wrong with merging, use the initial
3505 HSPs */
3506 h->next->next = following;
3507 h = h->next;
3508
3509 ASSERT(!h->next ||
3510 (h->hsp->query.end <= h->next->hsp->query.offset &&
3511 h->hsp->subject.end <= h->next->hsp->subject.offset));
3512 }
3513 }
3514 /* process overlap if found */
3515 else if (next->hsp->query.offset <= h->hsp->query.end &&
3516 next->hsp->query.offset > h->hsp->query.offset) {
3517
3518 Boolean consensus_only = TRUE;
3519 if (h->hsp->score > 50 && next->hsp->score > 50) {
3520 consensus_only = FALSE;
3521 }
3522
3523 s_FindSpliceJunctionsForOverlaps(h->hsp, next->hsp, query,
3524 query_len, consensus_only);
3525
3526
3527 if ((h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) == 0) {
3528
3529 s_TrimOverlap(h->hsp, next->hsp, query);
3530 }
3531
3532 /* if the intron is shorter than 30 bases, replace it with
3533 deletion and merge HSPs */
3534 if ((h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) != 0 &&
3535 next->hsp->subject.offset - h->hsp->subject.end < 30) {
3536
3537
3538 s_IntronToGap(h, next, query, scoring_opts);
3539
3540 }
3541 else {
3542
3543 #if _DEBUG
3544 s_TestHSPRanges(h->hsp);
3545 #endif
3546
3547 h = h->next;
3548 }
3549 }
3550 else if (next->hsp->query.offset - h->hsp->query.end > 0 &&
3551 next->hsp->query.offset - h->hsp->query.end <
3552 h->hsp->map_info->subject_overhangs->right_len) {
3553
3554 s_FindSpliceJunctionsForGap(h->hsp, next->hsp, query,
3555 query_len, scoring_opts);
3556
3557
3558 if ((h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) == 0) {
3559
3560 s_TrimOverlap(h->hsp, next->hsp, query);
3561 }
3562
3563
3564
3565 /* if the intron is shorter than 30 bases, replace it with
3566 deletion and merge HSPs */
3567 if ((h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) != 0 &&
3568 next->hsp->subject.offset - h->hsp->subject.end < 30) {
3569
3570
3571 s_IntronToGap(h, next, query, scoring_opts);
3572
3573 }
3574 else {
3575
3576
3577 #if _DEBUG
3578 s_TestHSPRanges(h->hsp);
3579 #endif
3580
3581 h = h->next;
3582 }
3583 }
3584
3585
3586 else {
3587
3588 s_TrimOverlap(h->hsp, next->hsp, query);
3589
3590 #if _DEBUG
3591 s_TestHSPRanges(h->hsp);
3592 #endif
3593
3594 h = h->next;
3595 }
3596
3597 }
3598
3599 /* recalculated chain score */
3600 ch->score = s_ComputeChainScore(ch, scoring_opts, query_len, TRUE);
3601 }
3602
3603 s_TestChains(chains);
3604
3605 return 0;
3606 }
3607
3608
3609 /* Find the best scoring chain of HSPs for aligning a single RNA-Seq read to
3610 a genome on a single strand, returns all top scoring chains */
s_FindBestPath(HSPNode * nodes,Int4 num,HSPPath * path,Int4 oid,Boolean is_spliced,Int4 longest_intron,Int4 cutoff_score,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * scoring_opts)3611 static HSPChain* s_FindBestPath(HSPNode* nodes, Int4 num, HSPPath* path,
3612 Int4 oid, Boolean is_spliced,
3613 Int4 longest_intron,
3614 Int4 cutoff_score,
3615 const BLAST_SequenceBlk* query_blk,
3616 const BlastQueryInfo* query_info,
3617 const ScoringOptions* scoring_opts)
3618 {
3619 int i, k;
3620 Int4 best_score = 0;
3621 const Int4 kMaxIntronLength = longest_intron;
3622 /* FIXME: use mismatch and/or gap extend penalty here */
3623 HSPChain* retval = NULL;
3624
3625 if (!path || !nodes || !num) {
3626 return NULL;
3627 }
3628 path->num_paths = 0;
3629 path->score = 0;
3630
3631 for (i = num - 1;i >= 0;i--) {
3632
3633 BlastHSP* hsp = *(nodes[i].hsp);
3634 int self_score = nodes[i].best_score;
3635 /* FIXME: the is_spliced condition is a hack */
3636 for (k = i + 1;k < num && is_spliced;k++) {
3637
3638 BlastHSP* newhsp = *(nodes[k].hsp);
3639 Int4 overlap_cost = s_GetOverlapCost(newhsp, *(nodes[i].hsp), 4);
3640 Int4 new_score = nodes[k].best_score + self_score - overlap_cost;
3641
3642 /* FIXME: some of the conditions double others */
3643 const Int4 hsp_len = hsp->query.end - hsp->query.offset;
3644 const Int4 newhsp_len = newhsp->query.end - newhsp->query.offset;
3645 const Int4 overlap_len = MAX(MIN(hsp->query.end, newhsp->query.end) -
3646 MAX(hsp->query.offset, newhsp->query.offset), 0);
3647
3648 const Int4 subj_overlap_len =
3649 MAX(MIN(hsp->subject.end, newhsp->subject.end) -
3650 MAX(hsp->subject.offset, newhsp->subject.offset), 0);
3651
3652
3653 /* add next HSP to the chain only if hsp, newhsp aligns to the
3654 subject behind hsp, and score improves */
3655 if (newhsp->query.offset > hsp->query.offset &&
3656 newhsp->query.end > hsp->query.end &&
3657
3658 newhsp->subject.offset > hsp->subject.offset &&
3659 newhsp->subject.end > hsp->subject.end &&
3660 newhsp->subject.offset - hsp->subject.end < kMaxIntronLength &&
3661 (double)overlap_len / hsp_len < 0.75 &&
3662 (double)overlap_len / newhsp_len < 0.75 &&
3663 (double)subj_overlap_len / hsp_len < 0.75 &&
3664 (double)subj_overlap_len / newhsp_len < 0.75) {
3665
3666 /* FIXME: this condition may not be necessary */
3667 if (newhsp->subject.offset - hsp->subject.end > 1) {
3668 /* The function that finds splice signals modifies
3669 HSPs, so we need to clone HSPs here. */
3670 BlastHSP* hsp_copy = Blast_HSPClone(hsp);
3671 BlastHSP* newhsp_copy = Blast_HSPClone(newhsp);
3672
3673 HSPChain* chain = HSPChainNew(hsp->context);
3674 chain->hsps = HSPContainerNew(&hsp_copy);
3675 chain->hsps->next = HSPContainerNew(&newhsp_copy);
3676
3677 s_FindSpliceJunctions(chain, query_blk, query_info,
3678 scoring_opts);
3679
3680 /* update score: add the difference between sum of
3681 two HSP scores minus overalp and the new score
3682 for the merged HSP */
3683 new_score += chain->score + overlap_cost -
3684 (newhsp->score + self_score);
3685
3686 chain = HSPChainFree(chain);
3687 }
3688 else if (newhsp->query.offset - hsp->query.end == 1) {
3689 /* FIXME: this is a temporary hack, we need to increase
3690 the score of combining two HSP into one with a
3691 mismatch so that it is scored the same as the one with
3692 splice signal */
3693 new_score++;
3694 }
3695 else {
3696 new_score = 0;
3697 }
3698
3699 if (new_score > nodes[i].best_score) {
3700 nodes[i].best_score = new_score;
3701 nodes[i].path_next = &(nodes[k]);
3702 }
3703 }
3704 }
3705
3706 /* we want to return all top scoring hits, so we save all paths with
3707 top score */
3708 if (nodes[i].best_score == best_score) {
3709 if (path->num_paths < MAX_NUM_HSP_PATHS) {
3710 path->start[path->num_paths++] = &(nodes[i]);
3711 }
3712 }
3713 else if (nodes[i].best_score > best_score) {
3714 best_score = nodes[i].best_score;
3715 path->start[0] = &(nodes[i]);
3716 path->num_paths = 1;
3717 }
3718 }
3719 ASSERT(path->num_paths);
3720 path->score = path->start[0]->best_score;
3721
3722 /* Find if any paths share an HSP and remove the one with the larger index.
3723 Doing this at the end ensures that we find optimal path in terms of
3724 score and/or HSP positions. */
3725 for (i = 0;i < path->num_paths - 1;i++) {
3726 if (!path->start[i]) {
3727 continue;
3728 }
3729 for (k = i + 1;k < path->num_paths;k++) {
3730 HSPNode* node_i = path->start[i];
3731 if (!path->start[k]) {
3732 continue;
3733 }
3734
3735 while (node_i) {
3736 HSPNode* node_k = path->start[k];
3737 while (node_k) {
3738 if (node_i->hsp[0] == node_k->hsp[0]) {
3739 path->start[k] = NULL;
3740 break;
3741 }
3742 node_k = node_k->path_next;
3743 }
3744 node_i = node_i->path_next;
3745 }
3746
3747 }
3748 }
3749
3750 ASSERT(path->num_paths > 0);
3751 ASSERT(path->start[0]);
3752
3753 /* conver from HSPPath to HSPChain structure */
3754 if (path->num_paths > 0) {
3755 HSPChain* ch = NULL;
3756 HSPContainer* hc = NULL;
3757 HSPNode* node = path->start[0];
3758
3759 retval = HSPChainNew((*node->hsp)->context);
3760 if (!retval) {
3761 return NULL;
3762 }
3763 retval->oid = oid;
3764 retval->score = path->score;
3765 retval->hsps = hc = HSPContainerNew(node->hsp);
3766 node = node->path_next;
3767 while (node) {
3768 hc->next = HSPContainerNew(node->hsp);
3769 if (!hc->next) {
3770 HSPChainFree(retval);
3771 return NULL;
3772 }
3773 hc = hc->next;
3774 node = node->path_next;
3775 }
3776 ASSERT(retval->hsps);
3777
3778 ch = retval;
3779 for (i = 1;i < path->num_paths;i++) {
3780 node = path->start[i];
3781 if (!node) {
3782 continue;
3783 }
3784
3785 if (path->score < cutoff_score) {
3786 continue;
3787 }
3788
3789 ch->next = HSPChainNew((*node->hsp)->context);
3790 ASSERT(ch->next);
3791 if (!ch->next) {
3792 HSPChainFree(retval);
3793 return NULL;
3794 }
3795 ch->next->oid = oid;
3796 ch->next->score = path->score;
3797 ch->next->hsps = hc = HSPContainerNew(node->hsp);
3798 node = node->path_next;
3799 while (node) {
3800 hc->next = HSPContainerNew(node->hsp);
3801 ASSERT(hc->next);
3802 if (!hc->next) {
3803 HSPChainFree(retval);
3804 return NULL;
3805 }
3806 hc = hc->next;
3807 node = node->path_next;
3808 }
3809 ch = ch->next;
3810 ASSERT(ch->hsps);
3811 }
3812 }
3813
3814 ASSERT(path->num_paths == 0 || s_TestChains(retval));
3815
3816 return retval;
3817 }
3818
3819 /* Compare HSPs by context (accending), and score (descending). */
3820 static int
s_CompareHSPsByContextScore(const void * v1,const void * v2)3821 s_CompareHSPsByContextScore(const void* v1, const void* v2)
3822 {
3823 BlastHSP* h1,* h2;
3824 BlastHSP** hp1,** hp2;
3825
3826 hp1 = (BlastHSP**) v1;
3827 hp2 = (BlastHSP**) v2;
3828 h1 = *hp1;
3829 h2 = *hp2;
3830
3831 if (!h1 && !h2)
3832 return 0;
3833 else if (!h1)
3834 return -1;
3835 else if (!h2)
3836 return 1;
3837
3838 if (h1->context < h2->context)
3839 return -1;
3840 if (h1->context > h2->context)
3841 return 1;
3842
3843 if (h1->score < h2->score)
3844 return 1;
3845 if (h1->score > h2->score)
3846 return -1;
3847
3848 return 0;
3849 }
3850
3851 static int
s_CompareHSPsByContextSubjectOffset(const void * v1,const void * v2)3852 s_CompareHSPsByContextSubjectOffset(const void* v1, const void* v2)
3853 {
3854 BlastHSP* h1,* h2;
3855 BlastHSP** hp1,** hp2;
3856
3857 hp1 = (BlastHSP**) v1;
3858 hp2 = (BlastHSP**) v2;
3859 h1 = *hp1;
3860 h2 = *hp2;
3861
3862 if (!h1 && !h2)
3863 return 0;
3864 else if (!h1)
3865 return 1;
3866 else if (!h2)
3867 return -1;
3868
3869 /* If these are from different contexts, don't compare offsets */
3870 if (h1->context < h2->context)
3871 return -1;
3872 if (h1->context > h2->context)
3873 return 1;
3874
3875 if (h1->subject.offset < h2->subject.offset)
3876 return -1;
3877 if (h1->subject.offset > h2->subject.offset)
3878 return 1;
3879
3880 if (h1->query.offset < h2->query.offset)
3881 return -1;
3882 if (h1->query.offset > h2->query.offset)
3883 return 1;
3884
3885 /* tie breakers: sort by decreasing score, then
3886 by increasing size of query range, then by
3887 increasing subject range. */
3888
3889 if (h1->score < h2->score)
3890 return 1;
3891 if (h1->score > h2->score)
3892 return -1;
3893
3894 if (h1->query.end < h2->query.end)
3895 return 1;
3896 if (h1->query.end > h2->query.end)
3897 return -1;
3898
3899 if (h1->subject.end < h2->subject.end)
3900 return 1;
3901 if (h1->subject.end > h2->subject.end)
3902 return -1;
3903
3904 return 0;
3905 }
3906
3907
3908 /* Pair data, used for selecting optimal pairs of aligned chains */
3909 typedef struct Pairinfo
3910 {
3911 HSPChain* first;
3912 HSPChain* second;
3913 Int4 distance;
3914 Int4 conf;
3915 Int4 score;
3916 Int4 trim_first; // when a chain overextends past beginning of the pair
3917 Int4 trim_second; // it needs to be trimmed
3918 Boolean valid_pair;
3919 } Pairinfo;
3920
3921
s_BlastHSPMapperSplicedRunCleanUp(HSPPath * path,HSPNode * nodes,Pairinfo * pair_data)3922 static void s_BlastHSPMapperSplicedRunCleanUp(HSPPath* path,
3923 HSPNode* nodes,
3924 Pairinfo* pair_data)
3925 {
3926 HSPPathFree(path);
3927
3928 if (nodes) {
3929 sfree(nodes);
3930 }
3931
3932 if (pair_data) {
3933 sfree(pair_data);
3934 }
3935 }
3936
3937
3938 /* Compare two HSP chain pairs aligned for the same query and subject
3939 by score and distance */
3940 static int
s_ComparePairs(const void * v1,const void * v2)3941 s_ComparePairs(const void* v1, const void* v2)
3942 {
3943 Pairinfo* a = (Pairinfo*)v1;
3944 Pairinfo* b = (Pairinfo*)v2;
3945
3946 if (a->score > b->score) {
3947 return -1;
3948 }
3949 if (a->score < b->score) {
3950 return 1;
3951 }
3952
3953 if (a->conf < b->conf) {
3954 return -1;
3955 }
3956 if (a->conf > b->conf) {
3957 return 1;
3958 }
3959
3960 if (a->distance < b->distance) {
3961 return -1;
3962 }
3963 if (a->distance > b->distance) {
3964 return 1;
3965 }
3966
3967 return 0;
3968 }
3969
3970
3971 /* Trim beginning of a chain alignment up to the given subject position */
s_TrimChainStartToSubjPos(HSPChain * chain,Int4 subj_pos,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info)3972 static Int4 s_TrimChainStartToSubjPos(HSPChain* chain, Int4 subj_pos,
3973 Int4 mismatch_score,
3974 Int4 gap_open_score,
3975 Int4 gap_extend_score,
3976 const BLAST_SequenceBlk* query_blk,
3977 const BlastQueryInfo* query_info)
3978 {
3979 HSPContainer* h = NULL;
3980 BlastHSP* hsp = NULL;
3981 BlastHSP* next_hsp = NULL;
3982 Int4 num_left;
3983 Int4 old_score;
3984 Uint1* query = NULL;
3985
3986 ASSERT(chain);
3987 ASSERT(subj_pos >= 0);
3988 if (!chain || subj_pos < 0) {
3989 return 0;
3990 }
3991
3992 query = query_blk->sequence +
3993 query_info->contexts[chain->context].query_offset;
3994
3995 /* first remove HSPs that are fully below the subject position */
3996 h = chain->hsps;
3997 while (h && h->hsp->subject.end < subj_pos) {
3998 HSPContainer* tmp = h;
3999 ASSERT(tmp);
4000 h = h->next;
4001 tmp->next = NULL;
4002 chain->score -= tmp->hsp->score;
4003 HSPContainerFree(tmp);
4004 }
4005 ASSERT(h);
4006 chain->hsps = h;
4007 if (!h) {
4008 return -1;
4009 }
4010
4011 /* if all remaning HSPs are above the subject position, then exit */
4012 if (h->hsp->subject.offset >= subj_pos) {
4013 return 0;
4014 }
4015
4016 /* otherwise move HSP start positions */
4017 hsp = h->hsp;
4018 num_left = subj_pos - hsp->subject.offset;
4019 ASSERT(num_left > 0 && num_left <= hsp->subject.end - hsp->subject.offset);
4020
4021 old_score = hsp->score;
4022 s_TrimHSP(hsp, num_left, FALSE, TRUE, mismatch_score, gap_open_score,
4023 gap_extend_score, query);
4024
4025 /* update chain score */
4026 chain->score -= old_score - hsp->score;
4027
4028 /* remove splice signals and exon flags */
4029 hsp->map_info->left_edge &= 0xff ^ MAPPER_SPLICE_SIGNAL;
4030 hsp->map_info->left_edge &= 0xff ^ MAPPER_EXON;
4031
4032 /* check whether the HSP is contained within the next one and remove it
4033 if it is */
4034 next_hsp = NULL;
4035 if (chain->hsps->next) {
4036 next_hsp = chain->hsps->next->hsp;
4037 }
4038 if (next_hsp && hsp->query.offset >= next_hsp->query.offset) {
4039 chain->hsps = h->next;
4040 h->next = NULL;
4041 HSPContainerFree(h);
4042 }
4043
4044 return 0;
4045 }
4046
4047 /* Trim chain alignment end up the the given subject position */
s_TrimChainEndToSubjPos(HSPChain * chain,Int4 subj_pos,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info)4048 static Int4 s_TrimChainEndToSubjPos(HSPChain* chain, Int4 subj_pos,
4049 Int4 mismatch_score,
4050 Int4 gap_open_score, Int4 gap_extend_score,
4051 const BLAST_SequenceBlk* query_blk,
4052 const BlastQueryInfo* query_info)
4053 {
4054 HSPContainer* h = NULL;
4055 BlastHSP* hsp = NULL;
4056 Int4 num_left;
4057 Int4 old_score;
4058 Uint1* query = NULL;
4059
4060 ASSERT(chain);
4061 ASSERT(subj_pos >= 0);
4062 if (!chain || subj_pos <= 0 || !query_blk || !query_info) {
4063 return -1;
4064 }
4065
4066 query = query_blk->sequence +
4067 query_info->contexts[chain->context].query_offset;
4068
4069 /* first remove HSPs that are full above the subject position */
4070 h = chain->hsps;
4071 while (h->next && h->next->hsp->subject.end < subj_pos) {
4072 h = h->next;
4073 }
4074
4075 if (h->next) {
4076 HSPContainer* hc = NULL;
4077 if (h->next->hsp->subject.offset < subj_pos) {
4078 h = h->next;
4079 }
4080
4081 /* update chain score */
4082 for (hc = h->next;hc != NULL;hc = hc->next) {
4083 chain->score -= hc->hsp->score;
4084 }
4085
4086 HSPContainerFree(h->next);
4087 h->next = NULL;
4088 }
4089
4090 /* if all remaning HSPs are below the subject position, exit */
4091 if (h->hsp->subject.end <= subj_pos) {
4092 return 0;
4093 }
4094
4095 /* otheriwse move HSP end positions */
4096 hsp = h->hsp;
4097 num_left = hsp->subject.end - subj_pos;
4098 ASSERT(num_left > 0 && num_left <= hsp->subject.end - hsp->subject.offset);
4099
4100 old_score = hsp->score;
4101 s_TrimHSP(hsp, num_left, FALSE, FALSE, mismatch_score, gap_open_score,
4102 gap_extend_score, query);
4103
4104 /* update chain score */
4105 chain->score -= old_score - hsp->score;
4106
4107 /* remove splice signals and exon flags */
4108 hsp->map_info->right_edge &= 0xff ^ MAPPER_SPLICE_SIGNAL;
4109 hsp->map_info->right_edge &= 0xff ^ MAPPER_EXON;
4110
4111 /* check whether the HSP is contained within the next one and remove it
4112 if it is */
4113 if (chain->hsps != h) {
4114 HSPContainer* hc = chain->hsps;
4115 while (hc && hc->next != h) {
4116 hc = hc->next;
4117 }
4118 ASSERT(hc && hc->next == h);
4119 if (hc->hsp->query.end >= h->hsp->query.end) {
4120 chain->score -= h->hsp->score;
4121 HSPContainerFree(h);
4122 hc->next = NULL;
4123 }
4124 }
4125
4126 return 0;
4127 }
4128
4129
4130 /* Find optimal pairs */
s_FindBestPairs(HSPChain ** first_list,HSPChain ** second_list,Int4 min_score,Pairinfo ** pair_info_ptr,Int4 * max_num_pairs,Boolean is_spliced,const ScoringOptions * scoring_options,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info)4131 static Boolean s_FindBestPairs(HSPChain** first_list,
4132 HSPChain** second_list,
4133 Int4 min_score,
4134 Pairinfo** pair_info_ptr,
4135 Int4* max_num_pairs,
4136 Boolean is_spliced,
4137 const ScoringOptions* scoring_options,
4138 const BLAST_SequenceBlk* query_blk,
4139 const BlastQueryInfo* query_info)
4140 {
4141 HSPChain* first;
4142 HSPChain* second;
4143 Pairinfo* pair_info = *pair_info_ptr;
4144 Int4 conv_bonus = 0;
4145 Int4 num_pairs = 0;
4146 Boolean found = FALSE;
4147 const Int4 kMaxInsertSize = is_spliced ?
4148 MAGICBLAST_MAX_INSERT_SIZE_SPLICED :
4149 MAGICBLAST_MAX_INSERT_SIZE_NONSPLICED;
4150
4151 /* iterate over all pairs of HSP chains for the first and second read of
4152 the pair and collect pair information */
4153 for (first = *first_list; first; first = first->next) {
4154 for (second = *second_list; second; second = second->next) {
4155 Int2 first_frame = first->hsps->hsp->query.frame;
4156 Int2 second_frame = second->hsps->hsp->query.frame;
4157
4158 /* reallocate pair info array if necessary */
4159 if (num_pairs >= *max_num_pairs) {
4160 *max_num_pairs *= 2;
4161 Pairinfo* new_pair_info =
4162 realloc(pair_info, *max_num_pairs * sizeof(Pairinfo));
4163 if (!new_pair_info) {
4164 return -1;
4165 }
4166 pair_info = new_pair_info;
4167 *pair_info_ptr = new_pair_info;
4168 }
4169
4170 /* collect pair information */
4171 pair_info[num_pairs].first = first;
4172 pair_info[num_pairs].second = second;
4173 pair_info[num_pairs].score = first->score + second->score;
4174 pair_info[num_pairs].trim_first = 0;
4175 pair_info[num_pairs].trim_second = 0;
4176 pair_info[num_pairs].valid_pair = 0;
4177 pair_info[num_pairs].distance = 0;
4178
4179 /* if the chains align on the opposite strands */
4180 ASSERT(first_frame != 0 && second_frame != 0);
4181 if (SIGN(first_frame) != SIGN(second_frame)) {
4182 HSPChain* plus = NULL, *minus = NULL;
4183 Int4 plus_start, minus_start;
4184 HSPContainer* hsp = NULL;
4185 Int4 distance = 0;
4186
4187 /* find which query aligns to plus and which to minus strand
4188 on the subject */
4189 if (first_frame > 0) {
4190 plus = first;
4191 }
4192 else {
4193 minus = first;
4194 }
4195 if (second_frame > 0) {
4196 plus = second;
4197 }
4198 else {
4199 minus = second;
4200 }
4201 ASSERT(plus && minus);
4202
4203 /* find start positions on the subject, for the minus strand we
4204 are interested subject start position on the minus strand,
4205 but the HSP has subject on plus and query on the minus
4206 strand, so we need to flip the query */
4207 plus_start = plus->hsps->hsp->subject.offset;
4208 hsp = minus->hsps;
4209 while (hsp->next) {
4210 hsp = hsp->next;
4211 }
4212 ASSERT(hsp);
4213 minus_start = hsp->hsp->subject.end;
4214
4215 /* this is also fragment length */
4216 distance = minus_start - plus_start;
4217 pair_info[num_pairs].distance = distance;
4218
4219 /* distance > 0 indicates a convergent pair (typical) */
4220 if (distance > 0 && distance < kMaxInsertSize) {
4221 Int4 plus_end, minus_end;
4222 hsp = plus->hsps;
4223 while (hsp->next) {
4224 hsp = hsp->next;
4225 }
4226 ASSERT(hsp);
4227 plus_end = hsp->hsp->subject.end;
4228 minus_end = minus->hsps->hsp->subject.offset;
4229
4230 /* HSP chain end going past the beginning of the chain
4231 for the pair, indicates that the this end of the
4232 query may be aligned to an adapater instead of a real
4233 sequence and needs to be trimmed */
4234 if (plus_end > minus_start || minus_end < plus_start) {
4235 ASSERT(plus && minus);
4236
4237 /* check the 3' (right) side of the fragment */
4238 if (plus_end > minus_start) {
4239 ASSERT(plus_end - minus_start > 0);
4240
4241 /* decrease the score by the trimmed portion; we
4242 assume here that the trimmed part aligns
4243 perfectly, which may not be the case, but this
4244 score is only used for comparison with different
4245 pairs */
4246 pair_info[num_pairs].score -=
4247 plus_end - minus_start;
4248
4249 /* Mark which chain and up to what subject location
4250 needs to be trimmed. We cannot trim here,
4251 because pair_info holds poiners to chains that
4252 may be part of better pairs */
4253 if (plus == pair_info[num_pairs].first) {
4254 pair_info[num_pairs].trim_first = minus_start;
4255 }
4256 else {
4257 pair_info[num_pairs].trim_second = minus_start;
4258 }
4259 }
4260
4261 /* check the 5' (left) side doe the fragment */
4262 if (minus_end < plus_start) {
4263 ASSERT(plus_start - minus_end > 0);
4264
4265 pair_info[num_pairs].score -=
4266 plus_start - minus_end;
4267
4268 if (minus == pair_info[num_pairs].first) {
4269 pair_info[num_pairs].trim_first = plus_start;
4270 }
4271 else {
4272 pair_info[num_pairs].trim_second = plus_start;
4273 }
4274 }
4275 }
4276
4277 pair_info[num_pairs].conf = PAIR_CONVERGENT;
4278
4279 /* add a bonus for convergent pairs; because they are more
4280 common we prefer them over different configurations
4281 with a sligntly better score */
4282 pair_info[num_pairs].score += conv_bonus;
4283 }
4284 else {
4285 pair_info[num_pairs].conf = PAIR_DIVERGENT;
4286 }
4287
4288 /* skip pairs with a score smaller than one found for a
4289 different subject */
4290 if (pair_info[num_pairs].score < min_score) {
4291 continue;
4292 }
4293
4294 num_pairs++;
4295 }
4296 else {
4297 /* for read pairs aligned in the same direction */
4298 pair_info[num_pairs].conf = PAIR_PARALLEL;
4299 pair_info[num_pairs].score -= 1;
4300 num_pairs++;
4301 }
4302 }
4303 }
4304
4305 if (num_pairs > 0) {
4306 Int4 i;
4307 Int4 best_score = pair_info[0].score;
4308 Int4 margin = 5;
4309 HSPChain* ch = NULL;
4310 Boolean convergent_found = FALSE;
4311
4312 /* sort pair information by score, configuration distance */
4313 qsort(pair_info, num_pairs, sizeof(Pairinfo), s_ComparePairs);
4314
4315 /* iterate over pair information and collect pairs */
4316 for (i=0;i < num_pairs;i++) {
4317 HSPChain* f = pair_info[i].first;
4318 HSPChain* s = pair_info[i].second;
4319
4320 /* skip pairs with chains already used in other pairs
4321 and other configurations when convergent one was found */
4322 if (pair_info[i].first->pair || pair_info[i].second->pair ||
4323 (convergent_found && pair_info[i].conf != PAIR_CONVERGENT)) {
4324
4325 continue;
4326 }
4327
4328 if (pair_info[i].conf == PAIR_CONVERGENT) {
4329 convergent_found = TRUE;
4330 }
4331
4332 /* skip pairs with scores significantly smaller than the best one
4333 found */
4334 if (best_score - pair_info[i].score > margin) {
4335 break;
4336 }
4337
4338 /* link chains as a pair */
4339 f->pair = s;
4340 s->pair = f;
4341 f->pair_conf = s->pair_conf = pair_info[i].conf;
4342 pair_info[i].valid_pair = TRUE;
4343 found = TRUE;
4344 }
4345
4346 /* for chains that pair with already paired chains, clone the paired
4347 chain */
4348 for (ch = *first_list; ch; ch = ch->next) {
4349 HSPChain* pair = NULL;
4350
4351 if (ch->pair) {
4352 continue;
4353 }
4354
4355 i = 0;
4356 while (i < num_pairs &&
4357 (pair_info[i].first != ch || !pair_info[i].second->pair ||
4358 pair_info[i].conf != PAIR_CONVERGENT ||
4359 best_score - pair_info[i].score > margin)) {
4360 i++;
4361 }
4362 if (i >= num_pairs) {
4363 continue;
4364 }
4365 ASSERT(pair_info[i].second->pair);
4366
4367 pair = CloneChain(pair_info[i].second);
4368 ASSERT(pair);
4369 ASSERT(s_TestChains(pair));
4370 pair_info[i].second = pair;
4371 ch->pair = pair;
4372 pair->pair = ch;
4373 ch->pair_conf = pair->pair_conf = pair_info[i].conf;
4374 pair_info[i].valid_pair = TRUE;
4375 HSPChainListInsert(second_list, &pair, 0, -1, FALSE);
4376 }
4377
4378 for (ch = *second_list; ch; ch = ch->next) {
4379 HSPChain* pair = NULL;
4380
4381 if (ch->pair) {
4382 continue;
4383 }
4384
4385 i = 0;
4386 while (i < num_pairs &&
4387 (pair_info[i].second != ch || !pair_info[i].first->pair ||
4388 pair_info[i].conf != PAIR_CONVERGENT ||
4389 best_score - pair_info[i].score > margin)) {
4390 i++;
4391 }
4392 if (i >= num_pairs) {
4393 continue;
4394 }
4395 ASSERT(pair_info[i].first->pair);
4396
4397 pair = CloneChain(pair_info[i].first);
4398 ASSERT(pair);
4399 ASSERT(s_TestChains(pair));
4400 pair_info[i].first = pair;
4401 ch->pair = pair;
4402 pair->pair = ch;
4403 ch->pair_conf = pair->pair_conf = pair_info[i].conf;
4404 pair_info[i].valid_pair = TRUE;
4405 HSPChainListInsert(first_list, &pair, 0, -1, FALSE);
4406 }
4407
4408 /* trim the alignments that overextend beyond the pair */
4409 for (i = 0;i < num_pairs;i++) {
4410 if (!pair_info[i].valid_pair ||
4411 !pair_info[i].first->pair || !pair_info[i].second->pair) {
4412 continue;
4413 }
4414
4415 /* trim the first chain if its end goes beyond the start of
4416 second on the subject */
4417 if (pair_info[i].trim_first > 0) {
4418 if (pair_info[i].first->hsps->hsp->query.frame > 0) {
4419 s_TrimChainEndToSubjPos(pair_info[i].first,
4420 pair_info[i].trim_first,
4421 scoring_options->penalty,
4422 scoring_options->gap_open,
4423 scoring_options->gap_extend,
4424 query_blk,
4425 query_info);
4426 }
4427 else {
4428 /* minus strand of the qurey is represented by another
4429 (reverse complemented) sequence aligned like the plus
4430 strand, so the end of the chain is really the beginnig
4431 by query coordinates */
4432 s_TrimChainStartToSubjPos(pair_info[i].first,
4433 pair_info[i].trim_first,
4434 scoring_options->penalty,
4435 scoring_options->gap_open,
4436 scoring_options->gap_extend,
4437 query_blk,
4438 query_info);
4439 }
4440 }
4441
4442 /* trim the second chain if its end goes beyond the start of the
4443 first on the subject */
4444 if (pair_info[i].trim_second > 0) {
4445 if (pair_info[i].second->hsps->hsp->query.frame > 0) {
4446 s_TrimChainEndToSubjPos(pair_info[i].second,
4447 pair_info[i].trim_second,
4448 scoring_options->penalty,
4449 scoring_options->gap_open,
4450 scoring_options->gap_extend,
4451 query_blk,
4452 query_info);
4453 }
4454 else {
4455 s_TrimChainStartToSubjPos(pair_info[i].second,
4456 pair_info[i].trim_second,
4457 scoring_options->penalty,
4458 scoring_options->gap_open,
4459 scoring_options->gap_extend,
4460 query_blk,
4461 query_info);
4462 }
4463 }
4464
4465 }
4466 }
4467
4468 return found;
4469 }
4470
4471
FindPartialyCoveredQueries(void * data,Int4 oid,Int4 word_size)4472 HSPChain* FindPartialyCoveredQueries(void* data, Int4 oid, Int4 word_size)
4473 {
4474 BlastHSPMapperData* spl_data = data;
4475 BlastQueryInfo* query_info = spl_data->query_info;
4476 HSPChain** saved = spl_data->saved_chains;
4477 HSPChain* retval = NULL;
4478 HSPChain* last;
4479 Int4 i;
4480
4481 for (i = 0;i < query_info->num_queries;i++) {
4482 HSPChain* chain = saved[i];
4483
4484 for (chain = saved[i]; chain; chain = chain->next) {
4485 HSPContainer* h;
4486
4487 if (chain->oid != oid) {
4488 continue;
4489 }
4490
4491 if (chain->score < 30) {
4492 continue;
4493 }
4494
4495 h = chain->hsps;
4496
4497 if (h->hsp->query.offset > word_size) {
4498
4499 if (!retval) {
4500 retval = CloneChain(chain);
4501 last = retval;
4502 }
4503 else {
4504 last->next = CloneChain(chain);
4505 last = last->next;
4506 }
4507
4508 continue;
4509 }
4510
4511 while (h->next) {
4512 h = h->next;
4513 }
4514 if (query_info->contexts[h->hsp->context].query_length -
4515 h->hsp->query.end > word_size) {
4516
4517 if (!retval) {
4518 retval = CloneChain(chain);
4519 last = retval;
4520 }
4521 else {
4522 last->next = CloneChain(chain);
4523 last = last->next;
4524 }
4525 }
4526 }
4527 }
4528
4529 return retval;
4530 }
4531
4532
4533 /** Perform writing task for paired reads
4534 * ownership of the HSP list and sets the dereferenced pointer to NULL.
4535 * @param data To store results to [in][out]
4536 * @param hsp_list Pointer to the HSP list to save in the collector. [in]
4537 */
4538 static int
s_BlastHSPMapperSplicedPairedRun(void * data,BlastHSPList * hsp_list)4539 s_BlastHSPMapperSplicedPairedRun(void* data, BlastHSPList* hsp_list)
4540 {
4541 BlastHSPMapperData* spl_data = data;
4542 BlastHSPMapperParams* params = spl_data->params;
4543 const ScoringOptions* scoring_opts = ¶ms->scoring_options;
4544 Boolean is_spliced = params->splice;
4545 const Int4 kLongestIntron = params->longest_intron;
4546 Int4 cutoff_score = params->cutoff_score;
4547 Int4* cutoff_score_fun = params->cutoff_score_fun;
4548 Int4 cutoff_edit_dist = params->cutoff_edit_dist;
4549 BLAST_SequenceBlk* query_blk = spl_data->query;
4550 BlastQueryInfo* query_info = spl_data->query_info;
4551 const Int4 kDefaultMaxHsps = 1000;
4552 Int4 max_hsps = kDefaultMaxHsps;
4553 HSPNode* nodes = calloc(max_hsps, sizeof(HSPNode));
4554 HSPPath* path = NULL;
4555 BlastHSP** hsp_array;
4556 Int4 num_hsps = 0;
4557 Int4 i;
4558
4559 HSPChain** saved_chains = spl_data->saved_chains;
4560 HSPChain* chain_array[2];
4561 HSPChain* first = NULL, *second = NULL;
4562 Int4 workspace_size = 40;
4563 Pairinfo* pair_workspace = calloc(workspace_size, sizeof(Pairinfo));
4564 /* the score bonus needs to include exon pars that were to short to be
4565 aligned (at either end of the chain) */
4566 const Int4 kPairBonus = is_spliced ? 21 : 5;
4567
4568 if (!hsp_list) {
4569 s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4570 return 0;
4571 }
4572
4573 if (!params || !nodes) {
4574 s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4575 return -1;
4576 }
4577
4578 hsp_array = hsp_list->hsp_array;
4579
4580 path = HSPPathNew();
4581
4582 /* sort HSPs by query context and subject position for spliced aligments */
4583 if (is_spliced) {
4584 qsort(hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
4585 s_CompareHSPsByContextSubjectOffset);
4586 }
4587 else {
4588 /* for non spliced alignments, sort HSPs by query context and score */
4589 qsort(hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
4590 s_CompareHSPsByContextScore);
4591 }
4592
4593 i = 0;
4594 chain_array[0] = NULL;
4595 chain_array[1] = NULL;
4596 while (i < hsp_list->hspcnt) {
4597
4598 Int4 context = hsp_array[i]->context;
4599 /* is this query the first segment of a pair */
4600 Boolean has_pair = (query_info->contexts[context].segment_flags ==
4601 eFirstSegment);
4602 Int4 query_idx = context / NUM_STRANDS;
4603 Int4 first_context = query_idx * NUM_STRANDS;
4604 Int4 context_next_fragment = has_pair ? (query_idx + 2) * NUM_STRANDS :
4605 (query_idx + 1) * NUM_STRANDS;
4606
4607 HSPChain* new_chains = NULL;
4608
4609 /* iterate over contexts that belong to a read pair */
4610 while (i < hsp_list->hspcnt &&
4611 hsp_array[i]->context < context_next_fragment) {
4612
4613 memset(nodes, 0, num_hsps * sizeof(HSPNode));
4614 num_hsps = 0;
4615 context = hsp_array[i]->context;
4616
4617 /* collect HSPs for the current context */
4618 while (i < hsp_list->hspcnt && hsp_array[i]->context == context) {
4619
4620 /* Overlapping portions of two consecutive subject chunks
4621 may produce two HSPs representing the same alignment. We
4622 drop one of them here (assuming the HSPs for the same
4623 context are sorted by score or subject position). */
4624 BlastHSP* h = (num_hsps == 0 ? NULL : *nodes[num_hsps - 1].hsp);
4625 if (!h || hsp_array[i]->context != h->context ||
4626 hsp_array[i]->subject.offset != h->subject.offset ||
4627 hsp_array[i]->score != h->score) {
4628
4629 /* reallocate node array if necessary */
4630 if (num_hsps >= max_hsps) {
4631 HSPNode* new_nodes = NULL;
4632
4633 /* For non-spliced mapping we are only interested in
4634 the best scoring whole HSPs. HSPs are sorted by
4635 score and those that map to many places are not good
4636 so the node array does not need to be extended. */
4637 if (!is_spliced) {
4638 break;
4639 }
4640
4641 max_hsps *= 2;
4642 new_nodes = calloc(max_hsps, sizeof(HSPNode));
4643 if (!nodes) {
4644 s_BlastHSPMapperSplicedRunCleanUp(path, nodes,
4645 pair_workspace);
4646 return -1;
4647 }
4648 s_HSPNodeArrayCopy(new_nodes, nodes, num_hsps);
4649 free(nodes);
4650 nodes = new_nodes;
4651 }
4652
4653 nodes[num_hsps].hsp = &(hsp_array[i]);
4654 nodes[num_hsps].best_score = hsp_array[i]->score;
4655
4656 num_hsps++;
4657 }
4658 i++;
4659 }
4660
4661 /* find cutoff score for the query */
4662 if (cutoff_score_fun[1] != 0) {
4663 Int4 query_len = query_info->contexts[context].query_length;
4664 cutoff_score = (cutoff_score_fun[0] +
4665 cutoff_score_fun[1] * query_len) / 100;
4666 }
4667 else if (params->cutoff_score == 0) {
4668 cutoff_score = GetCutoffScore(
4669 query_info->contexts[context].query_length);
4670 }
4671
4672 /* find the best scoring chain of HSPs that align to exons
4673 parts of the query and the subject */
4674 new_chains = s_FindBestPath(nodes, num_hsps, path,
4675 hsp_list->oid, is_spliced,
4676 kLongestIntron, cutoff_score,
4677 query_blk, query_info, scoring_opts);
4678
4679 ASSERT(new_chains);
4680 HSPChainListInsert(&chain_array[(context - first_context) /
4681 NUM_STRANDS],
4682 &new_chains, cutoff_score, cutoff_edit_dist,
4683 FALSE);
4684
4685 /* skip HSPs for the same context if there are more then max
4686 allowed per context */
4687 while (i < hsp_list->hspcnt && hsp_array[i]->context == context) {
4688 i++;
4689 }
4690 }
4691
4692 #if _DEBUG
4693 ASSERT(!chain_array[0] || s_TestChainsSorted(chain_array[0]));
4694 ASSERT(!chain_array[1] || s_TestChainsSorted(chain_array[1]));
4695 #endif
4696
4697 if (is_spliced && chain_array[0]) {
4698 s_FindSpliceJunctions(chain_array[0], query_blk, query_info,
4699 scoring_opts);
4700 }
4701
4702 if (is_spliced && chain_array[1]) {
4703 s_FindSpliceJunctions(chain_array[1], query_blk, query_info,
4704 scoring_opts);
4705 }
4706
4707 first = chain_array[0];
4708 second = chain_array[1];
4709
4710 /* If the chains provide sufficient score check for pairs.
4711 The score bonus is added to prefer a pair with a slightly smaller
4712 score to single chains with a better one. */
4713 if (first && second) {
4714
4715 s_FindBestPairs(&first, &second, 0, &pair_workspace,
4716 &workspace_size, is_spliced, scoring_opts,
4717 query_blk, query_info);
4718
4719 ASSERT(s_TestChains(first));
4720 ASSERT(s_TestChains(second));
4721 }
4722
4723 /* save all chains and remove ones with scores lower than
4724 best score - kPairBonus */
4725 if (first) {
4726 HSPChainListInsert(&saved_chains[query_idx], &first, cutoff_score,
4727 cutoff_edit_dist, TRUE);
4728 HSPChainListTrim(saved_chains[query_idx], kPairBonus);
4729
4730
4731 #if _DEBUG
4732 ASSERT(!saved_chains[query_idx] ||
4733 s_TestChainsSorted(saved_chains[query_idx]));
4734 #endif
4735 }
4736 if (second) {
4737 HSPChainListInsert(&saved_chains[query_idx + 1], &second,
4738 cutoff_score, cutoff_edit_dist, TRUE);
4739 HSPChainListTrim(saved_chains[query_idx + 1], kPairBonus);
4740
4741
4742 #if _DEBUG
4743 ASSERT(!saved_chains[query_idx + 1] ||
4744 s_TestChainsSorted(saved_chains[query_idx + 1]));
4745 #endif
4746 }
4747
4748 /* make temporary lists empty */
4749 chain_array[0] = chain_array[1] = NULL;
4750 }
4751
4752 /* delete HSPs that were not saved in results */
4753 hsp_list = Blast_HSPListFree(hsp_list);
4754
4755 s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4756
4757 return 0;
4758 }
4759
4760
4761 /** Free the writer
4762 * @param writer The writer to free [in]
4763 * @return NULL.
4764 */
4765 static
4766 BlastHSPWriter*
s_BlastHSPMapperFree(BlastHSPWriter * writer)4767 s_BlastHSPMapperFree(BlastHSPWriter* writer)
4768 {
4769 BlastHSPMapperData *data = writer->data;
4770 sfree(data->params);
4771 sfree(writer->data);
4772 sfree(writer);
4773 return NULL;
4774 }
4775
4776 /** create the writer
4777 * @param params Pointer to the hit paramters [in]
4778 * @param query_info BlastQueryInfo (not used) [in]
4779 * @return writer
4780 */
4781 static
4782 BlastHSPWriter*
s_BlastHSPMapperPairedNew(void * params,BlastQueryInfo * query_info,BLAST_SequenceBlk * query)4783 s_BlastHSPMapperPairedNew(void* params, BlastQueryInfo* query_info,
4784 BLAST_SequenceBlk* query)
4785 {
4786 BlastHSPWriter* writer = NULL;
4787 BlastHSPMapperData* data = NULL;
4788 /* BlastHSPMapperParams * spl_param = params; */
4789
4790 /* allocate space for writer */
4791 writer = malloc(sizeof(BlastHSPWriter));
4792
4793 /* fill up the function pointers */
4794 writer->InitFnPtr = &s_BlastHSPMapperPairedInit;
4795 writer->FinalFnPtr = &s_BlastHSPMapperFinal;
4796 writer->FreeFnPtr = &s_BlastHSPMapperFree;
4797 writer->RunFnPtr = &s_BlastHSPMapperSplicedPairedRun;
4798
4799 /* allocate for data structure */
4800 writer->data = calloc(1, sizeof(BlastHSPMapperData));
4801 data = writer->data;
4802 data->params = params;
4803 data->query_info = query_info;
4804 data->query = query;
4805
4806 return writer;
4807 }
4808
4809
4810 /*************************************************************/
4811 /** The following are exported functions to be used by APP */
4812
4813 BlastHSPMapperParams*
BlastHSPMapperParamsNew(const BlastHitSavingOptions * hit_options,const BlastScoringOptions * scoring_options)4814 BlastHSPMapperParamsNew(const BlastHitSavingOptions* hit_options,
4815 const BlastScoringOptions* scoring_options)
4816 {
4817 BlastHSPMapperParams* retval = NULL;
4818
4819 if (hit_options == NULL)
4820 return NULL;
4821
4822 retval = (BlastHSPMapperParams*)malloc(sizeof(BlastHSPMapperParams));
4823 retval->hitlist_size = MAX(hit_options->hitlist_size, 10);
4824 retval->paired = hit_options->paired;
4825 retval->splice = hit_options->splice;
4826 retval->longest_intron = hit_options->longest_intron;
4827 retval->cutoff_score = hit_options->cutoff_score;
4828 retval->cutoff_score_fun[0] = hit_options->cutoff_score_fun[0];
4829 retval->cutoff_score_fun[1] = hit_options->cutoff_score_fun[1];
4830 retval->cutoff_edit_dist = hit_options->max_edit_distance;
4831 retval->program = hit_options->program_number;
4832 retval->scoring_options.reward = scoring_options->reward;
4833 retval->scoring_options.penalty = scoring_options->penalty;
4834 retval->scoring_options.gap_open = -scoring_options->gap_open;
4835 retval->scoring_options.gap_extend = -scoring_options->gap_extend;
4836 retval->scoring_options.no_splice_signal = -2;
4837 return retval;
4838 }
4839
4840 BlastHSPMapperParams*
BlastHSPMapperParamsFree(BlastHSPMapperParams * opts)4841 BlastHSPMapperParamsFree(BlastHSPMapperParams* opts)
4842 {
4843 if ( !opts )
4844 return NULL;
4845 sfree(opts);
4846 return NULL;
4847 }
4848
4849 BlastHSPWriterInfo*
BlastHSPMapperInfoNew(BlastHSPMapperParams * params)4850 BlastHSPMapperInfoNew(BlastHSPMapperParams* params) {
4851 BlastHSPWriterInfo * writer_info =
4852 malloc(sizeof(BlastHSPWriterInfo));
4853
4854 writer_info->NewFnPtr = &s_BlastHSPMapperPairedNew;
4855
4856 writer_info->params = params;
4857 return writer_info;
4858 }
4859