1 /*  $Id: hspfilter_mapper.c 587429 2019-06-04 17:18:58Z boratyng $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Greg Boratyn
27  *
28  */
29 
30 /** @file hspfilter_mapper.c
31  * Implementation of the BlastHSPWriter interface to save only the best scoring * chains of HSPs for aligning RNA-Seq sequences to a genome.
32  */
33 
34 #include <algo/blast/core/hspfilter_mapper.h>
35 #include <algo/blast/core/blast_util.h>
36 #include <algo/blast/core/blast_hits.h>
37 #include <algo/blast/core/spliced_hits.h>
38 #include "jumper.h"
39 
40 /* Pair configurations, in the order of prefernece */
41 #define PAIR_CONVERGENT 0
42 #define PAIR_DIVERGENT 1
43 #define PAIR_PARALLEL 2
44 #define PAIR_NONE 3
45 
46 /* Insert a single chain into the list so that the list is sorted in decending
47    order of chain scores. */
s_HSPChainListInsertOne(HSPChain ** list,HSPChain * chain,Boolean check_for_duplicates)48 static Int4 s_HSPChainListInsertOne(HSPChain** list, HSPChain* chain,
49                                     Boolean check_for_duplicates)
50 {
51     HSPChain* ch = NULL;
52 
53     if (!list || !chain) {
54         return -1;
55     }
56     ASSERT(!chain->next);
57 
58     if (!*list) {
59         *list = chain;
60         return 0;
61     }
62 
63     ch = *list;
64     if (ch->score < chain->score) {
65         chain->next = ch;
66         *list = chain;
67         return 0;
68     }
69 
70     /* check for duplicate chain: the new chains may be the same as already
71        saved; this may come from writing HSPs to HSP stream for each chunk */
72     if (check_for_duplicates &&
73         ch->oid == chain->oid && ch->score == chain->score &&
74         ch->hsps->hsp->query.frame == chain->hsps->hsp->query.frame &&
75         ch->hsps->hsp->subject.offset ==
76         chain->hsps->hsp->subject.offset) {
77 
78         chain->next = NULL;
79         chain = HSPChainFree(chain);
80         return 0;
81     }
82 
83     while (ch->next && ch->next->score >= chain->score){
84 
85         /* check for duplicate chain: the new chains may be the same as already
86            saved; this may come from writing HSPs to HSP stream for each chunk */
87         if (check_for_duplicates &&
88             ch->next->oid == chain->oid && ch->next->score == chain->score &&
89             ch->next->hsps->hsp->query.frame == chain->hsps->hsp->query.frame &&
90             ch->next->hsps->hsp->subject.offset ==
91             chain->hsps->hsp->subject.offset) {
92 
93             chain->next = NULL;
94             chain = HSPChainFree(chain);
95             return 0;
96         }
97 
98         ch = ch->next;
99     }
100     ASSERT(ch);
101     chain->next = ch->next;
102     ch->next = chain;
103 
104     return 0;
105 }
106 
107 
s_TestCutoffs(HSPChain * chain,Int4 cutoff_score,Int4 cutoff_edit_dist)108 static Boolean s_TestCutoffs(HSPChain* chain, Int4 cutoff_score,
109                              Int4 cutoff_edit_dist)
110 {
111     if (!chain) {
112         return FALSE;
113     }
114 
115     if (chain->score >= cutoff_score) {
116 
117         Int4 align_len = 0;
118         Int4 num_identical = 0;
119         HSPContainer* h = chain->hsps;
120 
121         if (cutoff_edit_dist < 0) {
122             return TRUE;
123         }
124 
125         for (; h; h = h->next) {
126             align_len += MAX(h->hsp->query.end - h->hsp->query.offset,
127                              h->hsp->subject.end - h->hsp->subject.offset);
128 
129             num_identical += h->hsp->num_ident;
130 
131             ASSERT(h->hsp->num_ident <=
132                    MAX(h->hsp->query.end - h->hsp->query.offset,
133                        h->hsp->subject.end - h->hsp->subject.offset));
134         }
135 
136         if (align_len - num_identical <= cutoff_edit_dist) {
137             return TRUE;
138         }
139     }
140 
141     return FALSE;
142 }
143 
144 /* Insert chains into the list so that the list is sorted in descending order
145    of chain scores. If chain is a list, each element is added separately. The
146    list must be sorted before adding chain */
HSPChainListInsert(HSPChain ** list,HSPChain ** chain,Int4 cutoff_score,Int4 cutoff_edit_dist,Boolean check_for_duplicates)147 static Int4 HSPChainListInsert(HSPChain** list, HSPChain** chain,
148                                Int4 cutoff_score, Int4 cutoff_edit_dist,
149                                Boolean check_for_duplicates)
150 {
151     HSPChain* ch = NULL;
152     Int4 status = 0;
153 
154     if (!list || !chain) {
155         return -1;
156     }
157 
158     ch = *chain;
159     while (ch) {
160         HSPChain* next = ch->next;
161         ch->next = NULL;
162         if (ch->score >= cutoff_score) {
163             status = s_HSPChainListInsertOne(list, ch, check_for_duplicates);
164         }
165         else {
166             HSPChainFree(ch);
167         }
168         if (status) {
169             return status;
170         }
171         ch = next;
172 
173     }
174 
175     *chain = NULL;
176     return 0;
177 }
178 
179 /* Remove from the list chains with scores lower than the best one by at least
180    the given margin. The list must be sorted in descending order of scores. */
HSPChainListTrim(HSPChain * list,Int4 margin)181 static Int4 HSPChainListTrim(HSPChain* list, Int4 margin)
182 {
183     HSPChain* ch = NULL;
184     Int4 best_score;
185 
186     if (!list) {
187         return -1;
188     }
189 
190     best_score = list->score;
191     ch = list;
192     while (ch->next && best_score - ch->next->score <= margin) {
193         ASSERT(best_score - ch->next->score >= 0);
194         ch = ch->next;
195     }
196     ASSERT(ch);
197 
198     ch->next = HSPChainFree(ch->next);
199 
200     return 0;
201 }
202 
203 
204 /* Test that all pointers in the chain are set */
s_TestChains(HSPChain * chain)205 static Boolean s_TestChains(HSPChain* chain)
206 {
207     HSPContainer* hc;
208 
209     ASSERT(chain);
210     hc = chain->hsps;
211     ASSERT(hc);
212     ASSERT(hc->hsp);
213     ASSERT(hc->hsp->context == chain->context);
214 
215     ASSERT(hc->hsp->gap_info->size > 1 ||
216            hc->hsp->query.end - hc->hsp->query.offset ==
217            hc->hsp->subject.end - hc->hsp->subject.offset);
218 
219     hc = hc->next;
220     while (hc) {
221         ASSERT(hc->hsp);
222         ASSERT(hc->hsp->context == chain->context);
223 
224         ASSERT(hc->hsp->gap_info->size > 1 ||
225                hc->hsp->query.end - hc->hsp->query.offset ==
226                hc->hsp->subject.end - hc->hsp->subject.offset);
227 
228         if (hc->next) {
229             ASSERT(hc->hsp->query.offset < hc->next->hsp->query.offset);
230             ASSERT(hc->hsp->subject.offset < hc->next->hsp->subject.offset);
231         }
232         hc = hc->next;
233     }
234 
235     chain = chain->next;
236     while (chain) {
237         hc = chain->hsps;
238         ASSERT(hc);
239         ASSERT(hc->hsp);
240         ASSERT(hc->hsp->context == chain->context);
241 
242         ASSERT(hc->hsp->gap_info->size > 1 ||
243                hc->hsp->query.end - hc->hsp->query.offset ==
244                hc->hsp->subject.end - hc->hsp->subject.offset);
245 
246         hc = hc->next;
247         while (hc) {
248             ASSERT(hc);
249             ASSERT(hc->hsp);
250             ASSERT(hc->hsp->context == chain->context);
251 
252             ASSERT(hc->hsp->gap_info->size > 1 ||
253                    hc->hsp->query.end - hc->hsp->query.offset ==
254                    hc->hsp->subject.end - hc->hsp->subject.offset);
255 
256             if (hc->next) {
257                 ASSERT(hc->hsp->query.offset < hc->next->hsp->query.offset);
258                 ASSERT(hc->hsp->subject.offset < hc->next->hsp->subject.offset);
259             }
260 
261             hc = hc->next;
262         }
263         chain = chain->next;
264     }
265 
266     return TRUE;
267 }
268 
269 #if _DEBUG
s_TestChainsSorted(HSPChain * chain)270 static Boolean s_TestChainsSorted(HSPChain* chain)
271 {
272     HSPChain* prev;
273 
274     s_TestChains(chain);
275 
276     prev = chain;
277     chain = chain->next;
278     for (; chain; chain = chain->next, prev = prev->next) {
279         ASSERT(prev->score >= chain->score);
280     }
281 
282     return TRUE;
283 }
284 #endif
285 
286 /** Data structure used by the writer */
287 typedef struct BlastHSPMapperData {
288    BlastHSPMapperParams* params;         /**< how many hits to save */
289    BLAST_SequenceBlk* query;              /**< query sequence */
290    BlastQueryInfo* query_info;            /**< information about queries */
291    HSPChain** saved_chains;               /**< HSP chains are stored here */
292 } BlastHSPMapperData;
293 
294 /*************************************************************/
295 /** The following are implementations for BlastHSPWriter ADT */
296 
297 /** Perform pre-run stage-specific initialization
298  * @param data The internal data structure [in][out]
299  * @param results The HSP results to operate on  [in]
300  */
301 static int
s_BlastHSPMapperPairedInit(void * data,void * results)302 s_BlastHSPMapperPairedInit(void* data, void* results)
303 {
304    BlastHSPMapperData * spl_data = data;
305    BlastHSPResults* r = (BlastHSPResults*)results;
306    spl_data->saved_chains = calloc(r->num_queries, sizeof(HSPChain*));
307 
308    return 0;
309 }
310 
311 
312 /* Get subject start position */
s_FindFragmentStart(HSPChain * chain)313 static Int4 s_FindFragmentStart(HSPChain* chain)
314 {
315     Int2 frame;
316     HSPContainer* hc = NULL;
317     ASSERT(chain);
318     ASSERT(chain->hsps);
319     ASSERT(chain->hsps->hsp);
320 
321     frame = chain->hsps->hsp->query.frame;
322     if (frame > 0) {
323         return chain->hsps->hsp->subject.offset;
324     }
325 
326     hc = chain->hsps;
327     while (hc->next) {
328         hc = hc->next;
329     }
330     ASSERT(hc);
331 
332     return hc->hsp->subject.end - 1;
333 }
334 
335 
336 /* Get subject end position */
337 /* Not used
338 static Int4 s_FindFragmentEnd(HSPChain* chain)
339 {
340     Int2 frame;
341     HSPContainer* hc = NULL;
342     ASSERT(chain);
343     ASSERT(chain->hsps);
344     ASSERT(chain->hsps->hsp);
345 
346     frame = chain->hsps->hsp->query.frame;
347     if (frame < 0) {
348         return chain->hsps->hsp->subject.offset;
349     }
350 
351     hc = chain->hsps;
352     while (hc->next) {
353         hc = hc->next;
354     }
355     ASSERT(hc);
356 
357     return hc->hsp->subject.end - 1;
358 }
359 */
360 
s_ComputeGapScore(Int4 length,Int4 open_score,Int4 extend_score,Int4 seq_error)361 static Int4 s_ComputeGapScore(Int4 length, Int4 open_score, Int4 extend_score,
362                               Int4 seq_error)
363 {
364     if (length < 4) {
365         return length * seq_error;
366     }
367 
368     return open_score + MIN(length, 4) * extend_score;
369 }
370 
371 /* Compute HSP alignment score from Jumper edit script */
s_ComputeAlignmentScore(BlastHSP * hsp,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score)372 static Int4 s_ComputeAlignmentScore(BlastHSP* hsp, Int4 mismatch_score,
373                                     Int4 gap_open_score, Int4 gap_extend_score)
374 {
375     Int4 i;
376     Int4 last_pos = hsp->query.offset;
377     Int4 score = 0;
378     const Int4 kGap = 15;
379     Int4 num_identical = 0;
380     Int4 query_gap = 0;
381     Int4 subject_gap = 0;
382 
383     for (i = 0;i < hsp->map_info->edits->num_edits;i++) {
384         JumperEdit* e = &(hsp->map_info->edits->edits[i]);
385         Int4 num_matches = e->query_pos - last_pos;
386         ASSERT(num_matches >= 0);
387         last_pos = e->query_pos;
388         score += num_matches;
389         num_identical += num_matches;
390 
391         if (e->query_base == kGap) {
392             ASSERT(e->subject_base != kGap);
393             query_gap++;
394 
395             if (subject_gap > 0) {
396                 score += s_ComputeGapScore(subject_gap, -12, -1, -4);
397                 subject_gap = 0;
398             }
399         }
400         else if (e->subject_base == kGap) {
401             subject_gap++;
402             last_pos++;
403 
404             if (query_gap > 0) {
405                 score += s_ComputeGapScore(query_gap, -12, -1, -4);
406                 query_gap = 0;
407             }
408         }
409         else {
410             score += mismatch_score;
411             last_pos++;
412 
413             if (subject_gap > 0) {
414                 score += s_ComputeGapScore(subject_gap, -12, -1, -4);
415                 subject_gap = 0;
416             }
417             if (query_gap > 0) {
418                 score += s_ComputeGapScore(query_gap, -12, -1, -4);
419                 query_gap = 0;
420             }
421         }
422     }
423 
424     if (subject_gap > 0) {
425         score += s_ComputeGapScore(subject_gap, -12, -1, -4);
426         subject_gap = 0;
427     }
428     if (query_gap > 0) {
429         score += s_ComputeGapScore(query_gap, -12, -1, -4);
430         query_gap = 0;
431     }
432 
433     score += hsp->query.end - last_pos;
434     num_identical += hsp->query.end - last_pos;
435     hsp->num_ident = num_identical;
436     return score;
437 }
438 
439 
440 /* Find the cost of chain HSPs overlapping on the query, as the smaller score
441    of the overlapping region */
s_GetOverlapCost(const BlastHSP * a,const BlastHSP * b,Int4 edit_penalty)442 static Int4 s_GetOverlapCost(const BlastHSP* a, const BlastHSP* b,
443                              Int4 edit_penalty)
444 {
445     Int4 i;
446     Int4 overlap_f, overlap_s;
447     const BlastHSP* f;
448     const BlastHSP* s;
449 
450     /* if one HSP in contained within the other on the query, return the
451        smaller score */
452     if ((a->query.offset <= b->query.offset && a->query.end >= b->query.end) ||
453         (a->query.offset >= b->query.offset && a->query.end <= b->query.end)) {
454 
455         return MIN(a->score, b->score);
456     }
457 
458     /* if the two HSPs are mutually exclusive on the query, there is no cost */
459     if ((a->query.end < b->query.offset && a->query.offset < b->query.end) ||
460         (b->query.end < a->query.offset && b->query.offset < a->query.end)) {
461 
462         return 0;
463     }
464 
465     /* otherwise the HSPs partially overlap; reurn the the smaller score for
466        the overlap region */
467 
468     /* find which HSP precedes which on the query */
469     if (a->query.offset <= b->query.offset) {
470         f = a;
471         s = b;
472     }
473     else {
474         f = b;
475         s = a;
476     }
477 
478     /* this is the overlap score assuming perfect matches in the overlap */
479     overlap_f = overlap_s = f->query.end - s->query.offset;
480     ASSERT(overlap_f >= 0 && overlap_s >= 0);
481     /* subtract penalties for mismatches and gaps in the overlap region */
482     for (i = 0;i < f->map_info->edits->num_edits;i++) {
483         if (f->map_info->edits->edits[i].query_pos >= s->query.offset) {
484             overlap_f -= edit_penalty;
485         }
486     }
487     for (i = 0;i < s->map_info->edits->num_edits;i++) {
488         if (s->map_info->edits->edits[i].query_pos < f->query.end) {
489             overlap_s -= edit_penalty;
490         }
491     }
492 
493     return MIN(overlap_f, overlap_s);
494 }
495 
496 
497 /* Compute chain score */
s_ComputeChainScore(HSPChain * chain,const ScoringOptions * score_options,Int4 query_len,Boolean comp_hsp_score)498 static Int4 s_ComputeChainScore(HSPChain* chain,
499                                 const ScoringOptions* score_options,
500                                 Int4 query_len,
501                                 Boolean comp_hsp_score)
502 {
503     Int4 retval;
504     HSPContainer* h = NULL;
505     HSPContainer* prev = NULL;
506 
507     if (!chain || !score_options) {
508         return -1;
509     }
510 
511     h = chain->hsps;
512     if (comp_hsp_score) {
513         h->hsp->score = s_ComputeAlignmentScore(h->hsp, score_options->penalty,
514                                                 score_options->gap_open,
515                                                 score_options->gap_extend);
516     }
517     retval = h->hsp->score;
518 
519     prev = h;
520     h = h->next;
521 
522     for (; h; h = h->next, prev = prev->next) {
523         /* HSPs must be sorted by query positon */
524         ASSERT(h->hsp->query.offset >= prev->hsp->query.offset);
525 
526         if (comp_hsp_score) {
527 
528             h->hsp->score = s_ComputeAlignmentScore(h->hsp,
529                                                     score_options->penalty,
530                                                     score_options->gap_open,
531                                                     score_options->gap_extend);
532         }
533         retval += h->hsp->score;
534 
535         if ((h->hsp->map_info->left_edge & MAPPER_SPLICE_SIGNAL) == 0 ||
536             (prev->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) == 0) {
537 
538             Int4 query_gap =
539                 MAX(h->hsp->query.offset - prev->hsp->query.end, 0);
540 
541             Int4 subject_gap =
542                 MAX(h->hsp->subject.offset - prev->hsp->subject.end, 0);
543 
544             retval += s_ComputeGapScore(query_gap, -12, -1, -4);
545             retval += s_ComputeGapScore(subject_gap, -12, -1, -4);
546         }
547 
548     }
549 
550     return retval;
551 }
552 
553 #if _DEBUG
s_TestHSPRanges(const BlastHSP * hsp)554 static Boolean s_TestHSPRanges(const BlastHSP* hsp)
555 {
556     Int4 i;
557     Int4 d = 0;
558     Int4 q = 0, s = 0;
559     Int4 num_matches;
560     Int4 last_pos;
561     const Int4 kGap = 15;
562     for (i=0;i < hsp->gap_info->size;i++) {
563         switch (hsp->gap_info->op_type[i]) {
564         case eGapAlignIns:
565             d -= hsp->gap_info->num[i];
566             q += hsp->gap_info->num[i];
567             break;
568 
569         case eGapAlignDel:
570             d += hsp->gap_info->num[i];
571             s += hsp->gap_info->num[i];
572             break;
573 
574         case eGapAlignSub:
575             q += hsp->gap_info->num[i];
576             s += hsp->gap_info->num[i];
577 
578         default:
579             break;
580         }
581     }
582     if (hsp->query.end - hsp->query.offset + d !=
583         hsp->subject.end - hsp->subject.offset) {
584 
585         return FALSE;
586     }
587 
588     ASSERT(hsp->query.end - hsp->query.offset == q);
589     ASSERT(hsp->subject.end - hsp->subject.offset == s);
590 
591     d = 0;
592     q = 0;
593     s = 0;
594     last_pos = hsp->query.offset;
595     for (i=0;i < hsp->map_info->edits->num_edits;i++) {
596 
597         num_matches = hsp->map_info->edits->edits[i].query_pos - last_pos - 1;
598         if (i == 0 ||
599             (i > 0 && hsp->map_info->edits->edits[i - 1].query_base == kGap)) {
600             num_matches++;
601         }
602         q += num_matches;
603         s += num_matches;
604 
605         ASSERT(hsp->query.offset + q ==
606                hsp->map_info->edits->edits[i].query_pos);
607 
608         if (hsp->map_info->edits->edits[i].query_base == kGap) {
609             d++;
610             s++;
611         }
612         else if (hsp->map_info->edits->edits[i].subject_base == kGap) {
613             d--;
614             q++;
615         }
616         else {
617             q++;
618             s++;
619         }
620 
621         last_pos = hsp->map_info->edits->edits[i].query_pos;
622     }
623     num_matches = hsp->query.end - last_pos - 1;
624     if (hsp->map_info->edits->num_edits == 0 ||
625         (hsp->map_info->edits->num_edits > 0 &&
626          hsp->map_info->edits->edits[hsp->map_info->edits->num_edits - 1].query_base == kGap)) {
627 
628         num_matches++;
629     }
630     q += num_matches;
631     s += num_matches;
632 
633     ASSERT(hsp->query.end - hsp->query.offset + d ==
634            hsp->subject.end - hsp->subject.offset);
635 
636     ASSERT(hsp->query.end - hsp->query.offset == q);
637     ASSERT(hsp->subject.end - hsp->subject.offset == s);
638 
639     return TRUE;
640 }
641 #endif
642 
643 /* Trim HSP by a number of bases on query or subject, either from the start or
644    from the end */
s_TrimHSP(BlastHSP * hsp,Int4 num,Boolean is_query,Boolean is_start,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score,const Uint1 * query_seq)645 static Int4 s_TrimHSP(BlastHSP* hsp, Int4 num, Boolean is_query,
646                       Boolean is_start, Int4 mismatch_score,
647                       Int4 gap_open_score, Int4 gap_extend_score,
648                       const Uint1* query_seq)
649 {
650     Int4 num_left = num;
651     Int4 i = is_start ? 0 : hsp->gap_info->size - 1;
652     Int4 d = is_start ? 1 : -1;
653     Int4 end = is_start ? hsp->gap_info->size : -1;
654     Int4 delta_query = 0, delta_subject = 0;
655     Boolean is_subject = !is_query;
656     Boolean is_end = !is_start;
657     Int4 k;
658     const Uint1 kGap = 15;
659 
660     ASSERT(hsp /* */ && num > 0 /* */);
661     ASSERT((is_query && num <= hsp->query.end - hsp->query.offset) ||
662            (!is_query && num <= hsp->subject.end - hsp->subject.offset));
663 
664 #if _DEBUG
665     ASSERT(s_TestHSPRanges(hsp));
666 #endif
667 
668     if (num == 0) {
669         return 0;
670     }
671 
672 
673     /* itereate over the edit script and remove the required number
674        of subject bases */
675     while (i != end && num_left > 0) {
676         switch (hsp->gap_info->op_type[i]) {
677         case eGapAlignSub:
678             if (hsp->gap_info->num[i] > num_left) {
679                 hsp->gap_info->num[i] -= num_left;
680                 delta_query += num_left;
681                 delta_subject += num_left;
682                 num_left = 0;
683             }
684             else {
685                 Int4 n = hsp->gap_info->num[i];
686 
687                 delta_query += n;
688                 delta_subject += n;
689                 num_left -= n;
690                 i += d;
691             }
692             break;
693 
694         case eGapAlignDel:
695             if (is_subject) {
696                 if (hsp->gap_info->num[i] > num_left) {
697                     hsp->gap_info->num[i] -= num_left;
698                     delta_subject += num_left;
699                     num_left = 0;
700                 }
701                 else {
702                     delta_subject += hsp->gap_info->num[i];
703                     num_left -= hsp->gap_info->num[i];
704                     i += d;
705                 }
706             }
707             else {
708                 delta_subject += hsp->gap_info->num[i];
709                 i += d;
710             }
711             break;
712 
713         case eGapAlignIns:
714             if (is_query) {
715                 if (hsp->gap_info->num[i] > num_left) {
716                     hsp->gap_info->num[i] -= num_left;
717                     delta_query += num_left;
718                     num_left = 0;
719                 }
720                 else {
721                     delta_query += hsp->gap_info->num[i];
722                     num_left -= hsp->gap_info->num[i];
723                     i += d;
724                 }
725             }
726             else {
727                 delta_query += hsp->gap_info->num[i];
728                 i += d;
729             }
730             break;
731 
732         default:
733             ASSERT(0);
734             break;
735         }
736     }
737 
738     /* shift edit script elements to fill the removed ones */
739     if (is_start && i > 0) {
740         for (k = 0;k < hsp->gap_info->size - i;k++) {
741             hsp->gap_info->op_type[k] = hsp->gap_info->op_type[k + i];
742             hsp->gap_info->num[k] = hsp->gap_info->num[k + i];
743         }
744         hsp->gap_info->size -= i;
745     }
746 
747     if (is_end) {
748         hsp->gap_info->size = i + 1;
749     }
750 
751 
752     if (!hsp->map_info->subject_overhangs) {
753         hsp->map_info->subject_overhangs = calloc(1, sizeof(SequenceOverhangs));
754         if (!hsp->map_info->subject_overhangs) {
755             return -1;
756         }
757     }
758 
759 
760     /* update subject overhangs */
761     if (is_start) {
762         SequenceOverhangs* overhangs = hsp->map_info->subject_overhangs;
763         Uint1* subject_bases = NULL;
764         Int4 offset = 0;  /* query pos -> subject pos */
765 
766         ASSERT((overhangs->left && overhangs->left_len > 0) ||
767                (!overhangs->left && overhangs->left_len == 0));
768 
769         overhangs->left = realloc(overhangs->left,
770                                   (overhangs->left_len + delta_subject) *
771                                   sizeof(Uint1));
772         if (!overhangs->left) {
773             return -1;
774         }
775         subject_bases = overhangs->left + overhangs->left_len;
776         overhangs->left_len += delta_subject;
777 
778         memcpy(subject_bases, query_seq + hsp->query.offset, delta_subject);
779         for (k = 0;k < hsp->map_info->edits->num_edits &&
780                  hsp->map_info->edits->edits[k].query_pos <
781                  hsp->query.offset; k++) {
782 
783             JumperEdit* edits = hsp->map_info->edits->edits + k;
784             if (edits->subject_base == kGap) {
785                 offset--;
786             }
787             else {
788                 ASSERT(edits->query_pos + offset >= 0);
789                 ASSERT(edits->query_pos + offset < delta_subject);
790                 subject_bases[edits->query_pos + offset] = edits->subject_base;
791             }
792         }
793     }
794     else {
795         SequenceOverhangs* overhangs = hsp->map_info->subject_overhangs;
796         Uint1* subject_bases = NULL;
797         Int4 offset /*= hsp->subject.offset - hsp->query.offset*/;
798         Uint1* tmp = NULL;
799 
800         ASSERT((overhangs->right && overhangs->right_len > 0) ||
801                (!overhangs->right && overhangs->right_len == 0));
802 
803         tmp = malloc((overhangs->right_len + delta_subject) * sizeof(Uint1));
804         if (!tmp) {
805             return -1;
806         }
807         memcpy(tmp + delta_subject, overhangs->right, overhangs->right_len);
808         free(overhangs->right);
809         overhangs->right = tmp;
810         subject_bases = overhangs->right;
811 
812         memcpy(subject_bases, query_seq + hsp->query.end - delta_subject,
813                delta_subject);
814 
815 
816         k = 0;
817         while (k < hsp->map_info->edits->num_edits &&
818                hsp->map_info->edits->edits[k].query_pos <
819                hsp->query.end - delta_query) {
820             k++;
821         }
822 
823         offset = -(hsp->query.end - delta_query);
824         for (;k < hsp->map_info->edits->num_edits; k++) {
825 
826             JumperEdit* edits = hsp->map_info->edits->edits + k;
827 
828             if (edits->query_pos >= hsp->query.end) {
829                 ASSERT(edits->query_base == kGap);
830                 continue;
831             }
832 
833             if (edits->subject_base == kGap) {
834                 offset--;
835             }
836             else {
837                 ASSERT(edits->query_pos + offset >= 0);
838                 ASSERT(edits->query_pos + offset < delta_subject);
839                 subject_bases[edits->query_pos + offset] = edits->subject_base;
840             }
841         }
842 
843     }
844 
845 
846     /* update the Jumper edit script */
847     if (is_start) {
848         Int4 k = hsp->map_info->edits->num_edits - 1;
849         Int4 p = hsp->query.end - 1;
850         for (i = hsp->gap_info->size - 1;i >= 0;i--) {
851             if (hsp->gap_info->op_type[i] != eGapAlignDel) {
852 
853                 p -= hsp->gap_info->num[i];
854                 while (k >= 0 &&
855                        hsp->map_info->edits->edits[k].query_pos > p &&
856                        hsp->map_info->edits->edits[k].query_base != kGap) {
857 
858                     k--;
859                 }
860             }
861             else {
862                 Int4 j;
863                 for (j = 0;j < hsp->gap_info->num[i];j++) {
864                     ASSERT(k >= 0);
865                     ASSERT(hsp->map_info->edits->edits[k].query_base == kGap);
866                     k--;
867                 }
868             }
869         }
870         k++;
871         if (k > 0) {
872             for (i = 0;i < hsp->map_info->edits->num_edits - k;i++) {
873                 hsp->map_info->edits->edits[i] =
874                     hsp->map_info->edits->edits[i + k];
875             }
876             hsp->map_info->edits->num_edits -= k;
877             ASSERT(hsp->map_info->edits->num_edits >= 0);
878         }
879     }
880     else {
881         Int4 k = 0;
882         Int4 p = hsp->query.offset;
883         const Uint1 kGap = 15;
884 
885         for (i = 0;i < hsp->gap_info->size;i++) {
886             if (hsp->gap_info->op_type[i] != eGapAlignDel) {
887 
888                 p += hsp->gap_info->num[i];
889                 while (k < hsp->map_info->edits->num_edits &&
890                        hsp->map_info->edits->edits[k].query_pos < p &&
891                        hsp->map_info->edits->edits[k].query_base != kGap) {
892 
893                     k++;
894                 }
895             }
896             else {
897                 Int4 j;
898                 for (j = 0;j < hsp->gap_info->num[i];j++) {
899                     ASSERT(hsp->map_info->edits->edits[k].query_base == kGap);
900                     k++;
901                 }
902             }
903         }
904         hsp->map_info->edits->num_edits = k;
905     }
906 
907     /* update HSP start positions */
908     if (is_start) {
909         hsp->query.offset += delta_query;
910         hsp->subject.offset += delta_subject;
911     }
912     else {
913         hsp->query.end -= delta_query;
914         hsp->subject.end -= delta_subject;
915     }
916 
917     /* update HSP score */
918     hsp->score = s_ComputeAlignmentScore(hsp, mismatch_score, gap_open_score,
919                                          gap_extend_score);
920 
921 #if _DEBUG
922     ASSERT(s_TestHSPRanges(hsp));
923 #endif
924 
925     return 0;
926 }
927 
928 
929 #define NUM_ADAPTERS 4
930 #define MAX_ADAPTER_LEN 20
931 
932 /* Find adapeter on the 5' end in a single sequence. The search will start
933    towards the end of the last HSP (from, to) */
s_FindAdapterInSequence(Int4 hsp_from,Int4 hsp_to,Uint1 * query,Int4 query_len)934 static Int4 s_FindAdapterInSequence(Int4 hsp_from, Int4 hsp_to, Uint1* query,
935                                     Int4 query_len)
936 {
937     Uint1 adapters_tab[NUM_ADAPTERS][MAX_ADAPTER_LEN] = {
938         /* Contaminants from FastQC config file:
939            http://www.bioinformatics.babraham.ac.uk/projects/fastqc/ */
940         /* Illumina universal adapter: AGATCGGAAGAG */
941         {0, 2, 0, 3, 1, 2, 2, 0, 0, 2, 0, 2},
942         /* Illumina small RNA adapter: ATGGAATTCTCG */
943         {0, 3, 2, 2, 0, 0, 3, 3, 1, 3, 1, 2},
944         /* Nextera transosase sequence: CTGTCTCTTATA */
945         {1, 3, 2, 3, 1, 3, 1, 3, 3, 0, 3, 0},
946         /* Illumina multiplexing adapter 1: GATCGGAAGAGCACACGTCT */
947         {2, 0, 3, 1, 2, 2, 0, 0, 2, 0, 2, 1, 0, 1, 0, 1, 2, 3, 1, 3}};
948 
949     int lengths[NUM_ADAPTERS] = {12, 12, 12, 20};
950     Boolean found = FALSE;
951     Int4 adapter_start = -1;
952     int adptr_idx;
953     Int4 from = hsp_from, to = hsp_to;
954     const Int4 kMaxErrors = 1; /* max number of mismaches allowed for matching
955                                   the adapter sequence + 1*/
956 
957 
958     if (!query) {
959         return -1;
960     }
961 
962     for (adptr_idx = 0;!found && adptr_idx < NUM_ADAPTERS;adptr_idx++) {
963         Uint1* adapter = adapters_tab[adptr_idx];
964         Uint4 word = *(Uint4*)adapter;
965         Int4 q = MAX(to - lengths[adptr_idx], from);
966         int i = 0;
967         while (q < query_len - 4 && q + i < query_len && i < lengths[adptr_idx]) {
968             while (q < query_len - 4 && *(Uint4*)(query + q) != word) {
969                 q++;
970             }
971             if (q < query_len - 4) {
972                 Int4 errors = kMaxErrors + 1;
973                 i = 0;
974                 while (q + i < query_len && i < lengths[adptr_idx] &&
975                        errors) {
976 
977                     if (query[q + i] != adapter[i]) {
978                         errors--;
979                     }
980                     i++;
981                 }
982                 if (q + i == query_len || i == lengths[adptr_idx]) {
983                     adapter_start = q;
984                     found = TRUE;
985                     break;
986                 }
987 
988                 q++;
989             }
990         }
991     }
992 
993     ASSERT(adapter_start <= query_len);
994     return adapter_start;
995 }
996 
997 
998 /* Set adapter position in the chain and trim alignments that span beyond
999    adapter start position */
s_SetAdapter(HSPChain ** chains_ptr,Int4 adapter_pos,const Uint1 * query,Int4 query_len,const ScoringOptions * scores)1000 static Int2 s_SetAdapter(HSPChain** chains_ptr, Int4 adapter_pos,
1001                          const Uint1* query, Int4 query_len,
1002                          const ScoringOptions* scores)
1003 {
1004     HSPChain* head = NULL;
1005     HSPChain* chain = NULL;
1006     const Int4 kMinAdapterLen = 3;
1007 
1008     if (!chains_ptr || !*chains_ptr || adapter_pos < 0) {
1009         return -1;
1010     }
1011 
1012     head = *chains_ptr;
1013     chain = *chains_ptr;
1014     /* iterate over chains */
1015     while (chain) {
1016         BlastHSP* hsp = chain->hsps->hsp;
1017 
1018         /* check the 3' alignment extent on the query and skip this chain
1019            if the query is covered almost to the end */
1020         if (hsp->query.frame > 0) {
1021             /* for positive strand, check the last query position */
1022             HSPContainer* h = chain->hsps;
1023             while (h->next) {
1024                 h = h->next;
1025             }
1026             ASSERT(h);
1027             if (query_len - h->hsp->query.end < kMinAdapterLen) {
1028                 chain = chain->next;
1029                 continue;
1030             }
1031         }
1032         else {
1033             /* for negative strand, check the first query position */
1034             ASSERT(hsp);
1035             if (hsp->query.offset < kMinAdapterLen) {
1036                 chain = chain->next;
1037                 continue;
1038             }
1039         }
1040 
1041         /* set adapter start position in the chain */
1042         chain->adapter = adapter_pos;
1043 
1044         /* Trim alignment that covers part of the adapter */
1045         /* for query on the positive strand */
1046         if (hsp->query.frame > 0) {
1047             HSPContainer* h = chain->hsps;
1048 
1049             /* if all HSPs are behind the adapter, delete this chain;
1050                we require that HSP without adapter is at least 5 bases long */
1051             if (h->hsp->query.offset >= adapter_pos - 5) {
1052                 HSPChain* prev = head;
1053                 HSPChain* next = chain->next;
1054                 while (prev && prev->next != chain) {
1055                     prev =prev->next;
1056                 }
1057 
1058                 chain->next = NULL;
1059                 HSPChainFree(chain);
1060                 chain = next;
1061                 if (prev) {
1062                     prev->next = next;
1063                 }
1064                 else {
1065                     head = chain;
1066                 }
1067 
1068                 continue;
1069             }
1070 
1071             /* find the first HSP with end position after adapter */
1072             while (h && h->hsp->query.end <= adapter_pos) {
1073                 h = h->next;
1074             }
1075 
1076             /* if one is found */
1077             if (h) {
1078                 HSPContainer* hh = NULL;
1079                 hsp = h->hsp;
1080 
1081                 /* if adapter is with in the HSP trim the HSP */
1082                 if (hsp->query.offset < adapter_pos ) {
1083                     Int4 old_score = hsp->score;
1084                     ASSERT(hsp->query.end - adapter_pos > 0);
1085                     s_TrimHSP(hsp, hsp->query.end - adapter_pos, TRUE,
1086                               FALSE, scores->penalty, scores->gap_open,
1087                               scores->gap_extend, query);
1088                     chain->score += hsp->score - old_score;
1089 
1090                     /* remove HSPs after h as they are past the adapter */
1091                     if (h->next) {
1092                         h->next = HSPContainerFree(h->next);
1093                     }
1094                 }
1095                 else {
1096                     /* otherwise delete h and everything after it */
1097                     hh = chain->hsps;
1098                     while (hh && hh->next != h) {
1099                         hh = hh->next;
1100                     }
1101                     ASSERT(hh);
1102                     hh->next = HSPContainerFree(hh->next);
1103                 }
1104 
1105                 /* compute the new chain score */
1106                 chain->score = s_ComputeChainScore(chain, scores, query_len,
1107                                                    FALSE);
1108             }
1109 
1110             /* mark adapter in the HSPs */
1111             h = chain->hsps;
1112             while (h->next) {
1113                 h = h->next;
1114             }
1115             h->hsp->map_info->right_edge |= MAPPER_ADAPTER;
1116             h->hsp->map_info->right_edge |= MAPPER_EXON;
1117         }
1118         else {
1119             /* negative strand: adapter is in the beginning of the chain */
1120             Int4 pos_minus = query_len - adapter_pos - 1;
1121             HSPContainer* h = chain->hsps;
1122 
1123             /* find the first HSP with end position after adapter;
1124                we require that HSP without adapter is at least 5 bases long */
1125             while (h && h->hsp->query.end <= pos_minus + 5) {
1126                 h = h->next;
1127             }
1128 
1129             /* if all HSPs are before the adapter, delete this chain */
1130             if (!h) {
1131                 /* remove chain */
1132                 HSPChain* prev = head;
1133                 HSPChain* next = chain->next;
1134                 while (prev && prev->next != chain) {
1135                     prev =prev->next;
1136                 }
1137 
1138                 chain->next = NULL;
1139                 HSPChainFree(chain);
1140                 chain = next;
1141                 if (prev) {
1142                     prev->next = next;
1143                 }
1144                 else {
1145                     head = chain;
1146                 }
1147 
1148                 continue;
1149             }
1150 
1151             /* if h is not the first HSP in the chain, remove all HSPs before
1152                h */
1153             if (h != chain->hsps) {
1154                 HSPContainer* hh = chain->hsps;
1155                 while (hh && hh->next != h) {
1156                     hh = hh->next;
1157                 }
1158                 ASSERT(hh);
1159                 hh->next = NULL;
1160                 HSPContainerFree(chain->hsps);
1161                 chain->hsps = h;
1162 
1163                 /* compute new chain score */
1164                 chain->score = s_ComputeChainScore(chain, scores, query_len,
1165                                                    FALSE);
1166             }
1167             ASSERT(h);
1168 
1169             /* set adapter information */
1170             hsp = chain->hsps->hsp;
1171             hsp->map_info->left_edge |= MAPPER_ADAPTER;
1172             hsp->map_info->left_edge |= MAPPER_EXON;
1173 
1174             /* trim HSP if necessary */
1175             if (pos_minus >= hsp->query.offset) {
1176                 Int4 old_score = hsp->score;
1177                 ASSERT(pos_minus - hsp->query.offset + 1 > 0);
1178                 s_TrimHSP(hsp, pos_minus - hsp->query.offset + 1,
1179                           TRUE, TRUE, scores->penalty, scores->gap_open,
1180                           scores->gap_extend, query);
1181                 chain->score += hsp->score - old_score;
1182             }
1183         }
1184 
1185         chain = chain->next;
1186     }
1187 
1188     ASSERT(!head || s_TestChains(head));
1189     *chains_ptr = head;
1190 
1191     return 0;
1192 }
1193 
1194 
s_FindAdapters(HSPChain ** saved,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * score_opts)1195 static Int4 s_FindAdapters(HSPChain** saved,
1196                            const BLAST_SequenceBlk* query_blk,
1197                            const BlastQueryInfo* query_info,
1198                            const ScoringOptions* score_opts)
1199 {
1200     Int4 query_idx;
1201 
1202     for (query_idx = 0;query_idx < query_info->num_queries;query_idx++) {
1203         HSPChain* chain = NULL;
1204         HSPChain* ch = NULL;
1205         Uint1* query = NULL;
1206         Int4 query_len;
1207         Int4 from = -1, to = -1;
1208         BlastHSP* hsp = NULL;
1209         Int4 overhang = 0;
1210         Int4 adapter_pos = -1;
1211 
1212         if (!saved[query_idx]) {
1213             continue;
1214         }
1215 
1216         /* we search for adapters and  only in the plus strand of the query */
1217         query = query_blk->sequence +
1218             query_info->contexts[query_idx * NUM_STRANDS].query_offset;
1219         ASSERT(query);
1220         query_len = query_info->contexts[query_idx * NUM_STRANDS].query_length;
1221 
1222         /* find the best scoring chain */
1223         chain = saved[query_idx];
1224         ch = chain->next;
1225         for (; ch; ch = ch->next) {
1226             if (ch->score > chain->score) {
1227                 chain = ch;
1228             }
1229         }
1230         ASSERT(chain);
1231 
1232         /* find query coverage by the whole chain */
1233         if (chain->hsps->hsp->query.frame > 0) {
1234             HSPContainer* h = chain->hsps;
1235             from = h->hsp->query.offset;
1236             while (h->next) {
1237                 h = h->next;
1238             }
1239             to = h->hsp->query.end;
1240         }
1241         else {
1242             HSPContainer* h = chain->hsps;
1243             to = query_len - h->hsp->query.offset;
1244             while (h->next) {
1245                 h = h->next;
1246             }
1247             from = query_len - h->hsp->query.end;
1248         }
1249         ASSERT(from >= 0 && to >= 0);
1250 
1251         /* do not search for adapters if the query is mostly covered by the
1252            best scoring chain */
1253         if (from < 20 && to > query_len - 3) {
1254             continue;
1255         }
1256 
1257         /* find the chain with the longest overhang */
1258         chain = saved[query_idx];
1259         ch = chain->next;
1260         for (; ch; ch = ch->next) {
1261             HSPContainer* h = ch->hsps;
1262             if (h->hsp->query.frame > 0) {
1263                 while (h->next) {
1264                     h = h->next;
1265                 }
1266                 if (query_len - h->hsp->query.end > overhang) {
1267                     overhang = query_len - h->hsp->query.end;
1268                     chain = ch;
1269                 }
1270             }
1271             else {
1272                 if (h->hsp->query.offset + 1 > overhang) {
1273                     overhang = h->hsp->query.offset + 1;
1274                     chain = ch;
1275                 }
1276             }
1277         }
1278         ASSERT(chain);
1279 
1280         /* find location from where to search for the adapter */
1281         hsp = chain->hsps->hsp;
1282         if (hsp->query.frame > 0) {
1283             HSPContainer* h = chain->hsps;
1284             while (h->next) {
1285                 h = h->next;
1286             }
1287             hsp = h->hsp;
1288         }
1289         ASSERT(hsp);
1290 
1291         if (hsp->query.frame > 0) {
1292             from = hsp->query.offset;
1293             to  = hsp->query.end;
1294         }
1295         else {
1296             from = query_len - hsp->query.end;
1297             to = query_len - hsp->query.offset;
1298         }
1299 
1300         /* do not search for adapters if the read aligns almost to the end */
1301         if (to >= query_len - 3) {
1302             continue;
1303         }
1304 
1305         /* search for adapters */
1306         adapter_pos = s_FindAdapterInSequence(from, to, query, query_len);
1307 
1308         /* set adapter information in all chains and trim HSPs that extend
1309            into the adapter */
1310         if (adapter_pos >= 0) {
1311             s_SetAdapter(&(saved[query_idx]), adapter_pos, query, query_len,
1312                          score_opts);
1313         }
1314     }
1315 
1316     return 0;
1317 }
1318 
1319 
1320 /* Search for polyA tail at the end of a sequence and return the start position */
s_FindPolyAInSequence(Uint1 * sequence,Int4 length)1321 static Int4 s_FindPolyAInSequence(Uint1* sequence, Int4 length)
1322 {
1323     Int4 i;
1324     const Uint1 kBaseA = 0;
1325     Int4 num_a = 0;
1326     Int4 err = 0;
1327     const Int4 kMaxErrors = 3;
1328 
1329     if (!sequence) {
1330         return -1;
1331     }
1332 
1333     /* iterate over positions untile kMaxErrors non A bases are found */
1334     i = length - 1;
1335     while (i >= 0 && err < kMaxErrors) {
1336         if (sequence[i] != kBaseA) {
1337             err++;
1338         }
1339 
1340         i--;
1341     }
1342     i++;
1343 
1344     /* find the beginnig of the string of As */
1345     while (i < length - 1 &&
1346            (sequence[i] != kBaseA || sequence[i + 1] != kBaseA)) {
1347 
1348         if (sequence[i] != kBaseA) {
1349             err--;
1350         }
1351         i++;
1352     }
1353 
1354     num_a = length - i - err;
1355 
1356     /* short tails must have no errors */
1357     if (num_a < 3 || (num_a < 5 && err > 0)) {
1358         return -1;
1359     }
1360 
1361     return i;
1362 }
1363 
1364 
1365 /* Set PolyA information in HSP chains */
s_SetPolyATail(HSPChain * chains,Int4 positive_start,Int4 negative_start,Int4 query_len)1366 static Int4 s_SetPolyATail(HSPChain* chains, Int4 positive_start,
1367                            Int4 negative_start, Int4 query_len)
1368 {
1369     HSPChain* ch = NULL;
1370 
1371     if (!chains) {
1372         return -1;
1373     }
1374 
1375     for (ch = chains; ch; ch = ch->next) {
1376         HSPContainer* h = ch->hsps;
1377         while (h->next) {
1378             h = h->next;
1379         }
1380 
1381         if (query_len - h->hsp->query.end < 5) {
1382             continue;
1383         }
1384 
1385         if ((h->hsp->query.frame < 0 && negative_start >= 0) ||
1386             (h->hsp->query.frame > 0 && positive_start >= 0)) {
1387 
1388             h->hsp->map_info->right_edge |= MAPPER_POLY_A;
1389             h->hsp->map_info->right_edge |= MAPPER_EXON;
1390 
1391             if (h->hsp->query.frame > 0) {
1392                 ch->polyA = MAX(positive_start, h->hsp->query.end);
1393             }
1394             else {
1395                 ch->polyA = MAX(negative_start, h->hsp->query.end);
1396             }
1397         }
1398     }
1399 
1400     return 0;
1401 }
1402 
1403 
s_FindPolyATails(HSPChain ** saved,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info)1404 static Int4 s_FindPolyATails(HSPChain** saved,
1405                              const BLAST_SequenceBlk* query_blk,
1406                              const BlastQueryInfo* query_info)
1407 {
1408     Int4 query_idx;
1409 
1410     /* for each query */
1411     for (query_idx = 0;query_idx < query_info->num_queries;query_idx++) {
1412         HSPChain* chain = NULL;
1413         HSPChain* ch = NULL;
1414         Uint1* query = NULL;
1415         Int4 query_len;
1416         Int4 from = -1, to = -1;
1417         Int4 positive_start, negative_start;
1418 
1419         /* skip queries with no results and those with adapters */
1420         if (!saved[query_idx] || saved[query_idx]->adapter >= 0) {
1421             continue;
1422         }
1423 
1424         query_len = query_info->contexts[query_idx * NUM_STRANDS].query_length;
1425 
1426         /* find the best scoring chain */
1427         chain = saved[query_idx];
1428         ch = chain->next;
1429         for (; ch; ch = ch->next) {
1430             if (ch->score > chain->score) {
1431                 chain = ch;
1432             }
1433         }
1434         ASSERT(chain);
1435 
1436         /* find query coverage by the whole chain */
1437         if (chain->hsps->hsp->query.frame > 0) {
1438             HSPContainer* h = chain->hsps;
1439             from = h->hsp->query.offset;
1440             while (h->next) {
1441                 h = h->next;
1442             }
1443             to = h->hsp->query.end;
1444         }
1445         else {
1446             HSPContainer* h = chain->hsps;
1447             to = query_len - h->hsp->query.offset;
1448             while (h->next) {
1449                 h = h->next;
1450             }
1451             from = query_len - h->hsp->query.end;
1452         }
1453         ASSERT(from >= 0 && to >= 0);
1454 
1455         /* do not search for polyA tails if the query is mostly covered by the
1456            best scoring chain */
1457         if (from < 4 && to > query_len - 3) {
1458             continue;
1459         }
1460 
1461         /* search for polyA tail on the positive strand */
1462         query = query_blk->sequence +
1463             query_info->contexts[query_idx * NUM_STRANDS].query_offset;
1464         positive_start = s_FindPolyAInSequence(query, query_len);
1465 
1466         /* search for the polyA tail on the negative strand */
1467         query = query_blk->sequence +
1468             query_info->contexts[query_idx * NUM_STRANDS + 1].query_offset;
1469         negative_start = s_FindPolyAInSequence(query, query_len);
1470 
1471         /* set polyA start positions and HSP flags in the saved chains */
1472         if (positive_start >= 0 || negative_start >= 0) {
1473             s_SetPolyATail(saved[query_idx], positive_start, negative_start,
1474                            query_len);
1475         }
1476     }
1477 
1478     return 0;
1479 }
1480 
1481 
1482 /* Merge two HSPs into one. The two HSPs may only be separated by at most
1483    one gap or mismatch. Function returns NULL is the HSPs cannon be merged */
s_MergeHSPs(const BlastHSP * first,const BlastHSP * second,const Uint1 * query,const ScoringOptions * score_opts)1484 static BlastHSP* s_MergeHSPs(const BlastHSP* first, const BlastHSP* second,
1485                              const Uint1* query,
1486                              const ScoringOptions* score_opts)
1487 {
1488     BlastHSP* merged_hsp = NULL;  /* this will be the result */
1489     const BlastHSP* hsp = second;
1490     Int4 query_gap;
1491     Int4 subject_gap;
1492     Int4 mismatches = 0;
1493     Int4 gap_info_size;
1494     Int4 edits_size;
1495     Int4 k;
1496     Int4 offset;
1497     const Uint1 kGap = 15;
1498 
1499     if (!first || !second || !query || !score_opts) {
1500         return NULL;
1501     }
1502 
1503     merged_hsp = Blast_HSPClone(first);
1504     if (!merged_hsp) {
1505         return NULL;
1506     }
1507 
1508     query_gap = second->subject.offset - first->subject.end;
1509     subject_gap = second->query.offset - first->query.end;
1510 
1511     if (query_gap < 0 || subject_gap < 0) {
1512 
1513         return NULL;
1514     }
1515 
1516     if (MAX(query_gap, subject_gap) < 4) {
1517         mismatches = MIN(query_gap, subject_gap);
1518         query_gap -= mismatches;
1519         subject_gap -= mismatches;
1520     }
1521 
1522     gap_info_size = first->gap_info->size + second->gap_info->size + 3;
1523 
1524     edits_size = first->map_info->edits->num_edits +
1525         second->map_info->edits->num_edits +
1526         mismatches + query_gap + subject_gap;
1527 
1528     /* FIXME: should be done through an API */
1529     /* reallocate memory for edit scripts */
1530     merged_hsp->gap_info->op_type = realloc(merged_hsp->gap_info->op_type,
1531                                             gap_info_size *
1532                                             sizeof(EGapAlignOpType));
1533 
1534     merged_hsp->gap_info->num = realloc(merged_hsp->gap_info->num,
1535                                         gap_info_size *
1536                                         sizeof(Int4));
1537 
1538     merged_hsp->map_info->edits->edits = realloc(
1539                                        merged_hsp->map_info->edits->edits,
1540                                        edits_size * sizeof(JumperEdit));
1541 
1542     if (!merged_hsp->gap_info->op_type ||
1543         !merged_hsp->gap_info->num ||
1544         !merged_hsp->map_info->edits->edits) {
1545 
1546         Blast_HSPFree(merged_hsp);
1547         return NULL;
1548     }
1549 
1550     /* add mismatches to gap_info */
1551     if (mismatches > 0) {
1552         if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1553             == eGapAlignSub) {
1554             merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1555                 mismatches;
1556         }
1557         else {
1558             merged_hsp->gap_info->op_type[merged_hsp->gap_info->size] =
1559                 eGapAlignSub;
1560             merged_hsp->gap_info->num[merged_hsp->gap_info->size] =
1561                 mismatches;
1562             merged_hsp->gap_info->size++;
1563         }
1564         ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1565     }
1566 
1567 
1568     /* add query gap to gap info */
1569     if (query_gap > 0) {
1570         if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1571             == eGapAlignDel) {
1572 
1573             merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1574                 query_gap;
1575         }
1576         else {
1577             merged_hsp->gap_info->op_type[merged_hsp->gap_info->size] =
1578                 eGapAlignDel;
1579             merged_hsp->gap_info->num[merged_hsp->gap_info->size] =
1580                 query_gap;
1581             merged_hsp->gap_info->size++;
1582         }
1583         ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1584     }
1585 
1586     /* add subject gap to gap info */
1587     if (subject_gap > 0) {
1588         if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1589             == eGapAlignIns) {
1590 
1591             merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1592                 subject_gap;
1593         }
1594         else {
1595             merged_hsp->gap_info->op_type[merged_hsp->gap_info->size] =
1596                 eGapAlignIns;
1597             merged_hsp->gap_info->num[merged_hsp->gap_info->size] =
1598                 subject_gap;
1599             merged_hsp->gap_info->size++;
1600         }
1601         ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1602     }
1603 
1604     /* merge gap_info */
1605     for (k = 0;k < hsp->gap_info->size;k++) {
1606 
1607         if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1608             == hsp->gap_info->op_type[k]) {
1609 
1610             merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1611                 hsp->gap_info->num[k];
1612         }
1613         else {
1614             merged_hsp->gap_info->op_type[merged_hsp->gap_info->size]
1615                 = hsp->gap_info->op_type[k];
1616 
1617             merged_hsp->gap_info->num[merged_hsp->gap_info->size++]
1618                 = hsp->gap_info->num[k];
1619         }
1620         ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1621     }
1622 
1623     /* find translation of a query into the subject position:
1624        query pos + offset = subject pos */
1625     offset = merged_hsp->subject.offset - merged_hsp->query.offset;
1626     for (k = 0;k < merged_hsp->map_info->edits->num_edits; k++) {
1627         if (merged_hsp->map_info->edits->edits[k].query_base == kGap) {
1628             offset++;
1629         }
1630         else if (merged_hsp->map_info->edits->edits[k].subject_base == kGap) {
1631             offset--;
1632         }
1633     }
1634 
1635     /* add mismatches to jumper edits */
1636     if (mismatches > 0) {
1637         for (k = 0;k < mismatches;k++) {
1638             Int4 subject_pos;
1639             JumperEdit* edit = merged_hsp->map_info->edits->edits +
1640                 merged_hsp->map_info->edits->num_edits++;
1641 
1642             edit->query_pos = merged_hsp->query.end + k;
1643             /* FIXME: Mismatch bases cannot be currently set because there is
1644                no access to query or subject sequence in this function. */
1645             edit->query_base = query[edit->query_pos];
1646 
1647 
1648             subject_pos = edit->query_pos + offset - merged_hsp->subject.end;
1649             if (merged_hsp->map_info->subject_overhangs &&
1650                 merged_hsp->map_info->subject_overhangs->right &&
1651                 subject_pos <
1652                 merged_hsp->map_info->subject_overhangs->right_len) {
1653 
1654                 edit->subject_base =
1655                     merged_hsp->map_info->subject_overhangs->right[subject_pos];
1656             }
1657             else {
1658                 edit->subject_base = edit->query_base;
1659             }
1660 
1661 
1662             ASSERT(merged_hsp->map_info->edits->num_edits <= edits_size);
1663         }
1664     }
1665 
1666     /* add query gap to jumper edits */
1667     if (query_gap > 0) {
1668         for (k = 0;k < query_gap;k++) {
1669             Int4 subject_pos;
1670             JumperEdit* edit = merged_hsp->map_info->edits->edits +
1671                 merged_hsp->map_info->edits->num_edits++;
1672 
1673             edit->query_pos = merged_hsp->query.end + mismatches;
1674             /* FIXME: Mismatch bases cannot be currently set because there is
1675                no access to query or subject sequence in this function. */
1676             edit->query_base = kGap;
1677 
1678             subject_pos = edit->query_pos + offset - merged_hsp->subject.end;
1679             if (merged_hsp->map_info->subject_overhangs &&
1680                 merged_hsp->map_info->subject_overhangs->right &&
1681                 subject_pos <
1682                 merged_hsp->map_info->subject_overhangs->right_len) {
1683 
1684                 edit->subject_base =
1685                     merged_hsp->map_info->subject_overhangs->right[subject_pos];
1686             }
1687             else {
1688                 edit->subject_base = 0;
1689             }
1690             offset++;
1691 
1692             ASSERT(merged_hsp->map_info->edits->num_edits <= edits_size);
1693         }
1694     }
1695 
1696     /* add subject gap to jumper edits */
1697     if (subject_gap > 0) {
1698         for (k = 0;k < subject_gap;k++) {
1699             JumperEdit* edit = merged_hsp->map_info->edits->edits +
1700                 merged_hsp->map_info->edits->num_edits++;
1701 
1702             edit->query_pos = merged_hsp->query.end + mismatches + k;
1703             /* FIXME: Mismatch bases cannot be currently set because there is
1704                no access to query or subject sequence in this function. */
1705             edit->query_base = query[edit->query_pos];
1706             edit->subject_base = kGap;
1707 
1708             ASSERT(merged_hsp->map_info->edits->num_edits <= edits_size);
1709         }
1710     }
1711 
1712     /* merge jumper edits */
1713     if (hsp->map_info->edits->num_edits) {
1714         JumperEdit* edit = merged_hsp->map_info->edits->edits +
1715             merged_hsp->map_info->edits->num_edits;
1716 
1717         ASSERT(merged_hsp->map_info->edits->num_edits +
1718                hsp->map_info->edits->num_edits <= edits_size);
1719 
1720         memcpy(edit, hsp->map_info->edits->edits,
1721                hsp->map_info->edits->num_edits * sizeof(JumperEdit));
1722 
1723         merged_hsp->map_info->edits->num_edits +=
1724             hsp->map_info->edits->num_edits;
1725     }
1726 
1727     /* update alignment extents, scores, etc */
1728     merged_hsp->query.end = hsp->query.end;
1729     merged_hsp->subject.end = hsp->subject.end;
1730     merged_hsp->score = s_ComputeAlignmentScore(merged_hsp,
1731                                                 score_opts->penalty,
1732                                                 score_opts->gap_open,
1733                                                 score_opts->gap_extend);
1734 
1735     merged_hsp->map_info->right_edge = hsp->map_info->right_edge;
1736     if (!merged_hsp->map_info->subject_overhangs) {
1737         return merged_hsp;
1738     }
1739 
1740     if (!hsp->map_info->subject_overhangs ||
1741         !hsp->map_info->subject_overhangs->right_len ||
1742         !hsp->map_info->subject_overhangs->right) {
1743 
1744         merged_hsp->map_info->subject_overhangs->right_len = 0;
1745         if (merged_hsp->map_info->subject_overhangs->right) {
1746             sfree(merged_hsp->map_info->subject_overhangs->right);
1747             merged_hsp->map_info->subject_overhangs->right = NULL;
1748         }
1749     }
1750     else {
1751         Int4 new_right_len = hsp->map_info->subject_overhangs->right_len;
1752         if (merged_hsp->map_info->subject_overhangs->right_len <
1753             new_right_len) {
1754 
1755             merged_hsp->map_info->subject_overhangs->right =
1756                 realloc(merged_hsp->map_info->subject_overhangs->right,
1757                         new_right_len * sizeof(Uint1));
1758         }
1759         memcpy(merged_hsp->map_info->subject_overhangs->right,
1760                hsp->map_info->subject_overhangs->right,
1761                new_right_len * sizeof(Uint1));
1762     }
1763 
1764     return merged_hsp;
1765 }
1766 
1767 
1768 /* Find paired reads mapped to different database sequences */
s_FindRearrangedPairs(HSPChain ** saved,const BlastQueryInfo * query_info)1769 static int s_FindRearrangedPairs(HSPChain** saved,
1770                                  const BlastQueryInfo* query_info)
1771 {
1772     Int4 query_idx;
1773 
1774     /* iterate over single reads from each pair */
1775     for (query_idx = 0;query_idx + 1 < query_info->num_queries; query_idx++) {
1776         HSPChain* chain = saved[query_idx];
1777         HSPChain* thepair = saved[query_idx + 1];
1778 
1779         /* skip queries with no results */
1780         if (!chain || !thepair) {
1781             continue;
1782         }
1783 
1784         /* skip queries that do not have pairs */
1785         if (query_info->contexts[query_idx * NUM_STRANDS].segment_flags !=
1786             eFirstSegment) {
1787 
1788             continue;
1789         }
1790 
1791         /* skip queries with more than one saved chain and ones that already
1792            have a pair */
1793         if (chain->next || chain->pair || thepair->next) {
1794             continue;
1795         }
1796 
1797         /* skip chains aligned to the same subject; these are taken care of
1798            elsewhere */
1799         if (chain->oid == thepair->oid) {
1800             continue;
1801         }
1802 
1803         /* skip HSP chains aligned on the same strands */
1804         if (SIGN(chain->hsps->hsp->query.frame) ==
1805             SIGN(thepair->hsps->hsp->query.frame)) {
1806             continue;
1807         }
1808 
1809         /* mark the pair */
1810         chain->pair = thepair;
1811         thepair->pair = chain;
1812 
1813         chain->pair_conf = PAIR_PARALLEL;
1814         thepair->pair_conf = PAIR_PARALLEL;
1815     }
1816 
1817     return 0;
1818 }
1819 
1820 
1821 /* Sort chains by score in descending order */
s_CompareChainsByScore(const void * cha,const void * chb)1822 static int s_CompareChainsByScore(const void* cha, const void* chb)
1823 {
1824     const HSPChain* a = *((HSPChain**)cha);
1825     const HSPChain* b = *((HSPChain**)chb);
1826 
1827     if (a->score < b->score) {
1828         return 1;
1829     }
1830     else if (a->score > b->score) {
1831         return -1;
1832     }
1833     else {
1834         return 0;
1835     }
1836 }
1837 
1838 /* Remove chains with scores below the these of the best ones considering
1839    score bonus for pairs and count the number of chains with same of better
1840    score for each chain */
s_PruneChains(HSPChain ** saved,Int4 num_queries,Int4 pair_bonus)1841 static int s_PruneChains(HSPChain** saved, Int4 num_queries, Int4 pair_bonus)
1842 
1843 {
1844     Int4 query_idx;
1845     Int4 array_size = 10;
1846     HSPChain** array = calloc(array_size, sizeof(HSPChain*));
1847 
1848     /* Prunning is done in two passes: first considering single chains,
1849        then considering pairs. The first pass breaks pairs where there is much
1850        better chain with no pair. The second pass selects the best scoring
1851        chains giving preference to pairs. Pairs are preferable, so they get
1852        additional score. Two passes are requires, because we compare single
1853        chains with single chains and pairs with pairs. This is to avoid
1854        breaking pairs in situations with two pairs with one high scoring
1855        chain. */
1856 
1857     /* first pass: for each query remove poor scoring chains that may split
1858        pairs */
1859     for (query_idx = 0; query_idx < num_queries; query_idx++) {
1860         HSPChain* chain = saved[query_idx];
1861         HSPChain* prev = NULL;
1862         Int4 best_score = 0;
1863 
1864         if (!chain || !chain->next) {
1865             continue;
1866         }
1867 
1868         /* find the best single chain score */
1869         for (chain = saved[query_idx]; chain; chain = chain->next) {
1870             if (chain->score > best_score) {
1871                 best_score = chain->score;
1872             }
1873         }
1874 
1875         /* remove chains with poor scores, considering pair bonus */
1876         chain = saved[query_idx];
1877         while (chain) {
1878             Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1879 
1880             if (score < best_score) {
1881                 HSPChain* next = chain->next;
1882 
1883                 chain->next = NULL;
1884                 HSPChainFree(chain);
1885                 chain = next;
1886 
1887                 if (prev) {
1888                     prev->next = next;
1889                 }
1890                 else {
1891                     saved[query_idx] = next;
1892                 }
1893             }
1894             else {
1895                 prev = chain;
1896                 chain = chain->next;
1897             }
1898         }
1899 
1900     }
1901 
1902     /* second pass: for each query remove chains scoring poorer than the best
1903        ones considering pair bonus */
1904     for (query_idx = 0; query_idx < num_queries; query_idx++) {
1905         HSPChain* chain = saved[query_idx];
1906         HSPChain* prev = NULL;
1907         Int4 best_score = 0;
1908         Int4 best_pair_score = 0;
1909         Int4 i;
1910         Int4 count = 0;
1911         Int4 num_chains = 0;
1912 
1913         if (!chain) {
1914             continue;
1915         }
1916 
1917         if (!chain->next) {
1918             chain->count = 1;
1919             continue;
1920         }
1921 
1922         /* find the best scores for single chains and pairs */
1923         for (chain = saved[query_idx]; chain; chain = chain->next) {
1924             Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1925             if (score >= best_score) {
1926                 best_score = score;
1927             }
1928 
1929             if (chain->pair &&
1930                 chain->score + chain->pair->score >= best_pair_score) {
1931 
1932                 best_pair_score = chain->score + chain->pair->score;
1933             }
1934 
1935             /* collect chains in the array */
1936             if (num_chains >= array_size) {
1937                 array_size *= 2;
1938                 array = realloc(array, array_size * sizeof(HSPChain*));
1939                 if (!array) {
1940                     return -1;
1941                 }
1942             }
1943             array[num_chains++] = chain;
1944         }
1945 
1946         /* sort chains by scores in descending order */
1947         ASSERT(num_chains > 1);
1948         qsort(array, num_chains, sizeof(HSPChain*), s_CompareChainsByScore);
1949 
1950         /* find multiplicity for each chain: number of chains with same or
1951            larger score */
1952         /* Note that multiplicity is currently not reported. Only the number
1953            of returned hits for each query is reported */
1954         for (i = 0;i < num_chains; i++) {
1955             Int4  k;
1956 
1957             count++;
1958             /* check for chains with the same score */
1959             k = i + 1;
1960             while (k < num_chains && array[k]->score == array[i]->score) {
1961                 k++;
1962                 count++;
1963             }
1964 
1965             /* save multiplicity */
1966             for (;i < k;i++) {
1967                 array[i]->count = count;
1968             }
1969             i--;
1970         }
1971 
1972 
1973         /* remove chains with poor score and count chains with equal or better
1974            score to the minimum pair one */
1975         chain = saved[query_idx];
1976         while (chain) {
1977             Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1978 
1979             /* We compare single chains with single chains and pairs with
1980                pairs. There may be pairs with one strong and one weak hit.
1981                Comparing single chain scores we can be left with single chains
1982                from different pairs. This comparison aviods that. The first
1983                pass ensures that pairs with poor scoring chains are removed,
1984                so we do not have to compare singles with pairs. */
1985             if ((!chain->pair && score < best_score) ||
1986                 (chain->pair && chain->score + chain->pair->score <
1987                  best_pair_score)) {
1988 
1989                 HSPChain* next = chain->next;
1990 
1991                 chain->next = NULL;
1992                 HSPChainFree(chain);
1993                 chain = next;
1994 
1995                 if (prev) {
1996                     prev->next = next;
1997                 }
1998                 else {
1999                     saved[query_idx] = next;
2000                 }
2001             }
2002             else {
2003                 prev = chain;
2004                 chain = chain->next;
2005             }
2006         }
2007     }
2008 
2009     if (array) {
2010         sfree(array);
2011     }
2012 
2013     return 0;
2014 }
2015 
2016 
2017 /* Compare chains by subject oid and offfset */
s_CompareChainsByOid(const void * cha,const void * chb)2018 static int s_CompareChainsByOid(const void* cha, const void* chb)
2019 {
2020     const HSPChain* a = *((HSPChain**)cha);
2021     const HSPChain* b = *((HSPChain**)chb);
2022 
2023     if (a->oid > b->oid) {
2024         return 1;
2025     }
2026     else if (a->oid < b->oid) {
2027         return -1;
2028     }
2029     else if (a->hsps->hsp->subject.offset > b->hsps->hsp->subject.offset) {
2030         return 1;
2031     }
2032     else if (a->hsps->hsp->subject.offset < b->hsps->hsp->subject.offset) {
2033         return -1;
2034     }
2035 
2036     return 0;
2037 }
2038 
2039 /* Sort chains by subject oid and position and HSPs in the chains by query and
2040    subject positions */
s_SortChains(HSPChain ** saved,Int4 num_queries,int (* comp)(const void *,const void *))2041 static Int4 s_SortChains(HSPChain** saved, Int4 num_queries,
2042                          int(*comp)(const void*, const void*))
2043 {
2044     Int4 i;
2045     Int4 array_size = 50;
2046     HSPChain** chain_array = NULL;
2047 
2048     if (!saved || num_queries < 0) {
2049         return -1;
2050     }
2051 
2052     chain_array = calloc(array_size, sizeof(HSPChain*));
2053     if (!chain_array) {
2054         return -1;
2055     }
2056 
2057     /* for each query */
2058     for (i = 0;i < num_queries;i++) {
2059         HSPChain* chain = saved[i];
2060         Int4 num_chains = 0;
2061         Int4 k;
2062 
2063         if (!chain) {
2064             continue;
2065         }
2066 
2067         /* create an array of pointer to chains */
2068         for (; chain; chain = chain->next) {
2069             if (num_chains >= array_size) {
2070                 array_size *= 2;
2071                 chain_array = realloc(chain_array, array_size *
2072                                       sizeof(HSPChain*));
2073 
2074                 ASSERT(chain_array);
2075                 if (!chain_array) {
2076                     return -1;
2077                 }
2078             }
2079             chain_array[num_chains++] = chain;
2080         }
2081 
2082         if (num_chains > 1) {
2083             /* sort chains */
2084             qsort(chain_array, num_chains, sizeof(HSPChain*), comp);
2085 
2086             /* create the list of chains in the new order */
2087             for (k = 0;k < num_chains - 1;k++) {
2088                 chain_array[k]->next = chain_array[k + 1];
2089             }
2090             chain_array[num_chains - 1]->next = NULL;
2091 
2092             saved[i] = chain_array[0];
2093             ASSERT(saved[i]);
2094         }
2095     }
2096 
2097     if (chain_array) {
2098         free(chain_array);
2099     }
2100 
2101     return 0;
2102 }
2103 
2104 
2105 /* Separate HSPs that overlap on the query to different chains (most output
2106    formats do not support query overlaps) */
s_RemoveOverlaps(HSPChain * chain,const ScoringOptions * score_opts,Int4 query_len)2107 static int s_RemoveOverlaps(HSPChain* chain, const ScoringOptions* score_opts,
2108                             Int4 query_len)
2109 {
2110     HSPContainer* h = NULL;
2111     HSPContainer* prev = NULL;
2112     HSPChain* tail = chain->next;
2113     HSPChain* head = chain;
2114     Boolean rescore = FALSE;
2115 
2116     if (!chain) {
2117         return -1;
2118     }
2119 
2120     /* current and previous HSP */
2121     h = chain->hsps->next;
2122     prev = chain->hsps;
2123 
2124     for (; h; h = h->next) {
2125 
2126         /* if HSPs overlap on the query */
2127         if (prev->hsp->query.end > h->hsp->query.offset) {
2128             /* do a flat copy of the chain */
2129             HSPChain* new_chain = HSPChainNew(chain->context);
2130             memcpy(new_chain, chain, sizeof(HSPChain));
2131             new_chain->pair = NULL;
2132             new_chain->next = NULL;
2133 
2134             /* new chain starts with h */
2135             new_chain->hsps = h;
2136 
2137             /* the original chain ends on the previous HSP */
2138             prev->next = NULL;
2139 
2140             /* add new chain to the list */
2141             head->next = new_chain;
2142             head = new_chain;
2143 
2144             rescore = TRUE;
2145         }
2146         prev = h;
2147     }
2148 
2149     /* compute chains scores */
2150     if (rescore) {
2151         for (; chain; chain = chain->next) {
2152             chain->score = s_ComputeChainScore(chain, score_opts, query_len,
2153                                                FALSE);
2154         }
2155     }
2156 
2157     /* re-attach other chains to the list */
2158     head->next = tail;
2159 
2160     return 0;
2161 }
2162 
2163 
2164 /* Removes chains with scores below the cutoff */
s_FilterChains(HSPChain ** chains_ptr,Int4 cutoff_score,Int4 cutoff_edit_distance)2165 static int s_FilterChains(HSPChain** chains_ptr, Int4 cutoff_score,
2166                           Int4 cutoff_edit_distance)
2167 {
2168     HSPChain* chain = *chains_ptr;
2169 
2170     while (chain && !s_TestCutoffs(chain, cutoff_score, cutoff_edit_distance)
2171            /*chain->score < cutoff_score*/) {
2172         HSPChain* next = chain->next;
2173         chain->next = NULL;
2174         HSPChainFree(chain);
2175         chain = next;
2176     }
2177     *chains_ptr = chain;
2178 
2179     if (chain) {
2180         while (chain && chain->next) {
2181             if (!s_TestCutoffs(chain->next, cutoff_score, cutoff_edit_distance)
2182                 /*chain->next->score < cutoff_score*/) {
2183                 HSPChain* next = chain->next->next;
2184                 chain->next->next = NULL;
2185                 HSPChainFree(chain->next);
2186                 chain->next = next;
2187             }
2188             else {
2189                 chain = chain->next;
2190             }
2191         }
2192     }
2193 
2194     return 0;
2195 }
2196 
2197 
2198 /* Populate HSP data and save final results */
s_Finalize(HSPChain ** saved,BlastMappingResults * results,const BlastQueryInfo * query_info,const BLAST_SequenceBlk * query_blk,const ScoringOptions * score_opts,Boolean is_paired,Int4 cutoff_score,Int4 cutoff_edit_dist)2199 static int s_Finalize(HSPChain** saved, BlastMappingResults* results,
2200                       const BlastQueryInfo* query_info,
2201                       const BLAST_SequenceBlk* query_blk,
2202                       const ScoringOptions* score_opts,
2203                       Boolean is_paired,
2204                       Int4 cutoff_score,
2205                       Int4 cutoff_edit_dist)
2206 {
2207     Int4 query_idx;
2208     const Int4 kPairBonus = 21;
2209 
2210     /* remove poor scoring chains and find chain multiplicity */
2211     if (!getenv("MAPPER_NO_PRUNNING")) {
2212         s_PruneChains(saved, query_info->num_queries, kPairBonus);
2213     }
2214 
2215     /* find adapters in query sequences */
2216     s_FindAdapters(saved, query_blk, query_info, score_opts);
2217 
2218     /* find polyA tails in query sequences */
2219     s_FindPolyATails(saved, query_blk, query_info);
2220 
2221     /* find pairs of chains aligned to different subjects */
2222     s_FindRearrangedPairs(saved, query_info);
2223 
2224     /* sort chains by subject oid to ensure stable results for multithreaded
2225        runs; sorting must be done here so that compartment numbers are stable */
2226     /* FIXME: this may not be needed */
2227     s_SortChains(saved, query_info->num_queries, s_CompareChainsByOid);
2228 
2229 
2230     /* separate HSPs that overlap on the query into different chains */
2231     if (getenv("MAPPER_NO_OVERLAPPED_HSP_MERGE")) {
2232         for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2233             HSPChain* chain = saved[query_idx];
2234             Int4 query_len;
2235 
2236             if (!chain) {
2237                 continue;
2238             }
2239 
2240             query_len = query_info->contexts[chain->context].query_length;
2241             for (; chain; chain = chain->next) {
2242                 s_RemoveOverlaps(chain, score_opts, query_len);
2243             }
2244 
2245         }
2246     }
2247 
2248 
2249     /* remove chains with score below cutoff */
2250     for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2251         s_FilterChains(&saved[query_idx], cutoff_score, cutoff_edit_dist);
2252     }
2253 
2254     /* Compute number of results to store, and number of unique mappings for
2255        each query (some chains may be copied to create pairs) */
2256 
2257     /* count number of unique mappings */
2258     for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2259         HSPChain* chain = saved[query_idx];
2260         HSPChain* prev = chain;
2261         Int4 num_unique = 1;
2262 
2263         if (!chain) {
2264             continue;
2265         }
2266 
2267         chain = chain->next;
2268         for (; chain; chain = chain->next, prev = prev->next) {
2269             if (prev->oid != chain->oid ||
2270                 s_FindFragmentStart(prev) != s_FindFragmentStart(chain)) {
2271 
2272                 num_unique++;
2273             }
2274         }
2275 
2276         /* FIXME: for tabular output check count computed earlier */
2277         for (chain = saved[query_idx]; chain; chain = chain->next) {
2278             chain->count = num_unique;
2279         }
2280     }
2281 
2282 #if _DEBUG
2283     for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2284          HSPChain* chain = saved[query_idx];
2285          for (; chain; chain = chain->next) {
2286              HSPContainer* h = chain->hsps;
2287              for (; h; h = h->next) {
2288                  s_TestHSPRanges(h->hsp);
2289              }
2290          }
2291 
2292     }
2293 #endif
2294 
2295     results->chain_array = saved;
2296     results->num_queries = query_info->num_queries;
2297 
2298     return 0;
2299 }
2300 
2301 /** Perform post-run clean-ups
2302  * @param data The buffered data structure [in]
2303  * @param results The HSP results to propagate [in][out]
2304  */
2305 static int
s_BlastHSPMapperFinal(void * data,void * mapping_results)2306 s_BlastHSPMapperFinal(void* data, void* mapping_results)
2307 {
2308     BlastHSPMapperData* spl_data = data;
2309     BlastHSPMapperParams* params = spl_data->params;
2310     BlastMappingResults* results = (BlastMappingResults*)mapping_results;
2311 
2312     ASSERT(spl_data->saved_chains);
2313 
2314     if (spl_data->saved_chains != NULL) {
2315         s_Finalize(spl_data->saved_chains, results, spl_data->query_info,
2316                    spl_data->query, &params->scoring_options, params->paired,
2317                    params->cutoff_score, params->cutoff_edit_dist);
2318     }
2319 
2320     spl_data->saved_chains = NULL;
2321 
2322    return 0;
2323 }
2324 
2325 
2326 /* A node representing an HSP for finding the best chain of HSPs for RNA-seq */
2327 typedef struct HSPNode {
2328     BlastHSP** hsp;     /* pointer to the entry in hsp_list */
2329     Int4 best_score;    /* score for path that starts at this node */
2330     struct HSPNode* path_next; /* next node in the path */
2331 } HSPNode;
2332 
2333 
2334 /* Copy a array of HSP nodes */
s_HSPNodeArrayCopy(HSPNode * dest,HSPNode * source,Int4 num)2335 static int s_HSPNodeArrayCopy(HSPNode* dest, HSPNode* source, Int4 num)
2336 {
2337     int i;
2338 
2339     if (!dest || !source) {
2340         return -1;
2341     }
2342 
2343     for (i = 0;i < num;i++) {
2344         dest[i] = source[i];
2345         if (source[i].path_next) {
2346             dest[i].path_next = dest + (source[i].path_next - source);
2347         }
2348     }
2349 
2350     return 0;
2351 }
2352 
2353 
2354 /* Maximum number of top scoring HSP chains to report for a single read and
2355    strand */
2356 #define MAX_NUM_HSP_PATHS 40
2357 
2358 /* A chain of HSPs aligning a spliced RNA-Seq read to a genome */
2359 typedef struct HSPPath {
2360     HSPNode** start;  /* array of chain starting nodes */
2361     int num_paths;    /* number of chains */
2362     Int4 score;       /* all chains in start have this cumulative score */
2363 } HSPPath;
2364 
2365 
HSPPathFree(HSPPath * path)2366 static HSPPath* HSPPathFree(HSPPath* path)
2367 {
2368     if (path) {
2369         if (path->start) {
2370             free(path->start);
2371         }
2372         free(path);
2373     }
2374 
2375     return NULL;
2376 }
2377 
HSPPathNew(void)2378 static HSPPath* HSPPathNew(void)
2379 {
2380     HSPPath* retval = calloc(1, sizeof(HSPPath));
2381     if (!retval) {
2382         return NULL;
2383     }
2384 
2385     retval->start = calloc(MAX_NUM_HSP_PATHS, sizeof(HSPNode*));
2386     if (!retval->start) {
2387         free(retval);
2388         return NULL;
2389     }
2390 
2391     return retval;
2392 }
2393 
2394 
2395 #define NUM_SIGNALS 23
2396 #define NUM_SIGNALS_CONSENSUS 2
2397 
2398 /* Find a split for HSPs overlapping on the query by finding splice signals in
2399    the subject sequence. The first HSP must have smaller query offset than the
2400    second one. Subject is recreated from the query sequence and information
2401    in an HSP.
2402    @param first HSP with smaller query offset [in|out]
2403    @param second HSP with larger query offset [in|out]
2404    @param query Query sequence [in]
2405    @return Status
2406  */
2407 static Int4
s_FindSpliceJunctionsForOverlaps(BlastHSP * first,BlastHSP * second,Uint1 * query,Int4 query_len,Boolean consensus_only)2408 s_FindSpliceJunctionsForOverlaps(BlastHSP* first, BlastHSP* second,
2409                                  Uint1* query, Int4 query_len,
2410                                  Boolean consensus_only)
2411 {
2412     Int4 i, k;
2413     /* splice signal pairs from include/algo/sequence/consensus_splice.hpp */
2414     Uint1 signals[NUM_SIGNALS] = {0xb2, /* GTAG */
2415                                   0x71, /* CTAC (reverse complement) */
2416                                   0x92, /* GCAG */
2417                                   0x79, /* CTGC */
2418                                   0x31, /* ATAC */
2419                                   0xb3, /* GTAT */
2420                                   0xbe, /* GTTG */
2421                                   0x41, /* CAAC */
2422                                   0xba, /* GTGG */
2423                                   0x51, /* CCAC */
2424                                   0xb0, /* GTAA */
2425                                   0xf1, /* TTAC */
2426                                   0x82, /* GAAG */
2427                                   0x7d, /* CTTC */
2428                                   0xf2, /* TTAG */
2429                                   0x70, /* CTAA */
2430                                   0x32, /* ATAG */
2431                                   0x73, /* CTAT */
2432                                   0xa2, /* GGAG */
2433                                   0x75, /* CTCC */
2434                                   0x33, /* ATAT (revese complemented is self) */
2435                                   0x30, /* ATAA */
2436                                   0xf3  /* TTAT */};
2437 
2438 
2439     Boolean found = FALSE;
2440     Uint1* subject = NULL;
2441     Int4 overlap_len;
2442     const Uint1 kGap = 15;
2443     Int4 num_signals = consensus_only ? NUM_SIGNALS_CONSENSUS : NUM_SIGNALS;
2444 
2445     /* HSPs must be sorted and must overlap only on the query */
2446     ASSERT(first->query.offset < second->query.offset);
2447     ASSERT(second->query.offset <= first->query.end);
2448     ASSERT(first->subject.offset < second->subject.offset);
2449 
2450 
2451     /* if there are edits within the overlap region, then try to maximize
2452        the chain score by assigning a part of the overlap region to the
2453        HSP that has fewer edits in the overlap */
2454     if ((first->map_info->edits->num_edits > 0 &&
2455          first->map_info->edits->edits[
2456                         first->map_info->edits->num_edits - 1].query_pos >=
2457             second->query.offset)
2458         || (second->map_info->edits->num_edits > 0 &&
2459             second->map_info->edits->edits[0].query_pos <= first->query.end)) {
2460 
2461         /* find nummer of  edits in both HSPs within the overlap and assign
2462            overlap to the HSP that has fewer edits */
2463         Int4 num_first = 0;
2464         Int4 num_second = 0;
2465 
2466         /* find number of edits */
2467         for (i = first->map_info->edits->num_edits - 1;i >= 0;i--) {
2468             JumperEdit* edits = first->map_info->edits->edits;
2469             if (edits[i].query_pos < second->query.offset) {
2470                 break;
2471             }
2472             num_first++;
2473         }
2474 
2475         for (i = 0;i < second->map_info->edits->num_edits;i++) {
2476             JumperEdit* edits = second->map_info->edits->edits;
2477             if (edits[i].query_pos >= first->query.end) {
2478                 break;
2479             }
2480             num_second++;
2481         }
2482 
2483         if (num_first > num_second) {
2484             while (first->map_info->edits->num_edits > 0 &&
2485                    first->map_info->edits->edits[
2486                           first->map_info->edits->num_edits - 1].query_pos >=
2487                    second->query.offset) {
2488 
2489                 JumperEdit* edits = first->map_info->edits->edits;
2490                 Uint1 edge = first->map_info->right_edge;
2491                 Int4 num_edits = first->map_info->edits->num_edits;
2492                 Int4 trim_by;
2493 
2494                 if (edits[num_edits - 1].query_pos >= first->query.end - 1) {
2495                     if (edits[num_edits - 1].subject_base != kGap) {
2496                         edge >>= 2;
2497                         edge |= edits[num_edits - 1].subject_base << 2;
2498                     }
2499                 }
2500                 else if (edits[num_edits - 1].query_pos == query_len - 2 &&
2501                          edits[num_edits - 1].subject_base == kGap) {
2502 
2503                     edge = (edge << 2) | query[query_len - 1];
2504                 }
2505                 else {
2506                     if (edits[num_edits - 1].subject_base != kGap &&
2507                         edits[num_edits - 1].query_base != kGap) {
2508 
2509                         edge = (edits[num_edits - 1].subject_base << 2) |
2510                             query[edits[num_edits - 1].query_pos + 1];
2511                     }
2512                     else if (edits[num_edits - 1].subject_base == kGap) {
2513                         edge = (query[edits[num_edits - 1].query_pos + 1] << 2 ) |
2514                             query[edits[num_edits - 1].query_pos + 2];
2515                     }
2516                     else {
2517                         edge = (edits[num_edits - 1].subject_base << 2) |
2518                             query[edits[num_edits - 1].query_pos];
2519                     }
2520                 }
2521 
2522                 trim_by = first->query.end -  edits[
2523                             first->map_info->edits->num_edits - 1].query_pos;
2524                 /* if the edit is an insert in the genome, count genome bases
2525                    when trimming the HSP */
2526                 if (edits[num_edits - 1].query_base == kGap) {
2527                     trim_by++;
2528                     /* FIXME: use proper scores here */
2529                     s_TrimHSP(first, trim_by, FALSE, FALSE, -4, -4, -4, query);
2530                 }
2531                 else {
2532                     s_TrimHSP(first, trim_by, TRUE, FALSE, -4, -4, -4, query);
2533                 }
2534                 first->map_info->right_edge = edge;
2535             }
2536         }
2537         else if (num_second > num_first) {
2538             while (second->map_info->edits->num_edits > 0 &&
2539                    second->map_info->edits->edits[0].query_pos <
2540                    first->query.end) {
2541 
2542                 JumperEdit* edits = second->map_info->edits->edits;
2543                 Uint1 edge = second->map_info->left_edge;
2544                 Int4 trim_by;
2545 
2546                 if (edits[0].query_pos == 0) {
2547                     if (edits[0].subject_base != kGap) {
2548                         edge <<= 2;
2549                         edge |= edits[0].subject_base;
2550                     }
2551                 }
2552                 else if (edits[0].query_pos == 1 &&
2553                          edits[0].subject_base == kGap) {
2554 
2555                     edge = (edge << 2) | query[0];
2556                 }
2557                 else {
2558 
2559                     edge = (query[edits[0].query_pos - 1] << 2) |
2560                         edits[0].subject_base;
2561 
2562                     if (edits[0].subject_base == kGap) {
2563                         edge = (query[edits[0].query_pos - 2] << 2) |
2564                             query[edits[0].query_pos - 1];
2565                     }
2566                 }
2567 
2568                 trim_by = edits[0].query_pos - second->query.offset + 1;
2569                 /* if the edit is an insert in the genome, count genome bases
2570                    when trimming the HSP */
2571                 if (edits[0].query_base == kGap) {
2572                     trim_by++;
2573                     /* FIXME: use proper scores here */
2574                     s_TrimHSP(second,
2575                               edits[0].query_pos - second->query.offset + 1,
2576                               FALSE, TRUE, -4, -4, -4, query);
2577                 }
2578                 else {
2579                     s_TrimHSP(second,
2580                               edits[0].query_pos - second->query.offset + 1,
2581                               TRUE, TRUE, -4, -4, -4, query);
2582                 }
2583                 second->map_info->left_edge = edge;
2584             }
2585         }
2586         else {
2587             /* if the number of edits is the same we do not know how to
2588                divide the overlap */
2589 
2590             first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
2591             second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
2592             return 0;
2593         }
2594     }
2595 
2596     /* give up if there are gaps in the alignments or mismatches still left in
2597        the overlapping parts */
2598     if ((first->map_info->edits->num_edits > 0 &&
2599          first->map_info->edits->edits[
2600                         first->map_info->edits->num_edits - 1].query_pos >=
2601             second->query.offset)
2602         || (second->map_info->edits->num_edits > 0 &&
2603             second->map_info->edits->edits[0].query_pos <= first->query.end)) {
2604 
2605         first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
2606         second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
2607         return 0;
2608     }
2609 
2610     /* recreate the subject sequence plus two bases on each side */
2611     overlap_len = first->query.end - second->query.offset;
2612     subject = calloc(overlap_len + 4, sizeof(Uint1));
2613     if (!subject) {
2614         return -1;
2615     }
2616 
2617     /* FIXME: lef_edge and right_edge should be changed to subject_overhangs */
2618     subject[0] = (second->map_info->left_edge & 0xf) >> 2;
2619     subject[1] = second->map_info->left_edge & 3;
2620     memcpy(subject + 2, query + second->query.offset,
2621            overlap_len * sizeof(Uint1));
2622 
2623     subject[2 + overlap_len] = (first->map_info->right_edge & 0xf) >> 2;
2624     subject[2 + overlap_len + 1] = first->map_info->right_edge & 3;
2625 
2626     /* search for matching splice signals in the subject sequence */
2627     for (k = 0;k < num_signals;k++) {
2628 
2629         for (i = 0;i <= overlap_len && !found;i++) {
2630 
2631             Uint1 seq = (subject[i] << 2) | subject[i + 1] |
2632                 (subject[i + 2] << 6) | (subject[i + 3] << 4);
2633 
2634             /* if splice signals are found, update HSPs */
2635             if (seq == signals[k]) {
2636                 Int4 d = first->query.end - (second->query.offset + i);
2637                 first->query.end -= d;
2638                 first->subject.end -= d;
2639                 first->gap_info->num[first->gap_info->size - 1] -= d;
2640                 first->score -= d;
2641                 first->num_ident -= d;
2642                 first->map_info->right_edge = (seq & 0xf0) >> 4;
2643                 first->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
2644 
2645                 d = i;
2646                 second->query.offset += d;
2647                 second->subject.offset += d;
2648                 second->gap_info->num[0] -= d;
2649                 second->score -= d;
2650                 second->num_ident -= d;
2651                 second->map_info->left_edge = seq & 0xf;
2652                 second->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
2653 
2654                 ASSERT(first->gap_info->size > 1 ||
2655                        first->query.end - first->query.offset ==
2656                        first->subject.end - first->subject.offset);
2657 
2658                 ASSERT(second->gap_info->size > 1 ||
2659                        second->query.end - second->query.offset ==
2660                        second->subject.end - second->subject.offset);
2661 
2662                 found = TRUE;
2663                 break;
2664             }
2665         }
2666     }
2667 
2668     if (!found) {
2669         first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
2670         second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
2671     }
2672 
2673     if (subject) {
2674         sfree(subject);
2675     }
2676 
2677     ASSERT(first->query.offset < first->query.end);
2678     ASSERT(second->query.offset < second->query.end);
2679 
2680     return 0;
2681 }
2682 
s_ExtendAlignmentCleanup(Uint1 * subject,BlastGapAlignStruct * gap_align,GapEditScript * edit_script,JumperEditsBlock * edits)2683 static void s_ExtendAlignmentCleanup(Uint1* subject,
2684                                      BlastGapAlignStruct* gap_align,
2685                                      GapEditScript* edit_script,
2686                                      JumperEditsBlock* edits)
2687 {
2688     if (subject) {
2689         free(subject);
2690     }
2691 
2692     BLAST_GapAlignStructFree(gap_align);
2693     GapEditScriptDelete(edit_script);
2694     JumperEditsBlockFree(edits);
2695 }
2696 
2697 /* Extend alignment on one side of an HSP up to a given point (a splice site)
2698 */
s_ExtendAlignment(BlastHSP * hsp,const Uint1 * query,Int4 query_from,Int4 query_to,Int4 subject_from,Int4 subject_to,const ScoringOptions * score_options,Boolean is_left)2699 static Int4 s_ExtendAlignment(BlastHSP* hsp, const Uint1* query,
2700                               Int4 query_from, Int4 query_to,
2701                               Int4 subject_from, Int4 subject_to,
2702                               const ScoringOptions* score_options,
2703                               Boolean is_left)
2704 {
2705     Int4 o_len;
2706     Uint1* o_seq = NULL;
2707     Uint1* subject = NULL;
2708     BlastGapAlignStruct* gap_align = NULL;
2709     GapEditScript* edit_script = NULL;
2710     JumperEditsBlock* edits = NULL;
2711     Int4 i, k, s;
2712     /* number of query bases to align */
2713     Int4 query_gap = query_to - query_from + 1;
2714     /* number of subject bases to align */
2715     Int4 subject_gap = subject_to - subject_from + 1;
2716     Int4 num_identical;
2717     Int4 query_ext_len, subject_ext_len;
2718     Int4 ungapped_ext_len;
2719 
2720     JUMP* jumper_table = NULL;
2721 
2722     JUMP jumper_mismatch[] = {{1, 1, 0, 0}};
2723 
2724     JUMP jumper_insertion[] = {{1, 1, 2, 0},
2725                                {1, 0, 1, 0},
2726                                {1, 1, 0, 0}};
2727 
2728     JUMP jumper_deletion[] = {{1, 1, 2, 0},
2729                               {0, 1, 1, 0},
2730                               {1, 1, 0, 0}};
2731 
2732     if (!hsp || !query || !hsp->map_info ||
2733         !hsp->map_info->subject_overhangs) {
2734         return -1;
2735     }
2736 
2737     /* length of the subject overgang sequence saved in the HSP */
2738     o_len = is_left ? hsp->map_info->subject_overhangs->left_len :
2739         hsp->map_info->subject_overhangs->right_len;
2740     /* subject sequence */
2741     o_seq = is_left ? hsp->map_info->subject_overhangs->left :
2742         hsp->map_info->subject_overhangs->right;
2743     ASSERT(o_seq);
2744 
2745     /* compressed subject sequence (needed for computig jumper edits) */
2746     subject = calloc(o_len / 4 + 1, sizeof(Uint1));
2747     if (!subject) {
2748         return -1;
2749     }
2750 
2751     /* compress subject sequence  */
2752     k = 6;
2753     s = 0;
2754     for (i = 0;i < o_len;i++, k-=2) {
2755         subject[s] |= o_seq[i] << k;
2756         if (k == 0) {
2757             s++;
2758             k = 8;
2759         }
2760     }
2761 
2762     /* gap_align is needed in jumper alignment API calls, we only allocate
2763        fields that will be needed in these calls */
2764     gap_align = calloc(1, sizeof(BlastGapAlignStruct));
2765     if (!gap_align) {
2766         s_ExtendAlignmentCleanup(subject, NULL, edit_script, edits);
2767         return -1;
2768     }
2769     gap_align->jumper = JumperGapAlignNew(o_len * 2);
2770     if (!gap_align->jumper) {
2771         s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2772         return -1;
2773     }
2774 
2775     /* select jumper table based on the number of expected indels; we allow
2776        only insertions or only deletions */
2777     switch (SIGN(query_gap - subject_gap)) {
2778     case 0: jumper_table = jumper_mismatch;
2779         break;
2780 
2781     case -1: jumper_table = jumper_deletion;
2782         break;
2783 
2784     case 1: jumper_table = jumper_insertion;
2785         break;
2786 
2787     default:
2788         ASSERT(0);
2789         s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2790         return -1;
2791     };
2792 
2793     /* Compute alignment. We always do right extension as this is typically
2794        a short alignment. This is global alignment, hence the large max number
2795        of errors and no penalties for mismatches or gaps. The jumper table
2796        specifies costs for alignment errors. The penalty socres specify
2797        required number of macthes at the ends of the alignment. */
2798     JumperExtendRightWithTraceback(query + query_from, o_seq + subject_from,
2799                                    query_gap, subject_gap,
2800                                    1, 0, 0, 0,
2801                                    20, 20,
2802                                    &query_ext_len, &subject_ext_len,
2803                                    gap_align->jumper->right_prelim_block,
2804                                    &num_identical,
2805                                    FALSE,
2806                                    &ungapped_ext_len,
2807                                    jumper_table);
2808 
2809     ASSERT(query_ext_len <= query_gap);
2810     ASSERT(subject_ext_len <= subject_gap);
2811 
2812     /* Because this is a global alignment a gap may exist after the last query
2813        or subject base which causes jumper extension to stop prematurely. If
2814        this is the case, add the missing indel */
2815     while (query_ext_len < query_gap) {
2816         JumperPrelimEditBlockAdd(gap_align->jumper->right_prelim_block,
2817                                  JUMPER_INSERTION);
2818 
2819         query_ext_len++;
2820     }
2821 
2822     while (subject_ext_len < subject_gap) {
2823         JumperPrelimEditBlockAdd(gap_align->jumper->right_prelim_block,
2824                                  JUMPER_DELETION);
2825 
2826         subject_ext_len++;
2827     }
2828     ASSERT(query_ext_len == query_gap);
2829     ASSERT(subject_ext_len == subject_gap);
2830 
2831     /* update gap info in the HSP */
2832     edit_script = JumperPrelimEditBlockToGapEditScript(
2833                                         gap_align->jumper->left_prelim_block,
2834                                         gap_align->jumper->right_prelim_block);
2835     if (!edit_script) {
2836         s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2837         return -1;
2838     }
2839 
2840     if (is_left) {
2841         hsp->gap_info = GapEditScriptCombine(&edit_script, &hsp->gap_info);
2842     }
2843     else {
2844         hsp->gap_info = GapEditScriptCombine(&hsp->gap_info, &edit_script);
2845         ASSERT(!edit_script);
2846     }
2847     ASSERT(hsp->gap_info);
2848     if (!hsp->gap_info) {
2849         s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2850         return -1;
2851     }
2852 
2853     /* update jumper edits */
2854     gap_align->query_start = query_from;
2855     gap_align->query_stop = query_from + query_ext_len;
2856     gap_align->subject_start = subject_from;
2857     gap_align->subject_stop = subject_from + subject_ext_len;
2858     edits = JumperFindEdits(query, subject, gap_align);
2859     ASSERT(edits);
2860     if (!edits) {
2861         s_ExtendAlignmentCleanup(subject, gap_align, NULL, edits);
2862         return -1;
2863     }
2864 
2865     if (is_left) {
2866         hsp->map_info->edits = JumperEditsBlockCombine(&edits,
2867                                                    &hsp->map_info->edits);
2868     }
2869     else {
2870         hsp->map_info->edits = JumperEditsBlockCombine(
2871                                                     &hsp->map_info->edits,
2872                                                     &edits);
2873         ASSERT(!edits);
2874     }
2875     ASSERT(hsp->map_info->edits);
2876     if (!hsp->map_info->edits) {
2877         s_ExtendAlignmentCleanup(subject, gap_align, NULL, edits);
2878         return -1;
2879     }
2880 
2881     if (is_left) {
2882         hsp->query.offset -= query_ext_len;
2883         hsp->subject.offset -= subject_ext_len;
2884     }
2885     else {
2886         hsp->query.end += query_ext_len;
2887         hsp->subject.end += subject_ext_len;
2888     }
2889 
2890     /* compute new HSP score */
2891     hsp->score = s_ComputeAlignmentScore(hsp, score_options->penalty,
2892                                          score_options->gap_open,
2893                                          score_options->gap_extend);
2894 
2895     ASSERT(hsp->gap_info);
2896     ASSERT(hsp->map_info->edits);
2897 
2898     ASSERT(subject);
2899 
2900 
2901     if (subject) {
2902         free(subject);
2903     }
2904     BLAST_GapAlignStructFree(gap_align);
2905 
2906     return 0;
2907 }
2908 
2909 
2910 /* Find splcie junctions and extend HSP alignment if there are unaligned query
2911    bases in the middle of a chain.
2912 */
2913 static Int4
s_FindSpliceJunctionsForGap(BlastHSP * first,BlastHSP * second,Uint1 * query,Int4 query_len,const ScoringOptions * score_options)2914 s_FindSpliceJunctionsForGap(BlastHSP* first, BlastHSP* second,
2915                             Uint1* query, Int4 query_len,
2916                             const ScoringOptions* score_options)
2917 {
2918     Int4 query_gap;
2919     Int4 first_len, second_len;
2920     Boolean found = FALSE;
2921     Int4 k;
2922     Int4 q;
2923     Uint1 signal = 0;
2924     /* use only cannonical splice signals here */
2925     /* The procedure does not maximize score for unaligned bases of the query.
2926        It only finds splice signals and then the best alignment it can find
2927        for the unaligned bases. For limited number of unaligned query bases,
2928        we can be sure of the splice site vs a continuous alignment, because a
2929        continuous alignment would score better. But we do not compare
2930        alignment scores between different splice sites, so we do not know
2931        which splice site and signal is better; and many splice signals here
2932        would lead to finding wrong splice sites. */
2933     Uint1 signals[NUM_SIGNALS_CONSENSUS] = {0xb2, /* GTAG */
2934                                             0x71  /* CTAC */};
2935 
2936     if (!first || !second) {
2937         return -1;
2938     }
2939 
2940     if (!first->map_info->subject_overhangs ||
2941         !first->map_info->subject_overhangs->right ||
2942         !second->map_info->subject_overhangs ||
2943         !second->map_info->subject_overhangs->left) {
2944         return -1;
2945     }
2946 
2947     /* number of query bases that fall between the HSPs */
2948     query_gap = second->query.offset - first->query.end;
2949 
2950     /* we do not have enough subject sequence saved (-1 because we allow up to
2951        one indel) */
2952     if (query_gap > first->map_info->subject_overhangs->right_len - 2 ||
2953         query_gap > second->map_info->subject_overhangs->left_len - 2) {
2954         return 0;
2955     }
2956 
2957     first_len = first->map_info->subject_overhangs->right_len;
2958     second_len = second->map_info->subject_overhangs->left_len;
2959 
2960     /* assume that the begining of intron was found, search for the splice
2961        signal at the end of the first HSP */
2962     for (q = 0; !found && q < 4;q++) {
2963 
2964         if (first->query.end - q <= first->query.offset) {
2965             break;
2966         }
2967 
2968         for (k = 0;k < NUM_SIGNALS_CONSENSUS && !found;k++) {
2969             Int4 i;
2970             Int4 status = 0;
2971             Int4 start;
2972             Uint1 seq;
2973 
2974             if (q == 0) {
2975                 seq = (first->map_info->subject_overhangs->right[0] << 6) |
2976                     (first->map_info->subject_overhangs->right[1] << 4);
2977             }
2978             else if (q == 1) {
2979                 seq = (query[first->query.end - 1] << 6) |
2980                     (first->map_info->subject_overhangs->right[0] << 4);
2981             }
2982             else if (q > 1) {
2983                 seq = (query[first->query.end - q] << 6) |
2984                     (query[first->query.end - q + 1] << 4);
2985             }
2986             else {
2987                 return -1;
2988             }
2989 
2990             if (seq != (signals[k] & 0xf0)) {
2991                 continue;
2992             }
2993 
2994             /* location of the splice signa in subject overhang if there are no
2995                indels */
2996             start = second->map_info->subject_overhangs->left_len - 1 -
2997                 query_gap - 2 - q;
2998 
2999             /* search for the splice signal at the end of intron;
3000                allow for up to 1 indel */
3001             for (i = MAX(start - 1, 0);i <= MIN(start + 1, second_len - 2);i++) {
3002                 seq &= 0xf0;
3003                 seq |= (second->map_info->subject_overhangs->left[i] << 2) |
3004                     second->map_info->subject_overhangs->left[i + 1];
3005 
3006                 /* if the splice signal found extend the alignment */
3007                 if (seq == signals[k]) {
3008                     Int4 subject_gap = second_len - (i + 2) + q;
3009                     if (query_gap - subject_gap < -1 ||
3010                         query_gap - subject_gap > 1) {
3011 
3012                         continue;
3013                     }
3014 
3015                     found = TRUE;
3016                     if (q > 0) {
3017                         s_TrimHSP(first, q, TRUE, FALSE,
3018                                   score_options->penalty,
3019                                   score_options->gap_open,
3020                                   score_options->gap_extend,
3021                                   query);
3022                     }
3023 
3024                     status = s_ExtendAlignment(second, query,
3025                              first->query.end,
3026                              second->query.offset - 1,
3027                              i + 2,
3028                              second->map_info->subject_overhangs->left_len - 1,
3029                              score_options,
3030                              TRUE);
3031 
3032                     ASSERT(status == 0);
3033                     if (status) {
3034                         return status;
3035                     }
3036 
3037                     signal = seq;
3038                     break;
3039                 }
3040             }
3041         }
3042     }
3043 
3044     /* assume that end of the intron was found, search for the splice signal
3045        at the beginning of the intron */
3046     for (q = 0; !found && q < 4;q++) {
3047 
3048         if (second->query.offset + q >= second->query.end) {
3049             break;
3050         }
3051 
3052         for (k = 0;k < NUM_SIGNALS_CONSENSUS && !found;k++) {
3053             Int4 i;
3054             Int4 end;
3055             Uint1 seq;
3056             if (q == 0) {
3057                 seq =
3058                     (second->map_info->subject_overhangs->left[second_len - 2] << 2) |
3059                     second->map_info->subject_overhangs->left[second_len - 1];
3060             }
3061             else if (q == 1) {
3062                 seq = (second->map_info->subject_overhangs->left[second_len - 1] << 2) | query[second->query.offset];
3063 
3064 
3065             }
3066             else if (q > 1) {
3067 
3068                 if (second->map_info->edits->num_edits > 0 &&
3069                     second->map_info->edits->edits[0].query_pos < q - 2) {
3070 
3071                     break;
3072                 }
3073 
3074                 seq = (query[second->query.offset + q - 2] << 2) |
3075                     query[second->query.offset + q + 1 - 2];
3076             }
3077             else {
3078                 return -1;
3079             }
3080 
3081             if (seq != (signals[k] & 0xf)) {
3082                 continue;
3083             }
3084 
3085             /* location of the splice signal in the subject overhang if there are
3086                no indels */
3087             end = query_gap + q;
3088 
3089             /* allow for up to 1 indel */
3090             for (i = MAX(end - 1, 0);i <= MIN(end + 1, first_len - 2);i++) {
3091                 seq &= 0xf;
3092                 seq |= (first->map_info->subject_overhangs->right[i] << 6) |
3093                     (first->map_info->subject_overhangs->right[i + 1] << 4);
3094 
3095                 if (seq == signals[k]) {
3096                     Int4 status = 0;
3097                     Int4 subject_gap = i - q;
3098                     if (query_gap - subject_gap < -1 ||
3099                         query_gap - subject_gap > 1) {
3100 
3101                         continue;
3102                     }
3103 
3104                     found = TRUE;
3105                     if (q > 0) {
3106                         s_TrimHSP(second, q, TRUE, TRUE,
3107                                   score_options->penalty,
3108                                   score_options->gap_open,
3109                                   score_options->gap_extend,
3110                                   query);
3111                     }
3112 
3113                     status = s_ExtendAlignment(first, query,
3114                                                first->query.end,
3115                                                second->query.offset - 1,
3116                                                0, i - 1,
3117                                                score_options,
3118                                                FALSE);
3119                     ASSERT(status == 0);
3120                     if (status) {
3121                         return status;
3122                     }
3123 
3124                     signal = seq;
3125                     break;
3126                 }
3127             }
3128         }
3129     }
3130 
3131     if (found) {
3132         first->map_info->right_edge = (signal & 0xf0) >> 4;
3133         first->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3134         second->map_info->left_edge = signal & 0xf;
3135         second->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3136 
3137         ASSERT(first->gap_info->size > 1 ||
3138                first->query.end - first->query.offset ==
3139                first->subject.end - first->subject.offset);
3140 
3141         ASSERT(second->gap_info->size > 1 ||
3142                second->query.end - second->query.offset ==
3143                second->subject.end - second->subject.offset);
3144     }
3145     else {
3146         first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
3147         second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
3148     }
3149 
3150     return 0;
3151 }
3152 
3153 
3154 #define NUM_SINGLE_SIGNALS 8
3155 
3156 /* Record subject bases pre and post alignment in HSP as possible splice
3157    signals and set a flag for recognized signals */
3158 /* Not used
3159 static Int4 s_FindSpliceSignals(BlastHSP* hsp, Uint1* query, Int4 query_len)
3160 {
3161     SequenceOverhangs* overhangs = NULL;
3162     /--* splice signals in the order of preference *--/
3163     Uint1 signals[NUM_SINGLE_SIGNALS] = {2,  /--* AG *--/
3164                                          11, /--* GT *--/
3165                                          13, /--* TC *--/
3166                                          7,  /--* CT *--/
3167                                          1,  /--* AC *--/
3168                                          4,  /--* CA *--/
3169                                          8,  /--* GA *--/
3170                                          14  /--* TG *--/};
3171 
3172     if (!hsp || !query) {
3173         return -1;
3174     }
3175 
3176     overhangs = hsp->map_info->subject_overhangs;
3177     ASSERT(overhangs);
3178 
3179     if (!overhangs) {
3180         return -1;
3181     }
3182 
3183     /--* FIXME: check for polyA and adapters *--/
3184 
3185     /--* search for a splice signal on the left edge, unless the query is aligned
3186        from the beginning *--/
3187     if (hsp->query.offset == 0) {
3188         hsp->map_info->left_edge = MAPPER_EXON;
3189     }
3190     else {
3191         int k, i;
3192         Boolean found = FALSE;
3193 
3194         if (overhangs->left_len >= 2) {
3195             Uint1 signal = (overhangs->left[overhangs->left_len - 2] << 2) |
3196                 overhangs->left[overhangs->left_len - 1];
3197 
3198             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3199                 if (signal == signals[k]) {
3200                     hsp->map_info->left_edge = signal;
3201                     hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3202 
3203                     found = TRUE;
3204                     break;
3205                 }
3206             }
3207         }
3208 
3209         if (!found && overhangs->left_len >= 1) {
3210             Uint1 signal = (overhangs->left[overhangs->left_len - 1] << 2) |
3211                 query[hsp->query.offset];
3212 
3213             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3214                 if (signal == signals[k]) {
3215                     hsp->map_info->left_edge = signal;
3216                     hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3217 
3218                     /--* trim alignment *--/
3219                     s_TrimHSP(hsp, 1, TRUE, TRUE, -8, 0, -8);
3220 
3221                     /--* update score at some point *--/
3222                     found = TRUE;
3223                     break;
3224                 }
3225             }
3226         }
3227 
3228         for (i = hsp->query.offset;!found && i < 4;i++) {
3229             Uint1 signal = (query[i] << 2) | query[i + 1];
3230             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3231                 if (signal == signals[k]) {
3232                     hsp->map_info->left_edge = signal;
3233                     hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3234 
3235                     /--* trim alignment *--/
3236                     s_TrimHSP(hsp, i + 2, TRUE, TRUE, -8, 0, -8);
3237 
3238                     found = TRUE;
3239                     break;
3240                 }
3241             }
3242         }
3243     }
3244 
3245     /--* search for the splice signal on the right edge, unless the query is
3246        aligned to the end *--/
3247     if (hsp->query.end == query_len) {
3248         hsp->map_info->right_edge = MAPPER_EXON;
3249     }
3250     else {
3251         int k, i;
3252         Boolean found = FALSE;
3253 
3254         if (overhangs->right_len >= 2) {
3255             Uint1 signal = (overhangs->right[0] << 2) | overhangs->right[1];
3256             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3257                 if (signal == signals[k]) {
3258                     hsp->map_info->right_edge = signal;
3259                     hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3260 
3261                     found = TRUE;
3262                     break;
3263                 }
3264             }
3265 
3266         }
3267 
3268         if (!found && overhangs->right_len >= 1) {
3269             Uint1 signal = (query[hsp->query.end - 1] << 2) |
3270                 overhangs->right[0];
3271 
3272             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3273                 if (signal == signals[k]) {
3274                     hsp->map_info->right_edge = signal;
3275                     hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3276 
3277                     /--* update HSP *--/
3278                     s_TrimHSP(hsp, 1, TRUE, FALSE, -8, 0, -8);
3279 
3280                     /--* update score at some point *--/
3281                     found = TRUE;
3282                     break;
3283                 }
3284             }
3285 
3286         }
3287 
3288         for (i = hsp->query.end - 2;!found && i > hsp->query.end - 2 - 4;i--) {
3289             Uint1 signal = (query[i] << 2) | query[i + 1];
3290             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3291                 if (signal == signals[k]) {
3292                     hsp->map_info->right_edge = signal;
3293                     hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3294 
3295                     /--* update HSP *--/
3296                     s_TrimHSP(hsp, hsp->query.end - i, TRUE, FALSE, -8, 0, -8);
3297 
3298                     found = TRUE;
3299                     break;
3300                 }
3301             }
3302         }
3303     }
3304 
3305     return 0;
3306 }
3307 */
3308 
3309 
3310 static Int4
s_TrimOverlap(BlastHSP * first,BlastHSP * second,const Uint1 * query)3311 s_TrimOverlap(BlastHSP* first, BlastHSP* second, const Uint1* query)
3312 {
3313     if (second->query.offset - first->query.end < 0) {
3314         Int4 overlap = first->query.end - second->query.offset;
3315         ASSERT(overlap >= 0);
3316 
3317         if (second->query.end - second->query.offset > overlap) {
3318 
3319             s_TrimHSP(second, overlap, TRUE, TRUE, -4, -4, -4, query);
3320         }
3321         else {
3322             s_TrimHSP(first, overlap, TRUE, FALSE, -4, -4, -4, query);
3323         }
3324 
3325 
3326         ASSERT(first->query.end == second->query.offset);
3327 
3328 #if _DEBUG
3329         s_TestHSPRanges(second);
3330 #endif
3331     }
3332 
3333     if (second->subject.offset - first->subject.end < 0) {
3334         Int4 overlap = first->subject.end - second->subject.offset;
3335         ASSERT(overlap >= 0);
3336 
3337         if (second->subject.end - second->subject.offset > overlap) {
3338 
3339             s_TrimHSP(second, overlap, FALSE, TRUE, -4, -4, -4, query);
3340         }
3341         else {
3342             s_TrimHSP(first, overlap, FALSE, FALSE, -4, -4, -4, query);
3343         }
3344 
3345         ASSERT(first->subject.end == second->subject.offset);
3346 #if _DEBUG
3347         s_TestHSPRanges(second);
3348 #endif
3349     }
3350 
3351     ASSERT(first->query.end <= second->query.offset);
3352     ASSERT(first->subject.end <= second->subject.offset);
3353 
3354     return 0;
3355 }
3356 
3357 
3358 /* Relpace an intron with a gap */
3359 static Int4
s_IntronToGap(HSPContainer * h,HSPContainer * next,const Uint1 * query,const ScoringOptions * scoring_opts)3360 s_IntronToGap(HSPContainer* h, HSPContainer* next, const Uint1* query,
3361               const ScoringOptions* scoring_opts)
3362 {
3363     BlastHSP* new_hsp = NULL;
3364     HSPContainer* following = NULL;
3365 
3366     ASSERT(h);
3367     ASSERT(next);
3368     ASSERT(query);
3369     ASSERT(scoring_opts);
3370 
3371     following = h->next->next;
3372     h->next->next = NULL;
3373 
3374     s_TrimOverlap(h->hsp, next->hsp, query);
3375 
3376 #if _DEBUG
3377     s_TestHSPRanges(h->hsp);
3378 #endif
3379 
3380 
3381     /* merge HSPs */
3382     new_hsp = s_MergeHSPs(h->hsp, h->next->hsp, query,
3383                           scoring_opts);
3384 
3385 
3386 #if _DEBUG
3387     s_TestHSPRanges(new_hsp);
3388 #endif
3389 
3390 
3391     if (new_hsp) {
3392 
3393         /* replace the two processed HSPs with the combined one */
3394         Blast_HSPFree(h->hsp);
3395         HSPContainerFree(h->next);
3396         h->hsp = new_hsp;
3397         h->next = following;
3398     }
3399     else {
3400         /* something went wrong with merging, use the initial HSPs */
3401 
3402         /* remove splice signal */
3403         h->hsp->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
3404         next->hsp->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
3405 
3406         h->next->next = following;
3407         h = h->next;
3408 
3409         ASSERT(!h->next ||
3410                (h->hsp->query.end <= h->next->hsp->query.offset &&
3411                 h->hsp->subject.end <= h->next->hsp->subject.offset));
3412     }
3413 
3414     return 0;
3415 }
3416 
3417 
3418 /* Search for splice signals between two HSPs in a chain. The HSPs in the
3419    chain must be sorted by query position in asceding order.
3420 */
3421 static Int4
s_FindSpliceJunctions(HSPChain * chains,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * scoring_opts)3422 s_FindSpliceJunctions(HSPChain* chains,
3423                       const BLAST_SequenceBlk* query_blk,
3424                       const BlastQueryInfo* query_info,
3425                       const ScoringOptions* scoring_opts)
3426 {
3427     HSPChain* ch = NULL;
3428 
3429     if (!chains || !query_blk || !query_info || !scoring_opts) {
3430         return -1;
3431     }
3432 
3433     /* iterate over HSPs in the chain */
3434     for (ch = chains; ch; ch = ch->next) {
3435         HSPContainer* h = ch->hsps;
3436         Uint1* query = NULL;
3437         Int4 context;
3438         Int4 query_len;
3439         ASSERT(h);
3440 
3441         context = h->hsp->context;
3442         query = query_blk->sequence +
3443             query_info->contexts[context].query_offset;
3444         query_len = query_info->contexts[context].query_length;
3445 
3446         while (h->next) {
3447             HSPContainer* next = h->next;
3448             ASSERT(next);
3449 
3450             /* if not a spliced alignment, try merging HSPs into one */
3451             /* Introns are typically at least 30 bases long, and there can be
3452                a few unalined query bases. */
3453             if ((next->hsp->subject.offset - h->hsp->subject.end -
3454                  (next->hsp->query.offset - h->hsp->query.end) < 30) &&
3455 
3456                 /* this condition is needed to align unaligned query bases */
3457                 next->hsp->subject.offset - h->hsp->subject.end <
3458                 h->hsp->map_info->subject_overhangs->right_len) {
3459 
3460                 /* save pointer to hsps after next */
3461                 HSPContainer* following = h->next->next;
3462                 BlastHSP* new_hsp = NULL;
3463 
3464                 /* duplicate HSPContainer with the two HSPs */
3465                 h->next->next = NULL;
3466 
3467                 s_TrimOverlap(h->hsp, next->hsp, query);
3468 
3469                 /* extend the first HSP to cover the gap between HSPs */
3470                 if (next->hsp->query.offset - h->hsp->query.end > 1) {
3471                     BlastHSP* first = h->hsp;
3472                     BlastHSP* second = h->next->hsp;
3473 
3474                     s_ExtendAlignment(first, query, first->query.end,
3475                               second->query.offset - 1, 0,
3476                               second->subject.offset - 1 - first->subject.end,
3477                               scoring_opts, FALSE);
3478 
3479 #if _DEBUG
3480                     s_TestHSPRanges(first);
3481 #endif
3482                 }
3483 
3484 
3485                 /* merge HSPs */
3486                 new_hsp = s_MergeHSPs(h->hsp, h->next->hsp, query,
3487                                       scoring_opts);
3488 
3489 
3490 #if _DEBUG
3491                     s_TestHSPRanges(new_hsp);
3492 #endif
3493 
3494 
3495                 if (new_hsp) {
3496 
3497                     /* replace the two processed HSPs with the combined one */
3498                     Blast_HSPFree(h->hsp);
3499                     HSPContainerFree(h->next);
3500                     h->hsp = new_hsp;
3501                     h->next = following;
3502                 }
3503                 else {
3504                     /* something went wrong with merging, use the initial
3505                        HSPs */
3506                     h->next->next = following;
3507                     h = h->next;
3508 
3509                     ASSERT(!h->next ||
3510                            (h->hsp->query.end <= h->next->hsp->query.offset &&
3511                             h->hsp->subject.end <= h->next->hsp->subject.offset));
3512                 }
3513             }
3514             /* process overlap if found */
3515             else if (next->hsp->query.offset <= h->hsp->query.end &&
3516                      next->hsp->query.offset > h->hsp->query.offset) {
3517 
3518                 Boolean consensus_only = TRUE;
3519                 if (h->hsp->score > 50 && next->hsp->score > 50) {
3520                     consensus_only = FALSE;
3521                 }
3522 
3523                 s_FindSpliceJunctionsForOverlaps(h->hsp, next->hsp, query,
3524                                                  query_len, consensus_only);
3525 
3526 
3527                 if ((h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) == 0) {
3528 
3529                     s_TrimOverlap(h->hsp, next->hsp, query);
3530                 }
3531 
3532                 /* if the intron is shorter than 30 bases, replace it with
3533                    deletion and merge HSPs */
3534                 if ((h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) != 0 &&
3535                     next->hsp->subject.offset - h->hsp->subject.end < 30) {
3536 
3537 
3538                     s_IntronToGap(h, next, query, scoring_opts);
3539 
3540                 }
3541                 else {
3542 
3543 #if _DEBUG
3544                     s_TestHSPRanges(h->hsp);
3545 #endif
3546 
3547                     h = h->next;
3548                 }
3549             }
3550             else if (next->hsp->query.offset - h->hsp->query.end > 0 &&
3551                      next->hsp->query.offset - h->hsp->query.end <
3552                      h->hsp->map_info->subject_overhangs->right_len) {
3553 
3554                 s_FindSpliceJunctionsForGap(h->hsp, next->hsp, query,
3555                                             query_len, scoring_opts);
3556 
3557 
3558                 if ((h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) == 0) {
3559 
3560                     s_TrimOverlap(h->hsp, next->hsp, query);
3561                 }
3562 
3563 
3564 
3565                 /* if the intron is shorter than 30 bases, replace it with
3566                    deletion and merge HSPs */
3567                 if ((h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) != 0 &&
3568                     next->hsp->subject.offset - h->hsp->subject.end < 30) {
3569 
3570 
3571                     s_IntronToGap(h, next, query, scoring_opts);
3572 
3573                 }
3574                 else {
3575 
3576 
3577 #if _DEBUG
3578                     s_TestHSPRanges(h->hsp);
3579 #endif
3580 
3581                     h = h->next;
3582                 }
3583             }
3584 
3585 
3586             else {
3587 
3588                 s_TrimOverlap(h->hsp, next->hsp, query);
3589 
3590 #if _DEBUG
3591                     s_TestHSPRanges(h->hsp);
3592 #endif
3593 
3594                 h = h->next;
3595             }
3596 
3597         }
3598 
3599         /* recalculated chain score */
3600         ch->score = s_ComputeChainScore(ch, scoring_opts, query_len, TRUE);
3601     }
3602 
3603     s_TestChains(chains);
3604 
3605     return 0;
3606 }
3607 
3608 
3609 /* Find the best scoring chain of HSPs for aligning a single RNA-Seq read to
3610    a genome on a single strand, returns all top scoring chains */
s_FindBestPath(HSPNode * nodes,Int4 num,HSPPath * path,Int4 oid,Boolean is_spliced,Int4 longest_intron,Int4 cutoff_score,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * scoring_opts)3611 static HSPChain* s_FindBestPath(HSPNode* nodes, Int4 num, HSPPath* path,
3612                                 Int4 oid, Boolean is_spliced,
3613                                 Int4 longest_intron,
3614                                 Int4 cutoff_score,
3615                                 const BLAST_SequenceBlk* query_blk,
3616                                 const BlastQueryInfo* query_info,
3617                                 const ScoringOptions* scoring_opts)
3618 {
3619     int i, k;
3620     Int4 best_score = 0;
3621     const Int4 kMaxIntronLength = longest_intron;
3622     /* FIXME: use mismatch and/or gap extend penalty here */
3623     HSPChain* retval = NULL;
3624 
3625     if (!path || !nodes || !num) {
3626         return NULL;
3627     }
3628     path->num_paths = 0;
3629     path->score = 0;
3630 
3631     for (i = num - 1;i >= 0;i--) {
3632 
3633         BlastHSP* hsp = *(nodes[i].hsp);
3634         int self_score = nodes[i].best_score;
3635         /* FIXME: the is_spliced condition is a hack */
3636         for (k = i + 1;k < num && is_spliced;k++) {
3637 
3638             BlastHSP* newhsp = *(nodes[k].hsp);
3639             Int4 overlap_cost = s_GetOverlapCost(newhsp, *(nodes[i].hsp), 4);
3640             Int4 new_score = nodes[k].best_score + self_score - overlap_cost;
3641 
3642             /* FIXME: some of the conditions double others */
3643             const Int4 hsp_len = hsp->query.end - hsp->query.offset;
3644             const Int4 newhsp_len = newhsp->query.end - newhsp->query.offset;
3645             const Int4 overlap_len = MAX(MIN(hsp->query.end, newhsp->query.end) -
3646                               MAX(hsp->query.offset, newhsp->query.offset), 0);
3647 
3648             const Int4 subj_overlap_len =
3649                 MAX(MIN(hsp->subject.end, newhsp->subject.end) -
3650                     MAX(hsp->subject.offset, newhsp->subject.offset), 0);
3651 
3652 
3653             /* add next HSP to the chain only if hsp, newhsp aligns to the
3654                subject behind hsp, and score improves */
3655             if (newhsp->query.offset > hsp->query.offset &&
3656                 newhsp->query.end > hsp->query.end &&
3657 
3658                 newhsp->subject.offset > hsp->subject.offset &&
3659                 newhsp->subject.end > hsp->subject.end &&
3660                 newhsp->subject.offset - hsp->subject.end < kMaxIntronLength &&
3661                 (double)overlap_len / hsp_len < 0.75 &&
3662                 (double)overlap_len / newhsp_len < 0.75 &&
3663                 (double)subj_overlap_len / hsp_len < 0.75 &&
3664                 (double)subj_overlap_len / newhsp_len < 0.75) {
3665 
3666                 /* FIXME: this condition may not be necessary */
3667                 if (newhsp->subject.offset - hsp->subject.end > 1) {
3668                     /* The function that finds splice signals modifies
3669                        HSPs, so we need to clone HSPs here. */
3670                     BlastHSP* hsp_copy = Blast_HSPClone(hsp);
3671                     BlastHSP* newhsp_copy = Blast_HSPClone(newhsp);
3672 
3673                     HSPChain* chain = HSPChainNew(hsp->context);
3674                     chain->hsps = HSPContainerNew(&hsp_copy);
3675                     chain->hsps->next = HSPContainerNew(&newhsp_copy);
3676 
3677                     s_FindSpliceJunctions(chain, query_blk, query_info,
3678                                           scoring_opts);
3679 
3680                     /* update score: add the difference between sum of
3681                        two HSP scores minus overalp and the new score
3682                        for the merged HSP */
3683                     new_score += chain->score + overlap_cost -
3684                         (newhsp->score + self_score);
3685 
3686                     chain = HSPChainFree(chain);
3687                 }
3688                 else if (newhsp->query.offset - hsp->query.end == 1) {
3689                     /* FIXME: this is a temporary hack, we need to increase
3690                        the score of combining two HSP into one with a
3691                        mismatch so that it is scored the same as the one with
3692                        splice signal */
3693                     new_score++;
3694                 }
3695                 else {
3696                     new_score = 0;
3697                 }
3698 
3699                 if (new_score > nodes[i].best_score) {
3700                     nodes[i].best_score = new_score;
3701                     nodes[i].path_next = &(nodes[k]);
3702                 }
3703             }
3704         }
3705 
3706         /* we want to return all top scoring hits, so we save all paths with
3707            top score */
3708         if (nodes[i].best_score == best_score) {
3709             if (path->num_paths < MAX_NUM_HSP_PATHS) {
3710                 path->start[path->num_paths++] = &(nodes[i]);
3711             }
3712         }
3713         else if (nodes[i].best_score > best_score) {
3714             best_score = nodes[i].best_score;
3715             path->start[0] = &(nodes[i]);
3716             path->num_paths = 1;
3717         }
3718     }
3719     ASSERT(path->num_paths);
3720     path->score = path->start[0]->best_score;
3721 
3722     /* Find if any paths share an HSP and remove the one with the larger index.
3723        Doing this at the end ensures that we find optimal path in terms of
3724        score and/or HSP positions. */
3725     for (i = 0;i < path->num_paths - 1;i++) {
3726         if (!path->start[i]) {
3727             continue;
3728         }
3729         for (k = i + 1;k < path->num_paths;k++) {
3730             HSPNode* node_i = path->start[i];
3731             if (!path->start[k]) {
3732                 continue;
3733             }
3734 
3735             while (node_i) {
3736                 HSPNode* node_k = path->start[k];
3737                 while (node_k) {
3738                     if (node_i->hsp[0] == node_k->hsp[0]) {
3739                         path->start[k] = NULL;
3740                         break;
3741                     }
3742                     node_k = node_k->path_next;
3743                 }
3744                 node_i = node_i->path_next;
3745             }
3746 
3747         }
3748     }
3749 
3750     ASSERT(path->num_paths > 0);
3751     ASSERT(path->start[0]);
3752 
3753     /* conver from HSPPath to HSPChain structure */
3754     if (path->num_paths > 0) {
3755         HSPChain* ch = NULL;
3756         HSPContainer* hc = NULL;
3757         HSPNode* node = path->start[0];
3758 
3759         retval = HSPChainNew((*node->hsp)->context);
3760         if (!retval) {
3761             return NULL;
3762         }
3763         retval->oid = oid;
3764         retval->score = path->score;
3765         retval->hsps = hc = HSPContainerNew(node->hsp);
3766         node = node->path_next;
3767         while (node) {
3768             hc->next = HSPContainerNew(node->hsp);
3769             if (!hc->next) {
3770                 HSPChainFree(retval);
3771                 return NULL;
3772             }
3773             hc = hc->next;
3774             node = node->path_next;
3775         }
3776         ASSERT(retval->hsps);
3777 
3778         ch = retval;
3779         for (i = 1;i < path->num_paths;i++) {
3780             node = path->start[i];
3781             if (!node) {
3782                 continue;
3783             }
3784 
3785             if (path->score < cutoff_score) {
3786                 continue;
3787             }
3788 
3789             ch->next = HSPChainNew((*node->hsp)->context);
3790             ASSERT(ch->next);
3791             if (!ch->next) {
3792                 HSPChainFree(retval);
3793                 return NULL;
3794             }
3795             ch->next->oid = oid;
3796             ch->next->score = path->score;
3797             ch->next->hsps = hc = HSPContainerNew(node->hsp);
3798             node = node->path_next;
3799             while (node) {
3800                 hc->next = HSPContainerNew(node->hsp);
3801                 ASSERT(hc->next);
3802                 if (!hc->next) {
3803                     HSPChainFree(retval);
3804                     return NULL;
3805                 }
3806                 hc = hc->next;
3807                 node = node->path_next;
3808             }
3809             ch = ch->next;
3810             ASSERT(ch->hsps);
3811         }
3812     }
3813 
3814     ASSERT(path->num_paths == 0 || s_TestChains(retval));
3815 
3816     return retval;
3817 }
3818 
3819 /* Compare HSPs by context (accending), and score (descending). */
3820 static int
s_CompareHSPsByContextScore(const void * v1,const void * v2)3821 s_CompareHSPsByContextScore(const void* v1, const void* v2)
3822 {
3823    BlastHSP* h1,* h2;
3824    BlastHSP** hp1,** hp2;
3825 
3826    hp1 = (BlastHSP**) v1;
3827    hp2 = (BlastHSP**) v2;
3828    h1 = *hp1;
3829    h2 = *hp2;
3830 
3831    if (!h1 && !h2)
3832       return 0;
3833    else if (!h1)
3834       return -1;
3835    else if (!h2)
3836       return 1;
3837 
3838    if (h1->context < h2->context)
3839       return -1;
3840    if (h1->context > h2->context)
3841       return 1;
3842 
3843    if (h1->score < h2->score)
3844       return 1;
3845    if (h1->score > h2->score)
3846       return -1;
3847 
3848    return 0;
3849 }
3850 
3851 static int
s_CompareHSPsByContextSubjectOffset(const void * v1,const void * v2)3852 s_CompareHSPsByContextSubjectOffset(const void* v1, const void* v2)
3853 {
3854    BlastHSP* h1,* h2;
3855    BlastHSP** hp1,** hp2;
3856 
3857    hp1 = (BlastHSP**) v1;
3858    hp2 = (BlastHSP**) v2;
3859    h1 = *hp1;
3860    h2 = *hp2;
3861 
3862    if (!h1 && !h2)
3863       return 0;
3864    else if (!h1)
3865       return 1;
3866    else if (!h2)
3867       return -1;
3868 
3869    /* If these are from different contexts, don't compare offsets */
3870    if (h1->context < h2->context)
3871       return -1;
3872    if (h1->context > h2->context)
3873       return 1;
3874 
3875    if (h1->subject.offset < h2->subject.offset)
3876       return -1;
3877    if (h1->subject.offset > h2->subject.offset)
3878       return 1;
3879 
3880    if (h1->query.offset < h2->query.offset)
3881       return -1;
3882    if (h1->query.offset > h2->query.offset)
3883       return 1;
3884 
3885    /* tie breakers: sort by decreasing score, then
3886       by increasing size of query range, then by
3887       increasing subject range. */
3888 
3889    if (h1->score < h2->score)
3890       return 1;
3891    if (h1->score > h2->score)
3892       return -1;
3893 
3894    if (h1->query.end < h2->query.end)
3895       return 1;
3896    if (h1->query.end > h2->query.end)
3897       return -1;
3898 
3899    if (h1->subject.end < h2->subject.end)
3900       return 1;
3901    if (h1->subject.end > h2->subject.end)
3902       return -1;
3903 
3904    return 0;
3905 }
3906 
3907 
3908 /* Pair data, used for selecting optimal pairs of aligned chains */
3909 typedef struct Pairinfo
3910 {
3911     HSPChain* first;
3912     HSPChain* second;
3913     Int4 distance;
3914     Int4 conf;
3915     Int4 score;
3916     Int4 trim_first;    // when a chain overextends past beginning of the pair
3917     Int4 trim_second;   // it needs to be trimmed
3918     Boolean valid_pair;
3919 } Pairinfo;
3920 
3921 
s_BlastHSPMapperSplicedRunCleanUp(HSPPath * path,HSPNode * nodes,Pairinfo * pair_data)3922 static void s_BlastHSPMapperSplicedRunCleanUp(HSPPath* path,
3923                                               HSPNode* nodes,
3924                                               Pairinfo* pair_data)
3925 {
3926     HSPPathFree(path);
3927 
3928     if (nodes) {
3929         sfree(nodes);
3930     }
3931 
3932     if (pair_data) {
3933         sfree(pair_data);
3934     }
3935 }
3936 
3937 
3938 /* Compare two HSP chain pairs aligned for the same query and subject
3939    by score and distance */
3940 static int
s_ComparePairs(const void * v1,const void * v2)3941 s_ComparePairs(const void* v1, const void* v2)
3942 {
3943     Pairinfo* a = (Pairinfo*)v1;
3944     Pairinfo* b = (Pairinfo*)v2;
3945 
3946     if (a->score > b->score) {
3947         return -1;
3948     }
3949     if (a->score < b->score) {
3950         return 1;
3951     }
3952 
3953     if (a->conf < b->conf) {
3954         return -1;
3955     }
3956     if (a->conf > b->conf) {
3957         return 1;
3958     }
3959 
3960     if (a->distance < b->distance) {
3961         return -1;
3962     }
3963     if (a->distance > b->distance) {
3964         return 1;
3965     }
3966 
3967     return 0;
3968 }
3969 
3970 
3971 /* Trim beginning of a chain alignment up to the given subject position */
s_TrimChainStartToSubjPos(HSPChain * chain,Int4 subj_pos,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info)3972 static Int4 s_TrimChainStartToSubjPos(HSPChain* chain, Int4 subj_pos,
3973                                       Int4 mismatch_score,
3974                                       Int4 gap_open_score,
3975                                       Int4 gap_extend_score,
3976                                       const BLAST_SequenceBlk* query_blk,
3977                                       const BlastQueryInfo* query_info)
3978 {
3979     HSPContainer* h = NULL;
3980     BlastHSP* hsp = NULL;
3981     BlastHSP* next_hsp = NULL;
3982     Int4 num_left;
3983     Int4 old_score;
3984     Uint1* query = NULL;
3985 
3986     ASSERT(chain);
3987     ASSERT(subj_pos >= 0);
3988     if (!chain || subj_pos < 0) {
3989         return 0;
3990     }
3991 
3992     query = query_blk->sequence +
3993         query_info->contexts[chain->context].query_offset;
3994 
3995     /* first remove HSPs that are fully below the subject position */
3996     h = chain->hsps;
3997     while (h && h->hsp->subject.end < subj_pos) {
3998         HSPContainer* tmp = h;
3999         ASSERT(tmp);
4000         h = h->next;
4001         tmp->next = NULL;
4002         chain->score -= tmp->hsp->score;
4003         HSPContainerFree(tmp);
4004     }
4005     ASSERT(h);
4006     chain->hsps = h;
4007     if (!h) {
4008         return -1;
4009     }
4010 
4011     /* if all remaning HSPs are above the subject position, then exit */
4012     if (h->hsp->subject.offset >= subj_pos) {
4013         return 0;
4014     }
4015 
4016     /* otherwise move HSP start positions */
4017     hsp = h->hsp;
4018     num_left = subj_pos - hsp->subject.offset;
4019     ASSERT(num_left > 0 && num_left <= hsp->subject.end - hsp->subject.offset);
4020 
4021     old_score = hsp->score;
4022     s_TrimHSP(hsp, num_left, FALSE, TRUE, mismatch_score, gap_open_score,
4023               gap_extend_score, query);
4024 
4025     /* update chain score */
4026     chain->score -= old_score - hsp->score;
4027 
4028     /* remove splice signals and exon flags */
4029     hsp->map_info->left_edge &= 0xff ^ MAPPER_SPLICE_SIGNAL;
4030     hsp->map_info->left_edge &= 0xff ^ MAPPER_EXON;
4031 
4032     /* check whether the HSP is contained within the next one and remove it
4033        if it is */
4034     next_hsp = NULL;
4035     if (chain->hsps->next) {
4036         next_hsp = chain->hsps->next->hsp;
4037     }
4038     if (next_hsp && hsp->query.offset >= next_hsp->query.offset) {
4039         chain->hsps = h->next;
4040         h->next = NULL;
4041         HSPContainerFree(h);
4042     }
4043 
4044     return 0;
4045 }
4046 
4047 /* Trim chain alignment end up the the given subject position */
s_TrimChainEndToSubjPos(HSPChain * chain,Int4 subj_pos,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info)4048 static Int4 s_TrimChainEndToSubjPos(HSPChain* chain, Int4 subj_pos,
4049                                     Int4 mismatch_score,
4050                                     Int4 gap_open_score, Int4 gap_extend_score,
4051                                     const BLAST_SequenceBlk* query_blk,
4052                                     const BlastQueryInfo* query_info)
4053 {
4054     HSPContainer* h = NULL;
4055     BlastHSP* hsp = NULL;
4056     Int4 num_left;
4057     Int4 old_score;
4058     Uint1* query = NULL;
4059 
4060     ASSERT(chain);
4061     ASSERT(subj_pos >= 0);
4062     if (!chain || subj_pos <= 0 || !query_blk || !query_info) {
4063         return -1;
4064     }
4065 
4066     query = query_blk->sequence +
4067         query_info->contexts[chain->context].query_offset;
4068 
4069     /* first remove HSPs that are full above the subject position */
4070     h = chain->hsps;
4071     while (h->next && h->next->hsp->subject.end < subj_pos) {
4072         h = h->next;
4073     }
4074 
4075     if (h->next) {
4076         HSPContainer* hc = NULL;
4077         if (h->next->hsp->subject.offset < subj_pos) {
4078             h = h->next;
4079         }
4080 
4081         /* update chain score */
4082         for (hc = h->next;hc != NULL;hc = hc->next) {
4083             chain->score -= hc->hsp->score;
4084         }
4085 
4086         HSPContainerFree(h->next);
4087         h->next = NULL;
4088     }
4089 
4090     /* if all remaning HSPs are below the subject position, exit */
4091     if (h->hsp->subject.end <= subj_pos) {
4092         return 0;
4093     }
4094 
4095     /* otheriwse move HSP end positions */
4096     hsp = h->hsp;
4097     num_left = hsp->subject.end - subj_pos;
4098     ASSERT(num_left > 0 && num_left <= hsp->subject.end - hsp->subject.offset);
4099 
4100     old_score = hsp->score;
4101     s_TrimHSP(hsp, num_left, FALSE, FALSE, mismatch_score, gap_open_score,
4102               gap_extend_score, query);
4103 
4104     /* update chain score */
4105     chain->score -= old_score - hsp->score;
4106 
4107     /* remove splice signals and exon flags */
4108     hsp->map_info->right_edge &= 0xff ^ MAPPER_SPLICE_SIGNAL;
4109     hsp->map_info->right_edge &= 0xff ^ MAPPER_EXON;
4110 
4111     /* check whether the HSP is contained within the next one and remove it
4112        if it is */
4113     if (chain->hsps != h) {
4114         HSPContainer* hc = chain->hsps;
4115         while (hc && hc->next != h) {
4116             hc = hc->next;
4117         }
4118         ASSERT(hc && hc->next == h);
4119         if (hc->hsp->query.end >= h->hsp->query.end) {
4120             chain->score -= h->hsp->score;
4121             HSPContainerFree(h);
4122             hc->next = NULL;
4123         }
4124     }
4125 
4126     return 0;
4127 }
4128 
4129 
4130 /* Find optimal pairs */
s_FindBestPairs(HSPChain ** first_list,HSPChain ** second_list,Int4 min_score,Pairinfo ** pair_info_ptr,Int4 * max_num_pairs,Boolean is_spliced,const ScoringOptions * scoring_options,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info)4131 static Boolean s_FindBestPairs(HSPChain** first_list,
4132                                HSPChain** second_list,
4133                                Int4 min_score,
4134                                Pairinfo** pair_info_ptr,
4135                                Int4* max_num_pairs,
4136                                Boolean is_spliced,
4137                                const ScoringOptions* scoring_options,
4138                                const BLAST_SequenceBlk* query_blk,
4139                                const BlastQueryInfo* query_info)
4140 {
4141     HSPChain* first;
4142     HSPChain* second;
4143     Pairinfo* pair_info = *pair_info_ptr;
4144     Int4 conv_bonus = 0;
4145     Int4 num_pairs = 0;
4146     Boolean found = FALSE;
4147     const Int4 kMaxInsertSize = is_spliced ?
4148         MAGICBLAST_MAX_INSERT_SIZE_SPLICED :
4149         MAGICBLAST_MAX_INSERT_SIZE_NONSPLICED;
4150 
4151     /* iterate over all pairs of HSP chains for the first and second read of
4152        the pair and collect pair information */
4153     for (first = *first_list; first; first = first->next) {
4154         for (second = *second_list; second; second = second->next) {
4155             Int2 first_frame = first->hsps->hsp->query.frame;
4156             Int2 second_frame = second->hsps->hsp->query.frame;
4157 
4158             /* reallocate pair info array if necessary */
4159             if (num_pairs >= *max_num_pairs) {
4160                 *max_num_pairs *= 2;
4161                 Pairinfo* new_pair_info =
4162                     realloc(pair_info, *max_num_pairs * sizeof(Pairinfo));
4163                 if (!new_pair_info) {
4164                     return -1;
4165                 }
4166                 pair_info = new_pair_info;
4167                 *pair_info_ptr = new_pair_info;
4168             }
4169 
4170             /* collect pair information */
4171             pair_info[num_pairs].first = first;
4172             pair_info[num_pairs].second = second;
4173             pair_info[num_pairs].score = first->score + second->score;
4174             pair_info[num_pairs].trim_first = 0;
4175             pair_info[num_pairs].trim_second = 0;
4176             pair_info[num_pairs].valid_pair = 0;
4177             pair_info[num_pairs].distance = 0;
4178 
4179             /* if the chains align on the opposite strands */
4180             ASSERT(first_frame != 0 && second_frame != 0);
4181             if (SIGN(first_frame) != SIGN(second_frame)) {
4182                 HSPChain* plus = NULL, *minus = NULL;
4183                 Int4 plus_start, minus_start;
4184                 HSPContainer* hsp = NULL;
4185                 Int4 distance = 0;
4186 
4187                 /* find which query aligns to plus and which to minus strand
4188                    on the subject */
4189                 if (first_frame > 0) {
4190                     plus = first;
4191                 }
4192                 else {
4193                     minus = first;
4194                 }
4195                 if (second_frame > 0) {
4196                     plus = second;
4197                 }
4198                 else {
4199                     minus = second;
4200                 }
4201                 ASSERT(plus && minus);
4202 
4203                 /* find start positions on the subject, for the minus strand we
4204                    are interested subject start position on the minus strand,
4205                    but the HSP has subject on plus and query on the minus
4206                    strand, so we need to flip the query */
4207                 plus_start = plus->hsps->hsp->subject.offset;
4208                 hsp = minus->hsps;
4209                 while (hsp->next) {
4210                     hsp = hsp->next;
4211                 }
4212                 ASSERT(hsp);
4213                 minus_start = hsp->hsp->subject.end;
4214 
4215                 /* this is also fragment length */
4216                 distance = minus_start - plus_start;
4217                 pair_info[num_pairs].distance = distance;
4218 
4219                 /* distance > 0 indicates a convergent pair (typical) */
4220                 if (distance > 0 && distance < kMaxInsertSize) {
4221                     Int4 plus_end, minus_end;
4222                     hsp = plus->hsps;
4223                     while (hsp->next) {
4224                         hsp = hsp->next;
4225                     }
4226                     ASSERT(hsp);
4227                     plus_end = hsp->hsp->subject.end;
4228                     minus_end = minus->hsps->hsp->subject.offset;
4229 
4230                     /* HSP chain end going past the beginning of the chain
4231                        for the pair, indicates that the this end of the
4232                        query may be aligned to an adapater instead of a real
4233                        sequence and needs to be trimmed */
4234                     if (plus_end > minus_start || minus_end < plus_start) {
4235                         ASSERT(plus && minus);
4236 
4237                         /* check the 3' (right) side of the fragment */
4238                         if (plus_end > minus_start) {
4239                             ASSERT(plus_end - minus_start > 0);
4240 
4241                             /* decrease the score by the trimmed portion; we
4242                                assume here that the trimmed part aligns
4243                                perfectly, which may not be the case, but this
4244                                score is only used for comparison with different
4245                                pairs */
4246                             pair_info[num_pairs].score -=
4247                                 plus_end - minus_start;
4248 
4249                             /* Mark which chain and up to what subject location
4250                                needs to be trimmed. We cannot trim here,
4251                                because pair_info holds poiners to chains that
4252                                may be part of better pairs */
4253                             if (plus == pair_info[num_pairs].first) {
4254                                 pair_info[num_pairs].trim_first = minus_start;
4255                             }
4256                             else {
4257                                 pair_info[num_pairs].trim_second = minus_start;
4258                             }
4259                         }
4260 
4261                         /* check the 5' (left) side doe the fragment */
4262                         if (minus_end < plus_start) {
4263                             ASSERT(plus_start - minus_end > 0);
4264 
4265                             pair_info[num_pairs].score -=
4266                                 plus_start - minus_end;
4267 
4268                             if (minus == pair_info[num_pairs].first) {
4269                                 pair_info[num_pairs].trim_first = plus_start;
4270                             }
4271                             else {
4272                                 pair_info[num_pairs].trim_second = plus_start;
4273                             }
4274                         }
4275                     }
4276 
4277                     pair_info[num_pairs].conf = PAIR_CONVERGENT;
4278 
4279                     /* add a bonus for convergent pairs; because they are more
4280                        common we prefer them over different configurations
4281                        with a sligntly better score */
4282                     pair_info[num_pairs].score += conv_bonus;
4283                 }
4284                 else {
4285                     pair_info[num_pairs].conf = PAIR_DIVERGENT;
4286                 }
4287 
4288                 /* skip pairs with a score smaller than one found for a
4289                    different subject */
4290                 if (pair_info[num_pairs].score < min_score) {
4291                     continue;
4292                 }
4293 
4294                 num_pairs++;
4295             }
4296             else {
4297                 /* for read pairs aligned in the same direction */
4298                 pair_info[num_pairs].conf = PAIR_PARALLEL;
4299                 pair_info[num_pairs].score -= 1;
4300                 num_pairs++;
4301             }
4302         }
4303     }
4304 
4305     if (num_pairs > 0) {
4306         Int4 i;
4307         Int4 best_score = pair_info[0].score;
4308         Int4 margin = 5;
4309         HSPChain* ch = NULL;
4310         Boolean convergent_found = FALSE;
4311 
4312         /* sort pair information by score, configuration distance */
4313         qsort(pair_info, num_pairs, sizeof(Pairinfo), s_ComparePairs);
4314 
4315         /* iterate over pair information and collect pairs */
4316         for (i=0;i < num_pairs;i++) {
4317             HSPChain* f = pair_info[i].first;
4318             HSPChain* s = pair_info[i].second;
4319 
4320             /* skip pairs with chains already used in other pairs
4321                and other configurations when convergent one was found */
4322             if (pair_info[i].first->pair || pair_info[i].second->pair ||
4323                 (convergent_found && pair_info[i].conf != PAIR_CONVERGENT)) {
4324 
4325                 continue;
4326             }
4327 
4328             if (pair_info[i].conf == PAIR_CONVERGENT) {
4329                 convergent_found = TRUE;
4330             }
4331 
4332             /* skip pairs with scores significantly smaller than the best one
4333                found */
4334             if (best_score - pair_info[i].score > margin) {
4335                 break;
4336             }
4337 
4338             /* link chains as a pair */
4339             f->pair = s;
4340             s->pair = f;
4341             f->pair_conf = s->pair_conf = pair_info[i].conf;
4342             pair_info[i].valid_pair = TRUE;
4343             found = TRUE;
4344         }
4345 
4346         /* for chains that pair with already paired chains, clone the paired
4347            chain */
4348         for (ch = *first_list; ch; ch = ch->next) {
4349             HSPChain* pair = NULL;
4350 
4351             if (ch->pair) {
4352                 continue;
4353             }
4354 
4355             i = 0;
4356             while (i < num_pairs &&
4357                    (pair_info[i].first != ch || !pair_info[i].second->pair ||
4358                     pair_info[i].conf != PAIR_CONVERGENT ||
4359                     best_score - pair_info[i].score > margin)) {
4360                 i++;
4361             }
4362             if (i >= num_pairs) {
4363                 continue;
4364             }
4365             ASSERT(pair_info[i].second->pair);
4366 
4367             pair = CloneChain(pair_info[i].second);
4368             ASSERT(pair);
4369             ASSERT(s_TestChains(pair));
4370             pair_info[i].second = pair;
4371             ch->pair = pair;
4372             pair->pair = ch;
4373             ch->pair_conf = pair->pair_conf = pair_info[i].conf;
4374             pair_info[i].valid_pair = TRUE;
4375             HSPChainListInsert(second_list, &pair, 0, -1, FALSE);
4376         }
4377 
4378         for (ch = *second_list; ch; ch = ch->next) {
4379             HSPChain* pair = NULL;
4380 
4381             if (ch->pair) {
4382                 continue;
4383             }
4384 
4385             i = 0;
4386             while (i < num_pairs &&
4387                    (pair_info[i].second != ch || !pair_info[i].first->pair ||
4388                     pair_info[i].conf != PAIR_CONVERGENT ||
4389                     best_score - pair_info[i].score > margin)) {
4390                 i++;
4391             }
4392             if (i >= num_pairs) {
4393                 continue;
4394             }
4395             ASSERT(pair_info[i].first->pair);
4396 
4397             pair = CloneChain(pair_info[i].first);
4398             ASSERT(pair);
4399             ASSERT(s_TestChains(pair));
4400             pair_info[i].first = pair;
4401             ch->pair = pair;
4402             pair->pair = ch;
4403             ch->pair_conf = pair->pair_conf = pair_info[i].conf;
4404             pair_info[i].valid_pair = TRUE;
4405             HSPChainListInsert(first_list, &pair, 0, -1, FALSE);
4406         }
4407 
4408         /* trim the alignments that overextend beyond the pair */
4409         for (i = 0;i < num_pairs;i++) {
4410             if (!pair_info[i].valid_pair ||
4411                 !pair_info[i].first->pair || !pair_info[i].second->pair) {
4412                 continue;
4413             }
4414 
4415             /* trim the first chain if its end goes beyond the start of
4416                second on the subject */
4417             if (pair_info[i].trim_first > 0) {
4418                 if (pair_info[i].first->hsps->hsp->query.frame > 0) {
4419                     s_TrimChainEndToSubjPos(pair_info[i].first,
4420                                             pair_info[i].trim_first,
4421                                             scoring_options->penalty,
4422                                             scoring_options->gap_open,
4423                                             scoring_options->gap_extend,
4424                                             query_blk,
4425                                             query_info);
4426                 }
4427                 else {
4428                     /* minus strand of the qurey is represented by another
4429                        (reverse complemented) sequence aligned like the plus
4430                        strand, so the end of the chain is really the beginnig
4431                        by query coordinates */
4432                     s_TrimChainStartToSubjPos(pair_info[i].first,
4433                                               pair_info[i].trim_first,
4434                                               scoring_options->penalty,
4435                                               scoring_options->gap_open,
4436                                               scoring_options->gap_extend,
4437                                               query_blk,
4438                                               query_info);
4439                 }
4440             }
4441 
4442             /* trim the second chain if its end goes beyond the start of the
4443                first on the subject */
4444             if (pair_info[i].trim_second > 0) {
4445                 if (pair_info[i].second->hsps->hsp->query.frame > 0) {
4446                     s_TrimChainEndToSubjPos(pair_info[i].second,
4447                                             pair_info[i].trim_second,
4448                                             scoring_options->penalty,
4449                                             scoring_options->gap_open,
4450                                             scoring_options->gap_extend,
4451                                             query_blk,
4452                                             query_info);
4453                 }
4454                 else {
4455                     s_TrimChainStartToSubjPos(pair_info[i].second,
4456                                               pair_info[i].trim_second,
4457                                               scoring_options->penalty,
4458                                               scoring_options->gap_open,
4459                                               scoring_options->gap_extend,
4460                                               query_blk,
4461                                               query_info);
4462                 }
4463             }
4464 
4465         }
4466     }
4467 
4468     return found;
4469 }
4470 
4471 
FindPartialyCoveredQueries(void * data,Int4 oid,Int4 word_size)4472 HSPChain* FindPartialyCoveredQueries(void* data, Int4 oid, Int4 word_size)
4473 {
4474     BlastHSPMapperData* spl_data = data;
4475     BlastQueryInfo* query_info = spl_data->query_info;
4476     HSPChain** saved = spl_data->saved_chains;
4477     HSPChain* retval = NULL;
4478     HSPChain* last;
4479     Int4 i;
4480 
4481     for (i = 0;i < query_info->num_queries;i++) {
4482         HSPChain* chain = saved[i];
4483 
4484         for (chain = saved[i]; chain; chain = chain->next) {
4485             HSPContainer* h;
4486 
4487             if (chain->oid != oid) {
4488                 continue;
4489             }
4490 
4491             if (chain->score < 30) {
4492                 continue;
4493             }
4494 
4495             h = chain->hsps;
4496 
4497             if (h->hsp->query.offset > word_size) {
4498 
4499                 if (!retval) {
4500                     retval = CloneChain(chain);
4501                     last = retval;
4502                 }
4503                 else {
4504                     last->next = CloneChain(chain);
4505                     last = last->next;
4506                 }
4507 
4508                 continue;
4509             }
4510 
4511             while (h->next) {
4512                 h = h->next;
4513             }
4514             if (query_info->contexts[h->hsp->context].query_length -
4515                 h->hsp->query.end > word_size) {
4516 
4517                 if (!retval) {
4518                     retval = CloneChain(chain);
4519                     last = retval;
4520                 }
4521                 else {
4522                     last->next = CloneChain(chain);
4523                     last = last->next;
4524                 }
4525             }
4526         }
4527     }
4528 
4529     return retval;
4530 }
4531 
4532 
4533 /** Perform writing task for paired reads
4534  * ownership of the HSP list and sets the dereferenced pointer to NULL.
4535  * @param data To store results to [in][out]
4536  * @param hsp_list Pointer to the HSP list to save in the collector. [in]
4537  */
4538 static int
s_BlastHSPMapperSplicedPairedRun(void * data,BlastHSPList * hsp_list)4539 s_BlastHSPMapperSplicedPairedRun(void* data, BlastHSPList* hsp_list)
4540 {
4541     BlastHSPMapperData* spl_data = data;
4542     BlastHSPMapperParams* params = spl_data->params;
4543     const ScoringOptions* scoring_opts = &params->scoring_options;
4544     Boolean is_spliced  = params->splice;
4545     const Int4 kLongestIntron = params->longest_intron;
4546     Int4 cutoff_score = params->cutoff_score;
4547     Int4* cutoff_score_fun = params->cutoff_score_fun;
4548     Int4 cutoff_edit_dist = params->cutoff_edit_dist;
4549     BLAST_SequenceBlk* query_blk = spl_data->query;
4550     BlastQueryInfo* query_info = spl_data->query_info;
4551     const Int4 kDefaultMaxHsps = 1000;
4552     Int4 max_hsps = kDefaultMaxHsps;
4553     HSPNode* nodes = calloc(max_hsps, sizeof(HSPNode));
4554     HSPPath* path = NULL;
4555     BlastHSP** hsp_array;
4556     Int4 num_hsps = 0;
4557     Int4 i;
4558 
4559     HSPChain** saved_chains = spl_data->saved_chains;
4560     HSPChain* chain_array[2];
4561     HSPChain* first = NULL, *second = NULL;
4562     Int4 workspace_size = 40;
4563     Pairinfo* pair_workspace = calloc(workspace_size, sizeof(Pairinfo));
4564     /* the score bonus needs to include exon pars that were to short to be
4565        aligned (at either end of the chain) */
4566     const Int4 kPairBonus = is_spliced ? 21 : 5;
4567 
4568     if (!hsp_list) {
4569         s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4570         return 0;
4571     }
4572 
4573     if (!params || !nodes) {
4574         s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4575         return -1;
4576     }
4577 
4578     hsp_array = hsp_list->hsp_array;
4579 
4580     path = HSPPathNew();
4581 
4582     /* sort HSPs by query context and subject position for spliced aligments */
4583     if (is_spliced) {
4584         qsort(hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
4585               s_CompareHSPsByContextSubjectOffset);
4586     }
4587     else {
4588         /* for non spliced alignments, sort HSPs by query context and score */
4589         qsort(hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
4590               s_CompareHSPsByContextScore);
4591     }
4592 
4593     i = 0;
4594     chain_array[0] = NULL;
4595     chain_array[1] = NULL;
4596     while (i < hsp_list->hspcnt) {
4597 
4598         Int4 context = hsp_array[i]->context;
4599         /* is this query the first segment of a pair */
4600         Boolean has_pair = (query_info->contexts[context].segment_flags ==
4601                             eFirstSegment);
4602         Int4 query_idx  = context / NUM_STRANDS;
4603         Int4 first_context = query_idx * NUM_STRANDS;
4604         Int4 context_next_fragment = has_pair ? (query_idx + 2) * NUM_STRANDS :
4605             (query_idx + 1) * NUM_STRANDS;
4606 
4607         HSPChain* new_chains = NULL;
4608 
4609         /* iterate over contexts that belong to a read pair */
4610         while (i < hsp_list->hspcnt &&
4611                hsp_array[i]->context < context_next_fragment) {
4612 
4613             memset(nodes, 0, num_hsps * sizeof(HSPNode));
4614             num_hsps = 0;
4615             context = hsp_array[i]->context;
4616 
4617             /* collect HSPs for the current context */
4618             while (i < hsp_list->hspcnt && hsp_array[i]->context == context) {
4619 
4620                 /* Overlapping portions of two consecutive subject chunks
4621                    may produce two HSPs representing the same alignment. We
4622                    drop one of them here (assuming the HSPs for the same
4623                    context are sorted by score or subject position). */
4624                 BlastHSP* h = (num_hsps == 0 ? NULL : *nodes[num_hsps - 1].hsp);
4625                 if (!h || hsp_array[i]->context != h->context ||
4626                     hsp_array[i]->subject.offset != h->subject.offset ||
4627                     hsp_array[i]->score != h->score) {
4628 
4629                     /* reallocate node array if necessary */
4630                     if (num_hsps >= max_hsps) {
4631                         HSPNode* new_nodes = NULL;
4632 
4633                         /* For non-spliced mapping we are only interested in
4634                            the best scoring whole HSPs. HSPs are sorted by
4635                            score and those that map to many places are not good
4636                            so the node array does not need to be extended. */
4637                         if (!is_spliced) {
4638                             break;
4639                         }
4640 
4641                         max_hsps *= 2;
4642                         new_nodes = calloc(max_hsps, sizeof(HSPNode));
4643                         if (!nodes) {
4644                             s_BlastHSPMapperSplicedRunCleanUp(path, nodes,
4645                                                               pair_workspace);
4646                             return -1;
4647                         }
4648                         s_HSPNodeArrayCopy(new_nodes, nodes, num_hsps);
4649                         free(nodes);
4650                         nodes = new_nodes;
4651                     }
4652 
4653                     nodes[num_hsps].hsp = &(hsp_array[i]);
4654                     nodes[num_hsps].best_score = hsp_array[i]->score;
4655 
4656                     num_hsps++;
4657                 }
4658                 i++;
4659             }
4660 
4661             /* find cutoff score for the query */
4662             if (cutoff_score_fun[1] != 0) {
4663                 Int4 query_len = query_info->contexts[context].query_length;
4664                 cutoff_score = (cutoff_score_fun[0] +
4665                                 cutoff_score_fun[1] * query_len) / 100;
4666             }
4667             else if (params->cutoff_score == 0) {
4668                 cutoff_score = GetCutoffScore(
4669                                 query_info->contexts[context].query_length);
4670             }
4671 
4672             /* find the best scoring chain of HSPs that align to exons
4673                parts of the query and the subject */
4674             new_chains = s_FindBestPath(nodes, num_hsps, path,
4675                                         hsp_list->oid, is_spliced,
4676                                         kLongestIntron, cutoff_score,
4677                                         query_blk, query_info, scoring_opts);
4678 
4679             ASSERT(new_chains);
4680             HSPChainListInsert(&chain_array[(context - first_context) /
4681                                             NUM_STRANDS],
4682                                &new_chains, cutoff_score, cutoff_edit_dist,
4683                                FALSE);
4684 
4685             /* skip HSPs for the same context if there are more then max
4686                allowed per context */
4687             while (i < hsp_list->hspcnt && hsp_array[i]->context == context) {
4688                 i++;
4689             }
4690         }
4691 
4692 #if _DEBUG
4693         ASSERT(!chain_array[0] || s_TestChainsSorted(chain_array[0]));
4694         ASSERT(!chain_array[1] || s_TestChainsSorted(chain_array[1]));
4695 #endif
4696 
4697         if (is_spliced && chain_array[0]) {
4698             s_FindSpliceJunctions(chain_array[0], query_blk, query_info,
4699                                   scoring_opts);
4700         }
4701 
4702         if (is_spliced && chain_array[1]) {
4703             s_FindSpliceJunctions(chain_array[1], query_blk, query_info,
4704                                   scoring_opts);
4705         }
4706 
4707         first = chain_array[0];
4708         second = chain_array[1];
4709 
4710         /* If the chains provide sufficient score check for pairs.
4711            The score bonus is added to prefer a pair with a slightly smaller
4712            score to single chains with a better one. */
4713         if (first && second) {
4714 
4715             s_FindBestPairs(&first, &second, 0, &pair_workspace,
4716                             &workspace_size, is_spliced, scoring_opts,
4717                             query_blk, query_info);
4718 
4719             ASSERT(s_TestChains(first));
4720             ASSERT(s_TestChains(second));
4721         }
4722 
4723         /* save all chains and remove ones with scores lower than
4724            best score - kPairBonus */
4725         if (first) {
4726             HSPChainListInsert(&saved_chains[query_idx], &first, cutoff_score,
4727                                cutoff_edit_dist, TRUE);
4728             HSPChainListTrim(saved_chains[query_idx], kPairBonus);
4729 
4730 
4731 #if _DEBUG
4732             ASSERT(!saved_chains[query_idx] ||
4733                    s_TestChainsSorted(saved_chains[query_idx]));
4734 #endif
4735         }
4736         if (second) {
4737             HSPChainListInsert(&saved_chains[query_idx + 1], &second,
4738                                cutoff_score, cutoff_edit_dist, TRUE);
4739             HSPChainListTrim(saved_chains[query_idx + 1], kPairBonus);
4740 
4741 
4742 #if _DEBUG
4743             ASSERT(!saved_chains[query_idx + 1] ||
4744                    s_TestChainsSorted(saved_chains[query_idx + 1]));
4745 #endif
4746         }
4747 
4748         /* make temporary lists empty */
4749         chain_array[0] = chain_array[1] = NULL;
4750     }
4751 
4752     /* delete HSPs that were not saved in results */
4753     hsp_list = Blast_HSPListFree(hsp_list);
4754 
4755     s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4756 
4757     return 0;
4758 }
4759 
4760 
4761 /** Free the writer
4762  * @param writer The writer to free [in]
4763  * @return NULL.
4764  */
4765 static
4766 BlastHSPWriter*
s_BlastHSPMapperFree(BlastHSPWriter * writer)4767 s_BlastHSPMapperFree(BlastHSPWriter* writer)
4768 {
4769    BlastHSPMapperData *data = writer->data;
4770    sfree(data->params);
4771    sfree(writer->data);
4772    sfree(writer);
4773    return NULL;
4774 }
4775 
4776 /** create the writer
4777  * @param params Pointer to the hit paramters [in]
4778  * @param query_info BlastQueryInfo (not used) [in]
4779  * @return writer
4780  */
4781 static
4782 BlastHSPWriter*
s_BlastHSPMapperPairedNew(void * params,BlastQueryInfo * query_info,BLAST_SequenceBlk * query)4783 s_BlastHSPMapperPairedNew(void* params, BlastQueryInfo* query_info,
4784                           BLAST_SequenceBlk* query)
4785 {
4786    BlastHSPWriter* writer = NULL;
4787    BlastHSPMapperData* data = NULL;
4788 /*   BlastHSPMapperParams * spl_param = params; */
4789 
4790    /* allocate space for writer */
4791    writer = malloc(sizeof(BlastHSPWriter));
4792 
4793    /* fill up the function pointers */
4794    writer->InitFnPtr   = &s_BlastHSPMapperPairedInit;
4795    writer->FinalFnPtr  = &s_BlastHSPMapperFinal;
4796    writer->FreeFnPtr   = &s_BlastHSPMapperFree;
4797    writer->RunFnPtr    = &s_BlastHSPMapperSplicedPairedRun;
4798 
4799    /* allocate for data structure */
4800    writer->data = calloc(1, sizeof(BlastHSPMapperData));
4801    data = writer->data;
4802    data->params = params;
4803    data->query_info = query_info;
4804    data->query = query;
4805 
4806    return writer;
4807 }
4808 
4809 
4810 /*************************************************************/
4811 /** The following are exported functions to be used by APP   */
4812 
4813 BlastHSPMapperParams*
BlastHSPMapperParamsNew(const BlastHitSavingOptions * hit_options,const BlastScoringOptions * scoring_options)4814 BlastHSPMapperParamsNew(const BlastHitSavingOptions* hit_options,
4815                         const BlastScoringOptions* scoring_options)
4816 {
4817        BlastHSPMapperParams* retval = NULL;
4818 
4819        if (hit_options == NULL)
4820            return NULL;
4821 
4822        retval = (BlastHSPMapperParams*)malloc(sizeof(BlastHSPMapperParams));
4823        retval->hitlist_size = MAX(hit_options->hitlist_size, 10);
4824        retval->paired = hit_options->paired;
4825        retval->splice = hit_options->splice;
4826        retval->longest_intron = hit_options->longest_intron;
4827        retval->cutoff_score = hit_options->cutoff_score;
4828        retval->cutoff_score_fun[0] = hit_options->cutoff_score_fun[0];
4829        retval->cutoff_score_fun[1] = hit_options->cutoff_score_fun[1];
4830        retval->cutoff_edit_dist = hit_options->max_edit_distance;
4831        retval->program = hit_options->program_number;
4832        retval->scoring_options.reward = scoring_options->reward;
4833        retval->scoring_options.penalty = scoring_options->penalty;
4834        retval->scoring_options.gap_open = -scoring_options->gap_open;
4835        retval->scoring_options.gap_extend = -scoring_options->gap_extend;
4836        retval->scoring_options.no_splice_signal = -2;
4837        return retval;
4838 }
4839 
4840 BlastHSPMapperParams*
BlastHSPMapperParamsFree(BlastHSPMapperParams * opts)4841 BlastHSPMapperParamsFree(BlastHSPMapperParams* opts)
4842 {
4843     if ( !opts )
4844         return NULL;
4845     sfree(opts);
4846     return NULL;
4847 }
4848 
4849 BlastHSPWriterInfo*
BlastHSPMapperInfoNew(BlastHSPMapperParams * params)4850 BlastHSPMapperInfoNew(BlastHSPMapperParams* params) {
4851    BlastHSPWriterInfo * writer_info =
4852                         malloc(sizeof(BlastHSPWriterInfo));
4853 
4854    writer_info->NewFnPtr = &s_BlastHSPMapperPairedNew;
4855 
4856    writer_info->params = params;
4857    return writer_info;
4858 }
4859