1 /*  $Id: hspfilter_mapper.c,v 1.9 2017/01/05 18:34:13 fukanchi Exp $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Greg Boratyn
27  *
28  */
29 
30 /** @file hspfilter_mapper.c
31  * Implementation of the BlastHSPWriter interface to save only the best scoring * chains of HSPs for aligning RNA-Seq sequences to a genome.
32  */
33 
34 #include <algo/blast/core/hspfilter_mapper.h>
35 #include <algo/blast/core/blast_util.h>
36 #include <algo/blast/core/blast_hits.h>
37 #include "jumper.h"
38 
39 /* Pair configurations, in the order of prefernece */
40 #define PAIR_CONVERGENT 0
41 #define PAIR_DIVERGENT 1
42 #define PAIR_PARALLEL 2
43 #define PAIR_NONE 3
44 
45 /* A container to create a list of HSPs */
46 typedef struct HSPContainer
47 {
48     BlastHSP* hsp;
49     struct HSPContainer* next;
50 } HSPContainer;
51 
52 
53 /* Create HSPContainer and take ownership of the HSP */
HSPContainerNew(BlastHSP ** hsp)54 static HSPContainer* HSPContainerNew(BlastHSP** hsp)
55 {
56     HSPContainer* retval = calloc(1, sizeof(HSPContainer));
57     if (!retval) {
58         return NULL;
59     }
60     retval->hsp = *hsp;
61     /* take ownership of this HSP */
62     *hsp = NULL;
63 
64     return retval;
65 }
66 
67 /* Free the list of HSPs, along with the stored HSPs */
HSPContainerFree(HSPContainer * hc)68 static HSPContainer* HSPContainerFree(HSPContainer* hc)
69 {
70     HSPContainer* h = hc;
71     while (h) {
72         HSPContainer* next = h->next;
73         if (h->hsp) {
74             Blast_HSPFree(h->hsp);
75         }
76 
77         sfree(h);
78         h = next;
79     }
80 
81     return NULL;
82 }
83 
84 /* Clone a list of HSP containers */
HSPContainerDup(HSPContainer * inh)85 static HSPContainer* HSPContainerDup(HSPContainer* inh)
86 {
87     HSPContainer* retval = NULL;
88     HSPContainer* hi = NULL, *ho = NULL;
89     BlastHSP* new_hsp = NULL;
90 
91     if (!inh || !inh->hsp) {
92         return NULL;
93     }
94 
95     new_hsp = Blast_HSPClone(inh->hsp);
96     if (!new_hsp) {
97         return NULL;
98     }
99     retval = HSPContainerNew(&new_hsp);
100     if (!retval) {
101         Blast_HSPFree(new_hsp);
102         return NULL;
103     }
104 
105     hi = inh->next;
106     ho = retval;
107     for (; hi; hi = hi->next, ho = ho->next) {
108         new_hsp = Blast_HSPClone(hi->hsp);
109         if (!new_hsp) {
110             Blast_HSPFree(new_hsp);
111             HSPContainerFree(retval);
112             return NULL;
113         }
114         ho->next = HSPContainerNew(&new_hsp);
115         if (!ho->next) {
116             Blast_HSPFree(new_hsp);
117             HSPContainerFree(retval);
118             return NULL;
119         }
120     }
121 
122     return retval;
123 }
124 
125 /* A chain of HSPs: gene transcript sequence aligned to exons */
126 typedef struct HSPChain
127 {
128     Int4 context;  /* query context */
129     Int4 oid;      /* subject oid */
130     Int4 score;    /* score for the whole chain */
131     HSPContainer* hsps;  /* list of HSPs that belong to this chain */
132     Int4 compartment;    /* compartment number for the chain (needed
133                             for reporting results) */
134 
135     Int4 count;    /* number of chains with the same or larger score found
136                       for the same query */
137     struct HSPChain* pair;  /* pointer to the pair (for paired reads) */
138     Uint1 pair_conf;        /* pair configuration */
139 
140     Int4 adapter;  /* adapter start position */
141     Int4 polyA;    /* start of polyA tail */
142     struct HSPChain* next;
143 } HSPChain;
144 
145 
HSPChainFree(HSPChain * chain_list)146 static HSPChain* HSPChainFree(HSPChain* chain_list)
147 {
148     HSPChain* chain = chain_list;
149     while (chain) {
150         HSPChain* next = chain->next;
151         if (chain->pair) {
152             chain->pair->pair = NULL;
153         }
154         ASSERT(chain->hsps);
155         chain->hsps = HSPContainerFree(chain->hsps);
156         sfree(chain);
157         chain = next;
158     }
159 
160     return NULL;
161 }
162 
HSPChainNew(Int4 context)163 static HSPChain* HSPChainNew(Int4 context)
164 {
165     HSPChain* retval = calloc(1, sizeof(HSPChain));
166     if (!retval) {
167         return NULL;
168     }
169     retval->context = context;
170     retval->compartment = -1;
171     retval->adapter = -1;
172 
173     return retval;
174 }
175 
176 
177 /* Insert a single chain into the list so that the list is sorted in decending
178    order of chain scores. */
s_HSPChainListInsertOne(HSPChain ** list,HSPChain * chain,Boolean check_for_duplicates)179 static Int4 s_HSPChainListInsertOne(HSPChain** list, HSPChain* chain,
180                                     Boolean check_for_duplicates)
181 {
182     HSPChain* ch = NULL;
183 
184     if (!list || !chain) {
185         return -1;
186     }
187     ASSERT(!chain->next);
188 
189     if (!*list) {
190         *list = chain;
191         return 0;
192     }
193 
194     ch = *list;
195     if (ch->score < chain->score) {
196         chain->next = ch;
197         *list = chain;
198         return 0;
199     }
200 
201     /* check for duplicate chain: the new chains may be the same as already
202        saved; this may come from writing HSPs to HSP stream for each chunk */
203     if (check_for_duplicates &&
204         ch->oid == chain->oid && ch->score == chain->score &&
205         ch->hsps->hsp->query.frame == chain->hsps->hsp->query.frame &&
206         ch->hsps->hsp->subject.offset ==
207         chain->hsps->hsp->subject.offset) {
208 
209         chain->next = NULL;
210         chain = HSPChainFree(chain);
211         return 0;
212     }
213 
214     while (ch->next && ch->next->score >= chain->score){
215 
216         /* check for duplicate chain: the new chains may be the same as already
217            saved; this may come from writing HSPs to HSP stream for each chunk */
218         if (check_for_duplicates &&
219             ch->next->oid == chain->oid && ch->next->score == chain->score &&
220             ch->next->hsps->hsp->query.frame == chain->hsps->hsp->query.frame &&
221             ch->next->hsps->hsp->subject.offset ==
222             chain->hsps->hsp->subject.offset) {
223 
224             chain->next = NULL;
225             chain = HSPChainFree(chain);
226             return 0;
227         }
228 
229         ch = ch->next;
230     }
231     ASSERT(ch);
232     chain->next = ch->next;
233     ch->next = chain;
234 
235     return 0;
236 }
237 
238 /* Insert chains into the list so that the list is sorted in descending order
239    of chain scores. If chain is a list, each element is added separately. The
240    list must be sorted before adding chain */
HSPChainListInsert(HSPChain ** list,HSPChain * chain,Boolean check_for_duplicates)241 static Int4 HSPChainListInsert(HSPChain** list, HSPChain* chain,
242                                Boolean check_for_duplicates)
243 {
244     HSPChain* ch = NULL;
245     Int4 status = 0;
246 
247     if (!list || !chain) {
248         return -1;
249     }
250 
251     ch = chain;
252     while (ch) {
253         HSPChain* next = ch->next;
254         ch->next = NULL;
255         status = s_HSPChainListInsertOne(list, ch, check_for_duplicates);
256         if (status) {
257             return status;
258         }
259         ch = next;
260 
261     }
262 
263     return 0;
264 }
265 
266 /* Remove from the list chains with scores lower than the best one by at least
267    the given margin. The list must be sorted in descending order of scores. */
HSPChainListTrim(HSPChain * list,Int4 margin)268 static Int4 HSPChainListTrim(HSPChain* list, Int4 margin)
269 {
270     HSPChain* ch = NULL;
271     Int4 best_score;
272 
273     if (!list) {
274         return -1;
275     }
276 
277     best_score = list->score;
278     ch = list;
279     while (ch->next && best_score - ch->next->score <= margin) {
280         ASSERT(best_score - ch->next->score >= 0);
281         ch = ch->next;
282     }
283     ASSERT(ch);
284 
285     ch->next = HSPChainFree(ch->next);
286 
287     return 0;
288 }
289 
290 
291 /* Clone a single HSP chain */
s_CloneChain(const HSPChain * chain)292 static HSPChain* s_CloneChain(const HSPChain* chain)
293 {
294     HSPChain* retval = NULL;
295 
296     if (!chain) {
297         return NULL;
298     }
299 
300     retval = HSPChainNew(chain->context);
301     if (!retval) {
302         return NULL;
303     }
304     retval->hsps = HSPContainerDup(chain->hsps);
305     if (!retval->hsps) {
306         HSPChainFree(retval);
307         return NULL;
308     }
309     retval->oid = chain->oid;
310     retval->score = chain->score;
311     retval->adapter = chain->adapter;
312     retval->polyA = chain->polyA;
313 
314     return retval;
315 }
316 
317 /* Test that all pointers in the chain are set */
s_TestChains(HSPChain * chain)318 static Boolean s_TestChains(HSPChain* chain)
319 {
320     HSPContainer* hc;
321 
322     ASSERT(chain);
323     hc = chain->hsps;
324     ASSERT(hc);
325     ASSERT(hc->hsp);
326     ASSERT(hc->hsp->context == chain->context);
327 
328     ASSERT(hc->hsp->gap_info->size > 1 ||
329            hc->hsp->query.end - hc->hsp->query.offset ==
330            hc->hsp->subject.end - hc->hsp->subject.offset);
331 
332     hc = hc->next;
333     while (hc) {
334         ASSERT(hc->hsp);
335         ASSERT(hc->hsp->context == chain->context);
336 
337         ASSERT(hc->hsp->gap_info->size > 1 ||
338                hc->hsp->query.end - hc->hsp->query.offset ==
339                hc->hsp->subject.end - hc->hsp->subject.offset);
340 
341         if (hc->next) {
342             ASSERT(hc->hsp->query.offset < hc->next->hsp->query.offset);
343             ASSERT(hc->hsp->subject.offset < hc->next->hsp->subject.offset);
344         }
345         hc = hc->next;
346     }
347 
348     chain = chain->next;
349     while (chain) {
350         hc = chain->hsps;
351         ASSERT(hc);
352         ASSERT(hc->hsp);
353         ASSERT(hc->hsp->context == chain->context);
354 
355         ASSERT(hc->hsp->gap_info->size > 1 ||
356                hc->hsp->query.end - hc->hsp->query.offset ==
357                hc->hsp->subject.end - hc->hsp->subject.offset);
358 
359         hc = hc->next;
360         while (hc) {
361             ASSERT(hc);
362             ASSERT(hc->hsp);
363             ASSERT(hc->hsp->context == chain->context);
364 
365             ASSERT(hc->hsp->gap_info->size > 1 ||
366                    hc->hsp->query.end - hc->hsp->query.offset ==
367                    hc->hsp->subject.end - hc->hsp->subject.offset);
368 
369             if (hc->next) {
370                 ASSERT(hc->hsp->query.offset < hc->next->hsp->query.offset);
371                 ASSERT(hc->hsp->subject.offset < hc->next->hsp->subject.offset);
372             }
373 
374             hc = hc->next;
375         }
376         chain = chain->next;
377     }
378 
379     return TRUE;
380 }
381 
382 #if _DEBUG
s_TestChainsSorted(HSPChain * chain)383 static Boolean s_TestChainsSorted(HSPChain* chain)
384 {
385     HSPChain* prev;
386 
387     s_TestChains(chain);
388 
389     prev = chain;
390     chain = chain->next;
391     for (; chain; chain = chain->next, prev = prev->next) {
392         ASSERT(prev->score >= chain->score);
393     }
394 
395     return TRUE;
396 }
397 #endif
398 
399 /** Data structure used by the writer */
400 typedef struct BlastHSPMapperData {
401    BlastHSPMapperParams* params;         /**< how many hits to save */
402    BLAST_SequenceBlk* query;              /**< query sequence */
403    BlastQueryInfo* query_info;            /**< information about queries */
404    HSPChain** saved_chains;               /**< HSP chains are stored here */
405 } BlastHSPMapperData;
406 
407 /*************************************************************/
408 /** The following are implementations for BlastHSPWriter ADT */
409 
410 /** Perform pre-run stage-specific initialization
411  * @param data The internal data structure [in][out]
412  * @param results The HSP results to operate on  [in]
413  */
414 static int
s_BlastHSPMapperPairedInit(void * data,void * results)415 s_BlastHSPMapperPairedInit(void* data, void* results)
416 {
417    BlastHSPMapperData * spl_data = data;
418    BlastHSPResults* r = (BlastHSPResults*)results;
419    spl_data->saved_chains = calloc(r->num_queries, sizeof(HSPChain*));
420 
421    return 0;
422 }
423 
424 
425 /* Get subject start position */
s_FindFragmentStart(HSPChain * chain)426 static Int4 s_FindFragmentStart(HSPChain* chain)
427 {
428     Int2 frame;
429     HSPContainer* hc = NULL;
430     ASSERT(chain);
431     ASSERT(chain->hsps);
432     ASSERT(chain->hsps->hsp);
433 
434     frame = chain->hsps->hsp->query.frame;
435     if (frame > 0) {
436         return chain->hsps->hsp->subject.offset;
437     }
438 
439     hc = chain->hsps;
440     while (hc->next) {
441         hc = hc->next;
442     }
443     ASSERT(hc);
444 
445     return hc->hsp->subject.end - 1;
446 }
447 
448 
449 /* Get subject end position */
450 /* Not used
451 static Int4 s_FindFragmentEnd(HSPChain* chain)
452 {
453     Int2 frame;
454     HSPContainer* hc = NULL;
455     ASSERT(chain);
456     ASSERT(chain->hsps);
457     ASSERT(chain->hsps->hsp);
458 
459     frame = chain->hsps->hsp->query.frame;
460     if (frame < 0) {
461         return chain->hsps->hsp->subject.offset;
462     }
463 
464     hc = chain->hsps;
465     while (hc->next) {
466         hc = hc->next;
467     }
468     ASSERT(hc);
469 
470     return hc->hsp->subject.end - 1;
471 }
472 */
473 
474 /* Compute HSP alignment score from Jumper edit script */
s_ComputeAlignmentScore(BlastHSP * hsp,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score)475 static Int4 s_ComputeAlignmentScore(BlastHSP* hsp, Int4 mismatch_score,
476                                     Int4 gap_open_score, Int4 gap_extend_score)
477 {
478     Int4 i;
479     Int4 last_pos = hsp->query.offset;
480     Int4 score = 0;
481     const Int4 kGap = 15;
482 
483     for (i = 0;i < hsp->map_info->edits->num_edits;i++) {
484         JumperEdit* e = &(hsp->map_info->edits->edits[i]);
485         Int4 num_matches = e->query_pos - last_pos;
486         ASSERT(num_matches >= 0);
487         last_pos = e->query_pos;
488         score += num_matches;
489 
490         if (e->query_base == kGap) {
491             ASSERT(e->subject_base != kGap);
492             if (num_matches > 0 ||
493                 (i > 0 && hsp->map_info->edits->edits[i - 1].query_base !=
494                  kGap)) {
495 
496                 score += gap_open_score;
497             }
498             score += gap_extend_score;
499         }
500         else if (e->subject_base == kGap) {
501             if (num_matches > 0 ||
502                 (i > 0 && hsp->map_info->edits->edits[i - 1].subject_base !=
503                  kGap)) {
504 
505                 score += gap_open_score;
506             }
507             score += gap_extend_score;
508             last_pos++;
509         }
510         else {
511             score += mismatch_score;
512             last_pos++;
513         }
514     }
515 
516     score += hsp->query.end - last_pos;
517     return score;
518 }
519 
520 
521 /* Find the cost of chain HSPs overlapping on the query, as the smaller score
522    of the overlapping region */
s_GetOverlapCost(const BlastHSP * a,const BlastHSP * b,Int4 edit_penalty)523 static Int4 s_GetOverlapCost(const BlastHSP* a, const BlastHSP* b,
524                              Int4 edit_penalty)
525 {
526     Int4 i;
527     Int4 overlap_f, overlap_s;
528     const BlastHSP* f;
529     const BlastHSP* s;
530 
531     /* if one HSP in contained within the other on the query, return the
532        smaller score */
533     if ((a->query.offset <= b->query.offset && a->query.end >= b->query.end) ||
534         (a->query.offset >= b->query.offset && a->query.end <= b->query.end)) {
535 
536         return MIN(a->score, b->score);
537     }
538 
539     /* if the two HSPs are mutually exclusive on the query, there is no cost */
540     if ((a->query.end < b->query.offset && a->query.offset < b->query.end) ||
541         (b->query.end < a->query.offset && b->query.offset < a->query.end)) {
542 
543         return 0;
544     }
545 
546     /* otherwise the HSPs partially overlap; reurn the the smaller score for
547        the overlap region */
548 
549     /* find which HSP precedes which on the query */
550     if (a->query.offset <= b->query.offset) {
551         f = a;
552         s = b;
553     }
554     else {
555         f = b;
556         s = a;
557     }
558 
559     /* this is the overlap score assuming perfect matches in the overlap */
560     overlap_f = overlap_s = f->query.end - s->query.offset;
561     ASSERT(overlap_f >= 0 && overlap_s >= 0);
562     /* subtract penalties for mismatches and gaps in the overlap region */
563     for (i = 0;i < f->map_info->edits->num_edits;i++) {
564         if (f->map_info->edits->edits[i].query_pos >= s->query.offset) {
565             overlap_f -= edit_penalty;
566         }
567     }
568     for (i = 0;i < s->map_info->edits->num_edits;i++) {
569         if (s->map_info->edits->edits[i].query_pos < f->query.end) {
570             overlap_s -= edit_penalty;
571         }
572     }
573 
574     return MIN(overlap_f, overlap_s);
575 }
576 
577 
578 /* Compute chain score */
s_ComputeChainScore(HSPChain * chain,const ScoringOptions * score_options,Int4 query_len,Boolean comp_hsp_score)579 static Int4 s_ComputeChainScore(HSPChain* chain,
580                                 const ScoringOptions* score_options,
581                                 Int4 query_len,
582                                 Boolean comp_hsp_score)
583 {
584     Int4 retval;
585     HSPContainer* h = NULL;
586     HSPContainer* prev = NULL;
587 
588     if (!chain || !score_options) {
589         return -1;
590     }
591 
592     h = chain->hsps;
593     if (comp_hsp_score) {
594         h->hsp->score = s_ComputeAlignmentScore(h->hsp, score_options->penalty,
595                                                 score_options->gap_open,
596                                                 score_options->gap_extend);
597     }
598     retval = h->hsp->score;
599     if (h->hsp->query.offset > 0 &&
600         (h->hsp->map_info->left_edge & MAPPER_SPLICE_SIGNAL) == 0) {
601 
602         retval += score_options->no_splice_signal;
603     }
604     if (h->hsp->query.end < query_len &&
605         (h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) == 0) {
606 
607         retval += score_options->no_splice_signal;
608     }
609 
610     prev = h;
611     h = h->next;
612 
613     for (; h; h = h->next, prev = prev->next) {
614         /* HSPs must be sorted by query positon */
615         ASSERT(h->hsp->query.offset >= prev->hsp->query.offset);
616 
617         if (comp_hsp_score) {
618 
619             h->hsp->score = s_ComputeAlignmentScore(h->hsp,
620                                                     score_options->penalty,
621                                                     score_options->gap_open,
622                                                     score_options->gap_extend);
623         }
624         retval += h->hsp->score;
625 
626         if (h->hsp->query.offset > 0 &&
627             (h->hsp->map_info->left_edge & MAPPER_SPLICE_SIGNAL) == 0) {
628 
629             retval += score_options->no_splice_signal;
630         }
631         if (h->hsp->query.end < query_len &&
632             (h->hsp->map_info->right_edge & MAPPER_SPLICE_SIGNAL) == 0) {
633 
634             retval += score_options->no_splice_signal;
635         }
636     }
637 
638     return retval;
639 }
640 
641 #if _DEBUG
s_TestHSPRanges(const BlastHSP * hsp)642 static Boolean s_TestHSPRanges(const BlastHSP* hsp)
643 {
644     Int4 i;
645     Int4 d = 0;
646     for (i=0;i < hsp->gap_info->size;i++) {
647         switch (hsp->gap_info->op_type[i]) {
648         case eGapAlignIns:
649             d -= hsp->gap_info->num[i];
650             break;
651 
652         case eGapAlignDel:
653             d += hsp->gap_info->num[i];
654             break;
655 
656         default:
657             break;
658         }
659     }
660     if (hsp->query.end - hsp->query.offset + d !=
661         hsp->subject.end - hsp->subject.offset) {
662 
663         return FALSE;
664     }
665 
666     return TRUE;
667 }
668 #endif
669 
670 /* Trim HSP by a number of bases on query or subject, either from the start or
671    from the end */
s_TrimHSP(BlastHSP * hsp,Int4 num,Boolean is_query,Boolean is_start,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score)672 static Int4 s_TrimHSP(BlastHSP* hsp, Int4 num, Boolean is_query,
673                       Boolean is_start, Int4 mismatch_score,
674                       Int4 gap_open_score, Int4 gap_extend_score)
675 {
676     Int4 num_left = num;
677     Int4 i = is_start ? 0 : hsp->gap_info->size - 1;
678     Int4 d = is_start ? 1 : -1;
679     Int4 end = is_start ? hsp->gap_info->size : -1;
680     Int4 delta_query = 0, delta_subject = 0;
681     Boolean is_subject = !is_query;
682     Boolean is_end = !is_start;
683     Int4 k;
684 
685     ASSERT(hsp /* */ && num > 0 /* */);
686     ASSERT((is_query && num <= hsp->query.end - hsp->query.offset) ||
687            (!is_query && num <= hsp->subject.end - hsp->subject.offset));
688 
689 #if _DEBUG
690     ASSERT(s_TestHSPRanges(hsp));
691 #endif
692 
693     if (num == 0) {
694         return 0;
695     }
696 
697 
698     /* itereate over the edit script and remove the required number
699        of subject bases */
700     while (i != end && num_left > 0) {
701         switch (hsp->gap_info->op_type[i]) {
702         case eGapAlignSub:
703             if (hsp->gap_info->num[i] > num_left) {
704                 hsp->gap_info->num[i] -= num_left;
705                 delta_query += num_left;
706                 delta_subject += num_left;
707                 num_left = 0;
708             }
709             else {
710                 Int4 n = hsp->gap_info->num[i];
711 
712                 delta_query += n;
713                 delta_subject += n;
714                 num_left -= n;
715                 i += d;
716             }
717             break;
718 
719         case eGapAlignDel:
720             if (is_subject) {
721                 if (hsp->gap_info->num[i] > num_left) {
722                     hsp->gap_info->num[i] -= num_left;
723                     delta_subject += num_left;
724                     num_left = 0;
725                 }
726                 else {
727                     delta_subject += hsp->gap_info->num[i];
728                     num_left -= hsp->gap_info->num[i];
729                     i += d;
730                 }
731             }
732             else {
733                 delta_subject += hsp->gap_info->num[i];
734                 i += d;
735             }
736             break;
737 
738         case eGapAlignIns:
739             if (is_query) {
740                 if (hsp->gap_info->num[i] > num_left) {
741                     hsp->gap_info->num[i] -= num_left;
742                     delta_query += num_left;
743                     num_left = 0;
744                 }
745                 else {
746                     delta_query += hsp->gap_info->num[i];
747                     num_left -= hsp->gap_info->num[i];
748                     i += d;
749                 }
750             }
751             else {
752                 delta_query += hsp->gap_info->num[i];
753                 i += d;
754             }
755             break;
756 
757         default:
758             ASSERT(0);
759             break;
760         }
761     }
762 
763     /* shift edit script elements to fill the removed ones */
764     if (is_start && i > 0) {
765         for (k = 0;k < hsp->gap_info->size - i;k++) {
766             hsp->gap_info->op_type[k] = hsp->gap_info->op_type[k + i];
767             hsp->gap_info->num[k] = hsp->gap_info->num[k + i];
768         }
769         hsp->gap_info->size -= i;
770     }
771 
772     if (is_end) {
773         hsp->gap_info->size = i + 1;
774     }
775 
776     /* update the Jumper edit script */
777     if (is_start) {
778         Int4 k = hsp->map_info->edits->num_edits - 1;
779         Int4 p = hsp->query.end - 1;
780         const Uint1 kGap = 15;
781         for (i = hsp->gap_info->size - 1;i >= 0;i--) {
782             if (hsp->gap_info->op_type[i] != eGapAlignDel) {
783 
784                 p -= hsp->gap_info->num[i];
785                 while (k >= 0 &&
786                        hsp->map_info->edits->edits[k].query_pos > p &&
787                        hsp->map_info->edits->edits[k].query_base != kGap) {
788 
789                     k--;
790                 }
791             }
792             else {
793                 Int4 j;
794                 for (j = 0;j < hsp->gap_info->num[i];j++) {
795                     ASSERT(k >= 0);
796                     ASSERT(hsp->map_info->edits->edits[k].query_base == kGap);
797                     k--;
798                 }
799             }
800         }
801         k++;
802         if (k > 0) {
803             for (i = 0;i < hsp->map_info->edits->num_edits - k;i++) {
804                 hsp->map_info->edits->edits[i] =
805                     hsp->map_info->edits->edits[i + k];
806             }
807             hsp->map_info->edits->num_edits -= k;
808             ASSERT(hsp->map_info->edits->num_edits >= 0);
809         }
810     }
811     else {
812         Int4 k = 0;
813         Int4 p = hsp->query.offset;
814         const Uint1 kGap = 15;
815 
816         for (i = 0;i < hsp->gap_info->size;i++) {
817             if (hsp->gap_info->op_type[i] != eGapAlignDel) {
818 
819                 p += hsp->gap_info->num[i];
820                 while (k < hsp->map_info->edits->num_edits &&
821                        hsp->map_info->edits->edits[k].query_pos < p &&
822                        hsp->map_info->edits->edits[k].query_base != kGap) {
823 
824                     k++;
825                 }
826             }
827             else {
828                 Int4 j;
829                 for (j = 0;j < hsp->gap_info->num[i];j++) {
830                     ASSERT(hsp->map_info->edits->edits[k].query_base == kGap);
831                     k++;
832                 }
833             }
834         }
835         hsp->map_info->edits->num_edits = k;
836     }
837 
838     /* update HSP start positions */
839     if (is_start) {
840         hsp->query.offset += delta_query;
841         hsp->subject.offset += delta_subject;
842     }
843     else {
844         hsp->query.end -= delta_query;
845         hsp->subject.end -= delta_subject;
846     }
847 
848     /* update HSP score */
849     hsp->score = s_ComputeAlignmentScore(hsp, mismatch_score, gap_open_score,
850                                          gap_extend_score);
851 
852 #if _DEBUG
853     ASSERT(s_TestHSPRanges(hsp));
854 #endif
855 
856     return 0;
857 }
858 
859 
860 #define NUM_ADAPTERS 4
861 #define MAX_ADAPTER_LEN 20
862 
863 /* Find adapeter on the 5' end in a single sequence. The search will start
864    towards the end of the last HSP (from, to) */
s_FindAdapterInSequence(Int4 hsp_from,Int4 hsp_to,Uint1 * query,Int4 query_len)865 static Int4 s_FindAdapterInSequence(Int4 hsp_from, Int4 hsp_to, Uint1* query,
866                                     Int4 query_len)
867 {
868     Uint1 adapters_tab[NUM_ADAPTERS][MAX_ADAPTER_LEN] = {
869         /* Contaminants from FastQC config file:
870            http://www.bioinformatics.babraham.ac.uk/projects/fastqc/ */
871         /* Illumina universal adapter: AGATCGGAAGAG */
872         {0, 2, 0, 3, 1, 2, 2, 0, 0, 2, 0, 2},
873         /* Illumina small RNA adapter: ATGGAATTCTCG */
874         {0, 3, 2, 2, 0, 0, 3, 3, 1, 3, 1, 2},
875         /* Nextera transosase sequence: CTGTCTCTTATA */
876         {1, 3, 2, 3, 1, 3, 1, 3, 3, 0, 3, 0},
877         /* Illumina multiplexing adapter 1: GATCGGAAGAGCACACGTCT */
878         {2, 0, 3, 1, 2, 2, 0, 0, 2, 0, 2, 1, 0, 1, 0, 1, 2, 3, 1, 3}};
879 
880     int lengths[NUM_ADAPTERS] = {12, 12, 12, 20};
881     Boolean found = FALSE;
882     Int4 adapter_start = -1;
883     int adptr_idx;
884     Int4 from = hsp_from, to = hsp_to;
885     const Int4 kMaxErrors = 1; /* max number of mismaches allowed for matching
886                                   the adapter sequence + 1*/
887 
888 
889     if (!query) {
890         return -1;
891     }
892 
893     for (adptr_idx = 0;!found && adptr_idx < NUM_ADAPTERS;adptr_idx++) {
894         Uint1* adapter = adapters_tab[adptr_idx];
895         Uint4 word = *(Uint4*)adapter;
896         Int4 q = MAX(to - lengths[adptr_idx], from);
897         int i = 0;
898         while (q < query_len - 4 && q + i < query_len && i < lengths[adptr_idx]) {
899             while (q < query_len - 4 && *(Uint4*)(query + q) != word) {
900                 q++;
901             }
902             if (q < query_len - 4) {
903                 Int4 errors = kMaxErrors + 1;
904                 i = 0;
905                 while (q + i < query_len && i < lengths[adptr_idx] &&
906                        errors) {
907 
908                     if (query[q + i] != adapter[i]) {
909                         errors--;
910                     }
911                     i++;
912                 }
913                 if (q + i == query_len || i == lengths[adptr_idx]) {
914                     adapter_start = q;
915                     found = TRUE;
916                     break;
917                 }
918 
919                 q++;
920             }
921         }
922     }
923 
924     ASSERT(adapter_start <= query_len);
925     return adapter_start;
926 }
927 
928 
929 /* Set adapter position in the chain and trim alignments that span beyond
930    adapter start position */
s_SetAdapter(HSPChain ** chains_ptr,Int4 adapter_pos,Int4 query_len,const ScoringOptions * scores)931 static Int2 s_SetAdapter(HSPChain** chains_ptr, Int4 adapter_pos,
932                          Int4 query_len, const ScoringOptions* scores)
933 {
934     HSPChain* head = NULL;
935     HSPChain* chain = NULL;
936     const Int4 kMinAdapterLen = 3;
937 
938     if (!chains_ptr || !*chains_ptr || adapter_pos < 0) {
939         return -1;
940     }
941 
942     head = *chains_ptr;
943     chain = *chains_ptr;
944     /* iterate over chains */
945     while (chain) {
946         BlastHSP* hsp = chain->hsps->hsp;
947 
948         /* check the 3' alignment extent on the query and skip this chain
949            if the query is covered almost to the end */
950         if (hsp->query.frame > 0) {
951             /* for positive strand, check the last query position */
952             HSPContainer* h = chain->hsps;
953             while (h->next) {
954                 h = h->next;
955             }
956             ASSERT(h);
957             if (query_len - h->hsp->query.end < kMinAdapterLen) {
958                 chain = chain->next;
959                 continue;
960             }
961         }
962         else {
963             /* for negative strand, check the first query position */
964             ASSERT(hsp);
965             if (hsp->query.offset < kMinAdapterLen) {
966                 chain = chain->next;
967                 continue;
968             }
969         }
970 
971         /* set adapter start position in the chain */
972         chain->adapter = adapter_pos;
973 
974         /* Trim alignment that covers part of the adapter */
975         /* for query on the positive strand */
976         if (hsp->query.frame > 0) {
977             HSPContainer* h = chain->hsps;
978 
979             /* if all HSPs are behind the adapter, delete this chain;
980                we require that HSP without adapter is at least 5 bases long */
981             if (h->hsp->query.offset >= adapter_pos - 5) {
982                 HSPChain* prev = head;
983                 HSPChain* next = chain->next;
984                 while (prev && prev->next != chain) {
985                     prev =prev->next;
986                 }
987 
988                 chain->next = NULL;
989                 HSPChainFree(chain);
990                 chain = next;
991                 if (prev) {
992                     prev->next = next;
993                 }
994                 else {
995                     head = chain;
996                 }
997 
998                 continue;
999             }
1000 
1001             /* find the first HSP with end position after adapter */
1002             while (h && h->hsp->query.end <= adapter_pos) {
1003                 h = h->next;
1004             }
1005 
1006             /* if one is found */
1007             if (h) {
1008                 HSPContainer* hh = NULL;
1009                 hsp = h->hsp;
1010 
1011                 /* if adapter is with in the HSP trim the HSP */
1012                 if (hsp->query.offset < adapter_pos ) {
1013                     Int4 old_score = hsp->score;
1014                     ASSERT(hsp->query.end - adapter_pos > 0);
1015                     s_TrimHSP(hsp, hsp->query.end - adapter_pos, TRUE,
1016                               FALSE, scores->penalty, scores->gap_open,
1017                               scores->gap_extend);
1018                     chain->score += hsp->score - old_score;
1019 
1020                     /* remove HSPs after h as they are past the adapter */
1021                     if (h->next) {
1022                         h->next = HSPContainerFree(h->next);
1023                     }
1024                 }
1025                 else {
1026                     /* otherwise delete h and everything after it */
1027                     hh = chain->hsps;
1028                     while (hh && hh->next != h) {
1029                         hh = hh->next;
1030                     }
1031                     ASSERT(hh);
1032                     hh->next = HSPContainerFree(hh->next);
1033                 }
1034 
1035                 /* compute the new chain score */
1036                 chain->score = s_ComputeChainScore(chain, scores, query_len,
1037                                                    FALSE);
1038             }
1039 
1040             /* mark adapter in the HSPs */
1041             h = chain->hsps;
1042             while (h->next) {
1043                 h = h->next;
1044             }
1045             h->hsp->map_info->right_edge |= MAPPER_ADAPTER;
1046             h->hsp->map_info->right_edge |= MAPPER_EXON;
1047         }
1048         else {
1049             /* negative strand: adapter is in the beginning of the chain */
1050             Int4 pos_minus = query_len - adapter_pos - 1;
1051             HSPContainer* h = chain->hsps;
1052 
1053             /* find the first HSP with end position after adapter;
1054                we require that HSP without adapter is at least 5 bases long */
1055             while (h && h->hsp->query.end <= pos_minus + 5) {
1056                 h = h->next;
1057             }
1058 
1059             /* if all HSPs are before the adapter, delete this chain */
1060             if (!h) {
1061                 /* remove chain */
1062                 HSPChain* prev = head;
1063                 HSPChain* next = chain->next;
1064                 while (prev && prev->next != chain) {
1065                     prev =prev->next;
1066                 }
1067 
1068                 chain->next = NULL;
1069                 HSPChainFree(chain);
1070                 chain = next;
1071                 if (prev) {
1072                     prev->next = next;
1073                 }
1074                 else {
1075                     head = chain;
1076                 }
1077 
1078                 continue;
1079             }
1080 
1081             /* if h is not the first HSP in the chain, remove all HSPs before
1082                h */
1083             if (h != chain->hsps) {
1084                 HSPContainer* hh = chain->hsps;
1085                 while (hh && hh->next != h) {
1086                     hh = hh->next;
1087                 }
1088                 ASSERT(hh);
1089                 hh->next = NULL;
1090                 HSPContainerFree(chain->hsps);
1091                 chain->hsps = h;
1092 
1093                 /* compute new chain score */
1094                 chain->score = s_ComputeChainScore(chain, scores, query_len,
1095                                                    FALSE);
1096             }
1097             ASSERT(h);
1098 
1099             /* set adapter information */
1100             hsp = chain->hsps->hsp;
1101             hsp->map_info->left_edge |= MAPPER_ADAPTER;
1102             hsp->map_info->left_edge |= MAPPER_EXON;
1103 
1104             /* trim HSP if necessary */
1105             if (pos_minus >= hsp->query.offset) {
1106                 Int4 old_score = hsp->score;
1107                 ASSERT(pos_minus - hsp->query.offset + 1 > 0);
1108                 s_TrimHSP(hsp, pos_minus - hsp->query.offset + 1,
1109                           TRUE, TRUE, scores->penalty, scores->gap_open,
1110                           scores->gap_extend);
1111                 chain->score += hsp->score - old_score;
1112             }
1113         }
1114 
1115         chain = chain->next;
1116     }
1117 
1118     ASSERT(!head || s_TestChains(head));
1119     *chains_ptr = head;
1120 
1121     return 0;
1122 }
1123 
1124 
s_FindAdapters(HSPChain ** saved,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * score_opts)1125 static Int4 s_FindAdapters(HSPChain** saved,
1126                            const BLAST_SequenceBlk* query_blk,
1127                            const BlastQueryInfo* query_info,
1128                            const ScoringOptions* score_opts)
1129 {
1130     Int4 query_idx;
1131 
1132     for (query_idx = 0;query_idx < query_info->num_queries;query_idx++) {
1133         HSPChain* chain = NULL;
1134         HSPChain* ch = NULL;
1135         Uint1* query = NULL;
1136         Int4 query_len;
1137         Int4 from = -1, to = -1;
1138         BlastHSP* hsp = NULL;
1139         Int4 overhang = 0;
1140         Int4 adapter_pos = -1;
1141 
1142         if (!saved[query_idx]) {
1143             continue;
1144         }
1145 
1146         /* we search for adapters and  only in the plus strand of the query */
1147         query = query_blk->sequence +
1148             query_info->contexts[query_idx * NUM_STRANDS].query_offset;
1149         ASSERT(query);
1150         query_len = query_info->contexts[query_idx * NUM_STRANDS].query_length;
1151 
1152         /* find the best scoring chain */
1153         chain = saved[query_idx];
1154         ch = chain->next;
1155         for (; ch; ch = ch->next) {
1156             if (ch->score > chain->score) {
1157                 chain = ch;
1158             }
1159         }
1160         ASSERT(chain);
1161 
1162         /* find query coverage by the whole chain */
1163         if (chain->hsps->hsp->query.frame > 0) {
1164             HSPContainer* h = chain->hsps;
1165             from = h->hsp->query.offset;
1166             while (h->next) {
1167                 h = h->next;
1168             }
1169             to = h->hsp->query.end;
1170         }
1171         else {
1172             HSPContainer* h = chain->hsps;
1173             to = query_len - h->hsp->query.offset;
1174             while (h->next) {
1175                 h = h->next;
1176             }
1177             from = query_len - h->hsp->query.end;
1178         }
1179         ASSERT(from >= 0 && to >= 0);
1180 
1181         /* do not search for adapters if the query is mostly covered by the
1182            best scoring chain */
1183         if (from < 20 && to > query_len - 3) {
1184             continue;
1185         }
1186 
1187         /* find the chain with the longest overhang */
1188         chain = saved[query_idx];
1189         ch = chain->next;
1190         for (; ch; ch = ch->next) {
1191             HSPContainer* h = ch->hsps;
1192             if (h->hsp->query.frame > 0) {
1193                 while (h->next) {
1194                     h = h->next;
1195                 }
1196                 if (query_len - h->hsp->query.end > overhang) {
1197                     overhang = query_len - h->hsp->query.end;
1198                     chain = ch;
1199                 }
1200             }
1201             else {
1202                 if (h->hsp->query.offset + 1 > overhang) {
1203                     overhang = h->hsp->query.offset + 1;
1204                     chain = ch;
1205                 }
1206             }
1207         }
1208         ASSERT(chain);
1209 
1210         /* find location from where to search for the adapter */
1211         hsp = chain->hsps->hsp;
1212         if (hsp->query.frame > 0) {
1213             HSPContainer* h = chain->hsps;
1214             while (h->next) {
1215                 h = h->next;
1216             }
1217             hsp = h->hsp;
1218         }
1219         ASSERT(hsp);
1220 
1221         if (hsp->query.frame > 0) {
1222             from = hsp->query.offset;
1223             to  = hsp->query.end;
1224         }
1225         else {
1226             from = query_len - hsp->query.end;
1227             to = query_len - hsp->query.offset;
1228         }
1229 
1230         /* do not search for adapters if the read aligns almost to the end */
1231         if (to >= query_len - 3) {
1232             continue;
1233         }
1234 
1235         /* search for adapters */
1236         adapter_pos = s_FindAdapterInSequence(from, to, query, query_len);
1237 
1238         /* set adapter information in all chains and trim HSPs that extend
1239            into the adapter */
1240         if (adapter_pos >= 0) {
1241             s_SetAdapter(&(saved[query_idx]), adapter_pos, query_len,
1242                          score_opts);
1243         }
1244     }
1245 
1246     return 0;
1247 }
1248 
1249 
1250 /* Search for polyA tail at the end of a sequence and return the start position */
s_FindPolyAInSequence(Uint1 * sequence,Int4 length)1251 static Int4 s_FindPolyAInSequence(Uint1* sequence, Int4 length)
1252 {
1253     Int4 i;
1254     const Uint1 kBaseA = 0;
1255     Int4 num_a = 0;
1256     Int4 err = 0;
1257     const Int4 kMaxErrors = 3;
1258 
1259     if (!sequence) {
1260         return -1;
1261     }
1262 
1263     /* iterate over positions untile kMaxErrors non A bases are found */
1264     i = length - 1;
1265     while (i >= 0 && err < kMaxErrors) {
1266         if (sequence[i] != kBaseA) {
1267             err++;
1268         }
1269 
1270         i--;
1271     }
1272     i++;
1273 
1274     /* find the beginnig of the string of As */
1275     while (i < length - 1 &&
1276            (sequence[i] != kBaseA || sequence[i + 1] != kBaseA)) {
1277 
1278         if (sequence[i] != kBaseA) {
1279             err--;
1280         }
1281         i++;
1282     }
1283 
1284     num_a = length - i - err;
1285 
1286     /* short tails must have no errors */
1287     if (num_a < 3 || (num_a < 5 && err > 0)) {
1288         return -1;
1289     }
1290 
1291     return i;
1292 }
1293 
1294 
1295 /* Set PolyA information in HSP chains */
s_SetPolyATail(HSPChain * chains,Int4 positive_start,Int4 negative_start,Int4 query_len)1296 static Int4 s_SetPolyATail(HSPChain* chains, Int4 positive_start,
1297                            Int4 negative_start, Int4 query_len)
1298 {
1299     HSPChain* ch = NULL;
1300 
1301     if (!chains) {
1302         return -1;
1303     }
1304 
1305     for (ch = chains; ch; ch = ch->next) {
1306         HSPContainer* h = ch->hsps;
1307         while (h->next) {
1308             h = h->next;
1309         }
1310 
1311         if (query_len - h->hsp->query.end < 5) {
1312             continue;
1313         }
1314 
1315         if ((h->hsp->query.frame < 0 && negative_start >= 0) ||
1316             (h->hsp->query.frame > 0 && positive_start >= 0)) {
1317 
1318             h->hsp->map_info->right_edge |= MAPPER_POLY_A;
1319             h->hsp->map_info->right_edge |= MAPPER_EXON;
1320 
1321             if (h->hsp->query.frame > 0) {
1322                 ch->polyA = MAX(positive_start, h->hsp->query.end);
1323             }
1324             else {
1325                 ch->polyA = MAX(negative_start, h->hsp->query.end);
1326             }
1327         }
1328     }
1329 
1330     return 0;
1331 }
1332 
1333 
s_FindPolyATails(HSPChain ** saved,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info)1334 static Int4 s_FindPolyATails(HSPChain** saved,
1335                              const BLAST_SequenceBlk* query_blk,
1336                              const BlastQueryInfo* query_info)
1337 {
1338     Int4 query_idx;
1339 
1340     /* for each query */
1341     for (query_idx = 0;query_idx < query_info->num_queries;query_idx++) {
1342         HSPChain* chain = NULL;
1343         HSPChain* ch = NULL;
1344         Uint1* query = NULL;
1345         Int4 query_len;
1346         Int4 from = -1, to = -1;
1347         Int4 positive_start, negative_start;
1348 
1349         /* skip queries with no results and those with adapters */
1350         if (!saved[query_idx] || saved[query_idx]->adapter >= 0) {
1351             continue;
1352         }
1353 
1354         query_len = query_info->contexts[query_idx * NUM_STRANDS].query_length;
1355 
1356         /* find the best scoring chain */
1357         chain = saved[query_idx];
1358         ch = chain->next;
1359         for (; ch; ch = ch->next) {
1360             if (ch->score > chain->score) {
1361                 chain = ch;
1362             }
1363         }
1364         ASSERT(chain);
1365 
1366         /* find query coverage by the whole chain */
1367         if (chain->hsps->hsp->query.frame > 0) {
1368             HSPContainer* h = chain->hsps;
1369             from = h->hsp->query.offset;
1370             while (h->next) {
1371                 h = h->next;
1372             }
1373             to = h->hsp->query.end;
1374         }
1375         else {
1376             HSPContainer* h = chain->hsps;
1377             to = query_len - h->hsp->query.offset;
1378             while (h->next) {
1379                 h = h->next;
1380             }
1381             from = query_len - h->hsp->query.end;
1382         }
1383         ASSERT(from >= 0 && to >= 0);
1384 
1385         /* do not search for polyA tails if the query is mostly covered by the
1386            best scoring chain */
1387         if (from < 4 && to > query_len - 3) {
1388             continue;
1389         }
1390 
1391         /* search for polyA tail on the positive strand */
1392         query = query_blk->sequence +
1393             query_info->contexts[query_idx * NUM_STRANDS].query_offset;
1394         positive_start = s_FindPolyAInSequence(query, query_len);
1395 
1396         /* search for the polyA tail on the negative strand */
1397         query = query_blk->sequence +
1398             query_info->contexts[query_idx * NUM_STRANDS + 1].query_offset;
1399         negative_start = s_FindPolyAInSequence(query, query_len);
1400 
1401         /* set polyA start positions and HSP flags in the saved chains */
1402         if (positive_start >= 0 || negative_start >= 0) {
1403             s_SetPolyATail(saved[query_idx], positive_start, negative_start,
1404                            query_len);
1405         }
1406     }
1407 
1408     return 0;
1409 }
1410 
1411 
1412 /* Merge two HSPs into one. The two HSPs may only be separated by at most
1413    one gap or mismatch. Function returns NULL is the HSPs cannon be merged */
s_MergeHSPs(const BlastHSP * first,const BlastHSP * second,const Uint1 * query,const ScoringOptions * score_opts)1414 static BlastHSP* s_MergeHSPs(const BlastHSP* first, const BlastHSP* second,
1415                              const Uint1* query,
1416                              const ScoringOptions* score_opts)
1417 {
1418     BlastHSP* merged_hsp = NULL;  /* this will be the result */
1419     const BlastHSP* hsp = second;
1420     Int4 query_gap;
1421     Int4 subject_gap;
1422     Int4 mismatches = 0;
1423     Int4 gap_info_size;
1424     Int4 edits_size;
1425     Int4 k;
1426 
1427     if (!first || !second || !query || !score_opts) {
1428         return NULL;
1429     }
1430 
1431     merged_hsp = Blast_HSPClone(first);
1432     if (!merged_hsp) {
1433         return NULL;
1434     }
1435 
1436     query_gap = second->query.offset - first->query.end;
1437     subject_gap = second->subject.offset - first->subject.end;
1438 
1439     if (query_gap < 0 || subject_gap < 0 || query_gap > 1 ||
1440         query_gap != subject_gap) {
1441 
1442         return NULL;
1443     }
1444 
1445     gap_info_size = first->gap_info->size + second->gap_info->size +
1446         MAX(query_gap, subject_gap);
1447 
1448     edits_size = first->map_info->edits->num_edits +
1449         second->map_info->edits->num_edits + MAX(query_gap, subject_gap);
1450 
1451     /* FIXME: should be done through an API */
1452     /* reallocate memory for edit scripts */
1453     merged_hsp->gap_info->op_type = realloc(merged_hsp->gap_info->op_type,
1454                                             gap_info_size *
1455                                             sizeof(EGapAlignOpType));
1456 
1457     merged_hsp->gap_info->num = realloc(merged_hsp->gap_info->num,
1458                                         gap_info_size *
1459                                         sizeof(Int4));
1460 
1461     merged_hsp->map_info->edits->edits = realloc(
1462                                        merged_hsp->map_info->edits->edits,
1463                                        edits_size * sizeof(JumperEdit));
1464 
1465     if (!merged_hsp->gap_info->op_type ||
1466         !merged_hsp->gap_info->num ||
1467         !merged_hsp->map_info->edits->edits) {
1468 
1469         Blast_HSPFree(merged_hsp);
1470         return NULL;
1471     }
1472 
1473     if (query_gap == subject_gap) {
1474         mismatches = query_gap;
1475     }
1476 
1477     /* add mismatches to gap_info */
1478     if (mismatches > 0) {
1479         if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1480             == eGapAlignSub) {
1481             merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1482                 mismatches;
1483         }
1484         else {
1485             merged_hsp->gap_info->op_type[merged_hsp->gap_info->size] =
1486                 eGapAlignSub;
1487             merged_hsp->gap_info->num[merged_hsp->gap_info->size] =
1488                 mismatches;
1489             merged_hsp->gap_info->size++;
1490         }
1491         ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1492     }
1493 
1494     /* merge gap_info */
1495     for (k = 0;k < hsp->gap_info->size;k++) {
1496 
1497         if (merged_hsp->gap_info->op_type[merged_hsp->gap_info->size - 1]
1498             == hsp->gap_info->op_type[k]) {
1499 
1500             merged_hsp->gap_info->num[merged_hsp->gap_info->size - 1] +=
1501                 hsp->gap_info->num[k];
1502         }
1503         else {
1504             merged_hsp->gap_info->op_type[merged_hsp->gap_info->size]
1505                 = hsp->gap_info->op_type[k];
1506 
1507             merged_hsp->gap_info->num[merged_hsp->gap_info->size++]
1508                 = hsp->gap_info->num[k];
1509         }
1510         ASSERT(merged_hsp->gap_info->size <= gap_info_size);
1511     }
1512 
1513     /* add mismatches to jumper edits */
1514     if (mismatches > 0) {
1515         JumperEdit* edit = merged_hsp->map_info->edits->edits +
1516             merged_hsp->map_info->edits->num_edits++;
1517         edit->query_pos = merged_hsp->query.end;
1518         /* FIXME: Mismatch bases cannot be currently set because there is
1519            no access to query or subject sequence in this function. */
1520         edit->query_base = query[edit->query_pos];
1521         edit->subject_base = edit->query_base;
1522 
1523         ASSERT(merged_hsp->map_info->edits->num_edits <= edits_size);
1524     }
1525 
1526     /* merge jumper edits */
1527     if (hsp->map_info->edits->num_edits) {
1528         JumperEdit* edit = merged_hsp->map_info->edits->edits +
1529             merged_hsp->map_info->edits->num_edits;
1530 
1531         ASSERT(merged_hsp->map_info->edits->num_edits +
1532                hsp->map_info->edits->num_edits <= edits_size);
1533 
1534         memcpy(edit, hsp->map_info->edits->edits,
1535                hsp->map_info->edits->num_edits * sizeof(JumperEdit));
1536 
1537         merged_hsp->map_info->edits->num_edits +=
1538             hsp->map_info->edits->num_edits;
1539     }
1540 
1541     /* update alignment extents, scores, etc */
1542     merged_hsp->query.end = hsp->query.end;
1543     merged_hsp->subject.end = hsp->subject.end;
1544     merged_hsp->score = s_ComputeAlignmentScore(merged_hsp,
1545                                                 score_opts->penalty,
1546                                                 score_opts->gap_open,
1547                                                 score_opts->gap_extend);
1548     merged_hsp->num_ident += hsp->num_ident;
1549 
1550     merged_hsp->map_info->right_edge = hsp->map_info->right_edge;
1551     if (!merged_hsp->map_info->subject_overhangs) {
1552         return merged_hsp;
1553     }
1554 
1555     if (!hsp->map_info->subject_overhangs ||
1556         !hsp->map_info->subject_overhangs->right_len ||
1557         !hsp->map_info->subject_overhangs->right) {
1558 
1559         merged_hsp->map_info->subject_overhangs->right_len = 0;
1560         if (merged_hsp->map_info->subject_overhangs->right) {
1561             sfree(merged_hsp->map_info->subject_overhangs->right);
1562             merged_hsp->map_info->subject_overhangs->right = NULL;
1563         }
1564     }
1565     else {
1566         Int4 new_right_len = hsp->map_info->subject_overhangs->right_len;
1567         if (merged_hsp->map_info->subject_overhangs->right_len <
1568             new_right_len) {
1569 
1570             merged_hsp->map_info->subject_overhangs->right =
1571                 realloc(merged_hsp->map_info->subject_overhangs->right,
1572                         new_right_len * sizeof(Uint1));
1573         }
1574         memcpy(merged_hsp->map_info->subject_overhangs->right,
1575                hsp->map_info->subject_overhangs->right,
1576                new_right_len * sizeof(Uint1));
1577     }
1578 
1579     return merged_hsp;
1580 }
1581 
1582 
1583 /* Find paired reads mapped to different database sequences */
s_FindRearrangedPairs(HSPChain ** saved,const BlastQueryInfo * query_info)1584 static int s_FindRearrangedPairs(HSPChain** saved,
1585                                  const BlastQueryInfo* query_info)
1586 {
1587     Int4 query_idx;
1588 
1589     /* iterate over single reads from each pair */
1590     for (query_idx = 0;query_idx + 1 < query_info->num_queries; query_idx++) {
1591         HSPChain* chain = saved[query_idx];
1592         HSPChain* thepair = saved[query_idx + 1];
1593 
1594         /* skip queries with no results */
1595         if (!chain || !thepair) {
1596             continue;
1597         }
1598 
1599         /* skip queries that do not have pairs */
1600         if (query_info->contexts[query_idx * NUM_STRANDS].segment_flags !=
1601             eFirstSegment) {
1602 
1603             continue;
1604         }
1605 
1606         /* skip queries with more than one saved chain and ones that already
1607            have a pair */
1608         if (chain->next || chain->pair || thepair->next) {
1609             continue;
1610         }
1611 
1612         /* skip chains aligned to the same subject; these are taken care of
1613            elsewhere */
1614         if (chain->oid == thepair->oid) {
1615             continue;
1616         }
1617 
1618         /* skip HSP chains aligned on the same strands */
1619         if (SIGN(chain->hsps->hsp->query.frame) ==
1620             SIGN(thepair->hsps->hsp->query.frame)) {
1621             continue;
1622         }
1623 
1624         /* mark the pair */
1625         chain->pair = thepair;
1626         thepair->pair = chain;
1627 
1628         chain->pair_conf = PAIR_PARALLEL;
1629         thepair->pair_conf = PAIR_PARALLEL;
1630     }
1631 
1632     return 0;
1633 }
1634 
1635 
1636 /* Sort chains by score in descending order */
s_CompareChainsByScore(const void * cha,const void * chb)1637 static int s_CompareChainsByScore(const void* cha, const void* chb)
1638 {
1639     const HSPChain* a = *((HSPChain**)cha);
1640     const HSPChain* b = *((HSPChain**)chb);
1641 
1642     if (a->score < b->score) {
1643         return 1;
1644     }
1645     else if (a->score > b->score) {
1646         return -1;
1647     }
1648     else {
1649         return 0;
1650     }
1651 }
1652 
1653 /* Remove chains with scores below the these of the best ones considering
1654    score bonus for pairs and count the number of chains with same of better
1655    score for each chain */
s_PruneChains(HSPChain ** saved,Int4 num_queries,Int4 pair_bonus)1656 static int s_PruneChains(HSPChain** saved, Int4 num_queries, Int4 pair_bonus)
1657 
1658 {
1659     Int4 query_idx;
1660     Int4 array_size = 10;
1661     HSPChain** array = calloc(array_size, sizeof(HSPChain*));
1662 
1663     /* Prunning is done in two passes: first considering single chains,
1664        then considering pairs. The first pass breaks pairs where there is much
1665        better chain with no pair. The second pass selects the best scoring
1666        chains giving preference to pairs. Pairs are preferable, so they get
1667        additional score. Two passes are requires, because we compare single
1668        chains with single chains and pairs with pairs. This is to avoid
1669        breaking pairs in situations with two pairs with one high scoring
1670        chain. */
1671 
1672     /* first pass: for each query remove poor scoring chains that may split
1673        pairs */
1674     for (query_idx = 0; query_idx < num_queries; query_idx++) {
1675         HSPChain* chain = saved[query_idx];
1676         HSPChain* prev = NULL;
1677         Int4 best_score = 0;
1678 
1679         if (!chain || !chain->next) {
1680             continue;
1681         }
1682 
1683         /* find the best single chain score */
1684         for (chain = saved[query_idx]; chain; chain = chain->next) {
1685             if (chain->score > best_score) {
1686                 best_score = chain->score;
1687             }
1688         }
1689 
1690         /* remove chains with poor scores, considering pair bonus */
1691         chain = saved[query_idx];
1692         while (chain) {
1693             Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1694 
1695             if (score < best_score) {
1696                 HSPChain* next = chain->next;
1697 
1698                 chain->next = NULL;
1699                 HSPChainFree(chain);
1700                 chain = next;
1701 
1702                 if (prev) {
1703                     prev->next = next;
1704                 }
1705                 else {
1706                     saved[query_idx] = next;
1707                 }
1708             }
1709             else {
1710                 prev = chain;
1711                 chain = chain->next;
1712             }
1713         }
1714 
1715     }
1716 
1717     /* second pass: for each query remove chains scoring poorer than the best
1718        ones considering pair bonus */
1719     for (query_idx = 0; query_idx < num_queries; query_idx++) {
1720         HSPChain* chain = saved[query_idx];
1721         HSPChain* prev = NULL;
1722         Int4 best_score = 0;
1723         Int4 best_pair_score = 0;
1724         Int4 i;
1725         Int4 count = 0;
1726         Int4 num_chains = 0;
1727 
1728         if (!chain) {
1729             continue;
1730         }
1731 
1732         if (!chain->next) {
1733             chain->count = 1;
1734             continue;
1735         }
1736 
1737         /* find the best scores for single chains and pairs */
1738         for (chain = saved[query_idx]; chain; chain = chain->next) {
1739             Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1740             if (score >= best_score) {
1741                 best_score = score;
1742             }
1743 
1744             if (chain->pair &&
1745                 chain->score + chain->pair->score >= best_pair_score) {
1746 
1747                 best_pair_score = chain->score + chain->pair->score;
1748             }
1749 
1750             /* collect chains in the array */
1751             if (num_chains >= array_size) {
1752                 array_size *= 2;
1753                 array = realloc(array, array_size * sizeof(HSPChain*));
1754                 if (!array) {
1755                     return -1;
1756                 }
1757             }
1758             array[num_chains++] = chain;
1759         }
1760 
1761         /* sort chains by scores in descending order */
1762         ASSERT(num_chains > 1);
1763         qsort(array, num_chains, sizeof(HSPChain*), s_CompareChainsByScore);
1764 
1765         /* find multiplicity for each chain: number of chains with same or
1766            larger score */
1767         /* Note that multiplicity is currently not reported. Only the number
1768            of returned hits for each query is reported */
1769         for (i = 0;i < num_chains; i++) {
1770             Int4  k;
1771 
1772             count++;
1773             /* check for chains with the same score */
1774             k = i + 1;
1775             while (k < num_chains && array[k]->score == array[i]->score) {
1776                 k++;
1777                 count++;
1778             }
1779 
1780             /* save multiplicity */
1781             for (;i < k;i++) {
1782                 array[i]->count = count;
1783             }
1784             i--;
1785         }
1786 
1787 
1788         /* remove chains with poor score and count chains with equal or better
1789            score to the minimum pair one */
1790         chain = saved[query_idx];
1791         while (chain) {
1792             Int4 score = (chain->pair ? chain->score + pair_bonus : chain->score);
1793 
1794             /* We compare single chains with single chains and pairs with
1795                pairs. There may be pairs with one strong and one weak hit.
1796                Comparing single chain scores we can be left with single chains
1797                from different pairs. This comparison aviods that. The first
1798                pass ensures that pairs with poor scoring chains are removed,
1799                so we do not have to compare singles with pairs. */
1800             if ((!chain->pair && score < best_score) ||
1801                 (chain->pair && chain->score + chain->pair->score <
1802                  best_pair_score)) {
1803 
1804                 HSPChain* next = chain->next;
1805 
1806                 chain->next = NULL;
1807                 HSPChainFree(chain);
1808                 chain = next;
1809 
1810                 if (prev) {
1811                     prev->next = next;
1812                 }
1813                 else {
1814                     saved[query_idx] = next;
1815                 }
1816             }
1817             else {
1818                 prev = chain;
1819                 chain = chain->next;
1820             }
1821         }
1822     }
1823 
1824     if (array) {
1825         sfree(array);
1826     }
1827 
1828     return 0;
1829 }
1830 
1831 
1832 /* Compare chains by subject oid and offfset */
s_CompareChainsByOid(const void * cha,const void * chb)1833 static int s_CompareChainsByOid(const void* cha, const void* chb)
1834 {
1835     const HSPChain* a = *((HSPChain**)cha);
1836     const HSPChain* b = *((HSPChain**)chb);
1837 
1838     if (a->oid > b->oid) {
1839         return 1;
1840     }
1841     else if (a->oid < b->oid) {
1842         return -1;
1843     }
1844     else if (a->hsps->hsp->subject.offset > b->hsps->hsp->subject.offset) {
1845         return 1;
1846     }
1847     else if (a->hsps->hsp->subject.offset < b->hsps->hsp->subject.offset) {
1848         return -1;
1849     }
1850 
1851     return 0;
1852 }
1853 
1854 /* Sort chains by subject oid and position and HSPs in the chains by query and
1855    subject positions */
s_SortChains(HSPChain ** saved,Int4 num_queries,int (* comp)(const void *,const void *))1856 static Int4 s_SortChains(HSPChain** saved, Int4 num_queries,
1857                          int(*comp)(const void*, const void*))
1858 {
1859     Int4 i;
1860     Int4 array_size = 50;
1861     HSPChain** chain_array = NULL;
1862 
1863     if (!saved || num_queries < 0) {
1864         return -1;
1865     }
1866 
1867     chain_array = calloc(array_size, sizeof(HSPChain*));
1868     if (!chain_array) {
1869         return -1;
1870     }
1871 
1872     /* for each query */
1873     for (i = 0;i < num_queries;i++) {
1874         HSPChain* chain = saved[i];
1875         Int4 num_chains = 0;
1876         Int4 k;
1877 
1878         if (!chain) {
1879             continue;
1880         }
1881 
1882         /* create an array of pointer to chains */
1883         for (; chain; chain = chain->next) {
1884             if (num_chains >= array_size) {
1885                 array_size *= 2;
1886                 chain_array = realloc(chain_array, array_size *
1887                                       sizeof(HSPChain*));
1888 
1889                 ASSERT(chain_array);
1890                 if (!chain_array) {
1891                     return -1;
1892                 }
1893             }
1894             chain_array[num_chains++] = chain;
1895         }
1896 
1897         if (num_chains > 1) {
1898             /* sort chains */
1899             qsort(chain_array, num_chains, sizeof(HSPChain*), comp);
1900 
1901             /* create the list of chains in the new order */
1902             for (k = 0;k < num_chains - 1;k++) {
1903                 chain_array[k]->next = chain_array[k + 1];
1904             }
1905             chain_array[num_chains - 1]->next = NULL;
1906 
1907             saved[i] = chain_array[0];
1908             ASSERT(saved[i]);
1909         }
1910     }
1911 
1912     if (chain_array) {
1913         free(chain_array);
1914     }
1915 
1916     return 0;
1917 }
1918 
1919 /* Convert internal HSPChain structure to BlastHSPChain used to report results */
s_HSPChainToBlastHSPChain(HSPChain * chain)1920 static BlastHSPChain* s_HSPChainToBlastHSPChain(HSPChain* chain)
1921 {
1922     BlastHSPChain* new_chain = NULL;
1923     HSPContainer* h = NULL;
1924     Int4 num_hsps = 0;
1925     Int4 index = 0;
1926 
1927     if (!chain || !chain->hsps) {
1928         return NULL;
1929     }
1930 
1931     new_chain = Blast_HSPChainNew();
1932     if (!new_chain) {
1933         return NULL;
1934     }
1935 
1936     new_chain->query_index = chain->context / NUM_STRANDS;
1937     new_chain->oid = chain->oid;
1938     new_chain->score = chain->score;
1939     new_chain->adapter = chain->adapter;
1940     new_chain->polyA = chain->polyA;
1941 
1942     /* move hsps */
1943     for (h = chain->hsps; h; h = h->next) {
1944         num_hsps++;
1945     }
1946     new_chain->num_hsps = num_hsps;
1947     new_chain->hsp_array = calloc(num_hsps, sizeof(BlastHSP*));
1948     if (!new_chain->hsp_array) {
1949         return NULL;
1950     }
1951     for (h = chain->hsps; h; h = h->next) {
1952         new_chain->hsp_array[index++] = h->hsp;
1953         h->hsp = NULL;
1954     }
1955 
1956     return new_chain;
1957 }
1958 
1959 /* Separate HSPs that overlap on the query to different chains (most output
1960    formats do not support query overlaps) */
s_RemoveOverlaps(HSPChain * chain,const ScoringOptions * score_opts,Int4 query_len)1961 static int s_RemoveOverlaps(HSPChain* chain, const ScoringOptions* score_opts,
1962                             Int4 query_len)
1963 {
1964     HSPContainer* h = NULL;
1965     HSPContainer* prev = NULL;
1966     HSPChain* tail = chain->next;
1967     HSPChain* head = chain;
1968     Boolean rescore = FALSE;
1969 
1970     if (!chain) {
1971         return -1;
1972     }
1973 
1974     /* current and previous HSP */
1975     h = chain->hsps->next;
1976     prev = chain->hsps;
1977 
1978     for (; h; h = h->next) {
1979 
1980         /* if HSPs overlap on the query */
1981         if (prev->hsp->query.end > h->hsp->query.offset) {
1982             /* do a flat copy of the chain */
1983             HSPChain* new_chain = HSPChainNew(chain->context);
1984             memcpy(new_chain, chain, sizeof(HSPChain));
1985             new_chain->pair = NULL;
1986             new_chain->next = NULL;
1987 
1988             /* new chain starts with h */
1989             new_chain->hsps = h;
1990 
1991             /* the original chain ends on the previous HSP */
1992             prev->next = NULL;
1993 
1994             /* add new chain to the list */
1995             head->next = new_chain;
1996             head = new_chain;
1997 
1998             rescore = TRUE;
1999         }
2000         prev = h;
2001     }
2002 
2003     /* compute chains scores */
2004     if (rescore) {
2005         for (; chain; chain = chain->next) {
2006             chain->score = s_ComputeChainScore(chain, score_opts, query_len,
2007                                                FALSE);
2008         }
2009     }
2010 
2011     /* re-attach other chains to the list */
2012     head->next = tail;
2013 
2014     return 0;
2015 }
2016 
2017 /* Populate HSP data and save final results */
s_Finalize(HSPChain ** saved,BlastMappingResults * results,const BlastQueryInfo * query_info,const BLAST_SequenceBlk * query_blk,const ScoringOptions * score_opts,Boolean is_paired)2018 static int s_Finalize(HSPChain** saved, BlastMappingResults* results,
2019                       const BlastQueryInfo* query_info,
2020                       const BLAST_SequenceBlk* query_blk,
2021                       const ScoringOptions* score_opts,
2022                       Boolean is_paired)
2023 {
2024     Int4 query_idx;
2025     Int4 num_results = 0;
2026     Int4 num = 0;
2027     const Int4 kPairBonus = 21;
2028     Int4* num_unique_chains = NULL;
2029 
2030     /* remove poor scoring chains and find chain multiplicity */
2031     if (!getenv("MAPPER_NO_PRUNNING")) {
2032         s_PruneChains(saved, query_info->num_queries, kPairBonus);
2033     }
2034 
2035     /* find adapters in query sequences */
2036     s_FindAdapters(saved, query_blk, query_info, score_opts);
2037 
2038     /* find polyA tails in query sequences */
2039     s_FindPolyATails(saved, query_blk, query_info);
2040 
2041     /* find pairs of chains aligned to different subjects */
2042     s_FindRearrangedPairs(saved, query_info);
2043 
2044     /* sort chains by subject oid to ensure stable results for multithreaded
2045        runs; sorting must be done here so that compartment numbers are stable */
2046     /* FIXME: this may not be needed */
2047     s_SortChains(saved, query_info->num_queries, s_CompareChainsByOid);
2048 
2049     /* Compute number of results to store, and number of unique mappings for
2050        each query (some chains may be copied to create pairs) */
2051 
2052     num_unique_chains = calloc(query_info->num_queries, sizeof(Int4));
2053     if (!num_unique_chains) {
2054         return -1;
2055     }
2056 
2057     /* count number of unique mappings */
2058     for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2059         HSPChain* chain = saved[query_idx];
2060         HSPChain* prev = chain;
2061 
2062         if (!chain) {
2063             continue;
2064         }
2065 
2066         num_unique_chains[query_idx]++;
2067 
2068         chain = chain->next;
2069         for (; chain; chain = chain->next, prev = prev->next) {
2070             if (prev->oid != chain->oid ||
2071                 s_FindFragmentStart(prev) != s_FindFragmentStart(chain)) {
2072 
2073                 num_unique_chains[query_idx]++;
2074             }
2075         }
2076     }
2077 
2078     /* separate HSPs that overlap on the query into different chains */
2079     if (getenv("MAPPER_NO_OVERLAPPED_HSP_MERGE")) {
2080         for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2081             HSPChain* chain = saved[query_idx];
2082             Int4 query_len;
2083 
2084             if (!chain) {
2085                 continue;
2086             }
2087 
2088             query_len = query_info->contexts[chain->context].query_length;
2089             for (; chain; chain = chain->next) {
2090                 s_RemoveOverlaps(chain, score_opts, query_len);
2091             }
2092 
2093         }
2094     }
2095 
2096     /* count number of chains to report */
2097     for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2098         HSPChain* chain = saved[query_idx];
2099 
2100         for (; chain; chain = chain->next) {
2101             num_results++;
2102         }
2103     }
2104 
2105     results->chain_array = calloc(num_results, sizeof(BlastHSPChain*));
2106     if (!results->chain_array) {
2107         if (num_unique_chains) {
2108             free(num_unique_chains);
2109         }
2110         return -1;
2111     }
2112     results->num_results = num_results;
2113 
2114     /* iterate over queries */
2115     for (query_idx = 0;query_idx < query_info->num_queries;query_idx++) {
2116         HSPChain* chain = saved[query_idx];
2117 
2118         /* skip queries for which there are no saved results */
2119         if (!chain) {
2120             continue;
2121         }
2122 
2123         for (; chain; chain = chain->next) {
2124             BlastHSPChain* new_chain = NULL;
2125             BlastHSPChain* pair = NULL;
2126 
2127             /* pairs are processed together so that we can store their pointers
2128                so a chain whose pair's context is smaller was already processed
2129             */
2130             if (chain->pair && chain->context > chain->pair->context) {
2131                 continue;
2132             }
2133 
2134             new_chain = s_HSPChainToBlastHSPChain(chain);
2135             ASSERT(new_chain);
2136             if (!new_chain) {
2137                 return -1;
2138             }
2139             new_chain->multiplicity = num_unique_chains[new_chain->query_index];
2140 
2141             results->chain_array[num++] = new_chain;
2142 
2143             if (chain->pair) {
2144                 pair = s_HSPChainToBlastHSPChain(chain->pair);
2145                 chain->pair->compartment = -1;
2146 
2147                 new_chain->pair = pair;
2148                 pair->pair = new_chain;
2149                 pair->multiplicity = num_unique_chains[pair->query_index];
2150                 results->chain_array[num++] = pair;
2151             }
2152         }
2153 
2154     }
2155     ASSERT(num == results->num_results);
2156 
2157     for (query_idx = 0; query_idx < query_info->num_queries; query_idx++) {
2158         saved[query_idx] = HSPChainFree(saved[query_idx]);
2159     }
2160 
2161     if (num_unique_chains) {
2162         sfree(num_unique_chains);
2163     }
2164 
2165     return 0;
2166 }
2167 
2168 /** Perform post-run clean-ups
2169  * @param data The buffered data structure [in]
2170  * @param results The HSP results to propagate [in][out]
2171  */
2172 static int
s_BlastHSPMapperFinal(void * data,void * mapping_results)2173 s_BlastHSPMapperFinal(void* data, void* mapping_results)
2174 {
2175     BlastHSPMapperData* spl_data = data;
2176     BlastHSPMapperParams* params = spl_data->params;
2177     BlastMappingResults* results = (BlastMappingResults*)mapping_results;
2178 
2179     if (spl_data->saved_chains) {
2180         s_Finalize(spl_data->saved_chains, results, spl_data->query_info,
2181                    spl_data->query, &params->scoring_options, params->paired);
2182     }
2183 
2184     if (spl_data->saved_chains) {
2185         sfree(spl_data->saved_chains);
2186     }
2187 
2188    return 0;
2189 }
2190 
2191 
2192 /* A node representing an HSP for finding the best chain of HSPs for RNA-seq */
2193 typedef struct HSPNode {
2194     BlastHSP** hsp;     /* pointer to the entry in hsp_list */
2195     Int4 best_score;    /* score for path that starts at this node */
2196     struct HSPNode* path_next; /* next node in the path */
2197 } HSPNode;
2198 
2199 
2200 /* Copy a array of HSP nodes */
s_HSPNodeArrayCopy(HSPNode * dest,HSPNode * source,Int4 num)2201 static int s_HSPNodeArrayCopy(HSPNode* dest, HSPNode* source, Int4 num)
2202 {
2203     int i;
2204 
2205     if (!dest || !source) {
2206         return -1;
2207     }
2208 
2209     for (i = 0;i < num;i++) {
2210         dest[i] = source[i];
2211         if (source[i].path_next) {
2212             dest[i].path_next = dest + (source[i].path_next - source);
2213         }
2214     }
2215 
2216     return 0;
2217 }
2218 
2219 
2220 /* Maximum number of top scoring HSP chains to report for a single read and
2221    strand */
2222 #define MAX_NUM_HSP_PATHS 40
2223 
2224 /* A chain of HSPs aligning a spliced RNA-Seq read to a genome */
2225 typedef struct HSPPath {
2226     HSPNode** start;  /* array of chain starting nodes */
2227     int num_paths;    /* number of chains */
2228     Int4 score;       /* all chains in start have this cumulative score */
2229 } HSPPath;
2230 
2231 
HSPPathFree(HSPPath * path)2232 static HSPPath* HSPPathFree(HSPPath* path)
2233 {
2234     if (path) {
2235         if (path->start) {
2236             free(path->start);
2237         }
2238         free(path);
2239     }
2240 
2241     return NULL;
2242 }
2243 
HSPPathNew(void)2244 static HSPPath* HSPPathNew(void)
2245 {
2246     HSPPath* retval = calloc(1, sizeof(HSPPath));
2247     if (!retval) {
2248         return NULL;
2249     }
2250 
2251     retval->start = calloc(MAX_NUM_HSP_PATHS, sizeof(HSPNode*));
2252     if (!retval->start) {
2253         free(retval);
2254         return NULL;
2255     }
2256 
2257     return retval;
2258 }
2259 
2260 
2261 #define NUM_SIGNALS 23
2262 
2263 /* Find a split for HSPs overlapping on the query by finding splice signals in
2264    the subject sequence. The first HSP must have smaller query offset than the
2265    second one. Subject is recreated from the query sequence and information
2266    in an HSP.
2267    @param first HSP with smaller query offset [in|out]
2268    @param second HSP with larger query offset [in|out]
2269    @param query Query sequence [in]
2270    @return Status
2271  */
2272 static Int4
s_FindSpliceJunctionsForOverlaps(BlastHSP * first,BlastHSP * second,Uint1 * query,Int4 query_len)2273 s_FindSpliceJunctionsForOverlaps(BlastHSP* first, BlastHSP* second,
2274                                  Uint1* query, Int4 query_len)
2275 {
2276     Int4 i, k;
2277     /* splice signal pairs from include/algo/sequence/consensus_splice.hpp */
2278     Uint1 signals[NUM_SIGNALS] = {0xb2, /* GTAG */
2279                                   0x71, /* CTAC (reverse complement) */
2280                                   0x92, /* GCAG */
2281                                   0x79, /* CTGC */
2282                                   0x31, /* ATAC */
2283                                   0xb3, /* GTAT */
2284                                   0xbe, /* GTTG */
2285                                   0x41, /* CAAC */
2286                                   0xba, /* GTGG */
2287                                   0x51, /* CCAC */
2288                                   0xb0, /* GTAA */
2289                                   0xf1, /* TTAC */
2290                                   0x82, /* GAAG */
2291                                   0x7d, /* CTTC */
2292                                   0xf2, /* TTAG */
2293                                   0x70, /* CTAA */
2294                                   0x32, /* ATAG */
2295                                   0x73, /* CTAT */
2296                                   0xa2, /* GGAG */
2297                                   0x75, /* CTCC */
2298                                   0x33, /* ATAT (revese complemented is self) */
2299                                   0x30, /* ATAA */
2300                                   0xf3  /* TTAT */};
2301 
2302 
2303     Boolean found = FALSE;
2304     Uint1* subject = NULL;
2305     Int4 overlap_len;
2306     const Uint1 kGap = 15;
2307 
2308     /* HSPs must be sorted and must overlap only on the query */
2309     ASSERT(first->query.offset < second->query.offset);
2310     ASSERT(second->query.offset <= first->query.end);
2311     ASSERT(first->subject.offset < second->subject.offset);
2312 
2313 
2314     /* if there are edits within the overlap region, then try to maximize
2315        the chain score by assigning a part of the overlap region to the
2316        HSP that has fewer edits in the overlap */
2317     if ((first->map_info->edits->num_edits > 0 &&
2318          first->map_info->edits->edits[
2319                         first->map_info->edits->num_edits - 1].query_pos >=
2320             second->query.offset)
2321         || (second->map_info->edits->num_edits > 0 &&
2322             second->map_info->edits->edits[0].query_pos <= first->query.end)) {
2323 
2324         /* find nummer of  edits in both HSPs within the overlap and assign
2325            overlap to the HSP that has fewer edits */
2326         Int4 num_first = 0;
2327         Int4 num_second = 0;
2328 
2329         /* find number of edits */
2330         for (i = first->map_info->edits->num_edits - 1;i >= 0;i--) {
2331             JumperEdit* edits = first->map_info->edits->edits;
2332             if (edits[i].query_pos < second->query.offset) {
2333                 break;
2334             }
2335             num_first++;
2336         }
2337 
2338         for (i = 0;i < second->map_info->edits->num_edits;i++) {
2339             JumperEdit* edits = second->map_info->edits->edits;
2340             if (edits[i].query_pos >= first->query.end) {
2341                 break;
2342             }
2343             num_second++;
2344         }
2345 
2346         if (num_first > num_second) {
2347             while (first->map_info->edits->num_edits > 0 &&
2348                    first->map_info->edits->edits[
2349                           first->map_info->edits->num_edits - 1].query_pos >=
2350                    second->query.offset) {
2351 
2352                 JumperEdit* edits = first->map_info->edits->edits;
2353                 Uint1 edge = first->map_info->right_edge;
2354                 Int4 num_edits = first->map_info->edits->num_edits;
2355                 Int4 trim_by;
2356 
2357                 if (edits[num_edits - 1].query_pos >= first->query.end - 1) {
2358                     if (edits[num_edits - 1].subject_base != kGap) {
2359                         edge >>= 2;
2360                         edge |= edits[num_edits - 1].subject_base << 2;
2361                     }
2362                 }
2363                 else if (edits[num_edits - 1].query_pos == query_len - 2 &&
2364                          edits[num_edits - 1].subject_base == kGap) {
2365 
2366                     edge = (edge << 2) | query[query_len - 1];
2367                 }
2368                 else {
2369                     if (edits[num_edits - 1].subject_base != kGap &&
2370                         edits[num_edits - 1].query_base != kGap) {
2371 
2372                         edge = (edits[num_edits - 1].subject_base << 2) |
2373                             query[edits[num_edits - 1].query_pos + 1];
2374                     }
2375                     else if (edits[num_edits - 1].subject_base == kGap) {
2376                         edge = (query[edits[num_edits - 1].query_pos + 1] << 2 ) |
2377                             query[edits[num_edits - 1].query_pos + 2];
2378                     }
2379                     else {
2380                         edge = (edits[num_edits - 1].subject_base << 2) |
2381                             query[edits[num_edits - 1].query_pos];
2382                     }
2383                 }
2384 
2385                 trim_by = first->query.end -  edits[
2386                             first->map_info->edits->num_edits - 1].query_pos;
2387                 /* if the edit is an insert in the genome, count genome bases
2388                    when trimming the HSP */
2389                 if (edits[num_edits - 1].query_base == kGap) {
2390                     trim_by++;
2391                     /* FIXME: use proper scores here */
2392                     s_TrimHSP(first, trim_by, FALSE, FALSE, -4, -4, -4);
2393                 }
2394                 else {
2395                     s_TrimHSP(first, trim_by, TRUE, FALSE, -4, -4, -4);
2396                 }
2397                 first->map_info->right_edge = edge;
2398             }
2399         }
2400         else if (num_second > num_first) {
2401             while (second->map_info->edits->num_edits > 0 &&
2402                    second->map_info->edits->edits[0].query_pos <
2403                    first->query.end) {
2404 
2405                 JumperEdit* edits = second->map_info->edits->edits;
2406                 Uint1 edge = second->map_info->left_edge;
2407                 Int4 trim_by;
2408 
2409                 if (edits[0].query_pos == 0) {
2410                     if (edits[0].subject_base != kGap) {
2411                         edge <<= 2;
2412                         edge |= edits[0].subject_base;
2413                     }
2414                 }
2415                 else if (edits[0].query_pos == 1 &&
2416                          edits[0].subject_base == kGap) {
2417 
2418                     edge = (edge << 2) | query[0];
2419                 }
2420                 else {
2421 
2422                     edge = (query[edits[0].query_pos - 1] << 2) |
2423                         edits[0].subject_base;
2424 
2425                     if (edits[0].subject_base == kGap) {
2426                         edge = (query[edits[0].query_pos - 2] << 2) |
2427                             query[edits[0].query_pos - 1];
2428                     }
2429                 }
2430 
2431                 trim_by = edits[0].query_pos - second->query.offset + 1;
2432                 /* if the edit is an insert in the genome, count genome bases
2433                    when trimming the HSP */
2434                 if (edits[0].query_base == kGap) {
2435                     trim_by++;
2436                     /* FIXME: use proper scores here */
2437                     s_TrimHSP(second,
2438                               edits[0].query_pos - second->query.offset + 1,
2439                               FALSE, TRUE, -4, -4, -4);
2440                 }
2441                 else {
2442                     s_TrimHSP(second,
2443                               edits[0].query_pos - second->query.offset + 1,
2444                               TRUE, TRUE, -4, -4, -4);
2445                 }
2446                 second->map_info->left_edge = edge;
2447             }
2448         }
2449         else {
2450             /* if the number of edits is the same we do not know how to
2451                divide the overlap */
2452             return 0;
2453         }
2454     }
2455 
2456     /* give up if there are gaps in the alignments or mismatches still left in
2457        the overlapping parts */
2458     if ((first->map_info->edits->num_edits > 0 &&
2459          first->map_info->edits->edits[
2460                         first->map_info->edits->num_edits - 1].query_pos >=
2461             second->query.offset)
2462         || (second->map_info->edits->num_edits > 0 &&
2463             second->map_info->edits->edits[0].query_pos <= first->query.end)) {
2464 
2465         return 0;
2466     }
2467 
2468     /* recreate the subject sequence plus two bases on each side */
2469     overlap_len = first->query.end - second->query.offset;
2470     subject = calloc(overlap_len + 4, sizeof(Uint1));
2471     if (!subject) {
2472         return -1;
2473     }
2474 
2475     /* FIXME: lef_edge and right_edge should be changed to subject_overhangs */
2476     subject[0] = (second->map_info->left_edge & 0xf) >> 2;
2477     subject[1] = second->map_info->left_edge & 3;
2478     memcpy(subject + 2, query + second->query.offset,
2479            overlap_len * sizeof(Uint1));
2480 
2481     subject[2 + overlap_len] = (first->map_info->right_edge & 0xf) >> 2;
2482     subject[2 + overlap_len + 1] = first->map_info->right_edge & 3;
2483 
2484     /* search for matching splice signals in the subject sequence */
2485     for (k = 0;k < NUM_SIGNALS;k++) {
2486 
2487         for (i = 0;i <= overlap_len && !found;i++) {
2488 
2489             Uint1 seq = (subject[i] << 2) | subject[i + 1] |
2490                 (subject[i + 2] << 6) | (subject[i + 3] << 4);
2491 
2492             /* if a splice signals are found, update HSPs */
2493             if (seq == signals[k]) {
2494                 Int4 d = first->query.end - (second->query.offset + i);
2495                 first->query.end -= d;
2496                 first->subject.end -= d;
2497                 first->gap_info->num[first->gap_info->size - 1] -= d;
2498                 first->score -= d;
2499                 first->map_info->right_edge = (seq & 0xf0) >> 4;
2500                 first->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
2501 
2502                 d = i;
2503                 second->query.offset += d;
2504                 second->subject.offset += d;
2505                 second->gap_info->num[0] -= d;
2506                 second->score -= d;
2507                 second->map_info->left_edge = seq & 0xf;
2508                 second->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
2509 
2510                 ASSERT(first->gap_info->size > 1 ||
2511                        first->query.end - first->query.offset ==
2512                        first->subject.end - first->subject.offset);
2513 
2514                 ASSERT(second->gap_info->size > 1 ||
2515                        second->query.end - second->query.offset ==
2516                        second->subject.end - second->subject.offset);
2517 
2518                 found = TRUE;
2519                 break;
2520             }
2521         }
2522     }
2523 
2524     if (!found) {
2525         first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
2526         second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
2527     }
2528 
2529     if (subject) {
2530         sfree(subject);
2531     }
2532 
2533     ASSERT(first->query.offset < first->query.end);
2534     ASSERT(second->query.offset < second->query.end);
2535 
2536     return 0;
2537 }
2538 
s_ExtendAlignmentCleanup(Uint1 * subject,BlastGapAlignStruct * gap_align,GapEditScript * edit_script,JumperEditsBlock * edits)2539 static void s_ExtendAlignmentCleanup(Uint1* subject,
2540                                      BlastGapAlignStruct* gap_align,
2541                                      GapEditScript* edit_script,
2542                                      JumperEditsBlock* edits)
2543 {
2544     if (subject) {
2545         free(subject);
2546     }
2547 
2548     BLAST_GapAlignStructFree(gap_align);
2549     GapEditScriptDelete(edit_script);
2550     JumperEditsBlockFree(edits);
2551 }
2552 
2553 /* Extend alignment on one side of an HSP up to a given point (a splice site)
2554 */
s_ExtendAlignment(BlastHSP * hsp,const Uint1 * query,Int4 query_from,Int4 query_to,Int4 subject_from,Int4 subject_to,const ScoringOptions * score_options,Boolean is_left)2555 static Int4 s_ExtendAlignment(BlastHSP* hsp, const Uint1* query,
2556                               Int4 query_from, Int4 query_to,
2557                               Int4 subject_from, Int4 subject_to,
2558                               const ScoringOptions* score_options,
2559                               Boolean is_left)
2560 {
2561     Int4 o_len;
2562     Uint1* o_seq = NULL;
2563     Uint1* subject = NULL;
2564     BlastGapAlignStruct* gap_align = NULL;
2565     GapEditScript* edit_script = NULL;
2566     JumperEditsBlock* edits = NULL;
2567     Int4 i, k, s;
2568     /* number of query bases to align */
2569     Int4 query_gap = query_to - query_from + 1;
2570     /* number of subject bases to align */
2571     Int4 subject_gap = subject_to - subject_from + 1;
2572     Int4 num_identical;
2573     Int4 query_ext_len, subject_ext_len;
2574     Int4 ungapped_ext_len;
2575 
2576     JUMP* jumper_table = NULL;
2577 
2578     JUMP jumper_mismatch[] = {{1, 1, 0, 0}};
2579 
2580     JUMP jumper_insertion[] = {{1, 1, 2, 0},
2581                                {1, 0, 1, 0},
2582                                {1, 1, 0, 0}};
2583 
2584     JUMP jumper_deletion[] = {{1, 1, 2, 0},
2585                               {0, 1, 1, 0},
2586                               {1, 1, 0, 0}};
2587 
2588     if (!hsp || !query || !hsp->map_info ||
2589         !hsp->map_info->subject_overhangs) {
2590         return -1;
2591     }
2592 
2593     /* length of the subject overgang sequence saved in the HSP */
2594     o_len = is_left ? hsp->map_info->subject_overhangs->left_len :
2595         hsp->map_info->subject_overhangs->right_len;
2596     /* subject sequence */
2597     o_seq = is_left ? hsp->map_info->subject_overhangs->left :
2598         hsp->map_info->subject_overhangs->right;
2599     ASSERT(o_seq);
2600 
2601     /* compressed subject sequence (needed for computig jumper edits) */
2602     subject = calloc(o_len / 4 + 1, sizeof(Uint1));
2603     if (!subject) {
2604         return -1;
2605     }
2606 
2607     /* compress subject sequence  */
2608     k = 6;
2609     s = 0;
2610     for (i = 0;i < o_len;i++, k-=2) {
2611         subject[s] |= o_seq[i] << k;
2612         if (k == 0) {
2613             s++;
2614             k = 8;
2615         }
2616     }
2617 
2618     /* gap_align is needed in jumper alignment API calls, we only allocate
2619        fields that will be needed in these calls */
2620     gap_align = calloc(1, sizeof(BlastGapAlignStruct));
2621     if (!gap_align) {
2622         s_ExtendAlignmentCleanup(subject, NULL, edit_script, edits);
2623         return -1;
2624     }
2625     gap_align->jumper = JumperGapAlignNew(o_len * 2);
2626     if (!gap_align->jumper) {
2627         s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2628         return -1;
2629     }
2630 
2631     /* select jumper table based on the number of expected indels; we allow
2632        only insertions or only deletions */
2633     switch (SIGN(query_gap - subject_gap)) {
2634     case 0: jumper_table = jumper_mismatch;
2635         break;
2636 
2637     case -1: jumper_table = jumper_deletion;
2638         break;
2639 
2640     case 1: jumper_table = jumper_insertion;
2641         break;
2642 
2643     default:
2644         ASSERT(0);
2645         s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2646         return -1;
2647     };
2648 
2649     /* Compute alignment. We always do right extension as this is typically
2650        a short alignment. This is global alignment, hence the large max number
2651        of errors and no penalties for mismatches or gaps. The jumper table
2652        specifies costs for alignment errors. The penalty socres specify
2653        required number of macthes at the ends of the alignment. */
2654     JumperExtendRightWithTraceback(query + query_from, o_seq + subject_from,
2655                                    query_gap, subject_gap,
2656                                    1, 0, 0, 0,
2657                                    20, 20,
2658                                    &query_ext_len, &subject_ext_len,
2659                                    gap_align->jumper->right_prelim_block,
2660                                    &num_identical,
2661                                    FALSE,
2662                                    &ungapped_ext_len,
2663                                    jumper_table);
2664 
2665     ASSERT(query_ext_len <= query_gap);
2666     ASSERT(subject_ext_len <= subject_gap);
2667 
2668     /* Because this is a global alignment a gap may exist after the last query
2669        or subject base which causes jumper extension to stop prematurely. If
2670        this is the case, add the missing indel */
2671     while (query_ext_len < query_gap) {
2672         JumperPrelimEditBlockAdd(gap_align->jumper->right_prelim_block,
2673                                  JUMPER_INSERTION);
2674 
2675         query_ext_len++;
2676     }
2677 
2678     while (subject_ext_len < subject_gap) {
2679         JumperPrelimEditBlockAdd(gap_align->jumper->right_prelim_block,
2680                                  JUMPER_DELETION);
2681 
2682         subject_ext_len++;
2683     }
2684     ASSERT(query_ext_len == query_gap);
2685     ASSERT(subject_ext_len == subject_gap);
2686 
2687     /* update gap info in the HSP */
2688     edit_script = JumperPrelimEditBlockToGapEditScript(
2689                                         gap_align->jumper->left_prelim_block,
2690                                         gap_align->jumper->right_prelim_block);
2691     if (!edit_script) {
2692         s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2693         return -1;
2694     }
2695 
2696     if (is_left) {
2697         hsp->gap_info = GapEditScriptCombine(&edit_script, &hsp->gap_info);
2698     }
2699     else {
2700         hsp->gap_info = GapEditScriptCombine(&hsp->gap_info, &edit_script);
2701         ASSERT(!edit_script);
2702     }
2703     ASSERT(hsp->gap_info);
2704     if (!hsp->gap_info) {
2705         s_ExtendAlignmentCleanup(subject, gap_align, edit_script, edits);
2706         return -1;
2707     }
2708 
2709     /* update jumper edits */
2710     gap_align->query_start = query_from;
2711     gap_align->query_stop = query_from + query_ext_len;
2712     gap_align->subject_start = subject_from;
2713     gap_align->subject_stop = subject_from + subject_ext_len;
2714     edits = JumperFindEdits(query, subject, gap_align);
2715     ASSERT(edits);
2716     if (!edits) {
2717         s_ExtendAlignmentCleanup(subject, gap_align, NULL, edits);
2718         return -1;
2719     }
2720 
2721     if (is_left) {
2722         hsp->map_info->edits = JumperEditsBlockCombine(&edits,
2723                                                    &hsp->map_info->edits);
2724     }
2725     else {
2726         hsp->map_info->edits = JumperEditsBlockCombine(
2727                                                     &hsp->map_info->edits,
2728                                                     &edits);
2729         ASSERT(!edits);
2730     }
2731     ASSERT(hsp->map_info->edits);
2732     if (!hsp->map_info->edits) {
2733         s_ExtendAlignmentCleanup(subject, gap_align, NULL, edits);
2734         return -1;
2735     }
2736 
2737     if (is_left) {
2738         hsp->query.offset -= query_ext_len;
2739         hsp->subject.offset -= subject_ext_len;
2740     }
2741     else {
2742         hsp->query.end += query_ext_len;
2743         hsp->subject.end += subject_ext_len;
2744     }
2745 
2746     /* compute new HSP score */
2747     hsp->score = s_ComputeAlignmentScore(hsp, score_options->penalty,
2748                                          score_options->gap_open,
2749                                          score_options->gap_extend);
2750 
2751     ASSERT(hsp->gap_info);
2752     ASSERT(hsp->map_info->edits);
2753 
2754     ASSERT(subject);
2755 
2756 
2757     if (subject) {
2758         free(subject);
2759     }
2760     BLAST_GapAlignStructFree(gap_align);
2761 
2762     return 0;
2763 }
2764 
2765 #define NUM_SIGNALS_CONSENSUS 2
2766 
2767 /* Find splcie junctions and extend HSP alignment if there are unaligned query
2768    bases in the middle of a chain.
2769 */
2770 static Int4
s_FindSpliceJunctionsForGap(BlastHSP * first,BlastHSP * second,Uint1 * query,Int4 query_len,const ScoringOptions * score_options)2771 s_FindSpliceJunctionsForGap(BlastHSP* first, BlastHSP* second,
2772                             Uint1* query, Int4 query_len,
2773                             const ScoringOptions* score_options)
2774 {
2775     Int4 query_gap;
2776     Int4 first_len, second_len;
2777     Boolean found = FALSE;
2778     Int4 k;
2779     Int4 q;
2780     Uint1 signal = 0;
2781     /* use only cannonical splice signals here */
2782     /* The procedure does not maximize score for unaligned bases of the query.
2783        It only finds splice signals and then the best alignment it can find
2784        for the unaligned bases. For limited number of unaligned query bases,
2785        we can be sure of the splice site vs a continuous alignment, because a
2786        continuous alignment would score better. But we do not compare
2787        alignment scores between different splice sites, so we do not know
2788        which splice site and signal is better; and many splice signals here
2789        would lead to finding wrong splice sites. */
2790     Uint1 signals[NUM_SIGNALS_CONSENSUS] = {0xb2, /* GTAG */
2791                                             0x71  /* CTAC */};
2792 
2793     if (!first || !second) {
2794         return -1;
2795     }
2796 
2797     if (!first->map_info->subject_overhangs ||
2798         !first->map_info->subject_overhangs->right ||
2799         !second->map_info->subject_overhangs ||
2800         !second->map_info->subject_overhangs->left) {
2801         return -1;
2802     }
2803 
2804     /* number of query bases that fall between the HSPs */
2805     query_gap = second->query.offset - first->query.end;
2806 
2807     /* we do not have enough subject sequence saved */
2808     if (query_gap > first->map_info->subject_overhangs->right_len ||
2809         query_gap > second->map_info->subject_overhangs->left_len) {
2810         return 0;
2811     }
2812 
2813     first_len = first->map_info->subject_overhangs->right_len;
2814     second_len = second->map_info->subject_overhangs->left_len;
2815 
2816     /* assume that the begining of intron was found, search for the splice
2817        signal at the end of the first HSP */
2818     for (q = 0; !found && q < 4;q++) {
2819 
2820         if (first->query.end - q <= first->query.offset) {
2821             break;
2822         }
2823 
2824         for (k = 0;k < NUM_SIGNALS_CONSENSUS && !found;k++) {
2825             Int4 i;
2826             Int4 status = 0;
2827             Int4 start;
2828             Uint1 seq;
2829 
2830             if (q == 0) {
2831                 seq = (first->map_info->subject_overhangs->right[0] << 6) |
2832                     (first->map_info->subject_overhangs->right[1] << 4);
2833             }
2834             else if (q == 1) {
2835                 seq = (query[first->query.end - 1] << 6) |
2836                     (first->map_info->subject_overhangs->right[0] << 4);
2837             }
2838             else if (q > 1) {
2839                 seq = (query[first->query.end - q] << 6) |
2840                     (query[first->query.end - q + 1] << 4);
2841             }
2842             else {
2843                 return -1;
2844             }
2845 
2846             if (seq != (signals[k] & 0xf0)) {
2847                 continue;
2848             }
2849 
2850             /* location of the splice signa in subject overhang if there are no
2851                indels */
2852             start = second->map_info->subject_overhangs->left_len - 1 -
2853                 query_gap - 2 - q;
2854 
2855             /* search for the splice signal at the end of intron;
2856                allow for up to 1 indel */
2857             for (i = MAX(start - 1, 0);i <= MIN(start + 1, second_len);i++) {
2858                 seq &= 0xf0;
2859                 seq |= (second->map_info->subject_overhangs->left[i] << 2) |
2860                     second->map_info->subject_overhangs->left[i + 1];
2861 
2862                 /* if the splice signal found extend the alignment */
2863                 if (seq == signals[k]) {
2864                     Int4 subject_gap = second_len - (i + 2) + q;
2865                     if (query_gap - subject_gap < -1 ||
2866                         query_gap - subject_gap > 1) {
2867 
2868                         continue;
2869                     }
2870 
2871                     found = TRUE;
2872                     if (q > 0) {
2873                         s_TrimHSP(first, q, TRUE, FALSE,
2874                                   score_options->penalty,
2875                                   score_options->gap_open,
2876                                   score_options->gap_extend);
2877                     }
2878 
2879                     status = s_ExtendAlignment(second, query,
2880                              first->query.end,
2881                              second->query.offset - 1,
2882                              i + 2,
2883                              second->map_info->subject_overhangs->left_len - 1,
2884                              score_options,
2885                              TRUE);
2886 
2887                     ASSERT(status == 0);
2888                     if (status) {
2889                         return status;
2890                     }
2891 
2892                     signal = seq;
2893                     break;
2894                 }
2895             }
2896         }
2897     }
2898 
2899     /* assume that end of the intron was found, search for the splice signal
2900        at the beginning of the intron */
2901     for (q = 0; !found && q < 4;q++) {
2902 
2903         if (second->query.offset + q >= second->query.end) {
2904             break;
2905         }
2906 
2907         for (k = 0;k < NUM_SIGNALS_CONSENSUS && !found;k++) {
2908             Int4 i;
2909             Int4 end;
2910             Uint1 seq;
2911             if (q == 0) {
2912                 seq =
2913                     (second->map_info->subject_overhangs->left[second_len - 2] << 2) |
2914                     second->map_info->subject_overhangs->left[second_len - 1];
2915             }
2916             else if (q == 1) {
2917                 seq = (second->map_info->subject_overhangs->left[second_len - 1] << 2) | query[second->query.offset];
2918 
2919 
2920             }
2921             else if (q > 1) {
2922 
2923                 if (second->map_info->edits->num_edits > 0 &&
2924                     second->map_info->edits->edits[0].query_pos < q - 2) {
2925 
2926                     break;
2927                 }
2928 
2929                 seq = (query[second->query.offset + q - 2] << 2) |
2930                     query[second->query.offset + q + 1 - 2];
2931             }
2932             else {
2933                 return -1;
2934             }
2935 
2936             if (seq != (signals[k] & 0xf)) {
2937                 continue;
2938             }
2939 
2940             /* location of the splice signal in the subject overhang if there are
2941                no indels */
2942             end = query_gap + q;
2943 
2944             /* allow for up to 1 indel */
2945             for (i = MAX(end - 1, 0);i <= MIN(end + 1, first_len);i++) {
2946                 seq &= 0xf;
2947                 seq |= (first->map_info->subject_overhangs->right[i] << 6) |
2948                     (first->map_info->subject_overhangs->right[i + 1] << 4);
2949 
2950                 if (seq == signals[k]) {
2951                     Int4 status = 0;
2952                     Int4 subject_gap = i - q;
2953                     if (query_gap - subject_gap < -1 ||
2954                         query_gap - subject_gap > 1) {
2955 
2956                         continue;
2957                     }
2958 
2959                     found = TRUE;
2960                     if (q > 0) {
2961                         s_TrimHSP(second, q, TRUE, TRUE,
2962                                   score_options->penalty,
2963                                   score_options->gap_open,
2964                                   score_options->gap_extend);
2965                     }
2966 
2967                     status = s_ExtendAlignment(first, query,
2968                                                first->query.end,
2969                                                second->query.offset - 1,
2970                                                0, i - 1,
2971                                                score_options,
2972                                                FALSE);
2973                     ASSERT(status == 0);
2974                     if (status) {
2975                         return status;
2976                     }
2977 
2978                     signal = seq;
2979                     break;
2980                 }
2981             }
2982         }
2983     }
2984 
2985     if (found) {
2986         first->map_info->right_edge = (signal & 0xf0) >> 4;
2987         first->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
2988         second->map_info->left_edge = signal & 0xf;
2989         second->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
2990 
2991         ASSERT(first->gap_info->size > 1 ||
2992                first->query.end - first->query.offset ==
2993                first->subject.end - first->subject.offset);
2994 
2995         ASSERT(second->gap_info->size > 1 ||
2996                second->query.end - second->query.offset ==
2997                second->subject.end - second->subject.offset);
2998     }
2999     else {
3000         first->map_info->right_edge &= ~MAPPER_SPLICE_SIGNAL;
3001         second->map_info->left_edge &= ~MAPPER_SPLICE_SIGNAL;
3002     }
3003 
3004     return 0;
3005 }
3006 
3007 
3008 #define NUM_SINGLE_SIGNALS 8
3009 
3010 /* Record subject bases pre and post alignment in HSP as possible splice
3011    signals and set a flag for recognized signals */
s_FindSpliceSignals(BlastHSP * hsp,Uint1 * query,Int4 query_len)3012 static Int4 s_FindSpliceSignals(BlastHSP* hsp, Uint1* query, Int4 query_len)
3013 {
3014     SequenceOverhangs* overhangs = NULL;
3015     /* splice signals in the order of preference */
3016     Uint1 signals[NUM_SINGLE_SIGNALS] = {2,  /* AG */
3017                                          11, /* GT */
3018                                          13, /* TC */
3019                                          7,  /* CT */
3020                                          1,  /* AC */
3021                                          4,  /* CA */
3022                                          8,  /* GA */
3023                                          14  /* TG */};
3024 
3025     if (!hsp || !query) {
3026         return -1;
3027     }
3028 
3029     overhangs = hsp->map_info->subject_overhangs;
3030     ASSERT(overhangs);
3031 
3032     if (!overhangs) {
3033         return -1;
3034     }
3035 
3036     /* FIXME: check for polyA and adapters */
3037 
3038     /* search for a splice signal on the left edge, unless the query is aligned
3039        from the beginning */
3040     if (hsp->query.offset == 0) {
3041         hsp->map_info->left_edge = MAPPER_EXON;
3042     }
3043     else {
3044         int k, i;
3045         Boolean found = FALSE;
3046 
3047         if (overhangs->left_len >= 2) {
3048             Uint1 signal = (overhangs->left[overhangs->left_len - 2] << 2) |
3049                 overhangs->left[overhangs->left_len - 1];
3050 
3051             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3052                 if (signal == signals[k]) {
3053                     hsp->map_info->left_edge = signal;
3054                     hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3055 
3056                     found = TRUE;
3057                     break;
3058                 }
3059             }
3060         }
3061 
3062         if (!found && overhangs->left_len >= 1) {
3063             Uint1 signal = (overhangs->left[overhangs->left_len - 1] << 2) |
3064                 query[hsp->query.offset];
3065 
3066             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3067                 if (signal == signals[k]) {
3068                     hsp->map_info->left_edge = signal;
3069                     hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3070 
3071                     /* trim alignment */
3072                     s_TrimHSP(hsp, 1, TRUE, TRUE, -8, 0, -8);
3073 
3074                     /* update score at some point */
3075                     found = TRUE;
3076                     break;
3077                 }
3078             }
3079         }
3080 
3081         for (i = hsp->query.offset;!found && i < 4;i++) {
3082             Uint1 signal = (query[i] << 2) | query[i + 1];
3083             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3084                 if (signal == signals[k]) {
3085                     hsp->map_info->left_edge = signal;
3086                     hsp->map_info->left_edge |= MAPPER_SPLICE_SIGNAL;
3087 
3088                     /* trim alignment */
3089                     s_TrimHSP(hsp, i + 2, TRUE, TRUE, -8, 0, -8);
3090 
3091                     found = TRUE;
3092                     break;
3093                 }
3094             }
3095         }
3096     }
3097 
3098     /* search for the splice signal on the right edge, unless the query is
3099        aligned to the end */
3100     if (hsp->query.end == query_len) {
3101         hsp->map_info->right_edge = MAPPER_EXON;
3102     }
3103     else {
3104         int k, i;
3105         Boolean found = FALSE;
3106 
3107         if (overhangs->right_len >= 2) {
3108             Uint1 signal = (overhangs->right[0] << 2) | overhangs->right[1];
3109             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3110                 if (signal == signals[k]) {
3111                     hsp->map_info->right_edge = signal;
3112                     hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3113 
3114                     found = TRUE;
3115                     break;
3116                 }
3117             }
3118 
3119         }
3120 
3121         if (!found && overhangs->right_len >= 1) {
3122             Uint1 signal = (query[hsp->query.end - 1] << 2) |
3123                 overhangs->right[0];
3124 
3125             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3126                 if (signal == signals[k]) {
3127                     hsp->map_info->right_edge = signal;
3128                     hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3129 
3130                     /* update HSP */
3131                     s_TrimHSP(hsp, 1, TRUE, FALSE, -8, 0, -8);
3132 
3133                     /* update score at some point */
3134                     found = TRUE;
3135                     break;
3136                 }
3137             }
3138 
3139         }
3140 
3141         for (i = hsp->query.end - 2;!found && i > hsp->query.end - 2 - 4;i--) {
3142             Uint1 signal = (query[i] << 2) | query[i + 1];
3143             for (k = 0;k < NUM_SINGLE_SIGNALS;k++) {
3144                 if (signal == signals[k]) {
3145                     hsp->map_info->right_edge = signal;
3146                     hsp->map_info->right_edge |= MAPPER_SPLICE_SIGNAL;
3147 
3148                     /* update HSP */
3149                     s_TrimHSP(hsp, hsp->query.end - i, TRUE, FALSE, -8, 0, -8);
3150 
3151                     found = TRUE;
3152                     break;
3153                 }
3154             }
3155         }
3156     }
3157 
3158     return 0;
3159 }
3160 
3161 
3162 /* Search for splice signals between two HSPs in a chain. The HSPs in the
3163    chain must be sorted by query position in asceding order.
3164 */
3165 static Int4
s_FindSpliceJunctions(HSPChain * chains,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * scoring_opts)3166 s_FindSpliceJunctions(HSPChain* chains,
3167                       const BLAST_SequenceBlk* query_blk,
3168                       const BlastQueryInfo* query_info,
3169                       const ScoringOptions* scoring_opts)
3170 {
3171     HSPChain* ch = NULL;
3172 
3173     if (!chains || !query_blk || !query_info || !scoring_opts) {
3174         return -1;
3175     }
3176 
3177     /* iterate over HSPs in the chain */
3178     for (ch = chains; ch; ch = ch->next) {
3179         HSPContainer* h = ch->hsps;
3180         Boolean searched = FALSE;
3181         Uint1* query = NULL;
3182         Int4 context;
3183         Int4 query_len;
3184         ASSERT(h);
3185 
3186         context = h->hsp->context;
3187         query = query_blk->sequence +
3188             query_info->contexts[context].query_offset;
3189         query_len = query_info->contexts[context].query_length;
3190 
3191 
3192         /* FIXME: check the overhangs of the first and last HSP and try finding
3193            splice signal within a few bases in the HSP */
3194         if (!h->next) {
3195 
3196             s_FindSpliceSignals(h->hsp, query, query_len);
3197             searched = TRUE;
3198         }
3199 
3200         while (h->next) {
3201             HSPContainer* next = h->next;
3202             ASSERT(next);
3203 
3204             /* process overlap if found */
3205             if (next->hsp->query.offset <= h->hsp->query.end &&
3206                 next->hsp->query.offset > h->hsp->query.offset) {
3207 
3208                 s_FindSpliceJunctionsForOverlaps(h->hsp, next->hsp, query,
3209                                                  query_len);
3210                 searched = TRUE;
3211                 h = h->next;
3212             }
3213             /* if not a spliced alignment */
3214             else if (next->hsp->query.offset - h->hsp->query.end < 10 &&
3215                      next->hsp->query.offset - h->hsp->query.end ==
3216                      next->hsp->subject.offset - h->hsp->subject.end) {
3217 
3218                 /* save pointer to hsps after next */
3219                 HSPContainer* following = h->next->next;
3220                 BlastHSP* new_hsp = NULL;
3221 
3222                 /* duplicate HSPContainer with the two HSPs */
3223                 h->next->next = NULL;
3224 
3225                 /* extend the first HSP to cover the gap between HSPs */
3226                 if (next->hsp->query.offset - h->hsp->query.end > 1) {
3227                     BlastHSP* first = h->hsp;
3228                     BlastHSP* second = h->next->hsp;
3229 
3230                     s_ExtendAlignment(first, query, first->query.end,
3231                               second->query.offset - 1, 0,
3232                               second->subject.offset - 1 - first->subject.end,
3233                               scoring_opts, FALSE);
3234                 }
3235 
3236                 /* merge HSPs */
3237                 new_hsp = s_MergeHSPs(h->hsp, h->next->hsp, query,
3238                                       scoring_opts);
3239 
3240                 if (new_hsp) {
3241 
3242                     /* replace the two processed HSPs with the combined one */
3243                     Blast_HSPFree(h->hsp);
3244                     HSPContainerFree(h->next);
3245                     h->hsp = new_hsp;
3246                     h->next = following;
3247                 }
3248                 else {
3249                     /* something went wrong with merging, use the initial
3250                        HSPs */
3251                     h->next->next = following;
3252                     h = h->next;
3253                 }
3254             }
3255             else if (next->hsp->query.offset - h->hsp->query.end > 0) {
3256                 s_FindSpliceJunctionsForGap(h->hsp, next->hsp, query,
3257                                             query_len, scoring_opts);
3258                 searched = TRUE;
3259                 h = h->next;
3260             }
3261             else {
3262                 h = h->next;
3263             }
3264 
3265             /* FIXME: if a splice junction cannot be found, we can try looking
3266                for the split between perfect matches */
3267         }
3268 
3269         /* Remove HSPs that have the same start position on the query or
3270            subject within a chain. They may arise from modified HSP extents. */
3271         h = ch->hsps;
3272         while (h->next) {
3273             if (h->hsp->query.offset >= h->next->hsp->query.offset ||
3274                 h->hsp->subject.offset >= h->next->hsp->subject.offset) {
3275 
3276                 if (h->hsp->score >= h->next->hsp->score) {
3277                     HSPContainer* remove = h->next;
3278                     h->next = h->next->next;
3279                     remove->next = NULL;
3280                     HSPContainerFree(remove);
3281                 }
3282                 else {
3283                     HSPContainer* remove = h;
3284                     HSPContainer* prev = ch->hsps;
3285                     if (remove == ch->hsps) {
3286                         ch->hsps = remove->next;
3287                         h = ch->hsps;
3288                     }
3289                     else {
3290                         while (prev->next && prev->next != h) {
3291                             prev = prev->next;
3292                         }
3293                         ASSERT(prev->next && prev->next == remove);
3294 
3295                         prev->next = remove->next;
3296                         h = prev;
3297                     }
3298                     remove->next = NULL;
3299                     HSPContainerFree(remove);
3300                 }
3301 
3302                 continue;
3303 
3304             }
3305             h = h->next;
3306         }
3307 
3308         /* recalculated chain score if splice sites were searched */
3309         if (searched) {
3310             ch->score = s_ComputeChainScore(ch, scoring_opts, query_len, FALSE);
3311         }
3312     }
3313 
3314     s_TestChains(chains);
3315 
3316     return 0;
3317 }
3318 
3319 
3320 /* Find the best scoring chain of HSPs for aligning a single RNA-Seq read to
3321    a genome on a single strand, returns all top scoring chains */
s_FindBestPath(HSPNode * nodes,Int4 num,HSPPath * path,Int4 oid,Boolean is_spliced,const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const ScoringOptions * scoring_opts)3322 static HSPChain* s_FindBestPath(HSPNode* nodes, Int4 num, HSPPath* path,
3323                                 Int4 oid, Boolean is_spliced,
3324                                 const BLAST_SequenceBlk* query_blk,
3325                                 const BlastQueryInfo* query_info,
3326                                 const ScoringOptions* scoring_opts)
3327 {
3328     int i, k;
3329     Int4 best_score = 0;
3330     const Int4 kMaxIntronLength = 900000;
3331     /* FIXME: use mismatch and/or gap extend penalty here */
3332     HSPChain* retval = NULL;
3333 
3334     if (!path || !nodes || !num) {
3335         return NULL;
3336     }
3337     path->num_paths = 0;
3338     path->score = 0;
3339 
3340     for (i = num - 1;i >= 0;i--) {
3341 
3342         BlastHSP* hsp = *(nodes[i].hsp);
3343         int self_score = nodes[i].best_score;
3344         /* FIXME: the is_spliced condition is a hack */
3345         for (k = i + 1;k < num && is_spliced;k++) {
3346 
3347             BlastHSP* newhsp = *(nodes[k].hsp);
3348             Int4 new_score = nodes[k].best_score + self_score -
3349                 s_GetOverlapCost(newhsp, *(nodes[i].hsp), 4);
3350 
3351             /* FIXME: some of the conditions double others */
3352             const Int4 hsp_len = hsp->query.end - hsp->query.offset;
3353             const Int4 newhsp_len = newhsp->query.end - newhsp->query.offset;
3354             const Int4 overlap_len = MAX(MIN(hsp->query.end, newhsp->query.end) -
3355                               MAX(hsp->query.offset, newhsp->query.offset), 0);
3356 
3357             /* add next HSP to the path only if there is fewer than 10 query
3358                bases unaligned between HSPs, newhsp is not contained within
3359                hsp, newhsp aligns to the subject behind hsp, and score improves */
3360             /* FIXME: there should be a penalty if new hsp->query.offset >
3361                hsp->query.end */
3362             if (newhsp->query.offset - hsp->query.end < 10 &&
3363                 newhsp->query.offset > hsp->query.offset &&
3364                 newhsp->query.end > hsp->query.end &&
3365                 newhsp->subject.offset - hsp->subject.end >=
3366                    /* the difference on query may be smaller than zero,
3367                       this will let as combine HSPs of which extension stopped
3368                       too soon (not real introns) */
3369                    newhsp->query.offset - hsp->query.end &&
3370                 newhsp->subject.offset - hsp->subject.end < kMaxIntronLength &&
3371                 (double)overlap_len / hsp_len < 0.75 &&
3372                 (double)overlap_len / newhsp_len < 0.75 &&
3373                 new_score > nodes[i].best_score) {
3374 
3375                 /* prefer paths with identified splice sites to those without
3376                    ones */
3377                 /* FIXME: add min intron length to the condition below */
3378                 if (newhsp->subject.offset - hsp->subject.end > 1) {
3379                     /* FIXME: The function that finds splice signals modifies
3380                        HSPs, so we need to clone HSPs here. */
3381                     BlastHSP* hsp_copy = Blast_HSPClone(hsp);
3382                     BlastHSP* newhsp_copy = Blast_HSPClone(newhsp);
3383 
3384                     HSPChain* chain = HSPChainNew(hsp->context);
3385                     chain->hsps = HSPContainerNew(&hsp_copy);
3386                     chain->hsps->next = HSPContainerNew(&newhsp_copy);
3387 
3388                     s_FindSpliceJunctions(chain, query_blk, query_info,
3389                                           scoring_opts);
3390 
3391                     if ((chain->hsps->hsp->map_info->right_edge &
3392                          MAPPER_SPLICE_SIGNAL) == 0) {
3393 
3394 /*                        new_score += scoring_opts->no_splice_signal; */
3395                         /* FIXME: temporarely, do not create chains if splice
3396                            signals are not found */
3397                         new_score = 0;
3398                     }
3399 
3400                     chain = HSPChainFree(chain);
3401                 }
3402                 else if (newhsp->query.offset - hsp->query.end == 1) {
3403                     /* FIXME: this is a temporary hack, we need to increase
3404                        the score of combining two HSP into one with a
3405                        mismatch so that it is scored the same as the one with
3406                        splice signal */
3407                     new_score++;
3408                 }
3409                 else {
3410                     new_score = 0;
3411                 }
3412 
3413                 if (new_score > nodes[i].best_score) {
3414                     nodes[i].best_score = new_score;
3415                     nodes[i].path_next = &(nodes[k]);
3416                 }
3417             }
3418         }
3419 
3420         /* we want to return all top scoring hits, so we save all paths with
3421            top score */
3422         if (nodes[i].best_score == best_score) {
3423             if (path->num_paths < MAX_NUM_HSP_PATHS) {
3424                 path->start[path->num_paths++] = &(nodes[i]);
3425             }
3426         }
3427         else if (nodes[i].best_score > best_score) {
3428             best_score = nodes[i].best_score;
3429             path->start[0] = &(nodes[i]);
3430             path->num_paths = 1;
3431         }
3432     }
3433     ASSERT(path->num_paths);
3434     path->score = path->start[0]->best_score;
3435 
3436     /* Find if any paths share an HSP and remove the one with the larger index.
3437        Doing this at the end ensures that we find optimal path in terms of
3438        score and/or HSP positions. */
3439     for (i = 0;i < path->num_paths - 1;i++) {
3440         if (!path->start[i]) {
3441             continue;
3442         }
3443         for (k = i + 1;k < path->num_paths;k++) {
3444             HSPNode* node_i = path->start[i];
3445             if (!path->start[k]) {
3446                 continue;
3447             }
3448 
3449             while (node_i) {
3450                 HSPNode* node_k = path->start[k];
3451                 while (node_k) {
3452                     if (node_i->hsp[0] == node_k->hsp[0]) {
3453                         path->start[k] = NULL;
3454                         break;
3455                     }
3456                     node_k = node_k->path_next;
3457                 }
3458                 node_i = node_i->path_next;
3459             }
3460 
3461         }
3462     }
3463 
3464     ASSERT(path->num_paths > 0);
3465     ASSERT(path->start[0]);
3466 
3467     /* conver from HSPPath to HSPChain structure */
3468     if (path->num_paths > 0) {
3469         HSPChain* ch = NULL;
3470         HSPContainer* hc = NULL;
3471         HSPNode* node = path->start[0];
3472 
3473         retval = HSPChainNew((*node->hsp)->context);
3474         if (!retval) {
3475             return NULL;
3476         }
3477         retval->oid = oid;
3478         retval->score = path->score;
3479         retval->hsps = hc = HSPContainerNew(node->hsp);
3480         node = node->path_next;
3481         while (node) {
3482             hc->next = HSPContainerNew(node->hsp);
3483             if (!hc->next) {
3484                 HSPChainFree(retval);
3485                 return NULL;
3486             }
3487             hc = hc->next;
3488             node = node->path_next;
3489         }
3490         ASSERT(retval->hsps);
3491 
3492         ch = retval;
3493         for (i = 1;i < path->num_paths;i++) {
3494             node = path->start[i];
3495             if (!node) {
3496                 continue;
3497             }
3498             ch->next = HSPChainNew((*node->hsp)->context);
3499             ASSERT(ch->next);
3500             if (!ch->next) {
3501                 HSPChainFree(retval);
3502                 return NULL;
3503             }
3504             ch->next->oid = oid;
3505             ch->next->score = path->score;
3506             ch->next->hsps = hc = HSPContainerNew(node->hsp);
3507             node = node->path_next;
3508             while (node) {
3509                 hc->next = HSPContainerNew(node->hsp);
3510                 ASSERT(hc->next);
3511                 if (!hc->next) {
3512                     HSPChainFree(retval);
3513                     return NULL;
3514                 }
3515                 hc = hc->next;
3516                 node = node->path_next;
3517             }
3518             ch = ch->next;
3519             ASSERT(ch->hsps);
3520         }
3521     }
3522 
3523     ASSERT(path->num_paths == 0 || s_TestChains(retval));
3524 
3525     return retval;
3526 }
3527 
3528 /* Compare HSPs by context (accending), and score (descending). */
3529 static int
s_CompareHSPsByContextScore(const void * v1,const void * v2)3530 s_CompareHSPsByContextScore(const void* v1, const void* v2)
3531 {
3532    BlastHSP* h1,* h2;
3533    BlastHSP** hp1,** hp2;
3534 
3535    hp1 = (BlastHSP**) v1;
3536    hp2 = (BlastHSP**) v2;
3537    h1 = *hp1;
3538    h2 = *hp2;
3539 
3540    if (!h1 && !h2)
3541       return 0;
3542    else if (!h1)
3543       return -1;
3544    else if (!h2)
3545       return 1;
3546 
3547    if (h1->context < h2->context)
3548       return -1;
3549    if (h1->context > h2->context)
3550       return 1;
3551 
3552    if (h1->score < h2->score)
3553       return 1;
3554    if (h1->score > h2->score)
3555       return -1;
3556 
3557    return 0;
3558 }
3559 
3560 static int
s_CompareHSPsByContextSubjectOffset(const void * v1,const void * v2)3561 s_CompareHSPsByContextSubjectOffset(const void* v1, const void* v2)
3562 {
3563    BlastHSP* h1,* h2;
3564    BlastHSP** hp1,** hp2;
3565 
3566    hp1 = (BlastHSP**) v1;
3567    hp2 = (BlastHSP**) v2;
3568    h1 = *hp1;
3569    h2 = *hp2;
3570 
3571    if (!h1 && !h2)
3572       return 0;
3573    else if (!h1)
3574       return 1;
3575    else if (!h2)
3576       return -1;
3577 
3578    /* If these are from different contexts, don't compare offsets */
3579    if (h1->context < h2->context)
3580       return -1;
3581    if (h1->context > h2->context)
3582       return 1;
3583 
3584    if (h1->subject.offset < h2->subject.offset)
3585       return -1;
3586    if (h1->subject.offset > h2->subject.offset)
3587       return 1;
3588 
3589    if (h1->query.offset < h2->query.offset)
3590       return -1;
3591    if (h1->query.offset > h2->query.offset)
3592       return 1;
3593 
3594    /* tie breakers: sort by decreasing score, then
3595       by increasing size of query range, then by
3596       increasing subject range. */
3597 
3598    if (h1->score < h2->score)
3599       return 1;
3600    if (h1->score > h2->score)
3601       return -1;
3602 
3603    if (h1->query.end < h2->query.end)
3604       return 1;
3605    if (h1->query.end > h2->query.end)
3606       return -1;
3607 
3608    if (h1->subject.end < h2->subject.end)
3609       return 1;
3610    if (h1->subject.end > h2->subject.end)
3611       return -1;
3612 
3613    return 0;
3614 }
3615 
3616 
3617 /* Pair data, used for selecting optimal pairs of aligned chains */
3618 typedef struct Pairinfo
3619 {
3620     HSPChain* first;
3621     HSPChain* second;
3622     Int4 distance;
3623     Int4 conf;
3624     Int4 score;
3625     Int4 trim_first;    // when a chain overextends past beginning of the pair
3626     Int4 trim_second;   // it needs to be trimmed
3627     Boolean valid_pair;
3628 } Pairinfo;
3629 
3630 
s_BlastHSPMapperSplicedRunCleanUp(HSPPath * path,HSPNode * nodes,Pairinfo * pair_data)3631 static void s_BlastHSPMapperSplicedRunCleanUp(HSPPath* path,
3632                                               HSPNode* nodes,
3633                                               Pairinfo* pair_data)
3634 {
3635     HSPPathFree(path);
3636 
3637     if (nodes) {
3638         sfree(nodes);
3639     }
3640 
3641     if (pair_data) {
3642         sfree(pair_data);
3643     }
3644 }
3645 
3646 
3647 /* Compare two HSP chain pairs aligned for the same query and subject
3648    by score and distance */
3649 static int
s_ComparePairs(const void * v1,const void * v2)3650 s_ComparePairs(const void* v1, const void* v2)
3651 {
3652     Pairinfo* a = (Pairinfo*)v1;
3653     Pairinfo* b = (Pairinfo*)v2;
3654 
3655     if (a->score > b->score) {
3656         return -1;
3657     }
3658     if (a->score < b->score) {
3659         return 1;
3660     }
3661 
3662     if (a->conf < b->conf) {
3663         return -1;
3664     }
3665     if (a->conf > b->conf) {
3666         return 1;
3667     }
3668 
3669     if (a->distance < b->distance) {
3670         return -1;
3671     }
3672     if (a->distance > b->distance) {
3673         return 1;
3674     }
3675 
3676     return 0;
3677 }
3678 
3679 
3680 /* Trim beginning of a chain alignment up to the given subject position */
s_TrimChainStartToSubjPos(HSPChain * chain,Int4 subj_pos,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score)3681 static Int4 s_TrimChainStartToSubjPos(HSPChain* chain, Int4 subj_pos,
3682                                       Int4 mismatch_score,
3683                                       Int4 gap_open_score,
3684                                       Int4 gap_extend_score)
3685 {
3686     HSPContainer* h = NULL;
3687     BlastHSP* hsp = NULL;
3688     BlastHSP* next_hsp = NULL;
3689     Int4 num_left;
3690     Int4 old_score;
3691 
3692     ASSERT(chain);
3693     ASSERT(subj_pos >= 0);
3694     if (!chain || subj_pos < 0) {
3695         return 0;
3696     }
3697 
3698     /* first remove HSPs that are fully below the subject position */
3699     h = chain->hsps;
3700     while (h && h->hsp->subject.end < subj_pos) {
3701         HSPContainer* tmp = h;
3702         ASSERT(tmp);
3703         h = h->next;
3704         tmp->next = NULL;
3705         chain->score -= tmp->hsp->score;
3706         HSPContainerFree(tmp);
3707     }
3708     ASSERT(h);
3709     chain->hsps = h;
3710     if (!h) {
3711         return -1;
3712     }
3713 
3714     /* if all remaning HSPs are above the subject position, then exit */
3715     if (h->hsp->subject.offset >= subj_pos) {
3716         return 0;
3717     }
3718 
3719     /* otherwise move HSP start positions */
3720     hsp = h->hsp;
3721     num_left = subj_pos - hsp->subject.offset;
3722     ASSERT(num_left > 0 && num_left <= hsp->subject.end - hsp->subject.offset);
3723 
3724     old_score = hsp->score;
3725     s_TrimHSP(hsp, num_left, FALSE, TRUE, mismatch_score, gap_open_score,
3726               gap_extend_score);
3727 
3728     /* update chain score */
3729     chain->score -= old_score - hsp->score;
3730 
3731     /* remove splice signals and exon flags */
3732     hsp->map_info->left_edge &= 0xff ^ MAPPER_SPLICE_SIGNAL;
3733     hsp->map_info->left_edge &= 0xff ^ MAPPER_EXON;
3734 
3735     /* check whether the HSP is contained within the next one and remove it
3736        if it is */
3737     next_hsp = NULL;
3738     if (chain->hsps->next) {
3739         next_hsp = chain->hsps->next->hsp;
3740     }
3741     if (next_hsp && hsp->query.offset >= next_hsp->query.offset) {
3742         chain->hsps = h->next;
3743         h->next = NULL;
3744         HSPContainerFree(h);
3745     }
3746 
3747     return 0;
3748 }
3749 
3750 /* Trim chain alignment end up the the given subject position */
s_TrimChainEndToSubjPos(HSPChain * chain,Int4 subj_pos,Int4 mismatch_score,Int4 gap_open_score,Int4 gap_extend_score)3751 static Int4 s_TrimChainEndToSubjPos(HSPChain* chain, Int4 subj_pos,
3752                                     Int4 mismatch_score,
3753                                     Int4 gap_open_score, Int4 gap_extend_score)
3754 {
3755     HSPContainer* h = NULL;
3756     BlastHSP* hsp = NULL;
3757     Int4 num_left;
3758     Int4 old_score;
3759 
3760     ASSERT(chain);
3761     ASSERT(subj_pos >= 0);
3762     if (!chain || subj_pos <= 0) {
3763         return -1;
3764     }
3765 
3766     /* first remove HSPs that are full above the subject position */
3767     h = chain->hsps;
3768     while (h->next && h->next->hsp->subject.end < subj_pos) {
3769         h = h->next;
3770     }
3771 
3772     if (h->next) {
3773         HSPContainer* hc = NULL;
3774         if (h->next->hsp->subject.offset < subj_pos) {
3775             h = h->next;
3776         }
3777 
3778         /* update chain score */
3779         for (hc = h->next;hc != NULL;hc = hc->next) {
3780             chain->score -= hc->hsp->score;
3781         }
3782 
3783         HSPContainerFree(h->next);
3784         h->next = NULL;
3785     }
3786 
3787     /* if all remaning HSPs are below the subject position, exit */
3788     if (h->hsp->subject.end <= subj_pos) {
3789         return 0;
3790     }
3791 
3792     /* otheriwse move HSP end positions */
3793     hsp = h->hsp;
3794     num_left = hsp->subject.end - subj_pos;
3795     ASSERT(num_left > 0 && num_left <= hsp->subject.end - hsp->subject.offset);
3796 
3797     old_score = hsp->score;
3798     s_TrimHSP(hsp, num_left, FALSE, FALSE, mismatch_score, gap_open_score,
3799               gap_extend_score);
3800 
3801     /* update chain score */
3802     chain->score -= old_score - hsp->score;
3803 
3804     /* remove splice signals and exon flags */
3805     hsp->map_info->right_edge &= 0xff ^ MAPPER_SPLICE_SIGNAL;
3806     hsp->map_info->right_edge &= 0xff ^ MAPPER_EXON;
3807 
3808     /* check whether the HSP is contained within the next one and remove it
3809        if it is */
3810     if (chain->hsps != h) {
3811         HSPContainer* hc = chain->hsps;
3812         while (hc && hc->next != h) {
3813             hc = hc->next;
3814         }
3815         ASSERT(hc && hc->next == h);
3816         if (hc->hsp->query.end >= h->hsp->query.end) {
3817             chain->score -= h->hsp->score;
3818             HSPContainerFree(h);
3819             hc->next = NULL;
3820         }
3821     }
3822 
3823     return 0;
3824 }
3825 
3826 
3827 /* Find optimal pairs */
s_FindBestPairs(HSPChain ** first_list,HSPChain ** second_list,Int4 min_score,Pairinfo ** pair_info_ptr,Int4 * max_num_pairs,const ScoringOptions * scoring_options)3828 static Boolean s_FindBestPairs(HSPChain** first_list,
3829                                HSPChain** second_list,
3830                                Int4 min_score,
3831                                Pairinfo** pair_info_ptr,
3832                                Int4* max_num_pairs,
3833                                const ScoringOptions* scoring_options)
3834 {
3835     HSPChain* first;
3836     HSPChain* second;
3837     Pairinfo* pair_info = *pair_info_ptr;
3838     Int4 conv_bonus = 0;
3839     Int4 num_pairs = 0;
3840     Boolean found = FALSE;
3841 
3842     /* iterate over all pairs of HSP chains for the first and second read of
3843        the pair and collect pair information */
3844     for (first = *first_list; first; first = first->next) {
3845         for (second = *second_list; second; second = second->next) {
3846             Int2 first_frame = first->hsps->hsp->query.frame;
3847             Int2 second_frame = second->hsps->hsp->query.frame;
3848 
3849             /* reallocate pair info array if necessary */
3850             if (num_pairs >= *max_num_pairs) {
3851                 *max_num_pairs *= 2;
3852                 Pairinfo* new_pair_info =
3853                     realloc(pair_info, *max_num_pairs * sizeof(Pairinfo));
3854                 if (!new_pair_info) {
3855                     return -1;
3856                 }
3857                 pair_info = new_pair_info;
3858                 *pair_info_ptr = new_pair_info;
3859             }
3860 
3861             /* collect pair information */
3862             pair_info[num_pairs].first = first;
3863             pair_info[num_pairs].second = second;
3864             pair_info[num_pairs].score = first->score + second->score;
3865             pair_info[num_pairs].trim_first = 0;
3866             pair_info[num_pairs].trim_second = 0;
3867             pair_info[num_pairs].valid_pair = 0;
3868 
3869             /* if the chains align on the opposite strands */
3870             ASSERT(first_frame != 0 && second_frame != 0);
3871             if (SIGN(first_frame) != SIGN(second_frame)) {
3872                 HSPChain* plus = NULL, *minus = NULL;
3873                 Int4 plus_start, minus_start;
3874                 HSPContainer* hsp = NULL;
3875                 Int4 distance = 0;
3876 
3877                 /* find which query aligns to plus and which to minus strand
3878                    on the subject */
3879                 if (first_frame > 0) {
3880                     plus = first;
3881                 }
3882                 else {
3883                     minus = first;
3884                 }
3885                 if (second_frame > 0) {
3886                     plus = second;
3887                 }
3888                 else {
3889                     minus = second;
3890                 }
3891                 ASSERT(plus && minus);
3892 
3893                 /* find start positions on the subject, for the minus strand we
3894                    are interested subject start position on the minus strand,
3895                    but the HSP has subject on plus and query on the minus
3896                    strand, so we need to flip the query */
3897                 plus_start = plus->hsps->hsp->subject.offset;
3898                 hsp = minus->hsps;
3899                 while (hsp->next) {
3900                     hsp = hsp->next;
3901                 }
3902                 ASSERT(hsp);
3903                 minus_start = hsp->hsp->subject.end;
3904 
3905                 /* this is also fragment length */
3906                 distance = minus_start - plus_start;
3907                 pair_info[num_pairs].distance = distance;
3908 
3909                 /* distance > 0 indicates a convergent pair (typical) */
3910                 if (distance > 0) {
3911                     Int4 plus_end, minus_end;
3912                     hsp = plus->hsps;
3913                     while (hsp->next) {
3914                         hsp = hsp->next;
3915                     }
3916                     ASSERT(hsp);
3917                     plus_end = hsp->hsp->subject.end;
3918                     minus_end = minus->hsps->hsp->subject.offset;
3919 
3920                     /* HSP chain end going past the beginning of the chain
3921                        for the pair, indicates that the this end of the
3922                        query may be aligned to an adapater instead of a real
3923                        sequence and needs to be trimmed */
3924                     if (plus_end > minus_start || minus_end < plus_start) {
3925                         ASSERT(plus && minus);
3926 
3927                         /* check the 3' (right) side of the fragment */
3928                         if (plus_end > minus_start) {
3929                             ASSERT(plus_end - minus_start > 0);
3930 
3931                             /* decrease the score by the trimmed portion; we
3932                                assume here that the trimmed part aligns
3933                                perfectly, which may not be the case, but this
3934                                score is only used for comparison with different
3935                                pairs */
3936                             pair_info[num_pairs].score -=
3937                                 plus_end - minus_start;
3938 
3939                             /* Mark which chain and up to what subject location
3940                                needs to be trimmed. We cannot trim here,
3941                                because pair_info holds poiners to chains that
3942                                may be part of better pairs */
3943                             if (plus == pair_info[num_pairs].first) {
3944                                 pair_info[num_pairs].trim_first = minus_start;
3945                             }
3946                             else {
3947                                 pair_info[num_pairs].trim_second = minus_start;
3948                             }
3949                         }
3950 
3951                         /* check the 5' (left) side doe the fragment */
3952                         if (minus_end < plus_start) {
3953                             ASSERT(plus_start - minus_end > 0);
3954 
3955                             pair_info[num_pairs].score -=
3956                                 plus_start - minus_end;
3957 
3958                             if (minus == pair_info[num_pairs].first) {
3959                                 pair_info[num_pairs].trim_first = plus_start;
3960                             }
3961                             else {
3962                                 pair_info[num_pairs].trim_second = plus_start;
3963                             }
3964                         }
3965                     }
3966 
3967                     pair_info[num_pairs].conf = PAIR_CONVERGENT;
3968 
3969                     /* add a bonus for convergent pairs; because they are more
3970                        common we prefer them over different configurations
3971                        with a sligntly better score */
3972                     pair_info[num_pairs].score += conv_bonus;
3973                 }
3974                 else {
3975                     pair_info[num_pairs].conf = PAIR_DIVERGENT;
3976                 }
3977 
3978                 /* skip pairs with a score smaller than one found for a
3979                    different subject */
3980                 if (pair_info[num_pairs].score < min_score) {
3981                     continue;
3982                 }
3983 
3984                 num_pairs++;
3985             }
3986             else {
3987                 /* for read pairs aligned in the same direction */
3988                 pair_info[num_pairs].conf = PAIR_PARALLEL;
3989                 pair_info[num_pairs].score -= 1;
3990                 num_pairs++;
3991             }
3992         }
3993     }
3994 
3995     if (num_pairs > 0) {
3996         Int4 i;
3997         Int4 best_score = pair_info[0].score;
3998         Int4 margin = 5;
3999         HSPChain* ch = NULL;
4000 
4001         /* sort pair information by score, configuration distance */
4002         qsort(pair_info, num_pairs, sizeof(Pairinfo), s_ComparePairs);
4003 
4004         /* iterate over pair information and collect pairs */
4005         for (i=0;i < num_pairs;i++) {
4006             HSPChain* f = pair_info[i].first;
4007             HSPChain* s = pair_info[i].second;
4008 
4009             /* skip pairs with chains already used in other pairs */
4010             if (pair_info[i].first->pair || pair_info[i].second->pair) {
4011                 continue;
4012             }
4013 
4014             /* skip pairs with scores significantly smaller than the best one
4015                found */
4016             if (best_score - pair_info[i].score > margin) {
4017                 break;
4018             }
4019 
4020             /* link chains as a pair */
4021             f->pair = s;
4022             s->pair = f;
4023             f->pair_conf = s->pair_conf = pair_info[i].conf;
4024             pair_info[i].valid_pair = TRUE;
4025             found = TRUE;
4026         }
4027 
4028         /* for chains that pair with already paired chains, clone the paired
4029            chain */
4030         for (ch = *first_list; ch; ch = ch->next) {
4031             HSPChain* pair = NULL;
4032 
4033             if (ch->pair) {
4034                 continue;
4035             }
4036 
4037             i = 0;
4038             while (i < num_pairs &&
4039                    (pair_info[i].first != ch || !pair_info[i].second->pair ||
4040                     pair_info[i].conf != PAIR_CONVERGENT ||
4041                     best_score - pair_info[i].score > margin)) {
4042                 i++;
4043             }
4044             if (i >= num_pairs) {
4045                 continue;
4046             }
4047             ASSERT(pair_info[i].second->pair);
4048 
4049             pair = s_CloneChain(pair_info[i].second);
4050             ASSERT(pair);
4051             ASSERT(s_TestChains(pair));
4052             pair_info[i].second = pair;
4053             ch->pair = pair;
4054             pair->pair = ch;
4055             ch->pair_conf = pair->pair_conf = pair_info[i].conf;
4056             pair_info[i].valid_pair = TRUE;
4057             HSPChainListInsert(second_list, pair, FALSE);
4058         }
4059 
4060         for (ch = *second_list; ch; ch = ch->next) {
4061             HSPChain* pair = NULL;
4062 
4063             if (ch->pair) {
4064                 continue;
4065             }
4066 
4067             i = 0;
4068             while (i < num_pairs &&
4069                    (pair_info[i].second != ch || !pair_info[i].first->pair ||
4070                     pair_info[i].conf != PAIR_CONVERGENT ||
4071                     best_score - pair_info[i].score > margin)) {
4072                 i++;
4073             }
4074             if (i >= num_pairs) {
4075                 continue;
4076             }
4077             ASSERT(pair_info[i].first->pair);
4078 
4079             pair = s_CloneChain(pair_info[i].first);
4080             ASSERT(pair);
4081             ASSERT(s_TestChains(pair));
4082             pair_info[i].first = pair;
4083             ch->pair = pair;
4084             pair->pair = ch;
4085             ch->pair_conf = pair->pair_conf = pair_info[i].conf;
4086             pair_info[i].valid_pair = TRUE;
4087             HSPChainListInsert(first_list, pair, FALSE);
4088         }
4089 
4090         /* trim the alignments that overextend beyond the pair */
4091         for (i = 0;i < num_pairs;i++) {
4092             if (!pair_info[i].valid_pair ||
4093                 !pair_info[i].first->pair || !pair_info[i].second->pair) {
4094                 continue;
4095             }
4096 
4097             /* trim the first chain if its end goes beyond the start of
4098                second on the subject */
4099             if (pair_info[i].trim_first > 0) {
4100                 if (pair_info[i].first->hsps->hsp->query.frame > 0) {
4101                     s_TrimChainEndToSubjPos(pair_info[i].first,
4102                                             pair_info[i].trim_first,
4103                                             scoring_options->penalty,
4104                                             scoring_options->gap_open,
4105                                             scoring_options->gap_extend);
4106                 }
4107                 else {
4108                     /* minus strand of the qurey is represented by another
4109                        (reverse complemented) sequence aligned like the plus
4110                        strand, so the end of the chain is really the beginnig
4111                        by query coordinates */
4112                     s_TrimChainStartToSubjPos(pair_info[i].first,
4113                                               pair_info[i].trim_first,
4114                                               scoring_options->penalty,
4115                                               scoring_options->gap_open,
4116                                               scoring_options->gap_extend);
4117                 }
4118             }
4119 
4120             /* trim the second chain if its end goes beyond the start of the
4121                first on the subject */
4122             if (pair_info[i].trim_second > 0) {
4123                 if (pair_info[i].second->hsps->hsp->query.frame > 0) {
4124                     s_TrimChainEndToSubjPos(pair_info[i].second,
4125                                             pair_info[i].trim_second,
4126                                             scoring_options->penalty,
4127                                             scoring_options->gap_open,
4128                                             scoring_options->gap_extend);
4129                 }
4130                 else {
4131                     s_TrimChainStartToSubjPos(pair_info[i].second,
4132                                               pair_info[i].trim_second,
4133                                               scoring_options->penalty,
4134                                               scoring_options->gap_open,
4135                                               scoring_options->gap_extend);
4136                 }
4137             }
4138 
4139         }
4140     }
4141 
4142     return found;
4143 }
4144 
4145 
4146 /** Perform writing task for paired reads
4147  * ownership of the HSP list and sets the dereferenced pointer to NULL.
4148  * @param data To store results to [in][out]
4149  * @param hsp_list Pointer to the HSP list to save in the collector. [in]
4150  */
4151 static int
s_BlastHSPMapperSplicedPairedRun(void * data,BlastHSPList * hsp_list)4152 s_BlastHSPMapperSplicedPairedRun(void* data, BlastHSPList* hsp_list)
4153 {
4154     BlastHSPMapperData* spl_data = data;
4155     BlastHSPMapperParams* params = spl_data->params;
4156     const ScoringOptions* scoring_opts = &params->scoring_options;
4157     Boolean is_spliced  = params->splice;
4158     BLAST_SequenceBlk* query_blk = spl_data->query;
4159     BlastQueryInfo* query_info = spl_data->query_info;
4160     const Int4 kDefaultMaxHsps = 1000;
4161     Int4 max_hsps = kDefaultMaxHsps;
4162     HSPNode* nodes = calloc(max_hsps, sizeof(HSPNode));
4163     HSPPath* path = NULL;
4164     BlastHSP** hsp_array;
4165     Int4 num_hsps = 0;
4166     Int4 i;
4167 
4168     HSPChain** saved_chains = spl_data->saved_chains;
4169     HSPChain* chain_array[2];
4170     HSPChain* first = NULL, *second = NULL;
4171     Int4 workspace_size = 40;
4172     Pairinfo* pair_workspace = calloc(workspace_size, sizeof(Pairinfo));
4173     /* the score bonus needs to include exon pars that were to short to be
4174        aligned (at either end of the chain) */
4175     const Int4 kPairBonus = is_spliced ? 21 : 5;
4176 
4177     if (!hsp_list) {
4178         s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4179         return 0;
4180     }
4181 
4182     if (!params || !nodes) {
4183         s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4184         return -1;
4185     }
4186 
4187     hsp_array = hsp_list->hsp_array;
4188 
4189     path = HSPPathNew();
4190 
4191     /* sort HSPs by query context and subject position for spliced aligments */
4192     if (is_spliced) {
4193         qsort(hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
4194               s_CompareHSPsByContextSubjectOffset);
4195     }
4196     else {
4197         /* for non spliced alignments, sort HSPs by query context and score */
4198         qsort(hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
4199               s_CompareHSPsByContextScore);
4200     }
4201 
4202     i = 0;
4203     chain_array[0] = NULL;
4204     chain_array[1] = NULL;
4205     while (i < hsp_list->hspcnt) {
4206 
4207         Int4 context = hsp_array[i]->context;
4208         /* is this query the first segment of a pair */
4209         Boolean has_pair = (query_info->contexts[context].segment_flags ==
4210                             eFirstSegment);
4211         Int4 query_idx  = context / NUM_STRANDS;
4212         Int4 first_context = query_idx * NUM_STRANDS;
4213         Int4 context_next_fragment = has_pair ? (query_idx + 2) * NUM_STRANDS :
4214             (query_idx + 1) * NUM_STRANDS;
4215 
4216         HSPChain* new_chains = NULL;
4217 
4218         /* iterate over contexts that belong to a read pair */
4219         while (i < hsp_list->hspcnt &&
4220                hsp_array[i]->context < context_next_fragment) {
4221 
4222             memset(nodes, 0, num_hsps * sizeof(HSPNode));
4223             num_hsps = 0;
4224             context = hsp_array[i]->context;
4225 
4226             /* collect HSPs for the current context */
4227             while (i < hsp_list->hspcnt && hsp_array[i]->context == context) {
4228 
4229                 /* Overlapping portions of two consecutive subject chunks
4230                    may produce two HSPs representing the same alignment. We
4231                    drop one of them here (assuming the HSPs for the same
4232                    context are sorted by score or subject position). */
4233                 BlastHSP* h = (num_hsps == 0 ? NULL : *nodes[num_hsps - 1].hsp);
4234                 if (!h || hsp_array[i]->context != h->context ||
4235                     hsp_array[i]->subject.offset != h->subject.offset ||
4236                     hsp_array[i]->score != h->score) {
4237 
4238                     /* reallocate node array if necessary */
4239                     if (num_hsps >= max_hsps) {
4240                         HSPNode* new_nodes = NULL;
4241 
4242                         /* For non-spliced mapping we are only interested in
4243                            the best scoring whole HSPs. HSPs are sorted by
4244                            score and those that map to many places are not good
4245                            so the node array does not need to be extended. */
4246                         if (!is_spliced) {
4247                             break;
4248                         }
4249 
4250                         max_hsps *= 2;
4251                         new_nodes = calloc(max_hsps, sizeof(HSPNode));
4252                         if (!nodes) {
4253                             s_BlastHSPMapperSplicedRunCleanUp(path, nodes,
4254                                                               pair_workspace);
4255                             return -1;
4256                         }
4257                         s_HSPNodeArrayCopy(new_nodes, nodes, num_hsps);
4258                         free(nodes);
4259                         nodes = new_nodes;
4260                     }
4261 
4262                     nodes[num_hsps].hsp = &(hsp_array[i]);
4263                     nodes[num_hsps].best_score = hsp_array[i]->score;
4264 
4265                     num_hsps++;
4266                 }
4267                 i++;
4268             }
4269 
4270             /* find the best scoring chain of HSPs that align to exons
4271                parts of the query and the subject */
4272             new_chains = s_FindBestPath(nodes, num_hsps, path,
4273                                         hsp_list->oid, is_spliced,
4274                                         query_blk, query_info, scoring_opts);
4275 
4276             ASSERT(new_chains);
4277             HSPChainListInsert(&chain_array[(context - first_context) /
4278                                             NUM_STRANDS],
4279                                new_chains, FALSE);
4280 
4281             /* skip HSPs for the same context if there are more then max
4282                allowed per context */
4283             while (i < hsp_list->hspcnt && hsp_array[i]->context == context) {
4284                 i++;
4285             }
4286         }
4287 
4288 #if _DEBUG
4289         ASSERT(!chain_array[0] || s_TestChainsSorted(chain_array[0]));
4290         ASSERT(!chain_array[1] || s_TestChainsSorted(chain_array[1]));
4291 #endif
4292 
4293         if (is_spliced && chain_array[0]) {
4294             s_FindSpliceJunctions(chain_array[0], query_blk, query_info,
4295                                   scoring_opts);
4296         }
4297 
4298         if (is_spliced && chain_array[1]) {
4299             s_FindSpliceJunctions(chain_array[1], query_blk, query_info,
4300                                   scoring_opts);
4301         }
4302 
4303         first = chain_array[0];
4304         second = chain_array[1];
4305 
4306         /* If the chains provide sufficient score check for pairs.
4307            The score bonus is added to prefer a pair with a slightly smaller
4308            score to single chains with a better one. */
4309         if (first && second) {
4310 
4311             s_FindBestPairs(&first, &second, 0, &pair_workspace,
4312                             &workspace_size, scoring_opts);
4313 
4314             ASSERT(s_TestChains(first));
4315             ASSERT(s_TestChains(second));
4316         }
4317 
4318         /* save all chains and remove ones with scores lower than
4319            best score - kPairBonus */
4320         if (first) {
4321             HSPChainListInsert(&saved_chains[query_idx], first, TRUE);
4322             HSPChainListTrim(saved_chains[query_idx], kPairBonus);
4323         }
4324         if (second) {
4325             HSPChainListInsert(&saved_chains[query_idx + 1], second, TRUE);
4326             HSPChainListTrim(saved_chains[query_idx + 1], kPairBonus);
4327         }
4328 
4329 #if _DEBUG
4330         ASSERT(!chain_array[0] || s_TestChainsSorted(saved_chains[query_idx]));
4331         ASSERT(!chain_array[1] || s_TestChainsSorted(saved_chains[query_idx + 1]));
4332 #endif
4333 
4334         /* make temporary lists empty */
4335         chain_array[0] = chain_array[1] = NULL;
4336     }
4337 
4338     /* delete HSPs that were not saved in results */
4339     hsp_list = Blast_HSPListFree(hsp_list);
4340 
4341     s_BlastHSPMapperSplicedRunCleanUp(path, nodes, pair_workspace);
4342 
4343     return 0;
4344 }
4345 
4346 
4347 /** Free the writer
4348  * @param writer The writer to free [in]
4349  * @return NULL.
4350  */
4351 static
4352 BlastHSPWriter*
s_BlastHSPMapperFree(BlastHSPWriter * writer)4353 s_BlastHSPMapperFree(BlastHSPWriter* writer)
4354 {
4355    BlastHSPMapperData *data = writer->data;
4356    sfree(data->params);
4357    sfree(writer->data);
4358    sfree(writer);
4359    return NULL;
4360 }
4361 
4362 /** create the writer
4363  * @param params Pointer to the hit paramters [in]
4364  * @param query_info BlastQueryInfo (not used) [in]
4365  * @return writer
4366  */
4367 static
4368 BlastHSPWriter*
s_BlastHSPMapperPairedNew(void * params,BlastQueryInfo * query_info,BLAST_SequenceBlk * query)4369 s_BlastHSPMapperPairedNew(void* params, BlastQueryInfo* query_info,
4370                           BLAST_SequenceBlk* query)
4371 {
4372    BlastHSPWriter* writer = NULL;
4373    BlastHSPMapperData* data = NULL;
4374 /*   BlastHSPMapperParams * spl_param = params; */
4375 
4376    /* allocate space for writer */
4377    writer = malloc(sizeof(BlastHSPWriter));
4378 
4379    /* fill up the function pointers */
4380    writer->InitFnPtr   = &s_BlastHSPMapperPairedInit;
4381    writer->FinalFnPtr  = &s_BlastHSPMapperFinal;
4382    writer->FreeFnPtr   = &s_BlastHSPMapperFree;
4383    writer->RunFnPtr    = &s_BlastHSPMapperSplicedPairedRun;
4384 
4385    /* allocate for data structure */
4386    writer->data = calloc(1, sizeof(BlastHSPMapperData));
4387    data = writer->data;
4388    data->params = params;
4389    data->query_info = query_info;
4390    data->query = query;
4391 
4392    return writer;
4393 }
4394 
4395 
4396 /*************************************************************/
4397 /** The following are exported functions to be used by APP   */
4398 
4399 BlastHSPMapperParams*
BlastHSPMapperParamsNew(const BlastHitSavingOptions * hit_options,const BlastScoringOptions * scoring_options)4400 BlastHSPMapperParamsNew(const BlastHitSavingOptions* hit_options,
4401                         const BlastScoringOptions* scoring_options)
4402 {
4403        BlastHSPMapperParams* retval = NULL;
4404 
4405        if (hit_options == NULL)
4406            return NULL;
4407 
4408        retval = (BlastHSPMapperParams*)malloc(sizeof(BlastHSPMapperParams));
4409        retval->hitlist_size = MAX(hit_options->hitlist_size, 10);
4410        retval->paired = hit_options->paired;
4411        retval->splice = hit_options->splice;
4412        retval->program = hit_options->program_number;
4413        retval->scoring_options.reward = scoring_options->reward;
4414        retval->scoring_options.penalty = scoring_options->penalty;
4415        retval->scoring_options.gap_open = -scoring_options->gap_open;
4416        retval->scoring_options.gap_extend = -scoring_options->gap_extend;
4417        retval->scoring_options.no_splice_signal = -2;
4418        return retval;
4419 }
4420 
4421 BlastHSPMapperParams*
BlastHSPMapperParamsFree(BlastHSPMapperParams * opts)4422 BlastHSPMapperParamsFree(BlastHSPMapperParams* opts)
4423 {
4424     if ( !opts )
4425         return NULL;
4426     sfree(opts);
4427     return NULL;
4428 }
4429 
4430 BlastHSPWriterInfo*
BlastHSPMapperInfoNew(BlastHSPMapperParams * params)4431 BlastHSPMapperInfoNew(BlastHSPMapperParams* params) {
4432    BlastHSPWriterInfo * writer_info =
4433                         malloc(sizeof(BlastHSPWriterInfo));
4434 
4435    writer_info->NewFnPtr = &s_BlastHSPMapperPairedNew;
4436 
4437    writer_info->params = params;
4438    return writer_info;
4439 }
4440