1 /* $Id: blast_gapalign.h 504861 2016-06-20 15:45:40Z boratyng $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Ilya Dondoshansky
27  *
28  */
29 
30 /** @file blast_gapalign.h
31  * Structures and functions prototypes used for BLAST gapped extension
32  * @todo FIXME: elaborate on contents.
33  */
34 
35 #ifndef ALGO_BLAST_CORE__BLAST_GAPALIGN__H
36 #define ALGO_BLAST_CORE__BLAST_GAPALIGN__H
37 
38 #include <algo/blast/core/ncbi_std.h>
39 #include <algo/blast/core/blast_export.h>
40 #include <algo/blast/core/blast_def.h>
41 #include <algo/blast/core/blast_extend.h>
42 #include <algo/blast/core/blast_query_info.h>
43 #include <algo/blast/core/blast_parameters.h>
44 #include <algo/blast/core/gapinfo.h>
45 #include <algo/blast/core/greedy_align.h>
46 #include <algo/blast/core/blast_hits.h>
47 #include <algo/blast/core/blast_diagnostics.h>
48 
49 #ifdef __cplusplus
50 extern "C" {
51 #endif
52 
53 /** Split subject sequences if longer than this */
54 #define MAX_DBSEQ_LEN 5000000
55 
56 /** Auxiliary structure for dynamic programming gapped extension */
57 typedef struct {
58   Int4 best;            /**< score of best path that ends in a match
59                              at this position */
60   Int4 best_gap;        /**< score of best path that ends in a gap
61                              at this position */
62 } BlastGapDP;
63 
64 
65 typedef struct JumperGapAlign JumperGapAlign;
66 
67 
68 /** Structure supporting the gapped alignment */
69 typedef struct BlastGapAlignStruct {
70    Boolean positionBased; /**< Is this PSI-BLAST? */
71    GapStateArrayStruct* state_struct; /**< Structure to keep extension
72                                                 state information */
73    GapEditScript* edit_script; /**< The traceback (gap) information */
74    GapPrelimEditBlock *fwd_prelim_tback; /**< traceback from right extensions */
75    GapPrelimEditBlock *rev_prelim_tback; /**< traceback from left extensions */
76    SGreedyAlignMem* greedy_align_mem;/**< Preallocated memory for the greedy
77                                          gapped extension */
78    BlastGapDP* dp_mem; /**< scratch structures for dynamic programming */
79    Int4 dp_mem_alloc;  /**< current number of structures allocated */
80    BlastScoreBlk* sbp; /**< Pointer to the scoring information block */
81    Int4 gap_x_dropoff; /**< X-dropoff parameter to use */
82    Int4 max_mismatches;  /**< Max number of mismatches for jumper */
83    Int4 mismatch_window; /**< Window sie for mismatches for jumper */
84    Int4 query_start; /**< query start offset of current alignment */
85    Int4 query_stop; /**< query end offseet of current alignment */
86    Int4 subject_start;  /**< subject start offset current alignment */
87    Int4 subject_stop; /**< subject end offset of current alignment */
88    Int4 greedy_query_seed_start;  /**< for greedy alignments, the query
89                                        offset of the gapped start point */
90    Int4 greedy_subject_seed_start;  /**< for greedy alignments, the subject
91                                          offset of the gapped start point */
92    Int4 score;   /**< Return value: alignment score */
93 
94    JumperGapAlign* jumper;   /**< data for jumper alignment */
95 } BlastGapAlignStruct;
96 
97 /** Initializes the BlastGapAlignStruct structure
98  * @param score_params Parameters related to scoring alignments [in]
99  * @param ext_params parameters related to gapped extension [in]
100  * @param max_subject_length Maximum length of any subject sequence (needed
101  *        for greedy extension allocation only) [in]
102  * @param sbp The scoring information block [in]
103  * @param gap_align_ptr The BlastGapAlignStruct structure [out]
104 */
105 NCBI_XBLAST_EXPORT
106 Int2
107 BLAST_GapAlignStructNew(const BlastScoringParameters* score_params,
108    const BlastExtensionParameters* ext_params,
109    Uint4 max_subject_length, BlastScoreBlk* sbp,
110    BlastGapAlignStruct** gap_align_ptr);
111 
112 /** Deallocates memory in the BlastGapAlignStruct structure */
113 NCBI_XBLAST_EXPORT
114 BlastGapAlignStruct*
115 BLAST_GapAlignStructFree(BlastGapAlignStruct* gap_align);
116 
117 /** Performs gapped extension for all non-Mega BLAST programs, given
118  * that ungapped extension has been done earlier.
119  * Sorts initial HSPs by score (from ungapped extension);
120  * Deletes HSPs that are included in already extended HSPs;
121  * Performs gapped extension;
122  * Saves HSPs into an HSP list.
123  * @param program_number Type of BLAST program [in]
124  * @param query The query sequence block [in]
125  * @param query_info Query information structure, containing offsets into
126  *                   the concatenated sequence [in]
127  * @param subject The subject sequence block [in]
128  * @param gap_align The auxiliary structure for gapped alignment [in]
129  * @param score_params Options and parameters related to scoring [in]
130  * @param ext_params Options and parameters related to extensions [in]
131  * @param hit_params Options related to saving hits [in]
132  * @param init_hitlist List of initial HSPs (offset pairs with additional
133  *        information from the ungapped alignment performed earlier) [in]
134  * @param hsp_list_ptr Structure containing all saved HSPs [out]
135  * @param gapped_stats Return statistics (not filled if NULL) [out]
136  * @param fence_hit True is returned here if overrun is detected. [in]
137  */
138 NCBI_XBLAST_EXPORT
139 Int2 BLAST_GetGappedScore (EBlastProgramType program_number,
140             BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
141               BLAST_SequenceBlk* subject,
142               BlastGapAlignStruct* gap_align,
143               const BlastScoringParameters* score_params,
144               const BlastExtensionParameters* ext_params,
145               const BlastHitSavingParameters* hit_params,
146               BlastInitHitList* init_hitlist,
147               BlastHSPList** hsp_list_ptr, BlastGappedStats* gapped_stats,
148               Boolean * fence_hit);
149 
150 /** Perform a gapped alignment with traceback
151  * @param program Type of BLAST program [in]
152  * @param query The query sequence [in]
153  * @param subject The subject sequence [in]
154  * @param gap_align The gapped alignment structure [in] [out]
155  * @param score_params Scoring parameters [in]
156  * @param q_start Offset in query where to start alignment [in]
157  * @param s_start Offset in subject where to start alignment [in]
158  * @param query_length Maximal allowed extension in query [in]
159  * @param subject_length Maximal allowed extension in subject [in]
160  * @param fence_hit True is returned here if overrun is detected. [in]
161  */
162 NCBI_XBLAST_EXPORT
163 Int2 BLAST_GappedAlignmentWithTraceback(EBlastProgramType program,
164         const Uint1* query, const Uint1* subject,
165         BlastGapAlignStruct* gap_align,
166         const BlastScoringParameters* score_params,
167         Int4 q_start, Int4 s_start, Int4 query_length, Int4 subject_length,
168         Boolean * fence_hit);
169 
170 /** Greedy gapped alignment, with or without traceback.
171  * Given two sequences, relevant options and an offset pair, fills the
172  * gap_align structure with alignment endpoints and, if traceback is
173  * performed, gap information.
174  * @param query The query sequence [in]
175  * @param subject The subject sequence [in]
176  * @param query_length The query sequence length [in]
177  * @param subject_length The subject sequence length [in]
178  * @param gap_align The structure holding various information and memory
179  *        needed for gapped alignment [in] [out]
180  * @param score_params Parameters related to scoring alignments [in]
181  * @param q_off Starting offset in query [in]
182  * @param s_off Starting offset in subject [in]
183  * @param compressed_subject Is subject sequence compressed? [in]
184  * @param do_traceback Should traceback be saved? [in]
185  * @param fence_hit True is returned here if overrun is detected. [in]
186  */
187 NCBI_XBLAST_EXPORT
188 Int2
189 BLAST_GreedyGappedAlignment(const Uint1* query, const Uint1* subject,
190    Int4 query_length, Int4 subject_length, BlastGapAlignStruct* gap_align,
191    const BlastScoringParameters* score_params,
192    Int4 q_off, Int4 s_off, Boolean compressed_subject, Boolean do_traceback,
193    Boolean * fence_hit);
194 
195 /** Convert initial HSP list to an HSP list: to be used in ungapped search.
196  * Ungapped data must be available in the initial HSP list for this function
197  * to work.
198  * @param init_hitlist List of initial HSPs with ungapped extension
199  *                     information [in]
200  * @param query_info Query information structure, containing offsets into
201  *                   the concatenated queries/strands/frames [in]
202  * @param subject Subject sequence block containing frame information [in]
203  * @param hit_options Hit saving options [in]
204  * @param hsp_list_ptr HSPs in the final form [out]
205  */
206 NCBI_XBLAST_EXPORT
207 Int2 BLAST_GetUngappedHSPList(BlastInitHitList* init_hitlist,
208         BlastQueryInfo* query_info, BLAST_SequenceBlk* subject,
209         const BlastHitSavingOptions* hit_options,
210         BlastHSPList** hsp_list_ptr);
211 
212 /** Adjusts range of subject sequence to be passed for gapped extension,
213  * taking into account the length and starting position of the alignment in
214  * query.
215  * @param subject_offset_ptr Start of the subject range [out]
216  * @param subject_length_ptr Length of the subject range [out]
217  * @param query_offset Offset in query from which alignment starts [in]
218  * @param query_length Length of the query sequence [in]
219  * @param start_shift The offset by which the output range is shifted with
220  *                    respect to the full subject sequence [out]
221  */
222 NCBI_XBLAST_EXPORT
223 void
224 AdjustSubjectRange(Int4* subject_offset_ptr, Int4* subject_length_ptr,
225                    Int4 query_offset, Int4 query_length, Int4* start_shift);
226 
227 /** Function to look for the highest scoring window (of size HSP_MAX_WINDOW)
228  * in an HSP and return the middle of this.  Used by the gapped-alignment
229  * functions to start the gapped alignments.
230  * @param query The query sequence [in]
231  * @param subject The subject sequence [in]
232  * @param sbp Scoring block, containing matrix [in]
233  * @param q_start Starting offset in query [in]
234  * @param q_length Length of HSP in query [in]
235  * @param s_start Starting offset in subject [in]
236  * @param s_length Length of HSP in subject [in]
237  * @return The offset at which alignment should be started [out]
238 */
239 NCBI_XBLAST_EXPORT
240 Int4
241 BlastGetStartForGappedAlignment (const Uint1* query, const Uint1* subject,
242    const BlastScoreBlk* sbp, Uint4 q_start, Uint4 q_length,
243    Uint4 s_start, Uint4 s_length);
244 
245 /** Function to look for the longest identity match run (up to size HSP_MAX_IDENT_RUN)
246  * in an HSP and return the middle of this.  Used by the gapped-alignment
247  * functions to start the gapped alignments.
248  * @param query The query sequence [in]
249  * @param subject The subject sequence [in]
250  * @param hsp On return, the gapped_start will be filled. [in][out]
251 */
252 NCBI_XBLAST_EXPORT
253 void
254 BlastGetStartForGappedAlignmentNucl (const Uint1* query, const Uint1* subject,
255    BlastHSP* hsp);
256 
257 /** Function to look for the highest scoring window (of size HSP_MAX_WINDOW)
258  * in an HSP and return the middle of this.  Used by the gapped-alignment
259  * functions to start the gapped alignments.
260  * Should be used instead of BlastGetStartForGappedAlignment
261  * @param query The query sequence [in]
262  * @param subject The subject sequence [in]
263  * @param sbp Scoring block, containing matrix [in]
264  * @param hsp start and stops of HSP [in]
265  * @param q_retval query offset to use [out]
266  * @param s_retval subject offset to use [out]
267  *
268 */
269 NCBI_XBLAST_EXPORT
270 Boolean
271 BlastGetOffsetsForGappedAlignment (const Uint1* query, const Uint1* subject,
272    const BlastScoreBlk* sbp, BlastHSP* hsp, Int4* q_retval, Int4* s_retval);
273 
274 #ifdef __cplusplus
275 }
276 #endif
277 #endif /* !ALGO_BLAST_CORE__BLAST_GAPALIGN__H */
278