1 /* $Id: redo_alignment.h,v 1.17 2016/06/23 13:19:16 fukanchi Exp $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ==========================================================================*/
25 /**
26  * @file redo_alignment.h
27  * Definitions used to redo a set of alignments, using either
28  * composition matrix adjustment or the Smith-Waterman algorithm (or
29  * both.)
30  *
31  * Definitions with the prefix 'BlastCompo_' are primarily intended for use
32  * by glue code that interfaces with this module, i.e. the definitions
33  * need to be externally available so that glue code may be written, but
34  * are not intended for general use.
35  *
36  * @author Alejandro Schaffer, E. Michael Gertz
37  */
38 #ifndef __REDO_ALIGNMENT__
39 #define __REDO_ALIGNMENT__
40 
41 #include <algo/blast/composition_adjustment/composition_adjustment.h>
42 #include <algo/blast/composition_adjustment/composition_constants.h>
43 #include <algo/blast/composition_adjustment/smith_waterman.h>
44 #include <algo/blast/composition_adjustment/compo_heap.h>
45 
46 #ifdef __cplusplus
47 extern "C" {
48 #endif
49 
50 /**
51  * Within the composition adjustment module, an object of type
52  * BlastCompo_Alignment represents a distinct alignment of the query
53  * sequence to the current subject sequence.  These objects are
54  * typically part of a singly linked list of distinct alignments,
55  * stored in the reverse of the order in which they were computed.
56  */
57 typedef struct BlastCompo_Alignment {
58     int score;           /**< the score of this alignment */
59     EMatrixAdjustRule matrix_adjust_rule;
60     /**< how the score matrix was computed */
61     int queryIndex;      /**< index of the query in a concatenated query */
62     int queryStart;      /**< the start of the alignment in the query */
63     int queryEnd;        /**< one past the end of the alignment in the query */
64     int matchStart;      /**< the start of the alignment in the subject */
65     int matchEnd;        /**< one past the end of the alignment in the
66                               subject */
67     int frame;           /**< the subject frame */
68     void * context;    /**< traceback info for a gapped alignment */
69     struct BlastCompo_Alignment * next;  /**< the next alignment in the
70                                               list */
71 } BlastCompo_Alignment;
72 
73 
74 /**
75  * Create a new BlastCompo_Alignment; parameters to this function
76  * correspond directly to fields of BlastCompo_Alignment */
77 NCBI_XBLAST_EXPORT
78 BlastCompo_Alignment *
79 BlastCompo_AlignmentNew(int score,
80                         EMatrixAdjustRule whichRule,
81                         int queryIndex, int queryStart, int queryEnd,
82                         int matchStart, int matchEnd, int frame,
83                         void * context);
84 
85 
86 /**
87  * Recursively free all alignments in the singly linked list whose
88  * head is *palign. Set *palign to NULL.
89  *
90  * @param palign            pointer to the head of a singly linked list
91  *                          of alignments.
92  * @param free_context      a function capable of freeing the context
93  *                          field of an alignment, or NULL if the
94  *                          context field should not be freed
95  */
96 NCBI_XBLAST_EXPORT
97 void BlastCompo_AlignmentsFree(BlastCompo_Alignment ** palign,
98                                void (*free_context)(void*));
99 
100 
101 /** Parameters used to compute gapped alignments */
102 typedef struct BlastCompo_GappingParams {
103     int gap_open;        /**< penalty for opening a gap */
104     int gap_extend;      /**< penalty for extending a gapped alignment by
105                               one residue */
106     int decline_align;   /**< penalty for declining to align two characters */
107     int x_dropoff;       /**< for x-drop algorithms, once a path falls below
108                               the best score by this (positive) amount, the
109                               path is no longer searched */
110     void * context;      /**< a pointer to any additional gapping parameters
111                               that may be needed by the calling routine. */
112 } BlastCompo_GappingParams;
113 
114 
115 /**
116  * BlastCompo_SequenceRange - a struct whose instances represent a range
117  * of data in a sequence. */
118 typedef struct BlastCompo_SequenceRange
119 {
120     int begin;    /**< the starting index of the range */
121     int end;      /**< one beyond the last item in the range */
122     int context;  /**< integer identifier for this window, can
123                        indicate a translation frame or an index into a
124                        set of sequences. */
125 } BlastCompo_SequenceRange;
126 
127 
128 /**
129  * BlastCompo_SequenceData - represents a string of amino acids or nucleotides
130  */
131 typedef struct BlastCompo_SequenceData {
132     Uint1 * data;                /**< amino acid or nucleotide data */
133     int length;                  /**< the length of data. For amino acid data
134                                    &data[-1] is a valid address and
135                                    data[-1] == 0. */
136     Uint1 * buffer;               /**< if non-nil, points to memory that
137                                     must be freed when this instance of
138                                     BlastCompo_SequenceData is deleted. */
139 } BlastCompo_SequenceData;
140 
141 
142 /**
143  * A BlastCompo_MatchingSequence represents a subject sequence to be aligned
144  * with the query.  This abstract sequence is used to hide the
145  * complexity associated with actually obtaining and releasing the
146  * data for a matching sequence, e.g. reading the sequence from a DB
147  * or translating it from a nucleotide sequence.
148  *
149  * We draw a distinction between a sequence itself, and strings of
150  * data that may be obtained from the sequence.  The amino
151  * acid/nucleotide data is represented by an object of type
152  * BlastCompo_SequenceData.  There may be more than one instance of
153  * BlastCompo_SequenceData per BlastCompo_MatchingSequence, each representing a
154  * different range in the sequence, or a different translation frame.
155  */
156 typedef struct BlastCompo_MatchingSequence {
157   Int4          length;         /**< length of this matching sequence */
158   Int4          index;          /**< index of this sequence in the database */
159   void * local_data;            /**< holds any sort of data that is
160                                      necessary for callbacks to access
161                                      the sequence */
162 } BlastCompo_MatchingSequence;
163 
164 
165 /** Collected information about a query */
166 typedef struct BlastCompo_QueryInfo {
167     int origin;               /**< origin of the query in a
168                                    concatenated query */
169     BlastCompo_SequenceData seq;   /**< sequence data for the query */
170     Blast_AminoAcidComposition composition;   /**< the composition of
171                                                    the query */
172     double eff_search_space;  /**< effective search space of searches
173                                    involving this query */
174 
175     Uint8* words;             /**< list words in the query, needed for
176                                    testing whether the query and a subject
177                                    are nearly identical */
178 } BlastCompo_QueryInfo;
179 
180 
181 /** Callbacks **/
182 
183 /** Function type: calculate the statistical parameter Lambda from a
184  * set of score probabilities.
185  *
186  * @param probs            an array of score probabilities
187  * @param min_score        the score corresponding to probs[0]
188  * @param max_score        the largest score in the probs array
189  * @param lambda0          an initial guess for Lambda
190  * @return Lambda
191  */
192 typedef double
193 calc_lambda_type(double * probs, int min_score, int max_score,
194                  double lambda0);
195 
196 /**
197  * Function type: Get a range of data for a sequence.
198  *
199  * @param sequence        a sequence
200  * @param range           the range to get
201  * @param data            the matching sequence data obtained
202  * @param queryData       the sequence data for the query
203  * @param queryOffset     offset for align if there are multiple queries
204  * @param align           information about the alignment between query and subject
205  * @param query_words     a list of hashes of query words (used for testing
206  *                        for near identical sequences)
207  * @param shouldTestIdentical did alignment pass a preliminary test in
208  *                       redo_alignment.c that indicates the sequence
209  *                        pieces may be near identical
210  * @param subject_maybe_biased If false, subject is not biased, otherwise it is
211  *                             unknown [in|out]
212  */
213 typedef int
214 get_range_type(const BlastCompo_MatchingSequence * sequence,
215                const BlastCompo_SequenceRange * range,
216                BlastCompo_SequenceData * data,
217 	       const BlastCompo_SequenceData * origQuery,
218                const BlastCompo_SequenceRange * q_range,
219                BlastCompo_SequenceData * q_data,
220                const Uint8* query_words,
221 	       const BlastCompo_Alignment *align,
222 	       const Boolean shouldTestIdentical,
223 	       const ECompoAdjustModes compo_adjust_mode,
224 	       const Boolean isSmithWaterman,
225 	       Boolean* subject_maybe_biased);
226 
227 /**
228  * Function type: Calculate the traceback for one alignment by
229  * performing an x-drop alignment in both directions
230  *
231  * @param in_align         the existing alignment, without traceback
232  * @param matrix_adjust_rule   rule used to compute the scoring matrix
233  * @param whichMode        which mode of composition adjustment has
234  *                         been used to adjust the scoring matrix
235  * @param query_data       query sequence data
236  * @param query_range      range of this query in the concatenated
237  *                         query
238  * @param ccat_query_length   total length of the concatenated query
239  * @param subject_data     subject sequence data
240  * @param subject_range    range of subject_data in the translated
241  *                         query, in amino acid coordinates
242  * @param full_subject_length   length of the full subject sequence
243  * @param gapping_params        parameters used to compute gapped
244  *                              alignments
245  */
246 typedef BlastCompo_Alignment *
247 redo_one_alignment_type(BlastCompo_Alignment * in_align,
248                         EMatrixAdjustRule matrix_adjust_rule,
249                         BlastCompo_SequenceData * query_data,
250                         BlastCompo_SequenceRange * query_range,
251                         int ccat_query_length,
252                         BlastCompo_SequenceData * subject_data,
253                         BlastCompo_SequenceRange * subject_range,
254                         int full_subject_length,
255                         BlastCompo_GappingParams * gapping_params);
256 
257 /**
258  * Function type: Calculate the traceback for one alignment by
259  * performing an x-drop alignment in the forward direction, possibly
260  * increasing the x-drop parameter until the desired score is
261  * attained.
262  *
263  * The start, end and score of the alignment should be obtained
264  * using the Smith-Waterman algorithm before this routine is called.
265  *
266  * @param *palign          the new alignment
267  * @param *pqueryEnd       on entry, the end of the alignment in the
268  *                         query, as computed by the Smith-Waterman
269  *                         algorithm.  On exit, the end as computed by
270  *                         the x-drop algorithm
271  * @param *pmatchEnd       like as *pqueryEnd, but for the subject
272  *                         sequence
273  * @param queryStart       the starting point in the query
274  * @param matchStart       the starting point in the subject
275  * @param score            the score of the alignment, as computed by
276  *                         the Smith-Waterman algorithm
277  * @param query            query sequence data
278  * @param query_range      range of this query in the concatenated
279  *                         query
280  * @param ccat_query_length   total length of the concatenated query
281  * @param subject          subject sequence data
282  * @param subject_range    range of subject_data in the translated
283  *                         query, in amino acid coordinates
284  * @param full_subject_length   length of the full subject sequence
285  * @param gapping_params        parameters used to compute gapped
286  *                              alignments
287  * @param matrix_adjust_rule   rule used to compute the scoring matrix
288  * @return   0 on success, -1 for out-of-memory error
289  */
290 typedef int
291 new_xdrop_align_type(BlastCompo_Alignment **palign,
292                      Int4 * pqueryEnd, Int4 * pmatchEnd,
293                      Int4 queryStart, Int4 matchStart, Int4 score,
294                      BlastCompo_SequenceData * query,
295                      BlastCompo_SequenceRange * query_range,
296                      Int4 ccat_query_length,
297                      BlastCompo_SequenceData * subject,
298                      BlastCompo_SequenceRange * subject_range,
299                      Int4 full_subject_length,
300                      BlastCompo_GappingParams * gapping_params,
301                      EMatrixAdjustRule matrix_adjust_rule);
302 
303 /** Function type: free traceback data in a BlastCompo_Alignment.
304  *
305  *  @param traceback_data   data (of unknown type) to be freed
306  */
307 typedef void free_align_traceback_type(void * traceback_data);
308 
309 
310 /** Callbacks used by Blast_RedoOneMatch and
311  * Blast_RedoOneMatchSmithWaterman routines */
312 typedef struct Blast_RedoAlignCallbacks {
313     /** @sa calc_lambda_type */
314     calc_lambda_type * calc_lambda;
315     /** @sa get_range_type */
316     get_range_type * get_range;
317     /** @sa redo_one_alignment_type */
318     redo_one_alignment_type * redo_one_alignment;
319     /** @sa new_xdrop_align_type */
320     new_xdrop_align_type * new_xdrop_align;
321     /** @sa free_align_traceback_type */
322     free_align_traceback_type * free_align_traceback;
323 } Blast_RedoAlignCallbacks;
324 
325 
326 /** A parameter block for the Blast_RedoOneMatch and
327  * Blast_RedoOneMatchSmithWaterman routines */
328 typedef struct Blast_RedoAlignParams {
329     Blast_MatrixInfo * matrix_info;  /**< information about the scoring
330                                           matrix used */
331     BlastCompo_GappingParams *
332         gapping_params;              /**< parameters for performing a
333                                           gapped alignment */
334     ECompoAdjustModes compo_adjust_mode;   /**< composition adjustment mode */
335     int positionBased;      /**< true if the search is position-based */
336     int RE_pseudocounts;    /**< number of pseudocounts to use in
337                                  relative-entropy based composition
338                                  adjustment. */
339     int subject_is_translated; /**< true if the subject is translated */
340     int query_is_translated; /**< true if the query is translated */
341     int ccat_query_length;  /**< length of the concatenated query, or
342                                 just the length of the query if not
343                                 concatenated */
344     int cutoff_s;           /**< cutoff score for saving alignments when
345                                  HSP linking is used */
346     double cutoff_e;        /**< cutoff e-value for saving alignments */
347     int do_link_hsps;       /**< if true, then HSP linking and sum
348                                statistics are used to computed e-values */
349     const Blast_RedoAlignCallbacks *
350         callbacks;                     /**< callback functions used by
351                                             the Blast_RedoAlign* functions */
352 
353     double near_identical_cutoff;  /**< alignments with raw score per residue
354                                         above this value will be tested if
355                                         sequences are near identical to avoid
356                                         segging */
357 } Blast_RedoAlignParams;
358 
359 
360 /** Create new Blast_RedoAlignParams object.  The parameters of this
361  * function correspond directly to the fields of
362  * Blast_RedoAlignParams.  The new Blast_RedoAlignParams object takes
363  * possession of *pmatrix_info and *pgapping_params, so these values
364  * are set to NULL on exit. */
365 NCBI_XBLAST_EXPORT
366 Blast_RedoAlignParams *
367 Blast_RedoAlignParamsNew(Blast_MatrixInfo ** pmatrix_info,
368                          BlastCompo_GappingParams **pgapping_params,
369                          ECompoAdjustModes compo_adjust_mode,
370                          int positionBased,
371                          int subject_is_translated,
372                          int query_is_translated,
373                          int ccat_query_length, int cutoff_s,
374                          double cutoff_e, int do_link_hsps,
375                          const Blast_RedoAlignCallbacks * callbacks,
376                          double near_identical_cutoff);
377 
378 
379 /** Free a set of Blast_RedoAlignParams */
380 NCBI_XBLAST_EXPORT
381 void Blast_RedoAlignParamsFree(Blast_RedoAlignParams ** pparams);
382 
383 
384 /**
385  * Recompute all alignments for one query/subject pair using the
386  * Smith-Waterman algorithm and possibly also composition-based
387  * statistics or composition-based matrix adjustment.
388  *
389  * @param alignments       an array of lists containing the newly
390  *                         computed alignments.  There is one array
391  *                         element for each query in the original
392  *                         search
393  * @param params           parameters used to redo the alignments
394  * @param incoming_aligns  a list of existing alignments
395  * @param hspcnt           length of incoming_aligns
396  * @param Lambda           statistical parameter
397  * @param logK             statistical parameter
398  * @param matchingSeq      the database sequence
399  * @param query_info       information about all queries
400  * @param numQueries       the number of queries
401  * @param matrix           the scoring matrix
402  * @param alphsize         the size of the alphabet
403  * @param NRrecord         a workspace used to adjust the composition.
404  * @param forbidden        a workspace used to hold forbidden ranges
405  *                         for the Smith-Waterman algorithm.
406  * @param significantMatches   an array of heaps of alignments for
407  *                             query-subject pairs that have already
408  *                             been redone; used to terminate the
409  *                             Smith-Waterman algorithm early if it is
410  *                             clear that the current match is not
411  *                             significant enough to be saved.
412  * @param pvalueThisPair   the compositional p-value for this pair of sequences
413  * @param compositionTestIndex   index of the test function used to decide
414  *                               whether to use a compositional p-value
415  * @param LambdaRatio      the ratio of the actual value of Lambda to the
416  *                         ideal value.
417  *
418  * @return 0 on success, -1 on out-of-memory
419  */
420 NCBI_XBLAST_EXPORT
421 int Blast_RedoOneMatchSmithWaterman(BlastCompo_Alignment ** alignments,
422                                     Blast_RedoAlignParams * params,
423                                     BlastCompo_Alignment * incoming_aligns,
424                                     int hspcnt,
425                                     double Lambda, double logK,
426                                     BlastCompo_MatchingSequence * matchingSeq,
427                                     BlastCompo_QueryInfo query_info[],
428                                     int numQueries,
429                                     int ** matrix, int alphsize,
430                                     Blast_CompositionWorkspace * NRrecord,
431                                     Blast_ForbiddenRanges * forbidden,
432                                     BlastCompo_Heap * significantMatches,
433                                     double *pvalueThisPair,
434                                     int compositionTestIndex,
435                                     double *LambdaRatio);
436 
437 
438 /**
439  * Recompute all alignments for one query/subject pair using
440  * composition-based statistics or composition-based matrix adjustment.
441  *
442  * @param alignments       an array of lists containing the newly
443  *                         computed alignments.  There is one array
444  *                         element for each query in the original
445  *                         search
446  * @param params           parameters used to redo the alignments
447  * @param incoming_aligns  a list of existing alignments
448  * @param hspcnt           length of incoming_aligns
449  * @param Lambda           statistical parameter
450  * @param matchingSeq      the database sequence
451  * @param ccat_query_length  the length of the concatenated query
452  * @param query_info       information about all queries
453  * @param numQueries       the number of queries
454  * @param matrix           the scoring matrix
455  * @param alphsize         the size of the alphabet
456  * @param NRrecord         a workspace used to adjust the composition.
457  * @param pvalueThisPair   the compositional p-value for this pair of sequences
458  * @param compositionTestIndex   index of the test function used to decide
459  *                               whether to use a compositional p-value
460  * @param LambdaRatio      the ratio of the actual value of Lambda to the
461  *                         ideal value.
462  *
463  * @return 0 on success, -1 on out-of-memory
464  */
465 NCBI_XBLAST_EXPORT
466 int Blast_RedoOneMatch(BlastCompo_Alignment ** alignments,
467                        Blast_RedoAlignParams * params,
468                        BlastCompo_Alignment * incoming_aligns,
469                        int hspcnt,
470                        double Lambda,
471                        BlastCompo_MatchingSequence * matchingSeq,
472                        int ccat_query_length,
473                        BlastCompo_QueryInfo query_info[],
474                        int numQueries,
475                        int ** matrix, int alphsize,
476                        Blast_CompositionWorkspace * NRrecord,
477                        double *pvalueThisPair,
478                        int compositionTestIndex,
479                        double *LambdaRatio);
480 
481 
482 /** Return true if a heuristic determines that it is unlikely to be
483  * worthwhile to redo a query-subject pair with the given e-value; used
484  * to terminate the main loop for redoing all alignments early. */
485 NCBI_XBLAST_EXPORT
486 int BlastCompo_EarlyTermination(double evalue,
487                                 BlastCompo_Heap significantMatches[],
488                                 int numQueries);
489 
490 #ifndef GET_SEQ_FRAME
491 #define GET_SEQ_FRAME(f) ((f<0)?ABS(f)+2: f-1)
492 #endif
493 
494 #ifndef GET_NUCL_LENGTH
495 #define GET_NUCL_LENGTH(l) ((l-5)/2 +2)
496 #endif
497 
498 #ifndef GET_TRANSLATED_LENGTH
499 #define GET_TRANSLATED_LENGTH(l,f) ((l-f% 3)/3)
500 #endif
501 
502 
503 #ifdef __cplusplus
504 }
505 #endif
506 
507 #endif
508