1 /* $Id: redo_alignment.h,v 1.17 2016/06/23 13:19:16 fukanchi Exp $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * ==========================================================================*/ 25 /** 26 * @file redo_alignment.h 27 * Definitions used to redo a set of alignments, using either 28 * composition matrix adjustment or the Smith-Waterman algorithm (or 29 * both.) 30 * 31 * Definitions with the prefix 'BlastCompo_' are primarily intended for use 32 * by glue code that interfaces with this module, i.e. the definitions 33 * need to be externally available so that glue code may be written, but 34 * are not intended for general use. 35 * 36 * @author Alejandro Schaffer, E. Michael Gertz 37 */ 38 #ifndef __REDO_ALIGNMENT__ 39 #define __REDO_ALIGNMENT__ 40 41 #include <algo/blast/composition_adjustment/composition_adjustment.h> 42 #include <algo/blast/composition_adjustment/composition_constants.h> 43 #include <algo/blast/composition_adjustment/smith_waterman.h> 44 #include <algo/blast/composition_adjustment/compo_heap.h> 45 46 #ifdef __cplusplus 47 extern "C" { 48 #endif 49 50 /** 51 * Within the composition adjustment module, an object of type 52 * BlastCompo_Alignment represents a distinct alignment of the query 53 * sequence to the current subject sequence. These objects are 54 * typically part of a singly linked list of distinct alignments, 55 * stored in the reverse of the order in which they were computed. 56 */ 57 typedef struct BlastCompo_Alignment { 58 int score; /**< the score of this alignment */ 59 EMatrixAdjustRule matrix_adjust_rule; 60 /**< how the score matrix was computed */ 61 int queryIndex; /**< index of the query in a concatenated query */ 62 int queryStart; /**< the start of the alignment in the query */ 63 int queryEnd; /**< one past the end of the alignment in the query */ 64 int matchStart; /**< the start of the alignment in the subject */ 65 int matchEnd; /**< one past the end of the alignment in the 66 subject */ 67 int frame; /**< the subject frame */ 68 void * context; /**< traceback info for a gapped alignment */ 69 struct BlastCompo_Alignment * next; /**< the next alignment in the 70 list */ 71 } BlastCompo_Alignment; 72 73 74 /** 75 * Create a new BlastCompo_Alignment; parameters to this function 76 * correspond directly to fields of BlastCompo_Alignment */ 77 NCBI_XBLAST_EXPORT 78 BlastCompo_Alignment * 79 BlastCompo_AlignmentNew(int score, 80 EMatrixAdjustRule whichRule, 81 int queryIndex, int queryStart, int queryEnd, 82 int matchStart, int matchEnd, int frame, 83 void * context); 84 85 86 /** 87 * Recursively free all alignments in the singly linked list whose 88 * head is *palign. Set *palign to NULL. 89 * 90 * @param palign pointer to the head of a singly linked list 91 * of alignments. 92 * @param free_context a function capable of freeing the context 93 * field of an alignment, or NULL if the 94 * context field should not be freed 95 */ 96 NCBI_XBLAST_EXPORT 97 void BlastCompo_AlignmentsFree(BlastCompo_Alignment ** palign, 98 void (*free_context)(void*)); 99 100 101 /** Parameters used to compute gapped alignments */ 102 typedef struct BlastCompo_GappingParams { 103 int gap_open; /**< penalty for opening a gap */ 104 int gap_extend; /**< penalty for extending a gapped alignment by 105 one residue */ 106 int decline_align; /**< penalty for declining to align two characters */ 107 int x_dropoff; /**< for x-drop algorithms, once a path falls below 108 the best score by this (positive) amount, the 109 path is no longer searched */ 110 void * context; /**< a pointer to any additional gapping parameters 111 that may be needed by the calling routine. */ 112 } BlastCompo_GappingParams; 113 114 115 /** 116 * BlastCompo_SequenceRange - a struct whose instances represent a range 117 * of data in a sequence. */ 118 typedef struct BlastCompo_SequenceRange 119 { 120 int begin; /**< the starting index of the range */ 121 int end; /**< one beyond the last item in the range */ 122 int context; /**< integer identifier for this window, can 123 indicate a translation frame or an index into a 124 set of sequences. */ 125 } BlastCompo_SequenceRange; 126 127 128 /** 129 * BlastCompo_SequenceData - represents a string of amino acids or nucleotides 130 */ 131 typedef struct BlastCompo_SequenceData { 132 Uint1 * data; /**< amino acid or nucleotide data */ 133 int length; /**< the length of data. For amino acid data 134 &data[-1] is a valid address and 135 data[-1] == 0. */ 136 Uint1 * buffer; /**< if non-nil, points to memory that 137 must be freed when this instance of 138 BlastCompo_SequenceData is deleted. */ 139 } BlastCompo_SequenceData; 140 141 142 /** 143 * A BlastCompo_MatchingSequence represents a subject sequence to be aligned 144 * with the query. This abstract sequence is used to hide the 145 * complexity associated with actually obtaining and releasing the 146 * data for a matching sequence, e.g. reading the sequence from a DB 147 * or translating it from a nucleotide sequence. 148 * 149 * We draw a distinction between a sequence itself, and strings of 150 * data that may be obtained from the sequence. The amino 151 * acid/nucleotide data is represented by an object of type 152 * BlastCompo_SequenceData. There may be more than one instance of 153 * BlastCompo_SequenceData per BlastCompo_MatchingSequence, each representing a 154 * different range in the sequence, or a different translation frame. 155 */ 156 typedef struct BlastCompo_MatchingSequence { 157 Int4 length; /**< length of this matching sequence */ 158 Int4 index; /**< index of this sequence in the database */ 159 void * local_data; /**< holds any sort of data that is 160 necessary for callbacks to access 161 the sequence */ 162 } BlastCompo_MatchingSequence; 163 164 165 /** Collected information about a query */ 166 typedef struct BlastCompo_QueryInfo { 167 int origin; /**< origin of the query in a 168 concatenated query */ 169 BlastCompo_SequenceData seq; /**< sequence data for the query */ 170 Blast_AminoAcidComposition composition; /**< the composition of 171 the query */ 172 double eff_search_space; /**< effective search space of searches 173 involving this query */ 174 175 Uint8* words; /**< list words in the query, needed for 176 testing whether the query and a subject 177 are nearly identical */ 178 } BlastCompo_QueryInfo; 179 180 181 /** Callbacks **/ 182 183 /** Function type: calculate the statistical parameter Lambda from a 184 * set of score probabilities. 185 * 186 * @param probs an array of score probabilities 187 * @param min_score the score corresponding to probs[0] 188 * @param max_score the largest score in the probs array 189 * @param lambda0 an initial guess for Lambda 190 * @return Lambda 191 */ 192 typedef double 193 calc_lambda_type(double * probs, int min_score, int max_score, 194 double lambda0); 195 196 /** 197 * Function type: Get a range of data for a sequence. 198 * 199 * @param sequence a sequence 200 * @param range the range to get 201 * @param data the matching sequence data obtained 202 * @param queryData the sequence data for the query 203 * @param queryOffset offset for align if there are multiple queries 204 * @param align information about the alignment between query and subject 205 * @param query_words a list of hashes of query words (used for testing 206 * for near identical sequences) 207 * @param shouldTestIdentical did alignment pass a preliminary test in 208 * redo_alignment.c that indicates the sequence 209 * pieces may be near identical 210 * @param subject_maybe_biased If false, subject is not biased, otherwise it is 211 * unknown [in|out] 212 */ 213 typedef int 214 get_range_type(const BlastCompo_MatchingSequence * sequence, 215 const BlastCompo_SequenceRange * range, 216 BlastCompo_SequenceData * data, 217 const BlastCompo_SequenceData * origQuery, 218 const BlastCompo_SequenceRange * q_range, 219 BlastCompo_SequenceData * q_data, 220 const Uint8* query_words, 221 const BlastCompo_Alignment *align, 222 const Boolean shouldTestIdentical, 223 const ECompoAdjustModes compo_adjust_mode, 224 const Boolean isSmithWaterman, 225 Boolean* subject_maybe_biased); 226 227 /** 228 * Function type: Calculate the traceback for one alignment by 229 * performing an x-drop alignment in both directions 230 * 231 * @param in_align the existing alignment, without traceback 232 * @param matrix_adjust_rule rule used to compute the scoring matrix 233 * @param whichMode which mode of composition adjustment has 234 * been used to adjust the scoring matrix 235 * @param query_data query sequence data 236 * @param query_range range of this query in the concatenated 237 * query 238 * @param ccat_query_length total length of the concatenated query 239 * @param subject_data subject sequence data 240 * @param subject_range range of subject_data in the translated 241 * query, in amino acid coordinates 242 * @param full_subject_length length of the full subject sequence 243 * @param gapping_params parameters used to compute gapped 244 * alignments 245 */ 246 typedef BlastCompo_Alignment * 247 redo_one_alignment_type(BlastCompo_Alignment * in_align, 248 EMatrixAdjustRule matrix_adjust_rule, 249 BlastCompo_SequenceData * query_data, 250 BlastCompo_SequenceRange * query_range, 251 int ccat_query_length, 252 BlastCompo_SequenceData * subject_data, 253 BlastCompo_SequenceRange * subject_range, 254 int full_subject_length, 255 BlastCompo_GappingParams * gapping_params); 256 257 /** 258 * Function type: Calculate the traceback for one alignment by 259 * performing an x-drop alignment in the forward direction, possibly 260 * increasing the x-drop parameter until the desired score is 261 * attained. 262 * 263 * The start, end and score of the alignment should be obtained 264 * using the Smith-Waterman algorithm before this routine is called. 265 * 266 * @param *palign the new alignment 267 * @param *pqueryEnd on entry, the end of the alignment in the 268 * query, as computed by the Smith-Waterman 269 * algorithm. On exit, the end as computed by 270 * the x-drop algorithm 271 * @param *pmatchEnd like as *pqueryEnd, but for the subject 272 * sequence 273 * @param queryStart the starting point in the query 274 * @param matchStart the starting point in the subject 275 * @param score the score of the alignment, as computed by 276 * the Smith-Waterman algorithm 277 * @param query query sequence data 278 * @param query_range range of this query in the concatenated 279 * query 280 * @param ccat_query_length total length of the concatenated query 281 * @param subject subject sequence data 282 * @param subject_range range of subject_data in the translated 283 * query, in amino acid coordinates 284 * @param full_subject_length length of the full subject sequence 285 * @param gapping_params parameters used to compute gapped 286 * alignments 287 * @param matrix_adjust_rule rule used to compute the scoring matrix 288 * @return 0 on success, -1 for out-of-memory error 289 */ 290 typedef int 291 new_xdrop_align_type(BlastCompo_Alignment **palign, 292 Int4 * pqueryEnd, Int4 * pmatchEnd, 293 Int4 queryStart, Int4 matchStart, Int4 score, 294 BlastCompo_SequenceData * query, 295 BlastCompo_SequenceRange * query_range, 296 Int4 ccat_query_length, 297 BlastCompo_SequenceData * subject, 298 BlastCompo_SequenceRange * subject_range, 299 Int4 full_subject_length, 300 BlastCompo_GappingParams * gapping_params, 301 EMatrixAdjustRule matrix_adjust_rule); 302 303 /** Function type: free traceback data in a BlastCompo_Alignment. 304 * 305 * @param traceback_data data (of unknown type) to be freed 306 */ 307 typedef void free_align_traceback_type(void * traceback_data); 308 309 310 /** Callbacks used by Blast_RedoOneMatch and 311 * Blast_RedoOneMatchSmithWaterman routines */ 312 typedef struct Blast_RedoAlignCallbacks { 313 /** @sa calc_lambda_type */ 314 calc_lambda_type * calc_lambda; 315 /** @sa get_range_type */ 316 get_range_type * get_range; 317 /** @sa redo_one_alignment_type */ 318 redo_one_alignment_type * redo_one_alignment; 319 /** @sa new_xdrop_align_type */ 320 new_xdrop_align_type * new_xdrop_align; 321 /** @sa free_align_traceback_type */ 322 free_align_traceback_type * free_align_traceback; 323 } Blast_RedoAlignCallbacks; 324 325 326 /** A parameter block for the Blast_RedoOneMatch and 327 * Blast_RedoOneMatchSmithWaterman routines */ 328 typedef struct Blast_RedoAlignParams { 329 Blast_MatrixInfo * matrix_info; /**< information about the scoring 330 matrix used */ 331 BlastCompo_GappingParams * 332 gapping_params; /**< parameters for performing a 333 gapped alignment */ 334 ECompoAdjustModes compo_adjust_mode; /**< composition adjustment mode */ 335 int positionBased; /**< true if the search is position-based */ 336 int RE_pseudocounts; /**< number of pseudocounts to use in 337 relative-entropy based composition 338 adjustment. */ 339 int subject_is_translated; /**< true if the subject is translated */ 340 int query_is_translated; /**< true if the query is translated */ 341 int ccat_query_length; /**< length of the concatenated query, or 342 just the length of the query if not 343 concatenated */ 344 int cutoff_s; /**< cutoff score for saving alignments when 345 HSP linking is used */ 346 double cutoff_e; /**< cutoff e-value for saving alignments */ 347 int do_link_hsps; /**< if true, then HSP linking and sum 348 statistics are used to computed e-values */ 349 const Blast_RedoAlignCallbacks * 350 callbacks; /**< callback functions used by 351 the Blast_RedoAlign* functions */ 352 353 double near_identical_cutoff; /**< alignments with raw score per residue 354 above this value will be tested if 355 sequences are near identical to avoid 356 segging */ 357 } Blast_RedoAlignParams; 358 359 360 /** Create new Blast_RedoAlignParams object. The parameters of this 361 * function correspond directly to the fields of 362 * Blast_RedoAlignParams. The new Blast_RedoAlignParams object takes 363 * possession of *pmatrix_info and *pgapping_params, so these values 364 * are set to NULL on exit. */ 365 NCBI_XBLAST_EXPORT 366 Blast_RedoAlignParams * 367 Blast_RedoAlignParamsNew(Blast_MatrixInfo ** pmatrix_info, 368 BlastCompo_GappingParams **pgapping_params, 369 ECompoAdjustModes compo_adjust_mode, 370 int positionBased, 371 int subject_is_translated, 372 int query_is_translated, 373 int ccat_query_length, int cutoff_s, 374 double cutoff_e, int do_link_hsps, 375 const Blast_RedoAlignCallbacks * callbacks, 376 double near_identical_cutoff); 377 378 379 /** Free a set of Blast_RedoAlignParams */ 380 NCBI_XBLAST_EXPORT 381 void Blast_RedoAlignParamsFree(Blast_RedoAlignParams ** pparams); 382 383 384 /** 385 * Recompute all alignments for one query/subject pair using the 386 * Smith-Waterman algorithm and possibly also composition-based 387 * statistics or composition-based matrix adjustment. 388 * 389 * @param alignments an array of lists containing the newly 390 * computed alignments. There is one array 391 * element for each query in the original 392 * search 393 * @param params parameters used to redo the alignments 394 * @param incoming_aligns a list of existing alignments 395 * @param hspcnt length of incoming_aligns 396 * @param Lambda statistical parameter 397 * @param logK statistical parameter 398 * @param matchingSeq the database sequence 399 * @param query_info information about all queries 400 * @param numQueries the number of queries 401 * @param matrix the scoring matrix 402 * @param alphsize the size of the alphabet 403 * @param NRrecord a workspace used to adjust the composition. 404 * @param forbidden a workspace used to hold forbidden ranges 405 * for the Smith-Waterman algorithm. 406 * @param significantMatches an array of heaps of alignments for 407 * query-subject pairs that have already 408 * been redone; used to terminate the 409 * Smith-Waterman algorithm early if it is 410 * clear that the current match is not 411 * significant enough to be saved. 412 * @param pvalueThisPair the compositional p-value for this pair of sequences 413 * @param compositionTestIndex index of the test function used to decide 414 * whether to use a compositional p-value 415 * @param LambdaRatio the ratio of the actual value of Lambda to the 416 * ideal value. 417 * 418 * @return 0 on success, -1 on out-of-memory 419 */ 420 NCBI_XBLAST_EXPORT 421 int Blast_RedoOneMatchSmithWaterman(BlastCompo_Alignment ** alignments, 422 Blast_RedoAlignParams * params, 423 BlastCompo_Alignment * incoming_aligns, 424 int hspcnt, 425 double Lambda, double logK, 426 BlastCompo_MatchingSequence * matchingSeq, 427 BlastCompo_QueryInfo query_info[], 428 int numQueries, 429 int ** matrix, int alphsize, 430 Blast_CompositionWorkspace * NRrecord, 431 Blast_ForbiddenRanges * forbidden, 432 BlastCompo_Heap * significantMatches, 433 double *pvalueThisPair, 434 int compositionTestIndex, 435 double *LambdaRatio); 436 437 438 /** 439 * Recompute all alignments for one query/subject pair using 440 * composition-based statistics or composition-based matrix adjustment. 441 * 442 * @param alignments an array of lists containing the newly 443 * computed alignments. There is one array 444 * element for each query in the original 445 * search 446 * @param params parameters used to redo the alignments 447 * @param incoming_aligns a list of existing alignments 448 * @param hspcnt length of incoming_aligns 449 * @param Lambda statistical parameter 450 * @param matchingSeq the database sequence 451 * @param ccat_query_length the length of the concatenated query 452 * @param query_info information about all queries 453 * @param numQueries the number of queries 454 * @param matrix the scoring matrix 455 * @param alphsize the size of the alphabet 456 * @param NRrecord a workspace used to adjust the composition. 457 * @param pvalueThisPair the compositional p-value for this pair of sequences 458 * @param compositionTestIndex index of the test function used to decide 459 * whether to use a compositional p-value 460 * @param LambdaRatio the ratio of the actual value of Lambda to the 461 * ideal value. 462 * 463 * @return 0 on success, -1 on out-of-memory 464 */ 465 NCBI_XBLAST_EXPORT 466 int Blast_RedoOneMatch(BlastCompo_Alignment ** alignments, 467 Blast_RedoAlignParams * params, 468 BlastCompo_Alignment * incoming_aligns, 469 int hspcnt, 470 double Lambda, 471 BlastCompo_MatchingSequence * matchingSeq, 472 int ccat_query_length, 473 BlastCompo_QueryInfo query_info[], 474 int numQueries, 475 int ** matrix, int alphsize, 476 Blast_CompositionWorkspace * NRrecord, 477 double *pvalueThisPair, 478 int compositionTestIndex, 479 double *LambdaRatio); 480 481 482 /** Return true if a heuristic determines that it is unlikely to be 483 * worthwhile to redo a query-subject pair with the given e-value; used 484 * to terminate the main loop for redoing all alignments early. */ 485 NCBI_XBLAST_EXPORT 486 int BlastCompo_EarlyTermination(double evalue, 487 BlastCompo_Heap significantMatches[], 488 int numQueries); 489 490 #ifndef GET_SEQ_FRAME 491 #define GET_SEQ_FRAME(f) ((f<0)?ABS(f)+2: f-1) 492 #endif 493 494 #ifndef GET_NUCL_LENGTH 495 #define GET_NUCL_LENGTH(l) ((l-5)/2 +2) 496 #endif 497 498 #ifndef GET_TRANSLATED_LENGTH 499 #define GET_TRANSLATED_LENGTH(l,f) ((l-f% 3)/3) 500 #endif 501 502 503 #ifdef __cplusplus 504 } 505 #endif 506 507 #endif 508