1 /* $Id: blast_stat.h 495576 2016-03-18 14:26:46Z rackerst $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Author: Tom Madden 27 * 28 */ 29 30 /** @file blast_stat.h 31 * Definitions and prototypes used by blast_stat.c to calculate BLAST 32 * statistics. @todo FIXME: needs doxygen comments 33 */ 34 35 #ifndef ALGO_BLAST_CORE__BLAST_STAT__H 36 #define ALGO_BLAST_CORE__BLAST_STAT__H 37 38 #include <algo/blast/core/ncbi_std.h> 39 #include <algo/blast/core/blast_export.h> 40 #include <algo/blast/core/blast_program.h> 41 #include <algo/blast/core/blast_query_info.h> 42 #include <algo/blast/core/blast_message.h> 43 #include <util/tables/raw_scoremat.h> 44 45 #ifdef __cplusplus 46 extern "C" { 47 #endif 48 49 /** 50 * Defines for the matrix 'preferences' (as specified by S. Altschul). 51 * Used in arrays such as blosum45_prefs in blast_stat.c 52 */ 53 #define BLAST_MATRIX_NOMINAL 0 /**< acceptable values, not recommended. */ 54 #define BLAST_MATRIX_PREFERRED 1 /**< These value are preferred over others. */ 55 #define BLAST_MATRIX_BEST 2 /**< This is the best value, only one per matrix. */ 56 57 58 #define BLASTMAT_DIR "/usr/ncbi/blast/matrix" /**< Default location for blast databases. */ 59 60 /** callback to resolve the path to blast score matrices */ 61 typedef char* (*GET_MATRIX_PATH) (const char*, Boolean); 62 63 /** 64 Structure to hold the Karlin-Altschul parameters. 65 */ 66 typedef struct Blast_KarlinBlk { 67 double Lambda; /**< Lambda value used in statistics */ 68 double K; /**< K value used in statistics */ 69 double logK; /**< natural log of K value used in statistics */ 70 double H; /**< H value used in statistics */ 71 double paramC; /**< for use in seed. */ 72 } Blast_KarlinBlk; 73 74 75 /** 76 Tabulated results for faster erfc(x) lookup. 77 erfc(x) = 1/2 + 1/sqrt(2*pi) * \int_0^x { exp(-y^2/2)} dy 78 erfc(-\inf) = 0.0 79 erfc(+\inf) = 1.0 80 The erfc(x) is tabulated in p with step h, starting from a to b 81 */ 82 typedef struct erfc_table { 83 double eps; 84 double a; 85 double b; 86 Int4 N; 87 double h; 88 double *p; 89 } erfc_table; 90 91 /** 92 Structure to hold the Gumbel parameters (for FSC). 93 */ 94 typedef struct Blast_GumbelBlk { 95 double Lambda; /**< the unscaled Lambda value */ 96 double C; 97 double G; /**< G is the total penalty for extension */ 98 double a; /**< avg(L) = a y + b */ 99 double Alpha; /**< var(L) = alpha y + beta */ 100 double Sigma; /**< cov(L) = sigma y + tau */ 101 double a_un; /**< Ungapped a */ 102 double Alpha_un; /**< Ungapped alpha */ 103 104 double b; /**< 2*G*(a_un - a) */ 105 double Beta; /**< 2*G*(alpha_un - alpha) */ 106 double Tau; /**< 2*G*(alpha_un - Sigma) */ 107 108 Int8 db_length; /**< total length of database */ 109 110 Boolean filled; /**< flag indicate the values of gbp are prepared */ 111 112 } Blast_GumbelBlk; 113 114 115 /******************************************************************** 116 117 Structures relating to scoring or the BlastScoreBlk 118 119 ********************************************************************/ 120 121 #define BLAST_SCORE_MIN INT2_MIN /**< minimum allowed score (for one letter comparison). */ 122 #define BLAST_SCORE_MAX INT2_MAX /**< maximum allowed score (for one letter comparison). */ 123 124 125 /** Holds score frequencies used in calculation 126 of Karlin-Altschul parameters for an ungapped search. 127 */ 128 typedef struct Blast_ScoreFreq { 129 Int4 score_min; /**< lowest allowed scores */ 130 Int4 score_max; /**< highest allowed scores */ 131 Int4 obs_min; /**< lowest observed (actual) scores */ 132 Int4 obs_max; /**< highest observed (actual) scores */ 133 double score_avg; /**< average score, must be negative for local alignment. */ 134 double* sprob0; /**< arrays for frequency of given score */ 135 double* sprob; /**< arrays for frequency of given score, shifted down by score_min. */ 136 } Blast_ScoreFreq; 137 138 /** Scoring matrix used in BLAST */ 139 typedef struct SBlastScoreMatrix { 140 int** data; /**< actual scoring matrix data, stored in row-major 141 form */ 142 size_t ncols; /**< number of columns */ 143 size_t nrows; /**< number of rows */ 144 double* freqs; /**< array of assumed matrix background frequencies -RMH-*/ 145 double lambda; /**< derived value of the matrix lambda -RMH- */ 146 } SBlastScoreMatrix; 147 148 /** Scoring matrix data used in PSI-BLAST */ 149 typedef struct SPsiBlastScoreMatrix { 150 SBlastScoreMatrix* pssm; /**< position-specific score matrix */ 151 double** freq_ratios; /**< PSSM's frequency ratios, dimensions are 152 specified in pssm data above */ 153 Blast_KarlinBlk* kbp; /**< Karlin-Altschul block associated with this 154 PSSM */ 155 } SPsiBlastScoreMatrix; 156 157 /** Allocates a new SPsiBlastScoreMatrix structure of dimensions ncols by 158 * BLASTAA_SIZE. 159 * @param ncols number of columns (i.e.: query length) [in] 160 * @return NULL in case of memory allocation failure, else new 161 * SPsiBlastScoreMatrix structure 162 */ 163 NCBI_XBLAST_EXPORT 164 SPsiBlastScoreMatrix* 165 SPsiBlastScoreMatrixNew(size_t ncols); 166 167 /** Deallocates a SPsiBlastScoreMatrix structure. 168 * @param matrix structure to deallocate [in] 169 * @return NULL 170 */ 171 NCBI_XBLAST_EXPORT 172 SPsiBlastScoreMatrix* 173 SPsiBlastScoreMatrixFree(SPsiBlastScoreMatrix* matrix); 174 175 /** Structure used for scoring calculations. 176 */ 177 typedef struct BlastScoreBlk { 178 Boolean protein_alphabet; /**< TRUE if alphabet_code is for a 179 protein alphabet (e.g., ncbistdaa etc.), FALSE for nt. alphabets. */ 180 Uint1 alphabet_code; /**< NCBI alphabet code. */ 181 Int2 alphabet_size; /**< size of alphabet. */ 182 Int2 alphabet_start; /**< numerical value of 1st letter. */ 183 char* name; /**< name of scoring matrix. */ 184 ListNode* comments; /**< Comments about scoring matrix. */ 185 SBlastScoreMatrix* matrix; /**< scoring matrix data */ 186 SPsiBlastScoreMatrix* psi_matrix; /**< PSSM and associated data. If this 187 is not NULL, then the BLAST search is 188 position specific (i.e.: PSI-BLAST) */ 189 Boolean matrix_only_scoring; /**< Score ungapped/gapped alignment only 190 using the matrix parameters and 191 with raw scores. Ignore 192 penalty/reward and do not report 193 Karlin-Altschul stats. This is used 194 by the rmblastn program. -RMH- */ 195 Boolean complexity_adjusted_scoring; /**< Use cross_match-like complexity 196 adjustment on raw scores. -RMH- */ 197 Int4 loscore; /**< Min. substitution scores */ 198 Int4 hiscore; /**< Max. substitution scores */ 199 Int4 penalty; /**< penalty for mismatch in blastn. */ 200 Int4 reward; /**< reward for match in blastn. */ 201 double scale_factor; /**< multiplier for all cutoff and dropoff scores */ 202 Boolean read_in_matrix; /**< If TRUE, matrix is read in, otherwise 203 produce one from penalty and reward above. @todo should this be 204 an allowed way of specifying the matrix to use? */ 205 Blast_ScoreFreq** sfp; /**< score frequencies for scoring matrix. */ 206 /* kbp & kbp_gap are ptrs that should be set to kbp_std, kbp_psi, etc. */ 207 Blast_KarlinBlk** kbp; /**< Karlin-Altschul parameters. Actually just a placeholder. */ 208 Blast_KarlinBlk** kbp_gap; /**< K-A parameters for gapped alignments. Actually just a placeholder. */ 209 Blast_GumbelBlk* gbp; /**< Gumbel parameters for FSC. */ 210 /* Below are the Karlin-Altschul parameters for non-position based ('std') 211 and position based ('psi') searches. */ 212 Blast_KarlinBlk **kbp_std, /**< K-A parameters for ungapped alignments */ 213 **kbp_psi, /**< K-A parameters for position-based alignments. */ 214 **kbp_gap_std, /**< K-A parameters for std (not position-based) alignments */ 215 **kbp_gap_psi; /**< K-A parameters for psi alignments. */ 216 Blast_KarlinBlk* kbp_ideal; /**< Ideal values (for query with average database composition). */ 217 Int4 number_of_contexts; /**< Used by sfp and kbp, how large are these*/ 218 Uint1* ambiguous_res; /**< Array of ambiguous res. (e.g, 'X', 'N')*/ 219 Int2 ambig_size, /**< size of array above. FIXME: not needed here? */ 220 ambig_occupy; /**< How many occupied? */ 221 Boolean round_down; /**< Score must be rounded down to nearest even score if odd. */ 222 } BlastScoreBlk; 223 224 /** Scoring matrix data used for compressed protein alphabets */ 225 typedef struct SCompressedAlphabet { 226 Int4 compressed_alphabet_size; /**< letters in the compressed alphabet */ 227 SBlastScoreMatrix* matrix; /**< score matrix */ 228 Uint1* compress_table; /**< translation table (AA->compressed)*/ 229 } SCompressedAlphabet; 230 231 /** Deallocates SBlastScoreMatrix structure 232 * @param matrix structure to deallocate [in] 233 * @return NULL 234 */ 235 NCBI_XBLAST_EXPORT 236 SBlastScoreMatrix* 237 SBlastScoreMatrixFree(SBlastScoreMatrix* matrix); 238 239 /** Allocates a new SBlastScoreMatrix structure of the specified dimensions. 240 * @param ncols number of columns [in] 241 * @param nrows number of rows [in] 242 * @return NULL in case of memory allocation failure, else new 243 * SBlastScoreMatrix structure 244 */ 245 NCBI_XBLAST_EXPORT 246 SBlastScoreMatrix* 247 SBlastScoreMatrixNew(size_t ncols, size_t nrows); 248 249 /** Allocate a new compressed alphabet and score matrix 250 * @param sbp Current score matrix information [in] 251 * @param compressed_alphabet_size Desired size of compressed 252 * alphabet (current choices are limited to 10 or 15) [in] 253 * @param scale_factor Score matrix entries are scaled by this value [in] 254 * @return the new alphabet, or NULL on failure 255 */ 256 NCBI_XBLAST_EXPORT 257 SCompressedAlphabet* 258 SCompressedAlphabetNew(BlastScoreBlk *sbp, 259 Int4 compressed_alphabet_size, 260 double scale_factor); 261 262 /** Free a compressed alphabet and score matrix 263 * @param alphabet The compressed alphabet structure 264 * @return Always NULL 265 */ 266 NCBI_XBLAST_EXPORT 267 SCompressedAlphabet* 268 SCompressedAlphabetFree(SCompressedAlphabet *alphabet); 269 270 /** 271 Stores the letter frequency of a sequence or database. 272 */ 273 typedef struct Blast_ResFreq { 274 Uint1 alphabet_code; /**< indicates alphabet. */ 275 double* prob; /**< letter probs, (possible) non-zero offset. */ 276 double* prob0; /**< probs, zero offset. */ 277 } Blast_ResFreq; 278 279 /** 280 * Check that score blk is valid, returns zero if it is. 281 * @param sbp ScoreBlk to check [in] 282 * @return zero if valid 283 */ 284 NCBI_XBLAST_EXPORT 285 int BlastScoreBlkCheck(BlastScoreBlk* sbp); 286 287 /** 288 * Allocates and initializes BlastScoreBlk 289 * @param alphabet either BLASTAA_SEQ_CODE or BLASTNA_SEQ_CODE [in] 290 * @param number_of_contexts how many strands or sequences [in] 291 * @return BlastScoreBlk* 292 */ 293 NCBI_XBLAST_EXPORT 294 BlastScoreBlk* BlastScoreBlkNew (Uint1 alphabet, Int4 number_of_contexts); 295 296 /** Deallocates BlastScoreBlk as well as all associated structures. 297 * @param sbp BlastScoreBlk to be deallocated [in] 298 * @return NULL pointer. 299 */ 300 NCBI_XBLAST_EXPORT 301 BlastScoreBlk* BlastScoreBlkFree (BlastScoreBlk* sbp); 302 303 /** Set the ambiguous residue (e.g, 'N', 'X') in the BlastScoreBlk*. 304 * Convert from ncbieaa to sbp->alphabet_code (i.e., ncbistdaa) first. 305 * 306 * @param sbp the object to be modified [in|out] 307 * @param ambiguous_res the residue to be set on the BlastScoreBlk 308 * @return zero on success, others on error 309 */ 310 NCBI_XBLAST_EXPORT 311 Int2 BLAST_ScoreSetAmbigRes (BlastScoreBlk* sbp, char ambiguous_res); 312 313 314 /** Calculate and fill the ungapped Karlin-Altschul parameters in the 315 * BlastScoreBlk structure (fields kbp_std, kbp_psi, and kbp of that structure). 316 * @param program BLAST program type, needed to decide whether to substitute 317 * ideal values. [in] 318 * @param sbp Scoring block to work with [in] [out] 319 * @param query Buffer containing (concatenated) query sequence [in] 320 * @param query_info Information about offsets of concatenated queries [in] 321 * @param blast_message returns queries that could not be processed [out] 322 * @return 0 if ungapped Karlin-Altschul parameters could be calculated for 323 * all of the query sequence's contexts; 1 if any of the contexts 324 * failed (but all others will be populated). 325 */ 326 NCBI_XBLAST_EXPORT 327 Int2 328 Blast_ScoreBlkKbpUngappedCalc(EBlastProgramType program, 329 BlastScoreBlk* sbp, Uint1* query, 330 const BlastQueryInfo* query_info, 331 Blast_Message* *blast_message); 332 333 /** This function fills in the BlastScoreBlk structure. 334 * Tasks are: 335 * -read in the matrix 336 * -set maxscore 337 * @param sbp Scoring block [in] [out] 338 * @param get_path pointer to function that will return path to matrix. Only called 339 * if built-in matrix not found [in] 340 */ 341 NCBI_XBLAST_EXPORT 342 Int2 Blast_ScoreBlkMatrixFill (BlastScoreBlk* sbp, GET_MATRIX_PATH get_path); 343 344 /** Callocs a Blast_KarlinBlk 345 * @return pointer to the Blast_KarlinBlk 346 */ 347 NCBI_XBLAST_EXPORT 348 Blast_KarlinBlk* Blast_KarlinBlkNew (void); 349 350 /** Copies contents of one Karlin block to another. Both must be allocated 351 * before this function is called. 352 * @param kbp_to Karlin block to copy values to [in] [out] 353 * @param kbp_from Karlin block to copy values from [in] 354 * @return 0 on success; -1 if either argument is NULL on input. 355 */ 356 NCBI_XBLAST_EXPORT 357 Int2 Blast_KarlinBlkCopy(Blast_KarlinBlk* kbp_to, Blast_KarlinBlk* kbp_from); 358 359 360 /** Deallocates the KarlinBlk 361 * @param kbp KarlinBlk to be deallocated [in] 362 * @return NULL 363 */ 364 NCBI_XBLAST_EXPORT 365 Blast_KarlinBlk* Blast_KarlinBlkFree(Blast_KarlinBlk* kbp); 366 367 /** Fills in lambda, H, and K values, as calculated by Stephen Altschul 368 * in Methods in Enzy. (vol 266, page 474). 369 * @param kbp object to be filled in [in|out] 370 * @param gap_open cost of gap existence [in] 371 * @param gap_extend cost to extend a gap one letter [in] 372 * @param matrix_name name of the matrix to be used [in] 373 * @param error_return filled in with error message if needed [out] 374 * @return zero on success 375 */ 376 NCBI_XBLAST_EXPORT 377 Int2 Blast_KarlinBlkGappedCalc (Blast_KarlinBlk* kbp, Int4 gap_open, 378 Int4 gap_extend, const char* matrix_name, Blast_Message** error_return); 379 380 /** Retrieves Karlin-Altschul parameters from precomputed tables, given the 381 * substitution and gap scores. Gap cost values greater than any of those 382 * listed in the tables ("greater" meaning that both values are greater than or 383 * equal, and at least one is strictly greater), are treated as infinite, and 384 * parameters values are copied from the ungapped Karlin block. 385 * @param kbp Allocated Karlin block to fill [in] [out] 386 * @param gap_open Gap openening (existence) cost [in] 387 * @param gap_extend Gap extension cost [in] 388 * @param reward Match reward score [in] 389 * @param penalty Mismatch penalty score [in] 390 * @param kbp_ungap Karlin block with ungapped Karlin-Altschul parameters [in] 391 * @param round_down specifies that the score should be rounded down to nearest even 392 * score in some cases [in|out] 393 * @param error_return Pointer to error message. [in] [out] 394 */ 395 NCBI_XBLAST_EXPORT 396 Int2 397 Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk* kbp, Int4 gap_open, 398 Int4 gap_extend, Int4 reward, Int4 penalty, 399 Blast_KarlinBlk* kbp_ungap, 400 Boolean* round_down, 401 Blast_Message** error_return); 402 403 404 /** Calculates the Karlin-Altschul parameters assuming standard residue 405 * compositions for the query and subject sequences. It populates the kbp_ideal 406 * field of its sbp argument. This is used if the query is translated and the 407 * calculated (real) Karlin parameters are bad, as they're calculated for 408 * non-coding regions. 409 * @param sbp ScoreBlk used to calculate "ideal" values. [in|out] 410 * @return 0 on success, 1 on failure 411 */ 412 NCBI_XBLAST_EXPORT 413 Int2 Blast_ScoreBlkKbpIdealCalc(BlastScoreBlk* sbp); 414 415 /** Attempts to fill KarlinBlk for given gap opening, extensions etc. 416 * 417 * @param kbp object to be filled in [in|out] 418 * @param gap_open gap existence cost [in] 419 * @param gap_extend gap extension cost [in] 420 * @param matrix_name name of the matrix used [in] 421 * @param standard_only If true, include only the standard blosum and pam matrices [in] 422 * @return -1 if matrix_name is NULL; 423 * 1 if matrix not found 424 * 2 if matrix found, but open, extend etc. values not supported. 425 */ 426 NCBI_XBLAST_EXPORT 427 Int2 Blast_KarlinBlkGappedLoadFromTables(Blast_KarlinBlk* kbp, Int4 gap_open, Int4 gap_extend, const char* matrix_name, Boolean standard_only); 428 429 /** Fills in gumbel parameters to estimate p-value using FSC 430 * @param gbp object to be filled in [in|out] 431 * @param gap_open cost of gap existence [in] 432 * @param gap_extend cost to extend a gap one letter [in] 433 * @param matrix_name name of the matrix to be used [in] 434 * @param error_return filled in with error message if needed [out] 435 * @return zero on success 436 */ 437 NCBI_XBLAST_EXPORT 438 Int2 Blast_GumbelBlkCalc (Blast_GumbelBlk* gbp, Int4 gap_open, 439 Int4 gap_extend, const char* matrix_name, Blast_Message** error_return); 440 441 /** Attempts to fill GumbelBlk for given gap opening, extensions etc. 442 * 443 * @param gbp object to be filled in [in|out] 444 * @param gap_open gap existence cost [in] 445 * @param gap_extend gap extension cost [in] 446 * @param matrix_name name of the matrix used [in] 447 * @return -1 if matrix_name is NULL; 448 * 1 if matrix not found 449 * 2 if matrix found, but open, extend etc. values not supported. 450 */ 451 NCBI_XBLAST_EXPORT 452 Int2 Blast_GumbelBlkLoadFromTables(Blast_GumbelBlk* gbp, Int4 gap_open, 453 Int4 gap_extend, const char* matrix_name); 454 455 /** Prints a messages about the allowed matrices, BlastKarlinBlkGappedFill should return 1 before this is called. 456 * @param matrix the matrix to print a message about [in] 457 * @param standard_only, If true, include only the standad blosum and pam 458 * matrices [in] 459 * @return the message 460 */ 461 NCBI_XBLAST_EXPORT 462 char* BLAST_PrintMatrixMessage(const char *matrix, Boolean standard_only); 463 464 /** Prints a messages about the allowed open etc values for the given matrix, 465 * BlastKarlinBlkGappedFill should return 2 before this is called. 466 * @param matrix name of the matrix [in] 467 * @param gap_open gap existence cost [in] 468 * @param gap_extend cost to extend a gap by one [in] 469 * @return message 470 */ 471 NCBI_XBLAST_EXPORT 472 char* BLAST_PrintAllowedValues(const char *matrix, Int4 gap_open, Int4 gap_extend); 473 474 /** Calculates the parameter Lambda given an initial guess for its value */ 475 NCBI_XBLAST_EXPORT 476 double 477 Blast_KarlinLambdaNR(Blast_ScoreFreq* sfp, double initialLambdaGuess); 478 479 /** Calculates the Expect value based upon the search space and some Karlin-Altschul 480 * parameters. It is "simple" as it does not use sum-statistics. 481 * @param S the score of the alignment. [in] 482 * @param kbp the Karlin-Altschul parameters. [in] 483 * @param searchsp total search space to be used [in] 484 * @return the expect value 485 */ 486 NCBI_XBLAST_EXPORT 487 double BLAST_KarlinStoE_simple (Int4 S, Blast_KarlinBlk* kbp, Int8 searchsp); 488 489 /** Calculates the Expect value based upon the Spouge's FSC method. 490 * @param S the score of the alignment. [in] 491 * @param gbp the Gumbel parameters. [in] 492 * @param qlen the query length. [in] 493 * @param slen the subject length. [in] 494 * @return the expect value 495 */ 496 NCBI_XBLAST_EXPORT 497 double BLAST_SpougeStoE (Int4 S, Blast_KarlinBlk* kbp, Blast_GumbelBlk* gbp, Int4 qlen, Int4 slen); 498 499 /** Estimate the score for a specified expect value. 500 * @param E the expect value of the alignment. [in] 501 * @param gbp the Gumbel parameters. [in] 502 * @param qlen the query length. [in] 503 * @param slen the subject length. [in] 504 * @return the score 505 */ 506 NCBI_XBLAST_EXPORT 507 Int4 BLAST_SpougeEtoS (double E, Blast_KarlinBlk* kbp, Blast_GumbelBlk* gbp, Int4 qlen, Int4 slen); 508 509 /** Convert a P-value to an E-value. 510 * 511 * P-values and E-values may either represent statistics of a database 512 * search or represent statistics on the two sequences being compared. 513 * If given a database P-value, this routine will return a database 514 * E-value; if given a pairwise P-value, it will return a pairwise 515 * E-value. 516 * 517 * In the context of a database search, the available P-value is often 518 * a pairwise P-value, whereas the desired E-value is a database 519 * E-value. When this it the case, the value returned by this routine 520 * should be multiplied by the effective length of the database and 521 * divided by the effective length of the subject. 522 * 523 * @param p the P-value to be converted [in] @return the corresponding 524 * expect value. 525 */ 526 NCBI_XBLAST_EXPORT double BLAST_KarlinPtoE(double p); 527 528 /** Convert an E-value to a P-value. 529 * 530 * E-values and P-values may either represent statistics of a database 531 * search or represent statistics on the two sequences being compared. 532 * If given a database E-value, this routine will return a database 533 * P-value; if given a pairwise E-value, it will return a pairwise 534 * P-value. 535 * 536 * In the context of a database search, the available E-value is 537 * typically a database E-value, whereas the desired P-value is a 538 * pairwise P-value. When this is the case, the E-value should be 539 * divided by the effective length of the database and multiplied by 540 * the effective length of the subject, before BLAST_KarlinEtoP is 541 * called. 542 * 543 * @param x the expect value to be converted [in] 544 * @return the corresponding p-value. 545 */ 546 NCBI_XBLAST_EXPORT 547 double BLAST_KarlinEtoP(double x); 548 549 /** Compute a divisor used to weight the evalue of a collection of 550 * "nsegs" distinct alignments. These divisors are used to compensate 551 * for the effect of choosing the best among multiple collections of 552 * alignments. See 553 * 554 * Stephen F. Altschul. Evaluating the statitical significance of 555 * multiple distinct local alignments. In Suhai, editior, Theoretical 556 * and Computational Methods in Genome Research, pages 1-14. Plenum 557 * Press, New York, 1997. 558 * 559 * The "decayrate" parameter of this routine is a value in the 560 * interval (0,1). Typical values of decayrate are .1 and .5. 561 * @param decayrate adjusts for (multiple) tests of number of HSP sum groups [in] 562 * @param nsegs the number of HSPs in the sum group [in] 563 * @return divisor used to compensate for multiple tests 564 */ 565 NCBI_XBLAST_EXPORT 566 double BLAST_GapDecayDivisor(double decayrate, unsigned nsegs ); 567 568 /** Calculate the cutoff score from the expected number of HSPs or vice versa. 569 * @param S The calculated score [in] [out] 570 * @param E The calculated e-value [in] [out] 571 * @param kbp The Karlin-Altschul statistical parameters [in] 572 * @param searchsp The effective search space [in] 573 * @param dodecay Use gap decay feature? [in] 574 * @param gap_decay_rate Gap decay rate to use, if dodecay is set [in] 575 */ 576 NCBI_XBLAST_EXPORT 577 Int2 BLAST_Cutoffs (Int4 *S, double* E, Blast_KarlinBlk* kbp, 578 Int8 searchsp, Boolean dodecay, double gap_decay_rate); 579 580 /** Calculates the e-value for alignments with "small" gaps (typically 581 * under fifty residues/basepairs) following ideas of Stephen Altschul's. 582 * @param start_points the number of starting points permitted between 583 * adjacent alignments; max_overlap + max_gap + 1 [in] 584 * @param num the number of distinct alignments in this collection [in] 585 * @param xsum the sum of the scores of these alignments each individually 586 * normalized using an appropriate value of Lambda and logK [in] 587 * @param query_length effective len of the query seq [in] 588 * @param subject_length effective len of the subject seq [in] 589 * @param searchsp_eff effective size of the search space [in] 590 * @param weight_divisor a divisor used to weight the e-value 591 * when multiple collections of alignments are being considered by 592 * the calling routine [in] 593 * @return the expect value 594 */ 595 NCBI_XBLAST_EXPORT 596 double BLAST_SmallGapSumE (Int4 start_points, Int2 num, double xsum, 597 Int4 query_length, Int4 subject_length, 598 Int8 searchsp_eff, double weight_divisor); 599 600 /** Calculates the e-value of a collection multiple distinct 601 * alignments with asymmetric gaps between the alignments. The gaps 602 * in one (protein) sequence are typically small (like in 603 * BLAST_SmallGapSumE) gap an the gaps in the other (translated DNA) 604 * sequence are possibly large (up to 4000 bp.) This routine is used 605 * for linking HSPs representing exons in the DNA sequence that are 606 * separated by introns. 607 * @param query_start_points the number of starting points in 608 * the query sequence permitted between adjacent alignments; 609 * max_overlap + max_gap + 1. [in] 610 * @param subject_start_points the number of starting points in 611 * the subject sequence permitted between adjacent alignments [in] 612 * @param num number of distinct alignments in one collection [in] 613 * @param xsum the sum of the scores of these alignments each individually 614 * normalized using an appropriate value of Lambda and logK [in] 615 * @param query_length effective length of query [in] 616 * @param subject_length effective length of subject [in] 617 * @param searchsp_eff effective size of the search space [in] 618 * @param weight_divisor a divisor used to weight the e-value 619 * when multiple collections of alignments are being considered by 620 * the calling routine [in] 621 * @return sum expect value. 622 */ 623 NCBI_XBLAST_EXPORT 624 double BLAST_UnevenGapSumE (Int4 query_start_points, Int4 subject_start_points, 625 Int2 num, double xsum, 626 Int4 query_length, Int4 subject_length, 627 Int8 searchsp_eff, double weight_divisor); 628 629 /** Calculates the e-value if a collection of distinct alignments with 630 * arbitrarily large gaps between the alignments 631 * @param num number of distinct alignments in the collection [in] 632 * @param xsum the sum of the scores of these alignments each individually 633 * normalized using an appropriate value of Lambda and logK [in] 634 * @param query_length effective length of query sequence [in] 635 * @param subject_length effective length of subject sequence [in] 636 * @param searchsp_eff effective size of the search space [in] 637 * @param weight_divisor a divisor used to weight the e-value 638 * when multiple collections of alignments are being considered by the 639 * calling routine [in] 640 * @return sum expect value. 641 */ 642 NCBI_XBLAST_EXPORT 643 double BLAST_LargeGapSumE (Int2 num, double xsum, 644 Int4 query_length, Int4 subject_length, 645 Int8 searchsp_eff, double weight_divisor ); 646 647 /** Extract the recommended gap existence and extension values. 648 * Only to be used with protein matrices. 649 * @param matrixName name of the matrix [in] 650 * @param gap_existence returns recommended existence cost [in|out] 651 * @param gap_extension returns recommended extension cost [in|out] 652 * @return zero on success 653 */ 654 NCBI_XBLAST_EXPORT 655 Int2 BLAST_GetProteinGapExistenceExtendParams(const char* matrixName, 656 Int4* gap_existence, 657 Int4* gap_extension); 658 659 /** Extract the recommended gap existence and extension values. 660 * Only to be used with blastn searches. 661 * @param reward match score [in] 662 * @param penalty mismatch score [in] 663 * @param gap_existence returns recommended existence cost [in|out] 664 * @param gap_extension returns recommended extension cost [in|out] 665 * @return zero on success 666 */ 667 NCBI_XBLAST_EXPORT 668 Int2 BLAST_GetNucleotideGapExistenceExtendParams(Int4 reward, 669 Int4 penalty, 670 Int4* gap_existence, 671 Int4* gap_extension); 672 673 /** Check the validity of the reward and penalty scores. 674 * Only to be used with blastn searches. 675 * @param reward match score [in] 676 * @param penalty mismatch score [in] 677 * @return TRUE on success 678 */ 679 NCBI_XBLAST_EXPORT 680 Boolean BLAST_CheckRewardPenaltyScores(Int4 reward, Int4 penalty); 681 682 /** Extract the alpha and beta settings for this matrixName, and these 683 * gap open and gap extension costs 684 * @param matrixName name of the matrix used [in] 685 * @param alpha Karlin-Altschul parameter to be set [out] 686 * @param beta Karlin-Altschul parameter to be set [out] 687 * @param gapped TRUE if a gapped search [in] 688 * @param gap_open existence cost of a gap [in] 689 * @param gap_extend extension cost of a gap [in] 690 * @param kbp_ungapped Karlin block with ungapped values of the parameters [in] 691 */ 692 NCBI_XBLAST_EXPORT 693 void BLAST_GetAlphaBeta (const char* matrixName, double *alpha, 694 double *beta, Boolean gapped, Int4 gap_open, 695 Int4 gap_extend, const Blast_KarlinBlk* kbp_ungapped); 696 697 /** Extract the alpha and beta settings for these substitution and gap scores. 698 * If substitution or gap costs are not found in the tables, assume an ungapped 699 * search. Then alpha is computed using the formula Alpha = Lambda/H, and beta 700 * is equal to 0 except for some special cases. 701 * @param reward Match reward score [in] 702 * @param penalty Mismatch penalty score [in] 703 * @param gap_open Gap opening (existence) cost [in] 704 * @param gap_extend Gap extension cost [in] 705 * @param kbp Karlin block containing already computed Lambda, K and H 706 * parameters. 707 * @param gapped_calculation Is this a gapped search? [in] 708 * @param alpha Alpha parameter for this scoring system [out] 709 * @param beta Beta parameter for this scoring system [out] 710 */ 711 NCBI_XBLAST_EXPORT 712 Int2 Blast_GetNuclAlphaBeta(Int4 reward, Int4 penalty, Int4 gap_open, 713 Int4 gap_extend, Blast_KarlinBlk* kbp, 714 Boolean gapped_calculation, 715 double *alpha, double *beta); 716 717 /** Rescale the PSSM, using composition-based statistics, for use 718 * with RPS BLAST. This function produces a PSSM for a single RPS DB 719 * sequence (of size db_seq_length) and incorporates information from 720 * the RPS blast query. Each individual database sequence must call this 721 * function to retrieve its own PSSM. The matrix is returned (and must 722 * be freed elsewhere). posMatrix is the portion of the complete 723 * concatenated PSSM that is specific to this DB sequence 724 * @todo revise to use existing code 725 * @param scalingFactor used to rescale Lambda [in] 726 * @param rps_query_length length of query sequence [in] 727 * @param rps_query_seq the query sequence [in] 728 * @param db_seq_length Length of the database sequence [in] 729 * @param posMatrix matrix (actual) values to be used [in] 730 * @param sbp Structure with score matrix parameters [in][out] 731 * @return rescaled pssm 732 */ 733 NCBI_XBLAST_EXPORT 734 Int4 ** RPSRescalePssm(double scalingFactor, Int4 rps_query_length, 735 const Uint1 * rps_query_seq, Int4 db_seq_length, 736 Int4 **posMatrix, BlastScoreBlk *sbp); 737 738 739 /** 740 * Computes the adjustment to the lengths of the query and database sequences 741 * that is used to compensate for edge effects when computing evalues. 742 * 743 * The length adjustment is an integer-valued approximation to the fixed 744 * point of the function 745 * 746 * f(ell) = beta + 747 * (alpha/lambda) * (log K + log((m - ell)*(n - N ell))) 748 * 749 * where m is the query length n is the length of the database and N is the 750 * number of sequences in the database. The values beta, alpha, lambda and 751 * K are statistical, Karlin-Altschul parameters. 752 * 753 * The value of the length adjustment computed by this routine, A, 754 * will always be an integer smaller than the fixed point of 755 * f(ell). Usually, it will be the largest such integer. However, the 756 * computed length adjustment, A, will also be so small that 757 * 758 * K * (m - A) * (n - N * A) > MAX(m,n). 759 * 760 * Moreover, an iterative method is used to compute A, and under 761 * unusual circumstances the iterative method may not converge. 762 * 763 * @param K the statistical parameter K [in] 764 * @param logK the natural logarithm of K [in] 765 * @param alpha_d_lambda the ratio of the statistical parameters 766 * alpha and lambda (for ungapped alignments, the 767 * value 1/H should be used) [in] 768 * @param beta the statistical parameter beta (for ungapped 769 * alignments, beta == 0) [in] 770 * @param query_length the length of the query sequence [in] 771 * @param db_length the length of the database [in] 772 * @param db_num_seqs the number of sequences in the database [in] 773 * @param length_adjustment the computed value of the length adjustment [out] 774 * 775 * @return 0 if length_adjustment is known to be the largest integer less 776 * than the fixed point of f(ell); 1 otherwise. 777 */ 778 NCBI_XBLAST_EXPORT 779 Int4 780 BLAST_ComputeLengthAdjustment(double K, 781 double logK, 782 double alpha_d_lambda, 783 double beta, 784 Int4 query_length, 785 Int8 db_length, 786 Int4 db_num_seqs, 787 Int4 * length_adjustment); 788 789 790 /** Allocates a new Blast_ResFreq structure and fills in the prob element 791 based upon the contents of sbp. 792 * @param sbp The BlastScoreBlk* used to init prob [in] 793 */ 794 NCBI_XBLAST_EXPORT 795 Blast_ResFreq* Blast_ResFreqNew(const BlastScoreBlk* sbp); 796 797 /** Deallocates Blast_ResFreq and prob0 element. 798 * @param rfp the Blast_ResFreq to be deallocated. 799 */ 800 NCBI_XBLAST_EXPORT 801 Blast_ResFreq* Blast_ResFreqFree(Blast_ResFreq* rfp); 802 803 804 /** Calculates residues frequencies given a standard distribution. 805 * @param sbp the BlastScoreBlk provides information on alphabet. 806 * @param rfp the prob element on this Blast_ResFreq is used. 807 * @return zero on success 808 */ 809 NCBI_XBLAST_EXPORT 810 Int2 Blast_ResFreqStdComp(const BlastScoreBlk* sbp, Blast_ResFreq* rfp); 811 812 /** Creates a new structure to keep track of score frequencies for a scoring 813 * system. 814 * @param score_min Minimum score [in] 815 * @param score_max Maximum score [in] 816 * @return allocated and initialized pointer to Blast_ScoreFreq 817 */ 818 NCBI_XBLAST_EXPORT 819 Blast_ScoreFreq* 820 Blast_ScoreFreqNew(Int4 score_min, Int4 score_max); 821 822 /** Deallocates the score frequencies structure 823 * @param sfp the structure to deallocate [in] 824 * @return NULL 825 */ 826 NCBI_XBLAST_EXPORT 827 Blast_ScoreFreq* 828 Blast_ScoreFreqFree(Blast_ScoreFreq* sfp); 829 830 /** Fills a buffer with the 'standard' alphabet 831 * (given by STD_AMINO_ACID_FREQS[index].ch). 832 * 833 * @param alphabet_code specifies alphabet [in] 834 * @param residues buffer to be filled in [in|out] 835 * @param residue_size size of "residues" buffer [in] 836 * @return Number of residues in alphabet or negative returns upon error. 837 */ 838 NCBI_XBLAST_EXPORT 839 Int2 840 Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1* residues, 841 Uint4 residue_size); 842 843 /** Computes the parameters lambda, H K for use in calculating the 844 * statistical significance of high-scoring segments or subalignments (see 845 * comment on blast_stat.c for more details). 846 * @param kbp object containing Lambda, H, and K as well as scoring information [in|out] 847 * @param sfp array of probabilities for all scores [in] 848 * @return zero on success, 1 on error. 849 */ 850 NCBI_XBLAST_EXPORT 851 Int2 852 Blast_KarlinBlkUngappedCalc(Blast_KarlinBlk* kbp, Blast_ScoreFreq* sfp); 853 854 /** Given a sequence of 'length' amino acid residues, compute the 855 * probability of each residue and put that in the array resProb 856 * Excludes ambiguity characters. 857 * 858 * @param sequence the sequence to be computed upon [in] 859 * @param length the length of the sequence [in] 860 * @param resProb the object to be filled in [in|out] 861 */ 862 NCBI_XBLAST_EXPORT 863 void 864 Blast_FillResidueProbability(const Uint1* sequence, Int4 length, double * resProb); 865 866 /** Fill in the matrix for blastn using the penaly and rewards 867 * The query sequence alphabet is blastna, the subject sequence 868 * is ncbi2na. The alphabet blastna is defined in blast_stat.h 869 * and the first four elements of blastna are identical to ncbi2na. 870 * if sbp->matrix==NULL, it is allocated. 871 * @param sbp the BlastScoreBlk on which reward, penalty, and matrix will be set 872 [in|out] 873 * @return zero on success. 874 */ 875 NCBI_XBLAST_EXPORT 876 Int2 BlastScoreBlkNuclMatrixCreate(BlastScoreBlk* sbp); 877 878 /** Returns a pointer to the static compiled in version of the 879 * matrix. If name is NULL or the matrix is not compiled in 880 * NULL is returned. 881 * @param name matrix name [in] 882 * @return pointer to matrix or NULL if not supported. 883 */ 884 NCBI_XBLAST_EXPORT 885 SNCBIPackedScoreMatrix* BlastScoreBlkGetCompiledInMatrix(const char* name); 886 887 #ifdef __cplusplus 888 } 889 #endif 890 #endif /* !ALGO_BLAST_CORE__BLAST_STAT__H */ 891