1 /* $Id: hspfilter_besthit.h 274418 2011-04-14 13:40:26Z maning $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Author: Ning Ma 27 * 28 */ 29 30 /** @file hspfilter_besthit.h 31 * Implementation of a number of BlastHSPWriters to save hits from 32 * a BLAST search, and subsequently return them in sorted order. 33 */ 34 35 #ifndef ALGO_BLAST_CORE__HSPFILTER_BESTHIT__H 36 #define ALGO_BLAST_CORE__HSPFILTER_BESTHIT__H 37 38 #include <algo/blast/core/ncbi_std.h> 39 #include <algo/blast/core/blast_program.h> 40 #include <algo/blast/core/blast_options.h> 41 #include <algo/blast/core/blast_hspfilter.h> 42 #include <algo/blast/core/blast_hits.h> 43 #include <connect/ncbi_core.h> 44 45 #ifdef __cplusplus 46 extern "C" { 47 #endif 48 49 /************************************************************************/ 50 /** The "best hit" writer 51 52 Prune the hsp_list for each query and keeps only the best ones. 53 1. For a pair of hits A and B, check based on 10% overhangs whether 54 A can be dropped because of B due to end points of A being within 55 10% extension of B and vice versa. Note that this would allow A 56 to be dropped even if it is at most 20% longer than B. 57 58 2. If A can be dropped because of B, check if Evalue(A) >= Evalue(B); 59 that is A has the same or worse evalue than B. Do the same check for 60 whether B can be dropped because of A. 61 62 3. If A can still be dropped because of B, check if density(A) <= density(B). 63 Do the same check for whether B can be dropped because of A. 64 65 4. If only one can be dropped, then drop that one. If both are mutually 66 replaceable, use length criteria and drop the shorter one 67 only if it is at least 10% shorter (90% coverage). 68 69 So, essentially length coverage is being used a tie-breaker and if the 70 tie-breaker does not break the tie, both alignments are kept. Above is not 71 very different than what you have now, just rearranged in conditions 72 so that we do not have non-deterministic behavior between a pair of 73 alignments. We could still have issues with cascades where A was dropped 74 because of B and then B gets dropped because of C, but A would not have 75 been dropped because of C becuase of condition 4. However, I think this 76 will be extremely rare. 77 */ 78 79 /// Default value for overhang 80 #define kBestHit_OverhangDflt 0.1 81 /// Minimum value for overhang 82 #define kBestHit_OverhangMin 0.0 83 /// Maximum value for overhang 84 #define kBestHit_OverhangMax 0.5 85 86 /// Default value for score_edge 87 #define kBestHit_ScoreEdgeDflt 0.1 88 /// Minimum value for score_edge 89 #define kBestHit_ScoreEdgeMin 0.0 90 /// Maximum value for score_edge 91 #define kBestHit_ScoreEdgeMax 0.5 92 93 /** Keeps parameters used in best hit algorithm.*/ 94 typedef struct BlastHSPBestHitParams { 95 EBlastProgramType program;/**< program type. */ 96 Int4 prelim_hitlist_size; /**< number of hits saved during preliminary 97 part of search. */ 98 Int4 hsp_num_max; /**< number of HSPs to save per db sequence. */ 99 double overhang; /**< overhang used in condition 1. */ 100 double score_edge; /**< fraction of score margin in condition 4*/ 101 } BlastHSPBestHitParams; 102 103 /** create a set of parameters 104 * @param program Blast program type.[in] 105 * @param hit_options field hitlist_size and hsp_num_max needed, a pointer to 106 * this structure will be stored on resulting structure.[in] 107 * @param best_hit_opts Specifies the ratio of overhang to length, which is used to 108 determine if hit A is contained in hit B 109 * @param compostionBasedStats the compsotion based stats needed. [in] 110 * @param gapped_calculation if gapped_calculation is needed. [in] 111 * @return the pointer to the allocated parameter 112 */ 113 NCBI_XBLAST_EXPORT 114 BlastHSPBestHitParams* 115 BlastHSPBestHitParamsNew(const BlastHitSavingOptions* hit_options, 116 const BlastHSPBestHitOptions* best_hit_opts, 117 Int4 compositionBasedStats, 118 Boolean gapped_calculation); 119 120 /** Deallocates the BlastHSPBestHitParams structure passed in 121 * @param opts structure to deallocate [in] 122 * @return NULL 123 */ 124 NCBI_XBLAST_EXPORT 125 BlastHSPBestHitParams* 126 BlastHSPBestHitParamsFree(BlastHSPBestHitParams* opts); 127 128 /** WriterInfo and PipeInfo to create a best hit writer/pipe 129 * @param params Specifies writer parameters. [in] 130 * @return the newly allocated writer/pipe info 131 */ 132 NCBI_XBLAST_EXPORT 133 BlastHSPWriterInfo* 134 BlastHSPBestHitInfoNew(BlastHSPBestHitParams* params); 135 136 NCBI_XBLAST_EXPORT 137 BlastHSPPipeInfo* 138 BlastHSPBestHitPipeInfoNew(BlastHSPBestHitParams* params); 139 140 #ifdef __cplusplus 141 } 142 #endif 143 144 #endif /* !ALGO_BLAST_CORE__HSPFILTER_BESTHIT__H */ 145