1 /*  $Id: hspfilter_besthit.h 274418 2011-04-14 13:40:26Z maning $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Ning Ma
27  *
28  */
29 
30 /** @file hspfilter_besthit.h
31  * Implementation of a number of BlastHSPWriters to save hits from
32  * a BLAST search, and subsequently return them in sorted order.
33  */
34 
35 #ifndef ALGO_BLAST_CORE__HSPFILTER_BESTHIT__H
36 #define ALGO_BLAST_CORE__HSPFILTER_BESTHIT__H
37 
38 #include <algo/blast/core/ncbi_std.h>
39 #include <algo/blast/core/blast_program.h>
40 #include <algo/blast/core/blast_options.h>
41 #include <algo/blast/core/blast_hspfilter.h>
42 #include <algo/blast/core/blast_hits.h>
43 #include <connect/ncbi_core.h>
44 
45 #ifdef __cplusplus
46 extern "C" {
47 #endif
48 
49 /************************************************************************/
50 /** The "best hit" writer
51 
52    Prunes the hsp_list for each query and keeps only the best ones.
53    1. For a pair of hits A and B, check based on 10% overhangs whether
54        A can be dropped because of B due to end points of A being within
55        10% extension of B and vice versa. Note that this would allow A
56        to be dropped even if it is at most 20% longer than B.
57 
58    2. If A can be dropped because of B, check if Evalue(A) >= Evalue(B);
59        that is A has the same or worse evalue than B. Do the same check for
60        whether B can be dropped because of A.
61 
62    3. If A can still be dropped because of B, check if density(A) <= density(B).
63        Do the same check for whether B can be dropped because of A.
64 
65    4. If only one can be dropped, then drop that one. If both are mutually
66        replaceable, use length criteria and drop the shorter one
67        only if it is at least 10% shorter (90% coverage).
68 
69    So, essentially length coverage is being used as a tie-breaker and if the
70    tie-breaker does not break the tie, both alignments are kept. The above is
71    not very different from what you have now, just rearranged in conditions
72    so that we do not have non-deterministic behavior between a pair of
73    alignments. We could still have issues with cascades where A was dropped
74    because of B and then B gets dropped because of C, but A would not have
75    been dropped because of C because of condition 4. However, I think this
76    will be extremely rare.
77   */
78 
79 /// Default value for overhang (the 10% overhang referred to in condition 1)
80 #define kBestHit_OverhangDflt 0.1
81 /// Minimum value for overhang
82 #define kBestHit_OverhangMin 0.0
83 /// Maximum value for overhang
84 #define kBestHit_OverhangMax 0.5
85 
86 /// Default value for score_edge (fraction of score margin, see condition 4)
87 #define kBestHit_ScoreEdgeDflt 0.1
88 /// Minimum value for score_edge
89 #define kBestHit_ScoreEdgeMin  0.0
90 /// Maximum value for score_edge
91 #define kBestHit_ScoreEdgeMax  0.5
92 
93 /** Keeps the parameters used by the best hit algorithm. */
94 typedef struct BlastHSPBestHitParams {
95    EBlastProgramType program;/**< BLAST program type. */
96    Int4 prelim_hitlist_size; /**< number of hits saved during the preliminary
97                                   part of the search. */
98    Int4 hsp_num_max;         /**< number of HSPs to save per db sequence. */
99    double overhang;          /**< overhang fraction used in condition 1. */
100    double score_edge;        /**< fraction of score margin in condition 4. */
101 } BlastHSPBestHitParams;
102 
103 /** Creates a set of parameters for the best hit algorithm.
104  * @param hit_options fields hitlist_size and hsp_num_max are needed. [in]
105  *      NOTE(review): BlastHSPBestHitParams has no member pointing back to
106  *      hit_options, so the values are presumably copied - confirm.
107  * @param best_hit_opts Specifies the ratio of overhang to length, which is
108  *      used to determine if hit A is contained in hit B. [in]
109  * @param compositionBasedStats the composition based stats needed. [in]
110  * @param gapped_calculation if gapped_calculation is needed. [in]
111  * @return the pointer to the allocated parameter
112  */
113 NCBI_XBLAST_EXPORT
114 BlastHSPBestHitParams*
115 BlastHSPBestHitParamsNew(const BlastHitSavingOptions* hit_options,
116                          const BlastHSPBestHitOptions* best_hit_opts,
117                          Int4 compositionBasedStats,
118                          Boolean gapped_calculation);
119 
120 /** Deallocates the BlastHSPBestHitParams structure passed in.
121  * @param opts structure to deallocate [in]; NOTE(review): whether NULL is
122  *      accepted is not visible from this header - confirm.
123  * @return NULL */
124 NCBI_XBLAST_EXPORT
125 BlastHSPBestHitParams*
126 BlastHSPBestHitParamsFree(BlastHSPBestHitParams* opts);
127 
128 /** Creates the WriterInfo used to create a best hit writer; the PipeInfo
129  * variant, BlastHSPBestHitPipeInfoNew, is declared right after this one.
130  * @param params Specifies writer/pipe parameters. [in]
131  * @return the newly allocated writer/pipe info */
132 NCBI_XBLAST_EXPORT
133 BlastHSPWriterInfo*
134 BlastHSPBestHitInfoNew(BlastHSPBestHitParams* params);
135 /** Creates the PipeInfo used to create a best hit pipe; params specifies the pipe parameters [in]. */
136 NCBI_XBLAST_EXPORT
137 BlastHSPPipeInfo*
138 BlastHSPBestHitPipeInfoNew(BlastHSPBestHitParams* params);
139 
140 #ifdef __cplusplus
141 }
142 #endif
143 
144 #endif /* !ALGO_BLAST_CORE__HSPFILTER_BESTHIT__H */
145