1 /* $Id: twoseq_api.h,v 1.14 2009/05/27 19:29:17 camacho Exp $
2 ***************************************************************************
3 *                                                                         *
4 *                             COPYRIGHT NOTICE                            *
5 *                                                                         *
6 * This software/database is categorized as "United States Government      *
7 * Work" under the terms of the United States Copyright Act.  It was       *
8 * produced as part of the author's official duties as a Government        *
9 * employee and thus can not be copyrighted.  This software/database is    *
10 * freely available to the public for use without a copyright notice.      *
11 * Restrictions can not be placed on its present or future use.            *
12 *                                                                         *
13 * Although all reasonable efforts have been taken to ensure the accuracy  *
14 * and reliability of the software and data, the National Library of       *
15 * Medicine (NLM) and the U.S. Government do not and can not warrant the   *
16 * performance or results that may be obtained by using this software,     *
17 * data, or derivative works thereof.  The NLM and the U.S. Government     *
18 * disclaim any and all warranties, expressed or implied, as to the        *
19 * performance, merchantability or fitness for any particular purpose or   *
20 * use.                                                                    *
21 *                                                                         *
22 * In any work or product derived from this material, proper attribution   *
23 * of the author(s) as the source of the software or data would be         *
24 * appreciated.                                                            *
25 *                                                                         *
26 * Author: Jason Papadopoulos                                              *
27 *                                                                         *
28 ***************************************************************************/
29 
30 /** @file twoseq_api.h
31  * Functions for C toolkit applications to compare two sequences using the
32  * rewritten BLAST engine.
33  */
34 
35 #ifndef _TWOSEQ_API_H_
36 #define _TWOSEQ_API_H_
37 
38 #include <ncbi.h>
39 #include <objseq.h>
40 #include <tofasta.h>
41 #include <sqnutils.h>
42 #include <algo/blast/api/blast_returns.h>
43 #include <algo/blast/api/blast_options_api.h>
44 #include <algo/blast/api/blast_seqalign.h>
45 
46 /** @addtogroup CToolkitAlgoBlast
47  *
48  * @{
49  */
50 
/** Query length threshold (in bases) for the default nucleotide algorithm.
 * Blastn is the default for queries up to this length. For longer queries,
 * Mega BLAST (fast searches) or discontiguous Mega BLAST (sensitive
 * searches) becomes the default instead.
 */
#define MEGABLAST_CUTOFF 10000
56 
/**
 * The type of blast search to perform.
 *
 * For nucleotide searches, the blastn algorithm is used unless the first
 * input sequence exceeds MEGABLAST_CUTOFF bases in size; in that case
 * megablast is used instead. If the blast_hint is eBlastHint_Sensitive,
 * discontiguous megablast with word size 11 is used (and any
 * user-specified word size is ignored).
 */
enum blast_type {
    /** Choose the type of search from the molecule types of the sequences:
     *  n-n=blastn, p-p=blastp, n-p=blastx, p-n=tblastn */
    eChoose = 100,

    /** blastn or megablast (determined automatically) */
    eBlastn = 101,

    /** blastp search between protein sequences */
    eBlastp = 102,

    /** blastx for nucleotide vs protein sequences */
    eBlastx = 103,

    /** tblastn for protein vs nucleotide sequences */
    eTblastn = 104,

    /** tblastx for translated nucleotide sequences */
    eTblastx = 105
};
74 
/**
 * A hint describing how the search should be set up.
 * At present this only applies to nucleotide searches.
 */
enum blast_hint {
    /** trade off speed for sensitivity */
    eBlastHint_Sensitive = 0,

    /** trade off sensitivity for speed */
    eBlastHint_Fast = 1,

    /** no hint provided, do not attempt to guess what is desired */
    eBlastHint_None = 2
};
84 
/**
 * Single-hit or multiple-hit choice of initial seeds for extension.
 */
typedef enum seed_type {
    /** BLAST will decide which method to use based on program and
     *  other information. */
    eDefaultSeedType = 0,

    /** Require only one initial hit for extension */
    eOneHit = 1,

    /** Require more than one hit within a window for extension */
    eTwoHits = 2
} seed_type;
92 
93 /**
94   * The main user-visible setup structure for the API. This
95   * only makes a (small) subset of the complete options available
96   */
97 typedef struct {
98     enum blast_hint hint;       /**< for nucleotide searches, how should
99                                      the search be set up?
100                                      Default = eSensitive */
101     enum blast_type program;    /**< the BLAST program to use.
102                                      Default = eChoose */
103     char strand;                /**< For nucleotide searches, the strand
104                                      of the first sequence to check:
105                                      choices are Seq_strand_{plus|minus|both}
106                                      Default is Seq_strand_both */
107     double cutoff_evalue;       /**< Alignments whose E value is larger than
108                                      this number are discarded. Default 10.0 */
109     char* matrix;               /**< The scoring matrix to use (protein
110                                      searches only). NULL means "BLOSUM62".
111                                      Default is NULL */
112     char* filter_string;        /**< Specifies filtering to apply to the
113                                      first of the two input sequences.
114                                      NULL or "T" implies DUST/SEG, "F"
115                                      turns off filtering. Default = NULL */
116     Int4 word_size;             /**< The word size to use. 0 chooses the
117                                      default for the specified program
118                                      (i.e. 3 for blastp, 11 for blastn,
119                                      28 for blastn with large sequences).
120                                      Default = 0 */
121     Boolean gapped_calculation; /**< Perform gapped alignments. Default = TRUE*/
122     Boolean use_megablast;      /**< Use megablast for the search. Default = FALSE. */
123     Int4 nucleotide_match;      /**< For nucleotide searches, the reward
124                                      for matching letters (default 1) */
125     Int4 nucleotide_mismatch;   /**< For nucleotide searches, the penalty
126                                      for mismatching letters (default -3) */
127     Int4 gap_open;              /**< Cost of opening a gap. Default=0, invokes
128                                      default values: 5 for nucleotide;
129                                      depends on matrix for protein search.*/
130     Int4 gap_extend;            /**< Cost of extending a gap. Default=0,
131                                      invokes default values: 2 for nucleotide;
132                                      depends on matrix for protein search.*/
133     Int4 gap_x_dropoff;         /**< Dropoff value for the gapped extension.
134                                      Default=0, invokes default values. */
135     double db_length;           /**< Database length to use in statistical
136                                      calculations.
137                                      Default=0 means "database length" is set
138                                      to the subject sequence length for each
139                                      subject sequence. */
140     Int4 word_threshold;        /**< Threshold for finding neighboring words
141                                      in protein searches. Default=0, which
142                                      invokes default values*/
143     Int4 longest_intron;        /**< Used in uneven sum gap statistics. Only used
144                                      with tblastn right now.  Default = 0 (turned off) */
145     seed_type init_seed_method; /**< Single-hit or multiple-hit choice of
146                                      initial seeds for extension. */
147 } BLAST_SummaryOptions;
148 
149 
150 /**
151   * Allocate storage for an API setup structure and set the
152   * default options for it.
153   *
154   * @param options pointer to be updated with newly allocated structure [out]
155   * @return 0 for successful allocation, -1 otherwise
156   */
157 Int2 BLAST_SummaryOptionsInit(BLAST_SummaryOptions **options);
158 
159 /**
160   * Free the storage previously allocated for an API setup structure
161   *
162   * @param options pointer tothe structure to be freed [in]
163   * @return always NULL
164   */
165 BLAST_SummaryOptions* BLAST_SummaryOptionsFree(BLAST_SummaryOptions *options);
166 
167 /**
168   * Perform a BLAST search on the two input sequences and return
169   * the list of alignments the search generates
170   * @param options structure describing how the search will be configured [in]
171   * @param bsp1 the first sequence to be compared. Filtering and selection
172   *             of nucleotide strand apply only to this sequence [in]
173   * @param bsp2 the second sequence to be compared [in]
174   * @param seqalign_out the list of alignments generated by the search.
175   *             If search failed or no alignments were found, set to NULL [out]
176   * @return 0 for a successful search, nonzero if search failed
177   */
178 Int2 BLAST_TwoSequencesSearch(BLAST_SummaryOptions *options,
179                               Bioseq *bsp1,
180                               Bioseq *bsp2,
181                               SeqAlign **seqalign_out);
182 
183 /** Creates the advanced search options structure from the basic options.
184  * @param basic_options Basic options for the two sequences search [in]
185  * @param query_seqloc Query Seq-loc, needed to find query length. [in]
186  * @param extra_returns Initialized summary returns structure. [in]
187  * @param search_options Populated advanced options structure [out]
188  * @param program_name Program name [out]
189  */
190 Int2
191 Blast_SearchOptionsFromSummaryOptions(const BLAST_SummaryOptions *basic_options,
192                                       SeqLoc* query_seqloc,
193                                       Blast_SummaryReturn* extra_returns,
194                                       SBlastOptions* *search_options,
195                                       char* *program_name);
196 
197 /**
198   * Perform a BLAST search on the two input sequences and return
199   * the list of alignments the search generates
200   * @param options Structure describing how the search will be configured [in]
201   * @param seqloc1 The first list of sequences (queries) to be compared.
202   *                Filtering is applied only to these sequences [in]
203   * @param seqloc2 The second list of sequences (subjects) to be compared [in]
204   * @param masking_locs locations to be used for masking [in]
205   * @param seqalign_arr Object containing the SeqAligns. [in|out]
206   * @param filter_out Masking locations [out]
207   * @param mask_at_hash set to TRUE if filtering only on lookup table [out]
208   * @param extra_returns Data needed to print the bottom of BLAST report [out]
209   * @return 0 for a successful search, nonzero if search failed
210   */
211 Int2 BLAST_TwoSeqLocSets(const BLAST_SummaryOptions *options,
212                          SeqLoc* seqloc1, SeqLoc* seqloc2,
213                          SeqLoc* masking_locs,
214                          SBlastSeqalignArray* *seqalign_arr,
215                          SeqLoc** filter_out,
216                          Boolean* mask_at_hash,
217                          Blast_SummaryReturn* *extra_returns);
218 
/** @} */
220 
221 #endif  /* !_TWOSEQ_API_H_ */
222