1 /* $Id: twoseq_api.h,v 1.14 2009/05/27 19:29:17 camacho Exp $ 2 *************************************************************************** 3 * * 4 * COPYRIGHT NOTICE * 5 * * 6 * This software/database is categorized as "United States Government * 7 * Work" under the terms of the United States Copyright Act. It was * 8 * produced as part of the author's official duties as a Government * 9 * employee and thus can not be copyrighted. This software/database is * 10 * freely available to the public for use without a copyright notice. * 11 * Restrictions can not be placed on its present or future use. * 12 * * 13 * Although all reasonable efforts have been taken to ensure the accuracy * 14 * and reliability of the software and data, the National Library of * 15 * Medicine (NLM) and the U.S. Government do not and can not warrant the * 16 * performance or results that may be obtained by using this software, * 17 * data, or derivative works thereof. The NLM and the U.S. Government * 18 * disclaim any and all warranties, expressed or implied, as to the * 19 * performance, merchantability or fitness for any particular purpose or * 20 * use. * 21 * * 22 * In any work or product derived from this material, proper attribution * 23 * of the author(s) as the source of the software or data would be * 24 * appreciated. * 25 * * 26 * Author: Jason Papadopoulos * 27 * * 28 ***************************************************************************/ 29 30 /** @file twoseq_api.h 31 * Functions for C toolkit applications to compare two sequences using the 32 * rewritten BLAST engine. 33 */ 34 35 #ifndef _TWOSEQ_API_H_ 36 #define _TWOSEQ_API_H_ 37 38 #include <ncbi.h> 39 #include <objseq.h> 40 #include <tofasta.h> 41 #include <sqnutils.h> 42 #include <algo/blast/api/blast_returns.h> 43 #include <algo/blast/api/blast_options_api.h> 44 #include <algo/blast/api/blast_seqalign.h> 45 46 /** @addtogroup CToolkitAlgoBlast 47 * 48 * @{ 49 */ 50 51 /** Maximal query length, for which Blastn is used as default. Mega BLAST or 52 * discontiguous Mega BLAST are set to be default for fast or sensitive 53 * searches, if query is longer than this cutoff. 54 */ 55 #define MEGABLAST_CUTOFF 10000 56 57 /** 58 * The type of blast search to perform. For nucleotide searches, 59 * the blastn algorithm is used unless the first input sequence 60 * exceeds MEGABLAST_CUTOFF bases in size. In that case, megablast 61 * is used instead. If the blast_hint is eSensitive, discontiguous 62 * megablast with word size 11 is used (and any user-specified 63 * word size is ignored). 64 */ 65 enum blast_type { 66 eChoose = 100, /**< Choose type of search by sequences molecule type: 67 n-n=blastn, p-p=blastp, n-p=blastx, p-n=tblastn */ 68 eBlastn = 101, /**< blastn or megablast (determined automatically) */ 69 eBlastp = 102, /**< blastp search between protein sequences */ 70 eBlastx = 103, /**< blastx for nucleotide vs protein sequences */ 71 eTblastn = 104, /**< tblastn for protein vs nucleotide sequences */ 72 eTblastx = 105 /**< tblastx for translated nucleotide sequences */ 73 }; 74 75 /** 76 * Provide a hint on how the search is to be set up. At 77 * present this only applies to nucleotide searches 78 */ 79 enum blast_hint { 80 eBlastHint_Sensitive = 0, /**< trade off speed for sensitivity */ 81 eBlastHint_Fast = 1, /**< trade off sensitivity for speed */ 82 eBlastHint_None = 2 /**< no hint provided, do not attempt to guess what is desired. */ 83 }; 84 85 typedef enum seed_type { 86 eDefaultSeedType = 0, /**< BLAST will decide which method to use based on 87 program and other information. */ 88 eOneHit = 1, /**< Require only one initial hit for extension */ 89 eTwoHits = 2 /**< Require more than one hit within a window 90 for extension */ 91 } seed_type; 92 93 /** 94 * The main user-visible setup structure for the API. This 95 * only makes a (small) subset of the complete options available 96 */ 97 typedef struct { 98 enum blast_hint hint; /**< for nucleotide searches, how should 99 the search be set up? 100 Default = eSensitive */ 101 enum blast_type program; /**< the BLAST program to use. 102 Default = eChoose */ 103 char strand; /**< For nucleotide searches, the strand 104 of the first sequence to check: 105 choices are Seq_strand_{plus|minus|both} 106 Default is Seq_strand_both */ 107 double cutoff_evalue; /**< Alignments whose E value is larger than 108 this number are discarded. Default 10.0 */ 109 char* matrix; /**< The scoring matrix to use (protein 110 searches only). NULL means "BLOSUM62". 111 Default is NULL */ 112 char* filter_string; /**< Specifies filtering to apply to the 113 first of the two input sequences. 114 NULL or "T" implies DUST/SEG, "F" 115 turns off filtering. Default = NULL */ 116 Int4 word_size; /**< The word size to use. 0 chooses the 117 default for the specified program 118 (i.e. 3 for blastp, 11 for blastn, 119 28 for blastn with large sequences). 120 Default = 0 */ 121 Boolean gapped_calculation; /**< Perform gapped alignments. Default = TRUE*/ 122 Boolean use_megablast; /**< Use megablast for the search. Default = FALSE. */ 123 Int4 nucleotide_match; /**< For nucleotide searches, the reward 124 for matching letters (default 1) */ 125 Int4 nucleotide_mismatch; /**< For nucleotide searches, the penalty 126 for mismatching letters (default -3) */ 127 Int4 gap_open; /**< Cost of opening a gap. Default=0, invokes 128 default values: 5 for nucleotide; 129 depends on matrix for protein search.*/ 130 Int4 gap_extend; /**< Cost of extending a gap. Default=0, 131 invokes default values: 2 for nucleotide; 132 depends on matrix for protein search.*/ 133 Int4 gap_x_dropoff; /**< Dropoff value for the gapped extension. 134 Default=0, invokes default values. */ 135 double db_length; /**< Database length to use in statistical 136 calculations. 137 Default=0 means "database length" is set 138 to the subject sequence length for each 139 subject sequence. */ 140 Int4 word_threshold; /**< Threshold for finding neighboring words 141 in protein searches. Default=0, which 142 invokes default values*/ 143 Int4 longest_intron; /**< Used in uneven sum gap statistics. Only used 144 with tblastn right now. Default = 0 (turned off) */ 145 seed_type init_seed_method; /**< Single-hit or multiple-hit choice of 146 initial seeds for extension. */ 147 } BLAST_SummaryOptions; 148 149 150 /** 151 * Allocate storage for an API setup structure and set the 152 * default options for it. 153 * 154 * @param options pointer to be updated with newly allocated structure [out] 155 * @return 0 for successful allocation, -1 otherwise 156 */ 157 Int2 BLAST_SummaryOptionsInit(BLAST_SummaryOptions **options); 158 159 /** 160 * Free the storage previously allocated for an API setup structure 161 * 162 * @param options pointer tothe structure to be freed [in] 163 * @return always NULL 164 */ 165 BLAST_SummaryOptions* BLAST_SummaryOptionsFree(BLAST_SummaryOptions *options); 166 167 /** 168 * Perform a BLAST search on the two input sequences and return 169 * the list of alignments the search generates 170 * @param options structure describing how the search will be configured [in] 171 * @param bsp1 the first sequence to be compared. Filtering and selection 172 * of nucleotide strand apply only to this sequence [in] 173 * @param bsp2 the second sequence to be compared [in] 174 * @param seqalign_out the list of alignments generated by the search. 175 * If search failed or no alignments were found, set to NULL [out] 176 * @return 0 for a successful search, nonzero if search failed 177 */ 178 Int2 BLAST_TwoSequencesSearch(BLAST_SummaryOptions *options, 179 Bioseq *bsp1, 180 Bioseq *bsp2, 181 SeqAlign **seqalign_out); 182 183 /** Creates the advanced search options structure from the basic options. 184 * @param basic_options Basic options for the two sequences search [in] 185 * @param query_seqloc Query Seq-loc, needed to find query length. [in] 186 * @param extra_returns Initialized summary returns structure. [in] 187 * @param search_options Populated advanced options structure [out] 188 * @param program_name Program name [out] 189 */ 190 Int2 191 Blast_SearchOptionsFromSummaryOptions(const BLAST_SummaryOptions *basic_options, 192 SeqLoc* query_seqloc, 193 Blast_SummaryReturn* extra_returns, 194 SBlastOptions* *search_options, 195 char* *program_name); 196 197 /** 198 * Perform a BLAST search on the two input sequences and return 199 * the list of alignments the search generates 200 * @param options Structure describing how the search will be configured [in] 201 * @param seqloc1 The first list of sequences (queries) to be compared. 202 * Filtering is applied only to these sequences [in] 203 * @param seqloc2 The second list of sequences (subjects) to be compared [in] 204 * @param masking_locs locations to be used for masking [in] 205 * @param seqalign_arr Object containing the SeqAligns. [in|out] 206 * @param filter_out Masking locations [out] 207 * @param mask_at_hash set to TRUE if filtering only on lookup table [out] 208 * @param extra_returns Data needed to print the bottom of BLAST report [out] 209 * @return 0 for a successful search, nonzero if search failed 210 */ 211 Int2 BLAST_TwoSeqLocSets(const BLAST_SummaryOptions *options, 212 SeqLoc* seqloc1, SeqLoc* seqloc2, 213 SeqLoc* masking_locs, 214 SBlastSeqalignArray* *seqalign_arr, 215 SeqLoc** filter_out, 216 Boolean* mask_at_hash, 217 Blast_SummaryReturn* *extra_returns); 218 219 /* @} */ 220 221 #endif /* !_TWOSEQ_API_H_ */ 222