1 /* $Id: blast_filter.h,v 1.46 2010/05/12 12:54:32 kazimird Exp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Ilya Dondoshansky
27 *
28 */
29
30 /** @file blast_filter.h
31 * BLAST filtering functions. @todo FIXME: contains more than filtering
32 * functions, combine with blast_dust.h?
33 */
34
35 #ifndef ALGO_BLAST_CORE__BLAST_FILTER__H
36 #define ALGO_BLAST_CORE__BLAST_FILTER__H
37
38 #include <algo/blast/core/ncbi_std.h>
39 #include <algo/blast/core/blast_def.h>
40 #include <algo/blast/core/blast_program.h>
41 #include <algo/blast/core/blast_query_info.h>
42 #include <algo/blast/core/blast_message.h>
43 #include <algo/blast/core/blast_options.h>
44
45 #ifdef __cplusplus
46 extern "C" {
47 #endif
48
49 /** BLASTNA element used to mask bases in BLAST */
50 NCBI_XBLAST_EXPORT
51 extern const Uint1 kNuclMask;
52 /** NCBISTDAA element used to mask residues in BLAST */
53 NCBI_XBLAST_EXPORT
54 extern const Uint1 kProtMask;
55
56 /** Repeats filtering default options. */
57 #define REPEATS_SEARCH_EVALUE 0.1 /**< Default e-value threshold, keep for C toolkit */
58 #define REPEATS_SEARCH_MINSCORE 26 /**< Default score cutoff */
59 #define REPEATS_SEARCH_PENALTY -1 /**< Default mismatch penalty */
60 #define REPEATS_SEARCH_REWARD 1 /**< Default match reward */
61 #define REPEATS_SEARCH_GAP_OPEN 2 /**< Default gap opening cost */
62 #define REPEATS_SEARCH_GAP_EXTEND 1 /**< Default gap extension cost */
63 #define REPEATS_SEARCH_WORD_SIZE 11 /**< Default word size */
64 #define REPEATS_SEARCH_XDROP_UNGAPPED 40/**< Default X-dropoff for ungapped
65 extension */
66 #define REPEATS_SEARCH_XDROP_FINAL 90 /**< Default X-dropoff for gapped
67 extension with traceback */
68 #define REPEATS_SEARCH_FILTER_STRING "F"/**< Default filter string -
69 no filtering */
70
71 /** Largest gap allowed to be filled between repeat mask intervals */
72 #define REPEAT_MASK_LINK_VALUE 5
73
74 /** Create and initialize a new sequence interval.
75 * @param head existing BlastSeqLoc to append onto, if *head
76 * is NULL then it will be set to new BlastSeqLoc, may be NULL [in|out]
77 * @param from Start of the interval [in]
78 * @param to End of the interval [in]
79 * @return Pointer to the allocated BlastSeqLoc structure (i.e.: tail of the
80 * list).
81 */
82 NCBI_XBLAST_EXPORT
83 BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, Int4 from, Int4 to);
84
85 /** Appends the BlastSeqLoc to the list of BlastSeqLoc-s pointed to by head.
86 * @param head Pointer to the head of the linked list of BlastSeqLoc-s [in]
87 * @param node Pointer to the node to be added to the list. If this is NULL,
88 * this function does nothing. [in]
89 * @returns pointer to the second argument to this function (i.e.: tail of the
90 * list)
91 */
92 NCBI_XBLAST_EXPORT
93 BlastSeqLoc* BlastSeqLocAppend(BlastSeqLoc** head, BlastSeqLoc* node);
94
95 /** Deallocate a single BlastSeqLoc structure and its contents, without
96 * following its next pointer
97 * @param node structure to deallocate [in]
98 * @return NULL
99 */
100 NCBI_XBLAST_EXPORT
101 BlastSeqLoc* BlastSeqLocNodeFree(BlastSeqLoc* node);
102
103 /** Deallocate all BlastSeqLoc objects in a chain.
104 * @param loc object to be freed [in]
105 * @return NULL pointer returned.
106 */
107 NCBI_XBLAST_EXPORT
108 BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc);
109
110 /** Make a deep copy of the linked list of BlastSeqLoc-s pointed to by its
111 * argument
112 * @param head head of the linked list [in]
113 * @return NULL on NULL input or memory allocation failure, else a copy of the
114 * list and its contents
115 */
116 NCBI_XBLAST_EXPORT
117 BlastSeqLoc* BlastSeqLocListDup(BlastSeqLoc* head);
118
119 /** Converts reverse strand coordinates to forward strand in place.
120 * @param masks BlastSeqLoc to be reversed [in|out]
121 * @param query_length length of query [in]
122 */
123 NCBI_XBLAST_EXPORT
124 void BlastSeqLocReverse(BlastSeqLoc* masks, Int4 query_length);
125
126 /** Go through all mask locations in one sequence and combine any that overlap,
127 * deallocating the unneeded locations.
128 * @param mask_loc The list of masks to be merged (in place) [in|out]
129 * @param link_value Largest gap size between locations for which they
130 * should be linked together [in]
131 */
132 NCBI_XBLAST_EXPORT
133 void
134 BlastSeqLocCombine(BlastSeqLoc** mask_loc, Int4 link_value);
135
136 /** Allocate memory for a BlastMaskLoc.
137 * @param total number of contexts for which SSeqLocs should be allocated
138 * (result of number of queries * number of contexts for given program) [in]
139 * @return Pointer to the allocated BlastMaskLoc structure.
140 */
141 NCBI_XBLAST_EXPORT
142 BlastMaskLoc* BlastMaskLocNew(Int4 total);
143
144 /**
145 * @brief Perform a deep copy of the BlastMaskLoc structure passed to this
146 * function
147 *
148 * @param mask_loc Source masking location structure [in]
149 *
150 * @return Deep copy of its argument, or NULL if the argument was NULL or if
151 * not enough memory was available
152 */
153 NCBI_XBLAST_EXPORT
154 BlastMaskLoc* BlastMaskLocDup(const BlastMaskLoc* mask_loc);
155
156 /** Deallocate memory for a BlastMaskLoc structure
157 * as well as the BlastSeqLoc's pointed to.
158 * @param mask_loc the object to be deleted [in]
159 * @return NULL pointer
160 */
161 NCBI_XBLAST_EXPORT
162 BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc);
163
164 /** Given a BlastMaskLoc with an array of lists of DNA mask locations,
165 * substitutes that array by a new array of per-protein-frame mask location
166 * lists.
167 * @param mask_loc Mask locations structure. This structure can have either
168 * masks for all frames in nucleotide coordinates (e.g.: the results of
169 * translating protein masks to nucleotide) or a single mask per query
170 * (i.e.: location NUM_FRAMES*query_index). In the latter case, this mask will
171 * be used for all frames. [in|out]
172 * @param query_info Query information structure, containing contexts data [in]
173 * @note This function does NOT take into consideration the strands requested
174 * to be searched, which is INCONSISTENT with what the C++ API does.
175 */
176 NCBI_XBLAST_EXPORT
177 Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc,
178 const BlastQueryInfo* query_info);
179
180 /** Given a BlastMaskLoc with an array of lists of mask locations per protein
181 * frame, recalculates all mask offsets in terms of the DNA sequence.
182 * @param mask_loc Mask locations structure [in|out]
183 * @param query_info Query information structure, containing contexts data [in]
184 */
185 NCBI_XBLAST_EXPORT
186 Int2 BlastMaskLocProteinToDNA(BlastMaskLoc* mask_loc,
187 const BlastQueryInfo* query_info);
188
189 /** This function takes the list of mask locations (i.e., regions that
190 * should not be searched or should not be added to the lookup table) and makes up a set
191 * of SSeqRange*'s in the concatenated sequence built from a set of queries,
192 * that should be searched (that is, takes the complement).
193 * If all sequences in the query set are completely filtered, then an
194 * SSeqRange is created and both of its elements (left and right) are set to
195 * -1 to indicate this.
196 * If any of the mask_loc's is NULL, an SSeqRange for the full span of the
197 * respective query sequence is created.
198 * @param program_number Type of BLAST program [in]
199 * @param query_info The query information structure [in]
200 * @param mask_loc All mask locations [in]
201 * @param complement_mask Linked list of SSeqRange*s in the concatenated
202 * sequence to be indexed in the lookup table. [out]
203 */
204 NCBI_XBLAST_EXPORT
205 Int2
206 BLAST_ComplementMaskLocations(EBlastProgramType program_number,
207 const BlastQueryInfo* query_info, const BlastMaskLoc* mask_loc,
208 BlastSeqLoc* *complement_mask);
209
210 /** Runs seg filtering functions, according to the filtering options, returns
211 * BlastSeqLoc*. Should combine all SeqLocs so they are non-redundant.
212 * @param program_number Type of BLAST program [in]
213 * @param sequence The sequence or part of the sequence to be filtered [in]
214 * @param length Length of the (sub)sequence [in]
215 * @param offset Offset into the full sequence [in]
216 * @param filter_options specifies how filtering is to be done [in]
217 * @param seqloc_retval Resulting locations for filtered region. [out]
218 * @param blast_message error messages on error [out]
219 * @return zero on success
220 */
221 NCBI_XBLAST_EXPORT
222 Int2
223 BlastSetUp_Filter(EBlastProgramType program_number,
224 Uint1* sequence,
225 Int4 length,
226 Int4 offset,
227 const SBlastFilterOptions* filter_options,
228 BlastSeqLoc* *seqloc_retval,
229 Blast_Message * *blast_message);
230
231
232 /** Does preparation for filtering and then calls BlastSetUp_Filter
233 * @param query_blk sequence to be filtered [in]
234 * @param query_info info on sequence to be filtered [in]
235 * @param program_number one of blastn,blastp,blastx,etc. [in]
236 * @param filter_options specifies how filtering is to be done [in]
237 * @param filter_out resulting locations for filtered region. [out]
238 * @param blast_message message that needs to be sent back to user. [out]
239 */
240 NCBI_XBLAST_EXPORT
241 Int2
242 BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk,
243 const BlastQueryInfo* query_info,
244 EBlastProgramType program_number,
245 const SBlastFilterOptions* filter_options,
246 BlastMaskLoc** filter_out,
247 Blast_Message* *blast_message);
248
249 /** Masks the letters in buffer.
250 * This is a low-level routine and takes a raw buffer which it assumes
251 * to be in ncbistdaa (protein) or blastna (nucleotide).
252 * @param buffer the sequence to be masked (will be modified, cannot be NULL or
253 * undefined behavior will result). [in|out]
254 * @param length length of the sequence to be masked. [in]
255 * @param is_na nucleotide if TRUE [in]
256 * @param mask_loc the BlastSeqLoc to use for masking [in]
257 * @param reverse minus strand if TRUE [in]
258 * @param offset how far along sequence is 1st residue in buffer [in]
259 */
260 NCBI_XBLAST_EXPORT
261 void
262 Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na,
263 const BlastSeqLoc* mask_loc, Boolean reverse, Int4 offset);
264
265 /** Mask protein letters that are currently unsupported. This routine
266 * is used to make the core ignore letters within protein sequences
267 * that cannot (yet) be correctly handled
268 * @param seq Protein sequence to be masked (ncbistdaa format required).
269 * Letters whose numerical value exceeds a cutoff are
270 * converted into kProtMask values [in|out]
271 * @param min_invalid The first ncbistdaa value that is considered invalid.
272 * All sequence letters with numerical value >= this number
273 * are masked [in]
274 */
275 NCBI_XBLAST_EXPORT
276 void
277 Blast_MaskUnsupportedAA(BLAST_SequenceBlk* seq, Uint1 min_invalid);
278
279 /** Masks the sequence given a BlastMaskLoc
280 * @param query_blk sequence to be filtered [in]
281 * @param query_info info on sequence to be filtered [in]
282 * @param filter_maskloc Locations to filter [in]
283 * @param program_number one of blastn,blastp,blastx,etc. [in]
284 */
285 NCBI_XBLAST_EXPORT
286 void
287 BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk,
288 const BlastQueryInfo* query_info,
289 const BlastMaskLoc *filter_maskloc,
290 EBlastProgramType program_number);
291
292 /** Produces SBlastFilterOptions from a string that has been traditionally supported
293 * in blast.
294 * @param program_number Type of BLAST program [in]
295 * @param instructions the string describing the filtering to be done [in]
296 * @param filtering_options the structure to be filled in [out]
297 * @param blast_message optional field for error messages [out]
298 * @return zero on success
299 */
300 NCBI_XBLAST_EXPORT
301 Int2
302 BlastFilteringOptionsFromString(EBlastProgramType program_number,
303 const char* instructions,
304 SBlastFilterOptions* *filtering_options,
305 Blast_Message* *blast_message);
306
307 /// Convert the filtering options structure to a string
308 /// @param filtering_options filtering options structure, assumed to be
309 /// correctly filled in [in]
310 /// @return malloc'd string containing filtering options or NULL if there is
311 /// not enough memory to create the return value. Caller
312 /// must free() return value
313 /// @sa TAutoCharPtr it is recommended that the caller store the return value
314 /// in a TAutoCharPtr if working in C++ (defined in blast_aux.hpp)
315 NCBI_XBLAST_EXPORT
316 char*
317 BlastFilteringOptionsToString(const SBlastFilterOptions* filtering_options);
318
319 /** Determines whether this is a nucleotide query and whether this a minus strand or not
320 *
321 * @param is_na the query is nucleotide
322 * @param context offset in the QueryInfo array
323 * @return TRUE if this is minus strand
324 */
BlastIsReverseStrand(Boolean is_na,Int4 context)325 static NCBI_INLINE Boolean BlastIsReverseStrand(Boolean is_na, Int4 context)
326 {
327 return (is_na && ((context & 1) != 0));
328
329 }
330
331 #ifdef __cplusplus
332 }
333 #endif
334 #endif /* !ALGO_BLAST_CORE__BLAST_FILTER__H */
335