1 /* $Id: blast_filter.h,v 1.46 2010/05/12 12:54:32 kazimird Exp $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Ilya Dondoshansky
27  *
28  */
29 
30 /** @file blast_filter.h
31  * BLAST filtering functions. @todo FIXME: contains more than filtering
32  * functions, combine with blast_dust.h?
33  */
34 
35 #ifndef ALGO_BLAST_CORE__BLAST_FILTER__H
36 #define ALGO_BLAST_CORE__BLAST_FILTER__H
37 
38 #include <algo/blast/core/ncbi_std.h>
39 #include <algo/blast/core/blast_def.h>
40 #include <algo/blast/core/blast_program.h>
41 #include <algo/blast/core/blast_query_info.h>
42 #include <algo/blast/core/blast_message.h>
43 #include <algo/blast/core/blast_options.h>
44 
45 #ifdef __cplusplus
46 extern "C" {
47 #endif
48 
49 /** BLASTNA element used to mask bases in BLAST */
50 NCBI_XBLAST_EXPORT
51 extern const Uint1 kNuclMask;
52 /** NCBISTDAA element used to mask residues in BLAST */
53 NCBI_XBLAST_EXPORT
54 extern const Uint1 kProtMask;
55 
56 /** Repeats filtering default options. */
57 #define REPEATS_SEARCH_EVALUE 0.1       /**< Default e-value threshold, keep for C toolkit */
58 #define REPEATS_SEARCH_MINSCORE 26       /**< Default score cutoff */
59 #define REPEATS_SEARCH_PENALTY -1       /**< Default mismatch penalty */
60 #define REPEATS_SEARCH_REWARD 1       /**< Default match reward */
61 #define REPEATS_SEARCH_GAP_OPEN 2       /**< Default gap opening cost */
62 #define REPEATS_SEARCH_GAP_EXTEND 1     /**< Default gap extension cost */
63 #define REPEATS_SEARCH_WORD_SIZE 11     /**< Default word size */
64 #define REPEATS_SEARCH_XDROP_UNGAPPED 40/**< Default X-dropoff for ungapped
65                                            extension */
66 #define REPEATS_SEARCH_XDROP_FINAL 90   /**< Default X-dropoff for gapped
67                                            extension with traceback */
68 #define REPEATS_SEARCH_FILTER_STRING "F"/**< Default filter string -
69                                            no filtering */
70 
71 /** Largest gap allowed to be filled between repeat mask intervals */
72 #define REPEAT_MASK_LINK_VALUE 5
73 
74 /** Create and initialize a new sequence interval.
75  * @param head existing BlastSeqLoc to append onto, if *head
76  *   is NULL then it will be set to new BlastSeqLoc, may be NULL [in|out]
77  * @param from Start of the interval [in]
78  * @param to End of the interval [in]
79  * @return Pointer to the allocated BlastSeqLoc structure (i.e.: tail of the
80  * list).
81  */
82 NCBI_XBLAST_EXPORT
83 BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, Int4 from, Int4 to);
84 
85 /** Appends the BlastSeqLoc to the list of BlastSeqLoc-s pointed to by head.
86  * @param head Pointer to the head of the linked list of BlastSeqLoc-s [in]
87  * @param node Pointer to the node to be added to the list. If this is NULL,
88  * this function does nothing. [in]
89  * @returns pointer to the second argument to this function (i.e.: tail of the
90  * list)
91  */
92 NCBI_XBLAST_EXPORT
93 BlastSeqLoc* BlastSeqLocAppend(BlastSeqLoc** head, BlastSeqLoc* node);
94 
95 /** Deallocate a single BlastSeqLoc structure and its contents, without
96  * following its next pointer
97  * @param node structure to deallocate [in]
98  * @return NULL
99  */
100 NCBI_XBLAST_EXPORT
101 BlastSeqLoc* BlastSeqLocNodeFree(BlastSeqLoc* node);
102 
103 /** Deallocate all BlastSeqLoc objects in a chain.
104  * @param loc object to be freed [in]
105  * @return NULL pointer returned.
106  */
107 NCBI_XBLAST_EXPORT
108 BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc);
109 
110 /** Make a deep copy of the linked list of BlastSeqLoc-s pointed to by its
111  * argument
112  * @param head head of the linked list [in]
113  * @return NULL on NULL input or memory allocation failure, else a copy of the
114  * list and its contents
115  */
116 NCBI_XBLAST_EXPORT
117 BlastSeqLoc* BlastSeqLocListDup(BlastSeqLoc* head);
118 
119 /** Converts reverse strand coordinates to forward strand in place.
120  * @param masks BlastSeqLoc to be reversed [in|out]
121  * @param query_length length of query [in]
122  */
123 NCBI_XBLAST_EXPORT
124 void BlastSeqLocReverse(BlastSeqLoc* masks, Int4 query_length);
125 
126 /** Go through all mask locations in one sequence and combine any that overlap,
127  * deallocating the unneeded locations.
128  * @param mask_loc The list of masks to be merged (in place) [in|out]
129  * @param link_value Largest gap size between locations for which they
130  *                   should be linked together [in]
131 */
132 NCBI_XBLAST_EXPORT
133 void
134 BlastSeqLocCombine(BlastSeqLoc** mask_loc, Int4 link_value);
135 
136 /** Allocate memory for a BlastMaskLoc.
137  * @param total number of contexts for which SSeqLocs should be allocated
138  * (result of number of queries * number of contexts for given program) [in]
139  * @return Pointer to the allocated BlastMaskLoc structure.
140 */
141 NCBI_XBLAST_EXPORT
142 BlastMaskLoc* BlastMaskLocNew(Int4 total);
143 
144 /**
145  * @brief Perform a deep copy of the BlastMaskLoc structure passed to this
146  * function
147  *
148  * @param mask_loc Source masking location structure [in]
149  *
150  * @return Deep copy of its argument, or NULL if the argument was NULL or if
151  * not enough memory was available
152  */
153 NCBI_XBLAST_EXPORT
154 BlastMaskLoc* BlastMaskLocDup(const BlastMaskLoc* mask_loc);
155 
156 /** Deallocate memory for a BlastMaskLoc structure
157  * as well as the BlastSeqLoc's pointed to.
158  * @param mask_loc the object to be deleted [in]
159  * @return NULL pointer
160  */
161 NCBI_XBLAST_EXPORT
162 BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc);
163 
164 /** Given a BlastMaskLoc with an array of lists of DNA mask locations,
165  * substitutes that array by a new array of per-protein-frame mask location
166  * lists.
167  * @param mask_loc Mask locations structure. This structure can have either
168  * masks for all frames in nucleotide coordinates (e.g.: the results of
169  * translating protein masks to nucleotide) or a single mask per query
170  * (i.e.: location NUM_FRAMES*query_index). In the latter case, this mask will
171  * be used for all frames. [in|out]
172  * @param query_info Query information structure, containing contexts data [in]
173  * @note This function does NOT take into consideration the strands requested
174  * to be searched, which is INCONSISTENT with what the C++ API does.
175  */
176 NCBI_XBLAST_EXPORT
177 Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc,
178                               const BlastQueryInfo* query_info);
179 
180 /** Given a BlastMaskLoc with an array of lists of mask locations per protein
181  * frame, recalculates all mask offsets in terms of the DNA sequence.
182  * @param mask_loc Mask locations structure [in|out]
183  * @param query_info Query information structure, containing contexts data [in]
184  */
185 NCBI_XBLAST_EXPORT
186 Int2 BlastMaskLocProteinToDNA(BlastMaskLoc* mask_loc,
187                               const BlastQueryInfo* query_info);
188 
189 /** This function takes the list of mask locations (i.e., regions that
190  * should not be searched or not added to lookup table) and makes up a set
191  * of SSeqRange*'s in the concatenated sequence built from a set of queries,
192  * that should be searched (that is, takes the complement).
193  * If all sequences in the query set are completely filtered, then an
194  * SSeqRange is created and both of its elements (left and right) are set to
195  * -1 to indicate this.
196  * If any of the mask_loc's is NULL, an SSeqRange for the full span of the
197  * respective query sequence is created.
198  * @param program_number Type of BLAST program [in]
199  * @param query_info The query information structure [in]
200  * @param mask_loc All mask locations [in]
201  * @param complement_mask Linked list of SSeqRange*s in the concatenated
202  *                        sequence to be indexed in the lookup table. [out]
203  */
204 NCBI_XBLAST_EXPORT
205 Int2
206 BLAST_ComplementMaskLocations(EBlastProgramType program_number,
207    const BlastQueryInfo* query_info, const BlastMaskLoc* mask_loc,
208    BlastSeqLoc* *complement_mask);
209 
210 /** Runs seg filtering functions, according to the filtering options, returns
211  * BlastSeqLoc*. Should combine all SeqLocs so they are non-redundant.
212  * @param program_number Type of BLAST program [in]
213  * @param sequence The sequence or part of the sequence to be filtered [in]
214  * @param length Length of the (sub)sequence [in]
215  * @param offset Offset into the full sequence [in]
216  * @param filter_options specifies how filtering is to be done [in]
217  * @param seqloc_retval Resulting locations for filtered region. [out]
218  * @param blast_message error messages on error [out]
219  * @return zero on success
220 */
221 NCBI_XBLAST_EXPORT
222 Int2
223 BlastSetUp_Filter(EBlastProgramType program_number,
224     Uint1* sequence,
225     Int4 length,
226     Int4 offset,
227     const SBlastFilterOptions* filter_options,
228     BlastSeqLoc* *seqloc_retval,
229     Blast_Message * *blast_message);
230 
231 
232 /** Does preparation for filtering and then calls BlastSetUp_Filter
233  * @param query_blk sequence to be filtered [in]
234  * @param query_info info on sequence to be filtered [in]
235  * @param program_number one of blastn,blastp,blastx,etc. [in]
236  * @param filter_options specifies how filtering is to be done [in]
237  * @param filter_out resulting locations for filtered region. [out]
238  * @param blast_message message that needs to be sent back to user. [out]
239 */
240 NCBI_XBLAST_EXPORT
241 Int2
242 BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk,
243                                  const BlastQueryInfo* query_info,
244                                  EBlastProgramType program_number,
245                                  const SBlastFilterOptions* filter_options,
246                                  BlastMaskLoc** filter_out,
247                                  Blast_Message* *blast_message);
248 
249 /** Masks the letters in buffer.
250  * This is a low-level routine and takes a raw buffer which it assumes
251  * to be in ncbistdaa (protein) or blastna (nucleotide).
252  * @param buffer the sequence to be masked (will be modified, cannot be NULL or
253  * undefined behavior will result). [in|out]
254  * @param length length of the sequence to be masked. [in]
255  * @param is_na nucleotide if TRUE [in]
256  * @param mask_loc the BlastSeqLoc to use for masking [in]
257  * @param reverse minus strand if TRUE [in]
258  * @param offset how far along sequence is 1st residue in buffer [in]
259 */
260 NCBI_XBLAST_EXPORT
261 void
262 Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na,
263     const BlastSeqLoc* mask_loc, Boolean reverse, Int4 offset);
264 
265 /** Mask protein letters that are currently unsupported. This routine
266  *  is used to make the core ignore letters within protein sequences
267  *  that cannot (yet) be correctly handled
268  * @param seq Protein sequence to be masked (ncbistdaa format required).
269  *            Letters whose numerical value exceeds a cutoff are
270  *            converted into kProtMask values [in|out]
271  * @param min_invalid The first ncbistdaa value that is considered invalid.
272  *            All sequence letters with numerical value >= this number
273  *            are masked [in]
274  */
275 NCBI_XBLAST_EXPORT
276 void
277 Blast_MaskUnsupportedAA(BLAST_SequenceBlk* seq, Uint1 min_invalid);
278 
279 /** Masks the sequence given a BlastMaskLoc
280  * @param query_blk sequence to be filtered [in]
281  * @param query_info info on sequence to be filtered [in]
282  * @param filter_maskloc Locations to filter [in]
283  * @param program_number one of blastn,blastp,blastx,etc. [in]
284 */
285 NCBI_XBLAST_EXPORT
286 void
287 BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk,
288                      const BlastQueryInfo* query_info,
289                      const BlastMaskLoc *filter_maskloc,
290                      EBlastProgramType program_number);
291 
292 /** Produces SBlastFilterOptions from a string that has been traditionally supported
293  * in blast.
294  * @param program_number Type of BLAST program [in]
295  * @param instructions the string describing the filtering to be done [in]
296  * @param filtering_options the structure to be filled in [out]
297  * @param blast_message optional field for error messages [out]
298  * @return zero on success
299  */
300 NCBI_XBLAST_EXPORT
301 Int2
302 BlastFilteringOptionsFromString(EBlastProgramType program_number,
303                                 const char* instructions,
304                                 SBlastFilterOptions* *filtering_options,
305                                 Blast_Message* *blast_message);
306 
307 /// Convert the filtering options structure to a string
308 /// @param filtering_options filtering options structure, assumed to be
309 /// correctly filled in [in]
310 /// @return malloc'd string containing filtering options or NULL if there is
311 /// not enough memory to create the return value. Caller
312 /// must free() return value
313 /// @sa TAutoCharPtr it is recommended that the caller store the return value
314 /// in a TAutoCharPtr if working in C++ (defined in blast_aux.hpp)
315 NCBI_XBLAST_EXPORT
316 char*
317 BlastFilteringOptionsToString(const SBlastFilterOptions* filtering_options);
318 
319 /** Determines whether this is a nucleotide query and whether this a minus strand or not
320  *
321  * @param is_na the query is nucleotide
322  * @param context offset in the QueryInfo array
323  * @return TRUE if this is minus strand
324  */
BlastIsReverseStrand(Boolean is_na,Int4 context)325 static NCBI_INLINE Boolean BlastIsReverseStrand(Boolean is_na, Int4 context)
326 {
327      return (is_na && ((context & 1) != 0));
328 
329 }
330 
331 #ifdef __cplusplus
332 }
333 #endif
334 #endif /* !ALGO_BLAST_CORE__BLAST_FILTER__H */
335