1 #ifndef SKIP_DOXYGEN_PROCESSING
2 static char const rcsid[] = "$Id: dust_filter.c,v 1.12 2007/12/19 22:05:56 camacho Exp $";
3 #endif /* SKIP_DOXYGEN_PROCESSING */
4
5 /*
6 * ===========================================================================
7 *
8 * PUBLIC DOMAIN NOTICE
9 * National Center for Biotechnology Information
10 *
11 * This software/database is a "United States Government Work" under the
12 * terms of the United States Copyright Act. It was written as part of
13 * the author's official duties as a United States Government employee and
14 * thus cannot be copyrighted. This software/database is freely available
15 * to the public for use. The National Library of Medicine and the U.S.
16 * Government have not placed any restriction on its use or reproduction.
17 *
18 * Although all reasonable efforts have been taken to ensure the accuracy
19 * and reliability of the software and data, the NLM and the U.S.
20 * Government do not and cannot warrant the performance or results that
21 * may be obtained by using this software or data. The NLM and the U.S.
22 * Government disclaim all warranties, express or implied, including
23 * warranties of performance, merchantability or fitness for any particular
24 * purpose.
25 *
26 * Please cite the author in any work or product based on this material.
27 *
28 * ===========================================================================
29 *
30 * File Name: $RCSfile: dust_filter.c,v $
31 *
32 * Author: Tom Madden
33 *
34 */
35
36 /** @file dust_filter.c
37 * Dust filtering API for the new BLAST code
38 */
39
40 /* Prototypes of functions defined below */
41 #include <algo/blast/api/dust_filter.h>
42 #include <algo/blast/api/blast_api.h>
43 #include <algo/blast/api/blast_seq.h>
44 #include <algo/blast/api/seqsrc_readdb.h>
45 #include <algo/blast/core/blast_filter.h>
46 #include <algo/blast/core/blast_util.h>
47 #include <blast_dust.h>
48
49 /** @addtogroup CToolkitAlgoBlast
50 *
51 * @{
52 */
53
54
55 /** Look for dustable locations
56 * @param loc returns locations [in|out]
57 * @param reg dust specific locations [in]
58 * @param nreg number of DREGION [in]
59 * @return 0 if no error
60 */
61 static Int2
s_GetDustLocations(BlastSeqLoc ** loc,DREGION * reg,Int4 nreg)62 s_GetDustLocations (BlastSeqLoc** loc, DREGION* reg, Int4 nreg)
63 {
64 BlastSeqLoc* tail = NULL; /* pointer to tail of loc linked list */
65
66 if (!loc)
67 return -1;
68
69 *loc = NULL;
70
71 /* point to dusted locations */
72 if (nreg > 0) {
73 Int4 i;
74 for (i = 0; reg && i < nreg; i++) {
75 /* Cache the tail of the list to avoid the overhead of traversing the
76 * list when appending to it */
77 tail = BlastSeqLocNew(tail ? &tail : loc, reg->from, reg->to);
78 reg = reg->next;
79 }
80 }
81 return 0;
82 }
83
84 /** Dust provided sequence.
85 * @param sequence input sequence [in]
86 * @param length number of bases [in]
87 * @param offset where to start on input sequence [in]
88 * @param level dust parameter [in]
89 * @param window dust parameter [in]
90 * @param linker dust parameter [in]
91 * @param dust_loc returns locations [in|out]
92 * @return 0 if no error
93 */
s_SeqBufferDust(Uint1 * sequence,Int4 length,Int4 offset,Int2 level,Int2 window,Int2 linker,BlastSeqLoc ** dust_loc)94 Int2 s_SeqBufferDust (Uint1* sequence, Int4 length, Int4 offset,
95 Int2 level, Int2 window, Int2 linker,
96 BlastSeqLoc** dust_loc)
97 {
98 DREGION* reg,* regold;
99 Int4 nreg;
100 Int2 status = 0;
101
102 /* place for dusted regions */
103 regold = reg = (DREGION*) calloc(1, sizeof(DREGION));
104 if (!reg)
105 return -1;
106
107 nreg = DustSegs (sequence, length, offset, reg, (Int4)level,
108 (Int4)window, (Int4)linker);
109
110 status = s_GetDustLocations(dust_loc, reg, nreg);
111
112 /* clean up memory */
113 reg = regold;
114 while (reg)
115 {
116 regold = reg;
117 reg = reg->next;
118 sfree (regold);
119 }
120
121 return status;
122 }
123
124 /** Returns locations that should be masked according to dust.
125 * @param query_blk input sequence [in]
126 * @param query_info number of bases etc. [in]
127 * @param query_seqloc locations to be dusted [in]
128 * @param filter_options dust parameters provided here [in]
129 * @param filter_maskloc return value [in|out]
130 * @return 0 if no error
131 */
132 static Int2
s_GetFilteringLocations(BLAST_SequenceBlk * query_blk,BlastQueryInfo * query_info,SeqLoc * query_seqloc,const SBlastFilterOptions * filter_options,BlastMaskLoc ** filter_maskloc)133 s_GetFilteringLocations(BLAST_SequenceBlk* query_blk, BlastQueryInfo* query_info, SeqLoc* query_seqloc, const SBlastFilterOptions* filter_options, BlastMaskLoc** filter_maskloc)
134 {
135 Int4 context = 0; /* loop variable. */
136 const Boolean kIsNucl = TRUE;
137 Boolean no_forward_strand = (query_info->first_context > 0); /* filtering needed on reverse strand. */
138 SeqLoc* slp_var = query_seqloc;
139
140 ASSERT(query_info && query_blk && filter_maskloc && query_seqloc);
141
142 *filter_maskloc = BlastMaskLocNew(query_info->last_context+1);
143
144 for (context = query_info->first_context;
145 context <= query_info->last_context && slp_var; ++context) {
146
147 Boolean reverse = BlastIsReverseStrand(kIsNucl, context);
148 Int4 query_length = query_info->contexts[context].query_length;
149
150
151 /* For each query, check if forward strand is present */
152 if (query_length <= 0)
153 {
154 if (kIsNucl && (context & 1) == 0) /* Needed only for blastn, or does this not apply FIXME */
155 no_forward_strand = TRUE; /* No plus strand, we cannot simply infer locations by going from plus to minus */
156 continue;
157 }
158 else if (!reverse) /* This is a plus strand, safe to set no_forward_strand to FALSE as clearly there is one. */
159 no_forward_strand = FALSE;
160
161 if (!reverse || no_forward_strand)
162 {
163 BlastSeqLoc *filter_slp = NULL; /* Used to hold combined SeqLoc's */
164 Int4 filter_index = context;
165 Int4 context_offset = query_info->contexts[context].query_offset;
166 Uint1* buffer = &query_blk->sequence[context_offset];
167 SDustOptions* dust_options = filter_options->dustOptions;
168
169 if (BlastIsReverseStrand(kIsNucl, context) == TRUE)
170 { /* Reverse this as it's on minus strand. */
171 BlastSeqLoc* filter_slp_rev = NULL;
172 s_SeqBufferDust(buffer, query_length, 0, dust_options->level,
173 dust_options->window, dust_options->linker, &filter_slp);
174
175 /* Reverse this relative to the part of the query being searched, leave it up to
176 BlastMaskLocToSeqLoc to put it into the context of the entire query sequence (and
177 not just that part being searched). */
178 BlastSeqLocReverse(filter_slp, query_length);
179 }
180 else
181 {
182 s_SeqBufferDust(buffer, query_length, 0, dust_options->level,
183 dust_options->window, dust_options->linker, &filter_slp);
184 }
185
186 (*filter_maskloc)->seqloc_array[filter_index] = filter_slp;
187 }
188
189 if (slp_var->choice == SEQLOC_WHOLE)
190 {
191 if (BlastIsReverseStrand(kIsNucl, context) == TRUE)
192 slp_var = slp_var->next;
193 }
194 else
195 {
196 slp_var = slp_var->next;
197 }
198 }
199
200 return 0;
201 }
202
203 Int2
Blast_FindDustSeqLoc(SeqLoc * query_seqloc,const SBlastOptions * options,SeqLoc ** mask_loc)204 Blast_FindDustSeqLoc(SeqLoc* query_seqloc,
205 const SBlastOptions* options,
206 SeqLoc* *mask_loc)
207 {
208 Int2 status = 0;
209 BlastMaskLoc* filter_loc = NULL;
210 BLAST_SequenceBlk* query_blk = NULL;
211 BlastQueryInfo* query_info = NULL;
212 QuerySetUpOptions* qsup_options = options->query_options;
213 const EBlastProgramType kProgram = eBlastTypeBlastn;
214
215 *mask_loc = NULL;
216
217 /* If dust filtering not requested, return success. */
218 if (qsup_options->filtering_options == NULL || qsup_options->filtering_options->dustOptions == NULL)
219 return 0;
220
221 BLAST_SetUpQuery(kProgram, query_seqloc, qsup_options, NULL, &query_info, &query_blk);
222
223 status = s_GetFilteringLocations(query_blk, query_info, query_seqloc, qsup_options->filtering_options, &filter_loc);
224
225 query_info = BlastQueryInfoFree(query_info);
226 query_blk = BlastSequenceBlkFree(query_blk);
227
228 if (filter_loc)
229 {
230 *mask_loc = BlastMaskLocToSeqLoc(kProgram, filter_loc, query_seqloc);
231 filter_loc = BlastMaskLocFree(filter_loc);
232 }
233
234 return status;
235 }
236
237 /* @} */
238
239