1 #ifndef SKIP_DOXYGEN_PROCESSING
2 static char const rcsid[] = "$Id: dust_filter.c,v 1.12 2007/12/19 22:05:56 camacho Exp $";
3 #endif /* SKIP_DOXYGEN_PROCESSING */
4 
5 /*
6  * ===========================================================================
7  *
8  *                            PUBLIC DOMAIN NOTICE
9  *               National Center for Biotechnology Information
10  *
11  *  This software/database is a "United States Government Work" under the
12  *  terms of the United States Copyright Act.  It was written as part of
13  *  the author's official duties as a United States Government employee and
14  *  thus cannot be copyrighted.  This software/database is freely available
15  *  to the public for use. The National Library of Medicine and the U.S.
16  *  Government have not placed any restriction on its use or reproduction.
17  *
18  *  Although all reasonable efforts have been taken to ensure the accuracy
19  *  and reliability of the software and data, the NLM and the U.S.
20  *  Government do not and cannot warrant the performance or results that
21  *  may be obtained by using this software or data. The NLM and the U.S.
22  *  Government disclaim all warranties, express or implied, including
23  *  warranties of performance, merchantability or fitness for any particular
24  *  purpose.
25  *
26  *  Please cite the author in any work or product based on this material.
27  *
28  * ===========================================================================
29  *
30  * File Name:  $RCSfile: dust_filter.c,v $
31  *
32  * Author: Tom Madden
33  *
34  */
35 
36 /** @file dust_filter.c
37  * Dust filtering API for the new BLAST code
38  */
39 
40 /* Prototypes of functions defined below */
41 #include <algo/blast/api/dust_filter.h>
42 #include <algo/blast/api/blast_api.h>
43 #include <algo/blast/api/blast_seq.h>
44 #include <algo/blast/api/seqsrc_readdb.h>
45 #include <algo/blast/core/blast_filter.h>
46 #include <algo/blast/core/blast_util.h>
47 #include <blast_dust.h>
48 
49 /** @addtogroup CToolkitAlgoBlast
50  *
51  * @{
52  */
53 
54 
55 /** Look for dustable locations
56  * @param loc returns locations [in|out]
57  * @param reg dust specific locations [in]
58  * @param nreg number of DREGION [in]
59  * @return 0 if no error
60  */
61 static Int2
s_GetDustLocations(BlastSeqLoc ** loc,DREGION * reg,Int4 nreg)62 s_GetDustLocations (BlastSeqLoc** loc, DREGION* reg, Int4 nreg)
63 {
64    BlastSeqLoc* tail = NULL;   /* pointer to tail of loc linked list */
65 
66    if (!loc)
67       return -1;
68 
69    *loc = NULL;
70 
71    /* point to dusted locations */
72    if (nreg > 0) {
73       Int4 i;
74       for (i = 0; reg && i < nreg; i++) {
75          /* Cache the tail of the list to avoid the overhead of traversing the
76           * list when appending to it */
77          tail = BlastSeqLocNew(tail ? &tail : loc, reg->from, reg->to);
78          reg = reg->next;
79       }
80    }
81    return 0;
82 }
83 
84 /** Dust provided sequence.
85  * @param sequence input sequence [in]
86  * @param length number of bases [in]
87  * @param offset where to start on input sequence [in]
88  * @param level dust parameter [in]
89  * @param window dust parameter [in]
90  * @param linker dust parameter [in]
91  * @param dust_loc returns locations [in|out]
92  * @return 0 if no error
93  */
s_SeqBufferDust(Uint1 * sequence,Int4 length,Int4 offset,Int2 level,Int2 window,Int2 linker,BlastSeqLoc ** dust_loc)94 Int2 s_SeqBufferDust (Uint1* sequence, Int4 length, Int4 offset,
95                     Int2 level, Int2 window, Int2 linker,
96                     BlastSeqLoc** dust_loc)
97 {
98 	DREGION* reg,* regold;
99 	Int4 nreg;
100         Int2 status = 0;
101 
102         /* place for dusted regions */
103 	regold = reg = (DREGION*) calloc(1, sizeof(DREGION));
104 	if (!reg)
105            return -1;
106 
107         nreg = DustSegs (sequence, length, offset, reg, (Int4)level,
108                   (Int4)window, (Int4)linker);
109 
110         status = s_GetDustLocations(dust_loc, reg, nreg);
111 
112         /* clean up memory */
113 	reg = regold;
114 	while (reg)
115 	{
116 		regold = reg;
117 		reg = reg->next;
118 		sfree (regold);
119 	}
120 
121 	return status;
122 }
123 
124 /** Returns locations that should be masked according to dust.
125  * @param query_blk input sequence [in]
126  * @param query_info number of bases etc. [in]
127  * @param query_seqloc locations to be dusted [in]
128  * @param filter_options dust parameters provided here [in]
129  * @param filter_maskloc return value [in|out]
130  * @return 0 if no error
131  */
132 static Int2
s_GetFilteringLocations(BLAST_SequenceBlk * query_blk,BlastQueryInfo * query_info,SeqLoc * query_seqloc,const SBlastFilterOptions * filter_options,BlastMaskLoc ** filter_maskloc)133 s_GetFilteringLocations(BLAST_SequenceBlk* query_blk, BlastQueryInfo* query_info, SeqLoc* query_seqloc, const SBlastFilterOptions* filter_options, BlastMaskLoc** filter_maskloc)
134 {
135     Int4 context = 0; /* loop variable. */
136     const Boolean kIsNucl = TRUE;
137     Boolean no_forward_strand = (query_info->first_context > 0);  /* filtering needed on reverse strand. */
138     SeqLoc* slp_var = query_seqloc;
139 
140     ASSERT(query_info && query_blk && filter_maskloc && query_seqloc);
141 
142     *filter_maskloc = BlastMaskLocNew(query_info->last_context+1);
143 
144     for (context = query_info->first_context;
145          context <= query_info->last_context && slp_var; ++context) {
146 
147         Boolean reverse = BlastIsReverseStrand(kIsNucl, context);
148         Int4 query_length = query_info->contexts[context].query_length;
149 
150 
151         /* For each query, check if forward strand is present */
152         if (query_length <= 0)
153         {
154             if (kIsNucl && (context & 1) == 0)  /* Needed only for blastn, or does this not apply FIXME */
155                no_forward_strand = TRUE;  /* No plus strand, we cannot simply infer locations by going from plus to minus */
156             continue;
157         }
158         else if (!reverse)  /* This is a plus strand, safe to set no_forward_strand to FALSE as clearly there is one. */
159                no_forward_strand = FALSE;
160 
161         if (!reverse || no_forward_strand)
162         {
163             BlastSeqLoc *filter_slp = NULL;   /* Used to hold combined SeqLoc's */
164             Int4 filter_index = context;
165             Int4 context_offset = query_info->contexts[context].query_offset;
166             Uint1* buffer = &query_blk->sequence[context_offset];
167             SDustOptions* dust_options = filter_options->dustOptions;
168 
169             if (BlastIsReverseStrand(kIsNucl, context) == TRUE)
170             {  /* Reverse this as it's on minus strand. */
171                   BlastSeqLoc* filter_slp_rev = NULL;
172                   s_SeqBufferDust(buffer, query_length, 0, dust_options->level,
173                        dust_options->window, dust_options->linker, &filter_slp);
174 
175                   /* Reverse this relative to the part of the query being searched, leave it up to
176                     BlastMaskLocToSeqLoc to put it into the context of the entire query sequence (and
177                     not just that part being searched). */
178                   BlastSeqLocReverse(filter_slp, query_length);
179             }
180             else
181             {
182                    s_SeqBufferDust(buffer, query_length, 0, dust_options->level,
183                        dust_options->window, dust_options->linker, &filter_slp);
184             }
185 
186              (*filter_maskloc)->seqloc_array[filter_index] = filter_slp;
187         }
188 
189         if (slp_var->choice == SEQLOC_WHOLE)
190         {
191             if (BlastIsReverseStrand(kIsNucl, context) == TRUE)
192                 slp_var = slp_var->next;
193         }
194         else
195         {
196             slp_var = slp_var->next;
197         }
198     }
199 
200     return 0;
201 }
202 
203 Int2
Blast_FindDustSeqLoc(SeqLoc * query_seqloc,const SBlastOptions * options,SeqLoc ** mask_loc)204 Blast_FindDustSeqLoc(SeqLoc* query_seqloc,
205                              const SBlastOptions* options,
206                              SeqLoc* *mask_loc)
207 {
208     Int2 status = 0;
209     BlastMaskLoc* filter_loc = NULL;
210     BLAST_SequenceBlk* query_blk = NULL;
211     BlastQueryInfo* query_info = NULL;
212     QuerySetUpOptions* qsup_options = options->query_options;
213     const EBlastProgramType kProgram = eBlastTypeBlastn;
214 
215     *mask_loc = NULL;
216 
217     /* If dust filtering not requested, return success. */
218     if (qsup_options->filtering_options == NULL || qsup_options->filtering_options->dustOptions == NULL)
219         return 0;
220 
221     BLAST_SetUpQuery(kProgram, query_seqloc, qsup_options, NULL, &query_info, &query_blk);
222 
223     status = s_GetFilteringLocations(query_blk, query_info, query_seqloc, qsup_options->filtering_options, &filter_loc);
224 
225     query_info = BlastQueryInfoFree(query_info);
226     query_blk = BlastSequenceBlkFree(query_blk);
227 
228     if (filter_loc)
229     {
230             *mask_loc = BlastMaskLocToSeqLoc(kProgram, filter_loc, query_seqloc);
231             filter_loc = BlastMaskLocFree(filter_loc);
232     }
233 
234     return status;
235 }
236 
237 /* @} */
238 
239