1 /* $Id: blast_filter.c 306966 2011-06-20 13:16:49Z maning $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 /** @file blast_filter.c
28  * All code related to query sequence masking/filtering for BLAST
29  */
30 
31 #include <cstddef>
32 #include <algorithm>
33 #include <assert.h>
34 #include "blast_filter.h"
35 #include "blast_seg.h"
36 
37 #ifndef NULLB
38 /** terminating byte of a char* string. */
39 #define NULLB '\0'
40 #endif
41 
42 /****************************************************************************/
43 /* Constants */
44 const uint8_t kNuclMask = 14;     /* N in BLASTNA */
45 const uint8_t kProtMask = 21;     /* X in NCBISTDAA */
46 
47 
48 /** Allowed length of the filtering options string. */
49 #define BLASTOPTIONS_BUFFER_SIZE 128
50 
BlastSeqLocNew(BlastSeqLoc ** head,int32_t from,int32_t to)51 BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, int32_t from, int32_t to)
52 {
53    BlastSeqLoc* loc = (BlastSeqLoc*) calloc(1, sizeof(BlastSeqLoc));
54    if ( !loc ) {
55        return NULL;
56    }
57    loc->ssr = (SSeqRange*) calloc(1, sizeof(SSeqRange));
58    loc->ssr->left = from;
59    loc->ssr->right = to;
60 
61    return BlastSeqLocAppend(head, loc);
62 }
63 
BlastSeqLocAppend(BlastSeqLoc ** head,BlastSeqLoc * node)64 BlastSeqLoc* BlastSeqLocAppend(BlastSeqLoc** head, BlastSeqLoc* node)
65 {
66     if ( !node ) {
67         return NULL;
68     }
69 
70     if (head)
71     {
72         if (*head)
73         {
74             BlastSeqLoc* tmp = *head;
75             while (tmp->next)
76                tmp = tmp->next;
77             tmp->next = node;
78         }
79         else
80         {
81             *head = node;
82         }
83     }
84 
85     return node;
86 }
87 
88 /** Makes a copy of the BlastSeqLoc and also a copy of the
89  * SSRange element.  Does not copy BlastSeqLoc that is pointed
90  * to by "next".
91  * @param source the object to be copied [in]
92  * @return another BlastSeqLoc*
93  */
s_BlastSeqLocNodeDup(BlastSeqLoc * source)94 static BlastSeqLoc* s_BlastSeqLocNodeDup(BlastSeqLoc* source)
95 {
96     if ( !source ) {
97         return NULL;
98     }
99     assert(source->ssr);
100     return BlastSeqLocNew(NULL, source->ssr->left, source->ssr->right);
101 }
102 
103 /** Calculates number of links in a chain of BlastSeqLoc's.
104  * @param var Chain of BlastSeqLoc structures [in]
105  * @return Number of links in the chain.
106  */
s_BlastSeqLocLen(const BlastSeqLoc * var)107 static int32_t s_BlastSeqLocLen(const BlastSeqLoc* var)
108 {
109     BlastSeqLoc* itr = NULL;
110     int32_t retval = 0;
111 
112     for (itr = (BlastSeqLoc*)var; itr; itr = itr->next, retval++) {
113         ;
114     }
115     return retval;
116 }
117 
118 /** Converts a BlastSeqLoc list to an array of pointers, each pointing to an
119  * element of the list passed in to this function and the last element points
120  * to NULL
121  * @param list List to convert to an array of pointers [in]
122  * @param count number of elements populated in the array [out]
123  */
124 static BlastSeqLoc**
s_BlastSeqLocListToArrayOfPointers(const BlastSeqLoc * list,int32_t * count)125 s_BlastSeqLocListToArrayOfPointers(const BlastSeqLoc* list, int32_t* count)
126 {
127     BlastSeqLoc* tmp,** retval;
128     int32_t i;
129     *count = 0;
130 
131     if (list == NULL)
132        return NULL;
133 
134     *count = s_BlastSeqLocLen(list);
135     retval = (BlastSeqLoc**) calloc(((size_t)(*count)+1), sizeof(BlastSeqLoc*));
136     for (tmp = (BlastSeqLoc*)list, i = 0; tmp != NULL && i < *count; i++) {
137         retval[i] = tmp;
138         tmp = tmp->next;
139     }
140     return retval;
141 }
142 
143 /** Reverse elements in the list
144  * @param head pointer to pointer to the head of the list. [in|out]
145  * (this is not declared static so that it can be tested in the unit tests
146  */
BlastSeqLocListReverse(BlastSeqLoc ** head)147 void BlastSeqLocListReverse(BlastSeqLoc** head)
148 {
149     BlastSeqLoc** ptrs = NULL;  /* array of pointers to BlastSeqLoc elements */
150     int32_t num_elems = 0, i = 0;
151 
152     if ( !head ) {
153         return;
154     }
155 
156     ptrs = s_BlastSeqLocListToArrayOfPointers(*head, &num_elems);
157     if (num_elems == 0) {
158         return;
159     }
160     assert(ptrs);
161     *head = ptrs[num_elems-1];
162     for (i = num_elems-1; i > 0; i--) {
163         ptrs[i]->next = ptrs[i-1];
164     }
165     ptrs[0]->next = NULL;
166     sfree(ptrs);
167 }
168 
BlastSeqLocNodeFree(BlastSeqLoc * loc)169 BlastSeqLoc* BlastSeqLocNodeFree(BlastSeqLoc* loc)
170 {
171     if ( !loc ) {
172         return NULL;
173     }
174     sfree(loc->ssr);
175     sfree(loc);
176     return NULL;
177 }
178 
BlastSeqLocFree(BlastSeqLoc * loc)179 BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc)
180 {
181     while (loc) {
182         BlastSeqLoc* next_loc = loc->next;
183         loc = BlastSeqLocNodeFree(loc);
184         loc = next_loc;
185     }
186     return NULL;
187 }
188 
BlastSeqLocListDup(BlastSeqLoc * head)189 BlastSeqLoc* BlastSeqLocListDup(BlastSeqLoc* head)
190 {
191     BlastSeqLoc* retval = NULL;
192     BlastSeqLoc* retval_tail = NULL;
193 
194     for (; head; head = head->next) {
195         retval_tail = BlastSeqLocAppend(retval_tail ? &retval_tail : &retval,
196                                         s_BlastSeqLocNodeDup(head));
197     }
198 
199     return retval;
200 }
201 
BlastMaskLocNew(int32_t total)202 BlastMaskLoc* BlastMaskLocNew(int32_t total)
203 {
204     BlastMaskLoc* retval = (BlastMaskLoc *) calloc(1, sizeof(BlastMaskLoc));
205     retval->total_size = total;
206     if (total > 0)
207         retval->seqloc_array = (BlastSeqLoc **) calloc(total,
208                                                        sizeof(BlastSeqLoc *));
209     return retval;
210 }
211 
BlastMaskLocDup(const BlastMaskLoc * mask_loc)212 BlastMaskLoc* BlastMaskLocDup(const BlastMaskLoc* mask_loc)
213 {
214     BlastMaskLoc* retval = NULL;
215     int32_t index = 0;
216 
217     if ( !mask_loc ) {
218         return NULL;
219     }
220 
221     retval = BlastMaskLocNew(mask_loc->total_size);
222 
223     for (index = 0; index < mask_loc->total_size; index++) {
224         retval->seqloc_array[index] =
225             BlastSeqLocListDup(mask_loc->seqloc_array[index]);
226     }
227 
228     return retval;
229 }
230 
BlastMaskLocFree(BlastMaskLoc * mask_loc)231 BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc)
232 {
233    int32_t index;
234 
235    if (mask_loc == NULL)
236       return NULL;
237 
238    for (index=0; index<mask_loc->total_size; index++)
239    {
240       if (mask_loc->seqloc_array != NULL)
241          BlastSeqLocFree(mask_loc->seqloc_array[index]);
242    }
243    sfree(mask_loc->seqloc_array);
244    sfree(mask_loc);
245    return NULL;
246 }
247 
248 /** Used for qsort, compares two SeqLoc's by starting position. */
s_SeqRangeSortByStartPosition(const void * vp1,const void * vp2)249 static int s_SeqRangeSortByStartPosition(const void *vp1, const void *vp2)
250 {
251    BlastSeqLoc* v1 = *((BlastSeqLoc**) vp1);
252    BlastSeqLoc* v2 = *((BlastSeqLoc**) vp2);
253    SSeqRange* loc1 = (SSeqRange*) v1->ssr;
254    SSeqRange* loc2 = (SSeqRange*) v2->ssr;
255 
256    if (loc1->left < loc2->left)
257       return -1;
258    else if (loc1->left > loc2->left)
259       return 1;
260    else
261       return 0;
262 }
263 
264 void
BlastSeqLocCombine(BlastSeqLoc ** mask_loc,int32_t link_value)265 BlastSeqLocCombine(BlastSeqLoc** mask_loc, int32_t link_value)
266 {
267     BlastSeqLoc** ptrs = NULL;
268     int32_t i = 0, num_elems = 0;
269 
270     /* Break up the list into an array of pointers and sort it */
271     ptrs = s_BlastSeqLocListToArrayOfPointers(*mask_loc, &num_elems);
272     if (num_elems == 0) {
273         return;
274     }
275     assert(ptrs);
276     qsort(ptrs, (size_t)num_elems, sizeof(*ptrs),
277           s_SeqRangeSortByStartPosition);
278 
279     /* Merge the overlapping elements */
280     {
281         BlastSeqLoc* curr_tail = *mask_loc = ptrs[0];
282         for (i = 0; i < num_elems - 1; i++) {
283             const SSeqRange* next_ssr = ptrs[i+1]->ssr;
284             const int32_t stop = curr_tail->ssr->right;
285 
286             if ((stop + link_value) > next_ssr->left) {
287                 curr_tail->ssr->right = std::max(stop, next_ssr->right);
288                 ptrs[i+1] = BlastSeqLocNodeFree(ptrs[i+1]);
289             } else {
290                 curr_tail = ptrs[i+1];
291             }
292         }
293     }
294 
295     /* Rebuild the linked list */
296     {
297         BlastSeqLoc* tail = *mask_loc;
298         for (i = 1; i < num_elems; i++) {
299             if (ptrs[i]) {
300                 tail->next = ptrs[i];
301                 tail = ptrs[i];
302             }
303         }
304         tail->next = NULL;
305     }
306     sfree(ptrs);
307 }
308 
309 void
BlastSeqLocReverse(BlastSeqLoc * masks,int32_t query_length)310 BlastSeqLocReverse(BlastSeqLoc* masks, int32_t query_length)
311 {
312     for(; masks; masks = masks->next) {
313         masks->ssr->left    = query_length - 1 - masks->ssr->right;
314         masks->ssr->right   = query_length - 1 - masks->ssr->left;
315     }
316 }
317 
318 void
Blast_MaskTheResidues(uint8_t * buffer,int32_t length,bool is_na,const BlastSeqLoc * mask_loc,bool reverse,int32_t offset)319 Blast_MaskTheResidues(uint8_t * buffer, int32_t length, bool is_na,
320                       const BlastSeqLoc* mask_loc, bool reverse, int32_t offset)
321 {
322     const uint8_t kMaskingLetter = is_na ? kNuclMask : kProtMask;
323     assert(buffer);
324     for (; mask_loc; mask_loc = mask_loc->next) {
325 
326         int32_t index, start, stop;
327 
328         if (reverse) {
329             start = length - 1 - mask_loc->ssr->right;
330             stop = length - 1 - mask_loc->ssr->left;
331         } else {
332             start = mask_loc->ssr->left;
333             stop = mask_loc->ssr->right;
334         }
335 
336         start -= offset;
337         stop -= offset;
338 
339         assert(start < length);
340         assert(stop <= length);
341 
342         for (index = start; index <= stop; index++)
343             buffer[index] = kMaskingLetter;
344     }
345 }
346 
347 void
Blast_MaskUnsupportedAA(BLAST_SequenceBlk * seq,uint8_t min_invalid)348 Blast_MaskUnsupportedAA(BLAST_SequenceBlk* seq, uint8_t min_invalid)
349 {
350     uint8_t *sequence = seq->sequence;
351     int32_t length = seq->length;
352     int32_t i;
353 
354     for (i = 0; i < length; i++) {
355         if (sequence[i] >= min_invalid) {
356             sequence[i] = kProtMask;
357         }
358     }
359 }
360