1 /* $Id: blast_filter.c 306966 2011-06-20 13:16:49Z maning $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 /** @file blast_filter.c
28 * All code related to query sequence masking/filtering for BLAST
29 */
30
31 #include <cstddef>
32 #include <algorithm>
33 #include <assert.h>
34 #include "blast_filter.h"
35 #include "blast_seg.h"
36
37 #ifndef NULLB
38 /** terminating byte of a char* string. */
39 #define NULLB '\0'
40 #endif
41
42 /****************************************************************************/
43 /* Constants */
44 const uint8_t kNuclMask = 14; /* N in BLASTNA */
45 const uint8_t kProtMask = 21; /* X in NCBISTDAA */
46
47
48 /** Allowed length of the filtering options string. */
49 #define BLASTOPTIONS_BUFFER_SIZE 128
50
BlastSeqLocNew(BlastSeqLoc ** head,int32_t from,int32_t to)51 BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, int32_t from, int32_t to)
52 {
53 BlastSeqLoc* loc = (BlastSeqLoc*) calloc(1, sizeof(BlastSeqLoc));
54 if ( !loc ) {
55 return NULL;
56 }
57 loc->ssr = (SSeqRange*) calloc(1, sizeof(SSeqRange));
58 loc->ssr->left = from;
59 loc->ssr->right = to;
60
61 return BlastSeqLocAppend(head, loc);
62 }
63
BlastSeqLocAppend(BlastSeqLoc ** head,BlastSeqLoc * node)64 BlastSeqLoc* BlastSeqLocAppend(BlastSeqLoc** head, BlastSeqLoc* node)
65 {
66 if ( !node ) {
67 return NULL;
68 }
69
70 if (head)
71 {
72 if (*head)
73 {
74 BlastSeqLoc* tmp = *head;
75 while (tmp->next)
76 tmp = tmp->next;
77 tmp->next = node;
78 }
79 else
80 {
81 *head = node;
82 }
83 }
84
85 return node;
86 }
87
88 /** Makes a copy of the BlastSeqLoc and also a copy of the
89 * SSRange element. Does not copy BlastSeqLoc that is pointed
90 * to by "next".
91 * @param source the object to be copied [in]
92 * @return another BlastSeqLoc*
93 */
s_BlastSeqLocNodeDup(BlastSeqLoc * source)94 static BlastSeqLoc* s_BlastSeqLocNodeDup(BlastSeqLoc* source)
95 {
96 if ( !source ) {
97 return NULL;
98 }
99 assert(source->ssr);
100 return BlastSeqLocNew(NULL, source->ssr->left, source->ssr->right);
101 }
102
103 /** Calculates number of links in a chain of BlastSeqLoc's.
104 * @param var Chain of BlastSeqLoc structures [in]
105 * @return Number of links in the chain.
106 */
s_BlastSeqLocLen(const BlastSeqLoc * var)107 static int32_t s_BlastSeqLocLen(const BlastSeqLoc* var)
108 {
109 BlastSeqLoc* itr = NULL;
110 int32_t retval = 0;
111
112 for (itr = (BlastSeqLoc*)var; itr; itr = itr->next, retval++) {
113 ;
114 }
115 return retval;
116 }
117
118 /** Converts a BlastSeqLoc list to an array of pointers, each pointing to an
119 * element of the list passed in to this function and the last element points
120 * to NULL
121 * @param list List to convert to an array of pointers [in]
122 * @param count number of elements populated in the array [out]
123 */
124 static BlastSeqLoc**
s_BlastSeqLocListToArrayOfPointers(const BlastSeqLoc * list,int32_t * count)125 s_BlastSeqLocListToArrayOfPointers(const BlastSeqLoc* list, int32_t* count)
126 {
127 BlastSeqLoc* tmp,** retval;
128 int32_t i;
129 *count = 0;
130
131 if (list == NULL)
132 return NULL;
133
134 *count = s_BlastSeqLocLen(list);
135 retval = (BlastSeqLoc**) calloc(((size_t)(*count)+1), sizeof(BlastSeqLoc*));
136 for (tmp = (BlastSeqLoc*)list, i = 0; tmp != NULL && i < *count; i++) {
137 retval[i] = tmp;
138 tmp = tmp->next;
139 }
140 return retval;
141 }
142
143 /** Reverse elements in the list
144 * @param head pointer to pointer to the head of the list. [in|out]
145 * (this is not declared static so that it can be tested in the unit tests
146 */
BlastSeqLocListReverse(BlastSeqLoc ** head)147 void BlastSeqLocListReverse(BlastSeqLoc** head)
148 {
149 BlastSeqLoc** ptrs = NULL; /* array of pointers to BlastSeqLoc elements */
150 int32_t num_elems = 0, i = 0;
151
152 if ( !head ) {
153 return;
154 }
155
156 ptrs = s_BlastSeqLocListToArrayOfPointers(*head, &num_elems);
157 if (num_elems == 0) {
158 return;
159 }
160 assert(ptrs);
161 *head = ptrs[num_elems-1];
162 for (i = num_elems-1; i > 0; i--) {
163 ptrs[i]->next = ptrs[i-1];
164 }
165 ptrs[0]->next = NULL;
166 sfree(ptrs);
167 }
168
BlastSeqLocNodeFree(BlastSeqLoc * loc)169 BlastSeqLoc* BlastSeqLocNodeFree(BlastSeqLoc* loc)
170 {
171 if ( !loc ) {
172 return NULL;
173 }
174 sfree(loc->ssr);
175 sfree(loc);
176 return NULL;
177 }
178
BlastSeqLocFree(BlastSeqLoc * loc)179 BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc)
180 {
181 while (loc) {
182 BlastSeqLoc* next_loc = loc->next;
183 loc = BlastSeqLocNodeFree(loc);
184 loc = next_loc;
185 }
186 return NULL;
187 }
188
BlastSeqLocListDup(BlastSeqLoc * head)189 BlastSeqLoc* BlastSeqLocListDup(BlastSeqLoc* head)
190 {
191 BlastSeqLoc* retval = NULL;
192 BlastSeqLoc* retval_tail = NULL;
193
194 for (; head; head = head->next) {
195 retval_tail = BlastSeqLocAppend(retval_tail ? &retval_tail : &retval,
196 s_BlastSeqLocNodeDup(head));
197 }
198
199 return retval;
200 }
201
BlastMaskLocNew(int32_t total)202 BlastMaskLoc* BlastMaskLocNew(int32_t total)
203 {
204 BlastMaskLoc* retval = (BlastMaskLoc *) calloc(1, sizeof(BlastMaskLoc));
205 retval->total_size = total;
206 if (total > 0)
207 retval->seqloc_array = (BlastSeqLoc **) calloc(total,
208 sizeof(BlastSeqLoc *));
209 return retval;
210 }
211
BlastMaskLocDup(const BlastMaskLoc * mask_loc)212 BlastMaskLoc* BlastMaskLocDup(const BlastMaskLoc* mask_loc)
213 {
214 BlastMaskLoc* retval = NULL;
215 int32_t index = 0;
216
217 if ( !mask_loc ) {
218 return NULL;
219 }
220
221 retval = BlastMaskLocNew(mask_loc->total_size);
222
223 for (index = 0; index < mask_loc->total_size; index++) {
224 retval->seqloc_array[index] =
225 BlastSeqLocListDup(mask_loc->seqloc_array[index]);
226 }
227
228 return retval;
229 }
230
BlastMaskLocFree(BlastMaskLoc * mask_loc)231 BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc)
232 {
233 int32_t index;
234
235 if (mask_loc == NULL)
236 return NULL;
237
238 for (index=0; index<mask_loc->total_size; index++)
239 {
240 if (mask_loc->seqloc_array != NULL)
241 BlastSeqLocFree(mask_loc->seqloc_array[index]);
242 }
243 sfree(mask_loc->seqloc_array);
244 sfree(mask_loc);
245 return NULL;
246 }
247
248 /** Used for qsort, compares two SeqLoc's by starting position. */
s_SeqRangeSortByStartPosition(const void * vp1,const void * vp2)249 static int s_SeqRangeSortByStartPosition(const void *vp1, const void *vp2)
250 {
251 BlastSeqLoc* v1 = *((BlastSeqLoc**) vp1);
252 BlastSeqLoc* v2 = *((BlastSeqLoc**) vp2);
253 SSeqRange* loc1 = (SSeqRange*) v1->ssr;
254 SSeqRange* loc2 = (SSeqRange*) v2->ssr;
255
256 if (loc1->left < loc2->left)
257 return -1;
258 else if (loc1->left > loc2->left)
259 return 1;
260 else
261 return 0;
262 }
263
264 void
BlastSeqLocCombine(BlastSeqLoc ** mask_loc,int32_t link_value)265 BlastSeqLocCombine(BlastSeqLoc** mask_loc, int32_t link_value)
266 {
267 BlastSeqLoc** ptrs = NULL;
268 int32_t i = 0, num_elems = 0;
269
270 /* Break up the list into an array of pointers and sort it */
271 ptrs = s_BlastSeqLocListToArrayOfPointers(*mask_loc, &num_elems);
272 if (num_elems == 0) {
273 return;
274 }
275 assert(ptrs);
276 qsort(ptrs, (size_t)num_elems, sizeof(*ptrs),
277 s_SeqRangeSortByStartPosition);
278
279 /* Merge the overlapping elements */
280 {
281 BlastSeqLoc* curr_tail = *mask_loc = ptrs[0];
282 for (i = 0; i < num_elems - 1; i++) {
283 const SSeqRange* next_ssr = ptrs[i+1]->ssr;
284 const int32_t stop = curr_tail->ssr->right;
285
286 if ((stop + link_value) > next_ssr->left) {
287 curr_tail->ssr->right = std::max(stop, next_ssr->right);
288 ptrs[i+1] = BlastSeqLocNodeFree(ptrs[i+1]);
289 } else {
290 curr_tail = ptrs[i+1];
291 }
292 }
293 }
294
295 /* Rebuild the linked list */
296 {
297 BlastSeqLoc* tail = *mask_loc;
298 for (i = 1; i < num_elems; i++) {
299 if (ptrs[i]) {
300 tail->next = ptrs[i];
301 tail = ptrs[i];
302 }
303 }
304 tail->next = NULL;
305 }
306 sfree(ptrs);
307 }
308
309 void
BlastSeqLocReverse(BlastSeqLoc * masks,int32_t query_length)310 BlastSeqLocReverse(BlastSeqLoc* masks, int32_t query_length)
311 {
312 for(; masks; masks = masks->next) {
313 masks->ssr->left = query_length - 1 - masks->ssr->right;
314 masks->ssr->right = query_length - 1 - masks->ssr->left;
315 }
316 }
317
318 void
Blast_MaskTheResidues(uint8_t * buffer,int32_t length,bool is_na,const BlastSeqLoc * mask_loc,bool reverse,int32_t offset)319 Blast_MaskTheResidues(uint8_t * buffer, int32_t length, bool is_na,
320 const BlastSeqLoc* mask_loc, bool reverse, int32_t offset)
321 {
322 const uint8_t kMaskingLetter = is_na ? kNuclMask : kProtMask;
323 assert(buffer);
324 for (; mask_loc; mask_loc = mask_loc->next) {
325
326 int32_t index, start, stop;
327
328 if (reverse) {
329 start = length - 1 - mask_loc->ssr->right;
330 stop = length - 1 - mask_loc->ssr->left;
331 } else {
332 start = mask_loc->ssr->left;
333 stop = mask_loc->ssr->right;
334 }
335
336 start -= offset;
337 stop -= offset;
338
339 assert(start < length);
340 assert(stop <= length);
341
342 for (index = start; index <= stop; index++)
343 buffer[index] = kMaskingLetter;
344 }
345 }
346
347 void
Blast_MaskUnsupportedAA(BLAST_SequenceBlk * seq,uint8_t min_invalid)348 Blast_MaskUnsupportedAA(BLAST_SequenceBlk* seq, uint8_t min_invalid)
349 {
350 uint8_t *sequence = seq->sequence;
351 int32_t length = seq->length;
352 int32_t i;
353
354 for (i = 0; i < length; i++) {
355 if (sequence[i] >= min_invalid) {
356 sequence[i] = kProtMask;
357 }
358 }
359 }
360