1 /* $Id: blast_filter.c 533522 2017-04-17 15:50:34Z boratyng $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 /** @file blast_filter.c
28  * All code related to query sequence masking/filtering for BLAST
29  */
30 
31 #include <algo/blast/core/blast_util.h>
32 #include <algo/blast/core/blast_filter.h>
33 #include <algo/blast/core/blast_seg.h>
34 #include "jumper.h"
35 
36 /****************************************************************************/
37 /* Constants */
38 const Uint1 kNuclMask = 14;     /* N in BLASTNA */
39 const Uint1 kProtMask = 21;     /* X in NCBISTDAA */
40 
41 
42 /** Allowed length of the filtering options string. */
43 #define BLASTOPTIONS_BUFFER_SIZE 128
44 
45 
46 /** Copies filtering commands for one filtering algorithm from "instructions" to
47  * "buffer".
48  * ";" is a delimiter for the commands for different algorithms, so it stops
49  * copying when a ";" is found.
50  * Example filtering string: "m L; R -d rodents.lib"
51  * @param instructions filtering commands [in]
52  * @param buffer filled with filtering commands for one algorithm. [out]
53 */
54 static const char *
s_LoadOptionsToBuffer(const char * instructions,char * buffer)55 s_LoadOptionsToBuffer(const char *instructions, char* buffer)
56 {
57 	Boolean not_started=TRUE;
58 	char* buffer_ptr;
59 	const char *ptr;
60 	Int4 index;
61 
62 	ptr = instructions;
63 	buffer_ptr = buffer;
64 	for (index=0; index<BLASTOPTIONS_BUFFER_SIZE && *ptr != NULLB; index++)
65 	{
66 		if (*ptr == ';')
67 		{	/* ";" is a delimiter for different filtering algorithms. */
68 			ptr++;
69 			break;
70 		}
71 		/* Remove blanks at the beginning. */
72 		if (not_started && *ptr == ' ')
73 		{
74 			ptr++;
75 		}
76 		else
77 		{
78 			not_started = FALSE;
79 			*buffer_ptr = *ptr;
80 			buffer_ptr++; ptr++;
81 		}
82 	}
83 
84 	*buffer_ptr = NULLB;
85 
86 	if (not_started == FALSE)
87 	{	/* Remove trailing blanks. */
88 		buffer_ptr--;
89 		while (*buffer_ptr == ' ' && buffer_ptr > buffer)
90 		{
91 			*buffer_ptr = NULLB;
92 			buffer_ptr--;
93 		}
94 	}
95 
96 	return ptr;
97 }
98 
99 /** Parses repeat filtering options string.
100  * @param repeat_options Input character string [in]
101  * @param dbname Database name for repeats filtering [out]
102  */
103 static Int2
s_ParseRepeatOptions(const char * repeat_options,char ** dbname)104 s_ParseRepeatOptions(const char* repeat_options, char** dbname)
105 {
106     char* ptr;
107 
108     ASSERT(dbname);
109     *dbname = NULL;
110 
111     if (!repeat_options)
112         return 0;
113 
114     ptr = strstr(repeat_options, "-d");
115     if (ptr) {
116         ptr += 2;
117         while (*ptr == ' ' || *ptr == '\t')
118             ++ptr;
119         *dbname = strdup(ptr);
120     }
121     return 0;
122 }
123 
124 /** Parses window masker options string.
125  * @param winmask_options Input character string [in]
126  * @param dbname Database name for window masker filtering [out]
127  * @param taxid Taxonomic ID for window masker filtering [out]
128  */
129 static Int2
s_ParseWindowMaskerOptions(const char * winmask_options,char ** dbname,int * taxid)130 s_ParseWindowMaskerOptions(const char  * winmask_options,
131                            char       ** dbname,
132                            int         * taxid)
133 {
134     char* ptr = NULL;
135 
136     ASSERT(dbname);
137     *dbname = NULL;
138 
139     if (!winmask_options)
140         return 0;
141 
142     ptr = strstr(winmask_options, "-d");
143 
144     if (ptr) {
145         char * endp = 0;
146 
147         ptr += 2;
148         while (*ptr == ' ' || *ptr == '\t')
149             ++ptr;
150 
151         *dbname = strdup(ptr);
152 
153         for(endp = *dbname; *endp; ++endp) {
154             if (*endp == ' ' || *endp == '\t') {
155                 *endp = (char)0;
156                 break;
157             }
158         }
159     } else {
160         ptr = strstr(winmask_options, "-t");
161 
162         if (ptr) {
163             ptr += 2;
164             while (*ptr == ' ' || *ptr == '\t')
165                 ++ptr;
166             *taxid = atoi(ptr);
167         }
168     }
169 
170     return 0;
171 }
172 
173 /** Parses options used for dust.
174  * @param ptr buffer containing instructions. [in]
175  * @param level sets level for dust. [out]
176  * @param window sets window for dust [out]
177  * @param linker sets linker for dust. [out]
178 */
179 static Int2
s_ParseDustOptions(const char * ptr,int * level,int * window,int * linker)180 s_ParseDustOptions(const char *ptr, int* level, int* window, int* linker)
181 
182 {
183 	char buffer[BLASTOPTIONS_BUFFER_SIZE];
184 	int arg, index, index1, window_pri=-1, linker_pri=-1, level_pri=-1;
185 
186 	arg = 0;
187 	index1 = 0;
188 	for (index=0; index<BLASTOPTIONS_BUFFER_SIZE; index++)
189 	{
190 		if (*ptr == ' ' || *ptr == NULLB)
191 		{
192 	                long tmplong;
193 			buffer[index1] = NULLB;
194 			index1 = 0;
195 			switch(arg) {
196 				case 0:
197 					sscanf(buffer, "%ld", &tmplong);
198 					level_pri = tmplong;
199 					break;
200 				case 1:
201 					sscanf(buffer, "%ld", &tmplong);
202 					window_pri = tmplong;
203 					break;
204 				case 2:
205 					sscanf(buffer, "%ld", &tmplong);
206 					linker_pri = tmplong;
207 					break;
208 				default:
209 					break;
210 			}
211 
212 			arg++;
213 			while (*ptr == ' ')
214 				ptr++;
215 
216 			/* end of the buffer. */
217 			if (*ptr == NULLB)
218 				break;
219 		}
220 		else
221 		{
222 			buffer[index1] = *ptr; ptr++;
223 			index1++;
224 		}
225 	}
226         if (arg != 0 && arg != 3)
227            return 1;
228 
229 	*level = level_pri;
230 	*window = window_pri;
231 	*linker = linker_pri;
232 
233 	return 0;
234 }
235 
236 /** parses a string to set three seg options.
237  * @param ptr buffer containing instructions [in]
238  * @param window returns "window" for seg algorithm. [out]
239  * @param locut returns "locut" for seg. [out]
240  * @param hicut returns "hicut" for seg. [out]
241 */
242 static Int2
s_ParseSegOptions(const char * ptr,Int4 * window,double * locut,double * hicut)243 s_ParseSegOptions(const char *ptr, Int4* window, double* locut, double* hicut)
244 
245 {
246 	char buffer[BLASTOPTIONS_BUFFER_SIZE];
247 	Int4 arg, index, index1;
248 
249 	arg = 0;
250 	index1 = 0;
251 	for (index=0; index<BLASTOPTIONS_BUFFER_SIZE; index++)
252 	{
253 		if (*ptr == ' ' || *ptr == NULLB)
254 		{
255 	                long tmplong;
256 	                double tmpdouble;
257 			buffer[index1] = NULLB;
258 			index1 = 0;
259 			switch(arg) {
260 				case 0:
261 					sscanf(buffer, "%ld", &tmplong);
262 					*window = tmplong;
263 					break;
264 				case 1:
265 					sscanf(buffer, "%le", &tmpdouble);
266 					*locut = tmpdouble;
267 					break;
268 				case 2:
269 					sscanf(buffer, "%le", &tmpdouble);
270 					*hicut = tmpdouble;
271 					break;
272 				default:
273 					break;
274 			}
275 
276 			arg++;
277 			while (*ptr == ' ')
278 				ptr++;
279 
280 			/* end of the buffer. */
281 			if (*ptr == NULLB)
282 				break;
283 		}
284 		else
285 		{
286 			buffer[index1] = *ptr; ptr++;
287 			index1++;
288 		}
289 	}
290         if (arg != 0 && arg != 3)
291            return 1;
292 
293 	return 0;
294 }
295 
296 /// Wrapper around strcat to ensure we don't do buffer overflows :)
297 /// @param dest string to concatenate to [in|out]
298 /// @param dest_size size of the dest array, modified if dest is grown [in|out]
299 /// @param string2append string to append to dest [in]
300 /// @return the concatenated string or NULL if we run out of memory
301 static char*
s_SafeStrCat(char ** dest,unsigned int * dest_size,const char * string2append)302 s_SafeStrCat(char** dest, unsigned int* dest_size, const char* string2append)
303 {
304     size_t dest_length = strlen(*dest);
305     size_t string2append_length = strlen(string2append);
306     if ((dest_length + string2append_length + 1) > *dest_size) {
307         size_t target_size = MAX(string2append_length, dest_length) * 2;
308         *dest = (char*)realloc((void*)*dest, target_size);
309         if (*dest) {
310             (*dest_size) = target_size;
311         } else {
312             sfree(*dest);
313             return 0;
314         }
315     }
316     strcat(*dest, string2append);
317     return *dest;
318 }
319 
320 char*
BlastFilteringOptionsToString(const SBlastFilterOptions * filtering_options)321 BlastFilteringOptionsToString(const SBlastFilterOptions* filtering_options)
322 {
323     char* retval = NULL;
324     unsigned int retval_size = 0;
325 
326     if (filtering_options == NULL) {
327         return strdup("F");
328     }
329 
330     retval_size = 64;   /* Usually this will suffice */
331     retval = (char*) calloc(retval_size, sizeof(char));
332 
333     if (filtering_options->dustOptions) {
334         if (filtering_options->dustOptions->level == kDustLevel &&
335             filtering_options->dustOptions->window == kDustWindow &&
336             filtering_options->dustOptions->linker == kDustLinker) {
337             if (!s_SafeStrCat(&retval, &retval_size, "L;")) {
338                 return 0;
339             }
340         } else {
341             char buffer[24] = { '\0' };
342             snprintf(buffer, sizeof(buffer), "D %d %d %d;",
343                      filtering_options->dustOptions->level,
344                      filtering_options->dustOptions->window,
345                      filtering_options->dustOptions->linker);
346             if (!s_SafeStrCat(&retval, &retval_size, buffer)) {
347                 return 0;
348             }
349         }
350     }
351 
352     if (filtering_options->segOptions) {
353         if (filtering_options->segOptions->window == kSegWindow &&
354             filtering_options->segOptions->locut == kSegLocut &&
355             filtering_options->segOptions->hicut == kSegHicut) {
356             if (!s_SafeStrCat(&retval, &retval_size, "L;")) {
357                 return 0;
358             }
359         } else {
360             char buffer[24] = { '\0' };
361             snprintf(buffer, sizeof(buffer), "S %d %1.1f %1.1f;",
362                      filtering_options->segOptions->window,
363                      filtering_options->segOptions->locut,
364                      filtering_options->segOptions->hicut);
365             if (!s_SafeStrCat(&retval, &retval_size, buffer)) {
366                 return 0;
367             }
368         }
369     }
370 
371     if (filtering_options->repeatFilterOptions) {
372         if (filtering_options->repeatFilterOptions->database) {
373             if (!s_SafeStrCat(&retval, &retval_size, "R -d ")) {
374                 return 0;
375             }
376             if (!s_SafeStrCat(&retval, &retval_size,
377                          filtering_options->repeatFilterOptions->database)) {
378                 return 0;
379             }
380             if (!s_SafeStrCat(&retval, &retval_size, ";")) {
381                 return 0;
382             }
383         } else {
384             if (!s_SafeStrCat(&retval, &retval_size, "R;")) {
385                 return 0;
386             }
387         }
388     }
389 
390     if (filtering_options->windowMaskerOptions) {
391         if (filtering_options->windowMaskerOptions->taxid != 0) {
392             char buffer[24] = { '\0' };
393             snprintf(buffer, sizeof(buffer), "W -t %d;",
394                      filtering_options->windowMaskerOptions->taxid);
395             if (!s_SafeStrCat(&retval, &retval_size, buffer)) {
396                 return 0;
397             }
398         } else if (filtering_options->windowMaskerOptions->database) {
399             if (!s_SafeStrCat(&retval, &retval_size, "W -d ")) {
400                 return 0;
401             }
402             if (!s_SafeStrCat(&retval, &retval_size,
403                          filtering_options->windowMaskerOptions->database)) {
404                 return 0;
405             }
406             if (!s_SafeStrCat(&retval, &retval_size, ";")) {
407                 return 0;
408             }
409         }
410     }
411 
412     /* Mask at hash is a modifier for other filtering options, as such it
413      * doesn't make sense to apply it by itself */
414     if (SBlastFilterOptionsMaskAtHash(filtering_options)) {
415         if (strlen(retval) != 0) {
416             /* Add mask at hash as a modifier for other filtering options */
417             if (!s_SafeStrCat(&retval, &retval_size, "m;")) {
418                 return 0;
419             }
420         } else {
421             /* We still need to set "m" in a filter string (WB-391, WB-394) */
422             /* The string below can be modified into "mF" or "mL" or
423                whatever is decided to be the conventional meaning */
424             if (!s_SafeStrCat(&retval, &retval_size, "m;")) {
425                 return 0;
426             }
427         }
428     }
429 
430     return strlen(retval) == 0
431         ? s_SafeStrCat(&retval, &retval_size, "F")
432         : retval;
433 }
434 
435 Int2
BlastFilteringOptionsFromString(EBlastProgramType program_number,const char * instructions,SBlastFilterOptions ** filtering_options,Blast_Message ** blast_message)436 BlastFilteringOptionsFromString(EBlastProgramType program_number,
437                                 const char* instructions,
438                                 SBlastFilterOptions* *filtering_options,
439                                 Blast_Message* *blast_message)
440 {
441         Boolean mask_at_hash = FALSE; /* the default. */
442         char* buffer;
443         const char* ptr = instructions;
444         char error_buffer[1024];
445         Int2 status = 0;
446         SSegOptions* segOptions = NULL;
447         SDustOptions* dustOptions = NULL;
448         SRepeatFilterOptions* repeatOptions = NULL;
449         SWindowMaskerOptions * winmaskOptions = NULL;
450 
451         *filtering_options = NULL;
452         if (blast_message)
453             *blast_message = NULL;
454 
455         if (instructions == NULL || strcasecmp(instructions, "F") == 0)
456         {
457              SBlastFilterOptionsNew(filtering_options, eEmpty);
458              return status;
459         }
460 
461         buffer = (char*) calloc(strlen(instructions), sizeof(char));
462 	/* allow old-style filters when m cannot be followed by the ';' */
463 	if (ptr[0] == 'm' && ptr[1] == ' ')
464 	{
465 		mask_at_hash = TRUE;
466 		ptr += 2;
467 	}
468 
469 	while (*ptr != NULLB)
470 	{
471 		if (*ptr == 'S')
472 		{
473                         SSegOptionsNew(&segOptions);
474 			ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
475 			if (buffer[0] != NULLB)
476 			{
477                                 int window = 0;
478                                 double locut = .0, hicut = .0;
479 				status = s_ParseSegOptions(buffer, &window, &locut, &hicut);
480                                 if (status)
481                                 {
482                                      segOptions = SSegOptionsFree(segOptions);
483                                      sprintf(error_buffer, "Error parsing filter string: %s", buffer);
484                                      if (blast_message)
485                                        Blast_MessageWrite(blast_message, eBlastSevError, kBlastMessageNoContext,
486                                             error_buffer);
487                                      sfree(buffer);
488                                      return status;
489                                 }
490                                 segOptions->window = window;
491                                 segOptions->locut = locut;
492                                 segOptions->hicut = hicut;
493 			}
494 		}
495 		else if (*ptr == 'D')
496 		{
497                         SDustOptionsNew(&dustOptions);
498 			ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
499 			if (buffer[0] != NULLB)
500                         {
501                                 int window = 0, level = 0, linker = 0;
502 				status = s_ParseDustOptions(buffer, &level, &window, &linker);
503                                 if (status)
504                                 {
505                                      dustOptions = SDustOptionsFree(dustOptions);
506                                      sprintf(error_buffer, "Error parsing filter string: %s", buffer);
507                                      if (blast_message)
508                                        Blast_MessageWrite(blast_message, eBlastSevError, kBlastMessageNoContext,
509                                             error_buffer);
510                                      sfree(buffer);
511                                      return status;
512                                 }
513                                 dustOptions->level = level;
514                                 dustOptions->window = window;
515                                 dustOptions->linker = linker;
516                         }
517 		}
518                 else if (*ptr == 'R')
519                 {
520                         SRepeatFilterOptionsNew(&repeatOptions);
521 			ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
522 			if (buffer[0] != NULLB)
523                         {
524                              char* dbname = NULL;
525                              status = s_ParseRepeatOptions(buffer, &dbname);
526                              if (status)
527                              {
528                                   repeatOptions = SRepeatFilterOptionsFree(repeatOptions);
529                                   sprintf(error_buffer, "Error parsing filter string: %s", buffer);
530                                   if (blast_message)
531                                      Blast_MessageWrite(blast_message, eBlastSevError, kBlastMessageNoContext,
532                                             error_buffer);
533                                    sfree(buffer);
534                                    return status;
535                              }
536                              if (dbname)
537                              {
538                                  sfree(repeatOptions->database);
539                                  repeatOptions->database = dbname;
540                              }
541                         }
542                 }
543                 else if (*ptr == 'W')
544                 {
545                     SWindowMaskerOptionsNew(&winmaskOptions);
546 
547                     ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
548                     if (buffer[0] != NULLB) {
549                         char* dbname = NULL;
550                         int taxid = 0;
551 
552                         status = s_ParseWindowMaskerOptions(buffer, &dbname, &taxid);
553                         if (status) {
554                             winmaskOptions = SWindowMaskerOptionsFree(winmaskOptions);
555                             sprintf(error_buffer, "Error parsing filter string: %s", buffer);
556                             if (blast_message)
557                                 Blast_MessageWrite(blast_message, eBlastSevError, kBlastMessageNoContext,
558                                                    error_buffer);
559 
560                             sfree(buffer);
561                             return status;
562                         }
563                         if (dbname) {
564                             sfree(winmaskOptions->database);
565                             winmaskOptions->database = dbname;
566                         }
567                         if (taxid) {
568                             winmaskOptions->taxid = taxid;
569                         }
570                     }
571                 }
572 		else if (*ptr == 'L' || *ptr == 'T')
573 		{ /* do low-complexity filtering; dust for blastn, otherwise seg.*/
574                         if (program_number == eBlastTypeBlastn
575                             || program_number == eBlastTypeMapping)
576 
577                             SDustOptionsNew(&dustOptions);
578                         else
579                             SSegOptionsNew(&segOptions);
580 			ptr++;
581 		}
582 		else if (*ptr == 'm')
583 		{
584 		        mask_at_hash = TRUE;
585                         ptr++;
586                 }
587                 else
588                 {    /* Nothing applied */
589                          ptr++;
590                 }
591 	}
592         sfree(buffer);
593 
594         status = SBlastFilterOptionsNew(filtering_options, eEmpty);
595         if (status)
596             return status;
597 
598         (*filtering_options)->dustOptions = dustOptions;
599         (*filtering_options)->segOptions = segOptions;
600         (*filtering_options)->repeatFilterOptions = repeatOptions;
601         (*filtering_options)->windowMaskerOptions = winmaskOptions;
602         (*filtering_options)->mask_at_hash = mask_at_hash;
603 
604         return status;
605 }
606 
607 
BlastSeqLocNew(BlastSeqLoc ** head,Int4 from,Int4 to)608 BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, Int4 from, Int4 to)
609 {
610    BlastSeqLoc* loc = (BlastSeqLoc*) calloc(1, sizeof(BlastSeqLoc));
611    if ( !loc ) {
612        return NULL;
613    }
614    loc->ssr = (SSeqRange*) calloc(1, sizeof(SSeqRange));
615    loc->ssr->left = from;
616    loc->ssr->right = to;
617 
618    return BlastSeqLocAppend(head, loc);
619 }
620 
BlastSeqLocAppend(BlastSeqLoc ** head,BlastSeqLoc * node)621 BlastSeqLoc* BlastSeqLocAppend(BlastSeqLoc** head, BlastSeqLoc* node)
622 {
623     if ( !node ) {
624         return NULL;
625     }
626 
627     if (head)
628     {
629         if (*head)
630         {
631             BlastSeqLoc* tmp = *head;
632             while (tmp->next)
633                tmp = tmp->next;
634             tmp->next = node;
635         }
636         else
637         {
638             *head = node;
639         }
640     }
641 
642     return node;
643 }
644 
645 /** Makes a copy of the BlastSeqLoc and also a copy of the
646  * SSRange element.  Does not copy BlastSeqLoc that is pointed
647  * to by "next".
648  * @param source the object to be copied [in]
649  * @return another BlastSeqLoc*
650  */
s_BlastSeqLocNodeDup(BlastSeqLoc * source)651 static BlastSeqLoc* s_BlastSeqLocNodeDup(BlastSeqLoc* source)
652 {
653     if ( !source ) {
654         return NULL;
655     }
656     ASSERT(source->ssr);
657     return BlastSeqLocNew(NULL, source->ssr->left, source->ssr->right);
658 }
659 
660 /** Calculates number of links in a chain of BlastSeqLoc's.
661  * @param var Chain of BlastSeqLoc structures [in]
662  * @return Number of links in the chain.
663  */
s_BlastSeqLocLen(const BlastSeqLoc * var)664 static Int4 s_BlastSeqLocLen(const BlastSeqLoc* var)
665 {
666     BlastSeqLoc* itr = NULL;
667     Int4 retval = 0;
668 
669     for (itr = (BlastSeqLoc*)var; itr; itr = itr->next, retval++) {
670         ;
671     }
672     return retval;
673 }
674 
675 /** Converts a BlastSeqLoc list to an array of pointers, each pointing to an
676  * element of the list passed in to this function and the last element points
677  * to NULL
678  * @param list List to convert to an array of pointers [in]
679  * @param count number of elements populated in the array [out]
680  */
681 static BlastSeqLoc**
s_BlastSeqLocListToArrayOfPointers(const BlastSeqLoc * list,Int4 * count)682 s_BlastSeqLocListToArrayOfPointers(const BlastSeqLoc* list, Int4* count)
683 {
684     BlastSeqLoc* tmp,** retval;
685     Int4 i;
686     *count = 0;
687 
688     if (list == NULL)
689        return NULL;
690 
691     *count = s_BlastSeqLocLen(list);
692     retval = (BlastSeqLoc**) calloc(((size_t)(*count)+1), sizeof(BlastSeqLoc*));
693     for (tmp = (BlastSeqLoc*)list, i = 0; tmp != NULL && i < *count; i++) {
694         retval[i] = tmp;
695         tmp = tmp->next;
696     }
697     return retval;
698 }
699 
700 /** Reverse elements in the list
701  * @param head pointer to pointer to the head of the list. [in|out]
702  * (this is not declared static so that it can be tested in the unit tests
703  */
704 NCBI_XBLAST_EXPORT
BlastSeqLocListReverse(BlastSeqLoc ** head)705 void BlastSeqLocListReverse(BlastSeqLoc** head)
706 {
707     BlastSeqLoc** ptrs = NULL;  /* array of pointers to BlastSeqLoc elements */
708     Int4 num_elems = 0, i = 0;
709 
710     if ( !head ) {
711         return;
712     }
713 
714     ptrs = s_BlastSeqLocListToArrayOfPointers(*head, &num_elems);
715     if (num_elems == 0) {
716         return;
717     }
718     ASSERT(ptrs);
719     *head = ptrs[num_elems-1];
720     for (i = num_elems-1; i > 0; i--) {
721         ptrs[i]->next = ptrs[i-1];
722     }
723     ptrs[0]->next = NULL;
724     sfree(ptrs);
725 }
726 
BlastSeqLocNodeFree(BlastSeqLoc * loc)727 BlastSeqLoc* BlastSeqLocNodeFree(BlastSeqLoc* loc)
728 {
729     if ( !loc ) {
730         return NULL;
731     }
732     sfree(loc->ssr);
733     sfree(loc);
734     return NULL;
735 }
736 
BlastSeqLocFree(BlastSeqLoc * loc)737 BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc)
738 {
739     while (loc) {
740         BlastSeqLoc* next_loc = loc->next;
741         loc = BlastSeqLocNodeFree(loc);
742         loc = next_loc;
743     }
744     return NULL;
745 }
746 
BlastSeqLocListDup(BlastSeqLoc * head)747 BlastSeqLoc* BlastSeqLocListDup(BlastSeqLoc* head)
748 {
749     BlastSeqLoc* retval = NULL;
750     BlastSeqLoc* retval_tail = NULL;
751 
752     for (; head; head = head->next) {
753         retval_tail = BlastSeqLocAppend(retval_tail ? &retval_tail : &retval,
754                                         s_BlastSeqLocNodeDup(head));
755     }
756 
757     return retval;
758 }
759 
BlastMaskLocNew(Int4 total)760 BlastMaskLoc* BlastMaskLocNew(Int4 total)
761 {
762     BlastMaskLoc* retval = (BlastMaskLoc *) calloc(1, sizeof(BlastMaskLoc));
763     retval->total_size = total;
764     if (total > 0)
765         retval->seqloc_array = (BlastSeqLoc **) calloc(total,
766                                                        sizeof(BlastSeqLoc *));
767     return retval;
768 }
769 
BlastMaskLocDup(const BlastMaskLoc * mask_loc)770 BlastMaskLoc* BlastMaskLocDup(const BlastMaskLoc* mask_loc)
771 {
772     BlastMaskLoc* retval = NULL;
773     Int4 index = 0;
774 
775     if ( !mask_loc ) {
776         return NULL;
777     }
778 
779     retval = BlastMaskLocNew(mask_loc->total_size);
780 
781     for (index = 0; index < mask_loc->total_size; index++) {
782         retval->seqloc_array[index] =
783             BlastSeqLocListDup(mask_loc->seqloc_array[index]);
784     }
785 
786     return retval;
787 }
788 
BlastMaskLocFree(BlastMaskLoc * mask_loc)789 BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc)
790 {
791    Int4 index;
792 
793    if (mask_loc == NULL)
794       return NULL;
795 
796    for (index=0; index<mask_loc->total_size; index++)
797    {
798       if (mask_loc->seqloc_array != NULL)
799          BlastSeqLocFree(mask_loc->seqloc_array[index]);
800    }
801    sfree(mask_loc->seqloc_array);
802    sfree(mask_loc);
803    return NULL;
804 }
805 
BlastMaskLocDNAToProtein(BlastMaskLoc * mask_loc,const BlastQueryInfo * query_info)806 Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc,
807                               const BlastQueryInfo* query_info)
808 {
809     Uint4 seq_index;
810     BlastSeqLoc* dna_seqlocs[NUM_FRAMES];
811 
812     if (!mask_loc)
813         return 0;
814 
815     /* Check that the array size in BlastMaskLoc corresponds to the number
816        of contexts in BlastQueryInfo. */
817     ASSERT(mask_loc->total_size == query_info->last_context + 1);
818 
819     /* Loop over multiple DNA sequences */
820     for (seq_index = 0; seq_index < (Uint4)query_info->num_queries;
821          ++seq_index) {
822         const Uint4 ctx_idx = NUM_FRAMES * seq_index;
823         const Int4 dna_length = BlastQueryInfoGetQueryLength(query_info,
824                                                              eBlastTypeBlastx,
825                                                              seq_index);
826         Int4 context;
827 
828         /* Save the DNA masking locations, as they'll be freed and overwritten
829          * by their translations */
830         memset((void*) &dna_seqlocs, 0, sizeof(dna_seqlocs));
831         memcpy((void*) &dna_seqlocs,
832                (void*) &mask_loc->seqloc_array[ctx_idx],
833                sizeof(dna_seqlocs));
834         memset((void*) &mask_loc->seqloc_array[ctx_idx], 0, sizeof(dna_seqlocs));
835 
836         /* Reproduce this mask for all 6 frames, with translated coordinates */
837         for (context = 0; context < NUM_FRAMES; ++context) {
838             const Int2 frame = BLAST_ContextToFrame(eBlastTypeBlastx, context);
839             BlastSeqLoc* frame_seqloc = dna_seqlocs[context];
840             BlastSeqLoc* prot_tail = NULL;
841             BlastSeqLoc* itr = NULL;
842 
843             /* If no masks were provided for some frames, use the first one */
844             if (frame_seqloc == NULL && dna_seqlocs[0]) {
845                 frame_seqloc = dna_seqlocs[0];
846             }
847             for (itr = frame_seqloc; itr; itr = itr->next) {
848                 Int4 from, to;
849                 SSeqRange* seq_range = itr->ssr;
850                 /* masks should be 0-offset */
851                 ASSERT(seq_range->right < dna_length);
852                 ASSERT(seq_range->left  >= 0);
853                 if (frame < 0) {
854                     from = (dna_length + frame - seq_range->right)/CODON_LENGTH;
855                     to = (dna_length + frame - seq_range->left)/CODON_LENGTH;
856                 } else {
857                     from = (seq_range->left - frame + 1)/CODON_LENGTH;
858                     to = (seq_range->right - frame + 1)/CODON_LENGTH;
859                 }
860 
861                 if (from < 0)
862                     from = 0;
863                 if (to   < 0)
864                     to   = 0;
865                 if (from >= query_info->contexts[ctx_idx+context].query_length)
866                     from = query_info->contexts[ctx_idx+context].query_length - 1;
867                 if (to >= query_info->contexts[ctx_idx+context].query_length)
868                     to = query_info->contexts[ctx_idx+context].query_length - 1;
869 
870                 ASSERT(from >= 0);
871                 ASSERT(to   >= 0);
872                 ASSERT(from < query_info->contexts[ctx_idx+context].query_length);
873                 ASSERT(to   < query_info->contexts[ctx_idx+context].query_length);
874 
875                 /* Cache the tail of the list to avoid the overhead of
876                  * traversing the list when appending to it */
877                 prot_tail = BlastSeqLocNew((prot_tail
878                             ? & prot_tail
879                             : & mask_loc->seqloc_array[ctx_idx+context]),
880                             from, to);
881             }
882         }
883         for (context = 0; context < NUM_FRAMES; ++context) {
884             BlastSeqLocFree(dna_seqlocs[context]);
885         }
886     }
887 
888     return 0;
889 }
890 
891 
BlastMaskLocProteinToDNA(BlastMaskLoc * mask_loc,const BlastQueryInfo * query_info)892 Int2 BlastMaskLocProteinToDNA(BlastMaskLoc* mask_loc,
893                               const BlastQueryInfo* query_info)
894 {
895    Int2 status = 0;
896    Int4 index;
897 
898    /* If there is not mask, there is nothing to convert to DNA coordinates,
899       hence just return. */
900    if (!mask_loc)
901       return 0;
902 
903    /* Check that the array size in BlastMaskLoc corresponds to the number
904       of contexts in BlastQueryInfo. */
905    ASSERT(mask_loc->total_size == query_info->last_context + 1);
906 
907    /* Loop over all DNA sequences */
908    for (index=0; index < query_info->num_queries; ++index)
909    {
910        Int4 frame_start = index*NUM_FRAMES;
911        Int4 frame_index;
912        Int4 dna_length = BlastQueryInfoGetQueryLength(query_info,
913                                                       eBlastTypeBlastx,
914                                                       index);
915        /* Loop over all frames of one DNA sequence */
916        for (frame_index=frame_start; frame_index<(frame_start+NUM_FRAMES);
917             frame_index++) {
918            BlastSeqLoc* loc;
919            Int2 frame =
920                BLAST_ContextToFrame(eBlastTypeBlastx, frame_index % NUM_FRAMES);
921            /* Loop over all mask locations for a given frame */
922            for (loc = mask_loc->seqloc_array[frame_index]; loc; loc = loc->next) {
923                Int4 from=0, to=0;
924                SSeqRange* seq_range = loc->ssr;
925                if (frame < 0) {
926                    to = dna_length - CODON_LENGTH*seq_range->left + frame;
927                    from = dna_length - CODON_LENGTH*seq_range->right + frame + 1;
928                } else {
929                    from = CODON_LENGTH*seq_range->left + frame - 1;
930                    to = CODON_LENGTH*seq_range->right + frame - 1;
931                }
932 
933                if (from < 0)
934                    from = 0;
935                if (to   < 0)
936                    to   = 0;
937                if (from >= dna_length)
938                    from = dna_length - 1;
939                if (to   >= dna_length)
940                    to   = dna_length - 1;
941 
942                ASSERT(from >= 0);
943                ASSERT(to   >= 0);
944                ASSERT(from < dna_length);
945                ASSERT(to   < dna_length);
946 
947                seq_range->left = from;
948                seq_range->right = to;
949            }
950        }
951    }
952    return status;
953 }
954 
955 /** Used for qsort, compares two SeqLoc's by starting position. */
s_SeqRangeSortByStartPosition(const void * vp1,const void * vp2)956 static int s_SeqRangeSortByStartPosition(const void *vp1, const void *vp2)
957 {
958    BlastSeqLoc* v1 = *((BlastSeqLoc**) vp1);
959    BlastSeqLoc* v2 = *((BlastSeqLoc**) vp2);
960    SSeqRange* loc1 = (SSeqRange*) v1->ssr;
961    SSeqRange* loc2 = (SSeqRange*) v2->ssr;
962 
963    if (loc1->left < loc2->left)
964       return -1;
965    else if (loc1->left > loc2->left)
966       return 1;
967    else
968       return 0;
969 }
970 
971 void
BlastSeqLocCombine(BlastSeqLoc ** mask_loc,Int4 link_value)972 BlastSeqLocCombine(BlastSeqLoc** mask_loc, Int4 link_value)
973 {
974     BlastSeqLoc** ptrs = NULL;
975     Int4 i = 0, num_elems = 0;
976 
977     /* Break up the list into an array of pointers and sort it */
978     ptrs = s_BlastSeqLocListToArrayOfPointers(*mask_loc, &num_elems);
979     if (num_elems == 0) {
980         return;
981     }
982     ASSERT(ptrs);
983     qsort(ptrs, (size_t)num_elems, sizeof(*ptrs),
984           s_SeqRangeSortByStartPosition);
985 
986     /* Merge the overlapping elements */
987     {
988         BlastSeqLoc* curr_tail = *mask_loc = ptrs[0];
989         for (i = 0; i < num_elems - 1; i++) {
990             const SSeqRange* next_ssr = ptrs[i+1]->ssr;
991             const Int4 stop = curr_tail->ssr->right;
992 
993             if ((stop + link_value) > next_ssr->left) {
994                 curr_tail->ssr->right = MAX(stop, next_ssr->right);
995                 ptrs[i+1] = BlastSeqLocNodeFree(ptrs[i+1]);
996             } else {
997                 curr_tail = ptrs[i+1];
998             }
999         }
1000     }
1001 
1002     /* Rebuild the linked list */
1003     {
1004         BlastSeqLoc* tail = *mask_loc;
1005         for (i = 1; i < num_elems; i++) {
1006             if (ptrs[i]) {
1007                 tail->next = ptrs[i];
1008                 tail = ptrs[i];
1009             }
1010         }
1011         tail->next = NULL;
1012     }
1013     sfree(ptrs);
1014 }
1015 
1016 Int2
BLAST_ComplementMaskLocations(EBlastProgramType program_number,const BlastQueryInfo * query_info,const BlastMaskLoc * mask_loc,BlastSeqLoc ** complement_mask)1017 BLAST_ComplementMaskLocations(EBlastProgramType program_number,
1018    const BlastQueryInfo* query_info,
1019    const BlastMaskLoc* mask_loc, BlastSeqLoc* *complement_mask)
1020 {
1021    Int4 context;
1022    const Boolean kIsNucl = (program_number == eBlastTypeBlastn ||
1023                             program_number == eBlastTypeMapping);
1024    BlastSeqLoc* tail = NULL;    /* Pointer to the tail of the complement_mask
1025                                    linked list */
1026 
1027    if (complement_mask == NULL)
1028 	return -1;
1029 
1030    *complement_mask = NULL;
1031 
1032    for (context = query_info->first_context;
1033         context <= query_info->last_context; ++context) {
1034 
1035       Boolean first = TRUE;	/* Specifies beginning of query. */
1036       Boolean last_interval_open=TRUE; /* if TRUE last interval needs to be closed. */
1037       Int4 start_offset, end_offset, filter_start, filter_end;
1038       Int4 left=0, right; /* Used for left/right extent of a region. */
1039       BlastSeqLoc* loc = NULL;
1040 
1041       if (query_info->contexts[context].is_valid == FALSE) {
1042           continue;
1043       }
1044 
1045       start_offset = query_info->contexts[context].query_offset;
1046       end_offset = query_info->contexts[context].query_length
1047           + start_offset - 1;
1048       ASSERT(start_offset <= end_offset);
1049 
1050       /* mask_loc NULL is simply the case that NULL was passed in, which we
1051          take to mean that nothing on query is masked. */
1052       if (mask_loc == NULL || mask_loc->seqloc_array[context] == NULL) {
1053          /* Cache the tail of the list to avoid the overhead of traversing the
1054           * list when appending to it */
1055          tail = BlastSeqLocNew(tail ? &tail : complement_mask,
1056                                start_offset, end_offset);
1057          continue;
1058       }
1059 
1060       if (BlastIsReverseStrand(kIsNucl, context)) {
1061          BlastSeqLocListReverse(&mask_loc->seqloc_array[context]);
1062       }
1063       loc = mask_loc->seqloc_array[context];
1064 
1065       first = TRUE;
1066       for ( ; loc; loc = loc->next) {
1067          SSeqRange* seq_range = loc->ssr;
1068          if (BlastIsReverseStrand(kIsNucl, context)) {
1069             filter_start = end_offset - seq_range->right;
1070             filter_end = end_offset - seq_range->left;
1071          } else {
1072             filter_start = start_offset + seq_range->left;
1073             filter_end = start_offset + seq_range->right;
1074          }
1075          /* The canonical "state" at the top of this
1076             while loop is that both "left" and "right" have
1077             been initialized to their correct values.
1078             The first time this loop is entered in a call to
1079             the function this is not true and the following "if"
1080             statement moves everything to the canonical state. */
1081          if (first) {
1082             last_interval_open = TRUE;
1083             first = FALSE;
1084 
1085             if (filter_start > start_offset) {
1086                /* beginning of sequence not filtered */
1087                left = start_offset;
1088             } else {
1089                /* beginning of sequence filtered */
1090                left = filter_end + 1;
1091                continue;
1092             }
1093          }
1094 
1095          right = filter_start - 1;
1096 
1097          /* Cache the tail of the list to avoid the overhead of traversing the
1098           * list when appending to it */
1099          tail = BlastSeqLocNew((tail ? &tail : complement_mask), left, right);
1100          if (filter_end >= end_offset) {
1101             /* last masked region at end of sequence */
1102             last_interval_open = FALSE;
1103             break;
1104          } else {
1105             left = filter_end + 1;
1106          }
1107       }
1108 
1109       if (last_interval_open) {
1110          /* Need to finish SSeqRange* for last interval. */
1111          right = end_offset;
1112          /* Cache the tail of the list to avoid the overhead of traversing the
1113           * list when appending to it */
1114          tail = BlastSeqLocNew((tail ? &tail : complement_mask), left, right);
1115       }
1116    }
1117    return 0;
1118 }
1119 
1120 
1121 Int2
BlastSetUp_Filter(EBlastProgramType program_number,Uint1 * sequence,Int4 length,Int4 offset,const SBlastFilterOptions * filter_options,BlastSeqLoc ** seqloc_retval,Blast_Message ** blast_message)1122 BlastSetUp_Filter(EBlastProgramType program_number,
1123                   Uint1* sequence,
1124                   Int4 length,
1125                   Int4 offset,
1126                   const SBlastFilterOptions* filter_options,
1127                   BlastSeqLoc** seqloc_retval,
1128                   Blast_Message* *blast_message)
1129 {
1130 	Int2 status=0;		/* return value. */
1131 
1132     ASSERT(filter_options);
1133     ASSERT(seqloc_retval);
1134 
1135     *seqloc_retval = NULL;
1136 
1137     status = SBlastFilterOptionsValidate(program_number, filter_options,
1138                                          blast_message);
1139     if (status)
1140        return status;
1141 
1142 	if (filter_options->segOptions)
1143 	{
1144         SSegOptions* seg_options = filter_options->segOptions;
1145         SegParameters* sparamsp=NULL;
1146 
1147         sparamsp = SegParametersNewAa();
1148         sparamsp->overlaps = TRUE;
1149         if (seg_options->window > 0)
1150             sparamsp->window = seg_options->window;
1151         if (seg_options->locut > 0.0)
1152             sparamsp->locut = seg_options->locut;
1153         if (seg_options->hicut > 0.0)
1154             sparamsp->hicut = seg_options->hicut;
1155 
1156 		status = SeqBufferSeg(sequence, length, offset, sparamsp,
1157                               seqloc_retval);
1158 		SegParametersFree(sparamsp);
1159 		sparamsp = NULL;
1160 	}
1161 
1162     if (filter_options->readQualityOptions) {
1163         status = FilterQueriesForMapping(sequence, length, offset,
1164                                          filter_options->readQualityOptions,
1165                                          seqloc_retval);
1166     }
1167 
1168 
1169 	return status;
1170 }
1171 
1172 void
BlastSeqLocReverse(BlastSeqLoc * masks,Int4 query_length)1173 BlastSeqLocReverse(BlastSeqLoc* masks, Int4 query_length)
1174 {
1175     for(; masks; masks = masks->next) {
1176         masks->ssr->left    = query_length - 1 - masks->ssr->right;
1177         masks->ssr->right   = query_length - 1 - masks->ssr->left;
1178     }
1179 }
1180 
1181 /** Calculates the mask locations one context at a time.
1182  * @param query_blk sequence [in]
1183  * @param query_info information about sequences [in]
1184  * @param context which context is this?  [in]
1185  * @param program_number program (blastn, blastp, etc.) [in]
1186  * @param filter_options instructions for producing mask [in]
1187  * @param filter_out results of filtering operations [out]
1188  * @param blast_message any error or warning messages [out]
1189  * @return zero on success
1190  */
1191 static Int2
s_GetFilteringLocationsForOneContext(BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,Int4 context,EBlastProgramType program_number,const SBlastFilterOptions * filter_options,BlastSeqLoc ** filter_out,Blast_Message ** blast_message)1192 s_GetFilteringLocationsForOneContext(BLAST_SequenceBlk* query_blk,
1193                                      const BlastQueryInfo* query_info,
1194                                      Int4 context,
1195                                      EBlastProgramType program_number,
1196                                      const SBlastFilterOptions* filter_options,
1197                                      BlastSeqLoc* *filter_out,
1198                                      Blast_Message* *blast_message)
1199 {
1200     Int2 status = 0;
1201     Int4 query_length = 0;      /* Length of query described by SeqLocPtr. */
1202     Int4 context_offset;
1203     Uint1 *buffer;              /* holds sequence for plus strand or protein. */
1204 
1205     const Boolean kIsNucl = (program_number == eBlastTypeBlastn ||
1206                              program_number == eBlastTypeMapping);
1207 
1208     context_offset = query_info->contexts[context].query_offset;
1209     buffer = &query_blk->sequence[context_offset];
1210 
1211     if (query_info->contexts[context].is_valid == FALSE) {
1212           return 0;
1213     }
1214 
1215     query_length = query_info->contexts[context].query_length;
1216 
1217     status = BlastSetUp_Filter(program_number,
1218                                buffer,
1219                                query_length,
1220                                0,
1221                                filter_options,
1222                                filter_out,
1223                                blast_message);
1224     if (status)
1225          return status;
1226 
1227     if (BlastIsReverseStrand(kIsNucl, context) == TRUE) {
1228         /* Reverse this as it's on minus strand. */
1229         BlastSeqLocReverse(*filter_out, query_length);
1230     }
1231 
1232     /* Extract the mask locations corresponding to this query
1233        (frame, strand), detach it from other masks.
1234        NB: for translated search the mask locations are expected in
1235        protein coordinates. The nucleotide locations must be converted
1236        to protein coordinates prior to the call to BLAST_MainSetUp.
1237     */
1238     {
1239         /* Auxiliary locations for lower-case masking or any other masking
1240          * which occurred outside of CORE BLAST */
1241         BlastSeqLoc *lcase_mask_slp = NULL;
1242         if (query_blk->lcase_mask && query_blk->lcase_mask->seqloc_array)
1243         {
1244             ASSERT(context < query_blk->lcase_mask->total_size);
1245             lcase_mask_slp = query_blk->lcase_mask->seqloc_array[context];
1246             /* Set location list to NULL, to allow safe memory deallocation,
1247               ownership transferred to filter_out below. */
1248             query_blk->lcase_mask->seqloc_array[context] = NULL;
1249         }
1250 
1251         /* Attach the lower case mask locations to the filter locations and
1252            combine them */
1253         BlastSeqLocAppend(filter_out, lcase_mask_slp);
1254     }
1255 
1256     BlastSeqLocCombine(filter_out, 0);
1257 
1258 	return 0;
1259 }
1260 
1261 Int2
BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,EBlastProgramType program_number,const SBlastFilterOptions * filter_options,BlastMaskLoc ** filter_maskloc,Blast_Message ** blast_message)1262 BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk,
1263                                  const BlastQueryInfo* query_info,
1264                                  EBlastProgramType program_number,
1265                                  const SBlastFilterOptions* filter_options,
1266                                  BlastMaskLoc** filter_maskloc,
1267                                  Blast_Message** blast_message)
1268 {
1269     Int2 status = 0;
1270     Int4 context = 0; /* loop variable. */
1271     const int kNumContexts = query_info->last_context + 1;
1272 
1273     ASSERT(query_info && query_blk && filter_maskloc);
1274 
1275     ASSERT(blast_message);
1276     ASSERT(kNumContexts ==
1277            query_info->num_queries*BLAST_GetNumberOfContexts(program_number));
1278     *filter_maskloc = BlastMaskLocNew(kNumContexts);
1279 
1280     for (context = query_info->first_context;
1281          context <= query_info->last_context; ++context) {
1282 
1283         BlastSeqLoc *filter_per_context = NULL;
1284         status = s_GetFilteringLocationsForOneContext(query_blk,
1285                                                       query_info,
1286                                                       context,
1287                                                       program_number,
1288                                                       filter_options,
1289                                                       &filter_per_context,
1290                                                       blast_message);
1291         if (status) {
1292             Blast_MessageWrite(blast_message, eBlastSevError, context,
1293                                    "Failure at filtering");
1294             return status;
1295         }
1296 
1297     /* NB: for translated searches filter locations are returned in
1298            protein coordinates, because the DNA lengths of sequences are
1299            not available here. The caller must take care of converting
1300            them back to nucleotide coordinates. */
1301          (*filter_maskloc)->seqloc_array[context] = filter_per_context;
1302     }
1303     return 0;
1304 }
1305 
1306 void
Blast_MaskTheResidues(Uint1 * buffer,Int4 length,Boolean is_na,const BlastSeqLoc * mask_loc,Boolean reverse,Int4 offset)1307 Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na,
1308                       const BlastSeqLoc* mask_loc, Boolean reverse, Int4 offset)
1309 {
1310     const Uint1 kMaskingLetter = is_na ? kNuclMask : kProtMask;
1311     ASSERT(buffer);
1312     for (; mask_loc; mask_loc = mask_loc->next) {
1313 
1314         Int4 index, start, stop;
1315 
1316         if (reverse) {
1317             start = length - 1 - mask_loc->ssr->right;
1318             stop = length - 1 - mask_loc->ssr->left;
1319         } else {
1320             start = mask_loc->ssr->left;
1321             stop = mask_loc->ssr->right;
1322         }
1323 
1324         start -= offset;
1325         stop -= offset;
1326 
1327         ASSERT(start < length);
1328         ASSERT(stop <= length);
1329 
1330         for (index = start; index <= stop; index++)
1331             buffer[index] = kMaskingLetter;
1332     }
1333 }
1334 
1335 void
Blast_MaskUnsupportedAA(BLAST_SequenceBlk * seq,Uint1 min_invalid)1336 Blast_MaskUnsupportedAA(BLAST_SequenceBlk* seq, Uint1 min_invalid)
1337 {
1338     Uint1 *sequence = seq->sequence;
1339     Int4 length = seq->length;
1340     Int4 i;
1341 
1342     for (i = 0; i < length; i++) {
1343         if (sequence[i] >= min_invalid) {
1344             sequence[i] = kProtMask;
1345         }
1346     }
1347 }
1348 
1349 void
BlastSetUp_MaskQuery(BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const BlastMaskLoc * filter_maskloc,EBlastProgramType program_number)1350 BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk,
1351                      const BlastQueryInfo* query_info,
1352                      const BlastMaskLoc *filter_maskloc,
1353                      EBlastProgramType program_number)
1354 {
1355     const Boolean kIsNucl = (program_number == eBlastTypeBlastn
1356                              || program_number == eBlastTypeMapping);
1357     Int4 context; /* loop variable. */
1358     Int4 total_length;
1359     Boolean has_mask = FALSE; /* Check for whether filter_maskloc is empty. */
1360     Int4 index; /* loop variable. */
1361 
1362     ASSERT(query_blk);
1363     ASSERT(query_info);
1364     ASSERT(filter_maskloc);
1365 
1366 
1367     for (index=0; index<filter_maskloc->total_size; index++)
1368     {
1369          if (filter_maskloc->seqloc_array[index])
1370          {
1371             has_mask = TRUE;
1372             break;
1373          }
1374     }
1375     if (has_mask == FALSE)
1376        return;
1377 
1378 
1379     total_length  = query_info->contexts[query_info->last_context].query_offset
1380                   + query_info->contexts[query_info->last_context].query_length + 2;
1381     query_blk->sequence_start_nomask = BlastMemDup(query_blk->sequence_start, total_length);
1382     query_blk->sequence_nomask = query_blk->sequence_start_nomask + 1;
1383     query_blk->nomask_allocated = TRUE;
1384 
1385     for (context = query_info->first_context;
1386          context <= query_info->last_context; ++context) {
1387 
1388         Int4 query_length = 0;
1389         Int4 context_offset = 0;
1390         Uint1 *buffer = NULL;              /* holds sequence */
1391 
1392         if (query_info->contexts[context].is_valid == FALSE) {
1393           continue;
1394         }
1395 
1396         query_length = query_info->contexts[context].query_length;
1397 
1398         context_offset = query_info->contexts[context].query_offset;
1399         buffer = &query_blk->sequence[context_offset];
1400         ASSERT(buffer);
1401 
1402         Blast_MaskTheResidues(buffer, query_length, kIsNucl,
1403                               filter_maskloc->seqloc_array[context],
1404                               BlastIsReverseStrand(kIsNucl, context), 0);
1405     }
1406 }
1407