1 /* $Id: blast_filter.c,v 1.102 2016/06/20 15:49:13 fukanchi Exp $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 /** @file blast_filter.c
28  * All code related to query sequence masking/filtering for BLAST
29  */
30 
31 #include <algo/blast/core/blast_util.h>
32 #include <algo/blast/core/blast_filter.h>
33 #include <algo/blast/core/blast_seg.h>
34 
35 /****************************************************************************/
36 /* Constants */
37 const Uint1 kNuclMask = 14;     /* N in BLASTNA */
38 const Uint1 kProtMask = 21;     /* X in NCBISTDAA */
39 
40 
41 /** Allowed length of the filtering options string. */
42 #define BLASTOPTIONS_BUFFER_SIZE 128
43 
44 
45 /** Copies filtering commands for one filtering algorithm from "instructions" to
46  * "buffer".
47  * ";" is a delimiter for the commands for different algorithms, so it stops
48  * copying when a ";" is found.
49  * Example filtering string: "m L; R -d rodents.lib"
50  * @param instructions filtering commands [in]
51  * @param buffer filled with filtering commands for one algorithm. [out]
52 */
53 static const char *
s_LoadOptionsToBuffer(const char * instructions,char * buffer)54 s_LoadOptionsToBuffer(const char *instructions, char* buffer)
55 {
56 	Boolean not_started=TRUE;
57 	char* buffer_ptr;
58 	const char *ptr;
59 	Int4 index;
60 
61 	ptr = instructions;
62 	buffer_ptr = buffer;
63 	for (index=0; index<BLASTOPTIONS_BUFFER_SIZE && *ptr != NULLB; index++)
64 	{
65 		if (*ptr == ';')
66 		{	/* ";" is a delimiter for different filtering algorithms. */
67 			ptr++;
68 			break;
69 		}
70 		/* Remove blanks at the beginning. */
71 		if (not_started && *ptr == ' ')
72 		{
73 			ptr++;
74 		}
75 		else
76 		{
77 			not_started = FALSE;
78 			*buffer_ptr = *ptr;
79 			buffer_ptr++; ptr++;
80 		}
81 	}
82 
83 	*buffer_ptr = NULLB;
84 
85 	if (not_started == FALSE)
86 	{	/* Remove trailing blanks. */
87 		buffer_ptr--;
88 		while (*buffer_ptr == ' ' && buffer_ptr > buffer)
89 		{
90 			*buffer_ptr = NULLB;
91 			buffer_ptr--;
92 		}
93 	}
94 
95 	return ptr;
96 }
97 
98 /** Parses repeat filtering options string.
99  * @param repeat_options Input character string [in]
100  * @param dbname Database name for repeats filtering [out]
101  */
102 static Int2
s_ParseRepeatOptions(const char * repeat_options,char ** dbname)103 s_ParseRepeatOptions(const char* repeat_options, char** dbname)
104 {
105     char* ptr;
106 
107     ASSERT(dbname);
108     *dbname = NULL;
109 
110     if (!repeat_options)
111         return 0;
112 
113     ptr = strstr(repeat_options, "-d");
114     if (ptr) {
115         ptr += 2;
116         while (*ptr == ' ' || *ptr == '\t')
117             ++ptr;
118         *dbname = strdup(ptr);
119     }
120     return 0;
121 }
122 
123 /** Parses window masker options string.
124  * @param winmask_options Input character string [in]
125  * @param dbname Database name for window masker filtering [out]
126  * @param taxid Taxonomic ID for window masker filtering [out]
127  */
128 static Int2
s_ParseWindowMaskerOptions(const char * winmask_options,char ** dbname,int * taxid)129 s_ParseWindowMaskerOptions(const char  * winmask_options,
130                            char       ** dbname,
131                            int         * taxid)
132 {
133     char* ptr = NULL;
134 
135     ASSERT(dbname);
136     *dbname = NULL;
137 
138     if (!winmask_options)
139         return 0;
140 
141     ptr = strstr(winmask_options, "-d");
142 
143     if (ptr) {
144         char * endp = 0;
145 
146         ptr += 2;
147         while (*ptr == ' ' || *ptr == '\t')
148             ++ptr;
149 
150         *dbname = strdup(ptr);
151 
152         for(endp = *dbname; *endp; ++endp) {
153             if (*endp == ' ' || *endp == '\t') {
154                 *endp = (char)0;
155                 break;
156             }
157         }
158     } else {
159         ptr = strstr(winmask_options, "-t");
160 
161         if (ptr) {
162             ptr += 2;
163             while (*ptr == ' ' || *ptr == '\t')
164                 ++ptr;
165             *taxid = atoi(ptr);
166         }
167     }
168 
169     return 0;
170 }
171 
172 /** Parses options used for dust.
173  * @param ptr buffer containing instructions. [in]
174  * @param level sets level for dust. [out]
175  * @param window sets window for dust [out]
176  * @param linker sets linker for dust. [out]
177 */
178 static Int2
s_ParseDustOptions(const char * ptr,int * level,int * window,int * linker)179 s_ParseDustOptions(const char *ptr, int* level, int* window, int* linker)
180 
181 {
182 	char buffer[BLASTOPTIONS_BUFFER_SIZE];
183 	int arg, index, index1, window_pri=-1, linker_pri=-1, level_pri=-1;
184 
185 	arg = 0;
186 	index1 = 0;
187 	for (index=0; index<BLASTOPTIONS_BUFFER_SIZE; index++)
188 	{
189 		if (*ptr == ' ' || *ptr == NULLB)
190 		{
191 	                long tmplong;
192 			buffer[index1] = NULLB;
193 			index1 = 0;
194 			switch(arg) {
195 				case 0:
196 					sscanf(buffer, "%ld", &tmplong);
197 					level_pri = tmplong;
198 					break;
199 				case 1:
200 					sscanf(buffer, "%ld", &tmplong);
201 					window_pri = tmplong;
202 					break;
203 				case 2:
204 					sscanf(buffer, "%ld", &tmplong);
205 					linker_pri = tmplong;
206 					break;
207 				default:
208 					break;
209 			}
210 
211 			arg++;
212 			while (*ptr == ' ')
213 				ptr++;
214 
215 			/* end of the buffer. */
216 			if (*ptr == NULLB)
217 				break;
218 		}
219 		else
220 		{
221 			buffer[index1] = *ptr; ptr++;
222 			index1++;
223 		}
224 	}
225         if (arg != 0 && arg != 3)
226            return 1;
227 
228 	*level = level_pri;
229 	*window = window_pri;
230 	*linker = linker_pri;
231 
232 	return 0;
233 }
234 
235 /** parses a string to set three seg options.
236  * @param ptr buffer containing instructions [in]
237  * @param window returns "window" for seg algorithm. [out]
238  * @param locut returns "locut" for seg. [out]
239  * @param hicut returns "hicut" for seg. [out]
240 */
241 static Int2
s_ParseSegOptions(const char * ptr,Int4 * window,double * locut,double * hicut)242 s_ParseSegOptions(const char *ptr, Int4* window, double* locut, double* hicut)
243 
244 {
245 	char buffer[BLASTOPTIONS_BUFFER_SIZE];
246 	Int4 arg, index, index1;
247 
248 	arg = 0;
249 	index1 = 0;
250 	for (index=0; index<BLASTOPTIONS_BUFFER_SIZE; index++)
251 	{
252 		if (*ptr == ' ' || *ptr == NULLB)
253 		{
254 	                long tmplong;
255 	                double tmpdouble;
256 			buffer[index1] = NULLB;
257 			index1 = 0;
258 			switch(arg) {
259 				case 0:
260 					sscanf(buffer, "%ld", &tmplong);
261 					*window = tmplong;
262 					break;
263 				case 1:
264 					sscanf(buffer, "%le", &tmpdouble);
265 					*locut = tmpdouble;
266 					break;
267 				case 2:
268 					sscanf(buffer, "%le", &tmpdouble);
269 					*hicut = tmpdouble;
270 					break;
271 				default:
272 					break;
273 			}
274 
275 			arg++;
276 			while (*ptr == ' ')
277 				ptr++;
278 
279 			/* end of the buffer. */
280 			if (*ptr == NULLB)
281 				break;
282 		}
283 		else
284 		{
285 			buffer[index1] = *ptr; ptr++;
286 			index1++;
287 		}
288 	}
289         if (arg != 0 && arg != 3)
290            return 1;
291 
292 	return 0;
293 }
294 
295 /// Wrapper around strcat to ensure we don't do buffer overflows :)
296 /// @param dest string to concatenate to [in|out]
297 /// @param dest_size size of the dest array, modified if dest is grown [in|out]
298 /// @param string2append string to append to dest [in]
299 /// @return the concatenated string or NULL if we run out of memory
300 static char*
s_SafeStrCat(char ** dest,unsigned int * dest_size,const char * string2append)301 s_SafeStrCat(char** dest, unsigned int* dest_size, const char* string2append)
302 {
303     size_t dest_length = strlen(*dest);
304     size_t string2append_length = strlen(string2append);
305     if ((dest_length + string2append_length + 1) > *dest_size) {
306         size_t target_size = MAX(string2append_length, dest_length) * 2;
307         *dest = (char*)realloc((void*)*dest, target_size);
308         if (*dest) {
309             (*dest_size) = target_size;
310         } else {
311             sfree(*dest);
312             return 0;
313         }
314     }
315     strcat(*dest, string2append);
316     return *dest;
317 }
318 
319 char*
BlastFilteringOptionsToString(const SBlastFilterOptions * filtering_options)320 BlastFilteringOptionsToString(const SBlastFilterOptions* filtering_options)
321 {
322     char* retval = NULL;
323     unsigned int retval_size = 0;
324 
325     if (filtering_options == NULL) {
326         return strdup("F");
327     }
328 
329     retval_size = 64;   /* Usually this will suffice */
330     retval = (char*) calloc(retval_size, sizeof(char));
331 
332     if (filtering_options->dustOptions) {
333         if (filtering_options->dustOptions->level == kDustLevel &&
334             filtering_options->dustOptions->window == kDustWindow &&
335             filtering_options->dustOptions->linker == kDustLinker) {
336             if (!s_SafeStrCat(&retval, &retval_size, "L;")) {
337                 return 0;
338             }
339         } else {
340             char buffer[24] = { '\0' };
341             snprintf(buffer, sizeof(buffer), "D %d %d %d;",
342                      filtering_options->dustOptions->level,
343                      filtering_options->dustOptions->window,
344                      filtering_options->dustOptions->linker);
345             if (!s_SafeStrCat(&retval, &retval_size, buffer)) {
346                 return 0;
347             }
348         }
349     }
350 
351     if (filtering_options->segOptions) {
352         if (filtering_options->segOptions->window == kSegWindow &&
353             filtering_options->segOptions->locut == kSegLocut &&
354             filtering_options->segOptions->hicut == kSegHicut) {
355             if (!s_SafeStrCat(&retval, &retval_size, "L;")) {
356                 return 0;
357             }
358         } else {
359             char buffer[24] = { '\0' };
360             snprintf(buffer, sizeof(buffer), "S %d %1.1f %1.1f;",
361                      filtering_options->segOptions->window,
362                      filtering_options->segOptions->locut,
363                      filtering_options->segOptions->hicut);
364             if (!s_SafeStrCat(&retval, &retval_size, buffer)) {
365                 return 0;
366             }
367         }
368     }
369 
370     if (filtering_options->repeatFilterOptions) {
371         if (filtering_options->repeatFilterOptions->database) {
372             if (!s_SafeStrCat(&retval, &retval_size, "R -d ")) {
373                 return 0;
374             }
375             if (!s_SafeStrCat(&retval, &retval_size,
376                          filtering_options->repeatFilterOptions->database)) {
377                 return 0;
378             }
379             if (!s_SafeStrCat(&retval, &retval_size, ";")) {
380                 return 0;
381             }
382         } else {
383             if (!s_SafeStrCat(&retval, &retval_size, "R;")) {
384                 return 0;
385             }
386         }
387     }
388 
389     if (filtering_options->windowMaskerOptions) {
390         if (filtering_options->windowMaskerOptions->taxid != 0) {
391             char buffer[24] = { '\0' };
392             snprintf(buffer, sizeof(buffer), "W -t %d;",
393                      filtering_options->windowMaskerOptions->taxid);
394             if (!s_SafeStrCat(&retval, &retval_size, buffer)) {
395                 return 0;
396             }
397         } else if (filtering_options->windowMaskerOptions->database) {
398             if (!s_SafeStrCat(&retval, &retval_size, "W -d ")) {
399                 return 0;
400             }
401             if (!s_SafeStrCat(&retval, &retval_size,
402                          filtering_options->windowMaskerOptions->database)) {
403                 return 0;
404             }
405             if (!s_SafeStrCat(&retval, &retval_size, ";")) {
406                 return 0;
407             }
408         }
409     }
410 
411     /* Mask at hash is a modifier for other filtering options, as such it
412      * doesn't make sense to apply it by itself */
413     if (SBlastFilterOptionsMaskAtHash(filtering_options)) {
414         if (strlen(retval) != 0) {
415             /* Add mask at hash as a modifier for other filtering options */
416             if (!s_SafeStrCat(&retval, &retval_size, "m;")) {
417                 return 0;
418             }
419         } else {
420             /* We still need to set "m" in a filter string (WB-391, WB-394) */
421             /* The string below can be modified into "mF" or "mL" or
422                whatever is decided to be the conventional meaning */
423             if (!s_SafeStrCat(&retval, &retval_size, "m;")) {
424                 return 0;
425             }
426         }
427     }
428 
429     return strlen(retval) == 0
430         ? s_SafeStrCat(&retval, &retval_size, "F")
431         : retval;
432 }
433 
434 Int2
BlastFilteringOptionsFromString(EBlastProgramType program_number,const char * instructions,SBlastFilterOptions ** filtering_options,Blast_Message ** blast_message)435 BlastFilteringOptionsFromString(EBlastProgramType program_number,
436                                 const char* instructions,
437                                 SBlastFilterOptions* *filtering_options,
438                                 Blast_Message* *blast_message)
439 {
440         Boolean mask_at_hash = FALSE; /* the default. */
441         char* buffer;
442         const char* ptr = instructions;
443         char error_buffer[1024];
444         Int2 status = 0;
445         SSegOptions* segOptions = NULL;
446         SDustOptions* dustOptions = NULL;
447         SRepeatFilterOptions* repeatOptions = NULL;
448         SWindowMaskerOptions * winmaskOptions = NULL;
449 
450         *filtering_options = NULL;
451         if (blast_message)
452             *blast_message = NULL;
453 
454         if (instructions == NULL || strcasecmp(instructions, "F") == 0)
455         {
456              SBlastFilterOptionsNew(filtering_options, eEmpty);
457              return status;
458         }
459 
460         buffer = (char*) calloc(strlen(instructions), sizeof(char));
461 	/* allow old-style filters when m cannot be followed by the ';' */
462 	if (ptr[0] == 'm' && ptr[1] == ' ')
463 	{
464 		mask_at_hash = TRUE;
465 		ptr += 2;
466 	}
467 
468 	while (*ptr != NULLB)
469 	{
470 		if (*ptr == 'S')
471 		{
472                         SSegOptionsNew(&segOptions);
473 			ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
474 			if (buffer[0] != NULLB)
475 			{
476                                 int window = 0;
477                                 double locut = .0, hicut = .0;
478 				status = s_ParseSegOptions(buffer, &window, &locut, &hicut);
479                                 if (status)
480                                 {
481                                      segOptions = SSegOptionsFree(segOptions);
482                                      sprintf(error_buffer, "Error parsing filter string: %s", buffer);
483                                      if (blast_message)
484                                        Blast_MessageWrite(blast_message, eBlastSevError, kBlastMessageNoContext,
485                                             error_buffer);
486                                      sfree(buffer);
487                                      return status;
488                                 }
489                                 segOptions->window = window;
490                                 segOptions->locut = locut;
491                                 segOptions->hicut = hicut;
492 			}
493 		}
494 		else if (*ptr == 'D')
495 		{
496                         SDustOptionsNew(&dustOptions);
497 			ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
498 			if (buffer[0] != NULLB)
499                         {
500                                 int window = 0, level = 0, linker = 0;
501 				status = s_ParseDustOptions(buffer, &level, &window, &linker);
502                                 if (status)
503                                 {
504                                      dustOptions = SDustOptionsFree(dustOptions);
505                                      sprintf(error_buffer, "Error parsing filter string: %s", buffer);
506                                      if (blast_message)
507                                        Blast_MessageWrite(blast_message, eBlastSevError, kBlastMessageNoContext,
508                                             error_buffer);
509                                      sfree(buffer);
510                                      return status;
511                                 }
512                                 dustOptions->level = level;
513                                 dustOptions->window = window;
514                                 dustOptions->linker = linker;
515                         }
516 		}
517                 else if (*ptr == 'R')
518                 {
519                         SRepeatFilterOptionsNew(&repeatOptions);
520 			ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
521 			if (buffer[0] != NULLB)
522                         {
523                              char* dbname = NULL;
524                              status = s_ParseRepeatOptions(buffer, &dbname);
525                              if (status)
526                              {
527                                   repeatOptions = SRepeatFilterOptionsFree(repeatOptions);
528                                   sprintf(error_buffer, "Error parsing filter string: %s", buffer);
529                                   if (blast_message)
530                                      Blast_MessageWrite(blast_message, eBlastSevError, kBlastMessageNoContext,
531                                             error_buffer);
532                                    sfree(buffer);
533                                    return status;
534                              }
535                              if (dbname)
536                              {
537                                  sfree(repeatOptions->database);
538                                  repeatOptions->database = dbname;
539                              }
540                         }
541                 }
542                 else if (*ptr == 'W')
543                 {
544                     SWindowMaskerOptionsNew(&winmaskOptions);
545 
546                     ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
547                     if (buffer[0] != NULLB) {
548                         char* dbname = NULL;
549                         int taxid = 0;
550 
551                         status = s_ParseWindowMaskerOptions(buffer, &dbname, &taxid);
552                         if (status) {
553                             winmaskOptions = SWindowMaskerOptionsFree(winmaskOptions);
554                             sprintf(error_buffer, "Error parsing filter string: %s", buffer);
555                             if (blast_message)
556                                 Blast_MessageWrite(blast_message, eBlastSevError, kBlastMessageNoContext,
557                                                    error_buffer);
558 
559                             sfree(buffer);
560                             return status;
561                         }
562                         if (dbname) {
563                             sfree(winmaskOptions->database);
564                             winmaskOptions->database = dbname;
565                         }
566                         if (taxid) {
567                             winmaskOptions->taxid = taxid;
568                         }
569                     }
570                 }
571 		else if (*ptr == 'L' || *ptr == 'T')
572 		{ /* do low-complexity filtering; dust for blastn, otherwise seg.*/
573                         if (program_number == eBlastTypeBlastn
574                             || program_number == eBlastTypeMapping)
575 
576                             SDustOptionsNew(&dustOptions);
577                         else
578                             SSegOptionsNew(&segOptions);
579 			ptr++;
580 		}
581 		else if (*ptr == 'm')
582 		{
583 		        mask_at_hash = TRUE;
584                         ptr++;
585                 }
586                 else
587                 {    /* Nothing applied */
588                          ptr++;
589                 }
590 	}
591         sfree(buffer);
592 
593         status = SBlastFilterOptionsNew(filtering_options, eEmpty);
594         if (status)
595             return status;
596 
597         (*filtering_options)->dustOptions = dustOptions;
598         (*filtering_options)->segOptions = segOptions;
599         (*filtering_options)->repeatFilterOptions = repeatOptions;
600         (*filtering_options)->windowMaskerOptions = winmaskOptions;
601         (*filtering_options)->mask_at_hash = mask_at_hash;
602 
603         return status;
604 }
605 
606 
BlastSeqLocNew(BlastSeqLoc ** head,Int4 from,Int4 to)607 BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, Int4 from, Int4 to)
608 {
609    BlastSeqLoc* loc = (BlastSeqLoc*) calloc(1, sizeof(BlastSeqLoc));
610    if ( !loc ) {
611        return NULL;
612    }
613    loc->ssr = (SSeqRange*) calloc(1, sizeof(SSeqRange));
614    loc->ssr->left = from;
615    loc->ssr->right = to;
616 
617    return BlastSeqLocAppend(head, loc);
618 }
619 
BlastSeqLocAppend(BlastSeqLoc ** head,BlastSeqLoc * node)620 BlastSeqLoc* BlastSeqLocAppend(BlastSeqLoc** head, BlastSeqLoc* node)
621 {
622     if ( !node ) {
623         return NULL;
624     }
625 
626     if (head)
627     {
628         if (*head)
629         {
630             BlastSeqLoc* tmp = *head;
631             while (tmp->next)
632                tmp = tmp->next;
633             tmp->next = node;
634         }
635         else
636         {
637             *head = node;
638         }
639     }
640 
641     return node;
642 }
643 
644 /** Makes a copy of the BlastSeqLoc and also a copy of the
645  * SSRange element.  Does not copy BlastSeqLoc that is pointed
646  * to by "next".
647  * @param source the object to be copied [in]
648  * @return another BlastSeqLoc*
649  */
s_BlastSeqLocNodeDup(BlastSeqLoc * source)650 static BlastSeqLoc* s_BlastSeqLocNodeDup(BlastSeqLoc* source)
651 {
652     if ( !source ) {
653         return NULL;
654     }
655     ASSERT(source->ssr);
656     return BlastSeqLocNew(NULL, source->ssr->left, source->ssr->right);
657 }
658 
659 /** Calculates number of links in a chain of BlastSeqLoc's.
660  * @param var Chain of BlastSeqLoc structures [in]
661  * @return Number of links in the chain.
662  */
s_BlastSeqLocLen(const BlastSeqLoc * var)663 static Int4 s_BlastSeqLocLen(const BlastSeqLoc* var)
664 {
665     BlastSeqLoc* itr = NULL;
666     Int4 retval = 0;
667 
668     for (itr = (BlastSeqLoc*)var; itr; itr = itr->next, retval++) {
669         ;
670     }
671     return retval;
672 }
673 
674 /** Converts a BlastSeqLoc list to an array of pointers, each pointing to an
675  * element of the list passed in to this function and the last element points
676  * to NULL
677  * @param list List to convert to an array of pointers [in]
678  * @param count number of elements populated in the array [out]
679  */
680 static BlastSeqLoc**
s_BlastSeqLocListToArrayOfPointers(const BlastSeqLoc * list,Int4 * count)681 s_BlastSeqLocListToArrayOfPointers(const BlastSeqLoc* list, Int4* count)
682 {
683     BlastSeqLoc* tmp,** retval;
684     Int4 i;
685     *count = 0;
686 
687     if (list == NULL)
688        return NULL;
689 
690     *count = s_BlastSeqLocLen(list);
691     retval = (BlastSeqLoc**) calloc(((size_t)(*count)+1), sizeof(BlastSeqLoc*));
692     for (tmp = (BlastSeqLoc*)list, i = 0; tmp != NULL && i < *count; i++) {
693         retval[i] = tmp;
694         tmp = tmp->next;
695     }
696     return retval;
697 }
698 
699 /** Reverse elements in the list
700  * @param head pointer to pointer to the head of the list. [in|out]
701  * (this is not declared static so that it can be tested in the unit tests
702  */
703 NCBI_XBLAST_EXPORT
BlastSeqLocListReverse(BlastSeqLoc ** head)704 void BlastSeqLocListReverse(BlastSeqLoc** head)
705 {
706     BlastSeqLoc** ptrs = NULL;  /* array of pointers to BlastSeqLoc elements */
707     Int4 num_elems = 0, i = 0;
708 
709     if ( !head ) {
710         return;
711     }
712 
713     ptrs = s_BlastSeqLocListToArrayOfPointers(*head, &num_elems);
714     if (num_elems == 0) {
715         return;
716     }
717     ASSERT(ptrs);
718     *head = ptrs[num_elems-1];
719     for (i = num_elems-1; i > 0; i--) {
720         ptrs[i]->next = ptrs[i-1];
721     }
722     ptrs[0]->next = NULL;
723     sfree(ptrs);
724 }
725 
BlastSeqLocNodeFree(BlastSeqLoc * loc)726 BlastSeqLoc* BlastSeqLocNodeFree(BlastSeqLoc* loc)
727 {
728     if ( !loc ) {
729         return NULL;
730     }
731     sfree(loc->ssr);
732     sfree(loc);
733     return NULL;
734 }
735 
BlastSeqLocFree(BlastSeqLoc * loc)736 BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc)
737 {
738     while (loc) {
739         BlastSeqLoc* next_loc = loc->next;
740         loc = BlastSeqLocNodeFree(loc);
741         loc = next_loc;
742     }
743     return NULL;
744 }
745 
BlastSeqLocListDup(BlastSeqLoc * head)746 BlastSeqLoc* BlastSeqLocListDup(BlastSeqLoc* head)
747 {
748     BlastSeqLoc* retval = NULL;
749     BlastSeqLoc* retval_tail = NULL;
750 
751     for (; head; head = head->next) {
752         retval_tail = BlastSeqLocAppend(retval_tail ? &retval_tail : &retval,
753                                         s_BlastSeqLocNodeDup(head));
754     }
755 
756     return retval;
757 }
758 
BlastMaskLocNew(Int4 total)759 BlastMaskLoc* BlastMaskLocNew(Int4 total)
760 {
761     BlastMaskLoc* retval = (BlastMaskLoc *) calloc(1, sizeof(BlastMaskLoc));
762     retval->total_size = total;
763     if (total > 0)
764         retval->seqloc_array = (BlastSeqLoc **) calloc(total,
765                                                        sizeof(BlastSeqLoc *));
766     return retval;
767 }
768 
BlastMaskLocDup(const BlastMaskLoc * mask_loc)769 BlastMaskLoc* BlastMaskLocDup(const BlastMaskLoc* mask_loc)
770 {
771     BlastMaskLoc* retval = NULL;
772     Int4 index = 0;
773 
774     if ( !mask_loc ) {
775         return NULL;
776     }
777 
778     retval = BlastMaskLocNew(mask_loc->total_size);
779 
780     for (index = 0; index < mask_loc->total_size; index++) {
781         retval->seqloc_array[index] =
782             BlastSeqLocListDup(mask_loc->seqloc_array[index]);
783     }
784 
785     return retval;
786 }
787 
BlastMaskLocFree(BlastMaskLoc * mask_loc)788 BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc)
789 {
790    Int4 index;
791 
792    if (mask_loc == NULL)
793       return NULL;
794 
795    for (index=0; index<mask_loc->total_size; index++)
796    {
797       if (mask_loc->seqloc_array != NULL)
798          BlastSeqLocFree(mask_loc->seqloc_array[index]);
799    }
800    sfree(mask_loc->seqloc_array);
801    sfree(mask_loc);
802    return NULL;
803 }
804 
BlastMaskLocDNAToProtein(BlastMaskLoc * mask_loc,const BlastQueryInfo * query_info)805 Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc,
806                               const BlastQueryInfo* query_info)
807 {
808     Uint4 seq_index;
809     BlastSeqLoc* dna_seqlocs[NUM_FRAMES];
810 
811     if (!mask_loc)
812         return 0;
813 
814     /* Check that the array size in BlastMaskLoc corresponds to the number
815        of contexts in BlastQueryInfo. */
816     ASSERT(mask_loc->total_size == query_info->last_context + 1);
817 
818     /* Loop over multiple DNA sequences */
819     for (seq_index = 0; seq_index < (Uint4)query_info->num_queries;
820          ++seq_index) {
821         const Uint4 ctx_idx = NUM_FRAMES * seq_index;
822         const Int4 dna_length = BlastQueryInfoGetQueryLength(query_info,
823                                                              eBlastTypeBlastx,
824                                                              seq_index);
825         Int4 context;
826 
827         /* Save the DNA masking locations, as they'll be freed and overwritten
828          * by their translations */
829         memset((void*) &dna_seqlocs, 0, sizeof(dna_seqlocs));
830         memcpy((void*) &dna_seqlocs,
831                (void*) &mask_loc->seqloc_array[ctx_idx],
832                sizeof(dna_seqlocs));
833         memset((void*) &mask_loc->seqloc_array[ctx_idx], 0, sizeof(dna_seqlocs));
834 
835         /* Reproduce this mask for all 6 frames, with translated coordinates */
836         for (context = 0; context < NUM_FRAMES; ++context) {
837             const Int2 frame = BLAST_ContextToFrame(eBlastTypeBlastx, context);
838             BlastSeqLoc* frame_seqloc = dna_seqlocs[context];
839             BlastSeqLoc* prot_tail = NULL;
840             BlastSeqLoc* itr = NULL;
841 
842             /* If no masks were provided for some frames, use the first one */
843             if (frame_seqloc == NULL && dna_seqlocs[0]) {
844                 frame_seqloc = dna_seqlocs[0];
845             }
846             for (itr = frame_seqloc; itr; itr = itr->next) {
847                 Int4 from, to;
848                 SSeqRange* seq_range = itr->ssr;
849                 /* masks should be 0-offset */
850                 ASSERT(seq_range->right < dna_length);
851                 ASSERT(seq_range->left  >= 0);
852                 if (frame < 0) {
853                     from = (dna_length + frame - seq_range->right)/CODON_LENGTH;
854                     to = (dna_length + frame - seq_range->left)/CODON_LENGTH;
855                 } else {
856                     from = (seq_range->left - frame + 1)/CODON_LENGTH;
857                     to = (seq_range->right - frame + 1)/CODON_LENGTH;
858                 }
859 
860                 if (from < 0)
861                     from = 0;
862                 if (to   < 0)
863                     to   = 0;
864                 if (from >= query_info->contexts[ctx_idx+context].query_length)
865                     from = query_info->contexts[ctx_idx+context].query_length - 1;
866                 if (to >= query_info->contexts[ctx_idx+context].query_length)
867                     to = query_info->contexts[ctx_idx+context].query_length - 1;
868 
869                 ASSERT(from >= 0);
870                 ASSERT(to   >= 0);
871                 ASSERT(from < query_info->contexts[ctx_idx+context].query_length);
872                 ASSERT(to   < query_info->contexts[ctx_idx+context].query_length);
873 
874                 /* Cache the tail of the list to avoid the overhead of
875                  * traversing the list when appending to it */
876                 prot_tail = BlastSeqLocNew((prot_tail
877                             ? & prot_tail
878                             : & mask_loc->seqloc_array[ctx_idx+context]),
879                             from, to);
880             }
881         }
882         for (context = 0; context < NUM_FRAMES; ++context) {
883             BlastSeqLocFree(dna_seqlocs[context]);
884         }
885     }
886 
887     return 0;
888 }
889 
890 
BlastMaskLocProteinToDNA(BlastMaskLoc * mask_loc,const BlastQueryInfo * query_info)891 Int2 BlastMaskLocProteinToDNA(BlastMaskLoc* mask_loc,
892                               const BlastQueryInfo* query_info)
893 {
894    Int2 status = 0;
895    Int4 index;
896 
897    /* If there is not mask, there is nothing to convert to DNA coordinates,
898       hence just return. */
899    if (!mask_loc)
900       return 0;
901 
902    /* Check that the array size in BlastMaskLoc corresponds to the number
903       of contexts in BlastQueryInfo. */
904    ASSERT(mask_loc->total_size == query_info->last_context + 1);
905 
906    /* Loop over all DNA sequences */
907    for (index=0; index < query_info->num_queries; ++index)
908    {
909        Int4 frame_start = index*NUM_FRAMES;
910        Int4 frame_index;
911        Int4 dna_length = BlastQueryInfoGetQueryLength(query_info,
912                                                       eBlastTypeBlastx,
913                                                       index);
914        /* Loop over all frames of one DNA sequence */
915        for (frame_index=frame_start; frame_index<(frame_start+NUM_FRAMES);
916             frame_index++) {
917            BlastSeqLoc* loc;
918            Int2 frame =
919                BLAST_ContextToFrame(eBlastTypeBlastx, frame_index % NUM_FRAMES);
920            /* Loop over all mask locations for a given frame */
921            for (loc = mask_loc->seqloc_array[frame_index]; loc; loc = loc->next) {
922                Int4 from=0, to=0;
923                SSeqRange* seq_range = loc->ssr;
924                if (frame < 0) {
925                    to = dna_length - CODON_LENGTH*seq_range->left + frame;
926                    from = dna_length - CODON_LENGTH*seq_range->right + frame + 1;
927                } else {
928                    from = CODON_LENGTH*seq_range->left + frame - 1;
929                    to = CODON_LENGTH*seq_range->right + frame - 1;
930                }
931 
932                if (from < 0)
933                    from = 0;
934                if (to   < 0)
935                    to   = 0;
936                if (from >= dna_length)
937                    from = dna_length - 1;
938                if (to   >= dna_length)
939                    to   = dna_length - 1;
940 
941                ASSERT(from >= 0);
942                ASSERT(to   >= 0);
943                ASSERT(from < dna_length);
944                ASSERT(to   < dna_length);
945 
946                seq_range->left = from;
947                seq_range->right = to;
948            }
949        }
950    }
951    return status;
952 }
953 
954 /** Used for qsort, compares two SeqLoc's by starting position. */
s_SeqRangeSortByStartPosition(const void * vp1,const void * vp2)955 static int s_SeqRangeSortByStartPosition(const void *vp1, const void *vp2)
956 {
957    BlastSeqLoc* v1 = *((BlastSeqLoc**) vp1);
958    BlastSeqLoc* v2 = *((BlastSeqLoc**) vp2);
959    SSeqRange* loc1 = (SSeqRange*) v1->ssr;
960    SSeqRange* loc2 = (SSeqRange*) v2->ssr;
961 
962    if (loc1->left < loc2->left)
963       return -1;
964    else if (loc1->left > loc2->left)
965       return 1;
966    else
967       return 0;
968 }
969 
970 void
BlastSeqLocCombine(BlastSeqLoc ** mask_loc,Int4 link_value)971 BlastSeqLocCombine(BlastSeqLoc** mask_loc, Int4 link_value)
972 {
973     BlastSeqLoc** ptrs = NULL;
974     Int4 i = 0, num_elems = 0;
975 
976     /* Break up the list into an array of pointers and sort it */
977     ptrs = s_BlastSeqLocListToArrayOfPointers(*mask_loc, &num_elems);
978     if (num_elems == 0) {
979         return;
980     }
981     ASSERT(ptrs);
982     qsort(ptrs, (size_t)num_elems, sizeof(*ptrs),
983           s_SeqRangeSortByStartPosition);
984 
985     /* Merge the overlapping elements */
986     {
987         BlastSeqLoc* curr_tail = *mask_loc = ptrs[0];
988         for (i = 0; i < num_elems - 1; i++) {
989             const SSeqRange* next_ssr = ptrs[i+1]->ssr;
990             const Int4 stop = curr_tail->ssr->right;
991 
992             if ((stop + link_value) > next_ssr->left) {
993                 curr_tail->ssr->right = MAX(stop, next_ssr->right);
994                 ptrs[i+1] = BlastSeqLocNodeFree(ptrs[i+1]);
995             } else {
996                 curr_tail = ptrs[i+1];
997             }
998         }
999     }
1000 
1001     /* Rebuild the linked list */
1002     {
1003         BlastSeqLoc* tail = *mask_loc;
1004         for (i = 1; i < num_elems; i++) {
1005             if (ptrs[i]) {
1006                 tail->next = ptrs[i];
1007                 tail = ptrs[i];
1008             }
1009         }
1010         tail->next = NULL;
1011     }
1012     sfree(ptrs);
1013 }
1014 
1015 Int2
BLAST_ComplementMaskLocations(EBlastProgramType program_number,const BlastQueryInfo * query_info,const BlastMaskLoc * mask_loc,BlastSeqLoc ** complement_mask)1016 BLAST_ComplementMaskLocations(EBlastProgramType program_number,
1017    const BlastQueryInfo* query_info,
1018    const BlastMaskLoc* mask_loc, BlastSeqLoc* *complement_mask)
1019 {
1020    Int4 context;
1021    const Boolean kIsNucl = (program_number == eBlastTypeBlastn ||
1022                             program_number == eBlastTypeMapping);
1023    BlastSeqLoc* tail = NULL;    /* Pointer to the tail of the complement_mask
1024                                    linked list */
1025 
1026    if (complement_mask == NULL)
1027 	return -1;
1028 
1029    *complement_mask = NULL;
1030 
1031    for (context = query_info->first_context;
1032         context <= query_info->last_context; ++context) {
1033 
1034       Boolean first = TRUE;	/* Specifies beginning of query. */
1035       Boolean last_interval_open=TRUE; /* if TRUE last interval needs to be closed. */
1036       Int4 start_offset, end_offset, filter_start, filter_end;
1037       Int4 left=0, right; /* Used for left/right extent of a region. */
1038       BlastSeqLoc* loc = NULL;
1039 
1040       if (query_info->contexts[context].is_valid == FALSE) {
1041           continue;
1042       }
1043 
1044       start_offset = query_info->contexts[context].query_offset;
1045       end_offset = query_info->contexts[context].query_length
1046           + start_offset - 1;
1047       ASSERT(start_offset <= end_offset);
1048 
1049       /* mask_loc NULL is simply the case that NULL was passed in, which we
1050          take to mean that nothing on query is masked. */
1051       if (mask_loc == NULL || mask_loc->seqloc_array[context] == NULL) {
1052          /* Cache the tail of the list to avoid the overhead of traversing the
1053           * list when appending to it */
1054          tail = BlastSeqLocNew(tail ? &tail : complement_mask,
1055                                start_offset, end_offset);
1056          continue;
1057       }
1058 
1059       if (BlastIsReverseStrand(kIsNucl, context)) {
1060          BlastSeqLocListReverse(&mask_loc->seqloc_array[context]);
1061       }
1062       loc = mask_loc->seqloc_array[context];
1063 
1064       first = TRUE;
1065       for ( ; loc; loc = loc->next) {
1066          SSeqRange* seq_range = loc->ssr;
1067          if (BlastIsReverseStrand(kIsNucl, context)) {
1068             filter_start = end_offset - seq_range->right;
1069             filter_end = end_offset - seq_range->left;
1070          } else {
1071             filter_start = start_offset + seq_range->left;
1072             filter_end = start_offset + seq_range->right;
1073          }
1074          /* The canonical "state" at the top of this
1075             while loop is that both "left" and "right" have
1076             been initialized to their correct values.
1077             The first time this loop is entered in a call to
1078             the function this is not true and the following "if"
1079             statement moves everything to the canonical state. */
1080          if (first) {
1081             last_interval_open = TRUE;
1082             first = FALSE;
1083 
1084             if (filter_start > start_offset) {
1085                /* beginning of sequence not filtered */
1086                left = start_offset;
1087             } else {
1088                /* beginning of sequence filtered */
1089                left = filter_end + 1;
1090                continue;
1091             }
1092          }
1093 
1094          right = filter_start - 1;
1095 
1096          /* Cache the tail of the list to avoid the overhead of traversing the
1097           * list when appending to it */
1098          tail = BlastSeqLocNew((tail ? &tail : complement_mask), left, right);
1099          if (filter_end >= end_offset) {
1100             /* last masked region at end of sequence */
1101             last_interval_open = FALSE;
1102             break;
1103          } else {
1104             left = filter_end + 1;
1105          }
1106       }
1107 
1108       if (last_interval_open) {
1109          /* Need to finish SSeqRange* for last interval. */
1110          right = end_offset;
1111          /* Cache the tail of the list to avoid the overhead of traversing the
1112           * list when appending to it */
1113          tail = BlastSeqLocNew((tail ? &tail : complement_mask), left, right);
1114       }
1115    }
1116    return 0;
1117 }
1118 
1119 
1120 Int2
BlastSetUp_Filter(EBlastProgramType program_number,Uint1 * sequence,Int4 length,Int4 offset,const SBlastFilterOptions * filter_options,BlastSeqLoc ** seqloc_retval,Blast_Message ** blast_message)1121 BlastSetUp_Filter(EBlastProgramType program_number,
1122                   Uint1* sequence,
1123                   Int4 length,
1124                   Int4 offset,
1125                   const SBlastFilterOptions* filter_options,
1126                   BlastSeqLoc** seqloc_retval,
1127                   Blast_Message* *blast_message)
1128 {
1129 	Int2 status=0;		/* return value. */
1130 
1131     ASSERT(filter_options);
1132     ASSERT(seqloc_retval);
1133 
1134     *seqloc_retval = NULL;
1135 
1136     status = SBlastFilterOptionsValidate(program_number, filter_options,
1137                                          blast_message);
1138     if (status)
1139        return status;
1140 
1141 	if (filter_options->segOptions)
1142 	{
1143         SSegOptions* seg_options = filter_options->segOptions;
1144         SegParameters* sparamsp=NULL;
1145 
1146         sparamsp = SegParametersNewAa();
1147         sparamsp->overlaps = TRUE;
1148         if (seg_options->window > 0)
1149             sparamsp->window = seg_options->window;
1150         if (seg_options->locut > 0.0)
1151             sparamsp->locut = seg_options->locut;
1152         if (seg_options->hicut > 0.0)
1153             sparamsp->hicut = seg_options->hicut;
1154 
1155 		status = SeqBufferSeg(sequence, length, offset, sparamsp,
1156                               seqloc_retval);
1157 		SegParametersFree(sparamsp);
1158 		sparamsp = NULL;
1159 	}
1160 
1161 	return status;
1162 }
1163 
1164 void
BlastSeqLocReverse(BlastSeqLoc * masks,Int4 query_length)1165 BlastSeqLocReverse(BlastSeqLoc* masks, Int4 query_length)
1166 {
1167     for(; masks; masks = masks->next) {
1168         masks->ssr->left    = query_length - 1 - masks->ssr->right;
1169         masks->ssr->right   = query_length - 1 - masks->ssr->left;
1170     }
1171 }
1172 
1173 /** Calculates the mask locations one context at a time.
1174  * @param query_blk sequence [in]
1175  * @param query_info information about sequences [in]
1176  * @param context which context is this?  [in]
1177  * @param program_number program (blastn, blastp, etc.) [in]
1178  * @param filter_options instructions for producing mask [in]
1179  * @param filter_out results of filtering operations [out]
1180  * @param blast_message any error or warning messages [out]
1181  * @return zero on success
1182  */
1183 static Int2
s_GetFilteringLocationsForOneContext(BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,Int4 context,EBlastProgramType program_number,const SBlastFilterOptions * filter_options,BlastSeqLoc ** filter_out,Blast_Message ** blast_message)1184 s_GetFilteringLocationsForOneContext(BLAST_SequenceBlk* query_blk,
1185                                      const BlastQueryInfo* query_info,
1186                                      Int4 context,
1187                                      EBlastProgramType program_number,
1188                                      const SBlastFilterOptions* filter_options,
1189                                      BlastSeqLoc* *filter_out,
1190                                      Blast_Message* *blast_message)
1191 {
1192     Int2 status = 0;
1193     Int4 query_length = 0;      /* Length of query described by SeqLocPtr. */
1194     Int4 context_offset;
1195     Uint1 *buffer;              /* holds sequence for plus strand or protein. */
1196 
1197     const Boolean kIsNucl = (program_number == eBlastTypeBlastn ||
1198                              program_number == eBlastTypeMapping);
1199 
1200     context_offset = query_info->contexts[context].query_offset;
1201     buffer = &query_blk->sequence[context_offset];
1202 
1203     if (query_info->contexts[context].is_valid == FALSE) {
1204           return 0;
1205     }
1206 
1207     query_length = query_info->contexts[context].query_length;
1208 
1209     status = BlastSetUp_Filter(program_number,
1210                                buffer,
1211                                query_length,
1212                                0,
1213                                filter_options,
1214                                filter_out,
1215                                blast_message);
1216     if (status)
1217          return status;
1218 
1219     if (BlastIsReverseStrand(kIsNucl, context) == TRUE) {
1220         /* Reverse this as it's on minus strand. */
1221         BlastSeqLocReverse(*filter_out, query_length);
1222     }
1223 
1224     /* Extract the mask locations corresponding to this query
1225        (frame, strand), detach it from other masks.
1226        NB: for translated search the mask locations are expected in
1227        protein coordinates. The nucleotide locations must be converted
1228        to protein coordinates prior to the call to BLAST_MainSetUp.
1229     */
1230     {
1231         /* Auxiliary locations for lower-case masking or any other masking
1232          * which occurred outside of CORE BLAST */
1233         BlastSeqLoc *lcase_mask_slp = NULL;
1234         if (query_blk->lcase_mask && query_blk->lcase_mask->seqloc_array)
1235         {
1236             ASSERT(context < query_blk->lcase_mask->total_size);
1237             lcase_mask_slp = query_blk->lcase_mask->seqloc_array[context];
1238             /* Set location list to NULL, to allow safe memory deallocation,
1239               ownership transferred to filter_out below. */
1240             query_blk->lcase_mask->seqloc_array[context] = NULL;
1241         }
1242 
1243         /* Attach the lower case mask locations to the filter locations and
1244            combine them */
1245         BlastSeqLocAppend(filter_out, lcase_mask_slp);
1246     }
1247 
1248     BlastSeqLocCombine(filter_out, 0);
1249 
1250 	return 0;
1251 }
1252 
1253 Int2
BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,EBlastProgramType program_number,const SBlastFilterOptions * filter_options,BlastMaskLoc ** filter_maskloc,Blast_Message ** blast_message)1254 BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk,
1255                                  const BlastQueryInfo* query_info,
1256                                  EBlastProgramType program_number,
1257                                  const SBlastFilterOptions* filter_options,
1258                                  BlastMaskLoc** filter_maskloc,
1259                                  Blast_Message** blast_message)
1260 {
1261     Int2 status = 0;
1262     Int4 context = 0; /* loop variable. */
1263     const int kNumContexts = query_info->last_context + 1;
1264 
1265     ASSERT(query_info && query_blk && filter_maskloc);
1266 
1267     ASSERT(blast_message);
1268     ASSERT(kNumContexts ==
1269            query_info->num_queries*BLAST_GetNumberOfContexts(program_number));
1270     *filter_maskloc = BlastMaskLocNew(kNumContexts);
1271 
1272     for (context = query_info->first_context;
1273          context <= query_info->last_context; ++context) {
1274 
1275         BlastSeqLoc *filter_per_context = NULL;
1276         status = s_GetFilteringLocationsForOneContext(query_blk,
1277                                                       query_info,
1278                                                       context,
1279                                                       program_number,
1280                                                       filter_options,
1281                                                       &filter_per_context,
1282                                                       blast_message);
1283         if (status) {
1284             Blast_MessageWrite(blast_message, eBlastSevError, context,
1285                                    "Failure at filtering");
1286             return status;
1287         }
1288 
1289     /* NB: for translated searches filter locations are returned in
1290            protein coordinates, because the DNA lengths of sequences are
1291            not available here. The caller must take care of converting
1292            them back to nucleotide coordinates. */
1293          (*filter_maskloc)->seqloc_array[context] = filter_per_context;
1294     }
1295     return 0;
1296 }
1297 
1298 void
Blast_MaskTheResidues(Uint1 * buffer,Int4 length,Boolean is_na,const BlastSeqLoc * mask_loc,Boolean reverse,Int4 offset)1299 Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na,
1300                       const BlastSeqLoc* mask_loc, Boolean reverse, Int4 offset)
1301 {
1302     const Uint1 kMaskingLetter = is_na ? kNuclMask : kProtMask;
1303     ASSERT(buffer);
1304     for (; mask_loc; mask_loc = mask_loc->next) {
1305 
1306         Int4 index, start, stop;
1307 
1308         if (reverse) {
1309             start = length - 1 - mask_loc->ssr->right;
1310             stop = length - 1 - mask_loc->ssr->left;
1311         } else {
1312             start = mask_loc->ssr->left;
1313             stop = mask_loc->ssr->right;
1314         }
1315 
1316         start -= offset;
1317         stop -= offset;
1318 
1319         ASSERT(start < length);
1320         ASSERT(stop <= length);
1321 
1322         for (index = start; index <= stop; index++)
1323             buffer[index] = kMaskingLetter;
1324     }
1325 }
1326 
1327 void
Blast_MaskUnsupportedAA(BLAST_SequenceBlk * seq,Uint1 min_invalid)1328 Blast_MaskUnsupportedAA(BLAST_SequenceBlk* seq, Uint1 min_invalid)
1329 {
1330     Uint1 *sequence = seq->sequence;
1331     Int4 length = seq->length;
1332     Int4 i;
1333 
1334     for (i = 0; i < length; i++) {
1335         if (sequence[i] >= min_invalid) {
1336             sequence[i] = kProtMask;
1337         }
1338     }
1339 }
1340 
1341 void
BlastSetUp_MaskQuery(BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,const BlastMaskLoc * filter_maskloc,EBlastProgramType program_number)1342 BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk,
1343                      const BlastQueryInfo* query_info,
1344                      const BlastMaskLoc *filter_maskloc,
1345                      EBlastProgramType program_number)
1346 {
1347     const Boolean kIsNucl = (program_number == eBlastTypeBlastn
1348                              || program_number == eBlastTypeMapping);
1349     Int4 context; /* loop variable. */
1350     Int4 total_length;
1351     Boolean has_mask = FALSE; /* Check for whether filter_maskloc is empty. */
1352     Int4 index; /* loop variable. */
1353 
1354     ASSERT(query_blk);
1355     ASSERT(query_info);
1356     ASSERT(filter_maskloc);
1357 
1358 
1359     for (index=0; index<filter_maskloc->total_size; index++)
1360     {
1361          if (filter_maskloc->seqloc_array[index])
1362          {
1363             has_mask = TRUE;
1364             break;
1365          }
1366     }
1367     if (has_mask == FALSE)
1368        return;
1369 
1370 
1371     total_length  = query_info->contexts[query_info->last_context].query_offset
1372                   + query_info->contexts[query_info->last_context].query_length + 2;
1373     query_blk->sequence_start_nomask = BlastMemDup(query_blk->sequence_start, total_length);
1374     query_blk->sequence_nomask = query_blk->sequence_start_nomask + 1;
1375     query_blk->nomask_allocated = TRUE;
1376 
1377     for (context = query_info->first_context;
1378          context <= query_info->last_context; ++context) {
1379 
1380         Int4 query_length = 0;
1381         Int4 context_offset = 0;
1382         Uint1 *buffer = NULL;              /* holds sequence */
1383 
1384         if (query_info->contexts[context].is_valid == FALSE) {
1385           continue;
1386         }
1387 
1388         query_length = query_info->contexts[context].query_length;
1389 
1390         context_offset = query_info->contexts[context].query_offset;
1391         buffer = &query_blk->sequence[context_offset];
1392         ASSERT(buffer);
1393 
1394         Blast_MaskTheResidues(buffer, query_length, kIsNucl,
1395                               filter_maskloc->seqloc_array[context],
1396                               BlastIsReverseStrand(kIsNucl, context), 0);
1397     }
1398 }
1399