1 static char const rcsid[] = "$Id: pattern1.c,v 6.22 2006/08/04 21:11:17 papadopo Exp $";
2 
3 /* $Id: pattern1.c,v 6.22 2006/08/04 21:11:17 papadopo Exp $
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 */
28 
29 /*****************************************************************************
30 File name: pattern1.c
31 
32 Original Author: Zheng Zhang
33 
34 Contents: central pattern matching routines for PHI-BLAST and pseed3
35 
36 $Revision: 6.22 $
37 
38 $Log: pattern1.c,v $
39 Revision 6.22  2006/08/04 21:11:17  papadopo
40 refine check for end of line when parsing pattern string (fixes rt#15187012)
41 
42 Revision 6.21  2005/07/28 14:57:10  coulouri
43 remove dead code
44 
45 Revision 6.20  2005/07/22 14:24:22  coulouri
46 Incremented positions printed by the -p patternp option so that the first position is 1
47 
48 Revision 6.19  2004/11/22 17:12:52  madden
49 From Alejandro Schaffer:
50 Changed use of startSeqMatch in procedure pat_output to accurately report word boundaries for matches of a multiword pattern in the output hit file.
51 
52 Revision 6.18  2004/04/01 13:43:08  lavr
53 Spell "occurred", "occurrence", and "occurring"
54 
55 Revision 6.17  2003/08/06 15:11:17  dondosha
56 Fixed search for pattern remainder when it starts from a base non-divisible by 4
57 
58 Revision 6.16  2003/05/30 17:25:37  coulouri
59 add rcsid
60 
61 Revision 6.15  2003/05/13 16:02:53  coulouri
62 make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
63 
64 Revision 6.14  2003/03/06 21:33:13  madden
65 Move big arrays from stack to heap
66 
67 Revision 6.13  2002/08/28 13:37:29  madden
68 Fix memory leaks
69 
70 Revision 6.12  2002/08/09 17:32:30  madden
71 Add warning if MAX_HIT is reached
72 
73 Revision 6.11  1999/09/22 17:51:02  shavirin
74 Now functions will collect messages in ValNodePtr before printing out.
75 
76 
77 *****************************************************************************/
78 
79 
80 #include <ncbi.h>
81 #include <objseq.h>
82 #include <objsset.h>
83 #include <sequtil.h>
84 #include <seqport.h>
85 #include <tofasta.h>
86 #include <blast.h>
87 #include <blastpri.h>
88 #include <txalign.h>
89 #include <simutil.h>
90 #include <gapxdrop.h>
91 #include <posit.h>
92 #include <readdb.h>
93 #include <ncbithr.h>
94 #include <seed.h>
95 
96 #ifdef __cplusplus
97 extern "C" {
98 #endif
99 #define MININT INT4_MIN/2
100 
101 
102 
103 
104 
105 
106 static Int4 expanding PROTO((Int4 *inputPatternMasked, Uint1 *inputPattern,
107 		      Int4 length, Int4 maxLength));
108 static Int4 packing PROTO((Uint1 *inputPattern, Int4 length));
109 
110 static void longpacking PROTO((Int4 numPlaces, Uint1 *inputPattern, patternSearchItems *patternSearch));
111 
112 static void longpacking2 PROTO((Int4 *inputPatternMasked, Int4 numPlacesInPattern, patternSearchItems *patternSearch));
113 
114 static void init_pattern_DNA PROTO((patternSearchItems *patternSearch));
115 
116 /*Initialize occurrence probabilities for each amino acid*/
initProbs(seedSearchItems * seedSearch)117 void LIBCALL initProbs(seedSearchItems * seedSearch)
118 {
119    double totalCount;  /*for Robinson frequencies*/
120    seedSearch->pchars[0] = '-';
121    seedSearch->pchars[1] = 'A';
122    seedSearch->pchars[2] = 'B';
123    seedSearch->pchars[3] = 'C';
124    seedSearch->pchars[4] = 'D';
125    seedSearch->pchars[5] = 'E';
126    seedSearch->pchars[6] = 'F';
127    seedSearch->pchars[7] = 'G';
128    seedSearch->pchars[8] = 'H';
129    seedSearch->pchars[9] = 'I';
130    seedSearch->pchars[10] = 'K';
131    seedSearch->pchars[11] = 'L';
132    seedSearch->pchars[12] = 'M';
133    seedSearch->pchars[13] = 'N';
134    seedSearch->pchars[14] = 'P';
135    seedSearch->pchars[15] = 'Q';
136    seedSearch->pchars[16] = 'R';
137    seedSearch->pchars[17] = 'S';
138    seedSearch->pchars[18] = 'T';
139    seedSearch->pchars[19] = 'V';
140    seedSearch->pchars[20] = 'W';
141    seedSearch->pchars[21] = 'X';
142    seedSearch->pchars[22] = 'Y';
143    seedSearch->pchars[23] = 'Z';
144    seedSearch->pchars[24] = 'U';
145    seedSearch->pchars[25] = '*';
146    totalCount = 78.0 + 19.0 + 54.0 + 63.0 + 39.0 +
147     74.0 + 22.0 + 52.0 + 57.0 + 90.0 + 22.0 + 45.0 + 52.0 +
148      43.0 + 51.0 + 71.0 + 59.0 + 64.0 + 13.0 + 32.0;
149    seedSearch->standardProb[0] = 0.0;
150    seedSearch->standardProb[1] = 78.0/totalCount; /*A*/
151    seedSearch->standardProb[2] = 0.0;
152    seedSearch->standardProb[3] = 19.0/totalCount; /*C*/
153    seedSearch->standardProb[4] = 54.0/totalCount; /*D*/
154    seedSearch->standardProb[5] = 63.0/totalCount; /*E*/
155    seedSearch->standardProb[6] = 39.0/totalCount; /*F*/
156    seedSearch->standardProb[7] = 74.0/totalCount; /*G*/
157    seedSearch->standardProb[8] = 22.0/totalCount; /*H*/
158    seedSearch->standardProb[9] = 52.0/totalCount; /*I*/
159    seedSearch->standardProb[10] = 57.0/totalCount; /*K*/
160    seedSearch->standardProb[11] = 90.0/totalCount; /*L*/
161    seedSearch->standardProb[12] = 22.0/totalCount; /*M*/
162    seedSearch->standardProb[13] = 45.0/totalCount; /*N*/
163    seedSearch->standardProb[14] = 52.0/totalCount; /*P*/
164    seedSearch->standardProb[15] = 43.0/totalCount; /*Q*/
165    seedSearch->standardProb[16] = 51.0/totalCount; /*R*/
166    seedSearch->standardProb[17] = 71.0/totalCount; /*S*/
167    seedSearch->standardProb[18] = 59.0/totalCount; /*T*/
168    seedSearch->standardProb[19] = 64.0/totalCount; /*V*/
169    seedSearch->standardProb[20] = 13.0/totalCount; /*W*/
170    seedSearch->standardProb[21] = 0.0;   /*X*/
171    seedSearch->standardProb[22] = 32.0/totalCount;   /*Y*/
172    seedSearch->standardProb[23] = 0.0;   /*Z*/
173    seedSearch->standardProb[24] = 0.0;   /*U*/
174 }
175 
176 /*pattern is a string describing the pattern to search for;
177   is_dna is a boolean describing the strings are DNA or protein*/
init_pattern(Uint1 * pattern,Boolean is_dna,patternSearchItems * patternSearch,seedSearchItems * seedSearch,ValNodePtr * error_return)178 Int4 LIBCALL init_pattern(Uint1 *pattern, Boolean is_dna, patternSearchItems * patternSearch,
179 	     seedSearchItems *seedSearch, ValNodePtr * error_return)
180 {
181     Int4 i; /*index over string describing the pattern*/
182     Int4 j; /*index for position in pattern*/
183     Int4 charIndex; /*index over characters in alphabet*/
184     Int4 secondIndex; /*second index into pattern*/
185     Int4 numIdentical; /*number of consec. positions with identical specification*/
186     Int4 charSetMask;  /*index over masks for specific characters*/
187     Int4 currentSetMask, prevSetMask ; /*mask for current and previous character positions*/
188     Int4 thisMask;    /*integer representing a bit pattern for a
189                         set of characters*/
190     Int4 minWildcard, maxWildcard; /*used for variable number of wildcard
191                                      positions*/
192     Int4  tj; /*temporary copy of j*/
193     Int4 tempInputPatternMasked[MaxP]; /*local copy of parts
194             of inputPatternMasked*/
195     Uint1 c;  /*character occurring in pattern*/
196     Uint1 localPattern[MaxP]; /*local variable to hold
197                                for each position whether it is
198                                last in pattern (1) or not (0) */
199     Nlm_FloatHi positionProbability; /*probability of a set of characters
200                                     allowed in one position*/
201     Int4 currentWildcardProduct; /*product of wildcard lengths for
202                                    consecutive character positions that
203                                    overlap*/
204 
205     patternSearch->flagPatternLength = ONE_WORD_PATTERN;
206     patternSearch->patternProbability = 1.0;
207     patternSearch->minPatternMatchLength = 0;
208     patternSearch->wildcardProduct = 1;
209     currentWildcardProduct = 1;
210     prevSetMask = 0;
211     currentSetMask = 0;
212 
213     for (i = 0 ; i < MaxP; i++) {
214       patternSearch->inputPatternMasked[i] = 0;
215       localPattern[i] = 0;
216     }
217     for (i = 0, j = 0; i < strlen((Char *) pattern); i++) {
218       c = pattern[i];
219       if (c == '\0' || c == '\r' || c == '\n')
220         break;
221       if (c == '-' || c == '.' || c =='>' || c ==' ' || c == '<')
222 	continue;  /*spacers that mean nothing*/
223       if ( c != '[' && c != '{') { /*not the start of a set of characters*/
224 	if (c == 'x' || c== 'X') {  /*wild-card character matches anything*/
225           /*next line checks to see if wild card is for multiple positions*/
226 	  if (pattern[i+1] == '(') {
227 	    i++;
228 	    secondIndex = i;
229             /*find end of description of how many positions are wildcarded
230                will look like x(2) or x(2,5) */
231 	    while (pattern[secondIndex] != ',' && pattern[secondIndex] != ')')
232 	      secondIndex++;
233 	    if (pattern[secondIndex] == ')') {  /*fixed number of positions wildcarded*/
234 	      i -= 1;
235               /*wildcard, so all characters are allowed*/
236 	      charSetMask=allone;
237 	      positionProbability = 1;
238 	    }
239 	    else { /*variable number of positions wildcarded*/
240 	      sscanf((Char*) &pattern[++i], "%d,%d", &minWildcard, &maxWildcard);
241 	      maxWildcard = maxWildcard - minWildcard;
242               currentWildcardProduct *= (maxWildcard + 1);
243               if (currentWildcardProduct > patternSearch->wildcardProduct)
244 		patternSearch->wildcardProduct = currentWildcardProduct;
245               patternSearch->minPatternMatchLength += minWildcard;
246 	      while (minWildcard-- > 0) {
247 		/*use one position each for the minimum number of
248                   wildcard spaces required */
249 		patternSearch->inputPatternMasked[j++] = allone;
250 		if (j >= MaxP) {
251                   BlastConstructErrorMessage("init_pattern", "pattern too long", 1, error_return);
252 		  /*ErrPostEx(SEV_FATAL, 1, 0, "pattern too long\n");*/
253 		  return(-1);
254 		}
255 	      }
256 	      if (maxWildcard != 0) {
257 		/*negative masking used to indicate variability
258                   in number of wildcard spaces; e.g., if pattern looks
259                   like x(3,5) then variability is 2 and there will
260                   be three wildcard positions with mask allone followed
261                   by a single position with mask -2*/
262 		patternSearch->inputPatternMasked[j++] = -maxWildcard;
263 		patternSearch->patternProbability *= maxWildcard;
264 	      }
265               /*now skip over wildcard description with the i index*/
266 	      while (pattern[++i] != ')') ;
267 	      continue;
268 	    }
269 	  }
270 	  else {  /*wild card is for one position only*/
271 	    charSetMask=allone;
272 	    positionProbability =1;
273 	  }
274 	}
275 	else {
276 	  if (c == 'U') {   /*look for special U character*/
277 	    charSetMask = allone*2+1;
278 	    positionProbability = 1;
279 	  }
280 	  else {
281             /*exactly one character matches*/
282             prevSetMask = currentSetMask;
283             currentSetMask =  charSetMask = (1 << seedSearch->order[c]);
284             if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/
285 	      currentWildcardProduct = 1;
286 	    positionProbability = seedSearch->standardProb[seedSearch->order[c]];
287 	  }
288 	}
289       }
290       else {
291 	if (c == '[') {  /*start of a set of characters allowed*/
292 	  charSetMask = 0;
293 	  positionProbability = 0;
294 	  /*For each character in the set add it to the mask and
295             add its probability to positionProbability*/
296 	  while ((c=pattern[++i]) != ']') { /*end of set*/
297             if ((c < 'A') || (c > 'Z') || (c == '\0')) {
298 	      /* ErrPostEx(SEV_FATAL, 1, 0, "your pattern description has a non-alphabetic character inside a bracket\n"); */
299               BlastConstructErrorMessage("init_pattern", "your pattern description has a non-alphabetic character inside a bracket", 1, error_return);
300               return(-1);
301 	    }
302 	    charSetMask = charSetMask | (1 << seedSearch->order[c]);
303 	    positionProbability += seedSearch->standardProb[seedSearch->order[c]];
304 	  }
305           prevSetMask = currentSetMask;
306           currentSetMask = charSetMask;
307 	  if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/
308 	      currentWildcardProduct = 1;
309  	}
310 	else {   /*start of a set of characters forbidden*/
311 	  /*For each character forbidden remove it to the mask and
312             subtract its probability from positionProbability*/
313 	  charSetMask = allone;
314 	  positionProbability = 1;
315 	  while ((c=pattern[++i]) != '}') { /*end of set*/
316 	    charSetMask = charSetMask -  (charSetMask & (1 << seedSearch->order[c]));
317 	    positionProbability -= seedSearch->standardProb[seedSearch->order[c]];
318 	  }
319           prevSetMask = currentSetMask;
320           currentSetMask = charSetMask;
321 	  if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/
322 	      currentWildcardProduct = 1;
323 	}
324       }
325       /*handle a number of positions that are the same */
326       if (pattern[i+1] == '(') {  /*read opening paren*/
327 	i++;
328 	numIdentical = atoi((Char *) &pattern[++i]);  /*get number of positions*/
329         patternSearch->minPatternMatchLength += numIdentical;
330 	while (pattern[++i] != ')') ;  /*skip over piece in pattern*/
331 	while ((numIdentical--) > 0) {
332 	  /*set up mask for these positions*/
333 	  patternSearch->inputPatternMasked[j++] = charSetMask;
334 	  patternSearch->patternProbability *= positionProbability;
335 	}
336       }
337       else {   /*specification is for one posiion only*/
338 	patternSearch->inputPatternMasked[j++] = charSetMask;
339         patternSearch->minPatternMatchLength++;
340 	patternSearch->patternProbability *= positionProbability;
341       }
342       if (j >= MaxP) {
343 	BlastConstructErrorMessage("init_pattern", "pattern too long", 1, error_return);
344       }
345     }
346     localPattern[j-1] = 1;
347     if (patternSearch->patternProbability > 1.0)
348       patternSearch->patternProbability = 1.0;
349 
350     for (i = 0; i < j; i++) {
351       tempInputPatternMasked[i] = patternSearch->inputPatternMasked[i];
352       tj = j;
353     }
354     j = expanding(patternSearch->inputPatternMasked, localPattern, j, MaxP);
355     if ((j== -1) || ((j > BITS_PACKED_PER_WORD) && is_dna)) {
356       patternSearch->flagPatternLength = PATTERN_TOO_LONG;
357       longpacking2(tempInputPatternMasked, tj, patternSearch);
358       for (i = 0; i < tj; i++)
359 	patternSearch->inputPatternMasked[i] = tempInputPatternMasked[i];
360       patternSearch->highestPlace = tj;
361       if (is_dna)
362 	init_pattern_DNA(patternSearch);
363       return 1;
364     }
365     if (j > BITS_PACKED_PER_WORD) {
366       patternSearch->flagPatternLength = MULTI_WORD_PATTERN;
367       longpacking(j, localPattern, patternSearch);
368       return j;
369     }
370     /*make a bit mask out of local pattern of length j*/
371     patternSearch->match_mask = packing(localPattern, j);
372     /*store for each character a bit mask of which positions
373       that character can occur in*/
374     for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) {
375       thisMask = 0;
376       for (charSetMask = 0; charSetMask < j; charSetMask++) {
377 	if ((1<< charIndex) & patternSearch->inputPatternMasked[charSetMask])
378 	  thisMask |= (1 << charSetMask);
379       }
380       patternSearch->whichPositionsByCharacter[charIndex] = thisMask;
381     }
382     patternSearch->whichPositionPtr = patternSearch->whichPositionsByCharacter;
383     if (is_dna)
384       init_pattern_DNA(patternSearch);
385     return j; /*return number of places for pattern representation*/
386 }
387 
388 /*Looks for 1 bits in the same position of s and mask
389   Let rightOne be the rightmost position where s and mask both have
390   a 1.
391   Let rightMaskOnly < rightOne be the rightmost position where mask has a 1, if any
392      or -1 otherwise
393   returns (rightOne - rightMaskOnly) */
394 
lenof(Int4 s,Int4 mask)395 static Int4 lenof(Int4 s, Int4 mask)
396 {
397     Int4 checkingMatches = s & mask;  /*look for 1 bits in same position*/
398     Int4 rightOne; /*loop index looking for 1 in checkingMatches*/
399     Int4 rightMaskOnly; /*rightnost bit that is 1 in the mask only*/
400     rightMaskOnly = -1;
401     /*AAS Changed upper bound on loop here*/
402     for (rightOne = 0; rightOne < BITS_PACKED_PER_WORD; rightOne++) {
403 	if ((checkingMatches >> rightOne) % 2  == 1)
404 	  return rightOne - rightMaskOnly;
405 	if ((mask >> rightOne) %2  == 1)
406 	  rightMaskOnly = rightOne;
407     }
408     ErrPostEx(SEV_FATAL, 1, 0, "wrong\n");
409     return(-1);
410 }
411 
412 /* routine to find hits of pattern to sequence when sequence is proteins
413    hitArray is an array of matches to pass back
414    seq is the input sequence
415    len1 is the length of the input sequence
416    patternSearch carries variables that keep track of
417       search parameters
418    returns the number of matches*/
find_hitsS(Int4 * hitArray,Uint1Ptr seq,Int4 len1,patternSearchItems * patternSearch)419 static Int4 find_hitsS(Int4 *hitArray, Uint1Ptr seq, Int4 len1,
420 		patternSearchItems *patternSearch)
421 {
422     Int4 i; /*loop index on sequence*/
423     Int4 prefixMatchedBitPattern = 0; /*indicates where pattern aligns
424                  with seq; e.g., if value is 9 = 0101 then
425                  last 3 chars of seq match first 3 positions in pattern
426                  and last 1 char of seq matches 1 position of pattern*/
427     Int4 numMatches = 0; /*number of matches found*/
428     Int4 mask;  /*mask of input pattern positions after which
429                   a match can be declared*/
430     Int4 maskShiftPlus1; /*mask shifted left 1 plus 1 */
431 
432     mask = patternSearch->match_mask;
433     maskShiftPlus1 = (mask << 1) + 1;
434     for (i = 0; i < len1; i++) {
435       /*shift the positions matched by 1 and try to match up against
436         the next character, also allow next character to match the
437         first position*/
438       prefixMatchedBitPattern =
439 	((prefixMatchedBitPattern << 1) | maskShiftPlus1) &
440 	patternSearch->whichPositionPtr[seq[i]];
441       if (prefixMatchedBitPattern & mask) {
442         /*first part of pair is index of place in seq where match
443           ends; second part is where match starts*/
444 	hitArray[numMatches++] = i;
445 	hitArray[numMatches++] = i - lenof(prefixMatchedBitPattern, mask)+1;
446 	if (numMatches == MAX_HIT)
447 	{
448     		ErrPostEx(SEV_WARNING, 0, 0, "%ld matches saved, discarding others", numMatches);
449 		break;
450 	}
451       }
452     }
453     return numMatches;
454 }
455 
456 /*set uo matches for words that encode 4 DNA characters; figure out
457   for each of 256 possible DNA 4-mers, where a prefix matches the pattern
458  and where a suffix matches the pattern; store in prefixPos and
459  suffixPos; mask has 1 bits for whatever lengths of string
460 the pattern can match, mask2 has 4 1 bits corresponding to
461 the last 4 positions of a match; they are used to
462 do the prefixPos and suffixPos claculations with bit arithmetic*/
setting_tt(Int4Ptr S,Int4 mask,Int4 mask2,Uint4Ptr prefixPos,Uint4Ptr suffixPos)463 static void setting_tt(Int4Ptr S, Int4 mask, Int4 mask2, Uint4Ptr prefixPos,
464 		       Uint4Ptr suffixPos)
465 {
466   Int4 i; /*index over possible DNA encoded words, 4 bases per word*/
467   Int4 tmp; /*holds different mask combinations*/
468   Int4 maskLeftPlusOne; /*mask shifted left 1 plus 1; guarantees 1
469                            1 character match effectively */
470   Uint1 a1, a2, a3, a4;  /*four bases packed into an integer*/
471 
472   maskLeftPlusOne = (mask << 1)+1;
473   for (i = 0; i < ASCII_SIZE; i++) {
474     /*find out the 4 bases packed in integer i*/
475     a1 = READDB_UNPACK_BASE_1(i);
476     a2 = READDB_UNPACK_BASE_2(i);
477     a3 = READDB_UNPACK_BASE_3(i);
478     a4 = READDB_UNPACK_BASE_4(i);
479     /*what positions match a prefix of a4 followed by a3*/
480     tmp = ((S[a4]>>1) | mask) & S[a3];
481     /*what positions match a prefix of a4 followed by a3 followed by a2*/
482     tmp = ((tmp >>1) | mask) & S[a2];
483     /*what positions match a prefix of a4, a3, a2,a1*/
484     prefixPos[i] = mask2 & ((tmp >>1) | mask) & S[a1];
485 
486     /*what positions match a suffix of a2, a1*/
487     tmp = ((S[a1]<<1) | maskLeftPlusOne) & S[a2];
488     /* what positions match a suffix of a3, a2, a1*/
489     tmp = ((tmp <<1) | maskLeftPlusOne) & S[a3];
490     /*what positions match a suffix of a4, a3, a2, a1*/
491     suffixPos[i] = ((((tmp <<1) | maskLeftPlusOne) & S[a4]) << 1) | maskLeftPlusOne;
492   }
493 }
494 
495 
496 /*find hits when sequence is DNA and pattern is short
497   returns twice the number of hits
498   pos indicates the starting position
499   len is length of sequence seq
500   hitArray stores the results*/
find_hitsS_DNA(Int4Ptr hitArray,Uint1Ptr seq,Char pos,Int4 len,patternSearchItems * patternSearch)501 static Int4 find_hitsS_DNA(Int4Ptr hitArray, Uint1Ptr seq, Char pos, Int4 len,
502 	       patternSearchItems *patternSearch)
503 {
504   /*Some variables and the algorithm are similar to what is
505     used in find_hits() above; see more detailed comments there*/
506   Uint4 prefixMatchedBitPattern; /*indicates where pattern aligns
507                                   with sequence*/
508   Uint4 mask2; /*mask to match agaist*/
509   Int4 maskShiftPlus1; /*mask2 shifted plus 1*/
510   Uint4 tmp; /*intermediate result of masked comparisons*/
511   Int4 i; /*index on seq*/
512   Int4 end; /*count of number of 4-mer iterations needed*/
513   Int4 remain; /*0,1,2,3 DNA letters left over*/
514   Int4 j; /*index on suffixRemnant*/
515   Int4 twiceNumHits = 0; /*twice the number of hits*/
516 
517   mask2 = patternSearch->match_mask*BITS_PACKED_PER_WORD+15;
518   maskShiftPlus1 = (patternSearch->match_mask << 1)+1;
519 
520   if (pos != 0) {
521     pos = 4 - pos;
522     prefixMatchedBitPattern = ((patternSearch->match_mask * ((1 << (pos+1))-1)*2) +
523 	 (1 << (pos+1))-1)& patternSearch->DNAwhichSuffixPosPtr[seq[0]];
524     seq++;
525     end = (len-pos)/4;
526     remain = (len-pos) % 4;
527   }
528   else {
529     prefixMatchedBitPattern = maskShiftPlus1;
530     end = len/4;
531     remain = len % 4;
532   }
533   for (i = 0; i < end; i++) {
534     if (tmp = (prefixMatchedBitPattern & patternSearch->DNAwhichPrefixPosPtr[seq[i]])) {
535       for (j = 0; j < 4; j++) {
536 	if (tmp & patternSearch->match_mask) {
537 	  hitArray[twiceNumHits++] = i*4+j + pos;
538 	  hitArray[twiceNumHits++] = i*4+j + pos -lenof(tmp & patternSearch->match_mask,
539 					  patternSearch->match_mask)+1;
540 	}
541 	tmp = (tmp << 1);
542       }
543     }
544     prefixMatchedBitPattern = (((prefixMatchedBitPattern << 4) | mask2) & patternSearch->DNAwhichSuffixPosPtr[seq[i]]);
545   }
546   /* In the last byte check bits only up to 'remain' */
547   if (tmp = (prefixMatchedBitPattern & patternSearch->DNAwhichPrefixPosPtr[seq[i]])) {
548      for (j = 0; j < remain; j++) {
549         if (tmp & patternSearch->match_mask) {
550            hitArray[twiceNumHits++] = i*4+j + pos;
551            hitArray[twiceNumHits++] = i*4+j + pos - lenof(tmp & patternSearch->match_mask, patternSearch->match_mask)+1;
552         }
553         tmp = (tmp << 1);
554      }
555   }
556   return twiceNumHits;
557 }
558 
559 /*Top level routine when pattern has a short description
560   hitArray is to return the hits
561   seq is the input sequence
562   start is what position to start at in seq
563   len is the length of seq
564   is_dna is 1 if and only if seq is a DNA sequence
565   the return value from the appropriate lower level routine is passed
566   back*/
find_hitsS_head(Int4Ptr hitArray,Uint1Ptr seq,Int4 start,Int4 len,Uint1 is_dna,patternSearchItems * patternSearch)567 static Int4  find_hitsS_head(Int4Ptr hitArray, Uint1Ptr seq, Int4 start, Int4 len,
568 		      Uint1 is_dna, patternSearchItems *patternSearch)
569 {
570   if (is_dna)
571     return find_hitsS_DNA(hitArray, &seq[start/4], start % 4, len, patternSearch);
572   return find_hitsS(hitArray, &seq[start], len, patternSearch);
573 }
574 
575 /*length is the length of inputPattern, maxLength is a limit on
576    how long inputPattern can get;
577    return the final length of the pattern or -1 if too long*/
expanding(Int4 * inputPatternMasked,Uint1 * inputPattern,Int4 length,Int4 maxLength)578 static Int4 expanding(Int4 *inputPatternMasked, Uint1 *inputPattern,
579 		      Int4 length, Int4 maxLength)
580 {
581     Int4 i, j; /*pattern indices*/
582     Int4 numPos; /*number of positions index*/
583     Int4  k, t; /*loop indices*/
584     Int4 recReturnValue1, recReturnValue2; /*values returned from
585                                              recursive calls*/
586     Int4 thisPlaceMasked; /*value of one place in inputPatternMasked*/
587     Int4 tempPatternMask[MaxP]; /*used as a local representation of
588                                part of inputPatternMasked*/
589     Uint1 tempPattern[MaxP]; /*used as a local representation of part of
590                                inputPattern*/
591 
592     for (i = 0; i < length; i++) {
593       thisPlaceMasked = -inputPatternMasked[i];
594       if (thisPlaceMasked > 0) {  /*represented variable wildcard*/
595 	inputPatternMasked[i] = allone;
596 	for (j = 0; j < length; j++) {
597 	  /*use this to keep track of pattern*/
598 	  tempPatternMask[j] = inputPatternMasked[j];
599 	  tempPattern[j] = inputPattern[j];
600 	}
601 	recReturnValue2 = recReturnValue1 =
602 	  expanding(inputPatternMasked, inputPattern, length, maxLength);
603 	if (recReturnValue1 == -1)
604 	  return -1;
605 	for (numPos = 0; numPos <= thisPlaceMasked; numPos++) {
606 	  if (numPos == 1)
607 	    continue;
608 	  for (k = 0; k < length; k++) {
609 	    if (k == i) {
610 	      for (t = 0; t < numPos; t++) {
611 		inputPatternMasked[recReturnValue1++] = allone;
612                 if (recReturnValue1 >= maxLength)
613                   return(-1);
614 	      }
615 	    }
616 	    else {
617 	      inputPatternMasked[recReturnValue1] = tempPatternMask[k];
618 	      inputPattern[recReturnValue1++] = tempPattern[k];
619               if (recReturnValue1 >= maxLength)
620                   return(-1);
621 	    }
622 	    if (recReturnValue1 >= maxLength)
623 	      return (-1);
624 	  }
625 	  recReturnValue1 =
626 	    expanding(&inputPatternMasked[recReturnValue2],
627 		      &inputPattern[recReturnValue2],
628 		      length + numPos - 1,
629 		      maxLength - recReturnValue2);
630 	  if (recReturnValue1 == -1)
631 	    return -1;
632 	  recReturnValue2 += recReturnValue1;
633 	  recReturnValue1 = recReturnValue2;
634 	}
635 	return recReturnValue1;
636       }
637     }
638     return length;
639 }
640 
641 /*Pack the next length bytes of inputPattern into a bit vector
642   where the bit is 1 if and only if the byte is non-0
643   Returns packed bit vector*/
packing(Uint1 * inputPattern,Int4 length)644 static Int4 packing(Uint1 *inputPattern, Int4 length)
645 {
646     Int4 i; /*loop index*/
647     Int4 returnValue = 0; /*value to return*/
648     for (i = 0; i < length; i++) {
649       if (inputPattern[i])
650 	returnValue += (1 << i);
651     }
652     return returnValue;
653 }
654 
655 
656 /*Pack the bit representation of the inputPattern into
657    the array patternSearch->match_maskL
658    numPlaces is the number of positions in
659    inputPattern
660    Also packs patternSearch->bitPatternByLetter  */
longpacking(Int4 numPlaces,Uint1 * inputPattern,patternSearchItems * patternSearch)661 static void longpacking(Int4 numPlaces, Uint1 *inputPattern, patternSearchItems *patternSearch)
662 {
663     Int4 charIndex; /*index over characters in alphabet*/
664     Int4 bitPattern; /*bit pattern for one word to pack*/
665     Int4 i;  /*loop index over places*/
666     Int4 wordIndex; /*loop counter over words to pack into*/
667 
668     patternSearch->numWords = (numPlaces-1) / BITS_PACKED_PER_WORD +1;
669 
670     for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
671       bitPattern = 0;
672       for (i = 0; i < BITS_PACKED_PER_WORD; i++) {
673 	if (inputPattern[wordIndex*BITS_PACKED_PER_WORD+i])
674 	  bitPattern += (1 << i);
675       }
676       patternSearch->match_maskL[wordIndex] = bitPattern;
677     }
678     for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) {
679       for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
680 	bitPattern = 0;
681 	for (i = 0; i < BITS_PACKED_PER_WORD; i++) {
682 	  if ((1<< charIndex) & patternSearch->inputPatternMasked[wordIndex*BITS_PACKED_PER_WORD + i])
683 	    bitPattern = bitPattern | (1 << i);
684 	}
685 	patternSearch->bitPatternByLetter[charIndex][wordIndex] =
686 	  bitPattern;
687 	}
688     }
689 }
690 
691 
692 /*Let F be the number of the first bit in s that is 1
693   Let G be the first bit in mask that is one such that G < F;
694   Else let G = -1;
695   Returns F - G*/
lenofL(Int4 * s,Int4 * mask,patternSearchItems * patternSearch)696 static Int4 lenofL(Int4 *s, Int4 *mask, patternSearchItems *patternSearch)
697 {
698     Int4 bitIndex; /*loop index over bits in a word*/
699     Int4 wordIndex;  /*loop index over words*/
700     Int4 firstOneInMask;
701 
702     firstOneInMask = -1;
703     for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
704       for (bitIndex = 0; bitIndex < BITS_PACKED_PER_WORD; bitIndex++) {
705 	if ((s[wordIndex] >> bitIndex) % 2  == 1)
706 	  return wordIndex*BITS_PACKED_PER_WORD+bitIndex-firstOneInMask;
707 	if ((mask[wordIndex] >> bitIndex) %2  == 1)
708 	  firstOneInMask = wordIndex*BITS_PACKED_PER_WORD+bitIndex;
709       }
710     }
711     ErrPostEx(SEV_FATAL, 1, 0, "wrong\n");
712     return(-1);
713 }
714 
715 /*Shift each word in the array left by 1 bit and add bit b,
716   if the new values is bigger that OVERFLOW1, then subtract OVERFLOW1 */
lmove(Int4 * a,Uint1 b,patternSearchItems * patternSearch)717 static void lmove(Int4 *a, Uint1 b, patternSearchItems *patternSearch)
718 {
719     Int4 x;
720     Int4 i; /*index on words*/
721     for (i = 0; i < patternSearch->numWords; i++) {
722       x = (a[i] << 1) + b;
723       if (x >= OVERFLOW1) {
724 	a[i] = x - OVERFLOW1;
725 	b = 1;
726       }
727       else {
728 	a[i] = x;
729 	b = 0;
730       }
731     }
732 }
733 
734 /*Do a word-by-word bit-wise or of a and b and put the result back in a*/
or(Int4 * a,Int4 * b,patternSearchItems * patternSearch)735 static void or(Int4 *a, Int4 *b, patternSearchItems *patternSearch)
736 {
737     Int4 i; /*index over words*/
738     for (i = 0; i < patternSearch->numWords; i++)
739 	a[i] = (a[i] | b[i]);
740 }
741 
742 /*Do a word-by-word bit-wise or of a and b and put the result in
743   result; return 1 if there are any non-zero words*/
and(Int4 * result,Int4 * a,Int4 * b,patternSearchItems * patternSearch)744 static and(Int4 *result, Int4 *a, Int4 *b, patternSearchItems *patternSearch)
745 {
746     Int4 i; /*index over words*/
747     Int4 returnValue = 0;
748 
749     for (i = 0; i < patternSearch->numWords; i++)
750       if (result[i] = (a[i] & b[i]))
751 	returnValue = 1;
752     return returnValue;
753 }
754 
755 /* Finds places where pattern matches seq and returns them as
756    pairs of positions in consecutive entries of hitArray;
757    similar to find_hitsS
758    hitArray is array of hits to return
759    seq is the input sequence and len1 is its length
760    patternSearch carries all the pattern variables
761    return twice the number of hits*/
find_hitsL(Int4 * hitArray,Uint1Ptr seq,Int4 len1,patternSearchItems * patternSearch)762 static Int4 find_hitsL(Int4 *hitArray, Uint1Ptr seq, Int4 len1,
763 		       patternSearchItems *patternSearch)
764 {
765     Int4 wordIndex; /*index on words in mask*/
766     Int4 i; /*loop index on seq */
767     Int4  *prefixMatchedBitPattern; /*see similar variable in
768                                       find_hitsS*/
769     Int4 twiceNumHits = 0; /*counter for hitArray*/
770     Int4 *mask; /*local copy of match_maskL version of pattern
771                   indicates after which positions a match can be declared*/
772     Int4 *matchResult; /*Array of words to hold the result of the
773                          final test for a match*/
774 
775     matchResult = (Int4 *) ckalloc(sizeof(Int4)*patternSearch->numWords);
776     mask = (Int4 *) ckalloc(sizeof(Int4)*patternSearch->numWords);
777     prefixMatchedBitPattern = (Int4 *) ckalloc(sizeof(Int4)*patternSearch->numWords);
778     for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
779       mask[wordIndex] = patternSearch->match_maskL[wordIndex];
780       prefixMatchedBitPattern[wordIndex] = 0;
781     }
782     /*This is a multiword version of the algorithm in find_hitsS*/
783     lmove(mask, 1, patternSearch);
784     for (i = 0; i < len1; i++) {
785       lmove(prefixMatchedBitPattern, 0, patternSearch);
786       or(prefixMatchedBitPattern, mask, patternSearch);
787       and(prefixMatchedBitPattern, prefixMatchedBitPattern, patternSearch->bitPatternByLetter[seq[i]], patternSearch);
788       if (and(matchResult, prefixMatchedBitPattern, patternSearch->match_maskL, patternSearch)) {
789 	hitArray[twiceNumHits++] = i;
790 	hitArray[twiceNumHits++] = i-lenofL(matchResult, patternSearch->match_maskL, patternSearch)+1;
791       }
792     }
793     MemFree(prefixMatchedBitPattern);
794     MemFree(matchResult);
795     MemFree(mask);
796     return twiceNumHits;
797 }
798 
799 
800 /*initialize mask and other arrays for DNA patterns*/
init_pattern_DNA(patternSearchItems * patternSearch)801 static void init_pattern_DNA(patternSearchItems *patternSearch)
802 {
803   Int4 mask1; /*mask for one word in a set position*/
804   Int4 compositeMask; /*superimposed mask1 in 4 adjacent positions*/
805   Int4 wordIndex; /*index over words in pattern*/
806 
807   if (patternSearch->flagPatternLength != ONE_WORD_PATTERN) {
808     for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
809       mask1 = patternSearch->match_maskL[wordIndex];
810       compositeMask = mask1 + (mask1>>1)+(mask1>>2)+(mask1>>3);
811       setting_tt(patternSearch->SLL[wordIndex],
812       patternSearch->match_maskL[wordIndex],
813 	 compositeMask, patternSearch->DNAprefixSLL[wordIndex], patternSearch->DNAsuffixSLL[wordIndex]);
814     }
815   }
816   else {
817     compositeMask = patternSearch->match_mask +
818       (patternSearch->match_mask>>1) +
819       (patternSearch->match_mask>>2) + (patternSearch->match_mask>>3);
820     patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAwhichPrefixPositions;
821     patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAwhichSuffixPositions;
822     setting_tt(patternSearch->whichPositionsByCharacter,
823     patternSearch->match_mask, compositeMask,
824     patternSearch->DNAwhichPrefixPositions, patternSearch->DNAwhichSuffixPositions);
825   }
826 }
827 
828 /*Return the number of 1 bits in the base 2 representation of a*/
num_of_one(Int4 a)829 static Int4 num_of_one(Int4 a)
830 {
831   Int4 returnValue;
832   returnValue = 0;
833   while (a > 0) {
834     if (a % 2 == 1)
835       returnValue++;
836     a = (a >> 1);
837   }
838   return returnValue;
839 }
840 
841 /*Sets up field in patternSearch when pattern is very long*/
longpacking2(Int4 * inputPatternMasked,Int4 numPlacesInPattern,patternSearchItems * patternSearch)842 static void longpacking2(Int4 *inputPatternMasked, Int4 numPlacesInPattern, patternSearchItems *patternSearch)
843 {
844     Int4 placeIndex; /*index over places in pattern rep.*/
845     Int4 wordIndex; /*index over words*/
846     Int4 placeInWord, placeInWord2;  /*index for places in a single word*/
847     Int4 charIndex; /*index over characters in alphabet*/
848     Int4 oneWordMask; /*mask of matching characters for one word in
849                         pattern representation*/
850     Nlm_FloatHi patternWordProbability;
851     Nlm_FloatHi  most_specific; /*lowest probability of a word in the pattern*/
852     Int4 *oneWordSLL; /*holds patternSearch->SLL for one word*/
853 
854     most_specific = 1.0;
855     patternSearch->whichMostSpecific = 0;
856     patternWordProbability = 1.0;
857     for (placeIndex = 0, wordIndex = 0, placeInWord=0;
858 	 placeIndex <= numPlacesInPattern; 	 placeIndex++, placeInWord++) {
859       if (placeIndex==numPlacesInPattern || inputPatternMasked[placeIndex] < 0
860 	  || placeInWord == BITS_PACKED_PER_WORD ) {
861 	patternSearch->match_maskL[wordIndex] = 1 << (placeInWord-1);
862 	oneWordSLL = patternSearch->SLL[wordIndex];
863 	for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) {
864 	  oneWordMask = 0;
865 	  for (placeInWord2 = 0; placeInWord2 < placeInWord; placeInWord2++) {
866 	    if ((1<< charIndex) &
867 		inputPatternMasked[placeIndex-placeInWord+placeInWord2])
868 	      oneWordMask |= (1 << placeInWord2);
869 	  }
870 	  oneWordSLL[charIndex] = oneWordMask;
871 	}
872 	patternSearch->numPlacesInWord[wordIndex] = placeInWord;
873 	if (patternWordProbability < most_specific) {
874 	  most_specific = patternWordProbability;
875 	  patternSearch->whichMostSpecific = wordIndex;
876 	}
877 	if (placeIndex == numPlacesInPattern)
878 	  patternSearch->spacing[wordIndex++] = 0;
879 	else
880 	  if (inputPatternMasked[placeIndex] < 0) {
881 	    patternSearch->spacing[wordIndex++] = -inputPatternMasked[placeIndex];
882 	  }
883 	  else {
884 	    placeIndex--;
885 	    patternSearch->spacing[wordIndex++] = 0;
886 	  }
887 	placeInWord = -1;
888 	patternWordProbability = 1.0;
889       }
890       else {
891 	patternWordProbability *= (Nlm_FloatHi)
892 	  num_of_one(inputPatternMasked[placeIndex])/ (Nlm_FloatHi) ALPHABET_SIZE;
893 	}
894     }
895     patternSearch->numWords = wordIndex;
896 }
897 
898 /*find matches when pattern is very long,
899   hitArray is used to pass back pairs of end position. start position for hits
900   seq is the sequence; len is its length
901   is_dna indicates if the sequence is DNA or protein*/
find_hitsLL(Int4 * hitArray,Uint1Ptr seq,Int4 len,Boolean is_dna,patternSearchItems * patternSearch)902 static Int4 find_hitsLL(Int4 *hitArray, Uint1Ptr seq, Int4 len, Boolean is_dna,
903 		 patternSearchItems *patternSearch)
904 {
905     Int4 twiceNumHits; /*twice the number of matches*/
906     Int4 twiceHitsOneCall; /*twice the number of hits in one call to
907                                  find_hitsS */
908     Int4 wordIndex;  /*index over words in pattern*/
909     Int4 start; /*start position in sequence for calls to find_hitsS */
910     Int4 hitArray1[MAX_HIT]; /*used to get hits against different words*/
911     Int4 nextPosInHitArray; /*next available position in hitArray1 */
912     Int4 hitIndex1, hitIndex2;  /*indices over hitArray1*/
913 
914     patternSearch->whichPositionPtr =
915       patternSearch->SLL[patternSearch->whichMostSpecific];
916     patternSearch->match_mask =
917       patternSearch->match_maskL[patternSearch->whichMostSpecific];
918     if (is_dna) {
919       patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAprefixSLL[patternSearch->whichMostSpecific];
920       patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAsuffixSLL[patternSearch->whichMostSpecific];
921     }
922     /*find matches to most specific word of pattern*/
923     twiceNumHits = find_hitsS_head(hitArray, seq, 0, len, is_dna, patternSearch);
924     if (twiceNumHits < 2)
925       return 0;
926     /*extend matches word by word*/
927     for (wordIndex = patternSearch->whichMostSpecific+1;
928 	 wordIndex < patternSearch->numWords; wordIndex++) {
929 	patternSearch->whichPositionPtr =
930 	  patternSearch->SLL[wordIndex];
931 	patternSearch->match_mask = patternSearch->match_maskL[wordIndex];
932 	if (is_dna) {
933 	  patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAprefixSLL[wordIndex];
934 	  patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAsuffixSLL[wordIndex];
935 	}
936 	nextPosInHitArray = 0;
937 	for (hitIndex2 = 0; hitIndex2 < twiceNumHits; hitIndex2 += 2) {
938 	  twiceHitsOneCall = find_hitsS_head(&hitArray1[nextPosInHitArray], seq,
939        hitArray[hitIndex2]+1, MIN(len-hitArray[hitIndex2]-1,
940        patternSearch->spacing[wordIndex-1] + patternSearch->numPlacesInWord[wordIndex]), is_dna, patternSearch);
941 	  for (hitIndex1 = 0; hitIndex1 < twiceHitsOneCall; hitIndex1+= 2) {
942 	    hitArray1[nextPosInHitArray+hitIndex1] =
943 	      hitArray[hitIndex2]+hitArray1[nextPosInHitArray+hitIndex1]+1;
944 	    hitArray1[nextPosInHitArray+hitIndex1+1] = hitArray[hitIndex2+1];
945 	  }
946 	  nextPosInHitArray += twiceHitsOneCall;
947 	}
948 	twiceNumHits = nextPosInHitArray;
949 	if (twiceNumHits < 2)
950 	  return 0;
951         /*copy back matches that extend */
952 	for (hitIndex2 = 0; hitIndex2 < nextPosInHitArray; hitIndex2++)
953 	  hitArray[hitIndex2] = hitArray1[hitIndex2];
954     }
955     /*extend each match back one word at a time*/
956     for (wordIndex = patternSearch->whichMostSpecific-1; wordIndex >=0;
957 	 wordIndex--) {
958       patternSearch->whichPositionPtr =
959 	patternSearch->SLL[wordIndex];
960       patternSearch->match_mask = patternSearch->match_maskL[wordIndex];
961       if (is_dna) {
962 	patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAprefixSLL[wordIndex];
963 	patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAsuffixSLL[wordIndex];
964       }
965       nextPosInHitArray = 0;
966       for (hitIndex2 = 0; hitIndex2 < twiceNumHits; hitIndex2 += 2) {
967 	start = hitArray[hitIndex2+1]-patternSearch->spacing[wordIndex]-patternSearch->numPlacesInWord[wordIndex];
968 	if (start < 0)
969 	  start = 0;
970 	twiceHitsOneCall = find_hitsS_head(&hitArray1[nextPosInHitArray], seq, start,
971 			    hitArray[hitIndex2+1]-start, is_dna, patternSearch);
972 	for (hitIndex1 = 0; hitIndex1 < twiceHitsOneCall; hitIndex1+= 2) {
973 	  hitArray1[nextPosInHitArray+hitIndex1] = hitArray[hitIndex2];
974 	  hitArray1[nextPosInHitArray+hitIndex1+1] = start +
975 	    hitArray1[nextPosInHitArray+hitIndex1+1];
976 	}
977 	nextPosInHitArray += twiceHitsOneCall;
978       }
979       twiceNumHits = nextPosInHitArray;
980       if (twiceNumHits < 2)
981 	return 0;
982       /*copy back matches that extend*/
983       for (hitIndex2 = 0; hitIndex2 < nextPosInHitArray; hitIndex2++)
984 	hitArray[hitIndex2] = hitArray1[hitIndex2];
985     }
986     return twiceNumHits;
987 }
988 
989 /*stores dynamic programming information */
990 typedef struct DP {
991     Int4 CC; /*minimum cost for these coordiantes*/
992     Int4 DD; /*minimum cost when preceded by a deletion (vertical edge)
993                for these coordinates*/
994 } *dp_ptr, dp_node;
995 
996 #define gap(k)  ((k) <= 0 ? 0 : (alignSearch->gapOpen+alignSearch->gapExtend*(k)))	/* k-symbol indel cost */
997 
998 typedef struct {
999   Int4Ptr sapp;			/* Current script append ptr */
1000   Int4  last;	               /*last edit operation*/
1001 } data_dp;
1002 
1003 						/* Append "Delete k" op */
1004 #define DEL_(k)				\
1005 { 					\
1006   if (data->last < 0)				\
1007     data->last = data->sapp[-1] -= (k);		\
1008   else					\
1009     data->last = (*data->sapp++ = -(k));		\
1010 }
1011 						/* Append "Insert k" op */
1012 #define INS_(k)				\
1013 { 					\
1014   if (data->last > 0)				\
1015     data->last = data->sapp[-1] += (k);		\
1016   else					\
1017     data->last = (*data->sapp++ = (k));		\
1018 }
1019 
1020 						/* Append "Replace" op */
1021 #define REP_ 				\
1022 { data->last = (*data->sapp++ = 0); 			\
1023 }
1024 
1025 /* Returns the cost of an optimum conversion
1026    within highDiag and lowDiag between
1027    seq1[1..end1] and seq2[1..end2] and appends such a conversion
1028    to the current script.
1029 Score_only is 1 if only the score is desired, not the conversion script.
1030               0 otherwise
1031    CD and data stores dynamic program intermediate scores
1032 */
align(Uint1 * seq1,Uint1 * seq2,Int4 end1,Int4 end2,Int4 lowDiag,Int4 highDiag,Char score_only,dp_ptr CD,alignSearchItems * alignSearch,data_dp * data)1033 static Int4 align(Uint1 * seq1, Uint1 * seq2, Int4 end1, Int4 end2,
1034   Int4 lowDiag, Int4 highDiag, Char score_only, dp_ptr CD, alignSearchItems * alignSearch, data_dp * data)
1035 {
1036         Int4 nextState; /*stores code for next entry in state*/
1037         Int4 score; /*score to return*/
1038 	Int4 band; /*number of diagonals between highDiag and lowDiag
1039                      inclusive*/
1040         Int4 diagIndex; /*loop index over diagonals*/
1041 	Int4 leftd, rightd;	/* diagonal indices for CC, DD, CP and DP */
1042 	register Int4 curd;	/* current index for CC, DD CP and DP */
1043 	register Int4 i;  /*loop index*/
1044         register Int4 index1; /*index on seq1*/
1045 	register Int4 tempCC; /*placeholder for a CC field*/
1046 	register Int4 tempDD; /*placeholder for a DD field*/
1047 	register Int4 tempHorScore; /*dual of tempDD for the case where a
1048                               horizontal edge (insertion) is the last step*/
1049 	register dp_ptr CDrow; /*points to a row of CD*/
1050 	Int4 stateDecoder; /*used to decode the edge information in a state*/
1051         Int4 initialScore; /*score to initialize dynamic program entries*/
1052         Int4 *matrixRow; /*row of score matrix*/
1053 	Int1 **state; /*stores dynamic program information*/
1054         Int1 *stateRow; /*holds one row of state*/
1055         Int1 *editInstructions; /*holds instruction for string-to-string edit*/
1056 	Int4 index2; /*index on seq2*/
1057         Int4 charCounter; /*counts number of characters involved in
1058                             optimal edit sequence*/
1059         Int4 charIndex; /*index over character positions in optimal
1060                           edit sequence*/
1061         Int4 editStep, nextEditStep; /*steps in string conversion sequence*/
1062 
1063 
1064 	/* Boundary cases: end1 <= 0 , end2 <= 0, or highDiag-lowDiag <= 0 */
1065 	band = highDiag-lowDiag+1;
1066 	state = (Int1 **) ckalloc(sizeof(Int1 *)*(end1+1));
1067 	state[0] = (Int1 *) ckalloc((end1+1)*(band+2));
1068 	for (index1 = 1; index1 <= end1; index1++)
1069 	  state[index1] = state[index1-1]+band+2;
1070 
1071 	/* Initialization */
1072 	leftd = 1-lowDiag;
1073 	rightd = highDiag-lowDiag+1;
1074 
1075 	CD[leftd].CC = 0;
1076 	state[0][leftd] = -1;
1077 	initialScore = -alignSearch->gapOpen;
1078 	for (diagIndex = leftd+1; diagIndex <= rightd; diagIndex++) {
1079 	  CD[diagIndex].CC = initialScore = initialScore-alignSearch->gapExtend;
1080 	  CD[diagIndex-1].DD = initialScore-alignSearch->gapCost;
1081 	  state[0][diagIndex] = DIAGONAL_INSERT;
1082 	}
1083 	CD[rightd+1].CC = MININT;
1084 	CD[rightd].DD = MININT;
1085 	CD[leftd-1].DD = -alignSearch->gapCost;
1086 	CD[leftd-1].CC = MININT;
1087 	for (i = 1; i <= end1; i++) {
1088 	  if (i > end2-highDiag)
1089 	    rightd--;
1090 	  if (leftd > 1)
1091 	    leftd--;
1092 	  matrixRow = alignSearch->matrix[seq1[i]];
1093 	  tempDD = CD[leftd].DD;
1094 	  nextState = 0;
1095 	  if ((index2 = leftd+lowDiag-1+i) > 0)
1096 	    tempCC = CD[leftd].CC+matrixRow[seq2[index2]];
1097 	  if (tempDD > tempCC || index2 <= 0) {
1098 	    tempCC = tempDD;
1099 	    nextState = DIAGONAL_DELETE;
1100 	  }
1101 	  tempHorScore = tempCC-alignSearch->gapCost;
1102 	  if (leftd >= 1)
1103 	    if ((tempDD-= alignSearch->gapExtend) >= tempHorScore) {
1104 	      CD[leftd-1].DD = tempDD;
1105 	      nextState += DELETE_CODE;
1106 	    }
1107 	    else {
1108 	      CD[leftd-1].DD = tempHorScore;
1109 	    }
1110 	  stateRow = &state[i][leftd];
1111 	  *stateRow++ = nextState;
1112 	  CD[leftd].CC = tempCC;
1113 	  for (curd=leftd+1, CDrow = &CD[curd]; curd <= rightd; curd++) {
1114 	    tempCC = CDrow->CC + matrixRow[seq2[curd+lowDiag-1+i]];
1115 	    if ((tempDD=CDrow->DD) > tempCC) {
1116 	      if (tempDD > tempHorScore) {
1117 		CDrow->CC = tempDD;
1118 		tempHorScore -= alignSearch->gapExtend;
1119 		(CDrow++-1)->DD = tempDD-alignSearch->gapExtend;
1120 		*stateRow++=DELETE_CODE + INSERT_CODE + DIAGONAL_DELETE;
1121 	      }
1122 	      else {
1123 		CDrow->CC = tempHorScore;
1124 		tempHorScore -= alignSearch->gapExtend;
1125 		(CDrow++-1)->DD = tempDD-alignSearch->gapExtend;
1126 		*stateRow++=DELETE_CODE + INSERT_CODE + DIAGONAL_INSERT;
1127 	      }
1128 	    }
1129 	    else
1130 	      if (tempHorScore > tempCC) {
1131 		CDrow->CC = tempHorScore;
1132 		tempHorScore -= alignSearch->gapExtend;
1133 		(CDrow++-1)->DD = tempDD-alignSearch->gapExtend;
1134 		*stateRow++=DELETE_CODE + INSERT_CODE + DIAGONAL_INSERT;;
1135 	      }
1136 	      else {
1137 		CDrow->CC = tempCC;
1138 		if ((tempCC -= alignSearch->gapCost) >
1139 		    (tempHorScore-=alignSearch->gapExtend)) {
1140 		  tempHorScore = tempCC;
1141 		  nextState = 0;
1142 		}
1143 		else
1144 		  nextState = INSERT_CODE;
1145 		if (tempCC > (tempDD -= alignSearch->gapExtend)) {
1146 		  *stateRow++= nextState;
1147 		  (CDrow++-1)->DD = tempCC;
1148 		}
1149 		else {
1150 		  *stateRow++ = nextState+DELETE_CODE;
1151 		  (CDrow++-1)->DD = tempDD;
1152 		}
1153 	      }
1154 	  }
1155 	}
1156 
1157 	/* decide which path to be traced back */
1158 	score = (CDrow-1)->CC;
1159 	if (score_only) {
1160 	  MemFree(state[0]);
1161 	  MemFree(state);
1162 	  return score;
1163 	}
1164 	editInstructions = (Int1Ptr) ckalloc(end1+end2);
1165 	for (index1 = end1, diagIndex = rightd, editStep=0, charCounter = 0;
1166 	     index1>=0; index1--, charCounter++) {
1167 	  nextEditStep  = (stateDecoder=state[index1][diagIndex]) % INSERT_CODE;
1168 	  if (stateDecoder == -1)
1169 	    break;
1170 	  if (editStep == DIAGONAL_INSERT
1171                   && ((stateDecoder/INSERT_CODE)%2) == 1)
1172 	    nextEditStep = DIAGONAL_INSERT;
1173 	  if (editStep == DIAGONAL_DELETE && (stateDecoder/DELETE_CODE)== 1)
1174 	    nextEditStep = DIAGONAL_DELETE;
1175 	  if (nextEditStep == DIAGONAL_INSERT) {
1176 	    diagIndex--;
1177 	    index1++;
1178 	  }
1179 	  else
1180 	    if (nextEditStep == DIAGONAL_DELETE)
1181 	      diagIndex++;
1182 	  editInstructions[charCounter] = editStep = nextEditStep;
1183 	}
1184 	for (charIndex = charCounter-1; charIndex >= 0; charIndex--)
1185 	  switch(editInstructions[charIndex]) {
1186 	  case 0:
1187 	    REP_;
1188 	    break;
1189 	  case DIAGONAL_INSERT:
1190 	    INS_(1);
1191 	    break;
1192 	  case DIAGONAL_DELETE:
1193 	    DEL_(1);
1194 	    break;
1195 	  }
1196 	MemFree(editInstructions);
1197 	MemFree(state[0]);
1198 	MemFree(state);
1199 	return(score);
1200 }
1201 
1202 
1203 /*Do a banded, gapped alignment of seq1 and seq2 starting at position start1
1204   of seq1 and position start2 of seq2; the band boundaries are lowBand
1205   and highBand, matrix is the score matrix; gapOpen and gapExtend are
1206   the gap costs; alignScript is used to describe the alignment;
1207   alignScript is NULL if only the alignment score is desired
1208   the alignment score is returned*/
ALIGN_B(Uint1 * seq1,Uint1 * seq2,Int4 start1,Int4 start2,Int4 lowDiag,Int4 highDiag,Int4 ** matrix,Int4 gapOpen,Int4 gapExtend,Int4 * alignScript,data_dp * data)1209 static Int4 ALIGN_B(Uint1 *seq1, Uint1 *seq2,Int4 start1, Int4 start2,
1210  Int4 lowDiag, Int4 highDiag,Int4 **matrix, Int4 gapOpen,
1211  Int4 gapExtend, Int4 * alignScript, data_dp * data)
1212 {
1213 	Int4 score; /*score to return*/
1214         Int4 i; /*index over sequences*/
1215 	Int4 band; /*width of band*/
1216         alignSearchItems *alignSearch; /*holds alignment parameters*/
1217         dp_ptr CD; /*array for dynamic program information*/
1218 
1219         alignSearch = (alignSearchItems *) ckalloc(sizeof(alignSearchItems));
1220         /* Setup alignment parameters */
1221 	alignSearch->matrix = matrix;
1222 	alignSearch->gapOpen = gapOpen;
1223 	alignSearch->gapExtend = gapExtend;
1224 	alignSearch->gapCost = gapOpen+gapExtend;
1225 	lowDiag = MIN(MAX(-start1, lowDiag),MIN(start2-start1,0));
1226 	highDiag = MAX(MIN(start2, highDiag),MAX(start2-start1,0));
1227 
1228 	if (start2 <= 0) {
1229 	  if (start1 > 0)
1230 	    if (alignScript)
1231 	      DEL_(start1);
1232 	  return -gap(start1);
1233 	}
1234 	if (start1 <= 0) {
1235 	  if (alignScript)
1236 	    INS_(start2);
1237 	  return -gap(start2);
1238 	}
1239 	if ((band = highDiag-lowDiag+1) <= 1) {
1240 	  score = 0;
1241 	  for (i = 1; i <= start1; i++) {
1242 	    if (alignScript)
1243 	      REP_;
1244 	    score += alignSearch->matrix[seq1[i]][seq2[i]];
1245 	  }
1246           MemFree(alignSearch);
1247 	  return score;
1248 	}
1249 
1250 	CD = (dp_ptr) ckalloc(sizeof(dp_node)*(band+2));
1251 	if (alignScript) {
1252 	  score = align(seq1,seq2,start1,start2,lowDiag,highDiag,0, CD, alignSearch, data);
1253 	}
1254 	else {
1255 	  score = align(seq1,seq2,start1,start2,lowDiag,highDiag, 1, CD, alignSearch, data);
1256 	}
1257 	MemFree(CD);
1258         MemFree(alignSearch);
1259 	return score;
1260 }
1261 
1262 /*seq is a piece of a sequence, len is the sength of seq
1263  pass back start and end position where first pattern match occurs*/
get_pat(Uint1 * seq,Int4 len,Int4 * start,Int4 * end,patternSearchItems * patternSearch)1264 static void get_pat(Uint1 *seq, Int4 len, Int4 *start, Int4 *end,
1265 	       patternSearchItems *patternSearch)
1266 {
1267     Int4 mask;   /*mask of input pattern positions after which
1268                   a match can be declared*/
1269     Int4  maskShiftPlus1; /*mask shifted left plus 1*/
1270     Int4 prefixMatchedBitPattern = 0; /*indicates where pattern aligns
1271                  with seq; e.g., if value is 9 = 0101 then
1272                  last 3 chars of seq match first 3 positions in pattern
1273                  and last 1 char of seq matches 1 position of pattern*/
1274     Int4 i; /*index over seq */
1275     Int4 rightOne;  /*loop index looking for 1 in both mask and
1276                      prefixMatchedBitPattern*/
1277     Int4 rightMaskOnly; /*rightmost bit that is 1 in the mask only*/
1278 
1279     mask = patternSearch->match_mask;
1280     maskShiftPlus1 = (mask << 1) +1;
1281     for (i = 0, prefixMatchedBitPattern= 0; i < len; i++) {
1282       prefixMatchedBitPattern =
1283 	((prefixMatchedBitPattern << 1) | maskShiftPlus1) &
1284 	patternSearch->whichPositionPtr[seq[i]];
1285     }
1286     /*do the work of lenof here*/
1287     rightMaskOnly = -1;
1288     rightOne = prefixMatchedBitPattern & mask;
1289     for (i = 0; i < BITS_PACKED_PER_WORD; i++) {
1290       if ((rightOne >> i) % 2  == 1)
1291 	break;
1292       if ((mask >> i) %2  == 1)
1293 	rightMaskOnly = i;
1294     }
1295     *start = rightMaskOnly + 1;
1296     *end = i;
1297 }
1298 
1299 /*seq is a piece of a sequence, len is the sength of seq
1300  pass back start and end position where first pattern match occurs
1301  Similar to get_pat, but for patterns longer than a word*/
get_patL(Uint1 * seq,Int4 len,Int4 * start,Int4 * end,patternSearchItems * patternSearch)1302 static void get_patL(Uint1 *seq, Int4 len, Int4 *start, Int4 *end,
1303                  patternSearchItems *patternSearch)
1304 {
1305     Int4 *mask;  /*mask of input pattern positions after which
1306                   a match can be declared*/
1307     Int4 *prefixMatchedBitPattern; /*indicates where pattern aligns with seq*/
1308     Int4 wordIndex; /*index over words in pattern*/
1309     Int4  i;  /*index over seq*/
1310     Int4 rightMaskOnly; /*rightmost bit that is 1 in the mask only*/
1311     Int4 j; /*index over bits in a word*/
1312     Boolean found = FALSE;  /*found match position yet*/
1313 
1314     mask = (Int4 *) ckalloc(sizeof(Int4)*patternSearch->numWords);
1315     prefixMatchedBitPattern = (Int4 *)
1316       ckalloc(sizeof(Int4)*patternSearch->numWords);
1317     for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
1318       mask[wordIndex] = patternSearch->match_maskL[wordIndex];
1319       prefixMatchedBitPattern[wordIndex] = 0;
1320     }
1321     lmove(mask, 1, patternSearch);
1322     for (i = 0; i < len; i++) {
1323       lmove(prefixMatchedBitPattern, 0, patternSearch);
1324       or(prefixMatchedBitPattern, mask, patternSearch);
1325       and(prefixMatchedBitPattern, prefixMatchedBitPattern,
1326 	  patternSearch->bitPatternByLetter[seq[i]], patternSearch);
1327     }
1328     and(prefixMatchedBitPattern, prefixMatchedBitPattern,
1329 	patternSearch->match_maskL, patternSearch);
1330     rightMaskOnly = -1;
1331     for (wordIndex = 0; (wordIndex < patternSearch->numWords) && (!found);
1332 	 wordIndex++) {
1333       for (j = 0; j < BITS_PACKED_PER_WORD && (!found); j++) {
1334 	if ((prefixMatchedBitPattern[wordIndex]>>j) % 2 == 1)
1335 	  found = TRUE;
1336 	else
1337 	  if ((patternSearch->match_maskL[wordIndex] >> j)%2 == 1)
1338 	    rightMaskOnly = wordIndex*BITS_PACKED_PER_WORD+j;
1339       }
1340     }
1341     if (found) {
1342       wordIndex--;
1343       j --;
1344     }
1345     prefixMatchedBitPattern = MemFree(prefixMatchedBitPattern);
1346     mask = MemFree(mask);
1347     *start = rightMaskOnly+1;
1348     *end = wordIndex*BITS_PACKED_PER_WORD+j;
1349 }
1350 
1351 /*Find pattern occurrences in seq when the pattern description is
1352   extra long, report the results back in hitArray
1353   seq is input sequence, len is length of seq
1354   hitArray stores pairs of length/position for matches */
get_patLL(Uint1 * seq,Int4 len,Int4 * hitArray,patternSearchItems * patternSearch)1355 static void get_patLL(Uint1 *seq, Int4 len, Int4 *hitArray,
1356 		      patternSearchItems *patternSearch)
1357 {
1358     Int4 i, j; /*indices on one word hits*/
1359     Int4  wordIndex, wordIndex2; /*indices on words in pattern representation*/
1360     Int4  twiceHitsOneWord; /*Twice the number of hits against one
1361                               word of the pattern*/
1362     Int4  hitIndex; /*index over hits against one word*/
1363     Int4 pos; /*keeps track of how many intermediate hits*/
1364     Int4 placeIndex; /*index over the number of places in the pattern rep*/
1365     Int4 hitArray1[MAX_HIT];
1366     Int4 oneWordHitArray[MAX_WORDS_IN_PATTERN]; /*hits for one word of
1367                               pattern representation*/
1368 
1369     i = 1;
1370     hitArray[0] = patternSearch->numPlacesInWord[0]-1;
1371     for (wordIndex = 1; wordIndex < patternSearch->numWords; wordIndex++) {
1372       patternSearch->whichPositionPtr = patternSearch->SLL[wordIndex];
1373       patternSearch->match_mask = patternSearch->match_maskL[wordIndex];
1374       pos = 0;
1375       for (j = 0; j < i; j += wordIndex) {
1376 	twiceHitsOneWord = find_hitsS(oneWordHitArray,
1377 		       (Uint1Ptr) &seq[hitArray[j]+1],
1378        MIN(len-hitArray[j]-1,
1379  patternSearch->spacing[wordIndex-1]+patternSearch->numPlacesInWord[wordIndex]),
1380 				      patternSearch);
1381 	for (hitIndex = 0; hitIndex < twiceHitsOneWord;
1382 	     hitIndex+= 2, pos+= wordIndex+1) {
1383 	  hitArray1[pos] = hitArray[j]+oneWordHitArray[hitIndex]+1;
1384 	  for (wordIndex2 = 1; wordIndex2 < wordIndex; wordIndex2++)
1385 	    hitArray1[pos+wordIndex2] = hitArray[j+wordIndex2];
1386 	  hitArray1[pos+wordIndex] = oneWordHitArray[hitIndex+1];
1387 	}
1388       }
1389       i = pos;
1390       for (j = 0; j < pos; j++)
1391 	hitArray[j] = hitArray1[j];
1392     }
1393     for (j = 0; j < pos; j+= wordIndex) {
1394       for (placeIndex = patternSearch->numPlacesInWord[0], i=j+1;
1395 	   i < j+wordIndex; i++)
1396 	placeIndex += hitArray[i]+patternSearch->numPlacesInWord[i-j];
1397       if (placeIndex != len)
1398 	continue;
1399       if (j == 0)
1400 	return;
1401       for (i = 0; i < wordIndex; i++)
1402 	hitArray[i] = hitArray[i+j];
1403       return;
1404     }
1405     ErrPostEx(SEV_FATAL, 1, 0, "getLL wrong\n");
1406     exit(1);
1407 }
1408 
1409 /*align querySeq to dbSeq when guaranteed that the pattern occurs
1410   in each
1411   lenQuerySeq and lenDbSeq are their lengths
1412   alignScript is alignment script; NULL if only score is desired
1413   tback will hold adjusted alignment script for return
1414   gap_align keeps track of parameters for a gapped alignment
1415   multiple is related to significance of the pattern match
1416   the overal match score is returned; a variant of the
1417   score is passed back in useful_score*/
1418 
align_of_pattern(Uint1 * querySeq,Uint1 * dbSeq,Int4 lenQuerySeq,Int4 lenDbSeq,Int4 * alignScript,Int4 ** tback,GapAlignBlkPtr gap_align,Int4 * useful_score,Nlm_FloatHi * multiple,patternSearchItems * patternSearch,seedSearchItems * seedSearch)1419 Int4 LIBCALL align_of_pattern(Uint1 *querySeq, Uint1 *dbSeq, Int4 lenQuerySeq,
1420 		 Int4 lenDbSeq, Int4 *alignScript,  Int4 **tback,
1421 		 GapAlignBlkPtr gap_align, Int4 *useful_score,
1422 		 Nlm_FloatHi *multiple, patternSearchItems *patternSearch,
1423                  seedSearchItems * seedSearch)
1424 {
1425     data_dp *data; /*stores dynaic program info*/
1426     Int4  startQueryMatch, endQueryMatch; /*positions delimiting
1427                              where query matches pattern first */
1428     Int4 startDbMatch, endDbMatch; /*positions delimiting where
1429                                      database sequence matches pattern first*/
1430     Int4  local_score, local_useful_score; /*two versions of score for return*/
1431     Int4 queryMatchOffset, dbMatchOffset; /*offset from sequence start where
1432                                             pattern character matches,
1433                                             used as indices*/
1434     Int4 patternPosQuery, patternPosDb; /*positions in pattern
1435                             for matches to query and database sequences*/
1436     Int4 wordIndex; /*index over words*/
1437 
1438     Int4 placeIndex1, placeIndex2; /*indices over places in pattern*/
1439     Int4  *hitArray1=NULL, *hitArray2=NULL;
1440     Int4 numPlacesInWord[MAX_WORDS_IN_PATTERN];
1441     Int4 **matrix;  /*score matrix*/
1442     Int4 gap_open; /*gap opening penalty*/
1443     Int4 gap_extend; /*gap extension penalty*/
1444     Nlm_FloatHi mul = 1.0; /*local copy of what to pass back in multiple*/
1445 
1446     if (alignScript)
1447       data = (data_dp *) ckalloc(1 * sizeof(data_dp));
1448     gap_open = gap_align->gap_open;
1449     gap_extend = gap_align->gap_extend;
1450     matrix = gap_align->matrix;
1451     if (patternSearch->flagPatternLength == ONE_WORD_PATTERN) {
1452       get_pat(querySeq, lenQuerySeq, &startQueryMatch, &endQueryMatch,
1453 	      patternSearch);
1454       get_pat(dbSeq, lenDbSeq, &startDbMatch, &endDbMatch, patternSearch);
1455     }
1456     else
1457       if (patternSearch->flagPatternLength == MULTI_WORD_PATTERN) {
1458 	get_patL(querySeq, lenQuerySeq, &startQueryMatch, &endQueryMatch,
1459                     patternSearch);
1460 	get_patL(dbSeq, lenDbSeq, &startDbMatch, &endDbMatch,
1461                     patternSearch);
1462       }
1463       else {
1464 	hitArray1 = Nlm_Malloc(MAX_HIT*sizeof(Int4));
1465 	hitArray2 = Nlm_Malloc(MAX_HIT*sizeof(Int4));
1466 	if (alignScript) {
1467 	    data->sapp = alignScript;
1468 	    data->last = 0;
1469 	}
1470 	get_patLL(querySeq, lenQuerySeq, hitArray1, patternSearch);
1471 	get_patLL(dbSeq, lenDbSeq, hitArray2, patternSearch);
1472 	mul = 1.0;
1473 	local_useful_score = 0;
1474 	local_score = 0;
1475 	for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++)
1476 	  numPlacesInWord[wordIndex] = patternSearch->numPlacesInWord[wordIndex];
1477 	for (placeIndex1 = 0, wordIndex = 0;
1478 	     placeIndex1 < patternSearch->highestPlace; placeIndex1++) {
1479 	  if (patternSearch->inputPatternMasked[placeIndex1] < 0) {
1480 	    for (placeIndex2 = placeIndex1-1; placeIndex2 >=0; placeIndex2--)
1481 	      if (patternSearch->inputPatternMasked[placeIndex2] != allone)
1482 		break;
1483 	    numPlacesInWord[wordIndex++]-= placeIndex1-placeIndex2-1;
1484 	    hitArray1[wordIndex] += placeIndex1-placeIndex2-1;
1485 	    hitArray2[wordIndex] += placeIndex1-placeIndex2-1;
1486 	  }
1487 	}
1488 	queryMatchOffset = dbMatchOffset = 0;
1489 	for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
1490 	  for (placeIndex1 = 0; placeIndex1 < numPlacesInWord[wordIndex]; placeIndex1++) {
1491 	    if (alignScript)
1492 	      REP_
1493 		else {
1494 		  if (patternSearch->inputPatternMasked[wordIndex] > allone) {
1495 		    local_score +=matrix[*querySeq][*dbSeq];
1496 		    local_useful_score += matrix[*querySeq][*dbSeq];
1497 		    mul *= seedSearch->charMultiple[*querySeq];
1498 		    }
1499 		  else
1500 		    local_score += matrix[*querySeq][*dbSeq];
1501 		}
1502 		querySeq++;
1503 		dbSeq++;
1504 	    }
1505 	    if (wordIndex < patternSearch->numWords-1) {
1506 	      local_score += ALIGN_B(querySeq-1, dbSeq-1, hitArray1[wordIndex+1], hitArray2[wordIndex+1],
1507 			   BAND_LOW, BAND_HIGH, matrix, gap_open, gap_extend, alignScript, data);
1508 	      querySeq += hitArray1[wordIndex+1];
1509 	      dbSeq += hitArray2[wordIndex+1];
1510 	    }
1511 	}
1512 	if (!alignScript) {
1513 	  *useful_score = local_useful_score;
1514 	  *multiple = mul;
1515 	}
1516 	else
1517 	  *tback = data->sapp;
1518 
1519 	hitArray1 = MemFree(hitArray1);
1520 	hitArray2 = MemFree(hitArray2);
1521 
1522 	return local_score;
1523       }
1524 
1525     local_score = local_useful_score = 0;
1526     if (alignScript) {
1527       data->sapp = alignScript;
1528       data->last = 0;
1529     }
1530     for (patternPosQuery = startQueryMatch, patternPosDb = startDbMatch; patternPosQuery <= endQueryMatch || patternPosDb <= endDbMatch; ) {
1531       if (patternSearch->inputPatternMasked[patternPosQuery] != allone && patternSearch->inputPatternMasked[patternPosDb] != allone) {
1532 	if (alignScript)
1533 	  REP_
1534 	else {
1535 	  local_score += matrix[*querySeq][*dbSeq];
1536 	  if (patternSearch->inputPatternMasked[patternPosQuery] > allone) {
1537 	    local_useful_score+= matrix[*querySeq][*dbSeq];
1538 	    mul *= seedSearch->charMultiple[*querySeq];
1539 	  }
1540 	}
1541 	patternPosQuery++;
1542 	patternPosDb++;
1543 	querySeq++;
1544 	dbSeq++;
1545 	}
1546       else {
1547 	for (queryMatchOffset =0; patternSearch->inputPatternMasked[patternPosQuery] == allone
1548   && patternPosQuery <= endQueryMatch; patternPosQuery++, queryMatchOffset++) ;
1549 	for (dbMatchOffset = 0; patternSearch->inputPatternMasked[patternPosDb] == allone && patternPosDb <= endDbMatch; patternPosDb++, dbMatchOffset++) ;
1550 	if (queryMatchOffset == dbMatchOffset) {
1551 	  do {
1552 	    if (alignScript)
1553 	      REP_
1554 	    else
1555 	      local_score+= matrix[*querySeq][*dbSeq];
1556 	    querySeq++;
1557 	    dbSeq++;
1558 	    queryMatchOffset--;
1559 	  } while (queryMatchOffset > 0);
1560 	}
1561 	else {
1562 	  local_score += ALIGN_B(querySeq-1, dbSeq-1, queryMatchOffset, dbMatchOffset,
1563 		       BAND_LOW, BAND_HIGH, matrix, gap_open, gap_extend, alignScript, data);
1564 	  querySeq+=queryMatchOffset;
1565 	  dbSeq+=dbMatchOffset;
1566 	}
1567       }
1568     }
1569     if (!alignScript) {
1570       *multiple = mul;
1571       *useful_score = local_useful_score;
1572     }
1573     else
1574       *tback = data->sapp;
1575     if (alignScript)
1576       MemFree(data);
1577     return local_score;
1578 }
1579 
1580 
1581 /*print output for sequence seq starting at offset begin and
1582 ending at offset end
1583 called once for each match*/
pat_output(Uint1 * seq,Int4 begin,Int4 end,patternSearchItems * patternSearch,ValNodePtr PNTR info_vnp)1584 void LIBCALL pat_output(Uint1 *seq, Int4 begin, Int4 end, patternSearchItems *patternSearch, ValNodePtr PNTR info_vnp)
1585 {
1586     Int4 startSeqMatch, endSeqMatch; /*positions in seq where
1587                                        pattern match starts and ends*/
1588     Int4 position; /*position of match*/
1589     Int4 nextMatchStart; /*increment for start of next match*/
1590     Int4 wordIndex; /*index over words in pattern*/
1591     Int4 i; /*index over seq*/
1592     Int4 placeIndex1, placeIndex2; /*indices over places in pattern*/
1593     Int4 spacingArray[MAX_WORDS_IN_PATTERN]; /*number of characters of
1594               sequence used across each variable-length region in pattern*/
1595     Int4 numPlacesInWord[MAX_WORDS_IN_PATTERN]; /*number of places in each word
1596                                                 of the pattern*/
1597     Char buffer[512];
1598 
1599     ValNodeCopyStr(info_vnp, 0, "HI "); /*Fixed printing command here*/
1600 
1601     if (patternSearch->flagPatternLength == ONE_WORD_PATTERN) {
1602       get_pat(seq+begin, end-begin+1, &startSeqMatch, &endSeqMatch, patternSearch);
1603     }
1604     else
1605       if (patternSearch->flagPatternLength == MULTI_WORD_PATTERN) {
1606 	get_patL(seq+begin, end-begin+1, &startSeqMatch, &endSeqMatch, patternSearch);
1607       }
1608       else {
1609 	get_patLL(seq+begin, end-begin+1, spacingArray, patternSearch);
1610 	for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++)
1611 	  numPlacesInWord[wordIndex] = patternSearch->numPlacesInWord[wordIndex];
1612 	for (placeIndex1 = 0, wordIndex = 0; placeIndex1 < patternSearch->highestPlace; placeIndex1++) {
1613 	  if (patternSearch->inputPatternMasked[placeIndex1] < 0) {
1614 	    for (placeIndex2 = placeIndex1-1; placeIndex2 >=0; placeIndex2--)
1615 	      if (patternSearch->inputPatternMasked[placeIndex2] != allone)
1616 		break;
1617 	    numPlacesInWord[wordIndex++]-= placeIndex1-placeIndex2-1;
1618 	    spacingArray[wordIndex] += placeIndex1-placeIndex2-1;
1619 	  }
1620 	}
1621 	position = begin;
1622 	for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
1623 	  sprintf(buffer, "(%ld %ld)", (long) (1 + position), (long) (1 + position+numPlacesInWord[wordIndex]-1));
1624           ValNodeCopyStr(info_vnp, 0, buffer);
1625 	  position += numPlacesInWord[wordIndex]+spacingArray[wordIndex+1];
1626 	}
1627         ValNodeCopyStr(info_vnp, 0, "\n");
1628 	return;
1629       }
1630     nextMatchStart  = 0;
1631     for (i = startSeqMatch; i <= endSeqMatch; ) {
1632       if (patternSearch->inputPatternMasked[i] != allone) {
1633 	i++;
1634       } else {
1635 	if (0 == nextMatchStart)
1636           sprintf(buffer, "(%ld %ld) ", (long) (1 + begin+nextMatchStart), (long) (1+begin+i-1 - startSeqMatch));
1637 	else
1638           sprintf(buffer, "(%ld %ld) ", (long) (1 +begin+nextMatchStart - startSeqMatch), (long) (1+begin+i-1 - startSeqMatch));
1639           ValNodeCopyStr(info_vnp, 0, buffer);
1640 
1641           for (; patternSearch->inputPatternMasked[i] == allone && i <= endSeqMatch; i++) ;
1642           nextMatchStart = i;
1643       }
1644     }
1645     if (nextMatchStart != i) {  /*last match*/
1646         sprintf(buffer, "(%ld %ld)\n", (long) (1+begin+nextMatchStart-startSeqMatch), (long) (1+begin+i-1 - startSeqMatch));
1647         ValNodeCopyStr(info_vnp, 0, buffer);
1648     } else {
1649         ValNodeCopyStr(info_vnp, 0, "\n");
1650     }
1651 }
1652 
1653 /*find the places where the pattern matches seq;
1654   len is the length of seq
1655   hitArray stores the results as pairs of positions in consecutive
1656   entries
1657   is_dna indicates whether seq is made of DNA or protein letters
1658   twice the number of hits (length of hitArray filled in is returned
1659   3 different methods are used depending on the length of the pattern*/
find_hits(Int4 * hitArray,Uint1Ptr seq,Int4 len,Boolean is_dna,patternSearchItems * patternSearch)1660 Int4 LIBCALL find_hits(Int4 *hitArray, Uint1Ptr seq, Int4 len, Boolean is_dna,
1661 	       patternSearchItems * patternSearch)
1662 {
1663     if (patternSearch->flagPatternLength == ONE_WORD_PATTERN)
1664       return find_hitsS_head(hitArray, seq, 0, len, is_dna, patternSearch);
1665     if (patternSearch->flagPatternLength == MULTI_WORD_PATTERN)
1666       return find_hitsL(hitArray, seq, len, patternSearch);
1667     return find_hitsLL(hitArray, seq, len, is_dna, patternSearch);
1668 }
1669 
1670 #ifdef __cplusplus
1671 }
1672 #endif
1673