1 static char const rcsid[] = "$Id: pattern1.c,v 6.22 2006/08/04 21:11:17 papadopo Exp $";
2
3 /* $Id: pattern1.c,v 6.22 2006/08/04 21:11:17 papadopo Exp $
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 */
28
29 /*****************************************************************************
30 File name: pattern1.c
31
32 Original Author: Zheng Zhang
33
34 Contents: central pattern matching routines for PHI-BLAST and pseed3
35
36 $Revision: 6.22 $
37
38 $Log: pattern1.c,v $
39 Revision 6.22 2006/08/04 21:11:17 papadopo
40 refine check for end of line when parsing pattern string (fixes rt#15187012)
41
42 Revision 6.21 2005/07/28 14:57:10 coulouri
43 remove dead code
44
45 Revision 6.20 2005/07/22 14:24:22 coulouri
46 Incremented positions printed by the -p patternp option so that the first position is 1
47
48 Revision 6.19 2004/11/22 17:12:52 madden
49 From Alejandro Schaffer:
50 Changed use of startSeqMatch in procedure pat_output to accurately report word boundaries for matches of a multiword pattern in the output hit file.
51
52 Revision 6.18 2004/04/01 13:43:08 lavr
53 Spell "occurred", "occurrence", and "occurring"
54
55 Revision 6.17 2003/08/06 15:11:17 dondosha
56 Fixed search for pattern remainder when it starts from a base non-divisible by 4
57
58 Revision 6.16 2003/05/30 17:25:37 coulouri
59 add rcsid
60
61 Revision 6.15 2003/05/13 16:02:53 coulouri
62 make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
63
64 Revision 6.14 2003/03/06 21:33:13 madden
65 Move big arrays from stack to heap
66
67 Revision 6.13 2002/08/28 13:37:29 madden
68 Fix memory leaks
69
70 Revision 6.12 2002/08/09 17:32:30 madden
71 Add warning if MAX_HIT is reached
72
73 Revision 6.11 1999/09/22 17:51:02 shavirin
74 Now functions will collect messages in ValNodePtr before printing out.
75
76
77 *****************************************************************************/
78
79
80 #include <ncbi.h>
81 #include <objseq.h>
82 #include <objsset.h>
83 #include <sequtil.h>
84 #include <seqport.h>
85 #include <tofasta.h>
86 #include <blast.h>
87 #include <blastpri.h>
88 #include <txalign.h>
89 #include <simutil.h>
90 #include <gapxdrop.h>
91 #include <posit.h>
92 #include <readdb.h>
93 #include <ncbithr.h>
94 #include <seed.h>
95
96 #ifdef __cplusplus
97 extern "C" {
98 #endif
99 #define MININT INT4_MIN/2
100
101
102
103
104
105
106 static Int4 expanding PROTO((Int4 *inputPatternMasked, Uint1 *inputPattern,
107 Int4 length, Int4 maxLength));
108 static Int4 packing PROTO((Uint1 *inputPattern, Int4 length));
109
110 static void longpacking PROTO((Int4 numPlaces, Uint1 *inputPattern, patternSearchItems *patternSearch));
111
112 static void longpacking2 PROTO((Int4 *inputPatternMasked, Int4 numPlacesInPattern, patternSearchItems *patternSearch));
113
114 static void init_pattern_DNA PROTO((patternSearchItems *patternSearch));
115
116 /*Initialize occurrence probabilities for each amino acid*/
initProbs(seedSearchItems * seedSearch)117 void LIBCALL initProbs(seedSearchItems * seedSearch)
118 {
119 double totalCount; /*for Robinson frequencies*/
120 seedSearch->pchars[0] = '-';
121 seedSearch->pchars[1] = 'A';
122 seedSearch->pchars[2] = 'B';
123 seedSearch->pchars[3] = 'C';
124 seedSearch->pchars[4] = 'D';
125 seedSearch->pchars[5] = 'E';
126 seedSearch->pchars[6] = 'F';
127 seedSearch->pchars[7] = 'G';
128 seedSearch->pchars[8] = 'H';
129 seedSearch->pchars[9] = 'I';
130 seedSearch->pchars[10] = 'K';
131 seedSearch->pchars[11] = 'L';
132 seedSearch->pchars[12] = 'M';
133 seedSearch->pchars[13] = 'N';
134 seedSearch->pchars[14] = 'P';
135 seedSearch->pchars[15] = 'Q';
136 seedSearch->pchars[16] = 'R';
137 seedSearch->pchars[17] = 'S';
138 seedSearch->pchars[18] = 'T';
139 seedSearch->pchars[19] = 'V';
140 seedSearch->pchars[20] = 'W';
141 seedSearch->pchars[21] = 'X';
142 seedSearch->pchars[22] = 'Y';
143 seedSearch->pchars[23] = 'Z';
144 seedSearch->pchars[24] = 'U';
145 seedSearch->pchars[25] = '*';
146 totalCount = 78.0 + 19.0 + 54.0 + 63.0 + 39.0 +
147 74.0 + 22.0 + 52.0 + 57.0 + 90.0 + 22.0 + 45.0 + 52.0 +
148 43.0 + 51.0 + 71.0 + 59.0 + 64.0 + 13.0 + 32.0;
149 seedSearch->standardProb[0] = 0.0;
150 seedSearch->standardProb[1] = 78.0/totalCount; /*A*/
151 seedSearch->standardProb[2] = 0.0;
152 seedSearch->standardProb[3] = 19.0/totalCount; /*C*/
153 seedSearch->standardProb[4] = 54.0/totalCount; /*D*/
154 seedSearch->standardProb[5] = 63.0/totalCount; /*E*/
155 seedSearch->standardProb[6] = 39.0/totalCount; /*F*/
156 seedSearch->standardProb[7] = 74.0/totalCount; /*G*/
157 seedSearch->standardProb[8] = 22.0/totalCount; /*H*/
158 seedSearch->standardProb[9] = 52.0/totalCount; /*I*/
159 seedSearch->standardProb[10] = 57.0/totalCount; /*K*/
160 seedSearch->standardProb[11] = 90.0/totalCount; /*L*/
161 seedSearch->standardProb[12] = 22.0/totalCount; /*M*/
162 seedSearch->standardProb[13] = 45.0/totalCount; /*N*/
163 seedSearch->standardProb[14] = 52.0/totalCount; /*P*/
164 seedSearch->standardProb[15] = 43.0/totalCount; /*Q*/
165 seedSearch->standardProb[16] = 51.0/totalCount; /*R*/
166 seedSearch->standardProb[17] = 71.0/totalCount; /*S*/
167 seedSearch->standardProb[18] = 59.0/totalCount; /*T*/
168 seedSearch->standardProb[19] = 64.0/totalCount; /*V*/
169 seedSearch->standardProb[20] = 13.0/totalCount; /*W*/
170 seedSearch->standardProb[21] = 0.0; /*X*/
171 seedSearch->standardProb[22] = 32.0/totalCount; /*Y*/
172 seedSearch->standardProb[23] = 0.0; /*Z*/
173 seedSearch->standardProb[24] = 0.0; /*U*/
174 }
175
176 /*pattern is a string describing the pattern to search for;
177 is_dna is a boolean describing the strings are DNA or protein*/
init_pattern(Uint1 * pattern,Boolean is_dna,patternSearchItems * patternSearch,seedSearchItems * seedSearch,ValNodePtr * error_return)178 Int4 LIBCALL init_pattern(Uint1 *pattern, Boolean is_dna, patternSearchItems * patternSearch,
179 seedSearchItems *seedSearch, ValNodePtr * error_return)
180 {
181 Int4 i; /*index over string describing the pattern*/
182 Int4 j; /*index for position in pattern*/
183 Int4 charIndex; /*index over characters in alphabet*/
184 Int4 secondIndex; /*second index into pattern*/
185 Int4 numIdentical; /*number of consec. positions with identical specification*/
186 Int4 charSetMask; /*index over masks for specific characters*/
187 Int4 currentSetMask, prevSetMask ; /*mask for current and previous character positions*/
188 Int4 thisMask; /*integer representing a bit pattern for a
189 set of characters*/
190 Int4 minWildcard, maxWildcard; /*used for variable number of wildcard
191 positions*/
192 Int4 tj; /*temporary copy of j*/
193 Int4 tempInputPatternMasked[MaxP]; /*local copy of parts
194 of inputPatternMasked*/
195 Uint1 c; /*character occurring in pattern*/
196 Uint1 localPattern[MaxP]; /*local variable to hold
197 for each position whether it is
198 last in pattern (1) or not (0) */
199 Nlm_FloatHi positionProbability; /*probability of a set of characters
200 allowed in one position*/
201 Int4 currentWildcardProduct; /*product of wildcard lengths for
202 consecutive character positions that
203 overlap*/
204
205 patternSearch->flagPatternLength = ONE_WORD_PATTERN;
206 patternSearch->patternProbability = 1.0;
207 patternSearch->minPatternMatchLength = 0;
208 patternSearch->wildcardProduct = 1;
209 currentWildcardProduct = 1;
210 prevSetMask = 0;
211 currentSetMask = 0;
212
213 for (i = 0 ; i < MaxP; i++) {
214 patternSearch->inputPatternMasked[i] = 0;
215 localPattern[i] = 0;
216 }
217 for (i = 0, j = 0; i < strlen((Char *) pattern); i++) {
218 c = pattern[i];
219 if (c == '\0' || c == '\r' || c == '\n')
220 break;
221 if (c == '-' || c == '.' || c =='>' || c ==' ' || c == '<')
222 continue; /*spacers that mean nothing*/
223 if ( c != '[' && c != '{') { /*not the start of a set of characters*/
224 if (c == 'x' || c== 'X') { /*wild-card character matches anything*/
225 /*next line checks to see if wild card is for multiple positions*/
226 if (pattern[i+1] == '(') {
227 i++;
228 secondIndex = i;
229 /*find end of description of how many positions are wildcarded
230 will look like x(2) or x(2,5) */
231 while (pattern[secondIndex] != ',' && pattern[secondIndex] != ')')
232 secondIndex++;
233 if (pattern[secondIndex] == ')') { /*fixed number of positions wildcarded*/
234 i -= 1;
235 /*wildcard, so all characters are allowed*/
236 charSetMask=allone;
237 positionProbability = 1;
238 }
239 else { /*variable number of positions wildcarded*/
240 sscanf((Char*) &pattern[++i], "%d,%d", &minWildcard, &maxWildcard);
241 maxWildcard = maxWildcard - minWildcard;
242 currentWildcardProduct *= (maxWildcard + 1);
243 if (currentWildcardProduct > patternSearch->wildcardProduct)
244 patternSearch->wildcardProduct = currentWildcardProduct;
245 patternSearch->minPatternMatchLength += minWildcard;
246 while (minWildcard-- > 0) {
247 /*use one position each for the minimum number of
248 wildcard spaces required */
249 patternSearch->inputPatternMasked[j++] = allone;
250 if (j >= MaxP) {
251 BlastConstructErrorMessage("init_pattern", "pattern too long", 1, error_return);
252 /*ErrPostEx(SEV_FATAL, 1, 0, "pattern too long\n");*/
253 return(-1);
254 }
255 }
256 if (maxWildcard != 0) {
257 /*negative masking used to indicate variability
258 in number of wildcard spaces; e.g., if pattern looks
259 like x(3,5) then variability is 2 and there will
260 be three wildcard positions with mask allone followed
261 by a single position with mask -2*/
262 patternSearch->inputPatternMasked[j++] = -maxWildcard;
263 patternSearch->patternProbability *= maxWildcard;
264 }
265 /*now skip over wildcard description with the i index*/
266 while (pattern[++i] != ')') ;
267 continue;
268 }
269 }
270 else { /*wild card is for one position only*/
271 charSetMask=allone;
272 positionProbability =1;
273 }
274 }
275 else {
276 if (c == 'U') { /*look for special U character*/
277 charSetMask = allone*2+1;
278 positionProbability = 1;
279 }
280 else {
281 /*exactly one character matches*/
282 prevSetMask = currentSetMask;
283 currentSetMask = charSetMask = (1 << seedSearch->order[c]);
284 if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/
285 currentWildcardProduct = 1;
286 positionProbability = seedSearch->standardProb[seedSearch->order[c]];
287 }
288 }
289 }
290 else {
291 if (c == '[') { /*start of a set of characters allowed*/
292 charSetMask = 0;
293 positionProbability = 0;
294 /*For each character in the set add it to the mask and
295 add its probability to positionProbability*/
296 while ((c=pattern[++i]) != ']') { /*end of set*/
297 if ((c < 'A') || (c > 'Z') || (c == '\0')) {
298 /* ErrPostEx(SEV_FATAL, 1, 0, "your pattern description has a non-alphabetic character inside a bracket\n"); */
299 BlastConstructErrorMessage("init_pattern", "your pattern description has a non-alphabetic character inside a bracket", 1, error_return);
300 return(-1);
301 }
302 charSetMask = charSetMask | (1 << seedSearch->order[c]);
303 positionProbability += seedSearch->standardProb[seedSearch->order[c]];
304 }
305 prevSetMask = currentSetMask;
306 currentSetMask = charSetMask;
307 if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/
308 currentWildcardProduct = 1;
309 }
310 else { /*start of a set of characters forbidden*/
311 /*For each character forbidden remove it to the mask and
312 subtract its probability from positionProbability*/
313 charSetMask = allone;
314 positionProbability = 1;
315 while ((c=pattern[++i]) != '}') { /*end of set*/
316 charSetMask = charSetMask - (charSetMask & (1 << seedSearch->order[c]));
317 positionProbability -= seedSearch->standardProb[seedSearch->order[c]];
318 }
319 prevSetMask = currentSetMask;
320 currentSetMask = charSetMask;
321 if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/
322 currentWildcardProduct = 1;
323 }
324 }
325 /*handle a number of positions that are the same */
326 if (pattern[i+1] == '(') { /*read opening paren*/
327 i++;
328 numIdentical = atoi((Char *) &pattern[++i]); /*get number of positions*/
329 patternSearch->minPatternMatchLength += numIdentical;
330 while (pattern[++i] != ')') ; /*skip over piece in pattern*/
331 while ((numIdentical--) > 0) {
332 /*set up mask for these positions*/
333 patternSearch->inputPatternMasked[j++] = charSetMask;
334 patternSearch->patternProbability *= positionProbability;
335 }
336 }
337 else { /*specification is for one posiion only*/
338 patternSearch->inputPatternMasked[j++] = charSetMask;
339 patternSearch->minPatternMatchLength++;
340 patternSearch->patternProbability *= positionProbability;
341 }
342 if (j >= MaxP) {
343 BlastConstructErrorMessage("init_pattern", "pattern too long", 1, error_return);
344 }
345 }
346 localPattern[j-1] = 1;
347 if (patternSearch->patternProbability > 1.0)
348 patternSearch->patternProbability = 1.0;
349
350 for (i = 0; i < j; i++) {
351 tempInputPatternMasked[i] = patternSearch->inputPatternMasked[i];
352 tj = j;
353 }
354 j = expanding(patternSearch->inputPatternMasked, localPattern, j, MaxP);
355 if ((j== -1) || ((j > BITS_PACKED_PER_WORD) && is_dna)) {
356 patternSearch->flagPatternLength = PATTERN_TOO_LONG;
357 longpacking2(tempInputPatternMasked, tj, patternSearch);
358 for (i = 0; i < tj; i++)
359 patternSearch->inputPatternMasked[i] = tempInputPatternMasked[i];
360 patternSearch->highestPlace = tj;
361 if (is_dna)
362 init_pattern_DNA(patternSearch);
363 return 1;
364 }
365 if (j > BITS_PACKED_PER_WORD) {
366 patternSearch->flagPatternLength = MULTI_WORD_PATTERN;
367 longpacking(j, localPattern, patternSearch);
368 return j;
369 }
370 /*make a bit mask out of local pattern of length j*/
371 patternSearch->match_mask = packing(localPattern, j);
372 /*store for each character a bit mask of which positions
373 that character can occur in*/
374 for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) {
375 thisMask = 0;
376 for (charSetMask = 0; charSetMask < j; charSetMask++) {
377 if ((1<< charIndex) & patternSearch->inputPatternMasked[charSetMask])
378 thisMask |= (1 << charSetMask);
379 }
380 patternSearch->whichPositionsByCharacter[charIndex] = thisMask;
381 }
382 patternSearch->whichPositionPtr = patternSearch->whichPositionsByCharacter;
383 if (is_dna)
384 init_pattern_DNA(patternSearch);
385 return j; /*return number of places for pattern representation*/
386 }
387
388 /*Looks for 1 bits in the same position of s and mask
389 Let rightOne be the rightmost position where s and mask both have
390 a 1.
391 Let rightMaskOnly < rightOne be the rightmost position where mask has a 1, if any
392 or -1 otherwise
393 returns (rightOne - rightMaskOnly) */
394
lenof(Int4 s,Int4 mask)395 static Int4 lenof(Int4 s, Int4 mask)
396 {
397 Int4 checkingMatches = s & mask; /*look for 1 bits in same position*/
398 Int4 rightOne; /*loop index looking for 1 in checkingMatches*/
399 Int4 rightMaskOnly; /*rightnost bit that is 1 in the mask only*/
400 rightMaskOnly = -1;
401 /*AAS Changed upper bound on loop here*/
402 for (rightOne = 0; rightOne < BITS_PACKED_PER_WORD; rightOne++) {
403 if ((checkingMatches >> rightOne) % 2 == 1)
404 return rightOne - rightMaskOnly;
405 if ((mask >> rightOne) %2 == 1)
406 rightMaskOnly = rightOne;
407 }
408 ErrPostEx(SEV_FATAL, 1, 0, "wrong\n");
409 return(-1);
410 }
411
412 /* routine to find hits of pattern to sequence when sequence is proteins
413 hitArray is an array of matches to pass back
414 seq is the input sequence
415 len1 is the length of the input sequence
416 patternSearch carries variables that keep track of
417 search parameters
418 returns the number of matches*/
find_hitsS(Int4 * hitArray,Uint1Ptr seq,Int4 len1,patternSearchItems * patternSearch)419 static Int4 find_hitsS(Int4 *hitArray, Uint1Ptr seq, Int4 len1,
420 patternSearchItems *patternSearch)
421 {
422 Int4 i; /*loop index on sequence*/
423 Int4 prefixMatchedBitPattern = 0; /*indicates where pattern aligns
424 with seq; e.g., if value is 9 = 0101 then
425 last 3 chars of seq match first 3 positions in pattern
426 and last 1 char of seq matches 1 position of pattern*/
427 Int4 numMatches = 0; /*number of matches found*/
428 Int4 mask; /*mask of input pattern positions after which
429 a match can be declared*/
430 Int4 maskShiftPlus1; /*mask shifted left 1 plus 1 */
431
432 mask = patternSearch->match_mask;
433 maskShiftPlus1 = (mask << 1) + 1;
434 for (i = 0; i < len1; i++) {
435 /*shift the positions matched by 1 and try to match up against
436 the next character, also allow next character to match the
437 first position*/
438 prefixMatchedBitPattern =
439 ((prefixMatchedBitPattern << 1) | maskShiftPlus1) &
440 patternSearch->whichPositionPtr[seq[i]];
441 if (prefixMatchedBitPattern & mask) {
442 /*first part of pair is index of place in seq where match
443 ends; second part is where match starts*/
444 hitArray[numMatches++] = i;
445 hitArray[numMatches++] = i - lenof(prefixMatchedBitPattern, mask)+1;
446 if (numMatches == MAX_HIT)
447 {
448 ErrPostEx(SEV_WARNING, 0, 0, "%ld matches saved, discarding others", numMatches);
449 break;
450 }
451 }
452 }
453 return numMatches;
454 }
455
456 /*set uo matches for words that encode 4 DNA characters; figure out
457 for each of 256 possible DNA 4-mers, where a prefix matches the pattern
458 and where a suffix matches the pattern; store in prefixPos and
459 suffixPos; mask has 1 bits for whatever lengths of string
460 the pattern can match, mask2 has 4 1 bits corresponding to
461 the last 4 positions of a match; they are used to
462 do the prefixPos and suffixPos claculations with bit arithmetic*/
setting_tt(Int4Ptr S,Int4 mask,Int4 mask2,Uint4Ptr prefixPos,Uint4Ptr suffixPos)463 static void setting_tt(Int4Ptr S, Int4 mask, Int4 mask2, Uint4Ptr prefixPos,
464 Uint4Ptr suffixPos)
465 {
466 Int4 i; /*index over possible DNA encoded words, 4 bases per word*/
467 Int4 tmp; /*holds different mask combinations*/
468 Int4 maskLeftPlusOne; /*mask shifted left 1 plus 1; guarantees 1
469 1 character match effectively */
470 Uint1 a1, a2, a3, a4; /*four bases packed into an integer*/
471
472 maskLeftPlusOne = (mask << 1)+1;
473 for (i = 0; i < ASCII_SIZE; i++) {
474 /*find out the 4 bases packed in integer i*/
475 a1 = READDB_UNPACK_BASE_1(i);
476 a2 = READDB_UNPACK_BASE_2(i);
477 a3 = READDB_UNPACK_BASE_3(i);
478 a4 = READDB_UNPACK_BASE_4(i);
479 /*what positions match a prefix of a4 followed by a3*/
480 tmp = ((S[a4]>>1) | mask) & S[a3];
481 /*what positions match a prefix of a4 followed by a3 followed by a2*/
482 tmp = ((tmp >>1) | mask) & S[a2];
483 /*what positions match a prefix of a4, a3, a2,a1*/
484 prefixPos[i] = mask2 & ((tmp >>1) | mask) & S[a1];
485
486 /*what positions match a suffix of a2, a1*/
487 tmp = ((S[a1]<<1) | maskLeftPlusOne) & S[a2];
488 /* what positions match a suffix of a3, a2, a1*/
489 tmp = ((tmp <<1) | maskLeftPlusOne) & S[a3];
490 /*what positions match a suffix of a4, a3, a2, a1*/
491 suffixPos[i] = ((((tmp <<1) | maskLeftPlusOne) & S[a4]) << 1) | maskLeftPlusOne;
492 }
493 }
494
495
496 /*find hits when sequence is DNA and pattern is short
497 returns twice the number of hits
498 pos indicates the starting position
499 len is length of sequence seq
500 hitArray stores the results*/
find_hitsS_DNA(Int4Ptr hitArray,Uint1Ptr seq,Char pos,Int4 len,patternSearchItems * patternSearch)501 static Int4 find_hitsS_DNA(Int4Ptr hitArray, Uint1Ptr seq, Char pos, Int4 len,
502 patternSearchItems *patternSearch)
503 {
504 /*Some variables and the algorithm are similar to what is
505 used in find_hits() above; see more detailed comments there*/
506 Uint4 prefixMatchedBitPattern; /*indicates where pattern aligns
507 with sequence*/
508 Uint4 mask2; /*mask to match agaist*/
509 Int4 maskShiftPlus1; /*mask2 shifted plus 1*/
510 Uint4 tmp; /*intermediate result of masked comparisons*/
511 Int4 i; /*index on seq*/
512 Int4 end; /*count of number of 4-mer iterations needed*/
513 Int4 remain; /*0,1,2,3 DNA letters left over*/
514 Int4 j; /*index on suffixRemnant*/
515 Int4 twiceNumHits = 0; /*twice the number of hits*/
516
517 mask2 = patternSearch->match_mask*BITS_PACKED_PER_WORD+15;
518 maskShiftPlus1 = (patternSearch->match_mask << 1)+1;
519
520 if (pos != 0) {
521 pos = 4 - pos;
522 prefixMatchedBitPattern = ((patternSearch->match_mask * ((1 << (pos+1))-1)*2) +
523 (1 << (pos+1))-1)& patternSearch->DNAwhichSuffixPosPtr[seq[0]];
524 seq++;
525 end = (len-pos)/4;
526 remain = (len-pos) % 4;
527 }
528 else {
529 prefixMatchedBitPattern = maskShiftPlus1;
530 end = len/4;
531 remain = len % 4;
532 }
533 for (i = 0; i < end; i++) {
534 if (tmp = (prefixMatchedBitPattern & patternSearch->DNAwhichPrefixPosPtr[seq[i]])) {
535 for (j = 0; j < 4; j++) {
536 if (tmp & patternSearch->match_mask) {
537 hitArray[twiceNumHits++] = i*4+j + pos;
538 hitArray[twiceNumHits++] = i*4+j + pos -lenof(tmp & patternSearch->match_mask,
539 patternSearch->match_mask)+1;
540 }
541 tmp = (tmp << 1);
542 }
543 }
544 prefixMatchedBitPattern = (((prefixMatchedBitPattern << 4) | mask2) & patternSearch->DNAwhichSuffixPosPtr[seq[i]]);
545 }
546 /* In the last byte check bits only up to 'remain' */
547 if (tmp = (prefixMatchedBitPattern & patternSearch->DNAwhichPrefixPosPtr[seq[i]])) {
548 for (j = 0; j < remain; j++) {
549 if (tmp & patternSearch->match_mask) {
550 hitArray[twiceNumHits++] = i*4+j + pos;
551 hitArray[twiceNumHits++] = i*4+j + pos - lenof(tmp & patternSearch->match_mask, patternSearch->match_mask)+1;
552 }
553 tmp = (tmp << 1);
554 }
555 }
556 return twiceNumHits;
557 }
558
559 /*Top level routine when pattern has a short description
560 hitArray is to return the hits
561 seq is the input sequence
562 start is what position to start at in seq
563 len is the length of seq
564 is_dna is 1 if and only if seq is a DNA sequence
565 the return value from the appropriate lower level routine is passed
566 back*/
find_hitsS_head(Int4Ptr hitArray,Uint1Ptr seq,Int4 start,Int4 len,Uint1 is_dna,patternSearchItems * patternSearch)567 static Int4 find_hitsS_head(Int4Ptr hitArray, Uint1Ptr seq, Int4 start, Int4 len,
568 Uint1 is_dna, patternSearchItems *patternSearch)
569 {
570 if (is_dna)
571 return find_hitsS_DNA(hitArray, &seq[start/4], start % 4, len, patternSearch);
572 return find_hitsS(hitArray, &seq[start], len, patternSearch);
573 }
574
575 /*length is the length of inputPattern, maxLength is a limit on
576 how long inputPattern can get;
577 return the final length of the pattern or -1 if too long*/
expanding(Int4 * inputPatternMasked,Uint1 * inputPattern,Int4 length,Int4 maxLength)578 static Int4 expanding(Int4 *inputPatternMasked, Uint1 *inputPattern,
579 Int4 length, Int4 maxLength)
580 {
581 Int4 i, j; /*pattern indices*/
582 Int4 numPos; /*number of positions index*/
583 Int4 k, t; /*loop indices*/
584 Int4 recReturnValue1, recReturnValue2; /*values returned from
585 recursive calls*/
586 Int4 thisPlaceMasked; /*value of one place in inputPatternMasked*/
587 Int4 tempPatternMask[MaxP]; /*used as a local representation of
588 part of inputPatternMasked*/
589 Uint1 tempPattern[MaxP]; /*used as a local representation of part of
590 inputPattern*/
591
592 for (i = 0; i < length; i++) {
593 thisPlaceMasked = -inputPatternMasked[i];
594 if (thisPlaceMasked > 0) { /*represented variable wildcard*/
595 inputPatternMasked[i] = allone;
596 for (j = 0; j < length; j++) {
597 /*use this to keep track of pattern*/
598 tempPatternMask[j] = inputPatternMasked[j];
599 tempPattern[j] = inputPattern[j];
600 }
601 recReturnValue2 = recReturnValue1 =
602 expanding(inputPatternMasked, inputPattern, length, maxLength);
603 if (recReturnValue1 == -1)
604 return -1;
605 for (numPos = 0; numPos <= thisPlaceMasked; numPos++) {
606 if (numPos == 1)
607 continue;
608 for (k = 0; k < length; k++) {
609 if (k == i) {
610 for (t = 0; t < numPos; t++) {
611 inputPatternMasked[recReturnValue1++] = allone;
612 if (recReturnValue1 >= maxLength)
613 return(-1);
614 }
615 }
616 else {
617 inputPatternMasked[recReturnValue1] = tempPatternMask[k];
618 inputPattern[recReturnValue1++] = tempPattern[k];
619 if (recReturnValue1 >= maxLength)
620 return(-1);
621 }
622 if (recReturnValue1 >= maxLength)
623 return (-1);
624 }
625 recReturnValue1 =
626 expanding(&inputPatternMasked[recReturnValue2],
627 &inputPattern[recReturnValue2],
628 length + numPos - 1,
629 maxLength - recReturnValue2);
630 if (recReturnValue1 == -1)
631 return -1;
632 recReturnValue2 += recReturnValue1;
633 recReturnValue1 = recReturnValue2;
634 }
635 return recReturnValue1;
636 }
637 }
638 return length;
639 }
640
641 /*Pack the next length bytes of inputPattern into a bit vector
642 where the bit is 1 if and only if the byte is non-0
643 Returns packed bit vector*/
packing(Uint1 * inputPattern,Int4 length)644 static Int4 packing(Uint1 *inputPattern, Int4 length)
645 {
646 Int4 i; /*loop index*/
647 Int4 returnValue = 0; /*value to return*/
648 for (i = 0; i < length; i++) {
649 if (inputPattern[i])
650 returnValue += (1 << i);
651 }
652 return returnValue;
653 }
654
655
656 /*Pack the bit representation of the inputPattern into
657 the array patternSearch->match_maskL
658 numPlaces is the number of positions in
659 inputPattern
660 Also packs patternSearch->bitPatternByLetter */
longpacking(Int4 numPlaces,Uint1 * inputPattern,patternSearchItems * patternSearch)661 static void longpacking(Int4 numPlaces, Uint1 *inputPattern, patternSearchItems *patternSearch)
662 {
663 Int4 charIndex; /*index over characters in alphabet*/
664 Int4 bitPattern; /*bit pattern for one word to pack*/
665 Int4 i; /*loop index over places*/
666 Int4 wordIndex; /*loop counter over words to pack into*/
667
668 patternSearch->numWords = (numPlaces-1) / BITS_PACKED_PER_WORD +1;
669
670 for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
671 bitPattern = 0;
672 for (i = 0; i < BITS_PACKED_PER_WORD; i++) {
673 if (inputPattern[wordIndex*BITS_PACKED_PER_WORD+i])
674 bitPattern += (1 << i);
675 }
676 patternSearch->match_maskL[wordIndex] = bitPattern;
677 }
678 for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) {
679 for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
680 bitPattern = 0;
681 for (i = 0; i < BITS_PACKED_PER_WORD; i++) {
682 if ((1<< charIndex) & patternSearch->inputPatternMasked[wordIndex*BITS_PACKED_PER_WORD + i])
683 bitPattern = bitPattern | (1 << i);
684 }
685 patternSearch->bitPatternByLetter[charIndex][wordIndex] =
686 bitPattern;
687 }
688 }
689 }
690
691
692 /*Let F be the number of the first bit in s that is 1
693 Let G be the first bit in mask that is one such that G < F;
694 Else let G = -1;
695 Returns F - G*/
lenofL(Int4 * s,Int4 * mask,patternSearchItems * patternSearch)696 static Int4 lenofL(Int4 *s, Int4 *mask, patternSearchItems *patternSearch)
697 {
698 Int4 bitIndex; /*loop index over bits in a word*/
699 Int4 wordIndex; /*loop index over words*/
700 Int4 firstOneInMask;
701
702 firstOneInMask = -1;
703 for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
704 for (bitIndex = 0; bitIndex < BITS_PACKED_PER_WORD; bitIndex++) {
705 if ((s[wordIndex] >> bitIndex) % 2 == 1)
706 return wordIndex*BITS_PACKED_PER_WORD+bitIndex-firstOneInMask;
707 if ((mask[wordIndex] >> bitIndex) %2 == 1)
708 firstOneInMask = wordIndex*BITS_PACKED_PER_WORD+bitIndex;
709 }
710 }
711 ErrPostEx(SEV_FATAL, 1, 0, "wrong\n");
712 return(-1);
713 }
714
715 /*Shift each word in the array left by 1 bit and add bit b,
716 if the new values is bigger that OVERFLOW1, then subtract OVERFLOW1 */
lmove(Int4 * a,Uint1 b,patternSearchItems * patternSearch)717 static void lmove(Int4 *a, Uint1 b, patternSearchItems *patternSearch)
718 {
719 Int4 x;
720 Int4 i; /*index on words*/
721 for (i = 0; i < patternSearch->numWords; i++) {
722 x = (a[i] << 1) + b;
723 if (x >= OVERFLOW1) {
724 a[i] = x - OVERFLOW1;
725 b = 1;
726 }
727 else {
728 a[i] = x;
729 b = 0;
730 }
731 }
732 }
733
734 /*Do a word-by-word bit-wise or of a and b and put the result back in a*/
or(Int4 * a,Int4 * b,patternSearchItems * patternSearch)735 static void or(Int4 *a, Int4 *b, patternSearchItems *patternSearch)
736 {
737 Int4 i; /*index over words*/
738 for (i = 0; i < patternSearch->numWords; i++)
739 a[i] = (a[i] | b[i]);
740 }
741
742 /*Do a word-by-word bit-wise or of a and b and put the result in
743 result; return 1 if there are any non-zero words*/
and(Int4 * result,Int4 * a,Int4 * b,patternSearchItems * patternSearch)744 static and(Int4 *result, Int4 *a, Int4 *b, patternSearchItems *patternSearch)
745 {
746 Int4 i; /*index over words*/
747 Int4 returnValue = 0;
748
749 for (i = 0; i < patternSearch->numWords; i++)
750 if (result[i] = (a[i] & b[i]))
751 returnValue = 1;
752 return returnValue;
753 }
754
755 /* Finds places where pattern matches seq and returns them as
756 pairs of positions in consecutive entries of hitArray;
757 similar to find_hitsS
758 hitArray is array of hits to return
759 seq is the input sequence and len1 is its length
760 patternSearch carries all the pattern variables
761 return twice the number of hits*/
find_hitsL(Int4 * hitArray,Uint1Ptr seq,Int4 len1,patternSearchItems * patternSearch)762 static Int4 find_hitsL(Int4 *hitArray, Uint1Ptr seq, Int4 len1,
763 patternSearchItems *patternSearch)
764 {
765 Int4 wordIndex; /*index on words in mask*/
766 Int4 i; /*loop index on seq */
767 Int4 *prefixMatchedBitPattern; /*see similar variable in
768 find_hitsS*/
769 Int4 twiceNumHits = 0; /*counter for hitArray*/
770 Int4 *mask; /*local copy of match_maskL version of pattern
771 indicates after which positions a match can be declared*/
772 Int4 *matchResult; /*Array of words to hold the result of the
773 final test for a match*/
774
775 matchResult = (Int4 *) ckalloc(sizeof(Int4)*patternSearch->numWords);
776 mask = (Int4 *) ckalloc(sizeof(Int4)*patternSearch->numWords);
777 prefixMatchedBitPattern = (Int4 *) ckalloc(sizeof(Int4)*patternSearch->numWords);
778 for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
779 mask[wordIndex] = patternSearch->match_maskL[wordIndex];
780 prefixMatchedBitPattern[wordIndex] = 0;
781 }
782 /*This is a multiword version of the algorithm in find_hitsS*/
783 lmove(mask, 1, patternSearch);
784 for (i = 0; i < len1; i++) {
785 lmove(prefixMatchedBitPattern, 0, patternSearch);
786 or(prefixMatchedBitPattern, mask, patternSearch);
787 and(prefixMatchedBitPattern, prefixMatchedBitPattern, patternSearch->bitPatternByLetter[seq[i]], patternSearch);
788 if (and(matchResult, prefixMatchedBitPattern, patternSearch->match_maskL, patternSearch)) {
789 hitArray[twiceNumHits++] = i;
790 hitArray[twiceNumHits++] = i-lenofL(matchResult, patternSearch->match_maskL, patternSearch)+1;
791 }
792 }
793 MemFree(prefixMatchedBitPattern);
794 MemFree(matchResult);
795 MemFree(mask);
796 return twiceNumHits;
797 }
798
799
800 /*initialize mask and other arrays for DNA patterns*/
init_pattern_DNA(patternSearchItems * patternSearch)801 static void init_pattern_DNA(patternSearchItems *patternSearch)
802 {
803 Int4 mask1; /*mask for one word in a set position*/
804 Int4 compositeMask; /*superimposed mask1 in 4 adjacent positions*/
805 Int4 wordIndex; /*index over words in pattern*/
806
807 if (patternSearch->flagPatternLength != ONE_WORD_PATTERN) {
808 for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
809 mask1 = patternSearch->match_maskL[wordIndex];
810 compositeMask = mask1 + (mask1>>1)+(mask1>>2)+(mask1>>3);
811 setting_tt(patternSearch->SLL[wordIndex],
812 patternSearch->match_maskL[wordIndex],
813 compositeMask, patternSearch->DNAprefixSLL[wordIndex], patternSearch->DNAsuffixSLL[wordIndex]);
814 }
815 }
816 else {
817 compositeMask = patternSearch->match_mask +
818 (patternSearch->match_mask>>1) +
819 (patternSearch->match_mask>>2) + (patternSearch->match_mask>>3);
820 patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAwhichPrefixPositions;
821 patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAwhichSuffixPositions;
822 setting_tt(patternSearch->whichPositionsByCharacter,
823 patternSearch->match_mask, compositeMask,
824 patternSearch->DNAwhichPrefixPositions, patternSearch->DNAwhichSuffixPositions);
825 }
826 }
827
828 /*Return the number of 1 bits in the base 2 representation of a*/
num_of_one(Int4 a)829 static Int4 num_of_one(Int4 a)
830 {
831 Int4 returnValue;
832 returnValue = 0;
833 while (a > 0) {
834 if (a % 2 == 1)
835 returnValue++;
836 a = (a >> 1);
837 }
838 return returnValue;
839 }
840
841 /*Sets up field in patternSearch when pattern is very long*/
longpacking2(Int4 * inputPatternMasked,Int4 numPlacesInPattern,patternSearchItems * patternSearch)842 static void longpacking2(Int4 *inputPatternMasked, Int4 numPlacesInPattern, patternSearchItems *patternSearch)
843 {
844 Int4 placeIndex; /*index over places in pattern rep.*/
845 Int4 wordIndex; /*index over words*/
846 Int4 placeInWord, placeInWord2; /*index for places in a single word*/
847 Int4 charIndex; /*index over characters in alphabet*/
848 Int4 oneWordMask; /*mask of matching characters for one word in
849 pattern representation*/
850 Nlm_FloatHi patternWordProbability;
851 Nlm_FloatHi most_specific; /*lowest probability of a word in the pattern*/
852 Int4 *oneWordSLL; /*holds patternSearch->SLL for one word*/
853
854 most_specific = 1.0;
855 patternSearch->whichMostSpecific = 0;
856 patternWordProbability = 1.0;
857 for (placeIndex = 0, wordIndex = 0, placeInWord=0;
858 placeIndex <= numPlacesInPattern; placeIndex++, placeInWord++) {
859 if (placeIndex==numPlacesInPattern || inputPatternMasked[placeIndex] < 0
860 || placeInWord == BITS_PACKED_PER_WORD ) {
861 patternSearch->match_maskL[wordIndex] = 1 << (placeInWord-1);
862 oneWordSLL = patternSearch->SLL[wordIndex];
863 for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) {
864 oneWordMask = 0;
865 for (placeInWord2 = 0; placeInWord2 < placeInWord; placeInWord2++) {
866 if ((1<< charIndex) &
867 inputPatternMasked[placeIndex-placeInWord+placeInWord2])
868 oneWordMask |= (1 << placeInWord2);
869 }
870 oneWordSLL[charIndex] = oneWordMask;
871 }
872 patternSearch->numPlacesInWord[wordIndex] = placeInWord;
873 if (patternWordProbability < most_specific) {
874 most_specific = patternWordProbability;
875 patternSearch->whichMostSpecific = wordIndex;
876 }
877 if (placeIndex == numPlacesInPattern)
878 patternSearch->spacing[wordIndex++] = 0;
879 else
880 if (inputPatternMasked[placeIndex] < 0) {
881 patternSearch->spacing[wordIndex++] = -inputPatternMasked[placeIndex];
882 }
883 else {
884 placeIndex--;
885 patternSearch->spacing[wordIndex++] = 0;
886 }
887 placeInWord = -1;
888 patternWordProbability = 1.0;
889 }
890 else {
891 patternWordProbability *= (Nlm_FloatHi)
892 num_of_one(inputPatternMasked[placeIndex])/ (Nlm_FloatHi) ALPHABET_SIZE;
893 }
894 }
895 patternSearch->numWords = wordIndex;
896 }
897
898 /*find matches when pattern is very long,
899 hitArray is used to pass back pairs of end position. start position for hits
900 seq is the sequence; len is its length
901 is_dna indicates if the sequence is DNA or protein*/
find_hitsLL(Int4 * hitArray,Uint1Ptr seq,Int4 len,Boolean is_dna,patternSearchItems * patternSearch)902 static Int4 find_hitsLL(Int4 *hitArray, Uint1Ptr seq, Int4 len, Boolean is_dna,
903 patternSearchItems *patternSearch)
904 {
905 Int4 twiceNumHits; /*twice the number of matches*/
906 Int4 twiceHitsOneCall; /*twice the number of hits in one call to
907 find_hitsS */
908 Int4 wordIndex; /*index over words in pattern*/
909 Int4 start; /*start position in sequence for calls to find_hitsS */
910 Int4 hitArray1[MAX_HIT]; /*used to get hits against different words*/
911 Int4 nextPosInHitArray; /*next available position in hitArray1 */
912 Int4 hitIndex1, hitIndex2; /*indices over hitArray1*/
913
914 patternSearch->whichPositionPtr =
915 patternSearch->SLL[patternSearch->whichMostSpecific];
916 patternSearch->match_mask =
917 patternSearch->match_maskL[patternSearch->whichMostSpecific];
918 if (is_dna) {
919 patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAprefixSLL[patternSearch->whichMostSpecific];
920 patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAsuffixSLL[patternSearch->whichMostSpecific];
921 }
922 /*find matches to most specific word of pattern*/
923 twiceNumHits = find_hitsS_head(hitArray, seq, 0, len, is_dna, patternSearch);
924 if (twiceNumHits < 2)
925 return 0;
926 /*extend matches word by word*/
927 for (wordIndex = patternSearch->whichMostSpecific+1;
928 wordIndex < patternSearch->numWords; wordIndex++) {
929 patternSearch->whichPositionPtr =
930 patternSearch->SLL[wordIndex];
931 patternSearch->match_mask = patternSearch->match_maskL[wordIndex];
932 if (is_dna) {
933 patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAprefixSLL[wordIndex];
934 patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAsuffixSLL[wordIndex];
935 }
936 nextPosInHitArray = 0;
937 for (hitIndex2 = 0; hitIndex2 < twiceNumHits; hitIndex2 += 2) {
938 twiceHitsOneCall = find_hitsS_head(&hitArray1[nextPosInHitArray], seq,
939 hitArray[hitIndex2]+1, MIN(len-hitArray[hitIndex2]-1,
940 patternSearch->spacing[wordIndex-1] + patternSearch->numPlacesInWord[wordIndex]), is_dna, patternSearch);
941 for (hitIndex1 = 0; hitIndex1 < twiceHitsOneCall; hitIndex1+= 2) {
942 hitArray1[nextPosInHitArray+hitIndex1] =
943 hitArray[hitIndex2]+hitArray1[nextPosInHitArray+hitIndex1]+1;
944 hitArray1[nextPosInHitArray+hitIndex1+1] = hitArray[hitIndex2+1];
945 }
946 nextPosInHitArray += twiceHitsOneCall;
947 }
948 twiceNumHits = nextPosInHitArray;
949 if (twiceNumHits < 2)
950 return 0;
951 /*copy back matches that extend */
952 for (hitIndex2 = 0; hitIndex2 < nextPosInHitArray; hitIndex2++)
953 hitArray[hitIndex2] = hitArray1[hitIndex2];
954 }
955 /*extend each match back one word at a time*/
956 for (wordIndex = patternSearch->whichMostSpecific-1; wordIndex >=0;
957 wordIndex--) {
958 patternSearch->whichPositionPtr =
959 patternSearch->SLL[wordIndex];
960 patternSearch->match_mask = patternSearch->match_maskL[wordIndex];
961 if (is_dna) {
962 patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAprefixSLL[wordIndex];
963 patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAsuffixSLL[wordIndex];
964 }
965 nextPosInHitArray = 0;
966 for (hitIndex2 = 0; hitIndex2 < twiceNumHits; hitIndex2 += 2) {
967 start = hitArray[hitIndex2+1]-patternSearch->spacing[wordIndex]-patternSearch->numPlacesInWord[wordIndex];
968 if (start < 0)
969 start = 0;
970 twiceHitsOneCall = find_hitsS_head(&hitArray1[nextPosInHitArray], seq, start,
971 hitArray[hitIndex2+1]-start, is_dna, patternSearch);
972 for (hitIndex1 = 0; hitIndex1 < twiceHitsOneCall; hitIndex1+= 2) {
973 hitArray1[nextPosInHitArray+hitIndex1] = hitArray[hitIndex2];
974 hitArray1[nextPosInHitArray+hitIndex1+1] = start +
975 hitArray1[nextPosInHitArray+hitIndex1+1];
976 }
977 nextPosInHitArray += twiceHitsOneCall;
978 }
979 twiceNumHits = nextPosInHitArray;
980 if (twiceNumHits < 2)
981 return 0;
982 /*copy back matches that extend*/
983 for (hitIndex2 = 0; hitIndex2 < nextPosInHitArray; hitIndex2++)
984 hitArray[hitIndex2] = hitArray1[hitIndex2];
985 }
986 return twiceNumHits;
987 }
988
989 /*stores dynamic programming information */
990 typedef struct DP {
991 Int4 CC; /*minimum cost for these coordiantes*/
992 Int4 DD; /*minimum cost when preceded by a deletion (vertical edge)
993 for these coordinates*/
994 } *dp_ptr, dp_node;
995
996 #define gap(k) ((k) <= 0 ? 0 : (alignSearch->gapOpen+alignSearch->gapExtend*(k))) /* k-symbol indel cost */
997
998 typedef struct {
999 Int4Ptr sapp; /* Current script append ptr */
1000 Int4 last; /*last edit operation*/
1001 } data_dp;
1002
1003 /* Append "Delete k" op */
1004 #define DEL_(k) \
1005 { \
1006 if (data->last < 0) \
1007 data->last = data->sapp[-1] -= (k); \
1008 else \
1009 data->last = (*data->sapp++ = -(k)); \
1010 }
1011 /* Append "Insert k" op */
1012 #define INS_(k) \
1013 { \
1014 if (data->last > 0) \
1015 data->last = data->sapp[-1] += (k); \
1016 else \
1017 data->last = (*data->sapp++ = (k)); \
1018 }
1019
1020 /* Append "Replace" op */
1021 #define REP_ \
1022 { data->last = (*data->sapp++ = 0); \
1023 }
1024
1025 /* Returns the cost of an optimum conversion
1026 within highDiag and lowDiag between
1027 seq1[1..end1] and seq2[1..end2] and appends such a conversion
1028 to the current script.
1029 Score_only is 1 if only the score is desired, not the conversion script.
1030 0 otherwise
1031 CD and data stores dynamic program intermediate scores
1032 */
align(Uint1 * seq1,Uint1 * seq2,Int4 end1,Int4 end2,Int4 lowDiag,Int4 highDiag,Char score_only,dp_ptr CD,alignSearchItems * alignSearch,data_dp * data)1033 static Int4 align(Uint1 * seq1, Uint1 * seq2, Int4 end1, Int4 end2,
1034 Int4 lowDiag, Int4 highDiag, Char score_only, dp_ptr CD, alignSearchItems * alignSearch, data_dp * data)
1035 {
1036 Int4 nextState; /*stores code for next entry in state*/
1037 Int4 score; /*score to return*/
1038 Int4 band; /*number of diagonals between highDiag and lowDiag
1039 inclusive*/
1040 Int4 diagIndex; /*loop index over diagonals*/
1041 Int4 leftd, rightd; /* diagonal indices for CC, DD, CP and DP */
1042 register Int4 curd; /* current index for CC, DD CP and DP */
1043 register Int4 i; /*loop index*/
1044 register Int4 index1; /*index on seq1*/
1045 register Int4 tempCC; /*placeholder for a CC field*/
1046 register Int4 tempDD; /*placeholder for a DD field*/
1047 register Int4 tempHorScore; /*dual of tempDD for the case where a
1048 horizontal edge (insertion) is the last step*/
1049 register dp_ptr CDrow; /*points to a row of CD*/
1050 Int4 stateDecoder; /*used to decode the edge information in a state*/
1051 Int4 initialScore; /*score to initialize dynamic program entries*/
1052 Int4 *matrixRow; /*row of score matrix*/
1053 Int1 **state; /*stores dynamic program information*/
1054 Int1 *stateRow; /*holds one row of state*/
1055 Int1 *editInstructions; /*holds instruction for string-to-string edit*/
1056 Int4 index2; /*index on seq2*/
1057 Int4 charCounter; /*counts number of characters involved in
1058 optimal edit sequence*/
1059 Int4 charIndex; /*index over character positions in optimal
1060 edit sequence*/
1061 Int4 editStep, nextEditStep; /*steps in string conversion sequence*/
1062
1063
1064 /* Boundary cases: end1 <= 0 , end2 <= 0, or highDiag-lowDiag <= 0 */
1065 band = highDiag-lowDiag+1;
1066 state = (Int1 **) ckalloc(sizeof(Int1 *)*(end1+1));
1067 state[0] = (Int1 *) ckalloc((end1+1)*(band+2));
1068 for (index1 = 1; index1 <= end1; index1++)
1069 state[index1] = state[index1-1]+band+2;
1070
1071 /* Initialization */
1072 leftd = 1-lowDiag;
1073 rightd = highDiag-lowDiag+1;
1074
1075 CD[leftd].CC = 0;
1076 state[0][leftd] = -1;
1077 initialScore = -alignSearch->gapOpen;
1078 for (diagIndex = leftd+1; diagIndex <= rightd; diagIndex++) {
1079 CD[diagIndex].CC = initialScore = initialScore-alignSearch->gapExtend;
1080 CD[diagIndex-1].DD = initialScore-alignSearch->gapCost;
1081 state[0][diagIndex] = DIAGONAL_INSERT;
1082 }
1083 CD[rightd+1].CC = MININT;
1084 CD[rightd].DD = MININT;
1085 CD[leftd-1].DD = -alignSearch->gapCost;
1086 CD[leftd-1].CC = MININT;
1087 for (i = 1; i <= end1; i++) {
1088 if (i > end2-highDiag)
1089 rightd--;
1090 if (leftd > 1)
1091 leftd--;
1092 matrixRow = alignSearch->matrix[seq1[i]];
1093 tempDD = CD[leftd].DD;
1094 nextState = 0;
1095 if ((index2 = leftd+lowDiag-1+i) > 0)
1096 tempCC = CD[leftd].CC+matrixRow[seq2[index2]];
1097 if (tempDD > tempCC || index2 <= 0) {
1098 tempCC = tempDD;
1099 nextState = DIAGONAL_DELETE;
1100 }
1101 tempHorScore = tempCC-alignSearch->gapCost;
1102 if (leftd >= 1)
1103 if ((tempDD-= alignSearch->gapExtend) >= tempHorScore) {
1104 CD[leftd-1].DD = tempDD;
1105 nextState += DELETE_CODE;
1106 }
1107 else {
1108 CD[leftd-1].DD = tempHorScore;
1109 }
1110 stateRow = &state[i][leftd];
1111 *stateRow++ = nextState;
1112 CD[leftd].CC = tempCC;
1113 for (curd=leftd+1, CDrow = &CD[curd]; curd <= rightd; curd++) {
1114 tempCC = CDrow->CC + matrixRow[seq2[curd+lowDiag-1+i]];
1115 if ((tempDD=CDrow->DD) > tempCC) {
1116 if (tempDD > tempHorScore) {
1117 CDrow->CC = tempDD;
1118 tempHorScore -= alignSearch->gapExtend;
1119 (CDrow++-1)->DD = tempDD-alignSearch->gapExtend;
1120 *stateRow++=DELETE_CODE + INSERT_CODE + DIAGONAL_DELETE;
1121 }
1122 else {
1123 CDrow->CC = tempHorScore;
1124 tempHorScore -= alignSearch->gapExtend;
1125 (CDrow++-1)->DD = tempDD-alignSearch->gapExtend;
1126 *stateRow++=DELETE_CODE + INSERT_CODE + DIAGONAL_INSERT;
1127 }
1128 }
1129 else
1130 if (tempHorScore > tempCC) {
1131 CDrow->CC = tempHorScore;
1132 tempHorScore -= alignSearch->gapExtend;
1133 (CDrow++-1)->DD = tempDD-alignSearch->gapExtend;
1134 *stateRow++=DELETE_CODE + INSERT_CODE + DIAGONAL_INSERT;;
1135 }
1136 else {
1137 CDrow->CC = tempCC;
1138 if ((tempCC -= alignSearch->gapCost) >
1139 (tempHorScore-=alignSearch->gapExtend)) {
1140 tempHorScore = tempCC;
1141 nextState = 0;
1142 }
1143 else
1144 nextState = INSERT_CODE;
1145 if (tempCC > (tempDD -= alignSearch->gapExtend)) {
1146 *stateRow++= nextState;
1147 (CDrow++-1)->DD = tempCC;
1148 }
1149 else {
1150 *stateRow++ = nextState+DELETE_CODE;
1151 (CDrow++-1)->DD = tempDD;
1152 }
1153 }
1154 }
1155 }
1156
1157 /* decide which path to be traced back */
1158 score = (CDrow-1)->CC;
1159 if (score_only) {
1160 MemFree(state[0]);
1161 MemFree(state);
1162 return score;
1163 }
1164 editInstructions = (Int1Ptr) ckalloc(end1+end2);
1165 for (index1 = end1, diagIndex = rightd, editStep=0, charCounter = 0;
1166 index1>=0; index1--, charCounter++) {
1167 nextEditStep = (stateDecoder=state[index1][diagIndex]) % INSERT_CODE;
1168 if (stateDecoder == -1)
1169 break;
1170 if (editStep == DIAGONAL_INSERT
1171 && ((stateDecoder/INSERT_CODE)%2) == 1)
1172 nextEditStep = DIAGONAL_INSERT;
1173 if (editStep == DIAGONAL_DELETE && (stateDecoder/DELETE_CODE)== 1)
1174 nextEditStep = DIAGONAL_DELETE;
1175 if (nextEditStep == DIAGONAL_INSERT) {
1176 diagIndex--;
1177 index1++;
1178 }
1179 else
1180 if (nextEditStep == DIAGONAL_DELETE)
1181 diagIndex++;
1182 editInstructions[charCounter] = editStep = nextEditStep;
1183 }
1184 for (charIndex = charCounter-1; charIndex >= 0; charIndex--)
1185 switch(editInstructions[charIndex]) {
1186 case 0:
1187 REP_;
1188 break;
1189 case DIAGONAL_INSERT:
1190 INS_(1);
1191 break;
1192 case DIAGONAL_DELETE:
1193 DEL_(1);
1194 break;
1195 }
1196 MemFree(editInstructions);
1197 MemFree(state[0]);
1198 MemFree(state);
1199 return(score);
1200 }
1201
1202
1203 /*Do a banded, gapped alignment of seq1 and seq2 starting at position start1
1204 of seq1 and position start2 of seq2; the band boundaries are lowBand
1205 and highBand, matrix is the score matrix; gapOpen and gapExtend are
1206 the gap costs; alignScript is used to describe the alignment;
1207 alignScript is NULL if only the alignment score is desired
1208 the alignment score is returned*/
ALIGN_B(Uint1 * seq1,Uint1 * seq2,Int4 start1,Int4 start2,Int4 lowDiag,Int4 highDiag,Int4 ** matrix,Int4 gapOpen,Int4 gapExtend,Int4 * alignScript,data_dp * data)1209 static Int4 ALIGN_B(Uint1 *seq1, Uint1 *seq2,Int4 start1, Int4 start2,
1210 Int4 lowDiag, Int4 highDiag,Int4 **matrix, Int4 gapOpen,
1211 Int4 gapExtend, Int4 * alignScript, data_dp * data)
1212 {
1213 Int4 score; /*score to return*/
1214 Int4 i; /*index over sequences*/
1215 Int4 band; /*width of band*/
1216 alignSearchItems *alignSearch; /*holds alignment parameters*/
1217 dp_ptr CD; /*array for dynamic program information*/
1218
1219 alignSearch = (alignSearchItems *) ckalloc(sizeof(alignSearchItems));
1220 /* Setup alignment parameters */
1221 alignSearch->matrix = matrix;
1222 alignSearch->gapOpen = gapOpen;
1223 alignSearch->gapExtend = gapExtend;
1224 alignSearch->gapCost = gapOpen+gapExtend;
1225 lowDiag = MIN(MAX(-start1, lowDiag),MIN(start2-start1,0));
1226 highDiag = MAX(MIN(start2, highDiag),MAX(start2-start1,0));
1227
1228 if (start2 <= 0) {
1229 if (start1 > 0)
1230 if (alignScript)
1231 DEL_(start1);
1232 return -gap(start1);
1233 }
1234 if (start1 <= 0) {
1235 if (alignScript)
1236 INS_(start2);
1237 return -gap(start2);
1238 }
1239 if ((band = highDiag-lowDiag+1) <= 1) {
1240 score = 0;
1241 for (i = 1; i <= start1; i++) {
1242 if (alignScript)
1243 REP_;
1244 score += alignSearch->matrix[seq1[i]][seq2[i]];
1245 }
1246 MemFree(alignSearch);
1247 return score;
1248 }
1249
1250 CD = (dp_ptr) ckalloc(sizeof(dp_node)*(band+2));
1251 if (alignScript) {
1252 score = align(seq1,seq2,start1,start2,lowDiag,highDiag,0, CD, alignSearch, data);
1253 }
1254 else {
1255 score = align(seq1,seq2,start1,start2,lowDiag,highDiag, 1, CD, alignSearch, data);
1256 }
1257 MemFree(CD);
1258 MemFree(alignSearch);
1259 return score;
1260 }
1261
1262 /*seq is a piece of a sequence, len is the sength of seq
1263 pass back start and end position where first pattern match occurs*/
get_pat(Uint1 * seq,Int4 len,Int4 * start,Int4 * end,patternSearchItems * patternSearch)1264 static void get_pat(Uint1 *seq, Int4 len, Int4 *start, Int4 *end,
1265 patternSearchItems *patternSearch)
1266 {
1267 Int4 mask; /*mask of input pattern positions after which
1268 a match can be declared*/
1269 Int4 maskShiftPlus1; /*mask shifted left plus 1*/
1270 Int4 prefixMatchedBitPattern = 0; /*indicates where pattern aligns
1271 with seq; e.g., if value is 9 = 0101 then
1272 last 3 chars of seq match first 3 positions in pattern
1273 and last 1 char of seq matches 1 position of pattern*/
1274 Int4 i; /*index over seq */
1275 Int4 rightOne; /*loop index looking for 1 in both mask and
1276 prefixMatchedBitPattern*/
1277 Int4 rightMaskOnly; /*rightmost bit that is 1 in the mask only*/
1278
1279 mask = patternSearch->match_mask;
1280 maskShiftPlus1 = (mask << 1) +1;
1281 for (i = 0, prefixMatchedBitPattern= 0; i < len; i++) {
1282 prefixMatchedBitPattern =
1283 ((prefixMatchedBitPattern << 1) | maskShiftPlus1) &
1284 patternSearch->whichPositionPtr[seq[i]];
1285 }
1286 /*do the work of lenof here*/
1287 rightMaskOnly = -1;
1288 rightOne = prefixMatchedBitPattern & mask;
1289 for (i = 0; i < BITS_PACKED_PER_WORD; i++) {
1290 if ((rightOne >> i) % 2 == 1)
1291 break;
1292 if ((mask >> i) %2 == 1)
1293 rightMaskOnly = i;
1294 }
1295 *start = rightMaskOnly + 1;
1296 *end = i;
1297 }
1298
1299 /*seq is a piece of a sequence, len is the sength of seq
1300 pass back start and end position where first pattern match occurs
1301 Similar to get_pat, but for patterns longer than a word*/
get_patL(Uint1 * seq,Int4 len,Int4 * start,Int4 * end,patternSearchItems * patternSearch)1302 static void get_patL(Uint1 *seq, Int4 len, Int4 *start, Int4 *end,
1303 patternSearchItems *patternSearch)
1304 {
1305 Int4 *mask; /*mask of input pattern positions after which
1306 a match can be declared*/
1307 Int4 *prefixMatchedBitPattern; /*indicates where pattern aligns with seq*/
1308 Int4 wordIndex; /*index over words in pattern*/
1309 Int4 i; /*index over seq*/
1310 Int4 rightMaskOnly; /*rightmost bit that is 1 in the mask only*/
1311 Int4 j; /*index over bits in a word*/
1312 Boolean found = FALSE; /*found match position yet*/
1313
1314 mask = (Int4 *) ckalloc(sizeof(Int4)*patternSearch->numWords);
1315 prefixMatchedBitPattern = (Int4 *)
1316 ckalloc(sizeof(Int4)*patternSearch->numWords);
1317 for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
1318 mask[wordIndex] = patternSearch->match_maskL[wordIndex];
1319 prefixMatchedBitPattern[wordIndex] = 0;
1320 }
1321 lmove(mask, 1, patternSearch);
1322 for (i = 0; i < len; i++) {
1323 lmove(prefixMatchedBitPattern, 0, patternSearch);
1324 or(prefixMatchedBitPattern, mask, patternSearch);
1325 and(prefixMatchedBitPattern, prefixMatchedBitPattern,
1326 patternSearch->bitPatternByLetter[seq[i]], patternSearch);
1327 }
1328 and(prefixMatchedBitPattern, prefixMatchedBitPattern,
1329 patternSearch->match_maskL, patternSearch);
1330 rightMaskOnly = -1;
1331 for (wordIndex = 0; (wordIndex < patternSearch->numWords) && (!found);
1332 wordIndex++) {
1333 for (j = 0; j < BITS_PACKED_PER_WORD && (!found); j++) {
1334 if ((prefixMatchedBitPattern[wordIndex]>>j) % 2 == 1)
1335 found = TRUE;
1336 else
1337 if ((patternSearch->match_maskL[wordIndex] >> j)%2 == 1)
1338 rightMaskOnly = wordIndex*BITS_PACKED_PER_WORD+j;
1339 }
1340 }
1341 if (found) {
1342 wordIndex--;
1343 j --;
1344 }
1345 prefixMatchedBitPattern = MemFree(prefixMatchedBitPattern);
1346 mask = MemFree(mask);
1347 *start = rightMaskOnly+1;
1348 *end = wordIndex*BITS_PACKED_PER_WORD+j;
1349 }
1350
1351 /*Find pattern occurrences in seq when the pattern description is
1352 extra long, report the results back in hitArray
1353 seq is input sequence, len is length of seq
1354 hitArray stores pairs of length/position for matches */
get_patLL(Uint1 * seq,Int4 len,Int4 * hitArray,patternSearchItems * patternSearch)1355 static void get_patLL(Uint1 *seq, Int4 len, Int4 *hitArray,
1356 patternSearchItems *patternSearch)
1357 {
1358 Int4 i, j; /*indices on one word hits*/
1359 Int4 wordIndex, wordIndex2; /*indices on words in pattern representation*/
1360 Int4 twiceHitsOneWord; /*Twice the number of hits against one
1361 word of the pattern*/
1362 Int4 hitIndex; /*index over hits against one word*/
1363 Int4 pos; /*keeps track of how many intermediate hits*/
1364 Int4 placeIndex; /*index over the number of places in the pattern rep*/
1365 Int4 hitArray1[MAX_HIT];
1366 Int4 oneWordHitArray[MAX_WORDS_IN_PATTERN]; /*hits for one word of
1367 pattern representation*/
1368
1369 i = 1;
1370 hitArray[0] = patternSearch->numPlacesInWord[0]-1;
1371 for (wordIndex = 1; wordIndex < patternSearch->numWords; wordIndex++) {
1372 patternSearch->whichPositionPtr = patternSearch->SLL[wordIndex];
1373 patternSearch->match_mask = patternSearch->match_maskL[wordIndex];
1374 pos = 0;
1375 for (j = 0; j < i; j += wordIndex) {
1376 twiceHitsOneWord = find_hitsS(oneWordHitArray,
1377 (Uint1Ptr) &seq[hitArray[j]+1],
1378 MIN(len-hitArray[j]-1,
1379 patternSearch->spacing[wordIndex-1]+patternSearch->numPlacesInWord[wordIndex]),
1380 patternSearch);
1381 for (hitIndex = 0; hitIndex < twiceHitsOneWord;
1382 hitIndex+= 2, pos+= wordIndex+1) {
1383 hitArray1[pos] = hitArray[j]+oneWordHitArray[hitIndex]+1;
1384 for (wordIndex2 = 1; wordIndex2 < wordIndex; wordIndex2++)
1385 hitArray1[pos+wordIndex2] = hitArray[j+wordIndex2];
1386 hitArray1[pos+wordIndex] = oneWordHitArray[hitIndex+1];
1387 }
1388 }
1389 i = pos;
1390 for (j = 0; j < pos; j++)
1391 hitArray[j] = hitArray1[j];
1392 }
1393 for (j = 0; j < pos; j+= wordIndex) {
1394 for (placeIndex = patternSearch->numPlacesInWord[0], i=j+1;
1395 i < j+wordIndex; i++)
1396 placeIndex += hitArray[i]+patternSearch->numPlacesInWord[i-j];
1397 if (placeIndex != len)
1398 continue;
1399 if (j == 0)
1400 return;
1401 for (i = 0; i < wordIndex; i++)
1402 hitArray[i] = hitArray[i+j];
1403 return;
1404 }
1405 ErrPostEx(SEV_FATAL, 1, 0, "getLL wrong\n");
1406 exit(1);
1407 }
1408
1409 /*align querySeq to dbSeq when guaranteed that the pattern occurs
1410 in each
1411 lenQuerySeq and lenDbSeq are their lengths
1412 alignScript is alignment script; NULL if only score is desired
1413 tback will hold adjusted alignment script for return
1414 gap_align keeps track of parameters for a gapped alignment
1415 multiple is related to significance of the pattern match
1416 the overal match score is returned; a variant of the
1417 score is passed back in useful_score*/
1418
align_of_pattern(Uint1 * querySeq,Uint1 * dbSeq,Int4 lenQuerySeq,Int4 lenDbSeq,Int4 * alignScript,Int4 ** tback,GapAlignBlkPtr gap_align,Int4 * useful_score,Nlm_FloatHi * multiple,patternSearchItems * patternSearch,seedSearchItems * seedSearch)1419 Int4 LIBCALL align_of_pattern(Uint1 *querySeq, Uint1 *dbSeq, Int4 lenQuerySeq,
1420 Int4 lenDbSeq, Int4 *alignScript, Int4 **tback,
1421 GapAlignBlkPtr gap_align, Int4 *useful_score,
1422 Nlm_FloatHi *multiple, patternSearchItems *patternSearch,
1423 seedSearchItems * seedSearch)
1424 {
1425 data_dp *data; /*stores dynaic program info*/
1426 Int4 startQueryMatch, endQueryMatch; /*positions delimiting
1427 where query matches pattern first */
1428 Int4 startDbMatch, endDbMatch; /*positions delimiting where
1429 database sequence matches pattern first*/
1430 Int4 local_score, local_useful_score; /*two versions of score for return*/
1431 Int4 queryMatchOffset, dbMatchOffset; /*offset from sequence start where
1432 pattern character matches,
1433 used as indices*/
1434 Int4 patternPosQuery, patternPosDb; /*positions in pattern
1435 for matches to query and database sequences*/
1436 Int4 wordIndex; /*index over words*/
1437
1438 Int4 placeIndex1, placeIndex2; /*indices over places in pattern*/
1439 Int4 *hitArray1=NULL, *hitArray2=NULL;
1440 Int4 numPlacesInWord[MAX_WORDS_IN_PATTERN];
1441 Int4 **matrix; /*score matrix*/
1442 Int4 gap_open; /*gap opening penalty*/
1443 Int4 gap_extend; /*gap extension penalty*/
1444 Nlm_FloatHi mul = 1.0; /*local copy of what to pass back in multiple*/
1445
1446 if (alignScript)
1447 data = (data_dp *) ckalloc(1 * sizeof(data_dp));
1448 gap_open = gap_align->gap_open;
1449 gap_extend = gap_align->gap_extend;
1450 matrix = gap_align->matrix;
1451 if (patternSearch->flagPatternLength == ONE_WORD_PATTERN) {
1452 get_pat(querySeq, lenQuerySeq, &startQueryMatch, &endQueryMatch,
1453 patternSearch);
1454 get_pat(dbSeq, lenDbSeq, &startDbMatch, &endDbMatch, patternSearch);
1455 }
1456 else
1457 if (patternSearch->flagPatternLength == MULTI_WORD_PATTERN) {
1458 get_patL(querySeq, lenQuerySeq, &startQueryMatch, &endQueryMatch,
1459 patternSearch);
1460 get_patL(dbSeq, lenDbSeq, &startDbMatch, &endDbMatch,
1461 patternSearch);
1462 }
1463 else {
1464 hitArray1 = Nlm_Malloc(MAX_HIT*sizeof(Int4));
1465 hitArray2 = Nlm_Malloc(MAX_HIT*sizeof(Int4));
1466 if (alignScript) {
1467 data->sapp = alignScript;
1468 data->last = 0;
1469 }
1470 get_patLL(querySeq, lenQuerySeq, hitArray1, patternSearch);
1471 get_patLL(dbSeq, lenDbSeq, hitArray2, patternSearch);
1472 mul = 1.0;
1473 local_useful_score = 0;
1474 local_score = 0;
1475 for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++)
1476 numPlacesInWord[wordIndex] = patternSearch->numPlacesInWord[wordIndex];
1477 for (placeIndex1 = 0, wordIndex = 0;
1478 placeIndex1 < patternSearch->highestPlace; placeIndex1++) {
1479 if (patternSearch->inputPatternMasked[placeIndex1] < 0) {
1480 for (placeIndex2 = placeIndex1-1; placeIndex2 >=0; placeIndex2--)
1481 if (patternSearch->inputPatternMasked[placeIndex2] != allone)
1482 break;
1483 numPlacesInWord[wordIndex++]-= placeIndex1-placeIndex2-1;
1484 hitArray1[wordIndex] += placeIndex1-placeIndex2-1;
1485 hitArray2[wordIndex] += placeIndex1-placeIndex2-1;
1486 }
1487 }
1488 queryMatchOffset = dbMatchOffset = 0;
1489 for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
1490 for (placeIndex1 = 0; placeIndex1 < numPlacesInWord[wordIndex]; placeIndex1++) {
1491 if (alignScript)
1492 REP_
1493 else {
1494 if (patternSearch->inputPatternMasked[wordIndex] > allone) {
1495 local_score +=matrix[*querySeq][*dbSeq];
1496 local_useful_score += matrix[*querySeq][*dbSeq];
1497 mul *= seedSearch->charMultiple[*querySeq];
1498 }
1499 else
1500 local_score += matrix[*querySeq][*dbSeq];
1501 }
1502 querySeq++;
1503 dbSeq++;
1504 }
1505 if (wordIndex < patternSearch->numWords-1) {
1506 local_score += ALIGN_B(querySeq-1, dbSeq-1, hitArray1[wordIndex+1], hitArray2[wordIndex+1],
1507 BAND_LOW, BAND_HIGH, matrix, gap_open, gap_extend, alignScript, data);
1508 querySeq += hitArray1[wordIndex+1];
1509 dbSeq += hitArray2[wordIndex+1];
1510 }
1511 }
1512 if (!alignScript) {
1513 *useful_score = local_useful_score;
1514 *multiple = mul;
1515 }
1516 else
1517 *tback = data->sapp;
1518
1519 hitArray1 = MemFree(hitArray1);
1520 hitArray2 = MemFree(hitArray2);
1521
1522 return local_score;
1523 }
1524
1525 local_score = local_useful_score = 0;
1526 if (alignScript) {
1527 data->sapp = alignScript;
1528 data->last = 0;
1529 }
1530 for (patternPosQuery = startQueryMatch, patternPosDb = startDbMatch; patternPosQuery <= endQueryMatch || patternPosDb <= endDbMatch; ) {
1531 if (patternSearch->inputPatternMasked[patternPosQuery] != allone && patternSearch->inputPatternMasked[patternPosDb] != allone) {
1532 if (alignScript)
1533 REP_
1534 else {
1535 local_score += matrix[*querySeq][*dbSeq];
1536 if (patternSearch->inputPatternMasked[patternPosQuery] > allone) {
1537 local_useful_score+= matrix[*querySeq][*dbSeq];
1538 mul *= seedSearch->charMultiple[*querySeq];
1539 }
1540 }
1541 patternPosQuery++;
1542 patternPosDb++;
1543 querySeq++;
1544 dbSeq++;
1545 }
1546 else {
1547 for (queryMatchOffset =0; patternSearch->inputPatternMasked[patternPosQuery] == allone
1548 && patternPosQuery <= endQueryMatch; patternPosQuery++, queryMatchOffset++) ;
1549 for (dbMatchOffset = 0; patternSearch->inputPatternMasked[patternPosDb] == allone && patternPosDb <= endDbMatch; patternPosDb++, dbMatchOffset++) ;
1550 if (queryMatchOffset == dbMatchOffset) {
1551 do {
1552 if (alignScript)
1553 REP_
1554 else
1555 local_score+= matrix[*querySeq][*dbSeq];
1556 querySeq++;
1557 dbSeq++;
1558 queryMatchOffset--;
1559 } while (queryMatchOffset > 0);
1560 }
1561 else {
1562 local_score += ALIGN_B(querySeq-1, dbSeq-1, queryMatchOffset, dbMatchOffset,
1563 BAND_LOW, BAND_HIGH, matrix, gap_open, gap_extend, alignScript, data);
1564 querySeq+=queryMatchOffset;
1565 dbSeq+=dbMatchOffset;
1566 }
1567 }
1568 }
1569 if (!alignScript) {
1570 *multiple = mul;
1571 *useful_score = local_useful_score;
1572 }
1573 else
1574 *tback = data->sapp;
1575 if (alignScript)
1576 MemFree(data);
1577 return local_score;
1578 }
1579
1580
1581 /*print output for sequence seq starting at offset begin and
1582 ending at offset end
1583 called once for each match*/
pat_output(Uint1 * seq,Int4 begin,Int4 end,patternSearchItems * patternSearch,ValNodePtr PNTR info_vnp)1584 void LIBCALL pat_output(Uint1 *seq, Int4 begin, Int4 end, patternSearchItems *patternSearch, ValNodePtr PNTR info_vnp)
1585 {
1586 Int4 startSeqMatch, endSeqMatch; /*positions in seq where
1587 pattern match starts and ends*/
1588 Int4 position; /*position of match*/
1589 Int4 nextMatchStart; /*increment for start of next match*/
1590 Int4 wordIndex; /*index over words in pattern*/
1591 Int4 i; /*index over seq*/
1592 Int4 placeIndex1, placeIndex2; /*indices over places in pattern*/
1593 Int4 spacingArray[MAX_WORDS_IN_PATTERN]; /*number of characters of
1594 sequence used across each variable-length region in pattern*/
1595 Int4 numPlacesInWord[MAX_WORDS_IN_PATTERN]; /*number of places in each word
1596 of the pattern*/
1597 Char buffer[512];
1598
1599 ValNodeCopyStr(info_vnp, 0, "HI "); /*Fixed printing command here*/
1600
1601 if (patternSearch->flagPatternLength == ONE_WORD_PATTERN) {
1602 get_pat(seq+begin, end-begin+1, &startSeqMatch, &endSeqMatch, patternSearch);
1603 }
1604 else
1605 if (patternSearch->flagPatternLength == MULTI_WORD_PATTERN) {
1606 get_patL(seq+begin, end-begin+1, &startSeqMatch, &endSeqMatch, patternSearch);
1607 }
1608 else {
1609 get_patLL(seq+begin, end-begin+1, spacingArray, patternSearch);
1610 for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++)
1611 numPlacesInWord[wordIndex] = patternSearch->numPlacesInWord[wordIndex];
1612 for (placeIndex1 = 0, wordIndex = 0; placeIndex1 < patternSearch->highestPlace; placeIndex1++) {
1613 if (patternSearch->inputPatternMasked[placeIndex1] < 0) {
1614 for (placeIndex2 = placeIndex1-1; placeIndex2 >=0; placeIndex2--)
1615 if (patternSearch->inputPatternMasked[placeIndex2] != allone)
1616 break;
1617 numPlacesInWord[wordIndex++]-= placeIndex1-placeIndex2-1;
1618 spacingArray[wordIndex] += placeIndex1-placeIndex2-1;
1619 }
1620 }
1621 position = begin;
1622 for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {
1623 sprintf(buffer, "(%ld %ld)", (long) (1 + position), (long) (1 + position+numPlacesInWord[wordIndex]-1));
1624 ValNodeCopyStr(info_vnp, 0, buffer);
1625 position += numPlacesInWord[wordIndex]+spacingArray[wordIndex+1];
1626 }
1627 ValNodeCopyStr(info_vnp, 0, "\n");
1628 return;
1629 }
1630 nextMatchStart = 0;
1631 for (i = startSeqMatch; i <= endSeqMatch; ) {
1632 if (patternSearch->inputPatternMasked[i] != allone) {
1633 i++;
1634 } else {
1635 if (0 == nextMatchStart)
1636 sprintf(buffer, "(%ld %ld) ", (long) (1 + begin+nextMatchStart), (long) (1+begin+i-1 - startSeqMatch));
1637 else
1638 sprintf(buffer, "(%ld %ld) ", (long) (1 +begin+nextMatchStart - startSeqMatch), (long) (1+begin+i-1 - startSeqMatch));
1639 ValNodeCopyStr(info_vnp, 0, buffer);
1640
1641 for (; patternSearch->inputPatternMasked[i] == allone && i <= endSeqMatch; i++) ;
1642 nextMatchStart = i;
1643 }
1644 }
1645 if (nextMatchStart != i) { /*last match*/
1646 sprintf(buffer, "(%ld %ld)\n", (long) (1+begin+nextMatchStart-startSeqMatch), (long) (1+begin+i-1 - startSeqMatch));
1647 ValNodeCopyStr(info_vnp, 0, buffer);
1648 } else {
1649 ValNodeCopyStr(info_vnp, 0, "\n");
1650 }
1651 }
1652
1653 /*find the places where the pattern matches seq;
1654 len is the length of seq
1655 hitArray stores the results as pairs of positions in consecutive
1656 entries
1657 is_dna indicates whether seq is made of DNA or protein letters
1658 twice the number of hits (length of hitArray filled in is returned
1659 3 different methods are used depending on the length of the pattern*/
find_hits(Int4 * hitArray,Uint1Ptr seq,Int4 len,Boolean is_dna,patternSearchItems * patternSearch)1660 Int4 LIBCALL find_hits(Int4 *hitArray, Uint1Ptr seq, Int4 len, Boolean is_dna,
1661 patternSearchItems * patternSearch)
1662 {
1663 if (patternSearch->flagPatternLength == ONE_WORD_PATTERN)
1664 return find_hitsS_head(hitArray, seq, 0, len, is_dna, patternSearch);
1665 if (patternSearch->flagPatternLength == MULTI_WORD_PATTERN)
1666 return find_hitsL(hitArray, seq, len, patternSearch);
1667 return find_hitsLL(hitArray, seq, len, is_dna, patternSearch);
1668 }
1669
1670 #ifdef __cplusplus
1671 }
1672 #endif
1673