1 static char const rcsid[] = "$Id: gapxdrop.c,v 6.76 2003/11/07 21:52:52 dondosha Exp $";
2
3 /* ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================*/
26
27 /*****************************************************************************
28
29 File name: gapxdrop.c
30
31 Author: Gennadiy Savchuk, Jinqhui Zhang, Tom Madden
32 [From code originally supplied by Webb Miller's lab].
33
34 Contents: Functions to perform a gapped alignment on two sequences.
35
36 ****************************************************************************/
37 /* $Revision: 6.76 $
38 * $Log: gapxdrop.c,v $
39 * Revision 6.76 2003/11/07 21:52:52 dondosha
40 * Fixed SEMI_G_ALIGN_EX so it produces correct scores, by making code exactly correspond to ALIGN_EX
41 *
42 * Revision 6.75 2003/08/20 22:11:49 dondosha
43 * Removed some garbage code
44 *
45 * Revision 6.74 2003/06/23 21:09:23 dondosha
46 * Allow left extension from offset 0, since the starting point must be part of the left extension
47 *
48 * Revision 6.73 2003/05/30 17:25:36 coulouri
49 * add rcsid
50 *
51 * Revision 6.72 2003/05/27 20:29:57 dondosha
52 * Fix for OOF alignments starting at offsets 1 or 2 in query
53 *
54 * Revision 6.71 2003/05/15 21:50:08 dondosha
55 * The break in previous change was put in a wrong place; fixed
56 *
57 * Revision 6.70 2003/05/06 18:56:12 dondosha
58 * In ALIGN_packed_nucl: break out of loop if matrix score is MININT, to avoid gapping through sentinel bytes
59 *
60 * Revision 6.69 2003/04/22 17:50:07 dondosha
61 * Correction in OOFGapXEditBlockToSeqAlign for case when gap is followed by a frame shift
62 *
63 * Revision 6.68 2002/08/22 12:26:42 madden
64 * Removed unused variables
65 *
66 * Revision 6.67 2002/07/24 19:14:42 dondosha
67 * Previous fix was incomplete; correcting
68 *
69 * Revision 6.66 2002/07/02 17:08:48 dondosha
70 * If alignment longer than traceback, set traceback remainder to 0
71 *
72 * Revision 6.65 2002/06/11 14:44:48 dondosha
73 * Return status from some functions instead of search block pointer
74 *
75 * Revision 6.64 2002/05/09 17:01:23 dondosha
76 * Renamed typedefs dp_ptr and dp_node to GapXDPPtr and GapXDP
77 *
78 * Revision 6.63 2002/05/08 22:48:26 dondosha
79 * Allocate memory for dynamic programming upfront in Mega BLAST case
80 *
81 * Revision 6.62 2002/03/05 17:53:41 dondosha
82 * Added one sanity check in GapXEditBlockToSeqAlign
83 *
84 * Revision 6.61 2001/08/27 18:56:26 madden
85 * Fix OOFGapXEditBlockToSeqAlign if alignment overruns end of query
86 *
87 * Revision 6.60 2001/07/23 13:47:17 dondosha
88 * Added LIBCALL to GapXEditScriptDelete declaration
89 *
90 * Revision 6.59 2001/07/19 22:13:31 dondosha
91 * Made GapXEditScriptDelete public for use in mblast.c
92 *
93 * Revision 6.58 2001/04/25 13:25:47 madden
94 * Do not memset tback
95 *
96 * Revision 6.57 2000/12/29 20:45:38 madden
97 * Fix for ALIGN_packed_nucl
98 *
99 * Revision 6.56 2000/11/30 21:37:39 madden
100 * Roll back before GapXDropSetAlignMask changes
101 *
102 * Revision 6.55 2000/11/29 21:01:39 madden
103 * Use shorter length for dp_ptr rather than value from X
104 *
105 * Revision 6.54 2000/11/15 15:16:41 madden
106 * Changes to recycle dynamic programming structure
107 *
108 * Revision 6.53 2000/11/09 15:15:52 shavirin
109 * Removed #include <txalign.h>
110 *
111 * Revision 6.52 2000/10/13 21:23:48 shavirin
112 * Fixed OOF problem with truncated alignments.
113 *
114 * Revision 6.51 2000/10/04 20:43:36 shavirin
115 * Fixed case with many multiple frame shifts - protein end corrected
116 * in SeqAlign.
117 *
118 * Revision 6.50 2000/10/04 18:06:40 dondosha
119 * Subtract decline_penalty from initial values of dyn_prog[i].FF
120 *
121 * Revision 6.49 2000/09/07 16:24:27 shavirin
122 * Fixed minor memory leak.
123 *
124 * Revision 6.48 2000/08/30 14:15:16 shavirin
125 * Empty log message.
126 *
127 * Revision 6.47 2000/08/30 13:56:58 shavirin
128 * Removed (ifdef-ed out) OOF debug printout.
129 *
130 * Revision 6.46 2000/08/29 22:09:04 shavirin
131 * Next update to OOF API. Fixed few cases.
132 *
133 * Revision 6.45 2000/08/25 19:00:45 shavirin
134 * Added adjustment to the order of unaligned regions and gaps to ensure,
135 * that unaligned regions will be always from the left from gap if adjacent
136 *
137 * Revision 6.44 2000/08/08 21:44:25 shavirin
138 * Enabled possibility to create discontinuous alignment.
139 *
140 * Revision 6.43 2000/07/26 21:02:26 shavirin
141 * Removed debug printing of OOF alignment.
142 *
143 * Revision 6.42 2000/07/26 20:33:13 kans
144 * included txalign.h
145 *
146 * Revision 6.41 2000/07/25 16:48:07 shavirin
147 * Changed function to create ASN.1 for results of OOF search.
148 *
149 * Revision 6.40 2000/07/18 22:36:01 shavirin
150 * Fixed ABR, ABW errors or purify in PerformGapped..() functions.
151 *
152 * Revision 6.39 2000/07/17 15:24:21 shavirin
153 * Added parameter to function OOFGapXEditBlockToSeqAlign()
154 *
155 * Revision 6.37 2000/07/13 15:10:55 shavirin
156 * Changed SEMI_G_ALIGN to OOF_SEMI_G_ALIGN for is_ooframe == TRUE.
157 *
158 * Revision 6.36 2000/07/12 23:07:31 kans
159 * reverse_seq moved from pseed3.c to blastool.c, placed in blast.h header, called by gapxdrop.c
160 *
161 * Revision 6.35 2000/07/12 14:48:14 kans
162 * added prototype for reverse_seq, copied from pseed3.c - unable to easily include seed.h due to cascading dependency errors
163 *
164 * Revision 6.34 2000/07/11 20:56:08 shavirin
165 * Fixed minor typo in GapAlignBlkDelete().
166 *
167 * Revision 6.33 2000/07/11 20:49:07 shavirin
168 * Added all major functions for Out-Of-Frame alignment.
169 *
170 * Revision 6.32 2000/06/16 20:36:55 madden
171 * Remove MemSet from setup of state structure
172 *
173 * Revision 6.31 2000/05/16 19:59:39 madden
174 * Fix for rpsblast extensions
175 *
176 * Revision 6.30 2000/05/02 17:17:13 madden
177 * Changes to prevent gapping between sequences in rps-blast
178 *
179 * Revision 6.29 2000/03/29 21:54:28 dondosha
180 * Made GapXEditScriptNew public for use in MegaBlastFillHspGapInfo
181 *
182 * Revision 6.28 2000/02/16 21:45:42 shavirin
183 * Fixed memory leaks in GXEMakeSeqAlign() function.
184 *
185 * Revision 6.27 2000/02/11 21:12:43 shavirin
186 * Fixed creating reversed seqalign (gaps).
187 *
188 * Revision 6.26 2000/02/10 16:41:02 shavirin
189 * Fixed creation of seqalign in case of reversed tblastn case.
190 *
191 * Revision 6.25 1999/12/17 20:47:04 egorov
192 * Fix 'gcc -Wall' warnings
193 *
194 * Revision 6.24 1999/11/26 22:07:48 madden
195 * Added PerformNtGappedAlignment and ALIGN_packed_nucl
196 *
197 * Revision 6.23 1999/10/29 14:50:48 shavirin
198 * Workaround to bypass DECLINE regions.
199 *
200 * Revision 6.22 1999/10/27 16:40:34 shavirin
201 * Produced more correct discontinuous SeqAlign if DECLINE regions exists.
202 *
203 * Revision 6.21 1999/10/04 18:28:45 madden
204 * Add ALIGN_EX and SEMI_ALIGN_EX that do not require reversing sequences
205 *
206 * Revision 6.20 1999/09/23 15:28:23 shavirin
207 * Temporary disabled ability to create discontinuous alignment.
208 *
209 * Revision 6.19 1999/09/21 15:52:53 shavirin
210 * Added possibility to create discontinuous alignment in case when at
211 * leaset one DECLINE_ALIGN region exists in results of BLAST search.
212 *
213 * Revision 6.18 1999/08/27 18:07:34 shavirin
214 * Passed parameter decline_align from top to the engine.
215 *
216 * Revision 6.17 1999/05/12 18:48:52 madden
217 * lift assumption that no deletion follows insertion
218 *
219 * Revision 6.16 1999/05/03 18:59:33 madden
220 * Removed set, but unused, variable count
221 *
222 * Revision 6.15 1999/03/17 18:39:11 madden
223 * Removed unneccessary memset
224 *
225 * Revision 6.14 1999/02/18 21:18:49 madden
226 * MINIINT is INT4_MIN/2
227 *
228 * Revision 6.13 1999/02/17 19:42:42 madden
229 * change INT4_MIN to -9999999 again
230 *
231 * Revision 6.11 1998/12/18 16:20:42 madden
232 * Removed unnecessary memsets
233 *
234 * Revision 6.10 1998/11/19 14:03:38 madden
235 * minor efficiency
236 *
237 * Revision 6.9 1998/11/17 13:39:03 madden
238 * Made ALIGN non-static
239 *
240 * Revision 6.8 1998/09/24 15:26:40 egorov
241 * Fix lint complaints
242 *
243 * Revision 6.7 1998/08/05 21:28:16 madden
244 * Protection against core-dump when gap_extend is zero
245 *
246 * Revision 6.6 1998/05/22 19:16:52 egorov
247 * Another try to fix time problem (increase chunk size to 2Mb)
248 *
249 * Revision 6.5 1998/05/21 12:49:26 egorov
250 * Remove memory allocation optimization to reduce time
251 *
252 * Revision 6.4 1998/04/21 18:39:25 madden
253 * Zhengs suggestion to save memory on long sequences
254 *
255 * Revision 6.3 1998/04/17 19:41:17 madden
256 * Zhengs changes for decline to align
257 *
258 * Revision 6.2 1998/01/06 18:26:48 madden
259 * Gaps have same strand as surrounding sequence
260 *
261 * Revision 6.1 1997/09/22 17:36:29 madden
262 * MACROS for position-specific matrices from Andy Neuwald
263 *
264 * Revision 6.0 1997/08/25 18:53:14 madden
265 * Revision changed to 6.0
266 *
267 * Revision 1.32 1997/06/24 21:03:38 madden
268 * TracebackToGapXEditBlock() check for last GapXEditScript fixed
269 *
270 * Revision 1.31 1997/06/24 19:12:12 tatiana
271 * TracebackToGapXEditBlock() check for last GapXEditScript added
272 *
273 * Revision 1.30 1997/06/24 18:14:33 tatiana
274 * TracebackToGapXEditBlock() fixed
275 *
276 * Revision 1.29 1997/05/13 20:45:50 madden
277 * fixed offset problem in PerformGappedAlignment
278 *
279 * Revision 1.28 1997/04/21 15:35:26 madden
280 * Fixes for 'gapped' StdSegs.
281 *
282 * Revision 1.27 1997/04/16 16:21:38 madden
283 * Set seqalign_type.
284 *
285 * Revision 1.26 1997/04/15 22:01:53 madden
286 * Added original_length[12] for translating searches.
287 *
288 * Revision 1.25 1997/04/14 14:10:37 madden
289 * Fix for the case of dropoff less than the opening penalty.
290 *
291 * Revision 1.24 1997/04/11 19:02:49 madden
292 * Changes for in-frame blastx, tblastn gapping.
293 *
294 * Revision 1.23 1997/03/14 21:01:59 madden
295 * Changed to use less memory in ALIGN, with GapXDropStateArrayStructPtr.
296 *
297 * Revision 1.22 1997/03/11 19:24:08 madden
298 * Check return value of MemNew for state_array in ALIGN.
299 *
300 * Revision 1.21 1997/03/01 18:25:33 madden
301 * Sequences reversed on SeqAlign if reverse flag set on EditBlockPtr.
302 *
303 * Revision 1.20 1997/02/24 16:40:38 madden
304 * Change to GapXEditBlockToSeqAlign to use first SeqIdPtr, duplicate.
305 *
306 * Revision 1.19 1997/02/24 15:09:38 madden
307 * Replaced MemSet by setting some elements of an array to zero,.
308 *
309 * Revision 1.18 1997/02/23 16:44:47 madden
310 * Memory, saved on GapAlignBlkPtr, reuses.
311 *
312 * Revision 1.17 1997/02/20 22:58:49 madden
313 * Support for gapped translated results.
314 *
315 * Revision 1.16 1997/02/20 21:50:24 madden
316 * Added frame and translation information to GapAlignBlk, assigned it.
317 *
318 * Revision 1.15 1997/02/12 22:19:08 madden
319 * Changes for position-based comparisons.
320 *
321 * Revision 1.14 1997/02/10 15:25:33 madden
322 * Changed args. of ALIGN and SEMI_G_ALIGN to pass in gap_align block,
323 * also whether the query was reversed and the query_offset.
324 * Also General cleanup.
325 *
326 * Revision 1.13 1997/02/04 16:22:32 madden
327 * Changes to enable gapped alignments on the reverse strand.
328 *
329 * Revision 1.12 1997/01/22 17:45:08 madden
330 * Added positionBased Boolean to ALIGN and SEMI_G_ALIGN.
331 *
332 * Revision 1.11 1997/01/21 18:55:17 madden
333 * Fixed bug with translation of EditScript to SeqAlign.
334 *
335 * Revision 1.10 1997/01/17 14:29:34 madden
336 * Protection against NULL args added in GapXEditBlockDelete.
337 *
338 * Revision 1.9 1997/01/16 20:20:49 madden
339 * TracebackToGapXEditBlock made non-static.
340 *
341 * Revision 1.8 1997/01/06 22:40:55 madden
342 * Added function SimpleIntervalToGapXEditBlock.
343 *
344 * Revision 1.7 1997/01/06 17:22:59 madden
345 * Used GapXEditScriptToSeqAlign to find SeqAlign.
346 *
347 * Revision 1.6 1996/12/30 21:45:28 madden
348 * UMR fix.
349 *
350 * Revision 1.5 1996/12/30 17:14:06 madden
351 * Fixes for changes for "require a portion of the query sequence".
352 *
353 * Revision 1.4 1996/12/30 15:44:25 madden
354 * Added capability to require a portion of the query sequence.
355 *
356 * Revision 1.3 1996/12/16 15:29:12 madden
357 * Changed gapalign.h to gapxdrop.h
358 *
359 * Revision 1.2 1996/12/12 16:45:03 madden
360 * GapAlignBlkPtr used instead of arguments in functions.
361 *
362 * Revision 1.1 1996/12/12 14:02:51 madden
363 * Initial revision
364 *
365 */
366
367
368 #include <gapxdrop.h>
369 #include <blast.h>
370
371
372 /* A PACKAGE FOR LOCALLY ALIGNING TWO SEQUENCES WITHIN A BAND:
373
374 To invoke, call SEMI_G_ALIGN(A,B,M,N,W,G,H,S,X,&EI,&EJ, score_only, positionBased).
375 The parameters are explained as follows:
376 A, B : two sequences to be aligned
377 M : the length of sequence A
378 N : the length of sequence B
379 W : scoring table for matches and mismatches
380 G : gap-opening penalty
381 H : gap-extension penalty
382 X : maximum drop off
383 S : script for DISPLAY routine
384 *EI : ending position of sequence A in the optimal local alignment
385 *EJ : ending position of sequence B in the optimal local alignment
386 score_only: indicate if ony score is needed. 1--score only 0--alignment
387 is also needed.
388
389 Use best_score to cut unfeasible points. quadratic space and row-wise
390
391 modification (cfj):
392 - These routines can be told to changes their access to A and B, so that they are passed pointers to the
393 forward sequence, but act as if they had been past pointers to reversed sequences.
394 (This is done just as the accesses to posMatrix are flipped if reverse is set.)
395 This obviates the need to actually reverse A and B, which in some runs was almost 20% of
396 the total runtime.
397 This option is invoked by setting reverse_sequence.
398 */
399
400
401 /* Append "Delete k" op */
402 #define DEL_(k) \
403 data.last = (data.last < 0) ? (data.sapp[-1] -= (k)) : (*data.sapp++ = -(k));
404
405 /* Append "Insert k" op */
406 #define INS_(k) \
407 data.last = (data.last > 0) ? (data.sapp[-1] += (k)) : (*data.sapp++ = (k));
408
409 /* Append "Replace" op */
410 #define REP_ \
411 { data.last = *data.sapp++ = 0; }
412
413 /* Divide by two to prevent underflows. */
414 #define MININT INT4_MIN/2
415 #define REPP_ \
416 { *data.sapp++ = MININT; data.last = 0; }
417
418
419 typedef struct {
420 GapXDPPtr CD;
421 Int4Ptr PNTR v;
422 Int4Ptr sapp; /* Current script append ptr */
423 Int4 last;
424 Int4 h, g;
425 } data_t;
426
427 /* #define CHUNKSIZE 1048576 */
428 #define CHUNKSIZE 2097152
429
430 static GapXDropStateArrayStructPtr
GapXDropGetState(GapXDropStateArrayStructPtr PNTR head,Int4 length)431 GapXDropGetState(GapXDropStateArrayStructPtr PNTR head, Int4 length)
432
433 {
434 GapXDropStateArrayStructPtr retval, var, last;
435 Int4 chunksize = MAX(CHUNKSIZE, length + length/3);
436
437 length += length/3; /* Add on about 30% so the end will get reused. */
438 retval = NULL;
439 if (*head == NULL)
440 {
441 retval = (GapXDropStateArrayStructPtr) Nlm_Malloc(sizeof(GapXDropStateArrayStruct));
442 retval->state_array = Nlm_Malloc(chunksize*sizeof(Uint1));
443 retval->length = chunksize;
444 retval->used = 0;
445 retval->next = NULL;
446 *head = retval;
447 }
448 else
449 {
450 var = *head;
451 last = *head;
452 while (var)
453 {
454 if (length < (var->length - var->used))
455 {
456 retval = var;
457 break;
458 }
459 else if (var->used == 0)
460 { /* If it's empty and too small, replace. */
461 var->state_array = MemFree(var->state_array);
462 var->state_array = Nlm_Malloc(chunksize*sizeof(Uint1));
463 var->length = chunksize;
464 retval = var;
465 break;
466 }
467 last = var;
468 var = var->next;
469 }
470
471 if (var == NULL)
472 {
473 retval = (GapXDropStateArrayStructPtr) Nlm_Malloc(sizeof(GapXDropStateArrayStruct));
474 retval->state_array = Nlm_Malloc(chunksize*sizeof(Uint1));
475 retval->length = chunksize;
476 retval->used = 0;
477 retval->next = NULL;
478 last->next = retval;
479 }
480 }
481
482 if (retval->state_array == NULL)
483 ErrPostEx(SEV_ERROR, 0, 0, "state array is NULL");
484
485 return retval;
486
487 }
488
489 static Boolean
GapXDropPurgeState(GapXDropStateArrayStructPtr state_struct)490 GapXDropPurgeState(GapXDropStateArrayStructPtr state_struct)
491
492 {
493 while (state_struct)
494 {
495 /*
496 MemSet(state_struct->state_array, 0, state_struct->used);
497 */
498 state_struct->used = 0;
499 state_struct = state_struct->next;
500 }
501
502 return TRUE;
503 }
504
505 GapXDropStateArrayStructPtr
GapXDropStateDestroy(GapXDropStateArrayStructPtr state_struct)506 GapXDropStateDestroy(GapXDropStateArrayStructPtr state_struct)
507
508 {
509 GapXDropStateArrayStructPtr next;
510
511 while (state_struct)
512 {
513 next = state_struct->next;
514 MemFree(state_struct->state_array);
515 MemFree(state_struct);
516 state_struct = next;
517
518 }
519
520 return NULL;
521 }
522
523 /*
524 Aligns two nucleotide sequences, one (A) should be packed in the
525 same way as the BLAST databases, the other (B) should contain one
526 basepair/byte.
527 Boolean reverse_sequence: do reverse the sequence.
528 */
529
ALIGN_packed_nucl(Uint1Ptr B,Uint1Ptr A,Int4 N,Int4 M,Int4Ptr pej,Int4Ptr pei,GapAlignBlkPtr gap_align,Int4 query_offset,Boolean reverse_sequence)530 static Int4 ALIGN_packed_nucl(Uint1Ptr B, Uint1Ptr A, Int4 N, Int4 M,
531 Int4Ptr pej, Int4Ptr pei, GapAlignBlkPtr gap_align,
532 Int4 query_offset, Boolean reverse_sequence)
533 {
534 GapXDPPtr dyn_prog;
535 Int4 i, j, cb, j_r, g, decline_penalty;
536 register Int4 c, d, e, m, tt, h, X, f;
537 Int4 best_score = 0, new_score;
538 Int4Ptr *matrix;
539 register Int4Ptr wa;
540 register GapXDPPtr dp;
541 Uint1Ptr Bptr;
542 Uint1 base_pair;
543 Int4 B_increment=1;
544
545 matrix = gap_align->matrix;
546 *pei = *pej = 0;
547 m = (g=gap_align->gap_open) + gap_align->gap_extend;
548 h = gap_align->gap_extend;
549 decline_penalty = gap_align->decline_align;
550 X = gap_align->x_parameter;
551
552 if (X < m)
553 X = m;
554
555 if(N <= 0 || M <= 0) return 0;
556
557 j = (N + 2) * sizeof(GapXDP);
558 if (gap_align->dyn_prog)
559 dyn_prog = gap_align->dyn_prog;
560 else
561 dyn_prog = (GapXDPPtr)Nlm_Malloc(j);
562 if (!dyn_prog) {
563 ErrPostEx(SEV_ERROR, 0, 0,
564 "Cannot allocate %ld bytes for dynamic programming", j);
565 return -1;
566 }
567 dyn_prog[0].CC = 0; c = dyn_prog[0].DD = -m;
568 dyn_prog[0].FF = -m;
569 for(i = 1; i <= N; i++) {
570 if(c < -X) break;
571 dyn_prog[i].CC = c;
572 dyn_prog[i].DD = c - m;
573 dyn_prog[i].FF = c - m - decline_penalty;
574 c -= h;
575 }
576
577 if(reverse_sequence)
578 B_increment=-1;
579 else
580 B_increment=1;
581
582 tt = 0; j = i;
583 for (j_r = 1; j_r <= M; j_r++) {
584 if(reverse_sequence)
585 {
586 base_pair = READDB_UNPACK_BASE_N(A[(M-j_r)/4], ((j_r-1)%4));
587 wa = matrix[base_pair];
588 }
589 else
590 {
591 base_pair = READDB_UNPACK_BASE_N(A[1+((j_r-1)/4)], (3-((j_r-1)%4)));
592 wa = matrix[base_pair];
593 }
594 e = c =f = MININT;
595 Bptr = &B[tt];
596 if(reverse_sequence)
597 Bptr = &B[N-tt];
598 /*
599 Bptr += B_increment;
600 */
601 for (cb = i = tt, dp = &dyn_prog[i]; i < j; i++) {
602 Bptr += B_increment;
603 new_score = wa[*Bptr];
604
605 d = dp->DD;
606 if (e < f) e = f;
607 if (d < f) d = f;
608 if (c < d || c < e) {
609 if (d < e) {
610 c = e;
611 } else {
612 c = d;
613 }
614 if (best_score - c > X) {
615 c = dp->CC+new_score; f = dp->FF;
616 if (tt == i) tt++;
617 else { dp->CC =dp->FF= MININT;}
618 } else {
619 cb = i;
620 if ((c-=m) > (d-=h)) {
621 dp->DD = c;
622 } else {
623 dp->DD = d;
624 }
625 if (c > (e-=h)) {
626 e = c;
627 }
628 c+=m;
629 d = dp->CC+new_score; dp->CC = c; c=d;
630 d = dp->FF; dp->FF = f-decline_penalty; f = d;
631 }
632 } else {
633 if (best_score - c > X){
634 c = dp->CC+new_score; f= dp->FF;
635 if (tt == i) tt++;
636 else { dp->CC =dp->FF= MININT;}
637 } else {
638 cb = i;
639 if (c > best_score) {
640 best_score = c;
641 *pei = j_r; *pej = i;
642 }
643 if ((c-=m) > (d-=h)) {
644 dp->DD = c;
645 } else {
646 dp->DD = d;
647 }
648 if (c > (e-=h)) {
649 e = c;
650 }
651 c+=m;
652 d = dp->FF;
653 if (c-g>f) dp->FF = c-g-decline_penalty; else dp->FF = f-decline_penalty;
654 f = d;
655 d = dp->CC+new_score; dp->CC = c; c = d;
656 }
657 }
658 if (new_score == MININT)
659 break;
660 dp++;
661 }
662 if (tt == j) break;
663 if (cb < j-1) { j = cb+1;}
664 else while (e >= best_score-X && j <= N) {
665 dyn_prog[j].CC = e; dyn_prog[j].DD = e-m;dyn_prog[j].FF = e-g-decline_penalty;
666 e -= h; j++;
667 }
668 if (j <= N) {
669 dyn_prog[j].DD = dyn_prog[j].CC = dyn_prog[j].FF = MININT; j++;
670 }
671 }
672
673 if (!gap_align->dyn_prog)
674 MemFree(dyn_prog);
675
676 return best_score;
677 }
678 /*
679 Boolean revered: has the sequence been reversed? Used for psi-blast
680 Boolean reverse_sequence: do reverse the sequence.
681 Two Booleans are used to emulate behavior before the sequence could be reversed.
682 */
683
684
685 static Int4
ALIGN_EX(Uint1Ptr A,Uint1Ptr B,Int4 M,Int4 N,Int4Ptr S,Int4Ptr pei,Int4Ptr pej,Int4Ptr PNTR sapp,GapAlignBlkPtr gap_align,Int4 query_offset,Boolean reversed,Boolean reverse_sequence)686 ALIGN_EX(Uint1Ptr A, Uint1Ptr B, Int4 M, Int4 N, Int4Ptr S, Int4Ptr pei,
687 Int4Ptr pej, Int4Ptr PNTR sapp, GapAlignBlkPtr gap_align,
688 Int4 query_offset, Boolean reversed, Boolean reverse_sequence)
689
690 {
691 data_t data;
692 Int4 i, j, cb, j_r, s, k;
693 Uint1 st, std, ste;
694 Int4 gap_open, gap_extend, decline_penalty;
695 register Int4 c, d, e, m,t, tt, f, tt_start;
696 Int4 best_score = 0;
697 register Int4Ptr wa;
698 register GapXDPPtr dp, dyn_prog;
699 Uint1Ptr PNTR state, stp, tmp;
700 Uint1Ptr state_array;
701 Int4Ptr *matrix;
702 register Int4 X;
703 GapXDropStateArrayStructPtr state_struct;
704 Int4 next_c;
705 Uint1Ptr Bptr;
706 Int4 B_increment=1;
707 Int4 align_len;
708
709 matrix = gap_align->matrix;
710 *pei = *pej = 0;
711 data.sapp = *sapp = S;
712 data.last= 0;
713 m = gap_align->gap_open + gap_align->gap_extend;
714 decline_penalty = gap_align->decline_align;
715
716 gap_open = gap_align->gap_open;
717 gap_extend = gap_align->gap_extend;
718 X = gap_align->x_parameter;
719
720 if (X < m)
721 X = m;
722
723 if(N <= 0 || M <= 0) {
724 *pei = *pej;
725 return 0;
726 }
727
728 GapXDropPurgeState(gap_align->state_struct);
729
730 j = (N + 2) * sizeof(GapXDP);
731 if (gap_align->dyn_prog)
732 dyn_prog = gap_align->dyn_prog;
733 else
734 dyn_prog = (GapXDPPtr)Nlm_Malloc(j);
735
736 state = Nlm_Malloc(sizeof(Uint1Ptr)*(M+1));
737 dyn_prog[0].CC = 0;
738 dyn_prog[0].FF = -m - decline_penalty;
739 c = dyn_prog[0].DD = -m;
740
741 /* Protection against divide by zero. */
742 if (gap_extend > 0)
743 state_struct = GapXDropGetState(&gap_align->state_struct, X/gap_extend+5);
744 else
745 state_struct = GapXDropGetState(&gap_align->state_struct, N+3);
746
747 state_array = state_struct->state_array;
748 state[0] = state_array;
749 stp = state[0];
750 for(i = 1; i <= N; i++) {
751 if(c < -X) break;
752 dyn_prog[i].CC = c;
753 dyn_prog[i].DD = c - m;
754 dyn_prog[i].FF = c - m - decline_penalty;
755 c -= gap_extend;
756 stp[i] = 1;
757 }
758 state_struct->used = i+1;
759
760 if(reverse_sequence)
761 B_increment=-1;
762 else
763 B_increment=1;
764
765 tt = 0; j = i;
766 for(j_r = 1; j_r <= M; j_r++) {
767 /* Protection against divide by zero. */
768 if (gap_extend > 0)
769 state_struct = GapXDropGetState(&gap_align->state_struct, j-tt+5+X/gap_extend);
770 else
771 state_struct = GapXDropGetState(&gap_align->state_struct, N-tt+3);
772 state_array = state_struct->state_array + state_struct->used + 1;
773 state[j_r] = state_array - tt + 1;
774 stp = state[j_r];
775 tt_start = tt;
776 if (!(gap_align->positionBased)){ /*AAS*/
777 if(reverse_sequence)
778 wa = matrix[A[M-j_r]];
779 else
780 wa = matrix[A[j_r]];
781 }
782 else {
783 if(reversed || reverse_sequence)
784 wa = gap_align->posMatrix[M - j_r];
785 else
786 wa = gap_align->posMatrix[j_r + query_offset];
787 }
788 e = c = f= MININT;
789 Bptr = &B[tt];
790 if(reverse_sequence)
791 Bptr = &B[N-tt];
792
793 for (cb = i = tt, dp = &dyn_prog[i]; i < j; i++) {
794 Int4 next_f;
795 d = (dp)->DD;
796 Bptr += B_increment;
797 next_c = dp->CC+wa[*Bptr]; /* Bptr is & B[i+1]; */
798 next_f = dp->FF;
799 st = 0;
800 if (c < f) {c = f; st = 3;}
801 if (f > d) {d = f; std = 60;}
802 else {
803 std = 30;
804 if (c < d) { c= d;st = 2;}
805 }
806 if (f > e) {e = f; ste = 20;}
807 else {
808 ste = 10;
809 if (c < e) {c=e; st=1;}
810 }
811 if (best_score - c > X){
812 if (tt == i) tt++;
813 else { dp->CC = MININT; }
814 } else {
815 cb = i;
816 if (c > best_score) {
817 best_score = c;
818 *pei = j_r; *pej = i;
819 }
820 if ((c-=m) > (d-=gap_extend)) {
821 dp->DD = c;
822 } else {
823 dp->DD = d;
824 st+=std;
825 }
826 if (c > (e-=gap_extend)) {
827 e = c;
828 } else {
829 st+=ste;
830 }
831 c+=m;
832 if (f < c-gap_open) {
833 dp->FF = c-gap_open-decline_penalty;
834 } else {
835 dp->FF = f-decline_penalty; st+= 5;
836 }
837 dp->CC = c;
838 }
839 stp[i] = st;
840 c = next_c;
841 f = next_f;
842 dp++;
843 }
844 if (tt == j) { j_r++; break;}
845 if (cb < j-1) { j = cb+1;}
846 else {
847 while (e >= best_score-X && j <= N) {
848 dyn_prog[j].CC = e; dyn_prog[j].DD = e-m; dyn_prog[j].FF = e-gap_open-decline_penalty;
849 e -= gap_extend; stp[j] = 1;
850 j++;
851 }
852 }
853 if (j <= N) {
854 dyn_prog[j].DD = dyn_prog[j].CC= dyn_prog[j].FF = MININT;
855 j++;
856 }
857 state_struct->used += (MAX(i, j) - tt_start + 1);
858 }
859 i = *pei; j = *pej;
860 tmp = Nlm_Malloc(i+j);
861 for (s=0, c = 0; i> 0 || j > 0; c++) {
862 t = state[i][j];
863 k = t %5;
864 if (s == 1)
865 if ((t/10)%3 == 1) k = 1;
866 else if ((t/10)%3 == 2) k = 3;
867 if (s == 2)
868 if ((t/30) == 1) k = 2;
869 else if ((t/30) == 2) k = 3;
870 if (s == 3 && ((t/5)%2) == 1) k = 3;
871 if (k == 1) { j--;}
872 else if (k == 2) {i--;}
873 else {j--; i--;}
874 tmp[c] = s = k;
875 }
876
877 align_len = c;
878 c--;
879 while (c >= 0) {
880 if (tmp[c] == 0) REP_
881 else if (tmp[c] == 1) INS_(1)
882 else if (tmp[c] == 3) REPP_
883 else DEL_(1)
884 c--;
885 }
886
887 MemFree(tmp);
888
889 MemFree(state);
890
891 if (!gap_align->dyn_prog)
892 MemFree(dyn_prog);
893
894 if ((align_len -= data.sapp - S) > 0)
895 MemSet(data.sapp, 0, align_len);
896 *sapp = data.sapp;
897
898 return best_score;
899 }
900
901 Int4 LIBCALL
ALIGN(Uint1Ptr A,Uint1Ptr B,Int4 M,Int4 N,Int4Ptr S,Int4Ptr pei,Int4Ptr pej,Int4Ptr PNTR sapp,GapAlignBlkPtr gap_align,Int4 query_offset,Boolean reversed)902 ALIGN(Uint1Ptr A, Uint1Ptr B, Int4 M, Int4 N,
903 Int4Ptr S, Int4Ptr pei, Int4Ptr pej, Int4Ptr PNTR sapp,
904 GapAlignBlkPtr gap_align, Int4 query_offset, Boolean reversed)
905
906 {
907 return ALIGN_EX(A, B, M, N, S, pei, pej, sapp, gap_align, query_offset, reversed, FALSE);
908 }
909
910 /*
911 Boolean revered: has the sequence been reversed? Used for psi-blast
912 Boolean reverse_sequence: do reverse the sequence.
913 Two Booleans are used to emulate behavior before the sequence could be reversed.
914 */
SEMI_G_ALIGN_EX(Uint1Ptr A,Uint1Ptr B,Int4 M,Int4 N,Int4Ptr S,Int4Ptr pei,Int4Ptr pej,Boolean score_only,Int4Ptr PNTR sapp,GapAlignBlkPtr gap_align,Int4 query_offset,Boolean reversed,Boolean reverse_sequence)915 static Int4 SEMI_G_ALIGN_EX(Uint1Ptr A, Uint1Ptr B, Int4 M, Int4 N,
916 Int4Ptr S, Int4Ptr pei, Int4Ptr pej,
917 Boolean score_only, Int4Ptr PNTR sapp, GapAlignBlkPtr gap_align,
918 Int4 query_offset, Boolean reversed, Boolean reverse_sequence)
919
920 {
921 GapXDPPtr dyn_prog;
922 Int4 i, j, cb, j_r, g, decline_penalty;
923 register Int4 c, d, e, m, tt, h, X, f;
924 Int4 best_score = 0;
925 Int4Ptr *matrix;
926 register Int4Ptr wa;
927 register GapXDPPtr dp;
928 Uint1Ptr Bptr;
929 Int4 B_increment=1;
930 Int4 next_c, next_f;
931
932 if(!score_only) {
933 return ALIGN_EX(A, B, M, N, S, pei, pej, sapp, gap_align, query_offset, reversed, reverse_sequence);
934 }
935
936 matrix = gap_align->matrix;
937 *pei = *pej = 0;
938 m = (g=gap_align->gap_open) + gap_align->gap_extend;
939 h = gap_align->gap_extend;
940 decline_penalty = gap_align->decline_align;
941
942 X = gap_align->x_parameter;
943
944 if (X < m)
945 X = m;
946
947 if(N <= 0 || M <= 0) return 0;
948
949 j = (N + 2) * sizeof(GapXDP);
950 dyn_prog = (GapXDPPtr)Nlm_Malloc(j);
951
952 dyn_prog[0].CC = 0; c = dyn_prog[0].DD = -m;
953 dyn_prog[0].FF = -m - decline_penalty;
954 for(i = 1; i <= N; i++) {
955 if(c < -X) break;
956 dyn_prog[i].CC = c;
957 dyn_prog[i].DD = c - m;
958 dyn_prog[i].FF = c-m - decline_penalty;
959 c -= h;
960 }
961
962 if(reverse_sequence)
963 B_increment=-1;
964 else
965 B_increment=1;
966
967 tt = 0; j = i;
968 for (j_r = 1; j_r <= M; j_r++) {
969 if (!(gap_align->positionBased)){ /*AAS*/
970 if(reverse_sequence)
971 wa = matrix[A[M-j_r]];
972 else
973 wa = matrix[A[j_r]];
974 }
975 else {
976 if(reversed || reverse_sequence)
977 {
978 wa = gap_align->posMatrix[M - j_r];
979 if (A[M-j_r] == NULLB) /* Prevents gapping through a NULL byte in rps-blast. */
980 break;
981 }
982 else
983 {
984 wa = gap_align->posMatrix[j_r + query_offset];
985 if (A[j_r] == NULLB)
986 break;
987 }
988 }
989 e = c =f = MININT;
990 Bptr = &B[tt];
991 if(reverse_sequence)
992 Bptr = &B[N-tt];
993 for (cb = i = tt, dp = &dyn_prog[i]; i < j; i++) {
994 d = dp->DD;
995 Bptr += B_increment;
996 next_c = dp->CC+wa[*Bptr]; /* Bptr is & B[i+1]; */
997 next_f = dp->FF;
998
999 if (c < f) c = f;
1000 if (f > d) d = f;
1001 else if (c < d) c= d;
1002
1003 if (f > e) e = f;
1004 else if (c < e) c = e;
1005
1006 if (best_score - c > X){
1007 if (tt == i) tt++;
1008 else { dp->CC = MININT; }
1009 } else {
1010 cb = i;
1011 if (c > best_score) {
1012 best_score = c;
1013 *pei = j_r; *pej = i;
1014 }
1015 if ((c-=m) > (d-=h)) {
1016 dp->DD = c;
1017 } else {
1018 dp->DD = d;
1019 }
1020 if (c > (e-=h)) {
1021 e = c;
1022 }
1023 c+=m;
1024 if (f < c-g) {
1025 dp->FF = c-g-decline_penalty;
1026 } else {
1027 dp->FF = f-decline_penalty;
1028 }
1029 dp->CC = c;
1030 }
1031 c = next_c;
1032 f = next_f;
1033 dp++;
1034 }
1035
1036 if (tt == j) break;
1037 if (cb < j-1) { j = cb+1;}
1038 else while (e >= best_score-X && j <= N) {
1039 dyn_prog[j].CC = e; dyn_prog[j].DD = e-m;dyn_prog[j].FF = e-g-decline_penalty;
1040 e -= h; j++;
1041 }
1042 if (j <= N) {
1043 dyn_prog[j].DD = dyn_prog[j].CC = dyn_prog[j].FF = MININT; j++;
1044 }
1045 }
1046
1047 MemFree(dyn_prog);
1048
1049 return best_score;
1050 }
1051
SEMI_G_ALIGN(Uint1Ptr A,Uint1Ptr B,Int4 M,Int4 N,Int4Ptr S,Int4Ptr pei,Int4Ptr pej,Boolean score_only,Int4Ptr PNTR sapp,GapAlignBlkPtr gap_align,Int4 query_offset,Boolean reversed)1052 Int4 SEMI_G_ALIGN(Uint1Ptr A, Uint1Ptr B, Int4 M, Int4 N,
1053 Int4Ptr S, Int4Ptr pei, Int4Ptr pej,
1054 Boolean score_only, Int4Ptr PNTR sapp, GapAlignBlkPtr gap_align,
1055 Int4 query_offset, Boolean reversed)
1056
1057 {
1058 return SEMI_G_ALIGN_EX(A, B, M, N, S, pei, pej, score_only, sapp, gap_align,
1059 query_offset, reversed, FALSE);
1060 }
1061
OOF_ALIGN(Uint1Ptr A,Uint1Ptr B,Int4 M,Int4 N,Int4Ptr S,Int4Ptr pei,Int4Ptr pej,Int4Ptr PNTR sapp,GapAlignBlkPtr gap_align,Int4 query_offset,Boolean reversed)1062 static Int4 OOF_ALIGN(Uint1Ptr A, Uint1Ptr B, Int4 M, Int4 N,
1063 Int4Ptr S, Int4Ptr pei, Int4Ptr pej, Int4Ptr PNTR sapp,
1064 GapAlignBlkPtr gap_align, Int4 query_offset,
1065 Boolean reversed)
1066
1067 {
1068 data_t data;
1069 Int4 i, j, cb, j_r, s, k, sc, s1, s2, s3, st1, e1, e2, e3, shift;
1070 register Int4 c, d, m,t, tt, tt_start, f1, f2;
1071 Int4 best_score = 0;
1072 register Int4Ptr wa;
1073 Int4 count = 0;
1074 register GapXDPPtr dp;
1075 Uint1Ptr PNTR state, stp, tmp;
1076 Uint1Ptr state_array;
1077 Int4Ptr *matrix;
1078 register Int4 X;
1079 GapXDropStateArrayStructPtr state_struct;
1080
1081 matrix = gap_align->matrix;
1082 *pei =0; *pej = -2;
1083 data.sapp = *sapp = S;
1084 data.last= 0;
1085 m = gap_align->gap_open + gap_align->gap_extend;
1086 data.g = gap_align->gap_open;
1087 data.h = gap_align->gap_extend;
1088 data.v = matrix;
1089 X = gap_align->x_parameter;
1090 shift = gap_align->shift_pen;
1091
1092 if (X < m)
1093 X = m;
1094
1095 if(N <= 0 || M <= 0) {
1096 return 0;
1097 }
1098
1099 N+=2;
1100 GapXDropPurgeState(gap_align->state_struct);
1101
1102 j = (N + 2) * sizeof(GapXDP);
1103 data.CD = (GapXDPPtr)MemNew(j);
1104
1105 state = MemNew(sizeof(Uint1Ptr)*(M+1));
1106 data.CD[0].CC = 0;
1107 c = data.CD[0].DD = -m;
1108 state_struct = GapXDropGetState(&gap_align->state_struct, N+3);
1109 state_array = state_struct->state_array;
1110 state[0] = state_array;
1111 stp = state[0];
1112 data.CD[0].CC = 0; c = data.CD[0].DD = -m;
1113 for(i = 3; i <= N; i+=3) {
1114 data.CD[i].CC = c;
1115 data.CD[i].DD = c - m;
1116 data.CD[i-1].CC = data.CD[i-2].CC = data.CD[i-1].DD =
1117 data.CD[i-2].DD = MININT;
1118 if(c < -X) break;
1119 c -= data.h;
1120 stp[i] = stp[i-1] = stp[i-2] = 6;
1121 }
1122 i -= 2;
1123 data.CD[i].CC = data.CD[i].DD = MININT;
1124 tt = 0; j = i;
1125 state_struct->used = i+1;
1126 B-=2;
1127 tt = 0; j = i;
1128 for(j_r = 1; j_r <= M; j_r++) {
1129 count += j - tt;
1130 state_struct = GapXDropGetState(&gap_align->state_struct, N-tt+3);
1131 state_array = state_struct->state_array + state_struct->used + 1;
1132 state[j_r] = state_array - tt + 1;
1133 stp = state[j_r];
1134 tt_start = tt;
1135 if (!(gap_align->positionBased)) /*AAS*/
1136 wa = matrix[A[j_r]];
1137 else {
1138 if(reversed)
1139 wa = gap_align->posMatrix[M - j_r];
1140 else
1141 wa = gap_align->posMatrix[j_r + query_offset];
1142 }
1143 c = MININT; sc =f1=f2=e1 =e2=e3=s1=s2=s3=MININT;
1144 for(cb = i = tt, dp = &data.CD[i-1]; 1;) {
1145 if (i >= j) break;
1146 sc = MAX(MAX(f1, f2)-shift, s3);
1147 if (sc == s3) st1=3;
1148 else if (sc+shift == f1) {
1149 if (f1 == s2) st1=2; else st1 = 5;
1150 } else if (f2 == s1) st1 = 1; else st1 = 4;
1151 sc += wa[B[i++]];
1152 f1 = s3;
1153 s3 = (++dp)->CC; f1 = MAX(f1, s3);
1154 d = dp->DD;
1155 if (sc < MAX(d, e1)) {
1156 if (d > e1) { sc = d; st1 = 30;}
1157 else {sc = e1; st1 = 36;}
1158 if (best_score -sc > X) {
1159 if (tt == i-1) tt = i;
1160 else dp->CC = MININT;
1161 } else {
1162 cb = i;
1163 dp->CC = sc;
1164 dp->DD = d-data.h;
1165 e1-=data.h;
1166 }
1167 } else {
1168 if (best_score -sc > X) {
1169 if (tt == i-1) tt = i;
1170 else dp->CC = MININT;
1171 } else {
1172 cb = i;
1173 dp->CC = sc;
1174 if (sc > best_score) {best_score = sc; *pei = j_r;*pej=i-3;}
1175 if ((sc-=m) > (e1-=data.h)) e1 = sc; else st1+=10;
1176 if (sc < (d-=data.h)) { dp->DD = d; st1 += 20;}
1177 else dp->DD = sc;
1178 }
1179 }
1180 stp[i-1] = st1;
1181 if (i >= j) {c = e1; e1 = e2; e2 = e3; e3 = c; break;}
1182 sc = MAX(MAX(f1,f2)-shift, s2);
1183 if (sc == s2) st1=3;
1184 else if (sc+shift == f1) {
1185 if (f1 == s3) st1=1; else st1 = 4;
1186 } else if (f2 == s1) st1 = 2; else st1 = 5;
1187 sc += wa[B[i++]];
1188 f2 = s2; s2 = (++dp)->CC; f2 = MAX(f2, s2);
1189 d = dp->DD;
1190 if (sc < MAX(d, e2)) {
1191 if ((sc=MAX(d,e2)) < best_score-X) {
1192 if (tt == i-1) tt = i;
1193 else dp->CC = MININT;
1194 } else {
1195 if (sc == d) st1= 30; else st1=36;
1196 cb = i;
1197 dp->CC = sc;
1198 dp->DD = d-data.h;
1199 e2-=data.h;
1200 }
1201 } else {
1202 if (sc < best_score-X) {
1203 if (tt == i-1) tt = i;
1204 else dp->CC = MININT;
1205 } else {
1206 cb = i;
1207 dp->CC = sc;
1208 if (sc > best_score) {best_score = sc;*pei= j_r; *pej=i-3;}
1209 if ((sc-=m) > (e2-=data.h)) e2 = sc; else st1+=10;
1210 if (sc < (d-=data.h)) {dp->DD = d; st1+=20;}
1211 else dp->DD = sc;
1212 }
1213 }
1214 stp[i-1] = st1;
1215 if (i >= j) { c = e2; e2 = e1; e1 = e3; e3 = c; break; }
1216 sc = MAX(MAX(f1, f2)-shift, s1);
1217 if (sc == s1) st1=3;
1218 else if (sc+shift == f1) {
1219 if (f1 == s3) st1=2; else st1 = 5;
1220 } else if (f2 == s2) st1 = 1; else st1 = 4;
1221 sc += wa[B[i++]];
1222 f1 = f2;
1223 f2 = s1; s1 = (++dp)->CC; f2 = MAX(f2, s1);
1224 d = dp->DD;
1225 if (sc < MAX(d, e3)) {
1226 sc = MAX(d, e3);
1227 if (sc < best_score-X) {
1228 if (tt == i-1) tt = i;
1229 else dp->CC = MININT;
1230 } else {
1231 if (sc == d) st1 = 30; else st1 = 36;
1232 cb = i;
1233 dp->CC = sc;
1234 dp->DD = d-data.h;
1235 e3-=data.h;
1236 }
1237 } else {
1238 if (sc < best_score-X) {
1239 if (tt == i-1) tt = i;
1240 else dp->CC = MININT;
1241 } else {
1242 cb = i;
1243 dp->CC = sc;
1244 if (sc > best_score) {best_score = sc;*pei = j_r; *pej=i-3;}
1245 if ((sc-=m) > (e3-=data.h)) e3 = sc; else st1 += 10;
1246 if (sc < (d-=data.h)) {dp->DD = d; st1 += 20;}
1247 else dp->DD = sc;
1248 }
1249 }
1250 stp[i-1] = st1;
1251 sc = c;
1252 }
1253 if(tt == j) break;
1254 if(cb < j) { j = cb;}
1255 else {
1256 c = (MAX(e1, MAX(e2, e3))+X-best_score)/data.h+j;
1257 if (c > N) c = N;
1258 if (c > j)
1259 while (1) {
1260 data.CD[j].CC = e1;
1261 stp[j] = 36;
1262 data.CD[j++].DD = e1 - m; e1 -=data.h;
1263 if (j > c) break;
1264 data.CD[j].CC = e2; stp[j] = 36;
1265 data.CD[j++].DD = e2- m; e2-=data.h;
1266 if (j > c) break;
1267 data.CD[j].CC = e3; stp[j] = 36;
1268 data.CD[j++].DD = e3- m; e3-=data.h;
1269 if (j > c) break;
1270 }
1271 }
1272 c = j+4;
1273 if (c > N+1) c = N+1;
1274 while (j < c) {
1275 data.CD[j].DD = data.CD[j].CC = MININT;
1276 j++;
1277 }
1278
1279 state_struct->used += (MAX(i, j) - tt_start + 1);
1280 }
1281 i = *pei; j = *pej+2;
1282 /* printf("best = %d i,j=%d %d\n", best_score, i, j); */
1283 tmp = MemNew(i + j);
1284 for (s= 1, c= 0; i > 0 || j > 0; c++, i--) {
1285 k = (t=state[i][j])%10;
1286 if (s == 6 && (t/10)%2 == 1) k = 6;
1287 if (s == 0 && (t/20)== 1) k = 0;
1288 if (k == 6) { j -= 3; i++;}
1289 else {j -= k;}
1290 s = tmp[c] = k;
1291 }
1292 c--;
1293 while(c >= 0) {
1294 *data.sapp++ = tmp[c--];
1295 }
1296
1297 MemFree(tmp);
1298
1299 MemFree(state);
1300
1301 MemFree(data.CD);
1302 *sapp = data.sapp;
1303
1304 return best_score;
1305 }
1306
1307
OOF_SEMI_G_ALIGN(Uint1Ptr A,Uint1Ptr B,Int4 M,Int4 N,Int4Ptr S,Int4Ptr pei,Int4Ptr pej,Boolean score_only,Int4Ptr PNTR sapp,GapAlignBlkPtr gap_align,Int4 query_offset,Boolean reversed)1308 static Int4 OOF_SEMI_G_ALIGN(Uint1Ptr A, Uint1Ptr B, Int4 M, Int4 N,
1309 Int4Ptr S, Int4Ptr pei, Int4Ptr pej,
1310 Boolean score_only, Int4Ptr PNTR sapp,
1311 GapAlignBlkPtr gap_align,
1312 Int4 query_offset, Boolean reversed)
1313 {
1314 GapXDPPtr CD;
1315 Int4 i, j, cb, j_r;
1316 Int4 e1, e2, e3, s1, s2, s3, shift;
1317 register Int4 c, d, sc, m, tt, h, X, f1, f2;
1318 Int4 best_score = 0, count = 0;
1319 Int4Ptr *matrix;
1320 register Int4Ptr wa;
1321 register GapXDPPtr dp;
1322
1323 if(!score_only)
1324 return OOF_ALIGN(A, B, M, N, S, pei, pej, sapp, gap_align, query_offset, reversed);
1325
1326 /*printf("reversed=%d\n", reversed);*/
1327 matrix = gap_align->matrix;
1328 *pei = 0; *pej = -2;
1329 m = gap_align->gap_open + gap_align->gap_extend;
1330 h = gap_align->gap_extend;
1331 X = gap_align->x_parameter;
1332 shift = gap_align->shift_pen;
1333 /*printf("m=%d h=%d X=%d shift=%d %d\n", m,h, X, shift, B[2]);*/
1334
1335 B-=2;
1336 if (X < m)
1337 X = m;
1338
1339 if(N <= 0 || M <= 0) return 0;
1340 N+=2;
1341
1342 j = (N + 5) * sizeof(GapXDP);
1343 CD = (GapXDPPtr)MemNew(j);
1344 CD[0].CC = 0; c = CD[0].DD = -m;
1345 for(i = 3; i <= N; i+=3) {
1346 CD[i].CC = c;
1347 CD[i].DD = c - m;
1348 CD[i-1].CC = CD[i-2].CC = CD[i-1].DD = CD[i-2].DD = MININT;
1349 if(c < -X) break;
1350 c -= h;
1351 }
1352 i -= 2;
1353 CD[i].CC = CD[i].DD = MININT;
1354 tt = 0; j = i;
1355 for (j_r = 1; j_r <= M; j_r++) {
1356 count += j - tt; CD[2].CC = CD[2].DD= MININT;
1357 if (!(gap_align->positionBased)) /*AAS*/
1358 wa = matrix[A[j_r]];
1359 else {
1360 if(reversed)
1361 wa = gap_align->posMatrix[M - j_r];
1362 else
1363 wa = gap_align->posMatrix[j_r + query_offset];
1364 }
1365 s1 = s2 = s3 = f1= f2 = MININT; f1=f2=e1 = e2 = e3 = MININT; sc = MININT;
1366 for(cb = i = tt, dp = &CD[i-1]; 1;) {
1367 if (i >= j) break;
1368 sc = MAX(MAX(f1, f2)-shift, s3)+wa[B[i++]];
1369 f1 = s3;
1370 s3 = (++dp)->CC; f1 = MAX(f1, s3);
1371 d = dp->DD;
1372 if (sc < MAX(d, e1)) {
1373 sc = MAX(d, e1);
1374 if (best_score -sc > X) {
1375 if (tt == i-1) tt = i;
1376 else dp->CC = MININT;
1377 } else {
1378 cb = i;
1379 dp->CC = sc;
1380 dp->DD = d-h;
1381 e1-=h;
1382 }
1383 } else {
1384 if (best_score -sc > X) {
1385 if (tt == i-1) tt = i;
1386 else dp->CC = MININT;
1387 } else {
1388 cb = i;
1389 dp->CC = sc;
1390 if (sc > best_score) {best_score = sc; *pei = j_r;*pej=i-1;}
1391 if ((sc-=m) > (e1-=h)) e1 = sc;
1392 dp->DD = MAX(sc, d-h);
1393 }
1394 }
1395 if (i >= j) {c = e1; e1 = e2; e2 = e3; e3 = c; break;}
1396 sc = MAX(MAX(f1, f2)-shift, s2)+wa[B[i++]];
1397 f2 = s2; s2 = (++dp)->CC; f2 = MAX(f2, s2);
1398 d = dp->DD;
1399 if (sc < MAX(d, e2)) {
1400 if ((c=MAX(d,e2)) < best_score-X) {
1401 if (tt == i-1) tt = i;
1402 else dp->CC = MININT;
1403 } else {
1404 cb = i;
1405 dp->CC = c;
1406 dp->DD = d-h;
1407 e2-=h;
1408 }
1409 } else {
1410 if (sc < best_score-X) {
1411 if (tt == i-1) tt = i;
1412 else dp->CC = MININT;
1413 } else {
1414 cb = i;
1415 dp->CC = sc;
1416 if (sc > best_score) {best_score = sc;*pei= j_r; *pej=i-1;}
1417 if ((sc-=m) > (e2-=h)) e2 = sc;
1418 dp->DD = MAX(sc, d-h);
1419 }
1420 }
1421 if (i >= j) { c = e2; e2 = e1; e1 = e3; e3 = c; break; }
1422 sc = MAX(MAX(f1, f2)-shift, s1)+wa[B[++i]];
1423 f1 = f2;
1424 f2 = s1; s1 = (++dp)->CC; f2 = MAX(f2, s1);
1425 d = dp->DD;
1426 if (sc < MAX(d, e3)) {
1427 sc = MAX(d, e3);
1428 if (sc < best_score-X) {
1429 if (tt == i-1) tt = i;
1430 else dp->CC = MININT;
1431 } else {
1432 cb = i;
1433 dp->CC = sc;
1434 dp->DD = d-h;
1435 e3-=h;
1436 }
1437 } else {
1438 if (sc < best_score-X) {
1439 if (tt == i-1) tt = i;
1440 else dp->CC = MININT;
1441 } else {
1442 cb = i;
1443 dp->CC = sc;
1444 if (sc > best_score) {best_score = sc;*pei = j_r; *pej=i-1;}
1445 if ((sc-=m) > (e3-=h)) e3 = sc;
1446 dp->DD = MAX(sc, d-h);
1447 }
1448 }
1449 sc = c;
1450 }
1451 if(tt == j) break;
1452 if(cb < j) { j = cb;}
1453 else {
1454 c = (MAX(e1, MAX(e2, e3))+X-best_score)/h+j;
1455 if (c > N) c = N;
1456 if (c > j)
1457 while (1) {
1458 CD[j].CC = e1;
1459 CD[j++].DD = e1 - m; e1 -=h;
1460 if (j > c) break;
1461 CD[j].CC = e2;
1462 CD[j++].DD = e2- m; e2-=h;
1463 if (j > c) break;
1464 CD[j].CC = e3;
1465 CD[j++].DD = e3- m; e3-=h;
1466 if (j > c) break;
1467 }
1468 }
1469 c = j+4;
1470 if (c > N+1) c = N+1;
1471 while (j < c) {
1472 CD[j].DD = CD[j].CC = MININT;
1473 j++;
1474 }
1475 }
1476 /*
1477 printf("j_r=%d M=%d pei pej=%d %d\n", j_r, M, *pei, *pej);
1478 */
1479 /*printf("bestscore = %d\n", best_score);*/
1480
1481 MemFree(CD);
1482
1483 return best_score;
1484 }
1485 /*
1486 Allocates an EditScriptPtr and puts it on the end of
1487 the chain of EditScriptPtr's. Returns a pointer to the
1488 new one.
1489
1490 */
1491 GapXEditScriptPtr
GapXEditScriptNew(GapXEditScriptPtr old)1492 GapXEditScriptNew(GapXEditScriptPtr old)
1493
1494 {
1495 GapXEditScriptPtr new;
1496
1497 new = (GapXEditScriptPtr) MemNew(sizeof(GapXEditScript));
1498
1499 if (old == NULL)
1500 return new;
1501
1502 while (old->next != NULL)
1503 {
1504 old = old->next;
1505 }
1506
1507 old->next = new;
1508
1509 return new;
1510 }
1511
1512 GapXEditScriptPtr LIBCALL
GapXEditScriptDelete(GapXEditScriptPtr old)1513 GapXEditScriptDelete(GapXEditScriptPtr old)
1514 {
1515 GapXEditScriptPtr next;
1516
1517 while (old)
1518 {
1519 next = old->next;
1520 old = MemFree(old);
1521 old = next;
1522 }
1523 return old;
1524 }
1525
1526 GapXEditBlockPtr LIBCALL
GapXEditBlockNew(Int4 start1,Int4 start2)1527 GapXEditBlockNew(Int4 start1, Int4 start2)
1528
1529 {
1530 GapXEditBlockPtr edit_block;
1531
1532 edit_block = (GapXEditBlockPtr) MemNew(sizeof(GapXEditBlock));
1533 edit_block->start1 = start1;
1534 edit_block->start2 = start2;
1535
1536 return edit_block;
1537 }
1538
1539 GapXEditBlockPtr LIBCALL
GapXEditBlockDelete(GapXEditBlockPtr edit_block)1540 GapXEditBlockDelete(GapXEditBlockPtr edit_block)
1541
1542 {
1543 if (edit_block == NULL)
1544 return NULL;
1545
1546 edit_block->esp = GapXEditScriptDelete(edit_block->esp);
1547
1548 edit_block = MemFree(edit_block);
1549
1550 return edit_block;
1551 }
1552
1553 /* Alignment display routine */
1554
1555 GapXEditBlockPtr LIBCALL
TracebackToGapXEditBlock(Uint1Ptr A,Uint1Ptr B,Int4 M,Int4 N,Int4Ptr S,Int4 start1,Int4 start2)1556 TracebackToGapXEditBlock(Uint1Ptr A, Uint1Ptr B, Int4 M, Int4 N, Int4Ptr S, Int4 start1, Int4 start2)
1557 {
1558
1559 register Int4 i, j, op, number_of_subs, number_of_decline;
1560 GapXEditScriptPtr e_script;
1561 GapXEditBlockPtr edit_block;
1562
1563 if (S == NULL)
1564 return NULL;
1565
1566 i = j = op = number_of_subs = number_of_decline = 0;
1567
1568 edit_block = GapXEditBlockNew(start1, start2);
1569 edit_block->esp = e_script = GapXEditScriptNew(NULL);
1570
1571 while(i < M || j < N)
1572 {
1573 op = *S;
1574 if (op != MININT && number_of_decline > 0)
1575 {
1576 e_script->op_type = GAPALIGN_DECLINE;
1577 e_script->num = number_of_decline;
1578 e_script = GapXEditScriptNew(e_script);
1579 number_of_decline = 0;
1580 }
1581 if (op != 0 && number_of_subs > 0)
1582 {
1583 e_script->op_type = GAPALIGN_SUB;
1584 e_script->num = number_of_subs;
1585 e_script = GapXEditScriptNew(e_script);
1586 number_of_subs = 0;
1587 }
1588 if (op == 0) {
1589 i++; j++; number_of_subs++;
1590 } else if (op == MININT) {
1591 i++; j++;
1592 number_of_decline++;
1593 }
1594 else
1595 {
1596 if(op > 0)
1597 {
1598 e_script->op_type = GAPALIGN_DEL;
1599 e_script->num = op;
1600 j += op;
1601 if (i < M || j < N)
1602 e_script = GapXEditScriptNew(e_script);
1603 }
1604 else
1605 {
1606 e_script->op_type = GAPALIGN_INS;
1607 e_script->num = ABS(op);
1608 i += ABS(op);
1609 if (i < M || j < N)
1610 e_script = GapXEditScriptNew(e_script);
1611 }
1612 }
1613 S++;
1614 }
1615
1616 if (number_of_subs > 0)
1617 {
1618 e_script->op_type = GAPALIGN_SUB;
1619 e_script->num = number_of_subs;
1620 } else if (number_of_decline > 0) {
1621 e_script->op_type = GAPALIGN_DECLINE;
1622 e_script->num = number_of_decline;
1623 }
1624
1625 return edit_block;
1626 }
1627 /*
1628 Converts a OOF traceback from the gapped alignment to a
1629 GapXEditBlock.
1630
1631 This function is for out-of-frame gapping:
1632
1633 index1 is for the protein, index2 is for the nucleotides.
1634
1635 0: deletion of a dna codon, i.e dash aligned with a protein letter.
1636 1: one nucleotide vs a protein, deletion of 2 nuc.
1637 2: 2 nucleotides aligned with a protein, i.e deletion of one nuc.
1638 3: substitution, 3 nucleotides vs an amino acid.
1639 4: 4 nuc vs an amino acid.
1640 5: 5 nuc vs an amino acid.
1641 6: a codon aligned with a dash. opposite of 0.
1642 */
1643
1644 GapXEditBlockPtr LIBCALL
OOFTracebackToGapXEditBlock(Int4 M,Int4 N,Int4Ptr S,Int4 start1,Int4 start2)1645 OOFTracebackToGapXEditBlock(Int4 M, Int4 N, Int4Ptr S, Int4 start1, Int4 start2)
1646 {
1647 register Int4 current_val, last_val, number, index1, index2;
1648 GapXEditScriptPtr e_script;
1649 GapXEditBlockPtr edit_block;
1650
1651 if (S == NULL)
1652 return NULL;
1653
1654 number = 0;
1655 index1 = 0;
1656 index2 = 0;
1657
1658 edit_block = GapXEditBlockNew(start1, start2);
1659 edit_block->is_ooframe = TRUE;
1660 edit_block->esp = e_script = GapXEditScriptNew(NULL);
1661
1662 last_val = *S;
1663
1664 /* index1, M - index and length of protein,
1665 index2, N - length of the nucleotide */
1666
1667 for(index1 = 0, index2 = 0; index1 < M && index2 < N; S++, number++) {
1668
1669 current_val = *S;
1670 /* New script element will be created for any new frameshift
1671 region 0,3,6 will be collected in a single segment */
1672 if (current_val != last_val || (current_val%3 != 0 &&
1673 edit_block->esp != e_script)) {
1674 e_script->num = number;
1675 e_script = GapXEditScriptNew(e_script);
1676
1677 /* if(last_val%3 != 0 && current_val%3 == 0) */
1678 if(last_val%3 != 0 && current_val == 3)
1679 /* 1, 2, 4, 5 vs. 0, 3, 6*/
1680 number = 1;
1681 else
1682 number = 0;
1683 }
1684
1685 last_val = current_val;
1686
1687 /* for out_of_frame == TRUE - we have op_type == S parameter */
1688 e_script->op_type = current_val;
1689
1690 if(current_val != 6) {
1691 index1++;
1692 index2 += current_val;
1693 } else {
1694 index2 += 3;
1695 }
1696 }
1697 /* Get the last one. */
1698 e_script->num = number;
1699
1700 return edit_block;
1701 }
1702
1703 /*
1704 Destruct Function for GapAlignBlk. If "state" is not NULL, then
1705 it's deallocated.
1706 */
1707 GapAlignBlkPtr LIBCALL
GapAlignBlkDelete(GapAlignBlkPtr gap_align)1708 GapAlignBlkDelete(GapAlignBlkPtr gap_align)
1709
1710 {
1711 if (gap_align == NULL)
1712 return NULL;
1713
1714 gap_align->state_struct = GapXDropStateDestroy(gap_align->state_struct);
1715 /* GapXEditBlockDelete(gap_align->edit_block); */
1716
1717 gap_align->dyn_prog = MemFree(gap_align->dyn_prog);
1718 gap_align = MemFree(gap_align);
1719
1720 return gap_align;
1721 }
1722
1723 /*
1724 Allocates GapAlignBlkPtr and "state", if state_column_length and
1725 state_row_length are not NULL.
1726
1727 For "call and forget" applications, state_column_length and
1728 state_row_length should both be set to zero. ALIGN will
1729 then allocate and deallocate this memory.
1730 */
1731
1732 GapAlignBlkPtr LIBCALL
GapAlignBlkNew(Int4 state_column_length,Int4 state_row_length)1733 GapAlignBlkNew(Int4 state_column_length, Int4 state_row_length)
1734
1735 {
1736 GapAlignBlkPtr gap_align;
1737
1738 gap_align = MemNew(sizeof(GapAlignBlk));
1739 if (gap_align == NULL)
1740 return NULL;
1741
1742 /* gap_align->decline_align = INT2_MAX; */
1743 return gap_align;
1744 }
1745
1746 Boolean LIBCALL
PerformNtGappedAlignment(GapAlignBlkPtr gap_align)1747 PerformNtGappedAlignment(GapAlignBlkPtr gap_align)
1748
1749 {
1750 Boolean found_start, found_end;
1751 Int4 q_length=0, s_length=0, middle_score, score_right, score_left, private_q_start, private_s_start;
1752 Int4 include_query=0, index;
1753 Uint1Ptr query, subject, query_var, subject_var;
1754
1755 if (gap_align == NULL)
1756 return FALSE;
1757
1758 found_start = FALSE;
1759 found_end = FALSE;
1760
1761 query = gap_align->query;
1762 subject = gap_align->subject;
1763 score_left = 0;
1764 if (gap_align->q_start != 0 && gap_align->s_start != 0)
1765 {
1766 found_start = TRUE;
1767 q_length = (gap_align->q_start + 1);
1768 s_length = (gap_align->s_start + 1);
1769 score_left = ALIGN_packed_nucl(query, subject, q_length, s_length, &private_q_start, &private_s_start, gap_align, gap_align->q_start, TRUE);
1770 if (score_left < 0)
1771 return FALSE;
1772 gap_align->query_start = q_length - private_q_start;
1773 gap_align->subject_start = s_length - private_s_start;
1774 }
1775 else
1776 {
1777 q_length = gap_align->q_start;
1778 s_length = gap_align->s_start;
1779 }
1780
1781 middle_score = 0;
1782 query_var = query+gap_align->q_start;
1783 subject_var = subject+gap_align->s_start;
1784 for (index=0; index<include_query; index++)
1785 {
1786 query_var++;
1787 subject_var++;
1788 if (!(gap_align->positionBased)) /*AAS*/
1789 middle_score += gap_align->matrix[*query_var][*subject_var];
1790 else
1791 middle_score += MtrxScoreGapAlign(gap_align,
1792 gap_align->q_start+1 + index,*subject_var);
1793 }
1794
1795 score_right = 0;
1796 if (gap_align->q_start+include_query < gap_align->query_length && gap_align->s_start+include_query < gap_align->subject_length)
1797 {
1798 found_end = TRUE;
1799 score_right = ALIGN_packed_nucl(query+gap_align->q_start+include_query, subject+(gap_align->s_start+include_query)/4, gap_align->query_length-q_length-include_query, gap_align->subject_length-s_length-include_query, &(gap_align->query_stop), &(gap_align->subject_stop), gap_align, gap_align->q_start+include_query, FALSE);
1800 if (score_right < 0)
1801 return FALSE;
1802 gap_align->query_stop += gap_align->q_start+include_query;
1803 gap_align->subject_stop += gap_align->s_start+include_query;
1804 }
1805
1806 if (found_start == FALSE)
1807 { /* Start never found */
1808 gap_align->query_start = gap_align->q_start;
1809 gap_align->subject_start = gap_align->s_start;
1810 }
1811
1812 if (found_end == FALSE)
1813 {
1814 gap_align->query_stop = gap_align->q_start + include_query - 1;
1815 gap_align->subject_stop = gap_align->s_start + include_query - 1;
1816 }
1817
1818 gap_align->score = score_right+score_left+middle_score;
1819
1820 return TRUE;
1821 }
1822
1823 Boolean LIBCALL
PerformGappedAlignment(GapAlignBlkPtr gap_align)1824 PerformGappedAlignment(GapAlignBlkPtr gap_align)
1825
1826 {
1827 Boolean found_start, found_end;
1828 Int4 q_length=0, s_length=0, middle_score, score_right, score_left, private_q_start, private_s_start;
1829 Int4 include_query, index;
1830 Uint1Ptr q_left=NULL, s_left=NULL;
1831 Uint1Ptr query, subject, query_var, subject_var;
1832
1833 if (gap_align == NULL)
1834 return FALSE;
1835
1836 found_start = FALSE;
1837 found_end = FALSE;
1838
1839 query = gap_align->query;
1840 subject = gap_align->subject;
1841 include_query = gap_align->include_query;
1842
1843 /* Looking for "left" score */
1844 score_left = 0;
1845 if (gap_align->q_start != 0 && gap_align->s_start != 0) {
1846 found_start = TRUE;
1847 if(gap_align->is_ooframe) {
1848 q_left = (Uint1Ptr) MemNew((gap_align->q_start+3)*sizeof(Uint1));
1849 s_left = (Uint1Ptr) MemNew((gap_align->s_start+5)*sizeof(Uint1));
1850
1851 q_length = reverse_seq(query,
1852 query+gap_align->q_start-1, q_left+1);
1853 s_length = reverse_seq(subject,
1854 subject+gap_align->s_start-3, s_left+3);
1855
1856 score_left = OOF_SEMI_G_ALIGN(q_left, s_left+2, q_length, s_length, NULL, &private_q_start, &private_s_start, TRUE, NULL, gap_align, gap_align->q_start, TRUE);
1857 q_left = MemFree(q_left);
1858 s_left = MemFree(s_left);
1859 } else {
1860 q_length = (gap_align->q_start+1);
1861 s_length = (gap_align->s_start+1);
1862 score_left = SEMI_G_ALIGN_EX(query, subject, q_length, s_length, NULL, &private_q_start, &private_s_start, TRUE, NULL, gap_align, gap_align->q_start, FALSE, TRUE);
1863 }
1864
1865 gap_align->query_start = q_length - private_q_start;
1866 gap_align->subject_start = s_length - private_s_start;
1867
1868 } else {
1869 q_length = gap_align->q_start;
1870 s_length = gap_align->s_start;
1871 }
1872
1873 /* Looking for "middle" score */
1874 middle_score = 0;
1875 query_var = query+gap_align->q_start;
1876 subject_var = subject+gap_align->s_start;
1877 for (index=0; index<include_query; index++) {
1878 query_var++;
1879 subject_var++;
1880 if (!(gap_align->positionBased)) /*AAS*/
1881 middle_score += gap_align->matrix[*query_var][*subject_var];
1882 else
1883 middle_score += MtrxScoreGapAlign(gap_align,
1884 gap_align->q_start+1 + index,*subject_var);
1885 }
1886
1887 score_right = 0;
1888 if (gap_align->q_start+include_query < gap_align->query_length && gap_align->s_start+include_query < gap_align->subject_length) {
1889 found_end = TRUE;
1890 if(gap_align->is_ooframe) {
1891 score_right = OOF_SEMI_G_ALIGN(query+gap_align->q_start+include_query-1, subject+gap_align->s_start+include_query-1, gap_align->query_length-q_length-include_query, gap_align->subject_length-s_length-include_query, NULL, &(gap_align->query_stop), &(gap_align->subject_stop), TRUE, NULL, gap_align, gap_align->q_start+include_query, FALSE);
1892 } else {
1893 score_right = SEMI_G_ALIGN_EX(query+gap_align->q_start+include_query, subject+gap_align->s_start+include_query, gap_align->query_length-q_length-include_query, gap_align->subject_length-s_length-include_query, NULL, &(gap_align->query_stop), &(gap_align->subject_stop), TRUE, NULL, gap_align, gap_align->q_start+include_query, FALSE, FALSE);
1894 }
1895
1896 gap_align->query_stop += gap_align->q_start+include_query;
1897 gap_align->subject_stop += gap_align->s_start+include_query;
1898 }
1899
1900 if (found_start == FALSE) { /* Start never found */
1901 gap_align->query_start = gap_align->q_start;
1902 gap_align->subject_start = gap_align->s_start;
1903 }
1904
1905 if (found_end == FALSE) {
1906 gap_align->query_stop = gap_align->q_start + include_query - 1;
1907 gap_align->subject_stop = gap_align->s_start + include_query - 1;
1908
1909 if(gap_align->is_ooframe) { /* Do we really need this ??? */
1910 gap_align->query_stop++;
1911 gap_align->subject_stop++;
1912 }
1913 }
1914
1915 gap_align->score = score_right+score_left+middle_score;
1916
1917 return TRUE;
1918 }
1919
1920 /*
1921 Perform a gapped alignment and return the score. A SeqAlignPtr with
1922 the traceback is also returned.
1923 */
1924 Boolean LIBCALL
PerformGappedAlignmentWithTraceback(GapAlignBlkPtr gap_align)1925 PerformGappedAlignmentWithTraceback(GapAlignBlkPtr gap_align)
1926 {
1927 Boolean found_start, found_end;
1928 Int4 q_length=0, s_length=0, score_right, middle_score, score_left, private_q_length, private_s_length, tmp;
1929 Int4 include_query, index, prev;
1930 Int4Ptr tback, tback1, p, q;
1931 Uint1Ptr q_left=NULL, s_left=NULL;
1932 Uint1Ptr query, subject, query_var, subject_var;
1933
1934 if (gap_align == NULL)
1935 return FALSE;
1936
1937 found_start = FALSE;
1938 found_end = FALSE;
1939
1940 query = gap_align->query;
1941 subject = gap_align->subject;
1942
1943 /*
1944 tback = tback1 = MemNew((gap_align->subject_length+gap_align->query_length)*sizeof(Int4));
1945 */
1946 tback = tback1 =
1947 Nlm_Malloc((gap_align->subject_length+gap_align->query_length)*sizeof(Int4));
1948 include_query = gap_align->include_query;
1949
1950 gap_align->tback = tback;
1951
1952 /* Note: the starting point [q_start,s_start] is included in the left
1953 extension, but not in the right extension */
1954 score_left = 0; prev = 3;
1955 found_start = TRUE;
1956
1957 if(gap_align->is_ooframe) {
1958 q_left = (Uint1Ptr) MemNew((gap_align->q_start+3)*sizeof(Uint1));
1959 s_left = (Uint1Ptr) MemNew((gap_align->s_start+5)*sizeof(Uint1));
1960
1961 q_length = reverse_seq(query,
1962 query+gap_align->q_start-1, q_left+1);
1963 if (gap_align->s_start >= 3) {
1964 s_length = reverse_seq(subject,
1965 subject+gap_align->s_start-3, s_left+3);
1966 } else {
1967 s_length = gap_align->s_start - 2;
1968 }
1969 score_left = OOF_SEMI_G_ALIGN(q_left, s_left+2, q_length, s_length, tback, &private_q_length, &private_s_length, FALSE, &tback1, gap_align, gap_align->q_start, TRUE);
1970
1971 q_left = MemFree(q_left);
1972 s_left = MemFree(s_left);
1973 } else {
1974 q_length = (gap_align->q_start+1);
1975 s_length = (gap_align->s_start+1);
1976
1977 score_left = SEMI_G_ALIGN_EX(query, subject, q_length, s_length, tback, &private_q_length, &private_s_length, FALSE, &tback1, gap_align, gap_align->q_start, FALSE, TRUE);
1978 }
1979
1980 for(p = tback, q = tback1 - 1; p < q; p++, q--) {
1981 tmp = *p;
1982 *p = *q;
1983 *q = tmp;
1984 }
1985
1986 if(gap_align->is_ooframe){
1987 for (prev = 3, p = tback; p < tback1; p++) {
1988 if (*p == 0 || *p == 6) continue;
1989 tmp = *p; *p = prev; prev = tmp;
1990 }
1991 }
1992 gap_align->query_start = q_length - private_q_length;
1993 gap_align->subject_start = s_length - private_s_length;
1994
1995 middle_score = 0;
1996 query_var = query+gap_align->q_start;
1997 subject_var = subject+gap_align->s_start;
1998 for (index=0; index<include_query; index++) {
1999 query_var++;
2000 subject_var++;
2001 if (!(gap_align->positionBased)) /*AAS*/
2002 middle_score += gap_align->matrix[*query_var][*subject_var];
2003 else
2004 middle_score += MtrxScoreGapAlign(gap_align,
2005 gap_align->q_start+1 + index,*subject_var);
2006 *tback1 = 0;
2007 tback1++;
2008 }
2009
2010 score_right = 0;
2011 if ((gap_align->q_start+include_query) < gap_align->query_length && (gap_align->s_start+include_query) < gap_align->subject_length) {
2012 found_end = TRUE;
2013 if(gap_align->is_ooframe){
2014 score_right = OOF_SEMI_G_ALIGN(query+gap_align->q_start+include_query-1, subject+gap_align->s_start+include_query-1, gap_align->query_length-q_length-include_query, gap_align->subject_length-s_length-include_query, tback1, &private_q_length, &private_s_length, FALSE, &tback1, gap_align, gap_align->q_start+include_query, FALSE);
2015 if (prev != 3) {
2016 while (*p == 0 || *p == 6) p++;
2017 *p = prev+*p-3;
2018 }
2019 } else {
2020 score_right = SEMI_G_ALIGN_EX(query+gap_align->q_start+include_query, subject+gap_align->s_start+include_query, gap_align->query_length-q_length-include_query, gap_align->subject_length-s_length-include_query, tback1, &private_q_length, &private_s_length, FALSE, &tback1, gap_align, gap_align->q_start+include_query, FALSE, FALSE);
2021 }
2022
2023 gap_align->query_stop = gap_align->q_start + private_q_length+include_query;
2024 gap_align->subject_stop = gap_align->s_start + private_s_length+include_query;
2025 }
2026
2027 if (found_start == FALSE) { /* Start never found */
2028 gap_align->query_start = gap_align->q_start;
2029 gap_align->subject_start = gap_align->s_start;
2030 }
2031
2032 if (found_end == FALSE) {
2033 gap_align->query_stop = gap_align->q_start + include_query - 1;
2034 gap_align->subject_stop = gap_align->s_start + include_query - 1;
2035
2036 if(FALSE && gap_align->is_ooframe){ /* Do we really need this ??? */
2037 gap_align->query_stop++;
2038 gap_align->subject_stop++;
2039 }
2040 }
2041
2042 if(gap_align->is_ooframe) {
2043 gap_align->edit_block = OOFTracebackToGapXEditBlock(gap_align->query_stop-gap_align->query_start+1, gap_align->subject_stop-gap_align->subject_start+1, tback, gap_align->query_start, gap_align->subject_start);
2044 } else {
2045 gap_align->edit_block = TracebackToGapXEditBlock(query, subject, gap_align->query_stop-gap_align->query_start+1, gap_align->subject_stop-gap_align->subject_start+1, tback, gap_align->query_start, gap_align->subject_start);
2046 }
2047
2048 gap_align->edit_block->frame1 = gap_align->query_frame;
2049 gap_align->edit_block->frame2 = gap_align->subject_frame;
2050 gap_align->edit_block->length1 = gap_align->query_length;
2051 gap_align->edit_block->length2 = gap_align->subject_length;
2052 gap_align->edit_block->translate1 = gap_align->translate1;
2053 gap_align->edit_block->translate2 = gap_align->translate2;
2054 gap_align->edit_block->discontinuous = gap_align->discontinuous;
2055
2056 #ifdef OOF_DEBUG_PRINTOUT
2057 {{
2058 CharPtr pro_seq, dnap_seq;
2059 SeqMapTablePtr smtp;
2060 Int4 i;
2061
2062 pro_seq = MemNew(gap_align->query_length+1);
2063 dnap_seq = MemNew(gap_align->subject_length+3*CODON_LENGTH);
2064
2065 smtp = SeqMapTableFindObj(Seq_code_ncbieaa, Seq_code_ncbistdaa);
2066
2067 for(i = 0; i < gap_align->query_length; i++)
2068 pro_seq[i] = SeqMapTableConvert(smtp, query[i]);
2069
2070 for(i = 0; i < gap_align->subject_length; i++)
2071 dnap_seq[i] = SeqMapTableConvert(smtp, subject[i]);
2072
2073 /* for(i =0; i <gap_align->subject_length+
2074 gap_align->query_length; i++)
2075 printf("%d ", tback[i]); */
2076
2077 OOFDisplayTraceBack1(tback, dnap_seq, pro_seq,
2078 gap_align->subject_stop-1,
2079 gap_align->query_stop-1,
2080 gap_align->query_start,
2081 gap_align->subject_start);
2082 MemFree(dnap_seq);
2083 MemFree(pro_seq);
2084 }}
2085 #endif
2086
2087 tback = MemFree(tback);
2088 gap_align->tback = NULL;
2089
2090 gap_align->score = score_right+score_left+middle_score;
2091
2092 return TRUE;
2093 }
2094
2095 /*
2096 Get the current position.
2097 */
2098
get_current_pos(Int4Ptr pos,Int4 length)2099 static Int4 get_current_pos(Int4Ptr pos, Int4 length)
2100 {
2101 Int4 val;
2102 if(*pos < 0)
2103 val = -(*pos + length -1);
2104 else
2105 val = *pos;
2106 *pos += length;
2107 return val;
2108 }
2109
GXEMakeSeqAlign(SeqIdPtr query_id,SeqIdPtr subject_id,Boolean reverse,Boolean translate1,Boolean translate2,Int4 numseg,Int4Ptr length,Int4Ptr start,Uint1Ptr strands)2110 static SeqAlignPtr GXEMakeSeqAlign(SeqIdPtr query_id, SeqIdPtr subject_id,
2111 Boolean reverse, Boolean translate1,
2112 Boolean translate2, Int4 numseg,
2113 Int4Ptr length, Int4Ptr start,
2114 Uint1Ptr strands)
2115 {
2116 SeqAlignPtr sap;
2117 DenseSegPtr dsp;
2118 StdSegPtr sseg, sseg_head, sseg_old;
2119 SeqLocPtr slp, slp1, slp2;
2120 SeqIntPtr seq_int1;
2121 Int4 index;
2122 Boolean tmp_value;
2123
2124 sap = SeqAlignNew();
2125
2126 sap->dim =2; /**only two dimention alignment**/
2127
2128 /**make the Denseg Object for SeqAlign**/
2129 if (translate1 == FALSE && translate2 == FALSE) {
2130 sap->segtype = SAS_DENSEG; /** use denseg to store the alignment **/
2131 sap->type = SAT_PARTIAL; /**partial for gapped translating search.*/
2132 dsp = DenseSegNew();
2133 dsp->dim = 2;
2134 dsp->numseg = numseg;
2135 if (reverse) {
2136 dsp->ids = SeqIdDup(subject_id);
2137 dsp->ids->next = SeqIdDup(query_id);
2138 } else {
2139 dsp->ids = SeqIdDup(query_id);
2140 dsp->ids->next = SeqIdDup(subject_id);
2141 }
2142 dsp->starts = start;
2143 dsp->strands = strands;
2144 dsp->lens = length;
2145 sap->segs = dsp;
2146 sap->next = NULL;
2147 } else { /****/
2148 sap->type = SAT_PARTIAL; /**partial for gapped translating search. */
2149 sap->segtype = SAS_STD; /**use stdseg to store the alignment**/
2150 sseg_head = NULL;
2151 sseg_old = NULL;
2152
2153 if(reverse) { /* Reverting translation flags */
2154 tmp_value = translate1;
2155 translate1 = translate2;
2156 translate2 = tmp_value;
2157 }
2158
2159 for (index=0; index<numseg; index++) {
2160 sseg = StdSegNew();
2161 sseg->dim = 2;
2162 if (sseg_head == NULL) {
2163 sseg_head = sseg;
2164 }
2165 if (reverse) {
2166 sseg->ids = SeqIdDup(subject_id);
2167 sseg->ids->next = SeqIdDup(query_id);
2168 } else {
2169 sseg->ids = SeqIdDup(query_id);
2170 sseg->ids->next = SeqIdDup(subject_id);
2171 }
2172
2173 slp1 = NULL;
2174 if (start[2*index] != -1) {
2175 seq_int1 = SeqIntNew();
2176 seq_int1->from = start[2*index];
2177 if (translate1)
2178 seq_int1->to = start[2*index] + CODON_LENGTH*length[index] - 1;
2179 else
2180 seq_int1->to = start[2*index] + length[index] - 1;
2181 seq_int1->strand = strands[2*index];
2182
2183 if(reverse) {
2184 seq_int1->id = SeqIdDup(subject_id);
2185 } else {
2186 seq_int1->id = SeqIdDup(query_id);
2187 }
2188
2189 ValNodeAddPointer(&slp1, SEQLOC_INT, seq_int1);
2190 } else {
2191 if(reverse) {
2192 ValNodeAddPointer(&slp1, SEQLOC_EMPTY, SeqIdDup(subject_id));
2193 } else {
2194 ValNodeAddPointer(&slp1, SEQLOC_EMPTY, SeqIdDup(query_id));
2195 }
2196 }
2197 slp2 = NULL;
2198 if (start[2*index+1] != -1) {
2199 seq_int1 = SeqIntNew();
2200 seq_int1->from = start[2*index+1];
2201 if (translate2)
2202 seq_int1->to = start[2*index+1] + CODON_LENGTH*length[index] - 1;
2203 else
2204 seq_int1->to = start[2*index+1] + length[index] - 1;
2205 seq_int1->strand = strands[2*index+1];
2206
2207 if(reverse) {
2208 seq_int1->id = SeqIdDup(query_id);
2209 } else {
2210 seq_int1->id = SeqIdDup(subject_id);
2211 }
2212
2213 ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int1);
2214 } else {
2215 if(reverse) {
2216 ValNodeAddPointer(&slp2, SEQLOC_EMPTY, SeqIdDup(query_id));
2217 } else {
2218 ValNodeAddPointer(&slp2, SEQLOC_EMPTY, SeqIdDup(subject_id));
2219 }
2220 }
2221 /*
2222 if (reverse) {
2223 slp = slp2;
2224 slp2->next = slp1;
2225 } else {
2226 slp = slp1;
2227 slp1->next = slp2;
2228 } */
2229
2230 slp = slp1;
2231 slp1->next = slp2;
2232
2233 sseg->loc = slp;
2234
2235 if (sseg_old)
2236 sseg_old->next = sseg;
2237 sseg_old = sseg;
2238 }
2239 sap->segs = sseg_head;
2240 sap->next = NULL;
2241
2242 MemFree(start);
2243 MemFree(length);
2244 MemFree(strands);
2245 }
2246
2247 return sap;
2248 }
2249
GXECollectDataForSeqalign(GapXEditBlockPtr edit_block,GapXEditScriptPtr curr_in,Int4 numseg,Int4Ptr PNTR start_out,Int4Ptr PNTR length_out,Uint1Ptr PNTR strands_out,Int4Ptr start1,Int4Ptr start2)2250 Boolean GXECollectDataForSeqalign(GapXEditBlockPtr edit_block,
2251 GapXEditScriptPtr curr_in, Int4 numseg,
2252 Int4Ptr PNTR start_out,
2253 Int4Ptr PNTR length_out,
2254 Uint1Ptr PNTR strands_out,
2255 Int4Ptr start1, Int4Ptr start2)
2256 {
2257 GapXEditScriptPtr curr;
2258 Boolean reverse, translate1, translate2;
2259 Int2 frame1, frame2;
2260 Int4 begin1, begin2, index, length1, length2;
2261 Int4 original_length1, original_length2, i;
2262 Int4Ptr length, start;
2263 Uint1 strand1, strand2;
2264 Uint1Ptr strands;
2265
2266 reverse = edit_block->reverse;
2267 length1 = edit_block->length1;
2268 length2 = edit_block->length2;
2269 original_length1 = edit_block->original_length1;
2270 original_length2 = edit_block->original_length2;
2271 translate1 = edit_block->translate1;
2272 translate2 = edit_block->translate2;
2273 frame1 = edit_block->frame1;
2274 frame2 = edit_block->frame2;
2275
2276 if (frame1 > 0)
2277 strand1 = Seq_strand_plus;
2278 else if (frame1 < 0)
2279 strand1 = Seq_strand_minus;
2280 else
2281 strand1 = Seq_strand_unknown;
2282
2283 if (frame2 > 0)
2284 strand2 = Seq_strand_plus;
2285 else if (frame2 < 0)
2286 strand2 = Seq_strand_minus;
2287 else
2288 strand2 = Seq_strand_unknown;
2289
2290 start = MemNew((2*numseg+1)*sizeof(Int4));
2291 *start_out = start;
2292 length = MemNew((numseg+1)*sizeof(Int4));
2293 *length_out = length;
2294 strands = MemNew((2*numseg+1)*sizeof(Uint1));
2295 *strands_out = strands;
2296
2297 index=0;
2298 for (i = 0, curr=curr_in; curr && i < numseg; curr=curr->next, i++) {
2299 switch(curr->op_type) {
2300 case GAPALIGN_DECLINE:
2301 case GAPALIGN_SUB:
2302 if (strand1 != Seq_strand_minus) {
2303 if(translate1 == FALSE)
2304 begin1 = get_current_pos(start1, curr->num);
2305 else
2306 begin1 = frame1 - 1 + CODON_LENGTH*get_current_pos(start1, curr->num);
2307 } else {
2308 if(translate1 == FALSE)
2309 begin1 = length1 - get_current_pos(start1, curr->num) - curr->num;
2310 else
2311 begin1 = original_length1 - CODON_LENGTH*(get_current_pos(start1, curr->num)+curr->num) + frame1 + 1;
2312 }
2313
2314 if (strand2 != Seq_strand_minus) {
2315 if(translate2 == FALSE)
2316 begin2 = get_current_pos(start2, curr->num);
2317 else
2318 begin2 = frame2 - 1 + CODON_LENGTH*get_current_pos(start2, curr->num);
2319 } else {
2320 if(translate2 == FALSE)
2321 begin2 = length2 - get_current_pos(start2, curr->num) - curr->num;
2322 else
2323 begin2 = original_length2 - CODON_LENGTH*(get_current_pos(start2, curr->num)+curr->num) + frame2 + 1;
2324 }
2325
2326 if (reverse) {
2327 strands[2*index] = strand2;
2328 strands[2*index+1] = strand1;
2329 start[2*index] = begin2;
2330 start[2*index+1] = begin1;
2331 } else {
2332 strands[2*index] = strand1;
2333 strands[2*index+1] = strand2;
2334 start[2*index] = begin1;
2335 start[2*index+1] = begin2;
2336 }
2337
2338 break;
2339
2340 case GAPALIGN_DEL:
2341 begin1 = -1;
2342 if (strand2 != Seq_strand_minus) {
2343 if(translate2 == FALSE)
2344 begin2 = get_current_pos(start2, curr->num);
2345 else
2346 begin2 = frame2 - 1 + CODON_LENGTH*get_current_pos(start2, curr->num);
2347 } else {
2348 if(translate2 == FALSE)
2349 begin2 = length2 - get_current_pos(start2, curr->num) - curr->num;
2350 else
2351 begin2 = original_length2 - CODON_LENGTH*(get_current_pos(start2, curr->num)+curr->num) + frame2 + 1;
2352 }
2353
2354 if (reverse) {
2355 strands[2*index] = strand2;
2356 if (index > 0)
2357 strands[2*index+1] = strands[2*(index-1)+1];
2358 else
2359 strands[2*index+1] = Seq_strand_unknown;
2360 start[2*index] = begin2;
2361 start[2*index+1] = begin1;
2362 } else {
2363 if (index > 0)
2364 strands[2*index] = strands[2*(index-1)];
2365 else
2366 strands[2*index] = Seq_strand_unknown;
2367 strands[2*index+1] = strand2;
2368 start[2*index] = begin1;
2369 start[2*index+1] = begin2;
2370 }
2371
2372 break;
2373
2374 case GAPALIGN_INS:
2375 if (strand1 != Seq_strand_minus) {
2376 if(translate1 == FALSE)
2377 begin1 = get_current_pos(start1, curr->num);
2378 else
2379 begin1 = frame1 - 1 + CODON_LENGTH*get_current_pos(start1, curr->num);
2380 } else {
2381 if(translate1 == FALSE)
2382 begin1 = length1 - get_current_pos(start1, curr->num) - curr->num;
2383 else
2384 begin1 = original_length1 - CODON_LENGTH*(get_current_pos(start1, curr->num)+curr->num) + frame1 + 1;
2385 }
2386 begin2 = -1;
2387 if (reverse) {
2388 if (index > 0)
2389 strands[2*index] = strands[2*(index-1)];
2390 else
2391 strands[2*index] = Seq_strand_unknown;
2392 strands[2*index+1] = strand1;
2393 start[2*index] = begin2;
2394 start[2*index+1] = begin1;
2395 } else {
2396 strands[2*index] = strand1;
2397 if (index > 0)
2398 strands[2*index+1] = strands[2*(index-1)+1];
2399 else
2400 strands[2*index+1] = Seq_strand_unknown;
2401 start[2*index] = begin1;
2402 start[2*index+1] = begin2;
2403 }
2404
2405 break;
2406 default:
2407 break;
2408 }
2409 length[index] = curr->num;
2410 index++;
2411 }
2412
2413 return TRUE;
2414 }
2415
GXECorrectUASequence(GapXEditBlockPtr edit_block)2416 static void GXECorrectUASequence(GapXEditBlockPtr edit_block)
2417 {
2418 GapXEditScriptPtr curr, curr_last, curr_last2;
2419 Boolean last_indel;
2420
2421 last_indel = FALSE;
2422 curr_last = NULL;
2423 curr_last2 = NULL;
2424
2425 for (curr=edit_block->esp; curr; curr = curr->next) {
2426
2427 if(curr->op_type == GAPALIGN_DECLINE && last_indel == TRUE) {
2428 /* This is invalid condition and regions should be
2429 exchanged */
2430
2431 if(curr_last2 != NULL)
2432 curr_last2->next = curr;
2433 else
2434 edit_block->esp = curr; /* First element in a list */
2435
2436 curr_last->next = curr->next;
2437 curr->next = curr_last;
2438 }
2439
2440 last_indel = FALSE;
2441
2442 if(curr->op_type == GAPALIGN_INS || curr->op_type == GAPALIGN_DEL) {
2443 last_indel = TRUE;
2444 curr_last2 = curr_last;
2445 }
2446
2447 curr_last = curr;
2448 }
2449 return;
2450 }
2451
2452 /*
2453 Convert an EditScript chain to a SeqAlign of type DenseSeg.
2454 Used for a non-simple interval (i.e., one without subs. or
2455 deletions).
2456
2457 The first SeqIdPtr in the chain of subject_id and query_id is duplicated for
2458 the SeqAlign.
2459 */
2460
2461 SeqAlignPtr LIBCALL
GapXEditBlockToSeqAlign(GapXEditBlockPtr edit_block,SeqIdPtr subject_id,SeqIdPtr query_id)2462 GapXEditBlockToSeqAlign(GapXEditBlockPtr edit_block, SeqIdPtr subject_id, SeqIdPtr query_id)
2463
2464 {
2465 GapXEditScriptPtr curr, curr_last;
2466 Int4 numseg, start1, start2;
2467 Int4Ptr length, start;
2468 Uint1Ptr strands;
2469 Boolean is_disc_align = FALSE;
2470 SeqAlignPtr sap, sap_disc, sap_head, sap_tail;
2471 Boolean skip_region = FALSE;
2472
2473 is_disc_align = FALSE; numseg = 0;
2474
2475 for (curr=edit_block->esp; curr; curr = curr->next) {
2476 numseg++;
2477 if(/*edit_block->discontinuous && */curr->op_type == GAPALIGN_DECLINE)
2478 is_disc_align = TRUE;
2479 }
2480
2481 start1 = edit_block->start1;
2482 start2 = edit_block->start2;
2483
2484 /* If no GAPALIGN_DECLINE regions exists output seqalign will be
2485 regular Den-Seg or Std-seg */
2486 if(is_disc_align == FALSE) {
2487 /* Please note, that edit_block passed only for data like
2488 strand, translate, reverse etc. Real data is taken starting
2489 from "curr" and taken only "numseg" segments */
2490
2491 GXECollectDataForSeqalign(edit_block, edit_block->esp, numseg,
2492 &start, &length, &strands, &start1, &start2);
2493
2494 /* Result of this function will be either den-seg or Std-seg
2495 depending on translation options */
2496 sap = GXEMakeSeqAlign(query_id, subject_id, edit_block->reverse,
2497 edit_block->translate1, edit_block->translate2,
2498 numseg, length, start, strands);
2499 } else {
2500
2501 /* By request of Steven Altschul - we need to have
2502 the unaligned part being to the left if it is adjacent to the
2503 gap (insertion or deletion) - so this function will do
2504 shaffeling */
2505
2506 GXECorrectUASequence(edit_block);
2507
2508 sap_disc = SeqAlignNew();
2509 sap_disc->dim = 2;
2510 sap_disc->type = SAT_PARTIAL; /* ordered segments, over part of seq */
2511 sap_disc->segtype = SAS_DISC; /* discontinuous alignment */
2512
2513 curr_last = edit_block->esp;
2514 sap_head = NULL; sap_tail = NULL;
2515 while(curr_last != NULL) {
2516 skip_region = FALSE;
2517 for (numseg = 0, curr = curr_last;
2518 curr; curr = curr->next, numseg++) {
2519
2520 if(curr->op_type == GAPALIGN_DECLINE) {
2521 if(numseg != 0) { /* End of aligned area */
2522 break;
2523 } else {
2524 while(curr && curr->op_type == GAPALIGN_DECLINE) {
2525 numseg++;
2526 curr = curr->next;
2527 }
2528 skip_region = TRUE;
2529 break;
2530 }
2531 }
2532 }
2533
2534 GXECollectDataForSeqalign(edit_block, curr_last, numseg,
2535 &start, &length, &strands,
2536 &start1, &start2);
2537
2538 if(!skip_region) {
2539 sap = GXEMakeSeqAlign(query_id, subject_id,
2540 edit_block->reverse,
2541 edit_block->translate1,
2542 edit_block->translate2,
2543 numseg, length, start, strands);
2544
2545 /* Collecting all seqaligns into single linked list */
2546 if(sap_tail == NULL) {
2547 sap_head = sap_tail = sap;
2548 } else {
2549 sap_tail->next = sap;
2550 sap_tail = sap;
2551 }
2552 }
2553
2554 curr_last = curr;
2555 }
2556 sap_disc->segs = sap_head;
2557 sap = sap_disc;
2558 }
2559
2560 return sap;
2561 }
2562
2563 /*
2564 This function is used for Out-Of-Frame traceback conversion
2565 Convert an OOF EditScript chain to a SeqAlign of type StdSeg.
2566 Used for a non-simple interval (i.e., one without subs. or
2567 deletions).
2568
2569 The first SeqIdPtr in the chain of subject_id and query_id is duplicated for
2570 the SeqAlign.
2571 */
OOFGapXEditBlockToSeqAlign(GapXEditBlockPtr edit_block,SeqIdPtr subject_id,SeqIdPtr query_id,Int4 query_length)2572 SeqAlignPtr LIBCALL OOFGapXEditBlockToSeqAlign(GapXEditBlockPtr edit_block, SeqIdPtr subject_id, SeqIdPtr query_id, Int4 query_length)
2573 {
2574 Boolean reverse;
2575 GapXEditScriptPtr curr, esp;
2576 Int2 frame1, frame2;
2577 Int4 start1, start2;
2578 Int4 original_length1, original_length2;
2579 SeqAlignPtr sap;
2580 SeqIntPtr seq_int1, seq_int2;
2581 SeqIntPtr seq_int1_last = NULL, seq_int2_last = NULL;
2582 SeqIdPtr sip;
2583 SeqLocPtr slp, slp1, slp2;
2584 StdSegPtr sseg, sseg_head, sseg_old;
2585 Uint1 strand1, strand2;
2586 Boolean first_shift;
2587
2588 reverse = edit_block->reverse;
2589
2590
2591 start1 = edit_block->start1;
2592 start2 = edit_block->start2;
2593 frame1 = edit_block->frame1;
2594 frame2 = edit_block->frame2;
2595
2596 original_length1 = edit_block->original_length2; /* Protein */
2597 original_length2 = edit_block->original_length1; /* DNA */
2598
2599 /* printf("%d %d %d %d\n", start1, start2, original_length1,
2600 original_length2); */
2601
2602 if(frame1 > 0)
2603 strand1 = Seq_strand_plus;
2604 else if (frame1 < 0)
2605 strand1 = Seq_strand_minus;
2606 else
2607 strand1 = Seq_strand_unknown;
2608
2609 if(frame2 > 0)
2610 strand2 = Seq_strand_plus;
2611 else if (frame2 < 0)
2612 strand2 = Seq_strand_minus;
2613 else
2614 strand2 = Seq_strand_unknown;
2615
2616 esp = edit_block->esp;
2617
2618 sap = SeqAlignNew();
2619
2620 sap->dim =2; /**only two dimention alignment**/
2621
2622 sap->type =3; /**partial for gapped translating search. */
2623 sap->segtype =3; /**use denseg to store the alignment**/
2624 sseg_head = NULL;
2625 sseg_old = NULL;
2626 esp = edit_block->esp;
2627
2628 /* query_length--; */
2629
2630 first_shift = FALSE;
2631
2632 for (curr=esp; curr; curr=curr->next) {
2633
2634 slp1 = NULL;
2635 slp2 = NULL;
2636
2637 switch (curr->op_type) {
2638 case 0: /* deletion of three nucleotides. */
2639
2640 first_shift = FALSE;
2641
2642 seq_int1 = SeqIntNew();
2643 seq_int1->from = get_current_pos(&start1, curr->num);
2644 seq_int1->to = start1 - 1;
2645
2646 if(seq_int1->to >= original_length1)
2647 seq_int1->to = original_length1-1;
2648
2649 seq_int1->id = SeqIdDup(subject_id);
2650 seq_int1->strand = strand1;
2651
2652 ValNodeAddPointer(&slp1, SEQLOC_INT, seq_int1);
2653
2654 /* Empty nucleotide piece */
2655 ValNodeAddPointer(&slp2, SEQLOC_EMPTY, SeqIdDup(query_id));
2656
2657 seq_int1_last = seq_int1;
2658 /* Keep previous seq_int2_last, in case there is a frame shift
2659 immediately after this gap */
2660
2661 break;
2662
2663 case 6: /* insertion of three nucleotides. */
2664
2665 /* If gap is followed after frameshift - we have to
2666 add this element for the alignment to be correct */
2667
2668 if(first_shift == TRUE) { /* Second frameshift in a row */
2669 /* Protein coordinates */
2670 seq_int1 = SeqIntNew();
2671 seq_int1->from = get_current_pos(&start1, 1);
2672 seq_int1->to = start1 - 1;
2673
2674 if(seq_int1->to >= original_length1)
2675 seq_int1->to = original_length1-1;
2676
2677 seq_int1->id = SeqIdDup(subject_id);
2678 seq_int1->strand = strand1;
2679
2680 ValNodeAddPointer(&slp1, SEQLOC_INT, seq_int1);
2681
2682 /* Nucleotide scale shifted by op_type */
2683 seq_int2 = SeqIntNew();
2684
2685 seq_int2->from = get_current_pos(&start2, 3);
2686 seq_int2->to = start2 - 1;
2687
2688 if(seq_int2->to >= query_length) {/* Possible with frame shifts */
2689 seq_int2->to = query_length -1;
2690 seq_int1->to--;
2691 }
2692
2693 /* Transfer to DNA minus strand coordinates */
2694 if(strand2 == Seq_strand_minus) {
2695 int tmp_int;
2696 tmp_int = seq_int2->to;
2697 seq_int2->to = original_length2 - seq_int2->from - 1;
2698 seq_int2->from = original_length2 - tmp_int - 1;
2699 }
2700
2701 seq_int2->id = SeqIdDup(query_id);
2702 seq_int2->strand = strand2;
2703
2704 ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int2);
2705
2706 /* seq_int1_last = seq_int1;
2707 seq_int2_last = seq_int2; */
2708
2709 /* first_shift = FALSE; */
2710
2711 if (reverse) {
2712 slp = slp1;
2713 slp1->next = slp2;
2714 sip = SeqIdDup(subject_id);
2715 sip->next = SeqIdDup(query_id);
2716 } else {
2717 slp = slp2;
2718 slp2->next = slp1;
2719 sip = SeqIdDup(query_id);
2720 sip->next = SeqIdDup(subject_id);
2721 }
2722
2723 sseg = StdSegNew();
2724 sseg->dim = 2;
2725
2726 if (sseg_head == NULL)
2727 sseg_head = sseg;
2728
2729 sseg->loc = slp;
2730 sseg->ids = sip;
2731
2732 if (sseg_old)
2733 sseg_old->next = sseg;
2734
2735 sseg_old = sseg;
2736
2737 slp1 = NULL;
2738 slp2 = NULL;
2739 }
2740
2741 first_shift = FALSE;
2742
2743 /* Protein piece is empty */
2744 ValNodeAddPointer(&slp1, SEQLOC_EMPTY, SeqIdDup(subject_id));
2745
2746 /* Nucleotide scale shifted by 3, protein gapped */
2747 seq_int2 = SeqIntNew();
2748 seq_int2->from = get_current_pos(&start2, curr->num*3);
2749 seq_int2->to = start2 - 1;
2750
2751 if(seq_int2->to >= query_length) {/* Possible with frame shifts */
2752 seq_int2->to = query_length -1;
2753 }
2754
2755 /* Transfer to DNA minus strand coordinates */
2756 if(strand2 == Seq_strand_minus) {
2757 int tmp_int;
2758 tmp_int = seq_int2->to;
2759 seq_int2->to = original_length2 - seq_int2->from - 1;
2760 seq_int2->from = original_length2 - tmp_int - 1;
2761 }
2762
2763 seq_int2->id = SeqIdDup(query_id);
2764 seq_int2->strand = strand2;
2765
2766 ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int2);
2767
2768 seq_int1_last = NULL;
2769 seq_int2_last = seq_int2; /* Will be used to adjust "to" value */
2770
2771 break;
2772
2773 case 3: /* Substitution. */
2774
2775 first_shift = FALSE;
2776
2777 /* Protein coordinates */
2778 seq_int1 = SeqIntNew();
2779 seq_int1->from = get_current_pos(&start1, curr->num);
2780 seq_int1->to = start1 - 1;
2781
2782 if(seq_int1->to >= original_length1)
2783 seq_int1->to = original_length1-1;
2784
2785 seq_int1->id = SeqIdDup(subject_id);
2786 seq_int1->strand = strand1;
2787
2788 ValNodeAddPointer(&slp1, SEQLOC_INT, seq_int1);
2789
2790 /* Nucleotide scale shifted by op_type */
2791 seq_int2 = SeqIntNew();
2792
2793 /* Adjusting last segment and new start point in
2794 nucleotide coordinates */
2795 /* if(seq_int2_last != NULL) {
2796 seq_int2_last->to = seq_int2_last->to - (3 - curr->op_type);
2797 start2 = start2 - (3 - curr->op_type);
2798 } */
2799
2800 seq_int2->from = get_current_pos(&start2, curr->num*curr->op_type);
2801 seq_int2->to = start2 - 1;
2802
2803 /* Chop off three bases and one residue at a time.
2804 Why does this happen, seems like a bug?
2805 */
2806 while (seq_int2->to >= query_length) {/* Possible with frame shifts */
2807 seq_int2->to -= 3;
2808 seq_int1->to--;
2809 }
2810
2811 /* Transfer to DNA minus strand coordinates */
2812 if(strand2 == Seq_strand_minus) {
2813 int tmp_int;
2814 tmp_int = seq_int2->to;
2815 seq_int2->to = original_length2 - seq_int2->from - 1;
2816 seq_int2->from = original_length2 - tmp_int - 1;
2817 }
2818
2819 seq_int2->id = SeqIdDup(query_id);
2820 seq_int2->strand = strand2;
2821
2822 ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int2);
2823
2824 seq_int1_last = seq_int1; /* Will be used to adjust "to" value */
2825 seq_int2_last = seq_int2; /* Will be used to adjust "to" value */
2826
2827 break;
2828 case 1: /* gap of two nucleotides. */
2829 case 2: /* Gap of one nucleotide. */
2830 case 4: /* Insertion of one nucleotide. */
2831 case 5: /* Insertion of two nucleotides. */
2832
2833 if(first_shift == TRUE) { /* Second frameshift in a row */
2834 /* Protein coordinates */
2835 seq_int1 = SeqIntNew();
2836 seq_int1->from = get_current_pos(&start1, 1);
2837 seq_int1->to = start1 - 1;
2838
2839 if(seq_int1->to >= original_length1)
2840 seq_int1->to = original_length1-1;
2841
2842 seq_int1->id = SeqIdDup(subject_id);
2843 seq_int1->strand = strand1;
2844
2845 ValNodeAddPointer(&slp1, SEQLOC_INT, seq_int1);
2846
2847 /* Nucleotide scale shifted by op_type */
2848 seq_int2 = SeqIntNew();
2849
2850 seq_int2->from = get_current_pos(&start2, curr->op_type);
2851 seq_int2->to = start2 - 1;
2852
2853 if(seq_int2->to >= query_length) {/* Possible with frame shifts */
2854 seq_int2->to = query_length -1;
2855 seq_int1->to--;
2856 }
2857
2858 /* Transfer to DNA minus strand coordinates */
2859 if(strand2 == Seq_strand_minus) {
2860 int tmp_int;
2861 tmp_int = seq_int2->to;
2862 seq_int2->to = original_length2 - seq_int2->from - 1;
2863 seq_int2->from = original_length2 - tmp_int - 1;
2864 }
2865
2866 seq_int2->id = SeqIdDup(query_id);
2867 seq_int2->strand = strand2;
2868
2869 ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int2);
2870
2871 seq_int1_last = seq_int1;
2872 seq_int2_last = seq_int2;
2873
2874 /* first_shift = FALSE; */
2875
2876 break;
2877 }
2878
2879 first_shift = TRUE;
2880
2881 /* If this substitution is following simple frameshift
2882 we do not need to start new segment, but may continue
2883 old one */
2884 /* printf("curr_num = %d (%d)\n", curr->num, curr->op_type); */
2885
2886 if(seq_int2_last != NULL) {
2887 get_current_pos(&start2, curr->num*(curr->op_type-3));
2888 if(strand2 != Seq_strand_minus) {
2889 seq_int2_last->to = start2 - 1;
2890 } else {
2891 /* Transfer to DNA minus strand coordinates */
2892 seq_int2_last->from = original_length2 - start2;
2893 }
2894
2895 /* Adjustment for multiple shifts - theoretically possible,
2896 but very unprobable */
2897 if(seq_int2_last->from > seq_int2_last->to) {
2898
2899 if(strand2 != Seq_strand_minus) {
2900 seq_int2_last->to += 3;
2901 } else {
2902 seq_int2_last->from -= 3;
2903 }
2904
2905 if(seq_int1_last != 0)
2906 seq_int1_last++;
2907 }
2908
2909 } else if (curr->op_type > 3) {
2910 /* Protein piece is empty */
2911 ValNodeAddPointer(&slp1, SEQLOC_EMPTY, SeqIdDup(subject_id));
2912 /* Simulating insertion of nucleotides */
2913 seq_int2 = SeqIntNew();
2914 seq_int2->from = get_current_pos(&start2,
2915 curr->num*(curr->op_type-3));
2916 seq_int2->to = start2 - 1;
2917
2918 if(seq_int2->to >= query_length) {
2919 seq_int2->to = query_length - 1;
2920 }
2921
2922 /* Transfer to DNA minus strand coordinates */
2923 if(strand2 == Seq_strand_minus) {
2924 int tmp_int;
2925 tmp_int = seq_int2->to;
2926 seq_int2->to = original_length2 - seq_int2->from - 1;
2927 seq_int2->from = original_length2 - tmp_int - 1;
2928 }
2929
2930 seq_int2->id = SeqIdDup(query_id);
2931 seq_int2->strand = strand2;
2932
2933 ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int2);
2934
2935 seq_int1_last = NULL;
2936 seq_int2_last = seq_int2; /* Will be used to adjust "to" value */
2937 break;
2938 } else {
2939 continue; /* Main loop */
2940 }
2941 continue; /* Main loop */
2942 /* break; */
2943 default:
2944 continue; /* Main loop */
2945 /* break; */
2946 }
2947
2948 if (reverse) {
2949 slp = slp1;
2950 slp1->next = slp2;
2951 sip = SeqIdDup(subject_id);
2952 sip->next = SeqIdDup(query_id);
2953 } else {
2954 slp = slp2;
2955 slp2->next = slp1;
2956 sip = SeqIdDup(query_id);
2957 sip->next = SeqIdDup(subject_id);
2958 }
2959
2960 sseg = StdSegNew();
2961 sseg->dim = 2;
2962
2963 if (sseg_head == NULL)
2964 sseg_head = sseg;
2965
2966 sseg->loc = slp;
2967 sseg->ids = sip;
2968
2969 if (sseg_old)
2970 sseg_old->next = sseg;
2971
2972 sseg_old = sseg;
2973 }
2974 sap->segs = sseg_head;
2975 sap->next = NULL;
2976
2977 return sap;
2978 }
2979 /*
2980 SimpleIntervalToGapXEditBlock(Int4 start1, Int4 start2, Int4 length)
2981
2982 Int4 start1, start2: offset of this interval in sequence 1 and 2.
2983 Int4 length: length of the interval.
2984
2985 May be used to produce a gapXEditBlock when there are no subs. or deletions
2986 in the interval (e.g., ungapped BLAST HSP).
2987 */
2988
2989 GapXEditBlockPtr LIBCALL
SimpleIntervalToGapXEditBlock(Int4 start1,Int4 start2,Int4 length)2990 SimpleIntervalToGapXEditBlock (Int4 start1, Int4 start2, Int4 length)
2991
2992 {
2993 GapXEditBlockPtr edit_block;
2994 GapXEditScriptPtr e_script;
2995
2996 edit_block = GapXEditBlockNew(start1, start2);
2997
2998 edit_block->esp = e_script = GapXEditScriptNew(NULL);
2999
3000 e_script->op_type = GAPALIGN_SUB;
3001 e_script->num = length;
3002
3003 return edit_block;
3004 }
3005
3006