1 static char const rcsid[] = "$Id: actutils.c,v 6.56 2008/10/22 17:19:22 bollin Exp $";
2
3 /* ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information (NCBI)
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government do not place any restriction on its use or reproduction.
14 * We would, however, appreciate having the NCBI and the author cited in
15 * any work or product based on this material.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * ===========================================================================
26 *
27 * File Name: actutils.c
28 *
29 * Author: Sarah Wheelan
30 *
31 * Version Creation Date: 2/00
32 *
33 * $Revision: 6.56 $
34 *
35 * File Description: utility functions for alignments
36 *
37 * Modifications:
38 * --------------------------------------------------------------------------
39 * $Log: actutils.c,v $
40 * Revision 6.56 2008/10/22 17:19:22 bollin
41 * Bug fix for truncating alignments when removing inconsistent alignments from
42 * a set.
43 *
44 * Revision 6.55 2008/05/31 18:07:15 kans
45 * TextFsa uses Int4 states
46 *
47 * Revision 6.54 2008/05/08 20:10:53 bollin
48 * Use Post instead of Message to communicate error for not finding alignment
49 * on reverse strand.
50 *
51 * Revision 6.53 2008/05/06 19:13:42 bollin
52 * When cleaning up alignments before converting a list to a global, if one
53 * is removed, the next one needs to be compared to the previous alignment,
54 * so the index in the loop needs to be reduced.
55 *
56 * Revision 6.52 2008/05/02 18:47:50 bollin
57 * Commented out diagnostic function
58 *
59 * Revision 6.51 2008/05/02 18:02:39 bollin
60 * Fixed bug in ACT_CleanUpAlignments - if an alignment span is completely
61 * contained in another alignment, remove the contained alignment.
62 * Also added a flag to Sqn_GlobalAlign2SeqEx for whether or not the alignment
63 * should be expanded to the ends of the sequence.
64 *
65 * Revision 6.50 2008/04/16 18:46:22 bollin
66 * Corrected bug in functions for creating global alignment from local alignments.
67 *
68 * Revision 6.49 2006/01/04 18:52:34 bollin
69 * changes to ACT_FindPiece to avoid memory leak
70 * changes to ACT_RemoveInconsistentAlnsFromSet to avoid global alignments that
71 * conflict by transposing elements
72 * changes to GetOldBlastAlignmentPiece to use a gap_open value that will not
73 * cause BLAST setup to fail
74 *
75 * Revision 6.48 2006/01/03 15:47:25 bollin
76 * allow alignment callback for ACT_FindPiece so that it can use the new BLAST
77 * library
78 *
79 * Revision 6.47 2005/12/19 14:32:54 bollin
80 * attempt to resolve conflicts between overlapping local alignments when
81 * constructing a global alignment - this resolved problems encountered when
82 * there are tandem repeats.
83 *
84 * Revision 6.46 2005/09/15 13:54:56 bollin
85 * don't create new entityID if bioseq already has one
86 *
87 * Revision 6.45 2005/04/20 19:17:38 lavr
88 * +<assert.h>
89 *
90 * Revision 6.44 2005/01/10 14:53:54 bollin
91 * the strand argument in SeqLocCopyRegion is confusing - rather than specifying
92 * the strand onto which the features should be placed, Seq_strand_minus
93 * indicates that the features should be reverse-complemented, while
94 * Seq_strand_plus indicates that the features should remain on their original
95 * strand.
96 *
97 * Revision 6.43 2004/11/22 16:45:24 bollin
98 * created global alignment function with callback method to allow use of new
99 * BLAST library
100 *
101 * Revision 6.42 2004/11/12 20:20:05 bollin
102 * removed unused variables
103 *
104 * Revision 6.41 2004/11/12 17:28:53 kans
105 * reverting for now - tools library cannot depend upon new blast libraries - will need to move to better place later
106 *
107 * Revision 6.40 2004/11/12 14:03:16 bollin
108 * use new BLAST code instead of old BLAST code in Sqn_GlobalAlign2Seq
109 *
110 * Revision 6.39 2004/11/02 18:53:39 bollin
111 * made act_get_eval available to other libraries
112 *
113 * Revision 6.38 2004/10/27 18:36:15 bollin
114 * make function external instead of replicating it elsewhere
115 *
116 * Revision 6.37 2004/07/30 13:36:47 bollin
117 * created separate function for reversing features on a sequence, changed code
118 * to swap plus to minus and minus to plus instead of moving all features to
119 * minus strand
120 *
121 * Revision 6.36 2004/02/11 20:51:48 bollin
122 * use error log instead of popup message to indicate that BLAST found no sequence similarity
123 *
124 * Revision 6.35 2004/01/02 18:11:28 kans
125 * Sqn_GlobalAlign2Seq flips code break, anticodon when reverse complementing sequence
126 *
127 * Revision 6.34 2003/06/30 15:01:29 whlavina
128 * Correct minus strand handling in CreaeContinuousAln functions; previous
129 * code could corrupt alignments (stop2-start1>1 would imply len<-2 if
130 * ExtendAlnRight ever gets called).
131 *
132 * Revision 6.33 2003/05/30 17:25:35 coulouri
133 * add rcsid
134 *
135 * Revision 6.32 2002/12/19 14:02:28 johnson
136 * change c++-style comments to c-style
137 *
138 * Revision 6.31 2002/11/20 16:00:03 johnson
139 * Revamped CpG routines; now equivalent to "strict" human CpG mapviewer track
140 *
141 * Revision 6.30 2002/06/11 15:25:12 johnson
142 * two minor bug fixes to CpG functions
143 *
144 * Revision 6.29 2002/04/02 21:45:32 wheelan
145 * added CpG finding functions written by Philip Johnson
146 *
147 * Revision 6.28 2002/03/27 17:35:24 todorov
148 * 1) AMAlignIndexFreeEitherIndex instead of AMFreeAllIndexes
149 * 2) recreated ACT_MakeProfileFromSA
150 *
151 * Revision 6.27 2002/03/27 14:40:41 kans
152 * #include <alignmgr.h>
153 *
154 * Revision 6.26 2002/03/26 19:56:21 todorov
155 * alignmgr to alignmgr2 transition
156 *
157 * Revision 6.25 2001/12/28 21:22:32 wheelan
158 * bug fix in Sqn_GlobalAlign2Seqs
159 *
160 * Revision 6.24 2001/09/04 13:47:15 wheelan
161 * made several functions extern
162 *
163 * Revision 6.23 2001/07/13 14:17:57 wheelan
164 * moved Sqn_GlobalAlign2Seq and associated functions here
165 *
166 * Revision 6.22 2001/05/22 12:01:07 wheelan
167 * changes to avoid overflow on alpha platform
168 *
169 * Revision 6.21 2001/03/26 16:45:57 wheelan
170 * fixed uninitialized variables
171 *
172 * Revision 6.20 2001/01/09 23:18:55 lewisg
173 * fix memory leaks
174 *
175 * Revision 6.19 2000/10/23 18:43:30 wheelan
176 * minor bug fix
177 *
178 * Revision 6.18 2000/09/07 04:53:40 sicotte
179 * fix alignment calls, bad matrix calls, and misc alignments problems for sequence update
180 *
181 * Revision 6.16 2000/08/28 16:19:11 sicotte
182 * moved AlnMgrSeqAlignMergeTwoPairwiseEx AlnMgrSeqAlignMergeTwoPairwise AlnMgrSeqAlignMergePairwiseSet to actutils.c from alignmgr.c
183 *
184 * Revision 6.15 2000/07/21 21:41:04 sicotte
185 * fix bug in AlnMgrForcePairwiseContinuousEx, that was inserting two copies of all the seqaligns when trying to realign both ends
186 *
187 * Revision 6.14 2000/05/05 11:53:10 wheelan
188 * bug fixes in ACT_MakeProfileFromSA
189 *
190 * Revision 6.13 2000/05/04 16:45:19 wheelan
191 * changes to profile builder to accomodate IBMed BLAST results
192 *
193 * Revision 6.12 2000/05/03 19:59:42 wheelan
194 * added fix for NULL alignments
195 *
196 * Revision 6.11 2000/05/02 12:00:21 wheelan
197 * fixed memory leaks
198 *
199 * Revision 6.10 2000/05/01 19:54:26 wheelan
200 * fixed memory leak
201 *
202 * Revision 6.9 2000/04/22 15:54:57 wheelan
203 * bug fixes in profile maker
204 *
205 * Revision 6.8 2000/04/18 13:57:14 wheelan
206 * added AlnMgrForcePairwiseContinuousEx
207 *
208 * Revision 6.7 2000/04/11 14:50:28 wheelan
209 * bug fix in AlnMgrForceContinuous
210 *
211 * Revision 6.6 2000/03/16 12:53:45 wheelan
212 * bug fix in AlnMgrForceContinuous
213 *
214 * Revision 6.5 2000/03/14 11:25:47 wheelan
215 * added ACT_ProfileFree functions
216 *
217 * Revision 6.4 2000/03/10 16:57:37 wheelan
218 * fixed AlnMgrForceContinuous
219 *
220 * Revision 6.3 2000/03/07 17:55:02 wheelan
221 * bug fixes in AlnMgrForcePairwiseContinuous
222 *
223 * Revision 6.2 2000/03/02 21:11:06 lewisg
224 * use bandalign for import sequence, make standalone ddv use viewmgr, make dialogs modal, send color update
225 *
226 * Revision 6.1 2000/02/11 17:31:44 kans
227 * initial checkin of functions depending upon blast/bandalign (SW)
228 *
229 * ==========================================================================
230 */
231
232 #include <actutils.h>
233 #include <assert.h>
234 #include <viewmgr.h>
235 #include <alignmgr.h>
236 #include <salptool.h>
237
238 static void StateTableSearch (TextFsaPtr tbl, CharPtr txt, Int4Ptr state, Int4 pos, ACT_sitelistPtr PNTR asp_prev, ACT_sitelistPtr PNTR asp_head);
239 static Boolean am_isa_gap(Int4 start, Int4 prevstop, Uint1 strand);
240 static void am_fix_strand(SeqAlignPtr sap, Uint1 strand1, Uint1 strand2);
241
242 #define CG_MINLEN 500 /*minimum length for CpG island; should be equivalent to mapviewer track setting! */
243
244 /*-----------------------------------------------------------------------------
245 PRE : locked nucleotide bioseq ptr
246 POST: entire sequence loaded into memory in iupacna format
247 -----------------------------------------------------------------------------*/
LoadSequence(BioseqPtr bsp)248 static CharPtr LoadSequence(BioseqPtr bsp)
249 {
250 SeqPortPtr spp;
251 CharPtr buf;
252 Int4 readCount;
253
254 buf = (CharPtr) MemNew(bsp->length * sizeof(Char));
255 if (!buf) {
256 ErrPostStr(SEV_ERROR, 0,0, "Error allocating memory for sequence!");
257 return NULL;
258 }
259
260 spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown,
261 Seq_code_iupacna);
262
263 readCount = 0;
264 while (readCount < bsp->length) {
265 readCount += SeqPortRead(spp, (UcharPtr) buf+readCount, 25000);
266 assert(readCount != 0);
267 }
268 SeqPortFree(spp);
269
270 return buf;
271 }
272
273 /*-----------------------------------------------------------------------------
274 PRE : sequence, position to be added, cgIsland information
275 POST: cgIsle adjusted according to the nucleotide at position i & i-1
276 -----------------------------------------------------------------------------*/
AddPosition(CharPtr seq,Int4 i,ACT_CGInfoPtr cgIsle)277 static void AddPosition(CharPtr seq, Int4 i, ACT_CGInfoPtr cgIsle)
278 {
279 switch(seq[i]) {
280 case 'A': cgIsle->a++; break;
281 case 'C': cgIsle->c++; break;
282 case 'G': cgIsle->g++;
283 if (i > 0 && seq[i-1] == 'C')
284 cgIsle->cg++;
285 break;
286 case 'T': cgIsle->t++; break;
287 case 'N': cgIsle->n++; break;
288 }
289 }
290
291 /*-----------------------------------------------------------------------------
292 PRE : sequence, position to be removed, cgIsland information
293 POST: cgIsle adjusted according to the nucleotide at position i & i-1
294 -----------------------------------------------------------------------------*/
RemovePosition(CharPtr seq,Int4 i,ACT_CGInfoPtr cgIsle)295 static void RemovePosition(CharPtr seq, Int4 i, ACT_CGInfoPtr cgIsle)
296 {
297 switch(seq[i]) {
298 case 'A': cgIsle->a--; break;
299 case 'C': cgIsle->c--; break;
300 case 'G': cgIsle->g--;
301 if (i > 0 && seq[i-1] == 'C')
302 cgIsle->cg--;
303 break;
304 case 'T': cgIsle->t--; break;
305 case 'N': cgIsle->n--; break;
306 }
307 }
308
309 /*-----------------------------------------------------------------------------
310 PRE : sequence, length of sequence, (ptr to) cgIsle structure with to &
311 from fields filled in
312 POST: cgInfo filled with stats (#a's, c's, etc.) for window (to ->|
313 from)
314 -----------------------------------------------------------------------------*/
CalcWindowStats(CharPtr seq,ACT_CGInfoPtr cgIsle)315 static void CalcWindowStats(CharPtr seq, ACT_CGInfoPtr cgIsle)
316 {
317 Int4 i;
318
319 cgIsle->a = cgIsle->t = cgIsle->g = cgIsle->c = cgIsle->cg = 0;
320 cgIsle->length = cgIsle->to - cgIsle->from + 1;
321
322 for (i = cgIsle->from; i <= cgIsle->to; i++)
323 AddPosition(seq, i, cgIsle);
324 }
325
326 /*-----------------------------------------------------------------------------
327 PRE : island structure filled
328 POST: whether or not we consider this to be a CpG island
329 -----------------------------------------------------------------------------*/
IsIsland(ACT_CGInfoPtr isle)330 static Boolean IsIsland(ACT_CGInfoPtr isle)
331 {
332 Uint4 len = isle->to-isle->from+1;
333
334 return (100*(isle->c + isle->g) > 50*len &&
335 100* isle->cg*len > 60* isle->c*isle->g);
336 }
337
338
339 /*-----------------------------------------------------------------------------
340 PRE : sequence, length of sequence, win->from field filled in
341 POST: whether or not we found a window further down the sequence that
342 meets the mimimum criteria; if so, 'win' is set to that window
343 -----------------------------------------------------------------------------*/
SlideToHit(CharPtr seq,Int4 seqLength,ACT_CGInfoPtr win)344 static Boolean SlideToHit(CharPtr seq, Int4 seqLength, ACT_CGInfoPtr win)
345 {
346 Boolean inIsland, done;
347
348 win->to = win->from + CG_WINDOWSIZE - 1;
349
350 if (win->to >= seqLength)
351 return FALSE;
352
353 CalcWindowStats(seq, win);
354
355 inIsland = FALSE;
356 done = FALSE;
357
358 while (win->to < seqLength && !IsIsland(win)) {
359 /* remove 1 nt from left side */
360 RemovePosition(seq, win->from, win);
361
362 /* advance */
363 ++win->from;
364 ++win->to;
365
366 if (win->to < seqLength) {
367 /* add 1 nt onto right side */
368 AddPosition(seq, win->to, win);
369 }
370 }
371
372 return IsIsland(win);
373 }
374
375 /*-----------------------------------------------------------------------------
376 PRE : sequence, length of sequence, window that meets the GC & CpG criteria
377 POST: whether or not the island can be extended to reach at least the
378 minimum length; if so, isle is set to that window
379 -----------------------------------------------------------------------------*/
ExtendHit(CharPtr seq,Int4 seqLength,ACT_CGInfoPtr isle)380 static Boolean ExtendHit(CharPtr seq, Int4 seqLength, ACT_CGInfoPtr isle)
381 {
382 ACT_CGInfoPtr win = (ACT_CGInfoPtr) MemNew(sizeof(ACT_CGInfo));
383 memcpy(win, isle, sizeof(ACT_CGInfo));
384
385 /* jump by 200bp increments */
386 while (win->to + CG_WINDOWSIZE < seqLength && IsIsland(win)) {
387 win->from += CG_WINDOWSIZE;
388 win->to += CG_WINDOWSIZE;
389 CalcWindowStats(seq, win);
390 }
391
392 /* if we overshot, slide back by 1bp increments */
393 while (!IsIsland(win)) {
394 RemovePosition(seq, win->to, win);
395 --win->from;
396 --win->to;
397 AddPosition(seq, win->from, win);
398 }
399
400 /* trim ends of entire island until we're above criteria again */
401 isle->to = win->to;
402 CalcWindowStats(seq, isle);
403 while(!IsIsland(isle) && (isle->from < isle->to)) {
404 RemovePosition(seq, isle->to, isle);
405 RemovePosition(seq, isle->from, isle);
406 --isle->to;
407 ++isle->from;
408 }
409
410 free(win);
411
412 if (isle->from >= isle->to) {/* in case we trimmed to nothing */
413 isle->to = isle->from;
414 return FALSE;
415 }
416 return (isle->to - isle->from + 1 > CG_MINLEN);
417 }
418
419 /*-----------------------------------------------------------------------------
420 PRE : locked nucleotide bioseq ptr
421 POST: linked list of CpG islands (if any). Uses algorithm from Takai and
422 Jones, Proc Natl Acad Sci U S A 2002 Mar 19;99(6):3740-5.
423 Cutoffs used in the "strict" human CpG map viewer track as of 11/20/02:
424 CG_WINDOWSIZE=200, CG_MINLEN=500, CpG obs/exp >= 0.6, G+C >= 0.5
425 -----------------------------------------------------------------------------*/
ACT_FindCpG(BioseqPtr bsp)426 NLM_EXTERN ACT_CGInfoPtr ACT_FindCpG(BioseqPtr bsp)
427 {
428 CharPtr seq;
429 ACT_CGInfoPtr cgHead, isle;
430
431 if (bsp == NULL || bsp->length < CG_WINDOWSIZE)
432 return NULL;
433 if (bsp->mol == Seq_mol_aa) {
434 Message(SEV_WARNING, "Must use nucleotide sequence\n");
435 return NULL;
436 }
437
438 if (!(seq = LoadSequence(bsp)))
439 return NULL;/* error message displayed in LoadSequence */
440
441
442 cgHead = NULL;
443 isle = (ACT_CGInfoPtr) MemNew(sizeof(ACT_CGInfo));
444
445 isle->from = 0;
446
447 while (SlideToHit(seq, bsp->length, isle)) {
448 if (ExtendHit(seq, bsp->length, isle)) {
449 isle->next = cgHead;
450 cgHead = isle;
451 isle = (ACT_CGInfoPtr) MemNew(sizeof(ACT_CGInfo));
452 isle->to = cgHead->to;
453 }
454 isle->from = isle->to + 1;
455 }
456
457 free(isle);
458 free(seq);
459 return cgHead;
460 }
461
462 /*NLM_EXTERN ACT_CGInfoPtr ACT_FindCpG(BioseqPtr bsp)
463 {
464 ACT_CGInfoPtr acg;
465 ACT_CGInfoPtr acg_head;
466 ACT_CGInfoPtr acg_prev;
467 ACT_sitelistPtr asp;
468 ACT_sitelistPtr asp_head;
469 ACT_sitelistPtr asp_prev;
470 ACT_sitelistPtr asp_tmp;
471 Uint1Ptr buf;
472 CharPtr c;
473 Int2 j;
474 Int4 max_len;
475 Int4 offset;
476 Int4 pos;
477 Uint1 prev;
478 Uint1 r;
479 Uint1 res1;
480 Uint1 res2;
481 Uint1 residue;
482 Int4 start;
483 Int4 state;
484 Int4 state_r;
485 Int4 state_test;
486 TextFsaPtr tbl;
487 Int4 x;
488 FloatHi y;
489
490 if (bsp == NULL)
491 return NULL;
492 if (bsp->mol == Seq_mol_aa)
493 {
494 Message(SEV_WARNING, "Must use nucleotide sequence\n");
495 return NULL;
496 }
497 if (bsp->length < MAX_LEN)
498 max_len = bsp->length+1;
499 else
500 max_len = MAX_LEN;
501 buf = (Uint1Ptr)MemNew((max_len)*sizeof(Uint1));
502 state = 0;
503 prev = 0;
504 pos = 0;
505 state_r = 0;
506 asp_prev = asp_head = NULL;
507 tbl = TextFsaNew();
508 if (tbl == NULL)
509 return NULL;
510 TextFsaAdd(tbl, "CCCGGG");
511 TextFsaAdd(tbl, "CCGCGG");
512 state_test = 0;
513 acg = (ACT_CGInfoPtr)MemNew(sizeof(ACT_CGInfo));
514 acg_head = NULL;
515 offset = 0;
516 while ((residue = ACT_GetResidue(pos, buf, &offset, bsp)) != 0)
517 {
518 if (residue == 65)
519 c = "A";
520 else if (residue == 67)
521 c = "C";
522 else if (residue == 71)
523 c = "G";
524 else if (residue == 84)
525 c = "T";
526 else
527 c = "N";
528 StateTableSearch(tbl, c, &state_test, pos, &asp_prev, &asp_head);
529 acg->length++;
530 acg->to = pos;
531 pos++;
532 if (residue == 65)
533 acg->a++;
534 else if (residue == 67)
535 acg->c++;
536 else if (residue == 71)
537 {
538 if (prev == 67)
539 acg->cg++;
540 acg->g++;
541 } else if (residue == 84)
542 acg->t++;
543 else
544 acg->n++;
545 prev = residue;
546 if (acg->length >= 200)
547 {
548 if (state == 0)
549 {
550 if (100*(acg->cg)*(acg->length) > 60*(acg->c)*(acg->g) && 10*(acg->c+acg->g) > 6*(acg->length))
551 {
552 state = 1;
553 } else
554 {
555 res1 = ACT_GetResidue(acg->from, buf, &offset, bsp);
556 if (res1 == 67)
557 {
558 res2 = ACT_GetResidue(acg->from+1, buf, &offset, bsp);
559 if (res2 == 71)
560 acg->cg--;
561 }
562 if (res1 == 65)
563 acg->a--;
564 else if (res1 == 67)
565 acg->c--;
566 else if (res1 == 71)
567 acg->g--;
568 else if (res1 == 84)
569 acg->t--;
570 else
571 acg->n--;
572 acg->from++;
573 acg->length--;
574 }
575 } else if (state == 1)
576 {
577 if (100*(acg->cg)*(acg->length) <= 60*(acg->c)*(acg->g) || 10*(acg->c+acg->g) < 6*(acg->length))
578 {
579 state = 0;
580 if (acg_head)
581 {
582 acg_prev->next = acg;
583 acg_prev = acg;
584 } else
585 acg_head = acg_prev = acg;
586 j=0;
587 if (acg->from - 2000 < 0)
588 start = 0;
589 else
590 start = acg->from - 2000;
591 r = 1;
592 acg->sequence = (CharPtr)MemNew(20000*sizeof(Char));
593 for (x=start; x<(acg->to+2000) && r > 0; x++, j++)
594 {
595 r = ACT_GetResidue(x, buf, &offset, bsp);
596 if (r == 65)
597 acg->sequence[j] = 'A';
598 else if (r == 67)
599 acg->sequence[j] = 'C';
600 else if (r == 71)
601 acg->sequence[j] = 'G';
602 else if (r == 84)
603 acg->sequence[j] = 'T';
604 else
605 acg->sequence[j] = 'N';
606 }
607 y = (FloatHi)((acg->cg)*(acg->length))/(FloatHi)((acg->c)*(acg->g));
608 printf("Coordinates: %d to %d CpG: %d to %d conf: %f\n%s\n", start, x, acg->from, acg->to, y, acg->sequence);
609 MemFree(acg->sequence);
610 acg = (ACT_CGInfoPtr)MemNew(sizeof(ACT_CGInfo));
611 acg->from = pos;
612 acg->length = 1;
613 if (residue == 65)
614 acg->a++;
615 else if (residue == 67)
616 acg->c++;
617 else if (residue == 71)
618 acg->g++;
619 else if (residue == 84)
620 acg->t++;
621 else
622 acg->n++;
623 } else if (acg->length > 1000)
624 {
625 res1 = ACT_GetResidue(acg->from, buf, &offset, bsp);
626 if (res1 == 65 || res1 == 84)
627 {
628 acg->from++;
629 acg->length--;
630 if (res1 == 65)
631 acg->a--;
632 else if (res1 == 84)
633 acg->t--;
634 }
635 }
636 }
637 }
638 }
639 check for restriction sites in potential islands found
640 acg = acg_head;
641 return acg_head;
642 acg_prev = NULL; Statement not reached...
643 while (acg)
644 {
645 asp_prev = NULL;
646 asp = asp_head;
647 while (asp)
648 {
649 if (asp->start >= acg->from && asp->start < acg->to - 9)
650 {
651 if (asp_prev)
652 {
653 asp_prev->next = asp->next;
654 asp_tmp = asp->next;
655 } else
656 asp_head = asp_tmp = asp->next;
657 asp->next = acg->asp;
658 acg->asp = asp;
659 asp = asp_tmp;
660 } else
661 {
662 asp_prev = asp;
663 asp = asp->next;
664 }
665 }
666 if (acg->asp == NULL)
667 {
668 if (acg_prev != NULL)
669 {
670 acg_prev->next = acg->next;
671 acg->next = NULL;
672 MemFree(acg);
673 acg = acg_prev->next;
674 } else
675 {
676 acg_head = acg->next;
677 acg->next = NULL;
678 MemFree(acg);
679 acg = acg_head;
680 }
681 } else
682 {
683 if (acg->asp->next == NULL)
684 {
685 if (acg_prev != NULL)
686 {
687 acg_prev->next = acg->next;
688 acg->next = NULL;
689 MemFree(acg);
690 acg = acg_prev->next;
691 } else
692 {
693 acg_head = acg->next;
694 acg->next = NULL;
695 MemFree(acg);
696 acg = acg_head;
697 }
698 } else
699 {
700 acg_prev = acg;
701 acg = acg->next;
702 }
703 }
704 }
705 MemFree(buf);
706 return acg_head;
707 }*/
708
ACT_GetResidue(Int4 pos,Uint1Ptr buf,Int4Ptr offset,BioseqPtr bsp)709 NLM_EXTERN Uint1 ACT_GetResidue(Int4 pos, Uint1Ptr buf, Int4Ptr offset, BioseqPtr bsp)
710 {
711 Int4 bufsize;
712 SeqPortPtr spp;
713
714 if (offset == NULL)
715 return 0;
716 if (pos > bsp->length - 1)
717 return 0;
718 if (buf[0] == 0 || pos > (*offset + MAX_LEN - 1) || pos < *offset)
719 {
720 if (pos > *offset + MAX_LEN - 1)
721 *offset = *offset + MAX_LEN;
722 else if (pos < *offset)
723 *offset = pos;
724 else
725 *offset = 0;
726 if (bsp->length < MAX_LEN)
727 {
728 bufsize = bsp->length;
729 spp = SeqPortNew(bsp, 0, -1, 0, Seq_code_iupacna);
730 } else if (bsp->length < *offset + MAX_LEN-1)
731 {
732 bufsize = bsp->length - *offset + 1;
733 spp = SeqPortNew(bsp, *offset, bsp->length-1, 0, Seq_code_iupacna);
734 } else
735 {
736 bufsize = MAX_LEN;
737 spp = SeqPortNew(bsp, *offset, *offset+MAX_LEN-1, 0, Seq_code_iupacna);
738 }
739 if (spp == NULL)
740 {
741 Message(SEV_WARNING, "Couldn't create SeqPort\n");
742 return 0;
743 }
744 SeqPortRead(spp, buf, bufsize);
745 }
746 return (buf[pos-(*offset)]);
747 }
748
StateTableSearch(TextFsaPtr tbl,CharPtr txt,Int4Ptr state,Int4 pos,ACT_sitelistPtr PNTR asp_prev,ACT_sitelistPtr PNTR asp_head)749 static void StateTableSearch (TextFsaPtr tbl, CharPtr txt, Int4Ptr state, Int4 pos, ACT_sitelistPtr PNTR asp_prev, ACT_sitelistPtr PNTR asp_head)
750 {
751 ACT_sitelistPtr asp;
752 Char ch;
753 ValNodePtr matches;
754 CharPtr ptr;
755 ValNodePtr vnp;
756
757 if (tbl == NULL || txt == NULL) return;
758
759 for (ptr = txt, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
760 *state = TextFsaNext (tbl, *state, ch, &matches);
761 for (vnp = matches; vnp != NULL; vnp = vnp->next) {
762 asp = (ACT_sitelistPtr)MemNew(sizeof(ACT_sitelist));
763 if (asp_prev)
764 {
765 if (*asp_prev)
766 {
767 (*asp_prev)->next = asp;
768 *asp_prev = asp;
769 } else
770 *asp_prev = *asp_head = asp;
771 asp->name = (CharPtr) vnp->data.ptrvalue;
772 asp->start = pos;
773 }
774 }
775 }
776 }
777
ACT_ProfileNew(Boolean nuc)778 NLM_EXTERN ACTProfilePtr ACT_ProfileNew(Boolean nuc)
779 {
780 ACTProfilePtr app;
781 FloatHiPtr PNTR freq;
782
783 app = (ACTProfilePtr)MemNew(sizeof(ACTProfile));
784 if (nuc)
785 {
786 freq = (FloatHiPtr PNTR)MemNew(ACT_NUCLEN*sizeof(FloatHiPtr));
787 app->freq = freq;
788 app->nuc = TRUE;
789 } else
790 {
791 freq = (FloatHiPtr PNTR)MemNew(ACT_PROTLEN*sizeof(FloatHiPtr));
792 app->freq = freq;
793 app->nuc = FALSE;
794 }
795 return app;
796 }
797
798 /***************************************************************************
799 *
800 * ACT_ProfileFree frees a single profile; ACT_ProfileSetFree frees an
801 * entire linked list of profiles.
802 *
803 ***************************************************************************/
ACT_ProfileFree(ACTProfilePtr app)804 NLM_EXTERN ACTProfilePtr ACT_ProfileFree(ACTProfilePtr app)
805 {
806 Int4 i;
807 Int4 j;
808
809 if (app == NULL)
810 return NULL;
811 if (app->nuc)
812 j = ACT_NUCLEN;
813 else
814 j = ACT_PROTLEN;
815 for (i=0; i<j; i++)
816 {
817 MemFree(app->freq[i]);
818 }
819 MemFree(app->freq);
820 app->next = NULL;
821 MemFree(app);
822 return NULL;
823 }
824
ACT_ProfileSetFree(ACTProfilePtr app)825 NLM_EXTERN ACTProfilePtr ACT_ProfileSetFree(ACTProfilePtr app)
826 {
827 ACTProfilePtr app_next;
828
829 while (app != NULL)
830 {
831 app_next = app->next;
832 app->next = NULL;
833 ACT_ProfileFree(app);
834 app = app_next;
835 }
836 return NULL;
837 }
838
ACT_BuildProfile(SeqLocPtr slp,ACTProfilePtr PNTR app,Int4Ptr count,Int4 length)839 NLM_EXTERN void ACT_BuildProfile(SeqLocPtr slp, ACTProfilePtr PNTR app, Int4Ptr count, Int4 length)
840 {
841 Int4 i;
842 Int4 len;
843 Uint1 res;
844 SeqPortPtr spp;
845
846 if (app == NULL)
847 return;
848 if (slp == NULL)
849 {
850 *count = *count+length;
851 if ((*app)->len <= *count)
852 {
853 *count = 0;
854 *app = (*app)->next;
855 }
856 return;
857 }
858 len = SeqLocLen(slp);
859 if (len <= 0)
860 return;
861 if ((*app)->len == 0)
862 {
863 (*app)->len = len;
864 if ((*app)->nuc)
865 {
866 for (i=0; i<ACT_NUCLEN; i++)
867 {
868 (*app)->freq[i] = (FloatHiPtr)MemNew((*app)->len*sizeof(FloatHi));
869 }
870 } else
871 {
872 for (i=0; i<ACT_PROTLEN; i++)
873 {
874 (*app)->freq[i] = (FloatHiPtr)MemNew((*app)->len*sizeof(FloatHi));
875 }
876 }
877 } else
878 {
879 if (len > (*app)->len) /* seqloc is longer than the */
880 return; /* existing profile -- don't add it */
881 }
882 if ((*app)->nuc)
883 spp = SeqPortNewByLoc(slp, Seq_code_ncbi4na);
884 else
885 spp = SeqPortNewByLoc(slp, Seq_code_ncbistdaa);
886 if (spp == NULL)
887 return;
888 if (*count == 0)
889 (*app)->numseq++;
890 i=0;
891 if ((*app)->nuc == FALSE)
892 {
893 while ((res = SeqPortGetResidue(spp)) != SEQPORT_EOF && i+*count<((*app)->len))
894 {
895 (*app)->freq[res][i+*count]++;
896 i++;
897 }
898 } else
899 {
900 while ((res = SeqPortGetResidue(spp)) != SEQPORT_EOF && i+*count<((*app)->len))
901 {
902 if (res == 1)
903 {
904 (*app)->freq[0][i+*count]++;
905 } else if (res == 2)
906 {
907 (*app)->freq[1][i+*count]++;
908 } else if (res == 4)
909 {
910 (*app)->freq[2][i+*count]++;
911 } else if (res == 8)
912 {
913 (*app)->freq[3][i+*count]++;
914 } else
915 {
916 (*app)->freq[4][i+*count]++;
917 }
918 i++;
919 }
920 }
921 SeqPortFree(spp);
922 if (len+*count == (*app)->len)
923 {
924 *app = (*app)->next;
925 *count = 0;
926 } else
927 *count = *count + len;
928 return;
929 }
930
ACT_ScoreProfile(BioseqPtr bsp,Int4 pos,Uint1 strand,ACTProfilePtr app)931 NLM_EXTERN FloatHi ACT_ScoreProfile(BioseqPtr bsp, Int4 pos, Uint1 strand, ACTProfilePtr app)
932 {
933 Int4 i;
934 Uint1 res;
935 FloatHi retval;
936 SeqPortPtr spp;
937
938 retval = 0;
939 if (bsp == NULL || app == NULL || pos < 0)
940 return retval;
941 if (pos + app->len-1 >= bsp->length)
942 return retval;
943 if (ISA_na(bsp->mol))
944 {
945 spp = SeqPortNew(bsp, pos, pos+app->len-1, strand, Seq_code_ncbi4na);
946 if (spp == NULL)
947 return retval;
948 i = 0;
949 while ((res = SeqPortGetResidue(spp)) != SEQPORT_EOF && i<app->len)
950 {
951 if (res == 1)
952 {
953 retval += app->freq[0][i];
954 } else if (res == 2)
955 {
956 retval += app->freq[1][i];
957 } else if (res == 4)
958 {
959 retval += app->freq[2][i];
960 } else if (res == 8)
961 {
962 retval += app->freq[3][i];
963 } else
964 {
965 retval += app->freq[4][i];
966 }
967 i++;
968 }
969 retval = retval / app->len;
970 return retval;
971 } else
972 {
973 spp = SeqPortNew(bsp, pos, pos+app->len-1, strand, Seq_code_ncbistdaa);
974 if (spp == NULL)
975 return retval;
976 i = 0;
977 while ((res = SeqPortGetResidue(spp)) != SEQPORT_EOF && i<app->len)
978 {
979 retval += app->freq[res][i];
980 i++;
981 }
982 retval = retval / app->len;
983 return retval;
984 }
985 }
986
ACT_EstimateConfidence(ACTProfilePtr app)987 NLM_EXTERN void ACT_EstimateConfidence(ACTProfilePtr app)
988 {
989 FloatHi conf;
990 Int4 i;
991 Int4 j;
992 Int4 max;
993 Int4 numres;
994
995 if (app == NULL)
996 return;
997 if (app->nuc)
998 numres = ACT_NUCLEN;
999 else
1000 numres = ACT_PROTLEN;
1001 conf = 1;
1002 while (app)
1003 {
1004 for (i=0; i<app->len; i++)
1005 {
1006 max = 0;
1007 for (j=0; j<numres; j++)
1008 {
1009 if (app->freq[j][i] > max)
1010 max = app->freq[j][i];
1011 }
1012 if (max > 0)
1013 conf = conf*max;
1014 if (conf > INT4_MAX)
1015 conf = INT4_MAX;
1016 }
1017 app->confidence = conf;
1018 app = app->next;
1019 }
1020 return;
1021 }
1022
ACT_SortProfilesByConfidence(ACTProfilePtr app)1023 NLM_EXTERN ACTProfilePtr ACT_SortProfilesByConfidence(ACTProfilePtr app)
1024 {
1025 ACTProfilePtr app_head;
1026 ACTProfilePtr PNTR array;
1027 Int4 count;
1028 Int4 i;
1029
1030 if (app == NULL)
1031 return NULL;
1032 app_head = app;
1033 count = 0;
1034 while (app != NULL)
1035 {
1036 count++;
1037 app = app->next;
1038 }
1039 array = (ACTProfilePtr PNTR)MemNew(count*sizeof(ACTProfilePtr));
1040 app = app_head;
1041 count = 0;
1042 while (app != NULL)
1043 {
1044 array[count] = app;
1045 count++;
1046 app = app->next;
1047 }
1048 HeapSort((Pointer)array, (size_t)count, sizeof(ACTProfilePtr), ACT_CompareProfileConfidence);
1049 app_head = app = array[0];
1050 for (i=1; i<count; i++)
1051 {
1052 app->next = array[i];
1053 app = app->next;
1054 }
1055 return app_head;
1056 }
1057
ACT_CompareProfileConfidence(VoidPtr base,VoidPtr large_son)1058 NLM_EXTERN int LIBCALLBACK ACT_CompareProfileConfidence(VoidPtr base, VoidPtr large_son)
1059 {
1060 ACTProfilePtr app1;
1061 ACTProfilePtr app2;
1062
1063 app1 = *((ACTProfilePtr PNTR) base);
1064 app2 = *((ACTProfilePtr PNTR) large_son);
1065 if (app1 == NULL || app2 == NULL)
1066 return 0;
1067 if (app1->confidence > app2->confidence)
1068 return -1;
1069 else if (app1->confidence < app2->confidence)
1070 return 1;
1071 else
1072 return 0;
1073 }
1074
ACT_MakeProfileFromSA(SeqAlignPtr sap)1075 NLM_EXTERN ACTProfilePtr ACT_MakeProfileFromSA(SeqAlignPtr sap)
1076 {
1077 AMAlignIndex2Ptr amaip;
1078 AlnMsg2Ptr amp;
1079 ACTProfilePtr app;
1080 ACTProfilePtr app_head;
1081 ACTProfilePtr app_prev;
1082 BioseqPtr bsp;
1083 Int4 count;
1084 Int4 i;
1085 Int4 j;
1086 Boolean more;
1087 Boolean nuc;
1088 Int4 numrows;
1089 SeqIdPtr sip;
1090 SeqLocPtr slp;
1091
1092 if (sap == NULL)
1093 return NULL;
1094 if (sap->saip == NULL)
1095 return NULL;
1096 if (sap->saip->indextype == INDEX_PARENT)
1097 {
1098 amaip = (AMAlignIndex2Ptr)(sap->saip);
1099 if (amaip->alnstyle == AM2_LITE)
1100 return NULL;
1101 }
1102 sip = AlnMgr2GetNthSeqIdPtr(sap, 1);
1103 bsp = BioseqLockById(sip);
1104 if (bsp == NULL)
1105 return NULL;
1106 if (ISA_na(bsp->mol))
1107 nuc = TRUE;
1108 else
1109 nuc = FALSE;
1110 BioseqUnlockById(sip);
1111 sip = SeqIdFree(sip);
1112 amp = AlnMsgNew2();
1113 amp->to_aln = -1;
1114 amp->row_num = 1;
1115 app_head = NULL;
1116 /*
1117 if (sap->saip->indextype == INDEX_PARENT)
1118 {
1119 for (i=0; i<amaip->numseg; i++)
1120 {
1121 app = ACT_ProfileNew(nuc);
1122 app->len = amaip->lens[i];
1123 if (nuc)
1124 {
1125 for (j=0; j<ACT_NUCLEN; j++)
1126 {
1127 app->freq[j] = (FloatHiPtr)MemNew(app->len*sizeof(FloatHi));
1128 }
1129 } else
1130 {
1131 for (j=0; j<ACT_PROTLEN; j++)
1132 {
1133 app->freq[j] = (FloatHiPtr)MemNew(app->len*sizeof(FloatHi));
1134 }
1135 }
1136 if (app_head != NULL)
1137 {
1138 app_prev->next = app;
1139 app_prev = app;
1140 } else
1141 app_head = app_prev = app;
1142 }
1143 } else
1144 {
1145 */
1146 while ((Boolean) (more = AlnMgr2GetNextAlnBit(sap, amp)))
1147 {
1148 app = ACT_ProfileNew(nuc);
1149 app->len = amp->to_row - amp->from_row + 1;;
1150 if (nuc)
1151 {
1152 for (j=0; j<ACT_NUCLEN; j++)
1153 {
1154 app->freq[j] = (FloatHiPtr)MemNew(app->len*sizeof(FloatHi));
1155 }
1156 } else
1157 {
1158 for (j=0; j<ACT_PROTLEN; j++)
1159 {
1160 app->freq[j] = (FloatHiPtr)MemNew(app->len*sizeof(FloatHi));
1161 }
1162 }
1163 if (app_head != NULL)
1164 {
1165 app_prev->next = app;
1166 app_prev = app;
1167 } else
1168 app_head = app_prev = app;
1169 }
1170 /*
1171 }
1172 */
1173 numrows = AlnMgr2GetNumRows(sap);
1174 for (i=1; i<=numrows; i++)
1175 {
1176 AlnMsgReNew2(amp);
1177 amp->to_aln = -1;
1178 amp->row_num = i;
1179 app = app_head;
1180
1181 sip = AlnMgr2GetNthSeqIdPtr(sap, i);
1182 bsp = BioseqLockById(sip);
1183 count = 0;
1184 while ((Boolean) (more = AlnMgr2GetNextAlnBit(sap, amp)) && app != NULL)
1185 {
1186 if (amp->type == AM_SEQ && bsp != NULL)
1187 {
1188 slp = SeqLocIntNew(amp->from_row, amp->to_row, amp->strand, sip);
1189 ACT_BuildProfile(slp, &app, &count, 0);
1190 SeqLocFree(slp);
1191 } else if (amp->type == AM_GAP)
1192 ACT_BuildProfile(NULL, &app, &count, (amp->to_row - amp->from_row + 1));
1193 }
1194 BioseqUnlockById(sip);
1195 sip = SeqIdFree(sip);
1196 }
1197 ACT_EstimateConfidence(app_head);
1198 AlnMsgFree2(amp);
1199 return app_head;
1200 }
1201
ACT_SortAndTruncate(ACT_TopScorePtr PNTR ats)1202 NLM_EXTERN ACT_TopScorePtr PNTR ACT_SortAndTruncate(ACT_TopScorePtr PNTR ats)
1203 {
1204 return NULL;
1205 }
1206
ACT_PlaceByScore(ACT_PlaceBoundsPtr abp)1207 NLM_EXTERN ACT_PositionPtr ACT_PlaceByScore(ACT_PlaceBoundsPtr abp)
1208 {
1209 Int4 i;
1210 FloatHi score;
1211
1212 score = ACT_CalcScore(abp);
1213 if (score > abp->apos->score)
1214 {
1215 abp->apos->score = score;
1216 for (i=0; i<abp->nprof; i++)
1217 {
1218 abp->apos->posarray[i] = abp->boundarray[i];
1219 }
1220 }
1221 while (abp->currpos[abp->currprof] < abp->numats[abp->currprof] - 1)
1222 {
1223 abp->currpos[abp->currprof]++;
1224 abp->currats[abp->currprof] = abp->currats[abp->currprof]->next;
1225 abp->boundarray[abp->currprof] = abp->currats[abp->currprof]->pos;
1226 score = ACT_CalcScore(abp);
1227 if (score > abp->apos->score)
1228 {
1229 abp->apos->score = score;
1230 for (i=0; i<abp->nprof; i++)
1231 {
1232 abp->apos->posarray[i] = abp->boundarray[i];
1233 }
1234 }
1235 }
1236 while(abp->currpos[abp->currprof] >= abp->numats[abp->currprof]-1 && abp->currprof >= 0)
1237 {
1238 abp->currprof--;
1239 }
1240 if (abp->currprof < 0)
1241 return (abp->apos);
1242 for (i=abp->currprof+1; i<abp->nprof; i++)
1243 {
1244 abp->currpos[i] = 0;
1245 abp->currats[i] = abp->ats[i];
1246 abp->boundarray[i] = abp->currats[i]->pos;
1247 while (abp->boundarray[i] <= abp->boundarray[i-1])
1248 {
1249 if (abp->currpos[abp->currprof] >= abp->numats[abp->currprof]-1)
1250 return (abp->apos);
1251 abp->currpos[i]+=1;
1252 abp->currats[i] = abp->currats[i]->next;
1253 abp->boundarray[i] = abp->currats[i]->pos;
1254 }
1255 }
1256 abp->currpos[abp->currprof]++;
1257 abp->currats[abp->currprof] = abp->currats[abp->currprof]->next;
1258 abp->boundarray[abp->currprof] = abp->currats[abp->currprof]->pos;
1259 abp->currprof = abp->nprof-1;
1260 abp->apos = ACT_PlaceByScore(abp);
1261 return (abp->apos);
1262 }
1263
ACT_CalcScore(ACT_PlaceBoundsPtr abp)1264 NLM_EXTERN FloatHi ACT_CalcScore(ACT_PlaceBoundsPtr abp)
1265 {
1266 ACTProfilePtr app;
1267 ACT_TopScorePtr ats;
1268 Int4 i;
1269 Int4 j;
1270 FloatHi score;
1271
1272 app = abp->app;
1273 score = 0;
1274 for (i=1; i<abp->nprof; i++)
1275 {
1276 if (app == NULL)
1277 return 0;
1278 if (abp->boundarray[i-1] + app->len >= abp->boundarray[i])
1279 return 0;
1280 j=0;
1281 ats = abp->ats[i-1];
1282 while (j<abp->currpos[i-1])
1283 {
1284 if (ats == NULL)
1285 return 0;
1286 ats = ats->next;
1287 }
1288 score += app->confidence*ats->score;
1289 app = app->next;
1290 }
1291 return score;
1292 }
1293
ACT_FindPeakScores(FloatHiPtr scorearray,Int4 len)1294 NLM_EXTERN ACT_TopScorePtr ACT_FindPeakScores(FloatHiPtr scorearray, Int4 len)
1295 {
1296 ACT_TopScorePtr ats;
1297 ACT_TopScorePtr ats_head;
1298 ACT_TopScorePtr ats_new;
1299 ACT_TopScorePtr ats_newhead;
1300 ACT_TopScorePtr ats_newprev;
1301 ACT_TopScorePtr ats_prev;
1302 FloatHi diff;
1303 FloatHi diff_prev;
1304 Int4 i;
1305
1306 if (scorearray == NULL)
1307 return NULL;
1308 diff = diff_prev = 0;
1309 diff_prev = scorearray[1] - scorearray[0];
1310 ats_head = NULL;
1311 for (i=1; i<len-1; i++)
1312 {
1313 diff = scorearray[i+1]-scorearray[i];
1314 if (diff < 0 && diff_prev >= 0) /* peak */
1315 {
1316 ats = (ACT_TopScorePtr)MemNew(sizeof(ACT_TopScore));
1317 ats->score = scorearray[i-1];
1318 ats->pos = i-1;
1319 if (ats_head != NULL)
1320 {
1321 ats_prev->next = ats;
1322 ats_prev = ats;
1323 } else
1324 ats_head = ats_prev = ats;
1325 }
1326 diff_prev = diff;
1327 }
1328 ats = ats_prev = ats_head;
1329 ats = ats->next;
1330 ats_newhead = NULL;
1331 diff_prev = 0;
1332 while (ats)
1333 {
1334 diff = ats->score - ats_prev->score;
1335 if (diff < 0 && diff_prev >= 0)
1336 {
1337 ats_new = (ACT_TopScorePtr)MemNew(sizeof(ACT_TopScore));
1338 ats_new->score = ats_prev->score;
1339 ats_new->pos = ats_prev->pos;
1340 if (ats_newhead != NULL)
1341 {
1342 ats_newprev->next = ats_new;
1343 ats_newprev = ats_new;
1344 } else
1345 ats_newhead = ats_newprev = ats_new;
1346 }
1347 diff_prev = diff;
1348 ats_prev = ats;
1349 ats = ats->next;
1350 }
1351 ats_prev = ats_head;
1352 while (ats_prev)
1353 {
1354 ats = ats_prev->next;
1355 ats_prev->next = NULL;
1356 MemFree(ats_prev);
1357 ats_prev = ats;
1358 }
1359 return ats_newhead;
1360 }
1361
act_get_eval(Int4 exp)1362 NLM_EXTERN FloatHi act_get_eval(Int4 exp)
1363 {
1364 FloatHi eval;
1365 Int4 i;
1366
1367 eval = 1;
1368 for (i=1; i<=exp; i++)
1369 {
1370 eval = eval/10;
1371 }
1372 return eval;
1373 }
1374
am_isa_gap(Int4 start,Int4 prevstop,Uint1 strand)1375 static Boolean am_isa_gap(Int4 start, Int4 prevstop, Uint1 strand)
1376 {
1377 if (strand != Seq_strand_minus)
1378 {
1379 if (start > prevstop+1)
1380 return TRUE;
1381 else
1382 return FALSE;
1383 } else
1384 {
1385 if (prevstop > start+1)
1386 return TRUE;
1387 else
1388 return FALSE;
1389 }
1390 }
1391
am_fix_strand(SeqAlignPtr sap,Uint1 strand1,Uint1 strand2)1392 static void am_fix_strand(SeqAlignPtr sap, Uint1 strand1, Uint1 strand2)
1393 {
1394 DenseSegPtr dsp;
1395 Int4 i;
1396
1397 if (sap == NULL || strand1 == 0 || strand2 == 0)
1398 return;
1399 if (sap->segtype != SAS_DENSEG)
1400 return;
1401 dsp = (DenseSegPtr)(sap->segs);
1402 if (dsp->dim != 2)
1403 return;
1404 for (i=0; i<dsp->numseg; i++)
1405 {
1406 dsp->strands[i*2] = strand1;
1407 dsp->strands[(i*2) + 1] = strand2;
1408 }
1409 return;
1410 }
1411
1412 typedef struct sq_spin {
1413 Int4 n1;
1414 Int4 n2;
1415 Int4 n3;
1416 Int4 n4;
1417 Int4 n5;
1418 } SQN_n, PNTR SQN_nPtr;
1419
1420 #define SQN_LEFT 1
1421 #define SQN_RIGHT 2
1422 #define SQN_MIDDLE 3
1423
1424 #define SQN_MAXGAP 4
1425
1426 #define SQN_WINDOW 30 /* window in which to search for missing pieces */
1427
ACT_CompareSpins(VoidPtr ptr1,VoidPtr ptr2)1428 static int LIBCALLBACK ACT_CompareSpins (VoidPtr ptr1, VoidPtr ptr2)
1429 {
1430 SQN_nPtr spin1;
1431 SQN_nPtr spin2;
1432
1433 spin1 = *((SQN_nPtr PNTR) ptr1);
1434 spin2 = *((SQN_nPtr PNTR) ptr2);
1435 if (spin1 == NULL || spin2 == NULL)
1436 return 0;
1437 if (spin1->n3 > spin2->n3)
1438 return -1;
1439 if (spin1->n3 < spin2->n3)
1440 return 1;
1441 if (spin1->n2 < spin2->n2)
1442 return -1;
1443 if (spin1->n2 > spin2->n2)
1444 return 1;
1445 return 0;
1446 }
1447
1448
1449 /* spin structure: n1 = which alignment, n2 = start on first row, n3 =
1450 alignment length on 1st row, n4 = start on 2nd row, n5 = 2nd strand */
SetSpinValuesForSeqAlign(SeqAlignPtr salp,SQN_nPtr spin,Int4 list_pos,Int4 first_row)1451 static void SetSpinValuesForSeqAlign (SeqAlignPtr salp, SQN_nPtr spin, Int4 list_pos, Int4 first_row)
1452 {
1453 Int4 start;
1454 Int4 stop;
1455 Int4 strand;
1456
1457 if (salp == NULL || spin == NULL || first_row > 2)
1458 {
1459 return;
1460 }
1461
1462 spin->n1 = list_pos;
1463 AlnMgr2GetNthSeqRangeInSA(salp, first_row, &start, &stop);
1464 spin->n3 = stop - start;
1465 spin->n2 = start;
1466 AlnMgr2GetNthSeqRangeInSA(salp, 3-first_row, &start, &stop);
1467 spin->n4 = start;
1468 strand = AlnMgr2GetNthStrand(salp, 3-first_row);
1469 if (strand == Seq_strand_minus)
1470 spin->n5 = -1;
1471 else
1472 spin->n5 = 1;
1473
1474 }
1475
1476
1477 static Boolean
ResolveSpinConflict(SQN_nPtr spin1,SQN_nPtr spin2,SeqAlignPtr salp,Int4 first_row)1478 ResolveSpinConflict
1479 (SQN_nPtr spin1,
1480 SQN_nPtr spin2,
1481 SeqAlignPtr salp,
1482 Int4 first_row)
1483 {
1484 Boolean row1_conflict = FALSE, row2_conflict = FALSE;
1485 Int4 left_aln_overlap = 0;
1486 Int4 right_aln_overlap = 0;
1487 Int4 spin1_row1_start, spin1_row1_stop;
1488 Int4 spin2_row1_start, spin2_row1_stop;
1489 Int4 spin1_row2_start, spin1_row2_stop;
1490 Int4 spin2_row2_start, spin2_row2_stop;
1491 Int4 aln_len;
1492 Int4 strand;
1493 Boolean resolved = FALSE;
1494 SeqAlignPtr salp_next;
1495
1496 if (spin1 == NULL || spin2 == NULL || salp == NULL)
1497 {
1498 return FALSE;
1499 }
1500
1501 spin1_row1_start = spin1->n2;
1502 spin1_row1_stop = spin1->n2 + spin1->n3 - 1;
1503
1504 spin2_row1_start = spin2->n2;
1505 spin2_row1_stop = spin2->n2 + spin2->n3 - 1;
1506
1507 spin1_row2_start = spin1->n4;
1508 spin1_row2_stop = spin1->n4 + spin1->n3 - 1;
1509
1510 spin2_row2_start = spin2->n4;
1511 spin2_row2_stop = spin2->n4 + spin2->n3 - 1;
1512
1513 strand = spin1->n5;
1514
1515 aln_len = SeqAlignLength (salp);
1516
1517 /* NOTE - the spins were previously sorted, so it is impossible for
1518 * spin2 to be longer than spin1
1519 * Also, row 1 is required to be on the plus strand for both spins.
1520 * We want to find the amount by which the alignment should be truncated,
1521 * either on the left or on the right. If the alignment needs to be truncated
1522 * on both sides, there's probably something wrong.
1523 * The alignment should not overlap on the first row or the second row.
1524 */
1525
1526 /* make sure we aren't aligning out of order */
1527 if (strand == 1)
1528 {
1529 if ((spin1_row1_start > spin2_row1_start && spin1_row2_start < spin2_row2_start)
1530 || (spin1_row1_start < spin2_row1_start && spin1_row2_start > spin2_row2_start))
1531 {
1532 return FALSE;
1533 }
1534 }
1535 else
1536 {
1537 if ((spin1_row1_start > spin2_row1_start && spin1_row2_start > spin2_row2_start)
1538 || (spin1_row1_start < spin2_row1_start && spin1_row2_start < spin2_row2_start))
1539 {
1540 return FALSE;
1541 }
1542 }
1543
1544 /* check first for overlap on first row */
1545 /* see if second is contained in first */
1546 if (spin2_row1_start >= spin1_row1_start
1547 && spin2_row1_stop <= spin1_row1_stop)
1548 {
1549 /*
1550 * spin1 row 1: <---------->
1551 * spin2 row 1: <----->
1552 */
1553 /* spin2 is contained in spin1 - no overlap could be removed */
1554 return FALSE;
1555 }
1556 /* look for overlap on right */
1557 if (spin2_row1_stop > spin1_row1_start
1558 && spin2_row1_start < spin1_row1_start)
1559 {
1560 /*
1561 * spin1 row 1: <----->
1562 * spin2 row 1: <----->
1563 */
1564 right_aln_overlap = aln_len - AlnMgr2MapBioseqToSeqAlign (salp, spin1_row1_start, 1);
1565 }
1566 /* look for overlap on left */
1567 if (spin2_row1_start < spin1_row1_stop
1568 && spin2_row1_stop > spin1_row1_stop)
1569 {
1570 /*
1571 * spin1 row 1: <----->
1572 * spin2 row 1: <----->
1573 */
1574 left_aln_overlap = AlnMgr2MapBioseqToSeqAlign (salp, spin1_row1_stop, 1);
1575 }
1576
1577 /* check for overlap on second row */
1578 /* see if second is contained in first */
1579 if (spin2_row2_start >= spin1_row2_start
1580 && spin2_row2_stop <= spin1_row2_stop)
1581 {
1582 /*
1583 * spin1 row 2: <---------->
1584 * spin2 row 2: <----->
1585 */
1586 /* spin2 is contained in spin1 - no overlap could be removed */
1587 return FALSE;
1588 }
1589
1590 /* look for overlap on right */
1591 if (spin2_row2_start < spin1_row2_start
1592 && spin2_row2_stop > spin1_row2_start)
1593 {
1594 /*
1595 * spin1 row 2: <----->
1596 * spin2 row 2: <----->
1597 */
1598 if (strand == 1)
1599 {
1600 right_aln_overlap = MAX (right_aln_overlap,
1601 aln_len - AlnMgr2MapBioseqToSeqAlign (salp, spin1_row2_start, 1));
1602 }
1603 else
1604 {
1605 left_aln_overlap = MAX (left_aln_overlap, aln_len - AlnMgr2MapBioseqToSeqAlign (salp, spin1_row2_start, 1));
1606 }
1607 }
1608 /* look for overlap on left */
1609 if (spin2_row2_start < spin1_row2_stop
1610 && spin2_row2_stop > spin1_row2_stop)
1611 {
1612 /*
1613 * spin1 row 2: <----->
1614 * spin2 row 2: <----->
1615 */
1616 if (strand == 1)
1617 {
1618 left_aln_overlap = MAX (left_aln_overlap, AlnMgr2MapBioseqToSeqAlign (salp, spin1_row2_stop, 1));
1619 }
1620 else
1621 {
1622 right_aln_overlap = MAX (right_aln_overlap, AlnMgr2MapBioseqToSeqAlign (salp, spin1_row2_stop, 1));
1623 }
1624 }
1625
1626 /* NOTE - we only want to truncate/reindex the current alignment, not any that follow it in the chain.
1627 * For one thing, we might have already deleted the ones that come next.
1628 */
1629 salp_next = salp->next;
1630 salp->next = NULL;
1631 if (left_aln_overlap > 0 && right_aln_overlap == 0)
1632 {
1633 if (TruncateAlignment (salp, left_aln_overlap, TRUE))
1634 {
1635 SetSpinValuesForSeqAlign (salp, spin2, spin2->n1, first_row);
1636 resolved = TRUE;
1637 }
1638 }
1639 else if (left_aln_overlap == 0 && right_aln_overlap > 0)
1640 {
1641 if (TruncateAlignment (salp, right_aln_overlap, FALSE))
1642 {
1643 SetSpinValuesForSeqAlign (salp, spin2, spin2->n1, first_row);
1644 resolved = TRUE;
1645 }
1646 }
1647 /* restore chain */
1648 salp->next = salp_next;
1649
1650 return resolved;
1651 }
1652
1653
SpinsConflict(SQN_nPtr spin1,SQN_nPtr spin2,Int4 fuzz,Int4 strand)1654 static Boolean SpinsConflict (SQN_nPtr spin1, SQN_nPtr spin2, Int4 fuzz, Int4 strand)
1655 {
1656 Boolean conflict = FALSE;
1657
1658 if (spin1 == NULL || spin2 == NULL) return FALSE;
1659
1660 /* check first for conflict on first row */
1661 /* right of i greater than left of curr */
1662 if (spin1->n2 + spin1->n3 - 1 > spin2->n2 + fuzz)
1663 {
1664 /* left of i less than left of curr */
1665 if (spin1->n2 < spin2->n2) {
1666 /* i overlaps curr on the left */
1667 /* curr: [-----------) */
1668 /* i: (------] */
1669 conflict = TRUE;
1670 }
1671 }
1672 /* left of i less than right of curr - fuzz */
1673 if (spin1->n2 < spin2->n2 + spin2->n3 - 1 - fuzz)
1674 {
1675 /* right of i greater than right of curr */
1676 if (spin1->n2 + spin1->n3 - 1 > spin2->n2 + spin2->n3 - 1) {
1677 /* i overlaps curr on the right */
1678 /* curr: (-----------] */
1679 /* i: [------) */
1680 conflict = TRUE;
1681 }
1682 }
1683 /* left of i greater than or equal to left of curr */
1684 if (spin1->n2 >= spin2->n2)
1685 {
1686 /* right of i less than or equal to right of curr */
1687 if (spin1->n2 + spin1->n3 - 1 <= spin2->n2 + spin2->n3 - 1) {
1688 /* i is contained in curr */
1689 conflict = TRUE;
1690 }
1691 }
1692 /* then check for conflict and consistency on second row */
1693 /* right of i less than left of curr + fuzz */
1694 if (spin1->n2 + spin1->n3 - 1 < spin2->n2 + fuzz)
1695 {
1696 if (strand == 1)
1697 {
1698 /* right of i on row 2 greater than left of curr on row 2 */
1699 if (spin1->n4 + spin1->n3 - 1 > spin2->n4 + fuzz) {
1700 /* i: (-------] */
1701 /* curr: [------) */
1702 conflict = TRUE;
1703 }
1704 } else if (strand == -1)
1705 {
1706 if (spin2->n4 + spin2->n3 - 1 - fuzz > spin1->n4)
1707 conflict = TRUE;
1708 }
1709 }
1710 /* right of i greater than or equal to left of curr + fuzz */
1711 else
1712 {
1713 if (strand == 1)
1714 {
1715 /* left of i on row 2 less than right of curr - fuzz on row 2 */
1716 if (spin1->n4 < spin2->n4 + spin2->n3 - fuzz)
1717 conflict = TRUE;
1718 } else if (strand == -1)
1719 {
1720 if (spin1->n4 + spin1->n3 - 1 - fuzz > spin2->n4)
1721 conflict = TRUE;
1722 }
1723 }
1724
1725 /* make sure we aren't taking pieces out of order */
1726 if (strand == 1)
1727 {
1728 if ((spin1->n2 > spin2->n2 && spin1->n4 < spin2->n4)
1729 || (spin1->n2 < spin2->n2 && spin1->n4 > spin2->n4))
1730 {
1731 conflict = TRUE;
1732 }
1733 }
1734 else
1735 {
1736 if ((spin1->n2 > spin2->n2 && spin1->n4 > spin2->n4)
1737 || (spin1->n2 < spin2->n2 && spin1->n4 < spin2->n4))
1738 {
1739 conflict = TRUE;
1740 }
1741 }
1742 return conflict;
1743 }
1744
1745
ACT_RemoveInconsistentAlnsFromSet(SeqAlignPtr sap,Int4 fuzz,Int4 n)1746 extern void ACT_RemoveInconsistentAlnsFromSet (SeqAlignPtr sap, Int4 fuzz, Int4 n)
1747 {
1748 AMAlignIndex2Ptr amaip;
1749 Boolean conflict;
1750 Int4 curr;
1751 Int4 i;
1752 Int4 indextype;
1753 Int4 k;
1754 SeqAlignPtr salp;
1755 SeqAlignPtr salp_head;
1756 SeqAlignPtr salp_prev;
1757 SQN_nPtr PNTR spin;
1758 Int4 strand;
1759
1760 if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
1761 return;
1762 if (n > 2)
1763 return;
1764 amaip = (AMAlignIndex2Ptr)(sap->saip);
1765 indextype = amaip->alnstyle;
1766 /* make sure that everything is on the plus strand of the nth sequence */
1767 for (i=0; i<amaip->numsaps; i++)
1768 {
1769 salp = amaip->saps[i];
1770 strand = AlnMgr2GetNthStrand(salp, n);
1771 if (strand == Seq_strand_minus)
1772 {
1773 SAIndex2Free2(salp->saip);
1774 salp->saip = NULL;
1775 salp->next = NULL;
1776 SeqAlignListReverseStrand(salp);
1777 AlnMgr2IndexSingleChildSeqAlign(salp);
1778 }
1779 }
1780 /* spin structure: n1 = which alignment, n2 = start on first row, n3 =
1781 alignment length on 1st row, n4 = start on 2nd row, n5 = 2nd strand */
1782 spin = (SQN_nPtr PNTR)MemNew((amaip->numsaps)*sizeof(SQN_nPtr));
1783 for (i=0; i<amaip->numsaps; i++)
1784 {
1785 spin[i] = (SQN_nPtr)MemNew(sizeof(SQN_n));
1786 SetSpinValuesForSeqAlign (amaip->saps[i], spin[i], i, n);
1787 }
1788 HeapSort((Pointer)spin, (size_t)(amaip->numsaps), sizeof(SQN_nPtr), ACT_CompareSpins);
1789 strand = spin[0]->n5;
1790 for (i=1; i<amaip->numsaps; i++)
1791 {
1792 if (spin[i]->n5 != strand)
1793 {
1794 k = spin[i]->n1;
1795 salp = amaip->saps[k];
1796 salp->next = NULL;
1797 SeqAlignFree(salp);
1798 amaip->saps[k] = NULL;
1799 spin[i]->n1 = -1;
1800 /* fix links for previous alignments */
1801 while (k > 0 && amaip->saps[k - 1] == NULL) {
1802 k--;
1803 }
1804 if (k > 0)
1805 {
1806 if (k < amaip->numsaps - 1)
1807 {
1808 amaip->saps[k - 1]->next = amaip->saps[k + 1];
1809 }
1810 else
1811 {
1812 amaip->saps[k - 1]->next = NULL;
1813 }
1814 }
1815 }
1816 }
1817 for (curr=0; curr<amaip->numsaps; curr++)
1818 {
1819 if (spin[curr]->n1 != -1)
1820 {
1821 for (i=curr+1; i<amaip->numsaps; i++)
1822 {
1823 if (spin[i]->n1 != -1)
1824 {
1825 conflict = FALSE;
1826
1827 /* check first for conflict on first row */
1828 /* right of i greater than left of curr */
1829 if (spin[i]->n2 + spin[i]->n3 - 1 > spin[curr]->n2 + fuzz)
1830 {
1831 /* left of i less than left of curr */
1832 if (spin[i]->n2 < spin[curr]->n2) {
1833 /* i overlaps curr on the left */
1834 /* curr: [-----------) */
1835 /* i: (------] */
1836 conflict = TRUE;
1837 }
1838 }
1839 /* left of i less than right of curr - fuzz */
1840 if (spin[i]->n2 < spin[curr]->n2 + spin[curr]->n3 - 1 - fuzz)
1841 {
1842 /* right of i greater than right of curr */
1843 if (spin[i]->n2 + spin[i]->n3 - 1 > spin[curr]->n2 + spin[curr]->n3 - 1) {
1844 /* i overlaps curr on the right */
1845 /* curr: (-----------] */
1846 /* i: [------) */
1847 conflict = TRUE;
1848 }
1849 }
1850 /* left of i greater than or equal to left of curr */
1851 if (spin[i]->n2 >= spin[curr]->n2)
1852 {
1853 /* right of i less than or equal to right of curr */
1854 if (spin[i]->n2 + spin[i]->n3 - 1 <= spin[curr]->n2 + spin[curr]->n3 - 1) {
1855 /* i is contained in curr */
1856 conflict = TRUE;
1857 }
1858 }
1859 /* then check for conflict and consistency on second row */
1860 /* right of i less than left of curr + fuzz */
1861 if (spin[i]->n2 + spin[i]->n3 - 1 < spin[curr]->n2 + fuzz)
1862 {
1863 if (strand == 1)
1864 {
1865 /* right of i on row 2 greater than left of curr on row 2 */
1866 if (spin[i]->n4 + spin[i]->n3 - 1 > spin[curr]->n4 + fuzz) {
1867 /* i: (-------] */
1868 /* curr: [------) */
1869 conflict = TRUE;
1870 }
1871 } else if (strand == -1)
1872 {
1873 if (spin[curr]->n4 + spin[curr]->n3 - 1 - fuzz > spin[i]->n4)
1874 conflict = TRUE;
1875 }
1876 }
1877 /* right of i greater than or equal to left of curr + fuzz */
1878 else
1879 {
1880 if (strand == 1)
1881 {
1882 /* left of i on row 2 less than right of curr - fuzz on row 2 */
1883 if (spin[i]->n4 < spin[curr]->n4 + spin[curr]->n3 - fuzz)
1884 conflict = TRUE;
1885 } else if (strand == -1)
1886 {
1887 if (spin[i]->n4 + spin[i]->n3 - 1 - fuzz > spin[curr]->n4)
1888 conflict = TRUE;
1889 }
1890 }
1891
1892 /* make sure we aren't taking pieces out of order */
1893 if (strand == 1)
1894 {
1895 if ((spin[i]->n2 > spin[curr]->n2 && spin[i]->n4 < spin[curr]->n4)
1896 || (spin[i]->n2 < spin[curr]->n2 && spin[i]->n4 > spin[curr]->n4))
1897 {
1898 conflict = TRUE;
1899 }
1900 }
1901 else
1902 {
1903 if ((spin[i]->n2 > spin[curr]->n2 && spin[i]->n4 > spin[curr]->n4)
1904 || (spin[i]->n2 < spin[curr]->n2 && spin[i]->n4 < spin[curr]->n4))
1905 {
1906 conflict = TRUE;
1907 }
1908 }
1909
1910 if (conflict && ResolveSpinConflict (spin[curr], spin[i],
1911 amaip->saps[spin[i]->n1],
1912 n))
1913 {
1914 conflict = FALSE;
1915 }
1916
1917 if (conflict)
1918 {
1919 salp = amaip->saps[spin[i]->n1];
1920 salp->next = NULL;
1921 SeqAlignFree(salp);
1922 amaip->saps[spin[i]->n1] = NULL;
1923 spin[i]->n1 = -1;
1924 }
1925 }
1926 }
1927 }
1928 }
1929 salp_head = salp_prev = NULL;
1930 for (i=0; i<amaip->numsaps; i++)
1931 {
1932 MemFree(spin[i]);
1933 if (amaip->saps[i] != NULL)
1934 {
1935 amaip->saps[i]->next = NULL;
1936 if (salp_prev != NULL)
1937 {
1938 salp_prev->next = amaip->saps[i];
1939 salp_prev = salp_prev->next;
1940 } else
1941 salp_head = salp_prev = amaip->saps[i];
1942 }
1943 }
1944 sap->segs = (Pointer)(salp_head);
1945 if (indextype == AM2_LITE)
1946 {
1947 AMAlignIndex2Free2(sap->saip);
1948 sap->saip = NULL;
1949 AlnMgr2IndexLite(sap);
1950 } else
1951 AlnMgr2ReIndexSeqAlign(sap);
1952 MemFree(spin);
1953 }
1954
ACT_GetNthSeqRangeInSASet(SeqAlignPtr sap,Int4 n,Int4Ptr start,Int4Ptr stop)1955 NLM_EXTERN void ACT_GetNthSeqRangeInSASet(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop)
1956 {
1957 SeqAlignPtr salp;
1958 Int4 start_tmp;
1959 Int4 stop_tmp;
1960 Int4 tmp1;
1961 Int4 tmp2;
1962
1963 if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
1964 return;
1965 salp = (SeqAlignPtr)(sap->segs);
1966 start_tmp = stop_tmp = -1;
1967 while (salp != NULL)
1968 {
1969 if (n > salp->dim)
1970 {
1971 if (start)
1972 *start = -1;
1973 if (stop)
1974 *stop = -1;
1975 return;
1976 }
1977 AlnMgr2GetNthSeqRangeInSA(salp, n, &tmp1, &tmp2);
1978 if (tmp1 < start_tmp || start_tmp == -1)
1979 start_tmp = tmp1;
1980 if (tmp2 > stop_tmp)
1981 stop_tmp = tmp2;
1982 salp = salp->next;
1983 }
1984 if (start)
1985 *start = start_tmp;
1986 if (stop)
1987 *stop = stop_tmp;
1988 }
1989
ACT_FindBestAlnByDotPlot(SeqLocPtr slp1,SeqLocPtr slp2)1990 static SeqAlignPtr ACT_FindBestAlnByDotPlot(SeqLocPtr slp1, SeqLocPtr slp2)
1991 {
1992 DOTDiagPtr ddp;
1993 DenseSegPtr dsp;
1994 Int4 i;
1995 DOTMainDataPtr mip;
1996 SeqAlignPtr sap;
1997 SeqAlignPtr sap_head;
1998 SeqAlignPtr sap_prev;
1999 ScorePtr scp;
2000 Int4 start1;
2001 Int4 start2;
2002 Uint1 strand;
2003
2004 ErrSetMessageLevel(SEV_MAX);
2005 mip = DOT_CreateAndStorebyLoc (slp1, slp2, 6, 10);
2006 ErrSetMessageLevel(SEV_WARNING);
2007 sap = sap_head = sap_prev = NULL;
2008 if (mip == NULL || mip->hitlist == NULL)
2009 return NULL;
2010 i = 0;
2011 ddp = mip->hitlist[i];
2012 start1 = SeqLocStart(slp1);
2013 start2 = SeqLocStart(slp2);
2014 strand = SeqLocStrand(slp2);
2015 /* copy each ddp (a single ungapped alignment) into a one-segment dense-seg alignment */
2016 while (ddp != NULL && i < mip->index)
2017 {
2018 ddp = mip->hitlist[i];
2019 i++;
2020 sap = SeqAlignNew();
2021 dsp = DenseSegNew();
2022 sap->type = SAT_PARTIAL;
2023 sap->segtype = SAS_DENSEG;
2024 sap->dim = 2;
2025 dsp->dim = 2;
2026 dsp->numseg = 1;
2027 dsp->ids = SeqIdDup(SeqLocId(slp1));
2028 dsp->ids->next = SeqIdDup(SeqLocId(slp2));
2029 dsp->strands = (Uint1Ptr)MemNew(2*sizeof(Uint1));
2030 dsp->strands[0] = SeqLocStrand(slp1);
2031 dsp->strands[1] = SeqLocStrand(slp2);
2032 dsp->starts = (Int4Ptr)MemNew(2*sizeof(Int4));
2033 dsp->lens = (Int4Ptr)MemNew(sizeof(Int4));
2034 dsp->starts[0] = ddp->q_start;
2035 if (dsp->strands[1] == Seq_strand_minus)
2036 dsp->starts[1] = ddp->s_start - ddp->length + 1;
2037 else
2038 dsp->starts[1] = ddp->s_start;
2039 if (ddp->length > SeqLocLen(slp2))
2040 dsp->lens[0] = SeqLocLen(slp2);
2041 else
2042 dsp->lens[0] = ddp->length - 1;
2043 scp = ScoreNew();
2044 scp->id = ObjectIdNew();
2045 scp->id->str = StringSave("score");
2046 scp->choice = 1;
2047 scp->value.intvalue = ddp->score;
2048 dsp->scores = scp;
2049 sap->segs = (Pointer)(dsp);
2050 if (sap_head != NULL)
2051 {
2052 sap_prev->next = sap;
2053 sap_prev = sap;
2054 } else
2055 sap_head = sap_prev = sap;
2056 }
2057 if (sap_head == NULL)
2058 return NULL;
2059 AlnMgr2IndexLite(sap_head);
2060 ACT_RemoveInconsistentAlnsFromSet(sap_head, 6, 1);
2061 sap = (SeqAlignPtr)(sap_head->segs);
2062 sap_head->segs = NULL;
2063 SeqAlignFree(sap_head);
2064 MemFree(mip->matrix);
2065 MemFree(mip->qseq);
2066 MemFree(mip->sseq);
2067 MemFree(mip->qname);
2068 MemFree(mip->sname);
2069 i = 0;
2070 while (ddp != NULL && i < mip->index)
2071 {
2072 ddp = mip->hitlist[i];
2073 MemFree(ddp);
2074 i++;
2075 }
2076 MemFree(mip->hitlist);
2077 return sap;
2078 }
2079
2080 NLM_EXTERN SeqAlignPtr
ACT_FindPiece(BioseqPtr bsp1,BioseqPtr bsp2,Int4 start1,Int4 stop1,Int4 start2,Int4 stop2,Uint1 strand,Int4 which_side,GetAlignmentPieceFunc aln_piece_func)2081 ACT_FindPiece
2082 (BioseqPtr bsp1,
2083 BioseqPtr bsp2,
2084 Int4 start1,
2085 Int4 stop1,
2086 Int4 start2,
2087 Int4 stop2,
2088 Uint1 strand,
2089 Int4 which_side,
2090 GetAlignmentPieceFunc aln_piece_func)
2091 {
2092 AMAlignIndex2Ptr amaip;
2093 DenseSegPtr dsp;
2094 Int4 i;
2095 Int4 nstart1;
2096 Int4 nstart2;
2097 Int4 nstop1;
2098 Int4 nstop2;
2099 SeqAlignPtr sap;
2100 SeqAlignPtr sap_head;
2101 SeqAlignPtr sap_new;
2102 SeqAlignPtr sap_prev = NULL;
2103 SeqLocPtr slp1;
2104 SeqLocPtr slp2;
2105
2106 if (stop1 - start1 < 7 || stop2 - start2 < 7) /* can't do these by BLAST -- wordsize can't go that small */
2107 return NULL;
2108
2109 if (aln_piece_func == NULL)
2110 {
2111 return NULL;
2112 }
2113 slp1 = SeqLocIntNew(start1, stop1, Seq_strand_plus, bsp1->id);
2114 slp2 = SeqLocIntNew(start2, stop2, strand, bsp2->id);
2115 sap = aln_piece_func (slp1, slp2);
2116
2117 SeqLocFree(slp1);
2118 SeqLocFree(slp2);
2119
2120 if (sap == NULL)
2121 {
2122 /* no real alignment, so create a "dummy" alignment that lines up at
2123 * the requested side (left, right, or middle)
2124 */
2125 sap = SeqAlignNew();
2126 dsp = DenseSegNew();
2127 dsp->numseg = 1;
2128 dsp->starts = (Int4Ptr)MemNew(2*sizeof(Int4));
2129 dsp->lens = (Int4Ptr)MemNew(sizeof(Int4));
2130 dsp->strands = (Uint1Ptr)MemNew(2*sizeof(Uint1));
2131 dsp->dim = 2;
2132 dsp->ids = SeqIdDup(bsp1->id);
2133 dsp->ids->next = SeqIdDup(bsp2->id);
2134 dsp->lens[0] = MIN(stop1-start1+1, stop2-start2+1);
2135 dsp->strands[0] = dsp->strands[1] = Seq_strand_plus;
2136 if (which_side == SQN_LEFT || which_side == SQN_MIDDLE)
2137 {
2138 dsp->starts[0] = stop1 - dsp->lens[0] + 1;
2139 dsp->starts[1] = stop2 - dsp->lens[0] + 1;
2140 } else if (which_side == SQN_RIGHT)
2141 {
2142 dsp->starts[0] = start1;
2143 dsp->starts[1] = start2;
2144 }
2145 sap->segs = (Pointer)dsp;
2146 sap->segtype = SAS_DENSEG;
2147 }
2148 else if (sap->next != NULL)
2149 {
2150 /* combine the alignments returned (if there are more than one) into
2151 * a mini global alignment to cover the requested interval
2152 */
2153 AlnMgr2IndexLite(sap);
2154 ACT_RemoveInconsistentAlnsFromSet(sap, 20, 1);
2155 amaip = (AMAlignIndex2Ptr)(sap->saip);
2156 AlnMgr2SortAlnSetByNthRowPos(sap, 1);
2157 ACT_GetNthSeqRangeInSASet(sap, 1, &nstart1, &nstop1);
2158 ACT_GetNthSeqRangeInSASet(sap, 2, &nstart2, &nstop2);
2159 strand = AlnMgr2GetNthStrand(amaip->saps[0], 2);
2160 sap_head = NULL;
2161 if (strand != Seq_strand_minus)
2162 {
2163 if (nstart1 > start1+20 && nstart2 > start2+20)
2164 {
2165 slp1 = SeqLocIntNew(start1, nstart1, Seq_strand_plus, bsp1->id);
2166 slp2 = SeqLocIntNew(start2, nstart2, strand, bsp2->id);
2167 sap_head = ACT_FindBestAlnByDotPlot(slp1, slp2);
2168 SeqLocFree(slp1);
2169 SeqLocFree(slp2);
2170 }
2171 }
2172 else
2173 {
2174 if (nstart1 > start1+20 && nstop2 < stop2 - 20)
2175 {
2176 slp1 = SeqLocIntNew(start1, nstart1, Seq_strand_plus, bsp1->id);
2177 slp2 = SeqLocIntNew(nstop2, stop2, strand, bsp2->id);
2178 sap_head = ACT_FindBestAlnByDotPlot(slp1, slp2);
2179 SeqLocFree(slp1);
2180 SeqLocFree(slp2);
2181 }
2182 }
2183 for (i=0; i<amaip->numsaps-1; i++)
2184 {
2185 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 1, NULL, &nstart1);
2186 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 1, &nstop1, NULL);
2187 if (strand != Seq_strand_minus)
2188 {
2189 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, NULL, &nstart2);
2190 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 2, &nstop2, NULL);
2191 }
2192 else
2193 {
2194 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, &nstop2, NULL);
2195 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 2, NULL, &nstart2);
2196 }
2197 }
2198 ACT_GetNthSeqRangeInSASet(sap, 1, &nstart1, &nstop1);
2199 ACT_GetNthSeqRangeInSASet(sap, 2, &nstart2, &nstop2);
2200 sap_prev = sap_head;
2201 if (sap_prev != NULL)
2202 {
2203 while (sap_prev->next != NULL)
2204 {
2205 sap_prev = sap_prev->next;
2206 }
2207 }
2208 if (strand != Seq_strand_minus)
2209 {
2210 if (nstop1 < stop1-20 && nstop2 < stop2-20) /* missing piece at the end */
2211 {
2212 slp1 = SeqLocIntNew(nstop1, stop1, Seq_strand_plus, bsp1->id);
2213 slp2 = SeqLocIntNew(nstop2, stop2, strand, bsp2->id);
2214 sap_new = ACT_FindBestAlnByDotPlot(slp1, slp2);
2215 SeqLocFree(slp1);
2216 SeqLocFree(slp2);
2217 if (sap_new != NULL)
2218 {
2219 if (sap_head != NULL)
2220 {
2221 sap_prev->next = sap_new;
2222 sap_prev = sap_new;
2223 }
2224 else
2225 sap_head = sap_prev = sap_new;
2226 }
2227 }
2228 }
2229 else
2230 {
2231 if (nstop1 < stop1-20 && nstart2 > start2 + 20)
2232 {
2233 slp1 = SeqLocIntNew(nstop1, stop1, Seq_strand_plus, bsp1->id);
2234 slp2 = SeqLocIntNew(start2, nstart2, strand, bsp2->id);
2235 sap_new = ACT_FindBestAlnByDotPlot(slp1, slp2);
2236 SeqLocFree(slp1);
2237 SeqLocFree(slp2);
2238 if (sap_new != NULL)
2239 {
2240 if (sap_head != NULL)
2241 {
2242 sap_prev->next = sap_new;
2243 sap_prev = sap_new;
2244 }
2245 else
2246 sap_head = sap_prev = sap_new;
2247 }
2248 }
2249 }
2250 sap_new = (SeqAlignPtr)(sap->segs);
2251 while (sap_new->next != NULL)
2252 {
2253 sap_new = sap_new->next;
2254 }
2255 sap_new->next = sap_head;
2256 sap_head = (SeqAlignPtr)(sap->segs);
2257 sap->segs = NULL;
2258 SeqAlignFree(sap);
2259 sap = sap_head;
2260 }
2261 return sap;
2262 }
2263
ACT_ExtendAlnRight(SeqAlignPtr sap,Int4 which_row,Int4 start,Int4 stop)2264 static void ACT_ExtendAlnRight(SeqAlignPtr sap, Int4 which_row, Int4 start, Int4 stop)
2265 {
2266 DenseSegPtr dsp;
2267 Int4 i;
2268 Int4Ptr lens;
2269 Int4Ptr starts;
2270 Uint1Ptr strands;
2271
2272 if (sap == NULL)
2273 return;
2274 if (which_row > 2)
2275 return;
2276 dsp = (DenseSegPtr)(sap->segs);
2277 if (dsp->starts[2*(dsp->numseg-1) + which_row - 1] == -1 || dsp->starts[2*(dsp->numseg-1) + (2-which_row)] != -1)
2278 {
2279 starts = (Int4Ptr)MemNew((dsp->numseg+1)*2*sizeof(Int4));
2280 strands = (Uint1Ptr)MemNew((dsp->numseg+1)*2*sizeof(Uint1));
2281 lens = (Int4Ptr)MemNew((dsp->numseg+1)*sizeof(Int4));
2282 for (i=0; i<dsp->numseg; i++)
2283 {
2284 lens[i] = dsp->lens[i];
2285 }
2286 for (i=0; i<=(dsp->dim)*(dsp->numseg-1)+1; i++)
2287 {
2288 starts[i] = dsp->starts[i];
2289 strands[i] = dsp->strands[i];
2290 }
2291 lens[dsp->numseg] = stop - start + 1;
2292 if (dsp->strands[which_row-1] != Seq_strand_minus)
2293 starts[(dsp->dim)*(dsp->numseg) + which_row - 1] = start;
2294 else
2295 starts[(dsp->dim)*(dsp->numseg) + which_row - 1] = stop;
2296 starts[(dsp->dim)*(dsp->numseg) + (2-which_row)] = -1;
2297 strands[(dsp->dim)*(dsp->numseg) + which_row - 1] = dsp->strands[which_row-1];
2298 strands[(dsp->dim)*(dsp->numseg) + (2-which_row)] = dsp->strands[2-which_row];
2299 MemFree(dsp->starts);
2300 MemFree(dsp->lens);
2301 MemFree(dsp->strands);
2302 dsp->numseg++;
2303 dsp->starts = starts;
2304 dsp->strands = strands;
2305 dsp->lens = lens;
2306 } else
2307 {
2308 dsp->lens[dsp->numseg-1] += stop - start + 1;
2309 if (dsp->strands[which_row-1] == Seq_strand_minus)
2310 dsp->starts[(dsp->dim)*(dsp->numseg-1) + which_row - 1] = stop;
2311 }
2312 SAIndex2Free2(sap->saip);
2313 sap->saip = NULL;
2314 AlnMgr2IndexSingleChildSeqAlign(sap);
2315 }
2316
ACT_CreateContinuousAln(SeqAlignPtr PNTR saps,Int4 numsaps)2317 static SeqAlignPtr ACT_CreateContinuousAln(SeqAlignPtr PNTR saps, Int4 numsaps)
2318 {
2319 DenseSegPtr dsp;
2320 DenseSegPtr dsp_tmp;
2321 Int4 i;
2322 Int4 j;
2323 Int4 n1;
2324 Int4 n2;
2325 Int4 numseg;
2326 SeqAlignPtr salp;
2327 Int4 start1;
2328 Int4 start2;
2329 Int4 stop1;
2330 Int4 stop2;
2331 Uint1 strand;
2332
2333 for (i=0; i<numsaps-1; i++)
2334 {
2335 AlnMgr2GetNthSeqRangeInSA(saps[i], 1, &start1, &stop1);
2336 AlnMgr2GetNthSeqRangeInSA(saps[i+1], 1, &start2, &stop2);
2337 if (start2 - stop1 > 1)
2338 ACT_ExtendAlnRight(saps[i], 1, stop1+1, start2-1);
2339 AlnMgr2GetNthSeqRangeInSA(saps[i], 2, &start1, &stop1);
2340 AlnMgr2GetNthSeqRangeInSA(saps[i+1], 2, &start2, &stop2);
2341 strand = AlnMgr2GetNthStrand(saps[i], 2);
2342 if (strand == Seq_strand_minus)
2343 {
2344 if (start1 - stop2 > 1)
2345 ACT_ExtendAlnRight(saps[i], 2, stop2+1, start1-1);
2346 } else
2347 {
2348 if (start2 - stop1 > 1)
2349 ACT_ExtendAlnRight(saps[i], 2, stop1+1, start2-1);
2350 }
2351 }
2352 numseg = 0;
2353 for (i=0; i<numsaps; i++)
2354 {
2355 dsp_tmp = (DenseSegPtr)(saps[i]->segs);
2356 numseg += dsp_tmp->numseg;
2357 }
2358 dsp = DenseSegNew();
2359 dsp->dim = 2;
2360 dsp->numseg = numseg;
2361 dsp->starts = (Int4Ptr)MemNew(2*numseg*sizeof(Int4));
2362 dsp->lens = (Int4Ptr)MemNew(numseg*sizeof(Int4));
2363 dsp->strands = (Uint1Ptr)MemNew(2*numseg*sizeof(Uint1));
2364 n1 = n2 = 0;
2365 for (i=0; i<numsaps; i++)
2366 {
2367 dsp_tmp = (DenseSegPtr)(saps[i]->segs);
2368 if (dsp->ids == NULL)
2369 dsp->ids = SeqIdDupList(dsp_tmp->ids);
2370 for (j=0; j<2*dsp_tmp->numseg; j++)
2371 {
2372 dsp->starts[n1+j] = dsp_tmp->starts[j];
2373 dsp->strands[n1+j] = dsp_tmp->strands[j];
2374 }
2375 for (j=0; j<dsp_tmp->numseg; j++)
2376 {
2377 dsp->lens[n2+j] = dsp_tmp->lens[j];
2378 }
2379 n1 += 2*dsp_tmp->numseg;
2380 n2 += dsp_tmp->numseg;
2381 }
2382 salp = SeqAlignNew();
2383 salp->type = SAT_PARTIAL;
2384 salp->segtype = SAS_DENSEG;
2385 salp->dim = 2;
2386 salp->segs = (Pointer)(dsp);
2387 AlnMgr2IndexSingleChildSeqAlign(salp);
2388 return salp;
2389 }
2390
2391
DeleteAmaipSapI(AMAlignIndex2Ptr amaip,Int4 i)2392 static void DeleteAmaipSapI (AMAlignIndex2Ptr amaip, Int4 i)
2393 {
2394 Int4 j;
2395 if (amaip == NULL || i >= amaip->numsaps) {
2396 return;
2397 }
2398
2399 amaip->saps[i] = SeqAlignFree (amaip->saps[i]);
2400 for (j = i + 1; j < amaip->numsaps; j++) {
2401 amaip->saps[j - 1] = amaip->saps[j];
2402 }
2403 amaip->numsaps --;
2404 }
2405
2406
ACT_CleanUpAlignments(SeqAlignPtr sap,Int4 len1,Int4 len2)2407 NLM_EXTERN SeqAlignPtr ACT_CleanUpAlignments(SeqAlignPtr sap, Int4 len1, Int4 len2)
2408 {
2409 AMAlignIndex2Ptr amaip;
2410 Int4 diff;
2411 DenseSegPtr dsp;
2412 Int4 i;
2413 Int4 numseg;
2414 Int4 start1;
2415 Int4 start2;
2416 Int4 stop1;
2417 Int4 stop2;
2418 Uint1 strand;
2419 Int4 tmp;
2420
2421 if (sap == NULL)
2422 return NULL;
2423 AlnMgr2SortAlnSetByNthRowPos(sap, 1);
2424 amaip = (AMAlignIndex2Ptr)(sap->saip);
2425 strand = AlnMgr2GetNthStrand(amaip->saps[0], 2);
2426 numseg = 0;
2427 AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 1, &start1, NULL);
2428 AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 2, &start2, &stop2);
2429 if (strand != Seq_strand_minus)
2430 diff = start2;
2431 else
2432 diff = len2 - stop2;
2433 if (start1 > 0 && diff > 0)
2434 numseg += 2;
2435 for (i=0; i<amaip->numsaps-1; i++)
2436 {
2437 dsp = (DenseSegPtr)(amaip->saps[i]->segs);
2438 numseg += dsp->numseg;
2439 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 1, NULL, &start1);
2440 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 1, &stop1, &tmp);
2441 if (stop1 < start1+1)
2442 AlnMgr2TruncateSAP(amaip->saps[i+1], start1+1, tmp, 1);
2443 if (strand != Seq_strand_minus)
2444 {
2445 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, NULL, &start2);
2446 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 2, &stop2, &tmp);
2447 if (stop2 < start2+1) {
2448 if (tmp <= start2) {
2449 /* alignment is contained in parent, remove completely */
2450 DeleteAmaipSapI (amaip, i + 1);
2451 /* need to compare this with the one after it */
2452 i--;
2453 continue;
2454 } else {
2455 AlnMgr2TruncateSAP(amaip->saps[i+1], start2+1, tmp, 2);
2456 }
2457 }
2458 } else
2459 {
2460 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, &stop2, &tmp);
2461 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 2, NULL, &start2);
2462 if (stop2 < start2 + 1) {
2463 if (tmp <= start2) {
2464 /* alignment is contained in parent, remove completely */
2465 DeleteAmaipSapI (amaip, i);
2466 /* need to compare this with the one after it */
2467 i--;
2468 continue;
2469 } else {
2470 AlnMgr2TruncateSAP(amaip->saps[i], start2+1, tmp, 2);
2471 }
2472 }
2473 }
2474 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 1, NULL, &start1);
2475 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 1, &stop1, &tmp);
2476 if (strand != Seq_strand_minus)
2477 {
2478 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, NULL, &start2);
2479 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 2, &stop2, &tmp);
2480 } else
2481 {
2482 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, &stop2, &tmp);
2483 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 2, NULL, &start2);
2484 }
2485 if (stop1 > start1+1)
2486 numseg++;
2487 if (stop2 > start2+1)
2488 numseg++;
2489 }
2490 dsp = (DenseSegPtr)(amaip->saps[amaip->numsaps-1]->segs);
2491 numseg += dsp->numseg;
2492 AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 1, NULL, &stop1);
2493 AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 2, &start2, &stop2);
2494 if (strand != Seq_strand_minus)
2495 diff = len2 - stop2;
2496 else
2497 diff = start2;
2498 if (stop1 < len1 && diff > 0)
2499 numseg += 2;
2500 AlnMgr2SortAlnSetByNthRowPos(sap, 1);
2501 return (ACT_CreateContinuousAln(amaip->saps, amaip->numsaps));
2502 }
2503
SQN_AddToAln(SeqAlignPtr sap,Int4 offset,Int2 which_end,Uint1 strand)2504 static void SQN_AddToAln(SeqAlignPtr sap, Int4 offset, Int2 which_end, Uint1 strand)
2505 {
2506 DenseSegPtr dsp;
2507 Int4Ptr lens;
2508 Int4 i;
2509 Int4 j;
2510 Int4Ptr starts;
2511 Uint1Ptr strands;
2512
2513 if (sap == NULL || offset == 0)
2514 return;
2515 dsp = (DenseSegPtr)(sap->segs);
2516 if (which_end == SQN_LEFT)
2517 {
2518 if (dsp->starts[0] != -1 && dsp->starts[1] != -1) /* neither sequence is gapped */
2519 {
2520 dsp->starts[0] -= offset;
2521 if (strand != Seq_strand_minus)
2522 dsp->starts[1] -= offset;
2523 dsp->lens[0] += offset;
2524 } else /* one of the sequences is gapped -> add a new segment */
2525 {
2526 starts = (Int4Ptr)MemNew(2*(dsp->numseg+1)*sizeof(Int4));
2527 lens = (Int4Ptr)MemNew((dsp->numseg+1)*sizeof(Int4));
2528 strands = (Uint1Ptr)MemNew(2*(dsp->numseg+1)*sizeof(Uint1));
2529 AlnMgr2GetNthSeqRangeInSA(sap, 1, &i, &j);
2530 starts[0] = i - offset;
2531 AlnMgr2GetNthSeqRangeInSA(sap, 1, &i, &j);
2532 if (strand == Seq_strand_minus)
2533 starts[1] = j + 1;
2534 else
2535 starts[1] = i - offset;
2536 lens[0] = offset;
2537 strands[0] = Seq_strand_plus;
2538 strands[1] = strand;
2539 for (i=0; i<dsp->numseg; i++)
2540 {
2541 starts[i+1] = dsp->starts[i];
2542 starts[2*(i+1)] = dsp->starts[2*i];
2543 lens[i+1] = dsp->lens[i];
2544 strands[i+1] = dsp->strands[i];
2545 strands[2*(i+1)] = dsp->strands[2*i];
2546 }
2547 dsp->numseg++;
2548 MemFree(dsp->starts);
2549 MemFree(dsp->lens);
2550 MemFree(dsp->strands);
2551 dsp->starts = starts;
2552 dsp->lens = lens;
2553 dsp->strands = strands;
2554 }
2555 } else if (which_end == SQN_RIGHT)
2556 {
2557 if (dsp->starts[2*(dsp->numseg-1)] != -1 && dsp->starts[2*(dsp->numseg-1)+1] != -1)
2558 {
2559 dsp->lens[dsp->numseg-1] += offset;
2560 if (strand == Seq_strand_minus)
2561 dsp->starts[2*(dsp->numseg-1)+1] -= offset;
2562 } else /* one of the sequences is gapped -> add a new segment */
2563 {
2564 starts = (Int4Ptr)MemNew(2*(dsp->numseg+1)*sizeof(Int4));
2565 lens = (Int4Ptr)MemNew((dsp->numseg+1)*sizeof(Int4));
2566 strands = (Uint1Ptr)MemNew(2*(dsp->numseg+1)*sizeof(Uint1));
2567 AlnMgr2GetNthSeqRangeInSA(sap, 1, &i, &j);
2568 starts[2*(dsp->numseg)-1] = i+1;
2569 AlnMgr2GetNthSeqRangeInSA(sap, 2, &i, &j);
2570 if (strand == Seq_strand_minus)
2571 starts[2*(dsp->numseg)] = i - offset;
2572 else
2573 starts[2*(dsp->numseg)] = j + 1;
2574 lens[dsp->numseg] = offset;
2575 strands[2*(dsp->numseg)-1] = Seq_strand_plus;
2576 strands[2*(dsp->numseg)] = strand;
2577 for (i=0; i<dsp->numseg; i++)
2578 {
2579 starts[i] = dsp->starts[i];
2580 starts[2*i] = dsp->starts[2*i];
2581 lens[i] = dsp->lens[i];
2582 strands[i] = dsp->strands[i];
2583 strands[2*i] = dsp->strands[2*i];
2584 }
2585 dsp->numseg++;
2586 MemFree(dsp->starts);
2587 MemFree(dsp->lens);
2588 MemFree(dsp->strands);
2589 dsp->starts = starts;
2590 dsp->lens = lens;
2591 dsp->strands = strands;
2592 }
2593 }
2594 /* free the old index and reindex the alignment */
2595 SAIndex2Free2(sap->saip);
2596 sap->saip = NULL;
2597 AlnMgr2IndexSingleChildSeqAlign(sap);
2598 }
2599
SQN_ExtendAlnAlg(SeqAlignPtr sap,Int4 ovl,Int4 which_side,Uint1 strand)2600 NLM_EXTERN void SQN_ExtendAlnAlg(SeqAlignPtr sap, Int4 ovl, Int4 which_side, Uint1 strand)
2601 {
2602 BioseqPtr bsp1;
2603 BioseqPtr bsp2;
2604 Uint1 buf1[20];
2605 Uint1 buf2[20];
2606 DenseSegPtr dsp;
2607 DenseSegPtr dsp_new;
2608 Int4 gap;
2609 Int4 i;
2610 Int4 j;
2611 Boolean mismatch;
2612 SeqIdPtr sip1;
2613 SeqIdPtr sip2;
2614 SeqPortPtr spp;
2615 Int4 start1;
2616 Int4 start2;
2617 Int4 stop1;
2618 Int4 stop2;
2619
2620 if (sap == NULL || ovl == 0)
2621 return;
2622 AlnMgr2GetNthSeqRangeInSA(sap, 1, &start1, &stop1);
2623 AlnMgr2GetNthSeqRangeInSA(sap, 2, &start2, &stop2);
2624 sip1 = AlnMgr2GetNthSeqIdPtr(sap, 1);
2625 sip2 = AlnMgr2GetNthSeqIdPtr(sap, 2);
2626 bsp1 = BioseqLockById(sip1);
2627 bsp2 = BioseqLockById(sip2);
2628 if (which_side == SQN_LEFT && (start1<ovl+SQN_MAXGAP || start2<ovl+SQN_MAXGAP))
2629 {
2630 if (start1 < ovl)
2631 ovl = start1;
2632 if (start2 < ovl)
2633 ovl = start2;
2634 if (ovl <= 0) {
2635 SeqIdFree(sip1);
2636 SeqIdFree(sip2);
2637 BioseqUnlock(bsp1);
2638 BioseqUnlock(bsp2);
2639 return;
2640 }
2641 dsp = (DenseSegPtr)(sap->segs);
2642 dsp_new = DenseSegNew();
2643 dsp_new->dim = 2;
2644 dsp_new->numseg = dsp->numseg + 1;
2645 dsp_new->starts = (Int4Ptr)MemNew(2*(dsp_new->numseg)*sizeof(Int4));
2646 dsp_new->lens = (Int4Ptr)MemNew((dsp_new->numseg)*sizeof(Int4));
2647 dsp_new->strands = (Uint1Ptr)MemNew(2*(dsp_new->numseg)*sizeof(Int4));
2648 dsp_new->ids = dsp->ids;
2649 dsp->ids = NULL;
2650 dsp_new->starts[0] = start1-ovl;
2651 dsp_new->starts[1] = start2-ovl;
2652 dsp_new->lens[0] = ovl;
2653 dsp_new->strands[0] = dsp_new->strands[1] = Seq_strand_plus;
2654 for (i=1; i<dsp_new->numseg; i++)
2655 {
2656 dsp_new->lens[i] = dsp->lens[i-1];
2657 dsp_new->starts[i*dsp->dim] = dsp->starts[(i-1)*dsp->dim];
2658 dsp_new->starts[i*dsp->dim+1] = dsp->starts[(i-1)*dsp->dim+1];
2659 dsp_new->strands[i*dsp->dim] = dsp_new->strands[i*dsp->dim+1] = Seq_strand_plus;
2660 }
2661 DenseSegFree(dsp);
2662 sap->segs = (Pointer)(dsp_new);
2663 SAIndex2Free2(sap->saip);
2664 sap->saip = NULL;
2665 AlnMgr2IndexSingleChildSeqAlign(sap);
2666 SeqIdFree(sip1);
2667 SeqIdFree(sip2);
2668 BioseqUnlock(bsp1);
2669 BioseqUnlock(bsp2);
2670 return;
2671 } else if (which_side == SQN_RIGHT && (bsp1->length-1-stop1 < ovl+SQN_MAXGAP || bsp2->length-1-stop2<ovl+SQN_MAXGAP))
2672 {
2673 if (bsp1->length-1-stop1<ovl)
2674 ovl = bsp1->length-1-stop1;
2675 if (bsp2->length-1-stop2<ovl)
2676 ovl = bsp2->length-1-stop2;
2677 if (ovl <= 0) {
2678 SeqIdFree(sip1);
2679 SeqIdFree(sip2);
2680 BioseqUnlock(bsp1);
2681 BioseqUnlock(bsp2);
2682 return;
2683 }
2684 dsp = (DenseSegPtr)(sap->segs);
2685 dsp_new = DenseSegNew();
2686 dsp_new->dim = 2;
2687 dsp_new->numseg = dsp->numseg + 1;
2688 dsp_new->starts = (Int4Ptr)MemNew(2*(dsp_new->numseg)*sizeof(Int4));
2689 dsp_new->lens = (Int4Ptr)MemNew((dsp_new->numseg)*sizeof(Int4));
2690 dsp_new->strands = (Uint1Ptr)MemNew(2*(dsp_new->numseg)*sizeof(Int4));
2691 dsp_new->ids = dsp->ids;
2692 dsp->ids = NULL;
2693 for (i=0; i<dsp->numseg; i++)
2694 {
2695 dsp_new->lens[i] = dsp->lens[i];
2696 dsp_new->starts[i*dsp->dim] = dsp->starts[i*dsp->dim];
2697 dsp_new->starts[i*dsp->dim+1] = dsp->starts[i*dsp->dim+1];
2698 dsp_new->strands[i*dsp->dim] = dsp_new->strands[i*dsp->dim+1] = Seq_strand_plus;
2699 }
2700 dsp_new->lens[dsp_new->numseg-1] = ovl;
2701 dsp_new->starts[(dsp_new->numseg-1)*2] = stop1+1;
2702 dsp_new->starts[(dsp_new->numseg-1)*2+1] = stop2+1;
2703 dsp_new->strands[(dsp_new->numseg-1)*2] = dsp_new->strands[(dsp_new->numseg-1)*2+1] = Seq_strand_plus;
2704 DenseSegFree(dsp);
2705 sap->segs = (Pointer)dsp_new;
2706 SAIndex2Free2(sap->saip);
2707 sap->saip = NULL;
2708 AlnMgr2IndexSingleChildSeqAlign(sap);
2709 SeqIdFree(sip1);
2710 SeqIdFree(sip2);
2711 BioseqUnlock(bsp1);
2712 BioseqUnlock(bsp2);
2713 return;
2714 }
2715 if (which_side == SQN_LEFT)
2716 {
2717 spp = SeqPortNew(bsp1, MAX(0, start1-(SQN_MAXGAP+ovl)), start1-1, Seq_strand_plus, Seq_code_ncbi4na);
2718 SeqPortRead(spp, buf1, 20);
2719 SeqPortFree(spp);
2720 if (strand == Seq_strand_minus)
2721 {
2722 spp = SeqPortNew(bsp2, stop2+1, stop2 + 1 + ovl, strand, Seq_code_ncbi4na);
2723 SeqPortRead(spp, buf2, 20);
2724 SeqPortFree(spp);
2725 } else
2726 {
2727 spp = SeqPortNew(bsp2, start2-1-ovl, start2-1, strand, Seq_code_ncbi4na);
2728 SeqPortRead(spp, buf2, 20);
2729 SeqPortFree(spp);
2730 }
2731 gap = -1;
2732 for (i=0; i<SQN_MAXGAP; i++)
2733 {
2734 mismatch = FALSE;
2735 for (j=0; j<ovl; j++)
2736 {
2737 if (buf2[j] != buf1[i+j])
2738 mismatch = TRUE;
2739 }
2740 if (mismatch == FALSE)
2741 gap = SQN_MAXGAP-i;
2742 }
2743 if (gap > 0)
2744 {
2745 dsp = (DenseSegPtr)(sap->segs);
2746 dsp_new = DenseSegNew();
2747 dsp_new->dim = 2;
2748 dsp_new->numseg = dsp->numseg+2;
2749 dsp_new->ids = dsp->ids;
2750 dsp->ids = NULL;
2751 dsp_new->starts = (Int4Ptr)MemNew((dsp_new->numseg)*2*sizeof(Int4));
2752 dsp_new->lens = (Int4Ptr)MemNew((dsp_new->numseg)*sizeof(Int4));
2753 dsp_new->strands = (Uint1Ptr)MemNew((dsp_new->numseg)*2*sizeof(Uint1));
2754 for (i=2; i<dsp_new->numseg; i++)
2755 {
2756 dsp_new->starts[i*2] = dsp->starts[(i-2)*2];
2757 dsp_new->starts[i*2+1] = dsp->starts[(i-2)*2+1];
2758 dsp_new->strands[i+2] = dsp->strands[(i-2)*2];
2759 dsp_new->strands[i*2+1] = dsp->strands[(i-2)*2+1];
2760 dsp_new->lens[i] = dsp->lens[i-2];
2761 }
2762 dsp_new->starts[0] = dsp->starts[0] - gap - ovl;
2763 dsp_new->starts[2] = dsp_new->starts[0] + ovl;
2764 dsp_new->starts[3] = -1;
2765 dsp_new->strands[0] = dsp_new->strands[2] = dsp->strands[0];
2766 dsp_new->strands[1] = dsp_new->strands[2] = dsp->strands[1];
2767 dsp_new->lens[0] = ovl;
2768 dsp_new->lens[1] = gap;
2769 if (strand == Seq_strand_minus)
2770 dsp_new->starts[1] = stop2 + 1;
2771 else
2772 dsp_new->starts[1] = start2 - ovl;
2773 sap->segs = (Pointer)dsp_new;
2774 DenseSegFree(dsp);
2775 SAIndex2Free2(sap->saip);
2776 sap->saip = NULL;
2777 AlnMgr2IndexSingleChildSeqAlign(sap);
2778 } else
2779 SQN_AddToAln(sap, ovl, SQN_LEFT, strand);
2780 } else if (which_side == SQN_RIGHT)
2781 {
2782 spp = SeqPortNew(bsp1, MIN(stop1, bsp1->length-1), MIN(stop1+SQN_MAXGAP, bsp1->length-1), Seq_strand_plus, Seq_code_ncbi4na);
2783 SeqPortRead(spp, buf1, 20);
2784 SeqPortFree(spp);
2785 if (strand == Seq_strand_minus)
2786 {
2787 spp = SeqPortNew(bsp2, start2-ovl, start2-1-ovl, strand, Seq_code_ncbi4na);
2788 SeqPortRead(spp, buf2, 20);
2789 SeqPortFree(spp);
2790 } else
2791 {
2792 spp = SeqPortNew(bsp2, MIN(stop2+1, bsp2->length-1), MIN(stop2+1+ovl, bsp2->length-1), strand, Seq_code_ncbi4na);
2793 SeqPortRead(spp, buf2, 20);
2794 SeqPortFree(spp);
2795 }
2796 gap = -1;
2797 for (i=0; i<SQN_MAXGAP; i++)
2798 {
2799 mismatch = FALSE;
2800 for (j=0; j<ovl; j++)
2801 {
2802 if (buf1[i+j] != buf1[j])
2803 mismatch = TRUE;
2804 }
2805 if (mismatch == FALSE && gap == -1)
2806 gap = i;
2807 }
2808 if (gap > 0)
2809 {
2810 dsp = (DenseSegPtr)(sap->segs);
2811 dsp_new = DenseSegNew();
2812 dsp_new->dim = 2;
2813 dsp_new->numseg = dsp->numseg+2;
2814 dsp_new->ids = dsp->ids;
2815 dsp->ids = NULL;
2816 dsp_new->starts = (Int4Ptr)MemNew((dsp_new->numseg)*2*sizeof(Int4));
2817 dsp_new->lens = (Int4Ptr)MemNew((dsp_new->numseg)*sizeof(Int4));
2818 dsp_new->strands = (Uint1Ptr)MemNew((dsp_new->numseg)*2*sizeof(Uint1));
2819 for (i=0; i<dsp->numseg; i++)
2820 {
2821 dsp_new->starts[i*2] = dsp->starts[i*2];
2822 dsp_new->starts[i*2+1] = dsp->starts[i*2+1];
2823 dsp_new->strands[i+2] = dsp->strands[i*2];
2824 dsp_new->strands[i*2+1] = dsp->strands[i*2+1];
2825 dsp_new->lens[i] = dsp->lens[i];
2826 }
2827 dsp_new->strands[2*dsp->numseg] = dsp_new->strands[2*dsp->numseg+2] = dsp->strands[0];
2828 dsp_new->strands[2*dsp->numseg+1] = dsp_new->strands[2*dsp->numseg+3] = dsp->strands[1];
2829 dsp_new->lens[dsp->numseg] = gap;
2830 dsp_new->lens[dsp->numseg+1] = ovl;
2831 dsp_new->starts[2*dsp->numseg] = dsp_new->starts[2*(dsp->numseg-1)]+dsp_new->lens[dsp->numseg-1];
2832 dsp_new->starts[2*dsp->numseg+2] = dsp_new->starts[2*dsp->numseg] + gap;
2833 if (strand == Seq_strand_minus)
2834 dsp_new->starts[2*dsp->numseg+3] = start2 - ovl;
2835 else
2836 dsp_new->starts[2*dsp->numseg+3] = stop2 + 1;
2837 dsp_new->starts[2*dsp->numseg+1] = -1;
2838 sap->segs = (Pointer)dsp_new;
2839 DenseSegFree(dsp);
2840 SAIndex2Free2(sap->saip);
2841 sap->saip = NULL;
2842 AlnMgr2IndexSingleChildSeqAlign(sap);
2843 } else
2844 SQN_AddToAln(sap, ovl, SQN_RIGHT, strand);
2845 }
2846 SeqIdFree(sip1);
2847 SeqIdFree(sip2);
2848 BioseqUnlock(bsp1);
2849 BioseqUnlock(bsp2);
2850 }
2851
sqn_binary_search_on_uint4_list(Uint4Ptr list,Uint4 pos,Uint4 listlen)2852 static Uint4 sqn_binary_search_on_uint4_list(Uint4Ptr list, Uint4 pos, Uint4 listlen)
2853 {
2854 Uint4 L;
2855 Uint4 mid;
2856 Uint4 R;
2857
2858 if (list == NULL || listlen == 0)
2859 return 0;
2860 L = 0;
2861 R = listlen - 1;
2862 while (L < R)
2863 {
2864 mid = (L+R)/2;
2865 if (list[mid + 1] <= pos)
2866 {
2867 L = mid + 1;
2868 } else
2869 {
2870 R = mid;
2871 }
2872 }
2873 return R;
2874 }
2875
MapRowCoordsSpecial(SeqAlignPtr sap,Uint4 pos,Int4 row,Int4 which_end)2876 static Int4 MapRowCoordsSpecial(SeqAlignPtr sap, Uint4 pos, Int4 row, Int4 which_end)
2877 {
2878 DenseSegPtr dsp;
2879 Int4 idx;
2880 Int4 offset;
2881 SAIndex2Ptr saip;
2882 Int4 start;
2883
2884 if (sap == NULL || row < 0)
2885 return -1;
2886 if (sap->saip == NULL)
2887 return -1;
2888 saip = (SAIndex2Ptr)sap->saip;
2889 dsp = (DenseSegPtr)sap->segs;
2890 start = sqn_binary_search_on_uint4_list(saip->aligncoords, pos, dsp->numseg);
2891 offset = pos - saip->aligncoords[start];
2892 idx = (dsp->dim*start) + row - 1;
2893 if (dsp->starts[idx] == -1)
2894 {
2895 if (which_end == SQN_RIGHT)
2896 {
2897 /* round down */
2898 while (start >= 0) {
2899 idx = (dsp->dim*start) + row - 1;
2900 if (dsp->starts[idx] != -1)
2901 return (dsp->starts[idx] + dsp->lens[start] - 1);
2902 start--;
2903 }
2904 return -2;
2905 } else if (which_end == SQN_LEFT)
2906 {
2907 /* round up */
2908 while (start < dsp->numseg) {
2909 idx = (dsp->dim*start) + row - 1;
2910 if (dsp->starts[idx] != -1)
2911 return (dsp->starts[idx]);
2912 start++;
2913 }
2914 return -2;
2915 }
2916 } else
2917 {
2918 idx = (dsp->dim*start) + row - 1;
2919 if (dsp->strands[idx] != Seq_strand_minus)
2920 return (dsp->starts[idx] + offset);
2921 else
2922 return (dsp->starts[idx] + dsp->lens[start] - 1 - offset);
2923 }
2924 return -1;
2925 }
2926
MapBioseqToBioseqSpecial(SeqAlignPtr sap,Int4 begin,Int4 fin,Int4 pos,Int4 which_end)2927 static Int4 MapBioseqToBioseqSpecial(SeqAlignPtr sap, Int4 begin, Int4 fin, Int4 pos, Int4 which_end)
2928 {
2929 Int4 bspos;
2930 Int4 sapos;
2931 Int4 start1;
2932 Int4 start2;
2933 Int4 stop1;
2934 Int4 stop2;
2935
2936 if (sap == NULL || sap->saip == NULL)
2937 return -2;
2938 AlnMgr2GetNthSeqRangeInSA(sap, begin, &start1, &stop1);
2939 AlnMgr2GetNthSeqRangeInSA(sap, fin, &start2, &stop2);
2940 /* check to see whether the position is outside the alignment */
2941 if (pos < start1)
2942 return (start2 - (start1 - pos));
2943 else if (pos > stop1)
2944 return (stop2 + (pos-stop1));
2945 sapos = AlnMgr2MapBioseqToSeqAlign(sap, pos, begin);
2946 bspos = MapRowCoordsSpecial(sap, sapos, fin, which_end);
2947 if (bspos >= 0)
2948 return bspos;
2949 else if (which_end == SQN_LEFT)
2950 return (start2-1);
2951 else if (which_end == SQN_RIGHT)
2952 return (stop2+1);
2953 else
2954 return 0;
2955 }
2956
SPI_flip_sa_list(SeqAlignPtr sap)2957 static void SPI_flip_sa_list (SeqAlignPtr sap)
2958 {
2959 DenseSegPtr dsp;
2960 Int4 i;
2961 SeqIdPtr sip;
2962 SeqIdPtr sip_next;
2963 Int4 tmp_start;
2964 Uint1 tmp_strand;
2965
2966 if (sap == NULL || sap->segtype != SAS_DENSEG)
2967 return;
2968 while (sap != NULL)
2969 {
2970 dsp = (DenseSegPtr)(sap->segs);
2971 if (dsp->dim == 2) /* skip anything with more than 2 rows */
2972 {
2973 /* first switch the ids */
2974 sip = dsp->ids;
2975 sip_next = sip->next;
2976 sip_next->next = sip;
2977 sip->next = NULL;
2978 dsp->ids = sip_next;
2979 /* then switch the starts and strands */
2980 for (i = 0; i<dsp->numseg; i++)
2981 {
2982 tmp_start = dsp->starts[2*i];
2983 dsp->starts[2*i] = dsp->starts[2*i+1];
2984 dsp->starts[2*i+1] = tmp_start;
2985 tmp_strand = dsp->strands[2*i];
2986 dsp->strands[2*i] = dsp->strands[2*i+1];
2987 dsp->strands[2*i+1] = tmp_strand;
2988 }
2989 }
2990 if (sap->saip != NULL) /* free indexes, reindex */
2991 {
2992 SAIndex2Free2(sap->saip);
2993 sap->saip = NULL;
2994 AlnMgr2IndexSingleChildSeqAlign(sap);
2995 }
2996 sap = sap->next;
2997 }
2998 }
2999
ReverseBioseqFeatureStrands(BioseqPtr bsp)3000 NLM_EXTERN void ReverseBioseqFeatureStrands (BioseqPtr bsp)
3001 {
3002 Uint2 entityID;
3003 SeqMgrFeatContext context;
3004 SeqFeatPtr sfp;
3005 SeqIdPtr sip;
3006 SeqLocPtr slp;
3007 CdRegionPtr crp;
3008 CodeBreakPtr cbp;
3009 RnaRefPtr rrp;
3010 tRNAPtr trp;
3011 Uint1 strand = Seq_strand_minus;
3012
3013 if (bsp == NULL) return;
3014
3015 entityID = bsp->idx.entityID;
3016 if (entityID == 0)
3017 {
3018 entityID = ObjMgrGetEntityIDForPointer (bsp);
3019 }
3020 if (! SeqMgrFeaturesAreIndexed (entityID)) {
3021 SeqMgrIndexFeatures (entityID, NULL);
3022 }
3023 sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &context);
3024 while (sfp != NULL) {
3025 sip = SeqLocId (sfp->location);
3026 slp = SeqLocCopyRegion (sip, sfp->location, bsp, 0, bsp->length - 1, strand, FALSE);
3027 sfp->location = SeqLocFree (sfp->location);
3028 sfp->location = slp;
3029 switch (sfp->data.choice) {
3030 case SEQFEAT_CDREGION :
3031 crp = (CdRegionPtr) sfp->data.value.ptrvalue;
3032 if (crp != NULL) {
3033 for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next) {
3034 sip = SeqLocId (cbp->loc);
3035 slp = SeqLocCopyRegion (sip, cbp->loc, bsp, 0, bsp->length - 1, strand, FALSE);
3036 cbp->loc = SeqLocFree (cbp->loc);
3037 cbp->loc = slp;
3038 }
3039 }
3040 break;
3041 case SEQFEAT_RNA :
3042 rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
3043 if (rrp != NULL && rrp->ext.choice == 2) {
3044 trp = (tRNAPtr) rrp->ext.value.ptrvalue;
3045 if (trp != NULL && trp->anticodon != NULL) {
3046 sip = SeqLocId (trp->anticodon);
3047 slp = SeqLocCopyRegion (sip, trp->anticodon, bsp, 0, bsp->length - 1, strand, FALSE);
3048 trp->anticodon = SeqLocFree (trp->anticodon);
3049 trp->anticodon = slp;
3050 }
3051 }
3052 break;
3053 default :
3054 break;
3055 }
3056 sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &context);
3057 }
3058 SeqMgrClearFeatureIndexes (entityID, NULL);
3059 SeqMgrIndexFeatures (entityID, NULL);
3060 }
3061
GetOldBlastAlignment(BioseqPtr bsp1,BioseqPtr bsp2)3062 static SeqAlignPtr LIBCALLBACK GetOldBlastAlignment (BioseqPtr bsp1, BioseqPtr bsp2)
3063 {
3064 BLAST_OptionsBlkPtr options;
3065 CharPtr program = "blastn";
3066 SeqAlignPtr sap;
3067
3068 if (ISA_aa (bsp1->mol)) {
3069 program = "blastp";
3070 }
3071 options = BLASTOptionNew(program, TRUE);
3072 options->gapped_calculation = TRUE;
3073 options->expect_value = 0.001;
3074 if (bsp1->length > 10000 || bsp2->length > 10000)
3075 {
3076 options->expect_value = act_get_eval(60);
3077 options->wordsize = 20;
3078 options->filter_string = StringSave ("m L");
3079 }
3080 sap = BlastTwoSequences(bsp1, bsp2, program, options);
3081 BLASTOptionDelete(options);
3082 return sap;
3083 }
3084
GetOldBlastAlignmentPiece(SeqLocPtr slp1,SeqLocPtr slp2)3085 static SeqAlignPtr LIBCALLBACK GetOldBlastAlignmentPiece (SeqLocPtr slp1, SeqLocPtr slp2)
3086 {
3087 BioseqPtr bsp1, bsp2;
3088 BLAST_OptionsBlkPtr options;
3089 CharPtr program;
3090 SeqAlignPtr sap = NULL;
3091
3092 if (slp1 == NULL || slp2 == NULL)
3093 {
3094 return NULL;
3095 }
3096
3097 bsp1 = BioseqFindFromSeqLoc (slp1);
3098 bsp2 = BioseqFindFromSeqLoc (slp2);
3099
3100 if (bsp1 == NULL || bsp2 == NULL)
3101 {
3102 return NULL;
3103 }
3104
3105 if (ISA_aa(bsp1->mol))
3106 {
3107 if (ISA_aa(bsp2->mol))
3108 program = StringSave("blastp");
3109 else
3110 return NULL;
3111 } else if (ISA_na(bsp1->mol))
3112 {
3113 if (ISA_na(bsp2->mol))
3114 program = StringSave("blastn");
3115 else
3116 return NULL;
3117 }
3118 options = BLASTOptionNew(program, TRUE);
3119 options->gapped_calculation = TRUE;
3120 options->expect_value = 10;
3121 options->gap_x_dropoff_final = 100;
3122 options->gap_open = 4;
3123 options->gap_extend = 1;
3124 options->penalty = -1;
3125 options->wordsize = 7;
3126 sap = BlastTwoSequencesByLoc(slp1, slp2, program, options);
3127 BLASTOptionDelete(options);
3128 MemFree(program);
3129 return sap;
3130 }
3131
DumpAlignments(SeqAlignPtr salp)3132 static void DumpAlignments (SeqAlignPtr salp)
3133 {
3134 AsnIoPtr aip;
3135
3136 aip = AsnIoOpen("tmp.txt", "w");
3137 while (salp != NULL) {
3138 SeqAlignAsnWrite (salp, aip, NULL);
3139 salp = salp->next;
3140 }
3141 AsnIoClose (aip);
3142 }
3143
3144
3145 NLM_EXTERN SeqAlignPtr
Sqn_GlobalAlign2SeqEx(BioseqPtr bsp1,BioseqPtr bsp2,BoolPtr revcomp,GetAlignmentFunc aln_func,GetAlignmentPieceFunc aln_piece_func,Boolean extend_ends)3146 Sqn_GlobalAlign2SeqEx
3147 (BioseqPtr bsp1,
3148 BioseqPtr bsp2,
3149 BoolPtr revcomp,
3150 GetAlignmentFunc aln_func,
3151 GetAlignmentPieceFunc aln_piece_func,
3152 Boolean extend_ends)
3153 {
3154 AMAlignIndex2Ptr amaip;
3155 Int4 i;
3156 SeqAlignPtr sap;
3157 SeqAlignPtr sap_final;
3158 SeqAlignPtr sap_head;
3159 SeqAlignPtr sap_new;
3160 SeqAlignPtr sap_prev;
3161 Int4 start1;
3162 Int4 start2;
3163 Int4 stop1;
3164 Int4 stop2;
3165 Uint1 strand;
3166 Int4 extnd = 20;
3167 Int4 spin_fuzz = 20;
3168
3169 if (bsp1 == NULL || bsp2 == NULL)
3170 return NULL;
3171
3172 sap = aln_func (bsp1, bsp2);
3173 if (sap == NULL)
3174 {
3175 ErrPostEx (SEV_ERROR, 0, 0, "BLAST finds no sequence similarity");
3176 return NULL;
3177 }
3178 /* DumpAlignments(sap); */
3179
3180 AlnMgr2IndexLite(sap);
3181 ACT_RemoveInconsistentAlnsFromSet(sap, spin_fuzz, 1);
3182 amaip = (AMAlignIndex2Ptr)(sap->saip);
3183 AlnMgr2SortAlnSetByNthRowPos(sap, 1);
3184 ACT_GetNthSeqRangeInSASet(sap, 1, &start1, &stop1);
3185 ACT_GetNthSeqRangeInSASet(sap, 2, &start2, &stop2);
3186 strand = AlnMgr2GetNthStrand(amaip->saps[0], 2);
3187
3188 /* if opposite strand submitted, reverse complement, realign */
3189
3190 if (strand == Seq_strand_minus) {
3191 if (revcomp != NULL) {
3192 *revcomp = TRUE;
3193 }
3194 AMAlignIndexFreeEitherIndex (sap);
3195 sap = SeqAlignFree (sap);
3196 BioseqRevComp (bsp2);
3197 ReverseBioseqFeatureStrands (bsp2);
3198 sap = aln_func (bsp1, bsp2);
3199 if (sap == NULL)
3200 {
3201 ErrPostEx (SEV_ERROR, 0, 0, "BLAST finds no sequence similarity in reverse complement");
3202 return NULL;
3203 }
3204 AlnMgr2IndexLite(sap);
3205 ACT_RemoveInconsistentAlnsFromSet(sap, spin_fuzz, 1);
3206 amaip = (AMAlignIndex2Ptr)(sap->saip);
3207 AlnMgr2SortAlnSetByNthRowPos(sap, 1);
3208 ACT_GetNthSeqRangeInSASet(sap, 1, &start1, &stop1);
3209 ACT_GetNthSeqRangeInSASet(sap, 2, &start2, &stop2);
3210 strand = AlnMgr2GetNthStrand(amaip->saps[0], 2);
3211 }
3212
3213 if (extend_ends) {
3214 /* done with any reverse complementing and reblasting, now extend frayed ends */
3215
3216 sap_head = NULL;
3217
3218 if (start1 > 6 && start1 < extnd)
3219 sap_head = sap_prev = ACT_FindPiece(bsp1, bsp2, MAX(start1-SQN_WINDOW, 0), start1, MAX(start1-SQN_WINDOW, 0), start2, strand, SQN_LEFT, aln_piece_func);
3220 else if (start1 > 0 && start1 < extnd)
3221 SQN_ExtendAlnAlg(amaip->saps[0], start1, SQN_LEFT, Seq_strand_plus);
3222 ACT_GetNthSeqRangeInSASet(sap, 1, &start1, &stop1);
3223 ACT_GetNthSeqRangeInSASet(sap, 2, &start2, &stop2);
3224 if (start2 > 6 && start2 < extnd)
3225 {
3226 sap_new = ACT_FindPiece(bsp2, bsp1, MAX(start2-SQN_WINDOW, 0), start2, MAX(start1-SQN_WINDOW, 0), start1, strand, SQN_LEFT, aln_piece_func);
3227 if (sap_new != NULL)
3228 SPI_flip_sa_list(sap_new);
3229 if (sap_head != NULL)
3230 {
3231 sap_prev->next = sap_new;
3232 sap_prev = sap_new;
3233 } else
3234 sap_head = sap_prev = sap_new;
3235 } else if (start2 > 0 && start2 < extnd)
3236 SQN_ExtendAlnAlg(amaip->saps[0], start2, SQN_LEFT, Seq_strand_plus);
3237 for (i=0; i<amaip->numsaps-1; i++)
3238 {
3239 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 1, NULL, &start1);
3240 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 1, &stop1, NULL);
3241 if (strand != Seq_strand_minus)
3242 {
3243 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, NULL, &start2);
3244 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 2, &stop2, NULL);
3245 } else
3246 {
3247 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, &stop2, NULL);
3248 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 2, NULL, &start2);
3249 }
3250 sap_new = ACT_FindPiece(bsp1, bsp2, start1, stop1, start2, stop2, strand, SQN_MIDDLE, aln_piece_func);
3251 if (sap_head)
3252 {
3253 sap_prev->next = sap_new;
3254 if (sap_new != NULL)
3255 sap_prev = sap_new;
3256 } else
3257 sap_head = sap_prev = sap_new;
3258 }
3259 ACT_GetNthSeqRangeInSASet(sap, 1, &start1, &stop1);
3260 ACT_GetNthSeqRangeInSASet(sap, 2, &start2, &stop2);
3261 if (bsp1->length-stop1 > 6 && bsp1->length-stop1 < extnd)
3262 {
3263 sap_new = ACT_FindPiece(bsp1, bsp2, stop1, MIN(bsp1->length-1, stop1 + SQN_WINDOW), stop2, MIN(bsp2->length-1, stop2+SQN_WINDOW), strand, SQN_RIGHT, aln_piece_func);
3264 if (sap_new != NULL)
3265 {
3266 if (sap_head != NULL)
3267 {
3268 sap_prev->next = sap_new;
3269 sap_prev = sap_new;
3270 } else
3271 sap_head = sap_prev = sap_new;
3272 }
3273 } else if (bsp1->length-stop1 > 0 && bsp1->length-stop1 < extnd)
3274 SQN_ExtendAlnAlg(amaip->saps[amaip->numsaps-1], bsp1->length-stop1, SQN_RIGHT, Seq_strand_plus);
3275 ACT_GetNthSeqRangeInSASet(sap, 1, &start1, &stop1);
3276 ACT_GetNthSeqRangeInSASet(sap, 2, &start2, &stop2);
3277 if (bsp2->length-stop2 > 6 && bsp2->length-stop2 < extnd)
3278 {
3279 sap_new = ACT_FindPiece(bsp2, bsp1, stop2, MIN(bsp2->length-1, stop2 + SQN_WINDOW), stop1, MIN(bsp1->length-1, stop1+SQN_WINDOW), strand, SQN_RIGHT, aln_piece_func);
3280 if (sap_new != NULL)
3281 {
3282 SPI_flip_sa_list(sap_new);
3283 if (sap_head != NULL)
3284 {
3285 sap_prev->next = sap_new;
3286 sap_prev = sap_new;
3287 } else
3288 sap_head = sap_prev = sap_new;
3289 }
3290 } else if (bsp2->length-stop2 > 0 && bsp2->length-stop2 < extnd)
3291 SQN_ExtendAlnAlg(amaip->saps[amaip->numsaps-1], bsp2->length-stop2, SQN_RIGHT, Seq_strand_plus);
3292 sap_new = (SeqAlignPtr)(sap->segs);
3293 while (sap_new->next != NULL)
3294 {
3295 sap_new = sap_new->next;
3296 }
3297 sap_new->next = sap_head; /* put the new alignments in the original set */
3298 }
3299
3300 AMAlignIndexFreeEitherIndex (sap);
3301 AlnMgr2IndexLite(sap);
3302 ACT_RemoveInconsistentAlnsFromSet(sap, spin_fuzz, 1);
3303 amaip = (AMAlignIndex2Ptr)(sap->saip);
3304
3305 AMAlignIndex2Free2(amaip);
3306 sap->saip = NULL;
3307 AlnMgr2IndexLite(sap); /* reindex the alignments */
3308 sap_final = ACT_CleanUpAlignments(sap, bsp1->length, bsp2->length);
3309 return sap_final;
3310
3311 }
3312
Sqn_GlobalAlign2Seq(BioseqPtr bsp1,BioseqPtr bsp2,BoolPtr revcomp)3313 NLM_EXTERN SeqAlignPtr Sqn_GlobalAlign2Seq (BioseqPtr bsp1, BioseqPtr bsp2, BoolPtr revcomp)
3314 {
3315 return Sqn_GlobalAlign2SeqEx (bsp1, bsp2, revcomp, GetOldBlastAlignment, GetOldBlastAlignmentPiece, TRUE);
3316 }
3317
3318
3319
3320 /***************************************************************************
3321 *
3322 * AlnMgrTruncateSAP truncates a given seqalign to contain only the
3323 * bioseq coordinates from start to stop on the indicated row. Anything
3324 * before those coordinates is discarded; anything remaining afterwards
3325 * is made into another seqalign and put in sap->next (the original next,
3326 * if any, is now at sap->next->next). Doesn't work on parent seqaligns.
3327 * The function returns TRUE if the orignal alignment extended past stop.
3328 *
3329 ***************************************************************************/
AlnMgr2TruncateSAP(SeqAlignPtr sap,Int4 start,Int4 stop,Int4 row)3330 NLM_EXTERN Boolean AlnMgr2TruncateSAP(SeqAlignPtr sap, Int4 start, Int4 stop, Int4 row)
3331 {
3332 DenseDiagPtr ddp;
3333 DenseDiagPtr ddp2;
3334 DenseSegPtr dsp;
3335 Int4 from;
3336 Int4 i;
3337 Int4 mstart;
3338 Int4 mstop;
3339 SeqAlignPtr sap1;
3340 SeqAlignPtr sap2;
3341 Int4 tmp;
3342 Int4 to;
3343
3344 if (sap == NULL || stop<start || row < 1)
3345 return FALSE;
3346 if (sap->segtype == SAS_DENSEG)
3347 {
3348 if (sap->saip == NULL)
3349 AlnMgr2IndexSingleChildSeqAlign(sap);
3350 AlnMgr2GetNthSeqRangeInSA(sap, row, &mstart, &mstop);
3351 if (mstart > start || mstop < stop)
3352 return FALSE;
3353 if (mstart == start)
3354 {
3355 if (mstop == stop)
3356 return FALSE;
3357 else if (mstop > stop)
3358 {
3359 from = AlnMgr2MapBioseqToSeqAlign(sap, start, row);
3360 to = AlnMgr2MapBioseqToSeqAlign(sap, stop, row);
3361 if (to < from)
3362 {
3363 tmp = to;
3364 to = from;
3365 from = tmp;
3366 }
3367 sap1 = AlnMgr2GetSubAlign(sap, from, to, 0, FALSE);
3368 AlnMgr2IndexSingleChildSeqAlign(sap1);
3369 from = AlnMgr2MapBioseqToSeqAlign(sap, stop+1, row);
3370 if (from < 0)
3371 return FALSE;
3372 to = AlnMgr2MapBioseqToSeqAlign(sap, mstop, row);
3373 if (to < from)
3374 {
3375 tmp = to;
3376 to = from;
3377 from = tmp;
3378 }
3379 sap2 = AlnMgr2GetSubAlign(sap, from, to, 0, FALSE);
3380 sap2->next = sap->next;
3381 sap->next = sap2;
3382 dsp = sap->segs;
3383 sap->segs = (Pointer)(sap1->segs);
3384 sap1->segs = NULL;
3385 DenseSegFree(dsp);
3386 SeqAlignFree(sap1);
3387 AlnMgr2IndexSingleChildSeqAlign(sap);
3388 AlnMgr2IndexSingleChildSeqAlign(sap2);
3389 return TRUE;
3390 }
3391 } else if (mstart < start) /* throw away the first part */
3392 {
3393 from = AlnMgr2MapBioseqToSeqAlign(sap, start, row);
3394 to = AlnMgr2MapBioseqToSeqAlign(sap, stop, row);
3395 if (to < from)
3396 {
3397 tmp = to;
3398 to = from;
3399 from = tmp;
3400 }
3401 sap1 = AlnMgr2GetSubAlign(sap, from, to, 0, FALSE);
3402 if (mstop == stop) /* done */
3403 {
3404 dsp = sap->segs;
3405 sap->segs = (Pointer)(sap1->segs);
3406 sap1->segs = NULL;
3407 DenseSegFree(dsp);
3408 SeqAlignFree(sap1);
3409 AlnMgr2IndexSingleChildSeqAlign(sap);
3410 return TRUE;
3411 } else if (mstop > stop)
3412 {
3413 from = AlnMgr2MapBioseqToSeqAlign(sap, stop+1, row);
3414 if (from < 0)
3415 return FALSE;
3416 to = AlnMgr2MapBioseqToSeqAlign(sap, mstop, row);
3417 if (to < from)
3418 {
3419 tmp = to;
3420 to = from;
3421 from = tmp;
3422 }
3423 sap2 = AlnMgr2GetSubAlign(sap, from, to, 0, FALSE);
3424 sap2->next = sap->next;
3425 sap->next = sap2;
3426 AlnMgr2IndexSingleChildSeqAlign(sap2);
3427 dsp = sap->segs;
3428 sap->segs = (Pointer)(sap1->segs);
3429 sap1->segs = NULL;
3430 DenseSegFree(dsp);
3431 SeqAlignFree(sap1);
3432 AlnMgr2IndexSingleChildSeqAlign(sap);
3433 return TRUE;
3434 }
3435 }
3436 } else if (sap->segtype == SAS_DENDIAG)
3437 {
3438 ddp = (DenseDiagPtr)(sap->segs);
3439 if (ddp->dim < row)
3440 return FALSE;
3441 mstart = ddp->starts[row-1];
3442 mstop = mstart + ddp->len - 1;
3443 if (mstart > start || mstop < stop)
3444 return FALSE;
3445 if (mstart == start)
3446 {
3447 if (mstop == stop)
3448 return FALSE;
3449 else if (mstop > stop)
3450 {
3451 ddp2 = DenseDiagNew();
3452 ddp2->dim = ddp->dim;
3453 ddp2->starts = (Int4Ptr)MemNew((ddp->dim)*sizeof(Int4));
3454 ddp2->id = SeqIdDupList(ddp->id);
3455 ddp2->strands = (Uint1Ptr)MemNew((ddp->dim)*sizeof(Uint1));
3456 ddp2->scores = ScoreDup(ddp->scores);
3457 for (i=0; i<ddp->dim; i++)
3458 {
3459 ddp2->starts[i] = ddp->starts[i] + ddp->len - (mstop - stop);
3460 ddp2->strands[i] = ddp->strands[i];
3461 }
3462 ddp2->len = mstop - stop;
3463 ddp->len = ddp->len - (mstop - stop);
3464 sap2 = SeqAlignNew();
3465 sap2->type = SAT_PARTIAL;
3466 sap2->segtype = SAS_DENSEG;
3467 sap2->segs = (Pointer)ddp2;
3468 sap2->next = sap->next;
3469 sap->next = sap2;
3470 AlnMgr2IndexSingleChildSeqAlign(sap2);
3471 return TRUE;
3472 }
3473 } else if (mstart < start)
3474 {
3475 for (i=0; i<ddp->dim; i++)
3476 {
3477 ddp->starts[i] = ddp->starts[i] + start - mstart;
3478 }
3479 ddp->len = ddp->len - (start - mstart);
3480 AlnMgr2IndexSingleChildSeqAlign(sap);
3481 if (mstop == stop)
3482 return FALSE;
3483 else if (mstop > stop)
3484 {
3485 ddp2 = DenseDiagNew();
3486 ddp2->dim = ddp->dim;
3487 ddp2->starts = (Int4Ptr)MemNew((ddp->dim)*sizeof(Int4));
3488 ddp2->id = SeqIdDupList(ddp->id);
3489 ddp2->strands = (Uint1Ptr)MemNew((ddp->dim)*sizeof(Uint1));
3490 ddp2->scores = ScoreDup(ddp->scores);
3491 for (i=0; i<ddp->dim; i++)
3492 {
3493 ddp2->starts[i] = ddp->starts[i] + ddp->len - (mstop - stop);
3494 ddp2->strands[i] = ddp->strands[i];
3495 }
3496 ddp2->len = mstop - stop;
3497 ddp->len = ddp->len - (mstop - stop);
3498 sap2 = SeqAlignNew();
3499 sap2->type = SAT_PARTIAL;
3500 sap2->segtype = SAS_DENSEG;
3501 sap2->segs = (Pointer)ddp2;
3502 sap2->next = sap->next;
3503 sap->next = sap2;
3504 AlnMgr2IndexSingleChildSeqAlign(sap2);
3505 return TRUE;
3506 }
3507 }
3508 } else
3509 return FALSE;
3510 return FALSE;
3511 }
3512
3513