/* RCS identification string kept in the object file for version tracking. */
static const char rcsid[] = "$Id: spidey.c,v 6.74 2016/09/02 14:57:38 ucko Exp $";
2 
3 /* ===========================================================================
4 *
5 *                            PUBLIC DOMAIN NOTICE
6 *            National Center for Biotechnology Information (NCBI)
7 *
8 *  This software/database is a "United States Government Work" under the
9 *  terms of the United States Copyright Act.  It was written as part of
10 *  the author's official duties as a United States Government employee and
11 *  thus cannot be copyrighted.  This software/database is freely available
12 *  to the public for use. The National Library of Medicine and the U.S.
13 *  Government do not place any restriction on its use or reproduction.
14 *  We would, however, appreciate having the NCBI and the author cited in
15 *  any work or product based on this material.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 * ===========================================================================
26 *
27 * File Name:  spidey.c
28 *
29 * Author:  Sarah Wheelan
30 *
31 * Version Creation Date:   5/01
32 *
33 * $Revision: 6.74 $
34 *
35 * File Description: mrna-to-genomic alignment algorithms and functions
36 *
37 * Modifications:
38 * --------------------------------------------------------------------------
39 * $Log: spidey.c,v $
40 * Revision 6.74  2016/09/02 14:57:38  ucko
41 * Formally clean up calls to printf-family functions that are at least
42 * nominally unsafe, as already done in Debian/Ubuntu packages.
43 *
44 * Revision 6.73  2006/06/01 14:55:31  kskatz
45 * fixed -s option so that it sets gap-open/gap-extend penalties that are valid as a pair for gap statistics and are a bit less stringent than default (default = 5 open : 2 extend; interspecies -s = 4 open : 1 extend)
46 *
47 * Revision 6.72  2005/11/17 17:12:50  kskatz
48 * Fixed initializations and removed non-used functions to get rid of warnings
49 *
50 * Revision 6.70  2005/02/22 17:41:59  kskatz
51 * fixed potential dividing by zero in SPI_is_acceptor* and SPI_is_donor* probability calculations
52 *
53 * Revision 6.69  2004/04/09 16:05:21  kskatz
54 * Added sanity check (must be 3 intervals to go through the loop) to SPI_CheckMrnaOrder()
55 *
56 * Revision 6.68  2004/03/25 21:20:03  kskatz
57 * All SPI_is_acceptor_* functions have been corrected: 'N' no longer contributes to nor subtracts from the score, log odds are calculated and the scores added; they are however all antilogged because there are too many places in the code where the score is expected to be between 0 and 1.  Also, corrected sequence frequency determination in SPI_is_acceptor_user and SPI_is_donor_user, as well as correcting for 'N'. Finally, and this all began with, I added matrices for Dictyostelium - command line -r -m
58 *
59 * Revision 6.67  2003/12/12 21:25:26  kskatz
60 * Fixed bug in SPI_CheckForPolyAExon() where multiple SeqAlignPtr's to the same object were not handled carefully: one of the ptr's was being accessed when the object was freed via the other ptr.
61 *
62 * Revision 6.66  2003/12/12 17:57:04  kskatz
63 * Fixed a potential array bounds read error in SPI_CheckMrnaOrder()
64 *
65 * Revision 6.65  2003/12/10 16:53:22  kskatz
66 * Ensured that 'ovl' when used is never negative once set in SPI_AdjustOverlaps() [see revision 6.57]
67 *
68 * Revision 6.64  2003/10/21 15:26:17  kans
69 * fixed typo of SPI_IvalPt to SPI_IvalPtr
70 *
71 * Revision 6.63  2003/10/21 15:14:19  kskatz
72 * Added SPI_CheckMrnaOrder(): Called by GetRegionForSAP() after the ivals for building a region are sorted in genomic order, this function merely checks that the mrna intervals are minimally colinear.
73 *
74 * Revision 6.62  2003/10/06 14:11:20  kskatz
75 * Changed the 'version' number printed by SPI_PrintResult() to '1.40' since it has been '1.35' for so long - mostly to avoid confusion when users report the version number
76 *
77 * Revision 6.61  2003/10/06 14:04:09  kskatz
78 * Correctly! commented out a temporary fix in SPI_AlignInWindows() [line 3880]
79 *
80 * Revision 6.60  2003/09/17 20:39:01  kskatz
81 * Commented out a temporary fix in SPI_AlignInWindows() [line 3880]
82 *
83 * Revision 6.59  2003/09/17 19:53:27  kskatz
84 * Added a check in SPI_FindBestAlnByDotPlot() that both seqs be 2-bit encoded (ncbi2na) in order to meet that implicit requirement of DOT_. If either one is not ncbi2na, SPI_FindBestAlnByDotPlot() will simply return NULL.
85 *
86 * Revision 6.58  2003/08/18 18:17:51  kskatz
87 * Just removing some unused vars
88 *
89 * Revision 6.57  2003/08/18 18:11:39  kskatz
90 * Fixed dynamic allocation of buf in SPI_AdjustOverlaps() - 'ovl' can be negative
91 *
92 * Revision 6.56  2003/08/15 15:23:50  kskatz
93 * Created Choose2LooseMrnaOvLap(), called by SPI_AdjustForSplice(): returns the SeqAlignPtr * to delete *. The choice is based on score + splice donor/acceptor existence for that 'exon'. Also made buf2 in SPI_AdjustOverlaps() dynamically allocated as it was (obviously) crashing when the overlap was > 200 bases.
94 *
95 * Revision 6.55  2003/06/30 15:01:29  whlavina
96 * Correct minus strand handling in CreateContinuousAln functions; previous
97 * code could corrupt alignments (stop2-start1>1 would imply len<-2 if
98 * ExtendAlnRight ever gets called).
99 *
100 * Revision 6.54  2003/05/30 17:25:38  coulouri
101 * add rcsid
102 *
103 * Revision 6.53  2003/04/04 19:42:56  kskatz
104 * Added a new command line option (-R) to allow external users to point spidey to a repeat database that it can pass on to blast for filtering repeats
105 *
106 * Revision 6.52  2002/11/14 17:20:38  johnson
107 * fixed nasty memory misallocation bug in SPI_CheckSplicesForRevComp
108 *
109 * Revision 6.51  2002/11/04 19:48:35  kskatz
110 * wasn't correcting for strand when reporting summary mismatch information in SPI_PrintResults()
111 *
112 * Revision 6.50  2002/10/10 19:39:45  kskatz
113 * Added 'mismatches' to output in SPI_PrintResult(), as well as commented out several unused variables, and two syntax fixes to avoid compiler warning
114 *
115 * Revision 6.49  2002/10/02 16:47:11  kskatz
116 * clarifying the explanation of the -L option
117 *
118 * Revision 6.48  2002/10/02 16:12:53  kskatz
119 * Added a new option to SPI_Options (bigintron_size) that holds a user-supplied maximum size (default = 220000) for introns and requires the option (bool) bigintron to be set to 'TRUE'; The functions affected are SPI_mRNAPtr SPI_AdjustForSplice(), SPI_is_consistent(), and SPI_FindPiece(); note that the default for bigintron_size is not set in SPI_OptionsNew() (yet)
120 *
121 * Revision 6.47  2002/08/28 17:02:51  kskatz
122 * Simplified the loop in SPI_PrintResults() that prints out the 5' splice site and allowed minimum of 2 bases; also fixed more deadly access errors by setting the SPI_RegionInfoPtr *srip to NULL when all regions fall below spot->idcutoff in SPI_SortRegionsByScore()
123 *
124 * Revision 6.46  2002/08/26 20:00:05  kskatz
125 * Fixed off-by-one error in my fix to SPI_PrintResult
126 *
127 * Revision 6.45  2002/08/20 21:07:12  kskatz
128 * Fixed several NULL pointer access errors caused when -c results in the deletion of all regions in SPI_SortRegionsByScore(); also fixed bugs in SPI_PrintResult() caused by not checking to see if a minus strand alignment had start at the end of the sequence when printing out the 10-base buffer of the splice region
129 *
130 * Revision 6.44  2002/08/17 03:08:16  kskatz
131 * allowed to & from to be handled independently in SPI_AlnSinglemRNAToGen()
132 *
133 * Revision 6.43  2002/08/16 22:31:55  kskatz
134 * oops again - changed c++ style comments to c style since this is the c toolkit
135 *
136 * Revision 6.42  2002/08/16 21:15:50  kskatz
137 * oops - this is C toolkit: int -> Int4
138 *
139 * Revision 6.41  2002/08/16 21:03:12  kskatz
140 * SPI_OptionsNew() now sets strand = Seq_strand_both as default, otherwise blast results are hosed; SPI_AlnSinglemRNAToGen() now correctly limits initial blast to user supplied to/from and includes a little sanity check
141 *
142 * Revision 6.40  2002/07/22 13:40:55  wheelan
143 * changes to splice matrices, bug fix in CDS computation
144 *
145 * Revision 6.39  2002/06/27 12:59:34  kans
146 * fix in call to GetScoreAndEvalue
147 *
148 * Revision 6.38  2002/06/27 11:52:53  wheelan
149 * various bug fixes -- fixed off-by-one splice site errors and more
150 *
151 * Revision 6.37  2002/05/07 19:15:09  wheelan
152 * fixed minor bug in splice boundary arithmetic
153 *
154 * Revision 6.36  2002/05/07 18:42:56  wheelan
155 * changes to support user-defined splice matrices
156 *
157 * Revision 6.35  2002/04/04 17:18:20  wheelan
158 * numerous bug fixes and little changes; added SPI_CheckForPolyAExon
159 *
160 * Revision 6.34  2002/01/30 19:09:05  wheelan
161 * better support for revcomp, plus changes for new alignment manager funcs
162 *
163 * Revision 6.33  2001/12/18 18:00:01  wheelan
164 * bug fix for NULL segs in RemoveTeenyAln
165 *
166 * Revision 6.32  2001/12/13 12:28:51  wheelan
167 * fixed bug in multiple printing, bug in ConnectAln
168 *
169 * Revision 6.31  2001/12/10 15:58:04  wheelan
170 * fixed dereferencing of null variable in ConnectAln
171 *
172 * Revision 6.30  2001/12/10 14:42:36  wheelan
173 * bug fix in ConnectAln -- no more using freed pointers
174 *
175 * Revision 6.29  2001/12/05 12:29:37  wheelan
176 * changed to version 1.2
177 *
178 * Revision 6.28  2001/11/30 12:15:03  wheelan
179 * subtle but very important bug fix in SPI_GetNthSeqRangeInSASet
180 *
181 * Revision 6.27  2001/11/20 12:13:24  wheelan
182 * made SPI_GetProteinFrommRNA EXTERN
183 *
184 * Revision 6.26  2001/11/05 16:17:11  wheelan
185 * added option to print multiple alignment to a file
186 *
187 * Revision 6.25  2001/11/02 14:00:52  wheelan
188 * fixed memory access errors in splice printing code
189 *
190 * Revision 6.24  2001/10/26 13:12:07  wheelan
191 * changes to polyA handling, plus bulletproofing
192 *
193 * Revision 6.23  2001/10/18 15:45:56  wheelan
194 * bug fix in ConnectAln
195 *
196 * Revision 6.22  2001/10/18 15:12:22  wheelan
197 * fixed polyAtail alignment problems, fixed score calculation
198 *
199 * Revision 6.21  2001/10/17 16:16:21  wheelan
200 * changes in region sorting plus mrna model gap handling
201 *
202 * Revision 6.20  2001/10/08 17:16:44  wheelan
203 * bug fix in revcmp, made cds 1-based coords, fixed polyA bug
204 *
205 * Revision 6.19  2001/10/04 12:36:21  wheelan
206 * implemented bigintron option; made SPI_ConnectAln run through twice to pick up more pieces
207 *
208 * Revision 6.18  2001/10/03 18:09:54  wheelan
209 * changed AM_LITE define for new alnmgr
210 *
211 * Revision 6.17  2001/10/03 14:19:53  wheelan
212 * change names of all alignmgr calls, plus add profile-making code
213 *
214 * Revision 6.15  2001/09/07 12:15:25  wheelan
215 * small fix for reverse complement translation
216 *
217 * Revision 6.14  2001/09/07 12:05:17  wheelan
218 * moved protein translation for convenient use on web
219 *
220 * Revision 6.13  2001/09/07 11:47:32  wheelan
221 * fixed coordinates and translation for reverse complement cases
222 *
223 * Revision 6.12  2001/09/04 13:46:47  wheelan
224 * made SPI_RemoveInconsistentAlnsFromSet and SPI_flip_sa_list extern
225 *
226 * Revision 6.11  2001/08/24 23:27:15  wheelan
227 * removed unwanted semicolon
228 *
229 * Revision 6.10  2001/08/24 13:45:20  wheelan
230 * better region sorting (better scores), plus different printing options added
231 *
232 * Revision 6.9  2001/08/20 21:28:34  wheelan
233 * improved relative scoring of initial regions, added seqid types
234 *
235 * Revision 6.8  2001/07/20 10:31:10  wheelan
236 * fixed uninitialized variable plus another polyA mistake
237 *
238 * Revision 6.7  2001/07/19 18:22:36  wheelan
239 * better handling of polyA tails
240 *
241 * Revision 6.6  2001/07/11 17:56:53  wheelan
242 * added more functions to deal with making multiple alignments
243 *
244 * Revision 6.5  2001/07/10 16:44:53  wheelan
245 * added functions to make a multiple alignment
246 *
247 * Revision 6.4  2001/07/06 10:27:21  wheelan
248 * fixed minor things pointed out by D. Vakatov
249 *
250 * Revision 6.3  2001/06/25 17:00:47  wheelan
251 * frame fix in GetProteinFrommRNA
252 *
253 * Revision 6.2  2001/06/22 20:54:49  wheelan
254 * spidey now tries to make as many alignments as requested, even if that means throwing away the "best" regions if they have no alignment
255 *
256 * Revision 6.1  2001/05/24 16:28:10  wheelan
257 * initial checkin
258 *
259 *
260 * ==========================================================================
261 */
262 
263 #include <spidey.h>
264 
265 
266 static int LIBCALLBACK SPI_CompareAlnPosForMult(VoidPtr ptr1, VoidPtr ptr2);
267 static Boolean spi_overlaps(SeqAlignPtr sap, SPI_BlockPtr sbp);
268 static void SPI_BeautifySMP(SPI_RegionInfoPtr srip);
269 static void SPI_RemoveOutsideBounds(SeqAlignPtr sap, SPI_OptionsPtr spot);
270 static void SPI_PadRegions(SPI_RegionInfoPtr srip, Int4 bsplen);
271 static int LIBCALLBACK SPI_compare_aln_score(VoidPtr ptr1, VoidPtr ptr2);
272 static void SPI_SortRegionsByScore(SPI_RegionInfoPtr PNTR srip, SPI_OptionsPtr spot);
273 static int LIBCALLBACK SPI_CompareRegions(VoidPtr ptr1, VoidPtr ptr2);
274 static void SPI_PrintAce(FILE *ofp, SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, Boolean is_cds);
275 static void SPI_PrintResult(FILE *ofp, FILE *ofp2, SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot, Boolean isitCDS);
276 static void SPI_PrintHerdResult(FILE *ofp, FILE *ofp2, SPI_mRNAToHerdPtr herd, SPI_OptionsPtr spot, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna);
277 static void spi_print_mismatch_line(FILE *ofp, Int4 exonnum, Int4 start, Int4 len, SPI_ExonProfPtr epp, Int4 gstart);
278 static SeqAlignPtr SPI_CreateContinuousAln(SeqAlignPtr PNTR saps, Int4 numsaps);
279 static void SPI_ExtendAlnRight(SeqAlignPtr sap, Int4 which_row, Int4 start, Int4 stop);
280 static SPI_mRNAToHerdPtr SPI_GetHerdInfo(SPI_FragHerdPtr sfhp, BioseqPtr bsp_mrna, SPI_OptionsPtr spot);
281 static SPI_RegionInfoPtr SPI_FindWindows(SeqAlignPtr sap, SPI_OptionsPtr spot);
282 static int LIBCALLBACK SPI_compare_aln_score(VoidPtr ptr1, VoidPtr ptr2);
283 static SPI_RegionInfoPtr SPI_SortRegions(SPI_RegionInfoPtr srip_head);
284 static int LIBCALLBACK SPI_SortSrips(VoidPtr ptr1, VoidPtr ptr2);
285 static SPI_RegionInfoPtr SPI_AssembleRegions(SPI_AlnInfoPtr PNTR spip_list, Int4 num, SPI_RegionInfoPtr PNTR head_srip, SPI_OptionsPtr spot);
286 static SPI_RegionInfoPtr SPI_GetRegionForSAP(SPI_IvalPtr PNTR siip_list, Int4 num, SeqAlignPtr sap, SPI_OptionsPtr spot);
287 static Int2 SPI_is_consistent(SPI_IvalPtr siip, SPI_RegionInfoPtr srip, SPI_OptionsPtr spot);
288 static int LIBCALLBACK SPI_compare_genomic_loc(VoidPtr ptr1, VoidPtr ptr2);
289 static void SPI_ExcludeOverlaps(SPI_IvalPtr PNTR siip_list, Int4 num, SPI_RegionInfoPtr srip);
290 static void SPI_AlignInWindows(SPI_RegionInfoPtr PNTR head_srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot);
291 static void SPI_DoAln(SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot);
292 static Boolean SPI_ConnectAln(SeqAlignPtr sap, SPI_OptionsPtr spot, SPI_RegionInfoPtr srip, Boolean do_ends, Boolean firsttime);
293 static SeqAlignPtr SPI_ProcessNewAlns(SeqAlignPtr sap);
294 static Int4 SPI_IsItPolyA(SeqIdPtr sip);
295 static SeqAlignPtr SPI_FillInIntron(SeqIdPtr sip1, SeqIdPtr sip2, Int4 start1, Int4 stop1, Int4 start2, Int4 stop2, Uint1 strand2, SPI_OptionsPtr spot);
296 static Int4 spi_isa_gap(Int4 start, Int4 prevstop, Uint1 strand);
297 static Int4 SPI_GetNthSeqLenInSASet(SeqAlignPtr sap, Int4 n, Int4Ptr numsaps);
298 static void SPI_GetNthSeqRangeInSASet(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop);
299 static SeqAlignPtr SPI_FindPiece(SeqIdPtr sip1, SeqIdPtr sip2, Int4 start_m, Int4 stop_m, Uint1 strand, Int4 start_g, Int2 which_end, SPI_OptionsPtr spot);
300 static SPI_mRNAPtr SPI_AdjustForSplice(SeqAlignPtr sap, SPI_OptionsPtr spot, SPI_RegionInfoPtr srip);
301 static Int4 SPI_GetExonInfo(SPI_mRNAPtr smp, Int4 n, Int4Ptr start, Int4Ptr stop, Int4Ptr mis, SPI_OptionsPtr spot);
302 static void SPI_AdjustOverlaps(SeqAlignPtr sap1, SeqAlignPtr sap2, Int4 n, SPI_mRNAPtr smp, SPI_OptionsPtr spot);
303 static void SPI_RemoveTeenyAlns(SeqAlignPtr sap, Int4 len);
304 static void SPI_ExtendAlnAlgDumb(SeqAlignPtr sap, Int4 ovl, Int4 which_side, Uint1 strand);
305 static void SPI_GetAcceptorScore(BioseqPtr bsp, Int4 pos1, Int4 pos2, Uint1 strand, FloatHiPtr score, Int4 spllen, SPI_OptionsPtr spot);
306 static Int4 spi_get_overlap (SeqAlignPtr sap1, SeqAlignPtr sap2);
307 static void SPI_AddToAln(SeqAlignPtr sap, Int4 offset, Int2 which_end, Uint1 strand);
308 static SeqAlignPtr SPI_MergeAlignments(SeqAlignPtr sap1, SeqAlignPtr sap2);
309 static SeqAlignPtr SPI_FillInLastmRNAHoles(SeqAlignPtr sap, SeqIdPtr sip_genomic, SeqIdPtr sip_mrna, Int4 start_g, Int4 stop_g, Int4 start_m, Int4 stop_m, Uint1 strand);
310 static SeqAlignPtr SPI_FindBestAlnByDotPlot(SeqLocPtr slp1, SeqLocPtr slp2);
311 static int LIBCALLBACK SPI_comp_aln_pos(VoidPtr ptr1, VoidPtr ptr2);
312 static void SPI_RegionFree (SPI_RegionInfoPtr srip);
313 static void SPI_FreeExonProf(SPI_ExonProfPtr epp);
314 static void SPI_FreeExonProfList(SPI_ExonProfPtr epp);
315 static void SPI_GetDonorSpliceInfo (Int4 org, Int4Ptr spllen, Int4Ptr boundary, SPI_OptionsPtr spot);
316 static void SPI_is_donor_user(Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, SPI_OptionsPtr spot);
317 static void SPI_is_donor_vert (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
318 static void SPI_is_donor_fly (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
319 static void SPI_is_donor_plant (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
320 static void SPI_is_donor_cele (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
321 static void SPI_is_donor_dicty (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
322 static void SPI_GetAcceptorSpliceInfo (Int4 org, Int4Ptr spllen, Int4Ptr boundary, SPI_OptionsPtr spot);
323 static void SPI_is_acceptor_user(Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, SPI_OptionsPtr spot);
324 static void SPI_is_acceptor_vert (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
325 static void SPI_is_acceptor_fly (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
326 static void SPI_is_acceptor_plant (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
327 static void SPI_is_acceptor_cele (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
328 static void SPI_is_acceptor_dicty (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
329 static void SPI_RemoveConflictsAmongPieces(SPI_FragHerdPtr sfhp, Int4 fuzz);
330 static void SPI_OrderPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_mrna);
331 static int LIBCALLBACK SPI_CompareFragInfo(VoidPtr ptr1, VoidPtr ptr2);
332 static Boolean SPI_ConnectAlnPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_contig, BioseqPtr bsp_mrna, SPI_OptionsPtr spot);
333 static void SPI_CleanupAndGetNewmRNARange(SPI_FragPtr PNTR sfpnearby, Int4 n, Int4Ptr start, Int4Ptr stop);
334 static Int4 SPI_GetNearbyFrags(SPI_FragPtr sfptarget, Int4 n, SPI_FragPtr ** ptrptr, SPI_FragHerdPtr sfhp, Boolean minus);
335 static void SPI_AdjustSplicesInPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_genomic, SPI_OptionsPtr spot);
336 static void SPI_AdjustEndsOfPieces(SPI_FragPtr sfp1, SPI_FragPtr sfp2, BioseqPtr bsp_genomic, SPI_OptionsPtr spot);
337 static SeqAlignPtr SPI_GetNthSAByRow(SeqAlignPtr sap, Int4 row, Int4 n);
338 static SPI_FragSplPtr SPI_GetPossibleSites(SeqAlignPtr sap, BioseqPtr bsp_genomic, SPI_OptionsPtr spot, Boolean donor, Int4 ovl);
339 static void SPI_FragSplFree(SPI_FragSplPtr fsp);
340 static int LIBCALLBACK SPI_CompareSpins(VoidPtr ptr1, VoidPtr ptr2);
341 static void SPI_OrderInternally(SPI_FragHerdPtr sfhp);
342 static int LIBCALLBACK SPI_CompareAlnPos(VoidPtr ptr1, VoidPtr ptr2);
343 static SPI_RegionInfoPtr SPI_GetResultsForCDS(SPI_RegionInfoPtr srip_mrna, BioseqPtr bsp_mrna, SPI_OptionsPtr spot);
344 static void SPI_FillInUTRInfo(SPI_RegionInfoPtr srip_cds, SPI_RegionInfoPtr srip_mrna, Int4 len, Int4 exonstart, Int4 exonstop);
345 static Boolean LIBCALLBACK SPI_GetCDS(SeqFeatPtr sfp, SeqMgrFeatContextPtr context);
346 static Boolean LIBCALLBACK SPI_GetCDSFeat(SeqFeatPtr sfp, SeqMgrFeatContextPtr context);
347 static Int4 SPI_FindLongestProt(CharPtr seq, Int4Ptr pos);
348 static Boolean SPI_GetAccessionFromSeqId(SeqIdPtr sip, Int4Ptr gi, CharPtr PNTR id);
349 static void SPI_CheckSplicesForRevComp(SPI_RegionInfoPtr srip, SPI_OptionsPtr spot, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna);
350 static ACTProfilePtr SPI_ProfileNew(Boolean nuc);
351 static ACTProfilePtr SPI_ProfileFree(ACTProfilePtr app);
352 static ACTProfilePtr SPI_ProfileSetFree(ACTProfilePtr app);
353 static void SPI_BuildProfile(SeqLocPtr slp, ACTProfilePtr PNTR app, Int4Ptr count, Int4 length);
354 static ACTProfilePtr SPI_MakeProfileFromSA(SeqAlignPtr sap);
355 static int SPI_Choose2LooseMrnaOvLap (const SeqAlignPtr sap1, const SeqAlignPtr sap2, const SPI_mRNAPtr smp, const int ptr1offset);
356 static void SPI_CheckMrnaOrder(SPI_IvalPtr PNTR spi_pp, const int num);
357 
358 
359 /***************************************************************************
360 *
361 *  SPI_AlnmRNAToGenomic is available to outside programs; just pass in the two
362 *  bioseqs and options (to use default options, just pass in NULL, and to use
363 *  other options, call SPI_OptionsNew to get an initialized options pointer and
364 *  make the desired changes).  If options are passed in, they should be freed
365 *  using SPI_OptionsFree.  SPI_AlignmRNAToGenomic returns a linked list of
366 *  SPI_mRNAPtrs, one per gene model (default is to only return one gene model).
367 *  Each SPI_mRNAPtr (see spidey.h) has arrays specifying the exon boundaries in
368 *  genomic and mRNA coordinates as well as information about splice sites,
369 *  percent identity, number of gaps, etc.  The SPI_mRNAPtr also has one alignment
370 *  per exon as well as a single alignment (smp->continuous) that covers the entire
371 *  gene, with big gaps in the mRNA for the genomic introns.  The SPI_mRNAPtr should
372 *  be freed by the calling function, using SPI_mRNAFree.
373 *
374 *  SPI_AlnmRNAToGenomic should only be used on finished sequence; it can handle
375 *  interspecies comparisons but doesn't work on draft sequence.
376 *
377 ***************************************************************************/
SPI_AlignmRNAToGenomic(BioseqPtr bsp_genomic,BioseqPtr bsp_mrna,SPI_OptionsPtr spot)378 NLM_EXTERN SPI_mRNAPtr SPI_AlignmRNAToGenomic(BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
379 {
380    Int4               a;
381    Int4               i;
382    Boolean            lcl;
383    SeqAlignPtr        parent;
384    SPI_Progress       progress;
385    SeqAlignPtr        PNTR saparray;
386    SPI_mRNAPtr        smp_head;
387    SPI_mRNAPtr        smp_prev;
388    SPI_bsinfoPtr      spig;
389    SPI_bsinfoPtr      spim;
390    SPI_OptionsPtr     spot_lcl;
391    SPI_RegionInfoPtr  srip;
392    SPI_RegionInfoPtr  srip_head;
393 
394    if (bsp_genomic == NULL || bsp_mrna == NULL)
395       return NULL;
396    a = SPI_IsItPolyA(bsp_mrna->id);
397    if (spot == NULL)
398    {
399       spot_lcl = SPI_OptionsNew();
400       lcl = TRUE;
401    } else
402    {
403       spot_lcl = spot;
404       lcl = FALSE;
405    }
406    spig = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
407    spig->bsp = bsp_genomic;
408    spim = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
409    spim->bsp = bsp_mrna;
410    srip_head = SPI_AlnSinglemRNAToGen(spig, spim, NULL, NULL, spot_lcl);
411    if (spot_lcl->callback != NULL)
412    {
413       progress.percentdone = 100;
414       progress.returncode = SPI_FINISHED;
415       if (!spot_lcl->callback(&progress))
416          return NULL;
417    }
418    smp_head = smp_prev = NULL;
419    srip = srip_head;
420    while (srip != NULL)
421    {
422       if (srip->smp->polyAtail == 0)
423          srip->smp->polyAtail = 0-a;
424       srip->smp->revcomp = srip->revcomp;
425       if (smp_head != NULL)
426       {
427          smp_prev->next = srip->smp;
428          smp_prev = srip->smp;
429       } else
430          smp_head = smp_prev = srip->smp;
431       saparray = (SeqAlignPtr PNTR)MemNew((srip->smp->numexons)*sizeof(SeqAlignPtr));
432       for (i=0; i<srip->smp->numexons; i++)
433       {
434          saparray[i] = SeqAlignDup(srip->smp->saps[i]);
435          AlnMgr2IndexSingleChildSeqAlign(saparray[i]);
436       }
437       srip->smp->continuous = SPI_CreateContinuousAln(srip->smp->saps, srip->smp->numexons);
438       for (i=0; i<srip->smp->numexons; i++)
439       {
440          SeqAlignFree(srip->smp->saps[i]);
441          srip->smp->saps[i] = saparray[i];
442          if (i < srip->smp->numexons-1)
443          {
444             srip->smp->saps[i]->next = saparray[i+1];
445             saparray[i+1]->next = NULL;
446          }
447       }
448       MemFree(saparray);
449       parent = SeqAlignNew();
450       parent->segtype = SAS_DISC;
451       parent->segs = (Pointer)(srip->smp->saps[0]);
452       AlnMgr2IndexLite(parent);
453       srip->smp->parent = parent;
454       srip->polyAtail = srip->smp->polyAtail;
455       srip->smp->fallsoff = srip->fallsoff;
456       if (srip->smp->protein == NULL)
457          srip->smp->protein = SPI_GetProteinFrommRNA(spim->bsp, &srip->smp->transstart);
458       srip->smp = NULL; /* so that smp doesn't get freed */
459       srip = srip->next;
460    }
461    SPI_bsinfoFreeList(spig);
462    SPI_bsinfoFreeList(spim);
463    SPI_RegionListFree(srip_head);
464    if (lcl)
465       SPI_OptionsFree(spot_lcl);
466    return (smp_head);
467 }
468 
469 /***************************************************************************
470 *
471 *  SPI_AlnSinglemRNAToGen is called by Main() as well as by
472 *  SPI_AlignmRNAToGenomic. It does the initial BLAST
473 *  (high stringency) and makes sure that all the alignments
474 *  are on the plus strand of the genomic sequence. It then calls the
475 *  functions to create the windows, align in the windows, and sort the
476 *  final alignments. Finally, it calls the functions to print the
477 *  information for each alignment, fetch the CDS and create its
478 *  alignment if requested, and create a continuous alignment if the
479 *  user wishes to print an asn.1 seqalign. When everything is done, it
480 *  frees the SPI_RegionInfoPtrs if the program is run as the
481 *  standalone version, or returns them to the calling function.
482 *
483 ***************************************************************************/
SPI_AlnSinglemRNAToGen(SPI_bsinfoPtr spig,SPI_bsinfoPtr spim,FILE * ofp,FILE * ofp2,SPI_OptionsPtr spot)484 NLM_EXTERN SPI_RegionInfoPtr SPI_AlnSinglemRNAToGen(SPI_bsinfoPtr spig, SPI_bsinfoPtr spim, FILE *ofp, FILE *ofp2, SPI_OptionsPtr spot)
485 {
486    Char                 rep_buf[1024] = "m L;R";
487    Int4                 i;
488    BLAST_OptionsBlkPtr  options;
489    SPI_Progress         progress;
490    SeqAlignPtr          salp;
491    SeqAlignPtr          salp_tmp;
492    SeqAlignPtr          sap;
493    SeqAlignPtr          sap_tmp;
494    SeqAlignPtr          sap_tmp2;
495    SeqLocPtr            slp1;
496    SeqLocPtr            slp2;
497    SPI_RegionInfoPtr    srip;
498    SPI_RegionInfoPtr    srip_cds;
499    SPI_RegionInfoPtr    srip_tmp;
500    Boolean              standalone;
501    Uint1                strand;
502 
503    if (spig == NULL || spim == NULL)
504       return NULL;
505    /*sanity checks for to & from*/
506    if (spot->to == 0){
507        spot->to = spig->bsp->length - 1;
508    }
509    else if (spot->to < spot->from){
510        Int4 new_from = spot->to;
511        spot->to = spot->from;
512        spot->from = new_from;
513    }
514    if (spot->from == spot->to){
515        return NULL;
516    }
517    /***
518    if (spot->to < spot->from)
519       return NULL;
520    ***/
521    if (ofp == NULL)
522       standalone = FALSE;
523    else
524       standalone = TRUE;
525    spot->printheader = TRUE;
526    options = BLASTOptionNew("blastn", FALSE);
527 
528    /* KSK added to allow user defined repeat db path */
529    /* options->filter_string = StringSave("m L;R"); */
530    if (spot->repeat_db_file){
531         strcat(rep_buf, " -d ");
532         strcat(rep_buf, spot->repeat_db_file);
533    }
534    options->filter_string = StringSave(rep_buf);
535    /*end of adding repeat db path */
536    options->expect_value = spot->firstpasseval;
537    options->query_lcase_mask = spot->lcaseloc;
538    if (spot->interspecies)
539    {
540       options->gap_x_dropoff_final = 100;
541       options->gap_open = 5;
542       options->gap_extend = 1;
543       options->penalty = -1;
544    }
545    /* do the BLAST with the mRNA as the query, for speed */
546    if (spot->callback != NULL)
547    {
548       progress.percentdone = 5;
549       progress.returncode = SPI_START;
550       if (!spot->callback(&progress))
551          return NULL;
552    }
553    /*** not checking for to/from, so it's gotta be just as easy
554    to send it directly to ..ByLoc()***
555    if (spot->strand == Seq_strand_both)
556     sap = BlastTwoSequences(spim->bsp, spig->bsp, "blastn", options);
557    else
558    {
559    ***/
560    slp1 = SeqLocIntNew(0, spim->bsp->length-1, Seq_strand_plus, spim->bsp->id);
561    slp2 = SeqLocIntNew(spot->from, spot->to, spot->strand, spig->bsp->id);
562    /* slp2 = SeqLocIntNew(0, spig->bsp->length-1, spot->strand, spig->bsp->id); */
563    sap = BlastTwoSequencesByLoc(slp1, slp2, "blastn", options);
564    /* } */
565 
566    if (spot->callback != NULL)
567    {
568       progress.percentdone = 30;
569       progress.returncode = SPI_PROGRESS;
570       if (!spot->callback(&progress))
571          return NULL;
572    }
573    if (sap == NULL)
574    {
575       if (standalone)
576          SPI_PrintResult(ofp, ofp2, NULL, spig->bsp, spim->bsp, spot, FALSE);
577       BLASTOptionDelete(options);
578       return NULL;
579    }
580    if (!AlnMgr2IndexLite(sap))
581    {
582       if (standalone)
583          SPI_PrintResult(ofp, ofp2, NULL, spig->bsp, spim->bsp, spot, FALSE);
584       ErrPostEx(SEV_ERROR, 0, 0, "Alignment indexing error\n");
585       SeqAlignSetFree(sap);
586       BLASTOptionDelete(options);
587       return NULL;
588    }
589    salp = (SeqAlignPtr)(sap->segs);
590    /* since the mRNA was the query, need to flip all the alignments */
591    /* so that the mRNA will end up as the second row                */
592    SPI_flip_sa_list(salp);
593    /* now make sure that everything is on the plus strand of the genomic sequence */
594    while (salp != NULL)
595    {
596       strand = AlnMgr2GetNthStrand(salp, 1);
597       if (strand == Seq_strand_minus)
598       {
599          salp_tmp = salp->next;
600          salp->next = NULL;
601          SAIndex2Free2(salp->saip);
602          salp->saip = NULL;
603          salp = SeqAlignListReverseStrand(salp);
604          AlnMgr2IndexSingleChildSeqAlign(salp);
605          salp->next = salp_tmp;
606       }
607       salp = salp->next;
608    }
609    SPI_RemoveOutsideBounds(sap, spot);
610    BLASTOptionDelete(options);
611    srip = SPI_FindWindows(sap, spot);
612    SPI_PadRegions(srip, spig->bsp->length);
613    /* once the windows are found, throw out the original alignment */
614    /* and carefully align in each window                           */
615    SeqAlignSetFree(sap);
616    SPI_AlignInWindows(&srip, spig->bsp, spim->bsp, spot);
617    if (spot->callback != NULL)
618    {
619       progress.percentdone = 95;
620       progress.returncode = SPI_PROGRESS;
621       if (!spot->callback(&progress))
622          return NULL;
623    }
624    SPI_SortRegionsByScore(&srip, spot);
625    SPI_BeautifySMP(srip);
626    SPI_CheckSplicesForRevComp(srip, spot, spig->bsp, spim->bsp);
627    srip_cds = NULL;
628    /* if the CDS alignment is desired, fetch the CDS information and */
629    /* compute the CDS alignment by truncating the mRNA alignments    */
630    if (standalone)
631    {
632       srip_tmp = srip;
633       for (i=0; i<spot->numreturns; i++)
634       {
635          spot->printheader = FALSE;
636          if (spot->fetchcds)
637          {
638             srip_cds = SPI_GetResultsForCDS(srip_tmp, spim->bsp, spot);
639             SPI_BeautifySMP(srip_cds);
640          }
641          if (srip_cds != NULL && spot->ace == FALSE)
642          {
643             spot->printheader = TRUE;
644             SPI_PrintResult(ofp, ofp2, srip_cds, spig->bsp, spim->bsp, spot, TRUE);
645             SPI_RegionListFree(srip_cds);
646          } else if (srip_cds != NULL && spot->ace == TRUE)
647             SPI_PrintAce(ofp, srip_cds, spig->bsp, spim->bsp, TRUE);
648          if (spot->printheader) /* print the version info only once per mRNA/CDS */
649             spot->printheader = FALSE;
650          else
651             spot->printheader = TRUE;
652          /** KSK bug fix - access of null srip_tmp->smp that
653           results from SPI_SortRegionsByScore() removing
654          all below -c threshold **/
655          if (srip_tmp != NULL && srip_tmp->smp != NULL && srip_tmp->smp->protein == NULL)
656              srip_tmp->smp->protein = SPI_GetProteinFrommRNA(spim->bsp, &srip_tmp->smp->transstart);
657          if (spot->ace == FALSE)
658          {
659             if (srip_tmp == NULL)
660                SPI_PrintResult(ofp, ofp2, srip_tmp, spig->bsp, spim->bsp, spot, FALSE);
661             else
662             {
663                SPI_PrintResult(ofp, ofp2, srip_tmp, spig->bsp, spim->bsp, spot, FALSE);
664                srip_tmp = srip_tmp->next;
665             }
666          } else
667             SPI_PrintAce(ofp, srip_tmp, spig->bsp, spim->bsp, FALSE);
668       }
669       if (spot->printasn && srip != NULL && srip->smp != NULL)
670       {
671          sap_tmp = SPI_CreateContinuousAln(srip->smp->saps, srip->smp->numexons);
672          if (*(spot->sap_head) == NULL)
673             *(spot->sap_head) = sap_tmp;
674          else
675          {
676             sap_tmp2 = *(spot->sap_head);
677             while (sap_tmp2->next != NULL)
678             {
679                sap_tmp2 = sap_tmp2->next;
680             }
681             sap_tmp2->next = sap_tmp;
682          }
683       }
684       if (spot->makemult == FALSE)
685          SPI_RegionListFree(srip);
686       else
687          return srip;
688    } else
689    {
690       if (srip_cds != NULL)
691       {
692          srip_cds->next = srip;
693          srip = srip_cds;
694       }
695       return srip;
696    }
697    spot->printheader = FALSE;
698    return NULL;
699 }
700 
701 /***************************************************************************
702 *
703 *  SPI_CompareAlnPosForMult is the callback for the HeapSort in
704 *  SPI_MakeMultipleAlignment. It simply puts the alignments in order
705 *  along the genomic sequence, from least to greatest if the alignments
706 *  are on the plus strand, greatest to least otherwise.
707 *
708 ***************************************************************************/
SPI_CompareAlnPosForMult(VoidPtr ptr1,VoidPtr ptr2)709 static int LIBCALLBACK SPI_CompareAlnPosForMult(VoidPtr ptr1, VoidPtr ptr2)
710 {
711    Int4         from1;
712    Int4         from2;
713    SeqAlignPtr  sap1;
714    SeqAlignPtr  sap2;
715    Uint1        strand;
716    Int4         to1;
717    Int4         to2;
718 
719    sap1 = *((SeqAlignPtr PNTR)ptr1);
720    sap2 = *((SeqAlignPtr PNTR)ptr2);
721    strand = AlnMgr2GetNthStrand(sap1, 1);
722    AlnMgr2GetNthSeqRangeInSA(sap1, 1, &from1, &to1);
723    AlnMgr2GetNthSeqRangeInSA(sap2, 1, &from2, &to2);
724    if (strand == Seq_strand_minus)
725    {
726       if (from1 < from2)
727          return 1;
728       if (from2 < from1)
729          return -1;
730       if (to1 < to2)
731          return 1;
732       if (to2 < to1)
733          return -1;
734    } else
735    {
736       if (from1 < from2)
737          return -1;
738       if (from2 < from1)
739          return 1;
740       if (to1 < to2)
741          return -1;
742       if (to2 < to1)
743          return 1;
744    }
745    return 0;
746 }
747 
748 /***************************************************************************
749 *
750 *  SPI_OrderBlocksPlus is a callback for SPI_MakeMultipleAlignment. It
751 *  is used to order the blocks along the genomic sequence when the genomic
752 *  strand is plus.
753 *
754 ***************************************************************************/
SPI_OrderBlocksPlus(VoidPtr ptr1,VoidPtr ptr2)755 static int LIBCALLBACK SPI_OrderBlocksPlus(VoidPtr ptr1, VoidPtr ptr2)
756 {
757    SPI_BlockPtr  sbp1;
758    SPI_BlockPtr  sbp2;
759 
760    sbp1 = *((SPI_BlockPtr PNTR)ptr1);
761    sbp2 = *((SPI_BlockPtr PNTR)ptr2);
762    if (sbp1->from_g < sbp2->from_g)
763       return -1;
764    if (sbp2->from_g < sbp1->from_g)
765       return 1;
766    if (sbp1->to_g < sbp2->to_g)
767       return -1;
768    if (sbp1->to_g > sbp2->to_g)
769       return 1;
770    return 0;
771 }
772 
773 /***************************************************************************
774 *
775 *  SPI_OrderBlocksMinus is a callback for SPI_MakeMultipleAlignment. It
776 *  is used to order the blocks along the genomic sequence when the genomic
777 *  strand is minus.
778 *
779 ***************************************************************************/
SPI_OrderBlocksMinus(VoidPtr ptr1,VoidPtr ptr2)780 static int LIBCALLBACK SPI_OrderBlocksMinus(VoidPtr ptr1, VoidPtr ptr2)
781 {
782    SPI_BlockPtr  sbp1;
783    SPI_BlockPtr  sbp2;
784 
785    sbp1 = *((SPI_BlockPtr PNTR)ptr1);
786    sbp2 = *((SPI_BlockPtr PNTR)ptr2);
787    if (sbp1->from_g < sbp2->from_g)
788       return 1;
789    if (sbp2->from_g < sbp1->from_g)
790       return -1;
791    if (sbp1->to_g < sbp2->to_g)
792       return 1;
793    if (sbp1->to_g > sbp2->to_g)
794       return -1;
795    return 0;
796 }
797 
798 /***************************************************************************
799 *
800 *  SPI_CompareSecondRow is the HeapSort callback for SPI_RearrangeAlns;
801 *  it orders two seqaligns in lexical order of the seqid of their second
802 *  row. (All seqaligns involved have only two rows anyway).
803 *
804 ***************************************************************************/
SPI_CompareSecondRow(VoidPtr ptr1,VoidPtr ptr2)805 static int LIBCALLBACK SPI_CompareSecondRow(VoidPtr ptr1, VoidPtr ptr2)
806 {
807    Int4         ret;
808    SeqAlignPtr  sap1;
809    SeqAlignPtr  sap2;
810    SeqIdPtr     sip1;
811    SeqIdPtr     sip2;
812 
813    sap1 = *((SeqAlignPtr PNTR)ptr1);
814    sap2 = *((SeqAlignPtr PNTR)ptr2);
815    sip1 = AlnMgr2GetNthSeqIdPtr(sap1, 2);
816    sip2 = AlnMgr2GetNthSeqIdPtr(sap2, 2);
817    ret = SAM_OrderSeqID(sip1, sip2);
818    SeqIdFree(sip1);
819    SeqIdFree(sip2);
820    return ret;
821 }
822 
823 /***************************************************************************
824 *
825 *  SPI_RearrangeAlns arranges the alignments in a linked list in lexical
826 *  order of their second seqid, so that all blocks will have their
827 *  rows in the same order.
828 *
829 ***************************************************************************/
SPI_RearrangeAlns(SeqAlignPtr sap_head)830 static SeqAlignPtr SPI_RearrangeAlns(SeqAlignPtr sap_head)
831 {
832    Int4         i;
833    Int4         j;
834    SeqAlignPtr  sap;
835    SeqAlignPtr  PNTR saparray;
836 
837    i = 0;
838    sap = sap_head;
839    while (sap != NULL)
840    {
841       i++;
842       sap = sap->next;
843    }
844    if (i == 1)
845       return sap_head;
846    saparray = (SeqAlignPtr PNTR)MemNew(i*sizeof(SeqAlignPtr));
847    sap = sap_head;
848    i = 0;
849    while (sap != NULL)
850    {
851       saparray[i] = sap;
852       i++;
853       sap = sap->next;
854    }
855    HeapSort(saparray, i, sizeof(SeqAlignPtr), SPI_CompareSecondRow);
856    for (j=0; j<i-1; j++)
857    {
858       saparray[j]->next = saparray[j+1];
859    }
860    saparray[i-1]->next = NULL;
861    sap = saparray[0];
862    MemFree(saparray);
863    return sap;
864 }
865 
866 /***************************************************************************
867 *
868 *  SPI_MakeMultipleAlignment takes all exons returned from all mRNAs, all
869 *  regions, and makes multiple alignments out of them (after grouping them
870 *  into blocks. All blocks will not necessarily contain all mRNAs.
871 *
872 ***************************************************************************/
SPI_MakeMultipleAlignment(SPI_RegionInfoPtr srip_head)873 NLM_EXTERN void SPI_MakeMultipleAlignment(SPI_RegionInfoPtr srip_head)
874 {
875    SPI_BlockPtr       PNTR blockarray;
876    Int4               i;
877    Int4               j;
878    Int4               minus;
879    Int4               numblocks;
880    Int4               numsmps;
881    Int4               plus;
882    SeqAlignPtr        sap;
883    SeqAlignPtr        sap_head;
884    SeqAlignPtr        sap_prev;
885    SeqAlignPtr        sap_tmp;
886    SeqAlignPtr        PNTR saparray;
887    SPI_BlockPtr       sbp = NULL;
888    SPI_BlockPtr       sbp_head;
889    SPI_BlockPtr       sbp_prev;
890    SPI_MultPtr        smu;
891    SPI_RegionInfoPtr  srip;
892    Uint1              strand;
893    Uint1              strand_tmp;
894    SeqAlignPtr        sub_sap;
895 
896    if (srip_head->next == NULL)  /* only one alignment here */
897       return;
898    i = 0;
899    sap_head = sap_prev = NULL;
900    numsmps = 0;
901    minus = plus = 0;
902    srip = srip_head;
903    while (srip != NULL)
904    {
905       if (srip->smp != NULL)
906       {
907          if (srip->smp->strand == Seq_strand_minus)
908             minus++;
909          else
910             plus++;
911          for (j=0; j<srip->smp->numexons; j++)
912          {
913             sap = SeqAlignDup(srip->smp->saps[j]);
914             AlnMgr2IndexSingleChildSeqAlign(sap);
915             if (sap_head != NULL)
916             {
917                sap_prev->next = sap;
918                sap_prev = sap;
919             } else
920                sap_head = sap_prev = sap;
921             i++;
922          }
923          numsmps++;
924       }
925       srip = srip->next;
926    }
927    if (numsmps <= 1)
928    {
929       SeqAlignSetFree(sap_head);
930       return;
931    }
932    if (minus > plus)
933       strand = Seq_strand_minus;
934    else
935       strand = Seq_strand_plus;
936    saparray = (SeqAlignPtr PNTR)MemNew(i*sizeof(SeqAlignPtr));
937    sap = sap_head;
938    for (j=0; j<i; j++)
939    {
940       strand_tmp = AlnMgr2GetNthStrand(sap, 1);
941       if ((strand_tmp == Seq_strand_minus && strand != Seq_strand_minus) || (strand_tmp != Seq_strand_minus && strand == Seq_strand_minus))
942       {
943          sap_tmp = sap->next;
944          sap->next = NULL;
945          SeqAlignListReverseStrand(sap);
946          sap->next = sap_tmp;
947       }
948       saparray[j] = sap;
949       sap = sap->next;
950    }
951    HeapSort(saparray, i, sizeof(SeqAlignPtr), SPI_CompareAlnPosForMult);
952    for (j=0; j<i; j++)
953    {
954       saparray[j]->next = NULL;
955    }
956    sbp_head = sbp_prev = NULL;
957    for (j=0; j<i; j++)
958    {
959       if (sbp_head == NULL)
960       {
961          sbp = (SPI_BlockPtr)MemNew(sizeof(SPI_Block));
962          AlnMgr2GetNthSeqRangeInSA(saparray[j], 1, &sbp->from_g, &sbp->to_g);
963          sbp->sap = saparray[j];
964          saparray[j] = NULL;
965          sbp_head = sbp_prev = sbp;
966       } else
967       {
968          if (spi_overlaps(saparray[j], sbp))
969          {
970             sap_tmp = sbp->sap;
971             while (sap_tmp->next != NULL)
972             {
973                sap_tmp = sap_tmp->next;
974             }
975             sap_tmp->next = saparray[j];
976             saparray[j] = NULL;
977          } else
978          {
979             sbp = (SPI_BlockPtr)MemNew(sizeof(SPI_Block));
980             AlnMgr2GetNthSeqRangeInSA(saparray[j], 1, &sbp->from_g, &sbp->to_g);
981             sbp->sap = saparray[j];
982             saparray[j] = NULL;
983             sbp_prev->next = sbp;
984             sbp_prev = sbp;
985          }
986       }
987    }
988    MemFree(saparray);
989    sbp = sbp_head;
990    numblocks = 0;
991    while (sbp)
992    {
993       numblocks++;
994       sbp->sap = SPI_RearrangeAlns(sbp->sap);
995       AlnMgr2IndexIndexedChain(sbp->sap);
996       sub_sap = AlnMgr2GetSubAlign(sbp->sap, 0, -1, 0, TRUE);
997       SeqAlignSetFree(sbp->sap);
998       sbp->sap = sub_sap;
999       if (strand == Seq_strand_minus)
1000          sbp->sap = SeqAlignListReverseStrand(sbp->sap);
1001       AlnMgr2IndexSingleChildSeqAlign(sub_sap);
1002       sbp = sbp->next;
1003    }
1004    blockarray = (SPI_BlockPtr PNTR)MemNew(numblocks*sizeof(SPI_BlockPtr));
1005    sbp = sbp_head;
1006    j = 0;
1007    while (sbp != NULL)
1008    {
1009       blockarray[j] = sbp;
1010       j++;
1011       sbp = sbp->next;
1012    }
1013    if (strand == Seq_strand_minus)
1014       HeapSort(blockarray, numblocks, sizeof(SPI_BlockPtr), SPI_OrderBlocksMinus);
1015    else
1016       HeapSort(blockarray, numblocks, sizeof(SPI_BlockPtr), SPI_OrderBlocksPlus);
1017    saparray = (SeqAlignPtr PNTR)MemNew(numblocks*sizeof(SeqAlignPtr));
1018    for (j=0; j<numblocks; j++)
1019    {
1020       saparray[j] = blockarray[j]->sap;
1021       MemFree(blockarray[j]);
1022    }
1023    MemFree(blockarray);
1024    smu = (SPI_MultPtr)MemNew(sizeof(SPI_Mult));
1025    smu->exons = saparray;
1026    smu->numexons = numblocks;
1027    srip_head->smu = smu;
1028 }
1029 
1030 /***************************************************************************
1031 *
1032 *  SPI_WriteAlnLine prints out the specified row of an alignment, between
1033 *  the alignment coordinates specified. It allocates the charptr itself;
1034 *  this must be freed later by the calling function.
1035 *
1036 ***************************************************************************/
SPI_WriteAlnLine(Int4 row,Int4 from,Int4 to,SeqAlignPtr sap)1037 static CharPtr SPI_WriteAlnLine(Int4 row, Int4 from, Int4 to, SeqAlignPtr sap)
1038 {
1039    AlnMsg2Ptr   amp;
1040    BioseqPtr   bsp;
1041    Uint1       buf[SPI_LINE+2];
1042    Int4        ctr;
1043    Int4        i;
1044    Boolean     more;
1045    Int4        n;
1046    SeqIdPtr    sip;
1047    SeqPortPtr  spp;
1048    CharPtr     string;
1049 
1050    n = AlnMgr2GetNumRows(sap);
1051    if (row > n || row < 1)
1052       return NULL;
1053    string = (CharPtr)MemNew((SPI_LINE+2)*sizeof(Char));
1054    for (n=0; n<(SPI_LINE+2); n++)
1055    {
1056       string[n] = '\0';
1057    }
1058    sip = AlnMgr2GetNthSeqIdPtr(sap, row);
1059    bsp = BioseqLockById(sip);
1060    amp = AlnMsgNew2();
1061    amp->row_num = row;
1062    amp->from_aln = from;
1063    amp->to_aln = to;
1064    if (amp->to_aln < 0)
1065       amp->to_aln = -1;
1066    n = 0;
1067    while ((more = AlnMgr2GetNextAlnBit(sap, amp)) == TRUE)
1068    {
1069       if (amp->to_row - amp->from_row > amp->to_aln - amp->from_aln) /* kludge */
1070       {
1071          if (amp->strand == Seq_strand_minus)
1072             amp->from_row = amp->to_row - (amp->to_aln - amp->from_aln);
1073          else
1074             amp->to_row = amp->from_row + (amp->to_aln - amp->from_aln);
1075       }
1076       if (amp->type == AM_SEQ)
1077       {
1078          spp = SeqPortNew(bsp, amp->from_row, amp->to_row, amp->strand, Seq_code_iupacna);
1079          ctr = SeqPortRead(spp, buf, (amp->to_row - amp->from_row + 1));
1080          SeqPortFree(spp);
1081          for (i=n; i<n+ctr; i++)
1082          {
1083             string[i] = buf[i-n];
1084          }
1085          n += ctr;
1086       } else
1087       {
1088          for (i=n; i<(n+amp->to_row-amp->from_row+1); i++)
1089          {
1090             string[i] = '-';
1091          }
1092          n += amp->to_row-amp->from_row+1;
1093       }
1094    }
1095    AlnMsgFree2(amp);
1096    SeqIdFree(sip);
1097    return string;
1098 }
1099 
1100 /***************************************************************************
1101 *
1102 *  SPI_MapRowCoords finds the first non-gap character in a row and
1103 *  returns its sequence position. If the row consists only of gaps, it
1104 *  returns -1. If direction is RIGHT, the function searches upwards in
1105 *  alignment coordinates; otherwise it searches the other direction.
1106 *
1107 ***************************************************************************/
SPI_MapRowCoords(SeqAlignPtr sap,Int4 from,Int4 to,Int4 row,Uint1 direction)1108 static Int4 SPI_MapRowCoords(SeqAlignPtr sap, Int4 from, Int4 to, Int4 row, Uint1 direction)
1109 {
1110    Int4  pos;
1111 
1112    if (direction == SPI_RIGHT)
1113    {
1114       pos = AlnMgr2MapSeqAlignToBioseq(sap, from, row);
1115       from++;
1116       while (pos < 0 && from <= to)
1117       {
1118          pos = AlnMgr2MapSeqAlignToBioseq(sap, from, row);
1119          from++;
1120       }
1121    } else
1122    {
1123       pos = AlnMgr2MapSeqAlignToBioseq(sap, to, row);
1124       to--;
1125       while (pos < 0 && to >= from)
1126       {
1127          pos = AlnMgr2MapSeqAlignToBioseq(sap, to, row);
1128          to--;
1129       }
1130    }
1131    if (pos < 0)
1132       return -1;
1133    return pos;
1134 }
1135 
1136 /***************************************************************************
1137 *
1138 *  spi_get_num_places calculates the number of digits in a number, for
1139 *  display neatness purposes.
1140 *
1141 ***************************************************************************/
spi_get_num_places(Int4 num)1142 static Int4 spi_get_num_places(Int4 num)
1143 {
1144    FloatHi  f;
1145    Int4     i;
1146    Int4     x;
1147 
1148    x = 10;
1149    for (i=1; i<21; i++)
1150    {
1151       f = (FloatHi)num/(FloatHi)x;
1152       if (f < 1)
1153       {
1154          if (num < 0)
1155             return (i+1);
1156          else
1157             return i;
1158       }
1159       x = x*10;
1160    }
1161    if (num < 0)
1162       i++;
1163    return i;
1164 }
1165 
1166 /***************************************************************************
1167 *
1168 *  SPI_IsItMult looks over the alignments in smu->exons to see whether
1169 *  there are any aligments with dim > 2; if so, it returns SPI_MULT, and
1170 *  if not, it returns SPI_NOTMULT.
1171 *
1172 ***************************************************************************/
SPI_IsItMult(SPI_MultPtr smu)1173 static Int4 SPI_IsItMult(SPI_MultPtr smu)
1174 {
1175    Int4  i;
1176 
1177    if (smu == NULL)
1178       return SPI_NOTMULT;
1179    for (i=0; i<smu->numexons; i++)
1180    {
1181       if (AlnMgr2GetNumRows(smu->exons[i]) > 2)
1182          return SPI_MULT;
1183    }
1184    return SPI_NOTMULT;
1185 }
1186 
1187 /***************************************************************************
1188 *
1189 *  SPI_PrintMultipleAlignment prints a text or html report of the alignment
1190 *  computed by SPI_MakeMultipleAlignment.
1191 *
1192 ***************************************************************************/
SPI_PrintMultipleAlignment(SPI_RegionInfoPtr srip,Boolean html,BioseqPtr bsp,FILE * ofp)1193 NLM_EXTERN void SPI_PrintMultipleAlignment(SPI_RegionInfoPtr srip, Boolean html, BioseqPtr bsp, FILE * ofp)
1194 {
1195    Char         accsite[SPI_PSPLICE+2];
1196    Int4         c;
1197    Int4Ptr      coord;
1198    Int4         ctr;
1199    Int4         d;
1200    Char         don[SPI_PSPLICE+2];
1201    Int4         from;
1202    Int4         i;
1203    Int4         j;
1204    Int4         len;
1205    Boolean      local;
1206    Int4         n;
1207    Int4         ret;
1208    SeqAlignPtr  sap;
1209    SeqIdPtr     sip;
1210    SPI_MultPtr  smu;
1211    Int4         spacer;
1212    SeqPortPtr   spp;
1213    Uint1        strand;
1214    CharPtr      PNTR stringptr;
1215    Char         textid[42];
1216    Int4         to;
1217 
1218    if (srip == NULL || srip->smu == NULL)
1219       return;
1220    smu = srip->smu;
1221    if (ofp == NULL)
1222    {
1223       local = TRUE;
1224       ofp = FileOpen("stdout", "w");
1225    } else
1226       local = FALSE;
1227    fprintf(ofp, "\n\n");
1228    if (html)
1229       fprintf(ofp, "<h1><center>");
1230    fprintf(ofp, "Multiple Alignments\n");
1231    ret = SPI_IsItMult(smu);
1232    if (ret == SPI_NOTMULT)
1233    {
1234       fprintf(ofp, "None of the alignments in the set appears to be a multiple alignment.\n");
1235       return;
1236    }
1237    if (html)
1238    {
1239       fprintf(ofp, "<br></center></h1>\n");
1240       fprintf(ofp, "<table cellspacing=\"8\" cellpadding=\"5\" border=\"0\" width=\"600\">\n");
1241    }
1242    spacer = SPI_SPACER;
1243    for (i=0; i<smu->numexons; i++)
1244    {
1245       sap = smu->exons[i];
1246       n = AlnMgr2GetNumRows(sap);
1247       if (html)
1248       {
1249          fprintf(ofp, "<tr><td bgcolor=%s width=\"600\">", (i%2)?"#FFFFFF":"#FFFFCC");
1250          fprintf(ofp, "<a name=Block%d></a><h4>Block %d\n</h4><pre>", i+1, i+1);
1251       } else
1252          fprintf(ofp, "Block %d\n", i+1);
1253       for (j=0; j<n; j++)
1254       {
1255          if (j > 0)
1256             sip = AlnMgr2GetNthSeqIdPtr(sap, j+1);
1257          else
1258             sip = bsp->id;
1259          SeqIdWrite(sip, textid, PRINTID_FASTA_LONG, 41);
1260          AlnMgr2GetNthSeqRangeInSA(sap, j+1, &from, &to);
1261          strand = AlnMgr2GetNthStrand(sap, j+1);
1262          if (html)
1263             fprintf(ofp, "<font color=%s>%s: %d to %d</font> %s\n", (j==0)?"#336699":"#800080", textid, from+1, to+1, (strand == Seq_strand_minus)?"<font color=#FF0033>minus strand</font>":"");
1264          else
1265             fprintf(ofp, "%s: %d to %d %s\n", textid, from+1, to+1, (strand == Seq_strand_minus)?"minus strand":"");
1266          if (j > 0)
1267             SeqIdFree(sip);
1268       }
1269       fprintf(ofp, "\n");
1270       len = AlnMgr2GetAlnLength(sap, FALSE);
1271       /* get donor and acceptor sites */
1272       strand = AlnMgr2GetNthStrand(sap, 1);
1273       AlnMgr2GetNthSeqRangeInSA(sap, 1, &from, &to);
1274       if (strand == Seq_strand_minus)
1275       {
1276          spp = SeqPortNew(bsp, to+1, MIN(bsp->length-1, to+SPI_PSPLICE), Seq_strand_minus, Seq_code_iupacna);
1277          ctr = MIN(bsp->length-1, to+SPI_PSPLICE)-(to+1)+1;
1278       } else
1279       {
1280          spp = SeqPortNew(bsp, MAX(0, from-SPI_PSPLICE), from-1, Seq_strand_plus, Seq_code_iupacna);
1281          ctr = from-1-MAX(0, from-SPI_PSPLICE)+1;
1282       }
1283       ctr = SeqPortRead(spp, (Uint1Ptr)accsite, ctr);
1284       accsite[ctr] = '\0';
1285       SeqPortFree(spp);
1286       if (strand == Seq_strand_minus)
1287       {
1288          spp = SeqPortNew(bsp, MAX(0, from-SPI_PSPLICE), from-1, Seq_strand_minus, Seq_code_iupacna);
1289          ctr = from-1-MAX(0, from-SPI_PSPLICE)+1;
1290       } else
1291       {
1292          spp = SeqPortNew(bsp, to+1, MIN(to+SPI_PSPLICE, bsp->length-1), Seq_strand_plus, Seq_code_iupacna);
1293          ctr = MIN(to+SPI_PSPLICE, bsp->length-1)-(to+1)+1;
1294       }
1295       ctr = SeqPortRead(spp, (Uint1Ptr)don, ctr);
1296       don[ctr] = '\0';
1297       SeqPortFree(spp);
1298       StringLower(accsite);
1299       StringLower(don);
1300       fprintf(ofp, "%s<-flank\n", accsite);
1301       stringptr = (CharPtr PNTR)MemNew(n*sizeof(CharPtr));
1302       coord = (Int4Ptr)MemNew(n*sizeof(Int4));
1303       for (c=0; c<len; c+=SPI_LINE-10)
1304       {
1305          for (j=0; j<n; j++)
1306          {
1307             stringptr[j] = SPI_WriteAlnLine(j+1, c, MIN(c+SPI_LINE-10-1, len-1), sap);
1308             coord[j] = SPI_MapRowCoords(sap, c, MIN(c+SPI_LINE-10-1, len-1), j+1, SPI_RIGHT);
1309             if (coord[j] >= 0)
1310                coord[j]++;
1311          }
1312          for (j=0; j<n; j++)
1313          {
1314             if (html)
1315                fprintf(ofp, "<font color=%s>", (j==0)?"#336699":"#800080");
1316             fprintf(ofp, "%d", coord[j]);
1317             if (html)
1318                fprintf(ofp, "</font>");
1319             /* KSK */
1320             /* d = spi_get_num_places(coord[j]); */
1321             for ( d = spi_get_num_places(coord[j]);
1322                   d < spacer; d++)
1323             {
1324                fprintf(ofp, " ");
1325             }
1326             if (j == 0)
1327                fprintf(ofp, "%s", stringptr[j]);
1328             else
1329             {
1330                for (ctr=0; ctr<MIN(SPI_LINE-10, len-c); ctr++)
1331                {
1332                   if (stringptr[j][ctr] == stringptr[0][ctr])
1333                      fprintf(ofp, ".");
1334                   else
1335                   {
1336                      if (html && stringptr[0][ctr] != '-' && stringptr[j][ctr] != '-')
1337                         fprintf(ofp, "<font color=#FF0033>");
1338                      fprintf(ofp, "%c", stringptr[j][ctr]);
1339                      if (html && stringptr[0][ctr] != '-' && stringptr[j][ctr] != '-')
1340                         fprintf(ofp, "</font>");
1341                   }
1342                }
1343             }
1344             fprintf(ofp, "\n");
1345             if (j > 0)
1346                MemFree(stringptr[j]);
1347          }
1348          MemFree(stringptr[0]);
1349          if (c+SPI_LINE-10 < len)
1350             fprintf(ofp, "\n");
1351       }
1352       for (j=0; j<ctr+spacer-7; j++)
1353       {
1354          fprintf(ofp, " ");
1355       }
1356       fprintf(ofp, "flank->%s\n\n", don);
1357       if (html)
1358          fprintf(ofp, "<a href=#TOP>Top</a>\n");
1359       MemFree(stringptr);
1360       MemFree(coord);
1361       if (html)
1362          fprintf(ofp, "</pre></td></tr>\n");
1363    }
1364    if (html)
1365       fprintf(ofp, "</table>");
1366    if (local)
1367       FileClose(ofp);
1368 }
1369 
1370 /***************************************************************************
1371 *
1372 *  spi_overlaps decides whether a new seqalign overlaps the already
1373 *  established range of a block. If it does, the coordinates are checked
1374 *  to see if it extends that range; if so, the block range is widened.
1375 *  If there is no overlap, the function returns FALSE.
1376 *
1377 ***************************************************************************/
spi_overlaps(SeqAlignPtr sap,SPI_BlockPtr sbp)1378 static Boolean spi_overlaps(SeqAlignPtr sap, SPI_BlockPtr sbp)
1379 {
1380    Int4  from;
1381    Int4  to;
1382 
1383    AlnMgr2GetNthSeqRangeInSA(sap, 1, &from, &to);
1384    if ((from <= sbp->from_g && to >= sbp->from_g) || (from <= sbp->to_g && to >= sbp->to_g))
1385    {
1386       if (from < sbp->from_g)
1387          sbp->from_g = from;
1388       if (to > sbp->to_g)
1389          sbp->to_g = to;
1390       return TRUE;
1391    }
1392    return FALSE;
1393 }
1394 
1395 /***************************************************************************
1396 *
1397 *  SPI_BeautifySMP converts all coordinates to 1-based from 0-based, and
1398 *  runs through the mRNA to see whether any mRNA is missing; if so, it
1399 *  flags that alignment with holes=TRUE.
1400 *
1401 ***************************************************************************/
SPI_BeautifySMP(SPI_RegionInfoPtr srip)1402 static void SPI_BeautifySMP(SPI_RegionInfoPtr srip)
1403 {
1404    BioseqPtr    bsp;
1405    Int4         i;
1406    SeqIdPtr     sip;
1407    SPI_mRNAPtr  smp;
1408 
1409    /** KSK bug fix for when smp is null because
1410        SPI_SortRegionsByScore() removed everyone below
1411        -c threshold ***/
1412    while (srip != NULL && srip->smp != NULL)
1413    {
1414       smp = srip->smp;
1415       if (srip->polyAtail == 0 && smp->numexons > 1)
1416       {
1417          sip = AlnMgr2GetNthSeqIdPtr(smp->saps[0], 2);
1418          bsp = BioseqLockById(sip);
1419          srip->polyAtail = SPI_IsItPolyA(sip);
1420          BioseqUnlock(bsp);
1421          SeqIdFree(sip);
1422       }
1423       smp->holes = FALSE;
1424       if (smp->strand == Seq_strand_minus)
1425       {
1426          for (i=smp->numexons-2; i>0 && !smp->holes; i--)
1427          {
1428             if (smp->mstarts[i] != smp->mstops[i+1]+1)
1429                smp->holes = TRUE;
1430          }
1431       } else
1432       {
1433          for (i=1; i<smp->numexons && !smp->holes; i++)
1434          {
1435             if (smp->mstarts[i] != smp->mstops[i-1]+1)
1436                smp->holes = TRUE;
1437          }
1438       }
1439       for (i=0; i<smp->numexons; i++)
1440       {
1441          smp->mstarts[i]++;
1442          smp->mstops[i]++;
1443          smp->gstarts[i]++;
1444          smp->gstops[i]++;
1445       }
1446       srip = srip->next;
1447    }
1448 }
1449 
1450 /***************************************************************************
1451 *
1452 *  SPI_RemoveOutsideBounds removes alignments that fall outside the
1453 *  spot->from and spot->to bounds, so that regions won't be created
1454 *  outside these boundaries.
1455 *
1456 ***************************************************************************/
SPI_RemoveOutsideBounds(SeqAlignPtr sap,SPI_OptionsPtr spot)1457 static void SPI_RemoveOutsideBounds(SeqAlignPtr sap, SPI_OptionsPtr spot)
1458 {
1459    SeqAlignPtr  salp;
1460    SeqAlignPtr  salp_head;
1461    SeqAlignPtr  salp_next;
1462    SeqAlignPtr  salp_prev;
1463    Int4         start;
1464    Int4         stop;
1465 
1466    if (sap == NULL || spot == NULL)
1467       return;
1468    salp = (SeqAlignPtr)(sap->segs);
1469    salp_head = salp_prev = NULL;
1470    while (salp != NULL)
1471    {
1472       salp_next = salp->next;
1473       salp->next = NULL;
1474       AlnMgr2GetNthSeqRangeInSA(salp, 1, &start, &stop);
1475       if (start >= spot->from || stop <= spot->to)
1476       {
1477          if (salp_head != NULL)
1478          {
1479             salp_prev->next = salp;
1480             salp_prev = salp;
1481          } else
1482             salp_head = salp_prev = salp;
1483       } else
1484          SeqAlignFree(salp);
1485       salp = salp_next;
1486    }
1487    sap->segs = (Pointer)(salp_head);
1488 }
1489 
1490 /***************************************************************************
1491 *
1492 *  SPI_PadRegions takes a linked list of regions and adds SPI_PADDING
1493 *  to either end of each region.
1494 *
1495 ***************************************************************************/
SPI_PadRegions(SPI_RegionInfoPtr srip,Int4 bsplen)1496 static void SPI_PadRegions(SPI_RegionInfoPtr srip, Int4 bsplen)
1497 {
1498    while (srip != NULL)
1499    {
1500       srip->gstart = srip->gstart - SPI_PADDING;
1501       if (srip->gstart < 0)
1502          srip->gstart = 0;
1503       srip->gstop = srip->gstop + SPI_PADDING;
1504       if (srip->gstop > bsplen - 1)
1505          srip->gstop = bsplen - 1;
1506       srip = srip->next;
1507    }
1508 }
1509 
1510 
1511 
1512 
1513 /***************************************************************************
1514 *
1515 *  SPI_SortRegionsByScore is called after SPI_AlignInWindows to sort the
1516 *  final regions from best to worst for printing. Since there is now a
1517 *  complete mRNA alignment in each region, the regions can be more
1518 *  thoroughly assessed, and the regions are sorted by mRNA coverage,
1519 *  number of mismatches, and finally by genomic start position.
1520 *
1521 ***************************************************************************/
static void SPI_SortRegionsByScore(SPI_RegionInfoPtr PNTR srip, SPI_OptionsPtr spot)
{
   Int4               count;
   Int4               j;
   Int4               kept;
   SPI_RegionInfoPtr  PNTR srip_array;
   SPI_RegionInfoPtr  srip_head;
   SPI_RegionInfoPtr  srip_prev;
   SPI_RegionInfoPtr  srip_tmp;

   /* srip : in/out, head of the region list; on return it points to the
             sorted, filtered and truncated list (NULL if nothing
             survived the cutoffs).
      spot : supplies lencutoff, idcutoff and numreturns.
      Regions that are dropped are freed here. */
   if (srip == NULL || *srip == NULL || spot == NULL)
      return;

   /* copy the list into an array so that it can be heap-sorted */
   count = 0;
   for (srip_tmp = *srip; srip_tmp != NULL; srip_tmp = srip_tmp->next)
      count++;
   srip_array = (SPI_RegionInfoPtr PNTR)MemNew(count*sizeof(SPI_RegionInfoPtr));
   if (srip_array == NULL)
      return;  /* out of memory: leave the caller's list untouched */
   j = 0;
   for (srip_tmp = *srip; srip_tmp != NULL && j < count; srip_tmp = srip_tmp->next)
   {
      srip_array[j] = srip_tmp;
      j++;
   }
   HeapSort(srip_array, count, sizeof(SPI_RegionInfoPtr), SPI_CompareRegions);

   /* walk the sorted array: free the regions that don't score above the
      cutoffs and chain the remaining ones together in sorted order */
   srip_head = srip_prev = NULL;
   for (j=0; j<count; j++)
   {
      srip_tmp = srip_array[j];
      if (srip_tmp->smp == NULL || (srip_tmp->smp->mRNAcoverage < spot->lencutoff && !srip_tmp->fallsoff)|| srip_tmp->smp->mismatch > 100-spot->idcutoff)
      {
         SPI_RegionFree(srip_tmp);
         continue;
      }
      srip_tmp->next = NULL;
      if (srip_head == NULL)
         srip_head = srip_tmp;
      else
         srip_prev->next = srip_tmp;
      srip_prev = srip_tmp;
   }

   /* the array is scratch space only; release it on every path (it used
      to leak whenever fewer than two regions survived) */
   MemFree(srip_array);

   /* keep at most spot->numreturns regions, but always at least the
      best one */
   kept = 1;
   srip_prev = srip_head;
   while (srip_prev != NULL && srip_prev->next != NULL)
   {
      if (kept+1 > spot->numreturns) /* the next one is one too many */
      {
         srip_tmp = srip_prev->next;
         srip_prev->next = NULL;
         SPI_RegionListFree(srip_tmp);
      } else
      {
         srip_prev = srip_prev->next;
         kept++;
      }
   }
   *srip = srip_head;
}
1599 
1600 /***************************************************************************
1601 *
1602 *  SPI_CompareRegions is the HeapSort callback for SPI_SortRegionsByScore.
1603 *  It sorts the regions first by mRNA coverage, then by the number of
1604 *  mismatches in the mRNA-to-genomic alignment, and finally by the
1605 *  start position on the genomic sequence.
1606 *
1607 ***************************************************************************/
SPI_CompareRegions(VoidPtr ptr1,VoidPtr ptr2)1608 static int LIBCALLBACK SPI_CompareRegions(VoidPtr ptr1, VoidPtr ptr2)
1609 {
1610    SPI_RegionInfoPtr  srip1;
1611    SPI_RegionInfoPtr  srip2;
1612 
1613    if (ptr1 != NULL && ptr2 != NULL)
1614    {
1615       srip1 = *((SPI_RegionInfoPtr PNTR)ptr1);
1616       srip2 = *((SPI_RegionInfoPtr PNTR)ptr2);
1617       if (srip1->smp == NULL)
1618          return 1;
1619       if (srip2->smp == NULL)
1620          return -1;
1621       if (srip1->smp->mRNAcoverage > srip2->smp->mRNAcoverage)
1622          return -1;
1623       else if (srip1->smp->mRNAcoverage < srip2->smp->mRNAcoverage)
1624          return 1;
1625       else
1626       {
1627          if (srip1->smp->mismatch < srip2->smp->mismatch)
1628             return -1;
1629          else if (srip1->smp->mismatch > srip2->smp->mismatch)
1630             return 1;
1631          else
1632          {
1633             if (srip1->smp->gstarts[0] < srip2->smp->gstarts[0])
1634                return -1;
1635             else if (srip1->smp->gstarts[0] > srip2->smp->gstarts[0])
1636                return 1;
1637             else
1638                return 0;
1639          }
1640       }
1641    }
1642    return 0;
1643 }
1644 
1645 /***************************************************************************
1646 *
1647 *  SPI_PrintAce prints the spidey results in ACEDB format for compatibility
1648 *  with Jean Thierry-Mieg's Acembly software.
1649 *
1650 ***************************************************************************/
static void SPI_PrintAce(FILE *ofp, SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, Boolean is_cds)
{
   Int4         exonlen;
   Int4         gi;
   Int4         i;
   CharPtr      id1;
   CharPtr      id2;
   Int4         n;
   SPI_mRNAPtr  smp;
   const char   *suffix;

   /* ofp         : stream that receives the ACEDB records
      srip        : region whose alignment (srip->smp) is reported
      bsp_genomic : genomic bioseq, used for the "Sequence" identifier
      bsp_mrna    : mRNA/CDS bioseq, used for the "Sp_<id>" identifier
      is_cds      : selects the ".cds" / ".mrna" suffix and the CDS tag
      Nothing is written when there is no stream, no alignment or no
      exon to report. */
   if (ofp == NULL || srip == NULL || srip->smp == NULL)
      return;
   if (bsp_genomic == NULL || bsp_mrna == NULL)
      return;
   smp = srip->smp;
   if (smp->numexons < 1)
      return;  /* gstops[numexons-1] below would be out of range */

   /* NOTE(review): ownership of id1/id2 is not visible from here; they
      were not freed by the original code either -- confirm against
      SPI_GetAccessionFromSeqId. */
   SPI_GetAccessionFromSeqId(bsp_genomic->id, &gi, &id1);
   SPI_GetAccessionFromSeqId(bsp_mrna->id, &gi, &id2);
   suffix = (is_cds == TRUE) ? "cds" : "mrna";

   /* parent record: where the subsequence sits on the genomic sequence
      (1-based; the two coordinates are swapped on the minus strand) */
   fprintf(ofp, "Sequence %s\n", id1);
   fprintf(ofp, "Subsequence Sp_%s.%s", id2, suffix);
   if (smp->strand == Seq_strand_minus)
      fprintf(ofp, "\t%d\t%d\n\n", smp->gstops[smp->numexons-1]+1, smp->gstarts[0] + 1);
   else
      fprintf(ofp, "\t%d\t%d\n\n", smp->gstarts[0]+1, smp->gstops[smp->numexons-1]+1);

   /* child record: exon coordinates relative to the subsequence start */
   fprintf(ofp, "Sequence Sp_%s.%s\n", id2, suffix);
   fprintf(ofp, "Method Spidey\n");
   if (is_cds)
      fprintf(ofp, "CDS\n");
   n = 1;
   if (smp->strand != Seq_strand_minus)
   {
      for (i=0; i<smp->numexons; i++)
      {
         exonlen = smp->gstops[i] - smp->gstarts[i];
         fprintf(ofp, "Source_Exons\t%d\t%d\n", n, n + exonlen);
         n += exonlen;
         if (i < smp->numexons-1)  /* skip over the following intron */
            n += smp->gstarts[i+1] - smp->gstops[i];
      }
   } else
   {
      /* minus strand: walk the exons in reverse genomic order */
      for (i = smp->numexons-1; i>=0; i--)
      {
         exonlen = smp->gstops[i] - smp->gstarts[i];
         fprintf(ofp, "Source_Exons\t%d\t%d\n", n, n + exonlen);
         n += exonlen;
         if (i > 0)  /* skip over the preceding intron */
            n += smp->gstarts[i] - smp->gstops[i-1];
      }
   }
   fprintf(ofp, "DNA_Homol %s\n", id2);
   if (smp->missingends == SPI_LEFT)
      fprintf(ofp, "Start_not_found\n");
   else if (smp->missingends == SPI_RIGHT)
      fprintf(ofp, "Stop_not_found\n");
   else if (smp->missingends == SPI_BOTH)
      fprintf(ofp, "Start_not_found\nStop_not_found\n");
   fprintf(ofp, "\n\n");
}
1702 
1703 /***************************************************************************
1704 *
1705 *  SPI_PrintResult prints the summary report and (if requested) the
1706 *  text alignment. Since the exons are stored in the order of the
1707 *  genomic sequence, not the mRNA, they must be reversed to print the
1708 *  mRNA from 5' to 3'. The SPI_ExonProfPtr holds the information about
1709 *  the location of the gaps and mismatches, so this structure is sent
1710 *  to spi_print_mismatch_line, which interprets the information in the
1711 *  ExonProfPtr and creates the mismatch line (vertical bars for identity,
1712 *  nothing for gaps or mismatches).   (PRRESULT)
1713 *
1714 ***************************************************************************/
static void SPI_PrintResult(FILE *ofp, FILE *ofp2, SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot, Boolean isitCDS)
{
   AlnMsg2Ptr       amp;
   Boolean          begin;
   BioseqPtr        bsp;
   Char             buf[61];
   Int4             c;
   Char             ch;
   Int4             counter;
   Int4             ctr;
   Boolean          done;
   Boolean          end;
   Int4             endctr;
   CharPtr          endstr;
   SPI_ExonProfPtr  epp_curr;
   Int4             gstart, gbuflen = 0;
   Int4             i;
   Boolean          is_splice;
   Int4             j;
   Int4             l;
   Int4             len;
   Int4             maxline;
   Int4             minline;
   const char       *molname;
   Boolean          ng;
   Int4             nread;
   CharPtr          p;
   Int4             pos;
   Char             prot[SPI_LINE+5];
   Int4             r;
   Int4             s;
   SeqAlignPtr      sap;
   SPI_mRNAPtr      smp;
   Int4             splice;
   SeqPortPtr       spp = NULL;
   Int4             start;
   Boolean          start_prot;
   Int4             PNTR starts;
   Int4             stop;
   Int4             PNTR stops;
   Uint1            strand;
   Boolean          term;
   const char       *text;
   Char             text1[200];
   Char             text2[200];
   Char             textid1[42];
   Char             textid2[42];
   Int4             tmp;

   /* ofp         : receives the summary report (whenever printaln != 2)
      ofp2        : receives the text alignment (whenever printaln != 1);
                    replaced by ofp when printaln == 0
      srip        : region to report; NULL or a region without an
                    alignment produces "No alignment found."
      bsp_genomic : genomic bioseq
      bsp_mrna    : mRNA/CDS bioseq; reverse-complemented for the
                    duration of the call when srip->revcomp is set and
                    restored before returning
      spot        : options (printaln, printheader)
      isitCDS     : label the second sequence "CDS" instead of "mRNA"

      NOTE(review): prot is SPI_LINE+5 characters and is filled without
      bounds checks in the protein-line code below -- confirm that the
      longest line always fits. */
   if (bsp_genomic == NULL || bsp_mrna == NULL || spot == NULL)
      return;
   /* NOTE(review): with printaln == 3 a NULL ofp passes this test, yet
      the summary section below still writes to ofp -- confirm that
      callers never pass a NULL ofp in that mode. */
   if (spot->printaln != 3 && ofp == NULL)
      return;
   if (spot->printaln >= 2 && ofp2 == NULL)
      return;
   if (srip != NULL && srip->revcomp)
      BioseqRevComp(bsp_mrna);
   /* string literals instead of StringSave copies that were never freed */
   molname = isitCDS ? "CDS" : "mRNA";
   FastaDefLine (bsp_genomic, text1, 200, NULL, NULL, 0);
   SeqIdWrite(bsp_genomic->id, textid1, PRINTID_FASTA_LONG, 41);
   FastaDefLine (bsp_mrna, text2, 200, NULL, NULL, 0);
   SeqIdWrite(bsp_mrna->id, textid2, PRINTID_FASTA_LONG, 41);

   /************************* summary report *************************/
   if (spot->printaln != 2)
   {
      if (spot->printheader)
         fprintf(ofp, "--SPIDEY version 1.40--\n");
      fprintf(ofp, "Genomic: %s ", textid1);
      fprintf(ofp, "%s, ", text1);
      fprintf(ofp, "%d bp\n", bsp_genomic->length);
      fprintf(ofp, "%s: %s ", molname, textid2);
      fprintf(ofp, "%s, ", text2);
      /* the CDS interval lives in srip, which may be NULL at this point */
      if (isitCDS && srip != NULL)
         fprintf(ofp, "%d - %d, %d bp\n", srip->mstart+1, srip->mstop+1, srip->mlen);
      else
         fprintf(ofp, "%d bp\n", bsp_mrna->length);
      if (srip == NULL || srip->smp == NULL)
      {
         fprintf(ofp, "No alignment found.\n\n");
         fflush(ofp);
         goto restore;
      }
      smp = srip->smp;
      if (smp->strand == Seq_strand_minus)
         fprintf(ofp, "Strand: minus");
      else
         fprintf(ofp, "Strand: plus");
      if (srip->revcomp)
         fprintf(ofp, "  Reverse complement\n");
      else
         fprintf(ofp, "\n");
      fprintf(ofp, "Number of exons: %d\n", smp->numexons);
      splice = 0;

      for (i=0; i < smp->numexons; i++)
      {
         /* exons are stored in genomic order; print them in mRNA order */
         if (smp->strand == Seq_strand_minus)
            c = smp->numexons - i - 1;
         else
            c = i;
         splice += smp->splicedon[i];
         /* KSK to get correct exon info to report mismatches
            have to get the exon ptr to the right one */
         epp_curr = smp->epp;
         while (epp_curr != NULL && epp_curr->exonnum != c + 1)
            epp_curr = epp_curr->next;
         if (srip->revcomp)
            fprintf(ofp, "Exon %d: %d-%d (gen)  %d-%d (%s)  id %.1f%%   mismatches %d  gaps %d  splice site (d  a): %d  %d", i+1, smp->gstarts[c], smp->gstops[c], bsp_mrna->length-smp->mstarts[c]+1, bsp_mrna->length-smp->mstops[c]+1, molname, smp->exonid[c], (epp_curr != NULL ? epp_curr->nummismatches : 0), smp->exongaps[c], smp->splicedon[c], smp->spliceacc[c]);
         else
            fprintf(ofp, "Exon %d%s: %d-%d (gen)  %d-%d (%s)  id %.1f%% mismatches %d gaps %d  splice site (d  a): %d  %d", i+1, smp->strand == Seq_strand_minus?"(-)":"", smp->gstarts[c], smp->gstops[c], smp->mstarts[c], smp->mstops[c], molname, smp->exonid[c], (epp_curr != NULL ? epp_curr->nummismatches : 0), smp->exongaps[c], smp->splicedon[c], smp->spliceacc[c]);
         /* an internal exon with neither splice site is flagged */
         if (i > 0 && i<smp->numexons-1 && smp->splicedon[c] == 0 && smp->spliceacc[c] == 0)
            fprintf(ofp, "   uncertain\n");
         else
            fprintf(ofp, "\n");
      }
      fprintf(ofp, "Number of splice sites: %d\n", splice);
      fprintf(ofp, "%s coverage: %d%%\n", molname, smp->mRNAcoverage);
      fprintf(ofp, "overall percent identity: %.1f%%\n", (FloatHi)(100) - smp->mismatch);
      if (smp->missingends == SPI_BOTH)
         text = "both";
      else if (smp->missingends == SPI_NEITHER)
         text = "neither";
      else if (smp->missingends == SPI_LEFT)
         text = "left";
      else if (smp->missingends == SPI_RIGHT)
         text = "right";
      else
         text = "error";
      fprintf(ofp, "Missing %s ends: %s ", molname, text);
      if (srip->fallsoff)
         fprintf(ofp, " -- may fall off end");
      fprintf(ofp, "\n");
      if (!isitCDS) /* print poly(A) tail information for mRNAs */
      {
         if (srip->polyAtail > 0)
            fprintf(ofp, "Non-aligning poly(A)+ tail length: %d\n", srip->polyAtail);
         else if (srip->polyAtail < 0)
            fprintf(ofp, "Aligning poly(A)+ tail length: %d\n", -srip->polyAtail);
      } else /* print UTR %id information for CDSs */
      {
         if (srip->strand != Seq_strand_minus)
         {
            if (srip->utr.left != -1)
               fprintf(ofp, "5' UTR id %.1f%%\n", srip->utr.left);
            if (srip->utr.right != -1)
               fprintf(ofp, "3' UTR id %.1f%%\n", srip->utr.right);
         } else
         {
            if (srip->utr.right != -1)
               fprintf(ofp, "5' UTR id %.1f%%\n", srip->utr.right);
            if (srip->utr.left != -1)
               fprintf(ofp, "3' UTR id %.1f%%\n", srip->utr.left);
         }
         if (srip->gstart == 1)
            fprintf(ofp, "5' partial\n");
         if (srip->gstop == 1)
            fprintf(ofp, "3' partial\n");
      }
      fprintf(ofp, "\n");
      fflush(ofp);
   }

   /************************* text alignment *************************/
   if (spot->printaln != 1)  /* print alignment too */
   {
      if (spot->printaln == 0)
         ofp2 = ofp;
      fprintf(ofp2, "Genomic: %s %s\n", textid1, text1);
      fprintf(ofp2, "%s: %s %s\n", molname, textid2, text2);
      /* check for a missing alignment before anything below touches
         srip->smp */
      if (srip == NULL || srip->smp == NULL)
      {
         if (spot->printaln > 0)
         {
            fprintf(ofp2, "No alignment found.\n\n");
            fflush(ofp2);
         }
         goto restore;
      }
      smp = srip->smp;
      if (spot->printaln > 0)
      {
         if (smp->strand == Seq_strand_minus)
            fprintf(ofp2, "Strand: minus");
         else
            fprintf(ofp2, "Strand: plus");
         if (srip->revcomp)
            fprintf(ofp2, "  Reverse complement\n");
         else
            fprintf(ofp2, "\n");
         fprintf(ofp2, "Number of exons: %d\n", smp->numexons);
      }
      p = smp->protein;
      pos = smp->transstart;
      amp = AlnMsgNew2();
      sap = NULL;
      for (i=0; i<smp->numexons; i++)
      {
         if (sap != NULL)
            SeqAlignFree(sap);
         sap = NULL;
         if (srip->strand == Seq_strand_minus)
         {
            c = smp->numexons - i - 1;
            sap = SeqAlignDup(smp->saps[c]);
            SeqAlignListReverseStrand(sap);
            AlnMgr2IndexSingleChildSeqAlign(sap);
            starts = smp->gstops;
            stops = smp->gstarts;
            epp_curr = smp->epp;
            while (epp_curr != NULL && epp_curr->exonnum != c+1)
               epp_curr = epp_curr->next;
            if (epp_curr != NULL)  /* need to change the mismatch positions now */
            {
               /* mirror every position, then reverse the array so that
                  it is ascending again */
               l = AlnMgr2GetAlnLength(sap, FALSE);
               for (j=0; j<epp_curr->nummismatches; j++)
                  epp_curr->mismatches[j] = l - epp_curr->mismatches[j] - 1;
               for (j=0; j<epp_curr->nummismatches/2; j++)
               {
                  tmp = epp_curr->mismatches[j];
                  epp_curr->mismatches[j] = epp_curr->mismatches[epp_curr->nummismatches-j-1];
                  epp_curr->mismatches[epp_curr->nummismatches-j-1] = tmp;
               }
            }
         } else
         {
            c = i;
            starts = smp->gstarts;
            stops = smp->gstops;
            epp_curr = smp->epp;
            while (epp_curr != NULL && epp_curr->exonnum != c+1)
               epp_curr = epp_curr->next;
         }
         if (srip->revcomp)
            fprintf(ofp2, "Exon %d: %d-%d (gen)  %d-%d (%s)\n", i+1, starts[c], stops[c], bsp_mrna->length-smp->mstarts[c]+1, bsp_mrna->length-smp->mstops[c]+1, molname);
         else
            fprintf(ofp2, "Exon %d: %d-%d (gen)  %d-%d (%s)\n", i+1, starts[c], stops[c], smp->mstarts[c], smp->mstops[c], molname);
         if (sap == NULL)
         {
            sap = SeqAlignDup(smp->saps[c]);
            AlnMgr2IndexSingleChildSeqAlign(sap);
         }
         strand = AlnMgr2GetNthStrand(sap, 1);
         len = AlnMgr2GetAlnLength(sap, FALSE);
         AlnMgr2GetNthSeqRangeInSA(sap, 1, &gstart, NULL);
         end = FALSE;
         ng = FALSE;
         term = FALSE;
         is_splice = FALSE;
         begin = TRUE;
         endstr = NULL;
         for (l=0; l<len+10; l+= SPI_LINE)
         {
            start_prot = TRUE;
            if (l == SPI_LINE)
               l = SPI_LINE - 10;   /* kludge to print genomic splice on 1st line */
            minline = maxline = -1; /* reset mRNA bounds */
            for (j=1; l<len && j<=2; j++)  /* row 1 = genomic, row 2 = mRNA */
            {
               fprintf(ofp2, "\n");
               if (j == 1)
               {
                  fprintf(ofp2, "\n");
                  bsp = bsp_genomic;
               } else
               {
                  bsp = bsp_mrna;
                  /* retrieve mismatch information, print the line of vertical bars */
                  spi_print_mismatch_line(ofp2, c+1, l, len-1, epp_curr, gstart);
               }
               AlnMsgReNew2(amp);
               amp->from_aln = l;
               if (l != 0)
               {
                  if (l+SPI_LINE-1 >= len-1)
                  {
                     end = TRUE;
                     amp->to_aln = -1;
                  } else
                     amp->to_aln = l+SPI_LINE-1;
               } else
               {
                  /* the first line is 10 columns shorter to leave room
                     for the leading splice context */
                  if (begin == FALSE)
                  {
                     if (l+SPI_LINE-1 >= len-1)
                        end = TRUE;
                  } else
                  {
                     if (l+SPI_LINE-1-10 >= len-1)
                        end = TRUE;
                  }
                  amp->to_aln = MIN(SPI_LINE-1-10, len-1);
               }
               amp->row_num = j;
               done = FALSE;
               if (is_splice == TRUE)
                  is_splice = FALSE;
               r=0;
               counter = 0;
               if (j == 2)
                  begin = FALSE;
               while (AlnMgr2GetNextAlnBit(sap, amp))
               {
                  if (j == 1 && amp->type == AM_SEQ)
                     counter += amp->to_row - amp->from_row + 1;
                  if (j == 2)
                  {
                     if (minline == -1 && amp->type == AM_SEQ)
                        minline = amp->from_row;
                     if (maxline == -1)
                        maxline = amp->to_row;
                  }
                  /* print splice site */
                  /** KSK fix for when minus strand is
                      at the end, and simplified this loop  ***/
                  if (l==0 && j==1 && !done)
                  {
                     if (amp->strand != Seq_strand_minus)
                     {
                        if (amp->from_row < 10)
                        {
                           start = 0;
                           gbuflen = amp->from_row;
                           stop = gbuflen - 1;
                        } else
                        {
                           start = amp->from_row - 10;
                           stop = amp->from_row - 1;
                           gbuflen = 10;
                        }
                     } else
                     {
                        if (amp->to_row + 10 < bsp_genomic->length-1)
                        {
                           stop = amp->to_row + 10;
                           start = amp->to_row + 1;
                           gbuflen = 10;
                        } else
                        {
                           stop = bsp_genomic->length-1;
                           gbuflen = (bsp_genomic->length - 1)
                               - (amp->to_row + 1) + 1;
                           start = amp->to_row + 1;
                        }
                     }
                     /** KSK fix continues so that only as many
                         bases as exist up to 10
                         will be read for the intron
                         buffer **/
                     if (gbuflen > 1)
                     {
                        spp = SeqPortNew(bsp_genomic, start, stop, amp->strand, Seq_code_iupacna);
                        ctr = SeqPortRead(spp, (Uint1Ptr)buf, gbuflen);
                        if (ctr < 0)   /* nothing usable was read */
                           ctr = 0;
                     } else
                     {
                        ctr = 0;
                        spp = NULL;
                     }
                     buf[ctr] = '\0';
                     /* right-justify in a 10-column field: pad first,
                        then write only the bases actually read (the
                        old code wrote 10 bytes of buf no matter what) */
                     for (s=ctr; s<10; s++)
                        fprintf(ofp2, " ");
                     fwrite(buf, 1, ctr, ofp2);
                     if (spp)
                        SeqPortFree(spp);
                     /** end of region of KSK fix **/
                     done = TRUE;
                  } else if (l==0 && j==2 && !done)
                  {
                     fprintf(ofp2, "          "); /* 10 spaces for splice site */
                     done = TRUE;
                     is_splice = TRUE;
                  }
                  if (amp->type == AM_SEQ)
                  {
                     spp = SeqPortNew(bsp, amp->from_row, amp->to_row, amp->strand, Seq_code_iupacna);
                     ctr = SeqPortRead(spp, (Uint1Ptr)buf, SPI_LINE);
                     /* always terminate, so that a failed or empty read
                        cannot print the previous contents of buf */
                     if (ctr < 0)
                        ctr = 0;
                     buf[ctr] = '\0';
                     fprintf(ofp2, "%s", buf);
                     SeqPortFree(spp);
                  } else /* print dashes for gaps */
                  {
                     for (ctr=0; ctr<(amp->to_row - amp->from_row+1); ctr++)
                        fprintf(ofp2, "-");
                  }
                  /* last genomic line: append the trailing splice context */
                  if (j==1 && end && counter >= amp->to_aln - amp->from_aln)
                  {
                     AlnMgr2GetNthSeqRangeInSA(sap, 1, &start, &stop);
                     if (strand != Seq_strand_minus)
                     {
                        if (stop > bsp_genomic->length - 11)
                           start = bsp_genomic->length;
                        else
                           start = stop + SPI_PSPLICE;
                        spp = SeqPortNew(bsp_genomic, stop+1, start, strand, Seq_code_iupacna);
                     } else
                     {
                        if (start < SPI_PSPLICE)
                           stop = 0;
                        else
                           stop = start - SPI_PSPLICE;
                        spp = SeqPortNew(bsp_genomic, stop, start-1, strand, Seq_code_iupacna);
                     }
                     endctr = ctr;
                     nread = SeqPortRead(spp, (Uint1Ptr)buf, SPI_PSPLICE);
                     if (nread < 0)
                        nread = 0;
                     buf[nread] = '\0';
                     /* print what still fits on this line, but never
                        more than was actually read */
                     for (ctr=0; endctr+ctr <= SPI_LINE-1 && ctr<nread; ctr++)
                        fprintf(ofp2, "%c", buf[ctr]);
                     /* whatever is left over goes on the next line */
                     MemFree(endstr);
                     endstr = NULL;
                     if (ctr < nread)
                        endstr = StringSave(&buf[ctr]);
                     SeqPortFree(spp);
                  }
                  if (pos <= maxline && amp->type == AM_SEQ && p != NULL && j == 2) /* at least part of this is coding --   */
                  {                                     /* print the protein sequence underneath*/
                     if (is_splice)
                     {
                        is_splice = FALSE;
                        for (s=r; s<r+11; s++)
                           prot[s] = ' ';
                        r = s-1;
                     }
                     if (start_prot)
                     {
                        for (ctr = 0; ctr < pos-minline; ctr++)
                        {
                           prot[r] = ' ';
                           r++;
                        }
                        start_prot = FALSE;
                     }
                     if (pos >= amp->from_row-1 && pos <= amp->to_row)
                     {
                        ng = TRUE;
                        if (pos == minline-1)
                        {
                           ch = *p;
                           prot[r] = ch;
                           r++;
                           prot[r] = ' ';
                           r++;
                           if (*p == '*')
                              term = TRUE;
                           p++;
                           pos+=3;
                        }
                        /* one residue per codon, centered under it */
                        for (ctr = pos; ctr < maxline && *p != '\0' && !term; ctr += 3)
                        {
                           ch = *p;
                           prot[r] = ' ';
                           r++;
                           prot[r] = ch;
                           r++;
                           prot[r] = ' ';
                           r++;
                           if (*p == '*')
                              term = TRUE;
                           p++;
                           pos+=3;
                        }
                     }
                  } else if (j == 2 && amp->type == AM_SEQ && p != NULL && pos>=minline && pos <= maxline)
                  {
                     for (s=0; s<(amp->to_row - amp->from_row+1); s++)
                     {
                        prot[r] = ' ';
                        r++;
                     }
                  }
               }
               if (j == 2 && ng == TRUE)
               {
                  prot[r] = '\0';
                  fprintf(ofp2, "\n%s\n", prot);
               }
            }
         }
         if (endstr == NULL) /* genomic sequence and overhang fit on the same line */
            fprintf(ofp2, "\n\n");
         else /* there's some extra genomic overhang sequence, print it on the next line */
         {
            fprintf(ofp2, "\n\n%s\n\n", endstr);
            MemFree(endstr);
            endstr = NULL;
         }
      }
      /* the copy made for the last exon was never released */
      if (sap != NULL)
         SeqAlignFree(sap);
      AlnMsgFree2(amp);
   }

restore:
   /* undo the reverse complement on every exit path */
   if (srip != NULL && srip->revcomp)
      BioseqRevComp(bsp_mrna);
}
2247 
2248 /***************************************************************************
2249 *
2250 *  SPI_PrintHerdResult is analogous to SPI_PrintResult; it prints a
2251 *  summary of the mRNA-to-draft alignment and, if requested, it also
2252 *  prints a text alignment. Since the exons are already in order of the
2253 *  mRNA sequence, printing the text of the alignments is pretty
2254 *  straightforward.
2255 *
2256 ***************************************************************************/
SPI_PrintHerdResult(FILE * ofp,FILE * ofp2,SPI_mRNAToHerdPtr herd,SPI_OptionsPtr spot,BioseqPtr bsp_genomic,BioseqPtr bsp_mrna)2257 static void SPI_PrintHerdResult(FILE *ofp, FILE *ofp2, SPI_mRNAToHerdPtr herd, SPI_OptionsPtr spot, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna)
2258 {
2259    AlnMsg2Ptr        amp;
2260    BioseqPtr        bsp;
2261    Char             buf[61];
2262    Int4             ctr;
2263    Boolean          done;
2264    Boolean          end;
2265    Int4             endctr;
2266    CharPtr          endstr = NULL;
2267    SPI_ExonProfPtr  epp_curr;
2268    Int4             gstart;
2269    Int4             i;
2270    Int4             j;
2271    Int4             l;
2272    Int4             len;
2273    Boolean          more;
2274    Int4             polyAtail;
2275    SeqAlignPtr      sap;
2276    SeqPortPtr       spp;
2277    Int4             start;
2278    Int4Ptr          starts;
2279    Int4             stop;
2280    Int4Ptr          stops;
2281    Uint1            strand;
2282    CharPtr          text;
2283    Char             text1[200];
2284    Char             text2[200];
2285    Char             textid1[42];
2286    Char             textid2[42];
2287 
2288    if (ofp == NULL || herd == NULL || bsp_genomic == NULL || bsp_mrna == NULL)
2289       return;
2290    fprintf(ofp, "--SPIDEY version 1.35--\n");
2291    FastaDefLine (bsp_genomic, text1, 200, NULL, NULL, 0);
2292    SeqIdWrite(bsp_genomic->id, textid1, PRINTID_FASTA_LONG, 41);
2293    fprintf(ofp, "Genomic: %s ", textid1);
2294    fprintf(ofp, "%s, ", text1);
2295    fprintf(ofp, "%d bp\n", bsp_genomic->length);
2296    FastaDefLine (bsp_mrna, text2, 200, NULL, NULL, 0);
2297    SeqIdWrite(bsp_mrna->id, textid2, PRINTID_FASTA_LONG, 41);
2298    fprintf(ofp, "mRNA: %s ", textid2);
2299    fprintf(ofp, "%s, ", text2);
2300    fprintf(ofp, "%d bp\n", bsp_mrna->length);
2301    if (herd->numpieces == 0)
2302    {
2303       fprintf(ofp, "No alignment found.\n\n");
2304       fflush(ofp);
2305       return;
2306    }
2307    fprintf(ofp, "Number of exons: %d\n", herd->numexons);
2308    fprintf(ofp, "Number of pieces: %d\n", herd->numpieces);
2309    for (i=0; i<herd->numpieces; i++)
2310    {
2311       fprintf(ofp, "Fragment %d Exon %d: %d-%d (gen)  %d-%d (mRNA)  id %.1f%%  gaps %d  splice site (d  a): %d  %d  ", herd->fragments[i], herd->exons[i], herd->gstarts[i], herd->gstops[i], herd->mstarts[i], herd->mstops[i], (100)*(1-(FloatHi)herd->pmismatch[i]/(FloatHi)herd->lens[i]), herd->pgaps[i], herd->splicedon[i], herd->spliceacc[i]);
2312       if (herd->strands[i] != Seq_strand_minus)
2313          fprintf(ofp, "Strand: plus\n");
2314       else
2315          fprintf(ofp, "Strand: minus\n");
2316       if (herd->fallsoff[i] != SPI_NEITHER)
2317       {
2318          if (herd->fallsoff[i] == SPI_LEFT)
2319             fprintf(ofp, "May fall off left side\n");
2320          else if (herd->fallsoff[i] == SPI_RIGHT)
2321             fprintf(ofp, "May fall off right side\n");
2322          else if (herd->fallsoff[i] == SPI_BOTH)
2323             fprintf(ofp, "May fall off both sides\n");
2324       }
2325    }
2326    fprintf(ofp, "mRNA coverage: %.1f%%\n", herd->mRNAcoverage);
2327    fprintf(ofp, "overall percent identity: %.1f%%\n", (FloatHi)(100) - herd->mismatch);
2328    if (herd->missingends == SPI_BOTH)
2329       text = StringSave("both");
2330    else if (herd->missingends == SPI_NEITHER)
2331       text = StringSave("neither");
2332    else if (herd->missingends == SPI_LEFT)
2333       text = StringSave("left");
2334    else if (herd->missingends == SPI_RIGHT)
2335       text = StringSave("right");
2336    else
2337       text = StringSave("error");
2338    fprintf(ofp, "Missing mRNA ends: %s\n", text);
2339    polyAtail = SPI_IsItPolyA(bsp_mrna->id);
2340    if (polyAtail >= SPI_MINPOLYASIZE)
2341       fprintf(ofp, "Poly(A)+ tail length: %d\n", polyAtail);
2342    fprintf(ofp, "\n");
2343    fflush(ofp);
2344    if (spot->printaln && ofp2 != NULL)  /* print alignment too */
2345    {
2346       epp_curr = herd->epp;
2347       fprintf(ofp2, "Genomic: %s %s\n", textid1, text1);
2348       fprintf(ofp2, "mRNA: %s %s\n", textid2, text2);
2349       amp = AlnMsgNew2();
2350       for (i=0; i<herd->numpieces; i++)
2351       {
2352          starts = herd->gstarts;
2353          stops = herd->gstops;
2354          epp_curr = herd->epp;
2355          while (epp_curr != NULL && epp_curr->exonnum != i+1)
2356          {
2357             epp_curr = epp_curr->next;
2358          }
2359          fprintf(ofp2, "Fragment %d Exon %d: %d-%d (gen)  %d-%d (mRNA)\n", herd->fragments[i], herd->exons[i], starts[i], stops[i], herd->mstarts[i], herd->mstops[i]);
2360          sap = herd->saps[i];
2361          strand = AlnMgr2GetNthStrand(sap, 1);
2362          len = AlnMgr2GetAlnLength(sap, FALSE);
2363          AlnMgr2GetNthSeqRangeInSA(sap, 1, &gstart, NULL);
2364          end = FALSE;
2365          for (l=0; l<len; l+= SPI_LINE)
2366          {
2367             if (l == SPI_LINE)
2368                l = SPI_LINE - 1 - 10;   /* kludge to print genomic splice on 1st line */
2369             for (j=1; j<=2; j++)
2370             {
2371                fprintf(ofp2, "\n");
2372                if (j == 1)
2373                {
2374                   fprintf(ofp2, "\n");
2375                   bsp = bsp_genomic;
2376                } else
2377                {
2378                   bsp = bsp_mrna;
2379                   spi_print_mismatch_line(ofp2, i+1, l, len-1, epp_curr, gstart);
2380                }
2381                AlnMsgReNew2(amp);
2382                amp->from_aln = l;
2383                if (l != 0)
2384                {
2385                   if (l+SPI_LINE-1 >= len-1)
2386                   {
2387                      end = TRUE;
2388                      amp->to_aln = -1;
2389                   } else
2390                      amp->to_aln = l+SPI_LINE-1;
2391                } else
2392                {
2393                   if (l+SPI_LINE-1 >= len-1)
2394                      end = TRUE;
2395                   amp->to_aln = MIN(SPI_LINE-1-10, len-1);
2396                }
2397                amp->row_num = j;
2398                done = FALSE;
2399                while ((Boolean)(more = AlnMgr2GetNextAlnBit(sap, amp)))
2400                {
2401                   if (l==0 && j==1 && !done)  /* print splice site */
2402                   {
2403                      if (amp->from_row != 0 && amp->from_row != bsp_genomic->length-1)
2404                      {
2405                         if (amp->strand != Seq_strand_minus)
2406                         {
2407                            if (amp->from_row < 10)
2408                               start = 0;
2409                            else
2410                               start = amp->from_row - 10;
2411                            stop = amp->from_row - 1;
2412                         } else
2413                         {
2414                            if (amp->to_row + 10 < bsp_genomic->length-1)
2415                               stop = amp->to_row + 10;
2416                            else
2417                               stop = bsp_genomic->length-1;
2418                            start = amp->to_row+1;
2419                         }
2420                         spp = SeqPortNew(bsp_genomic, start, stop, amp->strand, Seq_code_iupacna);
2421                         ctr = SeqPortRead(spp, (Uint1Ptr)buf, 10);
2422                         if (ctr > 0)
2423                            buf[ctr] = '\0';
2424                         fwrite(buf, 1, ctr, ofp2);
2425                         SeqPortFree(spp);
2426                      }
2427                      done = TRUE;
2428                   } else if (l==0 && j==2 && !done)
2429                   {
2430                      fprintf(ofp2, "          "); /* 10 spaces for splice site */
2431                      done = TRUE;
2432                   }
2433                   if (amp->type == AM_SEQ)
2434                   {
2435                      spp = SeqPortNew(bsp, amp->from_row, amp->to_row, amp->strand, Seq_code_iupacna);
2436                      ctr = SeqPortRead(spp, (Uint1Ptr)buf, SPI_LINE);
2437                      if (ctr > 0)
2438                         buf[ctr] = '\0';
2439                      fprintf(ofp2, "%s", buf);
2440                      SeqPortFree(spp);
2441                   } else
2442                   {
2443                      for (ctr=0; ctr<(amp->to_row - amp->from_row+1); ctr++)
2444                      {
2445                         fprintf(ofp2, "-");
2446                      }
2447                   }
2448                   if (j==1 && end)
2449                   {
2450                      AlnMgr2GetNthSeqRangeInSA(sap, 1, &start, &stop);
2451                      if (strand != Seq_strand_minus)
2452                      {
2453                         if (stop > bsp_genomic->length - 11)
2454                            start = bsp_genomic->length;
2455                         else
2456                            start = stop + SPI_PSPLICE;
2457                         spp = SeqPortNew(bsp_genomic, stop+1, start, strand, Seq_code_iupacna);
2458                      } else
2459                      {
2460                         if (start < SPI_PSPLICE)
2461                            stop = 0;
2462                         else
2463                            stop = start - SPI_PSPLICE;
2464                         spp = SeqPortNew(bsp_genomic, stop, start-1, strand, Seq_code_iupacna);
2465                      }
2466                      endctr = ctr;
2467                      ctr = SeqPortRead(spp, (Uint1Ptr)buf, SPI_PSPLICE);
2468                      if (ctr > 0)
2469                         buf[ctr] = '\0';
2470                      for (ctr=0; endctr+ctr <= SPI_LINE-1 && ctr<SPI_PSPLICE; ctr++)
2471                      {
2472                         fprintf(ofp2, "%c", buf[ctr]);
2473                      }
2474                      endstr = NULL;
2475                      if (ctr < SPI_PSPLICE)
2476                         endstr = StringSave(&buf[ctr]);
2477 	                 SeqPortFree(spp);
2478                   }
2479                }
2480             }
2481          }
2482          if (endstr == NULL)
2483             fprintf(ofp2, "\n\n");
2484          else
2485          {
2486             fprintf(ofp2, "\n\n%s\n\n", endstr);
2487             MemFree(endstr);
2488          }
2489       }
2490       AlnMsgFree2(amp);
2491    }
2492 }
2493 
2494 /***************************************************************************
2495 *
2496 *  spi_print_mismatch_line takes a SPI_ExonProf structure and interprets
2497 *  the mismatch locations into a line of vertical bars for identity,
2498 *  nothing for mismatches or gaps. spi_print_mismatch_line only goes from
2499 *  start to start+len-1 each time; it does not interpret the entire structure.
2500 *
2501 ***************************************************************************/
spi_print_mismatch_line(FILE * ofp,Int4 exonnum,Int4 start,Int4 len,SPI_ExonProfPtr epp,Int4 gstart)2502 static void spi_print_mismatch_line(FILE *ofp, Int4 exonnum, Int4 start, Int4 len, SPI_ExonProfPtr epp, Int4 gstart)
2503 {
2504    Int4  i;
2505    Int4  j;
2506    Int4  length;
2507 
2508    if (ofp == NULL)
2509       return;
2510    if (start == 0)
2511    {
2512       length = MIN(SPI_LINE-10, len+1);
2513       fprintf(ofp, "          "); /* 10 spaces for splice site */
2514    } else
2515       length = MIN(SPI_LINE, len-start+1);
2516    if (epp != NULL && epp->exonnum == exonnum)
2517    {
2518       j = 0;
2519       while (j<epp->nummismatches && epp->mismatches[j] < start)
2520       {
2521          j++;
2522       }
2523       for (i=0; i<length; i++)
2524       {
2525          if (j<epp->nummismatches && epp->mismatches[j] == start+i) /* here's a mismatch */
2526          {
2527             fprintf(ofp, " ");
2528             j++;
2529          } else /* not a mismatch */
2530             fprintf(ofp, "|");
2531       }
2532    } else /* there are no mismatches at all in this exon, so just print |s */
2533    {
2534       for (i=0; i<length; i++)
2535       {
2536          fprintf(ofp, "|");
2537       }
2538    }
2539    fprintf(ofp, "\n");
2540 }
2541 
2542 /***************************************************************************
2543 *
2544 *  SPI_CreateContinuousAln creates a single dense-seg seqalign from a
2545 *  set of mRNA-to-genomic alignments. The introns are represented simply
2546 *  as gaps in the mRNA sequence. SPI_CreateContinuousAln calls
2547 *  SPI_ExtendAlnRight to extend each of the exon alignments across the intron,
2548 *  then it merges the exon alignments together to create a single seqalign
2549 *  spanning the entire mRNA-to-genomic alignment.
2550 *
2551 ***************************************************************************/
SPI_CreateContinuousAln(SeqAlignPtr PNTR saps,Int4 numsaps)2552 static SeqAlignPtr SPI_CreateContinuousAln(SeqAlignPtr PNTR saps, Int4 numsaps)
2553 {
2554    DenseSegPtr  dsp;
2555    DenseSegPtr  dsp_tmp;
2556    Int4         i;
2557    Int4         j;
2558    Int4         n1;
2559    Int4         n2;
2560    Int4         numseg;
2561    SeqAlignPtr  salp;
2562    Int4         start1;
2563    Int4         start2;
2564    Int4         stop1;
2565    Int4         stop2;
2566    Uint1        strand;
2567 
2568    for (i=0; i<numsaps-1; i++)
2569    {
2570       AlnMgr2GetNthSeqRangeInSA(saps[i], 1, &start1, &stop1);
2571       AlnMgr2GetNthSeqRangeInSA(saps[i+1], 1, &start2, &stop2);
2572       if (start2 - stop1 > 1)  /* genomic gap */
2573          SPI_ExtendAlnRight(saps[i], 1, stop1+1, start2-1);
2574       AlnMgr2GetNthSeqRangeInSA(saps[i], 2, &start1, &stop1);
2575       AlnMgr2GetNthSeqRangeInSA(saps[i+1], 2, &start2, &stop2);
2576       strand = AlnMgr2GetNthStrand(saps[i], 2);
2577       if (strand == Seq_strand_minus)
2578       {
2579          if (start1 - stop2 > 1)
2580             SPI_ExtendAlnRight(saps[i], 2, stop2+1, start1-1);
2581       } else
2582       {
2583          if (start2 - stop1 > 1)
2584             SPI_ExtendAlnRight(saps[i], 2, stop1+1, start2-1);
2585       }
2586    }
2587    numseg = 0;
2588    for (i=0; i<numsaps; i++)
2589    {
2590       dsp_tmp = (DenseSegPtr)(saps[i]->segs);
2591       numseg += dsp_tmp->numseg;
2592    }
2593    /* now make a new seqalign across the whole set */
2594    dsp = DenseSegNew();
2595    dsp->dim = 2;
2596    dsp->numseg = numseg;
2597    dsp->starts = (Int4Ptr)MemNew(2*numseg*sizeof(Int4));
2598    dsp->lens = (Int4Ptr)MemNew(numseg*sizeof(Int4));
2599    dsp->strands = (Uint1Ptr)MemNew(2*numseg*sizeof(Uint1));
2600    n1 = n2 = 0;
2601    for (i=0; i<numsaps; i++)
2602    {
2603       dsp_tmp = (DenseSegPtr)(saps[i]->segs);
2604       if (dsp->ids == NULL)
2605          dsp->ids = SeqIdDupList(dsp_tmp->ids);
2606       for (j=0; j<2*dsp_tmp->numseg; j++)
2607       {
2608          dsp->starts[n1+j] = dsp_tmp->starts[j];
2609          dsp->strands[n1+j] = dsp_tmp->strands[j];
2610       }
2611       for (j=0; j<dsp_tmp->numseg; j++)
2612       {
2613          dsp->lens[n2+j] = dsp_tmp->lens[j];
2614       }
2615       n1 += 2*dsp_tmp->numseg;
2616       n2 += dsp_tmp->numseg;
2617    }
2618    salp = SeqAlignNew();
2619    salp->type = SAT_PARTIAL;
2620    salp->segtype = SAS_DENSEG;
2621    salp->dim = 2;
2622    salp->segs = (Pointer)(dsp);
2623    AlnMgr2IndexSingleChildSeqAlign(salp);
2624    return salp;
2625 }
2626 
2627 /***************************************************************************
2628 *
2629 *  SPI_ExtendAlnRight is used by SPI_CreateContinuousAln to extend each
2630 *  exon alignment across the intron. SPI_ExtendAlnRight simply adds a
2631 *  segment to the exon alignment (or extends an existing segment, if
2632 *  possible) that has a gap in the mRNA sequence. SPI_ExtendAlnRight assumes
2633 *  that the input alignment is a child seqalign with two rows.
2634 *
2635 ***************************************************************************/
SPI_ExtendAlnRight(SeqAlignPtr sap,Int4 which_row,Int4 start,Int4 stop)2636 static void SPI_ExtendAlnRight(SeqAlignPtr sap, Int4 which_row, Int4 start, Int4 stop)
2637 {
2638    DenseSegPtr  dsp;
2639    Int4         i;
2640    Int4Ptr      lens;
2641    Int4Ptr      starts;
2642    Uint1Ptr     strands;
2643 
2644    if (sap == NULL)
2645       return;
2646    if (which_row > 2)
2647       return;
2648    dsp = (DenseSegPtr)(sap->segs);
2649    if (dsp->starts[2*(dsp->numseg-1) + which_row - 1] == -1 || dsp->starts[2*(dsp->numseg-1) + (2-which_row)] != -1)
2650    {
2651       starts = (Int4Ptr)MemNew((dsp->numseg+1)*2*sizeof(Int4));
2652       strands = (Uint1Ptr)MemNew((dsp->numseg+1)*2*sizeof(Uint1));
2653       lens = (Int4Ptr)MemNew((dsp->numseg+1)*sizeof(Int4));
2654       for (i=0; i<dsp->numseg; i++)
2655       {
2656          lens[i] = dsp->lens[i];
2657       }
2658       for (i=0; i<=(dsp->dim)*(dsp->numseg-1)+1; i++)
2659       {
2660          starts[i] = dsp->starts[i];
2661          strands[i] = dsp->strands[i];
2662       }
2663       lens[dsp->numseg] = stop - start + 1;
2664       if (dsp->strands[which_row-1] != Seq_strand_minus)
2665          starts[(dsp->dim)*(dsp->numseg) + which_row - 1] = start;
2666       else
2667          starts[(dsp->dim)*(dsp->numseg) + which_row - 1] = stop;
2668       starts[(dsp->dim)*(dsp->numseg) + (2-which_row)] = -1;
2669       strands[(dsp->dim)*(dsp->numseg) + which_row - 1] = dsp->strands[which_row-1];
2670       strands[(dsp->dim)*(dsp->numseg) + (2-which_row)] = dsp->strands[2-which_row];
2671       MemFree(dsp->starts);
2672       MemFree(dsp->lens);
2673       MemFree(dsp->strands);
2674       dsp->numseg++;
2675       dsp->starts = starts;
2676       dsp->strands = strands;
2677       dsp->lens = lens;
2678    } else
2679    {
2680       dsp->lens[dsp->numseg-1] += stop - start + 1;
2681       if (dsp->strands[which_row-1] == Seq_strand_minus)
2682          dsp->starts[(dsp->dim)*(dsp->numseg-1) + which_row - 1] = stop;
2683    }
2684    SAIndex2Free2(sap->saip);
2685    sap->saip = NULL;
2686    AlnMgr2IndexSingleChildSeqAlign(sap);
2687 }
2688 
2689 /***************************************************************************
2690 *
2691 *  SPI_AlnSinglemRNAToPieces is the entry point for the mRNA-to-draft
2692 *  sequence functions of spidey, which create an alignment between mRNAs
2693 *  and a series of ordered, unordered, oriented, or unoriented (often
2694 *  a mixture of all of the above) fragments. SPI_AlnSinglemRNAToPieces
2695 *  reads in a tab-delimited file that has information about the order
2696 *  and orientation of the fragments. For example:
2697 
2698 *  ctg name  start   stop  fragment number accession.version
2699 *    |        |       |        | fragment code |   start  stop  strand
2700 *  9/ctg119 775986   784968	137	D	AC020712.4	96692	105674	-
2701 *  9/ctg119	784969	810517	138	D	AC022758.3	103385	128933	+
2702 *  9/ctg119	810518	810880	139	D	AC020712.4	54074	54436	+
2703 *  9/ctg119	810881	822654	140	D	AC022758.3	16691	28464	+
2704 *  9/ctg119	822655	822754	141	N	100	fragment	yes
2705 *  9/ctg119	822755	823638	142	D	AC020712.4	153248	154131	-
2706 *  9/ctg119 823639   823738   143   N  100   fragment no
2707 *  9/ctg119 823739   824581   144   F  AC021710.5  1728  2570  +
2708 
2709 *  The fragment code indicates whether the fragment is draft quality (D),
2710 *  finished (F), predraft (P), or a gap (N). If the fragment is a gap
2711 *  (which consists of 100 Ns), the yes/no field indicates whether the
2712 *  adjoining fragments are ordered across the gap. In spidey, a group
2713 *  is a set of fragments which all reside between the same two gaps.
2714 *  Linked groups, or lgroups, are two groups spanning a gap with a "yes"
2715 *  indicating that the groups have known order. Each fragment has an
2716 *  order within the group, as well.
2717 *  In this example, there are two lgroups (the last fragment is in its
2718 *  own lgroup) and three groups.
2719 *  SPI_AlnSinglemRNAToPieces reads in the tab-delimited file and creates
2720 *  a SPI_Pos structure for each fragment, indicating which group, lgroup,
2721 *  order, and original fragment number this fragment belongs to. Since
2722 *  the gap fragments are not used, the number of SPI_FragPtrs will be
2723 *  less than the number of fragments, so the original fragment numbers
2724 *  must be stored. This function also does the initial high-stringency
2725 *  BLAST alignment of the mRNA and the draft sequence; the alignments
2726 *  are put on the correct strand and then sent to other functions to
2727 *  order the alignments, make them consistent, connect them together,
2728 *  adjust the ends to splice sites, and finally to get summary
2729 *  statistics for printing.
2730 *
2731 ***************************************************************************/
SPI_AlnSinglemRNAToPieces(SPI_bsinfoPtr spig_head,SPI_bsinfoPtr spim,FILE * ofp,FILE * ofp2,SPI_OptionsPtr spot)2732 NLM_EXTERN SPI_mRNAToHerdPtr SPI_AlnSinglemRNAToPieces(SPI_bsinfoPtr spig_head, SPI_bsinfoPtr spim, FILE *ofp, FILE *ofp2, SPI_OptionsPtr spot)
2733 {
2734    AMAlignIndex2Ptr      amaip;
2735    Int4                 c;
2736    CharPtr              field[SPI_NUMCOLS];
2737    FILE                 *fp;
2738    Int4                 group;
2739    SPI_mRNAToHerdPtr    herd;
2740    Int4                 i;
2741    Char                 line[200];
2742    Boolean              linked;
2743    Int4                 lgroup;
2744    Int4                 numFields;
2745    BLAST_OptionsBlkPtr  options;
2746    Int4                 order;
2747    SPI_PosPtr           posp;
2748    CharPtr              ptr;
2749    Char                 token;
2750    SeqAlignPtr          salp;
2751    SeqAlignPtr          salp_prev;
2752    SeqAlignPtr          salp_tmp;
2753    SeqAlignPtr          sap;
2754    SeqAlignPtr          sap1;
2755    SeqAlignPtr          sap2;
2756    SPI_FragPtr          sfp;
2757    SPI_FragPtr          sfp_head;
2758    SPI_FragPtr          sfp_prev;
2759    SPI_FragHerdPtr      sfhp;
2760    SeqLocPtr            slp1;
2761    SeqLocPtr            slp2;
2762    Int4                 start;
2763    Int4                 stop;
2764    Uint1                strand;
2765 
2766    if (spot->draftfile == NULL)
2767       return NULL;
2768    if (spot->to < spot->from)
2769       return NULL;
2770    fp = FileOpen(spot->draftfile, "r");
2771    if (fp == NULL)
2772       return NULL;
2773    sfhp = (SPI_FragHerdPtr)MemNew(sizeof(SPI_FragHerd));
2774    sfp_head = sfp_prev = NULL;
2775    group = 0;
2776    order = 0;
2777    lgroup = 0;
2778    linked = FALSE;
2779    while (fgets(line, sizeof (line), fp) != NULL)
2780    {
2781       memset(field, 0, sizeof (field));
2782       ptr = line;
2783       if ((ptr = strchr(ptr, '\t')) == NULL)
2784          token = ' ';
2785       else
2786          token = '\t';
2787       ptr = line;
2788       for (numFields=0; numFields < SPI_NUMCOLS && ptr != NULL; numFields++)
2789       {
2790          if (numFields == 0)
2791             ptr = strtok(ptr, &token);
2792          else
2793             ptr = strtok(NULL, &token);
2794          field[numFields] = ptr;
2795       }
2796       if (!StringICmp(field[4], "N")) /* gap */
2797       {
2798          order = 0;
2799          if (!StringNICmp(field[7], "yes", 3*sizeof(Char))) /* ordered across gap */
2800          {
2801             linked = TRUE;
2802             if (sfp_prev != NULL)
2803             {
2804                if (sfp_prev->position_orig->lgroup != 0)
2805                   lgroup = sfp_prev->position_orig->lgroup;
2806                else
2807                {
2808                   lgroup++;
2809                   sfp_prev->position_orig->lgroup = lgroup;
2810                }
2811             } else
2812                lgroup++;
2813          } else
2814             linked = FALSE;
2815       } else
2816       {
2817          order++;
2818          sfp = (SPI_FragPtr)MemNew(sizeof(SPI_Frag));
2819          sfp->start = atol(field[1]);
2820          sfp->stop = atol(field[2]);
2821          sfp->fragnum = atol(field[3]);
2822          posp = (SPI_PosPtr)MemNew(sizeof(SPI_Pos));
2823          if (linked)
2824             posp->lgroup = lgroup;
2825          if (order > 2)
2826             posp->group = group;
2827          else if (order == 2)
2828          {
2829             if (sfp_prev != NULL)
2830                sfp_prev->position_orig->group = group;
2831          } else
2832             group++;
2833          posp->group = group;
2834          posp->order = order;
2835          sfp->position_orig = posp;
2836          if (sfp_head != NULL)
2837          {
2838             sfp_prev->next = sfp;
2839             sfp_prev = sfp;
2840          } else
2841             sfp_head = sfp_prev = sfp;
2842          sfhp->numfrags++;
2843       }
2844    }
2845    sfhp->sfparray = (SPI_FragPtr PNTR)MemNew((sfhp->numfrags)*sizeof(SPI_FragPtr));
2846    sfp = sfp_head;
2847    for (i=0; i<sfhp->numfrags; i++)
2848    {
2849       sfhp->sfparray[i] = sfp;
2850       sfp = sfp->next;
2851    }
2852    sfhp->polyAtail = SPI_IsItPolyA(spim->bsp->id);
2853    /* search genomic against both strands of mRNA */
2854    if (spot->from == spot->to == 0)
2855       spot->to = spig_head->bsp->length-1;
2856    slp2 = SeqLocIntNew(0, spim->bsp->length-1-sfhp->polyAtail, Seq_strand_minus, spim->bsp->id);
2857    slp1 = SeqLocIntNew(spot->from, spot->to, Seq_strand_plus, spig_head->bsp->id);
2858    options = BLASTOptionNew("blastn", FALSE);
2859    options->filter_string = StringSave("m L");
2860    options->expect_value = spot->secpasseval;
2861    options->query_lcase_mask = spot->lcaseloc;
2862    if (spot->interspecies)
2863    {
2864       options->gap_x_dropoff_final = 100;
2865       options->gap_open = 4;
2866       options->gap_extend = 1;
2867       options->penalty = -1;
2868    }
2869    sap1 = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
2870    SeqLocFree(slp2);
2871    BLASTOptionDelete(options);
2872    slp2 = SeqLocIntNew(0, spim->bsp->length-1-sfhp->polyAtail, Seq_strand_minus, spim->bsp->id);
2873    options = BLASTOptionNew("blastn", FALSE);
2874    options->filter_string = StringSave("m L");
2875    options->expect_value = spot->secpasseval;
2876    options->query_lcase_mask = spot->lcaseloc;
2877    if (spot->interspecies)
2878    {
2879       options->gap_x_dropoff_final = 100;
2880       options->gap_open = 4;
2881       options->gap_extend = 1;
2882       options->penalty = -1;
2883    }
2884    sap2 = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
2885    SeqLocFree(slp1);
2886    SeqLocFree(slp2);
2887    BLASTOptionDelete(options);
2888    AlnMgr2IndexLite(sap1);
2889    AlnMgr2IndexLite(sap2);
2890    sap = NULL;
2891    if (sap1 != NULL && sap2 != NULL)
2892    {
2893       salp = (SeqAlignPtr)(sap1->segs);
2894       while (salp->next != NULL)
2895       {
2896          salp = salp->next;
2897       }
2898       salp->next = (SeqAlignPtr)(sap2->segs);
2899       sap2->segs = NULL;
2900       SeqAlignFree(sap2);
2901       AMAlignIndex2Free2(sap1->saip);
2902       sap1->saip = NULL;
2903       AlnMgr2IndexLite(sap1);
2904       sap = sap1;
2905    } else if (sap1 == NULL)
2906       sap = sap2;
2907    else
2908       sap = sap1;
2909    if (sap == NULL)
2910       return NULL;
2911    SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
2912    AlnMgr2SortAlnSetByNthRowPos(sap, 1);
2913    c = 0;
2914    amaip = (AMAlignIndex2Ptr)(sap->saip);
2915    AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 1, &start, &stop);
2916    for (i=0; i<sfhp->numfrags && c<amaip->numsaps; i++)
2917    {
2918       salp_tmp = salp_prev = NULL;
2919       while (sfhp->sfparray[i]->start <= start && sfhp->sfparray[i]->stop >= start && c<amaip->numsaps)
2920       {
2921          if (salp_tmp == NULL)
2922             salp_tmp = salp_prev = SeqAlignDup(amaip->saps[c]);
2923          else
2924          {
2925             salp_prev->next = SeqAlignDup(amaip->saps[c]);
2926             salp_prev = salp_prev->next;
2927          }
2928          c++;
2929          if (c<amaip->numsaps)
2930             AlnMgr2GetNthSeqRangeInSA(amaip->saps[c], 1, &start, &stop);
2931       }
2932       if (salp_tmp != NULL)
2933       {
2934          AlnMgr2IndexLite(salp_tmp);
2935          SPI_RemoveInconsistentAlnsFromSet(salp_tmp, SPI_TEENYEXON, 1, SPI_LEFT);
2936          sfhp->sfparray[i]->sap = salp_tmp;
2937          /* change all alignments to be on the plus strand of the mRNA */
2938          strand = AlnMgr2GetNthStrand((SeqAlignPtr)(salp_tmp->segs), 2);
2939          if (strand == Seq_strand_minus)
2940             SeqAlignListReverseStrand((SeqAlignPtr)(salp_tmp->segs));
2941       }
2942    }
2943    SeqAlignSetFree(sap);
2944    SPI_OrderInternally(sfhp);
2945    /* take out overlaps */
2946    SPI_RemoveConflictsAmongPieces(sfhp, SPI_TEENYEXON);
2947    /* do an initial ordering */
2948    SPI_OrderPieces(sfhp, spim->bsp);
2949    /* then look for missing pieces */
2950    if (!SPI_ConnectAlnPieces(sfhp, spig_head->bsp, spim->bsp, spot))
2951       return NULL;
2952    SPI_OrderInternally(sfhp);
2953    /* take out any remaining overlaps */
2954    SPI_RemoveConflictsAmongPieces(sfhp, SPI_TEENYEXON);
2955    /* then do the final ordering */
2956    SPI_OrderPieces(sfhp, spim->bsp);
2957    SPI_AdjustSplicesInPieces(sfhp, spig_head->bsp, spot);
2958    herd = SPI_GetHerdInfo(sfhp, spim->bsp, spot);
2959    SPI_PrintHerdResult(ofp, ofp2, herd, spot, spig_head->bsp, spim->bsp);
2960    return herd;
2961 }
2962 
2963 /***************************************************************************
2964 *
2965 *  SPI_GetHerdInfo fills in a SPI_mRNAToHerd structure with all the
2966 *  appropriate information about mRNA and genomic starts, stops, and strands;
2967 *  presence of splice donor and acceptor sites; number of mismatches and
2968 *  gaps for each exon; and one alignment for each exon. SPI_GetHerdInfo
2969 *  first decides how many exons there are and allocates one ExonHerdInfo
2970 *  structure per exon to store the necessary information. The SPI_mRNAToHerd
2971 *  structure is then allocated, and for each exon, SPI_GetExonInfo is called
2972 *  to retrieve the number of gaps, the number of mismatches, and the
2973 *  mismatch line for printing. After all the exons' information is filled
2974 *  in, the function goes through again and checks to see whether any two
2975 *  exons are close to the edges of their respective fragments and abut
2976 *  each other on the mRNA. If so, these "exons" are probably a single exon
2977 *  and are assigned the same exon number. Finally, the alignments are
2978 *  checked to see whether small pieces at the 5' and 3' ends have been
2979 *  omitted; if so, these pieces are added to the alignments.
2980 *
2981 ***************************************************************************/
SPI_GetHerdInfo(SPI_FragHerdPtr sfhp,BioseqPtr bsp_mrna,SPI_OptionsPtr spot)2982 static SPI_mRNAToHerdPtr SPI_GetHerdInfo(SPI_FragHerdPtr sfhp, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
2983 {
2984    AMAlignIndex2Ptr      amaip;
2985    Int4                 b;
2986    Int4                 c;
2987    SPI_ExonHerdInfoPtr  ehi;
2988    SPI_ExonHerdInfoPtr  ehi_head;
2989    SPI_ExonHerdInfoPtr  ehi_prev;
2990    Int4                 end;
2991    SPI_mRNAToHerdPtr    herd;
2992    Int4                 i;
2993    Int4                 j;
2994    Int4                 k;
2995    Int4                 l;
2996    Int4                 last;
2997    Int4                 len;
2998    Int4                 len_last;
2999    Int4                 max;
3000    Int4                 min;
3001    Int4                 mis;
3002    Int4                 offset;
3003    SeqAlignPtr          salp;
3004    SeqAlignPtr          salp_tmp;
3005    SPI_FragPtr          sfp;
3006    SPI_mRNAPtr          smp_fake;
3007    Uint1                strand;
3008 
3009    herd = (SPI_mRNAToHerdPtr)MemNew(sizeof(SPI_mRNAToHerd));
3010    ehi_head = ehi_prev = NULL;
3011    for (i=0; i<sfhp->numfrags; i++)
3012    {
3013       sfp = sfhp->sfparray[i];
3014       if (sfp->sap != NULL)
3015       {
3016          amaip = (AMAlignIndex2Ptr)(sfp->sap->saip);
3017          strand = AlnMgr2GetNthStrand(amaip->saps[0], 1);
3018          if (strand == Seq_strand_minus)
3019          {
3020             k = amaip->numsaps-1;
3021             l = -1;
3022          } else
3023          {
3024             k = 0;
3025             l = 1;
3026          }
3027          for (j=k; j<amaip->numsaps && j > -1; j+=l)
3028          {
3029             ehi = (SPI_ExonHerdInfoPtr)MemNew(sizeof(SPI_ExonHerdInfo));
3030             ehi->sfpnum = i;
3031             herd->numpieces++;
3032             ehi->sap = amaip->saps[j];
3033             ehi->fragmentnum = sfp->fragnum;
3034             if (amaip->numsaps == 1)
3035             {
3036                ehi->acceptor = sfp->acceptor;
3037                ehi->donor = sfp->donor;
3038             } else if (j == 0)
3039             {
3040                ehi->acceptor = sfp->acceptor;
3041                ehi->donor = sfp->smp->splicedon[j];
3042             } else if (j == amaip->numsaps - 1)
3043             {
3044                ehi->donor = sfp->donor;
3045                ehi->acceptor = sfp->smp->spliceacc[j];
3046             } else
3047             {
3048                ehi->donor = sfp->smp->splicedon[j];
3049                ehi->acceptor = sfp->smp->spliceacc[j];
3050             }
3051             if (ehi_head != NULL)
3052             {
3053                ehi_prev->next = ehi;
3054                ehi_prev = ehi;
3055             } else
3056                ehi_head = ehi_prev = ehi;
3057          }
3058       }
3059    }
3060    ehi = ehi_head;
3061    if (ehi == NULL)
3062       return NULL;
3063    herd->fragments = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3064    herd->sfpnum = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3065    herd->exons = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3066    herd->mstarts = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3067    herd->mstops = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3068    herd->gstarts = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3069    herd->gstops = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3070    herd->lens = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3071    herd->strands = (Uint1Ptr)MemNew((herd->numpieces)*sizeof(Uint1));
3072    herd->splicedon = (Uint1Ptr)MemNew((herd->numpieces)*sizeof(Uint1));
3073    herd->spliceacc = (Uint1Ptr)MemNew((herd->numpieces)*sizeof(Uint1));
3074    herd->pmismatch = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3075    herd->pgaps = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3076    herd->saps = (SeqAlignPtr PNTR)MemNew((herd->numpieces)*sizeof(SeqAlignPtr));
3077    herd->fallsoff = (Uint1Ptr)MemNew((herd->numpieces)*sizeof(Uint1));
3078    i = 0;
3079    smp_fake = (SPI_mRNAPtr)MemNew(sizeof(SPI_mRNA));
3080    smp_fake->mstarts = (Int4Ptr)MemNew(sizeof(Int4));
3081    smp_fake->mstops = (Int4Ptr)MemNew(sizeof(Int4));
3082    smp_fake->gstarts = (Int4Ptr)MemNew(sizeof(Int4));
3083    smp_fake->gstops = (Int4Ptr)MemNew(sizeof(Int4));
3084    smp_fake->exonid = (FloatHiPtr)MemNew(sizeof(FloatHi));
3085    smp_fake->exongaps = (Int4Ptr)MemNew(sizeof(Int4));
3086    smp_fake->saps = (SeqAlignPtr PNTR)MemNew(sizeof(SeqAlignPtr));
3087    mis = 0;
3088    len = 0;
3089    len_last = 0;
3090    while (ehi != NULL) /* fill in info for each piece */
3091    {
3092       herd->splicedon[i] = ehi->donor;
3093       herd->spliceacc[i] = ehi->acceptor;
3094       herd->strands[i] = AlnMgr2GetNthStrand(ehi->sap, 1);
3095       herd->saps[i] = ehi->sap;
3096       smp_fake->saps[0] = ehi->sap;
3097       last = mis;
3098       len += SPI_GetExonInfo(smp_fake, 0, &b, &c, &mis, spot);
3099       herd->pmismatch[i] = mis - last;
3100       herd->lens[i] = len - len_last;
3101       herd->mstarts[i] = b;
3102       herd->mstops[i] = c;
3103       herd->gstarts[i] = smp_fake->gstarts[0];
3104       herd->gstops[i] = smp_fake->gstops[0];
3105       herd->pgaps[i] = smp_fake->exongaps[0];
3106       sfp = sfhp->sfparray[ehi->sfpnum];
3107       herd->fragments[i] = ehi->fragmentnum;
3108       herd->sfpnum[i] = ehi->sfpnum;
3109       if (herd->gstarts[i] < sfp->start + SPI_FUZZ)
3110       {
3111          if (herd->gstops[i] > sfp->stop - SPI_FUZZ)
3112             herd->fallsoff[i] = SPI_BOTH;
3113          else
3114             herd->fallsoff[i] = SPI_LEFT;
3115       } else
3116       {
3117          if (herd->gstops[i] > sfp->stop - SPI_FUZZ)
3118             herd->fallsoff[i] = SPI_RIGHT;
3119          else
3120             herd->fallsoff[i] = SPI_NEITHER;
3121       }
3122       ehi_prev = ehi;
3123       ehi = ehi->next;
3124       ehi_prev->next = NULL;
3125       MemFree(ehi_prev);
3126       i++;
3127    }
3128    herd->mismatch = (100)*(FloatHi)mis/(FloatHi)len;
3129    herd->epp = smp_fake->epp;
3130    herd->mRNAcoverage = 100*((FloatHi)len/(FloatHi)bsp_mrna->length);
3131    /* now run through to see whether any two pieces should be */
3132    /* merged into a single exon (both near fragment edges)    */
3133    b = 1;
3134    for (i=0; i<herd->numpieces-1; i++)
3135    {
3136       if (((herd->fallsoff[i] == SPI_RIGHT && herd->strands[i] == Seq_strand_plus)
3137         || (herd->fallsoff[i] == SPI_LEFT && herd->strands[i] == Seq_strand_minus)
3138         || (herd->fallsoff[i] == SPI_BOTH)) &&
3139            ((herd->fallsoff[i+1] == SPI_LEFT && herd->strands[i+1] == Seq_strand_plus)
3140         || (herd->fallsoff[i+1] == SPI_RIGHT && herd->strands[i+1] == Seq_strand_minus)
3141         || (herd->fallsoff[i+1] == SPI_BOTH)))
3142          {
3143             if (herd->mstarts[i+1] < herd->mstops[i] - SPI_TEENYEXON)
3144                herd->exons[i] = b;
3145          }
3146       else
3147       {
3148          herd->exons[i] = b;
3149          if (herd->fallsoff[i] == SPI_RIGHT)
3150             herd->fallsoff[i] = SPI_NEITHER;
3151          if (herd->fallsoff[i] == SPI_BOTH)
3152             herd->fallsoff[i] = SPI_LEFT;
3153          if (herd->fallsoff[i+1] == SPI_LEFT)
3154             herd->fallsoff[i+1] = SPI_NEITHER;
3155          if (herd->fallsoff[i+1] == SPI_BOTH)
3156             herd->fallsoff[i+1] = SPI_RIGHT;
3157          b++;
3158       }
3159    }
3160    herd->exons[i] = b;
3161    herd->numexons = b;
3162    /* now get %id per exon, #gaps per exon */
3163    i = 0;
3164    herd->exonid = (FloatHiPtr)MemNew((herd->numexons)*sizeof(FloatHi));
3165    herd->exongaps = (Int4Ptr)MemNew((herd->numexons)*sizeof(Int4));
3166    while (i<herd->numpieces)
3167    {
3168       min = herd->mstarts[i];
3169       b = herd->pmismatch[i];
3170       c = herd->pgaps[i];
3171       while (i<herd->numpieces-1 && herd->exons[i] == herd->exons[i+1])
3172       {
3173          i++;
3174          b += herd->pmismatch[i];
3175          c += herd->pgaps[i];
3176       }
3177       max = herd->mstops[i];
3178       herd->exongaps[herd->exons[i]-1] = c;
3179       herd->exonid[herd->exons[i]-1] = (FloatHi)(max - min + 1 - c - b)/(FloatHi)(max - min + 1 - c);
3180       i++;
3181    }
3182    /* check -- does the alignment leave off a tiny piece of the beginning */
3183    /* or end of the mRNA (ignoring the polyA tail) ? */
3184    /* first check the beginning */
3185    end = bsp_mrna->length - 1 - sfhp->polyAtail;
3186    if (herd->mstarts[0] > 0 && herd->mstarts[0] < SPI_TEENYEXON)
3187    {
3188       strand = AlnMgr2GetNthStrand(herd->saps[0], 1);
3189       if (strand == Seq_strand_minus)
3190       {
3191          salp = (SeqAlignPtr)(herd->saps[0]);
3192          salp_tmp = salp->next;
3193          salp->next = NULL;
3194          SAIndex2Free2(salp->saip);
3195          salp->saip = NULL;
3196          SeqAlignListReverseStrand(salp);
3197          AlnMgr2IndexSingleChildSeqAlign(salp);
3198          salp->next = salp_tmp;
3199       }
3200       sfp = sfhp->sfparray[herd->sfpnum[0]];
3201       offset = herd->mstarts[0];
3202       herd->mstarts[0] = 0;
3203       if (strand == Seq_strand_minus)
3204          herd->gstops[0] += offset;
3205       else
3206          herd->gstarts[0] -= offset;
3207       if (herd->gstarts[0] < sfp->start + herd->mstarts[0])
3208       {
3209          herd->mstarts[0] = herd->mstarts[0] - (herd->gstarts[0] - sfp->start);
3210          herd->gstarts[0] = sfp->start;
3211          offset = herd->gstarts[0] - sfp->start;
3212       } else if (herd->gstops[0] > sfp->stop - herd->mstarts[0])
3213       {
3214          herd->mstarts[0] = herd->mstarts[0] - (sfp->stop - herd->gstops[0]);
3215          herd->gstops[0] = sfp->stop;
3216          offset = sfp->stop - herd->gstops[0];
3217       }
3218       SPI_AddToAln(herd->saps[0], offset, SPI_LEFT, strand);
3219       if (strand == Seq_strand_minus)
3220       {
3221          salp = (SeqAlignPtr)(herd->saps[0]);
3222          salp_tmp = salp->next;
3223          salp->next = NULL;
3224          SAIndex2Free2(salp->saip);
3225          salp->saip = NULL;
3226          SeqAlignListReverseStrand(salp);
3227          AlnMgr2IndexSingleChildSeqAlign(salp);
3228          salp->next = salp_tmp;
3229       }
3230    }
3231    /* now check the end */
3232    if (herd->mstops[herd->numpieces-1] > end - SPI_TEENYEXON && herd->mstops[herd->numpieces-1] != end)
3233    {
3234       strand = AlnMgr2GetNthStrand(herd->saps[herd->numpieces-1], 1);
3235       if (strand == Seq_strand_minus)
3236       {
3237          salp = (SeqAlignPtr)(herd->saps[herd->numpieces-1]);
3238          salp_tmp = salp->next;
3239          salp->next = NULL;
3240          SAIndex2Free2(salp->saip);
3241          salp->saip = NULL;
3242          SeqAlignListReverseStrand(salp);
3243          AlnMgr2IndexSingleChildSeqAlign(salp);
3244          salp->next = salp_tmp;
3245       }
3246       sfp = sfhp->sfparray[herd->sfpnum[herd->numpieces-1]];
3247       offset = end - herd->mstops[herd->numpieces-1];
3248       herd->mstops[herd->numpieces-1] = bsp_mrna->length-1;
3249       if (strand == Seq_strand_minus)
3250          herd->gstarts[herd->numpieces-1] -= offset;
3251       else
3252          herd->gstops[herd->numpieces-1] += offset;
3253       if (herd->gstarts[herd->numpieces-1] < sfp->start + offset)
3254       {
3255          offset = herd->gstops[herd->numpieces-1] - sfp->start;
3256          herd->mstops[herd->numpieces-1] = herd->mstops[herd->numpieces-1] + offset;
3257          herd->gstarts[herd->numpieces-1] = sfp->start;
3258       } else if (herd->gstops[herd->numpieces-1] > sfp->stop - offset)
3259       {
3260          offset = sfp->stop - herd->gstops[herd->numpieces-1];
3261          herd->mstops[herd->numpieces-1] = herd->mstops[herd->numpieces-1] - offset;
3262          herd->gstops[herd->numpieces-1] = sfp->stop;
3263       }
3264       SPI_AddToAln(herd->saps[herd->numpieces-1], offset, SPI_RIGHT, strand);
3265       if (strand == Seq_strand_minus)
3266       {
3267          salp = (SeqAlignPtr)(herd->saps[herd->numpieces-1]);
3268          salp_tmp = salp->next;
3269          salp->next = NULL;
3270          SAIndex2Free2(salp->saip);
3271          salp->saip = NULL;
3272          SeqAlignListReverseStrand(salp);
3273          AlnMgr2IndexSingleChildSeqAlign(salp);
3274          salp->next = salp_tmp;
3275       }
3276    }
3277    if (herd->mstarts[0] > 0)
3278    {
3279       if (herd->mstops[herd->numpieces-1] < bsp_mrna->length-1)
3280          herd->missingends = SPI_BOTH;
3281       else
3282          herd->missingends = SPI_LEFT;
3283    } else
3284    {
3285       if (herd->mstops[herd->numpieces-1] < bsp_mrna->length-1)
3286          herd->missingends = SPI_RIGHT;
3287       else
3288          herd->missingends = SPI_NEITHER;
3289    }
3290    MemFree(smp_fake->mstarts);
3291    MemFree(smp_fake->mstops);
3292    MemFree(smp_fake->gstarts);
3293    MemFree(smp_fake->gstops);
3294    MemFree(smp_fake->exonid);
3295    MemFree(smp_fake->exongaps);
3296    MemFree(smp_fake->saps);
3297    MemFree(smp_fake);
3298    return herd;
3299 }
3300 
3301 /***************************************************************************
3302 *
3303 *  SPI_FindWindows first sorts all the alignments by score, and then
3304 *  sends the array to SPI_AssembleRegions, which puts together
3305 *  nonoverlapping regions containing one or more alignments each. The idea:
3306 *
3307 *  0-----100----200----300----400----500----600----700----800---- (genomic)
3308 *  0  \        \                       \        \
3309 *  50  \        \                                 \
3310 *  100  \                                          \
3311 *  150               \                               \
3312 *  200                \                               \
3313 *  (mRNA)
3314 *    <-1->    <---2---->              <-3->    <---4--->
3315 *  This set of initial alignments defines 4 windows; the best n windows
3316 *  will be chosen and the alignment will be refined in those windows.
3317 *
3318 ***************************************************************************/
SPI_FindWindows(SeqAlignPtr sap,SPI_OptionsPtr spot)3319 static SPI_RegionInfoPtr SPI_FindWindows(SeqAlignPtr sap, SPI_OptionsPtr spot)
3320 {
3321    AMAlignIndex2Ptr    amaip;
3322    /* FloatHi            bit_score; */
3323    /* FloatHi            evalue; */
3324    Int4               i;
3325    /* Int4               number; */
3326    SeqAlignPtr        salp;
3327    /* Int4               score; */
3328    SPI_AlnInfoPtr     PNTR spip_list;
3329    SPI_RegionInfoPtr  srip_head;
3330 FloatHi  s, s1;
3331 Int4  s2, tmp;
3332 
3333    if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
3334       return NULL;
3335    amaip = (AMAlignIndex2Ptr)(sap->saip);
3336    spip_list = (SPI_AlnInfoPtr PNTR)MemNew((amaip->numsaps)*sizeof(SPI_AlnInfoPtr));
3337    for (i=0; i<amaip->numsaps; i++)
3338    {
3339       salp = amaip->saps[i];
3340       spip_list[i] = (SPI_AlnInfoPtr)MemNew(sizeof(SPI_AlnInfo));
3341       spip_list[i]->sap = salp;
3342       salp->next = NULL;
3343 /*
3344       spip_list[i]->bit_score = AlnMgr2ComputeScoreForSeqAlign(salp);*/
3345       tmp = spip_list[i]->bit_score;
3346       GetScoreAndEvalue(salp, &tmp, &s, &s1, &s2);
3347       spip_list[i]->bit_score = tmp;
3348    }
3349    HeapSort(spip_list, i, sizeof(SPI_AlnInfoPtr), SPI_compare_aln_score);
3350    srip_head = NULL;
3351    srip_head = SPI_AssembleRegions(spip_list, amaip->numsaps, &srip_head, spot);
3352    srip_head = SPI_SortRegions(srip_head);
3353    for (i=0; i<amaip->numsaps; i++)
3354    {
3355       spip_list[i]->sap = NULL;
3356       MemFree(spip_list[i]);
3357       if (i < amaip->numsaps-1)
3358          amaip->saps[i]->next = amaip->saps[i+1];
3359    }
3360    sap->segs = (Pointer)(amaip->saps[0]);
3361    MemFree(spip_list);
3362    return srip_head;
3363 }
3364 
3365 /***************************************************************************
3366 *
3367 *  SPI_compare_aln_score is the callback for the HeapSort in
3368 *  SPI_FindWindows; it simply compares the scores of two alignments.
3369 *
3370 ***************************************************************************/
SPI_compare_aln_score(VoidPtr ptr1,VoidPtr ptr2)3371 static int LIBCALLBACK SPI_compare_aln_score(VoidPtr ptr1, VoidPtr ptr2)
3372 {
3373    SPI_AlnInfoPtr  spip1;
3374    SPI_AlnInfoPtr  spip2;
3375 
3376    if (ptr1 != NULL && ptr2 != NULL)
3377    {
3378       spip1 = *((SPI_AlnInfoPtr PNTR)ptr1);
3379       spip2 = *((SPI_AlnInfoPtr PNTR)ptr2);
3380       if (spip1->bit_score > spip2->bit_score)
3381          return -1;
3382       else if (spip1->bit_score < spip2->bit_score)
3383          return 1;
3384       else
3385          return 0;
3386    }
3387    return 0;
3388 }
3389 
3390 /***************************************************************************
3391 *
3392 *  SPI_SortRegions takes a linked list of new regions (no alignments) and
3393 *  makes sure that they are in order by score, to ensure that the first
3394 *  region analyzed is the region with the most potential.
3395 *
3396 ***************************************************************************/
SPI_SortRegions(SPI_RegionInfoPtr srip_head)3397 static SPI_RegionInfoPtr SPI_SortRegions(SPI_RegionInfoPtr srip_head)
3398 {
3399    Int4               i;
3400    Int4               j;
3401    SPI_RegionInfoPtr  srip;
3402    SPI_RegionInfoPtr  PNTR sriparray;
3403 
3404    srip = srip_head;
3405    i = 0;
3406    while (srip != NULL)
3407    {
3408       i++;
3409       srip = srip->next;
3410    }
3411    sriparray = (SPI_RegionInfoPtr PNTR)MemNew(i*sizeof(SPI_RegionInfoPtr));
3412    i = 0;
3413    srip = srip_head;
3414    while (srip != NULL)
3415    {
3416       sriparray[i] = srip;
3417       i++;
3418       srip = srip->next;
3419    }
3420    HeapSort(sriparray, i, sizeof(SPI_RegionInfoPtr), SPI_SortSrips);
3421    for (j=0; j<i-1; j++)
3422    {
3423       sriparray[j]->next = sriparray[j+1];
3424    }
3425    sriparray[i-1]->next = NULL;
3426    srip = sriparray[0];
3427    MemFree(sriparray);
3428    return srip;
3429 }
3430 
3431 /***************************************************************************
3432 *
3433 *  SPI_SortSrips is the HeapSort callback for SPI_SortRegions. It simply
3434 *  orders the regions by score.
3435 *
3436 ***************************************************************************/
SPI_SortSrips(VoidPtr ptr1,VoidPtr ptr2)3437 static int LIBCALLBACK SPI_SortSrips(VoidPtr ptr1, VoidPtr ptr2)
3438 {
3439    SPI_RegionInfoPtr  srip1;
3440    SPI_RegionInfoPtr  srip2;
3441 
3442    srip1 = *((SPI_RegionInfoPtr PNTR)ptr1);
3443    srip2 = *((SPI_RegionInfoPtr PNTR)ptr2);
3444    if (srip1->score > srip2->score)
3445       return -1;
3446    if (srip2->score > srip1->score)
3447       return 1;
3448    if (srip1->coverage > srip2->coverage)
3449       return -1;
3450    if (srip2->coverage > srip1->coverage)
3451       return 1;
3452    return 0;
3453 }
3454 
3455 /***************************************************************************
3456 *
3457 *  SPI_AssembleRegions is a recursive function which clusters the
3458 *  alignments into consistent, nonoverlapping windows. On the first pass,
3459 *  all the alignments are sent to the function SPI_GetRegionForSAP, and
3460 *  only the consistent ones are put into the first region. Since the first
3461 *  alignment is the highest-scoring alignment, this first region is
3462 *  usually the best region. On each subsequent pass, an unused alignment
3463 *  is assigned a genomic interval that does not overlap with any other
3464 *  previously defined region, and that alignment and other alignments in
3465 *  the same interval are sent to SPI_GetRegionForSAP to weed out
3466 *  inconsistent alignments. This process is repeated until no alignments
3467 *  are left -- all have either been assigned to a region or designated
3468 *  impossible to assign, since they overlap with a defined region but are
3469 *  inconsistent with other alignments in that region.
3470 *
3471 ***************************************************************************/
SPI_AssembleRegions(SPI_AlnInfoPtr PNTR spip_list,Int4 num,SPI_RegionInfoPtr PNTR head_srip,SPI_OptionsPtr spot)3472 static SPI_RegionInfoPtr SPI_AssembleRegions(SPI_AlnInfoPtr PNTR spip_list, Int4 num, SPI_RegionInfoPtr PNTR head_srip, SPI_OptionsPtr spot)
3473 {
3474     /* FloatHi            bit_score; */
3475     /* FloatHi            evalue; */
3476    Boolean            found;
3477    Int4               i;
3478    Int4               j;
3479    Int4               lim_left;
3480    Int4               lim_right;
3481    Int4               n;
3482    /* Int4               number; */
3483    SeqAlignPtr        sap;
3484    SPI_IvalPtr        siip;
3485    SPI_IvalPtr        siip_head;
3486    SPI_IvalPtr        siip_prev;
3487    SPI_IvalPtr        PNTR siip_list;
3488    SPI_RegionInfoPtr  srip;
3489    SPI_RegionInfoPtr  srip_tmp;
3490    Int4               start;
3491    Int4               stop;
3492 
3493    if (spip_list == NULL || head_srip == NULL)
3494       return NULL;
3495    found = FALSE;
3496    i = 0;
3497    while (i<num && !found)
3498    {
3499       if (spip_list[i]->used == 0)
3500          found = TRUE;
3501       else
3502          i++;
3503    }
3504    if (!found)
3505       return *head_srip;
3506    sap = spip_list[i]->sap;
3507    AlnMgr2GetNthSeqRangeInSA(sap, 1, &start, &stop);
3508    srip_tmp = *head_srip;
3509    lim_left = -1;
3510    lim_right = -1;
3511    /* figure out what (unused) genomic interval this alignment is in */
3512    while (srip_tmp != NULL)
3513    {
3514       if (srip_tmp->gstop < start)
3515       {
3516          if (srip_tmp->gstop > lim_left)
3517             lim_left = srip_tmp->gstop;
3518       }
3519       if (srip_tmp->gstart > stop)
3520       {
3521          if (lim_right == -1 || srip_tmp->gstart < lim_right)
3522             lim_right = srip_tmp->gstart;
3523       }
3524       srip_tmp = srip_tmp->next;
3525    }
3526    siip_head = siip_prev = NULL;
3527    n = 0;
3528    for (j=0; j<num; j++)
3529    {
3530       if (spip_list[j]->used == 0)
3531       {
3532          AlnMgr2GetNthSeqRangeInSA(spip_list[j]->sap, 1, &start, &stop);
3533         /* if this unused alignment is in the same interval as the one */
3534         /* being looked at, put it in the array */
3535          if (start > lim_left && (stop < lim_right || lim_right == -1))
3536          {
3537             siip = (SPI_IvalPtr)MemNew(sizeof(SPI_Ival));
3538             if (j == i)
3539                siip->used = 1;
3540             siip->n = j;
3541             siip->gstart = start;
3542             siip->gstop = stop;
3543             AlnMgr2GetNthSeqRangeInSA(spip_list[j]->sap, 2, &siip->mstart, &siip->mstop);
3544             siip->strand = AlnMgr2GetNthStrand(spip_list[j]->sap, 2);
3545             siip->sap = spip_list[j]->sap;
3546             siip->score = AlnMgr2ComputeScoreForSeqAlign(siip->sap);
3547             if (siip_head != NULL)
3548             {
3549                siip_prev->next = siip;
3550                siip_prev = siip;
3551             } else
3552                siip_head = siip_prev = siip;
3553             n++;
3554          }
3555       }
3556    }
3557    siip_list = (SPI_IvalPtr PNTR)MemNew(n*sizeof(SPI_IvalPtr));
3558    siip = siip_head;
3559    for (j=0; j<n && siip != NULL; j++)
3560    {
3561       siip_list[j] = siip;
3562       siip = siip->next;
3563    }
3564    /* send the array of unused alignments to SPI_GetRegionForSAP to */
3565    /* weed out inconsistent alignments                              */
3566    srip = SPI_GetRegionForSAP(siip_list, n, sap, spot);
3567    if (srip != NULL)
3568    {
3569       if (*head_srip == NULL)
3570          *head_srip = srip;
3571       else
3572       {
3573          srip_tmp = *head_srip;
3574          while (srip_tmp->next != NULL)
3575          {
3576             srip_tmp = srip_tmp->next;
3577          }
3578          srip_tmp->next = srip;
3579       }
3580       /* update the information about which alignments have been used */
3581       /* and which alignments are impossible                          */
3582       for (j=0; j<n; j++)
3583       {
3584          spip_list[siip_list[j]->n]->used = siip_list[j]->used;
3585          if (((siip_list[j]->gstart > srip->gstart + SPI_FUZZ && siip_list[j]->gstart < srip->gstop - SPI_FUZZ) || (siip_list[j]->gstop > srip->gstart + SPI_FUZZ && siip_list[j]->gstop < srip->gstop - SPI_FUZZ)) && siip_list[j]->used == 0)
3586          {
3587             siip_list[j]->used = -1;
3588             spip_list[siip_list[j]->n]->used = -1;
3589          }
3590       }
3591    }
3592    for (j=0; j<n; j++)
3593    {
3594       MemFree(siip_list[j]);
3595    }
3596    MemFree(siip_list);
3597    /* recursive call to self*/
3598    srip = SPI_AssembleRegions(spip_list, num, head_srip, spot);
3599    return srip;
3600 }
3601 
3602 
3603 /***************************************************************************
3604 *
3605 *  SPI_GetRegionForSAP takes a list of SPI_IvalPtrs, each of which carries
3606 *  the information for one alignment, and a seqalign, which is the
3607 *  "anchor" alignment for the new interval and which is higher-scoring than
3608 *  any other seqalign in the set. The function checks all of the SPI_Ivals
3609 *  to see whether they're consistent with the anchor alignment, and marks
3610 *  each SPI_Ival as used, not used, or impossible.
3611 *
3612 ***************************************************************************/
SPI_GetRegionForSAP(SPI_IvalPtr PNTR siip_list,Int4 num,SeqAlignPtr sap,SPI_OptionsPtr spot)3613 static SPI_RegionInfoPtr SPI_GetRegionForSAP(SPI_IvalPtr PNTR siip_list, Int4 num, SeqAlignPtr sap, SPI_OptionsPtr spot)
3614 {
3615    Boolean            done;
3616    Boolean            found;
3617    Int4               i;
3618    Int2               j = SPI_UNKNOWN;
3619    Int4               n;
3620    SPI_RegionInfoPtr  srip;
3621 
3622    if (siip_list == NULL || num == 0 || sap == NULL)
3623       return NULL;
3624    /* sort the alignments along the genomic sequence */
3625    HeapSort(siip_list, num, sizeof(SPI_IvalPtr), SPI_compare_genomic_loc);
3626    SPI_CheckMrnaOrder(siip_list, num);
3627    found = FALSE;
3628    n = 0;
3629    /* figure out which one is the anchor alignment */
3630    while (!found && n<num)
3631    {
3632       if (sap == siip_list[n]->sap)
3633          found = TRUE;
3634       else
3635          n++;
3636    }
3637    if (!found)
3638       return NULL;
3639    /* make a new region with this alignment */
3640    srip = (SPI_RegionInfoPtr)MemNew(sizeof(SPI_RegionInfo));
3641    AlnMgr2GetNthSeqRangeInSA(sap, 1, &srip->gstart, &srip->gstop);
3642    AlnMgr2GetNthSeqRangeInSA(sap, 2, &srip->mstart, &srip->mstop);
3643    srip->coverage = abs(srip->mstop - srip->mstart) + 1;
3644    srip->score = siip_list[n]->score;
3645    srip->strand = AlnMgr2GetNthStrand(sap, 2);
3646    /* search to the right for consistent alignments */
3647    done = FALSE;
3648    for (i=n+1; i<num && !done; i++)
3649    {
3650       j = SPI_is_consistent(siip_list[i], srip, spot);
3651       if (j == SPI_CONSISTENT)
3652       {
3653          srip->coverage = abs(siip_list[i]->mstop - siip_list[i]->mstart) + srip->coverage + 1;
3654          siip_list[i]->used = 1;
3655          srip->score += siip_list[i]->score;
3656       }
3657       else if (j == SPI_IMPOSSIBLE)
3658          siip_list[i]->used = -1;
3659       else if (j == SPI_DONE1)
3660       {
3661          siip_list[i]->used = 1;
3662          done = TRUE;
3663       } else if (j == SPI_DONE2)
3664          done = TRUE;
3665    }
3666    /* search to the left for consistent alignments */
3667    done = FALSE;
3668    for (i=n-1; i>=0 && !done; i--)
3669    {
3670       j = SPI_is_consistent(siip_list[i], srip, spot);
3671       if (j == SPI_CONSISTENT)
3672       {
3673          srip->coverage = abs(siip_list[i]->mstop - siip_list[i]->mstart) + srip->coverage + 1;
3674          siip_list[i]->used = 1;
3675          srip->score += siip_list[i]->score;
3676       }
3677       else if (j == SPI_IMPOSSIBLE)
3678          siip_list[i]->used = -1;
3679       else if (j == SPI_DONE1)
3680       {
3681          siip_list[i]->used = 1;
3682          done = TRUE;
3683       } else if (j == SPI_DONE2)
3684          done = TRUE;
3685    }
3686    SPI_ExcludeOverlaps(siip_list, num, srip);
3687    return srip;
3688 }
3689 
3690 /***************************************************************************
3691 *
3692 *  SPI_is_consistent is the workhorse of SPI_GetRegionForSAP. Given an
3693 *  SPI_Ival and a region, the function decides whether the alignment in
3694 *  the SPI_Ival is consistent with the rest of the region. Alignments that
3695 *  overlap by more than SPI_FUZZ are labeled impossible; alignments that
3696 *  are consistent in both the genomic and mRNA coordinates and that overlap
3697 *  by less than SPI_FUZZ are labeled consistent. Alignments that do not
3698 *  overlap the region but which are not consistent in either the mRNA or
3699 *  genomic coordinates are labeled unknown.
3700 *
3701 ***************************************************************************/
SPI_is_consistent(SPI_IvalPtr siip,SPI_RegionInfoPtr srip,SPI_OptionsPtr spot)3702 static Int2 SPI_is_consistent(SPI_IvalPtr siip, SPI_RegionInfoPtr srip, SPI_OptionsPtr spot)
3703 {
3704    Int4  intronsize;
3705 
3706    if (siip == NULL || srip == NULL)
3707       return 0;
3708    if ((siip->strand == Seq_strand_minus && srip->strand != Seq_strand_minus) || (srip->strand == Seq_strand_minus && siip->strand != Seq_strand_minus))
3709       return SPI_UNKNOWN;
3710    /*KSK*/
3711    if (spot->bigintron){
3712        intronsize = (spot->bigintron_size > SPI_INTRONSIZEXL
3713                      ? spot->bigintron_size : SPI_INTRONSIZEXL);
3714    }
3715    else{
3716        intronsize = SPI_INTRONSIZE;
3717    }
3718    /*end KSK*/
3719    /* first look for overlaps -- exclude these from the set       */
3720    /* since we search outward from a core hit, there shouldn't be */
3721    /* any overlaps anyway.                                        */
3722    if (siip->gstart > srip->gstart + SPI_FUZZ && siip->gstart < srip->gstop - SPI_FUZZ)
3723       return SPI_IMPOSSIBLE;
3724    if (siip->gstart > srip->gstop - SPI_FUZZ && siip->gstart < srip->gstop + intronsize)
3725    {
3726       if (siip->strand == Seq_strand_minus)
3727       {
3728          if (siip->mstop < srip->mstart + SPI_FUZZ)
3729          {
3730             srip->gstop = siip->gstop;
3731             srip->mstart = siip->mstart;
3732             return SPI_CONSISTENT;
3733          } else
3734             return SPI_UNKNOWN;
3735       } else
3736       {
3737          if (siip->mstart > srip->mstop - SPI_FUZZ)
3738          {
3739             srip->gstop = siip->gstop;
3740             srip->mstop = siip->mstop;
3741             return SPI_CONSISTENT;
3742          } else
3743             return SPI_UNKNOWN;
3744       }
3745    } else if (siip->gstop <= srip->gstart + SPI_FUZZ && siip->gstop > srip->gstart - intronsize)
3746    {
3747       if (siip->strand == Seq_strand_minus)
3748       {
3749          if (siip->mstart > srip->mstop - SPI_FUZZ)
3750          {
3751             srip->gstart = siip->gstart;
3752             srip->mstop = siip->mstop;
3753             return SPI_CONSISTENT;
3754          } else
3755             return SPI_UNKNOWN;
3756       } else
3757       {
3758          if (siip->mstop < srip->mstart + SPI_FUZZ)
3759          {
3760             srip->gstart = siip->gstart;
3761             srip->mstart = siip->mstart;
3762             return SPI_CONSISTENT;
3763          } else
3764             return SPI_UNKNOWN;
3765       }
3766    }
3767    return SPI_UNKNOWN;
3768 }
3769 
3770 /***************************************************************************
3771 *
3772 *  SPI_compare_genomic_loc is the callback for the HeapSort in
3773 *  SPI_GetRegionForSAP. It simply orders two SPI_Ival structures
3774 *  by their genomic start coordinates.
3775 *
3776 ***************************************************************************/
SPI_compare_genomic_loc(VoidPtr ptr1,VoidPtr ptr2)3777 static int LIBCALLBACK SPI_compare_genomic_loc(VoidPtr ptr1, VoidPtr ptr2)
3778 {
3779    SPI_IvalPtr  siip1;
3780    SPI_IvalPtr  siip2;
3781 
3782    if (ptr1 != NULL && ptr2 != NULL)
3783    {
3784       siip1 = *((SPI_IvalPtr PNTR)ptr1);
3785       siip2 = *((SPI_IvalPtr PNTR)ptr2);
3786       if (siip1->gstart < siip2->gstart)
3787          return -1;
3788       else if (siip1->gstart > siip2->gstart)
3789          return 1;
3790       else
3791          return 0;
3792    }
3793    return 0;
3794 }
3795 
3796 /***************************************************************************
3797 *
3798 *  SPI_ExcludeOverlaps takes a completed region and examines all SPI_Ivals
3799 *  for conflicts. Intervals which overlap the completed region are
3800 *  marked impossible.
3801 *
3802 ***************************************************************************/
SPI_ExcludeOverlaps(SPI_IvalPtr PNTR siip_list,Int4 num,SPI_RegionInfoPtr srip)3803 static void SPI_ExcludeOverlaps(SPI_IvalPtr PNTR siip_list, Int4 num, SPI_RegionInfoPtr srip)
3804 {
3805    Int4  i;
3806 
3807    for (i=0; i<num; i++)
3808    {
3809        if (siip_list[i] == 0) /* KSK changed from '==' to '!=' */
3810       {
3811          if ((siip_list[i]->gstart >= srip->gstart && siip_list[i]->gstart <= srip->gstop) || (siip_list[i]->gstop >= srip->gstart && siip_list[i]->gstop <= srip->gstop))
3812             siip_list[i]->used = -1;
3813       }
3814    }
3815 }
3816 
3817 /***************************************************************************
3818 *
3819 *  SPI_AlignInWindows organizes the regions into an array, sends each
3820 *  region to SPI_DoAln for careful alignment, and then takes all the
3821 *  regions that have alignments and puts them into a linked list,
3822 *  freeing those regions that do not have alignments.
3823 *
3824 ***************************************************************************/
SPI_AlignInWindows(SPI_RegionInfoPtr PNTR head_srip,BioseqPtr bsp_genomic,BioseqPtr bsp_mrna,SPI_OptionsPtr spot)3825 static void SPI_AlignInWindows(SPI_RegionInfoPtr PNTR head_srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
3826 {
3827    Int4               nalign;
3828    Int4               score;
3829    SPI_RegionInfoPtr  srip;
3830    SPI_RegionInfoPtr  srip_head;
3831    SPI_RegionInfoPtr  srip_next;
3832    SPI_RegionInfoPtr  srip_prev;
3833    SPI_RegionInfoPtr  srip_tmp;
3834 
3835    if (head_srip == NULL || *head_srip == NULL)
3836       return;
3837    srip = *head_srip;
3838    score = srip->coverage;
3839    nalign = 0;
3840    /* KSK temp fix was to go through all srips regardless ***
3841     *  '(while (srip!= NULL)) but the increased time is too much */
3842    while (srip != NULL && (nalign < spot->numreturns+1 || srip->coverage >= score/2)){
3843        SPI_DoAln(srip, bsp_genomic, bsp_mrna, spot);
3844        if (srip->smp != NULL){
3845            nalign++;
3846        }
3847        srip = srip->next;
3848    }
3849    srip_head = srip_prev = NULL;
3850    /* make a linked list of regions that have alignments */
3851    srip_tmp = *head_srip;
3852    while (srip_tmp != NULL)
3853    {
3854       srip_next = srip_tmp->next;
3855       srip_tmp->next = NULL;
3856       if (srip_tmp->smp != NULL)
3857       {
3858          if (srip_head != NULL)
3859          {
3860             srip_prev->next = srip_tmp;
3861             srip_prev = srip_tmp;
3862          } else
3863             srip_head = srip_prev = srip_tmp;
3864       } else
3865          MemFree(srip_tmp);
3866       srip_tmp = srip_next;
3867    }
3868    *head_srip = srip_head;
3869 }
3870 
3871 
3872 /***************************************************************************
3873 *
3874 *  SPI_DoAln first re-BLASTs the mRNA against the genomic interval
3875 *  specified by the region. If the mRNA is truncated in the interval,
3876 *  the function pads the appropriate side to encourage a complete
3877 *  alignment. The function then calls other functions to remove
3878 *  inconsistent alignments from the set, extend the alignments so that
3879 *  they completely span the mRNA, and adjust the alignments to the
3880 *  most appropriate splice sites.
3881 *
3882 ***************************************************************************/
SPI_DoAln(SPI_RegionInfoPtr srip,BioseqPtr bsp_genomic,BioseqPtr bsp_mrna,SPI_OptionsPtr spot)3883 static void SPI_DoAln(SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
3884 {
3885    Int4                 eoff;
3886    BLAST_OptionsBlkPtr  options;
3887    SPI_Progress         progress;
3888    SeqAlignPtr          sap;
3889    SeqLocPtr            slp_g;
3890    SeqLocPtr            slp_m;
3891    Int4                 soff;
3892 
3893    if (srip == NULL)
3894       return;
3895    if (srip->mstart == 0)
3896       soff = 0;
3897    else if (srip->mstart < 50)
3898       soff = 2*(srip->mstart);
3899    else
3900       soff = 3*(srip->mstart);
3901    if (soff > srip->gstart)
3902       soff = srip->gstart;
3903    if (srip->mstop == bsp_mrna->length - 1)
3904       eoff = 0;
3905    else if (bsp_mrna->length - 1 - srip->mstop < 50)
3906       eoff = 2*(bsp_mrna->length - 1 - srip->mstop);
3907    else
3908       eoff = 3*(bsp_mrna->length - 1 - srip->mstop);
3909    if (srip->gstop + eoff > bsp_genomic->length - 1)
3910        eoff = bsp_genomic->length - 1 - srip->gstop;
3911    slp_g = SeqLocIntNew(MAX(srip->gstart-soff, spot->from), MIN(srip->gstop+eoff, spot->to), Seq_strand_plus, bsp_genomic->id);
3912    slp_m = SeqLocIntNew(0, bsp_mrna->length-1, srip->strand, bsp_mrna->id);
3913    if (slp_g == NULL || slp_m == NULL)
3914    {
3915       ErrPostEx(SEV_ERROR, 0, 0, "Error in SPI_DoAln\n");
3916       return;
3917    }
3918    options = BLASTOptionNew("blastn", TRUE);
3919    options->filter_string = StringSave("m L");
3920    options->expect_value = spot->secpasseval;
3921    options->wordsize = 7; /*minimum BLAST wordsize */
3922    if (spot->interspecies)
3923    {
3924       options->gap_x_dropoff_final = 100;
3925       options->gap_open = 4;
3926       options->gap_extend = 1;
3927       options->penalty = -1;
3928    }
3929    options->query_lcase_mask = spot->lcaseloc;
3930    /* use mRNA as the query to speed up BLAST */
3931    sap = BlastTwoSequencesByLoc(slp_m, slp_g, "blastn", options);
3932    if (spot->callback != NULL)
3933    {
3934       progress.percentdone = 50;
3935       progress.returncode = SPI_PROGRESS;
3936       if (!spot->callback(&progress))
3937          return;
3938    }
3939    SeqLocFree(slp_m);
3940    SeqLocFree(slp_g);
3941    BLASTOptionDelete(options);
3942    if (sap == NULL)
3943    {
3944       ErrPostEx(SEV_ERROR, 0, 0, "Error in SPI_DoAln\n");
3945       return;
3946    }
3947 
3948    if (!AlnMgr2IndexLite(sap))
3949       return;
3950    /* flip alignments so genomic sequence is the first row */
3951    SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
3952    /* remove alignments that overlap by more than 2*SPI_TEENYEXON or that */
3953    /* are not consistent along genomic or mRNA coordinates         */
3954    SPI_RemoveInconsistentAlnsFromSet(sap, 2*SPI_TEENYEXON, 1, SPI_LEFT);
3955    if (spot->interspecies == FALSE) /* extend to both ends of mRNA */
3956    {
3957       if (!SPI_ConnectAln(sap, spot, srip, TRUE, TRUE))
3958          return;
3959    } else /* for interspecies alignments, don't try to extend to the ends */
3960    {
3961       if (!SPI_ConnectAln(sap, spot, srip, FALSE, TRUE))
3962          return;
3963    }
3964    if (spot->callback != NULL)
3965    {
3966       progress.percentdone = 75;
3967       progress.returncode = SPI_PROGRESS;
3968       if (!spot->callback(&progress))
3969          return;
3970    }
3971    srip->smp = SPI_AdjustForSplice(sap, spot, srip);
3972 }
3973 
3974 /***************************************************************************
3975 *
3976 *  SPI_CheckForPolyAExon looks at the 3' terminal exon and checks to see
3977 *  whether it consists only of polyAs. If so, the exon is deleted.
3978 *
3979 ***************************************************************************/
SPI_CheckForPolyAExon(SeqAlignPtr sap)3980 static void SPI_CheckForPolyAExon(SeqAlignPtr sap)
3981 {
3982    AMAlignIndex2Ptr  amaip;
3983    BioseqPtr         bsp;
3984    Int4              i;
3985    Int4              len;
3986    Int4              polya;
3987    SeqAlignPtr       salp;
3988    SeqAlignPtr       salp_prev;
3989    SeqAlignPtr       sap_target;
3990    SeqIdPtr          sip;
3991    Int4              start;
3992    Int4              stop;
3993    Uint1             strand;
3994 
3995    amaip = (AMAlignIndex2Ptr)(sap->saip);
3996    for (i=0; i<amaip->numsaps-1; i++)
3997    {
3998       amaip->saps[i]->next = amaip->saps[i+1];
3999    }
4000    amaip->saps[i]->next = NULL;
4001    strand = AlnMgr2GetNthStrand(amaip->saps[0], 2);
4002    sip = AlnMgr2GetNthSeqIdPtr(amaip->saps[0], 2);
4003    bsp = BioseqLockById(sip);
4004    len = bsp->length;
4005    BioseqUnlock(bsp);
4006    if (strand == Seq_strand_minus)
4007       AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 2, &start, &stop);
4008    else
4009       AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 2, &start, &stop);
4010    polya = SPI_IsItPolyA(sip);
4011    SeqIdFree(sip);
4012    if (len - start > polya)
4013       return;
4014    if (amaip->numsaps == 1)
4015       return;
4016    if (strand == Seq_strand_minus)
4017       sap_target = amaip->saps[0];
4018    else
4019       sap_target = amaip->saps[amaip->numsaps-1];
4020    salp = (SeqAlignPtr)(sap->segs);
4021    salp_prev = NULL;
4022    while (salp != NULL){
4023        if (salp == sap_target){
4024            if (salp_prev == NULL){
4025                sap->segs = (Pointer)(sap_target->next);
4026            }
4027            else {
4028                salp_prev->next = sap_target->next;
4029            }
4030            SeqAlignFree(sap_target);
4031            sap_target = 0;
4032            salp = 0;
4033        }
4034        else {
4035            salp_prev = salp;
4036            salp = salp->next;
4037        }
4038    }
4039    AMAlignIndexFreeEitherIndex(sap);
4040    AlnMgr2IndexLite(sap);
4041    AlnMgr2SortAlnSetByNthRowPos(sap, 1);
4042 }
4043 
4044 /***************************************************************************
4045 *
4046 *  SPI_ConnectAln looks through all the alignments in a set and fills in
4047 *  the gaps on the mRNA sequence. If do_ends is TRUE, the function will
4048 *  try to fill in the alignments until they extend to both ends of the
4049 *  mRNA; otherwise the function only fills in internal gaps. If a gap is
4050 *  greater than the size of the smallest possible exon (SPI_TEENYEXON),
4051 *  the function checks to see whether there's also a gap in the genomic
4052 *  sequence. If both sequences have different-sized gaps bigger than
4053 *  SPI_FUZZ, the function calls SPI_FillInIntron to fill in the gaps.
4054 *  If both sequences have gaps larger than SPI_FUZZ and of similar sizes,
4055 *  the function calls Fasika Aklilu's tree-based alignment
4056 *  functions via SPI_FindBestAlnByDotPlot. SPI_ConnectAln does
4057 *  check to see whether a non-matching mRNA 3' tail is actually a poly(A)
4058 *  tail; if so, the tail is left unaligned.
4059 *
4060 ***************************************************************************/
SPI_ConnectAln(SeqAlignPtr sap,SPI_OptionsPtr spot,SPI_RegionInfoPtr srip,Boolean do_ends,Boolean firsttime)4061 static Boolean SPI_ConnectAln(SeqAlignPtr sap, SPI_OptionsPtr spot, SPI_RegionInfoPtr srip, Boolean do_ends, Boolean firsttime)
4062 {
4063    AMAlignIndex2Ptr  amaip;
4064    BioseqPtr        bsp1;
4065    BioseqPtr        bsp2;
4066    Int4             currstart2;
4067    Int4             end2;
4068    Int4             gap1;
4069    Int4             gap2;
4070    Int4             i;
4071    Boolean          internal;
4072    Int4             j;
4073    Int4             len1;
4074    Int4             len2;
4075    SeqAlignPtr      newsaps;
4076    SeqAlignPtr      newsaps_prev;
4077    Int4             prevstop1;
4078    Int4             prevstop2;
4079    SeqAlignPtr      sap_new;
4080    SeqAlignPtr      sap_prev;
4081    SeqAlignPtr      sap_tmp;
4082    SeqIdPtr         sip1;
4083    SeqIdPtr         sip2;
4084    SeqLocPtr        slp1;
4085    SeqLocPtr        slp2;
4086    Int4             start1;
4087    Int4             start2;
4088    Int4             stop1;
4089    Int4             stop2;
4090    Uint1            strand1;
4091    Uint1            strand2;
4092    Int4             tail;
4093 
4094    if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
4095       return FALSE;
4096    AlnMgr2SortAlnSetByNthRowPos(sap, 1);
4097    amaip = (AMAlignIndex2Ptr)(sap->saip);
4098    if (amaip->numsaps < 1)
4099       return FALSE;
4100    if (amaip->numsaps == 1 && !do_ends)
4101       return TRUE;
4102    sip1 = AlnMgr2GetNthSeqIdPtr(amaip->saps[0], 1);
4103    sip2 = AlnMgr2GetNthSeqIdPtr(amaip->saps[0], 2);
4104    bsp1 = BioseqLockById(sip1);
4105    if (bsp1 == NULL)
4106       return FALSE;
4107    bsp2 = BioseqLockById(sip2);
4108    if (bsp2 == NULL)
4109    {
4110       BioseqUnlock(bsp1);
4111       return FALSE;
4112    }
4113    strand1 = AlnMgr2GetNthStrand(amaip->saps[0], 1);
4114    strand2 = AlnMgr2GetNthStrand(amaip->saps[0], 2);
4115    if (do_ends)
4116    {
4117       prevstop1 = prevstop2 = -1;
4118       if (strand2 == Seq_strand_minus)
4119          prevstop2 = bsp2->length-1;
4120    } else
4121    {
4122       AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 1, &prevstop1, NULL);
4123       prevstop1--;
4124       if (strand2 == Seq_strand_minus)
4125       {
4126          AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 2, NULL, &prevstop2);
4127          prevstop2++;
4128       } else
4129       {
4130          AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 2, &prevstop2, NULL);
4131          prevstop2--;
4132       }
4133    }
4134    internal = FALSE;
4135    newsaps = newsaps_prev = NULL;
4136    for (i=0; i<amaip->numsaps; i++)
4137    {
4138       AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 1, &start1, &stop1);
4139       AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, &start2, &stop2);
4140       /* make the mRNA a little 'smaller' to force overlaps between adjacent exons */
4141       start2 = start2 + 2;
4142       stop2 = stop2 - 2;
4143       if (strand2 == Seq_strand_minus)
4144          currstart2 = stop2;
4145       else
4146          currstart2 = start2;
4147       if ((gap2 = spi_isa_gap(currstart2, prevstop2, strand2)) > SPI_TEENYEXON)
4148       {
4149          gap1 = spi_isa_gap(start1, prevstop1, strand1);
4150          if (gap1 < (gap2 - 2*SPI_FUZZ) && gap1 > SPI_FUZZ)
4151          {
4152             if (internal == TRUE && i<amaip->numsaps-1 && spot->interspecies == FALSE)
4153             {
4154                /* gap in mRNA but not genomic - possibly not the right region */
4155                len1 = AlnMgr2GetAlnLength(amaip->saps[i-1], FALSE);
4156                len2 = AlnMgr2GetAlnLength(amaip->saps[i], FALSE);
4157                if (len1 > len2)
4158                {
4159                   SeqAlignFree(amaip->saps[i]);
4160                   amaip->saps[i] = NULL;
4161                   j = i+1;
4162                   /* reset the boundaries */
4163                   stop1 = prevstop1;
4164                   start2 = stop2 = prevstop2;
4165                } else
4166                {
4167                   SeqAlignFree(amaip->saps[i-1]);
4168                   amaip->saps[i-1] = NULL;
4169                   j = i;
4170                }
4171                for ( ; j < amaip->numsaps; j++)
4172                {
4173                   amaip->saps[j-1] = amaip->saps[j];
4174                   amaip->saps[j] = NULL;
4175                }
4176                for (j=0; j<amaip->numsaps-1; j++)
4177                {
4178                   amaip->saps[j]->next  = amaip->saps[j+1];
4179                }
4180                amaip->numsaps--;
4181                amaip->saps[amaip->numsaps-1]->next = NULL;
4182                i--;
4183             } else if (internal == FALSE)    /* mRNA continues past beginning of genomic sequence */
4184             {
4185                sap_new = SPI_FindPiece(sip1, sip2, prevstop2, currstart2, strand2, start1, SPI_LEFT, spot);
4186                sap_new = SPI_ProcessNewAlns(sap_new);
4187                if (sap_new != NULL && newsaps == NULL)
4188                   newsaps = newsaps_prev = sap_new;
4189                else if (sap_new != NULL)
4190                   newsaps_prev->next = sap_new;
4191                while (sap_new != NULL && newsaps_prev->next != NULL)
4192                {
4193                   newsaps_prev = newsaps_prev->next;
4194                }
4195             }
4196          } else if ((gap2-2*SPI_FUZZ <= gap1) && (gap1 <= gap2 + 2*SPI_FUZZ) && gap1 > SPI_FUZZ) /* same size gaps */
4197          {
4198             slp1 = SeqLocIntNew(prevstop1+1, start1-1, strand1, sip1);
4199             if (strand2 != Seq_strand_minus)
4200                slp2 = SeqLocIntNew(prevstop2+1, currstart2-1, strand2, sip2);
4201             else
4202                slp2 = SeqLocIntNew(currstart2+1, prevstop2-1, strand2, sip2);
4203             sap_new = SPI_FindBestAlnByDotPlot(slp1, slp2);
4204             sap_new = SPI_ProcessNewAlns(sap_new);
4205             SeqLocFree(slp1);
4206             SeqLocFree(slp2);
4207             if (sap_new != NULL && newsaps == NULL)
4208                newsaps = newsaps_prev = sap_new;
4209             else if (sap_new != NULL)
4210                newsaps_prev->next = sap_new;
4211             while (sap_new != NULL && newsaps_prev->next != NULL)
4212             {
4213                newsaps_prev = newsaps_prev->next;
4214             }
4215          } else if (gap1 >= SPI_FUZZ && gap2 >= SPI_FUZZ) /* gaps are different sizes -- intron? */
4216          {
4217             if (internal)
4218             {
4219                sap_new = SPI_FillInIntron(sip1, sip2, prevstop1, start1, prevstop2, currstart2, strand2, spot);
4220                sap_new = SPI_ProcessNewAlns(sap_new);
4221                if (sap_new != NULL && newsaps == NULL)
4222                   newsaps = newsaps_prev = sap_new;
4223                else if (sap_new != NULL)
4224                   newsaps_prev->next = sap_new;
4225                while (sap_new != NULL && newsaps_prev->next != NULL)
4226                {
4227                   newsaps_prev = newsaps_prev->next;
4228                }
4229             } else /* first or last part of mRNA hasn't matched */
4230             {
4231                if (strand2 == Seq_strand_minus && do_ends) /* last exon -- check for polyA */
4232                {
4233                   srip->polyAtail = SPI_IsItPolyA(sip2);
4234                   if (srip->polyAtail >= SPI_MINPOLYASIZE)
4235                   {
4236                      if (srip->polyAtail < prevstop2)
4237                         prevstop2 = prevstop2 - srip->polyAtail; /* don't align tail */
4238                      else
4239                      {
4240                         if (prevstop2 >= SPI_MINPOLYASIZE)
4241                         {
4242                            srip->polyAtail = prevstop2;
4243                            prevstop2 = 0;
4244                         } else
4245                            srip->polyAtail = 0;
4246                      }
4247                   } else
4248                      srip->polyAtail = 0;
4249                }
4250                sap_new = SPI_FindPiece(sip1, sip2, prevstop2, currstart2, strand2, start1, SPI_LEFT, spot);
4251                sap_new = SPI_ProcessNewAlns(sap_new);
4252                if (sap_new != NULL && newsaps == NULL)
4253                   newsaps = newsaps_prev = sap_new;
4254                else if (sap_new != NULL)
4255                   newsaps_prev->next = sap_new;
4256                while (sap_new != NULL && newsaps_prev->next != NULL)
4257                {
4258                   newsaps_prev = newsaps_prev->next;
4259                }
4260             }
4261          }
4262       }
4263       internal = TRUE;
4264       prevstop1 = stop1;
4265       if (strand2 == Seq_strand_minus)
4266          prevstop2 = start2;
4267       else
4268          prevstop2 = stop2;
4269    }
4270    sap_tmp = amaip->saps[0];
4271    for (j=1; j<amaip->numsaps; j++)
4272    {
4273       sap_tmp->next = amaip->saps[j];
4274       sap_tmp = amaip->saps[j];
4275    }
4276    sap_tmp->next = newsaps;
4277    sap->segs = (Pointer)(amaip->saps[0]);
4278    if (do_ends)
4279    {
4280       if (strand2 != Seq_strand_minus) /* last exon -- check for polyA */
4281       {
4282          srip->polyAtail = SPI_IsItPolyA(sip2);
4283          if (srip->polyAtail >= SPI_MINPOLYASIZE)
4284             end2 = bsp2->length - 1 - srip->polyAtail;
4285          else
4286          {
4287             srip->polyAtail = 0;
4288             end2 = bsp2->length-1;
4289          }
4290          if (srip->polyAtail != 0 && prevstop2 > end2)
4291             srip->polyAtail = srip->polyAtail - (prevstop2-end2+1);
4292          if (srip->polyAtail < SPI_MINPOLYASIZE)
4293             srip->polyAtail = 0;
4294       } else
4295          end2 = -1;
4296       if (spi_isa_gap(end2, prevstop2, strand2))
4297       {
4298          sap_new = SPI_FindPiece(sip1, sip2, prevstop2, end2, strand2, prevstop1, SPI_RIGHT, spot);
4299          sap_new = SPI_ProcessNewAlns(sap_new);
4300          sap_tmp = (SeqAlignPtr)(sap->segs);
4301          while (sap_tmp != NULL && sap_tmp->next != NULL)
4302          {
4303             sap_tmp = sap_tmp->next;
4304          }
4305          sap_tmp->next = sap_new;
4306       }
4307    }
4308    sap_tmp = (SeqAlignPtr)(sap->segs);
4309    /* check whether last exon is all polya */
4310    tail = SPI_IsItPolyA(sip2);
4311    if (strand2 == Seq_strand_minus)
4312    {
4313       AlnMgr2GetNthSeqRangeInSA(sap_tmp, 2, &start2, &stop2);
4314       if (stop2 < SPI_TEENYEXON + tail)
4315       {
4316          sap->segs = (Pointer)(sap_tmp->next);
4317          sap_tmp->next = NULL;
4318          SeqAlignFree(sap_tmp);
4319       }
4320    } else
4321    {
4322       sap_prev = NULL;
4323       while (sap_tmp->next != NULL)
4324       {
4325          sap_prev = sap_tmp;
4326          sap_tmp = sap_tmp->next;
4327       }
4328       AlnMgr2GetNthSeqRangeInSA(sap_tmp, 2, &start2, &stop2);
4329       if (bsp2->length - 1 - start2 < SPI_TEENYEXON + tail)
4330       {
4331          if (sap_prev != NULL)
4332             sap_prev->next = NULL;
4333          SeqAlignFree(sap_tmp);
4334          if (sap_prev == NULL)
4335             sap->segs = NULL;
4336       }
4337    }
4338    sap_tmp = (SeqAlignPtr)(sap->segs);
4339    i = 0;
4340    while (sap_tmp != NULL)
4341    {
4342       i++;
4343       sap_tmp = sap_tmp->next;
4344    }
4345    amaip->numsaps = i;
4346    MemFree(amaip->saps);
4347    amaip->saps = (SeqAlignPtr PNTR)MemNew(i*sizeof(SeqAlignPtr));
4348    sap_tmp = (SeqAlignPtr)(sap->segs);
4349    i = 0;
4350    while (sap_tmp != NULL)
4351    {
4352       amaip->saps[i] = sap_tmp;
4353       i++;
4354       sap_tmp = sap_tmp->next;
4355    }
4356    BioseqUnlock (bsp1);
4357    BioseqUnlock (bsp2);
4358    SeqIdFree(sip1);
4359    SeqIdFree(sip2);
4360    if (firsttime) /* reconnect to pick up last pieces */
4361       SPI_ConnectAln(sap, spot, srip, do_ends, FALSE);
4362    AlnMgr2SortAlnSetByNthRowPos(sap, 1);
4363    SPI_CheckForPolyAExon(sap);
4364    return TRUE;
4365 }
4366 
4367 /***************************************************************************
4368 *
4369 *  SPI_ProcessNewAlns takes a linked list of child-type alignments and
4370 *  runs SPI_RemoveInconsistentAlnsFromSet on a "fake" parent alignment
4371 *  that it temporarily attaches to the children.
4372 *
4373 ***************************************************************************/
SPI_ProcessNewAlns(SeqAlignPtr sap)4374 static SeqAlignPtr SPI_ProcessNewAlns(SeqAlignPtr sap)
4375 {
4376    SeqAlignPtr  sap_head;
4377 
4378    if (sap == NULL)
4379       return NULL;
4380    sap_head = SeqAlignNew();
4381    sap_head->segtype = SAS_DISC;
4382    sap_head->segs = (Pointer)sap;
4383    AlnMgr2IndexLite(sap_head);
4384    SPI_RemoveInconsistentAlnsFromSet(sap_head, SPI_TEENYEXON, 1, SPI_LEFT);
4385    sap = (SeqAlignPtr)(sap_head->segs);
4386    sap_head->segs = NULL;
4387    MemFree(sap_head);
4388    return sap;
4389 }
4390 
4391 /***************************************************************************
4392 *
4393 *  SPI_IsItPolyA is a utility function which returns the length of
4394 *  the poly(A) tail of a sequence. The poly(A) tail must be at least
4395 *  SPI_MINPOLYASIZE long and can have non-A residues up to the SPI_LINKERSIZE
4396 *  position from the end (linker used to clone the cDNA may sometimes be left
4397 *  on the end of the poly(A) tail).
4398 *
4399 ***************************************************************************/
SPI_IsItPolyA(SeqIdPtr sip)4400 static Int4 SPI_IsItPolyA(SeqIdPtr sip)
4401 {
4402    BioseqPtr   bsp;
4403    Uint1       buf[SPI_MAXPOLYASIZE];
4404    Int4        count;
4405    Boolean     done;
4406    Int4        i;
4407    Int4        j;
4408    Uint1       res;
4409    SeqPortPtr  spp;
4410    Int4        start;
4411 
4412    bsp = BioseqLockById(sip);
4413    spp = SeqPortNew (bsp, bsp->length - SPI_MAXPOLYASIZE, bsp->length-1, Seq_strand_minus, Seq_code_ncbi4na);
4414    /* port on the minus strand -- shows up earlier if it's not polyA */
4415    done = FALSE;
4416    i = 0;
4417    j = 0;
4418    start = 0;
4419    count = 0;
4420    SeqPortRead(spp, buf, SPI_MAXPOLYASIZE);
4421    while (((res = buf[j]) != SEQPORT_EOF) && !done)
4422    {
4423       if (res != 8)
4424       {
4425          if (count >= SPI_LINKERSIZE) /* can have non-A bases up to the SPI_LINKERSIZE position */
4426             done = TRUE;
4427          else
4428             start++;
4429       } else
4430          i++;
4431       count++;
4432       j++;
4433    }
4434    if (start > 0)
4435    {
4436       if (i-start < SPI_MINPOLYASIZE) /* "tail" is too short */
4437       {
4438          SeqPortFree(spp);
4439          return 0;
4440       }
4441    }
4442    SeqPortFree(spp);
4443    BioseqUnlock(bsp);
4444    return i;
4445 }
4446 
4447 /***************************************************************************
4448 *
4449 *  SPI_FillInIntron is able to fill in internal gaps for SPI_ConnectAln.
4450 *  Given mRNA and genomic boundaries, SPI_FillInIntron first does a low-
4451 *  stringency BLAST, then removes inconsistent and overlapping alignments
4452 *  from the resulting alignment set, and finally calls
4453 *  SPI_FillInLastmRNAHoles to internally connect the new alignments.
4454 *
4455 ***************************************************************************/
SPI_FillInIntron(SeqIdPtr sip1,SeqIdPtr sip2,Int4 start1,Int4 stop1,Int4 start2,Int4 stop2,Uint1 strand2,SPI_OptionsPtr spot)4456 static SeqAlignPtr SPI_FillInIntron(SeqIdPtr sip1, SeqIdPtr sip2, Int4 start1, Int4 stop1, Int4 start2, Int4 stop2, Uint1 strand2, SPI_OptionsPtr spot)
4457 {
4458    BLAST_OptionsBlkPtr  options;
4459    SeqAlignPtr          sap;
4460    SeqLocPtr            slp1;
4461    SeqLocPtr            slp2;
4462    Int4                 start;
4463    Int4                 stop;
4464 
4465    if (stop1 - start1 < SPI_MINBLASTSIZE)
4466       return NULL;
4467    slp1 = SeqLocIntNew(start1, stop1, Seq_strand_plus, sip1);
4468    if (strand2 == Seq_strand_minus)
4469    {
4470       start = stop2;
4471       stop = start2;
4472    } else
4473    {
4474       start = start2;
4475       stop = stop2;
4476    }
4477    if (stop - start < SPI_MINBLASTSIZE)
4478    {
4479       SeqLocFree(slp1);
4480       return NULL;
4481    }
4482    slp2 = SeqLocIntNew(start, stop, strand2, sip2);
4483    options = BLASTOptionNew("blastn", FALSE);
4484    options->filter_string = StringSave("m L");
4485    options->expect_value = spot->thirdpasseval;
4486    options->query_lcase_mask = spot->lcaseloc;
4487    options->wordsize = 7;
4488    if (spot->interspecies)
4489    {
4490       options->gap_x_dropoff_final = 100;
4491       options->gap_open = 4;
4492       options->gap_extend = 1;
4493       options->penalty = -1;
4494    }
4495    sap = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
4496 
4497 
4498    SeqLocFree(slp1);
4499    SeqLocFree(slp2);
4500    AlnMgr2IndexLite(sap);
4501    if (sap != NULL)
4502    {
4503       SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
4504       AMAlignIndex2Free2(sap->saip);
4505       sap->saip = NULL;
4506    }
4507    AlnMgr2IndexLite(sap);
4508    SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON/2, 1, SPI_LEFT);
4509    BLASTOptionDelete(options);
4510    sap = SPI_FillInLastmRNAHoles(sap, sip1, sip2, start1, stop1, start, stop, strand2);
4511    return sap;
4512 }
4513 
4514 /***************************************************************************
4515 *
4516 *  spi_isa_gap is an often-called utility function that returns the size
4517 *  of the difference between two sequence positions, given the strand. If
4518 *  the sequence positions overlap or abut exactly, the function returns
4519 *  0. The strand is important:
4520 *
4521 *  13---------40   50--------60 plus strand--> gap of 8
4522 *              |    |
4523 *          prevstop start
4524 *              |    |
4525 *  60---------50   40-------13 minus strand--> gap of 8
4526 *
4527 ***************************************************************************/
spi_isa_gap(Int4 start,Int4 prevstop,Uint1 strand)4528 static Int4 spi_isa_gap(Int4 start, Int4 prevstop, Uint1 strand)
4529 {
4530    if (prevstop == -1)
4531       prevstop = 0;
4532    if (start == -1)
4533       start = 0;
4534    if (strand != Seq_strand_minus)
4535    {
4536       if (start > prevstop+1)
4537          return (start - prevstop+1);
4538       else
4539          return 0;
4540    } else
4541    {
4542       if (prevstop > start+1)
4543          return (prevstop - (start+1));
4544       else
4545          return 0;
4546    }
4547 }
4548 
4549 /***************************************************************************
4550 *
4551 *  SPI_GetNthSeqLenInSASet assumes that the alignment given is a
4552 *  set of alignments that all have the same rows. The function returns
4553 *  the length of the Nth row that is covered by the alignment set.
4554 *
4555 ***************************************************************************/
SPI_GetNthSeqLenInSASet(SeqAlignPtr sap,Int4 n,Int4Ptr numsaps)4556 static Int4 SPI_GetNthSeqLenInSASet(SeqAlignPtr sap, Int4 n, Int4Ptr numsaps)
4557 {
4558    Int4         len;
4559    Int4         num;
4560    SeqAlignPtr  salp;
4561    Int4         start_tmp;
4562    Int4         stop_tmp;
4563 
4564    if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
4565       return -1;
4566    salp = (SeqAlignPtr)(sap->segs);
4567    start_tmp = stop_tmp = -1;
4568    len = 0;
4569    num = 0;
4570    while (salp != NULL)
4571    {
4572       num++;
4573       if (n > salp->dim)
4574          return 0;
4575       AlnMgr2GetNthSeqRangeInSA(salp, n, &start_tmp, &stop_tmp);
4576       len += (stop_tmp - start_tmp + 1);
4577       salp = salp->next;
4578    }
4579    if (numsaps)
4580       *numsaps = num;
4581    return len;
4582 }
4583 
4584 /***************************************************************************
4585 *
4586 *  SPI_GetNthSeqRangeInSASet is used to get the 5' and 3' boundaries
4587 *  of a sequence across a set of alignments. 'N' refers to row number,
4588 *  and all the alignments are assumed to have the same rows. Note that
4589 *  this function says nothing about the coverage of the specified
4590 *  sequence.
4591 *
4592 ***************************************************************************/
SPI_GetNthSeqRangeInSASet(SeqAlignPtr sap,Int4 n,Int4Ptr start,Int4Ptr stop)4593 static void SPI_GetNthSeqRangeInSASet(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop)
4594 {
4595    Int4         numrows;
4596    SeqAlignPtr  salp;
4597    Int4         start_tmp;
4598    Int4         stop_tmp;
4599    Int4         tmp1;
4600    Int4         tmp2;
4601 
4602    if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
4603       return;
4604    salp = (SeqAlignPtr)(sap->segs);
4605    start_tmp = stop_tmp = -1;
4606    while (salp != NULL)
4607    {
4608       numrows = AlnMgr2GetNumRows(salp);
4609       if (n > numrows)
4610       {
4611          if (start)
4612             *start = -1;
4613          if (stop)
4614             *stop = -1;
4615          return;
4616       }
4617       AlnMgr2GetNthSeqRangeInSA(salp, n, &tmp1, &tmp2);
4618       if (tmp1 < start_tmp || start_tmp == -1)
4619          start_tmp = tmp1;
4620       if (tmp2 > stop_tmp)
4621          stop_tmp = tmp2;
4622       salp = salp->next;
4623    }
4624    if (start)
4625       *start = start_tmp;
4626    if (stop)
4627       *stop = stop_tmp;
4628 }
4629 
4630 /***************************************************************************
4631 *
4632 *  SPI_FindPiece is used to align a piece of mRNA with a tail of genomic
4633 *  sequence:
4634 *  which_end            start_g               start_m-stop_m
4635 *   SPI_LEFT    0<-------4000  genomic           X 30-61 mRNA (plus strand)
4636 *   SPI_RIGHT            7000----->3'end genomic X 79-40 mRNA (minus strand)
4637 *
4638 *  SPI_FindPiece first does a low-stringency BLAST search to try to align
4639 *  the desired piece, and then calls functions to fill out the alignment
4640 *  so that the new alignment is well-connected with the other pieces and
4641 *  internally complete.
4642 *  Since the strands and orientation are constrained by the other
4643 *  alignments in the set, the BLAST search is only done within the
4644 *  specified boundaries and for the specified strand.
4645 *
4646 ***************************************************************************/
SPI_FindPiece(SeqIdPtr sip1,SeqIdPtr sip2,Int4 start_m,Int4 stop_m,Uint1 strand,Int4 start_g,Int2 which_end,SPI_OptionsPtr spot)4647 static SeqAlignPtr SPI_FindPiece(SeqIdPtr sip1, SeqIdPtr sip2, Int4 start_m, Int4 stop_m, Uint1 strand, Int4 start_g, Int2 which_end, SPI_OptionsPtr spot)
4648 {
4649    Int4                 bigintron;
4650    BioseqPtr            bsp1;
4651    Int4                 gstart;
4652    Int4                 gstop;
4653    Int4                 mstart;
4654    Int4                 mstop;
4655    BLAST_OptionsBlkPtr  options;
4656    SeqAlignPtr          sap;
4657    SeqAlignPtr          sap_new;
4658    SeqAlignPtr          sap_new2;
4659    SeqLocPtr            slp1;
4660    SeqLocPtr            slp2;
4661    Int4                 start;
4662    Int4                 stop;
4663 
4664    if (sip1 == NULL || sip2 == NULL)
4665       return NULL;
4666    /*KSK*/
4667    if (spot->bigintron){
4668        bigintron = MAX(SPI_BIGINTRONXL, spot->bigintron_size);
4669    }
4670    else {
4671        bigintron = SPI_BIGINTRON;
4672    }
4673    if ((strand == Seq_strand_minus && start_m - stop_m < 7) || (strand != Seq_strand_minus && stop_m - start_m < 7))
4674       return NULL;
4675    if (start_m < 0)
4676       start_m = 0;
4677    if (start_m == -1)
4678       start_m = 0;
4679    if (stop_m == -1)
4680       stop_m = 0;
4681    if (start_g == -1)
4682       start_g = 0;
4683    if (which_end == SPI_LEFT)
4684    {
4685       if (start_g < SPI_FUZZ)
4686          return NULL;
4687       if (strand == Seq_strand_minus)
4688       {
4689          if (start_m - stop_m > start_g + 2*SPI_FUZZ)
4690             return NULL;
4691          start = MAX(0, start_g - bigintron);
4692          slp1 = SeqLocIntNew(MAX(start, spot->from), MIN(start_g, spot->to), Seq_strand_plus, sip1);
4693          slp2 = SeqLocIntNew(stop_m, start_m, strand, sip2);
4694          options = BLASTOptionNew("blastn", FALSE);
4695          options->wordsize = 7;
4696          options->filter_string = StringSave("m L");
4697          options->expect_value = spot->secpasseval;
4698          options->query_lcase_mask = spot->lcaseloc;
4699          if (spot->interspecies)
4700          {
4701             options->gap_x_dropoff_final = 100;
4702             options->gap_open = 4;
4703             options->gap_extend = 1;
4704             options->penalty = -1;
4705          }
4706          sap = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
4707          if (sap == NULL)
4708             return NULL;
4709          AlnMgr2IndexLite(sap);
4710          SeqLocFree(slp1);
4711          SeqLocFree(slp2);
4712          SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
4713          AMAlignIndex2Free2(sap->saip);
4714          sap->saip = NULL;
4715          AlnMgr2IndexLite(sap);
4716          SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON/2, 1, SPI_RIGHT);
4717          SPI_GetNthSeqRangeInSASet(sap, 1, &gstart, &gstop);
4718          SPI_GetNthSeqRangeInSASet(sap, 2, &mstart, &mstop);
4719          BLASTOptionDelete(options);
4720          if (spot->draftfile != NULL)
4721             return sap;
4722          if (mstart - stop_m <= 3*SPI_FUZZ)
4723          {
4724             sap_new = sap;
4725             if (sap_new->segtype == SAS_DISC)
4726             {
4727                sap_new2 = (SeqAlignPtr)(sap_new->segs);
4728                sap_new->segs = NULL;
4729                SeqAlignFree(sap_new);
4730                sap_new = sap_new2;
4731                while (sap_new2 != NULL)
4732                {
4733                   AlnMgr2IndexSingleChildSeqAlign(sap_new2);
4734                   sap_new2 = sap_new2->next;
4735                }
4736             } else
4737                SPI_AddToAln(sap_new, mstart - stop_m, SPI_RIGHT, strand);
4738             return sap_new;
4739          } else
4740          {
4741             sap_new = SPI_FillInLastmRNAHoles(sap, sip1, sip2, gstart, start_g-1, stop_m+1, mstop, strand);
4742             return sap_new;
4743          }
4744       } else
4745       {
4746          if (stop_m - start_m > start_g + 2*SPI_FUZZ)
4747             return NULL;
4748          start = MAX(0, start_g - bigintron);
4749          slp1 = SeqLocIntNew(MAX(start, spot->from), MIN(start_g, spot->to), Seq_strand_plus, sip1);
4750          slp2 = SeqLocIntNew(start_m, stop_m, strand, sip2);
4751          options = BLASTOptionNew("blastn", FALSE);
4752          options->wordsize = 7;
4753          options->filter_string = StringSave("m L");
4754          options->expect_value = spot->secpasseval;
4755          options->query_lcase_mask = spot->lcaseloc;
4756          if (spot->interspecies)
4757          {
4758             options->gap_x_dropoff_final = 100;
4759             options->gap_open = 4;
4760             options->gap_extend = 1;
4761             options->penalty = -1;
4762          }
4763          sap = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
4764          if (sap == NULL)
4765             return NULL;
4766          AlnMgr2IndexLite(sap);
4767          SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
4768          AMAlignIndex2Free2(sap->saip);
4769          SeqLocFree(slp1);
4770          SeqLocFree(slp2);
4771          sap->saip = NULL;
4772          AlnMgr2IndexLite(sap);
4773          SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON/2, 1, SPI_RIGHT);
4774          SPI_GetNthSeqRangeInSASet(sap, 1, &gstart, &gstop);
4775          SPI_GetNthSeqRangeInSASet(sap, 2, &mstart, &mstop);
4776          BLASTOptionDelete(options);
4777          if (spot->draftfile != NULL)
4778             return sap;
4779          if (stop_m - mstop <= 3*SPI_FUZZ)
4780          {
4781             sap_new = sap;
4782             if (sap_new->segtype == SAS_DISC)
4783             {
4784                sap_new2 = (SeqAlignPtr)(sap_new->segs);
4785                sap_new->segs = NULL;
4786                SeqAlignFree(sap_new);
4787                sap_new = sap_new2;
4788                while (sap_new2 != NULL)
4789                {
4790                   AlnMgr2IndexSingleChildSeqAlign(sap_new2);
4791                   sap_new2 = sap_new2->next;
4792                }
4793             } else
4794                SPI_AddToAln(sap_new, stop_m - mstop, SPI_RIGHT, strand);
4795             return sap_new;
4796          } else
4797          {
4798             sap_new = SPI_FillInLastmRNAHoles(sap, sip1, sip2, gstart, start_g-1, mstart, stop_m+1, strand);
4799             return sap_new;
4800          }
4801       }
4802    } else if (which_end == SPI_RIGHT)
4803    {
4804       bsp1 = BioseqLockById(sip1);
4805       if (bsp1 == NULL)
4806          return NULL;
4807       if (start_g > bsp1->length - SPI_FUZZ)
4808          return NULL;
4809       if (strand == Seq_strand_minus)
4810       {
4811          if (start_m - stop_m > bsp1->length - start_g - 2*SPI_FUZZ)
4812             return NULL;
4813          stop = MIN(bsp1->length-1, start_g + bigintron);
4814          slp1 = SeqLocIntNew(MAX(start_g, spot->from), MIN(stop, spot->to), Seq_strand_plus, sip1);
4815          slp2 = SeqLocIntNew(stop_m, start_m, strand, sip2);
4816          options = BLASTOptionNew("blastn", FALSE);
4817          options->wordsize = 7;
4818          options->filter_string = StringSave("m L");
4819          options->expect_value = spot->secpasseval;
4820          options->query_lcase_mask = spot->lcaseloc;
4821          if (spot->interspecies)
4822          {
4823             options->gap_x_dropoff_final = 100;
4824             options->gap_open = 4;
4825             options->gap_extend = 1;
4826             options->penalty = -1;
4827          }
4828          sap = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
4829          if (sap == NULL)
4830             return NULL;
4831          AlnMgr2IndexLite(sap);
4832          SeqLocFree(slp1);
4833          SeqLocFree(slp2);
4834          SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
4835          AMAlignIndex2Free2(sap->saip);
4836          sap->saip = NULL;
4837          AlnMgr2IndexLite(sap);
4838          SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON/2, 1, SPI_LEFT);
4839          SPI_GetNthSeqRangeInSASet(sap, 1, &gstart, &gstop);
4840          SPI_GetNthSeqRangeInSASet(sap, 2, &mstart, &mstop);
4841          BLASTOptionDelete(options);
4842          if (spot->draftfile != NULL)
4843             return sap;
4844          if (start_m - mstop <= 3*SPI_FUZZ)
4845          {
4846             sap_new = sap;
4847             if (sap_new->segtype == SAS_DISC)
4848             {
4849                sap_new2 = (SeqAlignPtr)(sap_new->segs);
4850                sap_new->segs = NULL;
4851                SeqAlignFree(sap_new);
4852                sap_new = sap_new2;
4853                while (sap_new2 != NULL)
4854                {
4855                   AlnMgr2IndexSingleChildSeqAlign(sap_new2);
4856                   sap_new2 = sap_new2->next;
4857                }
4858             } else
4859                SPI_AddToAln(sap_new, start_m - mstop, SPI_LEFT, strand);
4860             return sap_new;
4861          } else
4862          {
4863             sap_new = SPI_FillInLastmRNAHoles(sap, sip1, sip2, start_g+1, gstop, mstart, start_m-1, strand);
4864             return sap_new;
4865          }
4866       } else
4867       {
4868          if (stop_m - start_m > bsp1->length - start_g - 2*SPI_FUZZ)
4869             return NULL;
4870          stop = MIN(bsp1->length-1, start_g + bigintron);
4871          slp1 = SeqLocIntNew(MAX(start_g, spot->from), MIN(stop, spot->to), Seq_strand_plus, sip1);
4872          slp2 = SeqLocIntNew(start_m, stop_m, strand, sip2);
4873          options = BLASTOptionNew("blastn", FALSE);
4874          options->wordsize = 7;
4875          options->filter_string = StringSave("m L");
4876          options->expect_value = spot->secpasseval;
4877          options->query_lcase_mask = spot->lcaseloc;
4878          if (spot->interspecies)
4879          {
4880             options->gap_x_dropoff_final = 100;
4881             options->gap_open = 4;
4882             options->gap_extend = 1;
4883             options->penalty = -1;
4884          }
4885          sap = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
4886          if (sap == NULL)
4887             return NULL;
4888          AlnMgr2IndexLite(sap);
4889          SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
4890          AMAlignIndex2Free2(sap->saip);
4891          SeqLocFree(slp1);
4892          SeqLocFree(slp2);
4893          sap->saip = NULL;
4894          AlnMgr2IndexLite(sap);
4895          SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON/2, 1, SPI_LEFT);
4896          SPI_GetNthSeqRangeInSASet(sap, 1, &gstart, &gstop);
4897          SPI_GetNthSeqRangeInSASet(sap, 2, &mstart, &mstop);
4898          BLASTOptionDelete(options);
4899          if (spot->draftfile != NULL)
4900             return sap;
4901          if (mstart - start_m <= 3*SPI_FUZZ)
4902          {
4903             sap_new = sap;
4904             if (sap_new->segtype == SAS_DISC)
4905             {
4906                sap_new2 = (SeqAlignPtr)(sap_new->segs);
4907                sap_new->segs = NULL;
4908                SeqAlignFree(sap_new);
4909                sap_new = sap_new2;
4910                while (sap_new2 != NULL)
4911                {
4912                   AlnMgr2IndexSingleChildSeqAlign(sap_new2);
4913                   sap_new2 = sap_new2->next;
4914                }
4915             } else
4916                SPI_AddToAln(sap_new, mstart - start_m, SPI_LEFT, strand);
4917             return sap_new;
4918          } else
4919          {
4920             sap_new = SPI_FillInLastmRNAHoles(sap, sip1, sip2, start_g+1, gstop, start_m+1, mstop, strand);
4921             return sap_new;
4922          }
4923       }
4924    }
4925    return NULL;
4926 }
4927 
4928 
4929 /* added by KSK for SPI_AdjustForSplice() when mRNA regions overlap */
SPI_Choose2LooseMrnaOvLap(const SeqAlignPtr sap1,const SeqAlignPtr sap2,const SPI_mRNAPtr smp,const int ptr1offset)4930 static int SPI_Choose2LooseMrnaOvLap (const SeqAlignPtr sap1, const SeqAlignPtr sap2,
4931                                       const SPI_mRNAPtr smp, const int ptr1offset)
4932 {
4933     Int4 p1_sites = 0, p2_sites = 0;
4934     Int4 score1 = 0, score2 = 0;
4935     float margin = 0;
4936 
4937     if (sap1 == NULL || sap2 == NULL || smp == NULL){
4938         return -1;
4939     }
4940 
4941     score1 = AlnMgr2ComputeScoreForSeqAlign(sap1);
4942     score2 = AlnMgr2ComputeScoreForSeqAlign(sap2);
4943 
4944     if (score1 >= score2){
4945         margin = (float)score1/5;
4946         if ((float)score1 >= (((float)(score2)) + margin)){
4947             return ptr1offset + 1;
4948         }
4949     }
4950     else if (score1 <= score2){
4951         margin = (float)score2/5;
4952         if ((float)score2 >= (((float)(score1)) + margin)){
4953             return ptr1offset;
4954         }
4955     }
4956 
4957     p1_sites = smp->splicedon[ptr1offset] + smp->spliceacc[ptr1offset];
4958     p2_sites = smp->splicedon[ptr1offset + 1] + smp->spliceacc[ptr1offset + 1];
4959 
4960     if (p1_sites > p2_sites){
4961         return ptr1offset + 1;
4962     }
4963     else if (p2_sites > p1_sites){
4964         return ptr1offset;
4965     }
4966     return (score1 >= score2 ? ptr1offset + 1 : ptr1offset);
4967 }
4968 
4969 
4970 
4971 
4972 /***************************************************************************
4973 *
4974 *  SPI_AdjustForSplice adjusts the boundaries of all the alignments in
4975 *  the set so that they abut each other and are at the optimal splice
4976 *  sites. SPI_AdjustForSplice also fills in the mismatch, %id, #gaps
4977 *  and other information for each exon. The function first allocates a new
4978 *  SPI_mRNA structure, then makes sure that the set of alignments doesn't
4979 *  miss tiny pieces on the 5' or 3' end of the mRNA. Next, the alignments
4980 *  are sent in pairs to SPI_AdjustOverlaps, which adjusts the alignment
4981 *  boundaries so that they are adjacent to each other and to good splice
4982 *  sites. The function then checks to see whether any two alignments are
4983 *  adjacent on both the genomic and mRNA sequences; if so, these alignments
4984 *  are merged. Each alignment (now each alignment is exactly one exon) is
4985 *  sent to SPI_GetExonInfo to get the %id, #gaps, etc.; the overall
4986 *  % coverage is computed and the alignments are examined to see whether
4987 *  one or both ends of the mRNA are missing, and then all the information
4988 *  is returned to the calling function.
4989 *
4990 ***************************************************************************/
SPI_AdjustForSplice(SeqAlignPtr sap,SPI_OptionsPtr spot,SPI_RegionInfoPtr srip)4991 static SPI_mRNAPtr SPI_AdjustForSplice(SeqAlignPtr sap, SPI_OptionsPtr spot, SPI_RegionInfoPtr srip)
4992 {
4993    AMAlignIndex2Ptr  amaip;
4994    Int4             b;
4995    BioseqPtr        bsp;
4996    Int4             c;
4997    Int4             count, sap2delete = 0;
4998    Int4             gstart1;
4999    Int4             gstart2;
5000    Int4             gstop1;
5001    Int4             gstop2;
5002    Int4             i;
5003    Int4             intronsize;
5004    Int4             len;
5005    Int4             len1;
5006    Int4             len2;
5007    Int4             min = 0;
5008    Int4             mis;
5009    Int4             max = 0;
5010    Int4             mstart1;
5011    Int4             mstart2;
5012    Int4             mstop1;
5013    Int4             mstop2;
5014    Int4             n;
5015    SeqAlignPtr      PNTR saparray;
5016    SeqIdPtr         sip;
5017    SPI_mRNAPtr      smp;
5018    SPI_mRNAPtr      smp_new;
5019    Uint1            strand;
5020 
5021    if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
5022       return NULL;
5023    if (spot->bigintron){
5024        intronsize = (spot->bigintron_size > SPI_INTRONSIZEXL
5025                      ? spot->bigintron_size : SPI_INTRONSIZEXL);
5026        /*intronsize = SPI_INTRONSIZEXL;*/
5027    }
5028    else{
5029        intronsize = SPI_INTRONSIZE;
5030    }
5031    /*end KSK*/
5032    AlnMgr2SortAlnSetByNthRowPos(sap, 1);
5033    SPI_RemoveTeenyAlns(sap, SPI_TEENYEXON);
5034    if (sap->segs == NULL)
5035    {
5036       SeqAlignFree(sap);
5037       return NULL;
5038    }
5039    amaip = (AMAlignIndex2Ptr)(sap->saip);
5040    strand = AlnMgr2GetNthStrand(amaip->saps[0], 2);
5041    /* first allocate a new SPI_mRNA structure to hold all the information */
5042    smp = (SPI_mRNAPtr)MemNew(sizeof(SPI_mRNA));
5043    smp->numexons = amaip->numsaps;
5044    smp->exonid = (FloatHiPtr)MemNew((smp->numexons)*sizeof(FloatHi));
5045    smp->splicedon = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
5046    smp->spliceacc = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
5047    smp->exongaps = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
5048    smp->saps = (SeqAlignPtr PNTR)MemNew((smp->numexons)*sizeof(SeqAlignPtr));
5049    smp->mstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
5050    smp->mstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
5051    smp->gstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
5052    smp->gstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
5053    smp->strand = strand;
5054    /* make sure the alignment doesn't leave out little bits on the ends */
5055    sip = AlnMgr2GetNthSeqIdPtr(amaip->saps[0], 2);
5056    bsp = BioseqLockById(sip);
5057    len1 = bsp->length;
5058    len1 = len1 - srip->polyAtail;
5059    BioseqUnlock(bsp);
5060    SeqIdFree(sip);
5061    sip = AlnMgr2GetNthSeqIdPtr(amaip->saps[0], 1);
5062    bsp = BioseqLockById(sip);
5063    len2 = bsp->length;
5064    BioseqUnlock(bsp);
5065    SeqIdFree(sip);
5066    if (strand != Seq_strand_minus)
5067    {
5068       AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 1, &gstart1, &gstop1);
5069       AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 2, &mstart1, &mstop1);
5070       if (mstart1 > 0 && mstart1 <= SPI_ENDFUZZ && gstart1 >= mstart1)
5071          SPI_AddToAln(amaip->saps[0], mstart1, SPI_LEFT, strand);
5072       else if (mstart1 > 0 && mstart1 <= SPI_ENDFUZZ && gstart1 < mstart1)
5073          SPI_AddToAln(amaip->saps[0], gstart1, SPI_LEFT, strand);
5074       AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 2, &mstart2, &mstop2);
5075       AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 1, &gstart2, &gstop2);
5076       if (len1 - srip->polyAtail - mstop2-1 > 0 && len1 - mstop2-1 <= SPI_ENDFUZZ && len2-gstop2 >= len1-mstop2)
5077          SPI_AddToAln(amaip->saps[amaip->numsaps-1], len1-mstop2-1, SPI_RIGHT, strand);
5078       else if (len1-mstop2-1 - srip->polyAtail > 2 && len1 - mstop2-1 <= SPI_ENDFUZZ && len1-mstop2 > len2 - gstop2)
5079          SPI_AddToAln(amaip->saps[amaip->numsaps-1], len2-gstop2-1, SPI_RIGHT, strand);
5080    } else
5081    {
5082       AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 1, &gstart1, &gstop1);
5083       AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 2, &mstart2, &mstop2);
5084       if (len1 - mstop2-1-srip->polyAtail > 0 && len1 - mstop2-1 <= 2*SPI_TEENYEXON && gstart1 >= (len1 - mstop2 - 1))
5085          SPI_AddToAln(amaip->saps[0], len1-mstop2-1, SPI_LEFT, strand);
5086       else if (len1 - mstop2-1-srip->polyAtail > 0 && len1 - mstop2-1 <= 2*SPI_TEENYEXON && gstart1 < (len1 - mstop2 - 1))
5087          SPI_AddToAln(amaip->saps[0], gstart1, SPI_LEFT, strand);
5088       AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 2, &mstart1, &mstop1);
5089       AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 1, &gstart2, &gstop2);
5090       if (mstart1 > 0 && mstart1 <= SPI_ENDFUZZ && len2 - gstop2 -1> mstart1)
5091          SPI_AddToAln(amaip->saps[amaip->numsaps-1], mstart1, SPI_RIGHT, strand);
5092       else if (mstart1 > 0 && mstart1 <= SPI_ENDFUZZ && len2 - gstop2 -1<= mstart1)
5093           SPI_AddToAln(amaip->saps[amaip->numsaps-1], len2-gstop2-1, SPI_RIGHT, strand);
5094    }
5095    /* send the alignments in pairs to be adjusted to good splice sites */
5096    for (i=0; i<amaip->numsaps - 1; i++)
5097    {
5098       SPI_AdjustOverlaps(amaip->saps[i], amaip->saps[i+1], i, smp, spot);
5099    }
5100    n = 0;
5101    for (i=0; i<amaip->numsaps-1; i++) /* merge adjacent alignments */
5102    {
5103 
5104       amaip->saps[i]->next = NULL;
5105       amaip->saps[i+1]->next = NULL;
5106       AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 1, &gstart1, &gstop1);
5107       AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 1, &gstart2, &gstop2);
5108       AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, &mstart1, &mstop1);
5109       AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 2, &mstart2, &mstop2);
5110       /* if (gstart2 >= gstop1 - SPI_EXONMERGESIZE && gstart2 <= gstop1 + SPI_EXONMERGESIZE) */
5111       if (gstart2 >= gstop1 - SPI_EXONMERGESIZE && gstart2 <= gstop1 + SPI_EXONMERGESIZE){
5112           if ((mstart2 >= mstop1 - SPI_EXONMERGESIZE && mstart2 <= mstop1 + SPI_EXONMERGESIZE)
5113               || (mstart1 >= mstop2 - SPI_EXONMERGESIZE && mstart1 <= mstop2 + SPI_EXONMERGESIZE)){
5114               amaip->saps[i+1] = SPI_MergeAlignments(amaip->saps[i], amaip->saps[i+1]);
5115               SeqAlignFree(amaip->saps[i]);
5116               amaip->saps[i] = NULL;
5117               n++;
5118           }
5119       }
5120       /** KSK added this 'else if' block in case there are overlapping **
5121        ** mrna sequences shared by these seqaligns **/
5122 
5123       else if ((strand == Seq_strand_minus && (mstop2 <= mstop1 && mstop2 >= mstart1))
5124                || (strand == Seq_strand_plus && (mstop1 <= mstop2 && mstop1 >= mstart2))){
5125           /* fixes in case the MRNA portion of different regions overlap */
5126           /* first, if one is subsumed */
5127           if (mstart1 >= mstart2 && mstop1 <= mstop2){
5128               SeqAlignFree(amaip->saps[i]);
5129               amaip->saps[i] = NULL;
5130               n++;
5131           }
5132           else if (mstart2 <= mstart1 && mstop2 >= mstop1){
5133               SeqAlignFree(amaip->saps[i+1]);
5134               amaip->saps[i+1] = amaip->saps[i];
5135               amaip->saps[i] = NULL;
5136               n++;
5137           }
5138           /* now if mRNA region overlaps - simply choose the highest score */
5139           /* unless the lower has splice donor & acceptor and the higher */
5140           /* does not */
5141           else {
5142               if ((sap2delete = SPI_Choose2LooseMrnaOvLap(amaip->saps[i], amaip->saps[i+1],
5143                                                           smp, i)) != -1) {
5144                   if (sap2delete == i){
5145                       SeqAlignFree(amaip->saps[i]);
5146                       amaip->saps[i] = NULL;
5147                       n++;
5148                   }
5149                   else if (sap2delete == (i+1)){
5150                       SeqAlignFree(amaip->saps[i+1]);
5151                       amaip->saps[i+1] = amaip->saps[i];
5152                       amaip->saps[i] = NULL;
5153                       n++;
5154                   }
5155               }
5156           }
5157       }
5158    }
5159 
5160    if (n > 0) /* some alignments were merged; need a new smp */
5161    {
5162       saparray = (SeqAlignPtr PNTR)MemNew((amaip->numsaps-n)*sizeof(SeqAlignPtr));
5163       count = 0;
5164       smp_new = (SPI_mRNAPtr)MemNew(sizeof(SPI_mRNA));
5165       smp_new->numexons = amaip->numsaps-n;
5166       smp_new->exonid = (FloatHiPtr)MemNew((smp_new->numexons)*sizeof(FloatHi));
5167       smp_new->splicedon = (Uint1Ptr)MemNew((smp_new->numexons)*sizeof(Uint1));
5168       smp_new->spliceacc = (Uint1Ptr)MemNew((smp_new->numexons)*sizeof(Uint1));
5169       smp_new->exongaps = (Int4Ptr)MemNew((smp_new->numexons)*sizeof(Int4));
5170       smp_new->saps = (SeqAlignPtr PNTR)MemNew((smp_new->numexons)*sizeof(SeqAlignPtr));
5171       smp_new->mstarts = (Int4Ptr)MemNew((smp_new->numexons)*sizeof(Int4));
5172       smp_new->mstops = (Int4Ptr)MemNew((smp_new->numexons)*sizeof(Int4));
5173       smp_new->gstarts = (Int4Ptr)MemNew((smp_new->numexons)*sizeof(Int4));
5174       smp_new->gstops = (Int4Ptr)MemNew((smp_new->numexons)*sizeof(Int4));
5175       smp_new->strand = strand;
5176       n = 0;
5177       for (i=0; i<amaip->numsaps; i++)
5178       {
5179          if (amaip->saps[i] != NULL)
5180          {
5181             saparray[n] = amaip->saps[i];
5182             n++;
5183             if (strand == Seq_strand_minus)
5184             {
5185                if (i != 0)
5186                   smp_new->splicedon[count] = smp->splicedon[i-1];
5187                smp_new->spliceacc[count] = smp->spliceacc[i];
5188             } else
5189             {
5190                smp_new->splicedon[count] = smp->splicedon[i];
5191                if (i != 0)
5192                   smp_new->spliceacc[count] = smp->spliceacc[i-1];
5193             }
5194             smp_new->saps[count] = smp->saps[i];
5195             count++;
5196          }
5197          smp->saps[i] = NULL;
5198       }
5199       MemFree(amaip->saps);
5200       amaip->saps = saparray;
5201       amaip->numsaps = n;
5202       SPI_mRNAFree(smp);
5203       smp = smp_new;
5204    }
5205    len = 0;
5206    max = 0;
5207    mis = 0;
5208    /* now get the %id, #mismatches, #gaps, etc. for each exon */
5209    for (i=0; i<amaip->numsaps; i++)
5210    {
5211       smp->saps[i] = amaip->saps[i];
5212       len += SPI_GetExonInfo(smp, i, &b, &c, &mis, spot);
5213       if (i == 0)
5214          min = b;
5215       else
5216       {
5217          if (b < min)
5218             min = b;
5219       }
5220       if (c > max)
5221          max = c;
5222    }
5223    sip = AlnMgr2GetNthSeqIdPtr(smp->saps[0], 1);
5224    bsp = BioseqLockById(sip);
5225    len2 = bsp->length;
5226    BioseqUnlock(bsp);
5227    SeqIdFree(sip);
5228    /* decide whether either end is left out of the alignments */
5229    if (min != 0)
5230    {
5231       if (max < len1 - 1)
5232          smp->missingends = SPI_BOTH;
5233       else
5234          smp->missingends = SPI_LEFT;
5235    } else
5236    {
5237       if (max < len1 - 1)
5238          smp->missingends = SPI_RIGHT;
5239       else
5240          smp->missingends = SPI_NEITHER;
5241    }
5242    if (srip->strand == Seq_strand_minus)
5243    {
5244       if ((smp->missingends == SPI_RIGHT || smp->missingends == SPI_BOTH) && smp->gstarts[0] < intronsize)
5245          srip->fallsoff = TRUE;
5246       if ((smp->missingends == SPI_LEFT || smp->missingends == SPI_BOTH) && smp->gstops[smp->numexons-1] > len2 - intronsize)
5247          srip->fallsoff = TRUE;
5248    } else
5249    {
5250       if ((smp->missingends == SPI_LEFT  || smp->missingends == SPI_BOTH) && smp->gstops[smp->numexons-1] > len2 - intronsize)
5251          srip->fallsoff = TRUE;
5252       if ((smp->missingends == SPI_RIGHT || smp->missingends == SPI_BOTH) && smp->gstarts[0] < intronsize)
5253          srip->fallsoff = TRUE;
5254    }
5255    smp->mRNAcoverage = (100*len)/len1;
5256    smp->mismatch = (FloatHi)(100*mis)/len;
5257    amaip->saps = smp->saps;
5258    for (i=0; i<amaip->numsaps-1; i++)
5259    {
5260       amaip->saps[i]->next = amaip->saps[i+1];
5261       amaip->saps[i+1]->next = NULL;
5262    }
5263    sap->segs = (Pointer)(amaip->saps[0]);
5264    smp->parent = sap;
5265    return smp;
5266 }
5267 
5268 /***************************************************************************
5269 *
5270 *  SPI_GetExonInfo creates a profile of the indicated exon, then
5271 *  walks through the profile structure to find mismatches and gaps.
5272 *  The gap positions are recorded (if the alignment is going to be
5273 *  printed -- otherwise the number of gaps is recorded, but not the
5274 *  positions) in the SPI_TinyInfo structures. If the alignment is going
5275 *  to be printed, a SPI_ExonProf structure is created to hold all the
5276 *  mismatch positions, and the exon profile is put into the smp. Regardless,
5277 *  the number of gaps and the number of mismatches are recorded. The length
5278 *  of the exon is returned.
5279 *
5280 ***************************************************************************/
static Int4 SPI_GetExonInfo(SPI_mRNAPtr smp, Int4 n, Int4Ptr start, Int4Ptr stop, Int4Ptr mis, SPI_OptionsPtr spot)
{
   Int4             alnpos;       /* running column index across all profiles */
   Int4             col;
   Int4             colsum;       /* sum of the frequencies in one column */
   Int4             exonlen;
   Int4             k;
   Boolean          keep_positions;
   SPI_ExonProfPtr  newprof;
   SPI_TinyInfoPtr  node;
   SPI_TinyInfoPtr  node_first;
   SPI_TinyInfoPtr  node_last;
   SPI_TinyInfoPtr  node_next;
   Int4             nmismatch;
   Int4             nsaved;       /* number of nodes in the saved-position list */
   Int4             nuc;
   ACTProfilePtr    prof;
   ACTProfilePtr    prof_list;
   Boolean          seen;
   SPI_ExonProfPtr  tail;

   /* record the mRNA (row 2) and genomic (row 1) extents of the nth alignment */
   AlnMgr2GetNthSeqRangeInSA(smp->saps[n], 2, start, stop);
   smp->mstarts[n] = *start;
   smp->mstops[n] = *stop;
   AlnMgr2GetNthSeqRangeInSA(smp->saps[n], 1, &smp->gstarts[n], &smp->gstops[n]);
   prof_list = SPI_MakeProfileFromSA(smp->saps[n]);
   smp->exongaps[n] = 0;
   nmismatch = 0;
   nsaved = 0;
   alnpos = 0;
   node_first = NULL;
   node_last = NULL;
   /* positions are only saved when spot->printaln != 1 (the case the */
   /* original comment describes as "going to be printed")            */
   keep_positions = (Boolean)(spot->printaln != 1);

   for (prof = prof_list; prof != NULL; prof = prof->next)
   {
      for (col = 0; col < prof->len; col++)
      {
         seen = FALSE;
         colsum = 0;
         for (nuc = 0; nuc < ACT_NUCLEN; nuc++)
         {
            colsum += prof->freq[nuc][col];
            /* a column counts as a mismatch (once) when some residue */
            /* has a frequency of exactly 1                           */
            if (seen || prof->freq[nuc][col] != 1)
               continue;
            seen = TRUE;
            nmismatch += 1;
            if (keep_positions)
            {
               node = (SPI_TinyInfoPtr)MemNew(sizeof(SPI_TinyInfo));
               node->n = alnpos;
               if (node_first == NULL)
                  node_first = node;
               else
                  node_last->next = node;
               node_last = node;
               nsaved++;
            }
         }
         /* any column whose frequencies do not sum to 2 is counted as a gap */
         if (colsum != 2)
            smp->exongaps[n]++;
         alnpos++;
      }
   }
   SPI_ProfileSetFree(prof_list);

   /* percent identity over the mRNA extent; 0 when the extent is empty */
   exonlen = *stop - *start + 1;
   if (exonlen > 0)
      smp->exonid[n] = (FloatHi)(100) - ((FloatHi)(100*nmismatch))/(FloatHi)(exonlen);
   else
      smp->exonid[n] = 0;
   /* never report more than 99.9 when at least one mismatch was seen */
   if (nmismatch > 0 && smp->exonid[n] > 99.9)
      smp->exonid[n] = 99.9;
   *mis += nmismatch;

   /* turn the saved positions into an SPI_ExonProf and append it to smp->epp */
   if (keep_positions && node_first != NULL)
   {
      newprof = (SPI_ExonProfPtr)MemNew(sizeof(SPI_ExonProf));
      newprof->exonnum = n+1;
      newprof->nummismatches = nsaved;
      newprof->mismatches = (Int4Ptr)MemNew(nsaved*sizeof(Int4));
      k = 0;
      for (node = node_first; node != NULL; node = node_next)
      {
         node_next = node->next;
         newprof->mismatches[k++] = node->n;
         MemFree(node);
      }
      if (smp->epp == NULL)
         smp->epp = newprof;
      else
      {
         for (tail = smp->epp; tail->next != NULL; tail = tail->next)
            continue;
         tail->next = newprof;
      }
   }
   return (*stop - *start + 1);
}
5388 
5389 /***************************************************************************
5390 *
5391 *  SPI_AdjustOverlaps takes two adjacent alignments and adjusts their
5392 *  boundaries so that they abut each other and so that they are adjacent
5393 *  to good splice donor and acceptor sites. The function fills in the
5394 *  appropriate splicedon and spliceacc fields in the smp structure to
5395 *  indicate whether each alignment has a splice donor or acceptor site.
5396 *  Since the splice matrices are organism-specific, this function
5397 *  needs the spot->organism information.
5398 *  SPI_AdjustOverlaps first gets the donor and acceptor splice information
5399 *  (length of consensus sequence and the boundary, which is the offset into
5400 *  the consensus sequence for the intron-exon boundary). Then a window
5401 *  around the right end of sap1 is examined for donor sites. The top
5402 *  SPI_NUMSITES donor sites are examined more carefully: each is evaluated
5403 *  as to whether it has a good acceptor site in sap2, how much it affects
5404 *  the boundaries of sap1 and sap2, and how good its donor site score is.
5405 *  The donor site with the best acceptor site, that affects the alignments
5406 *  the least and has the best donor score, is the site that is chosen. If
5407 *  no good site is found, sap1 is unchanged and sap2 is truncated or
5408 *  extended as needed. If a good site is found, sap1 and sap2 are both
5409 *  truncated or extended as needed.
5410 *
5411 *  For the plus strand models (mRNA and genomic sequence in same orientation):
5412 *
5413 *  start1--------------------stop1          start2----------------stop2
5414 *                   |                   |
5415 *                   <- look from here ->
5416 *                     to here for a donor splice site -- the range is
5417 *  stop1 - ovl - fluff - boundary to stop1 + spllen + fluff.
5418 *  When a sequence matches the consensus splice sequence, its offset
5419 *  into the range is recorded. The new stop position is then
5420 *  stop1 - ovl - fluff - boundary + offset + (spllen - boundary).
5421 *
5422 *
5423 *  For the minus strand models:
5424 *
5425 *  start1--------------------stop1          start2-----------------stop2
5426 *  mstop1--------------------mstart1        mstop2---------------- mstart2
5427 *                                   |                |
5428 *                                   <- search this ->
5429 *                                    interval on the minus strand of the
5430 *  genomic sequence for a donor splice site -- the range is
5431 *  start2 - fluff - spllen to start2 + ovl + fluff + boundary. When
5432 *  the offset of a match is computed, the new mRNA stop position is
5433 *  stop2 - ovl - boundary - fluff + offset + (spllen - boundary).
5434 *
5435 *
5436 *  xxxGTxxxxx <- vertebrate splice donor consensus, GT are the first two
5437 *  residues of the intron.  Here spllen is 10 and boundary is 8 -- when
5438 *  counting from the 3' end, the 8th residue is the first residue of the
5439 *  exon.
5440 *
5441 ***************************************************************************/
static void SPI_AdjustOverlaps(SeqAlignPtr sap1, SeqAlignPtr sap2, Int4 n, SPI_mRNAPtr smp, SPI_OptionsPtr spot)
{
   Boolean     both;
   Int4        boundary;
   Int4        boundary_a;
   BioseqPtr   bsp;
   Uint1Ptr    buf;
   Uint1Ptr    buf2;
   Uint1Ptr    buf3;
   Int4        c;
   Int4        f;
   Int4        fluff;
   Int4        gstart;
   Int4        i;
   FloatHi     maxsc = 0;
   Int4        offset;
   Int4        ovl;
   Int4        pos;
   Uint1       res;
   FloatHi     score;
   SeqIdPtr    sip;
   SPI_Splice  splarray[SPI_NUMSITES];
   Int4        spllen;
   Int4        spllen_a;
   SeqPortPtr  spp;
   Int4        spp_start;
   Int4        spp_end;
   Int4        start;
   Int4        start1;
   Int4        start2;
   Int4        stop1;
   Int4        stop2;
   Uint1       strand;
   Int4        tmp;

   strand = AlnMgr2GetNthStrand(sap1, 2);
   AlnMgr2GetNthSeqRangeInSA(sap1, 1, &start1, &stop1);
   AlnMgr2GetNthSeqRangeInSA(sap2, 1, &start2, &stop2);
   /* sip and the lock on bsp are acquired here; every path that leaves the */
   /* splice-site search below must release both                            */
   sip = AlnMgr2GetNthSeqIdPtr(sap1, 1);
   bsp = BioseqLockById(sip);
   if (bsp == NULL) /* cannot read the genomic sequence -> leave the alignments alone */
   {
      SeqIdFree(sip);
      return;
   }
   if (strand == Seq_strand_minus)
   {
      gstart = stop1;
      start = start2;
   } else
   {
      gstart = start2;
      start = stop1;
   }
   /* retrieve the organism-specific donor and acceptor information */
   SPI_GetDonorSpliceInfo(spot->organism, &spllen, &boundary, spot);
   SPI_GetAcceptorSpliceInfo(spot->organism, &spllen_a, &boundary_a, spot);
   /* get the overlap between the alignments */
   ovl = spi_get_overlap(sap1, sap2);
   if (-ovl > 2*SPI_TEENYEXON) /* too far apart to adjust; release resources before leaving */
   {
      SeqIdFree(sip);
      BioseqUnlock(bsp);
      return;
   }
   if (ovl < 0 && -ovl <= 2*SPI_TEENYEXON) /* extend alignments until they do overlap, to get the splice site right */
   {
      SPI_ExtendAlnAlgDumb(sap2, -ovl+2, SPI_LEFT, strand);
      SPI_ExtendAlnAlgDumb(sap1, -ovl+2, SPI_RIGHT, strand);
      AlnMgr2GetNthSeqRangeInSA(sap1, 1, &start1, &stop1);
      AlnMgr2GetNthSeqRangeInSA(sap2, 1, &start2, &stop2);
      if (strand == Seq_strand_minus)
      {
         gstart = stop1;
         start = start2;
      } else
      {
         gstart = start2;
         start = stop1;
      }
      ovl = -ovl;
   }
   ovl = MIN(abs(ovl), abs(start2-stop1));
   if (spot->interspecies == TRUE)
      fluff = SPI_FLUFF;
   else
      fluff = 0;
   if (ovl != 0)
   {
      /* open a seqport in a window around the end of donor sap, and look for donor sites */
      if (strand != Seq_strand_minus)
      {
         spp_start = start - ovl - fluff - (spllen - boundary);
         spp_end = start + spllen + ovl + fluff;
         if (start-ovl-fluff-spllen+boundary < 0)
            spp_start = 0;
         if (start+spllen+fluff+ovl > bsp->length-1)
            spp_end = bsp->length-1;
         spp = SeqPortNew(bsp, spp_start, spp_end, strand, Seq_code_ncbi4na);
      } else
      {
         spp_start = start - fluff - spllen;
         spp_end = start + ovl + fluff + spllen - boundary;
         if (spp_start < 0)
            spp_start = 0;
         if (spp_end > bsp->length-1)
            spp_end = bsp->length - 1;
         spp = SeqPortNew(bsp, spp_start, spp_end, strand, Seq_code_ncbi4na);
      }
      i = 0;
      buf = (Uint1Ptr)MemNew((2*fluff+ovl+spllen+2)*sizeof(Uint1));
      buf2 = (Uint1Ptr)MemNew((2*fluff+ovl+spllen+2)*sizeof(Uint1));
      SeqPortRead(spp, buf2, 2*fluff+ovl+spllen+2);
      for (f=0; f<SPI_NUMSITES; f++)
      {
         splarray[f].i = 0;
         splarray[f].score = -2;
      }
      /* recode ncbi4na residues (1,2,4,8) as 0..3; anything else becomes 4 */
      while (((res = buf2[i]) != SEQPORT_EOF) && i<(2*fluff+ovl+spllen+1))
      {
         if (res == 1)
            buf[i] = 0;
         else if (res == 2)
            buf[i] = 1;
         else if (res == 4)
            buf[i] = 2;
         else if (res == 8)
            buf[i] = 3;
         else
            buf[i] = 4;
         i++;
      }
      SeqPortFree(spp);
      MemFree(buf2);
      /* score every window position as a donor site, keeping the SPI_NUMSITES */
      /* best by replacing the currently lowest-scoring slot                   */
      for (i=0; i<2*fluff+ovl+1; i++)
      {
         if (spot->dsplicejunc > 0)
            SPI_is_donor_user(buf+i, spllen, &score, spot);
         else
            SPI_is_donor(buf+i, spllen, &score, spot->organism);
         c = 0;
         if (score > 0.000001)
         {
            for (f=0; f<SPI_NUMSITES; f++)
            {
               if (f == 0)
                  maxsc = splarray[f].score;
               else if (splarray[f].score < maxsc)
               {
                  maxsc = splarray[f].score;
                  c = f;
               }
            }
            if (score > splarray[c].score)
            {
               splarray[c].score = score;
               splarray[c].i = i;
            }
         }
      }
      AlnMgr2GetNthSeqRangeInSA(sap1, 2, &start1, &stop1);
      AlnMgr2GetNthSeqRangeInSA(sap2, 2, &start2, &stop2);
      maxsc = 0;
      /* for the SPI_NUMSITES best donor sites, get the corresponding acceptor */
      /* site score and record how much each alignment would be changed if     */
      /* the alignments were truncated/extended to this site                   */
      /* pos is the coordinate of the last residue of the donor exon           */
      for (f=0; f<SPI_NUMSITES; f++)
      {
         if (strand == Seq_strand_minus)
         {
            pos = stop2 - ovl - fluff + splarray[f].i;
            if (stop2 - pos < 0)
               splarray[f].diff = pos - stop2;
            else
               splarray[f].diff = stop2 - pos;
            if (start1 - pos < 0)
            {
               if (pos - start1 < splarray[f].diff)
                  splarray[f].diff = pos - start1;
            } else
            {
               if (start1 - pos < splarray[f].diff)
                  splarray[f].diff = start1 - pos;
            }
            tmp = gstart + start1 - (pos + 1);
            SPI_GetAcceptorScore(bsp, tmp - (spllen_a - boundary_a), tmp + boundary_a, strand, &splarray[f].score2, spllen_a, spot);
         } else
         {
            pos = stop1 - ovl - fluff + splarray[f].i;
            if (stop1 - pos < 0)
               splarray[f].diff = pos - stop1;
            else
               splarray[f].diff = stop1 - pos;
            if (start2 - pos < 0)
            {
               if (pos - start2 < splarray[f].diff)
                  splarray[f].diff = pos - start2;
            } else
            {
               if (start2 - pos < splarray[f].diff)
                  splarray[f].diff = start2 - pos;
            }
            tmp = gstart + start2 - (pos + 1);
            SPI_GetAcceptorScore(bsp, tmp - boundary_a + (spllen_a - boundary_a), tmp, strand, &splarray[f].score2, spllen_a, spot);
         }
         if (splarray[f].diff > maxsc)
            maxsc = splarray[f].diff;
      }
      i = 0;
      both = FALSE;
      for (f=0; f<SPI_NUMSITES && !both; f++)
      {
         if (splarray[f].score > 0.000001 && splarray[f].score2 > 0.00000002)
            both = TRUE;
      }
      /* look for the position that has a good acceptor (if one of them does have both */
      /* a good donor and acceptor) and that changes the alignments the least          */
      /* with the highest donor score                                                  */
      offset = 0;
      for (f=0; f<SPI_NUMSITES; f++)
      {
         if ((both && splarray[f].score2 > 0.0000000002) || both == FALSE)
         {
            if(splarray[f].score >= splarray[i].score)
            {
               maxsc = splarray[f].diff;
               offset = splarray[f].i;
               i = f;
            }
         }
      }
      if (strand == Seq_strand_minus)
         pos = stop2 - ovl - fluff + offset;
      else
         pos = stop1 - ovl - fluff + offset;
      MemFree(buf);
      if (splarray[i].score >= 0.00001 && pos > 0)
      {
         if (strand == Seq_strand_minus)
            smp->splicedon[n+1] = 1;
         else
            smp->splicedon[n] = 1;
      } else /* if don't find a good site, don't change the alignment */
         offset = ovl + fluff;
      if (strand == Seq_strand_minus)
         pos = stop2 - ovl - fluff + offset;
      else
         pos = stop1 - ovl - fluff + offset;
      if (splarray[i].score2 > 0.0000000002)
      {
         if (strand == Seq_strand_minus)
            smp->spliceacc[n] = 1;
         else
            smp->spliceacc[n+1] = 1;
      }
   } else
   {
      /* the alignments already abut: keep the current boundary and only */
      /* record whether it sits on a donor and/or an acceptor site       */
      AlnMgr2GetNthSeqRangeInSA(sap1, 2, &start1, &stop1);
      AlnMgr2GetNthSeqRangeInSA(sap2, 2, &start2, &stop2);
      if (strand == Seq_strand_minus)
         pos = stop2;
      else
         pos = stop1;
      if (strand != Seq_strand_minus)
      {
         spp_start = start - fluff - (spllen - boundary);
         spp_end = start + spllen + fluff;
         if (start-ovl-fluff-spllen+boundary < 0)
            spp_start = 0;
         if (start+spllen+fluff > bsp->length-1)
            spp_end = bsp->length-1;
         spp = SeqPortNew(bsp, spp_start, spp_end, strand, Seq_code_ncbi4na);
      } else
      {
         spp_start = start - fluff - spllen;
         spp_end = start + fluff + spllen - boundary;
         if (spp_start < 0)
            spp_start = 0;
         if (spp_end > bsp->length-1)
            spp_end = bsp->length - 1;
         spp = SeqPortNew(bsp, spp_start, spp_end, strand, Seq_code_ncbi4na);
      }
      i = 0;
      buf = (Uint1Ptr)MemNew((spp_end-spp_start+1)*sizeof(Uint1));
      buf3 = (Uint1Ptr)MemNew((spp_end-spp_start+1)*sizeof(Uint1));
      SeqPortRead(spp, buf3, spp_end-spp_start+1);
      /* recode ncbi4na residues (1,2,4,8) as 0..3; anything else becomes 4 */
      while (i<(spp_end-spp_start+1) && ((res = buf3[i]) != SEQPORT_EOF))
      {
         if (res == 1)
            buf[i] = 0;
         else if (res == 2)
            buf[i] = 1;
         else if (res == 4)
            buf[i] = 2;
         else if (res == 8)
            buf[i] = 3;
         else
            buf[i] = 4;
         i++;
      }
      SeqPortFree(spp);
      MemFree(buf3);
      if (spot->dsplicejunc > 0)
         SPI_is_donor_user(buf, spllen, &score, spot);
      else
         SPI_is_donor(buf, spllen, &score, spot->organism);
      if (score >= 0.00001)
      {
         if (strand == Seq_strand_minus)
            smp->splicedon[n+1] = 1;
         else
            smp->splicedon[n] = 1;
      }
      if (strand == Seq_strand_minus)
      {
         tmp = gstart + start1 - (pos + 1);
         SPI_GetAcceptorScore(bsp, tmp - (spllen_a - boundary_a), tmp + boundary_a, strand, &score, spllen_a, spot);
      } else
      {
         tmp = gstart + start2 - (pos + 1);
         SPI_GetAcceptorScore(bsp, tmp - boundary_a + (spllen_a - boundary_a), tmp, strand, &score, spllen_a, spot);
      }
      if (score > 0.0000000002)
      {
         if (strand == Seq_strand_minus)
            smp->spliceacc[n] = 1;
         else
            smp->spliceacc[n+1] = 1;
      }
      MemFree(buf);
   }
   /* the genomic sequence is not touched below this point: release the id */
   /* and the bioseq lock once, for both branches above                    */
   SeqIdFree(sip);
   BioseqUnlock(bsp);
   /* extend or truncate sap1 and sap2 to abut each other exactly and to */
   /* be adjacent to the chosen splice site                              */
   if (strand == Seq_strand_minus)
   {
      if (pos < stop2)
      {
         if (AlnMgr2TruncateSeqAlign(sap2, start2, pos, 2))
         {
            sap2->next->next = NULL;
            SeqAlignFree(sap2->next);
            sap2->next = NULL;
         }
      } else if (pos > stop2)
         SPI_AddToAln(sap2, MIN(pos-stop2, abs(gstart-start)), SPI_LEFT, strand);
      if (start1 == pos + 1)
         return;
      else if (start1 < pos + 1)
      {
         if (AlnMgr2TruncateSeqAlign(sap1, pos+1, stop1, 2))
         {
            sap1->next->next = NULL;
            SeqAlignFree(sap1->next);
            sap1->next = NULL;
         }
         return;
      } else if (start1 > pos + 1)
      {
         SPI_AddToAln(sap1, MIN(start1-pos-1, abs(gstart-start)), SPI_RIGHT, strand);
         return;
      }
   } else
   {
      if (pos < stop1)
      {
         if (AlnMgr2TruncateSeqAlign(sap1, start1, pos, 2))
         {
            sap1->next->next = NULL;
            SeqAlignFree(sap1->next);
            sap1->next = NULL;
         }
      } else if (pos > stop1)
         SPI_AddToAln(sap1, MIN(pos - stop1, abs(gstart-start)), SPI_RIGHT, strand);
      if (start2 == pos + 1)
         return;
      else if (start2 < pos + 1)
      {
         if (AlnMgr2TruncateSeqAlign(sap2, pos+1, stop2, 2))
         {
            sap2->next->next = NULL;
            SeqAlignFree(sap2->next);
            sap2->next = NULL;
         }
         return;
      } else if (start2 > pos + 1)
      {
         SPI_AddToAln(sap2, MIN(start2-pos-1, abs(gstart-start)), SPI_LEFT, strand);
         return;
      }
   }
}
5828 
5829 /***************************************************************************
5830 *
5831 *  SPI_RemoveTeenyAlns removes all alignments in a set that are less
5832 *  than len in length.
5833 *
5834 ***************************************************************************/
static void SPI_RemoveTeenyAlns(SeqAlignPtr sap, Int4 len)
{
   SeqAlignPtr       child;
   Int4              childlen;
   AMAlignIndex2Ptr  index;
   Int4              k;
   SeqAlignPtr       kept_first;
   SeqAlignPtr       kept_last;

   /* only an indexed parent alignment can be filtered */
   if (sap == NULL)
      return;
   if (sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
      return;

   index = (AMAlignIndex2Ptr)(sap->saip);
   kept_first = NULL;
   kept_last = NULL;
   for (k = 0; k < index->numsaps; k++)
   {
      /* detach each child so it can be relinked or freed on its own */
      child = index->saps[k];
      child->next = NULL;
      childlen = AlnMgr2GetAlnLength(child, FALSE);
      if (childlen < len)
      {
         SeqAlignFree(child);
         continue;
      }
      if (kept_first == NULL)
         kept_first = child;
      else
         kept_last->next = child;
      kept_last = child;
   }

   /* hang the survivors off the parent and rebuild its index */
   sap->segs = (Pointer)(kept_first);
   AMAlignIndex2Free2(index);
   sap->saip = NULL;
   if (sap->segs != NULL)
      AlnMgr2IndexLite(sap);
}
5868 
/* Rebuilds sap's dense-seg with one extra two-row segment of length ovl  */
/* on the side given by which_side (SPI_LEFT or SPI_RIGHT), then reindexes */
/* the alignment. Row 1 is written as plus strand, row 2 as "strand".     */
static void SPI_ExtendAlnAlgDumb(SeqAlignPtr sap, Int4 ovl, Int4 which_side, Uint1 strand)
{
   DenseSegPtr  ext_dsp;
   Int4         from1;
   Int4         from2;
   Int4         last;
   DenseSegPtr  old_dsp;
   Int4         seg;
   Int4         shift;   /* 1 when a new segment is placed in front, else 0 */
   Int4         to1;
   Int4         to2;

   old_dsp = (DenseSegPtr)(sap->segs);
   ext_dsp = DenseSegNew();
   ext_dsp->dim = 2;
   ext_dsp->numseg = old_dsp->numseg+1;
   ext_dsp->starts = (Int4Ptr)MemNew(2*ext_dsp->numseg*sizeof(Int4));
   ext_dsp->strands = (Uint1Ptr)MemNew(2*ext_dsp->numseg*sizeof(Uint1));
   ext_dsp->lens = (Int4Ptr)MemNew(ext_dsp->numseg*sizeof(Int4));
   /* the ids move to the new dense-seg, so the old one must not free them */
   ext_dsp->ids = old_dsp->ids;
   old_dsp->ids = NULL;
   AlnMgr2GetNthSeqRangeInSA(sap, 1, &from1, &to1);
   AlnMgr2GetNthSeqRangeInSA(sap, 2, &from2, &to2);

   shift = 0;
   if (which_side == SPI_LEFT)
   {
      ext_dsp->starts[0] = from1-ovl;
      ext_dsp->starts[1] = (strand == Seq_strand_minus) ? to2+1 : from2-ovl;
      ext_dsp->strands[0] = Seq_strand_plus;
      ext_dsp->strands[1] = strand;
      ext_dsp->lens[0] = ovl;
      shift = 1;
   }

   /* copy the existing segments, moved over by one if a segment was prepended */
   for (seg = 0; seg < old_dsp->numseg; seg++)
   {
      ext_dsp->starts[2*(seg+shift)] = old_dsp->starts[2*seg];
      ext_dsp->starts[2*(seg+shift)+1] = old_dsp->starts[2*seg+1];
      ext_dsp->strands[2*(seg+shift)] = Seq_strand_plus;
      ext_dsp->strands[2*(seg+shift)+1] = strand;
      ext_dsp->lens[seg+shift] = old_dsp->lens[seg];
   }

   if (which_side == SPI_RIGHT)
   {
      last = ext_dsp->numseg-1;
      ext_dsp->starts[2*last] = to1+1;
      ext_dsp->starts[2*last+1] = (strand == Seq_strand_minus) ? from2-ovl : to2+1;
      ext_dsp->strands[2*last] = Seq_strand_plus;
      ext_dsp->strands[2*last+1] = strand;
      ext_dsp->lens[last] = ovl;
   }

   /* swap in the new dense-seg and rebuild the index */
   DenseSegFree(old_dsp);
   sap->segs = (Pointer)ext_dsp;
   SAIndex2Free2(sap->saip);
   sap->saip = NULL;
   AlnMgr2IndexSingleChildSeqAlign(sap);
}
5929 
5930 
5931 /***************************************************************************
5932 *
5933 *  SPI_GetAcceptorScore evaluates a given position in a given bioseq
5934 *  for an acceptor splice site.
5935 *
5936 ***************************************************************************/
static void SPI_GetAcceptorScore(BioseqPtr bsp, Int4 pos1, Int4 pos2, Uint1 strand, FloatHiPtr score, Int4 spllen, SPI_OptionsPtr spot)
{
   Uint1Ptr    buf;    /* residues recoded to 0..4, handed to the scorer */
   Uint1Ptr    raw;    /* ncbi4na residues as read from the seqport      */
   Int4        i;
   Uint1       res;
   SeqPortPtr  spp;

   /* clamp the requested window to the bioseq */
   if (pos1 < 0)
      pos1 = 0;
   if (pos2 > bsp->length-1)
      pos2 = bsp->length-1;
   spp = SeqPortNew (bsp, pos1, pos2, strand, Seq_code_ncbi4na);
   i = 0;
   buf = (Uint1Ptr)MemNew((spllen+2)*sizeof(Uint1));
   /* The read below asks for spllen+2 residues, so the raw buffer is sized */
   /* from spllen instead of being a fixed 100-byte stack array (which      */
   /* would overflow for spllen > 98).                                      */
   raw = (Uint1Ptr)MemNew((spllen+2)*sizeof(Uint1));
   SeqPortRead(spp, raw, spllen+2);
   /* recode ncbi4na residues (1,2,4,8) as 0..3; anything else becomes 4 */
   while (i<(pos2-pos1+1) && ((res = raw[i]) != SEQPORT_EOF) && i<spllen+1)
   {
      if (res == 1)
         buf[i] = 0;
      else if (res == 2)
         buf[i] = 1;
      else if (res == 4)
         buf[i] = 2;
      else if (res == 8)
         buf[i] = 3;
      else
         buf[i] = 4;
      i++;
   }
   SeqPortFree(spp);
   MemFree(raw);
   /* use the user-supplied acceptor matrix when one is configured */
   if (spot->asplicejunc > 0)
      SPI_is_acceptor_user(buf, spllen, score, spot);
   else
      SPI_is_acceptor (buf, spllen, score, spot->organism);
   MemFree(buf);
}
5974 
5975 /***************************************************************************
5976 *
5977 *  spi_get_overlap returns the amount of overlap (on the second, or mRNA
5978 *  sequence) between two given alignments. A negative value means no
5979 *  overlap.
5980 *
5981 ***************************************************************************/
spi_get_overlap(SeqAlignPtr sap1,SeqAlignPtr sap2)5982 static Int4 spi_get_overlap (SeqAlignPtr sap1, SeqAlignPtr sap2)
5983 {
5984    Int4   overlap;
5985    Int4   start1;
5986    Int4   stop1;
5987    Int4   start2;
5988    Int4   stop2;
5989    Uint1  strand;
5990 
5991    strand = AlnMgr2GetNthStrand (sap1, 2);
5992    AlnMgr2GetNthSeqRangeInSA (sap1, 2, &start1, &stop1);
5993    AlnMgr2GetNthSeqRangeInSA (sap2, 2, &start2, &stop2);
5994    if (strand == Seq_strand_minus)
5995       overlap = stop2 - start1 + 1;
5996    else
5997       overlap = stop1 - start2 + 1;
5998    return overlap;
5999 }
6000 
6001 /***************************************************************************
6002 *
6003 *  SPI_AddToAln adds the amount "offset" to the specified end of an
6004 *  alignment by adding a segment of length "offset" to both sequences
6005 *  in the alignment. The function assumes that the alignment has two
6006 *  rows, that the first row is on the plus strand, that the second row
6007 *  is on the strand specified, and that adding the amount "offset" will
6008 *  not go past either end of either sequence. This function is used
6009 *  to adjust alignment boundaries to splice sites and to add small
6010 *  pieces onto alignments to make them abut the next adjacent alignment.
6011 *  If the first or last segment (depending on which_end specified) does
6012 *  not have gaps in either row, that segment is simply extended; otherwise,
6013 *  a new segment must be added onto whichever end is to be extended.
6014 *
6015 ***************************************************************************/
static void SPI_AddToAln(SeqAlignPtr sap, Int4 offset, Int2 which_end, Uint1 strand)
{
   DenseSegPtr  dsp;
   Int4Ptr      lens;
   Int4         i;
   Int4         last;
   Int4         shift;   /* 1 when the new segment goes in front, else 0 */
   Int4         start1;
   Int4         start2;
   Int4Ptr      starts;
   Int4         stop1;
   Int4         stop2;
   Uint1Ptr     strands;

   if (sap == NULL || offset == 0)
      return;
   dsp = (DenseSegPtr)(sap->segs);
   if (which_end == SPI_LEFT || which_end == SPI_RIGHT)
   {
      last = dsp->numseg-1;
      if (which_end == SPI_LEFT && dsp->starts[0] != -1 && dsp->starts[1] != -1)
      {
         /* neither sequence is gapped in the first segment: just grow it. */
         /* On the minus strand the row-2 start is left unchanged.         */
         dsp->starts[0] -= offset;
         if (strand != Seq_strand_minus)
            dsp->starts[1] -= offset;
         dsp->lens[0] += offset;
      } else if (which_end == SPI_RIGHT && dsp->starts[2*last] != -1 && dsp->starts[2*last+1] != -1)
      {
         /* neither sequence is gapped in the last segment: just grow it. */
         /* On the minus strand the row-2 start moves down by offset.     */
         dsp->lens[last] += offset;
         if (strand == Seq_strand_minus)
            dsp->starts[2*last+1] -= offset;
      } else /* one of the sequences is gapped -> add a new segment */
      {
         starts = (Int4Ptr)MemNew(2*(dsp->numseg+1)*sizeof(Int4));
         lens = (Int4Ptr)MemNew((dsp->numseg+1)*sizeof(Int4));
         strands = (Uint1Ptr)MemNew(2*(dsp->numseg+1)*sizeof(Uint1));
         AlnMgr2GetNthSeqRangeInSA(sap, 1, &start1, &stop1);
         AlnMgr2GetNthSeqRangeInSA(sap, 2, &start2, &stop2);
         /* starts/strands hold two entries per segment: row 1 at 2*seg, */
         /* row 2 at 2*seg+1 (same layout as SPI_ExtendAlnAlgDumb)       */
         if (which_end == SPI_LEFT)
         {
            shift = 1;
            starts[0] = start1 - offset;
            if (strand == Seq_strand_minus)
               starts[1] = stop2 + 1;
            else
               starts[1] = start2 - offset;
            lens[0] = offset;
            strands[0] = Seq_strand_plus;
            strands[1] = strand;
         } else
         {
            shift = 0;
            starts[2*(dsp->numseg)] = stop1 + 1;
            if (strand == Seq_strand_minus)
               starts[2*(dsp->numseg)+1] = start2 - offset;
            else
               starts[2*(dsp->numseg)+1] = stop2 + 1;
            lens[dsp->numseg] = offset;
            strands[2*(dsp->numseg)] = Seq_strand_plus;
            strands[2*(dsp->numseg)+1] = strand;
         }
         /* copy every existing segment, both rows, moved over by "shift" */
         for (i=0; i<dsp->numseg; i++)
         {
            starts[2*(i+shift)] = dsp->starts[2*i];
            starts[2*(i+shift)+1] = dsp->starts[2*i+1];
            lens[i+shift] = dsp->lens[i];
            if (dsp->strands != NULL)
            {
               strands[2*(i+shift)] = dsp->strands[2*i];
               strands[2*(i+shift)+1] = dsp->strands[2*i+1];
            } else /* no strand array: fall back on the documented assumption */
            {
               strands[2*(i+shift)] = Seq_strand_plus;
               strands[2*(i+shift)+1] = strand;
            }
         }
         dsp->numseg++;
         MemFree(dsp->starts);
         MemFree(dsp->lens);
         MemFree(dsp->strands);
         dsp->starts = starts;
         dsp->lens = lens;
         dsp->strands = strands;
      }
   }
   /* free the old index and reindex the alignment */
   SAIndex2Free2(sap->saip);
   sap->saip = NULL;
   AlnMgr2IndexSingleChildSeqAlign(sap);
}
6111 
6112 /***************************************************************************
6113 *
6114 *  SPI_MergeAlignments takes two dense-seg seqaligns, each with the
6115 *  same two rows, and merges them into a single alignment, with sap1
6116 *  on the left and sap2 on the right. The function does not check to make
6117 *  sure that sap2 belongs after sap1. If sap1 and sap2 are not linearly
6118 *  consistent, the function extends and truncates the alignments as needed.
6119 *
6120 ***************************************************************************/
SPI_MergeAlignments(SeqAlignPtr sap1,SeqAlignPtr sap2)6121 static SeqAlignPtr SPI_MergeAlignments(SeqAlignPtr sap1, SeqAlignPtr sap2)
6122 {
6123    DenseSegPtr  dsp1;
6124    DenseSegPtr  dsp2;
6125    Int4         glen;
6126    Int4         gstart1;
6127    Int4         gstart2;
6128    Int4         gstop1;
6129    Int4         gstop2;
6130    Int4         i;
6131    Int4         j;
6132    Int4Ptr      lens;
6133    Int4         mlen;
6134    Int4         mstart1;
6135    Int4         mstart2;
6136    Int4         mstop1;
6137    Int4         mstop2;
6138    Int4         n;
6139    Int4         offset;
6140    Int4Ptr      starts;
6141    Uint1Ptr     strands;
6142 
6143    AlnMgr2GetNthSeqRangeInSA(sap1, 1, &gstart1, &gstop1);
6144    AlnMgr2GetNthSeqRangeInSA(sap2, 1, &gstart2, &gstop2);
6145    glen = mlen = 0;
6146    if (gstart2 <= gstop1)
6147    {
6148       AlnMgr2TruncateSeqAlign(sap1, gstart1, gstart2-1, 1);
6149       gstop1 = gstart2-1;
6150    }
6151    AlnMgr2GetNthSeqRangeInSA(sap1, 2, &mstart1, &mstop1);
6152    AlnMgr2GetNthSeqRangeInSA(sap2, 2, &mstart2, &mstop2);
6153    if (mstop2 > mstop1)
6154    {
6155       if (mstart2 <= mstop1)
6156          AlnMgr2TruncateSeqAlign(sap2, mstop1+1, mstop2, 2);
6157    } else
6158    {
6159       if (mstart1 <= mstop2)
6160          AlnMgr2TruncateSeqAlign(sap2, mstart2, mstart1-1, 2);
6161    }
6162    AlnMgr2GetNthSeqRangeInSA(sap1, 1, &gstart1, &gstop1);
6163    AlnMgr2GetNthSeqRangeInSA(sap2, 1, &gstart2, &gstop2);
6164    AlnMgr2GetNthSeqRangeInSA(sap1, 2, &mstart1, &mstop1);
6165    AlnMgr2GetNthSeqRangeInSA(sap2, 2, &mstart2, &mstop2);
6166    glen = gstart2 - gstop1 - 1;
6167    if (mstop2 > mstop1)
6168       mlen = mstart2 - mstop1 - 1;
6169    else
6170       mlen = mstart1 - mstop2 - 1;
6171    dsp1 = (DenseSegPtr)(sap1->segs);
6172    dsp2 = (DenseSegPtr)(sap2->segs);
6173    n = dsp1->numseg + dsp2->numseg + 2;
6174    starts = (Int4Ptr)MemNew(2*n*sizeof(Int4));
6175    lens = (Int4Ptr)MemNew(n*sizeof(Int4));
6176    strands = (Uint1Ptr)MemNew(2*n*sizeof(Uint1));
6177    for (i=0; i<2*(dsp1->numseg); i++)
6178    {
6179       starts[i] = dsp1->starts[i];
6180       strands[i] = dsp1->strands[i];
6181    }
6182    for (i=0; i<dsp1->numseg; i++)
6183    {
6184       lens[i] = dsp1->lens[i];
6185    }
6186    j = dsp1->numseg;
6187    offset = 0;
6188    if (glen > 0)
6189    {
6190       starts[2*j] = gstop1+1;
6191       starts[2*j+1] = -1;
6192       lens[j] = glen;
6193       j += 1;
6194       offset++;
6195    }
6196    if (mlen > 0)
6197    {
6198       starts[2*j] = -1;
6199       if (mstop2 > mstop1)
6200          starts[2*j+1] = mstop1+1;
6201       else
6202          starts[2*j+1] = mstop2+1;
6203       lens[j] = mlen;
6204       j += 1;
6205       offset++;
6206    }
6207    j = 2*(dsp1->numseg+offset);
6208    for (i=0; i<2*(dsp2->numseg); i++, j++)
6209    {
6210       starts[j] = dsp2->starts[i];
6211       strands[j] = dsp2->strands[i];
6212    }
6213    j = dsp1->numseg+offset;
6214    for (i=0; i<dsp2->numseg; i++, j++)
6215    {
6216       lens[j] = dsp2->lens[i];
6217    }
6218    MemFree(dsp2->starts);
6219    MemFree(dsp2->strands);
6220    MemFree(dsp2->lens);
6221    dsp2->starts = starts;
6222    dsp2->strands = strands;
6223    dsp2->lens = lens;
6224    dsp2->numseg = j;
6225    SAIndex2Free2(sap2->saip);
6226    sap2->saip = NULL;
6227    AlnMgr2IndexSingleChildSeqAlign(sap2);
6228    return (sap2);
6229 }
6230 
6231 /***************************************************************************
6232 *
6233 *  SPI_flip_sa_list takes the head of a list of seqaligns and switches
6234 *  the first and second row of every alignment (alignments should all have
6235 *  two rows). Then, the indexes are freed and the alignments are reindexed.
6236 *
6237 ***************************************************************************/
SPI_flip_sa_list(SeqAlignPtr sap)6238 NLM_EXTERN void SPI_flip_sa_list (SeqAlignPtr sap)
6239 {
6240    DenseSegPtr  dsp;
6241    Int4         i;
6242    SeqIdPtr     sip;
6243    SeqIdPtr     sip_next;
6244    Int4         tmp_start;
6245    Uint1        tmp_strand;
6246 
6247    if (sap == NULL || sap->segtype != SAS_DENSEG)
6248       return;
6249    while (sap != NULL)
6250    {
6251       dsp = (DenseSegPtr)(sap->segs);
6252       if (dsp->dim == 2) /* skip anything with more than 2 rows */
6253       {
6254          /* first switch the ids */
6255          sip = dsp->ids;
6256          sip_next = sip->next;
6257          sip_next->next = sip;
6258          sip->next = NULL;
6259          dsp->ids = sip_next;
6260          /* then switch the starts and strands */
6261          for (i = 0; i<dsp->numseg; i++)
6262          {
6263             tmp_start = dsp->starts[2*i];
6264             dsp->starts[2*i] = dsp->starts[2*i+1];
6265             dsp->starts[2*i+1] = tmp_start;
6266             tmp_strand = dsp->strands[2*i];
6267             dsp->strands[2*i] = dsp->strands[2*i+1];
6268             dsp->strands[2*i+1] = tmp_strand;
6269          }
6270       }
6271       if (sap->saip != NULL) /* free indexes, reindex */
6272       {
6273          SAIndex2Free2(sap->saip);
6274          sap->saip = NULL;
6275          AlnMgr2IndexSingleChildSeqAlign(sap);
6276       }
6277       sap = sap->next;
6278    }
6279 }
6280 
6281 /***************************************************************************
6282 *
6283 *  SPI_FillInLastmRNAHoles mimics the logic of SPI_ConnectAln; it
6284 *  goes through a set of alignments and fills in any missing pieces.
6285 *  Its arguments include the mRNA and genomic boundaries of the alignment,
6286 *  so that the function knows how far to extend the set of alignments.
6287 *  When a hole is found, SPI_FindBestAlnByDotPlot is called to fill
6288 *  in the gap.
6289 *
6290 ***************************************************************************/
SPI_FillInLastmRNAHoles(SeqAlignPtr sap,SeqIdPtr sip_genomic,SeqIdPtr sip_mrna,Int4 start_g,Int4 stop_g,Int4 start_m,Int4 stop_m,Uint1 strand)6291 static SeqAlignPtr SPI_FillInLastmRNAHoles(SeqAlignPtr sap, SeqIdPtr sip_genomic, SeqIdPtr sip_mrna, Int4 start_g, Int4 stop_g, Int4 start_m, Int4 stop_m, Uint1 strand)
6292 {
6293    AMAlignIndex2Ptr  amaip;
6294    Int4             currstart2;
6295    Int4             end2;
6296    Int4             gap1;
6297    Int4             gap2;
6298    Int4             i;
6299    Boolean          internal;
6300    Int4             prevstop1;
6301    Int4             prevstop2;
6302    SeqAlignPtr      sap_new;
6303    SeqAlignPtr      sap_tmp;
6304    SeqLocPtr        slp1;
6305    SeqLocPtr        slp2;
6306    Int4             start1;
6307    Int4             start2;
6308    Int4             stop1;
6309    Int4             stop2;
6310 
6311    if (sip_genomic == NULL || sip_mrna == NULL)
6312       return NULL;
6313    start1 = stop1 = start2 = stop2 = 0;
6314    if (sap != NULL)
6315    {
6316       amaip = (AMAlignIndex2Ptr)(sap->saip);
6317       HeapSort(amaip->saps, amaip->numsaps, sizeof(SeqAlignPtr), SPI_comp_aln_pos);
6318       prevstop1 = start_g;
6319       if (strand == Seq_strand_minus)
6320          prevstop2 = stop_m;
6321       else
6322          prevstop2 = start_m;
6323       internal = FALSE;
6324       for (i=0; i<amaip->numsaps; i++)
6325       {
6326          AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 1, &start1, &stop1);
6327          AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, &start2, &stop2);
6328          if (strand == Seq_strand_minus)
6329             currstart2 = stop2;
6330          else
6331             currstart2 = start2;
6332          if ((gap2 = spi_isa_gap(currstart2, prevstop2, strand)) >= SPI_TEENYEXON)
6333          {
6334             if ((gap1 = spi_isa_gap(start1, prevstop1, Seq_strand_plus)) >= SPI_TEENYEXON || (prevstop1 == -1))
6335             {
6336                slp1 = SeqLocIntNew(prevstop1+1, start1-1, Seq_strand_plus, sip_genomic);
6337                if (strand != Seq_strand_minus)
6338                   slp2 = SeqLocIntNew(prevstop2+1, currstart2-1, strand, sip_mrna);
6339                else
6340                   slp2 = SeqLocIntNew(currstart2+1, prevstop2-1, strand, sip_mrna);
6341                sap_new = SPI_FindBestAlnByDotPlot(slp1, slp2);
6342                SeqLocFree(slp1);
6343                SeqLocFree(slp2);
6344                sap_tmp = (SeqAlignPtr)(sap->segs);
6345                while (sap_tmp->next != NULL)
6346                {
6347                   sap_tmp = sap_tmp->next;
6348                }
6349                sap_tmp->next = sap_new;
6350             }
6351          }
6352          internal = TRUE;
6353          prevstop1 = stop1;
6354          if (strand == Seq_strand_minus)
6355             prevstop2 = start2;
6356          else
6357             prevstop2 = stop2;
6358       }
6359       if (strand != Seq_strand_minus)
6360          end2 = stop_m;
6361       else
6362       {
6363          end2 = prevstop2-1;
6364          prevstop2 = -1;
6365       }
6366       gap1 = spi_isa_gap(prevstop1, stop_g, Seq_strand_plus);
6367       gap2 = spi_isa_gap(end2, prevstop2, strand);
6368       if (gap1 >= SPI_TEENYEXON && gap2 >= SPI_TEENYEXON)
6369       {
6370          slp1 = SeqLocIntNew(prevstop1+1, stop_g, Seq_strand_plus, sip_genomic);
6371          if (strand == Seq_strand_minus)
6372             slp2 = SeqLocIntNew(end2, prevstop2+1, strand, sip_mrna);
6373          else
6374             slp2 = SeqLocIntNew(prevstop2+1, end2, strand, sip_mrna);
6375          sap_new = SPI_FindBestAlnByDotPlot(slp1, slp2);
6376          SeqLocFree(slp1);
6377          SeqLocFree(slp2);
6378          sap_tmp = (SeqAlignPtr)(sap->segs);
6379          while (sap_tmp->next != NULL)
6380          {
6381             sap_tmp = sap_tmp->next;
6382          }
6383          sap_tmp->next = sap_new;
6384       }
6385       sap_tmp = (SeqAlignPtr)(sap->segs);
6386       i = 0;
6387       while (sap_tmp != NULL)
6388       {
6389          i++;
6390          sap_tmp = sap_tmp->next;
6391       }
6392       amaip->numsaps = i;
6393       MemFree(amaip->saps);
6394       amaip->saps = (SeqAlignPtr PNTR)MemNew(i*sizeof(SeqAlignPtr));
6395       sap_tmp = (SeqAlignPtr)(sap->segs);
6396       i = 0;
6397       while (sap_tmp != NULL)
6398       {
6399          amaip->saps[i] = sap_tmp;
6400          i++;
6401          sap_tmp = sap_tmp->next;
6402       }
6403       if (sap == NULL)
6404          return NULL;
6405       SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON, 1, SPI_LEFT);
6406       sap_tmp = (SeqAlignPtr)(sap->segs);
6407       sap->segs = NULL;
6408       SeqAlignFree(sap);
6409       return sap_tmp;
6410    } else
6411    {
6412       slp1 = SeqLocIntNew(start_g, stop_g, Seq_strand_plus, sip_genomic);
6413       slp2 = SeqLocIntNew(start_m, stop_m, strand, sip_mrna);
6414       sap_tmp = SPI_FindBestAlnByDotPlot(slp1, slp2);
6415       SeqLocFree(slp1);
6416       SeqLocFree(slp2);
6417       return sap_tmp;
6418    }
6419 }
6420 
6421 /***************************************************************************
6422 *
6423 *  SPI_FindBestAlnByDotPlot is spidey's interface to Fasika Aklilu's
6424 *  tree-based string-matching functions. Given two seqlocs, it sends them
6425 *  to Fasika's function and gets a DOTMainDataPtr in return. The
6426 *  DOTMainDataPtr contains all the information for the hits, so by
6427 *  cycling through this data and copying it into dense-seg seqalign
6428 *  structures, SPI_FindBestAlnByDotPlot builds up a set of alignments
6429 *  that specify the relationship between the two seqlocs. These alignments
6430 *  are then pruned to make a consistent, nonoverlapping set.
6431 *
6432 ***************************************************************************/
SPI_FindBestAlnByDotPlot(SeqLocPtr slp1,SeqLocPtr slp2)6433 static SeqAlignPtr SPI_FindBestAlnByDotPlot(SeqLocPtr slp1, SeqLocPtr slp2)
6434 {
6435    DOTDiagPtr      ddp;
6436    DenseSegPtr     dsp;
6437    Int4            i;
6438    DOTMainDataPtr  mip;
6439    SeqAlignPtr     sap;
6440    SeqAlignPtr     sap_head;
6441    SeqAlignPtr     sap_prev;
6442    ScorePtr        scp;
6443    Int4            start1;
6444    Int4            start2;
6445    Uint1           strand;
6446 
6447    BioseqPtr      bsp1 = NULL, bsp2 = NULL;
6448    SeqIdPtr       sidp1 = NULL, sidp2 = NULL;
6449 
6450    /** KSK: this protects spidey from the implicit requirement of DOT_
6451        that *BOTH* seqs are ncbi2na encoded ****/
6452 
6453    if (slp1 != NULL && slp2 != NULL){
6454        sidp1 = SeqLocId(slp1);
6455        sidp2 = SeqLocId(slp2);
6456        if (sidp1 != NULL && sidp2 != NULL){
6457            bsp1 = BioseqFind(sidp1);
6458            bsp2 = BioseqFind(sidp2);
6459            if (bsp1 != NULL && bsp2 != NULL){
6460                if (bsp1->seq_data_type != Seq_code_ncbi2na
6461                    || bsp2->seq_data_type !=  Seq_code_ncbi2na){
6462                    return NULL;
6463                }
6464            }
6465            else {
6466                return NULL;
6467            }
6468        }
6469        else {
6470            return NULL;
6471        }
6472    }
6473    else {
6474        return NULL;
6475    }
6476 
6477 
6478    mip = DOT_CreateAndStorebyLoc (slp1, slp2, SPI_TEENYEXON, 10);
6479    sap = sap_head = sap_prev = NULL;
6480    if (mip == NULL || mip->hitlist == NULL)
6481       return NULL;
6482    i = 0;
6483    ddp = mip->hitlist[i];
6484    start1 = SeqLocStart(slp1);
6485    start2 = SeqLocStart(slp2);
6486    strand = SeqLocStrand(slp2);
6487    /* copy each ddp (a single ungapped alignment) into a one-segment dense-seg alignment */
6488    while (ddp != NULL && i < mip->index)
6489    {
6490       ddp = mip->hitlist[i];
6491       i++;
6492       sap = SeqAlignNew();
6493       dsp = DenseSegNew();
6494       sap->type = SAT_PARTIAL;
6495       sap->segtype = SAS_DENSEG;
6496       sap->dim = 2;
6497       dsp->dim = 2;
6498       dsp->numseg = 1;
6499       dsp->ids = SeqIdDup(SeqLocId(slp1));
6500       dsp->ids->next = SeqIdDup(SeqLocId(slp2));
6501       dsp->strands = (Uint1Ptr)MemNew(2*sizeof(Uint1));
6502       dsp->strands[0] = SeqLocStrand(slp1);
6503       dsp->strands[1] = SeqLocStrand(slp2);
6504       dsp->starts = (Int4Ptr)MemNew(2*sizeof(Int4));
6505       dsp->lens = (Int4Ptr)MemNew(sizeof(Int4));
6506       dsp->starts[0] = ddp->q_start;
6507       if (dsp->strands[1] == Seq_strand_minus)
6508          dsp->starts[1] = ddp->s_start - ddp->length + 1;
6509       else
6510          dsp->starts[1] = ddp->s_start;
6511       if (ddp->length > SeqLocLen(slp2))
6512          dsp->lens[0] = SeqLocLen(slp2);
6513       else
6514          dsp->lens[0] = ddp->length - 1;
6515       scp = ScoreNew();
6516       scp->id = ObjectIdNew();
6517       scp->id->str = StringSave("score");
6518       scp->choice = 1;
6519       scp->value.intvalue = ddp->score;
6520       dsp->scores = scp;
6521       sap->segs = (Pointer)(dsp);
6522       if (sap_head != NULL)
6523       {
6524          sap_prev->next = sap;
6525          sap_prev = sap;
6526       } else
6527          sap_head = sap_prev = sap;
6528    }
6529    if (sap_head == NULL)
6530       return NULL;
6531    AlnMgr2IndexLite(sap_head);
6532    SPI_RemoveInconsistentAlnsFromSet(sap_head, SPI_TEENYEXON, 1, SPI_LEFT);
6533    sap = (SeqAlignPtr)(sap_head->segs);
6534    sap_head->segs = NULL;
6535    SeqAlignFree(sap_head);
6536    MemFree(mip->matrix);
6537    MemFree(mip->qseq);
6538    MemFree(mip->sseq);
6539    MemFree(mip->qname);
6540    MemFree(mip->sname);
6541    i = 0;
6542    while (ddp != NULL && i < mip->index)
6543    {
6544       ddp = mip->hitlist[i];
6545       MemFree(ddp);
6546       i++;
6547    }
6548    MemFree(mip->hitlist);
6549    return sap;
6550 }
6551 
6552 /***************************************************************************
6553 *
6554 *  SPI_comp_aln_pos is the HeapSort callback for SPI_FillInLastmRNAHoles.
6555 *  It compares the genomic intervals covered by two seqaligns, and sorts
6556 *  them according to the 5'-most start position.
6557 *
6558 ***************************************************************************/
SPI_comp_aln_pos(VoidPtr ptr1,VoidPtr ptr2)6559 static int LIBCALLBACK SPI_comp_aln_pos(VoidPtr ptr1, VoidPtr ptr2)
6560 {
6561    SeqAlignPtr  sap1;
6562    SeqAlignPtr  sap2;
6563    Int4         start1;
6564    Int4         start2;
6565    Int4         stop1;
6566    Int4         stop2;
6567 
6568    start1 = start2 = stop1 = stop2 = 0;
6569    if (ptr1 != NULL && ptr2 != NULL)
6570    {
6571       sap1 = *((SeqAlignPtr PNTR) ptr1);
6572       sap2 = *((SeqAlignPtr PNTR) ptr2);
6573       AlnMgr2GetNthSeqRangeInSA(sap1, 1, &start1, &stop1);
6574       AlnMgr2GetNthSeqRangeInSA(sap2, 1, &start2, &stop2);
6575       if (stop1 < start2)
6576          return -1;
6577       else if (stop2 < start1)
6578          return 1;
6579       else if (start1 < start2)
6580          return -1;
6581       else if (start2 < start1)
6582          return 1;
6583       else
6584          return 0;
6585   }
6586   return 0;
6587 }
6588 
SPI_bsinfoFreeList(SPI_bsinfoPtr spi)6589 NLM_EXTERN void SPI_bsinfoFreeList (SPI_bsinfoPtr spi)
6590 {
6591    SPI_bsinfoPtr  spi_next;
6592 
6593    while (spi != NULL)
6594    {
6595       spi_next = spi->next;
6596       spi->next = NULL;
6597       SeqLocSetFree(spi->lcaseloc);
6598       MemFree(spi);
6599       spi = spi_next;
6600    }
6601 }
6602 
/* Frees a single region record: its mRNA information is handed to      */
/* SPI_mRNAFree and the record itself is released. The record is        */
/* unlinked (next is cleared) but records that followed it are not      */
/* freed. A NULL argument is ignored.                                   */
static void SPI_RegionFree (SPI_RegionInfoPtr srip)
{
   if (srip != NULL)
   {
      if (srip->smp != NULL)
         SPI_mRNAFree(srip->smp);
      srip->smp = NULL;
      srip->next = NULL;
      MemFree(srip);
   }
}
6614 
SPI_mRNAFree(SPI_mRNAPtr smp)6615 NLM_EXTERN void SPI_mRNAFree (SPI_mRNAPtr smp)
6616 {
6617    AMAlignIndex2Ptr  amaip;
6618    Int4             i;
6619 
6620    if (smp == NULL)
6621       return;
6622    MemFree(smp->exonid);
6623    MemFree(smp->exongaps);
6624    MemFree(smp->splicedon);
6625    MemFree(smp->spliceacc);
6626    MemFree(smp->mstarts);
6627    MemFree(smp->mstops);
6628    MemFree(smp->gstarts);
6629    MemFree(smp->gstops);
6630    if (smp->saps != NULL)
6631    {
6632       for (i=0; i<smp->numexons; i++)
6633       {
6634          SeqAlignFree(smp->saps[i]);
6635       }
6636    }
6637    if (smp->parent != NULL)
6638    {
6639       smp->parent->segs = NULL;
6640       amaip = (AMAlignIndex2Ptr)(smp->parent->saip);
6641       amaip->saps = NULL;
6642       SeqAlignFree(smp->parent);
6643    }
6644    MemFree(smp->saps);
6645    if (smp->continuous != NULL)
6646       SeqAlignFree(smp->continuous);
6647    if (smp->epp != NULL)
6648       SPI_FreeExonProfList(smp->epp);
6649    if (smp->protein!=NULL)
6650       MemFree(smp->protein);
6651 }
6652 
/* Frees one exon profile together with its mismatches array. */
/* A NULL argument is ignored.                                */
static void SPI_FreeExonProf(SPI_ExonProfPtr epp)
{
   if (epp != NULL)
   {
      MemFree(epp->mismatches);
      MemFree(epp);
   }
}
6660 
/* Frees every exon profile in a linked list, starting at epp. */
static void SPI_FreeExonProfList(SPI_ExonProfPtr epp)
{
   SPI_ExonProfPtr  following;

   for ( ; epp != NULL; epp = following)
   {
      following = epp->next;   /* read the link before the node is freed */
      SPI_FreeExonProf(epp);
   }
}
6672 
SPI_RegionListFree(SPI_RegionInfoPtr srip)6673 NLM_EXTERN void SPI_RegionListFree (SPI_RegionInfoPtr srip)
6674 {
6675    SPI_RegionInfoPtr  srip_tmp;
6676 
6677    if (srip == NULL)
6678       return;
6679    while (srip != NULL)
6680    {
6681       srip_tmp = srip->next;
6682       SPI_RegionFree(srip);
6683       srip = srip_tmp;
6684    }
6685 }
6686 
SPI_OptionsNew(void)6687 NLM_EXTERN SPI_OptionsPtr SPI_OptionsNew(void)
6688 {
6689    SPI_OptionsPtr  spot;
6690 
6691    spot = (SPI_OptionsPtr)MemNew(sizeof(SPI_Options));
6692    spot->firstpasseval = 0.00001;
6693    spot->secpasseval = 0.001;
6694    spot->thirdpasseval = 10;
6695    spot->organism = SPI_VERTEBRATE;
6696    spot->numreturns = 1;
6697    spot->idcutoff = 0;
6698    spot->lencutoff = 0;
6699    spot->interspecies = FALSE;
6700    spot->printaln = FALSE;
6701    spot->printasn = FALSE;
6702    /* if strand set to 'unknown' BlastTwoSequences()
6703       screws-up returned seqalign strand */
6704    spot->strand = Seq_strand_both;
6705    /* lets have defaults for to & from */
6706    spot->to = 0;
6707    spot->from = 0;
6708    spot->bigintron = 0;
6709    spot->bigintron_size = 0;  /* added by KSK*/
6710    spot->repeat_db_file = 0; /* added by KSK */
6711    return spot;
6712 }
6713 
/* Releases an options structure by passing it to MemFree. Only the    */
/* structure itself is released; nothing its fields point to is freed  */
/* here.                                                               */
NLM_EXTERN void SPI_OptionsFree (SPI_OptionsPtr spot)
{
   MemFree(spot);
}
6718 
6719 /***************************************************************************
6720 *
6721 *  SPI_GetDonorSpliceInfo fills in the length of the consensus sequence
6722 *  of the donor splice site for the given organism. The boundary is the
6723 *  location of the exon-intron boundary within the consensus sequence.
6724 *
6725 ***************************************************************************/
SPI_GetDonorSpliceInfo(Int4 org,Int4Ptr spllen,Int4Ptr boundary,SPI_OptionsPtr spot)6726 static void SPI_GetDonorSpliceInfo (Int4 org, Int4Ptr spllen, Int4Ptr boundary, SPI_OptionsPtr spot)
6727 {
6728    Int4               i;
6729    SPI_SpliceInfoPtr  ssp;
6730 
6731    if (spot->dsplicejunc != 0)
6732    {
6733       ssp = spot->dssp_head;
6734       i = 0;
6735       while (ssp != NULL)
6736       {
6737          i++;
6738          ssp = ssp->next;
6739       }
6740       *spllen = i;
6741       /** file should supply column position of last
6742           base of preceding exon **/
6743       *boundary = *spllen-spot->dsplicejunc+1;
6744       return;
6745    }
6746    if (org == SPI_VERTEBRATE)
6747    {
6748       *spllen = 10;
6749       *boundary = 8;
6750    } else if (org == SPI_FLY)
6751    {
6752       *spllen = 15;
6753       *boundary = 11;
6754    } else if (org == SPI_PLANT)
6755    {
6756       *spllen = 9;
6757       *boundary = 7;
6758    } else if (org == SPI_CELEGANS)
6759    {
6760       *spllen = 15;
6761       *boundary = 11;
6762    }
6763    else if (org == SPI_DICTY){
6764        *spllen = 8;
6765        *boundary = 7;
6766    }
6767 }
6768 
6769 /***************************************************************************
6770 *
6771 *  SPI_is_donor is a general interface to the organism-specific donor
6772 *  splice site evaluation functions. It simply passes on the sequence,
6773 *  sequence length, and score pointer to the appropriate organism-
6774 *  specific function.
6775 *  The organism-specific functions all work exactly the same way, but have
6776 *  different splice matrices. They evaluate P(Site|Sequence), which is:
6777 *
6778 *    P(Site|Sequence) = P(Sequence|Site)*P(Site)/P(Sequence)
6779 *
6780 *  Since P(Site) is constant (and unknown), it is ignored; only
6781 *   P(Sequence|Site)/P(Sequence) is calculated, and these values are
6782 *  compared to each other. P(Sequence|Site) is calculated by multiplying
6783 *  the values in the splice site frequency matrix according to the
6784 *  sequence specified. P(Sequence) is the probability of this specific
6785 *  sequence, using the A, T, G, and C frequences specified in the sequence.
6786 *
6787 *  N.B. Ken Katz changed this so that they generate log-odd scores:
6788 *  log[P(X)/F(X)] + log[P(X)/F(X)]....but then generate the antilog
6789 *  since there are too many places in the code where the expected value
6790 *  is the antilog.
6791 *
6792 ***************************************************************************/
SPI_is_donor(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score,Int4 org)6793 NLM_EXTERN void SPI_is_donor (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, Int4 org)
6794 {
6795     if (org == SPI_VERTEBRATE){
6796         SPI_is_donor_vert(sequence, seqlen, score);
6797     }
6798     else if (org == SPI_FLY){
6799         SPI_is_donor_fly(sequence, seqlen, score);
6800     }
6801     else if (org == SPI_PLANT){
6802         SPI_is_donor_plant(sequence, seqlen, score);
6803     }
6804     else if (org == SPI_CELEGANS){
6805         SPI_is_donor_cele(sequence, seqlen, score);
6806     }
6807     else if (org == SPI_DICTY){
6808          SPI_is_donor_dicty(sequence, seqlen, score);
6809     }
6810 }
6811 
/* Scores a candidate donor site against the user-supplied splice       */
/* matrix held in spot->dssp_head (one list entry per position, with    */
/* fields a, c, g and t). For every position whose base is 0..3 and     */
/* whose matrix value is positive, log10(matrix value / frequency of    */
/* that base within the sequence) is added up; *score receives 10 to    */
/* the power of that sum. Base codes of 4 or more are not counted and   */
/* not scored. Scoring stops at the end of the sequence or of the       */
/* matrix list, whichever comes first.                                  */
/* Nothing is written if sequence or score is NULL; *score is 0 if      */
/* spot is NULL.                                                        */
static void SPI_is_donor_user(Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, SPI_OptionsPtr spot)
{
   Int4               acgt[4] = {0, 0, 0, 0};
   FloatHi            background;
   Uint1              base;
   Int4               j;
   FloatHi            prob_seqgsite = 0;
   SPI_SpliceInfoPtr  ssp;

   if (sequence == NULL || score == NULL){
       return;
   }
   *score = 0;
   if (spot == NULL){
       return;
   }

   /* get the frequencies first; only codes 0..3 fit the acgt table */
   for (j=0; j<seqlen; j++){
       if (sequence[j] < 4){
           acgt[sequence[j]]++;
       }
   }
   /* now calculate for each base the log, adding values to get the score */
   ssp = spot->dssp_head;
   for (j=0; j<seqlen && ssp != NULL; j++){
       base = sequence[j];
       if (base < 4){
           /* base occurs at least once, so background is never zero */
           background = (FloatHi)acgt[base]/seqlen;
           if (base == 0 && ssp->a > 0){
               prob_seqgsite += log10(ssp->a/background);
           }
           else if (base == 1 && ssp->c > 0){
               prob_seqgsite += log10(ssp->c/background);
           }
           else if (base == 2 && ssp->g > 0){
               prob_seqgsite += log10(ssp->g/background);
           }
           else if (base == 3 && ssp->t > 0){
               prob_seqgsite += log10(ssp->t/background);
           }
       }
       ssp = ssp->next;
   }
   *score = pow(10, prob_seqgsite);
}
6853 
6854 /***************************************************************************
6855 *
6856 *  See the comment for SPI_is_donor for an explanation of how this
6857 *  function works. The splice site frequency matrix is derived from
6858 *  a nonredundant set of vertebrate splice sites provided by Chris Burge.
6859 *
6860 ***************************************************************************/
SPI_is_donor_vert(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)6861 static void SPI_is_donor_vert (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
6862 {
6863     Int4     acgt[4] = {0, 0, 0, 0};
6864     Int4     j = 0;
6865     FloatHi  d[10][4] = {
6866         {0.3361, 0.3587, 0.1882, 0.1170},
6867         {0.5986, 0.1306, 0.1413, 0.1295},
6868         {0.0867, 0.0321, 0.8034, 0.0778},
6869         {0.0000, 0.0000, 1.0000, 0.0000},
6870         {0.0000, 0.0100, 0.0000, 1.0000},
6871         {0.4976, 0.0267, 0.4507, 0.0249},
6872         {0.7162, 0.0730, 0.1223, 0.0885},
6873         {0.0677, 0.0517, 0.8331, 0.0475},
6874         {0.1586, 0.1681, 0.2185, 0.4549},
6875         {0.2559, 0.2120, 0.3593, 0.1728}};
6876 
6877     FloatHi  prob_seqgsite = 0;
6878 
6879     if (sequence == NULL || score == NULL){
6880       return;
6881     }
6882     *score = 0;
6883     if (seqlen < 10){
6884         return;
6885     }
6886     prob_seqgsite = 0;
6887 
6888     /* first get the freqs */
6889     for (j=0; j<seqlen; j++){
6890         if (sequence[j] != 4){
6891             acgt[sequence[j]]++;
6892         }
6893     }
6894     /* now calculate for each base the log, adding values to get the score */
6895     for (j=0; j<seqlen; j++){
6896         if (sequence[j] != 4){
6897             if (d[j][sequence[j]] > 0){
6898                 prob_seqgsite +=
6899                     log10((d[j][sequence[j]])/((FloatHi)acgt[sequence[j]]/(FloatHi)seqlen));
6900             }
6901         }
6902     }
6903     *score = pow(10, prob_seqgsite);
6904 }
6905 
6906 
6907 /***************************************************************************
6908 *
6909 *  See the comment for SPI_is_donor for an explanation of how this
6910 *  function works. The splice site frequency matrix is derived from
6911 *  a nonredundant set of Drosophila splice sites provided by Chris Burge.
6912 *
6913 ***************************************************************************/
SPI_is_donor_fly(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)6914 static void SPI_is_donor_fly (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
6915 {
6916    Int4     acgt[4] = {0, 0, 0, 0};
6917    FloatHi  d[15][4] = {
6918        {0.3103, 0.2105, 0.1951, 0.2834},
6919        {0.3045, 0.2335, 0.2131, 0.2482},
6920        {0.3512, 0.2905, 0.2028, 0.1548},
6921        {0.5374, 0.1523, 0.1567, 0.1529},
6922        {0.1216, 0.0685, 0.6935, 0.1158},
6923        {0.0001, 0.0000, 0.9936, 0.0000},
6924        {0.0000, 0.0000, 0.0000, 0.9878},
6925        {0.5886, 0.0115, 0.3506, 0.0486},
6926        {0.7639, 0.0505, 0.1004, 0.0845},
6927        {0.0480, 0.0102, 0.8861, 0.0550},
6928        {0.1190, 0.1068, 0.0537, 0.7198},
6929        {0.3455, 0.1388, 0.1849, 0.3301},
6930        {0.2700, 0.2258, 0.1804, 0.3231},
6931        {0.3353, 0.2092, 0.1612, 0.2930},
6932        {0.2873, 0.2278, 0.1727, 0.3116}};
6933    Int4     j;
6934    FloatHi  prob_seqgsite = 0;
6935 
6936    if (sequence == NULL || score == NULL){
6937       return;
6938    }
6939    *score = 0;
6940    if (seqlen < 15){
6941        return;
6942    }
6943    /* first get the freqs */
6944    for (j=0; j<seqlen; j++){
6945        if (sequence[j] != 4){
6946            acgt[sequence[j]]++;
6947        }
6948    }
6949    /* now calculate for each base the log, adding values to get the score */
6950    for (j=0; j<seqlen; j++){
6951        if (sequence[j] != 4){
6952            if (d[j][sequence[j]] > 0){
6953                prob_seqgsite +=
6954                    log10((d[j][sequence[j]])/((FloatHi)acgt[sequence[j]]/seqlen));
6955            }
6956        }
6957    }
6958    *score = pow(10, prob_seqgsite);
6959 }
6960 
6961 /***************************************************************************
6962 *
6963 *  See the comment for SPI_is_donor for an explanation of how this
6964 *  function works. The splice site frequency matrix is derived from
6965 *  a nonredundant set of Arabidopsis splice sites provided by Chris Burge.
6966 *
6967 ***************************************************************************/
SPI_is_donor_plant(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)6968 static void SPI_is_donor_plant (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
6969 {
6970    Int4     acgt[4] = {0, 0, 0, 0};
6971    FloatHi  d[9][4] = {
6972        {0.3563, 0.3526, 0.1840, 0.1068},
6973        {0.6559, 0.1103, 0.0765, 0.1571},
6974        {0.0887, 0.0328, 0.7876, 0.0907},
6975        {0.0001, 0.0000, 0.9930, 0.0000},
6976        {0.0000, 0.0000, 0.0000, 0.9838},
6977        {0.6607, 0.0452, 0.1195, 0.1744},
6978        {0.5407, 0.1394, 0.0546, 0.2650},
6979        {0.1975, 0.0929, 0.5193, 0.1901},
6980        {0.2368, 0.1405, 0.1040, 0.5182}};
6981 
6982    Int4     j;
6983    FloatHi  prob_seqgsite = 0;
6984 
6985    if (sequence == NULL || score == NULL){
6986        return;
6987    }
6988    *score = 0;
6989    if (seqlen < 9){
6990        return;
6991    }
6992    /* first get the freqs */
6993    for (j=0; j<seqlen; j++){
6994        if (sequence[j] != 4){
6995            acgt[sequence[j]]++;
6996        }
6997    }
6998    /* now calculate for each base the log, adding values to get the score */
6999    for (j=0; j<seqlen; j++){
7000        if (sequence[j] != 4){
7001            if (d[j][sequence[j]] > 0){
7002                prob_seqgsite +=
7003                    log10((d[j][sequence[j]])/((FloatHi)acgt[sequence[j]]/seqlen));
7004            }
7005        }
7006    }
7007    *score = pow(10, prob_seqgsite);
7008 }
7009 
7010 /***************************************************************************
7011 *
7012 *  See the comment for SPI_is_donor for an explanation of how this
7013 *  function works. The splice site frequency matrix is derived from
7014 *  a nonredundant set of C. elegans splice sites provided by Chris Burge.
7015 *
7016 ***************************************************************************/
/* Scores the first 15 bases of sequence against the C. elegans donor matrix.
 *   sequence - one base code per element; codes 0..3 select the matrix
 *              column (a, c, g, t order), any other code is skipped
 *   seqlen   - number of codes available; fewer than 15 gives *score = 0,
 *              codes past the fifteenth are ignored instead of being used
 *              to index beyond the end of the matrix
 *   score    - receives 10^(sum of log10(matrix freq / observed freq)),
 *              where the observed frequency is taken over the same window
 * Nothing is written when sequence or score is NULL. */
static void SPI_is_donor_cele (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
{
   static const FloatHi  freq[15][4] = {
       {0.3575, 0.1537, 0.1605, 0.3284},
       {0.3541, 0.1838, 0.1662, 0.2959},
       {0.3825, 0.2481, 0.1987, 0.1706},
       {0.5792, 0.1445, 0.0955, 0.1808},
       {0.1828, 0.0609, 0.6046, 0.1517},
       {0.0001, 0.0000, 0.9963, 0.0000},
       {0.0000, 0.0000, 0.0000, 0.9919},
       {0.5904, 0.0146, 0.2400, 0.1550},
       {0.6713, 0.0660, 0.0877, 0.1750},
       {0.0904, 0.0457, 0.7441, 0.1198},
       {0.1896, 0.1077, 0.0850, 0.6178},
       {0.2661, 0.0911, 0.1371, 0.5058},
       {0.2620, 0.0995, 0.1344, 0.5041},
       {0.2840, 0.1141, 0.1039, 0.4980},
       {0.2986, 0.1239, 0.1215, 0.4560}};
   const Int4  sitelen = (Int4)(sizeof(freq)/sizeof(freq[0]));
   Int4        counts[4] = {0, 0, 0, 0};
   Int4        pos;
   Uint1       base;
   FloatHi     logodds = 0;

   if (sequence == NULL || score == NULL)
      return;
   *score = 0;
   if (seqlen < sitelen)
      return;
   /* base composition of the window being scored */
   for (pos=0; pos<sitelen; pos++)
   {
      base = sequence[pos];
      if (base < 4)
         counts[base]++;
   }
   /* accumulate log-odds; zero matrix entries are skipped, not scored */
   for (pos=0; pos<sitelen; pos++)
   {
      base = sequence[pos];
      if (base < 4 && freq[pos][base] > 0)
         logodds += log10((freq[pos][base])/((FloatHi)counts[base]/sitelen));
   }
   *score = pow(10, logodds);
}
7063 
7064 /***************************************************************************
7065 *
7066 *  See the comment for SPI_is_donor for an explanation of how this
7067 *  function works. Note that the Dicty info is NOT corrected for current
7068 *  sequence composition because the log(likelihood) matrix is itself corrected
7069 *  for dicty genome composition. The data were retrieved
7070 *  from the geneid Dd parameter file and used with the permission of
7071 *  Roderic Guigo. Values were simply translated from log base 2 to log base 10.
7072 *
7073 ***************************************************************************/
/* Scores the first 8 bases of sequence against the Dictyostelium donor
 * log-odds matrix (entries are already log10 values, so they are summed
 * directly and no composition correction is applied).
 *   sequence - one base code per element; codes 0..3 select the matrix
 *              column (a, c, g, t order), any other code is skipped
 *   seqlen   - number of codes available; fewer than 8 gives *score = 0,
 *              codes past the eighth are ignored instead of being used to
 *              index beyond the end of the matrix
 *   score    - receives 10^(sum of the selected matrix entries)
 * Nothing is written when sequence or score is NULL. */
static void SPI_is_donor_dicty (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
{
    /*  acgt */
    static const FloatHi  logodds[8][4] = {
        {0.1825, -0.2014, -0.0136, -0.1440},
        {0.0487, -0.3298,  0.0742, -0.0633},
        {-99999, -99999,   0.6020, -99999},
        {-99999, -99999,    -99999, 0.6020},
        {0.4783, -0.9030, -1.0634, -0.9673},
        {0.3026, -1.4202, -0.7392, -0.0150},
        {-0.3356, -1.3914, 0.8111, -0.5090},
        {-0.7937, -1.0333, -0.5721, 0.4315}};
    const Int4  sitelen = (Int4)(sizeof(logodds)/sizeof(logodds[0]));
    Int4        pos;
    FloatHi     total = 0;

    if (sequence == NULL || score == NULL)
        return;
    *score = 0;
    if (seqlen < sitelen)
        return;
    for (pos = 0; pos < sitelen; pos++)
    {
        if (sequence[pos] < 4)
            total += logodds[pos][sequence[pos]];
    }
    /* the -99999 entries drive the result to 0 for forbidden bases */
    *score = pow(10, total);
}
7103 
7104 
7105 /***************************************************************************
7106 *
7107 *  SPI_GetAcceptorSpliceInfo fills in the length of the consensus sequence
7108 *  of the acceptor splice site for the given organism. The boundary is the
7109 *  location of the exon-intron boundary within the consensus sequence.
7110 *
7111 ***************************************************************************/
/* Reports the acceptor consensus length and exon-intron boundary column.
 *   org      - organism code (SPI_VERTEBRATE, SPI_FLY, SPI_PLANT,
 *              SPI_CELEGANS or SPI_DICTY); any other value leaves the
 *              outputs untouched unless a user matrix is in effect
 *   spllen   - receives the consensus length
 *   boundary - receives the zero-based boundary column
 *   spot     - options; when spot->asplicejunc is nonzero the user-supplied
 *              matrix (spot->assp_head list) overrides the built-in values.
 *              A NULL spot simply selects the built-in values.
 * Returns without doing anything when spllen or boundary is NULL. */
static void SPI_GetAcceptorSpliceInfo (Int4 org, Int4Ptr spllen, Int4Ptr boundary, SPI_OptionsPtr spot)
{
   Int4               ncols;
   SPI_SpliceInfoPtr  ssp;

   if (spllen == NULL || boundary == NULL)
      return;
   if (spot != NULL && spot->asplicejunc != 0)
   {
      /* user matrix: its length is the number of list nodes */
      ncols = 0;
      for (ssp = spot->assp_head; ssp != NULL; ssp = ssp->next)
         ncols++;
      *spllen = ncols;
      /*** file should supply first exon column
          which needs to be zero-base adjusted ***/
      *boundary = spot->asplicejunc - 1;
      return;
   }
   if (org == SPI_VERTEBRATE)
   {
      *spllen = 21;
      *boundary = 20;
   } else if (org == SPI_FLY)
   {
      *spllen = 18;
      *boundary = 15;
   } else if (org == SPI_PLANT)
   {
      *spllen = 40;
      *boundary = 36;
   } else if (org == SPI_CELEGANS)
   {
      *spllen = 18;
      *boundary = 15;
   } else if (org == SPI_DICTY)
   {
      *spllen = 15;
      *boundary = 15;
   }
}
7154 
7155 /***************************************************************************
7156 *
7157 *  SPI_is_acceptor is a general interface to the organism-specific acceptor
7158 *  splice site evaluation functions. It simply passes on the sequence,
7159 *  sequence length, and score pointer to the appropriate organism-
7160 *  specific function.
7161 *  The organism-specific functions all work exactly the same way, but have
7162 *  different splice matrices. They evaluate P(Site|Sequence), which is:
7163 *
7164 *    P(Site|Sequence) = P(Sequence|Site)*P(Site)/P(Sequence)
7165 *
7166 *  Since P(Site) is constant (and unknown), it is ignored; only
7167 *   P(Sequence|Site)/P(Sequence) is calculated, and these values are
7168 *  compared to each other. P(Sequence|Site) is calculated by multiplying
7169 *  the values in the splice site frequency matrix according to the
7170 *  sequence specified. P(Sequence) is the probability of this specific
7171 *  sequence, using the A, T, G, and C frequencies specified in the sequence.
7172 *
7173 ***************************************************************************/
SPI_is_acceptor(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score,Int4 org)7174 NLM_EXTERN void SPI_is_acceptor (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, Int4 org)
7175 {
7176     if (org == SPI_VERTEBRATE){
7177         SPI_is_acceptor_vert(sequence, seqlen, score);
7178     }
7179     else if (org == SPI_FLY){
7180         SPI_is_acceptor_fly(sequence, seqlen, score);
7181     }
7182     else if (org == SPI_PLANT){
7183         SPI_is_acceptor_plant(sequence, seqlen, score);
7184     }
7185     else if (org == SPI_CELEGANS){
7186         SPI_is_acceptor_cele(sequence, seqlen, score);
7187     }
7188     else if (org == SPI_DICTY){
7189         SPI_is_acceptor_dicty(sequence, seqlen, score);
7190     }
7191 }
7192 
/* Scores sequence against the user-supplied acceptor matrix, which is held
 * as a linked list (spot->assp_head) with one node per position and fields
 * a, c, g, t for base codes 0, 1, 2, 3.
 *   sequence - one base code per element; codes outside 0..3 are skipped
 *   seqlen   - number of codes to score
 *   score    - receives 10^(sum of log10(matrix freq / observed freq)),
 *              the observed frequency being counted over all seqlen codes
 *   spot     - options holding the matrix; a NULL spot gives *score = 0
 * If the list has fewer nodes than seqlen, scoring stops at the end of the
 * list instead of following a NULL pointer. Nothing is written when
 * sequence or score is NULL. */
static void SPI_is_acceptor_user(Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, SPI_OptionsPtr spot)
{
   Int4               counts[4] = {0, 0, 0, 0};
   Int4               pos;
   Uint1              base;
   FloatHi            colfreq;
   FloatHi            logodds = 0;
   SPI_SpliceInfoPtr  ssp;

   if (sequence == NULL || score == NULL)
      return;
   *score = 0;
   if (spot == NULL)
      return;
   /* get the frequencies first */
   for (pos=0; pos<seqlen; pos++)
   {
      if (sequence[pos] < 4)
         counts[sequence[pos]]++;
   }
   /* now walk the matrix and the sequence in step, adding log-odds */
   ssp = spot->assp_head;
   for (pos=0; pos<seqlen && ssp != NULL; pos++, ssp = ssp->next)
   {
      base = sequence[pos];
      if (base == 0)
         colfreq = ssp->a;
      else if (base == 1)
         colfreq = ssp->c;
      else if (base == 2)
         colfreq = ssp->g;
      else if (base == 3)
         colfreq = ssp->t;
      else
         colfreq = 0;   /* ambiguous base: contributes nothing */
      if (colfreq > 0)
         logodds += log10(colfreq/((FloatHi)counts[base]/seqlen));
   }
   *score = pow(10, logodds);
}
7233 
7234 /***************************************************************************
7235 *
7236 *  See the comment for SPI_is_acceptor for an explanation of how this
7237 *  function works. The splice site frequency matrix is derived from
7238 *  a nonredundant set of vertebrate splice sites provided by Chris Burge.
7239 *
7240 ***************************************************************************/
/* Scores the first 21 bases of sequence against the vertebrate acceptor
 * matrix.
 *   sequence - one base code per element; codes 0..3 select the matrix
 *              column (a, c, g, t order), any other code is skipped
 *   seqlen   - number of codes available; fewer than 21 gives *score = 0,
 *              codes past the 21st are ignored instead of being used to
 *              index beyond the end of the matrix
 *   score    - receives 10^(sum of log10(matrix freq / observed freq)),
 *              where the observed frequency is taken over the same window
 * Nothing is written when sequence or score is NULL. */
static void SPI_is_acceptor_vert (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
{
   static const FloatHi  freq[21][4] = {
       {0.1823, 0.3135, 0.1485, 0.3557},
       {0.1568, 0.3319, 0.1681, 0.3432},
       {0.1461, 0.3379, 0.1520, 0.3640},
       {0.1271, 0.3290, 0.1710, 0.3729},
       {0.1342, 0.3593, 0.1366, 0.3700},
       {0.1152, 0.3676, 0.1188, 0.3985},
       {0.0926, 0.3688, 0.1235, 0.4151},
       {0.0879, 0.3426, 0.1205, 0.4489},
       {0.0808, 0.3557, 0.1182, 0.4454},
       {0.0790, 0.3224, 0.1128, 0.4857},
       {0.0748, 0.3581, 0.1075, 0.4596},
       {0.0814, 0.3866, 0.1152, 0.4169},
       {0.0849, 0.4186, 0.1235, 0.3729},
       {0.0867, 0.4240, 0.0849, 0.4044},
       {0.0665, 0.4561, 0.0618, 0.4157},
       {0.0736, 0.3996, 0.0564, 0.4703},
       {0.2251, 0.3409, 0.2126, 0.2215},
       {0.0404, 0.7357, 0.0018, 0.2221},
       {1.0000, 0.0010, 0.0010, 0.0010},
       {0.0010, 0.0010, 1.0000, 0.0010},
       {0.2375, 0.1318, 0.5350, 0.0956}};
   const Int4  sitelen = (Int4)(sizeof(freq)/sizeof(freq[0]));
   Int4        counts[4] = {0, 0, 0, 0};
   Int4        pos;
   Uint1       base;
   FloatHi     logodds = 0;

   if (sequence == NULL || score == NULL)
      return;
   *score = 0;
   if (seqlen < sitelen)
      return;
   /* base composition of the window being scored */
   for (pos=0; pos<sitelen; pos++)
   {
      base = sequence[pos];
      if (base < 4)
         counts[base]++;
   }
   /* accumulate log-odds; non-positive matrix entries are skipped */
   for (pos=0; pos<sitelen; pos++)
   {
      base = sequence[pos];
      if (base < 4 && freq[pos][base] > 0)
         logodds += log10((freq[pos][base])/((FloatHi)counts[base]/sitelen));
   }
   *score = pow(10, logodds);
}
7296 
7297 /***************************************************************************
7298 *
7299 *  See the comment for SPI_is_acceptor for an explanation of how this
7300 *  function works. The splice site frequency matrix is derived from
7301 *  a nonredundant set of Drosophila splice sites provided by Chris Burge.
7302 *
7303 ***************************************************************************/
/* Scores the first 18 bases of sequence against the Drosophila acceptor
 * matrix.
 *   sequence - one base code per element; codes 0..3 select the matrix
 *              column (a, c, g, t order), any other code is skipped
 *   seqlen   - number of codes available; fewer than 18 gives *score = 0,
 *              codes past the 18th are ignored instead of being used to
 *              index beyond the end of the matrix
 *   score    - receives 10^(sum of log10(matrix freq / observed freq)),
 *              where the observed frequency is taken over the same window
 * Nothing is written when sequence or score is NULL. */
static void SPI_is_acceptor_fly (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
{
    static const FloatHi  freq[18][4] = {
        {0.2497, 0.2446, 0.1044, 0.4014},
        {0.2132, 0.2369, 0.1063, 0.4437},
        {0.1946, 0.2196, 0.1082, 0.4776},
        {0.2170, 0.2017, 0.0973, 0.4840},
        {0.1946, 0.2170, 0.0858, 0.5026},
        {0.2004, 0.2433, 0.0858, 0.4706},
        {0.2004, 0.2727, 0.0967, 0.4302},
        {0.2106, 0.2708, 0.0864, 0.4321},
        {0.1876, 0.3035, 0.0608, 0.4481},
        {0.1114, 0.2522, 0.0679, 0.5685},
        {0.1178, 0.2164, 0.0461, 0.6197},
        {0.2830, 0.1639, 0.2913, 0.2618},
        {0.0467, 0.7049, 0.0045, 0.2439},
        {0.9923, 0.0032, 0.0013, 0.0032},
        {0.0032, 0.0038, 0.9910, 0.0019},
        {0.3073, 0.1997, 0.3675, 0.1255},
        {0.2260, 0.1927, 0.1709, 0.4104},
        {0.2574, 0.2855, 0.2279, 0.2292}};
    const Int4  sitelen = (Int4)(sizeof(freq)/sizeof(freq[0]));
    Int4        counts[4] = {0, 0, 0, 0};
    Int4        pos;
    Uint1       base;
    FloatHi     logodds = 0;

    if (sequence == NULL || score == NULL)
        return;
    *score = 0;
    if (seqlen < sitelen)
        return;
    /* base composition of the window being scored */
    for (pos=0; pos<sitelen; pos++)
    {
        base = sequence[pos];
        if (base < 4)
            counts[base]++;
    }
    /* accumulate log-odds; non-positive matrix entries are skipped */
    for (pos=0; pos<sitelen; pos++)
    {
        base = sequence[pos];
        if (base < 4 && freq[pos][base] > 0)
            logodds += log10((freq[pos][base])/((FloatHi)counts[base]/sitelen));
    }
    *score = pow(10, logodds);
}
7354 
7355 /***************************************************************************
7356 *
7357 *  See the comment for SPI_is_acceptor for an explanation of how this
7358 *  function works. The splice site frequency matrix is derived from
7359 *  a nonredundant set of Arabidopsis splice sites provided by Chris Burge.
7360 *
7361 ***************************************************************************/
/* Scores the first 40 bases of sequence against the plant acceptor matrix.
 *   sequence - one base code per element; codes 0..3 select the matrix
 *              column (a, c, g, t order), any other code is skipped
 *   seqlen   - number of codes available; fewer than 40 gives *score = 0,
 *              codes past the 40th are ignored instead of being used to
 *              index beyond the end of the matrix
 *   score    - receives 10^(sum of log10(matrix freq / observed freq)),
 *              where the observed frequency is taken over the same window
 * Nothing is written when sequence or score is NULL. */
static void SPI_is_acceptor_plant (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
{
   static const FloatHi  freq[40][4] = {
       {0.2959, 0.1512, 0.1632, 0.3896},
       {0.2845, 0.1490, 0.1648, 0.4017},
       {0.2660, 0.1528, 0.1742, 0.4071},
       {0.2843, 0.1346, 0.1744, 0.4067},
       {0.2714, 0.1512, 0.1624, 0.4150},
       {0.2806, 0.1451, 0.1661, 0.4082},
       {0.2753, 0.1486, 0.1650, 0.4111},
       {0.2753, 0.1460, 0.1532, 0.4255},
       {0.2775, 0.1497, 0.1648, 0.4080},
       {0.2898, 0.1429, 0.1543, 0.4130},
       {0.2793, 0.1486, 0.1545, 0.4174},
       {0.2834, 0.1429, 0.1576, 0.4161},
       {0.2725, 0.1471, 0.1517, 0.4285},
       {0.2614, 0.1521, 0.1519, 0.4347},
       {0.2515, 0.1497, 0.1639, 0.4347},
       {0.2408, 0.1460, 0.1619, 0.4513},
       {0.2266, 0.1431, 0.1652, 0.4650},
       {0.2218, 0.1403, 0.1639, 0.4738},
       {0.2122, 0.1292, 0.1661, 0.4926},
       {0.1886, 0.1460, 0.1694, 0.4961},
       {0.1919, 0.1368, 0.1711, 0.5002},
       {0.1921, 0.1375, 0.1641, 0.5063},
       {0.1838, 0.1331, 0.1558, 0.5273},
       {0.1809, 0.1307, 0.1622, 0.5260},
       {0.1694, 0.1364, 0.1761, 0.5181},
       {0.2177, 0.1357, 0.1864, 0.4602},
       {0.2109, 0.1388, 0.1552, 0.4952},
       {0.2150, 0.1300, 0.1538, 0.5011},
       {0.1989, 0.1252, 0.1766, 0.4993},
       {0.1849, 0.1407, 0.1464, 0.5280},
       {0.1554, 0.0997, 0.1069, 0.6381},
       {0.2664, 0.0846, 0.3851, 0.2640},
       {0.0597, 0.6512, 0.0026, 0.2863},
       {0.9937, 0.0017, 0.0024, 0.0022},
       {0.0022, 0.0042, 0.9921, 0.0015},
       {0.2367, 0.0968, 0.5553, 0.1112},
       {0.2281, 0.1534, 0.1766, 0.4419},
       {0.2957, 0.1438, 0.2218, 0.3387},
       {0.2614, 0.1923, 0.2904, 0.2559},
       {0.2950, 0.1777, 0.2205, 0.3068}};
   const Int4  sitelen = (Int4)(sizeof(freq)/sizeof(freq[0]));
   Int4        counts[4] = {0, 0, 0, 0};
   Int4        pos;
   Uint1       base;
   FloatHi     logodds = 0;

   if (sequence == NULL || score == NULL)
      return;
   *score = 0;
   if (seqlen < sitelen)
      return;
   /* base composition of the window being scored */
   for (pos=0; pos<sitelen; pos++)
   {
      base = sequence[pos];
      if (base < 4)
         counts[base]++;
   }
   /* accumulate log-odds; non-positive matrix entries are skipped */
   for (pos=0; pos<sitelen; pos++)
   {
      base = sequence[pos];
      if (base < 4 && freq[pos][base] > 0)
         logodds += log10((freq[pos][base])/((FloatHi)counts[base]/sitelen));
   }
   *score = pow(10, logodds);
}
7434 
7435 /***************************************************************************
7436 *
7437 *  See the comment for SPI_is_acceptor for an explanation of how this
7438 *  function works. The splice site frequency matrix is derived from
7439 *  a nonredundant set of C. elegans splice sites provided by Chris Burge.
7440 *
7441 ***************************************************************************/
/* Scores the first 18 bases of sequence against the C. elegans acceptor
 * matrix.
 *   sequence - one base code per element; codes 0..3 select the matrix
 *              column (a, c, g, t order), any other code is skipped
 *   seqlen   - number of codes available; fewer than 18 gives *score = 0,
 *              codes past the 18th are ignored instead of being used to
 *              index beyond the end of the matrix
 *   score    - receives 10^(sum of log10(matrix freq / observed freq)),
 *              where the observed frequency is taken over the same window
 * Nothing is written when sequence or score is NULL. */
static void SPI_is_acceptor_cele (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
{
    static const FloatHi  freq[18][4] = {
        {0.4365, 0.1293, 0.0650, 0.3689},
        {0.3719, 0.1415, 0.0826, 0.4037},
        {0.3550, 0.1374, 0.0883, 0.4190},
        {0.3428, 0.1418, 0.0910, 0.4240},
        {0.3465, 0.1499, 0.0711, 0.4321},
        {0.3594, 0.1492, 0.0707, 0.4203},
        {0.3976, 0.1191, 0.0728, 0.4102},
        {0.4139, 0.0795, 0.0687, 0.4376},
        {0.2812, 0.0799, 0.0690, 0.5695},
        {0.0589, 0.0379, 0.0156, 0.8873},
        {0.0102, 0.0132, 0.0047, 0.9716},
        {0.0975, 0.1391, 0.0917, 0.6714},
        {0.0321, 0.8257, 0.0020, 0.1398},
        {0.9953, 0.0010, 0.0017, 0.0017},
        {0.0020, 0.0020, 0.9946, 0.0010},
        {0.3990, 0.1553, 0.3154, 0.1299},
        {0.2995, 0.1780, 0.1628, 0.3594},
        {0.2975, 0.2288, 0.1878, 0.2856}};
    const Int4  sitelen = (Int4)(sizeof(freq)/sizeof(freq[0]));
    Int4        counts[4] = {0, 0, 0, 0};
    Int4        pos;
    Uint1       base;
    FloatHi     logodds = 0;

    if (sequence == NULL || score == NULL)
        return;
    *score = 0;
    if (seqlen < sitelen)
        return;
    /* base composition of the window being scored */
    for (pos=0; pos<sitelen; pos++)
    {
        base = sequence[pos];
        if (base < 4)
            counts[base]++;
    }
    /* accumulate log-odds; non-positive matrix entries are skipped */
    for (pos=0; pos<sitelen; pos++)
    {
        base = sequence[pos];
        if (base < 4 && freq[pos][base] > 0)
            logodds += log10((freq[pos][base])/((FloatHi)counts[base]/sitelen));
    }
    *score = pow(10, logodds);
}
7492 
7493 
7494 /***************************************************************************
7495 *
7496 *  See the comment for SPI_is_acceptor for an explanation of how this
7497 *  function works. Note that the Dicty info is NOT corrected for current
7498 *  sequence composition because the log(likelihood) matrix is itself corrected
7499 *  for dicty genome composition.  The data were retrieved
7500 *  from the geneid Dd parameter file and used with the permission of
7501 *  Roderic Guigo. Values were simply translated from log base 2 to log base 10.
7502 *
7503 ***************************************************************************/
/* Scores the first 15 bases of sequence against the Dictyostelium acceptor
 * log-odds matrix (entries are already log10 values, so they are summed
 * directly and no composition correction is applied).
 *   sequence - one base code per element; codes 0..3 select the matrix
 *              column (a, c, g, t order), any other code is skipped
 *   seqlen   - number of codes available; fewer than 15 gives *score = 0,
 *              codes past the 15th are ignored instead of being used to
 *              index beyond the end of the matrix
 *   score    - receives 10^(sum of the selected matrix entries)
 * Nothing is written when sequence or score is NULL. */
static void SPI_is_acceptor_dicty (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
{
    /*  acgt */
    static const FloatHi  logodds[15][4] = {
        {-0.2171, -0.4463, -0.9154, 0.2974},
        {-0.0984, -0.3965, -1.3635, 0.2574},
        {0.0201, -0.5770, -1.5528, 0.3159},
        {-0.0880, -0.6470, -1.0716, 0.2993},
        {-0.0483, -0.4077, -0.8955, 0.2326},
        {0.1091, -0.4041, -0.9030, 0.2262},
        {0.0672, -0.3973, -1.0649, 0.1682},
        {0.0592, -0.5731, -1.0634, 0.1480},
        {0.1707, -0.9122, -1.0757, 0.1658},
        {0.0343, -0.6659, -1.4012, 0.2365},
        {0.1407, -0.4903, -1.3757, 0.0521},
        {0.1901,-0.7647,-0.9314, 0.1395},
        {-0.2300, -0.7210, -0.9030, 0.4531},
        {0.6020, -9999, -9999, -9999},
        {-9999,-9999, 0.6020, -9999}};
    const Int4  sitelen = (Int4)(sizeof(logodds)/sizeof(logodds[0]));
    Int4        pos;
    FloatHi     total = 0;

    if (sequence == NULL || score == NULL)
        return;
    *score = 0;
    if (seqlen < sitelen)
        return;
    for (pos = 0; pos < sitelen; pos++)
    {
        if (sequence[pos] < 4)
            total += logodds[pos][sequence[pos]];
    }
    /* the -9999 entries drive the result to 0 for forbidden bases */
    *score = pow(10, total);
}
7543 
7544 
7545 
7546 /***************************************************************************
7547 *
7548 *  SPI_RemoveConflictsAmongPieces looks at all the alignments for all
7549 *  the fragments and removes overlapping alignment sets. The alignment set
7550 *  with the greatest sequence range will be kept. The function cycles
7551 *  through all the fragments, and if a fragment has an alignment, the
7552 *  range of that alignment is compared with the range of all subsequent
7553 *  fragments' alignments, and if there's an overlap, one of the overlapping
7554 *  alignment sets is deleted. While not the most efficient design, this
7555 *  function works well because most overlaps get deleted early and because
7556 *  the searches are done with repeat masking, reducing the number of
7557 *  overlaps.
7558 *
7559 ***************************************************************************/
/* For every fragment that still has an alignment set, compares the range of
 * sequence 2 in that set with the same range in each later fragment's set.
 * When two ranges conflict (one contains the other, or they overlap an end
 * by more than fuzz), the shorter set is freed and its sap pointer cleared;
 * on equal length the later fragment's set is the one freed. Once the
 * current fragment's own set has been freed, the scan of later fragments
 * stops and the outer loop moves on. */
static void SPI_RemoveConflictsAmongPieces(SPI_FragHerdPtr sfhp, Int4 fuzz)
{
   Int4     cur;
   Int4     other;
   Int4     curstart;
   Int4     curstop;
   Int4     curlen;
   Int4     otherstart;
   Int4     otherstop;
   Int4     otherlen;
   Boolean  curfreed;

   for (cur=0; cur<sfhp->numfrags; cur++)
   {
      if (sfhp->sfparray[cur]->sap == NULL)
         continue;
      SPI_GetNthSeqRangeInSASet(sfhp->sfparray[cur]->sap, 2, &curstart, &curstop);
      curfreed = FALSE;
      for (other=cur+1; other<sfhp->numfrags && !curfreed; other++)
      {
         if (sfhp->sfparray[other]->sap == NULL)
            continue;
         SPI_GetNthSeqRangeInSASet(sfhp->sfparray[other]->sap, 2, &otherstart, &otherstop);
         /* the four conflict shapes: inside, across the start, across the
            stop, or completely surrounding the current range */
         if (!((otherstart > curstart && otherstop < curstop) ||
               (otherstop > curstart + fuzz && otherstart < curstart + fuzz) ||
               (otherstart < curstop - fuzz && otherstop > curstop - fuzz) ||
               (otherstart < curstart && otherstop > curstop)))
            continue;
         curlen = curstop - curstart + 1;
         otherlen = otherstop - otherstart + 1;
         if (otherlen > curlen)
         {
            /* the later set wins; nothing left to compare for cur */
            curfreed = TRUE;
            SeqAlignSetFree(sfhp->sfparray[cur]->sap);
            sfhp->sfparray[cur]->sap = NULL;
         } else
         {
            SeqAlignSetFree(sfhp->sfparray[other]->sap);
            sfhp->sfparray[other]->sap = NULL;
         }
      }
   }
}
7615 
7616 /***************************************************************************
7617 *
7618 *  SPI_OrderPieces sorts the fragments according to their alignment
7619 *  position (position on the mRNA sequence) as well as by their original
7620 *  fragment order (lgroup, group, and order). Fragments without
7621 *  alignments are placed at the beginning of the set. At the end of the
7622 *  sort, all the initial fragments have no alignments and all the fragments
7623 *  at the end are in order along the mRNA sequence, making filling in the
7624 *  holes in the alignment much easier.
7625 *
7626 ***************************************************************************/
/* Re-orders sfhp->sfparray using SPI_CompareFragInfo and assigns each
 * fragment its position_mrna: fragments with an alignment set get 0, 1, 2...
 * in sorted order, fragments without one get -1.
 *   sfhp     - fragment herd whose sfparray is replaced by a newly
 *              allocated, sorted array (the old array is freed; the
 *              fragments themselves are reused, not copied)
 *   bsp_mrna - only checked for NULL here
 * Does nothing when either argument is NULL. */
static void SPI_OrderPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_mrna)
{
   Int4             i;
   Int4             next_mrna_pos;
   SPI_FragInfoPtr  info;
   SPI_FragInfoPtr  PNTR info_array;
   SPI_FragPtr      frag;
   SPI_FragPtr      PNTR sorted;

   if (sfhp == NULL || bsp_mrna == NULL)
      return;
   /* array of POINTERS to sort keys, so size it by the pointer type */
   info_array = (SPI_FragInfoPtr PNTR)MemNew((sfhp->numfrags)*sizeof(SPI_FragInfoPtr));
   for (i=0; i<sfhp->numfrags; i++)
   {
      info = (SPI_FragInfoPtr)MemNew(sizeof(SPI_FragInfo));
      frag = sfhp->sfparray[i];
      if (frag->sap != NULL)
      {
         /* a set with no index (saip) is passed to AlnMgr2IndexLite first */
         if (frag->sap->saip == NULL)
            AlnMgr2IndexLite(frag->sap);
         SPI_GetNthSeqRangeInSASet(frag->sap, 2, &info->mrnastart, &info->mrnastop);
      } else
         info->mrnastart = info->mrnastop = -1;   /* marks "no alignment" */
      info->sfpnum = i;
      info->position_orig = frag->position_orig;
      info->fragnum = frag->fragnum;
      info_array[i] = info;
   }
   HeapSort(info_array, i, sizeof(SPI_FragInfoPtr), SPI_CompareFragInfo);
   /* build the sorted fragment array, number the aligned fragments, and
      release each sort key as soon as it has been consumed */
   sorted = (SPI_FragPtr PNTR)MemNew((sfhp->numfrags)*sizeof(SPI_FragPtr));
   next_mrna_pos = 0;
   for (i=0; i<sfhp->numfrags; i++)
   {
      frag = sfhp->sfparray[info_array[i]->sfpnum];
      if (info_array[i]->mrnastart != -1)
      {
         frag->position_mrna = next_mrna_pos;
         next_mrna_pos++;
      } else
         frag->position_mrna = -1;
      sorted[i] = frag;
      MemFree(info_array[i]);
   }
   MemFree(sfhp->sfparray);
   sfhp->sfparray = sorted;
   MemFree(info_array);
}
7679 
7680 /***************************************************************************
7681 *
7682 *  SPI_CompareFragInfo is the HeapSort callback for SPI_OrderPieces. It
7683 *  compares the alignments of two fragments and puts the fragment that
7684 *  is most 5' on the mRNA first. Fragments without alignments are put in
7685 *  their original order, before all the fragments with alignments. If two
7686 *  fragments have the same mRNA position, they are sorted secondarily by
7687 *  their original fragment position.
7688 *
7689 ***************************************************************************/
SPI_CompareFragInfo(VoidPtr ptr1,VoidPtr ptr2)7690 static int LIBCALLBACK SPI_CompareFragInfo(VoidPtr ptr1, VoidPtr ptr2)
7691 {
7692    SPI_FragInfoPtr  sfi1;
7693    SPI_FragInfoPtr  sfi2;
7694 
7695    if (ptr1 != NULL && ptr2 != NULL)
7696    {
7697       sfi1 = *((SPI_FragInfoPtr PNTR)ptr1);
7698       sfi2 = *((SPI_FragInfoPtr PNTR)ptr2);
7699       /* this function orders by mRNA position, secondarily by original position */
7700       if (sfi1->mrnastart != -1 && sfi2->mrnastart != -1)
7701       {
7702          if (sfi1->mrnastart < sfi2->mrnastart)
7703             return -1;
7704          else if (sfi1->mrnastart > sfi2->mrnastart)
7705             return 1;
7706          else if (sfi1->mrnastop > sfi2->mrnastop)
7707             return -1;
7708          else if (sfi1->mrnastop < sfi2->mrnastop)
7709             return 1;
7710          else
7711             return 0;
7712       }
7713       /* put things with no mRNA order first */
7714       if (sfi1->mrnastart != -1 && sfi2->mrnastart == -1)
7715          return 1;
7716       if (sfi1->mrnastart == -1 && sfi2->mrnastart != -1)
7717          return -1;
7718       if (sfi1->position_orig->lgroup != 0 && sfi1->position_orig->lgroup == sfi2->position_orig->lgroup)
7719       {
7720          if (sfi1->position_orig->group < sfi2->position_orig->group)
7721             return -1;
7722          else if (sfi1->position_orig->group > sfi2->position_orig->group)
7723             return 1;
7724          else
7725          {
7726             if (sfi1->position_orig->order < sfi2->position_orig->order)
7727                return -1;
7728             else
7729                return 1;
7730          }
7731       }
7732       /* if fragments are in the same group, keep them in order */
7733       if (sfi1->position_orig->group == sfi2->position_orig->group)
7734       {
7735          if (sfi1->position_orig->order < sfi2->position_orig->order)
7736             return -1;
7737          else
7738             return 1;
7739       }
7740       if (sfi1->position_orig->group < sfi2->position_orig->group)
7741          return -1;
7742       else if (sfi2->position_orig->group > sfi1->position_orig->group)
7743          return 1;
7744       if (sfi1->fragnum < sfi2->fragnum)
7745          return -1;
7746       else
7747          return 1;
7748    }
7749    return 0;
7750 }
7751 
7752 /***************************************************************************
7753 *
7754 *  SPI_ConnectAlnPieces is analogous to SPI_ConnectAln for finished
7755 *  sequence; it fills in the gaps in the mRNA-to-draft alignment. Since
7756 *  the genomic sequence is in fragments, the job is a little trickier
7757 *  here. The function first calls SPI_ConnectAln on each set of
7758 *  alignments for each fragment, to fill in internal gaps between those
7759 *  alignments. Then the alignment sets are all examined (they should not
7760 *  overlap at this point, but they usually have gaps between them) and any
7761 *  gaps between the alignment sets are filled in by first searching in
7762 *  the fragments containing alignments adjacent to the gaps, then by
7763 *  looking in all "nearby" fragments (as defined by SPI_GetNearbyFrags), and
7764 *  finally by looking in all fragments. Since many of the spidey functions
7765 *  assume that the genomic sequence is always the plus strand, and the
7766 *  draft sequence functions all deal with alignments on the plus strand
7767 *  of the mRNA and either strand of the genomic sequence, there are many
7768 *  places in this function where the strands of an alignment must be
7769 *  reversed before and after a function call if the alignment is on the
7770 *  minus strand of the genomic sequence.
7771 *
7772 ***************************************************************************/
SPI_ConnectAlnPieces(SPI_FragHerdPtr sfhp,BioseqPtr bsp_contig,BioseqPtr bsp_mrna,SPI_OptionsPtr spot)7773 static Boolean SPI_ConnectAlnPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_contig, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
7774 {
7775    AMAlignIndex2Ptr      amaip;
7776    Int4                 c;
7777    Int4                 curr;
7778    Boolean              done;
7779    Boolean              found;
7780    Int4                 gapsize;
7781    Int4                 i;
7782    Int4                 j = 0;
7783    Boolean              minus;
7784    Int4                 n;
7785    BLAST_OptionsBlkPtr  options;
7786    Int4                 orderedstart;
7787    Int4                 prevstart;
7788    SeqAlignPtr          salp;
7789    SeqAlignPtr          salp_tmp;
7790    SeqAlignPtr          salp_prev;
7791    SeqAlignPtr          sap;
7792    SeqAlignPtr          sap_b1;
7793    SeqAlignPtr          sap_b2;
7794    SeqAlignPtr          sap_new1;
7795    SeqAlignPtr          sap_new2;
7796    SeqAlignPtr          sap_tmp;
7797    SPI_FragPtr          sfp;
7798    SPI_FragPtr          sfpcurr;
7799    SPI_FragPtr          PNTR sfpnearby;
7800    SPI_FragPtr          sfpprev;
7801    SeqLocPtr            slp_gen;
7802    SeqLocPtr            slp_mrna;
7803    Int4                 start_b;
7804    Int4                 start1;
7805    Int4                 start2;
7806    Int4                 start3;
7807    Int4                 start4;
7808    Int4                 stop_b;
7809    Int4                 stop1;
7810    Int4                 stop2;
7811    Int4                 stop3;
7812    Int4                 stop4;
7813    Uint1                strand;
7814    SPI_FragPtr          PNTR tmparray;
7815 
7816    i = 0;
7817    orderedstart = -1;
7818    /* figure out which sequences have ordering information (by mRNA position) so far */
7819    while (orderedstart == -1 && i < sfhp->numfrags)
7820    {
7821       if (sfhp->sfparray[i]->position_mrna != -1)
7822          orderedstart = i;
7823       i++;
7824    }
7825    if (orderedstart == -1) /* no fragment has alignments */
7826       return FALSE;
7827    /* fill in internal gaps for each contig-to-mRNA alignment */
7828    for (i=orderedstart; i<sfhp->numfrags; i++)
7829    {
7830       if (sfhp->sfparray[i]->sap != NULL)
7831       {
7832          if ((AlnMgr2GetNthStrand(sfhp->sfparray[i]->sap, 1)) == Seq_strand_minus)
7833          {
7834             minus = TRUE;
7835             salp = (SeqAlignPtr)(sfhp->sfparray[i]->sap->segs);
7836             while (salp != NULL)
7837             {
7838                salp_tmp = salp->next;
7839                salp->next = NULL;
7840                SAIndex2Free2(salp->saip);
7841                salp->saip = NULL;
7842                SeqAlignListReverseStrand(salp);
7843                AlnMgr2IndexSingleChildSeqAlign(salp);
7844                salp->next = salp_tmp;
7845                salp = salp_tmp;
7846             }
7847          } else
7848             minus = FALSE;
7849          if (!SPI_ConnectAln(sfhp->sfparray[i]->sap, spot, NULL, FALSE, TRUE))
7850          {
7851             SeqAlignSetFree(sfhp->sfparray[i]->sap);
7852             sfhp->sfparray[i]->sap = NULL;
7853          }
7854          if (minus && sfhp->sfparray[i]->sap != NULL)
7855          {
7856             salp = (SeqAlignPtr)(sfhp->sfparray[i]->sap);
7857             while (salp != NULL)
7858             {
7859                salp_tmp = salp->next;
7860                salp->next = NULL;
7861                SAIndex2Free2(salp->saip);
7862                salp->saip = NULL;
7863                SeqAlignListReverseStrand(salp);
7864                AlnMgr2IndexSingleChildSeqAlign(salp);
7865                salp->next = salp_tmp;
7866                salp = salp_tmp;
7867             }
7868          }
7869       }
7870       if (i != sfhp->numfrags-1)
7871          sfhp->sfparray[i]->next = sfhp->sfparray[i+1];
7872    }
7873    tmparray = (SPI_FragPtr PNTR)MemNew((sfhp->numfrags)*sizeof(SPI_FragPtr));
7874    prevstart = -1;
7875    done = FALSE;
7876    sfpcurr = sfhp->sfparray[orderedstart];
7877    sfpprev = NULL;
7878    curr = orderedstart;
7879    start3 = stop3 = -1;
7880    sfpnearby = NULL;
7881    /* fill in gaps between contig alignments */
7882    while (!done)
7883    {
7884       sap = sfpcurr->sap;
7885       if (sap != NULL)
7886       {
7887          SPI_GetNthSeqRangeInSASet(sap, 2, &start2, &stop2);
7888          if ((gapsize = spi_isa_gap(start2, prevstart, Seq_strand_plus)) >= SPI_TEENYEXON)
7889          {
7890             /* first look in the same piece and the ones that are supposed to be adjacent */
7891             strand = AlnMgr2GetNthStrand(sfpcurr->sap, 1);
7892             if (strand == Seq_strand_minus)
7893             {
7894                minus = TRUE;
7895                salp = (SeqAlignPtr)(sfpcurr->sap->segs);
7896                while (salp != NULL)
7897                {
7898                   salp_tmp = salp->next;
7899                   salp->next = NULL;
7900                   SAIndex2Free2(salp->saip);
7901                   salp->saip = NULL;
7902                   SeqAlignListReverseStrand(salp);
7903                   AlnMgr2IndexSingleChildSeqAlign(salp);
7904                   salp->next = salp_tmp;
7905                   salp = salp_tmp;
7906                }
7907             } else
7908                minus = FALSE;
7909             SPI_GetNthSeqRangeInSASet(sfpcurr->sap, 1, &start1, &stop1);
7910             sap_new1 = sap_new2 = NULL;
7911             if (!minus)
7912                sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpcurr->start, start1, prevstart, start2, Seq_strand_minus, spot);
7913             else
7914                sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpcurr->start, start1, start2, prevstart, Seq_strand_plus, spot);
7915             if (sap_new1 != NULL)
7916             {
7917                SPI_GetNthSeqRangeInSASet(sap_new1, 2, &start2, &stop2);
7918                sap_new1->next = (SeqAlignPtr)(sfpcurr->sap->segs);
7919                sfpcurr->sap->segs = (Pointer)(sap_new1);
7920                AlnMgr2ReIndexSeqAlign(sfpcurr->sap);
7921                SPI_RemoveInconsistentAlnsFromSet(sfpcurr->sap, SPI_TEENYEXON, 2, SPI_LEFT);
7922             }
7923             if ((spi_isa_gap(start2, prevstart, Seq_strand_plus)) > SPI_TEENYEXON)
7924             /* look in fragments in the same group or lgroup, */
7925             /* up to the ones that already have hits */
7926             {
7927                if (sfpnearby != NULL)
7928                {
7929                   MemFree(sfpnearby);
7930                   sfpnearby = NULL;
7931                }
7932                j = SPI_GetNearbyFrags(sfpcurr, curr, &sfpnearby, sfhp, minus);
7933                found = FALSE;
7934                for (n=0; n<j && !found; n++)
7935                {
7936                   if (sfpnearby[n]->sap != NULL)
7937                   {
7938                      found = TRUE;
7939                      strand = AlnMgr2GetNthStrand(sfpnearby[n]->sap, 1);
7940                      if (strand == Seq_strand_minus)
7941                      {
7942                         salp = (SeqAlignPtr)(sfpnearby[n]->sap->segs);
7943                         while (salp != NULL)
7944                         {
7945                            salp_tmp = salp->next;
7946                            salp->next = NULL;
7947                            SAIndex2Free2(salp->saip);
7948                            salp->saip = NULL;
7949                            SeqAlignListReverseStrand(salp);
7950                            AlnMgr2IndexSingleChildSeqAlign(salp);
7951                            salp->next = salp_tmp;
7952                            salp = salp_tmp;
7953                         }
7954                         SPI_GetNthSeqRangeInSASet(sfpnearby[n]->sap, 1, &start3, &stop3);
7955                         SPI_GetNthSeqRangeInSASet(sfpnearby[n]->sap, 2, &start4, &stop4);
7956                         sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpnearby[n]->start, start4, stop4, start2, strand, spot);
7957                         if (sap_new1 != NULL)
7958                         {
7959                            sap_new1->next = (SeqAlignPtr)(sfpnearby[n]->sap->segs);
7960                            sfpnearby[n]->sap->segs = (Pointer)sap_new1;
7961                            AMAlignIndex2Free2(sfpnearby[n]->sap->saip);
7962                            sfpnearby[n]->sap->saip = NULL;
7963                            AlnMgr2IndexLite(sfpnearby[n]->sap);
7964                            SPI_RemoveInconsistentAlnsFromSet(sfpnearby[n]->sap, SPI_TEENYEXON, 2, SPI_LEFT);
7965                         }
7966                         salp = (SeqAlignPtr)(sfpnearby[n]->sap->segs);
7967                         while (salp != NULL)
7968                         {
7969                            salp_tmp = salp->next;
7970                            salp->next = NULL;
7971                            SAIndex2Free2(salp->saip);
7972                            salp->saip = NULL;
7973                            SeqAlignListReverseStrand(salp);
7974                            AlnMgr2IndexSingleChildSeqAlign(salp);
7975                            salp->next = salp_tmp;
7976                            salp = salp_tmp;
7977                         }
7978                      } else
7979                      {
7980                         SPI_GetNthSeqRangeInSASet(sfpnearby[n]->sap, 1, &start3, &stop3);
7981                         SPI_GetNthSeqRangeInSASet(sfpnearby[n]->sap, 2, &start4, &stop4);
7982                         sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, stop4, sfpnearby[n]->stop, stop3, start2, Seq_strand_plus, spot);
7983                         if (sap_new1 != NULL)
7984                         {
7985                            sap_new1->next = (SeqAlignPtr)(sfpnearby[n]->sap->segs);
7986                            sfpnearby[n]->sap->segs = (Pointer)sap_new1;
7987                            AMAlignIndex2Free2(sfpnearby[n]->sap->saip);
7988                            sfpnearby[n]->sap->saip = NULL;
7989                            AlnMgr2IndexLite(sfpnearby[n]->sap);
7990                            SPI_RemoveInconsistentAlnsFromSet(sfpnearby[n]->sap, SPI_TEENYEXON, 2, SPI_LEFT);
7991                         }
7992                      }
7993                   } else
7994                   {
7995                      sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpnearby[n]->start, sfpnearby[n]->stop, prevstart, start2, Seq_strand_plus, spot);
7996                      if (sap_new1 != NULL)
7997                      {
7998                         sap_tmp = sap_new1;
7999                         while (sap_tmp->next != NULL)
8000                         {
8001                            sap_tmp = sap_tmp->next;
8002                         }
8003                         sap_tmp->next = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpnearby[n]->start, sfpnearby[n]->stop, start2, prevstart, Seq_strand_minus, spot);
8004                      } else
8005                         sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpnearby[n]->start, sfpnearby[n]->stop, start2, prevstart, Seq_strand_minus, spot);
8006                      if (sap_new1 != NULL)
8007                      {
8008                         AMAlignIndex2Free2(sap_new1->saip);
8009                         sap_new1->saip = NULL;
8010                         AlnMgr2IndexLite(sap_new1);
8011                         sfpnearby[n]->sap = sap_new1;
8012                         SPI_RemoveInconsistentAlnsFromSet(sfpnearby[n]->sap, SPI_TEENYEXON, 2, SPI_LEFT);
8013                      }
8014                   }
8015                }
8016             }
8017             SPI_CleanupAndGetNewmRNARange(sfpnearby, j, &start3, &stop3);
8018             if (start3 != -1 && stop3 != -1)
8019             {
8020                start2 = start3;
8021                stop2 = stop3;
8022             }
8023             MemFree(sfpnearby);
8024             if ((spi_isa_gap(start2, prevstart, Seq_strand_plus)) > SPI_MINBLASTSIZE + 2)
8025             /* now look in all the fragments that don't have hits yet */
8026             {
8027                slp_mrna = SeqLocIntNew(prevstart+1, start2-1, Seq_strand_plus, bsp_mrna->id);
8028                slp_gen = SeqLocIntNew(spot->from, spot->to, Seq_strand_plus, bsp_contig->id);
8029                options = BLASTOptionNew("blastn", TRUE);
8030                options->wordsize = 7;
8031                options->filter_string = StringSave("m L");
8032                options->expect_value = spot->secpasseval;
8033                options->query_lcase_mask = spot->lcaseloc;
8034                if (spot->interspecies)
8035                {
8036                   options->gap_x_dropoff_final = 100;
8037                   options->gap_open = 4;
8038                   options->gap_extend = 1;
8039                   options->penalty = -1;
8040                }
8041                sap_b1 = BlastTwoSequencesByLoc(slp_mrna, slp_gen, "blastn", options);
8042                BLASTOptionDelete(options);
8043                SeqLocFree(slp_gen);
8044                slp_gen = SeqLocIntNew(spot->from, spot->to, Seq_strand_minus, bsp_contig->id);
8045                options = BLASTOptionNew("blastn", TRUE);
8046                options->wordsize = 7;
8047                options->filter_string = StringSave("m L");
8048                options->expect_value = spot->secpasseval;
8049                options->query_lcase_mask = spot->lcaseloc;
8050                if (spot->interspecies)
8051                {
8052                   options->gap_x_dropoff_final = 100;
8053                   options->gap_open = 4;
8054                   options->gap_extend = 1;
8055                   options->penalty = -1;
8056                }
8057                sap_b2 = BlastTwoSequencesByLoc(slp_mrna, slp_gen, "blastn", options);
8058                BLASTOptionDelete(options);
8059                SeqAlignListReverseStrand(sap_b2);
8060                SeqLocFree(slp_gen);
8061                SeqLocFree(slp_mrna);
8062                if (sap_b1 != NULL)
8063                {
8064                   sap_tmp = sap_b1;
8065                   while (sap_tmp->next != NULL)
8066                   {
8067                      sap_tmp = sap_tmp->next;
8068                   }
8069                   sap_tmp->next = sap_b2;
8070                } else
8071                   sap_b1 = sap_b2;
8072                SPI_flip_sa_list(sap_b1);
8073                if (sap_b1 != NULL)
8074                {
8075                   AlnMgr2SortAlnSetByNthRowPos(sap_b1, 1);
8076                   c = 0;
8077                   amaip = (AMAlignIndex2Ptr)(sap_b1->saip);
8078                   AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 1, &start_b, &stop_b);
8079                   for (i=0; i<orderedstart && c<amaip->numsaps; i++)
8080                   {
8081                      salp_tmp = salp_prev = NULL;
8082                      sfp = sfhp->sfparray[i];
8083                      while (sfp->start <= start_b && sfp->stop >= start_b && c<amaip->numsaps)
8084                      {
8085                         if (salp_tmp == NULL)
8086                            salp_tmp = salp_prev = SeqAlignDup(amaip->saps[c]);
8087                         else
8088                         {
8089                            salp_prev->next = SeqAlignDup(amaip->saps[c]);
8090                            salp_prev = salp_prev->next;
8091                         }
8092                         c++;
8093                         if (c<amaip->numsaps)
8094                            AlnMgr2GetNthSeqRangeInSA(amaip->saps[c], 1, &start_b, &stop_b);
8095                      }
8096                      if (salp_tmp != NULL)
8097                      {
8098                         AlnMgr2IndexLite(salp_tmp);
8099                         SPI_RemoveInconsistentAlnsFromSet(salp_tmp, SPI_FUZZ, 2, SPI_LEFT);
8100                         SeqAlignSetFree(sfp->sap);
8101                         sfp->sap = salp_tmp;
8102                         if (!SPI_ConnectAln(sfp->sap, spot, NULL, FALSE, TRUE))
8103                            return FALSE;
8104                         /* change all alignments to be on the plus strand of the mRNA */
8105                         strand = AlnMgr2GetNthStrand((SeqAlignPtr)(salp_tmp->segs), 2);
8106                         if (strand == Seq_strand_minus)
8107                            SeqAlignListReverseStrand((SeqAlignPtr)(salp_tmp->segs));
8108                      }
8109                   }
8110                }
8111                for (i=0; i<orderedstart; i++)
8112                {
8113                   if (sfhp->sfparray[i]->sap != NULL)
8114                      j++;
8115                }
8116                sfpnearby = (SPI_FragPtr PNTR)MemNew(j*sizeof(SPI_FragPtr));
8117                j = 0;
8118                for (i=0; i<orderedstart; i++)
8119                {
8120                   if (sfhp->sfparray[i]->sap != NULL)
8121                   {
8122                      sfpnearby[j] = sfhp->sfparray[i];
8123                      j++;
8124                   }
8125                }
8126                SPI_CleanupAndGetNewmRNARange(sfpnearby, j, &start3, &stop3);
8127             }
8128          }
8129          prevstart = stop2;
8130       }
8131       sfpprev = sfpcurr;
8132       curr++;
8133       if (curr == sfhp->numfrags)
8134          done = TRUE;
8135       else
8136          sfpcurr = sfhp->sfparray[curr];
8137    }
8138    return TRUE;
8139 }
8140 
8141 /***************************************************************************
8142 *
8143 *  SPI_CleanupAndGetNewmRNARange looks through all alignment sets of a
8144 *  group of fragments and removes overlapping alignment sets. Once the
8145 *  group of fragments is consistent, SPI_CleanupAndGetNewmRNARange gets
8146 *  the range of the mRNA sequence covered by all alignment sets of the
8147 *  fragment group.
8148 *
8149 ***************************************************************************/
/***************************************************************************
*
*  SPI_CleanupAndGetNewmRNARange
*
*  sfpnearby  array of n fragments; may be NULL when n is 0
*  n          number of entries in sfpnearby
*  start      receives the smallest row-2 start over the kept alignment
*             sets, or -1 when no set was kept
*  stop       receives the largest row-2 stop over the kept alignment
*             sets, or -1 when no set was kept
*
*  Side effects: the next pointer of every processed fragment is rewritten,
*  the alignment set of the shorter fragment of each conflicting pair is
*  freed and set to NULL, and, when at least one set was kept,
*  sfpnearby[0] is overwritten with the head of the resulting list.
*
***************************************************************************/
static void SPI_CleanupAndGetNewmRNARange(SPI_FragPtr PNTR sfpnearby, Int4 n, Int4Ptr start, Int4Ptr stop)
{
   Boolean      conflict;
   Int4         i;
   Int4         len1;
   Int4         len2;
   Int4         numconsistent;
   Int4         numsaps1;
   Int4         numsaps2;
   SPI_FragPtr  sfp;
   SPI_FragPtr  sfp_head;
   SPI_FragPtr  sfp_prev;
   Int4         start_m;
   Int4         start_m1;
   Int4         stop_m;
   Int4         stop_m1;
   Int4         tmpstart;
   Int4         tmpstop;

   *start = -1;
   *stop = -1;
   if (sfpnearby == NULL || n <= 0)
      return;
   numconsistent = 0;
   sfp_head = NULL;
   for (i=0; i<n; i++) /* first make the set self-consistent by removing overlapping */
   {                   /* sets of alignments among the fragments                     */
      /* a fragment that is already linked must not be linked a second time; */
      /* clearing its next pointer would cut the list short                  */
      for (sfp = sfp_head; sfp != NULL && sfp != sfpnearby[i]; sfp = sfp->next)
         ;
      if (sfp != NULL)
         continue;
      sfpnearby[i]->next = NULL;
      if (sfpnearby[i]->sap == NULL)
         continue;
      SPI_GetNthSeqRangeInSASet(sfpnearby[i]->sap, 2, &start_m, &stop_m);
      conflict = FALSE;
      sfp_prev = NULL; /* restart the trailing pointer for every scan */
      sfp = sfp_head;
      while (sfp != NULL && !conflict)
      {
         if (sfp->sap != NULL)
         {
            SPI_GetNthSeqRangeInSASet(sfp->sap, 2, &start_m1, &stop_m1);
            if ((start_m1 > start_m && stop_m1 < stop_m) ||
                (stop_m1 > start_m + SPI_TEENYEXON && start_m1 < start_m) ||
                (start_m1 < stop_m - SPI_TEENYEXON && stop_m1 > stop_m) ||
                (start_m1 < start_m && stop_m1 > stop_m))
               conflict = TRUE;
         }
         if (!conflict)
         {
            sfp_prev = sfp;
            sfp = sfp->next;
         }
      }
      if (!conflict) /* add the new one to the list */
      {
         sfpnearby[i]->next = sfp_head;
         sfp_head = sfpnearby[i];
         numconsistent++;
         continue;
      }
      /* sfp is the listed fragment in conflict; keep the longer of the two alignment sets */
      len1 = SPI_GetNthSeqLenInSASet(sfpnearby[i]->sap, 2, &numsaps1);
      len2 = SPI_GetNthSeqLenInSASet(sfp->sap, 2, &numsaps2);
      if (len1 > len2)
      {
         SeqAlignSetFree(sfp->sap);
         sfp->sap = NULL;
         /* new sfp takes the place of the conflicting one, which leaves the list */
         sfpnearby[i]->next = sfp->next;
         if (sfp_prev != NULL)
            sfp_prev->next = sfpnearby[i];
         else
            sfp_head = sfpnearby[i];
      } else /* new one gets its seqalign deleted */
      {
         SeqAlignSetFree(sfpnearby[i]->sap);
         sfpnearby[i]->sap = NULL;
      }
   }
   if (numconsistent == 0) /* shouldn't ever happen! */
      return;
   /* then get the start and stop of the mRNA across the set */
   for (sfp = sfp_head; sfp != NULL; sfp = sfp->next)
   {
      if (sfp->sap == NULL)
         continue;
      SPI_GetNthSeqRangeInSASet(sfp->sap, 2, &tmpstart, &tmpstop);
      if (tmpstart < *start || *start == -1)
         *start = tmpstart;
      if (tmpstop > *stop)
         *stop = tmpstop;
   }
   sfpnearby[0] = sfp_head;
}
8255 
8256 /***************************************************************************
8257 *
8258 *  SPI_GetNearbyFrags takes a fragment herd, a fragment which is the
8259 *  target (n is the number of the target fragment in the herd) and a
8260 *  SPI_FragPtr **, and fills in the ptrptr with a linked list of fragments
8261 *  that are in the same group or lgroup as the target fragment. These
8262 *  fragments are supposedly near the target fragment in the genomic
8263 *  sequence, and should be searched first for pieces missing from the
8264 *  alignment in the target fragment.
8265 *
8266 ***************************************************************************/
/***************************************************************************
*
*  SPI_GetNearbyFrags
*
*  Collects every fragment of the herd, other than number n, whose
*  position_orig->lgroup equals the target's and which lies before the
*  target (minus == FALSE) or after it (minus == TRUE) by group, or by
*  order within the target's own group. The selected fragments are chained
*  through their next pointers, highest (group, order) first when minus is
*  FALSE and lowest first when minus is TRUE, and are also returned in a
*  newly allocated array through ptrptr.
*
*  Returns the number of fragments selected. When that number is 0,
*  *ptrptr is left untouched. The next pointer of every fragment other
*  than number n is reset along the way.
*
***************************************************************************/
static Int4 SPI_GetNearbyFrags(SPI_FragPtr sfptarget, Int4 n, SPI_FragPtr ** ptrptr, SPI_FragHerdPtr sfhp, Boolean minus)
{
   SPI_FragPtr  cand;
   Int4         count;
   Boolean      goes_first;
   Int4         idx;
   SPI_FragPtr  list_head;
   Boolean      nearby;
   SPI_FragPtr  node;
   SPI_FragPtr  node_prev;
   SPI_FragPtr  PNTR result;

   count = 0;
   list_head = NULL;
   for (idx=0; idx<sfhp->numfrags; idx++)
   {
      if (idx == n)
         continue;
      cand = sfhp->sfparray[idx];
      cand->next = NULL;
      if (cand->position_orig->lgroup != sfptarget->position_orig->lgroup)
         continue;
      /* does the candidate lie on the side of the target that is being searched? */
      if (!minus)
      {
         nearby = (cand->position_orig->group < sfptarget->position_orig->group && cand->position_orig->lgroup != 0) ||
                  (cand->position_orig->group == sfptarget->position_orig->group && cand->position_orig->order < sfptarget->position_orig->order);
      } else
      {
         nearby = (cand->position_orig->group > sfptarget->position_orig->group && cand->position_orig->lgroup != 0) ||
                  (cand->position_orig->group == sfptarget->position_orig->group && cand->position_orig->order > sfptarget->position_orig->order);
      }
      if (!nearby)
         continue;
      count++;
      /* walk to the first listed fragment that the candidate has to precede */
      node_prev = NULL;
      node = list_head;
      while (node != NULL)
      {
         if (!minus)
         {
            goes_first = (cand->position_orig->group > node->position_orig->group) ||
                         (cand->position_orig->group == node->position_orig->group && cand->position_orig->order > node->position_orig->order);
         } else
         {
            goes_first = (cand->position_orig->group < node->position_orig->group) ||
                         (cand->position_orig->group == node->position_orig->group && cand->position_orig->order < node->position_orig->order);
         }
         if (goes_first)
            break;
         node_prev = node;
         node = node->next;
      }
      if (node_prev == NULL) /* empty list, or the candidate precedes everything */
      {
         cand->next = list_head;
         list_head = cand;
      } else
      {
         cand->next = node_prev->next;
         node_prev->next = cand;
      }
   }
   if (count == 0)
      return 0;
   /* copy the chain into an array for the caller */
   result = (SPI_FragPtr PNTR)MemNew(count*sizeof(SPI_FragPtr));
   idx = 0;
   node = list_head;
   while (idx < count && node != NULL)
   {
      result[idx] = node;
      idx++;
      node = node->next;
   }
   *ptrptr = result;
   return count;
}
8430 
8431 /***************************************************************************
8432 *
8433 *  SPI_AdjustSplicesInPieces first calls SPI_AdjustForSplice on each
8434 *  fragment's alignment set (first reversing those fragment's alignments
8435 *  that are on the minus strand of the genomic sequence, as the draft
8436 *  functions expect the mRNA to be on the plus strand but the finished
8437 *  functions expect the genomic sequence to be on the plus strand). Next,
8438 *  it calls SPI_AdjustEndsOfPieces for each adjacent pair of fragments;
8439 *  this function adjusts the initial and terminal exons of the adjacent
8440 *  pieces so that they abut exactly on the mRNA and they are next to
8441 *  acceptable splice sites.
8442 *
8443 ***************************************************************************/
/***************************************************************************
*
*  SPI_AdjustSplicesInPieces
*
*  sfhp         herd of fragments; the smp field of every fragment whose
*               alignment set holds more than one alignment is replaced by
*               the result of SPI_AdjustForSplice
*  bsp_genomic  genomic bioseq, passed through to SPI_AdjustEndsOfPieces
*  spot         options, passed through to the splice functions
*
*  Does nothing if the temporary region-info structure cannot be allocated.
*
***************************************************************************/
static void SPI_AdjustSplicesInPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_genomic, SPI_OptionsPtr spot)
{
   Int4               i;
   Int4               j;
   SeqAlignPtr        salp;
   SeqAlignPtr        salp_tmp;
   SPI_mRNAPtr        smp;
   SPI_RegionInfoPtr  srip;
   Uint1              tmp_acc;
   Uint1              tmp_don;

   /* first use standard functions to adjust internal splices */
   srip = (SPI_RegionInfoPtr)MemNew(sizeof(SPI_RegionInfo));
   if (srip == NULL)
      return;
   for (i=0; i<sfhp->numfrags; i++)
   {
      /* only alignment sets with at least two alignments are adjusted here */
      if (sfhp->sfparray[i]->sap == NULL || sfhp->sfparray[i]->sap->segs == NULL)
         continue;
      if (((SeqAlignPtr)(sfhp->sfparray[i]->sap->segs))->next == NULL)
         continue;
      salp_tmp = (SeqAlignPtr)(sfhp->sfparray[i]->sap->segs);
      srip->strand = AlnMgr2GetNthStrand(salp_tmp, 1);
      if (srip->strand == Seq_strand_minus)
      {
         /* reverse every alignment on its own, rebuilding its index */
         salp = (SeqAlignPtr)(sfhp->sfparray[i]->sap->segs);
         while (salp != NULL)
         {
            salp_tmp = salp->next;
            salp->next = NULL;
            SAIndex2Free2(salp->saip);
            salp->saip = NULL;
            SeqAlignListReverseStrand(salp);
            AlnMgr2IndexSingleChildSeqAlign(salp);
            salp->next = salp_tmp;
            salp = salp_tmp;
         }
      }
      if (sfhp->sfparray[i]->sap->saip == NULL)
         AlnMgr2IndexLite(sfhp->sfparray[i]->sap);
      smp = SPI_AdjustForSplice(sfhp->sfparray[i]->sap, spot, srip);
      sfhp->sfparray[i]->smp = smp;
      if (srip->strand == Seq_strand_minus) /* the exons will be in the wrong order now */
      {
         if (smp != NULL)
         {
            /* mirror the donor and acceptor arrays end for end */
            for (j=0; j<smp->numexons/2; j++)
            {
               tmp_don = smp->splicedon[smp->numexons-j-1];
               smp->splicedon[smp->numexons-j-1] = smp->splicedon[j];
               smp->splicedon[j] = tmp_don;
               tmp_acc = smp->spliceacc[smp->numexons-j-1];
               smp->spliceacc[smp->numexons-j-1] = smp->spliceacc[j];
               smp->spliceacc[j] = tmp_acc;
            }
         }
         /* put the alignments back on their original strand */
         salp = (SeqAlignPtr)(sfhp->sfparray[i]->sap->segs);
         while (salp != NULL)
         {
            salp_tmp = salp->next;
            salp->next = NULL;
            SAIndex2Free2(salp->saip);
            salp->saip = NULL;
            SeqAlignListReverseStrand(salp);
            AlnMgr2IndexSingleChildSeqAlign(salp);
            salp->next = salp_tmp;
            salp = salp_tmp;
         }
      }
   }
   MemFree(srip);
   /* now adjust the splice sites between fragments */
   for (i=0; i<sfhp->numfrags-1; i++)
   {
      if (sfhp->sfparray[i]->sap == NULL)
         continue;
      /* find the next fragment that has an alignment set */
      j = i+1;
      while (j<sfhp->numfrags && sfhp->sfparray[j]->sap == NULL)
      {
         j++;
      }
      /* j == numfrags means there is no such fragment; the array must */
      /* not be read at that index                                     */
      if (j < sfhp->numfrags)
         SPI_AdjustEndsOfPieces(sfhp->sfparray[i], sfhp->sfparray[j], bsp_genomic, spot);
   }
}
8527 
8528 /***************************************************************************
8529 *
8530 *  SPI_AdjustEndsOfPieces takes the last exon in the alignment of sfp1 and
8531 *  the first exon in the alignment of sfp2 and adjusts the boundaries so
8532 *  that the two exons abut exactly on the mRNA (if possible -- if a piece
8533 *  is missing, both exon boundaries are separately adjusted to good splice
8534 *  sites) and so that they are adjacent to good splice sites. After
8535 *  getting the possible splice sites, SPI_AdjustEndsOfPieces looks through
8536 *  the sites to determine which is the highest-scoring site that changes
8537 *  the alignments the least. If no pieces are missing (continuous is TRUE)
8538 *  then both alignments are truncated or extended to the splice site; if
8539 *  continuous is FALSE, the second alignment is adjusted separately to a
8540 *  good acceptor site that changes the alignment the least.
8541 *
8542 ***************************************************************************/
SPI_AdjustEndsOfPieces(SPI_FragPtr sfp1,SPI_FragPtr sfp2,BioseqPtr bsp_genomic,SPI_OptionsPtr spot)8543 static void SPI_AdjustEndsOfPieces(SPI_FragPtr sfp1, SPI_FragPtr sfp2, BioseqPtr bsp_genomic, SPI_OptionsPtr spot)
8544 {
8545    Boolean         continuous;
8546    Int4            f;
8547    SPI_FragSplPtr  fsp1;
8548    SPI_FragSplPtr  fsp2;
8549    Int4            i;
8550    FloatHi         maxsc;
8551    Int4            offset;
8552    Int4            ovl;
8553    Int4            pos;
8554    SeqAlignPtr     sap1;
8555    SeqAlignPtr     sap2;
8556    Int4            start1;
8557    Int4            start2;
8558    Int4            stop1;
8559    Int4            stop2;
8560    Uint1           strand1;
8561    Uint1           strand2;
8562 
8563    sap1 = SPI_GetNthSAByRow(sfp1->sap, 2, -1);
8564    sap2 = SPI_GetNthSAByRow(sfp2->sap, 2, 1);
8565    AlnMgr2GetNthSeqRangeInSA(sap1, 2, &start1, &stop1);
8566    AlnMgr2GetNthSeqRangeInSA(sap2, 2, &start2, &stop2);
8567    strand1 = AlnMgr2GetNthStrand(sap1, 1);
8568    strand2 = AlnMgr2GetNthStrand(sap2, 1);
8569    fsp1 = NULL;
8570    fsp2 = NULL;
8571    if (start2 - stop1 <= SPI_TEENYEXON) /* make mRNA continuous, nonoverlapping */
8572    {
8573       if (start2 - stop1 < 0)
8574          ovl = stop1 - start2;
8575       else
8576          ovl = start2 - stop1;
8577       if (ovl < SPI_TEENYEXON)
8578          ovl = SPI_TEENYEXON;
8579       fsp1 = SPI_GetPossibleSites(sap1, bsp_genomic, spot, TRUE, ovl);
8580       continuous = TRUE;
8581    } else /* just adjust ends to good splice sites, don't worry about continuity */
8582    {
8583       ovl = SPI_FUZZ;
8584       fsp1 = SPI_GetPossibleSites(sap1, bsp_genomic, spot, TRUE, SPI_FUZZ);
8585       fsp2 = SPI_GetPossibleSites(sap2, bsp_genomic, spot, FALSE, SPI_FUZZ);
8586       continuous = FALSE;
8587    }
8588    maxsc = 0;
8589    for (f=0; f<SPI_NUMSITES; f++)
8590    {
8591       pos = stop1 - ovl + fsp1->splarray[f].i + fsp1->spllen - fsp1->boundary;
8592       if (stop1 - pos < 0)
8593          fsp1->splarray[f].diff = pos - stop1;
8594       else
8595          fsp1->splarray[f].diff = stop1 - pos;
8596       if (continuous)
8597       {
8598          if (start2 - pos < 0)
8599          {
8600             if (pos - start2 > fsp1->splarray[f].diff)
8601                fsp1->splarray[f].diff = pos - start2;
8602          } else
8603          {
8604             if (start2 - pos > fsp1->splarray[f].diff)
8605                fsp1->splarray[f].diff = start2 - pos;
8606          }
8607       }
8608       if (pos - start1 <= SPI_TEENYEXON)
8609       {
8610          fsp1->splarray[f].score = 0;
8611          fsp1->splarray[f].diff = -1;
8612       }
8613       if (fsp1->splarray[f].diff > maxsc)
8614          maxsc = fsp1->splarray[f].diff;
8615    }
8616    offset = ovl - fsp1->spllen + fsp1->boundary;
8617    i = 0;
8618    for (f=0; f<SPI_NUMSITES; f++)
8619    {
8620       if (fsp1->splarray[f].diff <= maxsc && fsp1->splarray[f].score > 0 && fsp1->splarray[f].diff >= 0)
8621       {
8622          maxsc = fsp1->splarray[f].diff;
8623          offset = fsp1->splarray[f].i;
8624          i = f;
8625       }
8626    }
8627    if (fsp1->splarray[i].score >= 0.00001)
8628       sfp1->donor = 1;
8629    else  /* if don't find a good site, don't change the alignment */
8630       offset = ovl - fsp1->spllen + fsp1->boundary;
8631    pos = stop1 - ovl + offset + fsp1->spllen - fsp1->boundary;
8632    if (strand1 == Seq_strand_minus)
8633    {
8634       sap1->next = NULL;
8635       SAIndex2Free2(sap1->saip);
8636       sap1->saip = NULL;
8637       SeqAlignListReverseStrand(sap1);
8638       AlnMgr2IndexSingleChildSeqAlign(sap1);
8639       if (pos < stop1)
8640       {
8641          if (AlnMgr2TruncateSeqAlign(sap1, start1, pos, 2))
8642          {
8643             sap1->next->next = NULL;
8644             SeqAlignFree(sap1->next);
8645             sap1->next = NULL;
8646          }
8647       } else if (pos > stop1)
8648          SPI_AddToAln(sap1, pos - stop1, SPI_LEFT, strand1);
8649       sap1->next = NULL;
8650       SAIndex2Free2(sap1->saip);
8651       sap1->saip = NULL;
8652       SeqAlignListReverseStrand(sap1);
8653       AlnMgr2IndexSingleChildSeqAlign(sap1);
8654    } else
8655    {
8656       if (pos < stop1)
8657       {
8658          if (AlnMgr2TruncateSeqAlign(sap1, start1, pos, 2))
8659          {
8660             sap1->next->next = NULL;
8661             SeqAlignFree(sap1->next);
8662             sap1->next = NULL;
8663          }
8664       } else if (pos > stop1)
8665          SPI_AddToAln(sap1, pos - stop1, SPI_RIGHT, strand1);
8666    }
8667    if (!continuous) /* find a decent acceptor site among the ones returned */
8668    {
8669       maxsc = 0;
8670       for (f=0; f<SPI_NUMSITES; f++)
8671       {
8672          pos = start2 - ovl + fsp2->splarray[f].i + fsp2->spllen - fsp2->boundary;
8673          if (start2 - pos < 0)
8674             fsp1->splarray[f].diff = pos - start2;
8675          else
8676             fsp1->splarray[f].diff = start2 - pos;
8677          if (pos - stop2 <= SPI_TEENYEXON || stop2 - pos <= SPI_TEENYEXON)
8678          {
8679             fsp2->splarray[f].score = 0;
8680             fsp2->splarray[f].diff = -1;
8681          }
8682          if (fsp2->splarray[f].diff > maxsc)
8683             maxsc = fsp2->splarray[f].diff;
8684       }
8685       offset = ovl - fsp2->spllen + fsp2->boundary;
8686       i = 0;
8687       for (f=0; f<SPI_NUMSITES; f++)
8688       {
8689          if (fsp2->splarray[f].diff <= maxsc && fsp2->splarray[f].score > 0 && fsp2->splarray[f].diff >= 0)
8690          {
8691             maxsc = fsp2->splarray[f].diff;
8692             offset = fsp2->splarray[f].i;
8693             i = f;
8694          }
8695       }
8696       if (fsp2->splarray[i].score >= 0.0000002)
8697          sfp2->acceptor = 1;
8698       else  /* if don't find a good site, don't change the alignment */
8699          offset = ovl - fsp2->spllen + fsp2->boundary;
8700       pos = start2 - ovl + offset + fsp2->spllen - fsp2->boundary;
8701    }
8702    if (strand2 == Seq_strand_minus)
8703    {
8704       sap2->next = NULL;
8705       SAIndex2Free2(sap2->saip);
8706       sap2->saip = NULL;
8707       SeqAlignListReverseStrand(sap2);
8708       AlnMgr2IndexSingleChildSeqAlign(sap2);
8709       if (start2 < pos + 1)
8710       {
8711          if (AlnMgr2TruncateSeqAlign(sap2, pos+1, stop2, 2))
8712          {
8713             sap2->next->next = NULL;
8714             SeqAlignFree(sap2->next);
8715             sap2->next = NULL;
8716          }
8717       } else if (start2 > pos + 1)
8718          SPI_AddToAln(sap2, start2-pos-1, SPI_RIGHT, strand2);
8719       sap2->next = NULL;
8720       SAIndex2Free2(sap2->saip);
8721       sap2->saip = NULL;
8722       SeqAlignListReverseStrand(sap2);
8723       AlnMgr2IndexSingleChildSeqAlign(sap2);
8724    } else
8725    {
8726       if (start2 < pos + 1)
8727       {
8728          if (AlnMgr2TruncateSeqAlign(sap2, pos+1, stop2, 2))
8729          {
8730             sap2->next->next = NULL;
8731             SeqAlignFree(sap2->next);
8732             sap2->next = NULL;
8733          }
8734       } else if (start2 > pos + 1)
8735          SPI_AddToAln(sap2, start2-pos-1, SPI_LEFT, strand2);
8736    }
8737    if (continuous) /* check to see whether current breakpoint has a good acceptor site */
8738    {
8739       fsp2 = SPI_GetPossibleSites(sap2, bsp_genomic, spot, FALSE, 0);
8740       if (fsp2->splarray[0].score >= 0.0000002)
8741          sfp2->acceptor = 1;
8742    }
8743    SPI_FragSplFree(fsp1);
8744    SPI_FragSplFree(fsp2);
8745 }
8746 
8747 /***************************************************************************
8748 *
8749 *  SPI_GetNthSAByRow is a useful utility function that sorts a set
8750 *  of alignments by position on the 'row'th row and then retrieves the
8751 *  nth of those alignments. If n is -1, the last alignment is
8752 *  retrieved.
8753 *
8754 ***************************************************************************/
SPI_GetNthSAByRow(SeqAlignPtr sap,Int4 row,Int4 n)8755 static SeqAlignPtr SPI_GetNthSAByRow(SeqAlignPtr sap, Int4 row, Int4 n)
8756 {
8757 /* n = 1 is first alignment, n = -1 is last alignment */
8758    AMAlignIndex2Ptr  amaip;
8759    Int4             i;
8760    SeqAlignPtr      sap_place;
8761    SeqAlignPtr      PNTR saparray;
8762    SeqAlignPtr      PNTR saparray_tmp;
8763 
8764    if (sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
8765       return NULL;
8766    amaip = (AMAlignIndex2Ptr)(sap->saip);
8767    if (n > amaip->numsaps)
8768       return NULL;
8769    saparray = (SeqAlignPtr PNTR)MemNew(amaip->numsaps*sizeof(SeqAlignPtr));
8770    saparray_tmp = amaip->saps;
8771    for (i=0; i<amaip->numsaps; i++)
8772    {
8773       saparray[i] = amaip->saps[i];
8774    }
8775    amaip->saps = saparray;
8776    AlnMgr2SortAlnSetByNthRowPos(sap, row);
8777    if (n > 0)
8778       sap_place = amaip->saps[n-1];
8779    else
8780       sap_place = amaip->saps[amaip->numsaps-1];
8781    amaip->saps = saparray_tmp;
8782    MemFree(saparray);
8783    return sap_place;
8784 }
8785 
8786 /***************************************************************************
8787 *
8788 *  SPI_GetPossibleSites returns the SPI_NUMSITES best donor or acceptor
8789 *  splice sites for an exon (defined by an alignment), within a range
8790 *  defined by the variable ovl. First, the donor or acceptor site
8791 *  consensus length and position of the splice junction is retrieved
8792 *  for the appropriate organism. Then, the interval around the 5' or 3'
8793 *  end of the alignment (dictated by whether the site is a donor (5') or
8794 *  acceptor (3') site) is examined and the SPI_NUMSITES best sites are
8795 *  stored in the SPI_FragSpl structure and returned.
8796 *
8797 ***************************************************************************/
SPI_GetPossibleSites(SeqAlignPtr sap,BioseqPtr bsp_genomic,SPI_OptionsPtr spot,Boolean donor,Int4 ovl)8798 static SPI_FragSplPtr SPI_GetPossibleSites(SeqAlignPtr sap, BioseqPtr bsp_genomic, SPI_OptionsPtr spot, Boolean donor, Int4 ovl)
8799 {
8800    Int4            boundary;
8801    Uint1Ptr        buf;
8802    Int4            c;
8803    Int4            f;
8804    SPI_FragSplPtr  fsp;
8805    Int4            i;
8806    FloatHi         maxsc = 0;
8807    Uint1           res;
8808    FloatHi         score;
8809    SPI_SplicePtr   splarray;
8810    Int4            spllen;
8811    SeqPortPtr      spp;
8812    Int4            start;
8813    Int4            stop;
8814    Uint1           strand;
8815 
8816    strand = AlnMgr2GetNthStrand(sap, 1);
8817    fsp = (SPI_FragSplPtr)MemNew(sizeof(SPI_FragSpl));
8818    AlnMgr2GetNthSeqRangeInSA(sap, 1, &start, &stop);
8819    if (donor)
8820       SPI_GetDonorSpliceInfo(spot->organism, &spllen, &boundary, spot);
8821    else
8822       SPI_GetAcceptorSpliceInfo(spot->organism, &spllen, &boundary, spot);
8823    if (strand != Seq_strand_minus)
8824    {
8825       if (donor)
8826          spp = SeqPortNew(bsp_genomic, stop-ovl, stop+ovl+spllen, strand, Seq_code_ncbi4na);
8827       else
8828          spp = SeqPortNew(bsp_genomic, start-ovl-spllen, start+ovl, strand, Seq_code_ncbi4na);
8829    } else
8830    {
8831       if (donor)
8832          spp = SeqPortNew(bsp_genomic, start-ovl-spllen, start+ovl, strand, Seq_code_ncbi4na);
8833       else
8834          spp = SeqPortNew(bsp_genomic, stop-ovl, stop+ovl+spllen, strand, Seq_code_ncbi4na);
8835    }
8836    i = 0;
8837    buf = (Uint1Ptr)MemNew((2*ovl+spllen+(spllen-boundary))*sizeof(Uint1));
8838    splarray = (SPI_SplicePtr)MemNew(SPI_NUMSITES*sizeof(SPI_Splice));
8839    for (f=0; f<SPI_NUMSITES; f++)
8840    {
8841       splarray[f].i = 0;
8842       splarray[f].score = -2;
8843    }
8844    while (((res = SeqPortGetResidue(spp)) != SEQPORT_EOF) && i<(2*ovl+1+spllen))
8845    {
8846       if (res == 1)
8847          buf[i] = 0;
8848       else if (res == 2)
8849          buf[i] = 1;
8850       else if (res == 4)
8851          buf[i] = 2;
8852       else if (res == 8)
8853          buf[i] = 3;
8854       else
8855          buf[i] = 4;
8856       i++;
8857    }
8858    SeqPortFree(spp);
8859    for (i=0; i<2*ovl+(spllen-boundary); i++)
8860    {
8861       if (donor)
8862          SPI_is_donor(buf+i, spllen, &score, spot->organism);
8863       else
8864          SPI_is_acceptor(buf+i, spllen, &score, spot->organism);
8865       c = 0;
8866       if (score > 0.000001)
8867       {
8868          for (f=0; f<SPI_NUMSITES; f++)
8869          {
8870             if (f == 0)
8871                maxsc = splarray[f].score;
8872             else if (splarray[f].score < maxsc)
8873             {
8874                maxsc = splarray[f].score;
8875                c = f;
8876             }
8877          }
8878          if (score > splarray[c].score)
8879          {
8880             splarray[c].score = score;
8881             splarray[c].i = i;
8882          }
8883       }
8884    }
8885    MemFree(buf);
8886    fsp->splarray = splarray;
8887    fsp->spllen = spllen;
8888    fsp->boundary = boundary;
8889    return fsp;
8890 }
8891 
/* SPI_FragSplFree releases an SPI_FragSpl together with the splarray it */
/* points to. A NULL argument is accepted and ignored.                   */
static void SPI_FragSplFree(SPI_FragSplPtr fsp)
{
   if (fsp != NULL)
   {
      MemFree(fsp->splarray);
      MemFree(fsp);
   }
}
8899 
8900 
8901 /***************************************************************************
8902 *
8903 *  SPI_RemoveInconsistentAlnsFromSet is a greedy algorithm that first
8904 *  sorts the alignments by score, then takes the highest-scoring
8905 *  alignment and compares it to the next-highest-scoring alignment, which
8906 *  is deleted if it is contained; on subsequent loops each next-highest-
8907 *  scoring alignment is compared to the set of alignments that have
8908 *  been kept. The alignments can be sorted along the first or
8909 *  second sequence; the alignments will be reversed so that they are
8910 *  all on the plus strand of the sequence to be examined.
8911 *  The input alignment must be indexed at least at the LITE level;
8912 *  conflicting child alignments will be deleted, not hidden, by this
8913 *  function.  This function assumes that all children have the same two
8914 *  rows. The 'compact' parameter tells the function whether to try to
8915 *  keep alignments that are more to the left in genomic coordinates, or
8916 *  more to the right.
8917 *
8918 ***************************************************************************/
SPI_RemoveInconsistentAlnsFromSet(SeqAlignPtr sap,Int4 fuzz,Int4 n,Int4 compact)8919 NLM_EXTERN void SPI_RemoveInconsistentAlnsFromSet(SeqAlignPtr sap, Int4 fuzz, Int4 n, Int4 compact)
8920 {
8921    AMAlignIndex2Ptr  amaip;
8922    Boolean          conflict;
8923    Int4             curr;
8924    Int4             i;
8925    Int4             indextype;
8926    SeqAlignPtr      salp;
8927    SeqAlignPtr      salp_head;
8928    SeqAlignPtr      salp_prev;
8929    SPI_nPtr         PNTR spin;
8930    Int4             start;
8931    Int4             stop;
8932    Int4             strand;
8933 
8934    if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
8935       return;
8936    if (n > 2)
8937       return;
8938    amaip = (AMAlignIndex2Ptr)(sap->saip);
8939    indextype = amaip->alnstyle;
8940    /* make sure that everything is on the plus strand of the nth sequence */
8941    for (i=0; i<amaip->numsaps; i++)
8942    {
8943       salp = amaip->saps[i];
8944       strand = AlnMgr2GetNthStrand(salp, n);
8945       if (strand == Seq_strand_minus)
8946       {
8947          SAIndex2Free2(salp->saip);
8948          salp->saip = NULL;
8949          salp->next = NULL;
8950          SeqAlignListReverseStrand(salp);
8951          AlnMgr2IndexSingleChildSeqAlign(salp);
8952       }
8953    }
8954    /* spin structure: n1 = which alignment, n2 = start on first row, n3 =
8955       alignment length on 1st row, n4 = start on 2nd row, n5 = 2nd strand */
8956    spin = (SPI_nPtr PNTR)MemNew((amaip->numsaps)*sizeof(SPI_nPtr));
8957    for (i=0; i<amaip->numsaps; i++)
8958    {
8959       spin[i] = (SPI_nPtr)MemNew(sizeof(SPI_n));
8960       salp = amaip->saps[i];
8961       spin[i]->n1 = i;
8962       AlnMgr2GetNthSeqRangeInSA(salp, n, &start, &stop);
8963       spin[i]->n3 = stop - start;
8964       spin[i]->n2 = start;
8965       AlnMgr2GetNthSeqRangeInSA(salp, 3-n, &start, &stop);
8966       spin[i]->n4 = start;
8967       strand = AlnMgr2GetNthStrand(salp, 3-n);
8968       if (strand == Seq_strand_minus)
8969          spin[i]->n5 = -1;
8970       else
8971          spin[i]->n5 = 1;
8972       spin[i]->n6 = compact;
8973    }
8974    HeapSort((Pointer)spin, (size_t)(amaip->numsaps), sizeof(SPI_nPtr), SPI_CompareSpins);
8975    strand = spin[0]->n5;
8976    for (i=1; i<amaip->numsaps; i++)
8977    {
8978       if (spin[i]->n5 != strand)
8979       {
8980          salp = amaip->saps[spin[i]->n1];
8981          salp->next = NULL;
8982          SeqAlignFree(salp);
8983          amaip->saps[spin[i]->n1] = NULL;
8984          spin[i]->n1 = -1;
8985       }
8986    }
8987    for (curr=0; curr<amaip->numsaps; curr++)
8988    {
8989       if (spin[curr]->n1 != -1)
8990       {
8991          for (i=curr+1; i<amaip->numsaps; i++)
8992          {
8993             if (spin[i]->n1 != -1)
8994             {
8995                conflict = FALSE;
8996             /* check first for conflict on first row */
8997                if (spin[i]->n2 + spin[i]->n3 - 1 >= spin[curr]->n2 + fuzz)
8998                {
8999                   if (spin[i]->n2 <= spin[curr]->n2 + fuzz)
9000                      conflict = TRUE;
9001                }
9002                if (spin[i]->n2 <= spin[curr]->n2 + spin[curr]->n3 - 1 - fuzz)
9003                {
9004                   if (spin[i]->n2 + spin[i]->n3 - 1 >= spin[curr]->n2 + spin[curr]->n3 - 1)
9005                      conflict = TRUE;
9006                }
9007                if (spin[i]->n2 >= spin[curr]->n2)
9008                {
9009                   if (spin[i]->n2 + spin[i]->n3 - 1 <= spin[curr]->n2 + spin[curr]->n3 - 1)
9010                      conflict = TRUE;
9011                }
9012             /* then check for conflict and consistency on second row */
9013                if (spin[i]->n4 + spin[i]->n3-1 >= spin[curr]->n4 + fuzz)
9014                {
9015                   if (spin[i]->n4 <= spin[curr]->n4 + fuzz)
9016                      conflict = TRUE;
9017                }
9018                if (spin[i]->n4 <= spin[curr]->n4 + spin[curr]->n3 - 1 - fuzz)
9019                {
9020                   if (spin[i]->n4 + spin[i]->n3 - 1 > spin[curr]->n4 + fuzz)
9021                      conflict = TRUE;
9022                }
9023                if (spin[i]->n4 >= spin[curr]->n4)
9024                {
9025                   if (spin[i]->n4 + spin[i]->n3 - 1 <= spin[curr]->n4 + spin[curr]->n3 - 1)
9026                      conflict = TRUE;
9027                }
9028                if (spin[i]->n2 + spin[i]->n3 - 1 <= spin[curr]->n2 + fuzz)
9029                {
9030                   if (strand == 1)
9031                   {
9032                      if (spin[i]->n4 + spin[i]->n3 - 1 >= spin[curr]->n4 + fuzz)
9033                         conflict = TRUE;
9034                   } else if (strand == -1)
9035                   {
9036                      if (spin[curr]->n4 + spin[curr]->n3 - 1 - fuzz >= spin[i]->n4)
9037                         conflict = TRUE;
9038                   }
9039                } else
9040                {
9041                   if (strand == 1)
9042                   {
9043                      if (spin[i]->n4 <= spin[curr]->n4 + spin[curr]->n3 - fuzz)
9044                         conflict = TRUE;
9045                   } else if (strand == -1)
9046                   {
9047                      if (spin[i]->n4 + spin[i]->n3 - 1 - fuzz >= spin[curr]->n4)
9048                         conflict = TRUE;
9049                   }
9050                }
9051                if (conflict)
9052                {
9053                   salp = amaip->saps[spin[i]->n1];
9054                   salp->next = NULL;
9055                   SeqAlignFree(salp);
9056                   amaip->saps[spin[i]->n1] = NULL;
9057                   spin[i]->n1 = -1;
9058                }
9059             }
9060          }
9061       }
9062    }
9063    salp_head = salp_prev = NULL;
9064    for (i=0; i<amaip->numsaps; i++)
9065    {
9066       MemFree(spin[i]);
9067       if (amaip->saps[i] != NULL)
9068       {
9069          amaip->saps[i]->next = NULL;
9070          if (salp_prev != NULL)
9071          {
9072             salp_prev->next = amaip->saps[i];
9073             salp_prev = salp_prev->next;
9074          } else
9075             salp_head = salp_prev = amaip->saps[i];
9076       }
9077    }
9078    sap->segs = (Pointer)(salp_head);
9079    if (indextype == AM2_LITE)
9080    {
9081       AMAlignIndex2Free2(sap->saip);
9082       sap->saip = NULL;
9083       AlnMgr2IndexLite(sap);
9084    } else
9085       AlnMgr2ReIndexSeqAlign(sap);
9086    MemFree(spin);
9087 }
9088 
9089 /***************************************************************************
9090 *
9091 *  SPI_CompareSpins is the HeapSort callback for
9092 *  SPI_RemoveInconsistentAlnsFromSet. It compares first the alignment
9093 *  length on the first row, then the alignment start on the first row.
9094 *
9095 ***************************************************************************/
SPI_CompareSpins(VoidPtr ptr1,VoidPtr ptr2)9096 static int LIBCALLBACK SPI_CompareSpins(VoidPtr ptr1, VoidPtr ptr2)
9097 {
9098    SPI_nPtr  spin1;
9099    SPI_nPtr  spin2;
9100 
9101    spin1 = *((SPI_nPtr PNTR) ptr1);
9102    spin2 = *((SPI_nPtr PNTR) ptr2);
9103    if (spin1 == NULL || spin2 == NULL)
9104       return 0;
9105    if (spin1->n3 > spin2->n3)
9106       return -1;
9107    if (spin1->n3 < spin2->n3)
9108       return 1;
9109    if (spin1->n6 == SPI_RIGHT)
9110    {
9111       if (spin1->n2 > spin2->n2)
9112          return -1;
9113       if (spin1->n2 < spin2->n2)
9114          return 1;
9115    } else if (spin1->n6 == SPI_LEFT)
9116    {
9117       if (spin1->n2 < spin2->n2)
9118          return -1;
9119       if (spin1->n2 > spin2->n2)
9120          return 1;
9121    }
9122    return 0;
9123 }
9124 
9125 /***************************************************************************
9126 *
9127 *  SPI_OrderInternally takes a herd of fragments and their alignments
9128 *  and sorts the alignments for each fragment by their start positions
9129 *  on the mRNA sequence.
9130 *
9131 ***************************************************************************/
SPI_OrderInternally(SPI_FragHerdPtr sfhp)9132 static void SPI_OrderInternally(SPI_FragHerdPtr sfhp)
9133 {
9134    AMAlignIndex2Ptr  amaip;
9135    Int4             i;
9136    Int4             j;
9137    SeqAlignPtr      salp;
9138    Uint1            strand;
9139 
9140    for (i=0; i<sfhp->numfrags; i++)
9141    {
9142       if (sfhp->sfparray[i]->sap != NULL)
9143       {
9144          amaip = (AMAlignIndex2Ptr)(sfhp->sfparray[i]->sap->saip);
9145          salp = (SeqAlignPtr)(sfhp->sfparray[i]->sap->segs);
9146          while (salp != NULL)
9147          {
9148             strand = AlnMgr2GetNthStrand(salp, 2);
9149             if (strand == Seq_strand_minus)
9150             {
9151                SAIndex2Free2(salp->saip);
9152                salp->saip = NULL;
9153                salp->next = NULL;
9154                SeqAlignListReverseStrand(salp);
9155                AlnMgr2IndexSingleChildSeqAlign(salp);
9156             }
9157             salp = salp->next;
9158          }
9159          if (amaip->numsaps > 1)
9160          {
9161             HeapSort((Pointer)(amaip->saps), (size_t)(amaip->numsaps), sizeof(SeqAlignPtr), SPI_CompareAlnPos);
9162             for (j=0; j<amaip->numsaps-1; j++)
9163             {
9164                amaip->saps[j]->next = amaip->saps[j+1];
9165                amaip->saps[j+1]->next = NULL;
9166             }
9167             sfhp->sfparray[i]->sap->segs = (Pointer)(amaip->saps[0]);
9168          }
9169       }
9170    }
9171 }
9172 
9173 /***************************************************************************
9174 *
9175 *  SPI_CompareAlnPos is the callback for the HeapSort in
9176 *  SPI_OrderInternally. It compares the start positions on the mRNA
9177 *  sequence of two alignments, and puts the 5'-most alignment first.
9178 *
9179 ***************************************************************************/
SPI_CompareAlnPos(VoidPtr ptr1,VoidPtr ptr2)9180 static int LIBCALLBACK SPI_CompareAlnPos(VoidPtr ptr1, VoidPtr ptr2)
9181 {
9182    SeqAlignPtr  sap1;
9183    SeqAlignPtr  sap2;
9184    Int4         start1;
9185    Int4         start2;
9186 
9187    sap1 = *((SeqAlignPtr PNTR) ptr1);
9188    sap2 = *((SeqAlignPtr PNTR) ptr2);
9189    if (sap1 == NULL || sap2 == NULL)
9190       return 0;
9191    AlnMgr2GetNthSeqRangeInSA(sap1, 2, &start1, NULL);
9192    AlnMgr2GetNthSeqRangeInSA(sap2, 2, &start2, NULL);
9193    if (start1 <= start2)
9194       return -1;
9195    else if (start2 > start1)
9196       return 1;
9197    return 0;
9198 }
9199 
9200 /***************************************************************************
9201 *
9202 *  SPI_GetResultsForCDS takes a completed mRNA-to-genomic alignment,
9203 *  extracts the CDS annotation for the mRNA, then truncates the mRNA
9204 *  alignment appropriately to create a CDS alignment.  Most of the
9205 *  mRNA information (splice sites, etc) can simply be duplicated, but
9206 *  the first and last exons are often truncated, so their information
9207 *  must be recomputed. Since the CDS is known, the UTRs are known, so
9208 *  the 5' and 3' UTR %identities are calculated as well.
9209 *
9210 ***************************************************************************/
SPI_GetResultsForCDS(SPI_RegionInfoPtr srip_mrna,BioseqPtr bsp_mrna,SPI_OptionsPtr spot)9211 static SPI_RegionInfoPtr SPI_GetResultsForCDS(SPI_RegionInfoPtr srip_mrna, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
9212 {
9213    Int4               b;
9214    Int4               c;
9215    SeqMgrFeatContext  context;
9216    BoolPtr            featDefFilter;
9217    Int4               i;
9218    Int4               mis;
9219    Int4               len;
9220    Int4               offset;
9221    SeqAlignPtr        sap;
9222    SPI_mRNAPtr        smp;
9223    SPI_nPtr           spin;
9224    SPI_RegionInfoPtr  srip_cds;
9225    Int4               start_cds;
9226    Int4               stop_cds;
9227    Int4Ptr            tmpmstarts;
9228    Int4Ptr            tmpmstops;
9229    Int4               tmp1;
9230    Int4               tmp2;
9231    Int4               tmp3;
9232 
9233    if (srip_mrna == NULL || srip_mrna->revcomp == TRUE)
9234       return NULL;
9235    SeqMgrIndexFeatures(0, (Pointer)bsp_mrna);
9236    featDefFilter = (BoolPtr)MemNew((FEATDEF_MAX)*sizeof(Boolean));
9237    featDefFilter[FEATDEF_CDS] = TRUE;
9238    spin = (SPI_nPtr)MemNew(sizeof(SPI_n));
9239    SeqMgrExploreFeatures(bsp_mrna, (Pointer)spin, SPI_GetCDS, NULL, NULL, featDefFilter);
9240    MemFree(featDefFilter);
9241    context.left = spin->n1;
9242    context.right = spin->n2;
9243    if (context.right == 0)
9244       return NULL;
9245    start_cds = stop_cds = -1;
9246    offset = context.left;
9247    if (srip_mrna->smp->numexons == 1 || srip_mrna->smp->mstarts[0] < srip_mrna->smp->mstarts[1])
9248    {
9249       tmp1 = srip_mrna->smp->mstarts[0]-1;
9250       tmp2 = srip_mrna->smp->mstops[srip_mrna->smp->numexons-1]-1;
9251    } else
9252    {
9253       tmp1 = srip_mrna->smp->mstarts[srip_mrna->smp->numexons-1]-1;
9254       tmp2 = srip_mrna->smp->mstops[0]-1;
9255    }
9256    if (tmp2 < tmp1)
9257    {
9258       tmp3 = tmp2;
9259       tmp2 = tmp1;
9260       tmp1 = tmp3;
9261    }
9262    if (context.left > tmp2 || context.right < tmp1) /* cds not contained in model */
9263       return NULL;
9264    if (srip_mrna->smp->strand != Seq_strand_minus)
9265    {
9266       for (i=0; i<srip_mrna->smp->numexons; i++)
9267       {
9268          if (context.left >= srip_mrna->smp->mstarts[i]-1 && context.left <= srip_mrna->smp->mstops[i]-1)
9269             start_cds = i;
9270          if (context.right >= srip_mrna->smp->mstarts[i]-1 && context.right <= srip_mrna->smp->mstops[i]-1)
9271             stop_cds = i;
9272       }
9273       smp = (SPI_mRNAPtr)MemNew(sizeof(SPI_mRNA));
9274       smp->fallsoff = SPI_NEITHER;
9275       if (start_cds == -1) /* mRNA alignment doesn't include beginning of CDS */
9276       {
9277          start_cds = 0;
9278          smp->fallsoff = SPI_LEFT;
9279       }
9280       if (stop_cds == -1) /* mRNA alignment doesn't include end of CDS */
9281       {
9282          stop_cds = srip_mrna->smp->numexons-1;
9283          if (smp->fallsoff == SPI_LEFT)
9284             smp->fallsoff = SPI_BOTH;
9285          else
9286             smp->fallsoff = SPI_RIGHT;
9287       }
9288       srip_cds = (SPI_RegionInfoPtr)MemNew(sizeof(SPI_RegionInfo));
9289       srip_cds->smp = smp;
9290       srip_cds->mlen = abs(context.left - context.right) + 1;
9291       smp->numexons = stop_cds - start_cds + 1;
9292       smp->strand = srip_mrna->smp->strand;
9293       smp->exonid = (FloatHiPtr)MemNew((smp->numexons)*sizeof(FloatHi));
9294       smp->exongaps = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9295       smp->splicedon = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
9296       smp->spliceacc = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
9297       smp->mstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9298       smp->mstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9299       smp->gstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9300       smp->gstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9301       smp->saps = (SeqAlignPtr PNTR)MemNew((smp->numexons)*sizeof(SeqAlignPtr));
9302       if (srip_mrna->smp->mstarts[start_cds]-1 < offset)
9303          smp->mstarts[0] = 0;
9304       else
9305          smp->mstarts[0] = srip_mrna->smp->mstarts[start_cds]-1 - offset;
9306       if (smp->numexons > 1)
9307          smp->mstops[0] = srip_mrna->smp->mstops[start_cds]-1 - offset;
9308       for (i=1; i<smp->numexons-1; i++)
9309       {
9310          smp->mstarts[i] = srip_mrna->smp->mstarts[i+start_cds]-1 - offset;
9311          smp->mstops[i] = srip_mrna->smp->mstops[i+start_cds]-1 - offset;
9312       }
9313       if (smp->numexons > 1)
9314          smp->mstarts[smp->numexons-1] = srip_mrna->smp->mstarts[smp->numexons-1 + start_cds]-1 - offset;
9315       if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_LEFT)
9316          smp->mstops[smp->numexons-1] = context.right - offset;
9317       else
9318          smp->mstops[smp->numexons-1] = srip_mrna->smp->mstops[smp->numexons-1 + start_cds]-1 - offset;
9319       smp->polyAtail = 0; /* no polyA on a CDS */
9320       /* now copy the splice information and truncate the alignments */
9321       for (i=0; i<smp->numexons; i++)
9322       {
9323          smp->splicedon[i] = srip_mrna->smp->splicedon[i+start_cds];
9324          smp->spliceacc[i] = srip_mrna->smp->spliceacc[i+start_cds];
9325       }
9326       if (smp->numexons > 1)
9327       {
9328          smp->saps[0] = SeqAlignDup(srip_mrna->smp->saps[start_cds]);
9329          if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_RIGHT)
9330          {
9331             if (AlnMgr2TruncateSeqAlign(smp->saps[0], smp->mstarts[0] + offset, srip_mrna->smp->mstops[start_cds]-1, 2))
9332             {
9333                SeqAlignFree(smp->saps[0]->next);
9334                smp->saps[0]->next = NULL;
9335             }
9336          }
9337          smp->saps[smp->numexons-1] = SeqAlignDup(srip_mrna->smp->saps[stop_cds]);
9338          if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_LEFT)
9339          {
9340             if (AlnMgr2TruncateSeqAlign(smp->saps[smp->numexons-1], srip_mrna->smp->mstarts[stop_cds]-1, smp->mstops[smp->numexons-1]+offset, 2))
9341             {
9342                SeqAlignFree(smp->saps[smp->numexons-1]->next);
9343                smp->saps[smp->numexons-1]->next = NULL;
9344             }
9345          }
9346          for (i=1; i<smp->numexons-1; i++)
9347          {
9348             smp->saps[i] = SeqAlignDup(srip_mrna->smp->saps[i+start_cds]);
9349          }
9350       } else
9351       {
9352          smp->saps[0] = SeqAlignDup(srip_mrna->smp->saps[start_cds]);
9353          if (smp->fallsoff != SPI_BOTH)
9354          {
9355             if (AlnMgr2TruncateSeqAlign(smp->saps[0], smp->mstarts[0] + offset, smp->mstops[0] + offset, 2))
9356             {
9357                SeqAlignFree(smp->saps[0]->next);
9358                smp->saps[0]->next = NULL;
9359             }
9360          }
9361       }
9362       mis = 0;
9363       len = 0;
9364       tmpmstarts = smp->mstarts;
9365       tmpmstops = smp->mstops;
9366       smp->mstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9367       smp->mstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9368       for (i=0; i<smp->numexons; i++)
9369       {
9370          AlnMgr2IndexSingleChildSeqAlign(smp->saps[i]);
9371          len += SPI_GetExonInfo(smp, i, &b, &c, &mis, spot);
9372          SAIndex2Free2(smp->saps[i]->saip);
9373          smp->saps[i]->saip = NULL;
9374          AlnMgr2IndexSingleChildSeqAlign(smp->saps[i]);
9375       }
9376       MemFree(smp->mstarts);
9377       MemFree(smp->mstops);
9378       smp->mstarts = tmpmstarts;
9379       smp->mstops = tmpmstops;
9380       smp->mRNAcoverage = (100*len)/(context.right - context.left);
9381       smp->mismatch = (FloatHi)(100*mis)/len;
9382    } else
9383    {
9384       for (i=0; i<srip_mrna->smp->numexons; i++)
9385       {
9386          if (context.right >= srip_mrna->smp->mstarts[i]-1 && context.right <= srip_mrna->smp->mstops[i]-1)
9387             start_cds = i;
9388          if (context.left >= srip_mrna->smp->mstarts[i]-1 && context.left <= srip_mrna->smp->mstops[i]-1)
9389             stop_cds = i;
9390       }
9391       smp = (SPI_mRNAPtr)MemNew(sizeof(SPI_mRNA));
9392       smp->fallsoff = SPI_NEITHER;
9393       if (start_cds == -1)
9394       {
9395          start_cds = 0;
9396          smp->fallsoff = SPI_RIGHT;
9397       }
9398       if (stop_cds == -1)
9399       {
9400          if (srip_mrna->smp->mstarts[srip_mrna->smp->numexons-1] > context.left)
9401          {
9402             stop_cds = srip_mrna->smp->numexons-1;
9403             if (smp->fallsoff == SPI_RIGHT)
9404                smp->fallsoff = SPI_BOTH;
9405             else
9406                smp->fallsoff = SPI_LEFT;
9407          } else
9408          {
9409             for (i=0; i<srip_mrna->smp->numexons; i++)
9410             {
9411                if (srip_mrna->smp->mstarts[i] > context.left)
9412                   stop_cds = i;
9413             }
9414             if (smp->fallsoff == SPI_RIGHT)
9415                smp->fallsoff = SPI_BOTH;
9416             else
9417                smp->fallsoff = SPI_LEFT;
9418          }
9419       }
9420       srip_cds = (SPI_RegionInfoPtr)MemNew(sizeof(SPI_RegionInfo));
9421       srip_cds->smp = smp;
9422       srip_cds->mlen = abs(context.left - context.right) + 1;
9423       smp->numexons = stop_cds - start_cds + 1;
9424       smp->strand = srip_mrna->smp->strand;
9425       smp->exonid = (FloatHiPtr)MemNew((smp->numexons)*sizeof(FloatHi));
9426       smp->exongaps = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9427       smp->splicedon = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
9428       smp->spliceacc = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
9429       smp->mstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9430       smp->mstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9431       smp->gstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9432       smp->gstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9433       smp->saps = (SeqAlignPtr PNTR)MemNew((smp->numexons)*sizeof(SeqAlignPtr));
9434       if (smp->numexons > 1)
9435          smp->mstarts[0] = srip_mrna->smp->mstarts[start_cds]-1 - offset;
9436       if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_LEFT)
9437          smp->mstops[0] = context.right - offset;
9438       else
9439          smp->mstops[0] = srip_mrna->smp->mstops[start_cds]-1 - offset;
9440       for (i=1; i<smp->numexons-1; i++)
9441       {
9442          smp->mstarts[i] = srip_mrna->smp->mstarts[i+start_cds]-1 - offset;
9443          smp->mstops[i] = srip_mrna->smp->mstops[i+start_cds]-1 - offset;
9444       }
9445       if (smp->numexons > 1)
9446          smp->mstops[smp->numexons-1] = srip_mrna->smp->mstops[smp->numexons-1+start_cds]-1 - offset;
9447       if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_RIGHT)
9448          smp->mstarts[smp->numexons-1] = 0;
9449       else
9450          smp->mstarts[smp->numexons-1] = srip_mrna->smp->mstarts[smp->numexons-1+start_cds]-1 - offset;
9451       smp->polyAtail = 0;
9452       /* now copy the splice site info and truncate the alignments */
9453       for (i=0; i<smp->numexons; i++)
9454       {
9455          smp->splicedon[i] = srip_mrna->smp->splicedon[i+start_cds];
9456          smp->spliceacc[i] = srip_mrna->smp->spliceacc[i+start_cds];
9457       }
9458       if (smp->numexons > 1)
9459       {
9460          smp->saps[0] = SeqAlignDup(srip_mrna->smp->saps[start_cds]);
9461          if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_LEFT)
9462          {
9463             if (AlnMgr2TruncateSeqAlign(smp->saps[0], smp->mstarts[0] + offset, smp->mstops[0] + offset, 2))
9464             {
9465                SeqAlignFree(smp->saps[0]->next);
9466                smp->saps[0]->next = NULL;
9467             }
9468          }
9469          smp->saps[smp->numexons-1] = SeqAlignDup(srip_mrna->smp->saps[smp->numexons-1+start_cds]);
9470          if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_RIGHT)
9471          {
9472             if (AlnMgr2TruncateSeqAlign(smp->saps[smp->numexons-1], smp->mstarts[smp->numexons-1] + offset, srip_mrna->smp->mstops[smp->numexons-1+start_cds]-1, 2))
9473             {
9474                SeqAlignFree(smp->saps[smp->numexons-1]->next);
9475                smp->saps[smp->numexons-1]->next = NULL;
9476             }
9477          }
9478          for (i=1; i<smp->numexons-1; i++)
9479          {
9480             smp->saps[i] = SeqAlignDup(srip_mrna->smp->saps[i+start_cds]);
9481          }
9482       } else
9483       {
9484          smp->saps[0] = SeqAlignDup(srip_mrna->smp->saps[start_cds]);
9485          if (smp->fallsoff != SPI_BOTH)
9486          {
9487             if (AlnMgr2TruncateSeqAlign(smp->saps[0], smp->mstarts[0] + offset, smp->mstops[0] + offset, 2))
9488             {
9489                SeqAlignFree(smp->saps[0]->next);
9490                smp->saps[0]->next = NULL;
9491             }
9492          }
9493       }
9494       mis = 0;
9495       len = 0;
9496       tmpmstarts = smp->mstarts;
9497       tmpmstops = smp->mstops;
9498       smp->mstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9499       smp->mstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9500       for (i=0; i<smp->numexons; i++)
9501       {
9502          AlnMgr2IndexSingleChildSeqAlign(smp->saps[i]);
9503          len += SPI_GetExonInfo(smp, i, &b, &c, &mis, spot);
9504          SAIndex2Free2(smp->saps[i]->saip);
9505          smp->saps[i]->saip = NULL;
9506          AlnMgr2IndexSingleChildSeqAlign(smp->saps[i]);
9507       }
9508       MemFree(smp->mstarts);
9509       MemFree(smp->mstops);
9510       smp->mstarts = tmpmstarts;
9511       smp->mstops = tmpmstops;
9512       smp->mRNAcoverage = (100*len)/(context.right - context.left);
9513       smp->mismatch = (FloatHi)(100*mis)/len;
9514    }
9515    for (i=0; i<smp->numexons-1; i++)
9516    {
9517       smp->saps[i]->next = smp->saps[i+1];
9518       smp->saps[i+1]->next = NULL;
9519    }
9520    sap = SeqAlignNew();
9521    sap->segtype = SAS_DISC;
9522    sap->segs = (Pointer)(smp->saps[0]);
9523    sap->dim = 2;
9524    AlnMgr2IndexLite(sap);
9525    smp->parent = sap;
9526    srip_cds->strand = srip_mrna->strand;
9527    /* fill in srip_cds->mstart and mstop with the CDS boundaries for printing */
9528    srip_cds->mstart = context.left;
9529    srip_cds->mstop = context.right;
9530    srip_cds->smp->missingends = srip_cds->smp->fallsoff;
9531    SPI_FillInUTRInfo(srip_cds, srip_mrna, bsp_mrna->length, start_cds, stop_cds);
9532    if (spin->n3 == 1)  /* CDS is 5' partial */
9533       srip_cds->gstart = 1;
9534    else
9535       srip_cds->gstart = 0;
9536    if (spin->n4 == 1)  /* CDS is 3' partial */
9537       srip_cds->gstop = 1;
9538    else
9539       srip_cds->gstop = 0;
9540    MemFree(spin);
9541    return srip_cds;
9542 }
9543 
9544 /***************************************************************************
9545 *
9546 *  SPI_FillInUTRInfo is called by SPI_GetResultsForCDS to figure out the
9547 *  %id of the 5' and 3' UTRs. Since the UTRs usually do not exactly
9548 *  coincide with exon boundaries, this is not a trivial task:
9549 *
9550 *  ----********  *****  ****---  ------  4 exons, *=CDS, -=UTR
9551 *  In this example, the number of mismatches in the 5' UTR is the number
9552 *  of mismatches in exon 1 minus the number of mismatches in exon 1 of
9553 *  the CDS. The number of mismatches in the 3' UTR is the number of
9554 *  mismatches in exon 4 of the mRNA, plus the number of mismatches in
9555 *  exon 3 of the mRNA, minus the number of mismatches in exon 3 of the CDS.
9556 *  The most complicated example is when a single-exon CDS does not quite
9557 *  reach the edges of the corresponding mRNA exon:
9558 *
9559 *   ------- ----*******----- ----- ------ 4 exons
9560 *  In this example, the function actually needs to make a small
9561 *  alignment corresponding to the last part of exon 2 of the mRNA, so
9562 *  that the mismatches for just that piece can be computed.
9563 *
9564 ***************************************************************************/
SPI_FillInUTRInfo(SPI_RegionInfoPtr srip_cds,SPI_RegionInfoPtr srip_mrna,Int4 len,Int4 exonstart,Int4 exonstop)9565 static void SPI_FillInUTRInfo(SPI_RegionInfoPtr srip_cds, SPI_RegionInfoPtr srip_mrna, Int4 len, Int4 exonstart, Int4 exonstop)
9566 {
9567    ACTProfilePtr  app1;
9568    ACTProfilePtr  app2;
9569    Boolean        found;
9570    Int4           i;
9571    Int4           j;
9572    FloatHi        mismatch_cds;
9573    FloatHi        mismatch_misc;
9574    FloatHi        mismatch_mrna_l;
9575    FloatHi        mismatch_mrna_r;
9576    SeqAlignPtr    sap_tmp;
9577 
9578    mismatch_mrna_l = mismatch_mrna_r = 0;
9579    for (i=0; i<=exonstart; i++)
9580    {
9581       mismatch_mrna_l += (FloatHi)(1-srip_mrna->smp->exonid[i]/100)*(srip_mrna->smp->mstops[i] - srip_mrna->smp->mstarts[i] + 1);
9582    }
9583    for (i=exonstop; i<srip_mrna->smp->numexons; i++)
9584    {
9585       mismatch_mrna_r += (FloatHi)(1-srip_mrna->smp->exonid[i]/100)*(srip_mrna->smp->mstops[i] - srip_mrna->smp->mstarts[i] + 1);
9586    }
9587    if (exonstart != exonstop)
9588    {
9589       mismatch_cds = (FloatHi)(1-srip_cds->smp->exonid[0]/100)*(srip_cds->smp->mstops[0] - srip_cds->smp->mstarts[0] + 1);
9590       mismatch_mrna_l -= mismatch_cds;
9591       mismatch_cds = (FloatHi)(1-srip_cds->smp->exonid[srip_cds->smp->numexons-1]/100)*(srip_cds->smp->mstops[srip_cds->smp->numexons-1] - srip_cds->smp->mstarts[srip_cds->smp->numexons-1] + 1);
9592       mismatch_mrna_r -= mismatch_cds;
9593       if (srip_cds->strand != Seq_strand_minus)
9594       {
9595          if (srip_cds->mstart > 0)
9596             srip_cds->utr.left = (FloatHi)(100)*(1-(mismatch_mrna_l/(srip_cds->mstart)));
9597          else
9598             srip_cds->utr.left = -1;
9599          if (len-1-srip_cds->mstop > 0)
9600             srip_cds->utr.right = (FloatHi)(100)*(1-(mismatch_mrna_r/(len-1-srip_cds->mstop)));
9601          else
9602             srip_cds->utr.right = -1;
9603       } else
9604       {
9605          if (srip_cds->mstart > 0)
9606             srip_cds->utr.right = (FloatHi)(100)*(1-(mismatch_mrna_r/srip_cds->mstart));
9607          else
9608             srip_cds->utr.right = -1;
9609          if (len-1-srip_cds->mstop > 0)
9610             srip_cds->utr.left = (FloatHi)(100)*(1-(mismatch_mrna_l/(len-1-srip_cds->mstop)));
9611          else
9612             srip_cds->utr.left = -1;
9613       }
9614    } else /* have to figure out how many mismatches are on each side of the exon now */
9615    {
9616       sap_tmp = SeqAlignDup(srip_mrna->smp->saps[exonstart]);
9617       mismatch_misc = 0;
9618       if (AlnMgr2TruncateSeqAlign(sap_tmp, srip_cds->mstart, srip_cds->mstop, 2))
9619       {
9620          app1 = SPI_MakeProfileFromSA(sap_tmp->next);
9621          app2 = app1;
9622          while (app2 != NULL)
9623          {
9624             for (i=0; i<app2->len-1; i++)
9625             {
9626                found = FALSE;
9627                for (j=0; j<ACT_NUCLEN; j++)
9628                {
9629                   if (app2->freq[j][i] == 1 && !found)
9630                   {
9631                      if (app2->freq[4][i] == 0) /* not an N */
9632                      {
9633                         mismatch_misc += 1;
9634                         found = TRUE;
9635                      }
9636                   }
9637                }
9638             }
9639             app1 = app2->next;
9640             MemFree(app2);
9641             app2 = app1;
9642          }
9643          mismatch_cds = (FloatHi)(1-srip_cds->smp->exonid[0]/100)*(srip_cds->smp->mstops[0] - srip_cds->smp->mstarts[0] + 1);
9644          mismatch_mrna_l -= (mismatch_cds + mismatch_misc);
9645          mismatch_mrna_r -= ((FloatHi)(1-srip_mrna->smp->exonid[exonstart]/100)*(srip_mrna->smp->mstops[exonstart] - srip_mrna->smp->mstarts[exonstart] + 1) - mismatch_misc);
9646       } else /* nothing on the right side, so all the extra mismatches are on the left */
9647       {
9648          mismatch_cds = (1-srip_cds->smp->exonid[0]/100)*(srip_cds->smp->mstops[0] - srip_cds->smp->mstarts[0] + 1);
9649          mismatch_mrna_l -= mismatch_cds;
9650          mismatch_mrna_r -= (FloatHi)(1-srip_mrna->smp->exonid[exonstart]/100)*(srip_mrna->smp->mstops[exonstart] - srip_mrna->smp->mstarts[exonstart] + 1);
9651       }
9652       SeqAlignSetFree(sap_tmp);
9653       if (srip_cds->strand != Seq_strand_minus)
9654       {
9655          if (srip_cds->mstart > 0)
9656             srip_cds->utr.left = (FloatHi)(100)*(1-(mismatch_mrna_l/(srip_cds->mstart)));
9657          else
9658             srip_cds->utr.left = -1;
9659          if (len-1-srip_cds->mstop > 0)
9660             srip_cds->utr.right = (FloatHi)(100)*(1-(mismatch_mrna_r/(len-1-srip_cds->mstop)));
9661          else
9662             srip_cds->utr.right = -1;
9663       } else
9664       {
9665          if (srip_cds->mstart > 0)
9666             srip_cds->utr.right = (FloatHi)(100)*(1-(mismatch_mrna_r/srip_cds->mstart));
9667          else
9668             srip_cds->utr.right = -1;
9669          if (len-1-srip_cds->mstop > 0)
9670             srip_cds->utr.left = (FloatHi)(100)*(1-(mismatch_mrna_l/(len-1-srip_cds->mstop
9671 )));
9672          else
9673             srip_cds->utr.left = -1;
9674       }
9675    }
9676 }
9677 
9678 /***************************************************************************
9679 *
9680 *  SPI_GetCDS is the callback for the SeqEntryExplore call in
9681 *  SPI_GetResultsForCDS.  It simply records the left and right-most
9682 *  boundaries of the coding region found.
9683 *
9684 ***************************************************************************/
SPI_GetCDS(SeqFeatPtr sfp,SeqMgrFeatContextPtr context)9685 static Boolean LIBCALLBACK SPI_GetCDS(SeqFeatPtr sfp, SeqMgrFeatContextPtr context)
9686 {
9687    Boolean   p3;
9688    Boolean   p5;
9689    SPI_nPtr  spin;
9690 
9691    if (sfp == NULL)
9692       return FALSE;
9693    spin = (SPI_nPtr)context->userdata;
9694    if (context->seqfeattype == SEQFEAT_CDREGION && context->strand != Seq_strand_minus)
9695    {
9696       spin->n1 = context->left;
9697       spin->n2 = context->right;
9698       CheckSeqLocForPartial(sfp->location, &p5, &p3);
9699       if (p5)
9700          spin->n3 = 1;
9701       if (p3)
9702          spin->n4 = 1;
9703    }
9704    return TRUE;
9705 }
9706 
9707 
9708 /***************************************************************************
9709 *
9710 *  SPI_GetProteinFrommRNA takes an mRNA bioseq and returns a string
9711 *  which is the best protein translation of the mRNA. First, the function
9712 *  looks to see whether there are any annotated CDSs, and if so, it uses
9713 *  the translation of the annotated CDS. If not, the function translates
9714 *  the mRNA in all 3 reading frames and looks for the frame with the
9715 *  longest protein, then returns that protein.
9716 *
9717 ***************************************************************************/
SPI_GetProteinFrommRNA(BioseqPtr bsp_mrna,Int4Ptr start)9718 NLM_EXTERN CharPtr SPI_GetProteinFrommRNA(BioseqPtr bsp_mrna, Int4Ptr start)
9719 {
9720    ByteStorePtr  bs;
9721    CharPtr       c1;
9722    CharPtr       c2;
9723    CharPtr       c3;
9724    Int4          c1len;
9725    Int4          c2len;
9726    Int4          c3len;
9727    Int4          c1start;
9728    Int4          c2start;
9729    Int4          c3start;
9730    BoolPtr       featDefFilter;
9731    CharPtr       seq;
9732    SeqLocPtr     slp;
9733    SPI_SeqPtr    ssp;
9734    CharPtr       tmp;
9735 
9736    if (bsp_mrna == NULL)
9737       return NULL;
9738    SeqMgrIndexFeatures(0, (Pointer)bsp_mrna);
9739    featDefFilter = (BoolPtr)MemNew((FEATDEF_MAX)*sizeof(Boolean));
9740    featDefFilter[FEATDEF_CDS] = TRUE;
9741    ssp = (SPI_SeqPtr)MemNew(sizeof(SPI_Seq));
9742    SeqMgrExploreFeatures(bsp_mrna, (Pointer)ssp, SPI_GetCDSFeat, NULL, NULL, featDefFilter);
9743    seq = ssp->seq;
9744    *start = ssp->start;
9745    MemFree(featDefFilter);
9746    if (seq == NULL) /* no annotated CDS, have to translate to figure out the protein */
9747    {
9748       slp = SeqLocIntNew(0, bsp_mrna->length-1, Seq_strand_plus, bsp_mrna->id);
9749       bs = TransTableTranslateSeqLoc(NULL, slp, 1, 1, TRUE, TRUE);
9750       c1 = BSMerge(bs, NULL);
9751       BSFree(bs);
9752       bs = TransTableTranslateSeqLoc(NULL, slp, 1, 2, TRUE, TRUE);
9753       c2 = BSMerge(bs, NULL);
9754       BSFree(bs);
9755       bs = TransTableTranslateSeqLoc(NULL, slp, 1, 3, TRUE, TRUE);
9756       c3 = BSMerge(bs, NULL);
9757       BSFree(bs);
9758       c1len = SPI_FindLongestProt(c1, &c1start);
9759       c2len = SPI_FindLongestProt(c2, &c2start);
9760       c3len = SPI_FindLongestProt(c3, &c3start);
9761       if (c1len >= c2len && c1len >= c3len)
9762       {
9763          *start = 3*c1start;
9764          tmp = c1;
9765          tmp += c1start;
9766          seq = StringSave(tmp);
9767       } else if (c2len >= c1len && c2len >= c3len)
9768       {
9769          *start = 1+3*c2start;
9770          tmp = c2;
9771          tmp += c2start;
9772          seq = StringSave(tmp);
9773       } else if (c3len >= c1len && c3len >= c2len)
9774       {
9775          *start = 2+3*c3start;
9776          tmp = c3;
9777          tmp += c3start;
9778          seq = StringSave(tmp);
9779       }
9780       MemFree(c1);
9781       MemFree(c2);
9782       MemFree(c3);
9783       SeqLocFree(slp);
9784    }
9785    MemFree(ssp);
9786    return seq;
9787 }
9788 
9789 /***************************************************************************
9790 *
9791 *  SPI_GetCDSFeat is the SeqMgrExplore callback for SPI_GetProteinFrommRNA.
9792 *  When a CDS feature is found, the function gets the protein byte store
9793 *  corresponding to that feature, then converts the byte store into a
9794 *  string representing the protein sequence.
9795 *
9796 ***************************************************************************/
SPI_GetCDSFeat(SeqFeatPtr sfp,SeqMgrFeatContextPtr context)9797 static Boolean LIBCALLBACK SPI_GetCDSFeat(SeqFeatPtr sfp, SeqMgrFeatContextPtr context)
9798 {
9799    ByteStorePtr  bs;
9800    SPI_SeqPtr    ssp;
9801 
9802    ssp = (SPI_SeqPtr)(context->userdata);
9803    if (context->seqfeattype == SEQFEAT_CDREGION)
9804    {
9805       bs = ProteinFromCdRegionEx(sfp, TRUE, TRUE);
9806       ssp->seq = BSMerge(bs, NULL);
9807       ssp->start = context->left;
9808       BSFree(bs);
9809    }
9810    return TRUE;
9811 }
9812 
9813 /***************************************************************************
9814 *
9815 *  SPI_FindLongestProt looks through a string representing a protein
9816 *  sequence (with stop codons), and returns the length of the longest
9817 *  sub-protein (no stops) in the sequence, as well as the position at
9818 *  which the longest protein starts.
9819 *
9820 ***************************************************************************/
SPI_FindLongestProt(CharPtr seq,Int4Ptr pos)9821 static Int4 SPI_FindLongestProt(CharPtr seq, Int4Ptr pos)
9822 {
9823    Int4     i;
9824    Int4     j;
9825    Int4     len;
9826    Int4     max;
9827    CharPtr  p;
9828 
9829    if (seq == NULL)
9830       return 0;
9831    p = seq;
9832    len = max = 0;
9833    i = 0;
9834    j = 0;
9835    *pos = 0;
9836    while (*p != '\0')
9837    {
9838       if (*p == '*')
9839       {
9840          if (len > max)
9841          {
9842             max = len;
9843             *pos = j;
9844          }
9845          len = 0;
9846          j = i+1;
9847       } else
9848          len++;
9849       p++;
9850       i++;
9851    }
9852    return max;
9853 }
9854 
SPI_GetAccessionFromSeqId(SeqIdPtr sip,Int4Ptr gi,CharPtr PNTR id)9855 static Boolean SPI_GetAccessionFromSeqId(SeqIdPtr sip, Int4Ptr gi, CharPtr PNTR id)
9856 {
9857    Boolean numeric_id_type = FALSE;
9858    Int2 id_len;
9859    GiimPtr gip;
9860    ObjectIdPtr oip;
9861    TextSeqIdPtr textsip;
9862    DbtagPtr dbtag;
9863    PatentSeqIdPtr psip;
9864    PDBSeqIdPtr pdbsip;
9865 
9866    *id = NULL;
9867    *gi = 0;
9868 
9869    switch (sip->choice) {
9870    case SEQID_GI: case SEQID_GIBBSQ: case SEQID_GIBBMT:
9871       *gi = sip->data.intvalue;
9872       numeric_id_type = TRUE;
9873       break;
9874    case SEQID_GIIM:
9875       gip = (GiimPtr) sip->data.ptrvalue;
9876       *gi = gip->id;
9877       numeric_id_type = TRUE;
9878       break;
9879    case SEQID_LOCAL:
9880       oip = (ObjectIdPtr) sip->data.ptrvalue;
9881 
9882       if (oip->str) {
9883          id_len = StringLen(oip->str);
9884          *id = (CharPtr) MemNew(id_len+1);
9885          sprintf(*id, "%s", oip->str);
9886       } else {
9887          *id = (CharPtr) MemNew(6);
9888          sprintf(*id, "%d", oip->id);
9889       }
9890       break;
9891    case SEQID_GENBANK: case SEQID_EMBL: case SEQID_PIR: case SEQID_TPG: case SEQID_TPE: case SEQID_TPD:
9892    case SEQID_SWISSPROT: case SEQID_DDBJ: case SEQID_PRF:
9893    case SEQID_OTHER:
9894       textsip = (TextSeqIdPtr)sip->data.ptrvalue;
9895       id_len = StringLen(textsip->accession);
9896       *id = (CharPtr) MemNew(id_len+1);
9897       if (textsip->version > 0)
9898          sprintf(*id, "%s.%d", textsip->accession, textsip->version);
9899       else
9900          sprintf(*id, "%s", textsip->accession);
9901       break;
9902    case SEQID_GENERAL:
9903       dbtag = (DbtagPtr) sip->data.ptrvalue;
9904       if (dbtag->tag->str == NULL) {
9905          numeric_id_type = TRUE;
9906          *gi = dbtag->tag->id;
9907       } else {
9908          id_len = StringLen(dbtag->tag->str);
9909          *id = (CharPtr) MemNew(id_len+1);
9910          sprintf(*id, "%s", dbtag->tag->str);
9911       }
9912       break;
9913    case SEQID_PATENT:
9914       psip = (PatentSeqIdPtr) sip->data.ptrvalue;
9915       *gi = (Int4) psip->seqid;
9916       numeric_id_type = TRUE;
9917       break;
9918    case SEQID_PDB:
9919       pdbsip = (PDBSeqIdPtr) sip->data.ptrvalue;
9920       id_len = StringLen(pdbsip->mol);
9921       *id = (CharPtr) MemNew(id_len+4);
9922       sprintf(*id, "%s%d", pdbsip->mol, pdbsip->chain);
9923       break;
9924    default: break;
9925    }
9926 
9927    return numeric_id_type;
9928 }
9929 
/***************************************************************************
*
*  SPI_CheckSplicesForRevComp walks the region list and, for every region
*  whose share of exons with a splice donor site is below
*  SPI_REVCOMPTHRESH, re-aligns the reverse complement of the mRNA to the
*  genomic sequence. The reverse-complement model replaces the region's
*  model when it has a higher donor-site percentage, its mRNA coverage is
*  above the old coverage minus SPI_COVERDIFF, and its mismatch is below
*  the old mismatch plus SPI_MISMTCHDIFF.
*
*  Nothing is attempted while spot->revcomp is already set. The walk stops
*  at the first region without a model. bsp_mrna is reverse-complemented
*  for the trial and flipped back afterwards.
*
***************************************************************************/
static void SPI_CheckSplicesForRevComp(SPI_RegionInfoPtr srip_head, SPI_OptionsPtr spot, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna)
{
   Int4               donors_fwd;
   Int4               donors_rev;
   SPI_bsinfoPtr      genomic_info;
   Int4               k;
   SPI_bsinfoPtr      mrna_info;
   SPI_RegionInfoPtr  region;
   SPI_RegionInfoPtr  rev;

   /** KSK bug fix - a region's smp can be NULL after
       SPI_SortRegionsByScore() removed all below -c
       threshold, so the walk ends there **/
   for (region = srip_head; region != NULL && region->smp != NULL; region = region->next)
   {
      if (spot->revcomp != FALSE)
         continue;
      region->revcmp_try = TRUE;
      donors_fwd = 0;
      for (k=0; k<region->smp->numexons; k++)
      {
         donors_fwd += region->smp->splicedon[k];
      }
      if ((donors_fwd*100)/region->smp->numexons >= SPI_REVCOMPTHRESH)
         continue; /* enough donor sites, keep this orientation */

      /* align the reverse complement of the mRNA */
      BioseqRevComp(bsp_mrna);
      ErrSetMessageLevel(SEV_MAX);
      genomic_info = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
      genomic_info->bsp = bsp_genomic;
      mrna_info = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
      mrna_info->bsp = bsp_mrna;
      if (spot->lcaseloc) /* fixes an ABW ? */
      {
         mrna_info->lcaseloc = spot->lcaseloc;
      }
      spot->revcomp = TRUE;
      rev = SPI_AlnSinglemRNAToGen(genomic_info, mrna_info, NULL, NULL, spot);
      spot->revcomp = FALSE;
      if (rev == NULL || rev->smp == NULL)
      {
         BioseqRevComp(bsp_mrna); /* restore the original orientation */
         continue;
      }
      /* the protein has to be taken while the bioseq is still flipped */
      MemFree(rev->smp->protein);
      rev->smp->protein = SPI_GetProteinFrommRNA(bsp_mrna, &rev->smp->transstart);
      BioseqRevComp(bsp_mrna);

      donors_rev = 0;
      for (k=0; k<rev->smp->numexons; k++)
      {
         donors_rev += rev->smp->splicedon[k];
      }
      if ((donors_rev*100)/rev->smp->numexons > (donors_fwd*100)/region->smp->numexons
          && rev->smp->mRNAcoverage > region->smp->mRNAcoverage - SPI_COVERDIFF
          && rev->smp->mismatch < region->smp->mismatch + SPI_MISMTCHDIFF)
      {
         /* the reverse complement wins: move its model and summary */
         /* fields into this region and drop the emptied shell      */
         SPI_mRNAFree(region->smp);
         region->smp = rev->smp;
         region->revcomp = TRUE;
         region->mstart = rev->mstart;
         region->mstop = rev->mstop;
         region->strand = rev->strand;
         region->coverage = rev->coverage;
         region->score = rev->score;
         region->polyAtail = rev->polyAtail;
         region->fallsoff = rev->fallsoff;
         region->utr = rev->utr;
         MemFree(rev);
      }
   }
}
10022 
SPI_ProfileNew(Boolean nuc)10023 static ACTProfilePtr SPI_ProfileNew(Boolean nuc)
10024 {
10025    ACTProfilePtr  app;
10026    FloatHiPtr     PNTR freq;
10027 
10028    app = (ACTProfilePtr)MemNew(sizeof(ACTProfile));
10029    if (nuc)
10030    {
10031       freq = (FloatHiPtr PNTR)MemNew(ACT_NUCLEN*sizeof(FloatHiPtr));
10032       app->freq = freq;
10033       app->nuc = TRUE;
10034    } else
10035    {
10036       freq = (FloatHiPtr PNTR)MemNew(ACT_PROTLEN*sizeof(FloatHiPtr));
10037       app->freq = freq;
10038       app->nuc = FALSE;
10039    }
10040    return app;
10041 }
10042 
/***************************************************************************
*
*  SPI_ProfileFree releases a single profile: each frequency row
*  (ACT_NUCLEN rows for a nucleotide profile, ACT_PROTLEN otherwise), the
*  row table, and the profile itself. Profiles linked through ->next are
*  not touched. A NULL argument is accepted. Always returns NULL.
*
***************************************************************************/
static ACTProfilePtr SPI_ProfileFree(ACTProfilePtr app)
{
   Int4  nrows;
   Int4  row;

   if (app != NULL)
   {
      nrows = app->nuc ? ACT_NUCLEN : ACT_PROTLEN;
      for (row=0; row<nrows; row++)
      {
         MemFree(app->freq[row]);
      }
      MemFree(app->freq);
      app->next = NULL;
      MemFree(app);
   }
   return NULL;
}
10063 
/***************************************************************************
*
*  SPI_ProfileSetFree releases a whole chain of profiles linked through
*  ->next, unlinking each one and handing it to SPI_ProfileFree. A NULL
*  argument is accepted. Always returns NULL.
*
***************************************************************************/
static ACTProfilePtr SPI_ProfileSetFree(ACTProfilePtr app)
{
   ACTProfilePtr  following;

   for (; app != NULL; app = following)
   {
      following = app->next; /* remember the rest before this node goes away */
      app->next = NULL;
      SPI_ProfileFree(app);
   }
   return NULL;
}
10077 
/***********************************************************
* SPI_BuildProfile
*
* Adds one piece of one alignment row to the current profile.
*
*   slp    - location of the residues to tally, or NULL to
*            record a gap of 'length' columns
*   app    - in/out: the profile currently being filled; it
*            is advanced to (*app)->next once the profile's
*            columns have been consumed
*   count  - in/out: number of columns of the current profile
*            already consumed by this row; reset to 0 when
*            the profile is advanced
*   length - gap length; only used when slp is NULL
*
* (*app)->numseq is incremented when a row contributes its
* first piece (*count == 0) to the profile.  If the profile
* has no length yet, the seqloc's length becomes the profile
* length and the frequency rows are allocated; a seqloc that
* is longer than an existing profile is ignored.
*
* Nucleotide residues are read as ncbi4na; codes 1, 2, 4 and
* 8 are tallied in rows 0..3 and every other code in row 4.
* Protein residues are read as ncbistdaa and used directly as
* the row index; codes that fall outside the table are not
* tallied but still occupy a column.
***********************************************************/
static void SPI_BuildProfile(SeqLocPtr slp, ACTProfilePtr PNTR app, Int4Ptr count, Int4 length)
{
   Int4           i;
   Int4           len;
   Int4           numrows;
   ACTProfilePtr  prof;
   Uint1          res;
   Int4           row;
   SeqPortPtr     spp;

   if (app == NULL || *app == NULL || count == NULL)
      return;
   prof = *app;
   if (slp == NULL) /* gap: only the column bookkeeping changes */
   {
      if (*count == 0)
         prof->numseq++;
      *count = *count+length;
      if (prof->len <= *count)
      {
         *count = 0;
         *app = prof->next;
      }
      return;
   }
   len = SeqLocLen(slp);
   if (len <= 0)
      return;
   if (prof->freq == NULL) /* no row table to write into */
      return;
   numrows = prof->nuc ? ACT_NUCLEN : ACT_PROTLEN;
   if (prof->len == 0)
   {
      for (i=0; i<numrows; i++)
      {
         prof->freq[i] = (FloatHiPtr)MemNew(len*sizeof(FloatHi));
         if (prof->freq[i] == NULL)
         {
            /* undo the partial allocation and leave the profile unsized */
            while (i > 0)
            {
               i--;
               MemFree(prof->freq[i]);
               prof->freq[i] = NULL;
            }
            return;
         }
      }
      prof->len = len;
   } else
   {
      if (len > prof->len) /* seqloc is longer than the */
         return;           /* existing profile -- don't add it     */
   }
   if (prof->nuc)
      spp = SeqPortNewByLoc(slp, Seq_code_ncbi4na);
   else
      spp = SeqPortNewByLoc(slp, Seq_code_ncbistdaa);
   if (spp == NULL)
      return;
   if (*count == 0)
      prof->numseq++;
   i = 0;
   while ((res = SeqPortGetResidue(spp)) != SEQPORT_EOF && i+*count<prof->len)
   {
      if (prof->nuc)
      {
         switch (res)
         {
            case 1:
               row = 0;
               break;
            case 2:
               row = 1;
               break;
            case 4:
               row = 2;
               break;
            case 8:
               row = 3;
               break;
            default: /* ambiguity codes and anything else */
               row = 4;
               break;
         }
      } else
         row = (Int4)res;
      /* res is an arbitrary byte; never index past the row table */
      if (row < numrows)
         prof->freq[row][i+*count]++;
      i++;
   }
   SeqPortFree(spp);
   /* NOTE(review): the gap path advances on len <= count, this */
   /* path only on an exact fit; kept as in the original.       */
   if (len+*count == prof->len)
   {
      *app = prof->next;
      *count = 0;
   } else
      *count = *count + len;
   return;
}
10171 
10172 
SPI_MakeProfileFromSA(SeqAlignPtr sap)10173 static ACTProfilePtr SPI_MakeProfileFromSA(SeqAlignPtr sap)
10174 {
10175    AMAlignIndex2Ptr  amaip;
10176    AlnMsg2Ptr       amp;
10177    ACTProfilePtr    app = NULL;
10178    ACTProfilePtr    app_head = NULL;
10179    ACTProfilePtr    app_prev = NULL;
10180    BioseqPtr        bsp;
10181    Int4             count;
10182    Int4             i;
10183    Int4             j;
10184    Boolean          more;
10185    Int4             n;
10186    Boolean          nuc;
10187    Int4             numseg;
10188    Int4             numrows;
10189    SeqIdPtr         sip;
10190    SeqLocPtr        slp;
10191    Int4             start;
10192    Int4             stop;
10193 
10194    if (sap == NULL)
10195       return NULL;
10196    if (sap->saip == NULL)
10197       return NULL;
10198    if (sap->saip->indextype == INDEX_PARENT)
10199    {
10200       amaip = (AMAlignIndex2Ptr)(sap->saip);
10201       if (amaip->alnstyle == AM2_LITE)
10202          return NULL;
10203    }
10204    sip = AlnMgr2GetNthSeqIdPtr(sap, 1);
10205    bsp = BioseqLockById(sip);
10206    if (bsp == NULL)
10207       return NULL;
10208    if (ISA_na(bsp->mol))
10209       nuc = TRUE;
10210    else
10211       nuc = FALSE;
10212    BioseqUnlockById(sip);
10213    sip = SeqIdFree(sip);
10214    amp = AlnMsgNew2();
10215    amp->to_aln = -1;
10216    amp->row_num = 1;
10217    app_head = NULL;
10218    numseg = AlnMgr2GetNumSegs(sap);
10219    for (i=0; i<numseg; i++)
10220    {
10221       app = SPI_ProfileNew(nuc);
10222       AlnMgr2GetNthSegmentRange(sap, i+1, &start, &stop);
10223       app->len = stop - start + 1;
10224       if (nuc)
10225       {
10226         for (j=0; j<ACT_NUCLEN; j++)
10227         {
10228            app->freq[j] = (FloatHiPtr)MemNew(app->len*sizeof(FloatHi));
10229         }
10230       } else
10231       {
10232         for (j=0; j<ACT_PROTLEN; j++)
10233         {
10234            app->freq[j] = (FloatHiPtr)MemNew(app->len*sizeof(FloatHi));
10235         }
10236       }
10237       if (app_head != NULL)
10238       {
10239         app_prev->next = app;
10240         app_prev = app;
10241       } else
10242         app_head = app_prev = app;
10243    }
10244    numrows = AlnMgr2GetNumRows(sap);
10245    for (i=1; i<=numrows; i++)
10246    {
10247       app = app_head;
10248       for (n=0; n<numseg; n++)
10249       {
10250          AlnMsgReNew2(amp);
10251          AlnMgr2GetNthSegmentRange(sap, n+1, &amp->from_aln, &amp->to_aln);
10252          amp->row_num = i;
10253          sip = AlnMgr2GetNthSeqIdPtr(sap, i);
10254          bsp = BioseqLockById(sip);
10255          count = 0;
10256          while ((Boolean) (more = AlnMgr2GetNextAlnBit(sap, amp)) && app != NULL)
10257          {
10258             if (amp->type == AM_SEQ && bsp != NULL)
10259             {
10260                slp = SeqLocIntNew(amp->from_row, amp->to_row, amp->strand, sip);
10261                SPI_BuildProfile(slp, &app, &count, 0);
10262                SeqLocFree(slp);
10263             } else if (amp->type == AM_GAP)
10264                SPI_BuildProfile(NULL, &app, &count, (amp->to_row - amp->from_row + 1));
10265          }
10266          BioseqUnlockById(sip);
10267          sip = SeqIdFree(sip);
10268       }
10269    }
10270    AlnMsgFree2(amp);
10271    return app_head;
10272 }
10273 
10274 /***********************************************************
10275 * SPI_CheckMrnaOrder
10276 *
10277 * After the ivals for building a region are sorted in genomic
10278 * order this function merely checks that the mrna invterals
10279 * are minimally colinear: if the stop of one interval overlaps
10280 * or 'jumps' by more than 20 bases the start of the next interval,
10281 * the one with greatest score is retained and the one with lesser
10282 * score set to 'impossible.
10283  ***********************************************************/
SPI_CheckMrnaOrder(SPI_IvalPtr PNTR spi_pp,const int num)10284 static void SPI_CheckMrnaOrder(SPI_IvalPtr PNTR spi_pp, const int num)
10285 {
10286     SPI_IvalPtr  ival = 0, ival2 = 0, ival3 = 0;
10287     int x = 0;
10288 
10289     if (num >=3){
10290         for (x = 0, ival = spi_pp[x], ival2 = spi_pp[x + 1];
10291              x < num && ival != 0 && ival2 != 0;
10292              ++x, ival = spi_pp[x],
10293                  ival2 = (x + 1 < num ? spi_pp[x + 1] : 0)){
10294             if (x < num - 2){ /* three to window */
10295                 ival3 = spi_pp[x + 2];
10296                 if ((ival->strand == Seq_strand_plus == ival2->strand
10297                      && ival3->strand == ival->strand
10298                      && (ival->mstop > ival2->mstart + SPI_FUZZ
10299                          && ival->mstop < ival3->mstart + SPI_FUZZ))
10300                     || (ival->strand == Seq_strand_minus == ival2->strand
10301                         && ival3->strand == ival->strand
10302                         && (ival->mstop + SPI_FUZZ < ival2->mstart
10303                             && ival->mstop + SPI_FUZZ > ival3->mstart))){
10304                     if (ival->score > ival2->score){
10305                         ival2->used = -1;
10306                     }
10307                     else if (ival2->score > ival->score){
10308                         ival->used = -1;
10309                     }
10310                 }
10311             }
10312             else if (x < num - 1){ /* two to window */
10313                 if ((ival->strand == Seq_strand_plus == ival2->strand
10314                      && ival->mstop > ival2->mstart)
10315                     || (ival->strand == Seq_strand_plus == ival2->strand
10316                         && ival->mstop < ival2->mstart)){
10317                     if (ival2->score > ival->score){
10318                         ival->used = -1;
10319                     }
10320                     else if (ival->score > ival2->score){
10321                         ival2->used = -1;
10322                     }
10323                 }
10324             }
10325         }
10326     }
10327 }
10328