/* RCS/CVS revision identification string for this file. */
static char const rcsid[] = "$Id: spidey.c,v 6.74 2016/09/02 14:57:38 ucko Exp $";
2
3 /* ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information (NCBI)
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government do not place any restriction on its use or reproduction.
14 * We would, however, appreciate having the NCBI and the author cited in
15 * any work or product based on this material.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * ===========================================================================
26 *
27 * File Name: spidey.c
28 *
29 * Author: Sarah Wheelan
30 *
31 * Version Creation Date: 5/01
32 *
33 * $Revision: 6.74 $
34 *
35 * File Description: mrna-to-genomic alignment algorithms and functions
36 *
37 * Modifications:
38 * --------------------------------------------------------------------------
39 * $Log: spidey.c,v $
40 * Revision 6.74 2016/09/02 14:57:38 ucko
41 * Formally clean up calls to printf-family functions that are at least
42 * nominally unsafe, as already done in Debian/Ubuntu packages.
43 *
44 * Revision 6.73 2006/06/01 14:55:31 kskatz
 * fixed -s option so that it sets gap-open/gap-extend penalties that are valid as a pair for gap statistics and are a bit less stringent than default (default = 5 open : 2 extend; interspecies -s = 4 open : 1 extend)
46 *
47 * Revision 6.72 2005/11/17 17:12:50 kskatz
48 * Fixed initializations and removed non-used functions to get rid of warnings
49 *
50 * Revision 6.70 2005/02/22 17:41:59 kskatz
51 * fixed potential dividing by zero in SPI_is_acceptor* and SPI_is_donor* probability calculations
52 *
53 * Revision 6.69 2004/04/09 16:05:21 kskatz
54 * Added sanity check (must be 3 intervals to go through the loop) to SPI_CheckMrnaOrder()
55 *
56 * Revision 6.68 2004/03/25 21:20:03 kskatz
57 * All SPI_is_acceptor_* functions have been corrected: 'N' no longer contributes to nor subtracts from the score, log odds are calculated and the scores added; they are however all antilogged because there are too many places in the code where the score is expected to be between 0 and 1. Also, corrected sequence frequency determination in SPI_is_acceptor_user and SPI_is_donor_user, as well as correcting for 'N'. Finally, and this all began with, I added matrices for Dictyostelium - command line -r -m
58 *
59 * Revision 6.67 2003/12/12 21:25:26 kskatz
60 * Fixed bug in SPI_CheckForPolyAExon() where multiple SeqAlignPtr's to the same object were not handled carefully: one of the ptr's was being accessed when the object was freed via the other ptr.
61 *
62 * Revision 6.66 2003/12/12 17:57:04 kskatz
63 * Fixed a potential array bounds read error in SPI_CheckMrnaOrder()
64 *
65 * Revision 6.65 2003/12/10 16:53:22 kskatz
66 * Ensured that 'ovl' when used is never negative once set in SPI_AdjustOverlaps() [see revision 6.57]
67 *
68 * Revision 6.64 2003/10/21 15:26:17 kans
69 * fixed typo of SPI_IvalPt to SPI_IvalPtr
70 *
71 * Revision 6.63 2003/10/21 15:14:19 kskatz
 * Added SPI_CheckMrnaOrder(): Called by GetRegionForSAP() after the ivals for building a region are sorted in genomic order, this function merely checks that the mrna intervals are minimally colinear.
73 *
74 * Revision 6.62 2003/10/06 14:11:20 kskatz
75 * Changed the 'version' number printed by SPI_PrintResult() to '1.40' since it has been '1.35' for so long - mostly to avoid confusion when users report the version number
76 *
77 * Revision 6.61 2003/10/06 14:04:09 kskatz
78 * Correctly! commented out a temporary fix in SPI_AlignInWindows() [line 3880]
79 *
80 * Revision 6.60 2003/09/17 20:39:01 kskatz
81 * Commented out a temporary fix in SPI_AlignInWindows() [line 3880]
82 *
83 * Revision 6.59 2003/09/17 19:53:27 kskatz
84 * Added a check in SPI_FindBestAlnByDotPlot() that both seqs be 2-bit encoded (ncbi2na) in order to meet that implicit requirement of DOT_. If either one is not ncbi2na, SPI_FindBestAlnByDotPlot() will simply return NULL.
85 *
86 * Revision 6.58 2003/08/18 18:17:51 kskatz
87 * Just removing some unused vars
88 *
89 * Revision 6.57 2003/08/18 18:11:39 kskatz
90 * Fixed dynamic allocation of buf in SPI_AdjustOverlaps() - 'ovl' can be negative
91 *
92 * Revision 6.56 2003/08/15 15:23:50 kskatz
 * Created Choose2LooseMrnaOvLap(), called by SPI_AdjustForSplice(): returns the SeqAlignPtr * to delete *. The choice is based on score + splice donor/acceptor existence for that 'exon'. Also made buf2 in SPI_AdjustOverlaps() dynamically allocated as it was (obviously) crashing when the overlaps was > 200 bases.
94 *
95 * Revision 6.55 2003/06/30 15:01:29 whlavina
 * Correct minus strand handling in CreateContinuousAln functions; previous
97 * code could corrupt alignments (stop2-start1>1 would imply len<-2 if
98 * ExtendAlnRight ever gets called).
99 *
100 * Revision 6.54 2003/05/30 17:25:38 coulouri
101 * add rcsid
102 *
103 * Revision 6.53 2003/04/04 19:42:56 kskatz
104 * Added a new command line option (-R) to allow external users to point spidey to a repeat database that it can pass on to blast for filtering repeats
105 *
106 * Revision 6.52 2002/11/14 17:20:38 johnson
107 * fixed nasty memory misallocation bug in SPI_CheckSplicesForRevComp
108 *
109 * Revision 6.51 2002/11/04 19:48:35 kskatz
110 * wasn't correcting for strand when reporting summary mismatch information in SPI_PrintResults()
111 *
112 * Revision 6.50 2002/10/10 19:39:45 kskatz
113 * Added 'mismatches' to output in SPI_PrintResult(), as well as commented out several unused variables, and two syntax fixes to avoid compiler warning
114 *
115 * Revision 6.49 2002/10/02 16:47:11 kskatz
116 * clarifying the explanation of the -L option
117 *
118 * Revision 6.48 2002/10/02 16:12:53 kskatz
119 * Added a new option to SPI_Options (bigintron_size) that holds a user-supplied maximum size (default = 220000) for introns and requires the option (bool) bigintron to be set to 'TRUE'; The functions affected are SPI_mRNAPtr SPI_AdjustForSplice(), SPI_is_consistent(), and SPI_FindPiece(); note that the default for bigintron_size is not set in SPI_OptionsNew() (yet)
120 *
121 * Revision 6.47 2002/08/28 17:02:51 kskatz
122 * Simplified the loop in SPI_PrintResults() that prints out the 5' splice site and allowed minimum of 2 bases; also fixed more deadly access errors by setting the SPI_RegionInfoPtr *srip to NULL when all regions fall below spot->idcutoff in SPI_SortRegionsByScore()
123 *
124 * Revision 6.46 2002/08/26 20:00:05 kskatz
125 * Fixed off-by-one error in my fix to SPI_PrintResult
126 *
127 * Revision 6.45 2002/08/20 21:07:12 kskatz
128 * Fixed several NULL pointer access errors caused when -c results in the deletion of all regions in SPI_SortRegionsByScore(); also fixed bugs in SPI_PrintResult() caused by not checking to see if a minus strand alignment had start at the end of the sequence when printing out the 10-base buffer of the splice region
129 *
130 * Revision 6.44 2002/08/17 03:08:16 kskatz
131 * allowed to & from to be handled independently in SPI_AlnSinglemRNAToGen()
132 *
133 * Revision 6.43 2002/08/16 22:31:55 kskatz
134 * oops again - changed c++ style comments to c style since this is the c toolkit
135 *
136 * Revision 6.42 2002/08/16 21:15:50 kskatz
137 * oops - this is C toolkit: int -> Int4
138 *
139 * Revision 6.41 2002/08/16 21:03:12 kskatz
140 * SPI_OptionsNew() now sets strand = Seq_strand_both as default, otherwise blast results are hosed; SPI_AlnSinglemRNAToGen() now correctly limits initial blast to user supplied to/from and includes a little sanity check
141 *
142 * Revision 6.40 2002/07/22 13:40:55 wheelan
143 * changes to splice matrices, bug fix in CDS computation
144 *
145 * Revision 6.39 2002/06/27 12:59:34 kans
146 * fix in call to GetScoreAndEvalue
147 *
148 * Revision 6.38 2002/06/27 11:52:53 wheelan
149 * various bug fixes -- fixed off-by-one splice site errors and more
150 *
151 * Revision 6.37 2002/05/07 19:15:09 wheelan
152 * fixed minor bug in splice boundary arithmetic
153 *
154 * Revision 6.36 2002/05/07 18:42:56 wheelan
155 * changes to support user-defined splice matrices
156 *
157 * Revision 6.35 2002/04/04 17:18:20 wheelan
158 * numerous bug fixes and little changes; added SPI_CheckForPolyAExon
159 *
160 * Revision 6.34 2002/01/30 19:09:05 wheelan
161 * better support for revcomp, plus changes for new alignment manager funcs
162 *
163 * Revision 6.33 2001/12/18 18:00:01 wheelan
164 * bug fix for NULL segs in RemoveTeenyAln
165 *
166 * Revision 6.32 2001/12/13 12:28:51 wheelan
167 * fixed bug in multiple printing, bug in ConnectAln
168 *
169 * Revision 6.31 2001/12/10 15:58:04 wheelan
170 * fixed dereferencing of null variable in ConnectAln
171 *
172 * Revision 6.30 2001/12/10 14:42:36 wheelan
173 * bug fix in ConnectAln -- no more using freed pointers
174 *
175 * Revision 6.29 2001/12/05 12:29:37 wheelan
176 * changed to version 1.2
177 *
178 * Revision 6.28 2001/11/30 12:15:03 wheelan
179 * subtle but very important bug fix in SPI_GetNthSeqRangeInSASet
180 *
181 * Revision 6.27 2001/11/20 12:13:24 wheelan
182 * made SPI_GetProteinFrommRNA EXTERN
183 *
184 * Revision 6.26 2001/11/05 16:17:11 wheelan
185 * added option to print multiple alignment to a file
186 *
187 * Revision 6.25 2001/11/02 14:00:52 wheelan
188 * fixed memory access errors in splice printing code
189 *
190 * Revision 6.24 2001/10/26 13:12:07 wheelan
191 * changes to polyA handling, plus bulletproofing
192 *
193 * Revision 6.23 2001/10/18 15:45:56 wheelan
194 * bug fix in ConnectAln
195 *
196 * Revision 6.22 2001/10/18 15:12:22 wheelan
197 * fixed polyAtail alignment problems, fixed score calculation
198 *
199 * Revision 6.21 2001/10/17 16:16:21 wheelan
200 * changes in region sorting plus mrna model gap handling
201 *
202 * Revision 6.20 2001/10/08 17:16:44 wheelan
203 * bug fix in revcmp, made cds 1-based coords, fixed polyA bug
204 *
205 * Revision 6.19 2001/10/04 12:36:21 wheelan
206 * implemented bigintron option; made SPI_ConnectAln run through twice to pick up more pieces
207 *
208 * Revision 6.18 2001/10/03 18:09:54 wheelan
209 * changed AM_LITE define for new alnmgr
210 *
211 * Revision 6.17 2001/10/03 14:19:53 wheelan
212 * change names of all alignmgr calls, plus add profile-making code
213 *
214 * Revision 6.15 2001/09/07 12:15:25 wheelan
215 * small fix for reverse complement translation
216 *
217 * Revision 6.14 2001/09/07 12:05:17 wheelan
218 * moved protein translation for convenient use on web
219 *
220 * Revision 6.13 2001/09/07 11:47:32 wheelan
221 * fixed coordinates and translation for reverse complement cases
222 *
223 * Revision 6.12 2001/09/04 13:46:47 wheelan
224 * made SPI_RemoveInconsistentAlnsFromSet and SPI_flip_sa_list extern
225 *
226 * Revision 6.11 2001/08/24 23:27:15 wheelan
227 * removed unwanted semicolon
228 *
229 * Revision 6.10 2001/08/24 13:45:20 wheelan
230 * better region sorting (better scores), plus different printing options added
231 *
232 * Revision 6.9 2001/08/20 21:28:34 wheelan
233 * improved relative scoring of initial regions, added seqid types
234 *
235 * Revision 6.8 2001/07/20 10:31:10 wheelan
236 * fixed uninitialized variable plus another polyA mistake
237 *
238 * Revision 6.7 2001/07/19 18:22:36 wheelan
239 * better handling of polyA tails
240 *
241 * Revision 6.6 2001/07/11 17:56:53 wheelan
242 * added more functions to deal with making multiple alignments
243 *
244 * Revision 6.5 2001/07/10 16:44:53 wheelan
245 * added functions to make a multiple alignment
246 *
247 * Revision 6.4 2001/07/06 10:27:21 wheelan
248 * fixed minor things pointed out by D. Vakatov
249 *
250 * Revision 6.3 2001/06/25 17:00:47 wheelan
251 * frame fix in GetProteinFrommRNA
252 *
253 * Revision 6.2 2001/06/22 20:54:49 wheelan
254 * spidey now tries to make as many alignments as requested, even if that means throwing away the "best" regions if they have no alignment
255 *
256 * Revision 6.1 2001/05/24 16:28:10 wheelan
257 * initial checkin
258 *
259 *
260 * ==========================================================================
261 */
262
263 #include <spidey.h>
264
265
266 static int LIBCALLBACK SPI_CompareAlnPosForMult(VoidPtr ptr1, VoidPtr ptr2);
267 static Boolean spi_overlaps(SeqAlignPtr sap, SPI_BlockPtr sbp);
268 static void SPI_BeautifySMP(SPI_RegionInfoPtr srip);
269 static void SPI_RemoveOutsideBounds(SeqAlignPtr sap, SPI_OptionsPtr spot);
270 static void SPI_PadRegions(SPI_RegionInfoPtr srip, Int4 bsplen);
271 static int LIBCALLBACK SPI_compare_aln_score(VoidPtr ptr1, VoidPtr ptr2);
272 static void SPI_SortRegionsByScore(SPI_RegionInfoPtr PNTR srip, SPI_OptionsPtr spot);
273 static int LIBCALLBACK SPI_CompareRegions(VoidPtr ptr1, VoidPtr ptr2);
274 static void SPI_PrintAce(FILE *ofp, SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, Boolean is_cds);
275 static void SPI_PrintResult(FILE *ofp, FILE *ofp2, SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot, Boolean isitCDS);
276 static void SPI_PrintHerdResult(FILE *ofp, FILE *ofp2, SPI_mRNAToHerdPtr herd, SPI_OptionsPtr spot, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna);
277 static void spi_print_mismatch_line(FILE *ofp, Int4 exonnum, Int4 start, Int4 len, SPI_ExonProfPtr epp, Int4 gstart);
278 static SeqAlignPtr SPI_CreateContinuousAln(SeqAlignPtr PNTR saps, Int4 numsaps);
279 static void SPI_ExtendAlnRight(SeqAlignPtr sap, Int4 which_row, Int4 start, Int4 stop);
280 static SPI_mRNAToHerdPtr SPI_GetHerdInfo(SPI_FragHerdPtr sfhp, BioseqPtr bsp_mrna, SPI_OptionsPtr spot);
281 static SPI_RegionInfoPtr SPI_FindWindows(SeqAlignPtr sap, SPI_OptionsPtr spot);
282 static int LIBCALLBACK SPI_compare_aln_score(VoidPtr ptr1, VoidPtr ptr2);
283 static SPI_RegionInfoPtr SPI_SortRegions(SPI_RegionInfoPtr srip_head);
284 static int LIBCALLBACK SPI_SortSrips(VoidPtr ptr1, VoidPtr ptr2);
285 static SPI_RegionInfoPtr SPI_AssembleRegions(SPI_AlnInfoPtr PNTR spip_list, Int4 num, SPI_RegionInfoPtr PNTR head_srip, SPI_OptionsPtr spot);
286 static SPI_RegionInfoPtr SPI_GetRegionForSAP(SPI_IvalPtr PNTR siip_list, Int4 num, SeqAlignPtr sap, SPI_OptionsPtr spot);
287 static Int2 SPI_is_consistent(SPI_IvalPtr siip, SPI_RegionInfoPtr srip, SPI_OptionsPtr spot);
288 static int LIBCALLBACK SPI_compare_genomic_loc(VoidPtr ptr1, VoidPtr ptr2);
289 static void SPI_ExcludeOverlaps(SPI_IvalPtr PNTR siip_list, Int4 num, SPI_RegionInfoPtr srip);
290 static void SPI_AlignInWindows(SPI_RegionInfoPtr PNTR head_srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot);
291 static void SPI_DoAln(SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot);
292 static Boolean SPI_ConnectAln(SeqAlignPtr sap, SPI_OptionsPtr spot, SPI_RegionInfoPtr srip, Boolean do_ends, Boolean firsttime);
293 static SeqAlignPtr SPI_ProcessNewAlns(SeqAlignPtr sap);
294 static Int4 SPI_IsItPolyA(SeqIdPtr sip);
295 static SeqAlignPtr SPI_FillInIntron(SeqIdPtr sip1, SeqIdPtr sip2, Int4 start1, Int4 stop1, Int4 start2, Int4 stop2, Uint1 strand2, SPI_OptionsPtr spot);
296 static Int4 spi_isa_gap(Int4 start, Int4 prevstop, Uint1 strand);
297 static Int4 SPI_GetNthSeqLenInSASet(SeqAlignPtr sap, Int4 n, Int4Ptr numsaps);
298 static void SPI_GetNthSeqRangeInSASet(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop);
299 static SeqAlignPtr SPI_FindPiece(SeqIdPtr sip1, SeqIdPtr sip2, Int4 start_m, Int4 stop_m, Uint1 strand, Int4 start_g, Int2 which_end, SPI_OptionsPtr spot);
300 static SPI_mRNAPtr SPI_AdjustForSplice(SeqAlignPtr sap, SPI_OptionsPtr spot, SPI_RegionInfoPtr srip);
301 static Int4 SPI_GetExonInfo(SPI_mRNAPtr smp, Int4 n, Int4Ptr start, Int4Ptr stop, Int4Ptr mis, SPI_OptionsPtr spot);
302 static void SPI_AdjustOverlaps(SeqAlignPtr sap1, SeqAlignPtr sap2, Int4 n, SPI_mRNAPtr smp, SPI_OptionsPtr spot);
303 static void SPI_RemoveTeenyAlns(SeqAlignPtr sap, Int4 len);
304 static void SPI_ExtendAlnAlgDumb(SeqAlignPtr sap, Int4 ovl, Int4 which_side, Uint1 strand);
305 static void SPI_GetAcceptorScore(BioseqPtr bsp, Int4 pos1, Int4 pos2, Uint1 strand, FloatHiPtr score, Int4 spllen, SPI_OptionsPtr spot);
306 static Int4 spi_get_overlap (SeqAlignPtr sap1, SeqAlignPtr sap2);
307 static void SPI_AddToAln(SeqAlignPtr sap, Int4 offset, Int2 which_end, Uint1 strand);
308 static SeqAlignPtr SPI_MergeAlignments(SeqAlignPtr sap1, SeqAlignPtr sap2);
309 static SeqAlignPtr SPI_FillInLastmRNAHoles(SeqAlignPtr sap, SeqIdPtr sip_genomic, SeqIdPtr sip_mrna, Int4 start_g, Int4 stop_g, Int4 start_m, Int4 stop_m, Uint1 strand);
310 static SeqAlignPtr SPI_FindBestAlnByDotPlot(SeqLocPtr slp1, SeqLocPtr slp2);
311 static int LIBCALLBACK SPI_comp_aln_pos(VoidPtr ptr1, VoidPtr ptr2);
312 static void SPI_RegionFree (SPI_RegionInfoPtr srip);
313 static void SPI_FreeExonProf(SPI_ExonProfPtr epp);
314 static void SPI_FreeExonProfList(SPI_ExonProfPtr epp);
315 static void SPI_GetDonorSpliceInfo (Int4 org, Int4Ptr spllen, Int4Ptr boundary, SPI_OptionsPtr spot);
316 static void SPI_is_donor_user(Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, SPI_OptionsPtr spot);
317 static void SPI_is_donor_vert (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
318 static void SPI_is_donor_fly (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
319 static void SPI_is_donor_plant (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
320 static void SPI_is_donor_cele (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
321 static void SPI_is_donor_dicty (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
322 static void SPI_GetAcceptorSpliceInfo (Int4 org, Int4Ptr spllen, Int4Ptr boundary, SPI_OptionsPtr spot);
323 static void SPI_is_acceptor_user(Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, SPI_OptionsPtr spot);
324 static void SPI_is_acceptor_vert (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
325 static void SPI_is_acceptor_fly (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
326 static void SPI_is_acceptor_plant (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
327 static void SPI_is_acceptor_cele (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
328 static void SPI_is_acceptor_dicty (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score);
329 static void SPI_RemoveConflictsAmongPieces(SPI_FragHerdPtr sfhp, Int4 fuzz);
330 static void SPI_OrderPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_mrna);
331 static int LIBCALLBACK SPI_CompareFragInfo(VoidPtr ptr1, VoidPtr ptr2);
332 static Boolean SPI_ConnectAlnPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_contig, BioseqPtr bsp_mrna, SPI_OptionsPtr spot);
333 static void SPI_CleanupAndGetNewmRNARange(SPI_FragPtr PNTR sfpnearby, Int4 n, Int4Ptr start, Int4Ptr stop);
334 static Int4 SPI_GetNearbyFrags(SPI_FragPtr sfptarget, Int4 n, SPI_FragPtr ** ptrptr, SPI_FragHerdPtr sfhp, Boolean minus);
335 static void SPI_AdjustSplicesInPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_genomic, SPI_OptionsPtr spot);
336 static void SPI_AdjustEndsOfPieces(SPI_FragPtr sfp1, SPI_FragPtr sfp2, BioseqPtr bsp_genomic, SPI_OptionsPtr spot);
337 static SeqAlignPtr SPI_GetNthSAByRow(SeqAlignPtr sap, Int4 row, Int4 n);
338 static SPI_FragSplPtr SPI_GetPossibleSites(SeqAlignPtr sap, BioseqPtr bsp_genomic, SPI_OptionsPtr spot, Boolean donor, Int4 ovl);
339 static void SPI_FragSplFree(SPI_FragSplPtr fsp);
340 static int LIBCALLBACK SPI_CompareSpins(VoidPtr ptr1, VoidPtr ptr2);
341 static void SPI_OrderInternally(SPI_FragHerdPtr sfhp);
342 static int LIBCALLBACK SPI_CompareAlnPos(VoidPtr ptr1, VoidPtr ptr2);
343 static SPI_RegionInfoPtr SPI_GetResultsForCDS(SPI_RegionInfoPtr srip_mrna, BioseqPtr bsp_mrna, SPI_OptionsPtr spot);
344 static void SPI_FillInUTRInfo(SPI_RegionInfoPtr srip_cds, SPI_RegionInfoPtr srip_mrna, Int4 len, Int4 exonstart, Int4 exonstop);
345 static Boolean LIBCALLBACK SPI_GetCDS(SeqFeatPtr sfp, SeqMgrFeatContextPtr context);
346 static Boolean LIBCALLBACK SPI_GetCDSFeat(SeqFeatPtr sfp, SeqMgrFeatContextPtr context);
347 static Int4 SPI_FindLongestProt(CharPtr seq, Int4Ptr pos);
348 static Boolean SPI_GetAccessionFromSeqId(SeqIdPtr sip, Int4Ptr gi, CharPtr PNTR id);
349 static void SPI_CheckSplicesForRevComp(SPI_RegionInfoPtr srip, SPI_OptionsPtr spot, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna);
350 static ACTProfilePtr SPI_ProfileNew(Boolean nuc);
351 static ACTProfilePtr SPI_ProfileFree(ACTProfilePtr app);
352 static ACTProfilePtr SPI_ProfileSetFree(ACTProfilePtr app);
353 static void SPI_BuildProfile(SeqLocPtr slp, ACTProfilePtr PNTR app, Int4Ptr count, Int4 length);
354 static ACTProfilePtr SPI_MakeProfileFromSA(SeqAlignPtr sap);
355 static int SPI_Choose2LooseMrnaOvLap (const SeqAlignPtr sap1, const SeqAlignPtr sap2, const SPI_mRNAPtr smp, const int ptr1offset);
356 static void SPI_CheckMrnaOrder(SPI_IvalPtr PNTR spi_pp, const int num);
357
358
359 /***************************************************************************
360 *
361 * SPI_AlnmRNAToGenomic is available to outside programs; just pass in the two
362 * bioseqs and options (to use default options, just pass in NULL, and to use
363 * other options, call SPI_OptionsNew to get an initialized options pointer and
364 * make the desired changes). If options are passed in, they should be freed
365 * using SPI_OptionsFree. SPI_AlignmRNAToGenomic returns a linked list of
366 * SPI_mRNAPtrs, one per gene model (default is to only return one gene model).
367 * Each SPI_mRNAPtr (see spidey.h) has arrays specifying the exon boundaries in
368 * genomic and mRNA coordinates as well as information about splice sites,
369 * percent identity, number of gaps, etc. The SPI_mRNAPtr also has one alignment
370 * per exon as well as a single alignment (smp->continuous) that covers the entire
371 * gene, with big gaps in the mRNA for the genomic introns. The SPI_mRNAPtr should
372 * be freed by the calling function, using SPI_mRNAFree.
373 *
374 * SPI_AlnmRNAToGenomic should only be used on finished sequence; it can handle
375 * interspecies comparisons but doesn't work on draft sequence.
376 *
377 ***************************************************************************/
SPI_AlignmRNAToGenomic(BioseqPtr bsp_genomic,BioseqPtr bsp_mrna,SPI_OptionsPtr spot)378 NLM_EXTERN SPI_mRNAPtr SPI_AlignmRNAToGenomic(BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
379 {
380 Int4 a;
381 Int4 i;
382 Boolean lcl;
383 SeqAlignPtr parent;
384 SPI_Progress progress;
385 SeqAlignPtr PNTR saparray;
386 SPI_mRNAPtr smp_head;
387 SPI_mRNAPtr smp_prev;
388 SPI_bsinfoPtr spig;
389 SPI_bsinfoPtr spim;
390 SPI_OptionsPtr spot_lcl;
391 SPI_RegionInfoPtr srip;
392 SPI_RegionInfoPtr srip_head;
393
394 if (bsp_genomic == NULL || bsp_mrna == NULL)
395 return NULL;
396 a = SPI_IsItPolyA(bsp_mrna->id);
397 if (spot == NULL)
398 {
399 spot_lcl = SPI_OptionsNew();
400 lcl = TRUE;
401 } else
402 {
403 spot_lcl = spot;
404 lcl = FALSE;
405 }
406 spig = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
407 spig->bsp = bsp_genomic;
408 spim = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
409 spim->bsp = bsp_mrna;
410 srip_head = SPI_AlnSinglemRNAToGen(spig, spim, NULL, NULL, spot_lcl);
411 if (spot_lcl->callback != NULL)
412 {
413 progress.percentdone = 100;
414 progress.returncode = SPI_FINISHED;
415 if (!spot_lcl->callback(&progress))
416 return NULL;
417 }
418 smp_head = smp_prev = NULL;
419 srip = srip_head;
420 while (srip != NULL)
421 {
422 if (srip->smp->polyAtail == 0)
423 srip->smp->polyAtail = 0-a;
424 srip->smp->revcomp = srip->revcomp;
425 if (smp_head != NULL)
426 {
427 smp_prev->next = srip->smp;
428 smp_prev = srip->smp;
429 } else
430 smp_head = smp_prev = srip->smp;
431 saparray = (SeqAlignPtr PNTR)MemNew((srip->smp->numexons)*sizeof(SeqAlignPtr));
432 for (i=0; i<srip->smp->numexons; i++)
433 {
434 saparray[i] = SeqAlignDup(srip->smp->saps[i]);
435 AlnMgr2IndexSingleChildSeqAlign(saparray[i]);
436 }
437 srip->smp->continuous = SPI_CreateContinuousAln(srip->smp->saps, srip->smp->numexons);
438 for (i=0; i<srip->smp->numexons; i++)
439 {
440 SeqAlignFree(srip->smp->saps[i]);
441 srip->smp->saps[i] = saparray[i];
442 if (i < srip->smp->numexons-1)
443 {
444 srip->smp->saps[i]->next = saparray[i+1];
445 saparray[i+1]->next = NULL;
446 }
447 }
448 MemFree(saparray);
449 parent = SeqAlignNew();
450 parent->segtype = SAS_DISC;
451 parent->segs = (Pointer)(srip->smp->saps[0]);
452 AlnMgr2IndexLite(parent);
453 srip->smp->parent = parent;
454 srip->polyAtail = srip->smp->polyAtail;
455 srip->smp->fallsoff = srip->fallsoff;
456 if (srip->smp->protein == NULL)
457 srip->smp->protein = SPI_GetProteinFrommRNA(spim->bsp, &srip->smp->transstart);
458 srip->smp = NULL; /* so that smp doesn't get freed */
459 srip = srip->next;
460 }
461 SPI_bsinfoFreeList(spig);
462 SPI_bsinfoFreeList(spim);
463 SPI_RegionListFree(srip_head);
464 if (lcl)
465 SPI_OptionsFree(spot_lcl);
466 return (smp_head);
467 }
468
469 /***************************************************************************
470 *
471 * SPI_AlnSinglemRNAToGen is called by Main() as well as by
472 * SPI_AlignmRNAToGenomic. It does the initial BLAST
473 * (high stringency) and makes sure that all the alignments
474 * are on the plus strand of the genomic sequence. It then calls the
475 * functions to create the windows, align in the windows, and sort the
476 * final alignments. Finally, it calls the functions to print the
477 * information for each alignment, fetch the CDS and create its
478 * alignment if requested, and create a continuous alignment if the
479 * user wishes to print an asn.1 seqalign. When everything is done, it
480 * frees the SPI_RegionInfoPtrs if the program is run as the
481 * standalone version, or returns them to the calling function.
482 *
483 ***************************************************************************/
SPI_AlnSinglemRNAToGen(SPI_bsinfoPtr spig,SPI_bsinfoPtr spim,FILE * ofp,FILE * ofp2,SPI_OptionsPtr spot)484 NLM_EXTERN SPI_RegionInfoPtr SPI_AlnSinglemRNAToGen(SPI_bsinfoPtr spig, SPI_bsinfoPtr spim, FILE *ofp, FILE *ofp2, SPI_OptionsPtr spot)
485 {
486 Char rep_buf[1024] = "m L;R";
487 Int4 i;
488 BLAST_OptionsBlkPtr options;
489 SPI_Progress progress;
490 SeqAlignPtr salp;
491 SeqAlignPtr salp_tmp;
492 SeqAlignPtr sap;
493 SeqAlignPtr sap_tmp;
494 SeqAlignPtr sap_tmp2;
495 SeqLocPtr slp1;
496 SeqLocPtr slp2;
497 SPI_RegionInfoPtr srip;
498 SPI_RegionInfoPtr srip_cds;
499 SPI_RegionInfoPtr srip_tmp;
500 Boolean standalone;
501 Uint1 strand;
502
503 if (spig == NULL || spim == NULL)
504 return NULL;
505 /*sanity checks for to & from*/
506 if (spot->to == 0){
507 spot->to = spig->bsp->length - 1;
508 }
509 else if (spot->to < spot->from){
510 Int4 new_from = spot->to;
511 spot->to = spot->from;
512 spot->from = new_from;
513 }
514 if (spot->from == spot->to){
515 return NULL;
516 }
517 /***
518 if (spot->to < spot->from)
519 return NULL;
520 ***/
521 if (ofp == NULL)
522 standalone = FALSE;
523 else
524 standalone = TRUE;
525 spot->printheader = TRUE;
526 options = BLASTOptionNew("blastn", FALSE);
527
528 /* KSK added to allow user defined repeat db path */
529 /* options->filter_string = StringSave("m L;R"); */
530 if (spot->repeat_db_file){
531 strcat(rep_buf, " -d ");
532 strcat(rep_buf, spot->repeat_db_file);
533 }
534 options->filter_string = StringSave(rep_buf);
535 /*end of adding repeat db path */
536 options->expect_value = spot->firstpasseval;
537 options->query_lcase_mask = spot->lcaseloc;
538 if (spot->interspecies)
539 {
540 options->gap_x_dropoff_final = 100;
541 options->gap_open = 5;
542 options->gap_extend = 1;
543 options->penalty = -1;
544 }
545 /* do the BLAST with the mRNA as the query, for speed */
546 if (spot->callback != NULL)
547 {
548 progress.percentdone = 5;
549 progress.returncode = SPI_START;
550 if (!spot->callback(&progress))
551 return NULL;
552 }
553 /*** not checking for to/from, so it's gotta be just as easy
554 to send it directly to ..ByLoc()***
555 if (spot->strand == Seq_strand_both)
556 sap = BlastTwoSequences(spim->bsp, spig->bsp, "blastn", options);
557 else
558 {
559 ***/
560 slp1 = SeqLocIntNew(0, spim->bsp->length-1, Seq_strand_plus, spim->bsp->id);
561 slp2 = SeqLocIntNew(spot->from, spot->to, spot->strand, spig->bsp->id);
562 /* slp2 = SeqLocIntNew(0, spig->bsp->length-1, spot->strand, spig->bsp->id); */
563 sap = BlastTwoSequencesByLoc(slp1, slp2, "blastn", options);
564 /* } */
565
566 if (spot->callback != NULL)
567 {
568 progress.percentdone = 30;
569 progress.returncode = SPI_PROGRESS;
570 if (!spot->callback(&progress))
571 return NULL;
572 }
573 if (sap == NULL)
574 {
575 if (standalone)
576 SPI_PrintResult(ofp, ofp2, NULL, spig->bsp, spim->bsp, spot, FALSE);
577 BLASTOptionDelete(options);
578 return NULL;
579 }
580 if (!AlnMgr2IndexLite(sap))
581 {
582 if (standalone)
583 SPI_PrintResult(ofp, ofp2, NULL, spig->bsp, spim->bsp, spot, FALSE);
584 ErrPostEx(SEV_ERROR, 0, 0, "Alignment indexing error\n");
585 SeqAlignSetFree(sap);
586 BLASTOptionDelete(options);
587 return NULL;
588 }
589 salp = (SeqAlignPtr)(sap->segs);
590 /* since the mRNA was the query, need to flip all the alignments */
591 /* so that the mRNA will end up as the second row */
592 SPI_flip_sa_list(salp);
593 /* now make sure that everything is on the plus strand of the genomic sequence */
594 while (salp != NULL)
595 {
596 strand = AlnMgr2GetNthStrand(salp, 1);
597 if (strand == Seq_strand_minus)
598 {
599 salp_tmp = salp->next;
600 salp->next = NULL;
601 SAIndex2Free2(salp->saip);
602 salp->saip = NULL;
603 salp = SeqAlignListReverseStrand(salp);
604 AlnMgr2IndexSingleChildSeqAlign(salp);
605 salp->next = salp_tmp;
606 }
607 salp = salp->next;
608 }
609 SPI_RemoveOutsideBounds(sap, spot);
610 BLASTOptionDelete(options);
611 srip = SPI_FindWindows(sap, spot);
612 SPI_PadRegions(srip, spig->bsp->length);
613 /* once the windows are found, throw out the original alignment */
614 /* and carefully align in each window */
615 SeqAlignSetFree(sap);
616 SPI_AlignInWindows(&srip, spig->bsp, spim->bsp, spot);
617 if (spot->callback != NULL)
618 {
619 progress.percentdone = 95;
620 progress.returncode = SPI_PROGRESS;
621 if (!spot->callback(&progress))
622 return NULL;
623 }
624 SPI_SortRegionsByScore(&srip, spot);
625 SPI_BeautifySMP(srip);
626 SPI_CheckSplicesForRevComp(srip, spot, spig->bsp, spim->bsp);
627 srip_cds = NULL;
628 /* if the CDS alignment is desired, fetch the CDS information and */
629 /* compute the CDS alignment by truncating the mRNA alignments */
630 if (standalone)
631 {
632 srip_tmp = srip;
633 for (i=0; i<spot->numreturns; i++)
634 {
635 spot->printheader = FALSE;
636 if (spot->fetchcds)
637 {
638 srip_cds = SPI_GetResultsForCDS(srip_tmp, spim->bsp, spot);
639 SPI_BeautifySMP(srip_cds);
640 }
641 if (srip_cds != NULL && spot->ace == FALSE)
642 {
643 spot->printheader = TRUE;
644 SPI_PrintResult(ofp, ofp2, srip_cds, spig->bsp, spim->bsp, spot, TRUE);
645 SPI_RegionListFree(srip_cds);
646 } else if (srip_cds != NULL && spot->ace == TRUE)
647 SPI_PrintAce(ofp, srip_cds, spig->bsp, spim->bsp, TRUE);
648 if (spot->printheader) /* print the version info only once per mRNA/CDS */
649 spot->printheader = FALSE;
650 else
651 spot->printheader = TRUE;
652 /** KSK bug fix - access of null srip_tmp->smp that
653 results from SPI_SortRegionsByScore() removing
654 all below -c threshold **/
655 if (srip_tmp != NULL && srip_tmp->smp != NULL && srip_tmp->smp->protein == NULL)
656 srip_tmp->smp->protein = SPI_GetProteinFrommRNA(spim->bsp, &srip_tmp->smp->transstart);
657 if (spot->ace == FALSE)
658 {
659 if (srip_tmp == NULL)
660 SPI_PrintResult(ofp, ofp2, srip_tmp, spig->bsp, spim->bsp, spot, FALSE);
661 else
662 {
663 SPI_PrintResult(ofp, ofp2, srip_tmp, spig->bsp, spim->bsp, spot, FALSE);
664 srip_tmp = srip_tmp->next;
665 }
666 } else
667 SPI_PrintAce(ofp, srip_tmp, spig->bsp, spim->bsp, FALSE);
668 }
669 if (spot->printasn && srip != NULL && srip->smp != NULL)
670 {
671 sap_tmp = SPI_CreateContinuousAln(srip->smp->saps, srip->smp->numexons);
672 if (*(spot->sap_head) == NULL)
673 *(spot->sap_head) = sap_tmp;
674 else
675 {
676 sap_tmp2 = *(spot->sap_head);
677 while (sap_tmp2->next != NULL)
678 {
679 sap_tmp2 = sap_tmp2->next;
680 }
681 sap_tmp2->next = sap_tmp;
682 }
683 }
684 if (spot->makemult == FALSE)
685 SPI_RegionListFree(srip);
686 else
687 return srip;
688 } else
689 {
690 if (srip_cds != NULL)
691 {
692 srip_cds->next = srip;
693 srip = srip_cds;
694 }
695 return srip;
696 }
697 spot->printheader = FALSE;
698 return NULL;
699 }
700
701 /***************************************************************************
702 *
703 * SPI_CompareAlnPosForMult is the callback for the HeapSort in
704 * SPI_MakeMultipleAlignment. It simply puts the alignments in order
705 * along the genomic sequence, from least to greatest if the alignments
706 * are on the plus strand, greatest to least otherwise.
707 *
708 ***************************************************************************/
SPI_CompareAlnPosForMult(VoidPtr ptr1,VoidPtr ptr2)709 static int LIBCALLBACK SPI_CompareAlnPosForMult(VoidPtr ptr1, VoidPtr ptr2)
710 {
711 Int4 from1;
712 Int4 from2;
713 SeqAlignPtr sap1;
714 SeqAlignPtr sap2;
715 Uint1 strand;
716 Int4 to1;
717 Int4 to2;
718
719 sap1 = *((SeqAlignPtr PNTR)ptr1);
720 sap2 = *((SeqAlignPtr PNTR)ptr2);
721 strand = AlnMgr2GetNthStrand(sap1, 1);
722 AlnMgr2GetNthSeqRangeInSA(sap1, 1, &from1, &to1);
723 AlnMgr2GetNthSeqRangeInSA(sap2, 1, &from2, &to2);
724 if (strand == Seq_strand_minus)
725 {
726 if (from1 < from2)
727 return 1;
728 if (from2 < from1)
729 return -1;
730 if (to1 < to2)
731 return 1;
732 if (to2 < to1)
733 return -1;
734 } else
735 {
736 if (from1 < from2)
737 return -1;
738 if (from2 < from1)
739 return 1;
740 if (to1 < to2)
741 return -1;
742 if (to2 < to1)
743 return 1;
744 }
745 return 0;
746 }
747
748 /***************************************************************************
749 *
750 * SPI_OrderBlocksPlus is a callback for SPI_MakeMultipleAlignment. It
751 * is used to order the blocks along the genomic sequence when the genomic
752 * strand is plus.
753 *
754 ***************************************************************************/
SPI_OrderBlocksPlus(VoidPtr ptr1,VoidPtr ptr2)755 static int LIBCALLBACK SPI_OrderBlocksPlus(VoidPtr ptr1, VoidPtr ptr2)
756 {
757 SPI_BlockPtr sbp1;
758 SPI_BlockPtr sbp2;
759
760 sbp1 = *((SPI_BlockPtr PNTR)ptr1);
761 sbp2 = *((SPI_BlockPtr PNTR)ptr2);
762 if (sbp1->from_g < sbp2->from_g)
763 return -1;
764 if (sbp2->from_g < sbp1->from_g)
765 return 1;
766 if (sbp1->to_g < sbp2->to_g)
767 return -1;
768 if (sbp1->to_g > sbp2->to_g)
769 return 1;
770 return 0;
771 }
772
773 /***************************************************************************
774 *
775 * SPI_OrderBlocksMinus is a callback for SPI_MakeMultipleAlignment. It
776 * is used to order the blocks along the genomic sequence when the genomic
777 * strand is minus.
778 *
779 ***************************************************************************/
SPI_OrderBlocksMinus(VoidPtr ptr1,VoidPtr ptr2)780 static int LIBCALLBACK SPI_OrderBlocksMinus(VoidPtr ptr1, VoidPtr ptr2)
781 {
782 SPI_BlockPtr sbp1;
783 SPI_BlockPtr sbp2;
784
785 sbp1 = *((SPI_BlockPtr PNTR)ptr1);
786 sbp2 = *((SPI_BlockPtr PNTR)ptr2);
787 if (sbp1->from_g < sbp2->from_g)
788 return 1;
789 if (sbp2->from_g < sbp1->from_g)
790 return -1;
791 if (sbp1->to_g < sbp2->to_g)
792 return 1;
793 if (sbp1->to_g > sbp2->to_g)
794 return -1;
795 return 0;
796 }
797
798 /***************************************************************************
799 *
800 * SPI_CompareSecondRow is the HeapSort callback for SPI_RearrangeAlns;
801 * it orders two seqaligns in lexical order of the seqid of their second
802 * row. (All seqaligns involved have only two rows anyway).
803 *
804 ***************************************************************************/
SPI_CompareSecondRow(VoidPtr ptr1,VoidPtr ptr2)805 static int LIBCALLBACK SPI_CompareSecondRow(VoidPtr ptr1, VoidPtr ptr2)
806 {
807 Int4 ret;
808 SeqAlignPtr sap1;
809 SeqAlignPtr sap2;
810 SeqIdPtr sip1;
811 SeqIdPtr sip2;
812
813 sap1 = *((SeqAlignPtr PNTR)ptr1);
814 sap2 = *((SeqAlignPtr PNTR)ptr2);
815 sip1 = AlnMgr2GetNthSeqIdPtr(sap1, 2);
816 sip2 = AlnMgr2GetNthSeqIdPtr(sap2, 2);
817 ret = SAM_OrderSeqID(sip1, sip2);
818 SeqIdFree(sip1);
819 SeqIdFree(sip2);
820 return ret;
821 }
822
823 /***************************************************************************
824 *
825 * SPI_RearrangeAlns arranges the alignments in a linked list in lexical
826 * order of their second seqid, so that all blocks will have their
827 * rows in the same order.
828 *
829 ***************************************************************************/
SPI_RearrangeAlns(SeqAlignPtr sap_head)830 static SeqAlignPtr SPI_RearrangeAlns(SeqAlignPtr sap_head)
831 {
832 Int4 i;
833 Int4 j;
834 SeqAlignPtr sap;
835 SeqAlignPtr PNTR saparray;
836
837 i = 0;
838 sap = sap_head;
839 while (sap != NULL)
840 {
841 i++;
842 sap = sap->next;
843 }
844 if (i == 1)
845 return sap_head;
846 saparray = (SeqAlignPtr PNTR)MemNew(i*sizeof(SeqAlignPtr));
847 sap = sap_head;
848 i = 0;
849 while (sap != NULL)
850 {
851 saparray[i] = sap;
852 i++;
853 sap = sap->next;
854 }
855 HeapSort(saparray, i, sizeof(SeqAlignPtr), SPI_CompareSecondRow);
856 for (j=0; j<i-1; j++)
857 {
858 saparray[j]->next = saparray[j+1];
859 }
860 saparray[i-1]->next = NULL;
861 sap = saparray[0];
862 MemFree(saparray);
863 return sap;
864 }
865
866 /***************************************************************************
867 *
868 * SPI_MakeMultipleAlignment takes all exons returned from all mRNAs, all
869 * regions, and makes multiple alignments out of them (after grouping them
870 * into blocks. All blocks will not necessarily contain all mRNAs.
871 *
872 ***************************************************************************/
SPI_MakeMultipleAlignment(SPI_RegionInfoPtr srip_head)873 NLM_EXTERN void SPI_MakeMultipleAlignment(SPI_RegionInfoPtr srip_head)
874 {
875 SPI_BlockPtr PNTR blockarray;
876 Int4 i;
877 Int4 j;
878 Int4 minus;
879 Int4 numblocks;
880 Int4 numsmps;
881 Int4 plus;
882 SeqAlignPtr sap;
883 SeqAlignPtr sap_head;
884 SeqAlignPtr sap_prev;
885 SeqAlignPtr sap_tmp;
886 SeqAlignPtr PNTR saparray;
887 SPI_BlockPtr sbp = NULL;
888 SPI_BlockPtr sbp_head;
889 SPI_BlockPtr sbp_prev;
890 SPI_MultPtr smu;
891 SPI_RegionInfoPtr srip;
892 Uint1 strand;
893 Uint1 strand_tmp;
894 SeqAlignPtr sub_sap;
895
896 if (srip_head->next == NULL) /* only one alignment here */
897 return;
898 i = 0;
899 sap_head = sap_prev = NULL;
900 numsmps = 0;
901 minus = plus = 0;
902 srip = srip_head;
903 while (srip != NULL)
904 {
905 if (srip->smp != NULL)
906 {
907 if (srip->smp->strand == Seq_strand_minus)
908 minus++;
909 else
910 plus++;
911 for (j=0; j<srip->smp->numexons; j++)
912 {
913 sap = SeqAlignDup(srip->smp->saps[j]);
914 AlnMgr2IndexSingleChildSeqAlign(sap);
915 if (sap_head != NULL)
916 {
917 sap_prev->next = sap;
918 sap_prev = sap;
919 } else
920 sap_head = sap_prev = sap;
921 i++;
922 }
923 numsmps++;
924 }
925 srip = srip->next;
926 }
927 if (numsmps <= 1)
928 {
929 SeqAlignSetFree(sap_head);
930 return;
931 }
932 if (minus > plus)
933 strand = Seq_strand_minus;
934 else
935 strand = Seq_strand_plus;
936 saparray = (SeqAlignPtr PNTR)MemNew(i*sizeof(SeqAlignPtr));
937 sap = sap_head;
938 for (j=0; j<i; j++)
939 {
940 strand_tmp = AlnMgr2GetNthStrand(sap, 1);
941 if ((strand_tmp == Seq_strand_minus && strand != Seq_strand_minus) || (strand_tmp != Seq_strand_minus && strand == Seq_strand_minus))
942 {
943 sap_tmp = sap->next;
944 sap->next = NULL;
945 SeqAlignListReverseStrand(sap);
946 sap->next = sap_tmp;
947 }
948 saparray[j] = sap;
949 sap = sap->next;
950 }
951 HeapSort(saparray, i, sizeof(SeqAlignPtr), SPI_CompareAlnPosForMult);
952 for (j=0; j<i; j++)
953 {
954 saparray[j]->next = NULL;
955 }
956 sbp_head = sbp_prev = NULL;
957 for (j=0; j<i; j++)
958 {
959 if (sbp_head == NULL)
960 {
961 sbp = (SPI_BlockPtr)MemNew(sizeof(SPI_Block));
962 AlnMgr2GetNthSeqRangeInSA(saparray[j], 1, &sbp->from_g, &sbp->to_g);
963 sbp->sap = saparray[j];
964 saparray[j] = NULL;
965 sbp_head = sbp_prev = sbp;
966 } else
967 {
968 if (spi_overlaps(saparray[j], sbp))
969 {
970 sap_tmp = sbp->sap;
971 while (sap_tmp->next != NULL)
972 {
973 sap_tmp = sap_tmp->next;
974 }
975 sap_tmp->next = saparray[j];
976 saparray[j] = NULL;
977 } else
978 {
979 sbp = (SPI_BlockPtr)MemNew(sizeof(SPI_Block));
980 AlnMgr2GetNthSeqRangeInSA(saparray[j], 1, &sbp->from_g, &sbp->to_g);
981 sbp->sap = saparray[j];
982 saparray[j] = NULL;
983 sbp_prev->next = sbp;
984 sbp_prev = sbp;
985 }
986 }
987 }
988 MemFree(saparray);
989 sbp = sbp_head;
990 numblocks = 0;
991 while (sbp)
992 {
993 numblocks++;
994 sbp->sap = SPI_RearrangeAlns(sbp->sap);
995 AlnMgr2IndexIndexedChain(sbp->sap);
996 sub_sap = AlnMgr2GetSubAlign(sbp->sap, 0, -1, 0, TRUE);
997 SeqAlignSetFree(sbp->sap);
998 sbp->sap = sub_sap;
999 if (strand == Seq_strand_minus)
1000 sbp->sap = SeqAlignListReverseStrand(sbp->sap);
1001 AlnMgr2IndexSingleChildSeqAlign(sub_sap);
1002 sbp = sbp->next;
1003 }
1004 blockarray = (SPI_BlockPtr PNTR)MemNew(numblocks*sizeof(SPI_BlockPtr));
1005 sbp = sbp_head;
1006 j = 0;
1007 while (sbp != NULL)
1008 {
1009 blockarray[j] = sbp;
1010 j++;
1011 sbp = sbp->next;
1012 }
1013 if (strand == Seq_strand_minus)
1014 HeapSort(blockarray, numblocks, sizeof(SPI_BlockPtr), SPI_OrderBlocksMinus);
1015 else
1016 HeapSort(blockarray, numblocks, sizeof(SPI_BlockPtr), SPI_OrderBlocksPlus);
1017 saparray = (SeqAlignPtr PNTR)MemNew(numblocks*sizeof(SeqAlignPtr));
1018 for (j=0; j<numblocks; j++)
1019 {
1020 saparray[j] = blockarray[j]->sap;
1021 MemFree(blockarray[j]);
1022 }
1023 MemFree(blockarray);
1024 smu = (SPI_MultPtr)MemNew(sizeof(SPI_Mult));
1025 smu->exons = saparray;
1026 smu->numexons = numblocks;
1027 srip_head->smu = smu;
1028 }
1029
1030 /***************************************************************************
1031 *
1032 * SPI_WriteAlnLine prints out the specified row of an alignment, between
1033 * the alignment coordinates specified. It allocates the charptr itself;
1034 * this must be freed later by the calling function.
1035 *
1036 ***************************************************************************/
SPI_WriteAlnLine(Int4 row,Int4 from,Int4 to,SeqAlignPtr sap)1037 static CharPtr SPI_WriteAlnLine(Int4 row, Int4 from, Int4 to, SeqAlignPtr sap)
1038 {
1039 AlnMsg2Ptr amp;
1040 BioseqPtr bsp;
1041 Uint1 buf[SPI_LINE+2];
1042 Int4 ctr;
1043 Int4 i;
1044 Boolean more;
1045 Int4 n;
1046 SeqIdPtr sip;
1047 SeqPortPtr spp;
1048 CharPtr string;
1049
1050 n = AlnMgr2GetNumRows(sap);
1051 if (row > n || row < 1)
1052 return NULL;
1053 string = (CharPtr)MemNew((SPI_LINE+2)*sizeof(Char));
1054 for (n=0; n<(SPI_LINE+2); n++)
1055 {
1056 string[n] = '\0';
1057 }
1058 sip = AlnMgr2GetNthSeqIdPtr(sap, row);
1059 bsp = BioseqLockById(sip);
1060 amp = AlnMsgNew2();
1061 amp->row_num = row;
1062 amp->from_aln = from;
1063 amp->to_aln = to;
1064 if (amp->to_aln < 0)
1065 amp->to_aln = -1;
1066 n = 0;
1067 while ((more = AlnMgr2GetNextAlnBit(sap, amp)) == TRUE)
1068 {
1069 if (amp->to_row - amp->from_row > amp->to_aln - amp->from_aln) /* kludge */
1070 {
1071 if (amp->strand == Seq_strand_minus)
1072 amp->from_row = amp->to_row - (amp->to_aln - amp->from_aln);
1073 else
1074 amp->to_row = amp->from_row + (amp->to_aln - amp->from_aln);
1075 }
1076 if (amp->type == AM_SEQ)
1077 {
1078 spp = SeqPortNew(bsp, amp->from_row, amp->to_row, amp->strand, Seq_code_iupacna);
1079 ctr = SeqPortRead(spp, buf, (amp->to_row - amp->from_row + 1));
1080 SeqPortFree(spp);
1081 for (i=n; i<n+ctr; i++)
1082 {
1083 string[i] = buf[i-n];
1084 }
1085 n += ctr;
1086 } else
1087 {
1088 for (i=n; i<(n+amp->to_row-amp->from_row+1); i++)
1089 {
1090 string[i] = '-';
1091 }
1092 n += amp->to_row-amp->from_row+1;
1093 }
1094 }
1095 AlnMsgFree2(amp);
1096 SeqIdFree(sip);
1097 return string;
1098 }
1099
1100 /***************************************************************************
1101 *
1102 * SPI_MapRowCoords finds the first non-gap character in a row and
1103 * returns its sequence position. If the row consists only of gaps, it
1104 * returns -1. If direction is RIGHT, the function searches upwards in
1105 * alignment coordinates; otherwise it searches the other direction.
1106 *
1107 ***************************************************************************/
SPI_MapRowCoords(SeqAlignPtr sap,Int4 from,Int4 to,Int4 row,Uint1 direction)1108 static Int4 SPI_MapRowCoords(SeqAlignPtr sap, Int4 from, Int4 to, Int4 row, Uint1 direction)
1109 {
1110 Int4 pos;
1111
1112 if (direction == SPI_RIGHT)
1113 {
1114 pos = AlnMgr2MapSeqAlignToBioseq(sap, from, row);
1115 from++;
1116 while (pos < 0 && from <= to)
1117 {
1118 pos = AlnMgr2MapSeqAlignToBioseq(sap, from, row);
1119 from++;
1120 }
1121 } else
1122 {
1123 pos = AlnMgr2MapSeqAlignToBioseq(sap, to, row);
1124 to--;
1125 while (pos < 0 && to >= from)
1126 {
1127 pos = AlnMgr2MapSeqAlignToBioseq(sap, to, row);
1128 to--;
1129 }
1130 }
1131 if (pos < 0)
1132 return -1;
1133 return pos;
1134 }
1135
1136 /***************************************************************************
1137 *
1138 * spi_get_num_places calculates the number of digits in a number, for
1139 * display neatness purposes.
1140 *
1141 ***************************************************************************/
spi_get_num_places(Int4 num)1142 static Int4 spi_get_num_places(Int4 num)
1143 {
1144 FloatHi f;
1145 Int4 i;
1146 Int4 x;
1147
1148 x = 10;
1149 for (i=1; i<21; i++)
1150 {
1151 f = (FloatHi)num/(FloatHi)x;
1152 if (f < 1)
1153 {
1154 if (num < 0)
1155 return (i+1);
1156 else
1157 return i;
1158 }
1159 x = x*10;
1160 }
1161 if (num < 0)
1162 i++;
1163 return i;
1164 }
1165
1166 /***************************************************************************
1167 *
1168 * SPI_IsItMult looks over the alignments in smu->exons to see whether
1169 * there are any aligments with dim > 2; if so, it returns SPI_MULT, and
1170 * if not, it returns SPI_NOTMULT.
1171 *
1172 ***************************************************************************/
SPI_IsItMult(SPI_MultPtr smu)1173 static Int4 SPI_IsItMult(SPI_MultPtr smu)
1174 {
1175 Int4 i;
1176
1177 if (smu == NULL)
1178 return SPI_NOTMULT;
1179 for (i=0; i<smu->numexons; i++)
1180 {
1181 if (AlnMgr2GetNumRows(smu->exons[i]) > 2)
1182 return SPI_MULT;
1183 }
1184 return SPI_NOTMULT;
1185 }
1186
1187 /***************************************************************************
1188 *
1189 * SPI_PrintMultipleAlignment prints a text or html report of the alignment
1190 * computed by SPI_MakeMultipleAlignment.
1191 *
1192 ***************************************************************************/
SPI_PrintMultipleAlignment(SPI_RegionInfoPtr srip,Boolean html,BioseqPtr bsp,FILE * ofp)1193 NLM_EXTERN void SPI_PrintMultipleAlignment(SPI_RegionInfoPtr srip, Boolean html, BioseqPtr bsp, FILE * ofp)
1194 {
1195 Char accsite[SPI_PSPLICE+2];
1196 Int4 c;
1197 Int4Ptr coord;
1198 Int4 ctr;
1199 Int4 d;
1200 Char don[SPI_PSPLICE+2];
1201 Int4 from;
1202 Int4 i;
1203 Int4 j;
1204 Int4 len;
1205 Boolean local;
1206 Int4 n;
1207 Int4 ret;
1208 SeqAlignPtr sap;
1209 SeqIdPtr sip;
1210 SPI_MultPtr smu;
1211 Int4 spacer;
1212 SeqPortPtr spp;
1213 Uint1 strand;
1214 CharPtr PNTR stringptr;
1215 Char textid[42];
1216 Int4 to;
1217
1218 if (srip == NULL || srip->smu == NULL)
1219 return;
1220 smu = srip->smu;
1221 if (ofp == NULL)
1222 {
1223 local = TRUE;
1224 ofp = FileOpen("stdout", "w");
1225 } else
1226 local = FALSE;
1227 fprintf(ofp, "\n\n");
1228 if (html)
1229 fprintf(ofp, "<h1><center>");
1230 fprintf(ofp, "Multiple Alignments\n");
1231 ret = SPI_IsItMult(smu);
1232 if (ret == SPI_NOTMULT)
1233 {
1234 fprintf(ofp, "None of the alignments in the set appears to be a multiple alignment.\n");
1235 return;
1236 }
1237 if (html)
1238 {
1239 fprintf(ofp, "<br></center></h1>\n");
1240 fprintf(ofp, "<table cellspacing=\"8\" cellpadding=\"5\" border=\"0\" width=\"600\">\n");
1241 }
1242 spacer = SPI_SPACER;
1243 for (i=0; i<smu->numexons; i++)
1244 {
1245 sap = smu->exons[i];
1246 n = AlnMgr2GetNumRows(sap);
1247 if (html)
1248 {
1249 fprintf(ofp, "<tr><td bgcolor=%s width=\"600\">", (i%2)?"#FFFFFF":"#FFFFCC");
1250 fprintf(ofp, "<a name=Block%d></a><h4>Block %d\n</h4><pre>", i+1, i+1);
1251 } else
1252 fprintf(ofp, "Block %d\n", i+1);
1253 for (j=0; j<n; j++)
1254 {
1255 if (j > 0)
1256 sip = AlnMgr2GetNthSeqIdPtr(sap, j+1);
1257 else
1258 sip = bsp->id;
1259 SeqIdWrite(sip, textid, PRINTID_FASTA_LONG, 41);
1260 AlnMgr2GetNthSeqRangeInSA(sap, j+1, &from, &to);
1261 strand = AlnMgr2GetNthStrand(sap, j+1);
1262 if (html)
1263 fprintf(ofp, "<font color=%s>%s: %d to %d</font> %s\n", (j==0)?"#336699":"#800080", textid, from+1, to+1, (strand == Seq_strand_minus)?"<font color=#FF0033>minus strand</font>":"");
1264 else
1265 fprintf(ofp, "%s: %d to %d %s\n", textid, from+1, to+1, (strand == Seq_strand_minus)?"minus strand":"");
1266 if (j > 0)
1267 SeqIdFree(sip);
1268 }
1269 fprintf(ofp, "\n");
1270 len = AlnMgr2GetAlnLength(sap, FALSE);
1271 /* get donor and acceptor sites */
1272 strand = AlnMgr2GetNthStrand(sap, 1);
1273 AlnMgr2GetNthSeqRangeInSA(sap, 1, &from, &to);
1274 if (strand == Seq_strand_minus)
1275 {
1276 spp = SeqPortNew(bsp, to+1, MIN(bsp->length-1, to+SPI_PSPLICE), Seq_strand_minus, Seq_code_iupacna);
1277 ctr = MIN(bsp->length-1, to+SPI_PSPLICE)-(to+1)+1;
1278 } else
1279 {
1280 spp = SeqPortNew(bsp, MAX(0, from-SPI_PSPLICE), from-1, Seq_strand_plus, Seq_code_iupacna);
1281 ctr = from-1-MAX(0, from-SPI_PSPLICE)+1;
1282 }
1283 ctr = SeqPortRead(spp, (Uint1Ptr)accsite, ctr);
1284 accsite[ctr] = '\0';
1285 SeqPortFree(spp);
1286 if (strand == Seq_strand_minus)
1287 {
1288 spp = SeqPortNew(bsp, MAX(0, from-SPI_PSPLICE), from-1, Seq_strand_minus, Seq_code_iupacna);
1289 ctr = from-1-MAX(0, from-SPI_PSPLICE)+1;
1290 } else
1291 {
1292 spp = SeqPortNew(bsp, to+1, MIN(to+SPI_PSPLICE, bsp->length-1), Seq_strand_plus, Seq_code_iupacna);
1293 ctr = MIN(to+SPI_PSPLICE, bsp->length-1)-(to+1)+1;
1294 }
1295 ctr = SeqPortRead(spp, (Uint1Ptr)don, ctr);
1296 don[ctr] = '\0';
1297 SeqPortFree(spp);
1298 StringLower(accsite);
1299 StringLower(don);
1300 fprintf(ofp, "%s<-flank\n", accsite);
1301 stringptr = (CharPtr PNTR)MemNew(n*sizeof(CharPtr));
1302 coord = (Int4Ptr)MemNew(n*sizeof(Int4));
1303 for (c=0; c<len; c+=SPI_LINE-10)
1304 {
1305 for (j=0; j<n; j++)
1306 {
1307 stringptr[j] = SPI_WriteAlnLine(j+1, c, MIN(c+SPI_LINE-10-1, len-1), sap);
1308 coord[j] = SPI_MapRowCoords(sap, c, MIN(c+SPI_LINE-10-1, len-1), j+1, SPI_RIGHT);
1309 if (coord[j] >= 0)
1310 coord[j]++;
1311 }
1312 for (j=0; j<n; j++)
1313 {
1314 if (html)
1315 fprintf(ofp, "<font color=%s>", (j==0)?"#336699":"#800080");
1316 fprintf(ofp, "%d", coord[j]);
1317 if (html)
1318 fprintf(ofp, "</font>");
1319 /* KSK */
1320 /* d = spi_get_num_places(coord[j]); */
1321 for ( d = spi_get_num_places(coord[j]);
1322 d < spacer; d++)
1323 {
1324 fprintf(ofp, " ");
1325 }
1326 if (j == 0)
1327 fprintf(ofp, "%s", stringptr[j]);
1328 else
1329 {
1330 for (ctr=0; ctr<MIN(SPI_LINE-10, len-c); ctr++)
1331 {
1332 if (stringptr[j][ctr] == stringptr[0][ctr])
1333 fprintf(ofp, ".");
1334 else
1335 {
1336 if (html && stringptr[0][ctr] != '-' && stringptr[j][ctr] != '-')
1337 fprintf(ofp, "<font color=#FF0033>");
1338 fprintf(ofp, "%c", stringptr[j][ctr]);
1339 if (html && stringptr[0][ctr] != '-' && stringptr[j][ctr] != '-')
1340 fprintf(ofp, "</font>");
1341 }
1342 }
1343 }
1344 fprintf(ofp, "\n");
1345 if (j > 0)
1346 MemFree(stringptr[j]);
1347 }
1348 MemFree(stringptr[0]);
1349 if (c+SPI_LINE-10 < len)
1350 fprintf(ofp, "\n");
1351 }
1352 for (j=0; j<ctr+spacer-7; j++)
1353 {
1354 fprintf(ofp, " ");
1355 }
1356 fprintf(ofp, "flank->%s\n\n", don);
1357 if (html)
1358 fprintf(ofp, "<a href=#TOP>Top</a>\n");
1359 MemFree(stringptr);
1360 MemFree(coord);
1361 if (html)
1362 fprintf(ofp, "</pre></td></tr>\n");
1363 }
1364 if (html)
1365 fprintf(ofp, "</table>");
1366 if (local)
1367 FileClose(ofp);
1368 }
1369
1370 /***************************************************************************
1371 *
1372 * spi_overlaps decides whether a new seqalign overlaps the already
1373 * established range of a block. If it does, the coordinates are checked
1374 * to see if it extends that range; if so, the block range is widened.
1375 * If there is no overlap, the function returns FALSE.
1376 *
1377 ***************************************************************************/
spi_overlaps(SeqAlignPtr sap,SPI_BlockPtr sbp)1378 static Boolean spi_overlaps(SeqAlignPtr sap, SPI_BlockPtr sbp)
1379 {
1380 Int4 from;
1381 Int4 to;
1382
1383 AlnMgr2GetNthSeqRangeInSA(sap, 1, &from, &to);
1384 if ((from <= sbp->from_g && to >= sbp->from_g) || (from <= sbp->to_g && to >= sbp->to_g))
1385 {
1386 if (from < sbp->from_g)
1387 sbp->from_g = from;
1388 if (to > sbp->to_g)
1389 sbp->to_g = to;
1390 return TRUE;
1391 }
1392 return FALSE;
1393 }
1394
1395 /***************************************************************************
1396 *
1397 * SPI_BeautifySMP converts all coordinates to 1-based from 0-based, and
1398 * runs through the mRNA to see whether any mRNA is missing; if so, it
1399 * flags that alignment with holes=TRUE.
1400 *
1401 ***************************************************************************/
SPI_BeautifySMP(SPI_RegionInfoPtr srip)1402 static void SPI_BeautifySMP(SPI_RegionInfoPtr srip)
1403 {
1404 BioseqPtr bsp;
1405 Int4 i;
1406 SeqIdPtr sip;
1407 SPI_mRNAPtr smp;
1408
1409 /** KSK bug fix for when smp is null because
1410 SPI_SortRegionsByScore() removed everyone below
1411 -c threshold ***/
1412 while (srip != NULL && srip->smp != NULL)
1413 {
1414 smp = srip->smp;
1415 if (srip->polyAtail == 0 && smp->numexons > 1)
1416 {
1417 sip = AlnMgr2GetNthSeqIdPtr(smp->saps[0], 2);
1418 bsp = BioseqLockById(sip);
1419 srip->polyAtail = SPI_IsItPolyA(sip);
1420 BioseqUnlock(bsp);
1421 SeqIdFree(sip);
1422 }
1423 smp->holes = FALSE;
1424 if (smp->strand == Seq_strand_minus)
1425 {
1426 for (i=smp->numexons-2; i>0 && !smp->holes; i--)
1427 {
1428 if (smp->mstarts[i] != smp->mstops[i+1]+1)
1429 smp->holes = TRUE;
1430 }
1431 } else
1432 {
1433 for (i=1; i<smp->numexons && !smp->holes; i++)
1434 {
1435 if (smp->mstarts[i] != smp->mstops[i-1]+1)
1436 smp->holes = TRUE;
1437 }
1438 }
1439 for (i=0; i<smp->numexons; i++)
1440 {
1441 smp->mstarts[i]++;
1442 smp->mstops[i]++;
1443 smp->gstarts[i]++;
1444 smp->gstops[i]++;
1445 }
1446 srip = srip->next;
1447 }
1448 }
1449
1450 /***************************************************************************
1451 *
1452 * SPI_RemoveOutsideBounds removes alignments that fall outside the
1453 * spot->from and spot->to bounds, so that regions won't be created
1454 * outside these boundaries.
1455 *
1456 ***************************************************************************/
SPI_RemoveOutsideBounds(SeqAlignPtr sap,SPI_OptionsPtr spot)1457 static void SPI_RemoveOutsideBounds(SeqAlignPtr sap, SPI_OptionsPtr spot)
1458 {
1459 SeqAlignPtr salp;
1460 SeqAlignPtr salp_head;
1461 SeqAlignPtr salp_next;
1462 SeqAlignPtr salp_prev;
1463 Int4 start;
1464 Int4 stop;
1465
1466 if (sap == NULL || spot == NULL)
1467 return;
1468 salp = (SeqAlignPtr)(sap->segs);
1469 salp_head = salp_prev = NULL;
1470 while (salp != NULL)
1471 {
1472 salp_next = salp->next;
1473 salp->next = NULL;
1474 AlnMgr2GetNthSeqRangeInSA(salp, 1, &start, &stop);
1475 if (start >= spot->from || stop <= spot->to)
1476 {
1477 if (salp_head != NULL)
1478 {
1479 salp_prev->next = salp;
1480 salp_prev = salp;
1481 } else
1482 salp_head = salp_prev = salp;
1483 } else
1484 SeqAlignFree(salp);
1485 salp = salp_next;
1486 }
1487 sap->segs = (Pointer)(salp_head);
1488 }
1489
1490 /***************************************************************************
1491 *
1492 * SPI_PadRegions takes a linked list of regions and adds SPI_PADDING
1493 * to either end of each region.
1494 *
1495 ***************************************************************************/
SPI_PadRegions(SPI_RegionInfoPtr srip,Int4 bsplen)1496 static void SPI_PadRegions(SPI_RegionInfoPtr srip, Int4 bsplen)
1497 {
1498 while (srip != NULL)
1499 {
1500 srip->gstart = srip->gstart - SPI_PADDING;
1501 if (srip->gstart < 0)
1502 srip->gstart = 0;
1503 srip->gstop = srip->gstop + SPI_PADDING;
1504 if (srip->gstop > bsplen - 1)
1505 srip->gstop = bsplen - 1;
1506 srip = srip->next;
1507 }
1508 }
1509
1510
1511
1512
1513 /***************************************************************************
1514 *
1515 * SPI_SortRegionsByScore is called after SPI_AlignInWindows to sort the
1516 * final regions from best to worst for printing. Since there is now a
1517 * complete mRNA alignment in each region, the regions can be more
1518 * thoroughly assessed, and the regions are sorted by mRNA coverage,
1519 * number of mismatches, and finally by genomic start position.
1520 *
1521 ***************************************************************************/
SPI_SortRegionsByScore(SPI_RegionInfoPtr PNTR srip,SPI_OptionsPtr spot)1522 static void SPI_SortRegionsByScore(SPI_RegionInfoPtr PNTR srip, SPI_OptionsPtr spot)
1523 {
1524 Int4 i;
1525 Int4 j;
1526 SPI_RegionInfoPtr PNTR srip_array;
1527 SPI_RegionInfoPtr srip_head;
1528 SPI_RegionInfoPtr srip_prev;
1529 SPI_RegionInfoPtr srip_tmp;
1530
1531 if (srip == NULL || *srip == NULL)
1532 return;
1533 srip_tmp = *srip;
1534 i = 0;
1535 while (srip_tmp != NULL)
1536 {
1537 i++;
1538 srip_tmp = srip_tmp->next;
1539 }
1540 srip_array = (SPI_RegionInfoPtr PNTR)MemNew(i*sizeof(SPI_RegionInfoPtr));
1541 srip_tmp = *srip;
1542 j = 0;
1543 while (srip_tmp != NULL && j < i)
1544 {
1545 srip_array[j] = srip_tmp;
1546 srip_tmp = srip_tmp->next;
1547 j++;
1548 }
1549 HeapSort(srip_array, i, sizeof(SPI_RegionInfoPtr), SPI_CompareRegions);
1550 for (j=0; j<i; j++) /* remove the ones that don't score above the cutoffs */
1551 {
1552 srip_tmp = srip_array[j];
1553 if (srip_tmp->smp == NULL || (srip_tmp->smp->mRNAcoverage < spot->lencutoff && !srip_tmp->fallsoff)|| srip_tmp->smp->mismatch > 100-spot->idcutoff)
1554 {
1555 SPI_RegionFree(srip_tmp);
1556 srip_array[j] = NULL;
1557 }
1558 }
1559 srip_head = srip_prev = NULL;
1560 for (j=0; j<i; j++)
1561 {
1562 if (srip_array[j] != NULL)
1563 {
1564 if (srip_head != NULL)
1565 {
1566 srip_prev->next = srip_array[j];
1567 srip_array[j]->next = NULL;
1568 srip_prev = srip_array[j];
1569 } else
1570 {
1571 srip_head = srip_prev = srip_array[j];
1572 srip_head->next = NULL;
1573 }
1574 }
1575 }
1576 srip_tmp = srip_prev = srip_head;
1577 i = 1;
1578 /** KSK fix for when all are null **/
1579 if (srip_tmp != NULL && srip_tmp->next != NULL){
1580 srip_tmp = srip_tmp->next; /* know we need the first one at least */
1581 while (srip_tmp != NULL){
1582 if (i+1>spot->numreturns){ /* this guy is one too many */
1583 srip_prev->next = NULL;
1584 SPI_RegionListFree(srip_tmp);
1585 srip_tmp = NULL;
1586 } else {
1587 srip_prev = srip_tmp;
1588 srip_tmp = srip_tmp->next;
1589 i++;
1590 }
1591 }
1592 *srip = srip_head;
1593 MemFree(srip_array);
1594 }
1595 else {
1596 *srip = srip_head;
1597 }
1598 }
1599
1600 /***************************************************************************
1601 *
1602 * SPI_CompareRegions is the HeapSort callback for SPI_SortRegionsByScore.
1603 * It sorts the regions first by mRNA coverage, then by the number of
1604 * mismatches in the mRNA-to-genomic alignment, and finally by the
1605 * start position on the genomic sequence.
1606 *
1607 ***************************************************************************/
SPI_CompareRegions(VoidPtr ptr1,VoidPtr ptr2)1608 static int LIBCALLBACK SPI_CompareRegions(VoidPtr ptr1, VoidPtr ptr2)
1609 {
1610 SPI_RegionInfoPtr srip1;
1611 SPI_RegionInfoPtr srip2;
1612
1613 if (ptr1 != NULL && ptr2 != NULL)
1614 {
1615 srip1 = *((SPI_RegionInfoPtr PNTR)ptr1);
1616 srip2 = *((SPI_RegionInfoPtr PNTR)ptr2);
1617 if (srip1->smp == NULL)
1618 return 1;
1619 if (srip2->smp == NULL)
1620 return -1;
1621 if (srip1->smp->mRNAcoverage > srip2->smp->mRNAcoverage)
1622 return -1;
1623 else if (srip1->smp->mRNAcoverage < srip2->smp->mRNAcoverage)
1624 return 1;
1625 else
1626 {
1627 if (srip1->smp->mismatch < srip2->smp->mismatch)
1628 return -1;
1629 else if (srip1->smp->mismatch > srip2->smp->mismatch)
1630 return 1;
1631 else
1632 {
1633 if (srip1->smp->gstarts[0] < srip2->smp->gstarts[0])
1634 return -1;
1635 else if (srip1->smp->gstarts[0] > srip2->smp->gstarts[0])
1636 return 1;
1637 else
1638 return 0;
1639 }
1640 }
1641 }
1642 return 0;
1643 }
1644
1645 /***************************************************************************
1646 *
1647 * SPI_PrintAce prints the spidey results in ACEDB format for compatibility
1648 * with Jean Thierry-Mieg's Acembly software.
1649 *
1650 ***************************************************************************/
SPI_PrintAce(FILE * ofp,SPI_RegionInfoPtr srip,BioseqPtr bsp_genomic,BioseqPtr bsp_mrna,Boolean is_cds)1651 static void SPI_PrintAce(FILE *ofp, SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, Boolean is_cds)
1652 {
1653 Int4 gi;
1654 Int4 i;
1655 CharPtr id1;
1656 CharPtr id2;
1657 Int4 n;
1658 SPI_mRNAPtr smp;
1659
1660 smp = srip->smp;
1661 SPI_GetAccessionFromSeqId(bsp_genomic->id, &gi, &id1);
1662 SPI_GetAccessionFromSeqId(bsp_mrna->id, &gi, &id2);
1663 fprintf(ofp, "Sequence %s\n", id1);
1664 fprintf(ofp, "Subsequence Sp_%s.%s", id2, (is_cds == TRUE)?"cds":"mrna");
1665 if (smp->strand == Seq_strand_minus)
1666 fprintf(ofp, "\t%d\t%d\n\n", smp->gstops[smp->numexons-1]+1, smp->gstarts[0] + 1);
1667 else
1668 fprintf(ofp, "\t%d\t%d\n\n", smp->gstarts[0]+1, smp->gstops[smp->numexons-1]+1);
1669 fprintf(ofp, "Sequence Sp_%s.%s\n", id2, (is_cds == TRUE)?"cds":"mrna");
1670 fprintf(ofp, "Method Spidey\n");
1671 n = 1;
1672 if (is_cds)
1673 fprintf(ofp, "CDS\n");
1674 if (smp->strand != Seq_strand_minus)
1675 {
1676 for (i=0; i<smp->numexons; i++)
1677 {
1678 fprintf(ofp, "Source_Exons\t%d\t%d\n", n, n + smp->gstops[i] - smp->gstarts[i]);
1679 n += smp->gstops[i] - smp->gstarts[i];
1680 if (i < smp->numexons-1)
1681 n += smp->gstarts[i+1] - smp->gstops[i];
1682 }
1683 } else
1684 {
1685 for (i = smp->numexons-1; i>=0; i--)
1686 {
1687 fprintf(ofp, "Source_Exons\t%d\t%d\n", n, n + smp->gstops[i] - smp->gstarts[i]);
1688 n += smp->gstops[i] - smp->gstarts[i];
1689 if (i > 0)
1690 n += smp->gstarts[i] - smp->gstops[i-1];
1691 }
1692 }
1693 fprintf(ofp, "DNA_Homol %s\n", id2);
1694 if (smp->missingends == SPI_LEFT)
1695 fprintf(ofp, "Start_not_found\n");
1696 else if (smp->missingends == SPI_RIGHT)
1697 fprintf(ofp, "Stop_not_found\n");
1698 else if (smp->missingends == SPI_BOTH)
1699 fprintf(ofp, "Start_not_found\nStop_not_found\n");
1700 fprintf(ofp, "\n\n");
1701 }
1702
1703 /***************************************************************************
1704 *
1705 * SPI_PrintResult prints the summary report and (if requested) the
1706 * text alignment. Since the exons are stored in the order of the
1707 * genomic sequence, not the mRNA, they must be reversed to print the
1708 * mRNA from 5' to 3'. The SPI_ExonProfPtr holds the information about
1709 * the location of the gaps and mismatches, so this structure is sent
1710 * to spi_print_mismatch_line, which interprets the information in the
1711 * ExonProfPtr and creates the mismatch line (vertical bars for identity,
1712 * nothing for gaps or mismatches). (PRRESULT)
1713 *
1714 ***************************************************************************/
SPI_PrintResult(FILE * ofp,FILE * ofp2,SPI_RegionInfoPtr srip,BioseqPtr bsp_genomic,BioseqPtr bsp_mrna,SPI_OptionsPtr spot,Boolean isitCDS)1715 static void SPI_PrintResult(FILE *ofp, FILE *ofp2, SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot, Boolean isitCDS)
1716 {
1717 AlnMsg2Ptr amp;
1718 Boolean begin;
1719 BioseqPtr bsp;
1720 Char buf[61];
1721 Int4 c;
1722 Char ch;
1723 Int4 counter;
1724 Int4 ctr;
1725 Boolean done;
1726 Boolean end;
1727 Int4 endctr;
1728 CharPtr endstr;
1729 SPI_ExonProfPtr epp_curr;
1730 Int4 gstart, gbuflen = 0;
1731 Int4 i;
1732 Boolean is_splice;
1733 Int4 j;
1734 Int4 l;
1735 Int4 len;
1736 Int4 maxline;
1737 Int4 minline;
1738 Boolean more;
1739 Boolean ng;
1740 Int4 offset;
1741 CharPtr p;
1742 Int4 pos;
1743 Char prot[SPI_LINE+5];
1744 Int4 r;
1745 Int4 s;
1746 SeqAlignPtr sap;
1747 Int4 splice;
1748 SeqPortPtr spp = NULL;
1749 Int4 start;
1750 Boolean start_prot;
1751 Int4 PNTR starts;
1752 Int4 stop;
1753 Int4 PNTR stops;
1754 Uint1 strand;
1755 Boolean term;
1756 CharPtr text;
1757 Char text1[200];
1758 Char text2[200];
1759 Char textid1[42];
1760 Char textid2[42];
1761 Int4 tmp;
1762 CharPtr tmpstring = NULL;
1763
1764 if (bsp_genomic == NULL || bsp_mrna == NULL)
1765 return;
1766 if (spot->printaln != 3 && ofp == NULL)
1767 return;
1768 if (spot->printaln >= 2 && ofp2 == NULL)
1769 return;
1770 if (srip != NULL && srip->revcomp)
1771 BioseqRevComp(bsp_mrna);
1772 FastaDefLine (bsp_genomic, text1, 200, NULL, NULL, 0);
1773 SeqIdWrite(bsp_genomic->id, textid1, PRINTID_FASTA_LONG, 41);
1774 FastaDefLine (bsp_mrna, text2, 200, NULL, NULL, 0);
1775 SeqIdWrite(bsp_mrna->id, textid2, PRINTID_FASTA_LONG, 41);
1776 if (spot->printaln != 2)
1777 {
1778 if (spot->printheader)
1779 fprintf(ofp, "--SPIDEY version 1.40--\n");
1780 fprintf(ofp, "Genomic: %s ", textid1);
1781 fprintf(ofp, "%s, ", text1);
1782 fprintf(ofp, "%d bp\n", bsp_genomic->length);
1783 if (isitCDS)
1784 {
1785 tmpstring = StringSave("CDS");
1786 fprintf(ofp, "CDS: %s ", textid2);
1787 } else
1788 {
1789 tmpstring = StringSave("mRNA");
1790 fprintf(ofp, "mRNA: %s ", textid2);
1791 }
1792 fprintf(ofp, "%s, ", text2);
1793 if (isitCDS)
1794 {
1795 fprintf(ofp, "%d - %d, %d bp\n", srip->mstart+1, srip->mstop+1, srip->mlen);
1796 offset = srip->mstart; /* this is where the CDS starts */
1797 } else
1798 {
1799 fprintf(ofp, "%d bp\n", bsp_mrna->length);
1800 offset = 0;
1801 }
1802 if (srip == NULL || srip->smp == NULL)
1803 {
1804 fprintf(ofp, "No alignment found.\n\n");
1805 fflush(ofp);
1806 return;
1807 }
1808 if (srip->smp->strand == Seq_strand_minus)
1809 fprintf(ofp, "Strand: minus");
1810 else
1811 fprintf(ofp, "Strand: plus");
1812 if (srip->revcomp)
1813 fprintf(ofp, " Reverse complement\n");
1814 else
1815 fprintf(ofp, "\n");
1816 fprintf(ofp, "Number of exons: %d\n", srip->smp->numexons);
1817 splice = 0;
1818
1819 for (i=0; i < srip->smp->numexons; i++){
1820 if (srip->smp->strand == Seq_strand_minus){
1821 c = srip->smp->numexons - i - 1;
1822 }
1823 else {
1824 c = i;
1825 }
1826 splice += srip->smp->splicedon[i];
1827 epp_curr = srip->smp->epp;
1828 /* KSK to get correct exon info to report mismatches
1829 have to get the exon ptr to the right one */
1830 while (epp_curr != NULL && epp_curr->exonnum != c + 1){
1831 epp_curr = epp_curr->next;
1832 }
1833 if (srip->revcomp){
1834 fprintf(ofp, "Exon %d: %d-%d (gen) %d-%d (%s) id %.1f%% mismatches %d gaps %d splice site (d a): %d %d", i+1, srip->smp->gstarts[c], srip->smp->gstops[c], bsp_mrna->length-srip->smp->mstarts[c]+1, bsp_mrna->length-srip->smp->mstops[c]+1, tmpstring, srip->smp->exonid[c], (epp_curr != NULL ? epp_curr->nummismatches : 0), srip->smp->exongaps[c], srip->smp->splicedon[c], srip->smp->spliceacc[c]);
1835 }
1836 else {
1837 fprintf(ofp, "Exon %d%s: %d-%d (gen) %d-%d (%s) id %.1f%% mismatches %d gaps %d splice site (d a): %d %d", i+1, srip->smp->strand == Seq_strand_minus?"(-)":"", srip->smp->gstarts[c], srip->smp->gstops[c], srip->smp->mstarts[c], srip->smp->mstops[c], tmpstring, srip->smp->exonid[c], (epp_curr != NULL ? epp_curr->nummismatches : 0), srip->smp->exongaps[c], srip->smp->splicedon[c], srip->smp->spliceacc[c]);
1838 }
1839 if (i > 0 && i<srip->smp->numexons-1 && srip->smp->splicedon[c] == 0 && srip->smp->spliceacc[c] == 0){
1840 fprintf(ofp, " uncertain\n");
1841 }
1842 else {
1843 fprintf(ofp, "\n");
1844 }
1845 }
1846 fprintf(ofp, "Number of splice sites: %d\n", splice);
1847 fprintf(ofp, "%s coverage: %d%%\n", tmpstring, srip->smp->mRNAcoverage);
1848 fprintf(ofp, "overall percent identity: %.1f%%\n", (FloatHi)(100) - srip->smp->mismatch);
1849 if (srip->smp->missingends == SPI_BOTH)
1850 text = StringSave("both");
1851 else if (srip->smp->missingends == SPI_NEITHER)
1852 text = StringSave("neither");
1853 else if (srip->smp->missingends == SPI_LEFT)
1854 text = StringSave("left");
1855 else if (srip->smp->missingends == SPI_RIGHT)
1856 text = StringSave("right");
1857 else
1858 text = StringSave("error");
1859 fprintf(ofp, "Missing %s ends: %s ", tmpstring, text);
1860 if (srip->fallsoff)
1861 fprintf(ofp, " -- may fall off end");
1862 fprintf(ofp, "\n");
1863 if (!isitCDS) /* print poly(A) tail information for mRNAs */
1864 {
1865 if (srip->polyAtail > 0)
1866 fprintf(ofp, "Non-aligning poly(A)+ tail length: %d\n", srip->polyAtail);
1867 else if (srip->polyAtail < 0)
1868 fprintf(ofp, "Aligning poly(A)+ tail length: %d\n", -srip->polyAtail);
1869 } else /* print UTR %id information for CDSs */
1870 {
1871 if (srip->strand != Seq_strand_minus)
1872 {
1873 if (srip->utr.left != -1)
1874 fprintf(ofp, "5' UTR id %.1f%%\n", srip->utr.left);
1875 if (srip->utr.right != -1)
1876 fprintf(ofp, "3' UTR id %.1f%%\n", srip->utr.right);
1877 } else
1878 {
1879 if (srip->utr.right != -1)
1880 fprintf(ofp, "5' UTR id %.1f%%\n", srip->utr.right);
1881 if (srip->utr.left != -1)
1882 fprintf(ofp, "3' UTR id %.1f%%\n", srip->utr.left);
1883 }
1884 if (srip->gstart == 1)
1885 fprintf(ofp, "5' partial\n");
1886 if (srip->gstop == 1)
1887 fprintf(ofp, "3' partial\n");
1888 }
1889 fprintf(ofp, "\n");
1890 fflush(ofp);
1891 }
1892 p = NULL;
1893 if (spot->printaln != 1) /* print alignment too */
1894 {
1895 if (spot->printaln == 0)
1896 ofp2 = ofp;
1897 p = srip->smp->protein;
1898 pos = srip->smp->transstart;
1899 epp_curr = srip->smp->epp;
1900 if (isitCDS && spot->printaln == 2)
1901 tmpstring = StringSave("CDS");
1902 else if (spot->printaln == 2)
1903 tmpstring = StringSave("mRNA");
1904 fprintf(ofp2, "Genomic: %s %s\n", textid1, text1);
1905 fprintf(ofp2, "%s: %s %s\n", tmpstring, textid2, text2);
1906 if (spot->printaln > 0)
1907 {
1908 if (srip == NULL || srip->smp == NULL)
1909 {
1910 fprintf(ofp2, "No alignment found.\n\n");
1911 fflush(ofp2);
1912 return;
1913 }
1914 if (srip->smp->strand == Seq_strand_minus)
1915 fprintf(ofp2, "Strand: minus");
1916 else
1917 fprintf(ofp2, "Strand: plus");
1918 if (srip->revcomp)
1919 fprintf(ofp2, " Reverse complement\n");
1920 else
1921 fprintf(ofp2, "\n");
1922 fprintf(ofp2, "Number of exons: %d\n", srip->smp->numexons);
1923 }
1924 amp = AlnMsgNew2();
1925 sap = NULL;
1926 for (i=0; i<srip->smp->numexons; i++)
1927 {
1928 if (sap != NULL)
1929 SeqAlignFree(sap);
1930 sap = NULL;
1931 if (srip->strand == Seq_strand_minus)
1932 {
1933 c = srip->smp->numexons - i - 1;
1934 sap = SeqAlignDup(srip->smp->saps[c]);
1935 SeqAlignListReverseStrand(sap);
1936 AlnMgr2IndexSingleChildSeqAlign(sap);
1937 starts = srip->smp->gstops;
1938 stops = srip->smp->gstarts;
1939 epp_curr = srip->smp->epp;
1940 while (epp_curr != NULL && epp_curr->exonnum != c+1)
1941 {
1942 epp_curr = epp_curr->next;
1943 }
1944 if (epp_curr != NULL && epp_curr->exonnum == c+1) /* need to change the mismatch positions now */
1945 {
1946 l = AlnMgr2GetAlnLength(sap, FALSE);
1947 for (j=0; j<epp_curr->nummismatches; j++)
1948 {
1949 epp_curr->mismatches[j] = l - epp_curr->mismatches[j] - 1;
1950 }
1951 for (j=0; j<ceil(epp_curr->nummismatches/2); j++)
1952 {
1953 tmp = epp_curr->mismatches[j];
1954 epp_curr->mismatches[j] = epp_curr->mismatches[epp_curr->nummismatches-j-1];
1955 epp_curr->mismatches[epp_curr->nummismatches-j-1] = tmp;
1956 }
1957 }
1958 } else
1959 {
1960 c = i;
1961 starts = srip->smp->gstarts;
1962 stops = srip->smp->gstops;
1963 epp_curr = srip->smp->epp;
1964 while (epp_curr != NULL && epp_curr->exonnum != c+1)
1965 {
1966 epp_curr = epp_curr->next;
1967 }
1968 }
1969 if (srip->revcomp)
1970 fprintf(ofp2, "Exon %d: %d-%d (gen) %d-%d (%s)\n", i+1, starts[c], stops[c], bsp_mrna->length-srip->smp->mstarts[c]+1, bsp_mrna->length-srip->smp->mstops[c]+1, tmpstring);
1971 else
1972 fprintf(ofp2, "Exon %d: %d-%d (gen) %d-%d (%s)\n", i+1, starts[c], stops[c], srip->smp->mstarts[c], srip->smp->mstops[c], tmpstring);
1973 if (sap == NULL)
1974 {
1975 sap = SeqAlignDup(srip->smp->saps[c]);
1976 AlnMgr2IndexSingleChildSeqAlign(sap);
1977 }
1978 strand = AlnMgr2GetNthStrand(sap, 1);
1979 len = AlnMgr2GetAlnLength(sap, FALSE);
1980 AlnMgr2GetNthSeqRangeInSA(sap, 1, &gstart, NULL);
1981 end = FALSE;
1982 ng = FALSE;
1983 term = FALSE;
1984 is_splice = FALSE;
1985 begin = TRUE;
1986 endstr = NULL;
1987 for (l=0; l<len+10; l+= SPI_LINE)
1988 {
1989 start_prot = TRUE;
1990 if (l == SPI_LINE)
1991 l = SPI_LINE - 10; /* kludge to print genomic splice on 1st line */
1992 minline = maxline = -1; /* reset mRNA bounds */
1993 for (j=1; l<len && j<=2; j++)
1994 {
1995 fprintf(ofp2, "\n");
1996 if (j == 1)
1997 {
1998 fprintf(ofp2, "\n");
1999 bsp = bsp_genomic;
2000 } else
2001 {
2002 bsp = bsp_mrna;
2003 /* retrieve mismatch information, print the line of vertical bars */
2004 spi_print_mismatch_line(ofp2, c+1, l, len-1, epp_curr, gstart);
2005 }
2006 AlnMsgReNew2(amp);
2007 amp->from_aln = l;
2008 if (l != 0)
2009 {
2010 if (l+SPI_LINE-1 >= len-1)
2011 {
2012 end = TRUE;
2013 amp->to_aln = -1;
2014 } else
2015 amp->to_aln = l+SPI_LINE-1;
2016 } else
2017 {
2018 if (begin == FALSE)
2019 {
2020 if (l+SPI_LINE-1 >= len-1)
2021 end = TRUE;
2022 } else
2023 {
2024 if (l+SPI_LINE-1-10 >= len-1)
2025 end = TRUE;
2026 }
2027 amp->to_aln = MIN(SPI_LINE-1-10, len-1);
2028 }
2029 amp->row_num = j;
2030 done = FALSE;
2031 if (is_splice == TRUE)
2032 is_splice = FALSE;
2033 r=0;
2034 counter = 0;
2035 if (j == 2)
2036 begin = FALSE;
2037 while ((Boolean)(more = AlnMgr2GetNextAlnBit(sap, amp)))
2038 {
2039 if (j == 1 && amp->type == AM_SEQ)
2040 counter += amp->to_row - amp->from_row + 1;
2041 if (j == 2)
2042 {
2043 if (minline == -1 && amp->type == AM_SEQ)
2044 minline = amp->from_row;
2045 if (maxline == -1)
2046 maxline = amp->to_row;
2047 /* maxline = AlnMgr2MapSeqAlignToBioseq(sap, amp->to_aln, 2);*/
2048 }
2049 /* print splice site */
2050 /** KSK fix for when minus strand is
2051 at the end, and simplified this loop ***/
2052 if (l==0 && j==1 && !done){
2053 if (amp->strand != Seq_strand_minus){
2054 if (amp->from_row < 10){
2055 start = 0;
2056 gbuflen = amp->from_row;
2057 stop = gbuflen - 1;
2058 }
2059 else {
2060 start = amp->from_row - 10;
2061 stop = amp->from_row - 1;
2062 gbuflen = 10;
2063 }
2064 } else {
2065 if (amp->to_row + 10 < bsp_genomic->length-1){
2066 stop = amp->to_row + 10;
2067 start = amp->to_row + 1;
2068 gbuflen = 10;
2069 }
2070 else {
2071 stop = bsp_genomic->length-1;
2072 gbuflen = (bsp_genomic->length - 1)
2073 - (amp->to_row + 1) + 1;
2074 start = amp->to_row + 1;
2075 }
2076 }
2077 /** KSK fix continues so that only as many
2078 bases as exist up to 10
2079 will be read for the intron
2080 buffer **/
2081
2082 if (gbuflen > 1){
2083 spp = SeqPortNew(bsp_genomic, start, stop, amp->strand, Seq_code_iupacna);
2084 ctr = SeqPortRead(spp, (Uint1Ptr)buf, gbuflen);
2085 }
2086 else if (gbuflen <= 1){
2087 ctr = 0;
2088 spp = NULL;
2089 }
2090 buf[ctr] = '\0';
2091 while (ctr < 10){
2092 fprintf(ofp2, " ");
2093 ctr++;
2094 }
2095 fwrite(buf, 1, ctr, ofp2);
2096 if (spp){
2097 SeqPortFree(spp);
2098 }
2099 /** end of region of KSK fix **/
2100
2101 done = TRUE;
2102 } else if (l==0 && j==2 && !done)
2103 {
2104 fprintf(ofp2, " "); /* 10 spaces for splice site */
2105 done = TRUE;
2106 is_splice = TRUE;
2107 }
2108 /**** used no more ***
2109 else if (l==0 && j==1 && !done && gstart < 10)
2110 {
2111 spp = SeqPortNew(bsp_genomic, 0, gstart, amp->strand, Seq_code_iupacna);
2112 ctr = SeqPortRead(spp, (Uint1Ptr)buf, gstart);
2113 buf[ctr] = '\0';
2114 while (ctr < 10)
2115 {
2116 fprintf(ofp2, " ");
2117 ctr++;
2118 }
2119 fprintf(ofp2, buf);
2120 }
2121 **********************/
2122 if (amp->type == AM_SEQ)
2123 {
2124 spp = SeqPortNew(bsp, amp->from_row, amp->to_row, amp->strand, Seq_code_iupacna);
2125 ctr = SeqPortRead(spp, (Uint1Ptr)buf, SPI_LINE);
2126 if (ctr > 0)
2127 buf[ctr] = '\0';
2128 fprintf(ofp2, "%s", buf);
2129 SeqPortFree(spp);
2130 } else /* print dashes for gaps */
2131 {
2132 for (ctr=0; ctr<(amp->to_row - amp->from_row+1); ctr++)
2133 {
2134 fprintf(ofp2, "-");
2135 }
2136 }
2137 if (j==1 && end && counter >= amp->to_aln - amp->from_aln)
2138 {
2139 AlnMgr2GetNthSeqRangeInSA(sap, 1, &start, &stop);
2140 if (strand != Seq_strand_minus)
2141 {
2142 if (stop > bsp_genomic->length - 11)
2143 start = bsp_genomic->length;
2144 else
2145 start = stop + SPI_PSPLICE;
2146 spp = SeqPortNew(bsp_genomic, stop+1, start, strand, Seq_code_iupacna);
2147 } else
2148 {
2149 if (start < SPI_PSPLICE)
2150 stop = 0;
2151 else
2152 stop = start - SPI_PSPLICE;
2153 spp = SeqPortNew(bsp_genomic, stop, start-1, strand, Seq_code_iupacna);
2154 }
2155 endctr = ctr;
2156 ctr = SeqPortRead(spp, (Uint1Ptr)buf, SPI_PSPLICE);
2157 if (ctr > 0)
2158 buf[ctr] = '\0';
2159 for (ctr=0; endctr+ctr <= SPI_LINE-1 && ctr<SPI_PSPLICE; ctr++)
2160 {
2161 fprintf(ofp2, "%c", buf[ctr]);
2162 }
2163 endstr = NULL;
2164 if (ctr < SPI_PSPLICE)
2165 endstr = StringSave(&buf[ctr]);
2166 SeqPortFree(spp);
2167 }
2168 if (pos <= maxline && amp->type == AM_SEQ && p != NULL && j == 2) /* at least part of this is coding -- */
2169 { /* print the protein sequence underneath*/
2170 if (is_splice)
2171 {
2172 is_splice = FALSE;
2173 for (s=r; s<r+11; s++)
2174 {
2175 prot[s] = ' ';
2176 }
2177 r = s-1;
2178 }
2179 if (start_prot)
2180 {
2181 for (ctr = 0; ctr < pos-minline; ctr++)
2182 {
2183 prot[r] = ' ';
2184 r++;
2185 }
2186 start_prot = FALSE;
2187 }
2188 if (pos >= amp->from_row-1 && pos <= amp->to_row)
2189 {
2190 ng = TRUE;
2191 if (pos == minline-1)
2192 {
2193 ch = *p;
2194 prot[r] = ch;
2195 r++;
2196 prot[r] = ' ';
2197 r++;
2198 if (*p == '*')
2199 term = TRUE;
2200 p++;
2201 pos+=3;
2202 }
2203 for (ctr = pos; ctr < maxline && *p != '\0' && !term; ctr += 3)
2204 {
2205 ch = *p;
2206 prot[r] = ' ';
2207 r++;
2208 prot[r] = ch;
2209 r++;
2210 prot[r] = ' ';
2211 r++;
2212 if (*p == '*')
2213 term = TRUE;
2214 p++;
2215 pos+=3;
2216 }
2217 }
2218 } else if (j == 2 && amp->type == AM_SEQ && p != NULL && pos>=minline && pos <= maxline)
2219 {
2220 for (s=0; s<(amp->to_row - amp->from_row+1); s++)
2221 {
2222 prot[r] = ' ';
2223 r++;
2224 }
2225 }
2226 }
2227 if (j == 2 && ng == TRUE)
2228 {
2229 prot[r] = '\0';
2230 fprintf(ofp2, "\n%s\n", prot);
2231 }
2232 }
2233 }
2234 if (endstr == NULL) /* genomic sequence and overhang fit on the same line */
2235 fprintf(ofp2, "\n\n");
2236 else /* there's some extra genomic overhang sequence, print it on the next line */
2237 {
2238 fprintf(ofp2, "\n\n%s\n\n", endstr);
2239 MemFree(endstr);
2240 }
2241 }
2242 AlnMsgFree2(amp);
2243 }
2244 if (srip->revcomp)
2245 BioseqRevComp(bsp_mrna);
2246 }
2247
2248 /***************************************************************************
2249 *
2250 * SPI_PrintHerdResult is analogous to SPI_PrintResult; it prints a
2251 * summary of the mRNA-to-draft alignment and, if requested, it also
2252 * prints a text alignment. Since the exons are already in order of the
2253 * mRNA sequence, printing the text of the alignments is pretty
2254 * straightforward.
2255 *
2256 ***************************************************************************/
SPI_PrintHerdResult(FILE * ofp,FILE * ofp2,SPI_mRNAToHerdPtr herd,SPI_OptionsPtr spot,BioseqPtr bsp_genomic,BioseqPtr bsp_mrna)2257 static void SPI_PrintHerdResult(FILE *ofp, FILE *ofp2, SPI_mRNAToHerdPtr herd, SPI_OptionsPtr spot, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna)
2258 {
2259 AlnMsg2Ptr amp;
2260 BioseqPtr bsp;
2261 Char buf[61];
2262 Int4 ctr;
2263 Boolean done;
2264 Boolean end;
2265 Int4 endctr;
2266 CharPtr endstr = NULL;
2267 SPI_ExonProfPtr epp_curr;
2268 Int4 gstart;
2269 Int4 i;
2270 Int4 j;
2271 Int4 l;
2272 Int4 len;
2273 Boolean more;
2274 Int4 polyAtail;
2275 SeqAlignPtr sap;
2276 SeqPortPtr spp;
2277 Int4 start;
2278 Int4Ptr starts;
2279 Int4 stop;
2280 Int4Ptr stops;
2281 Uint1 strand;
2282 CharPtr text;
2283 Char text1[200];
2284 Char text2[200];
2285 Char textid1[42];
2286 Char textid2[42];
2287
2288 if (ofp == NULL || herd == NULL || bsp_genomic == NULL || bsp_mrna == NULL)
2289 return;
2290 fprintf(ofp, "--SPIDEY version 1.35--\n");
2291 FastaDefLine (bsp_genomic, text1, 200, NULL, NULL, 0);
2292 SeqIdWrite(bsp_genomic->id, textid1, PRINTID_FASTA_LONG, 41);
2293 fprintf(ofp, "Genomic: %s ", textid1);
2294 fprintf(ofp, "%s, ", text1);
2295 fprintf(ofp, "%d bp\n", bsp_genomic->length);
2296 FastaDefLine (bsp_mrna, text2, 200, NULL, NULL, 0);
2297 SeqIdWrite(bsp_mrna->id, textid2, PRINTID_FASTA_LONG, 41);
2298 fprintf(ofp, "mRNA: %s ", textid2);
2299 fprintf(ofp, "%s, ", text2);
2300 fprintf(ofp, "%d bp\n", bsp_mrna->length);
2301 if (herd->numpieces == 0)
2302 {
2303 fprintf(ofp, "No alignment found.\n\n");
2304 fflush(ofp);
2305 return;
2306 }
2307 fprintf(ofp, "Number of exons: %d\n", herd->numexons);
2308 fprintf(ofp, "Number of pieces: %d\n", herd->numpieces);
2309 for (i=0; i<herd->numpieces; i++)
2310 {
2311 fprintf(ofp, "Fragment %d Exon %d: %d-%d (gen) %d-%d (mRNA) id %.1f%% gaps %d splice site (d a): %d %d ", herd->fragments[i], herd->exons[i], herd->gstarts[i], herd->gstops[i], herd->mstarts[i], herd->mstops[i], (100)*(1-(FloatHi)herd->pmismatch[i]/(FloatHi)herd->lens[i]), herd->pgaps[i], herd->splicedon[i], herd->spliceacc[i]);
2312 if (herd->strands[i] != Seq_strand_minus)
2313 fprintf(ofp, "Strand: plus\n");
2314 else
2315 fprintf(ofp, "Strand: minus\n");
2316 if (herd->fallsoff[i] != SPI_NEITHER)
2317 {
2318 if (herd->fallsoff[i] == SPI_LEFT)
2319 fprintf(ofp, "May fall off left side\n");
2320 else if (herd->fallsoff[i] == SPI_RIGHT)
2321 fprintf(ofp, "May fall off right side\n");
2322 else if (herd->fallsoff[i] == SPI_BOTH)
2323 fprintf(ofp, "May fall off both sides\n");
2324 }
2325 }
2326 fprintf(ofp, "mRNA coverage: %.1f%%\n", herd->mRNAcoverage);
2327 fprintf(ofp, "overall percent identity: %.1f%%\n", (FloatHi)(100) - herd->mismatch);
2328 if (herd->missingends == SPI_BOTH)
2329 text = StringSave("both");
2330 else if (herd->missingends == SPI_NEITHER)
2331 text = StringSave("neither");
2332 else if (herd->missingends == SPI_LEFT)
2333 text = StringSave("left");
2334 else if (herd->missingends == SPI_RIGHT)
2335 text = StringSave("right");
2336 else
2337 text = StringSave("error");
2338 fprintf(ofp, "Missing mRNA ends: %s\n", text);
2339 polyAtail = SPI_IsItPolyA(bsp_mrna->id);
2340 if (polyAtail >= SPI_MINPOLYASIZE)
2341 fprintf(ofp, "Poly(A)+ tail length: %d\n", polyAtail);
2342 fprintf(ofp, "\n");
2343 fflush(ofp);
2344 if (spot->printaln && ofp2 != NULL) /* print alignment too */
2345 {
2346 epp_curr = herd->epp;
2347 fprintf(ofp2, "Genomic: %s %s\n", textid1, text1);
2348 fprintf(ofp2, "mRNA: %s %s\n", textid2, text2);
2349 amp = AlnMsgNew2();
2350 for (i=0; i<herd->numpieces; i++)
2351 {
2352 starts = herd->gstarts;
2353 stops = herd->gstops;
2354 epp_curr = herd->epp;
2355 while (epp_curr != NULL && epp_curr->exonnum != i+1)
2356 {
2357 epp_curr = epp_curr->next;
2358 }
2359 fprintf(ofp2, "Fragment %d Exon %d: %d-%d (gen) %d-%d (mRNA)\n", herd->fragments[i], herd->exons[i], starts[i], stops[i], herd->mstarts[i], herd->mstops[i]);
2360 sap = herd->saps[i];
2361 strand = AlnMgr2GetNthStrand(sap, 1);
2362 len = AlnMgr2GetAlnLength(sap, FALSE);
2363 AlnMgr2GetNthSeqRangeInSA(sap, 1, &gstart, NULL);
2364 end = FALSE;
2365 for (l=0; l<len; l+= SPI_LINE)
2366 {
2367 if (l == SPI_LINE)
2368 l = SPI_LINE - 1 - 10; /* kludge to print genomic splice on 1st line */
2369 for (j=1; j<=2; j++)
2370 {
2371 fprintf(ofp2, "\n");
2372 if (j == 1)
2373 {
2374 fprintf(ofp2, "\n");
2375 bsp = bsp_genomic;
2376 } else
2377 {
2378 bsp = bsp_mrna;
2379 spi_print_mismatch_line(ofp2, i+1, l, len-1, epp_curr, gstart);
2380 }
2381 AlnMsgReNew2(amp);
2382 amp->from_aln = l;
2383 if (l != 0)
2384 {
2385 if (l+SPI_LINE-1 >= len-1)
2386 {
2387 end = TRUE;
2388 amp->to_aln = -1;
2389 } else
2390 amp->to_aln = l+SPI_LINE-1;
2391 } else
2392 {
2393 if (l+SPI_LINE-1 >= len-1)
2394 end = TRUE;
2395 amp->to_aln = MIN(SPI_LINE-1-10, len-1);
2396 }
2397 amp->row_num = j;
2398 done = FALSE;
2399 while ((Boolean)(more = AlnMgr2GetNextAlnBit(sap, amp)))
2400 {
2401 if (l==0 && j==1 && !done) /* print splice site */
2402 {
2403 if (amp->from_row != 0 && amp->from_row != bsp_genomic->length-1)
2404 {
2405 if (amp->strand != Seq_strand_minus)
2406 {
2407 if (amp->from_row < 10)
2408 start = 0;
2409 else
2410 start = amp->from_row - 10;
2411 stop = amp->from_row - 1;
2412 } else
2413 {
2414 if (amp->to_row + 10 < bsp_genomic->length-1)
2415 stop = amp->to_row + 10;
2416 else
2417 stop = bsp_genomic->length-1;
2418 start = amp->to_row+1;
2419 }
2420 spp = SeqPortNew(bsp_genomic, start, stop, amp->strand, Seq_code_iupacna);
2421 ctr = SeqPortRead(spp, (Uint1Ptr)buf, 10);
2422 if (ctr > 0)
2423 buf[ctr] = '\0';
2424 fwrite(buf, 1, ctr, ofp2);
2425 SeqPortFree(spp);
2426 }
2427 done = TRUE;
2428 } else if (l==0 && j==2 && !done)
2429 {
2430 fprintf(ofp2, " "); /* 10 spaces for splice site */
2431 done = TRUE;
2432 }
2433 if (amp->type == AM_SEQ)
2434 {
2435 spp = SeqPortNew(bsp, amp->from_row, amp->to_row, amp->strand, Seq_code_iupacna);
2436 ctr = SeqPortRead(spp, (Uint1Ptr)buf, SPI_LINE);
2437 if (ctr > 0)
2438 buf[ctr] = '\0';
2439 fprintf(ofp2, "%s", buf);
2440 SeqPortFree(spp);
2441 } else
2442 {
2443 for (ctr=0; ctr<(amp->to_row - amp->from_row+1); ctr++)
2444 {
2445 fprintf(ofp2, "-");
2446 }
2447 }
2448 if (j==1 && end)
2449 {
2450 AlnMgr2GetNthSeqRangeInSA(sap, 1, &start, &stop);
2451 if (strand != Seq_strand_minus)
2452 {
2453 if (stop > bsp_genomic->length - 11)
2454 start = bsp_genomic->length;
2455 else
2456 start = stop + SPI_PSPLICE;
2457 spp = SeqPortNew(bsp_genomic, stop+1, start, strand, Seq_code_iupacna);
2458 } else
2459 {
2460 if (start < SPI_PSPLICE)
2461 stop = 0;
2462 else
2463 stop = start - SPI_PSPLICE;
2464 spp = SeqPortNew(bsp_genomic, stop, start-1, strand, Seq_code_iupacna);
2465 }
2466 endctr = ctr;
2467 ctr = SeqPortRead(spp, (Uint1Ptr)buf, SPI_PSPLICE);
2468 if (ctr > 0)
2469 buf[ctr] = '\0';
2470 for (ctr=0; endctr+ctr <= SPI_LINE-1 && ctr<SPI_PSPLICE; ctr++)
2471 {
2472 fprintf(ofp2, "%c", buf[ctr]);
2473 }
2474 endstr = NULL;
2475 if (ctr < SPI_PSPLICE)
2476 endstr = StringSave(&buf[ctr]);
2477 SeqPortFree(spp);
2478 }
2479 }
2480 }
2481 }
2482 if (endstr == NULL)
2483 fprintf(ofp2, "\n\n");
2484 else
2485 {
2486 fprintf(ofp2, "\n\n%s\n\n", endstr);
2487 MemFree(endstr);
2488 }
2489 }
2490 AlnMsgFree2(amp);
2491 }
2492 }
2493
2494 /***************************************************************************
2495 *
2496 * spi_print_mismatch_line takes a SPI_ExonProf structure and interprets
2497 * the mismatch locations into a line of vertical bars for identity,
2498 * nothing for mismatches or gaps. spi_print_mismatch_line only goes from
2499 * start to start+len-1 each time; it does not interpret the entire structure.
2500 *
2501 ***************************************************************************/
spi_print_mismatch_line(FILE * ofp,Int4 exonnum,Int4 start,Int4 len,SPI_ExonProfPtr epp,Int4 gstart)2502 static void spi_print_mismatch_line(FILE *ofp, Int4 exonnum, Int4 start, Int4 len, SPI_ExonProfPtr epp, Int4 gstart)
2503 {
2504 Int4 i;
2505 Int4 j;
2506 Int4 length;
2507
2508 if (ofp == NULL)
2509 return;
2510 if (start == 0)
2511 {
2512 length = MIN(SPI_LINE-10, len+1);
2513 fprintf(ofp, " "); /* 10 spaces for splice site */
2514 } else
2515 length = MIN(SPI_LINE, len-start+1);
2516 if (epp != NULL && epp->exonnum == exonnum)
2517 {
2518 j = 0;
2519 while (j<epp->nummismatches && epp->mismatches[j] < start)
2520 {
2521 j++;
2522 }
2523 for (i=0; i<length; i++)
2524 {
2525 if (j<epp->nummismatches && epp->mismatches[j] == start+i) /* here's a mismatch */
2526 {
2527 fprintf(ofp, " ");
2528 j++;
2529 } else /* not a mismatch */
2530 fprintf(ofp, "|");
2531 }
2532 } else /* there are no mismatches at all in this exon, so just print |s */
2533 {
2534 for (i=0; i<length; i++)
2535 {
2536 fprintf(ofp, "|");
2537 }
2538 }
2539 fprintf(ofp, "\n");
2540 }
2541
2542 /***************************************************************************
2543 *
2544 * SPI_CreateContinuousAln creates a single dense-seg seqalign from a
2545 * set of mRNA-to-genomic alignments. The introns are represented simply
2546 * as gaps in the mRNA sequence. SPI_CreateContinuousAln calls
2547 * SPI_ExtendAlnRight to extend each of the exon alignments across the intron,
2548 * then it merges the exon alignments together to create a single seqalign
2549 * spanning the entire mRNA-to-genomic alignment.
2550 *
2551 ***************************************************************************/
SPI_CreateContinuousAln(SeqAlignPtr PNTR saps,Int4 numsaps)2552 static SeqAlignPtr SPI_CreateContinuousAln(SeqAlignPtr PNTR saps, Int4 numsaps)
2553 {
2554 DenseSegPtr dsp;
2555 DenseSegPtr dsp_tmp;
2556 Int4 i;
2557 Int4 j;
2558 Int4 n1;
2559 Int4 n2;
2560 Int4 numseg;
2561 SeqAlignPtr salp;
2562 Int4 start1;
2563 Int4 start2;
2564 Int4 stop1;
2565 Int4 stop2;
2566 Uint1 strand;
2567
2568 for (i=0; i<numsaps-1; i++)
2569 {
2570 AlnMgr2GetNthSeqRangeInSA(saps[i], 1, &start1, &stop1);
2571 AlnMgr2GetNthSeqRangeInSA(saps[i+1], 1, &start2, &stop2);
2572 if (start2 - stop1 > 1) /* genomic gap */
2573 SPI_ExtendAlnRight(saps[i], 1, stop1+1, start2-1);
2574 AlnMgr2GetNthSeqRangeInSA(saps[i], 2, &start1, &stop1);
2575 AlnMgr2GetNthSeqRangeInSA(saps[i+1], 2, &start2, &stop2);
2576 strand = AlnMgr2GetNthStrand(saps[i], 2);
2577 if (strand == Seq_strand_minus)
2578 {
2579 if (start1 - stop2 > 1)
2580 SPI_ExtendAlnRight(saps[i], 2, stop2+1, start1-1);
2581 } else
2582 {
2583 if (start2 - stop1 > 1)
2584 SPI_ExtendAlnRight(saps[i], 2, stop1+1, start2-1);
2585 }
2586 }
2587 numseg = 0;
2588 for (i=0; i<numsaps; i++)
2589 {
2590 dsp_tmp = (DenseSegPtr)(saps[i]->segs);
2591 numseg += dsp_tmp->numseg;
2592 }
2593 /* now make a new seqalign across the whole set */
2594 dsp = DenseSegNew();
2595 dsp->dim = 2;
2596 dsp->numseg = numseg;
2597 dsp->starts = (Int4Ptr)MemNew(2*numseg*sizeof(Int4));
2598 dsp->lens = (Int4Ptr)MemNew(numseg*sizeof(Int4));
2599 dsp->strands = (Uint1Ptr)MemNew(2*numseg*sizeof(Uint1));
2600 n1 = n2 = 0;
2601 for (i=0; i<numsaps; i++)
2602 {
2603 dsp_tmp = (DenseSegPtr)(saps[i]->segs);
2604 if (dsp->ids == NULL)
2605 dsp->ids = SeqIdDupList(dsp_tmp->ids);
2606 for (j=0; j<2*dsp_tmp->numseg; j++)
2607 {
2608 dsp->starts[n1+j] = dsp_tmp->starts[j];
2609 dsp->strands[n1+j] = dsp_tmp->strands[j];
2610 }
2611 for (j=0; j<dsp_tmp->numseg; j++)
2612 {
2613 dsp->lens[n2+j] = dsp_tmp->lens[j];
2614 }
2615 n1 += 2*dsp_tmp->numseg;
2616 n2 += dsp_tmp->numseg;
2617 }
2618 salp = SeqAlignNew();
2619 salp->type = SAT_PARTIAL;
2620 salp->segtype = SAS_DENSEG;
2621 salp->dim = 2;
2622 salp->segs = (Pointer)(dsp);
2623 AlnMgr2IndexSingleChildSeqAlign(salp);
2624 return salp;
2625 }
2626
2627 /***************************************************************************
2628 *
2629 * SPI_ExtendAlnRight is used by SPI_CreateContinuousAln to extend each
2630 * exon alignment across the intron. SPI_ExtendAlnRight simply adds a
2631 * segment to the exon alignment (or extends an existing segment, if
2632 * possible) that has a gap in the mRNA sequence. SPI_ExtendAlnRight assumes
2633 * that the input alignment is a child seqalign with two rows.
2634 *
2635 ***************************************************************************/
SPI_ExtendAlnRight(SeqAlignPtr sap,Int4 which_row,Int4 start,Int4 stop)2636 static void SPI_ExtendAlnRight(SeqAlignPtr sap, Int4 which_row, Int4 start, Int4 stop)
2637 {
2638 DenseSegPtr dsp;
2639 Int4 i;
2640 Int4Ptr lens;
2641 Int4Ptr starts;
2642 Uint1Ptr strands;
2643
2644 if (sap == NULL)
2645 return;
2646 if (which_row > 2)
2647 return;
2648 dsp = (DenseSegPtr)(sap->segs);
2649 if (dsp->starts[2*(dsp->numseg-1) + which_row - 1] == -1 || dsp->starts[2*(dsp->numseg-1) + (2-which_row)] != -1)
2650 {
2651 starts = (Int4Ptr)MemNew((dsp->numseg+1)*2*sizeof(Int4));
2652 strands = (Uint1Ptr)MemNew((dsp->numseg+1)*2*sizeof(Uint1));
2653 lens = (Int4Ptr)MemNew((dsp->numseg+1)*sizeof(Int4));
2654 for (i=0; i<dsp->numseg; i++)
2655 {
2656 lens[i] = dsp->lens[i];
2657 }
2658 for (i=0; i<=(dsp->dim)*(dsp->numseg-1)+1; i++)
2659 {
2660 starts[i] = dsp->starts[i];
2661 strands[i] = dsp->strands[i];
2662 }
2663 lens[dsp->numseg] = stop - start + 1;
2664 if (dsp->strands[which_row-1] != Seq_strand_minus)
2665 starts[(dsp->dim)*(dsp->numseg) + which_row - 1] = start;
2666 else
2667 starts[(dsp->dim)*(dsp->numseg) + which_row - 1] = stop;
2668 starts[(dsp->dim)*(dsp->numseg) + (2-which_row)] = -1;
2669 strands[(dsp->dim)*(dsp->numseg) + which_row - 1] = dsp->strands[which_row-1];
2670 strands[(dsp->dim)*(dsp->numseg) + (2-which_row)] = dsp->strands[2-which_row];
2671 MemFree(dsp->starts);
2672 MemFree(dsp->lens);
2673 MemFree(dsp->strands);
2674 dsp->numseg++;
2675 dsp->starts = starts;
2676 dsp->strands = strands;
2677 dsp->lens = lens;
2678 } else
2679 {
2680 dsp->lens[dsp->numseg-1] += stop - start + 1;
2681 if (dsp->strands[which_row-1] == Seq_strand_minus)
2682 dsp->starts[(dsp->dim)*(dsp->numseg-1) + which_row - 1] = stop;
2683 }
2684 SAIndex2Free2(sap->saip);
2685 sap->saip = NULL;
2686 AlnMgr2IndexSingleChildSeqAlign(sap);
2687 }
2688
2689 /***************************************************************************
2690 *
2691 * SPI_AlnSinglemRNAToPieces is the entry point for the mRNA-to-draft
2692 * sequence functions of spidey, which create an alignment between mRNAs
2693 * and a series of ordered, unordered, oriented, or unoriented (often
2694 * a mixture of all of the above) fragments. SPI_AlnSinglemRNAToPieces
2695 * reads in a tab-delimited file that has information about the order
2696 * and orientation of the fragments. For example:
2697
2698 * ctg name start stop fragment number accession.version
2699 * | | | | fragment code | start stop strand
2700 * 9/ctg119 775986 784968 137 D AC020712.4 96692 105674 -
2701 * 9/ctg119 784969 810517 138 D AC022758.3 103385 128933 +
2702 * 9/ctg119 810518 810880 139 D AC020712.4 54074 54436 +
2703 * 9/ctg119 810881 822654 140 D AC022758.3 16691 28464 +
2704 * 9/ctg119 822655 822754 141 N 100 fragment yes
2705 * 9/ctg119 822755 823638 142 D AC020712.4 153248 154131 -
2706 * 9/ctg119 823639 823738 143 N 100 fragment no
2707 * 9/ctg119 823739 824581 144 F AC021710.5 1728 2570 +
2708
2709 * The fragment code indicates whether the fragment is draft quality (D),
2710 * finished (F), predraft (P), or a gap (N). If the fragment is a gap
2711 * (which consists of 100 Ns), the yes/no field indicates whether the
2712 * adjoining fragments are ordered across the gap. In spidey, a group
2713 * is a set of fragments which all reside between the same two gaps.
2714 * Linked groups, or lgroups, are two groups spanning a gap with a "yes"
2715 * indicating that the groups have known order. Each fragment has an
2716 * order within the group, as well.
2717 * In this example, there are two lgroups (the last fragment is in its
2718 * own lgroup) and three groups.
2719 * SPI_AlnSinglemRNAToPieces reads in the tab-delimited file and creates
2720 * a SPI_Pos structure for each fragment, indicating which group, lgroup,
2721 * order, and original fragment number this fragment belongs to. Since
2722 * the gap fragments are not used, the number of SPI_FragPtrs will be
2723 * less than the number of fragments, so the original fragment numbers
2724 * must be stored. This function also does the initial high-stringency
2725 * BLAST alignment of the mRNA and the draft sequence; the alignments
2726 * are put on the correct strand and then sent to other functions to
2727 * order the alignments, make them consistent, connect them together,
2728 * adjust the ends to splice sites, and finally to get summary
2729 * statistics for printing.
2730 *
2731 ***************************************************************************/
SPI_AlnSinglemRNAToPieces(SPI_bsinfoPtr spig_head,SPI_bsinfoPtr spim,FILE * ofp,FILE * ofp2,SPI_OptionsPtr spot)2732 NLM_EXTERN SPI_mRNAToHerdPtr SPI_AlnSinglemRNAToPieces(SPI_bsinfoPtr spig_head, SPI_bsinfoPtr spim, FILE *ofp, FILE *ofp2, SPI_OptionsPtr spot)
2733 {
2734 AMAlignIndex2Ptr amaip;
2735 Int4 c;
2736 CharPtr field[SPI_NUMCOLS];
2737 FILE *fp;
2738 Int4 group;
2739 SPI_mRNAToHerdPtr herd;
2740 Int4 i;
2741 Char line[200];
2742 Boolean linked;
2743 Int4 lgroup;
2744 Int4 numFields;
2745 BLAST_OptionsBlkPtr options;
2746 Int4 order;
2747 SPI_PosPtr posp;
2748 CharPtr ptr;
2749 Char token;
2750 SeqAlignPtr salp;
2751 SeqAlignPtr salp_prev;
2752 SeqAlignPtr salp_tmp;
2753 SeqAlignPtr sap;
2754 SeqAlignPtr sap1;
2755 SeqAlignPtr sap2;
2756 SPI_FragPtr sfp;
2757 SPI_FragPtr sfp_head;
2758 SPI_FragPtr sfp_prev;
2759 SPI_FragHerdPtr sfhp;
2760 SeqLocPtr slp1;
2761 SeqLocPtr slp2;
2762 Int4 start;
2763 Int4 stop;
2764 Uint1 strand;
2765
2766 if (spot->draftfile == NULL)
2767 return NULL;
2768 if (spot->to < spot->from)
2769 return NULL;
2770 fp = FileOpen(spot->draftfile, "r");
2771 if (fp == NULL)
2772 return NULL;
2773 sfhp = (SPI_FragHerdPtr)MemNew(sizeof(SPI_FragHerd));
2774 sfp_head = sfp_prev = NULL;
2775 group = 0;
2776 order = 0;
2777 lgroup = 0;
2778 linked = FALSE;
2779 while (fgets(line, sizeof (line), fp) != NULL)
2780 {
2781 memset(field, 0, sizeof (field));
2782 ptr = line;
2783 if ((ptr = strchr(ptr, '\t')) == NULL)
2784 token = ' ';
2785 else
2786 token = '\t';
2787 ptr = line;
2788 for (numFields=0; numFields < SPI_NUMCOLS && ptr != NULL; numFields++)
2789 {
2790 if (numFields == 0)
2791 ptr = strtok(ptr, &token);
2792 else
2793 ptr = strtok(NULL, &token);
2794 field[numFields] = ptr;
2795 }
2796 if (!StringICmp(field[4], "N")) /* gap */
2797 {
2798 order = 0;
2799 if (!StringNICmp(field[7], "yes", 3*sizeof(Char))) /* ordered across gap */
2800 {
2801 linked = TRUE;
2802 if (sfp_prev != NULL)
2803 {
2804 if (sfp_prev->position_orig->lgroup != 0)
2805 lgroup = sfp_prev->position_orig->lgroup;
2806 else
2807 {
2808 lgroup++;
2809 sfp_prev->position_orig->lgroup = lgroup;
2810 }
2811 } else
2812 lgroup++;
2813 } else
2814 linked = FALSE;
2815 } else
2816 {
2817 order++;
2818 sfp = (SPI_FragPtr)MemNew(sizeof(SPI_Frag));
2819 sfp->start = atol(field[1]);
2820 sfp->stop = atol(field[2]);
2821 sfp->fragnum = atol(field[3]);
2822 posp = (SPI_PosPtr)MemNew(sizeof(SPI_Pos));
2823 if (linked)
2824 posp->lgroup = lgroup;
2825 if (order > 2)
2826 posp->group = group;
2827 else if (order == 2)
2828 {
2829 if (sfp_prev != NULL)
2830 sfp_prev->position_orig->group = group;
2831 } else
2832 group++;
2833 posp->group = group;
2834 posp->order = order;
2835 sfp->position_orig = posp;
2836 if (sfp_head != NULL)
2837 {
2838 sfp_prev->next = sfp;
2839 sfp_prev = sfp;
2840 } else
2841 sfp_head = sfp_prev = sfp;
2842 sfhp->numfrags++;
2843 }
2844 }
2845 sfhp->sfparray = (SPI_FragPtr PNTR)MemNew((sfhp->numfrags)*sizeof(SPI_FragPtr));
2846 sfp = sfp_head;
2847 for (i=0; i<sfhp->numfrags; i++)
2848 {
2849 sfhp->sfparray[i] = sfp;
2850 sfp = sfp->next;
2851 }
2852 sfhp->polyAtail = SPI_IsItPolyA(spim->bsp->id);
2853 /* search genomic against both strands of mRNA */
2854 if (spot->from == spot->to == 0)
2855 spot->to = spig_head->bsp->length-1;
2856 slp2 = SeqLocIntNew(0, spim->bsp->length-1-sfhp->polyAtail, Seq_strand_minus, spim->bsp->id);
2857 slp1 = SeqLocIntNew(spot->from, spot->to, Seq_strand_plus, spig_head->bsp->id);
2858 options = BLASTOptionNew("blastn", FALSE);
2859 options->filter_string = StringSave("m L");
2860 options->expect_value = spot->secpasseval;
2861 options->query_lcase_mask = spot->lcaseloc;
2862 if (spot->interspecies)
2863 {
2864 options->gap_x_dropoff_final = 100;
2865 options->gap_open = 4;
2866 options->gap_extend = 1;
2867 options->penalty = -1;
2868 }
2869 sap1 = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
2870 SeqLocFree(slp2);
2871 BLASTOptionDelete(options);
2872 slp2 = SeqLocIntNew(0, spim->bsp->length-1-sfhp->polyAtail, Seq_strand_minus, spim->bsp->id);
2873 options = BLASTOptionNew("blastn", FALSE);
2874 options->filter_string = StringSave("m L");
2875 options->expect_value = spot->secpasseval;
2876 options->query_lcase_mask = spot->lcaseloc;
2877 if (spot->interspecies)
2878 {
2879 options->gap_x_dropoff_final = 100;
2880 options->gap_open = 4;
2881 options->gap_extend = 1;
2882 options->penalty = -1;
2883 }
2884 sap2 = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
2885 SeqLocFree(slp1);
2886 SeqLocFree(slp2);
2887 BLASTOptionDelete(options);
2888 AlnMgr2IndexLite(sap1);
2889 AlnMgr2IndexLite(sap2);
2890 sap = NULL;
2891 if (sap1 != NULL && sap2 != NULL)
2892 {
2893 salp = (SeqAlignPtr)(sap1->segs);
2894 while (salp->next != NULL)
2895 {
2896 salp = salp->next;
2897 }
2898 salp->next = (SeqAlignPtr)(sap2->segs);
2899 sap2->segs = NULL;
2900 SeqAlignFree(sap2);
2901 AMAlignIndex2Free2(sap1->saip);
2902 sap1->saip = NULL;
2903 AlnMgr2IndexLite(sap1);
2904 sap = sap1;
2905 } else if (sap1 == NULL)
2906 sap = sap2;
2907 else
2908 sap = sap1;
2909 if (sap == NULL)
2910 return NULL;
2911 SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
2912 AlnMgr2SortAlnSetByNthRowPos(sap, 1);
2913 c = 0;
2914 amaip = (AMAlignIndex2Ptr)(sap->saip);
2915 AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 1, &start, &stop);
2916 for (i=0; i<sfhp->numfrags && c<amaip->numsaps; i++)
2917 {
2918 salp_tmp = salp_prev = NULL;
2919 while (sfhp->sfparray[i]->start <= start && sfhp->sfparray[i]->stop >= start && c<amaip->numsaps)
2920 {
2921 if (salp_tmp == NULL)
2922 salp_tmp = salp_prev = SeqAlignDup(amaip->saps[c]);
2923 else
2924 {
2925 salp_prev->next = SeqAlignDup(amaip->saps[c]);
2926 salp_prev = salp_prev->next;
2927 }
2928 c++;
2929 if (c<amaip->numsaps)
2930 AlnMgr2GetNthSeqRangeInSA(amaip->saps[c], 1, &start, &stop);
2931 }
2932 if (salp_tmp != NULL)
2933 {
2934 AlnMgr2IndexLite(salp_tmp);
2935 SPI_RemoveInconsistentAlnsFromSet(salp_tmp, SPI_TEENYEXON, 1, SPI_LEFT);
2936 sfhp->sfparray[i]->sap = salp_tmp;
2937 /* change all alignments to be on the plus strand of the mRNA */
2938 strand = AlnMgr2GetNthStrand((SeqAlignPtr)(salp_tmp->segs), 2);
2939 if (strand == Seq_strand_minus)
2940 SeqAlignListReverseStrand((SeqAlignPtr)(salp_tmp->segs));
2941 }
2942 }
2943 SeqAlignSetFree(sap);
2944 SPI_OrderInternally(sfhp);
2945 /* take out overlaps */
2946 SPI_RemoveConflictsAmongPieces(sfhp, SPI_TEENYEXON);
2947 /* do an initial ordering */
2948 SPI_OrderPieces(sfhp, spim->bsp);
2949 /* then look for missing pieces */
2950 if (!SPI_ConnectAlnPieces(sfhp, spig_head->bsp, spim->bsp, spot))
2951 return NULL;
2952 SPI_OrderInternally(sfhp);
2953 /* take out any remaining overlaps */
2954 SPI_RemoveConflictsAmongPieces(sfhp, SPI_TEENYEXON);
2955 /* then do the final ordering */
2956 SPI_OrderPieces(sfhp, spim->bsp);
2957 SPI_AdjustSplicesInPieces(sfhp, spig_head->bsp, spot);
2958 herd = SPI_GetHerdInfo(sfhp, spim->bsp, spot);
2959 SPI_PrintHerdResult(ofp, ofp2, herd, spot, spig_head->bsp, spim->bsp);
2960 return herd;
2961 }
2962
2963 /***************************************************************************
2964 *
2965 * SPI_GetHerdInfo fills in a SPI_mRNAToHerd structure with all the
2966 * appropriate information about mRNA and genomic starts, stops, and strands;
2967 * presence of splice donor and acceptor sites; number of mismatches and
2968 * gaps for each exon; and one alignment for each exon. SPI_GetHerdInfo
2969 * first decides how many exons there are and allocates one ExonHerdInfo
2970 * structure per exon to store the necessary information. The SPI_mRNAToHerd
2971 * structure is then allocated, and for each exon, SPI_GetExonInfo is called
2972 * to retrieve the number of gaps, the number of mismatches, and the
2973 * mismatch line for printing. After all the exons' information is filled
2974 * in, the function goes through again and checks to see whether any two
2975 * exons are close to the edges of their respective fragments and abut
2976 * each other on the mRNA. If so, these "exons" are probably a single exon
2977 * and are assigned the same exon number. Finally, the alignments are
2978 * checked to see whether small pieces at the 5' and 3' ends have been
2979 * omitted; if so, these pieces are added to the alignments.
2980 *
2981 ***************************************************************************/
SPI_GetHerdInfo(SPI_FragHerdPtr sfhp,BioseqPtr bsp_mrna,SPI_OptionsPtr spot)2982 static SPI_mRNAToHerdPtr SPI_GetHerdInfo(SPI_FragHerdPtr sfhp, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
2983 {
2984 AMAlignIndex2Ptr amaip;
2985 Int4 b;
2986 Int4 c;
2987 SPI_ExonHerdInfoPtr ehi;
2988 SPI_ExonHerdInfoPtr ehi_head;
2989 SPI_ExonHerdInfoPtr ehi_prev;
2990 Int4 end;
2991 SPI_mRNAToHerdPtr herd;
2992 Int4 i;
2993 Int4 j;
2994 Int4 k;
2995 Int4 l;
2996 Int4 last;
2997 Int4 len;
2998 Int4 len_last;
2999 Int4 max;
3000 Int4 min;
3001 Int4 mis;
3002 Int4 offset;
3003 SeqAlignPtr salp;
3004 SeqAlignPtr salp_tmp;
3005 SPI_FragPtr sfp;
3006 SPI_mRNAPtr smp_fake;
3007 Uint1 strand;
3008
3009 herd = (SPI_mRNAToHerdPtr)MemNew(sizeof(SPI_mRNAToHerd));
3010 ehi_head = ehi_prev = NULL;
3011 for (i=0; i<sfhp->numfrags; i++)
3012 {
3013 sfp = sfhp->sfparray[i];
3014 if (sfp->sap != NULL)
3015 {
3016 amaip = (AMAlignIndex2Ptr)(sfp->sap->saip);
3017 strand = AlnMgr2GetNthStrand(amaip->saps[0], 1);
3018 if (strand == Seq_strand_minus)
3019 {
3020 k = amaip->numsaps-1;
3021 l = -1;
3022 } else
3023 {
3024 k = 0;
3025 l = 1;
3026 }
3027 for (j=k; j<amaip->numsaps && j > -1; j+=l)
3028 {
3029 ehi = (SPI_ExonHerdInfoPtr)MemNew(sizeof(SPI_ExonHerdInfo));
3030 ehi->sfpnum = i;
3031 herd->numpieces++;
3032 ehi->sap = amaip->saps[j];
3033 ehi->fragmentnum = sfp->fragnum;
3034 if (amaip->numsaps == 1)
3035 {
3036 ehi->acceptor = sfp->acceptor;
3037 ehi->donor = sfp->donor;
3038 } else if (j == 0)
3039 {
3040 ehi->acceptor = sfp->acceptor;
3041 ehi->donor = sfp->smp->splicedon[j];
3042 } else if (j == amaip->numsaps - 1)
3043 {
3044 ehi->donor = sfp->donor;
3045 ehi->acceptor = sfp->smp->spliceacc[j];
3046 } else
3047 {
3048 ehi->donor = sfp->smp->splicedon[j];
3049 ehi->acceptor = sfp->smp->spliceacc[j];
3050 }
3051 if (ehi_head != NULL)
3052 {
3053 ehi_prev->next = ehi;
3054 ehi_prev = ehi;
3055 } else
3056 ehi_head = ehi_prev = ehi;
3057 }
3058 }
3059 }
3060 ehi = ehi_head;
3061 if (ehi == NULL)
3062 return NULL;
3063 herd->fragments = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3064 herd->sfpnum = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3065 herd->exons = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3066 herd->mstarts = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3067 herd->mstops = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3068 herd->gstarts = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3069 herd->gstops = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3070 herd->lens = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3071 herd->strands = (Uint1Ptr)MemNew((herd->numpieces)*sizeof(Uint1));
3072 herd->splicedon = (Uint1Ptr)MemNew((herd->numpieces)*sizeof(Uint1));
3073 herd->spliceacc = (Uint1Ptr)MemNew((herd->numpieces)*sizeof(Uint1));
3074 herd->pmismatch = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3075 herd->pgaps = (Int4Ptr)MemNew((herd->numpieces)*sizeof(Int4));
3076 herd->saps = (SeqAlignPtr PNTR)MemNew((herd->numpieces)*sizeof(SeqAlignPtr));
3077 herd->fallsoff = (Uint1Ptr)MemNew((herd->numpieces)*sizeof(Uint1));
3078 i = 0;
3079 smp_fake = (SPI_mRNAPtr)MemNew(sizeof(SPI_mRNA));
3080 smp_fake->mstarts = (Int4Ptr)MemNew(sizeof(Int4));
3081 smp_fake->mstops = (Int4Ptr)MemNew(sizeof(Int4));
3082 smp_fake->gstarts = (Int4Ptr)MemNew(sizeof(Int4));
3083 smp_fake->gstops = (Int4Ptr)MemNew(sizeof(Int4));
3084 smp_fake->exonid = (FloatHiPtr)MemNew(sizeof(FloatHi));
3085 smp_fake->exongaps = (Int4Ptr)MemNew(sizeof(Int4));
3086 smp_fake->saps = (SeqAlignPtr PNTR)MemNew(sizeof(SeqAlignPtr));
3087 mis = 0;
3088 len = 0;
3089 len_last = 0;
3090 while (ehi != NULL) /* fill in info for each piece */
3091 {
3092 herd->splicedon[i] = ehi->donor;
3093 herd->spliceacc[i] = ehi->acceptor;
3094 herd->strands[i] = AlnMgr2GetNthStrand(ehi->sap, 1);
3095 herd->saps[i] = ehi->sap;
3096 smp_fake->saps[0] = ehi->sap;
3097 last = mis;
3098 len += SPI_GetExonInfo(smp_fake, 0, &b, &c, &mis, spot);
3099 herd->pmismatch[i] = mis - last;
3100 herd->lens[i] = len - len_last;
3101 herd->mstarts[i] = b;
3102 herd->mstops[i] = c;
3103 herd->gstarts[i] = smp_fake->gstarts[0];
3104 herd->gstops[i] = smp_fake->gstops[0];
3105 herd->pgaps[i] = smp_fake->exongaps[0];
3106 sfp = sfhp->sfparray[ehi->sfpnum];
3107 herd->fragments[i] = ehi->fragmentnum;
3108 herd->sfpnum[i] = ehi->sfpnum;
3109 if (herd->gstarts[i] < sfp->start + SPI_FUZZ)
3110 {
3111 if (herd->gstops[i] > sfp->stop - SPI_FUZZ)
3112 herd->fallsoff[i] = SPI_BOTH;
3113 else
3114 herd->fallsoff[i] = SPI_LEFT;
3115 } else
3116 {
3117 if (herd->gstops[i] > sfp->stop - SPI_FUZZ)
3118 herd->fallsoff[i] = SPI_RIGHT;
3119 else
3120 herd->fallsoff[i] = SPI_NEITHER;
3121 }
3122 ehi_prev = ehi;
3123 ehi = ehi->next;
3124 ehi_prev->next = NULL;
3125 MemFree(ehi_prev);
3126 i++;
3127 }
3128 herd->mismatch = (100)*(FloatHi)mis/(FloatHi)len;
3129 herd->epp = smp_fake->epp;
3130 herd->mRNAcoverage = 100*((FloatHi)len/(FloatHi)bsp_mrna->length);
3131 /* now run through to see whether any two pieces should be */
3132 /* merged into a single exon (both near fragment edges) */
3133 b = 1;
3134 for (i=0; i<herd->numpieces-1; i++)
3135 {
3136 if (((herd->fallsoff[i] == SPI_RIGHT && herd->strands[i] == Seq_strand_plus)
3137 || (herd->fallsoff[i] == SPI_LEFT && herd->strands[i] == Seq_strand_minus)
3138 || (herd->fallsoff[i] == SPI_BOTH)) &&
3139 ((herd->fallsoff[i+1] == SPI_LEFT && herd->strands[i+1] == Seq_strand_plus)
3140 || (herd->fallsoff[i+1] == SPI_RIGHT && herd->strands[i+1] == Seq_strand_minus)
3141 || (herd->fallsoff[i+1] == SPI_BOTH)))
3142 {
3143 if (herd->mstarts[i+1] < herd->mstops[i] - SPI_TEENYEXON)
3144 herd->exons[i] = b;
3145 }
3146 else
3147 {
3148 herd->exons[i] = b;
3149 if (herd->fallsoff[i] == SPI_RIGHT)
3150 herd->fallsoff[i] = SPI_NEITHER;
3151 if (herd->fallsoff[i] == SPI_BOTH)
3152 herd->fallsoff[i] = SPI_LEFT;
3153 if (herd->fallsoff[i+1] == SPI_LEFT)
3154 herd->fallsoff[i+1] = SPI_NEITHER;
3155 if (herd->fallsoff[i+1] == SPI_BOTH)
3156 herd->fallsoff[i+1] = SPI_RIGHT;
3157 b++;
3158 }
3159 }
3160 herd->exons[i] = b;
3161 herd->numexons = b;
3162 /* now get %id per exon, #gaps per exon */
3163 i = 0;
3164 herd->exonid = (FloatHiPtr)MemNew((herd->numexons)*sizeof(FloatHi));
3165 herd->exongaps = (Int4Ptr)MemNew((herd->numexons)*sizeof(Int4));
3166 while (i<herd->numpieces)
3167 {
3168 min = herd->mstarts[i];
3169 b = herd->pmismatch[i];
3170 c = herd->pgaps[i];
3171 while (i<herd->numpieces-1 && herd->exons[i] == herd->exons[i+1])
3172 {
3173 i++;
3174 b += herd->pmismatch[i];
3175 c += herd->pgaps[i];
3176 }
3177 max = herd->mstops[i];
3178 herd->exongaps[herd->exons[i]-1] = c;
3179 herd->exonid[herd->exons[i]-1] = (FloatHi)(max - min + 1 - c - b)/(FloatHi)(max - min + 1 - c);
3180 i++;
3181 }
3182 /* check -- does the alignment leave off a tiny piece of the beginning */
3183 /* or end of the mRNA (ignoring the polyA tail) ? */
3184 /* first check the beginning */
3185 end = bsp_mrna->length - 1 - sfhp->polyAtail;
3186 if (herd->mstarts[0] > 0 && herd->mstarts[0] < SPI_TEENYEXON)
3187 {
3188 strand = AlnMgr2GetNthStrand(herd->saps[0], 1);
3189 if (strand == Seq_strand_minus)
3190 {
3191 salp = (SeqAlignPtr)(herd->saps[0]);
3192 salp_tmp = salp->next;
3193 salp->next = NULL;
3194 SAIndex2Free2(salp->saip);
3195 salp->saip = NULL;
3196 SeqAlignListReverseStrand(salp);
3197 AlnMgr2IndexSingleChildSeqAlign(salp);
3198 salp->next = salp_tmp;
3199 }
3200 sfp = sfhp->sfparray[herd->sfpnum[0]];
3201 offset = herd->mstarts[0];
3202 herd->mstarts[0] = 0;
3203 if (strand == Seq_strand_minus)
3204 herd->gstops[0] += offset;
3205 else
3206 herd->gstarts[0] -= offset;
3207 if (herd->gstarts[0] < sfp->start + herd->mstarts[0])
3208 {
3209 herd->mstarts[0] = herd->mstarts[0] - (herd->gstarts[0] - sfp->start);
3210 herd->gstarts[0] = sfp->start;
3211 offset = herd->gstarts[0] - sfp->start;
3212 } else if (herd->gstops[0] > sfp->stop - herd->mstarts[0])
3213 {
3214 herd->mstarts[0] = herd->mstarts[0] - (sfp->stop - herd->gstops[0]);
3215 herd->gstops[0] = sfp->stop;
3216 offset = sfp->stop - herd->gstops[0];
3217 }
3218 SPI_AddToAln(herd->saps[0], offset, SPI_LEFT, strand);
3219 if (strand == Seq_strand_minus)
3220 {
3221 salp = (SeqAlignPtr)(herd->saps[0]);
3222 salp_tmp = salp->next;
3223 salp->next = NULL;
3224 SAIndex2Free2(salp->saip);
3225 salp->saip = NULL;
3226 SeqAlignListReverseStrand(salp);
3227 AlnMgr2IndexSingleChildSeqAlign(salp);
3228 salp->next = salp_tmp;
3229 }
3230 }
3231 /* now check the end */
3232 if (herd->mstops[herd->numpieces-1] > end - SPI_TEENYEXON && herd->mstops[herd->numpieces-1] != end)
3233 {
3234 strand = AlnMgr2GetNthStrand(herd->saps[herd->numpieces-1], 1);
3235 if (strand == Seq_strand_minus)
3236 {
3237 salp = (SeqAlignPtr)(herd->saps[herd->numpieces-1]);
3238 salp_tmp = salp->next;
3239 salp->next = NULL;
3240 SAIndex2Free2(salp->saip);
3241 salp->saip = NULL;
3242 SeqAlignListReverseStrand(salp);
3243 AlnMgr2IndexSingleChildSeqAlign(salp);
3244 salp->next = salp_tmp;
3245 }
3246 sfp = sfhp->sfparray[herd->sfpnum[herd->numpieces-1]];
3247 offset = end - herd->mstops[herd->numpieces-1];
3248 herd->mstops[herd->numpieces-1] = bsp_mrna->length-1;
3249 if (strand == Seq_strand_minus)
3250 herd->gstarts[herd->numpieces-1] -= offset;
3251 else
3252 herd->gstops[herd->numpieces-1] += offset;
3253 if (herd->gstarts[herd->numpieces-1] < sfp->start + offset)
3254 {
3255 offset = herd->gstops[herd->numpieces-1] - sfp->start;
3256 herd->mstops[herd->numpieces-1] = herd->mstops[herd->numpieces-1] + offset;
3257 herd->gstarts[herd->numpieces-1] = sfp->start;
3258 } else if (herd->gstops[herd->numpieces-1] > sfp->stop - offset)
3259 {
3260 offset = sfp->stop - herd->gstops[herd->numpieces-1];
3261 herd->mstops[herd->numpieces-1] = herd->mstops[herd->numpieces-1] - offset;
3262 herd->gstops[herd->numpieces-1] = sfp->stop;
3263 }
3264 SPI_AddToAln(herd->saps[herd->numpieces-1], offset, SPI_RIGHT, strand);
3265 if (strand == Seq_strand_minus)
3266 {
3267 salp = (SeqAlignPtr)(herd->saps[herd->numpieces-1]);
3268 salp_tmp = salp->next;
3269 salp->next = NULL;
3270 SAIndex2Free2(salp->saip);
3271 salp->saip = NULL;
3272 SeqAlignListReverseStrand(salp);
3273 AlnMgr2IndexSingleChildSeqAlign(salp);
3274 salp->next = salp_tmp;
3275 }
3276 }
3277 if (herd->mstarts[0] > 0)
3278 {
3279 if (herd->mstops[herd->numpieces-1] < bsp_mrna->length-1)
3280 herd->missingends = SPI_BOTH;
3281 else
3282 herd->missingends = SPI_LEFT;
3283 } else
3284 {
3285 if (herd->mstops[herd->numpieces-1] < bsp_mrna->length-1)
3286 herd->missingends = SPI_RIGHT;
3287 else
3288 herd->missingends = SPI_NEITHER;
3289 }
3290 MemFree(smp_fake->mstarts);
3291 MemFree(smp_fake->mstops);
3292 MemFree(smp_fake->gstarts);
3293 MemFree(smp_fake->gstops);
3294 MemFree(smp_fake->exonid);
3295 MemFree(smp_fake->exongaps);
3296 MemFree(smp_fake->saps);
3297 MemFree(smp_fake);
3298 return herd;
3299 }
3300
3301 /***************************************************************************
3302 *
3303 * SPI_FindWindows first sorts all the alignments by score, and then
3304 * sends the array to SPI_AssembleRegions, which puts together
3305 * nonoverlapping regions containing one or more alignments each. The idea:
3306 *
3307 * 0-----100----200----300----400----500----600----700----800---- (genomic)
3308 * 0 \ \ \ \
3309 * 50 \ \ \
3310 * 100 \ \
3311 * 150 \ \
3312 * 200 \ \
3313 * (mRNA)
3314 * <-1-> <---2----> <-3-> <---4--->
3315 * This set of initial alignments defines 4 windows; the best n windows
3316 * will be chosen and the alignment will be refined in those windows.
3317 *
3318 ***************************************************************************/
SPI_FindWindows(SeqAlignPtr sap,SPI_OptionsPtr spot)3319 static SPI_RegionInfoPtr SPI_FindWindows(SeqAlignPtr sap, SPI_OptionsPtr spot)
3320 {
3321 AMAlignIndex2Ptr amaip;
3322 /* FloatHi bit_score; */
3323 /* FloatHi evalue; */
3324 Int4 i;
3325 /* Int4 number; */
3326 SeqAlignPtr salp;
3327 /* Int4 score; */
3328 SPI_AlnInfoPtr PNTR spip_list;
3329 SPI_RegionInfoPtr srip_head;
3330 FloatHi s, s1;
3331 Int4 s2, tmp;
3332
3333 if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
3334 return NULL;
3335 amaip = (AMAlignIndex2Ptr)(sap->saip);
3336 spip_list = (SPI_AlnInfoPtr PNTR)MemNew((amaip->numsaps)*sizeof(SPI_AlnInfoPtr));
3337 for (i=0; i<amaip->numsaps; i++)
3338 {
3339 salp = amaip->saps[i];
3340 spip_list[i] = (SPI_AlnInfoPtr)MemNew(sizeof(SPI_AlnInfo));
3341 spip_list[i]->sap = salp;
3342 salp->next = NULL;
3343 /*
3344 spip_list[i]->bit_score = AlnMgr2ComputeScoreForSeqAlign(salp);*/
3345 tmp = spip_list[i]->bit_score;
3346 GetScoreAndEvalue(salp, &tmp, &s, &s1, &s2);
3347 spip_list[i]->bit_score = tmp;
3348 }
3349 HeapSort(spip_list, i, sizeof(SPI_AlnInfoPtr), SPI_compare_aln_score);
3350 srip_head = NULL;
3351 srip_head = SPI_AssembleRegions(spip_list, amaip->numsaps, &srip_head, spot);
3352 srip_head = SPI_SortRegions(srip_head);
3353 for (i=0; i<amaip->numsaps; i++)
3354 {
3355 spip_list[i]->sap = NULL;
3356 MemFree(spip_list[i]);
3357 if (i < amaip->numsaps-1)
3358 amaip->saps[i]->next = amaip->saps[i+1];
3359 }
3360 sap->segs = (Pointer)(amaip->saps[0]);
3361 MemFree(spip_list);
3362 return srip_head;
3363 }
3364
3365 /***************************************************************************
3366 *
3367 * SPI_compare_aln_score is the callback for the HeapSort in
3368 * SPI_FindWindows; it simply compares the scores of two alignments.
3369 *
3370 ***************************************************************************/
SPI_compare_aln_score(VoidPtr ptr1,VoidPtr ptr2)3371 static int LIBCALLBACK SPI_compare_aln_score(VoidPtr ptr1, VoidPtr ptr2)
3372 {
3373 SPI_AlnInfoPtr spip1;
3374 SPI_AlnInfoPtr spip2;
3375
3376 if (ptr1 != NULL && ptr2 != NULL)
3377 {
3378 spip1 = *((SPI_AlnInfoPtr PNTR)ptr1);
3379 spip2 = *((SPI_AlnInfoPtr PNTR)ptr2);
3380 if (spip1->bit_score > spip2->bit_score)
3381 return -1;
3382 else if (spip1->bit_score < spip2->bit_score)
3383 return 1;
3384 else
3385 return 0;
3386 }
3387 return 0;
3388 }
3389
3390 /***************************************************************************
3391 *
3392 * SPI_SortRegions takes a linked list of new regions (no alignments) and
3393 * makes sure that they are in order by score, to ensure that the first
3394 * region analyzed is the region with the most potential.
3395 *
3396 ***************************************************************************/
/* SPI_SortRegions
 *
 *   srip_head  head of a singly linked list of regions (may be NULL)
 *
 * Copies the nodes into a temporary array, sorts it with SPI_SortSrips and
 * relinks the nodes in sorted order.  Returns the new head, NULL for an
 * empty list, or the list unchanged if the temporary array cannot be
 * allocated.
 */
static SPI_RegionInfoPtr SPI_SortRegions(SPI_RegionInfoPtr srip_head)
{
   Int4                    count;
   Int4                    j;
   SPI_RegionInfoPtr       srip;
   SPI_RegionInfoPtr PNTR  sriparray;

   /* an empty list used to fall through to sriparray[-1] below */
   if (srip_head == NULL)
      return NULL;
   count = 0;
   for (srip = srip_head; srip != NULL; srip = srip->next)
      count++;
   sriparray = (SPI_RegionInfoPtr PNTR)MemNew(count*sizeof(SPI_RegionInfoPtr));
   if (sriparray == NULL)
      return srip_head;
   j = 0;
   for (srip = srip_head; srip != NULL; srip = srip->next)
      sriparray[j++] = srip;
   HeapSort(sriparray, count, sizeof(SPI_RegionInfoPtr), SPI_SortSrips);
   /* rebuild the links to follow the sorted array */
   for (j=0; j<count-1; j++)
   {
      sriparray[j]->next = sriparray[j+1];
   }
   sriparray[count-1]->next = NULL;
   srip = sriparray[0];
   MemFree(sriparray);
   return srip;
}
3430
3431 /***************************************************************************
3432 *
3433 * SPI_SortSrips is the HeapSort callback for SPI_SortRegions. It simply
3434 * orders the regions by score.
3435 *
3436 ***************************************************************************/
SPI_SortSrips(VoidPtr ptr1,VoidPtr ptr2)3437 static int LIBCALLBACK SPI_SortSrips(VoidPtr ptr1, VoidPtr ptr2)
3438 {
3439 SPI_RegionInfoPtr srip1;
3440 SPI_RegionInfoPtr srip2;
3441
3442 srip1 = *((SPI_RegionInfoPtr PNTR)ptr1);
3443 srip2 = *((SPI_RegionInfoPtr PNTR)ptr2);
3444 if (srip1->score > srip2->score)
3445 return -1;
3446 if (srip2->score > srip1->score)
3447 return 1;
3448 if (srip1->coverage > srip2->coverage)
3449 return -1;
3450 if (srip2->coverage > srip1->coverage)
3451 return 1;
3452 return 0;
3453 }
3454
3455 /***************************************************************************
3456 *
3457 * SPI_AssembleRegions is a recursive function which clusters the
3458 * alignments into consistent, nonoverlapping windows. On the first pass,
3459 * all the alignments are sent to the function SPI_GetRegionForSAP, and
3460 * only the consistent ones are put into the first region. Since the first
3461 * alignment is the highest-scoring alignment, this first region is
3462 * usually the best region. On each subsequent pass, an unused alignment
3463 * is assigned a genomic interval that does not overlap with any other
3464 * previously defined region, and that alignment and other alignments in
3465 * the same interval are sent to SPI_GetRegionForSAP to weed out
3466 * inconsistent alignments. This process is repeated until no alignments
3467 * are left -- all have either been assigned to a region or designated
3468 * impossible to assign, since they overlap with a defined region but are
3469 * inconsistent with other alignments in that region.
3470 *
3471 ***************************************************************************/
SPI_AssembleRegions(SPI_AlnInfoPtr PNTR spip_list,Int4 num,SPI_RegionInfoPtr PNTR head_srip,SPI_OptionsPtr spot)3472 static SPI_RegionInfoPtr SPI_AssembleRegions(SPI_AlnInfoPtr PNTR spip_list, Int4 num, SPI_RegionInfoPtr PNTR head_srip, SPI_OptionsPtr spot)
3473 {
3474 /* FloatHi bit_score; */
3475 /* FloatHi evalue; */
3476 Boolean found;
3477 Int4 i;
3478 Int4 j;
3479 Int4 lim_left;
3480 Int4 lim_right;
3481 Int4 n;
3482 /* Int4 number; */
3483 SeqAlignPtr sap;
3484 SPI_IvalPtr siip;
3485 SPI_IvalPtr siip_head;
3486 SPI_IvalPtr siip_prev;
3487 SPI_IvalPtr PNTR siip_list;
3488 SPI_RegionInfoPtr srip;
3489 SPI_RegionInfoPtr srip_tmp;
3490 Int4 start;
3491 Int4 stop;
3492
3493 if (spip_list == NULL || head_srip == NULL)
3494 return NULL;
3495 found = FALSE;
3496 i = 0;
3497 while (i<num && !found)
3498 {
3499 if (spip_list[i]->used == 0)
3500 found = TRUE;
3501 else
3502 i++;
3503 }
3504 if (!found)
3505 return *head_srip;
3506 sap = spip_list[i]->sap;
3507 AlnMgr2GetNthSeqRangeInSA(sap, 1, &start, &stop);
3508 srip_tmp = *head_srip;
3509 lim_left = -1;
3510 lim_right = -1;
3511 /* figure out what (unused) genomic interval this alignment is in */
3512 while (srip_tmp != NULL)
3513 {
3514 if (srip_tmp->gstop < start)
3515 {
3516 if (srip_tmp->gstop > lim_left)
3517 lim_left = srip_tmp->gstop;
3518 }
3519 if (srip_tmp->gstart > stop)
3520 {
3521 if (lim_right == -1 || srip_tmp->gstart < lim_right)
3522 lim_right = srip_tmp->gstart;
3523 }
3524 srip_tmp = srip_tmp->next;
3525 }
3526 siip_head = siip_prev = NULL;
3527 n = 0;
3528 for (j=0; j<num; j++)
3529 {
3530 if (spip_list[j]->used == 0)
3531 {
3532 AlnMgr2GetNthSeqRangeInSA(spip_list[j]->sap, 1, &start, &stop);
3533 /* if this unused alignment is in the same interval as the one */
3534 /* being looked at, put it in the array */
3535 if (start > lim_left && (stop < lim_right || lim_right == -1))
3536 {
3537 siip = (SPI_IvalPtr)MemNew(sizeof(SPI_Ival));
3538 if (j == i)
3539 siip->used = 1;
3540 siip->n = j;
3541 siip->gstart = start;
3542 siip->gstop = stop;
3543 AlnMgr2GetNthSeqRangeInSA(spip_list[j]->sap, 2, &siip->mstart, &siip->mstop);
3544 siip->strand = AlnMgr2GetNthStrand(spip_list[j]->sap, 2);
3545 siip->sap = spip_list[j]->sap;
3546 siip->score = AlnMgr2ComputeScoreForSeqAlign(siip->sap);
3547 if (siip_head != NULL)
3548 {
3549 siip_prev->next = siip;
3550 siip_prev = siip;
3551 } else
3552 siip_head = siip_prev = siip;
3553 n++;
3554 }
3555 }
3556 }
3557 siip_list = (SPI_IvalPtr PNTR)MemNew(n*sizeof(SPI_IvalPtr));
3558 siip = siip_head;
3559 for (j=0; j<n && siip != NULL; j++)
3560 {
3561 siip_list[j] = siip;
3562 siip = siip->next;
3563 }
3564 /* send the array of unused alignments to SPI_GetRegionForSAP to */
3565 /* weed out inconsistent alignments */
3566 srip = SPI_GetRegionForSAP(siip_list, n, sap, spot);
3567 if (srip != NULL)
3568 {
3569 if (*head_srip == NULL)
3570 *head_srip = srip;
3571 else
3572 {
3573 srip_tmp = *head_srip;
3574 while (srip_tmp->next != NULL)
3575 {
3576 srip_tmp = srip_tmp->next;
3577 }
3578 srip_tmp->next = srip;
3579 }
3580 /* update the information about which alignments have been used */
3581 /* and which alignments are impossible */
3582 for (j=0; j<n; j++)
3583 {
3584 spip_list[siip_list[j]->n]->used = siip_list[j]->used;
3585 if (((siip_list[j]->gstart > srip->gstart + SPI_FUZZ && siip_list[j]->gstart < srip->gstop - SPI_FUZZ) || (siip_list[j]->gstop > srip->gstart + SPI_FUZZ && siip_list[j]->gstop < srip->gstop - SPI_FUZZ)) && siip_list[j]->used == 0)
3586 {
3587 siip_list[j]->used = -1;
3588 spip_list[siip_list[j]->n]->used = -1;
3589 }
3590 }
3591 }
3592 for (j=0; j<n; j++)
3593 {
3594 MemFree(siip_list[j]);
3595 }
3596 MemFree(siip_list);
3597 /* recursive call to self*/
3598 srip = SPI_AssembleRegions(spip_list, num, head_srip, spot);
3599 return srip;
3600 }
3601
3602
3603 /***************************************************************************
3604 *
3605 * SPI_GetRegionForSAP takes a list of SPI_IvalPtrs, each of which carries
3606 * the information for one alignment, and a seqalign, which is the
3607 * "anchor" alignment for the new interval and which is higher-scoring than
3608 * any other seqalign in the set. The function checks all of the SPI_Ivals
3609 * to see whether they're consistent with the anchor alignment, and marks
3610 * each SPI_Ival as used, not used, or impossible.
3611 *
3612 ***************************************************************************/
SPI_GetRegionForSAP(SPI_IvalPtr PNTR siip_list,Int4 num,SeqAlignPtr sap,SPI_OptionsPtr spot)3613 static SPI_RegionInfoPtr SPI_GetRegionForSAP(SPI_IvalPtr PNTR siip_list, Int4 num, SeqAlignPtr sap, SPI_OptionsPtr spot)
3614 {
3615 Boolean done;
3616 Boolean found;
3617 Int4 i;
3618 Int2 j = SPI_UNKNOWN;
3619 Int4 n;
3620 SPI_RegionInfoPtr srip;
3621
3622 if (siip_list == NULL || num == 0 || sap == NULL)
3623 return NULL;
3624 /* sort the alignments along the genomic sequence */
3625 HeapSort(siip_list, num, sizeof(SPI_IvalPtr), SPI_compare_genomic_loc);
3626 SPI_CheckMrnaOrder(siip_list, num);
3627 found = FALSE;
3628 n = 0;
3629 /* figure out which one is the anchor alignment */
3630 while (!found && n<num)
3631 {
3632 if (sap == siip_list[n]->sap)
3633 found = TRUE;
3634 else
3635 n++;
3636 }
3637 if (!found)
3638 return NULL;
3639 /* make a new region with this alignment */
3640 srip = (SPI_RegionInfoPtr)MemNew(sizeof(SPI_RegionInfo));
3641 AlnMgr2GetNthSeqRangeInSA(sap, 1, &srip->gstart, &srip->gstop);
3642 AlnMgr2GetNthSeqRangeInSA(sap, 2, &srip->mstart, &srip->mstop);
3643 srip->coverage = abs(srip->mstop - srip->mstart) + 1;
3644 srip->score = siip_list[n]->score;
3645 srip->strand = AlnMgr2GetNthStrand(sap, 2);
3646 /* search to the right for consistent alignments */
3647 done = FALSE;
3648 for (i=n+1; i<num && !done; i++)
3649 {
3650 j = SPI_is_consistent(siip_list[i], srip, spot);
3651 if (j == SPI_CONSISTENT)
3652 {
3653 srip->coverage = abs(siip_list[i]->mstop - siip_list[i]->mstart) + srip->coverage + 1;
3654 siip_list[i]->used = 1;
3655 srip->score += siip_list[i]->score;
3656 }
3657 else if (j == SPI_IMPOSSIBLE)
3658 siip_list[i]->used = -1;
3659 else if (j == SPI_DONE1)
3660 {
3661 siip_list[i]->used = 1;
3662 done = TRUE;
3663 } else if (j == SPI_DONE2)
3664 done = TRUE;
3665 }
3666 /* search to the left for consistent alignments */
3667 done = FALSE;
3668 for (i=n-1; i>=0 && !done; i--)
3669 {
3670 j = SPI_is_consistent(siip_list[i], srip, spot);
3671 if (j == SPI_CONSISTENT)
3672 {
3673 srip->coverage = abs(siip_list[i]->mstop - siip_list[i]->mstart) + srip->coverage + 1;
3674 siip_list[i]->used = 1;
3675 srip->score += siip_list[i]->score;
3676 }
3677 else if (j == SPI_IMPOSSIBLE)
3678 siip_list[i]->used = -1;
3679 else if (j == SPI_DONE1)
3680 {
3681 siip_list[i]->used = 1;
3682 done = TRUE;
3683 } else if (j == SPI_DONE2)
3684 done = TRUE;
3685 }
3686 SPI_ExcludeOverlaps(siip_list, num, srip);
3687 return srip;
3688 }
3689
3690 /***************************************************************************
3691 *
3692 * SPI_is_consistent is the workhorse of SPI_GetRegionForSAP. Given an
3693 * SPI_Ival and a region, the function decides whether the alignment in
3694 * the SPI_Ival is consistent with the rest of the region. Alignments that
3695 * overlap by more than SPI_FUZZ are labeled impossible; alignments that
3696 * are consistent in both the genomic and mRNA coordinates and that overlap
3697 * by less than SPI_FUZZ are labeled consistent. Alignments that do not
3698 * overlap the region but which are not consistent in either the mRNA or
3699 * genomic coordinates are labeled unknown.
3700 *
3701 ***************************************************************************/
SPI_is_consistent(SPI_IvalPtr siip,SPI_RegionInfoPtr srip,SPI_OptionsPtr spot)3702 static Int2 SPI_is_consistent(SPI_IvalPtr siip, SPI_RegionInfoPtr srip, SPI_OptionsPtr spot)
3703 {
3704 Int4 intronsize;
3705
3706 if (siip == NULL || srip == NULL)
3707 return 0;
3708 if ((siip->strand == Seq_strand_minus && srip->strand != Seq_strand_minus) || (srip->strand == Seq_strand_minus && siip->strand != Seq_strand_minus))
3709 return SPI_UNKNOWN;
3710 /*KSK*/
3711 if (spot->bigintron){
3712 intronsize = (spot->bigintron_size > SPI_INTRONSIZEXL
3713 ? spot->bigintron_size : SPI_INTRONSIZEXL);
3714 }
3715 else{
3716 intronsize = SPI_INTRONSIZE;
3717 }
3718 /*end KSK*/
3719 /* first look for overlaps -- exclude these from the set */
3720 /* since we search outward from a core hit, there shouldn't be */
3721 /* any overlaps anyway. */
3722 if (siip->gstart > srip->gstart + SPI_FUZZ && siip->gstart < srip->gstop - SPI_FUZZ)
3723 return SPI_IMPOSSIBLE;
3724 if (siip->gstart > srip->gstop - SPI_FUZZ && siip->gstart < srip->gstop + intronsize)
3725 {
3726 if (siip->strand == Seq_strand_minus)
3727 {
3728 if (siip->mstop < srip->mstart + SPI_FUZZ)
3729 {
3730 srip->gstop = siip->gstop;
3731 srip->mstart = siip->mstart;
3732 return SPI_CONSISTENT;
3733 } else
3734 return SPI_UNKNOWN;
3735 } else
3736 {
3737 if (siip->mstart > srip->mstop - SPI_FUZZ)
3738 {
3739 srip->gstop = siip->gstop;
3740 srip->mstop = siip->mstop;
3741 return SPI_CONSISTENT;
3742 } else
3743 return SPI_UNKNOWN;
3744 }
3745 } else if (siip->gstop <= srip->gstart + SPI_FUZZ && siip->gstop > srip->gstart - intronsize)
3746 {
3747 if (siip->strand == Seq_strand_minus)
3748 {
3749 if (siip->mstart > srip->mstop - SPI_FUZZ)
3750 {
3751 srip->gstart = siip->gstart;
3752 srip->mstop = siip->mstop;
3753 return SPI_CONSISTENT;
3754 } else
3755 return SPI_UNKNOWN;
3756 } else
3757 {
3758 if (siip->mstop < srip->mstart + SPI_FUZZ)
3759 {
3760 srip->gstart = siip->gstart;
3761 srip->mstart = siip->mstart;
3762 return SPI_CONSISTENT;
3763 } else
3764 return SPI_UNKNOWN;
3765 }
3766 }
3767 return SPI_UNKNOWN;
3768 }
3769
3770 /***************************************************************************
3771 *
3772 * SPI_compare_genomic_loc is the callback for the HeapSort in
3773 * SPI_GetRegionForSAP. It simply orders two SPI_Ival structures
3774 * by their genomic start coordinates.
3775 *
3776 ***************************************************************************/
SPI_compare_genomic_loc(VoidPtr ptr1,VoidPtr ptr2)3777 static int LIBCALLBACK SPI_compare_genomic_loc(VoidPtr ptr1, VoidPtr ptr2)
3778 {
3779 SPI_IvalPtr siip1;
3780 SPI_IvalPtr siip2;
3781
3782 if (ptr1 != NULL && ptr2 != NULL)
3783 {
3784 siip1 = *((SPI_IvalPtr PNTR)ptr1);
3785 siip2 = *((SPI_IvalPtr PNTR)ptr2);
3786 if (siip1->gstart < siip2->gstart)
3787 return -1;
3788 else if (siip1->gstart > siip2->gstart)
3789 return 1;
3790 else
3791 return 0;
3792 }
3793 return 0;
3794 }
3795
3796 /***************************************************************************
3797 *
3798 * SPI_ExcludeOverlaps takes a completed region and examines all SPI_Ivals
3799 * for conflicts. Intervals which overlap the completed region are
3800 * marked impossible.
3801 *
3802 ***************************************************************************/
/* Marks as impossible (used = -1) every interval whose genomic start or
   genomic stop falls inside [srip->gstart, srip->gstop].  NULL entries in
   siip_list are skipped; a NULL list or region is a no-op. */
static void SPI_ExcludeOverlaps(SPI_IvalPtr PNTR siip_list, Int4 num, SPI_RegionInfoPtr srip)
{
   Int4         i;
   SPI_IvalPtr  siip;

   if (siip_list == NULL || srip == NULL)
      return;
   for (i=0; i<num; i++)
   {
      siip = siip_list[i];
      /* the test must be "not NULL": the previous "== 0" form examined */
      /* only NULL entries and then dereferenced them                   */
      if (siip == NULL)
         continue;
      if ((siip->gstart >= srip->gstart && siip->gstart <= srip->gstop) || (siip->gstop >= srip->gstart && siip->gstop <= srip->gstop))
         siip->used = -1;
   }
}
3816
3817 /***************************************************************************
3818 *
3819 * SPI_AlignInWindows organizes the regions into an array, sends each
3820 * region to SPI_DoAln for careful alignment, and then takes all the
3821 * regions that have alignments and puts them into a linked list,
3822 * freeing those regions that do not have alignments.
3823 *
3824 ***************************************************************************/
/* Runs SPI_DoAln over the leading regions of *head_srip, then rebuilds the
   list so it holds only regions that ended up with an alignment (smp set);
   the other regions are released with MemFree. */
static void SPI_AlignInWindows(SPI_RegionInfoPtr PNTR head_srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
{
   Int4                    aligned;
   SPI_RegionInfoPtr       kept;
   SPI_RegionInfoPtr PNTR  link;
   SPI_RegionInfoPtr       node;
   Int4                    top_coverage;
   SPI_RegionInfoPtr       upcoming;

   if (head_srip == NULL || *head_srip == NULL)
      return;
   top_coverage = (*head_srip)->coverage;
   aligned = 0;
   /* stop once enough regions have aligned and the remaining ones cover */
   /* less than half of what the first region covers                     */
   for (node = *head_srip; node != NULL; node = node->next)
   {
      if (aligned >= spot->numreturns+1 && node->coverage < top_coverage/2)
         break;
      SPI_DoAln(node, bsp_genomic, bsp_mrna, spot);
      if (node->smp != NULL)
         aligned++;
   }
   /* relink the survivors in their original order */
   kept = NULL;
   link = &kept;
   for (node = *head_srip; node != NULL; node = upcoming)
   {
      upcoming = node->next;
      node->next = NULL;
      if (node->smp == NULL)
      {
         MemFree(node);
         continue;
      }
      *link = node;
      link = &node->next;
   }
   *head_srip = kept;
}
3870
3871
3872 /***************************************************************************
3873 *
3874 * SPI_DoAln first re-BLASTs the mRNA against the genomic interval
3875 * specified by the region. If the mRNA is truncated in the interval,
3876 * the function pads the appropriate side to encourage a complete
3877 * alignment. The function then calls other functions to remove
3878 * inconsistent alignments from the set, extend the alignments so that
3879 * they completely span the mRNA, and adjust the alignments to the
3880 * most appropriate splice sites.
3881 *
3882 ***************************************************************************/
/* Re-BLASTs the mRNA against a padded genomic window around srip, cleans
   and connects the resulting alignments, and stores the splice-adjusted
   result in srip->smp.  srip->smp is left untouched when any step fails or
   when the progress callback asks to stop.  The SeqLocs and BLAST options
   created here are released on every exit path. */
static void SPI_DoAln(SPI_RegionInfoPtr srip, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
{
   Int4                 eoff;
   BLAST_OptionsBlkPtr  options;
   SPI_Progress         progress;
   SeqAlignPtr          sap;
   SeqLocPtr            slp_g;
   SeqLocPtr            slp_m;
   Int4                 soff;

   if (srip == NULL || bsp_genomic == NULL || bsp_mrna == NULL || spot == NULL)
      return;
   /* pad the genomic window on the 5' side in proportion to the amount */
   /* of mRNA that lies before the region                               */
   if (srip->mstart == 0)
      soff = 0;
   else if (srip->mstart < 50)
      soff = 2*(srip->mstart);
   else
      soff = 3*(srip->mstart);
   if (soff > srip->gstart)
      soff = srip->gstart;
   /* same padding rule for the mRNA that lies after the region */
   if (srip->mstop == bsp_mrna->length - 1)
      eoff = 0;
   else if (bsp_mrna->length - 1 - srip->mstop < 50)
      eoff = 2*(bsp_mrna->length - 1 - srip->mstop);
   else
      eoff = 3*(bsp_mrna->length - 1 - srip->mstop);
   if (srip->gstop + eoff > bsp_genomic->length - 1)
      eoff = bsp_genomic->length - 1 - srip->gstop;
   slp_g = SeqLocIntNew(MAX(srip->gstart-soff, spot->from), MIN(srip->gstop+eoff, spot->to), Seq_strand_plus, bsp_genomic->id);
   slp_m = SeqLocIntNew(0, bsp_mrna->length-1, srip->strand, bsp_mrna->id);
   if (slp_g == NULL || slp_m == NULL)
   {
      /* release whichever location was created */
      if (slp_g != NULL)
         SeqLocFree(slp_g);
      if (slp_m != NULL)
         SeqLocFree(slp_m);
      ErrPostEx(SEV_ERROR, 0, 0, "Error in SPI_DoAln\n");
      return;
   }
   options = BLASTOptionNew("blastn", TRUE);
   if (options == NULL)
   {
      SeqLocFree(slp_m);
      SeqLocFree(slp_g);
      ErrPostEx(SEV_ERROR, 0, 0, "Error in SPI_DoAln\n");
      return;
   }
   options->filter_string = StringSave("m L");
   options->expect_value = spot->secpasseval;
   options->wordsize = 7; /*minimum BLAST wordsize */
   if (spot->interspecies)
   {
      options->gap_x_dropoff_final = 100;
      options->gap_open = 4;
      options->gap_extend = 1;
      options->penalty = -1;
   }
   options->query_lcase_mask = spot->lcaseloc;
   /* use mRNA as the query to speed up BLAST */
   sap = BlastTwoSequencesByLoc(slp_m, slp_g, "blastn", options);
   /* the search inputs are no longer needed; free them before the     */
   /* callback so that an abort request cannot leak them               */
   SeqLocFree(slp_m);
   SeqLocFree(slp_g);
   BLASTOptionDelete(options);
   if (spot->callback != NULL)
   {
      progress.percentdone = 50;
      progress.returncode = SPI_PROGRESS;
      /* NOTE(review): sap is not released on this or the later early  */
      /* returns; confirm the right free routine for an indexed set    */
      if (!spot->callback(&progress))
         return;
   }
   if (sap == NULL)
   {
      ErrPostEx(SEV_ERROR, 0, 0, "Error in SPI_DoAln\n");
      return;
   }

   if (!AlnMgr2IndexLite(sap))
      return;
   /* flip alignments so genomic sequence is the first row */
   SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
   /* remove alignments that overlap by more than 2*SPI_TEENYEXON or that */
   /* are not consistent along genomic or mRNA coordinates */
   SPI_RemoveInconsistentAlnsFromSet(sap, 2*SPI_TEENYEXON, 1, SPI_LEFT);
   if (spot->interspecies == FALSE) /* extend to both ends of mRNA */
   {
      if (!SPI_ConnectAln(sap, spot, srip, TRUE, TRUE))
         return;
   } else /* for interspecies alignments, don't try to extend to the ends */
   {
      if (!SPI_ConnectAln(sap, spot, srip, FALSE, TRUE))
         return;
   }
   if (spot->callback != NULL)
   {
      progress.percentdone = 75;
      progress.returncode = SPI_PROGRESS;
      if (!spot->callback(&progress))
         return;
   }
   srip->smp = SPI_AdjustForSplice(sap, spot, srip);
}
3973
3974 /***************************************************************************
3975 *
3976 * SPI_CheckForPolyAExon looks at the 3' terminal exon and checks to see
3977 * whether it consists only of polyAs. If so, the exon is deleted.
3978 *
3979 ***************************************************************************/
/* Drops the terminal child alignment (first child on the minus strand,
   last child otherwise) when it starts within the poly(A) tail length
   reported by SPI_IsItPolyA, then re-indexes and re-sorts the set.  Does
   nothing for a NULL, non-parent or empty set, for a set with a single
   child, or when the mRNA bioseq cannot be locked. */
static void SPI_CheckForPolyAExon(SeqAlignPtr sap)
{
   AMAlignIndex2Ptr  amaip;
   BioseqPtr         bsp;
   Int4              i;
   Int4              len;
   Int4              polya;
   SeqAlignPtr       salp;
   SeqAlignPtr       salp_prev;
   SeqAlignPtr       sap_target;
   SeqIdPtr          sip;
   Int4              start;
   Int4              stop;
   Uint1             strand;

   if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
      return;
   amaip = (AMAlignIndex2Ptr)(sap->saip);
   /* an empty set has no saps[0] to inspect */
   if (amaip->saps == NULL || amaip->numsaps < 1)
      return;
   /* make the next pointers follow the order of the index array */
   for (i=0; i<amaip->numsaps-1; i++)
   {
      amaip->saps[i]->next = amaip->saps[i+1];
   }
   amaip->saps[i]->next = NULL;
   strand = AlnMgr2GetNthStrand(amaip->saps[0], 2);
   sip = AlnMgr2GetNthSeqIdPtr(amaip->saps[0], 2);
   bsp = BioseqLockById(sip);
   if (bsp == NULL)
   {
      SeqIdFree(sip);
      return;
   }
   len = bsp->length;
   BioseqUnlock(bsp);
   if (strand == Seq_strand_minus)
      AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 2, &start, &stop);
   else
      AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 2, &start, &stop);
   polya = SPI_IsItPolyA(sip);
   SeqIdFree(sip);
   /* the terminal exon begins before the tail, so it is kept */
   if (len - start > polya)
      return;
   /* never delete the only alignment in the set */
   if (amaip->numsaps == 1)
      return;
   if (strand == Seq_strand_minus)
      sap_target = amaip->saps[0];
   else
      sap_target = amaip->saps[amaip->numsaps-1];
   /* unlink the target from the child list and free it */
   salp_prev = NULL;
   for (salp = (SeqAlignPtr)(sap->segs); salp != NULL; salp = salp->next)
   {
      if (salp == sap_target)
      {
         if (salp_prev == NULL)
            sap->segs = (Pointer)(sap_target->next);
         else
            salp_prev->next = sap_target->next;
         SeqAlignFree(sap_target);
         break;
      }
      salp_prev = salp;
   }
   /* rebuild the index so that it matches the shortened list */
   AMAlignIndexFreeEitherIndex(sap);
   AlnMgr2IndexLite(sap);
   AlnMgr2SortAlnSetByNthRowPos(sap, 1);
}
4043
4044 /***************************************************************************
4045 *
4046 * SPI_ConnectAln looks through all the alignments in a set and fills in
4047 * the gaps on the mRNA sequence. If do_ends is TRUE, the function will
4048 * try to fill in the alignments until they extend to both ends of the
4049 * mRNA; otherwise the function only fills in internal gaps. If a gap is
4050 * greater than the size of the smallest possible exon (SPI_TEENYEXON),
4051 * the function checks to see whether there's also a gap in the genomic
4052 * sequence. If both sequences have different-sized gaps bigger than
4053 * SPI_FUZZ, the function calls SPI_FillInIntron to fill in the gaps.
4054 * If both sequences have gaps larger than SPI_FUZZ and of similar sizes,
4055 * the function calls Fasika Aklilu's tree-based alignment
4056 * functions via SPI_FindBestAlnByDotPlot. SPI_ConnectAln does
4057 * check to see whether a non-matching mRNA 3' tail is actually a poly(A)
4058 * tail; if so, the tail is left unaligned.
4059 *
4060 ***************************************************************************/
/* Appends the list starting at sap_new to the list tracked by *head and
   *tail, then advances *tail to the last node.  No-op when sap_new is NULL. */
static void spi_append_aln_list(SeqAlignPtr PNTR head, SeqAlignPtr PNTR tail, SeqAlignPtr sap_new)
{
   if (sap_new == NULL)
      return;
   if (*head == NULL)
      *head = *tail = sap_new;
   else
      (*tail)->next = sap_new;
   while ((*tail)->next != NULL)
   {
      *tail = (*tail)->next;
   }
}

/* Walks the child alignments of sap in genomic order and tries to fill the
   mRNA gaps between them (and, when do_ends is TRUE, before the first and
   after the last one) with newly computed alignments, which are appended
   to the set.  When firsttime is TRUE the function calls itself once more
   to pick up remaining pieces.  Returns FALSE if sap is not an indexed
   parent set, is empty, or either bioseq cannot be locked; TRUE otherwise.
   The SeqIds obtained for both rows are freed on every path that obtains
   them. */
static Boolean SPI_ConnectAln(SeqAlignPtr sap, SPI_OptionsPtr spot, SPI_RegionInfoPtr srip, Boolean do_ends, Boolean firsttime)
{
   AMAlignIndex2Ptr  amaip;
   BioseqPtr         bsp1;
   BioseqPtr         bsp2;
   Int4              currstart2;
   Int4              end2;
   Int4              gap1;
   Int4              gap2;
   Int4              i;
   Boolean           internal;
   Int4              j;
   Int4              len1;
   Int4              len2;
   SeqAlignPtr       newsaps;
   SeqAlignPtr       newsaps_prev;
   Int4              prevstop1;
   Int4              prevstop2;
   SeqAlignPtr       sap_new;
   SeqAlignPtr       sap_prev;
   SeqAlignPtr       sap_tmp;
   SeqIdPtr          sip1;
   SeqIdPtr          sip2;
   SeqLocPtr         slp1;
   SeqLocPtr         slp2;
   Int4              start1;
   Int4              start2;
   Int4              stop1;
   Int4              stop2;
   Uint1             strand1;
   Uint1             strand2;
   Int4              tail;

   if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
      return FALSE;
   AlnMgr2SortAlnSetByNthRowPos(sap, 1);
   amaip = (AMAlignIndex2Ptr)(sap->saip);
   if (amaip->numsaps < 1)
      return FALSE;
   if (amaip->numsaps == 1 && !do_ends)
      return TRUE;
   sip1 = AlnMgr2GetNthSeqIdPtr(amaip->saps[0], 1);
   sip2 = AlnMgr2GetNthSeqIdPtr(amaip->saps[0], 2);
   bsp1 = BioseqLockById(sip1);
   if (bsp1 == NULL)
   {
      /* sip1 and sip2 are freed on the normal exit path, so they must */
      /* be freed on the failure paths as well                         */
      if (sip1 != NULL)
         SeqIdFree(sip1);
      if (sip2 != NULL)
         SeqIdFree(sip2);
      return FALSE;
   }
   bsp2 = BioseqLockById(sip2);
   if (bsp2 == NULL)
   {
      BioseqUnlock(bsp1);
      if (sip1 != NULL)
         SeqIdFree(sip1);
      if (sip2 != NULL)
         SeqIdFree(sip2);
      return FALSE;
   }
   strand1 = AlnMgr2GetNthStrand(amaip->saps[0], 1);
   strand2 = AlnMgr2GetNthStrand(amaip->saps[0], 2);
   /* choose the "previous stop" positions that the first alignment is */
   /* measured against                                                 */
   if (do_ends)
   {
      prevstop1 = prevstop2 = -1;
      if (strand2 == Seq_strand_minus)
         prevstop2 = bsp2->length-1;
   } else
   {
      AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 1, &prevstop1, NULL);
      prevstop1--;
      if (strand2 == Seq_strand_minus)
      {
         AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 2, NULL, &prevstop2);
         prevstop2++;
      } else
      {
         AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 2, &prevstop2, NULL);
         prevstop2--;
      }
   }
   internal = FALSE;
   newsaps = newsaps_prev = NULL;
   for (i=0; i<amaip->numsaps; i++)
   {
      AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 1, &start1, &stop1);
      AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, &start2, &stop2);
      /* make the mRNA a little 'smaller' to force overlaps between adjacent exons */
      start2 = start2 + 2;
      stop2 = stop2 - 2;
      if (strand2 == Seq_strand_minus)
         currstart2 = stop2;
      else
         currstart2 = start2;
      if ((gap2 = spi_isa_gap(currstart2, prevstop2, strand2)) > SPI_TEENYEXON)
      {
         gap1 = spi_isa_gap(start1, prevstop1, strand1);
         if (gap1 < (gap2 - 2*SPI_FUZZ) && gap1 > SPI_FUZZ)
         {
            if (internal == TRUE && i<amaip->numsaps-1 && spot->interspecies == FALSE)
            {
               /* gap in mRNA but not genomic - possibly not the right region */
               /* keep the longer of the two neighbouring alignments          */
               len1 = AlnMgr2GetAlnLength(amaip->saps[i-1], FALSE);
               len2 = AlnMgr2GetAlnLength(amaip->saps[i], FALSE);
               if (len1 > len2)
               {
                  SeqAlignFree(amaip->saps[i]);
                  amaip->saps[i] = NULL;
                  j = i+1;
                  /* reset the boundaries */
                  stop1 = prevstop1;
                  start2 = stop2 = prevstop2;
               } else
               {
                  SeqAlignFree(amaip->saps[i-1]);
                  amaip->saps[i-1] = NULL;
                  j = i;
               }
               /* close the hole left in the index array */
               for ( ; j < amaip->numsaps; j++)
               {
                  amaip->saps[j-1] = amaip->saps[j];
                  amaip->saps[j] = NULL;
               }
               for (j=0; j<amaip->numsaps-1; j++)
               {
                  amaip->saps[j]->next = amaip->saps[j+1];
               }
               amaip->numsaps--;
               amaip->saps[amaip->numsaps-1]->next = NULL;
               /* revisit this index, which now holds the next alignment */
               i--;
            } else if (internal == FALSE) /* mRNA continues past beginning of genomic sequence */
            {
               sap_new = SPI_FindPiece(sip1, sip2, prevstop2, currstart2, strand2, start1, SPI_LEFT, spot);
               sap_new = SPI_ProcessNewAlns(sap_new);
               spi_append_aln_list(&newsaps, &newsaps_prev, sap_new);
            }
         } else if ((gap2-2*SPI_FUZZ <= gap1) && (gap1 <= gap2 + 2*SPI_FUZZ) && gap1 > SPI_FUZZ) /* same size gaps */
         {
            slp1 = SeqLocIntNew(prevstop1+1, start1-1, strand1, sip1);
            if (strand2 != Seq_strand_minus)
               slp2 = SeqLocIntNew(prevstop2+1, currstart2-1, strand2, sip2);
            else
               slp2 = SeqLocIntNew(currstart2+1, prevstop2-1, strand2, sip2);
            sap_new = SPI_FindBestAlnByDotPlot(slp1, slp2);
            sap_new = SPI_ProcessNewAlns(sap_new);
            SeqLocFree(slp1);
            SeqLocFree(slp2);
            spi_append_aln_list(&newsaps, &newsaps_prev, sap_new);
         } else if (gap1 >= SPI_FUZZ && gap2 >= SPI_FUZZ) /* gaps are different sizes -- intron? */
         {
            if (internal)
            {
               sap_new = SPI_FillInIntron(sip1, sip2, prevstop1, start1, prevstop2, currstart2, strand2, spot);
               sap_new = SPI_ProcessNewAlns(sap_new);
               spi_append_aln_list(&newsaps, &newsaps_prev, sap_new);
            } else /* first or last part of mRNA hasn't matched */
            {
               if (strand2 == Seq_strand_minus && do_ends) /* last exon -- check for polyA */
               {
                  srip->polyAtail = SPI_IsItPolyA(sip2);
                  if (srip->polyAtail >= SPI_MINPOLYASIZE)
                  {
                     if (srip->polyAtail < prevstop2)
                        prevstop2 = prevstop2 - srip->polyAtail; /* don't align tail */
                     else
                     {
                        if (prevstop2 >= SPI_MINPOLYASIZE)
                        {
                           srip->polyAtail = prevstop2;
                           prevstop2 = 0;
                        } else
                           srip->polyAtail = 0;
                     }
                  } else
                     srip->polyAtail = 0;
               }
               sap_new = SPI_FindPiece(sip1, sip2, prevstop2, currstart2, strand2, start1, SPI_LEFT, spot);
               sap_new = SPI_ProcessNewAlns(sap_new);
               spi_append_aln_list(&newsaps, &newsaps_prev, sap_new);
            }
         }
      }
      internal = TRUE;
      prevstop1 = stop1;
      if (strand2 == Seq_strand_minus)
         prevstop2 = start2;
      else
         prevstop2 = stop2;
   }
   /* chain the surviving children in index order and hang the newly */
   /* found alignments after the last of them                        */
   sap_tmp = amaip->saps[0];
   for (j=1; j<amaip->numsaps; j++)
   {
      sap_tmp->next = amaip->saps[j];
      sap_tmp = amaip->saps[j];
   }
   sap_tmp->next = newsaps;
   sap->segs = (Pointer)(amaip->saps[0]);
   if (do_ends)
   {
      if (strand2 != Seq_strand_minus) /* last exon -- check for polyA */
      {
         srip->polyAtail = SPI_IsItPolyA(sip2);
         if (srip->polyAtail >= SPI_MINPOLYASIZE)
            end2 = bsp2->length - 1 - srip->polyAtail;
         else
         {
            srip->polyAtail = 0;
            end2 = bsp2->length-1;
         }
         if (srip->polyAtail != 0 && prevstop2 > end2)
            srip->polyAtail = srip->polyAtail - (prevstop2-end2+1);
         if (srip->polyAtail < SPI_MINPOLYASIZE)
            srip->polyAtail = 0;
      } else
         end2 = -1;
      if (spi_isa_gap(end2, prevstop2, strand2))
      {
         sap_new = SPI_FindPiece(sip1, sip2, prevstop2, end2, strand2, prevstop1, SPI_RIGHT, spot);
         sap_new = SPI_ProcessNewAlns(sap_new);
         sap_tmp = (SeqAlignPtr)(sap->segs);
         while (sap_tmp != NULL && sap_tmp->next != NULL)
         {
            sap_tmp = sap_tmp->next;
         }
         sap_tmp->next = sap_new;
      }
   }
   sap_tmp = (SeqAlignPtr)(sap->segs);
   /* check whether last exon is all polya */
   tail = SPI_IsItPolyA(sip2);
   if (strand2 == Seq_strand_minus)
   {
      AlnMgr2GetNthSeqRangeInSA(sap_tmp, 2, &start2, &stop2);
      if (stop2 < SPI_TEENYEXON + tail)
      {
         sap->segs = (Pointer)(sap_tmp->next);
         sap_tmp->next = NULL;
         SeqAlignFree(sap_tmp);
      }
   } else
   {
      sap_prev = NULL;
      while (sap_tmp->next != NULL)
      {
         sap_prev = sap_tmp;
         sap_tmp = sap_tmp->next;
      }
      AlnMgr2GetNthSeqRangeInSA(sap_tmp, 2, &start2, &stop2);
      if (bsp2->length - 1 - start2 < SPI_TEENYEXON + tail)
      {
         if (sap_prev != NULL)
            sap_prev->next = NULL;
         SeqAlignFree(sap_tmp);
         if (sap_prev == NULL)
            sap->segs = NULL;
      }
   }
   /* rebuild the index array from the final child list */
   sap_tmp = (SeqAlignPtr)(sap->segs);
   i = 0;
   while (sap_tmp != NULL)
   {
      i++;
      sap_tmp = sap_tmp->next;
   }
   amaip->numsaps = i;
   MemFree(amaip->saps);
   amaip->saps = (SeqAlignPtr PNTR)MemNew(i*sizeof(SeqAlignPtr));
   sap_tmp = (SeqAlignPtr)(sap->segs);
   i = 0;
   while (sap_tmp != NULL)
   {
      amaip->saps[i] = sap_tmp;
      i++;
      sap_tmp = sap_tmp->next;
   }
   BioseqUnlock (bsp1);
   BioseqUnlock (bsp2);
   SeqIdFree(sip1);
   SeqIdFree(sip2);
   if (firsttime) /* reconnect to pick up last pieces */
      SPI_ConnectAln(sap, spot, srip, do_ends, FALSE);
   AlnMgr2SortAlnSetByNthRowPos(sap, 1);
   SPI_CheckForPolyAExon(sap);
   return TRUE;
}
4366
4367 /***************************************************************************
4368 *
4369 * SPI_ProcessNewAlns takes a linked list of child-type alignments and
4370 * runs SPI_RemoveInconsistentAlnsFromSet on a "fake" parent alignment
4371 * that it temporarily attaches to the children.
4372 *
4373 ***************************************************************************/
/* Wraps the child list sap in a temporary SAS_DISC parent, indexes it,
   runs SPI_RemoveInconsistentAlnsFromSet on it, and returns the (possibly
   changed) child list.  The temporary parent is detached from the children
   and then released with SeqAlignFree rather than MemFree, so that the
   index attached by AlnMgr2IndexLite is not leaked.
   NOTE(review): assumes SeqAlignFree releases sap_head->saip and, with
   segs set to NULL, does not touch the children -- confirm against the
   toolkit version in use. */
static SeqAlignPtr SPI_ProcessNewAlns(SeqAlignPtr sap)
{
   SeqAlignPtr  sap_head;

   if (sap == NULL)
      return NULL;
   sap_head = SeqAlignNew();
   if (sap_head == NULL)
      return sap; /* cannot filter; hand the list back unchanged */
   sap_head->segtype = SAS_DISC;
   sap_head->segs = (Pointer)sap;
   AlnMgr2IndexLite(sap_head);
   SPI_RemoveInconsistentAlnsFromSet(sap_head, SPI_TEENYEXON, 1, SPI_LEFT);
   sap = (SeqAlignPtr)(sap_head->segs);
   /* detach the children before freeing the parent */
   sap_head->segs = NULL;
   SeqAlignFree(sap_head);
   return sap;
}
4390
4391 /***************************************************************************
4392 *
4393 * SPI_IsItPolyA is a utility function which returns the length of
4394 * the poly(A) tail of a sequence. The poly(A) tail must be at least
4395 * SPI_MINPOLYASIZE long and can have non-A residues up to the SPI_LINKERSIZE
4396 * position from the end (linker used to clone the cDNA may sometimes be left
4397 * on the end of the poly(A) tail).
4398 *
4399 ***************************************************************************/
SPI_IsItPolyA(SeqIdPtr sip)4400 static Int4 SPI_IsItPolyA(SeqIdPtr sip)
4401 {
4402 BioseqPtr bsp;
4403 Uint1 buf[SPI_MAXPOLYASIZE];
4404 Int4 count;
4405 Boolean done;
4406 Int4 i;
4407 Int4 j;
4408 Uint1 res;
4409 SeqPortPtr spp;
4410 Int4 start;
4411
4412 bsp = BioseqLockById(sip);
4413 spp = SeqPortNew (bsp, bsp->length - SPI_MAXPOLYASIZE, bsp->length-1, Seq_strand_minus, Seq_code_ncbi4na);
4414 /* port on the minus strand -- shows up earlier if it's not polyA */
4415 done = FALSE;
4416 i = 0;
4417 j = 0;
4418 start = 0;
4419 count = 0;
4420 SeqPortRead(spp, buf, SPI_MAXPOLYASIZE);
4421 while (((res = buf[j]) != SEQPORT_EOF) && !done)
4422 {
4423 if (res != 8)
4424 {
4425 if (count >= SPI_LINKERSIZE) /* can have non-A bases up to the SPI_LINKERSIZE position */
4426 done = TRUE;
4427 else
4428 start++;
4429 } else
4430 i++;
4431 count++;
4432 j++;
4433 }
4434 if (start > 0)
4435 {
4436 if (i-start < SPI_MINPOLYASIZE) /* "tail" is too short */
4437 {
4438 SeqPortFree(spp);
4439 return 0;
4440 }
4441 }
4442 SeqPortFree(spp);
4443 BioseqUnlock(bsp);
4444 return i;
4445 }
4446
4447 /***************************************************************************
4448 *
4449 * SPI_FillInIntron is able to fill in internal gaps for SPI_ConnectAln.
4450 * Given mRNA and genomic boundaries, SPI_FillInIntron first does a low-
4451 * stringency BLAST, then removes inconsistent and overlapping alignments
4452 * from the resulting alignment set, and finally calls
4453 * SPI_FillInLastmRNAHoles to internally connect the new alignments.
4454 *
4455 ***************************************************************************/
SPI_FillInIntron(SeqIdPtr sip1,SeqIdPtr sip2,Int4 start1,Int4 stop1,Int4 start2,Int4 stop2,Uint1 strand2,SPI_OptionsPtr spot)4456 static SeqAlignPtr SPI_FillInIntron(SeqIdPtr sip1, SeqIdPtr sip2, Int4 start1, Int4 stop1, Int4 start2, Int4 stop2, Uint1 strand2, SPI_OptionsPtr spot)
4457 {
4458 BLAST_OptionsBlkPtr options;
4459 SeqAlignPtr sap;
4460 SeqLocPtr slp1;
4461 SeqLocPtr slp2;
4462 Int4 start;
4463 Int4 stop;
4464
4465 if (stop1 - start1 < SPI_MINBLASTSIZE)
4466 return NULL;
4467 slp1 = SeqLocIntNew(start1, stop1, Seq_strand_plus, sip1);
4468 if (strand2 == Seq_strand_minus)
4469 {
4470 start = stop2;
4471 stop = start2;
4472 } else
4473 {
4474 start = start2;
4475 stop = stop2;
4476 }
4477 if (stop - start < SPI_MINBLASTSIZE)
4478 {
4479 SeqLocFree(slp1);
4480 return NULL;
4481 }
4482 slp2 = SeqLocIntNew(start, stop, strand2, sip2);
4483 options = BLASTOptionNew("blastn", FALSE);
4484 options->filter_string = StringSave("m L");
4485 options->expect_value = spot->thirdpasseval;
4486 options->query_lcase_mask = spot->lcaseloc;
4487 options->wordsize = 7;
4488 if (spot->interspecies)
4489 {
4490 options->gap_x_dropoff_final = 100;
4491 options->gap_open = 4;
4492 options->gap_extend = 1;
4493 options->penalty = -1;
4494 }
4495 sap = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
4496
4497
4498 SeqLocFree(slp1);
4499 SeqLocFree(slp2);
4500 AlnMgr2IndexLite(sap);
4501 if (sap != NULL)
4502 {
4503 SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
4504 AMAlignIndex2Free2(sap->saip);
4505 sap->saip = NULL;
4506 }
4507 AlnMgr2IndexLite(sap);
4508 SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON/2, 1, SPI_LEFT);
4509 BLASTOptionDelete(options);
4510 sap = SPI_FillInLastmRNAHoles(sap, sip1, sip2, start1, stop1, start, stop, strand2);
4511 return sap;
4512 }
4513
4514 /***************************************************************************
4515 *
4516 * spi_isa_gap is an often-called utility function that returns the size
4517 * of the difference between two sequence positions, given the strand. If
4518 * the sequence positions overlap or abut exactly, the function returns
4519 * 0. The strand is important:
4520 *
4521 * 13---------40 50--------60 plus strand--> gap of 8
4522 * | |
4523 * prevstop start
4524 * | |
4525 * 60---------50 40-------13 minus strand--> gap of 8
4526 *
4527 ***************************************************************************/
/* Returns the number of positions lying strictly between prevstop and
   start, measured in the direction of the strand, or 0 when the two
   positions overlap or abut.  A value of -1 for either position is treated
   as 0.  Both strands use the same formula: positions 40 and 50 give 9
   on either strand.  (The plus-strand branch previously evaluated
   start - prevstop + 1, which was two larger than the minus-strand result
   for the mirrored input.) */
static Int4 spi_isa_gap(Int4 start, Int4 prevstop, Uint1 strand)
{
   if (prevstop == -1)
      prevstop = 0;
   if (start == -1)
      start = 0;
   if (strand != Seq_strand_minus)
   {
      if (start > prevstop+1)
         return (start - (prevstop+1));
      return 0;
   }
   if (prevstop > start+1)
      return (prevstop - (start+1));
   return 0;
}
4548
4549 /***************************************************************************
4550 *
4551 * SPI_GetNthSeqLenInSASet assumes that the alignment given is a
4552 * set of alignments that all have the same rows. The function returns
4553 * the length of the Nth row that is covered by the alignment set.
4554 *
4555 ***************************************************************************/
/* Sums (stop - start + 1) of row n over every child alignment of sap and
   optionally reports the number of children visited through numsaps.
   Returns -1 if sap is not an indexed parent set, and 0 as soon as a child
   with fewer than n rows (dim) is met; numsaps is not written in either of
   those cases. */
static Int4 SPI_GetNthSeqLenInSASet(SeqAlignPtr sap, Int4 n, Int4Ptr numsaps)
{
   SeqAlignPtr  child;
   Int4         count;
   Int4         from;
   Int4         to;
   Int4         total;

   if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
      return -1;
   from = to = -1;
   total = 0;
   count = 0;
   for (child = (SeqAlignPtr)(sap->segs); child != NULL; child = child->next)
   {
      count++;
      if (n > child->dim)
         return 0;
      AlnMgr2GetNthSeqRangeInSA(child, n, &from, &to);
      total += (to - from + 1);
   }
   if (numsaps != NULL)
      *numsaps = count;
   return total;
}
4583
4584 /***************************************************************************
4585 *
4586 * SPI_GetNthSeqRangeInSASet is used to get the 5' and 3' boundaries
4587 * of a sequence across a set of alignments. 'N' refers to row number,
4588 * and all the alignments are assumed to have the same rows. Note that
4589 * this function says nothing about the coverage of the specified
4590 * sequence.
4591 *
4592 ***************************************************************************/
/* Reports the smallest start and largest stop of row n across all child
   alignments of sap.  Both outputs are optional.  They are set to -1 when
   a child has fewer than n rows, and are left untouched when sap is not an
   indexed parent set. */
static void SPI_GetNthSeqRangeInSASet(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop)
{
   SeqAlignPtr  child;
   Int4         from;
   Int4         highest;
   Int4         lowest;
   Int4         rows;
   Int4         to;

   if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
      return;
   lowest = highest = -1;
   for (child = (SeqAlignPtr)(sap->segs); child != NULL; child = child->next)
   {
      rows = AlnMgr2GetNumRows(child);
      if (n > rows)
      {
         if (start != NULL)
            *start = -1;
         if (stop != NULL)
            *stop = -1;
         return;
      }
      AlnMgr2GetNthSeqRangeInSA(child, n, &from, &to);
      /* -1 means that no start has been recorded yet */
      if (lowest == -1 || from < lowest)
         lowest = from;
      if (to > highest)
         highest = to;
   }
   if (start != NULL)
      *start = lowest;
   if (stop != NULL)
      *stop = highest;
}
4629
4630 /***************************************************************************
4631 *
4632 * SPI_FindPiece is used to align a piece of mRNA with a tail of genomic
4633 * sequence:
4634 * which_end start_g start_m-stop_m
4635 * SPI_LEFT 0<-------4000 genomic X 30-61 mRNA (plus strand)
4636 * SPI_RIGHT 7000----->3'end genomic X 79-40 mRNA (minus strand)
4637 *
4638 * SPI_FindPiece first does a low-stringency BLAST search to try to align
4639 * the desired piece, and then calls functions to fill out the alignment
4640 * so that the new alignment is well-connected with the other pieces and
4641 * internally complete.
4642 * Since the strands and orientation are constrained by the other
4643 * alignments in the set, the BLAST search is only done within the
4644 * specified boundaries and for the specified strand.
4645 *
4646 ***************************************************************************/
SPI_FindPiece(SeqIdPtr sip1,SeqIdPtr sip2,Int4 start_m,Int4 stop_m,Uint1 strand,Int4 start_g,Int2 which_end,SPI_OptionsPtr spot)4647 static SeqAlignPtr SPI_FindPiece(SeqIdPtr sip1, SeqIdPtr sip2, Int4 start_m, Int4 stop_m, Uint1 strand, Int4 start_g, Int2 which_end, SPI_OptionsPtr spot)
4648 {
4649 Int4 bigintron;
4650 BioseqPtr bsp1;
4651 Int4 gstart;
4652 Int4 gstop;
4653 Int4 mstart;
4654 Int4 mstop;
4655 BLAST_OptionsBlkPtr options;
4656 SeqAlignPtr sap;
4657 SeqAlignPtr sap_new;
4658 SeqAlignPtr sap_new2;
4659 SeqLocPtr slp1;
4660 SeqLocPtr slp2;
4661 Int4 start;
4662 Int4 stop;
4663
4664 if (sip1 == NULL || sip2 == NULL)
4665 return NULL;
4666 /*KSK*/
4667 if (spot->bigintron){
4668 bigintron = MAX(SPI_BIGINTRONXL, spot->bigintron_size);
4669 }
4670 else {
4671 bigintron = SPI_BIGINTRON;
4672 }
4673 if ((strand == Seq_strand_minus && start_m - stop_m < 7) || (strand != Seq_strand_minus && stop_m - start_m < 7))
4674 return NULL;
4675 if (start_m < 0)
4676 start_m = 0;
4677 if (start_m == -1)
4678 start_m = 0;
4679 if (stop_m == -1)
4680 stop_m = 0;
4681 if (start_g == -1)
4682 start_g = 0;
4683 if (which_end == SPI_LEFT)
4684 {
4685 if (start_g < SPI_FUZZ)
4686 return NULL;
4687 if (strand == Seq_strand_minus)
4688 {
4689 if (start_m - stop_m > start_g + 2*SPI_FUZZ)
4690 return NULL;
4691 start = MAX(0, start_g - bigintron);
4692 slp1 = SeqLocIntNew(MAX(start, spot->from), MIN(start_g, spot->to), Seq_strand_plus, sip1);
4693 slp2 = SeqLocIntNew(stop_m, start_m, strand, sip2);
4694 options = BLASTOptionNew("blastn", FALSE);
4695 options->wordsize = 7;
4696 options->filter_string = StringSave("m L");
4697 options->expect_value = spot->secpasseval;
4698 options->query_lcase_mask = spot->lcaseloc;
4699 if (spot->interspecies)
4700 {
4701 options->gap_x_dropoff_final = 100;
4702 options->gap_open = 4;
4703 options->gap_extend = 1;
4704 options->penalty = -1;
4705 }
4706 sap = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
4707 if (sap == NULL)
4708 return NULL;
4709 AlnMgr2IndexLite(sap);
4710 SeqLocFree(slp1);
4711 SeqLocFree(slp2);
4712 SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
4713 AMAlignIndex2Free2(sap->saip);
4714 sap->saip = NULL;
4715 AlnMgr2IndexLite(sap);
4716 SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON/2, 1, SPI_RIGHT);
4717 SPI_GetNthSeqRangeInSASet(sap, 1, &gstart, &gstop);
4718 SPI_GetNthSeqRangeInSASet(sap, 2, &mstart, &mstop);
4719 BLASTOptionDelete(options);
4720 if (spot->draftfile != NULL)
4721 return sap;
4722 if (mstart - stop_m <= 3*SPI_FUZZ)
4723 {
4724 sap_new = sap;
4725 if (sap_new->segtype == SAS_DISC)
4726 {
4727 sap_new2 = (SeqAlignPtr)(sap_new->segs);
4728 sap_new->segs = NULL;
4729 SeqAlignFree(sap_new);
4730 sap_new = sap_new2;
4731 while (sap_new2 != NULL)
4732 {
4733 AlnMgr2IndexSingleChildSeqAlign(sap_new2);
4734 sap_new2 = sap_new2->next;
4735 }
4736 } else
4737 SPI_AddToAln(sap_new, mstart - stop_m, SPI_RIGHT, strand);
4738 return sap_new;
4739 } else
4740 {
4741 sap_new = SPI_FillInLastmRNAHoles(sap, sip1, sip2, gstart, start_g-1, stop_m+1, mstop, strand);
4742 return sap_new;
4743 }
4744 } else
4745 {
4746 if (stop_m - start_m > start_g + 2*SPI_FUZZ)
4747 return NULL;
4748 start = MAX(0, start_g - bigintron);
4749 slp1 = SeqLocIntNew(MAX(start, spot->from), MIN(start_g, spot->to), Seq_strand_plus, sip1);
4750 slp2 = SeqLocIntNew(start_m, stop_m, strand, sip2);
4751 options = BLASTOptionNew("blastn", FALSE);
4752 options->wordsize = 7;
4753 options->filter_string = StringSave("m L");
4754 options->expect_value = spot->secpasseval;
4755 options->query_lcase_mask = spot->lcaseloc;
4756 if (spot->interspecies)
4757 {
4758 options->gap_x_dropoff_final = 100;
4759 options->gap_open = 4;
4760 options->gap_extend = 1;
4761 options->penalty = -1;
4762 }
4763 sap = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
4764 if (sap == NULL)
4765 return NULL;
4766 AlnMgr2IndexLite(sap);
4767 SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
4768 AMAlignIndex2Free2(sap->saip);
4769 SeqLocFree(slp1);
4770 SeqLocFree(slp2);
4771 sap->saip = NULL;
4772 AlnMgr2IndexLite(sap);
4773 SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON/2, 1, SPI_RIGHT);
4774 SPI_GetNthSeqRangeInSASet(sap, 1, &gstart, &gstop);
4775 SPI_GetNthSeqRangeInSASet(sap, 2, &mstart, &mstop);
4776 BLASTOptionDelete(options);
4777 if (spot->draftfile != NULL)
4778 return sap;
4779 if (stop_m - mstop <= 3*SPI_FUZZ)
4780 {
4781 sap_new = sap;
4782 if (sap_new->segtype == SAS_DISC)
4783 {
4784 sap_new2 = (SeqAlignPtr)(sap_new->segs);
4785 sap_new->segs = NULL;
4786 SeqAlignFree(sap_new);
4787 sap_new = sap_new2;
4788 while (sap_new2 != NULL)
4789 {
4790 AlnMgr2IndexSingleChildSeqAlign(sap_new2);
4791 sap_new2 = sap_new2->next;
4792 }
4793 } else
4794 SPI_AddToAln(sap_new, stop_m - mstop, SPI_RIGHT, strand);
4795 return sap_new;
4796 } else
4797 {
4798 sap_new = SPI_FillInLastmRNAHoles(sap, sip1, sip2, gstart, start_g-1, mstart, stop_m+1, strand);
4799 return sap_new;
4800 }
4801 }
4802 } else if (which_end == SPI_RIGHT)
4803 {
4804 bsp1 = BioseqLockById(sip1);
4805 if (bsp1 == NULL)
4806 return NULL;
4807 if (start_g > bsp1->length - SPI_FUZZ)
4808 return NULL;
4809 if (strand == Seq_strand_minus)
4810 {
4811 if (start_m - stop_m > bsp1->length - start_g - 2*SPI_FUZZ)
4812 return NULL;
4813 stop = MIN(bsp1->length-1, start_g + bigintron);
4814 slp1 = SeqLocIntNew(MAX(start_g, spot->from), MIN(stop, spot->to), Seq_strand_plus, sip1);
4815 slp2 = SeqLocIntNew(stop_m, start_m, strand, sip2);
4816 options = BLASTOptionNew("blastn", FALSE);
4817 options->wordsize = 7;
4818 options->filter_string = StringSave("m L");
4819 options->expect_value = spot->secpasseval;
4820 options->query_lcase_mask = spot->lcaseloc;
4821 if (spot->interspecies)
4822 {
4823 options->gap_x_dropoff_final = 100;
4824 options->gap_open = 4;
4825 options->gap_extend = 1;
4826 options->penalty = -1;
4827 }
4828 sap = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
4829 if (sap == NULL)
4830 return NULL;
4831 AlnMgr2IndexLite(sap);
4832 SeqLocFree(slp1);
4833 SeqLocFree(slp2);
4834 SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
4835 AMAlignIndex2Free2(sap->saip);
4836 sap->saip = NULL;
4837 AlnMgr2IndexLite(sap);
4838 SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON/2, 1, SPI_LEFT);
4839 SPI_GetNthSeqRangeInSASet(sap, 1, &gstart, &gstop);
4840 SPI_GetNthSeqRangeInSASet(sap, 2, &mstart, &mstop);
4841 BLASTOptionDelete(options);
4842 if (spot->draftfile != NULL)
4843 return sap;
4844 if (start_m - mstop <= 3*SPI_FUZZ)
4845 {
4846 sap_new = sap;
4847 if (sap_new->segtype == SAS_DISC)
4848 {
4849 sap_new2 = (SeqAlignPtr)(sap_new->segs);
4850 sap_new->segs = NULL;
4851 SeqAlignFree(sap_new);
4852 sap_new = sap_new2;
4853 while (sap_new2 != NULL)
4854 {
4855 AlnMgr2IndexSingleChildSeqAlign(sap_new2);
4856 sap_new2 = sap_new2->next;
4857 }
4858 } else
4859 SPI_AddToAln(sap_new, start_m - mstop, SPI_LEFT, strand);
4860 return sap_new;
4861 } else
4862 {
4863 sap_new = SPI_FillInLastmRNAHoles(sap, sip1, sip2, start_g+1, gstop, mstart, start_m-1, strand);
4864 return sap_new;
4865 }
4866 } else
4867 {
4868 if (stop_m - start_m > bsp1->length - start_g - 2*SPI_FUZZ)
4869 return NULL;
4870 stop = MIN(bsp1->length-1, start_g + bigintron);
4871 slp1 = SeqLocIntNew(MAX(start_g, spot->from), MIN(stop, spot->to), Seq_strand_plus, sip1);
4872 slp2 = SeqLocIntNew(start_m, stop_m, strand, sip2);
4873 options = BLASTOptionNew("blastn", FALSE);
4874 options->wordsize = 7;
4875 options->filter_string = StringSave("m L");
4876 options->expect_value = spot->secpasseval;
4877 options->query_lcase_mask = spot->lcaseloc;
4878 if (spot->interspecies)
4879 {
4880 options->gap_x_dropoff_final = 100;
4881 options->gap_open = 4;
4882 options->gap_extend = 1;
4883 options->penalty = -1;
4884 }
4885 sap = BlastTwoSequencesByLoc(slp2, slp1, "blastn", options);
4886 if (sap == NULL)
4887 return NULL;
4888 AlnMgr2IndexLite(sap);
4889 SPI_flip_sa_list((SeqAlignPtr)(sap->segs));
4890 AMAlignIndex2Free2(sap->saip);
4891 SeqLocFree(slp1);
4892 SeqLocFree(slp2);
4893 sap->saip = NULL;
4894 AlnMgr2IndexLite(sap);
4895 SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON/2, 1, SPI_LEFT);
4896 SPI_GetNthSeqRangeInSASet(sap, 1, &gstart, &gstop);
4897 SPI_GetNthSeqRangeInSASet(sap, 2, &mstart, &mstop);
4898 BLASTOptionDelete(options);
4899 if (spot->draftfile != NULL)
4900 return sap;
4901 if (mstart - start_m <= 3*SPI_FUZZ)
4902 {
4903 sap_new = sap;
4904 if (sap_new->segtype == SAS_DISC)
4905 {
4906 sap_new2 = (SeqAlignPtr)(sap_new->segs);
4907 sap_new->segs = NULL;
4908 SeqAlignFree(sap_new);
4909 sap_new = sap_new2;
4910 while (sap_new2 != NULL)
4911 {
4912 AlnMgr2IndexSingleChildSeqAlign(sap_new2);
4913 sap_new2 = sap_new2->next;
4914 }
4915 } else
4916 SPI_AddToAln(sap_new, mstart - start_m, SPI_LEFT, strand);
4917 return sap_new;
4918 } else
4919 {
4920 sap_new = SPI_FillInLastmRNAHoles(sap, sip1, sip2, start_g+1, gstop, start_m+1, mstop, strand);
4921 return sap_new;
4922 }
4923 }
4924 }
4925 return NULL;
4926 }
4927
4928
4929 /* added by KSK for SPI_AdjustForSplice() when mRNA regions overlap */
SPI_Choose2LooseMrnaOvLap(const SeqAlignPtr sap1,const SeqAlignPtr sap2,const SPI_mRNAPtr smp,const int ptr1offset)4930 static int SPI_Choose2LooseMrnaOvLap (const SeqAlignPtr sap1, const SeqAlignPtr sap2,
4931 const SPI_mRNAPtr smp, const int ptr1offset)
4932 {
4933 Int4 p1_sites = 0, p2_sites = 0;
4934 Int4 score1 = 0, score2 = 0;
4935 float margin = 0;
4936
4937 if (sap1 == NULL || sap2 == NULL || smp == NULL){
4938 return -1;
4939 }
4940
4941 score1 = AlnMgr2ComputeScoreForSeqAlign(sap1);
4942 score2 = AlnMgr2ComputeScoreForSeqAlign(sap2);
4943
4944 if (score1 >= score2){
4945 margin = (float)score1/5;
4946 if ((float)score1 >= (((float)(score2)) + margin)){
4947 return ptr1offset + 1;
4948 }
4949 }
4950 else if (score1 <= score2){
4951 margin = (float)score2/5;
4952 if ((float)score2 >= (((float)(score1)) + margin)){
4953 return ptr1offset;
4954 }
4955 }
4956
4957 p1_sites = smp->splicedon[ptr1offset] + smp->spliceacc[ptr1offset];
4958 p2_sites = smp->splicedon[ptr1offset + 1] + smp->spliceacc[ptr1offset + 1];
4959
4960 if (p1_sites > p2_sites){
4961 return ptr1offset + 1;
4962 }
4963 else if (p2_sites > p1_sites){
4964 return ptr1offset;
4965 }
4966 return (score1 >= score2 ? ptr1offset + 1 : ptr1offset);
4967 }
4968
4969
4970
4971
4972 /***************************************************************************
4973 *
4974 * SPI_AdjustForSplice adjusts the boundaries of all the alignments in
4975 * the set so that they abut each other and are at the optimal splice
4976 * sites. SPI_AdjustForSplice also fills in the mismatch, %id, #gaps
4977 * and other information for each exon. The function first allocates a new
4978 * SPI_mRNA structure, then makes sure that the set of alignments doesn't
4979 * miss tiny pieces on the 5' or 3' end of the mRNA. Next, the alignments
4980 * are sent in pairs to SPI_AdjustOverlaps, which adjusts the alignment
4981 * boundaries so that they are adjacent to each other and to good splice
4982 * sites. The function then checks to see whether any two alignments are
4983 * adjacent on both the genomic and mRNA sequences; if so, these alignments
4984 * are merged. Each alignment (now each alignment is exactly one exon) is
4985 * sent to SPI_GetExonInfo to get the %id, #gaps, etc.; the overall
4986 * % coverage is computed and the alignments are examined to see whether
4987 * one or both ends of the mRNA are missing, and then all the information
4988 * is returned to the calling function.
4989 *
4990 ***************************************************************************/
SPI_AdjustForSplice(SeqAlignPtr sap,SPI_OptionsPtr spot,SPI_RegionInfoPtr srip)4991 static SPI_mRNAPtr SPI_AdjustForSplice(SeqAlignPtr sap, SPI_OptionsPtr spot, SPI_RegionInfoPtr srip)
4992 {
4993 AMAlignIndex2Ptr amaip;
4994 Int4 b;
4995 BioseqPtr bsp;
4996 Int4 c;
4997 Int4 count, sap2delete = 0;
4998 Int4 gstart1;
4999 Int4 gstart2;
5000 Int4 gstop1;
5001 Int4 gstop2;
5002 Int4 i;
5003 Int4 intronsize;
5004 Int4 len;
5005 Int4 len1;
5006 Int4 len2;
5007 Int4 min = 0;
5008 Int4 mis;
5009 Int4 max = 0;
5010 Int4 mstart1;
5011 Int4 mstart2;
5012 Int4 mstop1;
5013 Int4 mstop2;
5014 Int4 n;
5015 SeqAlignPtr PNTR saparray;
5016 SeqIdPtr sip;
5017 SPI_mRNAPtr smp;
5018 SPI_mRNAPtr smp_new;
5019 Uint1 strand;
5020
5021 if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
5022 return NULL;
5023 if (spot->bigintron){
5024 intronsize = (spot->bigintron_size > SPI_INTRONSIZEXL
5025 ? spot->bigintron_size : SPI_INTRONSIZEXL);
5026 /*intronsize = SPI_INTRONSIZEXL;*/
5027 }
5028 else{
5029 intronsize = SPI_INTRONSIZE;
5030 }
5031 /*end KSK*/
5032 AlnMgr2SortAlnSetByNthRowPos(sap, 1);
5033 SPI_RemoveTeenyAlns(sap, SPI_TEENYEXON);
5034 if (sap->segs == NULL)
5035 {
5036 SeqAlignFree(sap);
5037 return NULL;
5038 }
5039 amaip = (AMAlignIndex2Ptr)(sap->saip);
5040 strand = AlnMgr2GetNthStrand(amaip->saps[0], 2);
5041 /* first allocate a new SPI_mRNA structure to hold all the information */
5042 smp = (SPI_mRNAPtr)MemNew(sizeof(SPI_mRNA));
5043 smp->numexons = amaip->numsaps;
5044 smp->exonid = (FloatHiPtr)MemNew((smp->numexons)*sizeof(FloatHi));
5045 smp->splicedon = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
5046 smp->spliceacc = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
5047 smp->exongaps = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
5048 smp->saps = (SeqAlignPtr PNTR)MemNew((smp->numexons)*sizeof(SeqAlignPtr));
5049 smp->mstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
5050 smp->mstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
5051 smp->gstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
5052 smp->gstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
5053 smp->strand = strand;
5054 /* make sure the alignment doesn't leave out little bits on the ends */
5055 sip = AlnMgr2GetNthSeqIdPtr(amaip->saps[0], 2);
5056 bsp = BioseqLockById(sip);
5057 len1 = bsp->length;
5058 len1 = len1 - srip->polyAtail;
5059 BioseqUnlock(bsp);
5060 SeqIdFree(sip);
5061 sip = AlnMgr2GetNthSeqIdPtr(amaip->saps[0], 1);
5062 bsp = BioseqLockById(sip);
5063 len2 = bsp->length;
5064 BioseqUnlock(bsp);
5065 SeqIdFree(sip);
5066 if (strand != Seq_strand_minus)
5067 {
5068 AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 1, &gstart1, &gstop1);
5069 AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 2, &mstart1, &mstop1);
5070 if (mstart1 > 0 && mstart1 <= SPI_ENDFUZZ && gstart1 >= mstart1)
5071 SPI_AddToAln(amaip->saps[0], mstart1, SPI_LEFT, strand);
5072 else if (mstart1 > 0 && mstart1 <= SPI_ENDFUZZ && gstart1 < mstart1)
5073 SPI_AddToAln(amaip->saps[0], gstart1, SPI_LEFT, strand);
5074 AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 2, &mstart2, &mstop2);
5075 AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 1, &gstart2, &gstop2);
5076 if (len1 - srip->polyAtail - mstop2-1 > 0 && len1 - mstop2-1 <= SPI_ENDFUZZ && len2-gstop2 >= len1-mstop2)
5077 SPI_AddToAln(amaip->saps[amaip->numsaps-1], len1-mstop2-1, SPI_RIGHT, strand);
5078 else if (len1-mstop2-1 - srip->polyAtail > 2 && len1 - mstop2-1 <= SPI_ENDFUZZ && len1-mstop2 > len2 - gstop2)
5079 SPI_AddToAln(amaip->saps[amaip->numsaps-1], len2-gstop2-1, SPI_RIGHT, strand);
5080 } else
5081 {
5082 AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 1, &gstart1, &gstop1);
5083 AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 2, &mstart2, &mstop2);
5084 if (len1 - mstop2-1-srip->polyAtail > 0 && len1 - mstop2-1 <= 2*SPI_TEENYEXON && gstart1 >= (len1 - mstop2 - 1))
5085 SPI_AddToAln(amaip->saps[0], len1-mstop2-1, SPI_LEFT, strand);
5086 else if (len1 - mstop2-1-srip->polyAtail > 0 && len1 - mstop2-1 <= 2*SPI_TEENYEXON && gstart1 < (len1 - mstop2 - 1))
5087 SPI_AddToAln(amaip->saps[0], gstart1, SPI_LEFT, strand);
5088 AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 2, &mstart1, &mstop1);
5089 AlnMgr2GetNthSeqRangeInSA(amaip->saps[amaip->numsaps-1], 1, &gstart2, &gstop2);
5090 if (mstart1 > 0 && mstart1 <= SPI_ENDFUZZ && len2 - gstop2 -1> mstart1)
5091 SPI_AddToAln(amaip->saps[amaip->numsaps-1], mstart1, SPI_RIGHT, strand);
5092 else if (mstart1 > 0 && mstart1 <= SPI_ENDFUZZ && len2 - gstop2 -1<= mstart1)
5093 SPI_AddToAln(amaip->saps[amaip->numsaps-1], len2-gstop2-1, SPI_RIGHT, strand);
5094 }
5095 /* send the alignments in pairs to be adjusted to good splice sites */
5096 for (i=0; i<amaip->numsaps - 1; i++)
5097 {
5098 SPI_AdjustOverlaps(amaip->saps[i], amaip->saps[i+1], i, smp, spot);
5099 }
5100 n = 0;
5101 for (i=0; i<amaip->numsaps-1; i++) /* merge adjacent alignments */
5102 {
5103
5104 amaip->saps[i]->next = NULL;
5105 amaip->saps[i+1]->next = NULL;
5106 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 1, &gstart1, &gstop1);
5107 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 1, &gstart2, &gstop2);
5108 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, &mstart1, &mstop1);
5109 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i+1], 2, &mstart2, &mstop2);
5110 /* if (gstart2 >= gstop1 - SPI_EXONMERGESIZE && gstart2 <= gstop1 + SPI_EXONMERGESIZE) */
5111 if (gstart2 >= gstop1 - SPI_EXONMERGESIZE && gstart2 <= gstop1 + SPI_EXONMERGESIZE){
5112 if ((mstart2 >= mstop1 - SPI_EXONMERGESIZE && mstart2 <= mstop1 + SPI_EXONMERGESIZE)
5113 || (mstart1 >= mstop2 - SPI_EXONMERGESIZE && mstart1 <= mstop2 + SPI_EXONMERGESIZE)){
5114 amaip->saps[i+1] = SPI_MergeAlignments(amaip->saps[i], amaip->saps[i+1]);
5115 SeqAlignFree(amaip->saps[i]);
5116 amaip->saps[i] = NULL;
5117 n++;
5118 }
5119 }
5120 /** KSK added this 'else if' block in case there are overlapping **
5121 ** mrna sequences shared by these seqaligns **/
5122
5123 else if ((strand == Seq_strand_minus && (mstop2 <= mstop1 && mstop2 >= mstart1))
5124 || (strand == Seq_strand_plus && (mstop1 <= mstop2 && mstop1 >= mstart2))){
5125 /* fixes in case the MRNA portion of different regions overlap */
5126 /* first, if one is subsumed */
5127 if (mstart1 >= mstart2 && mstop1 <= mstop2){
5128 SeqAlignFree(amaip->saps[i]);
5129 amaip->saps[i] = NULL;
5130 n++;
5131 }
5132 else if (mstart2 <= mstart1 && mstop2 >= mstop1){
5133 SeqAlignFree(amaip->saps[i+1]);
5134 amaip->saps[i+1] = amaip->saps[i];
5135 amaip->saps[i] = NULL;
5136 n++;
5137 }
5138 /* now if mRNA region overlaps - simply choose the highest score */
5139 /* unless the lower has splice donor & acceptor and the higher */
5140 /* does not */
5141 else {
5142 if ((sap2delete = SPI_Choose2LooseMrnaOvLap(amaip->saps[i], amaip->saps[i+1],
5143 smp, i)) != -1) {
5144 if (sap2delete == i){
5145 SeqAlignFree(amaip->saps[i]);
5146 amaip->saps[i] = NULL;
5147 n++;
5148 }
5149 else if (sap2delete == (i+1)){
5150 SeqAlignFree(amaip->saps[i+1]);
5151 amaip->saps[i+1] = amaip->saps[i];
5152 amaip->saps[i] = NULL;
5153 n++;
5154 }
5155 }
5156 }
5157 }
5158 }
5159
5160 if (n > 0) /* some alignments were merged; need a new smp */
5161 {
5162 saparray = (SeqAlignPtr PNTR)MemNew((amaip->numsaps-n)*sizeof(SeqAlignPtr));
5163 count = 0;
5164 smp_new = (SPI_mRNAPtr)MemNew(sizeof(SPI_mRNA));
5165 smp_new->numexons = amaip->numsaps-n;
5166 smp_new->exonid = (FloatHiPtr)MemNew((smp_new->numexons)*sizeof(FloatHi));
5167 smp_new->splicedon = (Uint1Ptr)MemNew((smp_new->numexons)*sizeof(Uint1));
5168 smp_new->spliceacc = (Uint1Ptr)MemNew((smp_new->numexons)*sizeof(Uint1));
5169 smp_new->exongaps = (Int4Ptr)MemNew((smp_new->numexons)*sizeof(Int4));
5170 smp_new->saps = (SeqAlignPtr PNTR)MemNew((smp_new->numexons)*sizeof(SeqAlignPtr));
5171 smp_new->mstarts = (Int4Ptr)MemNew((smp_new->numexons)*sizeof(Int4));
5172 smp_new->mstops = (Int4Ptr)MemNew((smp_new->numexons)*sizeof(Int4));
5173 smp_new->gstarts = (Int4Ptr)MemNew((smp_new->numexons)*sizeof(Int4));
5174 smp_new->gstops = (Int4Ptr)MemNew((smp_new->numexons)*sizeof(Int4));
5175 smp_new->strand = strand;
5176 n = 0;
5177 for (i=0; i<amaip->numsaps; i++)
5178 {
5179 if (amaip->saps[i] != NULL)
5180 {
5181 saparray[n] = amaip->saps[i];
5182 n++;
5183 if (strand == Seq_strand_minus)
5184 {
5185 if (i != 0)
5186 smp_new->splicedon[count] = smp->splicedon[i-1];
5187 smp_new->spliceacc[count] = smp->spliceacc[i];
5188 } else
5189 {
5190 smp_new->splicedon[count] = smp->splicedon[i];
5191 if (i != 0)
5192 smp_new->spliceacc[count] = smp->spliceacc[i-1];
5193 }
5194 smp_new->saps[count] = smp->saps[i];
5195 count++;
5196 }
5197 smp->saps[i] = NULL;
5198 }
5199 MemFree(amaip->saps);
5200 amaip->saps = saparray;
5201 amaip->numsaps = n;
5202 SPI_mRNAFree(smp);
5203 smp = smp_new;
5204 }
5205 len = 0;
5206 max = 0;
5207 mis = 0;
5208 /* now get the %id, #mismatches, #gaps, etc. for each exon */
5209 for (i=0; i<amaip->numsaps; i++)
5210 {
5211 smp->saps[i] = amaip->saps[i];
5212 len += SPI_GetExonInfo(smp, i, &b, &c, &mis, spot);
5213 if (i == 0)
5214 min = b;
5215 else
5216 {
5217 if (b < min)
5218 min = b;
5219 }
5220 if (c > max)
5221 max = c;
5222 }
5223 sip = AlnMgr2GetNthSeqIdPtr(smp->saps[0], 1);
5224 bsp = BioseqLockById(sip);
5225 len2 = bsp->length;
5226 BioseqUnlock(bsp);
5227 SeqIdFree(sip);
5228 /* decide whether either end is left out of the alignments */
5229 if (min != 0)
5230 {
5231 if (max < len1 - 1)
5232 smp->missingends = SPI_BOTH;
5233 else
5234 smp->missingends = SPI_LEFT;
5235 } else
5236 {
5237 if (max < len1 - 1)
5238 smp->missingends = SPI_RIGHT;
5239 else
5240 smp->missingends = SPI_NEITHER;
5241 }
5242 if (srip->strand == Seq_strand_minus)
5243 {
5244 if ((smp->missingends == SPI_RIGHT || smp->missingends == SPI_BOTH) && smp->gstarts[0] < intronsize)
5245 srip->fallsoff = TRUE;
5246 if ((smp->missingends == SPI_LEFT || smp->missingends == SPI_BOTH) && smp->gstops[smp->numexons-1] > len2 - intronsize)
5247 srip->fallsoff = TRUE;
5248 } else
5249 {
5250 if ((smp->missingends == SPI_LEFT || smp->missingends == SPI_BOTH) && smp->gstops[smp->numexons-1] > len2 - intronsize)
5251 srip->fallsoff = TRUE;
5252 if ((smp->missingends == SPI_RIGHT || smp->missingends == SPI_BOTH) && smp->gstarts[0] < intronsize)
5253 srip->fallsoff = TRUE;
5254 }
5255 smp->mRNAcoverage = (100*len)/len1;
5256 smp->mismatch = (FloatHi)(100*mis)/len;
5257 amaip->saps = smp->saps;
5258 for (i=0; i<amaip->numsaps-1; i++)
5259 {
5260 amaip->saps[i]->next = amaip->saps[i+1];
5261 amaip->saps[i+1]->next = NULL;
5262 }
5263 sap->segs = (Pointer)(amaip->saps[0]);
5264 smp->parent = sap;
5265 return smp;
5266 }
5267
5268 /***************************************************************************
5269 *
5270 * SPI_GetExonInfo creates a profile of the indicated exon, then
5271 * walks through the profile structure to find mismatches and gaps.
5272 * The gap positions are recorded (if the alignment is going to be
5273 * printed -- otherwise the number of gaps is recorded, but not the
5274 * positions) in the SPI_TinyInfo structures. If the alignment is going
5275 * to be printed, a SPI_ExonProf structure is created to hold all the
5276 * mismatch positions, and the exon profile is put into the smp. Regardless,
5277 * the number of gaps and the number of mismatches are recorded. The length
5278 * of the exon is returned.
5279 *
5280 ***************************************************************************/
SPI_GetExonInfo(SPI_mRNAPtr smp,Int4 n,Int4Ptr start,Int4Ptr stop,Int4Ptr mis,SPI_OptionsPtr spot)5281 static Int4 SPI_GetExonInfo(SPI_mRNAPtr smp, Int4 n, Int4Ptr start, Int4Ptr stop, Int4Ptr mis, SPI_OptionsPtr spot)
5282 {
5283 ACTProfilePtr app;
5284 ACTProfilePtr app_head;
5285 Int4 c;
5286 Int4 counter;
5287 SPI_ExonProfPtr epp;
5288 SPI_ExonProfPtr epp_tmp;
5289 Boolean found;
5290 Int4 i;
5291 Int4 j;
5292 Int4 mismatch;
5293 SPI_TinyInfoPtr spit;
5294 SPI_TinyInfoPtr spit_head;
5295 SPI_TinyInfoPtr spit_prev;
5296
5297 AlnMgr2GetNthSeqRangeInSA(smp->saps[n], 2, start, stop);
5298 smp->mstarts[n] = *start;
5299 smp->mstops[n] = *stop;
5300 AlnMgr2GetNthSeqRangeInSA(smp->saps[n], 1, &smp->gstarts[n], &smp->gstops[n]);
5301 app_head = SPI_MakeProfileFromSA(smp->saps[n]);
5302 smp->exongaps[n] = 0;
5303 mismatch = 0;
5304 app = app_head;
5305 spit_head = spit_prev = NULL;
5306 counter = 0;
5307 while (app != NULL)
5308 {
5309 for (i=0; i<app->len; i++)
5310 {
5311 found = FALSE;
5312 c = 0;
5313 for (j=0; j<ACT_NUCLEN; j++)
5314 {
5315 if (app->freq[j][i] == 1 && !found)
5316 {
5317 /*if (app->freq[4][i] == 0) not an N
5318 {*/
5319 mismatch += 1;
5320 found = TRUE;
5321 if (spot->printaln != 1) /* if it's going to be printed, save up the mismatches */
5322 {
5323 spit = (SPI_TinyInfoPtr)MemNew(sizeof(SPI_TinyInfo));
5324 spit->n = counter;
5325 if (spit_head != NULL)
5326 {
5327 spit_prev->next = spit;
5328 spit_prev = spit;
5329 } else
5330 spit_head = spit_prev = spit;
5331 }
5332 /* }*/
5333 }
5334 c += app->freq[j][i];
5335 }
5336 if (c != 2)
5337 smp->exongaps[n]++;
5338 counter++;
5339 }
5340 app = app->next;
5341 }
5342 SPI_ProfileSetFree(app_head);
5343
5344
5345 smp->exonid[n] = ( *stop - *start +1 > 0
5346 ? (FloatHi)(100) - ((FloatHi)(100*mismatch))/(FloatHi)(*stop - *start + 1) : 0 );
5347 if (mismatch > 0 && smp->exonid[n] > 99.9)
5348 smp->exonid[n] = 99.9;
5349 *mis += mismatch;
5350 /* if there are mismatches, and the alignment is going to be printed, then */
5351 /* create an ExonProf structure to hold the mismatch positions. */
5352 if (spot->printaln != 1 && spit_head != NULL)
5353 {
5354 spit = spit_head;
5355 i = 0;
5356 while (spit != NULL)
5357 {
5358 i++;
5359 spit = spit->next;
5360 }
5361 epp = (SPI_ExonProfPtr)MemNew(sizeof(SPI_ExonProf));
5362 epp->exonnum = n+1;
5363 epp->nummismatches = i;
5364 epp->mismatches = (Int4Ptr)MemNew(i*sizeof(Int4));
5365 i = 0;
5366 spit = spit_head;
5367 while (spit != NULL)
5368 {
5369 epp->mismatches[i] = spit->n;
5370 spit_prev = spit->next;
5371 MemFree(spit);
5372 spit = spit_prev;
5373 i++;
5374 }
5375 if (smp->epp != NULL)
5376 {
5377 epp_tmp = smp->epp;
5378 while (epp_tmp->next != NULL)
5379 {
5380 epp_tmp = epp_tmp->next;
5381 }
5382 epp_tmp->next = epp;
5383 } else
5384 smp->epp = epp;
5385 }
5386 return (*stop - *start + 1);
5387 }
5388
5389 /***************************************************************************
5390 *
5391 * SPI_AdjustOverlaps takes two adjacent alignments and adjusts their
5392 * boundaries so that they abut each other and so that they are adjacent
5393 * to good splice donor and acceptor sites. The function fills in the
5394 * appropriate splicedon and spliceacc fields in the smp structure to
5395 * indicate whether each alignment has a splice donor or acceptor site.
5396 * Since the splice matrices are organism-specific, this function
5397 * needs the spot->organism information.
5398 * SPI_AdjustOverlaps first gets the donor and acceptor splice information
5399 * (length of consensus sequence and the boundary, which is the offset into
5400 * the consensus sequence for the intron-exon boundary). Then a window
5401 * around the right end of sap1 is examined for donor sites. The top
5402 * SPI_NUMSITES donor sites are examined more carefully: each is evaluated
5403 * as to whether it has a good acceptor site in sap2, how much it affects
5404 * the boundaries of sap1 and sap2, and how good its donor site score is.
5405 * The donor site with the best acceptor site, that affects the alignments
5406 * the least and has the best donor score, is the site that is chosen. If
5407 * no good site is found, sap1 is unchanged and sap2 is truncated or
5408 * extended as needed. If a good site is found, sap1 and sap2 are both
5409 * truncated or extended as needed.
5410 *
5411 * For the plus strand models (mRNA and genomic sequence in same orientation):
5412 *
5413 * start1--------------------stop1 start2----------------stop2
5414 * | |
5415 * <- look from here ->
5416 * to here for a donor splice site -- the range is
5417 * stop1 - ovl - fluff - boundary to stop1 + spllen + fluff.
5418 * When a sequence matches the consensus splice sequence, its offset
5419 * into the range is recorded. The new stop position is then
5420 * stop1 - ovl - fluff - boundary + offset + (spllen - boundary).
5421 *
5422 *
5423 * For the minus strand models:
5424 *
5425 * start1--------------------stop1 start2-----------------stop2
5426 * mstop1--------------------mstart1 mstop2---------------- mstart2
5427 * | |
5428 * <- search this ->
5429 * interval on the minus strand of the
5430 * genomic sequence for a donor splice site -- the range is
5431 * start2 - fluff - spllen to start2 + ovl + fluff + boundary. When
5432 * the offset of a match is computed, the new mRNA stop position is
5433 * stop2 - ovl - boundary - fluff + offset + (spllen - boundary).
5434 *
5435 *
5436 * xxxGTxxxxx <- vertebrate splice donor consensus, GT are the first two
5437 * residues of the intron. Here spllen is 10 and boundary is 8 -- when
5438 * counting from the 3' end, the 8th residue is the first residue of the
5439 * exon.
5440 *
5441 ***************************************************************************/
SPI_AdjustOverlaps(SeqAlignPtr sap1,SeqAlignPtr sap2,Int4 n,SPI_mRNAPtr smp,SPI_OptionsPtr spot)5442 static void SPI_AdjustOverlaps(SeqAlignPtr sap1, SeqAlignPtr sap2, Int4 n, SPI_mRNAPtr smp, SPI_OptionsPtr spot)
5443 {
5444 Boolean both;
5445 Int4 boundary;
5446 Int4 boundary_a;
5447 BioseqPtr bsp;
5448 Uint1Ptr buf;
5449 /** Uint1 buf2[200]; **/
5450 Uint1Ptr buf2;
5451 Uint1Ptr buf3;
5452 Int4 c;
5453 Int4 f;
5454 Int4 fluff;
5455 Int4 gstart;
5456 Int4 i;
5457 FloatHi maxsc = 0;
5458 Int4 offset;
5459 Int4 ovl;
5460 Int4 pos;
5461 Uint1 res;
5462 FloatHi score;
5463 SeqIdPtr sip;
5464 SPI_Splice splarray[SPI_NUMSITES];
5465 Int4 spllen;
5466 Int4 spllen_a;
5467 SeqPortPtr spp;
5468 Int4 spp_start;
5469 Int4 spp_end;
5470 Int4 start;
5471 Int4 start1;
5472 Int4 start2;
5473 Int4 stop1;
5474 Int4 stop2;
5475 Uint1 strand;
5476 Int4 tmp;
5477
5478 strand = AlnMgr2GetNthStrand(sap1, 2);
5479 AlnMgr2GetNthSeqRangeInSA(sap1, 1, &start1, &stop1);
5480 AlnMgr2GetNthSeqRangeInSA(sap2, 1, &start2, &stop2);
5481 sip = AlnMgr2GetNthSeqIdPtr(sap1, 1);
5482 bsp = BioseqLockById(sip);
5483 if (strand == Seq_strand_minus)
5484 {
5485 gstart = stop1;
5486 start = start2;
5487 } else
5488 {
5489 gstart = start2;
5490 start = stop1;
5491 }
5492 /* retrieve the organism-specific donor and acceptor information */
5493 SPI_GetDonorSpliceInfo(spot->organism, &spllen, &boundary, spot);
5494 SPI_GetAcceptorSpliceInfo(spot->organism, &spllen_a, &boundary_a, spot);
5495 /* get the overlap between the alignments */
5496 ovl = spi_get_overlap(sap1, sap2);
5497 if (-ovl > 2*SPI_TEENYEXON)
5498 return;
5499 if (ovl < 0 && -ovl <= 2*SPI_TEENYEXON) /* extend alignments until they do overlap, to get the splice site right */
5500 {
5501 SPI_ExtendAlnAlgDumb(sap2, -ovl+2, SPI_LEFT, strand);
5502 SPI_ExtendAlnAlgDumb(sap1, -ovl+2, SPI_RIGHT, strand);
5503 AlnMgr2GetNthSeqRangeInSA(sap1, 1, &start1, &stop1);
5504 AlnMgr2GetNthSeqRangeInSA(sap2, 1, &start2, &stop2);
5505 if (strand == Seq_strand_minus)
5506 {
5507 gstart = stop1;
5508 start = start2;
5509 } else
5510 {
5511 gstart = start2;
5512 start = stop1;
5513 }
5514 ovl = -ovl;
5515 }
5516 ovl = MIN(abs(ovl), abs(start2-stop1));
5517 if (spot->interspecies == TRUE)
5518 fluff = SPI_FLUFF;
5519 else
5520 fluff = 0;
5521 if (ovl != 0)
5522 {
5523 /* open a seqport in a window around the end of donor sap, and look for donor sites */
5524 if (strand != Seq_strand_minus)
5525 {
5526 spp_start = start - ovl - fluff - (spllen - boundary);
5527 spp_end = start + spllen + ovl + fluff;
5528 if (start-ovl-fluff-spllen+boundary < 0)
5529 spp_start = 0;
5530 if (start+spllen+fluff+ovl > bsp->length-1)
5531 spp_end = bsp->length-1;
5532 spp = SeqPortNew(bsp, spp_start, spp_end, strand, Seq_code_ncbi4na);
5533 } else
5534 {
5535 spp_start = start - fluff - spllen;
5536 spp_end = start + ovl + fluff + spllen - boundary;
5537 if (spp_start < 0)
5538 spp_start = 0;
5539 if (spp_end > bsp->length-1)
5540 spp_end = bsp->length - 1;
5541 spp = SeqPortNew(bsp, spp_start, spp_end, strand, Seq_code_ncbi4na);
5542 }
5543 i = 0;
5544 buf = (Uint1Ptr)MemNew((2*fluff+ovl+spllen+2)*sizeof(Uint1));
5545 buf2 = (Uint1Ptr)MemNew((2*fluff+ovl+spllen+2)*sizeof(Uint1));
5546 SeqPortRead(spp, buf2, 2*fluff+ovl+spllen+2);
5547 for (f=0; f<SPI_NUMSITES; f++)
5548 {
5549 splarray[f].i = 0;
5550 splarray[f].score = -2;
5551 }
5552 while (((res = buf2[i]) != SEQPORT_EOF) && i<(2*fluff+ovl+spllen+1))
5553 {
5554 if (res == 1)
5555 buf[i] = 0;
5556 else if (res == 2)
5557 buf[i] = 1;
5558 else if (res == 4)
5559 buf[i] = 2;
5560 else if (res == 8)
5561 buf[i] = 3;
5562 else
5563 buf[i] = 4;
5564 i++;
5565 }
5566 SeqPortFree(spp);
5567 MemFree(buf2);
5568 for (i=0; i<2*fluff+ovl+1; i++)
5569 {
5570 if (spot->dsplicejunc > 0)
5571 SPI_is_donor_user(buf+i, spllen, &score, spot);
5572 else
5573 SPI_is_donor(buf+i, spllen, &score, spot->organism);
5574 c = 0;
5575 if (score > 0.000001)
5576 {
5577 for (f=0; f<SPI_NUMSITES; f++)
5578 {
5579 if (f == 0)
5580 maxsc = splarray[f].score;
5581 else if (splarray[f].score < maxsc)
5582 {
5583 maxsc = splarray[f].score;
5584 c = f;
5585 }
5586 }
5587 if (score > splarray[c].score)
5588 {
5589 splarray[c].score = score;
5590 splarray[c].i = i;
5591 }
5592 }
5593 }
5594 AlnMgr2GetNthSeqRangeInSA(sap1, 2, &start1, &stop1);
5595 AlnMgr2GetNthSeqRangeInSA(sap2, 2, &start2, &stop2);
5596 maxsc = 0;
5597 /* for the SPI_NUMSITES best donor sites, get the corresponding acceptor */
5598 /* site score and record how much each alignment would be changed if */
5599 /* the alignments were truncated/extended to this sites */
5600 /* pos is the coordinate of the last residue of the donor exon */
5601 for (f=0; f<SPI_NUMSITES; f++)
5602 {
5603 if (strand == Seq_strand_minus)
5604 {
5605 pos = stop2 - ovl - fluff + splarray[f].i;
5606 if (stop2 - pos < 0)
5607 splarray[f].diff = pos - stop2;
5608 else
5609 splarray[f].diff = stop2 - pos;
5610 if (start1 - pos < 0)
5611 {
5612 if (pos - start1 < splarray[f].diff)
5613 splarray[f].diff = pos - start1;
5614 } else
5615 {
5616 if (start1 - pos < splarray[f].diff)
5617 splarray[f].diff = start1 - pos;
5618 }
5619 tmp = gstart + start1 - (pos + 1);
5620 SPI_GetAcceptorScore(bsp, tmp - (spllen_a - boundary_a), tmp + boundary_a, strand, &splarray[f].score2, spllen_a, spot);
5621 } else
5622 {
5623 pos = stop1 - ovl - fluff + splarray[f].i;
5624 if (stop1 - pos < 0)
5625 splarray[f].diff = pos - stop1;
5626 else
5627 splarray[f].diff = stop1 - pos;
5628 if (start2 - pos < 0)
5629 {
5630 if (pos - start2 < splarray[f].diff)
5631 splarray[f].diff = pos - start2;
5632 } else
5633 {
5634 if (start2 - pos < splarray[f].diff)
5635 splarray[f].diff = start2 - pos;
5636 }
5637 tmp = gstart + start2 - (pos + 1);
5638 SPI_GetAcceptorScore(bsp, tmp - boundary_a + (spllen_a - boundary_a), tmp, strand, &splarray[f].score2, spllen_a, spot);
5639 }
5640 if (splarray[f].diff > maxsc)
5641 maxsc = splarray[f].diff;
5642 }
5643 i = 0;
5644 both = FALSE;
5645 for (f=0; f<SPI_NUMSITES && !both; f++)
5646 {
5647 if (splarray[f].score > 0.000001 && splarray[f].score2 > 0.00000002)
5648 both = TRUE;
5649 }
5650 /* look for the position that has a good acceptor (if one of them does have both */
5651 /* a good donor and acceptor) and that changes the alignments the least */
5652 /* with the highest donor score */
5653 offset = 0;
5654 for (f=0; f<SPI_NUMSITES; f++)
5655 {
5656 if ((both && splarray[f].score2 > 0.0000000002) || both == FALSE)
5657 {
5658 if(splarray[f].score >= splarray[i].score)
5659 {
5660 maxsc = splarray[f].diff;
5661 offset = splarray[f].i;
5662 i = f;
5663 }
5664 }
5665 }
5666 if (strand == Seq_strand_minus)
5667 pos = stop2 - ovl - fluff + offset;
5668 else
5669 pos = stop1 - ovl - fluff + offset;
5670 MemFree(buf);
5671 if (splarray[i].score >= 0.00001 && pos > 0)
5672 {
5673 if (strand == Seq_strand_minus)
5674 smp->splicedon[n+1] = 1;
5675 else
5676 smp->splicedon[n] = 1;
5677 } else /* if don't find a good site, don't change the alignment */
5678 offset = ovl + fluff;
5679 if (strand == Seq_strand_minus)
5680 pos = stop2 - ovl - fluff + offset;
5681 else
5682 pos = stop1 - ovl - fluff + offset;
5683 if (splarray[i].score2 > 0.0000000002)
5684 {
5685 if (strand == Seq_strand_minus)
5686 smp->spliceacc[n] = 1;
5687 else
5688 smp->spliceacc[n+1] = 1;
5689 }
5690 SeqIdFree(sip);
5691 BioseqUnlock(bsp);
5692 } else
5693 {
5694 AlnMgr2GetNthSeqRangeInSA(sap1, 2, &start1, &stop1);
5695 AlnMgr2GetNthSeqRangeInSA(sap2, 2, &start2, &stop2);
5696 if (strand == Seq_strand_minus)
5697 pos = stop2;
5698 else
5699 pos = stop1;
5700 if (strand != Seq_strand_minus)
5701 {
5702 spp_start = start - fluff - (spllen - boundary);
5703 spp_end = start + spllen + fluff;
5704 if (start-ovl-fluff-spllen+boundary < 0)
5705 spp_start = 0;
5706 if (start+spllen+fluff > bsp->length-1)
5707 spp_end = bsp->length-1;
5708 spp = SeqPortNew(bsp, spp_start, spp_end, strand, Seq_code_ncbi4na);
5709 } else
5710 {
5711 spp_start = start - fluff - spllen;
5712 spp_end = start + fluff + spllen - boundary;
5713 if (spp_start < 0)
5714 spp_start = 0;
5715 if (spp_end > bsp->length-1)
5716 spp_end = bsp->length - 1;
5717 spp = SeqPortNew(bsp, spp_start, spp_end, strand, Seq_code_ncbi4na);
5718 }
5719 i = 0;
5720 buf = (Uint1Ptr)MemNew((spp_end-spp_start+1)*sizeof(Uint1));
5721 buf3 = (Uint1Ptr)MemNew((spp_end-spp_start+1)*sizeof(Uint1));
5722 SeqPortRead(spp, buf3, spp_end-spp_start+1);
5723 while (i<(spp_end-spp_start+1) && ((res = buf3[i]) != SEQPORT_EOF))
5724 {
5725 if (res == 1)
5726 buf[i] = 0;
5727 else if (res == 2)
5728 buf[i] = 1;
5729 else if (res == 4)
5730 buf[i] = 2;
5731 else if (res == 8)
5732 buf[i] = 3;
5733 else
5734 buf[i] = 4;
5735 i++;
5736 }
5737 SeqPortFree(spp);
5738 MemFree(buf3);
5739 if (spot->dsplicejunc > 0)
5740 SPI_is_donor_user(buf, spllen, &score, spot);
5741 else
5742 SPI_is_donor(buf, spllen, &score, spot->organism);
5743 if (score >= 0.00001)
5744 {
5745 if (strand == Seq_strand_minus)
5746 smp->splicedon[n+1] = 1;
5747 else
5748 smp->splicedon[n] = 1;
5749 }
5750 if (strand == Seq_strand_minus)
5751 {
5752 tmp = gstart + start1 - (pos + 1);
5753 SPI_GetAcceptorScore(bsp, tmp - (spllen_a - boundary_a), tmp + boundary_a, strand, &score, spllen_a, spot);
5754 } else
5755 {
5756 tmp = gstart + start2 - (pos + 1);
5757 SPI_GetAcceptorScore(bsp, tmp - boundary_a + (spllen_a - boundary_a), tmp, strand, &score, spllen_a, spot);
5758 }
5759 if (score > 0.0000000002)
5760 {
5761 if (strand == Seq_strand_minus)
5762 smp->spliceacc[n] = 1;
5763 else
5764 smp->spliceacc[n+1] = 1;
5765 }
5766 MemFree(buf);
5767 }
5768 /* extend or truncate sap1 and sap2 to abut each other exactly and to */
5769 /* be adjacent to the chosen splice site */
5770 if (strand == Seq_strand_minus)
5771 {
5772 if (pos < stop2)
5773 {
5774 if (AlnMgr2TruncateSeqAlign(sap2, start2, pos, 2))
5775 {
5776 sap2->next->next = NULL;
5777 SeqAlignFree(sap2->next);
5778 sap2->next = NULL;
5779 }
5780 } else if (pos > stop2)
5781 SPI_AddToAln(sap2, MIN(pos-stop2, abs(gstart-start)), SPI_LEFT, strand);
5782 if (start1 == pos + 1)
5783 return;
5784 else if (start1 < pos + 1)
5785 {
5786 if (AlnMgr2TruncateSeqAlign(sap1, pos+1, stop1, 2))
5787 {
5788 sap1->next->next = NULL;
5789 SeqAlignFree(sap1->next);
5790 sap1->next = NULL;
5791 }
5792 return;
5793 } else if (start1 > pos + 1)
5794 {
5795 SPI_AddToAln(sap1, MIN(start1-pos-1, abs(gstart-start)), SPI_RIGHT, strand);
5796 return;
5797 }
5798 } else
5799 {
5800 if (pos < stop1)
5801 {
5802 if (AlnMgr2TruncateSeqAlign(sap1, start1, pos, 2))
5803 {
5804 sap1->next->next = NULL;
5805 SeqAlignFree(sap1->next);
5806 sap1->next = NULL;
5807 }
5808 } else if (pos > stop1)
5809 SPI_AddToAln(sap1, MIN(pos - stop1, abs(gstart-start)), SPI_RIGHT, strand);
5810 if (start2 == pos + 1)
5811 return;
5812 else if (start2 < pos + 1)
5813 {
5814 if (AlnMgr2TruncateSeqAlign(sap2, pos+1, stop2, 2))
5815 {
5816 sap2->next->next = NULL;
5817 SeqAlignFree(sap2->next);
5818 sap2->next = NULL;
5819 }
5820 return;
5821 } else if (start2 > pos + 1)
5822 {
5823 SPI_AddToAln(sap2, MIN(start2-pos-1, abs(gstart-start)), SPI_LEFT, strand);
5824 return;
5825 }
5826 }
5827 }
5828
5829 /***************************************************************************
5830 *
5831 * SPI_RemoveTeenyAlns removes all alignments in a set that are less
5832 * than len in length.
5833 *
5834 ***************************************************************************/
SPI_RemoveTeenyAlns(SeqAlignPtr sap,Int4 len)5835 static void SPI_RemoveTeenyAlns(SeqAlignPtr sap, Int4 len)
5836 {
5837 Int4 alnlen;
5838 AMAlignIndex2Ptr amaip;
5839 Int4 i;
5840 SeqAlignPtr sap_head;
5841 SeqAlignPtr sap_prev;
5842
5843 if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
5844 return;
5845 sap_head = sap_prev = NULL;
5846 amaip = (AMAlignIndex2Ptr)(sap->saip);
5847 for (i=0; i<amaip->numsaps; i++)
5848 {
5849 amaip->saps[i]->next = NULL;
5850 alnlen = AlnMgr2GetAlnLength(amaip->saps[i], FALSE);
5851 if (alnlen >= len)
5852 {
5853 if (sap_head != NULL)
5854 {
5855 sap_prev->next = amaip->saps[i];
5856 sap_prev = amaip->saps[i];
5857 } else
5858 sap_head = sap_prev = amaip->saps[i];
5859 } else
5860 SeqAlignFree(amaip->saps[i]);
5861 }
5862 sap->segs = (Pointer)(sap_head);
5863 AMAlignIndex2Free2(amaip);
5864 sap->saip = NULL;
5865 if (sap->segs != NULL)
5866 AlnMgr2IndexLite(sap);
5867 }
5868
/***************************************************************************
*
*  SPI_ExtendAlnAlgDumb replaces the dense-seg of sap with a copy that
*  has room for one more segment and, when which_side is SPI_LEFT or
*  SPI_RIGHT, fills that extra segment with an ungapped piece of length
*  ovl placed before the first or after the last existing segment.
*  Every segment of the new dense-seg is written with row 1 on the plus
*  strand and row 2 on the given strand.  The ids are moved (not copied)
*  from the old dense-seg, the old dense-seg is freed, and the alignment
*  is reindexed.  No bounds checking is done on the new coordinates.
*
***************************************************************************/
static void SPI_ExtendAlnAlgDumb(SeqAlignPtr sap, Int4 ovl, Int4 which_side, Uint1 strand)
{
   Int4         from1;
   Int4         from2;
   DenseSegPtr  grown;
   DenseSegPtr  old;
   Int4         seg;
   Int4         shift;
   Int4         tail;
   Int4         to1;
   Int4         to2;

   old = (DenseSegPtr)(sap->segs);
   grown = DenseSegNew();
   grown->dim = 2;
   grown->numseg = old->numseg+1;
   grown->starts = (Int4Ptr)MemNew(2*grown->numseg*sizeof(Int4));
   grown->strands = (Uint1Ptr)MemNew(2*grown->numseg*sizeof(Uint1));
   grown->lens = (Int4Ptr)MemNew(grown->numseg*sizeof(Int4));
   grown->ids = old->ids;
   old->ids = NULL;
   AlnMgr2GetNthSeqRangeInSA(sap, 1, &from1, &to1);
   AlnMgr2GetNthSeqRangeInSA(sap, 2, &from2, &to2);
   shift = 0; /* how many slots the old segments move to the right */
   if (which_side == SPI_LEFT)
   {
      grown->starts[0] = from1-ovl;
      grown->starts[1] = (strand == Seq_strand_minus) ? to2+1 : from2-ovl;
      grown->strands[0] = Seq_strand_plus;
      grown->strands[1] = strand;
      grown->lens[0] = ovl;
      shift = 1;
   }
   for (seg = 0; seg < old->numseg; seg++)
   {
      grown->starts[2*(seg+shift)] = old->starts[2*seg];
      grown->starts[2*(seg+shift)+1] = old->starts[2*seg+1];
      grown->strands[2*(seg+shift)] = Seq_strand_plus;
      grown->strands[2*(seg+shift)+1] = strand;
      grown->lens[seg+shift] = old->lens[seg];
   }
   if (which_side == SPI_RIGHT)
   {
      tail = grown->numseg-1;
      grown->starts[2*tail] = to1+1;
      grown->starts[2*tail+1] = (strand == Seq_strand_minus) ? from2-ovl : to2+1;
      grown->strands[2*tail] = Seq_strand_plus;
      grown->strands[2*tail+1] = strand;
      grown->lens[tail] = ovl;
   }
   DenseSegFree(old);
   sap->segs = (Pointer)grown;
   /* the old index describes the old dense-seg; rebuild it */
   SAIndex2Free2(sap->saip);
   sap->saip = NULL;
   AlnMgr2IndexSingleChildSeqAlign(sap);
}
5929
5930
5931 /***************************************************************************
5932 *
5933 * SPI_GetAcceptorScore evaluates a given position in a given bioseq
5934 * for an acceptor splice site.
5935 *
5936 ***************************************************************************/
SPI_GetAcceptorScore(BioseqPtr bsp,Int4 pos1,Int4 pos2,Uint1 strand,FloatHiPtr score,Int4 spllen,SPI_OptionsPtr spot)5937 static void SPI_GetAcceptorScore(BioseqPtr bsp, Int4 pos1, Int4 pos2, Uint1 strand, FloatHiPtr score, Int4 spllen, SPI_OptionsPtr spot)
5938 {
5939 Uint1Ptr buf;
5940 Uint1 buf2[100];
5941 Int4 i;
5942 Uint1 res;
5943 SeqPortPtr spp;
5944
5945 if (pos1 < 0)
5946 pos1 = 0;
5947 if (pos2 > bsp->length-1)
5948 pos2 = bsp->length-1;
5949 spp = SeqPortNew (bsp, pos1, pos2, strand, Seq_code_ncbi4na);
5950 i = 0;
5951 buf = (Uint1Ptr)MemNew((spllen+2)*sizeof(Uint1));
5952 SeqPortRead(spp, buf2, spllen+2);
5953 while (i<(pos2-pos1+1) && ((res = buf2[i]) != SEQPORT_EOF) && i<spllen+1)
5954 {
5955 if (res == 1)
5956 buf[i] = 0;
5957 else if (res == 2)
5958 buf[i] = 1;
5959 else if (res == 4)
5960 buf[i] = 2;
5961 else if (res == 8)
5962 buf[i] = 3;
5963 else
5964 buf[i] = 4;
5965 i++;
5966 }
5967 SeqPortFree(spp);
5968 if (spot->asplicejunc > 0)
5969 SPI_is_acceptor_user(buf, spllen, score, spot);
5970 else
5971 SPI_is_acceptor (buf, spllen, score, spot->organism);
5972 MemFree(buf);
5973 }
5974
5975 /***************************************************************************
5976 *
5977 * spi_get_overlap returns the amount of overlap (on the second, or mRNA
5978 * sequence) between two given alignments. A negative value means no
5979 * overlap.
5980 *
5981 ***************************************************************************/
spi_get_overlap(SeqAlignPtr sap1,SeqAlignPtr sap2)5982 static Int4 spi_get_overlap (SeqAlignPtr sap1, SeqAlignPtr sap2)
5983 {
5984 Int4 overlap;
5985 Int4 start1;
5986 Int4 stop1;
5987 Int4 start2;
5988 Int4 stop2;
5989 Uint1 strand;
5990
5991 strand = AlnMgr2GetNthStrand (sap1, 2);
5992 AlnMgr2GetNthSeqRangeInSA (sap1, 2, &start1, &stop1);
5993 AlnMgr2GetNthSeqRangeInSA (sap2, 2, &start2, &stop2);
5994 if (strand == Seq_strand_minus)
5995 overlap = stop2 - start1 + 1;
5996 else
5997 overlap = stop1 - start2 + 1;
5998 return overlap;
5999 }
6000
6001 /***************************************************************************
6002 *
6003 * SPI_AddToAln adds the amount "offset" to the specified end of an
6004 * alignment by adding a segment of length "offset" to both sequences
6005 * in the alignment. The function assumes that the alignment has two
6006 * rows, that the first row is on the plus strand, that the second row
6007 * is on the strand specified, and that adding the amount "offset" will
6008 * not go past either end of either sequence. This function is used
6009 * to adjust alignment boundaries to splice sites and to add small
6010 * pieces onto alignments to make them abut the next adjacent alignment.
6011 * If the first or last segment (depending on which_end specified) does
6012 * not have gaps in either row, that segment is simply extended; otherwise,
6013 * a new segment must be added onto whichever end is to be extended.
6014 *
6015 ***************************************************************************/
SPI_AddToAln(SeqAlignPtr sap,Int4 offset,Int2 which_end,Uint1 strand)6016 static void SPI_AddToAln(SeqAlignPtr sap, Int4 offset, Int2 which_end, Uint1 strand)
6017 {
6018 DenseSegPtr dsp;
6019 Int4Ptr lens;
6020 Int4 i;
6021 Int4 j;
6022 Int4Ptr starts;
6023 Uint1Ptr strands;
6024
6025 if (sap == NULL || offset == 0)
6026 return;
6027 dsp = (DenseSegPtr)(sap->segs);
6028 if (which_end == SPI_LEFT)
6029 {
6030 if (dsp->starts[0] != -1 && dsp->starts[1] != -1) /* neither sequence is gapped */
6031 {
6032 dsp->starts[0] -= offset;
6033 if (strand != Seq_strand_minus)
6034 dsp->starts[1] -= offset;
6035 dsp->lens[0] += offset;
6036 } else /* one of the sequences is gapped -> add a new segment */
6037 {
6038 starts = (Int4Ptr)MemNew(2*(dsp->numseg+1)*sizeof(Int4));
6039 lens = (Int4Ptr)MemNew((dsp->numseg+1)*sizeof(Int4));
6040 strands = (Uint1Ptr)MemNew(2*(dsp->numseg+1)*sizeof(Uint1));
6041 AlnMgr2GetNthSeqRangeInSA(sap, 1, &i, &j);
6042 starts[0] = i - offset;
6043 AlnMgr2GetNthSeqRangeInSA(sap, 1, &i, &j);
6044 if (strand == Seq_strand_minus)
6045 starts[1] = j + 1;
6046 else
6047 starts[1] = i - offset;
6048 lens[0] = offset;
6049 strands[0] = Seq_strand_plus;
6050 strands[1] = strand;
6051 for (i=0; i<dsp->numseg; i++)
6052 {
6053 starts[i+1] = dsp->starts[i];
6054 starts[2*(i+1)] = dsp->starts[2*i];
6055 lens[i+1] = dsp->lens[i];
6056 strands[i+1] = dsp->strands[i];
6057 strands[2*(i+1)] = dsp->strands[2*i];
6058 }
6059 dsp->numseg++;
6060 MemFree(dsp->starts);
6061 MemFree(dsp->lens);
6062 MemFree(dsp->strands);
6063 dsp->starts = starts;
6064 dsp->lens = lens;
6065 dsp->strands = strands;
6066 }
6067 } else if (which_end == SPI_RIGHT)
6068 {
6069 if (dsp->starts[2*(dsp->numseg-1)] != -1 && dsp->starts[2*(dsp->numseg-1)+1] != -1)
6070 {
6071 dsp->lens[dsp->numseg-1] += offset;
6072 if (strand == Seq_strand_minus)
6073 dsp->starts[2*(dsp->numseg-1)+1] -= offset;
6074 } else /* one of the sequences is gapped -> add a new segment */
6075 {
6076 starts = (Int4Ptr)MemNew(2*(dsp->numseg+1)*sizeof(Int4));
6077 lens = (Int4Ptr)MemNew((dsp->numseg+1)*sizeof(Int4));
6078 strands = (Uint1Ptr)MemNew(2*(dsp->numseg+1)*sizeof(Uint1));
6079 AlnMgr2GetNthSeqRangeInSA(sap, 1, &i, &j);
6080 starts[2*(dsp->numseg)-1] = i+1;
6081 AlnMgr2GetNthSeqRangeInSA(sap, 2, &i, &j);
6082 if (strand == Seq_strand_minus)
6083 starts[2*(dsp->numseg)] = i - offset;
6084 else
6085 starts[2*(dsp->numseg)] = j + 1;
6086 lens[dsp->numseg] = offset;
6087 strands[2*(dsp->numseg)-1] = Seq_strand_plus;
6088 strands[2*(dsp->numseg)] = strand;
6089 for (i=0; i<dsp->numseg; i++)
6090 {
6091 starts[i] = dsp->starts[i];
6092 starts[2*i] = dsp->starts[2*i];
6093 lens[i] = dsp->lens[i];
6094 strands[i] = dsp->strands[i];
6095 strands[2*i] = dsp->strands[2*i];
6096 }
6097 dsp->numseg++;
6098 MemFree(dsp->starts);
6099 MemFree(dsp->lens);
6100 MemFree(dsp->strands);
6101 dsp->starts = starts;
6102 dsp->lens = lens;
6103 dsp->strands = strands;
6104 }
6105 }
6106 /* free the old index and reindex the alignment */
6107 SAIndex2Free2(sap->saip);
6108 sap->saip = NULL;
6109 AlnMgr2IndexSingleChildSeqAlign(sap);
6110 }
6111
6112 /***************************************************************************
6113 *
6114 * SPI_MergeAlignments takes two dense-seg seqaligns, each with the
6115 * same two rows, and merges them into a single alignment, with sap1
6116 * on the left and sap2 on the right. The function does not check to make
6117 * sure that sap2 belongs after sap1. If sap1 and sap2 are not linearly
6118 * consistent, the function extends and truncates the alignments as needed.
6119 *
6120 ***************************************************************************/
SPI_MergeAlignments(SeqAlignPtr sap1,SeqAlignPtr sap2)6121 static SeqAlignPtr SPI_MergeAlignments(SeqAlignPtr sap1, SeqAlignPtr sap2)
6122 {
6123 DenseSegPtr dsp1;
6124 DenseSegPtr dsp2;
6125 Int4 glen;
6126 Int4 gstart1;
6127 Int4 gstart2;
6128 Int4 gstop1;
6129 Int4 gstop2;
6130 Int4 i;
6131 Int4 j;
6132 Int4Ptr lens;
6133 Int4 mlen;
6134 Int4 mstart1;
6135 Int4 mstart2;
6136 Int4 mstop1;
6137 Int4 mstop2;
6138 Int4 n;
6139 Int4 offset;
6140 Int4Ptr starts;
6141 Uint1Ptr strands;
6142
6143 AlnMgr2GetNthSeqRangeInSA(sap1, 1, &gstart1, &gstop1);
6144 AlnMgr2GetNthSeqRangeInSA(sap2, 1, &gstart2, &gstop2);
6145 glen = mlen = 0;
6146 if (gstart2 <= gstop1)
6147 {
6148 AlnMgr2TruncateSeqAlign(sap1, gstart1, gstart2-1, 1);
6149 gstop1 = gstart2-1;
6150 }
6151 AlnMgr2GetNthSeqRangeInSA(sap1, 2, &mstart1, &mstop1);
6152 AlnMgr2GetNthSeqRangeInSA(sap2, 2, &mstart2, &mstop2);
6153 if (mstop2 > mstop1)
6154 {
6155 if (mstart2 <= mstop1)
6156 AlnMgr2TruncateSeqAlign(sap2, mstop1+1, mstop2, 2);
6157 } else
6158 {
6159 if (mstart1 <= mstop2)
6160 AlnMgr2TruncateSeqAlign(sap2, mstart2, mstart1-1, 2);
6161 }
6162 AlnMgr2GetNthSeqRangeInSA(sap1, 1, &gstart1, &gstop1);
6163 AlnMgr2GetNthSeqRangeInSA(sap2, 1, &gstart2, &gstop2);
6164 AlnMgr2GetNthSeqRangeInSA(sap1, 2, &mstart1, &mstop1);
6165 AlnMgr2GetNthSeqRangeInSA(sap2, 2, &mstart2, &mstop2);
6166 glen = gstart2 - gstop1 - 1;
6167 if (mstop2 > mstop1)
6168 mlen = mstart2 - mstop1 - 1;
6169 else
6170 mlen = mstart1 - mstop2 - 1;
6171 dsp1 = (DenseSegPtr)(sap1->segs);
6172 dsp2 = (DenseSegPtr)(sap2->segs);
6173 n = dsp1->numseg + dsp2->numseg + 2;
6174 starts = (Int4Ptr)MemNew(2*n*sizeof(Int4));
6175 lens = (Int4Ptr)MemNew(n*sizeof(Int4));
6176 strands = (Uint1Ptr)MemNew(2*n*sizeof(Uint1));
6177 for (i=0; i<2*(dsp1->numseg); i++)
6178 {
6179 starts[i] = dsp1->starts[i];
6180 strands[i] = dsp1->strands[i];
6181 }
6182 for (i=0; i<dsp1->numseg; i++)
6183 {
6184 lens[i] = dsp1->lens[i];
6185 }
6186 j = dsp1->numseg;
6187 offset = 0;
6188 if (glen > 0)
6189 {
6190 starts[2*j] = gstop1+1;
6191 starts[2*j+1] = -1;
6192 lens[j] = glen;
6193 j += 1;
6194 offset++;
6195 }
6196 if (mlen > 0)
6197 {
6198 starts[2*j] = -1;
6199 if (mstop2 > mstop1)
6200 starts[2*j+1] = mstop1+1;
6201 else
6202 starts[2*j+1] = mstop2+1;
6203 lens[j] = mlen;
6204 j += 1;
6205 offset++;
6206 }
6207 j = 2*(dsp1->numseg+offset);
6208 for (i=0; i<2*(dsp2->numseg); i++, j++)
6209 {
6210 starts[j] = dsp2->starts[i];
6211 strands[j] = dsp2->strands[i];
6212 }
6213 j = dsp1->numseg+offset;
6214 for (i=0; i<dsp2->numseg; i++, j++)
6215 {
6216 lens[j] = dsp2->lens[i];
6217 }
6218 MemFree(dsp2->starts);
6219 MemFree(dsp2->strands);
6220 MemFree(dsp2->lens);
6221 dsp2->starts = starts;
6222 dsp2->strands = strands;
6223 dsp2->lens = lens;
6224 dsp2->numseg = j;
6225 SAIndex2Free2(sap2->saip);
6226 sap2->saip = NULL;
6227 AlnMgr2IndexSingleChildSeqAlign(sap2);
6228 return (sap2);
6229 }
6230
6231 /***************************************************************************
6232 *
6233 * SPI_flip_sa_list takes the head of a list of seqaligns and switches
6234 * the first and second row of every alignment (alignments should all have
6235 * two rows). Then, the indexes are freed and the alignments are reindexed.
6236 *
6237 ***************************************************************************/
SPI_flip_sa_list(SeqAlignPtr sap)6238 NLM_EXTERN void SPI_flip_sa_list (SeqAlignPtr sap)
6239 {
6240 DenseSegPtr dsp;
6241 Int4 i;
6242 SeqIdPtr sip;
6243 SeqIdPtr sip_next;
6244 Int4 tmp_start;
6245 Uint1 tmp_strand;
6246
6247 if (sap == NULL || sap->segtype != SAS_DENSEG)
6248 return;
6249 while (sap != NULL)
6250 {
6251 dsp = (DenseSegPtr)(sap->segs);
6252 if (dsp->dim == 2) /* skip anything with more than 2 rows */
6253 {
6254 /* first switch the ids */
6255 sip = dsp->ids;
6256 sip_next = sip->next;
6257 sip_next->next = sip;
6258 sip->next = NULL;
6259 dsp->ids = sip_next;
6260 /* then switch the starts and strands */
6261 for (i = 0; i<dsp->numseg; i++)
6262 {
6263 tmp_start = dsp->starts[2*i];
6264 dsp->starts[2*i] = dsp->starts[2*i+1];
6265 dsp->starts[2*i+1] = tmp_start;
6266 tmp_strand = dsp->strands[2*i];
6267 dsp->strands[2*i] = dsp->strands[2*i+1];
6268 dsp->strands[2*i+1] = tmp_strand;
6269 }
6270 }
6271 if (sap->saip != NULL) /* free indexes, reindex */
6272 {
6273 SAIndex2Free2(sap->saip);
6274 sap->saip = NULL;
6275 AlnMgr2IndexSingleChildSeqAlign(sap);
6276 }
6277 sap = sap->next;
6278 }
6279 }
6280
6281 /***************************************************************************
6282 *
6283 * SPI_FillInLastmRNAHoles mimics the logic of SPI_ConnectAln; it
6284 * goes through a set of alignments and fills in any missing pieces.
6285 * Its arguments include the mRNA and genomic boundaries of the alignment,
6286 * so that the function knows how far to extend the set of alignments.
6287 * When a hole is found, SPI_FindBestAlnByDotPlot is called to fill
6288 * in the gap.
6289 *
6290 ***************************************************************************/
SPI_FillInLastmRNAHoles(SeqAlignPtr sap,SeqIdPtr sip_genomic,SeqIdPtr sip_mrna,Int4 start_g,Int4 stop_g,Int4 start_m,Int4 stop_m,Uint1 strand)6291 static SeqAlignPtr SPI_FillInLastmRNAHoles(SeqAlignPtr sap, SeqIdPtr sip_genomic, SeqIdPtr sip_mrna, Int4 start_g, Int4 stop_g, Int4 start_m, Int4 stop_m, Uint1 strand)
6292 {
6293 AMAlignIndex2Ptr amaip;
6294 Int4 currstart2;
6295 Int4 end2;
6296 Int4 gap1;
6297 Int4 gap2;
6298 Int4 i;
6299 Boolean internal;
6300 Int4 prevstop1;
6301 Int4 prevstop2;
6302 SeqAlignPtr sap_new;
6303 SeqAlignPtr sap_tmp;
6304 SeqLocPtr slp1;
6305 SeqLocPtr slp2;
6306 Int4 start1;
6307 Int4 start2;
6308 Int4 stop1;
6309 Int4 stop2;
6310
6311 if (sip_genomic == NULL || sip_mrna == NULL)
6312 return NULL;
6313 start1 = stop1 = start2 = stop2 = 0;
6314 if (sap != NULL)
6315 {
6316 amaip = (AMAlignIndex2Ptr)(sap->saip);
6317 HeapSort(amaip->saps, amaip->numsaps, sizeof(SeqAlignPtr), SPI_comp_aln_pos);
6318 prevstop1 = start_g;
6319 if (strand == Seq_strand_minus)
6320 prevstop2 = stop_m;
6321 else
6322 prevstop2 = start_m;
6323 internal = FALSE;
6324 for (i=0; i<amaip->numsaps; i++)
6325 {
6326 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 1, &start1, &stop1);
6327 AlnMgr2GetNthSeqRangeInSA(amaip->saps[i], 2, &start2, &stop2);
6328 if (strand == Seq_strand_minus)
6329 currstart2 = stop2;
6330 else
6331 currstart2 = start2;
6332 if ((gap2 = spi_isa_gap(currstart2, prevstop2, strand)) >= SPI_TEENYEXON)
6333 {
6334 if ((gap1 = spi_isa_gap(start1, prevstop1, Seq_strand_plus)) >= SPI_TEENYEXON || (prevstop1 == -1))
6335 {
6336 slp1 = SeqLocIntNew(prevstop1+1, start1-1, Seq_strand_plus, sip_genomic);
6337 if (strand != Seq_strand_minus)
6338 slp2 = SeqLocIntNew(prevstop2+1, currstart2-1, strand, sip_mrna);
6339 else
6340 slp2 = SeqLocIntNew(currstart2+1, prevstop2-1, strand, sip_mrna);
6341 sap_new = SPI_FindBestAlnByDotPlot(slp1, slp2);
6342 SeqLocFree(slp1);
6343 SeqLocFree(slp2);
6344 sap_tmp = (SeqAlignPtr)(sap->segs);
6345 while (sap_tmp->next != NULL)
6346 {
6347 sap_tmp = sap_tmp->next;
6348 }
6349 sap_tmp->next = sap_new;
6350 }
6351 }
6352 internal = TRUE;
6353 prevstop1 = stop1;
6354 if (strand == Seq_strand_minus)
6355 prevstop2 = start2;
6356 else
6357 prevstop2 = stop2;
6358 }
6359 if (strand != Seq_strand_minus)
6360 end2 = stop_m;
6361 else
6362 {
6363 end2 = prevstop2-1;
6364 prevstop2 = -1;
6365 }
6366 gap1 = spi_isa_gap(prevstop1, stop_g, Seq_strand_plus);
6367 gap2 = spi_isa_gap(end2, prevstop2, strand);
6368 if (gap1 >= SPI_TEENYEXON && gap2 >= SPI_TEENYEXON)
6369 {
6370 slp1 = SeqLocIntNew(prevstop1+1, stop_g, Seq_strand_plus, sip_genomic);
6371 if (strand == Seq_strand_minus)
6372 slp2 = SeqLocIntNew(end2, prevstop2+1, strand, sip_mrna);
6373 else
6374 slp2 = SeqLocIntNew(prevstop2+1, end2, strand, sip_mrna);
6375 sap_new = SPI_FindBestAlnByDotPlot(slp1, slp2);
6376 SeqLocFree(slp1);
6377 SeqLocFree(slp2);
6378 sap_tmp = (SeqAlignPtr)(sap->segs);
6379 while (sap_tmp->next != NULL)
6380 {
6381 sap_tmp = sap_tmp->next;
6382 }
6383 sap_tmp->next = sap_new;
6384 }
6385 sap_tmp = (SeqAlignPtr)(sap->segs);
6386 i = 0;
6387 while (sap_tmp != NULL)
6388 {
6389 i++;
6390 sap_tmp = sap_tmp->next;
6391 }
6392 amaip->numsaps = i;
6393 MemFree(amaip->saps);
6394 amaip->saps = (SeqAlignPtr PNTR)MemNew(i*sizeof(SeqAlignPtr));
6395 sap_tmp = (SeqAlignPtr)(sap->segs);
6396 i = 0;
6397 while (sap_tmp != NULL)
6398 {
6399 amaip->saps[i] = sap_tmp;
6400 i++;
6401 sap_tmp = sap_tmp->next;
6402 }
6403 if (sap == NULL)
6404 return NULL;
6405 SPI_RemoveInconsistentAlnsFromSet(sap, SPI_TEENYEXON, 1, SPI_LEFT);
6406 sap_tmp = (SeqAlignPtr)(sap->segs);
6407 sap->segs = NULL;
6408 SeqAlignFree(sap);
6409 return sap_tmp;
6410 } else
6411 {
6412 slp1 = SeqLocIntNew(start_g, stop_g, Seq_strand_plus, sip_genomic);
6413 slp2 = SeqLocIntNew(start_m, stop_m, strand, sip_mrna);
6414 sap_tmp = SPI_FindBestAlnByDotPlot(slp1, slp2);
6415 SeqLocFree(slp1);
6416 SeqLocFree(slp2);
6417 return sap_tmp;
6418 }
6419 }
6420
6421 /***************************************************************************
6422 *
6423 * SPI_FindBestAlnByDotPlot is spidey's interface to Fasika Aklilu's
6424 * tree-based string-matching functions. Given two seqlocs, it sends them
6425 * to Fasika's function and gets a DOTMainDataPtr in return. The
6426 * DOTMainDataPtr contains all the information for the hits, so by
6427 * cycling through this data and copying it into dense-seg seqalign
6428 * structures, SPI_FindBestAlnByDotPlot builds up a set of alignments
6429 * that specify the relationship between the two seqlocs. These alignments
6430 * are then pruned to make a consistent, nonoverlapping set.
6431 *
6432 ***************************************************************************/
SPI_FindBestAlnByDotPlot(SeqLocPtr slp1,SeqLocPtr slp2)6433 static SeqAlignPtr SPI_FindBestAlnByDotPlot(SeqLocPtr slp1, SeqLocPtr slp2)
6434 {
6435 DOTDiagPtr ddp;
6436 DenseSegPtr dsp;
6437 Int4 i;
6438 DOTMainDataPtr mip;
6439 SeqAlignPtr sap;
6440 SeqAlignPtr sap_head;
6441 SeqAlignPtr sap_prev;
6442 ScorePtr scp;
6443 Int4 start1;
6444 Int4 start2;
6445 Uint1 strand;
6446
6447 BioseqPtr bsp1 = NULL, bsp2 = NULL;
6448 SeqIdPtr sidp1 = NULL, sidp2 = NULL;
6449
6450 /** KSK: this protects spidey from the implicit requirement of DOT_
6451 that *BOTH* seqs are ncbi2na encoded ****/
6452
6453 if (slp1 != NULL && slp2 != NULL){
6454 sidp1 = SeqLocId(slp1);
6455 sidp2 = SeqLocId(slp2);
6456 if (sidp1 != NULL && sidp2 != NULL){
6457 bsp1 = BioseqFind(sidp1);
6458 bsp2 = BioseqFind(sidp2);
6459 if (bsp1 != NULL && bsp2 != NULL){
6460 if (bsp1->seq_data_type != Seq_code_ncbi2na
6461 || bsp2->seq_data_type != Seq_code_ncbi2na){
6462 return NULL;
6463 }
6464 }
6465 else {
6466 return NULL;
6467 }
6468 }
6469 else {
6470 return NULL;
6471 }
6472 }
6473 else {
6474 return NULL;
6475 }
6476
6477
6478 mip = DOT_CreateAndStorebyLoc (slp1, slp2, SPI_TEENYEXON, 10);
6479 sap = sap_head = sap_prev = NULL;
6480 if (mip == NULL || mip->hitlist == NULL)
6481 return NULL;
6482 i = 0;
6483 ddp = mip->hitlist[i];
6484 start1 = SeqLocStart(slp1);
6485 start2 = SeqLocStart(slp2);
6486 strand = SeqLocStrand(slp2);
6487 /* copy each ddp (a single ungapped alignment) into a one-segment dense-seg alignment */
6488 while (ddp != NULL && i < mip->index)
6489 {
6490 ddp = mip->hitlist[i];
6491 i++;
6492 sap = SeqAlignNew();
6493 dsp = DenseSegNew();
6494 sap->type = SAT_PARTIAL;
6495 sap->segtype = SAS_DENSEG;
6496 sap->dim = 2;
6497 dsp->dim = 2;
6498 dsp->numseg = 1;
6499 dsp->ids = SeqIdDup(SeqLocId(slp1));
6500 dsp->ids->next = SeqIdDup(SeqLocId(slp2));
6501 dsp->strands = (Uint1Ptr)MemNew(2*sizeof(Uint1));
6502 dsp->strands[0] = SeqLocStrand(slp1);
6503 dsp->strands[1] = SeqLocStrand(slp2);
6504 dsp->starts = (Int4Ptr)MemNew(2*sizeof(Int4));
6505 dsp->lens = (Int4Ptr)MemNew(sizeof(Int4));
6506 dsp->starts[0] = ddp->q_start;
6507 if (dsp->strands[1] == Seq_strand_minus)
6508 dsp->starts[1] = ddp->s_start - ddp->length + 1;
6509 else
6510 dsp->starts[1] = ddp->s_start;
6511 if (ddp->length > SeqLocLen(slp2))
6512 dsp->lens[0] = SeqLocLen(slp2);
6513 else
6514 dsp->lens[0] = ddp->length - 1;
6515 scp = ScoreNew();
6516 scp->id = ObjectIdNew();
6517 scp->id->str = StringSave("score");
6518 scp->choice = 1;
6519 scp->value.intvalue = ddp->score;
6520 dsp->scores = scp;
6521 sap->segs = (Pointer)(dsp);
6522 if (sap_head != NULL)
6523 {
6524 sap_prev->next = sap;
6525 sap_prev = sap;
6526 } else
6527 sap_head = sap_prev = sap;
6528 }
6529 if (sap_head == NULL)
6530 return NULL;
6531 AlnMgr2IndexLite(sap_head);
6532 SPI_RemoveInconsistentAlnsFromSet(sap_head, SPI_TEENYEXON, 1, SPI_LEFT);
6533 sap = (SeqAlignPtr)(sap_head->segs);
6534 sap_head->segs = NULL;
6535 SeqAlignFree(sap_head);
6536 MemFree(mip->matrix);
6537 MemFree(mip->qseq);
6538 MemFree(mip->sseq);
6539 MemFree(mip->qname);
6540 MemFree(mip->sname);
6541 i = 0;
6542 while (ddp != NULL && i < mip->index)
6543 {
6544 ddp = mip->hitlist[i];
6545 MemFree(ddp);
6546 i++;
6547 }
6548 MemFree(mip->hitlist);
6549 return sap;
6550 }
6551
6552 /***************************************************************************
6553 *
6554 * SPI_comp_aln_pos is the HeapSort callback for SPI_FillInLastmRNAHoles.
6555 * It compares the genomic intervals covered by two seqaligns, and sorts
6556 * them according to the 5'-most start position.
6557 *
6558 ***************************************************************************/
SPI_comp_aln_pos(VoidPtr ptr1,VoidPtr ptr2)6559 static int LIBCALLBACK SPI_comp_aln_pos(VoidPtr ptr1, VoidPtr ptr2)
6560 {
6561 SeqAlignPtr sap1;
6562 SeqAlignPtr sap2;
6563 Int4 start1;
6564 Int4 start2;
6565 Int4 stop1;
6566 Int4 stop2;
6567
6568 start1 = start2 = stop1 = stop2 = 0;
6569 if (ptr1 != NULL && ptr2 != NULL)
6570 {
6571 sap1 = *((SeqAlignPtr PNTR) ptr1);
6572 sap2 = *((SeqAlignPtr PNTR) ptr2);
6573 AlnMgr2GetNthSeqRangeInSA(sap1, 1, &start1, &stop1);
6574 AlnMgr2GetNthSeqRangeInSA(sap2, 1, &start2, &stop2);
6575 if (stop1 < start2)
6576 return -1;
6577 else if (stop2 < start1)
6578 return 1;
6579 else if (start1 < start2)
6580 return -1;
6581 else if (start2 < start1)
6582 return 1;
6583 else
6584 return 0;
6585 }
6586 return 0;
6587 }
6588
SPI_bsinfoFreeList(SPI_bsinfoPtr spi)6589 NLM_EXTERN void SPI_bsinfoFreeList (SPI_bsinfoPtr spi)
6590 {
6591 SPI_bsinfoPtr spi_next;
6592
6593 while (spi != NULL)
6594 {
6595 spi_next = spi->next;
6596 spi->next = NULL;
6597 SeqLocSetFree(spi->lcaseloc);
6598 MemFree(spi);
6599 spi = spi_next;
6600 }
6601 }
6602
/* SPI_RegionFree frees a single SPI_RegionInfo record together with the
 * mRNA record it points at.  The next pointer is cleared but the rest of
 * the list is NOT freed; use SPI_RegionListFree for a whole list. */
static void SPI_RegionFree (SPI_RegionInfoPtr srip)
{
   if (srip != NULL)
   {
      /* SPI_mRNAFree returns immediately on NULL, so no guard is needed */
      SPI_mRNAFree(srip->smp);
      srip->smp = NULL;
      srip->next = NULL;
      MemFree(srip);
   }
}
6614
SPI_mRNAFree(SPI_mRNAPtr smp)6615 NLM_EXTERN void SPI_mRNAFree (SPI_mRNAPtr smp)
6616 {
6617 AMAlignIndex2Ptr amaip;
6618 Int4 i;
6619
6620 if (smp == NULL)
6621 return;
6622 MemFree(smp->exonid);
6623 MemFree(smp->exongaps);
6624 MemFree(smp->splicedon);
6625 MemFree(smp->spliceacc);
6626 MemFree(smp->mstarts);
6627 MemFree(smp->mstops);
6628 MemFree(smp->gstarts);
6629 MemFree(smp->gstops);
6630 if (smp->saps != NULL)
6631 {
6632 for (i=0; i<smp->numexons; i++)
6633 {
6634 SeqAlignFree(smp->saps[i]);
6635 }
6636 }
6637 if (smp->parent != NULL)
6638 {
6639 smp->parent->segs = NULL;
6640 amaip = (AMAlignIndex2Ptr)(smp->parent->saip);
6641 amaip->saps = NULL;
6642 SeqAlignFree(smp->parent);
6643 }
6644 MemFree(smp->saps);
6645 if (smp->continuous != NULL)
6646 SeqAlignFree(smp->continuous);
6647 if (smp->epp != NULL)
6648 SPI_FreeExonProfList(smp->epp);
6649 if (smp->protein!=NULL)
6650 MemFree(smp->protein);
6651 }
6652
/* SPI_FreeExonProf frees one exon profile record and its mismatches
 * array.  The next pointer is not followed.  NULL is a no-op. */
static void SPI_FreeExonProf(SPI_ExonProfPtr epp)
{
   if (epp != NULL)
   {
      MemFree(epp->mismatches);
      MemFree(epp);
   }
}
6660
/* SPI_FreeExonProfList frees every record of a linked list of exon
 * profiles by calling SPI_FreeExonProf on each node in turn. */
static void SPI_FreeExonProfList(SPI_ExonProfPtr epp)
{
   SPI_ExonProfPtr  following;

   for (; epp != NULL; epp = following)
   {
      following = epp->next;   /* read the link before the node is freed */
      SPI_FreeExonProf(epp);
   }
}
6672
SPI_RegionListFree(SPI_RegionInfoPtr srip)6673 NLM_EXTERN void SPI_RegionListFree (SPI_RegionInfoPtr srip)
6674 {
6675 SPI_RegionInfoPtr srip_tmp;
6676
6677 if (srip == NULL)
6678 return;
6679 while (srip != NULL)
6680 {
6681 srip_tmp = srip->next;
6682 SPI_RegionFree(srip);
6683 srip = srip_tmp;
6684 }
6685 }
6686
SPI_OptionsNew(void)6687 NLM_EXTERN SPI_OptionsPtr SPI_OptionsNew(void)
6688 {
6689 SPI_OptionsPtr spot;
6690
6691 spot = (SPI_OptionsPtr)MemNew(sizeof(SPI_Options));
6692 spot->firstpasseval = 0.00001;
6693 spot->secpasseval = 0.001;
6694 spot->thirdpasseval = 10;
6695 spot->organism = SPI_VERTEBRATE;
6696 spot->numreturns = 1;
6697 spot->idcutoff = 0;
6698 spot->lencutoff = 0;
6699 spot->interspecies = FALSE;
6700 spot->printaln = FALSE;
6701 spot->printasn = FALSE;
6702 /* if strand set to 'unknown' BlastTwoSequences()
6703 screws-up returned seqalign strand */
6704 spot->strand = Seq_strand_both;
6705 /* lets have defaults for to & from */
6706 spot->to = 0;
6707 spot->from = 0;
6708 spot->bigintron = 0;
6709 spot->bigintron_size = 0; /* added by KSK*/
6710 spot->repeat_db_file = 0; /* added by KSK */
6711 return spot;
6712 }
6713
SPI_OptionsFree(SPI_OptionsPtr spot)6714 NLM_EXTERN void SPI_OptionsFree (SPI_OptionsPtr spot)
6715 {
6716 MemFree(spot);
6717 }
6718
6719 /***************************************************************************
6720 *
6721 * SPI_GetDonorSpliceInfo fills in the length of the consensus sequence
6722 * of the donor splice site for the given organism. The boundary is the
6723 * location of the exon-intron boundary within the consensus sequence.
6724 *
6725 ***************************************************************************/
SPI_GetDonorSpliceInfo(Int4 org,Int4Ptr spllen,Int4Ptr boundary,SPI_OptionsPtr spot)6726 static void SPI_GetDonorSpliceInfo (Int4 org, Int4Ptr spllen, Int4Ptr boundary, SPI_OptionsPtr spot)
6727 {
6728 Int4 i;
6729 SPI_SpliceInfoPtr ssp;
6730
6731 if (spot->dsplicejunc != 0)
6732 {
6733 ssp = spot->dssp_head;
6734 i = 0;
6735 while (ssp != NULL)
6736 {
6737 i++;
6738 ssp = ssp->next;
6739 }
6740 *spllen = i;
6741 /** file should supply column position of last
6742 base of preceding exon **/
6743 *boundary = *spllen-spot->dsplicejunc+1;
6744 return;
6745 }
6746 if (org == SPI_VERTEBRATE)
6747 {
6748 *spllen = 10;
6749 *boundary = 8;
6750 } else if (org == SPI_FLY)
6751 {
6752 *spllen = 15;
6753 *boundary = 11;
6754 } else if (org == SPI_PLANT)
6755 {
6756 *spllen = 9;
6757 *boundary = 7;
6758 } else if (org == SPI_CELEGANS)
6759 {
6760 *spllen = 15;
6761 *boundary = 11;
6762 }
6763 else if (org == SPI_DICTY){
6764 *spllen = 8;
6765 *boundary = 7;
6766 }
6767 }
6768
6769 /***************************************************************************
6770 *
6771 * SPI_is_donor is a general interface to the organism-specific donor
6772 * splice site evaluation functions. It simply passes on the sequence,
6773 * sequence length, and score pointer to the appropriate organism-
6774 * specific function.
6775 * The organism-specific functions all work exactly the same way, but have
6776 * different splice matrices. They evaluate P(Site|Sequence), which is:
6777 *
6778 * P(Site|Sequence) = P(Sequence|Site)*P(Site)/P(Sequence)
6779 *
6780 * Since P(Site) is constant (and unknown), it is ignored; only
6781 * P(Sequence|Site)/P(Sequence) is calculated, and these values are
6782 * compared to each other. P(Sequence|Site) is calculated by multiplying
6783 * the values in the splice site frequency matrix according to the
6784 * sequence specified. P(Sequence) is the probability of this specific
6785 * sequence, using the A, T, G, and C frequences specified in the sequence.
6786 *
6787 * N.B. Ken Katz changed this so that they generate log-odd scores:
6788 * log[P(X)/F(X)] + log[P(X)/F(X)]....but then generate the antilog
6789 * since there are too many places in the code where the expected value
6790 * is the antilog.
6791 *
6792 ***************************************************************************/
SPI_is_donor(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score,Int4 org)6793 NLM_EXTERN void SPI_is_donor (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, Int4 org)
6794 {
6795 if (org == SPI_VERTEBRATE){
6796 SPI_is_donor_vert(sequence, seqlen, score);
6797 }
6798 else if (org == SPI_FLY){
6799 SPI_is_donor_fly(sequence, seqlen, score);
6800 }
6801 else if (org == SPI_PLANT){
6802 SPI_is_donor_plant(sequence, seqlen, score);
6803 }
6804 else if (org == SPI_CELEGANS){
6805 SPI_is_donor_cele(sequence, seqlen, score);
6806 }
6807 else if (org == SPI_DICTY){
6808 SPI_is_donor_dicty(sequence, seqlen, score);
6809 }
6810 }
6811
/***************************************************************************
*
*  SPI_is_donor_user scores a candidate donor site against the
*  user-supplied matrix held in the spot->dssp_head list (one node per
*  window column, with fields a, c, g and t).
*
*  Base codes 0..3 select the a/c/g/t field of the current column; any
*  other code contributes nothing.  For every column whose matrix entry is
*  positive, log10(entry / observed frequency of that base in the window)
*  is added, and *score receives 10 raised to the sum.
*
*  Scoring stops at the end of the sequence or at the end of the matrix
*  list, whichever comes first, so a list shorter than seqlen is no longer
*  dereferenced past its end.  Base codes above 3 are ignored when
*  counting, so they cannot index outside the frequency table.  If spot is
*  NULL *score is set to 0; if sequence or score is NULL nothing is done.
*
***************************************************************************/
static void SPI_is_donor_user(Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, SPI_OptionsPtr spot)
{
   Int4               acgt[4] = {0, 0, 0, 0};
   Int4               j;
   Int4               base;
   FloatHi            site;
   FloatHi            prob_seqgsite = 0;
   SPI_SpliceInfoPtr  ssp;

   if (sequence == NULL || score == NULL){
      return;
   }
   *score = 0;
   if (spot == NULL){
      return;
   }

   /* get the frequencies first */
   for (j=0; j<seqlen; j++){
      if (sequence[j] < 4){
         acgt[sequence[j]]++;
      }
   }
   /* now calculate for each base the log, adding values to get the score */
   ssp = spot->dssp_head;
   for (j=0; j<seqlen && ssp != NULL; j++, ssp = ssp->next){
      base = sequence[j];
      switch (base)
      {
         case 0:  site = ssp->a; break;
         case 1:  site = ssp->c; break;
         case 2:  site = ssp->g; break;
         case 3:  site = ssp->t; break;
         default: site = 0;      break;   /* not A/C/G/T: no contribution */
      }
      if (site > 0){
         prob_seqgsite += log10(site/((FloatHi)acgt[base]/seqlen));
      }
   }
   *score = pow(10, prob_seqgsite);
}
6853
6854 /***************************************************************************
6855 *
6856 * See the comment for SPI_is_donor for an explanation of how this
6857 * function works. The splice site frequency matrix is derived from
6858 * a nonredundant set of vertebrate splice sites provided by Chris Burge.
6859 *
6860 ***************************************************************************/
SPI_is_donor_vert(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)6861 static void SPI_is_donor_vert (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
6862 {
6863 Int4 acgt[4] = {0, 0, 0, 0};
6864 Int4 j = 0;
6865 FloatHi d[10][4] = {
6866 {0.3361, 0.3587, 0.1882, 0.1170},
6867 {0.5986, 0.1306, 0.1413, 0.1295},
6868 {0.0867, 0.0321, 0.8034, 0.0778},
6869 {0.0000, 0.0000, 1.0000, 0.0000},
6870 {0.0000, 0.0100, 0.0000, 1.0000},
6871 {0.4976, 0.0267, 0.4507, 0.0249},
6872 {0.7162, 0.0730, 0.1223, 0.0885},
6873 {0.0677, 0.0517, 0.8331, 0.0475},
6874 {0.1586, 0.1681, 0.2185, 0.4549},
6875 {0.2559, 0.2120, 0.3593, 0.1728}};
6876
6877 FloatHi prob_seqgsite = 0;
6878
6879 if (sequence == NULL || score == NULL){
6880 return;
6881 }
6882 *score = 0;
6883 if (seqlen < 10){
6884 return;
6885 }
6886 prob_seqgsite = 0;
6887
6888 /* first get the freqs */
6889 for (j=0; j<seqlen; j++){
6890 if (sequence[j] != 4){
6891 acgt[sequence[j]]++;
6892 }
6893 }
6894 /* now calculate for each base the log, adding values to get the score */
6895 for (j=0; j<seqlen; j++){
6896 if (sequence[j] != 4){
6897 if (d[j][sequence[j]] > 0){
6898 prob_seqgsite +=
6899 log10((d[j][sequence[j]])/((FloatHi)acgt[sequence[j]]/(FloatHi)seqlen));
6900 }
6901 }
6902 }
6903 *score = pow(10, prob_seqgsite);
6904 }
6905
6906
6907 /***************************************************************************
6908 *
6909 * See the comment for SPI_is_donor for an explanation of how this
6910 * function works. The splice site frequency matrix is derived from
6911 * a nonredundant set of Drosophila splice sites provided by Chris Burge.
6912 *
6913 ***************************************************************************/
SPI_is_donor_fly(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)6914 static void SPI_is_donor_fly (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
6915 {
6916 Int4 acgt[4] = {0, 0, 0, 0};
6917 FloatHi d[15][4] = {
6918 {0.3103, 0.2105, 0.1951, 0.2834},
6919 {0.3045, 0.2335, 0.2131, 0.2482},
6920 {0.3512, 0.2905, 0.2028, 0.1548},
6921 {0.5374, 0.1523, 0.1567, 0.1529},
6922 {0.1216, 0.0685, 0.6935, 0.1158},
6923 {0.0001, 0.0000, 0.9936, 0.0000},
6924 {0.0000, 0.0000, 0.0000, 0.9878},
6925 {0.5886, 0.0115, 0.3506, 0.0486},
6926 {0.7639, 0.0505, 0.1004, 0.0845},
6927 {0.0480, 0.0102, 0.8861, 0.0550},
6928 {0.1190, 0.1068, 0.0537, 0.7198},
6929 {0.3455, 0.1388, 0.1849, 0.3301},
6930 {0.2700, 0.2258, 0.1804, 0.3231},
6931 {0.3353, 0.2092, 0.1612, 0.2930},
6932 {0.2873, 0.2278, 0.1727, 0.3116}};
6933 Int4 j;
6934 FloatHi prob_seqgsite = 0;
6935
6936 if (sequence == NULL || score == NULL){
6937 return;
6938 }
6939 *score = 0;
6940 if (seqlen < 15){
6941 return;
6942 }
6943 /* first get the freqs */
6944 for (j=0; j<seqlen; j++){
6945 if (sequence[j] != 4){
6946 acgt[sequence[j]]++;
6947 }
6948 }
6949 /* now calculate for each base the log, adding values to get the score */
6950 for (j=0; j<seqlen; j++){
6951 if (sequence[j] != 4){
6952 if (d[j][sequence[j]] > 0){
6953 prob_seqgsite +=
6954 log10((d[j][sequence[j]])/((FloatHi)acgt[sequence[j]]/seqlen));
6955 }
6956 }
6957 }
6958 *score = pow(10, prob_seqgsite);
6959 }
6960
6961 /***************************************************************************
6962 *
6963 * See the comment for SPI_is_donor for an explanation of how this
6964 * function works. The splice site frequency matrix is derived from
6965 * a nonredundant set of Arabidopsis splice sites provided by Chris Burge.
6966 *
6967 ***************************************************************************/
SPI_is_donor_plant(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)6968 static void SPI_is_donor_plant (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
6969 {
6970 Int4 acgt[4] = {0, 0, 0, 0};
6971 FloatHi d[9][4] = {
6972 {0.3563, 0.3526, 0.1840, 0.1068},
6973 {0.6559, 0.1103, 0.0765, 0.1571},
6974 {0.0887, 0.0328, 0.7876, 0.0907},
6975 {0.0001, 0.0000, 0.9930, 0.0000},
6976 {0.0000, 0.0000, 0.0000, 0.9838},
6977 {0.6607, 0.0452, 0.1195, 0.1744},
6978 {0.5407, 0.1394, 0.0546, 0.2650},
6979 {0.1975, 0.0929, 0.5193, 0.1901},
6980 {0.2368, 0.1405, 0.1040, 0.5182}};
6981
6982 Int4 j;
6983 FloatHi prob_seqgsite = 0;
6984
6985 if (sequence == NULL || score == NULL){
6986 return;
6987 }
6988 *score = 0;
6989 if (seqlen < 9){
6990 return;
6991 }
6992 /* first get the freqs */
6993 for (j=0; j<seqlen; j++){
6994 if (sequence[j] != 4){
6995 acgt[sequence[j]]++;
6996 }
6997 }
6998 /* now calculate for each base the log, adding values to get the score */
6999 for (j=0; j<seqlen; j++){
7000 if (sequence[j] != 4){
7001 if (d[j][sequence[j]] > 0){
7002 prob_seqgsite +=
7003 log10((d[j][sequence[j]])/((FloatHi)acgt[sequence[j]]/seqlen));
7004 }
7005 }
7006 }
7007 *score = pow(10, prob_seqgsite);
7008 }
7009
7010 /***************************************************************************
7011 *
7012 * See the comment for SPI_is_donor for an explanation of how this
7013 * function works. The splice site frequency matrix is derived from
7014 * a nonredundant set of C. elegans splice sites provided by Chris Burge.
7015 *
7016 ***************************************************************************/
SPI_is_donor_cele(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)7017 static void SPI_is_donor_cele (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
7018 {
7019 Int4 acgt[4] = {0, 0, 0, 0};
7020 FloatHi d[15][4] = {
7021 {0.3575, 0.1537, 0.1605, 0.3284},
7022 {0.3541, 0.1838, 0.1662, 0.2959},
7023 {0.3825, 0.2481, 0.1987, 0.1706},
7024 {0.5792, 0.1445, 0.0955, 0.1808},
7025 {0.1828, 0.0609, 0.6046, 0.1517},
7026 {0.0001, 0.0000, 0.9963, 0.0000},
7027 {0.0000, 0.0000, 0.0000, 0.9919},
7028 {0.5904, 0.0146, 0.2400, 0.1550},
7029 {0.6713, 0.0660, 0.0877, 0.1750},
7030 {0.0904, 0.0457, 0.7441, 0.1198},
7031 {0.1896, 0.1077, 0.0850, 0.6178},
7032 {0.2661, 0.0911, 0.1371, 0.5058},
7033 {0.2620, 0.0995, 0.1344, 0.5041},
7034 {0.2840, 0.1141, 0.1039, 0.4980},
7035 {0.2986, 0.1239, 0.1215, 0.4560}};
7036 Int4 j;
7037 FloatHi prob_seqgsite = 0;
7038
7039 if (sequence == NULL || score == NULL){
7040 return;
7041 }
7042 *score = 0;
7043 if (seqlen < 15){
7044 return;
7045 }
7046 /* first get the freqs */
7047 for (j=0; j<seqlen; j++){
7048 if (sequence[j] != 4){
7049 acgt[sequence[j]]++;
7050 }
7051 }
7052 /* now calculate for each base the log, adding values to get the score */
7053 for (j=0; j<seqlen; j++){
7054 if (sequence[j] != 4){
7055 if (d[j][sequence[j]] > 0){
7056 prob_seqgsite +=
7057 log10((d[j][sequence[j]])/((FloatHi)acgt[sequence[j]]/seqlen));
7058 }
7059 }
7060 }
7061 *score = pow(10, prob_seqgsite);
7062 }
7063
7064 /***************************************************************************
7065 *
7066 * See the comment for SPI_is_donor for an explanation of how this
7067 * function works. Note that the Dicty info is NOT corrected for current
7068 * sequence composition because the log(likehood)matrix is itself corrected
7069 * for dicty genome composition. The data were retrieved
7070 * from the geneid Dd parameter file and used with the permission of
7071 * Roderic Guigo. Values were simply translated from log base 2 to log base 10
7072 *
7073 ***************************************************************************/
SPI_is_donor_dicty(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)7074 static void SPI_is_donor_dicty (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score){
7075
7076 /* acgt */
7077 FloatHi d[8][4] = {
7078 {0.1825, -0.2014, -0.0136, -0.1440},
7079 {0.0487, -0.3298, 0.0742, -0.0633},
7080 {-99999, -99999, 0.6020, -99999},
7081 {-99999, -99999, -99999, 0.6020},
7082 {0.4783, -0.9030, -1.0634, -0.9673},
7083 {0.3026, -1.4202, -0.7392, -0.0150},
7084 {-0.3356, -1.3914, 0.8111, -0.5090},
7085 {-0.7937, -1.0333, -0.5721, 0.4315}};
7086 Int4 j = 0;
7087
7088 if (sequence == NULL || score == NULL){
7089 return;
7090 }
7091 *score = 0;
7092 if (seqlen < 8){
7093 return;
7094 }
7095 *score = 0;
7096 for (j = 0; j < seqlen; j++){
7097 if (sequence[j] != 4){
7098 *score += d[j][sequence[j]];
7099 }
7100 }
7101 *score = pow(10, *score);
7102 }
7103
7104
7105 /***************************************************************************
7106 *
7107 * SPI_GetAcceptorSpliceInfo fills in the length of the consensus sequence
7108 * of the acceptor splice site for the given organism. The boundary is the
7109 * location of the exon-intron boundary within the consensus sequence.
7110 *
7111 ***************************************************************************/
SPI_GetAcceptorSpliceInfo(Int4 org,Int4Ptr spllen,Int4Ptr boundary,SPI_OptionsPtr spot)7112 static void SPI_GetAcceptorSpliceInfo (Int4 org, Int4Ptr spllen, Int4Ptr boundary, SPI_OptionsPtr spot)
7113 {
7114 Int4 i;
7115 SPI_SpliceInfoPtr ssp;
7116
7117 if (spot->asplicejunc != 0)
7118 {
7119 i = 0;
7120 ssp = spot->assp_head;
7121 while (ssp != NULL)
7122 {
7123 i++;
7124 ssp = ssp->next;
7125 }
7126 *spllen = i;
7127 /*** file should supply first exon column
7128 which needs to be zero-base adjusted ***/
7129 *boundary = spot->asplicejunc - 1;
7130 return;
7131 }
7132 if (org == SPI_VERTEBRATE)
7133 {
7134 *spllen = 21;
7135 *boundary = 20;
7136 } else if (org == SPI_FLY)
7137 {
7138 *spllen = 18;
7139 *boundary = 15;
7140 } else if (org == SPI_PLANT)
7141 {
7142 *spllen = 40;
7143 *boundary = 36;
7144 } else if (org == SPI_CELEGANS)
7145 {
7146 *spllen = 18;
7147 *boundary = 15;
7148 }
7149 else if (org == SPI_DICTY){
7150 *spllen = 15;
7151 *boundary = 15;
7152 }
7153 }
7154
7155 /***************************************************************************
7156 *
7157 * SPI_is_acceptor is a general interface to the organism-specific acceptor
7158 * splice site evaluation functions. It simply passes on the sequence,
7159 * sequence length, and score pointer to the appropriate organism-
7160 * specific function.
7161 * The organism-specific functions all work exactly the same way, but have
7162 * different splice matrices. They evaluate P(Site|Sequence), which is:
7163 *
7164 * P(Site|Sequence) = P(Sequence|Site)*P(Site)/P(Sequence)
7165 *
7166 * Since P(Site) is constant (and unknown), it is ignored; only
7167 * P(Sequence|Site)/P(Sequence) is calculated, and these values are
7168 * compared to each other. P(Sequence|Site) is calculated by multiplying
7169 * the values in the splice site frequency matrix according to the
7170 * sequence specified. P(Sequence) is the probability of this specific
7171 * sequence, using the A, T, G, and C frequences specified in the sequence.
7172 *
7173 ***************************************************************************/
SPI_is_acceptor(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score,Int4 org)7174 NLM_EXTERN void SPI_is_acceptor (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, Int4 org)
7175 {
7176 if (org == SPI_VERTEBRATE){
7177 SPI_is_acceptor_vert(sequence, seqlen, score);
7178 }
7179 else if (org == SPI_FLY){
7180 SPI_is_acceptor_fly(sequence, seqlen, score);
7181 }
7182 else if (org == SPI_PLANT){
7183 SPI_is_acceptor_plant(sequence, seqlen, score);
7184 }
7185 else if (org == SPI_CELEGANS){
7186 SPI_is_acceptor_cele(sequence, seqlen, score);
7187 }
7188 else if (org == SPI_DICTY){
7189 SPI_is_acceptor_dicty(sequence, seqlen, score);
7190 }
7191 }
7192
/***************************************************************************
*
*  SPI_is_acceptor_user scores a candidate acceptor site against the
*  user-supplied matrix held in the spot->assp_head list (one node per
*  window column, with fields a, c, g and t).
*
*  Base codes 0..3 select the a/c/g/t field of the current column; any
*  other code contributes nothing.  For every column whose matrix entry is
*  positive, log10(entry / observed frequency of that base in the window)
*  is added, and *score receives 10 raised to the sum.
*
*  Scoring stops at the end of the sequence or at the end of the matrix
*  list, whichever comes first, so a list shorter than seqlen is no longer
*  dereferenced past its end.  Base codes above 3 are ignored when
*  counting, so they cannot index outside the frequency table.  If spot is
*  NULL *score is set to 0; if sequence or score is NULL nothing is done.
*
***************************************************************************/
static void SPI_is_acceptor_user(Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, SPI_OptionsPtr spot)
{
   Int4               acgt[4] = {0, 0, 0, 0};
   Int4               j;
   Int4               base;
   FloatHi            site;
   FloatHi            prob_seqgsite = 0;
   SPI_SpliceInfoPtr  ssp;

   if (sequence == NULL || score == NULL){
      return;
   }
   *score = 0;
   if (spot == NULL){
      return;
   }

   /* get the frequencies first */
   for (j=0; j<seqlen; j++){
      if (sequence[j] < 4){
         acgt[sequence[j]]++;
      }
   }
   /* now calculate for each base the log, adding values to get the score */
   ssp = spot->assp_head;
   for (j=0; j<seqlen && ssp != NULL; j++, ssp = ssp->next){
      base = sequence[j];
      switch (base)
      {
         case 0:  site = ssp->a; break;
         case 1:  site = ssp->c; break;
         case 2:  site = ssp->g; break;
         case 3:  site = ssp->t; break;
         default: site = 0;      break;   /* not A/C/G/T: no contribution */
      }
      if (site > 0){
         prob_seqgsite += log10(site/((FloatHi)acgt[base]/seqlen));
      }
   }
   *score = pow(10, prob_seqgsite);
}
7233
7234 /***************************************************************************
7235 *
7236 * See the comment for SPI_is_acceptor for an explanation of how this
7237 * function works. The splice site frequency matrix is derived from
7238 * a nonredundant set of vertebrate splice sites provided by Chris Burge.
7239 *
7240 ***************************************************************************/
SPI_is_acceptor_vert(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)7241 static void SPI_is_acceptor_vert (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
7242 {
7243 FloatHi a[21][4] = {
7244 {0.1823, 0.3135, 0.1485, 0.3557},
7245 {0.1568, 0.3319, 0.1681, 0.3432},
7246 {0.1461, 0.3379, 0.1520, 0.3640},
7247 {0.1271, 0.3290, 0.1710, 0.3729},
7248 {0.1342, 0.3593, 0.1366, 0.3700},
7249 {0.1152, 0.3676, 0.1188, 0.3985},
7250 {0.0926, 0.3688, 0.1235, 0.4151},
7251 {0.0879, 0.3426, 0.1205, 0.4489},
7252 {0.0808, 0.3557, 0.1182, 0.4454},
7253 {0.0790, 0.3224, 0.1128, 0.4857},
7254 {0.0748, 0.3581, 0.1075, 0.4596},
7255 {0.0814, 0.3866, 0.1152, 0.4169},
7256 {0.0849, 0.4186, 0.1235, 0.3729},
7257 {0.0867, 0.4240, 0.0849, 0.4044},
7258 {0.0665, 0.4561, 0.0618, 0.4157},
7259 {0.0736, 0.3996, 0.0564, 0.4703},
7260 {0.2251, 0.3409, 0.2126, 0.2215},
7261 {0.0404, 0.7357, 0.0018, 0.2221},
7262 {1.0000, 0.0010, 0.0010, 0.0010},
7263 {0.0010, 0.0010, 1.0000, 0.0010},
7264 {0.2375, 0.1318, 0.5350, 0.0956}};
7265 Int4 acgt[4] = {0, 0, 0, 0};
7266 Int4 j;
7267 FloatHi prob_seqgsite = 0;
7268
7269
7270 if (sequence == NULL || score == NULL){
7271 return;
7272 }
7273 *score = 0;
7274 if (seqlen < 21){
7275 return;
7276 }
7277 /* first get the freqs */
7278 for (j=0; j<seqlen; j++){
7279 if (sequence[j] != 4){
7280 acgt[sequence[j]]++;
7281 }
7282 }
7283 /* now calculate for each base the log, adding values to get the score */
7284 for (j=0; j<seqlen; j++){
7285 if (sequence[j] != 4 && a[j][sequence[j]] > 0 ){
7286 prob_seqgsite +=
7287 log10((a[j][sequence[j]])/((FloatHi)acgt[sequence[j]]/seqlen));
7288 }
7289 }
7290 *score = pow(10, prob_seqgsite);
7291 /* if (sequence[18] == 0 && sequence[19] == 2){
7292 *score += 0.5;
7293 }
7294 */
7295 }
7296
7297 /***************************************************************************
7298 *
7299 * See the comment for SPI_is_acceptor for an explanation of how this
7300 * function works. The splice site frequency matrix is derived from
7301 * a nonredundant set of Drosophila splice sites provided by Chris Burge.
7302 *
7303 ***************************************************************************/
SPI_is_acceptor_fly(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)7304 static void SPI_is_acceptor_fly (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
7305 {
7306 FloatHi a[18][4] = {
7307 {0.2497, 0.2446, 0.1044, 0.4014},
7308 {0.2132, 0.2369, 0.1063, 0.4437},
7309 {0.1946, 0.2196, 0.1082, 0.4776},
7310 {0.2170, 0.2017, 0.0973, 0.4840},
7311 {0.1946, 0.2170, 0.0858, 0.5026},
7312 {0.2004, 0.2433, 0.0858, 0.4706},
7313 {0.2004, 0.2727, 0.0967, 0.4302},
7314 {0.2106, 0.2708, 0.0864, 0.4321},
7315 {0.1876, 0.3035, 0.0608, 0.4481},
7316 {0.1114, 0.2522, 0.0679, 0.5685},
7317 {0.1178, 0.2164, 0.0461, 0.6197},
7318 {0.2830, 0.1639, 0.2913, 0.2618},
7319 {0.0467, 0.7049, 0.0045, 0.2439},
7320 {0.9923, 0.0032, 0.0013, 0.0032},
7321 {0.0032, 0.0038, 0.9910, 0.0019},
7322 {0.3073, 0.1997, 0.3675, 0.1255},
7323 {0.2260, 0.1927, 0.1709, 0.4104},
7324 {0.2574, 0.2855, 0.2279, 0.2292}};
7325 Int4 acgt[4] = {0, 0, 0, 0};
7326 Int4 j;
7327 FloatHi prob_seqgsite = 0;
7328
7329 if (sequence == NULL || score == NULL){
7330 return;
7331 }
7332 *score = 0;
7333 if (seqlen < 18){
7334 return;
7335 }
7336 /* first get the freqs */
7337 for (j=0; j<seqlen; j++){
7338 if (sequence[j] != 4){
7339 acgt[sequence[j]]++;
7340 }
7341 }
7342 /* now calculate for each base the log, adding values to get the score */
7343 for (j=0; j<seqlen; j++){
7344 if (sequence[j] != 4 && a[j][sequence[j]] > 0){
7345 prob_seqgsite +=
7346 log10((a[j][sequence[j]])/((FloatHi)acgt[sequence[j]]/seqlen));
7347 }
7348 }
7349 *score = pow(10, prob_seqgsite);
7350 /* if (sequence[12] == 0 && sequence[13] == 2)
7351 *score += 0.5;
7352 */
7353 }
7354
7355 /***************************************************************************
7356 *
7357 * See the comment for SPI_is_acceptor for an explanation of how this
7358 * function works. The splice site frequency matrix is derived from
7359 * a nonredundant set of Arabidopsis splice sites provided by Chris Burge.
7360 *
7361 ***************************************************************************/
SPI_is_acceptor_plant(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)7362 static void SPI_is_acceptor_plant (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
7363 {
7364 FloatHi a[40][4] = {
7365 {0.2959, 0.1512, 0.1632, 0.3896},
7366 {0.2845, 0.1490, 0.1648, 0.4017},
7367 {0.2660, 0.1528, 0.1742, 0.4071},
7368 {0.2843, 0.1346, 0.1744, 0.4067},
7369 {0.2714, 0.1512, 0.1624, 0.4150},
7370 {0.2806, 0.1451, 0.1661, 0.4082},
7371 {0.2753, 0.1486, 0.1650, 0.4111},
7372 {0.2753, 0.1460, 0.1532, 0.4255},
7373 {0.2775, 0.1497, 0.1648, 0.4080},
7374 {0.2898, 0.1429, 0.1543, 0.4130},
7375 {0.2793, 0.1486, 0.1545, 0.4174},
7376 {0.2834, 0.1429, 0.1576, 0.4161},
7377 {0.2725, 0.1471, 0.1517, 0.4285},
7378 {0.2614, 0.1521, 0.1519, 0.4347},
7379 {0.2515, 0.1497, 0.1639, 0.4347},
7380 {0.2408, 0.1460, 0.1619, 0.4513},
7381 {0.2266, 0.1431, 0.1652, 0.4650},
7382 {0.2218, 0.1403, 0.1639, 0.4738},
7383 {0.2122, 0.1292, 0.1661, 0.4926},
7384 {0.1886, 0.1460, 0.1694, 0.4961},
7385 {0.1919, 0.1368, 0.1711, 0.5002},
7386 {0.1921, 0.1375, 0.1641, 0.5063},
7387 {0.1838, 0.1331, 0.1558, 0.5273},
7388 {0.1809, 0.1307, 0.1622, 0.5260},
7389 {0.1694, 0.1364, 0.1761, 0.5181},
7390 {0.2177, 0.1357, 0.1864, 0.4602},
7391 {0.2109, 0.1388, 0.1552, 0.4952},
7392 {0.2150, 0.1300, 0.1538, 0.5011},
7393 {0.1989, 0.1252, 0.1766, 0.4993},
7394 {0.1849, 0.1407, 0.1464, 0.5280},
7395 {0.1554, 0.0997, 0.1069, 0.6381},
7396 {0.2664, 0.0846, 0.3851, 0.2640},
7397 {0.0597, 0.6512, 0.0026, 0.2863},
7398 {0.9937, 0.0017, 0.0024, 0.0022},
7399 {0.0022, 0.0042, 0.9921, 0.0015},
7400 {0.2367, 0.0968, 0.5553, 0.1112},
7401 {0.2281, 0.1534, 0.1766, 0.4419},
7402 {0.2957, 0.1438, 0.2218, 0.3387},
7403 {0.2614, 0.1923, 0.2904, 0.2559},
7404 {0.2950, 0.1777, 0.2205, 0.3068}};
7405 Int4 acgt[4] = {0, 0, 0, 0};
7406 Int4 j;
7407 FloatHi prob_seqgsite = 0;
7408
7409 if (sequence == NULL || score == NULL){
7410 return;
7411 }
7412 *score = 0;
7413 if (seqlen < 40){
7414 return;
7415 }
7416 /* first get the freqs */
7417 for (j=0; j<seqlen; j++){
7418 if (sequence[j] != 4){
7419 acgt[sequence[j]]++;
7420 }
7421 }
7422 /* now calculate for each base the log, adding values to get the score */
7423 for (j=0; j<seqlen; j++){
7424 if (sequence[j] != 4 && a[j][sequence[j]] > 0){
7425 prob_seqgsite +=
7426 log10((a[j][sequence[j]])/((FloatHi)acgt[sequence[j]]/seqlen));
7427 }
7428 }
7429 *score = pow(10, prob_seqgsite);
7430 /* if (sequence[33] == 0 && sequence[34] == 2)
7431 *score += 0.5;
7432 */
7433 }
7434
7435 /***************************************************************************
7436 *
7437 * See the comment for SPI_is_acceptor for an explanation of how this
7438 * function works. The splice site frequency matrix is derived from
7439 * a nonredundant set of C. elegans splice sites provided by Chris Burge.
7440 *
7441 ***************************************************************************/
SPI_is_acceptor_cele(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)7442 static void SPI_is_acceptor_cele (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score)
7443 {
7444 FloatHi a[18][4] = {
7445 {0.4365, 0.1293, 0.0650, 0.3689},
7446 {0.3719, 0.1415, 0.0826, 0.4037},
7447 {0.3550, 0.1374, 0.0883, 0.4190},
7448 {0.3428, 0.1418, 0.0910, 0.4240},
7449 {0.3465, 0.1499, 0.0711, 0.4321},
7450 {0.3594, 0.1492, 0.0707, 0.4203},
7451 {0.3976, 0.1191, 0.0728, 0.4102},
7452 {0.4139, 0.0795, 0.0687, 0.4376},
7453 {0.2812, 0.0799, 0.0690, 0.5695},
7454 {0.0589, 0.0379, 0.0156, 0.8873},
7455 {0.0102, 0.0132, 0.0047, 0.9716},
7456 {0.0975, 0.1391, 0.0917, 0.6714},
7457 {0.0321, 0.8257, 0.0020, 0.1398},
7458 {0.9953, 0.0010, 0.0017, 0.0017},
7459 {0.0020, 0.0020, 0.9946, 0.0010},
7460 {0.3990, 0.1553, 0.3154, 0.1299},
7461 {0.2995, 0.1780, 0.1628, 0.3594},
7462 {0.2975, 0.2288, 0.1878, 0.2856}};
7463 Int4 acgt[4] = {0, 0, 0, 0};
7464 Int4 j;
7465 FloatHi prob_seqgsite = 0;
7466
7467 if (sequence == NULL || score == NULL){
7468 return;
7469 }
7470 *score = 0;
7471 if (seqlen < 18){
7472 return;
7473 }
7474 /* first get the freqs */
7475 for (j=0; j<seqlen; j++){
7476 if (sequence[j] != 4){
7477 acgt[sequence[j]]++;
7478 }
7479 }
7480 /* now calculate for each base the log, adding values to get the score */
7481 for (j=0; j<seqlen; j++){
7482 if (sequence[j] != 4 && a[j][sequence[j]] > 0){
7483 prob_seqgsite +=
7484 log10((a[j][sequence[j]])/((FloatHi)acgt[sequence[j]]/seqlen));
7485 }
7486 }
7487 *score = pow(10, prob_seqgsite);
7488 /* if (sequence[13] == 0 && sequence[14] == 2)
7489 *score += 0.5;
7490 */
7491 }
7492
7493
7494 /***************************************************************************
7495 *
7496 * See the comment for SPI_is_acceptor for an explanation of how this
7497 * function works. Note that the Dicty info is NOT corrected for current
7498 * sequence composition because the log(likehood)matrix itself corrected
7499 * for dicty genome composition. The data were retrieved
7500 * from the geneid Dd parameter file and used with the permission of
7501 * Roderic Guigo.Values were simply translated from log base 2 to log base 10
7502 *
7503 ***************************************************************************/
SPI_is_acceptor_dicty(Uint1Ptr sequence,Int4 seqlen,FloatHiPtr score)7504 static void SPI_is_acceptor_dicty (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score){
7505
7506 /* acgt */
7507 Int4 j = 0;
7508 FloatHi a[15][4] = {
7509 {-0.2171, -0.4463, -0.9154, 0.2974},
7510 {-0.0984, -0.3965, -1.3635, 0.2574},
7511 {0.0201, -0.5770, -1.5528, 0.3159},
7512 {-0.0880, -0.6470, -1.0716, 0.2993},
7513 {-0.0483, -0.4077, -0.8955, 0.2326},
7514 {0.1091, -0.4041, -0.9030, 0.2262},
7515 {0.0672, -0.3973, -1.0649, 0.1682},
7516 {0.0592, -0.5731, -1.0634, 0.1480},
7517 {0.1707, -0.9122, -1.0757, 0.1658},
7518 {0.0343, -0.6659, -1.4012, 0.2365},
7519 {0.1407, -0.4903, -1.3757, 0.0521},
7520 {0.1901,-0.7647,-0.9314, 0.1395},
7521 {-0.2300, -0.7210, -0.9030, 0.4531},
7522 {0.6020, -9999, -9999, -9999},
7523 {-9999,-9999, 0.6020, -9999}};
7524
7525 if (sequence == NULL || score == NULL){
7526 return;
7527 }
7528 *score = 0;
7529 if (seqlen < 15){
7530 return;
7531 }
7532 for (j = 0; j < seqlen; j++){
7533 if (sequence[j] != 4){
7534 *score += a[j][sequence[j]];
7535 }
7536 }
7537 *score = pow(10, *score);
7538 /* if (sequence[13] == 0 && sequence[14] == 2){
7539 *score += 0.5;
7540 }
7541 */
7542 }
7543
7544
7545
7546 /***************************************************************************
7547 *
7548 * SPI_RemoveConflictsAmongPieces looks at all the alignments for all
7549 * the fragments and removes overlapping alignment sets. The alignment set
7550 * with the greatest sequence range will be kept. The function cycles
7551 * through all the fragments, and if a fragment has an alignment, the
7552 * range of that alignment is compared with the range of all subsequent
7553 * fragments' alignments, and if there's an overlap, one of the overlapping
7554 * alignment sets is deleted. While not the most efficient design, this
7555 * function works well because most overlaps get deleted early and because
7556 * the searches are done with repeat masking, reducing the number of
7557 * overlaps.
7558 *
7559 ***************************************************************************/
SPI_RemoveConflictsAmongPieces(SPI_FragHerdPtr sfhp,Int4 fuzz)7560 static void SPI_RemoveConflictsAmongPieces(SPI_FragHerdPtr sfhp, Int4 fuzz)
7561 {
7562 Boolean conflict;
7563 Boolean done;
7564 Int4 i;
7565 Int4 j;
7566 Int4 len1;
7567 Int4 len2;
7568 Int4 start1;
7569 Int4 start2;
7570 Int4 stop1;
7571 Int4 stop2;
7572
7573 i = 0;
7574 while (i<sfhp->numfrags)
7575 {
7576 if (sfhp->sfparray[i]->sap != NULL)
7577 {
7578 SPI_GetNthSeqRangeInSASet(sfhp->sfparray[i]->sap, 2, &start1, &stop1);
7579 done = FALSE;
7580 for (j=i+1; j<sfhp->numfrags && !done; j++)
7581 {
7582 if (sfhp->sfparray[j]->sap != NULL)
7583 {
7584 SPI_GetNthSeqRangeInSASet(sfhp->sfparray[j]->sap, 2, &start2, &stop2);
7585 conflict = FALSE;
7586 if (start2 > start1 && stop2 < stop1)
7587 conflict = TRUE;
7588 else if (stop2 > start1 + fuzz && start2 < start1 + fuzz)
7589 conflict = TRUE;
7590 else if (start2 < stop1 - fuzz && stop2 > stop1 - fuzz)
7591 conflict = TRUE;
7592 else if (start2 < start1 && stop2 > stop1)
7593 conflict = TRUE;
7594 if (conflict)
7595 {
7596 len1 = stop1 - start1 + 1;
7597 len2 = stop2 - start2 + 1;
7598 if (len2 > len1)
7599 {
7600 done = TRUE;
7601 SeqAlignSetFree(sfhp->sfparray[i]->sap);
7602 sfhp->sfparray[i]->sap = NULL;
7603 } else if (len1 >= len2)
7604 {
7605 SeqAlignSetFree(sfhp->sfparray[j]->sap);
7606 sfhp->sfparray[j]->sap = NULL;
7607 }
7608 }
7609 }
7610 }
7611 }
7612 i++;
7613 }
7614 }
7615
7616 /***************************************************************************
7617 *
7618 * SPI_OrderPieces sorts the fragments according to their alignment
7619 * position (position on the mRNA sequence) as well as by their original
7620 * fragment order (lgroup, group, and order). Fragments without
7621 * alignments are placed at the beginning of the set. At the end of the
7622 * sort, all the initial fragments have no alignments and all the fragments
7623 * at the end are in order along the mRNA sequence, making filling in the
7624 * holes in the alignment much easier.
7625 *
7626 ***************************************************************************/
SPI_OrderPieces(SPI_FragHerdPtr sfhp,BioseqPtr bsp_mrna)7627 static void SPI_OrderPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_mrna)
7628 {
7629 Int4 i;
7630 Int4 j;
7631 SPI_FragInfoPtr sfi;
7632 SPI_FragInfoPtr PNTR sfi_array;
7633 SPI_FragPtr sfp;
7634 SPI_FragPtr PNTR sfp_array;
7635
7636 if (sfhp == NULL || bsp_mrna == NULL)
7637 return;
7638 sfi_array = (SPI_FragInfoPtr PNTR)MemNew((sfhp->numfrags)*sizeof(SPI_FragInfo));
7639 for (i=0; i<sfhp->numfrags; i++)
7640 {
7641 sfi = (SPI_FragInfoPtr)MemNew(sizeof(SPI_FragInfo));
7642 sfp = sfhp->sfparray[i];
7643 if (sfp->sap != NULL)
7644 {
7645 if (sfp->sap->saip == NULL)
7646 AlnMgr2IndexLite(sfp->sap);
7647 SPI_GetNthSeqRangeInSASet(sfp->sap, 2, &sfi->mrnastart, &sfi->mrnastop);
7648 } else
7649 sfi->mrnastart = sfi->mrnastop = -1;
7650 sfi->sfpnum = i;
7651 sfi->position_orig = sfp->position_orig;
7652 sfi->fragnum = sfp->fragnum;
7653 sfi_array[i] = sfi;
7654 }
7655 HeapSort(sfi_array, i, sizeof(SPI_FragInfoPtr), SPI_CompareFragInfo);
7656 j = 0;
7657 for (i=0; i<sfhp->numfrags; i++)
7658 {
7659 if (sfi_array[i]->mrnastart != -1)
7660 {
7661 sfhp->sfparray[sfi_array[i]->sfpnum]->position_mrna = j;
7662 j++;
7663 } else
7664 sfhp->sfparray[sfi_array[i]->sfpnum]->position_mrna = -1;
7665 }
7666 sfp_array = (SPI_FragPtr PNTR)MemNew((sfhp->numfrags)*sizeof(SPI_FragPtr));
7667 for (i=0; i<sfhp->numfrags; i++)
7668 {
7669 sfp_array[i] = sfhp->sfparray[sfi_array[i]->sfpnum];
7670 }
7671 MemFree(sfhp->sfparray);
7672 sfhp->sfparray = sfp_array;
7673 for (i=0; i<sfhp->numfrags; i++)
7674 {
7675 MemFree(sfi_array[i]);
7676 }
7677 MemFree(sfi_array);
7678 }
7679
7680 /***************************************************************************
7681 *
7682 * SPI_CompareFragInfo is the HeapSort callback for SPI_OrderPieces. It
7683 * compares the alignments of two fragments and puts the fragment that
7684 * is most 5' on the mRNA first. Fragments without alignments are put in
7685 * their original order, before all the fragments with alignments. If two
7686 * fragments have the same mRNA position, they are sorted secondarily by
7687 * their original fragment position.
7688 *
7689 ***************************************************************************/
SPI_CompareFragInfo(VoidPtr ptr1,VoidPtr ptr2)7690 static int LIBCALLBACK SPI_CompareFragInfo(VoidPtr ptr1, VoidPtr ptr2)
7691 {
7692 SPI_FragInfoPtr sfi1;
7693 SPI_FragInfoPtr sfi2;
7694
7695 if (ptr1 != NULL && ptr2 != NULL)
7696 {
7697 sfi1 = *((SPI_FragInfoPtr PNTR)ptr1);
7698 sfi2 = *((SPI_FragInfoPtr PNTR)ptr2);
7699 /* this function orders by mRNA position, secondarily by original position */
7700 if (sfi1->mrnastart != -1 && sfi2->mrnastart != -1)
7701 {
7702 if (sfi1->mrnastart < sfi2->mrnastart)
7703 return -1;
7704 else if (sfi1->mrnastart > sfi2->mrnastart)
7705 return 1;
7706 else if (sfi1->mrnastop > sfi2->mrnastop)
7707 return -1;
7708 else if (sfi1->mrnastop < sfi2->mrnastop)
7709 return 1;
7710 else
7711 return 0;
7712 }
7713 /* put things with no mRNA order first */
7714 if (sfi1->mrnastart != -1 && sfi2->mrnastart == -1)
7715 return 1;
7716 if (sfi1->mrnastart == -1 && sfi2->mrnastart != -1)
7717 return -1;
7718 if (sfi1->position_orig->lgroup != 0 && sfi1->position_orig->lgroup == sfi2->position_orig->lgroup)
7719 {
7720 if (sfi1->position_orig->group < sfi2->position_orig->group)
7721 return -1;
7722 else if (sfi1->position_orig->group > sfi2->position_orig->group)
7723 return 1;
7724 else
7725 {
7726 if (sfi1->position_orig->order < sfi2->position_orig->order)
7727 return -1;
7728 else
7729 return 1;
7730 }
7731 }
7732 /* if fragments are in the same group, keep them in order */
7733 if (sfi1->position_orig->group == sfi2->position_orig->group)
7734 {
7735 if (sfi1->position_orig->order < sfi2->position_orig->order)
7736 return -1;
7737 else
7738 return 1;
7739 }
7740 if (sfi1->position_orig->group < sfi2->position_orig->group)
7741 return -1;
7742 else if (sfi2->position_orig->group > sfi1->position_orig->group)
7743 return 1;
7744 if (sfi1->fragnum < sfi2->fragnum)
7745 return -1;
7746 else
7747 return 1;
7748 }
7749 return 0;
7750 }
7751
7752 /***************************************************************************
7753 *
7754 * SPI_ConnectAlnPieces is analogous to SPI_ConnectAln for finished
7755 * sequence; it fills in the gaps in the mRNA-to-draft alignment. Since
7756 * the genomic sequence is in fragments, the job is a little trickier
7757 * here. The function first calls SPI_ConnectAln on each set of
7758 * alignments for each fragment, to fill in internal gaps between those
7759 * alignments. Then the alignment sets are all examined (they should not
7760 * overlap at this point, but they usually have gaps between them) and any
7761 * gaps between the alignment sets are filled in by first searching in
7762 * the fragments containing alignments adjacent to the gaps, then by
7763 * looking in all "nearby" fragments (as defined by SPI_GetNearbyFrags), and
7764 * finally by looking in all fragments. Since many of the spidey functions
7765 * assume that the genomic sequence is always the plus strand, and the
7766 * draft sequence functions all deal with alignments on the plus strand
7767 * of the mRNA and either strand of the genomic sequence, there are many
7768 * places in this function where the strands of an alignment must be
7769 * reversed before and after a function call if the alignment is on the
7770 * minus strand of the genomic sequence.
7771 *
7772 ***************************************************************************/
SPI_ConnectAlnPieces(SPI_FragHerdPtr sfhp,BioseqPtr bsp_contig,BioseqPtr bsp_mrna,SPI_OptionsPtr spot)7773 static Boolean SPI_ConnectAlnPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_contig, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
7774 {
7775 AMAlignIndex2Ptr amaip;
7776 Int4 c;
7777 Int4 curr;
7778 Boolean done;
7779 Boolean found;
7780 Int4 gapsize;
7781 Int4 i;
7782 Int4 j = 0;
7783 Boolean minus;
7784 Int4 n;
7785 BLAST_OptionsBlkPtr options;
7786 Int4 orderedstart;
7787 Int4 prevstart;
7788 SeqAlignPtr salp;
7789 SeqAlignPtr salp_tmp;
7790 SeqAlignPtr salp_prev;
7791 SeqAlignPtr sap;
7792 SeqAlignPtr sap_b1;
7793 SeqAlignPtr sap_b2;
7794 SeqAlignPtr sap_new1;
7795 SeqAlignPtr sap_new2;
7796 SeqAlignPtr sap_tmp;
7797 SPI_FragPtr sfp;
7798 SPI_FragPtr sfpcurr;
7799 SPI_FragPtr PNTR sfpnearby;
7800 SPI_FragPtr sfpprev;
7801 SeqLocPtr slp_gen;
7802 SeqLocPtr slp_mrna;
7803 Int4 start_b;
7804 Int4 start1;
7805 Int4 start2;
7806 Int4 start3;
7807 Int4 start4;
7808 Int4 stop_b;
7809 Int4 stop1;
7810 Int4 stop2;
7811 Int4 stop3;
7812 Int4 stop4;
7813 Uint1 strand;
7814 SPI_FragPtr PNTR tmparray;
7815
7816 i = 0;
7817 orderedstart = -1;
7818 /* figure out which sequences have ordering information (by mRNA position) so far */
7819 while (orderedstart == -1 && i < sfhp->numfrags)
7820 {
7821 if (sfhp->sfparray[i]->position_mrna != -1)
7822 orderedstart = i;
7823 i++;
7824 }
7825 if (orderedstart == -1) /* no fragment has alignments */
7826 return FALSE;
7827 /* fill in internal gaps for each contig-to-mRNA alignment */
7828 for (i=orderedstart; i<sfhp->numfrags; i++)
7829 {
7830 if (sfhp->sfparray[i]->sap != NULL)
7831 {
7832 if ((AlnMgr2GetNthStrand(sfhp->sfparray[i]->sap, 1)) == Seq_strand_minus)
7833 {
7834 minus = TRUE;
7835 salp = (SeqAlignPtr)(sfhp->sfparray[i]->sap->segs);
7836 while (salp != NULL)
7837 {
7838 salp_tmp = salp->next;
7839 salp->next = NULL;
7840 SAIndex2Free2(salp->saip);
7841 salp->saip = NULL;
7842 SeqAlignListReverseStrand(salp);
7843 AlnMgr2IndexSingleChildSeqAlign(salp);
7844 salp->next = salp_tmp;
7845 salp = salp_tmp;
7846 }
7847 } else
7848 minus = FALSE;
7849 if (!SPI_ConnectAln(sfhp->sfparray[i]->sap, spot, NULL, FALSE, TRUE))
7850 {
7851 SeqAlignSetFree(sfhp->sfparray[i]->sap);
7852 sfhp->sfparray[i]->sap = NULL;
7853 }
7854 if (minus && sfhp->sfparray[i]->sap != NULL)
7855 {
7856 salp = (SeqAlignPtr)(sfhp->sfparray[i]->sap);
7857 while (salp != NULL)
7858 {
7859 salp_tmp = salp->next;
7860 salp->next = NULL;
7861 SAIndex2Free2(salp->saip);
7862 salp->saip = NULL;
7863 SeqAlignListReverseStrand(salp);
7864 AlnMgr2IndexSingleChildSeqAlign(salp);
7865 salp->next = salp_tmp;
7866 salp = salp_tmp;
7867 }
7868 }
7869 }
7870 if (i != sfhp->numfrags-1)
7871 sfhp->sfparray[i]->next = sfhp->sfparray[i+1];
7872 }
7873 tmparray = (SPI_FragPtr PNTR)MemNew((sfhp->numfrags)*sizeof(SPI_FragPtr));
7874 prevstart = -1;
7875 done = FALSE;
7876 sfpcurr = sfhp->sfparray[orderedstart];
7877 sfpprev = NULL;
7878 curr = orderedstart;
7879 start3 = stop3 = -1;
7880 sfpnearby = NULL;
7881 /* fill in gaps between contig alignments */
7882 while (!done)
7883 {
7884 sap = sfpcurr->sap;
7885 if (sap != NULL)
7886 {
7887 SPI_GetNthSeqRangeInSASet(sap, 2, &start2, &stop2);
7888 if ((gapsize = spi_isa_gap(start2, prevstart, Seq_strand_plus)) >= SPI_TEENYEXON)
7889 {
7890 /* first look in the same piece and the ones that are supposed to be adjacent */
7891 strand = AlnMgr2GetNthStrand(sfpcurr->sap, 1);
7892 if (strand == Seq_strand_minus)
7893 {
7894 minus = TRUE;
7895 salp = (SeqAlignPtr)(sfpcurr->sap->segs);
7896 while (salp != NULL)
7897 {
7898 salp_tmp = salp->next;
7899 salp->next = NULL;
7900 SAIndex2Free2(salp->saip);
7901 salp->saip = NULL;
7902 SeqAlignListReverseStrand(salp);
7903 AlnMgr2IndexSingleChildSeqAlign(salp);
7904 salp->next = salp_tmp;
7905 salp = salp_tmp;
7906 }
7907 } else
7908 minus = FALSE;
7909 SPI_GetNthSeqRangeInSASet(sfpcurr->sap, 1, &start1, &stop1);
7910 sap_new1 = sap_new2 = NULL;
7911 if (!minus)
7912 sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpcurr->start, start1, prevstart, start2, Seq_strand_minus, spot);
7913 else
7914 sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpcurr->start, start1, start2, prevstart, Seq_strand_plus, spot);
7915 if (sap_new1 != NULL)
7916 {
7917 SPI_GetNthSeqRangeInSASet(sap_new1, 2, &start2, &stop2);
7918 sap_new1->next = (SeqAlignPtr)(sfpcurr->sap->segs);
7919 sfpcurr->sap->segs = (Pointer)(sap_new1);
7920 AlnMgr2ReIndexSeqAlign(sfpcurr->sap);
7921 SPI_RemoveInconsistentAlnsFromSet(sfpcurr->sap, SPI_TEENYEXON, 2, SPI_LEFT);
7922 }
7923 if ((spi_isa_gap(start2, prevstart, Seq_strand_plus)) > SPI_TEENYEXON)
7924 /* look in fragments in the same group or lgroup, */
7925 /* up to the ones that already have hits */
7926 {
7927 if (sfpnearby != NULL)
7928 {
7929 MemFree(sfpnearby);
7930 sfpnearby = NULL;
7931 }
7932 j = SPI_GetNearbyFrags(sfpcurr, curr, &sfpnearby, sfhp, minus);
7933 found = FALSE;
7934 for (n=0; n<j && !found; n++)
7935 {
7936 if (sfpnearby[n]->sap != NULL)
7937 {
7938 found = TRUE;
7939 strand = AlnMgr2GetNthStrand(sfpnearby[n]->sap, 1);
7940 if (strand == Seq_strand_minus)
7941 {
7942 salp = (SeqAlignPtr)(sfpnearby[n]->sap->segs);
7943 while (salp != NULL)
7944 {
7945 salp_tmp = salp->next;
7946 salp->next = NULL;
7947 SAIndex2Free2(salp->saip);
7948 salp->saip = NULL;
7949 SeqAlignListReverseStrand(salp);
7950 AlnMgr2IndexSingleChildSeqAlign(salp);
7951 salp->next = salp_tmp;
7952 salp = salp_tmp;
7953 }
7954 SPI_GetNthSeqRangeInSASet(sfpnearby[n]->sap, 1, &start3, &stop3);
7955 SPI_GetNthSeqRangeInSASet(sfpnearby[n]->sap, 2, &start4, &stop4);
7956 sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpnearby[n]->start, start4, stop4, start2, strand, spot);
7957 if (sap_new1 != NULL)
7958 {
7959 sap_new1->next = (SeqAlignPtr)(sfpnearby[n]->sap->segs);
7960 sfpnearby[n]->sap->segs = (Pointer)sap_new1;
7961 AMAlignIndex2Free2(sfpnearby[n]->sap->saip);
7962 sfpnearby[n]->sap->saip = NULL;
7963 AlnMgr2IndexLite(sfpnearby[n]->sap);
7964 SPI_RemoveInconsistentAlnsFromSet(sfpnearby[n]->sap, SPI_TEENYEXON, 2, SPI_LEFT);
7965 }
7966 salp = (SeqAlignPtr)(sfpnearby[n]->sap->segs);
7967 while (salp != NULL)
7968 {
7969 salp_tmp = salp->next;
7970 salp->next = NULL;
7971 SAIndex2Free2(salp->saip);
7972 salp->saip = NULL;
7973 SeqAlignListReverseStrand(salp);
7974 AlnMgr2IndexSingleChildSeqAlign(salp);
7975 salp->next = salp_tmp;
7976 salp = salp_tmp;
7977 }
7978 } else
7979 {
7980 SPI_GetNthSeqRangeInSASet(sfpnearby[n]->sap, 1, &start3, &stop3);
7981 SPI_GetNthSeqRangeInSASet(sfpnearby[n]->sap, 2, &start4, &stop4);
7982 sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, stop4, sfpnearby[n]->stop, stop3, start2, Seq_strand_plus, spot);
7983 if (sap_new1 != NULL)
7984 {
7985 sap_new1->next = (SeqAlignPtr)(sfpnearby[n]->sap->segs);
7986 sfpnearby[n]->sap->segs = (Pointer)sap_new1;
7987 AMAlignIndex2Free2(sfpnearby[n]->sap->saip);
7988 sfpnearby[n]->sap->saip = NULL;
7989 AlnMgr2IndexLite(sfpnearby[n]->sap);
7990 SPI_RemoveInconsistentAlnsFromSet(sfpnearby[n]->sap, SPI_TEENYEXON, 2, SPI_LEFT);
7991 }
7992 }
7993 } else
7994 {
7995 sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpnearby[n]->start, sfpnearby[n]->stop, prevstart, start2, Seq_strand_plus, spot);
7996 if (sap_new1 != NULL)
7997 {
7998 sap_tmp = sap_new1;
7999 while (sap_tmp->next != NULL)
8000 {
8001 sap_tmp = sap_tmp->next;
8002 }
8003 sap_tmp->next = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpnearby[n]->start, sfpnearby[n]->stop, start2, prevstart, Seq_strand_minus, spot);
8004 } else
8005 sap_new1 = SPI_FillInIntron(bsp_contig->id, bsp_mrna->id, sfpnearby[n]->start, sfpnearby[n]->stop, start2, prevstart, Seq_strand_minus, spot);
8006 if (sap_new1 != NULL)
8007 {
8008 AMAlignIndex2Free2(sap_new1->saip);
8009 sap_new1->saip = NULL;
8010 AlnMgr2IndexLite(sap_new1);
8011 sfpnearby[n]->sap = sap_new1;
8012 SPI_RemoveInconsistentAlnsFromSet(sfpnearby[n]->sap, SPI_TEENYEXON, 2, SPI_LEFT);
8013 }
8014 }
8015 }
8016 }
8017 SPI_CleanupAndGetNewmRNARange(sfpnearby, j, &start3, &stop3);
8018 if (start3 != -1 && stop3 != -1)
8019 {
8020 start2 = start3;
8021 stop2 = stop3;
8022 }
8023 MemFree(sfpnearby);
8024 if ((spi_isa_gap(start2, prevstart, Seq_strand_plus)) > SPI_MINBLASTSIZE + 2)
8025 /* now look in all the fragments that don't have hits yet */
8026 {
8027 slp_mrna = SeqLocIntNew(prevstart+1, start2-1, Seq_strand_plus, bsp_mrna->id);
8028 slp_gen = SeqLocIntNew(spot->from, spot->to, Seq_strand_plus, bsp_contig->id);
8029 options = BLASTOptionNew("blastn", TRUE);
8030 options->wordsize = 7;
8031 options->filter_string = StringSave("m L");
8032 options->expect_value = spot->secpasseval;
8033 options->query_lcase_mask = spot->lcaseloc;
8034 if (spot->interspecies)
8035 {
8036 options->gap_x_dropoff_final = 100;
8037 options->gap_open = 4;
8038 options->gap_extend = 1;
8039 options->penalty = -1;
8040 }
8041 sap_b1 = BlastTwoSequencesByLoc(slp_mrna, slp_gen, "blastn", options);
8042 BLASTOptionDelete(options);
8043 SeqLocFree(slp_gen);
8044 slp_gen = SeqLocIntNew(spot->from, spot->to, Seq_strand_minus, bsp_contig->id);
8045 options = BLASTOptionNew("blastn", TRUE);
8046 options->wordsize = 7;
8047 options->filter_string = StringSave("m L");
8048 options->expect_value = spot->secpasseval;
8049 options->query_lcase_mask = spot->lcaseloc;
8050 if (spot->interspecies)
8051 {
8052 options->gap_x_dropoff_final = 100;
8053 options->gap_open = 4;
8054 options->gap_extend = 1;
8055 options->penalty = -1;
8056 }
8057 sap_b2 = BlastTwoSequencesByLoc(slp_mrna, slp_gen, "blastn", options);
8058 BLASTOptionDelete(options);
8059 SeqAlignListReverseStrand(sap_b2);
8060 SeqLocFree(slp_gen);
8061 SeqLocFree(slp_mrna);
8062 if (sap_b1 != NULL)
8063 {
8064 sap_tmp = sap_b1;
8065 while (sap_tmp->next != NULL)
8066 {
8067 sap_tmp = sap_tmp->next;
8068 }
8069 sap_tmp->next = sap_b2;
8070 } else
8071 sap_b1 = sap_b2;
8072 SPI_flip_sa_list(sap_b1);
8073 if (sap_b1 != NULL)
8074 {
8075 AlnMgr2SortAlnSetByNthRowPos(sap_b1, 1);
8076 c = 0;
8077 amaip = (AMAlignIndex2Ptr)(sap_b1->saip);
8078 AlnMgr2GetNthSeqRangeInSA(amaip->saps[0], 1, &start_b, &stop_b);
8079 for (i=0; i<orderedstart && c<amaip->numsaps; i++)
8080 {
8081 salp_tmp = salp_prev = NULL;
8082 sfp = sfhp->sfparray[i];
8083 while (sfp->start <= start_b && sfp->stop >= start_b && c<amaip->numsaps)
8084 {
8085 if (salp_tmp == NULL)
8086 salp_tmp = salp_prev = SeqAlignDup(amaip->saps[c]);
8087 else
8088 {
8089 salp_prev->next = SeqAlignDup(amaip->saps[c]);
8090 salp_prev = salp_prev->next;
8091 }
8092 c++;
8093 if (c<amaip->numsaps)
8094 AlnMgr2GetNthSeqRangeInSA(amaip->saps[c], 1, &start_b, &stop_b);
8095 }
8096 if (salp_tmp != NULL)
8097 {
8098 AlnMgr2IndexLite(salp_tmp);
8099 SPI_RemoveInconsistentAlnsFromSet(salp_tmp, SPI_FUZZ, 2, SPI_LEFT);
8100 SeqAlignSetFree(sfp->sap);
8101 sfp->sap = salp_tmp;
8102 if (!SPI_ConnectAln(sfp->sap, spot, NULL, FALSE, TRUE))
8103 return FALSE;
8104 /* change all alignments to be on the plus strand of the mRNA */
8105 strand = AlnMgr2GetNthStrand((SeqAlignPtr)(salp_tmp->segs), 2);
8106 if (strand == Seq_strand_minus)
8107 SeqAlignListReverseStrand((SeqAlignPtr)(salp_tmp->segs));
8108 }
8109 }
8110 }
8111 for (i=0; i<orderedstart; i++)
8112 {
8113 if (sfhp->sfparray[i]->sap != NULL)
8114 j++;
8115 }
8116 sfpnearby = (SPI_FragPtr PNTR)MemNew(j*sizeof(SPI_FragPtr));
8117 j = 0;
8118 for (i=0; i<orderedstart; i++)
8119 {
8120 if (sfhp->sfparray[i]->sap != NULL)
8121 {
8122 sfpnearby[j] = sfhp->sfparray[i];
8123 j++;
8124 }
8125 }
8126 SPI_CleanupAndGetNewmRNARange(sfpnearby, j, &start3, &stop3);
8127 }
8128 }
8129 prevstart = stop2;
8130 }
8131 sfpprev = sfpcurr;
8132 curr++;
8133 if (curr == sfhp->numfrags)
8134 done = TRUE;
8135 else
8136 sfpcurr = sfhp->sfparray[curr];
8137 }
8138 return TRUE;
8139 }
8140
8141 /***************************************************************************
8142 *
8143 * SPI_CleanupAndGetNewmRNARange looks through all alignment sets of a
8144 * group of fragments and removes overlapping alignment sets. Once the
8145 * group of fragments is consistent, SPI_CleanupAndGetNewmRNARange gets
8146 * the range of the mRNA sequence covered by all alignment sets of the
8147 * fragment group.
8148 *
8149 ***************************************************************************/
SPI_CleanupAndGetNewmRNARange(SPI_FragPtr PNTR sfpnearby,Int4 n,Int4Ptr start,Int4Ptr stop)8150 static void SPI_CleanupAndGetNewmRNARange(SPI_FragPtr PNTR sfpnearby, Int4 n, Int4Ptr start, Int4Ptr stop)
8151 {
8152 Boolean conflict;
8153 Boolean done;
8154 Int4 i;
8155 Int4 len1;
8156 Int4 len2;
8157 Int4 numconsistent;
8158 Int4 numsaps1;
8159 Int4 numsaps2;
8160 SPI_FragPtr sfp;
8161 SPI_FragPtr sfp_head;
8162 SPI_FragPtr sfp_prev;
8163 Int4 start_m;
8164 Int4 start_m1;
8165 Int4 stop_m;
8166 Int4 stop_m1;
8167 Int4 tmpstart;
8168 Int4 tmpstop;
8169
8170 numconsistent = 0;
8171 sfp_head = sfp_prev = NULL;
8172 for (i=0; i<n; i++) /* first make the set self-consistent by removing overlapping */
8173 { /* sets of alignments among the fragments */
8174 sfpnearby[i]->next = NULL;
8175 sfp = sfp_head;
8176 if (sfpnearby[i]->sap != NULL)
8177 {
8178 SPI_GetNthSeqRangeInSASet(sfpnearby[i]->sap, 2, &start_m, &stop_m);
8179 done = FALSE;
8180 conflict = FALSE;
8181 while (sfp != NULL && !done)
8182 {
8183 SPI_GetNthSeqRangeInSASet(sfp->sap, 2, &start_m1, &stop_m1);
8184 if (start_m1 > start_m && stop_m1 < stop_m)
8185 conflict = TRUE;
8186 else if (stop_m1 > start_m + SPI_TEENYEXON && start_m1 < start_m)
8187 conflict = TRUE;
8188 else if (start_m1 < stop_m - SPI_TEENYEXON && stop_m1 > stop_m)
8189 conflict = TRUE;
8190 else if (start_m1 < start_m && stop_m1 > stop_m)
8191 conflict = TRUE;
8192 if (conflict == TRUE) /* keep the longer of the two alignment sets */
8193 {
8194 done = TRUE;
8195 len1 = SPI_GetNthSeqLenInSASet(sfpnearby[i]->sap, 2, &numsaps1);
8196 len2 = SPI_GetNthSeqLenInSASet(sfp->sap, 2, &numsaps2);
8197 if (len1 > len2)
8198 {
8199 SeqAlignSetFree(sfp->sap);
8200 sfp->sap = NULL;
8201 /* new sfp takes the place of the conflicting one */
8202 if (sfp_prev != NULL)
8203 {
8204 sfpnearby[i]->next = sfp_prev->next;
8205 sfp_prev->next = sfpnearby[i];
8206 } else
8207 {
8208 sfpnearby[i]->next = sfp_head->next;
8209 sfp_head = sfpnearby[i];
8210 }
8211 } else /* new one gets its seqalign deleted */
8212 {
8213 SeqAlignSetFree(sfpnearby[i]->sap);
8214 sfpnearby[i]->sap = NULL;
8215 }
8216 } else
8217 {
8218 sfp_prev = sfp;
8219 sfp = sfp->next;
8220 }
8221 }
8222 if (!conflict) /* add the new one to the list */
8223 {
8224 sfpnearby[i]->next = sfp_head;
8225 sfp_head = sfpnearby[i];
8226 numconsistent++;
8227 }
8228 }
8229 }
8230 /* then get the start and stop of the mRNA across the set */
8231 if (numconsistent == 0) /* shouldn't ever happen! */
8232 {
8233 *start = -1;
8234 *stop = -1;
8235 } else
8236 {
8237 sfp = sfp_head;
8238 *start = -1;
8239 *stop = -1;
8240 while (sfp != NULL)
8241 {
8242 if (sfp->sap != NULL)
8243 {
8244 SPI_GetNthSeqRangeInSASet(sfp->sap, 2, &tmpstart, &tmpstop);
8245 if (tmpstart < *start || *start == -1)
8246 *start = tmpstart;
8247 if (tmpstop > *stop)
8248 *stop = tmpstop;
8249 }
8250 sfp = sfp->next;
8251 }
8252 sfpnearby[0] = sfp_head;
8253 }
8254 }
8255
8256 /***************************************************************************
8257 *
8258 * SPI_GetNearbyFrags takes a fragment herd, a fragment which is the
8259 * target (n is the number of the target fragment in the herd) and a
8260 * SPI_FragPtr **, and fills in the ptrptr with a linked list of fragments
8261 * that are in the same group or lgroup as the target fragment. These
8262 * fragments are supposedly near the target fragment in the genomic
8263 * sequence, and should be searched first for pieces missing from the
8264 * alignment in the target fragment.
8265 *
8266 ***************************************************************************/
SPI_GetNearbyFrags(SPI_FragPtr sfptarget,Int4 n,SPI_FragPtr ** ptrptr,SPI_FragHerdPtr sfhp,Boolean minus)8267 static Int4 SPI_GetNearbyFrags(SPI_FragPtr sfptarget, Int4 n, SPI_FragPtr ** ptrptr, SPI_FragHerdPtr sfhp, Boolean minus)
8268 {
8269 Boolean found;
8270 Int4 i;
8271 Int4 j;
8272 SPI_FragPtr sfp;
8273 SPI_FragPtr sfp_curr;
8274 SPI_FragPtr sfp_head;
8275 SPI_FragPtr sfp_prev;
8276 SPI_FragPtr PNTR sfpnearby;
8277
8278 j = 0;
8279 sfp_head = NULL;
8280 for (i=0; i<sfhp->numfrags; i++)
8281 {
8282 if (i != n)
8283 {
8284 sfp = sfhp->sfparray[i];
8285 sfp->next = NULL;
8286 if (sfp->position_orig->lgroup == sfptarget->position_orig->lgroup)
8287 {
8288 if (!minus)
8289 {
8290 if (sfp->position_orig->group < sfptarget->position_orig->group && sfp->position_orig->lgroup != 0)
8291 {
8292 j++;
8293 sfp_prev = NULL;
8294 if (sfp_head == NULL)
8295 sfp_head = sfp;
8296 else
8297 {
8298 sfp_prev = NULL;
8299 sfp_curr = sfp_head;
8300 found = FALSE;
8301 while (sfp_curr != NULL && !found)
8302 {
8303 if (sfp->position_orig->group > sfp_curr->position_orig->group || (sfp->position_orig->group == sfp_curr->position_orig->group && sfp->position_orig->order > sfp_curr->position_orig->order))
8304 found = TRUE;
8305 else
8306 {
8307 sfp_prev = sfp_curr;
8308 sfp_curr = sfp_curr->next;
8309 }
8310 }
8311 if (sfp_prev != NULL)
8312 {
8313 sfp->next = sfp_prev->next;
8314 sfp_prev->next = sfp;
8315 } else
8316 {
8317 sfp->next = sfp_head;
8318 sfp_head = sfp;
8319 }
8320 }
8321 } else if (sfp->position_orig->group == sfptarget->position_orig->group && sfp->position_orig->order < sfptarget->position_orig->order)
8322 {
8323 j++;
8324 if (sfp_head == NULL)
8325 sfp_head = sfp;
8326 else
8327 {
8328 sfp_prev = NULL;
8329 sfp_curr = sfp_head;
8330 found = FALSE;
8331 while (sfp_curr != NULL && !found)
8332 {
8333 if (sfp->position_orig->group > sfp_curr->position_orig->group || (sfp->position_orig->group == sfp_curr->position_orig->group && sfp->position_orig->order > sfp_curr->position_orig->order))
8334 found = TRUE;
8335 else
8336 {
8337 sfp_prev = sfp_curr;
8338 sfp_curr = sfp_curr->next;
8339 }
8340 }
8341 if (sfp_prev != NULL)
8342 {
8343 sfp->next = sfp_prev->next;
8344 sfp_prev->next = sfp;
8345 } else
8346 {
8347 sfp->next = sfp_head;
8348 sfp_head = sfp;
8349 }
8350 }
8351 }
8352 } else
8353 {
8354 if (sfp->position_orig->group > sfptarget->position_orig->group && sfp->position_orig->lgroup != 0)
8355 {
8356 j++;
8357 sfp_prev = NULL;
8358 if (sfp_head == NULL)
8359 sfp_head = sfp;
8360 else
8361 {
8362 sfp_prev = NULL;
8363 sfp_curr = sfp_head;
8364 found = FALSE;
8365 while (sfp_curr != NULL && !found)
8366 {
8367 if (sfp->position_orig->group < sfp_curr->position_orig->group || (sfp->position_orig->group == sfp_curr->position_orig->group && sfp->position_orig->order < sfp_curr->position_orig->order))
8368 found = TRUE;
8369 else
8370 {
8371 sfp_prev = sfp_curr;
8372 sfp_curr = sfp_curr->next;
8373 }
8374 }
8375 if (sfp_prev != NULL)
8376 {
8377 sfp->next = sfp_prev->next;
8378 sfp_prev->next = sfp;
8379 } else
8380 {
8381 sfp->next = sfp_head;
8382 sfp_head = sfp;
8383 }
8384 }
8385 } else if (sfp->position_orig->group == sfptarget->position_orig->group && sfp->position_orig->order > sfptarget->position_orig->order)
8386 {
8387 j++;
8388 if (sfp_head == NULL)
8389 sfp_head = sfp;
8390 else
8391 {
8392 sfp_prev = NULL;
8393 sfp_curr = sfp_head;
8394 found = FALSE;
8395 while (sfp_curr != NULL && !found)
8396 {
8397 if (sfp->position_orig->group < sfp_curr->position_orig->group || (sfp->position_orig->group == sfp_curr->position_orig->group && sfp->position_orig->order < sfp_curr->position_orig->order))
8398 found = TRUE;
8399 else
8400 {
8401 sfp_prev = sfp_curr;
8402 sfp_curr = sfp_curr->next;
8403 }
8404 }
8405 if (sfp_prev != NULL)
8406 {
8407 sfp->next = sfp_prev->next;
8408 sfp_prev->next = sfp;
8409 } else
8410 {
8411 sfp->next = sfp_head;
8412 sfp_head = sfp;
8413 }
8414 }
8415 }
8416 }
8417 }
8418 }
8419 }
8420 if (j == 0)
8421 return 0;
8422 sfpnearby = (SPI_FragPtr PNTR)MemNew(j*sizeof(SPI_FragPtr));
8423 for (i=0, sfp = sfp_head; i<j && sfp!=NULL; i++, sfp = sfp->next)
8424 {
8425 sfpnearby[i] = sfp;
8426 }
8427 *ptrptr = sfpnearby;
8428 return j;
8429 }
8430
8431 /***************************************************************************
8432 *
8433 * SPI_AdjustSplicesInPieces first calls SPI_AdjustForSplice on each
8434 * fragment's alignment set (first reversing those fragments' alignments
8435 * that are on the minus strand of the genomic sequence, as the draft
8436 * functions expect the mRNA to be on the plus strand but the finished
8437 * functions expect the genomic sequence to be on the plus strand). Next,
8438 * it calls SPI_AdjustEndsOfPieces for each adjacent pair of fragments;
8439 * this function adjusts the initial and terminal exons of the adjacent
8440 * pieces so that they abut exactly on the mRNA and they are next to
8441 * acceptable splice sites.
8442 *
8443 ***************************************************************************/
SPI_AdjustSplicesInPieces(SPI_FragHerdPtr sfhp,BioseqPtr bsp_genomic,SPI_OptionsPtr spot)8444 static void SPI_AdjustSplicesInPieces(SPI_FragHerdPtr sfhp, BioseqPtr bsp_genomic, SPI_OptionsPtr spot)
8445 {
8446 Int4 i;
8447 Int4 j;
8448 Boolean minus;
8449 SeqAlignPtr salp;
8450 SeqAlignPtr salp_tmp;
8451 SPI_mRNAPtr smp;
8452 SPI_RegionInfoPtr srip;
8453 Uint1 tmp_acc;
8454 Uint1 tmp_don;
8455
8456 /* first use standard functions to adjust internal splices */
8457 srip = (SPI_RegionInfoPtr)MemNew(sizeof(SPI_RegionInfo));
8458 for (i=0; i<sfhp->numfrags; i++)
8459 {
8460 if (sfhp->sfparray[i]->sap != NULL && ((SeqAlignPtr)(sfhp->sfparray[i]->sap->segs))->next != NULL)
8461 {
8462 salp_tmp = (SeqAlignPtr)(sfhp->sfparray[i]->sap->segs);
8463 srip->strand = AlnMgr2GetNthStrand(salp_tmp, 1);
8464 if (srip->strand == Seq_strand_minus)
8465 {
8466 minus = TRUE;
8467 salp = (SeqAlignPtr)(sfhp->sfparray[i]->sap->segs);
8468 while (salp != NULL)
8469 {
8470 salp_tmp = salp->next;
8471 salp->next = NULL;
8472 SAIndex2Free2(salp->saip);
8473 salp->saip = NULL;
8474 SeqAlignListReverseStrand(salp);
8475 AlnMgr2IndexSingleChildSeqAlign(salp);
8476 salp->next = salp_tmp;
8477 salp = salp_tmp;
8478 }
8479 } else
8480 minus = FALSE;
8481 if (sfhp->sfparray[i]->sap->saip == NULL)
8482 AlnMgr2IndexLite(sfhp->sfparray[i]->sap);
8483 smp = SPI_AdjustForSplice(sfhp->sfparray[i]->sap, spot, srip);
8484 sfhp->sfparray[i]->smp = smp;
8485 if (srip->strand == Seq_strand_minus) /* the exons will be in the wrong order now */
8486 {
8487 for (j=0; j<smp->numexons/2; j++)
8488 {
8489 tmp_don = smp->splicedon[smp->numexons-j-1];
8490 smp->splicedon[smp->numexons-j-1] = smp->splicedon[j];
8491 smp->splicedon[j] = tmp_don;
8492 tmp_acc = smp->spliceacc[smp->numexons-j-1];
8493 smp->spliceacc[smp->numexons-j-1] = smp->spliceacc[j];
8494 smp->spliceacc[j] = tmp_acc;
8495 }
8496 salp = (SeqAlignPtr)(sfhp->sfparray[i]->sap->segs);
8497 while (salp != NULL)
8498 {
8499 salp_tmp = salp->next;
8500 salp->next = NULL;
8501 SAIndex2Free2(salp->saip);
8502 salp->saip = NULL;
8503 SeqAlignListReverseStrand(salp);
8504 AlnMgr2IndexSingleChildSeqAlign(salp);
8505 salp->next = salp_tmp;
8506 salp = salp_tmp;
8507 }
8508 }
8509 }
8510 }
8511 MemFree(srip);
8512 /* now adjust the splice sites between fragments */
8513 for (i=0; i<sfhp->numfrags-1; i++)
8514 {
8515 if (sfhp->sfparray[i]->sap != NULL)
8516 {
8517 j = i+1;
8518 while (j<sfhp->numfrags && sfhp->sfparray[j]->sap == NULL)
8519 {
8520 j++;
8521 }
8522 if (sfhp->sfparray[j]->sap != NULL)
8523 SPI_AdjustEndsOfPieces(sfhp->sfparray[i], sfhp->sfparray[j], bsp_genomic, spot);
8524 }
8525 }
8526 }
8527
8528 /***************************************************************************
8529 *
8530 * SPI_AdjustEndsOfPieces takes the last exon in the alignment of sfp1 and
8531 * the first exon in the alignment of sfp2 and adjusts the boundaries so
8532 * that the two exons abut exactly on the mRNA (if possible -- if a piece
8533 * is missing, both exon boundaries are separately adjusted to good splice
8534 * sites) and so that they are adjacent to good splice sites. After
8535 * getting the possible splice sites, SPI_AdjustEndsOfPieces looks through
8536 * the sites to determine which is the highest-scoring site that changes
8537 * the alignments the least. If no pieces are missing (continuous is TRUE)
8538 * then both alignments are truncated or extended to the splice site; if
8539 * continuous is FALSE, the second alignment is adjusted separately to a
8540 * good acceptor site that changes the alignment the least.
8541 *
8542 ***************************************************************************/
SPI_AdjustEndsOfPieces(SPI_FragPtr sfp1,SPI_FragPtr sfp2,BioseqPtr bsp_genomic,SPI_OptionsPtr spot)8543 static void SPI_AdjustEndsOfPieces(SPI_FragPtr sfp1, SPI_FragPtr sfp2, BioseqPtr bsp_genomic, SPI_OptionsPtr spot)
8544 {
8545 Boolean continuous;
8546 Int4 f;
8547 SPI_FragSplPtr fsp1;
8548 SPI_FragSplPtr fsp2;
8549 Int4 i;
8550 FloatHi maxsc;
8551 Int4 offset;
8552 Int4 ovl;
8553 Int4 pos;
8554 SeqAlignPtr sap1;
8555 SeqAlignPtr sap2;
8556 Int4 start1;
8557 Int4 start2;
8558 Int4 stop1;
8559 Int4 stop2;
8560 Uint1 strand1;
8561 Uint1 strand2;
8562
8563 sap1 = SPI_GetNthSAByRow(sfp1->sap, 2, -1);
8564 sap2 = SPI_GetNthSAByRow(sfp2->sap, 2, 1);
8565 AlnMgr2GetNthSeqRangeInSA(sap1, 2, &start1, &stop1);
8566 AlnMgr2GetNthSeqRangeInSA(sap2, 2, &start2, &stop2);
8567 strand1 = AlnMgr2GetNthStrand(sap1, 1);
8568 strand2 = AlnMgr2GetNthStrand(sap2, 1);
8569 fsp1 = NULL;
8570 fsp2 = NULL;
8571 if (start2 - stop1 <= SPI_TEENYEXON) /* make mRNA continuous, nonoverlapping */
8572 {
8573 if (start2 - stop1 < 0)
8574 ovl = stop1 - start2;
8575 else
8576 ovl = start2 - stop1;
8577 if (ovl < SPI_TEENYEXON)
8578 ovl = SPI_TEENYEXON;
8579 fsp1 = SPI_GetPossibleSites(sap1, bsp_genomic, spot, TRUE, ovl);
8580 continuous = TRUE;
8581 } else /* just adjust ends to good splice sites, don't worry about continuity */
8582 {
8583 ovl = SPI_FUZZ;
8584 fsp1 = SPI_GetPossibleSites(sap1, bsp_genomic, spot, TRUE, SPI_FUZZ);
8585 fsp2 = SPI_GetPossibleSites(sap2, bsp_genomic, spot, FALSE, SPI_FUZZ);
8586 continuous = FALSE;
8587 }
8588 maxsc = 0;
8589 for (f=0; f<SPI_NUMSITES; f++)
8590 {
8591 pos = stop1 - ovl + fsp1->splarray[f].i + fsp1->spllen - fsp1->boundary;
8592 if (stop1 - pos < 0)
8593 fsp1->splarray[f].diff = pos - stop1;
8594 else
8595 fsp1->splarray[f].diff = stop1 - pos;
8596 if (continuous)
8597 {
8598 if (start2 - pos < 0)
8599 {
8600 if (pos - start2 > fsp1->splarray[f].diff)
8601 fsp1->splarray[f].diff = pos - start2;
8602 } else
8603 {
8604 if (start2 - pos > fsp1->splarray[f].diff)
8605 fsp1->splarray[f].diff = start2 - pos;
8606 }
8607 }
8608 if (pos - start1 <= SPI_TEENYEXON)
8609 {
8610 fsp1->splarray[f].score = 0;
8611 fsp1->splarray[f].diff = -1;
8612 }
8613 if (fsp1->splarray[f].diff > maxsc)
8614 maxsc = fsp1->splarray[f].diff;
8615 }
8616 offset = ovl - fsp1->spllen + fsp1->boundary;
8617 i = 0;
8618 for (f=0; f<SPI_NUMSITES; f++)
8619 {
8620 if (fsp1->splarray[f].diff <= maxsc && fsp1->splarray[f].score > 0 && fsp1->splarray[f].diff >= 0)
8621 {
8622 maxsc = fsp1->splarray[f].diff;
8623 offset = fsp1->splarray[f].i;
8624 i = f;
8625 }
8626 }
8627 if (fsp1->splarray[i].score >= 0.00001)
8628 sfp1->donor = 1;
8629 else /* if don't find a good site, don't change the alignment */
8630 offset = ovl - fsp1->spllen + fsp1->boundary;
8631 pos = stop1 - ovl + offset + fsp1->spllen - fsp1->boundary;
8632 if (strand1 == Seq_strand_minus)
8633 {
8634 sap1->next = NULL;
8635 SAIndex2Free2(sap1->saip);
8636 sap1->saip = NULL;
8637 SeqAlignListReverseStrand(sap1);
8638 AlnMgr2IndexSingleChildSeqAlign(sap1);
8639 if (pos < stop1)
8640 {
8641 if (AlnMgr2TruncateSeqAlign(sap1, start1, pos, 2))
8642 {
8643 sap1->next->next = NULL;
8644 SeqAlignFree(sap1->next);
8645 sap1->next = NULL;
8646 }
8647 } else if (pos > stop1)
8648 SPI_AddToAln(sap1, pos - stop1, SPI_LEFT, strand1);
8649 sap1->next = NULL;
8650 SAIndex2Free2(sap1->saip);
8651 sap1->saip = NULL;
8652 SeqAlignListReverseStrand(sap1);
8653 AlnMgr2IndexSingleChildSeqAlign(sap1);
8654 } else
8655 {
8656 if (pos < stop1)
8657 {
8658 if (AlnMgr2TruncateSeqAlign(sap1, start1, pos, 2))
8659 {
8660 sap1->next->next = NULL;
8661 SeqAlignFree(sap1->next);
8662 sap1->next = NULL;
8663 }
8664 } else if (pos > stop1)
8665 SPI_AddToAln(sap1, pos - stop1, SPI_RIGHT, strand1);
8666 }
8667 if (!continuous) /* find a decent acceptor site among the ones returned */
8668 {
8669 maxsc = 0;
8670 for (f=0; f<SPI_NUMSITES; f++)
8671 {
8672 pos = start2 - ovl + fsp2->splarray[f].i + fsp2->spllen - fsp2->boundary;
8673 if (start2 - pos < 0)
8674 fsp1->splarray[f].diff = pos - start2;
8675 else
8676 fsp1->splarray[f].diff = start2 - pos;
8677 if (pos - stop2 <= SPI_TEENYEXON || stop2 - pos <= SPI_TEENYEXON)
8678 {
8679 fsp2->splarray[f].score = 0;
8680 fsp2->splarray[f].diff = -1;
8681 }
8682 if (fsp2->splarray[f].diff > maxsc)
8683 maxsc = fsp2->splarray[f].diff;
8684 }
8685 offset = ovl - fsp2->spllen + fsp2->boundary;
8686 i = 0;
8687 for (f=0; f<SPI_NUMSITES; f++)
8688 {
8689 if (fsp2->splarray[f].diff <= maxsc && fsp2->splarray[f].score > 0 && fsp2->splarray[f].diff >= 0)
8690 {
8691 maxsc = fsp2->splarray[f].diff;
8692 offset = fsp2->splarray[f].i;
8693 i = f;
8694 }
8695 }
8696 if (fsp2->splarray[i].score >= 0.0000002)
8697 sfp2->acceptor = 1;
8698 else /* if don't find a good site, don't change the alignment */
8699 offset = ovl - fsp2->spllen + fsp2->boundary;
8700 pos = start2 - ovl + offset + fsp2->spllen - fsp2->boundary;
8701 }
8702 if (strand2 == Seq_strand_minus)
8703 {
8704 sap2->next = NULL;
8705 SAIndex2Free2(sap2->saip);
8706 sap2->saip = NULL;
8707 SeqAlignListReverseStrand(sap2);
8708 AlnMgr2IndexSingleChildSeqAlign(sap2);
8709 if (start2 < pos + 1)
8710 {
8711 if (AlnMgr2TruncateSeqAlign(sap2, pos+1, stop2, 2))
8712 {
8713 sap2->next->next = NULL;
8714 SeqAlignFree(sap2->next);
8715 sap2->next = NULL;
8716 }
8717 } else if (start2 > pos + 1)
8718 SPI_AddToAln(sap2, start2-pos-1, SPI_RIGHT, strand2);
8719 sap2->next = NULL;
8720 SAIndex2Free2(sap2->saip);
8721 sap2->saip = NULL;
8722 SeqAlignListReverseStrand(sap2);
8723 AlnMgr2IndexSingleChildSeqAlign(sap2);
8724 } else
8725 {
8726 if (start2 < pos + 1)
8727 {
8728 if (AlnMgr2TruncateSeqAlign(sap2, pos+1, stop2, 2))
8729 {
8730 sap2->next->next = NULL;
8731 SeqAlignFree(sap2->next);
8732 sap2->next = NULL;
8733 }
8734 } else if (start2 > pos + 1)
8735 SPI_AddToAln(sap2, start2-pos-1, SPI_LEFT, strand2);
8736 }
8737 if (continuous) /* check to see whether current breakpoint has a good acceptor site */
8738 {
8739 fsp2 = SPI_GetPossibleSites(sap2, bsp_genomic, spot, FALSE, 0);
8740 if (fsp2->splarray[0].score >= 0.0000002)
8741 sfp2->acceptor = 1;
8742 }
8743 SPI_FragSplFree(fsp1);
8744 SPI_FragSplFree(fsp2);
8745 }
8746
8747 /***************************************************************************
8748 *
8749 * SPI_GetNthSAByRow is a useful utility function that sorts a set
8750 * of alignments by position on the 'row'th row and then retrieves the
8751 * nth of those alignments. If n is -1, the last alignment is
8752 * retrieved.
8753 *
8754 ***************************************************************************/
SPI_GetNthSAByRow(SeqAlignPtr sap,Int4 row,Int4 n)8755 static SeqAlignPtr SPI_GetNthSAByRow(SeqAlignPtr sap, Int4 row, Int4 n)
8756 {
8757 /* n = 1 is first alignment, n = -1 is last alignment */
8758 AMAlignIndex2Ptr amaip;
8759 Int4 i;
8760 SeqAlignPtr sap_place;
8761 SeqAlignPtr PNTR saparray;
8762 SeqAlignPtr PNTR saparray_tmp;
8763
8764 if (sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
8765 return NULL;
8766 amaip = (AMAlignIndex2Ptr)(sap->saip);
8767 if (n > amaip->numsaps)
8768 return NULL;
8769 saparray = (SeqAlignPtr PNTR)MemNew(amaip->numsaps*sizeof(SeqAlignPtr));
8770 saparray_tmp = amaip->saps;
8771 for (i=0; i<amaip->numsaps; i++)
8772 {
8773 saparray[i] = amaip->saps[i];
8774 }
8775 amaip->saps = saparray;
8776 AlnMgr2SortAlnSetByNthRowPos(sap, row);
8777 if (n > 0)
8778 sap_place = amaip->saps[n-1];
8779 else
8780 sap_place = amaip->saps[amaip->numsaps-1];
8781 amaip->saps = saparray_tmp;
8782 MemFree(saparray);
8783 return sap_place;
8784 }
8785
8786 /***************************************************************************
8787 *
8788 * SPI_GetPossibleSites returns the SPI_NUMSITES best donor or acceptor
8789 * splice sites for an exon (defined by an alignment), within a range
8790 * defined by the variable ovl. First, the donor or acceptor site
8791 * consensus length and position of the splice junction is retrieved
8792 * for the appropriate organism. Then, the interval around the 5' or 3'
8793 * end of the alignment (dictated by whether the site is a donor (5') or
8794 * acceptor (3') site) is examined and the SPI_NUMSITES best sites are
8795 * stored in the SPI_FragSpl structure and returned.
8796 *
8797 ***************************************************************************/
SPI_GetPossibleSites(SeqAlignPtr sap,BioseqPtr bsp_genomic,SPI_OptionsPtr spot,Boolean donor,Int4 ovl)8798 static SPI_FragSplPtr SPI_GetPossibleSites(SeqAlignPtr sap, BioseqPtr bsp_genomic, SPI_OptionsPtr spot, Boolean donor, Int4 ovl)
8799 {
8800 Int4 boundary;
8801 Uint1Ptr buf;
8802 Int4 c;
8803 Int4 f;
8804 SPI_FragSplPtr fsp;
8805 Int4 i;
8806 FloatHi maxsc = 0;
8807 Uint1 res;
8808 FloatHi score;
8809 SPI_SplicePtr splarray;
8810 Int4 spllen;
8811 SeqPortPtr spp;
8812 Int4 start;
8813 Int4 stop;
8814 Uint1 strand;
8815
8816 strand = AlnMgr2GetNthStrand(sap, 1);
8817 fsp = (SPI_FragSplPtr)MemNew(sizeof(SPI_FragSpl));
8818 AlnMgr2GetNthSeqRangeInSA(sap, 1, &start, &stop);
8819 if (donor)
8820 SPI_GetDonorSpliceInfo(spot->organism, &spllen, &boundary, spot);
8821 else
8822 SPI_GetAcceptorSpliceInfo(spot->organism, &spllen, &boundary, spot);
8823 if (strand != Seq_strand_minus)
8824 {
8825 if (donor)
8826 spp = SeqPortNew(bsp_genomic, stop-ovl, stop+ovl+spllen, strand, Seq_code_ncbi4na);
8827 else
8828 spp = SeqPortNew(bsp_genomic, start-ovl-spllen, start+ovl, strand, Seq_code_ncbi4na);
8829 } else
8830 {
8831 if (donor)
8832 spp = SeqPortNew(bsp_genomic, start-ovl-spllen, start+ovl, strand, Seq_code_ncbi4na);
8833 else
8834 spp = SeqPortNew(bsp_genomic, stop-ovl, stop+ovl+spllen, strand, Seq_code_ncbi4na);
8835 }
8836 i = 0;
8837 buf = (Uint1Ptr)MemNew((2*ovl+spllen+(spllen-boundary))*sizeof(Uint1));
8838 splarray = (SPI_SplicePtr)MemNew(SPI_NUMSITES*sizeof(SPI_Splice));
8839 for (f=0; f<SPI_NUMSITES; f++)
8840 {
8841 splarray[f].i = 0;
8842 splarray[f].score = -2;
8843 }
8844 while (((res = SeqPortGetResidue(spp)) != SEQPORT_EOF) && i<(2*ovl+1+spllen))
8845 {
8846 if (res == 1)
8847 buf[i] = 0;
8848 else if (res == 2)
8849 buf[i] = 1;
8850 else if (res == 4)
8851 buf[i] = 2;
8852 else if (res == 8)
8853 buf[i] = 3;
8854 else
8855 buf[i] = 4;
8856 i++;
8857 }
8858 SeqPortFree(spp);
8859 for (i=0; i<2*ovl+(spllen-boundary); i++)
8860 {
8861 if (donor)
8862 SPI_is_donor(buf+i, spllen, &score, spot->organism);
8863 else
8864 SPI_is_acceptor(buf+i, spllen, &score, spot->organism);
8865 c = 0;
8866 if (score > 0.000001)
8867 {
8868 for (f=0; f<SPI_NUMSITES; f++)
8869 {
8870 if (f == 0)
8871 maxsc = splarray[f].score;
8872 else if (splarray[f].score < maxsc)
8873 {
8874 maxsc = splarray[f].score;
8875 c = f;
8876 }
8877 }
8878 if (score > splarray[c].score)
8879 {
8880 splarray[c].score = score;
8881 splarray[c].i = i;
8882 }
8883 }
8884 }
8885 MemFree(buf);
8886 fsp->splarray = splarray;
8887 fsp->spllen = spllen;
8888 fsp->boundary = boundary;
8889 return fsp;
8890 }
8891
/* SPI_FragSplFree releases an SPI_FragSpl together with the site array
   it holds.  Passing NULL is allowed and does nothing. */
static void SPI_FragSplFree(SPI_FragSplPtr fsp)
{
   if (fsp != NULL)
   {
      MemFree(fsp->splarray);
      MemFree(fsp);
   }
}
8899
8900
8901 /***************************************************************************
8902 *
8903 * SPI_RemoveInconsistentAlnsFromSet is a greedy algorithm that first
8904 * sorts the alignments by score, then takes the highest-scoring
8905 * alignment and compares it to the next-highest-scoring alignment, which
8906 * is deleted if it is contained; on subsequent loops each next-highest-
8907 * scoring alignment is compared to the set of alignments that have
8908 * been kept. The alignments can be sorted along the first or
8909 * second sequence; the alignments will be reversed so that they are
8910 * all on the plus strand of the sequence to be examined.
8911 * The input alignment must be indexed at least at the LITE level;
8912 * conflicting child alignments will be deleted, not hidden, by this
8913 * function. This function assumes that all children have the same two
8914 * rows. The 'compact' parameter tells the function whether to try to
8915 * keep alignments that are more to the left in genomic coordinates, or
8916 * more to the right.
8917 *
8918 ***************************************************************************/
SPI_RemoveInconsistentAlnsFromSet(SeqAlignPtr sap,Int4 fuzz,Int4 n,Int4 compact)8919 NLM_EXTERN void SPI_RemoveInconsistentAlnsFromSet(SeqAlignPtr sap, Int4 fuzz, Int4 n, Int4 compact)
8920 {
8921 AMAlignIndex2Ptr amaip;
8922 Boolean conflict;
8923 Int4 curr;
8924 Int4 i;
8925 Int4 indextype;
8926 SeqAlignPtr salp;
8927 SeqAlignPtr salp_head;
8928 SeqAlignPtr salp_prev;
8929 SPI_nPtr PNTR spin;
8930 Int4 start;
8931 Int4 stop;
8932 Int4 strand;
8933
8934 if (sap == NULL || sap->saip == NULL || sap->saip->indextype != INDEX_PARENT)
8935 return;
8936 if (n > 2)
8937 return;
8938 amaip = (AMAlignIndex2Ptr)(sap->saip);
8939 indextype = amaip->alnstyle;
8940 /* make sure that everything is on the plus strand of the nth sequence */
8941 for (i=0; i<amaip->numsaps; i++)
8942 {
8943 salp = amaip->saps[i];
8944 strand = AlnMgr2GetNthStrand(salp, n);
8945 if (strand == Seq_strand_minus)
8946 {
8947 SAIndex2Free2(salp->saip);
8948 salp->saip = NULL;
8949 salp->next = NULL;
8950 SeqAlignListReverseStrand(salp);
8951 AlnMgr2IndexSingleChildSeqAlign(salp);
8952 }
8953 }
8954 /* spin structure: n1 = which alignment, n2 = start on first row, n3 =
8955 alignment length on 1st row, n4 = start on 2nd row, n5 = 2nd strand */
8956 spin = (SPI_nPtr PNTR)MemNew((amaip->numsaps)*sizeof(SPI_nPtr));
8957 for (i=0; i<amaip->numsaps; i++)
8958 {
8959 spin[i] = (SPI_nPtr)MemNew(sizeof(SPI_n));
8960 salp = amaip->saps[i];
8961 spin[i]->n1 = i;
8962 AlnMgr2GetNthSeqRangeInSA(salp, n, &start, &stop);
8963 spin[i]->n3 = stop - start;
8964 spin[i]->n2 = start;
8965 AlnMgr2GetNthSeqRangeInSA(salp, 3-n, &start, &stop);
8966 spin[i]->n4 = start;
8967 strand = AlnMgr2GetNthStrand(salp, 3-n);
8968 if (strand == Seq_strand_minus)
8969 spin[i]->n5 = -1;
8970 else
8971 spin[i]->n5 = 1;
8972 spin[i]->n6 = compact;
8973 }
8974 HeapSort((Pointer)spin, (size_t)(amaip->numsaps), sizeof(SPI_nPtr), SPI_CompareSpins);
8975 strand = spin[0]->n5;
8976 for (i=1; i<amaip->numsaps; i++)
8977 {
8978 if (spin[i]->n5 != strand)
8979 {
8980 salp = amaip->saps[spin[i]->n1];
8981 salp->next = NULL;
8982 SeqAlignFree(salp);
8983 amaip->saps[spin[i]->n1] = NULL;
8984 spin[i]->n1 = -1;
8985 }
8986 }
8987 for (curr=0; curr<amaip->numsaps; curr++)
8988 {
8989 if (spin[curr]->n1 != -1)
8990 {
8991 for (i=curr+1; i<amaip->numsaps; i++)
8992 {
8993 if (spin[i]->n1 != -1)
8994 {
8995 conflict = FALSE;
8996 /* check first for conflict on first row */
8997 if (spin[i]->n2 + spin[i]->n3 - 1 >= spin[curr]->n2 + fuzz)
8998 {
8999 if (spin[i]->n2 <= spin[curr]->n2 + fuzz)
9000 conflict = TRUE;
9001 }
9002 if (spin[i]->n2 <= spin[curr]->n2 + spin[curr]->n3 - 1 - fuzz)
9003 {
9004 if (spin[i]->n2 + spin[i]->n3 - 1 >= spin[curr]->n2 + spin[curr]->n3 - 1)
9005 conflict = TRUE;
9006 }
9007 if (spin[i]->n2 >= spin[curr]->n2)
9008 {
9009 if (spin[i]->n2 + spin[i]->n3 - 1 <= spin[curr]->n2 + spin[curr]->n3 - 1)
9010 conflict = TRUE;
9011 }
9012 /* then check for conflict and consistency on second row */
9013 if (spin[i]->n4 + spin[i]->n3-1 >= spin[curr]->n4 + fuzz)
9014 {
9015 if (spin[i]->n4 <= spin[curr]->n4 + fuzz)
9016 conflict = TRUE;
9017 }
9018 if (spin[i]->n4 <= spin[curr]->n4 + spin[curr]->n3 - 1 - fuzz)
9019 {
9020 if (spin[i]->n4 + spin[i]->n3 - 1 > spin[curr]->n4 + fuzz)
9021 conflict = TRUE;
9022 }
9023 if (spin[i]->n4 >= spin[curr]->n4)
9024 {
9025 if (spin[i]->n4 + spin[i]->n3 - 1 <= spin[curr]->n4 + spin[curr]->n3 - 1)
9026 conflict = TRUE;
9027 }
9028 if (spin[i]->n2 + spin[i]->n3 - 1 <= spin[curr]->n2 + fuzz)
9029 {
9030 if (strand == 1)
9031 {
9032 if (spin[i]->n4 + spin[i]->n3 - 1 >= spin[curr]->n4 + fuzz)
9033 conflict = TRUE;
9034 } else if (strand == -1)
9035 {
9036 if (spin[curr]->n4 + spin[curr]->n3 - 1 - fuzz >= spin[i]->n4)
9037 conflict = TRUE;
9038 }
9039 } else
9040 {
9041 if (strand == 1)
9042 {
9043 if (spin[i]->n4 <= spin[curr]->n4 + spin[curr]->n3 - fuzz)
9044 conflict = TRUE;
9045 } else if (strand == -1)
9046 {
9047 if (spin[i]->n4 + spin[i]->n3 - 1 - fuzz >= spin[curr]->n4)
9048 conflict = TRUE;
9049 }
9050 }
9051 if (conflict)
9052 {
9053 salp = amaip->saps[spin[i]->n1];
9054 salp->next = NULL;
9055 SeqAlignFree(salp);
9056 amaip->saps[spin[i]->n1] = NULL;
9057 spin[i]->n1 = -1;
9058 }
9059 }
9060 }
9061 }
9062 }
9063 salp_head = salp_prev = NULL;
9064 for (i=0; i<amaip->numsaps; i++)
9065 {
9066 MemFree(spin[i]);
9067 if (amaip->saps[i] != NULL)
9068 {
9069 amaip->saps[i]->next = NULL;
9070 if (salp_prev != NULL)
9071 {
9072 salp_prev->next = amaip->saps[i];
9073 salp_prev = salp_prev->next;
9074 } else
9075 salp_head = salp_prev = amaip->saps[i];
9076 }
9077 }
9078 sap->segs = (Pointer)(salp_head);
9079 if (indextype == AM2_LITE)
9080 {
9081 AMAlignIndex2Free2(sap->saip);
9082 sap->saip = NULL;
9083 AlnMgr2IndexLite(sap);
9084 } else
9085 AlnMgr2ReIndexSeqAlign(sap);
9086 MemFree(spin);
9087 }
9088
9089 /***************************************************************************
9090 *
9091 * SPI_CompareSpins is the HeapSort callback for
9092 * SPI_RemoveInconsistentAlnsFromSet. It compares first the alignment
9093 * length on the first row, then the alignment start on the first row.
9094 *
9095 ***************************************************************************/
SPI_CompareSpins(VoidPtr ptr1,VoidPtr ptr2)9096 static int LIBCALLBACK SPI_CompareSpins(VoidPtr ptr1, VoidPtr ptr2)
9097 {
9098 SPI_nPtr spin1;
9099 SPI_nPtr spin2;
9100
9101 spin1 = *((SPI_nPtr PNTR) ptr1);
9102 spin2 = *((SPI_nPtr PNTR) ptr2);
9103 if (spin1 == NULL || spin2 == NULL)
9104 return 0;
9105 if (spin1->n3 > spin2->n3)
9106 return -1;
9107 if (spin1->n3 < spin2->n3)
9108 return 1;
9109 if (spin1->n6 == SPI_RIGHT)
9110 {
9111 if (spin1->n2 > spin2->n2)
9112 return -1;
9113 if (spin1->n2 < spin2->n2)
9114 return 1;
9115 } else if (spin1->n6 == SPI_LEFT)
9116 {
9117 if (spin1->n2 < spin2->n2)
9118 return -1;
9119 if (spin1->n2 > spin2->n2)
9120 return 1;
9121 }
9122 return 0;
9123 }
9124
9125 /***************************************************************************
9126 *
9127 * SPI_OrderInternally takes a herd of fragments and their alignments
9128 * and sorts the alignments for each fragment by their start positions
9129 * on the mRNA sequence.
9130 *
9131 ***************************************************************************/
SPI_OrderInternally(SPI_FragHerdPtr sfhp)9132 static void SPI_OrderInternally(SPI_FragHerdPtr sfhp)
9133 {
9134 AMAlignIndex2Ptr amaip;
9135 Int4 i;
9136 Int4 j;
9137 SeqAlignPtr salp;
9138 Uint1 strand;
9139
9140 for (i=0; i<sfhp->numfrags; i++)
9141 {
9142 if (sfhp->sfparray[i]->sap != NULL)
9143 {
9144 amaip = (AMAlignIndex2Ptr)(sfhp->sfparray[i]->sap->saip);
9145 salp = (SeqAlignPtr)(sfhp->sfparray[i]->sap->segs);
9146 while (salp != NULL)
9147 {
9148 strand = AlnMgr2GetNthStrand(salp, 2);
9149 if (strand == Seq_strand_minus)
9150 {
9151 SAIndex2Free2(salp->saip);
9152 salp->saip = NULL;
9153 salp->next = NULL;
9154 SeqAlignListReverseStrand(salp);
9155 AlnMgr2IndexSingleChildSeqAlign(salp);
9156 }
9157 salp = salp->next;
9158 }
9159 if (amaip->numsaps > 1)
9160 {
9161 HeapSort((Pointer)(amaip->saps), (size_t)(amaip->numsaps), sizeof(SeqAlignPtr), SPI_CompareAlnPos);
9162 for (j=0; j<amaip->numsaps-1; j++)
9163 {
9164 amaip->saps[j]->next = amaip->saps[j+1];
9165 amaip->saps[j+1]->next = NULL;
9166 }
9167 sfhp->sfparray[i]->sap->segs = (Pointer)(amaip->saps[0]);
9168 }
9169 }
9170 }
9171 }
9172
9173 /***************************************************************************
9174 *
9175 * SPI_CompareAlnPos is the callback for the HeapSort in
9176 * SPI_OrderInternally. It compares the start positions on the mRNA
9177 * sequence of two alignments, and puts the 5'-most alignment first.
9178 *
9179 ***************************************************************************/
SPI_CompareAlnPos(VoidPtr ptr1,VoidPtr ptr2)9180 static int LIBCALLBACK SPI_CompareAlnPos(VoidPtr ptr1, VoidPtr ptr2)
9181 {
9182 SeqAlignPtr sap1;
9183 SeqAlignPtr sap2;
9184 Int4 start1;
9185 Int4 start2;
9186
9187 sap1 = *((SeqAlignPtr PNTR) ptr1);
9188 sap2 = *((SeqAlignPtr PNTR) ptr2);
9189 if (sap1 == NULL || sap2 == NULL)
9190 return 0;
9191 AlnMgr2GetNthSeqRangeInSA(sap1, 2, &start1, NULL);
9192 AlnMgr2GetNthSeqRangeInSA(sap2, 2, &start2, NULL);
9193 if (start1 <= start2)
9194 return -1;
9195 else if (start2 > start1)
9196 return 1;
9197 return 0;
9198 }
9199
9200 /***************************************************************************
9201 *
9202 * SPI_GetResultsForCDS takes a completed mRNA-to-genomic alignment,
9203 * extracts the CDS annotation for the mRNA, then truncates the mRNA
9204 * alignment appropriately to create a CDS alignment. Most of the
9205 * mRNA information (splice sites, etc) can simply be duplicated, but
9206 * the first and last exons are often truncated, so their information
9207 * must be recomputed. Since the CDS is known, the UTRs are known, so
9208 * the 5' and 3' UTR %identities are calculated as well.
9209 *
9210 ***************************************************************************/
SPI_GetResultsForCDS(SPI_RegionInfoPtr srip_mrna,BioseqPtr bsp_mrna,SPI_OptionsPtr spot)9211 static SPI_RegionInfoPtr SPI_GetResultsForCDS(SPI_RegionInfoPtr srip_mrna, BioseqPtr bsp_mrna, SPI_OptionsPtr spot)
9212 {
9213 Int4 b;
9214 Int4 c;
9215 SeqMgrFeatContext context;
9216 BoolPtr featDefFilter;
9217 Int4 i;
9218 Int4 mis;
9219 Int4 len;
9220 Int4 offset;
9221 SeqAlignPtr sap;
9222 SPI_mRNAPtr smp;
9223 SPI_nPtr spin;
9224 SPI_RegionInfoPtr srip_cds;
9225 Int4 start_cds;
9226 Int4 stop_cds;
9227 Int4Ptr tmpmstarts;
9228 Int4Ptr tmpmstops;
9229 Int4 tmp1;
9230 Int4 tmp2;
9231 Int4 tmp3;
9232
9233 if (srip_mrna == NULL || srip_mrna->revcomp == TRUE)
9234 return NULL;
9235 SeqMgrIndexFeatures(0, (Pointer)bsp_mrna);
9236 featDefFilter = (BoolPtr)MemNew((FEATDEF_MAX)*sizeof(Boolean));
9237 featDefFilter[FEATDEF_CDS] = TRUE;
9238 spin = (SPI_nPtr)MemNew(sizeof(SPI_n));
9239 SeqMgrExploreFeatures(bsp_mrna, (Pointer)spin, SPI_GetCDS, NULL, NULL, featDefFilter);
9240 MemFree(featDefFilter);
9241 context.left = spin->n1;
9242 context.right = spin->n2;
9243 if (context.right == 0)
9244 return NULL;
9245 start_cds = stop_cds = -1;
9246 offset = context.left;
9247 if (srip_mrna->smp->numexons == 1 || srip_mrna->smp->mstarts[0] < srip_mrna->smp->mstarts[1])
9248 {
9249 tmp1 = srip_mrna->smp->mstarts[0]-1;
9250 tmp2 = srip_mrna->smp->mstops[srip_mrna->smp->numexons-1]-1;
9251 } else
9252 {
9253 tmp1 = srip_mrna->smp->mstarts[srip_mrna->smp->numexons-1]-1;
9254 tmp2 = srip_mrna->smp->mstops[0]-1;
9255 }
9256 if (tmp2 < tmp1)
9257 {
9258 tmp3 = tmp2;
9259 tmp2 = tmp1;
9260 tmp1 = tmp3;
9261 }
9262 if (context.left > tmp2 || context.right < tmp1) /* cds not contained in model */
9263 return NULL;
9264 if (srip_mrna->smp->strand != Seq_strand_minus)
9265 {
9266 for (i=0; i<srip_mrna->smp->numexons; i++)
9267 {
9268 if (context.left >= srip_mrna->smp->mstarts[i]-1 && context.left <= srip_mrna->smp->mstops[i]-1)
9269 start_cds = i;
9270 if (context.right >= srip_mrna->smp->mstarts[i]-1 && context.right <= srip_mrna->smp->mstops[i]-1)
9271 stop_cds = i;
9272 }
9273 smp = (SPI_mRNAPtr)MemNew(sizeof(SPI_mRNA));
9274 smp->fallsoff = SPI_NEITHER;
9275 if (start_cds == -1) /* mRNA alignment doesn't include beginning of CDS */
9276 {
9277 start_cds = 0;
9278 smp->fallsoff = SPI_LEFT;
9279 }
9280 if (stop_cds == -1) /* mRNA alignment doesn't include end of CDS */
9281 {
9282 stop_cds = srip_mrna->smp->numexons-1;
9283 if (smp->fallsoff == SPI_LEFT)
9284 smp->fallsoff = SPI_BOTH;
9285 else
9286 smp->fallsoff = SPI_RIGHT;
9287 }
9288 srip_cds = (SPI_RegionInfoPtr)MemNew(sizeof(SPI_RegionInfo));
9289 srip_cds->smp = smp;
9290 srip_cds->mlen = abs(context.left - context.right) + 1;
9291 smp->numexons = stop_cds - start_cds + 1;
9292 smp->strand = srip_mrna->smp->strand;
9293 smp->exonid = (FloatHiPtr)MemNew((smp->numexons)*sizeof(FloatHi));
9294 smp->exongaps = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9295 smp->splicedon = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
9296 smp->spliceacc = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
9297 smp->mstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9298 smp->mstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9299 smp->gstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9300 smp->gstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9301 smp->saps = (SeqAlignPtr PNTR)MemNew((smp->numexons)*sizeof(SeqAlignPtr));
9302 if (srip_mrna->smp->mstarts[start_cds]-1 < offset)
9303 smp->mstarts[0] = 0;
9304 else
9305 smp->mstarts[0] = srip_mrna->smp->mstarts[start_cds]-1 - offset;
9306 if (smp->numexons > 1)
9307 smp->mstops[0] = srip_mrna->smp->mstops[start_cds]-1 - offset;
9308 for (i=1; i<smp->numexons-1; i++)
9309 {
9310 smp->mstarts[i] = srip_mrna->smp->mstarts[i+start_cds]-1 - offset;
9311 smp->mstops[i] = srip_mrna->smp->mstops[i+start_cds]-1 - offset;
9312 }
9313 if (smp->numexons > 1)
9314 smp->mstarts[smp->numexons-1] = srip_mrna->smp->mstarts[smp->numexons-1 + start_cds]-1 - offset;
9315 if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_LEFT)
9316 smp->mstops[smp->numexons-1] = context.right - offset;
9317 else
9318 smp->mstops[smp->numexons-1] = srip_mrna->smp->mstops[smp->numexons-1 + start_cds]-1 - offset;
9319 smp->polyAtail = 0; /* no polyA on a CDS */
9320 /* now copy the splice information and truncate the alignments */
9321 for (i=0; i<smp->numexons; i++)
9322 {
9323 smp->splicedon[i] = srip_mrna->smp->splicedon[i+start_cds];
9324 smp->spliceacc[i] = srip_mrna->smp->spliceacc[i+start_cds];
9325 }
9326 if (smp->numexons > 1)
9327 {
9328 smp->saps[0] = SeqAlignDup(srip_mrna->smp->saps[start_cds]);
9329 if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_RIGHT)
9330 {
9331 if (AlnMgr2TruncateSeqAlign(smp->saps[0], smp->mstarts[0] + offset, srip_mrna->smp->mstops[start_cds]-1, 2))
9332 {
9333 SeqAlignFree(smp->saps[0]->next);
9334 smp->saps[0]->next = NULL;
9335 }
9336 }
9337 smp->saps[smp->numexons-1] = SeqAlignDup(srip_mrna->smp->saps[stop_cds]);
9338 if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_LEFT)
9339 {
9340 if (AlnMgr2TruncateSeqAlign(smp->saps[smp->numexons-1], srip_mrna->smp->mstarts[stop_cds]-1, smp->mstops[smp->numexons-1]+offset, 2))
9341 {
9342 SeqAlignFree(smp->saps[smp->numexons-1]->next);
9343 smp->saps[smp->numexons-1]->next = NULL;
9344 }
9345 }
9346 for (i=1; i<smp->numexons-1; i++)
9347 {
9348 smp->saps[i] = SeqAlignDup(srip_mrna->smp->saps[i+start_cds]);
9349 }
9350 } else
9351 {
9352 smp->saps[0] = SeqAlignDup(srip_mrna->smp->saps[start_cds]);
9353 if (smp->fallsoff != SPI_BOTH)
9354 {
9355 if (AlnMgr2TruncateSeqAlign(smp->saps[0], smp->mstarts[0] + offset, smp->mstops[0] + offset, 2))
9356 {
9357 SeqAlignFree(smp->saps[0]->next);
9358 smp->saps[0]->next = NULL;
9359 }
9360 }
9361 }
9362 mis = 0;
9363 len = 0;
9364 tmpmstarts = smp->mstarts;
9365 tmpmstops = smp->mstops;
9366 smp->mstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9367 smp->mstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9368 for (i=0; i<smp->numexons; i++)
9369 {
9370 AlnMgr2IndexSingleChildSeqAlign(smp->saps[i]);
9371 len += SPI_GetExonInfo(smp, i, &b, &c, &mis, spot);
9372 SAIndex2Free2(smp->saps[i]->saip);
9373 smp->saps[i]->saip = NULL;
9374 AlnMgr2IndexSingleChildSeqAlign(smp->saps[i]);
9375 }
9376 MemFree(smp->mstarts);
9377 MemFree(smp->mstops);
9378 smp->mstarts = tmpmstarts;
9379 smp->mstops = tmpmstops;
9380 smp->mRNAcoverage = (100*len)/(context.right - context.left);
9381 smp->mismatch = (FloatHi)(100*mis)/len;
9382 } else
9383 {
9384 for (i=0; i<srip_mrna->smp->numexons; i++)
9385 {
9386 if (context.right >= srip_mrna->smp->mstarts[i]-1 && context.right <= srip_mrna->smp->mstops[i]-1)
9387 start_cds = i;
9388 if (context.left >= srip_mrna->smp->mstarts[i]-1 && context.left <= srip_mrna->smp->mstops[i]-1)
9389 stop_cds = i;
9390 }
9391 smp = (SPI_mRNAPtr)MemNew(sizeof(SPI_mRNA));
9392 smp->fallsoff = SPI_NEITHER;
9393 if (start_cds == -1)
9394 {
9395 start_cds = 0;
9396 smp->fallsoff = SPI_RIGHT;
9397 }
9398 if (stop_cds == -1)
9399 {
9400 if (srip_mrna->smp->mstarts[srip_mrna->smp->numexons-1] > context.left)
9401 {
9402 stop_cds = srip_mrna->smp->numexons-1;
9403 if (smp->fallsoff == SPI_RIGHT)
9404 smp->fallsoff = SPI_BOTH;
9405 else
9406 smp->fallsoff = SPI_LEFT;
9407 } else
9408 {
9409 for (i=0; i<srip_mrna->smp->numexons; i++)
9410 {
9411 if (srip_mrna->smp->mstarts[i] > context.left)
9412 stop_cds = i;
9413 }
9414 if (smp->fallsoff == SPI_RIGHT)
9415 smp->fallsoff = SPI_BOTH;
9416 else
9417 smp->fallsoff = SPI_LEFT;
9418 }
9419 }
9420 srip_cds = (SPI_RegionInfoPtr)MemNew(sizeof(SPI_RegionInfo));
9421 srip_cds->smp = smp;
9422 srip_cds->mlen = abs(context.left - context.right) + 1;
9423 smp->numexons = stop_cds - start_cds + 1;
9424 smp->strand = srip_mrna->smp->strand;
9425 smp->exonid = (FloatHiPtr)MemNew((smp->numexons)*sizeof(FloatHi));
9426 smp->exongaps = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9427 smp->splicedon = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
9428 smp->spliceacc = (Uint1Ptr)MemNew((smp->numexons)*sizeof(Uint1));
9429 smp->mstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9430 smp->mstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9431 smp->gstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9432 smp->gstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9433 smp->saps = (SeqAlignPtr PNTR)MemNew((smp->numexons)*sizeof(SeqAlignPtr));
9434 if (smp->numexons > 1)
9435 smp->mstarts[0] = srip_mrna->smp->mstarts[start_cds]-1 - offset;
9436 if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_LEFT)
9437 smp->mstops[0] = context.right - offset;
9438 else
9439 smp->mstops[0] = srip_mrna->smp->mstops[start_cds]-1 - offset;
9440 for (i=1; i<smp->numexons-1; i++)
9441 {
9442 smp->mstarts[i] = srip_mrna->smp->mstarts[i+start_cds]-1 - offset;
9443 smp->mstops[i] = srip_mrna->smp->mstops[i+start_cds]-1 - offset;
9444 }
9445 if (smp->numexons > 1)
9446 smp->mstops[smp->numexons-1] = srip_mrna->smp->mstops[smp->numexons-1+start_cds]-1 - offset;
9447 if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_RIGHT)
9448 smp->mstarts[smp->numexons-1] = 0;
9449 else
9450 smp->mstarts[smp->numexons-1] = srip_mrna->smp->mstarts[smp->numexons-1+start_cds]-1 - offset;
9451 smp->polyAtail = 0;
9452 /* now copy the splice site info and truncate the alignments */
9453 for (i=0; i<smp->numexons; i++)
9454 {
9455 smp->splicedon[i] = srip_mrna->smp->splicedon[i+start_cds];
9456 smp->spliceacc[i] = srip_mrna->smp->spliceacc[i+start_cds];
9457 }
9458 if (smp->numexons > 1)
9459 {
9460 smp->saps[0] = SeqAlignDup(srip_mrna->smp->saps[start_cds]);
9461 if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_LEFT)
9462 {
9463 if (AlnMgr2TruncateSeqAlign(smp->saps[0], smp->mstarts[0] + offset, smp->mstops[0] + offset, 2))
9464 {
9465 SeqAlignFree(smp->saps[0]->next);
9466 smp->saps[0]->next = NULL;
9467 }
9468 }
9469 smp->saps[smp->numexons-1] = SeqAlignDup(srip_mrna->smp->saps[smp->numexons-1+start_cds]);
9470 if (smp->fallsoff == SPI_NEITHER || smp->fallsoff == SPI_RIGHT)
9471 {
9472 if (AlnMgr2TruncateSeqAlign(smp->saps[smp->numexons-1], smp->mstarts[smp->numexons-1] + offset, srip_mrna->smp->mstops[smp->numexons-1+start_cds]-1, 2))
9473 {
9474 SeqAlignFree(smp->saps[smp->numexons-1]->next);
9475 smp->saps[smp->numexons-1]->next = NULL;
9476 }
9477 }
9478 for (i=1; i<smp->numexons-1; i++)
9479 {
9480 smp->saps[i] = SeqAlignDup(srip_mrna->smp->saps[i+start_cds]);
9481 }
9482 } else
9483 {
9484 smp->saps[0] = SeqAlignDup(srip_mrna->smp->saps[start_cds]);
9485 if (smp->fallsoff != SPI_BOTH)
9486 {
9487 if (AlnMgr2TruncateSeqAlign(smp->saps[0], smp->mstarts[0] + offset, smp->mstops[0] + offset, 2))
9488 {
9489 SeqAlignFree(smp->saps[0]->next);
9490 smp->saps[0]->next = NULL;
9491 }
9492 }
9493 }
9494 mis = 0;
9495 len = 0;
9496 tmpmstarts = smp->mstarts;
9497 tmpmstops = smp->mstops;
9498 smp->mstarts = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9499 smp->mstops = (Int4Ptr)MemNew((smp->numexons)*sizeof(Int4));
9500 for (i=0; i<smp->numexons; i++)
9501 {
9502 AlnMgr2IndexSingleChildSeqAlign(smp->saps[i]);
9503 len += SPI_GetExonInfo(smp, i, &b, &c, &mis, spot);
9504 SAIndex2Free2(smp->saps[i]->saip);
9505 smp->saps[i]->saip = NULL;
9506 AlnMgr2IndexSingleChildSeqAlign(smp->saps[i]);
9507 }
9508 MemFree(smp->mstarts);
9509 MemFree(smp->mstops);
9510 smp->mstarts = tmpmstarts;
9511 smp->mstops = tmpmstops;
9512 smp->mRNAcoverage = (100*len)/(context.right - context.left);
9513 smp->mismatch = (FloatHi)(100*mis)/len;
9514 }
9515 for (i=0; i<smp->numexons-1; i++)
9516 {
9517 smp->saps[i]->next = smp->saps[i+1];
9518 smp->saps[i+1]->next = NULL;
9519 }
9520 sap = SeqAlignNew();
9521 sap->segtype = SAS_DISC;
9522 sap->segs = (Pointer)(smp->saps[0]);
9523 sap->dim = 2;
9524 AlnMgr2IndexLite(sap);
9525 smp->parent = sap;
9526 srip_cds->strand = srip_mrna->strand;
9527 /* fill in srip_cds->mstart and mstop with the CDS boundaries for printing */
9528 srip_cds->mstart = context.left;
9529 srip_cds->mstop = context.right;
9530 srip_cds->smp->missingends = srip_cds->smp->fallsoff;
9531 SPI_FillInUTRInfo(srip_cds, srip_mrna, bsp_mrna->length, start_cds, stop_cds);
9532 if (spin->n3 == 1) /* CDS is 5' partial */
9533 srip_cds->gstart = 1;
9534 else
9535 srip_cds->gstart = 0;
9536 if (spin->n4 == 1) /* CDS is 3' partial */
9537 srip_cds->gstop = 1;
9538 else
9539 srip_cds->gstop = 0;
9540 MemFree(spin);
9541 return srip_cds;
9542 }
9543
9544 /***************************************************************************
9545 *
9546 * SPI_FillInUTRInfo is called by SPI_GetResultsForCDS to figure out the
9547 * %id of the 5' and 3' UTRs. Since the UTRs usually do not exactly
9548 * coincide with exon boundaries, this is not a trivial task:
9549 *
9550 * ----******** ***** ****--- ------ 4 exons, *=CDS, -=UTR
9551 * In this example, the number of mismatches in the 5' UTR is the number
9552 * of mismatches in exon 1 minus the number of mismatches in exon 1 of
9553 * the CDS. The number of mismatches in the 3' UTR is the number of
9554 * mismatches in exon 4 of the mRNA, plus the number of mismatches in
9555 * exon 3 of the mRNA, minus the number of mismatches in exon 3 of the CDS.
9556 * The most complicated example is when a single-exon CDS does not quite
9557 * reach the edges of the corresponding mRNA exon:
9558 *
9559 * ------- ----*******----- ----- ------ 4 exons
9560 * In this example, the function actually needs to make a small
9561 * alignment corresponding to the last part of exon 2 of the mRNA, so
9562 * that the mismatches for just that piece can be computed.
9563 *
9564 ***************************************************************************/
SPI_FillInUTRInfo(SPI_RegionInfoPtr srip_cds,SPI_RegionInfoPtr srip_mrna,Int4 len,Int4 exonstart,Int4 exonstop)9565 static void SPI_FillInUTRInfo(SPI_RegionInfoPtr srip_cds, SPI_RegionInfoPtr srip_mrna, Int4 len, Int4 exonstart, Int4 exonstop)
9566 {
9567 ACTProfilePtr app1;
9568 ACTProfilePtr app2;
9569 Boolean found;
9570 Int4 i;
9571 Int4 j;
9572 FloatHi mismatch_cds;
9573 FloatHi mismatch_misc;
9574 FloatHi mismatch_mrna_l;
9575 FloatHi mismatch_mrna_r;
9576 SeqAlignPtr sap_tmp;
9577
9578 mismatch_mrna_l = mismatch_mrna_r = 0;
9579 for (i=0; i<=exonstart; i++)
9580 {
9581 mismatch_mrna_l += (FloatHi)(1-srip_mrna->smp->exonid[i]/100)*(srip_mrna->smp->mstops[i] - srip_mrna->smp->mstarts[i] + 1);
9582 }
9583 for (i=exonstop; i<srip_mrna->smp->numexons; i++)
9584 {
9585 mismatch_mrna_r += (FloatHi)(1-srip_mrna->smp->exonid[i]/100)*(srip_mrna->smp->mstops[i] - srip_mrna->smp->mstarts[i] + 1);
9586 }
9587 if (exonstart != exonstop)
9588 {
9589 mismatch_cds = (FloatHi)(1-srip_cds->smp->exonid[0]/100)*(srip_cds->smp->mstops[0] - srip_cds->smp->mstarts[0] + 1);
9590 mismatch_mrna_l -= mismatch_cds;
9591 mismatch_cds = (FloatHi)(1-srip_cds->smp->exonid[srip_cds->smp->numexons-1]/100)*(srip_cds->smp->mstops[srip_cds->smp->numexons-1] - srip_cds->smp->mstarts[srip_cds->smp->numexons-1] + 1);
9592 mismatch_mrna_r -= mismatch_cds;
9593 if (srip_cds->strand != Seq_strand_minus)
9594 {
9595 if (srip_cds->mstart > 0)
9596 srip_cds->utr.left = (FloatHi)(100)*(1-(mismatch_mrna_l/(srip_cds->mstart)));
9597 else
9598 srip_cds->utr.left = -1;
9599 if (len-1-srip_cds->mstop > 0)
9600 srip_cds->utr.right = (FloatHi)(100)*(1-(mismatch_mrna_r/(len-1-srip_cds->mstop)));
9601 else
9602 srip_cds->utr.right = -1;
9603 } else
9604 {
9605 if (srip_cds->mstart > 0)
9606 srip_cds->utr.right = (FloatHi)(100)*(1-(mismatch_mrna_r/srip_cds->mstart));
9607 else
9608 srip_cds->utr.right = -1;
9609 if (len-1-srip_cds->mstop > 0)
9610 srip_cds->utr.left = (FloatHi)(100)*(1-(mismatch_mrna_l/(len-1-srip_cds->mstop)));
9611 else
9612 srip_cds->utr.left = -1;
9613 }
9614 } else /* have to figure out how many mismatches are on each side of the exon now */
9615 {
9616 sap_tmp = SeqAlignDup(srip_mrna->smp->saps[exonstart]);
9617 mismatch_misc = 0;
9618 if (AlnMgr2TruncateSeqAlign(sap_tmp, srip_cds->mstart, srip_cds->mstop, 2))
9619 {
9620 app1 = SPI_MakeProfileFromSA(sap_tmp->next);
9621 app2 = app1;
9622 while (app2 != NULL)
9623 {
9624 for (i=0; i<app2->len-1; i++)
9625 {
9626 found = FALSE;
9627 for (j=0; j<ACT_NUCLEN; j++)
9628 {
9629 if (app2->freq[j][i] == 1 && !found)
9630 {
9631 if (app2->freq[4][i] == 0) /* not an N */
9632 {
9633 mismatch_misc += 1;
9634 found = TRUE;
9635 }
9636 }
9637 }
9638 }
9639 app1 = app2->next;
9640 MemFree(app2);
9641 app2 = app1;
9642 }
9643 mismatch_cds = (FloatHi)(1-srip_cds->smp->exonid[0]/100)*(srip_cds->smp->mstops[0] - srip_cds->smp->mstarts[0] + 1);
9644 mismatch_mrna_l -= (mismatch_cds + mismatch_misc);
9645 mismatch_mrna_r -= ((FloatHi)(1-srip_mrna->smp->exonid[exonstart]/100)*(srip_mrna->smp->mstops[exonstart] - srip_mrna->smp->mstarts[exonstart] + 1) - mismatch_misc);
9646 } else /* nothing on the right side, so all the extra mismatches are on the left */
9647 {
9648 mismatch_cds = (1-srip_cds->smp->exonid[0]/100)*(srip_cds->smp->mstops[0] - srip_cds->smp->mstarts[0] + 1);
9649 mismatch_mrna_l -= mismatch_cds;
9650 mismatch_mrna_r -= (FloatHi)(1-srip_mrna->smp->exonid[exonstart]/100)*(srip_mrna->smp->mstops[exonstart] - srip_mrna->smp->mstarts[exonstart] + 1);
9651 }
9652 SeqAlignSetFree(sap_tmp);
9653 if (srip_cds->strand != Seq_strand_minus)
9654 {
9655 if (srip_cds->mstart > 0)
9656 srip_cds->utr.left = (FloatHi)(100)*(1-(mismatch_mrna_l/(srip_cds->mstart)));
9657 else
9658 srip_cds->utr.left = -1;
9659 if (len-1-srip_cds->mstop > 0)
9660 srip_cds->utr.right = (FloatHi)(100)*(1-(mismatch_mrna_r/(len-1-srip_cds->mstop)));
9661 else
9662 srip_cds->utr.right = -1;
9663 } else
9664 {
9665 if (srip_cds->mstart > 0)
9666 srip_cds->utr.right = (FloatHi)(100)*(1-(mismatch_mrna_r/srip_cds->mstart));
9667 else
9668 srip_cds->utr.right = -1;
9669 if (len-1-srip_cds->mstop > 0)
9670 srip_cds->utr.left = (FloatHi)(100)*(1-(mismatch_mrna_l/(len-1-srip_cds->mstop
9671 )));
9672 else
9673 srip_cds->utr.left = -1;
9674 }
9675 }
9676 }
9677
9678 /***************************************************************************
9679 *
9680 * SPI_GetCDS is the callback for the SeqEntryExplore call in
9681 * SPI_GetResultsForCDS. It simply records the left and right-most
9682 * boundaries of the coding region found.
9683 *
9684 ***************************************************************************/
SPI_GetCDS(SeqFeatPtr sfp,SeqMgrFeatContextPtr context)9685 static Boolean LIBCALLBACK SPI_GetCDS(SeqFeatPtr sfp, SeqMgrFeatContextPtr context)
9686 {
9687 Boolean p3;
9688 Boolean p5;
9689 SPI_nPtr spin;
9690
9691 if (sfp == NULL)
9692 return FALSE;
9693 spin = (SPI_nPtr)context->userdata;
9694 if (context->seqfeattype == SEQFEAT_CDREGION && context->strand != Seq_strand_minus)
9695 {
9696 spin->n1 = context->left;
9697 spin->n2 = context->right;
9698 CheckSeqLocForPartial(sfp->location, &p5, &p3);
9699 if (p5)
9700 spin->n3 = 1;
9701 if (p3)
9702 spin->n4 = 1;
9703 }
9704 return TRUE;
9705 }
9706
9707
9708 /***************************************************************************
9709 *
9710 * SPI_GetProteinFrommRNA takes an mRNA bioseq and returns a string
9711 * which is the best protein translation of the mRNA. First, the function
9712 * looks to see whether there are any annotated CDSs, and if so, it uses
9713 * the translation of the annotated CDS. If not, the function translates
9714 * the mRNA in all 3 reading frames and looks for the frame with the
9715 * longest protein, then returns that protein.
9716 *
9717 ***************************************************************************/
SPI_GetProteinFrommRNA(BioseqPtr bsp_mrna,Int4Ptr start)9718 NLM_EXTERN CharPtr SPI_GetProteinFrommRNA(BioseqPtr bsp_mrna, Int4Ptr start)
9719 {
9720 ByteStorePtr bs;
9721 CharPtr c1;
9722 CharPtr c2;
9723 CharPtr c3;
9724 Int4 c1len;
9725 Int4 c2len;
9726 Int4 c3len;
9727 Int4 c1start;
9728 Int4 c2start;
9729 Int4 c3start;
9730 BoolPtr featDefFilter;
9731 CharPtr seq;
9732 SeqLocPtr slp;
9733 SPI_SeqPtr ssp;
9734 CharPtr tmp;
9735
9736 if (bsp_mrna == NULL)
9737 return NULL;
9738 SeqMgrIndexFeatures(0, (Pointer)bsp_mrna);
9739 featDefFilter = (BoolPtr)MemNew((FEATDEF_MAX)*sizeof(Boolean));
9740 featDefFilter[FEATDEF_CDS] = TRUE;
9741 ssp = (SPI_SeqPtr)MemNew(sizeof(SPI_Seq));
9742 SeqMgrExploreFeatures(bsp_mrna, (Pointer)ssp, SPI_GetCDSFeat, NULL, NULL, featDefFilter);
9743 seq = ssp->seq;
9744 *start = ssp->start;
9745 MemFree(featDefFilter);
9746 if (seq == NULL) /* no annotated CDS, have to translate to figure out the protein */
9747 {
9748 slp = SeqLocIntNew(0, bsp_mrna->length-1, Seq_strand_plus, bsp_mrna->id);
9749 bs = TransTableTranslateSeqLoc(NULL, slp, 1, 1, TRUE, TRUE);
9750 c1 = BSMerge(bs, NULL);
9751 BSFree(bs);
9752 bs = TransTableTranslateSeqLoc(NULL, slp, 1, 2, TRUE, TRUE);
9753 c2 = BSMerge(bs, NULL);
9754 BSFree(bs);
9755 bs = TransTableTranslateSeqLoc(NULL, slp, 1, 3, TRUE, TRUE);
9756 c3 = BSMerge(bs, NULL);
9757 BSFree(bs);
9758 c1len = SPI_FindLongestProt(c1, &c1start);
9759 c2len = SPI_FindLongestProt(c2, &c2start);
9760 c3len = SPI_FindLongestProt(c3, &c3start);
9761 if (c1len >= c2len && c1len >= c3len)
9762 {
9763 *start = 3*c1start;
9764 tmp = c1;
9765 tmp += c1start;
9766 seq = StringSave(tmp);
9767 } else if (c2len >= c1len && c2len >= c3len)
9768 {
9769 *start = 1+3*c2start;
9770 tmp = c2;
9771 tmp += c2start;
9772 seq = StringSave(tmp);
9773 } else if (c3len >= c1len && c3len >= c2len)
9774 {
9775 *start = 2+3*c3start;
9776 tmp = c3;
9777 tmp += c3start;
9778 seq = StringSave(tmp);
9779 }
9780 MemFree(c1);
9781 MemFree(c2);
9782 MemFree(c3);
9783 SeqLocFree(slp);
9784 }
9785 MemFree(ssp);
9786 return seq;
9787 }
9788
9789 /***************************************************************************
9790 *
9791 * SPI_GetCDSFeat is the SeqMgrExplore callback for SPI_GetProteinFrommRNA.
9792 * When a CDS feature is found, the function gets the protein byte store
9793 * corresponding to that feature, then converts the byte store into a
9794 * string representing the protein sequence.
9795 *
9796 ***************************************************************************/
SPI_GetCDSFeat(SeqFeatPtr sfp,SeqMgrFeatContextPtr context)9797 static Boolean LIBCALLBACK SPI_GetCDSFeat(SeqFeatPtr sfp, SeqMgrFeatContextPtr context)
9798 {
9799 ByteStorePtr bs;
9800 SPI_SeqPtr ssp;
9801
9802 ssp = (SPI_SeqPtr)(context->userdata);
9803 if (context->seqfeattype == SEQFEAT_CDREGION)
9804 {
9805 bs = ProteinFromCdRegionEx(sfp, TRUE, TRUE);
9806 ssp->seq = BSMerge(bs, NULL);
9807 ssp->start = context->left;
9808 BSFree(bs);
9809 }
9810 return TRUE;
9811 }
9812
/***************************************************************************
*
*  SPI_FindLongestProt looks through a string representing a protein
*  sequence (with stop codons, written as '*'), and returns the length of
*  the longest sub-protein (no stops) in the sequence, as well as the
*  position at which the longest protein starts.  The stretch after the
*  last stop (or the whole string, if it has no stop at all) is a
*  candidate too.  When several stretches share the longest length, the
*  first one is reported.
*
*  seq   NUL-terminated protein string; may be NULL
*  pos   receives the 0-based start of the longest stretch; it is set
*        to 0 when seq is NULL or holds no residues.  May be NULL.
*
***************************************************************************/
static Int4 SPI_FindLongestProt(CharPtr seq, Int4Ptr pos)
{
   Int4  best_len;
   Int4  best_start;
   Int4  i;
   Int4  run_len;
   Int4  run_start;

   best_len = 0;
   best_start = 0;
   if (seq != NULL)
   {
      run_len = 0;
      run_start = 0;
      for (i=0; ; i++)
      {
         if (seq[i] != '*' && seq[i] != '\0')
         {
            run_len++;
            continue;
         }
         /* a stop or the end of the string closes the current run */
         if (run_len > best_len)
         {
            best_len = run_len;
            best_start = run_start;
         }
         if (seq[i] == '\0')
            break;
         run_len = 0;
         run_start = i+1;
      }
   }
   if (pos != NULL)
      *pos = best_start;
   return best_len;
}
9854
SPI_GetAccessionFromSeqId(SeqIdPtr sip,Int4Ptr gi,CharPtr PNTR id)9855 static Boolean SPI_GetAccessionFromSeqId(SeqIdPtr sip, Int4Ptr gi, CharPtr PNTR id)
9856 {
9857 Boolean numeric_id_type = FALSE;
9858 Int2 id_len;
9859 GiimPtr gip;
9860 ObjectIdPtr oip;
9861 TextSeqIdPtr textsip;
9862 DbtagPtr dbtag;
9863 PatentSeqIdPtr psip;
9864 PDBSeqIdPtr pdbsip;
9865
9866 *id = NULL;
9867 *gi = 0;
9868
9869 switch (sip->choice) {
9870 case SEQID_GI: case SEQID_GIBBSQ: case SEQID_GIBBMT:
9871 *gi = sip->data.intvalue;
9872 numeric_id_type = TRUE;
9873 break;
9874 case SEQID_GIIM:
9875 gip = (GiimPtr) sip->data.ptrvalue;
9876 *gi = gip->id;
9877 numeric_id_type = TRUE;
9878 break;
9879 case SEQID_LOCAL:
9880 oip = (ObjectIdPtr) sip->data.ptrvalue;
9881
9882 if (oip->str) {
9883 id_len = StringLen(oip->str);
9884 *id = (CharPtr) MemNew(id_len+1);
9885 sprintf(*id, "%s", oip->str);
9886 } else {
9887 *id = (CharPtr) MemNew(6);
9888 sprintf(*id, "%d", oip->id);
9889 }
9890 break;
9891 case SEQID_GENBANK: case SEQID_EMBL: case SEQID_PIR: case SEQID_TPG: case SEQID_TPE: case SEQID_TPD:
9892 case SEQID_SWISSPROT: case SEQID_DDBJ: case SEQID_PRF:
9893 case SEQID_OTHER:
9894 textsip = (TextSeqIdPtr)sip->data.ptrvalue;
9895 id_len = StringLen(textsip->accession);
9896 *id = (CharPtr) MemNew(id_len+1);
9897 if (textsip->version > 0)
9898 sprintf(*id, "%s.%d", textsip->accession, textsip->version);
9899 else
9900 sprintf(*id, "%s", textsip->accession);
9901 break;
9902 case SEQID_GENERAL:
9903 dbtag = (DbtagPtr) sip->data.ptrvalue;
9904 if (dbtag->tag->str == NULL) {
9905 numeric_id_type = TRUE;
9906 *gi = dbtag->tag->id;
9907 } else {
9908 id_len = StringLen(dbtag->tag->str);
9909 *id = (CharPtr) MemNew(id_len+1);
9910 sprintf(*id, "%s", dbtag->tag->str);
9911 }
9912 break;
9913 case SEQID_PATENT:
9914 psip = (PatentSeqIdPtr) sip->data.ptrvalue;
9915 *gi = (Int4) psip->seqid;
9916 numeric_id_type = TRUE;
9917 break;
9918 case SEQID_PDB:
9919 pdbsip = (PDBSeqIdPtr) sip->data.ptrvalue;
9920 id_len = StringLen(pdbsip->mol);
9921 *id = (CharPtr) MemNew(id_len+4);
9922 sprintf(*id, "%s%d", pdbsip->mol, pdbsip->chain);
9923 break;
9924 default: break;
9925 }
9926
9927 return numeric_id_type;
9928 }
9929
/***************************************************************************
*
*  SPI_CheckSplicesForRevComp walks the list of regions and, for each
*  one whose percentage of exons with a splice donor site is below
*  SPI_REVCOMPTHRESH, realigns the reverse complement of the mRNA to the
*  genomic sequence.  If the reverse-complement alignment has a higher
*  donor-site percentage while keeping the coverage within
*  SPI_COVERDIFF and the mismatch within SPI_MISMTCHDIFF of the
*  original, its results replace those of the region and the region is
*  marked revcomp.  The mRNA bioseq is reverse-complemented back before
*  moving on.  Nothing is tried while spot->revcomp is already set.
*
***************************************************************************/
static void SPI_CheckSplicesForRevComp(SPI_RegionInfoPtr srip_head, SPI_OptionsPtr spot, BioseqPtr bsp_genomic, BioseqPtr bsp_mrna)
{
   Int4               donors_fwd;
   Int4               donors_rev;
   SPI_bsinfoPtr      genomic_info;
   Boolean            have_rev;
   Int4               i;
   SPI_bsinfoPtr      mrna_info;
   SPI_RegionInfoPtr  revcmp;
   SPI_RegionInfoPtr  srip;

   /** KSK bug fix - access of null srip->smp after
       SPI_SortRegionsByScore() removed all below -c
       threshold **/
   for (srip = srip_head; srip != NULL && srip->smp != NULL; srip = srip->next)
   {
      if (spot->revcomp != FALSE)
         continue;
      srip->revcmp_try = TRUE;
      donors_fwd = 0;
      for (i=0; i<srip->smp->numexons; i++)
      {
         donors_fwd += srip->smp->splicedon[i];
      }
      if ((donors_fwd*100)/srip->smp->numexons >= SPI_REVCOMPTHRESH)
         continue;   /* enough donor sites, orientation looks right */

      /* align the reverse complement of the mRNA */
      BioseqRevComp(bsp_mrna);
      ErrSetMessageLevel(SEV_MAX);
      genomic_info = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
      genomic_info->bsp = bsp_genomic;
      mrna_info = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
      mrna_info->bsp = bsp_mrna;
      if (spot->lcaseloc) /* fixes an ABW ? */
      {
         mrna_info->lcaseloc = spot->lcaseloc;
      }
      spot->revcomp = TRUE;
      revcmp = SPI_AlnSinglemRNAToGen(genomic_info, mrna_info, NULL, NULL, spot);
      spot->revcomp = FALSE;
      have_rev = (Boolean)(revcmp != NULL && revcmp->smp != NULL);
      if (have_rev)
      {
         /* the protein has to be taken while the mRNA is still flipped */
         MemFree(revcmp->smp->protein);
         revcmp->smp->protein = SPI_GetProteinFrommRNA(bsp_mrna, &revcmp->smp->transstart);
      }
      BioseqRevComp(bsp_mrna);
      if (!have_rev)
         continue;

      donors_rev = 0;
      for (i=0; i<revcmp->smp->numexons; i++)
      {
         donors_rev += revcmp->smp->splicedon[i];
      }
      if ((donors_rev*100)/revcmp->smp->numexons > (donors_fwd*100)/srip->smp->numexons
          && revcmp->smp->mRNAcoverage > srip->smp->mRNAcoverage - SPI_COVERDIFF
          && revcmp->smp->mismatch < srip->smp->mismatch + SPI_MISMTCHDIFF)
      {
         /* the reverse complement is the better model: adopt it */
         SPI_mRNAFree(srip->smp);
         srip->smp = revcmp->smp;
         srip->revcomp = TRUE;
         srip->mstart = revcmp->mstart;
         srip->mstop = revcmp->mstop;
         srip->strand = revcmp->strand;
         srip->coverage = revcmp->coverage;
         srip->score = revcmp->score;
         srip->polyAtail = revcmp->polyAtail;
         srip->fallsoff = revcmp->fallsoff;
         srip->utr = revcmp->utr;
         MemFree(revcmp);
      }
   }
}
10022
SPI_ProfileNew(Boolean nuc)10023 static ACTProfilePtr SPI_ProfileNew(Boolean nuc)
10024 {
10025 ACTProfilePtr app;
10026 FloatHiPtr PNTR freq;
10027
10028 app = (ACTProfilePtr)MemNew(sizeof(ACTProfile));
10029 if (nuc)
10030 {
10031 freq = (FloatHiPtr PNTR)MemNew(ACT_NUCLEN*sizeof(FloatHiPtr));
10032 app->freq = freq;
10033 app->nuc = TRUE;
10034 } else
10035 {
10036 freq = (FloatHiPtr PNTR)MemNew(ACT_PROTLEN*sizeof(FloatHiPtr));
10037 app->freq = freq;
10038 app->nuc = FALSE;
10039 }
10040 return app;
10041 }
10042
/***************************************************************************
*
*  SPI_ProfileFree releases a single profile: every frequency row, the
*  row table and the profile itself.  Profiles linked after it are not
*  touched.  Always returns NULL.
*
***************************************************************************/
static ACTProfilePtr SPI_ProfileFree(ACTProfilePtr app)
{
   Int4  numrows;
   Int4  row;

   if (app == NULL)
      return NULL;
   numrows = app->nuc ? ACT_NUCLEN : ACT_PROTLEN;
   row = 0;
   while (row < numrows)
   {
      MemFree(app->freq[row]);
      row++;
   }
   MemFree(app->freq);
   app->next = NULL;
   MemFree(app);
   return NULL;
}
10063
/***************************************************************************
*
*  SPI_ProfileSetFree releases a whole linked list of profiles, one
*  node at a time, via SPI_ProfileFree.  Always returns NULL.
*
***************************************************************************/
static ACTProfilePtr SPI_ProfileSetFree(ACTProfilePtr app)
{
   ACTProfilePtr  current;
   ACTProfilePtr  following;

   for (current = app; current != NULL; current = following)
   {
      /* remember the successor before the node goes away */
      following = current->next;
      current->next = NULL;
      SPI_ProfileFree(current);
   }
   return NULL;
}
10077
/***************************************************************************
*
*  SPI_BuildProfile adds one piece of one sequence to the profile *app.
*  *count is the column of the profile at which the piece begins.
*
*  slp == NULL denotes a gap of 'length' columns: no residues are
*  counted, only the column position advances.  Otherwise the residues
*  of slp are read and the matching frequency cells are incremented;
*  for nucleotides the ncbi4na values 1, 2, 4 and 8 go to rows 0-3 and
*  everything else to row 4, for proteins the ncbistdaa value is the row.
*
*  If the profile has no length yet, it takes the length of slp and its
*  rows are allocated.  A seqloc longer than an existing profile is
*  ignored.  numseq is incremented whenever a piece starts at column 0.
*  When the end of the profile is reached, *app moves on to the next
*  profile in the list and *count goes back to 0.
*
***************************************************************************/
static void SPI_BuildProfile(SeqLocPtr slp, ACTProfilePtr PNTR app, Int4Ptr count, Int4 length)
{
   Int4           col;
   ACTProfilePtr  cur;
   Int4           numrows;
   Uint1          res;
   Int4           row;
   Int4           slplen;
   SeqPortPtr     spp;

   if (app == NULL)
      return;
   cur = *app;
   if (slp == NULL)   /* gap: just step over the columns */
   {
      if (*count == 0)
         cur->numseq++;
      *count = *count+length;
      if (cur->len <= *count)
      {
         *count = 0;
         *app = cur->next;
      }
      return;
   }
   slplen = SeqLocLen(slp);
   if (slplen <= 0)
      return;
   if (cur->len == 0)
   {
      cur->len = slplen;
      numrows = cur->nuc ? ACT_NUCLEN : ACT_PROTLEN;
      for (row=0; row<numrows; row++)
      {
         cur->freq[row] = (FloatHiPtr)MemNew(cur->len*sizeof(FloatHi));
      }
   } else if (slplen > cur->len)  /* seqloc is longer than the */
      return;                     /* existing profile -- don't add it */
   spp = SeqPortNewByLoc(slp, cur->nuc ? Seq_code_ncbi4na : Seq_code_ncbistdaa);
   if (spp == NULL)
      return;
   if (*count == 0)
      cur->numseq++;
   col = *count;
   while ((res = SeqPortGetResidue(spp)) != SEQPORT_EOF && col<cur->len)
   {
      if (cur->nuc == FALSE)
         row = res;
      else
      {
         switch (res)
         {
            case 1:
               row = 0;
               break;
            case 2:
               row = 1;
               break;
            case 4:
               row = 2;
               break;
            case 8:
               row = 3;
               break;
            default:   /* anything ambiguous is counted in row 4 */
               row = 4;
               break;
         }
      }
      cur->freq[row][col]++;
      col++;
   }
   SeqPortFree(spp);
   if (slplen+*count == cur->len)
   {
      *app = cur->next;
      *count = 0;
   } else
      *count = *count + slplen;
}
10171
10172
SPI_MakeProfileFromSA(SeqAlignPtr sap)10173 static ACTProfilePtr SPI_MakeProfileFromSA(SeqAlignPtr sap)
10174 {
10175 AMAlignIndex2Ptr amaip;
10176 AlnMsg2Ptr amp;
10177 ACTProfilePtr app = NULL;
10178 ACTProfilePtr app_head = NULL;
10179 ACTProfilePtr app_prev = NULL;
10180 BioseqPtr bsp;
10181 Int4 count;
10182 Int4 i;
10183 Int4 j;
10184 Boolean more;
10185 Int4 n;
10186 Boolean nuc;
10187 Int4 numseg;
10188 Int4 numrows;
10189 SeqIdPtr sip;
10190 SeqLocPtr slp;
10191 Int4 start;
10192 Int4 stop;
10193
10194 if (sap == NULL)
10195 return NULL;
10196 if (sap->saip == NULL)
10197 return NULL;
10198 if (sap->saip->indextype == INDEX_PARENT)
10199 {
10200 amaip = (AMAlignIndex2Ptr)(sap->saip);
10201 if (amaip->alnstyle == AM2_LITE)
10202 return NULL;
10203 }
10204 sip = AlnMgr2GetNthSeqIdPtr(sap, 1);
10205 bsp = BioseqLockById(sip);
10206 if (bsp == NULL)
10207 return NULL;
10208 if (ISA_na(bsp->mol))
10209 nuc = TRUE;
10210 else
10211 nuc = FALSE;
10212 BioseqUnlockById(sip);
10213 sip = SeqIdFree(sip);
10214 amp = AlnMsgNew2();
10215 amp->to_aln = -1;
10216 amp->row_num = 1;
10217 app_head = NULL;
10218 numseg = AlnMgr2GetNumSegs(sap);
10219 for (i=0; i<numseg; i++)
10220 {
10221 app = SPI_ProfileNew(nuc);
10222 AlnMgr2GetNthSegmentRange(sap, i+1, &start, &stop);
10223 app->len = stop - start + 1;
10224 if (nuc)
10225 {
10226 for (j=0; j<ACT_NUCLEN; j++)
10227 {
10228 app->freq[j] = (FloatHiPtr)MemNew(app->len*sizeof(FloatHi));
10229 }
10230 } else
10231 {
10232 for (j=0; j<ACT_PROTLEN; j++)
10233 {
10234 app->freq[j] = (FloatHiPtr)MemNew(app->len*sizeof(FloatHi));
10235 }
10236 }
10237 if (app_head != NULL)
10238 {
10239 app_prev->next = app;
10240 app_prev = app;
10241 } else
10242 app_head = app_prev = app;
10243 }
10244 numrows = AlnMgr2GetNumRows(sap);
10245 for (i=1; i<=numrows; i++)
10246 {
10247 app = app_head;
10248 for (n=0; n<numseg; n++)
10249 {
10250 AlnMsgReNew2(amp);
10251 AlnMgr2GetNthSegmentRange(sap, n+1, &->from_aln, &->to_aln);
10252 amp->row_num = i;
10253 sip = AlnMgr2GetNthSeqIdPtr(sap, i);
10254 bsp = BioseqLockById(sip);
10255 count = 0;
10256 while ((Boolean) (more = AlnMgr2GetNextAlnBit(sap, amp)) && app != NULL)
10257 {
10258 if (amp->type == AM_SEQ && bsp != NULL)
10259 {
10260 slp = SeqLocIntNew(amp->from_row, amp->to_row, amp->strand, sip);
10261 SPI_BuildProfile(slp, &app, &count, 0);
10262 SeqLocFree(slp);
10263 } else if (amp->type == AM_GAP)
10264 SPI_BuildProfile(NULL, &app, &count, (amp->to_row - amp->from_row + 1));
10265 }
10266 BioseqUnlockById(sip);
10267 sip = SeqIdFree(sip);
10268 }
10269 }
10270 AlnMsgFree2(amp);
10271 return app_head;
10272 }
10273
10274 /***********************************************************
10275 * SPI_CheckMrnaOrder
10276 *
10277 * After the ivals for building a region are sorted in genomic
10278 * order this function merely checks that the mrna invterals
10279 * are minimally colinear: if the stop of one interval overlaps
10280 * or 'jumps' by more than 20 bases the start of the next interval,
10281 * the one with greatest score is retained and the one with lesser
10282 * score set to 'impossible.
10283 ***********************************************************/
SPI_CheckMrnaOrder(SPI_IvalPtr PNTR spi_pp,const int num)10284 static void SPI_CheckMrnaOrder(SPI_IvalPtr PNTR spi_pp, const int num)
10285 {
10286 SPI_IvalPtr ival = 0, ival2 = 0, ival3 = 0;
10287 int x = 0;
10288
10289 if (num >=3){
10290 for (x = 0, ival = spi_pp[x], ival2 = spi_pp[x + 1];
10291 x < num && ival != 0 && ival2 != 0;
10292 ++x, ival = spi_pp[x],
10293 ival2 = (x + 1 < num ? spi_pp[x + 1] : 0)){
10294 if (x < num - 2){ /* three to window */
10295 ival3 = spi_pp[x + 2];
10296 if ((ival->strand == Seq_strand_plus == ival2->strand
10297 && ival3->strand == ival->strand
10298 && (ival->mstop > ival2->mstart + SPI_FUZZ
10299 && ival->mstop < ival3->mstart + SPI_FUZZ))
10300 || (ival->strand == Seq_strand_minus == ival2->strand
10301 && ival3->strand == ival->strand
10302 && (ival->mstop + SPI_FUZZ < ival2->mstart
10303 && ival->mstop + SPI_FUZZ > ival3->mstart))){
10304 if (ival->score > ival2->score){
10305 ival2->used = -1;
10306 }
10307 else if (ival2->score > ival->score){
10308 ival->used = -1;
10309 }
10310 }
10311 }
10312 else if (x < num - 1){ /* two to window */
10313 if ((ival->strand == Seq_strand_plus == ival2->strand
10314 && ival->mstop > ival2->mstart)
10315 || (ival->strand == Seq_strand_plus == ival2->strand
10316 && ival->mstop < ival2->mstart)){
10317 if (ival2->score > ival->score){
10318 ival->used = -1;
10319 }
10320 else if (ival->score > ival2->score){
10321 ival2->used = -1;
10322 }
10323 }
10324 }
10325 }
10326 }
10327 }
10328