1 /* $Id: cddutil.c,v 1.96 2007/05/07 13:27:49 kans Exp $
2 *===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: cddutil.c
27 *
28 * Author: Aron Marchler-Bauer
29 *
30 * Initial Version Creation Date: 10/18/1999
31 *
32 * $Revision: 1.96 $
33 *
34 * File Description: CDD utility routines
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * $Log: cddutil.c,v $
39 * Revision 1.96 2007/05/07 13:27:49 kans
40 * added casts for Seq-data.gap (SeqDataPtr, SeqGapPtr, ByteStorePtr)
41 *
42 * Revision 1.95 2007/01/22 20:16:38 kans
43 * added new parameter to posPurgeMatches from posit.h
44 *
45 * Revision 1.94 2005/08/31 20:34:30 coulouri
46 * From Mike Gertz:
47 * cddutil.c in CddReadBlastOptions:
48 * - Removed manipulation of hitlist_size as it is no longer correct to
49 * manipulate the hitlist_size that way.
50 * - Increase options->expect_value for composition-based statistics,
51 * mode > 1 (although the routine does not yet understand mode > 1).
52 *
53 * Revision 1.93 2004/11/05 15:36:28 coulouri
54 * remove explicit stdio.h include; it causes LONG_BIT to be defined incorrectly on amd64
55 *
56 * Revision 1.92 2004/06/22 14:20:19 camacho
57 * Changed invocation of posFreqsToMatrix to conform with new signature.
58 *
59 * Revision 1.91 2003/12/09 22:21:35 bauer
60 * added CddCountResTypes
61 *
62 * Revision 1.90 2003/12/09 18:48:05 bauer
63 * commented out deprecated fields in BlastKarlinBlkCopy
64 *
65 * Revision 1.89 2003/10/07 21:19:39 bauer
66 * made CddMasterIs3D more general
67 *
68 * Revision 1.88 2003/08/25 19:09:47 bauer
69 * added SeqAlignReadFromFile
70 *
71 * Revision 1.87 2003/05/22 18:39:14 bauer
72 * list txids in Cdd XML dumper
73 *
74 * Revision 1.86 2003/05/21 17:25:21 bauer
75 * optional ObjMgr in CddReadDBGetBioseq
76 *
77 * Revision 1.85 2003/05/09 20:38:12 bauer
78 * report invalid intervals in CddRelocateSeqLoc
79 *
80 * Revision 1.84 2003/05/08 14:40:09 bauer
81 * bugfix in CddConsensus
82 *
83 * Revision 1.83 2003/04/25 17:17:32 thiessen
84 * fix return value type
85 *
86 * Revision 1.82 2003/04/25 14:36:20 bauer
87 * impalaScaling now returns boolean value
88 *
89 * Revision 1.81 2003/02/06 21:04:27 bauer
90 * fixed bug in reindexing to consensus
91 *
92 * Revision 1.80 2002/12/03 14:36:31 bauer
93 * added CddMSLMixedToMSLDenDiag
94 *
95 * Revision 1.79 2002/11/22 21:35:23 bauer
96 * added SeqAnnotReadFromFile and preservation of scores in DenseSeg to DenseDiag conversion
97 *
98 * Revision 1.78 2002/10/22 14:54:27 bauer
99 * fixed date format in XML dumper
100 *
101 * Revision 1.77 2002/10/10 20:38:19 bauer
102 * changes to accomodate new spec items
103 * - old-root node
104 * - curation-status
105 *
106 * Revision 1.76 2002/10/02 17:32:21 bauer
107 * avoid merging blocks when reindexing alignments
108 *
109 * Revision 1.75 2002/08/17 11:55:07 bauer
110 * backed out changes
111 *
112 * Revision 1.74 2002/08/16 19:51:45 bauer
113 * added Ben's CddSrvGetStyle2
114 *
115 * Revision 1.73 2002/08/06 12:54:26 bauer
116 * fixes to accomodate COGs
117 *
118 * Revision 1.72 2002/07/31 17:15:29 kans
119 * added prototype for BlastDefLineSetFree for Mac compiler
120 *
121 * Revision 1.71 2002/07/31 14:58:58 bauer
122 * BLAST DB Sequence Retrieval
123 *
124 * Revision 1.70 2002/07/17 18:54:10 bauer
125 * CddFeaturesAreConsistent now returns explicit error messages
126 *
127 * Revision 1.69 2002/07/11 14:43:43 bauer
128 * column 21(X) is now always -1 in PSSMs
129 *
130 * Revision 1.68 2002/07/10 22:51:10 bauer
131 * changed score for 26th pssm column to match whats done by makemat
132 *
133 * Revision 1.67 2002/07/10 22:06:07 bauer
134 * replaced posScaling by impalaScaling
135 *
136 * Revision 1.66 2002/07/10 15:34:31 bauer
137 * made SipIsConsensus public
138 *
139 * Revision 1.65 2002/07/09 21:12:39 bauer
140 * added CddDenDiagCposComp2KBP to return Karlin-Altschul parameters together with PSSM
141 *
142 * Revision 1.64 2002/07/05 21:09:25 bauer
143 * added GetAlignmentSize
144 *
145 * Revision 1.63 2002/06/12 15:22:51 bauer
146 * 6/11/02 update
147 *
148 * Revision 1.62 2002/05/31 17:54:31 thiessen
149 * fix for read-only string literal (e.g. on Mac)
150 *
151 * Revision 1.61 2002/05/24 17:49:01 bauer
152 * Unlock Bioseqs again in SeqAlignInform
153 *
154 * Revision 1.60 2002/05/16 22:37:49 bauer
155 * free seqalign in CddExpAlignToSeqAlign if empty
156 *
157 * Revision 1.59 2002/05/06 16:59:50 bauer
158 * remove blanks in inferred CD short names
159 *
160 * Revision 1.58 2002/04/25 14:30:19 bauer
161 * fixed CddFindMMDBIdInBioseq
162 *
163 * Revision 1.57 2002/04/22 16:37:31 bauer
164 * added check for missing structure alignments
165 *
166 * Revision 1.56 2002/04/18 21:00:26 bauer
167 * added check CddFeaturesAreConsistent
168 *
169 * Revision 1.55 2002/04/12 14:02:43 bauer
170 * added update_date case to CddAssignDescr
171 *
172 * Revision 1.54 2002/04/11 14:34:25 bauer
173 * added CddRemoveConsensus
174 *
175 * Revision 1.53 2002/03/28 15:55:00 bauer
176 * fixed bug in CddFindBioseqInSeqEntry, thanks to Ben
177 *
178 * Revision 1.52 2002/02/20 22:27:28 bauer
179 * utility functions for the CD-Server
180 *
181 * Revision 1.51 2002/02/12 23:00:46 bauer
182 * added missing break in CddRelocateSeqLoc
183 *
184 * Revision 1.50 2002/02/06 19:35:37 bauer
185 * some more CddGet.. functionality
186 *
187 * Revision 1.49 2002/02/05 23:15:40 bauer
188 * added a few CddGet.. utility functions, changes to CddAddDescr allow to extend scrapbook line by line
189 *
190 * Revision 1.48 2002/01/28 14:11:29 bauer
191 * add score to entrez neighbor lists
192 *
193 * Revision 1.47 2002/01/05 14:49:44 bauer
194 * made SeqAlignDup a local function
195 *
196 * Revision 1.46 2002/01/04 19:46:56 bauer
197 * added functions to interconvert PSSM-Ids and accessions
198 *
199 * Revision 1.45 2001/11/15 15:35:13 kans
200 * changed strdup to StringSave for Mac
201 *
202 * Revision 1.44 2001/11/13 19:51:52 bauer
203 * support for annotation transfer in alignment reindexing
204 *
205 * Revision 1.43 2001/05/24 15:02:22 kans
206 * included salpacc for Mac compiler
207 *
208 * Revision 1.42 2001/05/23 21:18:05 bauer
209 * added functions for alignment block structure control
210 *
211 * Revision 1.41 2001/04/11 19:49:56 kans
212 * include profiles.h for Mac compiler
213 *
214 * Revision 1.40 2001/04/10 20:26:23 bauer
215 * *** empty log message ***
216 *
217 * Revision 1.39 2001/04/10 20:18:09 bauer
218 * write out ascii-formatted mtx-Files for copymat
219 *
220 * Revision 1.38 2001/03/07 20:27:37 bauer
221 * fixed bug in CddBioseqCopy
222 *
223 * Revision 1.37 2001/03/07 16:30:33 bauer
224 * fixed alignment reindexing bug
225 *
226 * Revision 1.36 2001/03/05 20:25:09 bauer
227 * fixed gap-penalty selection when changing scoring matrix
228 *
229 * Revision 1.35 2001/02/15 15:41:41 shoemake
230 * fixed small memory leak
231 *
232 * Revision 1.34 2001/02/14 17:11:03 bauer
233 * relaced calls to BioseqCopy with CddBioseqCopy
234 *
235 * Revision 1.33 2001/02/13 20:55:10 bauer
236 * fixed bug when comparing local ids
237 *
238 * Revision 1.32 2001/02/07 11:36:51 thiessen
239 * fix minor memory leak
240 *
241 * Revision 1.31 2001/02/06 22:55:35 bauer
242 * Scoring Matrix now a function parameter in CddDenDiagCposComp2
243 *
244 * Revision 1.30 2001/02/06 20:56:10 hurwitz
245 * get rid of problematic BioSeqNew
246 *
247 * Revision 1.29 2001/02/05 22:58:46 bauer
248 * added alignment reindexing function
249 *
250 * Revision 1.28 2001/02/01 17:15:15 bauer
251 * removed SeqIdDup in CddReadBlastOptions
252 *
253 * Revision 1.27 2001/01/26 15:06:40 lewisg
254 * use entrez2 to retrieve structures
255 *
256 * Revision 1.26 2001/01/24 03:08:09 bauer
257 * fixed memory leaks
258 *
259 * Revision 1.25 2001/01/22 21:25:06 hurwitz
260 * bioseq id shouldn't be freed
261 *
262 * Revision 1.24 2001/01/19 23:34:55 bauer
263 * fixed memory leaks
264 *
265 * Revision 1.23 2001/01/19 21:43:44 bauer
266 * removed dependency from NR for threading PSSM calculations
267 *
268 * Revision 1.22 2001/01/17 21:32:02 bauer
269 * changes to PSSM calculation
270 *
271 * Revision 1.21 2001/01/12 01:33:32 bauer
272 * removed CddGetBlastOptions
273 *
274 * Revision 1.20 2001/01/12 00:17:01 bauer
275 * added routines for information content calculation
276 *
277 * Revision 1.19 2001/01/11 23:48:24 bauer
278 * added check for consensus-Cd
279 *
280 * Revision 1.18 2000/12/29 00:52:51 hurwitz
281 * cleaning memory leaks
282 *
283 * Revision 1.17 2000/12/20 18:56:40 hurwitz
284 * new random num gen, more debug printing
285 *
286 * Revision 1.16 2000/12/01 19:36:57 hurwitz
287 * added scale factor to PSSM calcs
288 *
289 * Revision 1.15 2000/11/22 22:34:49 hurwitz
290 * changed pssm scale factor to match contact potential scale factor
291 *
292 * Revision 1.14 2000/11/14 22:08:44 hurwitz
293 * added weighting for pssm calculation
294 *
295 * Revision 1.13 2000/10/10 18:55:06 shoemake
296 * fixed memory error in CddAssignProfileRange
297 *
298 * Revision 1.12 2000/09/08 21:43:51 hurwitz
299 * adding PSSM calculation to DDE
300 *
301 * Revision 1.11 2000/09/01 21:59:04 hurwitz
302 * re-order columns from PSSM of CDs to column order expected in threading
303 *
304 * Revision 1.10 2000/08/30 21:33:55 hurwitz
305 * added new and free functions for Seq_Mtf and Qry_Seq
306 *
307 * Revision 1.9 2000/08/30 16:03:57 bauer
308 * fixed GCC compiler warning fo CddGetPairId
309 *
310 * Revision 1.8 2000/08/14 21:52:04 hurwitz
311 * added CddCalcPSSM
312 *
313 * Revision 1.7 2000/08/14 19:36:26 hurwitz
314 * got CddCposComp working and tested
315 *
316 * Revision 1.6 2000/08/11 19:54:00 hurwitz
317 * restored CddDenDiagCposComputation and CddCposComputation to original, added CddCposComp which combines the 2
318 *
319 * Revision 1.5 2000/08/10 18:18:59 kans
320 * commented out direct calls to ID1 services
321 *
322 * Revision 1.4 2000/08/10 16:50:06 kans
323 * use StringSave instead of unavailable strdup
324 *
325 * Revision 1.3 2000/08/09 21:29:08 hurwitz
326 * adding cddutil.c to VC++ build
327 *
328 * Revision 1.2 2000/07/19 19:32:55 bauer
329 * added modification logging
330 *
331 *
332 * ==========================================================================
333 */
334
335
336 #include <ncbi.h>
337 /*#include <accentr.h>*/
338 #include <lsqfetch.h>
339 /* #include <netentr.h> */
340 /* #include <www.h> */
341 /* #include <sys/resource.h> */
342 /*#include <accutils.h>*/
343 #include <mmdbapi.h>
344 #include <mmdbapi1.h>
345 /* #include <asnmime.h> */
346 #include <objseq.h>
347 #include <objmime.h>
348 #include <strimprt.h>
349 #include <cdd.h>
350 /* #include <objcdd.h> */
351 #include <readdb.h>
352 #include <fdlobj.h>
353 #include <cddutil.h>
354 #include <edutil.h>
355 #include <posit.h>
356 #include <cddposutil.h>
357 #include <blastpri.h>
358 #include <accid1.h>
359 #include <thrddecl.h>
360 #include <profiles.h>
361 #include <salpacc.h>
362
363 /* Multiplier by which to increase the evalue when mode > 1 composition-based
364 * statistics is being used. */
365 #define EVALUE_EXPAND 10
366
367 static void CddCposCompPart1(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp,
368 compactSearchItems* compactSearch, ValNodePtr* LetterHead,
369 posSearchItems* posSearch);
370
371 /*---------------------------------------------------------------------------*/
372 /*---------------------------------------------------------------------------*/
373 /* Cdd asn1 reader and writer wrappers */
374 /*---------------------------------------------------------------------------*/
375 /*---------------------------------------------------------------------------*/
CddWriteToFile(CddPtr pcdd,CharPtr cFile,Boolean bBin)376 Boolean LIBCALL CddWriteToFile(CddPtr pcdd, CharPtr cFile, Boolean bBin)
377 {
378 AsnIoPtr aip = NULL;
379 Boolean bWriteOK;
380
381 if (!pcdd) return(FALSE);
382 if (bBin) {
383 aip = AsnIoOpen(cFile,"wb");
384 } else {
385 aip = AsnIoOpen(cFile,"w");
386 }
387 bWriteOK = CddAsnWrite(pcdd,aip,NULL);
388 AsnIoClose(aip);
389 return(bWriteOK);
390 }
391
392
CddReadFromFile(CharPtr cFile,Boolean bBin)393 CddPtr LIBCALL CddReadFromFile(CharPtr cFile, Boolean bBin)
394 {
395 AsnIoPtr aip = NULL;
396 CddPtr pcdd;
397
398 if (bBin) {
399 aip = AsnIoOpen(cFile,"rb");
400 } else {
401 aip = AsnIoOpen(cFile,"r");
402 }
403 pcdd = CddAsnRead(aip,NULL);
404 AsnIoClose(aip);
405 return(pcdd);
406 }
407
SeqAnnotReadFromFile(CharPtr cFile,Boolean bBin)408 SeqAnnotPtr LIBCALL SeqAnnotReadFromFile(CharPtr cFile, Boolean bBin)
409 {
410 AsnIoPtr aip = NULL;
411 SeqAnnotPtr sap;
412
413 if (bBin) {
414 aip = AsnIoOpen(cFile,"rb");
415 } else {
416 aip = AsnIoOpen(cFile,"r");
417 }
418 sap = SeqAnnotAsnRead(aip, NULL);
419 AsnIoClose(aip);
420 return(sap);
421 }
422
SeqAlignReadFromFile(CharPtr cFile,Boolean bBin)423 SeqAlignPtr LIBCALL SeqAlignReadFromFile(CharPtr cFile, Boolean bBin)
424 {
425 AsnIoPtr aip = NULL;
426 SeqAlignPtr salp;
427
428 if (bBin) {
429 aip = AsnIoOpen(cFile,"rb");
430 } else {
431 aip = AsnIoOpen(cFile,"r");
432 }
433 salp = SeqAlignAsnRead(aip, NULL);
434 AsnIoClose(aip);
435 return(salp);
436 }
437
438 /*---------------------------------------------------------------------------*/
439 /*---------------------------------------------------------------------------*/
440 /* Cdd-tree asn1 reader and writer wrappers */
441 /*---------------------------------------------------------------------------*/
442 /*---------------------------------------------------------------------------*/
CddTreeWriteToFile(CddTreePtr pcddt,CharPtr cFile,Boolean bBin)443 Boolean LIBCALL CddTreeWriteToFile(CddTreePtr pcddt, CharPtr cFile, Boolean bBin)
444 {
445 AsnIoPtr aip = NULL;
446 Boolean bWriteOK;
447
448 if (!pcddt) return(FALSE);
449 if (bBin) {
450 aip = AsnIoOpen(cFile,"wb");
451 } else {
452 aip = AsnIoOpen(cFile,"w");
453 }
454 bWriteOK = CddTreeAsnWrite(pcddt,aip,NULL);
455 AsnIoClose(aip);
456 return(bWriteOK);
457 }
458
459
CddTreeReadFromFile(CharPtr cFile,Boolean bBin)460 CddTreePtr LIBCALL CddTreeReadFromFile(CharPtr cFile, Boolean bBin)
461 {
462 AsnIoPtr aip = NULL;
463 CddTreePtr pcddt;
464
465 if (bBin) {
466 aip = AsnIoOpen(cFile,"rb");
467 pcddt = (CddTreePtr) CddTreeAsnRead(aip,NULL);
468 } else {
469 aip = AsnIoOpen(cFile,"r");
470 pcddt = (CddTreePtr) CddTreeAsnRead(aip,NULL);
471 }
472
473 AsnIoClose(aip);
474 return(pcddt);
475 }
476
477 /*---------------------------------------------------------------------------*/
478 /*---------------------------------------------------------------------------*/
479 /* Functions to add to a Cdd data structure */
480 /*---------------------------------------------------------------------------*/
481 /*---------------------------------------------------------------------------*/
CddAssignDescr(CddPtr pcdd,Pointer pThis,Int4 iWhat,Int4 iIval)482 void LIBCALL CddAssignDescr(CddPtr pcdd, Pointer pThis, Int4 iWhat, Int4 iIval)
483 {
484 ValNodePtr vnp, vnp2;
485
486 vnp = ValNodeNew(NULL);
487 vnp->choice = iWhat;
488 switch (iWhat) {
489 case CddDescr_othername:
490 vnp->data.ptrvalue = (CharPtr) pThis;
491 break;
492 case CddDescr_category:
493 vnp->data.ptrvalue = (CharPtr) pThis;
494 break;
495 case CddDescr_comment:
496 vnp->data.ptrvalue = (CharPtr) pThis;
497 break;
498 case CddDescr_reference:
499 vnp->data.ptrvalue = (ValNodePtr) pThis;
500 break;
501 case CddDescr_create_date:
502 case CddDescr_update_date:
503 vnp->data.ptrvalue = (DatePtr) pThis;
504 break;
505 case CddDescr_tax_source:
506 vnp->data.ptrvalue = (OrgRefPtr) pThis;
507 break;
508 case CddDescr_source:
509 vnp->data.ptrvalue = (CharPtr) pThis;
510 break;
511 case CddDescr_status:
512 vnp->data.intvalue = (Int4) iIval;
513 break;
514 case CddDescr_curation_status:
515 vnp->data.intvalue = (Int4) iIval;
516 break;
517 case CddDescr_old_root:
518 vnp->data.ptrvalue = (ValNodePtr) pThis;
519 break;
520 case CddDescr_scrapbook:
521 vnp = ValNodeExtractList(&pcdd->description, CddDescr_scrapbook);
522 vnp2 = ValNodeCopyStr(NULL,0,(CharPtr)pThis);
523 if(vnp) {
524 if(vnp->next) ErrPostEx(SEV_WARNING,0,0,"Multiple scrapbooks");
525 ValNodeLink((ValNodePtr *)&vnp->data.ptrvalue, vnp2);
526 } else ValNodeAddPointer(&vnp, CddDescr_scrapbook, vnp2);
527 break;
528 default:
529 vnp->data.ptrvalue = pThis;
530 break;
531 }
532 ValNodeLink(&(pcdd->description),vnp);
533 }
534
535
536 /*---------------------------------------------------------------------------*/
537 /*---------------------------------------------------------------------------*/
538 /* remove an item from the CD description. Works for a limited set of types. */
539 /* status and repeats are removed no matter what. For comments, references */
540 /* etc. the content of the description is compared to the input */
541 /*---------------------------------------------------------------------------*/
542 /*---------------------------------------------------------------------------*/
CddKillDescr(CddPtr pcdd,Pointer pThis,Int4 iWhat,Int4 iIval)543 Boolean LIBCALL CddKillDescr(CddPtr pcdd, Pointer pThis, Int4 iWhat, Int4 iIval)
544 {
545 ValNodePtr vnp, vnpold = NULL, vnpthis = NULL, vnpbefore = NULL, pub;
546
547 vnp = pcdd->description;
548 while (vnp) {
549 if (vnp->choice == iWhat) {
550 switch (iWhat) {
551 case CddDescr_comment:
552 case CddDescr_othername:
553 case CddDescr_category:
554 case CddDescr_source:
555 if (Nlm_StrCmp((CharPtr)pThis,(CharPtr)vnp->data.ptrvalue) == 0) {
556 if (!vnpthis) {
557 vnpthis = vnp;
558 vnpbefore = vnpold;
559 }
560 }
561 break;
562 case CddDescr_status:
563 case CddDescr_curation_status:
564 case CddDescr_old_root:
565 case CddDescr_repeats:
566 if (!vnpthis) {
567 vnpthis = vnp;
568 vnpbefore = vnpold;
569 }
570 break;
571 case CddDescr_reference:
572 pub = vnp->data.ptrvalue;
573 if ((pThis && pThis == pub) ||
574 (iIval>0 && pub->choice==PUB_PMid && pub->data.intvalue==iIval)) {
575 if (!vnpthis) {
576 vnpthis = vnp;
577 vnpbefore = vnpold;
578 }
579 }
580 default:
581 break;
582 }
583 }
584 vnpold = vnp;
585 vnp = vnp->next;
586 }
587 if (vnpthis) {
588 if (vnpbefore) {
589 vnpbefore->next = vnpthis->next;
590 } else {
591 pcdd->description = vnpthis->next;
592 }
593 return TRUE;
594 } else return FALSE;
595 }
596
597 /*---------------------------------------------------------------------------*/
598 /*---------------------------------------------------------------------------*/
599 /* series of functions to extract info from CDs */
600 /*---------------------------------------------------------------------------*/
601 /*---------------------------------------------------------------------------*/
CddGetVersion(CddPtr pcdd)602 Int4 LIBCALL CddGetVersion(CddPtr pcdd)
603 {
604 CddIdPtr cid;
605 GlobalIdPtr pGid;
606
607 cid = pcdd->id; while (cid) {
608 if (cid->choice == CddId_gid) {
609 pGid = cid->data.ptrvalue;
610 return pGid->version;
611 }
612 cid = cid->next;
613 }
614 return 0;
615 }
616
CddGetOrgRef(CddPtr pcdd)617 OrgRefPtr LIBCALL CddGetOrgRef(CddPtr pcdd)
618 {
619 CddDescrPtr pcdsc;
620
621 pcdsc = pcdd->description;
622 while (pcdsc) {
623 if (pcdsc->choice == CddDescr_tax_source) {
624 return (OrgRefPtr) pcdsc->data.ptrvalue;
625 }
626 pcdsc = pcdsc->next;
627 }
628 return NULL;
629 }
630
CddGetPssmId(CddPtr pcdd)631 Int4 LIBCALL CddGetPssmId(CddPtr pcdd)
632 {
633 CddIdPtr cid;
634
635 cid = pcdd->id; while (cid) {
636 if (cid->choice == CddId_uid) {
637 return cid->data.intvalue;
638 }
639 cid = cid->next;
640 }
641 return 0;
642 }
643
CddGetPmIds(CddPtr pcdd,Int4Ptr iPMids)644 Int4 LIBCALL CddGetPmIds(CddPtr pcdd, Int4Ptr iPMids)
645 {
646 Int4 count = 0;
647 CddDescrPtr pcdsc;
648 ValNodePtr pub;
649 Boolean bRefOpen = FALSE;
650
651 if (iPMids) iPMids = MemNew(250*sizeof(Int4));
652 pcdsc = pcdd->description;
653 while (pcdsc) {
654 if (pcdsc->choice == CddDescr_reference) {
655 pub = pcdsc->data.ptrvalue;
656 if (pub->choice == PUB_PMid) {
657 iPMids[count] = pub->data.intvalue;
658 count++;
659 if (count >= 250) return count;
660 }
661 }
662 pcdsc = pcdsc->next;
663 }
664 return count;
665 }
666
667
CddGetDescr(CddPtr pcdd)668 CharPtr LIBCALL CddGetDescr(CddPtr pcdd)
669 {
670 CddDescrPtr pcdsc;
671
672 pcdsc = pcdd->description;
673 while (pcdsc) {
674 if (pcdsc->choice == CddDescr_comment) {
675 if (Nlm_StrCmp(pcdsc->data.ptrvalue,"linked to 3D-structure") != 0) {
676 return (CharPtr) pcdsc->data.ptrvalue;
677 }
678 }
679 pcdsc = pcdsc->next;
680 }
681 return NULL;
682 }
683
CddGetCreateDate(CddPtr pcdd)684 DatePtr LIBCALL CddGetCreateDate(CddPtr pcdd)
685 {
686 CddDescrPtr pcdsc;
687
688 pcdsc = pcdd->description;
689 while (pcdsc) {
690 if (pcdsc->choice == CddDescr_create_date) {
691 return (DatePtr) pcdsc->data.ptrvalue;
692 }
693 pcdsc = pcdsc->next;
694 }
695 return NULL;
696 }
697
CddGetUpdateDate(CddPtr pcdd)698 DatePtr LIBCALL CddGetUpdateDate(CddPtr pcdd)
699 {
700 DatePtr pcd = NULL, pcdthis;
701 CddDescrPtr pcdsc;
702
703 pcdsc = pcdd->description;
704 while (pcdsc) {
705 if (pcdsc->choice == CddDescr_update_date) {
706 pcdthis = (DatePtr) pcdsc->data.ptrvalue;
707 if (!pcd) {
708 pcd = pcdthis;
709 } else {
710 if (pcdthis->data[1] > pcd->data[1]) {
711 pcd = pcdthis;
712 } else {
713 if (pcdthis->data[2] > pcd->data[2]) {
714 pcd = pcdthis;
715 } else {
716 if (pcdthis->data[3] > pcd->data[3]) pcd = pcdthis;
717 }
718 }
719 }
720 }
721 pcdsc = pcdsc->next;
722 }
723 return pcd;
724 }
725
CddGetSource(CddPtr pcdd)726 CharPtr LIBCALL CddGetSource(CddPtr pcdd)
727 {
728 CddDescrPtr pcdsc;
729
730 pcdsc = pcdd->description;
731 while (pcdsc) {
732 if (pcdsc->choice == CddDescr_source) {
733 return (CharPtr) pcdsc->data.ptrvalue;
734 }
735 pcdsc = pcdsc->next;
736 }
737 return NULL;
738 }
739
CddGetSourceId(CddPtr pcdd)740 CharPtr LIBCALL CddGetSourceId(CddPtr pcdd)
741 {
742 CddDescrPtr pcdsc;
743 GlobalIdPtr pGid;
744 ValNodePtr vnp;
745
746 pcdsc = pcdd->description;
747 while (pcdsc) {
748 if (pcdsc->choice == CddDescr_source_id) {
749 vnp = pcdsc->data.ptrvalue;
750 if (vnp->choice == CddId_gid) {
751 pGid = vnp->data.ptrvalue;
752 return (CharPtr) pGid->accession;
753 }
754 }
755 pcdsc = pcdsc->next;
756 }
757 return NULL;
758 }
759
CddGetAlignmentLength(CddPtr pcdd)760 Int4 LIBCALL CddGetAlignmentLength(CddPtr pcdd)
761 {
762 SeqAnnotPtr sap;
763 SeqAlignPtr salp;
764 Int4 iLength = 0;
765
766 sap = pcdd->seqannot;
767 if (sap) {
768 salp = (SeqAlignPtr) sap->data;
769 while (salp) {
770 iLength++;
771 salp = salp->next;
772 }
773 }
774 return iLength + 1;
775 }
776
777 /*---------------------------------------------------------------------------*/
778 /* return number of rows and average number of blocks per row */
779 /*---------------------------------------------------------------------------*/
GetAlignmentSize(SeqAlignPtr salp)780 Int4Ptr LIBCALL GetAlignmentSize(SeqAlignPtr salp)
781 {
782 Int4Ptr pRet;
783 Int4 iLength = 0;
784 Int4 iBlocks = 0;
785 DenseDiagPtr ddp;
786 SeqAlignPtr salpThis;
787
788 salpThis = salp;
789 while (salpThis) {
790 iLength++;
791 ddp = salpThis->segs;
792 while (ddp) {
793 iBlocks++;
794 ddp = ddp->next;
795 }
796 salpThis = salpThis->next;
797 }
798 pRet = MemNew(2*sizeof(Int4));
799 pRet[0] = iLength + 1;
800 pRet[1] = iBlocks / iLength;
801 return pRet;
802 }
803
CddGetAnnotNames(CddPtr pcdd)804 ValNodePtr LIBCALL CddGetAnnotNames(CddPtr pcdd)
805 {
806 AlignAnnotPtr aap;
807 ValNodePtr vnp, vnpHead = NULL;
808
809 aap = pcdd->alignannot;
810 while (aap) {
811 vnp = ValNodeCopyStr(&(vnpHead),0,aap->description);
812 aap = aap->next;
813 }
814 return (vnpHead);
815 }
816
817 /*---------------------------------------------------------------------------*/
818 /*---------------------------------------------------------------------------*/
819 /* Query the status of a CD */
820 /*---------------------------------------------------------------------------*/
821 /*---------------------------------------------------------------------------*/
CddGetStatus(CddPtr pcdd)822 Int4 LIBCALL CddGetStatus(CddPtr pcdd)
823 {
824 CddDescrPtr pCddesc;
825
826 pCddesc = pcdd->description;
827 while (pCddesc) {
828 if (pCddesc->choice == CddDescr_status) {
829 return(pCddesc->data.intvalue);
830 }
831 pCddesc = pCddesc->next;
832 }
833 return 0;
834 }
835
836 /*---------------------------------------------------------------------------*/
837 /* find a particular descriptive string in a CD */
838 /*---------------------------------------------------------------------------*/
CddHasDescription(CddPtr pcdd,CharPtr pc)839 Boolean LIBCALL CddHasDescription(CddPtr pcdd, CharPtr pc)
840 {
841 CddDescrPtr pCddesc;
842
843 if (!pcdd) return FALSE;
844 pCddesc = pcdd->description;
845 while (pCddesc) {
846 if (pCddesc->choice == CddDescr_comment) {
847 if (Nlm_StrCmp((CharPtr)pCddesc->data.ptrvalue,pc) == 0) return TRUE;
848 }
849 pCddesc = pCddesc->next;
850 }
851 return FALSE;
852 }
853
854 /*---------------------------------------------------------------------------*/
855 /* Is the alignment decorated with feature annotation? */
856 /*---------------------------------------------------------------------------*/
CddHasAnnotation(CddPtr pcdd)857 Boolean LIBCALL CddHasAnnotation(CddPtr pcdd)
858 {
859 AlignAnnotPtr aap;
860
861 aap = pcdd->alignannot;
862 if (aap) {
863 if (aap->description) return TRUE;
864 }
865 return FALSE;
866 }
867
868 /*---------------------------------------------------------------------------*/
869 /*---------------------------------------------------------------------------*/
870 /* Is the CD master a 3D Structure */
871 /*---------------------------------------------------------------------------*/
872 /*---------------------------------------------------------------------------*/
CddMasterIs3D(CddPtr pcdd)873 Boolean LIBCALL CddMasterIs3D(CddPtr pcdd)
874 {
875 SeqAlignPtr salp;
876 DenseDiagPtr ddp;
877 SeqIdPtr sip;
878
879 if (!pcdd) return FALSE;
880 if (!pcdd->seqannot) return FALSE;
881 salp = pcdd->seqannot->data;
882 ddp = salp->segs;
883 sip = ddp->id;
884 if (sip->choice == SEQID_PDB) return TRUE;
885 if (sip->choice == SEQID_LOCAL) {
886 sip = ddp->id->next;
887 if (sip->choice == SEQID_PDB) return TRUE;
888 }
889 return FALSE;
890 }
891
892 /*---------------------------------------------------------------------------*/
893 /*---------------------------------------------------------------------------*/
894 /* How many structure-related alignments in a CDD? */
895 /*---------------------------------------------------------------------------*/
896 /*---------------------------------------------------------------------------*/
CddCount3DAlignments(CddPtr pcdd)897 Int4 LIBCALL CddCount3DAlignments(CddPtr pcdd)
898 {
899 Int4 n3dali = 0;
900 Boolean bHasConsensus;
901 Boolean bHas3DMaster;
902 SeqAlignPtr salp;
903 DenseDiagPtr ddp;
904 SeqIdPtr sip;
905
906
907 bHasConsensus = CddHasConsensus(pcdd);
908 bHas3DMaster = CddMasterIs3D(pcdd);
909
910 if (!bHas3DMaster) return(0);
911 salp = pcdd->seqannot->data;
912 while (salp) {
913 ddp = salp->segs;
914 sip = ddp->id->next;
915 if (sip->choice == SEQID_PDB) n3dali++;
916 salp = salp->next;
917 }
918 if (n3dali && bHasConsensus) n3dali--;
919 return (n3dali);
920 }
921
922 /*---------------------------------------------------------------------------*/
923 /* Check if alignment master is a consensus Sequence */
924 /*---------------------------------------------------------------------------*/
SeqAlignHasConsensus(SeqAlignPtr salp)925 Boolean LIBCALL SeqAlignHasConsensus(SeqAlignPtr salp)
926 {
927 DenseDiagPtr ddp;
928 SeqIdPtr sip;
929 ObjectIdPtr oidp;
930
931 if (!salp) return(FALSE);
932 ASSERT(salp->segtype == SAS_DENDIAG);
933 ddp = salp->segs; if (!ddp) return (FALSE);
934 sip = ddp->id; if (!sip) return (FALSE);
935 if (sip->choice != SEQID_LOCAL) return (FALSE);
936 oidp = sip->data.ptrvalue; if (!oidp) return(FALSE);
937 if (StringCmp(oidp->str,"consensus")==0) return (TRUE);
938 return(FALSE);
939 }
940
941 /*---------------------------------------------------------------------------*/
942 /* Check if Cdd has a consensus Sequence */
943 /*---------------------------------------------------------------------------*/
SipIsConsensus(SeqIdPtr sip)944 Boolean LIBCALL SipIsConsensus(SeqIdPtr sip)
945 {
946 ObjectIdPtr oidp;
947
948 if (!sip) return (FALSE);
949 if (sip->choice != SEQID_LOCAL) return (FALSE);
950 oidp = sip->data.ptrvalue; if (!oidp) return(FALSE);
951 if (StringCmp(oidp->str,"consensus")==0) return (TRUE);
952 return(FALSE);
953 }
954
CddHasConsensus(CddPtr pcdd)955 Boolean LIBCALL CddHasConsensus(CddPtr pcdd)
956 {
957 SeqIntPtr sintp;
958 SeqIdPtr sip;
959 SeqAlignPtr salp;
960 DenseDiagPtr ddp;
961
962 if (!pcdd) return (FALSE);
963 sintp = (SeqIntPtr) pcdd->profile_range;
964 if (!sintp) {
965 salp = pcdd->seqannot->data;
966 ddp = salp->segs;
967 sip = ddp->id;
968 return (SipIsConsensus(sip));
969 }
970 sip = (SeqIdPtr) sintp->id;
971 return(SipIsConsensus(sip));
972 }
973
974 /*---------------------------------------------------------------------------*/
975 /*---------------------------------------------------------------------------*/
976 /* fix CD file names and accessions / Character Arrays must be allocated */
977 /*---------------------------------------------------------------------------*/
978 /*---------------------------------------------------------------------------*/
CddRegularizeFileName(CharPtr cIn,CharPtr cAcc,CharPtr cFn,CharPtr cEx)979 void LIBCALL CddRegularizeFileName(CharPtr cIn, CharPtr cAcc, CharPtr cFn,
980 CharPtr cEx)
981 {
982 if (Nlm_StrStr(cIn, cEx) != 0) {
983 Nlm_StrCpy(cFn,cIn);
984 Nlm_StrNCpy(cAcc,cIn,Nlm_StrLen(cIn)-Nlm_StrLen(cEx));
985 cAcc[Nlm_StrLen(cIn)-Nlm_StrLen(cEx)] = '\0';
986 } else {
987 Nlm_StrCpy(cAcc,cIn);
988 Nlm_StrCpy(cFn,cIn);
989 if (cEx[0] == '.') {
990 Nlm_StrCat(cFn,cEx);
991 }
992 else {
993 Nlm_StrCat(cFn,".");
994 Nlm_StrCat(cFn,cEx);
995 }
996 }
997 }
998
999 /*---------------------------------------------------------------------------*/
1000 /*---------------------------------------------------------------------------*/
1001 /* repeat detection - find repeated subsequences in a CD consensus Sequence */
1002 /*---------------------------------------------------------------------------*/
1003 /*---------------------------------------------------------------------------*/
CddCheckForRepeats(CddPtr pcdd,Int4 width,Int4 GapI,Int4 GapE,Nlm_FloatHi rthresh,Boolean bOutput)1004 Boolean LIBCALL CddCheckForRepeats(CddPtr pcdd, Int4 width, Int4 GapI, Int4 GapE,
1005 Nlm_FloatHi rthresh, Boolean bOutput)
1006 {
1007 CddRepeatPtr pcdr;
1008 BLAST_ScoreBlkPtr sbp;
1009 BioseqPtr bsp;
1010 Int4 len, winner = 1;
1011 Int4 **iscore, repct = 0;
1012 Int4 **fscore, bst, pnum, plen;
1013 Int2 iStatus;
1014 Int4 i, j, k, l, m, n, t1, t2, laststart;
1015 SeqPortPtr spp;
1016 Uint1Ptr buffer;
1017 Boolean done = FALSE;
1018 Boolean tbdone;
1019 Nlm_FloatHi cfac, wplaus = 0.0;
1020 SeqLocPtr slp, slpHead;
1021 SeqIntPtr sintp;
1022
1023 typedef struct repstep {
1024 Int4 score;
1025 Int4 start;
1026 Int4 npred;
1027 Int4 lpred;
1028 Int4 lremn;
1029 Nlm_FloatHi plaus;
1030 } RepStep;
1031
1032 RepStep rep[100];
1033
1034 if (!CddHasConsensus) return FALSE;
1035 bsp = pcdd->trunc_master;
1036 len = bsp->length;
1037 sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
1038 sbp->read_in_matrix = TRUE;
1039 sbp->protein_alphabet = TRUE;
1040 sbp->posMatrix = NULL;
1041 sbp->number_of_contexts = 1;
1042 iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
1043
1044 spp = SeqPortNew(bsp, 0, LAST_RESIDUE, Seq_strand_unknown,Seq_code_ncbistdaa);
1045 buffer = MemNew((len)*sizeof(Uint1));
1046 for (i=0; i<len; i++) buffer[i] = SeqPortGetResidue(spp);
1047 spp = SeqPortFree(spp);
1048
1049 iscore = (Int4 **) MemNew(len * sizeof(Int4 *));
1050 for (i=0;i<len;i++) iscore[i] = (Int4 *) MemNew(len * sizeof(Int4));
1051 fscore = (Int4 **) MemNew(len * sizeof(Int4 *));
1052 for (i=0;i<len;i++) fscore[i] = (Int4 *) MemNew(len * sizeof(Int4));
1053
1054 for (i=0;i<len;i++) {
1055 t1 = buffer[i];
1056 iscore[i][i] = sbp->matrix[t1][t1];
1057 for (j=i+1;j<len;j++) {
1058 t2 = buffer[j];
1059 iscore[i][j] = sbp->matrix[t1][t2];
1060 fscore[j][i] = 0;
1061 }
1062 }
1063
1064 laststart = 0;
1065 while (!done) {
1066 for (i=0;i<len;i++) for (j=i;j<len;j++) fscore[i][j] = iscore[i][j];
1067 for (i=len-2;i>=0;i--) {
1068 for (j=len-2;j>=i;j--) {
1069 bst = fscore[i+1][j+1];
1070 for (k = i+2;k<i+2+width && k<len;k++) bst = MAX(bst,fscore[k][j+1]-GapI-GapE*(k-i-1));
1071 for (l = j+2;l<j+2+width && l<len;l++) bst = MAX(bst,fscore[i+1][l]-GapI-GapE*(l-j-1));
1072 bst = MAX(bst, 0);
1073 fscore[i][j] += bst;
1074 }
1075 }
1076
1077 bst = 0; l = 0;
1078 for (j=laststart;j<len;j++) if (fscore[0][j] >= bst) {
1079 bst = fscore[0][j]; l = j;
1080 }
1081 if (bst <= 0.0) {
1082 done = TRUE;
1083 break;
1084 }
1085 laststart = l;
1086 /* printf("Next best alignment scores %4d at %4d (of %4d: %5.1f%%)\n",bst,l,len, 100.0*l/len); */
1087 rep[repct].start = l;
1088 rep[repct].score = bst;
1089 rep[repct].npred = 1;
1090 rep[repct].plaus = 0.0;
1091 rep[repct].lpred = len;
1092 rep[repct].lremn = len - rep[repct].start;
1093 if (repct > 0) {
1094 rep[repct].lpred = (Int4) (0.5 + ((Nlm_FloatHi) l / (Nlm_FloatHi) repct));
1095 rep[repct].npred = (Int4) (0.5 + ((Nlm_FloatHi) len / (Nlm_FloatHi) rep[repct].lpred));
1096 }
1097 repct++;
1098 i = 0; j = l;
1099 tbdone = FALSE;
1100 if (l >= len-1) {
1101 tbdone = TRUE;
1102 done = TRUE;
1103 }
1104 while (!tbdone) {
1105 m = i+1; n = j+1;
1106 bst = fscore[m][n];
1107 for (k=i+2;k<i+2+width && k<len;k++) {
1108 if (fscore[k][j+1]-GapI-GapE*(k-i-1) > bst) {
1109 bst = fscore[k][j+1]-GapI-GapE*(k-i-1);
1110 m = k; n = j+1;
1111 }
1112 }
1113 for (l=j+2;l<j+2+width && l<len;l++) {
1114 if (fscore[i+1][l]-GapI-GapE*(l-j-1) > bst) {
1115 bst = fscore[i+1][l]-GapI-GapE*(l-j-1);
1116 m = i+1; n = l;
1117 }
1118 }
1119 iscore[i][j] = -10000;
1120 if (bst < 0) tbdone = TRUE;
1121 i = m; j = n;
1122 if (i >= len-1 || j >= len-1) {
1123 tbdone = TRUE;
1124 iscore[i][j] = -10000;
1125 }
1126 }
1127 }
1128
1129 for (i=0;i<len;i++) MemFree(fscore[i]);
1130 MemFree(fscore);
1131 for (i=0;i<len;i++) MemFree(iscore[i]);
1132 MemFree(iscore);
1133 rep[0].plaus = (Nlm_FloatHi) (rep[0].score - rep[1].score) / (Nlm_FloatHi) rep[0].score;
1134 if (rep[0].plaus > rthresh) rep[0].plaus = 1.0;
1135 winner = 1; wplaus = rep[0].plaus;
1136
1137 for (i=1;i<repct;i++) {
1138 pnum = i+1;
1139 plen = (Int4) (((Nlm_FloatHi)len/(Nlm_FloatHi)pnum)+.5);
1140 cfac = 0.0;
1141 for (j=i;j>0;j--) {
1142 cfac += (Nlm_FloatHi)(abs(rep[j].start-(j*plen)))/(Nlm_FloatHi)plen;
1143 }
1144 cfac /= (Nlm_FloatHi) i;
1145 rep[i].plaus = 1.0 - cfac;
1146
1147 if (rep[i].plaus > wplaus) {
1148 wplaus = rep[i].plaus;
1149 winner = pnum;
1150 }
1151 }
1152 for (i=0;i<winner;i++) {
1153 if (rep[i].lpred < 3 || (i>0 && rep[i].score < 17)) {
1154 winner = 1; break;
1155 }
1156 }
1157
1158 if (bOutput) {
1159 for (i=0;i<repct;i++) {
1160 printf("%s %3d %4d %8.5f %4d %8.5f %2d %3d %8.5f\n",pcdd->name,
1161 i,rep[i].score,(Nlm_FloatHi)rep[i].score/(Nlm_FloatHi)rep[0].score,
1162 rep[i].start, (Nlm_FloatHi)rep[i].start/(Nlm_FloatHi)len,
1163 rep[i].npred,rep[i].lpred, rep[i].plaus);
1164 }
1165 printf("%s: predicted %d repeats\n",pcdd->name,winner);
1166 }
1167
1168 CddKillDescr(pcdd,NULL,CddDescr_repeats,0);
1169 if (winner > 1) {
1170 pcdr = CddRepeatNew(); pcdr->avglen = 0;
1171 slp = NULL; slpHead = NULL;
1172 for (i=0;i<winner;i++) {
1173 k = rep[i].start;
1174 if (i < (winner-1)) l = rep[i+1].start - 1; else l = len - 1;
1175 pcdr->avglen += l-k+1;
1176 slp = (SeqLocPtr) ValNodeNew(NULL);
1177 slp->choice = SEQLOC_INT;
1178 sintp = SeqIntNew();
1179 sintp->from = k;
1180 sintp->to = l;
1181 sintp->id = SeqIdDup(pcdd->profile_range->id);
1182 slp->data.ptrvalue = sintp;
1183 if (!slpHead) slpHead = slp; else ValNodeLink(&(slpHead),slp);
1184 }
1185 pcdr->avglen /= winner;
1186 pcdr->count = winner;
1187 pcdr->location = (SeqLocPtr) ValNodeNew(NULL);
1188 pcdr->location->choice = SEQLOC_PACKED_INT;
1189 pcdr->location->data.ptrvalue = slpHead;
1190 CddAssignDescr(pcdd,pcdr,CddDescr_repeats,0);
1191 }
1192
1193 sbp = BLAST_ScoreBlkDestruct(sbp);
1194 MemFree(buffer);
1195 if (winner > 1) return TRUE;
1196 return (FALSE);
1197 }
1198
1199
1200 /*---------------------------------------------------------------------------*/
1201 /*---------------------------------------------------------------------------*/
1202 /* retrieve the accession of a CD as a character string */
1203 /*---------------------------------------------------------------------------*/
1204 /*---------------------------------------------------------------------------*/
CddGetAccession(CddPtr pcdd)1205 CharPtr LIBCALL CddGetAccession(CddPtr pcdd)
1206 {
1207 CharPtr cAccession = NULL;
1208 ValNodePtr vnp;
1209 GlobalIdPtr pGid;
1210
1211 if (!pcdd) return NULL;
1212 vnp = pcdd->id;
1213 while (vnp) {
1214 if (vnp->choice == CddId_gid) {
1215 pGid = (GlobalIdPtr) vnp->data.ptrvalue;
1216 cAccession = StringSave (pGid->accession);
1217 }
1218 vnp = vnp->next;
1219 }
1220 return cAccession;
1221 }
1222
1223 /*---------------------------------------------------------------------------*/
1224 /*---------------------------------------------------------------------------*/
1225 /* report Errors in processing and exit immediately */
1226 /*---------------------------------------------------------------------------*/
1227 /*---------------------------------------------------------------------------*/
CddSimpleHtmlError(CharPtr cErrTxt)1228 void LIBCALL CddSimpleHtmlError(CharPtr cErrTxt)
1229 {
1230 printf("Content-type: text/html\n\n");
1231 printf("<h2>Error:</h2>\n");
1232 printf("<h3>%s</h3>\n",cErrTxt);
1233 exit(1);
1234 }
1235
CddSevError(CharPtr cErrTxt)1236 void LIBCALL CddSevError(CharPtr cErrTxt)
1237 {
1238 printf(" Error: %s\n",cErrTxt);
1239 exit(1);
1240 }
1241
1242 /*---------------------------------------------------------------------------*/
1243 /*---------------------------------------------------------------------------*/
1244 /* Find bioseqptr in Cdd internal sequence set */
1245 /*---------------------------------------------------------------------------*/
1246 /*---------------------------------------------------------------------------*/
CddFindSip(SeqIdPtr sip,SeqEntryPtr sep)1247 BioseqPtr LIBCALL CddFindSip(SeqIdPtr sip, SeqEntryPtr sep)
1248 {
1249 BioseqPtr bsp = NULL;
1250 BioseqSetPtr bssp;
1251 SeqEntryPtr sepThis;
1252
1253 sepThis = sep;
1254
1255 while (sepThis) {
1256 if (sepThis->choice == 2) {
1257 bssp = sepThis->data.ptrvalue;
1258 bsp = CddFindSip(sip, bssp->seq_set);
1259 } else if (sepThis->choice == 1) {
1260 bsp = (BioseqPtr) sepThis->data.ptrvalue;
1261 if (CddSameSip(bsp->id, sip)) return(bsp);
1262 }
1263 sepThis = sepThis->next;
1264 }
1265 return(NULL);
1266 }
1267
1268
1269 /*---------------------------------------------------------------------------*/
1270 /*---------------------------------------------------------------------------*/
1271 /* version of BioseqCopy that doesn't use BioseqFind and expects the old bsp */
1272 /* adapted by Ben */
1273 /*---------------------------------------------------------------------------*/
1274 /*---------------------------------------------------------------------------*/
CddBioseqCopy(SeqIdPtr newid,BioseqPtr oldbsp,Int4 from,Int4 to,Uint1 strand,Boolean do_feat)1275 BioseqPtr LIBCALL CddBioseqCopy (SeqIdPtr newid, BioseqPtr oldbsp, Int4 from,
1276 Int4 to, Uint1 strand, Boolean do_feat)
1277 {
1278 BioseqPtr newbsp=NULL, tmpbsp;
1279 Uint1 seqtype;
1280 ValNodePtr tmp;
1281 ObjectIdPtr oid;
1282 Int4 len;
1283 ValNode fake;
1284 SeqLocPtr the_segs, head, curr;
1285 Boolean handled = FALSE, split;
1286 SeqFeatPtr sfp, newsfp, lastsfp;
1287 DeltaSeqPtr dsp;
1288 SeqEntryPtr oldscope;
1289
1290
1291 if (from < 0) return FALSE;
1292 if (oldbsp == NULL) return NULL;
1293
1294 if (oldbsp->seq_data_type == Seq_code_gap) return NULL;
1295
1296 len = to - from + 1;
1297 if (len <= 0) return NULL;
1298 newbsp = BioseqNew();
1299 if (newid != NULL) newbsp->id = SeqIdDup(newid);
1300 else {
1301 tmp = ValNodeNew(NULL);
1302 tmp->choice = SEQID_LOCAL;
1303 oid = ObjectIdNew();
1304 tmp->data.ptrvalue = (Pointer)oid;
1305 oid->str = StringSave("Clipboard");
1306 tmpbsp = BioseqFind(tmp); /* old clipboard present? */
1307 if (tmpbsp == NULL) {
1308 oldscope = SeqEntrySetScope (NULL);
1309 if (oldscope != NULL) {
1310 tmpbsp = BioseqFind(tmp);
1311 SeqEntrySetScope (oldscope);
1312 }
1313 }
1314 if (tmpbsp != NULL) BioseqFree(tmpbsp);
1315 newbsp->id = tmp;
1316 }
1317
1318 newbsp->repr = oldbsp->repr;
1319 newbsp->mol = oldbsp->mol;
1320 newbsp->length = len;
1321 newbsp->seq_ext_type = oldbsp->seq_ext_type;
1322
1323 if (newbsp->repr == Seq_repr_virtual)
1324 handled = TRUE; /* no more to do */
1325
1326 if ((newbsp->repr == Seq_repr_raw) ||
1327 (newbsp->repr == Seq_repr_const)) {
1328 if (ISA_aa(newbsp->mol)) {
1329 seqtype = Seq_code_ncbieaa;
1330 } else {
1331 seqtype = Seq_code_iupacna;
1332 }
1333 /*---------------------------------------------------------------------------*/
1334 /* need to check whether seqtypes agree, otherwise the ByteStore is invalid! */
1335 /*---------------------------------------------------------------------------*/
1336 if (seqtype != oldbsp->seq_data_type) {
1337 seqtype = oldbsp->seq_data_type;
1338 }
1339 newbsp->seq_data_type = seqtype;
1340 newbsp->seq_data = (SeqDataPtr) BSDup((ByteStorePtr) oldbsp->seq_data);
1341 handled = TRUE;
1342 }
1343
1344 if ((newbsp->repr == Seq_repr_seg) ||
1345 (newbsp->repr == Seq_repr_ref) ||
1346 (newbsp->repr == Seq_repr_delta)) {
1347 if (newbsp->repr == Seq_repr_seg) { /* segmented */
1348 fake.choice = SEQLOC_MIX; /* make SEQUENCE OF Seq-loc, into one */
1349 fake.data.ptrvalue = oldbsp->seq_ext;
1350 fake.next = NULL;
1351 the_segs = (SeqLocPtr)&fake;
1352 head = (SeqLocPtr)SeqLocCopyPart (the_segs, from, to, strand, FALSE, NULL, NULL);
1353 } else if (newbsp->repr == Seq_repr_ref) { /* reference: is a Seq-loc */
1354 head = (SeqLocPtr)SeqLocCopyPart ((SeqLocPtr)(oldbsp->seq_ext), from, to,
1355 strand, TRUE, NULL, NULL);
1356 } else if (newbsp->repr == Seq_repr_delta) {
1357 dsp = (DeltaSeqPtr)(oldbsp->seq_ext); /* real data is here */
1358 the_segs = (SeqLocPtr)DeltaSeqsToSeqLocs(dsp);
1359 head = (SeqLocPtr)SeqLocCopyPart (the_segs, from, to, strand, FALSE, NULL, NULL);
1360 SeqLocFree (the_segs);
1361 }
1362 newbsp->seq_ext = (Pointer)head;
1363 handled = TRUE;
1364 }
1365
1366 if (newbsp->repr == Seq_repr_map) {
1367 lastsfp = NULL;
1368 for (sfp = (SeqFeatPtr)(oldbsp->seq_ext); sfp != NULL; sfp = sfp->next) {
1369 split = FALSE;
1370 curr = (SeqLocPtr)SeqLocCopyRegion(newbsp->id, sfp->location, oldbsp, from, to, strand, &split);
1371 if (curr != NULL) { /* got one */
1372 newsfp = (SeqFeatPtr)AsnIoMemCopy((Pointer)sfp, (AsnReadFunc)SeqFeatAsnRead, (AsnWriteFunc)SeqFeatAsnWrite);
1373 SeqLocFree(newsfp->location);
1374 newsfp->location = curr;
1375 if (split)
1376 newsfp->partial = TRUE;
1377 if (lastsfp == NULL) /* first one */
1378 newbsp->seq_ext = (Pointer)newsfp;
1379 else
1380 lastsfp->next = newsfp;
1381 lastsfp = newsfp;
1382 }
1383 }
1384 handled = TRUE;
1385 }
1386 if (! handled) goto erret;
1387 if (do_feat)
1388 SeqFeatsCopy (newbsp, oldbsp, from, to, strand);
1389
1390 return newbsp;
1391
1392 erret:
1393 BioseqFree(newbsp);
1394 return NULL;
1395 }
1396
1397
1398 /*---------------------------------------------------------------------------*/
1399 /*---------------------------------------------------------------------------*/
1400 /* extract from a SeqEntryPtr a "minimum bioseq" with a bit of description */
1401 /*---------------------------------------------------------------------------*/
1402 /*---------------------------------------------------------------------------*/
CddExtractBioseq(SeqEntryPtr sep,SeqIdPtr sip)1403 BioseqPtr LIBCALL CddExtractBioseq(SeqEntryPtr sep, SeqIdPtr sip)
1404 {
1405 BioseqPtr bsp, bspNew, bspTemp;
1406 BioseqSetPtr bssp;
1407 SeqIdPtr sipNew;
1408 ValNodePtr vnpThis, vnp = NULL;
1409 SeqAnnotPtr sanp;
1410
1411 bsp = BioseqFindInSeqEntry(sip,sep);
1412 sipNew = SeqIdDup(sip);
1413 bspNew = (BioseqPtr) CddBioseqCopy(sipNew,bsp,0,bsp->length-1,0,FALSE);
1414 sipNew = SeqIdFree(sipNew);
1415 if (sep->choice == 2) {
1416 bssp = sep->data.ptrvalue;
1417 vnp = bssp->descr;
1418 bssp->descr = NULL;
1419 } else if (sep->choice == 1) {
1420 bspTemp = sep->data.ptrvalue;
1421 vnp = bspTemp->descr;
1422 bspTemp->descr = NULL;
1423 }
1424 sanp = bsp->annot;
1425 vnpThis = bsp->descr;
1426 bsp->descr = NULL;
1427 ValNodeLink(&(vnp),vnpThis);
1428 bspNew->descr = vnp;
1429 if (sanp) {
1430 bspNew->annot = sanp;
1431 bsp->annot = NULL;
1432 }
1433 return(bspNew);
1434 }
1435
1436
1437 /*---------------------------------------------------------------------------*/
1438 /*---------------------------------------------------------------------------*/
1439 /* Cdd specific alignment format converters */
1440 /*---------------------------------------------------------------------------*/
1441 /*---------------------------------------------------------------------------*/
1442
1443 /*---------------------------------------------------------------------------*/
1444 /*---------------------------------------------------------------------------*/
1445 /* Create a Dense-Diag seqalign from a Discontinuous SeqAlign */
1446 /*---------------------------------------------------------------------------*/
1447 /*---------------------------------------------------------------------------*/
CddMSLMixedToMSLDenDiag(SeqAlignPtr salp)1448 SeqAlignPtr LIBCALL CddMSLMixedToMSLDenDiag(SeqAlignPtr salp)
1449 {
1450 SeqAlignPtr salpSub, salpnew, salphead, salptail;
1451 DenseDiagPtr ddp, ddphead, ddptail;
1452 DenseSegPtr dsp;
1453 Int4 i, s1, s2;
1454 ScorePtr scp, scpHead, scpTail, scpThis;
1455
1456 if (!salp) return(NULL);
1457 salphead = NULL;
1458 salptail = NULL;
1459 while (salp) {
1460 if (salp->segtype) {
1461 if (salp->segtype != SAS_DISC && salp->segtype != SAS_DENSEG) {
1462 CddSevError("CddMSLDiscToMSLDenDiag: Wrong segtype specified!");
1463 }
1464 }
1465 if (salp->dim) {
1466 if (salp->dim != 2) {
1467 CddSevError("CddMSLDiscToMSLDenDiag: Expect alignments of dimension 2!");
1468 }
1469 }
1470 ddphead = NULL;
1471 ddptail = NULL;
1472
1473 if (salp->segtype == SAS_DISC) {
1474 salpSub = salp->segs;
1475 while (salpSub) {
1476 if (salpSub->segtype != SAS_DENSEG)
1477 CddSevError("CddMSLDiscToMSLDenDiag: Wrong segtype in Sub-Alignment!");
1478 if (salpSub->dim && salpSub->dim !=2)
1479 CddSevError("CddMSLDiscToMSLDenDiag: Wrong dimension in Sub-Alignment!");
1480 dsp = salpSub->segs;
1481 for (i=0;i<dsp->numseg;i++) {
1482 s1 = dsp->starts[i*2];
1483 s2 = dsp->starts[i*2+1];
1484 if (s1 >=0 && s2>= 0) {
1485 ddp = DenseDiagNew();
1486 ddp->starts = MemNew(2*sizeof(Int4));
1487 ddp->starts[0] = s1;
1488 ddp->starts[1] = s2;
1489 ddp->len=dsp->lens[i];
1490 ddp->id = SeqIdDupList(dsp->ids);
1491 ddp->dim = 2;
1492 if (!ddphead) {
1493 ddphead = ddp;
1494 ddptail = ddp;
1495 ddptail->next = NULL;
1496 } else {
1497 ddptail->next = ddp;
1498 ddptail = ddp;
1499 ddptail->next = NULL;
1500 }
1501 }
1502 }
1503 salpSub = salpSub->next;
1504 }
1505 } else { /* segtype is plain SAS_DENSEG */
1506 dsp = salp->segs;
1507 ddphead = NULL;
1508 ddptail = NULL;
1509 for (i=0;i<dsp->numseg;i++) {
1510 s1 = dsp->starts[i*2];
1511 s2 = dsp->starts[i*2+1];
1512 if (s1 >=0 && s2>= 0) {
1513 ddp = DenseDiagNew();
1514 ddp->starts = MemNew(2*sizeof(Int4));
1515 ddp->starts[0] = s1;
1516 ddp->starts[1] = s2;
1517 ddp->len=dsp->lens[i];
1518 ddp->id = SeqIdDupList(dsp->ids);
1519 ddp->dim = 2;
1520 if (!ddphead) {
1521 ddphead = ddp;
1522 ddptail = ddp;
1523 ddptail->next = NULL;
1524 } else {
1525 ddptail->next = ddp;
1526 ddptail = ddp;
1527 ddptail->next = NULL;
1528 }
1529 }
1530 }
1531 }
1532 salpnew = SeqAlignNew();
1533 salpnew->type = SAT_PARTIAL;
1534 salpnew->segtype = SAS_DENDIAG;
1535 salpnew->dim = 2;
1536 salpnew->segs = ddphead;
1537 scp = salp->score; scpHead = NULL;
1538 while (scp) {
1539 scpThis = ScoreNew();
1540 scpThis->id = ObjectIdDup(scp->id);
1541 scpThis->choice = scp->choice;
1542 scpThis->value = scp->value;
1543 if (!scpHead) {
1544 scpHead = scpThis;
1545 } else {
1546 scpTail->next = scpThis;
1547 }
1548 scpTail = scpThis;
1549 scp = scp->next;
1550 }
1551 salpnew->score = scpHead;
1552 if (!salphead) {
1553 salphead = salpnew;
1554 salptail = salpnew;
1555 salptail->next = NULL;
1556 } else {
1557 salptail->next = salpnew;
1558 salptail = salpnew;
1559 salptail->next = NULL;
1560 }
1561 salp = salp->next;
1562 }
1563 return(salphead);
1564 }
1565
1566 /*---------------------------------------------------------------------------*/
1567 /*---------------------------------------------------------------------------*/
1568 /* Create a Dense-Diag seqalign from a Dense-Seg SeqAlign */
1569 /*---------------------------------------------------------------------------*/
1570 /*---------------------------------------------------------------------------*/
CddMSLDenSegToMSLDenDiag(SeqAlignPtr salp)1571 SeqAlignPtr LIBCALL CddMSLDenSegToMSLDenDiag(SeqAlignPtr salp)
1572 {
1573 SeqAlignPtr salpnew, salphead, salptail;
1574 DenseDiagPtr ddp, ddphead, ddptail;
1575 DenseSegPtr dsp;
1576 Int4 i, s1, s2;
1577 ScorePtr scp, scpHead, scpTail, scpThis;
1578
1579
1580 if (!salp) return(NULL);
1581 salphead = NULL;
1582 salptail = NULL;
1583 while (salp) {
1584 if (salp->segtype) {
1585 if (salp->segtype != SAS_DENSEG) {
1586 CddSevError("CddMSLDenSegToMSLDenDiag: Wrong segtype specified!");
1587 }
1588 }
1589 if (salp->dim) {
1590 if (salp->dim != 2) {
1591 CddSevError("CddMSLDenSegToMSLDenDiag: Expect alignments of dimension 2!");
1592 }
1593 }
1594 dsp = salp->segs;
1595 ddphead = NULL;
1596 ddptail = NULL;
1597 for (i=0;i<dsp->numseg;i++) {
1598 s1 = dsp->starts[i*2];
1599 s2 = dsp->starts[i*2+1];
1600 if (s1 >=0 && s2>= 0) {
1601 ddp = DenseDiagNew();
1602 ddp->starts = MemNew(2*sizeof(Int4));
1603 ddp->starts[0] = s1;
1604 ddp->starts[1] = s2;
1605 ddp->len=dsp->lens[i];
1606 ddp->id = SeqIdDupList(dsp->ids);
1607 ddp->dim = 2;
1608 if (!ddphead) {
1609 ddphead = ddp;
1610 ddptail = ddp;
1611 ddptail->next = NULL;
1612 } else {
1613 ddptail->next = ddp;
1614 ddptail = ddp;
1615 ddptail->next = NULL;
1616 }
1617 }
1618 }
1619 salpnew = SeqAlignNew();
1620 salpnew->type = SAT_PARTIAL;
1621 salpnew->segtype = SAS_DENDIAG;
1622 salpnew->dim = 2;
1623 salpnew->segs = ddphead;
1624 scp = salp->score; scpHead = NULL;
1625 while (scp) {
1626 scpThis = ScoreNew();
1627 scpThis->id = ObjectIdDup(scp->id);
1628 scpThis->choice = scp->choice;
1629 scpThis->value = scp->value;
1630 if (!scpHead) {
1631 scpHead = scpThis;
1632 } else {
1633 scpTail->next = scpThis;
1634 }
1635 scpTail = scpThis;
1636 scp = scp->next;
1637 }
1638 salpnew->score = scpHead;
1639 if (!salphead) {
1640 salphead = salpnew;
1641 salptail = salpnew;
1642 salptail->next = NULL;
1643 } else {
1644 salptail->next = salpnew;
1645 salptail = salpnew;
1646 salptail->next = NULL;
1647 }
1648 salp = salp->next;
1649 }
1650 return(salphead);
1651 }
1652
1653 /*---------------------------------------------------------------------------*/
1654 /*---------------------------------------------------------------------------*/
1655 /* make a DenseSeg Alignment-Set turning each diag into a separate alignment */
1656 /*---------------------------------------------------------------------------*/
1657 /*---------------------------------------------------------------------------*/
CddMSLDenDiagToMSLDenSeg(SeqAlignPtr salp)1658 SeqAlignPtr LIBCALL CddMSLDenDiagToMSLDenSeg(SeqAlignPtr salp)
1659 {
1660 SeqAlignPtr salpnew, salphead, salptail;
1661 DenseDiagPtr ddp;
1662 DenseSegPtr dsp;
1663 Int4 lastm, lasts;
1664
1665 if (!salp) return(NULL);
1666 salphead = NULL;
1667 salptail = NULL;
1668 while (salp) {
1669 if (salp->segtype) {
1670 if (salp->segtype != SAS_DENDIAG) {
1671 CddSevError("CddMSLDenDiagToMSLDenSeg: Wrong segtype specified!");
1672 }
1673 }
1674 if (salp->dim) {
1675 if (salp->dim != 2) {
1676 CddSevError("CddMSLDenDiagToMSLDenSeg: Expect alignments of dimension 2!");
1677 }
1678 }
1679 ddp = salp->segs;
1680 lastm = -1;
1681 lasts = -1;
1682 while (ddp) {
1683 salpnew = SeqAlignNew();
1684 if (!salphead) salphead = salpnew;
1685 salpnew->type = SAT_PARTIAL;
1686 salpnew->segtype = SAS_DENSEG;
1687 salpnew->dim = 2;
1688 salpnew->score = ddp->scores;
1689 dsp = DenseSegNew();
1690 salpnew->segs = dsp;
1691 dsp->dim = 2;
1692 dsp->numseg = 1;
1693 dsp->ids = SeqIdDupList(ddp->id);
1694 dsp->starts = ddp->starts;
1695 dsp->lens = &(ddp->len);
1696 dsp->strands = ddp->strands;
1697 dsp->scores = ddp->scores;
1698 if (salptail) salptail->next = salpnew;
1699 salptail = salpnew;
1700 ddp = ddp->next;
1701 }
1702 salp = salp->next;
1703 }
1704 return(salphead);
1705 }
1706
1707 /*---------------------------------------------------------------------------*/
1708 /*---------------------------------------------------------------------------*/
1709 /* convert a dendiag pairwise alignment set into a multiple dendiag alignment*/
1710 /* if possible - i.e. check that the number of segments and their extents on */
1711 /* the common master is the same for all pairwise alignments */
1712 /*---------------------------------------------------------------------------*/
1713 /*---------------------------------------------------------------------------*/
CddMSLDenDiagToMULDenDiag(SeqAlignPtr salp)1714 SeqAlignPtr LIBCALL CddMSLDenDiagToMULDenDiag(SeqAlignPtr salp)
1715 {
1716 SeqAlignPtr salpHead;
1717 SeqAlignPtr salpNew;
1718 Int4 iNumSegs, iCnt, iDim;
1719 Int4 *iSegStart;
1720 Int4 *iSegLen;
1721 DenseDiagPtr ddp, ddpNew = NULL, ddpTail = NULL;
1722 Boolean bIsOk = TRUE;
1723 SeqIdPtr sipNew, sipTail;
1724
1725 if (!salp) return NULL;
1726 if (salp->dim && salp->dim != 2) {
1727 printf(" CddMSLDenDiagToMulDenDiag Warning: Can't convert alignment of dim!=2\n");
1728 return(salp);
1729 }
1730 iNumSegs = 0;
1731 ddp = (DenseDiagPtr) salp->segs;
1732 while (ddp) {
1733 iNumSegs++;
1734 ddp = ddp->next;
1735 }
1736 iSegStart = calloc(iNumSegs,sizeof(Int4));
1737 iSegLen = calloc(iNumSegs,sizeof(Int4));
1738 iCnt = 0;
1739 ddp = (DenseDiagPtr) salp->segs;
1740 while (ddp) {
1741 iSegStart[iCnt] = ddp->starts[0];
1742 iSegLen[iCnt] = ddp->len;
1743 iCnt++;
1744 ddp = ddp->next;
1745 }
1746 iDim = 1;
1747 salpHead = salp;
1748 while (salpHead) {
1749 iDim++;
1750 if (iDim > 2) {
1751 ddp = (DenseDiagPtr) salpHead->segs;
1752 iCnt = 0;
1753 while (ddp) {
1754 if (iCnt > iNumSegs || ddp->starts[0] != iSegStart[iCnt] ||
1755 ddp->len != iSegLen[iCnt]) {
1756 bIsOk = FALSE;
1757 break;
1758 }
1759 iCnt++;
1760 ddp = ddp->next;
1761 }
1762 if (iCnt != iNumSegs) {
1763 bIsOk = FALSE;
1764 break;
1765 }
1766 }
1767 salpHead = salpHead->next;
1768 }
1769
1770 if (!bIsOk) {
1771 printf(" CddMSLDenDiagToMulDenDiag Warning: Can't convert alignment!\n");
1772 return(salp);
1773 }
1774 salpNew = SeqAlignNew();
1775 salpNew->type = salp->type;
1776 salpNew->segtype = salp->segtype;
1777 salpNew->dim = iDim;
1778
1779 sipNew = NULL;
1780 salpHead = salp;
1781 while (salpHead) {
1782 ddp = (DenseDiagPtr)salpHead->segs;
1783 if (!sipNew) {
1784 sipNew = SeqIdDup(ddp->id);
1785 sipNew->next = SeqIdDup(ddp->id->next);
1786 sipTail = sipNew->next;
1787 } else {
1788 sipTail->next = SeqIdDup(ddp->id->next);
1789 sipTail = sipTail->next;
1790 }
1791 salpHead = salpHead->next;
1792 }
1793
1794 salpHead = salp;
1795 iCnt = 0;
1796 while (salpHead) {
1797 ddp = (DenseDiagPtr) salpHead->segs;
1798 while (ddp) {
1799 if (!ddpNew) {
1800 ddpNew = (DenseDiagPtr) DenseDiagNew();
1801 ddpNew->dim = iDim;
1802 ddpNew->id = sipNew;
1803 ddpNew->starts = calloc(iDim,sizeof(Int4));
1804 ddpNew->starts[0]=ddp->starts[0];
1805 ddpNew->starts[1]=ddp->starts[1];
1806 ddpNew->len=ddp->len;
1807 if (!ddpTail) {
1808 salpNew->segs = ddpNew;
1809 ddpTail = ddpNew;
1810 } else {
1811 ddpTail->next = ddpNew;
1812 ddpTail = ddpTail->next;
1813 }
1814 } else {
1815 ddpNew->starts[iCnt+1]=ddp->starts[1];
1816 }
1817 if (ddpTail->next) {
1818 ddpTail = ddpTail->next;
1819 ddpNew = ddpTail;
1820 } else {
1821 ddpNew = NULL;
1822 }
1823 ddp = ddp->next;
1824 if (!ddp) {
1825 ddpNew = (DenseDiagPtr) salpNew->segs;
1826 ddpTail = ddpNew;
1827 }
1828 }
1829 iCnt++;
1830 salpHead = salpHead->next;
1831 }
1832 free(iSegLen);
1833 free(iSegStart);
1834 return(salpNew);
1835 }
1836
1837 /*---------------------------------------------------------------------------*/
1838 /*---------------------------------------------------------------------------*/
1839 /* Calculate per column information content for a SeqAlign */
1840 /*---------------------------------------------------------------------------*/
1841 /*---------------------------------------------------------------------------*/
SeqAlignConservation(SeqAlignPtr salp,Nlm_FloatHi fract,BioseqPtr bsp_master,Boolean bHasConsensus,Int4 offset)1842 Nlm_FloatHi LIBCALL SeqAlignConservation(SeqAlignPtr salp, Nlm_FloatHi fract,
1843 BioseqPtr bsp_master, Boolean bHasConsensus, Int4 offset)
1844 {
1845 Int4 i, c, a;
1846 Int4 qlength, nover = 0;
1847 Int4 alphabetSize = 26;
1848 Int4 *ntypes;
1849 Nlm_FloatHi **typefreq, sumfreq;
1850 DenseDiagPtr ddp;
1851 BioseqPtr bsp;
1852 SeqIdPtr sip;
1853 SeqPortPtr spp;
1854 Uint1Ptr buffer;
1855
1856 if (!salp) return(0.0);
1857 if (!bsp_master) return(0.0);
1858 qlength = bsp_master->length;
1859 ntypes = (Int4 *) MemNew(qlength*sizeof(Int4));
1860 for (i=0;i<qlength;i++) ntypes[i] = 0;
1861 typefreq = (Nlm_FloatHi**) MemNew(qlength * sizeof(Nlm_FloatHi *));
1862 for (i=0;i<qlength;i++) {
1863 typefreq[i] = (Nlm_FloatHi *)MemNew(alphabetSize * sizeof(Nlm_FloatHi));
1864 for (a=0;a<alphabetSize;a++) typefreq[i][a] = 0.0;
1865 }
1866 if (!bHasConsensus) { /* count residues in the master/representative too */
1867 ddp = salp->segs;
1868 sip = ddp->id;
1869 bsp = bsp_master;
1870 buffer = MemNew((bsp->length)*sizeof(Uint1));
1871 spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
1872 for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
1873 spp = SeqPortFree(spp);
1874 while (ddp) {
1875 for (c=ddp->starts[0]-offset;c<ddp->starts[0]-offset+ddp->len;c++) {
1876 if (buffer[c] >= 0 && buffer[c] < (Uint1) alphabetSize)
1877 typefreq[c][buffer[c]] += 1.0;
1878 }
1879 ddp = ddp->next;
1880 }
1881 MemFree(buffer);
1882 }
1883 while (salp) {
1884 ddp = salp->segs;
1885 sip = ddp->id->next;
1886 bsp = BioseqLockById(sip);
1887 buffer = MemNew((bsp->length)*sizeof(Uint1));
1888 spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
1889 for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
1890 spp = SeqPortFree(spp);
1891 BioseqUnlock(bsp);
1892 while (ddp) {
1893 for (c=ddp->starts[1];c<ddp->starts[1]+ddp->len;c++) {
1894 i = ddp->starts[0]-offset+c-ddp->starts[1];
1895 if (buffer[c] >= 0 && buffer[c] < (Uint1) alphabetSize)
1896 typefreq[i][buffer[c]] += 1.0;
1897 }
1898 ddp = ddp->next;
1899 }
1900 MemFree(buffer);
1901 salp = salp->next;
1902 }
1903 for (i=0;i<qlength;i++) {
1904 sumfreq = 0.0;
1905 for (a=0;a<alphabetSize;a++) {
1906 if (typefreq[i][a] > 0.0) {
1907 ntypes[i]++;
1908 sumfreq += typefreq[i][a];
1909 }
1910 }
1911 if (sumfreq > 0.0) for (a=0;a<alphabetSize;a++) {
1912 typefreq[i][a] /= sumfreq;
1913 }
1914 for (a=0;a<alphabetSize;a++) {
1915 if (typefreq[i][a] >= fract) {
1916 nover++; break;
1917 }
1918 }
1919 }
1920 for (i=0;i<qlength;i++) MemFree(typefreq[i]);
1921 MemFree(typefreq);
1922 MemFree(ntypes);
1923 return((Nlm_FloatHi)nover/(Nlm_FloatHi)qlength);
1924 }
1925
1926 /*---------------------------------------------------------------------------*/
1927 /*---------------------------------------------------------------------------*/
1928 /* Calculate per column information content for a SeqAlign */
1929 /*---------------------------------------------------------------------------*/
1930 /*---------------------------------------------------------------------------*/
SeqAlignInform(SeqAlignPtr salp,BioseqPtr bsp_master,Boolean bHasConsensus,Int4 offset)1931 Nlm_FloatHiPtr LIBCALL SeqAlignInform(SeqAlignPtr salp, BioseqPtr bsp_master,
1932 Boolean bHasConsensus, Int4 offset)
1933 {
1934 BLAST_ScoreBlkPtr sbp;
1935 BLAST_ResFreqPtr stdrfp;
1936 Nlm_FloatHiPtr standardProb;
1937 Nlm_FloatHiPtr Informativeness;
1938 Int2 iStatus;
1939 Int4 i, c, a;
1940 Int4 qlength;
1941 Int4 alphabetSize = 26;
1942 Nlm_FloatHi posEpsilon = 0.0001;
1943 Nlm_FloatHi QoverPest;
1944 Int4 *ntypes;
1945 Nlm_FloatHi **typefreq, sumfreq;
1946 DenseDiagPtr ddp;
1947 BioseqPtr bsp;
1948 SeqIdPtr sip;
1949 SeqPortPtr spp;
1950 Uint1Ptr buffer;
1951 Char matrix[32];
1952
1953 if (!salp) return(NULL);
1954 if (!bsp_master) return(NULL);
1955 qlength = bsp_master->length;
1956 Informativeness = MemNew(sizeof(Nlm_FloatHi) * qlength);
1957 sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
1958 sbp->read_in_matrix = TRUE;
1959 sbp->protein_alphabet = TRUE;
1960 sbp->posMatrix = NULL;
1961 sbp->number_of_contexts = 1;
1962 StrCpy(matrix,"BLOSUM62");
1963 iStatus = BlastScoreBlkMatFill(sbp,matrix);
1964 stdrfp = BlastResFreqNew(sbp);
1965 BlastResFreqStdComp(sbp,stdrfp);
1966 standardProb = MemNew(alphabetSize * sizeof(Nlm_FloatHi));
1967 for(a = 0; a < alphabetSize; a++) standardProb[a] = stdrfp->prob[a];
1968 stdrfp = BlastResFreqDestruct(stdrfp);
1969 sbp = (BLAST_ScoreBlkPtr) BLAST_ScoreBlkDestruct(sbp);
1970 ntypes = (Int4 *) MemNew(qlength*sizeof(Int4));
1971 for (i=0;i<qlength;i++) ntypes[i] = 0;
1972 typefreq = (Nlm_FloatHi**) MemNew(qlength * sizeof(Nlm_FloatHi *));
1973 for (i=0;i<qlength;i++) {
1974 typefreq[i] = (Nlm_FloatHi *)MemNew(alphabetSize * sizeof(Nlm_FloatHi));
1975 for (a=0;a<alphabetSize;a++) typefreq[i][a] = 0.0;
1976 }
1977 if (!bHasConsensus) { /* count residues in the master/representative too */
1978 ddp = salp->segs;
1979 sip = ddp->id;
1980 bsp = bsp_master;
1981 buffer = MemNew((bsp->length)*sizeof(Uint1));
1982 spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
1983 for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
1984 spp = SeqPortFree(spp);
1985 while (ddp) {
1986 for (c=ddp->starts[0]-offset;c<ddp->starts[0]-offset+ddp->len;c++) {
1987 if (buffer[c] < (Uint1) alphabetSize)
1988 typefreq[c][buffer[c]] += 1.0;
1989 }
1990 ddp = ddp->next;
1991 }
1992 MemFree(buffer);
1993 }
1994 while (salp) {
1995 ddp = salp->segs;
1996 sip = ddp->id->next;
1997 bsp = BioseqLockById(sip);
1998 buffer = MemNew((bsp->length)*sizeof(Uint1));
1999 spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
2000 for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
2001 spp = SeqPortFree(spp);
2002 BioseqUnlock(bsp);
2003 while (ddp) {
2004 for (c=ddp->starts[1];c<ddp->starts[1]+ddp->len;c++) {
2005 i = ddp->starts[0]-offset+c-ddp->starts[1];
2006 if (buffer[c] < (Uint1) alphabetSize)
2007 typefreq[i][buffer[c]] += 1.0;
2008 }
2009 ddp = ddp->next;
2010 }
2011 MemFree(buffer);
2012 salp = salp->next;
2013 }
2014 for (i=0;i<qlength;i++) {
2015 sumfreq = 0.0;
2016 for (a=0;a<alphabetSize;a++) {
2017 if (typefreq[i][a] > 0.0) {
2018 ntypes[i]++;
2019 sumfreq += typefreq[i][a];
2020 }
2021 }
2022 if (sumfreq > 0.0) for (a=0;a<alphabetSize;a++) {
2023 typefreq[i][a] /= sumfreq;
2024 }
2025 }
2026
2027 for (c=0;c<qlength;c++) {
2028 Informativeness[c] = 0.0;
2029 for (a=0;a<alphabetSize;a++) {
2030 if (standardProb[a] > posEpsilon) {
2031 QoverPest = typefreq[c][a] / standardProb[a];
2032 if (QoverPest > posEpsilon) {
2033 Informativeness[c] += typefreq[c][a] * log(QoverPest) / NCBIMATH_LN2;
2034 }
2035 }
2036 }
2037 }
2038 for (i=0;i<qlength;i++) MemFree(typefreq[i]);
2039 MemFree(typefreq);
2040 MemFree(ntypes);
2041 MemFree(standardProb);
2042 return(Informativeness);
2043 }
2044
2045 /*---------------------------------------------------------------------------*/
2046 /*---------------------------------------------------------------------------*/
2047 /* Calculate per column information content for a CDD from the seqalign */
2048 /*---------------------------------------------------------------------------*/
2049 /*---------------------------------------------------------------------------*/
CddCountResTypes(CddPtr pcdd,Int4 * ncols)2050 Int4 ** LIBCALL CddCountResTypes(CddPtr pcdd, Int4 *ncols)
2051 {
2052 BioseqSetPtr bssp;
2053 Boolean bHasConsensus = CddHasConsensus(pcdd);
2054 Int4 offset, qlength, i, j, c;
2055 Int4 **ResFreqTable;
2056 SeqAlignPtr salp;
2057 DenseDiagPtr ddp;
2058 SeqIdPtr sip;
2059 BioseqPtr bsp;
2060 Uint1 *buffer;
2061 SeqPortPtr spp;
2062
2063 if (!pcdd) return(NULL);
2064 if (!pcdd->trunc_master) return(NULL);
2065 bssp = pcdd->sequences->data.ptrvalue;
2066 offset = pcdd->profile_range->from;
2067 qlength = pcdd->trunc_master->length;
2068 *ncols = qlength;
2069 ResFreqTable = MemNew(qlength*sizeof(Int4 *));
2070 for (i=0;i<qlength;i++) {
2071 ResFreqTable[i] = MemNew(26*sizeof(Int4));
2072 for (j=0;j<26;j++) ResFreqTable[i][j] = 0;
2073 }
2074 salp = pcdd->seqannot->data;
2075 if (!bHasConsensus) { /* count residues in the master/representative too */
2076 ddp = salp->segs;
2077 sip = ddp->id;
2078 bsp = CddRetrieveBioseqById(sip,bssp->seq_set);
2079 buffer = MemNew((bsp->length)*sizeof(Uint1));
2080 spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
2081 for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
2082 spp = SeqPortFree(spp);
2083 while (ddp) {
2084 for (c=ddp->starts[0];c<ddp->starts[0]+ddp->len;c++) {
2085 ResFreqTable[c-offset][buffer[c]]++;
2086 }
2087 ddp = ddp->next;
2088 }
2089 MemFree(buffer);
2090 }
2091 while (salp) {
2092 ddp = salp->segs;
2093 sip = ddp->id->next;
2094 bsp = CddRetrieveBioseqById(sip,bssp->seq_set);
2095 buffer = MemNew((bsp->length)*sizeof(Uint1));
2096 spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
2097 for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
2098 spp = SeqPortFree(spp);
2099 while (ddp) {
2100 for (c=ddp->starts[1];c<ddp->starts[1]+ddp->len;c++) {
2101 i = ddp->starts[0]-offset+c-ddp->starts[1];
2102 ResFreqTable[i][buffer[c]]++;
2103 }
2104 ddp = ddp->next;
2105 }
2106 MemFree(buffer);
2107 salp = salp->next;
2108 }
2109 return(ResFreqTable);
2110 }
2111
2112 /*---------------------------------------------------------------------------*/
2113 /*---------------------------------------------------------------------------*/
2114 /* Calculate per column information content for a CDD from the seqalign */
2115 /*---------------------------------------------------------------------------*/
2116 /*---------------------------------------------------------------------------*/
CddAlignInform(CddPtr pcdd,Nlm_FloatHi * Niobs)2117 Nlm_FloatHiPtr LIBCALL CddAlignInform(CddPtr pcdd, Nlm_FloatHi * Niobs)
2118 {
2119 Int4 offset;
2120 Boolean bHasConsensus = CddHasConsensus(pcdd);
2121 BLAST_ScoreBlkPtr sbp;
2122 BLAST_ResFreqPtr stdrfp;
2123 Nlm_FloatHiPtr standardProb;
2124 Nlm_FloatHiPtr Informativeness;
2125 Int2 iStatus;
2126 Int4 i, c, a;
2127 Int4 qlength;
2128 Int4 alphabetSize = 26;
2129 Nlm_FloatHi posEpsilon = 0.0001;
2130 Nlm_FloatHi QoverPest;
2131 Int4 *ntypes;
2132 Nlm_FloatHi **typefreq, sumfreq;
2133 SeqAlignPtr salp;
2134 DenseDiagPtr ddp;
2135 BioseqPtr bsp;
2136 SeqIdPtr sip;
2137 BioseqSetPtr bssp;
2138 SeqPortPtr spp;
2139 Uint1Ptr buffer;
2140
2141 bssp = pcdd->sequences->data.ptrvalue;
2142 offset = pcdd->profile_range->from;
2143 if (!pcdd) return(NULL);
2144 if (!pcdd->trunc_master) return(NULL);
2145 qlength = pcdd->trunc_master->length;
2146 Informativeness = MemNew(sizeof(Nlm_FloatHi) * qlength);
2147 sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
2148 sbp->read_in_matrix = TRUE;
2149 sbp->protein_alphabet = TRUE;
2150 sbp->posMatrix = NULL;
2151 sbp->number_of_contexts = 1;
2152 iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
2153 stdrfp = BlastResFreqNew(sbp);
2154 BlastResFreqStdComp(sbp,stdrfp);
2155 standardProb = MemNew(alphabetSize * sizeof(Nlm_FloatHi));
2156 for(a = 0; a < alphabetSize; a++) standardProb[a] = stdrfp->prob[a];
2157 stdrfp = BlastResFreqDestruct(stdrfp);
2158 ntypes = (Int4 *) MemNew(qlength*sizeof(Int4));
2159 for (i=0;i<qlength;i++) ntypes[i] = 0;
2160 typefreq = (Nlm_FloatHi**) MemNew(qlength * sizeof(Nlm_FloatHi *));
2161 for (i=0;i<qlength;i++) {
2162 typefreq[i] = (Nlm_FloatHi *)MemNew(alphabetSize * sizeof(Nlm_FloatHi));
2163 for (a=0;a<alphabetSize;a++) typefreq[i][a] = 0.0;
2164 }
2165 salp = pcdd->seqannot->data;
2166 if (!bHasConsensus) { /* count residues in the master/representative too */
2167 ddp = salp->segs;
2168 sip = ddp->id;
2169 bsp = CddRetrieveBioseqById(sip,bssp->seq_set);
2170 buffer = MemNew((bsp->length)*sizeof(Uint1));
2171 spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
2172 for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
2173 spp = SeqPortFree(spp);
2174 while (ddp) {
2175 for (c=ddp->starts[0];c<ddp->starts[0]+ddp->len;c++) {
2176 typefreq[c-offset][buffer[c]] += 1.0;
2177 }
2178 ddp = ddp->next;
2179 }
2180 MemFree(buffer);
2181 }
2182 while (salp) {
2183 ddp = salp->segs;
2184 sip = ddp->id->next;
2185 bsp = CddRetrieveBioseqById(sip,bssp->seq_set);
2186 buffer = MemNew((bsp->length)*sizeof(Uint1));
2187 spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
2188 for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
2189 spp = SeqPortFree(spp);
2190 while (ddp) {
2191 for (c=ddp->starts[1];c<ddp->starts[1]+ddp->len;c++) {
2192 i = ddp->starts[0]-offset+c-ddp->starts[1];
2193 typefreq[i][buffer[c]] += 1.0;
2194 }
2195 ddp = ddp->next;
2196 }
2197 MemFree(buffer);
2198 salp = salp->next;
2199 }
2200 for (i=0;i<qlength;i++) {
2201 sumfreq = 0.0;
2202 for (a=0;a<alphabetSize;a++) {
2203 if (typefreq[i][a] > 0.0) {
2204 ntypes[i]++;
2205 sumfreq += typefreq[i][a];
2206 }
2207 }
2208 if (sumfreq > 0.0) for (a=0;a<alphabetSize;a++) {
2209 typefreq[i][a] /= sumfreq;
2210 }
2211 }
2212
2213 for (c=0;c<qlength;c++) {
2214 Informativeness[c] = 0.0;
2215 for (a=0;a<alphabetSize;a++) {
2216 if (standardProb[a] > posEpsilon) {
2217 QoverPest = typefreq[c][a] / standardProb[a];
2218 if (QoverPest > posEpsilon) {
2219 Informativeness[c] += typefreq[c][a] * log(QoverPest) / NCBIMATH_LN2;
2220 }
2221 }
2222 }
2223 }
2224
2225 *Niobs = 0.0;
2226 for (i=0;i<qlength;i++) *Niobs += (Nlm_FloatHi) ntypes[i];
2227 *Niobs /= (Nlm_FloatHi) qlength;
2228
2229 for (i=0;i<qlength;i++) MemFree(typefreq[i]);
2230 MemFree(typefreq);
2231 MemFree(ntypes);
2232 MemFree(standardProb);
2233 return(Informativeness);
2234 }
2235
2236 /*---------------------------------------------------------------------------*/
2237 /*---------------------------------------------------------------------------*/
2238 /* Calculate per column information content for a CDD pssm */
2239 /*---------------------------------------------------------------------------*/
2240 /*---------------------------------------------------------------------------*/
CddPssmInform(CddPtr pcdd)2241 Nlm_FloatHiPtr LIBCALL CddPssmInform(CddPtr pcdd)
2242 {
2243 BLAST_ScoreBlkPtr sbp;
2244 BLAST_ResFreqPtr stdrfp;
2245 Nlm_FloatHiPtr standardProb;
2246 Nlm_FloatHiPtr Informativeness;
2247 Nlm_FloatHi thisposFreq;
2248 Int2 iStatus;
2249 Int4 c, a;
2250 Int4 qlength;
2251 Int4 alphabetSize = 26;
2252 MatrixPtr posfreq;
2253 ValNodePtr vnp;
2254 Nlm_FloatHi posEpsilon = 0.0001;
2255 Nlm_FloatHi QoverPest;
2256
2257 if (!pcdd) return(NULL);
2258 if (!pcdd->trunc_master) return(NULL);
2259 qlength = pcdd->trunc_master->length;
2260 Informativeness = MemNew(sizeof(Nlm_FloatHi) * qlength);
2261 sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
2262 sbp->read_in_matrix = TRUE;
2263 sbp->protein_alphabet = TRUE;
2264 sbp->posMatrix = NULL;
2265 sbp->number_of_contexts = 1;
2266 iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
2267 stdrfp = BlastResFreqNew(sbp);
2268 BlastResFreqStdComp(sbp,stdrfp);
2269 standardProb = MemNew(alphabetSize * sizeof(Nlm_FloatHi));
2270 for(a = 0; a < alphabetSize; a++) standardProb[a] = stdrfp->prob[a];
2271 stdrfp = BlastResFreqDestruct(stdrfp);
2272 posfreq = pcdd->posfreq;
2273 if (posfreq->nrows != alphabetSize || posfreq->ncolumns != qlength) {
2274 CddSevError("posfreq matrix size mismatch in CddPssmInform");
2275 }
2276 vnp = posfreq->columns;
2277 for (c=0;c<qlength;c++) {
2278 Informativeness[c] = 0.0;
2279 for (a=0;a<alphabetSize;a++) {
2280 if (standardProb[a] > posEpsilon) {
2281 thisposFreq = (Nlm_FloatHi) vnp->data.intvalue / (Nlm_FloatHi) posfreq->scale_factor;
2282 QoverPest = thisposFreq / standardProb[a];
2283 if (QoverPest > posEpsilon) {
2284 Informativeness[c] += thisposFreq * log(QoverPest) / NCBIMATH_LN2;
2285 }
2286 }
2287 vnp = vnp->next;
2288 }
2289 }
2290 MemFree(standardProb);
2291 return(Informativeness);
2292 }
2293
2294 /*---------------------------------------------------------------------------*/
2295 /*---------------------------------------------------------------------------*/
2296 /* Calculate per column information content for a frequency table */
2297 /*---------------------------------------------------------------------------*/
2298 /*---------------------------------------------------------------------------*/
CddPosFreqInform(Nlm_FloatHi ** posFreq,Int4 ncol,Int4 nrow)2299 Nlm_FloatHiPtr LIBCALL CddPosFreqInform(Nlm_FloatHi **posFreq, Int4 ncol, Int4 nrow)
2300 {
2301 BLAST_ScoreBlkPtr sbp;
2302 BLAST_ResFreqPtr stdrfp;
2303 Nlm_FloatHiPtr standardProb;
2304 Nlm_FloatHiPtr Informativeness;
2305 Nlm_FloatHi thisposFreq;
2306 Int2 iStatus;
2307 Int4 c, a;
2308 Int4 qlength;
2309 Int4 alphabetSize = 26;
2310 Nlm_FloatHi posEpsilon = 0.0001;
2311 Nlm_FloatHi QoverPest;
2312
2313 if (!posFreq) return(NULL);
2314 qlength = ncol;
2315 if (nrow != alphabetSize) return(NULL);
2316 Informativeness = MemNew(sizeof(Nlm_FloatHi) * qlength);
2317 sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
2318 sbp->read_in_matrix = TRUE;
2319 sbp->protein_alphabet = TRUE;
2320 sbp->posMatrix = NULL;
2321 sbp->number_of_contexts = 1;
2322 iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
2323 stdrfp = BlastResFreqNew(sbp);
2324 BlastResFreqStdComp(sbp,stdrfp);
2325 standardProb = MemNew(alphabetSize * sizeof(Nlm_FloatHi));
2326 for(a = 0; a < alphabetSize; a++) standardProb[a] = stdrfp->prob[a];
2327 stdrfp = BlastResFreqDestruct(stdrfp);
2328
2329 for (c=0;c<qlength;c++) {
2330 Informativeness[c] = 0.0;
2331 for (a=0;a<alphabetSize;a++) {
2332 if (standardProb[a] > posEpsilon) {
2333 thisposFreq = posFreq[c][a];
2334 QoverPest = thisposFreq / standardProb[a];
2335 if (QoverPest > posEpsilon) {
2336 Informativeness[c] += thisposFreq * log(QoverPest) / NCBIMATH_LN2;
2337 }
2338 }
2339 }
2340 }
2341 MemFree(standardProb);
2342 return(Informativeness);
2343 }
2344
2345 /*---------------------------------------------------------------------------*/
2346 /*---------------------------------------------------------------------------*/
2347 /* Generation of position-specific scoring matrices for database scanning */
2348 /*---------------------------------------------------------------------------*/
2349 /*---------------------------------------------------------------------------*/
CddDenDiagCposComputation(SeqAlignPtr listOfSeqAligns,BioseqPtr bsp,BioseqPtr bspF,CddPtr pcdd,Int4 pseudoCnt)2350 void LIBCALL CddDenDiagCposComputation(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp,
2351 BioseqPtr bspF, CddPtr pcdd, Int4 pseudoCnt)
2352 {
2353 Int4 numalign, numseq; /*number of alignments and seqs */
2354 posSearchItems *posSearch; /*holds position-specific info */
2355 compactSearchItems *compactSearch = NULL;
2356 BLAST_ScoreBlkPtr sbp;
2357 SeqLocPtr private_slp = NULL;
2358 SeqPortPtr spp = NULL;
2359 Uint1Ptr query_seq, query_seq_start;
2360 Uint1 residue;
2361 Int4 index, a, KarlinReturn, array_size, iNew;
2362 Nlm_FloatHiPtr lambda, K, H;
2363 Int4Ptr open, extend;
2364 BLAST_ResFreqPtr stdrfp;
2365 Int2 iStatus;
2366 ValNodePtr error_return = NULL;
2367 ValNodePtr ColumnHead;
2368 ValNodePtr newRow, RowHead;
2369 ValNodePtr newLetter, LetterHead;
2370 SeqCodeTablePtr sctp;
2371 Char ckptFileName[PATH_MAX];
2372 Char cseqFileName[PATH_MAX];
2373 FILE *fp;
2374 Boolean bHasConsensus;
2375 CharPtr cAccession;
2376
2377 bHasConsensus = CddHasConsensus(pcdd);
2378
2379 numalign = CddCountDenDiagSeqAligns(listOfSeqAligns, &numseq);
2380 posSearch = (posSearchItems *) MemNew(1*sizeof(posSearchItems));
2381 sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
2382 sbp->read_in_matrix = TRUE;
2383 sbp->protein_alphabet = TRUE;
2384 sbp->posMatrix = NULL;
2385 sbp->number_of_contexts = 1;
2386 iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
2387 compactSearch = compactSearchNew(compactSearch);
2388 compactSearch->qlength = bsp->length;
2389 compactSearch->alphabetSize = sbp->alphabet_size;
2390 compactSearch->matrix = sbp->matrix;
2391 compactSearch->gapped_calculation = TRUE;
2392 compactSearch->pseudoCountConst = pseudoCnt;
2393 compactSearch->ethresh = 0.001;
2394
2395 /*---------------------------------------------------------------------------*/
2396 /* get the query sequence */
2397 /*---------------------------------------------------------------------------*/
2398 private_slp = SeqLocIntNew(0, bspF->length-1 , Seq_strand_plus, SeqIdFindBest(bspF->id, SEQID_GI));
2399 spp = SeqPortNewByLoc(private_slp, Seq_code_ncbistdaa);
2400 SeqPortSet_do_virtual(spp, TRUE);
2401 query_seq_start = (Uint1Ptr) MemNew(((bspF->length)+2)*sizeof(Char));
2402 query_seq_start[0] = NULLB;
2403 query_seq = query_seq_start+1;
2404 index=0;
2405 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
2406 if (IS_residue(residue)) {
2407 query_seq[index] = residue;
2408 index++;
2409 }
2410 }
2411 query_seq[index] = NULLB;
2412 spp = SeqPortFree(spp);
2413 private_slp = SeqLocFree(private_slp);
2414 compactSearch->query = query_seq;
2415
2416 BlastScoreBlkFill(sbp,(CharPtr) compactSearch->query,compactSearch->qlength,0);
2417
2418 sbp->kbp_gap_std[0] = BlastKarlinBlkCreate();
2419 KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_std[0],11,1,sbp->name,&error_return);
2420 if (1 == KarlinReturn) {
2421 BlastErrorPrint(error_return);
2422 }
2423 sbp->kbp_gap_psi[0] = BlastKarlinBlkCreate();
2424 KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_psi[0],11,1,sbp->name,&error_return);
2425 if (1 == KarlinReturn) {
2426 BlastErrorPrint(error_return);
2427 }
2428
2429 array_size = BlastKarlinGetMatrixValues(sbp->name,&open,&extend,&lambda,&K,&H,NULL);
2430 if (array_size > 0) {
2431 for (index = 0; index < array_size; index++) {
2432 if (open[index] == INT2_MAX && extend[index] == INT2_MAX) {
2433 sbp->kbp_ideal = BlastKarlinBlkCreate();
2434 sbp->kbp_ideal->Lambda = lambda[index];
2435 sbp->kbp_ideal->K = K[index];
2436 sbp->kbp_ideal->H = H[index];
2437 }
2438 }
2439 MemFree(open);
2440 MemFree(extend);
2441 MemFree(lambda);
2442 MemFree(K);
2443 MemFree(H);
2444 }
2445 if (sbp->kbp_ideal == NULL) {
2446 sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
2447 }
2448 compactSearch->lambda = sbp->kbp_gap_std[0]->Lambda;
2449 compactSearch->kbp_std = sbp->kbp_std;
2450 compactSearch->kbp_psi = sbp->kbp_psi;
2451 compactSearch->kbp_gap_std = sbp->kbp_gap_std;
2452 compactSearch->kbp_gap_psi = sbp->kbp_gap_psi;
2453 compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
2454 compactSearch->K_ideal = sbp->kbp_ideal->K;
2455
2456 stdrfp = BlastResFreqNew(sbp);
2457 BlastResFreqStdComp(sbp,stdrfp);
2458 compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
2459 for(a = 0; a < compactSearch->alphabetSize; a++)
2460 compactSearch->standardProb[a] = stdrfp->prob[a];
2461 stdrfp = BlastResFreqDestruct(stdrfp);
2462 posSearch->posInformation = NULL;
2463 /*---------------------------------------------------------------------------*/
2464 /* numseq is replaced with numalign (last argument) - each alignment is a */
2465 /* set of diags and represents an independent sequence/domain */
2466 /*---------------------------------------------------------------------------*/
2467 posAllocateMemory(posSearch, compactSearch->alphabetSize, compactSearch->qlength, numalign);
2468 CddfindDenseDiagThreshSequences(posSearch, listOfSeqAligns, numalign, numalign);
2469 sbp->kbp = sbp->kbp_psi;
2470 sbp->kbp_gap = sbp->kbp_gap_psi;
2471 CddposInitializeInformation(posSearch,sbp,compactSearch);
2472 CddposDenseDiagDemographics(posSearch, compactSearch, listOfSeqAligns);
2473 posPurgeMatches(posSearch, compactSearch, NULL);
2474 /*---------------------------------------------------------------------------*/
2475 /* remove the consensus from the CposComputation if present in the CDD */
2476 /*---------------------------------------------------------------------------*/
2477 if (bHasConsensus) posCancel(posSearch,compactSearch,0,0,0,compactSearch->qlength);
2478 posComputeExtents(posSearch, compactSearch);
2479 posComputeSequenceWeights(posSearch, compactSearch, 1.0);
2480 posCheckWeights(posSearch, compactSearch);
2481 posSearch->posFreqs = (Nlm_FloatHi **) posComputePseudoFreqs(posSearch, compactSearch, TRUE);
2482 CddposFreqsToMatrix(posSearch,compactSearch);
2483 posScaling(posSearch, compactSearch);
2484
2485 sctp = SeqCodeTableFind(Seq_code_ncbistdaa);
2486 LetterHead = NULL;
2487 for (a=0;a<compactSearch->alphabetSize;a++) {
2488 newLetter = ValNodeNew(NULL);
2489 newLetter->data.ptrvalue = MemNew(2*sizeof(Char));
2490 Nlm_StrNCpy(newLetter->data.ptrvalue,&(sctp->letters[a]),1);
2491 ValNodeLink(&(LetterHead),newLetter);
2492 }
2493
2494 pcdd->posfreq = (MatrixPtr) MatrixNew();
2495 pcdd->posfreq->ncolumns = compactSearch->qlength;
2496 pcdd->posfreq->nrows = compactSearch->alphabetSize;
2497 pcdd->posfreq->scale_factor = POSFREQ_SCALE;
2498 pcdd->posfreq->row_labels = LetterHead;
2499 ColumnHead = NULL;
2500 for (index = 0; index<compactSearch->qlength;index++) {
2501 RowHead = NULL;
2502 for (a=0;a<compactSearch->alphabetSize;a++) {
2503 newRow = ValNodeNew(NULL);
2504 iNew = (Int4) (0.5 + (Nlm_FloatHi) pcdd->posfreq->scale_factor * posSearch->posFreqs[index][a]);
2505 newRow->data.intvalue = iNew;
2506 ValNodeLink(&(ColumnHead),newRow);
2507 }
2508 }
2509 pcdd->posfreq->columns = ColumnHead;
2510
2511 pcdd->scoremat = (MatrixPtr) MatrixNew();
2512 pcdd->scoremat->ncolumns = compactSearch->qlength;
2513 pcdd->scoremat->nrows = compactSearch->alphabetSize;
2514 pcdd->scoremat->scale_factor = 1;
2515 pcdd->scoremat->row_labels = LetterHead;
2516 ColumnHead = NULL;
2517 for (index = 0; index<compactSearch->qlength;index++) {
2518 RowHead = NULL;
2519 for (a=0;a<compactSearch->alphabetSize;a++) {
2520 newRow = ValNodeNew(NULL);
2521 iNew = (Int4) posSearch->posMatrix[index][a];
2522 newRow->data.intvalue = iNew;
2523 ValNodeLink(&(ColumnHead),newRow);
2524 }
2525 }
2526 pcdd->scoremat->columns = ColumnHead;
2527
2528 /*---------------------------------------------------------------------------*/
2529 /* Construct name for checkpoint file */
2530 /*---------------------------------------------------------------------------*/
2531 cAccession = CddGetAccession(pcdd);
2532 strcpy(ckptFileName,cAccession);
2533 strcat(ckptFileName,CKPTEXT);
2534 CddposTakeCheckpoint(posSearch, compactSearch, ckptFileName, &error_return);
2535 strcpy(cseqFileName,cAccession);
2536 strcat(cseqFileName,CSEQEXT);
2537 MemFree(cAccession);
2538
2539 fp = FileOpen(cseqFileName, "w");
2540 if (NULL == fp) {
2541 BlastConstructErrorMessage("CddDenDiagCposComputation", "Could not open fasta file", 1, &error_return);
2542 }
2543 BioseqToFasta(bsp,fp,FALSE);
2544 FileClose(fp);
2545 }
2546
2547
2548 /*---------------------------------------------------------------------------*/
2549 /*---------------------------------------------------------------------------*/
2550 /* Generation of position-specific scoring matrices for database scanning */
2551 /*---------------------------------------------------------------------------*/
2552 /*---------------------------------------------------------------------------*/
CddCposComputation(SeqAlignPtr listOfSeqAligns,BioseqPtr bsp,CddPtr pcdd)2553 void LIBCALL CddCposComputation(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp,
2554 CddPtr pcdd)
2555 {
2556 Int4 numalign, numseq; /*number of alignments and seqs */
2557 posSearchItems *posSearch; /*holds position-specific info */
2558 compactSearchItems *compactSearch = NULL;
2559 BLAST_ScoreBlkPtr sbp;
2560 SeqLocPtr private_slp = NULL;
2561 SeqPortPtr spp = NULL;
2562 Uint1Ptr query_seq, query_seq_start;
2563 Uint1 residue;
2564 Int4 index, a, KarlinReturn, array_size, iNew;
2565 Nlm_FloatHiPtr lambda, K, H;
2566 Int4Ptr open, extend;
2567 BLAST_ResFreqPtr stdrfp;
2568 Int2 iStatus;
2569 ValNodePtr error_return = NULL;
2570 ValNodePtr ColumnHead;
2571 ValNodePtr newRow, RowHead;
2572 ValNodePtr newLetter, LetterHead;
2573 SeqCodeTablePtr sctp;
2574 Char ckptFileName[PATH_MAX];
2575 Char cseqFileName[PATH_MAX];
2576 FILE *fp;
2577 CharPtr cAccession;
2578
2579 numalign = CddCountSeqAligns(listOfSeqAligns, &numseq);
2580 posSearch = (posSearchItems *) MemNew(1*sizeof(posSearchItems));
2581 sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
2582 sbp->read_in_matrix = TRUE;
2583 sbp->protein_alphabet = TRUE;
2584 sbp->posMatrix = NULL;
2585 sbp->number_of_contexts = 1;
2586 iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
2587 compactSearch = compactSearchNew(compactSearch);
2588 compactSearch->qlength = bsp->length;
2589 compactSearch->alphabetSize = sbp->alphabet_size;
2590 compactSearch->matrix = sbp->matrix;
2591 compactSearch->gapped_calculation = TRUE;
2592 compactSearch->pseudoCountConst = 10;
2593 compactSearch->ethresh = 0.001;
2594
2595 /*---------------------------------------------------------------------------*/
2596 /* get the query sequence */
2597 /*---------------------------------------------------------------------------*/
2598 private_slp = SeqLocIntNew(0, bsp->length-1 , Seq_strand_plus, SeqIdFindBest(bsp->id, SEQID_GI));
2599 spp = SeqPortNewByLoc(private_slp, Seq_code_ncbistdaa);
2600 SeqPortSet_do_virtual(spp, TRUE);
2601 query_seq_start = (Uint1Ptr) MemNew(((bsp->length)+2)*sizeof(Char));
2602 query_seq_start[0] = NULLB;
2603 query_seq = query_seq_start+1;
2604 index=0;
2605 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
2606 if (IS_residue(residue)) {
2607 query_seq[index] = residue;
2608 index++;
2609 }
2610 }
2611 query_seq[index] = NULLB;
2612 spp = SeqPortFree(spp);
2613 private_slp = SeqLocFree(private_slp);
2614 compactSearch->query = query_seq;
2615
2616 BlastScoreBlkFill(sbp,(CharPtr) compactSearch->query,compactSearch->qlength,0);
2617
2618 sbp->kbp_gap_std[0] = BlastKarlinBlkCreate();
2619 KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_std[0],11,1,sbp->name,&error_return);
2620 if (1 == KarlinReturn) {
2621 BlastErrorPrint(error_return);
2622 }
2623 sbp->kbp_gap_psi[0] = BlastKarlinBlkCreate();
2624 KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_psi[0],11,1,sbp->name,&error_return);
2625 if (1 == KarlinReturn) {
2626 BlastErrorPrint(error_return);
2627 }
2628
2629 array_size = BlastKarlinGetMatrixValues(sbp->name,&open,&extend,&lambda,&K,&H,NULL);
2630 if (array_size > 0) {
2631 for (index = 0; index < array_size; index++) {
2632 if (open[index] == INT2_MAX && extend[index] == INT2_MAX) {
2633 sbp->kbp_ideal = BlastKarlinBlkCreate();
2634 sbp->kbp_ideal->Lambda = lambda[index];
2635 sbp->kbp_ideal->K = K[index];
2636 sbp->kbp_ideal->H = H[index];
2637 }
2638 }
2639 MemFree(open);
2640 MemFree(extend);
2641 MemFree(lambda);
2642 MemFree(K);
2643 MemFree(H);
2644 }
2645 if (sbp->kbp_ideal == NULL) {
2646 sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
2647 }
2648 compactSearch->lambda = sbp->kbp_gap_std[0]->Lambda;
2649 compactSearch->kbp_std = sbp->kbp_std;
2650 compactSearch->kbp_psi = sbp->kbp_psi;
2651 compactSearch->kbp_gap_std = sbp->kbp_gap_std;
2652 compactSearch->kbp_gap_psi = sbp->kbp_gap_psi;
2653 compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
2654 compactSearch->K_ideal = sbp->kbp_ideal->K;
2655
2656 stdrfp = BlastResFreqNew(sbp);
2657 BlastResFreqStdComp(sbp,stdrfp);
2658 compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
2659 for(a = 0; a < compactSearch->alphabetSize; a++)
2660 compactSearch->standardProb[a] = stdrfp->prob[a];
2661 stdrfp = BlastResFreqDestruct(stdrfp);
2662 posSearch->posInformation = NULL;
2663 /*---------------------------------------------------------------------------*/
2664 /* numseq is replaced with numalign (last argument) - each alignment is a */
2665 /* single segment and represents a separate sequence */
2666 /*---------------------------------------------------------------------------*/
2667 posAllocateMemory(posSearch, compactSearch->alphabetSize, compactSearch->qlength, numalign);
2668 CddfindThreshSequences(posSearch, listOfSeqAligns, numalign, numalign);
2669 sbp->kbp = sbp->kbp_psi;
2670 sbp->kbp_gap = sbp->kbp_gap_psi;
2671 CddposInitializeInformation(posSearch,sbp,compactSearch);
2672 CddposDemographics(posSearch, compactSearch, listOfSeqAligns);
2673 posPurgeMatches(posSearch, compactSearch, NULL);
2674 posComputeExtents(posSearch, compactSearch);
2675 posComputeSequenceWeights(posSearch, compactSearch, 1.0);
2676 posCheckWeights(posSearch, compactSearch);
2677 posSearch->posFreqs = (Nlm_FloatHi **) posComputePseudoFreqs(posSearch, compactSearch, TRUE);
2678 CddposFreqsToMatrix(posSearch,compactSearch);
2679 posScaling(posSearch, compactSearch);
2680
2681 sctp = SeqCodeTableFind(Seq_code_ncbistdaa);
2682 LetterHead = NULL;
2683 for (a=0;a<compactSearch->alphabetSize;a++) {
2684 newLetter = ValNodeNew(NULL);
2685 newLetter->data.ptrvalue = MemNew(2*sizeof(Char));
2686 Nlm_StrNCpy(newLetter->data.ptrvalue,&(sctp->letters[a]),1);
2687 ValNodeLink(&(LetterHead),newLetter);
2688 }
2689
2690 pcdd->posfreq = (MatrixPtr) MatrixNew();
2691 pcdd->posfreq->ncolumns = compactSearch->qlength;
2692 pcdd->posfreq->nrows = compactSearch->alphabetSize;
2693 pcdd->posfreq->scale_factor = POSFREQ_SCALE;
2694 pcdd->posfreq->row_labels = LetterHead;
2695 ColumnHead = NULL;
2696 for (index = 0; index<compactSearch->qlength;index++) {
2697 RowHead = NULL;
2698 for (a=0;a<compactSearch->alphabetSize;a++) {
2699 newRow = ValNodeNew(NULL);
2700 iNew = (Int4) (0.5 + (Nlm_FloatHi) pcdd->posfreq->scale_factor * posSearch->posFreqs[index][a]);
2701 newRow->data.intvalue = iNew;
2702 ValNodeLink(&(ColumnHead),newRow);
2703 }
2704 }
2705 pcdd->posfreq->columns = ColumnHead;
2706
2707 pcdd->scoremat = (MatrixPtr) MatrixNew();
2708 pcdd->scoremat->ncolumns = compactSearch->qlength;
2709 pcdd->scoremat->nrows = compactSearch->alphabetSize;
2710 pcdd->scoremat->scale_factor = 1;
2711 pcdd->scoremat->row_labels = LetterHead;
2712 ColumnHead = NULL;
2713 for (index = 0; index<compactSearch->qlength;index++) {
2714 RowHead = NULL;
2715 for (a=0;a<compactSearch->alphabetSize;a++) {
2716 newRow = ValNodeNew(NULL);
2717 iNew = (Int4) posSearch->posMatrix[index][a];
2718 newRow->data.intvalue = iNew;
2719 ValNodeLink(&(ColumnHead),newRow);
2720 }
2721 }
2722 pcdd->scoremat->columns = ColumnHead;
2723
2724 /*---------------------------------------------------------------------------*/
2725 /* Construct name for checkpoint file */
2726 /*---------------------------------------------------------------------------*/
2727 cAccession = CddGetAccession(pcdd);
2728 strcpy(ckptFileName,cAccession);
2729 strcat(ckptFileName,CKPTEXT);
2730
2731 CddposTakeCheckpoint(posSearch, compactSearch, ckptFileName, &error_return);
2732 strcpy(cseqFileName,cAccession);
2733 strcat(cseqFileName,CSEQEXT);
2734
2735 MemFree(cAccession);
2736
2737 fp = FileOpen(cseqFileName, "w");
2738 if (NULL == fp) {
2739 BlastConstructErrorMessage("CddCposComputation", "Could not open fasta file", 1, &error_return);
2740 }
2741 BioseqToFasta(bsp,fp,FALSE);
2742 FileClose(fp);
2743 }
2744
2745
2746 /*---------------------------------------------------------------------------*/
2747 /*---------------------------------------------------------------------------*/
2748 /* Count the number of seqAligns in a list (returned) and count the number */
2749 /* of distinct target sequences represented (passed back in numSequences); */
2750 /* Important assumption: All SeqAligns with the same target sequence are */
2751 /* consecutive in the list */
2752 /* ONLY works for lists of Seqaligns containing DenseSegs!!! */
2753 /*---------------------------------------------------------------------------*/
2754 /*---------------------------------------------------------------------------*/
CddCountSeqAligns(SeqAlignPtr listOfSeqAligns,Int4 * numSequences)2755 Int4 LIBCALL CddCountSeqAligns(SeqAlignPtr listOfSeqAligns, Int4 * numSequences)
2756 {
2757 SeqAlignPtr curSeqAlign, prevSeqAlign;
2758 Int4 seqAlignCounter;
2759 DenseSegPtr curSegs;
2760 SeqIdPtr curId, prevId;
2761
2762 seqAlignCounter = 0;
2763 *numSequences = 0;
2764 curSeqAlign = listOfSeqAligns;
2765 prevSeqAlign = NULL;
2766 while (NULL != curSeqAlign) {
2767 curSegs = (DenseSegPtr) curSeqAlign->segs;
2768 if(curSegs->ids == NULL) break;
2769 curId = curSegs->ids->next;
2770 seqAlignCounter++;
2771 if ((NULL == prevSeqAlign) || (!(SeqIdMatch(curId, prevId))))
2772 (*numSequences)++;
2773 prevSeqAlign = curSeqAlign;
2774 prevId = curId;
2775 curSeqAlign = curSeqAlign->next;
2776 }
2777 return(seqAlignCounter);
2778 }
2779
2780 /*---------------------------------------------------------------------------*/
2781 /*---------------------------------------------------------------------------*/
2782 /* Count the number of seqAligns in a list (returned) and count the number */
2783 /* of distinct target sequences represented (passed back in numSequences); */
2784 /* Important assumption: All SeqAligns with the same target sequence are */
2785 /* consecutive in the list */
2786 /* ONLY works for lists of Seqaligns containing DenseDiags!!! */
2787 /*---------------------------------------------------------------------------*/
2788 /*---------------------------------------------------------------------------*/
CddCountDenDiagSeqAligns(SeqAlignPtr listOfSeqAligns,Int4 * numSequences)2789 Int4 LIBCALL CddCountDenDiagSeqAligns(SeqAlignPtr listOfSeqAligns, Int4 * numSequences)
2790 {
2791 SeqAlignPtr curSeqAlign;
2792 Int4 seqAlignCounter;
2793 DenseDiagPtr curSegs;
2794
2795 seqAlignCounter = 0;
2796 *numSequences = 0;
2797 curSeqAlign = listOfSeqAligns;
2798 while (NULL != curSeqAlign) {
2799 curSegs = (DenseDiagPtr) curSeqAlign->segs;
2800 if(curSegs->id == NULL) break;
2801 seqAlignCounter++;
2802 (*numSequences)++;
2803 curSeqAlign = curSeqAlign->next;
2804 }
2805 return(seqAlignCounter);
2806 }
2807
2808 /*---------------------------------------------------------------------------*/
2809 /*---------------------------------------------------------------------------*/
2810 /* assign the range of the master sequence involved in alignments */
2811 /*---------------------------------------------------------------------------*/
2812 /*---------------------------------------------------------------------------*/
CddAssignProfileRange(CddPtr pcdd,SeqIdPtr sip)2813 void LIBCALL CddAssignProfileRange(CddPtr pcdd, SeqIdPtr sip)
2814 {
2815 SeqAlignPtr salp;
2816 DenseDiagPtr ddp;
2817 SeqIntPtr sintp;
2818 Int4 iMax = -10000000;
2819 Int4 iMin = 10000000;
2820
2821 sintp = (SeqIntPtr) SeqIntNew();
2822
2823 salp = (SeqAlignPtr) pcdd->seqannot->data;
2824
2825 while (salp) {
2826 ddp = (DenseDiagPtr) salp->segs;
2827 while (ddp) {
2828 if (ddp->starts[0] < iMin) iMin = ddp->starts[0];
2829 if ((ddp->starts[0] + ddp->len - 1) > iMax)
2830 iMax = ddp->starts[0] + ddp->len - 1;
2831 ddp = ddp->next;
2832 }
2833 salp = salp->next;
2834 }
2835 sintp->from = iMin;
2836 sintp->to = iMax;
2837 sintp->id = (SeqIdPtr) SeqIdDup(sip);
2838 if (iMax >= iMin) {
2839 pcdd->profile_range = (struct seqint PNTR) sintp;
2840 } else {
2841 printf(" CddAssignProfileRange: Warning: Can not assign alignment range!\n");
2842 }
2843 }
2844
2845
2846 /*---------------------------------------------------------------------------*/
2847 /*---------------------------------------------------------------------------*/
2848 /* return a pointer to a specific bioseq from a seq-entry set */
2849 /*---------------------------------------------------------------------------*/
2850 /*---------------------------------------------------------------------------*/
CddFindBioseqInSeqEntry(SeqEntryPtr sep,SeqIdPtr sip)2851 BioseqPtr LIBCALL CddFindBioseqInSeqEntry(SeqEntryPtr sep, SeqIdPtr sip)
2852 {
2853 SeqEntryPtr sepThis;
2854 BioseqPtr bsp;
2855 BioseqSetPtr bssp;
2856
2857 sepThis = sep;
2858
2859 while (sepThis) {
2860 if (IS_Bioseq(sepThis)) {
2861 bsp = sepThis->data.ptrvalue;
2862 if (CddSameSip(bsp->id, sip)) return(bsp);
2863 } else if (IS_Bioseq_set(sepThis)) {
2864 bssp = sepThis->data.ptrvalue;
2865 bsp = CddFindBioseqInSeqEntry(bssp->seq_set,sip);
2866 if (CddSameSip(bsp->id, sip)) return(bsp);
2867 }
2868 sepThis = sepThis->next;
2869 }
2870 return(NULL);
2871 }
2872
2873 /*---------------------------------------------------------------------------*/
2874 /* rips out and returns a PDBSeqId from a SeqId */
2875 /*---------------------------------------------------------------------------*/
CddGetPdbSeqId(SeqIdPtr sip)2876 PDBSeqIdPtr LIBCALL CddGetPdbSeqId(SeqIdPtr sip)
2877 /* may need to be modified according to how bioseq id is */
2878 {
2879 SeqIdPtr seq_id = NULL;
2880 PDBSeqIdPtr pdb_seq_id = NULL;
2881
2882 seq_id = sip;
2883 while(seq_id != NULL){
2884 if(seq_id->choice == SEQID_PDB) {
2885 pdb_seq_id = seq_id->data.ptrvalue;
2886 break;
2887 }
2888 seq_id = seq_id->next;
2889 }
2890 return(pdb_seq_id);
2891 }
2892
2893
2894 /*---------------------------------------------------------------------------*/
2895 /* test whether two SeqIds are the same, only considers PDB-Ids and GIs */
2896 /* as well as local sequence-id's */
2897 /*---------------------------------------------------------------------------*/
CddSameSip(SeqIdPtr sip1,SeqIdPtr sip2)2898 Boolean LIBCALL CddSameSip(SeqIdPtr sip1, SeqIdPtr sip2)
2899 {
2900 PDBSeqIdPtr psip1, psip2;
2901 SeqIdPtr tsip1, tsip2;
2902 ObjectIdPtr oidp1, oidp2;
2903
2904 if (sip1 == sip2) return(TRUE);
2905 tsip1 = sip1;
2906 while (tsip1) {
2907 tsip2 = sip2;
2908 while (tsip2) {
2909 if (tsip1->choice == tsip2->choice) {
2910 if (tsip1->choice == SEQID_PDB) {
2911 psip1 = (PDBSeqIdPtr) CddGetPdbSeqId(tsip1);
2912 psip2 = (PDBSeqIdPtr) CddGetPdbSeqId(tsip2);
2913 if (strcmp(psip1->mol,psip2->mol)==0 && psip1->chain==psip2->chain)
2914 return(TRUE);
2915 } else if (tsip1->choice == SEQID_LOCAL) {
2916 oidp1 = tsip1->data.ptrvalue;
2917 oidp2 = tsip2->data.ptrvalue;
2918 if (NULL != oidp1 && strcmp(oidp1->str,oidp2->str)==0) return(TRUE);
2919 } else if (tsip1->choice == SEQID_GI) {
2920 if (tsip1->data.intvalue == tsip2->data.intvalue) return(TRUE);
2921 }
2922 }
2923 tsip2 = tsip2->next;
2924 }
2925 tsip1 = tsip1->next;
2926 }
2927 return(FALSE);
2928 }
2929
2930 /*---------------------------------------------------------------------------*/
2931 /*---------------------------------------------------------------------------*/
2932 /* reindex master of a MSL-DenSeg Alignment according to a n-terminal offset */
2933 /* must allocate new arrays for "starts", these may just be pointers to DDG */
2934 /*---------------------------------------------------------------------------*/
2935 /*---------------------------------------------------------------------------*/
CddReindexMSLDenSegMaster(SeqAlignPtr salp,Int4 offset)2936 void LIBCALL CddReindexMSLDenSegMaster(SeqAlignPtr salp, Int4 offset)
2937 {
2938 SeqAlignPtr salpThis;
2939 DenseSegPtr dsp;
2940 Int4 *newstarts, i;
2941
2942 if (salp->type == SAT_PARTIAL && salp->segtype == SAS_DENSEG) {
2943 salpThis = salp;
2944 while (salpThis) {
2945 dsp = salpThis->segs;
2946 if (dsp) {
2947 newstarts = (Int4 *)MemNew(dsp->dim * sizeof(Int4));
2948 for (i=0;i<dsp->dim;i++) newstarts[i] = dsp->starts[i];
2949 newstarts[0] -= offset;
2950 dsp->starts = newstarts;
2951 }
2952 salpThis = salpThis->next;
2953 }
2954 }
2955 }
2956
2957
2958 /*---------------------------------------------------------------------------*/
2959 /*---------------------------------------------------------------------------*/
2960 /* reindex master of a MSL-DenDiag Alignment according to a n-terminal offset*/
2961 /* must allocate new arrays for "starts", these may just be pointers to DDG */
2962 /*---------------------------------------------------------------------------*/
2963 /*---------------------------------------------------------------------------*/
CddReindexMSLDenDiagMaster(SeqAlignPtr salp,Int4 offset)2964 void LIBCALL CddReindexMSLDenDiagMaster(SeqAlignPtr salp, Int4 offset)
2965 {
2966 SeqAlignPtr salpThis;
2967 DenseDiagPtr ddp;
2968
2969 if (salp->type == SAT_PARTIAL && salp->segtype == SAS_DENDIAG) {
2970 salpThis = salp;
2971 while (salpThis) {
2972 ddp = salpThis->segs;
2973 while (ddp) {
2974 ddp->starts[0] -= offset;
2975 ddp = ddp->next;
2976 }
2977 salpThis = salpThis->next;
2978 }
2979 }
2980 }
2981
2982
2983 /*---------------------------------------------------------------------------*/
2984 /*---------------------------------------------------------------------------*/
2985 /* makes a copy of one master-slave dense-diag alignment - does NOT copy the */
2986 /* identifiers. Should only be used for a reindexing the master for CposComp.*/
2987 /*---------------------------------------------------------------------------*/
2988 /*---------------------------------------------------------------------------*/
CddCopyMSLDenDiag(SeqAlignPtr salp)2989 SeqAlignPtr LIBCALL CddCopyMSLDenDiag(SeqAlignPtr salp)
2990 {
2991 SeqAlignPtr salpNew = NULL, salpHead = NULL, salpTail = NULL;
2992 DenseDiagPtr ddp, ddpNew, ddpTail;
2993 Int4 i;
2994
2995 if (salp->type == SAT_PARTIAL && salp->segtype == SAS_DENDIAG) {
2996 while (salp) {
2997 salpNew = SeqAlignNew();
2998 salpNew->type = salp->type;
2999 salpNew->segtype = salp->segtype;
3000 salpNew->dim = salp->dim;
3001 salpNew->segs = NULL;
3002 ddp = salp->segs;
3003 while (ddp) {
3004 ddpNew = DenseDiagNew();
3005 ddpNew->dim = ddp->dim;
3006 ddpNew->id = ddp->id;
3007 ddpNew->starts = MemNew(ddpNew->dim * sizeof(Int4));
3008 for (i =0;i<ddpNew->dim;i++) ddpNew->starts[i] = ddp->starts[i];
3009 ddpNew->len = ddp->len;
3010 if (!salpNew->segs) {
3011 salpNew->segs = ddpNew;
3012 ddpTail = ddpNew;
3013 ddpTail->next = NULL;
3014 } else {
3015 ddpTail->next = ddpNew;
3016 ddpTail = ddpNew;
3017 ddpTail->next = NULL;
3018 }
3019 ddp = ddp->next;
3020 }
3021 if (!salpHead) {
3022 salpHead = salpNew;
3023 salpTail = salpHead;
3024 salpTail->next = NULL;
3025 } else {
3026 salpTail->next = salpNew;
3027 salpTail = salpNew;
3028 salpTail->next = NULL;
3029 }
3030 salp = salp->next;
3031 }
3032 }
3033 return salpHead;
3034 }
3035
3036
3037 /*---------------------------------------------------------------------------*/
3038 /*---------------------------------------------------------------------------*/
3039 /* allocate and free a new ExplicitAlignment object */
3040 /*---------------------------------------------------------------------------*/
3041 /*---------------------------------------------------------------------------*/
CddExpAlignNew()3042 CddExpAlignPtr CddExpAlignNew()
3043 {
3044 CddExpAlignPtr pCDea;
3045
3046 pCDea = MemNew(sizeof(CddExpAlign));
3047 if (!pCDea) return NULL;
3048 pCDea->length = 0;
3049 pCDea->ids = NULL;
3050 pCDea->adata = NULL;
3051 pCDea->starts = NULL;
3052 pCDea->bIdAlloc = FALSE;
3053 return(pCDea);
3054 }
3055
CddExpAlignFree(CddExpAlignPtr pCDea)3056 CddExpAlignPtr CddExpAlignFree(CddExpAlignPtr pCDea)
3057 {
3058 if (!pCDea) return NULL;
3059 if (NULL != pCDea->adata) free(pCDea->adata);
3060 pCDea->ids = SeqIdSetFree(pCDea->ids);
3061 if (NULL != pCDea->starts) free(pCDea->starts);
3062 return MemFree(pCDea);
3063 }
3064
CddExpAlignAlloc(CddExpAlignPtr pCDea,Int4 iLength)3065 void CddExpAlignAlloc(CddExpAlignPtr pCDea, Int4 iLength)
3066 {
3067 Int4 i;
3068
3069 if (!pCDea || !iLength) return;
3070 pCDea->length = iLength;
3071 if (NULL != pCDea->adata) free(pCDea->adata);
3072 pCDea->adata = MemNew(sizeof(Int4)*pCDea->length);
3073 for (i=0;i<pCDea->length;i++) pCDea->adata[i] = -1;
3074 if (NULL != pCDea->starts) free(pCDea->starts);
3075 pCDea->starts = MemNew(sizeof(Int4)*pCDea->length);
3076 for (i=0;i<pCDea->length;i++) pCDea->starts[i] = 0;
3077 }
3078
3079 /*---------------------------------------------------------------------------*/
3080 /* Convert a SeqAlign (pairwise, DenseDiag) into an ExplicitAlignmentObject */
3081 /*---------------------------------------------------------------------------*/
SeqAlignToCddExpAlign(SeqAlignPtr salp,SeqEntryPtr sep)3082 CddExpAlignPtr LIBCALL SeqAlignToCddExpAlign(SeqAlignPtr salp, SeqEntryPtr sep)
3083 {
3084 CddExpAlignPtr pCDea;
3085 BioseqPtr bsp1;
3086 DenseDiagPtr ddp;
3087 Int4 i;
3088 SeqIdPtr sip;
3089
3090 pCDea = CddExpAlignNew();
3091 ddp = salp->segs;
3092 pCDea->ids = SeqIdDupList(ddp->id);
3093 sip = SeqIdDup(pCDea->ids);
3094 bsp1 = CddRetrieveBioseqById(sip,sep);
3095 SeqIdFree(sip);
3096 CddExpAlignAlloc(pCDea,bsp1->length);
3097 while (ddp) {
3098 for (i=0;i<ddp->len;i++) {
3099 if (ddp->starts[0]+i >= bsp1->length) {
3100 printf("Warning: Indexing error!\n");
3101 }
3102 pCDea->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3103 }
3104 pCDea->starts[ddp->starts[0]] = 1;
3105 ddp = ddp->next;
3106 }
3107 return(pCDea);
3108 }
3109
3110 /*---------------------------------------------------------------------------*/
3111 /* Invert an explicit alignment object (flip master-slave) */
3112 /*---------------------------------------------------------------------------*/
InvertCddExpAlign(CddExpAlignPtr pCDea,SeqEntryPtr sep)3113 CddExpAlignPtr InvertCddExpAlign(CddExpAlignPtr pCDea, SeqEntryPtr sep)
3114 {
3115 BioseqPtr bsp;
3116 CddExpAlignPtr pCDeaNew;
3117 Int4 i;
3118
3119 if (!pCDea->ids || !pCDea->ids->next) CddSevError("No SeqId in InvertCddExpAlign");
3120 pCDeaNew = CddExpAlignNew();
3121 pCDeaNew->ids = SeqIdDup(pCDea->ids->next);
3122 pCDeaNew->ids->next = SeqIdDup(pCDea->ids);
3123 pCDeaNew->bIdAlloc = TRUE;
3124 bsp = CddRetrieveBioseqById(pCDea->ids->next, sep);
3125 CddExpAlignAlloc(pCDeaNew, bsp->length);
3126 for (i=0;i<pCDea->length;i++) {
3127 if (pCDea->adata[i] != -1) {
3128 pCDeaNew->adata[pCDea->adata[i]] = i;
3129 }
3130 if (pCDea->starts[i] > 0) {
3131 pCDeaNew->starts[pCDea->adata[i]] = 1;
3132 }
3133 }
3134 return(pCDeaNew);
3135 }
3136
3137 /*---------------------------------------------------------------------------*/
3138 /* Convert an Explicit Alignment Object (with proper IDs) to a SeqAlign */
3139 /*---------------------------------------------------------------------------*/
CddExpAlignToSeqAlign(CddExpAlignPtr pCDea,Int4Ptr iBreakAfter)3140 SeqAlignPtr CddExpAlignToSeqAlign(CddExpAlignPtr pCDea, Int4Ptr iBreakAfter)
3141 {
3142 SeqAlignPtr salp;
3143 DenseDiagPtr ddp, ddplast = NULL;
3144 Int4 i;
3145
3146 if (!pCDea->ids || !pCDea->ids->next)
3147 CddSevError("Missing Sequence ID in CddExpAlignToSeqAlign");
3148 salp = SeqAlignNew();
3149 salp->type = SAT_PARTIAL;
3150 salp->segtype = SAS_DENDIAG;
3151 salp->dim = 2;
3152 salp->segs = NULL;
3153
3154 i= -1;
3155 while (++i < pCDea->length) {
3156 if (pCDea->adata[i] != -1) {
3157 ddp = DenseDiagNew();
3158 ddp->id = SeqIdDup(pCDea->ids);
3159 ddp->id->next = SeqIdDup(pCDea->ids->next);
3160 ddp->dim = 2;
3161 ddp->starts = MemNew(2*sizeof(Int4));
3162 ddp->starts[0] = i;
3163 ddp->starts[1] = pCDea->adata[i];
3164 ddp->len = 1;
3165 while (i<(pCDea->length-1) && pCDea->adata[i+1]==(pCDea->adata[i]+1) &&
3166 (NULL == iBreakAfter || iBreakAfter[i] == 0) &&
3167 pCDea->starts[i+1] == 0) {
3168 ddp->len++;
3169 i++;
3170 }
3171 if (ddplast) {
3172 ddplast->next = ddp;
3173 } else {
3174 salp->segs = ddp;
3175 }
3176 ddplast = ddp;
3177 }
3178 }
3179 if (!salp->segs) salp = SeqAlignFree(salp);
3180 return(salp);
3181 }
3182
3183 /*---------------------------------------------------------------------------*/
3184 /*---------------------------------------------------------------------------*/
3185 /* reindex a m-s1, m-s2 alignment pair to a s1-s2 alignment pair */
3186 /*---------------------------------------------------------------------------*/
3187 /*---------------------------------------------------------------------------*/
CddReindexExpAlign(CddExpAlignPtr pCDea1,Int4 newlength,CddExpAlignPtr pCDea2,Int4 iOuter,Int4 iInner)3188 CddExpAlignPtr CddReindexExpAlign(CddExpAlignPtr pCDea1, Int4 newlength,
3189 CddExpAlignPtr pCDea2, Int4 iOuter, Int4 iInner)
3190 {
3191 CddExpAlignPtr pCDea = NULL;
3192 Int4 i, iP1, iP2;
3193 SeqIdPtr sip1, sip2;
3194
3195 if (!pCDea1 || !pCDea2) return pCDea;
3196 if (pCDea1->ids && pCDea1->ids->next) {
3197 sip1 = SeqIdDup(pCDea1->ids->next);
3198 } else sip1 = NULL;
3199 if (pCDea2->ids && pCDea2->ids->next) {
3200 sip2 = SeqIdDup(pCDea2->ids->next);
3201 } else sip2 = NULL;
3202
3203 pCDea = CddExpAlignNew();
3204 CddExpAlignAlloc(pCDea,newlength);
3205 for (i=0;i<pCDea1->length && i<pCDea2->length;i++) {
3206 iP1 = pCDea1->adata[i]; iP2 = pCDea2->adata[i];
3207 if (iP1 > -1 && iP2 > -1){
3208 if (iP1 >= newlength) {
3209 printf("Warning: index error between sequences %d and %d\n",
3210 iInner, iOuter);
3211 CddSevError("Error while reindexing explicit alignments!");
3212 } else pCDea->adata[iP1] = iP2;
3213 if (pCDea1->starts[i] > 0) pCDea->starts[iP1] = 1;
3214 }
3215 }
3216 if (sip1 && sip2) {
3217 pCDea->ids = sip1;
3218 pCDea->ids->next = sip2;
3219 pCDea->bIdAlloc = TRUE;
3220 }
3221 return pCDea;
3222 }
3223
3224 /*---------------------------------------------------------------------------*/
3225 /*---------------------------------------------------------------------------*/
3226 /* returns a pairwise % id for two sequences in a CD (ordered by alignmt) */
3227 /*---------------------------------------------------------------------------*/
3228 /*---------------------------------------------------------------------------*/
CddGetPairId(TrianglePtr pTri,Int4 idx1,Int4 idx2)3229 FloatHi CddGetPairId(TrianglePtr pTri, Int4 idx1, Int4 idx2)
3230 {
3231 Int4 i, index;
3232 float findex;
3233 ScorePtr psc;
3234
3235 if (idx1 == idx2) return 100.0;
3236 if (idx1 > idx2) { i = idx1; idx1 = idx2; idx2 = i; }
3237 findex=(float)idx1*((float)pTri->nelements-((float)idx1+1.0)/2.0-1.0)+(float)idx2-1.0;
3238 index = (Int4) findex;
3239 if (index < 0) CddSevError("Impossible index in CddGetPairId!");
3240 psc = (ScorePtr) pTri->scores; i=0;
3241 while (psc && (i<index)) {
3242 i++; psc = psc->next;
3243 }
3244 if (psc) return psc->value.realvalue;
3245 else CddSevError("Ran out of scores in CddGetPairId!");
3246 return((FloatHi) 0.0);
3247 }
3248
HitYet(Int4Ptr retlist,Int4 index,Int4 i)3249 static Boolean HitYet(Int4Ptr retlist, Int4 index, Int4 i)
3250 {
3251 Int4 j;
3252 for (j=0;j<index;j++) {
3253 if (retlist[j]==i) {
3254 return TRUE;
3255 }
3256 }
3257 return FALSE;
3258 }
3259
3260 /*---------------------------------------------------------------------------*/
3261 /*---------------------------------------------------------------------------*/
3262 /* return a list of indices for the n most dissimilar sequences in a CD */
3263 /*---------------------------------------------------------------------------*/
3264 /*---------------------------------------------------------------------------*/
CddMostDiverse(TrianglePtr pTri,Int4 length,Int4 maxdiv)3265 Int4Ptr CddMostDiverse(TrianglePtr pTri, Int4 length, Int4 maxdiv)
3266 {
3267 Int4Ptr retlist;
3268 Int4 index, winner, i, j, ncomp;
3269 FloatHi sumcomp, mincomp;
3270 FloatHi **iMat;
3271 ScorePtr psc;
3272 ValNodePtr vnp;
3273
3274 if (!pTri) return NULL;
3275 if (length >= pTri->nelements) return NULL;
3276 if (length > maxdiv) length = maxdiv;
3277 retlist = MemNew(length * sizeof(Int4));
3278 retlist[0] = 0;
3279
3280 if (pTri->div_ranks) {
3281 vnp = pTri->div_ranks; i = 0;
3282 while (vnp) {
3283 retlist[i] = vnp->data.intvalue;
3284 i++;
3285 if (i >= length) break;
3286 vnp = vnp->next;
3287 }
3288 return(retlist);
3289 }
3290
3291 iMat = (FloatHi **) calloc(pTri->nelements,sizeof (FloatHi *));
3292 for (i=0;i<pTri->nelements;i++) {
3293 iMat[i] = (FloatHi *) calloc(pTri->nelements,sizeof(FloatHi));
3294 iMat[i][i] = 100;
3295 }
3296 psc = (ScorePtr) pTri->scores;
3297 for (i=0;i<pTri->nelements;i++) {
3298 for (j=i+1;j<pTri->nelements;j++) {
3299 iMat[i][j] = iMat[j][i] = psc->value.realvalue;
3300 psc = psc->next;
3301 }
3302 }
3303
3304 for (index = 1; index < length; index++) {
3305 mincomp = 100.0, winner = -1;
3306 for (i=1;i<pTri->nelements;i++) {
3307 sumcomp = 0.0; ncomp = 0;
3308 if (!HitYet(retlist,index,i)) {
3309 for (j=0;j<pTri->nelements;j++) {
3310 if (HitYet(retlist,index,j)) {
3311 /* sumcomp += CddGetPairId(pTri, i, j); */
3312 sumcomp += iMat[i][j];
3313 ncomp ++;
3314 }
3315 }
3316 if (ncomp) sumcomp /= (FloatHi) ncomp;
3317 if (sumcomp < mincomp) {
3318 mincomp = sumcomp; winner = i;
3319 }
3320 }
3321 }
3322 retlist[index] = winner;
3323 }
3324 for (i=0;i<pTri->nelements;i++) free(iMat[i]);
3325 free(iMat);
3326 return (retlist);
3327 }
3328
3329 /*---------------------------------------------------------------------------*/
3330 /*---------------------------------------------------------------------------*/
3331 /* return a list of indices for the n most similar sequences for the query */
3332 /*---------------------------------------------------------------------------*/
3333 /*---------------------------------------------------------------------------*/
CddMostSimilarToQuery(ScorePtr psc,Int4 length)3334 Int4Ptr CddMostSimilarToQuery(ScorePtr psc, Int4 length)
3335 {
3336 Int4Ptr retlist;
3337 Int4 winner, index, i, nelements;
3338 FloatHi sumcomp, mincomp;
3339 ScorePtr thispsc;
3340
3341 if (!psc) return NULL;
3342 nelements = 0; thispsc = psc;
3343 while (thispsc) { nelements++; thispsc = thispsc->next; }
3344 if (length >= nelements) return NULL;
3345 retlist = MemNew(length * sizeof(Int4));
3346 retlist[0] = 0;
3347 for (index=1;index<length;index++) {
3348 mincomp = 0.0; winner = -1;
3349 i = 0;
3350 thispsc = psc->next;
3351 while (thispsc) {
3352 i++;
3353 if (!HitYet(retlist,index,i)) {
3354 sumcomp = thispsc->value.realvalue;
3355 if (sumcomp > mincomp) {
3356 mincomp = sumcomp;
3357 winner = i;
3358 }
3359 }
3360 thispsc = thispsc->next;
3361 }
3362 retlist[index] = winner;
3363 }
3364 return(retlist);
3365 }
3366
3367
3368 /*---------------------------------------------------------------------------*/
3369 /*---------------------------------------------------------------------------*/
3370 /* retrieve a sequence by trying to locate it in the CDD bioseq-set or other */
3371 /* wise through the sequence/object-manager or ID1 */
3372 /*---------------------------------------------------------------------------*/
3373 /*---------------------------------------------------------------------------*/
CddRetrieveBioseqById(SeqIdPtr sip,SeqEntryPtr sep)3374 BioseqPtr CddRetrieveBioseqById(SeqIdPtr sip, SeqEntryPtr sep)
3375 {
3376 BioseqPtr bsp = NULL;
3377 Int4 uid = 0;
3378 /* SeqEntryPtr sepThis; */
3379 Int2 retcode = 1;
3380
3381 bsp = (BioseqPtr) CddFindSip(sip, sep);
3382 if (!bsp) {
3383 bsp = BioseqLockById(sip);
3384 }
3385 /*
3386 if (!bsp) {
3387 uid = ID1FindSeqId(sip);
3388 if (uid > 0) {
3389 sepThis = (SeqEntryPtr) ID1SeqEntryGet(uid,retcode);
3390 if (sepThis) {
3391 bsp = CddExtractBioseq(sepThis, sip);
3392 }
3393 }
3394 }
3395 */
3396 return bsp;
3397 }
3398
3399 /*---------------------------------------------------------------------------*/
3400 /*---------------------------------------------------------------------------*/
3401 /* Calculate pairwise percentages of identical residues for sequences in a CD*/
3402 /*---------------------------------------------------------------------------*/
3403 /*---------------------------------------------------------------------------*/
CddCalculateTriangle(CddPtr pcdd)3404 TrianglePtr CddCalculateTriangle(CddPtr pcdd)
3405 {
3406 BioseqPtr bsp1 = NULL, bsp2, bsp3;
3407 BioseqSetPtr bssp = NULL;
3408 CddExpAlignPtr pCDea, pCDea1, pCDea2;
3409 DenseDiagPtr ddp;
3410 SeqAlignPtr salpOuter, salpInner, salpHead;
3411 SeqIdPtr sip1, sip2, sip3;
3412 SeqPortPtr spp1, spp2, spp3;
3413 ScorePtr pscHead=NULL, psc, pscTail;
3414 TrianglePtr pTri;
3415 Uint1Ptr buffer1, buffer2, buffer3;
3416 Int4 i, alen, nid, iOuter, iInner;
3417
3418
3419 if (!pcdd->seqannot) return NULL;
3420 salpHead = pcdd->seqannot->data;
3421 pTri = TriangleNew();
3422 pTri->nelements = 1;
3423
3424 /*---------------------------------------------------------------------------*/
3425 /* Loop around Outer pointer to get master-slave alignments */
3426 /*---------------------------------------------------------------------------*/
3427 salpOuter = salpHead; bsp1 = NULL; sip1 = NULL;
3428 while (salpOuter) {
3429 pTri->nelements++;
3430 ddp = salpOuter->segs;
3431 if (!sip1) sip1 = SeqIdDup(ddp->id);
3432 sip2 = SeqIdDup(ddp->id->next);
3433 if (!bssp) bssp = pcdd->sequences->data.ptrvalue;
3434 if (!bsp1) {
3435 bsp1 = (BioseqPtr) CddRetrieveBioseqById(sip1, bssp->seq_set);
3436 buffer1 = MemNew((bsp1->length)*sizeof(Uint1));
3437 spp1 = SeqPortNew(bsp1, 0, bsp1->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3438 for (i=0; i<bsp1->length;i++) buffer1[i] = SeqPortGetResidue(spp1);
3439 spp1 = SeqPortFree(spp1);
3440 }
3441 bsp2 = NULL;
3442 bsp2 = (BioseqPtr) CddRetrieveBioseqById(sip2, bssp->seq_set);
3443 buffer2 = MemNew((bsp2->length)*sizeof(Uint1));
3444 spp2 = SeqPortNew(bsp2, 0, bsp2->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3445 for (i=0; i<bsp2->length;i++) buffer2[i] = SeqPortGetResidue(spp2);
3446 spp2 = SeqPortFree(spp2);
3447 pCDea = CddExpAlignNew();
3448 CddExpAlignAlloc(pCDea,bsp1->length);
3449 while (ddp) {
3450 for (i=0;i<ddp->len;i++) {
3451 pCDea->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3452 }
3453 ddp = ddp->next;
3454 }
3455 alen = 0; nid = 0;
3456 for (i=0;i<bsp1->length;i++) {
3457 if (pCDea->adata[i] > -1) {
3458 alen++; if (buffer1[i] == buffer2[pCDea->adata[i]]) nid++;
3459 }
3460 }
3461 psc = ScoreNew(); psc->id=ObjectIdNew(); psc->id->str=StringSave("%id");
3462 psc->choice = 2;
3463 if (alen) psc->value.realvalue = 100.0 * (FloatHi) nid / (FloatHi) alen;
3464 else psc->value.realvalue = 0.0;
3465 psc->next = NULL;
3466 if (!pscHead) {
3467 pscHead = psc; pscTail = psc;
3468 } else {
3469 pscTail->next = psc; pscTail = psc; psc = NULL;
3470 }
3471 pCDea = CddExpAlignFree(pCDea);
3472 MemFree(buffer2); SeqIdFree(sip2);
3473 salpOuter = salpOuter->next;
3474 }
3475
3476 /*---------------------------------------------------------------------------*/
3477 /* Loop around Outer and Inner Pointer to get slave-slave similarities */
3478 /*---------------------------------------------------------------------------*/
3479 salpOuter = salpHead; iOuter = 1;
3480 while (salpOuter->next) {
3481 iOuter++;
3482 ddp = salpOuter->segs;
3483 sip2 = SeqIdDup(ddp->id->next); bsp2 = NULL;
3484 bsp2 = (BioseqPtr) CddRetrieveBioseqById(sip2, bssp->seq_set);
3485 buffer2 = MemNew((bsp2->length)*sizeof(Uint1));
3486 spp2 = SeqPortNew(bsp2, 0, bsp2->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3487 for (i=0; i<bsp2->length;i++) buffer2[i] = SeqPortGetResidue(spp2);
3488 spp2 = SeqPortFree(spp2);
3489 pCDea1 = CddExpAlignNew();
3490 CddExpAlignAlloc(pCDea1,bsp1->length);
3491 while (ddp) {
3492 for (i=0;i<ddp->len;i++) pCDea1->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3493 ddp = ddp->next;
3494 }
3495 salpInner = salpOuter->next;
3496 iInner = iOuter;
3497 while (salpInner) {
3498 iInner++;
3499 ddp = salpInner->segs;
3500 sip3 = SeqIdDup(ddp->id->next); bsp3 = NULL;
3501 bsp3 = (BioseqPtr) CddRetrieveBioseqById(sip3,bssp->seq_set);
3502 buffer3 =MemNew((bsp3->length)*sizeof(Uint1));
3503 spp3 = SeqPortNew(bsp3, 0, bsp3->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3504 for (i=0; i<bsp3->length;i++) buffer3[i] = SeqPortGetResidue(spp3);
3505 spp3 = SeqPortFree(spp3);
3506 pCDea2 = CddExpAlignNew();
3507 CddExpAlignAlloc(pCDea2,bsp1->length);
3508 while (ddp) {
3509 for (i=0;i<ddp->len;i++) pCDea2->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3510 ddp = ddp->next;
3511 }
3512 pCDea = CddReindexExpAlign(pCDea1, bsp2->length, pCDea2, iOuter, iInner);
3513 alen = 0; nid = 0;
3514 for (i=0;i<bsp2->length;i++) {
3515 if (pCDea->adata[i] > -1) {
3516 alen++; if (buffer2[i] == buffer3[pCDea->adata[i]]) nid++;
3517 }
3518 }
3519 psc = ScoreNew(); psc->id=ObjectIdNew(); psc->id->str=StringSave("%id");
3520 psc->choice = 2;
3521 if (alen) psc->value.realvalue = 100.0 * (FloatHi) nid / (FloatHi) alen;
3522 else psc->value.realvalue = 0.0;
3523 psc->next = NULL;
3524 pscTail->next = psc; pscTail = psc; psc = NULL;
3525 pCDea2 = CddExpAlignFree(pCDea2);
3526 pCDea = CddExpAlignFree(pCDea);
3527 MemFree(buffer3); SeqIdFree(sip3);
3528 salpInner = salpInner->next;
3529 }
3530 pCDea1 = CddExpAlignFree(pCDea1);
3531 MemFree(buffer2); SeqIdFree(sip2);
3532 salpOuter = salpOuter->next;
3533 }
3534 if (sip1) SeqIdFree(sip1); if (bsp1) bsp1 = NULL;
3535 if (buffer1) MemFree(buffer1);
3536 pTri->scores = (struct score PNTR) pscHead;
3537 return(pTri);
3538 }
3539
3540 /*---------------------------------------------------------------------------*/
3541 /*---------------------------------------------------------------------------*/
3542 /* Calculate pairwise similarities for the query added to a cd alignment */
3543 /*---------------------------------------------------------------------------*/
3544 /*---------------------------------------------------------------------------*/
CddCalculateQuerySim(CddPtr pcdd,SeqAlignPtr salp)3545 ScorePtr CddCalculateQuerySim(CddPtr pcdd, SeqAlignPtr salp)
3546 {
3547 BioseqPtr bsp1, bsp2, bsp3;
3548 BioseqSetPtr bssp = NULL;
3549 CddExpAlignPtr pCDea, pCDea1, pCDea2;
3550 DenseDiagPtr ddp;
3551 SeqAlignPtr salpOuter, salpInner, salpHead;
3552 SeqIdPtr sip1, sip2, sip3;
3553 SeqPortPtr spp1, spp2, spp3;
3554 ScorePtr pscHead=NULL, psc, pscTail;
3555 Uint1Ptr buffer1, buffer2, buffer3;
3556 Int4 i, alen, nid;
3557
3558
3559 if (!salp || !pcdd) return NULL;
3560 salpHead = salp;
3561 /*---------------------------------------------------------------------------*/
3562 /* master-query alignment */
3563 /*---------------------------------------------------------------------------*/
3564 salpOuter = salpHead; bsp1 = NULL; sip1 = NULL;
3565 pCDea = CddExpAlignNew();
3566 ddp = salpOuter->segs;
3567 sip1 = SeqIdDup(ddp->id);
3568 sip2 = SeqIdDup(ddp->id->next);
3569 bssp = pcdd->sequences->data.ptrvalue;
3570 bsp1 = (BioseqPtr) CddRetrieveBioseqById(sip1, bssp->seq_set);
3571 buffer1 = MemNew((bsp1->length)*sizeof(Uint1));
3572 spp1 = SeqPortNew(bsp1, 0, bsp1->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3573 for (i=0; i<bsp1->length;i++) buffer1[i] = SeqPortGetResidue(spp1);
3574 spp1 = SeqPortFree(spp1);
3575 bsp2 = (BioseqPtr) CddRetrieveBioseqById(sip2, bssp->seq_set);
3576 buffer2 = MemNew((bsp2->length)*sizeof(Uint1));
3577 spp2 = SeqPortNew(bsp2, 0, bsp2->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3578 for (i=0; i<bsp2->length;i++) buffer2[i] = SeqPortGetResidue(spp2);
3579 spp2 = SeqPortFree(spp2);
3580 CddExpAlignAlloc(pCDea,bsp1->length);
3581 while (ddp) {
3582 for (i=0;i<ddp->len;i++) {
3583 pCDea->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3584 }
3585 ddp = ddp->next;
3586 }
3587 alen = 0; nid = 0;
3588 for (i=0;i<bsp1->length;i++) {
3589 if (pCDea->adata[i] > -1) {
3590 alen++; if (buffer1[i] == buffer2[pCDea->adata[i]]) nid++;
3591 }
3592 }
3593 pCDea = CddExpAlignFree(pCDea);
3594 psc = ScoreNew(); psc->id=ObjectIdNew(); psc->id->str=StringSave("%id");
3595 psc->choice = 2;
3596 if (alen) psc->value.realvalue = 100.0 * (FloatHi) nid / (FloatHi) alen;
3597 else psc->value.realvalue = 0.0;
3598 psc->next = NULL;
3599 pscHead = psc; pscTail = psc; psc = NULL;
3600 MemFree(buffer2); SeqIdFree(sip2);
3601 /*---------------------------------------------------------------------------*/
3602 /* Loop around Inner Pointer to get query-slave similarities */
3603 /*---------------------------------------------------------------------------*/
3604 salpOuter = salpHead;
3605 pCDea1 = CddExpAlignNew();
3606 ddp = salpOuter->segs;
3607 sip2 = SeqIdDup(ddp->id->next);
3608 bsp2 = (BioseqPtr) CddRetrieveBioseqById(sip2, bssp->seq_set);
3609 buffer2 = MemNew((bsp2->length)*sizeof(Uint1));
3610 spp2 = SeqPortNew(bsp2, 0, bsp2->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3611 for (i=0; i<bsp2->length;i++) buffer2[i] = SeqPortGetResidue(spp2);
3612 spp2 = SeqPortFree(spp2);
3613 CddExpAlignAlloc(pCDea1,bsp1->length);
3614 while (ddp) {
3615 for (i=0;i<ddp->len;i++) pCDea1->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3616 ddp = ddp->next;
3617 }
3618 salpInner = salpOuter->next;
3619 while (salpInner) {
3620 pCDea2 = CddExpAlignNew();
3621 ddp = salpInner->segs;
3622 sip3 = SeqIdDup(ddp->id->next);
3623 bsp3 = (BioseqPtr) CddRetrieveBioseqById(sip3,bssp->seq_set);
3624 buffer3 =MemNew((bsp3->length)*sizeof(Uint1));
3625 spp3 = SeqPortNew(bsp3, 0, bsp3->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3626 for (i=0; i<bsp3->length;i++) buffer3[i] = SeqPortGetResidue(spp3);
3627 spp3 = SeqPortFree(spp3);
3628 CddExpAlignAlloc(pCDea2,bsp1->length);
3629 while (ddp) {
3630 for (i=0;i<ddp->len;i++) pCDea2->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3631 ddp = ddp->next;
3632 }
3633 pCDea = CddReindexExpAlign(pCDea1, bsp2->length, pCDea2,0,0);
3634 alen = 0; nid = 0;
3635 for (i=0;i<bsp2->length;i++) {
3636 if (pCDea->adata[i] > -1) {
3637 alen++; if (buffer2[i] == buffer3[pCDea->adata[i]]) nid++;
3638 }
3639 }
3640 pCDea = CddExpAlignFree(pCDea);
3641 psc = ScoreNew(); psc->id=ObjectIdNew(); psc->id->str=StringSave("%id");
3642 psc->choice = 2;
3643 if (alen) psc->value.realvalue = 100.0 * (FloatHi) nid / (FloatHi) alen;
3644 else psc->value.realvalue = 0.0;
3645 psc->next = NULL;
3646 pscTail->next = psc; pscTail = psc; psc = NULL;
3647 pCDea2 = CddExpAlignFree(pCDea2);
3648 MemFree(buffer3); SeqIdFree(sip3);
3649 salpInner = salpInner->next;
3650 }
3651 pCDea1 = CddExpAlignFree(pCDea1);
3652 MemFree(buffer2); SeqIdFree(sip2);
3653 if (sip1) SeqIdFree(sip1); if (bsp1) bsp1 = NULL;
3654 if (buffer1) MemFree(buffer1);
3655 return(pscHead);
3656 }
3657
3658
3659 /*---------------------------------------------------------------------------*/
3660 /*---------------------------------------------------------------------------*/
3661 /* cloned from salpedit.c and salpacc.c */
3662 /*---------------------------------------------------------------------------*/
3663 /*---------------------------------------------------------------------------*/
CddSeqAnnotForSeqAlign(SeqAlignPtr salp)3664 SeqAnnotPtr LIBCALL CddSeqAnnotForSeqAlign (SeqAlignPtr salp)
3665 {
3666 SeqAnnotPtr sap;
3667
3668 if (salp==NULL)
3669 return NULL;
3670 sap = SeqAnnotNew ();
3671 sap->type = 2;
3672 sap->data = (Pointer) salp;
3673 return sap;
3674 }
3675
CddSeqAlignDup(SeqAlignPtr salp)3676 SeqAlignPtr LIBCALL CddSeqAlignDup (SeqAlignPtr salp)
3677 {
3678 SeqAnnotPtr sap,
3679 sap2;
3680 SeqAlignPtr salp2 = NULL,
3681 next;
3682
3683 next = salp->next;
3684 salp->next = NULL;
3685 sap = CddSeqAnnotForSeqAlign (salp);
3686 sap2 = (SeqAnnotPtr) AsnIoMemCopy ((Pointer) sap, (AsnReadFunc) SeqAnnotAsnRead, (AsnWriteFunc) SeqAnnotAsnWrite);
3687 if (sap2!=NULL) {
3688 salp2 = (SeqAlignPtr)sap2->data;
3689 sap2->data = NULL;
3690 SeqAnnotFree (sap2);
3691 }
3692 if (next != NULL)
3693 salp->next = next;
3694 sap->data = NULL;
3695 SeqAnnotFree (sap);
3696 return salp2;
3697 }
3698
3699
3700
3701 /*---------------------------------------------------------------------------*/
3702 /*---------------------------------------------------------------------------*/
3703 /* Duplicate a whole Sequence Alignment (i.e. a linked list of Seqaligns */
3704 /*---------------------------------------------------------------------------*/
3705 /*---------------------------------------------------------------------------*/
3706
3707
SeqAlignSetDup(SeqAlignPtr salp)3708 SeqAlignPtr LIBCALL SeqAlignSetDup(SeqAlignPtr salp)
3709 {
3710 SeqAlignPtr salpHead = NULL;
3711 SeqAlignPtr salpTail = NULL;
3712 SeqAlignPtr salpThis;
3713
3714 while (salp) {
3715 salpThis = (SeqAlignPtr) CddSeqAlignDup(salp);
3716 if (salpTail) {
3717 salpTail->next = salpThis;
3718 salpTail = salpThis;
3719 salpThis->next = NULL;
3720 } else {
3721 salpHead = salpThis;
3722 salpTail = salpThis;
3723 salpThis->next = NULL;
3724 }
3725 salp = salp->next;
3726 }
3727 return (salpHead);
3728 }
3729
3730
3731 /*---------------------------------------------------------------------------*/
3732 /*---------------------------------------------------------------------------*/
3733 /* fill internal gaps in a sequence alignment to prepare it for PSSM calc. */
3734 /*---------------------------------------------------------------------------*/
3735 /*---------------------------------------------------------------------------*/
CddDegapSeqAlign(SeqAlignPtr salp)3736 void LIBCALL CddDegapSeqAlign(SeqAlignPtr salp)
3737 {
3738 DenseDiagPtr ddp;
3739 Int4 iM, iS, iN, iC;
3740 Boolean bFirst, bInit = TRUE;
3741 BioseqPtr bsp;
3742 SeqIdPtr sip;
3743 Int4 mLen, sLen;
3744
3745 while (salp) {
3746 ddp = salp->segs;
3747 if (bInit) {
3748 sip = ddp->id;
3749 bsp = BioseqLockById(sip);
3750 mLen = bsp->length;
3751 BioseqUnlock(bsp);
3752 bInit = FALSE;
3753 }
3754 sip = ddp->id->next;
3755 bsp = BioseqLockById(sip);
3756 sLen = bsp->length;
3757 BioseqUnlock(bsp);
3758 bFirst = TRUE;
3759 while (ddp) {
3760 if (bFirst) { /* extend first aligned block to N-terminus */
3761 iM = ddp->starts[0];
3762 iS = ddp->starts[1];
3763 if (iM > iS) iM = iS;
3764 ddp->starts[0] -= iM;
3765 ddp->starts[1] -= iM;
3766 ddp->len += iM;
3767 bFirst = FALSE;
3768 }
3769 if (ddp->next) {
3770 iM = ddp->next->starts[0] - (ddp->starts[0]+ddp->len);
3771 iS = ddp->next->starts[1] - (ddp->starts[1]+ddp->len);
3772 if (iM > iS) iM = iS;
3773 if (iM > 0) {
3774 if (iM == 1) {
3775 iN = 1; iC = 0;
3776 } else {
3777 if ((iM % 2) == 0) {
3778 iN = iC = iM / 2;
3779 } else {
3780 iC = iM / 2; iN = iC + 1;
3781 }
3782 }
3783 ASSERT(iN+iC == iM);
3784 ddp->len += iN;
3785 ddp->next->starts[0] -= iC;
3786 ddp->next->starts[1] -= iC;
3787 ddp->next->len += iC;
3788 }
3789 } else { /* extend last aligned block to C-terminus */
3790 iM = mLen - (ddp->starts[0]+ddp->len);
3791 iS = sLen - (ddp->starts[1]+ddp->len);
3792 if (iM > iS) iM = iS;
3793 ddp->len += iM;
3794 }
3795 ddp = ddp->next;
3796 }
3797 salp = salp->next;
3798 }
3799 }
3800
3801 /*---------------------------------------------------------------------------*/
3802 /*---------------------------------------------------------------------------*/
3803 /* Duplicate a SeqId, pick the GI if encountering a linked list of several */
3804 /*---------------------------------------------------------------------------*/
3805 /*---------------------------------------------------------------------------*/
CddSeqIdDupPDBGI(SeqIdPtr sipold)3806 SeqIdPtr LIBCALL CddSeqIdDupPDBGI(SeqIdPtr sipold)
3807 {
3808 SeqIdPtr sip, sipgi = NULL, sippdb = NULL;
3809
3810 sip = sipold;
3811 while (sip) {
3812 if (sip->choice == SEQID_GI) sipgi = sip;
3813 if (sip->choice == SEQID_PDB) sippdb = sip;
3814 sip = sip->next;
3815 }
3816 if (sippdb) return(SeqIdDup(sippdb));
3817 if (sipgi) return(SeqIdDup(sipgi));
3818 return (SeqIdDup(sipold));
3819 }
3820
3821 /*---------------------------------------------------------------------------*/
3822 /*---------------------------------------------------------------------------*/
3823 /* Calculate a weighted 50/50 consensus sequence and make it new master */
3824 /*---------------------------------------------------------------------------*/
3825 /*---------------------------------------------------------------------------*/
CddConsensus(SeqAlignPtr salp,SeqEntryPtr sep,BioseqPtr bsp,SeqIntPtr range,BioseqPtr * bspCons,SeqAlignPtr * salpCons)3826 SeqAlignPtr LIBCALL CddConsensus(SeqAlignPtr salp,
3827 SeqEntryPtr sep,
3828 BioseqPtr bsp,
3829 SeqIntPtr range,
3830 BioseqPtr *bspCons,
3831 SeqAlignPtr *salpCons)
3832 {
3833 BioseqPtr bspThis;
3834 CddAlignWeightPtr pCAW;
3835 CddExpAlignPtr pCDea;
3836 CddExtAlignCellPtr *pCEACell;
3837 CddExtAlignColPtr pCEACol;
3838 CharPtr cbuffer, outptr;
3839 DenseDiagPtr ddp;
3840 Int4Ptr trunc_on_virtual;
3841 Int4Ptr trunc_aligned;
3842 ObjectIdPtr oidp;
3843 SeqAlignPtr salpThis, salpNew, salpTail;
3844 SeqEntryPtr sepNew;
3845 SeqIdPtr sipThis, sipCopy;
3846 SeqMapTablePtr smtp;
3847 SeqPortPtr spp;
3848 Uint1Ptr buffer;
3849 FloatHi SumWeight;
3850 Int4 maxextlen, nalign, ConsensusLen;
3851 Int4 index, midpt, ContWeight = 0;
3852 Int4 offset, i, j , k, ipt, seqpos;
3853 Int4 m_last, m_frst, s_insert, m_end;
3854 Int4 s_last, s_frst, maxpos = 0;
3855
3856 if (!salp) return (NULL);
3857 if (salp->type != SAT_PARTIAL || salp->segtype != SAS_DENDIAG) return(NULL);
3858 if (!range) return (NULL);
3859 offset = range->from;
3860 /*---------------------------------------------------------------------------*/
3861 /* trunc_on_virtual holds, for every residue of the trunc_master, its posi- */
3862 /* tion on the maximal expanded new master sequence */
3863 /* bsp is supposed to be the "truncated master", i.e. a bioseq covering the */
3864 /* interval from the first to the last aligned residue on the master sequence*/
3865 /*---------------------------------------------------------------------------*/
3866 trunc_on_virtual = MemNew(sizeof(Int4)*bsp->length);
3867 for (i=0;i<bsp->length;i++) trunc_on_virtual[i] = i;
3868 maxextlen = bsp->length-1;
3869 trunc_aligned = MemNew(sizeof(Int4)*bsp->length);
3870 for (i=0;i<bsp->length;i++) trunc_aligned[i] = 0;
3871
3872 /*---------------------------------------------------------------------------*/
3873 /* expand values on trunc_on_virtual to consider insertions relativ to master*/
3874 /*---------------------------------------------------------------------------*/
3875 nalign = 1;
3876 salpThis = salp;
3877 while (salpThis) {
3878 nalign++;
3879 ddp = salpThis->segs;
3880 if (ddp) for (j=ddp->starts[0]-offset;j<ddp->starts[0]+ddp->len-offset;j++) trunc_aligned[j]=1;
3881 while (ddp && ddp->next) {
3882 for (j=ddp->next->starts[0]-offset;j<ddp->next->starts[0]+ddp->next->len-offset;j++) trunc_aligned[j]=1;
3883 m_last = ddp->starts[0]+ddp->len - 1 - offset;
3884 m_frst = ddp->next->starts[0] - offset;
3885 m_end = ddp->next->starts[0] + ddp->next->len - 1 - offset;
3886 ASSERT(m_frst >= 0 && m_frst < bsp->length);
3887 ASSERT(m_last >= 0 && m_last < bsp->length);
3888 ASSERT(m_end >= 0 && m_end < bsp->length);
3889 /*---------------------------------------------------------------------------*/
3890 /* shift block from the c- to n-terminus if necessary */
3891 /*---------------------------------------------------------------------------*/
3892 /* for (j = m_end; j > m_frst; j--) {
3893 if (trunc_on_virtual[j-1] < trunc_on_virtual[j]-1) {
3894 trunc_on_virtual[j-1] = trunc_on_virtual[j] - 1;
3895 }
3896 }
3897 */
3898 s_insert = ddp->next->starts[1] - ddp->starts[1] - ddp->len;
3899 if (s_insert > (m_frst-m_last- 1)) {
3900 if (s_insert > (trunc_on_virtual[m_frst]-trunc_on_virtual[m_last]-1)) {
3901 /*---------------------------------------------------------------------------*/
3902 /* need to insert additional space, find insertion point first */
3903 /*---------------------------------------------------------------------------*/
3904 ipt = m_frst;
3905 for (j=m_frst;j>m_last;j--) {
3906 if (trunc_on_virtual[j-1] < trunc_on_virtual[j]-1) {
3907 ipt = j; break;
3908 }
3909 }
3910 /*---------------------------------------------------------------------------*/
3911 /* do the shift starting at the insertion point and moving to the end */
3912 /*---------------------------------------------------------------------------*/
3913 s_insert -= trunc_on_virtual[m_frst]-trunc_on_virtual[m_last]-1;
3914 ASSERT(s_insert > 0);
3915 for (j=ipt;j<bsp->length;j++) trunc_on_virtual[j]+=s_insert;
3916 }
3917 }
3918 ddp = ddp->next;
3919 }
3920 salpThis = salpThis->next;
3921 }
3922 /*---------------------------------------------------------------------------*/
3923 /* take care of unaligned regions on the master - distribute evenly on both */
3924 /* sides of the gap. If uneven number, one more residue shifted to N-terminus*/
3925 /*---------------------------------------------------------------------------*/
3926 i=-1;
3927 while (++i < bsp->length-1) {
3928 if (trunc_aligned[i]==0) {
3929 j=i; while(trunc_aligned[j]==0 && j < (bsp->length)) j++;
3930 j--;
3931 if (j > i && j < (bsp->length)) {
3932 midpt = i + (j-i)/2;
3933 for (k=j;k>midpt;k--) {
3934 if (trunc_on_virtual[k]<trunc_on_virtual[k+1]-1) {
3935 trunc_on_virtual[k] = trunc_on_virtual[k+1]-1;
3936 }
3937 }
3938 i = j;
3939 }
3940 }
3941 }
3942 /* for (j=0;j<bsp->length;j++) printf(" %d",trunc_on_virtual[j]);
3943 printf("\n"); */
3944 maxextlen = trunc_on_virtual[bsp->length-1]+1;
3945 for (i=1;i<bsp->length;i++) {
3946 if (trunc_on_virtual[i] <= trunc_on_virtual[i-1]) {
3947 CddSevError("Error determining alignment coordinate system");
3948 }
3949 }
3950 /*---------------------------------------------------------------------------*/
3951 /* allocate space for storing the extended alignment plus weights and columns*/
3952 /*---------------------------------------------------------------------------*/
3953 pCAW = MemNew(nalign * sizeof(CddAlignWeight));
3954 pCEACol = MemNew(maxextlen * sizeof(CddExtAlignCol));
3955 pCEACell = MemNew(nalign * sizeof(CddExtAlignCellPtr));
3956 for (j=0;j<nalign;j++) {
3957 pCEACell[j] = MemNew(maxextlen * sizeof(CddExtAlignCell));
3958 for (k=0;k<maxextlen;k++) {
3959 pCEACell[j][k].seqpos = -1;
3960 pCEACell[j][k].aatype = 26;
3961 }
3962 }
3963 /*---------------------------------------------------------------------------*/
3964 /* start to fill in the extended alignment data, row 0 is taken by old */
3965 /* master which is added as fully extended sequence (i.e. not truncated) */
3966 /*---------------------------------------------------------------------------*/
3967 i = 0;
3968 salpThis = salp;
3969 ddp = salpThis->segs;
3970 sipCopy = SeqIdDup(ddp->id); sipCopy->next = NULL;
3971 bspThis = CddRetrieveBioseqById(sipCopy,sep);
3972 SeqIdFree(sipCopy);
3973 pCAW[i].bsp = bspThis;
3974 spp = SeqPortNew(bspThis, 0, LAST_RESIDUE, Seq_strand_unknown,
3975 Seq_code_ncbistdaa);
3976 buffer = MemNew((bspThis->length)*sizeof(Uint1));
3977 for (index=0; index<bspThis->length; index++)
3978 buffer[index] = SeqPortGetResidue(spp);
3979 spp = SeqPortFree(spp);
3980 for (j=0;j<bsp->length;j++) {
3981 k = trunc_on_virtual[j];
3982 seqpos = j + offset;
3983 ASSERT (k>=0 && k<maxextlen);
3984 pCEACell[i][k].seqpos = seqpos;
3985 pCEACell[i][k].aatype = buffer[seqpos];
3986 if (pCEACell[i][k].aatype > (Uint1) 26) {
3987 pCEACell[i][k].aatype = 26;
3988 }
3989 }
3990 MemFree(buffer);
3991 /*---------------------------------------------------------------------------*/
3992 /* continue to fill in alignment data for the remaining sequences */
3993 /*---------------------------------------------------------------------------*/
3994 salpThis = salp;
3995 while (salpThis) {
3996 i++;
3997 ddp = salpThis->segs;
3998 sipThis = ddp->id->next;
3999 bspThis = CddRetrieveBioseqById(sipThis,sep);
4000 pCAW[i].bsp = bspThis;
4001 spp = SeqPortNew(bspThis, 0, LAST_RESIDUE, Seq_strand_unknown,
4002 Seq_code_ncbistdaa);
4003 buffer = MemNew((bspThis->length)*sizeof(Uint1));
4004 for (index=0; index<bspThis->length; index++)
4005 buffer[index] = SeqPortGetResidue(spp);
4006 spp = SeqPortFree(spp);
4007 while (ddp) {
4008 for (j = ddp->starts[0]; j<ddp->starts[0]+ddp->len;j++) {
4009 k = trunc_on_virtual[j - offset];
4010 ASSERT (k>=0 && k<maxextlen);
4011 pCEACell[i][k].seqpos = ddp->starts[1] + j - ddp->starts[0];
4012 pCEACell[i][k].aatype = buffer[pCEACell[i][k].seqpos];
4013 if (pCEACell[i][k].aatype > (Uint1) 26) {
4014 pCEACell[i][k].aatype = 26;
4015 }
4016 }
4017 if (ddp->next) {
4018 m_last = ddp->starts[0]+ddp->len-1-offset;
4019 m_frst = ddp->next->starts[0]-offset;
4020 s_last = ddp->starts[1]+ddp->len-1;
4021 s_frst = ddp->next->starts[1];
4022 if (s_frst - s_last > 1) { /* have unaligned residues */
4023 midpt = s_last + (s_frst - s_last)/2;
4024 for (j = s_last+1;j<=midpt;j++) {
4025 k = trunc_on_virtual[m_last] + j - s_last;
4026 ASSERT (k>=0 && k<maxextlen);
4027 pCEACell[i][k].seqpos = j;
4028 pCEACell[i][k].aatype = buffer[j];
4029 if (pCEACell[i][k].aatype > (Uint1) 26) {
4030 pCEACell[i][k].aatype = 26;
4031 }
4032 }
4033 for (j = s_frst-1; j>midpt; j--) {
4034 k = trunc_on_virtual[m_frst] - (s_frst - j);
4035 ASSERT (k>=0 && k<maxextlen);
4036 pCEACell[i][k].seqpos = j;
4037 pCEACell[i][k].aatype = buffer[j];
4038 if (pCEACell[i][k].aatype > (Uint1) 26) {
4039 pCEACell[i][k].aatype = 26;
4040 }
4041 }
4042 }
4043 }
4044 ddp = ddp->next;
4045 }
4046 MemFree(buffer);
4047 salpThis = salpThis->next;
4048 }
4049
4050 /*---------------------------------------------------------------------------*/
4051 /* now do statistics on the columns in this matrix to determine weights */
4052 /*---------------------------------------------------------------------------*/
4053 for (k=0;k<maxextlen;k++) {
4054 pCEACol[k].conpos = -1;
4055 pCEACol[k].occpos = 0;
4056 pCEACol[k].ntypes = 0;
4057 pCEACol[k].aatype = 0;
4058 pCEACol[k].w_occpos = 0.0;
4059 pCEACol[k].typecount = (Int4 *) MemNew(26*sizeof(Int4));
4060 pCEACol[k].wtypefreq = (FloatHi *) MemNew(26*sizeof(FloatHi));
4061 for (j=0;j<26;j++) {
4062 pCEACol[k].typecount[j] = 0;
4063 pCEACol[k].wtypefreq[j] = 0.0;
4064 }
4065 for (i=0;i<nalign;i++) {
4066 if (pCEACell[i][k].seqpos != -1 && pCEACell[i][k].aatype < 26) {
4067 pCEACol[k].typecount[(Int4)pCEACell[i][k].aatype]++;
4068 pCEACol[k].occpos++;
4069 if (pCEACol[k].occpos > maxpos) maxpos = pCEACol[k].occpos;
4070 }
4071 }
4072 for (j=0;j<26;j++) if (pCEACol[k].typecount[j]) pCEACol[k].ntypes++;
4073 }
4074 /*---------------------------------------------------------------------------*/
4075 /* use blocks with the highest position counts to determine weight */
4076 /*---------------------------------------------------------------------------*/
4077 for (i=0;i<nalign;i++) {
4078 pCAW[i].weight = 0.0;
4079 pCAW[i].nposaligned = 0;
4080 }
4081 for (k=0;k<maxextlen;k++) {
4082 if (pCEACol[k].occpos >= maxpos) {
4083 ContWeight++;
4084 for (i=0;i<nalign;i++) {
4085 if (pCEACell[i][k].seqpos != -1 && pCEACell[i][k].aatype < (Uint1) 26) {
4086 pCAW[i].nposaligned++;
4087 if (pCEACol[k].typecount[pCEACell[i][k].aatype] > 0) {
4088 pCAW[i].weight += 1.0 / (pCEACol[k].ntypes *
4089 pCEACol[k].typecount[pCEACell[i][k].aatype]);
4090 }
4091 }
4092 }
4093 }
4094 }
4095 /*---------------------------------------------------------------------------*/
4096 /* if sequence not covered by weight calculation, give it the expected weight*/
4097 /*---------------------------------------------------------------------------*/
4098 for (i=0;i<nalign;i++) {
4099 if (pCAW[i].weight <= 0.0) {
4100 pCAW[i].weight = (FloatHi) ContWeight * 1.0 / (FloatHi) nalign;
4101 }
4102 }
4103 SumWeight = 0.0;
4104 for(i=0;i<nalign;i++) {
4105 SumWeight += pCAW[i].weight;
4106 }
4107 for(i=0;i<nalign;i++) pCAW[i].weight /= SumWeight;
4108 /*---------------------------------------------------------------------------*/
4109 /* now determine columns with at least 50% representation and the consensus */
4110 /*---------------------------------------------------------------------------*/
4111 ConsensusLen = 0;
4112 for (k=0;k<maxextlen;k++) {
4113 for (i=0;i<nalign;i++) {
4114 if (pCEACell[i][k].seqpos != -1 && pCEACell[i][k].aatype < (Uint1) 26) {
4115 pCEACol[k].w_occpos += pCAW[i].weight;
4116 pCEACol[k].wtypefreq[pCEACell[i][k].aatype]+=pCAW[i].weight;
4117 }
4118 }
4119 if (pCEACol[k].w_occpos >= 0.5) {
4120 pCEACol[k].conpos = ConsensusLen;
4121 SumWeight = 0.0;
4122 ConsensusLen++;
4123 for (j=0;j<26;j++) {
4124 if (pCEACol[k].wtypefreq[j] > SumWeight) {
4125 SumWeight = pCEACol[k].wtypefreq[j];
4126 pCEACol[k].aatype = (Uint1) j;
4127 }
4128 }
4129 }
4130 }
4131 buffer = (Uint1 *) MemNew((ConsensusLen+1) * sizeof(Uint1));
4132 cbuffer = (Char *) MemNew((ConsensusLen+1) * sizeof(Char));
4133 for (k=0;k<maxextlen;k++) {
4134 if (pCEACol[k].conpos != -1) {
4135 buffer[pCEACol[k].conpos] = pCEACol[k].aatype;
4136 }
4137 }
4138 /*---------------------------------------------------------------------------*/
4139 /* translate sequence into ascii/character and make new bioseq */
4140 /*---------------------------------------------------------------------------*/
4141 smtp = SeqMapTableFind(Seq_code_ncbieaa, Seq_code_ncbistdaa);
4142 for (i=0;i<ConsensusLen;i++) {
4143 cbuffer[i] = (Char) SeqMapTableConvert(smtp, buffer[i]);
4144 }
4145 sepNew = FastaToSeqBuffEx(cbuffer,&outptr,FALSE,NULL,FALSE);
4146 *bspCons = (BioseqPtr) sepNew->data.ptrvalue;
4147 sipThis = (*bspCons)->id;
4148 oidp = ObjectIdNew();
4149 oidp->str = MemNew(10*sizeof(Char));
4150 strcpy(oidp->str,"consensus");
4151 oidp->str[9]='\0';
4152 MemFree(sipThis->data.ptrvalue);
4153 sipThis->data.ptrvalue = oidp;
4154 sipThis->choice = SEQID_LOCAL;
4155 sipThis->next = NULL;
4156
4157 /*---------------------------------------------------------------------------*/
4158 /* create the pairwise alignment old-master(not truncated) new-master */
4159 /*---------------------------------------------------------------------------*/
4160 bspThis = pCAW[0].bsp;
4161 pCDea = CddExpAlignNew();
4162 CddExpAlignAlloc(pCDea,bspThis->length);
4163 pCDea->ids = CddSeqIdDupPDBGI(bspThis->id);
4164 pCDea->ids->next = SeqIdDup((*bspCons)->id);
4165 pCDea->bIdAlloc = TRUE;
4166 for (k=0;k<maxextlen;k++) {
4167 if (pCEACol[k].conpos != -1) {
4168 pCDea->adata[pCEACell[0][k].seqpos]=pCEACol[k].conpos;
4169 }
4170 }
4171 /*---------------------------------------------------------------------------*/
4172 /* record block structure present in the input alignment to prevent merges */
4173 /*---------------------------------------------------------------------------*/
4174 ddp = salp->segs;
4175 while (ddp) {
4176 pCDea->starts[ddp->starts[0]] = 1;
4177 ddp = ddp->next;
4178 }
4179 *salpCons = CddExpAlignToSeqAlign(pCDea,NULL);
4180 pCDea = CddExpAlignFree(pCDea);
4181
4182 /*---------------------------------------------------------------------------*/
4183 /* create a multiple alignment salpNew where the consensus is the new master */
4184 /*---------------------------------------------------------------------------*/
4185 salpNew = NULL;
4186 for (i=0;i<nalign;i++) {
4187 pCDea = CddExpAlignNew();
4188 CddExpAlignAlloc(pCDea,ConsensusLen);
4189 pCDea->ids = SeqIdDup((*bspCons)->id);
4190 pCDea->ids->next = SeqIdDup(pCAW[i].bsp->id);
4191 pCDea->bIdAlloc = TRUE;
4192 for (k=0;k<maxextlen;k++) {
4193 if (pCEACol[k].conpos != -1) {
4194 pCDea->adata[pCEACol[k].conpos]=pCEACell[i][k].seqpos;
4195 }
4196 }
4197 salpThis = CddExpAlignToSeqAlign(pCDea,NULL);
4198 pCDea = CddExpAlignFree(pCDea);
4199 if (salpThis) {
4200 if (!salpNew) {
4201 salpNew = salpThis;
4202 } else {
4203 salpTail->next = salpThis;
4204 }
4205 salpTail = salpThis;
4206 }
4207 }
4208
4209 /*---------------------------------------------------------------------------*/
4210 /* deallocate memory for this function */
4211 /*---------------------------------------------------------------------------*/
4212 MemFree(cbuffer);
4213 MemFree(buffer);
4214 for (j=0;j<nalign;j++) MemFree(pCEACell[j]);
4215 MemFree(pCEACell);
4216 for (j=0;j<maxextlen;j++) {
4217 MemFree(pCEACol[j].typecount);
4218 MemFree(pCEACol[j].wtypefreq);
4219 }
4220 MemFree(pCEACol);
4221 MemFree(pCAW);
4222 MemFree(trunc_aligned);
4223 MemFree(trunc_on_virtual);
4224 return(salpNew);
4225 }
4226
4227
4228 /*---------------------------------------------------------------------------*/
4229 /*---------------------------------------------------------------------------*/
4230 /* first part of CddCposComp() */
4231 /* basically returns compactSearch, LetterHead, and posSearch */
4232 /*---------------------------------------------------------------------------*/
4233 /*---------------------------------------------------------------------------*/
CddCposCompPart1(SeqAlignPtr listOfSeqAligns,BioseqPtr bsp,compactSearchItems * compactSearch,ValNodePtr * LetterHead,posSearchItems * posSearch)4234 static void CddCposCompPart1(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp,
4235 compactSearchItems* compactSearch, ValNodePtr* LetterHead,
4236 posSearchItems* posSearch) {
4237 Int4 numalign, numseq; /*number of alignments and seqs */
4238 BLAST_ScoreBlkPtr sbp;
4239 SeqLocPtr private_slp = NULL;
4240 SeqPortPtr spp = NULL;
4241 Uint1Ptr query_seq, query_seq_start;
4242 Uint1 residue;
4243 Int4 index, a, KarlinReturn, array_size;
4244 Nlm_FloatHiPtr lambda, K, H;
4245 Int4Ptr open, extend;
4246 BLAST_ResFreqPtr stdrfp;
4247 Int2 iStatus;
4248 ValNodePtr error_return = NULL;
4249 ValNodePtr newLetter;
4250 SeqCodeTablePtr sctp;
4251 BioseqPtr bspFake, temp_bsp;
4252
4253 /* this is used for the DenseDiag calculations */
4254 bspFake = (BioseqPtr) CddBioseqCopy(NULL,bsp,0,bsp->length-1,0,FALSE);
4255
4256 if (listOfSeqAligns->segtype == SAS_DENDIAG) {
4257 numalign = CddCountDenDiagSeqAligns(listOfSeqAligns, &numseq);
4258 }
4259 else {
4260 numalign = CddCountSeqAligns(listOfSeqAligns, &numseq);
4261 }
4262 sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
4263 sbp->read_in_matrix = TRUE;
4264 sbp->protein_alphabet = TRUE;
4265 sbp->posMatrix = NULL;
4266 sbp->number_of_contexts = 1;
4267 iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
4268 ASSERT(iStatus == 0);
4269 compactSearch->qlength = bsp->length;
4270 compactSearch->alphabetSize = sbp->alphabet_size;
4271 compactSearch->matrix = sbp->matrix;
4272 compactSearch->gapped_calculation = TRUE;
4273 compactSearch->pseudoCountConst = 10;
4274 compactSearch->ethresh = 0.001;
4275
4276 /*---------------------------------------------------------------------------*/
4277 /* get the query sequence */
4278 /*---------------------------------------------------------------------------*/
4279 temp_bsp = bsp;
4280 if (listOfSeqAligns->segtype == SAS_DENDIAG) {
4281 bsp = bspFake; /* to make call compatible with CddDenDiagCposComputation */
4282 }
4283 private_slp = SeqLocIntNew(0, bsp->length-1 , Seq_strand_plus, SeqIdFindBest(bsp->id, SEQID_GI));
4284 spp = SeqPortNewByLoc(private_slp, Seq_code_ncbistdaa);
4285 SeqPortSet_do_virtual(spp, TRUE);
4286 query_seq_start = (Uint1Ptr) MemNew(((bsp->length)+2)*sizeof(Char));
4287 bsp = temp_bsp;
4288 query_seq_start[0] = NULLB;
4289 query_seq = query_seq_start+1;
4290 index=0;
4291 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
4292 if (IS_residue(residue)) {
4293 query_seq[index] = residue;
4294 index++;
4295 }
4296 }
4297 query_seq[index] = NULLB;
4298 spp = SeqPortFree(spp);
4299 private_slp = SeqLocFree(private_slp);
4300 compactSearch->query = query_seq;
4301
4302 BlastScoreBlkFill(sbp,(CharPtr) compactSearch->query,compactSearch->qlength,0);
4303
4304 sbp->kbp_gap_std[0] = BlastKarlinBlkCreate();
4305 KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_std[0],11,1,sbp->name,&error_return);
4306 if (1 == KarlinReturn) {
4307 BlastErrorPrint(error_return);
4308 }
4309 sbp->kbp_gap_psi[0] = BlastKarlinBlkCreate();
4310 KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_psi[0],11,1,sbp->name,&error_return);
4311 if (1 == KarlinReturn) {
4312 BlastErrorPrint(error_return);
4313 }
4314
4315 array_size = BlastKarlinGetMatrixValues(sbp->name,&open,&extend,&lambda,&K,&H,NULL);
4316 if (array_size > 0) {
4317 for (index = 0; index < array_size; index++) {
4318 if (open[index] == INT2_MAX && extend[index] == INT2_MAX) {
4319 sbp->kbp_ideal = BlastKarlinBlkCreate();
4320 sbp->kbp_ideal->Lambda = lambda[index];
4321 sbp->kbp_ideal->K = K[index];
4322 sbp->kbp_ideal->H = H[index];
4323 }
4324 }
4325 MemFree(open);
4326 MemFree(extend);
4327 MemFree(lambda);
4328 MemFree(K);
4329 MemFree(H);
4330 }
4331 if (sbp->kbp_ideal == NULL) {
4332 sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
4333 }
4334 compactSearch->lambda = sbp->kbp_gap_std[0]->Lambda;
4335 compactSearch->kbp_std = sbp->kbp_std;
4336 compactSearch->kbp_psi = sbp->kbp_psi;
4337 compactSearch->kbp_gap_std = sbp->kbp_gap_std;
4338 compactSearch->kbp_gap_psi = sbp->kbp_gap_psi;
4339 compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
4340 compactSearch->K_ideal = sbp->kbp_ideal->K;
4341
4342 stdrfp = BlastResFreqNew(sbp);
4343 BlastResFreqStdComp(sbp,stdrfp);
4344 compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
4345 for(a = 0; a < compactSearch->alphabetSize; a++)
4346 compactSearch->standardProb[a] = stdrfp->prob[a];
4347 stdrfp = BlastResFreqDestruct(stdrfp);
4348 posSearch->posInformation = NULL;
4349 /*---------------------------------------------------------------------------*/
4350 /* numseq is replaced with numalign (last argument) - each alignment is a */
4351 /* single segment and represents a separate sequence */
4352 /*---------------------------------------------------------------------------*/
4353 posAllocateMemory(posSearch, compactSearch->alphabetSize, compactSearch->qlength, numalign);
4354 if (listOfSeqAligns->segtype == SAS_DENDIAG) {
4355 CddfindDenseDiagThreshSequences(posSearch, listOfSeqAligns, numalign, numalign);
4356 }
4357 else {
4358 CddfindThreshSequences(posSearch, listOfSeqAligns, numalign, numalign);
4359 }
4360 sbp->kbp = sbp->kbp_psi;
4361 sbp->kbp_gap = sbp->kbp_gap_psi;
4362 CddposInitializeInformation(posSearch,sbp,compactSearch);
4363 if (listOfSeqAligns->segtype == SAS_DENDIAG) {
4364 CddposDenseDiagDemographics(posSearch, compactSearch, listOfSeqAligns);
4365 }
4366 else {
4367 CddposDemographics(posSearch, compactSearch, listOfSeqAligns);
4368 }
4369 posPurgeMatches(posSearch, compactSearch, NULL);
4370 posComputeExtents(posSearch, compactSearch);
4371 posComputeSequenceWeights(posSearch, compactSearch, 1.0);
4372 posCheckWeights(posSearch, compactSearch);
4373 posSearch->posFreqs = (Nlm_FloatHi **) posComputePseudoFreqs(posSearch, compactSearch, TRUE);
4374 CddposFreqsToMatrix(posSearch,compactSearch);
4375 posScaling(posSearch, compactSearch);
4376
4377 sctp = SeqCodeTableFind(Seq_code_ncbistdaa);
4378 for (a=0;a<compactSearch->alphabetSize;a++) {
4379 newLetter = ValNodeNew(NULL);
4380 newLetter->data.ptrvalue = MemNew(2*sizeof(Char));
4381 Nlm_StrNCpy(newLetter->data.ptrvalue,&(sctp->letters[a]),1);
4382 ValNodeLink(LetterHead,newLetter);
4383 }
4384 }
4385
4386
CddGetNewIndexForThreading(char InChar,char * Output)4387 Int4 LIBCALL CddGetNewIndexForThreading(char InChar, char* Output) {
4388 /*---------------------------------------------------------------------------*/
4389 /* get the index in Output where InChar is found. */
4390 /* return -1 if char is not found in the Output array. */
4391 /*---------------------------------------------------------------------------*/
4392 Int4 i, Index=0;
4393
4394 /* look through Output for InChar */
4395 for (i=0; i<StrLen(Output); i++) {
4396 if (Output[i] == InChar) {
4397 return(i);
4398 }
4399 }
4400 return(-1);
4401 }
4402
4403
CddCalcPSSM(SeqAlignPtr listOfSeqAligns,BioseqPtr bsp,double Weight,double ScaleFactor)4404 Seq_Mtf* LIBCALL CddCalcPSSM(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp,
4405 double Weight, double ScaleFactor) {
4406 /*---------------------------------------------------------------------------*/
4407 /* Generation of position-specific scoring matrices. */
4408 /* Put results in a Seq_Mtf structure in preparation for calling atd(). */
4409 /* allocate space for 2-d arrays pssm->ww and pssm->freq here. */
4410 /* pssm->ww[i][j] is pssm, where i is sequence index, j is residue index. */
4411 /* pssm->freqs[i][j] is table of residue frequencies. */
4412 /*---------------------------------------------------------------------------*/
4413 posSearchItems posSearch; /*holds position-specific info */
4414 compactSearchItems compactSearch;
4415 ValNodePtr LetterHead = NULL;
4416 Int4 i, j, jj;
4417 char* Input; /* order of residue-types from CD's */
4418 char* Output; /* order of residue-types needed for threading */
4419 Boolean* Coverage; /* for making sure all columns are filled */
4420 Int4 OutLen;
4421 Seq_Mtf* pssm;
4422
4423 /* allocate arrays for strings that determine input and output order */
4424 Input = (char*) MemNew(sizeof(InputOrder));
4425 Output = (char*) MemNew(sizeof(OutputOrder));
4426 StrCpy(Input, InputOrder);
4427 StrCpy(Output, OutputOrder);
4428
4429 OutLen = StrLen(Output);
4430 CddCposCompPart1(listOfSeqAligns, bsp, &compactSearch, &LetterHead, &posSearch);
4431
4432 /* construct new pssm */
4433 pssm = NewSeqMtf(compactSearch.qlength, OutLen);
4434 Coverage = (Boolean*) MemNew(sizeof(Boolean) * OutLen); /* set to 0 by MemNew */
4435
4436 /* copy results to pssm */
4437 for (j=0; j<compactSearch.alphabetSize;j++) {
4438 jj = CddGetNewIndexForThreading(Input[j], Output);
4439 if (jj != -1) {
4440 for (i=0; i<compactSearch.qlength;i++) {
4441 pssm->ww[i][jj] = ThrdRound(posSearch.posMatrix[i][j]*ScaleFactor*Weight);
4442 pssm->freqs[i][jj] = ThrdRound(posSearch.posFreqs[i][j]*ScaleFactor);
4443 Coverage[jj] = TRUE;
4444 }
4445 }
4446 }
4447
4448 /* make sure all columns are filled */
4449 for (i=0; i<OutLen; i++) {
4450 if (Coverage[i] == FALSE) {
4451 ASSERT(FALSE); /* should never get here */
4452 }
4453 }
4454
4455 /* clean up */
4456 MemFree(Input);
4457 MemFree(Output);
4458 MemFree(Coverage);
4459 CddposFreeMemory(&posSearch);
4460
4461 return(pssm);
4462 }
4463
4464
CddCposComp(SeqAlignPtr listOfSeqAligns,BioseqPtr bsp,CddPtr pcdd)4465 void LIBCALL CddCposComp(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp, CddPtr pcdd) {
4466 /*---------------------------------------------------------------------------*/
4467 /* Generation of position-specific scoring matrices for database scanning */
4468 /*---------------------------------------------------------------------------*/
4469 posSearchItems posSearch; /*holds position-specific info */
4470 compactSearchItems compactSearch;
4471 Int4 index, a, iNew;
4472 ValNodePtr error_return = NULL;
4473 ValNodePtr ColumnHead;
4474 ValNodePtr newRow, RowHead;
4475 ValNodePtr LetterHead = NULL;
4476 Char ckptFileName[PATH_MAX];
4477 Char cseqFileName[PATH_MAX];
4478 FILE *fp;
4479 CharPtr cAccession;
4480
4481 CddCposCompPart1(listOfSeqAligns, bsp, &compactSearch, &LetterHead, &posSearch);
4482
4483 /* put results in proper format... */
4484
4485 pcdd->posfreq = (MatrixPtr) MatrixNew();
4486 pcdd->posfreq->ncolumns = compactSearch.qlength;
4487 pcdd->posfreq->nrows = compactSearch.alphabetSize;
4488 pcdd->posfreq->scale_factor = POSFREQ_SCALE;
4489 pcdd->posfreq->row_labels = LetterHead;
4490 ColumnHead = NULL;
4491 for (index = 0; index<compactSearch.qlength;index++) {
4492 RowHead = NULL;
4493 for (a=0;a<compactSearch.alphabetSize;a++) {
4494 newRow = ValNodeNew(NULL);
4495 iNew = (Int4) (0.5 + (Nlm_FloatHi) pcdd->posfreq->scale_factor * posSearch.posFreqs[index][a]);
4496 newRow->data.intvalue = iNew;
4497 ValNodeLink(&(ColumnHead),newRow);
4498 }
4499 }
4500 pcdd->posfreq->columns = ColumnHead;
4501
4502 pcdd->scoremat = (MatrixPtr) MatrixNew();
4503 pcdd->scoremat->ncolumns = compactSearch.qlength;
4504 pcdd->scoremat->nrows = compactSearch.alphabetSize;
4505 pcdd->scoremat->scale_factor = 1;
4506 pcdd->scoremat->row_labels = LetterHead;
4507 ColumnHead = NULL;
4508 for (index = 0; index<compactSearch.qlength;index++) {
4509 RowHead = NULL;
4510 for (a=0;a<compactSearch.alphabetSize;a++) {
4511 newRow = ValNodeNew(NULL);
4512 iNew = (Int4) posSearch.posMatrix[index][a];
4513 newRow->data.intvalue = iNew;
4514 ValNodeLink(&(ColumnHead),newRow);
4515 }
4516 }
4517 pcdd->scoremat->columns = ColumnHead;
4518
4519 /*---------------------------------------------------------------------------*/
4520 /* Construct name for checkpoint file */
4521 /*---------------------------------------------------------------------------*/
4522 cAccession = CddGetAccession(pcdd);
4523 strcpy(ckptFileName,cAccession);
4524 strcat(ckptFileName,CKPTEXT);
4525
4526 CddposTakeCheckpoint(&posSearch, &compactSearch, ckptFileName, &error_return);
4527 strcpy(cseqFileName,cAccession);
4528 strcat(cseqFileName,CSEQEXT);
4529
4530 MemFree(cAccession);
4531
4532 fp = FileOpen(cseqFileName, "w");
4533 if (NULL == fp) {
4534 BlastConstructErrorMessage("CddCposComp", "Could not open fasta file", 1, &error_return);
4535 }
4536 BioseqToFasta(bsp,fp,FALSE);
4537 FileClose(fp);
4538 }
4539
4540 /*---------------------------------------------------------------------------*/
4541 /*---------------------------------------------------------------------------*/
4542 /* modified after PGPReadBlastOptions from blastpgp.c, as of 12/27/00 */
4543 /*---------------------------------------------------------------------------*/
4544 /*---------------------------------------------------------------------------*/
CddReadBlastOptions(BioseqPtr bsp,Int4 iPseudo,CharPtr matrix_name)4545 static PGPBlastOptionsPtr CddReadBlastOptions(BioseqPtr bsp, Int4 iPseudo, CharPtr matrix_name)
4546 {
4547 PGPBlastOptionsPtr bop;
4548 BLAST_OptionsBlkPtr options;
4549
4550 bop = MemNew(sizeof(PGPBlastOptions));
4551 bop->blast_database = StringSave("nr");
4552 bop->number_of_descriptions = 500;
4553 bop->number_of_alignments = 500;
4554 bop->believe_query = FALSE;
4555 options = BLASTOptionNew("blastp",TRUE); /* gapped Blast! */
4556 bop->options = options;
4557 bop->query_bsp = bsp;
4558 BLASTOptionSetGapParams(options, matrix_name, 0, 0);
4559 /* bop->is_xml_output = FALSE; */
4560 bop->align_options = 0;
4561 bop->print_options = 0;
4562 options->required_start = 0;
4563 options->required_end = -1;
4564 options->window_size = 40;
4565 options->dropoff_2nd_pass = 7.0;
4566 options->expect_value = 10.0;
4567 options->kappa_expect_value = options->expect_value;
4568 options->hitlist_size = 500;
4569 options->two_pass_method = FALSE;
4570 options->multiple_hits_only = TRUE;
4571 if (strcmp(matrix_name,"BLOSUM62") == 0) {
4572 options->gap_open = 11;
4573 options->gap_extend = 1;
4574 }
4575 options->decline_align = INT2_MAX;
4576 options->gap_x_dropoff = 15;
4577 options->gap_x_dropoff_final = 25;
4578 options->gap_trigger = 22.0;
4579 options->filter_string = StringSave("F");
4580 options->number_of_cpus = 1;
4581 options->isPatternSearch = FALSE;
4582 options->ethresh = 0.0; /* zeroed out, will not be used */
4583 options->pseudoCountConst = iPseudo;
4584 options->maxNumPasses = 1;
4585 options->hsp_range_max = 0;
4586 options->block_width = 20; /* Default value - previously '-L' parameter */
4587 options->tweak_parameters = TRUE;
4588 options->smith_waterman = FALSE;
4589 if (options->tweak_parameters > 1) {
4590 /* relax the cutoff evalue so that we don't loose too many
4591 * hits for which compositional adjustment improves the
4592 * evalue. */
4593 options->expect_value *= EVALUE_EXPAND;
4594 }
4595
4596 options = BLASTOptionValidate(options, "blastp");
4597 if (options == NULL) return NULL;
4598 bop->fake_bsp = bop->query_bsp;
4599 #if 0
4600 BioseqNew();
4601 bop->fake_bsp->descr = bop->query_bsp->descr;
4602 bop->fake_bsp->repr = bop->query_bsp->repr;
4603 bop->fake_bsp->mol = bop->query_bsp->mol;
4604 bop->fake_bsp->length = bop->query_bsp->length;
4605 bop->fake_bsp->seq_data_type = bop->query_bsp->seq_data_type;
4606 bop->fake_bsp->seq_data = bop->query_bsp->seq_data;
4607 bop->fake_bsp->id = bop->query_bsp->id;
4608 #endif
4609 /* bop->fake_bsp->seq_data = BSDup(bop->query_bsp->seq_data); */
4610 /* bop->fake_bsp->id = SeqIdDup(bop->query_bsp->id); */
4611 /* bug fix -- Lewis, Dave and Aron */
4612 /* bop->query_bsp->id = SeqIdSetFree(bop->query_bsp->id); */
4613 /* BLASTUpdateSeqIdInSeqInt(options->query_lcase_mask, bop->fake_bsp->id); */
4614 return bop;
4615 }
4616
4617
4618
4619 /*---------------------------------------------------------------------------*/
4620 /*---------------------------------------------------------------------------*/
4621 /* write out a matrix-file (ASCII formatted) for use with copymat/formatdb */
4622 /* routines from makemat, 4/9/01 */
4623 /*---------------------------------------------------------------------------*/
4624 /*---------------------------------------------------------------------------*/
4625
CddputMatrixKbp(FILE * checkFile,BLAST_KarlinBlkPtr kbp,Boolean scaling,Nlm_FloatHi scalingDown)4626 static void CddputMatrixKbp(FILE *checkFile, BLAST_KarlinBlkPtr kbp,
4627 Boolean scaling, Nlm_FloatHi scalingDown)
4628 {
4629 if (scaling)
4630 fprintf(checkFile,"%le\n",kbp->Lambda * scalingDown);
4631 else
4632 fprintf(checkFile,"%le\n",kbp->Lambda);
4633 fprintf(checkFile,"%le\n",kbp->K);
4634 fprintf(checkFile,"%le\n",kbp->logK);
4635 fprintf(checkFile,"%le\n",kbp->H);
4636 }
4637
CddputMatrixMatrix(FILE * checkFile,compactSearchItems * compactSearch,posSearchItems * posSearch,Boolean scaleScores)4638 static void CddputMatrixMatrix(FILE *checkFile,
4639 compactSearchItems *compactSearch,
4640 posSearchItems *posSearch,
4641 Boolean scaleScores)
4642 {
4643 Int4 i, j; /*loop indices*/
4644
4645 if (scaleScores) {
4646 for(i = 0; i < compactSearch->qlength; i++) {
4647 for(j = 0; j < compactSearch->alphabetSize; j++)
4648 fprintf(checkFile,"%ld ", (long) posSearch->posPrivateMatrix[i][j]);
4649 fprintf(checkFile,"\n");
4650 }
4651 }
4652 else {
4653 for(i = 0; i < compactSearch->qlength; i++) {
4654 for(j = 0; j < compactSearch->alphabetSize; j++)
4655 fprintf(checkFile,"%ld ", (long) posSearch->posMatrix[i][j]);
4656 fprintf(checkFile,"\n");
4657 }
4658 }
4659 }
4660
CddtakeMatrixCheckpoint(compactSearchItems * compactSearch,posSearchItems * posSearch,BLAST_ScoreBlkPtr sbp,Char * fileName,ValNodePtr * error_return,Boolean scaleScores,Nlm_FloatHi scalingFactor)4661 static Boolean CddtakeMatrixCheckpoint(compactSearchItems *compactSearch,
4662 posSearchItems *posSearch,
4663 BLAST_ScoreBlkPtr sbp,
4664 Char *fileName,ValNodePtr *error_return,
4665 Boolean scaleScores,
4666 Nlm_FloatHi scalingFactor)
4667 {
4668 FILE * checkFile; /* file in which to take the checkpoint */
4669 Int4 length; /* length of query sequence, and an index for it */
4670 Int4 i; /* indices to position and alphabet */
4671 Char localChar; /* temporary character */
4672
4673 checkFile = FileOpen(fileName, "w");
4674 if (NULL == checkFile) {
4675 BlastConstructErrorMessage("posTakeCheckpoint", "Could not open checkpoint file", 1, error_return);
4676 return(FALSE);
4677 }
4678 length = compactSearch->qlength;
4679 fprintf(checkFile,"%ld\n",(long) length);
4680 for(i = 0; i < length; i++) {
4681 localChar = getRes(compactSearch->query[i]);
4682 fprintf(checkFile,"%c",localChar);
4683 posSearch->posMatrix[i][Xchar] = Xscore;
4684 posSearch->posPrivateMatrix[i][Xchar] = Xscore * scalingFactor;
4685 }
4686 fprintf(checkFile,"\n");
4687 CddputMatrixKbp(checkFile, compactSearch->kbp_gap_std[0], scaleScores, 1/scalingFactor);
4688 CddputMatrixKbp(checkFile, compactSearch->kbp_gap_psi[0], scaleScores, 1/scalingFactor);
4689 CddputMatrixKbp(checkFile, sbp->kbp_ideal, scaleScores, 1/scalingFactor);
4690 CddputMatrixMatrix(checkFile, compactSearch, posSearch, scaleScores);
4691 FileClose(checkFile);
4692 return(TRUE);
4693 }
4694
4695 /*---------------------------------------------------------------------------*/
4696 /*---------------------------------------------------------------------------*/
4697 /*---------------------------------------------------------------------------*/
4698 /*---------------------------------------------------------------------------*/
BlastKarlinBlkCopy(BLAST_KarlinBlkPtr kbp_in,BLAST_KarlinBlkPtr kbp_out)4699 static void BlastKarlinBlkCopy(BLAST_KarlinBlkPtr kbp_in,BLAST_KarlinBlkPtr kbp_out)
4700 {
4701 kbp_out->Lambda = kbp_in->Lambda;
4702 kbp_out->K = kbp_in->K;
4703 kbp_out->logK = kbp_in->logK;
4704 kbp_out->H = kbp_in->H;
4705 /* kbp_out->Lambda_real = kbp_in->Lambda_real;
4706 kbp_out->K_real = kbp_in->K_real;
4707 kbp_out->logK_real = kbp_in->logK_real;
4708 kbp_out->H_real = kbp_in->H_real;
4709 kbp_out->q_frame = kbp_in->q_frame;
4710 kbp_out->s_frame = kbp_in->s_frame; */
4711 kbp_out->paramC = kbp_in->paramC;
4712 }
4713
4714 /*---------------------------------------------------------------------------*/
4715 /*---------------------------------------------------------------------------*/
4716 /* version of the PSSM calculation routine which does not return Karlin- */
4717 /* Altschul parameters */
4718 /*---------------------------------------------------------------------------*/
4719 /*---------------------------------------------------------------------------*/
CddDenDiagCposComp2(BioseqPtr bspFake,Int4 iPseudo,SeqAlignPtr salp,CddPtr pcdd,BioseqPtr bspOut,double Weight,double ScaleFactor,CharPtr matrix_name)4720 Seq_Mtf * LIBCALL CddDenDiagCposComp2(BioseqPtr bspFake, Int4 iPseudo,
4721 SeqAlignPtr salp, CddPtr pcdd,
4722 BioseqPtr bspOut, double Weight,
4723 double ScaleFactor, CharPtr matrix_name)
4724 {
4725 return CddDenDiagCposComp2KBP(bspFake, iPseudo, salp, pcdd, bspOut,
4726 Weight, ScaleFactor, matrix_name, NULL);
4727 }
4728
4729 /*---------------------------------------------------------------------------*/
4730 /*---------------------------------------------------------------------------*/
4731 /* updated version of the PSSM calculation routine, modified from blastpgp.c */
4732 /* as of 12/27/2000 */
4733 /*---------------------------------------------------------------------------*/
4734 /*---------------------------------------------------------------------------*/
CddDenDiagCposComp2KBP(BioseqPtr bspFake,Int4 iPseudo,SeqAlignPtr salp,CddPtr pcdd,BioseqPtr bspOut,double Weight,double ScaleFactor,CharPtr matrix_name,BLAST_KarlinBlkPtr kbp)4735 Seq_Mtf * LIBCALL CddDenDiagCposComp2KBP(BioseqPtr bspFake, Int4 iPseudo,
4736 SeqAlignPtr salp, CddPtr pcdd,
4737 BioseqPtr bspOut, double Weight,
4738 double ScaleFactor, CharPtr matrix_name,
4739 BLAST_KarlinBlkPtr kbp)
4740 {
4741 PGPBlastOptionsPtr bop;
4742 BlastSearchBlkPtr search;
4743 posSearchItems *posSearch = NULL;
4744 compactSearchItems *compactSearch = NULL;
4745 ValNodePtr ColumnHead, LetterHead, newRow, RowHead, newLetter;
4746 SeqCodeTablePtr sctp;
4747 Int4 numSeqs, numASeqs, alignLength, index, a, iNew, i;
4748 /*
4749 Int4 scale_factor;
4750 */
4751 Int4 j, OutLen, jj;
4752 Char ckptFileName[PATH_MAX];
4753 Char cseqFileName[PATH_MAX];
4754 /*
4755 Char mtrxFileName[PATH_MAX];
4756 */
4757 FILE *fp;
4758 Boolean bHasConsensus;
4759 Boolean bWriteOut = FALSE;
4760 Nlm_FloatHi *AInf, SumAInf = 0.0;
4761 char* Input; /* order of residue-types from CD's */
4762 char* Output; /*order of res-types needed for threading*/
4763 Boolean* Coverage; /* for making sure all columns are filled*/
4764 Seq_Mtf* pssm;
4765 CharPtr cAccession;
4766 Boolean scalingOkay = TRUE;
4767
4768 cAccession = CddGetAccession(pcdd);
4769 if (pcdd) {
4770 bHasConsensus = CddHasConsensus(pcdd);
4771 bWriteOut = CddHasDescription(pcdd,"Write out matrix");
4772 } else { /* determine from seqalign */
4773 bHasConsensus = SeqAlignHasConsensus(salp);
4774 }
4775 if (iPseudo <= 0) { /* need to determine first */
4776 AInf = SeqAlignInform(salp, bspFake, bHasConsensus, 0);
4777 for (i=0;i<bspFake->length;i++) SumAInf += AInf[i];
4778 if (SumAInf > 84 ) iPseudo = 10; /* purely empirical */
4779 else if (SumAInf > 55 ) iPseudo = 7;
4780 else if (SumAInf > 43 ) iPseudo = 5;
4781 else if (SumAInf > 41.5) iPseudo = 4;
4782 else if (SumAInf > 40 ) iPseudo = 3;
4783 else if (SumAInf > 39 ) iPseudo = 2;
4784 else iPseudo = 1;
4785 MemFree(AInf);
4786 if (pcdd && bWriteOut) printf("%s AInf:%6.1f PseudoCt: %d\n",cAccession,SumAInf, iPseudo);
4787 }
4788 if (NULL == matrix_name) {
4789 static CharPtr defaultMatrix = "BLOSUM62";
4790 matrix_name = defaultMatrix;
4791 }
4792 bop = (PGPBlastOptionsPtr) CddReadBlastOptions(bspFake, iPseudo, matrix_name);
4793 if (pcdd) { /* set up search with nr database size for CD dumping */
4794 search = BLASTSetUpSearchWithReadDb(bop->fake_bsp, "blastp",
4795 bop->query_bsp->length,
4796 bop->blast_database,
4797 bop->options, NULL);
4798 } else { /* or avoid using nr when calculating PSSM for threading */
4799 search = BLASTSetUpSearch(bop->fake_bsp, "blastp",
4800 bop->query_bsp->length,
4801 0, NULL, bop->options, NULL);
4802 }
4803 posSearch = NULL;
4804 compactSearch = NULL;
4805 posSearch = (posSearchItems *)MemNew(sizeof(posSearchItems));
4806 compactSearch = compactSearchNew(compactSearch);
4807 copySearchItems(compactSearch, search, bop->options->matrix);
4808 posInitializeInformation(posSearch,search);
4809
4810 /*---------------------------------------------------------------------------*/
4811 /* section modelled after BposComputation, posit.c as of 12/28/2000 */
4812 /*---------------------------------------------------------------------------*/
4813 search->posConverged = FALSE;
4814 numSeqs = CddCountDenDiagSeqAligns(salp, &numASeqs) + 1;
4815 alignLength = bspFake->length;
4816 posAllocateMemory(posSearch, compactSearch->alphabetSize,
4817 compactSearch->qlength, numSeqs);
4818 CddposProcessAlignment(posSearch,compactSearch,salp,numSeqs,alignLength,bspFake);
4819 posSearch->posNumSequences = numSeqs;
4820 posPurgeMatches(posSearch, compactSearch, NULL);
4821 if (bHasConsensus) posCancel(posSearch,compactSearch,0,0,0,compactSearch->qlength);
4822 posComputeExtents(posSearch, compactSearch);
4823 posComputeSequenceWeights(posSearch, compactSearch, 1.0);
4824 posCheckWeights(posSearch, compactSearch);
4825 /* printf("Calling posComputePseudoFreqs with %s\n",compactSearch->standardMatrixName);
4826 fflush(stdout); */
4827 posSearch->posFreqs = posComputePseudoFreqs(posSearch,compactSearch,TRUE);
4828 if (NULL == search->sbp->posFreqs)
4829 search->sbp->posFreqs = allocatePosFreqs(compactSearch->qlength, compactSearch->alphabetSize);
4830 copyPosFreqs(posSearch->posFreqs,search->sbp->posFreqs, compactSearch->qlength, compactSearch->alphabetSize);
4831
4832 /*---------------------------------------------------------------------------*/
4833 /* Construct name for checkpoint file and write out (if in a CD context) */
4834 /*---------------------------------------------------------------------------*/
4835 if (pcdd && bWriteOut) {
4836 strcpy(ckptFileName,cAccession);
4837 strcat(ckptFileName,CKPTEXT);
4838 if (NULL != ckptFileName)
4839 posTakeCheckpoint(posSearch, compactSearch, ckptFileName, NULL);
4840 }
4841
4842 posFreqsToMatrix(posSearch,compactSearch);
4843 /*---------------------------------------------------------------------------*/
4844 /* kludge to set the matrix column for matches to * (stop codons) to the */
4845 /* most negative value used, so that matrix-file corresponds to what makemat */
4846 /* produces. */
4847 /*---------------------------------------------------------------------------*/
4848 ASSERT(compactSearch->alphabetSize == 26); /* sanity check */
4849 /*
4850 for (c=0;c<compactSearch->qlength;c++) {
4851 posSearch->posPrivateMatrix[c][compactSearch->alphabetSize-1] = BLAST_SCORE_MIN;
4852 }
4853 */
4854 /* posScaling(posSearch, compactSearch); */
4855 scalingOkay = impalaScaling(posSearch, compactSearch, 1.0, TRUE);
4856 if (!scalingOkay) goto cleanup;
4857
4858 if (pcdd) {
4859 /*---------------------------------------------------------------------------*/
4860 /* also, write out the PSSM as an ASCII-formatted MATRIX file, for use with */
4861 /* copymat */
4862 /*---------------------------------------------------------------------------*/
4863 /* strcpy(mtrxFileName,cAccession);
4864 strcat(mtrxFileName,MTRXEXT);
4865 CddtakeMatrixCheckpoint(compactSearch,posSearch,search->sbp,mtrxFileName,
4866 NULL,FALSE,1.0);
4867 */
4868 /*---------------------------------------------------------------------------*/
4869 /* in compliance with makemat, set the score for the X-column */
4870 /*---------------------------------------------------------------------------*/
4871 for (index = 0; index<compactSearch->qlength;index++) {
4872 posSearch->posMatrix[index][21] = Xscore;
4873 }
4874 sctp = SeqCodeTableFind(Seq_code_ncbistdaa);
4875 LetterHead = NULL;
4876 for (a=0;a<compactSearch->alphabetSize;a++) {
4877 newLetter = ValNodeNew(NULL);
4878 newLetter->data.ptrvalue = MemNew(2*sizeof(Char));
4879 Nlm_StrNCpy(newLetter->data.ptrvalue,&(sctp->letters[a]),1);
4880 ValNodeLink(&(LetterHead),newLetter);
4881 }
4882
4883 pcdd->posfreq = (MatrixPtr) MatrixNew();
4884 pcdd->posfreq->ncolumns = compactSearch->qlength;
4885 pcdd->posfreq->nrows = compactSearch->alphabetSize;
4886 pcdd->posfreq->scale_factor = POSFREQ_SCALE;
4887 pcdd->posfreq->row_labels = LetterHead;
4888 ColumnHead = NULL;
4889 for (index = 0; index<compactSearch->qlength;index++) {
4890 RowHead = NULL;
4891 for (a=0;a<compactSearch->alphabetSize;a++) {
4892 newRow = ValNodeNew(NULL);
4893 iNew = (Int4) (0.5 + (Nlm_FloatHi) pcdd->posfreq->scale_factor * posSearch->posFreqs[index][a]);
4894 newRow->data.intvalue = iNew;
4895 ValNodeLink(&(ColumnHead),newRow);
4896 }
4897 }
4898 pcdd->posfreq->columns = ColumnHead;
4899 pcdd->scoremat = (MatrixPtr) MatrixNew();
4900 pcdd->scoremat->ncolumns = compactSearch->qlength;
4901 pcdd->scoremat->nrows = compactSearch->alphabetSize;
4902 pcdd->scoremat->scale_factor = 1;
4903 pcdd->scoremat->row_labels = LetterHead;
4904 ColumnHead = NULL;
4905 for (index = 0; index<compactSearch->qlength;index++) {
4906 RowHead = NULL;
4907 for (a=0;a<compactSearch->alphabetSize;a++) {
4908 newRow = ValNodeNew(NULL);
4909 iNew = (Int4) posSearch->posMatrix[index][a];
4910 newRow->data.intvalue = iNew;
4911 ValNodeLink(&(ColumnHead),newRow);
4912 }
4913 }
4914 pcdd->scoremat->columns = ColumnHead;
4915
4916 } else { /* if pcdd is empty, fill in the Seq_Mtf data structure */
4917 Input = (char*) MemNew(sizeof(InputOrder));
4918 Output = (char*) MemNew(sizeof(OutputOrder));
4919 StrCpy(Input, InputOrder);
4920 StrCpy(Output, OutputOrder);
4921 OutLen = StrLen(Output);
4922 pssm = NewSeqMtf(compactSearch->qlength, OutLen);
4923 Coverage = (Boolean*) MemNew(sizeof(Boolean) * OutLen); /* set to 0 by MemNew */
4924 for (j=0; j<compactSearch->alphabetSize;j++) {
4925 jj = CddGetNewIndexForThreading(Input[j], Output);
4926 if (jj != -1) {
4927 for (i=0; i<compactSearch->qlength;i++) {
4928 pssm->ww[i][jj] = ThrdRound(posSearch->posMatrix[i][j]*ScaleFactor*Weight);
4929 pssm->freqs[i][jj] = ThrdRound(posSearch->posFreqs[i][j]*ScaleFactor);
4930 Coverage[jj] = TRUE;
4931 }
4932 }
4933 }
4934 for (i=0; i<OutLen; i++) {
4935 if (Coverage[i] == FALSE) {
4936 ASSERT(FALSE); /* should never get here */
4937 }
4938 }
4939 MemFree(Input);
4940 MemFree(Output);
4941 MemFree(Coverage);
4942 }
4943
4944 if (bspOut && pcdd && bWriteOut) {
4945 strcpy(cseqFileName,cAccession);
4946 strcat(cseqFileName,CSEQEXT);
4947 fp = FileOpen(cseqFileName, "w");
4948 if (NULL == fp) {
4949 BlastConstructErrorMessage("CddDenDiagCposComp2", "Could not open fasta file", 1, NULL);
4950 }
4951 BioseqToFasta(bspOut,fp,FALSE);
4952 FileClose(fp);
4953 }
4954
4955 /*---------------------------------------------------------------------------*/
4956 /* clean up */
4957 /*---------------------------------------------------------------------------*/
4958 cleanup:
4959 MemFree(cAccession);
4960 posSearch->NumSequences = posSearch->posNumSequences;
4961 posSearch->QuerySize = alignLength;
4962 CddposFreeMemory(posSearch);
4963 MemFree(posSearch);
4964 if (kbp) {
4965 BlastKarlinBlkCopy(*(compactSearch->kbp_gap_psi),kbp);
4966 }
4967 CddposFreeMemory2(compactSearch);
4968 MemFree(compactSearch);
4969 search = BlastSearchBlkDestruct(search);
4970 bop->options = BLASTOptionDelete(bop->options);
4971 MemFree(bop->blast_database);
4972 MemFree(bop);
4973 if (NULL == pcdd && scalingOkay) { /* return pssm if from within threader */
4974 return(pssm);
4975 }
4976 if (NULL != pcdd && !scalingOkay) { /* fields are empty if scaling failed */
4977 pcdd->posfreq = NULL;
4978 pcdd->scoremat = NULL;
4979 }
4980 return NULL;
4981 }
4982
4983 /*---------------------------------------------------------------------------*/
4984 /*---------------------------------------------------------------------------*/
4985 /* Free a Cdd data structure - works on a single CD only (not linked lists) */
4986 /*---------------------------------------------------------------------------*/
4987 /*---------------------------------------------------------------------------*/
CddFreeCarefully(CddPtr pcdd)4988 CddPtr LIBCALL CddFreeCarefully(CddPtr pcdd)
4989 {
4990 BioseqPtr bsp;
4991 BioseqSetPtr bssp;
4992 ScorePtr nxtscore, thisscore;
4993 SeqAnnotPtr sap, sapThis;
4994 SeqEntryPtr sep, sepThis;
4995 ValNodePtr vnp, vnpThis;
4996
4997 if(pcdd == NULL) return NULL;
4998 if (NULL != pcdd->name) pcdd->name = MemFree(pcdd->name);
4999 if (NULL != pcdd->id) pcdd->id = (CddIdSetPtr) CddIdSetFree(pcdd->id);
5000
5001 sap = pcdd->seqannot;
5002 while (sap) {
5003 sapThis = sap->next;
5004 sap = SeqAnnotFree(sap);
5005 sap = sapThis;
5006 }
5007
5008 vnp = pcdd->description;
5009 while (vnp) {
5010 vnpThis = vnp->next;
5011 switch (vnp->choice) {
5012 case CddDescr_othername:
5013 vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
5014 break;
5015 case CddDescr_category:
5016 vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
5017 break;
5018 case CddDescr_comment:
5019 vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
5020 break;
5021 case CddDescr_reference:
5022 vnp->data.ptrvalue = PubFree(vnp->data.ptrvalue);
5023 break;
5024 case CddDescr_create_date:
5025 vnp->data.ptrvalue = DateFree(vnp->data.ptrvalue);
5026 break;
5027 case CddDescr_tax_source:
5028 vnp->data.ptrvalue = OrgRefFree(vnp->data.ptrvalue);
5029 break;
5030 case CddDescr_source:
5031 vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
5032 break;
5033 default:
5034 break;
5035 }
5036 vnp = MemFree(vnp);
5037 vnp = vnpThis;
5038 }
5039
5040 if (NULL != pcdd->features) {
5041 pcdd->features = BiostrucAnnotSetFree(pcdd->features);
5042 }
5043
5044 if (NULL != pcdd->sequences) {
5045 bssp = pcdd->sequences->data.ptrvalue;
5046 if (NULL != bssp) {
5047 sep = bssp->seq_set;
5048 while (sep) {
5049 sepThis = sep->next;
5050 if (sep->choice == 1) {
5051 bsp = sep->data.ptrvalue;
5052 if (bsp->id->choice != SEQID_LOCAL)BioseqUnlock(bsp);
5053 BioseqFree(bsp);
5054 }
5055 MemFree(sep);
5056 sep = sepThis;
5057 }
5058 bssp = MemFree(bssp);
5059 }
5060 pcdd->sequences = ValNodeFree(pcdd->sequences);
5061 }
5062
5063 if(NULL != pcdd->profile_range) {
5064 pcdd->profile_range = SeqIntFree(pcdd->profile_range);
5065 }
5066 if (NULL != pcdd->trunc_master) {
5067 pcdd->trunc_master = BioseqFree(pcdd->trunc_master);
5068 }
5069 if (NULL != pcdd->posfreq) {
5070 vnp = pcdd->posfreq->row_labels; if (NULL != vnp) ValNodeFree(vnp);
5071 vnp = pcdd->posfreq->columns; if (NULL != vnp) ValNodeFree(vnp);
5072 pcdd->posfreq = MemFree(pcdd->posfreq);
5073 }
5074 if (NULL != pcdd->scoremat) {
5075 vnp = pcdd->scoremat->columns; if (NULL != vnp) ValNodeFree(vnp);
5076 pcdd->scoremat = MemFree(pcdd->scoremat);
5077 }
5078 if (NULL != pcdd->distance) {
5079 nxtscore = pcdd->distance->scores;
5080 while (nxtscore) {
5081 if (NULL != nxtscore->id) ObjectIdFree(nxtscore->id);
5082 thisscore = nxtscore;
5083 nxtscore = nxtscore->next;
5084 thisscore = MemFree(thisscore);
5085 }
5086 pcdd->distance = MemFree(pcdd->distance);
5087 }
5088 return MemFree(pcdd);
5089 }
5090
5091 /*---------------------------------------------------------------------------*/
5092 /* Translate a SeqLoc into an Integer list of residue numbers */
5093 /*---------------------------------------------------------------------------*/
CddGetFeatLocList(SeqLocPtr location,Int4 * nres)5094 Int4Ptr LIBCALL CddGetFeatLocList(SeqLocPtr location, Int4 *nres)
5095 {
5096 SeqLocPtr slp;
5097 SeqIntPtr sintp;
5098 PackSeqPntPtr pspp;
5099 SeqPntPtr spp;
5100 Int4Ptr presnum = NULL;
5101 Int4 i;
5102 Int4 nSubLoc;
5103 Int4Ptr pSubLoc;
5104
5105 presnum = MemNew(100 * sizeof(Int4));
5106 *nres = 0;
5107 if (!location) return NULL;
5108 switch (location->choice) {
5109 case SEQLOC_NULL:
5110 case SEQLOC_FEAT:
5111 case SEQLOC_BOND:
5112 case SEQLOC_EMPTY:
5113 case SEQLOC_WHOLE:
5114 break;
5115 case SEQLOC_INT:
5116 sintp = (SeqIntPtr) location->data.ptrvalue;
5117 for (i=sintp->from;i<=sintp->to;i++) {
5118 presnum[*nres] = i;
5119 (*nres)++; if (*nres >= 100) presnum = Nlm_MemMore(presnum,((*nres)+1)*sizeof(Int4));
5120 }
5121 break;
5122 case SEQLOC_PNT:
5123 spp = location->data.ptrvalue;
5124 presnum[*nres] = spp->point;
5125 (*nres)++; if (*nres >= 100) presnum = Nlm_MemMore(presnum,((*nres)+1)*sizeof(Int4));
5126 break;
5127 case SEQLOC_PACKED_PNT:
5128 pspp = location->data.ptrvalue;
5129 while (pspp) {
5130 for (i=0;i<pspp->used;i++) {
5131 presnum[*nres] = pspp->pnts[i];
5132 (*nres)++; if (*nres >= 100) presnum = Nlm_MemMore(presnum,((*nres)+1)*sizeof(Int4));
5133 }
5134 pspp = pspp->next;
5135 }
5136 break;
5137 case SEQLOC_PACKED_INT:
5138 case SEQLOC_MIX:
5139 case SEQLOC_EQUIV:
5140 slp = (SeqLocPtr)location->data.ptrvalue;
5141 while (slp) {
5142 pSubLoc = CddGetFeatLocList(slp, &nSubLoc);
5143 for (i=0;i<nSubLoc;i++) {
5144 presnum[*nres] = pSubLoc[i];
5145 (*nres)++; if (*nres >= 100) presnum = Nlm_MemMore(presnum,((*nres)+1)*sizeof(Int4));
5146 }
5147 slp = slp->next;
5148 }
5149 default:
5150 break;
5151 }
5152 return(presnum);
5153 }
5154
5155 /*---------------------------------------------------------------------------*/
5156 /* check for violations of an interval defined on the aligned master with */
5157 /* respect to aligned residues. Returns -1 if everything is OK, returns */
5158 /* the first location on the master (counting from 0) if a violation is */
5159 /* detected, returns INT4_MAX if negative interval numbers are encountered */
5160 /*---------------------------------------------------------------------------*/
CddSeqLocInExpAlign(SeqLocPtr location,CddExpAlignPtr eap)5161 static Int4 CddSeqLocInExpAlign(SeqLocPtr location, CddExpAlignPtr eap)
5162 {
5163 SeqIntPtr sintp;
5164 SeqPntPtr spp;
5165 PackSeqPntPtr pspp;
5166 SeqLocPtr slp;
5167 Int4 i;
5168
5169 if (!location) return(-1);
5170 switch (location->choice) {
5171 case SEQLOC_NULL:
5172 case SEQLOC_FEAT:
5173 case SEQLOC_BOND:
5174 case SEQLOC_EMPTY:
5175 case SEQLOC_WHOLE:
5176 break;
5177 case SEQLOC_INT:
5178 sintp = (SeqIntPtr) location->data.ptrvalue;
5179 if (sintp->from < 0 || sintp->to < 0) return(INT4_MAX);
5180 for (i=sintp->from;i<=sintp->to;i++) {
5181 if (eap->adata[i] < 0) return(i);
5182 }
5183 break;
5184 case SEQLOC_PNT:
5185 spp = location->data.ptrvalue;
5186 if (spp->point < 0) return(INT4_MAX);
5187 if (eap->adata[spp->point] < 0) return(spp->point);
5188 break;
5189 case SEQLOC_PACKED_PNT:
5190 pspp = location->data.ptrvalue;
5191 while (pspp) {
5192 for (i=0;i<pspp->used;i++) {
5193 if (pspp->pnts[i] < 0) return (INT4_MAX);
5194 if (eap->adata[pspp->pnts[i]] < 0) return(pspp->pnts[i]);
5195 }
5196 pspp = pspp->next;
5197 }
5198 break;
5199 case SEQLOC_PACKED_INT:
5200 case SEQLOC_MIX:
5201 case SEQLOC_EQUIV:
5202 slp = (SeqLocPtr)location->data.ptrvalue;
5203 while (slp) {
5204 i = CddSeqLocInExpAlign(slp,eap);
5205 if (i >= 0) return(i);
5206 slp = slp->next;
5207 }
5208 default:
5209 break;
5210 }
5211 return(-1);
5212 }
5213
5214
5215 /*---------------------------------------------------------------------------*/
5216 /*---------------------------------------------------------------------------*/
CddRelocateSeqLoc(SeqLocPtr location,SeqIdPtr sip,Int4 * ali)5217 static void CddRelocateSeqLoc(SeqLocPtr location, SeqIdPtr sip, Int4 *ali)
5218 {
5219 SeqIntPtr sintp;
5220 SeqPntPtr spp;
5221 PackSeqPntPtr pspp;
5222 SeqLocPtr slp;
5223 Int4 i;
5224
5225 if (!location) return;
5226 switch (location->choice) {
5227 case SEQLOC_NULL:
5228 case SEQLOC_FEAT:
5229 case SEQLOC_BOND:
5230 break;
5231 case SEQLOC_EMPTY:
5232 case SEQLOC_WHOLE:
5233 SeqIdFree(location->data.ptrvalue);
5234 location->data.ptrvalue = (SeqIdPtr) SeqIdDup(sip);
5235 break;
5236 case SEQLOC_INT:
5237 sintp = (SeqIntPtr) location->data.ptrvalue;
5238 SeqIdFree(sintp->id);
5239 sintp->id = SeqIdDup(sip);
5240 sintp->from = ali[sintp->from];
5241 sintp->to = ali[sintp->to];
5242 if (sintp->from < 0 || sintp->to < 0) CddSevError("Invalid SeqLoc Interval!");
5243 break;
5244 case SEQLOC_PNT:
5245 spp = location->data.ptrvalue;
5246 SeqIdFree(spp->id);
5247 spp->id = (SeqIdPtr) SeqIdDup(sip);
5248 spp->point = ali[spp->point];
5249 if (spp->point < 0) CddSevError("Invalid SeqLoc Interval!");
5250 break;
5251 case SEQLOC_PACKED_PNT:
5252 pspp = location->data.ptrvalue;
5253 while (pspp) {
5254 SeqIdFree(pspp->id);
5255 pspp->id = (SeqIdPtr) SeqIdDup(sip);
5256 for (i=0;i<pspp->used;i++) {
5257 pspp->pnts[i] = ali[pspp->pnts[i]];
5258 if (pspp->pnts[i] < 0) CddSevError("Invalid SeqLoc Interval!");
5259 }
5260 pspp = pspp->next;
5261 }
5262 break;
5263 case SEQLOC_PACKED_INT:
5264 case SEQLOC_MIX:
5265 case SEQLOC_EQUIV:
5266 slp = (SeqLocPtr)location->data.ptrvalue;
5267 while (slp) {
5268 CddRelocateSeqLoc(slp,sip,ali);
5269 slp = slp->next;
5270 }
5271 default:
5272 break;
5273 }
5274 }
5275
CddFindSeqIdInAlignAnnot(AlignAnnotPtr oldannot)5276 static SeqIdPtr CddFindSeqIdInAlignAnnot(AlignAnnotPtr oldannot)
5277 {
5278 SeqLocPtr location;
5279 location = oldannot->location;
5280 return SeqLocId(location);
5281 }
5282
5283
5284 /*---------------------------------------------------------------------------*/
5285 /*---------------------------------------------------------------------------*/
5286 /* transfer alignment annotations from the current master to another */
5287 /* sequence (before reindexing the alignment to this new master) */
5288 /*---------------------------------------------------------------------------*/
5289 /*---------------------------------------------------------------------------*/
CddTransferAlignAnnot(AlignAnnotPtr oldannot,SeqIdPtr newMaster,SeqAlignPtr salp,BioseqSetPtr bssp)5290 void LIBCALL CddTransferAlignAnnot(AlignAnnotPtr oldannot,
5291 SeqIdPtr newMaster,
5292 SeqAlignPtr salp,
5293 BioseqSetPtr bssp)
5294 {
5295 CddExpAlignPtr pCDeaO= NULL, pCDeaN = NULL, pCDeaON = NULL;
5296 SeqLocPtr slp;
5297 SeqIdPtr oldMaster, sip, sipTmp;
5298 Boolean bOldIsTop = FALSE;
5299 Boolean bNewIsTop = FALSE;
5300 DenseDiagPtr ddp;
5301 SeqAlignPtr salpThis;
5302 BioseqPtr bsp;
5303 SeqEntryPtr sep;
5304 AlignAnnotPtr aap;
5305
5306 slp = (SeqLocPtr) oldannot->location;
5307 oldMaster = CddFindSeqIdInAlignAnnot(oldannot);
5308 if (oldMaster->choice == SEQID_GI) {
5309 sep = bssp->seq_set; while (sep) {
5310 if (sep->choice == 1) {
5311 bsp = sep->data.ptrvalue;
5312 if (CddSameSip(oldMaster, bsp->id)) {
5313 sipTmp = bsp->id;
5314 while (sipTmp) {
5315 if (sipTmp->choice == SEQID_PDB) {
5316 oldMaster = sipTmp;
5317 break;
5318 }
5319 sipTmp = sipTmp->next;
5320 }
5321 break;
5322 }
5323 }
5324 sep = sep->next;
5325 }
5326 }
5327 ddp = salp->segs;
5328 sip = SeqIdDup(ddp->id); SeqIdFree(sip->next);
5329 if (CddSameSip(oldMaster,sip)) bOldIsTop = TRUE;
5330 if (CddSameSip(newMaster,sip)) bNewIsTop = TRUE;
5331 SeqIdFree(sip);
5332 salpThis = salp; while (salpThis) {
5333 ddp = salpThis->segs;
5334 if (CddSameSip(oldMaster,ddp->id->next)) {
5335 if (bNewIsTop) {
5336 pCDeaON = SeqAlignToCddExpAlign(salpThis,bssp->seq_set);
5337 break;
5338 } else {
5339 pCDeaO = SeqAlignToCddExpAlign(salpThis,bssp->seq_set);
5340 if (pCDeaN) break;
5341 }
5342 }
5343 if (CddSameSip(newMaster,ddp->id->next)) {
5344 if (bOldIsTop) {
5345 pCDeaON = SeqAlignToCddExpAlign(salpThis,bssp->seq_set);
5346 break;
5347 } else {
5348 pCDeaN = SeqAlignToCddExpAlign(salpThis,bssp->seq_set);
5349 if (pCDeaO) break;
5350 }
5351 }
5352 salpThis = salpThis->next;
5353 }
5354 if (!pCDeaON) { /* no direct relation */
5355 bsp = CddRetrieveBioseqById(oldMaster,bssp->seq_set);
5356 pCDeaON = CddReindexExpAlign(pCDeaO, bsp->length, pCDeaN, 0, 1);
5357 if (bNewIsTop) {
5358
5359 }
5360 }
5361 if (bNewIsTop) { /* need to revert alignment */
5362 pCDeaON = InvertCddExpAlign(pCDeaON,bssp->seq_set);
5363 }
5364 if (!pCDeaON) CddSevError("Error in CDDTransferAlignAnnot, can not locate alignment data!");
5365 aap = oldannot; while (aap) {
5366 slp = aap->location;
5367 while (slp) {
5368 CddRelocateSeqLoc(slp,newMaster,pCDeaON->adata);
5369 slp = slp->next;
5370 }
5371 aap = aap->next;
5372 }
5373 }
5374
5375
5376 /*---------------------------------------------------------------------------*/
5377 /*---------------------------------------------------------------------------*/
5378 /* reindex a SeqAlign (a linked list of pairwise Den-Diag Master-Slave Seq- */
5379 /* aligns) to a new Master, pointed out by it's SeqId. Will pick the first */
5380 /* instance of that SeqId in the alignment, and return a SeqAlign of the */
5381 /* same format as the input was */
5382 /*---------------------------------------------------------------------------*/
5383 /*---------------------------------------------------------------------------*/
CddReindexSeqAlign(SeqAlignPtr salp,SeqIdPtr sipMaster,BioseqSetPtr bssp)5384 SeqAlignPtr LIBCALL CddReindexSeqAlign(SeqAlignPtr salp, SeqIdPtr sipMaster,
5385 BioseqSetPtr bssp)
5386 {
5387 SeqAlignPtr salpThis, salpNew, salpHead, salpTail;
5388 DenseDiagPtr ddp;
5389 CddExpAlignPtr pCDea, pCDea1, pCDeaR;
5390 Int4 iNotThis = -1, i = -1, mlength;
5391 BioseqPtr bsp;
5392
5393 bsp = CddRetrieveBioseqById(sipMaster, bssp->seq_set);
5394 if (!bsp) return NULL;
5395 mlength = bsp->length;
5396 salpThis = salp; while(salpThis) {
5397 iNotThis++;
5398 ddp = salpThis->segs;
5399 if (CddSameSip(sipMaster,ddp->id->next)) break;
5400 else salpThis = salpThis->next;
5401 }
5402 pCDea1 = (CddExpAlignPtr) SeqAlignToCddExpAlign(salpThis,bssp->seq_set);
5403 pCDeaR = (CddExpAlignPtr) InvertCddExpAlign(pCDea1, bssp->seq_set);
5404 salpNew = (SeqAlignPtr) CddExpAlignToSeqAlign(pCDeaR,NULL);
5405 pCDeaR = CddExpAlignFree(pCDeaR);
5406 salpTail = salpNew;
5407 salpHead = salp;
5408 while (salpHead) {
5409 i++;
5410 if (i!= iNotThis) {
5411 pCDea = (CddExpAlignPtr) SeqAlignToCddExpAlign(salpHead, bssp->seq_set);
5412 pCDeaR = (CddExpAlignPtr) CddReindexExpAlign(pCDea1,mlength,pCDea,0,i);
5413 salpThis = (SeqAlignPtr) CddExpAlignToSeqAlign(pCDeaR,NULL);
5414 pCDeaR = CddExpAlignFree(pCDeaR);
5415 pCDea = CddExpAlignFree(pCDea);
5416 salpTail->next = salpThis;
5417 salpTail = salpThis;
5418 }
5419 salpHead = salpHead->next;
5420 }
5421
5422 pCDea1 = CddExpAlignFree(pCDea1);
5423 return (salpNew);
5424 }
5425 /********************************************************************/
5426 /* Remove all description/annotation info from the bioseq, */
5427 /* except for tax and mmdb info. */
5428 /********************************************************************/
CddShrinkBioseq(BioseqPtr bsp)5429 void LIBCALL CddShrinkBioseq(BioseqPtr bsp)
5430 {
5431 SeqDescrPtr next, sdp, sdp2 = NULL;
5432 SeqAnnotPtr sap, sap_next, thissap, sap2 = NULL;
5433
5434 sdp=bsp->descr;
5435 while (sdp != NULL) {
5436 next = sdp->next;
5437 sdp->next = NULL;
5438 if(sdp->choice == Seq_descr_title ||
5439 sdp->choice == Seq_descr_org ||
5440 sdp->choice == Seq_descr_pdb ||
5441 sdp->choice == Seq_descr_source) {
5442 ValNodeLink(&sdp2, sdp);
5443 } else SeqDescFree(sdp);
5444 sdp = next;
5445 }
5446 bsp->descr = sdp2;
5447
5448 sap = bsp->annot;
5449 while (sap != NULL) {
5450 sap_next = sap->next;
5451 if ((sap->type == 4 && ((SeqIdPtr)sap->data)->choice == SEQID_GENERAL
5452 && Nlm_StrCmp(((DbtagPtr)((SeqIdPtr)sap->data)->data.ptrvalue)->db,"mmdb") == 0)
5453 || (sap->type == 1 && ((SeqFeatPtr)sap->data)->data.choice == SEQFEAT_PROT)) {
5454 if (!sap2) {
5455 sap2 = sap;
5456 } else {
5457 thissap = sap2; while (thissap->next) thissap = thissap->next;
5458 thissap->next = sap;
5459 }
5460 sap->next = NULL;
5461 } else SeqAnnotFree(sap);
5462 sap = sap_next;
5463 }
5464 bsp->annot = sap2;
5465 }
5466
5467 /*---------------------------------------------------------------------------*/
5468 /*---------------------------------------------------------------------------*/
5469 /* Make a list of m-s seq-aligns proper, aligning only columns */
5470 /* found in all pairs. */
5471 /*---------------------------------------------------------------------------*/
5472 /*---------------------------------------------------------------------------*/
CddTrimSeqAligns(CddPtr pcdd)5473 Int2 LIBCALL CddTrimSeqAligns(CddPtr pcdd)
5474 {
5475 CddExpAlignPtr ceap,tceap;
5476 SeqAlignPtr salp,salp2,head,head2=NULL;
5477 SeqEntryPtr sep;
5478 ValNodePtr vnp,vhead=NULL;
5479 Int2 i;
5480 Int4Ptr iBreakAfter;
5481 DenseDiagPtr ddp;
5482
5483 if(pcdd == NULL) return 1;
5484 sep = ((BioseqSetPtr)pcdd->sequences->data.ptrvalue)->seq_set;
5485 head = pcdd->seqannot->data;
5486 if(head == NULL /*|| head->next == NULL */) return 1;
5487
5488 tceap = (CddExpAlignPtr)SeqAlignToCddExpAlign(head, sep);
5489 iBreakAfter = MemNew(tceap->length * sizeof(Int4));
5490 /*---------------------------------------------------------------------------*/
5491 /* record existing block structure to make sure adjacent blocks aren't merged*/
5492 /*---------------------------------------------------------------------------*/
5493 for (i=0;i<tceap->length;i++) iBreakAfter[i] = 0;
5494 ddp = head->segs;
5495 while (ddp) {
5496 iBreakAfter[ddp->starts[0]+ddp->len-1] = 1;
5497 ddp = ddp->next;
5498 }
5499 for(salp=head; salp; salp=salp->next) {
5500 ceap = (CddExpAlignPtr)SeqAlignToCddExpAlign(salp, sep);
5501 ValNodeAddPointer(&vhead, 0, ceap);
5502 if(SeqIdComp(tceap->ids,ceap->ids) != SIC_YES) {
5503 for(vnp=vhead; vnp; vnp=vnp->next) {
5504 ceap = vnp->data.ptrvalue;
5505 ceap->ids = SeqIdSetFree(ceap->ids);
5506 CddExpAlignFree(ceap);
5507 }
5508 tceap->ids = SeqIdSetFree(tceap->ids);
5509 CddExpAlignFree(tceap);
5510 ErrPostEx(SEV_WARNING,0,0,"Not master/slave alignments");
5511 return 1;
5512 }
5513 for(i=0; i<tceap->length; i++) {
5514 if(ceap->adata[i] == -1) {
5515 tceap->adata[i] = -1;
5516 if (i > 0) {
5517 if (ceap->adata[i-1] != -1) iBreakAfter[i-1] = 1;
5518 if (tceap->adata[i-1] != -1) iBreakAfter[i-1] = 1;
5519 }
5520 } else {
5521 if (i > 0) {
5522 if (tceap->adata[i-1] != -1 && (tceap->adata[i]-tceap->adata[i-1])>1)
5523 iBreakAfter[i-1] = 1;
5524 if (ceap->adata[i-1] != -1 && (ceap->adata[i]-ceap->adata[i-1])>1)
5525 iBreakAfter[i-1] = 1;
5526 }
5527 }
5528 }
5529 }
5530 for(vnp=vhead; vnp; vnp=vnp->next) {
5531 ceap = vnp->data.ptrvalue;
5532 for(i=0; i<tceap->length; i++)
5533 if(tceap->adata[i] == -1) ceap->adata[i] = -1;
5534 if(head2) {
5535 salp2->next = (SeqAlignPtr)CddExpAlignToSeqAlign(ceap,iBreakAfter);
5536 salp2 = salp2->next;
5537 } else
5538 head2 = salp2 = (SeqAlignPtr)CddExpAlignToSeqAlign(ceap,iBreakAfter);
5539 ceap->ids = SeqIdSetFree(ceap->ids);
5540 CddExpAlignFree(ceap);
5541 }
5542
5543 tceap->ids = SeqIdSetFree(tceap->ids);
5544 MemFree(iBreakAfter);
5545 CddExpAlignFree(tceap);
5546 ValNodeFree(vhead);
5547 SeqAlignListFree(pcdd->seqannot->data);
5548 pcdd->seqannot->data = head2;
5549
5550 return 0;
5551 }
5552
5553 /********************************************************************/
5554 /* Make a list of m-s seq-aligns proper, aligning only columns */
5555 /* found in all pairs. If !modify, don't change seq-aligns, only */
5556 /* read them to populate iBreakAfter. */
5557 /********************************************************************/
CddGetProperBlocks(CddPtr pcdd,Boolean modify,Int4Ptr * iBreakAfter)5558 Int2 LIBCALL CddGetProperBlocks(CddPtr pcdd, Boolean modify,
5559 Int4Ptr *iBreakAfter)
5560 {
5561 CddExpAlignPtr ceap,tceap;
5562 SeqAlignPtr salp,salp2,head,head2=NULL;
5563 SeqEntryPtr sep;
5564 ValNodePtr vnp,vhead=NULL;
5565 Int2 i;
5566 Int4Ptr iBrkAft;
5567
5568 if(pcdd == NULL) return 1;
5569 sep = ((BioseqSetPtr)pcdd->sequences->data.ptrvalue)->seq_set;
5570 head = pcdd->seqannot->data;
5571 if(head == NULL || head->next == NULL) return 1;
5572
5573 tceap = (CddExpAlignPtr)SeqAlignToCddExpAlign(head, sep);
5574 iBrkAft = MemNew(tceap->length * sizeof(Int4));
5575 for (i=0;i<tceap->length;i++) iBrkAft[i] = 0;
5576 for(salp=head; salp; salp=salp->next) {
5577 ceap = (CddExpAlignPtr)SeqAlignToCddExpAlign(salp, sep);
5578 ValNodeAddPointer(&vhead, 0, ceap);
5579 if(SeqIdComp(tceap->ids,ceap->ids) != SIC_YES) {
5580 for(vnp=vhead; vnp; vnp=vnp->next) {
5581 ceap = vnp->data.ptrvalue;
5582 ceap->ids = SeqIdSetFree(ceap->ids);
5583 CddExpAlignFree(ceap);
5584 }
5585 tceap->ids = SeqIdSetFree(tceap->ids);
5586 CddExpAlignFree(tceap);
5587 MemFree(iBrkAft);
5588 ErrPostEx(SEV_WARNING,0,0,"Not master/slave alignments");
5589 return 1;
5590 }
5591 for(i=0; i<tceap->length; i++) {
5592 if(ceap->adata[i] == -1) {
5593 tceap->adata[i] = -1;
5594 if (i > 0) {
5595 if (ceap->adata[i-1] != -1) iBrkAft[i-1] = 1;
5596 if (tceap->adata[i-1] != -1) iBrkAft[i-1] = 1;
5597 }
5598 } else {
5599 if (i > 0) {
5600 if (tceap->adata[i-1] != -1 && (tceap->adata[i]-tceap->adata[i-1])>1)
5601 iBrkAft[i-1] = 1;
5602 if (ceap->adata[i-1] != -1 && (ceap->adata[i]-ceap->adata[i-1])>1)
5603 iBrkAft[i-1] = 1;
5604 }
5605 }
5606 }
5607 }
5608 if(modify) {
5609 for(vnp=vhead; vnp; vnp=vnp->next) {
5610 ceap = vnp->data.ptrvalue;
5611 for(i=0; i<tceap->length; i++)
5612 if(tceap->adata[i] == -1) ceap->adata[i] = -1;
5613 if(head2) {
5614 salp2->next = (SeqAlignPtr)CddExpAlignToSeqAlign(ceap,iBrkAft);
5615 salp2 = salp2->next;
5616 } else
5617 head2 = salp2 = (SeqAlignPtr)CddExpAlignToSeqAlign(ceap,iBrkAft);
5618 ceap->ids = SeqIdSetFree(ceap->ids);
5619 CddExpAlignFree(ceap);
5620 }
5621 SeqAlignListFree(pcdd->seqannot->data);
5622 pcdd->seqannot->data = head2;
5623 }
5624
5625 if(iBreakAfter) *iBreakAfter = iBrkAft;
5626 else MemFree(iBrkAft);
5627 tceap->ids = SeqIdSetFree(tceap->ids);
5628 CddExpAlignFree(tceap);
5629 ValNodeFree(vhead);
5630
5631 return 0;
5632 }
5633
5634 /*---------------------------------------------------------------------------*/
5635 /* fix character string for export into XML */
5636 /*---------------------------------------------------------------------------*/
CddFixForXML(CharPtr pc)5637 static CharPtr CddFixForXML(CharPtr pc)
5638 {
5639 CharPtr pcNew;
5640 Int4 i = 0, j = 0;
5641
5642 pcNew = MemNew(sizeof(Char) * (Nlm_StrLen(pc) + 100));
5643
5644 while (pc[i] != '\0') {
5645 if (pc[i] == '>') {
5646 pcNew[j] = '&';
5647 pcNew[j+1] = 'g';
5648 pcNew[j+2] = 't';
5649 pcNew[j+3] = ';';
5650 j += 4;
5651 } else if (pc[i] == '<') {
5652 pcNew[j] = '&';
5653 pcNew[j+1] = 'l';
5654 pcNew[j+2] = 't';
5655 pcNew[j+3] = ';';
5656 j += 4;
5657 } else if (pc[i] == '&') {
5658 pcNew[j] = '&';
5659 pcNew[j+1] = 'a';
5660 pcNew[j+2] = 'm';
5661 pcNew[j+3] = 'p';
5662 pcNew[j+4] = ';';
5663 j += 5;
5664 } else {
5665 pcNew[j] = pc[i];
5666 j++;
5667 }
5668 i++;
5669 }
5670 MemFree(pc);
5671 pcNew[j] = '\0';
5672 pcNew = Nlm_Realloc(pcNew, sizeof(Char) * Nlm_StrLen(pcNew));
5673 return(pcNew);
5674 }
5675
5676 /*---------------------------------------------------------------------------*/
5677 /*---------------------------------------------------------------------------*/
5678 /* dump out pubmed links for a CD to be used in Entrez neighboring */
5679 /*---------------------------------------------------------------------------*/
5680 /*---------------------------------------------------------------------------*/
CddDumpPMLinks(CddPtr pcdd,FILE * FP)5681 void LIBCALL CddDumpPMLinks(CddPtr pcdd, FILE *FP)
5682 {
5683 CddIdPtr cid;
5684 Int4 gi = 0, pmid = 0;
5685 ValNodePtr pub, pdescr;
5686
5687 cid = (CddIdPtr) pcdd->id;
5688 while (cid) {
5689 switch (cid->choice) {
5690 case CddId_uid:
5691 gi = cid->data.intvalue;
5692 break;
5693 default:
5694 break;
5695 }
5696 cid = cid->next;
5697 }
5698 if (gi > 0) {
5699 pdescr = pcdd->description;
5700 while (pdescr) {
5701 switch (pdescr->choice) {
5702 case CddDescr_reference:
5703 pub = pdescr->data.ptrvalue;
5704 if (pub->choice == PUB_PMid) {
5705 pmid = pub->data.intvalue;
5706 if (pmid > 0) {
5707 fprintf(FP,"%d\t%d\t0\n",gi,pmid);
5708 }
5709 }
5710 break;
5711 default:
5712 break;
5713 }
5714 pdescr = pdescr->next;
5715 }
5716 }
5717 }
5718
5719 /*---------------------------------------------------------------------------*/
5720 /*---------------------------------------------------------------------------*/
5721 /* dump out taxonomy links for a CD to be used in Entrez neighboring */
5722 /*---------------------------------------------------------------------------*/
5723 /*---------------------------------------------------------------------------*/
CddDumpTaxLinks(CddPtr pcdd,FILE * FP)5724 void LIBCALL CddDumpTaxLinks(CddPtr pcdd, FILE *FP)
5725 {
5726 CddIdPtr cid;
5727 Int4 txid = 0, gi = 0;
5728 DbtagPtr dbtp;
5729 ObjectIdPtr oidp;
5730 ValNodePtr pdescr, vnp;
5731 OrgRefPtr pOrgRef;
5732
5733 cid = (CddIdPtr) pcdd->id;
5734 while (cid) {
5735 switch (cid->choice) {
5736 case CddId_uid:
5737 gi = cid->data.intvalue;
5738 break;
5739 default:
5740 break;
5741 }
5742 cid = cid->next;
5743 }
5744 if (gi > 0) {
5745 pdescr = pcdd->description;
5746 while (pdescr) {
5747 switch (pdescr->choice) {
5748 case CddDescr_tax_source:
5749 pOrgRef = (OrgRefPtr) pdescr->data.ptrvalue;
5750 vnp = pOrgRef->db;
5751 while (vnp) {
5752 dbtp = (DbtagPtr) vnp->data.ptrvalue;
5753 if (dbtp && Nlm_StrCmp(dbtp->db,"taxon") == 0) {
5754 oidp = dbtp->tag;
5755 if (oidp && oidp->id >= 0) {
5756 fprintf(FP,"%d\t%d\t0\n",gi,oidp->id);
5757 }
5758 }
5759 vnp = vnp->next;
5760 }
5761 break;
5762 default:
5763 break;
5764 }
5765 pdescr = pdescr->next;
5766 }
5767 }
5768 }
5769
5770 /*---------------------------------------------------------------------------*/
5771 /*---------------------------------------------------------------------------*/
5772 /* dump out contents of a CD used for Entrez-indexing in pseudo-XML */
5773 /*---------------------------------------------------------------------------*/
5774 /*---------------------------------------------------------------------------*/
CddDumpXML(CddPtr pcdd,FILE * FP)5775 void LIBCALL CddDumpXML(CddPtr pcdd, FILE *FP)
5776 {
5777 ValNodePtr pdescr, cid;
5778 CharPtr cShortName = NULL, cAbstract = NULL, cPubDate = NULL;
5779 CharPtr cEntrezDate = NULL, cFilter = NULL, cOrganism = NULL;
5780 CharPtr cAccession = NULL, cDatabase = NULL;
5781 Int4 gi = 0, txid = -1;
5782 Boolean bHaveAbstract = FALSE, bAllocDatabase = FALSE;
5783 GlobalIdPtr pGid;
5784 OrgRefPtr pOrgRef;
5785 DatePtr pDate;
5786 ObjectIdPtr oidp;
5787 ValNodePtr vnp;
5788 DbtagPtr dbtp;
5789
5790 cEntrezDate = MemNew(11 * sizeof(Char));
5791 cid = (CddIdPtr) pcdd->id;
5792 while (cid) {
5793 switch (cid->choice) {
5794 case CddId_uid:
5795 gi = cid->data.intvalue;
5796 break;
5797 case CddId_gid:
5798 pGid = (GlobalIdPtr) cid->data.ptrvalue;
5799 cAccession = pGid->accession;
5800 break;
5801 default:
5802 break;
5803 }
5804 cid = cid->next;
5805 }
5806 pdescr = pcdd->description;
5807 while (pdescr) {
5808 switch (pdescr->choice) {
5809 case CddDescr_comment:
5810 if (!bHaveAbstract) {
5811 cAbstract = (CharPtr) pdescr->data.ptrvalue;
5812 bHaveAbstract = TRUE;
5813 }
5814 break;
5815 case CddDescr_source:
5816 cDatabase = (CharPtr) pdescr->data.ptrvalue;
5817 case CddDescr_tax_source:
5818 pOrgRef = (OrgRefPtr) pdescr->data.ptrvalue;
5819 cOrganism = pOrgRef->taxname;
5820 vnp = pOrgRef->db;
5821 while (vnp) {
5822 dbtp = (DbtagPtr) vnp->data.ptrvalue;
5823 if (dbtp && Nlm_StrCmp(dbtp->db,"taxon") == 0) {
5824 oidp = dbtp->tag;
5825 if (oidp && oidp->id >= 0) {
5826 txid = oidp->id;
5827 }
5828 }
5829 vnp = vnp->next;
5830 }
5831 break;
5832 case CddDescr_create_date:
5833 case CddDescr_update_date:
5834 cPubDate = MemNew(11 * sizeof(Char));
5835 pDate = (DatePtr) pdescr->data.ptrvalue;
5836 if (pDate->data[2] > 0 && pDate->data[3] > 0)
5837 sprintf(cPubDate,"%d/%d/%d",pDate->data[1]+1900,pDate->data[2],pDate->data[3]);
5838 if (pDate->data[2] > 0 && pDate->data[3] == 0)
5839 sprintf(cPubDate,"%d/%d",pDate->data[1]+1900,pDate->data[2]);
5840 if (pDate->data[2] == 0 && pDate->data[3] == 0)
5841 sprintf(cPubDate,"%d",pDate->data[1]+1900);
5842 break;
5843 default:
5844 break;
5845 }
5846 pdescr = pdescr->next;
5847 }
5848 cShortName = (CharPtr) pcdd->name;
5849 if (cAccession) {
5850 if (Nlm_StrNCmp(cAccession,"cd",2) == 0) {
5851 cDatabase = StringSave("Cdd");
5852 bAllocDatabase = TRUE;
5853 }
5854 }
5855 if (!cPubDate) {
5856 cPubDate = MemNew(11 * sizeof(Char));
5857 pDate = (DatePtr) DateCurr();
5858 if (pDate->data[2] > 0 && pDate->data[3] > 0)
5859 sprintf(cPubDate,"%d/%d/%d",pDate->data[1]+1900,pDate->data[2],pDate->data[3]);
5860 if (pDate->data[2] > 0 && pDate->data[3] == 0)
5861 sprintf(cPubDate,"%d/%d",pDate->data[1]+1900,pDate->data[2]);
5862 if (pDate->data[2] == 0 && pDate->data[3] == 0)
5863 sprintf(cPubDate,"%d",pDate->data[1]+1900);
5864 }
5865 pDate = (DatePtr) DateCurr();
5866 if (pDate->data[2] > 0 && pDate->data[3] > 0)
5867 sprintf(cEntrezDate,"%d/%d/%d",pDate->data[1]+1900,pDate->data[2],pDate->data[3]);
5868 if (pDate->data[2] > 0 && pDate->data[3] == 0)
5869 sprintf(cEntrezDate,"%d/%d",pDate->data[1]+1900,pDate->data[2]);
5870 if (pDate->data[2] == 0 && pDate->data[3] == 0)
5871 sprintf(cEntrezDate,"%d",pDate->data[1]+1900);
5872
5873 cAbstract = CddFixForXML(cAbstract);
5874 cShortName = CddFixForXML(cShortName);
5875
5876 fprintf(FP," <IdxDocument>\n");
5877 if (gi > 0)
5878 fprintf(FP," <IdxUid>%d</IdxUid>\n",gi);
5879 fprintf(FP," <IdxKeyFields>\n");
5880 if (cPubDate)
5881 fprintf(FP," <PubDate>%s</PubDate>\n",cPubDate);
5882 if (cEntrezDate)
5883 fprintf(FP," <EntrezDate>%s</EntrezDate>\n",cEntrezDate);
5884 fprintf(FP," </IdxKeyFields>\n");
5885 fprintf(FP," <IdxDisplayFields>\n");
5886 if (cAccession)
5887 fprintf(FP," <Accession>%s</Accession>\n",cAccession);
5888 if (cShortName)
5889 fprintf(FP," <Title>%s</Title>\n",cShortName);
5890 if (cAbstract)
5891 fprintf(FP," <Abstract>%s</Abstract>\n",cAbstract);
5892 fprintf(FP," </IdxDisplayFields>\n");
5893 fprintf(FP," <IdxSearchFields>\n");
5894 if (gi > 0)
5895 fprintf(FP," <Uid>%d</Uid>\n",gi);
5896 if (cFilter)
5897 fprintf(FP," <Filter>%s</Filter>\n",cFilter);
5898 if (cAccession)
5899 fprintf(FP," <Accession>%s</Accession>\n",cAccession);
5900 if (cDatabase)
5901 fprintf(FP," <Database>%s</Database>\n",cDatabase);
5902 if (cShortName)
5903 fprintf(FP," <Title>%s</Title>\n",cShortName);
5904 if (cAbstract)
5905 fprintf(FP," <Text>%s</Text>\n",cAbstract);
5906 if (txid >= 0)
5907 fprintf(FP," <Organism>txid%d</Organism>\n",txid);
5908 if (cOrganism)
5909 fprintf(FP," <Organism>%s</Organism>\n",cOrganism);
5910 if (cPubDate)
5911 fprintf(FP," <PubDate>%s</PubDate>\n",cPubDate);
5912 if (cEntrezDate)
5913 fprintf(FP," <EntrezDate>%s</EntrezDate>\n",cEntrezDate);
5914 fprintf(FP," </IdxSearchFields>\n");
5915 fprintf(FP," </IdxDocument>\n");
5916
5917 if (bAllocDatabase) MemFree(cDatabase);
5918 MemFree(cPubDate);
5919 MemFree(cEntrezDate);
5920 }
5921
5922 /*---------------------------------------------------------------------------*/
5923 /*---------------------------------------------------------------------------*/
5924 /* create and link a CddIdxData structure */
5925 /*---------------------------------------------------------------------------*/
5926 /*---------------------------------------------------------------------------*/
CddIdxDataNew(CharPtr acc,Int4 uid)5927 CddIdxDataPtr LIBCALL CddIdxDataNew(CharPtr acc, Int4 uid)
5928 {
5929
5930 CddIdxDataPtr cidp;
5931
5932 cidp = MemNew(sizeof(CddIdxData));
5933
5934 if (acc) {
5935 cidp->cCDDid = acc;
5936 cidp->iPssmId = uid;
5937 }
5938 cidp->next = NULL;
5939 return(cidp);
5940 }
5941
CddIdxDataLink(CddIdxDataPtr PNTR head,CddIdxDataPtr cidp)5942 CddIdxDataPtr LIBCALL CddIdxDataLink(CddIdxDataPtr PNTR head, CddIdxDataPtr cidp)
5943 {
5944 CddIdxDataPtr cidpThis;
5945
5946 cidpThis = *head;
5947 if (cidpThis) {
5948 while (cidpThis->next != NULL) cidpThis = cidpThis->next;
5949 cidpThis->next = cidp;
5950 } else {
5951 *head = cidp;
5952 }
5953 return *head;
5954 }
5955
5956
5957 /*---------------------------------------------------------------------------*/
5958 /*---------------------------------------------------------------------------*/
5959 /* read in the cdd.idx file to return data associating accessions with uids */
5960 /*---------------------------------------------------------------------------*/
5961 /*---------------------------------------------------------------------------*/
CddReadIdx(CharPtr CDDidx)5962 CddIdxDataPtr LIBCALL CddReadIdx(CharPtr CDDidx)
5963 {
5964 CddIdxDataPtr cidp, cidpHead = NULL;
5965 Char pcBuf[2048];
5966 CharPtr pcTest;
5967 CharPtr acc;
5968 Int4 uid;
5969 FILE *fp;
5970
5971 fp = FileOpen(CDDidx,"r");
5972 if (!fp) CddSevError("could not open cdd.idx!");
5973 do {
5974 pcBuf[0]='\0';
5975 pcTest = fgets(pcBuf, (size_t)2048, fp);
5976 if (pcTest) if (pcTest[0] != ' ') {
5977 uid = (Int4) atoi(Nlm_StrTok(pcTest," "));
5978 acc = (CharPtr) StringSave(Nlm_StrTok(NULL," "));
5979 if (acc) {
5980 acc[Nlm_StrLen(acc) - 1] = '\0';
5981 cidp = CddIdxDataNew(acc,uid);
5982 CddIdxDataLink(&(cidpHead),cidp);
5983 }
5984 }
5985 } while (pcTest);
5986 FileClose(fp);
5987 return(cidpHead);
5988 }
5989
5990
5991 /*---------------------------------------------------------------------------*/
5992 /*---------------------------------------------------------------------------*/
5993 /* retrieve a CDD accession via the PSSMid or vice versa */
5994 /*---------------------------------------------------------------------------*/
5995 /*---------------------------------------------------------------------------*/
CddAccFromPssmId(Int4 iPssmId,CharPtr cAcc,CharPtr CDDidx)5996 void LIBCALL CddAccFromPssmId(Int4 iPssmId, CharPtr cAcc, CharPtr CDDidx)
5997 {
5998 CddIdxDataPtr cidp;
5999
6000 cidp = CddReadIdx(CDDidx);
6001 if (!cidp) CddSevError("cdd.idx read failed!");
6002 while (cidp) {
6003 if (iPssmId == cidp->iPssmId) {
6004 Nlm_StrCpy(cAcc, cidp->cCDDid);
6005 break;
6006 }
6007 cidp = cidp->next;
6008 }
6009 }
6010
CddPssmIdFromAcc(Int4 * iPssmId,CharPtr cAcc,CharPtr CDDidx)6011 void LIBCALL CddPssmIdFromAcc(Int4 *iPssmId, CharPtr cAcc, CharPtr CDDidx)
6012 {
6013 CddIdxDataPtr cidp;
6014
6015 cidp = CddReadIdx(CDDidx);
6016 if (!cidp) CddSevError("cdd.idx read failed!");
6017 while (cidp) {
6018 if (Nlm_StrCmp(cAcc, cidp->cCDDid) == 0) {
6019 *iPssmId = cidp->iPssmId;
6020 break;
6021 }
6022 cidp = cidp->next;
6023 }
6024 }
6025
6026 /*---------------------------------------------------------------------------*/
6027 /*---------------------------------------------------------------------------*/
6028 /* truncate a string right at the position of the first puncutation mark */
6029 /*---------------------------------------------------------------------------*/
6030 /*---------------------------------------------------------------------------*/
CddTruncStringAtFirstPunct(CharPtr pChar)6031 void LIBCALL CddTruncStringAtFirstPunct(CharPtr pChar)
6032 {
6033 Int4 i;
6034
6035 for (i=0;i<Nlm_StrLen(pChar);i++) {
6036 if (pChar[i] == ',' ||
6037 pChar[i] == ';' ||
6038 pChar[i] == '.') {
6039 pChar[i] = '\0'; break;
6040 }
6041 }
6042 }
6043
CddFillBlanksInString(CharPtr pChar)6044 void LIBCALL CddFillBlanksInString(CharPtr pChar)
6045 {
6046 Int4 i;
6047
6048 for (i=0;i<Nlm_StrLen(pChar);i++) {
6049 if (pChar[i] == ' ') pChar[i] = '_';
6050 }
6051 }
6052
6053 /*---------------------------------------------------------------------------*/
6054 /*---------------------------------------------------------------------------*/
6055 /* Remove a Consensus from a CD */
6056 /*---------------------------------------------------------------------------*/
6057 /*---------------------------------------------------------------------------*/
CddRemoveConsensus(CddPtr pcdd)6058 Boolean LIBCALL CddRemoveConsensus(CddPtr pcdd)
6059 {
6060 SeqIdPtr sip;
6061 BioseqPtr bsp;
6062 SeqAlignPtr salp, salpnew;
6063 DenseDiagPtr ddp;
6064 BioseqSetPtr bssp;
6065 SeqEntryPtr sepThis, sepHead = NULL, sepTail = NULL;
6066
6067 if (!CddHasConsensus(pcdd)) return FALSE;
6068 salp = (SeqAlignPtr) pcdd->seqannot->data;
6069 ddp = (DenseDiagPtr) salp->segs;
6070 sip = (SeqIdPtr) ddp->id->next;
6071 bssp = (BioseqSetPtr) pcdd->sequences->data.ptrvalue;
6072 if (pcdd->alignannot) {
6073 CddTransferAlignAnnot(pcdd->alignannot, sip, salp, bssp);
6074 }
6075 salpnew = CddReindexSeqAlign(salp,sip,bssp);
6076 pcdd->seqannot->data = salpnew->next;
6077 sepThis = bssp->seq_set;
6078 while (sepThis) {
6079 bsp = (BioseqPtr) sepThis->data.ptrvalue;
6080 if (!SipIsConsensus(bsp->id)) {
6081 if (!sepHead) sepHead = sepThis;
6082 if (sepTail) sepTail->next = sepThis;
6083 sepTail = sepThis;
6084 }
6085 sepThis = sepThis->next;
6086 }
6087 sepTail->next = NULL;
6088 bssp->seq_set = sepHead;
6089 pcdd->sequences->data.ptrvalue = bssp;
6090 pcdd->profile_range = NULL;
6091 pcdd->trunc_master = NULL;
6092 pcdd->posfreq = NULL;
6093 pcdd->scoremat = NULL;
6094 pcdd->distance = NULL;
6095 return TRUE;
6096 }
6097
6098 /*---------------------------------------------------------------------------*/
6099 /*---------------------------------------------------------------------------*/
6100 /* Check if a CD has annotation that is inconsistent with the alignment */
6101 /*---------------------------------------------------------------------------*/
6102 /*---------------------------------------------------------------------------*/
CddFeaturesAreConsistent(CddPtr pcdd,CharPtr errmsg)6103 Boolean LIBCALL CddFeaturesAreConsistent(CddPtr pcdd, CharPtr errmsg)
6104 {
6105 SeqAlignPtr salp;
6106 CddExpAlignPtr eap;
6107 BioseqSetPtr bssp;
6108 AlignAnnotPtr aap;
6109 SeqLocPtr slp;
6110 Int4 iWhere;
6111
6112 if (!pcdd->alignannot) return TRUE;
6113 bssp = (BioseqSetPtr) pcdd->sequences->data.ptrvalue;
6114 salp = (SeqAlignPtr) pcdd->seqannot->data;
6115 if (!salp) return TRUE;
6116 eap = SeqAlignToCddExpAlign(salp,bssp->seq_set);
6117 aap = pcdd->alignannot;
6118 while (aap) {
6119 slp = aap->location;
6120 iWhere = CddSeqLocInExpAlign(slp,eap);
6121 if (iWhere >= 0) {
6122 if (iWhere == INT4_MAX) {
6123 sprintf(errmsg,"Feature: %s, illegal feature address",aap->description);
6124 } else {
6125 sprintf(errmsg,"Feature: %s, Location: %d",aap->description,iWhere+1);
6126 }
6127 return FALSE;
6128 }
6129 aap = aap->next;
6130 }
6131 CddExpAlignFree(eap);
6132 return TRUE;
6133
6134 }
6135
6136 /*---------------------------------------------------------------------------*/
6137 /*---------------------------------------------------------------------------*/
6138 /* find a MMDB-Identifier in a Bioseq */
6139 /*---------------------------------------------------------------------------*/
6140 /*---------------------------------------------------------------------------*/
CddFindMMDBIdInBioseq(BioseqPtr bsp,Int4 * iMMDBid)6141 SeqAnnotPtr LIBCALL CddFindMMDBIdInBioseq(BioseqPtr bsp, Int4 *iMMDBid)
6142 {
6143 SeqAnnotPtr annot, prevannot = NULL;
6144 SeqIdPtr sip;
6145 DbtagPtr dbtp;
6146 ObjectIdPtr oidp;
6147
6148 *iMMDBid = 0;
6149 annot = bsp->annot;
6150 while(annot) {
6151 if (annot->type == 4) {
6152 sip = annot->data;
6153 while (sip) {
6154 if (sip->choice == SEQID_GENERAL) {
6155 dbtp = sip->data.ptrvalue;
6156 if (Nlm_StrCmp(dbtp->db,"mmdb") == 0) {
6157 oidp = dbtp->tag;
6158 *iMMDBid = oidp->id;
6159 return(prevannot);
6160 }
6161 }
6162 sip = sip->next;
6163 }
6164 }
6165 if (annot->next) prevannot = annot;
6166 annot = annot->next;
6167 }
6168 return (NULL);
6169 }
6170
6171
6172 /*---------------------------------------------------------------------------*/
6173 /*---------------------------------------------------------------------------*/
6174 /* Check if a CD has 3D superposition information for all aligned chains */
6175 /*---------------------------------------------------------------------------*/
6176 /*---------------------------------------------------------------------------*/
CddHas3DSuperpos(CddPtr pcdd)6177 Boolean LIBCALL CddHas3DSuperpos(CddPtr pcdd)
6178 {
6179 BioseqSetPtr bssp;
6180 BiostrucAnnotSetPtr basp = NULL;
6181 SeqAlignPtr salp;
6182 DenseDiagPtr ddp;
6183 SeqIdPtr sip1, sip2;
6184 Int4Ptr pMMDBid;
6185 Int4 i, nStruct = 0;
6186 BioseqPtr bsp;
6187 BiostrucFeatureSetPtr bfsp;
6188 BiostrucFeaturePtr bfp;
6189 ValNodePtr location, vnp;
6190 ChemGraphAlignmentPtr cgap;
6191 Int4 thisslave;
6192
6193 pMMDBid = MemNew(250 * sizeof(Int4));
6194 basp = pcdd->features;
6195 if (!basp) return FALSE;
6196 bssp = (BioseqSetPtr) pcdd->sequences->data.ptrvalue;
6197 /*---------------------------------------------------------------------------*/
6198 /* create a list of MMDB-Id pairs which need to be checked */
6199 /*---------------------------------------------------------------------------*/
6200 salp = pcdd->seqannot->data;
6201 while (salp) {
6202 ddp = salp->segs;
6203 sip1 = ddp->id;
6204 sip2 = ddp->id->next;
6205 if (sip1->choice == SEQID_PDB) {
6206 if (nStruct == 0) {
6207 bsp = CddFindSip(sip1,bssp->seq_set);
6208 CddFindMMDBIdInBioseq(bsp,&pMMDBid[nStruct]);
6209 nStruct++;
6210 }
6211 }
6212 if (sip2->choice == SEQID_PDB) {
6213 bsp = CddFindSip(sip2,bssp->seq_set);
6214 CddFindMMDBIdInBioseq(bsp,&pMMDBid[nStruct]);
6215 nStruct++;
6216 }
6217 salp = salp->next;
6218 }
6219 /*---------------------------------------------------------------------------*/
6220 /* now walk down the biostruc annot set and remove all pairs encountered */
6221 /*---------------------------------------------------------------------------*/
6222 bfsp = basp->features;
6223 while (bfsp) {
6224 bfp = bfsp->features;
6225 while (bfp) {
6226 location = bfp->Location_location;
6227 if (location->choice == Location_location_alignment) {
6228 cgap = location->data.ptrvalue;
6229 vnp = cgap->biostruc_ids;
6230 thisslave = vnp->next->data.intvalue;
6231 for (i=1;i<nStruct;i++) {
6232 if (thisslave == pMMDBid[i]) {
6233 pMMDBid[i] = 0; break;
6234 }
6235 }
6236 }
6237 bfp = bfp->next;
6238 }
6239 bfsp = bfsp->next;
6240 }
6241
6242 /*---------------------------------------------------------------------------*/
6243 /* at this point check if any pair is left (i.e. any mmdb-id except master) */
6244 /*---------------------------------------------------------------------------*/
6245 for (i=1;i<nStruct;i++) {
6246 if (pMMDBid[i]) {
6247 MemFree(pMMDBid); return(FALSE);
6248 }
6249 }
6250 MemFree(pMMDBid); return TRUE;
6251 }
6252
6253 /*---------------------------------------------------------------------------*/
6254 /*---------------------------------------------------------------------------*/
6255 /* CD has unfinished alignment business */
6256 /*---------------------------------------------------------------------------*/
6257 /*---------------------------------------------------------------------------*/
CddHasPendingAlignments(CddPtr pcdd)6258 Boolean LIBCALL CddHasPendingAlignments(CddPtr pcdd)
6259 {
6260 if (pcdd->pending) return TRUE;
6261 return FALSE;
6262 }
6263
6264 /*---------------------------------------------------------------------------*/
6265 /*---------------------------------------------------------------------------*/
6266 /* check
6267 /*---------------------------------------------------------------------------*/
6268 /*---------------------------------------------------------------------------*/
SeqHasTax(BioseqPtr bsp)6269 Boolean LIBCALL SeqHasTax(BioseqPtr bsp)
6270 {
6271 ValNodePtr descr;
6272
6273 descr = bsp->descr;
6274 while (descr) {
6275 if (descr->choice == Seq_descr_source) {
6276 return(TRUE);
6277 }
6278 descr = descr->next;
6279 }
6280 return(FALSE);
6281 }
6282
6283 /*---------------------------------------------------------------------------*/
6284 /*---------------------------------------------------------------------------*/
6285 /* Fetch bioseq using BLAST dB, removing ids redundant to query. */
6286 /* index is the NR index in the dB -- if you don't have it, use -1. */
6287 /*---------------------------------------------------------------------------*/
6288 /*---------------------------------------------------------------------------*/
6289 NLM_EXTERN BlastDefLineSetPtr LIBCALL BlastDefLineSetFree PROTO ((BlastDefLineSetPtr ));
6290
CddReadDBGetBioseq(SeqIdPtr query,Int4 index,ReadDBFILEPtr rdfp)6291 BioseqPtr LIBCALL CddReadDBGetBioseq(SeqIdPtr query, Int4 index,
6292 ReadDBFILEPtr rdfp)
6293 {
6294 return CddReadDBGetBioseqEx(query,index,rdfp,TRUE);
6295 }
6296
CddReadDBGetBioseqEx(SeqIdPtr query,Int4 index,ReadDBFILEPtr rdfp,Boolean bUseObjMgr)6297 BioseqPtr LIBCALL CddReadDBGetBioseqEx(SeqIdPtr query, Int4 index,
6298 ReadDBFILEPtr rdfp, Boolean bUseObjMgr)
6299 {
6300 BioseqPtr bsp;
6301 BlastDefLinePtr bdp,tbdp;
6302 RDBTaxNamesPtr tnames;
6303 Char buf[64];
6304
6305 if(query == NULL) return NULL;
6306 if(index < 0 && (index = SeqId2OrdinalId(rdfp, query)) < 0)
6307 return NULL;
6308 if(!(bsp = readdb_get_bioseq_ex(rdfp, index, bUseObjMgr, FALSE))) return NULL;
6309 if(!(tbdp = FDGetDeflineAsnFromBioseq(bsp))) return NULL;
6310
6311 for(bdp=tbdp; bdp; bdp=bdp->next) {
6312 if(SeqIdComp(bdp->seqid,query) == SIC_YES ||
6313 SeqIdComp(bdp->seqid->next,query) == SIC_YES) {
6314 tnames = RDBGetTaxNames(rdfp->taxinfo, bdp->taxid);
6315 break;
6316 }
6317 }
6318 if(bdp == NULL) {
6319 SeqIdPrint(query,buf,PRINTID_FASTA_SHORT);
6320 ErrPostEx(SEV_WARNING,0,0,"Bad seqid %s",buf);
6321 return NULL;
6322 }
6323 bsp->id = SeqIdSetFree(bsp->id);
6324 bsp->id = SeqIdDup(bdp->seqid->next);
6325 bsp->id->next = SeqIdDup(bdp->seqid);
6326
6327 /* Remove concatenated deflines. */
6328 SeqDescrFree(ValNodeExtractList(&bsp->descr, Seq_descr_title));
6329 /* Replace with single defline. */
6330 ValNodeCopyStr(&bsp->descr, Seq_descr_title, bdp->title);
6331
6332 tbdp = BlastDefLineSetFree(tbdp);
6333
6334 /* Remove all user data (tax info, asn1 defline). */
6335 SeqDescrFree(ValNodeExtractList(&bsp->descr, Seq_descr_user));
6336
6337 if(tnames) { /* Add taxid, taxnames. */
6338 BioSourcePtr bsrp;
6339 DbtagPtr dtp;
6340
6341 bsrp = BioSourceNew();
6342 bsrp->subtype = NULL;
6343 bsrp->org = OrgRefNew();
6344 if(tnames->sci_name)
6345 bsrp->org->taxname = StringSave(tnames->sci_name);
6346 if(tnames->common_name)
6347 bsrp->org->common = StringSave(tnames->common_name);
6348 dtp = DbtagNew();
6349 dtp->db = StringSave("taxon");
6350 dtp->tag = ObjectIdNew();
6351 dtp->tag->id = tnames->tax_id;
6352 bsrp->org->db = NULL;
6353 ValNodeAddPointer(&bsrp->org->db, 0, dtp);
6354 ValNodeAddPointer(&bsp->descr, Seq_descr_source, bsrp);
6355 }
6356 RDBTaxNamesFree(tnames);
6357
6358 /* if (!SeqHasTax(bsp)) {
6359 printf("Warning: Bioseq without taxonomy!");
6360 if (query->choice == SEQID_GI) {
6361 printf("GI: %d\n",query->data.intvalue);
6362 } else printf("\n");
6363 } */
6364 return bsp;
6365 }
6366
6367 /*---------------------------------------------------------------------------*/
6368 /*---------------------------------------------------------------------------*/
6369 /* return a common style dictionary for Cn3D 4.0 */
6370 /*---------------------------------------------------------------------------*/
6371 /*---------------------------------------------------------------------------*/
6372 /*
6373 Cn3dStyleDictionaryPtr LIBCALL CddSrvGetStyle2(Int4 *styles[], Int4 nstyles)
6374 {
6375 Cn3dStyleDictionaryPtr csdp;
6376 Cn3dStyleTableItemPtr cstip,cstipTail,cstipHead=NULL;
6377 Int4 i;
6378
6379 if(nstyles < 1) return(NULL);
6380 csdp = Cn3dStyleDictionaryNew();
6381 csdp->global_style = CddSrvGetStyle2_Ex(styles[0]);
6382 for(i=1; i<nstyles; i++) {
6383 cstip = Cn3dStyleTableItemNew();
6384 cstip->id = i;
6385 cstip->style = CddSrvGetStyle2_Ex(styles[i]);
6386 cstip->next = NULL;
6387 if(!cstipHead) cstipHead = cstipTail = cstip;
6388 else {
6389 cstipTail->next = cstip;
6390 cstipTail = cstip;
6391 }
6392 }
6393 csdp->style_table = cstipHead;
6394
6395 return(csdp);
6396 }
6397 static Cn3dStyleSettingsPtr CddSrvGetStyle2_Ex(Int4 style[])
6398 {
6399 Cn3dStyleSettingsPtr cssp;
6400 Cn3dBackboneStylePtr cbsp;
6401 Cn3dGeneralStylePtr cgsp;
6402 Cn3dBackboneLabelStylePtr cblsp;
6403
6404 cssp = Cn3dStyleSettingsNew();
6405 cssp->next = NULL;
6406 cssp->name = NULL;
6407 cbsp = Cn3dBackboneStyleNew();
6408 cbsp->type = style[0];
6409 cbsp->style = style[1];
6410 cbsp->color_scheme = style[2];
6411 cbsp->user_color =
6412 MyCn3dColorInit(style[3],style[4],style[5],
6413 style[6],style[7]);
6414 cssp->protein_backbone = cbsp;
6415 cbsp = Cn3dBackboneStyleNew();
6416 cbsp->type = style[8];
6417 cbsp->style = style[9];
6418 cbsp->color_scheme = style[10];
6419 cbsp->user_color =
6420 MyCn3dColorInit(style[11],style[12],style[13],
6421 style[14],style[15]);
6422 cssp->nucleotide_backbone = cbsp;
6423 cgsp = Cn3dGeneralStyleNew();
6424 cgsp->is_on = style[16];
6425 cgsp->style = style[17];
6426 cgsp->color_scheme = style[18];
6427 cgsp->user_color =
6428 MyCn3dColorInit(style[19],style[20],style[21],
6429 style[22],style[23]);
6430 cssp->protein_sidechains = cgsp;
6431 cgsp = Cn3dGeneralStyleNew();
6432 cgsp->is_on = style[24];
6433 cgsp->style = style[25];
6434 cgsp->color_scheme = style[26];
6435 cgsp->user_color =
6436 MyCn3dColorInit(style[27],style[28],style[29],
6437 style[30],style[31]);
6438 cssp->nucleotide_sidechains = cgsp;
6439 cgsp = Cn3dGeneralStyleNew();
6440 cgsp->is_on = style[32];
6441 cgsp->style = style[33];
6442 cgsp->color_scheme = style[34];
6443 cgsp->user_color =
6444 MyCn3dColorInit(style[35],style[36],style[37],
6445 style[38],style[39]);
6446 cssp->heterogens = cgsp;
6447 cgsp = Cn3dGeneralStyleNew();
6448 cgsp->is_on = style[40];
6449 cgsp->style = style[41];
6450 cgsp->color_scheme = style[42];
6451 cgsp->user_color =
6452 MyCn3dColorInit(style[43],style[44],style[45],
6453 style[46],style[47]);
6454 cssp->solvents = cgsp;
6455 cgsp = Cn3dGeneralStyleNew();
6456 cgsp->is_on = style[48];
6457 cgsp->style = style[49];
6458 cgsp->color_scheme = style[50];
6459 cgsp->user_color =
6460 MyCn3dColorInit(style[51],style[52],style[53],
6461 style[54],style[55]);
6462 cssp->connections = cgsp;
6463 cgsp = Cn3dGeneralStyleNew();
6464 cgsp->is_on = style[56];
6465 cgsp->style = style[57];
6466 cgsp->color_scheme = style[58];
6467 cgsp->user_color =
6468 MyCn3dColorInit(style[59],style[60],style[61],
6469 style[62],style[63]);
6470 cssp->helix_objects = cgsp;
6471 cgsp = Cn3dGeneralStyleNew();
6472 cgsp->is_on = style[64];
6473 cgsp->style = style[65];
6474 cgsp->color_scheme = style[66];
6475 cgsp->user_color =
6476 MyCn3dColorInit(style[67],style[68],style[69],
6477 style[70],style[71]);
6478 cssp->strand_objects = cgsp;
6479 cssp->virtual_disulfides_on = style[72];
6480 cssp->virtual_disulfide_color =
6481 MyCn3dColorInit(style[73],style[74],style[75],
6482 style[76],style[77]);
6483 cssp->hydrogens_on = style[78];
6484 cssp->background_color =
6485 MyCn3dColorInit(style[79],style[80],style[81],
6486 style[82],style[83]);
6487 cssp->scale_factor = style[84];
6488 cssp->space_fill_proportion = style[85];
6489 cssp->ball_radius = style[86];
6490 cssp->stick_radius = style[87];
6491 cssp->tube_radius = style[88];
6492 cssp->tube_worm_radius = style[89];
6493 cssp->helix_radius = style[90];
6494 cssp->strand_width = style[91];
6495 cssp->strand_thickness = style[92];
6496 cblsp = Cn3dBackboneLabelStyleNew();
6497 cblsp->spacing = style[93];
6498 cblsp->type = style[94];
6499 cblsp->number = style[95];
6500 cblsp->termini = style[96];
6501 cblsp->white = style[97];
6502 cssp->protein_labels = cblsp;
6503 cblsp = Cn3dBackboneLabelStyleNew();
6504 cblsp->spacing = style[98];
6505 cblsp->type = style[99];
6506 cblsp->number = style[100];
6507 cblsp->termini = style[101];
6508 cblsp->white = style[102];
6509 cssp->nucleotide_labels =cblsp;
6510 cssp->ion_labels = style[103];
6511
6512 return(cssp);
6513 }
6514 static Cn3dColorPtr MyCn3dColorInit(Int4 scale_factor, Int4 red, Int4 green, Int4 blue,
6515 Int4 alpha)
6516 {
6517 Cn3dColorPtr ccp;
6518
6519 ccp = Cn3dColorNew();
6520 ccp->scale_factor = scale_factor;
6521 ccp->red = red;
6522 ccp->green = green;
6523 ccp->blue = blue;
6524 ccp->alpha = alpha;
6525
6526 return ccp;
6527 }
6528 */
6529