1 /* $Id: cddutil.c,v 1.96 2007/05/07 13:27:49 kans Exp $
2 *===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name:  cddutil.c
27 *
28 * Author:  Aron Marchler-Bauer
29 *
30 * Initial Version Creation Date: 10/18/1999
31 *
32 * $Revision: 1.96 $
33 *
34 * File Description: CDD utility routines
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * $Log: cddutil.c,v $
39 * Revision 1.96  2007/05/07 13:27:49  kans
40 * added casts for Seq-data.gap (SeqDataPtr, SeqGapPtr, ByteStorePtr)
41 *
42 * Revision 1.95  2007/01/22 20:16:38  kans
43 * added new parameter to posPurgeMatches from posit.h
44 *
45 * Revision 1.94  2005/08/31 20:34:30  coulouri
46 * From Mike Gertz:
47 * cddutil.c in CddReadBlastOptions:
48 *   - Removed manipulation of hitlist_size as it is no longer correct to
49 *     manipulate the hitlist_size that way.
50 *   - Increase options->expect_value for composition-based statistics,
51 *     mode > 1 (although the routine does not yet understand mode > 1).
52 *
53 * Revision 1.93  2004/11/05 15:36:28  coulouri
54 * remove explicit stdio.h include; it causes LONG_BIT to be defined incorrectly on amd64
55 *
56 * Revision 1.92  2004/06/22 14:20:19  camacho
57 * Changed invocation of posFreqsToMatrix to conform with new signature.
58 *
59 * Revision 1.91  2003/12/09 22:21:35  bauer
60 * added CddCountResTypes
61 *
62 * Revision 1.90  2003/12/09 18:48:05  bauer
63 * commented out deprecated fields in BlastKarlinBlkCopy
64 *
65 * Revision 1.89  2003/10/07 21:19:39  bauer
66 * made CddMasterIs3D more general
67 *
68 * Revision 1.88  2003/08/25 19:09:47  bauer
69 * added SeqAlignReadFromFile
70 *
71 * Revision 1.87  2003/05/22 18:39:14  bauer
72 * list txids in Cdd XML dumper
73 *
74 * Revision 1.86  2003/05/21 17:25:21  bauer
75 * optional ObjMgr in CddReadDBGetBioseq
76 *
77 * Revision 1.85  2003/05/09 20:38:12  bauer
78 * report invalid intervals in CddRelocateSeqLoc
79 *
80 * Revision 1.84  2003/05/08 14:40:09  bauer
81 * bugfix in CddConsensus
82 *
83 * Revision 1.83  2003/04/25 17:17:32  thiessen
84 * fix return value type
85 *
86 * Revision 1.82  2003/04/25 14:36:20  bauer
87 * impalaScaling now returns boolean value
88 *
89 * Revision 1.81  2003/02/06 21:04:27  bauer
90 * fixed bug in reindexing to consensus
91 *
92 * Revision 1.80  2002/12/03 14:36:31  bauer
93 * added CddMSLMixedToMSLDenDiag
94 *
95 * Revision 1.79  2002/11/22 21:35:23  bauer
96 * added SeqAnnotReadFromFile and preservation of scores in DenseSeg to DenseDiag conversion
97 *
98 * Revision 1.78  2002/10/22 14:54:27  bauer
99 * fixed date format in XML dumper
100 *
101 * Revision 1.77  2002/10/10 20:38:19  bauer
102 * changes to accomodate new spec items
103 * - old-root node
104 * - curation-status
105 *
106 * Revision 1.76  2002/10/02 17:32:21  bauer
107 * avoid merging blocks when reindexing alignments
108 *
109 * Revision 1.75  2002/08/17 11:55:07  bauer
110 * backed out changes
111 *
112 * Revision 1.74  2002/08/16 19:51:45  bauer
113 * added Ben's CddSrvGetStyle2
114 *
115 * Revision 1.73  2002/08/06 12:54:26  bauer
116 * fixes to accomodate COGs
117 *
118 * Revision 1.72  2002/07/31 17:15:29  kans
119 * added prototype for BlastDefLineSetFree for Mac compiler
120 *
121 * Revision 1.71  2002/07/31 14:58:58  bauer
122 * BLAST DB Sequence Retrieval
123 *
124 * Revision 1.70  2002/07/17 18:54:10  bauer
125 * CddFeaturesAreConsistent now returns explicit error messages
126 *
127 * Revision 1.69  2002/07/11 14:43:43  bauer
128 * column 21(X) is now always -1 in PSSMs
129 *
130 * Revision 1.68  2002/07/10 22:51:10  bauer
131 * changed score for 26th pssm column to match whats done by makemat
132 *
133 * Revision 1.67  2002/07/10 22:06:07  bauer
134 * replaced posScaling by impalaScaling
135 *
136 * Revision 1.66  2002/07/10 15:34:31  bauer
137 * made SipIsConsensus public
138 *
139 * Revision 1.65  2002/07/09 21:12:39  bauer
140 * added CddDenDiagCposComp2KBP to return Karlin-Altschul parameters together with PSSM
141 *
142 * Revision 1.64  2002/07/05 21:09:25  bauer
143 * added GetAlignmentSize
144 *
145 * Revision 1.63  2002/06/12 15:22:51  bauer
146 * 6/11/02 update
147 *
148 * Revision 1.62  2002/05/31 17:54:31  thiessen
149 * fix for read-only string literal (e.g. on Mac)
150 *
151 * Revision 1.61  2002/05/24 17:49:01  bauer
152 * Unlock Bioseqs again in SeqAlignInform
153 *
154 * Revision 1.60  2002/05/16 22:37:49  bauer
155 * free seqalign in CddExpAlignToSeqAlign if empty
156 *
157 * Revision 1.59  2002/05/06 16:59:50  bauer
158 * remove blanks in inferred CD short names
159 *
160 * Revision 1.58  2002/04/25 14:30:19  bauer
161 * fixed CddFindMMDBIdInBioseq
162 *
163 * Revision 1.57  2002/04/22 16:37:31  bauer
164 * added check for missing structure alignments
165 *
166 * Revision 1.56  2002/04/18 21:00:26  bauer
167 * added check CddFeaturesAreConsistent
168 *
169 * Revision 1.55  2002/04/12 14:02:43  bauer
170 * added update_date case to CddAssignDescr
171 *
172 * Revision 1.54  2002/04/11 14:34:25  bauer
173 * added CddRemoveConsensus
174 *
175 * Revision 1.53  2002/03/28 15:55:00  bauer
176 * fixed bug in CddFindBioseqInSeqEntry, thanks to Ben
177 *
178 * Revision 1.52  2002/02/20 22:27:28  bauer
179 * utility functions for the CD-Server
180 *
181 * Revision 1.51  2002/02/12 23:00:46  bauer
182 * added missing break in CddRelocateSeqLoc
183 *
184 * Revision 1.50  2002/02/06 19:35:37  bauer
185 * some more CddGet.. functionality
186 *
187 * Revision 1.49  2002/02/05 23:15:40  bauer
188 * added a few CddGet.. utility functions, changes to CddAddDescr allow to extend scrapbook line by line
189 *
190 * Revision 1.48  2002/01/28 14:11:29  bauer
191 * add score to entrez neighbor lists
192 *
193 * Revision 1.47  2002/01/05 14:49:44  bauer
194 * made SeqAlignDup a local function
195 *
196 * Revision 1.46  2002/01/04 19:46:56  bauer
197 * added functions to interconvert PSSM-Ids and accessions
198 *
199 * Revision 1.45  2001/11/15 15:35:13  kans
200 * changed strdup to StringSave for Mac
201 *
202 * Revision 1.44  2001/11/13 19:51:52  bauer
203 * support for annotation transfer in alignment reindexing
204 *
205 * Revision 1.43  2001/05/24 15:02:22  kans
206 * included salpacc for Mac compiler
207 *
208 * Revision 1.42  2001/05/23 21:18:05  bauer
209 * added functions for alignment block structure control
210 *
211 * Revision 1.41  2001/04/11 19:49:56  kans
212 * include profiles.h for Mac compiler
213 *
214 * Revision 1.40  2001/04/10 20:26:23  bauer
215 * *** empty log message ***
216 *
217 * Revision 1.39  2001/04/10 20:18:09  bauer
218 * write out ascii-formatted mtx-Files for copymat
219 *
220 * Revision 1.38  2001/03/07 20:27:37  bauer
221 * fixed bug in CddBioseqCopy
222 *
223 * Revision 1.37  2001/03/07 16:30:33  bauer
224 * fixed alignment reindexing bug
225 *
226 * Revision 1.36  2001/03/05 20:25:09  bauer
227 * fixed gap-penalty selection when changing scoring matrix
228 *
229 * Revision 1.35  2001/02/15 15:41:41  shoemake
230 * fixed small memory leak
231 *
232 * Revision 1.34  2001/02/14 17:11:03  bauer
233 * relaced calls to BioseqCopy with CddBioseqCopy
234 *
235 * Revision 1.33  2001/02/13 20:55:10  bauer
236 * fixed bug when comparing local ids
237 *
238 * Revision 1.32  2001/02/07 11:36:51  thiessen
239 * fix minor memory leak
240 *
241 * Revision 1.31  2001/02/06 22:55:35  bauer
242 * Scoring Matrix now a function parameter in CddDenDiagCposComp2
243 *
244 * Revision 1.30  2001/02/06 20:56:10  hurwitz
245 * get rid of problematic BioSeqNew
246 *
247 * Revision 1.29  2001/02/05 22:58:46  bauer
248 * added alignment reindexing function
249 *
250 * Revision 1.28  2001/02/01 17:15:15  bauer
251 * removed SeqIdDup in CddReadBlastOptions
252 *
253 * Revision 1.27  2001/01/26 15:06:40  lewisg
254 * use entrez2 to retrieve structures
255 *
256 * Revision 1.26  2001/01/24 03:08:09  bauer
257 * fixed memory leaks
258 *
259 * Revision 1.25  2001/01/22 21:25:06  hurwitz
260 * bioseq id shouldn't be freed
261 *
262 * Revision 1.24  2001/01/19 23:34:55  bauer
263 * fixed memory leaks
264 *
265 * Revision 1.23  2001/01/19 21:43:44  bauer
266 * removed dependency from NR for threading PSSM calculations
267 *
268 * Revision 1.22  2001/01/17 21:32:02  bauer
269 * changes to PSSM calculation
270 *
271 * Revision 1.21  2001/01/12 01:33:32  bauer
272 * removed CddGetBlastOptions
273 *
274 * Revision 1.20  2001/01/12 00:17:01  bauer
275 * added routines for information content calculation
276 *
277 * Revision 1.19  2001/01/11 23:48:24  bauer
278 * added check for consensus-Cd
279 *
280 * Revision 1.18  2000/12/29 00:52:51  hurwitz
281 * cleaning memory leaks
282 *
283 * Revision 1.17  2000/12/20 18:56:40  hurwitz
284 * new random num gen, more debug printing
285 *
286 * Revision 1.16  2000/12/01 19:36:57  hurwitz
287 * added scale factor to PSSM calcs
288 *
289 * Revision 1.15  2000/11/22 22:34:49  hurwitz
290 * changed pssm scale factor to match contact potential scale factor
291 *
292 * Revision 1.14  2000/11/14 22:08:44  hurwitz
293 * added weighting for pssm calculation
294 *
295 * Revision 1.13  2000/10/10 18:55:06  shoemake
296 * fixed memory error in CddAssignProfileRange
297 *
298 * Revision 1.12  2000/09/08 21:43:51  hurwitz
299 * adding PSSM calculation to DDE
300 *
301 * Revision 1.11  2000/09/01 21:59:04  hurwitz
302 * re-order columns from PSSM of CDs to column order expected in threading
303 *
304 * Revision 1.10  2000/08/30 21:33:55  hurwitz
305 * added new and free functions for Seq_Mtf and Qry_Seq
306 *
307 * Revision 1.9  2000/08/30 16:03:57  bauer
308 * fixed GCC compiler warning fo CddGetPairId
309 *
310 * Revision 1.8  2000/08/14 21:52:04  hurwitz
311 * added CddCalcPSSM
312 *
313 * Revision 1.7  2000/08/14 19:36:26  hurwitz
314 * got CddCposComp working and tested
315 *
316 * Revision 1.6  2000/08/11 19:54:00  hurwitz
317 * restored CddDenDiagCposComputation and CddCposComputation to original, added CddCposComp which combines the 2
318 *
319 * Revision 1.5  2000/08/10 18:18:59  kans
320 * commented out direct calls to ID1 services
321 *
322 * Revision 1.4  2000/08/10 16:50:06  kans
323 * use StringSave instead of unavailable strdup
324 *
325 * Revision 1.3  2000/08/09 21:29:08  hurwitz
326 * adding cddutil.c to VC++ build
327 *
328 * Revision 1.2  2000/07/19 19:32:55  bauer
329 * added modification logging
330 *
331 *
332 * ==========================================================================
333 */
334 
335 
336 #include <ncbi.h>
337 /*#include <accentr.h>*/
338 #include <lsqfetch.h>
339 /* #include <netentr.h> */
340 /* #include <www.h> */
341 /* #include <sys/resource.h> */
342 /*#include <accutils.h>*/
343 #include <mmdbapi.h>
344 #include <mmdbapi1.h>
345 /* #include <asnmime.h> */
346 #include <objseq.h>
347 #include <objmime.h>
348 #include <strimprt.h>
349 #include <cdd.h>
350 /* #include <objcdd.h> */
351 #include <readdb.h>
352 #include <fdlobj.h>
353 #include <cddutil.h>
354 #include <edutil.h>
355 #include <posit.h>
356 #include <cddposutil.h>
357 #include <blastpri.h>
358 #include <accid1.h>
359 #include <thrddecl.h>
360 #include <profiles.h>
361 #include <salpacc.h>
362 
363 /* Multiplier by which to increase the evalue when mode > 1 composition-based
364  * statistics is being used. */
365 #define EVALUE_EXPAND 10
366 
367 static void CddCposCompPart1(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp,
368                              compactSearchItems* compactSearch, ValNodePtr* LetterHead,
369                              posSearchItems* posSearch);
370 
371 /*---------------------------------------------------------------------------*/
372 /*---------------------------------------------------------------------------*/
373 /* Cdd asn1 reader and writer wrappers                                       */
374 /*---------------------------------------------------------------------------*/
375 /*---------------------------------------------------------------------------*/
CddWriteToFile(CddPtr pcdd,CharPtr cFile,Boolean bBin)376 Boolean LIBCALL CddWriteToFile(CddPtr pcdd, CharPtr cFile, Boolean bBin)
377 {
378   AsnIoPtr aip = NULL;
379   Boolean  bWriteOK;
380 
381   if (!pcdd) return(FALSE);
382   if (bBin) {
383      aip = AsnIoOpen(cFile,"wb");
384   } else {
385      aip = AsnIoOpen(cFile,"w");
386   }
387   bWriteOK = CddAsnWrite(pcdd,aip,NULL);
388   AsnIoClose(aip);
389   return(bWriteOK);
390 }
391 
392 
CddReadFromFile(CharPtr cFile,Boolean bBin)393 CddPtr LIBCALL CddReadFromFile(CharPtr cFile, Boolean bBin)
394 {
395   AsnIoPtr aip = NULL;
396   CddPtr   pcdd;
397 
398   if (bBin) {
399     aip = AsnIoOpen(cFile,"rb");
400   } else {
401     aip = AsnIoOpen(cFile,"r");
402   }
403   pcdd = CddAsnRead(aip,NULL);
404   AsnIoClose(aip);
405   return(pcdd);
406 }
407 
SeqAnnotReadFromFile(CharPtr cFile,Boolean bBin)408 SeqAnnotPtr LIBCALL SeqAnnotReadFromFile(CharPtr cFile, Boolean bBin)
409 {
410   AsnIoPtr    aip = NULL;
411   SeqAnnotPtr sap;
412 
413   if (bBin) {
414     aip = AsnIoOpen(cFile,"rb");
415   } else {
416     aip = AsnIoOpen(cFile,"r");
417   }
418   sap = SeqAnnotAsnRead(aip, NULL);
419   AsnIoClose(aip);
420   return(sap);
421 }
422 
SeqAlignReadFromFile(CharPtr cFile,Boolean bBin)423 SeqAlignPtr LIBCALL SeqAlignReadFromFile(CharPtr cFile, Boolean bBin)
424 {
425   AsnIoPtr    aip = NULL;
426   SeqAlignPtr salp;
427 
428   if (bBin) {
429     aip = AsnIoOpen(cFile,"rb");
430   } else {
431     aip = AsnIoOpen(cFile,"r");
432   }
433   salp = SeqAlignAsnRead(aip, NULL);
434   AsnIoClose(aip);
435   return(salp);
436 }
437 
438 /*---------------------------------------------------------------------------*/
439 /*---------------------------------------------------------------------------*/
440 /* Cdd-tree asn1 reader and writer wrappers                                  */
441 /*---------------------------------------------------------------------------*/
442 /*---------------------------------------------------------------------------*/
CddTreeWriteToFile(CddTreePtr pcddt,CharPtr cFile,Boolean bBin)443 Boolean LIBCALL CddTreeWriteToFile(CddTreePtr pcddt, CharPtr cFile, Boolean bBin)
444 {
445   AsnIoPtr aip = NULL;
446   Boolean  bWriteOK;
447 
448   if (!pcddt) return(FALSE);
449   if (bBin) {
450      aip = AsnIoOpen(cFile,"wb");
451   } else {
452      aip = AsnIoOpen(cFile,"w");
453   }
454   bWriteOK = CddTreeAsnWrite(pcddt,aip,NULL);
455   AsnIoClose(aip);
456   return(bWriteOK);
457 }
458 
459 
CddTreeReadFromFile(CharPtr cFile,Boolean bBin)460 CddTreePtr LIBCALL CddTreeReadFromFile(CharPtr cFile, Boolean bBin)
461 {
462   AsnIoPtr     aip = NULL;
463   CddTreePtr   pcddt;
464 
465   if (bBin) {
466      aip = AsnIoOpen(cFile,"rb");
467      pcddt = (CddTreePtr) CddTreeAsnRead(aip,NULL);
468   } else {
469      aip = AsnIoOpen(cFile,"r");
470      pcddt = (CddTreePtr) CddTreeAsnRead(aip,NULL);
471   }
472 
473   AsnIoClose(aip);
474   return(pcddt);
475 }
476 
477 /*---------------------------------------------------------------------------*/
478 /*---------------------------------------------------------------------------*/
479 /* Functions to add to a Cdd data structure                                  */
480 /*---------------------------------------------------------------------------*/
481 /*---------------------------------------------------------------------------*/
CddAssignDescr(CddPtr pcdd,Pointer pThis,Int4 iWhat,Int4 iIval)482 void LIBCALL CddAssignDescr(CddPtr pcdd, Pointer pThis, Int4 iWhat, Int4 iIval)
483 {
484   ValNodePtr  vnp, vnp2;
485 
486   vnp = ValNodeNew(NULL);
487   vnp->choice = iWhat;
488   switch (iWhat) {
489     case CddDescr_othername:
490       vnp->data.ptrvalue = (CharPtr) pThis;
491       break;
492     case CddDescr_category:
493       vnp->data.ptrvalue = (CharPtr) pThis;
494       break;
495     case CddDescr_comment:
496       vnp->data.ptrvalue = (CharPtr) pThis;
497       break;
498     case CddDescr_reference:
499       vnp->data.ptrvalue = (ValNodePtr) pThis;
500       break;
501     case CddDescr_create_date:
502     case CddDescr_update_date:
503       vnp->data.ptrvalue = (DatePtr) pThis;
504       break;
505     case CddDescr_tax_source:
506       vnp->data.ptrvalue = (OrgRefPtr) pThis;
507       break;
508     case CddDescr_source:
509       vnp->data.ptrvalue = (CharPtr) pThis;
510       break;
511     case CddDescr_status:
512       vnp->data.intvalue = (Int4) iIval;
513       break;
514     case CddDescr_curation_status:
515       vnp->data.intvalue = (Int4) iIval;
516       break;
517     case CddDescr_old_root:
518       vnp->data.ptrvalue = (ValNodePtr) pThis;
519       break;
520     case CddDescr_scrapbook:
521       vnp = ValNodeExtractList(&pcdd->description, CddDescr_scrapbook);
522       vnp2 = ValNodeCopyStr(NULL,0,(CharPtr)pThis);
523       if(vnp) {
524 	if(vnp->next) ErrPostEx(SEV_WARNING,0,0,"Multiple scrapbooks");
525 	ValNodeLink((ValNodePtr *)&vnp->data.ptrvalue, vnp2);
526       } else ValNodeAddPointer(&vnp, CddDescr_scrapbook, vnp2);
527       break;
528     default:
529       vnp->data.ptrvalue = pThis;
530       break;
531   }
532   ValNodeLink(&(pcdd->description),vnp);
533 }
534 
535 
536 /*---------------------------------------------------------------------------*/
537 /*---------------------------------------------------------------------------*/
538 /* remove an item from the CD description. Works for a limited set of types. */
539 /* status and repeats are removed no matter what. For comments, references   */
540 /* etc. the content of the description is compared to the input              */
541 /*---------------------------------------------------------------------------*/
542 /*---------------------------------------------------------------------------*/
CddKillDescr(CddPtr pcdd,Pointer pThis,Int4 iWhat,Int4 iIval)543 Boolean LIBCALL CddKillDescr(CddPtr pcdd, Pointer pThis, Int4 iWhat, Int4 iIval)
544 {
545   ValNodePtr   vnp, vnpold = NULL, vnpthis = NULL, vnpbefore = NULL, pub;
546 
547   vnp = pcdd->description;
548   while (vnp) {
549     if (vnp->choice == iWhat) {
550       switch (iWhat) {
551         case CddDescr_comment:
552         case CddDescr_othername:
553         case CddDescr_category:
554         case CddDescr_source:
555           if (Nlm_StrCmp((CharPtr)pThis,(CharPtr)vnp->data.ptrvalue) == 0) {
556             if (!vnpthis) {
557 	      vnpthis = vnp;
558 	      vnpbefore = vnpold;
559             }
560 	  }
561           break;
562 	case CddDescr_status:
563 	case CddDescr_curation_status:
564 	case CddDescr_old_root:
565 	case CddDescr_repeats:
566           if (!vnpthis) {
567 	    vnpthis = vnp;
568 	    vnpbefore = vnpold;
569           }
570 	  break;
571         case CddDescr_reference:
572 	  pub = vnp->data.ptrvalue;
573 	  if ((pThis && pThis == pub) ||
574 	      (iIval>0 && pub->choice==PUB_PMid && pub->data.intvalue==iIval)) {
575 	    if (!vnpthis) {
576 	      vnpthis = vnp;
577 	      vnpbefore = vnpold;
578 	    }
579 	  }
580         default:
581 	  break;
582       }
583     }
584     vnpold = vnp;
585     vnp = vnp->next;
586   }
587   if (vnpthis) {
588     if (vnpbefore) {
589       vnpbefore->next = vnpthis->next;
590     } else {
591       pcdd->description = vnpthis->next;
592     }
593     return TRUE;
594   } else return FALSE;
595 }
596 
597 /*---------------------------------------------------------------------------*/
598 /*---------------------------------------------------------------------------*/
599 /* series of functions to extract info from CDs                              */
600 /*---------------------------------------------------------------------------*/
601 /*---------------------------------------------------------------------------*/
CddGetVersion(CddPtr pcdd)602 Int4 LIBCALL CddGetVersion(CddPtr pcdd)
603 {
604   CddIdPtr    cid;
605   GlobalIdPtr pGid;
606 
607   cid = pcdd->id; while (cid) {
608     if (cid->choice == CddId_gid) {
609       pGid = cid->data.ptrvalue;
610       return pGid->version;
611     }
612     cid = cid->next;
613   }
614   return 0;
615 }
616 
CddGetOrgRef(CddPtr pcdd)617 OrgRefPtr LIBCALL CddGetOrgRef(CddPtr pcdd)
618 {
619   CddDescrPtr   pcdsc;
620 
621   pcdsc = pcdd->description;
622   while (pcdsc) {
623     if (pcdsc->choice == CddDescr_tax_source) {
624       return (OrgRefPtr) pcdsc->data.ptrvalue;
625     }
626     pcdsc = pcdsc->next;
627   }
628   return NULL;
629 }
630 
CddGetPssmId(CddPtr pcdd)631 Int4 LIBCALL CddGetPssmId(CddPtr pcdd)
632 {
633   CddIdPtr cid;
634 
635   cid = pcdd->id; while (cid) {
636     if (cid->choice == CddId_uid) {
637       return cid->data.intvalue;
638     }
639     cid = cid->next;
640   }
641   return 0;
642 }
643 
CddGetPmIds(CddPtr pcdd,Int4Ptr iPMids)644 Int4 LIBCALL CddGetPmIds(CddPtr pcdd, Int4Ptr iPMids)
645 {
646   Int4          count = 0;
647   CddDescrPtr   pcdsc;
648   ValNodePtr    pub;
649   Boolean       bRefOpen = FALSE;
650 
651   if (iPMids) iPMids = MemNew(250*sizeof(Int4));
652   pcdsc = pcdd->description;
653   while (pcdsc) {
654     if (pcdsc->choice == CddDescr_reference) {
655       pub = pcdsc->data.ptrvalue;
656       if (pub->choice == PUB_PMid) {
657         iPMids[count] = pub->data.intvalue;
658         count++;
659         if (count >= 250) return count;
660       }
661     }
662     pcdsc = pcdsc->next;
663   }
664   return count;
665 }
666 
667 
CddGetDescr(CddPtr pcdd)668 CharPtr LIBCALL CddGetDescr(CddPtr pcdd)
669 {
670   CddDescrPtr   pcdsc;
671 
672   pcdsc = pcdd->description;
673   while (pcdsc) {
674     if (pcdsc->choice == CddDescr_comment) {
675       if (Nlm_StrCmp(pcdsc->data.ptrvalue,"linked to 3D-structure") != 0) {
676         return (CharPtr) pcdsc->data.ptrvalue;
677       }
678     }
679     pcdsc = pcdsc->next;
680   }
681   return NULL;
682 }
683 
CddGetCreateDate(CddPtr pcdd)684 DatePtr LIBCALL CddGetCreateDate(CddPtr pcdd)
685 {
686   CddDescrPtr pcdsc;
687 
688   pcdsc = pcdd->description;
689   while (pcdsc) {
690     if (pcdsc->choice == CddDescr_create_date) {
691       return (DatePtr) pcdsc->data.ptrvalue;
692     }
693     pcdsc = pcdsc->next;
694   }
695   return NULL;
696 }
697 
CddGetUpdateDate(CddPtr pcdd)698 DatePtr LIBCALL CddGetUpdateDate(CddPtr pcdd)
699 {
700   DatePtr pcd = NULL, pcdthis;
701   CddDescrPtr pcdsc;
702 
703   pcdsc = pcdd->description;
704   while (pcdsc) {
705     if (pcdsc->choice == CddDescr_update_date) {
706       pcdthis = (DatePtr) pcdsc->data.ptrvalue;
707       if (!pcd) {
708 	pcd = pcdthis;
709       } else {
710 	if (pcdthis->data[1] > pcd->data[1]) {
711 	  pcd = pcdthis;
712 	} else {
713 	  if (pcdthis->data[2] > pcd->data[2]) {
714 	    pcd = pcdthis;
715 	  } else {
716 	    if (pcdthis->data[3] > pcd->data[3]) pcd = pcdthis;
717 	  }
718 	}
719       }
720     }
721     pcdsc = pcdsc->next;
722   }
723   return pcd;
724 }
725 
CddGetSource(CddPtr pcdd)726 CharPtr LIBCALL CddGetSource(CddPtr pcdd)
727 {
728   CddDescrPtr pcdsc;
729 
730   pcdsc = pcdd->description;
731   while (pcdsc) {
732     if (pcdsc->choice == CddDescr_source) {
733       return (CharPtr) pcdsc->data.ptrvalue;
734     }
735     pcdsc = pcdsc->next;
736   }
737   return NULL;
738 }
739 
CddGetSourceId(CddPtr pcdd)740 CharPtr LIBCALL CddGetSourceId(CddPtr pcdd)
741 {
742   CddDescrPtr pcdsc;
743   GlobalIdPtr pGid;
744   ValNodePtr  vnp;
745 
746   pcdsc = pcdd->description;
747   while (pcdsc) {
748     if (pcdsc->choice == CddDescr_source_id) {
749       vnp = pcdsc->data.ptrvalue;
750       if (vnp->choice == CddId_gid) {
751         pGid = vnp->data.ptrvalue;
752         return (CharPtr) pGid->accession;
753       }
754     }
755     pcdsc = pcdsc->next;
756   }
757   return NULL;
758 }
759 
CddGetAlignmentLength(CddPtr pcdd)760 Int4 LIBCALL CddGetAlignmentLength(CddPtr pcdd)
761 {
762   SeqAnnotPtr sap;
763   SeqAlignPtr salp;
764   Int4        iLength = 0;
765 
766   sap = pcdd->seqannot;
767   if (sap) {
768     salp = (SeqAlignPtr) sap->data;
769     while (salp) {
770       iLength++;
771       salp = salp->next;
772     }
773   }
774   return iLength + 1;
775 }
776 
777 /*---------------------------------------------------------------------------*/
778 /* return number of rows and average number of blocks per row                */
779 /*---------------------------------------------------------------------------*/
GetAlignmentSize(SeqAlignPtr salp)780 Int4Ptr LIBCALL GetAlignmentSize(SeqAlignPtr salp)
781 {
782   Int4Ptr       pRet;
783   Int4          iLength = 0;
784   Int4          iBlocks = 0;
785   DenseDiagPtr  ddp;
786   SeqAlignPtr   salpThis;
787 
788   salpThis = salp;
789   while (salpThis) {
790     iLength++;
791     ddp = salpThis->segs;
792     while (ddp) {
793       iBlocks++;
794       ddp = ddp->next;
795     }
796     salpThis = salpThis->next;
797   }
798   pRet = MemNew(2*sizeof(Int4));
799   pRet[0] = iLength + 1;
800   pRet[1] = iBlocks / iLength;
801   return pRet;
802 }
803 
CddGetAnnotNames(CddPtr pcdd)804 ValNodePtr LIBCALL CddGetAnnotNames(CddPtr pcdd)
805 {
806   AlignAnnotPtr   aap;
807   ValNodePtr      vnp, vnpHead = NULL;
808 
809   aap = pcdd->alignannot;
810   while (aap) {
811     vnp = ValNodeCopyStr(&(vnpHead),0,aap->description);
812     aap = aap->next;
813   }
814   return (vnpHead);
815 }
816 
817 /*---------------------------------------------------------------------------*/
818 /*---------------------------------------------------------------------------*/
819 /* Query the status of a CD                                                  */
820 /*---------------------------------------------------------------------------*/
821 /*---------------------------------------------------------------------------*/
CddGetStatus(CddPtr pcdd)822 Int4 LIBCALL CddGetStatus(CddPtr pcdd)
823 {
824   CddDescrPtr    pCddesc;
825 
826   pCddesc = pcdd->description;
827   while (pCddesc) {
828     if (pCddesc->choice == CddDescr_status) {
829       return(pCddesc->data.intvalue);
830     }
831     pCddesc = pCddesc->next;
832   }
833   return 0;
834 }
835 
836 /*---------------------------------------------------------------------------*/
837 /* find a particular descriptive string in a CD                              */
838 /*---------------------------------------------------------------------------*/
CddHasDescription(CddPtr pcdd,CharPtr pc)839 Boolean LIBCALL CddHasDescription(CddPtr pcdd, CharPtr pc)
840 {
841   CddDescrPtr    pCddesc;
842 
843   if (!pcdd) return FALSE;
844   pCddesc = pcdd->description;
845   while (pCddesc) {
846     if (pCddesc->choice == CddDescr_comment) {
847       if (Nlm_StrCmp((CharPtr)pCddesc->data.ptrvalue,pc) == 0) return TRUE;
848     }
849     pCddesc = pCddesc->next;
850   }
851   return FALSE;
852 }
853 
854 /*---------------------------------------------------------------------------*/
855 /* Is the alignment decorated with feature annotation?                       */
856 /*---------------------------------------------------------------------------*/
CddHasAnnotation(CddPtr pcdd)857 Boolean LIBCALL CddHasAnnotation(CddPtr pcdd)
858 {
859   AlignAnnotPtr aap;
860 
861   aap = pcdd->alignannot;
862   if (aap) {
863     if (aap->description) return TRUE;
864   }
865   return FALSE;
866 }
867 
868 /*---------------------------------------------------------------------------*/
869 /*---------------------------------------------------------------------------*/
870 /* Is the CD master a 3D Structure                                           */
871 /*---------------------------------------------------------------------------*/
872 /*---------------------------------------------------------------------------*/
CddMasterIs3D(CddPtr pcdd)873 Boolean LIBCALL CddMasterIs3D(CddPtr pcdd)
874 {
875   SeqAlignPtr     salp;
876   DenseDiagPtr    ddp;
877   SeqIdPtr        sip;
878 
879   if (!pcdd) return FALSE;
880   if (!pcdd->seqannot) return FALSE;
881   salp = pcdd->seqannot->data;
882   ddp = salp->segs;
883   sip = ddp->id;
884   if (sip->choice == SEQID_PDB) return TRUE;
885   if (sip->choice == SEQID_LOCAL) {
886     sip = ddp->id->next;
887     if (sip->choice == SEQID_PDB) return TRUE;
888   }
889   return FALSE;
890 }
891 
892 /*---------------------------------------------------------------------------*/
893 /*---------------------------------------------------------------------------*/
894 /* How many structure-related alignments in a CDD?                           */
895 /*---------------------------------------------------------------------------*/
896 /*---------------------------------------------------------------------------*/
CddCount3DAlignments(CddPtr pcdd)897 Int4 LIBCALL CddCount3DAlignments(CddPtr pcdd)
898 {
899   Int4              n3dali = 0;
900   Boolean           bHasConsensus;
901   Boolean           bHas3DMaster;
902   SeqAlignPtr       salp;
903   DenseDiagPtr      ddp;
904   SeqIdPtr          sip;
905 
906 
907   bHasConsensus = CddHasConsensus(pcdd);
908   bHas3DMaster = CddMasterIs3D(pcdd);
909 
910   if (!bHas3DMaster) return(0);
911   salp = pcdd->seqannot->data;
912   while (salp) {
913     ddp = salp->segs;
914     sip = ddp->id->next;
915     if (sip->choice == SEQID_PDB) n3dali++;
916     salp = salp->next;
917   }
918   if (n3dali && bHasConsensus) n3dali--;
919   return (n3dali);
920 }
921 
922 /*---------------------------------------------------------------------------*/
923 /* Check if alignment master is a consensus Sequence                         */
924 /*---------------------------------------------------------------------------*/
SeqAlignHasConsensus(SeqAlignPtr salp)925 Boolean LIBCALL SeqAlignHasConsensus(SeqAlignPtr salp)
926 {
927   DenseDiagPtr      ddp;
928   SeqIdPtr          sip;
929   ObjectIdPtr       oidp;
930 
931   if (!salp) return(FALSE);
932   ASSERT(salp->segtype == SAS_DENDIAG);
933   ddp = salp->segs; if (!ddp) return (FALSE);
934   sip = ddp->id;    if (!sip) return (FALSE);
935   if (sip->choice != SEQID_LOCAL) return (FALSE);
936   oidp = sip->data.ptrvalue; if (!oidp) return(FALSE);
937   if (StringCmp(oidp->str,"consensus")==0) return (TRUE);
938   return(FALSE);
939 }
940 
941 /*---------------------------------------------------------------------------*/
942 /* Check if Cdd has a consensus Sequence                                     */
943 /*---------------------------------------------------------------------------*/
SipIsConsensus(SeqIdPtr sip)944 Boolean LIBCALL SipIsConsensus(SeqIdPtr sip)
945 {
946   ObjectIdPtr   oidp;
947 
948   if (!sip) return (FALSE);
949   if (sip->choice != SEQID_LOCAL) return (FALSE);
950   oidp = sip->data.ptrvalue; if (!oidp) return(FALSE);
951   if (StringCmp(oidp->str,"consensus")==0) return (TRUE);
952   return(FALSE);
953 }
954 
CddHasConsensus(CddPtr pcdd)955 Boolean LIBCALL CddHasConsensus(CddPtr pcdd)
956 {
957   SeqIntPtr     sintp;
958   SeqIdPtr      sip;
959   SeqAlignPtr   salp;
960   DenseDiagPtr  ddp;
961 
962   if (!pcdd) return (FALSE);
963   sintp = (SeqIntPtr) pcdd->profile_range;
964   if (!sintp) {
965     salp = pcdd->seqannot->data;
966     ddp = salp->segs;
967     sip = ddp->id;
968     return (SipIsConsensus(sip));
969   }
970   sip = (SeqIdPtr) sintp->id;
971   return(SipIsConsensus(sip));
972 }
973 
974 /*---------------------------------------------------------------------------*/
975 /*---------------------------------------------------------------------------*/
976 /* fix CD file names and accessions / Character Arrays must be allocated     */
977 /*---------------------------------------------------------------------------*/
978 /*---------------------------------------------------------------------------*/
CddRegularizeFileName(CharPtr cIn,CharPtr cAcc,CharPtr cFn,CharPtr cEx)979 void LIBCALL CddRegularizeFileName(CharPtr cIn, CharPtr cAcc, CharPtr cFn,
980                                    CharPtr cEx)
981 {
982   if (Nlm_StrStr(cIn, cEx) != 0) {
983     Nlm_StrCpy(cFn,cIn);
984     Nlm_StrNCpy(cAcc,cIn,Nlm_StrLen(cIn)-Nlm_StrLen(cEx));
985     cAcc[Nlm_StrLen(cIn)-Nlm_StrLen(cEx)] = '\0';
986   } else {
987     Nlm_StrCpy(cAcc,cIn);
988     Nlm_StrCpy(cFn,cIn);
989     if (cEx[0] == '.') {
990       Nlm_StrCat(cFn,cEx);
991     }
992     else {
993       Nlm_StrCat(cFn,".");
994       Nlm_StrCat(cFn,cEx);
995     }
996   }
997 }
998 
999 /*---------------------------------------------------------------------------*/
1000 /*---------------------------------------------------------------------------*/
1001 /* repeat detection - find repeated subsequences in a CD consensus Sequence  */
1002 /*---------------------------------------------------------------------------*/
1003 /*---------------------------------------------------------------------------*/
CddCheckForRepeats(CddPtr pcdd,Int4 width,Int4 GapI,Int4 GapE,Nlm_FloatHi rthresh,Boolean bOutput)1004 Boolean LIBCALL CddCheckForRepeats(CddPtr pcdd, Int4 width, Int4 GapI, Int4 GapE,
1005                                    Nlm_FloatHi rthresh, Boolean bOutput)
1006 {
1007   CddRepeatPtr        pcdr;
1008   BLAST_ScoreBlkPtr   sbp;
1009   BioseqPtr           bsp;
1010   Int4                len, winner = 1;
1011   Int4                **iscore, repct = 0;
1012   Int4                **fscore, bst, pnum, plen;
1013   Int2                iStatus;
1014   Int4                i, j, k, l, m, n, t1, t2, laststart;
1015   SeqPortPtr          spp;
1016   Uint1Ptr            buffer;
1017   Boolean             done = FALSE;
1018   Boolean             tbdone;
1019   Nlm_FloatHi         cfac, wplaus = 0.0;
1020   SeqLocPtr           slp, slpHead;
1021   SeqIntPtr           sintp;
1022 
1023   typedef struct repstep {
1024     Int4            score;
1025     Int4            start;
1026     Int4            npred;
1027     Int4            lpred;
1028     Int4            lremn;
1029     Nlm_FloatHi     plaus;
1030   } RepStep;
1031 
1032   RepStep            rep[100];
1033 
1034   if (!CddHasConsensus) return FALSE;
1035   bsp = pcdd->trunc_master;
1036   len = bsp->length;
1037   sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
1038   sbp->read_in_matrix = TRUE;
1039   sbp->protein_alphabet = TRUE;
1040   sbp->posMatrix = NULL;
1041   sbp->number_of_contexts = 1;
1042   iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
1043 
1044   spp = SeqPortNew(bsp, 0, LAST_RESIDUE, Seq_strand_unknown,Seq_code_ncbistdaa);
1045   buffer = MemNew((len)*sizeof(Uint1));
1046   for (i=0; i<len; i++) buffer[i] = SeqPortGetResidue(spp);
1047   spp = SeqPortFree(spp);
1048 
1049   iscore = (Int4 **) MemNew(len * sizeof(Int4 *));
1050   for (i=0;i<len;i++) iscore[i] = (Int4 *) MemNew(len * sizeof(Int4));
1051   fscore = (Int4 **) MemNew(len * sizeof(Int4 *));
1052   for (i=0;i<len;i++) fscore[i] = (Int4 *) MemNew(len * sizeof(Int4));
1053 
1054   for (i=0;i<len;i++) {
1055     t1 = buffer[i];
1056     iscore[i][i] = sbp->matrix[t1][t1];
1057     for (j=i+1;j<len;j++) {
1058       t2 = buffer[j];
1059       iscore[i][j] = sbp->matrix[t1][t2];
1060       fscore[j][i] = 0;
1061     }
1062   }
1063 
1064   laststart = 0;
1065   while (!done) {
1066     for (i=0;i<len;i++) for (j=i;j<len;j++) fscore[i][j] = iscore[i][j];
1067     for (i=len-2;i>=0;i--) {
1068       for (j=len-2;j>=i;j--) {
1069         bst = fscore[i+1][j+1];
1070         for (k = i+2;k<i+2+width && k<len;k++) bst = MAX(bst,fscore[k][j+1]-GapI-GapE*(k-i-1));
1071         for (l = j+2;l<j+2+width && l<len;l++) bst = MAX(bst,fscore[i+1][l]-GapI-GapE*(l-j-1));
1072         bst = MAX(bst, 0);
1073         fscore[i][j] += bst;
1074       }
1075     }
1076 
1077     bst = 0; l = 0;
1078     for (j=laststart;j<len;j++) if (fscore[0][j] >= bst) {
1079       bst = fscore[0][j]; l = j;
1080     }
1081     if (bst <= 0.0) {
1082       done = TRUE;
1083       break;
1084     }
1085     laststart = l;
1086 /*    printf("Next best alignment scores %4d at %4d (of %4d: %5.1f%%)\n",bst,l,len, 100.0*l/len); */
1087     rep[repct].start = l;
1088     rep[repct].score = bst;
1089     rep[repct].npred = 1;
1090     rep[repct].plaus = 0.0;
1091     rep[repct].lpred = len;
1092     rep[repct].lremn = len - rep[repct].start;
1093     if (repct > 0) {
1094       rep[repct].lpred = (Int4) (0.5 + ((Nlm_FloatHi)   l / (Nlm_FloatHi) repct));
1095       rep[repct].npred = (Int4) (0.5 + ((Nlm_FloatHi) len / (Nlm_FloatHi) rep[repct].lpred));
1096     }
1097     repct++;
1098     i = 0; j = l;
1099     tbdone = FALSE;
1100     if (l >= len-1) {
1101       tbdone = TRUE;
1102       done = TRUE;
1103     }
1104     while (!tbdone) {
1105       m = i+1; n = j+1;
1106       bst = fscore[m][n];
1107       for (k=i+2;k<i+2+width && k<len;k++) {
1108         if (fscore[k][j+1]-GapI-GapE*(k-i-1) > bst) {
1109 	  bst = fscore[k][j+1]-GapI-GapE*(k-i-1);
1110 	  m = k; n = j+1;
1111 	}
1112       }
1113       for (l=j+2;l<j+2+width && l<len;l++) {
1114         if (fscore[i+1][l]-GapI-GapE*(l-j-1) > bst) {
1115 	  bst = fscore[i+1][l]-GapI-GapE*(l-j-1);
1116 	  m = i+1; n = l;
1117 	}
1118       }
1119       iscore[i][j] = -10000;
1120       if (bst < 0) tbdone = TRUE;
1121       i = m; j = n;
1122       if (i >= len-1 || j >= len-1) {
1123 	tbdone = TRUE;
1124 	iscore[i][j] = -10000;
1125       }
1126     }
1127   }
1128 
1129   for (i=0;i<len;i++) MemFree(fscore[i]);
1130   MemFree(fscore);
1131   for (i=0;i<len;i++) MemFree(iscore[i]);
1132   MemFree(iscore);
1133   rep[0].plaus = (Nlm_FloatHi) (rep[0].score - rep[1].score) / (Nlm_FloatHi) rep[0].score;
1134   if (rep[0].plaus > rthresh) rep[0].plaus = 1.0;
1135   winner = 1; wplaus = rep[0].plaus;
1136 
1137   for (i=1;i<repct;i++) {
1138     pnum = i+1;
1139     plen = (Int4) (((Nlm_FloatHi)len/(Nlm_FloatHi)pnum)+.5);
1140     cfac = 0.0;
1141     for (j=i;j>0;j--) {
1142       cfac += (Nlm_FloatHi)(abs(rep[j].start-(j*plen)))/(Nlm_FloatHi)plen;
1143     }
1144     cfac /= (Nlm_FloatHi) i;
1145     rep[i].plaus = 1.0 - cfac;
1146 
1147     if (rep[i].plaus > wplaus) {
1148       wplaus = rep[i].plaus;
1149       winner = pnum;
1150     }
1151   }
1152   for (i=0;i<winner;i++) {
1153     if (rep[i].lpred < 3 || (i>0 && rep[i].score < 17)) {
1154       winner = 1; break;
1155     }
1156   }
1157 
1158   if (bOutput) {
1159     for (i=0;i<repct;i++) {
1160       printf("%s %3d %4d %8.5f %4d %8.5f %2d %3d %8.5f\n",pcdd->name,
1161              i,rep[i].score,(Nlm_FloatHi)rep[i].score/(Nlm_FloatHi)rep[0].score,
1162 	     rep[i].start, (Nlm_FloatHi)rep[i].start/(Nlm_FloatHi)len,
1163 	     rep[i].npred,rep[i].lpred, rep[i].plaus);
1164     }
1165     printf("%s: predicted %d repeats\n",pcdd->name,winner);
1166   }
1167 
1168   CddKillDescr(pcdd,NULL,CddDescr_repeats,0);
1169   if (winner > 1) {
1170     pcdr = CddRepeatNew(); pcdr->avglen = 0;
1171     slp = NULL; slpHead = NULL;
1172     for (i=0;i<winner;i++) {
1173       k = rep[i].start;
1174       if (i < (winner-1)) l = rep[i+1].start - 1; else l = len - 1;
1175       pcdr->avglen += l-k+1;
1176       slp = (SeqLocPtr) ValNodeNew(NULL);
1177       slp->choice = SEQLOC_INT;
1178       sintp = SeqIntNew();
1179       sintp->from = k;
1180       sintp->to = l;
1181       sintp->id = SeqIdDup(pcdd->profile_range->id);
1182       slp->data.ptrvalue = sintp;
1183       if (!slpHead) slpHead = slp; else ValNodeLink(&(slpHead),slp);
1184     }
1185     pcdr->avglen /= winner;
1186     pcdr->count = winner;
1187     pcdr->location = (SeqLocPtr) ValNodeNew(NULL);
1188     pcdr->location->choice = SEQLOC_PACKED_INT;
1189     pcdr->location->data.ptrvalue = slpHead;
1190     CddAssignDescr(pcdd,pcdr,CddDescr_repeats,0);
1191   }
1192 
1193   sbp = BLAST_ScoreBlkDestruct(sbp);
1194   MemFree(buffer);
1195   if (winner > 1) return TRUE;
1196   return (FALSE);
1197 }
1198 
1199 
1200 /*---------------------------------------------------------------------------*/
1201 /*---------------------------------------------------------------------------*/
1202 /* retrieve the accession of a CD as a character string                      */
1203 /*---------------------------------------------------------------------------*/
1204 /*---------------------------------------------------------------------------*/
CddGetAccession(CddPtr pcdd)1205 CharPtr LIBCALL CddGetAccession(CddPtr pcdd)
1206 {
1207   CharPtr     cAccession = NULL;
1208   ValNodePtr  vnp;
1209   GlobalIdPtr pGid;
1210 
1211   if (!pcdd) return NULL;
1212   vnp = pcdd->id;
1213   while (vnp) {
1214     if (vnp->choice == CddId_gid) {
1215       pGid = (GlobalIdPtr) vnp->data.ptrvalue;
1216       cAccession = StringSave (pGid->accession);
1217     }
1218     vnp = vnp->next;
1219   }
1220   return cAccession;
1221 }
1222 
1223 /*---------------------------------------------------------------------------*/
1224 /*---------------------------------------------------------------------------*/
1225 /* report Errors in processing and exit immediately                          */
1226 /*---------------------------------------------------------------------------*/
1227 /*---------------------------------------------------------------------------*/
CddSimpleHtmlError(CharPtr cErrTxt)1228 void LIBCALL CddSimpleHtmlError(CharPtr cErrTxt)
1229 {
1230   printf("Content-type: text/html\n\n");
1231   printf("<h2>Error:</h2>\n");
1232   printf("<h3>%s</h3>\n",cErrTxt);
1233   exit(1);
1234 }
1235 
CddSevError(CharPtr cErrTxt)1236 void LIBCALL CddSevError(CharPtr cErrTxt)
1237 {
1238   printf(" Error: %s\n",cErrTxt);
1239   exit(1);
1240 }
1241 
1242 /*---------------------------------------------------------------------------*/
1243 /*---------------------------------------------------------------------------*/
1244 /* Find bioseqptr in Cdd internal sequence set                               */
1245 /*---------------------------------------------------------------------------*/
1246 /*---------------------------------------------------------------------------*/
CddFindSip(SeqIdPtr sip,SeqEntryPtr sep)1247 BioseqPtr LIBCALL CddFindSip(SeqIdPtr sip, SeqEntryPtr sep)
1248 {
1249   BioseqPtr    bsp = NULL;
1250   BioseqSetPtr bssp;
1251   SeqEntryPtr  sepThis;
1252 
1253   sepThis = sep;
1254 
1255   while (sepThis) {
1256     if (sepThis->choice == 2) {
1257       bssp = sepThis->data.ptrvalue;
1258       bsp = CddFindSip(sip, bssp->seq_set);
1259     } else if (sepThis->choice == 1) {
1260       bsp = (BioseqPtr) sepThis->data.ptrvalue;
1261       if (CddSameSip(bsp->id, sip)) return(bsp);
1262     }
1263     sepThis = sepThis->next;
1264   }
1265   return(NULL);
1266 }
1267 
1268 
1269 /*---------------------------------------------------------------------------*/
1270 /*---------------------------------------------------------------------------*/
1271 /* version of BioseqCopy that doesn't use BioseqFind and expects the old bsp */
1272 /* adapted by Ben                                                            */
1273 /*---------------------------------------------------------------------------*/
1274 /*---------------------------------------------------------------------------*/
CddBioseqCopy(SeqIdPtr newid,BioseqPtr oldbsp,Int4 from,Int4 to,Uint1 strand,Boolean do_feat)1275 BioseqPtr LIBCALL CddBioseqCopy (SeqIdPtr newid, BioseqPtr oldbsp, Int4 from,
1276                                  Int4 to, Uint1 strand, Boolean do_feat)
1277 {
1278   BioseqPtr   newbsp=NULL, tmpbsp;
1279   Uint1       seqtype;
1280   ValNodePtr  tmp;
1281   ObjectIdPtr oid;
1282   Int4        len;
1283   ValNode     fake;
1284   SeqLocPtr   the_segs, head, curr;
1285   Boolean     handled = FALSE, split;
1286   SeqFeatPtr  sfp, newsfp, lastsfp;
1287   DeltaSeqPtr dsp;
1288   SeqEntryPtr oldscope;
1289 
1290 
1291   if (from < 0) return FALSE;
1292   if (oldbsp == NULL) return NULL;
1293 
1294   if (oldbsp->seq_data_type == Seq_code_gap) return NULL;
1295 
1296   len = to - from + 1;
1297   if (len <= 0) return NULL;
1298   newbsp = BioseqNew();
1299   if (newid != NULL) newbsp->id = SeqIdDup(newid);
1300   else {
1301     tmp = ValNodeNew(NULL);
1302     tmp->choice = SEQID_LOCAL;
1303     oid = ObjectIdNew();
1304     tmp->data.ptrvalue = (Pointer)oid;
1305     oid->str = StringSave("Clipboard");
1306     tmpbsp = BioseqFind(tmp);   /* old clipboard present? */
1307     if (tmpbsp == NULL) {
1308       oldscope = SeqEntrySetScope (NULL);
1309       if (oldscope != NULL) {
1310         tmpbsp = BioseqFind(tmp);
1311         SeqEntrySetScope (oldscope);
1312       }
1313     }
1314     if (tmpbsp != NULL) BioseqFree(tmpbsp);
1315     newbsp->id = tmp;
1316   }
1317 
1318   newbsp->repr = oldbsp->repr;
1319   newbsp->mol = oldbsp->mol;
1320   newbsp->length = len;
1321   newbsp->seq_ext_type = oldbsp->seq_ext_type;
1322 
1323   if (newbsp->repr == Seq_repr_virtual)
1324     handled = TRUE;               /* no more to do */
1325 
1326   if ((newbsp->repr == Seq_repr_raw) ||
1327     (newbsp->repr == Seq_repr_const)) {
1328     if (ISA_aa(newbsp->mol)) {
1329       seqtype = Seq_code_ncbieaa;
1330     } else {
1331       seqtype = Seq_code_iupacna;
1332     }
1333 /*---------------------------------------------------------------------------*/
1334 /* need to check whether seqtypes agree, otherwise the ByteStore is invalid! */
1335 /*---------------------------------------------------------------------------*/
1336     if (seqtype != oldbsp->seq_data_type) {
1337       seqtype = oldbsp->seq_data_type;
1338     }
1339     newbsp->seq_data_type = seqtype;
1340     newbsp->seq_data = (SeqDataPtr) BSDup((ByteStorePtr) oldbsp->seq_data);
1341     handled = TRUE;
1342   }
1343 
1344   if ((newbsp->repr == Seq_repr_seg) ||
1345     (newbsp->repr == Seq_repr_ref) ||
1346     (newbsp->repr == Seq_repr_delta)) {
1347     if (newbsp->repr == Seq_repr_seg) { /* segmented */
1348       fake.choice = SEQLOC_MIX;   /* make SEQUENCE OF Seq-loc, into one */
1349       fake.data.ptrvalue = oldbsp->seq_ext;
1350       fake.next = NULL;
1351       the_segs = (SeqLocPtr)&fake;
1352       head = (SeqLocPtr)SeqLocCopyPart (the_segs, from, to, strand, FALSE, NULL, NULL);
1353     } else if (newbsp->repr == Seq_repr_ref) { /* reference: is a Seq-loc */
1354       head = (SeqLocPtr)SeqLocCopyPart ((SeqLocPtr)(oldbsp->seq_ext), from, to,
1355                                strand, TRUE, NULL, NULL);
1356     } else if (newbsp->repr == Seq_repr_delta) {
1357       dsp = (DeltaSeqPtr)(oldbsp->seq_ext);  /* real data is here */
1358       the_segs = (SeqLocPtr)DeltaSeqsToSeqLocs(dsp);
1359       head = (SeqLocPtr)SeqLocCopyPart (the_segs, from, to, strand, FALSE, NULL, NULL);
1360       SeqLocFree (the_segs);
1361     }
1362     newbsp->seq_ext = (Pointer)head;
1363     handled = TRUE;
1364   }
1365 
1366   if (newbsp->repr == Seq_repr_map) {
1367     lastsfp = NULL;
1368     for (sfp = (SeqFeatPtr)(oldbsp->seq_ext); sfp != NULL; sfp = sfp->next) {
1369       split = FALSE;
1370       curr = (SeqLocPtr)SeqLocCopyRegion(newbsp->id, sfp->location, oldbsp, from, to, strand, &split);
1371       if (curr != NULL) {  /* got one */
1372         newsfp = (SeqFeatPtr)AsnIoMemCopy((Pointer)sfp, (AsnReadFunc)SeqFeatAsnRead, (AsnWriteFunc)SeqFeatAsnWrite);
1373         SeqLocFree(newsfp->location);
1374         newsfp->location = curr;
1375         if (split)
1376           newsfp->partial = TRUE;
1377         if (lastsfp == NULL)  /* first one */
1378           newbsp->seq_ext = (Pointer)newsfp;
1379         else
1380           lastsfp->next = newsfp;
1381         lastsfp = newsfp;
1382       }
1383     }
1384     handled = TRUE;
1385   }
1386   if (! handled) goto erret;
1387   if (do_feat)
1388     SeqFeatsCopy (newbsp, oldbsp, from, to, strand);
1389 
1390   return newbsp;
1391 
1392 erret:
1393   BioseqFree(newbsp);
1394   return NULL;
1395 }
1396 
1397 
1398 /*---------------------------------------------------------------------------*/
1399 /*---------------------------------------------------------------------------*/
1400 /* extract from a SeqEntryPtr a "minimum bioseq" with a bit of description   */
1401 /*---------------------------------------------------------------------------*/
1402 /*---------------------------------------------------------------------------*/
CddExtractBioseq(SeqEntryPtr sep,SeqIdPtr sip)1403 BioseqPtr LIBCALL CddExtractBioseq(SeqEntryPtr sep, SeqIdPtr sip)
1404 {
1405   BioseqPtr    bsp, bspNew, bspTemp;
1406   BioseqSetPtr bssp;
1407   SeqIdPtr     sipNew;
1408   ValNodePtr   vnpThis, vnp = NULL;
1409   SeqAnnotPtr  sanp;
1410 
1411   bsp = BioseqFindInSeqEntry(sip,sep);
1412   sipNew = SeqIdDup(sip);
1413   bspNew = (BioseqPtr) CddBioseqCopy(sipNew,bsp,0,bsp->length-1,0,FALSE);
1414   sipNew = SeqIdFree(sipNew);
1415   if (sep->choice == 2) {
1416     bssp = sep->data.ptrvalue;
1417     vnp = bssp->descr;
1418     bssp->descr = NULL;
1419   } else if (sep->choice == 1) {
1420     bspTemp = sep->data.ptrvalue;
1421     vnp = bspTemp->descr;
1422     bspTemp->descr = NULL;
1423   }
1424   sanp = bsp->annot;
1425   vnpThis = bsp->descr;
1426   bsp->descr = NULL;
1427   ValNodeLink(&(vnp),vnpThis);
1428   bspNew->descr = vnp;
1429   if (sanp) {
1430     bspNew->annot = sanp;
1431     bsp->annot = NULL;
1432   }
1433   return(bspNew);
1434 }
1435 
1436 
1437 /*---------------------------------------------------------------------------*/
1438 /*---------------------------------------------------------------------------*/
1439 /* Cdd specific alignment format converters                                  */
1440 /*---------------------------------------------------------------------------*/
1441 /*---------------------------------------------------------------------------*/
1442 
1443 /*---------------------------------------------------------------------------*/
1444 /*---------------------------------------------------------------------------*/
1445 /* Create a Dense-Diag seqalign from a Discontinuous SeqAlign                */
1446 /*---------------------------------------------------------------------------*/
1447 /*---------------------------------------------------------------------------*/
CddMSLMixedToMSLDenDiag(SeqAlignPtr salp)1448 SeqAlignPtr LIBCALL CddMSLMixedToMSLDenDiag(SeqAlignPtr salp)
1449 {
1450   SeqAlignPtr         salpSub, salpnew, salphead, salptail;
1451   DenseDiagPtr        ddp, ddphead, ddptail;
1452   DenseSegPtr         dsp;
1453   Int4                i, s1, s2;
1454   ScorePtr            scp, scpHead, scpTail, scpThis;
1455 
1456   if (!salp) return(NULL);
1457   salphead = NULL;
1458   salptail = NULL;
1459   while (salp) {
1460     if (salp->segtype) {
1461       if (salp->segtype != SAS_DISC && salp->segtype != SAS_DENSEG) {
1462         CddSevError("CddMSLDiscToMSLDenDiag: Wrong segtype specified!");
1463       }
1464     }
1465     if (salp->dim) {
1466       if (salp->dim != 2) {
1467         CddSevError("CddMSLDiscToMSLDenDiag: Expect alignments of dimension 2!");
1468       }
1469     }
1470     ddphead = NULL;
1471     ddptail = NULL;
1472 
1473     if (salp->segtype == SAS_DISC) {
1474       salpSub = salp->segs;
1475       while (salpSub) {
1476         if (salpSub->segtype != SAS_DENSEG)
1477           CddSevError("CddMSLDiscToMSLDenDiag: Wrong segtype in Sub-Alignment!");
1478         if (salpSub->dim && salpSub->dim !=2)
1479           CddSevError("CddMSLDiscToMSLDenDiag: Wrong dimension in Sub-Alignment!");
1480         dsp = salpSub->segs;
1481         for (i=0;i<dsp->numseg;i++) {
1482           s1 = dsp->starts[i*2];
1483           s2 = dsp->starts[i*2+1];
1484           if (s1 >=0 && s2>= 0) {
1485             ddp = DenseDiagNew();
1486 	    ddp->starts = MemNew(2*sizeof(Int4));
1487 	    ddp->starts[0] = s1;
1488 	    ddp->starts[1] = s2;
1489 	    ddp->len=dsp->lens[i];
1490             ddp->id = SeqIdDupList(dsp->ids);
1491 	    ddp->dim = 2;
1492             if (!ddphead) {
1493 	      ddphead = ddp;
1494 	      ddptail = ddp;
1495 	      ddptail->next = NULL;
1496 	    } else {
1497 	      ddptail->next = ddp;
1498 	      ddptail = ddp;
1499 	      ddptail->next = NULL;
1500 	    }
1501           }
1502         }
1503         salpSub = salpSub->next;
1504       }
1505     } else { /* segtype is plain SAS_DENSEG */
1506       dsp = salp->segs;
1507       ddphead = NULL;
1508       ddptail = NULL;
1509       for (i=0;i<dsp->numseg;i++) {
1510         s1 = dsp->starts[i*2];
1511         s2 = dsp->starts[i*2+1];
1512         if (s1 >=0 && s2>= 0) {
1513           ddp = DenseDiagNew();
1514 	  ddp->starts = MemNew(2*sizeof(Int4));
1515 	  ddp->starts[0] = s1;
1516 	  ddp->starts[1] = s2;
1517 	  ddp->len=dsp->lens[i];
1518           ddp->id = SeqIdDupList(dsp->ids);
1519 	  ddp->dim = 2;
1520           if (!ddphead) {
1521 	    ddphead = ddp;
1522 	    ddptail = ddp;
1523 	    ddptail->next = NULL;
1524 	  } else {
1525 	    ddptail->next = ddp;
1526 	    ddptail = ddp;
1527 	    ddptail->next = NULL;
1528 	  }
1529         }
1530       }
1531     }
1532     salpnew = SeqAlignNew();
1533     salpnew->type = SAT_PARTIAL;
1534     salpnew->segtype = SAS_DENDIAG;
1535     salpnew->dim = 2;
1536     salpnew->segs = ddphead;
1537     scp = salp->score; scpHead = NULL;
1538     while (scp) {
1539       scpThis = ScoreNew();
1540       scpThis->id = ObjectIdDup(scp->id);
1541       scpThis->choice = scp->choice;
1542       scpThis->value = scp->value;
1543       if (!scpHead) {
1544         scpHead = scpThis;
1545       } else {
1546         scpTail->next = scpThis;
1547       }
1548       scpTail = scpThis;
1549       scp = scp->next;
1550     }
1551     salpnew->score = scpHead;
1552     if (!salphead) {
1553       salphead = salpnew;
1554       salptail = salpnew;
1555       salptail->next = NULL;
1556     } else {
1557       salptail->next = salpnew;
1558       salptail = salpnew;
1559       salptail->next = NULL;
1560     }
1561     salp = salp->next;
1562   }
1563   return(salphead);
1564 }
1565 
1566 /*---------------------------------------------------------------------------*/
1567 /*---------------------------------------------------------------------------*/
1568 /* Create a Dense-Diag seqalign from a Dense-Seg SeqAlign                    */
1569 /*---------------------------------------------------------------------------*/
1570 /*---------------------------------------------------------------------------*/
CddMSLDenSegToMSLDenDiag(SeqAlignPtr salp)1571 SeqAlignPtr LIBCALL CddMSLDenSegToMSLDenDiag(SeqAlignPtr salp)
1572 {
1573   SeqAlignPtr         salpnew, salphead, salptail;
1574   DenseDiagPtr        ddp, ddphead, ddptail;
1575   DenseSegPtr         dsp;
1576   Int4                i, s1, s2;
1577   ScorePtr            scp, scpHead, scpTail, scpThis;
1578 
1579 
1580   if (!salp) return(NULL);
1581   salphead = NULL;
1582   salptail = NULL;
1583   while (salp) {
1584     if (salp->segtype) {
1585       if (salp->segtype != SAS_DENSEG) {
1586         CddSevError("CddMSLDenSegToMSLDenDiag: Wrong segtype specified!");
1587       }
1588     }
1589     if (salp->dim) {
1590       if (salp->dim != 2) {
1591         CddSevError("CddMSLDenSegToMSLDenDiag: Expect alignments of dimension 2!");
1592       }
1593     }
1594     dsp = salp->segs;
1595     ddphead = NULL;
1596     ddptail = NULL;
1597     for (i=0;i<dsp->numseg;i++) {
1598       s1 = dsp->starts[i*2];
1599       s2 = dsp->starts[i*2+1];
1600       if (s1 >=0 && s2>= 0) {
1601         ddp = DenseDiagNew();
1602 	ddp->starts = MemNew(2*sizeof(Int4));
1603 	ddp->starts[0] = s1;
1604 	ddp->starts[1] = s2;
1605 	ddp->len=dsp->lens[i];
1606         ddp->id = SeqIdDupList(dsp->ids);
1607 	ddp->dim = 2;
1608         if (!ddphead) {
1609 	  ddphead = ddp;
1610 	  ddptail = ddp;
1611 	  ddptail->next = NULL;
1612 	} else {
1613 	  ddptail->next = ddp;
1614 	  ddptail = ddp;
1615 	  ddptail->next = NULL;
1616 	}
1617       }
1618     }
1619     salpnew = SeqAlignNew();
1620     salpnew->type = SAT_PARTIAL;
1621     salpnew->segtype = SAS_DENDIAG;
1622     salpnew->dim = 2;
1623     salpnew->segs = ddphead;
1624     scp = salp->score; scpHead = NULL;
1625     while (scp) {
1626       scpThis = ScoreNew();
1627       scpThis->id = ObjectIdDup(scp->id);
1628       scpThis->choice = scp->choice;
1629       scpThis->value = scp->value;
1630       if (!scpHead) {
1631         scpHead = scpThis;
1632       } else {
1633         scpTail->next = scpThis;
1634       }
1635       scpTail = scpThis;
1636       scp = scp->next;
1637     }
1638     salpnew->score = scpHead;
1639     if (!salphead) {
1640       salphead = salpnew;
1641       salptail = salpnew;
1642       salptail->next = NULL;
1643     } else {
1644       salptail->next = salpnew;
1645       salptail = salpnew;
1646       salptail->next = NULL;
1647     }
1648     salp = salp->next;
1649   }
1650   return(salphead);
1651 }
1652 
1653 /*---------------------------------------------------------------------------*/
1654 /*---------------------------------------------------------------------------*/
1655 /* make a DenseSeg Alignment-Set turning each diag into a separate alignment */
1656 /*---------------------------------------------------------------------------*/
1657 /*---------------------------------------------------------------------------*/
CddMSLDenDiagToMSLDenSeg(SeqAlignPtr salp)1658 SeqAlignPtr LIBCALL CddMSLDenDiagToMSLDenSeg(SeqAlignPtr salp)
1659 {
1660   SeqAlignPtr         salpnew, salphead, salptail;
1661   DenseDiagPtr        ddp;
1662   DenseSegPtr         dsp;
1663   Int4                lastm, lasts;
1664 
1665   if (!salp) return(NULL);
1666   salphead = NULL;
1667   salptail = NULL;
1668   while (salp) {
1669     if (salp->segtype) {
1670       if (salp->segtype != SAS_DENDIAG) {
1671         CddSevError("CddMSLDenDiagToMSLDenSeg: Wrong segtype specified!");
1672       }
1673     }
1674     if (salp->dim) {
1675       if (salp->dim != 2) {
1676         CddSevError("CddMSLDenDiagToMSLDenSeg: Expect alignments of dimension 2!");
1677       }
1678     }
1679     ddp = salp->segs;
1680     lastm = -1;
1681     lasts = -1;
1682     while (ddp) {
1683       salpnew = SeqAlignNew();
1684       if (!salphead) salphead = salpnew;
1685       salpnew->type = SAT_PARTIAL;
1686       salpnew->segtype = SAS_DENSEG;
1687       salpnew->dim = 2;
1688       salpnew->score = ddp->scores;
1689       dsp = DenseSegNew();
1690       salpnew->segs = dsp;
1691       dsp->dim = 2;
1692       dsp->numseg = 1;
1693       dsp->ids = SeqIdDupList(ddp->id);
1694       dsp->starts = ddp->starts;
1695       dsp->lens = &(ddp->len);
1696       dsp->strands = ddp->strands;
1697       dsp->scores = ddp->scores;
1698       if (salptail) salptail->next = salpnew;
1699       salptail = salpnew;
1700       ddp = ddp->next;
1701     }
1702     salp = salp->next;
1703   }
1704   return(salphead);
1705 }
1706 
1707 /*---------------------------------------------------------------------------*/
1708 /*---------------------------------------------------------------------------*/
1709 /* convert a dendiag pairwise alignment set into a multiple dendiag alignment*/
1710 /* if possible - i.e. check that the number of segments and their extents on */
1711 /* the common master is the same for all pairwise alignments                 */
1712 /*---------------------------------------------------------------------------*/
1713 /*---------------------------------------------------------------------------*/
CddMSLDenDiagToMULDenDiag(SeqAlignPtr salp)1714 SeqAlignPtr LIBCALL CddMSLDenDiagToMULDenDiag(SeqAlignPtr salp)
1715 {
1716   SeqAlignPtr     salpHead;
1717   SeqAlignPtr     salpNew;
1718   Int4            iNumSegs, iCnt, iDim;
1719   Int4            *iSegStart;
1720   Int4            *iSegLen;
1721   DenseDiagPtr    ddp, ddpNew = NULL, ddpTail = NULL;
1722   Boolean         bIsOk = TRUE;
1723   SeqIdPtr        sipNew, sipTail;
1724 
1725   if (!salp) return NULL;
1726   if (salp->dim && salp->dim != 2) {
1727     printf(" CddMSLDenDiagToMulDenDiag Warning: Can't convert alignment of dim!=2\n");
1728     return(salp);
1729   }
1730   iNumSegs = 0;
1731   ddp = (DenseDiagPtr) salp->segs;
1732   while (ddp) {
1733     iNumSegs++;
1734     ddp = ddp->next;
1735   }
1736   iSegStart = calloc(iNumSegs,sizeof(Int4));
1737   iSegLen = calloc(iNumSegs,sizeof(Int4));
1738   iCnt = 0;
1739   ddp = (DenseDiagPtr) salp->segs;
1740   while (ddp) {
1741     iSegStart[iCnt] = ddp->starts[0];
1742     iSegLen[iCnt] = ddp->len;
1743     iCnt++;
1744     ddp = ddp->next;
1745   }
1746   iDim = 1;
1747   salpHead = salp;
1748   while (salpHead) {
1749     iDim++;
1750     if (iDim > 2) {
1751       ddp = (DenseDiagPtr) salpHead->segs;
1752       iCnt = 0;
1753       while (ddp) {
1754         if (iCnt > iNumSegs || ddp->starts[0] != iSegStart[iCnt] ||
1755             ddp->len != iSegLen[iCnt]) {
1756           bIsOk = FALSE;
1757           break;
1758         }
1759         iCnt++;
1760         ddp = ddp->next;
1761       }
1762       if (iCnt != iNumSegs) {
1763         bIsOk = FALSE;
1764         break;
1765       }
1766     }
1767     salpHead = salpHead->next;
1768   }
1769 
1770   if (!bIsOk) {
1771     printf(" CddMSLDenDiagToMulDenDiag Warning: Can't convert alignment!\n");
1772     return(salp);
1773   }
1774   salpNew = SeqAlignNew();
1775   salpNew->type = salp->type;
1776   salpNew->segtype = salp->segtype;
1777   salpNew->dim = iDim;
1778 
1779   sipNew = NULL;
1780   salpHead = salp;
1781   while (salpHead) {
1782     ddp = (DenseDiagPtr)salpHead->segs;
1783     if (!sipNew) {
1784       sipNew = SeqIdDup(ddp->id);
1785       sipNew->next = SeqIdDup(ddp->id->next);
1786       sipTail = sipNew->next;
1787     } else {
1788       sipTail->next = SeqIdDup(ddp->id->next);
1789       sipTail = sipTail->next;
1790     }
1791     salpHead = salpHead->next;
1792   }
1793 
1794   salpHead = salp;
1795   iCnt = 0;
1796   while (salpHead) {
1797     ddp = (DenseDiagPtr) salpHead->segs;
1798     while (ddp) {
1799       if (!ddpNew) {
1800         ddpNew = (DenseDiagPtr) DenseDiagNew();
1801         ddpNew->dim = iDim;
1802         ddpNew->id = sipNew;
1803         ddpNew->starts = calloc(iDim,sizeof(Int4));
1804         ddpNew->starts[0]=ddp->starts[0];
1805         ddpNew->starts[1]=ddp->starts[1];
1806         ddpNew->len=ddp->len;
1807         if (!ddpTail) {
1808           salpNew->segs = ddpNew;
1809           ddpTail = ddpNew;
1810         } else {
1811           ddpTail->next = ddpNew;
1812           ddpTail = ddpTail->next;
1813         }
1814       } else {
1815         ddpNew->starts[iCnt+1]=ddp->starts[1];
1816       }
1817       if (ddpTail->next) {
1818         ddpTail = ddpTail->next;
1819         ddpNew = ddpTail;
1820       } else {
1821         ddpNew = NULL;
1822       }
1823       ddp = ddp->next;
1824       if (!ddp) {
1825         ddpNew = (DenseDiagPtr) salpNew->segs;
1826         ddpTail = ddpNew;
1827       }
1828     }
1829     iCnt++;
1830     salpHead = salpHead->next;
1831   }
1832   free(iSegLen);
1833   free(iSegStart);
1834   return(salpNew);
1835 }
1836 
1837 /*---------------------------------------------------------------------------*/
1838 /*---------------------------------------------------------------------------*/
1839 /* Calculate per column information content for a SeqAlign                   */
1840 /*---------------------------------------------------------------------------*/
1841 /*---------------------------------------------------------------------------*/
SeqAlignConservation(SeqAlignPtr salp,Nlm_FloatHi fract,BioseqPtr bsp_master,Boolean bHasConsensus,Int4 offset)1842 Nlm_FloatHi LIBCALL SeqAlignConservation(SeqAlignPtr salp, Nlm_FloatHi fract,
1843                                          BioseqPtr bsp_master, Boolean bHasConsensus, Int4 offset)
1844 {
1845   Int4               i, c, a;
1846   Int4               qlength, nover = 0;
1847   Int4               alphabetSize = 26;
1848   Int4               *ntypes;
1849   Nlm_FloatHi        **typefreq, sumfreq;
1850   DenseDiagPtr       ddp;
1851   BioseqPtr          bsp;
1852   SeqIdPtr           sip;
1853   SeqPortPtr         spp;
1854   Uint1Ptr           buffer;
1855 
1856   if (!salp) return(0.0);
1857   if (!bsp_master) return(0.0);
1858   qlength = bsp_master->length;
1859   ntypes = (Int4 *) MemNew(qlength*sizeof(Int4));
1860   for (i=0;i<qlength;i++) ntypes[i] = 0;
1861   typefreq = (Nlm_FloatHi**) MemNew(qlength * sizeof(Nlm_FloatHi *));
1862   for (i=0;i<qlength;i++) {
1863     typefreq[i] = (Nlm_FloatHi *)MemNew(alphabetSize * sizeof(Nlm_FloatHi));
1864     for (a=0;a<alphabetSize;a++) typefreq[i][a] = 0.0;
1865   }
1866   if (!bHasConsensus) {   /* count residues in the master/representative too */
1867     ddp = salp->segs;
1868     sip = ddp->id;
1869     bsp = bsp_master;
1870     buffer = MemNew((bsp->length)*sizeof(Uint1));
1871     spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
1872     for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
1873     spp = SeqPortFree(spp);
1874     while (ddp) {
1875       for (c=ddp->starts[0]-offset;c<ddp->starts[0]-offset+ddp->len;c++) {
1876         if (buffer[c] >= 0 && buffer[c] < (Uint1) alphabetSize)
1877           typefreq[c][buffer[c]] += 1.0;
1878       }
1879       ddp = ddp->next;
1880     }
1881     MemFree(buffer);
1882   }
1883   while (salp) {
1884     ddp = salp->segs;
1885     sip = ddp->id->next;
1886     bsp = BioseqLockById(sip);
1887     buffer = MemNew((bsp->length)*sizeof(Uint1));
1888     spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
1889     for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
1890     spp = SeqPortFree(spp);
1891     BioseqUnlock(bsp);
1892     while (ddp) {
1893       for (c=ddp->starts[1];c<ddp->starts[1]+ddp->len;c++) {
1894         i = ddp->starts[0]-offset+c-ddp->starts[1];
1895         if (buffer[c] >= 0 && buffer[c] < (Uint1) alphabetSize)
1896 	  typefreq[i][buffer[c]] += 1.0;
1897       }
1898       ddp = ddp->next;
1899     }
1900     MemFree(buffer);
1901     salp = salp->next;
1902   }
1903   for (i=0;i<qlength;i++) {
1904     sumfreq = 0.0;
1905     for (a=0;a<alphabetSize;a++) {
1906       if (typefreq[i][a] > 0.0) {
1907         ntypes[i]++;
1908 	sumfreq += typefreq[i][a];
1909       }
1910     }
1911     if (sumfreq > 0.0) for (a=0;a<alphabetSize;a++) {
1912       typefreq[i][a] /= sumfreq;
1913     }
1914     for (a=0;a<alphabetSize;a++) {
1915       if (typefreq[i][a] >= fract) {
1916         nover++; break;
1917       }
1918     }
1919   }
1920   for (i=0;i<qlength;i++) MemFree(typefreq[i]);
1921   MemFree(typefreq);
1922   MemFree(ntypes);
1923   return((Nlm_FloatHi)nover/(Nlm_FloatHi)qlength);
1924 }
1925 
1926 /*---------------------------------------------------------------------------*/
1927 /*---------------------------------------------------------------------------*/
1928 /* Calculate per column information content for a SeqAlign                   */
1929 /*---------------------------------------------------------------------------*/
1930 /*---------------------------------------------------------------------------*/
SeqAlignInform(SeqAlignPtr salp,BioseqPtr bsp_master,Boolean bHasConsensus,Int4 offset)1931 Nlm_FloatHiPtr LIBCALL SeqAlignInform(SeqAlignPtr salp, BioseqPtr bsp_master,
1932                                       Boolean bHasConsensus, Int4 offset)
1933 {
1934   BLAST_ScoreBlkPtr  sbp;
1935   BLAST_ResFreqPtr   stdrfp;
1936   Nlm_FloatHiPtr     standardProb;
1937   Nlm_FloatHiPtr     Informativeness;
1938   Int2               iStatus;
1939   Int4               i, c, a;
1940   Int4               qlength;
1941   Int4               alphabetSize = 26;
1942   Nlm_FloatHi        posEpsilon = 0.0001;
1943   Nlm_FloatHi        QoverPest;
1944   Int4               *ntypes;
1945   Nlm_FloatHi        **typefreq, sumfreq;
1946   DenseDiagPtr       ddp;
1947   BioseqPtr          bsp;
1948   SeqIdPtr           sip;
1949   SeqPortPtr         spp;
1950   Uint1Ptr           buffer;
1951   Char               matrix[32];
1952 
1953   if (!salp) return(NULL);
1954   if (!bsp_master) return(NULL);
1955   qlength = bsp_master->length;
1956   Informativeness = MemNew(sizeof(Nlm_FloatHi) * qlength);
1957   sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
1958   sbp->read_in_matrix = TRUE;
1959   sbp->protein_alphabet = TRUE;
1960   sbp->posMatrix = NULL;
1961   sbp->number_of_contexts = 1;
1962   StrCpy(matrix,"BLOSUM62");
1963   iStatus = BlastScoreBlkMatFill(sbp,matrix);
1964   stdrfp = BlastResFreqNew(sbp);
1965   BlastResFreqStdComp(sbp,stdrfp);
1966   standardProb = MemNew(alphabetSize * sizeof(Nlm_FloatHi));
1967   for(a = 0; a < alphabetSize; a++) standardProb[a] = stdrfp->prob[a];
1968   stdrfp = BlastResFreqDestruct(stdrfp);
1969   sbp = (BLAST_ScoreBlkPtr) BLAST_ScoreBlkDestruct(sbp);
1970   ntypes = (Int4 *) MemNew(qlength*sizeof(Int4));
1971   for (i=0;i<qlength;i++) ntypes[i] = 0;
1972   typefreq = (Nlm_FloatHi**) MemNew(qlength * sizeof(Nlm_FloatHi *));
1973   for (i=0;i<qlength;i++) {
1974     typefreq[i] = (Nlm_FloatHi *)MemNew(alphabetSize * sizeof(Nlm_FloatHi));
1975     for (a=0;a<alphabetSize;a++) typefreq[i][a] = 0.0;
1976   }
1977   if (!bHasConsensus) {   /* count residues in the master/representative too */
1978     ddp = salp->segs;
1979     sip = ddp->id;
1980     bsp = bsp_master;
1981     buffer = MemNew((bsp->length)*sizeof(Uint1));
1982     spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
1983     for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
1984     spp = SeqPortFree(spp);
1985     while (ddp) {
1986       for (c=ddp->starts[0]-offset;c<ddp->starts[0]-offset+ddp->len;c++) {
1987         if (buffer[c] < (Uint1) alphabetSize)
1988           typefreq[c][buffer[c]] += 1.0;
1989       }
1990       ddp = ddp->next;
1991     }
1992     MemFree(buffer);
1993   }
1994   while (salp) {
1995     ddp = salp->segs;
1996     sip = ddp->id->next;
1997     bsp = BioseqLockById(sip);
1998     buffer = MemNew((bsp->length)*sizeof(Uint1));
1999     spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
2000     for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
2001     spp = SeqPortFree(spp);
2002     BioseqUnlock(bsp);
2003     while (ddp) {
2004       for (c=ddp->starts[1];c<ddp->starts[1]+ddp->len;c++) {
2005         i = ddp->starts[0]-offset+c-ddp->starts[1];
2006         if (buffer[c] < (Uint1) alphabetSize)
2007 	  typefreq[i][buffer[c]] += 1.0;
2008       }
2009       ddp = ddp->next;
2010     }
2011     MemFree(buffer);
2012     salp = salp->next;
2013   }
2014   for (i=0;i<qlength;i++) {
2015     sumfreq = 0.0;
2016     for (a=0;a<alphabetSize;a++) {
2017       if (typefreq[i][a] > 0.0) {
2018         ntypes[i]++;
2019 	sumfreq += typefreq[i][a];
2020       }
2021     }
2022     if (sumfreq > 0.0) for (a=0;a<alphabetSize;a++) {
2023       typefreq[i][a] /= sumfreq;
2024     }
2025   }
2026 
2027   for (c=0;c<qlength;c++) {
2028     Informativeness[c] = 0.0;
2029     for (a=0;a<alphabetSize;a++) {
2030       if (standardProb[a] > posEpsilon) {
2031         QoverPest = typefreq[c][a] / standardProb[a];
2032         if (QoverPest > posEpsilon) {
2033 	  Informativeness[c] += typefreq[c][a] * log(QoverPest) / NCBIMATH_LN2;
2034         }
2035       }
2036     }
2037   }
2038   for (i=0;i<qlength;i++) MemFree(typefreq[i]);
2039   MemFree(typefreq);
2040   MemFree(ntypes);
2041   MemFree(standardProb);
2042   return(Informativeness);
2043 }
2044 
2045 /*---------------------------------------------------------------------------*/
2046 /*---------------------------------------------------------------------------*/
2047 /* Calculate per column information content for a CDD from the seqalign      */
2048 /*---------------------------------------------------------------------------*/
2049 /*---------------------------------------------------------------------------*/
CddCountResTypes(CddPtr pcdd,Int4 * ncols)2050 Int4 ** LIBCALL CddCountResTypes(CddPtr pcdd, Int4 *ncols)
2051 {
2052   BioseqSetPtr       bssp;
2053   Boolean            bHasConsensus = CddHasConsensus(pcdd);
2054   Int4               offset, qlength, i, j, c;
2055   Int4             **ResFreqTable;
2056   SeqAlignPtr        salp;
2057   DenseDiagPtr       ddp;
2058   SeqIdPtr           sip;
2059   BioseqPtr          bsp;
2060   Uint1             *buffer;
2061   SeqPortPtr         spp;
2062 
2063   if (!pcdd) return(NULL);
2064   if (!pcdd->trunc_master) return(NULL);
2065   bssp = pcdd->sequences->data.ptrvalue;
2066   offset = pcdd->profile_range->from;
2067   qlength = pcdd->trunc_master->length;
2068   *ncols = qlength;
2069   ResFreqTable = MemNew(qlength*sizeof(Int4 *));
2070   for (i=0;i<qlength;i++) {
2071     ResFreqTable[i] = MemNew(26*sizeof(Int4));
2072     for (j=0;j<26;j++) ResFreqTable[i][j] = 0;
2073   }
2074   salp = pcdd->seqannot->data;
2075   if (!bHasConsensus) {   /* count residues in the master/representative too */
2076     ddp = salp->segs;
2077     sip = ddp->id;
2078     bsp = CddRetrieveBioseqById(sip,bssp->seq_set);
2079     buffer = MemNew((bsp->length)*sizeof(Uint1));
2080     spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
2081     for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
2082     spp = SeqPortFree(spp);
2083     while (ddp) {
2084       for (c=ddp->starts[0];c<ddp->starts[0]+ddp->len;c++) {
2085         ResFreqTable[c-offset][buffer[c]]++;
2086       }
2087       ddp = ddp->next;
2088     }
2089     MemFree(buffer);
2090   }
2091   while (salp) {
2092     ddp = salp->segs;
2093     sip = ddp->id->next;
2094     bsp = CddRetrieveBioseqById(sip,bssp->seq_set);
2095     buffer = MemNew((bsp->length)*sizeof(Uint1));
2096     spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
2097     for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
2098     spp = SeqPortFree(spp);
2099     while (ddp) {
2100       for (c=ddp->starts[1];c<ddp->starts[1]+ddp->len;c++) {
2101         i = ddp->starts[0]-offset+c-ddp->starts[1];
2102         ResFreqTable[i][buffer[c]]++;
2103       }
2104       ddp = ddp->next;
2105     }
2106     MemFree(buffer);
2107     salp = salp->next;
2108   }
2109   return(ResFreqTable);
2110 }
2111 
2112 /*---------------------------------------------------------------------------*/
2113 /*---------------------------------------------------------------------------*/
2114 /* Calculate per column information content for a CDD from the seqalign      */
2115 /*---------------------------------------------------------------------------*/
2116 /*---------------------------------------------------------------------------*/
CddAlignInform(CddPtr pcdd,Nlm_FloatHi * Niobs)2117 Nlm_FloatHiPtr LIBCALL CddAlignInform(CddPtr pcdd, Nlm_FloatHi * Niobs)
2118 {
2119   Int4               offset;
2120   Boolean            bHasConsensus = CddHasConsensus(pcdd);
2121   BLAST_ScoreBlkPtr  sbp;
2122   BLAST_ResFreqPtr   stdrfp;
2123   Nlm_FloatHiPtr     standardProb;
2124   Nlm_FloatHiPtr     Informativeness;
2125   Int2               iStatus;
2126   Int4               i, c, a;
2127   Int4               qlength;
2128   Int4               alphabetSize = 26;
2129   Nlm_FloatHi        posEpsilon = 0.0001;
2130   Nlm_FloatHi        QoverPest;
2131   Int4               *ntypes;
2132   Nlm_FloatHi        **typefreq, sumfreq;
2133   SeqAlignPtr        salp;
2134   DenseDiagPtr       ddp;
2135   BioseqPtr          bsp;
2136   SeqIdPtr           sip;
2137   BioseqSetPtr       bssp;
2138   SeqPortPtr         spp;
2139   Uint1Ptr           buffer;
2140 
2141   bssp = pcdd->sequences->data.ptrvalue;
2142   offset = pcdd->profile_range->from;
2143   if (!pcdd) return(NULL);
2144   if (!pcdd->trunc_master) return(NULL);
2145   qlength = pcdd->trunc_master->length;
2146   Informativeness = MemNew(sizeof(Nlm_FloatHi) * qlength);
2147   sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
2148   sbp->read_in_matrix = TRUE;
2149   sbp->protein_alphabet = TRUE;
2150   sbp->posMatrix = NULL;
2151   sbp->number_of_contexts = 1;
2152   iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
2153   stdrfp = BlastResFreqNew(sbp);
2154   BlastResFreqStdComp(sbp,stdrfp);
2155   standardProb = MemNew(alphabetSize * sizeof(Nlm_FloatHi));
2156   for(a = 0; a < alphabetSize; a++) standardProb[a] = stdrfp->prob[a];
2157   stdrfp = BlastResFreqDestruct(stdrfp);
2158   ntypes = (Int4 *) MemNew(qlength*sizeof(Int4));
2159   for (i=0;i<qlength;i++) ntypes[i] = 0;
2160   typefreq = (Nlm_FloatHi**) MemNew(qlength * sizeof(Nlm_FloatHi *));
2161   for (i=0;i<qlength;i++) {
2162     typefreq[i] = (Nlm_FloatHi *)MemNew(alphabetSize * sizeof(Nlm_FloatHi));
2163     for (a=0;a<alphabetSize;a++) typefreq[i][a] = 0.0;
2164   }
2165   salp = pcdd->seqannot->data;
2166   if (!bHasConsensus) {   /* count residues in the master/representative too */
2167     ddp = salp->segs;
2168     sip = ddp->id;
2169     bsp = CddRetrieveBioseqById(sip,bssp->seq_set);
2170     buffer = MemNew((bsp->length)*sizeof(Uint1));
2171     spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
2172     for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
2173     spp = SeqPortFree(spp);
2174     while (ddp) {
2175       for (c=ddp->starts[0];c<ddp->starts[0]+ddp->len;c++) {
2176         typefreq[c-offset][buffer[c]] += 1.0;
2177       }
2178       ddp = ddp->next;
2179     }
2180     MemFree(buffer);
2181   }
2182   while (salp) {
2183     ddp = salp->segs;
2184     sip = ddp->id->next;
2185     bsp = CddRetrieveBioseqById(sip,bssp->seq_set);
2186     buffer = MemNew((bsp->length)*sizeof(Uint1));
2187     spp = SeqPortNew(bsp, 0, bsp->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
2188     for (i=0; i<bsp->length;i++) buffer[i] = SeqPortGetResidue(spp);
2189     spp = SeqPortFree(spp);
2190     while (ddp) {
2191       for (c=ddp->starts[1];c<ddp->starts[1]+ddp->len;c++) {
2192         i = ddp->starts[0]-offset+c-ddp->starts[1];
2193         typefreq[i][buffer[c]] += 1.0;
2194       }
2195       ddp = ddp->next;
2196     }
2197     MemFree(buffer);
2198     salp = salp->next;
2199   }
2200   for (i=0;i<qlength;i++) {
2201     sumfreq = 0.0;
2202     for (a=0;a<alphabetSize;a++) {
2203       if (typefreq[i][a] > 0.0) {
2204         ntypes[i]++;
2205 	sumfreq += typefreq[i][a];
2206       }
2207     }
2208     if (sumfreq > 0.0) for (a=0;a<alphabetSize;a++) {
2209       typefreq[i][a] /= sumfreq;
2210     }
2211   }
2212 
2213   for (c=0;c<qlength;c++) {
2214     Informativeness[c] = 0.0;
2215     for (a=0;a<alphabetSize;a++) {
2216       if (standardProb[a] > posEpsilon) {
2217         QoverPest = typefreq[c][a] / standardProb[a];
2218         if (QoverPest > posEpsilon) {
2219 	  Informativeness[c] += typefreq[c][a] * log(QoverPest) / NCBIMATH_LN2;
2220         }
2221       }
2222     }
2223   }
2224 
2225   *Niobs = 0.0;
2226   for (i=0;i<qlength;i++) *Niobs += (Nlm_FloatHi) ntypes[i];
2227   *Niobs /= (Nlm_FloatHi) qlength;
2228 
2229   for (i=0;i<qlength;i++) MemFree(typefreq[i]);
2230   MemFree(typefreq);
2231   MemFree(ntypes);
2232   MemFree(standardProb);
2233   return(Informativeness);
2234 }
2235 
2236 /*---------------------------------------------------------------------------*/
2237 /*---------------------------------------------------------------------------*/
2238 /* Calculate per column information content for a CDD pssm                   */
2239 /*---------------------------------------------------------------------------*/
2240 /*---------------------------------------------------------------------------*/
CddPssmInform(CddPtr pcdd)2241 Nlm_FloatHiPtr LIBCALL CddPssmInform(CddPtr pcdd)
2242 {
2243   BLAST_ScoreBlkPtr  sbp;
2244   BLAST_ResFreqPtr   stdrfp;
2245   Nlm_FloatHiPtr     standardProb;
2246   Nlm_FloatHiPtr     Informativeness;
2247   Nlm_FloatHi        thisposFreq;
2248   Int2               iStatus;
2249   Int4               c, a;
2250   Int4               qlength;
2251   Int4               alphabetSize = 26;
2252   MatrixPtr          posfreq;
2253   ValNodePtr         vnp;
2254   Nlm_FloatHi        posEpsilon = 0.0001;
2255   Nlm_FloatHi        QoverPest;
2256 
2257   if (!pcdd) return(NULL);
2258   if (!pcdd->trunc_master) return(NULL);
2259   qlength = pcdd->trunc_master->length;
2260   Informativeness = MemNew(sizeof(Nlm_FloatHi) * qlength);
2261   sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
2262   sbp->read_in_matrix = TRUE;
2263   sbp->protein_alphabet = TRUE;
2264   sbp->posMatrix = NULL;
2265   sbp->number_of_contexts = 1;
2266   iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
2267   stdrfp = BlastResFreqNew(sbp);
2268   BlastResFreqStdComp(sbp,stdrfp);
2269   standardProb = MemNew(alphabetSize * sizeof(Nlm_FloatHi));
2270   for(a = 0; a < alphabetSize; a++) standardProb[a] = stdrfp->prob[a];
2271   stdrfp = BlastResFreqDestruct(stdrfp);
2272   posfreq = pcdd->posfreq;
2273   if (posfreq->nrows != alphabetSize || posfreq->ncolumns != qlength) {
2274     CddSevError("posfreq matrix size mismatch in CddPssmInform");
2275   }
2276   vnp = posfreq->columns;
2277   for (c=0;c<qlength;c++) {
2278     Informativeness[c] = 0.0;
2279     for (a=0;a<alphabetSize;a++) {
2280       if (standardProb[a] > posEpsilon) {
2281         thisposFreq = (Nlm_FloatHi) vnp->data.intvalue / (Nlm_FloatHi) posfreq->scale_factor;
2282         QoverPest = thisposFreq / standardProb[a];
2283         if (QoverPest > posEpsilon) {
2284 	  Informativeness[c] += thisposFreq * log(QoverPest) / NCBIMATH_LN2;
2285         }
2286       }
2287       vnp = vnp->next;
2288     }
2289   }
2290   MemFree(standardProb);
2291   return(Informativeness);
2292 }
2293 
2294 /*---------------------------------------------------------------------------*/
2295 /*---------------------------------------------------------------------------*/
2296 /* Calculate per column information content for a frequency table            */
2297 /*---------------------------------------------------------------------------*/
2298 /*---------------------------------------------------------------------------*/
CddPosFreqInform(Nlm_FloatHi ** posFreq,Int4 ncol,Int4 nrow)2299 Nlm_FloatHiPtr LIBCALL CddPosFreqInform(Nlm_FloatHi **posFreq, Int4 ncol, Int4 nrow)
2300 {
2301   BLAST_ScoreBlkPtr  sbp;
2302   BLAST_ResFreqPtr   stdrfp;
2303   Nlm_FloatHiPtr     standardProb;
2304   Nlm_FloatHiPtr     Informativeness;
2305   Nlm_FloatHi        thisposFreq;
2306   Int2               iStatus;
2307   Int4               c, a;
2308   Int4               qlength;
2309   Int4               alphabetSize = 26;
2310   Nlm_FloatHi        posEpsilon = 0.0001;
2311   Nlm_FloatHi        QoverPest;
2312 
2313   if (!posFreq) return(NULL);
2314   qlength = ncol;
2315   if (nrow != alphabetSize) return(NULL);
2316   Informativeness = MemNew(sizeof(Nlm_FloatHi) * qlength);
2317   sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
2318   sbp->read_in_matrix = TRUE;
2319   sbp->protein_alphabet = TRUE;
2320   sbp->posMatrix = NULL;
2321   sbp->number_of_contexts = 1;
2322   iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
2323   stdrfp = BlastResFreqNew(sbp);
2324   BlastResFreqStdComp(sbp,stdrfp);
2325   standardProb = MemNew(alphabetSize * sizeof(Nlm_FloatHi));
2326   for(a = 0; a < alphabetSize; a++) standardProb[a] = stdrfp->prob[a];
2327   stdrfp = BlastResFreqDestruct(stdrfp);
2328 
2329   for (c=0;c<qlength;c++) {
2330     Informativeness[c] = 0.0;
2331     for (a=0;a<alphabetSize;a++) {
2332       if (standardProb[a] > posEpsilon) {
2333         thisposFreq = posFreq[c][a];
2334         QoverPest = thisposFreq / standardProb[a];
2335         if (QoverPest > posEpsilon) {
2336 	  Informativeness[c] += thisposFreq * log(QoverPest) / NCBIMATH_LN2;
2337         }
2338       }
2339     }
2340   }
2341   MemFree(standardProb);
2342   return(Informativeness);
2343 }
2344 
2345 /*---------------------------------------------------------------------------*/
2346 /*---------------------------------------------------------------------------*/
2347 /* Generation of position-specific scoring matrices for database scanning    */
2348 /*---------------------------------------------------------------------------*/
2349 /*---------------------------------------------------------------------------*/
CddDenDiagCposComputation(SeqAlignPtr listOfSeqAligns,BioseqPtr bsp,BioseqPtr bspF,CddPtr pcdd,Int4 pseudoCnt)2350 void LIBCALL CddDenDiagCposComputation(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp,
2351                                        BioseqPtr bspF, CddPtr pcdd, Int4 pseudoCnt)
2352 {
2353     Int4                numalign, numseq;    /*number of alignments and seqs */
2354     posSearchItems      *posSearch;          /*holds position-specific info  */
2355     compactSearchItems  *compactSearch = NULL;
2356     BLAST_ScoreBlkPtr   sbp;
2357     SeqLocPtr           private_slp = NULL;
2358     SeqPortPtr          spp = NULL;
2359     Uint1Ptr            query_seq, query_seq_start;
2360     Uint1               residue;
2361     Int4                index, a, KarlinReturn, array_size, iNew;
2362     Nlm_FloatHiPtr      lambda, K, H;
2363     Int4Ptr             open, extend;
2364     BLAST_ResFreqPtr    stdrfp;
2365     Int2                iStatus;
2366     ValNodePtr          error_return = NULL;
2367     ValNodePtr          ColumnHead;
2368     ValNodePtr          newRow, RowHead;
2369     ValNodePtr          newLetter, LetterHead;
2370     SeqCodeTablePtr     sctp;
2371     Char                ckptFileName[PATH_MAX];
2372     Char                cseqFileName[PATH_MAX];
2373     FILE                *fp;
2374     Boolean             bHasConsensus;
2375     CharPtr             cAccession;
2376 
2377     bHasConsensus = CddHasConsensus(pcdd);
2378 
2379     numalign = CddCountDenDiagSeqAligns(listOfSeqAligns, &numseq);
2380     posSearch = (posSearchItems *) MemNew(1*sizeof(posSearchItems));
2381     sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
2382     sbp->read_in_matrix = TRUE;
2383     sbp->protein_alphabet = TRUE;
2384     sbp->posMatrix = NULL;
2385     sbp->number_of_contexts = 1;
2386     iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
2387     compactSearch = compactSearchNew(compactSearch);
2388     compactSearch->qlength = bsp->length;
2389     compactSearch->alphabetSize = sbp->alphabet_size;
2390     compactSearch->matrix = sbp->matrix;
2391     compactSearch->gapped_calculation = TRUE;
2392     compactSearch->pseudoCountConst = pseudoCnt;
2393     compactSearch->ethresh = 0.001;
2394 
2395 /*---------------------------------------------------------------------------*/
2396 /* get the query sequence                                                    */
2397 /*---------------------------------------------------------------------------*/
2398     private_slp = SeqLocIntNew(0, bspF->length-1 , Seq_strand_plus, SeqIdFindBest(bspF->id, SEQID_GI));
2399 		spp = SeqPortNewByLoc(private_slp, Seq_code_ncbistdaa);
2400     SeqPortSet_do_virtual(spp, TRUE);
2401 		query_seq_start = (Uint1Ptr) MemNew(((bspF->length)+2)*sizeof(Char));
2402 		query_seq_start[0] = NULLB;
2403 		query_seq = query_seq_start+1;
2404     index=0;
2405     while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
2406       if (IS_residue(residue)) {
2407 				query_seq[index] = residue;
2408 				index++;
2409 			}
2410 		}
2411 		query_seq[index] = NULLB;
2412 		spp = SeqPortFree(spp);
2413 		private_slp = SeqLocFree(private_slp);
2414     compactSearch->query = query_seq;
2415 
2416     BlastScoreBlkFill(sbp,(CharPtr) compactSearch->query,compactSearch->qlength,0);
2417 
2418     sbp->kbp_gap_std[0] = BlastKarlinBlkCreate();
2419     KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_std[0],11,1,sbp->name,&error_return);
2420     if (1 == KarlinReturn) {
2421       BlastErrorPrint(error_return);
2422     }
2423     sbp->kbp_gap_psi[0] = BlastKarlinBlkCreate();
2424     KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_psi[0],11,1,sbp->name,&error_return);
2425     if (1 == KarlinReturn) {
2426       BlastErrorPrint(error_return);
2427     }
2428 
2429     array_size = BlastKarlinGetMatrixValues(sbp->name,&open,&extend,&lambda,&K,&H,NULL);
2430     if (array_size > 0) {
2431       for (index = 0; index < array_size; index++) {
2432         if (open[index] == INT2_MAX && extend[index] == INT2_MAX) {
2433           sbp->kbp_ideal = BlastKarlinBlkCreate();
2434           sbp->kbp_ideal->Lambda = lambda[index];
2435           sbp->kbp_ideal->K = K[index];
2436           sbp->kbp_ideal->H = H[index];
2437         }
2438       }
2439       MemFree(open);
2440       MemFree(extend);
2441       MemFree(lambda);
2442       MemFree(K);
2443       MemFree(H);
2444     }
2445     if (sbp->kbp_ideal == NULL) {
2446       sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
2447     }
2448     compactSearch->lambda       = sbp->kbp_gap_std[0]->Lambda;
2449     compactSearch->kbp_std      = sbp->kbp_std;
2450     compactSearch->kbp_psi      = sbp->kbp_psi;
2451     compactSearch->kbp_gap_std  = sbp->kbp_gap_std;
2452     compactSearch->kbp_gap_psi  = sbp->kbp_gap_psi;
2453     compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
2454     compactSearch->K_ideal      = sbp->kbp_ideal->K;
2455 
2456     stdrfp = BlastResFreqNew(sbp);
2457     BlastResFreqStdComp(sbp,stdrfp);
2458     compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
2459     for(a = 0; a < compactSearch->alphabetSize; a++)
2460       compactSearch->standardProb[a] = stdrfp->prob[a];
2461     stdrfp = BlastResFreqDestruct(stdrfp);
2462     posSearch->posInformation = NULL;
2463 /*---------------------------------------------------------------------------*/
2464 /* numseq is replaced with numalign (last argument) - each alignment is a    */
2465 /* set of diags and represents an independent sequence/domain                */
2466 /*---------------------------------------------------------------------------*/
2467     posAllocateMemory(posSearch, compactSearch->alphabetSize, compactSearch->qlength, numalign);
2468     CddfindDenseDiagThreshSequences(posSearch, listOfSeqAligns, numalign, numalign);
2469     sbp->kbp     = sbp->kbp_psi;
2470     sbp->kbp_gap = sbp->kbp_gap_psi;
2471     CddposInitializeInformation(posSearch,sbp,compactSearch);
2472     CddposDenseDiagDemographics(posSearch, compactSearch, listOfSeqAligns);
2473     posPurgeMatches(posSearch, compactSearch, NULL);
2474 /*---------------------------------------------------------------------------*/
2475 /* remove the consensus from the CposComputation if present in the CDD       */
2476 /*---------------------------------------------------------------------------*/
2477     if (bHasConsensus) posCancel(posSearch,compactSearch,0,0,0,compactSearch->qlength);
2478     posComputeExtents(posSearch, compactSearch);
2479     posComputeSequenceWeights(posSearch, compactSearch, 1.0);
2480     posCheckWeights(posSearch, compactSearch);
2481     posSearch->posFreqs = (Nlm_FloatHi **) posComputePseudoFreqs(posSearch, compactSearch, TRUE);
2482     CddposFreqsToMatrix(posSearch,compactSearch);
2483     posScaling(posSearch, compactSearch);
2484 
2485     sctp = SeqCodeTableFind(Seq_code_ncbistdaa);
2486     LetterHead = NULL;
2487     for (a=0;a<compactSearch->alphabetSize;a++) {
2488       newLetter = ValNodeNew(NULL);
2489       newLetter->data.ptrvalue = MemNew(2*sizeof(Char));
2490       Nlm_StrNCpy(newLetter->data.ptrvalue,&(sctp->letters[a]),1);
2491       ValNodeLink(&(LetterHead),newLetter);
2492     }
2493 
2494     pcdd->posfreq = (MatrixPtr) MatrixNew();
2495     pcdd->posfreq->ncolumns = compactSearch->qlength;
2496     pcdd->posfreq->nrows = compactSearch->alphabetSize;
2497     pcdd->posfreq->scale_factor = POSFREQ_SCALE;
2498     pcdd->posfreq->row_labels = LetterHead;
2499     ColumnHead = NULL;
2500     for (index = 0; index<compactSearch->qlength;index++) {
2501       RowHead = NULL;
2502       for (a=0;a<compactSearch->alphabetSize;a++) {
2503          newRow = ValNodeNew(NULL);
2504          iNew = (Int4) (0.5 + (Nlm_FloatHi) pcdd->posfreq->scale_factor * posSearch->posFreqs[index][a]);
2505          newRow->data.intvalue = iNew;
2506            ValNodeLink(&(ColumnHead),newRow);
2507       }
2508     }
2509     pcdd->posfreq->columns = ColumnHead;
2510 
2511     pcdd->scoremat = (MatrixPtr) MatrixNew();
2512     pcdd->scoremat->ncolumns = compactSearch->qlength;
2513     pcdd->scoremat->nrows = compactSearch->alphabetSize;
2514     pcdd->scoremat->scale_factor = 1;
2515     pcdd->scoremat->row_labels = LetterHead;
2516     ColumnHead = NULL;
2517     for (index = 0; index<compactSearch->qlength;index++) {
2518       RowHead = NULL;
2519       for (a=0;a<compactSearch->alphabetSize;a++) {
2520          newRow = ValNodeNew(NULL);
2521          iNew = (Int4) posSearch->posMatrix[index][a];
2522          newRow->data.intvalue = iNew;
2523            ValNodeLink(&(ColumnHead),newRow);
2524       }
2525     }
2526     pcdd->scoremat->columns = ColumnHead;
2527 
2528 /*---------------------------------------------------------------------------*/
2529 /* Construct name for checkpoint file                                        */
2530 /*---------------------------------------------------------------------------*/
2531     cAccession = CddGetAccession(pcdd);
2532     strcpy(ckptFileName,cAccession);
2533     strcat(ckptFileName,CKPTEXT);
2534     CddposTakeCheckpoint(posSearch, compactSearch, ckptFileName, &error_return);
2535     strcpy(cseqFileName,cAccession);
2536     strcat(cseqFileName,CSEQEXT);
2537     MemFree(cAccession);
2538 
2539     fp = FileOpen(cseqFileName, "w");
2540     if (NULL == fp) {
2541       BlastConstructErrorMessage("CddDenDiagCposComputation", "Could not open fasta file", 1, &error_return);
2542     }
2543     BioseqToFasta(bsp,fp,FALSE);
2544     FileClose(fp);
2545 }
2546 
2547 
2548 /*---------------------------------------------------------------------------*/
2549 /*---------------------------------------------------------------------------*/
2550 /* Generation of position-specific scoring matrices for database scanning    */
2551 /*---------------------------------------------------------------------------*/
2552 /*---------------------------------------------------------------------------*/
CddCposComputation(SeqAlignPtr listOfSeqAligns,BioseqPtr bsp,CddPtr pcdd)2553 void LIBCALL CddCposComputation(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp,
2554                                 CddPtr pcdd)
2555 {
2556     Int4                numalign, numseq;    /*number of alignments and seqs */
2557     posSearchItems      *posSearch;          /*holds position-specific info  */
2558     compactSearchItems  *compactSearch = NULL;
2559     BLAST_ScoreBlkPtr   sbp;
2560     SeqLocPtr           private_slp = NULL;
2561     SeqPortPtr          spp = NULL;
2562     Uint1Ptr            query_seq, query_seq_start;
2563     Uint1               residue;
2564     Int4                index, a, KarlinReturn, array_size, iNew;
2565     Nlm_FloatHiPtr      lambda, K, H;
2566     Int4Ptr             open, extend;
2567     BLAST_ResFreqPtr    stdrfp;
2568     Int2                iStatus;
2569     ValNodePtr          error_return = NULL;
2570     ValNodePtr          ColumnHead;
2571     ValNodePtr          newRow, RowHead;
2572     ValNodePtr          newLetter, LetterHead;
2573     SeqCodeTablePtr     sctp;
2574     Char                ckptFileName[PATH_MAX];
2575     Char                cseqFileName[PATH_MAX];
2576     FILE                *fp;
2577     CharPtr             cAccession;
2578 
2579     numalign = CddCountSeqAligns(listOfSeqAligns, &numseq);
2580     posSearch = (posSearchItems *) MemNew(1*sizeof(posSearchItems));
2581     sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
2582     sbp->read_in_matrix = TRUE;
2583     sbp->protein_alphabet = TRUE;
2584     sbp->posMatrix = NULL;
2585     sbp->number_of_contexts = 1;
2586     iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
2587     compactSearch = compactSearchNew(compactSearch);
2588     compactSearch->qlength = bsp->length;
2589     compactSearch->alphabetSize = sbp->alphabet_size;
2590     compactSearch->matrix = sbp->matrix;
2591     compactSearch->gapped_calculation = TRUE;
2592     compactSearch->pseudoCountConst = 10;
2593     compactSearch->ethresh = 0.001;
2594 
2595 /*---------------------------------------------------------------------------*/
2596 /* get the query sequence                                                    */
2597 /*---------------------------------------------------------------------------*/
2598     private_slp = SeqLocIntNew(0, bsp->length-1 , Seq_strand_plus, SeqIdFindBest(bsp->id, SEQID_GI));
2599 		spp = SeqPortNewByLoc(private_slp, Seq_code_ncbistdaa);
2600     SeqPortSet_do_virtual(spp, TRUE);
2601 		query_seq_start = (Uint1Ptr) MemNew(((bsp->length)+2)*sizeof(Char));
2602 		query_seq_start[0] = NULLB;
2603 		query_seq = query_seq_start+1;
2604     index=0;
2605     while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
2606       if (IS_residue(residue)) {
2607 				query_seq[index] = residue;
2608 				index++;
2609 			}
2610 		}
2611 		query_seq[index] = NULLB;
2612 		spp = SeqPortFree(spp);
2613 		private_slp = SeqLocFree(private_slp);
2614     compactSearch->query = query_seq;
2615 
2616     BlastScoreBlkFill(sbp,(CharPtr) compactSearch->query,compactSearch->qlength,0);
2617 
2618     sbp->kbp_gap_std[0] = BlastKarlinBlkCreate();
2619     KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_std[0],11,1,sbp->name,&error_return);
2620     if (1 == KarlinReturn) {
2621       BlastErrorPrint(error_return);
2622     }
2623     sbp->kbp_gap_psi[0] = BlastKarlinBlkCreate();
2624     KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_psi[0],11,1,sbp->name,&error_return);
2625     if (1 == KarlinReturn) {
2626       BlastErrorPrint(error_return);
2627     }
2628 
2629     array_size = BlastKarlinGetMatrixValues(sbp->name,&open,&extend,&lambda,&K,&H,NULL);
2630     if (array_size > 0) {
2631       for (index = 0; index < array_size; index++) {
2632         if (open[index] == INT2_MAX && extend[index] == INT2_MAX) {
2633           sbp->kbp_ideal = BlastKarlinBlkCreate();
2634           sbp->kbp_ideal->Lambda = lambda[index];
2635           sbp->kbp_ideal->K = K[index];
2636           sbp->kbp_ideal->H = H[index];
2637         }
2638       }
2639       MemFree(open);
2640       MemFree(extend);
2641       MemFree(lambda);
2642       MemFree(K);
2643       MemFree(H);
2644     }
2645     if (sbp->kbp_ideal == NULL) {
2646       sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
2647     }
2648     compactSearch->lambda       = sbp->kbp_gap_std[0]->Lambda;
2649     compactSearch->kbp_std      = sbp->kbp_std;
2650     compactSearch->kbp_psi      = sbp->kbp_psi;
2651     compactSearch->kbp_gap_std  = sbp->kbp_gap_std;
2652     compactSearch->kbp_gap_psi  = sbp->kbp_gap_psi;
2653     compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
2654     compactSearch->K_ideal      = sbp->kbp_ideal->K;
2655 
2656     stdrfp = BlastResFreqNew(sbp);
2657     BlastResFreqStdComp(sbp,stdrfp);
2658     compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
2659     for(a = 0; a < compactSearch->alphabetSize; a++)
2660       compactSearch->standardProb[a] = stdrfp->prob[a];
2661     stdrfp = BlastResFreqDestruct(stdrfp);
2662     posSearch->posInformation = NULL;
2663 /*---------------------------------------------------------------------------*/
2664 /* numseq is replaced with numalign (last argument) - each alignment is a    */
2665 /* single segment and represents a separate sequence                         */
2666 /*---------------------------------------------------------------------------*/
2667     posAllocateMemory(posSearch, compactSearch->alphabetSize, compactSearch->qlength, numalign);
2668     CddfindThreshSequences(posSearch, listOfSeqAligns, numalign, numalign);
2669     sbp->kbp     = sbp->kbp_psi;
2670     sbp->kbp_gap = sbp->kbp_gap_psi;
2671     CddposInitializeInformation(posSearch,sbp,compactSearch);
2672     CddposDemographics(posSearch, compactSearch, listOfSeqAligns);
2673     posPurgeMatches(posSearch, compactSearch, NULL);
2674     posComputeExtents(posSearch, compactSearch);
2675     posComputeSequenceWeights(posSearch, compactSearch, 1.0);
2676     posCheckWeights(posSearch, compactSearch);
2677     posSearch->posFreqs = (Nlm_FloatHi **) posComputePseudoFreqs(posSearch, compactSearch, TRUE);
2678     CddposFreqsToMatrix(posSearch,compactSearch);
2679     posScaling(posSearch, compactSearch);
2680 
2681     sctp = SeqCodeTableFind(Seq_code_ncbistdaa);
2682     LetterHead = NULL;
2683     for (a=0;a<compactSearch->alphabetSize;a++) {
2684       newLetter = ValNodeNew(NULL);
2685       newLetter->data.ptrvalue = MemNew(2*sizeof(Char));
2686       Nlm_StrNCpy(newLetter->data.ptrvalue,&(sctp->letters[a]),1);
2687       ValNodeLink(&(LetterHead),newLetter);
2688     }
2689 
2690     pcdd->posfreq = (MatrixPtr) MatrixNew();
2691     pcdd->posfreq->ncolumns = compactSearch->qlength;
2692     pcdd->posfreq->nrows = compactSearch->alphabetSize;
2693     pcdd->posfreq->scale_factor = POSFREQ_SCALE;
2694     pcdd->posfreq->row_labels = LetterHead;
2695     ColumnHead = NULL;
2696     for (index = 0; index<compactSearch->qlength;index++) {
2697       RowHead = NULL;
2698       for (a=0;a<compactSearch->alphabetSize;a++) {
2699          newRow = ValNodeNew(NULL);
2700          iNew = (Int4) (0.5 + (Nlm_FloatHi) pcdd->posfreq->scale_factor * posSearch->posFreqs[index][a]);
2701          newRow->data.intvalue = iNew;
2702            ValNodeLink(&(ColumnHead),newRow);
2703       }
2704     }
2705     pcdd->posfreq->columns = ColumnHead;
2706 
2707     pcdd->scoremat = (MatrixPtr) MatrixNew();
2708     pcdd->scoremat->ncolumns = compactSearch->qlength;
2709     pcdd->scoremat->nrows = compactSearch->alphabetSize;
2710     pcdd->scoremat->scale_factor = 1;
2711     pcdd->scoremat->row_labels = LetterHead;
2712     ColumnHead = NULL;
2713     for (index = 0; index<compactSearch->qlength;index++) {
2714       RowHead = NULL;
2715       for (a=0;a<compactSearch->alphabetSize;a++) {
2716          newRow = ValNodeNew(NULL);
2717          iNew = (Int4) posSearch->posMatrix[index][a];
2718          newRow->data.intvalue = iNew;
2719            ValNodeLink(&(ColumnHead),newRow);
2720       }
2721     }
2722     pcdd->scoremat->columns = ColumnHead;
2723 
2724 /*---------------------------------------------------------------------------*/
2725 /* Construct name for checkpoint file                                        */
2726 /*---------------------------------------------------------------------------*/
2727     cAccession = CddGetAccession(pcdd);
2728     strcpy(ckptFileName,cAccession);
2729     strcat(ckptFileName,CKPTEXT);
2730 
2731     CddposTakeCheckpoint(posSearch, compactSearch, ckptFileName, &error_return);
2732     strcpy(cseqFileName,cAccession);
2733     strcat(cseqFileName,CSEQEXT);
2734 
2735     MemFree(cAccession);
2736 
2737     fp = FileOpen(cseqFileName, "w");
2738     if (NULL == fp) {
2739       BlastConstructErrorMessage("CddCposComputation", "Could not open fasta file", 1, &error_return);
2740     }
2741     BioseqToFasta(bsp,fp,FALSE);
2742     FileClose(fp);
2743 }
2744 
2745 
2746 /*---------------------------------------------------------------------------*/
2747 /*---------------------------------------------------------------------------*/
2748 /* Count the number of seqAligns in a list (returned) and count the number   */
2749 /* of distinct target sequences represented (passed back in numSequences);   */
2750 /* Important assumption: All SeqAligns with  the same target sequence are    */
2751 /* consecutive in the list                                                   */
2752 /* ONLY works for lists of Seqaligns containing DenseSegs!!!                 */
2753 /*---------------------------------------------------------------------------*/
2754 /*---------------------------------------------------------------------------*/
CddCountSeqAligns(SeqAlignPtr listOfSeqAligns,Int4 * numSequences)2755 Int4 LIBCALL CddCountSeqAligns(SeqAlignPtr listOfSeqAligns, Int4 * numSequences)
2756 {
2757   SeqAlignPtr curSeqAlign, prevSeqAlign;
2758   Int4        seqAlignCounter;
2759   DenseSegPtr curSegs;
2760   SeqIdPtr    curId, prevId;
2761 
2762   seqAlignCounter = 0;
2763   *numSequences = 0;
2764   curSeqAlign = listOfSeqAligns;
2765   prevSeqAlign = NULL;
2766   while (NULL != curSeqAlign) {
2767     curSegs = (DenseSegPtr) curSeqAlign->segs;
2768     if(curSegs->ids == NULL) break;
2769     curId = curSegs->ids->next;
2770     seqAlignCounter++;
2771     if ((NULL == prevSeqAlign) || (!(SeqIdMatch(curId, prevId))))
2772       (*numSequences)++;
2773     prevSeqAlign = curSeqAlign;
2774     prevId = curId;
2775     curSeqAlign = curSeqAlign->next;
2776   }
2777   return(seqAlignCounter);
2778 }
2779 
2780 /*---------------------------------------------------------------------------*/
2781 /*---------------------------------------------------------------------------*/
2782 /* Count the number of seqAligns in a list (returned) and count the number   */
2783 /* of distinct target sequences represented (passed back in numSequences);   */
2784 /* Important assumption: All SeqAligns with  the same target sequence are    */
2785 /* consecutive in the list                                                   */
2786 /* ONLY works for lists of Seqaligns containing DenseDiags!!!                */
2787 /*---------------------------------------------------------------------------*/
2788 /*---------------------------------------------------------------------------*/
CddCountDenDiagSeqAligns(SeqAlignPtr listOfSeqAligns,Int4 * numSequences)2789 Int4 LIBCALL CddCountDenDiagSeqAligns(SeqAlignPtr listOfSeqAligns, Int4 * numSequences)
2790 {
2791   SeqAlignPtr  curSeqAlign;
2792   Int4         seqAlignCounter;
2793   DenseDiagPtr curSegs;
2794 
2795   seqAlignCounter = 0;
2796   *numSequences = 0;
2797   curSeqAlign = listOfSeqAligns;
2798   while (NULL != curSeqAlign) {
2799     curSegs = (DenseDiagPtr) curSeqAlign->segs;
2800     if(curSegs->id == NULL) break;
2801     seqAlignCounter++;
2802     (*numSequences)++;
2803     curSeqAlign = curSeqAlign->next;
2804   }
2805   return(seqAlignCounter);
2806 }
2807 
2808 /*---------------------------------------------------------------------------*/
2809 /*---------------------------------------------------------------------------*/
2810 /* assign the range of the master sequence involved in alignments            */
2811 /*---------------------------------------------------------------------------*/
2812 /*---------------------------------------------------------------------------*/
CddAssignProfileRange(CddPtr pcdd,SeqIdPtr sip)2813 void LIBCALL CddAssignProfileRange(CddPtr pcdd, SeqIdPtr sip)
2814 {
2815   SeqAlignPtr  salp;
2816   DenseDiagPtr ddp;
2817   SeqIntPtr    sintp;
2818   Int4         iMax = -10000000;
2819   Int4         iMin =  10000000;
2820 
2821   sintp = (SeqIntPtr) SeqIntNew();
2822 
2823   salp = (SeqAlignPtr) pcdd->seqannot->data;
2824 
2825   while (salp) {
2826     ddp = (DenseDiagPtr) salp->segs;
2827     while (ddp) {
2828       if (ddp->starts[0] < iMin) iMin = ddp->starts[0];
2829       if ((ddp->starts[0] + ddp->len - 1) > iMax)
2830         iMax = ddp->starts[0] + ddp->len - 1;
2831       ddp = ddp->next;
2832     }
2833     salp = salp->next;
2834   }
2835   sintp->from = iMin;
2836   sintp->to = iMax;
2837   sintp->id = (SeqIdPtr) SeqIdDup(sip);
2838   if (iMax >= iMin) {
2839     pcdd->profile_range = (struct seqint PNTR) sintp;
2840   } else {
2841     printf(" CddAssignProfileRange: Warning: Can not assign alignment range!\n");
2842   }
2843 }
2844 
2845 
2846 /*---------------------------------------------------------------------------*/
2847 /*---------------------------------------------------------------------------*/
2848 /* return a pointer to a specific bioseq from a seq-entry set                */
2849 /*---------------------------------------------------------------------------*/
2850 /*---------------------------------------------------------------------------*/
CddFindBioseqInSeqEntry(SeqEntryPtr sep,SeqIdPtr sip)2851 BioseqPtr LIBCALL CddFindBioseqInSeqEntry(SeqEntryPtr sep, SeqIdPtr sip)
2852 {
2853   SeqEntryPtr  sepThis;
2854   BioseqPtr    bsp;
2855   BioseqSetPtr bssp;
2856 
2857   sepThis = sep;
2858 
2859   while (sepThis) {
2860     if (IS_Bioseq(sepThis)) {
2861       bsp = sepThis->data.ptrvalue;
2862       if (CddSameSip(bsp->id, sip)) return(bsp);
2863     } else if (IS_Bioseq_set(sepThis)) {
2864       bssp = sepThis->data.ptrvalue;
2865       bsp = CddFindBioseqInSeqEntry(bssp->seq_set,sip);
2866       if (CddSameSip(bsp->id, sip)) return(bsp);
2867     }
2868     sepThis = sepThis->next;
2869   }
2870   return(NULL);
2871 }
2872 
2873 /*---------------------------------------------------------------------------*/
2874 /* rips out and returns a PDBSeqId from a SeqId                              */
2875 /*---------------------------------------------------------------------------*/
CddGetPdbSeqId(SeqIdPtr sip)2876 PDBSeqIdPtr LIBCALL CddGetPdbSeqId(SeqIdPtr sip)
2877                     /* may need to be modified according to how bioseq id is */
2878 {
2879   SeqIdPtr    seq_id = NULL;
2880   PDBSeqIdPtr pdb_seq_id = NULL;
2881 
2882   seq_id = sip;
2883   while(seq_id != NULL){
2884     if(seq_id->choice == SEQID_PDB) {
2885       pdb_seq_id = seq_id->data.ptrvalue;
2886       break;
2887     }
2888     seq_id = seq_id->next;
2889   }
2890   return(pdb_seq_id);
2891 }
2892 
2893 
2894 /*---------------------------------------------------------------------------*/
2895 /* test whether two SeqIds are the same, only considers PDB-Ids and GIs      */
2896 /* as well as local sequence-id's                                            */
2897 /*---------------------------------------------------------------------------*/
CddSameSip(SeqIdPtr sip1,SeqIdPtr sip2)2898 Boolean LIBCALL CddSameSip(SeqIdPtr sip1, SeqIdPtr sip2)
2899 {
2900   PDBSeqIdPtr psip1, psip2;
2901   SeqIdPtr    tsip1, tsip2;
2902   ObjectIdPtr oidp1, oidp2;
2903 
2904   if (sip1 == sip2) return(TRUE);
2905   tsip1 = sip1;
2906   while (tsip1) {
2907     tsip2 = sip2;
2908     while (tsip2) {
2909       if (tsip1->choice == tsip2->choice) {
2910         if (tsip1->choice == SEQID_PDB) {
2911           psip1 = (PDBSeqIdPtr) CddGetPdbSeqId(tsip1);
2912           psip2 = (PDBSeqIdPtr) CddGetPdbSeqId(tsip2);
2913           if (strcmp(psip1->mol,psip2->mol)==0 && psip1->chain==psip2->chain)
2914             return(TRUE);
2915         } else if (tsip1->choice == SEQID_LOCAL) {
2916           oidp1 = tsip1->data.ptrvalue;
2917           oidp2 = tsip2->data.ptrvalue;
2918           if (NULL != oidp1 && strcmp(oidp1->str,oidp2->str)==0) return(TRUE);
2919         } else if (tsip1->choice == SEQID_GI) {
2920 	  if (tsip1->data.intvalue == tsip2->data.intvalue) return(TRUE);
2921 	}
2922       }
2923       tsip2 = tsip2->next;
2924     }
2925     tsip1 = tsip1->next;
2926   }
2927   return(FALSE);
2928 }
2929 
2930 /*---------------------------------------------------------------------------*/
2931 /*---------------------------------------------------------------------------*/
2932 /* reindex master of a MSL-DenSeg Alignment according to a n-terminal offset */
2933 /* must allocate new arrays for "starts", these may just be pointers to DDG  */
2934 /*---------------------------------------------------------------------------*/
2935 /*---------------------------------------------------------------------------*/
CddReindexMSLDenSegMaster(SeqAlignPtr salp,Int4 offset)2936 void LIBCALL CddReindexMSLDenSegMaster(SeqAlignPtr salp, Int4 offset)
2937 {
2938   SeqAlignPtr salpThis;
2939   DenseSegPtr dsp;
2940   Int4        *newstarts, i;
2941 
2942   if (salp->type == SAT_PARTIAL && salp->segtype == SAS_DENSEG) {
2943     salpThis = salp;
2944     while (salpThis) {
2945       dsp = salpThis->segs;
2946       if (dsp) {
2947         newstarts = (Int4 *)MemNew(dsp->dim * sizeof(Int4));
2948         for (i=0;i<dsp->dim;i++) newstarts[i] = dsp->starts[i];
2949         newstarts[0] -= offset;
2950         dsp->starts = newstarts;
2951       }
2952       salpThis = salpThis->next;
2953     }
2954   }
2955 }
2956 
2957 
2958 /*---------------------------------------------------------------------------*/
2959 /*---------------------------------------------------------------------------*/
2960 /* reindex master of a MSL-DenDiag Alignment according to a n-terminal offset*/
2961 /* must allocate new arrays for "starts", these may just be pointers to DDG  */
2962 /*---------------------------------------------------------------------------*/
2963 /*---------------------------------------------------------------------------*/
CddReindexMSLDenDiagMaster(SeqAlignPtr salp,Int4 offset)2964 void LIBCALL CddReindexMSLDenDiagMaster(SeqAlignPtr salp, Int4 offset)
2965 {
2966   SeqAlignPtr  salpThis;
2967   DenseDiagPtr ddp;
2968 
2969   if (salp->type == SAT_PARTIAL && salp->segtype == SAS_DENDIAG) {
2970     salpThis = salp;
2971     while (salpThis) {
2972       ddp = salpThis->segs;
2973       while (ddp) {
2974         ddp->starts[0] -= offset;
2975         ddp = ddp->next;
2976       }
2977       salpThis = salpThis->next;
2978     }
2979   }
2980 }
2981 
2982 
2983 /*---------------------------------------------------------------------------*/
2984 /*---------------------------------------------------------------------------*/
2985 /* makes a copy of one master-slave dense-diag alignment - does NOT copy the */
2986 /* identifiers. Should only be used for a reindexing the master for CposComp.*/
2987 /*---------------------------------------------------------------------------*/
2988 /*---------------------------------------------------------------------------*/
CddCopyMSLDenDiag(SeqAlignPtr salp)2989 SeqAlignPtr LIBCALL CddCopyMSLDenDiag(SeqAlignPtr salp)
2990 {
2991   SeqAlignPtr  salpNew = NULL, salpHead = NULL, salpTail = NULL;
2992   DenseDiagPtr ddp, ddpNew, ddpTail;
2993   Int4         i;
2994 
2995   if (salp->type == SAT_PARTIAL && salp->segtype == SAS_DENDIAG) {
2996     while (salp) {
2997       salpNew = SeqAlignNew();
2998       salpNew->type = salp->type;
2999       salpNew->segtype = salp->segtype;
3000       salpNew->dim = salp->dim;
3001       salpNew->segs = NULL;
3002       ddp = salp->segs;
3003       while (ddp) {
3004         ddpNew = DenseDiagNew();
3005         ddpNew->dim = ddp->dim;
3006         ddpNew->id = ddp->id;
3007         ddpNew->starts = MemNew(ddpNew->dim * sizeof(Int4));
3008         for (i =0;i<ddpNew->dim;i++) ddpNew->starts[i] = ddp->starts[i];
3009         ddpNew->len = ddp->len;
3010         if (!salpNew->segs) {
3011           salpNew->segs = ddpNew;
3012           ddpTail = ddpNew;
3013           ddpTail->next = NULL;
3014         } else {
3015           ddpTail->next = ddpNew;
3016           ddpTail = ddpNew;
3017           ddpTail->next = NULL;
3018         }
3019         ddp = ddp->next;
3020       }
3021       if (!salpHead) {
3022         salpHead = salpNew;
3023         salpTail = salpHead;
3024         salpTail->next = NULL;
3025       } else {
3026         salpTail->next = salpNew;
3027         salpTail = salpNew;
3028         salpTail->next = NULL;
3029       }
3030       salp = salp->next;
3031     }
3032   }
3033   return salpHead;
3034 }
3035 
3036 
3037 /*---------------------------------------------------------------------------*/
3038 /*---------------------------------------------------------------------------*/
3039 /* allocate and free a new ExplicitAlignment object                          */
3040 /*---------------------------------------------------------------------------*/
3041 /*---------------------------------------------------------------------------*/
CddExpAlignNew()3042 CddExpAlignPtr CddExpAlignNew()
3043 {
3044   CddExpAlignPtr pCDea;
3045 
3046   pCDea = MemNew(sizeof(CddExpAlign));
3047   if (!pCDea) return NULL;
3048   pCDea->length = 0;
3049   pCDea->ids = NULL;
3050   pCDea->adata = NULL;
3051   pCDea->starts = NULL;
3052   pCDea->bIdAlloc = FALSE;
3053   return(pCDea);
3054 }
3055 
CddExpAlignFree(CddExpAlignPtr pCDea)3056 CddExpAlignPtr CddExpAlignFree(CddExpAlignPtr pCDea)
3057 {
3058   if (!pCDea) return NULL;
3059   if (NULL != pCDea->adata) free(pCDea->adata);
3060   pCDea->ids = SeqIdSetFree(pCDea->ids);
3061   if (NULL != pCDea->starts) free(pCDea->starts);
3062   return MemFree(pCDea);
3063 }
3064 
CddExpAlignAlloc(CddExpAlignPtr pCDea,Int4 iLength)3065 void CddExpAlignAlloc(CddExpAlignPtr pCDea, Int4 iLength)
3066 {
3067   Int4    i;
3068 
3069   if (!pCDea || !iLength) return;
3070   pCDea->length = iLength;
3071   if (NULL != pCDea->adata) free(pCDea->adata);
3072   pCDea->adata = MemNew(sizeof(Int4)*pCDea->length);
3073   for (i=0;i<pCDea->length;i++) pCDea->adata[i] = -1;
3074   if (NULL != pCDea->starts) free(pCDea->starts);
3075   pCDea->starts = MemNew(sizeof(Int4)*pCDea->length);
3076   for (i=0;i<pCDea->length;i++) pCDea->starts[i] = 0;
3077 }
3078 
3079 /*---------------------------------------------------------------------------*/
3080 /* Convert a SeqAlign (pairwise, DenseDiag) into an ExplicitAlignmentObject  */
3081 /*---------------------------------------------------------------------------*/
SeqAlignToCddExpAlign(SeqAlignPtr salp,SeqEntryPtr sep)3082 CddExpAlignPtr LIBCALL SeqAlignToCddExpAlign(SeqAlignPtr salp, SeqEntryPtr sep)
3083 {
3084   CddExpAlignPtr   pCDea;
3085   BioseqPtr        bsp1;
3086   DenseDiagPtr     ddp;
3087   Int4             i;
3088   SeqIdPtr         sip;
3089 
3090   pCDea = CddExpAlignNew();
3091   ddp = salp->segs;
3092   pCDea->ids = SeqIdDupList(ddp->id);
3093   sip = SeqIdDup(pCDea->ids);
3094   bsp1 = CddRetrieveBioseqById(sip,sep);
3095   SeqIdFree(sip);
3096   CddExpAlignAlloc(pCDea,bsp1->length);
3097   while (ddp) {
3098     for (i=0;i<ddp->len;i++) {
3099       if (ddp->starts[0]+i >= bsp1->length) {
3100         printf("Warning: Indexing error!\n");
3101       }
3102       pCDea->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3103     }
3104     pCDea->starts[ddp->starts[0]] = 1;
3105     ddp = ddp->next;
3106   }
3107   return(pCDea);
3108 }
3109 
3110 /*---------------------------------------------------------------------------*/
3111 /* Invert an explicit alignment object (flip master-slave)                   */
3112 /*---------------------------------------------------------------------------*/
InvertCddExpAlign(CddExpAlignPtr pCDea,SeqEntryPtr sep)3113 CddExpAlignPtr InvertCddExpAlign(CddExpAlignPtr pCDea, SeqEntryPtr sep)
3114 {
3115   BioseqPtr      bsp;
3116   CddExpAlignPtr pCDeaNew;
3117   Int4           i;
3118 
3119   if (!pCDea->ids || !pCDea->ids->next) CddSevError("No SeqId in InvertCddExpAlign");
3120   pCDeaNew = CddExpAlignNew();
3121   pCDeaNew->ids = SeqIdDup(pCDea->ids->next);
3122   pCDeaNew->ids->next = SeqIdDup(pCDea->ids);
3123   pCDeaNew->bIdAlloc = TRUE;
3124   bsp = CddRetrieveBioseqById(pCDea->ids->next, sep);
3125   CddExpAlignAlloc(pCDeaNew, bsp->length);
3126   for (i=0;i<pCDea->length;i++) {
3127     if (pCDea->adata[i] != -1) {
3128       pCDeaNew->adata[pCDea->adata[i]] = i;
3129     }
3130     if (pCDea->starts[i] > 0) {
3131       pCDeaNew->starts[pCDea->adata[i]] = 1;
3132     }
3133   }
3134   return(pCDeaNew);
3135 }
3136 
3137 /*---------------------------------------------------------------------------*/
3138 /* Convert an Explicit Alignment Object (with proper IDs) to a SeqAlign      */
3139 /*---------------------------------------------------------------------------*/
CddExpAlignToSeqAlign(CddExpAlignPtr pCDea,Int4Ptr iBreakAfter)3140 SeqAlignPtr CddExpAlignToSeqAlign(CddExpAlignPtr pCDea, Int4Ptr iBreakAfter)
3141 {
3142   SeqAlignPtr   salp;
3143   DenseDiagPtr  ddp, ddplast = NULL;
3144   Int4          i;
3145 
3146   if (!pCDea->ids || !pCDea->ids->next)
3147    CddSevError("Missing Sequence ID in CddExpAlignToSeqAlign");
3148   salp = SeqAlignNew();
3149   salp->type = SAT_PARTIAL;
3150   salp->segtype = SAS_DENDIAG;
3151   salp->dim = 2;
3152   salp->segs = NULL;
3153 
3154   i= -1;
3155   while (++i < pCDea->length) {
3156     if (pCDea->adata[i] != -1) {
3157       ddp = DenseDiagNew();
3158       ddp->id = SeqIdDup(pCDea->ids);
3159       ddp->id->next = SeqIdDup(pCDea->ids->next);
3160       ddp->dim = 2;
3161       ddp->starts = MemNew(2*sizeof(Int4));
3162       ddp->starts[0] = i;
3163       ddp->starts[1] = pCDea->adata[i];
3164       ddp->len = 1;
3165       while (i<(pCDea->length-1) && pCDea->adata[i+1]==(pCDea->adata[i]+1) &&
3166              (NULL == iBreakAfter || iBreakAfter[i] == 0) &&
3167 	     pCDea->starts[i+1] == 0) {
3168 	ddp->len++;
3169 	i++;
3170       }
3171       if (ddplast) {
3172         ddplast->next = ddp;
3173       } else {
3174         salp->segs = ddp;
3175       }
3176       ddplast = ddp;
3177     }
3178   }
3179   if (!salp->segs) salp = SeqAlignFree(salp);
3180   return(salp);
3181 }
3182 
3183 /*---------------------------------------------------------------------------*/
3184 /*---------------------------------------------------------------------------*/
3185 /* reindex a m-s1, m-s2 alignment pair to a s1-s2 alignment pair             */
3186 /*---------------------------------------------------------------------------*/
3187 /*---------------------------------------------------------------------------*/
CddReindexExpAlign(CddExpAlignPtr pCDea1,Int4 newlength,CddExpAlignPtr pCDea2,Int4 iOuter,Int4 iInner)3188 CddExpAlignPtr CddReindexExpAlign(CddExpAlignPtr pCDea1, Int4 newlength,
3189                                   CddExpAlignPtr pCDea2, Int4 iOuter, Int4 iInner)
3190 {
3191   CddExpAlignPtr     pCDea = NULL;
3192   Int4               i, iP1, iP2;
3193   SeqIdPtr           sip1, sip2;
3194 
3195   if (!pCDea1 || !pCDea2) return pCDea;
3196   if (pCDea1->ids && pCDea1->ids->next) {
3197     sip1 = SeqIdDup(pCDea1->ids->next);
3198   } else sip1 = NULL;
3199   if (pCDea2->ids && pCDea2->ids->next) {
3200     sip2 = SeqIdDup(pCDea2->ids->next);
3201   } else sip2 = NULL;
3202 
3203   pCDea = CddExpAlignNew();
3204   CddExpAlignAlloc(pCDea,newlength);
3205   for (i=0;i<pCDea1->length && i<pCDea2->length;i++) {
3206     iP1 = pCDea1->adata[i]; iP2 = pCDea2->adata[i];
3207     if (iP1 > -1 && iP2 > -1){
3208       if (iP1 >= newlength) {
3209         printf("Warning: index error between sequences %d and %d\n",
3210 	        iInner, iOuter);
3211         CddSevError("Error while reindexing explicit alignments!");
3212       } else  pCDea->adata[iP1] = iP2;
3213       if (pCDea1->starts[i] > 0) pCDea->starts[iP1] = 1;
3214     }
3215   }
3216   if (sip1 && sip2) {
3217     pCDea->ids = sip1;
3218     pCDea->ids->next = sip2;
3219     pCDea->bIdAlloc = TRUE;
3220   }
3221   return pCDea;
3222 }
3223 
3224 /*---------------------------------------------------------------------------*/
3225 /*---------------------------------------------------------------------------*/
3226 /* returns a pairwise % id for two sequences in a CD (ordered by alignmt)    */
3227 /*---------------------------------------------------------------------------*/
3228 /*---------------------------------------------------------------------------*/
CddGetPairId(TrianglePtr pTri,Int4 idx1,Int4 idx2)3229 FloatHi CddGetPairId(TrianglePtr pTri, Int4 idx1, Int4 idx2)
3230 {
3231   Int4 i,  index;
3232   float    findex;
3233   ScorePtr psc;
3234 
3235   if (idx1 == idx2) return 100.0;
3236   if (idx1 > idx2) { i = idx1; idx1 = idx2; idx2 = i; }
3237   findex=(float)idx1*((float)pTri->nelements-((float)idx1+1.0)/2.0-1.0)+(float)idx2-1.0;
3238   index = (Int4) findex;
3239   if (index < 0) CddSevError("Impossible index in CddGetPairId!");
3240   psc = (ScorePtr) pTri->scores; i=0;
3241   while (psc && (i<index)) {
3242     i++; psc = psc->next;
3243   }
3244   if (psc) return psc->value.realvalue;
3245   else CddSevError("Ran out of scores in CddGetPairId!");
3246   return((FloatHi) 0.0);
3247 }
3248 
HitYet(Int4Ptr retlist,Int4 index,Int4 i)3249 static Boolean HitYet(Int4Ptr retlist, Int4 index, Int4 i)
3250 {
3251   Int4 j;
3252   for (j=0;j<index;j++) {
3253     if (retlist[j]==i) {
3254       return TRUE;
3255     }
3256   }
3257   return FALSE;
3258 }
3259 
3260 /*---------------------------------------------------------------------------*/
3261 /*---------------------------------------------------------------------------*/
3262 /* return a list of indices for the n most dissimilar sequences in a CD      */
3263 /*---------------------------------------------------------------------------*/
3264 /*---------------------------------------------------------------------------*/
CddMostDiverse(TrianglePtr pTri,Int4 length,Int4 maxdiv)3265 Int4Ptr CddMostDiverse(TrianglePtr pTri, Int4 length, Int4 maxdiv)
3266 {
3267   Int4Ptr     retlist;
3268   Int4        index, winner, i, j, ncomp;
3269   FloatHi     sumcomp, mincomp;
3270   FloatHi     **iMat;
3271   ScorePtr    psc;
3272   ValNodePtr  vnp;
3273 
3274   if (!pTri) return NULL;
3275   if (length >= pTri->nelements) return NULL;
3276   if (length > maxdiv) length = maxdiv;
3277   retlist = MemNew(length * sizeof(Int4));
3278   retlist[0] = 0;
3279 
3280   if (pTri->div_ranks) {
3281     vnp = pTri->div_ranks; i = 0;
3282     while (vnp) {
3283       retlist[i] = vnp->data.intvalue;
3284       i++;
3285       if (i >= length) break;
3286       vnp = vnp->next;
3287     }
3288     return(retlist);
3289   }
3290 
3291   iMat = (FloatHi **) calloc(pTri->nelements,sizeof (FloatHi *));
3292   for (i=0;i<pTri->nelements;i++) {
3293     iMat[i] = (FloatHi *) calloc(pTri->nelements,sizeof(FloatHi));
3294     iMat[i][i] = 100;
3295   }
3296   psc = (ScorePtr) pTri->scores;
3297   for (i=0;i<pTri->nelements;i++) {
3298     for (j=i+1;j<pTri->nelements;j++) {
3299       iMat[i][j] = iMat[j][i] = psc->value.realvalue;
3300       psc = psc->next;
3301     }
3302   }
3303 
3304   for (index = 1; index < length; index++) {
3305     mincomp = 100.0, winner = -1;
3306     for (i=1;i<pTri->nelements;i++) {
3307       sumcomp = 0.0; ncomp = 0;
3308       if (!HitYet(retlist,index,i)) {
3309         for (j=0;j<pTri->nelements;j++) {
3310 	  if (HitYet(retlist,index,j)) {
3311 /*	    sumcomp += CddGetPairId(pTri, i, j); */
3312             sumcomp += iMat[i][j];
3313 	    ncomp ++;
3314 	  }
3315 	}
3316 	if (ncomp) sumcomp /= (FloatHi) ncomp;
3317 	if (sumcomp < mincomp) {
3318 	  mincomp = sumcomp; winner = i;
3319 	}
3320       }
3321     }
3322     retlist[index] = winner;
3323   }
3324   for (i=0;i<pTri->nelements;i++) free(iMat[i]);
3325   free(iMat);
3326   return (retlist);
3327 }
3328 
3329 /*---------------------------------------------------------------------------*/
3330 /*---------------------------------------------------------------------------*/
3331 /* return a list of indices for the n most similar sequences for the query   */
3332 /*---------------------------------------------------------------------------*/
3333 /*---------------------------------------------------------------------------*/
CddMostSimilarToQuery(ScorePtr psc,Int4 length)3334 Int4Ptr CddMostSimilarToQuery(ScorePtr psc, Int4 length)
3335 {
3336   Int4Ptr    retlist;
3337   Int4       winner, index, i, nelements;
3338   FloatHi    sumcomp, mincomp;
3339   ScorePtr   thispsc;
3340 
3341   if (!psc) return NULL;
3342   nelements = 0; thispsc = psc;
3343   while (thispsc) { nelements++; thispsc = thispsc->next; }
3344   if (length >= nelements) return NULL;
3345   retlist = MemNew(length * sizeof(Int4));
3346   retlist[0] = 0;
3347   for (index=1;index<length;index++) {
3348     mincomp = 0.0; winner = -1;
3349     i = 0;
3350     thispsc = psc->next;
3351     while (thispsc) {
3352       i++;
3353       if (!HitYet(retlist,index,i)) {
3354         sumcomp = thispsc->value.realvalue;
3355         if (sumcomp > mincomp) {
3356 	  mincomp = sumcomp;
3357 	  winner = i;
3358 	}
3359       }
3360       thispsc = thispsc->next;
3361     }
3362     retlist[index] = winner;
3363   }
3364   return(retlist);
3365 }
3366 
3367 
3368 /*---------------------------------------------------------------------------*/
3369 /*---------------------------------------------------------------------------*/
3370 /* retrieve a sequence by trying to locate it in the CDD bioseq-set or other */
3371 /* wise through the sequence/object-manager or ID1                           */
3372 /*---------------------------------------------------------------------------*/
3373 /*---------------------------------------------------------------------------*/
CddRetrieveBioseqById(SeqIdPtr sip,SeqEntryPtr sep)3374 BioseqPtr CddRetrieveBioseqById(SeqIdPtr sip, SeqEntryPtr sep)
3375 {
3376   BioseqPtr   bsp = NULL;
3377   Int4        uid = 0;
3378 /*  SeqEntryPtr sepThis; */
3379   Int2        retcode = 1;
3380 
3381   bsp = (BioseqPtr) CddFindSip(sip, sep);
3382   if (!bsp) {
3383     bsp = BioseqLockById(sip);
3384   }
3385   /*
3386   if (!bsp) {
3387     uid = ID1FindSeqId(sip);
3388     if (uid > 0) {
3389       sepThis = (SeqEntryPtr) ID1SeqEntryGet(uid,retcode);
3390       if (sepThis) {
3391         bsp = CddExtractBioseq(sepThis, sip);
3392       }
3393     }
3394   }
3395   */
3396   return bsp;
3397 }
3398 
3399 /*---------------------------------------------------------------------------*/
3400 /*---------------------------------------------------------------------------*/
3401 /* Calculate pairwise percentages of identical residues for sequences in a CD*/
3402 /*---------------------------------------------------------------------------*/
3403 /*---------------------------------------------------------------------------*/
CddCalculateTriangle(CddPtr pcdd)3404 TrianglePtr CddCalculateTriangle(CddPtr pcdd)
3405 {
3406    BioseqPtr      bsp1 = NULL, bsp2, bsp3;
3407    BioseqSetPtr   bssp = NULL;
3408    CddExpAlignPtr pCDea, pCDea1, pCDea2;
3409    DenseDiagPtr   ddp;
3410    SeqAlignPtr    salpOuter, salpInner, salpHead;
3411    SeqIdPtr       sip1, sip2, sip3;
3412    SeqPortPtr     spp1, spp2, spp3;
3413    ScorePtr       pscHead=NULL, psc, pscTail;
3414    TrianglePtr    pTri;
3415    Uint1Ptr       buffer1, buffer2, buffer3;
3416    Int4           i, alen, nid, iOuter, iInner;
3417 
3418 
3419    if (!pcdd->seqannot) return NULL;
3420    salpHead = pcdd->seqannot->data;
3421    pTri = TriangleNew();
3422    pTri->nelements = 1;
3423 
3424 /*---------------------------------------------------------------------------*/
3425 /* Loop around Outer pointer to get master-slave alignments                  */
3426 /*---------------------------------------------------------------------------*/
3427    salpOuter = salpHead; bsp1 = NULL; sip1 = NULL;
3428    while (salpOuter) {
3429      pTri->nelements++;
3430      ddp = salpOuter->segs;
3431      if (!sip1) sip1 = SeqIdDup(ddp->id);
3432      sip2 = SeqIdDup(ddp->id->next);
3433      if (!bssp) bssp = pcdd->sequences->data.ptrvalue;
3434      if (!bsp1) {
3435        bsp1 = (BioseqPtr) CddRetrieveBioseqById(sip1, bssp->seq_set);
3436        buffer1 = MemNew((bsp1->length)*sizeof(Uint1));
3437        spp1 = SeqPortNew(bsp1, 0, bsp1->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3438        for (i=0; i<bsp1->length;i++) buffer1[i] = SeqPortGetResidue(spp1);
3439        spp1 = SeqPortFree(spp1);
3440      }
3441      bsp2 = NULL;
3442      bsp2 = (BioseqPtr) CddRetrieveBioseqById(sip2, bssp->seq_set);
3443      buffer2 = MemNew((bsp2->length)*sizeof(Uint1));
3444      spp2 = SeqPortNew(bsp2, 0, bsp2->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3445      for (i=0; i<bsp2->length;i++) buffer2[i] = SeqPortGetResidue(spp2);
3446      spp2 = SeqPortFree(spp2);
3447      pCDea = CddExpAlignNew();
3448      CddExpAlignAlloc(pCDea,bsp1->length);
3449      while (ddp) {
3450        for (i=0;i<ddp->len;i++) {
3451          pCDea->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3452        }
3453        ddp = ddp->next;
3454      }
3455      alen = 0; nid = 0;
3456      for (i=0;i<bsp1->length;i++) {
3457        if (pCDea->adata[i] > -1) {
3458          alen++; if (buffer1[i] == buffer2[pCDea->adata[i]]) nid++;
3459        }
3460      }
3461      psc = ScoreNew(); psc->id=ObjectIdNew(); psc->id->str=StringSave("%id");
3462      psc->choice = 2;
3463      if (alen) psc->value.realvalue = 100.0 * (FloatHi) nid / (FloatHi) alen;
3464      else psc->value.realvalue = 0.0;
3465      psc->next = NULL;
3466      if (!pscHead) {
3467        pscHead = psc; pscTail = psc;
3468      } else {
3469        pscTail->next = psc; pscTail = psc; psc = NULL;
3470      }
3471      pCDea = CddExpAlignFree(pCDea);
3472      MemFree(buffer2); SeqIdFree(sip2);
3473      salpOuter = salpOuter->next;
3474    }
3475 
3476 /*---------------------------------------------------------------------------*/
3477 /* Loop around Outer and Inner Pointer to get slave-slave similarities       */
3478 /*---------------------------------------------------------------------------*/
3479    salpOuter = salpHead; iOuter = 1;
3480    while (salpOuter->next) {
3481      iOuter++;
3482      ddp = salpOuter->segs;
3483      sip2 = SeqIdDup(ddp->id->next); bsp2 = NULL;
3484      bsp2 = (BioseqPtr) CddRetrieveBioseqById(sip2, bssp->seq_set);
3485      buffer2 = MemNew((bsp2->length)*sizeof(Uint1));
3486      spp2 = SeqPortNew(bsp2, 0, bsp2->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3487      for (i=0; i<bsp2->length;i++) buffer2[i] = SeqPortGetResidue(spp2);
3488      spp2 = SeqPortFree(spp2);
3489      pCDea1 = CddExpAlignNew();
3490      CddExpAlignAlloc(pCDea1,bsp1->length);
3491      while (ddp) {
3492        for (i=0;i<ddp->len;i++) pCDea1->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3493        ddp = ddp->next;
3494      }
3495      salpInner = salpOuter->next;
3496      iInner = iOuter;
3497      while (salpInner) {
3498        iInner++;
3499        ddp = salpInner->segs;
3500        sip3 = SeqIdDup(ddp->id->next); bsp3 = NULL;
3501        bsp3 = (BioseqPtr) CddRetrieveBioseqById(sip3,bssp->seq_set);
3502        buffer3  =MemNew((bsp3->length)*sizeof(Uint1));
3503        spp3 = SeqPortNew(bsp3, 0, bsp3->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3504        for (i=0; i<bsp3->length;i++) buffer3[i] = SeqPortGetResidue(spp3);
3505        spp3 = SeqPortFree(spp3);
3506        pCDea2 = CddExpAlignNew();
3507        CddExpAlignAlloc(pCDea2,bsp1->length);
3508        while (ddp) {
3509          for (i=0;i<ddp->len;i++) pCDea2->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3510          ddp = ddp->next;
3511        }
3512        pCDea = CddReindexExpAlign(pCDea1, bsp2->length, pCDea2, iOuter, iInner);
3513        alen = 0; nid = 0;
3514        for (i=0;i<bsp2->length;i++) {
3515          if (pCDea->adata[i] > -1) {
3516            alen++; if (buffer2[i] == buffer3[pCDea->adata[i]]) nid++;
3517          }
3518        }
3519        psc = ScoreNew(); psc->id=ObjectIdNew(); psc->id->str=StringSave("%id");
3520        psc->choice = 2;
3521        if (alen) psc->value.realvalue = 100.0 * (FloatHi) nid / (FloatHi) alen;
3522        else psc->value.realvalue = 0.0;
3523        psc->next = NULL;
3524        pscTail->next = psc; pscTail = psc; psc = NULL;
3525        pCDea2 = CddExpAlignFree(pCDea2);
3526        pCDea = CddExpAlignFree(pCDea);
3527        MemFree(buffer3); SeqIdFree(sip3);
3528        salpInner = salpInner->next;
3529      }
3530      pCDea1 = CddExpAlignFree(pCDea1);
3531      MemFree(buffer2); SeqIdFree(sip2);
3532      salpOuter = salpOuter->next;
3533    }
3534    if (sip1) SeqIdFree(sip1); if (bsp1) bsp1 = NULL;
3535    if (buffer1) MemFree(buffer1);
3536    pTri->scores = (struct score PNTR) pscHead;
3537    return(pTri);
3538 }
3539 
3540 /*---------------------------------------------------------------------------*/
3541 /*---------------------------------------------------------------------------*/
3542 /* Calculate pairwise similarities for the query added to a cd alignment     */
3543 /*---------------------------------------------------------------------------*/
3544 /*---------------------------------------------------------------------------*/
CddCalculateQuerySim(CddPtr pcdd,SeqAlignPtr salp)3545 ScorePtr CddCalculateQuerySim(CddPtr pcdd, SeqAlignPtr salp)
3546 {
3547    BioseqPtr      bsp1, bsp2, bsp3;
3548    BioseqSetPtr   bssp = NULL;
3549    CddExpAlignPtr pCDea, pCDea1, pCDea2;
3550    DenseDiagPtr   ddp;
3551    SeqAlignPtr    salpOuter, salpInner, salpHead;
3552    SeqIdPtr       sip1, sip2, sip3;
3553    SeqPortPtr     spp1, spp2, spp3;
3554    ScorePtr       pscHead=NULL, psc, pscTail;
3555    Uint1Ptr       buffer1, buffer2, buffer3;
3556    Int4           i, alen, nid;
3557 
3558 
3559    if (!salp || !pcdd) return NULL;
3560    salpHead = salp;
3561 /*---------------------------------------------------------------------------*/
3562 /* master-query alignment                                                    */
3563 /*---------------------------------------------------------------------------*/
3564    salpOuter = salpHead; bsp1 = NULL; sip1 = NULL;
3565    pCDea = CddExpAlignNew();
3566    ddp = salpOuter->segs;
3567    sip1 = SeqIdDup(ddp->id);
3568    sip2 = SeqIdDup(ddp->id->next);
3569    bssp = pcdd->sequences->data.ptrvalue;
3570    bsp1 = (BioseqPtr) CddRetrieveBioseqById(sip1, bssp->seq_set);
3571    buffer1 = MemNew((bsp1->length)*sizeof(Uint1));
3572    spp1 = SeqPortNew(bsp1, 0, bsp1->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3573    for (i=0; i<bsp1->length;i++) buffer1[i] = SeqPortGetResidue(spp1);
3574    spp1 = SeqPortFree(spp1);
3575    bsp2 = (BioseqPtr) CddRetrieveBioseqById(sip2, bssp->seq_set);
3576    buffer2 = MemNew((bsp2->length)*sizeof(Uint1));
3577    spp2 = SeqPortNew(bsp2, 0, bsp2->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3578    for (i=0; i<bsp2->length;i++) buffer2[i] = SeqPortGetResidue(spp2);
3579    spp2 = SeqPortFree(spp2);
3580    CddExpAlignAlloc(pCDea,bsp1->length);
3581    while (ddp) {
3582      for (i=0;i<ddp->len;i++) {
3583        pCDea->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3584      }
3585      ddp = ddp->next;
3586    }
3587    alen = 0; nid = 0;
3588    for (i=0;i<bsp1->length;i++) {
3589      if (pCDea->adata[i] > -1) {
3590        alen++; if (buffer1[i] == buffer2[pCDea->adata[i]]) nid++;
3591      }
3592    }
3593    pCDea = CddExpAlignFree(pCDea);
3594    psc = ScoreNew(); psc->id=ObjectIdNew(); psc->id->str=StringSave("%id");
3595    psc->choice = 2;
3596    if (alen) psc->value.realvalue = 100.0 * (FloatHi) nid / (FloatHi) alen;
3597    else psc->value.realvalue = 0.0;
3598    psc->next = NULL;
3599    pscHead = psc; pscTail = psc; psc = NULL;
3600    MemFree(buffer2); SeqIdFree(sip2);
3601 /*---------------------------------------------------------------------------*/
3602 /* Loop around Inner Pointer to get query-slave similarities                 */
3603 /*---------------------------------------------------------------------------*/
3604    salpOuter = salpHead;
3605    pCDea1 = CddExpAlignNew();
3606    ddp = salpOuter->segs;
3607    sip2 = SeqIdDup(ddp->id->next);
3608    bsp2 = (BioseqPtr) CddRetrieveBioseqById(sip2, bssp->seq_set);
3609    buffer2 = MemNew((bsp2->length)*sizeof(Uint1));
3610    spp2 = SeqPortNew(bsp2, 0, bsp2->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3611    for (i=0; i<bsp2->length;i++) buffer2[i] = SeqPortGetResidue(spp2);
3612    spp2 = SeqPortFree(spp2);
3613    CddExpAlignAlloc(pCDea1,bsp1->length);
3614    while (ddp) {
3615      for (i=0;i<ddp->len;i++) pCDea1->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3616      ddp = ddp->next;
3617    }
3618    salpInner = salpOuter->next;
3619    while (salpInner) {
3620      pCDea2 = CddExpAlignNew();
3621      ddp = salpInner->segs;
3622      sip3 = SeqIdDup(ddp->id->next);
3623      bsp3 = (BioseqPtr) CddRetrieveBioseqById(sip3,bssp->seq_set);
3624      buffer3  =MemNew((bsp3->length)*sizeof(Uint1));
3625      spp3 = SeqPortNew(bsp3, 0, bsp3->length-1, Seq_strand_unknown, Seq_code_ncbistdaa);
3626      for (i=0; i<bsp3->length;i++) buffer3[i] = SeqPortGetResidue(spp3);
3627      spp3 = SeqPortFree(spp3);
3628      CddExpAlignAlloc(pCDea2,bsp1->length);
3629      while (ddp) {
3630        for (i=0;i<ddp->len;i++) pCDea2->adata[ddp->starts[0]+i]=ddp->starts[1]+i;
3631        ddp = ddp->next;
3632      }
3633      pCDea = CddReindexExpAlign(pCDea1, bsp2->length, pCDea2,0,0);
3634      alen = 0; nid = 0;
3635      for (i=0;i<bsp2->length;i++) {
3636        if (pCDea->adata[i] > -1) {
3637          alen++; if (buffer2[i] == buffer3[pCDea->adata[i]]) nid++;
3638        }
3639      }
3640      pCDea = CddExpAlignFree(pCDea);
3641      psc = ScoreNew(); psc->id=ObjectIdNew(); psc->id->str=StringSave("%id");
3642      psc->choice = 2;
3643      if (alen) psc->value.realvalue = 100.0 * (FloatHi) nid / (FloatHi) alen;
3644      else psc->value.realvalue = 0.0;
3645      psc->next = NULL;
3646      pscTail->next = psc; pscTail = psc; psc = NULL;
3647      pCDea2 = CddExpAlignFree(pCDea2);
3648      MemFree(buffer3); SeqIdFree(sip3);
3649      salpInner = salpInner->next;
3650    }
3651    pCDea1 = CddExpAlignFree(pCDea1);
3652    MemFree(buffer2); SeqIdFree(sip2);
3653    if (sip1) SeqIdFree(sip1); if (bsp1) bsp1 = NULL;
3654    if (buffer1) MemFree(buffer1);
3655    return(pscHead);
3656 }
3657 
3658 
3659 /*---------------------------------------------------------------------------*/
3660 /*---------------------------------------------------------------------------*/
3661 /* cloned from salpedit.c and salpacc.c                                      */
3662 /*---------------------------------------------------------------------------*/
3663 /*---------------------------------------------------------------------------*/
CddSeqAnnotForSeqAlign(SeqAlignPtr salp)3664 SeqAnnotPtr LIBCALL CddSeqAnnotForSeqAlign (SeqAlignPtr salp)
3665 {
3666   SeqAnnotPtr sap;
3667 
3668   if (salp==NULL)
3669      return NULL;
3670   sap = SeqAnnotNew ();
3671   sap->type = 2;
3672   sap->data = (Pointer) salp;
3673   return sap;
3674 }
3675 
CddSeqAlignDup(SeqAlignPtr salp)3676 SeqAlignPtr LIBCALL CddSeqAlignDup (SeqAlignPtr salp)
3677 {
3678   SeqAnnotPtr sap,
3679               sap2;
3680   SeqAlignPtr salp2 = NULL,
3681               next;
3682 
3683   next = salp->next;
3684   salp->next = NULL;
3685   sap = CddSeqAnnotForSeqAlign (salp);
3686   sap2 = (SeqAnnotPtr) AsnIoMemCopy ((Pointer) sap, (AsnReadFunc) SeqAnnotAsnRead, (AsnWriteFunc) SeqAnnotAsnWrite);
3687   if (sap2!=NULL) {
3688      salp2 = (SeqAlignPtr)sap2->data;
3689      sap2->data = NULL;
3690      SeqAnnotFree (sap2);
3691   }
3692   if (next != NULL)
3693      salp->next = next;
3694   sap->data = NULL;
3695   SeqAnnotFree (sap);
3696   return salp2;
3697 }
3698 
3699 
3700 
3701 /*---------------------------------------------------------------------------*/
3702 /*---------------------------------------------------------------------------*/
3703 /* Duplicate a whole Sequence Alignment (i.e. a linked list of Seqaligns     */
3704 /*---------------------------------------------------------------------------*/
3705 /*---------------------------------------------------------------------------*/
3706 
3707 
SeqAlignSetDup(SeqAlignPtr salp)3708 SeqAlignPtr LIBCALL SeqAlignSetDup(SeqAlignPtr salp)
3709 {
3710   SeqAlignPtr   salpHead = NULL;
3711   SeqAlignPtr   salpTail = NULL;
3712   SeqAlignPtr   salpThis;
3713 
3714   while (salp) {
3715     salpThis = (SeqAlignPtr) CddSeqAlignDup(salp);
3716     if (salpTail) {
3717       salpTail->next = salpThis;
3718       salpTail = salpThis;
3719       salpThis->next = NULL;
3720     } else {
3721       salpHead = salpThis;
3722       salpTail = salpThis;
3723       salpThis->next = NULL;
3724     }
3725     salp = salp->next;
3726   }
3727   return (salpHead);
3728 }
3729 
3730 
3731 /*---------------------------------------------------------------------------*/
3732 /*---------------------------------------------------------------------------*/
3733 /* fill internal gaps in a sequence alignment to prepare it for PSSM calc.   */
3734 /*---------------------------------------------------------------------------*/
3735 /*---------------------------------------------------------------------------*/
CddDegapSeqAlign(SeqAlignPtr salp)3736 void LIBCALL CddDegapSeqAlign(SeqAlignPtr salp)
3737 {
3738   DenseDiagPtr    ddp;
3739   Int4            iM, iS, iN, iC;
3740   Boolean         bFirst, bInit = TRUE;
3741   BioseqPtr       bsp;
3742   SeqIdPtr        sip;
3743   Int4            mLen, sLen;
3744 
3745   while (salp) {
3746     ddp = salp->segs;
3747     if (bInit) {
3748       sip = ddp->id;
3749       bsp = BioseqLockById(sip);
3750       mLen = bsp->length;
3751       BioseqUnlock(bsp);
3752       bInit = FALSE;
3753     }
3754     sip = ddp->id->next;
3755     bsp = BioseqLockById(sip);
3756     sLen = bsp->length;
3757     BioseqUnlock(bsp);
3758     bFirst = TRUE;
3759     while (ddp) {
3760       if (bFirst) { /* extend first aligned block to N-terminus */
3761         iM = ddp->starts[0];
3762 	iS = ddp->starts[1];
3763 	if (iM > iS) iM = iS;
3764 	ddp->starts[0] -= iM;
3765 	ddp->starts[1] -= iM;
3766 	ddp->len += iM;
3767         bFirst = FALSE;
3768       }
3769       if (ddp->next) {
3770         iM = ddp->next->starts[0] - (ddp->starts[0]+ddp->len);
3771         iS = ddp->next->starts[1] - (ddp->starts[1]+ddp->len);
3772         if (iM > iS) iM = iS;
3773         if (iM > 0) {
3774           if (iM == 1) {
3775 	    iN = 1; iC = 0;
3776           } else {
3777 	    if ((iM % 2) == 0) {
3778 	      iN = iC = iM / 2;
3779 	    } else {
3780 	      iC = iM / 2; iN = iC + 1;
3781 	    }
3782           }
3783 	  ASSERT(iN+iC == iM);
3784 	  ddp->len += iN;
3785 	  ddp->next->starts[0] -= iC;
3786 	  ddp->next->starts[1] -= iC;
3787 	  ddp->next->len += iC;
3788         }
3789       } else { /* extend last aligned block to C-terminus */
3790         iM = mLen - (ddp->starts[0]+ddp->len);
3791         iS = sLen - (ddp->starts[1]+ddp->len);
3792         if (iM > iS) iM = iS;
3793 	ddp->len += iM;
3794       }
3795       ddp = ddp->next;
3796     }
3797     salp = salp->next;
3798   }
3799 }
3800 
3801 /*---------------------------------------------------------------------------*/
3802 /*---------------------------------------------------------------------------*/
3803 /* Duplicate a SeqId, pick the GI if encountering a linked list of several   */
3804 /*---------------------------------------------------------------------------*/
3805 /*---------------------------------------------------------------------------*/
CddSeqIdDupPDBGI(SeqIdPtr sipold)3806 SeqIdPtr LIBCALL CddSeqIdDupPDBGI(SeqIdPtr sipold)
3807 {
3808   SeqIdPtr   sip, sipgi = NULL, sippdb = NULL;
3809 
3810   sip = sipold;
3811   while (sip) {
3812     if (sip->choice == SEQID_GI)   sipgi = sip;
3813     if (sip->choice == SEQID_PDB) sippdb = sip;
3814     sip = sip->next;
3815   }
3816   if (sippdb) return(SeqIdDup(sippdb));
3817   if (sipgi) return(SeqIdDup(sipgi));
3818   return (SeqIdDup(sipold));
3819 }
3820 
3821 /*---------------------------------------------------------------------------*/
3822 /*---------------------------------------------------------------------------*/
3823 /* Calculate a weighted 50/50 consensus sequence and make it new master      */
3824 /*---------------------------------------------------------------------------*/
3825 /*---------------------------------------------------------------------------*/
CddConsensus(SeqAlignPtr salp,SeqEntryPtr sep,BioseqPtr bsp,SeqIntPtr range,BioseqPtr * bspCons,SeqAlignPtr * salpCons)3826 SeqAlignPtr LIBCALL CddConsensus(SeqAlignPtr salp,
3827                                  SeqEntryPtr sep,
3828 				 BioseqPtr   bsp,
3829 				 SeqIntPtr   range,
3830 				 BioseqPtr   *bspCons,
3831 				 SeqAlignPtr *salpCons)
3832 {
3833   BioseqPtr           bspThis;
3834   CddAlignWeightPtr   pCAW;
3835   CddExpAlignPtr      pCDea;
3836   CddExtAlignCellPtr  *pCEACell;
3837   CddExtAlignColPtr   pCEACol;
3838   CharPtr             cbuffer, outptr;
3839   DenseDiagPtr        ddp;
3840   Int4Ptr             trunc_on_virtual;
3841   Int4Ptr             trunc_aligned;
3842   ObjectIdPtr         oidp;
3843   SeqAlignPtr         salpThis, salpNew, salpTail;
3844   SeqEntryPtr         sepNew;
3845   SeqIdPtr            sipThis, sipCopy;
3846   SeqMapTablePtr      smtp;
3847   SeqPortPtr          spp;
3848   Uint1Ptr            buffer;
3849   FloatHi             SumWeight;
3850   Int4                maxextlen, nalign, ConsensusLen;
3851   Int4                index, midpt, ContWeight = 0;
3852   Int4                offset, i, j , k, ipt, seqpos;
3853   Int4                m_last, m_frst, s_insert, m_end;
3854   Int4                s_last, s_frst, maxpos = 0;
3855 
3856   if (!salp) return (NULL);
3857   if (salp->type != SAT_PARTIAL || salp->segtype != SAS_DENDIAG) return(NULL);
3858   if (!range) return (NULL);
3859   offset = range->from;
3860 /*---------------------------------------------------------------------------*/
3861 /* trunc_on_virtual holds, for every residue of the trunc_master, its posi-  */
3862 /* tion on the maximal expanded new master sequence                          */
3863 /* bsp is supposed to be the "truncated master", i.e. a bioseq covering the  */
3864 /* interval from the first to the last aligned residue on the master sequence*/
3865 /*---------------------------------------------------------------------------*/
3866   trunc_on_virtual = MemNew(sizeof(Int4)*bsp->length);
3867   for (i=0;i<bsp->length;i++) trunc_on_virtual[i] = i;
3868   maxextlen = bsp->length-1;
3869   trunc_aligned = MemNew(sizeof(Int4)*bsp->length);
3870   for (i=0;i<bsp->length;i++) trunc_aligned[i] = 0;
3871 
3872 /*---------------------------------------------------------------------------*/
3873 /* expand values on trunc_on_virtual to consider insertions relativ to master*/
3874 /*---------------------------------------------------------------------------*/
3875   nalign = 1;
3876   salpThis = salp;
3877   while (salpThis) {
3878     nalign++;
3879     ddp = salpThis->segs;
3880     if (ddp) for (j=ddp->starts[0]-offset;j<ddp->starts[0]+ddp->len-offset;j++) trunc_aligned[j]=1;
3881     while (ddp && ddp->next) {
3882       for (j=ddp->next->starts[0]-offset;j<ddp->next->starts[0]+ddp->next->len-offset;j++) trunc_aligned[j]=1;
3883       m_last = ddp->starts[0]+ddp->len - 1 - offset;
3884       m_frst = ddp->next->starts[0] - offset;
3885       m_end  = ddp->next->starts[0] + ddp->next->len - 1 - offset;
3886       ASSERT(m_frst >= 0 && m_frst < bsp->length);
3887       ASSERT(m_last >= 0 && m_last < bsp->length);
3888       ASSERT(m_end >= 0 && m_end < bsp->length);
3889 /*---------------------------------------------------------------------------*/
3890 /* shift block from the c- to n-terminus if necessary                        */
3891 /*---------------------------------------------------------------------------*/
3892 /*    for (j = m_end; j > m_frst; j--) {
3893         if (trunc_on_virtual[j-1] < trunc_on_virtual[j]-1) {
3894 	  trunc_on_virtual[j-1] = trunc_on_virtual[j] - 1;
3895 	}
3896       }
3897 */
3898       s_insert = ddp->next->starts[1] - ddp->starts[1] - ddp->len;
3899       if (s_insert > (m_frst-m_last- 1)) {
3900         if (s_insert > (trunc_on_virtual[m_frst]-trunc_on_virtual[m_last]-1)) {
3901 /*---------------------------------------------------------------------------*/
3902 /* need to insert additional space, find insertion point first               */
3903 /*---------------------------------------------------------------------------*/
3904 	  ipt = m_frst;
3905 	  for (j=m_frst;j>m_last;j--) {
3906 	    if (trunc_on_virtual[j-1] < trunc_on_virtual[j]-1) {
3907 	      ipt = j; break;
3908 	    }
3909 	  }
3910 /*---------------------------------------------------------------------------*/
3911 /* do the shift starting at the insertion point and moving to the end        */
3912 /*---------------------------------------------------------------------------*/
3913           s_insert -= trunc_on_virtual[m_frst]-trunc_on_virtual[m_last]-1;
3914 	  ASSERT(s_insert > 0);
3915 	  for (j=ipt;j<bsp->length;j++) trunc_on_virtual[j]+=s_insert;
3916 	}
3917       }
3918       ddp = ddp->next;
3919     }
3920     salpThis = salpThis->next;
3921   }
3922 /*---------------------------------------------------------------------------*/
3923 /* take care of unaligned regions on the master - distribute evenly on both  */
3924 /* sides of the gap. If uneven number, one more residue shifted to N-terminus*/
3925 /*---------------------------------------------------------------------------*/
3926   i=-1;
3927   while (++i < bsp->length-1) {
3928     if (trunc_aligned[i]==0) {
3929       j=i; while(trunc_aligned[j]==0 && j < (bsp->length)) j++;
3930       j--;
3931       if (j > i && j < (bsp->length)) {
3932         midpt = i + (j-i)/2;
3933 	for (k=j;k>midpt;k--) {
3934 	  if (trunc_on_virtual[k]<trunc_on_virtual[k+1]-1) {
3935 	    trunc_on_virtual[k] = trunc_on_virtual[k+1]-1;
3936 	  }
3937 	}
3938         i = j;
3939       }
3940     }
3941   }
3942 /*  for (j=0;j<bsp->length;j++) printf(" %d",trunc_on_virtual[j]);
3943   printf("\n"); */
3944   maxextlen = trunc_on_virtual[bsp->length-1]+1;
3945   for (i=1;i<bsp->length;i++) {
3946     if (trunc_on_virtual[i] <= trunc_on_virtual[i-1]) {
3947       CddSevError("Error determining alignment coordinate system");
3948     }
3949   }
3950 /*---------------------------------------------------------------------------*/
3951 /* allocate space for storing the extended alignment plus weights and columns*/
3952 /*---------------------------------------------------------------------------*/
3953   pCAW = MemNew(nalign * sizeof(CddAlignWeight));
3954   pCEACol = MemNew(maxextlen * sizeof(CddExtAlignCol));
3955   pCEACell = MemNew(nalign * sizeof(CddExtAlignCellPtr));
3956   for (j=0;j<nalign;j++) {
3957     pCEACell[j] = MemNew(maxextlen * sizeof(CddExtAlignCell));
3958     for (k=0;k<maxextlen;k++) {
3959       pCEACell[j][k].seqpos = -1;
3960       pCEACell[j][k].aatype = 26;
3961     }
3962   }
3963 /*---------------------------------------------------------------------------*/
3964 /* start to fill in the extended alignment data, row 0 is taken by old       */
3965 /* master which is added as fully extended sequence (i.e. not truncated)     */
3966 /*---------------------------------------------------------------------------*/
3967   i = 0;
3968   salpThis = salp;
3969   ddp = salpThis->segs;
3970   sipCopy = SeqIdDup(ddp->id); sipCopy->next = NULL;
3971   bspThis = CddRetrieveBioseqById(sipCopy,sep);
3972   SeqIdFree(sipCopy);
3973   pCAW[i].bsp = bspThis;
3974   spp = SeqPortNew(bspThis, 0, LAST_RESIDUE, Seq_strand_unknown,
3975                    Seq_code_ncbistdaa);
3976   buffer = MemNew((bspThis->length)*sizeof(Uint1));
3977   for (index=0; index<bspThis->length; index++)
3978     buffer[index] = SeqPortGetResidue(spp);
3979   spp = SeqPortFree(spp);
3980   for (j=0;j<bsp->length;j++) {
3981     k = trunc_on_virtual[j];
3982     seqpos = j + offset;
3983     ASSERT (k>=0 && k<maxextlen);
3984     pCEACell[i][k].seqpos = seqpos;
3985     pCEACell[i][k].aatype = buffer[seqpos];
3986     if (pCEACell[i][k].aatype > (Uint1) 26) {
3987       pCEACell[i][k].aatype = 26;
3988     }
3989   }
3990   MemFree(buffer);
3991 /*---------------------------------------------------------------------------*/
3992 /* continue to fill in alignment data for the remaining sequences            */
3993 /*---------------------------------------------------------------------------*/
3994   salpThis = salp;
3995   while (salpThis) {
3996     i++;
3997     ddp = salpThis->segs;
3998     sipThis = ddp->id->next;
3999     bspThis = CddRetrieveBioseqById(sipThis,sep);
4000     pCAW[i].bsp = bspThis;
4001     spp = SeqPortNew(bspThis, 0, LAST_RESIDUE, Seq_strand_unknown,
4002                      Seq_code_ncbistdaa);
4003     buffer = MemNew((bspThis->length)*sizeof(Uint1));
4004     for (index=0; index<bspThis->length; index++)
4005       buffer[index] = SeqPortGetResidue(spp);
4006     spp = SeqPortFree(spp);
4007     while (ddp) {
4008       for (j = ddp->starts[0]; j<ddp->starts[0]+ddp->len;j++) {
4009         k = trunc_on_virtual[j - offset];
4010         ASSERT (k>=0 && k<maxextlen);
4011 	pCEACell[i][k].seqpos = ddp->starts[1] + j - ddp->starts[0];
4012 	pCEACell[i][k].aatype = buffer[pCEACell[i][k].seqpos];
4013         if (pCEACell[i][k].aatype > (Uint1) 26) {
4014           pCEACell[i][k].aatype = 26;
4015         }
4016       }
4017       if (ddp->next) {
4018         m_last = ddp->starts[0]+ddp->len-1-offset;
4019 	m_frst = ddp->next->starts[0]-offset;
4020 	s_last = ddp->starts[1]+ddp->len-1;
4021 	s_frst = ddp->next->starts[1];
4022 	if (s_frst - s_last > 1) {   /* have unaligned residues */
4023           midpt = s_last + (s_frst - s_last)/2;
4024 	  for (j = s_last+1;j<=midpt;j++) {
4025 	    k = trunc_on_virtual[m_last] + j - s_last;
4026             ASSERT (k>=0 && k<maxextlen);
4027 	    pCEACell[i][k].seqpos = j;
4028 	    pCEACell[i][k].aatype = buffer[j];
4029             if (pCEACell[i][k].aatype > (Uint1) 26) {
4030               pCEACell[i][k].aatype = 26;
4031             }
4032 	  }
4033           for (j = s_frst-1; j>midpt; j--) {
4034 	    k = trunc_on_virtual[m_frst] - (s_frst - j);
4035             ASSERT (k>=0 && k<maxextlen);
4036 	    pCEACell[i][k].seqpos = j;
4037 	    pCEACell[i][k].aatype = buffer[j];
4038             if (pCEACell[i][k].aatype > (Uint1) 26) {
4039               pCEACell[i][k].aatype = 26;
4040             }
4041 	  }
4042 	}
4043       }
4044       ddp = ddp->next;
4045     }
4046     MemFree(buffer);
4047     salpThis = salpThis->next;
4048   }
4049 
4050 /*---------------------------------------------------------------------------*/
4051 /* now do statistics on the columns in this matrix to determine weights      */
4052 /*---------------------------------------------------------------------------*/
4053   for (k=0;k<maxextlen;k++) {
4054     pCEACol[k].conpos = -1;
4055     pCEACol[k].occpos = 0;
4056     pCEACol[k].ntypes = 0;
4057     pCEACol[k].aatype = 0;
4058     pCEACol[k].w_occpos = 0.0;
4059     pCEACol[k].typecount = (Int4 *) MemNew(26*sizeof(Int4));
4060     pCEACol[k].wtypefreq = (FloatHi *) MemNew(26*sizeof(FloatHi));
4061     for (j=0;j<26;j++) {
4062       pCEACol[k].typecount[j] = 0;
4063       pCEACol[k].wtypefreq[j] = 0.0;
4064     }
4065     for (i=0;i<nalign;i++) {
4066       if (pCEACell[i][k].seqpos != -1 && pCEACell[i][k].aatype < 26) {
4067         pCEACol[k].typecount[(Int4)pCEACell[i][k].aatype]++;
4068 	pCEACol[k].occpos++;
4069 	if (pCEACol[k].occpos > maxpos) maxpos = pCEACol[k].occpos;
4070       }
4071     }
4072     for (j=0;j<26;j++) if (pCEACol[k].typecount[j]) pCEACol[k].ntypes++;
4073   }
4074 /*---------------------------------------------------------------------------*/
4075 /* use blocks with the highest position counts to determine weight           */
4076 /*---------------------------------------------------------------------------*/
4077   for (i=0;i<nalign;i++) {
4078     pCAW[i].weight = 0.0;
4079     pCAW[i].nposaligned = 0;
4080   }
4081   for (k=0;k<maxextlen;k++) {
4082     if (pCEACol[k].occpos >= maxpos) {
4083       ContWeight++;
4084       for (i=0;i<nalign;i++) {
4085         if (pCEACell[i][k].seqpos != -1 && pCEACell[i][k].aatype < (Uint1) 26) {
4086           pCAW[i].nposaligned++;
4087 	  if (pCEACol[k].typecount[pCEACell[i][k].aatype] > 0) {
4088   	    pCAW[i].weight += 1.0 / (pCEACol[k].ntypes *
4089 	                             pCEACol[k].typecount[pCEACell[i][k].aatype]);
4090 	  }
4091         }
4092       }
4093     }
4094   }
4095 /*---------------------------------------------------------------------------*/
4096 /* if sequence not covered by weight calculation, give it the expected weight*/
4097 /*---------------------------------------------------------------------------*/
4098   for (i=0;i<nalign;i++) {
4099     if (pCAW[i].weight <= 0.0) {
4100       pCAW[i].weight = (FloatHi) ContWeight * 1.0 / (FloatHi) nalign;
4101     }
4102   }
4103   SumWeight = 0.0;
4104   for(i=0;i<nalign;i++) {
4105     SumWeight += pCAW[i].weight;
4106   }
4107   for(i=0;i<nalign;i++) pCAW[i].weight /= SumWeight;
4108 /*---------------------------------------------------------------------------*/
4109 /* now determine columns with at least 50% representation and the consensus  */
4110 /*---------------------------------------------------------------------------*/
4111   ConsensusLen = 0;
4112   for (k=0;k<maxextlen;k++) {
4113     for (i=0;i<nalign;i++) {
4114       if (pCEACell[i][k].seqpos != -1 && pCEACell[i][k].aatype < (Uint1) 26) {
4115         pCEACol[k].w_occpos += pCAW[i].weight;
4116 	pCEACol[k].wtypefreq[pCEACell[i][k].aatype]+=pCAW[i].weight;
4117       }
4118     }
4119     if (pCEACol[k].w_occpos >= 0.5) {
4120       pCEACol[k].conpos = ConsensusLen;
4121       SumWeight = 0.0;
4122       ConsensusLen++;
4123       for (j=0;j<26;j++) {
4124         if (pCEACol[k].wtypefreq[j] > SumWeight) {
4125 	  SumWeight = pCEACol[k].wtypefreq[j];
4126 	  pCEACol[k].aatype = (Uint1) j;
4127 	}
4128       }
4129     }
4130   }
4131   buffer = (Uint1 *) MemNew((ConsensusLen+1) * sizeof(Uint1));
4132   cbuffer = (Char *) MemNew((ConsensusLen+1) * sizeof(Char));
4133   for (k=0;k<maxextlen;k++) {
4134     if (pCEACol[k].conpos != -1) {
4135       buffer[pCEACol[k].conpos] = pCEACol[k].aatype;
4136     }
4137   }
4138 /*---------------------------------------------------------------------------*/
4139 /* translate sequence into ascii/character and make new bioseq               */
4140 /*---------------------------------------------------------------------------*/
4141   smtp = SeqMapTableFind(Seq_code_ncbieaa, Seq_code_ncbistdaa);
4142   for (i=0;i<ConsensusLen;i++) {
4143     cbuffer[i] = (Char) SeqMapTableConvert(smtp, buffer[i]);
4144   }
4145   sepNew = FastaToSeqBuffEx(cbuffer,&outptr,FALSE,NULL,FALSE);
4146   *bspCons = (BioseqPtr) sepNew->data.ptrvalue;
4147   sipThis = (*bspCons)->id;
4148   oidp = ObjectIdNew();
4149   oidp->str = MemNew(10*sizeof(Char));
4150   strcpy(oidp->str,"consensus");
4151   oidp->str[9]='\0';
4152   MemFree(sipThis->data.ptrvalue);
4153   sipThis->data.ptrvalue = oidp;
4154   sipThis->choice = SEQID_LOCAL;
4155   sipThis->next = NULL;
4156 
4157 /*---------------------------------------------------------------------------*/
4158 /* create the pairwise alignment old-master(not truncated) new-master        */
4159 /*---------------------------------------------------------------------------*/
4160   bspThis = pCAW[0].bsp;
4161   pCDea = CddExpAlignNew();
4162   CddExpAlignAlloc(pCDea,bspThis->length);
4163   pCDea->ids = CddSeqIdDupPDBGI(bspThis->id);
4164   pCDea->ids->next = SeqIdDup((*bspCons)->id);
4165   pCDea->bIdAlloc = TRUE;
4166   for (k=0;k<maxextlen;k++) {
4167     if (pCEACol[k].conpos != -1) {
4168       pCDea->adata[pCEACell[0][k].seqpos]=pCEACol[k].conpos;
4169     }
4170   }
4171 /*---------------------------------------------------------------------------*/
4172 /* record block structure present in the input alignment to prevent merges   */
4173 /*---------------------------------------------------------------------------*/
4174   ddp = salp->segs;
4175   while (ddp) {
4176     pCDea->starts[ddp->starts[0]] = 1;
4177     ddp = ddp->next;
4178   }
4179   *salpCons = CddExpAlignToSeqAlign(pCDea,NULL);
4180   pCDea = CddExpAlignFree(pCDea);
4181 
4182 /*---------------------------------------------------------------------------*/
4183 /* create a multiple alignment salpNew where the consensus is the new master */
4184 /*---------------------------------------------------------------------------*/
4185   salpNew = NULL;
4186   for (i=0;i<nalign;i++) {
4187     pCDea = CddExpAlignNew();
4188     CddExpAlignAlloc(pCDea,ConsensusLen);
4189     pCDea->ids = SeqIdDup((*bspCons)->id);
4190     pCDea->ids->next = SeqIdDup(pCAW[i].bsp->id);
4191     pCDea->bIdAlloc = TRUE;
4192     for (k=0;k<maxextlen;k++) {
4193       if (pCEACol[k].conpos != -1) {
4194         pCDea->adata[pCEACol[k].conpos]=pCEACell[i][k].seqpos;
4195       }
4196     }
4197     salpThis = CddExpAlignToSeqAlign(pCDea,NULL);
4198     pCDea = CddExpAlignFree(pCDea);
4199     if (salpThis) {
4200       if (!salpNew) {
4201         salpNew = salpThis;
4202       } else {
4203         salpTail->next = salpThis;
4204       }
4205       salpTail = salpThis;
4206     }
4207   }
4208 
4209 /*---------------------------------------------------------------------------*/
4210 /* deallocate memory for this function                                       */
4211 /*---------------------------------------------------------------------------*/
4212   MemFree(cbuffer);
4213   MemFree(buffer);
4214   for (j=0;j<nalign;j++) MemFree(pCEACell[j]);
4215   MemFree(pCEACell);
4216   for (j=0;j<maxextlen;j++) {
4217     MemFree(pCEACol[j].typecount);
4218     MemFree(pCEACol[j].wtypefreq);
4219   }
4220   MemFree(pCEACol);
4221   MemFree(pCAW);
4222   MemFree(trunc_aligned);
4223   MemFree(trunc_on_virtual);
4224   return(salpNew);
4225 }
4226 
4227 
4228 /*---------------------------------------------------------------------------*/
4229 /*---------------------------------------------------------------------------*/
4230 /* first part of CddCposComp()                                               */
4231 /* basically returns compactSearch, LetterHead, and posSearch                */
4232 /*---------------------------------------------------------------------------*/
4233 /*---------------------------------------------------------------------------*/
CddCposCompPart1(SeqAlignPtr listOfSeqAligns,BioseqPtr bsp,compactSearchItems * compactSearch,ValNodePtr * LetterHead,posSearchItems * posSearch)4234 static void CddCposCompPart1(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp,
4235                              compactSearchItems* compactSearch, ValNodePtr* LetterHead,
4236                              posSearchItems* posSearch) {
4237     Int4                numalign, numseq;    /*number of alignments and seqs */
4238     BLAST_ScoreBlkPtr   sbp;
4239     SeqLocPtr           private_slp = NULL;
4240     SeqPortPtr          spp = NULL;
4241     Uint1Ptr            query_seq, query_seq_start;
4242     Uint1               residue;
4243     Int4                index, a, KarlinReturn, array_size;
4244     Nlm_FloatHiPtr      lambda, K, H;
4245     Int4Ptr             open, extend;
4246     BLAST_ResFreqPtr    stdrfp;
4247     Int2                iStatus;
4248     ValNodePtr          error_return = NULL;
4249     ValNodePtr          newLetter;
4250     SeqCodeTablePtr     sctp;
4251     BioseqPtr           bspFake, temp_bsp;
4252 
4253     /* this is used for the DenseDiag calculations */
4254     bspFake  = (BioseqPtr) CddBioseqCopy(NULL,bsp,0,bsp->length-1,0,FALSE);
4255 
4256     if (listOfSeqAligns->segtype == SAS_DENDIAG) {
4257       numalign = CddCountDenDiagSeqAligns(listOfSeqAligns, &numseq);
4258     }
4259     else {
4260       numalign = CddCountSeqAligns(listOfSeqAligns, &numseq);
4261     }
4262     sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
4263     sbp->read_in_matrix = TRUE;
4264     sbp->protein_alphabet = TRUE;
4265     sbp->posMatrix = NULL;
4266     sbp->number_of_contexts = 1;
4267     iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
4268     ASSERT(iStatus == 0);
4269     compactSearch->qlength = bsp->length;
4270     compactSearch->alphabetSize = sbp->alphabet_size;
4271     compactSearch->matrix = sbp->matrix;
4272     compactSearch->gapped_calculation = TRUE;
4273     compactSearch->pseudoCountConst = 10;
4274     compactSearch->ethresh = 0.001;
4275 
4276 /*---------------------------------------------------------------------------*/
4277 /* get the query sequence                                                    */
4278 /*---------------------------------------------------------------------------*/
4279     temp_bsp = bsp;
4280     if (listOfSeqAligns->segtype == SAS_DENDIAG) {
4281       bsp = bspFake;  /* to make call compatible with CddDenDiagCposComputation */
4282     }
4283     private_slp = SeqLocIntNew(0, bsp->length-1 , Seq_strand_plus, SeqIdFindBest(bsp->id, SEQID_GI));
4284 		spp = SeqPortNewByLoc(private_slp, Seq_code_ncbistdaa);
4285     SeqPortSet_do_virtual(spp, TRUE);
4286     query_seq_start = (Uint1Ptr) MemNew(((bsp->length)+2)*sizeof(Char));
4287     bsp = temp_bsp;
4288 		query_seq_start[0] = NULLB;
4289     query_seq = query_seq_start+1;
4290     index=0;
4291     while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
4292       if (IS_residue(residue)) {
4293 				query_seq[index] = residue;
4294 				index++;
4295 			}
4296 		}
4297 		query_seq[index] = NULLB;
4298 		spp = SeqPortFree(spp);
4299 		private_slp = SeqLocFree(private_slp);
4300     compactSearch->query = query_seq;
4301 
4302     BlastScoreBlkFill(sbp,(CharPtr) compactSearch->query,compactSearch->qlength,0);
4303 
4304     sbp->kbp_gap_std[0] = BlastKarlinBlkCreate();
4305     KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_std[0],11,1,sbp->name,&error_return);
4306     if (1 == KarlinReturn) {
4307       BlastErrorPrint(error_return);
4308     }
4309     sbp->kbp_gap_psi[0] = BlastKarlinBlkCreate();
4310     KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_psi[0],11,1,sbp->name,&error_return);
4311     if (1 == KarlinReturn) {
4312       BlastErrorPrint(error_return);
4313     }
4314 
4315     array_size = BlastKarlinGetMatrixValues(sbp->name,&open,&extend,&lambda,&K,&H,NULL);
4316     if (array_size > 0) {
4317       for (index = 0; index < array_size; index++) {
4318         if (open[index] == INT2_MAX && extend[index] == INT2_MAX) {
4319           sbp->kbp_ideal = BlastKarlinBlkCreate();
4320           sbp->kbp_ideal->Lambda = lambda[index];
4321           sbp->kbp_ideal->K = K[index];
4322           sbp->kbp_ideal->H = H[index];
4323         }
4324       }
4325       MemFree(open);
4326       MemFree(extend);
4327       MemFree(lambda);
4328       MemFree(K);
4329       MemFree(H);
4330     }
4331     if (sbp->kbp_ideal == NULL) {
4332       sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
4333     }
4334     compactSearch->lambda       = sbp->kbp_gap_std[0]->Lambda;
4335     compactSearch->kbp_std      = sbp->kbp_std;
4336     compactSearch->kbp_psi      = sbp->kbp_psi;
4337     compactSearch->kbp_gap_std  = sbp->kbp_gap_std;
4338     compactSearch->kbp_gap_psi  = sbp->kbp_gap_psi;
4339     compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
4340     compactSearch->K_ideal      = sbp->kbp_ideal->K;
4341 
4342     stdrfp = BlastResFreqNew(sbp);
4343     BlastResFreqStdComp(sbp,stdrfp);
4344     compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
4345     for(a = 0; a < compactSearch->alphabetSize; a++)
4346       compactSearch->standardProb[a] = stdrfp->prob[a];
4347     stdrfp = BlastResFreqDestruct(stdrfp);
4348     posSearch->posInformation = NULL;
4349 /*---------------------------------------------------------------------------*/
4350 /* numseq is replaced with numalign (last argument) - each alignment is a    */
4351 /* single segment and represents a separate sequence                         */
4352 /*---------------------------------------------------------------------------*/
4353     posAllocateMemory(posSearch, compactSearch->alphabetSize, compactSearch->qlength, numalign);
4354     if (listOfSeqAligns->segtype == SAS_DENDIAG) {
4355       CddfindDenseDiagThreshSequences(posSearch, listOfSeqAligns, numalign, numalign);
4356     }
4357     else {
4358       CddfindThreshSequences(posSearch, listOfSeqAligns, numalign, numalign);
4359     }
4360     sbp->kbp     = sbp->kbp_psi;
4361     sbp->kbp_gap = sbp->kbp_gap_psi;
4362     CddposInitializeInformation(posSearch,sbp,compactSearch);
4363     if (listOfSeqAligns->segtype == SAS_DENDIAG) {
4364       CddposDenseDiagDemographics(posSearch, compactSearch, listOfSeqAligns);
4365     }
4366     else {
4367       CddposDemographics(posSearch, compactSearch, listOfSeqAligns);
4368     }
4369     posPurgeMatches(posSearch, compactSearch, NULL);
4370     posComputeExtents(posSearch, compactSearch);
4371     posComputeSequenceWeights(posSearch, compactSearch, 1.0);
4372     posCheckWeights(posSearch, compactSearch);
4373     posSearch->posFreqs = (Nlm_FloatHi **) posComputePseudoFreqs(posSearch, compactSearch, TRUE);
4374     CddposFreqsToMatrix(posSearch,compactSearch);
4375     posScaling(posSearch, compactSearch);
4376 
4377     sctp = SeqCodeTableFind(Seq_code_ncbistdaa);
4378     for (a=0;a<compactSearch->alphabetSize;a++) {
4379       newLetter = ValNodeNew(NULL);
4380       newLetter->data.ptrvalue = MemNew(2*sizeof(Char));
4381       Nlm_StrNCpy(newLetter->data.ptrvalue,&(sctp->letters[a]),1);
4382       ValNodeLink(LetterHead,newLetter);
4383     }
4384 }
4385 
4386 
CddGetNewIndexForThreading(char InChar,char * Output)4387 Int4 LIBCALL CddGetNewIndexForThreading(char InChar, char* Output) {
4388 /*---------------------------------------------------------------------------*/
4389 /*  get the index in Output where InChar is found.                           */
4390 /*  return -1 if char is not found in the Output array.                      */
4391 /*---------------------------------------------------------------------------*/
4392     Int4  i, Index=0;
4393 
4394     /* look through Output for InChar */
4395     for (i=0; i<StrLen(Output); i++) {
4396       if (Output[i] == InChar) {
4397         return(i);
4398       }
4399     }
4400     return(-1);
4401 }
4402 
4403 
CddCalcPSSM(SeqAlignPtr listOfSeqAligns,BioseqPtr bsp,double Weight,double ScaleFactor)4404 Seq_Mtf* LIBCALL CddCalcPSSM(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp,
4405                              double Weight, double ScaleFactor) {
4406 /*---------------------------------------------------------------------------*/
4407 /* Generation of position-specific scoring matrices.                         */
4408 /* Put results in a Seq_Mtf structure in preparation for calling atd().      */
4409 /* allocate space for 2-d arrays pssm->ww and pssm->freq here.               */
4410 /* pssm->ww[i][j] is pssm, where i is sequence index, j is residue index.    */
4411 /* pssm->freqs[i][j] is table of residue frequencies.                        */
4412 /*---------------------------------------------------------------------------*/
4413     posSearchItems      posSearch;           /*holds position-specific info  */
4414     compactSearchItems  compactSearch;
4415     ValNodePtr          LetterHead = NULL;
4416     Int4                i, j, jj;
4417     char*               Input;     /* order of residue-types from CD's */
4418     char*               Output;    /* order of residue-types needed for threading */
4419     Boolean*            Coverage;  /* for making sure all columns are filled */
4420     Int4                OutLen;
4421     Seq_Mtf*            pssm;
4422 
4423     /* allocate arrays for strings that determine input and output order */
4424     Input = (char*) MemNew(sizeof(InputOrder));
4425     Output = (char*) MemNew(sizeof(OutputOrder));
4426     StrCpy(Input, InputOrder);
4427     StrCpy(Output, OutputOrder);
4428 
4429     OutLen = StrLen(Output);
4430     CddCposCompPart1(listOfSeqAligns, bsp, &compactSearch, &LetterHead, &posSearch);
4431 
4432     /* construct new pssm */
4433     pssm = NewSeqMtf(compactSearch.qlength, OutLen);
4434     Coverage = (Boolean*) MemNew(sizeof(Boolean) * OutLen);  /* set to 0 by MemNew */
4435 
4436     /* copy results to pssm */
4437     for (j=0; j<compactSearch.alphabetSize;j++) {
4438       jj = CddGetNewIndexForThreading(Input[j], Output);
4439       if (jj != -1) {
4440         for (i=0; i<compactSearch.qlength;i++) {
4441           pssm->ww[i][jj] = ThrdRound(posSearch.posMatrix[i][j]*ScaleFactor*Weight);
4442           pssm->freqs[i][jj] = ThrdRound(posSearch.posFreqs[i][j]*ScaleFactor);
4443           Coverage[jj] = TRUE;
4444         }
4445       }
4446     }
4447 
4448     /* make sure all columns are filled */
4449     for (i=0; i<OutLen; i++) {
4450       if (Coverage[i] == FALSE) {
4451         ASSERT(FALSE);  /* should never get here */
4452       }
4453     }
4454 
4455     /* clean up */
4456     MemFree(Input);
4457     MemFree(Output);
4458     MemFree(Coverage);
4459     CddposFreeMemory(&posSearch);
4460 
4461     return(pssm);
4462 }
4463 
4464 
CddCposComp(SeqAlignPtr listOfSeqAligns,BioseqPtr bsp,CddPtr pcdd)4465 void LIBCALL CddCposComp(SeqAlignPtr listOfSeqAligns, BioseqPtr bsp, CddPtr pcdd) {
4466 /*---------------------------------------------------------------------------*/
4467 /* Generation of position-specific scoring matrices for database scanning    */
4468 /*---------------------------------------------------------------------------*/
4469     posSearchItems      posSearch;           /*holds position-specific info  */
4470     compactSearchItems  compactSearch;
4471     Int4                index, a, iNew;
4472     ValNodePtr          error_return = NULL;
4473     ValNodePtr          ColumnHead;
4474     ValNodePtr          newRow, RowHead;
4475     ValNodePtr          LetterHead = NULL;
4476     Char                ckptFileName[PATH_MAX];
4477     Char                cseqFileName[PATH_MAX];
4478     FILE                *fp;
4479     CharPtr             cAccession;
4480 
4481     CddCposCompPart1(listOfSeqAligns, bsp, &compactSearch, &LetterHead, &posSearch);
4482 
4483     /* put results in proper format... */
4484 
4485     pcdd->posfreq = (MatrixPtr) MatrixNew();
4486     pcdd->posfreq->ncolumns = compactSearch.qlength;
4487     pcdd->posfreq->nrows = compactSearch.alphabetSize;
4488     pcdd->posfreq->scale_factor = POSFREQ_SCALE;
4489     pcdd->posfreq->row_labels = LetterHead;
4490     ColumnHead = NULL;
4491     for (index = 0; index<compactSearch.qlength;index++) {
4492       RowHead = NULL;
4493       for (a=0;a<compactSearch.alphabetSize;a++) {
4494          newRow = ValNodeNew(NULL);
4495          iNew = (Int4) (0.5 + (Nlm_FloatHi) pcdd->posfreq->scale_factor * posSearch.posFreqs[index][a]);
4496          newRow->data.intvalue = iNew;
4497            ValNodeLink(&(ColumnHead),newRow);
4498       }
4499     }
4500     pcdd->posfreq->columns = ColumnHead;
4501 
4502     pcdd->scoremat = (MatrixPtr) MatrixNew();
4503     pcdd->scoremat->ncolumns = compactSearch.qlength;
4504     pcdd->scoremat->nrows = compactSearch.alphabetSize;
4505     pcdd->scoremat->scale_factor = 1;
4506     pcdd->scoremat->row_labels = LetterHead;
4507     ColumnHead = NULL;
4508     for (index = 0; index<compactSearch.qlength;index++) {
4509       RowHead = NULL;
4510       for (a=0;a<compactSearch.alphabetSize;a++) {
4511          newRow = ValNodeNew(NULL);
4512          iNew = (Int4) posSearch.posMatrix[index][a];
4513          newRow->data.intvalue = iNew;
4514            ValNodeLink(&(ColumnHead),newRow);
4515       }
4516     }
4517     pcdd->scoremat->columns = ColumnHead;
4518 
4519 /*---------------------------------------------------------------------------*/
4520 /* Construct name for checkpoint file                                        */
4521 /*---------------------------------------------------------------------------*/
4522     cAccession = CddGetAccession(pcdd);
4523     strcpy(ckptFileName,cAccession);
4524     strcat(ckptFileName,CKPTEXT);
4525 
4526     CddposTakeCheckpoint(&posSearch, &compactSearch, ckptFileName, &error_return);
4527     strcpy(cseqFileName,cAccession);
4528     strcat(cseqFileName,CSEQEXT);
4529 
4530     MemFree(cAccession);
4531 
4532     fp = FileOpen(cseqFileName, "w");
4533     if (NULL == fp) {
4534       BlastConstructErrorMessage("CddCposComp", "Could not open fasta file", 1, &error_return);
4535     }
4536     BioseqToFasta(bsp,fp,FALSE);
4537     FileClose(fp);
4538 }
4539 
4540 /*---------------------------------------------------------------------------*/
4541 /*---------------------------------------------------------------------------*/
4542 /* modified after PGPReadBlastOptions from blastpgp.c, as of 12/27/00        */
4543 /*---------------------------------------------------------------------------*/
4544 /*---------------------------------------------------------------------------*/
CddReadBlastOptions(BioseqPtr bsp,Int4 iPseudo,CharPtr matrix_name)4545 static PGPBlastOptionsPtr CddReadBlastOptions(BioseqPtr bsp, Int4 iPseudo, CharPtr matrix_name)
4546 {
4547     PGPBlastOptionsPtr  bop;
4548     BLAST_OptionsBlkPtr options;
4549 
4550     bop = MemNew(sizeof(PGPBlastOptions));
4551     bop->blast_database   = StringSave("nr");
4552     bop->number_of_descriptions = 500;
4553     bop->number_of_alignments = 500;
4554     bop->believe_query = FALSE;
4555     options = BLASTOptionNew("blastp",TRUE);                /* gapped Blast! */
4556     bop->options = options;
4557     bop->query_bsp = bsp;
4558     BLASTOptionSetGapParams(options, matrix_name, 0, 0);
4559     /* bop->is_xml_output = FALSE; */
4560     bop->align_options = 0;
4561     bop->print_options = 0;
4562     options->required_start = 0;
4563     options->required_end = -1;
4564     options->window_size = 40;
4565     options->dropoff_2nd_pass  = 7.0;
4566     options->expect_value  = 10.0;
4567     options->kappa_expect_value = options->expect_value;
4568     options->hitlist_size = 500;
4569     options->two_pass_method  = FALSE;
4570     options->multiple_hits_only  = TRUE;
4571     if (strcmp(matrix_name,"BLOSUM62") == 0) {
4572       options->gap_open = 11;
4573       options->gap_extend = 1;
4574     }
4575     options->decline_align = INT2_MAX;
4576     options->gap_x_dropoff = 15;
4577     options->gap_x_dropoff_final = 25;
4578     options->gap_trigger = 22.0;
4579     options->filter_string = StringSave("F");
4580     options->number_of_cpus = 1;
4581     options->isPatternSearch = FALSE;
4582     options->ethresh = 0.0;                  /* zeroed out, will not be used */
4583     options->pseudoCountConst = iPseudo;
4584     options->maxNumPasses = 1;
4585     options->hsp_range_max  = 0;
4586     options->block_width  = 20; /* Default value - previously '-L' parameter */
4587     options->tweak_parameters = TRUE;
4588     options->smith_waterman = FALSE;
4589     if (options->tweak_parameters > 1) {
4590         /* relax the cutoff evalue so that we don't loose too many
4591          * hits for which compositional adjustment improves the
4592          * evalue. */
4593         options->expect_value *= EVALUE_EXPAND;
4594     }
4595 
4596     options = BLASTOptionValidate(options, "blastp");
4597     if (options == NULL) return NULL;
4598     bop->fake_bsp = bop->query_bsp;
4599 #if 0
4600       BioseqNew();
4601     bop->fake_bsp->descr = bop->query_bsp->descr;
4602     bop->fake_bsp->repr = bop->query_bsp->repr;
4603     bop->fake_bsp->mol = bop->query_bsp->mol;
4604     bop->fake_bsp->length = bop->query_bsp->length;
4605     bop->fake_bsp->seq_data_type = bop->query_bsp->seq_data_type;
4606     bop->fake_bsp->seq_data = bop->query_bsp->seq_data;
4607     bop->fake_bsp->id = bop->query_bsp->id;
4608 #endif
4609     /* bop->fake_bsp->seq_data = BSDup(bop->query_bsp->seq_data); */
4610     /* bop->fake_bsp->id = SeqIdDup(bop->query_bsp->id); */
4611     /* bug fix -- Lewis, Dave and Aron */
4612     /* bop->query_bsp->id = SeqIdSetFree(bop->query_bsp->id); */
4613     /* BLASTUpdateSeqIdInSeqInt(options->query_lcase_mask, bop->fake_bsp->id); */
4614     return bop;
4615 }
4616 
4617 
4618 
4619 /*---------------------------------------------------------------------------*/
4620 /*---------------------------------------------------------------------------*/
4621 /* write out a matrix-file (ASCII formatted) for use with copymat/formatdb   */
4622 /* routines from makemat, 4/9/01                                             */
4623 /*---------------------------------------------------------------------------*/
4624 /*---------------------------------------------------------------------------*/
4625 
CddputMatrixKbp(FILE * checkFile,BLAST_KarlinBlkPtr kbp,Boolean scaling,Nlm_FloatHi scalingDown)4626 static void CddputMatrixKbp(FILE *checkFile, BLAST_KarlinBlkPtr kbp,
4627                             Boolean scaling, Nlm_FloatHi scalingDown)
4628 {
4629   if (scaling)
4630     fprintf(checkFile,"%le\n",kbp->Lambda * scalingDown);
4631   else
4632     fprintf(checkFile,"%le\n",kbp->Lambda);
4633   fprintf(checkFile,"%le\n",kbp->K);
4634   fprintf(checkFile,"%le\n",kbp->logK);
4635   fprintf(checkFile,"%le\n",kbp->H);
4636 }
4637 
CddputMatrixMatrix(FILE * checkFile,compactSearchItems * compactSearch,posSearchItems * posSearch,Boolean scaleScores)4638 static void CddputMatrixMatrix(FILE *checkFile,
4639                                compactSearchItems *compactSearch,
4640 			       posSearchItems *posSearch,
4641 			       Boolean scaleScores)
4642 {
4643   Int4 i, j;                                                  /*loop indices*/
4644 
4645   if (scaleScores) {
4646     for(i = 0; i < compactSearch->qlength; i++) {
4647       for(j = 0; j < compactSearch->alphabetSize; j++)
4648         fprintf(checkFile,"%ld  ", (long) posSearch->posPrivateMatrix[i][j]);
4649       fprintf(checkFile,"\n");
4650     }
4651   }
4652   else {
4653     for(i = 0; i < compactSearch->qlength; i++) {
4654       for(j = 0; j < compactSearch->alphabetSize; j++)
4655         fprintf(checkFile,"%ld  ", (long) posSearch->posMatrix[i][j]);
4656       fprintf(checkFile,"\n");
4657     }
4658   }
4659 }
4660 
CddtakeMatrixCheckpoint(compactSearchItems * compactSearch,posSearchItems * posSearch,BLAST_ScoreBlkPtr sbp,Char * fileName,ValNodePtr * error_return,Boolean scaleScores,Nlm_FloatHi scalingFactor)4661 static Boolean CddtakeMatrixCheckpoint(compactSearchItems *compactSearch,
4662                                        posSearchItems *posSearch,
4663 				       BLAST_ScoreBlkPtr sbp,
4664                                        Char *fileName,ValNodePtr *error_return,
4665 				       Boolean scaleScores,
4666 				       Nlm_FloatHi scalingFactor)
4667 {
4668   FILE * checkFile;                  /* file in which to take the checkpoint */
4669   Int4 length;              /* length of query sequence, and an index for it */
4670   Int4 i;                                /* indices to position and alphabet */
4671   Char localChar;                                     /* temporary character */
4672 
4673   checkFile = FileOpen(fileName, "w");
4674   if (NULL == checkFile) {
4675     BlastConstructErrorMessage("posTakeCheckpoint", "Could not open checkpoint file", 1, error_return);
4676     return(FALSE);
4677   }
4678   length = compactSearch->qlength;
4679   fprintf(checkFile,"%ld\n",(long) length);
4680   for(i = 0; i < length; i++) {
4681     localChar = getRes(compactSearch->query[i]);
4682     fprintf(checkFile,"%c",localChar);
4683     posSearch->posMatrix[i][Xchar] = Xscore;
4684     posSearch->posPrivateMatrix[i][Xchar] = Xscore * scalingFactor;
4685   }
4686   fprintf(checkFile,"\n");
4687   CddputMatrixKbp(checkFile, compactSearch->kbp_gap_std[0], scaleScores, 1/scalingFactor);
4688   CddputMatrixKbp(checkFile, compactSearch->kbp_gap_psi[0], scaleScores, 1/scalingFactor);
4689   CddputMatrixKbp(checkFile, sbp->kbp_ideal, scaleScores, 1/scalingFactor);
4690   CddputMatrixMatrix(checkFile, compactSearch, posSearch, scaleScores);
4691   FileClose(checkFile);
4692   return(TRUE);
4693 }
4694 
4695 /*---------------------------------------------------------------------------*/
4696 /*---------------------------------------------------------------------------*/
4697 /*---------------------------------------------------------------------------*/
4698 /*---------------------------------------------------------------------------*/
BlastKarlinBlkCopy(BLAST_KarlinBlkPtr kbp_in,BLAST_KarlinBlkPtr kbp_out)4699 static void BlastKarlinBlkCopy(BLAST_KarlinBlkPtr kbp_in,BLAST_KarlinBlkPtr kbp_out)
4700 {
4701   kbp_out->Lambda = kbp_in->Lambda;
4702   kbp_out->K = kbp_in->K;
4703   kbp_out->logK = kbp_in->logK;
4704   kbp_out->H = kbp_in->H;
4705 /*  kbp_out->Lambda_real = kbp_in->Lambda_real;
4706   kbp_out->K_real = kbp_in->K_real;
4707   kbp_out->logK_real = kbp_in->logK_real;
4708   kbp_out->H_real = kbp_in->H_real;
4709   kbp_out->q_frame = kbp_in->q_frame;
4710   kbp_out->s_frame = kbp_in->s_frame; */
4711   kbp_out->paramC = kbp_in->paramC;
4712 }
4713 
4714 /*---------------------------------------------------------------------------*/
4715 /*---------------------------------------------------------------------------*/
4716 /* version of the PSSM calculation routine which does not return Karlin-     */
4717 /* Altschul parameters                                                       */
4718 /*---------------------------------------------------------------------------*/
4719 /*---------------------------------------------------------------------------*/
CddDenDiagCposComp2(BioseqPtr bspFake,Int4 iPseudo,SeqAlignPtr salp,CddPtr pcdd,BioseqPtr bspOut,double Weight,double ScaleFactor,CharPtr matrix_name)4720 Seq_Mtf * LIBCALL CddDenDiagCposComp2(BioseqPtr bspFake, Int4 iPseudo,
4721                                       SeqAlignPtr salp, CddPtr pcdd,
4722 				      BioseqPtr bspOut, double Weight,
4723 				      double ScaleFactor, CharPtr matrix_name)
4724 {
4725   return CddDenDiagCposComp2KBP(bspFake, iPseudo, salp, pcdd, bspOut,
4726                                 Weight, ScaleFactor, matrix_name, NULL);
4727 }
4728 
4729 /*---------------------------------------------------------------------------*/
4730 /*---------------------------------------------------------------------------*/
4731 /* updated version of the PSSM calculation routine, modified from blastpgp.c */
4732 /* as of 12/27/2000                                                          */
4733 /*---------------------------------------------------------------------------*/
4734 /*---------------------------------------------------------------------------*/
CddDenDiagCposComp2KBP(BioseqPtr bspFake,Int4 iPseudo,SeqAlignPtr salp,CddPtr pcdd,BioseqPtr bspOut,double Weight,double ScaleFactor,CharPtr matrix_name,BLAST_KarlinBlkPtr kbp)4735 Seq_Mtf * LIBCALL CddDenDiagCposComp2KBP(BioseqPtr bspFake, Int4 iPseudo,
4736                                          SeqAlignPtr salp, CddPtr pcdd,
4737 				         BioseqPtr bspOut, double Weight,
4738 				         double ScaleFactor, CharPtr matrix_name,
4739 				         BLAST_KarlinBlkPtr kbp)
4740 {
4741   PGPBlastOptionsPtr      bop;
4742   BlastSearchBlkPtr       search;
4743   posSearchItems         *posSearch = NULL;
4744   compactSearchItems     *compactSearch = NULL;
4745   ValNodePtr              ColumnHead, LetterHead, newRow, RowHead, newLetter;
4746   SeqCodeTablePtr         sctp;
4747   Int4                    numSeqs, numASeqs, alignLength, index, a, iNew, i;
4748   /*
4749   Int4                    scale_factor;
4750   */
4751   Int4                    j, OutLen, jj;
4752   Char                    ckptFileName[PATH_MAX];
4753   Char                    cseqFileName[PATH_MAX];
4754   /*
4755   Char                    mtrxFileName[PATH_MAX];
4756   */
4757   FILE                   *fp;
4758   Boolean                 bHasConsensus;
4759   Boolean                 bWriteOut = FALSE;
4760   Nlm_FloatHi            *AInf, SumAInf = 0.0;
4761   char*                   Input;    /* order of residue-types from CD's      */
4762   char*                   Output;   /*order of res-types needed for threading*/
4763   Boolean*                Coverage; /* for making sure all columns are filled*/
4764   Seq_Mtf*                pssm;
4765   CharPtr                 cAccession;
4766   Boolean                 scalingOkay = TRUE;
4767 
4768   cAccession = CddGetAccession(pcdd);
4769   if (pcdd) {
4770     bHasConsensus = CddHasConsensus(pcdd);
4771     bWriteOut = CddHasDescription(pcdd,"Write out matrix");
4772   } else {                                        /* determine from seqalign */
4773     bHasConsensus = SeqAlignHasConsensus(salp);
4774   }
4775   if (iPseudo <= 0) {                             /* need to determine first */
4776     AInf = SeqAlignInform(salp, bspFake, bHasConsensus, 0);
4777     for (i=0;i<bspFake->length;i++) SumAInf += AInf[i];
4778     if      (SumAInf > 84  ) iPseudo = 10;             /* purely empirical   */
4779     else if (SumAInf > 55  ) iPseudo =  7;
4780     else if (SumAInf > 43  ) iPseudo =  5;
4781     else if (SumAInf > 41.5) iPseudo =  4;
4782     else if (SumAInf > 40  ) iPseudo =  3;
4783     else if (SumAInf > 39  ) iPseudo =  2;
4784     else                     iPseudo =  1;
4785     MemFree(AInf);
4786     if (pcdd && bWriteOut) printf("%s AInf:%6.1f PseudoCt: %d\n",cAccession,SumAInf, iPseudo);
4787   }
4788   if (NULL == matrix_name) {
4789     static CharPtr defaultMatrix = "BLOSUM62";
4790     matrix_name = defaultMatrix;
4791   }
4792   bop = (PGPBlastOptionsPtr) CddReadBlastOptions(bspFake, iPseudo, matrix_name);
4793   if (pcdd) {          /* set up search with nr database size for CD dumping */
4794     search = BLASTSetUpSearchWithReadDb(bop->fake_bsp, "blastp",
4795                                         bop->query_bsp->length,
4796                                         bop->blast_database,
4797                                         bop->options, NULL);
4798   } else {          /* or avoid using nr when calculating PSSM for threading */
4799     search = BLASTSetUpSearch(bop->fake_bsp, "blastp",
4800                               bop->query_bsp->length,
4801 			      0, NULL, bop->options, NULL);
4802   }
4803   posSearch = NULL;
4804   compactSearch = NULL;
4805   posSearch = (posSearchItems *)MemNew(sizeof(posSearchItems));
4806   compactSearch = compactSearchNew(compactSearch);
4807   copySearchItems(compactSearch, search, bop->options->matrix);
4808   posInitializeInformation(posSearch,search);
4809 
4810 /*---------------------------------------------------------------------------*/
4811 /* section modelled after BposComputation, posit.c as of 12/28/2000          */
4812 /*---------------------------------------------------------------------------*/
4813   search->posConverged = FALSE;
4814   numSeqs = CddCountDenDiagSeqAligns(salp, &numASeqs) + 1;
4815   alignLength = bspFake->length;
4816   posAllocateMemory(posSearch, compactSearch->alphabetSize,
4817                        compactSearch->qlength, numSeqs);
4818   CddposProcessAlignment(posSearch,compactSearch,salp,numSeqs,alignLength,bspFake);
4819   posSearch->posNumSequences = numSeqs;
4820   posPurgeMatches(posSearch, compactSearch, NULL);
4821   if (bHasConsensus) posCancel(posSearch,compactSearch,0,0,0,compactSearch->qlength);
4822   posComputeExtents(posSearch, compactSearch);
4823   posComputeSequenceWeights(posSearch, compactSearch, 1.0);
4824   posCheckWeights(posSearch, compactSearch);
4825 /*  printf("Calling posComputePseudoFreqs with %s\n",compactSearch->standardMatrixName);
4826   fflush(stdout); */
4827   posSearch->posFreqs = posComputePseudoFreqs(posSearch,compactSearch,TRUE);
4828   if (NULL == search->sbp->posFreqs)
4829     search->sbp->posFreqs =  allocatePosFreqs(compactSearch->qlength, compactSearch->alphabetSize);
4830   copyPosFreqs(posSearch->posFreqs,search->sbp->posFreqs, compactSearch->qlength, compactSearch->alphabetSize);
4831 
4832 /*---------------------------------------------------------------------------*/
4833 /* Construct name for checkpoint file and write out (if in a CD context)     */
4834 /*---------------------------------------------------------------------------*/
4835   if (pcdd && bWriteOut) {
4836     strcpy(ckptFileName,cAccession);
4837     strcat(ckptFileName,CKPTEXT);
4838     if (NULL != ckptFileName)
4839       posTakeCheckpoint(posSearch, compactSearch, ckptFileName, NULL);
4840   }
4841 
4842   posFreqsToMatrix(posSearch,compactSearch);
4843 /*---------------------------------------------------------------------------*/
4844 /* kludge to set the matrix column for matches to * (stop codons) to the     */
4845 /* most negative value used, so that matrix-file corresponds to what makemat */
4846 /* produces.                                                                 */
4847 /*---------------------------------------------------------------------------*/
4848   ASSERT(compactSearch->alphabetSize == 26);                 /* sanity check */
4849 /*
4850   for (c=0;c<compactSearch->qlength;c++) {
4851     posSearch->posPrivateMatrix[c][compactSearch->alphabetSize-1] = BLAST_SCORE_MIN;
4852   }
4853 */
4854 /*  posScaling(posSearch, compactSearch); */
4855   scalingOkay = impalaScaling(posSearch, compactSearch, 1.0, TRUE);
4856   if (!scalingOkay) goto cleanup;
4857 
4858   if (pcdd) {
4859 /*---------------------------------------------------------------------------*/
4860 /* also, write out the PSSM as an ASCII-formatted MATRIX file, for use with  */
4861 /* copymat                                                                   */
4862 /*---------------------------------------------------------------------------*/
4863 /*    strcpy(mtrxFileName,cAccession);
4864     strcat(mtrxFileName,MTRXEXT);
4865     CddtakeMatrixCheckpoint(compactSearch,posSearch,search->sbp,mtrxFileName,
4866                             NULL,FALSE,1.0);
4867 */
4868 /*---------------------------------------------------------------------------*/
4869 /* in compliance with makemat, set the score for the X-column                */
4870 /*---------------------------------------------------------------------------*/
4871     for (index = 0; index<compactSearch->qlength;index++) {
4872       posSearch->posMatrix[index][21] = Xscore;
4873     }
4874     sctp = SeqCodeTableFind(Seq_code_ncbistdaa);
4875     LetterHead = NULL;
4876     for (a=0;a<compactSearch->alphabetSize;a++) {
4877       newLetter = ValNodeNew(NULL);
4878       newLetter->data.ptrvalue = MemNew(2*sizeof(Char));
4879       Nlm_StrNCpy(newLetter->data.ptrvalue,&(sctp->letters[a]),1);
4880       ValNodeLink(&(LetterHead),newLetter);
4881     }
4882 
4883     pcdd->posfreq = (MatrixPtr) MatrixNew();
4884     pcdd->posfreq->ncolumns = compactSearch->qlength;
4885     pcdd->posfreq->nrows = compactSearch->alphabetSize;
4886     pcdd->posfreq->scale_factor = POSFREQ_SCALE;
4887     pcdd->posfreq->row_labels = LetterHead;
4888     ColumnHead = NULL;
4889     for (index = 0; index<compactSearch->qlength;index++) {
4890       RowHead = NULL;
4891       for (a=0;a<compactSearch->alphabetSize;a++) {
4892          newRow = ValNodeNew(NULL);
4893          iNew = (Int4) (0.5 + (Nlm_FloatHi) pcdd->posfreq->scale_factor * posSearch->posFreqs[index][a]);
4894          newRow->data.intvalue = iNew;
4895            ValNodeLink(&(ColumnHead),newRow);
4896       }
4897     }
4898     pcdd->posfreq->columns = ColumnHead;
4899     pcdd->scoremat = (MatrixPtr) MatrixNew();
4900     pcdd->scoremat->ncolumns = compactSearch->qlength;
4901     pcdd->scoremat->nrows = compactSearch->alphabetSize;
4902     pcdd->scoremat->scale_factor = 1;
4903     pcdd->scoremat->row_labels = LetterHead;
4904     ColumnHead = NULL;
4905     for (index = 0; index<compactSearch->qlength;index++) {
4906       RowHead = NULL;
4907       for (a=0;a<compactSearch->alphabetSize;a++) {
4908          newRow = ValNodeNew(NULL);
4909          iNew = (Int4) posSearch->posMatrix[index][a];
4910          newRow->data.intvalue = iNew;
4911            ValNodeLink(&(ColumnHead),newRow);
4912       }
4913     }
4914     pcdd->scoremat->columns = ColumnHead;
4915 
4916   } else  { /* if pcdd is empty, fill in the Seq_Mtf data structure */
4917     Input = (char*) MemNew(sizeof(InputOrder));
4918     Output = (char*) MemNew(sizeof(OutputOrder));
4919     StrCpy(Input, InputOrder);
4920     StrCpy(Output, OutputOrder);
4921     OutLen = StrLen(Output);
4922     pssm = NewSeqMtf(compactSearch->qlength, OutLen);
4923     Coverage = (Boolean*) MemNew(sizeof(Boolean) * OutLen);  /* set to 0 by MemNew */
4924     for (j=0; j<compactSearch->alphabetSize;j++) {
4925       jj = CddGetNewIndexForThreading(Input[j], Output);
4926       if (jj != -1) {
4927         for (i=0; i<compactSearch->qlength;i++) {
4928           pssm->ww[i][jj] = ThrdRound(posSearch->posMatrix[i][j]*ScaleFactor*Weight);
4929           pssm->freqs[i][jj] = ThrdRound(posSearch->posFreqs[i][j]*ScaleFactor);
4930           Coverage[jj] = TRUE;
4931         }
4932       }
4933     }
4934     for (i=0; i<OutLen; i++) {
4935       if (Coverage[i] == FALSE) {
4936         ASSERT(FALSE);  /* should never get here */
4937       }
4938     }
4939     MemFree(Input);
4940     MemFree(Output);
4941     MemFree(Coverage);
4942   }
4943 
4944   if (bspOut && pcdd && bWriteOut) {
4945     strcpy(cseqFileName,cAccession);
4946     strcat(cseqFileName,CSEQEXT);
4947     fp = FileOpen(cseqFileName, "w");
4948     if (NULL == fp) {
4949       BlastConstructErrorMessage("CddDenDiagCposComp2", "Could not open fasta file", 1, NULL);
4950     }
4951     BioseqToFasta(bspOut,fp,FALSE);
4952     FileClose(fp);
4953   }
4954 
4955 /*---------------------------------------------------------------------------*/
4956 /* clean up                                                                  */
4957 /*---------------------------------------------------------------------------*/
4958 cleanup:
4959   MemFree(cAccession);
4960   posSearch->NumSequences = posSearch->posNumSequences;
4961   posSearch->QuerySize = alignLength;
4962   CddposFreeMemory(posSearch);
4963   MemFree(posSearch);
4964   if (kbp) {
4965     BlastKarlinBlkCopy(*(compactSearch->kbp_gap_psi),kbp);
4966   }
4967   CddposFreeMemory2(compactSearch);
4968   MemFree(compactSearch);
4969   search = BlastSearchBlkDestruct(search);
4970   bop->options = BLASTOptionDelete(bop->options);
4971   MemFree(bop->blast_database);
4972   MemFree(bop);
4973   if (NULL == pcdd && scalingOkay) {  /* return pssm if from within threader */
4974     return(pssm);
4975   }
4976   if (NULL != pcdd && !scalingOkay) { /* fields are empty if scaling failed  */
4977     pcdd->posfreq = NULL;
4978     pcdd->scoremat = NULL;
4979   }
4980   return NULL;
4981 }
4982 
4983 /*---------------------------------------------------------------------------*/
4984 /*---------------------------------------------------------------------------*/
4985 /* Free a Cdd data structure - works on a single CD only (not linked lists)  */
4986 /*---------------------------------------------------------------------------*/
4987 /*---------------------------------------------------------------------------*/
CddFreeCarefully(CddPtr pcdd)4988 CddPtr LIBCALL CddFreeCarefully(CddPtr pcdd)
4989 {
4990   BioseqPtr    bsp;
4991   BioseqSetPtr bssp;
4992   ScorePtr     nxtscore, thisscore;
4993   SeqAnnotPtr  sap, sapThis;
4994   SeqEntryPtr  sep, sepThis;
4995   ValNodePtr   vnp, vnpThis;
4996 
4997   if(pcdd == NULL) return NULL;
4998   if (NULL != pcdd->name) pcdd->name = MemFree(pcdd->name);
4999   if (NULL != pcdd->id)   pcdd->id   = (CddIdSetPtr) CddIdSetFree(pcdd->id);
5000 
5001   sap = pcdd->seqannot;
5002   while (sap) {
5003     sapThis = sap->next;
5004     sap = SeqAnnotFree(sap);
5005     sap = sapThis;
5006   }
5007 
5008   vnp = pcdd->description;
5009   while (vnp) {
5010     vnpThis = vnp->next;
5011     switch (vnp->choice) {
5012       case CddDescr_othername:
5013         vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
5014 	break;
5015       case CddDescr_category:
5016         vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
5017 	break;
5018       case CddDescr_comment:
5019         vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
5020 	break;
5021       case CddDescr_reference:
5022         vnp->data.ptrvalue = PubFree(vnp->data.ptrvalue);
5023 	break;
5024       case CddDescr_create_date:
5025         vnp->data.ptrvalue = DateFree(vnp->data.ptrvalue);
5026 	break;
5027       case CddDescr_tax_source:
5028         vnp->data.ptrvalue = OrgRefFree(vnp->data.ptrvalue);
5029 	break;
5030       case CddDescr_source:
5031         vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
5032 	break;
5033       default:
5034 	break;
5035     }
5036     vnp = MemFree(vnp);
5037     vnp = vnpThis;
5038   }
5039 
5040   if (NULL != pcdd->features) {
5041     pcdd->features = BiostrucAnnotSetFree(pcdd->features);
5042   }
5043 
5044   if (NULL != pcdd->sequences) {
5045     bssp = pcdd->sequences->data.ptrvalue;
5046     if (NULL != bssp) {
5047       sep = bssp->seq_set;
5048       while (sep) {
5049         sepThis = sep->next;
5050 	if (sep->choice == 1) {
5051           bsp = sep->data.ptrvalue;
5052 	  if (bsp->id->choice != SEQID_LOCAL)BioseqUnlock(bsp);
5053 	  BioseqFree(bsp);
5054         }
5055         MemFree(sep);
5056 	sep = sepThis;
5057       }
5058       bssp = MemFree(bssp);
5059     }
5060     pcdd->sequences = ValNodeFree(pcdd->sequences);
5061   }
5062 
5063   if(NULL != pcdd->profile_range) {
5064     pcdd->profile_range = SeqIntFree(pcdd->profile_range);
5065   }
5066   if (NULL != pcdd->trunc_master) {
5067     pcdd->trunc_master = BioseqFree(pcdd->trunc_master);
5068   }
5069   if (NULL != pcdd->posfreq) {
5070     vnp = pcdd->posfreq->row_labels; if (NULL != vnp) ValNodeFree(vnp);
5071     vnp = pcdd->posfreq->columns;    if (NULL != vnp) ValNodeFree(vnp);
5072     pcdd->posfreq = MemFree(pcdd->posfreq);
5073   }
5074   if (NULL != pcdd->scoremat) {
5075     vnp = pcdd->scoremat->columns;    if (NULL != vnp) ValNodeFree(vnp);
5076     pcdd->scoremat = MemFree(pcdd->scoremat);
5077   }
5078   if (NULL != pcdd->distance) {
5079     nxtscore = pcdd->distance->scores;
5080     while (nxtscore) {
5081       if (NULL != nxtscore->id) ObjectIdFree(nxtscore->id);
5082       thisscore = nxtscore;
5083       nxtscore = nxtscore->next;
5084       thisscore = MemFree(thisscore);
5085     }
5086     pcdd->distance = MemFree(pcdd->distance);
5087   }
5088   return MemFree(pcdd);
5089 }
5090 
5091 /*---------------------------------------------------------------------------*/
5092 /* Translate a SeqLoc into an Integer list of residue numbers                */
5093 /*---------------------------------------------------------------------------*/
CddGetFeatLocList(SeqLocPtr location,Int4 * nres)5094 Int4Ptr LIBCALL CddGetFeatLocList(SeqLocPtr location, Int4 *nres)
5095 {
5096   SeqLocPtr     slp;
5097   SeqIntPtr     sintp;
5098   PackSeqPntPtr pspp;
5099   SeqPntPtr     spp;
5100   Int4Ptr       presnum = NULL;
5101   Int4          i;
5102   Int4          nSubLoc;
5103   Int4Ptr       pSubLoc;
5104 
5105   presnum = MemNew(100 * sizeof(Int4));
5106   *nres = 0;
5107   if (!location) return NULL;
5108   switch (location->choice) {
5109     case SEQLOC_NULL:
5110     case SEQLOC_FEAT:
5111     case SEQLOC_BOND:
5112     case SEQLOC_EMPTY:
5113     case SEQLOC_WHOLE:
5114       break;
5115     case SEQLOC_INT:
5116       sintp = (SeqIntPtr) location->data.ptrvalue;
5117       for (i=sintp->from;i<=sintp->to;i++) {
5118         presnum[*nres] = i;
5119 	(*nres)++; if (*nres >= 100) presnum = Nlm_MemMore(presnum,((*nres)+1)*sizeof(Int4));
5120       }
5121       break;
5122     case SEQLOC_PNT:
5123       spp = location->data.ptrvalue;
5124       presnum[*nres] = spp->point;
5125 	(*nres)++; if (*nres >= 100) presnum = Nlm_MemMore(presnum,((*nres)+1)*sizeof(Int4));
5126       break;
5127     case SEQLOC_PACKED_PNT:
5128       pspp = location->data.ptrvalue;
5129       while (pspp) {
5130 	for (i=0;i<pspp->used;i++) {
5131 	  presnum[*nres] = pspp->pnts[i];
5132 	  (*nres)++; if (*nres >= 100) presnum = Nlm_MemMore(presnum,((*nres)+1)*sizeof(Int4));
5133 	}
5134         pspp = pspp->next;
5135       }
5136       break;
5137     case SEQLOC_PACKED_INT:
5138     case SEQLOC_MIX:
5139     case SEQLOC_EQUIV:
5140       slp = (SeqLocPtr)location->data.ptrvalue;
5141       while (slp) {
5142         pSubLoc = CddGetFeatLocList(slp, &nSubLoc);
5143         for (i=0;i<nSubLoc;i++) {
5144 	  presnum[*nres] = pSubLoc[i];
5145 	  (*nres)++; if (*nres >= 100) presnum = Nlm_MemMore(presnum,((*nres)+1)*sizeof(Int4));
5146 	}
5147         slp = slp->next;
5148       }
5149     default:
5150       break;
5151   }
5152   return(presnum);
5153 }
5154 
5155 /*---------------------------------------------------------------------------*/
5156 /* check for violations of an interval defined on the aligned master with    */
5157 /* respect to aligned residues. Returns -1 if everything is OK, returns      */
5158 /* the first location on the master (counting from 0) if a violation is      */
5159 /* detected, returns INT4_MAX if negative interval numbers are encountered   */
5160 /*---------------------------------------------------------------------------*/
CddSeqLocInExpAlign(SeqLocPtr location,CddExpAlignPtr eap)5161 static Int4 CddSeqLocInExpAlign(SeqLocPtr location, CddExpAlignPtr eap)
5162 {
5163   SeqIntPtr     sintp;
5164   SeqPntPtr     spp;
5165   PackSeqPntPtr pspp;
5166   SeqLocPtr     slp;
5167   Int4          i;
5168 
5169   if (!location) return(-1);
5170   switch (location->choice) {
5171     case SEQLOC_NULL:
5172     case SEQLOC_FEAT:
5173     case SEQLOC_BOND:
5174     case SEQLOC_EMPTY:
5175     case SEQLOC_WHOLE:
5176       break;
5177     case SEQLOC_INT:
5178       sintp = (SeqIntPtr) location->data.ptrvalue;
5179       if (sintp->from < 0 || sintp->to < 0) return(INT4_MAX);
5180       for (i=sintp->from;i<=sintp->to;i++) {
5181         if (eap->adata[i] < 0) return(i);
5182       }
5183       break;
5184     case SEQLOC_PNT:
5185       spp = location->data.ptrvalue;
5186       if (spp->point < 0) return(INT4_MAX);
5187       if (eap->adata[spp->point] < 0) return(spp->point);
5188       break;
5189     case SEQLOC_PACKED_PNT:
5190       pspp = location->data.ptrvalue;
5191       while (pspp) {
5192 	for (i=0;i<pspp->used;i++) {
5193           if (pspp->pnts[i] < 0) return (INT4_MAX);
5194           if (eap->adata[pspp->pnts[i]] < 0) return(pspp->pnts[i]);
5195 	}
5196         pspp = pspp->next;
5197       }
5198       break;
5199     case SEQLOC_PACKED_INT:
5200     case SEQLOC_MIX:
5201     case SEQLOC_EQUIV:
5202       slp = (SeqLocPtr)location->data.ptrvalue;
5203       while (slp) {
5204         i = CddSeqLocInExpAlign(slp,eap);
5205         if (i >= 0) return(i);
5206         slp = slp->next;
5207       }
5208     default:
5209       break;
5210   }
5211   return(-1);
5212 }
5213 
5214 
5215 /*---------------------------------------------------------------------------*/
5216 /*---------------------------------------------------------------------------*/
CddRelocateSeqLoc(SeqLocPtr location,SeqIdPtr sip,Int4 * ali)5217 static void CddRelocateSeqLoc(SeqLocPtr location, SeqIdPtr sip, Int4 *ali)
5218 {
5219   SeqIntPtr     sintp;
5220   SeqPntPtr     spp;
5221   PackSeqPntPtr pspp;
5222   SeqLocPtr     slp;
5223   Int4          i;
5224 
5225   if (!location) return;
5226   switch (location->choice) {
5227     case SEQLOC_NULL:
5228     case SEQLOC_FEAT:
5229     case SEQLOC_BOND:
5230       break;
5231     case SEQLOC_EMPTY:
5232     case SEQLOC_WHOLE:
5233       SeqIdFree(location->data.ptrvalue);
5234       location->data.ptrvalue = (SeqIdPtr) SeqIdDup(sip);
5235       break;
5236     case SEQLOC_INT:
5237       sintp = (SeqIntPtr) location->data.ptrvalue;
5238       SeqIdFree(sintp->id);
5239       sintp->id = SeqIdDup(sip);
5240       sintp->from = ali[sintp->from];
5241       sintp->to   = ali[sintp->to];
5242       if (sintp->from < 0 || sintp->to < 0) CddSevError("Invalid SeqLoc Interval!");
5243       break;
5244     case SEQLOC_PNT:
5245       spp = location->data.ptrvalue;
5246       SeqIdFree(spp->id);
5247       spp->id = (SeqIdPtr) SeqIdDup(sip);
5248       spp->point = ali[spp->point];
5249       if (spp->point < 0) CddSevError("Invalid SeqLoc Interval!");
5250       break;
5251     case SEQLOC_PACKED_PNT:
5252       pspp = location->data.ptrvalue;
5253       while (pspp) {
5254         SeqIdFree(pspp->id);
5255         pspp->id = (SeqIdPtr) SeqIdDup(sip);
5256 	for (i=0;i<pspp->used;i++) {
5257           pspp->pnts[i] = ali[pspp->pnts[i]];
5258           if (pspp->pnts[i] < 0) CddSevError("Invalid SeqLoc Interval!");
5259 	}
5260         pspp = pspp->next;
5261       }
5262       break;
5263     case SEQLOC_PACKED_INT:
5264     case SEQLOC_MIX:
5265     case SEQLOC_EQUIV:
5266       slp = (SeqLocPtr)location->data.ptrvalue;
5267       while (slp) {
5268         CddRelocateSeqLoc(slp,sip,ali);
5269         slp = slp->next;
5270       }
5271     default:
5272       break;
5273   }
5274 }
5275 
CddFindSeqIdInAlignAnnot(AlignAnnotPtr oldannot)5276 static SeqIdPtr CddFindSeqIdInAlignAnnot(AlignAnnotPtr oldannot)
5277 {
5278   SeqLocPtr location;
5279   location = oldannot->location;
5280   return SeqLocId(location);
5281 }
5282 
5283 
5284 /*---------------------------------------------------------------------------*/
5285 /*---------------------------------------------------------------------------*/
5286 /* transfer alignment annotations from the current master to another         */
5287 /* sequence (before reindexing the alignment to this new master)             */
5288 /*---------------------------------------------------------------------------*/
5289 /*---------------------------------------------------------------------------*/
CddTransferAlignAnnot(AlignAnnotPtr oldannot,SeqIdPtr newMaster,SeqAlignPtr salp,BioseqSetPtr bssp)5290 void LIBCALL CddTransferAlignAnnot(AlignAnnotPtr oldannot,
5291                                             SeqIdPtr newMaster,
5292 					    SeqAlignPtr salp,
5293 					    BioseqSetPtr bssp)
5294 {
5295   CddExpAlignPtr    pCDeaO= NULL, pCDeaN = NULL, pCDeaON = NULL;
5296   SeqLocPtr         slp;
5297   SeqIdPtr          oldMaster, sip, sipTmp;
5298   Boolean           bOldIsTop = FALSE;
5299   Boolean           bNewIsTop = FALSE;
5300   DenseDiagPtr      ddp;
5301   SeqAlignPtr       salpThis;
5302   BioseqPtr         bsp;
5303   SeqEntryPtr       sep;
5304   AlignAnnotPtr     aap;
5305 
5306   slp = (SeqLocPtr) oldannot->location;
5307   oldMaster = CddFindSeqIdInAlignAnnot(oldannot);
5308   if (oldMaster->choice == SEQID_GI) {
5309     sep = bssp->seq_set; while (sep) {
5310       if (sep->choice == 1) {
5311         bsp = sep->data.ptrvalue;
5312 	if (CddSameSip(oldMaster, bsp->id)) {
5313 	  sipTmp = bsp->id;
5314 	  while (sipTmp) {
5315             if (sipTmp->choice == SEQID_PDB) {
5316 	      oldMaster = sipTmp;
5317 	      break;
5318 	    }
5319 	    sipTmp = sipTmp->next;
5320 	  }
5321 	  break;
5322 	}
5323       }
5324       sep = sep->next;
5325     }
5326   }
5327   ddp = salp->segs;
5328   sip = SeqIdDup(ddp->id);  SeqIdFree(sip->next);
5329   if (CddSameSip(oldMaster,sip)) bOldIsTop = TRUE;
5330   if (CddSameSip(newMaster,sip)) bNewIsTop = TRUE;
5331   SeqIdFree(sip);
5332   salpThis = salp; while (salpThis) {
5333     ddp = salpThis->segs;
5334     if (CddSameSip(oldMaster,ddp->id->next)) {
5335       if (bNewIsTop) {
5336         pCDeaON = SeqAlignToCddExpAlign(salpThis,bssp->seq_set);
5337 	break;
5338       } else {
5339         pCDeaO = SeqAlignToCddExpAlign(salpThis,bssp->seq_set);
5340 	if (pCDeaN) break;
5341       }
5342     }
5343     if (CddSameSip(newMaster,ddp->id->next)) {
5344       if (bOldIsTop) {
5345         pCDeaON = SeqAlignToCddExpAlign(salpThis,bssp->seq_set);
5346         break;
5347       } else {
5348         pCDeaN = SeqAlignToCddExpAlign(salpThis,bssp->seq_set);
5349 	if (pCDeaO) break;
5350       }
5351     }
5352     salpThis = salpThis->next;
5353   }
5354   if (!pCDeaON) {                                     /* no direct relation */
5355     bsp = CddRetrieveBioseqById(oldMaster,bssp->seq_set);
5356     pCDeaON = CddReindexExpAlign(pCDeaO, bsp->length, pCDeaN, 0, 1);
5357     if (bNewIsTop) {
5358 
5359     }
5360   }
5361   if (bNewIsTop) {                             /* need to revert alignment  */
5362     pCDeaON = InvertCddExpAlign(pCDeaON,bssp->seq_set);
5363   }
5364   if (!pCDeaON) CddSevError("Error in CDDTransferAlignAnnot, can not locate alignment data!");
5365   aap = oldannot; while (aap) {
5366     slp = aap->location;
5367     while (slp) {
5368       CddRelocateSeqLoc(slp,newMaster,pCDeaON->adata);
5369       slp = slp->next;
5370     }
5371     aap = aap->next;
5372   }
5373 }
5374 
5375 
5376 /*---------------------------------------------------------------------------*/
5377 /*---------------------------------------------------------------------------*/
5378 /* reindex a SeqAlign (a linked list of pairwise Den-Diag Master-Slave Seq-  */
5379 /* aligns) to a new Master, pointed out by it's SeqId. Will pick the first   */
5380 /* instance of that SeqId in the alignment, and return a SeqAlign of the     */
5381 /* same format as the input was                                              */
5382 /*---------------------------------------------------------------------------*/
5383 /*---------------------------------------------------------------------------*/
CddReindexSeqAlign(SeqAlignPtr salp,SeqIdPtr sipMaster,BioseqSetPtr bssp)5384 SeqAlignPtr LIBCALL CddReindexSeqAlign(SeqAlignPtr salp, SeqIdPtr sipMaster,
5385                                        BioseqSetPtr bssp)
5386 {
5387   SeqAlignPtr    salpThis, salpNew, salpHead, salpTail;
5388   DenseDiagPtr   ddp;
5389   CddExpAlignPtr pCDea, pCDea1, pCDeaR;
5390   Int4           iNotThis = -1, i = -1, mlength;
5391   BioseqPtr      bsp;
5392 
5393   bsp = CddRetrieveBioseqById(sipMaster, bssp->seq_set);
5394   if (!bsp) return NULL;
5395   mlength = bsp->length;
5396   salpThis = salp; while(salpThis) {
5397     iNotThis++;
5398     ddp = salpThis->segs;
5399     if (CddSameSip(sipMaster,ddp->id->next)) break;
5400     else salpThis = salpThis->next;
5401   }
5402   pCDea1 = (CddExpAlignPtr) SeqAlignToCddExpAlign(salpThis,bssp->seq_set);
5403   pCDeaR = (CddExpAlignPtr) InvertCddExpAlign(pCDea1, bssp->seq_set);
5404   salpNew = (SeqAlignPtr) CddExpAlignToSeqAlign(pCDeaR,NULL);
5405   pCDeaR = CddExpAlignFree(pCDeaR);
5406   salpTail = salpNew;
5407   salpHead = salp;
5408   while (salpHead) {
5409     i++;
5410     if (i!= iNotThis) {
5411       pCDea = (CddExpAlignPtr) SeqAlignToCddExpAlign(salpHead, bssp->seq_set);
5412       pCDeaR = (CddExpAlignPtr) CddReindexExpAlign(pCDea1,mlength,pCDea,0,i);
5413       salpThis = (SeqAlignPtr) CddExpAlignToSeqAlign(pCDeaR,NULL);
5414       pCDeaR = CddExpAlignFree(pCDeaR);
5415       pCDea = CddExpAlignFree(pCDea);
5416       salpTail->next = salpThis;
5417       salpTail = salpThis;
5418     }
5419     salpHead = salpHead->next;
5420   }
5421 
5422   pCDea1 = CddExpAlignFree(pCDea1);
5423   return (salpNew);
5424 }
5425 /********************************************************************/
5426 /* Remove all description/annotation info from the bioseq,	    */
5427 /* except for tax and mmdb info.				    */
5428 /********************************************************************/
CddShrinkBioseq(BioseqPtr bsp)5429 void LIBCALL CddShrinkBioseq(BioseqPtr bsp)
5430 {
5431   SeqDescrPtr next, sdp, sdp2              = NULL;
5432   SeqAnnotPtr sap, sap_next, thissap, sap2 = NULL;
5433 
5434   sdp=bsp->descr;
5435   while (sdp != NULL) {
5436     next = sdp->next;
5437     sdp->next = NULL;
5438     if(sdp->choice == Seq_descr_title ||
5439       sdp->choice == Seq_descr_org ||
5440       sdp->choice == Seq_descr_pdb ||
5441       sdp->choice == Seq_descr_source) {
5442         ValNodeLink(&sdp2, sdp);
5443       } else SeqDescFree(sdp);
5444     sdp = next;
5445   }
5446   bsp->descr = sdp2;
5447 
5448   sap = bsp->annot;
5449   while (sap != NULL) {
5450     sap_next = sap->next;
5451     if ((sap->type == 4 && ((SeqIdPtr)sap->data)->choice == SEQID_GENERAL
5452          && Nlm_StrCmp(((DbtagPtr)((SeqIdPtr)sap->data)->data.ptrvalue)->db,"mmdb") == 0)
5453 	|| (sap->type == 1 && ((SeqFeatPtr)sap->data)->data.choice == SEQFEAT_PROT)) {
5454       if (!sap2) {
5455         sap2 = sap;
5456       } else {
5457         thissap = sap2; while (thissap->next) thissap = thissap->next;
5458         thissap->next = sap;
5459       }
5460       sap->next = NULL;
5461     } else SeqAnnotFree(sap);
5462     sap = sap_next;
5463   }
5464   bsp->annot = sap2;
5465 }
5466 
5467 /*---------------------------------------------------------------------------*/
5468 /*---------------------------------------------------------------------------*/
5469 /* Make a list of m-s seq-aligns proper, aligning only columns	             */
5470 /* found in all pairs.						             */
5471 /*---------------------------------------------------------------------------*/
5472 /*---------------------------------------------------------------------------*/
CddTrimSeqAligns(CddPtr pcdd)5473 Int2 LIBCALL CddTrimSeqAligns(CddPtr pcdd)
5474 {
5475     CddExpAlignPtr	ceap,tceap;
5476     SeqAlignPtr		salp,salp2,head,head2=NULL;
5477     SeqEntryPtr		sep;
5478     ValNodePtr		vnp,vhead=NULL;
5479     Int2		i;
5480     Int4Ptr             iBreakAfter;
5481     DenseDiagPtr        ddp;
5482 
5483     if(pcdd == NULL) return 1;
5484     sep = ((BioseqSetPtr)pcdd->sequences->data.ptrvalue)->seq_set;
5485     head = pcdd->seqannot->data;
5486     if(head == NULL /*|| head->next == NULL */) return 1;
5487 
5488     tceap = (CddExpAlignPtr)SeqAlignToCddExpAlign(head, sep);
5489     iBreakAfter = MemNew(tceap->length * sizeof(Int4));
5490 /*---------------------------------------------------------------------------*/
5491 /* record existing block structure to make sure adjacent blocks aren't merged*/
5492 /*---------------------------------------------------------------------------*/
5493     for (i=0;i<tceap->length;i++) iBreakAfter[i] = 0;
5494     ddp = head->segs;
5495     while (ddp) {
5496       iBreakAfter[ddp->starts[0]+ddp->len-1] = 1;
5497       ddp = ddp->next;
5498     }
5499     for(salp=head; salp; salp=salp->next) {
5500       ceap = (CddExpAlignPtr)SeqAlignToCddExpAlign(salp, sep);
5501       ValNodeAddPointer(&vhead, 0, ceap);
5502       if(SeqIdComp(tceap->ids,ceap->ids) != SIC_YES) {
5503 	for(vnp=vhead; vnp; vnp=vnp->next) {
5504 	  ceap = vnp->data.ptrvalue;
5505 	  ceap->ids = SeqIdSetFree(ceap->ids);
5506 	  CddExpAlignFree(ceap);
5507 	}
5508 	tceap->ids = SeqIdSetFree(tceap->ids);
5509 	CddExpAlignFree(tceap);
5510 	ErrPostEx(SEV_WARNING,0,0,"Not master/slave alignments");
5511 	return 1;
5512       }
5513       for(i=0; i<tceap->length; i++) {
5514 	if(ceap->adata[i] == -1) {
5515 	  tceap->adata[i] = -1;
5516 	  if (i > 0) {
5517 	    if (ceap->adata[i-1] != -1)  iBreakAfter[i-1] = 1;
5518 	    if (tceap->adata[i-1] != -1) iBreakAfter[i-1] = 1;
5519           }
5520 	} else {
5521 	  if (i > 0) {
5522 	    if (tceap->adata[i-1] != -1 && (tceap->adata[i]-tceap->adata[i-1])>1)
5523 	      iBreakAfter[i-1] = 1;
5524 	    if (ceap->adata[i-1] != -1 && (ceap->adata[i]-ceap->adata[i-1])>1)
5525 	      iBreakAfter[i-1] = 1;
5526 	  }
5527 	}
5528       }
5529     }
5530     for(vnp=vhead; vnp; vnp=vnp->next) {
5531       ceap = vnp->data.ptrvalue;
5532       for(i=0; i<tceap->length; i++)
5533 	if(tceap->adata[i] == -1) ceap->adata[i] = -1;
5534       if(head2) {
5535 	salp2->next = (SeqAlignPtr)CddExpAlignToSeqAlign(ceap,iBreakAfter);
5536 	salp2 = salp2->next;
5537       } else
5538 	head2 = salp2 = (SeqAlignPtr)CddExpAlignToSeqAlign(ceap,iBreakAfter);
5539       ceap->ids = SeqIdSetFree(ceap->ids);
5540       CddExpAlignFree(ceap);
5541     }
5542 
5543     tceap->ids = SeqIdSetFree(tceap->ids);
5544     MemFree(iBreakAfter);
5545     CddExpAlignFree(tceap);
5546     ValNodeFree(vhead);
5547     SeqAlignListFree(pcdd->seqannot->data);
5548     pcdd->seqannot->data = head2;
5549 
5550     return 0;
5551 }
5552 
5553 /********************************************************************/
5554 /* Make a list of m-s seq-aligns proper, aligning only columns	    */
5555 /* found in all pairs.	If !modify, don't change seq-aligns, only   */
5556 /* read them to populate iBreakAfter.				    */
5557 /********************************************************************/
CddGetProperBlocks(CddPtr pcdd,Boolean modify,Int4Ptr * iBreakAfter)5558 Int2 LIBCALL CddGetProperBlocks(CddPtr pcdd, Boolean modify,
5559 				Int4Ptr *iBreakAfter)
5560 {
5561     CddExpAlignPtr	ceap,tceap;
5562     SeqAlignPtr		salp,salp2,head,head2=NULL;
5563     SeqEntryPtr		sep;
5564     ValNodePtr		vnp,vhead=NULL;
5565     Int2		i;
5566     Int4Ptr             iBrkAft;
5567 
5568     if(pcdd == NULL) return 1;
5569     sep = ((BioseqSetPtr)pcdd->sequences->data.ptrvalue)->seq_set;
5570     head = pcdd->seqannot->data;
5571     if(head == NULL || head->next == NULL) return 1;
5572 
5573     tceap = (CddExpAlignPtr)SeqAlignToCddExpAlign(head, sep);
5574     iBrkAft = MemNew(tceap->length * sizeof(Int4));
5575     for (i=0;i<tceap->length;i++) iBrkAft[i] = 0;
5576     for(salp=head; salp; salp=salp->next) {
5577       ceap = (CddExpAlignPtr)SeqAlignToCddExpAlign(salp, sep);
5578       ValNodeAddPointer(&vhead, 0, ceap);
5579       if(SeqIdComp(tceap->ids,ceap->ids) != SIC_YES) {
5580 	for(vnp=vhead; vnp; vnp=vnp->next) {
5581 	  ceap = vnp->data.ptrvalue;
5582 	  ceap->ids = SeqIdSetFree(ceap->ids);
5583 	  CddExpAlignFree(ceap);
5584 	}
5585 	tceap->ids = SeqIdSetFree(tceap->ids);
5586 	CddExpAlignFree(tceap);
5587 	MemFree(iBrkAft);
5588 	ErrPostEx(SEV_WARNING,0,0,"Not master/slave alignments");
5589 	return 1;
5590       }
5591       for(i=0; i<tceap->length; i++) {
5592 	if(ceap->adata[i] == -1) {
5593 	  tceap->adata[i] = -1;
5594 	  if (i > 0) {
5595 	    if (ceap->adata[i-1] != -1)  iBrkAft[i-1] = 1;
5596 	    if (tceap->adata[i-1] != -1) iBrkAft[i-1] = 1;
5597           }
5598 	} else {
5599 	  if (i > 0) {
5600 	    if (tceap->adata[i-1] != -1 && (tceap->adata[i]-tceap->adata[i-1])>1)
5601 	      iBrkAft[i-1] = 1;
5602 	    if (ceap->adata[i-1] != -1 && (ceap->adata[i]-ceap->adata[i-1])>1)
5603 	      iBrkAft[i-1] = 1;
5604 	  }
5605 	}
5606       }
5607     }
5608     if(modify) {
5609 	for(vnp=vhead; vnp; vnp=vnp->next) {
5610 	    ceap = vnp->data.ptrvalue;
5611 	    for(i=0; i<tceap->length; i++)
5612 		if(tceap->adata[i] == -1) ceap->adata[i] = -1;
5613 	    if(head2) {
5614 		salp2->next = (SeqAlignPtr)CddExpAlignToSeqAlign(ceap,iBrkAft);
5615 		salp2 = salp2->next;
5616 	    } else
5617 		head2 = salp2 = (SeqAlignPtr)CddExpAlignToSeqAlign(ceap,iBrkAft);
5618 	    ceap->ids = SeqIdSetFree(ceap->ids);
5619 	    CddExpAlignFree(ceap);
5620 	}
5621 	SeqAlignListFree(pcdd->seqannot->data);
5622 	pcdd->seqannot->data = head2;
5623     }
5624 
5625     if(iBreakAfter) *iBreakAfter = iBrkAft;
5626     else            MemFree(iBrkAft);
5627     tceap->ids = SeqIdSetFree(tceap->ids);
5628     CddExpAlignFree(tceap);
5629     ValNodeFree(vhead);
5630 
5631     return 0;
5632 }
5633 
5634 /*---------------------------------------------------------------------------*/
5635 /* fix character string for export into XML                                  */
5636 /*---------------------------------------------------------------------------*/
CddFixForXML(CharPtr pc)5637 static CharPtr CddFixForXML(CharPtr pc)
5638 {
5639   CharPtr pcNew;
5640   Int4    i = 0, j = 0;
5641 
5642   pcNew = MemNew(sizeof(Char) * (Nlm_StrLen(pc) + 100));
5643 
5644   while (pc[i] != '\0') {
5645     if (pc[i] == '>') {
5646       pcNew[j] = '&';
5647       pcNew[j+1] = 'g';
5648       pcNew[j+2] = 't';
5649       pcNew[j+3] = ';';
5650       j += 4;
5651     } else if (pc[i] == '<') {
5652       pcNew[j] = '&';
5653       pcNew[j+1] = 'l';
5654       pcNew[j+2] = 't';
5655       pcNew[j+3] = ';';
5656       j += 4;
5657     } else if (pc[i] == '&') {
5658       pcNew[j] = '&';
5659       pcNew[j+1] = 'a';
5660       pcNew[j+2] = 'm';
5661       pcNew[j+3] = 'p';
5662       pcNew[j+4] = ';';
5663       j += 5;
5664     } else {
5665       pcNew[j] = pc[i];
5666       j++;
5667     }
5668     i++;
5669   }
5670   MemFree(pc);
5671   pcNew[j] = '\0';
5672   pcNew = Nlm_Realloc(pcNew, sizeof(Char) * Nlm_StrLen(pcNew));
5673   return(pcNew);
5674 }
5675 
5676 /*---------------------------------------------------------------------------*/
5677 /*---------------------------------------------------------------------------*/
5678 /* dump out pubmed links for a CD to be used in Entrez neighboring           */
5679 /*---------------------------------------------------------------------------*/
5680 /*---------------------------------------------------------------------------*/
CddDumpPMLinks(CddPtr pcdd,FILE * FP)5681 void LIBCALL CddDumpPMLinks(CddPtr pcdd, FILE *FP)
5682 {
5683   CddIdPtr     cid;
5684   Int4         gi = 0, pmid = 0;
5685   ValNodePtr   pub, pdescr;
5686 
5687   cid = (CddIdPtr) pcdd->id;
5688   while (cid) {
5689     switch (cid->choice) {
5690       case CddId_uid:
5691         gi = cid->data.intvalue;
5692         break;
5693       default:
5694         break;
5695     }
5696     cid = cid->next;
5697   }
5698   if (gi > 0) {
5699     pdescr = pcdd->description;
5700     while (pdescr) {
5701       switch (pdescr->choice) {
5702         case CddDescr_reference:
5703           pub = pdescr->data.ptrvalue;
5704 	  if (pub->choice == PUB_PMid) {
5705             pmid = pub->data.intvalue;
5706 	    if (pmid > 0) {
5707 	      fprintf(FP,"%d\t%d\t0\n",gi,pmid);
5708 	    }
5709 	  }
5710           break;
5711         default:
5712           break;
5713       }
5714       pdescr = pdescr->next;
5715     }
5716   }
5717 }
5718 
5719 /*---------------------------------------------------------------------------*/
5720 /*---------------------------------------------------------------------------*/
5721 /* dump out taxonomy links for a CD to be used in Entrez neighboring         */
5722 /*---------------------------------------------------------------------------*/
5723 /*---------------------------------------------------------------------------*/
CddDumpTaxLinks(CddPtr pcdd,FILE * FP)5724 void LIBCALL CddDumpTaxLinks(CddPtr pcdd, FILE *FP)
5725 {
5726   CddIdPtr     cid;
5727   Int4         txid = 0, gi = 0;
5728   DbtagPtr     dbtp;
5729   ObjectIdPtr  oidp;
5730   ValNodePtr   pdescr, vnp;
5731   OrgRefPtr    pOrgRef;
5732 
5733   cid = (CddIdPtr) pcdd->id;
5734   while (cid) {
5735     switch (cid->choice) {
5736       case CddId_uid:
5737         gi = cid->data.intvalue;
5738         break;
5739       default:
5740         break;
5741     }
5742     cid = cid->next;
5743   }
5744   if (gi > 0) {
5745     pdescr = pcdd->description;
5746     while (pdescr) {
5747       switch (pdescr->choice) {
5748         case CddDescr_tax_source:
5749           pOrgRef = (OrgRefPtr) pdescr->data.ptrvalue;
5750           vnp = pOrgRef->db;
5751 	  while (vnp) {
5752 	    dbtp = (DbtagPtr) vnp->data.ptrvalue;
5753 	    if (dbtp && Nlm_StrCmp(dbtp->db,"taxon") == 0) {
5754 	      oidp = dbtp->tag;
5755 	      if (oidp && oidp->id >= 0) {
5756                 fprintf(FP,"%d\t%d\t0\n",gi,oidp->id);
5757 	      }
5758 	    }
5759 	    vnp = vnp->next;
5760 	  }
5761           break;
5762         default:
5763           break;
5764       }
5765       pdescr = pdescr->next;
5766     }
5767   }
5768 }
5769 
5770 /*---------------------------------------------------------------------------*/
5771 /*---------------------------------------------------------------------------*/
5772 /* dump out contents of a CD used for Entrez-indexing in pseudo-XML          */
5773 /*---------------------------------------------------------------------------*/
5774 /*---------------------------------------------------------------------------*/
CddDumpXML(CddPtr pcdd,FILE * FP)5775 void LIBCALL CddDumpXML(CddPtr pcdd, FILE *FP)
5776 {
5777   ValNodePtr    pdescr, cid;
5778   CharPtr       cShortName  = NULL, cAbstract = NULL, cPubDate  = NULL;
5779   CharPtr       cEntrezDate = NULL, cFilter   = NULL, cOrganism = NULL;
5780   CharPtr       cAccession  = NULL, cDatabase = NULL;
5781   Int4          gi = 0, txid = -1;
5782   Boolean       bHaveAbstract = FALSE, bAllocDatabase = FALSE;
5783   GlobalIdPtr   pGid;
5784   OrgRefPtr     pOrgRef;
5785   DatePtr       pDate;
5786   ObjectIdPtr   oidp;
5787   ValNodePtr    vnp;
5788   DbtagPtr      dbtp;
5789 
5790   cEntrezDate = MemNew(11 * sizeof(Char));
5791   cid = (CddIdPtr) pcdd->id;
5792   while (cid) {
5793     switch (cid->choice) {
5794       case CddId_uid:
5795         gi = cid->data.intvalue;
5796         break;
5797       case CddId_gid:
5798         pGid = (GlobalIdPtr) cid->data.ptrvalue;
5799 	cAccession = pGid->accession;
5800         break;
5801       default:
5802         break;
5803     }
5804     cid = cid->next;
5805   }
5806   pdescr = pcdd->description;
5807   while (pdescr) {
5808     switch (pdescr->choice) {
5809       case CddDescr_comment:
5810         if (!bHaveAbstract) {
5811 	  cAbstract = (CharPtr) pdescr->data.ptrvalue;
5812 	  bHaveAbstract = TRUE;
5813 	}
5814         break;
5815       case CddDescr_source:
5816         cDatabase = (CharPtr) pdescr->data.ptrvalue;
5817       case CddDescr_tax_source:
5818         pOrgRef = (OrgRefPtr) pdescr->data.ptrvalue;
5819 	cOrganism = pOrgRef->taxname;
5820 	vnp = pOrgRef->db;
5821 	while (vnp) {
5822 	  dbtp = (DbtagPtr) vnp->data.ptrvalue;
5823 	  if (dbtp && Nlm_StrCmp(dbtp->db,"taxon") == 0) {
5824 	    oidp = dbtp->tag;
5825 	    if (oidp && oidp->id >= 0) {
5826               txid = oidp->id;
5827 	    }
5828 	  }
5829 	  vnp = vnp->next;
5830 	}
5831         break;
5832       case CddDescr_create_date:
5833       case CddDescr_update_date:
5834         cPubDate = MemNew(11 * sizeof(Char));
5835         pDate = (DatePtr) pdescr->data.ptrvalue;
5836 	if (pDate->data[2] > 0 && pDate->data[3] > 0)
5837 	  sprintf(cPubDate,"%d/%d/%d",pDate->data[1]+1900,pDate->data[2],pDate->data[3]);
5838         if (pDate->data[2] > 0 && pDate->data[3] == 0)
5839           sprintf(cPubDate,"%d/%d",pDate->data[1]+1900,pDate->data[2]);
5840         if (pDate->data[2] == 0 && pDate->data[3] == 0)
5841           sprintf(cPubDate,"%d",pDate->data[1]+1900);
5842         break;
5843       default:
5844         break;
5845     }
5846     pdescr = pdescr->next;
5847   }
5848   cShortName = (CharPtr) pcdd->name;
5849   if (cAccession) {
5850     if (Nlm_StrNCmp(cAccession,"cd",2) == 0) {
5851       cDatabase = StringSave("Cdd");
5852       bAllocDatabase = TRUE;
5853     }
5854   }
5855   if (!cPubDate) {
5856     cPubDate = MemNew(11 * sizeof(Char));
5857     pDate = (DatePtr) DateCurr();
5858     if (pDate->data[2] > 0 && pDate->data[3] > 0)
5859       sprintf(cPubDate,"%d/%d/%d",pDate->data[1]+1900,pDate->data[2],pDate->data[3]);
5860     if (pDate->data[2] > 0 && pDate->data[3] == 0)
5861       sprintf(cPubDate,"%d/%d",pDate->data[1]+1900,pDate->data[2]);
5862     if (pDate->data[2] == 0 && pDate->data[3] == 0)
5863       sprintf(cPubDate,"%d",pDate->data[1]+1900);
5864   }
5865   pDate = (DatePtr) DateCurr();
5866   if (pDate->data[2] > 0 && pDate->data[3] > 0)
5867     sprintf(cEntrezDate,"%d/%d/%d",pDate->data[1]+1900,pDate->data[2],pDate->data[3]);
5868   if (pDate->data[2] > 0 && pDate->data[3] == 0)
5869     sprintf(cEntrezDate,"%d/%d",pDate->data[1]+1900,pDate->data[2]);
5870   if (pDate->data[2] == 0 && pDate->data[3] == 0)
5871     sprintf(cEntrezDate,"%d",pDate->data[1]+1900);
5872 
5873   cAbstract = CddFixForXML(cAbstract);
5874   cShortName = CddFixForXML(cShortName);
5875 
5876   fprintf(FP,"    <IdxDocument>\n");
5877   if (gi > 0)
5878     fprintf(FP,"        <IdxUid>%d</IdxUid>\n",gi);
5879   fprintf(FP,"        <IdxKeyFields>\n");
5880   if (cPubDate)
5881     fprintf(FP,"            <PubDate>%s</PubDate>\n",cPubDate);
5882   if (cEntrezDate)
5883     fprintf(FP,"            <EntrezDate>%s</EntrezDate>\n",cEntrezDate);
5884   fprintf(FP,"        </IdxKeyFields>\n");
5885   fprintf(FP,"        <IdxDisplayFields>\n");
5886   if (cAccession)
5887     fprintf(FP,"            <Accession>%s</Accession>\n",cAccession);
5888   if (cShortName)
5889     fprintf(FP,"            <Title>%s</Title>\n",cShortName);
5890   if (cAbstract)
5891     fprintf(FP,"            <Abstract>%s</Abstract>\n",cAbstract);
5892   fprintf(FP,"        </IdxDisplayFields>\n");
5893   fprintf(FP,"        <IdxSearchFields>\n");
5894   if (gi > 0)
5895     fprintf(FP,"            <Uid>%d</Uid>\n",gi);
5896   if (cFilter)
5897     fprintf(FP,"            <Filter>%s</Filter>\n",cFilter);
5898   if (cAccession)
5899     fprintf(FP,"            <Accession>%s</Accession>\n",cAccession);
5900   if (cDatabase)
5901     fprintf(FP,"            <Database>%s</Database>\n",cDatabase);
5902   if (cShortName)
5903     fprintf(FP,"            <Title>%s</Title>\n",cShortName);
5904   if (cAbstract)
5905     fprintf(FP,"            <Text>%s</Text>\n",cAbstract);
5906   if (txid >= 0)
5907     fprintf(FP,"            <Organism>txid%d</Organism>\n",txid);
5908   if (cOrganism)
5909     fprintf(FP,"            <Organism>%s</Organism>\n",cOrganism);
5910   if (cPubDate)
5911     fprintf(FP,"            <PubDate>%s</PubDate>\n",cPubDate);
5912   if (cEntrezDate)
5913     fprintf(FP,"            <EntrezDate>%s</EntrezDate>\n",cEntrezDate);
5914   fprintf(FP,"        </IdxSearchFields>\n");
5915   fprintf(FP,"    </IdxDocument>\n");
5916 
5917   if (bAllocDatabase) MemFree(cDatabase);
5918   MemFree(cPubDate);
5919   MemFree(cEntrezDate);
5920 }
5921 
5922 /*---------------------------------------------------------------------------*/
5923 /*---------------------------------------------------------------------------*/
5924 /* create and link a CddIdxData structure                                    */
5925 /*---------------------------------------------------------------------------*/
5926 /*---------------------------------------------------------------------------*/
CddIdxDataNew(CharPtr acc,Int4 uid)5927 CddIdxDataPtr LIBCALL CddIdxDataNew(CharPtr acc, Int4 uid)
5928 {
5929 
5930   CddIdxDataPtr     cidp;
5931 
5932   cidp = MemNew(sizeof(CddIdxData));
5933 
5934   if (acc) {
5935     cidp->cCDDid  = acc;
5936     cidp->iPssmId = uid;
5937   }
5938   cidp->next = NULL;
5939   return(cidp);
5940 }
5941 
CddIdxDataLink(CddIdxDataPtr PNTR head,CddIdxDataPtr cidp)5942 CddIdxDataPtr LIBCALL CddIdxDataLink(CddIdxDataPtr PNTR head, CddIdxDataPtr cidp)
5943 {
5944   CddIdxDataPtr  cidpThis;
5945 
5946   cidpThis = *head;
5947   if (cidpThis) {
5948     while (cidpThis->next != NULL) cidpThis = cidpThis->next;
5949     cidpThis->next = cidp;
5950   } else {
5951     *head = cidp;
5952   }
5953   return *head;
5954 }
5955 
5956 
5957 /*---------------------------------------------------------------------------*/
5958 /*---------------------------------------------------------------------------*/
5959 /* read in the cdd.idx file to return data associating accessions with uids  */
5960 /*---------------------------------------------------------------------------*/
5961 /*---------------------------------------------------------------------------*/
CddReadIdx(CharPtr CDDidx)5962 CddIdxDataPtr LIBCALL CddReadIdx(CharPtr CDDidx)
5963 {
5964   CddIdxDataPtr cidp, cidpHead = NULL;
5965   Char          pcBuf[2048];
5966   CharPtr       pcTest;
5967   CharPtr       acc;
5968   Int4          uid;
5969   FILE         *fp;
5970 
5971   fp = FileOpen(CDDidx,"r");
5972   if (!fp) CddSevError("could not open cdd.idx!");
5973   do {
5974     pcBuf[0]='\0';
5975     pcTest = fgets(pcBuf, (size_t)2048, fp);
5976     if (pcTest) if (pcTest[0] != ' ') {
5977       uid = (Int4) atoi(Nlm_StrTok(pcTest," "));
5978       acc = (CharPtr) StringSave(Nlm_StrTok(NULL," "));
5979       if (acc) {
5980         acc[Nlm_StrLen(acc) - 1] = '\0';
5981         cidp = CddIdxDataNew(acc,uid);
5982         CddIdxDataLink(&(cidpHead),cidp);
5983       }
5984     }
5985   } while (pcTest);
5986   FileClose(fp);
5987   return(cidpHead);
5988 }
5989 
5990 
5991 /*---------------------------------------------------------------------------*/
5992 /*---------------------------------------------------------------------------*/
5993 /* retrieve a CDD accession via the PSSMid or vice versa                     */
5994 /*---------------------------------------------------------------------------*/
5995 /*---------------------------------------------------------------------------*/
CddAccFromPssmId(Int4 iPssmId,CharPtr cAcc,CharPtr CDDidx)5996 void LIBCALL CddAccFromPssmId(Int4 iPssmId, CharPtr cAcc, CharPtr CDDidx)
5997 {
5998   CddIdxDataPtr cidp;
5999 
6000   cidp = CddReadIdx(CDDidx);
6001   if (!cidp) CddSevError("cdd.idx read failed!");
6002   while (cidp) {
6003     if (iPssmId == cidp->iPssmId) {
6004       Nlm_StrCpy(cAcc, cidp->cCDDid);
6005       break;
6006     }
6007     cidp = cidp->next;
6008   }
6009 }
6010 
CddPssmIdFromAcc(Int4 * iPssmId,CharPtr cAcc,CharPtr CDDidx)6011 void LIBCALL CddPssmIdFromAcc(Int4 *iPssmId, CharPtr cAcc, CharPtr CDDidx)
6012 {
6013   CddIdxDataPtr cidp;
6014 
6015   cidp = CddReadIdx(CDDidx);
6016   if (!cidp) CddSevError("cdd.idx read failed!");
6017   while (cidp) {
6018     if (Nlm_StrCmp(cAcc, cidp->cCDDid) == 0) {
6019       *iPssmId = cidp->iPssmId;
6020       break;
6021     }
6022     cidp = cidp->next;
6023   }
6024 }
6025 
6026 /*---------------------------------------------------------------------------*/
6027 /*---------------------------------------------------------------------------*/
6028 /* truncate a string right at the position of the first puncutation mark     */
6029 /*---------------------------------------------------------------------------*/
6030 /*---------------------------------------------------------------------------*/
CddTruncStringAtFirstPunct(CharPtr pChar)6031 void LIBCALL CddTruncStringAtFirstPunct(CharPtr pChar)
6032 {
6033   Int4   i;
6034 
6035   for (i=0;i<Nlm_StrLen(pChar);i++) {
6036     if (pChar[i] == ',' ||
6037 	pChar[i] == ';' ||
6038 	pChar[i] == '.') {
6039       pChar[i] = '\0'; break;
6040     }
6041   }
6042 }
6043 
CddFillBlanksInString(CharPtr pChar)6044 void LIBCALL CddFillBlanksInString(CharPtr pChar)
6045 {
6046   Int4   i;
6047 
6048   for (i=0;i<Nlm_StrLen(pChar);i++) {
6049     if (pChar[i] == ' ') pChar[i] = '_';
6050   }
6051 }
6052 
6053 /*---------------------------------------------------------------------------*/
6054 /*---------------------------------------------------------------------------*/
6055 /* Remove a Consensus from a CD                                              */
6056 /*---------------------------------------------------------------------------*/
6057 /*---------------------------------------------------------------------------*/
CddRemoveConsensus(CddPtr pcdd)6058 Boolean LIBCALL CddRemoveConsensus(CddPtr pcdd)
6059 {
6060   SeqIdPtr     sip;
6061   BioseqPtr    bsp;
6062   SeqAlignPtr  salp, salpnew;
6063   DenseDiagPtr ddp;
6064   BioseqSetPtr bssp;
6065   SeqEntryPtr  sepThis, sepHead = NULL, sepTail = NULL;
6066 
6067   if (!CddHasConsensus(pcdd)) return FALSE;
6068   salp = (SeqAlignPtr)  pcdd->seqannot->data;
6069   ddp  = (DenseDiagPtr) salp->segs;
6070   sip  = (SeqIdPtr)     ddp->id->next;
6071   bssp = (BioseqSetPtr) pcdd->sequences->data.ptrvalue;
6072   if (pcdd->alignannot) {
6073     CddTransferAlignAnnot(pcdd->alignannot, sip, salp, bssp);
6074   }
6075   salpnew = CddReindexSeqAlign(salp,sip,bssp);
6076   pcdd->seqannot->data = salpnew->next;
6077   sepThis = bssp->seq_set;
6078   while (sepThis) {
6079     bsp = (BioseqPtr) sepThis->data.ptrvalue;
6080     if (!SipIsConsensus(bsp->id)) {
6081       if (!sepHead) sepHead = sepThis;
6082       if (sepTail) sepTail->next = sepThis;
6083       sepTail = sepThis;
6084     }
6085     sepThis = sepThis->next;
6086   }
6087   sepTail->next = NULL;
6088   bssp->seq_set = sepHead;
6089   pcdd->sequences->data.ptrvalue = bssp;
6090   pcdd->profile_range = NULL;
6091   pcdd->trunc_master = NULL;
6092   pcdd->posfreq = NULL;
6093   pcdd->scoremat = NULL;
6094   pcdd->distance = NULL;
6095   return TRUE;
6096 }
6097 
6098 /*---------------------------------------------------------------------------*/
6099 /*---------------------------------------------------------------------------*/
6100 /* Check if a CD has annotation that is inconsistent with the alignment      */
6101 /*---------------------------------------------------------------------------*/
6102 /*---------------------------------------------------------------------------*/
CddFeaturesAreConsistent(CddPtr pcdd,CharPtr errmsg)6103 Boolean LIBCALL CddFeaturesAreConsistent(CddPtr pcdd, CharPtr errmsg)
6104 {
6105   SeqAlignPtr    salp;
6106   CddExpAlignPtr eap;
6107   BioseqSetPtr   bssp;
6108   AlignAnnotPtr  aap;
6109   SeqLocPtr      slp;
6110   Int4           iWhere;
6111 
6112   if (!pcdd->alignannot) return TRUE;
6113   bssp = (BioseqSetPtr) pcdd->sequences->data.ptrvalue;
6114   salp = (SeqAlignPtr) pcdd->seqannot->data;
6115   if (!salp) return TRUE;
6116   eap = SeqAlignToCddExpAlign(salp,bssp->seq_set);
6117   aap = pcdd->alignannot;
6118   while (aap) {
6119     slp = aap->location;
6120     iWhere = CddSeqLocInExpAlign(slp,eap);
6121     if (iWhere >= 0) {
6122       if (iWhere == INT4_MAX) {
6123         sprintf(errmsg,"Feature: %s, illegal feature address",aap->description);
6124       } else {
6125         sprintf(errmsg,"Feature: %s, Location: %d",aap->description,iWhere+1);
6126       }
6127       return FALSE;
6128     }
6129     aap = aap->next;
6130   }
6131   CddExpAlignFree(eap);
6132   return TRUE;
6133 
6134 }
6135 
6136 /*---------------------------------------------------------------------------*/
6137 /*---------------------------------------------------------------------------*/
6138 /* find a MMDB-Identifier in a Bioseq                                        */
6139 /*---------------------------------------------------------------------------*/
6140 /*---------------------------------------------------------------------------*/
CddFindMMDBIdInBioseq(BioseqPtr bsp,Int4 * iMMDBid)6141 SeqAnnotPtr LIBCALL CddFindMMDBIdInBioseq(BioseqPtr bsp, Int4 *iMMDBid)
6142 {
6143   SeqAnnotPtr annot, prevannot = NULL;
6144   SeqIdPtr    sip;
6145   DbtagPtr    dbtp;
6146   ObjectIdPtr oidp;
6147 
6148   *iMMDBid = 0;
6149   annot = bsp->annot;
6150   while(annot) {
6151     if (annot->type == 4) {
6152       sip = annot->data;
6153       while (sip) {
6154         if (sip->choice == SEQID_GENERAL) {
6155           dbtp = sip->data.ptrvalue;
6156 	  if (Nlm_StrCmp(dbtp->db,"mmdb") == 0) {
6157             oidp = dbtp->tag;
6158 	    *iMMDBid = oidp->id;
6159 	    return(prevannot);
6160 	  }
6161         }
6162         sip = sip->next;
6163       }
6164     }
6165     if (annot->next) prevannot = annot;
6166     annot = annot->next;
6167   }
6168   return (NULL);
6169 }
6170 
6171 
6172 /*---------------------------------------------------------------------------*/
6173 /*---------------------------------------------------------------------------*/
6174 /* Check if a CD has 3D superposition information for all aligned chains     */
6175 /*---------------------------------------------------------------------------*/
6176 /*---------------------------------------------------------------------------*/
CddHas3DSuperpos(CddPtr pcdd)6177 Boolean LIBCALL CddHas3DSuperpos(CddPtr pcdd)
6178 {
6179   BioseqSetPtr          bssp;
6180   BiostrucAnnotSetPtr   basp        = NULL;
6181   SeqAlignPtr           salp;
6182   DenseDiagPtr          ddp;
6183   SeqIdPtr              sip1, sip2;
6184   Int4Ptr               pMMDBid;
6185   Int4                  i, nStruct  = 0;
6186   BioseqPtr             bsp;
6187   BiostrucFeatureSetPtr bfsp;
6188   BiostrucFeaturePtr    bfp;
6189   ValNodePtr            location, vnp;
6190   ChemGraphAlignmentPtr cgap;
6191   Int4                  thisslave;
6192 
6193   pMMDBid = MemNew(250 * sizeof(Int4));
6194   basp = pcdd->features;
6195   if (!basp) return FALSE;
6196   bssp = (BioseqSetPtr) pcdd->sequences->data.ptrvalue;
6197 /*---------------------------------------------------------------------------*/
6198 /* create a list of MMDB-Id pairs which need to be checked                   */
6199 /*---------------------------------------------------------------------------*/
6200   salp = pcdd->seqannot->data;
6201   while (salp) {
6202     ddp = salp->segs;
6203     sip1 = ddp->id;
6204 	  sip2 = ddp->id->next;
6205     if (sip1->choice == SEQID_PDB) {
6206 		  if (nStruct == 0) {
6207         bsp = CddFindSip(sip1,bssp->seq_set);
6208         CddFindMMDBIdInBioseq(bsp,&pMMDBid[nStruct]);
6209         nStruct++;
6210       }
6211     }
6212     if (sip2->choice == SEQID_PDB) {
6213       bsp = CddFindSip(sip2,bssp->seq_set);
6214       CddFindMMDBIdInBioseq(bsp,&pMMDBid[nStruct]);
6215       nStruct++;
6216     }
6217     salp = salp->next;
6218   }
6219 /*---------------------------------------------------------------------------*/
6220 /* now walk down the biostruc annot set and remove all pairs encountered     */
6221 /*---------------------------------------------------------------------------*/
6222   bfsp = basp->features;
6223   while (bfsp) {
6224     bfp = bfsp->features;
6225     while (bfp) {
6226       location = bfp->Location_location;
6227       if (location->choice == Location_location_alignment) {
6228         cgap = location->data.ptrvalue;
6229 	vnp = cgap->biostruc_ids;
6230         thisslave = vnp->next->data.intvalue;
6231         for (i=1;i<nStruct;i++) {
6232 	  if (thisslave == pMMDBid[i]) {
6233             pMMDBid[i] = 0; break;
6234 	  }
6235 	}
6236       }
6237       bfp = bfp->next;
6238     }
6239     bfsp = bfsp->next;
6240   }
6241 
6242 /*---------------------------------------------------------------------------*/
6243 /* at this point check if any pair is left (i.e. any mmdb-id except master)  */
6244 /*---------------------------------------------------------------------------*/
6245   for (i=1;i<nStruct;i++) {
6246     if (pMMDBid[i]) {
6247       MemFree(pMMDBid); return(FALSE);
6248     }
6249   }
6250   MemFree(pMMDBid); return TRUE;
6251 }
6252 
6253 /*---------------------------------------------------------------------------*/
6254 /*---------------------------------------------------------------------------*/
6255 /* CD has unfinished alignment business                                      */
6256 /*---------------------------------------------------------------------------*/
6257 /*---------------------------------------------------------------------------*/
CddHasPendingAlignments(CddPtr pcdd)6258 Boolean    LIBCALL CddHasPendingAlignments(CddPtr pcdd)
6259 {
6260   if (pcdd->pending) return TRUE;
6261   return FALSE;
6262 }
6263 
6264 /*---------------------------------------------------------------------------*/
6265 /*---------------------------------------------------------------------------*/
6266 /* check
6267 /*---------------------------------------------------------------------------*/
6268 /*---------------------------------------------------------------------------*/
SeqHasTax(BioseqPtr bsp)6269 Boolean LIBCALL SeqHasTax(BioseqPtr bsp)
6270 {
6271   ValNodePtr  descr;
6272 
6273   descr = bsp->descr;
6274   while (descr) {
6275     if (descr->choice == Seq_descr_source) {
6276       return(TRUE);
6277     }
6278     descr = descr->next;
6279   }
6280   return(FALSE);
6281 }
6282 
6283 /*---------------------------------------------------------------------------*/
6284 /*---------------------------------------------------------------------------*/
6285 /* Fetch bioseq using BLAST dB, removing ids redundant to query.             */
6286 /* index is the NR index in the dB -- if you don't have it, use -1.          */
6287 /*---------------------------------------------------------------------------*/
6288 /*---------------------------------------------------------------------------*/
6289 NLM_EXTERN BlastDefLineSetPtr LIBCALL BlastDefLineSetFree PROTO ((BlastDefLineSetPtr ));
6290 
CddReadDBGetBioseq(SeqIdPtr query,Int4 index,ReadDBFILEPtr rdfp)6291 BioseqPtr LIBCALL CddReadDBGetBioseq(SeqIdPtr query, Int4 index,
6292                                      ReadDBFILEPtr rdfp)
6293 {
6294   return CddReadDBGetBioseqEx(query,index,rdfp,TRUE);
6295 }
6296 
CddReadDBGetBioseqEx(SeqIdPtr query,Int4 index,ReadDBFILEPtr rdfp,Boolean bUseObjMgr)6297 BioseqPtr LIBCALL CddReadDBGetBioseqEx(SeqIdPtr query, Int4 index,
6298 				       ReadDBFILEPtr rdfp, Boolean bUseObjMgr)
6299 {
6300     BioseqPtr		bsp;
6301     BlastDefLinePtr	bdp,tbdp;
6302     RDBTaxNamesPtr	tnames;
6303     Char		buf[64];
6304 
6305     if(query == NULL) return NULL;
6306     if(index < 0 && (index = SeqId2OrdinalId(rdfp, query)) < 0)
6307 	return NULL;
6308     if(!(bsp = readdb_get_bioseq_ex(rdfp, index, bUseObjMgr, FALSE))) return NULL;
6309     if(!(tbdp = FDGetDeflineAsnFromBioseq(bsp))) return NULL;
6310 
6311     for(bdp=tbdp; bdp; bdp=bdp->next) {
6312 	if(SeqIdComp(bdp->seqid,query) == SIC_YES ||
6313 	   SeqIdComp(bdp->seqid->next,query) == SIC_YES) {
6314 	    tnames = RDBGetTaxNames(rdfp->taxinfo, bdp->taxid);
6315 	    break;
6316 	}
6317     }
6318     if(bdp == NULL) {
6319 	SeqIdPrint(query,buf,PRINTID_FASTA_SHORT);
6320 	ErrPostEx(SEV_WARNING,0,0,"Bad seqid %s",buf);
6321 	return NULL;
6322     }
6323     bsp->id = SeqIdSetFree(bsp->id);
6324     bsp->id = SeqIdDup(bdp->seqid->next);
6325     bsp->id->next = SeqIdDup(bdp->seqid);
6326 
6327     /* Remove concatenated deflines. */
6328     SeqDescrFree(ValNodeExtractList(&bsp->descr, Seq_descr_title));
6329     /* Replace with single defline. */
6330     ValNodeCopyStr(&bsp->descr, Seq_descr_title, bdp->title);
6331 
6332     tbdp = BlastDefLineSetFree(tbdp);
6333 
6334     /* Remove all user data (tax info, asn1 defline). */
6335     SeqDescrFree(ValNodeExtractList(&bsp->descr, Seq_descr_user));
6336 
6337     if(tnames) {		/* Add taxid, taxnames. */
6338 	BioSourcePtr	bsrp;
6339 	DbtagPtr	dtp;
6340 
6341 	bsrp = BioSourceNew();
6342 	bsrp->subtype = NULL;
6343 	bsrp->org = OrgRefNew();
6344 	if(tnames->sci_name)
6345 	    bsrp->org->taxname = StringSave(tnames->sci_name);
6346 	if(tnames->common_name)
6347 	    bsrp->org->common = StringSave(tnames->common_name);
6348 	dtp = DbtagNew();
6349 	dtp->db = StringSave("taxon");
6350 	dtp->tag = ObjectIdNew();
6351 	dtp->tag->id = tnames->tax_id;
6352 	bsrp->org->db = NULL;
6353 	ValNodeAddPointer(&bsrp->org->db, 0, dtp);
6354 	ValNodeAddPointer(&bsp->descr, Seq_descr_source, bsrp);
6355     }
6356     RDBTaxNamesFree(tnames);
6357 
6358 /*    if (!SeqHasTax(bsp)) {
6359       printf("Warning: Bioseq without taxonomy!");
6360       if (query->choice == SEQID_GI) {
6361         printf("GI: %d\n",query->data.intvalue);
6362       } else printf("\n");
6363     } */
6364     return bsp;
6365 }
6366 
6367 /*---------------------------------------------------------------------------*/
6368 /*---------------------------------------------------------------------------*/
6369 /* return a common style dictionary for Cn3D 4.0                             */
6370 /*---------------------------------------------------------------------------*/
6371 /*---------------------------------------------------------------------------*/
6372 /*
6373 Cn3dStyleDictionaryPtr LIBCALL CddSrvGetStyle2(Int4 *styles[], Int4 nstyles)
6374 {
6375     Cn3dStyleDictionaryPtr    csdp;
6376     Cn3dStyleTableItemPtr     cstip,cstipTail,cstipHead=NULL;
6377     Int4			    i;
6378 
6379     if(nstyles < 1) return(NULL);
6380     csdp = Cn3dStyleDictionaryNew();
6381     csdp->global_style = CddSrvGetStyle2_Ex(styles[0]);
6382     for(i=1; i<nstyles; i++) {
6383 	cstip = Cn3dStyleTableItemNew();
6384 	cstip->id = i;
6385 	cstip->style = CddSrvGetStyle2_Ex(styles[i]);
6386 	cstip->next = NULL;
6387 	if(!cstipHead) cstipHead = cstipTail = cstip;
6388 	else {
6389 	    cstipTail->next = cstip;
6390 	    cstipTail = cstip;
6391 	}
6392     }
6393     csdp->style_table = cstipHead;
6394 
6395     return(csdp);
6396 }
6397 static Cn3dStyleSettingsPtr CddSrvGetStyle2_Ex(Int4 style[])
6398 {
6399   Cn3dStyleSettingsPtr      cssp;
6400   Cn3dBackboneStylePtr      cbsp;
6401   Cn3dGeneralStylePtr       cgsp;
6402   Cn3dBackboneLabelStylePtr cblsp;
6403 
6404     cssp = Cn3dStyleSettingsNew();
6405       cssp->next = NULL;
6406       cssp->name = NULL;
6407       cbsp = Cn3dBackboneStyleNew();
6408         cbsp->type = style[0];
6409         cbsp->style = style[1];
6410         cbsp->color_scheme = style[2];
6411         cbsp->user_color =
6412             MyCn3dColorInit(style[3],style[4],style[5],
6413                             style[6],style[7]);
6414       cssp->protein_backbone = cbsp;
6415       cbsp = Cn3dBackboneStyleNew();
6416         cbsp->type = style[8];
6417         cbsp->style = style[9];
6418         cbsp->color_scheme = style[10];
6419         cbsp->user_color =
6420             MyCn3dColorInit(style[11],style[12],style[13],
6421                             style[14],style[15]);
6422       cssp->nucleotide_backbone = cbsp;
6423         cgsp = Cn3dGeneralStyleNew();
6424         cgsp->is_on = style[16];
6425         cgsp->style = style[17];
6426         cgsp->color_scheme = style[18];
6427         cgsp->user_color =
6428             MyCn3dColorInit(style[19],style[20],style[21],
6429                             style[22],style[23]);
6430       cssp->protein_sidechains = cgsp;
6431         cgsp = Cn3dGeneralStyleNew();
6432         cgsp->is_on = style[24];
6433         cgsp->style = style[25];
6434         cgsp->color_scheme = style[26];
6435         cgsp->user_color =
6436             MyCn3dColorInit(style[27],style[28],style[29],
6437                             style[30],style[31]);
6438       cssp->nucleotide_sidechains = cgsp;
6439         cgsp = Cn3dGeneralStyleNew();
6440         cgsp->is_on = style[32];
6441         cgsp->style = style[33];
6442         cgsp->color_scheme = style[34];
6443         cgsp->user_color =
6444             MyCn3dColorInit(style[35],style[36],style[37],
6445                             style[38],style[39]);
6446        cssp->heterogens = cgsp;
6447         cgsp = Cn3dGeneralStyleNew();
6448         cgsp->is_on = style[40];
6449         cgsp->style = style[41];
6450         cgsp->color_scheme = style[42];
6451         cgsp->user_color =
6452             MyCn3dColorInit(style[43],style[44],style[45],
6453                             style[46],style[47]);
6454       cssp->solvents = cgsp;
6455         cgsp = Cn3dGeneralStyleNew();
6456         cgsp->is_on = style[48];
6457         cgsp->style = style[49];
6458         cgsp->color_scheme = style[50];
6459         cgsp->user_color =
6460             MyCn3dColorInit(style[51],style[52],style[53],
6461                             style[54],style[55]);
6462        cssp->connections = cgsp;
6463         cgsp = Cn3dGeneralStyleNew();
6464         cgsp->is_on = style[56];
6465         cgsp->style = style[57];
6466         cgsp->color_scheme = style[58];
6467         cgsp->user_color =
6468             MyCn3dColorInit(style[59],style[60],style[61],
6469                             style[62],style[63]);
6470       cssp->helix_objects = cgsp;
6471         cgsp = Cn3dGeneralStyleNew();
6472         cgsp->is_on = style[64];
6473         cgsp->style = style[65];
6474         cgsp->color_scheme = style[66];
6475         cgsp->user_color =
6476             MyCn3dColorInit(style[67],style[68],style[69],
6477                             style[70],style[71]);
6478       cssp->strand_objects = cgsp;
6479       cssp->virtual_disulfides_on = style[72];
6480       cssp->virtual_disulfide_color =
6481           MyCn3dColorInit(style[73],style[74],style[75],
6482                           style[76],style[77]);
6483       cssp->hydrogens_on = style[78];
6484       cssp->background_color =
6485           MyCn3dColorInit(style[79],style[80],style[81],
6486                           style[82],style[83]);
6487       cssp->scale_factor = style[84];
6488       cssp->space_fill_proportion = style[85];
6489       cssp->ball_radius = style[86];
6490       cssp->stick_radius = style[87];
6491       cssp->tube_radius = style[88];
6492       cssp->tube_worm_radius = style[89];
6493       cssp->helix_radius = style[90];
6494       cssp->strand_width = style[91];
6495       cssp->strand_thickness = style[92];
6496         cblsp = Cn3dBackboneLabelStyleNew();
6497         cblsp->spacing = style[93];
6498         cblsp->type = style[94];
6499         cblsp->number = style[95];
6500         cblsp->termini = style[96];
6501         cblsp->white = style[97];
6502       cssp->protein_labels = cblsp;
6503         cblsp = Cn3dBackboneLabelStyleNew();
6504         cblsp->spacing = style[98];
6505         cblsp->type = style[99];
6506         cblsp->number = style[100];
6507         cblsp->termini = style[101];
6508         cblsp->white = style[102];
6509       cssp->nucleotide_labels =cblsp;
6510       cssp->ion_labels = style[103];
6511 
6512   return(cssp);
6513 }
6514 static Cn3dColorPtr MyCn3dColorInit(Int4 scale_factor, Int4 red, Int4 green, Int4 blue,
6515 			     Int4 alpha)
6516 {
6517     Cn3dColorPtr	ccp;
6518 
6519     ccp = Cn3dColorNew();
6520     ccp->scale_factor = scale_factor;
6521     ccp->red = red;
6522     ccp->green = green;
6523     ccp->blue = blue;
6524     ccp->alpha = alpha;
6525 
6526     return ccp;
6527 }
6528 */
6529