1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: ccp.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.14 $
33 *
34 * File Description: coiled-coil prediction
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date       Name        Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: ccp.c,v $
41 * Revision 6.14  1998/12/18 16:24:52  kuzio
42 * big GIs
43 *
44 * Revision 6.13  1998/11/16 14:34:09  kuzio
45 * flagBoundaryCondition
46 *
47 * Revision 6.12  1998/09/16 18:19:26  kuzio
48 * cvs logging
49 *
50 * ==========================================================================
51 */
52 
53 #include <ncbi.h>
54 #include <accentr.h>
55 #include <gather.h>
56 #include <tofasta.h>
57 #include <urkutil.h>
58 #include <urkpcc.h>
59 
60 #define TOP_ERROR 1
61 static char _this_module[] = "ccp";
62 #undef  THIS_MODULE
63 #define THIS_MODULE _this_module
64 static char _this_file[] = __FILE__;
65 #undef  THIS_FILE
66 #define THIS_FILE _this_file
67 
68 #define L_T     256
69 #define L_ID    32
70 #define L_DEF   220
71 
72 typedef struct gather_Prot_Bioseq
73 {
74   Int4      gi;
75   BioseqPtr bsp;
76 } Gather_PBS, PNTR Gather_PBSPtr;
77 
78 Args myargs[] =
79 {
80   { "protein GI", "0", "0", "9000000", TRUE,
81     'g', ARG_INT, 0.0, 0, NULL},
82   { "FastA file", NULL, NULL, NULL, TRUE,
83     'f', ARG_STRING, 0.0, 0, NULL},
84   { "cc window 1", "22", "7", "42", TRUE,
85     'w', ARG_INT, 0.0, 0, NULL},
86   { "cc window 2", "-1", "7", "42", TRUE,
87     'x', ARG_INT, 0.0, 0, NULL},
88   { "cc window 3", "-1", "7", "42", TRUE,
89     'y', ARG_INT, 0.0, 0, NULL},
90   { "cc window 4", "-1", "7", "42", TRUE,
91     'z', ARG_INT, 0.0, 0, NULL},
92   { "sequence output", "FALSE", "FALSE", "TRUE", TRUE,
93     'S', ARG_BOOLEAN, 0.0, 0, NULL},
94   { "X-out sequence output for blast", "FALSE", "FALSE", "TRUE", TRUE,
95     'X', ARG_BOOLEAN, 0.0, 0, NULL},
96   { "stringent filter", "FALSE", "FALSE", "TRUE", TRUE,
97     's', ARG_BOOLEAN, 0.0, 0, NULL},
98   { "very stringent filter", "FALSE", "FALSE", "TRUE", TRUE,
99     'v', ARG_BOOLEAN, 0.0, 0, NULL},
100   { "output line length", "50", "40", "160", TRUE,
101     'l', ARG_INT, 0.0, 0, NULL},
102   { "data file 0=KSpcc 1=KSmtk 2=KSmtidk", "0", "0", "2", TRUE,
103     'd', ARG_INT, 0.0, 0, NULL},
104   { "Filter boundary condition hits only", "FALSE", "FALSE", "TRUE", TRUE,
105     'B', ARG_BOOLEAN, 0.0, 0, NULL}
106 };
107 
GetProteinBioseq(GatherContextPtr gcp)108 static Boolean GetProteinBioseq (GatherContextPtr gcp)
109 {
110   Gather_PBSPtr  gpbsp;
111   BioseqPtr      bsp;
112   Int4           gi, entrezgi;
113 
114   if (gcp == NULL)
115     return FALSE;
116   if ((gpbsp = (Gather_PBSPtr) gcp->userdata) == NULL)
117     return FALSE;
118 
119   if (gpbsp->bsp != NULL)
120     return TRUE;
121   if (gcp->thistype != OBJ_BIOSEQ)
122     return TRUE;
123   if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
124     return TRUE;
125 
126   gi = gpbsp->gi;
127   if (gi > 0)
128   {
129     entrezgi = GetGIForSeqId (bsp->id);
130     if (gi == entrezgi)
131       gpbsp->bsp = bsp;
132     return TRUE;
133   }
134   else
135   {
136     gpbsp->bsp = bsp;
137     return TRUE;
138   }
139 }
140 
Main(void)141 Int2 Main (void)
142 {
143   Int2        argcount;
144   Boolean     flagHaveNet;
145 
146   Int4        gi;
147   SeqEntryPtr sep;
148   PccDatPtr   pccp;
149 
150   FILE        *fiop;
151   Char        fastafile[256];
152   CharPtr     title;
153   CharPtr     datafile[3] = {"KSpcc.mat", "KSmtk.mat", "KSmtidk.mat"};
154 
155   Int4        i, iloop, iwindow, pccwindow, start, stop, linelen;
156   FloatHiPtr  pccscore, pccscore1, pccscore2, pccscore3, pccscore4;
157   SeqLocPtr   slp, slpn;
158   Uint1Ptr    sequence;
159   SeqPortPtr  spp;
160 
161   static GatherScope  gs;
162   GatherScopePtr      gsp;
163   static Gather_PBS   gpbs;
164   Gather_PBSPtr       gpbsp;
165 
166   argcount = sizeof (myargs) / sizeof (Args);
167   if (!GetArgs ("CCP", argcount, myargs))
168     return 1;
169 
170   gsp = &gs;
171   gpbsp = &gpbs;
172 
173   MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
174   MemSet ((Pointer) gsp->ignore, (int) (TRUE),
175           (size_t) (OBJ_MAX * sizeof (Boolean)));
176   gsp->ignore[OBJ_BIOSEQ] = FALSE;
177 
178   gpbsp->bsp = NULL;
179 
180   if (myargs[0].intvalue == 0 && myargs[1].strvalue == NULL)
181   {
182     ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
183                "No gi or FastA file given :: for help :   ccp -");
184     ErrShow ();
185     exit (1);
186   }
187 
188   gi = myargs[0].intvalue;
189   if (myargs[1].strvalue != NULL)
190     StrCpy (fastafile, myargs[1].strvalue);
191   else
192     fastafile[0] = '\0';
193 
194   if (gi > 0)
195   {
196     if (!EntrezInit ("CCP", FALSE, &flagHaveNet))
197     {
198       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
199                  "Entrez init failed");
200       ErrShow ();
201       exit (1);
202     }
203   }
204 
205   if (gi > 0)
206   {
207     sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
208   }
209   else
210   {
211     if ((fiop = FileOpen (fastafile, "r")) == NULL)
212     {
213       ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
214                  "Failed to open FastA file");
215       ErrShow ();
216       exit (1);
217     }
218     sep = FastaToSeqEntry (fiop, FALSE);
219   }
220 
221   if (sep == NULL)
222   {
223     ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
224                "No seqentry found");
225     ErrShow ();
226     exit (1);
227   }
228 
229   linelen = myargs[10].intvalue;
230   while (sep != NULL)
231   {
232     gpbsp->gi = gi;
233     GatherSeqEntry (sep, (Pointer) gpbsp, GetProteinBioseq,
234                     (Pointer) gsp);
235 
236     if (gpbsp->bsp != NULL)
237     {
238       if (ISA_aa (gpbsp->bsp->mol))
239       {
240         title = FastaTitle (gpbsp->bsp, ">", NULL);
241 
242         iwindow = 0;
243         pccscore1 = NULL;
244         pccscore2 = NULL;
245         pccscore3 = NULL;
246         pccscore4 = NULL;
247         for (i = 0; i < 4; i++)
248         {
249           pccwindow = 0;
250           switch (i)
251           {
252            case 0:
253             pccwindow = myargs[2].intvalue;
254             break;
255            case 1:
256             pccwindow = myargs[3].intvalue;
257             break;
258            case 2:
259             pccwindow = myargs[4].intvalue;
260             break;
261            case 3:
262             pccwindow = myargs[5].intvalue;
263             break;
264            default:
265             break;
266           }
267           if (pccwindow > 0)
268           {
269             pccp = PccDatNew ();
270             pccp->window = pccwindow;
271             MemFree (pccp->pccdatafile);
272             pccp->pccdatafile = StringSave (datafile[myargs[11].intvalue]);
273             if (ReadPccData (pccp) == 0)
274             {
275               ErrPostEx (SEV_ERROR, TOP_ERROR, 101,
276                          "Could not open or read %s data file",
277                          pccp->pccdatafile);
278               ErrShow ();
279               exit (1);
280             }
281             pccscore = PredictCCBioseq (gpbsp->bsp, 0, gpbsp->bsp->length-1,
282                                         pccp);
283             PccDatFree (pccp);
284             if (pccscore != NULL)
285             {
286               iwindow++;
287               switch (iwindow)
288               {
289                case 1:
290                 pccscore1 = pccscore;
291                 break;
292                case 2:
293                 pccscore2 = pccscore;
294                 break;
295                case 3:
296                 pccscore3 = pccscore;
297                 break;
298                case 4:
299                 pccscore4 = pccscore;
300                 break;
301                default:
302                 iwindow = 4;
303                 break;
304               }
305             }
306           }
307         }
308         if (myargs[6].intvalue == FALSE)
309         {
310           printf ("%s\n", title);
311           switch (iwindow)
312           {
313            case 1:
314             for (iloop = 0; iloop < gpbsp->bsp->length; iloop++)
315               printf ("%lf\n", (double) pccscore1[iloop]);
316             break;
317            case 2:
318             for (iloop = 0; iloop < gpbsp->bsp->length; iloop++)
319               printf ("%lf %lf\n", (double) pccscore1[iloop],
320                                    (double) pccscore2[iloop]);
321             break;
322            case 3:
323             for (iloop = 0; iloop < gpbsp->bsp->length; iloop++)
324               printf ("%lf %lf %lf\n", (double) pccscore1[iloop],
325                                        (double) pccscore2[iloop],
326                                        (double) pccscore3[iloop]);
327             break;
328            case 4:
329             for (iloop = 0; iloop < gpbsp->bsp->length; iloop++)
330               printf ("%lf %lf %lf %lf\n", (double) pccscore1[iloop],
331                                            (double) pccscore2[iloop],
332                                            (double) pccscore3[iloop],
333                                            (double) pccscore4[iloop]);
334             break;
335            default:
336             ErrPostEx (SEV_ERROR, TOP_ERROR, 107,
337                        "No coiled-coil predictions made");
338             ErrShow ();
339           }
340         }
341         else
342         {
343           sequence = (Uint1Ptr) MemNew ((size_t) (sizeof (Uint1) *
344                                                   gpbsp->bsp->length+1));
345           spp = SeqPortNew (gpbsp->bsp, 0, gpbsp->bsp->length-1, 0,
346                             Seq_code_iupacna);
347           SeqPortSeek (spp, 0, SEEK_SET);
348 
349           i = 0;
350           while ((sequence[i] = SeqPortGetResidue (spp)) != SEQPORT_EOF)
351           {
352             if (('a' <= (Char) sequence[i] && (Char) sequence[i] <= 'z') ||
353                 ('A' <= (Char) sequence[i] && (Char) sequence[i] <= 'Z'))
354               i++;
355           }
356           sequence[i] = 0;
357 
358           i = 0;
359           while (sequence[i] != 0)
360           {
361             sequence[i] = (Uint1) TO_UPPER ((Char) sequence[i]);
362             i++;
363           }
364 
365           for (iloop = 0; iloop < iwindow; iloop++)
366           {
367             switch (iloop)
368             {
369              default:
370              case 0:
371               pccscore = pccscore1;
372               break;
373              case 1:
374               pccscore = pccscore2;
375               break;
376              case 2:
377               pccscore = pccscore3;
378               break;
379              case 3:
380               pccscore = pccscore4;
381               break;
382             }
383             if (myargs[9].intvalue)
384             {
385               slpn = FilterCCVS (pccscore, 40, gpbsp->bsp->length, 32,
386                                  gpbsp->bsp->id,
387                                  (Boolean) myargs[12].intvalue);
388             }
389             else if (myargs[8].intvalue)
390             {
391               slpn = FilterCCVS (pccscore, 50, gpbsp->bsp->length, 24,
392                                  gpbsp->bsp->id,
393                                  (Boolean) myargs[12].intvalue);
394             }
395             else
396             {
397               slpn = FilterCC (pccscore, 50, gpbsp->bsp->length, 0,
398                                gpbsp->bsp->id,
399                                (Boolean) myargs[12].intvalue);
400             }
401             slp = slpn;
402             while (slp != NULL)
403             {
404               start = SeqLocStart (slp);
405               stop = SeqLocStop (slp);
406               for (i = start; i <= stop; i++)
407               {
408                 if (myargs[7].intvalue == TRUE)
409                   sequence[i] = (Uint1) 'x';
410                 else
411                   sequence[i] = (Uint1) TO_LOWER ((Char) sequence[i]);
412               }
413               slpn = slp->next;
414               SeqLocFree (slp);
415               slp = slpn;
416             }
417           }
418 
419           printf ("%s\n", title);
420           i = 0;
421           while (sequence[i] != 0)
422           {
423             printf ("%c", (Char) sequence[i]);
424             i++;
425             if (i % linelen == 0)
426             {
427               if (myargs[7].intvalue == TRUE)
428                 printf ("\n");
429               else
430                 printf (" %8ld\n", (long) i);
431             }
432           }
433           if (i % linelen != 0)
434             printf ("\n");
435           SeqPortFree (spp);
436           MemFree (sequence);
437         }
438         MemFree (title);
439         MemFree (pccscore1);
440         MemFree (pccscore2);
441         MemFree (pccscore3);
442         MemFree (pccscore4);
443       }
444       else
445       {
446         ErrPostEx (SEV_ERROR, TOP_ERROR, 106,
447                    "Not a protein bioseq");
448         ErrShow ();
449         exit (1);
450       }
451     }
452     else
453     {
454       ErrPostEx (SEV_ERROR, TOP_ERROR, 105,
455                  "No bioseq found");
456       ErrShow ();
457       exit (1);
458     }
459     sep = SeqEntryFree (sep);
460     if (gi <= 0)
461     {
462       sep = FastaToSeqEntry (fiop, FALSE);
463       gpbsp->bsp = NULL;
464     }
465   }
466 
467   if (gi > 0)
468     EntrezFini ();
469   else
470     FileClose (fiop);
471 
472   return 0;
473 }
474