1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: ccp.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.14 $
33 *
34 * File Description: coiled-coil prediction
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: ccp.c,v $
41 * Revision 6.14 1998/12/18 16:24:52 kuzio
42 * big GIs
43 *
44 * Revision 6.13 1998/11/16 14:34:09 kuzio
45 * flagBoundaryCondition
46 *
47 * Revision 6.12 1998/09/16 18:19:26 kuzio
48 * cvs logging
49 *
50 * ==========================================================================
51 */
52
53 #include <ncbi.h>
54 #include <accentr.h>
55 #include <gather.h>
56 #include <tofasta.h>
57 #include <urkutil.h>
58 #include <urkpcc.h>
59
60 #define TOP_ERROR 1
61 static char _this_module[] = "ccp";
62 #undef THIS_MODULE
63 #define THIS_MODULE _this_module
64 static char _this_file[] = __FILE__;
65 #undef THIS_FILE
66 #define THIS_FILE _this_file
67
68 #define L_T 256
69 #define L_ID 32
70 #define L_DEF 220
71
72 typedef struct gather_Prot_Bioseq
73 {
74 Int4 gi;
75 BioseqPtr bsp;
76 } Gather_PBS, PNTR Gather_PBSPtr;
77
78 Args myargs[] =
79 {
80 { "protein GI", "0", "0", "9000000", TRUE,
81 'g', ARG_INT, 0.0, 0, NULL},
82 { "FastA file", NULL, NULL, NULL, TRUE,
83 'f', ARG_STRING, 0.0, 0, NULL},
84 { "cc window 1", "22", "7", "42", TRUE,
85 'w', ARG_INT, 0.0, 0, NULL},
86 { "cc window 2", "-1", "7", "42", TRUE,
87 'x', ARG_INT, 0.0, 0, NULL},
88 { "cc window 3", "-1", "7", "42", TRUE,
89 'y', ARG_INT, 0.0, 0, NULL},
90 { "cc window 4", "-1", "7", "42", TRUE,
91 'z', ARG_INT, 0.0, 0, NULL},
92 { "sequence output", "FALSE", "FALSE", "TRUE", TRUE,
93 'S', ARG_BOOLEAN, 0.0, 0, NULL},
94 { "X-out sequence output for blast", "FALSE", "FALSE", "TRUE", TRUE,
95 'X', ARG_BOOLEAN, 0.0, 0, NULL},
96 { "stringent filter", "FALSE", "FALSE", "TRUE", TRUE,
97 's', ARG_BOOLEAN, 0.0, 0, NULL},
98 { "very stringent filter", "FALSE", "FALSE", "TRUE", TRUE,
99 'v', ARG_BOOLEAN, 0.0, 0, NULL},
100 { "output line length", "50", "40", "160", TRUE,
101 'l', ARG_INT, 0.0, 0, NULL},
102 { "data file 0=KSpcc 1=KSmtk 2=KSmtidk", "0", "0", "2", TRUE,
103 'd', ARG_INT, 0.0, 0, NULL},
104 { "Filter boundary condition hits only", "FALSE", "FALSE", "TRUE", TRUE,
105 'B', ARG_BOOLEAN, 0.0, 0, NULL}
106 };
107
GetProteinBioseq(GatherContextPtr gcp)108 static Boolean GetProteinBioseq (GatherContextPtr gcp)
109 {
110 Gather_PBSPtr gpbsp;
111 BioseqPtr bsp;
112 Int4 gi, entrezgi;
113
114 if (gcp == NULL)
115 return FALSE;
116 if ((gpbsp = (Gather_PBSPtr) gcp->userdata) == NULL)
117 return FALSE;
118
119 if (gpbsp->bsp != NULL)
120 return TRUE;
121 if (gcp->thistype != OBJ_BIOSEQ)
122 return TRUE;
123 if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
124 return TRUE;
125
126 gi = gpbsp->gi;
127 if (gi > 0)
128 {
129 entrezgi = GetGIForSeqId (bsp->id);
130 if (gi == entrezgi)
131 gpbsp->bsp = bsp;
132 return TRUE;
133 }
134 else
135 {
136 gpbsp->bsp = bsp;
137 return TRUE;
138 }
139 }
140
Main(void)141 Int2 Main (void)
142 {
143 Int2 argcount;
144 Boolean flagHaveNet;
145
146 Int4 gi;
147 SeqEntryPtr sep;
148 PccDatPtr pccp;
149
150 FILE *fiop;
151 Char fastafile[256];
152 CharPtr title;
153 CharPtr datafile[3] = {"KSpcc.mat", "KSmtk.mat", "KSmtidk.mat"};
154
155 Int4 i, iloop, iwindow, pccwindow, start, stop, linelen;
156 FloatHiPtr pccscore, pccscore1, pccscore2, pccscore3, pccscore4;
157 SeqLocPtr slp, slpn;
158 Uint1Ptr sequence;
159 SeqPortPtr spp;
160
161 static GatherScope gs;
162 GatherScopePtr gsp;
163 static Gather_PBS gpbs;
164 Gather_PBSPtr gpbsp;
165
166 argcount = sizeof (myargs) / sizeof (Args);
167 if (!GetArgs ("CCP", argcount, myargs))
168 return 1;
169
170 gsp = &gs;
171 gpbsp = &gpbs;
172
173 MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
174 MemSet ((Pointer) gsp->ignore, (int) (TRUE),
175 (size_t) (OBJ_MAX * sizeof (Boolean)));
176 gsp->ignore[OBJ_BIOSEQ] = FALSE;
177
178 gpbsp->bsp = NULL;
179
180 if (myargs[0].intvalue == 0 && myargs[1].strvalue == NULL)
181 {
182 ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
183 "No gi or FastA file given :: for help : ccp -");
184 ErrShow ();
185 exit (1);
186 }
187
188 gi = myargs[0].intvalue;
189 if (myargs[1].strvalue != NULL)
190 StrCpy (fastafile, myargs[1].strvalue);
191 else
192 fastafile[0] = '\0';
193
194 if (gi > 0)
195 {
196 if (!EntrezInit ("CCP", FALSE, &flagHaveNet))
197 {
198 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
199 "Entrez init failed");
200 ErrShow ();
201 exit (1);
202 }
203 }
204
205 if (gi > 0)
206 {
207 sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
208 }
209 else
210 {
211 if ((fiop = FileOpen (fastafile, "r")) == NULL)
212 {
213 ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
214 "Failed to open FastA file");
215 ErrShow ();
216 exit (1);
217 }
218 sep = FastaToSeqEntry (fiop, FALSE);
219 }
220
221 if (sep == NULL)
222 {
223 ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
224 "No seqentry found");
225 ErrShow ();
226 exit (1);
227 }
228
229 linelen = myargs[10].intvalue;
230 while (sep != NULL)
231 {
232 gpbsp->gi = gi;
233 GatherSeqEntry (sep, (Pointer) gpbsp, GetProteinBioseq,
234 (Pointer) gsp);
235
236 if (gpbsp->bsp != NULL)
237 {
238 if (ISA_aa (gpbsp->bsp->mol))
239 {
240 title = FastaTitle (gpbsp->bsp, ">", NULL);
241
242 iwindow = 0;
243 pccscore1 = NULL;
244 pccscore2 = NULL;
245 pccscore3 = NULL;
246 pccscore4 = NULL;
247 for (i = 0; i < 4; i++)
248 {
249 pccwindow = 0;
250 switch (i)
251 {
252 case 0:
253 pccwindow = myargs[2].intvalue;
254 break;
255 case 1:
256 pccwindow = myargs[3].intvalue;
257 break;
258 case 2:
259 pccwindow = myargs[4].intvalue;
260 break;
261 case 3:
262 pccwindow = myargs[5].intvalue;
263 break;
264 default:
265 break;
266 }
267 if (pccwindow > 0)
268 {
269 pccp = PccDatNew ();
270 pccp->window = pccwindow;
271 MemFree (pccp->pccdatafile);
272 pccp->pccdatafile = StringSave (datafile[myargs[11].intvalue]);
273 if (ReadPccData (pccp) == 0)
274 {
275 ErrPostEx (SEV_ERROR, TOP_ERROR, 101,
276 "Could not open or read %s data file",
277 pccp->pccdatafile);
278 ErrShow ();
279 exit (1);
280 }
281 pccscore = PredictCCBioseq (gpbsp->bsp, 0, gpbsp->bsp->length-1,
282 pccp);
283 PccDatFree (pccp);
284 if (pccscore != NULL)
285 {
286 iwindow++;
287 switch (iwindow)
288 {
289 case 1:
290 pccscore1 = pccscore;
291 break;
292 case 2:
293 pccscore2 = pccscore;
294 break;
295 case 3:
296 pccscore3 = pccscore;
297 break;
298 case 4:
299 pccscore4 = pccscore;
300 break;
301 default:
302 iwindow = 4;
303 break;
304 }
305 }
306 }
307 }
308 if (myargs[6].intvalue == FALSE)
309 {
310 printf ("%s\n", title);
311 switch (iwindow)
312 {
313 case 1:
314 for (iloop = 0; iloop < gpbsp->bsp->length; iloop++)
315 printf ("%lf\n", (double) pccscore1[iloop]);
316 break;
317 case 2:
318 for (iloop = 0; iloop < gpbsp->bsp->length; iloop++)
319 printf ("%lf %lf\n", (double) pccscore1[iloop],
320 (double) pccscore2[iloop]);
321 break;
322 case 3:
323 for (iloop = 0; iloop < gpbsp->bsp->length; iloop++)
324 printf ("%lf %lf %lf\n", (double) pccscore1[iloop],
325 (double) pccscore2[iloop],
326 (double) pccscore3[iloop]);
327 break;
328 case 4:
329 for (iloop = 0; iloop < gpbsp->bsp->length; iloop++)
330 printf ("%lf %lf %lf %lf\n", (double) pccscore1[iloop],
331 (double) pccscore2[iloop],
332 (double) pccscore3[iloop],
333 (double) pccscore4[iloop]);
334 break;
335 default:
336 ErrPostEx (SEV_ERROR, TOP_ERROR, 107,
337 "No coiled-coil predictions made");
338 ErrShow ();
339 }
340 }
341 else
342 {
343 sequence = (Uint1Ptr) MemNew ((size_t) (sizeof (Uint1) *
344 gpbsp->bsp->length+1));
345 spp = SeqPortNew (gpbsp->bsp, 0, gpbsp->bsp->length-1, 0,
346 Seq_code_iupacna);
347 SeqPortSeek (spp, 0, SEEK_SET);
348
349 i = 0;
350 while ((sequence[i] = SeqPortGetResidue (spp)) != SEQPORT_EOF)
351 {
352 if (('a' <= (Char) sequence[i] && (Char) sequence[i] <= 'z') ||
353 ('A' <= (Char) sequence[i] && (Char) sequence[i] <= 'Z'))
354 i++;
355 }
356 sequence[i] = 0;
357
358 i = 0;
359 while (sequence[i] != 0)
360 {
361 sequence[i] = (Uint1) TO_UPPER ((Char) sequence[i]);
362 i++;
363 }
364
365 for (iloop = 0; iloop < iwindow; iloop++)
366 {
367 switch (iloop)
368 {
369 default:
370 case 0:
371 pccscore = pccscore1;
372 break;
373 case 1:
374 pccscore = pccscore2;
375 break;
376 case 2:
377 pccscore = pccscore3;
378 break;
379 case 3:
380 pccscore = pccscore4;
381 break;
382 }
383 if (myargs[9].intvalue)
384 {
385 slpn = FilterCCVS (pccscore, 40, gpbsp->bsp->length, 32,
386 gpbsp->bsp->id,
387 (Boolean) myargs[12].intvalue);
388 }
389 else if (myargs[8].intvalue)
390 {
391 slpn = FilterCCVS (pccscore, 50, gpbsp->bsp->length, 24,
392 gpbsp->bsp->id,
393 (Boolean) myargs[12].intvalue);
394 }
395 else
396 {
397 slpn = FilterCC (pccscore, 50, gpbsp->bsp->length, 0,
398 gpbsp->bsp->id,
399 (Boolean) myargs[12].intvalue);
400 }
401 slp = slpn;
402 while (slp != NULL)
403 {
404 start = SeqLocStart (slp);
405 stop = SeqLocStop (slp);
406 for (i = start; i <= stop; i++)
407 {
408 if (myargs[7].intvalue == TRUE)
409 sequence[i] = (Uint1) 'x';
410 else
411 sequence[i] = (Uint1) TO_LOWER ((Char) sequence[i]);
412 }
413 slpn = slp->next;
414 SeqLocFree (slp);
415 slp = slpn;
416 }
417 }
418
419 printf ("%s\n", title);
420 i = 0;
421 while (sequence[i] != 0)
422 {
423 printf ("%c", (Char) sequence[i]);
424 i++;
425 if (i % linelen == 0)
426 {
427 if (myargs[7].intvalue == TRUE)
428 printf ("\n");
429 else
430 printf (" %8ld\n", (long) i);
431 }
432 }
433 if (i % linelen != 0)
434 printf ("\n");
435 SeqPortFree (spp);
436 MemFree (sequence);
437 }
438 MemFree (title);
439 MemFree (pccscore1);
440 MemFree (pccscore2);
441 MemFree (pccscore3);
442 MemFree (pccscore4);
443 }
444 else
445 {
446 ErrPostEx (SEV_ERROR, TOP_ERROR, 106,
447 "Not a protein bioseq");
448 ErrShow ();
449 exit (1);
450 }
451 }
452 else
453 {
454 ErrPostEx (SEV_ERROR, TOP_ERROR, 105,
455 "No bioseq found");
456 ErrShow ();
457 exit (1);
458 }
459 sep = SeqEntryFree (sep);
460 if (gi <= 0)
461 {
462 sep = FastaToSeqEntry (fiop, FALSE);
463 gpbsp->bsp = NULL;
464 }
465 }
466
467 if (gi > 0)
468 EntrezFini ();
469 else
470 FileClose (fiop);
471
472 return 0;
473 }
474