1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: sigme.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.13 $
33 *
34 * File Description: signal peptide and transmembrane region prediction
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: sigme.c,v $
41 * Revision 6.13 1998/12/18 16:24:56 kuzio
42 * big GIs
43 *
44 * Revision 6.12 1998/11/24 15:42:00 kuzio
45 * refine boundary condition for multiple potential leader pepides
46 *
47 * Revision 6.11 1998/11/16 14:34:13 kuzio
48 * flagBoundaryCondition
49 *
50 * Revision 6.10 1998/09/16 18:19:30 kuzio
51 * cvs logging
52 *
53 * ==========================================================================
54 */
55
56 #include <ncbi.h>
57 #include <accentr.h>
58 #include <gather.h>
59 #include <tofasta.h>
60 #include <urkutil.h>
61 #include <urkfltr.h>
62 #include <urkptpf.h>
63 #include <urksigu.h>
64
65 #define TOP_ERROR 1
66 static char _this_module[] = "sigme";
67 #undef THIS_MODULE
68 #define THIS_MODULE _this_module
69 static char _this_file[] = __FILE__;
70 #undef THIS_FILE
71 #define THIS_FILE _this_file
72
73 typedef struct gather_Bioseq
74 {
75 Int4 gi;
76 BioseqPtr bsp;
77 } Gather_BS, PNTR Gather_BSPtr;
78
79 Args myargs[] =
80 {
81 { "nucleotide GI", "0", "0", "9000000", TRUE,
82 'g', ARG_INT, 0.0, 0, NULL},
83 { "FastA file", NULL, NULL, NULL, TRUE,
84 'f', ARG_STRING, 0.0, 0, NULL },
85 { "gram negative signal peptide", "FALSE", "TRUE", "FALSE", TRUE,
86 'n', ARG_BOOLEAN, 0.0, 0, NULL},
87 { "gram positive signal peptide", "FALSE", "TRUE", "FALSE", TRUE,
88 'p', ARG_BOOLEAN, 0.0, 0, NULL},
89 { "Report boundary conditions", "FALSE", "TRUE", "FALSE", TRUE,
90 'B', ARG_BOOLEAN, 0.0, 0, NULL}
91 };
92
GetBioseq(GatherContextPtr gcp)93 static Boolean GetBioseq (GatherContextPtr gcp)
94 {
95 Gather_BSPtr gbsp;
96 BioseqPtr bsp;
97 Int4 gi, entrezgi;
98
99 if (gcp == NULL)
100 return FALSE;
101 if ((gbsp = (Gather_BSPtr) gcp->userdata) == NULL)
102 return FALSE;
103
104 if (gbsp->bsp != NULL)
105 return TRUE;
106 if (gcp->thistype != OBJ_BIOSEQ)
107 return TRUE;
108 if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
109 return TRUE;
110
111 gi = gbsp->gi;
112 if (gi > 0)
113 {
114 entrezgi = GetGIForSeqId (bsp->id);
115 if (gi == entrezgi)
116 gbsp->bsp = bsp;
117 return TRUE;
118 }
119 else
120 {
121 gbsp->bsp = bsp;
122 return TRUE;
123 }
124 }
125
Main(void)126 Int2 Main (void)
127 {
128 Int2 argcount;
129 Boolean flagHaveNet;
130
131 Int4 gi;
132 SeqEntryPtr sep;
133
134 FILE *fiop;
135 Char fastafile[256];
136 CharPtr title;
137
138 ComProfPtr leadprofile, cutprofile;
139 FilterDatPtr fltp;
140 ComPatPtr cpp;
141
142 SeqAlignPtr sap;
143 SeqLocPtr slph, slp, pslp, pslph;
144 SeqPortPtr spp;
145 Int4 i, start, stop, prestart, poststop, pstart;
146 Uint1Ptr sequence;
147 Boolean flagCharge;
148
149 FloatHi leadcut = 3.3;
150 FloatHi cutcut = 2.1;
151 Boolean flagSPFuzz, flagBoundaryCondition;
152 Int4 range = 40;
153 Int4 ctermsig;
154
155 FloatHi cutoff = 50.0;
156 Int4 window = 19;
157 FloatHiPtr fptr;
158
159 Boolean flagTitle;
160
161 static GatherScope gs;
162 GatherScopePtr gsp;
163 static Gather_BS gbs;
164 Gather_BSPtr gbsp;
165
166 argcount = sizeof (myargs) / sizeof (Args);
167 if (!GetArgs ("sigme", argcount, myargs))
168 return 1;
169
170 if (myargs[0].intvalue == 0 && myargs[1].strvalue == NULL)
171 {
172 ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
173 "No gi or FastA file given :: for help : sigme -");
174 ErrShow ();
175 exit (1);
176 }
177
178 gsp = &gs;
179 gbsp = &gbs;
180
181 MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
182 MemSet ((Pointer) gsp->ignore, (int) (TRUE),
183 (size_t) (OBJ_MAX * sizeof (Boolean)));
184
185 gsp->ignore[OBJ_SEQDESC] = TRUE;
186 gsp->ignore[OBJ_BIOSEQ] = FALSE;
187
188 gi = myargs[0].intvalue;
189 if (myargs[1].strvalue != NULL)
190 StrCpy (fastafile, myargs[1].strvalue);
191 else
192 fastafile[0] = '\0';
193
194 if (gi > 0)
195 {
196 if (!EntrezInit ("sigme", FALSE, &flagHaveNet))
197 {
198 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
199 "Entrez init failed");
200 ErrShow ();
201 exit (1);
202 }
203 }
204
205 fiop = NULL;
206 if (gi > 0)
207 {
208 sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
209 }
210 else
211 {
212 if ((fiop = FileOpen (fastafile, "r")) == NULL)
213 {
214 ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
215 "Failed to open FastA file: %s", fastafile);
216 ErrShow ();
217 exit (1);
218 }
219 sep = FastaToSeqEntry (fiop, FALSE);
220 }
221 if (sep == NULL)
222 {
223 ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
224 "No seqentry found");
225 ErrShow ();
226 exit (1);
227 }
228
229 if (myargs[2].intvalue)
230 {
231 leadprofile = ReadProfile ("KSnsigl.mat");
232 cutprofile = ReadProfile ("KSnsigc.mat");
233 }
234 else if (myargs[3].intvalue)
235 {
236 leadprofile = ReadProfile ("KSpsigl.mat");
237 cutprofile = ReadProfile ("KSpsigc.mat");
238 }
239 else
240 {
241 leadprofile = ReadProfile ("KSesigl.mat");
242 cutprofile = ReadProfile ("KSesigc.mat");
243 }
244 flagBoundaryCondition = (Boolean) myargs[4].intvalue;
245
246 while (sep != NULL)
247 {
248 gbsp->bsp = NULL;
249 gbsp->gi = gi;
250 GatherSeqEntry (sep, (Pointer) gbsp, GetBioseq, (Pointer) gsp);
251
252 if (gbsp->bsp != NULL)
253 {
254 if (ISA_aa (gbsp->bsp->mol))
255 {
256 title = FastaTitle (gbsp->bsp, ">", NULL);
257 sequence = (Uint1Ptr) MemNew ((size_t) (sizeof (Uint1) *
258 gbsp->bsp->length+1));
259 spp = SeqPortNew (gbsp->bsp, 0, gbsp->bsp->length-1, 0,
260 Seq_code_iupacna);
261 SeqPortSeek (spp, 0, SEEK_SET);
262 i = 0;
263 while ((sequence[i] = SeqPortGetResidue (spp)) != SEQPORT_EOF)
264 {
265 if (('a' <= (Char) sequence[i] && (Char) sequence[i] <= 'z') ||
266 ('A' <= (Char) sequence[i] && (Char) sequence[i] <= 'Z'))
267 i++;
268 }
269 sequence[i] = 0;
270 i = 0;
271 while (sequence[i] != 0)
272 {
273 sequence[i] = (Uint1) TO_UPPER ((Char) sequence[i]);
274 i++;
275 }
276 SeqPortFree (spp);
277
278 flagTitle = FALSE;
279 slph = slp = FilterSigSeq (gbsp->bsp, leadprofile, cutprofile,
280 leadcut, cutcut, range,
281 gbsp->bsp->id, FALSE, FALSE);
282 ctermsig = EndOfSig (slp);
283 if (slp != NULL)
284 {
285 flagTitle = TRUE;
286 printf ("%s\n", title);
287 printf (" Signal Sequence\n");
288 }
289 while (slp != NULL)
290 {
291 start = SeqLocStart (slp);
292 stop = SeqLocStop (slp);
293 printf ("%4ld %4ld ", (long) (start+1), (long) (stop+1));
294 for (i = start; i <= stop; i++)
295 printf ("%c", (Char) sequence[i]);
296 printf ("\n");
297 slp = slp->next;
298 }
299 slp = slph;
300 while (slp != NULL)
301 {
302 slph = slp->next;
303 SeqLocFree (slp);
304 slp = slph;
305 }
306 /* determine fuzzy signal if requested */
307 if (flagBoundaryCondition)
308 {
309 slp = FilterSigSeq (gbsp->bsp, leadprofile, cutprofile,
310 leadcut, cutcut, range,
311 gbsp->bsp->id, TRUE, TRUE);
312 flagSPFuzz = FALSE;
313 if (slp != NULL)
314 {
315 flagSPFuzz = TRUE;
316 while (slp != NULL)
317 {
318 slph = slp->next;
319 SeqLocFree (slp);
320 slp = slph;
321 }
322 }
323 if (flagSPFuzz)
324 {
325 if (flagTitle == FALSE)
326 {
327 flagTitle = TRUE;
328 printf ("%s\n", title);
329 printf (" Signal Sequence\n");
330 }
331 printf (" signal peptide prediction is fuzzy\n");
332 }
333 }
334
335 fltp = FilterDatNew (AA_FILTER_COMP_KYTE, window);
336 fltp->filterdatafile = StringSave ("KSkyte.flt");
337
338 if (ReadFilterData (fltp) != 24)
339 {
340 printf ("Error: could not read filter data\n");
341 exit (1);
342 }
343 fptr = FilterBioseq (gbsp->bsp, 0, gbsp->bsp->length-1, fltp);
344 slp = FilterFilter (fptr, fltp->maxscr, window, cutoff,
345 gbsp->bsp->length, gbsp->bsp->id,
346 TRUE, FALSE);
347 MemFree (fptr);
348 slph = slp = CheckOverlap (slp, ctermsig);
349 if (slp != NULL && !flagTitle)
350 {
351 printf ("%s\n", title);
352 }
353 MemFree (title);
354 if (slp != NULL)
355 {
356 printf (" Transmembrane Sequence\n");
357 }
358 while (slp != NULL)
359 {
360 start = SeqLocStart (slp);
361 stop = SeqLocStop (slp);
362 printf ("%4ld %4ld ", (long) (start+1), (long) (stop+1));
363 prestart = start - 5;
364 if (prestart < 0)
365 prestart = 0;
366 poststop = stop + 5;
367 if (poststop > gbsp->bsp->length - 1)
368 poststop = gbsp->bsp->length - 1;
369 for (i = prestart; i < start; i++)
370 printf ("%c", TO_LOWER ((Char) sequence[i]));
371 for (i = start; i <= stop; i++)
372 printf ("%c", (Char) sequence[i]);
373 for (i = stop + 1; i <= poststop; i++)
374 printf ("%c", TO_LOWER ((Char) sequence[i]));
375 printf ("\n");
376 cpp = CompilePattern ("[C,D,E,K,N,Q,H,R]", 1);
377 if (cpp != NULL)
378 {
379 sap = PatternMatchBioseq (gbsp->bsp, cpp, 0);
380 pslph = MatchSa2Sl (&sap);
381 printf (" ");
382 for (i = prestart; i <= poststop; i++)
383 {
384 pslp = pslph;
385 flagCharge = FALSE;
386 while (pslp != NULL)
387 {
388 pstart = SeqLocStart (pslp);
389 if (pstart == i)
390 {
391 flagCharge = TRUE;
392 break;
393 }
394 pslp = pslp->next;
395 }
396 if (flagCharge)
397 printf ("%c", (Char) sequence[i]);
398 else
399 printf ("-");
400 }
401 printf ("\n");
402 ComPatFree (cpp);
403 pslp = pslph;
404 while (pslp != NULL)
405 {
406 pslph = pslp->next;
407 SeqLocFree (pslp);
408 pslp = pslph;
409 }
410 }
411 slp = slp->next;
412 }
413 slp = slph;
414 while (slp != NULL)
415 {
416 slph = slp->next;
417 SeqLocFree (slp);
418 slp = slph;
419 }
420 FilterDatFree (fltp);
421 MemFree (sequence);
422 }
423 else
424 {
425 ErrPostEx (SEV_ERROR, TOP_ERROR, 106,
426 "Not a peptide bioseq");
427 ErrShow ();
428 exit (1);
429 }
430 }
431 else
432 {
433 ErrPostEx (SEV_ERROR, TOP_ERROR, 105,
434 "No bioseq found");
435 ErrShow ();
436 exit (1);
437 }
438 SeqEntryFree (sep);
439 sep = NULL;
440 if (fiop != NULL)
441 sep = FastaToSeqEntry (fiop, FALSE);
442 }
443
444 ComProfFree (leadprofile);
445 ComProfFree (cutprofile);
446 FileClose (fiop);
447 if (gi > 0)
448 EntrezFini ();
449
450 return 0;
451 }
452