1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: sigme.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.13 $
33 *
34 * File Description: signal peptide and transmembrane region prediction
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date       Name        Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: sigme.c,v $
41 * Revision 6.13  1998/12/18 16:24:56  kuzio
42 * big GIs
43 *
44 * Revision 6.12  1998/11/24 15:42:00  kuzio
45 * refine boundary condition for multiple potential leader pepides
46 *
47 * Revision 6.11  1998/11/16 14:34:13  kuzio
48 * flagBoundaryCondition
49 *
50 * Revision 6.10  1998/09/16 18:19:30  kuzio
51 * cvs logging
52 *
53 * ==========================================================================
54 */
55 
56 #include <ncbi.h>
57 #include <accentr.h>
58 #include <gather.h>
59 #include <tofasta.h>
60 #include <urkutil.h>
61 #include <urkfltr.h>
62 #include <urkptpf.h>
63 #include <urksigu.h>
64 
65 #define TOP_ERROR 1
66 static char _this_module[] = "sigme";
67 #undef  THIS_MODULE
68 #define THIS_MODULE _this_module
69 static char _this_file[] = __FILE__;
70 #undef  THIS_FILE
71 #define THIS_FILE _this_file
72 
73 typedef struct gather_Bioseq
74 {
75   Int4      gi;
76   BioseqPtr bsp;
77 } Gather_BS, PNTR Gather_BSPtr;
78 
79 Args myargs[] =
80 {
81   { "nucleotide GI", "0", "0", "9000000", TRUE,
82     'g', ARG_INT, 0.0, 0, NULL},
83   { "FastA file", NULL, NULL, NULL, TRUE,
84     'f', ARG_STRING, 0.0, 0, NULL },
85   { "gram negative signal peptide", "FALSE", "TRUE", "FALSE", TRUE,
86     'n', ARG_BOOLEAN, 0.0, 0, NULL},
87   { "gram positive signal peptide", "FALSE", "TRUE", "FALSE", TRUE,
88     'p', ARG_BOOLEAN, 0.0, 0, NULL},
89   { "Report boundary conditions", "FALSE", "TRUE", "FALSE", TRUE,
90     'B', ARG_BOOLEAN, 0.0, 0, NULL}
91 };
92 
GetBioseq(GatherContextPtr gcp)93 static Boolean GetBioseq (GatherContextPtr gcp)
94 {
95   Gather_BSPtr   gbsp;
96   BioseqPtr      bsp;
97   Int4           gi, entrezgi;
98 
99   if (gcp == NULL)
100     return FALSE;
101   if ((gbsp = (Gather_BSPtr) gcp->userdata) == NULL)
102     return FALSE;
103 
104   if (gbsp->bsp != NULL)
105     return TRUE;
106   if (gcp->thistype != OBJ_BIOSEQ)
107     return TRUE;
108   if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
109     return TRUE;
110 
111   gi = gbsp->gi;
112   if (gi > 0)
113   {
114     entrezgi = GetGIForSeqId (bsp->id);
115     if (gi == entrezgi)
116       gbsp->bsp = bsp;
117     return TRUE;
118   }
119   else
120   {
121     gbsp->bsp = bsp;
122     return TRUE;
123   }
124 }
125 
Main(void)126 Int2 Main (void)
127 {
128   Int2        argcount;
129   Boolean     flagHaveNet;
130 
131   Int4        gi;
132   SeqEntryPtr sep;
133 
134   FILE        *fiop;
135   Char        fastafile[256];
136   CharPtr     title;
137 
138   ComProfPtr   leadprofile, cutprofile;
139   FilterDatPtr fltp;
140   ComPatPtr    cpp;
141 
142   SeqAlignPtr sap;
143   SeqLocPtr   slph, slp, pslp, pslph;
144   SeqPortPtr  spp;
145   Int4        i, start, stop, prestart, poststop, pstart;
146   Uint1Ptr    sequence;
147   Boolean     flagCharge;
148 
149   FloatHi     leadcut = 3.3;
150   FloatHi     cutcut = 2.1;
151   Boolean     flagSPFuzz, flagBoundaryCondition;
152   Int4        range = 40;
153   Int4        ctermsig;
154 
155   FloatHi     cutoff = 50.0;
156   Int4        window = 19;
157   FloatHiPtr  fptr;
158 
159   Boolean     flagTitle;
160 
161   static GatherScope  gs;
162   GatherScopePtr      gsp;
163   static Gather_BS    gbs;
164   Gather_BSPtr        gbsp;
165 
166   argcount = sizeof (myargs) / sizeof (Args);
167   if (!GetArgs ("sigme", argcount, myargs))
168     return 1;
169 
170   if (myargs[0].intvalue == 0 && myargs[1].strvalue == NULL)
171   {
172     ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
173                "No gi or FastA file given :: for help :   sigme -");
174     ErrShow ();
175     exit (1);
176   }
177 
178   gsp = &gs;
179   gbsp = &gbs;
180 
181   MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
182   MemSet ((Pointer) gsp->ignore, (int) (TRUE),
183           (size_t) (OBJ_MAX * sizeof (Boolean)));
184 
185   gsp->ignore[OBJ_SEQDESC] = TRUE;
186   gsp->ignore[OBJ_BIOSEQ] = FALSE;
187 
188   gi = myargs[0].intvalue;
189   if (myargs[1].strvalue != NULL)
190     StrCpy (fastafile, myargs[1].strvalue);
191   else
192     fastafile[0] = '\0';
193 
194   if (gi > 0)
195   {
196     if (!EntrezInit ("sigme", FALSE, &flagHaveNet))
197     {
198       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
199                  "Entrez init failed");
200       ErrShow ();
201       exit (1);
202     }
203   }
204 
205   fiop = NULL;
206   if (gi > 0)
207   {
208     sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
209   }
210   else
211   {
212     if ((fiop = FileOpen (fastafile, "r")) == NULL)
213     {
214       ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
215                  "Failed to open FastA file: %s", fastafile);
216       ErrShow ();
217       exit (1);
218     }
219     sep = FastaToSeqEntry (fiop, FALSE);
220   }
221   if (sep == NULL)
222   {
223     ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
224                "No seqentry found");
225     ErrShow ();
226     exit (1);
227   }
228 
229   if (myargs[2].intvalue)
230   {
231     leadprofile = ReadProfile ("KSnsigl.mat");
232     cutprofile = ReadProfile ("KSnsigc.mat");
233   }
234   else if (myargs[3].intvalue)
235   {
236     leadprofile = ReadProfile ("KSpsigl.mat");
237     cutprofile = ReadProfile ("KSpsigc.mat");
238   }
239   else
240   {
241     leadprofile = ReadProfile ("KSesigl.mat");
242     cutprofile = ReadProfile ("KSesigc.mat");
243   }
244   flagBoundaryCondition = (Boolean) myargs[4].intvalue;
245 
246   while (sep != NULL)
247   {
248     gbsp->bsp = NULL;
249     gbsp->gi = gi;
250     GatherSeqEntry (sep, (Pointer) gbsp, GetBioseq, (Pointer) gsp);
251 
252     if (gbsp->bsp != NULL)
253     {
254       if (ISA_aa (gbsp->bsp->mol))
255       {
256         title = FastaTitle (gbsp->bsp, ">", NULL);
257         sequence = (Uint1Ptr) MemNew ((size_t) (sizeof (Uint1) *
258                                                 gbsp->bsp->length+1));
259         spp = SeqPortNew (gbsp->bsp, 0, gbsp->bsp->length-1, 0,
260                           Seq_code_iupacna);
261         SeqPortSeek (spp, 0, SEEK_SET);
262         i = 0;
263         while ((sequence[i] = SeqPortGetResidue (spp)) != SEQPORT_EOF)
264         {
265           if (('a' <= (Char) sequence[i] && (Char) sequence[i] <= 'z') ||
266               ('A' <= (Char) sequence[i] && (Char) sequence[i] <= 'Z'))
267             i++;
268         }
269         sequence[i] = 0;
270         i = 0;
271         while (sequence[i] != 0)
272         {
273           sequence[i] = (Uint1) TO_UPPER ((Char) sequence[i]);
274           i++;
275         }
276         SeqPortFree (spp);
277 
278         flagTitle = FALSE;
279         slph = slp = FilterSigSeq (gbsp->bsp, leadprofile, cutprofile,
280                                    leadcut, cutcut, range,
281                                    gbsp->bsp->id, FALSE, FALSE);
282         ctermsig = EndOfSig (slp);
283         if (slp != NULL)
284         {
285           flagTitle = TRUE;
286           printf ("%s\n", title);
287           printf (" Signal Sequence\n");
288         }
289         while (slp != NULL)
290         {
291           start = SeqLocStart (slp);
292           stop = SeqLocStop (slp);
293           printf ("%4ld %4ld   ", (long) (start+1), (long) (stop+1));
294           for (i = start; i <= stop; i++)
295             printf ("%c", (Char) sequence[i]);
296           printf ("\n");
297           slp = slp->next;
298         }
299         slp = slph;
300         while (slp != NULL)
301         {
302           slph = slp->next;
303           SeqLocFree (slp);
304           slp = slph;
305         }
306 /* determine fuzzy signal if requested */
307         if (flagBoundaryCondition)
308         {
309           slp = FilterSigSeq (gbsp->bsp, leadprofile, cutprofile,
310                               leadcut, cutcut, range,
311                               gbsp->bsp->id, TRUE, TRUE);
312           flagSPFuzz = FALSE;
313           if (slp != NULL)
314           {
315             flagSPFuzz = TRUE;
316             while (slp != NULL)
317             {
318               slph = slp->next;
319               SeqLocFree (slp);
320               slp = slph;
321             }
322           }
323           if (flagSPFuzz)
324           {
325             if (flagTitle == FALSE)
326             {
327               flagTitle = TRUE;
328               printf ("%s\n", title);
329               printf (" Signal Sequence\n");
330             }
331             printf (" signal peptide prediction is fuzzy\n");
332           }
333         }
334 
335         fltp = FilterDatNew (AA_FILTER_COMP_KYTE, window);
336         fltp->filterdatafile = StringSave ("KSkyte.flt");
337 
338         if (ReadFilterData (fltp) != 24)
339         {
340           printf ("Error: could not read filter data\n");
341           exit (1);
342         }
343         fptr = FilterBioseq (gbsp->bsp, 0, gbsp->bsp->length-1, fltp);
344         slp = FilterFilter (fptr, fltp->maxscr, window, cutoff,
345                             gbsp->bsp->length, gbsp->bsp->id,
346                             TRUE, FALSE);
347         MemFree (fptr);
348         slph = slp = CheckOverlap (slp, ctermsig);
349         if (slp != NULL && !flagTitle)
350         {
351           printf ("%s\n", title);
352         }
353         MemFree (title);
354         if (slp != NULL)
355         {
356           printf (" Transmembrane Sequence\n");
357         }
358         while (slp != NULL)
359         {
360           start = SeqLocStart (slp);
361           stop = SeqLocStop (slp);
362           printf ("%4ld %4ld   ", (long) (start+1), (long) (stop+1));
363           prestart = start - 5;
364           if (prestart < 0)
365             prestart = 0;
366           poststop = stop + 5;
367           if (poststop > gbsp->bsp->length - 1)
368             poststop = gbsp->bsp->length - 1;
369           for (i = prestart; i < start; i++)
370             printf ("%c",  TO_LOWER ((Char) sequence[i]));
371           for (i = start; i <= stop; i++)
372             printf ("%c", (Char) sequence[i]);
373           for (i = stop + 1; i <= poststop; i++)
374             printf ("%c",  TO_LOWER ((Char) sequence[i]));
375           printf ("\n");
376           cpp = CompilePattern ("[C,D,E,K,N,Q,H,R]", 1);
377           if (cpp != NULL)
378           {
379             sap = PatternMatchBioseq (gbsp->bsp, cpp, 0);
380             pslph = MatchSa2Sl (&sap);
381             printf ("            ");
382             for (i = prestart; i <= poststop; i++)
383             {
384               pslp = pslph;
385               flagCharge = FALSE;
386               while (pslp != NULL)
387               {
388                 pstart = SeqLocStart (pslp);
389                 if (pstart == i)
390                 {
391                   flagCharge = TRUE;
392                   break;
393                 }
394                 pslp = pslp->next;
395               }
396               if (flagCharge)
397                 printf ("%c", (Char) sequence[i]);
398               else
399                 printf ("-");
400             }
401             printf ("\n");
402             ComPatFree (cpp);
403             pslp = pslph;
404             while (pslp != NULL)
405             {
406               pslph = pslp->next;
407               SeqLocFree (pslp);
408               pslp = pslph;
409             }
410           }
411           slp = slp->next;
412         }
413         slp = slph;
414         while (slp != NULL)
415         {
416           slph = slp->next;
417           SeqLocFree (slp);
418           slp = slph;
419         }
420         FilterDatFree (fltp);
421         MemFree (sequence);
422       }
423       else
424       {
425         ErrPostEx (SEV_ERROR, TOP_ERROR, 106,
426                    "Not a peptide bioseq");
427         ErrShow ();
428         exit (1);
429       }
430     }
431     else
432     {
433       ErrPostEx (SEV_ERROR, TOP_ERROR, 105,
434                  "No bioseq found");
435       ErrShow ();
436       exit (1);
437     }
438     SeqEntryFree (sep);
439     sep = NULL;
440     if (fiop != NULL)
441       sep = FastaToSeqEntry (fiop, FALSE);
442   }
443 
444   ComProfFree (leadprofile);
445   ComProfFree (cutprofile);
446   FileClose (fiop);
447   if (gi > 0)
448     EntrezFini ();
449 
450   return 0;
451 }
452