1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: srchaa.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.15 $
33 *
34 * File Description: peptide pattern match
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date       Name        Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: srchaa.c,v $
41 * Revision 6.15  1999/07/13 13:07:32  sicotte
42 * prefix functions SeqAlignSort* with URK to resolve toolkit conflicts
43 *
44 * Revision 6.14  1999/02/26 18:51:54  kuzio
45 * mismatch option
46 *
47 * Revision 6.13  1999/02/25 20:34:19  kuzio
48 * print title option
49 *
50 * Revision 6.12  1998/12/18 16:24:56  kuzio
51 * big GIs
52 *
53 * Revision 6.11  1998/09/21 14:48:54  kuzio
54 * cmdln pattern option
55 *
56 * Revision 6.10  1998/09/16 18:19:30  kuzio
57 * cvs logging
58 *
59 * ==========================================================================
60 */
61 
62 /*
63 CC   /TAXO-RANGE=ABEPV;
64    -  `A' stands for archaebacteria      Archaea
65    -  `B' stands for bacteriophages      Viruses
66    -  `E' stands for eukaryotes          Eukaryotae
67    -  `P' stands for prokaryotes         Eubacteria
68    -  `V' stands for eukaryotic viruses  Viruses
69 */
70 
71 /*
72 Patterns: explanation and examples:
73 
74 X       - any amino acid
75 [xyz]   - one of x y or z
76 (#1:#2) - in the range of
77 
78 M (2:4)[ST] P
79  a methionine
80  a run of 2 to 4 serine or threonine residues
81  a proline after the serine/threonine run
82 
83  YES: GRMSTSTPGLV
84         ++++++
85 
86  NO:  GRMSTSTLVPG
87         +++++!
88 
89 (6:12)X
90  defines a variable region in the range of 6 to 12 amino acid residues
91 
92 see - ncbipros.dat for Prosite data
93     - ncbipnam.dat for sample pattern names file
94     - ncbiendo.dat for endopeptidase data
95 */
96 
97 #include <ncbi.h>
98 #include <accentr.h>
99 #include <gather.h>
100 
101 #ifndef NO_TAX_NET
102 #include <taxutil.h>
103 #endif
104 
105 #include <tofasta.h>
106 #include <urkutil.h>
107 #include <urkptpf.h>
108 
109 #define TOP_ERROR 1
110 static char _this_module[] = "srchaa";
111 #undef  THIS_MODULE
112 #define THIS_MODULE _this_module
113 static char _this_file[] = __FILE__;
114 #undef  THIS_FILE
115 #define THIS_FILE _this_file
116 
117 typedef struct gather_TaxId
118 {
119   Int4   taxid;
120 } Gather_TaxId, PNTR Gather_TaxIdPtr;
121 
122 typedef struct gather_Prot_Bioseq
123 {
124   Int4      gi;
125   BioseqPtr bsp;
126 } Gather_PBS, PNTR Gather_PBSPtr;
127 
128 Args myargs[] =
129 {
130   { "protein GI", "0", "0", "9000000", TRUE,
131     'g', ARG_INT, 0.0, 0, NULL},
132   { "FastA file", NULL, NULL, NULL, TRUE,
133     'f', ARG_STRING, 0.0, 0, NULL },
134   { "use Prosite SkipFlag", "FALSE", "FALSE", "TRUE", TRUE,
135     's', ARG_BOOLEAN, 0.0, 0, NULL },
136   { "cut-off below  r  random score [25]", "1000000", "0", "1000000", TRUE,
137     'r', ARG_INT, 0.0, 0, NULL},
138 #ifndef NO_TAX_NET
139   { "use Prosite TaxonomyFlag", "FALSE", "FALSE", "TRUE", TRUE,
140     't', ARG_BOOLEAN, 0.0, 0, NULL },
141 #endif
142   { "protease digest information", "FALSE", "FALSE", "TRUE", TRUE,
143     'p', ARG_BOOLEAN, 0.0, 0, NULL },
144   { "sort digest by molecular weight", "FALSE", "FALSE", "TRUE", TRUE,
145     'm', ARG_BOOLEAN, 0.0, 0, NULL },
146   { "use ncbipnam.dat names file", "FALSE", "FALSE", "TRUE", TRUE,
147     'n', ARG_BOOLEAN, 0.0, 0, NULL},
148   { "user names file", NULL, NULL, NULL, TRUE,
149     'N', ARG_STRING, 0.0, 0, NULL },
150   { "user pattern", NULL, NULL, NULL, TRUE,
151     'S', ARG_STRING, 0.0, 0, NULL },
152   { "title on hit only", "FALSE", "FALSE", "TRUE", TRUE,
153     'T', ARG_BOOLEAN, 0.0, 0, NULL},
154   { "mismatch", "0", "0", "2", TRUE,
155     'M', ARG_INT, 0.0, 0, NULL}
156 };
157 
158 #ifndef NO_TAX_NET
159 
WhatOrg(Int4 taxid,CharPtr taxon)160 static Boolean WhatOrg (Int4 taxid, CharPtr taxon)
161 {
162   OrgRefPtr  orp;
163   CharPtr    lineage;
164   CharPtr    nodes[] = {"Archaea", "Viruses", "Eukaryotae", "Eubacteria"};
165   Int4       i;
166   Boolean    flagTaxHit;
167 
168 /*
169  orp pointers only; mem doesn't have to be free'd
170 */
171   if ((orp = tax1_getOrgRef (taxid, NULL, NULL, NULL)) != NULL)
172   {
173     if (orp->taxname != NULL)
174     {
175       flagTaxHit = FALSE;
176       if ((lineage = get_lineage (orp->taxname)) != NULL)
177       {
178         for (i = 0; i < 4; i++)
179         {
180           if (StrStr (lineage, nodes[i]))
181           {
182             switch (i)
183             {
184              case 0:
185               taxon[0] = 'A';
186               flagTaxHit = TRUE;
187               break;
188              case 1:
189               taxon[1] = 'B';
190               taxon[4] = 'V';
191               flagTaxHit = TRUE;
192               break;
193              case 2:
194               taxon[2] = 'E';
195               flagTaxHit = TRUE;
196               break;
197              case 3:
198               taxon[3] = 'P';
199               flagTaxHit = TRUE;
200               break;
201             }
202           }
203         }
204         MemFree (lineage);
205         return flagTaxHit;
206       }
207     }
208   }
209   return FALSE;
210 }
211 
GetTaxId(GatherContextPtr gcp)212 static Boolean GetTaxId (GatherContextPtr gcp)
213 {
214   Gather_TaxIdPtr  gtip;
215   ValNodePtr       desc;
216   PdbBlockPtr      pdbp;
217   BioSourcePtr     biop;
218   OrgRefPtr        orp;
219   Int4             taxid;
220   CharPtr          source, aptr, bptr;
221 
222   if (gcp == NULL)
223     return FALSE;
224   if ((gtip = (Gather_TaxIdPtr) gcp->userdata) == NULL)
225     return FALSE;
226 
227   if (gtip->taxid != 0)
228     return TRUE;
229   if (gcp->thistype != OBJ_SEQDESC)
230     return TRUE;
231   if ((desc = (ValNodePtr) (gcp->thisitem)) == NULL)
232     return TRUE;
233 
234   while (desc != NULL)
235   {
236     biop = NULL;
237     switch (desc->choice)
238     {
239      case Seq_descr_source:
240       biop = (BioSourcePtr) desc->data.ptrvalue;
241       break;
242      case Seq_descr_pdb:
243       pdbp = (PdbBlockPtr) desc->data.ptrvalue;
244       source = (CharPtr) pdbp->source->data.ptrvalue;
245       aptr = StrStr (source, "Organism_scientific: ");
246       aptr += StrLen ("Organism_scientific: ");
247       bptr = StrChr (aptr, ';');
248       *bptr = '\0';
249       if ((taxid = tax1_getTaxIdByName (aptr)) != 0)
250       {
251         gtip->taxid = taxid;
252         return TRUE;
253       }
254       break;
255      default:
256       break;
257     }
258     if (biop != NULL)
259     {
260       orp = biop->org;
261       if (orp != NULL)
262       {
263         if ((taxid = tax1_getTaxIdByOrgRef (orp)) != 0)
264         {
265           gtip->taxid = taxid;
266           return TRUE;
267         }
268       }
269     }
270     desc = desc->next;
271   }
272   return FALSE;
273 }
274 
275 #endif
276 
GetBioseq(GatherContextPtr gcp)277 static Boolean GetBioseq (GatherContextPtr gcp)
278 {
279   Gather_PBSPtr  gpbsp;
280   BioseqPtr      bsp;
281   Int4           gi, entrezgi;
282 
283   if (gcp == NULL)
284     return FALSE;
285   if ((gpbsp = (Gather_PBSPtr) gcp->userdata) == NULL)
286     return FALSE;
287 
288   if (gpbsp->bsp != NULL)
289     return TRUE;
290   if (gcp->thistype != OBJ_BIOSEQ)
291     return TRUE;
292   if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
293     return TRUE;
294 
295   gi = gpbsp->gi;
296   if (gi > 0)
297   {
298     entrezgi = GetGIForSeqId (bsp->id);
299     if (gi == entrezgi)
300       gpbsp->bsp = bsp;
301     return TRUE;
302   }
303   else
304   {
305     gpbsp->bsp = bsp;
306     return TRUE;
307   }
308 }
309 
Main(void)310 Int2 Main (void)
311 {
312   Int2        argcount;
313   Boolean     flagHaveNet;
314 
315   Int4        gi;
316   SeqEntryPtr sep;
317   ComPatPtr   cpp, cpph = NULL;
318   SeqAlignPtr sap, sapn;
319   StdSegPtr   ssp;
320   SeqLocPtr   slp, slpn;
321   Int4        start, stop;
322 
323   FILE        *fiop;
324   Char        fastafile[256], namesfile[256];
325   CharPtr     title;
326   CharPtr     taxon;
327 
328   FloatHi     mw;
329   ValNodePtr  namelist = NULL;
330 
331   static CharPtr pattern_file = "ncbipros.dat";
332   static CharPtr protease_file = "ncbiendo.dat";
333   static CharPtr names_file = "ncbipnam.dat";
334 
335   static GatherScope  gs;
336   GatherScopePtr      gsp;
337   static Gather_PBS   gpbs;
338   Gather_PBSPtr       gpbsp;
339 
340 #ifndef NO_TAX_NET
341   Int4   i;
342   static Char taxdata[8];
343   static Gather_TaxId gti;
344   Gather_TaxIdPtr     gtip;
345 #endif
346 
347 #ifndef NO_TAX_NET
348   Int2   ia=4, ib=5, ic=6, id=7, ie=8, ig=9, ih=10, ii=11;
349 #else
350   Int2         ib=4, ic=5, id=6, ie=7, ig=8, ih=9,  ii=10;
351 #endif
352 
353   argcount = sizeof (myargs) / sizeof (Args);
354   if (!GetArgs ("ProSiteSearch", argcount, myargs))
355     return 1;
356 
357   if (myargs[0].intvalue == 0 && myargs[1].strvalue == NULL)
358   {
359     ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
360                "No gi or FastA file given :: for help :   srchaa -");
361     ErrShow ();
362     exit (1);
363   }
364 
365   gsp = &gs;
366 
367 #ifndef NO_TAX_NET
368   gtip = &gti;
369 #endif
370   gpbsp = &gpbs;
371 
372   MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
373   MemSet ((Pointer) gsp->ignore, (int) (TRUE),
374           (size_t) (OBJ_MAX * sizeof (Boolean)));
375 
376   gsp->ignore[OBJ_SEQDESC] = TRUE;
377   gsp->ignore[OBJ_BIOSEQ] = FALSE;
378 
379   gpbsp->bsp = NULL;
380 
381   gi = myargs[0].intvalue;
382   if (myargs[1].strvalue != NULL)
383     StrCpy (fastafile, myargs[1].strvalue);
384   else
385     fastafile[0] = '\0';
386 
387   if (gi > 0)
388   {
389     if (!EntrezInit ("srchaa", FALSE, &flagHaveNet))
390     {
391       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
392                  "Entrez init failed");
393       ErrShow ();
394       exit (1);
395     }
396   }
397 
398 #ifndef NO_TAX_NET
399   if (myargs[ia].intvalue)
400   {
401     if (!TaxArchInit ())
402     {
403       ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
404                  "Taxonomy init failed");
405       ErrShow ();
406       exit (1);
407     }
408   }
409 #endif
410 
411   fiop = NULL;
412   if (gi > 0)
413   {
414     sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
415   }
416   else
417   {
418     if ((fiop = FileOpen (fastafile, "r")) == NULL)
419     {
420       ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
421                  "Failed to open FastA file: %s", fastafile);
422       ErrShow ();
423       exit (1);
424     }
425     sep = FastaToSeqEntry (fiop, FALSE);
426   }
427 
428   if (sep == NULL)
429   {
430     ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
431                "No seqentry found");
432     ErrShow ();
433     exit (1);
434   }
435 
436   while (sep != NULL)
437   {
438     gsp->ignore[OBJ_SEQDESC] = TRUE;
439     gsp->ignore[OBJ_BIOSEQ] = FALSE;
440     gpbsp->bsp = NULL;
441     gpbsp->gi = gi;
442     GatherSeqEntry (sep, (Pointer) gpbsp, GetBioseq, (Pointer) gsp);
443 
444     taxon = NULL;
445 #ifndef NO_TAX_NET
446     if (myargs[ia].intvalue)
447     {
448       for (i = 0; i < 8; i++)
449         taxdata[i] = '-';
450       taxon = taxdata;
451 
452       gsp->ignore[OBJ_SEQDESC] = FALSE;
453       gsp->ignore[OBJ_BIOSEQ] = TRUE;
454 
455       gtip->taxid = 0;
456       GatherSeqEntry (sep, (Pointer) gtip, GetTaxId, (Pointer) gsp);
457 
458       if (gtip->taxid != 0)
459         WhatOrg (gtip->taxid, taxon);
460       else
461         taxon = NULL;
462     }
463 #endif
464 
465     if (gpbsp->bsp != NULL)
466     {
467       if (ISA_aa (gpbsp->bsp->mol))
468       {
469         if (cpph == NULL)
470         {
471           namesfile[0] = '\0';
472           if (myargs[id].intvalue)
473             StrCpy (namesfile, names_file);
474           if (myargs[ie].strvalue != NULL)
475             StrCpy (namesfile, myargs[ie].strvalue);
476 
477           if (myargs[ig].strvalue != NULL)
478           {
479             if ((cpph = CompilePattern (myargs[ig].strvalue, 1)) != NULL)
480               StrCpy (cpph->name, "User Pattern");
481           }
482           else
483           {
484             namelist = ReadPatternNames (namesfile);
485             if (myargs[ib].intvalue)
486               cpph = ReadPrositePattern (protease_file,
487                                          (Boolean) myargs[2].intvalue,
488                                          myargs[3].intvalue,
489                                          taxon, NULL);
490             else
491               cpph = ReadPrositePattern (pattern_file,
492                                          (Boolean) myargs[2].intvalue,
493                                          myargs[3].intvalue,
494                                          taxon, namelist);
495           }
496         }
497 
498         if (!(Boolean) myargs[ih].intvalue)
499         {
500           title = FastaTitle (gpbsp->bsp, ">", NULL);
501           printf ("%s\n", title);
502           MemFree (title);
503         }
504         cpp = cpph;
505         while (cpp != NULL)
506         {
507           sap = PatternMatchBioseq (gpbsp->bsp, cpp,
508                                     (Int4)myargs[ii].intvalue);
509           if (myargs[ib].intvalue)
510           {
511             printf (">%s\n", cpp->name);
512             if (sap != NULL)
513               printf ("   Start     Stop       M.W.\n");
514           }
515           if (myargs[ib].intvalue)
516           {
517             EmbedMolecularWeightInfo (sap, gpbsp->bsp);
518             if (myargs[ic].intvalue)
519               URK_SeqAlignSortByMolWt (&sap);
520             while (sap != NULL)
521             {
522               ssp = (StdSegPtr) sap->segs;
523               slp = ssp->loc;
524               start = SeqLocStart (slp);
525               stop = SeqLocStop (slp);
526               mw = ssp->scores->value.realvalue;
527               printf ("%8ld %8ld    %9.2f\n",
528                       (long) start+1, (long) stop+1, mw);
529               sapn = sap->next;
530               SeqAlignFree (sap);
531               sap = sapn;
532             }
533           }
534           else
535           {
536             slp = MatchSa2Sl (&sap);
537             if (myargs[ih].intvalue && slp != NULL)
538             {
539               title = FastaTitle (gpbsp->bsp, ">", NULL);
540               printf ("%s\n", title);
541               MemFree (title);
542             }
543             while (slp != NULL)
544             {
545               start = SeqLocStart (slp);
546               stop = SeqLocStop (slp);
547               printf ("%8ld %8ld    %s\n",
548                       (long) start+1, (long) stop+1, cpp->name);
549               slpn = slp->next;
550               SeqLocFree (slp);
551               slp = slpn;
552             }
553           }
554           cpp = cpp->nextpattern;
555         }
556       }
557       else
558       {
559         ErrPostEx (SEV_ERROR, TOP_ERROR, 106,
560                    "Not a protein bioseq");
561         ErrShow ();
562         exit (1);
563       }
564     }
565     else
566     {
567       ErrPostEx (SEV_ERROR, TOP_ERROR, 105,
568                  "No bioseq found");
569       ErrShow ();
570       exit (1);
571     }
572     SeqEntryFree (sep);
573     sep = NULL;
574     if (fiop != NULL)
575       sep = FastaToSeqEntry (fiop, FALSE);
576   }
577 
578   ComPatFree (cpph);
579   ValNodeFreeData (namelist);
580   FileClose (fiop);
581   if (gi > 0)
582     EntrezFini ();
583 #ifndef NO_TAX_NET
584   if (myargs[ia].intvalue)
585     TaxArchFini ();
586 #endif
587   return 0;
588 }
589