1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: srchaa.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.15 $
33 *
34 * File Description: peptide pattern match
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: srchaa.c,v $
41 * Revision 6.15 1999/07/13 13:07:32 sicotte
42 * prefix functions SeqAlignSort* with URK to resolve toolkit conflicts
43 *
44 * Revision 6.14 1999/02/26 18:51:54 kuzio
45 * mismatch option
46 *
47 * Revision 6.13 1999/02/25 20:34:19 kuzio
48 * print title option
49 *
50 * Revision 6.12 1998/12/18 16:24:56 kuzio
51 * big GIs
52 *
53 * Revision 6.11 1998/09/21 14:48:54 kuzio
54 * cmdln pattern option
55 *
56 * Revision 6.10 1998/09/16 18:19:30 kuzio
57 * cvs logging
58 *
59 * ==========================================================================
60 */
61
62 /*
63 CC /TAXO-RANGE=ABEPV;
64 - `A' stands for archaebacteria Archaea
65 - `B' stands for bacteriophages Viruses
66 - `E' stands for eukaryotes Eukaryotae
67 - `P' stands for prokaryotes Eubacteria
68 - `V' stands for eukaryotic viruses Viruses
69 */
70
71 /*
72 Patterns: explanation and examples:
73
74 X - any amino acid
75 [xyz] - one of x y or z
76 (#1:#2) - in the range of
77
78 M (2:4)[ST] P
79 a methionine
80 a run of 2 to 4 serine or threonine residues
81 a proline after the serine/threonine run
82
83 YES: GRMSTSTPGLV
84 ++++++
85
86 NO: GRMSTSTLVPG
87 +++++!
88
89 (6:12)X
90 defines a variable region in the range of 6 to 12 amino acid residues
91
92 see - ncbipros.dat for Prosite data
93 - ncbipnam.dat for sample pattern names file
94 - ncbiendo.dat for endopeptidase data
95 */
96
97 #include <ncbi.h>
98 #include <accentr.h>
99 #include <gather.h>
100
101 #ifndef NO_TAX_NET
102 #include <taxutil.h>
103 #endif
104
105 #include <tofasta.h>
106 #include <urkutil.h>
107 #include <urkptpf.h>
108
109 #define TOP_ERROR 1
110 static char _this_module[] = "srchaa";
111 #undef THIS_MODULE
112 #define THIS_MODULE _this_module
113 static char _this_file[] = __FILE__;
114 #undef THIS_FILE
115 #define THIS_FILE _this_file
116
117 typedef struct gather_TaxId
118 {
119 Int4 taxid;
120 } Gather_TaxId, PNTR Gather_TaxIdPtr;
121
122 typedef struct gather_Prot_Bioseq
123 {
124 Int4 gi;
125 BioseqPtr bsp;
126 } Gather_PBS, PNTR Gather_PBSPtr;
127
128 Args myargs[] =
129 {
130 { "protein GI", "0", "0", "9000000", TRUE,
131 'g', ARG_INT, 0.0, 0, NULL},
132 { "FastA file", NULL, NULL, NULL, TRUE,
133 'f', ARG_STRING, 0.0, 0, NULL },
134 { "use Prosite SkipFlag", "FALSE", "FALSE", "TRUE", TRUE,
135 's', ARG_BOOLEAN, 0.0, 0, NULL },
136 { "cut-off below r random score [25]", "1000000", "0", "1000000", TRUE,
137 'r', ARG_INT, 0.0, 0, NULL},
138 #ifndef NO_TAX_NET
139 { "use Prosite TaxonomyFlag", "FALSE", "FALSE", "TRUE", TRUE,
140 't', ARG_BOOLEAN, 0.0, 0, NULL },
141 #endif
142 { "protease digest information", "FALSE", "FALSE", "TRUE", TRUE,
143 'p', ARG_BOOLEAN, 0.0, 0, NULL },
144 { "sort digest by molecular weight", "FALSE", "FALSE", "TRUE", TRUE,
145 'm', ARG_BOOLEAN, 0.0, 0, NULL },
146 { "use ncbipnam.dat names file", "FALSE", "FALSE", "TRUE", TRUE,
147 'n', ARG_BOOLEAN, 0.0, 0, NULL},
148 { "user names file", NULL, NULL, NULL, TRUE,
149 'N', ARG_STRING, 0.0, 0, NULL },
150 { "user pattern", NULL, NULL, NULL, TRUE,
151 'S', ARG_STRING, 0.0, 0, NULL },
152 { "title on hit only", "FALSE", "FALSE", "TRUE", TRUE,
153 'T', ARG_BOOLEAN, 0.0, 0, NULL},
154 { "mismatch", "0", "0", "2", TRUE,
155 'M', ARG_INT, 0.0, 0, NULL}
156 };
157
158 #ifndef NO_TAX_NET
159
WhatOrg(Int4 taxid,CharPtr taxon)160 static Boolean WhatOrg (Int4 taxid, CharPtr taxon)
161 {
162 OrgRefPtr orp;
163 CharPtr lineage;
164 CharPtr nodes[] = {"Archaea", "Viruses", "Eukaryotae", "Eubacteria"};
165 Int4 i;
166 Boolean flagTaxHit;
167
168 /*
169 orp pointers only; mem doesn't have to be free'd
170 */
171 if ((orp = tax1_getOrgRef (taxid, NULL, NULL, NULL)) != NULL)
172 {
173 if (orp->taxname != NULL)
174 {
175 flagTaxHit = FALSE;
176 if ((lineage = get_lineage (orp->taxname)) != NULL)
177 {
178 for (i = 0; i < 4; i++)
179 {
180 if (StrStr (lineage, nodes[i]))
181 {
182 switch (i)
183 {
184 case 0:
185 taxon[0] = 'A';
186 flagTaxHit = TRUE;
187 break;
188 case 1:
189 taxon[1] = 'B';
190 taxon[4] = 'V';
191 flagTaxHit = TRUE;
192 break;
193 case 2:
194 taxon[2] = 'E';
195 flagTaxHit = TRUE;
196 break;
197 case 3:
198 taxon[3] = 'P';
199 flagTaxHit = TRUE;
200 break;
201 }
202 }
203 }
204 MemFree (lineage);
205 return flagTaxHit;
206 }
207 }
208 }
209 return FALSE;
210 }
211
GetTaxId(GatherContextPtr gcp)212 static Boolean GetTaxId (GatherContextPtr gcp)
213 {
214 Gather_TaxIdPtr gtip;
215 ValNodePtr desc;
216 PdbBlockPtr pdbp;
217 BioSourcePtr biop;
218 OrgRefPtr orp;
219 Int4 taxid;
220 CharPtr source, aptr, bptr;
221
222 if (gcp == NULL)
223 return FALSE;
224 if ((gtip = (Gather_TaxIdPtr) gcp->userdata) == NULL)
225 return FALSE;
226
227 if (gtip->taxid != 0)
228 return TRUE;
229 if (gcp->thistype != OBJ_SEQDESC)
230 return TRUE;
231 if ((desc = (ValNodePtr) (gcp->thisitem)) == NULL)
232 return TRUE;
233
234 while (desc != NULL)
235 {
236 biop = NULL;
237 switch (desc->choice)
238 {
239 case Seq_descr_source:
240 biop = (BioSourcePtr) desc->data.ptrvalue;
241 break;
242 case Seq_descr_pdb:
243 pdbp = (PdbBlockPtr) desc->data.ptrvalue;
244 source = (CharPtr) pdbp->source->data.ptrvalue;
245 aptr = StrStr (source, "Organism_scientific: ");
246 aptr += StrLen ("Organism_scientific: ");
247 bptr = StrChr (aptr, ';');
248 *bptr = '\0';
249 if ((taxid = tax1_getTaxIdByName (aptr)) != 0)
250 {
251 gtip->taxid = taxid;
252 return TRUE;
253 }
254 break;
255 default:
256 break;
257 }
258 if (biop != NULL)
259 {
260 orp = biop->org;
261 if (orp != NULL)
262 {
263 if ((taxid = tax1_getTaxIdByOrgRef (orp)) != 0)
264 {
265 gtip->taxid = taxid;
266 return TRUE;
267 }
268 }
269 }
270 desc = desc->next;
271 }
272 return FALSE;
273 }
274
275 #endif
276
GetBioseq(GatherContextPtr gcp)277 static Boolean GetBioseq (GatherContextPtr gcp)
278 {
279 Gather_PBSPtr gpbsp;
280 BioseqPtr bsp;
281 Int4 gi, entrezgi;
282
283 if (gcp == NULL)
284 return FALSE;
285 if ((gpbsp = (Gather_PBSPtr) gcp->userdata) == NULL)
286 return FALSE;
287
288 if (gpbsp->bsp != NULL)
289 return TRUE;
290 if (gcp->thistype != OBJ_BIOSEQ)
291 return TRUE;
292 if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
293 return TRUE;
294
295 gi = gpbsp->gi;
296 if (gi > 0)
297 {
298 entrezgi = GetGIForSeqId (bsp->id);
299 if (gi == entrezgi)
300 gpbsp->bsp = bsp;
301 return TRUE;
302 }
303 else
304 {
305 gpbsp->bsp = bsp;
306 return TRUE;
307 }
308 }
309
Main(void)310 Int2 Main (void)
311 {
312 Int2 argcount;
313 Boolean flagHaveNet;
314
315 Int4 gi;
316 SeqEntryPtr sep;
317 ComPatPtr cpp, cpph = NULL;
318 SeqAlignPtr sap, sapn;
319 StdSegPtr ssp;
320 SeqLocPtr slp, slpn;
321 Int4 start, stop;
322
323 FILE *fiop;
324 Char fastafile[256], namesfile[256];
325 CharPtr title;
326 CharPtr taxon;
327
328 FloatHi mw;
329 ValNodePtr namelist = NULL;
330
331 static CharPtr pattern_file = "ncbipros.dat";
332 static CharPtr protease_file = "ncbiendo.dat";
333 static CharPtr names_file = "ncbipnam.dat";
334
335 static GatherScope gs;
336 GatherScopePtr gsp;
337 static Gather_PBS gpbs;
338 Gather_PBSPtr gpbsp;
339
340 #ifndef NO_TAX_NET
341 Int4 i;
342 static Char taxdata[8];
343 static Gather_TaxId gti;
344 Gather_TaxIdPtr gtip;
345 #endif
346
347 #ifndef NO_TAX_NET
348 Int2 ia=4, ib=5, ic=6, id=7, ie=8, ig=9, ih=10, ii=11;
349 #else
350 Int2 ib=4, ic=5, id=6, ie=7, ig=8, ih=9, ii=10;
351 #endif
352
353 argcount = sizeof (myargs) / sizeof (Args);
354 if (!GetArgs ("ProSiteSearch", argcount, myargs))
355 return 1;
356
357 if (myargs[0].intvalue == 0 && myargs[1].strvalue == NULL)
358 {
359 ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
360 "No gi or FastA file given :: for help : srchaa -");
361 ErrShow ();
362 exit (1);
363 }
364
365 gsp = &gs;
366
367 #ifndef NO_TAX_NET
368 gtip = >i;
369 #endif
370 gpbsp = &gpbs;
371
372 MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
373 MemSet ((Pointer) gsp->ignore, (int) (TRUE),
374 (size_t) (OBJ_MAX * sizeof (Boolean)));
375
376 gsp->ignore[OBJ_SEQDESC] = TRUE;
377 gsp->ignore[OBJ_BIOSEQ] = FALSE;
378
379 gpbsp->bsp = NULL;
380
381 gi = myargs[0].intvalue;
382 if (myargs[1].strvalue != NULL)
383 StrCpy (fastafile, myargs[1].strvalue);
384 else
385 fastafile[0] = '\0';
386
387 if (gi > 0)
388 {
389 if (!EntrezInit ("srchaa", FALSE, &flagHaveNet))
390 {
391 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
392 "Entrez init failed");
393 ErrShow ();
394 exit (1);
395 }
396 }
397
398 #ifndef NO_TAX_NET
399 if (myargs[ia].intvalue)
400 {
401 if (!TaxArchInit ())
402 {
403 ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
404 "Taxonomy init failed");
405 ErrShow ();
406 exit (1);
407 }
408 }
409 #endif
410
411 fiop = NULL;
412 if (gi > 0)
413 {
414 sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
415 }
416 else
417 {
418 if ((fiop = FileOpen (fastafile, "r")) == NULL)
419 {
420 ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
421 "Failed to open FastA file: %s", fastafile);
422 ErrShow ();
423 exit (1);
424 }
425 sep = FastaToSeqEntry (fiop, FALSE);
426 }
427
428 if (sep == NULL)
429 {
430 ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
431 "No seqentry found");
432 ErrShow ();
433 exit (1);
434 }
435
436 while (sep != NULL)
437 {
438 gsp->ignore[OBJ_SEQDESC] = TRUE;
439 gsp->ignore[OBJ_BIOSEQ] = FALSE;
440 gpbsp->bsp = NULL;
441 gpbsp->gi = gi;
442 GatherSeqEntry (sep, (Pointer) gpbsp, GetBioseq, (Pointer) gsp);
443
444 taxon = NULL;
445 #ifndef NO_TAX_NET
446 if (myargs[ia].intvalue)
447 {
448 for (i = 0; i < 8; i++)
449 taxdata[i] = '-';
450 taxon = taxdata;
451
452 gsp->ignore[OBJ_SEQDESC] = FALSE;
453 gsp->ignore[OBJ_BIOSEQ] = TRUE;
454
455 gtip->taxid = 0;
456 GatherSeqEntry (sep, (Pointer) gtip, GetTaxId, (Pointer) gsp);
457
458 if (gtip->taxid != 0)
459 WhatOrg (gtip->taxid, taxon);
460 else
461 taxon = NULL;
462 }
463 #endif
464
465 if (gpbsp->bsp != NULL)
466 {
467 if (ISA_aa (gpbsp->bsp->mol))
468 {
469 if (cpph == NULL)
470 {
471 namesfile[0] = '\0';
472 if (myargs[id].intvalue)
473 StrCpy (namesfile, names_file);
474 if (myargs[ie].strvalue != NULL)
475 StrCpy (namesfile, myargs[ie].strvalue);
476
477 if (myargs[ig].strvalue != NULL)
478 {
479 if ((cpph = CompilePattern (myargs[ig].strvalue, 1)) != NULL)
480 StrCpy (cpph->name, "User Pattern");
481 }
482 else
483 {
484 namelist = ReadPatternNames (namesfile);
485 if (myargs[ib].intvalue)
486 cpph = ReadPrositePattern (protease_file,
487 (Boolean) myargs[2].intvalue,
488 myargs[3].intvalue,
489 taxon, NULL);
490 else
491 cpph = ReadPrositePattern (pattern_file,
492 (Boolean) myargs[2].intvalue,
493 myargs[3].intvalue,
494 taxon, namelist);
495 }
496 }
497
498 if (!(Boolean) myargs[ih].intvalue)
499 {
500 title = FastaTitle (gpbsp->bsp, ">", NULL);
501 printf ("%s\n", title);
502 MemFree (title);
503 }
504 cpp = cpph;
505 while (cpp != NULL)
506 {
507 sap = PatternMatchBioseq (gpbsp->bsp, cpp,
508 (Int4)myargs[ii].intvalue);
509 if (myargs[ib].intvalue)
510 {
511 printf (">%s\n", cpp->name);
512 if (sap != NULL)
513 printf (" Start Stop M.W.\n");
514 }
515 if (myargs[ib].intvalue)
516 {
517 EmbedMolecularWeightInfo (sap, gpbsp->bsp);
518 if (myargs[ic].intvalue)
519 URK_SeqAlignSortByMolWt (&sap);
520 while (sap != NULL)
521 {
522 ssp = (StdSegPtr) sap->segs;
523 slp = ssp->loc;
524 start = SeqLocStart (slp);
525 stop = SeqLocStop (slp);
526 mw = ssp->scores->value.realvalue;
527 printf ("%8ld %8ld %9.2f\n",
528 (long) start+1, (long) stop+1, mw);
529 sapn = sap->next;
530 SeqAlignFree (sap);
531 sap = sapn;
532 }
533 }
534 else
535 {
536 slp = MatchSa2Sl (&sap);
537 if (myargs[ih].intvalue && slp != NULL)
538 {
539 title = FastaTitle (gpbsp->bsp, ">", NULL);
540 printf ("%s\n", title);
541 MemFree (title);
542 }
543 while (slp != NULL)
544 {
545 start = SeqLocStart (slp);
546 stop = SeqLocStop (slp);
547 printf ("%8ld %8ld %s\n",
548 (long) start+1, (long) stop+1, cpp->name);
549 slpn = slp->next;
550 SeqLocFree (slp);
551 slp = slpn;
552 }
553 }
554 cpp = cpp->nextpattern;
555 }
556 }
557 else
558 {
559 ErrPostEx (SEV_ERROR, TOP_ERROR, 106,
560 "Not a protein bioseq");
561 ErrShow ();
562 exit (1);
563 }
564 }
565 else
566 {
567 ErrPostEx (SEV_ERROR, TOP_ERROR, 105,
568 "No bioseq found");
569 ErrShow ();
570 exit (1);
571 }
572 SeqEntryFree (sep);
573 sep = NULL;
574 if (fiop != NULL)
575 sep = FastaToSeqEntry (fiop, FALSE);
576 }
577
578 ComPatFree (cpph);
579 ValNodeFreeData (namelist);
580 FileClose (fiop);
581 if (gi > 0)
582 EntrezFini ();
583 #ifndef NO_TAX_NET
584 if (myargs[ia].intvalue)
585 TaxArchFini ();
586 #endif
587 return 0;
588 }
589