1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: srchnt.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.16 $
33 *
34 * File Description: nucleotide pattern match
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date       Name        Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: srchnt.c,v $
41 * Revision 6.16  1999/07/13 13:07:32  sicotte
42 * prefix functions SeqAlignSort* with URK to resolve toolkit conflicts
43 *
44 * Revision 6.15  1999/02/26 18:52:11  kuzio
45 * correct cast
46 *
47 * Revision 6.14  1998/12/18 16:24:57  kuzio
48 * big GIs
49 *
50 * Revision 6.13  1998/09/21 14:48:54  kuzio
51 * cmdln pattern option
52 *
53 * Revision 6.12  1998/09/16 18:19:31  kuzio
54 * cvs logging
55 *
56 * ==========================================================================
57 */
58 
59 /*
60 Patterns: explanation and examples:
61 
62 N       - any nucleotide
63 [xyz]   - one of x y or z
64 (#1:#2) - in the range of
65 
66 a class of enhancer sequence:
67 
68 G[CT][ST][GT]T[AT]C[AG][AC]G[AT][AG]S[AG][ACT]
69 [CT][CT]S[CT]AC[CT]CG[AT](1:4)AGC
70 
71 a type of promoter region:
72 
73 [AGT]TAAG   (50:70)N     ANNATG[AG]
74 
75 =========   ========     =========
76 a kind of   50 to 70     Kozak-like
77 promoter    nucleotides  initiator
78 
79 a bogus example (but syntactically correct):
80 
81 A[(3:5)GT]C
82 
83 A
84  followed by
85 (3 to 5 G's)  or  (a single T)
86  followed by
87 C
88 
89 using above would find:     AGGGC AGGGGGC ATCG
90                             +++++ +++++++ +++
91 
92 using above would not find: AGGGT ATTTC   AGGC
93                             ++++! ++!     +++!
94 
95 But, variable differential ranges are not supported.
96 Define two separate patterns.
97 
98 see - ncbiren.dat for REN data
99     - ncbirnam.dat for sample REN names data file
100 */
101 
102 #include <ncbi.h>
103 #include <accentr.h>
104 #include <gather.h>
105 #include <tofasta.h>
106 #include <urkutil.h>
107 #include <urkptpf.h>
108 
109 #define TOP_ERROR 1
110 static char _this_module[] = "srchnt";
111 #undef  THIS_MODULE
112 #define THIS_MODULE _this_module
113 static char _this_file[] = __FILE__;
114 #undef  THIS_FILE
115 #define THIS_FILE _this_file
116 
117 typedef struct gather_Nuc_Bioseq
118 {
119   Int4      gi;
120   BioseqPtr bsp;
121 } Gather_NBS, PNTR Gather_NBSPtr;
122 
123 Args myargs[] =
124 {
125   { "nucleotide GI", "0", "0", "9000000", TRUE,
126     'g', ARG_INT, 0.0, 0, NULL},
127   { "FastA file", NULL, NULL, NULL, TRUE,
128     'f', ARG_STRING, 0.0, 0, NULL },
129   { "local REN file", NULL, NULL, NULL, TRUE,
130     'r', ARG_STRING, 0.0, 0, NULL },
131   { "sort by fragment length", "FALSE", "FALSE", "TRUE", TRUE,
132     's', ARG_BOOLEAN, 0.0, 0, NULL},
133   { "mismatch", "0", "0", "2", TRUE,
134     'm', ARG_INT, 0.0, 0, NULL},
135   { "enzyme with known cutsite only", "TRUE", "FALSE", "TRUE", TRUE,
136     'c', ARG_BOOLEAN, 0.0, 0, NULL},
137   { "use ncbirnam.dat names file", "FALSE", "FALSE", "TRUE", TRUE,
138     'n', ARG_BOOLEAN, 0.0, 0, NULL},
139   { "user names file", NULL, NULL, NULL, TRUE,
140     'N', ARG_STRING, 0.0, 0, NULL },
141   { "user pattern", NULL, NULL, NULL, TRUE,
142     'S', ARG_STRING, 0.0, 0, NULL }
143 };
144 
GetBioseq(GatherContextPtr gcp)145 static Boolean GetBioseq (GatherContextPtr gcp)
146 {
147   Gather_NBSPtr  gnbsp;
148   BioseqPtr      bsp;
149   Int4           gi, entrezgi;
150 
151   if (gcp == NULL)
152     return FALSE;
153   if ((gnbsp = (Gather_NBSPtr) gcp->userdata) == NULL)
154     return FALSE;
155 
156   if (gnbsp->bsp != NULL)
157     return TRUE;
158   if (gcp->thistype != OBJ_BIOSEQ)
159     return TRUE;
160   if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
161     return TRUE;
162 
163   gi = gnbsp->gi;
164   if (gi > 0)
165   {
166     entrezgi = GetGIForSeqId (bsp->id);
167     if (gi == entrezgi)
168       gnbsp->bsp = bsp;
169     return TRUE;
170   }
171   else
172   {
173     gnbsp->bsp = bsp;
174     return TRUE;
175   }
176 }
177 
Main(void)178 Int2 Main (void)
179 {
180   Int2        argcount;
181   Boolean     flagHaveNet;
182 
183   Int4        gi;
184   SeqEntryPtr sep;
185   ComPatPtr   cpp, cpph = NULL;
186   SeqAlignPtr sap, sapn;
187   SeqLocPtr   slp, slpn, slpt;
188   StdSegPtr   ssp;
189   Int4        start;
190 
191   FILE        *fiop;
192   Char        fastafile[256], namesfile[256];
193   CharPtr     title;
194 
195   ValNodePtr  namelist;
196 
197   static CharPtr pattern_file = "ncbiren.dat";
198   static CharPtr names_file = "ncbirnam.dat";
199 
200   static GatherScope  gs;
201   GatherScopePtr      gsp;
202   static Gather_NBS   gpbs;
203   Gather_NBSPtr       gnbsp;
204 
205   argcount = sizeof (myargs) / sizeof (Args);
206   if (!GetArgs ("REN_Search", argcount, myargs))
207     return 1;
208 
209   if (myargs[0].intvalue == 0 && myargs[1].strvalue == NULL)
210   {
211     ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
212                "No gi or FastA file given :: for help :   srchnt -");
213     ErrShow ();
214     exit (1);
215   }
216 
217   gsp = &gs;
218   gnbsp = &gpbs;
219 
220   MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
221   MemSet ((Pointer) gsp->ignore, (int) (TRUE),
222           (size_t) (OBJ_MAX * sizeof (Boolean)));
223 
224   gsp->ignore[OBJ_SEQDESC] = TRUE;
225   gsp->ignore[OBJ_BIOSEQ] = FALSE;
226 
227   gnbsp->bsp = NULL;
228 
229   gi = myargs[0].intvalue;
230   if (myargs[1].strvalue != NULL)
231     StrCpy (fastafile, myargs[1].strvalue);
232   else
233     fastafile[0] = '\0';
234 
235   if (gi > 0)
236   {
237     if (!EntrezInit ("srchnt", FALSE, &flagHaveNet))
238     {
239       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
240                  "Entrez init failed");
241       ErrShow ();
242       exit (1);
243     }
244   }
245 
246   fiop = NULL;
247   if (gi > 0)
248   {
249     sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
250   }
251   else
252   {
253     if ((fiop = FileOpen (fastafile, "r")) == NULL)
254     {
255       ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
256                  "Failed to open FastA file: %s", fastafile);
257       ErrShow ();
258       exit (1);
259     }
260     sep = FastaToSeqEntry (fiop, TRUE);
261   }
262 
263   if (sep == NULL)
264   {
265     ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
266                "No seqentry found");
267     ErrShow ();
268     exit (1);
269   }
270 
271   namesfile[0] = '\0';
272   if (myargs[6].intvalue)
273     StrCpy (namesfile, names_file);
274   if (myargs[7].strvalue != NULL)
275     StrCpy (namesfile, myargs[7].strvalue);
276 
277   namelist = ReadPatternNames (namesfile);
278 
279   while (sep != NULL)
280   {
281     gnbsp->gi = gi;
282     GatherSeqEntry (sep, (Pointer) gnbsp, GetBioseq, (Pointer) gsp);
283 
284     if (gnbsp->bsp != NULL)
285     {
286       if (ISA_na (gnbsp->bsp->mol))
287       {
288         if (cpph == NULL)
289         {
290           if (myargs[8].strvalue != NULL)
291           {
292             if ((cpph = CompilePattern (myargs[8].strvalue, 0)) != NULL)
293               StrCpy (cpph->name, "User Pattern");
294           }
295           else
296           {
297             if (myargs[2].strvalue != NULL)
298               cpph = ReadRENPattern (myargs[2].strvalue,
299                                      (Boolean) myargs[5].intvalue, namelist);
300             else
301               cpph = ReadRENPattern (pattern_file,
302                                      (Boolean) myargs[5].intvalue, namelist);
303           }
304           PalindromeCheck (cpph);
305         }
306         title = FastaTitle (gnbsp->bsp, ">", NULL);
307         printf ("%s\n", title);
308         MemFree (title);
309         cpp = cpph;
310         while (cpp != NULL)
311         {
312           printf (">%s\n", cpp->name);
313           sap = PatternMatchBioseq (gnbsp->bsp, cpp,
314                                     (Int4) myargs[4].intvalue);
315           if (sap != NULL)
316             printf ("   Start\n");
317           URK_SeqAlignSortByStart (&sap);
318           if (myargs[3].intvalue)
319           {
320             EmbedFragLengthInfo (sap, gnbsp->bsp->length);
321             URK_SeqAlignSortByLength (&sap);
322             while (sap != NULL)
323             {
324               ssp = (StdSegPtr) sap->segs;
325               slp = ssp->loc;
326               if (slp->choice != SEQLOC_MIX)
327               {
328                 start = SeqLocStart (slp);
329               }
330               else
331               {
332                 slpt = (SeqLocPtr) slp->data.ptrvalue;
333                 start = SeqLocStart (slpt);
334               }
335               printf ("%8ld  %8ld\n", (long) start+1,
336                                       (long) ssp->scores->value.intvalue);
337               sapn = sap->next;
338               SeqAlignFree (sap);
339               sap = sapn;
340             }
341           }
342           else
343           {
344             slp = MatchSa2Sl (&sap);
345             while (slp != NULL)
346             {
347               if (slp->choice != SEQLOC_MIX)
348               {
349                 start = SeqLocStart (slp);
350               }
351               else
352               {
353                 slpt = (SeqLocPtr) slp->data.ptrvalue;
354                 start = SeqLocStart (slpt);
355               }
356               printf ("%8ld\n", (long) start+1);
357               slpn = slp->next;
358               SeqLocFree (slp);
359               slp = slpn;
360             }
361           }
362           cpp = cpp->nextpattern;
363         }
364       }
365       else
366       {
367         ErrPostEx (SEV_ERROR, TOP_ERROR, 106,
368                    "Not a nucleic bioseq");
369         ErrShow ();
370         exit (1);
371       }
372     }
373     else
374     {
375       ErrPostEx (SEV_ERROR, TOP_ERROR, 105,
376                  "No bioseq found");
377       ErrShow ();
378       exit (1);
379     }
380     SeqEntryFree (sep);
381     sep = NULL;
382     if (fiop != NULL)
383       sep = FastaToSeqEntry (fiop, TRUE);
384   }
385 
386   ComPatFree (cpph);
387   ValNodeFreeData (namelist);
388   FileClose (fiop);
389   if (gi > 0)
390     EntrezFini ();
391 
392   return 0;
393 }
394