1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: srchnt.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.16 $
33 *
34 * File Description: nucleotide pattern match
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: srchnt.c,v $
41 * Revision 6.16 1999/07/13 13:07:32 sicotte
42 * prefix functions SeqAlignSort* with URK to resolve toolkit conflicts
43 *
44 * Revision 6.15 1999/02/26 18:52:11 kuzio
45 * correct cast
46 *
47 * Revision 6.14 1998/12/18 16:24:57 kuzio
48 * big GIs
49 *
50 * Revision 6.13 1998/09/21 14:48:54 kuzio
51 * cmdln pattern option
52 *
53 * Revision 6.12 1998/09/16 18:19:31 kuzio
54 * cvs logging
55 *
56 * ==========================================================================
57 */
58
59 /*
60 Patterns: explanation and examples:
61
62 N - any nucleotide
63 [xyz] - one of x y or z
64 (#1:#2) - in the range of
65
66 a class of enhancer sequence:
67
68 G[CT][ST][GT]T[AT]C[AG][AC]G[AT][AG]S[AG][ACT]
69 [CT][CT]S[CT]AC[CT]CG[AT](1:4)AGC
70
71 a type of promoter region:
72
73 [AGT]TAAG (50:70)N ANNATG[AG]
74
75 ========= ======== =========
76 a kind of 50 to 70 Kozak-like
77 promoter nucleotides initiator
78
79 a bogus example (but syntactically correct):
80
81 A[(3:5)GT]C
82
83 A
84 followed by
85 (3 to 5 G's) or (a single T)
86 followed by
87 C
88
89 using above would find: AGGGC AGGGGGC ATCG
90 +++++ +++++++ +++
91
92 using above would not find: AGGGT ATTTC AGGC
93 ++++! ++! +++!
94
95 But, variable differential ranges are not supported.
96 Define two separate patterns.
97
98 see - ncbiren.dat for REN data
99 - ncbirnam.dat for sample REN names data file
100 */
101
102 #include <ncbi.h>
103 #include <accentr.h>
104 #include <gather.h>
105 #include <tofasta.h>
106 #include <urkutil.h>
107 #include <urkptpf.h>
108
109 #define TOP_ERROR 1
110 static char _this_module[] = "srchnt";
111 #undef THIS_MODULE
112 #define THIS_MODULE _this_module
113 static char _this_file[] = __FILE__;
114 #undef THIS_FILE
115 #define THIS_FILE _this_file
116
117 typedef struct gather_Nuc_Bioseq
118 {
119 Int4 gi;
120 BioseqPtr bsp;
121 } Gather_NBS, PNTR Gather_NBSPtr;
122
123 Args myargs[] =
124 {
125 { "nucleotide GI", "0", "0", "9000000", TRUE,
126 'g', ARG_INT, 0.0, 0, NULL},
127 { "FastA file", NULL, NULL, NULL, TRUE,
128 'f', ARG_STRING, 0.0, 0, NULL },
129 { "local REN file", NULL, NULL, NULL, TRUE,
130 'r', ARG_STRING, 0.0, 0, NULL },
131 { "sort by fragment length", "FALSE", "FALSE", "TRUE", TRUE,
132 's', ARG_BOOLEAN, 0.0, 0, NULL},
133 { "mismatch", "0", "0", "2", TRUE,
134 'm', ARG_INT, 0.0, 0, NULL},
135 { "enzyme with known cutsite only", "TRUE", "FALSE", "TRUE", TRUE,
136 'c', ARG_BOOLEAN, 0.0, 0, NULL},
137 { "use ncbirnam.dat names file", "FALSE", "FALSE", "TRUE", TRUE,
138 'n', ARG_BOOLEAN, 0.0, 0, NULL},
139 { "user names file", NULL, NULL, NULL, TRUE,
140 'N', ARG_STRING, 0.0, 0, NULL },
141 { "user pattern", NULL, NULL, NULL, TRUE,
142 'S', ARG_STRING, 0.0, 0, NULL }
143 };
144
GetBioseq(GatherContextPtr gcp)145 static Boolean GetBioseq (GatherContextPtr gcp)
146 {
147 Gather_NBSPtr gnbsp;
148 BioseqPtr bsp;
149 Int4 gi, entrezgi;
150
151 if (gcp == NULL)
152 return FALSE;
153 if ((gnbsp = (Gather_NBSPtr) gcp->userdata) == NULL)
154 return FALSE;
155
156 if (gnbsp->bsp != NULL)
157 return TRUE;
158 if (gcp->thistype != OBJ_BIOSEQ)
159 return TRUE;
160 if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
161 return TRUE;
162
163 gi = gnbsp->gi;
164 if (gi > 0)
165 {
166 entrezgi = GetGIForSeqId (bsp->id);
167 if (gi == entrezgi)
168 gnbsp->bsp = bsp;
169 return TRUE;
170 }
171 else
172 {
173 gnbsp->bsp = bsp;
174 return TRUE;
175 }
176 }
177
Main(void)178 Int2 Main (void)
179 {
180 Int2 argcount;
181 Boolean flagHaveNet;
182
183 Int4 gi;
184 SeqEntryPtr sep;
185 ComPatPtr cpp, cpph = NULL;
186 SeqAlignPtr sap, sapn;
187 SeqLocPtr slp, slpn, slpt;
188 StdSegPtr ssp;
189 Int4 start;
190
191 FILE *fiop;
192 Char fastafile[256], namesfile[256];
193 CharPtr title;
194
195 ValNodePtr namelist;
196
197 static CharPtr pattern_file = "ncbiren.dat";
198 static CharPtr names_file = "ncbirnam.dat";
199
200 static GatherScope gs;
201 GatherScopePtr gsp;
202 static Gather_NBS gpbs;
203 Gather_NBSPtr gnbsp;
204
205 argcount = sizeof (myargs) / sizeof (Args);
206 if (!GetArgs ("REN_Search", argcount, myargs))
207 return 1;
208
209 if (myargs[0].intvalue == 0 && myargs[1].strvalue == NULL)
210 {
211 ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
212 "No gi or FastA file given :: for help : srchnt -");
213 ErrShow ();
214 exit (1);
215 }
216
217 gsp = &gs;
218 gnbsp = &gpbs;
219
220 MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
221 MemSet ((Pointer) gsp->ignore, (int) (TRUE),
222 (size_t) (OBJ_MAX * sizeof (Boolean)));
223
224 gsp->ignore[OBJ_SEQDESC] = TRUE;
225 gsp->ignore[OBJ_BIOSEQ] = FALSE;
226
227 gnbsp->bsp = NULL;
228
229 gi = myargs[0].intvalue;
230 if (myargs[1].strvalue != NULL)
231 StrCpy (fastafile, myargs[1].strvalue);
232 else
233 fastafile[0] = '\0';
234
235 if (gi > 0)
236 {
237 if (!EntrezInit ("srchnt", FALSE, &flagHaveNet))
238 {
239 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
240 "Entrez init failed");
241 ErrShow ();
242 exit (1);
243 }
244 }
245
246 fiop = NULL;
247 if (gi > 0)
248 {
249 sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
250 }
251 else
252 {
253 if ((fiop = FileOpen (fastafile, "r")) == NULL)
254 {
255 ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
256 "Failed to open FastA file: %s", fastafile);
257 ErrShow ();
258 exit (1);
259 }
260 sep = FastaToSeqEntry (fiop, TRUE);
261 }
262
263 if (sep == NULL)
264 {
265 ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
266 "No seqentry found");
267 ErrShow ();
268 exit (1);
269 }
270
271 namesfile[0] = '\0';
272 if (myargs[6].intvalue)
273 StrCpy (namesfile, names_file);
274 if (myargs[7].strvalue != NULL)
275 StrCpy (namesfile, myargs[7].strvalue);
276
277 namelist = ReadPatternNames (namesfile);
278
279 while (sep != NULL)
280 {
281 gnbsp->gi = gi;
282 GatherSeqEntry (sep, (Pointer) gnbsp, GetBioseq, (Pointer) gsp);
283
284 if (gnbsp->bsp != NULL)
285 {
286 if (ISA_na (gnbsp->bsp->mol))
287 {
288 if (cpph == NULL)
289 {
290 if (myargs[8].strvalue != NULL)
291 {
292 if ((cpph = CompilePattern (myargs[8].strvalue, 0)) != NULL)
293 StrCpy (cpph->name, "User Pattern");
294 }
295 else
296 {
297 if (myargs[2].strvalue != NULL)
298 cpph = ReadRENPattern (myargs[2].strvalue,
299 (Boolean) myargs[5].intvalue, namelist);
300 else
301 cpph = ReadRENPattern (pattern_file,
302 (Boolean) myargs[5].intvalue, namelist);
303 }
304 PalindromeCheck (cpph);
305 }
306 title = FastaTitle (gnbsp->bsp, ">", NULL);
307 printf ("%s\n", title);
308 MemFree (title);
309 cpp = cpph;
310 while (cpp != NULL)
311 {
312 printf (">%s\n", cpp->name);
313 sap = PatternMatchBioseq (gnbsp->bsp, cpp,
314 (Int4) myargs[4].intvalue);
315 if (sap != NULL)
316 printf (" Start\n");
317 URK_SeqAlignSortByStart (&sap);
318 if (myargs[3].intvalue)
319 {
320 EmbedFragLengthInfo (sap, gnbsp->bsp->length);
321 URK_SeqAlignSortByLength (&sap);
322 while (sap != NULL)
323 {
324 ssp = (StdSegPtr) sap->segs;
325 slp = ssp->loc;
326 if (slp->choice != SEQLOC_MIX)
327 {
328 start = SeqLocStart (slp);
329 }
330 else
331 {
332 slpt = (SeqLocPtr) slp->data.ptrvalue;
333 start = SeqLocStart (slpt);
334 }
335 printf ("%8ld %8ld\n", (long) start+1,
336 (long) ssp->scores->value.intvalue);
337 sapn = sap->next;
338 SeqAlignFree (sap);
339 sap = sapn;
340 }
341 }
342 else
343 {
344 slp = MatchSa2Sl (&sap);
345 while (slp != NULL)
346 {
347 if (slp->choice != SEQLOC_MIX)
348 {
349 start = SeqLocStart (slp);
350 }
351 else
352 {
353 slpt = (SeqLocPtr) slp->data.ptrvalue;
354 start = SeqLocStart (slpt);
355 }
356 printf ("%8ld\n", (long) start+1);
357 slpn = slp->next;
358 SeqLocFree (slp);
359 slp = slpn;
360 }
361 }
362 cpp = cpp->nextpattern;
363 }
364 }
365 else
366 {
367 ErrPostEx (SEV_ERROR, TOP_ERROR, 106,
368 "Not a nucleic bioseq");
369 ErrShow ();
370 exit (1);
371 }
372 }
373 else
374 {
375 ErrPostEx (SEV_ERROR, TOP_ERROR, 105,
376 "No bioseq found");
377 ErrShow ();
378 exit (1);
379 }
380 SeqEntryFree (sep);
381 sep = NULL;
382 if (fiop != NULL)
383 sep = FastaToSeqEntry (fiop, TRUE);
384 }
385
386 ComPatFree (cpph);
387 ValNodeFreeData (namelist);
388 FileClose (fiop);
389 if (gi > 0)
390 EntrezFini ();
391
392 return 0;
393 }
394