1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: dst.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.7 $
33 *
34 * File Description: a dust utility
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: dst.c,v $
41 * Revision 6.7 1998/12/18 16:24:54 kuzio
42 * big GIs
43 *
44 * Revision 6.6 1998/09/16 18:19:28 kuzio
45 * cvs logging
46 *
47 *
48 *
49 *
50 * ==========================================================================
51 */
52
53 #include <ncbi.h>
54 #include <accentr.h>
55 #include <gather.h>
56 #include <tofasta.h>
57 #include <urkutil.h>
58 #include <urkdust.h>
59
60 #define TOP_ERROR 1
61 static char _this_module[] = "dust";
62 #undef THIS_MODULE
63 #define THIS_MODULE _this_module
64 static char _this_file[] = __FILE__;
65 #undef THIS_FILE
66 #define THIS_FILE _this_file
67
68 typedef struct gather_Bioseq
69 {
70 Int4 gi;
71 BioseqPtr bsp;
72 } Gather_BS, PNTR Gather_BSPtr;
73
74 Args myargs[] =
75 {
76 { "FastA file", NULL, NULL, NULL, TRUE,
77 'f', ARG_STRING, 0.0, 0, NULL},
78 { "nucleotide GI", "0", "0", "9000000", TRUE,
79 'g', ARG_INT, 0.0, 0, NULL},
80 { "dust window", "64", "16", "256", TRUE,
81 'w', ARG_INT, 0.0, 0, NULL},
82 { "dust cutoff", "20", "10", "100", TRUE,
83 'c', ARG_INT, 0.0, 0, NULL},
84 { "brute force", "FALSE", "FALSE", "TRUE", TRUE,
85 'b', ARG_BOOLEAN, 0.0, 0, NULL},
86 { "output line length", "50", "40", "160", TRUE,
87 'l', ARG_INT, 0.0, 0, NULL},
88 { "n-out for Blast", "FALSE", "FALSE", "TRUE", TRUE,
89 'n', ARG_BOOLEAN, 0.0, 0, NULL},
90 { "numeric output", "FALSE", "FALSE", "TRUE", TRUE,
91 'N', ARG_BOOLEAN, 0.0, 0, NULL}
92 };
93
GetNucleotideBioseq(GatherContextPtr gcp)94 static Boolean GetNucleotideBioseq (GatherContextPtr gcp)
95 {
96 Gather_BSPtr gbsp;
97 BioseqPtr bsp;
98 Int4 gi, entrezgi;
99
100 if (gcp == NULL)
101 return FALSE;
102 if ((gbsp = (Gather_BSPtr) gcp->userdata) == NULL)
103 return FALSE;
104
105 if (gbsp->bsp != NULL)
106 return TRUE;
107 if (gcp->thistype != OBJ_BIOSEQ)
108 return TRUE;
109 if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
110 return TRUE;
111
112 gi = gbsp->gi;
113 if (gi > 0)
114 {
115 entrezgi = GetGIForSeqId (bsp->id);
116 if (gi == entrezgi)
117 gbsp->bsp = bsp;
118 return TRUE;
119 }
120 else
121 {
122 gbsp->bsp = bsp;
123 return TRUE;
124 }
125 }
126
Main(void)127 Int2 Main (void)
128 {
129 Int2 argcount;
130 Boolean flagHaveNet;
131
132 Int4 gi;
133 SeqEntryPtr sep;
134
135 FILE *fiop;
136 Char fastafile[256];
137 CharPtr title;
138
139 static GatherScope gs;
140 GatherScopePtr gsp;
141 static Gather_BS gbs;
142 Gather_BSPtr gbsp;
143
144 Int4 i, start, stop, linelen;
145 SeqLocPtr slp, slpn;
146 Uint1Ptr sequence;
147 SeqPortPtr spp;
148
149 DustDataPtr ddp;
150 DustRegionPtr drp;
151
152 argcount = sizeof (myargs) / sizeof (Args);
153 if (!GetArgs ("dust", argcount, myargs))
154 return 1;
155
156 gsp = &gs;
157 gbsp = &gbs;
158
159 MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
160 MemSet ((Pointer) gsp->ignore, (int) (TRUE),
161 (size_t) (OBJ_MAX * sizeof (Boolean)));
162 gsp->ignore[OBJ_BIOSEQ] = FALSE;
163
164 gbsp->bsp = NULL;
165
166 if (myargs[1].intvalue == 0 && myargs[0].strvalue == NULL)
167 {
168 ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
169 "No gi or FastA file given :: for help : dust -");
170 ErrShow ();
171 exit (1);
172 }
173
174 gi = myargs[1].intvalue;
175 if (myargs[0].strvalue != NULL)
176 StrCpy (fastafile, myargs[0].strvalue);
177 else
178 fastafile[0] = '\0';
179
180 linelen = myargs[5].intvalue;
181
182 if (gi > 0)
183 {
184 if (!EntrezInit ("dust", FALSE, &flagHaveNet))
185 {
186 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
187 "Entrez init failed");
188 ErrShow ();
189 exit (1);
190 }
191 }
192
193 if (gi > 0)
194 {
195 sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
196 }
197 else
198 {
199 if ((fiop = FileOpen (fastafile, "r")) == NULL)
200 {
201 ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
202 "Failed to open FastA file");
203 ErrShow ();
204 exit (1);
205 }
206 sep = FastaToSeqEntry (fiop, TRUE);
207 }
208
209 if (sep == NULL)
210 {
211 ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
212 "No seqentry found");
213 ErrShow ();
214 exit (1);
215 }
216
217 while (sep != NULL)
218 {
219 gbsp->gi = gi;
220 GatherSeqEntry (sep, (Pointer) gbsp, GetNucleotideBioseq,
221 (Pointer) gsp);
222
223 if (gbsp->bsp != NULL)
224 {
225 if (ISA_na (gbsp->bsp->mol))
226 {
227 ddp = DustDataNew ();
228 ddp->windowsize = myargs[2].intvalue;
229 ddp->level = myargs[3].intvalue;
230 ddp->percent = myargs[3].floatvalue;
231 ddp->flagBrute = (Boolean) myargs[4].intvalue;
232 drp = DustBioseq (gbsp->bsp, 0, gbsp->bsp->length-1, ddp);
233 slpn = slp = DustFilter (drp, ddp->percent, gbsp->bsp->id);
234 DustDataFree (ddp);
235 DustRegionFree (drp);
236
237 title = FastaTitle (gbsp->bsp, ">", NULL);
238 printf ("%s\n", title);
239 MemFree (title);
240 if (myargs[7].intvalue == TRUE)
241 {
242 slp = slpn;
243 while (slp != NULL)
244 {
245 start = SeqLocStart (slp) + 1;
246 stop = SeqLocStop (slp) + 1;
247 printf ("%9ld %9ld\n", (long) start, (long) stop);
248 slp = slp->next;
249 }
250 }
251 else
252 {
253 sequence = (Uint1Ptr) MemNew ((size_t) (sizeof (Uint1) *
254 gbsp->bsp->length+1));
255 spp = SeqPortNew (gbsp->bsp, 0, gbsp->bsp->length-1, 0,
256 Seq_code_iupacna);
257 SeqPortSeek (spp, 0, SEEK_SET);
258
259 i = 0;
260 while ((sequence[i] = SeqPortGetResidue (spp)) != SEQPORT_EOF)
261 {
262 if (('a' <= (Char) sequence[i] && (Char) sequence[i] <= 'z') ||
263 ('A' <= (Char) sequence[i] && (Char) sequence[i] <= 'Z'))
264 i++;
265 }
266 sequence[i] = 0;
267
268 i = 0;
269 while (sequence[i] != 0)
270 {
271 sequence[i] = (Uint1) TO_UPPER ((Char) sequence[i]);
272 i++;
273 }
274
275 slp = slpn;
276 while (slp != NULL)
277 {
278 start = SeqLocStart (slp);
279 stop = SeqLocStop (slp);
280 for (i = start; i <= stop; i++)
281 {
282 if (myargs[6].intvalue == TRUE)
283 sequence[i] = (Uint1) 'n';
284 else
285 sequence[i] = (Uint1) TO_LOWER ((Char) sequence[i]);
286 }
287 slp = slp->next;
288 }
289
290 i = 0;
291 while (sequence[i] != 0)
292 {
293 printf ("%c", (Char) sequence[i]);
294 i++;
295 if (i % linelen == 0)
296 {
297 if (myargs[6].intvalue == TRUE)
298 printf ("\n");
299 else
300 printf (" %8ld\n", (long) i);
301 }
302 }
303 if (i % linelen != 0)
304 printf ("\n");
305
306 SeqPortFree (spp);
307 MemFree (sequence);
308 }
309 slp = slpn;
310 while (slp != NULL)
311 {
312 slpn = slp->next;
313 SeqLocFree (slp);
314 slp = slpn;
315 }
316 }
317 }
318 sep = SeqEntryFree (sep);
319 if (gi <= 0)
320 {
321 sep = FastaToSeqEntry (fiop, TRUE);
322 gbsp->bsp = NULL;
323 }
324 }
325
326 if (gi > 0)
327 EntrezFini ();
328 else
329 FileClose (fiop);
330 return 0;
331 }
332