1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: dst.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.7 $
33 *
34 * File Description: a dust utility
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date       Name        Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: dst.c,v $
41 * Revision 6.7  1998/12/18 16:24:54  kuzio
42 * big GIs
43 *
44 * Revision 6.6  1998/09/16 18:19:28  kuzio
45 * cvs logging
46 *
47 *
48 *
49 *
50 * ==========================================================================
51 */
52 
53 #include <ncbi.h>
54 #include <accentr.h>
55 #include <gather.h>
56 #include <tofasta.h>
57 #include <urkutil.h>
58 #include <urkdust.h>
59 
60 #define TOP_ERROR 1
61 static char _this_module[] = "dust";
62 #undef  THIS_MODULE
63 #define THIS_MODULE _this_module
64 static char _this_file[] = __FILE__;
65 #undef  THIS_FILE
66 #define THIS_FILE _this_file
67 
68 typedef struct gather_Bioseq
69 {
70   Int4      gi;
71   BioseqPtr bsp;
72 } Gather_BS, PNTR Gather_BSPtr;
73 
74 Args myargs[] =
75 {
76   { "FastA file", NULL, NULL, NULL, TRUE,
77     'f', ARG_STRING, 0.0, 0, NULL},
78   { "nucleotide GI", "0", "0", "9000000", TRUE,
79     'g', ARG_INT, 0.0, 0, NULL},
80   { "dust window", "64", "16", "256", TRUE,
81     'w', ARG_INT, 0.0, 0, NULL},
82   { "dust cutoff", "20", "10", "100", TRUE,
83     'c', ARG_INT, 0.0, 0, NULL},
84   { "brute force", "FALSE", "FALSE", "TRUE", TRUE,
85     'b', ARG_BOOLEAN, 0.0, 0, NULL},
86   { "output line length", "50", "40", "160", TRUE,
87     'l', ARG_INT, 0.0, 0, NULL},
88   { "n-out for Blast", "FALSE", "FALSE", "TRUE", TRUE,
89     'n', ARG_BOOLEAN, 0.0, 0, NULL},
90   { "numeric output", "FALSE", "FALSE", "TRUE", TRUE,
91     'N', ARG_BOOLEAN, 0.0, 0, NULL}
92 };
93 
GetNucleotideBioseq(GatherContextPtr gcp)94 static Boolean GetNucleotideBioseq (GatherContextPtr gcp)
95 {
96   Gather_BSPtr   gbsp;
97   BioseqPtr      bsp;
98   Int4           gi, entrezgi;
99 
100   if (gcp == NULL)
101     return FALSE;
102   if ((gbsp = (Gather_BSPtr) gcp->userdata) == NULL)
103     return FALSE;
104 
105   if (gbsp->bsp != NULL)
106     return TRUE;
107   if (gcp->thistype != OBJ_BIOSEQ)
108     return TRUE;
109   if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
110     return TRUE;
111 
112   gi = gbsp->gi;
113   if (gi > 0)
114   {
115     entrezgi = GetGIForSeqId (bsp->id);
116     if (gi == entrezgi)
117       gbsp->bsp = bsp;
118     return TRUE;
119   }
120   else
121   {
122     gbsp->bsp = bsp;
123     return TRUE;
124   }
125 }
126 
Main(void)127 Int2 Main (void)
128 {
129   Int2        argcount;
130   Boolean     flagHaveNet;
131 
132   Int4        gi;
133   SeqEntryPtr sep;
134 
135   FILE        *fiop;
136   Char        fastafile[256];
137   CharPtr     title;
138 
139   static GatherScope  gs;
140   GatherScopePtr      gsp;
141   static Gather_BS    gbs;
142   Gather_BSPtr        gbsp;
143 
144   Int4           i, start, stop, linelen;
145   SeqLocPtr      slp, slpn;
146   Uint1Ptr       sequence;
147   SeqPortPtr     spp;
148 
149   DustDataPtr    ddp;
150   DustRegionPtr  drp;
151 
152   argcount = sizeof (myargs) / sizeof (Args);
153   if (!GetArgs ("dust", argcount, myargs))
154     return 1;
155 
156   gsp = &gs;
157   gbsp = &gbs;
158 
159   MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
160   MemSet ((Pointer) gsp->ignore, (int) (TRUE),
161           (size_t) (OBJ_MAX * sizeof (Boolean)));
162   gsp->ignore[OBJ_BIOSEQ] = FALSE;
163 
164   gbsp->bsp = NULL;
165 
166   if (myargs[1].intvalue == 0 && myargs[0].strvalue == NULL)
167   {
168     ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
169                "No gi or FastA file given :: for help :   dust -");
170     ErrShow ();
171     exit (1);
172   }
173 
174   gi = myargs[1].intvalue;
175   if (myargs[0].strvalue != NULL)
176     StrCpy (fastafile, myargs[0].strvalue);
177   else
178     fastafile[0] = '\0';
179 
180   linelen = myargs[5].intvalue;
181 
182   if (gi > 0)
183   {
184     if (!EntrezInit ("dust", FALSE, &flagHaveNet))
185     {
186       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
187                  "Entrez init failed");
188       ErrShow ();
189       exit (1);
190     }
191   }
192 
193   if (gi > 0)
194   {
195     sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
196   }
197   else
198   {
199     if ((fiop = FileOpen (fastafile, "r")) == NULL)
200     {
201       ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
202                  "Failed to open FastA file");
203       ErrShow ();
204       exit (1);
205     }
206     sep = FastaToSeqEntry (fiop, TRUE);
207   }
208 
209   if (sep == NULL)
210   {
211     ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
212                "No seqentry found");
213     ErrShow ();
214     exit (1);
215   }
216 
217   while (sep != NULL)
218   {
219     gbsp->gi = gi;
220     GatherSeqEntry (sep, (Pointer) gbsp, GetNucleotideBioseq,
221                     (Pointer) gsp);
222 
223     if (gbsp->bsp != NULL)
224     {
225       if (ISA_na (gbsp->bsp->mol))
226       {
227         ddp = DustDataNew ();
228         ddp->windowsize = myargs[2].intvalue;
229         ddp->level = myargs[3].intvalue;
230         ddp->percent = myargs[3].floatvalue;
231         ddp->flagBrute = (Boolean) myargs[4].intvalue;
232         drp = DustBioseq (gbsp->bsp, 0, gbsp->bsp->length-1, ddp);
233         slpn = slp = DustFilter (drp, ddp->percent, gbsp->bsp->id);
234         DustDataFree (ddp);
235         DustRegionFree (drp);
236 
237         title = FastaTitle (gbsp->bsp, ">", NULL);
238         printf ("%s\n", title);
239         MemFree (title);
240         if (myargs[7].intvalue == TRUE)
241         {
242           slp = slpn;
243           while (slp != NULL)
244           {
245             start = SeqLocStart (slp) + 1;
246             stop = SeqLocStop (slp) + 1;
247             printf ("%9ld %9ld\n", (long) start, (long) stop);
248             slp = slp->next;
249           }
250         }
251         else
252         {
253           sequence = (Uint1Ptr) MemNew ((size_t) (sizeof (Uint1) *
254                                                   gbsp->bsp->length+1));
255           spp = SeqPortNew (gbsp->bsp, 0, gbsp->bsp->length-1, 0,
256                             Seq_code_iupacna);
257           SeqPortSeek (spp, 0, SEEK_SET);
258 
259           i = 0;
260           while ((sequence[i] = SeqPortGetResidue (spp)) != SEQPORT_EOF)
261           {
262             if (('a' <= (Char) sequence[i] && (Char) sequence[i] <= 'z') ||
263                 ('A' <= (Char) sequence[i] && (Char) sequence[i] <= 'Z'))
264               i++;
265           }
266           sequence[i] = 0;
267 
268           i = 0;
269           while (sequence[i] != 0)
270           {
271             sequence[i] = (Uint1) TO_UPPER ((Char) sequence[i]);
272             i++;
273           }
274 
275           slp = slpn;
276           while (slp != NULL)
277           {
278             start = SeqLocStart (slp);
279             stop = SeqLocStop (slp);
280             for (i = start; i <= stop; i++)
281             {
282               if (myargs[6].intvalue == TRUE)
283                 sequence[i] = (Uint1) 'n';
284               else
285                 sequence[i] = (Uint1) TO_LOWER ((Char) sequence[i]);
286             }
287             slp = slp->next;
288           }
289 
290           i = 0;
291           while (sequence[i] != 0)
292           {
293             printf ("%c", (Char) sequence[i]);
294             i++;
295             if (i % linelen == 0)
296             {
297               if (myargs[6].intvalue == TRUE)
298                 printf ("\n");
299               else
300                 printf (" %8ld\n", (long) i);
301             }
302           }
303           if (i % linelen != 0)
304             printf ("\n");
305 
306           SeqPortFree (spp);
307           MemFree (sequence);
308         }
309         slp = slpn;
310         while (slp != NULL)
311         {
312           slpn = slp->next;
313           SeqLocFree (slp);
314           slp = slpn;
315         }
316       }
317     }
318     sep = SeqEntryFree (sep);
319     if (gi <= 0)
320     {
321       sep = FastaToSeqEntry (fiop, TRUE);
322       gbsp->bsp = NULL;
323     }
324   }
325 
326   if (gi > 0)
327     EntrezFini ();
328   else
329     FileClose (fiop);
330   return 0;
331 }
332