1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: mts.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.11 $
33 *
34 * File Description: profile search
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date       Name        Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: mts.c,v $
41 * Revision 6.11  1999/07/13 13:07:33  sicotte
42 * prefix functions SeqAlignSort* with URK to resolve toolkit conflicts
43 *
44 * Revision 6.10  1998/12/18 16:24:55  kuzio
45 * big GIs
46 *
47 * Revision 6.9  1998/11/16 14:34:12  kuzio
48 * flagBoundaryCondition
49 *
50 * Revision 6.8  1998/09/28 16:34:27  kuzio
51 * cmdln phrasing
52 *
53 * Revision 6.7  1998/09/16 18:19:29  kuzio
54 * cvs logging
55 *
56 * ==========================================================================
57 */
58 
59 #include <ncbi.h>
60 #include <accentr.h>
61 #include <gather.h>
62 #include <tofasta.h>
63 #include <urkutil.h>
64 #include <urkptpf.h>
65 
66 #define TOP_ERROR 1
67 static char _this_module[] = "mts";
68 #undef  THIS_MODULE
69 #define THIS_MODULE _this_module
70 static char _this_file[] = __FILE__;
71 #undef  THIS_FILE
72 #define THIS_FILE _this_file
73 
74 typedef struct gather_Bioseq
75 {
76   Int4      gi;
77   BioseqPtr bsp;
78 } Gather_BS, PNTR Gather_BSPtr;
79 
80 Args myargs[] =
81 {
82   { "profile matrix", NULL, NULL, NULL, FALSE,
83     'm', ARG_STRING, 0.0, 0, NULL},
84   { "integer matrix", "FALSE", "TRUE", "FALSE", TRUE,
85     'M', ARG_BOOLEAN, 0.0, 0, NULL},
86   { "GI", "0", "0", "9000000", TRUE,
87     'g', ARG_INT, 0.0, 0, NULL},
88   { "UID (GI) list", NULL, NULL, NULL, TRUE,
89     'G', ARG_STRING, 0.0, 0, NULL},
90   { "FastA file", NULL, NULL, NULL, TRUE,
91     'f', ARG_STRING, 0.0, 0, NULL},
92   { "FastA file is nucleotide", "FALSE", "TRUE", "FALSE", TRUE,
93     'n', ARG_BOOLEAN, 0.0, 0, NULL},
94   { "cutoff score", "0.0", "-1000000.0", "1000000.0", TRUE,
95     'c', ARG_FLOAT, 0.0, 0, NULL},
96   { "show hit sequences", "FALSE", "TRUE", "FALSE", TRUE,
97     'a', ARG_BOOLEAN, 0.0, 0, NULL},
98   { "output file", NULL, NULL, NULL, TRUE,
99     'o', ARG_STRING, 0.0, 0, NULL},
100   { "forced window size", "0", "0", "256", TRUE,
101     'w', ARG_INT, 0.0, 0, NULL}
102 };
103 
GetBioseq(GatherContextPtr gcp)104 static Boolean GetBioseq (GatherContextPtr gcp)
105 {
106   Gather_BSPtr   gbsp;
107   BioseqPtr      bsp;
108   Int4           gi, entrezgi;
109 
110   if (gcp == NULL)
111     return FALSE;
112   if ((gbsp = (Gather_BSPtr) gcp->userdata) == NULL)
113     return FALSE;
114 
115   if (gbsp->bsp != NULL)
116     return TRUE;
117   if (gcp->thistype != OBJ_BIOSEQ)
118     return TRUE;
119   if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
120     return TRUE;
121 
122   gi = gbsp->gi;
123   if (gi > 0)
124   {
125     entrezgi = GetGIForSeqId (bsp->id);
126     if (gi == entrezgi)
127       gbsp->bsp = bsp;
128     return TRUE;
129   }
130   else
131   {
132     gbsp->bsp = bsp;
133     return TRUE;
134   }
135 }
136 
Main(void)137 Int2 Main (void)
138 {
139   Int2          argcount;
140   Boolean       flagHaveNet;
141 
142   Int4          gi;
143   SeqEntryPtr   sep;
144   Boolean       flagEntrez;
145 
146   FILE          *fiop, *fout;
147   Char          matrixfile[256], uidfile[256], fastafile[256];
148   Char          bufin[256];
149   CharPtr       title;
150   BioseqPtr     bsp;
151 
152   static GatherScope  gs;
153   GatherScopePtr      gsp;
154   static Gather_BS    gbs;
155   Gather_BSPtr        gbsp;
156 
157   Boolean       flagFirstPass, flagIsNA;
158 /*  Boolean       flagIsDNA; */
159   ComProfPtr    profile, invprofile;
160   FloatHi       cutoff;
161   SeqAlignPtr   saph, sap;
162   StdSegPtr     ssp;
163   SeqLocPtr     slp;
164   Int4          start, stop, proflen, windowsize;
165   Uint1         strand;
166   Char          cstr;
167   SeqPortPtr    spp;
168   Uint1Ptr      sequence;
169 
170   argcount = sizeof (myargs) / sizeof (Args);
171   if (!GetArgs ("mts", argcount, myargs))
172     return 1;
173 
174   if (myargs[2].intvalue == 0     &&
175       myargs[3].strvalue == NULL  &&
176       myargs[4].strvalue == NULL)
177   {
178     ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
179                "No gi, UID list, or FastA file given :: for help :   mts -");
180     ErrShow ();
181     exit (1);
182   }
183 
184   StrCpy (matrixfile, myargs[0].strvalue);
185   if ((profile = ReadProfile (matrixfile)) == NULL)
186   {
187     ErrPostEx (SEV_ERROR, TOP_ERROR, 101,
188                "Failed to read profile: %s", matrixfile);
189     ErrShow ();
190     exit (1);
191   }
192   if (myargs[1].intvalue)
193     IntegerProfile (profile);
194   proflen = ProfLenMax (profile);
195   windowsize = myargs[9].intvalue;
196   if (windowsize > proflen)
197     proflen = CatenateProfile (profile, proflen, windowsize-proflen);
198   cutoff = myargs[6].floatvalue;
199   if (myargs[7].intvalue == TRUE)
200     sequence = (Uint1Ptr) MemNew ((size_t) (sizeof (Uint1) * (proflen+1)));
201   else
202     sequence = NULL;
203 
204   if (myargs[8].strvalue == NULL)
205   {
206     fout = stdout;
207   }
208   else
209   {
210     if ((fout = FileOpen (myargs[8].strvalue, "w")) == NULL)
211     {
212       ErrPostEx (SEV_ERROR, TOP_ERROR, 101,
213                  "Failed to open output file: %s", myargs[8].strvalue);
214       ErrShow ();
215       exit (1);
216     }
217   }
218 
219   gsp = &gs;
220   gbsp = &gbs;
221 
222   MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
223   MemSet ((Pointer) gsp->ignore, (int) (TRUE),
224           (size_t) (OBJ_MAX * sizeof (Boolean)));
225 
226   gsp->ignore[OBJ_BIOSEQ] = FALSE;
227 
228   uidfile[0] = '\0';
229   fastafile[0] = '\0';
230 
231   gi = myargs[2].intvalue;
232   if (myargs[3].strvalue != NULL)
233     StrCpy (uidfile, myargs[3].strvalue);
234   if (myargs[4].strvalue != NULL)
235     StrCpy (fastafile, myargs[4].strvalue);
236 
237   flagEntrez = FALSE;
238   if (gi > 0 || (StrLen (uidfile) > 0))
239   {
240     if (!EntrezInit ("mts", FALSE, &flagHaveNet))
241     {
242       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
243                  "Entrez init failed");
244       ErrShow ();
245       exit (1);
246     }
247     flagEntrez = TRUE;
248   }
249 
250   fiop = NULL;
251 
252   if (gi > 0)
253   {
254     sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
255   }
256   else if (StrLen (uidfile) > 0)
257   {
258     if ((fiop = FileOpen (uidfile, "r")) == NULL)
259     {
260       ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
261                  "Failed to open UID file: %s", uidfile);
262       ErrShow ();
263       exit (1);
264     }
265     while ((FileGets (bufin, sizeof (bufin), fiop)) != NULL)
266     {
267       if (bufin[0] == '>')
268         continue;
269       gi = atol (bufin);
270       sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
271       break;
272     }
273   }
274   else
275   {
276     if ((fiop = FileOpen (fastafile, "r")) == NULL)
277     {
278       ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
279                  "Failed to open FastA file: %s", fastafile);
280       ErrShow ();
281       exit (1);
282     }
283     sep = FastaToSeqEntry (fiop, (Boolean) myargs[5].intvalue);
284   }
285 
286   if (sep == NULL)
287   {
288     ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
289                "No seqentry found");
290     ErrShow ();
291     exit (1);
292   }
293 
294   flagFirstPass = TRUE;
295 /*  flagIsDNA = FALSE; */
296   flagIsNA = FALSE;
297   invprofile = NULL;
298   while (sep != NULL)
299   {
300     gbsp->bsp = NULL;
301     gbsp->gi = gi;
302     GatherSeqEntry (sep, (Pointer) gbsp, GetBioseq, (Pointer) gsp);
303 
304     if (gbsp->bsp != NULL)
305     {
306       bsp = gbsp->bsp;
307 /*
308  in multiple sequence sets the first entry determines
309  if the set is nucleotides or peptides
310 */
311       if (flagFirstPass)
312       {
313 /* just DNA? shouldn't have to (want to?) rev-cmp RNA */
314 /*        if (bsp->mol == Seq_mol_dna) */
315         if (ISA_na (bsp->mol))
316         {
317           invprofile = InvertProfile (profile);
318           if (myargs[1].intvalue)
319             IntegerProfile (invprofile);
320 /*          flagIsDNA = TRUE; */
321         }
322         if (ISA_na (bsp->mol))
323           flagIsNA = TRUE;
324         flagFirstPass = FALSE;
325       }
326 
327       title = FastaTitle (bsp, ">", NULL);
328       if (myargs[1].intvalue)
329         saph = sap = IntProfileMatchBioseq (gbsp->bsp, profile, invprofile,
330                                             (Int4) cutoff, FALSE);
331       else
332         saph = sap = ProfileMatchBioseq (gbsp->bsp, profile, invprofile,
333                                          cutoff, FALSE);
334       if (sap != NULL)
335         fprintf (fout, "%s\n", title);
336       MemFree (title);
337       URK_SeqAlignSortByStart (&sap);
338 
339       while (sap != NULL)
340       {
341         ssp = (StdSegPtr) sap->segs;
342         slp = ssp->loc;
343         start = SeqLocStart (slp);
344         stop = SeqLocStop (slp);
345         strand = SeqLocStrand (slp);
346         if (strand != Seq_strand_minus)
347           cstr = '+';
348         else
349           cstr = '-';
350         if (myargs[7].intvalue)
351         {
352           if (flagIsNA)
353             spp = SeqPortNew (gbsp->bsp, 0, gbsp->bsp->length-1, strand,
354                               Seq_code_iupacna);
355           else
356             spp = SeqPortNew (gbsp->bsp, 0, gbsp->bsp->length-1, strand,
357                               Seq_code_iupacaa);
358           if (strand != Seq_strand_minus)
359             SeqPortSeek (spp, start, SEEK_SET);
360           else
361             SeqPortSeek (spp, gbsp->bsp->length-stop-1, SEEK_SET);
362           SeqPortRead (spp, sequence, (Int2) (stop-start+1));
363           sequence[stop-start+1] = 0;
364           fprintf (fout, "%8ld %8ld  %c %6.2f  %s\n",
365                    (long) (start+1), (long) (stop+1),
366                    cstr, (FloatHi) sap->score->value.realvalue,
367                    (CharPtr) sequence);
368           SeqPortFree (spp);
369         }
370         else
371         {
372           fprintf (fout, "%8ld %8ld  %c %6.2f\n",
373                    (long) (start+1), (long) (stop+1),
374                    cstr, (FloatHi) sap->score->value.realvalue);
375         }
376         fflush (fout);
377         sap = sap->next;
378       }
379 
380       sap = saph;
381       while (sap != NULL)
382       {
383         saph = sap->next;
384         SeqAlignFree (sap);
385         sap = saph;
386       }
387     }
388     else
389     {
390       ErrPostEx (SEV_ERROR, TOP_ERROR, 105,
391                  "No bioseq found");
392       ErrShow ();
393       exit (1);
394     }
395 
396     SeqEntryFree (sep);
397 
398     if (StrLen (uidfile) > 0)
399     {
400       while ((FileGets (bufin, sizeof (bufin), fiop)) != NULL)
401       {
402         if (bufin[0] == '>')
403           continue;
404         gi = atol (bufin);
405         sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
406         break;
407       }
408     }
409     else if (StrLen (fastafile) > 0)
410     {
411       sep = FastaToSeqEntry (fiop, (Boolean) myargs[5].intvalue);
412     }
413     else
414     {
415       sep = NULL;
416     }
417   }
418 
419   MemFree (sequence);
420   ComProfFree (profile);
421   ComProfFree (invprofile);
422   if (fout != stdout)
423     FileClose (fout);
424   FileClose (fiop);
425   if (flagEntrez)
426     EntrezFini ();
427 
428   return 0;
429 }
430