1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: mts.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.11 $
33 *
34 * File Description: profile search
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: mts.c,v $
41 * Revision 6.11 1999/07/13 13:07:33 sicotte
42 * prefix functions SeqAlignSort* with URK to resolve toolkit conflicts
43 *
44 * Revision 6.10 1998/12/18 16:24:55 kuzio
45 * big GIs
46 *
47 * Revision 6.9 1998/11/16 14:34:12 kuzio
48 * flagBoundaryCondition
49 *
50 * Revision 6.8 1998/09/28 16:34:27 kuzio
51 * cmdln phrasing
52 *
53 * Revision 6.7 1998/09/16 18:19:29 kuzio
54 * cvs logging
55 *
56 * ==========================================================================
57 */
58
59 #include <ncbi.h>
60 #include <accentr.h>
61 #include <gather.h>
62 #include <tofasta.h>
63 #include <urkutil.h>
64 #include <urkptpf.h>
65
66 #define TOP_ERROR 1
67 static char _this_module[] = "mts";
68 #undef THIS_MODULE
69 #define THIS_MODULE _this_module
70 static char _this_file[] = __FILE__;
71 #undef THIS_FILE
72 #define THIS_FILE _this_file
73
74 typedef struct gather_Bioseq
75 {
76 Int4 gi;
77 BioseqPtr bsp;
78 } Gather_BS, PNTR Gather_BSPtr;
79
80 Args myargs[] =
81 {
82 { "profile matrix", NULL, NULL, NULL, FALSE,
83 'm', ARG_STRING, 0.0, 0, NULL},
84 { "integer matrix", "FALSE", "TRUE", "FALSE", TRUE,
85 'M', ARG_BOOLEAN, 0.0, 0, NULL},
86 { "GI", "0", "0", "9000000", TRUE,
87 'g', ARG_INT, 0.0, 0, NULL},
88 { "UID (GI) list", NULL, NULL, NULL, TRUE,
89 'G', ARG_STRING, 0.0, 0, NULL},
90 { "FastA file", NULL, NULL, NULL, TRUE,
91 'f', ARG_STRING, 0.0, 0, NULL},
92 { "FastA file is nucleotide", "FALSE", "TRUE", "FALSE", TRUE,
93 'n', ARG_BOOLEAN, 0.0, 0, NULL},
94 { "cutoff score", "0.0", "-1000000.0", "1000000.0", TRUE,
95 'c', ARG_FLOAT, 0.0, 0, NULL},
96 { "show hit sequences", "FALSE", "TRUE", "FALSE", TRUE,
97 'a', ARG_BOOLEAN, 0.0, 0, NULL},
98 { "output file", NULL, NULL, NULL, TRUE,
99 'o', ARG_STRING, 0.0, 0, NULL},
100 { "forced window size", "0", "0", "256", TRUE,
101 'w', ARG_INT, 0.0, 0, NULL}
102 };
103
GetBioseq(GatherContextPtr gcp)104 static Boolean GetBioseq (GatherContextPtr gcp)
105 {
106 Gather_BSPtr gbsp;
107 BioseqPtr bsp;
108 Int4 gi, entrezgi;
109
110 if (gcp == NULL)
111 return FALSE;
112 if ((gbsp = (Gather_BSPtr) gcp->userdata) == NULL)
113 return FALSE;
114
115 if (gbsp->bsp != NULL)
116 return TRUE;
117 if (gcp->thistype != OBJ_BIOSEQ)
118 return TRUE;
119 if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
120 return TRUE;
121
122 gi = gbsp->gi;
123 if (gi > 0)
124 {
125 entrezgi = GetGIForSeqId (bsp->id);
126 if (gi == entrezgi)
127 gbsp->bsp = bsp;
128 return TRUE;
129 }
130 else
131 {
132 gbsp->bsp = bsp;
133 return TRUE;
134 }
135 }
136
Main(void)137 Int2 Main (void)
138 {
139 Int2 argcount;
140 Boolean flagHaveNet;
141
142 Int4 gi;
143 SeqEntryPtr sep;
144 Boolean flagEntrez;
145
146 FILE *fiop, *fout;
147 Char matrixfile[256], uidfile[256], fastafile[256];
148 Char bufin[256];
149 CharPtr title;
150 BioseqPtr bsp;
151
152 static GatherScope gs;
153 GatherScopePtr gsp;
154 static Gather_BS gbs;
155 Gather_BSPtr gbsp;
156
157 Boolean flagFirstPass, flagIsNA;
158 /* Boolean flagIsDNA; */
159 ComProfPtr profile, invprofile;
160 FloatHi cutoff;
161 SeqAlignPtr saph, sap;
162 StdSegPtr ssp;
163 SeqLocPtr slp;
164 Int4 start, stop, proflen, windowsize;
165 Uint1 strand;
166 Char cstr;
167 SeqPortPtr spp;
168 Uint1Ptr sequence;
169
170 argcount = sizeof (myargs) / sizeof (Args);
171 if (!GetArgs ("mts", argcount, myargs))
172 return 1;
173
174 if (myargs[2].intvalue == 0 &&
175 myargs[3].strvalue == NULL &&
176 myargs[4].strvalue == NULL)
177 {
178 ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
179 "No gi, UID list, or FastA file given :: for help : mts -");
180 ErrShow ();
181 exit (1);
182 }
183
184 StrCpy (matrixfile, myargs[0].strvalue);
185 if ((profile = ReadProfile (matrixfile)) == NULL)
186 {
187 ErrPostEx (SEV_ERROR, TOP_ERROR, 101,
188 "Failed to read profile: %s", matrixfile);
189 ErrShow ();
190 exit (1);
191 }
192 if (myargs[1].intvalue)
193 IntegerProfile (profile);
194 proflen = ProfLenMax (profile);
195 windowsize = myargs[9].intvalue;
196 if (windowsize > proflen)
197 proflen = CatenateProfile (profile, proflen, windowsize-proflen);
198 cutoff = myargs[6].floatvalue;
199 if (myargs[7].intvalue == TRUE)
200 sequence = (Uint1Ptr) MemNew ((size_t) (sizeof (Uint1) * (proflen+1)));
201 else
202 sequence = NULL;
203
204 if (myargs[8].strvalue == NULL)
205 {
206 fout = stdout;
207 }
208 else
209 {
210 if ((fout = FileOpen (myargs[8].strvalue, "w")) == NULL)
211 {
212 ErrPostEx (SEV_ERROR, TOP_ERROR, 101,
213 "Failed to open output file: %s", myargs[8].strvalue);
214 ErrShow ();
215 exit (1);
216 }
217 }
218
219 gsp = &gs;
220 gbsp = &gbs;
221
222 MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
223 MemSet ((Pointer) gsp->ignore, (int) (TRUE),
224 (size_t) (OBJ_MAX * sizeof (Boolean)));
225
226 gsp->ignore[OBJ_BIOSEQ] = FALSE;
227
228 uidfile[0] = '\0';
229 fastafile[0] = '\0';
230
231 gi = myargs[2].intvalue;
232 if (myargs[3].strvalue != NULL)
233 StrCpy (uidfile, myargs[3].strvalue);
234 if (myargs[4].strvalue != NULL)
235 StrCpy (fastafile, myargs[4].strvalue);
236
237 flagEntrez = FALSE;
238 if (gi > 0 || (StrLen (uidfile) > 0))
239 {
240 if (!EntrezInit ("mts", FALSE, &flagHaveNet))
241 {
242 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
243 "Entrez init failed");
244 ErrShow ();
245 exit (1);
246 }
247 flagEntrez = TRUE;
248 }
249
250 fiop = NULL;
251
252 if (gi > 0)
253 {
254 sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
255 }
256 else if (StrLen (uidfile) > 0)
257 {
258 if ((fiop = FileOpen (uidfile, "r")) == NULL)
259 {
260 ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
261 "Failed to open UID file: %s", uidfile);
262 ErrShow ();
263 exit (1);
264 }
265 while ((FileGets (bufin, sizeof (bufin), fiop)) != NULL)
266 {
267 if (bufin[0] == '>')
268 continue;
269 gi = atol (bufin);
270 sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
271 break;
272 }
273 }
274 else
275 {
276 if ((fiop = FileOpen (fastafile, "r")) == NULL)
277 {
278 ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
279 "Failed to open FastA file: %s", fastafile);
280 ErrShow ();
281 exit (1);
282 }
283 sep = FastaToSeqEntry (fiop, (Boolean) myargs[5].intvalue);
284 }
285
286 if (sep == NULL)
287 {
288 ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
289 "No seqentry found");
290 ErrShow ();
291 exit (1);
292 }
293
294 flagFirstPass = TRUE;
295 /* flagIsDNA = FALSE; */
296 flagIsNA = FALSE;
297 invprofile = NULL;
298 while (sep != NULL)
299 {
300 gbsp->bsp = NULL;
301 gbsp->gi = gi;
302 GatherSeqEntry (sep, (Pointer) gbsp, GetBioseq, (Pointer) gsp);
303
304 if (gbsp->bsp != NULL)
305 {
306 bsp = gbsp->bsp;
307 /*
308 in multiple sequence sets the first entry determines
309 if the set is nucleotides or peptides
310 */
311 if (flagFirstPass)
312 {
313 /* just DNA? shouldn't have to (want to?) rev-cmp RNA */
314 /* if (bsp->mol == Seq_mol_dna) */
315 if (ISA_na (bsp->mol))
316 {
317 invprofile = InvertProfile (profile);
318 if (myargs[1].intvalue)
319 IntegerProfile (invprofile);
320 /* flagIsDNA = TRUE; */
321 }
322 if (ISA_na (bsp->mol))
323 flagIsNA = TRUE;
324 flagFirstPass = FALSE;
325 }
326
327 title = FastaTitle (bsp, ">", NULL);
328 if (myargs[1].intvalue)
329 saph = sap = IntProfileMatchBioseq (gbsp->bsp, profile, invprofile,
330 (Int4) cutoff, FALSE);
331 else
332 saph = sap = ProfileMatchBioseq (gbsp->bsp, profile, invprofile,
333 cutoff, FALSE);
334 if (sap != NULL)
335 fprintf (fout, "%s\n", title);
336 MemFree (title);
337 URK_SeqAlignSortByStart (&sap);
338
339 while (sap != NULL)
340 {
341 ssp = (StdSegPtr) sap->segs;
342 slp = ssp->loc;
343 start = SeqLocStart (slp);
344 stop = SeqLocStop (slp);
345 strand = SeqLocStrand (slp);
346 if (strand != Seq_strand_minus)
347 cstr = '+';
348 else
349 cstr = '-';
350 if (myargs[7].intvalue)
351 {
352 if (flagIsNA)
353 spp = SeqPortNew (gbsp->bsp, 0, gbsp->bsp->length-1, strand,
354 Seq_code_iupacna);
355 else
356 spp = SeqPortNew (gbsp->bsp, 0, gbsp->bsp->length-1, strand,
357 Seq_code_iupacaa);
358 if (strand != Seq_strand_minus)
359 SeqPortSeek (spp, start, SEEK_SET);
360 else
361 SeqPortSeek (spp, gbsp->bsp->length-stop-1, SEEK_SET);
362 SeqPortRead (spp, sequence, (Int2) (stop-start+1));
363 sequence[stop-start+1] = 0;
364 fprintf (fout, "%8ld %8ld %c %6.2f %s\n",
365 (long) (start+1), (long) (stop+1),
366 cstr, (FloatHi) sap->score->value.realvalue,
367 (CharPtr) sequence);
368 SeqPortFree (spp);
369 }
370 else
371 {
372 fprintf (fout, "%8ld %8ld %c %6.2f\n",
373 (long) (start+1), (long) (stop+1),
374 cstr, (FloatHi) sap->score->value.realvalue);
375 }
376 fflush (fout);
377 sap = sap->next;
378 }
379
380 sap = saph;
381 while (sap != NULL)
382 {
383 saph = sap->next;
384 SeqAlignFree (sap);
385 sap = saph;
386 }
387 }
388 else
389 {
390 ErrPostEx (SEV_ERROR, TOP_ERROR, 105,
391 "No bioseq found");
392 ErrShow ();
393 exit (1);
394 }
395
396 SeqEntryFree (sep);
397
398 if (StrLen (uidfile) > 0)
399 {
400 while ((FileGets (bufin, sizeof (bufin), fiop)) != NULL)
401 {
402 if (bufin[0] == '>')
403 continue;
404 gi = atol (bufin);
405 sep = EntrezSeqEntryGet (gi, SEQENTRY_READ_BIOSEQ);
406 break;
407 }
408 }
409 else if (StrLen (fastafile) > 0)
410 {
411 sep = FastaToSeqEntry (fiop, (Boolean) myargs[5].intvalue);
412 }
413 else
414 {
415 sep = NULL;
416 }
417 }
418
419 MemFree (sequence);
420 ComProfFree (profile);
421 ComProfFree (invprofile);
422 if (fout != stdout)
423 FileClose (fout);
424 FileClose (fiop);
425 if (flagEntrez)
426 EntrezFini ();
427
428 return 0;
429 }
430