1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: twop.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.5 $
33 *
34 * File Description: best id blast of two proteins
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: twop.c,v $
41 * Revision 6.5 1998/09/16 18:19:32 kuzio
42 * cvs logging
43 *
44 * ==========================================================================
45 */
46
47 /*
48 BLAST pair score for longest gapped overlap
49 */
50
51 #include <ncbi.h>
52 #include <accentr.h>
53 #include <tofasta.h>
54 #include <gather.h>
55 #include <blast.h>
56 #include <urkutil.h>
57 #include <urkptpf.h>
58
59 #define TOP_ERROR 1
60 static char _this_module[] = "twop";
61 #undef THIS_MODULE
62 #define THIS_MODULE _this_module
63 static char _this_file[] = __FILE__;
64 #undef THIS_FILE
65 #define THIS_FILE _this_file
66
67 typedef struct gather_Bioseq
68 {
69 Int4 gi;
70 BioseqPtr bsp;
71 } Gather_BS, PNTR Gather_BSPtr;
72
73 Args myargs[] =
74 {
75 { "FastA protein target file (search)", NULL, NULL, NULL, TRUE,
76 'F', ARG_STRING, 0.0, 0, NULL },
77 { "FastA protein probe file (query)", NULL, NULL, NULL, TRUE,
78 'f', ARG_STRING, 0.0, 0, NULL },
79 { "List of protein gi's target file (search)", NULL, NULL, NULL, TRUE,
80 'L', ARG_STRING, 0.0, 0, NULL },
81 { "List of protein gi's probe file (query)", NULL, NULL, NULL, TRUE,
82 'l', ARG_STRING, 0.0, 0, NULL },
83 { "Output file", NULL, NULL, NULL, TRUE,
84 'o', ARG_STRING, 0.0, 0, NULL },
85 { "Show pattern match results", "FALSE", "FALSE", "TRUE", TRUE,
86 'p', ARG_BOOLEAN, 0.0, 0, NULL }
87 };
88
GetBioseq(GatherContextPtr gcp)89 static Boolean GetBioseq (GatherContextPtr gcp)
90 {
91 Gather_BSPtr gbsp;
92 BioseqPtr bsp;
93 Int4 entrezgi;
94
95 if (gcp == NULL)
96 return FALSE;
97 if ((gbsp = (Gather_BSPtr) gcp->userdata) == NULL)
98 return FALSE;
99
100 if (gbsp->bsp != NULL)
101 return TRUE;
102 if (gcp->thistype != OBJ_BIOSEQ)
103 return TRUE;
104 if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
105 return TRUE;
106
107 if (gbsp->gi > 0)
108 {
109 entrezgi = GetGIForSeqId (bsp->id);
110 if (gbsp->gi == entrezgi)
111 gbsp->bsp = bsp;
112 return TRUE;
113 }
114 else
115 {
116 gbsp->bsp = bsp;
117 return TRUE;
118 }
119 }
120
Main(void)121 Int2 Main (void)
122 {
123 Int2 argcount;
124 Boolean flagHaveNet;
125
126 FILE *fioT, *fioP, *fout;
127 Boolean flagFastaT, flagFastaP, flagFastaL, flagFastal;
128 Char bufuid[256];
129 Int4 giT, giP;
130
131 SeqEntryPtr sepT, sepP;
132 BioseqPtr bspT, bspP;
133 SeqLocPtr slpT, slpP;
134
135 BLAST_OptionsBlkPtr blstopt;
136
137 Int4 i, loop;
138 Int4 startT, startP;
139 SeqAlignPtr sap, sapn;
140 DenseSegPtr dsp;
141 SeqPortPtr sppT, sppP;
142 Uint1 seqT, seqP;
143 Int4 effective_length, eff_len_max, seq_identity, seq_id_max;
144 CharPtr titleT, titleP;
145 CharPtr cptr;
146
147 Boolean flagPattern;
148 CharPtr pattern_file = "ncbipros.dat";
149 ComPatPtr cpp, cpph;
150 SeqAlignPtr sapT, sapP;
151
152 static GatherScope gs;
153 GatherScopePtr gsp;
154 static Gather_BS gbs;
155 Gather_BSPtr gbsp;
156
157 argcount = sizeof (myargs) / sizeof (Args);
158 if (!GetArgs ("twop", argcount, myargs))
159 return 1;
160
161 ErrSetLogLevel (SEV_ERROR);
162 ErrSetMessageLevel (SEV_ERROR);
163
164 gsp = &gs;
165 MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
166 MemSet ((Pointer) gsp->ignore, (int) (TRUE),
167 (size_t) (OBJ_MAX * sizeof (Boolean)));
168 gsp->ignore[OBJ_BIOSEQ] = FALSE;
169 gbsp = &gbs;
170
171 flagPattern = (Boolean) myargs[5].intvalue;
172
173 if (myargs[0].strvalue != NULL)
174 {
175 flagFastaT = TRUE;
176 flagFastaL = FALSE;
177 }
178 else if (myargs[2].strvalue != NULL)
179 {
180 flagFastaT = FALSE;
181 flagFastaL = TRUE;
182 }
183 else
184 {
185 flagFastaT = FALSE;
186 flagFastaL = FALSE;
187 }
188 if (myargs[1].strvalue != NULL)
189 {
190 flagFastaP = TRUE;
191 flagFastal = FALSE;
192 }
193 else if (myargs[3].strvalue != NULL)
194 {
195 flagFastaP = FALSE;
196 flagFastal = TRUE;
197 }
198 else
199 {
200 flagFastaP = FALSE;
201 flagFastal = FALSE;
202 }
203
204 if (!flagFastaT && !flagFastaL)
205 {
206 ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
207 "No target file given :: for help : twop -");
208 ErrShow ();
209 exit (1);
210 }
211 if (!flagFastaP && !flagFastal)
212 {
213 ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
214 "No probe file given :: for help : twop -");
215 ErrShow ();
216 exit (1);
217 }
218
219 if (flagFastaL || flagFastal)
220 {
221 if (!EntrezInit ("twop", FALSE, &flagHaveNet))
222 {
223 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
224 "Entrez init failed");
225 ErrShow ();
226 exit (1);
227 }
228 }
229
230 giT = 0;
231 giP = 0;
232
233 if (flagFastaT)
234 {
235 if ((fioT = FileOpen (myargs[0].strvalue, "r")) == NULL)
236 {
237 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
238 "Failed to open FastA file %s", myargs[0].strvalue);
239 ErrShow ();
240 exit (1);
241 }
242 sepT = FastaToSeqEntry (fioT, FALSE);
243 }
244
245 if (flagFastaP)
246 {
247 if ((fioP = FileOpen (myargs[1].strvalue, "r")) == NULL)
248 {
249 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
250 "Failed to open FastA file %s", myargs[1].strvalue);
251 ErrShow ();
252 exit (1);
253 }
254 sepP = FastaToSeqEntry (fioP, FALSE);
255 }
256
257 if (flagFastaL)
258 {
259 if ((fioT = FileOpen (myargs[2].strvalue, "r")) == NULL)
260 {
261 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
262 "Failed to open gi list file %s", myargs[2].strvalue);
263 ErrShow ();
264 exit (1);
265 }
266 while ((FileGets (bufuid, sizeof (bufuid), fioT)) != NULL)
267 {
268 if (bufuid[0] == '>')
269 continue;
270 break;
271 }
272 giT = atol (bufuid);
273 sepT = EntrezSeqEntryGet (giT, SEQENTRY_READ_NUC_PROT);
274 }
275
276 if (flagFastal)
277 {
278 if ((fioP = FileOpen (myargs[3].strvalue, "r")) == NULL)
279 {
280 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
281 "Failed to open gi list file %s", myargs[3].strvalue);
282 ErrShow ();
283 exit (1);
284 }
285 while ((FileGets (bufuid, sizeof (bufuid), fioP)) != NULL)
286 {
287 if (bufuid[0] == '>')
288 continue;
289 break;
290 }
291 giP = atol (bufuid);
292 sepP = EntrezSeqEntryGet (giP, SEQENTRY_READ_NUC_PROT);
293 }
294
295 if (sepT == NULL || sepP == NULL)
296 {
297 ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
298 "Seqentry for target or probe not found");
299 ErrShow ();
300 exit (1);
301 }
302
303 if (myargs[4].strvalue != NULL)
304 {
305 if ((fout = FileOpen (myargs[4].strvalue, "w")) == NULL)
306 {
307 ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
308 "Failed to open output file %s", myargs[4].strvalue);
309 ErrShow ();
310 exit (1);
311 }
312 }
313 else
314 {
315 fout = stdout;
316 }
317
318 if (flagPattern)
319 cpph = ReadPrositePattern (pattern_file, FALSE, 25, NULL, NULL);
320 else
321 cpph = NULL;
322
323 blstopt = BLASTOptionNew ("blastp", TRUE);
324 blstopt->wordsize = 3;
325 blstopt->expect_value = 10.0;
326 blstopt->filter = 0;
327 blstopt->gap_open = 9;
328 blstopt->gap_extend = 2;
329 blstopt->gap_x_dropoff = 50;
330 blstopt->threshold_second = 9;
331 blstopt->two_pass_method = FALSE;
332 blstopt->multiple_hits_only = FALSE;
333
334 while (sepT != NULL && sepP != NULL)
335 {
336 gbsp->gi = giT;
337 gbsp->bsp = NULL;
338 GatherSeqEntry (sepT, (Pointer) gbsp, GetBioseq, (Pointer) gsp);
339 bspT = gbsp->bsp;
340 gbsp->gi = giP;
341 gbsp->bsp = NULL;
342 GatherSeqEntry (sepP, (Pointer) gbsp, GetBioseq, (Pointer) gsp);
343 bspP = gbsp->bsp;
344 if (bspT == NULL || bspP == NULL)
345 {
346 ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
347 "Bioseq for target or probe not found");
348 ErrShow ();
349 exit (1);
350 }
351 slpT = SeqLocIntNew (0, bspT->length-1, Seq_strand_unknown, bspT->id);
352 slpP = SeqLocIntNew (0, bspP->length-1, Seq_strand_unknown, bspP->id);
353
354 sap = BlastTwoSequencesByLoc (slpT, slpP, "blastp", blstopt);
355 seq_id_max = 0;
356 eff_len_max = 0;
357 while (sap != NULL)
358 {
359 effective_length = 0;
360 seq_identity = 0;
361 if (sap->segtype == 2) /* DenseSeg gapped search */
362 {
363 sppT = SeqPortNew (bspT, 0, bspT->length-1, 0, Seq_code_iupacaa);
364 sppP = SeqPortNew (bspP, 0, bspP->length-1, 0, Seq_code_iupacaa);
365 dsp = sap->segs;
366 for (loop = 0; loop < (Int4) dsp->numseg; loop++)
367 {
368 if (dsp->starts[loop*2] == -1 || dsp->starts[(loop*2)+1] == -1)
369 {
370 effective_length += dsp->lens[loop];
371 }
372 else
373 {
374 effective_length += dsp->lens[loop];
375 startT = dsp->starts[loop*2];
376 startP = dsp->starts[(loop*2)+1];
377 SeqPortSeek (sppT, startT, SEEK_SET);
378 SeqPortSeek (sppP, startP, SEEK_SET);
379 for (i = 0; i < dsp->lens[loop]; i++)
380 {
381 seqT = SeqPortGetResidue (sppT);
382 seqP = SeqPortGetResidue (sppP);
383 if (seqT == seqP)
384 seq_identity++;
385 }
386 }
387 }
388 SeqPortFree (sppT);
389 SeqPortFree (sppP);
390 }
391 if (effective_length > eff_len_max)
392 {
393 seq_id_max = seq_identity;
394 eff_len_max = effective_length;
395 }
396 sapn = sap->next;
397 SeqAlignFree (sap);
398 sap = sapn;
399 }
400
401 titleT = FastaTitle (bspT, ">", NULL);
402 titleP = FastaTitle (bspP, ">", NULL);
403
404 cptr = titleT;
405 while (*cptr != '\0')
406 {
407 if (*cptr == ' ')
408 break;
409 cptr++;
410 }
411 *cptr = '\0';
412 cptr = titleP;
413 while (*cptr != '\0')
414 {
415 if (*cptr == ' ')
416 break;
417 cptr++;
418 }
419 *cptr = '\0';
420
421 if (eff_len_max > 0)
422 {
423 fprintf (fout,
424 "%16s %4ld %16s %4ld -- %6.2f identity %4ld residues\n",
425 titleT, (long) bspT->length, titleP, (long) bspP->length,
426 (FloatHi) seq_id_max / (FloatHi) eff_len_max * 100.0,
427 (long) eff_len_max);
428 }
429 else
430 {
431 fprintf (fout,
432 "%16s %4ld %16s %4ld -- %6.2f identity\n",
433 titleT, (long) bspT->length, titleP, (long) bspP->length, 0.0);
434 }
435 cpp = cpph;
436 while (cpp != NULL)
437 {
438 sapT = PatternMatchBioseq (bspT, cpp, 0);
439 sapP = PatternMatchBioseq (bspP, cpp, 0);
440 if (sapT != NULL && sapP != NULL)
441 {
442 fprintf (fout, "target and probe: %s\n", cpp->name);
443 }
444 sap = sapT;
445 while (sap != NULL)
446 {
447 sapn = sap->next;
448 SeqAlignFree (sap);
449 sap = sapn;
450 }
451 sap = sapP;
452 while (sap != NULL)
453 {
454 sapn = sap->next;
455 SeqAlignFree (sap);
456 sap = sapn;
457 }
458 cpp = cpp->nextpattern;
459 }
460 SeqLocFree (slpT);
461 SeqLocFree (slpP);
462 SeqEntryFree (sepT);
463 SeqEntryFree (sepP);
464 MemFree (titleT);
465 MemFree (titleP);
466 if (flagFastaT)
467 {
468 sepT = FastaToSeqEntry (fioT, FALSE);
469 }
470 if (flagFastaP)
471 {
472 sepP = FastaToSeqEntry (fioP, FALSE);
473 }
474 if (flagFastaL)
475 {
476 bufuid[0] = '\0';
477 FileGets (bufuid, sizeof (bufuid), fioT);
478 giT = atol (bufuid);
479 sepT = EntrezSeqEntryGet (giT, SEQENTRY_READ_NUC_PROT);
480 }
481 if (flagFastal)
482 {
483 bufuid[0] = '\0';
484 FileGets (bufuid, sizeof (bufuid), fioP);
485 giP = atol (bufuid);
486 sepP = EntrezSeqEntryGet (giP, SEQENTRY_READ_NUC_PROT);
487 }
488 }
489
490 if (sepT != NULL)
491 SeqEntryFree (sepT);
492 if (sepP != NULL)
493 SeqEntryFree (sepP);
494 FileClose (fioT);
495 FileClose (fioP);
496 if (flagFastaL || flagFastal)
497 EntrezFini ();
498 if (fout != stdout)
499 FileClose (fout);
500 BLASTOptionDelete (blstopt);
501 if (flagPattern)
502 ComPatFree (cpph);
503
504 return 0;
505 }
506