1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: twop.c
27 *
28 * Author(s): John Kuzio
29 *
30 * Version Creation Date: 98-01-01
31 *
32 * $Revision: 6.5 $
33 *
34 * File Description: best id blast of two proteins
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date       Name        Description of modification
39 * --------------------------------------------------------------------------
40 * $Log: twop.c,v $
41 * Revision 6.5  1998/09/16 18:19:32  kuzio
42 * cvs logging
43 *
44 * ==========================================================================
45 */
46 
47 /*
48   BLAST pair score for longest gapped overlap
49 */
50 
51 #include <ncbi.h>
52 #include <accentr.h>
53 #include <tofasta.h>
54 #include <gather.h>
55 #include <blast.h>
56 #include <urkutil.h>
57 #include <urkptpf.h>
58 
59 #define TOP_ERROR 1
60 static char _this_module[] = "twop";
61 #undef  THIS_MODULE
62 #define THIS_MODULE _this_module
63 static char _this_file[] = __FILE__;
64 #undef  THIS_FILE
65 #define THIS_FILE _this_file
66 
67 typedef struct gather_Bioseq
68 {
69   Int4      gi;
70   BioseqPtr bsp;
71 } Gather_BS, PNTR Gather_BSPtr;
72 
73 Args myargs[] =
74 {
75   { "FastA protein target file (search)", NULL, NULL, NULL, TRUE,
76     'F', ARG_STRING, 0.0, 0, NULL },
77   { "FastA protein probe file (query)", NULL, NULL, NULL, TRUE,
78     'f', ARG_STRING, 0.0, 0, NULL },
79   { "List of protein gi's target file (search)", NULL, NULL, NULL, TRUE,
80     'L', ARG_STRING, 0.0, 0, NULL },
81   { "List of protein gi's probe file (query)", NULL, NULL, NULL, TRUE,
82     'l', ARG_STRING, 0.0, 0, NULL },
83   { "Output file", NULL, NULL, NULL, TRUE,
84     'o', ARG_STRING, 0.0, 0, NULL },
85   { "Show pattern match results", "FALSE", "FALSE", "TRUE", TRUE,
86     'p', ARG_BOOLEAN, 0.0, 0, NULL }
87 };
88 
GetBioseq(GatherContextPtr gcp)89 static Boolean GetBioseq (GatherContextPtr gcp)
90 {
91   Gather_BSPtr   gbsp;
92   BioseqPtr      bsp;
93   Int4           entrezgi;
94 
95   if (gcp == NULL)
96     return FALSE;
97   if ((gbsp = (Gather_BSPtr) gcp->userdata) == NULL)
98     return FALSE;
99 
100   if (gbsp->bsp != NULL)
101     return TRUE;
102   if (gcp->thistype != OBJ_BIOSEQ)
103     return TRUE;
104   if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
105     return TRUE;
106 
107   if (gbsp->gi > 0)
108   {
109     entrezgi = GetGIForSeqId (bsp->id);
110     if (gbsp->gi == entrezgi)
111       gbsp->bsp = bsp;
112     return TRUE;
113   }
114   else
115   {
116     gbsp->bsp = bsp;
117     return TRUE;
118   }
119 }
120 
Main(void)121 Int2 Main (void)
122 {
123   Int2           argcount;
124   Boolean        flagHaveNet;
125 
126   FILE           *fioT, *fioP, *fout;
127   Boolean        flagFastaT, flagFastaP, flagFastaL, flagFastal;
128   Char           bufuid[256];
129   Int4           giT, giP;
130 
131   SeqEntryPtr    sepT, sepP;
132   BioseqPtr      bspT, bspP;
133   SeqLocPtr      slpT, slpP;
134 
135   BLAST_OptionsBlkPtr blstopt;
136 
137   Int4           i, loop;
138   Int4           startT, startP;
139   SeqAlignPtr    sap, sapn;
140   DenseSegPtr    dsp;
141   SeqPortPtr     sppT, sppP;
142   Uint1          seqT, seqP;
143   Int4           effective_length, eff_len_max, seq_identity, seq_id_max;
144   CharPtr        titleT, titleP;
145   CharPtr        cptr;
146 
147   Boolean        flagPattern;
148   CharPtr        pattern_file = "ncbipros.dat";
149   ComPatPtr      cpp, cpph;
150   SeqAlignPtr    sapT, sapP;
151 
152   static GatherScope  gs;
153   GatherScopePtr      gsp;
154   static Gather_BS    gbs;
155   Gather_BSPtr        gbsp;
156 
157   argcount = sizeof (myargs) / sizeof (Args);
158   if (!GetArgs ("twop", argcount, myargs))
159     return 1;
160 
161   ErrSetLogLevel (SEV_ERROR);
162   ErrSetMessageLevel (SEV_ERROR);
163 
164   gsp = &gs;
165   MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
166   MemSet ((Pointer) gsp->ignore, (int) (TRUE),
167           (size_t) (OBJ_MAX * sizeof (Boolean)));
168   gsp->ignore[OBJ_BIOSEQ] = FALSE;
169   gbsp = &gbs;
170 
171   flagPattern = (Boolean) myargs[5].intvalue;
172 
173   if (myargs[0].strvalue != NULL)
174   {
175     flagFastaT = TRUE;
176     flagFastaL = FALSE;
177   }
178   else if (myargs[2].strvalue != NULL)
179   {
180     flagFastaT = FALSE;
181     flagFastaL = TRUE;
182   }
183   else
184   {
185     flagFastaT = FALSE;
186     flagFastaL = FALSE;
187   }
188   if (myargs[1].strvalue != NULL)
189   {
190     flagFastaP = TRUE;
191     flagFastal = FALSE;
192   }
193   else if (myargs[3].strvalue != NULL)
194   {
195     flagFastaP = FALSE;
196     flagFastal = TRUE;
197   }
198   else
199   {
200     flagFastaP = FALSE;
201     flagFastal = FALSE;
202   }
203 
204   if (!flagFastaT && !flagFastaL)
205   {
206     ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
207                "No target file given :: for help :   twop -");
208     ErrShow ();
209     exit (1);
210   }
211   if (!flagFastaP && !flagFastal)
212   {
213     ErrPostEx (SEV_ERROR, TOP_ERROR, 100,
214                "No probe file given :: for help :   twop -");
215     ErrShow ();
216     exit (1);
217   }
218 
219   if (flagFastaL || flagFastal)
220   {
221     if (!EntrezInit ("twop", FALSE, &flagHaveNet))
222     {
223       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
224                  "Entrez init failed");
225       ErrShow ();
226       exit (1);
227     }
228   }
229 
230   giT = 0;
231   giP = 0;
232 
233   if (flagFastaT)
234   {
235     if ((fioT = FileOpen (myargs[0].strvalue, "r")) == NULL)
236     {
237       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
238                  "Failed to open FastA file %s", myargs[0].strvalue);
239       ErrShow ();
240       exit (1);
241     }
242     sepT = FastaToSeqEntry (fioT, FALSE);
243   }
244 
245   if (flagFastaP)
246   {
247     if ((fioP = FileOpen (myargs[1].strvalue, "r")) == NULL)
248     {
249       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
250                  "Failed to open FastA file %s", myargs[1].strvalue);
251       ErrShow ();
252       exit (1);
253     }
254     sepP = FastaToSeqEntry (fioP, FALSE);
255   }
256 
257   if (flagFastaL)
258   {
259     if ((fioT = FileOpen (myargs[2].strvalue, "r")) == NULL)
260     {
261       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
262                  "Failed to open gi list file %s", myargs[2].strvalue);
263       ErrShow ();
264       exit (1);
265     }
266     while ((FileGets (bufuid, sizeof (bufuid), fioT)) != NULL)
267     {
268       if (bufuid[0] == '>')
269         continue;
270       break;
271     }
272     giT = atol (bufuid);
273     sepT = EntrezSeqEntryGet (giT, SEQENTRY_READ_NUC_PROT);
274   }
275 
276   if (flagFastal)
277   {
278     if ((fioP = FileOpen (myargs[3].strvalue, "r")) == NULL)
279     {
280       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
281                  "Failed to open gi list file %s", myargs[3].strvalue);
282       ErrShow ();
283       exit (1);
284     }
285     while ((FileGets (bufuid, sizeof (bufuid), fioP)) != NULL)
286     {
287       if (bufuid[0] == '>')
288         continue;
289       break;
290     }
291     giP = atol (bufuid);
292     sepP = EntrezSeqEntryGet (giP, SEQENTRY_READ_NUC_PROT);
293   }
294 
295   if (sepT == NULL || sepP == NULL)
296   {
297     ErrPostEx (SEV_ERROR, TOP_ERROR, 103,
298                "Seqentry for target or probe not found");
299     ErrShow ();
300     exit (1);
301   }
302 
303   if (myargs[4].strvalue != NULL)
304   {
305     if ((fout = FileOpen (myargs[4].strvalue, "w")) == NULL)
306     {
307       ErrPostEx (SEV_ERROR, TOP_ERROR, 102,
308                  "Failed to open output file %s", myargs[4].strvalue);
309       ErrShow ();
310       exit (1);
311     }
312   }
313   else
314   {
315     fout = stdout;
316   }
317 
318   if (flagPattern)
319     cpph = ReadPrositePattern (pattern_file, FALSE, 25, NULL, NULL);
320   else
321     cpph = NULL;
322 
323   blstopt = BLASTOptionNew ("blastp", TRUE);
324   blstopt->wordsize = 3;
325   blstopt->expect_value = 10.0;
326   blstopt->filter = 0;
327   blstopt->gap_open = 9;
328   blstopt->gap_extend = 2;
329   blstopt->gap_x_dropoff = 50;
330   blstopt->threshold_second = 9;
331   blstopt->two_pass_method = FALSE;
332   blstopt->multiple_hits_only = FALSE;
333 
334   while (sepT != NULL && sepP != NULL)
335   {
336     gbsp->gi = giT;
337     gbsp->bsp = NULL;
338     GatherSeqEntry (sepT, (Pointer) gbsp, GetBioseq, (Pointer) gsp);
339     bspT = gbsp->bsp;
340     gbsp->gi = giP;
341     gbsp->bsp = NULL;
342     GatherSeqEntry (sepP, (Pointer) gbsp, GetBioseq, (Pointer) gsp);
343     bspP = gbsp->bsp;
344     if (bspT == NULL || bspP == NULL)
345     {
346       ErrPostEx (SEV_ERROR, TOP_ERROR, 104,
347                  "Bioseq for target or probe not found");
348       ErrShow ();
349       exit (1);
350     }
351     slpT = SeqLocIntNew (0, bspT->length-1, Seq_strand_unknown, bspT->id);
352     slpP = SeqLocIntNew (0, bspP->length-1, Seq_strand_unknown, bspP->id);
353 
354     sap = BlastTwoSequencesByLoc (slpT, slpP, "blastp", blstopt);
355     seq_id_max = 0;
356     eff_len_max = 0;
357     while (sap != NULL)
358     {
359       effective_length = 0;
360       seq_identity = 0;
361       if (sap->segtype == 2)   /* DenseSeg gapped search */
362       {
363         sppT = SeqPortNew (bspT, 0, bspT->length-1, 0, Seq_code_iupacaa);
364         sppP = SeqPortNew (bspP, 0, bspP->length-1, 0, Seq_code_iupacaa);
365         dsp = sap->segs;
366         for (loop = 0; loop < (Int4) dsp->numseg; loop++)
367         {
368           if (dsp->starts[loop*2] == -1 || dsp->starts[(loop*2)+1] == -1)
369           {
370             effective_length += dsp->lens[loop];
371           }
372           else
373           {
374             effective_length += dsp->lens[loop];
375             startT = dsp->starts[loop*2];
376             startP = dsp->starts[(loop*2)+1];
377             SeqPortSeek (sppT, startT, SEEK_SET);
378             SeqPortSeek (sppP, startP, SEEK_SET);
379             for (i = 0; i < dsp->lens[loop]; i++)
380             {
381               seqT = SeqPortGetResidue (sppT);
382               seqP = SeqPortGetResidue (sppP);
383               if (seqT == seqP)
384                 seq_identity++;
385             }
386           }
387         }
388         SeqPortFree (sppT);
389         SeqPortFree (sppP);
390       }
391       if (effective_length > eff_len_max)
392       {
393         seq_id_max = seq_identity;
394         eff_len_max = effective_length;
395       }
396       sapn = sap->next;
397       SeqAlignFree (sap);
398       sap = sapn;
399     }
400 
401     titleT = FastaTitle (bspT, ">", NULL);
402     titleP = FastaTitle (bspP, ">", NULL);
403 
404     cptr = titleT;
405     while (*cptr != '\0')
406     {
407       if (*cptr == ' ')
408         break;
409       cptr++;
410     }
411     *cptr = '\0';
412     cptr = titleP;
413     while (*cptr != '\0')
414     {
415       if (*cptr == ' ')
416         break;
417       cptr++;
418     }
419     *cptr = '\0';
420 
421     if (eff_len_max > 0)
422     {
423       fprintf (fout,
424  "%16s %4ld %16s %4ld -- %6.2f identity %4ld residues\n",
425  titleT, (long) bspT->length, titleP, (long) bspP->length,
426  (FloatHi) seq_id_max / (FloatHi) eff_len_max * 100.0,
427  (long) eff_len_max);
428     }
429     else
430     {
431       fprintf (fout,
432  "%16s %4ld %16s %4ld -- %6.2f identity\n",
433  titleT, (long) bspT->length, titleP, (long) bspP->length, 0.0);
434     }
435     cpp = cpph;
436     while (cpp != NULL)
437     {
438       sapT = PatternMatchBioseq (bspT, cpp, 0);
439       sapP = PatternMatchBioseq (bspP, cpp, 0);
440       if (sapT != NULL && sapP != NULL)
441       {
442         fprintf (fout, "target and probe:  %s\n", cpp->name);
443       }
444       sap = sapT;
445       while (sap != NULL)
446       {
447         sapn = sap->next;
448         SeqAlignFree (sap);
449         sap = sapn;
450       }
451       sap = sapP;
452       while (sap != NULL)
453       {
454         sapn = sap->next;
455         SeqAlignFree (sap);
456         sap = sapn;
457       }
458       cpp = cpp->nextpattern;
459     }
460     SeqLocFree (slpT);
461     SeqLocFree (slpP);
462     SeqEntryFree (sepT);
463     SeqEntryFree (sepP);
464     MemFree (titleT);
465     MemFree (titleP);
466     if (flagFastaT)
467     {
468       sepT = FastaToSeqEntry (fioT, FALSE);
469     }
470     if (flagFastaP)
471     {
472       sepP = FastaToSeqEntry (fioP, FALSE);
473     }
474     if (flagFastaL)
475     {
476       bufuid[0] = '\0';
477       FileGets (bufuid, sizeof (bufuid), fioT);
478       giT = atol (bufuid);
479       sepT = EntrezSeqEntryGet (giT, SEQENTRY_READ_NUC_PROT);
480     }
481     if (flagFastal)
482     {
483       bufuid[0] = '\0';
484       FileGets (bufuid, sizeof (bufuid), fioP);
485       giP = atol (bufuid);
486       sepP = EntrezSeqEntryGet (giP, SEQENTRY_READ_NUC_PROT);
487     }
488   }
489 
490   if (sepT != NULL)
491     SeqEntryFree (sepT);
492   if (sepP != NULL)
493     SeqEntryFree (sepP);
494   FileClose (fioT);
495   FileClose (fioP);
496   if (flagFastaL || flagFastal)
497     EntrezFini ();
498   if (fout != stdout)
499     FileClose (fout);
500   BLASTOptionDelete (blstopt);
501   if (flagPattern)
502     ComPatFree (cpph);
503 
504   return 0;
505 }
506