1 static char const rcsid[] = "$Id: xmlblast.c,v 6.41 2007/05/07 13:30:54 kans Exp $";
2 
3 /* $Id: xmlblast.c,v 6.41 2007/05/07 13:30:54 kans Exp $ */
4 /**************************************************************************
5 *                                                                         *
6 *                             COPYRIGHT NOTICE                            *
7 *                                                                         *
8 * This software/database is categorized as "United States Government      *
9 * Work" under the terms of the United States Copyright Act.  It was       *
10 * produced as part of the author's official duties as a Government        *
11 * employee and thus can not be copyrighted.  This software/database is    *
12 * freely available to the public for use without a copyright notice.      *
13 * Restrictions can not be placed on its present or future use.            *
14 *                                                                         *
15 * Although all reasonable efforts have been taken to ensure the accuracy  *
16 * and reliability of the software and data, the National Library of       *
17 * Medicine (NLM) and the U.S. Government do not and can not warrant the   *
18 * performance or results that may be obtained by using this software,     *
19 * data, or derivative works thereof.  The NLM and the U.S. Government     *
20 * disclaim any and all warranties, expressed or implied, as to the        *
21 * performance, merchantability or fitness for any particular purpose or   *
22 * use.                                                                    *
23 *                                                                         *
24 * In any work or product derived from this material, proper attribution   *
25 * of the author(s) as the source of the software or data would be         *
26 * appreciated.                                                            *
27 *                                                                         *
28 **************************************************************************
29 * File Name:  xmlblast.c
30 *
31 * Author:  Sergei B. Shavirin
32 *
33 * Version Creation Date: 05/17/2000
34 *
35 * $Revision: 6.41 $
36 *
37 * File Description:  Functions to print simplified BLAST output (XML)
38 *
39 *
40 * $Log: xmlblast.c,v $
41 * Revision 6.41  2007/05/07 13:30:54  kans
42 * added casts for Seq-data.gap (SeqDataPtr, SeqGapPtr, ByteStorePtr)
43 *
44 * Revision 6.40  2007/03/23 14:38:31  madden
45 * Fix RT 15264581 (ungapped use of std-seg) and add needed BioseqUnlock
46 *
47 * Revision 6.39  2005/07/28 14:57:10  coulouri
48 * remove dead code
49 *
50 * Revision 6.38  2005/07/05 14:55:07  madden
51 * Add call to AsnSetXMLmodulePrefix
52 *
53 * Revision 6.37  2005/05/16 18:16:40  dondosha
54 * Removed calls to ObjMgrFreeCache - it should be called on a higher level
55 *
56 * Revision 6.36  2004/10/20 19:58:58  dondosha
57 * Replace sequence data in Bioseq, as done in txalign.c, to assure proper masking and identities/positives calculation for all programs
58 *
59 * Revision 6.35  2004/06/30 12:32:20  madden
60 * Added include for blfmtutl.h, removed unused variable
61 *
62 * Revision 6.34  2004/06/23 21:08:58  dondosha
63 * Fixed masking of filtered locations for translated queries
64 *
65 * Revision 6.33  2004/04/29 19:55:35  dondosha
66 * Mask filtered locations in query sequence lines
67 *
68 * Revision 6.32  2004/03/31 17:58:23  dondosha
69 * Added PSIXmlReset function to allow keeping the AsnIoPtr between outputs for multiple queries in blastpgp
70 *
71 * Revision 6.31  2003/08/04 16:19:16  dondosha
72 * Added effective HSP length (length adjustment) to other returns, so it can be reported in XML output
73 *
74 * Revision 6.30  2003/05/30 17:25:38  coulouri
75 * add rcsid
76 *
77 * Revision 6.29  2003/03/21 21:01:16  camacho
78 * Fixed inversion of subject and query sequences for tblastx
79 *
80 * Revision 6.28  2003/02/19 15:42:32  madden
81 * Check glb_matrix before freeing
82 *
83 * Revision 6.27  2003/01/28 16:57:11  dondosha
84 * For single query, call the old BXMLPrintOutput function from BXMLPrintMultiQueryOutput
85 *
86 * Revision 6.26  2003/01/23 20:02:53  dondosha
87 * Distinguish between blastn and megablast programs in multi-query XML output
88 *
89 * Revision 6.25  2003/01/23 19:55:20  dondosha
90 * Added the closing part for multi-query XML output.
91 *
92 * Revision 6.24  2003/01/06 23:01:40  dondosha
93 * Added function to create a multi-query XML output for web megablast
94 *
95 * Revision 6.23  2002/11/14 15:37:18  dondosha
96 * Added functions to extract all hit information from seqalign that can be extracted without loading sequences
97 *
98 * Revision 6.22  2002/07/17 22:28:13  dondosha
99 * Added support for megablast XML output
100 *
101 * Revision 6.21  2002/04/23 20:48:24  madden
102 * Fix hsp_count for ungapped case
103 *
104 * Revision 6.20  2002/03/08 21:34:57  madden
105 * If no title for Bioseq use "No definition line found"
106 *
107 * Revision 6.19  2002/01/15 21:56:38  madden
108 * Fixes from Jed Wing (Turbogenomics) for ungapped runs
109 *
110 * Revision 6.18  2001/05/01 20:54:32  madden
111 * Set glb_matrix in BXMLSeqAlignToHits if not already set
112 *
113 * Revision 6.17  2001/02/28 21:37:25  shavirin
114 * Fixed XML printing in case when definition line is missing.
115 *
116 * Revision 6.16  2001/01/31 18:43:49  dondosha
117 * Test whether subject Bioseq is found before trying to show the hit
118 *
119 * Revision 6.15  2001/01/19 20:02:17  dondosha
120 * Set the query and hit frames depending on strand for blastn output
121 *
122 * Revision 6.14  2000/11/28 20:51:57  shavirin
123 * Adopted for usage with mani-iterational XML definition.
124 *
125 * Revision 6.13  2000/11/22 21:55:49  shavirin
126 * Added function BXMLPrintOutputEx() with new parameter iteration_number
127 * for usage with PSI-Blast.
128 *
129 * Revision 6.12  2000/11/17 21:49:33  shavirin
130 * Changed order from/to for negative strand AFTER retrieval of the sequence.
131 *
132 * Revision 6.11  2000/11/08 21:39:25  shavirin
133 * Changed order from/to depending on strand and frame.
134 *
135 * Revision 6.10  2000/11/08 20:49:05  shavirin
136 * Added paramter "score" as in Traditional Blast Output. Previous score
137 * changed to "bit_score".
138 *
139 * Revision 6.9  2000/11/08 20:09:32  shavirin
140 * Added new parameter align_len analogos to the number reported in
141 * the Traditional Blast Output.
142 *
143 * Revision 6.8  2000/11/07 21:51:22  shavirin
144 * Added check if query sequence is available for retrieval.
145 *
146 * Revision 6.7  2000/10/23 22:12:46  shavirin
147 * Added possibility to create XML with error message in case of failure or
148 * no hits.
149 *
150 * Revision 6.6  2000/10/23 19:56:06  dondosha
151 * AsnIo should be opened and closed outside function BXMLPrintOutput
152 *
153 * Revision 6.5  2000/10/12 21:35:31  shavirin
154 * Added support for OOF alignment.
155 *
156 * Revision 6.4  2000/08/22 14:53:00  shavirin
157 * Changed variable from enthropy to entropy.
158 *
159 * Revision 6.3  2000/08/11 16:54:17  kans
160 * return FALSE instead of NULL for Mac compiler
161 *
162 * Revision 6.2  2000/08/10 14:42:33  shavirin
163 * Added missing comment.
164 *
165 * Revision 6.1  2000/08/09 20:40:04  shavirin
166 * Initial revision.
167 * *
168 *
169 */
170 
171 #include <xmlblast.h>
172 #include <blfmtutl.h>
173 
174 static Int4Ptr PNTR glb_matrix;
175 
BXMLGetSeqLineForDenseDiag(DenseDiagPtr ddp,HspPtr hsp,Int4 length,Boolean is_aa,Int4Ptr PNTR matrix)176 Boolean BXMLGetSeqLineForDenseDiag(DenseDiagPtr ddp, HspPtr hsp, Int4 length,
177                                    Boolean is_aa, Int4Ptr PNTR matrix)
178 {
179     SeqIdPtr m_id, t_id;
180     SeqInt si;
181     SeqLoc sl;
182     Int4 i;
183     Uint1 m_res, t_res;
184     SeqPortPtr m_spp, t_spp;
185 
186     if(ddp == NULL || hsp == NULL)
187         return FALSE;
188 
189     hsp->qseq = MemNew(length+1);
190     hsp->hseq = MemNew(length+1);
191     hsp->midline = MemNew(length+1);
192 
193     sl.choice = SEQLOC_INT;
194     sl.data.ptrvalue = &si;
195 
196     /* !! Here we assume, that this is 2-dimensional DenseDiag where
197        first element is query and second element is database sequence
198        This is the case for ALL non-gapped blastp and blastn SeqAligns */
199 
200     /* SeqLoc for query sequence */
201 
202     m_id = SeqIdDup(ddp->id);
203     si.id = m_id;
204     si.from = hsp->query_from;
205     si.to = hsp->query_to;
206     si.strand = (ddp->strands == NULL) ? 0 : ddp->strands[0];
207 
208     m_spp = SeqPortNewByLoc(&sl, is_aa ? Seq_code_ncbieaa : Seq_code_iupacna);
209 
210     /* SeqLoc for the subject */
211 
212     t_id = SeqIdDup(ddp->id->next);
213     si.id = t_id;
214     si.from = hsp->hit_from;
215     si.to = hsp->hit_to;
216     si.strand = (ddp->strands == NULL) ? 0 : ddp->strands[1];
217 
218     t_spp = SeqPortNewByLoc(&sl, is_aa ? Seq_code_ncbieaa : Seq_code_iupacna);
219 
220     if(m_spp == NULL || t_spp == NULL) {
221         if(m_spp == NULL)
222             SeqPortFree(m_spp);
223         if(t_spp != NULL)
224             SeqPortFree(t_spp);
225         return FALSE;
226     }
227 
228     for(i = 0; i < length; ++i) {
229         m_res = SeqPortGetResidue(m_spp);
230         t_res = SeqPortGetResidue(t_spp);
231 
232         hsp->qseq[i] = m_res;
233         hsp->hseq[i] = t_res;
234 
235         if(m_res == t_res) {
236             hsp->midline[i] = is_aa ? m_res : '|';
237         } else if(matrix != NULL && is_aa) {
238             if (IS_residue(m_res) && IS_residue(t_res) &&
239                 matrix[m_res][t_res] >0) {
240                 hsp->midline[i] = '+';
241             } else {
242                 hsp->midline[i] = ' ';
243             }
244         } else {
245             hsp->midline[i] = ' ';
246         }
247     }
248 
249     SeqIdFree(t_id);
250     SeqIdFree(m_id);
251     SeqPortFree(m_spp);
252     SeqPortFree(t_spp);
253 
254     return TRUE;
255 }
BXMLGetSeqLineForDenseSeg(DenseSegPtr dsp,HspPtr hsp,Int4 length,Boolean is_aa,Int4Ptr PNTR matrix)256 Boolean BXMLGetSeqLineForDenseSeg(DenseSegPtr dsp, HspPtr hsp, Int4 length,
257                                   Boolean is_aa, Int4Ptr PNTR matrix)
258 {
259 
260     SeqInt msi, tsi;
261     SeqLoc sl;
262     Int2 i;
263     Int2 dim;
264     Int2 m_order, t_order;	/*order of the master and the target sequence*/
265     Int4 index, abs_index;
266     Int4 j, val, t_val;
267     Uint1 m_res, t_res;
268     SeqPortPtr m_spp, t_spp;
269 
270     if(dsp == NULL || hsp == NULL)
271         return FALSE;
272 
273     hsp->qseq = MemNew(length+1);
274     hsp->hseq = MemNew(length+1);
275     hsp->midline = MemNew(length+1);
276 
277     m_order = 0;
278     t_order = 1;
279     dim = 2;
280 
281     msi.id = SeqIdDup(dsp->ids);
282     msi.from = hsp->query_from;
283     msi.to = hsp->query_to;
284     msi.strand = (dsp->strands == NULL) ? 0 : dsp->strands[m_order];
285 
286     tsi.id = dsp->ids->next;
287     tsi.from = hsp->hit_from;
288     tsi.to = hsp->hit_to;
289     tsi.strand = (dsp->strands == NULL) ? 0 : dsp->strands[t_order];
290 
291     sl.choice = SEQLOC_INT;
292     sl.data.ptrvalue = &msi;
293     m_spp = SeqPortNewByLoc(&sl,
294                             (is_aa) ? Seq_code_ncbieaa : Seq_code_iupacna);
295 
296     sl.choice = SEQLOC_INT;
297     sl.data.ptrvalue = &tsi;
298     t_spp = SeqPortNewByLoc(&sl,
299                             (is_aa) ? Seq_code_ncbieaa : Seq_code_iupacna);
300 
301     abs_index = 0;
302     for(i = 0; i<dsp->numseg; ++i) {
303         val = dsp->starts[i*dim + m_order];
304         t_val = dsp->starts[i*dim + t_order];
305         if(val == -1 || t_val == -1) {
306 
307             if(val == -1 && t_val == -1) /* never should happend */
308                 continue;
309 
310             if(val != -1) {
311                 index = dsp->lens[i];
312                 while (index > 0) {
313                     index--;
314                     m_res = SeqPortGetResidue(m_spp);
315                     hsp->qseq[abs_index] = m_res;
316                     hsp->hseq[abs_index] = '-';
317                     hsp->midline[abs_index] = ' ';
318                     abs_index++;
319                 }
320             }
321             if(t_val != -1) {
322                 index = dsp->lens[i];
323                 while (index > 0) {
324                     index--;
325                     t_res = SeqPortGetResidue(t_spp);
326                     hsp->qseq[abs_index] = '-';
327                     hsp->hseq[abs_index] = t_res;
328                     hsp->midline[abs_index] = ' ';
329                     abs_index++;
330                 }
331             }
332         } else {
333             for(j = 0; j<dsp->lens[i]; ++j) {
334                 m_res = SeqPortGetResidue(m_spp);
335                 t_res = SeqPortGetResidue(t_spp);
336 
337                 hsp->qseq[abs_index] = m_res;
338                 hsp->hseq[abs_index] = t_res;
339 
340                 if(m_res == t_res) {
341                     if(is_aa)
342                         hsp->midline[abs_index] = m_res;
343                     else
344                         hsp->midline[abs_index] = '|';
345 
346                 } else if(matrix != NULL && is_aa &&
347                           IS_residue(m_res) && IS_residue(t_res)) {
348 
349                     if(matrix[m_res][t_res] >0)
350                         hsp->midline[abs_index] = '+';
351                     else
352                         hsp->midline[abs_index] = ' ';
353 
354                 } else {
355                     hsp->midline[abs_index] = ' ';
356                 }
357 
358                 abs_index++;
359             }
360         }
361     }
362 
363     SeqIdFree(msi.id);
364     SeqPortFree(m_spp);
365     SeqPortFree(t_spp);
366     return TRUE;
367 }
BXMLGetSeqLineForStdSeg(StdSegPtr ssp,HspPtr hsp,Int4 length,Int4Ptr PNTR matrix)368 Boolean BXMLGetSeqLineForStdSeg(StdSegPtr ssp, HspPtr hsp, Int4 length,
369                                 Int4Ptr PNTR matrix)
370 {
371     Boolean master_is_translated=FALSE, both_translated=FALSE;
372     Boolean target_is_translated = FALSE;
373     CharPtr genetic_code1, genetic_code2;
374     SeqPortPtr spp1, spp2;
375     Uint1 codon[4], residue1, residue2;
376     Int4 abs_index;
377     Boolean  ungapped_align = FALSE;
378 
379     if(ssp == NULL || hsp == NULL)
380         return FALSE;
381 
382     hsp->qseq = MemNew(length+1);
383     hsp->hseq = MemNew(length+1);
384     hsp->midline = MemNew(length+1);
385 
386     /* Check for valid sequence. */
387     if (SeqLocLen(ssp->loc) == 3*SeqLocLen(ssp->loc->next))
388         master_is_translated = TRUE;
389     else if (3*SeqLocLen(ssp->loc) == SeqLocLen(ssp->loc->next))
390         target_is_translated = TRUE;
391     else if (SeqLocLen(ssp->loc) == SeqLocLen(ssp->loc->next))
392         both_translated = TRUE;
393     else
394         return FALSE;
395 
396     if (master_is_translated) {
397         genetic_code1 = GetGeneticCodeFromSeqId(ssp->ids);
398     } else if (both_translated) {
399         genetic_code1 = GetGeneticCodeFromSeqId(ssp->ids);
400         genetic_code2 = GetGeneticCodeFromSeqId(ssp->ids->next);
401     } else {
402         genetic_code1 = GetGeneticCodeFromSeqId(ssp->ids->next);
403     }
404 
405 
406     for(abs_index = 0; ssp != 0 && abs_index < length; ssp = ssp->next) {
407 
408         if (ssp->loc->choice != SEQLOC_EMPTY || ssp->loc->next->choice != SEQLOC_EMPTY) {
409             /* Non - gap element */
410             if (ssp->loc->choice != SEQLOC_EMPTY && ssp->loc->next->choice != SEQLOC_EMPTY) {
411                 if (both_translated) { /* TBLASTX */
412                     spp1 = SeqPortNewByLoc(ssp->loc, Seq_code_ncbi4na);
413                     spp2 = SeqPortNewByLoc(ssp->loc->next, Seq_code_ncbi4na);
414                     while ((codon[0]=SeqPortGetResidue(spp2)) != SEQPORT_EOF) {
415                         codon[1] = SeqPortGetResidue(spp2);
416                         codon[2] = SeqPortGetResidue(spp2);
417                         residue1 = AAForCodon(codon, genetic_code1);
418                         codon[0] = SeqPortGetResidue(spp1);
419                         codon[1] = SeqPortGetResidue(spp1);
420                         codon[2] = SeqPortGetResidue(spp1);
421                         residue2 = AAForCodon(codon, genetic_code2);
422 
423                         hsp->qseq[abs_index] = residue2;
424                         hsp->hseq[abs_index] = residue1;
425 
426                         if (residue1 == residue2)
427                             hsp->midline[abs_index] = residue1;
428                         else if (matrix != NULL && matrix[residue1][residue2] >0)
429                             hsp->midline[abs_index] = '+';
430                         else
431                             hsp->midline[abs_index] = ' ';
432 
433                         abs_index++;
434 
435                     }
436                 } else {     /* TBLASTN or BLASTX */
437                     if (master_is_translated) { /* BLASTX */
438                         spp1 = SeqPortNewByLoc(ssp->loc, Seq_code_ncbi4na);
439                         spp2 = SeqPortNewByLoc(ssp->loc->next, Seq_code_ncbieaa);
440                     } else {                    /* TBLASTN */
441                         spp2 = SeqPortNewByLoc(ssp->loc, Seq_code_ncbieaa);
442                         spp1 = SeqPortNewByLoc(ssp->loc->next, Seq_code_ncbi4na);
443                     }
444 
445                     while ((residue1=SeqPortGetResidue(spp2)) != SEQPORT_EOF) {
446                         codon[0] = SeqPortGetResidue(spp1);
447                         codon[1] = SeqPortGetResidue(spp1);
448                         codon[2] = SeqPortGetResidue(spp1);
449                         residue2 = AAForCodon(codon, genetic_code1);
450 
451                         if (master_is_translated) { /* BLASTX */
452                             hsp->qseq[abs_index] = residue2;
453                             hsp->hseq[abs_index] = residue1;
454                         } else {                    /* TBLASTN */
455                             hsp->qseq[abs_index] = residue1;
456                             hsp->hseq[abs_index] = residue2;
457                         }
458 
459                         if (residue1 == residue2)
460                             hsp->midline[abs_index] = residue1;
461                         else if (matrix != NULL && matrix[residue1][residue2] >0)
462                             hsp->midline[abs_index] = '+';
463                         else
464                             hsp->midline[abs_index] = ' ';
465 
466                         abs_index++;
467                     }
468                 }
469                 SeqPortFree(spp1);
470                 SeqPortFree(spp2);
471                 /* Check if this is an ungapped alignment;
472                    in this case do not go to next link */
473                 if (ssp->next &&
474                     ssp->next->loc->choice != SEQLOC_EMPTY &&
475                     ssp->next->loc->next->choice != SEQLOC_EMPTY)
476                     ungapped_align = TRUE;
477 
478             } else if (ssp->loc->choice == SEQLOC_EMPTY) { /* gap in query */
479 
480                 if (target_is_translated) { /* TBLASTN */
481                     spp1 = SeqPortNewByLoc(ssp->loc->next, Seq_code_ncbi4na);
482 
483                     while(TRUE) {
484                         if((codon[0] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
485                             break;
486                         if((codon[1] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
487                             break;
488                         if((codon[2] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
489                             break;
490 
491                         residue1 = AAForCodon(codon, master_is_translated ? genetic_code2 : genetic_code1);
492 
493                         hsp->qseq[abs_index]  = '-'; /* gap character */
494                         hsp->hseq[abs_index] = residue1;
495                         hsp->midline[abs_index] = ' ';
496                         abs_index++;
497                     }
498                     SeqPortFree(spp1);
499                 } else {        /* BLASTX */
500                     spp1 = SeqPortNewByLoc(ssp->loc->next, Seq_code_ncbieaa);
501 
502                     while ((residue1=SeqPortGetResidue(spp1)) != SEQPORT_EOF) {
503                         hsp->qseq[abs_index] = '-'; /* gap character */
504                         hsp->hseq[abs_index] = residue1;
505                         hsp->midline[abs_index] = ' ';
506                         abs_index++;
507                     }
508 
509                     SeqPortFree(spp1);
510                 }
511             } else if (ssp->loc->next->choice == SEQLOC_EMPTY) {
512                 if (master_is_translated) { /* BLASTX */
513                     spp1 = SeqPortNewByLoc(ssp->loc, Seq_code_ncbi4na);
514 
515                     while(TRUE) {
516                         if((codon[0] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
517                             break;
518                         if((codon[1] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
519                             break;
520                         if((codon[2] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
521                             break;
522 
523                         residue1 = AAForCodon(codon, genetic_code1);
524 
525                         hsp->qseq[abs_index] = residue1;
526                         hsp->hseq[abs_index] = '-'; /* gap character */
527                         hsp->midline[abs_index] = ' ';
528                         abs_index++;
529                     }
530                     SeqPortFree(spp1);
531                 } else {                    /* TBLASTN */
532 
533                     spp1 = SeqPortNewByLoc(ssp->loc, Seq_code_ncbieaa);
534 
535                     while ((residue1=SeqPortGetResidue(spp1)) != SEQPORT_EOF) {
536                         hsp->qseq[abs_index] = residue1;
537                         hsp->hseq[abs_index] = '-'; /* gap character */
538                         hsp->midline[abs_index] = ' ';
539                         abs_index++;
540                     }
541 
542                     SeqPortFree(spp1);
543                 }
544             }
545 
546             if (both_translated || ungapped_align)
547                 /* for tblastx perform only one StdSegPtr. so far...*/
548                 break;
549         } else {
550             continue;    /* Both EMPTY - never should happened */
551         }
552     }
553 
554     return TRUE;
555 }
556 
BXMLGetSeqLines(SeqAlignPtr align,HspPtr hsp,Int4 length,Boolean is_aa,Int4 chain,Int4Ptr PNTR matrix)557 Boolean BXMLGetSeqLines(SeqAlignPtr align, HspPtr hsp, Int4 length,
558                         Boolean is_aa, Int4 chain, Int4Ptr PNTR matrix)
559 {
560     DenseDiagPtr ddp;
561     DenseSegPtr dsp;
562     StdSegPtr ssp;
563     Uint2 order = 0;
564     SeqAlignPtr sap;
565 
566     if(align == NULL)
567         return FALSE;
568 
569     switch (align->segtype) {
570     case 1: /*Dense-diag*/
571         ddp = (DenseDiagPtr) align->segs;
572 
573         while(ddp) {
574             ++order;
575             if(order == chain) {
576                 BXMLGetSeqLineForDenseDiag(ddp, hsp, length, is_aa, matrix);
577                 break;
578             }
579             ddp = ddp->next;
580         }
581 
582         break;
583     case 2:
584         dsp = (DenseSegPtr) align->segs;
585         BXMLGetSeqLineForDenseSeg(dsp, hsp, length, is_aa, matrix);
586         break;
587     case 3:
588         ssp = (StdSegPtr) align->segs;
589         while(ssp) {
590             ++order;
591             if(order == chain) {
592                 BXMLGetSeqLineForStdSeg(ssp, hsp, length, matrix);
593                 break;
594             }
595             ssp = ssp->next;
596         }
597         break;
598     case 5: /* Discontinuous alignment */
599         sap = (SeqAlignPtr) align->segs;
600         return BXMLGetSeqLines(sap, hsp, length, is_aa, chain, matrix);
601     default:
602         break;
603     }
604 
605     return TRUE;
606 }
607 
608 /** Frees the part of the byte store list that has not been used for replacement
609  * of Bioseq data.
610  */
611 static ValNodePtr
ByteStoreListFree(ValNodePtr bs_list)612 ByteStoreListFree(ValNodePtr bs_list)
613 {
614    ByteStorePtr bsp;
615    ValNodePtr vnp;
616 
617    for(vnp = bs_list; vnp; vnp = vnp->next) {
618       bsp = (ByteStorePtr) vnp->data.ptrvalue;
619       if(bsp != NULL)
620           BSFree(bsp);
621    }
622    return ValNodeFree(bs_list);
623 }
624 
BXMLGetHspFromSeqAlign(SeqAlignPtr sap,Boolean is_aa,Int4 chain,Boolean is_ooframe,ValNodePtr mask_loc)625 HspPtr BXMLGetHspFromSeqAlign(SeqAlignPtr sap, Boolean is_aa, Int4 chain,
626                               Boolean is_ooframe, ValNodePtr mask_loc)
627 {
628     HspPtr hsp;
629     AlignSum as;
630     ScorePtr score, sp;
631     Boolean matrix_allocated = FALSE;
632     BioseqPtr bsp = NULL;
633     Char tmp[256];
634     ByteStorePtr seq_data=NULL;
635     Boolean seq_data_replaced = FALSE;
636     ValNodePtr bs_list = NULL;
637     Uint1 code=0;
638     Uint1 repr=0;
639 
640     if((hsp = HspNew()) == NULL)
641         return NULL;
642 
643     MemSet(&as, NULLB, sizeof(AlignSum));
644 
645     as.master_sip = TxGetQueryIdFromSeqAlign(sap);
646     as.target_sip = TxGetSubjectIdFromSeqAlign(sap);
647 
648     /* If there is a mask, retrieve the query Bioseq and replace the sequence
649        data with a masked one. */
650     if (mask_loc) {
651         Int4 frame;
652         if ((bsp = BioseqLockById(as.master_sip)) == NULL) {
653             SeqIdWrite(as.master_sip, tmp, PRINTID_FASTA_LONG, sizeof(tmp));
654             ErrPostEx(SEV_ERROR, 0, __LINE__, "Query sequence '%s' is not "
655                       "available in Bioseq index.", tmp);
656             return NULL;
657         }
658 
659         bs_list = CreateMaskByteStore (mask_loc);
660         if (ISA_na(bsp->mol) && sap->segtype == SAS_STD) {
661             StdSegPtr ssp = (StdSegPtr) sap->segs;
662             int ssp_index = 1;
663             while (ssp_index < chain)
664             {
665                 ssp = ssp->next;
666                 ssp_index++;
667             }
668             if (ssp == NULL)
669             {   /* No more to process. */
670                 BioseqUnlock(bsp);
671                 return NULL;
672             }
673             frame = SeqLocStart(ssp->loc);
674             if (SeqLocStrand(ssp->loc) == Seq_strand_minus) {
675                 frame += SeqLocLen(ssp->loc);
676                 frame = 4 + (bsp->length - frame)%3;
677             } else {
678                 frame = (1 + frame%3);
679             }
680         } else
681             frame = 0;
682         repr = bsp->repr;
683         seq_data = (ByteStorePtr) bsp->seq_data;
684         code = bsp->seq_data_type;
685         seq_data_replaced =
686                 replace_bytestore_data(bsp, bs_list, (Uint1)frame);
687         if (!seq_data_replaced) {
688             bsp->repr = repr;
689             bsp->seq_data = (SeqDataPtr) seq_data;
690             bsp->seq_data_type = code;
691         }
692     } else {
693         /* Checkup for query Bioseq to be available in the index */
694         if ((bsp = BioseqLockById(as.master_sip)) == NULL) {
695             SeqIdWrite(as.master_sip, tmp, PRINTID_FASTA_LONG, sizeof(tmp));
696             ErrPostEx(SEV_ERROR, 0, __LINE__, "Query sequence '%s' is not "
697                       "available in Bioseq index.", tmp);
698             return NULL;
699         }
700     }
701 
702     if(glb_matrix != NULL)
703         as.matrix = glb_matrix;
704     else {
705         as.matrix = load_default_matrix ();
706         matrix_allocated = TRUE;
707     }
708     as.is_aa = is_aa;
709     as.ooframe = is_ooframe;
710 
711     if(chain == 0)
712         chain = 1;
713 
714     if((score = find_score_in_align(sap, chain, &as)) == NULL) {
715         if(matrix_allocated)
716             free_default_matrix(as.matrix);
717         HspFree(hsp);
718         BioseqUnlock(bsp);
719         return NULL;
720     }
721 
722     for(sp = score; sp != NULL; sp = sp->next) {
723         if(!(StringICmp(sp->id->str, "e_value")) ||
724            !(StringICmp(sp->id->str, "sum_e")))
725             hsp->evalue = sp->value.realvalue;
726         if(!StringICmp(sp->id->str, "bit_score")) {
727             hsp->bit_score = sp->value.realvalue;
728         }
729         if(!StringICmp(sp->id->str, "score")) {
730             hsp->score = sp->value.intvalue;
731         }
732     }
733 
734     if (as.m_frame_set || as.t_frame_set) {
735        hsp->query_frame = as.m_frame;
736        hsp->hit_frame = as.t_frame;
737     } else { /* blastn program! */
738        hsp->query_frame = 1;
739        if (as.m_strand == as.t_strand)
740           hsp->hit_frame = 1;
741        else
742           hsp->hit_frame = -1;
743     }
744     hsp->identity = as.identical;
745     hsp->positive = as.positive + as.identical;
746     hsp->gaps = as.gaps;
747     hsp->align_len = as.totlen;
748 
749     hsp->density = 0;             /* ???? */
750 
751 
752     /* For sequence retrieval from should always be less than to */
753     hsp->query_from = as.master_from;
754     hsp->query_to = as.master_to;
755     hsp->hit_from = as.target_from;
756     hsp->hit_to = as.target_to;
757 
758     BXMLGetSeqLines(sap, hsp, as.totlen, is_aa, chain, as.matrix);
759 
760     /* Restore the original query sequence data, if it was replaced. */
761     if (seq_data_replaced) {
762         bsp->seq_data = (SeqDataPtr) seq_data;
763         bsp->repr = repr;
764         bsp->seq_data_type = code;
765     }
766     bs_list = ByteStoreListFree(bs_list);
767     BioseqUnlock(bsp);
768 
769 
770     /* For display it depends on strand */
771 
772     if(as.m_strand == Seq_strand_minus || as.m_frame < 0) {
773         hsp->query_from = as.master_to;
774         hsp->query_to = as.master_from;
775     }
776 
777     if(as.t_strand == Seq_strand_minus || as.t_frame < 0) {
778         hsp->hit_from = as.target_to;
779         hsp->hit_to = as.target_from;
780     }
781 
782     /* ... and 1 position larger, that array number (started from 0) */
783 
784     hsp->query_from++;
785     hsp->query_to++;
786     hsp->hit_from++;
787     hsp->hit_to++;
788 
789     if(matrix_allocated)
790         free_default_matrix(as.matrix);
791 
792     return hsp;
793 }
794 
BXMLSeqAlignToHits(SeqAlignPtr seqalign,Boolean ungapped,Boolean is_ooframe,ValNodePtr mask_loc)795 HitPtr BXMLSeqAlignToHits(SeqAlignPtr seqalign, Boolean ungapped,
796                           Boolean is_ooframe, ValNodePtr mask_loc)
797 {
798     HitPtr hitp, hitp_head;
799     HspPtr hspp;
800     SeqAlignPtr sap, sap2;
801     SeqIdPtr subject_sip, new_sip;
802     BioseqPtr bsp;
803     Char buffer[526];
804     Int4 hit_count, hsp_count, chain;
805     Boolean is_aa;
806 
807     /* We assume, that  seqalignes in this set ordered by
808        database sequences and by e-values for a given database sequence */
809 
810     /* For optimization BLOSUM62 may be loaded ones */
811     if(glb_matrix == NULL)
812         glb_matrix = load_default_matrix ();
813 
814     hitp_head = NULL;
815     hit_count = 1;              /* Hits starting with 1 */
816 
817     for(sap = seqalign; sap != NULL;) {
818         subject_sip = TxGetSubjectIdFromSeqAlign(sap);
819         bsp = BioseqLockById(subject_sip);
820         if (!bsp) { /* Apparently a recently deleted sequence */
821            sap = sap->next;
822            continue;
823         }
824 
825         if(hitp_head == NULL) { /* first element */
826             hitp_head = hitp = HitNew();
827         } else {
828             hitp->next = HitNew();
829             hitp = hitp->next;
830         }
831         hitp->num = hit_count;
832         hit_count++;
833 
834         is_aa = (bsp->mol == Seq_mol_aa);
835 
836         if (BioseqGetTitle(bsp))
837         	hitp->def = StringSave(BioseqGetTitle(bsp));
838         else
839         	hitp->def = StringSave("No definition line found");
840 
841         SeqIdWrite(bsp->id, buffer, PRINTID_FASTA_LONG, sizeof(buffer));
842         hitp->id = StringSave(buffer);
843 
844         if(bsp->id->choice == SEQID_GI && bsp->id->next != NULL)
845            SeqIdWrite(bsp->id->next, buffer, PRINTID_TEXTID_ACCESSION, sizeof(buffer));
846         else
847            SeqIdWrite(bsp->id, buffer, PRINTID_TEXTID_ACCESSION, sizeof(buffer));
848 
849         hitp->accession = StringSave(buffer);
850 
851         hitp->len = bsp->length;
852         hsp_count = 1;          /* Hsps starting with 1 */
853         chain = 1;
854 
855         BioseqUnlock(bsp);
856 
857         for(sap2 = sap; sap2 != NULL;) {
858 
859             /* Filling info about specific alignments */
860 
861             while(TRUE) {
862                 if(hitp->hsps == NULL) {
863                     hspp = hitp->hsps =
864                        BXMLGetHspFromSeqAlign(sap2, is_aa, chain, is_ooframe,
865                                               mask_loc);
866                 } else {
867                     if((hspp->next = BXMLGetHspFromSeqAlign(sap2, is_aa, chain,
868                                         is_ooframe, mask_loc)) == NULL)
869                         break;
870                     else
871                         hspp = hspp->next;
872                 }
873 
874                 if(hspp == NULL)
875                     break;
876 
877                 hspp->num = hsp_count;
878                 hsp_count++;
879                 if(!ungapped) {  /* Only one chain for gapped */
880                     break;
881                 }
882                 chain++;
883             }
884 
885             sap2 = sap2->next;
886             new_sip = TxGetSubjectIdFromSeqAlign(sap2);
887 
888             if(SeqIdMatch(subject_sip, new_sip)) {
889                 continue;
890             } else {
891                 sap = sap2;
892                 break;
893             }
894         }
895     }
896 
897     return hitp_head;
898 }
899 
BXMLBuildOneIteration(SeqAlignPtr seqalign,ValNodePtr other_returns,Boolean is_ooframe,Boolean ungapped,Int4 iter_num,CharPtr message,ValNodePtr mask_loc)900 IterationPtr BXMLBuildOneIteration(SeqAlignPtr seqalign,
901                                    ValNodePtr other_returns,
902                                    Boolean is_ooframe, Boolean ungapped,
903                                    Int4 iter_num, CharPtr message,
904                                    ValNodePtr mask_loc)
905 {
906    return BXMLBuildOneQueryIteration(seqalign, other_returns, is_ooframe,
907                                      ungapped, iter_num, message, NULL,
908                                      mask_loc);
909 }
910 
BXMLBuildOneQueryIteration(SeqAlignPtr seqalign,ValNodePtr other_returns,Boolean is_ooframe,Boolean ungapped,Int4 iter_num,CharPtr message,BioseqPtr query,ValNodePtr mask_loc)911 IterationPtr BXMLBuildOneQueryIteration(SeqAlignPtr seqalign,
912                                    ValNodePtr other_returns,
913                                    Boolean is_ooframe, Boolean ungapped,
914                                    Int4 iter_num, CharPtr message,
915                                    BioseqPtr query, ValNodePtr mask_loc)
916 {
917     IterationPtr iterp;
918     Char buffer[1024];
919 
920     iterp = IterationNew();
921     iterp->iter_num = iter_num;
922 
923     if (query) {
924        SeqIdWrite(query->id, buffer, PRINTID_FASTA_LONG, sizeof(buffer));
925        iterp->query_ID = StringSave(buffer);
926 
927        if((iterp->query_def = StringSave(BioseqGetTitle(query))) == NULL) {
928           iterp->query_def = StringSave("No definition line found");
929        }
930 
931        iterp->query_len = query->length;
932 
933     }
934 
935     if(seqalign != NULL) {
936        iterp->hits =
937           BXMLSeqAlignToHits(seqalign, ungapped, is_ooframe, mask_loc);
938     }
939 
940     iterp->stat = BXMLBuildStatistics(other_returns, ungapped);
941 
942     iterp->message = StringSave(message);
943 
944     return iterp;
945 }
946 
947 StatisticsPtr
BXMLBuildStatistics(ValNodePtr other_returns,Boolean ungapped)948 BXMLBuildStatistics(ValNodePtr other_returns, Boolean ungapped)
949 {
950    TxDfDbInfoPtr dbinfo=NULL;
951    BLAST_KarlinBlkPtr ka_params_gap=NULL;
952    BLAST_KarlinBlkPtr ka_params_ungap=NULL;
953    ValNodePtr vnp;
954    StatisticsPtr stat;
955 
956    if (!other_returns)
957       return NULL;
958 
959    stat = StatisticsNew();
960 
961    for (dbinfo = NULL, vnp=other_returns; vnp; vnp = vnp->next) {
962       switch (vnp->choice) {
963       case TXDBINFO:
964          dbinfo = vnp->data.ptrvalue;
965          break;
966       case TXKABLK_GAP:
967          ka_params_gap = vnp->data.ptrvalue;
968          break;
969       case TXKABLK_NOGAP:
970          ka_params_ungap = vnp->data.ptrvalue;
971          break;
972       case EFF_SEARCH_SPACE:
973          stat->eff_space = vnp->data.realvalue;
974          break;
975       case EFF_HSP_LENGTH:
976          stat->hsp_len = vnp->data.intvalue;
977          break;
978       default:
979          break;
980       }
981    }
982 
983    if(dbinfo != NULL) {
984       stat->db_num= dbinfo->number_seqs;
985       stat->db_len = dbinfo->total_length;
986    }
987 
988    if(ungapped) {
989       if(ka_params_ungap != NULL) {
990          stat->lambda = ka_params_ungap->Lambda;
991          stat->kappa = ka_params_ungap->K;
992          stat->entropy = ka_params_ungap->H;
993       }
994    } else {
995       if(ka_params_gap != NULL) {
996          stat->lambda = ka_params_gap->Lambda;
997          stat->kappa = ka_params_gap->K;
998          stat->entropy = ka_params_gap->H;
999       }
1000    }
1001 
1002    return stat;
1003 }
1004 
BXMLCreateBlastOutputHead(CharPtr program,CharPtr database,BLAST_OptionsBlkPtr options,BioseqPtr query,Int4 flags)1005 BlastOutputPtr BXMLCreateBlastOutputHead(CharPtr program, CharPtr database,
1006                                          BLAST_OptionsBlkPtr options,
1007                                          BioseqPtr query, Int4 flags)
1008 {
1009     BlastOutputPtr boutp;
1010     Char buffer[1024];
1011     SeqPortPtr spp;
1012     Boolean is_aa = FALSE;
1013     Int4 i;
1014 
1015     if((boutp = BlastOutputNew()) == NULL)
1016         return FALSE;
1017 
1018     /* For optimization BLOSUM62 may be loaded ones */
1019     if(glb_matrix == NULL)
1020         glb_matrix = load_default_matrix ();
1021 
1022     if (query) {
1023        SeqIdWrite(query->id, buffer, PRINTID_FASTA_LONG, sizeof(buffer));
1024        boutp->query_ID = StringSave(buffer);
1025 
1026        if((boutp->query_def = StringSave(BioseqGetTitle(query))) == NULL) {
1027           boutp->query_def = StringSave("No definition line found");
1028        }
1029 
1030        boutp->query_len = query->length;
1031 
1032        if(flags & BXML_INCLUDE_QUERY) {
1033           boutp->query_seq = MemNew(query->length+1);
1034           is_aa = (query->mol == Seq_mol_aa);
1035           spp = SeqPortNew(query, 0, -1, Seq_strand_plus,
1036                            (is_aa) ? Seq_code_ncbieaa : Seq_code_iupacna);
1037 
1038           for (i = 0; i < query->length; i++) {
1039              boutp->query_seq[i] = SeqPortGetResidue(spp);
1040           }
1041           spp = SeqPortFree(spp);
1042        } else {
1043           boutp->query_seq = NULL;    /* Do we need sequence here??? */
1044        }
1045     }
1046     /* Program name */
1047     boutp->program = StringSave(program);
1048 
1049     /* Database name */
1050     boutp->db = StringSave(database);
1051 
1052     /* Version text */
1053     sprintf(buffer, "%s %s [%s]", program, BlastGetVersionNumber(),
1054             BlastGetReleaseDate());
1055     boutp->version = StringSave(buffer);
1056 
1057     /* Reference */
1058     boutp->reference = BlastGetReference(FALSE);
1059 
1060     /* Filling parameters */
1061 
1062     boutp->param = ParametersNew();
1063     boutp->param->expect = options->expect_value;
1064     boutp->param->matrix = StringSave(options->matrix);
1065     boutp->param->sc_match = options->reward;
1066     boutp->param->sc_mismatch = options->penalty;
1067     boutp->param->gap_open = options->gap_open;
1068     boutp->param->gap_extend = options->gap_extend;
1069     boutp->param->include = options->ethresh;
1070 
1071     if(options->filter_string != NULL)
1072         boutp->param->filter = StringSave(options->filter_string);
1073 
1074     return boutp;
1075 }
1076 
1077 /*
1078    This function will create and print out simplified ASN.1/XML of
1079    one-iterational Blast output - like regular Blast etc. This function
1080    should not be used for multi-iterational PSI-Blast.
1081 */
BXMLPrintOutput(AsnIoPtr aip,SeqAlignPtr seqalign,BLAST_OptionsBlkPtr options,CharPtr program,CharPtr database,BioseqPtr query,ValNodePtr other_returns,Int4 flags,CharPtr message,ValNodePtr mask_loc)1082 Boolean BXMLPrintOutput(AsnIoPtr aip, SeqAlignPtr seqalign,
1083                         BLAST_OptionsBlkPtr options,CharPtr program,
1084                         CharPtr database, BioseqPtr query,
1085                         ValNodePtr other_returns, Int4 flags,
1086                         CharPtr message, ValNodePtr mask_loc)
1087 {
1088     BlastOutputPtr boutp;
1089     Boolean ungapped = FALSE;
1090 
1091     if((boutp = BXMLCreateBlastOutputHead(program, database, options, query,
1092                                           flags)) == NULL)
1093         return FALSE;
1094 
1095     if(options->gapped_calculation == FALSE || !StringICmp(program, "tblastx"))
1096         ungapped = TRUE;
1097 
1098     /* Here is one-iterational Blast output */
1099     boutp->iterations = BXMLBuildOneIteration(seqalign, other_returns,
1100                                               options->is_ooframe, ungapped,
1101                                               1, message, mask_loc);
1102 
1103     if (aip != NULL)
1104         BlastOutputAsnWrite(boutp, aip, NULL);
1105 
1106     if (glb_matrix)
1107     	free_default_matrix(glb_matrix);
1108     glb_matrix = NULL;
1109 
1110     BlastOutputFree(boutp);
1111 
1112     return TRUE;
1113 }
1114 
BXMLPrintMultiQueryOutput(AsnIoPtr aip,SeqAlignPtr seqalign,BLAST_OptionsBlkPtr options,CharPtr program,CharPtr database,BioseqSetPtr query_set,ValNodePtr other_returns,Int4 flags,CharPtr message,ValNodePtr mask_loc)1115 Boolean BXMLPrintMultiQueryOutput(AsnIoPtr aip, SeqAlignPtr seqalign,
1116            BLAST_OptionsBlkPtr options, CharPtr program, CharPtr database,
1117            BioseqSetPtr query_set, ValNodePtr other_returns, Int4 flags,
1118            CharPtr message, ValNodePtr mask_loc)
1119 {
1120     Boolean ungapped = FALSE;
1121     BioseqPtr query;
1122     SeqEntryPtr sep;
1123     Boolean q_is_na, d_is_na;
1124     MBXmlPtr mbxp = NULL;
1125     IterationPtr iterp;
1126     SeqEntryFunc seqentry_callback;
1127     Int4 index;
1128     Boolean query_found;
1129     SeqIdPtr seqid = NULL;
1130     SeqAlignPtr sap = NULL, next_seqalign = NULL;
1131     ValNodePtr next_mask_loc = NULL, current_mask_loc = NULL;
1132 
1133     sep = (SeqEntryPtr) ((BioseqSetPtr)query_set)->seq_set;
1134     BlastGetTypes(program, &q_is_na, &d_is_na);
1135     if (q_is_na)
1136        seqentry_callback = FindNuc;
1137     else
1138        seqentry_callback = FindProt;
1139 
1140     /* If no queries, there is nothing to report */
1141     if (!sep)
1142        return FALSE;
1143 
1144     /* If only one query, call a one-query output function */
1145     if (!sep->next) {
1146        Boolean return_value;
1147        SeqEntryExplore(sep, &query, seqentry_callback);
1148        return_value = BXMLPrintOutput(aip, seqalign, options, program,
1149                          database, query, other_returns, flags,
1150                          message, mask_loc);
1151        /* This function is presumed to close the AsnIoPtr inside */
1152        AsnIoClose(aip);
1153        return return_value;
1154     }
1155 
1156     next_mask_loc = mask_loc;
1157 
1158     index = 0;
1159     while (seqalign) {
1160        /* Find the corresponding query */
1161        query_found = FALSE;
1162        for ( ; sep; ++index, sep = sep->next) {
1163           SeqEntryExplore(sep, &query, seqentry_callback);
1164           seqid = TxGetQueryIdFromSeqAlign(seqalign);
1165           if (SeqIdComp(query->id, seqid) == SIC_YES) {
1166              query_found = TRUE;
1167              break;
1168           }
1169        }
1170        if (query_found) {
1171           /* Find where seqaligns for this query end */
1172           for (sap = seqalign; sap && sap->next; sap = sap->next) {
1173              if (SeqIdComp(seqid, TxGetQueryIdFromSeqAlign(sap->next))
1174                  != SIC_YES) {
1175                 break;
1176              }
1177           }
1178           next_seqalign = sap->next;
1179           /* Unlink this query seqaligns from the rest */
1180           sap->next = NULL;
1181        } else {
1182           break;
1183        }
1184 
1185        /* Find the masking locations for this query */
1186        if (next_mask_loc &&
1187            SeqIdComp(SeqLocId((SeqLocPtr)next_mask_loc->data.ptrvalue),
1188                      seqid) == SIC_YES) {
1189           current_mask_loc = (SeqLocPtr)
1190              MemDup(next_mask_loc, sizeof(SeqLoc));
1191           next_mask_loc = next_mask_loc->next;
1192           current_mask_loc->next = NULL;
1193        } else {
1194           current_mask_loc = NULL;
1195        }
1196 
1197        if (!mbxp) {
1198           if (options->is_megablast_search) {
1199              mbxp = PSIXmlInit(aip, "megablast", database, options, query, 0);
1200           } else {
1201              mbxp = PSIXmlInit(aip, program, database, options, query, 0);
1202           }
1203        }
1204        if(options->gapped_calculation == FALSE ||
1205           !StringICmp(program, "tblastx"))
1206           ungapped = TRUE;
1207 
1208        /* Here is one-iterational Blast output */
1209        iterp = BXMLBuildOneQueryIteration(seqalign, NULL, options->is_ooframe,
1210                                           ungapped, index, NULL, query,
1211                                           current_mask_loc);
1212        current_mask_loc = (ValNodePtr) MemFree(current_mask_loc);
1213 
1214        IterationAsnWrite(iterp, mbxp->aip, mbxp->atp);
1215        AsnIoFlush(mbxp->aip);
1216        IterationFree(iterp);
1217        /* Reconnect the SeqAlign chain */
1218        if (sap)
1219           sap->next = next_seqalign;
1220        seqalign = next_seqalign;
1221     }
1222 
1223     free_default_matrix(glb_matrix);
1224     glb_matrix = NULL;
1225 
1226     MBXmlClose(mbxp, other_returns, !options->gapped_calculation);
1227 
1228     return TRUE;
1229 }
1230 
1231 
PSIXmlInit(AsnIoPtr aip,CharPtr program,CharPtr database,BLAST_OptionsBlkPtr options,BioseqPtr query,Int4 flags)1232 PSIXmlPtr PSIXmlInit(AsnIoPtr aip, CharPtr program, CharPtr database,
1233                      BLAST_OptionsBlkPtr options, BioseqPtr query, Int4 flags)
1234 {
1235     PSIXmlPtr psixp;
1236     AsnModulePtr amp;
1237     DataVal av;
1238     AsnTypePtr atp;
1239     Boolean retval = FALSE;
1240 
1241     AsnTypePtr       BLASTOUTPUT;
1242     AsnTypePtr       BLASTOUTPUT_program;
1243     AsnTypePtr       BLASTOUTPUT_version;
1244     AsnTypePtr       BLASTOUTPUT_reference;
1245     AsnTypePtr       BLASTOUTPUT_db;
1246     AsnTypePtr       BLASTOUTPUT_query_ID;
1247     AsnTypePtr       BLASTOUTPUT_query_def;
1248     AsnTypePtr       BLASTOUTPUT_query_len;
1249     AsnTypePtr       BLASTOUTPUT_query_seq;
1250     AsnTypePtr       BLASTOUTPUT_param;
1251     AsnTypePtr       BLASTOUTPUT_iterations;
1252     AsnTypePtr       BLASTOUTPUT_iterations_E;
1253     AsnTypePtr       BLASTOUTPUT_mbstat;
1254 
1255     psixp = (PSIXmlPtr) MemNew(sizeof(PSIXml));
1256 
1257     psixp->aip = aip;
1258 
1259     if (! bxmlobjAsnLoad()) {
1260         return NULL;
1261     }
1262 
1263     AsnSetXMLmodulePrefix("http://www.ncbi.nlm.nih.gov/dtd/");
1264 
1265     amp = AsnAllModPtr();
1266 
1267     MACRO_atp_find(BLASTOUTPUT,BlastOutput);
1268     MACRO_atp_find(BLASTOUTPUT_program,BlastOutput.program);
1269     MACRO_atp_find(BLASTOUTPUT_version,BlastOutput.version);
1270     MACRO_atp_find(BLASTOUTPUT_reference,BlastOutput.reference);
1271     MACRO_atp_find(BLASTOUTPUT_db,BlastOutput.db);
1272     MACRO_atp_find(BLASTOUTPUT_query_ID,BlastOutput.query-ID);
1273     MACRO_atp_find(BLASTOUTPUT_query_def,BlastOutput.query-def);
1274     MACRO_atp_find(BLASTOUTPUT_query_len,BlastOutput.query-len);
1275     MACRO_atp_find(BLASTOUTPUT_query_seq,BlastOutput.query-seq);
1276     MACRO_atp_find(BLASTOUTPUT_param,BlastOutput.param);
1277     MACRO_atp_find(BLASTOUTPUT_iterations,BlastOutput.iterations);
1278     MACRO_atp_find(BLASTOUTPUT_iterations_E,BlastOutput.iterations.E);
1279     MACRO_atp_find(BLASTOUTPUT_mbstat,BlastOutput.mbstat);
1280 
1281     /* Start of iterations structure */
1282     psixp->atp = BLASTOUTPUT_iterations_E;
1283 
1284     /* Head of all BlastOutput structure */
1285     psixp->BlastOutput = BLASTOUTPUT;
1286 
1287     /* Head of iterations strucure */
1288     psixp->BlastOutput_iterations = BLASTOUTPUT_iterations;
1289 
1290     /* Head of the final statistics for Mega BLAST */
1291     psixp->BlastOutput_mbstat = BLASTOUTPUT_mbstat;
1292 
1293     psixp->boutp = BXMLCreateBlastOutputHead(program, database, options,
1294                                              query, flags);
1295 
1296     atp = AsnLinkType(NULL, BLASTOUTPUT);   /* link local tree */
1297 
1298     if (atp == NULL) {
1299         return NULL;
1300     }
1301 
1302     if (! AsnOpenStruct(psixp->aip, atp, (Pointer) psixp->boutp)) {
1303         return NULL;
1304     }
1305 
1306     if (psixp->boutp->program != NULL) {
1307         av.ptrvalue = psixp->boutp -> program;
1308         retval = AsnWrite(psixp->aip, BLASTOUTPUT_program,  &av);
1309     }
1310 
1311     if (psixp->boutp->version != NULL) {
1312         av.ptrvalue = psixp->boutp->version;
1313         retval = AsnWrite(psixp->aip, BLASTOUTPUT_version,  &av);
1314     }
1315 
1316     if (psixp->boutp->reference != NULL) {
1317         av.ptrvalue = psixp->boutp->reference;
1318         retval = AsnWrite(psixp->aip, BLASTOUTPUT_reference,  &av);
1319     }
1320 
1321     if (psixp->boutp -> db != NULL) {
1322         av.ptrvalue = psixp->boutp->db;
1323         retval = AsnWrite(psixp->aip, BLASTOUTPUT_db,  &av);
1324     }
1325 
1326     if (psixp->boutp -> query_ID != NULL) {
1327         av.ptrvalue = psixp->boutp->query_ID;
1328         retval = AsnWrite(psixp->aip, BLASTOUTPUT_query_ID,  &av);
1329     }
1330 
1331     if (psixp->boutp->query_def != NULL) {
1332         av.ptrvalue = psixp->boutp->query_def;
1333         retval = AsnWrite(psixp->aip, BLASTOUTPUT_query_def,  &av);
1334     }
1335 
1336     av.intvalue = psixp->boutp->query_len;
1337     retval = AsnWrite(psixp->aip, BLASTOUTPUT_query_len,  &av);
1338 
1339     if (psixp->boutp->query_seq != NULL) {
1340         av.ptrvalue = psixp->boutp->query_seq;
1341         retval = AsnWrite(psixp->aip, BLASTOUTPUT_query_seq,  &av);
1342     }
1343 
1344     if (psixp->boutp->param != NULL) {
1345         if (!ParametersAsnWrite(psixp->boutp->param,
1346                                 psixp->aip, BLASTOUTPUT_param)) {
1347             return NULL;
1348         }
1349     }
1350 
1351     if(!AsnOpenStruct(psixp->aip, BLASTOUTPUT_iterations, NULL))
1352         return NULL;
1353 
1354     AsnIoFlush(psixp->aip);
1355 
1356     return psixp;
1357 }
1358 
PSIXmlReset(PSIXmlPtr psixp)1359 void PSIXmlReset(PSIXmlPtr psixp)
1360 {
1361     AsnCloseStruct(psixp->aip, psixp->BlastOutput_iterations, NULL);
1362     AsnCloseStruct(psixp->aip, psixp->BlastOutput, NULL);
1363     psixp->BlastOutput_iterations = NULL;
1364     psixp->BlastOutput = NULL;
1365     AsnIoReset(psixp->aip);
1366 
1367     psixp->boutp = BlastOutputFree(psixp->boutp);
1368 }
1369 
PSIXmlClose(PSIXmlPtr psixp)1370 void PSIXmlClose(PSIXmlPtr psixp)
1371 {
1372    if (psixp->BlastOutput_iterations)
1373       AsnCloseStruct(psixp->aip, psixp->BlastOutput_iterations, NULL);
1374    if (psixp->BlastOutput)
1375       AsnCloseStruct(psixp->aip, psixp->BlastOutput, NULL);
1376 
1377     AsnIoClose(psixp->aip);
1378 
1379     BlastOutputFree(psixp->boutp);
1380     MemFree(psixp);
1381 
1382     return;
1383 }
1384 
MBXmlClose(PSIXmlPtr mbxp,ValNodePtr other_returns,Boolean ungapped)1385 void MBXmlClose(PSIXmlPtr mbxp, ValNodePtr other_returns, Boolean ungapped)
1386 {
1387    StatisticsPtr stat;
1388    AsnTypePtr atp;
1389 
1390     AsnCloseStruct(mbxp->aip, mbxp->BlastOutput_iterations, NULL);
1391 
1392     if (other_returns) {
1393        atp = AsnLinkType(NULL, mbxp->BlastOutput_mbstat);
1394 
1395        /*AsnOpenStruct(mbxp->aip, mbxp->BlastOutput_mbstat, NULL);*/
1396        stat = BXMLBuildStatistics(other_returns, ungapped);
1397 
1398        StatisticsAsnWrite(stat, mbxp->aip, atp);
1399        AsnIoFlush(mbxp->aip);
1400        StatisticsFree(stat);
1401        /*AsnCloseStruct(mbxp->aip, mbxp->BlastOutput_mbstat, NULL);*/
1402     }
1403 
1404     AsnCloseStruct(mbxp->aip, mbxp->BlastOutput, NULL);
1405 
1406     AsnIoClose(mbxp->aip);
1407 
1408     BlastOutputFree(mbxp->boutp);
1409     MemFree(mbxp);
1410 
1411     return;
1412 }
1413 
FillHspScoreInfo(HspPtr hsp,ScorePtr score)1414 static void FillHspScoreInfo(HspPtr hsp, ScorePtr score)
1415 {
1416    ScorePtr sp;
1417    for(sp = score; sp != NULL; sp = sp->next) {
1418       if(!(StringICmp(sp->id->str, "e_value")) ||
1419          !(StringICmp(sp->id->str, "sum_e")))
1420          hsp->evalue = sp->value.realvalue;
1421       if(!StringICmp(sp->id->str, "bit_score")) {
1422          hsp->bit_score = sp->value.realvalue;
1423       }
1424       if(!StringICmp(sp->id->str, "score")) {
1425          hsp->score = sp->value.intvalue;
1426       }
1427       if(!StringICmp(sp->id->str, "num_ident")) {
1428          hsp->identity = sp->value.intvalue;
1429       }
1430       if (!StringICmp(sp->id->str, "sum_n")) {
1431          hsp->num = sp->value.intvalue;
1432       }
1433    }
1434 }
1435 
GetHspFromSeqAlign(SeqAlignPtr align,Boolean ungapped,Int4Ptr hspcnt_ptr)1436 static HspPtr GetHspFromSeqAlign(SeqAlignPtr align, Boolean ungapped,
1437                           Int4Ptr hspcnt_ptr)
1438 {
1439     HspPtr hsp, head_hsp = NULL, last_hsp = NULL;
1440     ScorePtr score;
1441     DenseDiagPtr ddp;
1442     DenseSegPtr dsp;
1443     StdSegPtr ssp;
1444     Int4 hspcnt = 0, last_seg, i;
1445 
1446     *hspcnt_ptr = 0;
1447 
1448     switch (align->segtype) {
1449     case 1: /*Dense-diag; blastn, blastp ungapped */
1450        ddp = (DenseDiagPtr) align->segs;
1451        while(ddp) {
1452           if((hsp = HspNew()) == NULL)
1453              return head_hsp;
1454           if (!head_hsp)
1455              head_hsp = last_hsp = hsp;
1456           else {
1457              last_hsp->next = hsp;
1458              last_hsp = last_hsp->next;
1459           }
1460 
1461           ++hspcnt;
1462           hsp->align_len = ddp->len;
1463           if (ddp->strands[0] == Seq_strand_minus) {
1464              hsp->query_from = ddp->starts[0] + ddp->len;
1465              hsp->query_to = ddp->starts[0] + 1;
1466           } else {
1467              hsp->query_from = ddp->starts[0] + 1;
1468              hsp->query_to = ddp->starts[0] + ddp->len;
1469           }
1470           if (ddp->strands[1] == Seq_strand_minus) {
1471              hsp->hit_from = ddp->starts[1] + ddp->len;
1472              hsp->hit_to = ddp->starts[1] + 1;
1473           } else {
1474              hsp->hit_from = ddp->starts[1];
1475              hsp->hit_to = ddp->starts[1] + ddp->len - 1;
1476           }
1477           FillHspScoreInfo(hsp, ddp->scores);
1478           ddp = ddp->next;
1479        }
1480        break;
1481     case 2: /* Dense-seg; blastn, blastp gapped */
1482        dsp = (DenseSegPtr) align->segs;
1483        if((head_hsp = hsp = HspNew()) == NULL)
1484           return head_hsp;
1485        if (dsp->scores)
1486           score = dsp->scores;
1487        else
1488           score = align->score;
1489        hspcnt = 1;
1490        last_seg = dsp->numseg - 1;
1491 
1492        if (dsp->strands[0] == Seq_strand_minus) {
1493           hsp->query_from = dsp->starts[0] + dsp->lens[0];
1494           hsp->query_to = dsp->starts[2*last_seg] + 1;
1495        } else {
1496           hsp->query_from = dsp->starts[0] + 1;
1497           hsp->query_to = dsp->starts[2*last_seg] + dsp->lens[last_seg];
1498        }
1499        if (dsp->strands[1] == Seq_strand_minus) {
1500           hsp->hit_from = dsp->starts[1] + dsp->lens[1];
1501           hsp->hit_to = dsp->starts[2*last_seg+1] + 1;
1502        } else {
1503           hsp->hit_from = dsp->starts[1] + 1;
1504           hsp->hit_to = dsp->starts[2*last_seg+1] + dsp->lens[last_seg];
1505        }
1506 
1507        for (i=0; i<dsp->numseg; i++) {
1508           hsp->align_len += dsp->lens[i];
1509           if (dsp->starts[2*i] == -1 || dsp->starts[2*i+1] == -1) {
1510              hsp->gaps++;
1511           }
1512        }
1513        FillHspScoreInfo(hsp, score);
1514        break;
1515     case 3: /* Std-seg; translated gapped or ungapped */
1516        ssp = (StdSegPtr) align->segs;
1517 
1518        if (ungapped) {
1519           while(ssp) {
1520              if((hsp = HspNew()) == NULL)
1521                 return head_hsp;
1522              if (!head_hsp)
1523                 head_hsp = last_hsp = hsp;
1524              else {
1525                 last_hsp->next = hsp;
1526                 last_hsp = last_hsp->next;
1527              }
1528 
1529              ++hspcnt;
1530              if (ssp->scores)
1531                 score = ssp->scores;
1532              else
1533                 score = align->score;
1534 
1535              FillHspScoreInfo(hsp, score);
1536 
1537              if (SeqLocStrand(ssp->loc) == Seq_strand_minus) {
1538                 hsp->query_to = SeqLocStart(ssp->loc) + 1;
1539                 hsp->query_from = SeqLocStop(ssp->loc);
1540              } else {
1541                 hsp->query_from = SeqLocStart(ssp->loc) + 1;
1542                 hsp->query_to = SeqLocStop(ssp->loc);
1543              }
1544              if (SeqLocStrand(ssp->loc->next) == Seq_strand_minus) {
1545                 hsp->hit_to = SeqLocStart(ssp->loc->next) + 1;
1546                 hsp->hit_from = SeqLocStop(ssp->loc->next);
1547              } else {
1548                 hsp->hit_from = SeqLocStart(ssp->loc->next) + 1;
1549                 hsp->hit_to = SeqLocStop(ssp->loc->next);
1550              }
1551              if (SeqLocStrand(ssp->loc) == Seq_strand_unknown) {
1552                 /* Protein location */
1553                 hsp->align_len = SeqLocLen(ssp->loc);
1554              } else { /* Nucleotide location; need to divide length by 3
1555                         to get protein length */
1556                 hsp->align_len = SeqLocLen(ssp->loc) / 3;
1557              }
1558              ssp = ssp->next;
1559           }
1560        } else {
1561           SeqLocPtr slp;
1562           StdSegPtr last_ssp;
1563 
1564           if((head_hsp = hsp = HspNew()) == NULL)
1565              return head_hsp;
1566 
1567           hspcnt = 1;
1568           FillHspScoreInfo(hsp, align->score);
1569           /* Get relevant endpoints from the first StdSeg in the list */
1570           if (SeqLocStrand(ssp->loc) == Seq_strand_minus) {
1571              hsp->query_from = SeqLocStop(ssp->loc);
1572           } else {
1573              hsp->query_from = SeqLocStart(ssp->loc) + 1;
1574           }
1575           if (SeqLocStrand(ssp->loc->next) == Seq_strand_minus) {
1576              hsp->hit_from = SeqLocStop(ssp->loc->next);
1577           } else {
1578              hsp->hit_from = SeqLocStart(ssp->loc->next) + 1;
1579           }
1580           /* Advance StdSeg to the end of the list, and calculate the
1581              alignment length in the process */
1582           for ( ; ssp; ssp = ssp->next) {
1583              slp = ((ssp->loc->choice == SEQLOC_EMPTY) ? ssp->loc->next :
1584                     ssp->loc);
1585              if (SeqLocStrand(slp) == Seq_strand_unknown) {
1586                 hsp->align_len += SeqLocLen(slp);
1587              } else {
1588                 hsp->align_len += (SeqLocLen(slp) / 3);
1589              }
1590              last_ssp = ssp;
1591           }
1592           /* Get relevant endpoints from the last StdSeg in the list */
1593           if (SeqLocStrand(last_ssp->loc) == Seq_strand_minus) {
1594              hsp->query_to = SeqLocStart(last_ssp->loc) + 1;
1595           } else {
1596              hsp->query_to = SeqLocStop(last_ssp->loc);
1597           }
1598           if (SeqLocStrand(last_ssp->loc->next) == Seq_strand_minus) {
1599              hsp->hit_to = SeqLocStart(last_ssp->loc->next) + 1;
1600           } else {
1601              hsp->hit_to = SeqLocStop(last_ssp->loc->next);
1602           }
1603        }
1604        break;
1605     default: break;
1606     }
1607 
1608     *hspcnt_ptr = hspcnt;
1609     return head_hsp;
1610 }
1611 
SeqAlignToHits(SeqAlignPtr seqalign,Boolean ungapped)1612 HitPtr SeqAlignToHits(SeqAlignPtr seqalign, Boolean ungapped)
1613 {
1614    HitPtr hitp, hitp_head;
1615    SeqAlignPtr sap, sap2;
1616    SeqIdPtr subject_id, sip;
1617    Char buffer[526];
1618    HspPtr hspp;
1619    Int4 hsp_count;
1620 
1621    hitp_head = NULL;
1622 
1623    for(sap = seqalign; sap != NULL;) {
1624       subject_id = TxGetSubjectIdFromSeqAlign(sap);
1625 
1626       if(hitp_head == NULL) { /* first element */
1627          hitp_head = hitp = HitNew();
1628       } else {
1629          hitp->next = HitNew();
1630          hitp = hitp->next;
1631       }
1632 
1633       SeqIdWrite(subject_id, buffer, PRINTID_FASTA_LONG, sizeof(buffer));
1634       hitp->id = StringSave(buffer);
1635       SeqIdWrite(SeqIdFindBestAccession(subject_id), buffer,
1636                  PRINTID_TEXTID_ACCESSION, sizeof(buffer));
1637 
1638       hitp->accession = StringSave(buffer);
1639 
1640       for(sap2 = sap; sap2 != NULL; ) {
1641          /* Filling info about specific alignments */
1642          if (!hitp->hsps) {
1643             hspp = hitp->hsps =
1644                GetHspFromSeqAlign(sap2, ungapped, &hsp_count);
1645          } else {
1646             hspp->next = GetHspFromSeqAlign(sap2, ungapped, &hsp_count);
1647             hspp = hspp->next;
1648          }
1649 
1650          hitp->num += hsp_count;
1651 
1652          sap2 = sap2->next;
1653          sip = TxGetSubjectIdFromSeqAlign(sap2);
1654 
1655          if(SeqIdMatch(subject_id, sip)) {
1656             continue;
1657          } else {
1658             sap = sap2;
1659             break;
1660          }
1661       }
1662    }
1663 
1664    return hitp_head;
1665 }
1666