1 static char const rcsid[] = "$Id: xmlblast.c,v 6.41 2007/05/07 13:30:54 kans Exp $";
2
3 /* $Id: xmlblast.c,v 6.41 2007/05/07 13:30:54 kans Exp $ */
4 /**************************************************************************
5 * *
6 * COPYRIGHT NOTICE *
7 * *
8 * This software/database is categorized as "United States Government *
9 * Work" under the terms of the United States Copyright Act. It was *
10 * produced as part of the author's official duties as a Government *
11 * employee and thus can not be copyrighted. This software/database is *
12 * freely available to the public for use without a copyright notice. *
13 * Restrictions can not be placed on its present or future use. *
14 * *
15 * Although all reasonable efforts have been taken to ensure the accuracy *
16 * and reliability of the software and data, the National Library of *
17 * Medicine (NLM) and the U.S. Government do not and can not warrant the *
18 * performance or results that may be obtained by using this software, *
19 * data, or derivative works thereof. The NLM and the U.S. Government *
20 * disclaim any and all warranties, expressed or implied, as to the *
21 * performance, merchantability or fitness for any particular purpose or *
22 * use. *
23 * *
24 * In any work or product derived from this material, proper attribution *
25 * of the author(s) as the source of the software or data would be *
26 * appreciated. *
27 * *
28 **************************************************************************
29 * File Name: xmlblast.c
30 *
31 * Author: Sergei B. Shavirin
32 *
33 * Version Creation Date: 05/17/2000
34 *
35 * $Revision: 6.41 $
36 *
37 * File Description: Functions to print simplified BLAST output (XML)
38 *
39 *
40 * $Log: xmlblast.c,v $
41 * Revision 6.41 2007/05/07 13:30:54 kans
42 * added casts for Seq-data.gap (SeqDataPtr, SeqGapPtr, ByteStorePtr)
43 *
44 * Revision 6.40 2007/03/23 14:38:31 madden
45 * Fix RT 15264581 (ungapped use of std-seg) and add needed BioseqUnlock
46 *
47 * Revision 6.39 2005/07/28 14:57:10 coulouri
48 * remove dead code
49 *
50 * Revision 6.38 2005/07/05 14:55:07 madden
51 * Add call to AsnSetXMLmodulePrefix
52 *
53 * Revision 6.37 2005/05/16 18:16:40 dondosha
54 * Removed calls to ObjMgrFreeCache - it should be called on a higher level
55 *
56 * Revision 6.36 2004/10/20 19:58:58 dondosha
57 * Replace sequence data in Bioseq, as done in txalign.c, to assure proper masking and identities/positives calculation for all programs
58 *
59 * Revision 6.35 2004/06/30 12:32:20 madden
60 * Added include for blfmtutl.h, removed unused variable
61 *
62 * Revision 6.34 2004/06/23 21:08:58 dondosha
63 * Fixed masking of filtered locations for translated queries
64 *
65 * Revision 6.33 2004/04/29 19:55:35 dondosha
66 * Mask filtered locations in query sequence lines
67 *
68 * Revision 6.32 2004/03/31 17:58:23 dondosha
69 * Added PSIXmlReset function to allow keeping the AsnIoPtr between outputs for multiple queries in blastpgp
70 *
71 * Revision 6.31 2003/08/04 16:19:16 dondosha
72 * Added effective HSP length (length adjustment) to other returns, so it can be reported in XML output
73 *
74 * Revision 6.30 2003/05/30 17:25:38 coulouri
75 * add rcsid
76 *
77 * Revision 6.29 2003/03/21 21:01:16 camacho
78 * Fixed inversion of subject and query sequences for tblastx
79 *
80 * Revision 6.28 2003/02/19 15:42:32 madden
81 * Check glb_matrix before freeing
82 *
83 * Revision 6.27 2003/01/28 16:57:11 dondosha
84 * For single query, call the old BXMLPrintOutput function from BXMLPrintMultiQueryOutput
85 *
86 * Revision 6.26 2003/01/23 20:02:53 dondosha
87 * Distinguish between blastn and megablast programs in multi-query XML output
88 *
89 * Revision 6.25 2003/01/23 19:55:20 dondosha
90 * Added the closing part for multi-query XML output.
91 *
92 * Revision 6.24 2003/01/06 23:01:40 dondosha
93 * Added function to create a multi-query XML output for web megablast
94 *
95 * Revision 6.23 2002/11/14 15:37:18 dondosha
96 * Added functions to extract all hit information from seqalign that can be extracted without loading sequences
97 *
98 * Revision 6.22 2002/07/17 22:28:13 dondosha
99 * Added support for megablast XML output
100 *
101 * Revision 6.21 2002/04/23 20:48:24 madden
102 * Fix hsp_count for ungapped case
103 *
104 * Revision 6.20 2002/03/08 21:34:57 madden
105 * If no title for Bioseq use "No definition line found"
106 *
107 * Revision 6.19 2002/01/15 21:56:38 madden
108 * Fixes from Jed Wing (Turbogenomics) for ungapped runs
109 *
110 * Revision 6.18 2001/05/01 20:54:32 madden
111 * Set glb_matrix in BXMLSeqAlignToHits if not already set
112 *
113 * Revision 6.17 2001/02/28 21:37:25 shavirin
114 * Fixed XML printing in case when definition line is missing.
115 *
116 * Revision 6.16 2001/01/31 18:43:49 dondosha
117 * Test whether subject Bioseq is found before trying to show the hit
118 *
119 * Revision 6.15 2001/01/19 20:02:17 dondosha
120 * Set the query and hit frames depending on strand for blastn output
121 *
122 * Revision 6.14 2000/11/28 20:51:57 shavirin
123 * Adopted for usage with mani-iterational XML definition.
124 *
125 * Revision 6.13 2000/11/22 21:55:49 shavirin
126 * Added function BXMLPrintOutputEx() with new parameter iteration_number
127 * for usage with PSI-Blast.
128 *
129 * Revision 6.12 2000/11/17 21:49:33 shavirin
130 * Changed order from/to for negative strand AFTER retrieval of the sequence.
131 *
132 * Revision 6.11 2000/11/08 21:39:25 shavirin
133 * Changed order from/to depending on strand and frame.
134 *
135 * Revision 6.10 2000/11/08 20:49:05 shavirin
136 * Added paramter "score" as in Traditional Blast Output. Previous score
137 * changed to "bit_score".
138 *
139 * Revision 6.9 2000/11/08 20:09:32 shavirin
140 * Added new parameter align_len analogos to the number reported in
141 * the Traditional Blast Output.
142 *
143 * Revision 6.8 2000/11/07 21:51:22 shavirin
144 * Added check if query sequence is available for retrieval.
145 *
146 * Revision 6.7 2000/10/23 22:12:46 shavirin
147 * Added possibility to create XML with error message in case of failure or
148 * no hits.
149 *
150 * Revision 6.6 2000/10/23 19:56:06 dondosha
151 * AsnIo should be opened and closed outside function BXMLPrintOutput
152 *
153 * Revision 6.5 2000/10/12 21:35:31 shavirin
154 * Added support for OOF alignment.
155 *
156 * Revision 6.4 2000/08/22 14:53:00 shavirin
157 * Changed variable from enthropy to entropy.
158 *
159 * Revision 6.3 2000/08/11 16:54:17 kans
160 * return FALSE instead of NULL for Mac compiler
161 *
162 * Revision 6.2 2000/08/10 14:42:33 shavirin
163 * Added missing comment.
164 *
165 * Revision 6.1 2000/08/09 20:40:04 shavirin
166 * Initial revision.
167 * *
168 *
169 */
170
171 #include <xmlblast.h>
172 #include <blfmtutl.h>
173
174 static Int4Ptr PNTR glb_matrix;
175
BXMLGetSeqLineForDenseDiag(DenseDiagPtr ddp,HspPtr hsp,Int4 length,Boolean is_aa,Int4Ptr PNTR matrix)176 Boolean BXMLGetSeqLineForDenseDiag(DenseDiagPtr ddp, HspPtr hsp, Int4 length,
177 Boolean is_aa, Int4Ptr PNTR matrix)
178 {
179 SeqIdPtr m_id, t_id;
180 SeqInt si;
181 SeqLoc sl;
182 Int4 i;
183 Uint1 m_res, t_res;
184 SeqPortPtr m_spp, t_spp;
185
186 if(ddp == NULL || hsp == NULL)
187 return FALSE;
188
189 hsp->qseq = MemNew(length+1);
190 hsp->hseq = MemNew(length+1);
191 hsp->midline = MemNew(length+1);
192
193 sl.choice = SEQLOC_INT;
194 sl.data.ptrvalue = &si;
195
196 /* !! Here we assume, that this is 2-dimensional DenseDiag where
197 first element is query and second element is database sequence
198 This is the case for ALL non-gapped blastp and blastn SeqAligns */
199
200 /* SeqLoc for query sequence */
201
202 m_id = SeqIdDup(ddp->id);
203 si.id = m_id;
204 si.from = hsp->query_from;
205 si.to = hsp->query_to;
206 si.strand = (ddp->strands == NULL) ? 0 : ddp->strands[0];
207
208 m_spp = SeqPortNewByLoc(&sl, is_aa ? Seq_code_ncbieaa : Seq_code_iupacna);
209
210 /* SeqLoc for the subject */
211
212 t_id = SeqIdDup(ddp->id->next);
213 si.id = t_id;
214 si.from = hsp->hit_from;
215 si.to = hsp->hit_to;
216 si.strand = (ddp->strands == NULL) ? 0 : ddp->strands[1];
217
218 t_spp = SeqPortNewByLoc(&sl, is_aa ? Seq_code_ncbieaa : Seq_code_iupacna);
219
220 if(m_spp == NULL || t_spp == NULL) {
221 if(m_spp == NULL)
222 SeqPortFree(m_spp);
223 if(t_spp != NULL)
224 SeqPortFree(t_spp);
225 return FALSE;
226 }
227
228 for(i = 0; i < length; ++i) {
229 m_res = SeqPortGetResidue(m_spp);
230 t_res = SeqPortGetResidue(t_spp);
231
232 hsp->qseq[i] = m_res;
233 hsp->hseq[i] = t_res;
234
235 if(m_res == t_res) {
236 hsp->midline[i] = is_aa ? m_res : '|';
237 } else if(matrix != NULL && is_aa) {
238 if (IS_residue(m_res) && IS_residue(t_res) &&
239 matrix[m_res][t_res] >0) {
240 hsp->midline[i] = '+';
241 } else {
242 hsp->midline[i] = ' ';
243 }
244 } else {
245 hsp->midline[i] = ' ';
246 }
247 }
248
249 SeqIdFree(t_id);
250 SeqIdFree(m_id);
251 SeqPortFree(m_spp);
252 SeqPortFree(t_spp);
253
254 return TRUE;
255 }
BXMLGetSeqLineForDenseSeg(DenseSegPtr dsp,HspPtr hsp,Int4 length,Boolean is_aa,Int4Ptr PNTR matrix)256 Boolean BXMLGetSeqLineForDenseSeg(DenseSegPtr dsp, HspPtr hsp, Int4 length,
257 Boolean is_aa, Int4Ptr PNTR matrix)
258 {
259
260 SeqInt msi, tsi;
261 SeqLoc sl;
262 Int2 i;
263 Int2 dim;
264 Int2 m_order, t_order; /*order of the master and the target sequence*/
265 Int4 index, abs_index;
266 Int4 j, val, t_val;
267 Uint1 m_res, t_res;
268 SeqPortPtr m_spp, t_spp;
269
270 if(dsp == NULL || hsp == NULL)
271 return FALSE;
272
273 hsp->qseq = MemNew(length+1);
274 hsp->hseq = MemNew(length+1);
275 hsp->midline = MemNew(length+1);
276
277 m_order = 0;
278 t_order = 1;
279 dim = 2;
280
281 msi.id = SeqIdDup(dsp->ids);
282 msi.from = hsp->query_from;
283 msi.to = hsp->query_to;
284 msi.strand = (dsp->strands == NULL) ? 0 : dsp->strands[m_order];
285
286 tsi.id = dsp->ids->next;
287 tsi.from = hsp->hit_from;
288 tsi.to = hsp->hit_to;
289 tsi.strand = (dsp->strands == NULL) ? 0 : dsp->strands[t_order];
290
291 sl.choice = SEQLOC_INT;
292 sl.data.ptrvalue = &msi;
293 m_spp = SeqPortNewByLoc(&sl,
294 (is_aa) ? Seq_code_ncbieaa : Seq_code_iupacna);
295
296 sl.choice = SEQLOC_INT;
297 sl.data.ptrvalue = &tsi;
298 t_spp = SeqPortNewByLoc(&sl,
299 (is_aa) ? Seq_code_ncbieaa : Seq_code_iupacna);
300
301 abs_index = 0;
302 for(i = 0; i<dsp->numseg; ++i) {
303 val = dsp->starts[i*dim + m_order];
304 t_val = dsp->starts[i*dim + t_order];
305 if(val == -1 || t_val == -1) {
306
307 if(val == -1 && t_val == -1) /* never should happend */
308 continue;
309
310 if(val != -1) {
311 index = dsp->lens[i];
312 while (index > 0) {
313 index--;
314 m_res = SeqPortGetResidue(m_spp);
315 hsp->qseq[abs_index] = m_res;
316 hsp->hseq[abs_index] = '-';
317 hsp->midline[abs_index] = ' ';
318 abs_index++;
319 }
320 }
321 if(t_val != -1) {
322 index = dsp->lens[i];
323 while (index > 0) {
324 index--;
325 t_res = SeqPortGetResidue(t_spp);
326 hsp->qseq[abs_index] = '-';
327 hsp->hseq[abs_index] = t_res;
328 hsp->midline[abs_index] = ' ';
329 abs_index++;
330 }
331 }
332 } else {
333 for(j = 0; j<dsp->lens[i]; ++j) {
334 m_res = SeqPortGetResidue(m_spp);
335 t_res = SeqPortGetResidue(t_spp);
336
337 hsp->qseq[abs_index] = m_res;
338 hsp->hseq[abs_index] = t_res;
339
340 if(m_res == t_res) {
341 if(is_aa)
342 hsp->midline[abs_index] = m_res;
343 else
344 hsp->midline[abs_index] = '|';
345
346 } else if(matrix != NULL && is_aa &&
347 IS_residue(m_res) && IS_residue(t_res)) {
348
349 if(matrix[m_res][t_res] >0)
350 hsp->midline[abs_index] = '+';
351 else
352 hsp->midline[abs_index] = ' ';
353
354 } else {
355 hsp->midline[abs_index] = ' ';
356 }
357
358 abs_index++;
359 }
360 }
361 }
362
363 SeqIdFree(msi.id);
364 SeqPortFree(m_spp);
365 SeqPortFree(t_spp);
366 return TRUE;
367 }
BXMLGetSeqLineForStdSeg(StdSegPtr ssp,HspPtr hsp,Int4 length,Int4Ptr PNTR matrix)368 Boolean BXMLGetSeqLineForStdSeg(StdSegPtr ssp, HspPtr hsp, Int4 length,
369 Int4Ptr PNTR matrix)
370 {
371 Boolean master_is_translated=FALSE, both_translated=FALSE;
372 Boolean target_is_translated = FALSE;
373 CharPtr genetic_code1, genetic_code2;
374 SeqPortPtr spp1, spp2;
375 Uint1 codon[4], residue1, residue2;
376 Int4 abs_index;
377 Boolean ungapped_align = FALSE;
378
379 if(ssp == NULL || hsp == NULL)
380 return FALSE;
381
382 hsp->qseq = MemNew(length+1);
383 hsp->hseq = MemNew(length+1);
384 hsp->midline = MemNew(length+1);
385
386 /* Check for valid sequence. */
387 if (SeqLocLen(ssp->loc) == 3*SeqLocLen(ssp->loc->next))
388 master_is_translated = TRUE;
389 else if (3*SeqLocLen(ssp->loc) == SeqLocLen(ssp->loc->next))
390 target_is_translated = TRUE;
391 else if (SeqLocLen(ssp->loc) == SeqLocLen(ssp->loc->next))
392 both_translated = TRUE;
393 else
394 return FALSE;
395
396 if (master_is_translated) {
397 genetic_code1 = GetGeneticCodeFromSeqId(ssp->ids);
398 } else if (both_translated) {
399 genetic_code1 = GetGeneticCodeFromSeqId(ssp->ids);
400 genetic_code2 = GetGeneticCodeFromSeqId(ssp->ids->next);
401 } else {
402 genetic_code1 = GetGeneticCodeFromSeqId(ssp->ids->next);
403 }
404
405
406 for(abs_index = 0; ssp != 0 && abs_index < length; ssp = ssp->next) {
407
408 if (ssp->loc->choice != SEQLOC_EMPTY || ssp->loc->next->choice != SEQLOC_EMPTY) {
409 /* Non - gap element */
410 if (ssp->loc->choice != SEQLOC_EMPTY && ssp->loc->next->choice != SEQLOC_EMPTY) {
411 if (both_translated) { /* TBLASTX */
412 spp1 = SeqPortNewByLoc(ssp->loc, Seq_code_ncbi4na);
413 spp2 = SeqPortNewByLoc(ssp->loc->next, Seq_code_ncbi4na);
414 while ((codon[0]=SeqPortGetResidue(spp2)) != SEQPORT_EOF) {
415 codon[1] = SeqPortGetResidue(spp2);
416 codon[2] = SeqPortGetResidue(spp2);
417 residue1 = AAForCodon(codon, genetic_code1);
418 codon[0] = SeqPortGetResidue(spp1);
419 codon[1] = SeqPortGetResidue(spp1);
420 codon[2] = SeqPortGetResidue(spp1);
421 residue2 = AAForCodon(codon, genetic_code2);
422
423 hsp->qseq[abs_index] = residue2;
424 hsp->hseq[abs_index] = residue1;
425
426 if (residue1 == residue2)
427 hsp->midline[abs_index] = residue1;
428 else if (matrix != NULL && matrix[residue1][residue2] >0)
429 hsp->midline[abs_index] = '+';
430 else
431 hsp->midline[abs_index] = ' ';
432
433 abs_index++;
434
435 }
436 } else { /* TBLASTN or BLASTX */
437 if (master_is_translated) { /* BLASTX */
438 spp1 = SeqPortNewByLoc(ssp->loc, Seq_code_ncbi4na);
439 spp2 = SeqPortNewByLoc(ssp->loc->next, Seq_code_ncbieaa);
440 } else { /* TBLASTN */
441 spp2 = SeqPortNewByLoc(ssp->loc, Seq_code_ncbieaa);
442 spp1 = SeqPortNewByLoc(ssp->loc->next, Seq_code_ncbi4na);
443 }
444
445 while ((residue1=SeqPortGetResidue(spp2)) != SEQPORT_EOF) {
446 codon[0] = SeqPortGetResidue(spp1);
447 codon[1] = SeqPortGetResidue(spp1);
448 codon[2] = SeqPortGetResidue(spp1);
449 residue2 = AAForCodon(codon, genetic_code1);
450
451 if (master_is_translated) { /* BLASTX */
452 hsp->qseq[abs_index] = residue2;
453 hsp->hseq[abs_index] = residue1;
454 } else { /* TBLASTN */
455 hsp->qseq[abs_index] = residue1;
456 hsp->hseq[abs_index] = residue2;
457 }
458
459 if (residue1 == residue2)
460 hsp->midline[abs_index] = residue1;
461 else if (matrix != NULL && matrix[residue1][residue2] >0)
462 hsp->midline[abs_index] = '+';
463 else
464 hsp->midline[abs_index] = ' ';
465
466 abs_index++;
467 }
468 }
469 SeqPortFree(spp1);
470 SeqPortFree(spp2);
471 /* Check if this is an ungapped alignment;
472 in this case do not go to next link */
473 if (ssp->next &&
474 ssp->next->loc->choice != SEQLOC_EMPTY &&
475 ssp->next->loc->next->choice != SEQLOC_EMPTY)
476 ungapped_align = TRUE;
477
478 } else if (ssp->loc->choice == SEQLOC_EMPTY) { /* gap in query */
479
480 if (target_is_translated) { /* TBLASTN */
481 spp1 = SeqPortNewByLoc(ssp->loc->next, Seq_code_ncbi4na);
482
483 while(TRUE) {
484 if((codon[0] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
485 break;
486 if((codon[1] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
487 break;
488 if((codon[2] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
489 break;
490
491 residue1 = AAForCodon(codon, master_is_translated ? genetic_code2 : genetic_code1);
492
493 hsp->qseq[abs_index] = '-'; /* gap character */
494 hsp->hseq[abs_index] = residue1;
495 hsp->midline[abs_index] = ' ';
496 abs_index++;
497 }
498 SeqPortFree(spp1);
499 } else { /* BLASTX */
500 spp1 = SeqPortNewByLoc(ssp->loc->next, Seq_code_ncbieaa);
501
502 while ((residue1=SeqPortGetResidue(spp1)) != SEQPORT_EOF) {
503 hsp->qseq[abs_index] = '-'; /* gap character */
504 hsp->hseq[abs_index] = residue1;
505 hsp->midline[abs_index] = ' ';
506 abs_index++;
507 }
508
509 SeqPortFree(spp1);
510 }
511 } else if (ssp->loc->next->choice == SEQLOC_EMPTY) {
512 if (master_is_translated) { /* BLASTX */
513 spp1 = SeqPortNewByLoc(ssp->loc, Seq_code_ncbi4na);
514
515 while(TRUE) {
516 if((codon[0] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
517 break;
518 if((codon[1] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
519 break;
520 if((codon[2] = SeqPortGetResidue(spp1)) == SEQPORT_EOF)
521 break;
522
523 residue1 = AAForCodon(codon, genetic_code1);
524
525 hsp->qseq[abs_index] = residue1;
526 hsp->hseq[abs_index] = '-'; /* gap character */
527 hsp->midline[abs_index] = ' ';
528 abs_index++;
529 }
530 SeqPortFree(spp1);
531 } else { /* TBLASTN */
532
533 spp1 = SeqPortNewByLoc(ssp->loc, Seq_code_ncbieaa);
534
535 while ((residue1=SeqPortGetResidue(spp1)) != SEQPORT_EOF) {
536 hsp->qseq[abs_index] = residue1;
537 hsp->hseq[abs_index] = '-'; /* gap character */
538 hsp->midline[abs_index] = ' ';
539 abs_index++;
540 }
541
542 SeqPortFree(spp1);
543 }
544 }
545
546 if (both_translated || ungapped_align)
547 /* for tblastx perform only one StdSegPtr. so far...*/
548 break;
549 } else {
550 continue; /* Both EMPTY - never should happened */
551 }
552 }
553
554 return TRUE;
555 }
556
BXMLGetSeqLines(SeqAlignPtr align,HspPtr hsp,Int4 length,Boolean is_aa,Int4 chain,Int4Ptr PNTR matrix)557 Boolean BXMLGetSeqLines(SeqAlignPtr align, HspPtr hsp, Int4 length,
558 Boolean is_aa, Int4 chain, Int4Ptr PNTR matrix)
559 {
560 DenseDiagPtr ddp;
561 DenseSegPtr dsp;
562 StdSegPtr ssp;
563 Uint2 order = 0;
564 SeqAlignPtr sap;
565
566 if(align == NULL)
567 return FALSE;
568
569 switch (align->segtype) {
570 case 1: /*Dense-diag*/
571 ddp = (DenseDiagPtr) align->segs;
572
573 while(ddp) {
574 ++order;
575 if(order == chain) {
576 BXMLGetSeqLineForDenseDiag(ddp, hsp, length, is_aa, matrix);
577 break;
578 }
579 ddp = ddp->next;
580 }
581
582 break;
583 case 2:
584 dsp = (DenseSegPtr) align->segs;
585 BXMLGetSeqLineForDenseSeg(dsp, hsp, length, is_aa, matrix);
586 break;
587 case 3:
588 ssp = (StdSegPtr) align->segs;
589 while(ssp) {
590 ++order;
591 if(order == chain) {
592 BXMLGetSeqLineForStdSeg(ssp, hsp, length, matrix);
593 break;
594 }
595 ssp = ssp->next;
596 }
597 break;
598 case 5: /* Discontinuous alignment */
599 sap = (SeqAlignPtr) align->segs;
600 return BXMLGetSeqLines(sap, hsp, length, is_aa, chain, matrix);
601 default:
602 break;
603 }
604
605 return TRUE;
606 }
607
608 /** Frees the part of the byte store list that has not been used for replacement
609 * of Bioseq data.
610 */
611 static ValNodePtr
ByteStoreListFree(ValNodePtr bs_list)612 ByteStoreListFree(ValNodePtr bs_list)
613 {
614 ByteStorePtr bsp;
615 ValNodePtr vnp;
616
617 for(vnp = bs_list; vnp; vnp = vnp->next) {
618 bsp = (ByteStorePtr) vnp->data.ptrvalue;
619 if(bsp != NULL)
620 BSFree(bsp);
621 }
622 return ValNodeFree(bs_list);
623 }
624
BXMLGetHspFromSeqAlign(SeqAlignPtr sap,Boolean is_aa,Int4 chain,Boolean is_ooframe,ValNodePtr mask_loc)625 HspPtr BXMLGetHspFromSeqAlign(SeqAlignPtr sap, Boolean is_aa, Int4 chain,
626 Boolean is_ooframe, ValNodePtr mask_loc)
627 {
628 HspPtr hsp;
629 AlignSum as;
630 ScorePtr score, sp;
631 Boolean matrix_allocated = FALSE;
632 BioseqPtr bsp = NULL;
633 Char tmp[256];
634 ByteStorePtr seq_data=NULL;
635 Boolean seq_data_replaced = FALSE;
636 ValNodePtr bs_list = NULL;
637 Uint1 code=0;
638 Uint1 repr=0;
639
640 if((hsp = HspNew()) == NULL)
641 return NULL;
642
643 MemSet(&as, NULLB, sizeof(AlignSum));
644
645 as.master_sip = TxGetQueryIdFromSeqAlign(sap);
646 as.target_sip = TxGetSubjectIdFromSeqAlign(sap);
647
648 /* If there is a mask, retrieve the query Bioseq and replace the sequence
649 data with a masked one. */
650 if (mask_loc) {
651 Int4 frame;
652 if ((bsp = BioseqLockById(as.master_sip)) == NULL) {
653 SeqIdWrite(as.master_sip, tmp, PRINTID_FASTA_LONG, sizeof(tmp));
654 ErrPostEx(SEV_ERROR, 0, __LINE__, "Query sequence '%s' is not "
655 "available in Bioseq index.", tmp);
656 return NULL;
657 }
658
659 bs_list = CreateMaskByteStore (mask_loc);
660 if (ISA_na(bsp->mol) && sap->segtype == SAS_STD) {
661 StdSegPtr ssp = (StdSegPtr) sap->segs;
662 int ssp_index = 1;
663 while (ssp_index < chain)
664 {
665 ssp = ssp->next;
666 ssp_index++;
667 }
668 if (ssp == NULL)
669 { /* No more to process. */
670 BioseqUnlock(bsp);
671 return NULL;
672 }
673 frame = SeqLocStart(ssp->loc);
674 if (SeqLocStrand(ssp->loc) == Seq_strand_minus) {
675 frame += SeqLocLen(ssp->loc);
676 frame = 4 + (bsp->length - frame)%3;
677 } else {
678 frame = (1 + frame%3);
679 }
680 } else
681 frame = 0;
682 repr = bsp->repr;
683 seq_data = (ByteStorePtr) bsp->seq_data;
684 code = bsp->seq_data_type;
685 seq_data_replaced =
686 replace_bytestore_data(bsp, bs_list, (Uint1)frame);
687 if (!seq_data_replaced) {
688 bsp->repr = repr;
689 bsp->seq_data = (SeqDataPtr) seq_data;
690 bsp->seq_data_type = code;
691 }
692 } else {
693 /* Checkup for query Bioseq to be available in the index */
694 if ((bsp = BioseqLockById(as.master_sip)) == NULL) {
695 SeqIdWrite(as.master_sip, tmp, PRINTID_FASTA_LONG, sizeof(tmp));
696 ErrPostEx(SEV_ERROR, 0, __LINE__, "Query sequence '%s' is not "
697 "available in Bioseq index.", tmp);
698 return NULL;
699 }
700 }
701
702 if(glb_matrix != NULL)
703 as.matrix = glb_matrix;
704 else {
705 as.matrix = load_default_matrix ();
706 matrix_allocated = TRUE;
707 }
708 as.is_aa = is_aa;
709 as.ooframe = is_ooframe;
710
711 if(chain == 0)
712 chain = 1;
713
714 if((score = find_score_in_align(sap, chain, &as)) == NULL) {
715 if(matrix_allocated)
716 free_default_matrix(as.matrix);
717 HspFree(hsp);
718 BioseqUnlock(bsp);
719 return NULL;
720 }
721
722 for(sp = score; sp != NULL; sp = sp->next) {
723 if(!(StringICmp(sp->id->str, "e_value")) ||
724 !(StringICmp(sp->id->str, "sum_e")))
725 hsp->evalue = sp->value.realvalue;
726 if(!StringICmp(sp->id->str, "bit_score")) {
727 hsp->bit_score = sp->value.realvalue;
728 }
729 if(!StringICmp(sp->id->str, "score")) {
730 hsp->score = sp->value.intvalue;
731 }
732 }
733
734 if (as.m_frame_set || as.t_frame_set) {
735 hsp->query_frame = as.m_frame;
736 hsp->hit_frame = as.t_frame;
737 } else { /* blastn program! */
738 hsp->query_frame = 1;
739 if (as.m_strand == as.t_strand)
740 hsp->hit_frame = 1;
741 else
742 hsp->hit_frame = -1;
743 }
744 hsp->identity = as.identical;
745 hsp->positive = as.positive + as.identical;
746 hsp->gaps = as.gaps;
747 hsp->align_len = as.totlen;
748
749 hsp->density = 0; /* ???? */
750
751
752 /* For sequence retrieval from should always be less than to */
753 hsp->query_from = as.master_from;
754 hsp->query_to = as.master_to;
755 hsp->hit_from = as.target_from;
756 hsp->hit_to = as.target_to;
757
758 BXMLGetSeqLines(sap, hsp, as.totlen, is_aa, chain, as.matrix);
759
760 /* Restore the original query sequence data, if it was replaced. */
761 if (seq_data_replaced) {
762 bsp->seq_data = (SeqDataPtr) seq_data;
763 bsp->repr = repr;
764 bsp->seq_data_type = code;
765 }
766 bs_list = ByteStoreListFree(bs_list);
767 BioseqUnlock(bsp);
768
769
770 /* For display it depends on strand */
771
772 if(as.m_strand == Seq_strand_minus || as.m_frame < 0) {
773 hsp->query_from = as.master_to;
774 hsp->query_to = as.master_from;
775 }
776
777 if(as.t_strand == Seq_strand_minus || as.t_frame < 0) {
778 hsp->hit_from = as.target_to;
779 hsp->hit_to = as.target_from;
780 }
781
782 /* ... and 1 position larger, that array number (started from 0) */
783
784 hsp->query_from++;
785 hsp->query_to++;
786 hsp->hit_from++;
787 hsp->hit_to++;
788
789 if(matrix_allocated)
790 free_default_matrix(as.matrix);
791
792 return hsp;
793 }
794
BXMLSeqAlignToHits(SeqAlignPtr seqalign,Boolean ungapped,Boolean is_ooframe,ValNodePtr mask_loc)795 HitPtr BXMLSeqAlignToHits(SeqAlignPtr seqalign, Boolean ungapped,
796 Boolean is_ooframe, ValNodePtr mask_loc)
797 {
798 HitPtr hitp, hitp_head;
799 HspPtr hspp;
800 SeqAlignPtr sap, sap2;
801 SeqIdPtr subject_sip, new_sip;
802 BioseqPtr bsp;
803 Char buffer[526];
804 Int4 hit_count, hsp_count, chain;
805 Boolean is_aa;
806
807 /* We assume, that seqalignes in this set ordered by
808 database sequences and by e-values for a given database sequence */
809
810 /* For optimization BLOSUM62 may be loaded ones */
811 if(glb_matrix == NULL)
812 glb_matrix = load_default_matrix ();
813
814 hitp_head = NULL;
815 hit_count = 1; /* Hits starting with 1 */
816
817 for(sap = seqalign; sap != NULL;) {
818 subject_sip = TxGetSubjectIdFromSeqAlign(sap);
819 bsp = BioseqLockById(subject_sip);
820 if (!bsp) { /* Apparently a recently deleted sequence */
821 sap = sap->next;
822 continue;
823 }
824
825 if(hitp_head == NULL) { /* first element */
826 hitp_head = hitp = HitNew();
827 } else {
828 hitp->next = HitNew();
829 hitp = hitp->next;
830 }
831 hitp->num = hit_count;
832 hit_count++;
833
834 is_aa = (bsp->mol == Seq_mol_aa);
835
836 if (BioseqGetTitle(bsp))
837 hitp->def = StringSave(BioseqGetTitle(bsp));
838 else
839 hitp->def = StringSave("No definition line found");
840
841 SeqIdWrite(bsp->id, buffer, PRINTID_FASTA_LONG, sizeof(buffer));
842 hitp->id = StringSave(buffer);
843
844 if(bsp->id->choice == SEQID_GI && bsp->id->next != NULL)
845 SeqIdWrite(bsp->id->next, buffer, PRINTID_TEXTID_ACCESSION, sizeof(buffer));
846 else
847 SeqIdWrite(bsp->id, buffer, PRINTID_TEXTID_ACCESSION, sizeof(buffer));
848
849 hitp->accession = StringSave(buffer);
850
851 hitp->len = bsp->length;
852 hsp_count = 1; /* Hsps starting with 1 */
853 chain = 1;
854
855 BioseqUnlock(bsp);
856
857 for(sap2 = sap; sap2 != NULL;) {
858
859 /* Filling info about specific alignments */
860
861 while(TRUE) {
862 if(hitp->hsps == NULL) {
863 hspp = hitp->hsps =
864 BXMLGetHspFromSeqAlign(sap2, is_aa, chain, is_ooframe,
865 mask_loc);
866 } else {
867 if((hspp->next = BXMLGetHspFromSeqAlign(sap2, is_aa, chain,
868 is_ooframe, mask_loc)) == NULL)
869 break;
870 else
871 hspp = hspp->next;
872 }
873
874 if(hspp == NULL)
875 break;
876
877 hspp->num = hsp_count;
878 hsp_count++;
879 if(!ungapped) { /* Only one chain for gapped */
880 break;
881 }
882 chain++;
883 }
884
885 sap2 = sap2->next;
886 new_sip = TxGetSubjectIdFromSeqAlign(sap2);
887
888 if(SeqIdMatch(subject_sip, new_sip)) {
889 continue;
890 } else {
891 sap = sap2;
892 break;
893 }
894 }
895 }
896
897 return hitp_head;
898 }
899
BXMLBuildOneIteration(SeqAlignPtr seqalign,ValNodePtr other_returns,Boolean is_ooframe,Boolean ungapped,Int4 iter_num,CharPtr message,ValNodePtr mask_loc)900 IterationPtr BXMLBuildOneIteration(SeqAlignPtr seqalign,
901 ValNodePtr other_returns,
902 Boolean is_ooframe, Boolean ungapped,
903 Int4 iter_num, CharPtr message,
904 ValNodePtr mask_loc)
905 {
906 return BXMLBuildOneQueryIteration(seqalign, other_returns, is_ooframe,
907 ungapped, iter_num, message, NULL,
908 mask_loc);
909 }
910
BXMLBuildOneQueryIteration(SeqAlignPtr seqalign,ValNodePtr other_returns,Boolean is_ooframe,Boolean ungapped,Int4 iter_num,CharPtr message,BioseqPtr query,ValNodePtr mask_loc)911 IterationPtr BXMLBuildOneQueryIteration(SeqAlignPtr seqalign,
912 ValNodePtr other_returns,
913 Boolean is_ooframe, Boolean ungapped,
914 Int4 iter_num, CharPtr message,
915 BioseqPtr query, ValNodePtr mask_loc)
916 {
917 IterationPtr iterp;
918 Char buffer[1024];
919
920 iterp = IterationNew();
921 iterp->iter_num = iter_num;
922
923 if (query) {
924 SeqIdWrite(query->id, buffer, PRINTID_FASTA_LONG, sizeof(buffer));
925 iterp->query_ID = StringSave(buffer);
926
927 if((iterp->query_def = StringSave(BioseqGetTitle(query))) == NULL) {
928 iterp->query_def = StringSave("No definition line found");
929 }
930
931 iterp->query_len = query->length;
932
933 }
934
935 if(seqalign != NULL) {
936 iterp->hits =
937 BXMLSeqAlignToHits(seqalign, ungapped, is_ooframe, mask_loc);
938 }
939
940 iterp->stat = BXMLBuildStatistics(other_returns, ungapped);
941
942 iterp->message = StringSave(message);
943
944 return iterp;
945 }
946
947 StatisticsPtr
BXMLBuildStatistics(ValNodePtr other_returns,Boolean ungapped)948 BXMLBuildStatistics(ValNodePtr other_returns, Boolean ungapped)
949 {
950 TxDfDbInfoPtr dbinfo=NULL;
951 BLAST_KarlinBlkPtr ka_params_gap=NULL;
952 BLAST_KarlinBlkPtr ka_params_ungap=NULL;
953 ValNodePtr vnp;
954 StatisticsPtr stat;
955
956 if (!other_returns)
957 return NULL;
958
959 stat = StatisticsNew();
960
961 for (dbinfo = NULL, vnp=other_returns; vnp; vnp = vnp->next) {
962 switch (vnp->choice) {
963 case TXDBINFO:
964 dbinfo = vnp->data.ptrvalue;
965 break;
966 case TXKABLK_GAP:
967 ka_params_gap = vnp->data.ptrvalue;
968 break;
969 case TXKABLK_NOGAP:
970 ka_params_ungap = vnp->data.ptrvalue;
971 break;
972 case EFF_SEARCH_SPACE:
973 stat->eff_space = vnp->data.realvalue;
974 break;
975 case EFF_HSP_LENGTH:
976 stat->hsp_len = vnp->data.intvalue;
977 break;
978 default:
979 break;
980 }
981 }
982
983 if(dbinfo != NULL) {
984 stat->db_num= dbinfo->number_seqs;
985 stat->db_len = dbinfo->total_length;
986 }
987
988 if(ungapped) {
989 if(ka_params_ungap != NULL) {
990 stat->lambda = ka_params_ungap->Lambda;
991 stat->kappa = ka_params_ungap->K;
992 stat->entropy = ka_params_ungap->H;
993 }
994 } else {
995 if(ka_params_gap != NULL) {
996 stat->lambda = ka_params_gap->Lambda;
997 stat->kappa = ka_params_gap->K;
998 stat->entropy = ka_params_gap->H;
999 }
1000 }
1001
1002 return stat;
1003 }
1004
BXMLCreateBlastOutputHead(CharPtr program,CharPtr database,BLAST_OptionsBlkPtr options,BioseqPtr query,Int4 flags)1005 BlastOutputPtr BXMLCreateBlastOutputHead(CharPtr program, CharPtr database,
1006 BLAST_OptionsBlkPtr options,
1007 BioseqPtr query, Int4 flags)
1008 {
1009 BlastOutputPtr boutp;
1010 Char buffer[1024];
1011 SeqPortPtr spp;
1012 Boolean is_aa = FALSE;
1013 Int4 i;
1014
1015 if((boutp = BlastOutputNew()) == NULL)
1016 return FALSE;
1017
1018 /* For optimization BLOSUM62 may be loaded ones */
1019 if(glb_matrix == NULL)
1020 glb_matrix = load_default_matrix ();
1021
1022 if (query) {
1023 SeqIdWrite(query->id, buffer, PRINTID_FASTA_LONG, sizeof(buffer));
1024 boutp->query_ID = StringSave(buffer);
1025
1026 if((boutp->query_def = StringSave(BioseqGetTitle(query))) == NULL) {
1027 boutp->query_def = StringSave("No definition line found");
1028 }
1029
1030 boutp->query_len = query->length;
1031
1032 if(flags & BXML_INCLUDE_QUERY) {
1033 boutp->query_seq = MemNew(query->length+1);
1034 is_aa = (query->mol == Seq_mol_aa);
1035 spp = SeqPortNew(query, 0, -1, Seq_strand_plus,
1036 (is_aa) ? Seq_code_ncbieaa : Seq_code_iupacna);
1037
1038 for (i = 0; i < query->length; i++) {
1039 boutp->query_seq[i] = SeqPortGetResidue(spp);
1040 }
1041 spp = SeqPortFree(spp);
1042 } else {
1043 boutp->query_seq = NULL; /* Do we need sequence here??? */
1044 }
1045 }
1046 /* Program name */
1047 boutp->program = StringSave(program);
1048
1049 /* Database name */
1050 boutp->db = StringSave(database);
1051
1052 /* Version text */
1053 sprintf(buffer, "%s %s [%s]", program, BlastGetVersionNumber(),
1054 BlastGetReleaseDate());
1055 boutp->version = StringSave(buffer);
1056
1057 /* Reference */
1058 boutp->reference = BlastGetReference(FALSE);
1059
1060 /* Filling parameters */
1061
1062 boutp->param = ParametersNew();
1063 boutp->param->expect = options->expect_value;
1064 boutp->param->matrix = StringSave(options->matrix);
1065 boutp->param->sc_match = options->reward;
1066 boutp->param->sc_mismatch = options->penalty;
1067 boutp->param->gap_open = options->gap_open;
1068 boutp->param->gap_extend = options->gap_extend;
1069 boutp->param->include = options->ethresh;
1070
1071 if(options->filter_string != NULL)
1072 boutp->param->filter = StringSave(options->filter_string);
1073
1074 return boutp;
1075 }
1076
1077 /*
1078 This function will create and print out simplified ASN.1/XML of
1079 one-iterational Blast output - like regular Blast etc. This function
1080 should not be used for multi-iterational PSI-Blast.
1081 */
BXMLPrintOutput(AsnIoPtr aip,SeqAlignPtr seqalign,BLAST_OptionsBlkPtr options,CharPtr program,CharPtr database,BioseqPtr query,ValNodePtr other_returns,Int4 flags,CharPtr message,ValNodePtr mask_loc)1082 Boolean BXMLPrintOutput(AsnIoPtr aip, SeqAlignPtr seqalign,
1083 BLAST_OptionsBlkPtr options,CharPtr program,
1084 CharPtr database, BioseqPtr query,
1085 ValNodePtr other_returns, Int4 flags,
1086 CharPtr message, ValNodePtr mask_loc)
1087 {
1088 BlastOutputPtr boutp;
1089 Boolean ungapped = FALSE;
1090
1091 if((boutp = BXMLCreateBlastOutputHead(program, database, options, query,
1092 flags)) == NULL)
1093 return FALSE;
1094
1095 if(options->gapped_calculation == FALSE || !StringICmp(program, "tblastx"))
1096 ungapped = TRUE;
1097
1098 /* Here is one-iterational Blast output */
1099 boutp->iterations = BXMLBuildOneIteration(seqalign, other_returns,
1100 options->is_ooframe, ungapped,
1101 1, message, mask_loc);
1102
1103 if (aip != NULL)
1104 BlastOutputAsnWrite(boutp, aip, NULL);
1105
1106 if (glb_matrix)
1107 free_default_matrix(glb_matrix);
1108 glb_matrix = NULL;
1109
1110 BlastOutputFree(boutp);
1111
1112 return TRUE;
1113 }
1114
BXMLPrintMultiQueryOutput(AsnIoPtr aip,SeqAlignPtr seqalign,BLAST_OptionsBlkPtr options,CharPtr program,CharPtr database,BioseqSetPtr query_set,ValNodePtr other_returns,Int4 flags,CharPtr message,ValNodePtr mask_loc)1115 Boolean BXMLPrintMultiQueryOutput(AsnIoPtr aip, SeqAlignPtr seqalign,
1116 BLAST_OptionsBlkPtr options, CharPtr program, CharPtr database,
1117 BioseqSetPtr query_set, ValNodePtr other_returns, Int4 flags,
1118 CharPtr message, ValNodePtr mask_loc)
1119 {
1120 Boolean ungapped = FALSE;
1121 BioseqPtr query;
1122 SeqEntryPtr sep;
1123 Boolean q_is_na, d_is_na;
1124 MBXmlPtr mbxp = NULL;
1125 IterationPtr iterp;
1126 SeqEntryFunc seqentry_callback;
1127 Int4 index;
1128 Boolean query_found;
1129 SeqIdPtr seqid = NULL;
1130 SeqAlignPtr sap = NULL, next_seqalign = NULL;
1131 ValNodePtr next_mask_loc = NULL, current_mask_loc = NULL;
1132
1133 sep = (SeqEntryPtr) ((BioseqSetPtr)query_set)->seq_set;
1134 BlastGetTypes(program, &q_is_na, &d_is_na);
1135 if (q_is_na)
1136 seqentry_callback = FindNuc;
1137 else
1138 seqentry_callback = FindProt;
1139
1140 /* If no queries, there is nothing to report */
1141 if (!sep)
1142 return FALSE;
1143
1144 /* If only one query, call a one-query output function */
1145 if (!sep->next) {
1146 Boolean return_value;
1147 SeqEntryExplore(sep, &query, seqentry_callback);
1148 return_value = BXMLPrintOutput(aip, seqalign, options, program,
1149 database, query, other_returns, flags,
1150 message, mask_loc);
1151 /* This function is presumed to close the AsnIoPtr inside */
1152 AsnIoClose(aip);
1153 return return_value;
1154 }
1155
1156 next_mask_loc = mask_loc;
1157
1158 index = 0;
1159 while (seqalign) {
1160 /* Find the corresponding query */
1161 query_found = FALSE;
1162 for ( ; sep; ++index, sep = sep->next) {
1163 SeqEntryExplore(sep, &query, seqentry_callback);
1164 seqid = TxGetQueryIdFromSeqAlign(seqalign);
1165 if (SeqIdComp(query->id, seqid) == SIC_YES) {
1166 query_found = TRUE;
1167 break;
1168 }
1169 }
1170 if (query_found) {
1171 /* Find where seqaligns for this query end */
1172 for (sap = seqalign; sap && sap->next; sap = sap->next) {
1173 if (SeqIdComp(seqid, TxGetQueryIdFromSeqAlign(sap->next))
1174 != SIC_YES) {
1175 break;
1176 }
1177 }
1178 next_seqalign = sap->next;
1179 /* Unlink this query seqaligns from the rest */
1180 sap->next = NULL;
1181 } else {
1182 break;
1183 }
1184
1185 /* Find the masking locations for this query */
1186 if (next_mask_loc &&
1187 SeqIdComp(SeqLocId((SeqLocPtr)next_mask_loc->data.ptrvalue),
1188 seqid) == SIC_YES) {
1189 current_mask_loc = (SeqLocPtr)
1190 MemDup(next_mask_loc, sizeof(SeqLoc));
1191 next_mask_loc = next_mask_loc->next;
1192 current_mask_loc->next = NULL;
1193 } else {
1194 current_mask_loc = NULL;
1195 }
1196
1197 if (!mbxp) {
1198 if (options->is_megablast_search) {
1199 mbxp = PSIXmlInit(aip, "megablast", database, options, query, 0);
1200 } else {
1201 mbxp = PSIXmlInit(aip, program, database, options, query, 0);
1202 }
1203 }
1204 if(options->gapped_calculation == FALSE ||
1205 !StringICmp(program, "tblastx"))
1206 ungapped = TRUE;
1207
1208 /* Here is one-iterational Blast output */
1209 iterp = BXMLBuildOneQueryIteration(seqalign, NULL, options->is_ooframe,
1210 ungapped, index, NULL, query,
1211 current_mask_loc);
1212 current_mask_loc = (ValNodePtr) MemFree(current_mask_loc);
1213
1214 IterationAsnWrite(iterp, mbxp->aip, mbxp->atp);
1215 AsnIoFlush(mbxp->aip);
1216 IterationFree(iterp);
1217 /* Reconnect the SeqAlign chain */
1218 if (sap)
1219 sap->next = next_seqalign;
1220 seqalign = next_seqalign;
1221 }
1222
1223 free_default_matrix(glb_matrix);
1224 glb_matrix = NULL;
1225
1226 MBXmlClose(mbxp, other_returns, !options->gapped_calculation);
1227
1228 return TRUE;
1229 }
1230
1231
PSIXmlInit(AsnIoPtr aip,CharPtr program,CharPtr database,BLAST_OptionsBlkPtr options,BioseqPtr query,Int4 flags)1232 PSIXmlPtr PSIXmlInit(AsnIoPtr aip, CharPtr program, CharPtr database,
1233 BLAST_OptionsBlkPtr options, BioseqPtr query, Int4 flags)
1234 {
1235 PSIXmlPtr psixp;
1236 AsnModulePtr amp;
1237 DataVal av;
1238 AsnTypePtr atp;
1239 Boolean retval = FALSE;
1240
1241 AsnTypePtr BLASTOUTPUT;
1242 AsnTypePtr BLASTOUTPUT_program;
1243 AsnTypePtr BLASTOUTPUT_version;
1244 AsnTypePtr BLASTOUTPUT_reference;
1245 AsnTypePtr BLASTOUTPUT_db;
1246 AsnTypePtr BLASTOUTPUT_query_ID;
1247 AsnTypePtr BLASTOUTPUT_query_def;
1248 AsnTypePtr BLASTOUTPUT_query_len;
1249 AsnTypePtr BLASTOUTPUT_query_seq;
1250 AsnTypePtr BLASTOUTPUT_param;
1251 AsnTypePtr BLASTOUTPUT_iterations;
1252 AsnTypePtr BLASTOUTPUT_iterations_E;
1253 AsnTypePtr BLASTOUTPUT_mbstat;
1254
1255 psixp = (PSIXmlPtr) MemNew(sizeof(PSIXml));
1256
1257 psixp->aip = aip;
1258
1259 if (! bxmlobjAsnLoad()) {
1260 return NULL;
1261 }
1262
1263 AsnSetXMLmodulePrefix("http://www.ncbi.nlm.nih.gov/dtd/");
1264
1265 amp = AsnAllModPtr();
1266
1267 MACRO_atp_find(BLASTOUTPUT,BlastOutput);
1268 MACRO_atp_find(BLASTOUTPUT_program,BlastOutput.program);
1269 MACRO_atp_find(BLASTOUTPUT_version,BlastOutput.version);
1270 MACRO_atp_find(BLASTOUTPUT_reference,BlastOutput.reference);
1271 MACRO_atp_find(BLASTOUTPUT_db,BlastOutput.db);
1272 MACRO_atp_find(BLASTOUTPUT_query_ID,BlastOutput.query-ID);
1273 MACRO_atp_find(BLASTOUTPUT_query_def,BlastOutput.query-def);
1274 MACRO_atp_find(BLASTOUTPUT_query_len,BlastOutput.query-len);
1275 MACRO_atp_find(BLASTOUTPUT_query_seq,BlastOutput.query-seq);
1276 MACRO_atp_find(BLASTOUTPUT_param,BlastOutput.param);
1277 MACRO_atp_find(BLASTOUTPUT_iterations,BlastOutput.iterations);
1278 MACRO_atp_find(BLASTOUTPUT_iterations_E,BlastOutput.iterations.E);
1279 MACRO_atp_find(BLASTOUTPUT_mbstat,BlastOutput.mbstat);
1280
1281 /* Start of iterations structure */
1282 psixp->atp = BLASTOUTPUT_iterations_E;
1283
1284 /* Head of all BlastOutput structure */
1285 psixp->BlastOutput = BLASTOUTPUT;
1286
1287 /* Head of iterations strucure */
1288 psixp->BlastOutput_iterations = BLASTOUTPUT_iterations;
1289
1290 /* Head of the final statistics for Mega BLAST */
1291 psixp->BlastOutput_mbstat = BLASTOUTPUT_mbstat;
1292
1293 psixp->boutp = BXMLCreateBlastOutputHead(program, database, options,
1294 query, flags);
1295
1296 atp = AsnLinkType(NULL, BLASTOUTPUT); /* link local tree */
1297
1298 if (atp == NULL) {
1299 return NULL;
1300 }
1301
1302 if (! AsnOpenStruct(psixp->aip, atp, (Pointer) psixp->boutp)) {
1303 return NULL;
1304 }
1305
1306 if (psixp->boutp->program != NULL) {
1307 av.ptrvalue = psixp->boutp -> program;
1308 retval = AsnWrite(psixp->aip, BLASTOUTPUT_program, &av);
1309 }
1310
1311 if (psixp->boutp->version != NULL) {
1312 av.ptrvalue = psixp->boutp->version;
1313 retval = AsnWrite(psixp->aip, BLASTOUTPUT_version, &av);
1314 }
1315
1316 if (psixp->boutp->reference != NULL) {
1317 av.ptrvalue = psixp->boutp->reference;
1318 retval = AsnWrite(psixp->aip, BLASTOUTPUT_reference, &av);
1319 }
1320
1321 if (psixp->boutp -> db != NULL) {
1322 av.ptrvalue = psixp->boutp->db;
1323 retval = AsnWrite(psixp->aip, BLASTOUTPUT_db, &av);
1324 }
1325
1326 if (psixp->boutp -> query_ID != NULL) {
1327 av.ptrvalue = psixp->boutp->query_ID;
1328 retval = AsnWrite(psixp->aip, BLASTOUTPUT_query_ID, &av);
1329 }
1330
1331 if (psixp->boutp->query_def != NULL) {
1332 av.ptrvalue = psixp->boutp->query_def;
1333 retval = AsnWrite(psixp->aip, BLASTOUTPUT_query_def, &av);
1334 }
1335
1336 av.intvalue = psixp->boutp->query_len;
1337 retval = AsnWrite(psixp->aip, BLASTOUTPUT_query_len, &av);
1338
1339 if (psixp->boutp->query_seq != NULL) {
1340 av.ptrvalue = psixp->boutp->query_seq;
1341 retval = AsnWrite(psixp->aip, BLASTOUTPUT_query_seq, &av);
1342 }
1343
1344 if (psixp->boutp->param != NULL) {
1345 if (!ParametersAsnWrite(psixp->boutp->param,
1346 psixp->aip, BLASTOUTPUT_param)) {
1347 return NULL;
1348 }
1349 }
1350
1351 if(!AsnOpenStruct(psixp->aip, BLASTOUTPUT_iterations, NULL))
1352 return NULL;
1353
1354 AsnIoFlush(psixp->aip);
1355
1356 return psixp;
1357 }
1358
PSIXmlReset(PSIXmlPtr psixp)1359 void PSIXmlReset(PSIXmlPtr psixp)
1360 {
1361 AsnCloseStruct(psixp->aip, psixp->BlastOutput_iterations, NULL);
1362 AsnCloseStruct(psixp->aip, psixp->BlastOutput, NULL);
1363 psixp->BlastOutput_iterations = NULL;
1364 psixp->BlastOutput = NULL;
1365 AsnIoReset(psixp->aip);
1366
1367 psixp->boutp = BlastOutputFree(psixp->boutp);
1368 }
1369
PSIXmlClose(PSIXmlPtr psixp)1370 void PSIXmlClose(PSIXmlPtr psixp)
1371 {
1372 if (psixp->BlastOutput_iterations)
1373 AsnCloseStruct(psixp->aip, psixp->BlastOutput_iterations, NULL);
1374 if (psixp->BlastOutput)
1375 AsnCloseStruct(psixp->aip, psixp->BlastOutput, NULL);
1376
1377 AsnIoClose(psixp->aip);
1378
1379 BlastOutputFree(psixp->boutp);
1380 MemFree(psixp);
1381
1382 return;
1383 }
1384
MBXmlClose(PSIXmlPtr mbxp,ValNodePtr other_returns,Boolean ungapped)1385 void MBXmlClose(PSIXmlPtr mbxp, ValNodePtr other_returns, Boolean ungapped)
1386 {
1387 StatisticsPtr stat;
1388 AsnTypePtr atp;
1389
1390 AsnCloseStruct(mbxp->aip, mbxp->BlastOutput_iterations, NULL);
1391
1392 if (other_returns) {
1393 atp = AsnLinkType(NULL, mbxp->BlastOutput_mbstat);
1394
1395 /*AsnOpenStruct(mbxp->aip, mbxp->BlastOutput_mbstat, NULL);*/
1396 stat = BXMLBuildStatistics(other_returns, ungapped);
1397
1398 StatisticsAsnWrite(stat, mbxp->aip, atp);
1399 AsnIoFlush(mbxp->aip);
1400 StatisticsFree(stat);
1401 /*AsnCloseStruct(mbxp->aip, mbxp->BlastOutput_mbstat, NULL);*/
1402 }
1403
1404 AsnCloseStruct(mbxp->aip, mbxp->BlastOutput, NULL);
1405
1406 AsnIoClose(mbxp->aip);
1407
1408 BlastOutputFree(mbxp->boutp);
1409 MemFree(mbxp);
1410
1411 return;
1412 }
1413
FillHspScoreInfo(HspPtr hsp,ScorePtr score)1414 static void FillHspScoreInfo(HspPtr hsp, ScorePtr score)
1415 {
1416 ScorePtr sp;
1417 for(sp = score; sp != NULL; sp = sp->next) {
1418 if(!(StringICmp(sp->id->str, "e_value")) ||
1419 !(StringICmp(sp->id->str, "sum_e")))
1420 hsp->evalue = sp->value.realvalue;
1421 if(!StringICmp(sp->id->str, "bit_score")) {
1422 hsp->bit_score = sp->value.realvalue;
1423 }
1424 if(!StringICmp(sp->id->str, "score")) {
1425 hsp->score = sp->value.intvalue;
1426 }
1427 if(!StringICmp(sp->id->str, "num_ident")) {
1428 hsp->identity = sp->value.intvalue;
1429 }
1430 if (!StringICmp(sp->id->str, "sum_n")) {
1431 hsp->num = sp->value.intvalue;
1432 }
1433 }
1434 }
1435
GetHspFromSeqAlign(SeqAlignPtr align,Boolean ungapped,Int4Ptr hspcnt_ptr)1436 static HspPtr GetHspFromSeqAlign(SeqAlignPtr align, Boolean ungapped,
1437 Int4Ptr hspcnt_ptr)
1438 {
1439 HspPtr hsp, head_hsp = NULL, last_hsp = NULL;
1440 ScorePtr score;
1441 DenseDiagPtr ddp;
1442 DenseSegPtr dsp;
1443 StdSegPtr ssp;
1444 Int4 hspcnt = 0, last_seg, i;
1445
1446 *hspcnt_ptr = 0;
1447
1448 switch (align->segtype) {
1449 case 1: /*Dense-diag; blastn, blastp ungapped */
1450 ddp = (DenseDiagPtr) align->segs;
1451 while(ddp) {
1452 if((hsp = HspNew()) == NULL)
1453 return head_hsp;
1454 if (!head_hsp)
1455 head_hsp = last_hsp = hsp;
1456 else {
1457 last_hsp->next = hsp;
1458 last_hsp = last_hsp->next;
1459 }
1460
1461 ++hspcnt;
1462 hsp->align_len = ddp->len;
1463 if (ddp->strands[0] == Seq_strand_minus) {
1464 hsp->query_from = ddp->starts[0] + ddp->len;
1465 hsp->query_to = ddp->starts[0] + 1;
1466 } else {
1467 hsp->query_from = ddp->starts[0] + 1;
1468 hsp->query_to = ddp->starts[0] + ddp->len;
1469 }
1470 if (ddp->strands[1] == Seq_strand_minus) {
1471 hsp->hit_from = ddp->starts[1] + ddp->len;
1472 hsp->hit_to = ddp->starts[1] + 1;
1473 } else {
1474 hsp->hit_from = ddp->starts[1];
1475 hsp->hit_to = ddp->starts[1] + ddp->len - 1;
1476 }
1477 FillHspScoreInfo(hsp, ddp->scores);
1478 ddp = ddp->next;
1479 }
1480 break;
1481 case 2: /* Dense-seg; blastn, blastp gapped */
1482 dsp = (DenseSegPtr) align->segs;
1483 if((head_hsp = hsp = HspNew()) == NULL)
1484 return head_hsp;
1485 if (dsp->scores)
1486 score = dsp->scores;
1487 else
1488 score = align->score;
1489 hspcnt = 1;
1490 last_seg = dsp->numseg - 1;
1491
1492 if (dsp->strands[0] == Seq_strand_minus) {
1493 hsp->query_from = dsp->starts[0] + dsp->lens[0];
1494 hsp->query_to = dsp->starts[2*last_seg] + 1;
1495 } else {
1496 hsp->query_from = dsp->starts[0] + 1;
1497 hsp->query_to = dsp->starts[2*last_seg] + dsp->lens[last_seg];
1498 }
1499 if (dsp->strands[1] == Seq_strand_minus) {
1500 hsp->hit_from = dsp->starts[1] + dsp->lens[1];
1501 hsp->hit_to = dsp->starts[2*last_seg+1] + 1;
1502 } else {
1503 hsp->hit_from = dsp->starts[1] + 1;
1504 hsp->hit_to = dsp->starts[2*last_seg+1] + dsp->lens[last_seg];
1505 }
1506
1507 for (i=0; i<dsp->numseg; i++) {
1508 hsp->align_len += dsp->lens[i];
1509 if (dsp->starts[2*i] == -1 || dsp->starts[2*i+1] == -1) {
1510 hsp->gaps++;
1511 }
1512 }
1513 FillHspScoreInfo(hsp, score);
1514 break;
1515 case 3: /* Std-seg; translated gapped or ungapped */
1516 ssp = (StdSegPtr) align->segs;
1517
1518 if (ungapped) {
1519 while(ssp) {
1520 if((hsp = HspNew()) == NULL)
1521 return head_hsp;
1522 if (!head_hsp)
1523 head_hsp = last_hsp = hsp;
1524 else {
1525 last_hsp->next = hsp;
1526 last_hsp = last_hsp->next;
1527 }
1528
1529 ++hspcnt;
1530 if (ssp->scores)
1531 score = ssp->scores;
1532 else
1533 score = align->score;
1534
1535 FillHspScoreInfo(hsp, score);
1536
1537 if (SeqLocStrand(ssp->loc) == Seq_strand_minus) {
1538 hsp->query_to = SeqLocStart(ssp->loc) + 1;
1539 hsp->query_from = SeqLocStop(ssp->loc);
1540 } else {
1541 hsp->query_from = SeqLocStart(ssp->loc) + 1;
1542 hsp->query_to = SeqLocStop(ssp->loc);
1543 }
1544 if (SeqLocStrand(ssp->loc->next) == Seq_strand_minus) {
1545 hsp->hit_to = SeqLocStart(ssp->loc->next) + 1;
1546 hsp->hit_from = SeqLocStop(ssp->loc->next);
1547 } else {
1548 hsp->hit_from = SeqLocStart(ssp->loc->next) + 1;
1549 hsp->hit_to = SeqLocStop(ssp->loc->next);
1550 }
1551 if (SeqLocStrand(ssp->loc) == Seq_strand_unknown) {
1552 /* Protein location */
1553 hsp->align_len = SeqLocLen(ssp->loc);
1554 } else { /* Nucleotide location; need to divide length by 3
1555 to get protein length */
1556 hsp->align_len = SeqLocLen(ssp->loc) / 3;
1557 }
1558 ssp = ssp->next;
1559 }
1560 } else {
1561 SeqLocPtr slp;
1562 StdSegPtr last_ssp;
1563
1564 if((head_hsp = hsp = HspNew()) == NULL)
1565 return head_hsp;
1566
1567 hspcnt = 1;
1568 FillHspScoreInfo(hsp, align->score);
1569 /* Get relevant endpoints from the first StdSeg in the list */
1570 if (SeqLocStrand(ssp->loc) == Seq_strand_minus) {
1571 hsp->query_from = SeqLocStop(ssp->loc);
1572 } else {
1573 hsp->query_from = SeqLocStart(ssp->loc) + 1;
1574 }
1575 if (SeqLocStrand(ssp->loc->next) == Seq_strand_minus) {
1576 hsp->hit_from = SeqLocStop(ssp->loc->next);
1577 } else {
1578 hsp->hit_from = SeqLocStart(ssp->loc->next) + 1;
1579 }
1580 /* Advance StdSeg to the end of the list, and calculate the
1581 alignment length in the process */
1582 for ( ; ssp; ssp = ssp->next) {
1583 slp = ((ssp->loc->choice == SEQLOC_EMPTY) ? ssp->loc->next :
1584 ssp->loc);
1585 if (SeqLocStrand(slp) == Seq_strand_unknown) {
1586 hsp->align_len += SeqLocLen(slp);
1587 } else {
1588 hsp->align_len += (SeqLocLen(slp) / 3);
1589 }
1590 last_ssp = ssp;
1591 }
1592 /* Get relevant endpoints from the last StdSeg in the list */
1593 if (SeqLocStrand(last_ssp->loc) == Seq_strand_minus) {
1594 hsp->query_to = SeqLocStart(last_ssp->loc) + 1;
1595 } else {
1596 hsp->query_to = SeqLocStop(last_ssp->loc);
1597 }
1598 if (SeqLocStrand(last_ssp->loc->next) == Seq_strand_minus) {
1599 hsp->hit_to = SeqLocStart(last_ssp->loc->next) + 1;
1600 } else {
1601 hsp->hit_to = SeqLocStop(last_ssp->loc->next);
1602 }
1603 }
1604 break;
1605 default: break;
1606 }
1607
1608 *hspcnt_ptr = hspcnt;
1609 return head_hsp;
1610 }
1611
SeqAlignToHits(SeqAlignPtr seqalign,Boolean ungapped)1612 HitPtr SeqAlignToHits(SeqAlignPtr seqalign, Boolean ungapped)
1613 {
1614 HitPtr hitp, hitp_head;
1615 SeqAlignPtr sap, sap2;
1616 SeqIdPtr subject_id, sip;
1617 Char buffer[526];
1618 HspPtr hspp;
1619 Int4 hsp_count;
1620
1621 hitp_head = NULL;
1622
1623 for(sap = seqalign; sap != NULL;) {
1624 subject_id = TxGetSubjectIdFromSeqAlign(sap);
1625
1626 if(hitp_head == NULL) { /* first element */
1627 hitp_head = hitp = HitNew();
1628 } else {
1629 hitp->next = HitNew();
1630 hitp = hitp->next;
1631 }
1632
1633 SeqIdWrite(subject_id, buffer, PRINTID_FASTA_LONG, sizeof(buffer));
1634 hitp->id = StringSave(buffer);
1635 SeqIdWrite(SeqIdFindBestAccession(subject_id), buffer,
1636 PRINTID_TEXTID_ACCESSION, sizeof(buffer));
1637
1638 hitp->accession = StringSave(buffer);
1639
1640 for(sap2 = sap; sap2 != NULL; ) {
1641 /* Filling info about specific alignments */
1642 if (!hitp->hsps) {
1643 hspp = hitp->hsps =
1644 GetHspFromSeqAlign(sap2, ungapped, &hsp_count);
1645 } else {
1646 hspp->next = GetHspFromSeqAlign(sap2, ungapped, &hsp_count);
1647 hspp = hspp->next;
1648 }
1649
1650 hitp->num += hsp_count;
1651
1652 sap2 = sap2->next;
1653 sip = TxGetSubjectIdFromSeqAlign(sap2);
1654
1655 if(SeqIdMatch(subject_id, sip)) {
1656 continue;
1657 } else {
1658 sap = sap2;
1659 break;
1660 }
1661 }
1662 }
1663
1664 return hitp_head;
1665 }
1666