1 static char const rcsid[] = "$Id: vecscrn.c,v 6.137 2005/12/27 14:46:44 madden Exp $";
2
3 /* ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================*/
26
27 /*****************************************************************************
28
29 File name: vecscrn.c
30
31 Author: Tom Madden
32
33 Contents: functions for Vector screening.
34
35 ******************************************************************************
36 * $Revision: 6.137 $
37 *
38 * $Log: vecscrn.c,v $
39 * Revision 6.137 2005/12/27 14:46:44 madden
40 * Fix buffer overflow in Print62Notation
41 *
42 * Revision 6.136 2004/07/27 14:26:54 madden
43 * New location for tunf.cgi
44 *
45 * Revision 6.135 2003/05/30 17:25:38 coulouri
46 * add rcsid
47 *
48 * Revision 6.134 2002/07/24 21:11:39 kans
49 * reverted ncbi URL
50 *
51 * Revision 6.133 2002/07/23 16:50:04 kans
52 * changed www.ncbi.nlm.nih.gov to www.ncbi.nih.gov
53 *
54 * Revision 6.132 2001/01/09 17:30:03 madden
55 * Fix memory leaks
56 *
57 * Revision 6.131 2000/09/01 18:29:13 dondosha
58 * Removed calls to ReadDBFreeSharedInfo and ReadDBCloseMHdrAndSeqFiles
59 *
60 * Revision 6.130 2000/08/07 20:38:11 madden
61 * Proper casting of int to long for printf
62 *
63 * Revision 6.129 2000/05/19 20:40:34 kitts
64 * 1. Fixed bug in VSMakeCombinedSeqLoc preventing "No hits" being
65 * reported when none of the blast hits were significant.
66 * 2. Fixed bug in VSCombineSeqLoc causing overlaps to be missed
67 * when alignment filtering is off.
68 * 3. Fixed bug in CombineSeqLoc to allow all SeqLocs in a linked
69 * list to be processed.
70 * 4. Added checks to several functions to prevent "No hits"
71 * being reported when the search fails.
72 * 5. Other minor changes.
73 *
74 * Revision 6.128 2000/05/09 21:43:48 kitts
75 * Removed unused parameter from VSPrintListFromSeqLocs
76 * Moved "No hits" output from VSPrintListFromSeqLocs to vecscreen.c
77 *
78 * Revision 6.127 2000/05/05 20:11:09 madden
79 * Add VSScreenSequenceByLoc
80 *
81 * Revision 6.126 2000/05/01 16:58:42 kitts
82 *
83 * Added function VSPrintListFromSeqLocs
84 * Added function VSPrintListIdLine
85 *
86 * Revision 6.125 2000/03/30 21:04:16 madden
87 * Added function VSMakeCombinedSeqLoc
88 *
89 * Revision 6.124 2000/01/21 16:47:49 kans
90 * use ValNodeSort instead of original jz SortValNode
91 *
92 * Revision 6.123 2000/01/20 18:32:36 madden
93 * Removed VSMakeDisplaySeqLoc
94 *
95 * Revision 6.122 2000/01/20 14:40:36 madden
96 * Added VSMakeDisplaySeqLoc
97 *
98 * Revision 6.121 2000/01/19 21:53:21 madden
99 * Files for vector screening
100 *
101 ******************************************************************************/
102
103 #include <vecscrn.h>
104 #include <salpacc.h>
105 #include <salpstat.h>
106 #include <urkbias.h>
107
108 /* Used for the '62' notation stuff. */
109 #define DEFINE_62 62
110
111 /***********************************************************
112
113 CoordTo62Notation converts a position into the '62' notation.
114 The '62' notation decomposes a number (X) into two parts:
115
116 1.) the largest multiple of 62 that is still less than X. Let
117 this number be A.
118
119 2.) the difference between the number found in 1.) and X. Let
120 this number be B.
121
122 The original number can be reconstructed from:
123
124 X = 62*A + B
125
126 In CoordTo62Notation the first Int4 is the position that should be decomposed.
127 The two CharPtr's are (respectively) for A and B. These CharPtr's should be
128 allocated with at least two spaces and will be filled in with A and B.
129
130 ****************************************************************/
131 static Boolean
CoordTo62Notation(Int4 position,CharPtr major_part,CharPtr minor_part)132 CoordTo62Notation (Int4 position, CharPtr major_part, CharPtr minor_part)
133
134 {
135 Int4 major_num, minor_num;
136
137 if (major_part == NULL || minor_part == NULL)
138 return FALSE;
139
140 major_num = position/DEFINE_62;
141 minor_num = position%DEFINE_62;
142
143
144 if (major_num < 10)
145 sprintf(major_part, "%ld", (long) major_num);
146 else if (major_num < 36)
147 *major_part = 'A' + major_num - 10;
148 else
149 *major_part = 'a' + major_num - 36;
150
151
152 if (minor_num < 10)
153 sprintf(minor_part, "%ld", (long) minor_num);
154 else if (minor_num < 36)
155 *minor_part = 'A' + minor_num - 10;
156 else
157 *minor_part = 'a' + minor_num - 36;
158
159 return TRUE;
160 }
161
162 #define ADD_TO_BUFFER_FOR_BLAST 100
163
164 /*
165 Given a SeqLocPtr (for a query of length query_length), fill in
166 buffer with starts and stops in '62' notation (see above doc.).
167 */
168
169 static Boolean
Print62Notation(SeqLocPtr seqloc,Int4 query_length,CharPtr * buffer)170 Print62Notation(SeqLocPtr seqloc, Int4 query_length, CharPtr *buffer)
171 {
172 Char a1[2], a2[2], b1[2], b2[2];
173 CharPtr ptr, my_buffer;
174 Int4 buffer_size, position, start, stop;
175 Nlm_FloatHi block_size;
176 const int kScreenWidth = 600;
177
178 if (seqloc == NULL)
179 return FALSE;
180
181 block_size = ((Nlm_FloatHi) query_length)/kScreenWidth;
182
183 if (seqloc->choice == SEQLOC_PACKED_INT)
184 seqloc = (ValNodePtr) seqloc->data.ptrvalue;
185
186 position = 0;
187 buffer_size = ADD_TO_BUFFER_FOR_BLAST;
188 my_buffer = (CharPtr) MemNew(buffer_size*sizeof(Char));
189 ptr = my_buffer;
190 while (seqloc)
191 {
192 /* We check for +5 as we need to allow addition of four characters within
193 this loop as well as setting the final byte to NULLB outside of the loop. */
194 if (position+5 > buffer_size)
195 {
196 CharPtr my_new_buffer = NULL;
197 buffer_size += ADD_TO_BUFFER_FOR_BLAST;
198 my_new_buffer = (CharPtr) MemNew(buffer_size*sizeof(Char));
199 MemCpy(my_new_buffer, my_buffer, position);
200 my_buffer = (CharPtr) MemFree(my_buffer);
201 my_buffer = my_new_buffer;
202 ptr = my_buffer + position;
203 }
204 if (SeqLocStrand(seqloc) == Seq_strand_minus)
205 {
206 start = SeqLocStop(seqloc)/block_size;
207 stop = (SeqLocStart(seqloc)+1)/block_size;
208 }
209 else
210 {
211 start = SeqLocStart(seqloc)/block_size;
212 stop = (SeqLocStop(seqloc)+1)/block_size;
213 }
214 CoordTo62Notation(start, a1, b1);
215 CoordTo62Notation(stop, a2, b2);
216 *ptr = *a1; ptr++;
217 *ptr = *b1; ptr++;
218 *ptr = *a2; ptr++;
219 *ptr = *b2; ptr++;
220 position += 4;
221 seqloc = seqloc->next;
222 }
223 *ptr = NULLB;
224
225 *buffer = my_buffer;
226
227 return TRUE;
228 }
229
230 /*
231 Gets the labels for a given class.
232 The CharPtr's will be allocated by the function.
233 It is up to the caller to deallocate this memory.
234 The 'color' buffer will hold the color in the format of RRGGBB (e.g., "00FF00").
235 */
236
237 Boolean LIBCALL
VSGetLabels(Int2 class,CharPtr PNTR short_desc,CharPtr PNTR long_desc,CharPtr PNTR color)238 VSGetLabels (Int2 class, CharPtr PNTR short_desc, CharPtr PNTR long_desc, CharPtr PNTR color)
239
240 {
241 Boolean retval;
242
243 switch (class) {
244 case 1:
245 if (long_desc)
246 *long_desc = StringSave("Strong match");
247 if (short_desc)
248 *short_desc = StringSave("Strong");
249 if (color)
250 *color = StringSave("FF0000");
251 retval = TRUE;
252 break;
253 case 2:
254 if (long_desc)
255 *long_desc = StringSave("Moderate match");
256 if (short_desc)
257 *short_desc = StringSave("Moderate");
258 if (color)
259 *color = StringSave("FF00FF");
260 retval = TRUE;
261 break;
262 case 3:
263 if (long_desc)
264 *long_desc = StringSave("Weak match");
265 if (short_desc)
266 *short_desc = StringSave("Weak");
267 if (color)
268 *color = StringSave("00FF00");
269 retval = TRUE;
270 break;
271 case 4:
272 if (long_desc)
273 *long_desc = StringSave("Suspect origin");
274 if (short_desc)
275 *short_desc = StringSave("Suspect");
276 if (color)
277 *color = StringSave("FFFF00");
278 retval = TRUE;
279 break;
280 default:
281 retval = FALSE;
282 }
283
284 return retval;
285 }
286
287 /*
288 BlastOptionNew function used for VecScreen screening.
289 These options were specified by Paul Kitts.
290 */
291
292
293 BLAST_OptionsBlkPtr LIBCALL
VSBlastOptionNew(void)294 VSBlastOptionNew(void)
295
296 {
297 BLAST_OptionsBlkPtr options;
298
299
300 options = BLASTOptionNew("blastn", TRUE);
301 if (options == NULL)
302 return NULL;
303
304 options->gap_open = 3;
305 options->gap_extend = 3;
306 options->filter_string = StringSave("m D");
307 options->penalty = -5;
308 options->expect_value = 700;
309 options->searchsp_eff = 1.75e12;
310
311 return options;
312 }
313
314
315 /*
316 Deletes the VSOptions structure.
317 */
318
319 VSOptionsPtr LIBCALL
VSOptionsFree(VSOptionsPtr options)320 VSOptionsFree(VSOptionsPtr options)
321
322 {
323 if (options == NULL)
324 return NULL;
325
326 options->classes = MemFree(options->classes);
327
328
329 MemFree(options);
330
331 return NULL;
332
333 }
334 /*
335 Makes up the structure for VSOptions, with default values
336 filled in.
337 */
338
339 VSOptionsPtr LIBCALL
VSOptionsNew(void)340 VSOptionsNew(void)
341
342 {
343 VSOptionsPtr options;
344
345 options = MemNew(sizeof(VSOptions));
346 if (options == NULL)
347 return NULL;
348
349 options->number_of_classes = 3;
350 options->classes = MemNew(2*(options->number_of_classes)*sizeof(Int4));
351 if (options->classes == NULL)
352 {
353 MemFree(options);
354 return NULL;
355 }
356 options->classes[0] = 24;
357 options->classes[1] = 30;
358 options->classes[2] = 19;
359 options->classes[3] = 25;
360 options->classes[4] = 16;
361 options->classes[5] = 23;
362 options->distance_to_end = 24;
363 options->distance_between_segments = 50;
364 options->filter_by_coverage = TRUE;
365
366 return options;
367 }
368
369 /*
370 Changes the SeqLoc to the PLUS strand. Only works for SeqInt.
371 */
372
373 static void
SeqLocToPlusStrand(SeqLocPtr seqloc)374 SeqLocToPlusStrand(SeqLocPtr seqloc)
375
376 {
377 Int4 start, stop;
378 SeqIntPtr seq_int;
379
380 if (seqloc == NULL || seqloc->choice != SEQLOC_INT)
381 return;
382
383 while (seqloc)
384 {
385 if (SeqLocStrand(seqloc) == Seq_strand_minus)
386 {
387 seq_int = seqloc->data.ptrvalue;
388
389 start = seq_int->from;
390 stop = seq_int->to;
391
392 seq_int->from = MIN(start, stop);
393 seq_int->to = MAX(start, stop);
394 seq_int->strand = Seq_strand_plus;
395 }
396 seqloc = seqloc->next;
397 }
398
399 return;
400 }
401
402 /*
403 Check for overlap between seqloc1 and seqloc2.
404 Keep only seqloc1 if there is an overlap.
405 Return a new SeqLocPtr with these SeqLoc's.
406 */
407 static SeqLocPtr
CheckForOverlap(SeqLocPtr seqloc1,SeqLocPtr seqloc2)408 CheckForOverlap(SeqLocPtr seqloc1, SeqLocPtr seqloc2)
409
410 {
411 SeqLocPtr seqloc, next_seqloc;
412 SeqLocPtr new_seqloc=NULL, retval=NULL, current;
413
414 if (seqloc1 == NULL || seqloc1->choice != SEQLOC_PACKED_INT)
415 return seqloc2;
416
417 if (seqloc2 == NULL || seqloc2->choice != SEQLOC_PACKED_INT)
418 return NULL;
419
420 seqloc = seqloc2->data.ptrvalue;
421
422 while (seqloc)
423 {
424 if (SeqLocAinB(seqloc, seqloc1) >= 0)
425 {
426 seqloc = seqloc->next;
427 }
428 else
429 {
430 if (!new_seqloc)
431 {
432 new_seqloc = seqloc;
433 current = new_seqloc;
434 }
435 else
436 {
437 current->next = seqloc;
438 current = current->next;
439 }
440 next_seqloc = seqloc->next;
441 seqloc->next = NULL;
442 seqloc = next_seqloc;
443 }
444 }
445 if (new_seqloc)
446 ValNodeAddPointer(&retval, SEQLOC_PACKED_INT, new_seqloc);
447
448 return retval;
449 }
450
MySeqLocSortByStartPosition(VoidPtr vp1,VoidPtr vp2)451 static int LIBCALLBACK MySeqLocSortByStartPosition(VoidPtr vp1, VoidPtr vp2)
452
453 {
454 SeqLocPtr slp1, slp2;
455
456 slp1 = *((SeqLocPtr PNTR) vp1);
457 slp2 = *((SeqLocPtr PNTR) vp2);
458
459 if (SeqLocStart(slp1) < SeqLocStart(slp2))
460 return -1;
461 else if (SeqLocStart(slp1) > SeqLocStart(slp2))
462 return 1;
463 else
464 return 0;
465 }
466
467 /*
468 Combine two SeqLoc's that overlap.
469 */
470
471 static SeqLocPtr
CombineSeqLoc(SeqLocPtr seqloc)472 CombineSeqLoc (SeqLocPtr seqloc)
473
474 {
475 Int4 start, stop;
476 SeqLocPtr retval=NULL, seqloc_var, new_seqloc=NULL;
477 SeqLocPtr my_seqloc=NULL, tmp_seqloc;
478 SeqIntPtr seq_int;
479
480 if (seqloc == NULL || seqloc->choice != SEQLOC_PACKED_INT)
481 return NULL;
482
483 while (seqloc)
484 {
485 tmp_seqloc = seqloc->data.ptrvalue;
486 while (tmp_seqloc)
487 {
488 ValNodeLink(&my_seqloc, (SeqLocPtr) AsnIoMemCopy ((Pointer) tmp_seqloc,
489 (AsnReadFunc) SeqLocAsnRead,
490 (AsnWriteFunc) SeqLocAsnWrite));
491 tmp_seqloc = tmp_seqloc->next;
492 }
493 seqloc = seqloc->next;
494 }
495
496 my_seqloc = (SeqLocPtr) ValNodeSort ((ValNodePtr) my_seqloc, MySeqLocSortByStartPosition);
497
498 start = SeqLocStart(my_seqloc);
499 stop = SeqLocStop(my_seqloc);
500 seqloc_var = my_seqloc;
501
502 while (seqloc_var)
503 {
504 if (seqloc_var->next && stop+1 >= SeqLocStart(seqloc_var->next))
505 {
506 stop = MAX(stop, SeqLocStop(seqloc_var->next));
507 }
508 else
509 {
510 seq_int = SeqIntNew();
511 seq_int->from = start;
512 seq_int->to = stop;
513 seq_int->strand = Seq_strand_plus;
514 seq_int->id = SeqIdDup(SeqIdFindBest(SeqLocId(seqloc_var), SEQID_GI));
515 ValNodeAddPointer(&new_seqloc, SEQLOC_INT, seq_int);
516 if (seqloc_var->next)
517 {
518 start = SeqLocStart(seqloc_var->next);
519 stop = SeqLocStop(seqloc_var->next);
520 }
521 }
522 seqloc_var = seqloc_var->next;
523 }
524
525 if (new_seqloc)
526 ValNodeAddPointer(&retval, SEQLOC_PACKED_INT, new_seqloc);
527
528 return retval;
529 }
530
531 /* Get intervening SeqLoc's if the two SeqLoc's are within distance_permitted. */
532
533 static SeqLocPtr
GetInterveningSeqLoc(SeqLocPtr seqloc,Int4 distance_permitted,Int4 query_length)534 GetInterveningSeqLoc (SeqLocPtr seqloc, Int4 distance_permitted, Int4 query_length)
535
536 {
537 SeqLocPtr retval=NULL, seqloc_var, new_seqloc=NULL;
538 SeqLocPtr my_seqloc=NULL, tmp_seqloc;
539 SeqIntPtr seq_int;
540
541 if (seqloc == NULL)
542 return NULL;
543
544 if (seqloc->choice != SEQLOC_PACKED_INT)
545 return NULL;
546
547 while (seqloc)
548 {
549 tmp_seqloc = seqloc->data.ptrvalue;
550 while (tmp_seqloc)
551 {
552 ValNodeLink(&my_seqloc, (SeqLocPtr) AsnIoMemCopy ((Pointer) tmp_seqloc,
553 (AsnReadFunc) SeqLocAsnRead,
554 (AsnWriteFunc) SeqLocAsnWrite));
555 tmp_seqloc = tmp_seqloc->next;
556 }
557 seqloc = seqloc->next;
558 }
559
560 my_seqloc = (SeqLocPtr) ValNodeSort ((ValNodePtr) my_seqloc, MySeqLocSortByStartPosition);
561
562 seqloc_var = my_seqloc;
563
564 if (SeqLocStart(seqloc_var) < distance_permitted &&
565 SeqLocStart(seqloc_var) > 0)
566 {
567 seq_int = SeqIntNew();
568 seq_int->from = 0;
569 seq_int->to = SeqLocStart(seqloc_var) - 1;
570 seq_int->strand = Seq_strand_plus;
571 seq_int->id = SeqIdDup(SeqIdFindBest(SeqLocId(seqloc_var), SEQID_GI));
572 ValNodeAddPointer(&new_seqloc, SEQLOC_INT, seq_int);
573 }
574
575 while (seqloc_var)
576 {
577 if (seqloc_var->next && SeqLocStop(seqloc_var)+1 < SeqLocStart(seqloc_var->next) && SeqLocStop(seqloc_var)+distance_permitted >= SeqLocStart(seqloc_var->next))
578 {
579 seq_int = SeqIntNew();
580 seq_int->from = SeqLocStop(seqloc_var) + 1;
581 seq_int->to = SeqLocStart(seqloc_var->next) - 1;
582 seq_int->strand = Seq_strand_plus;
583 seq_int->id = SeqIdDup(SeqIdFindBest(SeqLocId(seqloc_var), SEQID_GI));
584 ValNodeAddPointer(&new_seqloc, SEQLOC_INT, seq_int);
585 }
586 else if ((seqloc_var->next == NULL || SeqLocStop(seqloc_var->next) < SeqLocStop(seqloc_var)) && query_length - (SeqLocStop(seqloc_var)+1) < distance_permitted && query_length - (SeqLocStop(seqloc_var)+1) > 0) /* Check distance to end. */
587 {
588 seq_int = SeqIntNew();
589 seq_int->from = SeqLocStop(seqloc_var) + 1;
590 seq_int->to = query_length - 1;
591 seq_int->strand = Seq_strand_plus;
592 seq_int->id = SeqIdDup(SeqIdFindBest(SeqLocId(seqloc_var), SEQID_GI));
593 ValNodeAddPointer(&new_seqloc, SEQLOC_INT, seq_int);
594 }
595 seqloc_var = seqloc_var->next;
596 }
597
598 if (new_seqloc)
599 ValNodeAddPointer(&retval, SEQLOC_PACKED_INT, new_seqloc);
600
601 return retval;
602 }
603
604 /*
605 Scans the SeqAlignPtr's, producing SeqLocPtr's as specified.
606 Each ValNodePtr points to a list of SeqLoc's.
607
608 This function:
609
610 - categorizes hits
611 - eliminates redundancy
612 - calculates SeqLocs from the SeqAlign
613
614 if options are NULL, defaults will be used.
615
616 */
617
618 Boolean LIBCALL
VSMakeSeqLoc(SeqAlignPtr PNTR seqalign_ptr,ValNodePtr PNTR vnp,Int4 length,VSOptionsPtr options)619 VSMakeSeqLoc (SeqAlignPtr PNTR seqalign_ptr, ValNodePtr PNTR vnp, Int4 length, VSOptionsPtr options)
620
621 {
622 Boolean own_options=FALSE;
623 Int4 start, stop, score, num, tmp;
624 Nlm_FloatHi bit_score, evalue;
625 SeqLocPtr seqloc1=NULL, seqloc2=NULL, seqloc3=NULL, seqloc, seqloc_tmp;
626 SeqAlignPtr seqalign, head=NULL, next;
627 SeqAlignPtr sap1=NULL, sap2=NULL, sap3=NULL;
628 Uint2Ptr HitsPerBase=NULL;
629 Int4 last_start=INT4_MAX, last_stop=-1;
630
631 if (seqalign_ptr == NULL || vnp == NULL)
632 return FALSE;
633
634 if (options == NULL)
635 {
636 options = VSOptionsNew();
637 if (options == NULL)
638 return FALSE;
639 own_options = TRUE;
640 }
641
642 seqalign = *seqalign_ptr;
643 while (seqalign)
644 {
645 next = seqalign->next;
646 GetScoreAndEvalue(seqalign, &score, &bit_score, &evalue, &num);
647 start = SeqAlignStart(seqalign, 0);
648 stop = SeqAlignStop(seqalign, 0);
649 if (start > stop)
650 { /* reverse for minus strand. */
651 tmp = start;
652 start = stop;
653 stop = tmp;
654 }
655 if (score >= options->classes[1] || (score >= options->classes[0] && (start <= options->distance_to_end || length-(stop+1) <= options->distance_to_end)))
656 {
657 sap1 = SeqAlignLink(sap1, seqalign);
658 seqalign->next = NULL;
659 }
660 else if (score >= options->classes[3] || (score >= options->classes[2] && (start <= options->distance_to_end || length-(stop+1) <= options->distance_to_end)))
661 {
662 sap2 =SeqAlignLink(sap2, seqalign);
663 seqalign->next = NULL;
664 }
665 else if (score >= options->classes[5] || (score >= options->classes[4] && (start <= options->distance_to_end || length-(stop+1) <= options->distance_to_end)))
666 {
667 sap3 = SeqAlignLink(sap3, seqalign);
668 seqalign->next = NULL;
669 }
670 else
671 {
672 SeqAlignFree(seqalign);
673 }
674 seqalign = next;
675 }
676
677 if (sap1)
678 {
679 if (options->filter_by_coverage)
680 sap1 = SeqAlignFilterByCoverage(sap1, NULL, NULL, 1, &HitsPerBase, &last_start, &last_stop);
681 seqalign = sap1;
682 while (seqalign)
683 {
684 seqloc_tmp = SeqLocFromSeqAlign(seqalign, NULL);
685 SeqLocToPlusStrand(seqloc_tmp);
686 ValNodeLink(&seqloc1, seqloc_tmp);
687 seqalign = seqalign->next;
688 }
689 head = SeqAlignLink(head, sap1);
690 }
691 if (sap2)
692 {
693 if (options->filter_by_coverage)
694 sap2 = SeqAlignFilterByCoverage(sap2, NULL, NULL, 1, &HitsPerBase, &last_start, &last_stop);
695 seqalign = sap2;
696 while (seqalign)
697 {
698 seqloc_tmp = SeqLocFromSeqAlign(seqalign, NULL);
699 SeqLocToPlusStrand(seqloc_tmp);
700 ValNodeLink(&seqloc2, seqloc_tmp);
701 seqalign = seqalign->next;
702 }
703 head = SeqAlignLink(head, sap2);
704 }
705 if (sap3)
706 {
707 if (options->filter_by_coverage)
708 sap3 = SeqAlignFilterByCoverage(sap3, NULL, NULL, 1, &HitsPerBase, &last_start, &last_stop);
709 seqalign = sap3;
710 while (seqalign)
711 {
712 seqloc_tmp = SeqLocFromSeqAlign(seqalign, NULL);
713 SeqLocToPlusStrand(seqloc_tmp);
714 ValNodeLink(&seqloc3, seqloc_tmp);
715 seqalign = seqalign->next;
716 }
717 head = SeqAlignLink(head, sap3);
718 }
719
720 /* Set for return. */
721 *seqalign_ptr = head;
722
723 HitsPerBase = MemFree(HitsPerBase);
724
725 *vnp = NULL;
726
727 if (seqloc1)
728 {
729 seqloc = NULL;
730 ValNodeAddPointer(&seqloc, SEQLOC_PACKED_INT, seqloc1);
731 ValNodeAddPointer(vnp, 1, seqloc);
732 }
733
734 if (seqloc2)
735 {
736 seqloc = NULL;
737 ValNodeAddPointer(&seqloc, SEQLOC_PACKED_INT, seqloc2);
738 ValNodeAddPointer(vnp, 2, seqloc);
739 }
740
741 if (seqloc3)
742 {
743 seqloc = NULL;
744 ValNodeAddPointer(&seqloc, SEQLOC_PACKED_INT, seqloc3);
745 ValNodeAddPointer(vnp, 3, seqloc);
746 }
747
748
749 if (own_options)
750 options = VSOptionsFree(options);
751
752 return TRUE;
753 }
754
755 /*
756
757 This function checks SeqLocs for redundancy. It:
758
759 - Combines SeqLoc's in each category
760 - checks for overlap
761 - produces a fourth SeqLoc for intervening sequences.
762 */
763
764 Boolean LIBCALL
VSCombineSeqLoc(ValNodePtr PNTR vnp,Int4 query_length,VSOptionsPtr options)765 VSCombineSeqLoc(ValNodePtr PNTR vnp, Int4 query_length, VSOptionsPtr options)
766 {
767 SeqLocPtr seqloc1=NULL, seqloc2=NULL, seqloc3=NULL, seqloc4=NULL;
768 SeqLocPtr n_seqloc1=NULL, n_seqloc2=NULL, n_seqloc3=NULL;
769 SeqLocPtr tmp_slp=NULL, combo_slp=NULL;
770 ValNodePtr var;
771
772 if (vnp == NULL || options == NULL)
773 return FALSE;
774
775 if (*vnp == NULL)
776 return TRUE;
777
778 var = *vnp;
779 while (var)
780 {
781 if (var->choice == 1)
782 {
783 seqloc1 = var->data.ptrvalue;
784 }
785 else if (var->choice == 2)
786 {
787 seqloc2 = var->data.ptrvalue;
788 }
789 else if (var->choice == 3)
790 {
791 seqloc3 = var->data.ptrvalue;
792 }
793 var = var->next;
794 }
795
796 *vnp = ValNodeFree(*vnp);
797
798 if (seqloc1)
799 {
800 n_seqloc1 = CombineSeqLoc(seqloc1);
801 seqloc1 = SeqLocSetFree(seqloc1);
802 ValNodeAddPointer(vnp, 1, n_seqloc1);
803 }
804
805 if (seqloc2)
806 {
807 seqloc2 = CheckForOverlap(n_seqloc1, seqloc2);
808 n_seqloc2 = CombineSeqLoc(seqloc2);
809 seqloc2 = SeqLocSetFree(seqloc2);
810 ValNodeAddPointer(vnp, 2, n_seqloc2);
811 }
812
813 if (seqloc3)
814 {
815 tmp_slp = SeqLocDupAll(n_seqloc1);
816 tmp_slp = SeqLocLink(&tmp_slp, SeqLocDupAll(n_seqloc2));
817 combo_slp = CombineSeqLoc(tmp_slp);
818 tmp_slp = SeqLocSetFree(tmp_slp);
819 seqloc3 = CheckForOverlap(combo_slp, seqloc3);
820 n_seqloc3 = CombineSeqLoc(seqloc3);
821 seqloc3 = SeqLocSetFree(seqloc3);
822 ValNodeAddPointer(vnp, 3, n_seqloc3);
823 }
824
825 combo_slp = SeqLocSetFree(combo_slp);
826
827 tmp_slp = NULL;
828 tmp_slp = SeqLocDupAll(n_seqloc1);
829 tmp_slp = SeqLocLink(&tmp_slp, SeqLocDupAll(n_seqloc2));
830 tmp_slp = SeqLocLink(&tmp_slp, SeqLocDupAll(n_seqloc3));
831
832 seqloc4 = GetInterveningSeqLoc(tmp_slp, options->distance_between_segments, query_length);
833 ValNodeAddPointer(vnp, 4, seqloc4);
834 /*
835 if (seqloc4)
836 {
837 seqloc = NULL;
838 ValNodeAddPointer(&seqloc, SEQLOC_PACKED_INT, seqloc4);
839 }
840 */
841
842
843 tmp_slp = SeqLocSetFree(tmp_slp);
844
845
846 return TRUE;
847 }
848
849 /*
850 For a given SeqAlignPtr, make and merge SeqLoc's corresponding to these.
851
852 This function calls VSMakeSeqLoc and VSCombineSeqLoc.
853
854 Note: if 'options' is NULL, default values will be used. This is STRONGLY recommended.
855 */
856
857 Int2 LIBCALL
VSMakeCombinedSeqLoc(SeqAlignPtr PNTR seqalign_ptr,ValNodePtr PNTR vnpp,Int4 length,VSOptionsPtr options)858 VSMakeCombinedSeqLoc(SeqAlignPtr PNTR seqalign_ptr, ValNodePtr PNTR vnpp, Int4 length, VSOptionsPtr options)
859 {
860 Boolean delete_options=FALSE;
861 Boolean not_failed;
862 Int2 retval=0;
863 ValNodePtr var;
864
865 if (seqalign_ptr == NULL || vnpp == NULL)
866 return -1;
867
868 *vnpp = NULL;
869
870 if (*seqalign_ptr == NULL)
871 return 0;
872
873 if (options == NULL)
874 {
875 options = VSOptionsNew();
876 if (options == NULL)
877 return -1;
878 delete_options = TRUE;
879 }
880
881 not_failed = VSMakeSeqLoc(seqalign_ptr, vnpp, length, options);
882 if (not_failed)
883 not_failed = VSCombineSeqLoc(vnpp, length, options);
884
885 if (not_failed)
886 {
887 retval = *vnpp ? INT2_MAX : 0;
888 var = *vnpp;
889 while (var)
890 {
891 retval = MIN(retval, (Int2) var->choice);
892 var = var->next;
893 }
894 }
895 else retval = -1;
896
897 if (delete_options)
898 options = VSOptionsFree(options);
899
900 return retval;
901 }
902
903 /*
904 Performs VecScreen for stand-alone application.
905
906 Note: if 'options' is NULL, default values will be used. This is STRONGLY recommended.
907 */
908
909 Int2 LIBCALL
VSScreenSequence(BioseqPtr bsp,VSOptionsPtr options,CharPtr database,SeqAlignPtr PNTR seqalign_ptr,ValNodePtr PNTR vnpp,ValNodePtr * other_returns,ValNodePtr * error_returns)910 VSScreenSequence(BioseqPtr bsp, VSOptionsPtr options, CharPtr database, SeqAlignPtr PNTR seqalign_ptr, ValNodePtr PNTR vnpp, ValNodePtr *other_returns, ValNodePtr *error_returns)
911
912 {
913 BLAST_OptionsBlkPtr blast_options;
914 Boolean delete_options = FALSE;
915 Int2 retval=0;
916 SeqAlignPtr seqalign;
917
918 if (bsp == NULL)
919 return -1;
920
921 if (seqalign_ptr)
922 *seqalign_ptr = NULL;
923
924 blast_options = VSBlastOptionNew();
925 if (blast_options == NULL)
926 return -1;
927
928 if (options == NULL)
929 {
930 options = VSOptionsNew();
931 if (options == NULL)
932 return -1;
933 delete_options = TRUE;
934 }
935
936 if (database == NULL)
937 seqalign = BioseqBlastEngine(bsp, "blastn", "UniVec_Core", blast_options, other_returns, error_returns, NULL);
938 else
939 seqalign = BioseqBlastEngine(bsp, "blastn", database, blast_options, other_returns, error_returns, NULL);
940 if (seqalign)
941 {
942 retval = VSMakeCombinedSeqLoc(&seqalign, vnpp, bsp->length, options);
943
944 if (seqalign_ptr)
945 *seqalign_ptr = seqalign;
946 }
947
948 if (delete_options)
949 options = VSOptionsFree(options);
950
951 blast_options = BLASTOptionDelete(blast_options);
952
953 return retval;
954 }
955
956 /*
957 Performs VecScreen for vector filtering.
958
959 Note: if 'options' is NULL, default values will be used. This is STRONGLY recommended.
960 */
961
962 Int2 LIBCALL
VSScreenSequenceByLoc(SeqLocPtr slp,VSOptionsPtr options,CharPtr database,SeqAlignPtr PNTR seqalign_ptr,ValNodePtr PNTR vnpp,ValNodePtr * other_returns,ValNodePtr * error_returns)963 VSScreenSequenceByLoc(SeqLocPtr slp, VSOptionsPtr options, CharPtr database, SeqAlignPtr PNTR seqalign_ptr, ValNodePtr PNTR vnpp, ValNodePtr *other_returns, ValNodePtr *error_returns)
964
965 {
966 BioseqPtr bsp;
967 Int2 retval=0;
968
969 if (slp == NULL)
970 return -1;
971
972
973 bsp = BioseqLockById(SeqLocId(slp));
974 if (bsp == NULL)
975 return -1;
976
977 retval = VSScreenSequence(bsp, options, database, seqalign_ptr, vnpp, other_returns, error_returns);
978
979 BioseqUnlock(bsp);
980
981 return retval;
982 }
983
984 /*
985 Prints bar overview and list of matching segments by category
986 */
987
988 Boolean LIBCALL
VSPrintOverviewFromSeqLocs(ValNodePtr vnp,Int4 query_length,FILE * outfp)989 VSPrintOverviewFromSeqLocs (ValNodePtr vnp, Int4 query_length, FILE *outfp)
990
991 {
992 CharPtr buffer50=NULL, buffer25=NULL, buffer20=NULL, buffer15=NULL;
993 SeqLocPtr seqloc1=NULL, seqloc2=NULL, seqloc3=NULL, seqloc4=NULL, tmp;
994 ValNodePtr var=NULL;
995
996 if (outfp == NULL || query_length < 1)
997 return FALSE;
998
999 var = vnp;
1000 while (var)
1001 {
1002 if (var->choice == 1)
1003 {
1004 seqloc1 = var->data.ptrvalue;
1005 }
1006 else if (var->choice == 2)
1007 {
1008 seqloc2 = var->data.ptrvalue;
1009 }
1010 else if (var->choice == 3)
1011 {
1012 seqloc3 = var->data.ptrvalue;
1013 }
1014 else if (var->choice == 4)
1015 {
1016 seqloc4 = var->data.ptrvalue;
1017 }
1018 var = var->next;
1019 }
1020
1021 if (!seqloc1 && !seqloc2 && !seqloc3 && !seqloc4)
1022 return TRUE;
1023
1024 if (seqloc1)
1025 {
1026 Print62Notation(seqloc1, query_length, &buffer50);
1027 }
1028
1029 if (seqloc2)
1030 {
1031 Print62Notation(seqloc2, query_length, &buffer25);
1032 }
1033
1034 if (seqloc3)
1035 {
1036 Print62Notation(seqloc3, query_length, &buffer20);
1037 }
1038
1039 if (seqloc4)
1040 {
1041 Print62Notation(seqloc4, query_length, &buffer15);
1042 }
1043
1044
1045 fprintf(outfp, "<B>Distribution of Vector Matches on the Query Sequence</B>\n\n");
1046 fprintf(outfp, "<IMG hspace=1 SRC=http://www.ncbi.nlm.nih.gov/sutils/tunf.cgi?450x12-1(1)150(%ld)300(%ld)450(%ld)600(%ld)\n", (long) query_length/4, (long) query_length/2, (long) 3*query_length/4, (long) query_length);
1047 fprintf(outfp, ">\n");
1048 fprintf(outfp, "<IMG hspace=1 SRC=http://www.ncbi.nlm.nih.gov/gorf/gun2.cgi?0M(009g)");
1049 if (buffer20)
1050 fprintf(outfp, "00FF00(%s)", buffer20);
1051 if (buffer25)
1052 fprintf(outfp, "FF00FF(%s)", buffer25);
1053 if (buffer50)
1054 fprintf(outfp, "FF0000(%s)", buffer50);
1055 if (buffer15)
1056 fprintf(outfp, "FFFF00(%s)", buffer15);
1057
1058 MemFree(buffer20);
1059 MemFree(buffer25);
1060 MemFree(buffer50);
1061 MemFree(buffer15);
1062
1063 fprintf(outfp, ">\n");
1064 fprintf(outfp, " <B>Match to Vector: </B>");
1065 fprintf(outfp, "<IMG SRC=http://www.ncbi.nlm.nih.gov/gorf/gun2.cgi?0M(000M)FF0000(000M)> <B>Strong </B> ");
1066 fprintf(outfp, "<IMG SRC=http://www.ncbi.nlm.nih.gov/gorf/gun2.cgi?0M(000M)FF00FF(000M)> <B>Moderate </B> ");
1067 fprintf(outfp, "<IMG SRC=http://www.ncbi.nlm.nih.gov/gorf/gun2.cgi?0M(000M)00FF00(000M)> <B>Weak </B>\n");
1068 fprintf(outfp, " <B>Segment of suspect origin: </B>");
1069 fprintf(outfp, "<IMG SRC=http://www.ncbi.nlm.nih.gov/gorf/gun2.cgi?0M(000M)FFFF00(000M)>\n\n");
1070
1071
1072 fprintf(outfp, "<B>Segments matching vector: </B>\n");
1073 if (seqloc1)
1074 {
1075 if (seqloc1->choice == SEQLOC_PACKED_INT)
1076 tmp = seqloc1->data.ptrvalue;
1077 fprintf(outfp, "<A HREF=\"http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html#Strong\" TARGET=\"VecScreenInfo\">Strong match</A>: ");
1078 while (tmp)
1079 {
1080 fprintf(outfp, " %ld-%ld", (long) (SeqLocStart(tmp)+1), (long) (SeqLocStop(tmp)+1));
1081 if (tmp->next)
1082 fprintf(outfp, ",");
1083 else
1084 fprintf(outfp, "\n");
1085 tmp = tmp->next;
1086 }
1087 }
1088
1089 if (seqloc2)
1090 {
1091 if (seqloc2->choice == SEQLOC_PACKED_INT)
1092 tmp = seqloc2->data.ptrvalue;
1093 fprintf(outfp, "<A HREF=\"http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html#Moderate\" TARGET=\"VecScreenInfo\">Moderate match</A>:");
1094 while (tmp)
1095 {
1096 fprintf(outfp, " %ld-%ld", (long) (SeqLocStart(tmp)+1), (long) (SeqLocStop(tmp)+1));
1097 if (tmp->next)
1098 fprintf(outfp, ",");
1099 else
1100 fprintf(outfp, "\n");
1101 tmp = tmp->next;
1102 }
1103 }
1104
1105 if (seqloc3)
1106 {
1107 if (seqloc3->choice == SEQLOC_PACKED_INT)
1108 tmp = seqloc3->data.ptrvalue;
1109 fprintf(outfp, "<A HREF=\"http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html#Weak\" TARGET=\"VecScreenInfo\">Weak match</A>: ");
1110 while (tmp)
1111 {
1112 fprintf(outfp, " %ld-%ld", (long) (SeqLocStart(tmp)+1), (long) (SeqLocStop(tmp)+1));
1113 if (tmp->next)
1114 fprintf(outfp, ",");
1115 else
1116 fprintf(outfp, "\n");
1117 tmp = tmp->next;
1118 }
1119 }
1120
1121 if (seqloc4)
1122 {
1123 if (seqloc4->choice == SEQLOC_PACKED_INT)
1124 tmp = seqloc4->data.ptrvalue;
1125 fprintf(outfp, "<A HREF=\"http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html#Suspect\" TARGET=\"VecScreenInfo\">Suspect origin</A>:");
1126 while (tmp)
1127 {
1128 fprintf(outfp, " %ld-%ld", (long) (SeqLocStart(tmp)+1), (long) (SeqLocStop(tmp)+1));
1129 if (tmp->next)
1130 fprintf(outfp, ",");
1131 else
1132 fprintf(outfp, "\n");
1133 tmp = tmp->next;
1134 }
1135 }
1136
1137 fprintf(outfp, "\n");
1138
1139 return TRUE;
1140 }
1141
1142
1143
1144 /*
1145 Prints list of matching segments
1146 */
1147
1148 Boolean LIBCALL
VSPrintListFromSeqLocs(ValNodePtr vnp,FILE * outfp)1149 VSPrintListFromSeqLocs (ValNodePtr vnp, FILE *outfp)
1150
1151 {
1152 SeqLocPtr seqloc1=NULL, seqloc2=NULL, seqloc3=NULL, seqloc4=NULL, tmp;
1153 ValNodePtr var=NULL;
1154
1155 if (outfp == NULL)
1156 return FALSE;
1157
1158 var = vnp;
1159 while (var)
1160 {
1161 if (var->choice == 1)
1162 {
1163 seqloc1 = var->data.ptrvalue;
1164 }
1165 else if (var->choice == 2)
1166 {
1167 seqloc2 = var->data.ptrvalue;
1168 }
1169 else if (var->choice == 3)
1170 {
1171 seqloc3 = var->data.ptrvalue;
1172 }
1173 else if (var->choice == 4)
1174 {
1175 seqloc4 = var->data.ptrvalue;
1176 }
1177 var = var->next;
1178 }
1179
1180 if (!seqloc1 && !seqloc2 && !seqloc3 && !seqloc4)
1181 return TRUE;
1182
1183 if (seqloc1)
1184 {
1185 if (seqloc1->choice == SEQLOC_PACKED_INT)
1186 tmp = seqloc1->data.ptrvalue;
1187 fprintf(outfp, "Strong match\n");
1188 while (tmp)
1189 {
1190 fprintf(outfp, "%ld\t%ld\n", (long) (SeqLocStart(tmp)+1), (long) (SeqLocStop(tmp)+1));
1191 tmp = tmp->next;
1192 }
1193 }
1194
1195 if (seqloc2)
1196 {
1197 if (seqloc2->choice == SEQLOC_PACKED_INT)
1198 tmp = seqloc2->data.ptrvalue;
1199 fprintf(outfp, "Moderate match\n");
1200 while (tmp)
1201 {
1202 fprintf(outfp, "%ld\t%ld\n", (long) (SeqLocStart(tmp)+1), (long) (SeqLocStop(tmp)+1));
1203 tmp = tmp->next;
1204 }
1205 }
1206
1207 if (seqloc3)
1208 {
1209 if (seqloc3->choice == SEQLOC_PACKED_INT)
1210 tmp = seqloc3->data.ptrvalue;
1211 fprintf(outfp, "Weak match\n");
1212 while (tmp)
1213 {
1214 fprintf(outfp, "%ld\t%ld\n", (long) (SeqLocStart(tmp)+1), (long) (SeqLocStop(tmp)+1));
1215 tmp = tmp->next;
1216 }
1217 }
1218
1219 if (seqloc4)
1220 {
1221 if (seqloc4->choice == SEQLOC_PACKED_INT)
1222 tmp = seqloc4->data.ptrvalue;
1223 fprintf(outfp, "Suspect origin\n");
1224 while (tmp)
1225 {
1226 fprintf(outfp, "%ld\t%ld\n", (long) (SeqLocStart(tmp)+1), (long) (SeqLocStop(tmp)+1));
1227 tmp = tmp->next;
1228 }
1229 }
1230
1231 return TRUE;
1232 }
1233
1234
1235 #define BUFFER_LENGTH 255
1236
1237 /*
1238 Prints Id line for list results format
1239 */
1240
1241 Boolean LIBCALL
VSPrintListIdLine(BioseqPtr query_bsp,CharPtr proginfo,CharPtr database,FILE * outfp)1242 VSPrintListIdLine (BioseqPtr query_bsp, CharPtr proginfo, CharPtr database, FILE *outfp)
1243
1244 {
1245
1246 Char idbuf [BUFFER_LENGTH];
1247 CharPtr dbdef, ptr, chptr;
1248 Int4 length;
1249 ReadDBFILEPtr rdfp, rdfp_var;
1250 Boolean first_title;
1251
1252
1253 if (outfp == NULL)
1254 return FALSE;
1255
1256
1257 /* prepare Query ID string for output */
1258
1259 ptr = NULL; chptr = NULL;
1260
1261 if (query_bsp)
1262 ptr = BioseqGetTitle (query_bsp);
1263
1264 if (ptr) {
1265 while (IS_WHITESP(*ptr))
1266 ptr++;
1267
1268 if (*ptr == '"') {
1269 ptr++;
1270 chptr = StringChr (ptr, '"');
1271 }
1272 else {
1273 chptr = StringPBrk (ptr, " \t");
1274 }
1275 }
1276
1277 if (ptr && *ptr != '\0') {
1278 StringNCpy_0 (idbuf, ptr, (chptr ? MIN((chptr - ptr) + 1, BUFFER_LENGTH) : BUFFER_LENGTH));
1279 }
1280 else {
1281 StringCpy (idbuf, "Id_unknown");
1282 }
1283
1284
1285 /* prepare database description(s) for output */
1286
1287 dbdef = NULL; rdfp = NULL;
1288
1289 if (database)
1290 rdfp = readdb_new_ex(database, READDB_DB_IS_NUC, FALSE);
1291
1292 if (rdfp) {
1293 rdfp_var = rdfp;
1294 length = 0;
1295 while (rdfp_var) {
1296 length += StringLen(readdb_get_title(rdfp_var));
1297 length += 3;
1298 rdfp_var = rdfp_var->next;
1299 }
1300 dbdef = MemNew(length*sizeof(Char));
1301 }
1302
1303 if (dbdef) {
1304 ptr = dbdef;
1305 rdfp_var = rdfp;
1306
1307 first_title = TRUE;
1308 while (rdfp_var) {
1309 chptr = readdb_get_title(rdfp_var);
1310
1311 if(chptr == NULL) {
1312 rdfp_var = rdfp_var->next;
1313 continue;
1314 }
1315
1316 if(!first_title) {
1317 *ptr = ';';
1318 ptr++;
1319 *ptr = ' ';
1320 ptr++;
1321 }
1322 else
1323 first_title = FALSE;
1324
1325 StringCpy(ptr, readdb_get_title(rdfp_var));
1326
1327 length = StringLen(ptr);
1328 ptr += length;
1329
1330 rdfp_var = rdfp_var->next;
1331 }
1332
1333 *ptr = '\0';
1334
1335 rdfp = readdb_destruct(rdfp);
1336
1337 }
1338
1339
1340 fprintf(outfp, ">Vector %s Screen: %s Database: %s\n",
1341 idbuf, proginfo ? proginfo : "unknown", dbdef ? dbdef : "unknown");
1342
1343 dbdef = MemFree(dbdef);
1344
1345 return TRUE;
1346
1347 }
1348
1349
1350