1 static char const rcsid[] = "$Id: pobutil.c,v 6.10 2007/05/07 13:30:54 kans Exp $";
2
3 #include <pobutil.h>
4 #include <gather.h>
5 #include <edutil.h>
6 #include <salign.h>
7 #include <salutil.h>
8 #include <salsap.h>
9
10 /*****************************************************************
11 *
12 * clean_empty_seqalign(): clean up Seq-align that has no
13 * data
14 *
15 *****************************************************************/
clean_empty_seqalign(SeqAlignPtr PNTR head)16 SeqAlignPtr clean_empty_seqalign(SeqAlignPtr PNTR head)
17 {
18 SeqAlignPtr curr, next, prev = NULL;
19
20 curr = *head;
21 while(curr)
22 {
23 next = curr->next;
24 if(curr->segs == NULL)
25 {
26 if(prev == NULL)
27 *head = curr->next;
28 else
29 prev->next = curr->next;
30 curr->next = NULL;
31 SeqAlignFree(curr);
32 }
33 else
34 prev = curr;
35 curr = next;
36 }
37
38 return (*head);
39 }
40
41
make_repeat_lib(CharPtr file_name,ValNodePtr PNTR slp_list,Boolean io_type)42 ValNodePtr make_repeat_lib(CharPtr file_name,
43 ValNodePtr PNTR slp_list,
44 Boolean io_type)
45 {
46 FILE *ifp;
47 SeqEntryPtr sep;
48 BioseqPtr bsp;
49 ValNodePtr sep_list = NULL;
50 SeqLocPtr slp;
51 CharPtr next_char;
52
53 if(file_name == NULL)
54 return NULL;
55 *slp_list = NULL;
56
57 /* FILE IO */
58
59 if(io_type == FILE_IO) {
60 if((ifp = FileOpen(file_name, "r")) == NULL) {
61 Message(MSG_POSTERR, "Fail to open Alu data file %s", file_name);
62 return NULL;
63 }
64 while((sep = FastaToSeqEntry(ifp, TRUE)) != NULL) {
65 if(sep !=NULL) {
66 ValNodeAddPointer(&sep_list, 0, sep);
67 bsp = sep->data.ptrvalue;
68 slp = SeqLocIntNew(0, bsp->length-1, Seq_strand_both, bsp->id);
69 ValNodeAddPointer(slp_list, 0, slp);
70 }
71 }
72 FileClose(ifp);
73 } else {
74 /* STDIN IO - WWW Power Blast case */
75 if(file_name[0] == NULLB)
76 return NULL;
77 next_char = file_name;
78 while((sep = FastaToSeqBuff(next_char,
79 &next_char, TRUE)) != NULL) {
80 if(sep !=NULL) {
81 ValNodeAddPointer(&sep_list, 0, sep);
82 bsp = sep->data.ptrvalue;
83 slp = SeqLocIntNew(0, bsp->length-1, Seq_strand_both, bsp->id);
84 ValNodeAddPointer(slp_list, 0, slp);
85 }
86 }
87 }
88 return sep_list;
89 }
90
91
free_alu_list(ValNodePtr alu_sep_list,ValNodePtr alu_slp_list)92 void free_alu_list(ValNodePtr alu_sep_list, ValNodePtr alu_slp_list)
93 {
94 ValNodePtr curr;
95 SeqEntryPtr sep;
96 SeqLocPtr slp;
97
98 for(curr = alu_sep_list; curr!= NULL; curr= curr->next)
99 {
100 sep = (SeqEntryPtr)(curr->data.ptrvalue);
101 SeqEntryFree(sep);
102 }
103 ValNodeFree(alu_sep_list);
104
105 for(curr = alu_slp_list; curr!= NULL; curr= curr->next)
106 {
107 slp= (SeqLocPtr)(curr->data.ptrvalue);
108 SeqLocFree(slp);
109 }
110 ValNodeFree(alu_slp_list);
111 }
112
113
114 #define MIN_K 1
115 #define OFFSET_K 100
cal_K_num(Int4 query_len,Int4 db_len)116 static Int4 cal_K_num(Int4 query_len, Int4 db_len)
117 {
118 Int4 num;
119
120 num = MAX(MIN_K, (query_len/(db_len+OFFSET_K)+1));
121 return num;
122 }
123
124
125
store_align_ends(ValNodePtr ends_list,SeqIdPtr sip)126 static SeqLocPtr store_align_ends(ValNodePtr ends_list, SeqIdPtr sip)
127 {
128
129 RecordEndsPtr rep;
130 ValNodePtr curr;
131 SeqLocPtr slp_head = NULL, new;
132 SeqLocPtr slp = NULL;
133
134
135 if(ends_list == NULL)
136 return NULL;
137
138 for(curr = ends_list; curr != NULL; curr = curr->next)
139 {
140 rep = (RecordEndsPtr)(curr->data.ptrvalue);
141 new = SeqLocIntNew(rep->start, rep->stop, rep->strand, sip);
142 /*printf("%s: start is %ld, stop is %ld, strand is %d\n", rep->name, rep->start, rep->stop, rep->strand);*/
143 ValNodeLink(&slp_head, new);
144 }
145 if(slp_head !=NULL)
146 {
147 slp = ValNodeNew(NULL);
148 slp->choice = SEQLOC_MIX;
149 slp->data.ptrvalue = slp_head;
150 }
151 return slp;
152 }
153
get_word_size(Int4 length)154 static Int4 get_word_size(Int4 length)
155 {
156 /*#ifdef WIN_MSWIN
157 return 6;
158 #endif*/
159
160 /* #ifdef WIN16
161 return 6;
162 #endif
163 */
164
165 if(length < 500)
166 return 6;
167 if(length <10000)
168 return 7;
169 return 8;
170 }
171
172
173 /*******************************************************************
174 *
175 * SaveRepeats(bsp, ends_list)
176 * converts the alignments stored in ends_list into the Seq-feat
177 * repeat feature in bsp
178 *
179 *******************************************************************/
build_repeat_feat(ValNodePtr ends_list,SeqIdPtr sip)180 static SeqFeatPtr build_repeat_feat(ValNodePtr ends_list, SeqIdPtr sip)
181 {
182 RecordEndsPtr rep;
183 SeqFeatPtr sfp = NULL, new_sfp, curr;
184 ImpFeatPtr ifp;
185 /* Char temp[100]; */
186 GBQualPtr qual;
187 Char c='%';
188
189 while(ends_list)
190 {
191 rep = (RecordEndsPtr)(ends_list->data.ptrvalue);
192 new_sfp = SeqFeatNew();
193 new_sfp->data.choice = 8;
194 ifp = ImpFeatNew();
195 ifp->key = StringSave("repeat_region");
196 new_sfp->data.value.ptrvalue = ifp;
197 /* sprintf(temp, "%d%c similarity", (Int2)(100.0 * rep->zscore), c);
198 new_sfp->comment = StringSave(temp); */
199 new_sfp->location = SeqLocIntNew(rep->start, rep->stop, rep->strand, sip);
200 qual = GBQualNew();
201 qual->qual = StringSave("rpt_family");
202 qual->val = StringSave(rep->name);
203 /* if(StringCmp(rep->name, "L1") ==0)
204 qual->val = StringSave("L1");
205 else
206 {
207 if(StrStr(rep->name, "MER"))
208 qual->val = StringSave(rep->name);
209 else
210 qual->val = StringSave("ALU");
211 } */
212 new_sfp->qual = qual;
213
214 if(sfp == NULL)
215 sfp = new_sfp;
216 else
217 {
218 curr = sfp;
219 while(curr->next != NULL)
220 curr = curr->next;
221 curr->next = new_sfp;
222 }
223
224 ends_list = ends_list->next;
225 }
226
227 return sfp;
228 }
229
230
SaveRepeats(BioseqPtr bsp,ValNodePtr ends_list)231 void SaveRepeats(BioseqPtr bsp, ValNodePtr ends_list)
232 {
233 SeqFeatPtr newsfp;
234 SeqAnnotPtr annot, prev;
235 ValNodePtr desc;
236
237 newsfp = build_repeat_feat(ends_list, bsp->id);
238 if(newsfp == NULL)
239 return;
240 desc = ValNodeNew(NULL);
241 desc->choice = Annot_descr_name;
242 desc->data.ptrvalue = StringSave("powblast");
243
244 annot = SeqAnnotNew();
245 annot->type = 1;
246 annot->data = newsfp;
247 annot->desc = desc;
248
249 if(bsp->annot == NULL)
250 bsp->annot = annot;
251 else
252 {
253 prev = bsp->annot;
254 while(prev->next != NULL)
255 prev = prev->next;
256 prev->next = annot;
257 }
258 return;
259 }
260
filter_align(Int4 length,FloatHi zscore,FloatHi score)261 static Boolean filter_align(Int4 length, FloatHi zscore, FloatHi score)
262 {
263 FloatHi MINSCORE = 0.6;
264
265 if(zscore < MINSCORE)
266 return FALSE;
267 return (score > 30.0);
268
269 }
270
271
filter_repeats(SeqLocPtr slp,ValNodePtr alu_slp_list,ValNodePtr PNTR ends_list)272 SeqLocPtr filter_repeats(SeqLocPtr slp, ValNodePtr alu_slp_list, ValNodePtr PNTR ends_list)
273 {
274 SimPam sp;
275 ValNodePtr curr;
276 SeqLocPtr sloc;
277 Int4 length;
278 Int4 k;
279 ValNodePtr new_list, this_list = NULL;
280 SeqLocPtr aluloc = NULL;
281
282 if(alu_slp_list == NULL || slp == NULL)
283 return NULL;
284
285 DefaultSimPam(&sp);
286 length = SeqLocLen(slp);
287 sp.word = get_word_size(length);
288 sp.cutoff = 30.0;
289 this_list = NULL;
290 for(curr = alu_slp_list; curr != NULL; curr= curr->next)
291 {
292 sloc = (SeqLocPtr)(curr->data.ptrvalue);
293 k = cal_K_num(length, SeqLocLen(sloc));
294
295 new_list = NULL;
296 /* SIM2(slp, sloc, k, &sp, &new_list, OUTPUT_ENDS, NULL); */
297 SIM2(slp, sloc, k, &sp, &new_list, OUTPUT_ENDS, (FilterProc)filter_align);
298 this_list = merge_two_list(this_list, new_list);
299 /* Change_Loc_Strand(sloc, Seq_strand_both); */
300 }
301
302 if(this_list != NULL)
303 {
304 this_list = CleanNewList(this_list);
305 aluloc = store_align_ends(this_list, SeqLocId(slp));
306 if(ends_list != NULL)
307 {
308 if(*ends_list == NULL)
309 *ends_list = this_list;
310 else
311 *ends_list = merge_two_list(*ends_list, this_list);
312 }
313 }
314
315 return aluloc;
316 }
317
318
319 /*****************************************************
320 *
321 * filter_repeats_for_bigloc(slp, max_len, min_overlap)
322 * break the slp into smaller pieces and merge the results
323 *
324 * slp: the big Seq-loc
325 * max_len: the maximum allowed length for search
326 * if -1, set to default
327 * min_overlap: the minimum length for search
328 * if -1, set to the default
329 *
330 *******************************************************/
filter_repeats_for_bigloc(SeqLocPtr slp,Int4 max_len,Int4 min_overlap,ValNodePtr alu_slp_list)331 ValNodePtr filter_repeats_for_bigloc(SeqLocPtr slp, Int4 max_len, Int4 min_overlap, ValNodePtr alu_slp_list)
332 {
333 ValNodePtr ends_list = NULL;
334 SeqLocPtr b_loc, loc;
335 SeqLocPtr aluloc;
336
337 if(slp->choice != SEQLOC_INT && slp->choice != SEQLOC_WHOLE)
338 {
339 ErrPostEx(SEV_ERROR, 0, 0, "Sorry, can not handle SeqLoc for type %d", slp->choice);
340 return NULL;
341 }
342 if(max_len == -1)
343 max_len = REPEAT_BREAK_LEN;
344 if(min_overlap == -1)
345 min_overlap = REPEAT_OVERLAP_LEN;
346
347 b_loc = break_blast_job(slp, SeqLocId(slp), REPEAT_BREAK_LEN, min_overlap);
348 for(loc = b_loc; loc != NULL; loc = loc->next)
349 {
350 aluloc = filter_repeats(loc, alu_slp_list, &ends_list);
351 SeqLocSetFree(aluloc);
352 }
353 SeqLocSetFree(b_loc);
354 return ends_list;
355 }
356
357
358
get_output_name(CharPtr name,CharPtr buf)359 void get_output_name(CharPtr name, CharPtr buf)
360 {
361 Int4 len, i;
362
363
364 i =0;
365 for(; buf !=NULL; ++buf)
366 {
367 if(*buf == '\n' || *buf == '\0' || *buf == '|')
368 break;
369 else
370 name[i++] = *buf;
371 }
372 name[i] = '\0';
373 len = i;
374 for(i = len-1; i>=0; --i)
375 {
376 if(name[i] == '.')
377 {
378 name[i] = '\0';
379 return;
380 }
381 }
382 }
383
write_asn_output(CharPtr name,BioseqPtr bsp,SeqAnnotPtr annot,Int4 option,Boolean io_type)384 void write_asn_output(CharPtr name, BioseqPtr bsp, SeqAnnotPtr annot, Int4 option, Boolean io_type)
385 {
386 SeqEntryPtr sep;
387 Char out_file[100];
388 AsnIoPtr aip;
389 SeqAnnotPtr sap, prev, next;
390 Boolean free_it;
391
392 if(option == 0)
393 {
394 SeqAnnotFree(annot);
395 return;
396 }
397 if(bsp == NULL)
398 {
399 SeqAnnotFree(annot);
400 return;
401 }
402
403 if(bsp->annot == NULL)
404 bsp->annot = annot;
405 else
406 {
407 sap = bsp->annot;
408 prev = NULL;
409
410 while(sap)
411 {
412 free_it = FALSE;
413 next = sap->next;
414 if(sap->type == 2)
415 {
416 if(option == 1) /*overwrite*/
417 {
418 free_it = TRUE;
419 if(prev != NULL)
420 prev->next = sap->next;
421 else
422 bsp->annot = sap->next;
423 sap->next = NULL;
424 SeqAnnotFree(next);
425 }
426 }
427 if(!free_it)
428 prev = sap;
429 sap = next;
430 }
431
432 if(prev != NULL)
433 prev->next = annot;
434 else
435 bsp->annot = annot;
436 }
437
438 sep = SeqEntryFind(bsp->id);
439
440 if(io_type == FILE_IO) {
441 sprintf(out_file, "%s.ent", name);
442 aip = AsnIoOpen(out_file, "w");
443 } else { /* for STD_IO */
444 aip = AsnIoNew(ASNIO_TEXT_OUT, stdout, NULL, NULL, NULL);
445 }
446
447 SeqEntryAsnWrite(sep, aip, NULL);
448 AsnIoClose(aip);
449 }
450
451
save_SeqAnnot(SeqAnnotPtr annot,CharPtr name,Boolean io_type)452 Boolean save_SeqAnnot(SeqAnnotPtr annot, CharPtr name, Boolean io_type)
453 {
454 Char sat_name[100];
455 AsnIoPtr aip;
456
457 if(annot->data == NULL)
458 return FALSE;
459
460 if(io_type == FILE_IO) {
461 sprintf(sat_name, "%s.sat", name);
462 aip = AsnIoOpen(sat_name, "w");
463 } else { /* for STD_IO */
464 aip = AsnIoNew(ASNIO_TEXT_OUT, stdout, NULL, NULL, NULL);
465 }
466 SeqAnnotAsnWrite(annot, aip, NULL);
467 AsnIoClose(aip);
468 return TRUE;
469 }
470
load_dust_bsp(BioseqPtr dust_bsp,SeqLocPtr dustloc)471 void load_dust_bsp(BioseqPtr dust_bsp, SeqLocPtr dustloc)
472 {
473 ByteStorePtr bsp;
474 SeqLocPtr slp = NULL;
475 Int4 start, stop;
476 Uint1 res = 'N';
477
478
479 if (dust_bsp->seq_data_type == Seq_code_gap) return;
480
481 if(dust_bsp->mol == Seq_mol_aa)
482 res = 'X';
483
484 bsp = (ByteStorePtr) dust_bsp->seq_data;
485 while(dustloc)
486 {
487 slp = NULL;
488 while((slp = SeqLocFindNext(dustloc, slp))!=NULL)
489 {
490 start = SeqLocStart(slp);
491 stop = SeqLocStop(slp);
492 BSSeek(bsp, start, SEEK_SET);
493 for(; start <=stop; ++start)
494 BSPutByte(bsp, (Int2)res);
495 }
496 dustloc = dustloc->next;
497 }
498 }
499
500
501 /*mask the current sequence with the repeat feature*/
502 typedef struct mask_repeat {
503 BioseqPtr bsp;
504 BioseqPtr dust_bsp;
505 }MaskFeat, PNTR MaskFeatPtr;
506
MaskWithRepeatFeature(SeqEntryPtr sep,Pointer data,Int4 index,Int2 indent)507 static void MaskWithRepeatFeature(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
508 {
509 SeqAnnotPtr annot;
510 SeqFeatPtr sfp;
511 BioseqPtr bsp;
512 BioseqSetPtr bssp;
513 SeqIdPtr sip;
514 MaskFeatPtr mfp;
515 ImpFeatPtr ifp;
516
517 mfp = (MaskFeatPtr)(data);
518 if(sep->choice == 1)
519 {
520 bsp = sep->data.ptrvalue;
521 annot = bsp->annot;
522 }
523 else
524 {
525 bssp = sep->data.ptrvalue;
526 annot = bssp->annot;
527 }
528
529 while(annot)
530 {
531 if(annot->type == 1)
532 {
533 sfp = annot->data;
534 while(sfp)
535 {
536 if(sfp->data.choice == 8)
537 {
538 sip = SeqLocId(sfp->location);
539 if(BioseqMatch(mfp->bsp, sip))
540 {
541 ifp = sfp->data.value.ptrvalue;
542 if(ifp->key != NULL)
543 {
544 if(StringNCmp(ifp->key, "repeat", 6) == 0)
545 load_dust_bsp(mfp->dust_bsp, sfp->location);
546 }
547 }
548 }
549 sfp = sfp->next;
550 }
551 }
552
553 annot = annot->next;
554 }
555 }
556
557 /*******************************************************************
558 *
559 * look for the repeat feature in sep and mask the residue in
560 * dust_bsp
561 *
562 *******************************************************************/
mask_with_repeat_feature(BioseqPtr bsp,SeqEntryPtr sep,BioseqPtr dust_bsp)563 void mask_with_repeat_feature(BioseqPtr bsp, SeqEntryPtr sep, BioseqPtr dust_bsp)
564 {
565 MaskFeat mf;
566
567 mf.bsp = bsp;
568 mf.dust_bsp = dust_bsp;
569
570 SeqEntryExplore(sep, (Pointer)&mf, MaskWithRepeatFeature);
571 }
572
make_dust_bsp(BioseqPtr bsp,Int4 start,Int4 stop,SeqLocPtr dustloc)573 BioseqPtr make_dust_bsp(BioseqPtr bsp, Int4 start, Int4 stop, SeqLocPtr dustloc)
574 {
575 BioseqPtr dust_bsp;
576 SeqPortPtr spp;
577 ByteStorePtr b_store;
578 Uint1 residue;
579 Uint1 code;
580
581 if(bsp->mol == Seq_mol_aa)
582 code = Seq_code_iupacaa;
583 else
584 code = Seq_code_iupacna;
585
586 spp = SeqPortNew(bsp, start, stop, Seq_strand_plus, code);
587 b_store = BSNew(stop - start +2);
588 BSSeek(b_store, 0, SEEK_SET);
589 while((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF)
590 BSPutByte(b_store, (Int2)residue);
591 SeqPortFree(spp);
592
593
594 dust_bsp = BioseqNew();
595 dust_bsp->id = local_id_make("dust_bsp");
596 dust_bsp->seq_data = (SeqDataPtr) b_store;
597 dust_bsp->repr = Seq_repr_raw;
598 dust_bsp->mol = bsp->mol;
599 dust_bsp->length = stop - start +1;
600 dust_bsp->topology = 1;
601 dust_bsp->seq_data_type = code;
602
603 if(dustloc != NULL)
604 load_dust_bsp(dust_bsp, dustloc);
605 return dust_bsp;
606 }
607
prt_FASTA_file(BioseqPtr bsp,SeqLocPtr dustloc,Boolean io_type)608 void prt_FASTA_file(BioseqPtr bsp, SeqLocPtr dustloc, Boolean io_type)
609 {
610 FILE *fp;
611
612 load_dust_bsp(bsp, dustloc);
613 if(io_type == FILE_IO)
614 fp = FileOpen("xxx.out", "w");
615 else
616 fp = stdout;
617 BioseqRawToFasta(bsp, fp, TRUE);
618 if(io_type == FILE_IO)
619 FileClose(fp);
620 }
621
622
restore_blast_id(SeqAnnotPtr sap,SeqIdPtr sip)623 void restore_blast_id(SeqAnnotPtr sap, SeqIdPtr sip)
624 {
625 SeqAlignPtr align;
626 DenseDiagPtr ddp;
627 DenseSegPtr dsp;
628 SeqIdPtr next_id;
629 StdSegPtr ssp;
630 SeqLocPtr slp;
631 SeqIntPtr sint;
632
633 if(sap == NULL || sip == NULL)
634 return;
635 align = sap->data;
636 while(align)
637 {
638 switch(align->segtype)
639 {
640 case 1:
641 ddp = align->segs;
642 while(ddp)
643 {
644 next_id = ddp->id->next;
645 ddp->id->next = NULL;
646 SeqIdFree(ddp->id);
647 ddp->id = SeqIdDup(sip);
648 ddp->id->next = next_id;
649 ddp = ddp->next;
650 }
651 break;
652
653 case 2:
654 dsp = align->segs;
655 if(dsp)
656 {
657 next_id = dsp->ids->next;
658 dsp->ids->next = NULL;
659 SeqIdFree(dsp->ids);
660 dsp->ids = SeqIdDup(sip);
661 dsp->ids->next = next_id;
662 }
663 break;
664
665 case 3:
666 ssp = align->segs;
667 while(ssp)
668 {
669 next_id = ssp->ids->next;
670 ssp->ids->next = NULL;
671 SeqIdFree(ssp->ids);
672 ssp->ids = SeqIdDup(sip);
673 ssp->ids->next = next_id;
674
675 slp = ssp->loc;
676 if(slp->choice == SEQLOC_INT)
677 {
678 sint = slp->data.ptrvalue;
679 SeqIdFree(sint->id);
680 sint->id = SeqIdDup(sip);
681 }
682 ssp = ssp->next;
683 }
684 break;
685
686 default:
687 break;
688 }
689 align = align->next;
690 }
691 }
692
link_ddp(DenseDiagPtr PNTR head,DenseDiagPtr ddp_2)693 static void link_ddp(DenseDiagPtr PNTR head, DenseDiagPtr ddp_2)
694 {
695 DenseDiagPtr ddp_1;
696
697 if(*head == NULL)
698 *head = ddp_2;
699 else
700 {
701 ddp_1 = *head;
702 while(ddp_1->next != NULL)
703 ddp_1 = ddp_1->next;
704 ddp_1->next = ddp_2;
705 }
706 }
707
link_blast_sap(SeqAnnotPtr PNTR blast_sap,SeqAnnotPtr new)708 void link_blast_sap(SeqAnnotPtr PNTR blast_sap, SeqAnnotPtr new)
709 {
710 SeqAlignPtr align, new_align, next_new_align;
711 SeqAnnotPtr head;
712 DenseDiagPtr ddp, new_ddp;
713 SeqIdPtr new_id;
714
715 if(*blast_sap == NULL)
716 *blast_sap = new;
717 else
718 {
719 head = *blast_sap;
720 align = head->data;
721 if(align == NULL)
722 head->data = new->data;
723 else
724 {
725 new_align = new->data;
726 while(new_align)
727 {
728 next_new_align = new_align->next;
729 new_align->next = NULL;
730 new_ddp = new_align->segs;
731 new_id = new_ddp->id->next;
732
733 align = head->data;
734 while(align)
735 {
736 ddp = align->segs;
737 if(SeqIdMatch(new_id, ddp->id->next))
738 {
739 link_ddp(&ddp, new_ddp);
740 new_align->segs = NULL;
741 SeqAlignFree(new_align);
742 break;
743 }
744 align = align->next;
745 }
746 if(align == NULL)
747 head->data = link_align(new_align, (SeqAlignPtr )(head->data));
748 new_align = next_new_align;
749 }
750 }
751 new->data = NULL;
752 SeqAnnotFree(new);
753 }
754 }
755
get_strand_val(Uint1Ptr strands,Int2 i)756 static Uint1 get_strand_val(Uint1Ptr strands, Int2 i)
757 {
758 if(strands == NULL)
759 return Seq_strand_plus;
760 if(strands[i] != Seq_strand_minus)
761 return Seq_strand_plus;
762 else
763 return Seq_strand_minus;
764 }
765
filter_self_diagnol(DenseDiagPtr PNTR head)766 static DenseDiagPtr filter_self_diagnol(DenseDiagPtr PNTR head)
767 {
768 DenseDiagPtr prev = NULL, ddp;
769 DenseDiagPtr next;
770 Uint1 strand_1, strand_2;
771
772 ddp = *head;
773 while(ddp)
774 {
775 next = ddp->next;
776 strand_1 = get_strand_val(ddp->strands, 0);
777 strand_2 = get_strand_val(ddp->strands, 1);
778 if((strand_1 == strand_2) && (ddp->starts[0] == ddp->starts[1]))
779 {
780 if(prev == NULL)
781 *head = next;
782 else
783 prev->next = next;
784 ddp->next = NULL;
785 DenseDiagFree(ddp);
786 }
787 else
788 prev = ddp;
789 ddp = next;
790 }
791
792 return (*head);
793 }
794
795
796
filter_blast_query(SeqAnnotPtr blast_sap,BioseqPtr query)797 void filter_blast_query(SeqAnnotPtr blast_sap, BioseqPtr query)
798 {
799 SeqAlignPtr align, next, prev = NULL;
800 DenseDiagPtr ddp;
801 Boolean free_it;
802
803
804 if(query == NULL || blast_sap == NULL)
805 return;
806
807 align = blast_sap->data;
808 if(align == NULL)
809 return;
810 while(align)
811 {
812 next = align->next;
813 free_it = FALSE;
814 switch(align->segtype) {
815 case 1:
816 ddp = align->segs;
817 if(ddp && ddp->id != NULL)
818 {
819 if(BioseqMatch(query, ddp->id->next))
820 {
821 filter_self_diagnol(&ddp);
822 if(ddp == NULL)
823 {
824 if(prev == NULL)
825 blast_sap->data = next;
826 else
827 prev->next = next;
828 align->next = NULL;
829 align->segs = NULL;
830 SeqAlignFree(align);
831 free_it = TRUE;
832 }
833 else
834 align->segs = ddp;
835
836 }
837 }
838 break;
839 case 2:
840 break;
841 case 3:
842 break;
843 default:
844 break;
845 }
846 if(!free_it)
847 prev = align;
848 align = next;
849 }
850 }
851
852
853
854
855
856
857
858
859 /*********************************************************************
860 *
861 * break_blast_job(slp)
862 * if the Seq-loc is too long for blast search, break it into
863 * overlapping pieces to search more efficiently
864 * slp: the query sequence
865 * return a list of Seq-locs for blast search
866 *
867 *********************************************************************/
break_blast_job(SeqLocPtr slp,SeqIdPtr sip,Int4 max_len,Int4 overlap_len)868 SeqLocPtr break_blast_job(SeqLocPtr slp, SeqIdPtr sip, Int4 max_len, Int4 overlap_len)
869 {
870 Int4 K;
871 Int4 length, len;
872 SeqLocPtr head = NULL, new;
873 Int4 start, stop;
874 Uint1 strand;
875
876 length = SeqLocLen(slp);
877 start = SeqLocStart(slp);
878 stop = SeqLocStop(slp);
879 strand = SeqLocStrand(slp);
880
881 if(length < max_len)
882 return SeqLocIntNew(start, stop, strand, sip);
883 else
884 {
885 K = (length)/max_len;
886 if((length)%max_len!= 0)
887 ++K;
888 len = length/K + overlap_len;
889 while(K >0)
890 {
891 if(K == 1)
892 stop = SeqLocStop(slp);
893 else
894 stop = (start + len -1);
895 new = SeqLocIntNew(start, stop, strand, sip);
896 ValNodeLink(&head, new);
897 start += (len - overlap_len-1);
898 --K;
899 }
900 return head;
901 }
902 }
903
904
905 /*##################################################################
906 #
907 # functions related to merge_blast_result
908 #
909 ###################################################################*/
910
DenseDiagForSameSequence(DenseDiagPtr ddp_1,DenseDiagPtr ddp_2)911 static Boolean DenseDiagForSameSequence(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2)
912 {
913 SeqIdPtr sip_1, sip_2;
914
915 if(ddp_1 == NULL || ddp_2 == NULL)
916 return FALSE;
917
918 if(ddp_1->dim != ddp_2->dim)
919 return FALSE;
920 sip_1 = ddp_1->id;
921 sip_2 = ddp_2->id;
922 while(sip_1 && sip_2)
923 {
924 if(!SeqIdMatch(sip_1, sip_2))
925 return FALSE;
926 sip_1 = sip_1->next;
927 sip_2 = sip_2->next;
928 }
929 return TRUE;
930 }
931
is_same_orientation(Uint1 strand_1,Uint1 strand_2)932 static Boolean is_same_orientation(Uint1 strand_1, Uint1 strand_2)
933 {
934 if(strand_1 == strand_2)
935 return TRUE;
936 if(strand_1 == Seq_strand_minus)
937 return (strand_2 == Seq_strand_minus);
938 else
939 return (strand_2 != Seq_strand_minus);
940 }
941
DenseDiagSameOrient(DenseDiagPtr ddp_1,DenseDiagPtr ddp_2)942 static Boolean DenseDiagSameOrient(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2)
943 {
944 Int2 dim, i;
945 Uint1 strand_1, strand_2;
946
947 if(ddp_1 == NULL || ddp_2 == NULL)
948 return FALSE;
949 if(ddp_1->dim != ddp_2->dim)
950 return FALSE;
951
952 dim = ddp_1->dim;
953 for(i =0; i<dim; ++i)
954 {
955 strand_1 = get_strand_val(ddp_1->strands, i);
956 strand_2 = get_strand_val(ddp_2->strands, i);
957 if(!is_same_orientation(strand_1, strand_2))
958 return FALSE;
959 }
960
961 return TRUE;
962 }
963
964
StdSegForSameSequence(StdSegPtr ssp_1,StdSegPtr ssp_2)965 static Boolean StdSegForSameSequence(StdSegPtr ssp_1, StdSegPtr ssp_2)
966 {
967 SeqIdPtr sip_1, sip_2;
968 SeqLocPtr loc_1, loc_2;
969
970 if(ssp_1 == NULL || ssp_2 == NULL)
971 return FALSE;
972
973 loc_1 = ssp_1->loc;
974 loc_2 = ssp_2->loc;
975 while(loc_1 && loc_2)
976 {
977 sip_1 = SeqLocId(loc_1);
978 sip_2 = SeqLocId(loc_2);
979 if(!SeqIdMatch(sip_1, sip_2))
980 return FALSE;
981 loc_1 = loc_1->next;
982 loc_2 = loc_2->next;
983 }
984 return TRUE;
985 }
986
987
988
StdSegSameOrient(StdSegPtr ssp_1,StdSegPtr ssp_2)989 static Boolean StdSegSameOrient(StdSegPtr ssp_1, StdSegPtr ssp_2)
990 {
991 Uint1 strand_1, strand_2;
992 SeqLocPtr loc_1, loc_2;
993
994 if(ssp_1 == NULL || ssp_2 == NULL)
995 return FALSE;
996 if(ssp_1->dim != ssp_2->dim)
997 return FALSE;
998
999 loc_1 = ssp_1->loc;
1000 loc_2 = ssp_2->loc;
1001 while(loc_1 && loc_2)
1002 {
1003 strand_1 = SeqLocStrand(loc_1);
1004 strand_2 = SeqLocStrand(loc_2);
1005 if(!is_same_orientation(strand_1, strand_2))
1006 return FALSE;
1007 loc_1 = loc_1->next;
1008 loc_2 = loc_2->next;
1009 }
1010 return TRUE;
1011 }
1012
1013
StdSegLink(StdSegPtr PNTR head,StdSegPtr ssp)1014 static void StdSegLink (StdSegPtr PNTR head, StdSegPtr ssp)
1015 {
1016 StdSegPtr curr;
1017
1018 curr = *head;
1019 if(curr == NULL)
1020 *head = ssp;
1021 else
1022 {
1023 while(curr->next != NULL)
1024 curr = curr->next;
1025 curr->next = ssp;
1026 }
1027 }
1028
merge_StdSeg(StdSegPtr ssp_1,StdSegPtr ssp_2)1029 static Boolean merge_StdSeg(StdSegPtr ssp_1, StdSegPtr ssp_2)
1030 {
1031 Int4 overlap = -1;
1032 Uint1 strand_1, strand;
1033 Int4 start_1, stop_1, start_2, stop_2;
1034 Int4 a_start, a_stop; /*start and stop position of an aa sequence*/
1035 SeqIntPtr sint;
1036 SeqLocPtr slp;
1037
1038
1039 if(!StdSegSameOrient(ssp_1, ssp_2))
1040 return FALSE;
1041
1042 strand_1 = SeqLocStrand(ssp_1->loc);
1043 strand = strand_1;
1044 start_1 = SeqLocStart(ssp_1->loc);
1045 stop_1 = SeqLocStop(ssp_1->loc);
1046 start_2 = SeqLocStart(ssp_2->loc);
1047 stop_2 = SeqLocStop(ssp_2->loc);
1048 if(start_2 > stop_1) /*no overlaps*/
1049 return FALSE;
1050 else
1051 {
1052 if((stop_1 - start_2 + 1)%3 != 0) /*check if they are in frame*/
1053 return FALSE;
1054 else
1055 {
1056 if(strand == Seq_strand_minus)
1057 {
1058 a_stop = SeqLocStart(ssp_2->loc->next);
1059 a_start = SeqLocStop(ssp_1->loc->next);
1060 }
1061 else
1062 {
1063 a_stop = SeqLocStop(ssp_1->loc->next);
1064 a_start = SeqLocStart(ssp_2->loc->next);
1065 }
1066 if((a_stop - a_start +1) *3 != (stop_1 - start_2 +1))
1067 return FALSE;
1068 else
1069 {
1070 slp = ssp_1->loc;
1071 if(slp->choice != SEQLOC_INT)
1072 return FALSE;
1073 sint = slp->data.ptrvalue;
1074 sint->to = MAX(sint->to, stop_2);
1075
1076 slp = ssp_1->loc->next;
1077 sint = slp->data.ptrvalue;
1078 if(strand == Seq_strand_minus)
1079 sint->from = MIN(sint->from, SeqLocStart(ssp_2->loc->next));
1080 else
1081 sint->to = MAX(sint->to, SeqLocStop(ssp_2->loc->next));
1082 return TRUE;
1083 }
1084 }
1085 }
1086
1087 }
1088
1089
1090 /*******************************************************************
1091 *
1092 * merge_DenseDiag(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2)
1093 * merge the two DenseDiags if they overlap and form the same
1094 * aligned diagnol. It is assumed that ddp_1 is a diag computed
1095 * on a location before ddp_2. Their relative order is taken
1096 * into account
1097 *
1098 ********************************************************************/
merge_DenseDiag(DenseDiagPtr ddp_1,DenseDiagPtr ddp_2)1099 static Boolean merge_DenseDiag(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2)
1100 {
1101 Int2 i, dim;
1102 Int4 overlap = -1;
1103 Uint1 strand;
1104 Int4 start, stop;
1105
1106 /*it is assumed that the dimension and Seq-id has been checked before-hand*/
1107 if(!DenseDiagSameOrient(ddp_1, ddp_2))
1108 return FALSE;
1109
1110 dim = ddp_1->dim;
1111 for(i =0; i<dim; ++i)
1112 {
1113 strand = get_strand_val(ddp_1->strands, i);
1114 if(strand== Seq_strand_minus)
1115 {
1116 start = ddp_2->starts[i] + ddp_2->len -1;
1117 stop = ddp_1->starts[i];
1118 }
1119 else
1120 {
1121 start = ddp_2->starts[i];
1122 stop = ddp_1->starts[i] + ddp_1->len -1;
1123 }
1124 if(start > stop) /*no overlap*/
1125 return FALSE;
1126 if(overlap == -1)
1127 overlap = stop - start;
1128 else
1129 if(stop - start != overlap)
1130 return FALSE;
1131 }
1132
1133 if(overlap == -1)
1134 return FALSE;
1135 for(i = 0; i<dim; ++i)
1136 {
1137 if(get_strand_val(ddp_1->strands, i) == Seq_strand_minus)
1138 ddp_1->starts[i] -= overlap;
1139 }
1140 ddp_1->len += (ddp_2->len -1 - overlap);
1141 return TRUE;
1142 }
1143
1144
1145
1146
1147 /*******************************************************************
1148 *
1149 * align_for_same_sequences(sap_1, sap_2)
1150 * check if the same sequences are involved in alignments
1151 *
1152 ********************************************************************/
align_for_same_sequences(SeqAlignPtr sap_1,SeqAlignPtr sap_2)1153 static Boolean align_for_same_sequences(SeqAlignPtr sap_1, SeqAlignPtr sap_2)
1154 {
1155 DenseDiagPtr ddp_1, ddp_2;
1156 StdSegPtr ssp_1, ssp_2;
1157
1158 if(sap_1->segtype != sap_2->segtype)
1159 return FALSE;
1160 if(sap_1->segs == NULL || sap_2->segs == NULL)
1161 return FALSE;
1162
1163 switch(sap_1->segtype)
1164 {
1165 case 1: /*it is a Dense-diag*/
1166 ddp_1 = sap_1->segs;
1167 ddp_2 = sap_2->segs;
1168 return DenseDiagForSameSequence(ddp_1, ddp_2);
1169
1170 case 3:
1171 ssp_1 = sap_1->segs;
1172 ssp_2 = sap_2->segs;
1173 return StdSegForSameSequence(ssp_1, ssp_2);
1174 default:
1175 return FALSE;
1176 }
1177 }
1178
find_previous_align_in_list(SeqAlignPtr list,SeqAlignPtr new_sap)1179 static SeqAlignPtr find_previous_align_in_list(SeqAlignPtr list, SeqAlignPtr new_sap)
1180 {
1181 while(list)
1182 {
1183 if(align_for_same_sequences(list, new_sap))
1184 return list;
1185 list = list->next;
1186 }
1187
1188 return NULL;
1189 }
1190
1191
get_score_value(ScorePtr sp)1192 Int4 get_score_value(ScorePtr sp)
1193 {
1194 ObjectIdPtr oip;
1195
1196 while(sp)
1197 {
1198 if(sp->id)
1199 {
1200 oip = sp->id;
1201 if(oip && oip->str && StringCmp(oip->str, "score") == 0)
1202 {
1203 if(sp->choice == 1)
1204 return sp->value.intvalue;
1205 }
1206 }
1207 sp = sp->next;
1208 }
1209
1210 return -1;
1211 }
1212
replace_with_big_score(DenseDiagPtr free_ddp,DenseDiagPtr ddp)1213 static Boolean replace_with_big_score (DenseDiagPtr free_ddp, DenseDiagPtr ddp)
1214 {
1215 Int4 free_score, score;
1216
1217 if(free_ddp == NULL || ddp == NULL)
1218 return FALSE;
1219 if(ddp->scores == NULL)
1220 {
1221 ddp->scores = free_ddp->scores;
1222 free_ddp->scores = NULL;
1223 return TRUE;
1224 }
1225 else if(free_ddp->scores != NULL)
1226 {
1227 free_score = get_score_value(free_ddp->scores);
1228 score = get_score_value(ddp->scores);
1229
1230 if(free_score > score)
1231 {
1232 ScoreSetFree(ddp->scores);
1233 ddp->scores = free_ddp->scores;
1234 free_ddp->scores = NULL;
1235 return TRUE;
1236 }
1237 }
1238
1239 return FALSE;
1240 }
merge_one_align(SeqAlignPtr list,SeqAlignPtr new_sap)1241 static Boolean merge_one_align(SeqAlignPtr list, SeqAlignPtr new_sap)
1242 {
1243 SeqAlignPtr prev_sap;
1244 DenseDiagPtr ddp_1, ddp_2;
1245 DenseDiagPtr newddp;
1246 DenseDiagPtr next;
1247 Boolean merge;
1248 StdSegPtr ssp_1, ssp_2;
1249 StdSegPtr new_ssp;
1250 StdSegPtr next_ssp;
1251
1252
1253 prev_sap = find_previous_align_in_list(list, new_sap);
1254 if(prev_sap == NULL)
1255 return FALSE;
1256
1257 switch(prev_sap->segtype)
1258 {
1259 case 1: /*it is the Dense-diag*/
1260 ddp_2 = (DenseDiagPtr)(new_sap->segs);
1261 newddp = NULL;
1262 while(ddp_2)
1263 {
1264 next = ddp_2->next;
1265 ddp_2->next = NULL;
1266 ddp_1 = (DenseDiagPtr)(prev_sap->segs);
1267 merge = FALSE;
1268 while(ddp_1 )
1269 {
1270 merge = merge_DenseDiag(ddp_1, ddp_2);
1271 if(merge)
1272 break;
1273 else
1274 ddp_1 = ddp_1->next;
1275 }
1276 if(merge)
1277 {
1278 replace_with_big_score(ddp_2, ddp_1);
1279 DenseDiagFree(ddp_2);
1280 }
1281 else
1282 link_ddp(&newddp, ddp_2);
1283
1284 ddp_2 = next;
1285 }
1286 if(newddp)
1287 link_ddp((DenseDiagPtr PNTR)&(prev_sap->segs), newddp);
1288 new_sap->segs = NULL;
1289 return TRUE;
1290
1291 case 3:
1292 ssp_2 = (StdSegPtr)(new_sap->segs);
1293 new_ssp = NULL;
1294 while(ssp_2)
1295 {
1296 next_ssp = ssp_2->next;
1297 ssp_2->next = NULL;
1298 merge = FALSE;
1299 ssp_1 = (StdSegPtr)(prev_sap->segs);
1300 while(ssp_1 && !merge)
1301 {
1302 merge = merge_StdSeg(ssp_1, ssp_2);
1303 ssp_1 = ssp_1->next;
1304 }
1305 if(merge)
1306 StdSegFree(ssp_2);
1307 else
1308 StdSegLink(&new_ssp, ssp_2);
1309 ssp_2 = next_ssp;
1310 }
1311 if(new_ssp)
1312 StdSegLink((StdSegPtr PNTR)&(prev_sap->segs), new_ssp);
1313 new_sap->segs = NULL;
1314 return TRUE;
1315
1316 default:
1317 return FALSE;
1318 }
1319
1320 }
1321
1322 /*******************************************************************
1323 *
1324 * merge_blast_result(head, new)
1325 * when the input query is too big, it is broken into several
1326 * overlapping Seq-loc to be sent to blast. The results can be
1327 * merged if two Dense-diags are actually represent one HSP
1328 *
1329 ********************************************************************/
merge_blast_result(SeqAnnotPtr PNTR head,SeqAnnotPtr new)1330 SeqAnnotPtr merge_blast_result(SeqAnnotPtr PNTR head, SeqAnnotPtr new)
1331 {
1332 SeqAlignPtr list, align, next;
1333 SeqAlignPtr newlist = NULL;
1334
1335 if(*head == NULL)
1336 *head = new;
1337 else
1338 {
1339 list = (*head)->data;
1340 align = new->data;
1341 while(align)
1342 {
1343 next = align->next;
1344 align->next = NULL;
1345 if(merge_one_align(list, align))
1346 SeqAlignFree(align);
1347 else
1348 newlist = link_align(align, newlist);
1349 align = next;
1350 }
1351 if(newlist !=NULL)
1352 {
1353 list = link_align(newlist, list);
1354 (*head)->data = list;
1355 }
1356 new->data = NULL;
1357 SeqAnnotFree(new);
1358 }
1359 return (*head);
1360 }
1361
GetOrg(AsnExpOptStructPtr aeosp)1362 static void LIBCALLBACK GetOrg (AsnExpOptStructPtr aeosp)
1363 {
1364 OrgRefPtr orp;
1365 CharPtr PNTR org_name;
1366
1367 if (aeosp->dvp->intvalue != START_STRUCT){
1368 return;
1369 }
1370
1371 orp = (OrgRefPtr)aeosp->the_struct;
1372 org_name = aeosp->data;
1373 if(*org_name == NULL && orp->taxname)
1374 *org_name = StringSave(orp->taxname);
1375 return;
1376 }
1377
1378 /********************************************************************
1379 *
1380 * get_organism(bsp)
1381 * filter the taxname from the organism field in Bioseq
1382 *
1383 ********************************************************************/
get_organism(BioseqPtr bsp)1384 CharPtr get_organism(BioseqPtr bsp)
1385 {
1386 SeqEntryPtr sep;
1387 AsnIoPtr aip;
1388 CharPtr org_name;
1389 AsnExpOptPtr aeop;
1390
1391 sep = SeqEntryFind(bsp->id);
1392 aip = AsnIoNullOpen();
1393
1394 org_name = NULL;
1395 if ((aeop = AsnExpOptNew(aip, "Org-ref", (Pointer)(&org_name), GetOrg)) ==NULL)
1396 ErrPost(100, 1, "no aeop 1");
1397 SeqEntryAsnWrite(sep, aip, NULL);
1398 AsnExpOptFree(aip, aeop);
1399 AsnIoClose(aip);
1400 return org_name;
1401 }
1402
1403
merge_dense_diag_internal_repeats(DenseDiagPtr ddp_1,DenseDiagPtr ddp_2,BoolPtr pkeep_1)1404 static Boolean merge_dense_diag_internal_repeats(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2, BoolPtr pkeep_1)
1405 {
1406 Int4 a_start_1, a_stop_1;
1407 Int4 a_start_2, a_stop_2;
1408 Int4 b_start_1, b_stop_1;
1409 Int4 b_start_2, b_stop_2;
1410 Boolean keep_1;
1411
1412 if(!DenseDiagSameOrient(ddp_1, ddp_2))
1413 return FALSE;
1414
1415 /*check overlap*/
1416 a_start_1 = ddp_1->starts[0];
1417 a_stop_1 = a_start_1 + ddp_1->len -1;
1418 a_start_2 = ddp_2->starts[0];
1419 a_stop_2 = a_start_2 + ddp_2->len -1;
1420
1421 if(a_start_2 >= a_start_1 && a_stop_2 <= a_stop_1)
1422 keep_1 = TRUE;
1423 else if(a_start_1 >= a_start_2 && a_stop_1 <= a_stop_2)
1424 keep_1 = FALSE;
1425 else
1426 return FALSE;
1427
1428 b_start_1 = ddp_1->starts[1];
1429 b_stop_1 = b_start_1 + ddp_1->len -1;
1430 b_start_2 = ddp_2->starts[1];
1431 b_stop_2 = b_start_2 + ddp_2->len -1;
1432
1433 *pkeep_1 = keep_1;
1434 if(keep_1)
1435 return (b_start_2 >= b_start_1 && b_stop_2 <= b_stop_1);
1436 else
1437 return (b_start_1 >= b_start_2 && b_stop_1 <= b_stop_2);
1438 }
1439
merge_std_seg_internal_repeats(StdSegPtr ssp_1,StdSegPtr ssp_2,BoolPtr pkeep_1)1440 static Boolean merge_std_seg_internal_repeats(StdSegPtr ssp_1, StdSegPtr ssp_2, BoolPtr pkeep_1)
1441 {
1442 Int4 a_start_1, a_stop_1;
1443 Int4 a_start_2, a_stop_2;
1444 Int4 b_start_1, b_stop_1;
1445 Int4 b_start_2, b_stop_2;
1446 Boolean keep_1;
1447 SeqLocPtr loc_1, loc_2;
1448
1449 if(!StdSegSameOrient(ssp_1, ssp_2))
1450 return FALSE;
1451
1452
1453 /*check overlap*/
1454 loc_1 = ssp_1->loc;
1455 loc_2 = ssp_2->loc;
1456
1457 a_start_1 = SeqLocStart(loc_1);
1458 a_stop_1 = SeqLocStop(loc_1);
1459 a_start_2 = SeqLocStart(loc_2);
1460 a_stop_2 = SeqLocStop(loc_2);
1461
1462 if(a_start_2 >= a_start_1 && a_stop_2 <= a_stop_1)
1463 keep_1 = TRUE;
1464 else if(a_start_1 >= a_start_2 && a_stop_1 <= a_stop_2)
1465 keep_1 = FALSE;
1466 else
1467 return FALSE;
1468
1469 loc_1 = ssp_1->loc->next;
1470 loc_2 = ssp_2->loc->next;
1471 b_start_1 = SeqLocStart(loc_1);
1472 b_stop_1 = SeqLocStop(loc_1);
1473 b_start_2 = SeqLocStart(loc_2);
1474 b_stop_2 = SeqLocStop(loc_2);
1475
1476 *pkeep_1 = keep_1;
1477 if(keep_1)
1478 return (b_start_2 >= b_start_1 && b_stop_2 <= b_stop_1);
1479 else
1480 return (b_start_1 >= b_start_2 && b_stop_1 <= b_stop_2);
1481 }
1482
1483
1484 static SeqAlignPtr clean_repeats_in_chain PROTO((SeqAlignPtr head));
1485
1486
1487
1488
clean_dense_diag(DenseDiagPtr curr,DenseDiagPtr PNTR chain)1489 static Boolean clean_dense_diag(DenseDiagPtr curr, DenseDiagPtr PNTR chain)
1490 {
1491
1492 DenseDiagPtr c_next, c_prev, next;
1493 Boolean keep_1;
1494
1495 c_prev = NULL;
1496 c_next = *chain;
1497
1498 while(c_next)
1499 {
1500 keep_1 = FALSE;
1501 next = c_next->next;
1502 if(merge_dense_diag_internal_repeats(curr, c_next, &keep_1))
1503 {
1504 if(keep_1)
1505 {
1506 c_next->next = NULL;
1507 replace_with_big_score(c_next, curr);
1508 DenseDiagFree(c_next);
1509 if(c_prev == NULL)
1510 *chain = next;
1511 else
1512 c_prev->next = next;
1513 }
1514 else
1515 return TRUE;
1516 }
1517 if(!keep_1)
1518 c_prev = c_next;
1519 c_next = next;
1520 }
1521
1522 return FALSE;
1523 }
1524
1525
clean_std_seg(StdSegPtr curr,StdSegPtr PNTR chain)1526 static Boolean clean_std_seg(StdSegPtr curr, StdSegPtr PNTR chain)
1527 {
1528
1529 StdSegPtr c_next, c_prev, next;
1530 Boolean keep_1;
1531
1532 c_prev = NULL;
1533 c_next = *chain;
1534
1535 while(c_next)
1536 {
1537 keep_1 = FALSE;
1538 next = c_next->next;
1539 if(merge_std_seg_internal_repeats(curr, c_next, &keep_1))
1540 {
1541 if(keep_1)
1542 {
1543 c_next->next = NULL;
1544 StdSegFree(c_next);
1545 if(c_prev == NULL)
1546 *chain = next;
1547 else
1548 c_prev->next = next;
1549 }
1550 else
1551 return TRUE;
1552 }
1553 if(!keep_1)
1554 c_prev = c_next;
1555 c_next = next;
1556 }
1557
1558 return FALSE;
1559 }
1560
1561 /*clean all the repeats within the Seq-align*/
clean_all_internal_repeats(SeqAlignPtr PNTR head)1562 void clean_all_internal_repeats(SeqAlignPtr PNTR head)
1563 {
1564 DenseDiagPtr ddp, ddp_next, ddp_prev;
1565 SeqAlignPtr align, ali_next, ali_prev = NULL;
1566 StdSegPtr ssp, ssp_next, ssp_prev;
1567
1568
1569 align = *head;
1570
1571 while(align)
1572 {
1573 ali_next = align->next;
1574 switch(align->segtype)
1575 {
1576 case 1:
1577 ddp = align->segs;
1578 ddp_prev = NULL;
1579 while(ddp !=NULL && ddp->next !=NULL)
1580 {
1581 ddp_next = ddp->next;
1582 if(clean_dense_diag(ddp, &ddp_next))
1583 {
1584 if(ddp_prev == NULL)
1585 align->segs = ddp_next;
1586 else
1587 ddp_prev->next = ddp_next;
1588 ddp->next = NULL;
1589 replace_with_big_score(ddp, ddp_next);
1590 DenseDiagFree(ddp);
1591 }
1592 else
1593 {
1594 ddp->next = ddp_next;
1595 ddp_prev = ddp;
1596 }
1597 ddp = ddp_next;
1598 }
1599 break;
1600
1601 case 3:
1602 ssp = align->segs;
1603 ssp_prev = NULL;
1604 while(ssp !=NULL && ssp->next !=NULL)
1605 {
1606 ssp_next = ssp->next;
1607 if(clean_std_seg(ssp, &ssp_next))
1608 {
1609 if(ssp_prev == NULL)
1610 align->segs = ssp_next;
1611 else
1612 ssp_prev->next = ssp_next;
1613 ssp->next = NULL;
1614 StdSegFree(ssp);
1615 }
1616 else
1617 {
1618 ssp->next = ssp_next;
1619 ssp_prev = ssp;
1620 }
1621 ssp = ssp_next;
1622 }
1623 break;
1624
1625 default:
1626 break;
1627 }
1628 if(align->segs == NULL)
1629 {
1630 if(ali_prev)
1631 ali_prev->next = ali_next;
1632 else
1633 *head = ali_next;
1634 align->next = NULL;
1635 SeqAlignFree(align);
1636 }
1637 else
1638 ali_prev = align;
1639 align = ali_next;
1640 }
1641
1642 *head = clean_repeats_in_chain(*head);
1643 }
1644
1645
clean_repeat_current(SeqAlignPtr align,SeqAlignPtr next)1646 static void clean_repeat_current(SeqAlignPtr align, SeqAlignPtr next)
1647 {
1648 DenseDiagPtr ddp_1, ddp_2, pddp_1, pddp_2, nddp_1, nddp_2;
1649 Boolean keep_1, free_1;
1650 StdSegPtr ssp_1, ssp_2, pssp_1, pssp_2, nssp_1, nssp_2;
1651 Uint1 segtype;
1652
1653 if(align == NULL || next == NULL)
1654 return;
1655 segtype = align->segtype;
1656
1657 while(next)
1658 {
1659 switch(segtype)
1660 {
1661 case 1:/*it is a Dense-diag*/
1662 ddp_1 = align->segs;
1663 pddp_1 = NULL;
1664 while(ddp_1)
1665 {
1666 ddp_2 = next->segs;
1667 pddp_2 = NULL;
1668 nddp_1 = ddp_1->next;
1669 free_1 = FALSE;
1670 while(ddp_2 && !free_1)
1671 {
1672 nddp_2 = ddp_2->next;
1673 if(merge_dense_diag_internal_repeats(ddp_1, ddp_2, &keep_1))
1674 {
1675 if(keep_1)
1676 {
1677 if(pddp_2 == NULL)
1678 next->segs = nddp_2;
1679 else
1680 pddp_2->next = nddp_2;
1681 ddp_2->next = NULL;
1682 replace_with_big_score(ddp_2, ddp_1);
1683 DenseDiagFree(ddp_2);
1684 ddp_2 = nddp_2;
1685 }
1686 else
1687 {
1688 if(pddp_1 == NULL)
1689 align->segs = nddp_1;
1690 else
1691 pddp_1->next = nddp_1;
1692 ddp_1->next = NULL;
1693 replace_with_big_score(ddp_1, ddp_2);
1694 DenseDiagFree(ddp_1);
1695 ddp_1 = nddp_1;
1696 free_1 = TRUE;
1697 break;
1698 }
1699 }
1700 else
1701 {
1702 pddp_2 = ddp_2;
1703 ddp_2 = ddp_2->next;
1704 }
1705 }
1706
1707 if(!free_1)
1708 {
1709 pddp_1 = ddp_1;
1710 ddp_1 = ddp_1->next;
1711 }
1712 }
1713 break;
1714
1715 case 3: /*it is a Std-seg*/
1716 ssp_1 = align->segs;
1717 pssp_1 = NULL;
1718 while(ssp_1)
1719 {
1720 ssp_2 = next->segs;
1721 pssp_2 = NULL;
1722 nssp_1 = ssp_1->next;
1723 free_1 = FALSE;
1724 while(ssp_2 && !free_1)
1725 {
1726 nssp_2 = ssp_2->next;
1727 if(merge_std_seg_internal_repeats(ssp_1, ssp_2, &keep_1))
1728 {
1729 if(keep_1)
1730 {
1731 if(pssp_2 == NULL)
1732 next->segs = nssp_2;
1733 else
1734 pssp_2->next = nssp_2;
1735 ssp_2->next = NULL;
1736 StdSegFree(ssp_2);
1737 ssp_2 = nssp_2;
1738 }
1739 else
1740 {
1741 if(pssp_1 == NULL)
1742 align->segs = nssp_1;
1743 else
1744 pssp_1->next = nssp_1;
1745 ssp_1->next = NULL;
1746 StdSegFree(ssp_1);
1747 ssp_1 = nssp_1;
1748 free_1 = TRUE;
1749 break;
1750 }
1751 }
1752 else
1753 {
1754 pssp_2 = ssp_2;
1755 ssp_2 = ssp_2->next;
1756 }
1757 }
1758
1759 if(!free_1)
1760 {
1761 pssp_1 = ssp_1;
1762 ssp_1 = ssp_1->next;
1763 }
1764 }
1765 break;
1766
1767 default:
1768 break;
1769
1770 }
1771 next = next->next;
1772 }
1773 }
1774
1775
clean_repeats_in_chain(SeqAlignPtr head)1776 static SeqAlignPtr clean_repeats_in_chain(SeqAlignPtr head)
1777 {
1778 SeqAlignPtr align, next, prev;
1779
1780 align = head;
1781 while(align->next !=NULL)
1782 {
1783 next = align->next;
1784 if(align_for_same_sequences(align, next))
1785 clean_repeat_current(align, next);
1786 align = next;
1787 }
1788
1789 /*free all the empty records*/
1790 align = head;
1791 prev = NULL;
1792 while(align)
1793 {
1794 next = align->next;
1795 if(align->segs == NULL)
1796 {
1797 if(prev == NULL)
1798 head->next = next;
1799 else
1800 prev->next = next;
1801 align->next = NULL;
1802 SeqAlignFree(align);
1803 }
1804 else
1805 prev = align;
1806 align = next;
1807 }
1808
1809 return head;
1810 }
1811
1812
1813
switch_Dense_diag(DenseDiagPtr ddp_1,DenseDiagPtr ddp_2)1814 static Boolean switch_Dense_diag(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2)
1815 {
1816 if(ddp_1->starts[0] > ddp_2->starts[0])
1817 return TRUE;
1818
1819 if(ddp_1->starts[0] < ddp_2->starts[0])
1820 return FALSE;
1821
1822 return (ddp_1->len < ddp_2->len);
1823 }
1824
sort_dense_diag(DenseDiagPtr head)1825 static DenseDiagPtr sort_dense_diag(DenseDiagPtr head)
1826 {
1827 DenseDiagPtr PNTR list;
1828 DenseDiagPtr curr, last = NULL;
1829 Int4 num =0, i, j;
1830 Boolean s_witch = TRUE;
1831
1832 for(curr = head; curr != NULL; curr = curr->next)
1833 ++num;
1834 if(num == 0 || num == 1)
1835 return head;
1836
1837 list = MemNew((size_t)num * sizeof(DenseDiagPtr));
1838 for(i =0, curr = head; i<num; ++i)
1839 {
1840 list[i] = curr;
1841 curr = curr->next;
1842 }
1843 for(i = 0; i<num-1 && s_witch == TRUE; ++i)
1844 {
1845 s_witch = FALSE;
1846 for(j =0; j<num-i-1; ++j)
1847 {
1848 if(switch_Dense_diag(list[j], list[j+1]))
1849 {
1850 s_witch = TRUE;
1851 curr = list[j];
1852 list[j] = list[j+1];
1853 list[j +1] = curr;
1854 }
1855 }
1856 }
1857
1858 for(i =0; i<num -1; ++i)
1859 {
1860 curr = list[i];
1861 curr->next = list[i+1];
1862 last = curr->next;
1863 }
1864 last->next = NULL;
1865 head = list[0];
1866 MemFree(list);
1867 return head;
1868 }
1869
1870
switch_std_seg(StdSegPtr ssp_1,StdSegPtr ssp_2)1871 static Boolean switch_std_seg(StdSegPtr ssp_1, StdSegPtr ssp_2)
1872 {
1873 Int4 start_1, start_2;
1874
1875 start_1 = SeqLocStart(ssp_1->loc);
1876 start_2 = SeqLocStart(ssp_2->loc);
1877
1878 if(start_2 < start_1)
1879 return TRUE;
1880 else if(start_2 == start_1)
1881 {
1882 if(SeqLocStop(ssp_2->loc) < SeqLocStop(ssp_1->loc))
1883 return TRUE;
1884 else
1885 return FALSE;
1886 }
1887 else
1888 return FALSE;
1889 }
1890
1891
sort_std_seg(StdSegPtr ssp)1892 static StdSegPtr sort_std_seg(StdSegPtr ssp)
1893 {
1894 StdSegPtr PNTR list;
1895 StdSegPtr curr, last = NULL;
1896 Int4 num =0, i, j;
1897 Boolean s_witch = TRUE;
1898
1899 for(curr = ssp; curr != NULL; curr = curr->next)
1900 ++num;
1901 if(num == 0 || num == 1)
1902 return ssp;
1903
1904 list = MemNew((size_t)num * sizeof(DenseDiagPtr));
1905 for(i =0, curr = ssp; i<num; ++i)
1906 {
1907 list[i] = curr;
1908 curr = curr->next;
1909 }
1910 for(i = 0; i<num-1 && s_witch == TRUE; ++i)
1911 {
1912 s_witch = FALSE;
1913 for(j =0; j<num-i-1; ++j)
1914 {
1915 if(switch_std_seg(list[j], list[j+1]))
1916 {
1917 s_witch = TRUE;
1918 curr = list[j];
1919 list[j] = list[j+1];
1920 list[j +1] = curr;
1921 }
1922 }
1923 }
1924
1925 for(i =0; i<num -1; ++i)
1926 {
1927 curr = list[i];
1928 curr->next = list[i+1];
1929 last = curr->next;
1930 }
1931 last->next = NULL;
1932 ssp = list[0];
1933 MemFree(list);
1934 return ssp;
1935 }
1936
1937 typedef struct parse_blast{
1938 Int4 K;
1939 SeqLocPtr m_loc, s_loc;
1940 ScorePtr scores; /*the original scores from blast */
1941 }ParseBlast, PNTR ParseBlastPtr;
1942
1943 #define MAX_GAP_LEN 200 /*maximum gap length allowed by sim2*/
1944 #define DNA_EDGE_LEN 1000 /*add the length of the edge factor*/
1945 #define AA_EDGE_LEN 100
1946
check_sim_list(DenseDiagPtr ddp,ValNodePtr PNTR head)1947 static void check_sim_list(DenseDiagPtr ddp, ValNodePtr PNTR head)
1948 {
1949 ValNodePtr sim_list;
1950 ParseBlastPtr pbp;
1951 Int4 m_start, m_stop;
1952 Int4 s_start, s_stop;
1953 Int4 c_start, c_stop;
1954 Boolean increase_K;
1955 Uint1 strand;
1956 SeqIdPtr s_sip;
1957 Boolean found;
1958 Int4 score, blast_score;
1959
1960 m_start = ddp->starts[0];
1961 m_stop = m_start + ddp->len -1;
1962 s_start = ddp->starts[1];
1963 s_stop = s_start + ddp->len -1;
1964 strand = get_strand_val(ddp->strands, 0);
1965
1966 sim_list = *head;
1967 while(sim_list)
1968 {
1969 pbp = sim_list->data.ptrvalue;
1970 s_sip = SeqLocId(pbp->s_loc);
1971 if(SeqIdMatch(s_sip, ddp->id->next) && strand == SeqLocStrand(pbp->s_loc))
1972 {
1973 if(s_sip->choice == SEQID_GI && s_sip->data.intvalue == 1418720)
1974 found = TRUE;
1975 increase_K = FALSE;
1976 found = TRUE;
1977 c_start = SeqLocStart(pbp->m_loc);
1978 c_stop = SeqLocStop(pbp->m_loc);
1979 if(c_start > m_stop)
1980 {
1981 if(c_start - m_stop > MAX_GAP_LEN)
1982 {
1983 if(c_start - m_stop > DNA_EDGE_LEN)
1984 found = FALSE;
1985 else
1986 increase_K = TRUE;
1987 }
1988 }
1989
1990 if(c_stop < m_start)
1991 {
1992 if(m_start - c_stop > MAX_GAP_LEN)
1993 {
1994 if(m_start - c_stop > DNA_EDGE_LEN)
1995 found = FALSE;
1996 else
1997 increase_K = TRUE;
1998 }
1999 }
2000
2001 c_start = SeqLocStart(pbp->s_loc);
2002 c_stop = SeqLocStop(pbp->s_loc);
2003 if(c_start > s_stop)
2004 {
2005 if(c_start - s_stop > MAX_GAP_LEN)
2006 {
2007 if(c_start - s_stop > DNA_EDGE_LEN)
2008 found = FALSE;
2009 else
2010 increase_K = TRUE;
2011 }
2012 }
2013
2014 if(c_stop < s_start)
2015 {
2016 if(s_start - c_stop > MAX_GAP_LEN)
2017 {
2018 if(s_start - c_stop > DNA_EDGE_LEN)
2019 found = FALSE;
2020 else
2021 increase_K = TRUE;
2022 }
2023 }
2024 if(found)
2025 {
2026 c_start = MIN(m_start, SeqLocStart(pbp->m_loc));
2027 c_stop = MAX(m_stop, SeqLocStop(pbp->m_loc));
2028 update_seq_loc(c_start, c_stop, Seq_strand_plus, pbp->m_loc);
2029
2030 c_start = MIN(s_start, SeqLocStart(pbp->s_loc));
2031 c_stop = MAX(s_stop, SeqLocStop(pbp->s_loc));
2032 update_seq_loc(c_start, c_stop, 0, pbp->s_loc);
2033
2034 if(increase_K)
2035 ++pbp->K;
2036 blast_score = get_score_value(ddp->scores);
2037 score = get_score_value(pbp->scores);
2038 if(blast_score > score)
2039 {
2040 ScoreSetFree(pbp->scores);
2041 pbp->scores = ddp->scores;
2042 ddp->scores = NULL;
2043 }
2044 return;
2045 }
2046 }
2047 sim_list = sim_list->next;
2048 }
2049
2050 pbp = MemNew(sizeof(ParseBlast));
2051 pbp->K = 1;
2052 pbp->m_loc = SeqLocIntNew(m_start, m_stop, Seq_strand_plus, ddp->id);
2053 pbp->s_loc = SeqLocIntNew(s_start, s_stop, strand, ddp->id->next);
2054 pbp->scores = ddp->scores;
2055 ddp->scores = NULL;
2056 ValNodeAddPointer(head, 0, (Pointer)pbp);
2057
2058 }
2059
2060 /****************************************************************
2061 *
2062 * SortAlignByLocation()
2063 * Sort the DenseDiag from Blastn and Blastp to ascending order
2064 * of the locations
2065 *
2066 *****************************************************************/
SortAlignByLocation(SeqAnnotPtr blast_sap)2067 void SortAlignByLocation(SeqAnnotPtr blast_sap)
2068 {
2069 SeqAlignPtr align;
2070 DenseDiagPtr ddp;
2071 StdSegPtr ssp;
2072
2073
2074 if(blast_sap == NULL)
2075 return;
2076
2077 align = blast_sap->data;
2078 while(align)
2079 {
2080 switch(align->segtype)
2081 {
2082 case 1:
2083 ddp = (DenseDiagPtr)align->segs;
2084 align->segs = sort_dense_diag(ddp);
2085 break;
2086 case 2:
2087 break;
2088 case 3:
2089 ssp = align->segs;
2090 align->segs = (StdSegPtr)sort_std_seg(ssp);
2091 break;
2092 default:
2093 break;
2094 }
2095 align = align->next;
2096 }
2097
2098 }
2099
2100
filter_redundent_sim_alignment(SeqAlignPtr PNTR align)2101 static SeqAlignPtr filter_redundent_sim_alignment(SeqAlignPtr PNTR align)
2102 {
2103 SeqAlignPtr curr, prev, next;
2104 DenseSegPtr dsp;
2105 SeqLocPtr slp, h_slp, n_slp;
2106 SeqIdPtr sip;
2107 Int4 start, stop;
2108 Uint1 strand;
2109
2110 h_slp = NULL;
2111 for(curr = *align; curr != NULL; curr = curr->next)
2112 {
2113 dsp = curr->segs;
2114 sip = dsp->ids->next;
2115 get_align_ends(curr, sip, &start, &stop, &strand);
2116 slp = SeqLocIntNew(start, stop, strand, sip);
2117 ValNodeLink(&h_slp, slp);
2118 }
2119
2120 for(slp = h_slp; slp != NULL; slp = slp->next)
2121 {
2122 if(slp->choice != 0)
2123 {
2124 for(n_slp = slp->next; n_slp != NULL; n_slp = n_slp->next)
2125 {
2126 if(n_slp->choice != 0)
2127 {
2128 if(SeqLocStrand(slp) == SeqLocStrand(n_slp))
2129 {
2130 switch(SeqLocCompare(slp, n_slp))
2131 {
2132 case SLC_A_EQ_B:
2133 case SLC_B_IN_A:
2134 case SLC_A_OVERLAP_B:
2135 n_slp->choice = 0;
2136 break;
2137 case SLC_A_IN_B:
2138 slp->choice = 0;
2139 break;
2140 default:
2141 break;
2142 }
2143 }
2144 }
2145 }
2146 }
2147 }
2148
2149 curr = *align;
2150 slp = h_slp;
2151 prev = NULL;
2152
2153 while(curr && slp)
2154 {
2155 next = curr->next;
2156 if(slp->choice == 0)
2157 {
2158 slp->choice = SEQLOC_INT;
2159 if(prev == NULL)
2160 *align = next;
2161 else
2162 prev->next = next;
2163 curr->next = NULL;
2164 SeqAlignFree(curr);
2165 }
2166 else
2167 prev = curr;
2168
2169 curr = next;
2170 slp = slp->next;
2171 }
2172
2173 SeqLocSetFree(h_slp);
2174 return (*align);
2175 }
2176
2177
2178
2179 /****************************************************************
2180 *
2181 * sim2_for_blast(blast_sap, query_id)
2182 * run sim2 with the Seq-annot from blast
2183 *
2184 ****************************************************************/
sim_for_blast(SeqAnnotPtr blast_sap,SeqIdPtr query_id,Int4 which_sim)2185 SeqAnnotPtr sim_for_blast(SeqAnnotPtr blast_sap, SeqIdPtr query_id, Int4 which_sim)
2186 {
2187 SeqAlignPtr sim_align, align;
2188 ValNodePtr sim_list = NULL, curr;
2189 ParseBlastPtr pbp;
2190 BioseqPtr mbsp, sbsp;
2191 Int4 start, stop;
2192 SeqIdPtr s_sip;
2193 Uint1 strand;
2194 SeqAnnotPtr annot;
2195 DenseDiagPtr ddp;
2196 SimPam sp; /*parameters for the sim2 alignment*/
2197 PSimPam psp; /*parameters for the SIM alignment*/
2198 Int4 limit = 200; /*limit of sequence discrepancies used in SIM3*/
2199 Int4 edge_len; /*length for adding up to the edge*/
2200 Boolean is_aa;
2201 Char label[100];
2202 Int4 K;
2203 Int4 mbsplength;
2204 DenseSegPtr dsp;
2205 /** BANDALIGN**/
2206 ValNodePtr vnp;
2207 MashPtr msp;
2208
2209 if(blast_sap == NULL || blast_sap->data == NULL)
2210 return NULL;
2211 mbsp = BioseqLockById(query_id);
2212 if(mbsp == NULL)
2213 return blast_sap;
2214 if(mbsp->mol == Seq_mol_aa)
2215 {
2216 edge_len = AA_EDGE_LEN;
2217 DefaultPSimPam(&psp);
2218 is_aa = TRUE;
2219 }
2220 else
2221 {
2222 edge_len = DNA_EDGE_LEN;
2223 DefaultSimPam(&sp);
2224 is_aa = FALSE;
2225 }
2226 mbsplength = mbsp->length;
2227 BioseqUnlock(mbsp);
2228
2229 align = blast_sap->data;
2230
2231 if (which_sim == RUN_BANDALGN)
2232 {
2233 vnp = SeqLocListOfBioseqsFromSeqAlign (align);
2234 msp = MashNew (is_aa);
2235 align = SeqLocListToSeqAlign (vnp, PRGALIGNDEFAULT, msp);
2236 annot = SeqAnnotForSeqAlign (align);
2237 return annot;
2238 }
2239 while(align)
2240 {
2241 switch (align->segtype) {
2242 case 1:
2243 ddp = align->segs;
2244 while(ddp)
2245 {
2246 check_sim_list(ddp, &sim_list);
2247 ddp = ddp->next;
2248 }
2249 break;
2250 case 2:
2251 dsp = align->segs;
2252 if (dsp) {
2253 }
2254 break;
2255 case 3:
2256 break;
2257 default:
2258 break;
2259 }
2260 align = align->next;
2261 }
2262 sim_align = NULL;
2263
2264 for(curr = sim_list; curr !=NULL; curr = curr->next)
2265 {
2266 pbp = curr->data.ptrvalue;
2267 s_sip = SeqLocId(pbp->s_loc);
2268 sbsp = BioseqLockById(s_sip);
2269 if(sbsp == NULL)
2270 {
2271 seqid_name(s_sip, label, FALSE, FALSE);
2272 Message(MSG_POSTERR, "Fail to retrieve sequence %s", label);
2273 }
2274 else
2275 {
2276 strand = SeqLocStrand(pbp->s_loc);
2277
2278 start = SeqLocStart(pbp->s_loc);
2279 stop = SeqLocStop(pbp->s_loc);
2280 start = MAX(0, start - edge_len);
2281 stop = MIN(stop+edge_len, sbsp->length-1);
2282 update_seq_loc(start, stop, strand, pbp->s_loc);
2283
2284 start = SeqLocStart(pbp->m_loc);
2285 stop = SeqLocStop(pbp->m_loc);
2286 start = MAX(0, start - edge_len);
2287 stop = MIN(stop+edge_len, mbsplength-1);
2288 update_seq_loc(start, stop, Seq_strand_plus, pbp->m_loc);
2289
2290 if(pbp->K >1)
2291 {
2292 sp.cutoff = 30.0;
2293 K = MIN((pbp->K + 10), (pbp->K * 2));
2294 }
2295 else
2296 {
2297 sp.cutoff = 0.0;
2298 K = 1;
2299 }
2300
2301 if(is_aa) {
2302 align = CSIM(pbp->m_loc, pbp->s_loc, pbp->K, sp.cutoff, &psp);
2303 }
2304 else
2305 {
2306 /* if(pbp->K >1)
2307 align = SIM2(pbp->m_loc, pbp->s_loc, pbp->K, &sp, NULL, OUTPUT_ALIGN, NULL);
2308 else
2309 { */
2310 switch( which_sim)
2311 {
2312 case RUN_SIM_1:
2313 /* align = CSIM(pbp->m_loc, pbp->s_loc, pbp->K, sp.cutoff, &psp);
2314 break; */
2315
2316 case RUN_SIM_2:
2317 align = SIM2(pbp->m_loc, pbp->s_loc, K, &sp, NULL, OUTPUT_ALIGN, NULL);
2318 if(K > 1 && align != NULL)
2319 filter_redundent_sim_alignment(&align);
2320 break;
2321
2322 case RUN_SIM_3:
2323 align = SIM3ALN(pbp->m_loc, pbp->s_loc, limit);
2324 if(align == NULL)
2325 align = SIM2(pbp->m_loc, pbp->s_loc, pbp->K, &sp, NULL, OUTPUT_ALIGN, NULL);
2326 break;
2327 default:
2328 align = NULL;
2329 }
2330 /* } */
2331 }
2332 if(align != NULL)
2333 {
2334 dsp = align->segs;
2335 if(pbp->scores != NULL && dsp != NULL)
2336 {
2337 if(dsp->scores != NULL)
2338 ScoreSetFree(dsp->scores);
2339 dsp->scores = pbp->scores;
2340 pbp->scores = NULL;
2341 }
2342 sim_align = link_align(align, sim_align);
2343 }
2344
2345 BioseqUnlock(sbsp);
2346 }
2347
2348 pbp->m_loc = SeqLocFree(pbp->m_loc);
2349 pbp->s_loc = SeqLocFree(pbp->s_loc);
2350 if(pbp->scores != NULL)
2351 ScoreSetFree(pbp->scores);
2352
2353 }
2354
2355 ValNodeFreeData(sim_list);
2356
2357 if (sim_align == NULL) return NULL;
2358
2359 annot = SeqAnnotNew();
2360 annot->type = 2;
2361 annot->data = sim_align;
2362 return annot;
2363 }
2364
2365 /***********************************************************
2366 *
2367 * Function for finding human repeat features
2368 *
2369 ************************************************************/
2370
FindHumanRepeats(BioseqPtr bsp,Boolean show_prog_mon)2371 Boolean LIBCALL FindHumanRepeats (BioseqPtr bsp, Boolean show_prog_mon)
2372 {
2373 MonitorPtr mon=NULL;
2374 ValNodePtr alu_slp_list = NULL, alu_sep_list = NULL, ends_list = NULL;
2375 Char buf[80];
2376 Boolean got_some = FALSE;
2377 ValNode tmp;
2378 SeqInt si;
2379
2380 MemSet((Pointer) (&tmp), sizeof(ValNode), 0);
2381 MemSet((Pointer) (&si), sizeof(SeqInt), 0);
2382 tmp.choice = SEQLOC_INT;
2383 tmp.data.ptrvalue = &si;
2384 si.id = bsp->id;
2385 si.from = 0;
2386 si.to = bsp->length - 1;
2387 si.strand = Seq_strand_both;
2388
2389 if (show_prog_mon)
2390 {
2391 mon = MonitorStrNewEx("Alu Features", 40, FALSE);
2392 MonitorStrValue(mon, "Reading Repeats File");
2393 }
2394
2395 if (! FindPath("ncbi", "ncbi", "data", buf, sizeof (buf)))
2396 {
2397 MonitorFree(mon);
2398 ErrPostEx(SEV_WARNING, 0, 0, "FindPath failed in repeat filter");
2399 return got_some;
2400 }
2401
2402 StringCat(buf, "humrep.fsa");
2403
2404 alu_sep_list = make_repeat_lib (buf, &alu_slp_list, FILE_IO);
2405 if (show_prog_mon)
2406 MonitorStrValue(mon, "Finding Repeats");
2407 ends_list = filter_repeats_for_bigloc(&tmp, -1, -1, alu_slp_list);
2408 if(ends_list != NULL)
2409 {
2410 got_some = TRUE;
2411 if (show_prog_mon)
2412 MonitorStrValue(mon, "Building repeat features");
2413 SaveRepeats (bsp, ends_list);
2414 }
2415 free_alu_list (alu_sep_list, alu_slp_list);
2416
2417 MonitorFree(mon);
2418
2419 return got_some;
2420 }
2421 /***********************************************************
2422 *
2423 * Desktop function for doing human repeat features
2424 *
2425 ************************************************************/
2426
AluFeat(Pointer data)2427 Int2 LIBCALLBACK AluFeat (Pointer data)
2428 {
2429 OMProcControlPtr ompcp;
2430 ObjMgrProcPtr ompp;
2431 Pointer my_userdata;
2432 Boolean got_some;
2433 BioseqPtr bsp;
2434
2435 ompcp = (OMProcControlPtr)data;
2436 if (ompcp == NULL || ompcp->proc == NULL) return OM_MSG_RET_ERROR;
2437 if (ompcp->input_itemtype != OBJ_BIOSEQ) return OM_MSG_RET_ERROR;
2438 ompp = ompcp->proc; /* this is your proceedure */
2439 my_userdata = ompp->procdata; /* this is the user data pointer you passed in, if any */
2440
2441 bsp = (BioseqPtr)(ompcp->input_data);
2442
2443 got_some = FindHumanRepeats (bsp, TRUE);
2444
2445 if (! got_some)
2446 Message(MSG_OK, "No Repeat regions found.");
2447 else
2448 ObjMgrSendMsg(OM_MSG_UPDATE, ompcp->input_entityID, ompcp->input_itemID, ompcp->input_itemtype);
2449
2450 return OM_MSG_RET_OK;
2451 }
2452
2453
2454
2455 /*functions to convert alignment to a feature*/
2456
2457 /**************************************************************
2458 *
2459 * convert the alignment to an interval
2460 *
2461 **************************************************************/
convert_align_to_interval(SeqAlignPtr align,Int4 min_gap_len)2462 static SeqFeatPtr convert_align_to_interval(SeqAlignPtr align, Int4 min_gap_len)
2463 {
2464 DenseSegPtr dsp;
2465 Int2 i;
2466 SeqLocPtr slp, slp_head, slp_next;
2467 Int4 s_start, start;
2468 Uint1 strand;
2469 Boolean load;
2470 SeqFeatPtr new_sfp;
2471 ImpFeatPtr ifp;
2472
2473 if(align->segtype != 2)
2474 return NULL;
2475
2476 dsp = align->segs;
2477 strand = dsp->strands[1];
2478 slp_head = NULL;
2479
2480 for(i = 0; i<dsp->numseg; ++i)
2481 {
2482 start = dsp->starts[2*i];
2483 s_start = dsp->starts[2*i+1];
2484 load = FALSE;
2485 if(start != -1)
2486 {
2487 if(s_start != -1)
2488 load = TRUE;
2489 else if(dsp->lens[i] <= min_gap_len)
2490 load = TRUE;
2491 }
2492 if(load)
2493 {
2494 slp = SeqLocIntNew(start, start+dsp->lens[i]-1, strand, dsp->ids);
2495 if(slp_head == NULL)
2496 slp_head = slp;
2497 else
2498 {
2499 if(strand == Seq_strand_minus)
2500 {
2501 slp_next = slp_head->next;
2502 slp_head->next = NULL;
2503 slp_head = SeqLocAdd(&slp, slp_head, TRUE, FALSE);
2504 slp_head->next = slp_next;
2505 slp_head = slp;
2506 }
2507 else
2508 SeqLocAdd(&slp_head, slp, TRUE, FALSE);
2509 }
2510 }
2511 }
2512
2513 new_sfp = SeqFeatNew();
2514 new_sfp->data.choice = 8;
2515 ifp = ImpFeatNew();
2516 ifp->key = StringSave("hit");
2517 new_sfp->data.value.ptrvalue = ifp;
2518 new_sfp->location = SeqLocPackage(slp_head);
2519 return new_sfp;
2520 }
2521
load_align_to_interval(SeqAnnotPtr align_annot,Int4 min_gap_len,BioseqPtr bsp)2522 Boolean load_align_to_interval(SeqAnnotPtr align_annot, Int4 min_gap_len, BioseqPtr bsp)
2523 {
2524 SeqFeatPtr sfp_head = NULL, prev, sfp;
2525 SeqAlignPtr align;
2526 ValNodePtr desc;
2527 SeqAnnotPtr annot, p_annot;
2528
2529 prev = NULL;
2530 while(align_annot)
2531 {
2532 if(align_annot->type == 2)
2533 {
2534 align = align_annot->data;
2535 while(align)
2536 {
2537 sfp = convert_align_to_interval(align, min_gap_len);
2538 if(sfp != NULL)
2539 {
2540 if(prev == NULL)
2541 sfp_head = sfp;
2542 else
2543 prev->next =sfp;
2544 prev = sfp;
2545 }
2546 align = align->next;
2547 }
2548 }
2549
2550 align_annot = align_annot->next;
2551 }
2552
2553 if(sfp_head != NULL)
2554 {
2555 desc = ValNodeNew(NULL);
2556 desc->choice = Annot_descr_name;
2557 desc->data.ptrvalue = StringSave("HITS");
2558
2559 annot = SeqAnnotNew();
2560 annot->type = 1;
2561 annot->data = sfp_head;
2562 annot->desc = desc;
2563
2564
2565 if(bsp->annot == NULL)
2566 bsp->annot = annot;
2567 else
2568 {
2569 p_annot = bsp->annot;
2570 while(p_annot->next != NULL)
2571 p_annot = p_annot->next;
2572 p_annot->next = annot;
2573 }
2574 return TRUE;
2575 }
2576 else
2577 return FALSE;
2578 }
2579
2580