1 static char const rcsid[] = "$Id: pobutil.c,v 6.10 2007/05/07 13:30:54 kans Exp $";
2 
3 #include <pobutil.h>
4 #include <gather.h>
5 #include <edutil.h>
6 #include <salign.h>
7 #include <salutil.h>
8 #include <salsap.h>
9 
10 /*****************************************************************
11 *
12 *	clean_empty_seqalign(): clean up Seq-align that has no
13 *	data
14 *
15 *****************************************************************/
clean_empty_seqalign(SeqAlignPtr PNTR head)16 SeqAlignPtr clean_empty_seqalign(SeqAlignPtr PNTR head)
17 {
18 	SeqAlignPtr curr, next, prev = NULL;
19 
20 	curr = *head;
21 	while(curr)
22 	{
23 		next = curr->next;
24 		if(curr->segs == NULL)
25 		{
26 			if(prev == NULL)
27 				*head = curr->next;
28 			else
29 				prev->next = curr->next;
30 			curr->next = NULL;
31 			SeqAlignFree(curr);
32 		}
33 		else
34 			prev = curr;
35 		curr = next;
36 	}
37 
38 	return (*head);
39 }
40 
41 
make_repeat_lib(CharPtr file_name,ValNodePtr PNTR slp_list,Boolean io_type)42 ValNodePtr make_repeat_lib(CharPtr file_name,
43                            ValNodePtr PNTR slp_list,
44                            Boolean io_type)
45 {
46   FILE *ifp;
47   SeqEntryPtr sep;
48   BioseqPtr bsp;
49   ValNodePtr sep_list = NULL;
50   SeqLocPtr slp;
51   CharPtr next_char;
52 
53   if(file_name == NULL)
54     return NULL;
55   *slp_list = NULL;
56 
57   /* FILE IO */
58 
59   if(io_type == FILE_IO) {
60     if((ifp = FileOpen(file_name, "r")) == NULL) {
61       Message(MSG_POSTERR, "Fail to open Alu data file %s", file_name);
62       return NULL;
63     }
64     while((sep = FastaToSeqEntry(ifp, TRUE)) != NULL) {
65       if(sep !=NULL) {
66         ValNodeAddPointer(&sep_list, 0, sep);
67         bsp = sep->data.ptrvalue;
68         slp = SeqLocIntNew(0, bsp->length-1, Seq_strand_both, bsp->id);
69         ValNodeAddPointer(slp_list, 0, slp);
70       }
71     }
72     FileClose(ifp);
73   } else {
74     /* STDIN IO - WWW Power Blast case */
75     if(file_name[0] == NULLB)
76       return NULL;
77     next_char = file_name;
78     while((sep = FastaToSeqBuff(next_char,
79                                 &next_char, TRUE)) != NULL) {
80       if(sep !=NULL) {
81         ValNodeAddPointer(&sep_list, 0, sep);
82         bsp = sep->data.ptrvalue;
83         slp = SeqLocIntNew(0, bsp->length-1, Seq_strand_both, bsp->id);
84         ValNodeAddPointer(slp_list, 0, slp);
85       }
86     }
87   }
88   return sep_list;
89 }
90 
91 
free_alu_list(ValNodePtr alu_sep_list,ValNodePtr alu_slp_list)92 void free_alu_list(ValNodePtr alu_sep_list, ValNodePtr alu_slp_list)
93 {
94 	ValNodePtr curr;
95 	SeqEntryPtr sep;
96 	SeqLocPtr slp;
97 
98 	for(curr = alu_sep_list; curr!= NULL; curr= curr->next)
99 	{
100 		sep = (SeqEntryPtr)(curr->data.ptrvalue);
101 		SeqEntryFree(sep);
102 	}
103 	ValNodeFree(alu_sep_list);
104 
105 	for(curr = alu_slp_list; curr!= NULL; curr= curr->next)
106 	{
107 		slp= (SeqLocPtr)(curr->data.ptrvalue);
108 		SeqLocFree(slp);
109 	}
110 	ValNodeFree(alu_slp_list);
111 }
112 
113 
114 #define MIN_K 1
115 #define OFFSET_K 100
cal_K_num(Int4 query_len,Int4 db_len)116 static Int4 cal_K_num(Int4 query_len, Int4 db_len)
117 {
118 	Int4 num;
119 
120 	num = MAX(MIN_K, (query_len/(db_len+OFFSET_K)+1));
121 	return num;
122 }
123 
124 
125 
store_align_ends(ValNodePtr ends_list,SeqIdPtr sip)126 static SeqLocPtr store_align_ends(ValNodePtr ends_list, SeqIdPtr sip)
127 {
128 
129 	RecordEndsPtr rep;
130 	ValNodePtr curr;
131 	SeqLocPtr slp_head = NULL, new;
132 	SeqLocPtr slp = NULL;
133 
134 
135 	if(ends_list == NULL)
136 		return NULL;
137 
138 	for(curr = ends_list; curr != NULL; curr = curr->next)
139 	{
140 		rep = (RecordEndsPtr)(curr->data.ptrvalue);
141 		new = SeqLocIntNew(rep->start, rep->stop, rep->strand, sip);
142 		/*printf("%s: start is %ld, stop is %ld, strand is %d\n", rep->name, rep->start, rep->stop, rep->strand);*/
143 		ValNodeLink(&slp_head, new);
144 	}
145 	if(slp_head !=NULL)
146 	{
147 		slp = ValNodeNew(NULL);
148 		slp->choice = SEQLOC_MIX;
149 		slp->data.ptrvalue = slp_head;
150 	}
151 	return slp;
152 }
153 
get_word_size(Int4 length)154 static Int4 get_word_size(Int4 length)
155 {
156 	/*#ifdef WIN_MSWIN
157 		return 6;
158 	#endif*/
159 
160 	/* #ifdef WIN16
161 		return 6;
162 	#endif
163 	*/
164 
165 	if(length < 500)
166 		return 6;
167 	if(length <10000)
168 		return 7;
169 	return 8;
170 }
171 
172 
173 /*******************************************************************
174 *
175 *	SaveRepeats(bsp, ends_list)
176 *	converts the alignments stored in ends_list into the Seq-feat
177 *	repeat feature in bsp
178 *
179 *******************************************************************/
build_repeat_feat(ValNodePtr ends_list,SeqIdPtr sip)180 static SeqFeatPtr build_repeat_feat(ValNodePtr ends_list, SeqIdPtr sip)
181 {
182 	RecordEndsPtr rep;
183 	SeqFeatPtr sfp = NULL, new_sfp, curr;
184 	ImpFeatPtr ifp;
185 	/* Char temp[100]; */
186 	GBQualPtr qual;
187 	Char c='%';
188 
189 	while(ends_list)
190 	{
191 		rep = (RecordEndsPtr)(ends_list->data.ptrvalue);
192 		new_sfp = SeqFeatNew();
193 		new_sfp->data.choice = 8;
194 		ifp = ImpFeatNew();
195 		ifp->key = StringSave("repeat_region");
196 		new_sfp->data.value.ptrvalue = ifp;
197 		/* sprintf(temp, "%d%c similarity", (Int2)(100.0 * rep->zscore), c);
198 		new_sfp->comment = StringSave(temp); */
199 		new_sfp->location = SeqLocIntNew(rep->start, rep->stop, rep->strand, sip);
200 		qual = GBQualNew();
201 		qual->qual = StringSave("rpt_family");
202 		qual->val = StringSave(rep->name);
203 		/* if(StringCmp(rep->name, "L1") ==0)
204 			qual->val = StringSave("L1");
205 		else
206 		{
207 			if(StrStr(rep->name, "MER"))
208 				qual->val = StringSave(rep->name);
209 			else
210 				qual->val = StringSave("ALU");
211 		} */
212 		new_sfp->qual = qual;
213 
214 		if(sfp == NULL)
215 			sfp = new_sfp;
216 		else
217 		{
218 			curr = sfp;
219 			while(curr->next != NULL)
220 				curr = curr->next;
221 			curr->next = new_sfp;
222 		}
223 
224 		ends_list = ends_list->next;
225 	}
226 
227 	return sfp;
228 }
229 
230 
SaveRepeats(BioseqPtr bsp,ValNodePtr ends_list)231 void SaveRepeats(BioseqPtr bsp, ValNodePtr ends_list)
232 {
233 	SeqFeatPtr newsfp;
234 	SeqAnnotPtr annot, prev;
235 	ValNodePtr desc;
236 
237 	newsfp = build_repeat_feat(ends_list, bsp->id);
238 	if(newsfp == NULL)
239 		return;
240 	desc = ValNodeNew(NULL);
241 	desc->choice = Annot_descr_name;
242 	desc->data.ptrvalue = StringSave("powblast");
243 
244 	annot = SeqAnnotNew();
245 	annot->type = 1;
246 	annot->data = newsfp;
247 	annot->desc = desc;
248 
249 	if(bsp->annot == NULL)
250 		bsp->annot = annot;
251 	else
252 	{
253 		prev = bsp->annot;
254 		while(prev->next != NULL)
255 			prev = prev->next;
256 		prev->next = annot;
257 	}
258 	return;
259 }
260 
filter_align(Int4 length,FloatHi zscore,FloatHi score)261 static Boolean filter_align(Int4 length, FloatHi zscore, FloatHi score)
262 {
263 	FloatHi MINSCORE = 0.6;
264 
265         if(zscore < MINSCORE)
266                 return FALSE;
267         return (score > 30.0);
268 
269 }
270 
271 
filter_repeats(SeqLocPtr slp,ValNodePtr alu_slp_list,ValNodePtr PNTR ends_list)272 SeqLocPtr filter_repeats(SeqLocPtr slp, ValNodePtr alu_slp_list, ValNodePtr PNTR ends_list)
273 {
274 	SimPam sp;
275 	ValNodePtr curr;
276 	SeqLocPtr sloc;
277 	Int4 length;
278 	Int4 k;
279 	ValNodePtr new_list, this_list = NULL;
280 	SeqLocPtr aluloc = NULL;
281 
282 	if(alu_slp_list == NULL || slp == NULL)
283 		return NULL;
284 
285 	DefaultSimPam(&sp);
286 	length = SeqLocLen(slp);
287 	sp.word = get_word_size(length);
288 	sp.cutoff = 30.0;
289 	this_list = NULL;
290 	for(curr = alu_slp_list; curr != NULL; curr= curr->next)
291 	{
292 		sloc = (SeqLocPtr)(curr->data.ptrvalue);
293 		k = cal_K_num(length, SeqLocLen(sloc));
294 
295 		new_list = NULL;
296 		/* SIM2(slp, sloc, k, &sp, &new_list, OUTPUT_ENDS, NULL); */
297 		SIM2(slp, sloc, k, &sp, &new_list, OUTPUT_ENDS, (FilterProc)filter_align);
298 		this_list = merge_two_list(this_list, new_list);
299 		/* Change_Loc_Strand(sloc, Seq_strand_both); */
300 	}
301 
302 	if(this_list != NULL)
303 	{
304 		this_list = CleanNewList(this_list);
305 		aluloc = store_align_ends(this_list, SeqLocId(slp));
306 		if(ends_list != NULL)
307 		{
308 			if(*ends_list == NULL)
309 				*ends_list = this_list;
310 			else
311 				*ends_list = merge_two_list(*ends_list, this_list);
312 		}
313 	}
314 
315 	return aluloc;
316 }
317 
318 
319 /*****************************************************
320 *
321 *	filter_repeats_for_bigloc(slp, max_len, min_overlap)
322 *	break the slp into smaller pieces and merge the results
323 *
324 *	slp: the big Seq-loc
325 *	max_len: the maximum allowed length for search
326 *	if -1, set to default
327 *	min_overlap: the minimum length for search
328 *	if -1, set to the default
329 *
330 *******************************************************/
filter_repeats_for_bigloc(SeqLocPtr slp,Int4 max_len,Int4 min_overlap,ValNodePtr alu_slp_list)331 ValNodePtr filter_repeats_for_bigloc(SeqLocPtr slp, Int4 max_len, Int4 min_overlap, ValNodePtr alu_slp_list)
332 {
333 	ValNodePtr ends_list = NULL;
334 	SeqLocPtr b_loc, loc;
335 	SeqLocPtr aluloc;
336 
337 	if(slp->choice != SEQLOC_INT && slp->choice != SEQLOC_WHOLE)
338 	{
339 		ErrPostEx(SEV_ERROR, 0, 0, "Sorry, can not handle SeqLoc for type %d", slp->choice);
340 		return NULL;
341 	}
342 	if(max_len == -1)
343 		max_len = REPEAT_BREAK_LEN;
344 	if(min_overlap == -1)
345 		min_overlap = REPEAT_OVERLAP_LEN;
346 
347 	b_loc = break_blast_job(slp, SeqLocId(slp), REPEAT_BREAK_LEN, min_overlap);
348 	for(loc = b_loc; loc != NULL; loc = loc->next)
349 	{
350 		aluloc = filter_repeats(loc, alu_slp_list, &ends_list);
351 		SeqLocSetFree(aluloc);
352 	}
353 	SeqLocSetFree(b_loc);
354 	return ends_list;
355 }
356 
357 
358 
get_output_name(CharPtr name,CharPtr buf)359 void get_output_name(CharPtr name, CharPtr buf)
360 {
361 	Int4 len, i;
362 
363 
364 	i =0;
365 	for(; buf !=NULL; ++buf)
366 	{
367 	   if(*buf == '\n' || *buf == '\0' || *buf == '|')
368 		break;
369 	   else
370 	   	name[i++] = *buf;
371 	}
372 	name[i] = '\0';
373 	len = i;
374 	for(i = len-1; i>=0; --i)
375 	{
376 		if(name[i] == '.')
377 		{
378 			name[i] = '\0';
379 			return;
380 		}
381 	}
382 }
383 
write_asn_output(CharPtr name,BioseqPtr bsp,SeqAnnotPtr annot,Int4 option,Boolean io_type)384 void write_asn_output(CharPtr name, BioseqPtr bsp, SeqAnnotPtr annot, Int4 option, Boolean io_type)
385 {
386 	SeqEntryPtr sep;
387 	Char out_file[100];
388 	AsnIoPtr aip;
389 	SeqAnnotPtr sap, prev, next;
390 	Boolean free_it;
391 
392 	if(option == 0)
393 	{
394 		SeqAnnotFree(annot);
395 		return;
396 	}
397 	if(bsp == NULL)
398 	{
399 		SeqAnnotFree(annot);
400 		return;
401 	}
402 
403 	if(bsp->annot == NULL)
404 		bsp->annot = annot;
405 	else
406 	{
407 		sap = bsp->annot;
408 		prev = NULL;
409 
410 		while(sap)
411 		{
412 			free_it = FALSE;
413 			next = sap->next;
414 			if(sap->type == 2)
415 			{
416 				if(option == 1)	/*overwrite*/
417 				{
418 					free_it = TRUE;
419 					if(prev != NULL)
420 						prev->next = sap->next;
421 					else
422 						bsp->annot = sap->next;
423 					sap->next = NULL;
424 					SeqAnnotFree(next);
425 				}
426 			}
427 			if(!free_it)
428 				prev = sap;
429 			sap = next;
430 		}
431 
432 		if(prev != NULL)
433 			prev->next = annot;
434 		else
435 			bsp->annot = annot;
436 	}
437 
438 	sep = SeqEntryFind(bsp->id);
439 
440         if(io_type == FILE_IO) {
441           sprintf(out_file, "%s.ent", name);
442           aip = AsnIoOpen(out_file, "w");
443         } else { /* for STD_IO */
444           aip = AsnIoNew(ASNIO_TEXT_OUT, stdout, NULL, NULL, NULL);
445         }
446 
447 	SeqEntryAsnWrite(sep, aip, NULL);
448 	AsnIoClose(aip);
449 }
450 
451 
save_SeqAnnot(SeqAnnotPtr annot,CharPtr name,Boolean io_type)452 Boolean save_SeqAnnot(SeqAnnotPtr annot, CharPtr name, Boolean io_type)
453 {
454         Char sat_name[100];
455 	AsnIoPtr aip;
456 
457 	if(annot->data == NULL)
458 		return FALSE;
459 
460         if(io_type == FILE_IO) {
461           sprintf(sat_name, "%s.sat", name);
462           aip = AsnIoOpen(sat_name, "w");
463         } else { /* for STD_IO */
464           aip = AsnIoNew(ASNIO_TEXT_OUT, stdout, NULL, NULL, NULL);
465         }
466 	SeqAnnotAsnWrite(annot, aip, NULL);
467 	AsnIoClose(aip);
468 	return TRUE;
469 }
470 
load_dust_bsp(BioseqPtr dust_bsp,SeqLocPtr dustloc)471 void load_dust_bsp(BioseqPtr dust_bsp, SeqLocPtr dustloc)
472 {
473 	ByteStorePtr bsp;
474 	SeqLocPtr slp = NULL;
475 	Int4 start, stop;
476 	Uint1 res = 'N';
477 
478 
479 	if (dust_bsp->seq_data_type == Seq_code_gap) return;
480 
481 	if(dust_bsp->mol == Seq_mol_aa)
482 		res = 'X';
483 
484 	bsp = (ByteStorePtr) dust_bsp->seq_data;
485 	while(dustloc)
486 	{
487 		slp = NULL;
488  		while((slp = SeqLocFindNext(dustloc, slp))!=NULL)
489 		{
490 			start = SeqLocStart(slp);
491 			stop = SeqLocStop(slp);
492 			BSSeek(bsp, start, SEEK_SET);
493 			for(; start <=stop; ++start)
494                 		BSPutByte(bsp, (Int2)res);
495 		}
496 		dustloc = dustloc->next;
497 	}
498 }
499 
500 
501 /*mask the current sequence with the repeat feature*/
502 typedef struct mask_repeat {
503 	BioseqPtr bsp;
504 	BioseqPtr dust_bsp;
505 }MaskFeat, PNTR MaskFeatPtr;
506 
MaskWithRepeatFeature(SeqEntryPtr sep,Pointer data,Int4 index,Int2 indent)507 static void MaskWithRepeatFeature(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
508 {
509 	SeqAnnotPtr annot;
510 	SeqFeatPtr sfp;
511 	BioseqPtr bsp;
512 	BioseqSetPtr bssp;
513 	SeqIdPtr sip;
514 	MaskFeatPtr mfp;
515 	ImpFeatPtr ifp;
516 
517 	mfp = (MaskFeatPtr)(data);
518 	if(sep->choice == 1)
519 	{
520 		bsp = sep->data.ptrvalue;
521 		annot = bsp->annot;
522 	}
523 	else
524 	{
525 		bssp = sep->data.ptrvalue;
526 		annot = bssp->annot;
527 	}
528 
529 	while(annot)
530 	{
531 		if(annot->type == 1)
532 		{
533 			sfp = annot->data;
534 			while(sfp)
535 			{
536 				if(sfp->data.choice == 8)
537 				{
538 					sip = SeqLocId(sfp->location);
539 					if(BioseqMatch(mfp->bsp, sip))
540 					{
541 						ifp = sfp->data.value.ptrvalue;
542 						if(ifp->key != NULL)
543 						{
544 							if(StringNCmp(ifp->key, "repeat", 6) == 0)
545 								load_dust_bsp(mfp->dust_bsp, sfp->location);
546 						}
547 					}
548 				}
549 				sfp = sfp->next;
550 			}
551 		}
552 
553 		annot = annot->next;
554 	}
555 }
556 
557 /*******************************************************************
558 *
559 *	look for the repeat feature in sep and mask the residue in
560 *	dust_bsp
561 *
562 *******************************************************************/
mask_with_repeat_feature(BioseqPtr bsp,SeqEntryPtr sep,BioseqPtr dust_bsp)563 void mask_with_repeat_feature(BioseqPtr bsp, SeqEntryPtr sep, BioseqPtr dust_bsp)
564 {
565 	MaskFeat mf;
566 
567 	mf.bsp = bsp;
568 	mf.dust_bsp = dust_bsp;
569 
570 	SeqEntryExplore(sep, (Pointer)&mf, MaskWithRepeatFeature);
571 }
572 
make_dust_bsp(BioseqPtr bsp,Int4 start,Int4 stop,SeqLocPtr dustloc)573 BioseqPtr make_dust_bsp(BioseqPtr bsp, Int4 start, Int4 stop, SeqLocPtr dustloc)
574 {
575 	BioseqPtr dust_bsp;
576 	SeqPortPtr spp;
577 	ByteStorePtr b_store;
578 	Uint1 residue;
579 	Uint1 code;
580 
581 	if(bsp->mol == Seq_mol_aa)
582 		code = Seq_code_iupacaa;
583 	else
584 		code = Seq_code_iupacna;
585 
586 	spp = SeqPortNew(bsp, start, stop, Seq_strand_plus, code);
587 	b_store = BSNew(stop - start +2);
588 	BSSeek(b_store, 0, SEEK_SET);
589 	while((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF)
590                 BSPutByte(b_store, (Int2)residue);
591 	SeqPortFree(spp);
592 
593 
594 	dust_bsp = BioseqNew();
595 	dust_bsp->id = local_id_make("dust_bsp");
596 	dust_bsp->seq_data = (SeqDataPtr) b_store;
597 	dust_bsp->repr = Seq_repr_raw;
598 	dust_bsp->mol = bsp->mol;
599 	dust_bsp->length = stop - start +1;
600 	dust_bsp->topology = 1;
601 	dust_bsp->seq_data_type = code;
602 
603 	if(dustloc != NULL)
604 		load_dust_bsp(dust_bsp, dustloc);
605 	return dust_bsp;
606 }
607 
prt_FASTA_file(BioseqPtr bsp,SeqLocPtr dustloc,Boolean io_type)608 void prt_FASTA_file(BioseqPtr bsp, SeqLocPtr dustloc, Boolean io_type)
609 {
610 	FILE *fp;
611 
612 	load_dust_bsp(bsp, dustloc);
613         if(io_type == FILE_IO)
614           fp = FileOpen("xxx.out", "w");
615         else
616           fp = stdout;
617 	BioseqRawToFasta(bsp, fp, TRUE);
618         if(io_type == FILE_IO)
619           FileClose(fp);
620 }
621 
622 
restore_blast_id(SeqAnnotPtr sap,SeqIdPtr sip)623 void restore_blast_id(SeqAnnotPtr sap, SeqIdPtr sip)
624 {
625 	SeqAlignPtr align;
626 	DenseDiagPtr ddp;
627 	DenseSegPtr dsp;
628 	SeqIdPtr next_id;
629 	StdSegPtr ssp;
630 	SeqLocPtr slp;
631 	SeqIntPtr sint;
632 
633 	if(sap == NULL || sip == NULL)
634 		return;
635 	align = sap->data;
636 	while(align)
637 	{
638 		switch(align->segtype)
639 		{
640 			case 1:
641 				ddp = align->segs;
642 				while(ddp)
643 				{
644 					next_id = ddp->id->next;
645 					ddp->id->next = NULL;
646 					SeqIdFree(ddp->id);
647 					ddp->id = SeqIdDup(sip);
648 					ddp->id->next = next_id;
649 					ddp = ddp->next;
650 				}
651 				break;
652 
653 			case 2:
654 				dsp = align->segs;
655 				if(dsp)
656 				{
657 					next_id = dsp->ids->next;
658 					dsp->ids->next = NULL;
659 					SeqIdFree(dsp->ids);
660 					dsp->ids = SeqIdDup(sip);
661 					dsp->ids->next = next_id;
662 				}
663 				break;
664 
665 			case 3:
666 				ssp = align->segs;
667 				while(ssp)
668 				{
669 					next_id = ssp->ids->next;
670 					ssp->ids->next = NULL;
671 					SeqIdFree(ssp->ids);
672 					ssp->ids = SeqIdDup(sip);
673 					ssp->ids->next = next_id;
674 
675 					slp = ssp->loc;
676 					if(slp->choice == SEQLOC_INT)
677 					{
678 						sint = slp->data.ptrvalue;
679 						SeqIdFree(sint->id);
680 						sint->id = SeqIdDup(sip);
681 					}
682 					ssp = ssp->next;
683 				}
684 				break;
685 
686 			default:
687 				break;
688 		}
689 		align = align->next;
690 	}
691 }
692 
link_ddp(DenseDiagPtr PNTR head,DenseDiagPtr ddp_2)693 static void link_ddp(DenseDiagPtr PNTR head, DenseDiagPtr ddp_2)
694 {
695 	DenseDiagPtr ddp_1;
696 
697 	if(*head == NULL)
698 		*head = ddp_2;
699 	else
700 	{
701 		ddp_1 = *head;
702 		while(ddp_1->next != NULL)
703 			ddp_1 = ddp_1->next;
704 		ddp_1->next = ddp_2;
705 	}
706 }
707 
link_blast_sap(SeqAnnotPtr PNTR blast_sap,SeqAnnotPtr new)708 void link_blast_sap(SeqAnnotPtr PNTR blast_sap, SeqAnnotPtr new)
709 {
710 	SeqAlignPtr align, new_align, next_new_align;
711 	SeqAnnotPtr head;
712 	DenseDiagPtr ddp, new_ddp;
713 	SeqIdPtr new_id;
714 
715 	if(*blast_sap == NULL)
716 		*blast_sap = new;
717 	else
718 	{
719 		head = *blast_sap;
720 		align = head->data;
721 		if(align == NULL)
722 			head->data = new->data;
723 		else
724 		{
725 			new_align = new->data;
726 			while(new_align)
727 			{
728 				next_new_align = new_align->next;
729 				new_align->next = NULL;
730 				new_ddp = new_align->segs;
731 				new_id = new_ddp->id->next;
732 
733 				align = head->data;
734 				while(align)
735 				{
736 					ddp = align->segs;
737 					if(SeqIdMatch(new_id, ddp->id->next))
738 					{
739 						link_ddp(&ddp, new_ddp);
740 						new_align->segs = NULL;
741 						SeqAlignFree(new_align);
742 						break;
743 					}
744 					align = align->next;
745 				}
746 				if(align == NULL)
747 					head->data = link_align(new_align, (SeqAlignPtr )(head->data));
748 				new_align = next_new_align;
749 			}
750 		}
751 		new->data = NULL;
752 		SeqAnnotFree(new);
753 	}
754 }
755 
get_strand_val(Uint1Ptr strands,Int2 i)756 static Uint1 get_strand_val(Uint1Ptr strands, Int2 i)
757 {
758 	if(strands == NULL)
759 		return Seq_strand_plus;
760 	if(strands[i] != Seq_strand_minus)
761 		return Seq_strand_plus;
762 	else
763 		return Seq_strand_minus;
764 }
765 
filter_self_diagnol(DenseDiagPtr PNTR head)766 static DenseDiagPtr filter_self_diagnol(DenseDiagPtr PNTR head)
767 {
768 	DenseDiagPtr prev = NULL, ddp;
769 	DenseDiagPtr next;
770 	Uint1 strand_1, strand_2;
771 
772 	ddp = *head;
773 	while(ddp)
774 	{
775 		next = ddp->next;
776 		strand_1 = get_strand_val(ddp->strands, 0);
777 		strand_2 = get_strand_val(ddp->strands, 1);
778 		if((strand_1  == strand_2) && (ddp->starts[0] == ddp->starts[1]))
779 		{
780 			if(prev == NULL)
781 				*head = next;
782 			else
783 				prev->next = next;
784 			ddp->next = NULL;
785 			DenseDiagFree(ddp);
786 		}
787 		else
788 			prev = ddp;
789 		ddp = next;
790 	}
791 
792 	return (*head);
793 }
794 
795 
796 
filter_blast_query(SeqAnnotPtr blast_sap,BioseqPtr query)797 void filter_blast_query(SeqAnnotPtr blast_sap, BioseqPtr query)
798 {
799 	SeqAlignPtr align, next, prev = NULL;
800 	DenseDiagPtr ddp;
801 	Boolean free_it;
802 
803 
804 	if(query == NULL || blast_sap == NULL)
805 		return;
806 
807 	align = blast_sap->data;
808 	if(align == NULL)
809 		return;
810 	while(align)
811 	{
812 		next = align->next;
813 		free_it = FALSE;
814 		switch(align->segtype) {
815 			case 1:
816 				ddp = align->segs;
817 				if(ddp && ddp->id != NULL)
818 				{
819 					if(BioseqMatch(query, ddp->id->next))
820 					{
821 						filter_self_diagnol(&ddp);
822 						if(ddp == NULL)
823 						{
824 							if(prev == NULL)
825 								blast_sap->data = next;
826 							else
827 								prev->next = next;
828 							align->next = NULL;
829 							align->segs = NULL;
830 							SeqAlignFree(align);
831 							free_it = TRUE;
832 						}
833 						else
834 							align->segs = ddp;
835 
836 					}
837 				}
838 				break;
839 			case 2:
840 				break;
841 			case 3:
842 				break;
843 			default:
844 				break;
845 		}
846 		if(!free_it)
847 			prev = align;
848 		align = next;
849 	}
850 }
851 
852 
853 
854 
855 
856 
857 
858 
859 /*********************************************************************
860 *
861 *	break_blast_job(slp)
862 *	if the Seq-loc is too long for blast search, break it into
863 *	overlapping pieces to search more efficiently
864 *	slp: the query sequence
865 *	return  a list of Seq-locs for blast search
866 *
867 *********************************************************************/
break_blast_job(SeqLocPtr slp,SeqIdPtr sip,Int4 max_len,Int4 overlap_len)868 SeqLocPtr break_blast_job(SeqLocPtr slp, SeqIdPtr sip, Int4 max_len, Int4 overlap_len)
869 {
870 	Int4 K;
871 	Int4 length, len;
872 	SeqLocPtr head = NULL, new;
873 	Int4 start, stop;
874 	Uint1 strand;
875 
876 	length = SeqLocLen(slp);
877 	start = SeqLocStart(slp);
878 	stop = SeqLocStop(slp);
879 	strand = SeqLocStrand(slp);
880 
881 	if(length < max_len)
882 		return SeqLocIntNew(start, stop, strand, sip);
883 	else
884 	{
885 		K = (length)/max_len;
886 		if((length)%max_len!= 0)
887 			++K;
888 		len = length/K + overlap_len;
889 		while(K >0)
890 		{
891 			if(K == 1)
892 				stop = SeqLocStop(slp);
893 			else
894 				stop = (start + len -1);
895 			new = SeqLocIntNew(start, stop, strand, sip);
896 			ValNodeLink(&head, new);
897 			start += (len - overlap_len-1);
898 			--K;
899 		}
900 		return head;
901 	}
902 }
903 
904 
905 /*##################################################################
906 #
907 # 	functions related to merge_blast_result
908 #
909 ###################################################################*/
910 
DenseDiagForSameSequence(DenseDiagPtr ddp_1,DenseDiagPtr ddp_2)911 static Boolean DenseDiagForSameSequence(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2)
912 {
913 	SeqIdPtr sip_1, sip_2;
914 
915 	if(ddp_1 == NULL || ddp_2 == NULL)
916 		return FALSE;
917 
918 	if(ddp_1->dim != ddp_2->dim)
919 		return FALSE;
920 	sip_1 = ddp_1->id;
921 	sip_2 = ddp_2->id;
922 	while(sip_1 && sip_2)
923 	{
924 		if(!SeqIdMatch(sip_1, sip_2))
925 			return FALSE;
926 		sip_1 = sip_1->next;
927 		sip_2 = sip_2->next;
928 	}
929 	return TRUE;
930 }
931 
is_same_orientation(Uint1 strand_1,Uint1 strand_2)932 static Boolean is_same_orientation(Uint1 strand_1, Uint1 strand_2)
933 {
934 	if(strand_1 == strand_2)
935 		return TRUE;
936 	if(strand_1 == Seq_strand_minus)
937 		return (strand_2 == Seq_strand_minus);
938 	else
939 		return (strand_2 != Seq_strand_minus);
940 }
941 
DenseDiagSameOrient(DenseDiagPtr ddp_1,DenseDiagPtr ddp_2)942 static Boolean DenseDiagSameOrient(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2)
943 {
944 	Int2 dim, i;
945 	Uint1 strand_1, strand_2;
946 
947 	if(ddp_1 == NULL || ddp_2 == NULL)
948 		return FALSE;
949 	if(ddp_1->dim != ddp_2->dim)
950 		return FALSE;
951 
952 	dim = ddp_1->dim;
953 	for(i =0; i<dim; ++i)
954 	{
955 		strand_1 = get_strand_val(ddp_1->strands, i);
956 		strand_2 = get_strand_val(ddp_2->strands, i);
957 		if(!is_same_orientation(strand_1, strand_2))
958 			return FALSE;
959 	}
960 
961 	return TRUE;
962 }
963 
964 
StdSegForSameSequence(StdSegPtr ssp_1,StdSegPtr ssp_2)965 static Boolean StdSegForSameSequence(StdSegPtr ssp_1, StdSegPtr ssp_2)
966 {
967 	SeqIdPtr sip_1, sip_2;
968 	SeqLocPtr loc_1, loc_2;
969 
970 	if(ssp_1 == NULL || ssp_2 == NULL)
971 		return FALSE;
972 
973 	loc_1 = ssp_1->loc;
974 	loc_2 = ssp_2->loc;
975 	while(loc_1 && loc_2)
976 	{
977 		sip_1 = SeqLocId(loc_1);
978 		sip_2 = SeqLocId(loc_2);
979 		if(!SeqIdMatch(sip_1, sip_2))
980 			return FALSE;
981 		loc_1 = loc_1->next;
982 		loc_2 = loc_2->next;
983 	}
984 	return TRUE;
985 }
986 
987 
988 
StdSegSameOrient(StdSegPtr ssp_1,StdSegPtr ssp_2)989 static Boolean StdSegSameOrient(StdSegPtr ssp_1, StdSegPtr ssp_2)
990 {
991 	Uint1 strand_1, strand_2;
992 	SeqLocPtr loc_1, loc_2;
993 
994 	if(ssp_1 == NULL || ssp_2 == NULL)
995 		return FALSE;
996 	if(ssp_1->dim != ssp_2->dim)
997 		return FALSE;
998 
999 	loc_1 = ssp_1->loc;
1000 	loc_2 = ssp_2->loc;
1001 	while(loc_1 && loc_2)
1002 	{
1003 		strand_1 = SeqLocStrand(loc_1);
1004 		strand_2 = SeqLocStrand(loc_2);
1005 		if(!is_same_orientation(strand_1, strand_2))
1006 			return FALSE;
1007 		loc_1 = loc_1->next;
1008 		loc_2 = loc_2->next;
1009 	}
1010 	return TRUE;
1011 }
1012 
1013 
StdSegLink(StdSegPtr PNTR head,StdSegPtr ssp)1014 static void StdSegLink (StdSegPtr PNTR head, StdSegPtr ssp)
1015 {
1016 	StdSegPtr curr;
1017 
1018 	curr = *head;
1019 	if(curr == NULL)
1020 		*head = ssp;
1021 	else
1022 	{
1023 		while(curr->next != NULL)
1024 			curr = curr->next;
1025 		curr->next = ssp;
1026 	}
1027 }
1028 
merge_StdSeg(StdSegPtr ssp_1,StdSegPtr ssp_2)1029 static Boolean merge_StdSeg(StdSegPtr ssp_1, StdSegPtr ssp_2)
1030 {
1031 	Int4 overlap = -1;
1032 	Uint1 strand_1, strand;
1033 	Int4 start_1, stop_1, start_2, stop_2;
1034 	Int4 a_start, a_stop;	/*start and stop position of an aa sequence*/
1035 	SeqIntPtr sint;
1036 	SeqLocPtr slp;
1037 
1038 
1039 	if(!StdSegSameOrient(ssp_1, ssp_2))
1040 		return FALSE;
1041 
1042 	strand_1 = SeqLocStrand(ssp_1->loc);
1043 	strand = strand_1;
1044 	start_1 = SeqLocStart(ssp_1->loc);
1045 	stop_1 = SeqLocStop(ssp_1->loc);
1046 	start_2 = SeqLocStart(ssp_2->loc);
1047 	stop_2 = SeqLocStop(ssp_2->loc);
1048 	if(start_2 > stop_1)	/*no overlaps*/
1049 		return FALSE;
1050 	else
1051 	{
1052 		if((stop_1 - start_2 + 1)%3 != 0)	/*check if they are in frame*/
1053 			return FALSE;
1054 		else
1055 		{
1056 			if(strand == Seq_strand_minus)
1057 			{
1058 				a_stop = SeqLocStart(ssp_2->loc->next);
1059 				a_start = SeqLocStop(ssp_1->loc->next);
1060 			}
1061 			else
1062 			{
1063 				a_stop = SeqLocStop(ssp_1->loc->next);
1064 				a_start = SeqLocStart(ssp_2->loc->next);
1065 			}
1066 			if((a_stop - a_start +1) *3 != (stop_1 - start_2 +1))
1067 				return FALSE;
1068 			else
1069 			{
1070 				slp = ssp_1->loc;
1071 				if(slp->choice != SEQLOC_INT)
1072 					return FALSE;
1073 				sint = slp->data.ptrvalue;
1074 				sint->to = MAX(sint->to, stop_2);
1075 
1076 				slp = ssp_1->loc->next;
1077 				sint = slp->data.ptrvalue;
1078 				if(strand == Seq_strand_minus)
1079 					sint->from = MIN(sint->from, SeqLocStart(ssp_2->loc->next));
1080 				else
1081 					sint->to = MAX(sint->to, SeqLocStop(ssp_2->loc->next));
1082 				return TRUE;
1083 			}
1084 		}
1085 	}
1086 
1087 }
1088 
1089 
1090 /*******************************************************************
1091 *
1092 *	merge_DenseDiag(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2)
1093 *	merge the two DenseDiags if they overlap and form the same
1094 *	aligned diagnol. It is assumed that ddp_1 is a diag computed
1095 *	on a location before ddp_2. Their relative order is taken
1096 *	into account
1097 *
1098 ********************************************************************/
merge_DenseDiag(DenseDiagPtr ddp_1,DenseDiagPtr ddp_2)1099 static Boolean merge_DenseDiag(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2)
1100 {
1101 	Int2 i, dim;
1102 	Int4 overlap = -1;
1103 	Uint1 strand;
1104 	Int4 start, stop;
1105 
1106 	/*it is assumed that the dimension and Seq-id has been checked before-hand*/
1107 	if(!DenseDiagSameOrient(ddp_1, ddp_2))
1108 		return FALSE;
1109 
1110 	dim = ddp_1->dim;
1111 	for(i =0; i<dim; ++i)
1112 	{
1113 		strand = get_strand_val(ddp_1->strands, i);
1114 		if(strand== Seq_strand_minus)
1115 		{
1116 			start = ddp_2->starts[i] + ddp_2->len -1;
1117 			stop = ddp_1->starts[i];
1118 		}
1119 		else
1120 		{
1121 			start = ddp_2->starts[i];
1122 			stop = ddp_1->starts[i] + ddp_1->len -1;
1123 		}
1124 		if(start > stop)	/*no overlap*/
1125 			return FALSE;
1126 		if(overlap == -1)
1127 			overlap = stop - start;
1128 		else
1129 			if(stop - start != overlap)
1130 				return FALSE;
1131 	}
1132 
1133 	if(overlap == -1)
1134 		return FALSE;
1135 	for(i = 0; i<dim; ++i)
1136 	{
1137 		if(get_strand_val(ddp_1->strands, i) == Seq_strand_minus)
1138 			ddp_1->starts[i] -= overlap;
1139 	}
1140 	ddp_1->len += (ddp_2->len -1 - overlap);
1141 	return TRUE;
1142 }
1143 
1144 
1145 
1146 
1147 /*******************************************************************
1148 *
1149 *	align_for_same_sequences(sap_1, sap_2)
1150 *	check if the same sequences are involved in alignments
1151 *
1152 ********************************************************************/
align_for_same_sequences(SeqAlignPtr sap_1,SeqAlignPtr sap_2)1153 static Boolean align_for_same_sequences(SeqAlignPtr sap_1, SeqAlignPtr sap_2)
1154 {
1155 	DenseDiagPtr ddp_1, ddp_2;
1156 	StdSegPtr ssp_1, ssp_2;
1157 
1158 	if(sap_1->segtype != sap_2->segtype)
1159 		return FALSE;
1160 	if(sap_1->segs == NULL || sap_2->segs == NULL)
1161 		return FALSE;
1162 
1163 	switch(sap_1->segtype)
1164 	{
1165 		case 1:	/*it is a Dense-diag*/
1166 			ddp_1 = sap_1->segs;
1167 			ddp_2 = sap_2->segs;
1168 			return DenseDiagForSameSequence(ddp_1, ddp_2);
1169 
1170 		case 3:
1171 			ssp_1 = sap_1->segs;
1172 			ssp_2 = sap_2->segs;
1173 			return StdSegForSameSequence(ssp_1, ssp_2);
1174 		default:
1175 			return FALSE;
1176 	}
1177 }
1178 
find_previous_align_in_list(SeqAlignPtr list,SeqAlignPtr new_sap)1179 static SeqAlignPtr find_previous_align_in_list(SeqAlignPtr list, SeqAlignPtr new_sap)
1180 {
1181 	while(list)
1182 	{
1183 		if(align_for_same_sequences(list, new_sap))
1184 			return list;
1185 		list = list->next;
1186 	}
1187 
1188 	return NULL;
1189 }
1190 
1191 
get_score_value(ScorePtr sp)1192 Int4 get_score_value(ScorePtr sp)
1193 {
1194 	ObjectIdPtr oip;
1195 
1196 	while(sp)
1197 	{
1198 		if(sp->id)
1199 		{
1200 			oip = sp->id;
1201 			if(oip && oip->str && StringCmp(oip->str, "score") == 0)
1202 			{
1203 				if(sp->choice == 1)
1204 					return sp->value.intvalue;
1205 			}
1206 		}
1207 		sp = sp->next;
1208 	}
1209 
1210 	return -1;
1211 }
1212 
replace_with_big_score(DenseDiagPtr free_ddp,DenseDiagPtr ddp)1213 static Boolean replace_with_big_score (DenseDiagPtr free_ddp, DenseDiagPtr ddp)
1214 {
1215 	Int4 free_score, score;
1216 
1217 	if(free_ddp == NULL || ddp == NULL)
1218 		return FALSE;
1219 	if(ddp->scores == NULL)
1220 	{
1221 		ddp->scores = free_ddp->scores;
1222 		free_ddp->scores = NULL;
1223 		return TRUE;
1224 	}
1225 	else if(free_ddp->scores != NULL)
1226 	{
1227 		free_score = get_score_value(free_ddp->scores);
1228 		score = get_score_value(ddp->scores);
1229 
1230 		if(free_score > score)
1231 		{
1232 			ScoreSetFree(ddp->scores);
1233 			ddp->scores = free_ddp->scores;
1234 			free_ddp->scores = NULL;
1235 			return TRUE;
1236 		}
1237 	}
1238 
1239 	return FALSE;
1240 }
merge_one_align(SeqAlignPtr list,SeqAlignPtr new_sap)1241 static Boolean merge_one_align(SeqAlignPtr list, SeqAlignPtr new_sap)
1242 {
1243 	SeqAlignPtr prev_sap;
1244 	DenseDiagPtr ddp_1, ddp_2;
1245 	DenseDiagPtr newddp;
1246 	DenseDiagPtr next;
1247 	Boolean merge;
1248 	StdSegPtr ssp_1, ssp_2;
1249 	StdSegPtr new_ssp;
1250 	StdSegPtr next_ssp;
1251 
1252 
1253 	prev_sap  = find_previous_align_in_list(list, new_sap);
1254 	if(prev_sap == NULL)
1255 		return FALSE;
1256 
1257 	switch(prev_sap->segtype)
1258 	{
1259 		case 1:	/*it is the Dense-diag*/
1260 			ddp_2 = (DenseDiagPtr)(new_sap->segs);
1261 			newddp = NULL;
1262 			while(ddp_2)
1263 			{
1264 				next = ddp_2->next;
1265 				ddp_2->next = NULL;
1266 				ddp_1 = (DenseDiagPtr)(prev_sap->segs);
1267 				merge = FALSE;
1268 				while(ddp_1 )
1269 				{
1270 					merge = merge_DenseDiag(ddp_1, ddp_2);
1271 					if(merge)
1272 						break;
1273 					else
1274 						ddp_1 = ddp_1->next;
1275 				}
1276 				if(merge)
1277 				{
1278 					replace_with_big_score(ddp_2, ddp_1);
1279 					DenseDiagFree(ddp_2);
1280 				}
1281 				else
1282 					link_ddp(&newddp, ddp_2);
1283 
1284 				ddp_2 = next;
1285 			}
1286 			if(newddp)
1287 				link_ddp((DenseDiagPtr PNTR)&(prev_sap->segs), newddp);
1288 			new_sap->segs = NULL;
1289 		   return TRUE;
1290 
1291 		case 3:
1292 			ssp_2 = (StdSegPtr)(new_sap->segs);
1293 			new_ssp = NULL;
1294 			while(ssp_2)
1295 			{
1296 				next_ssp = ssp_2->next;
1297 				ssp_2->next = NULL;
1298 				merge = FALSE;
1299 				ssp_1 = (StdSegPtr)(prev_sap->segs);
1300 				while(ssp_1 && !merge)
1301 				{
1302 					merge = merge_StdSeg(ssp_1, ssp_2);
1303 					ssp_1 = ssp_1->next;
1304 				}
1305 				if(merge)
1306 					StdSegFree(ssp_2);
1307 				else
1308 					StdSegLink(&new_ssp, ssp_2);
1309 				ssp_2 = next_ssp;
1310 			}
1311 			if(new_ssp)
1312 				StdSegLink((StdSegPtr PNTR)&(prev_sap->segs), new_ssp);
1313 			new_sap->segs = NULL;
1314 			return TRUE;
1315 
1316 		default:
1317 			return FALSE;
1318 	}
1319 
1320 }
1321 
1322 /*******************************************************************
1323 *
1324 *	merge_blast_result(head, new)
1325 *	when the input query is too big, it is broken into several
1326 *	overlapping Seq-loc to be sent to blast. The results can be
1327 *	merged if two Dense-diags are actually represent one HSP
1328 *
1329 ********************************************************************/
merge_blast_result(SeqAnnotPtr PNTR head,SeqAnnotPtr new)1330 SeqAnnotPtr merge_blast_result(SeqAnnotPtr PNTR head, SeqAnnotPtr new)
1331 {
1332 	SeqAlignPtr list, align, next;
1333 	SeqAlignPtr newlist = NULL;
1334 
1335 	if(*head == NULL)
1336 		*head = new;
1337 	else
1338 	{
1339 		list = (*head)->data;
1340 		align = new->data;
1341 		while(align)
1342 		{
1343 			next = align->next;
1344 			align->next = NULL;
1345 			if(merge_one_align(list, align))
1346 				SeqAlignFree(align);
1347 			else
1348 				newlist = link_align(align, newlist);
1349 			align = next;
1350 		}
1351 		if(newlist !=NULL)
1352 		{
1353 			list = link_align(newlist, list);
1354 			(*head)->data = list;
1355 		}
1356 		new->data = NULL;
1357 		SeqAnnotFree(new);
1358 	}
1359 	return (*head);
1360 }
1361 
GetOrg(AsnExpOptStructPtr aeosp)1362 static void LIBCALLBACK GetOrg (AsnExpOptStructPtr aeosp)
1363 {
1364   OrgRefPtr orp;
1365   CharPtr PNTR org_name;
1366 
1367         if (aeosp->dvp->intvalue != START_STRUCT){
1368 		return;
1369 	}
1370 
1371 	 orp = (OrgRefPtr)aeosp->the_struct;
1372 	 org_name = aeosp->data;
1373 	 if(*org_name == NULL && orp->taxname)
1374 		*org_name = StringSave(orp->taxname);
1375 	return;
1376 }
1377 
1378 /********************************************************************
1379 *
1380 *	get_organism(bsp)
1381 *	filter the taxname from the organism field in Bioseq
1382 *
1383 ********************************************************************/
get_organism(BioseqPtr bsp)1384 CharPtr get_organism(BioseqPtr bsp)
1385 {
1386 	SeqEntryPtr sep;
1387 	AsnIoPtr aip;
1388 	CharPtr org_name;
1389 	AsnExpOptPtr aeop;
1390 
1391 	sep = SeqEntryFind(bsp->id);
1392         aip = AsnIoNullOpen();
1393 
1394 	org_name = NULL;
1395         if ((aeop = AsnExpOptNew(aip, "Org-ref", (Pointer)(&org_name), GetOrg)) ==NULL)
1396                 ErrPost(100, 1, "no aeop 1");
1397 	SeqEntryAsnWrite(sep, aip, NULL);
1398         AsnExpOptFree(aip, aeop);
1399 	AsnIoClose(aip);
1400 	return org_name;
1401 }
1402 
1403 
merge_dense_diag_internal_repeats(DenseDiagPtr ddp_1,DenseDiagPtr ddp_2,BoolPtr pkeep_1)1404 static Boolean merge_dense_diag_internal_repeats(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2, BoolPtr pkeep_1)
1405 {
1406 	Int4 a_start_1, a_stop_1;
1407 	Int4 a_start_2, a_stop_2;
1408 	Int4 b_start_1, b_stop_1;
1409 	Int4 b_start_2, b_stop_2;
1410 	Boolean keep_1;
1411 
1412 	if(!DenseDiagSameOrient(ddp_1, ddp_2))
1413 		return FALSE;
1414 
1415 	/*check overlap*/
1416 	a_start_1 = ddp_1->starts[0];
1417 	a_stop_1 = a_start_1 + ddp_1->len -1;
1418 	a_start_2 = ddp_2->starts[0];
1419 	a_stop_2 = a_start_2 + ddp_2->len -1;
1420 
1421 	if(a_start_2 >= a_start_1 && a_stop_2 <= a_stop_1)
1422 		keep_1 = TRUE;
1423 	else if(a_start_1 >= a_start_2 && a_stop_1 <= a_stop_2)
1424 		keep_1 = FALSE;
1425 	else
1426 		return FALSE;
1427 
1428 	b_start_1 = ddp_1->starts[1];
1429 	b_stop_1 = b_start_1 + ddp_1->len -1;
1430 	b_start_2 = ddp_2->starts[1];
1431 	b_stop_2 = b_start_2 + ddp_2->len -1;
1432 
1433 	*pkeep_1 = keep_1;
1434 	if(keep_1)
1435 		return (b_start_2 >= b_start_1 && b_stop_2 <= b_stop_1);
1436 	else
1437 		return (b_start_1 >= b_start_2 && b_stop_1 <= b_stop_2);
1438 }
1439 
merge_std_seg_internal_repeats(StdSegPtr ssp_1,StdSegPtr ssp_2,BoolPtr pkeep_1)1440 static Boolean merge_std_seg_internal_repeats(StdSegPtr ssp_1, StdSegPtr ssp_2, BoolPtr pkeep_1)
1441 {
1442 	Int4 a_start_1, a_stop_1;
1443 	Int4 a_start_2, a_stop_2;
1444 	Int4 b_start_1, b_stop_1;
1445 	Int4 b_start_2, b_stop_2;
1446 	Boolean keep_1;
1447 	SeqLocPtr loc_1, loc_2;
1448 
1449 	if(!StdSegSameOrient(ssp_1, ssp_2))
1450 		return FALSE;
1451 
1452 
1453 	/*check overlap*/
1454 	loc_1 = ssp_1->loc;
1455 	loc_2 = ssp_2->loc;
1456 
1457 	a_start_1 = SeqLocStart(loc_1);
1458 	a_stop_1 = SeqLocStop(loc_1);
1459 	a_start_2 = SeqLocStart(loc_2);
1460 	a_stop_2 = SeqLocStop(loc_2);
1461 
1462 	if(a_start_2 >= a_start_1 && a_stop_2 <= a_stop_1)
1463 		keep_1 = TRUE;
1464 	else if(a_start_1 >= a_start_2 && a_stop_1 <= a_stop_2)
1465 		keep_1 = FALSE;
1466 	else
1467 		return FALSE;
1468 
1469 	loc_1 = ssp_1->loc->next;
1470 	loc_2 = ssp_2->loc->next;
1471 	b_start_1 = SeqLocStart(loc_1);
1472 	b_stop_1 = SeqLocStop(loc_1);
1473 	b_start_2 = SeqLocStart(loc_2);
1474 	b_stop_2 = SeqLocStop(loc_2);
1475 
1476 	*pkeep_1 = keep_1;
1477 	if(keep_1)
1478 		return (b_start_2 >= b_start_1 && b_stop_2 <= b_stop_1);
1479 	else
1480 		return (b_start_1 >= b_start_2 && b_stop_1 <= b_stop_2);
1481 }
1482 
1483 
1484 static SeqAlignPtr clean_repeats_in_chain PROTO((SeqAlignPtr head));
1485 
1486 
1487 
1488 
clean_dense_diag(DenseDiagPtr curr,DenseDiagPtr PNTR chain)1489 static Boolean clean_dense_diag(DenseDiagPtr curr, DenseDiagPtr PNTR chain)
1490 {
1491 
1492 	DenseDiagPtr c_next, c_prev, next;
1493 	Boolean keep_1;
1494 
1495 	c_prev = NULL;
1496 	c_next = *chain;
1497 
1498 	while(c_next)
1499 	{
1500 		keep_1 = FALSE;
1501 		next = c_next->next;
1502 		if(merge_dense_diag_internal_repeats(curr, c_next, &keep_1))
1503 		{
1504 		   if(keep_1)
1505 		   {
1506 				c_next->next = NULL;
1507 				replace_with_big_score(c_next, curr);
1508 				DenseDiagFree(c_next);
1509 				if(c_prev == NULL)
1510 					*chain = next;
1511 				else
1512 					c_prev->next = next;
1513 			}
1514 		    else
1515 				return TRUE;
1516 		}
1517 		if(!keep_1)
1518 		   c_prev = c_next;
1519 		c_next = next;
1520 	}
1521 
1522 	return FALSE;
1523 }
1524 
1525 
clean_std_seg(StdSegPtr curr,StdSegPtr PNTR chain)1526 static Boolean clean_std_seg(StdSegPtr curr, StdSegPtr PNTR chain)
1527 {
1528 
1529 	StdSegPtr c_next, c_prev, next;
1530 	Boolean keep_1;
1531 
1532 	c_prev = NULL;
1533 	c_next = *chain;
1534 
1535 	while(c_next)
1536 	{
1537 		keep_1 = FALSE;
1538 		next = c_next->next;
1539 		if(merge_std_seg_internal_repeats(curr, c_next, &keep_1))
1540 		{
1541 		   if(keep_1)
1542 		   {
1543 				c_next->next = NULL;
1544 				StdSegFree(c_next);
1545 				if(c_prev == NULL)
1546 					*chain = next;
1547 				else
1548 					c_prev->next = next;
1549 			}
1550 		    else
1551 				return TRUE;
1552 		}
1553 		if(!keep_1)
1554 		   c_prev = c_next;
1555 		c_next = next;
1556 	}
1557 
1558 	return FALSE;
1559 }
1560 
1561 /*clean all the repeats within the Seq-align*/
clean_all_internal_repeats(SeqAlignPtr PNTR head)1562 void clean_all_internal_repeats(SeqAlignPtr PNTR head)
1563 {
1564 	DenseDiagPtr ddp, ddp_next, ddp_prev;
1565 	SeqAlignPtr align, ali_next, ali_prev = NULL;
1566 	StdSegPtr ssp, ssp_next, ssp_prev;
1567 
1568 
1569 	align = *head;
1570 
1571 	while(align)
1572 	{
1573 		ali_next = align->next;
1574 		switch(align->segtype)
1575 		{
1576 		case 1:
1577 			ddp = align->segs;
1578 			ddp_prev = NULL;
1579 			while(ddp !=NULL && ddp->next !=NULL)
1580 			{
1581 				ddp_next = ddp->next;
1582 				if(clean_dense_diag(ddp, &ddp_next))
1583 				{
1584 					if(ddp_prev == NULL)
1585 						align->segs = ddp_next;
1586 					else
1587 						ddp_prev->next = ddp_next;
1588 					ddp->next = NULL;
1589 					replace_with_big_score(ddp, ddp_next);
1590 					DenseDiagFree(ddp);
1591 				}
1592 				else
1593 				{
1594 					ddp->next = ddp_next;
1595 					ddp_prev = ddp;
1596 				}
1597 				ddp = ddp_next;
1598 			}
1599 			break;
1600 
1601 		case 3:
1602 			ssp = align->segs;
1603 			ssp_prev = NULL;
1604 			while(ssp !=NULL && ssp->next !=NULL)
1605 			{
1606 				ssp_next = ssp->next;
1607 				if(clean_std_seg(ssp, &ssp_next))
1608 				{
1609 					if(ssp_prev == NULL)
1610 						align->segs = ssp_next;
1611 					else
1612 						ssp_prev->next = ssp_next;
1613 					ssp->next = NULL;
1614 					StdSegFree(ssp);
1615 				}
1616 				else
1617 				{
1618 					ssp->next = ssp_next;
1619 					ssp_prev = ssp;
1620 				}
1621 				ssp = ssp_next;
1622 			}
1623 			break;
1624 
1625 		default:
1626 			break;
1627 		}
1628 		if(align->segs == NULL)
1629 		{
1630 			if(ali_prev)
1631 				ali_prev->next = ali_next;
1632 			else
1633 				*head = ali_next;
1634 			align->next = NULL;
1635 			SeqAlignFree(align);
1636 		}
1637 		else
1638 			ali_prev = align;
1639 		align = ali_next;
1640 	}
1641 
1642 	*head = clean_repeats_in_chain(*head);
1643 }
1644 
1645 
clean_repeat_current(SeqAlignPtr align,SeqAlignPtr next)1646 static void clean_repeat_current(SeqAlignPtr align, SeqAlignPtr next)
1647 {
1648 	DenseDiagPtr ddp_1, ddp_2, pddp_1, pddp_2, nddp_1, nddp_2;
1649 	Boolean keep_1, free_1;
1650 	StdSegPtr ssp_1, ssp_2, pssp_1, pssp_2, nssp_1, nssp_2;
1651 	Uint1 segtype;
1652 
1653 	if(align == NULL || next == NULL)
1654 		return;
1655 	segtype = align->segtype;
1656 
1657 	while(next)
1658 	{
1659 		switch(segtype)
1660 		{
1661 		case 1:/*it is a Dense-diag*/
1662 			ddp_1 = align->segs;
1663 			pddp_1 = NULL;
1664 			while(ddp_1)
1665 			{
1666 				ddp_2 = next->segs;
1667 				pddp_2 = NULL;
1668 				nddp_1 = ddp_1->next;
1669 				free_1 = FALSE;
1670 				while(ddp_2 && !free_1)
1671 				{
1672 					nddp_2 = ddp_2->next;
1673 					if(merge_dense_diag_internal_repeats(ddp_1, ddp_2, &keep_1))
1674 					{
1675 						if(keep_1)
1676 						{
1677 							if(pddp_2 == NULL)
1678 								next->segs = nddp_2;
1679 							else
1680 								pddp_2->next = nddp_2;
1681 							ddp_2->next = NULL;
1682 							replace_with_big_score(ddp_2, ddp_1);
1683 							DenseDiagFree(ddp_2);
1684 							ddp_2 = nddp_2;
1685 						}
1686 						else
1687 						{
1688 							if(pddp_1 == NULL)
1689 								align->segs = nddp_1;
1690 							else
1691 								pddp_1->next = nddp_1;
1692 							ddp_1->next = NULL;
1693 							replace_with_big_score(ddp_1, ddp_2);
1694 							DenseDiagFree(ddp_1);
1695 							ddp_1 = nddp_1;
1696 							free_1 = TRUE;
1697 							break;
1698 						}
1699 					}
1700 					else
1701 					{
1702 						pddp_2 = ddp_2;
1703 						ddp_2 = ddp_2->next;
1704 					}
1705 				}
1706 
1707 				if(!free_1)
1708 				{
1709 					pddp_1 = ddp_1;
1710 					ddp_1 = ddp_1->next;
1711 				}
1712 			}
1713 			break;
1714 
1715 		case 3:		/*it is a Std-seg*/
1716 			ssp_1 = align->segs;
1717 			pssp_1 = NULL;
1718 			while(ssp_1)
1719 			{
1720 				ssp_2 = next->segs;
1721 				pssp_2 = NULL;
1722 				nssp_1 = ssp_1->next;
1723 				free_1 = FALSE;
1724 				while(ssp_2 && !free_1)
1725 				{
1726 					nssp_2 = ssp_2->next;
1727 					if(merge_std_seg_internal_repeats(ssp_1, ssp_2, &keep_1))
1728 					{
1729 						if(keep_1)
1730 						{
1731 							if(pssp_2 == NULL)
1732 								next->segs = nssp_2;
1733 							else
1734 								pssp_2->next = nssp_2;
1735 							ssp_2->next = NULL;
1736 							StdSegFree(ssp_2);
1737 							ssp_2 = nssp_2;
1738 						}
1739 						else
1740 						{
1741 							if(pssp_1 == NULL)
1742 								align->segs = nssp_1;
1743 							else
1744 								pssp_1->next = nssp_1;
1745 							ssp_1->next = NULL;
1746 							StdSegFree(ssp_1);
1747 							ssp_1 = nssp_1;
1748 							free_1 = TRUE;
1749 							break;
1750 						}
1751 					}
1752 					else
1753 					{
1754 						pssp_2 = ssp_2;
1755 						ssp_2 = ssp_2->next;
1756 					}
1757 				}
1758 
1759 				if(!free_1)
1760 				{
1761 					pssp_1 = ssp_1;
1762 					ssp_1 = ssp_1->next;
1763 				}
1764 			}
1765 			break;
1766 
1767 		default:
1768 			break;
1769 
1770 		}
1771 		next = next->next;
1772 	}
1773 }
1774 
1775 
clean_repeats_in_chain(SeqAlignPtr head)1776 static SeqAlignPtr clean_repeats_in_chain(SeqAlignPtr head)
1777 {
1778 	SeqAlignPtr align, next, prev;
1779 
1780 	align = head;
1781 	while(align->next !=NULL)
1782 	{
1783 		next = align->next;
1784 		if(align_for_same_sequences(align, next))
1785 			clean_repeat_current(align, next);
1786 		align = next;
1787 	}
1788 
1789 	/*free all the empty records*/
1790 	align = head;
1791 	prev = NULL;
1792 	while(align)
1793 	{
1794 		next = align->next;
1795 		if(align->segs == NULL)
1796 		{
1797 			if(prev == NULL)
1798 				head->next = next;
1799 			else
1800 				prev->next = next;
1801 			align->next = NULL;
1802 			SeqAlignFree(align);
1803 		}
1804 		else
1805 			prev = align;
1806 		align = next;
1807 	}
1808 
1809 	return head;
1810 }
1811 
1812 
1813 
switch_Dense_diag(DenseDiagPtr ddp_1,DenseDiagPtr ddp_2)1814 static Boolean switch_Dense_diag(DenseDiagPtr ddp_1, DenseDiagPtr ddp_2)
1815 {
1816 	if(ddp_1->starts[0] > ddp_2->starts[0])
1817 		return TRUE;
1818 
1819 	if(ddp_1->starts[0] < ddp_2->starts[0])
1820 		return FALSE;
1821 
1822 	return (ddp_1->len < ddp_2->len);
1823 }
1824 
sort_dense_diag(DenseDiagPtr head)1825 static DenseDiagPtr sort_dense_diag(DenseDiagPtr head)
1826 {
1827 	DenseDiagPtr PNTR list;
1828 	DenseDiagPtr curr, last = NULL;
1829 	Int4 num =0, i, j;
1830 	Boolean s_witch = TRUE;
1831 
1832 	for(curr = head; curr != NULL; curr = curr->next)
1833 		++num;
1834 	if(num == 0 || num == 1)
1835 		return head;
1836 
1837 	list = MemNew((size_t)num * sizeof(DenseDiagPtr));
1838 	for(i =0, curr = head; i<num; ++i)
1839 	{
1840 		list[i] = curr;
1841 		curr = curr->next;
1842 	}
1843 	for(i = 0; i<num-1 && s_witch == TRUE; ++i)
1844 	{
1845 		s_witch = FALSE;
1846 		for(j =0; j<num-i-1; ++j)
1847 		{
1848 			if(switch_Dense_diag(list[j], list[j+1]))
1849 			{
1850 				s_witch = TRUE;
1851 				curr = list[j];
1852 				list[j] = list[j+1];
1853 				list[j +1] = curr;
1854 			}
1855 		}
1856 	}
1857 
1858 	for(i =0; i<num -1; ++i)
1859 	{
1860 		curr = list[i];
1861 		curr->next = list[i+1];
1862 		last = curr->next;
1863 	}
1864 	last->next = NULL;
1865 	head = list[0];
1866 	MemFree(list);
1867 	return head;
1868 }
1869 
1870 
switch_std_seg(StdSegPtr ssp_1,StdSegPtr ssp_2)1871 static Boolean switch_std_seg(StdSegPtr ssp_1, StdSegPtr ssp_2)
1872 {
1873 	Int4 start_1, start_2;
1874 
1875 	start_1 = SeqLocStart(ssp_1->loc);
1876 	start_2 = SeqLocStart(ssp_2->loc);
1877 
1878 	if(start_2 < start_1)
1879 		return TRUE;
1880 	else if(start_2 == start_1)
1881 	{
1882 		if(SeqLocStop(ssp_2->loc) < SeqLocStop(ssp_1->loc))
1883 			return TRUE;
1884 		else
1885 			return FALSE;
1886 	}
1887 	else
1888 		return FALSE;
1889 }
1890 
1891 
sort_std_seg(StdSegPtr ssp)1892 static StdSegPtr sort_std_seg(StdSegPtr ssp)
1893 {
1894 	StdSegPtr PNTR list;
1895 	StdSegPtr curr, last = NULL;
1896 	Int4 num =0, i, j;
1897 	Boolean s_witch = TRUE;
1898 
1899 	for(curr = ssp; curr != NULL; curr = curr->next)
1900 		++num;
1901 	if(num == 0 || num == 1)
1902 		return ssp;
1903 
1904 	list = MemNew((size_t)num * sizeof(DenseDiagPtr));
1905 	for(i =0, curr = ssp; i<num; ++i)
1906 	{
1907 		list[i] = curr;
1908 		curr = curr->next;
1909 	}
1910 	for(i = 0; i<num-1 && s_witch == TRUE; ++i)
1911 	{
1912 		s_witch = FALSE;
1913 		for(j =0; j<num-i-1; ++j)
1914 		{
1915 			if(switch_std_seg(list[j], list[j+1]))
1916 			{
1917 				s_witch = TRUE;
1918 				curr = list[j];
1919 				list[j] = list[j+1];
1920 				list[j +1] = curr;
1921 			}
1922 		}
1923 	}
1924 
1925 	for(i =0; i<num -1; ++i)
1926 	{
1927 		curr = list[i];
1928 		curr->next = list[i+1];
1929 		last = curr->next;
1930 	}
1931 	last->next = NULL;
1932 	ssp = list[0];
1933 	MemFree(list);
1934 	return ssp;
1935 }
1936 
1937 typedef struct parse_blast{
1938 	Int4 K;
1939 	SeqLocPtr m_loc, s_loc;
1940 	ScorePtr scores;	/*the original scores from blast */
1941 }ParseBlast, PNTR ParseBlastPtr;
1942 
1943 #define MAX_GAP_LEN 200	/*maximum gap length allowed by sim2*/
1944 #define DNA_EDGE_LEN 1000	/*add the length of the edge factor*/
1945 #define AA_EDGE_LEN 100
1946 
check_sim_list(DenseDiagPtr ddp,ValNodePtr PNTR head)1947 static void check_sim_list(DenseDiagPtr ddp, ValNodePtr PNTR head)
1948 {
1949 	ValNodePtr sim_list;
1950 	ParseBlastPtr pbp;
1951 	Int4 m_start, m_stop;
1952 	Int4 s_start, s_stop;
1953 	Int4 c_start, c_stop;
1954 	Boolean increase_K;
1955 	Uint1 strand;
1956 	SeqIdPtr s_sip;
1957 	Boolean found;
1958 	Int4 score, blast_score;
1959 
1960 	m_start = ddp->starts[0];
1961 	m_stop = m_start + ddp->len -1;
1962 	s_start = ddp->starts[1];
1963 	s_stop = s_start + ddp->len -1;
1964 	strand = get_strand_val(ddp->strands, 0);
1965 
1966 	sim_list = *head;
1967 	while(sim_list)
1968 	{
1969 	   pbp = sim_list->data.ptrvalue;
1970 	   s_sip = SeqLocId(pbp->s_loc);
1971 	   if(SeqIdMatch(s_sip, ddp->id->next) && strand == SeqLocStrand(pbp->s_loc))
1972 	   {
1973 		   if(s_sip->choice == SEQID_GI && s_sip->data.intvalue == 1418720)
1974 			   found = TRUE;
1975 		increase_K = FALSE;
1976 		found = TRUE;
1977 		c_start = SeqLocStart(pbp->m_loc);
1978 		c_stop = SeqLocStop(pbp->m_loc);
1979 		if(c_start > m_stop)
1980 		{
1981 			if(c_start - m_stop > MAX_GAP_LEN)
1982 			{
1983 				if(c_start - m_stop > DNA_EDGE_LEN)
1984 					found = FALSE;
1985 				else
1986 					increase_K = TRUE;
1987 			}
1988 		}
1989 
1990 		if(c_stop < m_start)
1991 		{
1992 			if(m_start - c_stop > MAX_GAP_LEN)
1993 			{
1994 				if(m_start - c_stop > DNA_EDGE_LEN)
1995 					found = FALSE;
1996 				else
1997 					increase_K = TRUE;
1998 			}
1999 		}
2000 
2001 		c_start = SeqLocStart(pbp->s_loc);
2002 		c_stop = SeqLocStop(pbp->s_loc);
2003 		if(c_start > s_stop)
2004 		{
2005 			if(c_start - s_stop > MAX_GAP_LEN)
2006 			{
2007 				if(c_start - s_stop > DNA_EDGE_LEN)
2008 					found = FALSE;
2009 				else
2010 					increase_K = TRUE;
2011 			}
2012 		}
2013 
2014 		if(c_stop < s_start)
2015 		{
2016 			if(s_start - c_stop > MAX_GAP_LEN)
2017 			{
2018 				if(s_start - c_stop > DNA_EDGE_LEN)
2019 					found = FALSE;
2020 				else
2021 					increase_K = TRUE;
2022 			}
2023 		}
2024 		if(found)
2025 		{
2026 			c_start = MIN(m_start, SeqLocStart(pbp->m_loc));
2027 			c_stop = MAX(m_stop, SeqLocStop(pbp->m_loc));
2028 			update_seq_loc(c_start, c_stop, Seq_strand_plus, pbp->m_loc);
2029 
2030 			c_start = MIN(s_start, SeqLocStart(pbp->s_loc));
2031 			c_stop = MAX(s_stop, SeqLocStop(pbp->s_loc));
2032 			update_seq_loc(c_start, c_stop, 0, pbp->s_loc);
2033 
2034 			if(increase_K)
2035 				++pbp->K;
2036 			blast_score = get_score_value(ddp->scores);
2037 			score = get_score_value(pbp->scores);
2038 			if(blast_score > score)
2039 			{
2040 				ScoreSetFree(pbp->scores);
2041 				pbp->scores = ddp->scores;
2042 				ddp->scores = NULL;
2043 			}
2044 			return;
2045 		}
2046 	    }
2047 	    sim_list = sim_list->next;
2048 	}
2049 
2050 	pbp = MemNew(sizeof(ParseBlast));
2051 	pbp->K = 1;
2052 	pbp->m_loc = SeqLocIntNew(m_start, m_stop, Seq_strand_plus, ddp->id);
2053 	pbp->s_loc = SeqLocIntNew(s_start, s_stop, strand, ddp->id->next);
2054 	pbp->scores = ddp->scores;
2055 	ddp->scores = NULL;
2056 	ValNodeAddPointer(head, 0, (Pointer)pbp);
2057 
2058 }
2059 
2060 /****************************************************************
2061 *
2062 *	SortAlignByLocation()
2063 *	Sort the DenseDiag from Blastn and Blastp to ascending order
2064 *	of the locations
2065 *
2066 *****************************************************************/
SortAlignByLocation(SeqAnnotPtr blast_sap)2067 void SortAlignByLocation(SeqAnnotPtr blast_sap)
2068 {
2069 	SeqAlignPtr align;
2070 	DenseDiagPtr ddp;
2071 	StdSegPtr ssp;
2072 
2073 
2074 	if(blast_sap == NULL)
2075 		return;
2076 
2077 	align = blast_sap->data;
2078 	while(align)
2079 	{
2080 		switch(align->segtype)
2081 		{
2082 		case 1:
2083 			ddp = (DenseDiagPtr)align->segs;
2084 			align->segs = sort_dense_diag(ddp);
2085 			break;
2086 		case 2:
2087 			break;
2088 		case 3:
2089 			ssp = align->segs;
2090 			align->segs = (StdSegPtr)sort_std_seg(ssp);
2091 			break;
2092 		default:
2093 			break;
2094 		}
2095 		align = align->next;
2096 	}
2097 
2098 }
2099 
2100 
filter_redundent_sim_alignment(SeqAlignPtr PNTR align)2101 static SeqAlignPtr filter_redundent_sim_alignment(SeqAlignPtr PNTR align)
2102 {
2103 	SeqAlignPtr curr, prev, next;
2104 	DenseSegPtr dsp;
2105 	SeqLocPtr slp, h_slp, n_slp;
2106 	SeqIdPtr sip;
2107 	Int4 start, stop;
2108 	Uint1 strand;
2109 
2110 	h_slp = NULL;
2111 	for(curr = *align; curr != NULL; curr = curr->next)
2112 	{
2113 		dsp = curr->segs;
2114 		sip = dsp->ids->next;
2115 		get_align_ends(curr, sip, &start, &stop, &strand);
2116 		slp = SeqLocIntNew(start, stop, strand, sip);
2117 		ValNodeLink(&h_slp, slp);
2118 	}
2119 
2120 	for(slp = h_slp; slp != NULL; slp = slp->next)
2121 	{
2122 		if(slp->choice != 0)
2123 		{
2124 			for(n_slp = slp->next; n_slp != NULL; n_slp = n_slp->next)
2125 			{
2126 				if(n_slp->choice != 0)
2127 				{
2128 					if(SeqLocStrand(slp) == SeqLocStrand(n_slp))
2129 					{
2130 						switch(SeqLocCompare(slp, n_slp))
2131 						{
2132 							case SLC_A_EQ_B:
2133 							case SLC_B_IN_A:
2134 							case SLC_A_OVERLAP_B:
2135 								n_slp->choice = 0;
2136 								break;
2137 							case SLC_A_IN_B:
2138 								slp->choice = 0;
2139 								break;
2140 							default:
2141 								break;
2142 						}
2143 					}
2144 				}
2145 			}
2146 		}
2147 	}
2148 
2149 	curr = *align;
2150 	slp = h_slp;
2151 	prev = NULL;
2152 
2153 	while(curr && slp)
2154 	{
2155 		next = curr->next;
2156 		if(slp->choice == 0)
2157 		{
2158 			slp->choice = SEQLOC_INT;
2159 			if(prev == NULL)
2160 				*align = next;
2161 			else
2162 				prev->next = next;
2163 			curr->next = NULL;
2164 			SeqAlignFree(curr);
2165 		}
2166 		else
2167 			prev = curr;
2168 
2169 		curr = next;
2170 		slp = slp->next;
2171 	}
2172 
2173 	SeqLocSetFree(h_slp);
2174 	return (*align);
2175 }
2176 
2177 
2178 
2179 /****************************************************************
2180 *
2181 *	sim2_for_blast(blast_sap, query_id)
2182 *	run sim2 with the Seq-annot from blast
2183 *
2184 ****************************************************************/
sim_for_blast(SeqAnnotPtr blast_sap,SeqIdPtr query_id,Int4 which_sim)2185 SeqAnnotPtr  sim_for_blast(SeqAnnotPtr blast_sap, SeqIdPtr query_id, Int4 which_sim)
2186 {
2187 	SeqAlignPtr sim_align, align;
2188 	ValNodePtr sim_list = NULL, curr;
2189 	ParseBlastPtr pbp;
2190 	BioseqPtr mbsp, sbsp;
2191 	Int4 start, stop;
2192 	SeqIdPtr s_sip;
2193 	Uint1 strand;
2194 	SeqAnnotPtr annot;
2195 	DenseDiagPtr ddp;
2196 	SimPam sp;		/*parameters for the sim2 alignment*/
2197 	PSimPam psp;	/*parameters for the SIM alignment*/
2198 	Int4 limit = 200;	/*limit of sequence discrepancies used in SIM3*/
2199 	Int4 edge_len;	/*length for adding up to the edge*/
2200 	Boolean is_aa;
2201 	Char label[100];
2202 	Int4 K;
2203 	Int4 mbsplength;
2204 	DenseSegPtr dsp;
2205 /** BANDALIGN**/
2206 	ValNodePtr vnp;
2207 	MashPtr    msp;
2208 
2209 	if(blast_sap == NULL || blast_sap->data == NULL)
2210 		return NULL;
2211 	mbsp = BioseqLockById(query_id);
2212 	if(mbsp == NULL)
2213 		return blast_sap;
2214 	if(mbsp->mol == Seq_mol_aa)
2215 	{
2216 		edge_len = AA_EDGE_LEN;
2217 		DefaultPSimPam(&psp);
2218 		is_aa = TRUE;
2219 	}
2220 	else
2221 	{
2222 		edge_len = DNA_EDGE_LEN;
2223 		DefaultSimPam(&sp);
2224 		is_aa = FALSE;
2225 	}
2226 	mbsplength = mbsp->length;
2227 	BioseqUnlock(mbsp);
2228 
2229 	align = blast_sap->data;
2230 
2231         if (which_sim == RUN_BANDALGN)
2232 	{
2233 	   vnp = SeqLocListOfBioseqsFromSeqAlign (align);
2234 	   msp = MashNew (is_aa);
2235            align = SeqLocListToSeqAlign (vnp, PRGALIGNDEFAULT, msp);
2236 	   annot = SeqAnnotForSeqAlign (align);
2237 	   return annot;
2238 	}
2239 	while(align)
2240    	   {
2241    	   switch (align->segtype) {
2242    	   	case 1:
2243 			ddp = align->segs;
2244 			while(ddp)
2245 			{
2246 				check_sim_list(ddp, &sim_list);
2247 				ddp = ddp->next;
2248 			}
2249    	   		break;
2250    	   	case 2:
2251    	   		dsp = align->segs;
2252    	   		if (dsp) {
2253    	   		}
2254    	   		break;
2255    	   	case 3:
2256    	   		break;
2257    	   	default:
2258    	   		break;
2259    	   }
2260 		align = align->next;
2261 	}
2262 	sim_align = NULL;
2263 
2264 	for(curr = sim_list; curr !=NULL; curr = curr->next)
2265 	{
2266 		pbp = curr->data.ptrvalue;
2267 		s_sip = SeqLocId(pbp->s_loc);
2268 		sbsp = BioseqLockById(s_sip);
2269 		if(sbsp == NULL)
2270 		{
2271 			seqid_name(s_sip, label, FALSE, FALSE);
2272 			Message(MSG_POSTERR, "Fail to retrieve sequence %s", label);
2273 		}
2274 		else
2275 		{
2276 			strand = SeqLocStrand(pbp->s_loc);
2277 
2278 			start = SeqLocStart(pbp->s_loc);
2279 			stop = SeqLocStop(pbp->s_loc);
2280 			start = MAX(0, start - edge_len);
2281 			stop = MIN(stop+edge_len, sbsp->length-1);
2282 			update_seq_loc(start, stop, strand, pbp->s_loc);
2283 
2284 			start = SeqLocStart(pbp->m_loc);
2285 			stop = SeqLocStop(pbp->m_loc);
2286 			start = MAX(0, start - edge_len);
2287 			stop = MIN(stop+edge_len, mbsplength-1);
2288 			update_seq_loc(start, stop, Seq_strand_plus, pbp->m_loc);
2289 
2290 			if(pbp->K >1)
2291 			{
2292 				sp.cutoff = 30.0;
2293 				K = MIN((pbp->K + 10), (pbp->K * 2));
2294 			}
2295 			else
2296 			{
2297 				sp.cutoff = 0.0;
2298 				K = 1;
2299 			}
2300 
2301 			if(is_aa) {
2302 				align = CSIM(pbp->m_loc, pbp->s_loc, pbp->K, sp.cutoff, &psp);
2303 			}
2304 			else
2305 			{
2306 				/* if(pbp->K >1)
2307 					align = SIM2(pbp->m_loc, pbp->s_loc, pbp->K, &sp, NULL, OUTPUT_ALIGN, NULL);
2308 				else
2309 				{ */
2310 					switch( which_sim)
2311 					{
2312 						case RUN_SIM_1:
2313 							/* align = CSIM(pbp->m_loc, pbp->s_loc, pbp->K, sp.cutoff, &psp);
2314 							break; */
2315 
2316 						case RUN_SIM_2:
2317 							align = SIM2(pbp->m_loc, pbp->s_loc, K, &sp, NULL, OUTPUT_ALIGN, NULL);
2318 							if(K > 1 && align != NULL)
2319 								filter_redundent_sim_alignment(&align);
2320 							break;
2321 
2322 						case RUN_SIM_3:
2323 							align = SIM3ALN(pbp->m_loc, pbp->s_loc, limit);
2324 							if(align == NULL)
2325 								align = SIM2(pbp->m_loc, pbp->s_loc, pbp->K, &sp, NULL, OUTPUT_ALIGN, NULL);
2326 							break;
2327 						default:
2328 							align = NULL;
2329 					}
2330 				/* } */
2331 			}
2332 			if(align != NULL)
2333 			{
2334 				dsp = align->segs;
2335 				if(pbp->scores != NULL && dsp != NULL)
2336 				{
2337 					if(dsp->scores != NULL)
2338 						ScoreSetFree(dsp->scores);
2339 					dsp->scores = pbp->scores;
2340 					pbp->scores = NULL;
2341 				}
2342 				sim_align = link_align(align, sim_align);
2343 			}
2344 
2345 			BioseqUnlock(sbsp);
2346 		}
2347 
2348 		pbp->m_loc = SeqLocFree(pbp->m_loc);
2349 		pbp->s_loc = SeqLocFree(pbp->s_loc);
2350 		if(pbp->scores != NULL)
2351 			ScoreSetFree(pbp->scores);
2352 
2353 	}
2354 
2355 	ValNodeFreeData(sim_list);
2356 
2357 	if (sim_align == NULL) return NULL;
2358 
2359 	annot = SeqAnnotNew();
2360 	annot->type = 2;
2361 	annot->data = sim_align;
2362 	return annot;
2363 }
2364 
2365 /***********************************************************
2366 *
2367 *  Function for finding human repeat features
2368 *
2369 ************************************************************/
2370 
FindHumanRepeats(BioseqPtr bsp,Boolean show_prog_mon)2371 Boolean LIBCALL FindHumanRepeats (BioseqPtr bsp, Boolean show_prog_mon)
2372 {
2373  	MonitorPtr mon=NULL;
2374 	ValNodePtr alu_slp_list = NULL, alu_sep_list = NULL, ends_list = NULL;
2375 	Char buf[80];
2376 	Boolean got_some = FALSE;
2377 	ValNode tmp;
2378 	SeqInt si;
2379 
2380 	MemSet((Pointer) (&tmp), sizeof(ValNode), 0);
2381 	MemSet((Pointer) (&si), sizeof(SeqInt), 0);
2382 	tmp.choice = SEQLOC_INT;
2383 	tmp.data.ptrvalue = &si;
2384 	si.id = bsp->id;
2385 	si.from = 0;
2386 	si.to = bsp->length - 1;
2387 	si.strand = Seq_strand_both;
2388 
2389 	if (show_prog_mon)
2390 	{
2391 		mon = MonitorStrNewEx("Alu Features", 40, FALSE);
2392 		MonitorStrValue(mon, "Reading Repeats File");
2393 	}
2394 
2395     if (! FindPath("ncbi", "ncbi", "data", buf, sizeof (buf)))
2396 	{
2397 		MonitorFree(mon);
2398 		ErrPostEx(SEV_WARNING, 0, 0, "FindPath failed in repeat filter");
2399         return got_some;
2400 	}
2401 
2402     StringCat(buf, "humrep.fsa");
2403 
2404 	alu_sep_list = make_repeat_lib (buf, &alu_slp_list, FILE_IO);
2405 	if (show_prog_mon)
2406 		MonitorStrValue(mon, "Finding Repeats");
2407 	ends_list = filter_repeats_for_bigloc(&tmp, -1, -1, alu_slp_list);
2408 	if(ends_list != NULL)
2409 	{
2410 		got_some = TRUE;
2411 		if (show_prog_mon)
2412 			MonitorStrValue(mon, "Building repeat features");
2413 		SaveRepeats (bsp, ends_list);
2414 	}
2415 	free_alu_list (alu_sep_list, alu_slp_list);
2416 
2417 	MonitorFree(mon);
2418 
2419 	return got_some;
2420 }
2421 /***********************************************************
2422 *
2423 *  Desktop function for doing human repeat features
2424 *
2425 ************************************************************/
2426 
AluFeat(Pointer data)2427 Int2 LIBCALLBACK AluFeat (Pointer data)
2428 {
2429 	OMProcControlPtr ompcp;
2430 	ObjMgrProcPtr ompp;
2431 	Pointer my_userdata;
2432 	Boolean got_some;
2433 	BioseqPtr bsp;
2434 
2435 	ompcp = (OMProcControlPtr)data;
2436 	if (ompcp == NULL || ompcp->proc == NULL) return OM_MSG_RET_ERROR;
2437 	if (ompcp->input_itemtype != OBJ_BIOSEQ) return OM_MSG_RET_ERROR;
2438 	ompp = ompcp->proc;      /* this is your proceedure */
2439 	my_userdata = ompp->procdata;   /* this is the user data pointer you passed in, if any */
2440 
2441 	bsp = (BioseqPtr)(ompcp->input_data);
2442 
2443 	got_some = FindHumanRepeats (bsp, TRUE);
2444 
2445 	if (! got_some)
2446 		Message(MSG_OK, "No Repeat regions found.");
2447 	else
2448 		ObjMgrSendMsg(OM_MSG_UPDATE, ompcp->input_entityID, ompcp->input_itemID, ompcp->input_itemtype);
2449 
2450 	return OM_MSG_RET_OK;
2451 }
2452 
2453 
2454 
2455 /*functions to convert alignment to a feature*/
2456 
2457 /**************************************************************
2458 *
2459 *	convert the alignment to an interval
2460 *
2461 **************************************************************/
convert_align_to_interval(SeqAlignPtr align,Int4 min_gap_len)2462 static SeqFeatPtr convert_align_to_interval(SeqAlignPtr align, Int4 min_gap_len)
2463 {
2464 	DenseSegPtr dsp;
2465 	Int2 i;
2466 	SeqLocPtr slp, slp_head, slp_next;
2467 	Int4 s_start, start;
2468 	Uint1 strand;
2469 	Boolean load;
2470 	SeqFeatPtr new_sfp;
2471 	ImpFeatPtr ifp;
2472 
2473 	if(align->segtype != 2)
2474 		return NULL;
2475 
2476 	dsp = align->segs;
2477 	strand = dsp->strands[1];
2478 	slp_head = NULL;
2479 
2480 	for(i = 0; i<dsp->numseg; ++i)
2481 	{
2482 		start = dsp->starts[2*i];
2483 		s_start = dsp->starts[2*i+1];
2484 		load = FALSE;
2485 		if(start != -1)
2486 		{
2487 			if(s_start != -1)
2488 				load = TRUE;
2489 			else if(dsp->lens[i] <= min_gap_len)
2490 				load = TRUE;
2491 		}
2492 		if(load)
2493 		{
2494 			slp = SeqLocIntNew(start, start+dsp->lens[i]-1, strand, dsp->ids);
2495 			if(slp_head == NULL)
2496 				slp_head = slp;
2497 			else
2498 			{
2499 			if(strand == Seq_strand_minus)
2500 			{
2501 				slp_next = slp_head->next;
2502 				slp_head->next = NULL;
2503 				slp_head = SeqLocAdd(&slp, slp_head, TRUE, FALSE);
2504 				slp_head->next = slp_next;
2505 				slp_head = slp;
2506 			}
2507 			else
2508 				SeqLocAdd(&slp_head, slp, TRUE, FALSE);
2509 			}
2510 		}
2511 	}
2512 
2513 	new_sfp = SeqFeatNew();
2514 	new_sfp->data.choice = 8;
2515 	ifp = ImpFeatNew();
2516 	ifp->key = StringSave("hit");
2517 	new_sfp->data.value.ptrvalue = ifp;
2518 	new_sfp->location = SeqLocPackage(slp_head);
2519 	return new_sfp;
2520 }
2521 
load_align_to_interval(SeqAnnotPtr align_annot,Int4 min_gap_len,BioseqPtr bsp)2522 Boolean load_align_to_interval(SeqAnnotPtr align_annot, Int4 min_gap_len, BioseqPtr bsp)
2523 {
2524 	SeqFeatPtr sfp_head = NULL, prev, sfp;
2525 	SeqAlignPtr align;
2526 	ValNodePtr desc;
2527 	SeqAnnotPtr annot, p_annot;
2528 
2529 	prev = NULL;
2530 	while(align_annot)
2531 	{
2532 		if(align_annot->type == 2)
2533 		{
2534 			align = align_annot->data;
2535 			while(align)
2536 			{
2537 				sfp = convert_align_to_interval(align, min_gap_len);
2538 				if(sfp != NULL)
2539 				{
2540 					if(prev == NULL)
2541 						sfp_head = sfp;
2542 					else
2543 						prev->next =sfp;
2544 					prev = sfp;
2545 				}
2546 				align = align->next;
2547 			}
2548 		}
2549 
2550 		align_annot = align_annot->next;
2551 	}
2552 
2553 	if(sfp_head != NULL)
2554 	{
2555         	desc = ValNodeNew(NULL);
2556         	desc->choice = Annot_descr_name;
2557         	desc->data.ptrvalue = StringSave("HITS");
2558 
2559         	annot = SeqAnnotNew();
2560         	annot->type = 1;
2561         	annot->data = sfp_head;
2562         	annot->desc = desc;
2563 
2564 
2565         	if(bsp->annot == NULL)
2566                 	bsp->annot = annot;
2567         	else
2568         	{
2569                 	p_annot = bsp->annot;
2570                 	while(p_annot->next != NULL)
2571                         	p_annot = p_annot->next;
2572 			p_annot->next = annot;
2573 		}
2574 		return TRUE;
2575 	}
2576 	else
2577 		return FALSE;
2578 }
2579 
2580