1 /* alignval.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: alignval.c
27 *
28 * Author: Jian Ye, Colombe Chappey
29 *
30 * Version Creation Date: 6/3/99
31 *
32 * $Revision: 6.95 $
33 *
34 * File Description: To validate sequence alignment.
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44
45
46 #include <ncbi.h>
47 #include <seqmgr.h>
48 #include <objmgr.h>
49 #include <sequtil.h>
50 #include <sqnutils.h>
51 #include <satutil.h>
52 #include <salsap.h>
53 #include <txalign.h>
54 #include <salpacc.h>
55 #include <alignval.h>
56 #include <valid.h>
57 #include <alignmgr2.h>
58
59
60 Uint1 jybitnum[8]={0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
61
62 typedef struct saval {
63 Boolean message;
64 Boolean msg_success;
65 Boolean find_remote_bsp;
66 Boolean find_acc_bsp;
67 Boolean delete_salp;
68 Boolean delete_bsp;
69 Boolean retdel;
70 Boolean do_hist_assembly;
71 ValNodePtr ids;
72 Uint2 entityID;
73 Boolean dirty;
74 } SaVal, PNTR SaValPtr;
75
76 typedef struct JY_error_msg {
77 Uint1 level;/* corresponds to levels of ErrPostEx [none(0), info(1), war
78 n(2), error(3) and fatal(4)] */
79 CharPtr msg;
80 } JYErrorMsg, *JYErrorMsgPtr;
81
82 /******************************************************************
83 ***
84 *** Error Messaging
85 *** copies of the BLASt functions in blastpri.h
86 *** JYConstructErrorMessage = BlastConstructErrorMessage
87 *** JYErrorChainDestroy = BlastErrorChainDestroy
88 ***
89 ******************************************************************/
90
91 static ValNodePtr errorp = NULL;
92 #define BUFFER_LENGTH 512
93
94 static Uint2 AlignmentPercentIdentityEx (SeqAlignPtr salp, Boolean internal_gaps, Boolean internal_validation);
95
96 //LCOV_EXCL_START
JYConstructErrorMessage(CharPtr function,CharPtr message,Uint1 level,ValNodePtr PNTR vnpp)97 static ValNodePtr JYConstructErrorMessage (CharPtr function, CharPtr message, Uint1 level, ValNodePtr PNTR vnpp)
98 {
99 Char buffer[BUFFER_LENGTH];
100 CharPtr ptr;
101 JYErrorMsgPtr error_msg;
102
103 if (vnpp == NULL)
104 return NULL;
105
106 buffer[0] = NULLB;
107 ptr = buffer;
108 if (function != NULL)
109 {
110 sprintf(buffer, "%s: ", function);
111 ptr = buffer + StringLen(buffer);
112 }
113
114 if (message != NULL)
115 {
116 sprintf(ptr, "%s", message);
117 }
118
119 error_msg = (JYErrorMsgPtr) MemNew(sizeof(JYErrorMsg));
120 error_msg->msg = StringSave(buffer);
121 error_msg->level = level;
122
123 ValNodeAddPointer(vnpp, 0, error_msg);
124
125 return *vnpp;
126 }
127
JYErrorChainDestroy(ValNodePtr vnp)128 static ValNodePtr JYErrorChainDestroy (ValNodePtr vnp)
129
130 {
131 ValNodePtr start = vnp;
132 JYErrorMsgPtr error_msg;
133
134 while (vnp)
135 {
136 error_msg = (JYErrorMsgPtr) vnp->data.ptrvalue;
137 if (error_msg != NULL) {
138 MemFree(error_msg->msg);
139 }
140 vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
141 vnp = vnp->next;
142 }
143
144 ValNodeFree(start);
145
146 return NULL;
147 }
148 //LCOV_EXCL_STOP
149
150 /******************************************************************
151 Output error message according to code defined in alignval.h.
152 id refers to seqid of the sequence that causes the error
153 and idcontext refers to other sequences in the same segment.
154 Intvalue is used to indicate 1) the segment where the sequence
155 with error is, or 2) the segtype in case of segtype error.
156 Please note that not all errors report all three
157 parameters(id, idcontext, Intvalue)
158 ******************************************************************/
159
160 static Boolean useValErr = FALSE;
161 static Boolean useLockByID = FALSE;
162 static ValidStructPtr useVsp = NULL;
163
AlignValBioseqLockById(SeqIdPtr sid)164 static BioseqPtr AlignValBioseqLockById (SeqIdPtr sid)
165
166 {
167 Int4 old_sev;
168 BioseqPtr bsp = NULL;
169
170 if (useLockByID) {
171 old_sev = ErrSetMessageLevel (SEV_WARNING);
172 bsp = BioseqLockById (sid);
173 ErrSetMessageLevel ((ErrSev) old_sev);
174 } else {
175 bsp = BioseqFindCore (sid);
176 }
177 return bsp;
178 }
179
AlignValBioseqUnlock(BioseqPtr bsp)180 static Boolean AlignValBioseqUnlock (BioseqPtr bsp)
181
182 {
183 if (useLockByID) {
184 return BioseqUnlock (bsp);
185 } else {
186 return TRUE;
187 }
188 }
189
190 NLM_EXTERN void CDECL ValidErr VPROTO((ValidStructPtr vsp, int severity, int code1, int code2, const char *fmt, ...));
191
192 /*****************************************************************
193 * get the approximate sequence coordinate for an alignment segment
194 * sip == NULL -> get alignment coordinate
195 *****************************************************************/
valmsggetseqpos(SeqAlignPtr sap,Int4 segment,SeqIdPtr sip)196 static Int4 valmsggetseqpos(SeqAlignPtr sap, Int4 segment, SeqIdPtr sip)
197 {
198 Int4 c;
199 DenseDiagPtr ddp;
200 DenseSegPtr dsp;
201 Boolean found;
202 Int4 i;
203 Int4 j;
204 Int4 pos;
205 PackSegPtr psp;
206 Uint1Ptr seqpresence;
207 SeqIdPtr sip_tmp;
208 SeqLocPtr slp;
209 StdSegPtr ssp;
210
211 if (sap == NULL || sap->segs == NULL) {
212 return -1;
213 } else if (segment == 0) {
214 return 0;
215 }
216 if (sap->segtype == SAS_DENSEG)
217 {
218 dsp = (DenseSegPtr)sap->segs;
219 if (sip == NULL)
220 {
221 pos = 0;
222 for (c=0; c<segment; c++)
223 {
224 pos += dsp->lens[c];
225 }
226 return pos;
227 }
228 sip_tmp = dsp->ids;
229 i = 0;
230 found = FALSE;
231 while (!found && sip_tmp != NULL)
232 {
233 if (SeqIdComp(sip, sip_tmp) == SIC_YES)
234 found = TRUE;
235 else
236 {
237 sip_tmp = sip_tmp->next;
238 i++;
239 }
240 }
241 if (!found || i>dsp->dim || segment > dsp->numseg)
242 return -1;
243 pos = 0;
244 for (c=0; c<segment; c++)
245 {
246 if ((j = dsp->starts[(dsp->dim*c)+i])>0)
247 pos=j;
248 }
249 return pos;
250 } else if (sap->segtype == SAS_DENDIAG)
251 {
252 ddp = (DenseDiagPtr)sap->segs;
253 pos = 0;
254 for (c=0; c<segment; c++)
255 {
256 pos += ddp->len;
257 ddp = ddp->next;
258 if (ddp == NULL)
259 return -1;
260 }
261 if (sip == NULL)
262 return pos;
263 sip_tmp = ddp->id;
264 i = 0;
265 found = FALSE;
266 while (!found && sip_tmp != NULL)
267 {
268 if (SeqIdComp(sip, sip_tmp) == SIC_YES)
269 found = TRUE;
270 else
271 {
272 sip_tmp = sip_tmp->next;
273 i++;
274 }
275 }
276 if (!found || i>ddp->dim)
277 return -1;
278 return (ddp->starts[i]);
279 } else if (sap->segtype == SAS_STD)
280 {
281 ssp = (StdSegPtr)(sap->segs);
282 pos = 0;
283 for (c=0; c<segment-1; c++)
284 {
285 pos += SeqLocLen(ssp->loc);
286 ssp = ssp->next;
287 if (ssp == NULL)
288 return -1;
289 }
290 if (sip == NULL)
291 return pos;
292 slp = ssp->loc;
293 found = FALSE;
294 while (!found && slp!=NULL)
295 {
296 sip_tmp = SeqLocId(slp);
297 if (SeqIdComp(sip, sip_tmp) == SIC_YES)
298 found = TRUE;
299 else
300 slp = slp->next;
301 }
302 if (!found)
303 return -1;
304 return (SeqLocStart(slp));
305 } else if (sap->segtype == SAS_PACKED)
306 {
307 psp = (PackSegPtr)(sap->segs);
308 if (segment > psp->numseg)
309 return -1;
310 if (sip == NULL)
311 {
312 pos = 0;
313 for (c=0; c<segment; c++)
314 {
315 pos += psp->lens[c];
316 }
317 return pos;
318 }
319 sip_tmp = psp->ids;
320 i = 0;
321 found = FALSE;
322 while (!found && sip_tmp != NULL)
323 {
324 if (SeqIdComp(sip, sip_tmp) == SIC_YES)
325 found = TRUE;
326 else
327 {
328 sip_tmp = sip_tmp->next;
329 i++;
330 }
331 }
332 if (!found || i>psp->dim)
333 return -1;
334 pos = 0;
335 seqpresence = NULL;
336 BSSeek(psp->present, 0, SEEK_SET);
337 seqpresence=MemNew(BSLen(psp->present));
338 if(!seqpresence)
339 return -1;
340 BSRead(psp->present, seqpresence, BSLen(psp->present));
341 for (c=0; c<segment; c++)
342 {
343 if (seqpresence[(c*psp->numseg+i)/8]&jybitnum[(c*psp->numseg+i)%8])
344 pos+=psp->lens[c];
345 }
346 return pos;
347 } else
348 return -1;
349 }
350
351
BioseqForAlignmentWork(SeqAlignPtr salp)352 static BioseqPtr BioseqForAlignmentWork (SeqAlignPtr salp)
353 {
354 Int4 row, num_rows;
355 BioseqPtr bsp = NULL;
356 SeqIdPtr sip;
357 DenseDiagPtr ddp;
358
359 /* NOTE - can't index DenseDiag chain during validation because we're examining the individual DenseDiags,
360 * and indexing converts it to DenseSegs.
361 */
362 if (salp->segtype == SAS_DENDIAG && salp->segs != NULL) {
363 ddp = (DenseDiagPtr) salp->segs;
364 while (bsp == NULL && ddp != NULL) {
365 for (sip = ddp->id; bsp == NULL && sip != NULL; sip = sip->next) {
366 bsp = BioseqFind (sip);
367 sip = sip->next;
368 }
369 ddp = ddp->next;
370 }
371 } else {
372 AlnMgr2IndexSingleChildSeqAlign(salp);
373 num_rows = AlnMgr2GetNumRows(salp);
374 for (row = 1; row <= num_rows && bsp == NULL; row++) {
375 sip = AlnMgr2GetNthSeqIdPtr(salp, row);
376 bsp = BioseqFind(sip);
377 }
378 }
379 return bsp;
380 }
381
BioseqForAlignment(SeqAlignPtr salp)382 static BioseqPtr BioseqForAlignment (SeqAlignPtr salp)
383 {
384 BioseqPtr bsp;
385 SeqEntryPtr oldscope;
386
387 /* first look locally to scope */
388 bsp = BioseqForAlignmentWork (salp);
389 if (bsp != NULL) return bsp;
390
391 /* otherwise temporarily clear scope */
392 oldscope = SeqEntrySetScope (NULL);
393 bsp = BioseqForAlignmentWork (salp);
394 SeqEntrySetScope (oldscope);
395
396 return bsp;
397 }
398
399
ValMessage(SeqAlignPtr salp,Int1 MessageCode,ErrSev errlevel,SeqIdPtr id,SeqIdPtr idcontext,Int4 Intvalue)400 static void ValMessage (SeqAlignPtr salp, Int1 MessageCode, ErrSev errlevel, SeqIdPtr id, SeqIdPtr idcontext , Int4 Intvalue)
401 {
402
403 Char buf[256],
404 buf3[64],
405 string1[64],
406 string2[552];
407 GatherContextPtr gcp;
408 Int4 pos;
409
410 string1[0] = '\0';
411 string2[0] = '\0';
412 SeqIdWrite(id, buf, PRINTID_FASTA_LONG, sizeof(buf)-1);
413 switch(MessageCode)
414 {
415 case Err_SeqId:
416 sprintf(string1, "SeqId");
417 sprintf(string2, "The sequence corresponding to SeqId %s could not be found.", buf);
418 break;
419
420 case Err_Strand_Rev:
421 pos = valmsggetseqpos(salp, Intvalue, id);
422 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
423 sprintf(string1, "Strand");
424 sprintf(string2, "The strand labels for SeqId %s are inconsistent across the alignment; the first inconsistent region is the %ld(th) region, near sequence position %ld, context %s", buf, (long) Intvalue, (long) pos, buf3);
425 break;
426
427 case Err_Denseg_Len_Start:
428 pos = valmsggetseqpos(salp, Intvalue, id);
429 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
430 sprintf(string1, "Start/Length");
431 sprintf(string2, "There is a problem with sequence %s, in segment %ld (near sequence position %ld), context %s: the segment is too long or short or the next segment has an incorrect start position", buf, (long) Intvalue, (long) pos, buf3);
432 break;
433
434 case Err_Start_Less_Than_Zero:
435 //LCOV_EXCL_START
436 //unreachable for ASN.1 valid for C++ Toolkit
437 pos = valmsggetseqpos(salp, Intvalue, id);
438 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
439 sprintf(string1, "Start");
440 sprintf(string2, "Start point is less than zero in segment %ld (near sequence position %ld) for sequence ID: %s in the context of %s", (long) Intvalue, (long) pos, buf, buf3);
441 break;
442 //LCOV_EXCL_STOP
443
444 case Err_Start_More_Than_Biolen:
445 pos = valmsggetseqpos(salp, Intvalue, id);
446 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
447 sprintf(string1, "Start");
448 sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
449 break;
450
451 case Err_End_Less_Than_Zero:
452 //LCOV_EXCL_START
453 //unreachable with valid ASN.1
454 pos = valmsggetseqpos(salp, Intvalue, id);
455 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
456 sprintf(string1, "Length");
457 sprintf(string2, "End point is less than zero in segment %ld (near position %d) for sequence ID: %s in the context of %s. This could be a formatting error", (long) Intvalue, (int) pos,buf, buf3);
458 break;
459 //LCOV_EXCL_STOP
460
461 case Err_End_More_Than_Biolen:
462 pos = valmsggetseqpos(salp, Intvalue, id);
463 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
464 sprintf(string1, "Length");
465 sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
466 break;
467
468 case Err_Len_Less_Than_Zero:
469 //LCOV_EXCL_START
470 //unreachable with valid ASN.1
471 pos = valmsggetseqpos(salp, Intvalue, id);
472 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
473 sprintf(string1, "Length");
474 sprintf(string2, "Segment length is less than zero in segment %ld (near sequence position %ld) for sequence ID: %s in the context of %s. Look for extra characters in this segment or flanking segments", (long) Intvalue, (long) pos, buf, buf3);
475 break;
476 //LCOV_EXCL_STOP
477
478 case Err_Len_More_Than_Biolen:
479 pos = valmsggetseqpos(salp, Intvalue, id);
480 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
481 sprintf(string1, "Length");
482 sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
483 break;
484
485 case Err_Sum_Len_Start:
486 pos = valmsggetseqpos(salp, Intvalue, id);
487 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
488 sprintf(string1, "Start");
489 sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
490 break;
491
492 case Err_SeqAlign_DimSeqId_Not_Match:
493 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
494 sprintf(string1, "SeqId");
495 sprintf(string2, "The Seqalign has more or fewer ids than the number of rows in the alignment (context %s). Look for possible formatting errors in the ids.", buf3);
496 break;
497
498 case Err_Segs_DimSeqId_Not_Match:
499 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
500 sprintf(string1, "SeqId");
501 sprintf(string2, "In segment %ld, there are more or fewer rows than there are seqids (context %s). Look for possible formatting errors in the ids.", (long) Intvalue, buf3);
502 break;
503
504 case Err_Fastalike:
505 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
506 sprintf(string1, "Fasta");
507 sprintf(string2, "This may be a fasta-like alignment for SeqId: %s in the context of %s", buf, buf3);
508 break;
509
510 case Err_Null_Segs:
511 //LCOV_EXCL_START
512 // unreachable for valid ASN.1
513 sprintf(string1, "Segs");
514 sprintf(string2, "This alignment is missing all segments. This is a non-correctable error -- look for serious formatting problems.");
515 break;
516 //LCOV_EXCL_STOP
517
518 case Err_Segment_Gap:
519 pos = valmsggetseqpos(salp, Intvalue, id);
520 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
521 sprintf(string1, "Segs");
522 sprintf(string2, "Segment %ld (near alignment position %ld) in the context of %s contains only gaps. Each segment must contain at least one actual sequence -- look for columns with all gaps and delete them.", (long) Intvalue + 1, (long) pos, buf3);
523 break;
524
525 case Err_Segs_Dim_One:
526 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
527 sprintf(string1, "Segs");
528 sprintf(string2, "Segment %ld apparently has only one sequence. Each portion of the alignment must have at least two sequences. context %s", (long) Intvalue, buf3);
529 break;
530
531 case Err_SeqAlign_Dim_One:
532 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
533 sprintf(string1, "Dim");
534 sprintf(string2, "This seqalign apparently has only one sequence. Each alignment must have at least two sequences. context %s", buf3);
535 break;
536
537 case Err_Segtype :
538 /* ignore new segtype warnings in genomic gpipe sequence */
539 if (useValErr && useVsp != NULL && useVsp->is_gpipe_in_sep && useVsp->bsp_genomic_in_sep) return;
540 sprintf(string1, "Segs");
541 sprintf(string2, "This alignment has an undefined or unsupported Seqalign segtype %ld", (long) Intvalue);
542 break;
543
544 case Err_Pcnt_ID :
545 sprintf(string1, "PercentIdentity");
546 sprintf(string2, "This alignment has a percent identity of %d%%", Intvalue);
547 break;
548
549 case Err_Short_Aln:
550 //LCOV_EXCL_START
551 //only call to only function to generate this error is commented out
552 sprintf(string1, "ShortAln");
553 sprintf(string2, "This alignment is shorter than at least one non-farpointer sequence.");
554 break;
555 //LCOV_EXCL_STOP
556
557 case Err_Unexpected_Alignment_Type:
558 sprintf(string1, "UnexpectedAlignmentType");
559 sprintf (string2, "This is not a DenseSeg alignment.");
560 break;
561
562 default:
563 break;
564 }
565 if (useValErr) {
566 if (salp != NULL && useVsp != NULL) {
567 gcp = useVsp->gcp;
568 if (gcp != NULL) {
569 gcp->entityID = salp->idx.entityID;
570 gcp->itemID = salp->idx.itemID;
571 gcp->thistype = salp->idx.itemtype;
572
573 useVsp->bsp = BioseqForAlignment(salp);
574 ValidErr (useVsp, errlevel, 6, MessageCode, "%s: %s", string1, string2);
575 }
576 }
577 return;
578 }
579 //LCOV_EXCL_START
580 if (StringLen(string1) > 0)
581 errorp = JYConstructErrorMessage (string1, string2, errlevel, &errorp);
582 //LCOV_EXCL_STOP
583 }
584
585
586 /******************************************************************
587 return the number of seqid
588 ******************************************************************/
CountSeqIdInSip(SeqIdPtr sip)589 static Int2 CountSeqIdInSip (SeqIdPtr sip)
590 {
591 Int2 numids=0;
592
593 while(sip)
594 {
595 numids++;
596 sip=sip->next;
597 }
598 return numids;
599 }
600
601 /*********************************************************/
delete_bioseqs(ValNodePtr ids,Uint2 entityID)602 static void delete_bioseqs (ValNodePtr ids, Uint2 entityID)
603 {
604 SeqEntryPtr sep_top;
605 SeqEntryPtr sep_del;
606 ValNodePtr vnp;
607 SeqIdPtr sip;
608 SeqLocPtr slp;
609 BioseqPtr bsp;
610 ObjMgrDataPtr omdptop;
611 ObjMgrData omdata;
612 Uint2 parenttype;
613 Pointer parentptr;
614
615 if (ids == NULL)
616 return;
617 sep_top = GetTopSeqEntryForEntityID (entityID);
618 SaveSeqEntryObjMgrData (sep_top, &omdptop, &omdata);
619 GetSeqEntryParent (sep_top, &parentptr, &parenttype);
620
621 vnp=ids;
622 while (vnp!=NULL)
623 {
624 sip = (SeqIdPtr) vnp->data.ptrvalue;
625 if (sip!=NULL) {
626 slp = (SeqLocPtr)ValNodeNew (NULL);
627 slp->choice = SEQLOC_WHOLE;
628 slp->data.ptrvalue = sip;
629 bsp = GetBioseqGivenSeqLoc (slp, entityID);
630 if (bsp!=NULL) {
631 sep_del=GetBestTopParentForData (entityID, bsp);
632 RemoveSeqEntryFromSeqEntry (sep_top, sep_del, FALSE);
633 }
634 slp->data.ptrvalue = NULL;
635 SeqLocFree (slp);
636 }
637 vnp=vnp->next;
638 }
639 SeqMgrLinkSeqEntry (sep_top, parenttype, parentptr);
640 RestoreSeqEntryObjMgrData (sep_top, omdptop, &omdata);
641 RenormalizeNucProtSets (sep_top, TRUE);
642
643 for (vnp=ids; vnp!=NULL; vnp=vnp->next) {
644 SeqIdFree ((SeqIdPtr) vnp->data.ptrvalue);
645 vnp->data.ptrvalue = NULL;
646 }
647 ValNodeFree (vnp);
648 return;
649 }
650
651
652 /******************************************************************
653 validate a SeqId
654 ******************************************************************/
ValidateSeqId(SeqIdPtr sip,SeqAlignPtr salp)655 static void ValidateSeqId (SeqIdPtr sip, SeqAlignPtr salp)
656 {
657 SeqIdPtr siptemp=NULL, sipnext;
658 BioseqPtr bsp=NULL;
659
660 for(siptemp=sip; siptemp!=NULL; siptemp=siptemp->next)
661 {
662 /*
663 bsp = AlignValBioseqLockById(siptemp);
664 if(!bsp)
665 ValMessage (salp, Err_SeqId, SEV_ERROR, siptemp, NULL, 0);
666 else
667 AlignValBioseqUnlockById(siptemp);
668 */
669 sipnext = siptemp->next;
670 siptemp->next = NULL;
671 bsp = BioseqFindCore (siptemp);
672 if (bsp == NULL && siptemp->choice == SEQID_LOCAL) {
673 ValMessage (salp, Err_SeqId, SEV_ERROR, siptemp, NULL, 0);
674 }
675 siptemp->next = sipnext;
676 }
677 return;
678 }
679
680 /******************************************************************
681 return seqid for each seg.
682 Note that a newly created seqid chain is returned for stdseg
683 and you need to free the memory after you use it in this case
684 ******************************************************************/
SeqIdInAlignSegs(Pointer segs,Uint1 segtype,SeqAlignPtr salp)685 static SeqIdPtr SeqIdInAlignSegs(Pointer segs, Uint1 segtype, SeqAlignPtr salp)
686 {
687
688 SeqIdPtr sip=NULL;
689 StdSegPtr ssp;
690 DenseDiagPtr ddp;
691 DenseSegPtr dsp;
692 PackSegPtr psp;
693 SeqLocPtr slp=NULL, slptemp;
694
695 if(!segs)
696 {
697 //LCOV_EXCL_START
698 //unreachable for valid ASN.1
699 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
700 return NULL;
701 //LCOV_EXCL_STOP
702 }
703 if(segtype==1)
704 { /* DenseDiag */
705
706 ddp=(DenseDiagPtr)segs;
707 sip=ddp->id;
708 }
709 else if (segtype==2)
710 { /* DenseSeg */
711
712 dsp = (DenseSegPtr) segs;
713 sip=dsp->ids;
714 }
715 else if (segtype==3)
716 { /* StdSeg */
717
718 ssp = (StdSegPtr)segs;
719 slp = ssp->loc;
720 /*make a new linked list of SeqId*/
721 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
722 AddSeqId(&sip, SeqLocId(slptemp));
723
724 }
725 else if(segtype==4)
726 { /* Packed Seg. Optimal for editing alignments */
727
728 psp = (PackSegPtr)segs;
729 if (psp!=NULL)
730 sip = psp->ids;
731 }
732 return sip;
733 }
734
735
736 /******************************************************************
737 validate SeqId in sequence alignment
738 ******************************************************************/
ValidateSeqIdInSeqAlign(SeqAlignPtr salp)739 static void ValidateSeqIdInSeqAlign (SeqAlignPtr salp)
740 {
741 SeqIdPtr sip=NULL;
742 Pointer segptr=NULL;
743 DenseDiagPtr ddp=NULL, ddptemp;
744 StdSegPtr ssp=NULL, ssptemp;
745
746
747 if(salp)
748 {
749 segptr=salp->segs;
750 if (!segptr) {
751 //LCOV_EXCL_START
752 // unreachable for valid ASN.1
753 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
754 //LCOV_EXCL_STOP
755 }
756 else
757 {
758
759 /*densediag */
760 if(salp->segtype==1)
761 {
762 /*cast to appropriate pointer*/
763 ddp=(DenseDiagPtr)segptr;
764 for(ddptemp=ddp; ddptemp!=NULL; ddptemp=ddptemp->next)
765 {
766
767 sip=SeqIdInAlignSegs((Pointer)ddptemp, salp->segtype, salp);
768 ValidateSeqId(sip, salp);
769 }
770 }
771
772 /*Stdseg*/
773 else if(salp->segtype==3)
774 {
775 /*cast to appropriate pointer*/
776 ssp=(StdSegPtr)segptr;
777 for(ssptemp=ssp; ssptemp!=NULL; ssptemp=ssptemp->next)
778 {
779
780 sip=SeqIdInAlignSegs((Pointer)ssptemp, salp->segtype, salp);
781 ValidateSeqId(sip, salp);
782 /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
783 SeqIdSetFree(sip);
784 }
785 }
786
787 /*Denseseg, Packseg*/
788 else if(salp->segtype==2||salp->segtype==4)
789 {
790
791 sip=SeqIdInAlignSegs(segptr, salp->segtype, salp);
792 ValidateSeqId(sip, salp);
793 }
794 }
795 }
796 }
797
798 /******************************************************************
799 return true if two sip are the same, false otherwise.
800 Also return false if there is error in sip
801 ******************************************************************/
SeqIdCmp(SeqIdPtr sip1,SeqIdPtr sip2)802 static Boolean SeqIdCmp (SeqIdPtr sip1, SeqIdPtr sip2)
803 {
804 Char buf1[256], buf2[256];
805
806 if(!sip1||!sip2)
807 return FALSE;
808
809 SeqIdWrite(sip1, buf1, PRINTID_FASTA_LONG, 255);
810 SeqIdWrite(sip2, buf2, PRINTID_FASTA_LONG, 255);
811 return(!StringCmp(buf1, buf2));
812
813 }
814
815
816 /******************************************************************
817 return the strand for a seqloc with seqid=sip in a stdseg.
818 Note, it returns 255 if null sip or ssp
819 ******************************************************************/
SeqLocStrandForSipInStdSeg(SeqIdPtr sip,StdSegPtr ssp,SeqAlignPtr salp)820 static Uint1 SeqLocStrandForSipInStdSeg (SeqIdPtr sip, StdSegPtr ssp, SeqAlignPtr salp)
821 {
822 SeqLocPtr slp, slptemp;
823 Uint1 strand=0;
824
825 if(!sip||!ssp)
826 return (255);
827
828 slp=ssp->loc;
829 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
830 {
831 if(SeqIdCmp(sip, SeqLocId(slptemp)))
832 {
833 strand=SeqLocStrand(slptemp);
834 break;
835 }
836 }
837 return strand;
838 }
839
840
841 //LCOV_EXCL_START
842 //code for this error does not actually work as described in comments
843 /******************************************************************
844 check if the strand is consistent in Stdseg
845 ******************************************************************/
ValidateStrandInStdSeg(StdSegPtr ssp,SeqAlignPtr salp)846 static void ValidateStrandInStdSeg(StdSegPtr ssp, SeqAlignPtr salp)
847 {
848 SeqIdPtr sip=NULL, sip_inseg=NULL;
849 Uint1 strand1=0, strand2=0;
850 StdSegPtr ssptemp, ssptemp2, ssptemp3;
851 SeqLocPtr slp, slptemp;
852 ValNodePtr FinishedSip=NULL, temp;
853 Boolean CheckedStatus;
854 Int4 start_numseg=0, end_numseg=0;
855
856 if (!ssp) {
857 // unreachable for valid ASN.1
858 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
859 }
860 else
861 for(ssptemp=ssp; ssptemp!=NULL; ssptemp=ssptemp->next)
862 {
863 sip_inseg=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
864 start_numseg++;
865 slp=ssptemp->loc;
866 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
867 {
868
869 CheckedStatus=FALSE;
870 sip=SeqLocId(slptemp);
871 if(sip)
872 {
873 /*if a seqloc represented by a sip has been checked, set the checkedstatus flag to true so it will not be checked again*/
874 for(temp=FinishedSip; temp!=NULL; temp=temp->next)
875 {
876 if(SeqIdCmp(sip, temp->data.ptrvalue))
877 {
878 CheckedStatus=TRUE;
879 break;
880 }
881 }
882 /*seqloc not checked yet*/
883 if(!CheckedStatus)
884 {
885
886 /*keep a record of checked sip*/
887 ValNodeAddPointer(&FinishedSip, 0, sip);
888 end_numseg=start_numseg;
889 /*go through all segs to get at least two strand, if any, for this seqloc*/
890 for(ssptemp2=ssptemp; ssptemp2!=NULL; ssptemp2=ssptemp2->next, end_numseg++)
891 {
892 /*get the first defined strand */
893 strand1=SeqLocStrandForSipInStdSeg(sip, ssptemp2, salp);
894
895 if(strand1!=0&&strand1!=255)
896 {
897 ssptemp2=ssptemp2->next;
898 break;
899 }
900
901 }
902
903 if(strand1!=0&&strand1!=255)
904 /*continue to get next strand */
905 for(ssptemp3=ssptemp2; ssptemp3!=NULL; ssptemp3=ssptemp3->next, end_numseg++)
906 {
907 strand2=SeqLocStrandForSipInStdSeg(sip, ssptemp3, salp);
908 if(strand2==0||strand2==255)
909 continue;
910
911 if(strand2!=0&&strand2!=255)
912 /*strand should be same for a given seq*/
913 if(strand1!=strand2)
914
915 ValMessage (salp, Err_Strand_Rev, SEV_ERROR, sip, sip_inseg, end_numseg+1);
916 }
917 }
918 }
919 }
920 SeqIdSetFree(sip_inseg);
921
922 }
923
924 ValNodeFree(FinishedSip);
925 }
926 //LCOV_EXCL_STOP
927
928
929 /******************************************************************
930 check if the strand is consistent in Denseseg
931 ******************************************************************/
ValidateStrandInDenseSeg(Pointer segs,Uint1 segtype,SeqAlignPtr salp)932 static void ValidateStrandInDenseSeg(Pointer segs, Uint1 segtype, SeqAlignPtr salp)
933 {
934 DenseSegPtr dsp=NULL;
935 Int4 numseg, aligndim, dimnumseg, i, j, m;
936 SeqIdPtr sip=NULL, siptemp;
937 Uint1 strand1=0, strand2=0;
938 Uint1Ptr strandptr=NULL;
939
940 if(!segs)
941 {
942 //LCOV_EXCL_START
943 // unreachable for valid ASN.1
944 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
945 //LCOV_EXCL_STOP
946 }
947 else if(segtype==2)
948 {
949 dsp=(DenseSegPtr)segs;
950 strandptr=dsp->strands;
951 sip=dsp->ids;
952 numseg=dsp->numseg;
953 aligndim=dsp->dim;
954
955 dimnumseg=numseg*aligndim;
956 if(strandptr)
957 {
958 /*go through id for each alignment sequence*/
959 for(j=0; j<aligndim; j++)
960 {
961 /* first strand value for each sequence*/
962 strand1=strandptr[j];
963 /* go through all strand values for each sequence*/
964 for(i=j+aligndim; i<dimnumseg; i=i+aligndim)
965 {
966 strand2=strandptr[i];
967
968 if(strand1==0||strand1==255)
969 {
970 strand1=strand2;
971 continue;
972 }
973
974 /*skip undefined strand*/
975 if(strand2!=0&&strand2!=255)
976 {
977 /*strand should be same for a given seq*/
978 if(strand1!=strand2)
979 {
980 /*find current seqid*/
981
982 siptemp=sip;
983 for(m=0; m<j&&siptemp!=NULL; m++)
984 {
985 siptemp=siptemp->next;
986 }
987 ValMessage (salp, Err_Strand_Rev, SEV_ERROR, siptemp, sip, i/aligndim+1);
988 }
989 }
990 }
991 }
992 }
993 }
994 }
995
996
997
998
999 /******************************************************************
1000 check if the strand is consistent in SeqAlignment of global
1001 or partial type
1002 ******************************************************************/
ValidateStrandinSeqAlign(SeqAlignPtr salp)1003 static void ValidateStrandinSeqAlign(SeqAlignPtr salp)
1004 {
1005 StdSegPtr ssp=NULL ;
1006
1007 if(salp)
1008 {
1009
1010 /*Strands needs to be validated in case of global or partial alignment*/
1011
1012 /*denseseg*/
1013 if(salp->segtype==2)
1014
1015 ValidateStrandInDenseSeg(salp->segs, salp->segtype, salp);
1016
1017 /*stdseg*/
1018 else if(salp->segtype==3)
1019 {
1020 ssp=(StdSegPtr)salp->segs;
1021 ValidateStrandInStdSeg(ssp, salp);
1022 }
1023 }
1024 }
1025
1026
1027
1028 /******************************************************************
1029 Make sure that, in Densediag alignment, segment length and
1030 start point is not less than zero, and segment length is not greater
1031 than Bioseq length
1032 ******************************************************************/
ValidateSeqlengthInDenseDiag(DenseDiagPtr ddp,SeqAlignPtr salp)1033 static void ValidateSeqlengthInDenseDiag (DenseDiagPtr ddp, SeqAlignPtr salp)
1034 {
1035 Int4Ptr stptr=NULL;
1036 DenseDiagPtr ddptemp;
1037 Int2 numseg, i;
1038 SeqIdPtr sip=NULL, siptemp;
1039 Int4 bslen;
1040 BioseqPtr bsp=NULL;
1041
1042
1043 if (!ddp){
1044 //LCOV_EXCL_START
1045 // unreachable for valid ASN.1
1046 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1047 //LCOV_EXCL_STOP
1048 }
1049 else
1050 {
1051 for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1052 {
1053 sip=ddp->id;
1054 stptr=ddptemp->starts;
1055
1056 if(stptr)
1057 {
1058 for(i=0, siptemp=sip; i<ddptemp->dim; i++, siptemp=siptemp->next)
1059 {
1060 bsp=AlignValBioseqLockById(siptemp);
1061 if(bsp)
1062 {
1063 bslen=bsp->length;
1064 AlignValBioseqUnlock (bsp);
1065 /*verify start*/
1066 if (stptr[i] < 0) {
1067 //LCOV_EXCL_START
1068 //unreachable with valid ASN.1
1069 ValMessage(salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip, numseg);
1070 //LCOV_EXCL_STOP
1071 }
1072 if(stptr[i]>=bslen)
1073 ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg);
1074
1075 /*verify length*/
1076
1077 if (ddptemp->len<0) {
1078 //LCOV_EXCL_START
1079 //unreachable with valid ASN.1
1080 ValMessage(salp, Err_Len_Less_Than_Zero, SEV_ERROR, siptemp, sip, numseg);
1081 //LCOV_EXCL_STOP
1082 }
1083
1084 if(ddptemp->len+stptr[i]>bslen)
1085 ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , numseg);
1086 }
1087 }
1088 }
1089 }
1090 }
1091 }
1092
1093
1094 /******************************************************************
1095 return a new copy of len array in reversed order
1096 ******************************************************************/
GetReverseLength(Int2 numseg,Int4Ptr lenptr)1097 static Int4Ptr GetReverseLength (Int2 numseg, Int4Ptr lenptr)
1098 {
1099 Int4Ptr lenptrtemp=NULL;
1100 Int2 p;
1101
1102 if(!lenptr)
1103 return NULL;
1104
1105 lenptrtemp=(Int4Ptr)MemNew(numseg*sizeof(Int4Ptr));
1106 if(!lenptrtemp)
1107 {
1108 ErrPostEx (SEV_ERROR, 0,0, "Warning:insufficient memory");
1109 return NULL;
1110 }
1111 for(p=0; p<numseg; p++)
1112 lenptrtemp[p]=lenptr[numseg-1-p];
1113 return lenptrtemp;
1114
1115 }
1116
1117 /******************************************************************
1118 return a new copy of start array in reversed "numseg" order .
1119 Note that the relative position of starts in each numseg has not changed.
1120 Example: original length={0, 0, 10, -1, 30, 10}, numseg=3,
1121 lens={10, 20, 40}, the reversed length={30, 10, 10, -1, 0, 0}
1122 ******************************************************************/
GetReverseStart(Int2 numseg,Int2 dim,Int4Ptr stptr)1123 static Int4Ptr GetReverseStart(Int2 numseg, Int2 dim, Int4Ptr stptr)
1124 {
1125 Int4Ptr stptrtemp=NULL;
1126 Int2 p, q;
1127
1128 if(!stptr)
1129 return NULL;
1130
1131 stptrtemp=(Int4Ptr)MemNew(numseg*dim*sizeof(Int4Ptr));
1132 if(!stptrtemp)
1133 {
1134 ErrPostEx (SEV_ERROR, 0,0, "Warning:insufficient memory");
1135 return NULL;
1136 }
1137 for(p=0; p<numseg; p++)
1138 for(q=0; q<dim; q++)
1139 stptrtemp[q+p*dim]=stptr[q+(numseg-1-p)*dim];
1140
1141 return stptrtemp;
1142 }
1143
1144
1145
1146 /******************************************************************
1147 Make sure that, in Denseseg alignment, segment length and
1148 start point agrees each other and the sum of segment length
1149 is not greater than Bioseq length
1150 ******************************************************************/
ValidateSeqlengthInDenseSeg(DenseSegPtr dsp,SeqAlignPtr salp)1151 static void ValidateSeqlengthInDenseSeg (DenseSegPtr dsp, SeqAlignPtr salp)
1152 {
1153
1154 Int4Ptr lenptr=NULL, stptr=NULL, lenptrtemp=NULL, stptrtemp=NULL, lenptrtemp2=NULL, stptrtemp2=NULL;
1155
1156 Int2 numseg, aligndim, i, j;
1157 SeqIdPtr sip=NULL, siptemp;
1158 Int4 bslen = 0;
1159 BioseqPtr bsp=NULL;
1160
1161 if (!dsp) {
1162 //LCOV_EXCL_START
1163 // unreachable for valid ASN.1
1164 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1165 //LCOV_EXCL_STOP
1166 }
1167 else
1168 {
1169 numseg=dsp->numseg;
1170 aligndim=dsp->dim;
1171
1172 stptr=dsp->starts;
1173 lenptr=dsp->lens;
1174 sip=dsp->ids;
1175
1176 if(stptr==NULL||lenptr==NULL)
1177 return;
1178
1179
1180 /*go through each sequence*/
1181 for(j=0, siptemp=sip; j<aligndim&&siptemp; j++, siptemp=siptemp->next)
1182 {
1183
1184 lenptrtemp=lenptr;
1185 stptrtemp=stptr;
1186 /*if on minus strand, use reversed length and start array*/
1187 if(dsp->strands)
1188 {
1189 if(dsp->strands[j]==Seq_strand_minus)
1190 {
1191 if(!lenptrtemp2&&!stptrtemp2)
1192 {
1193 lenptrtemp2= GetReverseLength (numseg, lenptr);
1194 if (lenptrtemp2==NULL)
1195 return;
1196 stptrtemp2= GetReverseStart (numseg, aligndim, stptr);
1197 if (stptrtemp2==NULL)
1198 return;
1199 }
1200 lenptrtemp=lenptrtemp2;
1201 stptrtemp=stptrtemp2;
1202 }
1203 }
1204
1205 bsp=AlignValBioseqLockById(siptemp);
1206 if(bsp!=NULL)
1207 {
1208 bslen=bsp->length;
1209 AlignValBioseqUnlock (bsp);
1210 }
1211
1212 /*go through each segment for a given sequence*/
1213 for(i=0; i<numseg; i++)
1214 {
1215
1216 /*no need to verify if segment is not present*/
1217 if(stptrtemp[j+i*aligndim]!=-1)
1218 {
1219
1220 /*length plus start should be equal to next start*/
1221 /*check a start if it's not the last one and the next start is not -1*/
1222 if(i!=numseg-1&&stptrtemp[j+(i+1)*aligndim]!=-1)
1223 {
1224
1225 if(stptrtemp[j+i*aligndim]+lenptrtemp[i]!=stptrtemp[j+(i+1)*aligndim])
1226 {
1227 if (dsp->strands)
1228 {
1229 if(dsp->strands[j]==2)
1230 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , numseg-i);
1231 else
1232 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1233 }
1234 else
1235 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1236 }
1237 }
1238 /*check a start if it's not the last one and the next start is -1*/
1239 else if (i!=numseg-1&&stptrtemp[j+(i+1)*aligndim]==-1)
1240 {
1241 Int4 k=i+1;
1242 /*find the next start that is not last and not -1*/
1243 while(k<numseg&&stptrtemp[j+k*aligndim]==-1)
1244 k++;
1245
1246 /*length plus start should be equal to the closest next start that is not -1*/
1247
1248 if(k<numseg&&stptrtemp[j+i*aligndim]+lenptrtemp[i]!=stptrtemp[j+k*aligndim])
1249 {
1250 if (dsp->strands)
1251 {
1252 if(dsp->strands[j]==2)
1253 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , numseg-i);
1254 else
1255 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1256 }
1257 else
1258 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1259 }
1260 }
1261
1262
1263 /*make sure the start plus segment does not exceed total bioseq length*/
1264 if(bsp!=NULL)
1265 {
1266
1267 if(stptrtemp[j+i*aligndim]+lenptrtemp[i]>bslen)
1268 if (dsp->strands)
1269 {
1270 if(dsp->strands[j]==2)
1271 ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , numseg-1);
1272 else
1273 ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1274 }
1275 else
1276 ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1277 }
1278
1279 }
1280
1281 }
1282 }
1283 }
1284
1285
1286 MemFree(lenptrtemp2);
1287 MemFree(stptrtemp2);
1288
1289
1290 }
1291
1292 /******************************************************************
1293 Make sure that, in Seqloc of a Stdseg alignment,
1294 end point, start point and length are not less than zero,
1295 and are not greater than Bioseq length
1296 ******************************************************************/
ValidateSeqlengthInStdSeg(StdSegPtr ssp,SeqAlignPtr salp)1297 static void ValidateSeqlengthInStdSeg (StdSegPtr ssp, SeqAlignPtr salp)
1298 {
1299 StdSegPtr ssptemp;
1300 Int2 numseg;
1301 SeqIdPtr sip=NULL, siptemp;
1302 Int4 start, end, length, bslen;
1303 BioseqPtr bsp=NULL;
1304 SeqLocPtr slp=NULL, slptemp;
1305
1306 if(!ssp) {
1307 //LCOV_EXCL_START
1308 // unreachable for valid ASN.1
1309 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1310 //LCOV_EXCL_STOP
1311 } else {
1312 for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++) {
1313 /*get all seqid in current segment*/
1314 sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
1315 slp=ssptemp->loc;
1316 if(slp==NULL)
1317 return;
1318 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next) {
1319 siptemp=SeqLocId(slptemp);
1320 start=SeqLocStart(slptemp);
1321 end=SeqLocStop(slptemp);
1322 length=SeqLocLen(slptemp);
1323
1324 bsp=AlignValBioseqLockById(siptemp);
1325 if(bsp) {
1326 bslen=bsp->length;
1327 AlignValBioseqUnlock (bsp);
1328
1329 /*verify start*/
1330 if(start<0) {
1331 //LCOV_EXCL_START
1332 //unreachable with valid ASN.1
1333 ValMessage (salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1);
1334 //LCOV_EXCL_STOP
1335 }
1336
1337 if(start>bslen-1) {
1338 ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1);
1339 }
1340
1341 /*verify end*/
1342 if(end<0) {
1343 //LCOV_EXCL_START
1344 //unreachable with valid ASN.1
1345 ValMessage (salp, Err_End_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1);
1346 //LCOV_EXCL_STOP
1347 }
1348 if(end>bslen-1) {
1349 ValMessage (salp, Err_End_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1);
1350 }
1351
1352 /*verify length*/
1353 if(length<0) {
1354 //LCOV_EXCL_START
1355 //unreachable with valid ASN.1
1356 ValMessage (salp, Err_Len_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1);
1357 //LCOV_EXCL_STOP
1358 }
1359
1360 if(length>bslen) {
1361 ValMessage (salp, Err_Len_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1);
1362 }
1363
1364 }
1365 }
1366 /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1367 SeqIdSetFree(sip);
1368 }
1369 }
1370 }
1371
1372 /******************************************************************
1373 validate the start and segment length in packseg
1374 ******************************************************************/
ValidateSeqlengthInPackSeg(PackSegPtr psp,SeqAlignPtr salp)1375 static void ValidateSeqlengthInPackSeg (PackSegPtr psp, SeqAlignPtr salp)
1376 {
1377 Uint1Ptr seqpresence=NULL;
1378 Int2 numseg, aligndim, i, j;
1379 SeqIdPtr sip=NULL, siptemp;
1380 Int4Ptr stptr=NULL, lenptr=NULL;
1381 BioseqPtr bsp=NULL;
1382 Int4 bslen, seg_start;
1383
1384 if (!psp){
1385 //LCOV_EXCL_START
1386 // unreachable for valid ASN.1
1387 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1388 //LCOV_EXCL_STOP
1389 }
1390 else
1391 {
1392 numseg=psp->numseg;
1393 aligndim=psp->dim;
1394 sip=psp->ids;
1395 stptr=psp->starts;
1396 lenptr=psp->lens;
1397
1398 if(stptr&&lenptr)
1399 {
1400 if(psp->present)
1401 {
1402 BSSeek(psp->present, 0, SEEK_SET);
1403 seqpresence=MemNew(BSLen(psp->present));
1404 if(!seqpresence)
1405 {
1406
1407 ErrPostEx (SEV_ERROR, 0,0, "Warning:insufficient memory");
1408 return;
1409
1410 }
1411 BSRead(psp->present, seqpresence, BSLen(psp->present));
1412 /*go through each sequence*/
1413 for(j=0, siptemp=sip; j<aligndim && siptemp != NULL; siptemp=siptemp->next, j++)
1414 {
1415 bsp=AlignValBioseqLockById(siptemp);
1416 if(bsp)
1417 {
1418 bslen=bsp->length;
1419 AlignValBioseqUnlock (bsp);
1420 seg_start=stptr[j];
1421 /*check start*/
1422 if (seg_start < 0) {
1423 //LCOV_EXCL_START
1424 ValMessage(salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip, 0);
1425 //LCOV_EXCL_STOP
1426 }
1427 if(seg_start>=bslen)
1428 ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , 0);
1429
1430 /*go through each segment*/
1431 for(i=0; i<numseg; i++)
1432 {
1433 /*if this segment is present*/
1434 if(seqpresence[(i*aligndim+j)/8]&jybitnum[(i*aligndim+j)%8])
1435 {
1436 /*check start plus seg length*/
1437 seg_start=seg_start+lenptr[i];
1438 if(seg_start>bslen)
1439 ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip, numseg);
1440 }
1441 }
1442 }
1443 }
1444 }
1445 }
1446 }
1447 MemFree(seqpresence);
1448 }
1449
1450 /******************************************************************
1451 check segment length, start and end point in Denseseg, Densediag and Stdseg
1452 ******************************************************************/
ValidateSeqlengthinSeqAlign(SeqAlignPtr salp)1453 static void ValidateSeqlengthinSeqAlign (SeqAlignPtr salp)
1454 {
1455
1456 if (salp)
1457 {
1458 if(salp->segtype==1)
1459 ValidateSeqlengthInDenseDiag ((DenseDiagPtr)salp->segs, salp);
1460 else if(salp->segtype==2)
1461 ValidateSeqlengthInDenseSeg ((DenseSegPtr)salp->segs, salp);
1462 else if(salp->segtype==3)
1463 ValidateSeqlengthInStdSeg ((StdSegPtr)salp->segs, salp);
1464 else if(salp->segtype==4)
1465 ValidateSeqlengthInPackSeg ((PackSegPtr)salp->segs, salp);
1466 }
1467 }
1468
1469 /******************************************************************
1470 check if # of seqid matches the dimensions, and
1471 if there is only one seqeuence in seqalign
1472 ******************************************************************/
ValidateDimSeqIds(SeqAlignPtr salp)1473 static void ValidateDimSeqIds (SeqAlignPtr salp)
1474 {
1475 SeqIdPtr sip=NULL;
1476 DenseDiagPtr ddp=NULL, ddptemp;
1477 StdSegPtr ssp=NULL, ssptemp;
1478 DenseSegPtr dsp=NULL;
1479 Int4 numseg=0;
1480
1481 if(salp)
1482 {
1483 /*densediag */
1484 if(salp->segtype==1)
1485 {
1486
1487 ddp=(DenseDiagPtr)salp->segs;
1488 if (!ddp) {
1489 //LCOV_EXCL_START
1490 // unreachable for valid ASN.1
1491 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1492 //LCOV_EXCL_STOP
1493 }
1494 else
1495 for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1496 {
1497 sip=ddptemp->id;
1498 if(ddptemp->dim==1)
1499 ValMessage (salp, Err_Segs_Dim_One, SEV_ERROR, NULL, sip , numseg+1);
1500 if(ddptemp->dim!=CountSeqIdInSip(sip))
1501 ValMessage (salp, Err_Segs_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , numseg+1);
1502
1503 }
1504 }
1505
1506 /*denseseg, packseg */
1507 else if(salp->segtype==2||salp->segtype==4)
1508 {
1509 dsp=(DenseSegPtr) (salp->segs);
1510 if (!dsp) {
1511 //LCOV_EXCL_START
1512 // unreachable for valid ASN.1
1513 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1514 //LCOV_EXCL_STOP
1515 }
1516 else
1517 {
1518 sip=dsp->ids;
1519 if(dsp->dim==1)
1520 ValMessage (salp, Err_SeqAlign_Dim_One, SEV_ERROR, NULL, sip , 0);
1521 if(dsp->dim!=CountSeqIdInSip(sip))
1522 ValMessage (salp, Err_SeqAlign_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , 0);
1523
1524 }
1525 }
1526
1527 /*stdseg */
1528 else if(salp->segtype==3)
1529 {
1530
1531 ssp=(StdSegPtr)salp->segs;
1532 if (!ssp) {
1533 //LCOV_EXCL_START
1534 // unreachable for valid ASN.1
1535 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1536 //LCOV_EXCL_STOP
1537 }
1538 else
1539 for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++)
1540 {
1541
1542 sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
1543 if(ssptemp->dim==1)
1544 ValMessage (salp, Err_Segs_Dim_One, SEV_ERROR, NULL, sip , numseg+1);
1545 if(ssptemp->dim!=CountSeqIdInSip( sip))
1546 ValMessage (salp, Err_Segs_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , numseg+1);
1547 /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1548
1549 SeqIdSetFree(sip);
1550 }
1551 }
1552 }
1553 }
1554
1555 /******************************************************************
1556 return true if a sip is contained in a seg, or false if otherwise
1557 Note it returns FASLE for an empty seqloc.
1558 It also returns false if error in sip or ssp
1559 ******************************************************************/
IsSipContainedInStdseg(SeqIdPtr sip,StdSegPtr ssp)1560 static Boolean IsSipContainedInStdseg(SeqIdPtr sip, StdSegPtr ssp)
1561 {
1562 SeqLocPtr slp, slptemp;
1563
1564 if(!sip||!ssp)
1565 return FALSE;
1566
1567 slp=ssp->loc;
1568 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
1569 {
1570 if(slptemp->choice!=SEQLOC_EMPTY&&SeqIdCmp(sip, SeqLocId(slptemp)))
1571 return TRUE;
1572 }
1573
1574 return FALSE;
1575 }
1576
PercentStringMatch(CharPtr string1,CharPtr string2)1577 static Int4 PercentStringMatch (CharPtr string1, CharPtr string2)
1578 {
1579 Int4 len1, len2, min_len, k, max_len;
1580 Int4 num_match = 0;
1581
1582 if (StringHasNoText (string1) || StringHasNoText (string2))
1583 {
1584 return 0;
1585 }
1586 len1 = StringLen (string1);
1587 len2 = StringLen (string2);
1588
1589 if (len1 > len2)
1590 {
1591 min_len = len2;
1592 max_len = len1;
1593 }
1594 else
1595 {
1596 min_len = len1;
1597 max_len = len2;
1598 }
1599
1600 for (k = 0; k < min_len; k++)
1601 {
1602 if (string1[k] == string2[k] || string1[k] == 'N' || string2[k] == 'N')
1603 {
1604 num_match++;
1605 }
1606 }
1607 return (100 * num_match) / min_len;
1608 }
1609
CheckForPercentMatch(SeqIdPtr sip_list)1610 static Boolean CheckForPercentMatch (SeqIdPtr sip_list)
1611 {
1612 SeqIdPtr sip_temp, sip_next;
1613 BioseqPtr bsp;
1614 CharPtr master_seq = NULL, this_seq = NULL;
1615
1616 if (sip_list == NULL) return FALSE;
1617 sip_next = sip_list->next;
1618 sip_list->next = NULL;
1619 bsp = BioseqFind (sip_list);
1620 if (bsp != NULL)
1621 {
1622 master_seq = GetSequenceByBsp (bsp);
1623 }
1624 sip_list->next = sip_next;
1625 sip_temp = sip_next;
1626 if (bsp == NULL || master_seq == NULL)
1627 {
1628 return FALSE;
1629 }
1630
1631 for (sip_temp = sip_next; sip_temp != NULL; sip_temp = sip_next)
1632 {
1633 sip_next = sip_temp->next;
1634 sip_temp->next = NULL;
1635
1636 bsp = BioseqFind (sip_temp);
1637 if (bsp != NULL)
1638 {
1639 this_seq = GetSequenceByBsp (bsp);
1640 } else {
1641 this_seq = NULL;
1642 }
1643
1644 sip_temp->next = sip_next;
1645 if (bsp == NULL || StringHasNoText (this_seq) || PercentStringMatch (master_seq, this_seq) < 50)
1646 {
1647 MemFree (this_seq);
1648 return FALSE;
1649 }
1650 MemFree (this_seq);
1651 }
1652 return TRUE;
1653 }
1654
1655
1656 /******************************************************************
1657 check if an alignment is FASTA-like.
1658 If all gaps are at the 3' ends with dimensions>2, it's FASTA-like
1659 ******************************************************************/
Is_Fasta_Seqalign(SeqAlignPtr salp)1660 static Boolean Is_Fasta_Seqalign (SeqAlignPtr salp)
1661 {
1662
1663 SeqIdPtr siptemp=NULL;
1664 DenseSegPtr dsp;
1665 Int4Ptr startp;
1666 Boolean gap;
1667 Int4 k;
1668 Int2 j;
1669 SeqIdPtr bad_sip = NULL;
1670
1671 /*check only global or partial type*/
1672 if(salp->type!=1&&salp->type!=3)
1673 return FALSE;
1674
1675 if (salp->segtype != SAS_DENSEG) {
1676 ValMessage (salp, Err_Unexpected_Alignment_Type, SEV_ERROR, NULL, NULL, 0);
1677 } else {
1678 dsp = (DenseSegPtr) salp->segs;
1679 if(!dsp)
1680 {
1681 //LCOV_EXCL_START
1682 // unreachable for valid ASN.1
1683 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1684 //LCOV_EXCL_STOP
1685 }
1686 else
1687 {
1688 if(dsp->dim<=2)
1689 {
1690 return FALSE;
1691 }
1692 /* if any sequence has gaps at the 5' end or internal gaps, the entire
1693 * alignment is declared to be valid.
1694 * if the sequence contains no gaps at all or only 3' end gaps, check
1695 * sequences for matches against the first sequence - if more than half
1696 * of the nucleotides are matches, then call this not FASTA-like.
1697 */
1698 for (j=0, siptemp=dsp->ids; j<dsp->dim&&siptemp; j++, siptemp=siptemp->next)
1699 {
1700 gap=FALSE;
1701
1702 for (k=0; k<dsp->numseg; k++)
1703 {
1704 startp=dsp->starts;
1705
1706 /*if start value is -1, set gap flag to true*/
1707 if (startp[dsp->dim*k + j] < 0)
1708 {
1709 gap = TRUE;
1710 }
1711 /*if a positive start value is found after the initial -1 start value, then it's not fasta like, no need to check this sequence further */
1712 else if(gap)
1713 {
1714 if (bad_sip != NULL)
1715 {
1716 SeqIdFree (bad_sip);
1717 }
1718 return FALSE;
1719 }
1720 /* if no positive start value is found after the initial -1 start value
1721 * (indicating that gaps exist only at the 5' end) or if no gaps
1722 * were found at all, flag this sequence as bad if it is the first found.
1723 */
1724 if(k==dsp->numseg-1)
1725 {
1726 if (bad_sip == NULL)
1727 {
1728 bad_sip = SeqIdDup (siptemp);
1729 }
1730 }
1731 }
1732 }
1733 if (bad_sip != NULL)
1734 {
1735 if (! CheckForPercentMatch (dsp->ids))
1736 {
1737 ValMessage (salp, Err_Fastalike, SEV_WARNING, bad_sip, dsp->ids, 0);
1738 SeqIdFree (bad_sip);
1739 return TRUE;
1740 }
1741 SeqIdFree (bad_sip);
1742 return FALSE;
1743 }
1744 }
1745 }
1746 /*no fasta like sequence is found*/
1747 return FALSE;
1748
1749 }
1750
1751
1752
1753 /******************************************************************
1754 check if there is a gap for all sequence in a segment
1755 ******************************************************************/
Segment_Gap_In_SeqAlign(SeqAlignPtr salp)1756 static void Segment_Gap_In_SeqAlign(SeqAlignPtr salp)
1757 {
1758 Int4Ptr stptr=NULL;
1759 DenseSegPtr dsp=NULL;
1760 DenseDiagPtr ddp=NULL, ddptemp;
1761 StdSegPtr ssp=NULL, ssptemp;
1762 PackSegPtr psp=NULL;
1763 Uint1Ptr seqpresence=NULL;
1764 Int2 numseg, aligndim, i, j;
1765 SeqIdPtr sip=NULL;
1766 SeqLocPtr slp=NULL, slptemp;
1767
1768
1769 if(salp)
1770 {
1771 /*densediag*/
1772 if(salp->segtype==1)
1773 {
1774 ddp=(DenseDiagPtr)salp->segs;
1775 if (!ddp) {
1776 //LCOV_EXCL_START
1777 // unreachable for valid ASN.1
1778 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1779 //LCOV_EXCL_STOP
1780 }
1781 else
1782 {
1783 for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1784 {
1785 sip=ddptemp->id;
1786 /*empty segment*/
1787 if (ddptemp->dim == 0) {
1788 //LCOV_EXCL_START
1789 //ASN.1 is unreadable if dim is 0
1790 ValMessage(salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, numseg);
1791 //LCOV_EXCL_STOP
1792 }
1793 }
1794 }
1795 }
1796
1797
1798 /*denseseg*/
1799 else if(salp->segtype==2)
1800 {
1801 dsp=(DenseSegPtr)salp->segs;
1802 if (!dsp) {
1803 //LCOV_EXCL_START
1804 // unreachable for valid ASN.1
1805 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1806 //LCOV_EXCL_STOP
1807 }
1808 else
1809 {
1810 numseg=dsp->numseg;
1811 aligndim=dsp->dim;
1812 stptr=dsp->starts;
1813 sip=dsp->ids;
1814
1815 if(stptr==NULL)
1816 return;
1817
1818 /*go through each segment*/
1819 for(j=0; j<numseg; j++)
1820 {
1821 /*go through each sequence */
1822 for(i=0; i<aligndim; i++)
1823 {
1824
1825 if(stptr[j*aligndim+i]==-1)
1826 {
1827 /*all starts are -1 in this segment*/
1828 if(i==aligndim-1)
1829 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, j);
1830 }
1831 /*at least one start that is not -1*/
1832 else
1833 break;
1834
1835 }
1836 }
1837 }
1838 }
1839
1840 /*stdseg*/
1841 else if(salp->segtype==3)
1842 {
1843 ssp=(StdSegPtr)salp->segs;
1844 if (!ssp) {
1845 //LCOV_EXCL_START
1846 // unreachable for valid ASN.1
1847 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1848 //LCOV_EXCL_STOP
1849 }
1850 else
1851 {
1852 /*go through each segment*/
1853 for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++)
1854 {
1855 sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
1856 slp=ssptemp->loc;
1857 /*go through each sequence*/
1858 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
1859 {
1860 if(slptemp->choice==SEQLOC_EMPTY||slptemp->choice==SEQLOC_NULL)
1861 {
1862 if(slptemp->next)
1863 continue;
1864 /*all seqloc are empty*/
1865 else
1866 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, numseg);
1867 }
1868 /*at least one non-empty seqloc*/
1869 else
1870 break;
1871 }
1872 /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1873 SeqIdSetFree(sip);
1874
1875 }
1876 }
1877 }
1878 /*packseg*/
1879 else if(salp->segtype==4)
1880 {
1881 psp=(PackSegPtr)salp->segs;
1882 if (!psp) {
1883 //LCOV_EXCL_START
1884 // unreachable for valid ASN.1
1885 ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1886 //LCOV_EXCL_STOP
1887 }
1888 else
1889 {
1890 numseg=psp->numseg;
1891 aligndim=psp->dim;
1892 sip=psp->ids;
1893 if(psp->present)
1894 {
1895 BSSeek(psp->present, 0, SEEK_SET);
1896 seqpresence=MemNew(BSLen(psp->present));
1897 if(!seqpresence)
1898 {
1899 ErrPostEx (SEV_ERROR, 0,0, "Warning:insufficient memory");
1900 return;
1901
1902 }
1903 BSRead(psp->present, seqpresence, BSLen(psp->present));
1904
1905 /*go through each segment*/
1906 for(j=0; j<numseg; j++)
1907 {
1908 /*go through each sequence */
1909 for(i=0; i<aligndim; i++)
1910 {
1911 /*check the presence of each sequence by determining the bit value in a byte (0, not present; otherwise present)*/
1912 if(!(seqpresence[(j*aligndim+i)/8]&jybitnum[(j*aligndim+i)%8]))
1913 {
1914 /*more sequence to go*/
1915 if(i<aligndim-1)
1916 continue;
1917 /*no sequence is present in this segment*/
1918 else if(i==aligndim-1)
1919 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, j);
1920 }
1921 /*at least one sequence is present*/
1922 else
1923 break;
1924 }
1925 }
1926 MemFree(seqpresence);
1927 }
1928 }
1929
1930 }
1931
1932
1933 }
1934 }
1935
1936
IsAlignmentTPA(SeqAlignPtr salp)1937 static Boolean IsAlignmentTPA (SeqAlignPtr salp)
1938 {
1939 Boolean isTPA = FALSE;
1940 BioseqPtr bsp;
1941 SeqIdPtr sip = NULL, tmp_sip;
1942 SeqEntryPtr oldscope;
1943 DenseDiagPtr ddp;
1944 StdSegPtr ssp;
1945
1946 if (salp == NULL) {
1947 return FALSE;
1948 }
1949
1950 oldscope = SeqEntrySetScope (NULL);
1951
1952 switch (salp->segtype) {
1953 case SAS_DENDIAG:
1954 /*densediag */
1955 for (ddp = (DenseDiagPtr) salp->segs; ddp != NULL && !isTPA; ddp = ddp->next) {
1956 for (sip = SeqIdInAlignSegs((Pointer)ddp, salp->segtype, salp);
1957 sip != NULL && !isTPA;
1958 sip = sip->next) {
1959 bsp = BioseqLockById(sip);
1960 isTPA = HasTpaUserObject(bsp);
1961 BioseqUnlock(bsp);
1962 }
1963 }
1964 break;
1965 case SAS_STD:
1966 /*Stdseg*/
1967 for (ssp = (StdSegPtr) salp->segs; ssp != NULL && !isTPA; ssp = ssp->next) {
1968 sip = SeqIdInAlignSegs((Pointer)ssp, salp->segtype, salp);
1969 for (tmp_sip = sip;
1970 tmp_sip != NULL && !isTPA;
1971 tmp_sip = tmp_sip->next) {
1972 bsp = BioseqLockById(tmp_sip);
1973 isTPA = HasTpaUserObject(bsp);
1974 BioseqUnlock(bsp);
1975 }
1976 }
1977 /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1978 SeqIdSetFree(sip);
1979 break;
1980 case SAS_DENSEG:
1981 case SAS_PACKED:
1982 /*Denseseg, Packseg*/
1983 for (sip=SeqIdInAlignSegs(salp->segs, salp->segtype, salp);
1984 sip != NULL && !isTPA;
1985 sip = sip->next) {
1986 /* NOTE - we do not want to fetch Bioseqs if they aren't local.
1987 * we only care about TpaUserObjects on Bioseqs in THIS record.
1988 */
1989 bsp = BioseqFind(sip);
1990 isTPA = HasTpaUserObject(bsp);
1991 }
1992 break;
1993 }
1994
1995 SeqEntrySetScope (oldscope);
1996 return isTPA;
1997 }
1998
1999
2000 //LCOV_EXCL_START
2001 // only call to this function is commented out
CheckAlnSeqLens(SeqAlignPtr salp)2002 static void CheckAlnSeqLens (SeqAlignPtr salp)
2003 {
2004 Int4 aln_len, start, stop;
2005 Int4 num_rows, row;
2006 SeqIdPtr sip;
2007 BioseqPtr bsp;
2008 Boolean is_shorter = FALSE;
2009
2010 if (salp == NULL) return;
2011
2012 aln_len = AlnMgr2GetAlnLength(salp, FALSE);
2013 num_rows = AlnMgr2GetNumRows(salp);
2014 if (num_rows < 0) {
2015 return;
2016 }
2017
2018 for (row = 1; row <= num_rows && !is_shorter; row++) {
2019 sip = AlnMgr2GetNthSeqIdPtr(salp, row);
2020 bsp = BioseqFind (sip);
2021 if (bsp != NULL && bsp->idx.entityID == salp->idx.entityID) {
2022 AlnMgr2GetNthSeqRangeInSA(salp, row, &start, &stop);
2023 if ((stop > start && stop < bsp->length - 1) || (start > stop && start > bsp->length - 1)) {
2024 is_shorter = TRUE;
2025 }
2026 }
2027 sip = SeqIdFree (sip);
2028 }
2029 if (is_shorter) {
2030 ValMessage (salp, Err_Short_Aln, SEV_INFO, NULL, NULL, 0);
2031 }
2032 }
2033 //LCOV_EXCL_STOP
2034
2035
AlignmentScorePercentIdOk(SeqAlignPtr salp)2036 static Boolean AlignmentScorePercentIdOk (SeqAlignPtr salp)
2037 {
2038 ScorePtr score;
2039
2040 if (salp == NULL) {
2041 return FALSE;
2042 }
2043 for (score = salp->score; score != NULL; score = score->next) {
2044 if (score->id != NULL
2045 && score->id->str != NULL
2046 && StringICmp (score->id->str, "pct_identity_ungap") == 0) {
2047 if (score->value.realvalue > 50.0) {
2048 return TRUE;
2049 } else {
2050 return FALSE;
2051 }
2052 }
2053 }
2054 return FALSE;
2055 }
2056
2057
2058 /******************************************************************
2059 validate seqid, segment length, strand in Seqalignment for Denseseg,
2060 Densediag and Stdseg. Also check if it's FASTA-like
2061 ******************************************************************/
ValidateSeqAlignFunc(SeqAlignPtr salp,Boolean find_remote_bsp)2062 static Boolean ValidateSeqAlignFunc (SeqAlignPtr salp, Boolean find_remote_bsp)
2063 {
2064 Boolean error=FALSE;
2065 Uint2 pcnt_identity;
2066 SeqAlignPtr salp_test;
2067
2068 if(salp==NULL)
2069 return FALSE;
2070
2071 /*validate if dimesion equals number of seqid*/
2072 ValidateDimSeqIds (salp);
2073
2074 if (find_remote_bsp) {
2075 ValidateSeqIdInSeqAlign (salp);
2076 ValidateSeqlengthinSeqAlign (salp);
2077 }
2078 /*validate strand*/
2079 ValidateStrandinSeqAlign (salp);
2080
2081 /*validate Fasta like*/
2082 if (Is_Fasta_Seqalign (salp))
2083 {
2084 error = TRUE;
2085 }
2086
2087 /*validate segment gap*/
2088 Segment_Gap_In_SeqAlign (salp);
2089
2090 if (!IsAlignmentTPA(salp) && !AlignmentScorePercentIdOk(salp)) {
2091 if (salp->segtype == SAS_DENDIAG) {
2092 /* duplicate alignment, to prevent indexing from changing the original type */
2093 salp_test = SeqAlignDup (salp);
2094 pcnt_identity = AlignmentPercentIdentityEx (salp_test, FALSE, TRUE);
2095 salp_test = SeqAlignFree (salp_test);
2096 } else {
2097 pcnt_identity = AlignmentPercentIdentityEx (salp, FALSE, TRUE);
2098 }
2099
2100 if (pcnt_identity < 50) {
2101 ValMessage (salp, Err_Pcnt_ID, SEV_WARNING, NULL, NULL, pcnt_identity);
2102 }
2103
2104 /* CheckAlnSeqLens (salp); */
2105 }
2106
2107 return error;
2108 }
2109
2110
2111 /******************************************************************
2112 validate each alignment sequentially.
2113 This function will subject the seqalign to all validation functions
2114 ******************************************************************/
ValidateSeqAlign(SeqAlignPtr salp,Uint2 entityID,Boolean message,Boolean msg_success,Boolean find_remote_bsp,Boolean delete_bsp,Boolean delete_salp,BoolPtr dirty)2115 NLM_EXTERN Boolean ValidateSeqAlign (SeqAlignPtr salp, Uint2 entityID, Boolean message,
2116 Boolean msg_success, Boolean find_remote_bsp,
2117 Boolean delete_bsp, Boolean delete_salp, BoolPtr dirty)
2118 {
2119 SeqAlignPtr pre,
2120 salptmp;
2121 SaVal sv;
2122 SaValPtr svp;
2123 ValNodePtr vnp;
2124 JYErrorMsgPtr bemp;
2125 MsgAnswer ans;
2126 Int2 err_count=0,
2127 salp_count=0;
2128 Boolean retdel = FALSE;
2129
2130 if(salp!=NULL)
2131 {
2132 sv.message = message;
2133 sv.msg_success = msg_success;
2134 sv.find_remote_bsp = find_remote_bsp;
2135 sv.delete_salp = delete_salp;
2136 sv.delete_bsp = delete_bsp;
2137 sv.retdel = TRUE;
2138 sv.do_hist_assembly = FALSE;
2139 sv.ids = NULL;
2140 sv.entityID = entityID;
2141 sv.dirty = FALSE;
2142 svp = &sv;
2143 pre=NULL;
2144 salptmp=salp;
2145 while (salptmp)
2146 {
2147 salp_count++;
2148 if (salptmp->segtype == SAS_SPARSE) {
2149 ValMessage (salp, Err_Segtype, SEV_WARNING, NULL, NULL, salptmp->segtype);
2150 } else if (salptmp->segtype == SAS_SPLICED) {
2151 ValMessage (salp, Err_Segtype, SEV_WARNING, NULL, NULL, salptmp->segtype);
2152 }
2153 else if (salptmp->segtype==5)
2154 {
2155 ValidateSeqAlign ((SeqAlignPtr) (salptmp->segs), entityID, message, msg_success, find_remote_bsp, delete_bsp, delete_salp, &svp->dirty);
2156 }
2157 else if (salptmp->segtype<1 || salptmp->segtype>4)
2158 {
2159 ValMessage (salp, Err_Segtype, SEV_ERROR, NULL, NULL, salptmp->segtype);
2160 }
2161 else {
2162 ValidateSeqAlignFunc (salptmp, svp->find_remote_bsp);
2163 }
2164 if (errorp)
2165 {
2166 //LCOV_EXCL_START
2167 //not used
2168 if(svp->message)
2169 {
2170 for (vnp=errorp; vnp!=NULL; vnp=vnp->next)
2171 {
2172 bemp=(JYErrorMsgPtr)vnp->data.ptrvalue;
2173 ErrPostEx ((ErrSev) bemp->level, 0, 0, bemp->msg);
2174 }
2175 }
2176 errorp = JYErrorChainDestroy (errorp);
2177 if (svp->delete_salp)
2178 {
2179 if (pre==NULL) {
2180 salp=salptmp->next;
2181 salptmp->next = NULL;
2182 SeqAlignFree (salptmp);
2183 salptmp = salp;
2184 }
2185 else {
2186 pre->next = salptmp->next;
2187 salptmp->next = NULL;
2188 SeqAlignFree (salptmp);
2189 salptmp = pre->next;
2190 }
2191 }
2192 else {
2193 salptmp = salptmp->next;
2194 }
2195 err_count++;
2196 svp->retdel=FALSE;
2197 //LCOV_EXCL_STOP
2198 }
2199 else {
2200 salptmp = salptmp->next;
2201 }
2202 }
2203 if (err_count==0 && svp->msg_success) {
2204 if (salp_count>1)
2205 ans = Message (MSG_OK, "Validation test of %d alignments succeeded", salp_count);
2206 else
2207 ans = Message (MSG_OK, "Validation test of the alignment succeeded");
2208 }
2209 if (dirty)
2210 *dirty = svp->dirty;
2211 retdel = svp->retdel;
2212 }
2213 return retdel;
2214 }
2215
2216
2217 //LCOV_EXCL_START
2218 /******************************************************************
2219 call back function for REGISTER_ALIGNVALIDATION defined in sequin4.c.
2220 Starting point for seqalign validation if user clicked on
2221 SeqalignValidation under menu Filer/Alignment.
2222 Either individual alignment or alignment block
2223 should be highlighted for this validation to work
2224 ******************************************************************/
2225
ValidateSeqAlignFromData(Pointer data)2226 NLM_EXTERN Int2 LIBCALLBACK ValidateSeqAlignFromData (Pointer data)
2227 {
2228
2229 OMProcControlPtr ompcp;
2230 SeqAlignPtr salp=NULL;
2231 SeqAnnotPtr sap=NULL;
2232 SeqEntryPtr sep=NULL;
2233
2234 ompcp = (OMProcControlPtr) data;
2235 if (ompcp == NULL || ompcp->proc == NULL) return OM_MSG_RET_ERROR;
2236
2237 if (ompcp->input_data == NULL) return OM_MSG_RET_ERROR;
2238
2239 switch(ompcp->input_itemtype)
2240 {
2241 case OBJ_BIOSEQ :
2242 sep = SeqMgrGetSeqEntryForData (ompcp->input_data);
2243 break;
2244 case OBJ_BIOSEQSET :
2245 sep = SeqMgrGetSeqEntryForData (ompcp->input_data);
2246 break;
2247 /*if clicked on alignment block*/
2248 case OBJ_SEQANNOT:
2249 sap=(SeqAnnotPtr) (ompcp->input_data);
2250 break;
2251 /*if clicked on individual alignment*/
2252 case OBJ_SEQALIGN:
2253 salp=(SeqAlignPtr) (ompcp->input_data);
2254 break;
2255 case 0 :
2256 return OM_MSG_RET_ERROR;
2257 default :
2258 return OM_MSG_RET_ERROR;
2259 }
2260
2261 ErrSetMessageLevel(SEV_ERROR);
2262 if(sap!=NULL)
2263 {
2264 salp=is_salp_in_sap(sap, 2);
2265 ValidateSeqAlign (salp, 0, TRUE, TRUE, TRUE, FALSE, FALSE, NULL);
2266 }
2267 if (salp!=NULL) {
2268 ValidateSeqAlign (salp, 0, TRUE, TRUE, TRUE, FALSE, FALSE, NULL);
2269 }
2270 if (sep!=NULL) {
2271 ValidateSeqAlignInSeqEntry (sep, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE);
2272 }
2273 return OM_MSG_RET_DONE;
2274 }
2275 //LCOV_EXCL_STOP
2276
ValidateSeqAlignInAnnot(SeqAnnotPtr sap,SaValPtr svp)2277 static void ValidateSeqAlignInAnnot (SeqAnnotPtr sap, SaValPtr svp)
2278
2279 {
2280 SeqAlignPtr salp;
2281
2282 while (sap != NULL) {
2283 if (sap->type == 2) {
2284 salp = (SeqAlignPtr) sap->data;
2285 if (salp != NULL) {
2286 ValidateSeqAlign (salp, svp->entityID, svp->message, svp->msg_success, svp->find_remote_bsp, svp->delete_bsp, svp->delete_salp, &svp->dirty);
2287 }
2288 }
2289 sap = sap->next;
2290 }
2291 }
2292
ValidateSeqAlignInHist(SeqHistPtr hist,SaValPtr svp)2293 static void ValidateSeqAlignInHist (SeqHistPtr hist, SaValPtr svp)
2294
2295 {
2296 SeqAlignPtr salp;
2297
2298 if (hist == NULL) return;
2299 salp = hist->assembly;
2300 /* ValidateSeqAlign will validate the entire chain */
2301 ValidateSeqAlign (salp, svp->entityID, svp->message, svp->msg_success, svp->find_remote_bsp, svp->delete_bsp, svp->delete_salp, &svp->dirty);
2302 }
2303
ValidateSeqAlignCallback(SeqEntryPtr sep,Pointer mydata,Int4 index,Int2 indent)2304 static void ValidateSeqAlignCallback (SeqEntryPtr sep, Pointer mydata,
2305 Int4 index, Int2 indent)
2306 {
2307 BioseqPtr bsp;
2308 BioseqSetPtr bssp;
2309 SaValPtr svp;
2310
2311 if (sep != NULL && sep->data.ptrvalue && mydata != NULL) {
2312 svp = (SaValPtr)mydata;
2313 if (IS_Bioseq(sep)) {
2314 bsp = (BioseqPtr) sep->data.ptrvalue;
2315 if (bsp!=NULL) {
2316 ValidateSeqAlignInAnnot (bsp->annot, svp);
2317 if (svp != NULL && svp->do_hist_assembly) {
2318 ValidateSeqAlignInHist (bsp->hist, svp);
2319 }
2320 }
2321 }
2322 else if(IS_Bioseq_set(sep)) {
2323 bssp = (BioseqSetPtr)sep->data.ptrvalue;
2324 if (bssp!=NULL) {
2325 ValidateSeqAlignInAnnot (bssp->annot, svp);
2326 }
2327 }
2328 }
2329 }
2330
2331
2332
ValidateSeqAlignInSeqEntry(SeqEntryPtr sep,Boolean message,Boolean msg_success,Boolean find_remote_bsp,Boolean delete_bsp,Boolean delete_salp,Boolean do_hist_assembly)2333 NLM_EXTERN Boolean ValidateSeqAlignInSeqEntry (SeqEntryPtr sep, Boolean message,
2334 Boolean msg_success, Boolean find_remote_bsp,
2335 Boolean delete_bsp, Boolean delete_salp,
2336 Boolean do_hist_assembly)
2337 {
2338 SeqEntryPtr sep_head;
2339 Uint2 entityID;
2340 SaVal sv;
2341 Boolean success=TRUE;
2342
2343 entityID = ObjMgrGetEntityIDForChoice (sep);
2344 if (entityID > 0) {
2345 sep_head = GetTopSeqEntryForEntityID (entityID);
2346 if (sep_head != NULL) {
2347 sv.message = message;
2348 sv.msg_success = msg_success;
2349 sv.find_remote_bsp = find_remote_bsp;
2350 sv.find_acc_bsp = FALSE;
2351 sv.delete_salp = delete_salp;
2352 sv.delete_bsp = delete_bsp;
2353 sv.retdel = TRUE;
2354 sv.do_hist_assembly = do_hist_assembly;
2355 sv.ids = NULL;
2356 sv.entityID = entityID;
2357 sv.dirty = FALSE;
2358 SeqEntryExplore (sep_head, (Pointer)&sv, ValidateSeqAlignCallback);
2359 if (sv.dirty) {
2360 ObjMgrSetDirtyFlag (entityID, TRUE);
2361 ObjMgrSendMsg (OM_MSG_UPDATE, entityID, 0, 0);
2362 }
2363 success = sv.retdel;
2364 }
2365 }
2366 return success;
2367 }
2368
2369
2370 /* alignment validator private for regular validator */
2371
2372 NLM_EXTERN Boolean ValidateSeqAlignWithinValidator (ValidStructPtr vsp, SeqEntryPtr sep, Boolean find_remote_bsp, Boolean do_hist_assembly);
2373
ValidateSeqAlignWithinValidator(ValidStructPtr vsp,SeqEntryPtr sep,Boolean find_remote_bsp,Boolean do_hist_assembly)2374 NLM_EXTERN Boolean ValidateSeqAlignWithinValidator (ValidStructPtr vsp, SeqEntryPtr sep, Boolean find_remote_bsp, Boolean do_hist_assembly)
2375
2376 {
2377 GatherContext gc;
2378 Boolean rsult;
2379
2380 if (vsp == NULL || sep == NULL) return FALSE;
2381 useLockByID = vsp->farIDsInAlignments;
2382 useValErr = TRUE;
2383 useVsp = vsp;
2384 vsp->gcp = &gc;
2385 vsp->bssp = NULL;
2386 vsp->bsp = NULL;
2387 vsp->sfp = NULL;
2388 vsp->descr = NULL;
2389 MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
2390 rsult = ValidateSeqAlignInSeqEntry (sep, FALSE, FALSE, find_remote_bsp, FALSE, FALSE, do_hist_assembly);
2391 useLockByID = TRUE;
2392 useValErr = FALSE;
2393 useVsp = NULL;
2394 return rsult;
2395 }
2396
2397
2398 /* PopulateSample and ReadFromAlignmentSample are utility functions for AlignmentPercentIdentity */
PopulateSample(Uint1Ptr seqbuf_list,Int4Ptr start_list,Int4 sample_len,BioseqPtr PNTR bsp_list,Int4 row)2399 static void PopulateSample (Uint1Ptr seqbuf_list, Int4Ptr start_list,
2400 Int4 sample_len, BioseqPtr PNTR bsp_list,
2401 Int4 row)
2402 {
2403 Char ch;
2404
2405 if (seqbuf_list == NULL || start_list == NULL || sample_len < 1 || row < 0 || bsp_list == NULL
2406 || bsp_list[row] == NULL || start_list[row] < 0 || start_list[row] >= bsp_list[row]->length) {
2407 return;
2408 }
2409
2410 ch = *(seqbuf_list + (row + 1) * sample_len);
2411
2412 SeqPortStreamInt (bsp_list[row],
2413 start_list[row],
2414 MIN (start_list[row] + sample_len - 1, bsp_list[row]->length - 1),
2415 Seq_strand_plus,
2416 STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
2417 seqbuf_list + row * sample_len,
2418 NULL);
2419
2420 /* put back char overwritten by SeqPortStreamInt */
2421 *(seqbuf_list + (row + 1) * sample_len) = ch;
2422
2423 }
2424
2425
ComplementChar(Uint1 ch)2426 static Uint1 ComplementChar (Uint1 ch)
2427 {
2428 if (ch == 'A') {
2429 return 'T';
2430 } else if (ch == 'T') {
2431 return 'A';
2432 } else if (ch == 'G') {
2433 return 'C';
2434 } else if (ch == 'C') {
2435 return 'G';
2436 } else {
2437 return ch;
2438 }
2439 }
2440
ReadFromAlignmentSample(Uint1Ptr seqbuf_list,Int4Ptr start_list,Int4 sample_len,BioseqPtr PNTR bsp_list,Uint1Ptr strand_list,Int4 row,Int4 seq_pos)2441 static Uint1 ReadFromAlignmentSample(Uint1Ptr seqbuf_list, Int4Ptr start_list,
2442 Int4 sample_len, BioseqPtr PNTR bsp_list,
2443 Uint1Ptr strand_list,
2444 Int4 row, Int4 seq_pos)
2445 {
2446 Uint1 ch = 0;
2447
2448 if (seqbuf_list == NULL || start_list == NULL || sample_len < 1 || row < 0 || bsp_list == NULL
2449 || bsp_list[row] == NULL || seq_pos < 0 || seq_pos >= bsp_list[row]->length) {
2450 return 0;
2451 }
2452
2453 if (seq_pos < start_list[row] || seq_pos >= start_list[row] + sample_len) {
2454 start_list[row] = (seq_pos / sample_len) * sample_len;
2455 PopulateSample (seqbuf_list, start_list,
2456 sample_len, bsp_list,
2457 row);
2458 }
2459 ch = seqbuf_list[(row * sample_len) + seq_pos - start_list[row]];
2460 if (strand_list[row] == Seq_strand_minus) {
2461 ch = ComplementChar(ch);
2462 }
2463 return ch;
2464 }
2465
2466 typedef struct ambchar {
2467 Char ambig_char;
2468 CharPtr match_list;
2469 } AmbCharData, PNTR AmbCharPtr;
2470
2471 static const AmbCharData ambiguity_list[] = {
2472 { 'R', "AG" },
2473 { 'Y', "CT" },
2474 { 'M', "AC" },
2475 { 'K', "GT" },
2476 { 'S', "CG" },
2477 { 'W', "AT" },
2478 { 'H', "ACT" },
2479 { 'B', "CGT" },
2480 { 'V', "ACG" },
2481 { 'D', "AGT" }};
2482
2483 static const Int4 num_ambiguities = sizeof (ambiguity_list) / sizeof (AmbCharData);
2484
AmbiguousMatch(Char ch1,Char ch2)2485 static Char AmbiguousMatch (Char ch1, Char ch2)
2486 {
2487 Int4 i;
2488 for (i = 0; i < num_ambiguities; i++) {
2489 if (ch1 == ambiguity_list[i].ambig_char
2490 && StringChr (ambiguity_list[i].match_list, ch2)) {
2491 return ch2;
2492 } else if (ch2 == ambiguity_list[i].ambig_char
2493 && StringChr (ambiguity_list[i].match_list, ch1)) {
2494 return ch1;
2495 }
2496 }
2497 return 0;
2498 }
2499
2500
2501 //LCOV_EXCL_START
2502 extern double *
GetAlignmentColumnPercentIdentities(SeqAlignPtr salp,Int4 start,Int4 stop,Boolean internal_gaps,Boolean internal_validation)2503 GetAlignmentColumnPercentIdentities
2504 (SeqAlignPtr salp,
2505 Int4 start,
2506 Int4 stop,
2507 Boolean internal_gaps,
2508 Boolean internal_validation)
2509 {
2510 Int4 aln_len, num_rows, row, col_count = 0;
2511 Int4 num_match;
2512 Int4 aln_pos, seq_pos, k;
2513 Uint1 row_ch;
2514 SeqEntryPtr oldscope;
2515 SeqIdPtr PNTR sip_list;
2516 BioseqPtr PNTR bsp_list;
2517 Uint1Ptr strand_list;
2518 BoolPtr start_gap, end_gap;
2519 Int4Ptr start_list;
2520 Uint1Ptr seqbuf_list;
2521 Int4 sample_len = 50;
2522 Int4 chars_appearing[5]; /* 0 is A, 1 is T, 2 is G, 3 is C, 4 is internal gap */
2523 Int4 max_app, total_app, i;
2524 double * pct_ids;
2525
2526 if (salp == NULL || start < 0 || stop < start) return NULL;
2527
2528 AlnMgr2IndexSingleChildSeqAlign(salp);
2529 aln_len = AlnMgr2GetAlnLength(salp, FALSE);
2530 num_rows = AlnMgr2GetNumRows(salp);
2531 if (num_rows < 0) {
2532 Message (MSG_POSTERR, "AlnMgr2GetNumRows failed");
2533 return NULL;
2534 }
2535
2536 pct_ids = (double *) MemNew (sizeof (double) * (stop - start + 1));
2537 MemSet (pct_ids, 0, sizeof (double) * (stop - start + 1));
2538
2539 bsp_list = (BioseqPtr PNTR) MemNew (num_rows * sizeof (BioseqPtr));
2540 sip_list = (SeqIdPtr PNTR) MemNew (num_rows * sizeof(SeqIdPtr));
2541 strand_list = (Uint1Ptr) MemNew (num_rows * sizeof(Uint1));
2542 start_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2543 end_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2544 for (row = 1; row <= num_rows; row++) {
2545 sip_list[row - 1] = AlnMgr2GetNthSeqIdPtr(salp, row);
2546 strand_list[row - 1] = AlnMgr2GetNthStrand(salp, row);
2547 bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2548 if (bsp_list[row - 1] == NULL) {
2549 oldscope = SeqEntrySetScope (NULL);
2550 bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2551 SeqEntrySetScope(oldscope);
2552 if (bsp_list[row - 1] == NULL) {
2553 break;
2554 }
2555 }
2556 start_gap[row - 1] = TRUE;
2557 end_gap[row - 1] = FALSE;
2558 }
2559
2560 if (row <= num_rows) {
2561 Message (MSG_POSTERR, "Unable to locate Bioseq in alignment");
2562 while (row >= 0) {
2563 sip_list[row] = SeqIdFree(sip_list[row]);
2564 BioseqUnlock(bsp_list[row]);
2565 row--;
2566 }
2567 sip_list = MemFree (sip_list);
2568 bsp_list = MemFree (bsp_list);
2569 start_gap = MemFree (start_gap);
2570 end_gap = MemFree (end_gap);
2571 return 0;
2572 }
2573
2574 start_list = (Int4Ptr) MemNew (num_rows * sizeof(Int4));
2575 seqbuf_list = (Uint1Ptr) MemNew (num_rows * sample_len * sizeof(Uint1));
2576 for (row = 0; row < num_rows; row++) {
2577 start_list[row] = 0;
2578 PopulateSample (seqbuf_list, start_list,
2579 sample_len, bsp_list,
2580 row);
2581 }
2582
2583 num_match = 0;
2584 for (aln_pos = start; aln_pos < aln_len && aln_pos <= stop; aln_pos++) {
2585 /* init lists */
2586 MemSet (chars_appearing, 0, sizeof (chars_appearing));
2587 for (row = 1; row <= num_rows; row++) {
2588 if (end_gap[row - 1]) {
2589 continue;
2590 }
2591 seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, row);
2592 if (seq_pos < 0) {
2593 if (start_gap[row - 1] || end_gap[row - 1]) {
2594 /* beginning/end gap - never counts against percent identity */
2595 } else {
2596 k = aln_pos + 1;
2597 while (k < aln_len && seq_pos < 0) {
2598 seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, k, row);
2599 k++;
2600 }
2601 if (seq_pos < 0) {
2602 /* now in end_gap for this sequence */
2603 end_gap[row - 1] = TRUE;
2604 } else {
2605 /* internal gaps count against percent identity when specified */
2606 if (internal_gaps) {
2607 chars_appearing[4] ++;
2608 }
2609 }
2610 }
2611 } else {
2612 start_gap[row - 1] = FALSE;
2613
2614 row_ch = ReadFromAlignmentSample(seqbuf_list, start_list,
2615 sample_len, bsp_list, strand_list,
2616 row - 1, seq_pos);
2617 switch (row_ch) {
2618 case 'A':
2619 chars_appearing[0]++;
2620 break;
2621 case 'T':
2622 chars_appearing[1]++;
2623 break;
2624 case 'G':
2625 chars_appearing[2]++;
2626 break;
2627 case 'C':
2628 chars_appearing[3]++;
2629 break;
2630 default:
2631 /* we don't count ambiguity characters */
2632 break;
2633 }
2634 }
2635 }
2636 max_app = 0;
2637 total_app = 0;
2638 for (i = 0; i < 4; i++) {
2639 if (chars_appearing[i] > max_app) {
2640 max_app = chars_appearing[i];
2641 }
2642 total_app += chars_appearing[i];
2643 }
2644 /* add in internal gaps */
2645 total_app += chars_appearing[4];
2646 if (total_app > 0) {
2647 pct_ids[aln_pos - start] = (double) max_app / (double) total_app;
2648 }
2649 col_count++;
2650 }
2651
2652 for (row = 0; row < num_rows; row++) {
2653 sip_list[row] = SeqIdFree(sip_list[row]);
2654 BioseqUnlock(bsp_list[row]);
2655 }
2656 sip_list = MemFree (sip_list);
2657 bsp_list = MemFree (bsp_list);
2658 start_gap = MemFree (start_gap);
2659 end_gap = MemFree (end_gap);
2660 start_list = MemFree (start_list);
2661 seqbuf_list = MemFree (seqbuf_list);
2662
2663 return pct_ids;
2664 }
2665 //LCOV_EXCL_STOP
2666
2667
AlignmentPercentIdentityEx(SeqAlignPtr salp,Boolean internal_gaps,Boolean internal_validation)2668 static Uint2 AlignmentPercentIdentityEx (SeqAlignPtr salp, Boolean internal_gaps, Boolean internal_validation)
2669 {
2670 Int4 aln_len, num_rows, row, col_count = 0;
2671 Int4 num_match;
2672 Uint2 pcnt;
2673 Boolean row_match;
2674 Int4 aln_pos, seq_pos, tmp;
2675 Uint1 seq_ch, row_ch, amb_match;
2676 SeqEntryPtr oldscope;
2677 SeqIdPtr PNTR sip_list;
2678 BioseqPtr PNTR bsp_list;
2679 Uint1Ptr strand_list;
2680 Int4Ptr start_list;
2681 Uint1Ptr seqbuf_list;
2682 Int4 sample_len = 50;
2683 Int4Ptr starts, stops;
2684 Int4 match_25 = 0;
2685 ErrSev logsev;
2686 ErrSev msgsev;
2687
2688 if (salp == NULL) return 0;
2689
2690 AlnMgr2IndexSingleChildSeqAlign(salp);
2691 aln_len = AlnMgr2GetAlnLength(salp, FALSE);
2692 num_rows = AlnMgr2GetNumRows(salp);
2693 if (num_rows < 0) {
2694 if (! internal_validation) {
2695 Message (MSG_POSTERR, "AlnMgr2GetNumRows failed");
2696 }
2697 return 0;
2698 }
2699 bsp_list = (BioseqPtr PNTR) MemNew (num_rows * sizeof (BioseqPtr));
2700 sip_list = (SeqIdPtr PNTR) MemNew (num_rows * sizeof(SeqIdPtr));
2701 strand_list = (Uint1Ptr) MemNew (num_rows * sizeof(Uint1));
2702 starts = (Int4Ptr) MemNew (num_rows * sizeof (Int4));
2703 stops = (Int4Ptr) MemNew (num_rows * sizeof (Int4));
2704 for (row = 1; row <= num_rows; row++) {
2705 sip_list[row - 1] = AlnMgr2GetNthSeqIdPtr(salp, row);
2706 strand_list[row - 1] = AlnMgr2GetNthStrand(salp, row);
2707 msgsev = ErrSetMessageLevel (SEV_MAX);
2708 logsev = ErrSetLogLevel (SEV_MAX);
2709 bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2710 ErrSetLogLevel (logsev);
2711 ErrSetMessageLevel (msgsev);
2712 if (bsp_list[row - 1] == NULL) {
2713 oldscope = SeqEntrySetScope (NULL);
2714 msgsev = ErrSetMessageLevel (SEV_MAX);
2715 logsev = ErrSetLogLevel (SEV_MAX);
2716 bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2717 ErrSetLogLevel (logsev);
2718 ErrSetMessageLevel (msgsev);
2719 SeqEntrySetScope(oldscope);
2720 if (bsp_list[row - 1] == NULL) {
2721 break;
2722 }
2723 }
2724 /* get endpoints for each row */
2725 AlnMgr2GetNthSeqRangeInSA(salp, row, starts + row - 1, stops + row - 1);
2726 starts[row - 1] = AlnMgr2MapBioseqToSeqAlign (salp, starts[row - 1], row);
2727 stops[row - 1] = AlnMgr2MapBioseqToSeqAlign (salp, stops[row - 1], row);
2728 if (starts[row - 1] > stops[row - 1]) {
2729 tmp = starts[row - 1];
2730 starts[row - 1] = stops[row - 1];
2731 stops[row - 1] = tmp;
2732 }
2733
2734 }
2735
2736 if (row <= num_rows) {
2737 if (! internal_validation) {
2738 Message (MSG_POSTERR, "Unable to locate Bioseq in alignment");
2739 }
2740 while (row > 0) {
2741 sip_list[row - 1] = SeqIdFree(sip_list[row - 1]);
2742 BioseqUnlock(bsp_list[row - 1]);
2743 row--;
2744 }
2745 sip_list = MemFree (sip_list);
2746 bsp_list = MemFree (bsp_list);
2747 starts = MemFree (starts);
2748 stops = MemFree (stops);
2749 return 0;
2750 }
2751
2752 start_list = (Int4Ptr) MemNew (num_rows * sizeof(Int4));
2753 seqbuf_list = (Uint1Ptr) MemNew ((num_rows * sample_len + 1) * sizeof(Uint1));
2754 for (row = 0; row < num_rows; row++) {
2755 start_list[row] = 0;
2756 PopulateSample (seqbuf_list, start_list,
2757 sample_len, bsp_list,
2758 row);
2759 }
2760
2761 num_match = 0;
2762 for (aln_pos = 0; aln_pos < aln_len; aln_pos++) {
2763 row_match = TRUE;
2764 seq_ch = 0;
2765 for (row = 1; row <= num_rows; row++) {
2766 if (aln_pos < starts[row - 1] || aln_pos > stops[row - 1]) {
2767 continue;
2768 }
2769 seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, row);
2770 if (seq_pos < 0) {
2771 if (internal_gaps) {
2772 row_match = FALSE;
2773 }
2774 } else {
2775 row_ch = ReadFromAlignmentSample(seqbuf_list, start_list,
2776 sample_len, bsp_list, strand_list,
2777 row - 1, seq_pos);
2778 if (row_ch == 'N') {
2779 /* do nothing - Ns do not count against percent identity */
2780 } else if (seq_ch == 0) {
2781 seq_ch = row_ch;
2782 } else if (seq_ch != row_ch) {
2783 amb_match = AmbiguousMatch (seq_ch, row_ch);
2784 if (amb_match == 0) {
2785 row_match = FALSE;
2786 } else {
2787 seq_ch = amb_match;
2788 }
2789 }
2790 }
2791 if (!row_match) {
2792 break;
2793 }
2794 }
2795 if (row_match) {
2796 num_match++;
2797 match_25++;
2798 }
2799 col_count++;
2800 if (col_count % 25 == 0) {
2801 match_25 = 0;
2802 }
2803 }
2804
2805 for (row = 0; row < num_rows; row++) {
2806 sip_list[row] = SeqIdFree(sip_list[row]);
2807 BioseqUnlock(bsp_list[row]);
2808 }
2809 sip_list = MemFree (sip_list);
2810 bsp_list = MemFree (bsp_list);
2811 starts = MemFree (starts);
2812 stops = MemFree (stops);
2813 start_list = MemFree (start_list);
2814 seqbuf_list = MemFree (seqbuf_list);
2815
2816 if (col_count == 0) {
2817 pcnt = 0;
2818 } else {
2819 pcnt = (100 * num_match) / col_count;
2820 }
2821 return pcnt;
2822 }
2823
2824 //LCOV_EXCL_START
AlignmentPercentIdentity(SeqAlignPtr salp,Boolean internal_gaps)2825 extern Uint2 AlignmentPercentIdentity (SeqAlignPtr salp, Boolean internal_gaps)
2826 {
2827 return AlignmentPercentIdentityEx (salp, internal_gaps, FALSE);
2828 }
2829 //LCOV_EXCL_STOP
2830