1 /*  alignval.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *            National Center for Biotechnology Information (NCBI)
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government do not place any restriction on its use or reproduction.
13 *  We would, however, appreciate having the NCBI and the author cited in
14 *  any work or product based on this material
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name:  alignval.c
27 *
28 * Author:  Jian Ye, Colombe Chappey
29 *
30 * Version Creation Date:   6/3/99
31 *
32 * $Revision: 6.95 $
33 *
34 * File Description:  To validate sequence alignment.
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date     Name        Description of modification
39 * -------  ----------  -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44 
45 
46 #include <ncbi.h>
47 #include <seqmgr.h>
48 #include <objmgr.h>
49 #include <sequtil.h>
50 #include <sqnutils.h>
51 #include <satutil.h>
52 #include <salsap.h>
53 #include <txalign.h>
54 #include <salpacc.h>
55 #include <alignval.h>
56 #include <valid.h>
57 #include <alignmgr2.h>
58 
59 
60 Uint1  jybitnum[8]={0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
61 
62 typedef struct saval {
63   Boolean     message;
64   Boolean     msg_success;
65   Boolean     find_remote_bsp;
66   Boolean     find_acc_bsp;
67   Boolean     delete_salp;
68   Boolean     delete_bsp;
69   Boolean     retdel;
70   Boolean     do_hist_assembly;
71   ValNodePtr  ids;
72   Uint2       entityID;
73   Boolean     dirty;
74 } SaVal, PNTR SaValPtr;
75 
76 typedef struct JY_error_msg {
77         Uint1 level;/* corresponds to levels of ErrPostEx [none(0), info(1), war
78 n(2), error(3) and fatal(4)] */
79         CharPtr msg;
80 } JYErrorMsg, *JYErrorMsgPtr;
81 
82 /******************************************************************
83 ***
84 *** Error Messaging
85 ***    copies of the BLASt functions in blastpri.h
86 ***    JYConstructErrorMessage = BlastConstructErrorMessage
87 ***    JYErrorChainDestroy = BlastErrorChainDestroy
88 ***
89 ******************************************************************/
90 
91 static ValNodePtr errorp = NULL;
92 #define BUFFER_LENGTH 512
93 
94 static Uint2 AlignmentPercentIdentityEx (SeqAlignPtr salp, Boolean internal_gaps, Boolean internal_validation);
95 
96 //LCOV_EXCL_START
JYConstructErrorMessage(CharPtr function,CharPtr message,Uint1 level,ValNodePtr PNTR vnpp)97 static ValNodePtr JYConstructErrorMessage (CharPtr function, CharPtr message, Uint1 level, ValNodePtr PNTR vnpp)
98 {
99         Char buffer[BUFFER_LENGTH];
100         CharPtr ptr;
101         JYErrorMsgPtr error_msg;
102 
103         if (vnpp == NULL)
104                 return NULL;
105 
106         buffer[0] = NULLB;
107         ptr = buffer;
108         if (function != NULL)
109         {
110                 sprintf(buffer, "%s: ", function);
111                 ptr = buffer + StringLen(buffer);
112         }
113 
114         if (message != NULL)
115         {
116                 sprintf(ptr, "%s", message);
117         }
118 
119         error_msg = (JYErrorMsgPtr) MemNew(sizeof(JYErrorMsg));
120         error_msg->msg = StringSave(buffer);
121         error_msg->level = level;
122 
123         ValNodeAddPointer(vnpp, 0, error_msg);
124 
125         return *vnpp;
126 }
127 
JYErrorChainDestroy(ValNodePtr vnp)128 static ValNodePtr JYErrorChainDestroy (ValNodePtr vnp)
129 
130 {
131         ValNodePtr start = vnp;
132         JYErrorMsgPtr error_msg;
133 
134         while (vnp)
135         {
136            error_msg = (JYErrorMsgPtr) vnp->data.ptrvalue;
137            if (error_msg != NULL) {
138               MemFree(error_msg->msg);
139            }
140            vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
141            vnp = vnp->next;
142         }
143 
144         ValNodeFree(start);
145 
146         return NULL;
147 }
148 //LCOV_EXCL_STOP
149 
150 /******************************************************************
151 Output error message according to code defined in alignval.h.
152 id refers to seqid of the sequence that causes the error
153 and idcontext refers to other sequences in the same segment.
154 Intvalue is used to indicate 1) the segment where the sequence
155 with error is, or 2) the segtype in case of segtype error.
156 Please note that not all errors report all three
157 parameters(id, idcontext, Intvalue)
158 ******************************************************************/
159 
160 static Boolean useValErr = FALSE;
161 static Boolean useLockByID = FALSE;
162 static ValidStructPtr useVsp = NULL;
163 
AlignValBioseqLockById(SeqIdPtr sid)164 static BioseqPtr AlignValBioseqLockById (SeqIdPtr sid)
165 
166 {
167   Int4 old_sev;
168   BioseqPtr bsp = NULL;
169 
170   if (useLockByID) {
171     old_sev = ErrSetMessageLevel (SEV_WARNING);
172     bsp = BioseqLockById (sid);
173     ErrSetMessageLevel ((ErrSev) old_sev);
174   } else {
175     bsp = BioseqFindCore (sid);
176   }
177   return bsp;
178 }
179 
AlignValBioseqUnlock(BioseqPtr bsp)180 static Boolean AlignValBioseqUnlock (BioseqPtr bsp)
181 
182 {
183   if (useLockByID) {
184     return BioseqUnlock (bsp);
185   } else {
186     return TRUE;
187   }
188 }
189 
190 NLM_EXTERN void CDECL  ValidErr VPROTO((ValidStructPtr vsp, int severity, int code1, int code2, const char *fmt, ...));
191 
192 /*****************************************************************
193 *  get the approximate sequence coordinate for an alignment segment
194 *  sip == NULL -> get alignment coordinate
195 *****************************************************************/
valmsggetseqpos(SeqAlignPtr sap,Int4 segment,SeqIdPtr sip)196 static Int4 valmsggetseqpos(SeqAlignPtr sap, Int4 segment, SeqIdPtr sip)
197 {
198    Int4          c;
199    DenseDiagPtr  ddp;
200    DenseSegPtr   dsp;
201    Boolean       found;
202    Int4          i;
203    Int4          j;
204    Int4          pos;
205    PackSegPtr    psp;
206    Uint1Ptr      seqpresence;
207    SeqIdPtr      sip_tmp;
208    SeqLocPtr     slp;
209    StdSegPtr     ssp;
210 
211    if (sap == NULL || sap->segs == NULL) {
212       return -1;
213    } else if (segment == 0) {
214       return 0;
215    }
216    if (sap->segtype == SAS_DENSEG)
217    {
218       dsp = (DenseSegPtr)sap->segs;
219       if (sip == NULL)
220       {
221          pos = 0;
222          for (c=0; c<segment; c++)
223          {
224             pos += dsp->lens[c];
225          }
226          return pos;
227       }
228       sip_tmp = dsp->ids;
229       i = 0;
230       found = FALSE;
231       while (!found && sip_tmp != NULL)
232       {
233          if (SeqIdComp(sip, sip_tmp) == SIC_YES)
234             found = TRUE;
235          else
236          {
237             sip_tmp = sip_tmp->next;
238             i++;
239          }
240       }
241       if (!found || i>dsp->dim || segment > dsp->numseg)
242          return -1;
243       pos = 0;
244       for (c=0; c<segment; c++)
245       {
246          if ((j = dsp->starts[(dsp->dim*c)+i])>0)
247             pos=j;
248       }
249       return pos;
250    } else if (sap->segtype == SAS_DENDIAG)
251    {
252       ddp = (DenseDiagPtr)sap->segs;
253       pos = 0;
254       for (c=0; c<segment; c++)
255       {
256          pos += ddp->len;
257          ddp = ddp->next;
258          if (ddp == NULL)
259             return -1;
260       }
261       if (sip == NULL)
262          return pos;
263       sip_tmp = ddp->id;
264       i = 0;
265       found = FALSE;
266       while (!found && sip_tmp != NULL)
267       {
268          if (SeqIdComp(sip, sip_tmp) == SIC_YES)
269             found = TRUE;
270          else
271          {
272             sip_tmp = sip_tmp->next;
273             i++;
274          }
275       }
276       if (!found || i>ddp->dim)
277          return -1;
278       return (ddp->starts[i]);
279    } else if (sap->segtype == SAS_STD)
280    {
281       ssp = (StdSegPtr)(sap->segs);
282       pos = 0;
283       for (c=0; c<segment-1; c++)
284       {
285          pos += SeqLocLen(ssp->loc);
286          ssp = ssp->next;
287          if (ssp == NULL)
288             return -1;
289       }
290       if (sip == NULL)
291          return pos;
292       slp = ssp->loc;
293       found = FALSE;
294       while (!found && slp!=NULL)
295       {
296          sip_tmp = SeqLocId(slp);
297          if (SeqIdComp(sip, sip_tmp) == SIC_YES)
298             found = TRUE;
299          else
300             slp = slp->next;
301       }
302       if (!found)
303          return -1;
304       return (SeqLocStart(slp));
305    } else if (sap->segtype == SAS_PACKED)
306    {
307       psp = (PackSegPtr)(sap->segs);
308       if (segment > psp->numseg)
309          return -1;
310       if (sip == NULL)
311       {
312          pos = 0;
313          for (c=0; c<segment; c++)
314          {
315             pos += psp->lens[c];
316          }
317          return pos;
318       }
319       sip_tmp = psp->ids;
320       i = 0;
321       found = FALSE;
322       while (!found && sip_tmp != NULL)
323       {
324          if (SeqIdComp(sip, sip_tmp) == SIC_YES)
325             found = TRUE;
326          else
327          {
328             sip_tmp = sip_tmp->next;
329             i++;
330          }
331       }
332       if (!found || i>psp->dim)
333          return -1;
334       pos = 0;
335       seqpresence = NULL;
336       BSSeek(psp->present, 0, SEEK_SET);
337       seqpresence=MemNew(BSLen(psp->present));
338       if(!seqpresence)
339          return -1;
340       BSRead(psp->present, seqpresence, BSLen(psp->present));
341       for (c=0; c<segment; c++)
342       {
343          if (seqpresence[(c*psp->numseg+i)/8]&jybitnum[(c*psp->numseg+i)%8])
344             pos+=psp->lens[c];
345       }
346       return pos;
347    } else
348       return -1;
349 }
350 
351 
BioseqForAlignmentWork(SeqAlignPtr salp)352 static BioseqPtr BioseqForAlignmentWork (SeqAlignPtr salp)
353 {
354   Int4 row, num_rows;
355   BioseqPtr bsp = NULL;
356   SeqIdPtr  sip;
357   DenseDiagPtr ddp;
358 
359   /* NOTE - can't index DenseDiag chain during validation because we're examining the individual DenseDiags,
360    * and indexing converts it to DenseSegs.
361    */
362   if (salp->segtype == SAS_DENDIAG && salp->segs != NULL) {
363     ddp = (DenseDiagPtr) salp->segs;
364     while (bsp == NULL && ddp != NULL) {
365       for (sip = ddp->id; bsp == NULL && sip != NULL; sip = sip->next) {
366         bsp = BioseqFind (sip);
367         sip = sip->next;
368       }
369       ddp = ddp->next;
370     }
371   } else {
372     AlnMgr2IndexSingleChildSeqAlign(salp);
373     num_rows = AlnMgr2GetNumRows(salp);
374     for (row = 1; row <= num_rows && bsp == NULL; row++) {
375       sip = AlnMgr2GetNthSeqIdPtr(salp, row);
376       bsp = BioseqFind(sip);
377     }
378   }
379   return bsp;
380 }
381 
BioseqForAlignment(SeqAlignPtr salp)382 static BioseqPtr BioseqForAlignment (SeqAlignPtr salp)
383 {
384   BioseqPtr bsp;
385   SeqEntryPtr oldscope;
386 
387   /* first look locally to scope */
388   bsp = BioseqForAlignmentWork (salp);
389   if (bsp != NULL) return bsp;
390 
391   /* otherwise temporarily clear scope */
392   oldscope = SeqEntrySetScope (NULL);
393   bsp = BioseqForAlignmentWork (salp);
394   SeqEntrySetScope (oldscope);
395 
396   return bsp;
397 }
398 
399 
ValMessage(SeqAlignPtr salp,Int1 MessageCode,ErrSev errlevel,SeqIdPtr id,SeqIdPtr idcontext,Int4 Intvalue)400 static void ValMessage (SeqAlignPtr salp, Int1 MessageCode, ErrSev errlevel, SeqIdPtr id, SeqIdPtr idcontext , Int4 Intvalue)
401 {
402 
403   Char     buf[256],
404            buf3[64],
405            string1[64],
406            string2[552];
407   GatherContextPtr gcp;
408   Int4     pos;
409 
410   string1[0] = '\0';
411   string2[0] = '\0';
412   SeqIdWrite(id, buf, PRINTID_FASTA_LONG, sizeof(buf)-1);
413   switch(MessageCode)
414   {
415     case Err_SeqId:
416       sprintf(string1, "SeqId");
417       sprintf(string2, "The sequence corresponding to SeqId %s could not be found.", buf);
418       break;
419 
420     case Err_Strand_Rev:
421       pos = valmsggetseqpos(salp, Intvalue, id);
422       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
423       sprintf(string1, "Strand");
424       sprintf(string2, "The strand labels for SeqId %s are inconsistent across the alignment; the first inconsistent region is the %ld(th) region, near sequence position %ld, context %s", buf, (long) Intvalue, (long) pos, buf3);
425       break;
426 
427     case Err_Denseg_Len_Start:
428       pos = valmsggetseqpos(salp, Intvalue, id);
429       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
430       sprintf(string1, "Start/Length");
431       sprintf(string2, "There is a problem with sequence %s, in segment %ld (near sequence position %ld), context %s: the segment is too long or short or the next segment has an incorrect start position", buf, (long) Intvalue, (long) pos, buf3);
432       break;
433 
434     case  Err_Start_Less_Than_Zero:
435         //LCOV_EXCL_START
436         //unreachable for ASN.1 valid for C++ Toolkit
437       pos = valmsggetseqpos(salp, Intvalue, id);
438       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
439       sprintf(string1, "Start");
440       sprintf(string2, "Start point is less than zero in segment %ld (near sequence position %ld) for sequence ID: %s in the context of %s", (long) Intvalue, (long) pos, buf, buf3);
441       break;
442       //LCOV_EXCL_STOP
443 
444     case Err_Start_More_Than_Biolen:
445       pos = valmsggetseqpos(salp, Intvalue, id);
446       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
447       sprintf(string1, "Start");
448       sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence.  Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
449       break;
450 
451     case Err_End_Less_Than_Zero:
452         //LCOV_EXCL_START
453         //unreachable with valid ASN.1
454       pos = valmsggetseqpos(salp, Intvalue, id);
455       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
456       sprintf(string1, "Length");
457       sprintf(string2, "End point is less than zero in segment %ld (near position %d) for sequence ID: %s in the context of %s.  This could be a formatting error", (long) Intvalue, (int) pos,buf, buf3);
458       break;
459       //LCOV_EXCL_STOP
460 
461     case Err_End_More_Than_Biolen:
462       pos = valmsggetseqpos(salp, Intvalue, id);
463       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
464       sprintf(string1, "Length");
465       sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence.  Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
466       break;
467 
468     case Err_Len_Less_Than_Zero:
469         //LCOV_EXCL_START
470         //unreachable with valid ASN.1
471       pos = valmsggetseqpos(salp, Intvalue, id);
472       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
473       sprintf(string1, "Length");
474       sprintf(string2, "Segment length is less than zero in segment %ld (near sequence position %ld) for sequence ID: %s in the context of %s.  Look for extra characters in this segment or flanking segments", (long) Intvalue, (long) pos, buf, buf3);
475       break;
476       //LCOV_EXCL_STOP
477 
478     case Err_Len_More_Than_Biolen:
479       pos = valmsggetseqpos(salp, Intvalue, id);
480       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
481       sprintf(string1, "Length");
482       sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence.  Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
483       break;
484 
485     case Err_Sum_Len_Start:
486       pos = valmsggetseqpos(salp, Intvalue, id);
487       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
488       sprintf(string1, "Start");
489       sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence.  Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
490       break;
491 
492     case Err_SeqAlign_DimSeqId_Not_Match:
493       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
494       sprintf(string1, "SeqId");
495       sprintf(string2, "The Seqalign has more or fewer ids than the number of rows in the alignment (context %s).  Look for possible formatting errors in the ids.", buf3);
496       break;
497 
498     case Err_Segs_DimSeqId_Not_Match:
499       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
500       sprintf(string1, "SeqId");
501       sprintf(string2, "In segment %ld, there are more or fewer rows than there are seqids (context %s).  Look for possible formatting errors in the ids.", (long) Intvalue, buf3);
502       break;
503 
504     case Err_Fastalike:
505       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
506       sprintf(string1, "Fasta");
507       sprintf(string2, "This may be a fasta-like alignment for SeqId: %s in the context of %s", buf, buf3);
508       break;
509 
510     case Err_Null_Segs:
511         //LCOV_EXCL_START
512         // unreachable for valid ASN.1
513       sprintf(string1, "Segs");
514       sprintf(string2, "This alignment is missing all segments.  This is a non-correctable error -- look for serious formatting problems.");
515       break;
516       //LCOV_EXCL_STOP
517 
518     case Err_Segment_Gap:
519       pos = valmsggetseqpos(salp, Intvalue, id);
520       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
521       sprintf(string1, "Segs");
522       sprintf(string2, "Segment %ld (near alignment position %ld) in the context of %s contains only gaps.  Each segment must contain at least one actual sequence -- look for columns with all gaps and delete them.", (long) Intvalue + 1, (long) pos, buf3);
523       break;
524 
525     case Err_Segs_Dim_One:
526       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
527       sprintf(string1, "Segs");
528       sprintf(string2, "Segment %ld apparently has only one sequence.  Each portion of the alignment must have at least two sequences.  context %s", (long) Intvalue, buf3);
529       break;
530 
531     case Err_SeqAlign_Dim_One:
532       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
533       sprintf(string1, "Dim");
534       sprintf(string2, "This seqalign apparently has only one sequence.  Each alignment must have at least two sequences.  context %s", buf3);
535       break;
536 
537     case Err_Segtype :
538       /* ignore new segtype warnings in genomic gpipe sequence */
539       if (useValErr && useVsp != NULL && useVsp->is_gpipe_in_sep && useVsp->bsp_genomic_in_sep) return;
540       sprintf(string1, "Segs");
541       sprintf(string2, "This alignment has an undefined or unsupported Seqalign segtype %ld", (long) Intvalue);
542       break;
543 
544     case Err_Pcnt_ID :
545       sprintf(string1, "PercentIdentity");
546       sprintf(string2, "This alignment has a percent identity of %d%%", Intvalue);
547       break;
548 
549     case Err_Short_Aln:
550         //LCOV_EXCL_START
551         //only call to only function to generate this error is commented out
552       sprintf(string1, "ShortAln");
553       sprintf(string2, "This alignment is shorter than at least one non-farpointer sequence.");
554       break;
555       //LCOV_EXCL_STOP
556 
557     case Err_Unexpected_Alignment_Type:
558       sprintf(string1, "UnexpectedAlignmentType");
559       sprintf (string2, "This is not a DenseSeg alignment.");
560       break;
561 
562     default:
563       break;
564   }
565   if (useValErr) {
566     if (salp != NULL && useVsp != NULL) {
567       gcp = useVsp->gcp;
568       if (gcp != NULL) {
569           gcp->entityID = salp->idx.entityID;
570           gcp->itemID = salp->idx.itemID;
571           gcp->thistype = salp->idx.itemtype;
572 
573         useVsp->bsp = BioseqForAlignment(salp);
574         ValidErr (useVsp, errlevel, 6, MessageCode, "%s: %s", string1, string2);
575       }
576     }
577     return;
578   }
579 //LCOV_EXCL_START
580   if (StringLen(string1) > 0)
581      errorp = JYConstructErrorMessage (string1, string2, errlevel, &errorp);
582 //LCOV_EXCL_STOP
583 }
584 
585 
586 /******************************************************************
587 return the number of seqid
588 ******************************************************************/
CountSeqIdInSip(SeqIdPtr sip)589 static Int2 CountSeqIdInSip (SeqIdPtr sip)
590 {
591     Int2 numids=0;
592 
593      while(sip)
594        {
595      numids++;
596      sip=sip->next;
597        }
598      return numids;
599 }
600 
601 /*********************************************************/
delete_bioseqs(ValNodePtr ids,Uint2 entityID)602 static void delete_bioseqs (ValNodePtr ids, Uint2 entityID)
603 {
604   SeqEntryPtr  sep_top;
605   SeqEntryPtr  sep_del;
606   ValNodePtr   vnp;
607   SeqIdPtr     sip;
608   SeqLocPtr    slp;
609   BioseqPtr    bsp;
610   ObjMgrDataPtr  omdptop;
611   ObjMgrData     omdata;
612   Uint2          parenttype;
613   Pointer        parentptr;
614 
615   if (ids == NULL)
616      return;
617   sep_top = GetTopSeqEntryForEntityID (entityID);
618   SaveSeqEntryObjMgrData (sep_top, &omdptop, &omdata);
619   GetSeqEntryParent (sep_top, &parentptr, &parenttype);
620 
621   vnp=ids;
622   while (vnp!=NULL)
623   {
624      sip = (SeqIdPtr) vnp->data.ptrvalue;
625      if (sip!=NULL) {
626         slp = (SeqLocPtr)ValNodeNew (NULL);
627         slp->choice = SEQLOC_WHOLE;
628         slp->data.ptrvalue = sip;
629         bsp = GetBioseqGivenSeqLoc (slp, entityID);
630         if (bsp!=NULL) {
631            sep_del=GetBestTopParentForData (entityID, bsp);
632            RemoveSeqEntryFromSeqEntry (sep_top, sep_del, FALSE);
633         }
634         slp->data.ptrvalue = NULL;
635         SeqLocFree (slp);
636      }
637      vnp=vnp->next;
638   }
639   SeqMgrLinkSeqEntry (sep_top, parenttype, parentptr);
640   RestoreSeqEntryObjMgrData (sep_top, omdptop, &omdata);
641   RenormalizeNucProtSets (sep_top, TRUE);
642 
643   for (vnp=ids; vnp!=NULL; vnp=vnp->next) {
644      SeqIdFree ((SeqIdPtr) vnp->data.ptrvalue);
645      vnp->data.ptrvalue = NULL;
646   }
647   ValNodeFree (vnp);
648   return;
649 }
650 
651 
652 /******************************************************************
653 validate a SeqId
654 ******************************************************************/
ValidateSeqId(SeqIdPtr sip,SeqAlignPtr salp)655 static void ValidateSeqId (SeqIdPtr sip, SeqAlignPtr salp)
656 {
657   SeqIdPtr  siptemp=NULL, sipnext;
658   BioseqPtr bsp=NULL;
659 
660   for(siptemp=sip; siptemp!=NULL; siptemp=siptemp->next)
661   {
662     /*
663     bsp = AlignValBioseqLockById(siptemp);
664     if(!bsp)
665         ValMessage (salp, Err_SeqId, SEV_ERROR, siptemp, NULL, 0);
666     else
667         AlignValBioseqUnlockById(siptemp);
668     */
669     sipnext = siptemp->next;
670     siptemp->next = NULL;
671     bsp = BioseqFindCore (siptemp);
672     if (bsp == NULL && siptemp->choice == SEQID_LOCAL) {
673         ValMessage (salp, Err_SeqId, SEV_ERROR, siptemp, NULL, 0);
674     }
675     siptemp->next = sipnext;
676   }
677   return;
678 }
679 
680 /******************************************************************
681 return seqid for each seg.
682 Note that a newly created seqid chain is returned for stdseg
683 and you need to free the memory after you use it in this case
684 ******************************************************************/
SeqIdInAlignSegs(Pointer segs,Uint1 segtype,SeqAlignPtr salp)685 static SeqIdPtr SeqIdInAlignSegs(Pointer segs, Uint1 segtype, SeqAlignPtr salp)
686 {
687 
688   SeqIdPtr sip=NULL;
689   StdSegPtr ssp;
690   DenseDiagPtr ddp;
691   DenseSegPtr dsp;
692   PackSegPtr psp;
693   SeqLocPtr slp=NULL, slptemp;
694 
695   if(!segs)
696   {
697       //LCOV_EXCL_START
698       //unreachable for valid ASN.1
699       ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
700       return NULL;
701       //LCOV_EXCL_STOP
702   }
703   if(segtype==1)
704   { /* DenseDiag */
705 
706       ddp=(DenseDiagPtr)segs;
707       sip=ddp->id;
708   }
709   else if (segtype==2)
710   { /* DenseSeg */
711 
712       dsp = (DenseSegPtr) segs;
713       sip=dsp->ids;
714   }
715   else if (segtype==3)
716   { /* StdSeg */
717 
718       ssp = (StdSegPtr)segs;
719       slp = ssp->loc;
720       /*make a new linked list of SeqId*/
721       for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
722         AddSeqId(&sip, SeqLocId(slptemp));
723 
724   }
725   else if(segtype==4)
726   { /* Packed Seg. Optimal for editing alignments */
727 
728       psp = (PackSegPtr)segs;
729       if (psp!=NULL)
730         sip = psp->ids;
731   }
732   return sip;
733 }
734 
735 
736 /******************************************************************
737 validate SeqId in sequence alignment
738 ******************************************************************/
ValidateSeqIdInSeqAlign(SeqAlignPtr salp)739 static void  ValidateSeqIdInSeqAlign (SeqAlignPtr salp)
740 {
741   SeqIdPtr sip=NULL;
742   Pointer segptr=NULL;
743   DenseDiagPtr ddp=NULL, ddptemp;
744   StdSegPtr    ssp=NULL, ssptemp;
745 
746 
747   if(salp)
748     {
749       segptr=salp->segs;
750       if (!segptr) {
751           //LCOV_EXCL_START
752           // unreachable for valid ASN.1
753           ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
754           //LCOV_EXCL_STOP
755       }
756       else
757     {
758 
759       /*densediag */
760       if(salp->segtype==1)
761         {
762           /*cast to appropriate pointer*/
763           ddp=(DenseDiagPtr)segptr;
764           for(ddptemp=ddp; ddptemp!=NULL; ddptemp=ddptemp->next)
765         {
766 
767           sip=SeqIdInAlignSegs((Pointer)ddptemp, salp->segtype, salp);
768           ValidateSeqId(sip, salp);
769         }
770         }
771 
772       /*Stdseg*/
773       else if(salp->segtype==3)
774         {
775           /*cast to appropriate pointer*/
776           ssp=(StdSegPtr)segptr;
777           for(ssptemp=ssp; ssptemp!=NULL; ssptemp=ssptemp->next)
778         {
779 
780           sip=SeqIdInAlignSegs((Pointer)ssptemp, salp->segtype, salp);
781           ValidateSeqId(sip, salp);
782           /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
783           SeqIdSetFree(sip);
784         }
785         }
786 
787       /*Denseseg, Packseg*/
788       else if(salp->segtype==2||salp->segtype==4)
789         {
790 
791           sip=SeqIdInAlignSegs(segptr, salp->segtype, salp);
792           ValidateSeqId(sip, salp);
793         }
794     }
795     }
796 }
797 
798 /******************************************************************
799 return true if  two sip are the same, false otherwise.
800 Also return false if there is error in sip
801 ******************************************************************/
SeqIdCmp(SeqIdPtr sip1,SeqIdPtr sip2)802 static Boolean SeqIdCmp (SeqIdPtr sip1, SeqIdPtr sip2)
803 {
804   Char buf1[256], buf2[256];
805 
806   if(!sip1||!sip2)
807     return FALSE;
808 
809   SeqIdWrite(sip1, buf1, PRINTID_FASTA_LONG, 255);
810   SeqIdWrite(sip2, buf2, PRINTID_FASTA_LONG, 255);
811   return(!StringCmp(buf1, buf2));
812 
813 }
814 
815 
816 /******************************************************************
817 return the strand for a seqloc with seqid=sip in a stdseg.
818 Note, it returns 255 if null sip or ssp
819 ******************************************************************/
SeqLocStrandForSipInStdSeg(SeqIdPtr sip,StdSegPtr ssp,SeqAlignPtr salp)820 static Uint1 SeqLocStrandForSipInStdSeg (SeqIdPtr sip, StdSegPtr ssp, SeqAlignPtr salp)
821 {
822   SeqLocPtr slp, slptemp;
823   Uint1     strand=0;
824 
825   if(!sip||!ssp)
826     return (255);
827 
828   slp=ssp->loc;
829   for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
830   {
831       if(SeqIdCmp(sip, SeqLocId(slptemp)))
832     {
833       strand=SeqLocStrand(slptemp);
834       break;
835     }
836   }
837   return strand;
838 }
839 
840 
841 //LCOV_EXCL_START
842 //code for this error does not actually work as described in comments
843 /******************************************************************
844 check if the  strand is consistent in Stdseg
845 ******************************************************************/
ValidateStrandInStdSeg(StdSegPtr ssp,SeqAlignPtr salp)846 static void ValidateStrandInStdSeg(StdSegPtr ssp, SeqAlignPtr salp)
847 {
848   SeqIdPtr     sip=NULL,  sip_inseg=NULL;
849   Uint1           strand1=0, strand2=0;
850   StdSegPtr    ssptemp, ssptemp2, ssptemp3;
851   SeqLocPtr    slp, slptemp;
852   ValNodePtr   FinishedSip=NULL, temp;
853   Boolean      CheckedStatus;
854   Int4         start_numseg=0, end_numseg=0;
855 
856   if (!ssp) {
857       // unreachable for valid ASN.1
858       ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
859   }
860   else
861     for(ssptemp=ssp; ssptemp!=NULL; ssptemp=ssptemp->next)
862       {
863     sip_inseg=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
864     start_numseg++;
865     slp=ssptemp->loc;
866     for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
867       {
868 
869         CheckedStatus=FALSE;
870         sip=SeqLocId(slptemp);
871         if(sip)
872           {
873         /*if a seqloc represented by a sip has been checked, set the checkedstatus flag to true so it will not be checked again*/
874         for(temp=FinishedSip; temp!=NULL; temp=temp->next)
875           {
876             if(SeqIdCmp(sip, temp->data.ptrvalue))
877               {
878             CheckedStatus=TRUE;
879             break;
880               }
881           }
882         /*seqloc not checked yet*/
883         if(!CheckedStatus)
884           {
885 
886             /*keep a record of  checked sip*/
887             ValNodeAddPointer(&FinishedSip, 0, sip);
888             end_numseg=start_numseg;
889             /*go through all segs to get at least two strand, if any, for this seqloc*/
890             for(ssptemp2=ssptemp; ssptemp2!=NULL; ssptemp2=ssptemp2->next, end_numseg++)
891               {
892             /*get the first defined strand */
893             strand1=SeqLocStrandForSipInStdSeg(sip, ssptemp2, salp);
894 
895             if(strand1!=0&&strand1!=255)
896               {
897                 ssptemp2=ssptemp2->next;
898                 break;
899               }
900 
901               }
902 
903             if(strand1!=0&&strand1!=255)
904               /*continue to get next strand */
905               for(ssptemp3=ssptemp2; ssptemp3!=NULL; ssptemp3=ssptemp3->next, end_numseg++)
906             {
907               strand2=SeqLocStrandForSipInStdSeg(sip, ssptemp3, salp);
908               if(strand2==0||strand2==255)
909                 continue;
910 
911               if(strand2!=0&&strand2!=255)
912                 /*strand should be same for a given seq*/
913                 if(strand1!=strand2)
914 
915                   ValMessage (salp, Err_Strand_Rev, SEV_ERROR, sip, sip_inseg, end_numseg+1);
916             }
917             }
918           }
919       }
920     SeqIdSetFree(sip_inseg);
921 
922       }
923 
924   ValNodeFree(FinishedSip);
925 }
926 //LCOV_EXCL_STOP
927 
928 
929 /******************************************************************
930 check if the  strand is consistent in Denseseg
931 ******************************************************************/
ValidateStrandInDenseSeg(Pointer segs,Uint1 segtype,SeqAlignPtr salp)932 static void ValidateStrandInDenseSeg(Pointer segs, Uint1 segtype, SeqAlignPtr salp)
933 {
934   DenseSegPtr dsp=NULL;
935   Int4         numseg, aligndim, dimnumseg, i, j, m;
936   SeqIdPtr     sip=NULL, siptemp;
937   Uint1           strand1=0, strand2=0;
938   Uint1Ptr strandptr=NULL;
939 
940   if(!segs)
941   {
942       //LCOV_EXCL_START
943       // unreachable for valid ASN.1
944     ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
945     //LCOV_EXCL_STOP
946   }
947   else if(segtype==2)
948   {
949     dsp=(DenseSegPtr)segs;
950     strandptr=dsp->strands;
951     sip=dsp->ids;
952     numseg=dsp->numseg;
953     aligndim=dsp->dim;
954 
955     dimnumseg=numseg*aligndim;
956     if(strandptr)
957     {
958       /*go through id for each alignment sequence*/
959       for(j=0; j<aligndim; j++)
960       {
961         /* first  strand value for each sequence*/
962         strand1=strandptr[j];
963         /* go through all strand values for each sequence*/
964         for(i=j+aligndim; i<dimnumseg; i=i+aligndim)
965         {
966           strand2=strandptr[i];
967 
968           if(strand1==0||strand1==255)
969           {
970             strand1=strand2;
971             continue;
972           }
973 
974           /*skip undefined strand*/
975           if(strand2!=0&&strand2!=255)
976           {
977             /*strand should be same for a given seq*/
978             if(strand1!=strand2)
979             {
980               /*find current seqid*/
981 
982               siptemp=sip;
983               for(m=0; m<j&&siptemp!=NULL; m++)
984               {
985                 siptemp=siptemp->next;
986               }
987               ValMessage (salp, Err_Strand_Rev, SEV_ERROR, siptemp, sip, i/aligndim+1);
988             }
989           }
990         }
991       }
992     }
993   }
994 }
995 
996 
997 
998 
999 /******************************************************************
1000 check if the  strand is consistent in SeqAlignment of global
1001 or partial type
1002 ******************************************************************/
ValidateStrandinSeqAlign(SeqAlignPtr salp)1003 static void ValidateStrandinSeqAlign(SeqAlignPtr salp)
1004 {
1005   StdSegPtr ssp=NULL ;
1006 
1007   if(salp)
1008     {
1009 
1010       /*Strands needs to be validated  in case of global or partial alignment*/
1011 
1012       /*denseseg*/
1013       if(salp->segtype==2)
1014 
1015     ValidateStrandInDenseSeg(salp->segs, salp->segtype, salp);
1016 
1017       /*stdseg*/
1018       else if(salp->segtype==3)
1019     {
1020       ssp=(StdSegPtr)salp->segs;
1021       ValidateStrandInStdSeg(ssp, salp);
1022     }
1023    }
1024 }
1025 
1026 
1027 
1028 /******************************************************************
1029 Make sure that, in Densediag alignment, segment length and
1030 start point is not less than zero, and  segment length is not greater
1031 than Bioseq length
1032 ******************************************************************/
ValidateSeqlengthInDenseDiag(DenseDiagPtr ddp,SeqAlignPtr salp)1033 static void ValidateSeqlengthInDenseDiag (DenseDiagPtr ddp, SeqAlignPtr salp)
1034 {
1035   Int4Ptr      stptr=NULL;
1036   DenseDiagPtr ddptemp;
1037   Int2         numseg, i;
1038   SeqIdPtr     sip=NULL, siptemp;
1039   Int4         bslen;
1040   BioseqPtr    bsp=NULL;
1041 
1042 
1043   if (!ddp){
1044       //LCOV_EXCL_START
1045       // unreachable for valid ASN.1
1046       ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1047       //LCOV_EXCL_STOP
1048   }
1049   else
1050     {
1051       for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1052     {
1053       sip=ddp->id;
1054       stptr=ddptemp->starts;
1055 
1056       if(stptr)
1057         {
1058           for(i=0, siptemp=sip; i<ddptemp->dim; i++, siptemp=siptemp->next)
1059         {
1060           bsp=AlignValBioseqLockById(siptemp);
1061           if(bsp)
1062             {
1063               bslen=bsp->length;
1064               AlignValBioseqUnlock (bsp);
1065               /*verify start*/
1066               if (stptr[i] < 0) {
1067                   //LCOV_EXCL_START
1068                   //unreachable with valid ASN.1
1069                   ValMessage(salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip, numseg);
1070                   //LCOV_EXCL_STOP
1071               }
1072               if(stptr[i]>=bslen)
1073             ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg);
1074 
1075               /*verify length*/
1076 
1077               if (ddptemp->len<0) {
1078                   //LCOV_EXCL_START
1079                   //unreachable with valid ASN.1
1080                   ValMessage(salp, Err_Len_Less_Than_Zero, SEV_ERROR, siptemp, sip, numseg);
1081                   //LCOV_EXCL_STOP
1082               }
1083 
1084               if(ddptemp->len+stptr[i]>bslen)
1085             ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , numseg);
1086             }
1087         }
1088         }
1089     }
1090     }
1091 }
1092 
1093 
1094 /******************************************************************
1095 return a new copy of len array in reversed order
1096 ******************************************************************/
GetReverseLength(Int2 numseg,Int4Ptr lenptr)1097 static Int4Ptr GetReverseLength (Int2 numseg, Int4Ptr lenptr)
1098 {
1099   Int4Ptr lenptrtemp=NULL;
1100   Int2 p;
1101 
1102   if(!lenptr)
1103     return NULL;
1104 
1105   lenptrtemp=(Int4Ptr)MemNew(numseg*sizeof(Int4Ptr));
1106   if(!lenptrtemp)
1107   {
1108       ErrPostEx (SEV_ERROR, 0,0,  "Warning:insufficient memory");
1109       return NULL;
1110   }
1111   for(p=0; p<numseg; p++)
1112     lenptrtemp[p]=lenptr[numseg-1-p];
1113   return lenptrtemp;
1114 
1115 }
1116 
1117 /******************************************************************
1118 return a new copy of start array in reversed "numseg" order .
1119 Note that the relative position of starts in each numseg has not changed.
1120 Example:  original length={0, 0, 10, -1, 30, 10}, numseg=3,
1121 lens={10, 20, 40}, the reversed length={30, 10, 10, -1, 0, 0}
1122 ******************************************************************/
GetReverseStart(Int2 numseg,Int2 dim,Int4Ptr stptr)1123 static Int4Ptr GetReverseStart(Int2 numseg, Int2 dim, Int4Ptr stptr)
1124 {
1125   Int4Ptr stptrtemp=NULL;
1126   Int2 p, q;
1127 
1128   if(!stptr)
1129     return NULL;
1130 
1131   stptrtemp=(Int4Ptr)MemNew(numseg*dim*sizeof(Int4Ptr));
1132   if(!stptrtemp)
1133   {
1134       ErrPostEx (SEV_ERROR, 0,0,  "Warning:insufficient memory");
1135       return NULL;
1136   }
1137   for(p=0; p<numseg; p++)
1138     for(q=0; q<dim; q++)
1139       stptrtemp[q+p*dim]=stptr[q+(numseg-1-p)*dim];
1140 
1141   return stptrtemp;
1142 }
1143 
1144 
1145 
1146 /******************************************************************
1147 Make sure that, in Denseseg alignment, segment length and
1148 start point agrees each other and the sum of segment length
1149 is not greater than Bioseq length
1150 ******************************************************************/
ValidateSeqlengthInDenseSeg(DenseSegPtr dsp,SeqAlignPtr salp)1151 static void ValidateSeqlengthInDenseSeg (DenseSegPtr dsp, SeqAlignPtr salp)
1152 {
1153 
1154   Int4Ptr      lenptr=NULL, stptr=NULL, lenptrtemp=NULL, stptrtemp=NULL, lenptrtemp2=NULL, stptrtemp2=NULL;
1155 
1156   Int2         numseg, aligndim, i, j;
1157   SeqIdPtr     sip=NULL, siptemp;
1158   Int4         bslen = 0;
1159   BioseqPtr    bsp=NULL;
1160 
1161   if (!dsp) {
1162       //LCOV_EXCL_START
1163       // unreachable for valid ASN.1
1164       ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1165       //LCOV_EXCL_STOP
1166   }
1167  else
1168     {
1169       numseg=dsp->numseg;
1170       aligndim=dsp->dim;
1171 
1172       stptr=dsp->starts;
1173       lenptr=dsp->lens;
1174       sip=dsp->ids;
1175 
1176       if(stptr==NULL||lenptr==NULL)
1177     return;
1178 
1179 
1180       /*go through each sequence*/
1181       for(j=0, siptemp=sip; j<aligndim&&siptemp; j++, siptemp=siptemp->next)
1182     {
1183 
1184       lenptrtemp=lenptr;
1185       stptrtemp=stptr;
1186       /*if on minus strand, use reversed length and start array*/
1187       if(dsp->strands)
1188         {
1189           if(dsp->strands[j]==Seq_strand_minus)
1190         {
1191           if(!lenptrtemp2&&!stptrtemp2)
1192             {
1193               lenptrtemp2= GetReverseLength (numseg, lenptr);
1194               if (lenptrtemp2==NULL)
1195                  return;
1196               stptrtemp2= GetReverseStart (numseg, aligndim, stptr);
1197               if (stptrtemp2==NULL)
1198                  return;
1199             }
1200           lenptrtemp=lenptrtemp2;
1201           stptrtemp=stptrtemp2;
1202         }
1203         }
1204 
1205       bsp=AlignValBioseqLockById(siptemp);
1206       if(bsp!=NULL)
1207         {
1208           bslen=bsp->length;
1209           AlignValBioseqUnlock (bsp);
1210         }
1211 
1212       /*go through each segment for a given sequence*/
1213       for(i=0; i<numseg; i++)
1214         {
1215 
1216           /*no need to verify if segment is not present*/
1217           if(stptrtemp[j+i*aligndim]!=-1)
1218         {
1219 
1220           /*length plus start should be equal to next start*/
1221           /*check a start if it's not the last one and the next start is not -1*/
1222           if(i!=numseg-1&&stptrtemp[j+(i+1)*aligndim]!=-1)
1223             {
1224 
1225               if(stptrtemp[j+i*aligndim]+lenptrtemp[i]!=stptrtemp[j+(i+1)*aligndim])
1226             {
1227               if (dsp->strands)
1228                 {
1229                   if(dsp->strands[j]==2)
1230                 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , numseg-i);
1231                   else
1232                 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1233                 }
1234                           else
1235                 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1236             }
1237             }
1238           /*check a start if it's not the last one and the next start is -1*/
1239           else if (i!=numseg-1&&stptrtemp[j+(i+1)*aligndim]==-1)
1240             {
1241               Int4 k=i+1;
1242               /*find the next start that is not last and not -1*/
1243               while(k<numseg&&stptrtemp[j+k*aligndim]==-1)
1244             k++;
1245 
1246               /*length plus start should be equal to the closest next start that is not -1*/
1247 
1248               if(k<numseg&&stptrtemp[j+i*aligndim]+lenptrtemp[i]!=stptrtemp[j+k*aligndim])
1249             {
1250               if (dsp->strands)
1251                 {
1252                   if(dsp->strands[j]==2)
1253                 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , numseg-i);
1254                   else
1255                  ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1256                 }
1257                           else
1258                 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1259             }
1260             }
1261 
1262 
1263          /*make sure the start plus segment does not exceed total bioseq length*/
1264           if(bsp!=NULL)
1265             {
1266 
1267               if(stptrtemp[j+i*aligndim]+lenptrtemp[i]>bslen)
1268             if (dsp->strands)
1269               {
1270                 if(dsp->strands[j]==2)
1271                   ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , numseg-1);
1272                 else
1273                   ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1274               }
1275             else
1276               ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1277             }
1278 
1279         }
1280 
1281         }
1282     }
1283     }
1284 
1285 
1286  MemFree(lenptrtemp2);
1287  MemFree(stptrtemp2);
1288 
1289 
1290 }
1291 
1292 /******************************************************************
1293 Make sure that, in Seqloc of a Stdseg alignment,
1294 end point, start point and length are not less than zero,
1295 and are not greater than Bioseq length
1296 ******************************************************************/
ValidateSeqlengthInStdSeg(StdSegPtr ssp,SeqAlignPtr salp)1297 static void ValidateSeqlengthInStdSeg (StdSegPtr ssp, SeqAlignPtr salp)
1298 {
1299   StdSegPtr    ssptemp;
1300   Int2         numseg;
1301   SeqIdPtr     sip=NULL, siptemp;
1302   Int4         start, end, length, bslen;
1303   BioseqPtr    bsp=NULL;
1304   SeqLocPtr    slp=NULL, slptemp;
1305 
1306   if(!ssp) {
1307       //LCOV_EXCL_START
1308       // unreachable for valid ASN.1
1309     ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1310     //LCOV_EXCL_STOP
1311   } else {
1312     for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++) {
1313       /*get all seqid in current segment*/
1314       sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
1315       slp=ssptemp->loc;
1316       if(slp==NULL)
1317         return;
1318       for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next) {
1319         siptemp=SeqLocId(slptemp);
1320         start=SeqLocStart(slptemp);
1321         end=SeqLocStop(slptemp);
1322         length=SeqLocLen(slptemp);
1323 
1324         bsp=AlignValBioseqLockById(siptemp);
1325         if(bsp) {
1326           bslen=bsp->length;
1327           AlignValBioseqUnlock (bsp);
1328 
1329           /*verify start*/
1330           if(start<0) {
1331               //LCOV_EXCL_START
1332               //unreachable with valid ASN.1
1333             ValMessage (salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1);
1334             //LCOV_EXCL_STOP
1335           }
1336 
1337           if(start>bslen-1) {
1338             ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1);
1339           }
1340 
1341             /*verify end*/
1342           if(end<0) {
1343               //LCOV_EXCL_START
1344               //unreachable with valid ASN.1
1345             ValMessage (salp, Err_End_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1);
1346             //LCOV_EXCL_STOP
1347           }
1348           if(end>bslen-1) {
1349             ValMessage (salp, Err_End_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1);
1350           }
1351 
1352           /*verify length*/
1353           if(length<0) {
1354               //LCOV_EXCL_START
1355               //unreachable with valid ASN.1
1356             ValMessage (salp, Err_Len_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1);
1357             //LCOV_EXCL_STOP
1358           }
1359 
1360           if(length>bslen) {
1361             ValMessage (salp, Err_Len_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1);
1362           }
1363 
1364         }
1365       }
1366       /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1367       SeqIdSetFree(sip);
1368     }
1369   }
1370 }
1371 
1372 /******************************************************************
1373 validate the start and segment length in packseg
1374 ******************************************************************/
ValidateSeqlengthInPackSeg(PackSegPtr psp,SeqAlignPtr salp)1375 static void ValidateSeqlengthInPackSeg (PackSegPtr psp, SeqAlignPtr salp)
1376 {
1377   Uint1Ptr     seqpresence=NULL;
1378   Int2         numseg, aligndim, i, j;
1379   SeqIdPtr     sip=NULL, siptemp;
1380   Int4Ptr      stptr=NULL, lenptr=NULL;
1381   BioseqPtr    bsp=NULL;
1382   Int4         bslen, seg_start;
1383 
1384   if (!psp){
1385       //LCOV_EXCL_START
1386       // unreachable for valid ASN.1
1387       ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1388       //LCOV_EXCL_STOP
1389   }
1390   else
1391     {
1392       numseg=psp->numseg;
1393       aligndim=psp->dim;
1394       sip=psp->ids;
1395       stptr=psp->starts;
1396       lenptr=psp->lens;
1397 
1398       if(stptr&&lenptr)
1399     {
1400       if(psp->present)
1401         {
1402           BSSeek(psp->present, 0, SEEK_SET);
1403           seqpresence=MemNew(BSLen(psp->present));
1404           if(!seqpresence)
1405         {
1406 
1407           ErrPostEx (SEV_ERROR, 0,0,  "Warning:insufficient memory");
1408           return;
1409 
1410         }
1411           BSRead(psp->present, seqpresence, BSLen(psp->present));
1412           /*go through each sequence*/
1413           for(j=0, siptemp=sip; j<aligndim && siptemp != NULL; siptemp=siptemp->next, j++)
1414         {
1415           bsp=AlignValBioseqLockById(siptemp);
1416           if(bsp)
1417             {
1418               bslen=bsp->length;
1419               AlignValBioseqUnlock (bsp);
1420               seg_start=stptr[j];
1421               /*check start*/
1422               if (seg_start < 0) {
1423                   //LCOV_EXCL_START
1424                   ValMessage(salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip, 0);
1425                   //LCOV_EXCL_STOP
1426               }
1427               if(seg_start>=bslen)
1428             ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , 0);
1429 
1430               /*go through each segment*/
1431               for(i=0; i<numseg; i++)
1432             {
1433               /*if this segment is present*/
1434               if(seqpresence[(i*aligndim+j)/8]&jybitnum[(i*aligndim+j)%8])
1435                 {
1436                   /*check start plus seg length*/
1437                   seg_start=seg_start+lenptr[i];
1438                   if(seg_start>bslen)
1439                  ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip, numseg);
1440                 }
1441             }
1442             }
1443         }
1444         }
1445     }
1446     }
1447   MemFree(seqpresence);
1448 }
1449 
1450 /******************************************************************
1451 check segment length, start and end point in Denseseg, Densediag and Stdseg
1452 ******************************************************************/
ValidateSeqlengthinSeqAlign(SeqAlignPtr salp)1453 static void  ValidateSeqlengthinSeqAlign (SeqAlignPtr salp)
1454 {
1455 
1456   if (salp)
1457   {
1458       if(salp->segtype==1)
1459     ValidateSeqlengthInDenseDiag ((DenseDiagPtr)salp->segs, salp);
1460       else if(salp->segtype==2)
1461     ValidateSeqlengthInDenseSeg ((DenseSegPtr)salp->segs, salp);
1462       else if(salp->segtype==3)
1463     ValidateSeqlengthInStdSeg ((StdSegPtr)salp->segs, salp);
1464       else if(salp->segtype==4)
1465     ValidateSeqlengthInPackSeg ((PackSegPtr)salp->segs, salp);
1466   }
1467 }
1468 
1469 /******************************************************************
1470 check if # of seqid matches the dimensions, and
1471 if there is only one seqeuence in seqalign
1472 ******************************************************************/
ValidateDimSeqIds(SeqAlignPtr salp)1473 static void ValidateDimSeqIds (SeqAlignPtr salp)
1474 {
1475   SeqIdPtr sip=NULL;
1476   DenseDiagPtr ddp=NULL, ddptemp;
1477   StdSegPtr ssp=NULL, ssptemp;
1478   DenseSegPtr dsp=NULL;
1479   Int4 numseg=0;
1480 
1481  if(salp)
1482    {
1483      /*densediag */
1484      if(salp->segtype==1)
1485        {
1486 
1487      ddp=(DenseDiagPtr)salp->segs;
1488      if (!ddp) {
1489          //LCOV_EXCL_START
1490          // unreachable for valid ASN.1
1491          ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1492          //LCOV_EXCL_STOP
1493      }
1494      else
1495        for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1496          {
1497            sip=ddptemp->id;
1498            if(ddptemp->dim==1)
1499          ValMessage (salp, Err_Segs_Dim_One, SEV_ERROR, NULL, sip , numseg+1);
1500            if(ddptemp->dim!=CountSeqIdInSip(sip))
1501          ValMessage (salp, Err_Segs_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , numseg+1);
1502 
1503          }
1504        }
1505 
1506      /*denseseg, packseg */
1507      else if(salp->segtype==2||salp->segtype==4)
1508        {
1509      dsp=(DenseSegPtr) (salp->segs);
1510      if (!dsp) {
1511          //LCOV_EXCL_START
1512          // unreachable for valid ASN.1
1513          ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1514          //LCOV_EXCL_STOP
1515      }
1516      else
1517        {
1518          sip=dsp->ids;
1519          if(dsp->dim==1)
1520            ValMessage (salp, Err_SeqAlign_Dim_One, SEV_ERROR, NULL, sip , 0);
1521          if(dsp->dim!=CountSeqIdInSip(sip))
1522             ValMessage (salp, Err_SeqAlign_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , 0);
1523 
1524        }
1525        }
1526 
1527      /*stdseg */
1528      else if(salp->segtype==3)
1529        {
1530 
1531      ssp=(StdSegPtr)salp->segs;
1532      if (!ssp) {
1533          //LCOV_EXCL_START
1534          // unreachable for valid ASN.1
1535          ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1536          //LCOV_EXCL_STOP
1537      }
1538      else
1539        for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++)
1540          {
1541 
1542            sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
1543            if(ssptemp->dim==1)
1544          ValMessage (salp, Err_Segs_Dim_One, SEV_ERROR, NULL, sip , numseg+1);
1545            if(ssptemp->dim!=CountSeqIdInSip( sip))
1546          ValMessage (salp, Err_Segs_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , numseg+1);
1547            /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1548 
1549            SeqIdSetFree(sip);
1550          }
1551        }
1552    }
1553 }
1554 
1555 /******************************************************************
1556 return true if a sip is contained in a seg, or false if otherwise
1557 Note it returns FASLE for an empty seqloc.
1558 It also returns false if error in sip or ssp
1559 ******************************************************************/
IsSipContainedInStdseg(SeqIdPtr sip,StdSegPtr ssp)1560 static Boolean IsSipContainedInStdseg(SeqIdPtr sip, StdSegPtr ssp)
1561 {
1562   SeqLocPtr slp, slptemp;
1563 
1564   if(!sip||!ssp)
1565     return FALSE;
1566 
1567   slp=ssp->loc;
1568   for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
1569     {
1570       if(slptemp->choice!=SEQLOC_EMPTY&&SeqIdCmp(sip, SeqLocId(slptemp)))
1571     return TRUE;
1572     }
1573 
1574   return FALSE;
1575 }
1576 
PercentStringMatch(CharPtr string1,CharPtr string2)1577 static Int4 PercentStringMatch (CharPtr string1, CharPtr string2)
1578 {
1579   Int4 len1, len2, min_len, k, max_len;
1580   Int4 num_match = 0;
1581 
1582   if (StringHasNoText (string1) || StringHasNoText (string2))
1583   {
1584       return 0;
1585   }
1586   len1 = StringLen (string1);
1587   len2 = StringLen (string2);
1588 
1589   if (len1 > len2)
1590   {
1591       min_len = len2;
1592       max_len = len1;
1593   }
1594   else
1595   {
1596       min_len = len1;
1597       max_len = len2;
1598   }
1599 
1600   for (k = 0; k < min_len; k++)
1601   {
1602       if (string1[k] == string2[k] || string1[k] == 'N' || string2[k] == 'N')
1603       {
1604         num_match++;
1605       }
1606   }
1607   return (100 * num_match) / min_len;
1608 }
1609 
CheckForPercentMatch(SeqIdPtr sip_list)1610 static Boolean CheckForPercentMatch (SeqIdPtr sip_list)
1611 {
1612   SeqIdPtr  sip_temp, sip_next;
1613   BioseqPtr bsp;
1614   CharPtr   master_seq = NULL, this_seq = NULL;
1615 
1616   if (sip_list == NULL) return FALSE;
1617   sip_next = sip_list->next;
1618   sip_list->next = NULL;
1619   bsp = BioseqFind (sip_list);
1620   if (bsp != NULL)
1621   {
1622     master_seq = GetSequenceByBsp (bsp);
1623   }
1624   sip_list->next = sip_next;
1625   sip_temp = sip_next;
1626   if (bsp == NULL || master_seq == NULL)
1627   {
1628       return FALSE;
1629   }
1630 
1631   for (sip_temp = sip_next; sip_temp != NULL; sip_temp = sip_next)
1632   {
1633       sip_next = sip_temp->next;
1634       sip_temp->next = NULL;
1635 
1636       bsp = BioseqFind (sip_temp);
1637       if (bsp != NULL)
1638       {
1639         this_seq = GetSequenceByBsp (bsp);
1640       } else {
1641         this_seq = NULL;
1642       }
1643 
1644       sip_temp->next = sip_next;
1645       if (bsp == NULL || StringHasNoText (this_seq) || PercentStringMatch (master_seq, this_seq) < 50)
1646       {
1647         MemFree (this_seq);
1648         return FALSE;
1649       }
1650       MemFree (this_seq);
1651   }
1652   return TRUE;
1653 }
1654 
1655 
1656 /******************************************************************
1657 check if an alignment is FASTA-like.
1658 If all gaps are at the 3' ends with dimensions>2, it's FASTA-like
1659 ******************************************************************/
Is_Fasta_Seqalign(SeqAlignPtr salp)1660 static Boolean Is_Fasta_Seqalign (SeqAlignPtr salp)
1661 {
1662 
1663   SeqIdPtr    siptemp=NULL;
1664   DenseSegPtr dsp;
1665   Int4Ptr     startp;
1666   Boolean     gap;
1667   Int4        k;
1668   Int2        j;
1669   SeqIdPtr    bad_sip = NULL;
1670 
1671   /*check only global or partial type*/
1672   if(salp->type!=1&&salp->type!=3)
1673     return FALSE;
1674 
1675   if (salp->segtype != SAS_DENSEG) {
1676     ValMessage (salp, Err_Unexpected_Alignment_Type, SEV_ERROR, NULL, NULL, 0);
1677   } else {
1678     dsp = (DenseSegPtr) salp->segs;
1679     if(!dsp)
1680     {
1681         //LCOV_EXCL_START
1682         // unreachable for valid ASN.1
1683       ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1684       //LCOV_EXCL_STOP
1685     }
1686     else
1687     {
1688       if(dsp->dim<=2)
1689       {
1690         return FALSE;
1691       }
1692       /* if any sequence has gaps at the 5' end or internal gaps, the entire
1693        * alignment is declared to be valid.
1694        * if the sequence contains no gaps at all or only 3' end gaps, check
1695        * sequences for matches against the first sequence - if more than half
1696        * of the nucleotides are matches, then call this not FASTA-like.
1697        */
1698       for (j=0, siptemp=dsp->ids; j<dsp->dim&&siptemp; j++, siptemp=siptemp->next)
1699       {
1700         gap=FALSE;
1701 
1702         for (k=0; k<dsp->numseg; k++)
1703         {
1704           startp=dsp->starts;
1705 
1706           /*if start value is -1, set gap flag to true*/
1707           if (startp[dsp->dim*k + j] < 0)
1708           {
1709             gap = TRUE;
1710           }
1711           /*if a positive start value is found after the initial -1 start value, then it's not  fasta like, no need to check this sequence further */
1712           else if(gap)
1713           {
1714             if (bad_sip != NULL)
1715             {
1716               SeqIdFree (bad_sip);
1717             }
1718             return FALSE;
1719           }
1720           /* if no positive start value is found after the initial -1 start value
1721            * (indicating that gaps exist only at the 5' end) or if no gaps
1722            * were found at all, flag this sequence as bad if it is the first found.
1723            */
1724           if(k==dsp->numseg-1)
1725           {
1726             if (bad_sip == NULL)
1727             {
1728               bad_sip = SeqIdDup (siptemp);
1729             }
1730           }
1731         }
1732       }
1733       if (bad_sip != NULL)
1734       {
1735         if (! CheckForPercentMatch (dsp->ids))
1736         {
1737           ValMessage (salp, Err_Fastalike, SEV_WARNING, bad_sip, dsp->ids, 0);
1738           SeqIdFree (bad_sip);
1739           return TRUE;
1740         }
1741         SeqIdFree (bad_sip);
1742         return FALSE;
1743       }
1744     }
1745   }
1746   /*no fasta like sequence is found*/
1747   return FALSE;
1748 
1749 }
1750 
1751 
1752 
1753 /******************************************************************
1754 check if there is a gap for all sequence in a segment
1755 ******************************************************************/
Segment_Gap_In_SeqAlign(SeqAlignPtr salp)1756 static void Segment_Gap_In_SeqAlign(SeqAlignPtr salp)
1757 {
1758   Int4Ptr      stptr=NULL;
1759   DenseSegPtr  dsp=NULL;
1760   DenseDiagPtr ddp=NULL, ddptemp;
1761   StdSegPtr    ssp=NULL, ssptemp;
1762   PackSegPtr   psp=NULL;
1763   Uint1Ptr     seqpresence=NULL;
1764   Int2         numseg, aligndim, i, j;
1765   SeqIdPtr     sip=NULL;
1766   SeqLocPtr    slp=NULL, slptemp;
1767 
1768 
1769   if(salp)
1770     {
1771       /*densediag*/
1772       if(salp->segtype==1)
1773     {
1774       ddp=(DenseDiagPtr)salp->segs;
1775       if (!ddp) {
1776           //LCOV_EXCL_START
1777           // unreachable for valid ASN.1
1778           ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1779           //LCOV_EXCL_STOP
1780       }
1781       else
1782         {
1783           for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1784         {
1785           sip=ddptemp->id;
1786           /*empty segment*/
1787           if (ddptemp->dim == 0)  {
1788               //LCOV_EXCL_START
1789               //ASN.1 is unreadable if dim is 0
1790               ValMessage(salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, numseg);
1791               //LCOV_EXCL_STOP
1792           }
1793         }
1794         }
1795     }
1796 
1797 
1798       /*denseseg*/
1799      else if(salp->segtype==2)
1800     {
1801       dsp=(DenseSegPtr)salp->segs;
1802       if (!dsp) {
1803           //LCOV_EXCL_START
1804           // unreachable for valid ASN.1
1805           ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1806           //LCOV_EXCL_STOP
1807       }
1808       else
1809         {
1810           numseg=dsp->numseg;
1811           aligndim=dsp->dim;
1812           stptr=dsp->starts;
1813           sip=dsp->ids;
1814 
1815           if(stptr==NULL)
1816         return;
1817 
1818           /*go through each segment*/
1819           for(j=0; j<numseg; j++)
1820         {
1821           /*go through each sequence */
1822           for(i=0; i<aligndim; i++)
1823             {
1824 
1825               if(stptr[j*aligndim+i]==-1)
1826             {
1827               /*all starts are -1 in this segment*/
1828               if(i==aligndim-1)
1829                 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, j);
1830             }
1831               /*at least one start that is not -1*/
1832               else
1833             break;
1834 
1835             }
1836         }
1837         }
1838     }
1839 
1840         /*stdseg*/
1841      else if(salp->segtype==3)
1842     {
1843       ssp=(StdSegPtr)salp->segs;
1844       if (!ssp) {
1845           //LCOV_EXCL_START
1846           // unreachable for valid ASN.1
1847           ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1848           //LCOV_EXCL_STOP
1849       }
1850       else
1851         {
1852           /*go through each segment*/
1853           for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++)
1854         {
1855           sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
1856           slp=ssptemp->loc;
1857           /*go through each sequence*/
1858           for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
1859             {
1860               if(slptemp->choice==SEQLOC_EMPTY||slptemp->choice==SEQLOC_NULL)
1861             {
1862               if(slptemp->next)
1863                 continue;
1864               /*all seqloc are empty*/
1865               else
1866                 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, numseg);
1867             }
1868               /*at least one non-empty seqloc*/
1869               else
1870             break;
1871             }
1872           /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1873           SeqIdSetFree(sip);
1874 
1875         }
1876         }
1877     }
1878       /*packseg*/
1879       else if(salp->segtype==4)
1880     {
1881       psp=(PackSegPtr)salp->segs;
1882       if (!psp) {
1883           //LCOV_EXCL_START
1884           // unreachable for valid ASN.1
1885           ValMessage(salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1886           //LCOV_EXCL_STOP
1887       }
1888       else
1889         {
1890           numseg=psp->numseg;
1891           aligndim=psp->dim;
1892           sip=psp->ids;
1893           if(psp->present)
1894         {
1895           BSSeek(psp->present, 0, SEEK_SET);
1896           seqpresence=MemNew(BSLen(psp->present));
1897           if(!seqpresence)
1898             {
1899               ErrPostEx (SEV_ERROR, 0,0,  "Warning:insufficient memory");
1900               return;
1901 
1902             }
1903           BSRead(psp->present, seqpresence, BSLen(psp->present));
1904 
1905           /*go through each segment*/
1906           for(j=0; j<numseg; j++)
1907             {
1908               /*go through each sequence */
1909               for(i=0; i<aligndim; i++)
1910             {
1911               /*check the presence of each sequence by determining the bit value in a byte (0, not present; otherwise present)*/
1912               if(!(seqpresence[(j*aligndim+i)/8]&jybitnum[(j*aligndim+i)%8]))
1913                 {
1914                   /*more sequence to go*/
1915                   if(i<aligndim-1)
1916                 continue;
1917                   /*no sequence is present in this segment*/
1918                   else if(i==aligndim-1)
1919                 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, j);
1920                 }
1921               /*at least one sequence is present*/
1922               else
1923                 break;
1924             }
1925             }
1926         MemFree(seqpresence);
1927         }
1928         }
1929 
1930     }
1931 
1932 
1933     }
1934 }
1935 
1936 
IsAlignmentTPA(SeqAlignPtr salp)1937 static Boolean IsAlignmentTPA (SeqAlignPtr salp)
1938 {
1939   Boolean isTPA = FALSE;
1940   BioseqPtr bsp;
1941   SeqIdPtr  sip = NULL, tmp_sip;
1942   SeqEntryPtr oldscope;
1943   DenseDiagPtr ddp;
1944   StdSegPtr    ssp;
1945 
1946   if (salp == NULL) {
1947     return FALSE;
1948   }
1949 
1950   oldscope = SeqEntrySetScope (NULL);
1951 
1952   switch (salp->segtype) {
1953     case SAS_DENDIAG:
1954       /*densediag */
1955       for (ddp = (DenseDiagPtr) salp->segs; ddp != NULL && !isTPA; ddp = ddp->next) {
1956         for (sip = SeqIdInAlignSegs((Pointer)ddp, salp->segtype, salp);
1957              sip != NULL && !isTPA;
1958              sip = sip->next) {
1959           bsp = BioseqLockById(sip);
1960           isTPA = HasTpaUserObject(bsp);
1961           BioseqUnlock(bsp);
1962         }
1963       }
1964       break;
1965     case SAS_STD:
1966       /*Stdseg*/
1967       for (ssp = (StdSegPtr) salp->segs; ssp != NULL && !isTPA; ssp = ssp->next) {
1968         sip = SeqIdInAlignSegs((Pointer)ssp, salp->segtype, salp);
1969         for (tmp_sip = sip;
1970              tmp_sip != NULL && !isTPA;
1971              tmp_sip = tmp_sip->next) {
1972           bsp = BioseqLockById(tmp_sip);
1973           isTPA = HasTpaUserObject(bsp);
1974           BioseqUnlock(bsp);
1975         }
1976       }
1977       /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1978       SeqIdSetFree(sip);
1979       break;
1980     case SAS_DENSEG:
1981     case SAS_PACKED:
1982       /*Denseseg, Packseg*/
1983       for (sip=SeqIdInAlignSegs(salp->segs, salp->segtype, salp);
1984            sip != NULL && !isTPA;
1985            sip = sip->next) {
1986         /* NOTE - we do not want to fetch Bioseqs if they aren't local.
1987          * we only care about TpaUserObjects on Bioseqs in THIS record.
1988          */
1989         bsp = BioseqFind(sip);
1990         isTPA = HasTpaUserObject(bsp);
1991       }
1992       break;
1993   }
1994 
1995   SeqEntrySetScope (oldscope);
1996   return isTPA;
1997 }
1998 
1999 
2000 //LCOV_EXCL_START
2001 // only call to this function is commented out
CheckAlnSeqLens(SeqAlignPtr salp)2002 static void CheckAlnSeqLens (SeqAlignPtr salp)
2003 {
2004   Int4     aln_len, start, stop;
2005   Int4     num_rows, row;
2006   SeqIdPtr sip;
2007   BioseqPtr bsp;
2008   Boolean   is_shorter = FALSE;
2009 
2010   if (salp == NULL) return;
2011 
2012   aln_len =  AlnMgr2GetAlnLength(salp, FALSE);
2013   num_rows = AlnMgr2GetNumRows(salp);
2014   if (num_rows < 0) {
2015     return;
2016   }
2017 
2018   for (row = 1; row <= num_rows && !is_shorter; row++) {
2019     sip = AlnMgr2GetNthSeqIdPtr(salp, row);
2020     bsp = BioseqFind (sip);
2021     if (bsp != NULL && bsp->idx.entityID == salp->idx.entityID) {
2022       AlnMgr2GetNthSeqRangeInSA(salp, row, &start, &stop);
2023       if ((stop > start && stop < bsp->length - 1) || (start > stop && start > bsp->length - 1)) {
2024         is_shorter = TRUE;
2025       }
2026     }
2027     sip = SeqIdFree (sip);
2028   }
2029   if (is_shorter) {
2030     ValMessage (salp, Err_Short_Aln, SEV_INFO, NULL, NULL, 0);
2031   }
2032 }
2033 //LCOV_EXCL_STOP
2034 
2035 
AlignmentScorePercentIdOk(SeqAlignPtr salp)2036 static Boolean AlignmentScorePercentIdOk (SeqAlignPtr salp)
2037 {
2038   ScorePtr   score;
2039 
2040   if (salp == NULL) {
2041     return FALSE;
2042   }
2043   for (score = salp->score; score != NULL; score = score->next) {
2044     if (score->id != NULL
2045         && score->id->str != NULL
2046         && StringICmp (score->id->str, "pct_identity_ungap") == 0) {
2047       if (score->value.realvalue > 50.0) {
2048         return TRUE;
2049       } else {
2050         return FALSE;
2051       }
2052     }
2053   }
2054   return FALSE;
2055 }
2056 
2057 
2058 /******************************************************************
2059 validate seqid, segment length, strand in Seqalignment for Denseseg,
2060 Densediag and Stdseg.  Also check if it's FASTA-like
2061 ******************************************************************/
ValidateSeqAlignFunc(SeqAlignPtr salp,Boolean find_remote_bsp)2062 static Boolean ValidateSeqAlignFunc (SeqAlignPtr salp, Boolean find_remote_bsp)
2063 {
2064   Boolean   error=FALSE;
2065   Uint2     pcnt_identity;
2066   SeqAlignPtr salp_test;
2067 
2068   if(salp==NULL)
2069     return FALSE;
2070 
2071   /*validate if dimesion equals number of seqid*/
2072   ValidateDimSeqIds (salp);
2073 
2074   if (find_remote_bsp) {
2075     ValidateSeqIdInSeqAlign (salp);
2076     ValidateSeqlengthinSeqAlign (salp);
2077   }
2078   /*validate strand*/
2079   ValidateStrandinSeqAlign (salp);
2080 
2081   /*validate Fasta like*/
2082   if (Is_Fasta_Seqalign (salp))
2083   {
2084       error = TRUE;
2085   }
2086 
2087   /*validate segment gap*/
2088   Segment_Gap_In_SeqAlign (salp);
2089 
2090   if (!IsAlignmentTPA(salp) && !AlignmentScorePercentIdOk(salp)) {
2091     if (salp->segtype == SAS_DENDIAG) {
2092       /* duplicate alignment, to prevent indexing from changing the original type */
2093       salp_test = SeqAlignDup (salp);
2094       pcnt_identity = AlignmentPercentIdentityEx (salp_test, FALSE, TRUE);
2095       salp_test = SeqAlignFree (salp_test);
2096     } else {
2097       pcnt_identity = AlignmentPercentIdentityEx (salp, FALSE, TRUE);
2098     }
2099 
2100     if (pcnt_identity < 50) {
2101       ValMessage (salp, Err_Pcnt_ID, SEV_WARNING, NULL, NULL, pcnt_identity);
2102     }
2103 
2104 /*    CheckAlnSeqLens (salp); */
2105   }
2106 
2107   return error;
2108 }
2109 
2110 
2111 /******************************************************************
2112 validate each alignment sequentially.
2113 This function will subject the seqalign to all validation functions
2114 ******************************************************************/
ValidateSeqAlign(SeqAlignPtr salp,Uint2 entityID,Boolean message,Boolean msg_success,Boolean find_remote_bsp,Boolean delete_bsp,Boolean delete_salp,BoolPtr dirty)2115 NLM_EXTERN Boolean ValidateSeqAlign (SeqAlignPtr salp, Uint2 entityID, Boolean message,
2116                          Boolean msg_success, Boolean find_remote_bsp,
2117                          Boolean delete_bsp, Boolean delete_salp, BoolPtr dirty)
2118 {
2119   SeqAlignPtr  pre,
2120                salptmp;
2121   SaVal        sv;
2122   SaValPtr     svp;
2123   ValNodePtr   vnp;
2124   JYErrorMsgPtr bemp;
2125   MsgAnswer    ans;
2126   Int2         err_count=0,
2127                salp_count=0;
2128   Boolean      retdel = FALSE;
2129 
2130   if(salp!=NULL)
2131   {
2132      sv.message = message;
2133      sv.msg_success = msg_success;
2134      sv.find_remote_bsp = find_remote_bsp;
2135      sv.delete_salp = delete_salp;
2136      sv.delete_bsp = delete_bsp;
2137      sv.retdel = TRUE;
2138      sv.do_hist_assembly = FALSE;
2139      sv.ids = NULL;
2140      sv.entityID = entityID;
2141      sv.dirty = FALSE;
2142      svp = &sv;
2143      pre=NULL;
2144      salptmp=salp;
2145      while (salptmp)
2146      {
2147         salp_count++;
2148         if (salptmp->segtype == SAS_SPARSE) {
2149            ValMessage (salp, Err_Segtype, SEV_WARNING, NULL, NULL, salptmp->segtype);
2150         } else if (salptmp->segtype == SAS_SPLICED) {
2151            ValMessage (salp, Err_Segtype, SEV_WARNING, NULL, NULL, salptmp->segtype);
2152         }
2153         else if (salptmp->segtype==5)
2154         {
2155            ValidateSeqAlign ((SeqAlignPtr) (salptmp->segs), entityID, message, msg_success, find_remote_bsp, delete_bsp, delete_salp, &svp->dirty);
2156         }
2157         else if (salptmp->segtype<1 || salptmp->segtype>4)
2158         {
2159            ValMessage (salp, Err_Segtype, SEV_ERROR, NULL, NULL, salptmp->segtype);
2160         }
2161         else {
2162               ValidateSeqAlignFunc (salptmp, svp->find_remote_bsp);
2163         }
2164            if (errorp)
2165            {
2166                //LCOV_EXCL_START
2167                //not used
2168               if(svp->message)
2169               {
2170                  for (vnp=errorp; vnp!=NULL; vnp=vnp->next)
2171                  {
2172                     bemp=(JYErrorMsgPtr)vnp->data.ptrvalue;
2173                     ErrPostEx ((ErrSev) bemp->level, 0, 0, bemp->msg);
2174                  }
2175               }
2176               errorp = JYErrorChainDestroy (errorp);
2177               if (svp->delete_salp)
2178               {
2179             if (pre==NULL) {
2180               salp=salptmp->next;
2181               salptmp->next = NULL;
2182               SeqAlignFree (salptmp);
2183               salptmp = salp;
2184             }
2185             else {
2186               pre->next = salptmp->next;
2187               salptmp->next = NULL;
2188               SeqAlignFree (salptmp);
2189               salptmp = pre->next;
2190             }
2191            }
2192               else {
2193                  salptmp = salptmp->next;
2194               }
2195               err_count++;
2196            svp->retdel=FALSE;
2197            //LCOV_EXCL_STOP
2198         }
2199            else {
2200               salptmp = salptmp->next;
2201            }
2202      }
2203      if (err_count==0 && svp->msg_success) {
2204         if (salp_count>1)
2205            ans = Message (MSG_OK, "Validation test of %d alignments succeeded", salp_count);
2206         else
2207            ans = Message (MSG_OK, "Validation test of the alignment succeeded");
2208      }
2209      if (dirty)
2210         *dirty = svp->dirty;
2211      retdel = svp->retdel;
2212   }
2213   return retdel;
2214 }
2215 
2216 
2217 //LCOV_EXCL_START
2218 /******************************************************************
2219 call back function for REGISTER_ALIGNVALIDATION defined in sequin4.c.
2220 Starting point for seqalign validation if user clicked on
2221 SeqalignValidation under menu Filer/Alignment.
2222 Either individual alignment or alignment block
2223 should be highlighted for this validation to work
2224 ******************************************************************/
2225 
ValidateSeqAlignFromData(Pointer data)2226 NLM_EXTERN Int2 LIBCALLBACK ValidateSeqAlignFromData (Pointer data)
2227 {
2228 
2229   OMProcControlPtr  ompcp;
2230   SeqAlignPtr       salp=NULL;
2231   SeqAnnotPtr       sap=NULL;
2232   SeqEntryPtr       sep=NULL;
2233 
2234   ompcp = (OMProcControlPtr) data;
2235   if (ompcp == NULL || ompcp->proc == NULL) return OM_MSG_RET_ERROR;
2236 
2237   if (ompcp->input_data == NULL) return OM_MSG_RET_ERROR;
2238 
2239   switch(ompcp->input_itemtype)
2240     {
2241     case OBJ_BIOSEQ :
2242       sep = SeqMgrGetSeqEntryForData (ompcp->input_data);
2243       break;
2244     case OBJ_BIOSEQSET :
2245       sep = SeqMgrGetSeqEntryForData (ompcp->input_data);
2246       break;
2247       /*if clicked on alignment block*/
2248     case OBJ_SEQANNOT:
2249       sap=(SeqAnnotPtr) (ompcp->input_data);
2250       break;
2251       /*if clicked on individual alignment*/
2252     case OBJ_SEQALIGN:
2253       salp=(SeqAlignPtr) (ompcp->input_data);
2254       break;
2255     case 0 :
2256       return OM_MSG_RET_ERROR;
2257     default :
2258       return OM_MSG_RET_ERROR;
2259   }
2260 
2261   ErrSetMessageLevel(SEV_ERROR);
2262   if(sap!=NULL)
2263   {
2264      salp=is_salp_in_sap(sap, 2);
2265      ValidateSeqAlign (salp, 0, TRUE, TRUE, TRUE, FALSE, FALSE, NULL);
2266   }
2267   if (salp!=NULL) {
2268      ValidateSeqAlign (salp, 0, TRUE, TRUE, TRUE, FALSE, FALSE, NULL);
2269   }
2270   if (sep!=NULL) {
2271      ValidateSeqAlignInSeqEntry (sep, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE);
2272   }
2273   return OM_MSG_RET_DONE;
2274 }
2275 //LCOV_EXCL_STOP
2276 
ValidateSeqAlignInAnnot(SeqAnnotPtr sap,SaValPtr svp)2277 static void ValidateSeqAlignInAnnot (SeqAnnotPtr sap, SaValPtr svp)
2278 
2279 {
2280   SeqAlignPtr  salp;
2281 
2282   while (sap != NULL) {
2283     if (sap->type == 2) {
2284       salp = (SeqAlignPtr) sap->data;
2285       if (salp != NULL) {
2286         ValidateSeqAlign (salp, svp->entityID, svp->message, svp->msg_success, svp->find_remote_bsp, svp->delete_bsp, svp->delete_salp, &svp->dirty);
2287       }
2288     }
2289     sap = sap->next;
2290   }
2291 }
2292 
ValidateSeqAlignInHist(SeqHistPtr hist,SaValPtr svp)2293 static void ValidateSeqAlignInHist (SeqHistPtr hist, SaValPtr svp)
2294 
2295 {
2296   SeqAlignPtr  salp;
2297 
2298   if (hist == NULL) return;
2299   salp = hist->assembly;
2300   /* ValidateSeqAlign will validate the entire chain */
2301   ValidateSeqAlign (salp, svp->entityID, svp->message, svp->msg_success, svp->find_remote_bsp, svp->delete_bsp, svp->delete_salp, &svp->dirty);
2302 }
2303 
ValidateSeqAlignCallback(SeqEntryPtr sep,Pointer mydata,Int4 index,Int2 indent)2304 static void ValidateSeqAlignCallback (SeqEntryPtr sep, Pointer mydata,
2305                                           Int4 index, Int2 indent)
2306 {
2307   BioseqPtr          bsp;
2308   BioseqSetPtr       bssp;
2309   SaValPtr           svp;
2310 
2311   if (sep != NULL && sep->data.ptrvalue && mydata != NULL) {
2312      svp = (SaValPtr)mydata;
2313      if (IS_Bioseq(sep)) {
2314         bsp = (BioseqPtr) sep->data.ptrvalue;
2315         if (bsp!=NULL) {
2316            ValidateSeqAlignInAnnot (bsp->annot, svp);
2317            if (svp != NULL && svp->do_hist_assembly) {
2318               ValidateSeqAlignInHist (bsp->hist, svp);
2319            }
2320         }
2321      }
2322      else if(IS_Bioseq_set(sep)) {
2323         bssp = (BioseqSetPtr)sep->data.ptrvalue;
2324         if (bssp!=NULL) {
2325            ValidateSeqAlignInAnnot (bssp->annot, svp);
2326         }
2327      }
2328   }
2329 }
2330 
2331 
2332 
ValidateSeqAlignInSeqEntry(SeqEntryPtr sep,Boolean message,Boolean msg_success,Boolean find_remote_bsp,Boolean delete_bsp,Boolean delete_salp,Boolean do_hist_assembly)2333 NLM_EXTERN Boolean ValidateSeqAlignInSeqEntry (SeqEntryPtr sep, Boolean message,
2334                                  Boolean msg_success, Boolean find_remote_bsp,
2335                                  Boolean delete_bsp, Boolean delete_salp,
2336                                  Boolean do_hist_assembly)
2337 {
2338   SeqEntryPtr      sep_head;
2339   Uint2            entityID;
2340   SaVal            sv;
2341   Boolean          success=TRUE;
2342 
2343   entityID = ObjMgrGetEntityIDForChoice (sep);
2344   if (entityID > 0) {
2345      sep_head = GetTopSeqEntryForEntityID (entityID);
2346      if (sep_head != NULL) {
2347         sv.message = message;
2348         sv.msg_success = msg_success;
2349         sv.find_remote_bsp = find_remote_bsp;
2350         sv.find_acc_bsp = FALSE;
2351         sv.delete_salp = delete_salp;
2352         sv.delete_bsp = delete_bsp;
2353         sv.retdel = TRUE;
2354         sv.do_hist_assembly = do_hist_assembly;
2355         sv.ids = NULL;
2356         sv.entityID = entityID;
2357         sv.dirty = FALSE;
2358         SeqEntryExplore (sep_head, (Pointer)&sv, ValidateSeqAlignCallback);
2359         if (sv.dirty) {
2360            ObjMgrSetDirtyFlag (entityID, TRUE);
2361            ObjMgrSendMsg (OM_MSG_UPDATE, entityID, 0, 0);
2362         }
2363         success = sv.retdel;
2364      }
2365   }
2366   return success;
2367 }
2368 
2369 
2370 /* alignment validator private for regular validator */
2371 
2372 NLM_EXTERN Boolean ValidateSeqAlignWithinValidator (ValidStructPtr vsp, SeqEntryPtr sep, Boolean find_remote_bsp, Boolean do_hist_assembly);
2373 
ValidateSeqAlignWithinValidator(ValidStructPtr vsp,SeqEntryPtr sep,Boolean find_remote_bsp,Boolean do_hist_assembly)2374 NLM_EXTERN Boolean ValidateSeqAlignWithinValidator (ValidStructPtr vsp, SeqEntryPtr sep, Boolean find_remote_bsp, Boolean do_hist_assembly)
2375 
2376 {
2377   GatherContext  gc;
2378   Boolean        rsult;
2379 
2380   if (vsp == NULL || sep == NULL) return FALSE;
2381   useLockByID = vsp->farIDsInAlignments;
2382   useValErr = TRUE;
2383   useVsp = vsp;
2384   vsp->gcp = &gc;
2385   vsp->bssp = NULL;
2386   vsp->bsp = NULL;
2387   vsp->sfp = NULL;
2388   vsp->descr = NULL;
2389   MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
2390   rsult = ValidateSeqAlignInSeqEntry (sep, FALSE, FALSE, find_remote_bsp, FALSE, FALSE, do_hist_assembly);
2391   useLockByID = TRUE;
2392   useValErr = FALSE;
2393   useVsp = NULL;
2394   return rsult;
2395 }
2396 
2397 
2398 /* PopulateSample and ReadFromAlignmentSample are utility functions for AlignmentPercentIdentity */
PopulateSample(Uint1Ptr seqbuf_list,Int4Ptr start_list,Int4 sample_len,BioseqPtr PNTR bsp_list,Int4 row)2399 static void PopulateSample (Uint1Ptr seqbuf_list, Int4Ptr start_list,
2400                             Int4 sample_len, BioseqPtr PNTR bsp_list,
2401                             Int4 row)
2402 {
2403   Char ch;
2404 
2405   if (seqbuf_list == NULL || start_list == NULL || sample_len < 1 || row < 0 || bsp_list == NULL
2406       || bsp_list[row] == NULL || start_list[row] < 0 || start_list[row] >= bsp_list[row]->length) {
2407       return;
2408   }
2409 
2410   ch = *(seqbuf_list + (row + 1) * sample_len);
2411 
2412   SeqPortStreamInt (bsp_list[row],
2413                     start_list[row],
2414                     MIN (start_list[row] + sample_len - 1, bsp_list[row]->length - 1),
2415                     Seq_strand_plus,
2416                     STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
2417                     seqbuf_list + row * sample_len,
2418                     NULL);
2419 
2420   /* put back char overwritten by SeqPortStreamInt */
2421   *(seqbuf_list + (row + 1) * sample_len) = ch;
2422 
2423 }
2424 
2425 
ComplementChar(Uint1 ch)2426 static Uint1 ComplementChar (Uint1 ch)
2427 {
2428   if (ch == 'A') {
2429     return 'T';
2430   } else if (ch == 'T') {
2431     return 'A';
2432   } else if (ch == 'G') {
2433     return 'C';
2434   } else if (ch == 'C') {
2435     return 'G';
2436   } else {
2437     return ch;
2438   }
2439 }
2440 
ReadFromAlignmentSample(Uint1Ptr seqbuf_list,Int4Ptr start_list,Int4 sample_len,BioseqPtr PNTR bsp_list,Uint1Ptr strand_list,Int4 row,Int4 seq_pos)2441 static Uint1 ReadFromAlignmentSample(Uint1Ptr seqbuf_list, Int4Ptr start_list,
2442                                      Int4 sample_len, BioseqPtr PNTR bsp_list,
2443                                      Uint1Ptr strand_list,
2444                                      Int4 row, Int4 seq_pos)
2445 {
2446   Uint1 ch = 0;
2447 
2448   if (seqbuf_list == NULL || start_list == NULL || sample_len < 1 || row < 0 || bsp_list == NULL
2449       || bsp_list[row] == NULL || seq_pos < 0 || seq_pos >= bsp_list[row]->length) {
2450       return 0;
2451   }
2452 
2453   if (seq_pos < start_list[row] || seq_pos >= start_list[row] + sample_len) {
2454     start_list[row] = (seq_pos / sample_len) * sample_len;
2455     PopulateSample (seqbuf_list, start_list,
2456                     sample_len, bsp_list,
2457                     row);
2458   }
2459   ch = seqbuf_list[(row * sample_len) + seq_pos - start_list[row]];
2460   if (strand_list[row] == Seq_strand_minus) {
2461     ch = ComplementChar(ch);
2462   }
2463   return ch;
2464 }
2465 
2466 typedef struct ambchar {
2467   Char ambig_char;
2468   CharPtr match_list;
2469 } AmbCharData, PNTR AmbCharPtr;
2470 
2471 static const AmbCharData ambiguity_list[] = {
2472  { 'R', "AG" },
2473  { 'Y', "CT" },
2474  { 'M', "AC" },
2475  { 'K', "GT" },
2476  { 'S', "CG" },
2477  { 'W', "AT" },
2478  { 'H', "ACT" },
2479  { 'B', "CGT" },
2480  { 'V', "ACG" },
2481  { 'D', "AGT" }};
2482 
2483 static const Int4 num_ambiguities = sizeof (ambiguity_list) / sizeof (AmbCharData);
2484 
AmbiguousMatch(Char ch1,Char ch2)2485 static Char AmbiguousMatch (Char ch1, Char ch2)
2486 {
2487   Int4 i;
2488   for (i = 0; i < num_ambiguities; i++) {
2489     if (ch1 == ambiguity_list[i].ambig_char
2490         && StringChr (ambiguity_list[i].match_list, ch2)) {
2491       return ch2;
2492     } else if (ch2 == ambiguity_list[i].ambig_char
2493         && StringChr (ambiguity_list[i].match_list, ch1)) {
2494       return ch1;
2495     }
2496   }
2497   return 0;
2498 }
2499 
2500 
2501 //LCOV_EXCL_START
2502 extern double *
GetAlignmentColumnPercentIdentities(SeqAlignPtr salp,Int4 start,Int4 stop,Boolean internal_gaps,Boolean internal_validation)2503 GetAlignmentColumnPercentIdentities
2504 (SeqAlignPtr salp,
2505  Int4    start,
2506  Int4    stop,
2507  Boolean internal_gaps,
2508  Boolean internal_validation)
2509 {
2510   Int4       aln_len, num_rows, row, col_count = 0;
2511   Int4       num_match;
2512   Int4       aln_pos, seq_pos, k;
2513   Uint1          row_ch;
2514   SeqEntryPtr    oldscope;
2515   SeqIdPtr PNTR  sip_list;
2516   BioseqPtr PNTR bsp_list;
2517   Uint1Ptr       strand_list;
2518   BoolPtr        start_gap, end_gap;
2519   Int4Ptr        start_list;
2520   Uint1Ptr       seqbuf_list;
2521   Int4           sample_len = 50;
2522   Int4           chars_appearing[5]; /* 0 is A, 1 is T, 2 is G, 3 is C, 4 is internal gap */
2523   Int4           max_app, total_app, i;
2524   double *       pct_ids;
2525 
2526   if (salp == NULL || start < 0 || stop < start) return NULL;
2527 
2528   AlnMgr2IndexSingleChildSeqAlign(salp);
2529   aln_len = AlnMgr2GetAlnLength(salp, FALSE);
2530   num_rows = AlnMgr2GetNumRows(salp);
2531   if (num_rows < 0) {
2532     Message (MSG_POSTERR, "AlnMgr2GetNumRows failed");
2533     return NULL;
2534   }
2535 
2536   pct_ids = (double *) MemNew (sizeof (double) * (stop - start + 1));
2537   MemSet (pct_ids, 0, sizeof (double) * (stop - start + 1));
2538 
2539   bsp_list = (BioseqPtr PNTR) MemNew (num_rows * sizeof (BioseqPtr));
2540   sip_list = (SeqIdPtr PNTR) MemNew (num_rows * sizeof(SeqIdPtr));
2541   strand_list = (Uint1Ptr) MemNew (num_rows * sizeof(Uint1));
2542   start_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2543   end_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2544   for (row = 1; row <= num_rows; row++) {
2545     sip_list[row - 1] = AlnMgr2GetNthSeqIdPtr(salp, row);
2546     strand_list[row - 1] = AlnMgr2GetNthStrand(salp, row);
2547     bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2548     if (bsp_list[row - 1] == NULL) {
2549       oldscope = SeqEntrySetScope (NULL);
2550       bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2551       SeqEntrySetScope(oldscope);
2552       if (bsp_list[row - 1] == NULL) {
2553         break;
2554       }
2555     }
2556     start_gap[row - 1] = TRUE;
2557     end_gap[row - 1] = FALSE;
2558   }
2559 
2560   if (row <= num_rows) {
2561     Message (MSG_POSTERR, "Unable to locate Bioseq in alignment");
2562     while (row >= 0) {
2563       sip_list[row] = SeqIdFree(sip_list[row]);
2564       BioseqUnlock(bsp_list[row]);
2565       row--;
2566     }
2567     sip_list = MemFree (sip_list);
2568     bsp_list = MemFree (bsp_list);
2569     start_gap = MemFree (start_gap);
2570     end_gap = MemFree (end_gap);
2571     return 0;
2572   }
2573 
2574   start_list = (Int4Ptr) MemNew (num_rows * sizeof(Int4));
2575   seqbuf_list = (Uint1Ptr) MemNew (num_rows * sample_len * sizeof(Uint1));
2576   for (row = 0; row < num_rows; row++) {
2577     start_list[row] = 0;
2578     PopulateSample (seqbuf_list, start_list,
2579                     sample_len, bsp_list,
2580                     row);
2581   }
2582 
2583   num_match = 0;
2584   for (aln_pos = start; aln_pos < aln_len && aln_pos <= stop; aln_pos++) {
2585     /* init lists */
2586     MemSet (chars_appearing, 0, sizeof (chars_appearing));
2587     for (row = 1; row <= num_rows; row++) {
2588       if (end_gap[row - 1]) {
2589         continue;
2590       }
2591       seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, row);
2592       if (seq_pos < 0) {
2593         if (start_gap[row - 1] || end_gap[row - 1]) {
2594           /* beginning/end gap - never counts against percent identity */
2595         } else {
2596           k = aln_pos + 1;
2597           while (k < aln_len && seq_pos < 0) {
2598             seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, k, row);
2599             k++;
2600           }
2601           if (seq_pos < 0) {
2602             /* now in end_gap for this sequence */
2603             end_gap[row - 1] = TRUE;
2604           } else {
2605             /* internal gaps count against percent identity when specified */
2606             if (internal_gaps) {
2607               chars_appearing[4] ++;
2608             }
2609           }
2610         }
2611       } else {
2612         start_gap[row - 1] = FALSE;
2613 
2614         row_ch = ReadFromAlignmentSample(seqbuf_list, start_list,
2615                                          sample_len, bsp_list, strand_list,
2616                                          row - 1, seq_pos);
2617         switch (row_ch) {
2618           case 'A':
2619             chars_appearing[0]++;
2620             break;
2621           case 'T':
2622             chars_appearing[1]++;
2623             break;
2624           case 'G':
2625             chars_appearing[2]++;
2626             break;
2627           case 'C':
2628             chars_appearing[3]++;
2629             break;
2630           default:
2631             /* we don't count ambiguity characters */
2632             break;
2633        }
2634       }
2635     }
2636     max_app = 0;
2637     total_app = 0;
2638     for (i = 0; i < 4; i++) {
2639       if (chars_appearing[i] > max_app) {
2640         max_app = chars_appearing[i];
2641       }
2642       total_app += chars_appearing[i];
2643     }
2644     /* add in internal gaps */
2645     total_app += chars_appearing[4];
2646     if (total_app > 0) {
2647       pct_ids[aln_pos - start] = (double) max_app / (double) total_app;
2648     }
2649     col_count++;
2650   }
2651 
2652   for (row = 0; row < num_rows; row++) {
2653     sip_list[row] = SeqIdFree(sip_list[row]);
2654     BioseqUnlock(bsp_list[row]);
2655   }
2656   sip_list = MemFree (sip_list);
2657   bsp_list = MemFree (bsp_list);
2658   start_gap = MemFree (start_gap);
2659   end_gap = MemFree (end_gap);
2660   start_list = MemFree (start_list);
2661   seqbuf_list = MemFree (seqbuf_list);
2662 
2663   return pct_ids;
2664 }
2665 //LCOV_EXCL_STOP
2666 
2667 
AlignmentPercentIdentityEx(SeqAlignPtr salp,Boolean internal_gaps,Boolean internal_validation)2668 static Uint2 AlignmentPercentIdentityEx (SeqAlignPtr salp, Boolean internal_gaps, Boolean internal_validation)
2669 {
2670   Int4       aln_len, num_rows, row, col_count = 0;
2671   Int4       num_match;
2672   Uint2      pcnt;
2673   Boolean    row_match;
2674   Int4       aln_pos, seq_pos, tmp;
2675   Uint1          seq_ch, row_ch, amb_match;
2676   SeqEntryPtr    oldscope;
2677   SeqIdPtr PNTR  sip_list;
2678   BioseqPtr PNTR bsp_list;
2679   Uint1Ptr       strand_list;
2680   Int4Ptr        start_list;
2681   Uint1Ptr       seqbuf_list;
2682   Int4           sample_len = 50;
2683   Int4Ptr        starts, stops;
2684   Int4           match_25 = 0;
2685   ErrSev         logsev;
2686   ErrSev         msgsev;
2687 
2688   if (salp == NULL) return 0;
2689 
2690   AlnMgr2IndexSingleChildSeqAlign(salp);
2691   aln_len = AlnMgr2GetAlnLength(salp, FALSE);
2692   num_rows = AlnMgr2GetNumRows(salp);
2693   if (num_rows < 0) {
2694     if (! internal_validation) {
2695       Message (MSG_POSTERR, "AlnMgr2GetNumRows failed");
2696     }
2697     return 0;
2698   }
2699   bsp_list = (BioseqPtr PNTR) MemNew (num_rows * sizeof (BioseqPtr));
2700   sip_list = (SeqIdPtr PNTR) MemNew (num_rows * sizeof(SeqIdPtr));
2701   strand_list = (Uint1Ptr) MemNew (num_rows * sizeof(Uint1));
2702   starts = (Int4Ptr) MemNew (num_rows * sizeof (Int4));
2703   stops = (Int4Ptr) MemNew (num_rows * sizeof (Int4));
2704   for (row = 1; row <= num_rows; row++) {
2705     sip_list[row - 1] = AlnMgr2GetNthSeqIdPtr(salp, row);
2706     strand_list[row - 1] = AlnMgr2GetNthStrand(salp, row);
2707     msgsev = ErrSetMessageLevel (SEV_MAX);
2708     logsev = ErrSetLogLevel (SEV_MAX);
2709     bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2710     ErrSetLogLevel (logsev);
2711     ErrSetMessageLevel (msgsev);
2712     if (bsp_list[row - 1] == NULL) {
2713       oldscope = SeqEntrySetScope (NULL);
2714       msgsev = ErrSetMessageLevel (SEV_MAX);
2715       logsev = ErrSetLogLevel (SEV_MAX);
2716       bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2717       ErrSetLogLevel (logsev);
2718       ErrSetMessageLevel (msgsev);
2719       SeqEntrySetScope(oldscope);
2720       if (bsp_list[row - 1] == NULL) {
2721         break;
2722       }
2723     }
2724     /* get endpoints for each row */
2725     AlnMgr2GetNthSeqRangeInSA(salp, row, starts + row - 1, stops + row - 1);
2726     starts[row - 1] = AlnMgr2MapBioseqToSeqAlign (salp, starts[row - 1], row);
2727     stops[row - 1] = AlnMgr2MapBioseqToSeqAlign (salp, stops[row - 1], row);
2728     if (starts[row - 1] > stops[row - 1]) {
2729       tmp = starts[row - 1];
2730       starts[row - 1] = stops[row - 1];
2731       stops[row - 1] = tmp;
2732     }
2733 
2734   }
2735 
2736   if (row <= num_rows) {
2737     if (! internal_validation) {
2738       Message (MSG_POSTERR, "Unable to locate Bioseq in alignment");
2739     }
2740     while (row > 0) {
2741       sip_list[row - 1] = SeqIdFree(sip_list[row - 1]);
2742       BioseqUnlock(bsp_list[row - 1]);
2743       row--;
2744     }
2745     sip_list = MemFree (sip_list);
2746     bsp_list = MemFree (bsp_list);
2747     starts = MemFree (starts);
2748     stops = MemFree (stops);
2749     return 0;
2750   }
2751 
2752   start_list = (Int4Ptr) MemNew (num_rows * sizeof(Int4));
2753   seqbuf_list = (Uint1Ptr) MemNew ((num_rows * sample_len + 1) * sizeof(Uint1));
2754   for (row = 0; row < num_rows; row++) {
2755     start_list[row] = 0;
2756     PopulateSample (seqbuf_list, start_list,
2757                     sample_len, bsp_list,
2758                     row);
2759   }
2760 
2761   num_match = 0;
2762   for (aln_pos = 0; aln_pos < aln_len; aln_pos++) {
2763     row_match = TRUE;
2764     seq_ch = 0;
2765     for (row = 1; row <= num_rows; row++) {
2766       if (aln_pos < starts[row - 1] || aln_pos > stops[row - 1]) {
2767         continue;
2768       }
2769       seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, row);
2770       if (seq_pos < 0) {
2771         if (internal_gaps) {
2772           row_match = FALSE;
2773         }
2774       } else {
2775         row_ch = ReadFromAlignmentSample(seqbuf_list, start_list,
2776                                          sample_len, bsp_list, strand_list,
2777                                          row - 1, seq_pos);
2778         if (row_ch == 'N') {
2779           /* do nothing - Ns do not count against percent identity */
2780         } else if (seq_ch == 0) {
2781           seq_ch = row_ch;
2782         } else if (seq_ch != row_ch) {
2783           amb_match = AmbiguousMatch (seq_ch, row_ch);
2784           if (amb_match == 0) {
2785             row_match = FALSE;
2786           } else {
2787             seq_ch = amb_match;
2788           }
2789         }
2790       }
2791       if (!row_match) {
2792         break;
2793       }
2794     }
2795     if (row_match) {
2796       num_match++;
2797       match_25++;
2798     }
2799     col_count++;
2800     if (col_count % 25 == 0) {
2801       match_25 = 0;
2802     }
2803   }
2804 
2805   for (row = 0; row < num_rows; row++) {
2806     sip_list[row] = SeqIdFree(sip_list[row]);
2807     BioseqUnlock(bsp_list[row]);
2808   }
2809   sip_list = MemFree (sip_list);
2810   bsp_list = MemFree (bsp_list);
2811   starts = MemFree (starts);
2812   stops = MemFree (stops);
2813   start_list = MemFree (start_list);
2814   seqbuf_list = MemFree (seqbuf_list);
2815 
2816   if (col_count == 0) {
2817       pcnt = 0;
2818   } else {
2819       pcnt = (100 * num_match) / col_count;
2820   }
2821   return pcnt;
2822 }
2823 
2824 //LCOV_EXCL_START
AlignmentPercentIdentity(SeqAlignPtr salp,Boolean internal_gaps)2825 extern Uint2 AlignmentPercentIdentity (SeqAlignPtr salp, Boolean internal_gaps)
2826 {
2827   return AlignmentPercentIdentityEx (salp, internal_gaps, FALSE);
2828 }
2829 //LCOV_EXCL_STOP
2830