1 /*  $Id: utilities.cpp 632625 2021-06-03 17:38:33Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Mati Shomrat
27  *
28  * File Description:
29  *      Implementation of utility classes and functions.
30  *
31  */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <corelib/ncbistr.hpp>
35 
36 #include <serial/enumvalues.hpp>
37 #include <serial/serialimpl.hpp>
38 
39 #include <objects/seqloc/Seq_id.hpp>
40 #include <objects/seqfeat/SeqFeatData.hpp>
41 #include <objects/seqfeat/Gb_qual.hpp>
42 #include <objects/seqset/Seq_entry.hpp>
43 #include <objects/seqset/Bioseq_set.hpp>
44 #include <objects/seq/Bioseq.hpp>
45 #include <objects/misc/sequence_macros.hpp>
46 #include <objects/taxon3/T3Data.hpp>
47 #include <objects/taxon3/Taxon3_reply.hpp>
48 #include <objmgr/bioseq_handle.hpp>
49 #include <objmgr/scope.hpp>
50 #include <objmgr/seq_vector.hpp>
51 #include <objmgr/util/sequence.hpp>
52 #include <objmgr/util/seq_loc_util.hpp>
53 #include <objmgr/bioseq_ci.hpp>
54 #include <objmgr/seqdesc_ci.hpp>
55 #include <objmgr/align_ci.hpp>
56 #include <objmgr/object_manager.hpp>
57 #include <objects/taxon3/taxon3.hpp>
58 #include <objects/taxon1/taxon1.hpp>
59 #include <objtools/validator/utilities.hpp>
60 #include <objtools/validator/splice_problems.hpp>
61 #include <objtools/validator/translation_problems.hpp>
62 #include <objtools/validator/tax_validation_and_cleanup.hpp>
63 
64 #include <vector>
65 #include <algorithm>
66 #include <list>
67 
68 
69 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)70 BEGIN_SCOPE(objects)
71 BEGIN_SCOPE(validator)
72 
73 
74 // =============================================================================
75 //                                    Functions
76 // =============================================================================
77 
78 
79 bool IsClassInEntry(const CSeq_entry& se, CBioseq_set::EClass clss)
80 {
81     for ( CTypeConstIterator <CBioseq_set> si(se); si; ++si ) {
82         if ( si->GetClass() == clss ) {
83             return true;
84         }
85     }
86     return false;
87 }
88 
89 
IsDeltaOrFarSeg(const CSeq_loc & loc,CScope * scope)90 bool IsDeltaOrFarSeg(const CSeq_loc& loc, CScope* scope)
91 {
92     CBioseq_Handle bsh = BioseqHandleFromLocation(scope, loc);
93     const CSeq_entry& se = *bsh.GetTopLevelEntry().GetCompleteSeq_entry();
94 
95     if ( bsh.IsSetInst_Repr() ) {
96         CBioseq_Handle::TInst::TRepr repr = bsh.GetInst_Repr();
97         if ( repr == CSeq_inst::eRepr_delta ) {
98             if ( !IsClassInEntry(se, CBioseq_set::eClass_nuc_prot) ) {
99                 return true;
100             }
101         }
102         if ( repr == CSeq_inst::eRepr_seg ) {
103             if ( !IsClassInEntry(se, CBioseq_set::eClass_parts) ) {
104                 return true;
105             }
106         }
107     }
108 
109     return false;
110 }
111 
112 
113 // Check if string is either empty or contains just white spaces
IsBlankStringList(const list<string> & str_list)114 bool IsBlankStringList(const list< string >& str_list)
115 {
116     ITERATE( list< string >, str, str_list ) {
117         if ( !NStr::IsBlank(*str) ) {
118             return false;
119         }
120     }
121     return true;
122 }
123 
124 
GetGIForSeqId(const CSeq_id & id)125 TGi GetGIForSeqId(const CSeq_id& id)
126 {
127     TGi gi = ZERO_GI;
128     CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
129     scope->AddDefaults();
130 
131     try {
132         CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(id);
133         gi = scope->GetGi (idh);
134     } catch (CException &) {
135     } catch (std::exception &) {
136     }
137     return gi;
138 }
139 
140 
141 
GetSeqIdsForGI(TGi gi)142 CScope::TIds GetSeqIdsForGI(TGi gi)
143 {
144     CScope::TIds id_list;
145     CSeq_id tmp_id;
146     tmp_id.SetGi(gi);
147     CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
148     scope->AddDefaults();
149 
150     try {
151         id_list = scope->GetIds(tmp_id);
152 
153     } catch (CException &) {
154     } catch (std::exception &) {
155     }
156     return id_list;
157 }
158 
IsFarLocation(const CSeq_loc & loc,const CSeq_entry_Handle & seh)159 bool IsFarLocation(const CSeq_loc& loc, const CSeq_entry_Handle& seh)
160 {
161     CScope& scope = seh.GetScope();
162     for ( CSeq_loc_CI citer(loc); citer; ++citer ) {
163         CConstRef<CSeq_id> id(&citer.GetSeq_id());
164         if ( id ) {
165             CBioseq_Handle near_seq = scope.GetBioseqHandleFromTSE(*id, seh);
166             if ( !near_seq ) {
167                 return true;
168             }
169         }
170     }
171 
172     return false;
173 }
174 
GetSequenceStringFromLoc(const CSeq_loc & loc,CScope & scope)175 string GetSequenceStringFromLoc
176 (const CSeq_loc& loc,
177  CScope& scope)
178 {
179     CNcbiOstrstream oss;
180     CFastaOstream fasta_ostr(oss);
181     fasta_ostr.SetFlag(CFastaOstream::fAssembleParts);
182     fasta_ostr.SetFlag(CFastaOstream::fInstantiateGaps);
183     string s;
184 
185     try {
186         for (CSeq_loc_CI citer (loc); citer; ++citer) {
187             const CSeq_loc& part = citer.GetEmbeddingSeq_loc();
188             CBioseq_Handle bsh = BioseqHandleFromLocation (&scope, part);
189             if (bsh) {
190                 fasta_ostr.WriteSequence (bsh, &part);
191             }
192         }
193         s = CNcbiOstrstreamToString(oss);
194         NStr::ReplaceInPlace(s, "\n", "");
195     } catch (CException&) {
196         s = kEmptyStr;
197     }
198 
199     return s;
200 }
201 
202 
GetSequenceFromLoc(const CSeq_loc & loc,CScope & scope,CBioseq_Handle::EVectorCoding coding)203 CSeqVector GetSequenceFromLoc
204 (const CSeq_loc& loc,
205  CScope& scope,
206  CBioseq_Handle::EVectorCoding coding)
207 {
208     CConstRef<CSeqMap> map =
209         CSeqMap::CreateSeqMapForSeq_loc(loc, &scope);
210     return CSeqVector(*map, scope, coding, eNa_strand_plus);
211 }
212 
213 
GetSequenceFromFeature(const CSeq_feat & feat,CScope & scope,CBioseq_Handle::EVectorCoding coding,bool product)214 CSeqVector GetSequenceFromFeature
215 (const CSeq_feat& feat,
216  CScope& scope,
217  CBioseq_Handle::EVectorCoding coding,
218  bool product)
219 {
220 
221     if ( (product   &&  !feat.CanGetProduct())  ||
222          (!product  &&  !feat.CanGetLocation()) ) {
223         return CSeqVector();
224     }
225 
226     const CSeq_loc* loc = product ? &feat.GetProduct() : &feat.GetLocation();
227     return GetSequenceFromLoc(*loc, scope, coding);
228 }
229 
230 
231 /***** Calculate Accession for a given object *****/
232 
233 
s_GetBioseqAcc(const CSeq_id & id,int * version)234 static string s_GetBioseqAcc(const CSeq_id& id, int* version)
235 {
236     try {
237         string label;
238         id.GetLabel(&label, version, CSeq_id::eFasta);
239         return label;
240     } catch (CException&) {
241         return kEmptyStr;
242     }
243 }
244 
245 
s_GetBioseqAcc(const CBioseq_Handle & handle,int * version)246 static string s_GetBioseqAcc(const CBioseq_Handle& handle, int* version)
247 {
248     if (handle) {
249         CConstRef<CSeq_id> seqid = sequence::GetId(handle, sequence::eGetId_Best).GetSeqId();
250         if (seqid) {
251             return s_GetBioseqAcc(*seqid, version);
252         }
253     }
254     return kEmptyStr;
255 }
256 
257 
s_GetSeq_featAcc(const CSeq_feat & feat,CScope & scope,int * version)258 static string s_GetSeq_featAcc(const CSeq_feat& feat, CScope& scope, int* version)
259 {
260     CBioseq_Handle seq = BioseqHandleFromLocation (&scope, feat.GetLocation());
261     if (seq) {
262         CBioseq_set_Handle parent = seq.GetParentBioseq_set();
263         if (parent && parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_parts) {
264             parent = parent.GetParentBioseq_set();
265             if (parent && parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_segset) {
266                 CBioseq_CI m(parent);
267                 if (m) {
268                     return s_GetBioseqAcc(*m, version);
269                 }
270             }
271         }
272     }
273 
274     return s_GetBioseqAcc(seq, version);
275 }
276 
277 
s_GetBioseqAcc(const CBioseq & seq,CScope & scope,int * version)278 static string s_GetBioseqAcc(const CBioseq& seq, CScope& scope, int* version)
279 {
280     CBioseq_Handle handle = scope.GetBioseqHandle(seq);
281     return s_GetBioseqAcc(handle, version);
282 }
283 
s_GetSeqFromSet(const CBioseq_set & bsst,CScope & scope)284 static const CBioseq* s_GetSeqFromSet(const CBioseq_set& bsst, CScope& scope)
285 {
286     const CBioseq* retval = NULL;
287 
288     switch (bsst.GetClass()) {
289         case CBioseq_set::eClass_gen_prod_set:
290             // find the genomic bioseq
291             FOR_EACH_SEQENTRY_ON_SEQSET (it, bsst) {
292                 if ((*it)->IsSeq()) {
293                     const CSeq_inst& inst = (*it)->GetSeq().GetInst();
294                     if (inst.IsSetMol()  &&  inst.GetMol() == CSeq_inst::eMol_dna) {
295                         retval = &(*it)->GetSeq();
296                         break;
297                     }
298                 }
299             }
300             break;
301         case CBioseq_set::eClass_nuc_prot:
302             // find the nucleotide bioseq
303             FOR_EACH_SEQENTRY_ON_SEQSET (it, bsst) {
304                 if ((*it)->IsSeq()  &&  (*it)->GetSeq().IsNa()) {
305                     retval = &(*it)->GetSeq();
306                     break;
307                 } else if ((*it)->IsSet()  &&
308                            (*it)->GetSet().IsSetClass() &&
309                            (*it)->GetSet().GetClass() == CBioseq_set::eClass_segset) {
310                     retval = s_GetSeqFromSet((*it)->GetSet(), scope);
311                     break;
312                 }
313             }
314             if (!retval) {
315                 FOR_EACH_SEQENTRY_ON_SEQSET (it, bsst) {
316                     if ((*it)->IsSeq()) {
317                         retval = &(*it)->GetSeq();
318                         break;
319                     }
320                 }
321             }
322             break;
323         case CBioseq_set::eClass_segset:
324             // find the master bioseq
325             FOR_EACH_SEQENTRY_ON_SEQSET (it, bsst) {
326                 if ((*it)->IsSeq()) {
327                     retval = &(*it)->GetSeq();
328                     break;
329                 }
330             }
331             break;
332 
333         default:
334             // find the first bioseq
335             CTypeConstIterator<CBioseq> seqit(ConstBegin(bsst));
336             if (seqit) {
337                 retval = &(*seqit);
338             }
339             break;
340     }
341 
342     return retval;
343 }
344 
345 
s_IsDescOnSeqEntry(const CSeq_entry & entry,const CSeqdesc & desc)346 static bool s_IsDescOnSeqEntry (const CSeq_entry& entry, const CSeqdesc& desc)
347 {
348     if (entry.IsSetDescr()) {
349         const auto& descs = entry.GetDescr();
350         for (auto& it : descs.Get()) {
351             if (it->Equals(desc)) {
352                 return true;
353             }
354         }
355     }
356     return false;
357 }
358 
359 
360 
s_GetAccessionForSeqdesc(const CSeq_entry_Handle & seh,const CSeqdesc & desc,CScope & scope,int * version)361 static string s_GetAccessionForSeqdesc (const CSeq_entry_Handle& seh, const CSeqdesc& desc, CScope& scope, int* version)
362 {
363     if (!seh) {
364         return kEmptyStr;\
365     } else if (seh.IsSeq()) {
366         return s_GetBioseqAcc(*(seh.GetSeq().GetCompleteBioseq()), scope, version);
367     } else if (s_IsDescOnSeqEntry (*(seh.GetCompleteSeq_entry()), desc)) {
368         if (seh.IsSeq()) {
369             return s_GetBioseqAcc(*(seh.GetSeq().GetCompleteBioseq()), scope, version);
370         } else if (seh.IsSet()) {
371             const CBioseq* seq = s_GetSeqFromSet(*(seh.GetSet().GetCompleteBioseq_set()), scope);
372             if (seq != NULL) {
373                 return s_GetBioseqAcc(*seq, scope, version);
374             }
375         }
376     } else {
377         CSeq_entry_Handle parent = seh.GetParentEntry();
378         if (parent) {
379             return s_GetAccessionForSeqdesc(parent, desc, scope, version);
380         }
381     }
382     return kEmptyStr;
383 }
384 
385 
IsBioseqInSameSeqEntryAsAlign(const CBioseq_Handle & bsh,const CSeq_align & align,CScope & scope)386 bool IsBioseqInSameSeqEntryAsAlign(const CBioseq_Handle& bsh, const CSeq_align& align, CScope& scope)
387 {
388     CSeq_entry_Handle seh = bsh.GetTopLevelEntry();
389     for (CAlign_CI align_it(seh); align_it; ++align_it) {
390         if (&(*align_it) == &align) {
391             return true;
392         }
393     }
394     return false;
395 }
396 
397 
GetReportableSeqIdForAlignment(const CSeq_align & align,CScope & scope)398 CConstRef<CSeq_id> GetReportableSeqIdForAlignment(const CSeq_align& align, CScope& scope)
399 {
400     // temporary - to match C Toolkit
401     if (align.IsSetSegs() && align.GetSegs().IsStd()) {
402         return CConstRef<CSeq_id>(NULL);
403     }
404     try {
405         if (align.IsSetDim()) {
406             for (int i = 0; i < align.GetDim(); ++i) {
407                 const CSeq_id& id = align.GetSeq_id(i);
408                 CBioseq_Handle bsh = scope.GetBioseqHandle(id);
409                 if (bsh && IsBioseqInSameSeqEntryAsAlign(bsh, align, scope)) {
410                     return CConstRef<CSeq_id>(&id);
411                 }
412             }
413         } else if (align.IsSetSegs() && align.GetSegs().IsDendiag()) {
414             const CSeq_id& id = *(align.GetSegs().GetDendiag().front()->GetIds()[0]);
415             return CConstRef<CSeq_id>(&id);
416         }
417         // failed to find resolvable ID, use bare ID
418         const CSeq_id& id = align.GetSeq_id(0);
419         return CConstRef<CSeq_id>(&id);
420     } catch (CException& ) {
421     }
422     return CConstRef<CSeq_id>(NULL);
423 }
424 
425 
GetAccessionFromObjects(const CSerialObject * obj,const CSeq_entry * ctx,CScope & scope,int * version)426 string GetAccessionFromObjects(const CSerialObject* obj, const CSeq_entry* ctx, CScope& scope, int* version)
427 {
428     string empty_acc;
429 
430     if (obj && obj->GetThisTypeInfo() == CSeqdesc::GetTypeInfo() && ctx) {
431         CSeq_entry_Handle seh = scope.GetSeq_entryHandle(*ctx);
432         const CSeqdesc& desc = dynamic_cast<const CSeqdesc&>(*obj);
433         string acc = s_GetAccessionForSeqdesc(seh, desc, scope, version);
434         if (!NStr::IsBlank(acc)) {
435             return acc;
436         }
437     }
438 
439     if (ctx) {
440         if (ctx->IsSeq()) {
441             return s_GetBioseqAcc(ctx->GetSeq(), scope, version);
442         } else if (ctx->IsSet()) {
443             const CBioseq* seq = s_GetSeqFromSet(ctx->GetSet(), scope);
444             if (seq != NULL) {
445                 return s_GetBioseqAcc(*seq, scope, version);
446             }
447         }
448     } else if (obj) {
449         if (obj->GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
450             const CSeq_feat& feat = dynamic_cast<const CSeq_feat&>(*obj);
451             return s_GetSeq_featAcc(feat, scope, version);
452         } else if (obj->GetThisTypeInfo() == CBioseq::GetTypeInfo()) {
453             const CBioseq& seq = dynamic_cast<const CBioseq&>(*obj);
454             return s_GetBioseqAcc(seq, scope, version);
455         } else if (obj->GetThisTypeInfo() == CBioseq_set::GetTypeInfo()) {
456             const CBioseq_set& bsst = dynamic_cast<const CBioseq_set&>(*obj);
457             const CBioseq* seq = s_GetSeqFromSet(bsst, scope);
458             if (seq != NULL) {
459                 return s_GetBioseqAcc(*seq, scope, version);
460             }
461         } else if (obj->GetThisTypeInfo() == CSeq_entry::GetTypeInfo()) {
462             const CSeq_entry& entry = dynamic_cast<const CSeq_entry&>(*obj);
463             if (entry.IsSeq()) {
464                 return s_GetBioseqAcc(entry.GetSeq(), scope, version);
465             } else if (entry.IsSet()) {
466                 const CBioseq* seq = s_GetSeqFromSet(entry.GetSet(), scope);
467                 if (seq != NULL) {
468                     return s_GetBioseqAcc(*seq, scope, version);
469                 }
470             }
471         } else if (obj->GetThisTypeInfo() == CSeq_annot::GetTypeInfo()) {
472             CSeq_annot_Handle ah = scope.GetSeq_annotHandle (dynamic_cast<const CSeq_annot&>(*obj));
473             if (ah) {
474                 CSeq_entry_Handle seh = ah.GetParentEntry();
475                 if (seh) {
476                     if (seh.IsSeq()) {
477                         return s_GetBioseqAcc(seh.GetSeq(), version);
478                     } else if (seh.IsSet()) {
479                         CBioseq_set_Handle bsh = seh.GetSet();
480                         const CBioseq_set& bsst = *(bsh.GetCompleteBioseq_set());
481                         const CBioseq* seq = s_GetSeqFromSet(bsst, scope);
482                         if (seq != NULL) {
483                             return s_GetBioseqAcc(*seq, scope, version);
484                         }
485                     }
486                 }
487             }
488         } else if (obj->GetThisTypeInfo() == CSeq_align::GetTypeInfo()) {
489             const CSeq_align& align = dynamic_cast<const CSeq_align&>(*obj);
490             CConstRef<CSeq_id> id = GetReportableSeqIdForAlignment(align, scope);
491             if (id) {
492                 CBioseq_Handle bsh = scope.GetBioseqHandle(*id);
493                 if (bsh) {
494                     return s_GetBioseqAcc(bsh, version);
495                 } else {
496                     return s_GetBioseqAcc(*id, version);
497                 }
498             }
499         } else if (obj->GetThisTypeInfo() == CSeq_graph::GetTypeInfo()) {
500             const CSeq_graph& graph = dynamic_cast<const CSeq_graph&>(*obj);
501             try {
502                 const CSeq_loc& loc = graph.GetLoc();
503                 const CSeq_id *id = loc.GetId();
504                 if (id) {
505                     return s_GetBioseqAcc (*id, version);
506                 }
507             } catch (CException& ) {
508             }
509         }
510     }
511     return empty_acc;
512 }
513 
514 
GetSetParent(const CBioseq_set_Handle & set,CBioseq_set::TClass set_class)515 CBioseq_set_Handle GetSetParent (const CBioseq_set_Handle& set, CBioseq_set::TClass set_class)
516 {
517     CBioseq_set_Handle gps;
518 
519     CSeq_entry_Handle parent = set.GetParentEntry();
520     if (!parent) {
521         return gps;
522     } else if (!(parent = parent.GetParentEntry())) {
523         return gps;
524     } else if (!parent.IsSet()) {
525         return gps;
526     } else if (parent.GetSet().IsSetClass() && parent.GetSet().GetClass() == set_class) {
527         return parent.GetSet();
528     } else {
529         return GetSetParent (parent.GetSet(), set_class);
530     }
531 }
532 
533 
GetSetParent(const CBioseq_Handle & bioseq,CBioseq_set::TClass set_class)534 CBioseq_set_Handle GetSetParent (const CBioseq_Handle& bioseq, CBioseq_set::TClass set_class)
535 {
536     CBioseq_set_Handle set;
537 
538     CSeq_entry_Handle parent = bioseq.GetParentEntry();
539     if (!parent) {
540         return set;
541     } else if (!(parent = parent.GetParentEntry())) {
542         return set;
543     } else if (!parent.IsSet()) {
544         return set;
545     } else if (parent.GetSet().IsSetClass() && parent.GetSet().GetClass() == set_class) {
546         return parent.GetSet();
547     } else {
548         return GetSetParent (parent.GetSet(), set_class);
549     }
550 }
551 
552 
GetGenProdSetParent(const CBioseq_set_Handle & set)553 CBioseq_set_Handle GetGenProdSetParent (const CBioseq_set_Handle& set)
554 {
555     return GetSetParent (set, CBioseq_set::eClass_gen_prod_set);
556 }
557 
GetGenProdSetParent(const CBioseq_Handle & bioseq)558 CBioseq_set_Handle GetGenProdSetParent (const CBioseq_Handle& bioseq)
559 {
560     return GetSetParent(bioseq, CBioseq_set::eClass_gen_prod_set);
561 }
562 
563 
GetNucProtSetParent(const CBioseq_Handle & bioseq)564 CBioseq_set_Handle GetNucProtSetParent (const CBioseq_Handle& bioseq)
565 {
566     return GetSetParent(bioseq, CBioseq_set::eClass_nuc_prot);
567 }
568 
569 
GetNucBioseq(const CBioseq_set_Handle & bioseq_set)570 CBioseq_Handle GetNucBioseq (const CBioseq_set_Handle& bioseq_set)
571 {
572     CBioseq_Handle nuc;
573 
574     if (!bioseq_set) {
575         return nuc;
576     }
577     CBioseq_CI bit(bioseq_set, CSeq_inst::eMol_na);
578     if (bit) {
579         nuc = *bit;
580     } else {
581         CSeq_entry_Handle parent = bioseq_set.GetParentEntry();
582         if (parent && (parent = parent.GetParentEntry())
583             && parent.IsSet()) {
584             nuc = GetNucBioseq (parent.GetSet());
585         }
586     }
587     return nuc;
588 }
589 
590 
GetNucBioseq(const CBioseq_Handle & bioseq)591 CBioseq_Handle GetNucBioseq (const CBioseq_Handle& bioseq)
592 {
593     CBioseq_Handle nuc;
594 
595     if (bioseq.IsNucleotide()) {
596         return bioseq;
597     }
598     CSeq_entry_Handle parent = bioseq.GetParentEntry();
599     if (parent && (parent = parent.GetParentEntry())
600         && parent.IsSet()) {
601         nuc = GetNucBioseq (parent.GetSet());
602     }
603     return nuc;
604 }
605 
606 
ValidateAccessionString(const string & accession,bool require_version)607 EAccessionFormatError ValidateAccessionString (const string& accession, bool require_version)
608 {
609     if (NStr::IsBlank (accession)) {
610         return eAccessionFormat_null;
611     } else if (accession.length() >= 16) {
612         return eAccessionFormat_too_long;
613     } else if (accession.length() < 3
614                || ! isalpha (accession.c_str()[0])
615                || ! isupper (accession.c_str()[0])) {
616         return eAccessionFormat_no_start_letters;
617     }
618 
619     string str = accession;
620     if (NStr::StartsWith (str, "NZ_")) {
621         str = str.substr(3);
622     }
623 
624     const char *cp = str.c_str();
625     int numAlpha = 0;
626 
627     while (isalpha (*cp)) {
628         numAlpha++;
629         cp++;
630     }
631 
632     int numUndersc = 0;
633 
634     while (*cp == '_') {
635         numUndersc++;
636         cp++;
637     }
638 
639     int numDigits = 0;
640     while (isdigit (*cp)) {
641         numDigits++;
642         cp++;
643     }
644 
645     if ((*cp != '\0' && *cp != ' ' && *cp != '.') || numUndersc > 1) {
646         return eAccessionFormat_wrong_number_of_digits;
647     }
648 
649     if (require_version) {
650         if (*cp != '.') {
651             return eAccessionFormat_missing_version;
652         }
653         cp++;
654         int numVersion = 0;
655         while (isdigit (*cp)) {
656             numVersion++;
657             cp++;
658         }
659         if (numVersion < 1) {
660             return eAccessionFormat_missing_version;
661         } else if (*cp != '\0' && *cp != ' ') {
662             return eAccessionFormat_bad_version;
663         }
664     }
665 
666 
667     if (numUndersc == 0) {
668         if ((numAlpha == 1 && numDigits == 5)
669             || (numAlpha == 2 && numDigits == 6)
670             || (numAlpha == 3 && numDigits == 5)
671             || (numAlpha == 4 && numDigits == 8)
672             || (numAlpha == 5 && numDigits == 7)) {
673             return eAccessionFormat_valid;
674         }
675     } else {
676         if (numAlpha != 2 || (numDigits != 6 && numDigits != 8 && numDigits != 9)) {
677             return eAccessionFormat_wrong_number_of_digits;
678         }
679         char first_letter = accession.c_str()[0];
680         char second_letter = accession.c_str()[1];
681         if (first_letter == 'N' || first_letter == 'X' || first_letter == 'Z') {
682             if (second_letter == 'M' || second_letter == 'C'
683                 || second_letter == 'T' || second_letter == 'P'
684                 || second_letter == 'G' || second_letter == 'R'
685                 || second_letter == 'S' || second_letter == 'W'
686                 || second_letter == 'Z') {
687                 return eAccessionFormat_valid;
688             }
689         }
690         if ((first_letter == 'A' || first_letter == 'Y')
691             && second_letter == 'P') {
692             return eAccessionFormat_valid;
693         }
694     }
695 
696     return eAccessionFormat_wrong_number_of_digits;
697 }
698 
699 
s_FeatureIdsMatch(const CFeat_id & f1,const CFeat_id & f2)700 bool s_FeatureIdsMatch (const CFeat_id& f1, const CFeat_id& f2)
701 {
702     if (!f1.IsLocal() || !f2.IsLocal()) {
703         return false;
704     }
705 
706     return 0 == f1.GetLocal().Compare(f2.GetLocal());
707 }
708 
709 
s_StringHasPMID(const string & str)710 bool s_StringHasPMID (const string& str)
711 {
712     if (NStr::IsBlank (str)) {
713         return false;
714     }
715 
716     size_t pos = NStr::Find (str, "(PMID ");
717     if (pos == string::npos) {
718         return false;
719     }
720 
721     const char *ptr = str.c_str() + pos + 6;
722     unsigned int numdigits = 0;
723     while (*ptr != 0 && *ptr != ')') {
724         if (isdigit (*ptr)) {
725             numdigits++;
726         }
727         ptr++;
728     }
729 
730     if (*ptr == ')' && numdigits > 0) {
731         return true;
732     } else {
733         return false;
734     }
735 }
736 
737 
HasBadCharacter(const string & str)738 bool HasBadCharacter (const string& str)
739 {
740     if (NStr::Find (str, "?") != string::npos
741         || NStr::Find (str, "!") != string::npos
742         || NStr::Find (str, "~") != string::npos
743         || NStr::Find(str, "|") != string::npos) {
744         return true;
745     } else {
746         return false;
747     }
748 }
749 
750 
EndsWithBadCharacter(const string & str)751 bool EndsWithBadCharacter (const string& str)
752 {
753     if (NStr::EndsWith (str, "_") || NStr::EndsWith (str, ".")
754         || NStr::EndsWith (str, ",") || NStr::EndsWith (str, ":")
755         || NStr::EndsWith (str, ";")) {
756         return true;
757     } else {
758         return false;
759     }
760 }
761 
762 
CheckDate(const CDate & date,bool require_full_date)763 int CheckDate (const CDate& date, bool require_full_date)
764 {
765     int rval = eDateValid_valid;
766 
767     if (date.IsStr()) {
768         if (NStr::IsBlank (date.GetStr()) || NStr::Equal (date.GetStr(), "?")) {
769             rval |= eDateValid_bad_str;
770         }
771     } else if (date.IsStd()) {
772         const auto& sdate = date.GetStd();
773         if (!sdate.IsSetYear() || sdate.GetYear() == 0) {
774             rval |= eDateValid_bad_year;
775         }
776         if (sdate.IsSetMonth() && sdate.GetMonth() > 12) {
777             rval |= eDateValid_bad_month;
778         }
779         if (sdate.IsSetDay() && sdate.GetDay() > 31) {
780             rval |= eDateValid_bad_day;
781         }
782         if (require_full_date) {
783             if (!sdate.IsSetMonth() || sdate.GetMonth() == 0) {
784                 rval |= eDateValid_bad_month;
785             }
786             if (!sdate.IsSetDay() || sdate.GetDay() == 0) {
787                 rval |= eDateValid_bad_day;
788             }
789         }
790         if (sdate.IsSetSeason() && !NStr::IsBlank (sdate.GetSeason())) {
791             const char * cp = sdate.GetSeason().c_str();
792             while (*cp != 0) {
793                 if (isalpha (*cp) || *cp == '-') {
794                     // these are the only acceptable characters
795                 } else {
796                     rval |= eDateValid_bad_season;
797                     break;
798                 }
799                 ++cp;
800             }
801         }
802     } else {
803         rval |= eDateValid_bad_other;
804     }
805     return rval;
806 }
807 
808 
IsDateInPast(const CDate & date)809 bool IsDateInPast(const CDate& date)
810 {
811     time_t t;
812     time(&t);
813     struct tm *tm;
814     tm = localtime(&t);
815 
816     bool in_past = false;
817     if (!date.IsStd()) {
818         return false;
819     }
820     const auto & sdate = date.GetStd();
821     if (sdate.GetYear() < tm->tm_year + 1900) {
822         in_past = true;
823     } else if (sdate.GetYear() == tm->tm_year + 1900
824         && sdate.IsSetMonth()) {
825         if (sdate.GetMonth() < tm->tm_mon + 1) {
826             in_past = true;
827         } else if (sdate.GetMonth() == tm->tm_mon + 1
828             && sdate.IsSetDay()) {
829             if (sdate.GetDay() < tm->tm_mday) {
830                 in_past = true;
831             }
832         }
833     }
834     return in_past;
835 }
836 
837 
GetDateErrorDescription(int flags)838 string GetDateErrorDescription (int flags)
839 {
840     string reasons;
841 
842     if (flags & eDateValid_empty_date) {
843         reasons += "EMPTY_DATE ";
844     }
845     if (flags & eDateValid_bad_str) {
846         reasons += "BAD_STR ";
847     }
848     if (flags & eDateValid_bad_year) {
849         reasons += "BAD_YEAR ";
850     }
851     if (flags & eDateValid_bad_month) {
852         reasons += "BAD_MONTH ";
853     }
854     if (flags & eDateValid_bad_day) {
855         reasons += "BAD_DAY ";
856     }
857     if (flags & eDateValid_bad_season) {
858         reasons += "BAD_SEASON ";
859     }
860     if (flags & eDateValid_bad_other) {
861         reasons += "BAD_OTHER ";
862     }
863     return reasons;
864 }
865 
866 
IsBioseqTSA(const CBioseq & seq,CScope * scope)867 bool IsBioseqTSA (const CBioseq& seq, CScope* scope)
868 {
869     if (!scope) {
870         return false;
871     }
872     bool is_tsa = false;
873     CBioseq_Handle bsh = scope->GetBioseqHandle(seq);
874     if (bsh) {
875         CSeqdesc_CI desc_ci(bsh, CSeqdesc::e_Molinfo);
876         while (desc_ci && !is_tsa) {
877             if (desc_ci->GetMolinfo().IsSetTech() && desc_ci->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
878                 is_tsa = true;
879             }
880             ++desc_ci;
881         }
882     }
883     return is_tsa;
884 }
885 
886 
887 #if 0
888 // disabled for now
889 bool IsNCBIFILESeqId (const CSeq_id& id)
890 {
891     if (!id.IsGeneral() || !id.GetGeneral().IsSetDb()
892         || !NStr::Equal(id.GetGeneral().GetDb(), "NCBIFILE")) {
893         return false;
894     } else {
895         return true;
896     }
897 }
898 #endif
899 
900 
IsAccession(const CSeq_id & id)901 bool IsAccession(const CSeq_id& id)
902 {
903     if (id.GetTextseq_Id() != NULL) {
904         return true;
905     } else {
906         return false;
907     }
908 }
909 
910 
UpdateToBestId(CSeq_loc & loc,CScope & scope)911 static void UpdateToBestId(CSeq_loc& loc, CScope& scope)
912 {
913     bool any_change = false;
914     CSeq_loc_I it(loc);
915     for (; it; ++it) {
916         const CSeq_id& id = it.GetSeq_id();
917         if (!IsAccession(id)) {
918             CConstRef<CSeq_id> best_id(NULL);
919             CBioseq_Handle bsh = scope.GetBioseqHandle(id);
920             if (bsh) {
921                 const auto& ids = bsh.GetCompleteBioseq()->GetId();
922                 for (auto& id_it : ids) {
923                     if (IsAccession(*id_it)) {
924                         best_id = id_it;
925                         break;
926                     }
927                 }
928             }
929             if (best_id) {
930                 it.SetSeq_id(*best_id);
931                 any_change = true;
932             }
933         }
934     }
935     if (any_change) {
936         loc.Assign(*it.MakeSeq_loc());
937     }
938 }
939 
940 
GetValidatorLocationLabel(const CSeq_loc & loc,CScope & scope)941 string GetValidatorLocationLabel (const CSeq_loc& loc, CScope& scope)
942 {
943     string loc_label;
944     if (loc.IsWhole()) {
945         CBioseq_Handle bsh = scope.GetBioseqHandle(loc.GetWhole());
946         if (bsh) {
947             loc_label = GetBioseqIdLabel(*(bsh.GetCompleteBioseq()));
948             NStr::ReplaceInPlace(loc_label, "[", "");
949             NStr::ReplaceInPlace(loc_label, "]", "");
950         }
951     }
952     if (NStr::IsBlank(loc_label)) {
953         CSeq_loc tweaked_loc;
954         tweaked_loc.Assign(loc);
955         UpdateToBestId(tweaked_loc, scope);
956         tweaked_loc.GetLabel(&loc_label);
957         NStr::ReplaceInPlace(loc_label, "[", "(");
958         NStr::ReplaceInPlace(loc_label, "]", ")");
959     }
960     return loc_label;
961 }
962 
963 
964 
GetBioseqIdLabel(const CBioseq & sq,bool limited)965 string GetBioseqIdLabel(const CBioseq& sq, bool limited)
966 {
967     string content;
968     int num_ids_found = 0;
969     bool id_found = false;
970 
971     const auto& id_list = sq.GetId();
972 
973     /* find first gi */
974     for (auto id_it : id_list) {
975         if (id_it->IsGi()) {
976             CNcbiOstrstream os;
977             id_it->WriteAsFasta(os);
978             string s = CNcbiOstrstreamToString(os);
979             content += s;
980             num_ids_found ++;
981             break;
982         }
983     }
984     /* find first accession */
985     for (auto id_it : id_list) {
986         if (id_it->IsGenbank()
987             || id_it->IsDdbj()
988             || id_it->IsEmbl()
989             || id_it->IsSwissprot()
990             || id_it->IsOther()
991             || id_it->IsTpd()
992             || id_it->IsTpe()
993             || id_it->IsTpg()) {
994             if (num_ids_found > 0) {
995                 content += "|";
996             }
997             CNcbiOstrstream os;
998             id_it->WriteAsFasta(os);
999             string s = CNcbiOstrstreamToString(os);
1000             content += s;
1001             num_ids_found++;
1002             break;
1003         }
1004     }
1005 
1006     if (num_ids_found == 0) {
1007         /* find first general */
1008         for (auto id_it : id_list) {
1009             if (id_it->IsGeneral()) {
1010                 if (num_ids_found > 0) {
1011                     content += "|";
1012                 }
1013                 CNcbiOstrstream os;
1014                 id_it->WriteAsFasta(os);
1015                 string s = CNcbiOstrstreamToString(os);
1016                 content += s;
1017                 num_ids_found++;
1018                 break;
1019             }
1020         }
1021     }
1022     // didn't find any?  print them all, but only the first local
1023     if (num_ids_found == 0) {
1024         bool found_local = false;
1025         for (auto id_it : id_list) {
1026             if (id_it->IsLocal()) {
1027                 if (found_local) {
1028                     continue;
1029                 } else {
1030                     found_local = true;
1031                 }
1032             }
1033             if (id_found) {
1034                 content += "|";
1035             }
1036             CNcbiOstrstream os;
1037             id_it->WriteAsFasta(os);
1038             string s = CNcbiOstrstreamToString(os);
1039             content += s;
1040             id_found = true;
1041         }
1042     }
1043 
1044     return content;
1045 }
1046 
1047 
AppendBioseqLabel(string & str,const CBioseq & sq,bool supress_context)1048 void AppendBioseqLabel(string& str, const CBioseq& sq, bool supress_context)
1049 {
1050     str += "BIOSEQ: ";
1051 
1052     string content = GetBioseqIdLabel (sq);
1053 
1054     if (!supress_context) {
1055         if (!content.empty()) {
1056             content += ": ";
1057         }
1058 
1059         const CEnumeratedTypeValues* tv;
1060         tv = CSeq_inst::GetTypeInfo_enum_ERepr();
1061         const CSeq_inst& inst = sq.GetInst();
1062         content += tv->FindName(inst.GetRepr(), true) + ", ";
1063         tv = CSeq_inst::GetTypeInfo_enum_EMol();
1064         content += tv->FindName(inst.GetMol(), true);
1065         if (inst.IsSetLength()) {
1066             content += string(" len= ") + NStr::IntToString(inst.GetLength());
1067         }
1068     }
1069     str += content;
1070 }
1071 
HasECnumberPattern(const string & str)1072 bool HasECnumberPattern (const string& str)
1073 {
1074     bool rval = false;
1075     if (NStr::IsBlank(str)) {
1076         return false;
1077     }
1078 
1079     bool is_ambig = false;
1080     int  numdashes = 0;
1081     int  numdigits = 0;
1082     int  numperiods = 0;
1083 
1084     string::const_iterator sit = str.begin();
1085     while (sit != str.end() && !rval) {
1086         if (isdigit (*sit)) {
1087             numdigits++;
1088             if (is_ambig) {
1089                 is_ambig = false;
1090                 numperiods = 0;
1091                 numdashes = 0;
1092             }
1093         } else if (*sit == '-') {
1094             numdashes++;
1095             is_ambig = true;
1096         } else if (*sit == 'n') {
1097             numdashes++;
1098             is_ambig = true;
1099         } else if (*sit == '.') {
1100             numperiods++;
1101             if (numdigits > 0 && numdashes > 0) {
1102                 is_ambig = false;
1103                 numperiods = 0;
1104             } else if (numdigits == 0 && numdashes == 0) {
1105                 is_ambig = false;
1106                 numperiods = 0;
1107             } else if (numdashes > 1) {
1108                 is_ambig = false;
1109                 numperiods = 0;
1110             }
1111             numdigits = 0;
1112             numdashes = 0;
1113         } else {
1114             if (numperiods == 3) {
1115                 if (numdigits > 0 && numdashes > 0) {
1116                     is_ambig = false;
1117                 } else if (numdigits > 0 || numdashes == 1) {
1118                     rval = true;
1119                 }
1120             }
1121             is_ambig = false;
1122             numperiods = 0;
1123             numdigits = 0;
1124             numdashes = 0;
1125         }
1126         ++sit;
1127     }
1128     if (numperiods == 3) {
1129         if (numdigits > 0 && numdashes > 0) {
1130             rval = false;
1131         } else if (numdigits > 0 || numdashes == 1) {
1132             rval = true;
1133         }
1134     }
1135     return rval;
1136 }
1137 
1138 
SeqIsPatent(const CBioseq & seq)1139 bool SeqIsPatent (const CBioseq& seq)
1140 {
1141     bool is_patent = false;
1142 
1143     // some tests are suppressed if a patent ID is present
1144     FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
1145         if ((*id_it)->IsPatent()) {
1146             is_patent = true;
1147             break;
1148         }
1149     }
1150     return is_patent;
1151 }
1152 
1153 
SeqIsPatent(const CBioseq_Handle & seq)1154 bool SeqIsPatent (const CBioseq_Handle& seq)
1155 {
1156     return SeqIsPatent (*(seq.GetCompleteBioseq()));
1157 }
1158 
1159 
s_PartialAtGapOrNs(CScope * scope,const CSeq_loc & loc,unsigned int tag,bool only_gap)1160 bool s_PartialAtGapOrNs (
1161     CScope* scope,
1162     const CSeq_loc& loc,
1163     unsigned int tag,
1164     bool only_gap
1165 )
1166 
1167 {
1168     if ( tag != sequence::eSeqlocPartial_Nostart && tag != sequence::eSeqlocPartial_Nostop ) {
1169         return false;
1170     }
1171 
1172     CSeq_loc_CI first, last;
1173     for ( CSeq_loc_CI sl_iter(loc); sl_iter; ++sl_iter ) { // EQUIV_IS_ONE not supported
1174         if ( !first ) {
1175             first = sl_iter;
1176         }
1177         last = sl_iter;
1178     }
1179 
1180     if ( first.GetStrand() != last.GetStrand() ) {
1181         return false;
1182     }
1183     CSeq_loc_CI temp = (tag == sequence::eSeqlocPartial_Nostart) ? first : last;
1184 
1185     if (!scope) {
1186         return false;
1187     }
1188 
1189     CConstRef<CSeq_loc> slp = temp.GetRangeAsSeq_loc();
1190     if (!slp) {
1191         return false;
1192     }
1193     const CSeq_id* id = slp->GetId();
1194     if (id == NULL) return false;
1195     CBioseq_Handle bsh = scope->GetBioseqHandle(*id);
1196     if (!bsh) {
1197         return false;
1198     }
1199 
1200     TSeqPos acceptor = temp.GetRange().GetFrom();
1201     TSeqPos donor = temp.GetRange().GetTo();
1202     TSeqPos start = acceptor;
1203     TSeqPos stop = donor;
1204 
1205     CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac,
1206         temp.GetStrand());
1207     TSeqPos len = vec.size();
1208 
1209     if ( temp.GetStrand() == eNa_strand_minus ) {
1210         swap(acceptor, donor);
1211         stop = len - donor - 1;
1212         start = len - acceptor - 1;
1213     }
1214 
1215     bool result = false;
1216 
1217     try {
1218         if (tag == sequence::eSeqlocPartial_Nostop && stop < len - 1 && vec.IsInGap(stop + 1)) {
1219             return true;
1220         } else if (tag == sequence::eSeqlocPartial_Nostart && start > 0 && start < len && vec.IsInGap(start - 1)) {
1221             return true;
1222         }
1223     } catch ( exception& ) {
1224         return false;
1225     }
1226     if (only_gap) {
1227         return false;
1228     }
1229 
1230     if ( (tag == sequence::eSeqlocPartial_Nostop)  &&  (stop < len - 2) ) {
1231         try {
1232             CSeqVector::TResidue res = vec[stop + 1];
1233 
1234             if ( IsResidue(res)  &&  isalpha (res)) {
1235                 if ( res == 'N' ) {
1236                     result = true;
1237                 }
1238             }
1239         } catch ( exception& ) {
1240             return false;
1241         }
1242     } else if ( (tag == sequence::eSeqlocPartial_Nostart)  &&  (start > 1) ) {
1243         try {
1244             CSeqVector::TResidue res = vec[start - 1];
1245             if ( IsResidue(res)  &&  isalpha (res)) {
1246                 if ( res == 'N' ) {
1247                     result = true;
1248                 }
1249             }
1250         } catch ( exception& ) {
1251             return false;
1252         }
1253     }
1254 
1255     return result;
1256 }
1257 
1258 
BioseqHandleFromLocation(CScope * m_Scope,const CSeq_loc & loc)1259 CBioseq_Handle BioseqHandleFromLocation (CScope* m_Scope, const CSeq_loc& loc)
1260 
1261 {
1262     CBioseq_Handle bsh;
1263     for ( CSeq_loc_CI citer (loc); citer; ++citer) {
1264         const CSeq_id& id = citer.GetSeq_id();
1265         CSeq_id_Handle sih = CSeq_id_Handle::GetHandle(id);
1266         bsh = m_Scope->GetBioseqHandle (sih, CScope::eGetBioseq_All);
1267         if (bsh) {
1268             return bsh;
1269         }
1270     }
1271     return bsh;
1272 }
1273 
1274 
s_PosIsNNotGap(const CSeqVector & vec,unsigned int pos)1275 bool s_PosIsNNotGap(const CSeqVector& vec, unsigned int pos)
1276 {
1277     if (pos >= vec.size()) {
1278         return false;
1279     } else if (vec[pos] != 'N' && vec[pos] != 'n') {
1280         return false;
1281     } else if (vec.IsInGap(pos)) {
1282         return false;
1283     } else {
1284         return true;
1285     }
1286 }
1287 
1288 
ShouldCheckForNsAndGap(const CBioseq_Handle & bsh)1289 bool ShouldCheckForNsAndGap(const CBioseq_Handle& bsh)
1290 {
1291     if (!bsh || bsh.GetInst_Length() < 10 || (bsh.IsSetInst_Topology() && bsh.GetInst_Topology() == CSeq_inst::eTopology_circular)) {
1292         return false;
1293     } else {
1294         return true;
1295     }
1296 }
1297 
1298 
CheckBioseqEndsForNAndGap(const CSeqVector & vec,EBioseqEndIsType & begin_n,EBioseqEndIsType & begin_gap,EBioseqEndIsType & end_n,EBioseqEndIsType & end_gap,bool & begin_ambig,bool & end_ambig)1299 void CheckBioseqEndsForNAndGap
1300 (const CSeqVector& vec,
1301 EBioseqEndIsType& begin_n,
1302 EBioseqEndIsType& begin_gap,
1303 EBioseqEndIsType& end_n,
1304 EBioseqEndIsType& end_gap,
1305 bool& begin_ambig,
1306 bool& end_ambig)
1307 {
1308     begin_n = eBioseqEndIsType_None;
1309     begin_gap = eBioseqEndIsType_None;
1310     end_n = eBioseqEndIsType_None;
1311     end_gap = eBioseqEndIsType_None;
1312     begin_ambig = false;
1313     end_ambig = false;
1314 
1315     if (vec.size() < 10) {
1316         return;
1317     }
1318 
1319     try {
1320 
1321         // check for gap at begining of sequence
1322         if (vec.IsInGap(0) /* || vec.IsInGap(1) */) {
1323             begin_gap = eBioseqEndIsType_All;
1324             for (int i = 0; i < 10; i++) {
1325                 if (!vec.IsInGap(i)) {
1326                     begin_gap = eBioseqEndIsType_Last;
1327                     break;
1328                 }
1329             }
1330         }
1331 
1332         // check for gap at end of sequence
1333         if ( /* vec.IsInGap (vec.size() - 2) || */ vec.IsInGap(vec.size() - 1)) {
1334             end_gap = eBioseqEndIsType_All;
1335             for (unsigned int i = vec.size() - 11; i < vec.size(); i++) {
1336                 if (!vec.IsInGap(i)) {
1337                     end_gap = eBioseqEndIsType_Last;
1338                     break;
1339                 }
1340             }
1341         }
1342 
1343         if (vec.IsNucleotide()) {
1344             // check for N bases at beginning of sequence
1345             if (s_PosIsNNotGap(vec, 0) /* || s_PosIsNNotGap(vec, 1) */) {
1346                 begin_n = eBioseqEndIsType_All;
1347                 for (unsigned int i = 0; i < 10; i++) {
1348                     if (!s_PosIsNNotGap(vec, i)) {
1349                         begin_n = eBioseqEndIsType_Last;
1350                         break;
1351                     }
1352                 }
1353             }
1354 
1355             // check for N bases at end of sequence
1356             if ( /* s_PosIsNNotGap(vec, vec.size() - 2) || */ s_PosIsNNotGap(vec, vec.size() - 1)) {
1357                 end_n = eBioseqEndIsType_All;
1358                 for (unsigned int i = vec.size() - 10; i < vec.size(); i++) {
1359                     if (!s_PosIsNNotGap(vec, i)) {
1360                         end_n = eBioseqEndIsType_Last;
1361                         break;
1362                     }
1363                 }
1364             }
1365 
1366             // check for ambiguous concentration
1367             size_t check_len = 50;
1368             if (vec.size() < 50) {
1369                 check_len = vec.size();
1370             }
1371             size_t num_ns = 0;
1372             for (size_t i = 0; i < check_len; i++) {
1373                 if (vec[i] == 'N') {
1374                     num_ns++;
1375                     if (num_ns >= 5 && i < 10) {
1376                         begin_ambig = true;
1377                         break;
1378                     } else if (num_ns >= 15) {
1379                         begin_ambig = true;
1380                         break;
1381                     }
1382                 }
1383             }
1384             num_ns = 0;
1385             for (int i = 0; i < check_len; i++) {
1386                 if (vec[vec.size() - i - 1] == 'N') {
1387                     num_ns++;
1388                     if (num_ns >= 5 && i < 10) {
1389                         end_ambig = true;
1390                         break;
1391                     } else if (num_ns >= 15) {
1392                         end_ambig = true;
1393                         break;
1394                     }
1395                 }
1396             }
1397         }
1398     } catch (exception&) {
1399         // if there are exceptions, cannot perform this calculation
1400     }
1401 }
1402 
1403 
CheckBioseqEndsForNAndGap(const CBioseq_Handle & bsh,EBioseqEndIsType & begin_n,EBioseqEndIsType & begin_gap,EBioseqEndIsType & end_n,EBioseqEndIsType & end_gap,bool & begin_ambig,bool & end_ambig)1404 void CheckBioseqEndsForNAndGap
1405 (const CBioseq_Handle& bsh,
1406  EBioseqEndIsType& begin_n,
1407  EBioseqEndIsType& begin_gap,
1408  EBioseqEndIsType& end_n,
1409  EBioseqEndIsType& end_gap,
1410  bool& begin_ambig,
1411  bool& end_ambig)
1412 {
1413     begin_n = eBioseqEndIsType_None;
1414     begin_gap = eBioseqEndIsType_None;
1415     end_n = eBioseqEndIsType_None;
1416     end_gap = eBioseqEndIsType_None;
1417     begin_ambig = false;
1418     end_ambig = false;
1419     if (!ShouldCheckForNsAndGap(bsh)) {
1420         return;
1421     }
1422 
1423     try {
1424         // check for gap at begining of sequence
1425         CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
1426         CheckBioseqEndsForNAndGap(vec, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
1427     } catch ( exception& ) {
1428         // if there are exceptions, cannot perform this calculation
1429     }
1430 }
1431 
1432 
IsLocFullLength(const CSeq_loc & loc,const CBioseq_Handle & bsh)1433 bool IsLocFullLength (const CSeq_loc& loc, const CBioseq_Handle& bsh)
1434 {
1435     if (loc.IsInt()
1436         && loc.GetInt().GetFrom() == 0
1437         && loc.GetInt().GetTo() == bsh.GetInst_Length() - 1) {
1438         return true;
1439     } else {
1440         return false;
1441     }
1442 }
1443 
1444 
PartialsSame(const CSeq_loc & loc1,const CSeq_loc & loc2)1445 bool PartialsSame (const CSeq_loc& loc1, const CSeq_loc& loc2)
1446 {
1447     bool loc1_partial_start =
1448         loc1.IsPartialStart(eExtreme_Biological);
1449     bool loc1_partial_stop =
1450         loc1.IsPartialStop(eExtreme_Biological);
1451     bool loc2_partial_start =
1452         loc2.IsPartialStart(eExtreme_Biological);
1453     bool loc2_partial_stop =
1454         loc2.IsPartialStop(eExtreme_Biological);
1455     if (loc1_partial_start == loc2_partial_start  &&
1456         loc1_partial_stop == loc2_partial_stop) {
1457         return true;
1458     } else {
1459         return false;
1460     }
1461 }
1462 
1463 
1464 
1465 
1466 // Code for finding duplicate features
s_IsSameStrand(const CSeq_loc & l1,const CSeq_loc & l2,CScope & scope)1467 bool s_IsSameStrand(const CSeq_loc& l1, const CSeq_loc& l2, CScope& scope)
1468 {
1469     ENa_strand s1 = sequence::GetStrand(l1, &scope);
1470     ENa_strand s2 = sequence::GetStrand(l2, &scope);
1471     if ((s1 == eNa_strand_minus && s2 == eNa_strand_minus)
1472         || (s1 != eNa_strand_minus && s2 != eNa_strand_minus)) {
1473         return true;
1474     } else {
1475         return false;
1476     }
1477 }
1478 
1479 
1480 inline
s_IsSameSeqAnnot(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2,bool & diff_descriptions)1481 bool s_IsSameSeqAnnot(const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2, bool& diff_descriptions)
1482 {
1483     const auto& annot1 = f1.GetAnnot();
1484     const auto& annot2 = f2.GetAnnot();
1485     bool rval = annot1 == annot2;
1486     diff_descriptions = false;
1487     if (!rval) {
1488         if ((!annot1.Seq_annot_IsSetDesc() || annot1.Seq_annot_GetDesc().Get().empty()) &&
1489             (!annot2.Seq_annot_IsSetDesc() || annot2.Seq_annot_GetDesc().Get().empty())) {
1490             // neither is set
1491             diff_descriptions = false;
1492         } else if (annot1.Seq_annot_IsSetDesc() && annot2.Seq_annot_IsSetDesc()) {
1493             // both are set - are they different?
1494             const auto d1 = annot1.Seq_annot_GetDesc().Get().front();
1495             const auto d2 = annot2.Seq_annot_GetDesc().Get().front();
1496             if (d1->Which() != d2->Which()) {
1497                 diff_descriptions = true;
1498             } else {
1499                 if (d1->IsName()
1500                     && NStr::EqualNocase(d1->GetName(), d2->GetName())) {
1501                     diff_descriptions = false;
1502                 } else if (d1->IsTitle()
1503                     && NStr::EqualNocase(d1->GetTitle(), d2->GetTitle())) {
1504                     diff_descriptions = false;
1505                 } else {
1506                     diff_descriptions = true;
1507                 }
1508             }
1509         } else {
1510             diff_descriptions = true;
1511         }
1512     }
1513     return rval;
1514 }
1515 
1516 
s_AreGBQualsIdentical(const CSeq_feat_Handle & feat1,const CSeq_feat_Handle & feat2,bool case_sensitive)1517 bool s_AreGBQualsIdentical(const CSeq_feat_Handle& feat1, const CSeq_feat_Handle& feat2, bool case_sensitive)
1518 {
1519     if (!feat1.IsSetQual() || !feat2.IsSetQual()) {
1520         return true;
1521     }
1522 
1523     bool rval = true;
1524 
1525     CSeq_feat::TQual::const_iterator gb1 = feat1.GetQual().begin();
1526     CSeq_feat::TQual::const_iterator gb1_end = feat1.GetQual().end();
1527     CSeq_feat::TQual::const_iterator gb2 = feat2.GetQual().begin();
1528     CSeq_feat::TQual::const_iterator gb2_end = feat2.GetQual().end();
1529 
1530     while ((gb1 != gb1_end) && (gb2 != gb2_end) && rval) {
1531         if (!(*gb1)->IsSetQual()) {
1532             if ((*gb2)->IsSetQual()) {
1533                 rval = false;
1534             }
1535         } else if (!(*gb2)->IsSetQual()) {
1536             rval = false;
1537         } else if (!NStr::Equal ((*gb1)->GetQual(), (*gb2)->GetQual())) {
1538             rval = false;
1539         }
1540         if (rval) {
1541             string v1 = (*gb1)->IsSetVal() ? (*gb1)->GetVal() : "";
1542             string v2 = (*gb2)->IsSetVal() ? (*gb2)->GetVal() : "";
1543             NStr::TruncateSpacesInPlace(v1);
1544             NStr::TruncateSpacesInPlace(v2);
1545             rval = NStr::Equal(v1, v2, case_sensitive ? NStr::eCase : NStr::eNocase);
1546         }
1547         ++gb1;
1548         ++gb2;
1549     }
1550     if (gb1 != gb1_end || gb2 != gb2_end) {
1551         rval = false;
1552     }
1553 
1554     return rval;
1555 }
1556 
1557 
s_AreFeatureLabelsSame(const CSeq_feat_Handle & feat,const CSeq_feat_Handle & prev,bool case_sensitive)1558 bool s_AreFeatureLabelsSame(const CSeq_feat_Handle& feat, const CSeq_feat_Handle& prev, bool case_sensitive)
1559 {
1560     if (!feat.GetData().Equals(prev.GetData())) {
1561         return false;
1562     }
1563 
1564     // compare labels and comments
1565     bool same_label = true;
1566     const string& curr_comment =
1567         feat.IsSetComment() ? feat.GetComment() : kEmptyStr;
1568     const string& prev_comment =
1569         prev.IsSetComment() ? prev.GetComment() : kEmptyStr;
1570     string curr_label;
1571     string prev_label;
1572 
1573     feature::GetLabel(*(feat.GetSeq_feat()),
1574         &curr_label, feature::fFGL_Content, &(feat.GetScope()));
1575     feature::GetLabel(*(prev.GetSeq_feat()),
1576         &prev_label, feature::fFGL_Content, &(prev.GetScope()));
1577 
1578     bool comments_same = NStr::Equal(curr_comment, prev_comment, case_sensitive ? NStr::eCase : NStr::eNocase);
1579     bool labels_same = NStr::Equal(curr_label, prev_label, case_sensitive ? NStr::eCase : NStr::eNocase);
1580 
1581     if (!comments_same || !labels_same) {
1582         same_label = false;
1583     } else if (!s_AreGBQualsIdentical(feat, prev, case_sensitive)) {
1584         same_label = false;
1585     }
1586     return same_label;
1587 }
1588 
1589 
s_IsDifferentDbxrefs(const TDbtags & list1,const TDbtags & list2)1590 bool s_IsDifferentDbxrefs(const TDbtags& list1, const TDbtags& list2)
1591 {
1592     if (list1.empty()  ||  list2.empty()) {
1593         return false;
1594     } else if (list1.size() != list2.size()) {
1595         return true;
1596     }
1597 
1598     TDbtags::const_iterator it1 = list1.begin();
1599     TDbtags::const_iterator it2 = list2.begin();
1600     for (; it1 != list1.end(); ++it1, ++it2) {
1601         if (!NStr::EqualNocase((*it1)->GetDb(), (*it2)->GetDb())) {
1602             return true;
1603         }
1604         string str1 =
1605             (*it1)->GetTag().IsStr() ? (*it1)->GetTag().GetStr() : "";
1606         string str2 =
1607             (*it2)->GetTag().IsStr() ? (*it2)->GetTag().GetStr() : "";
1608         if ( str1.empty()  &&  str2.empty() ) {
1609             if (!(*it1)->GetTag().IsId()  &&  !(*it2)->GetTag().IsId()) {
1610                 continue;
1611             } else if ((*it1)->GetTag().IsId()  &&  (*it2)->GetTag().IsId()) {
1612                 if ((*it1)->GetTag().GetId() != (*it2)->GetTag().GetId()) {
1613                     return true;
1614                 }
1615             } else {
1616                 return true;
1617             }
1618         } else if (!str1.empty() && !str2.empty() && !NStr::EqualNocase(str1, str2)) {
1619             return true;
1620         }
1621     }
1622     return false;
1623 }
1624 
1625 
s_AreFullLengthCodingRegionsWithDifferentFrames(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2)1626 bool s_AreFullLengthCodingRegionsWithDifferentFrames (const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2)
1627 {
1628     const auto & f1data = f1.GetData();
1629     const auto & f2data = f2.GetData();
1630     if (!f1data.IsCdregion() || !f2data.IsCdregion()) {
1631         return false;
1632     }
1633     const auto & cd1 = f1data.GetCdregion();
1634     const auto & cd2 = f2data.GetCdregion();
1635 
1636     int frame1 = 1, frame2 = 1;
1637     if (cd1.IsSetFrame()) {
1638         frame1 = cd1.GetFrame();
1639         if (frame1 == 0) {
1640             frame1 = 1;
1641         }
1642     }
1643     if (cd2.IsSetFrame()) {
1644         frame2 = cd2.GetFrame();
1645         if (frame2 == 0) {
1646             frame2 = 1;
1647         }
1648     }
1649     if (frame1 == frame2) {
1650         return false;
1651     }
1652 
1653     CBioseq_Handle bsh1 = f1.GetScope().GetBioseqHandle(f1.GetLocation());
1654     if (!IsLocFullLength (f1.GetLocation(), bsh1)) {
1655         return false;
1656     }
1657     CBioseq_Handle bsh2 = f2.GetScope().GetBioseqHandle(f2.GetLocation());
1658     if (!IsLocFullLength (f2.GetLocation(), bsh2)) {
1659         return false;
1660     }
1661 
1662     return true;
1663 }
1664 
1665 
1666 //LCOV_EXCL_START
1667 // never used, because different variations generate different labels
s_ReplaceListFromQuals(const CSeq_feat::TQual & quals)1668 string s_ReplaceListFromQuals(const CSeq_feat::TQual& quals)
1669 {
1670     string replace;
1671     ITERATE(CSeq_feat::TQual, q, quals) {
1672         if ((*q)->IsSetQual() && NStr::Equal((*q)->GetQual(), "replace") && (*q)->IsSetVal()) {
1673             if (NStr::IsBlank((*q)->GetVal())) {
1674                 replace += " ";
1675             } else {
1676                 replace += (*q)->GetVal();
1677             }
1678             replace += ".";
1679         }
1680     }
1681     return replace;
1682 }
1683 
1684 
s_AreDifferentVariations(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2)1685 bool s_AreDifferentVariations(const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2)
1686 {
1687     if (f1.GetData().GetSubtype() != CSeqFeatData::eSubtype_variation
1688         || f2.GetData().GetSubtype() != CSeqFeatData::eSubtype_variation) {
1689         return false;
1690     }
1691     if (!f1.IsSetQual() || !f2.IsSetQual()) {
1692         return false;
1693     }
1694     string replace1 = s_ReplaceListFromQuals(f1.GetQual());
1695     string replace2 = s_ReplaceListFromQuals(f2.GetQual());
1696 
1697     if (!NStr::Equal(replace1, replace2)) {
1698         return true;
1699     } else {
1700         return false;
1701     }
1702 }
1703 //LCOV_EXCL_STOP
1704 
1705 
1706 typedef vector<CConstRef<CObject_id> > TFeatIdVec;
s_AreLinkedToDifferentFeats(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2,CSeqFeatData::ESubtype s1,CSeqFeatData::ESubtype s2)1707 static bool s_AreLinkedToDifferentFeats (const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2, CSeqFeatData::ESubtype s1, CSeqFeatData::ESubtype s2)
1708 {
1709     bool rval = false;
1710 
1711     if (f1.GetData().GetSubtype() == s1 && f2.GetData().GetSubtype() == s1) {
1712         CScope& scope = f1.GetScope();
1713         const CSeq_loc& loc = f1.GetLocation();
1714         CBioseq_Handle bsh = BioseqHandleFromLocation (&scope, loc);
1715         if (bsh) {
1716             const CTSE_Handle& tse = bsh.GetTSE_Handle();
1717             TFeatIdVec mrna1_id;
1718             TFeatIdVec mrna2_id;
1719             list<CSeq_feat_Handle> mrna1;
1720             list<CSeq_feat_Handle> mrna2;
1721 
1722             FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, *(f1.GetSeq_feat())) {
1723                 if ((*itx)->IsSetId() && (*itx)->GetId().IsLocal()) {
1724                     const CObject_id& feat_id = (*itx)->GetId().GetLocal();
1725                     vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, feat_id);
1726                     ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1727                         if (feat_it->IsSetData()
1728                             && feat_it->GetData().GetSubtype() == s2) {
1729                             mrna1.push_back(*feat_it);
1730                             CConstRef<CObject_id> f(&feat_id);
1731                             mrna1_id.push_back (f);
1732                             break;
1733                         }
1734                     }
1735                 }
1736             }
1737             FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, *(f2.GetSeq_feat())) {
1738                 if ((*itx)->IsSetId() && (*itx)->GetId().IsLocal()) {
1739                     const CObject_id& feat_id = (*itx)->GetId().GetLocal();
1740                     vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, feat_id);
1741                     ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1742                         if (feat_it->IsSetData()
1743                             && feat_it->GetData().GetSubtype() == s2) {
1744                             mrna2.push_back(*feat_it);
1745                             CConstRef<CObject_id> f(&feat_id);
1746                             mrna2_id.push_back (f);
1747                         }
1748                     }
1749                 }
1750             }
1751 
1752             if (mrna1_id.size() > 0 && mrna2_id.size() > 0) {
1753                 rval = true;
1754                 for (auto i1 = mrna1_id.begin(); i1 != mrna1_id.end(); ++i1) {
1755                     for (auto i2 = mrna2_id.begin(); i2 != mrna2_id.end(); ++i2) {
1756                         if ((*i1)->Equals(**i2)) {
1757                             rval = false;
1758                             break;
1759                         }
1760                     }
1761                     if (!rval) {
1762                         break;
1763                     }
1764                 }
1765 
1766                 if (rval) { // Check that locations aren't the same
1767                     const CSeq_feat_Handle fh1 = mrna1.front();
1768                     const CSeq_feat_Handle fh2 = mrna2.front();
1769 
1770 
1771                     if (s_IsSameStrand(fh1.GetLocation(),
1772                                        fh2.GetLocation(),
1773                                        fh1.GetScope())
1774                       && (sequence::Compare(fh1.GetLocation(),
1775                                            fh2.GetLocation(),
1776                                            &(fh1.GetScope()),
1777                                            sequence::fCompareOverlapping) == sequence::eSame)) {
1778                         rval = false;
1779                     }
1780                 }
1781             }
1782         }
1783     }
1784     return rval;
1785 }
1786 
1787 
1788 
1789 
s_AreCodingRegionsLinkedToDifferentmRNAs(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2)1790 static bool s_AreCodingRegionsLinkedToDifferentmRNAs (const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2)
1791 {
1792     return s_AreLinkedToDifferentFeats (f1, f2, CSeqFeatData::eSubtype_cdregion, CSeqFeatData::eSubtype_mRNA);
1793 }
1794 
1795 
s_AremRNAsLinkedToDifferentCodingRegions(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2)1796 bool s_AremRNAsLinkedToDifferentCodingRegions (const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2)
1797 {
1798     return s_AreLinkedToDifferentFeats (f1, f2, CSeqFeatData::eSubtype_mRNA, CSeqFeatData::eSubtype_cdregion);
1799 }
1800 
1801 
IsDicistronicGene(const CSeq_feat_Handle & f)1802 bool IsDicistronicGene(const CSeq_feat_Handle& f)
1803 {
1804     if ( f.GetData().GetSubtype() != CSeqFeatData::eSubtype_gene ) return false;
1805     return IsDicistronic(f);
1806 }
1807 
1808 
IsDicistronic(const CSeq_feat_Handle & f)1809 bool IsDicistronic(const CSeq_feat_Handle& f)
1810 {
1811     if (!f.IsSetExcept()) return false;
1812     if (!f.IsSetExcept_text()) return false;
1813 
1814     const string& except_text = f.GetExcept_text();
1815     if (NStr::FindNoCase(except_text, "dicistronic gene") == NPOS) return false;
1816 
1817     return true;
1818 }
1819 
1820 
1821 EDuplicateFeatureType
IsDuplicate(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2,bool check_partials,bool case_sensitive)1822 IsDuplicate
1823 (const CSeq_feat_Handle& f1,
1824  const CSeq_feat_Handle& f2,
1825  bool check_partials,
1826  bool case_sensitive)
1827 {
1828 
1829     EDuplicateFeatureType dup_type = eDuplicate_Not;
1830 
1831     // subtypes
1832     CSeqFeatData::ESubtype feat1_subtype = f1.GetData().GetSubtype();
1833     CSeqFeatData::ESubtype feat2_subtype = f2.GetData().GetSubtype();
1834 
1835     // not duplicates if not the same subtype
1836     if (feat1_subtype != feat2_subtype) {
1837         return eDuplicate_Not;
1838     }
1839 
1840     // locations
1841     const CSeq_loc& feat1_loc = f1.GetLocation();
1842     const CSeq_loc& feat2_loc = f2.GetLocation();
1843 
1844     // not duplicates if not the same location and strand
1845     if (!s_IsSameStrand(feat1_loc, feat2_loc, f1.GetScope())  ||
1846         sequence::Compare(feat1_loc, feat2_loc, &(f1.GetScope()),
1847                             sequence::fCompareOverlapping) != sequence::eSame) {
1848         return eDuplicate_Not;
1849     }
1850 
1851     // same annot?
1852     bool diff_annot_desc = false;
1853     bool same_annot = s_IsSameSeqAnnot(f1, f2, diff_annot_desc);
1854 
1855     if (diff_annot_desc) {
1856         // don't report if features on different annots with different titles or names
1857         return eDuplicate_Not;
1858     }
1859 
1860     // compare labels and comments
1861     bool same_label = s_AreFeatureLabelsSame (f1, f2, case_sensitive);
1862 
1863     // compare dbxrefs
1864     bool different_dbxrefs = (f1.IsSetDbxref() && f2.IsSetDbxref() &&
1865                         s_IsDifferentDbxrefs(f1.GetDbxref(), f2.GetDbxref()));
1866 
1867     if ( feat1_subtype == CSeqFeatData::eSubtype_region && different_dbxrefs) {
1868         return eDuplicate_Not;
1869     }
1870 
1871     // check for frame difference
1872     bool full_length_coding_regions_with_different_frames =
1873                       s_AreFullLengthCodingRegionsWithDifferentFrames(f1, f2);
1874     if (!same_label && full_length_coding_regions_with_different_frames) {
1875         // do not report if both coding regions are full length, have different products,
1876         // and have different frames
1877         return eDuplicate_Not;
1878     }
1879 
1880     if ((feat1_subtype == CSeqFeatData::eSubtype_variation && !same_label) || s_AreDifferentVariations(f1, f2)) {
1881         // don't report variations if replace quals are different or labels are different
1882         return eDuplicate_Not;
1883     }
1884 
1885 
1886     if (s_AreCodingRegionsLinkedToDifferentmRNAs(f1, f2)) {
1887         // do not report if features are coding regions linked to different mRNAs
1888         return eDuplicate_Not;
1889     }
1890 
1891 
1892     if (s_AremRNAsLinkedToDifferentCodingRegions(f1, f2)) {
1893         // do not report if features are mRNAs linked to different coding regions
1894         return eDuplicate_Not;
1895     }
1896 
1897 
1898     // only report pubs if they have the same label
1899     if (feat1_subtype == CSeqFeatData::eSubtype_pub && !same_label) {
1900         return eDuplicate_Not;
1901     }
1902 
1903     bool partials_ok = (!check_partials || PartialsSame(feat1_loc, feat2_loc));
1904 
1905     if (!partials_ok) {
1906         return eDuplicate_Not;
1907     }
1908 
1909     if ( same_annot ) {
1910         if (same_label) {
1911             dup_type = eDuplicate_Duplicate;
1912         } else {
1913             dup_type = eDuplicate_SameIntervalDifferentLabel;
1914         }
1915     } else {
1916         if (same_label) {
1917             dup_type = eDuplicate_DuplicateDifferentTable;
1918         } else if ( feat2_subtype != CSeqFeatData::eSubtype_pub ) {
1919             dup_type = eDuplicate_SameIntervalDifferentLabelDifferentTable;
1920         }
1921     }
1922 
1923     return dup_type;
1924 }
1925 
1926 // specific-host functions
1927 
IsCommonName(const CT3Data & data)1928 bool IsCommonName (const CT3Data& data)
1929 {
1930     bool is_common = false;
1931 
1932     if (data.IsSetStatus()) {
1933         ITERATE (CT3Reply::TData::TStatus, status_it, data.GetStatus()) {
1934             if ((*status_it)->IsSetProperty()
1935                 && NStr::Equal((*status_it)->GetProperty(), "old_name_class", NStr::eNocase)) {
1936                 if ((*status_it)->IsSetValue() && (*status_it)->GetValue().IsStr()) {
1937                     string value_str = (*status_it)->GetValue().GetStr();
1938                     if (NStr::Equal(value_str, "common name", NStr::eCase)
1939                         || NStr::Equal(value_str, "genbank common name", NStr::eCase)) {
1940                         is_common = true;
1941                         break;
1942                     }
1943                 }
1944             }
1945         }
1946     }
1947     return is_common;
1948 }
1949 
HasMisSpellFlag(const CT3Data & data)1950 bool HasMisSpellFlag (const CT3Data& data)
1951 {
1952     bool has_misspell_flag = false;
1953 
1954     if (data.IsSetStatus()) {
1955         ITERATE (CT3Reply::TData::TStatus, status_it, data.GetStatus()) {
1956             if ((*status_it)->IsSetProperty()) {
1957                 string prop = (*status_it)->GetProperty();
1958                 if (NStr::EqualNocase(prop, "misspelled_name")) {
1959                     has_misspell_flag = true;
1960                     break;
1961                 }
1962             }
1963         }
1964     }
1965     return has_misspell_flag;
1966 }
1967 
1968 
FindMatchInOrgRef(const string & str,const COrg_ref & org)1969 bool FindMatchInOrgRef (const string& str, const COrg_ref& org)
1970 {
1971     string match;
1972 
1973     if (NStr::IsBlank(str)) {
1974         // do nothing;
1975     } else if (org.IsSetTaxname() && NStr::EqualNocase(str, org.GetTaxname())) {
1976         match = org.GetTaxname();
1977     } else if (org.IsSetCommon() && NStr::EqualNocase(str, org.GetCommon())) {
1978         match = org.GetCommon();
1979     } else {
1980         FOR_EACH_SYN_ON_ORGREF (syn_it, org) {
1981             if (NStr::EqualNocase(str, *syn_it)) {
1982                 match = *syn_it;
1983                 break;
1984             }
1985         }
1986         if (NStr::IsBlank(match) && org.IsSetOrgname()) {
1987             const COrgName& orgname = org.GetOrgname();
1988             if (orgname.IsSetMod()) {
1989                 for (const auto& mod_it : orgname.GetMod()) {
1990                     if (mod_it->IsSetSubtype()
1991                         && (mod_it->GetSubtype() == COrgMod::eSubtype_gb_synonym
1992                             || mod_it->GetSubtype() == COrgMod::eSubtype_old_name)
1993                         && mod_it->IsSetSubname()
1994                         && NStr::EqualNocase(str, mod_it->GetSubname())) {
1995                         match = mod_it->GetSubname();
1996                         break;
1997                     }
1998                 }
1999             }
2000         }
2001     }
2002     return NStr::EqualCase(str, match);
2003 }
2004 
2005 
2006 static const string sIgnoreHostWordList[] = {
2007   " cf.",
2008   " cf ",
2009   " aff ",
2010   " aff.",
2011   " near",
2012   " nr.",
2013   " nr "
2014 };
2015 
2016 
2017 static const int kNumIgnoreHostWordList = sizeof (sIgnoreHostWordList) / sizeof (string);
2018 
AdjustSpecificHostForTaxServer(string & spec_host)2019 void AdjustSpecificHostForTaxServer (string& spec_host)
2020 {
2021     for (int i = 0; i < kNumIgnoreHostWordList; i++) {
2022         NStr::ReplaceInPlace(spec_host, sIgnoreHostWordList[i], " ");
2023     }
2024     NStr::ReplaceInPlace(spec_host, "  ", " ");
2025     NStr::TruncateSpacesInPlace(spec_host);
2026 }
2027 
2028 
SpecificHostValueToCheck(const string & val)2029 string SpecificHostValueToCheck(const string& val)
2030 {
2031     if (NStr::IsBlank(val)) {
2032         return val;
2033 #if 0
2034     } else if (! isupper (val.c_str()[0])) {
2035         return kEmptyStr;
2036 #endif
2037     }
2038 
2039     string host = val;
2040     // ignore portion after semicolon
2041     size_t pos = NStr::Find(host, ";");
2042     if (pos != string::npos) {
2043         host = host.substr(0, pos);
2044     }
2045     NStr::TruncateSpacesInPlace(host);
2046     // must have at least two words to check
2047     pos = NStr::Find(host, " "); // combine with next line
2048     if (pos == string::npos) {
2049         return kEmptyStr;
2050     }
2051 
2052     AdjustSpecificHostForTaxServer(host);
2053     pos = NStr::Find(host, " ");
2054     if (NStr::StartsWith(host.substr(pos + 1), "hybrid ")) {
2055         pos += 7;
2056     } else if (NStr::StartsWith(host.substr(pos + 1), "x ")) {
2057         pos += 2;
2058     }
2059     if (! NStr::StartsWith(host.substr(pos + 1), "sp.")
2060         && ! NStr::StartsWith(host.substr(pos + 1), "(")) {
2061         pos = NStr::Find(host, " ", pos + 1);
2062         if (pos != string::npos) {
2063             host = host.substr(0, pos);
2064         }
2065     } else {
2066         host = host.substr(0, pos);
2067     }
2068     return host;
2069 }
2070 
2071 
InterpretSpecificHostResult(const string & host,const CT3Reply & reply,const string & orig_host)2072 string InterpretSpecificHostResult(const string& host, const CT3Reply& reply, const string& orig_host)
2073 {
2074     string err_str;
2075     if (reply.IsError()) {
2076         err_str = "?";
2077         if (reply.GetError().IsSetMessage()) {
2078             err_str = reply.GetError().GetMessage();
2079         }
2080         if(NStr::FindNoCase(err_str, "ambiguous") != string::npos) {
2081             err_str = "Specific host value is ambiguous: " +
2082                 (NStr::IsBlank(orig_host) ? host : orig_host);
2083         } else {
2084             err_str = "Invalid value for specific host: " +
2085                 (NStr::IsBlank(orig_host) ? host : orig_host);
2086         }
2087     } else if (reply.IsData()) {
2088         const auto& rdata = reply.GetData();
2089         if (HasMisSpellFlag(rdata)) {
2090             err_str = "Specific host value is misspelled: " +
2091                 (NStr::IsBlank(orig_host) ? host : orig_host);
2092         } else if (rdata.IsSetOrg()) {
2093             const auto& org = rdata.GetOrg();
2094             if (NStr::StartsWith(org.GetTaxname(), host)) {
2095                 // do nothing, all good
2096             } else if (IsCommonName(rdata)) {
2097                 // not actionable
2098             } else if (FindMatchInOrgRef(host, org)) {
2099                 // replace with synonym
2100                 err_str = "Specific host value is alternate name: " +
2101                     orig_host + " should be " +
2102                     org.GetTaxname();
2103             } else {
2104                 err_str = "Specific host value is incorrectly capitalized: " +
2105                     (NStr::IsBlank(orig_host) ? host : orig_host);
2106             }
2107         } else {
2108             err_str = "Invalid value for specific host: " +
2109                 (NStr::IsBlank(orig_host) ? host : orig_host);
2110         }
2111     }
2112     return err_str;
2113 }
2114 
2115 
IsCommon(const COrg_ref & org,const string & val)2116 bool IsCommon(const COrg_ref& org, const string& val)
2117 {
2118     bool is_common = false;
2119     if (org.IsSetCommon() && NStr::EqualNocase(val, org.GetCommon())) {
2120         // common name, not genus
2121         is_common = true;
2122     } else if (org.IsSetOrgMod()) {
2123         for (auto& it : org.GetOrgname().GetMod()) {
2124             if (it->IsSetSubtype() &&
2125                 it->GetSubtype() == COrgMod::eSubtype_common &&
2126                 it->IsSetSubname() &&
2127                 NStr::EqualNocase(it->GetSubname(), val)) {
2128                 is_common = true;
2129                 break;
2130             }
2131         }
2132     }
2133     return is_common;
2134 }
2135 
2136 
IsLikelyTaxname(const string & val)2137 bool IsLikelyTaxname(const string& val)
2138 {
2139     if (!isalpha(val[0])) {
2140         return false;
2141     }
2142     size_t pos = NStr::Find(val, " ");
2143     if (pos == NPOS) {
2144         return false;
2145     }
2146 
2147     CTaxon1 taxon1;
2148     taxon1.Init();
2149     TTaxId taxid = taxon1.GetTaxIdByName(val.substr(0, pos));
2150     if (taxid == ZERO_TAX_ID || taxid == INVALID_TAX_ID) {
2151         return false;
2152     }
2153 
2154     bool is_species = false;
2155     bool is_uncultured = false;
2156     string blast_name;
2157     CConstRef<COrg_ref> org = taxon1.GetOrgRef(taxid, is_species, is_uncultured, blast_name);
2158     if (org && IsCommon(*org, val.substr(0, pos))) {
2159         return false;
2160     } else {
2161         return true;
2162     }
2163 }
2164 
2165 
2166 //LCOV_EXCL_START
2167 //not used by asnvalidate but used by other applications
IsSpecificHostValid(const string & val,string & error_msg)2168 bool IsSpecificHostValid(const string& val, string& error_msg)
2169 {
2170     CTaxValidationAndCleanup tval;
2171     return tval.IsOneSpecificHostValid(val, error_msg);
2172 }
2173 
2174 
FixSpecificHost(const string & val)2175 string FixSpecificHost(const string& val)
2176 {
2177     string hostfix = val;
2178     validator::CTaxValidationAndCleanup tval;
2179     tval.FixOneSpecificHost(hostfix);
2180 
2181     return hostfix;
2182 }
2183 
2184 
s_ConvertChar(char ch)2185 static char s_ConvertChar(char ch)
2186 {
2187     if (ch < 0x02 || ch > 0x7F) {
2188         // no change
2189     }
2190     else if (isalpha(ch)) {
2191         ch = tolower(ch);
2192     }
2193     else if (isdigit(ch)) {
2194         // no change
2195     }
2196     else if (ch == '\'' || ch == '/' || ch == '@' || ch == '`' || ch == ',') {
2197         // no change
2198     }
2199     else {
2200         ch = 0x20;
2201     }
2202     return ch;
2203 }
2204 
2205 
ConvertToEntrezTerm(string & title)2206 void ConvertToEntrezTerm(string& title)
2207 {
2208     string::iterator s = title.begin();
2209     char p = ' ';
2210     while (s != title.end()) {
2211         *s = s_ConvertChar(*s);
2212         if (isspace(*s) && isspace(p)) {
2213             s = title.erase(s);
2214         }
2215         else {
2216             p = *s;
2217             ++s;
2218         }
2219     }
2220     NStr::TruncateSpacesInPlace(title);
2221 }
2222 //LCOV_EXCL_STOP
2223 
2224 
FixGeneticCode(CCdregion & cdr)2225 void FixGeneticCode(CCdregion& cdr)
2226 {
2227     if (!cdr.IsSetCode()) {
2228         return;
2229     }
2230     const auto& gcode = cdr.GetCode();
2231     CGenetic_code::C_E::TId genCode = 0;
2232     for (auto& it : gcode.Get()) {
2233         if (it->IsId()) {
2234             genCode = it->GetId();
2235         }
2236     }
2237 
2238     if (genCode == 7) {
2239         genCode = 4;
2240     } else if (genCode == 8) {
2241         genCode = 1;
2242     } else if (genCode == 0) {
2243         genCode = 1;
2244     }
2245     cdr.ResetCode();
2246     CRef<CGenetic_code::C_E> new_code(new CGenetic_code::C_E());
2247     new_code->SetId(genCode);
2248     cdr.SetCode().Set().push_back(new_code);
2249 }
2250 
2251 
TranslateCodingRegionForValidation(const CSeq_feat & feat,CScope & scope,bool & alt_start)2252 string TranslateCodingRegionForValidation(const CSeq_feat& feat, CScope &scope, bool& alt_start)
2253 {
2254     string transl_prot;
2255     CRef<CSeq_feat> tmp_cds(new CSeq_feat());
2256     tmp_cds->Assign(feat);
2257     FixGeneticCode(tmp_cds->SetData().SetCdregion());
2258     const CCdregion& cdregion = tmp_cds->GetData().GetCdregion();
2259     const CSeq_loc& cds_loc = tmp_cds->GetLocation();
2260     if (cds_loc.IsWhole()) {
2261         CBioseq_Handle bsh = scope.GetBioseqHandle(cds_loc.GetWhole());
2262         if (!bsh) {
2263             return kEmptyStr;
2264         }
2265         size_t start = 0;
2266         if (cdregion.IsSetFrame()) {
2267             if (cdregion.GetFrame() == 2) {
2268                 start = 1;
2269             } else if (cdregion.GetFrame() == 3) {
2270                 start = 2;
2271             }
2272         }
2273         const CGenetic_code* genetic_code = NULL;
2274         if (cdregion.IsSetCode()) {
2275             genetic_code = &(cdregion.GetCode());
2276         }
2277         CRef<CSeq_id> id(new CSeq_id());
2278         id->Assign(cds_loc.GetWhole());
2279         CRef<CSeq_loc> tmp(new CSeq_loc(*id, start, bsh.GetInst_Length() - 1));
2280         CSeqTranslator::Translate(*tmp, scope, transl_prot, genetic_code, true, false, &alt_start);
2281     } else {
2282         CSeqTranslator::Translate(*tmp_cds, scope, transl_prot,
2283             true,   // include stop codons
2284             false,  // do not remove trailing X/B/Z
2285             &alt_start);
2286     }
2287 
2288     return transl_prot;
2289 }
2290 
2291 
HasBadStartCodon(const CSeq_loc & loc,const string & transl_prot)2292 bool HasBadStartCodon(const CSeq_loc& loc, const string& transl_prot)
2293 {
2294     bool got_dash = (transl_prot[0] == '-');
2295     bool got_x = (transl_prot[0] == 'X'
2296         && !loc.IsPartialStart(eExtreme_Biological));
2297 
2298     if (!got_dash && !got_x) {
2299         return false;
2300     }
2301     return true;
2302 }
2303 
2304 
2305 static const char * kUnclassifiedTranslationDiscrepancy = "unclassified translation discrepancy";
2306 
2307 static const char* const sc_BypassCdsTransCheckText[] = {
2308     "RNA editing",
2309     "adjusted for low-quality genome",
2310     "annotated by transcript or proteomic data",
2311     "rearrangement required for product",
2312     "reasons given in citation",
2313     "translated product replaced",
2314     kUnclassifiedTranslationDiscrepancy
2315 };
2316 typedef CStaticArraySet<const char*, PCase_CStr> TBypassCdsTransCheckSet;
2317 DEFINE_STATIC_ARRAY_MAP(TBypassCdsTransCheckSet, sc_BypassCdsTransCheck, sc_BypassCdsTransCheckText);
2318 
2319 static const char* const sc_ForceCdsTransCheckText[] = {
2320     "artificial frameshift",
2321     "mismatches in translation"
2322 };
2323 typedef CStaticArraySet<const char*, PCase_CStr> TForceCdsTransCheckSet;
2324 DEFINE_STATIC_ARRAY_MAP(TForceCdsTransCheckSet, sc_ForceCdsTransCheck, sc_ForceCdsTransCheckText);
2325 
ReportTranslationErrors(const string & except_text)2326 bool ReportTranslationErrors(const string& except_text)
2327 {
2328     bool report = true;
2329     ITERATE(TBypassCdsTransCheckSet, it, sc_BypassCdsTransCheck) {
2330         if (NStr::FindNoCase(except_text, *it) != NPOS) {
2331             report = false;
2332         }
2333     }
2334     if (!report) {
2335         ITERATE(TForceCdsTransCheckSet, it, sc_ForceCdsTransCheck) {
2336             if (NStr::FindNoCase(except_text, *it) != NPOS) {
2337                 report = true;
2338             }
2339         }
2340     }
2341     return report;
2342 }
2343 
2344 
2345 //LCOV_EXCL_START
2346 //not used by asnvalidate but used by other applications
HasBadStartCodon(const CSeq_feat & feat,CScope & scope,bool ignore_exceptions)2347 bool HasBadStartCodon(const CSeq_feat& feat, CScope& scope, bool ignore_exceptions)
2348 {
2349     if (!feat.IsSetData() || !feat.GetData().IsCdregion()) {
2350         return false;
2351     }
2352     // do not validate for pseudo gene
2353     FOR_EACH_GBQUAL_ON_FEATURE(it, feat) {
2354         if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
2355             return false;
2356         }
2357     }
2358 
2359     if (!ignore_exceptions && feat.CanGetExcept() && feat.GetExcept() &&
2360         feat.CanGetExcept_text()) {
2361         if (!ReportTranslationErrors(feat.GetExcept_text())) {
2362             return false;
2363         }
2364     }
2365 
2366     bool alt_start = false;
2367     string transl_prot;
2368     try {
2369         transl_prot = TranslateCodingRegionForValidation(feat, scope, alt_start);
2370     } catch (CException& ) {
2371         return false;
2372     }
2373     return HasBadStartCodon(feat.GetLocation(), transl_prot);
2374 }
2375 //LCOV_EXCL_STOP
2376 
2377 
CountInternalStopCodons(const string & transl_prot)2378 size_t CountInternalStopCodons(const string& transl_prot)
2379 {
2380     if (NStr::IsBlank(transl_prot)) {
2381         return 0;
2382     }
2383     // count internal stops and Xs
2384     size_t internal_stop_count = 0;
2385 
2386     ITERATE(string, it, transl_prot) {
2387         if (*it == '*') {
2388             ++internal_stop_count;
2389         }
2390     }
2391     // if stop at end, reduce count by one (since one of the stops counted isn't internal)
2392     if (transl_prot[transl_prot.length() - 1] == '*') {
2393         --internal_stop_count;
2394     }
2395     return internal_stop_count;
2396 }
2397 
2398 
2399 //LCOV_EXCL_START
2400 //not used by asnvalidate but used by other applications
HasInternalStop(const CSeq_feat & feat,CScope & scope,bool ignore_exceptions)2401 bool HasInternalStop(const CSeq_feat& feat, CScope& scope, bool ignore_exceptions)
2402 {
2403     if (!feat.IsSetData() || !feat.GetData().IsCdregion()) {
2404         return false;
2405     }
2406     // do not validate for pseudo gene
2407     FOR_EACH_GBQUAL_ON_FEATURE(it, feat) {
2408         if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
2409             return false;
2410         }
2411     }
2412 
2413     if (!ignore_exceptions && feat.CanGetExcept() && feat.GetExcept() &&
2414         feat.CanGetExcept_text()) {
2415         const string& except_text = feat.GetExcept_text();
2416         if (NStr::Find(except_text, kUnclassifiedTranslationDiscrepancy) == string::npos
2417             && !ReportTranslationErrors(feat.GetExcept_text())) {
2418             return false;
2419         }
2420     }
2421 
2422     bool alt_start = false;
2423     string transl_prot;
2424     try {
2425         transl_prot = TranslateCodingRegionForValidation(feat, scope, alt_start);
2426     } catch (CException& ) {
2427         return false;
2428     }
2429 
2430     size_t internal_stop_codons = CountInternalStopCodons(transl_prot);
2431     if (internal_stop_codons > 0) {
2432         return true;
2433     } else {
2434         return false;
2435     }
2436 }
2437 //LCOV_EXCL_STOP
2438 
2439 
MakeSeqVectorForResidueCounting(const CBioseq_Handle & bsh)2440 CRef<CSeqVector> MakeSeqVectorForResidueCounting(const CBioseq_Handle& bsh)
2441 {
2442     CRef<CSeqVector> sv(new CSeqVector(bsh, CBioseq_Handle::eCoding_Iupac));
2443     CSeq_data::E_Choice seqtyp = bsh.GetInst().IsSetSeq_data() ?
2444         bsh.GetInst().GetSeq_data().Which() : CSeq_data::e_not_set;
2445     if (seqtyp == CSeq_data::e_Ncbieaa || seqtyp == CSeq_data::e_Ncbistdaa) {
2446         sv->SetCoding(CSeq_data::e_Ncbieaa);
2447     }
2448     return sv;
2449 }
2450 
2451 
HasBadProteinStart(const CSeqVector & sv)2452 bool HasBadProteinStart(const CSeqVector& sv)
2453 {
2454     if (sv.size() < 1) {
2455         return false;
2456     } else if (sv.IsInGap(0) || sv[0] == '-') {
2457         return true;
2458     } else {
2459         return false;
2460     }
2461 }
2462 
2463 
2464 //LCOV_EXCL_START
2465 //not used by asnvalidate but used by other applications
HasBadProteinStart(const CSeq_feat & feat,CScope & scope)2466 bool HasBadProteinStart(const CSeq_feat& feat, CScope& scope)
2467 {
2468     if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
2469         !feat.IsSetProduct()) {
2470         return false;
2471     }
2472     // use try catch for those weird situations where the product is
2473     // not specified as a single product sequence (in which case we
2474     // should just skip this test)
2475     try {
2476         CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetProduct());
2477         if (!bsh.IsAa()) {
2478             return false;
2479         }
2480         CConstRef<CSeqVector> sv = MakeSeqVectorForResidueCounting(bsh);
2481         return HasBadProteinStart(*sv);
2482     } catch (CException& ) {
2483         return false;
2484     }
2485 }
2486 //LCOV_STOP
2487 
2488 
CountProteinStops(const CSeqVector & sv)2489 size_t CountProteinStops(const CSeqVector& sv)
2490 {
2491     size_t terminations = 0;
2492 
2493     for (CSeqVector_CI sv_iter(sv); (sv_iter); ++sv_iter) {
2494         if (*sv_iter == '*') {
2495             terminations++;
2496         }
2497     }
2498     return terminations;
2499 }
2500 
2501 
2502 //LCOV_EXCL_START
2503 //not used by asnvalidate but used by other applications
HasStopInProtein(const CSeq_feat & feat,CScope & scope)2504 bool HasStopInProtein(const CSeq_feat& feat, CScope& scope)
2505 {
2506     if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
2507         !feat.IsSetProduct()) {
2508         return false;
2509     }
2510     // use try catch for those weird situations where the product is
2511     // not specified as a single product sequence (in which case we
2512     // should just skip this test)
2513     try {
2514         CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetProduct());
2515         if (!bsh.IsAa()) {
2516             return false;
2517         }
2518         CConstRef<CSeqVector> sv = MakeSeqVectorForResidueCounting(bsh);
2519         if (CountProteinStops(*sv) > 0) {
2520             return true;
2521         } else {
2522             return false;
2523         }
2524     } catch (CException& ) {
2525         return false;
2526     }
2527 }
2528 //LCOV_EXCL_STOP
2529 
2530 
FeatureHasEnds(const CSeq_feat & feat,CScope * scope,bool & no_beg,bool & no_end)2531 void FeatureHasEnds(const CSeq_feat& feat, CScope* scope, bool& no_beg, bool& no_end)
2532 {
2533     unsigned int part_loc = sequence::SeqLocPartialCheck(feat.GetLocation(), scope);
2534     no_beg = false;
2535     no_end = false;
2536 
2537     if (part_loc & sequence::eSeqlocPartial_Start) {
2538         no_beg = true;
2539     }
2540     if (part_loc & sequence::eSeqlocPartial_Stop) {
2541         no_end = true;
2542     }
2543 
2544 
2545     if ((!no_beg || !no_end) && feat.IsSetProduct()) {
2546         unsigned int part_prod = sequence::SeqLocPartialCheck(feat.GetProduct(), scope);
2547         if (part_prod & sequence::eSeqlocPartial_Start) {
2548             no_beg = true;
2549         }
2550         if (part_prod & sequence::eSeqlocPartial_Stop) {
2551             no_end = true;
2552         }
2553     }
2554 }
2555 
2556 
2557 //LCOV_EXCL_START
2558 // not used by asnvalidate but needed for other applications
GetCDSProductSequence(const CSeq_feat & feat,CScope * scope,const CTSE_Handle & tse,bool far_fetch,bool & is_far)2559 CBioseq_Handle GetCDSProductSequence(const CSeq_feat& feat, CScope* scope, const CTSE_Handle & tse, bool far_fetch, bool& is_far)
2560 {
2561     CBioseq_Handle prot_handle;
2562     is_far = false;
2563     if (!feat.IsSetProduct()) {
2564         return prot_handle;
2565     }
2566     const CSeq_id* protid = NULL;
2567     try {
2568         protid = &sequence::GetId(feat.GetProduct(), scope);
2569     } catch (CException&) {}
2570     if (protid != NULL) {
2571         prot_handle = scope->GetBioseqHandleFromTSE(*protid, tse);
2572         if (!prot_handle  &&  far_fetch) {
2573             prot_handle = scope->GetBioseqHandle(*protid);
2574             is_far = true;
2575         }
2576     }
2577     return prot_handle;
2578 }
2579 //LCOV_EXCL_STOP
2580 
2581 
CalculateEffectiveTranslationLengths(const string & transl_prot,const CSeqVector & prot_vec,size_t & len,size_t & prot_len)2582 void CalculateEffectiveTranslationLengths(const string& transl_prot, const CSeqVector& prot_vec, size_t &len, size_t& prot_len)
2583 {
2584     len = transl_prot.length();
2585     prot_len = prot_vec.size();
2586 
2587     if (NStr::EndsWith(transl_prot, "*") && (len == prot_len + 1)) { // ok, got stop
2588         --len;
2589     }
2590     while (len > 0) {
2591         if (transl_prot[len - 1] == 'X') {  //remove terminal X
2592             --len;
2593         } else {
2594             break;
2595         }
2596     }
2597 
2598     // ignore terminal 'X' from partial last codon if present
2599     while (prot_len > 0) {
2600         if (prot_vec[prot_len - 1] == 'X') {  //remove terminal X
2601             --prot_len;
2602         } else {
2603             break;
2604         }
2605     }
2606 }
2607 
2608 
2609 //LCOV_EXCL_START
2610 // not used by asnvalidate but needed for other applications
GetMismatches(const CSeq_feat & feat,const CSeqVector & prot_vec,const string & transl_prot)2611 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CSeqVector& prot_vec, const string& transl_prot)
2612 {
2613     vector<TSeqPos> mismatches;
2614     size_t prot_len;
2615     size_t len;
2616 
2617     CalculateEffectiveTranslationLengths(transl_prot, prot_vec, len, prot_len);
2618 
2619     if (len == prot_len)  {                // could be identical
2620         for (TSeqPos i = 0; i < len; ++i) {
2621             CSeqVectorTypes::TResidue p_res = prot_vec[i];
2622             CSeqVectorTypes::TResidue t_res = transl_prot[i];
2623 
2624             if (t_res != p_res) {
2625                 if (i == 0) {
2626                     bool no_beg, no_end;
2627                     FeatureHasEnds(feat, &(prot_vec.GetScope()), no_beg, no_end);
2628                     if (feat.IsSetPartial() && feat.GetPartial() && (!no_beg) && (!no_end)) {
2629                     } else if (t_res == '-') {
2630                     } else {
2631                         mismatches.push_back(i);
2632                     }
2633                 } else {
2634                     mismatches.push_back(i);
2635                 }
2636             }
2637         }
2638     }
2639     return mismatches;
2640 }
2641 
2642 
GetMismatches(const CSeq_feat & feat,const CBioseq_Handle & prot_handle,const string & transl_prot)2643 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CBioseq_Handle& prot_handle, const string& transl_prot)
2644 {
2645     vector<TSeqPos> mismatches;
2646     // can't check for mismatches unless there is a product
2647     if (!prot_handle || !prot_handle.IsAa()) {
2648         return mismatches;
2649     }
2650 
2651     CSeqVector prot_vec = prot_handle.GetSeqVector();
2652     prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
2653 
2654     return GetMismatches(feat, prot_vec, transl_prot);
2655 }
2656 
2657 
HasNoStop(const CSeq_feat & feat,CScope * scope)2658 bool HasNoStop(const CSeq_feat& feat, CScope* scope)
2659 {
2660     bool no_beg, no_end;
2661     FeatureHasEnds(feat, scope, no_beg, no_end);
2662     if (no_end) {
2663         return false;
2664     }
2665 
2666     string transl_prot;
2667     bool alt_start;
2668     try {
2669         transl_prot = TranslateCodingRegionForValidation(feat, *scope, alt_start);
2670     } catch (CException& ) {
2671     }
2672     if (NStr::EndsWith(transl_prot, "*")) {
2673         return false;
2674     }
2675 
2676     bool show_stop = true;
2677     if (!no_beg && feat.IsSetPartial() && feat.GetPartial()) {
2678         CBioseq_Handle prot_handle;
2679         try {
2680             CBioseq_Handle bsh = scope->GetBioseqHandle(feat.GetLocation());
2681             const CTSE_Handle tse = bsh.GetTSE_Handle();
2682             bool is_far = false;
2683             prot_handle = GetCDSProductSequence(feat, scope, tse, true, is_far);
2684             if (prot_handle) {
2685                 vector<TSeqPos> mismatches = GetMismatches(feat, prot_handle, transl_prot);
2686                 if (mismatches.size() == 0) {
2687                     show_stop = false;
2688                 }
2689             }
2690         } catch (CException& ) {
2691         }
2692     }
2693 
2694     return show_stop;
2695 }
2696 //LCOV_EXCL_STOP
2697 
2698 
IsSequenceFetchable(const CSeq_id & id)2699 bool IsSequenceFetchable(const CSeq_id& id)
2700 {
2701     bool fetchable = false;
2702     try {
2703         CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(id);
2704         CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
2705         scope->AddDefaults();
2706         CBioseq_Handle bsh = scope->GetBioseqHandle(idh);
2707         if (bsh) {
2708             fetchable = true;
2709         }
2710     } catch (CException& ) {
2711     } catch (std::exception &) {
2712     }
2713     return fetchable;
2714 }
2715 
2716 
IsSequenceFetchable(const string & seq_id)2717 bool IsSequenceFetchable(const string& seq_id)
2718 {
2719     bool fetchable = false;
2720     try {
2721         CRef<CSeq_id> id(new CSeq_id(seq_id));
2722         if (id) {
2723             fetchable = IsSequenceFetchable(*id);
2724         }
2725     } catch (CException& ) {
2726     } catch (std::exception &) {
2727     }
2728     return fetchable;
2729 }
2730 
2731 
IsNTNCNWACAccession(const string & acc)2732 bool IsNTNCNWACAccession(const string& acc)
2733 {
2734     if (NStr::StartsWith(acc, "NT_") || NStr::StartsWith(acc, "NC_") ||
2735         NStr::StartsWith(acc, "AC_") || NStr::StartsWith(acc, "NW_")) {
2736         return true;
2737     } else {
2738         return false;
2739     }
2740 }
2741 
2742 
IsNTNCNWACAccession(const CSeq_id & id)2743 bool IsNTNCNWACAccession(const CSeq_id& id)
2744 {
2745     if (id.IsOther() && id.GetOther().IsSetAccession() &&
2746         IsNTNCNWACAccession(id.GetOther().GetAccession())) {
2747         return true;
2748     } else {
2749         return false;
2750     }
2751 }
2752 
2753 
IsNTNCNWACAccession(const CBioseq & seq)2754 bool IsNTNCNWACAccession(const CBioseq& seq)
2755 {
2756     bool is_it = false;
2757     FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2758         if (IsNTNCNWACAccession(**id_it)) {
2759             is_it = true;
2760             break;
2761         }
2762     }
2763     return is_it;
2764 }
2765 
2766 
IsNG(const CSeq_id & id)2767 bool IsNG(const CSeq_id& id)
2768 {
2769     if (id.IsOther() && id.GetOther().IsSetAccession() &&
2770         NStr::StartsWith(id.GetOther().GetAccession(), "NG_")) {
2771         return true;
2772     } else {
2773         return false;
2774     }
2775 }
2776 
2777 
IsNG(const CBioseq & seq)2778 bool IsNG(const CBioseq& seq)
2779 {
2780     bool is_it = false;
2781     FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2782         if (IsNG(**id_it)) {
2783             is_it = true;
2784             break;
2785         }
2786     }
2787     return is_it;
2788 }
2789 
2790 
2791 // See VR-728. These Seq-ids are temporary and will be stripped
2792 // by the ID Load process, so they should not be the only Seq-id
2793 // on a Bioseq, and feature locations should not use these.
IsTemporary(const CSeq_id & id)2794 bool IsTemporary(const CSeq_id& id)
2795 {
2796     if (id.IsGeneral() && id.GetGeneral().IsSetDb()) {
2797         const string& db = id.GetGeneral().GetDb();
2798         if (NStr::EqualNocase(db, "TMSMART") ||
2799             NStr::EqualNocase(db, "NCBIFILE") ||
2800             NStr::EqualNocase(db, "BankIt")) {
2801             return true;
2802         }
2803     }
2804     return false;
2805 }
2806 
2807 
IsOrganelle(int genome)2808 bool IsOrganelle(int genome)
2809 {
2810     bool rval = false;
2811     switch (genome) {
2812     case CBioSource::eGenome_chloroplast:
2813     case CBioSource::eGenome_chromoplast:
2814     case CBioSource::eGenome_kinetoplast:
2815     case CBioSource::eGenome_mitochondrion:
2816     case CBioSource::eGenome_cyanelle:
2817     case CBioSource::eGenome_nucleomorph:
2818     case CBioSource::eGenome_apicoplast:
2819     case CBioSource::eGenome_leucoplast:
2820     case CBioSource::eGenome_proplastid:
2821     case CBioSource::eGenome_hydrogenosome:
2822     case CBioSource::eGenome_chromatophore:
2823     case CBioSource::eGenome_plastid:
2824         rval = true;
2825         break;
2826     default:
2827         rval = false;
2828         break;
2829     }
2830     return rval;
2831 }
2832 
2833 
IsOrganelle(const CBioseq_Handle & seq)2834 bool IsOrganelle(const CBioseq_Handle& seq)
2835 {
2836     if (!seq) {
2837         return false;
2838     }
2839     bool rval = false;
2840     CSeqdesc_CI sd(seq, CSeqdesc::e_Source);
2841     if (sd && sd->GetSource().IsSetGenome() && IsOrganelle(sd->GetSource().GetGenome())) {
2842         rval = true;
2843     }
2844     return rval;
2845 }
2846 
2847 
ConsistentWithA(Char ch)2848 bool ConsistentWithA(Char ch)
2849 
2850 {
2851     return (bool)(strchr("ANRMWHVD", ch) != NULL);
2852 }
2853 
ConsistentWithC(Char ch)2854 bool ConsistentWithC(Char ch)
2855 
2856 {
2857     return (bool)(strchr("CNYMSHBV", ch) != NULL);
2858 }
2859 
ConsistentWithG(Char ch)2860 bool ConsistentWithG(Char ch)
2861 
2862 {
2863     return (bool)(strchr("GNRKSBVD", ch) != NULL);
2864 }
2865 
ConsistentWithT(Char ch)2866 bool ConsistentWithT(Char ch)
2867 
2868 {
2869     return (bool)(strchr("TNYKWHBD", ch) != NULL);
2870 }
2871 
2872 
2873 //LCOV_EXCL_START
2874 //not used by validator, but used by Genome Workbench menu item for
2875 //removing unneccessary exceptions
DoesCodingRegionHaveUnnecessaryException(const CSeq_feat & feat,const CBioseq_Handle & loc_handle,CScope & scope)2876 bool DoesCodingRegionHaveUnnecessaryException(const CSeq_feat& feat, const CBioseq_Handle& loc_handle, CScope& scope)
2877 {
2878     CCDSTranslationProblems problems;
2879     CBioseq_Handle prot_handle;
2880     if (feat.IsSetProduct()) {
2881         prot_handle = scope.GetBioseqHandle(feat.GetProduct());
2882     }
2883 
2884     problems.CalculateTranslationProblems(feat,
2885         loc_handle,
2886         prot_handle,
2887         false,
2888         false,
2889         false,
2890         false,
2891         false,
2892         false,
2893         false,
2894         false,
2895         false,
2896         false,
2897         &scope);
2898 
2899     return (problems.GetTranslationProblemFlags() & CCDSTranslationProblems::eCDSTranslationProblem_UnnecessaryException);
2900 }
2901 
2902 
DoesmRNAHaveUnnecessaryException(const CSeq_feat & feat,const CBioseq_Handle & nuc,CScope & scope)2903 bool DoesmRNAHaveUnnecessaryException(const CSeq_feat& feat, const CBioseq_Handle& nuc, CScope& scope)
2904 {
2905     size_t mismatches = 0;
2906     CBioseq_Handle rna;
2907     if (feat.IsSetProduct()) {
2908         rna = scope.GetBioseqHandle(feat.GetProduct());
2909     }
2910 
2911     size_t problems = GetMRNATranslationProblems
2912         (feat, mismatches, false,
2913         nuc, rna, false, false, false, &scope);
2914 
2915     return (problems & eMRNAProblem_UnnecessaryException);
2916 }
2917 
2918 
DoesFeatureHaveUnnecessaryException(const CSeq_feat & feat,CScope & scope)2919 bool DoesFeatureHaveUnnecessaryException(const CSeq_feat& feat, CScope& scope)
2920 {
2921     if (!feat.IsSetExcept_text()) {
2922         return false;
2923     }
2924     if (!feat.IsSetData()) {
2925         return false;
2926     }
2927     if (!feat.IsSetLocation()) {
2928         return false;
2929     }
2930     try {
2931         CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetLocation());
2932         if (!bsh) {
2933             return false;
2934         }
2935         CSpliceProblems splice_problems;
2936         splice_problems.CalculateSpliceProblems(feat, true, sequence::IsPseudo(feat, scope), bsh);
2937         if (splice_problems.IsExceptionUnnecessary()) {
2938             return true;
2939         }
2940         if (feat.GetData().IsCdregion()) {
2941             return DoesCodingRegionHaveUnnecessaryException(feat, bsh, scope);
2942         } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
2943             return DoesmRNAHaveUnnecessaryException(feat, bsh, scope);
2944         } else {
2945             return false;
2946         }
2947     } catch (CException&) {
2948     }
2949     return false;
2950 }
2951 //LCOV_EXCL_STOP
2952 
2953 
2954 END_SCOPE(validator)
2955 END_SCOPE(objects)
2956 END_NCBI_SCOPE
2957