1 /* $Id: utilities.cpp 632625 2021-06-03 17:38:33Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Mati Shomrat
27 *
28 * File Description:
29 * Implementation of utility classes and functions.
30 *
31 */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <corelib/ncbistr.hpp>
35
36 #include <serial/enumvalues.hpp>
37 #include <serial/serialimpl.hpp>
38
39 #include <objects/seqloc/Seq_id.hpp>
40 #include <objects/seqfeat/SeqFeatData.hpp>
41 #include <objects/seqfeat/Gb_qual.hpp>
42 #include <objects/seqset/Seq_entry.hpp>
43 #include <objects/seqset/Bioseq_set.hpp>
44 #include <objects/seq/Bioseq.hpp>
45 #include <objects/misc/sequence_macros.hpp>
46 #include <objects/taxon3/T3Data.hpp>
47 #include <objects/taxon3/Taxon3_reply.hpp>
48 #include <objmgr/bioseq_handle.hpp>
49 #include <objmgr/scope.hpp>
50 #include <objmgr/seq_vector.hpp>
51 #include <objmgr/util/sequence.hpp>
52 #include <objmgr/util/seq_loc_util.hpp>
53 #include <objmgr/bioseq_ci.hpp>
54 #include <objmgr/seqdesc_ci.hpp>
55 #include <objmgr/align_ci.hpp>
56 #include <objmgr/object_manager.hpp>
57 #include <objects/taxon3/taxon3.hpp>
58 #include <objects/taxon1/taxon1.hpp>
59 #include <objtools/validator/utilities.hpp>
60 #include <objtools/validator/splice_problems.hpp>
61 #include <objtools/validator/translation_problems.hpp>
62 #include <objtools/validator/tax_validation_and_cleanup.hpp>
63
64 #include <vector>
65 #include <algorithm>
66 #include <list>
67
68
69 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)70 BEGIN_SCOPE(objects)
71 BEGIN_SCOPE(validator)
72
73
74 // =============================================================================
75 // Functions
76 // =============================================================================
77
78
79 bool IsClassInEntry(const CSeq_entry& se, CBioseq_set::EClass clss)
80 {
81 for ( CTypeConstIterator <CBioseq_set> si(se); si; ++si ) {
82 if ( si->GetClass() == clss ) {
83 return true;
84 }
85 }
86 return false;
87 }
88
89
IsDeltaOrFarSeg(const CSeq_loc & loc,CScope * scope)90 bool IsDeltaOrFarSeg(const CSeq_loc& loc, CScope* scope)
91 {
92 CBioseq_Handle bsh = BioseqHandleFromLocation(scope, loc);
93 const CSeq_entry& se = *bsh.GetTopLevelEntry().GetCompleteSeq_entry();
94
95 if ( bsh.IsSetInst_Repr() ) {
96 CBioseq_Handle::TInst::TRepr repr = bsh.GetInst_Repr();
97 if ( repr == CSeq_inst::eRepr_delta ) {
98 if ( !IsClassInEntry(se, CBioseq_set::eClass_nuc_prot) ) {
99 return true;
100 }
101 }
102 if ( repr == CSeq_inst::eRepr_seg ) {
103 if ( !IsClassInEntry(se, CBioseq_set::eClass_parts) ) {
104 return true;
105 }
106 }
107 }
108
109 return false;
110 }
111
112
113 // Check if string is either empty or contains just white spaces
IsBlankStringList(const list<string> & str_list)114 bool IsBlankStringList(const list< string >& str_list)
115 {
116 ITERATE( list< string >, str, str_list ) {
117 if ( !NStr::IsBlank(*str) ) {
118 return false;
119 }
120 }
121 return true;
122 }
123
124
GetGIForSeqId(const CSeq_id & id)125 TGi GetGIForSeqId(const CSeq_id& id)
126 {
127 TGi gi = ZERO_GI;
128 CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
129 scope->AddDefaults();
130
131 try {
132 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(id);
133 gi = scope->GetGi (idh);
134 } catch (CException &) {
135 } catch (std::exception &) {
136 }
137 return gi;
138 }
139
140
141
GetSeqIdsForGI(TGi gi)142 CScope::TIds GetSeqIdsForGI(TGi gi)
143 {
144 CScope::TIds id_list;
145 CSeq_id tmp_id;
146 tmp_id.SetGi(gi);
147 CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
148 scope->AddDefaults();
149
150 try {
151 id_list = scope->GetIds(tmp_id);
152
153 } catch (CException &) {
154 } catch (std::exception &) {
155 }
156 return id_list;
157 }
158
IsFarLocation(const CSeq_loc & loc,const CSeq_entry_Handle & seh)159 bool IsFarLocation(const CSeq_loc& loc, const CSeq_entry_Handle& seh)
160 {
161 CScope& scope = seh.GetScope();
162 for ( CSeq_loc_CI citer(loc); citer; ++citer ) {
163 CConstRef<CSeq_id> id(&citer.GetSeq_id());
164 if ( id ) {
165 CBioseq_Handle near_seq = scope.GetBioseqHandleFromTSE(*id, seh);
166 if ( !near_seq ) {
167 return true;
168 }
169 }
170 }
171
172 return false;
173 }
174
GetSequenceStringFromLoc(const CSeq_loc & loc,CScope & scope)175 string GetSequenceStringFromLoc
176 (const CSeq_loc& loc,
177 CScope& scope)
178 {
179 CNcbiOstrstream oss;
180 CFastaOstream fasta_ostr(oss);
181 fasta_ostr.SetFlag(CFastaOstream::fAssembleParts);
182 fasta_ostr.SetFlag(CFastaOstream::fInstantiateGaps);
183 string s;
184
185 try {
186 for (CSeq_loc_CI citer (loc); citer; ++citer) {
187 const CSeq_loc& part = citer.GetEmbeddingSeq_loc();
188 CBioseq_Handle bsh = BioseqHandleFromLocation (&scope, part);
189 if (bsh) {
190 fasta_ostr.WriteSequence (bsh, &part);
191 }
192 }
193 s = CNcbiOstrstreamToString(oss);
194 NStr::ReplaceInPlace(s, "\n", "");
195 } catch (CException&) {
196 s = kEmptyStr;
197 }
198
199 return s;
200 }
201
202
GetSequenceFromLoc(const CSeq_loc & loc,CScope & scope,CBioseq_Handle::EVectorCoding coding)203 CSeqVector GetSequenceFromLoc
204 (const CSeq_loc& loc,
205 CScope& scope,
206 CBioseq_Handle::EVectorCoding coding)
207 {
208 CConstRef<CSeqMap> map =
209 CSeqMap::CreateSeqMapForSeq_loc(loc, &scope);
210 return CSeqVector(*map, scope, coding, eNa_strand_plus);
211 }
212
213
GetSequenceFromFeature(const CSeq_feat & feat,CScope & scope,CBioseq_Handle::EVectorCoding coding,bool product)214 CSeqVector GetSequenceFromFeature
215 (const CSeq_feat& feat,
216 CScope& scope,
217 CBioseq_Handle::EVectorCoding coding,
218 bool product)
219 {
220
221 if ( (product && !feat.CanGetProduct()) ||
222 (!product && !feat.CanGetLocation()) ) {
223 return CSeqVector();
224 }
225
226 const CSeq_loc* loc = product ? &feat.GetProduct() : &feat.GetLocation();
227 return GetSequenceFromLoc(*loc, scope, coding);
228 }
229
230
231 /***** Calculate Accession for a given object *****/
232
233
s_GetBioseqAcc(const CSeq_id & id,int * version)234 static string s_GetBioseqAcc(const CSeq_id& id, int* version)
235 {
236 try {
237 string label;
238 id.GetLabel(&label, version, CSeq_id::eFasta);
239 return label;
240 } catch (CException&) {
241 return kEmptyStr;
242 }
243 }
244
245
s_GetBioseqAcc(const CBioseq_Handle & handle,int * version)246 static string s_GetBioseqAcc(const CBioseq_Handle& handle, int* version)
247 {
248 if (handle) {
249 CConstRef<CSeq_id> seqid = sequence::GetId(handle, sequence::eGetId_Best).GetSeqId();
250 if (seqid) {
251 return s_GetBioseqAcc(*seqid, version);
252 }
253 }
254 return kEmptyStr;
255 }
256
257
s_GetSeq_featAcc(const CSeq_feat & feat,CScope & scope,int * version)258 static string s_GetSeq_featAcc(const CSeq_feat& feat, CScope& scope, int* version)
259 {
260 CBioseq_Handle seq = BioseqHandleFromLocation (&scope, feat.GetLocation());
261 if (seq) {
262 CBioseq_set_Handle parent = seq.GetParentBioseq_set();
263 if (parent && parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_parts) {
264 parent = parent.GetParentBioseq_set();
265 if (parent && parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_segset) {
266 CBioseq_CI m(parent);
267 if (m) {
268 return s_GetBioseqAcc(*m, version);
269 }
270 }
271 }
272 }
273
274 return s_GetBioseqAcc(seq, version);
275 }
276
277
s_GetBioseqAcc(const CBioseq & seq,CScope & scope,int * version)278 static string s_GetBioseqAcc(const CBioseq& seq, CScope& scope, int* version)
279 {
280 CBioseq_Handle handle = scope.GetBioseqHandle(seq);
281 return s_GetBioseqAcc(handle, version);
282 }
283
s_GetSeqFromSet(const CBioseq_set & bsst,CScope & scope)284 static const CBioseq* s_GetSeqFromSet(const CBioseq_set& bsst, CScope& scope)
285 {
286 const CBioseq* retval = NULL;
287
288 switch (bsst.GetClass()) {
289 case CBioseq_set::eClass_gen_prod_set:
290 // find the genomic bioseq
291 FOR_EACH_SEQENTRY_ON_SEQSET (it, bsst) {
292 if ((*it)->IsSeq()) {
293 const CSeq_inst& inst = (*it)->GetSeq().GetInst();
294 if (inst.IsSetMol() && inst.GetMol() == CSeq_inst::eMol_dna) {
295 retval = &(*it)->GetSeq();
296 break;
297 }
298 }
299 }
300 break;
301 case CBioseq_set::eClass_nuc_prot:
302 // find the nucleotide bioseq
303 FOR_EACH_SEQENTRY_ON_SEQSET (it, bsst) {
304 if ((*it)->IsSeq() && (*it)->GetSeq().IsNa()) {
305 retval = &(*it)->GetSeq();
306 break;
307 } else if ((*it)->IsSet() &&
308 (*it)->GetSet().IsSetClass() &&
309 (*it)->GetSet().GetClass() == CBioseq_set::eClass_segset) {
310 retval = s_GetSeqFromSet((*it)->GetSet(), scope);
311 break;
312 }
313 }
314 if (!retval) {
315 FOR_EACH_SEQENTRY_ON_SEQSET (it, bsst) {
316 if ((*it)->IsSeq()) {
317 retval = &(*it)->GetSeq();
318 break;
319 }
320 }
321 }
322 break;
323 case CBioseq_set::eClass_segset:
324 // find the master bioseq
325 FOR_EACH_SEQENTRY_ON_SEQSET (it, bsst) {
326 if ((*it)->IsSeq()) {
327 retval = &(*it)->GetSeq();
328 break;
329 }
330 }
331 break;
332
333 default:
334 // find the first bioseq
335 CTypeConstIterator<CBioseq> seqit(ConstBegin(bsst));
336 if (seqit) {
337 retval = &(*seqit);
338 }
339 break;
340 }
341
342 return retval;
343 }
344
345
s_IsDescOnSeqEntry(const CSeq_entry & entry,const CSeqdesc & desc)346 static bool s_IsDescOnSeqEntry (const CSeq_entry& entry, const CSeqdesc& desc)
347 {
348 if (entry.IsSetDescr()) {
349 const auto& descs = entry.GetDescr();
350 for (auto& it : descs.Get()) {
351 if (it->Equals(desc)) {
352 return true;
353 }
354 }
355 }
356 return false;
357 }
358
359
360
s_GetAccessionForSeqdesc(const CSeq_entry_Handle & seh,const CSeqdesc & desc,CScope & scope,int * version)361 static string s_GetAccessionForSeqdesc (const CSeq_entry_Handle& seh, const CSeqdesc& desc, CScope& scope, int* version)
362 {
363 if (!seh) {
364 return kEmptyStr;\
365 } else if (seh.IsSeq()) {
366 return s_GetBioseqAcc(*(seh.GetSeq().GetCompleteBioseq()), scope, version);
367 } else if (s_IsDescOnSeqEntry (*(seh.GetCompleteSeq_entry()), desc)) {
368 if (seh.IsSeq()) {
369 return s_GetBioseqAcc(*(seh.GetSeq().GetCompleteBioseq()), scope, version);
370 } else if (seh.IsSet()) {
371 const CBioseq* seq = s_GetSeqFromSet(*(seh.GetSet().GetCompleteBioseq_set()), scope);
372 if (seq != NULL) {
373 return s_GetBioseqAcc(*seq, scope, version);
374 }
375 }
376 } else {
377 CSeq_entry_Handle parent = seh.GetParentEntry();
378 if (parent) {
379 return s_GetAccessionForSeqdesc(parent, desc, scope, version);
380 }
381 }
382 return kEmptyStr;
383 }
384
385
IsBioseqInSameSeqEntryAsAlign(const CBioseq_Handle & bsh,const CSeq_align & align,CScope & scope)386 bool IsBioseqInSameSeqEntryAsAlign(const CBioseq_Handle& bsh, const CSeq_align& align, CScope& scope)
387 {
388 CSeq_entry_Handle seh = bsh.GetTopLevelEntry();
389 for (CAlign_CI align_it(seh); align_it; ++align_it) {
390 if (&(*align_it) == &align) {
391 return true;
392 }
393 }
394 return false;
395 }
396
397
GetReportableSeqIdForAlignment(const CSeq_align & align,CScope & scope)398 CConstRef<CSeq_id> GetReportableSeqIdForAlignment(const CSeq_align& align, CScope& scope)
399 {
400 // temporary - to match C Toolkit
401 if (align.IsSetSegs() && align.GetSegs().IsStd()) {
402 return CConstRef<CSeq_id>(NULL);
403 }
404 try {
405 if (align.IsSetDim()) {
406 for (int i = 0; i < align.GetDim(); ++i) {
407 const CSeq_id& id = align.GetSeq_id(i);
408 CBioseq_Handle bsh = scope.GetBioseqHandle(id);
409 if (bsh && IsBioseqInSameSeqEntryAsAlign(bsh, align, scope)) {
410 return CConstRef<CSeq_id>(&id);
411 }
412 }
413 } else if (align.IsSetSegs() && align.GetSegs().IsDendiag()) {
414 const CSeq_id& id = *(align.GetSegs().GetDendiag().front()->GetIds()[0]);
415 return CConstRef<CSeq_id>(&id);
416 }
417 // failed to find resolvable ID, use bare ID
418 const CSeq_id& id = align.GetSeq_id(0);
419 return CConstRef<CSeq_id>(&id);
420 } catch (CException& ) {
421 }
422 return CConstRef<CSeq_id>(NULL);
423 }
424
425
GetAccessionFromObjects(const CSerialObject * obj,const CSeq_entry * ctx,CScope & scope,int * version)426 string GetAccessionFromObjects(const CSerialObject* obj, const CSeq_entry* ctx, CScope& scope, int* version)
427 {
428 string empty_acc;
429
430 if (obj && obj->GetThisTypeInfo() == CSeqdesc::GetTypeInfo() && ctx) {
431 CSeq_entry_Handle seh = scope.GetSeq_entryHandle(*ctx);
432 const CSeqdesc& desc = dynamic_cast<const CSeqdesc&>(*obj);
433 string acc = s_GetAccessionForSeqdesc(seh, desc, scope, version);
434 if (!NStr::IsBlank(acc)) {
435 return acc;
436 }
437 }
438
439 if (ctx) {
440 if (ctx->IsSeq()) {
441 return s_GetBioseqAcc(ctx->GetSeq(), scope, version);
442 } else if (ctx->IsSet()) {
443 const CBioseq* seq = s_GetSeqFromSet(ctx->GetSet(), scope);
444 if (seq != NULL) {
445 return s_GetBioseqAcc(*seq, scope, version);
446 }
447 }
448 } else if (obj) {
449 if (obj->GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
450 const CSeq_feat& feat = dynamic_cast<const CSeq_feat&>(*obj);
451 return s_GetSeq_featAcc(feat, scope, version);
452 } else if (obj->GetThisTypeInfo() == CBioseq::GetTypeInfo()) {
453 const CBioseq& seq = dynamic_cast<const CBioseq&>(*obj);
454 return s_GetBioseqAcc(seq, scope, version);
455 } else if (obj->GetThisTypeInfo() == CBioseq_set::GetTypeInfo()) {
456 const CBioseq_set& bsst = dynamic_cast<const CBioseq_set&>(*obj);
457 const CBioseq* seq = s_GetSeqFromSet(bsst, scope);
458 if (seq != NULL) {
459 return s_GetBioseqAcc(*seq, scope, version);
460 }
461 } else if (obj->GetThisTypeInfo() == CSeq_entry::GetTypeInfo()) {
462 const CSeq_entry& entry = dynamic_cast<const CSeq_entry&>(*obj);
463 if (entry.IsSeq()) {
464 return s_GetBioseqAcc(entry.GetSeq(), scope, version);
465 } else if (entry.IsSet()) {
466 const CBioseq* seq = s_GetSeqFromSet(entry.GetSet(), scope);
467 if (seq != NULL) {
468 return s_GetBioseqAcc(*seq, scope, version);
469 }
470 }
471 } else if (obj->GetThisTypeInfo() == CSeq_annot::GetTypeInfo()) {
472 CSeq_annot_Handle ah = scope.GetSeq_annotHandle (dynamic_cast<const CSeq_annot&>(*obj));
473 if (ah) {
474 CSeq_entry_Handle seh = ah.GetParentEntry();
475 if (seh) {
476 if (seh.IsSeq()) {
477 return s_GetBioseqAcc(seh.GetSeq(), version);
478 } else if (seh.IsSet()) {
479 CBioseq_set_Handle bsh = seh.GetSet();
480 const CBioseq_set& bsst = *(bsh.GetCompleteBioseq_set());
481 const CBioseq* seq = s_GetSeqFromSet(bsst, scope);
482 if (seq != NULL) {
483 return s_GetBioseqAcc(*seq, scope, version);
484 }
485 }
486 }
487 }
488 } else if (obj->GetThisTypeInfo() == CSeq_align::GetTypeInfo()) {
489 const CSeq_align& align = dynamic_cast<const CSeq_align&>(*obj);
490 CConstRef<CSeq_id> id = GetReportableSeqIdForAlignment(align, scope);
491 if (id) {
492 CBioseq_Handle bsh = scope.GetBioseqHandle(*id);
493 if (bsh) {
494 return s_GetBioseqAcc(bsh, version);
495 } else {
496 return s_GetBioseqAcc(*id, version);
497 }
498 }
499 } else if (obj->GetThisTypeInfo() == CSeq_graph::GetTypeInfo()) {
500 const CSeq_graph& graph = dynamic_cast<const CSeq_graph&>(*obj);
501 try {
502 const CSeq_loc& loc = graph.GetLoc();
503 const CSeq_id *id = loc.GetId();
504 if (id) {
505 return s_GetBioseqAcc (*id, version);
506 }
507 } catch (CException& ) {
508 }
509 }
510 }
511 return empty_acc;
512 }
513
514
GetSetParent(const CBioseq_set_Handle & set,CBioseq_set::TClass set_class)515 CBioseq_set_Handle GetSetParent (const CBioseq_set_Handle& set, CBioseq_set::TClass set_class)
516 {
517 CBioseq_set_Handle gps;
518
519 CSeq_entry_Handle parent = set.GetParentEntry();
520 if (!parent) {
521 return gps;
522 } else if (!(parent = parent.GetParentEntry())) {
523 return gps;
524 } else if (!parent.IsSet()) {
525 return gps;
526 } else if (parent.GetSet().IsSetClass() && parent.GetSet().GetClass() == set_class) {
527 return parent.GetSet();
528 } else {
529 return GetSetParent (parent.GetSet(), set_class);
530 }
531 }
532
533
GetSetParent(const CBioseq_Handle & bioseq,CBioseq_set::TClass set_class)534 CBioseq_set_Handle GetSetParent (const CBioseq_Handle& bioseq, CBioseq_set::TClass set_class)
535 {
536 CBioseq_set_Handle set;
537
538 CSeq_entry_Handle parent = bioseq.GetParentEntry();
539 if (!parent) {
540 return set;
541 } else if (!(parent = parent.GetParentEntry())) {
542 return set;
543 } else if (!parent.IsSet()) {
544 return set;
545 } else if (parent.GetSet().IsSetClass() && parent.GetSet().GetClass() == set_class) {
546 return parent.GetSet();
547 } else {
548 return GetSetParent (parent.GetSet(), set_class);
549 }
550 }
551
552
GetGenProdSetParent(const CBioseq_set_Handle & set)553 CBioseq_set_Handle GetGenProdSetParent (const CBioseq_set_Handle& set)
554 {
555 return GetSetParent (set, CBioseq_set::eClass_gen_prod_set);
556 }
557
GetGenProdSetParent(const CBioseq_Handle & bioseq)558 CBioseq_set_Handle GetGenProdSetParent (const CBioseq_Handle& bioseq)
559 {
560 return GetSetParent(bioseq, CBioseq_set::eClass_gen_prod_set);
561 }
562
563
GetNucProtSetParent(const CBioseq_Handle & bioseq)564 CBioseq_set_Handle GetNucProtSetParent (const CBioseq_Handle& bioseq)
565 {
566 return GetSetParent(bioseq, CBioseq_set::eClass_nuc_prot);
567 }
568
569
GetNucBioseq(const CBioseq_set_Handle & bioseq_set)570 CBioseq_Handle GetNucBioseq (const CBioseq_set_Handle& bioseq_set)
571 {
572 CBioseq_Handle nuc;
573
574 if (!bioseq_set) {
575 return nuc;
576 }
577 CBioseq_CI bit(bioseq_set, CSeq_inst::eMol_na);
578 if (bit) {
579 nuc = *bit;
580 } else {
581 CSeq_entry_Handle parent = bioseq_set.GetParentEntry();
582 if (parent && (parent = parent.GetParentEntry())
583 && parent.IsSet()) {
584 nuc = GetNucBioseq (parent.GetSet());
585 }
586 }
587 return nuc;
588 }
589
590
GetNucBioseq(const CBioseq_Handle & bioseq)591 CBioseq_Handle GetNucBioseq (const CBioseq_Handle& bioseq)
592 {
593 CBioseq_Handle nuc;
594
595 if (bioseq.IsNucleotide()) {
596 return bioseq;
597 }
598 CSeq_entry_Handle parent = bioseq.GetParentEntry();
599 if (parent && (parent = parent.GetParentEntry())
600 && parent.IsSet()) {
601 nuc = GetNucBioseq (parent.GetSet());
602 }
603 return nuc;
604 }
605
606
ValidateAccessionString(const string & accession,bool require_version)607 EAccessionFormatError ValidateAccessionString (const string& accession, bool require_version)
608 {
609 if (NStr::IsBlank (accession)) {
610 return eAccessionFormat_null;
611 } else if (accession.length() >= 16) {
612 return eAccessionFormat_too_long;
613 } else if (accession.length() < 3
614 || ! isalpha (accession.c_str()[0])
615 || ! isupper (accession.c_str()[0])) {
616 return eAccessionFormat_no_start_letters;
617 }
618
619 string str = accession;
620 if (NStr::StartsWith (str, "NZ_")) {
621 str = str.substr(3);
622 }
623
624 const char *cp = str.c_str();
625 int numAlpha = 0;
626
627 while (isalpha (*cp)) {
628 numAlpha++;
629 cp++;
630 }
631
632 int numUndersc = 0;
633
634 while (*cp == '_') {
635 numUndersc++;
636 cp++;
637 }
638
639 int numDigits = 0;
640 while (isdigit (*cp)) {
641 numDigits++;
642 cp++;
643 }
644
645 if ((*cp != '\0' && *cp != ' ' && *cp != '.') || numUndersc > 1) {
646 return eAccessionFormat_wrong_number_of_digits;
647 }
648
649 if (require_version) {
650 if (*cp != '.') {
651 return eAccessionFormat_missing_version;
652 }
653 cp++;
654 int numVersion = 0;
655 while (isdigit (*cp)) {
656 numVersion++;
657 cp++;
658 }
659 if (numVersion < 1) {
660 return eAccessionFormat_missing_version;
661 } else if (*cp != '\0' && *cp != ' ') {
662 return eAccessionFormat_bad_version;
663 }
664 }
665
666
667 if (numUndersc == 0) {
668 if ((numAlpha == 1 && numDigits == 5)
669 || (numAlpha == 2 && numDigits == 6)
670 || (numAlpha == 3 && numDigits == 5)
671 || (numAlpha == 4 && numDigits == 8)
672 || (numAlpha == 5 && numDigits == 7)) {
673 return eAccessionFormat_valid;
674 }
675 } else {
676 if (numAlpha != 2 || (numDigits != 6 && numDigits != 8 && numDigits != 9)) {
677 return eAccessionFormat_wrong_number_of_digits;
678 }
679 char first_letter = accession.c_str()[0];
680 char second_letter = accession.c_str()[1];
681 if (first_letter == 'N' || first_letter == 'X' || first_letter == 'Z') {
682 if (second_letter == 'M' || second_letter == 'C'
683 || second_letter == 'T' || second_letter == 'P'
684 || second_letter == 'G' || second_letter == 'R'
685 || second_letter == 'S' || second_letter == 'W'
686 || second_letter == 'Z') {
687 return eAccessionFormat_valid;
688 }
689 }
690 if ((first_letter == 'A' || first_letter == 'Y')
691 && second_letter == 'P') {
692 return eAccessionFormat_valid;
693 }
694 }
695
696 return eAccessionFormat_wrong_number_of_digits;
697 }
698
699
s_FeatureIdsMatch(const CFeat_id & f1,const CFeat_id & f2)700 bool s_FeatureIdsMatch (const CFeat_id& f1, const CFeat_id& f2)
701 {
702 if (!f1.IsLocal() || !f2.IsLocal()) {
703 return false;
704 }
705
706 return 0 == f1.GetLocal().Compare(f2.GetLocal());
707 }
708
709
s_StringHasPMID(const string & str)710 bool s_StringHasPMID (const string& str)
711 {
712 if (NStr::IsBlank (str)) {
713 return false;
714 }
715
716 size_t pos = NStr::Find (str, "(PMID ");
717 if (pos == string::npos) {
718 return false;
719 }
720
721 const char *ptr = str.c_str() + pos + 6;
722 unsigned int numdigits = 0;
723 while (*ptr != 0 && *ptr != ')') {
724 if (isdigit (*ptr)) {
725 numdigits++;
726 }
727 ptr++;
728 }
729
730 if (*ptr == ')' && numdigits > 0) {
731 return true;
732 } else {
733 return false;
734 }
735 }
736
737
HasBadCharacter(const string & str)738 bool HasBadCharacter (const string& str)
739 {
740 if (NStr::Find (str, "?") != string::npos
741 || NStr::Find (str, "!") != string::npos
742 || NStr::Find (str, "~") != string::npos
743 || NStr::Find(str, "|") != string::npos) {
744 return true;
745 } else {
746 return false;
747 }
748 }
749
750
EndsWithBadCharacter(const string & str)751 bool EndsWithBadCharacter (const string& str)
752 {
753 if (NStr::EndsWith (str, "_") || NStr::EndsWith (str, ".")
754 || NStr::EndsWith (str, ",") || NStr::EndsWith (str, ":")
755 || NStr::EndsWith (str, ";")) {
756 return true;
757 } else {
758 return false;
759 }
760 }
761
762
CheckDate(const CDate & date,bool require_full_date)763 int CheckDate (const CDate& date, bool require_full_date)
764 {
765 int rval = eDateValid_valid;
766
767 if (date.IsStr()) {
768 if (NStr::IsBlank (date.GetStr()) || NStr::Equal (date.GetStr(), "?")) {
769 rval |= eDateValid_bad_str;
770 }
771 } else if (date.IsStd()) {
772 const auto& sdate = date.GetStd();
773 if (!sdate.IsSetYear() || sdate.GetYear() == 0) {
774 rval |= eDateValid_bad_year;
775 }
776 if (sdate.IsSetMonth() && sdate.GetMonth() > 12) {
777 rval |= eDateValid_bad_month;
778 }
779 if (sdate.IsSetDay() && sdate.GetDay() > 31) {
780 rval |= eDateValid_bad_day;
781 }
782 if (require_full_date) {
783 if (!sdate.IsSetMonth() || sdate.GetMonth() == 0) {
784 rval |= eDateValid_bad_month;
785 }
786 if (!sdate.IsSetDay() || sdate.GetDay() == 0) {
787 rval |= eDateValid_bad_day;
788 }
789 }
790 if (sdate.IsSetSeason() && !NStr::IsBlank (sdate.GetSeason())) {
791 const char * cp = sdate.GetSeason().c_str();
792 while (*cp != 0) {
793 if (isalpha (*cp) || *cp == '-') {
794 // these are the only acceptable characters
795 } else {
796 rval |= eDateValid_bad_season;
797 break;
798 }
799 ++cp;
800 }
801 }
802 } else {
803 rval |= eDateValid_bad_other;
804 }
805 return rval;
806 }
807
808
IsDateInPast(const CDate & date)809 bool IsDateInPast(const CDate& date)
810 {
811 time_t t;
812 time(&t);
813 struct tm *tm;
814 tm = localtime(&t);
815
816 bool in_past = false;
817 if (!date.IsStd()) {
818 return false;
819 }
820 const auto & sdate = date.GetStd();
821 if (sdate.GetYear() < tm->tm_year + 1900) {
822 in_past = true;
823 } else if (sdate.GetYear() == tm->tm_year + 1900
824 && sdate.IsSetMonth()) {
825 if (sdate.GetMonth() < tm->tm_mon + 1) {
826 in_past = true;
827 } else if (sdate.GetMonth() == tm->tm_mon + 1
828 && sdate.IsSetDay()) {
829 if (sdate.GetDay() < tm->tm_mday) {
830 in_past = true;
831 }
832 }
833 }
834 return in_past;
835 }
836
837
GetDateErrorDescription(int flags)838 string GetDateErrorDescription (int flags)
839 {
840 string reasons;
841
842 if (flags & eDateValid_empty_date) {
843 reasons += "EMPTY_DATE ";
844 }
845 if (flags & eDateValid_bad_str) {
846 reasons += "BAD_STR ";
847 }
848 if (flags & eDateValid_bad_year) {
849 reasons += "BAD_YEAR ";
850 }
851 if (flags & eDateValid_bad_month) {
852 reasons += "BAD_MONTH ";
853 }
854 if (flags & eDateValid_bad_day) {
855 reasons += "BAD_DAY ";
856 }
857 if (flags & eDateValid_bad_season) {
858 reasons += "BAD_SEASON ";
859 }
860 if (flags & eDateValid_bad_other) {
861 reasons += "BAD_OTHER ";
862 }
863 return reasons;
864 }
865
866
IsBioseqTSA(const CBioseq & seq,CScope * scope)867 bool IsBioseqTSA (const CBioseq& seq, CScope* scope)
868 {
869 if (!scope) {
870 return false;
871 }
872 bool is_tsa = false;
873 CBioseq_Handle bsh = scope->GetBioseqHandle(seq);
874 if (bsh) {
875 CSeqdesc_CI desc_ci(bsh, CSeqdesc::e_Molinfo);
876 while (desc_ci && !is_tsa) {
877 if (desc_ci->GetMolinfo().IsSetTech() && desc_ci->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
878 is_tsa = true;
879 }
880 ++desc_ci;
881 }
882 }
883 return is_tsa;
884 }
885
886
887 #if 0
888 // disabled for now
889 bool IsNCBIFILESeqId (const CSeq_id& id)
890 {
891 if (!id.IsGeneral() || !id.GetGeneral().IsSetDb()
892 || !NStr::Equal(id.GetGeneral().GetDb(), "NCBIFILE")) {
893 return false;
894 } else {
895 return true;
896 }
897 }
898 #endif
899
900
IsAccession(const CSeq_id & id)901 bool IsAccession(const CSeq_id& id)
902 {
903 if (id.GetTextseq_Id() != NULL) {
904 return true;
905 } else {
906 return false;
907 }
908 }
909
910
UpdateToBestId(CSeq_loc & loc,CScope & scope)911 static void UpdateToBestId(CSeq_loc& loc, CScope& scope)
912 {
913 bool any_change = false;
914 CSeq_loc_I it(loc);
915 for (; it; ++it) {
916 const CSeq_id& id = it.GetSeq_id();
917 if (!IsAccession(id)) {
918 CConstRef<CSeq_id> best_id(NULL);
919 CBioseq_Handle bsh = scope.GetBioseqHandle(id);
920 if (bsh) {
921 const auto& ids = bsh.GetCompleteBioseq()->GetId();
922 for (auto& id_it : ids) {
923 if (IsAccession(*id_it)) {
924 best_id = id_it;
925 break;
926 }
927 }
928 }
929 if (best_id) {
930 it.SetSeq_id(*best_id);
931 any_change = true;
932 }
933 }
934 }
935 if (any_change) {
936 loc.Assign(*it.MakeSeq_loc());
937 }
938 }
939
940
GetValidatorLocationLabel(const CSeq_loc & loc,CScope & scope)941 string GetValidatorLocationLabel (const CSeq_loc& loc, CScope& scope)
942 {
943 string loc_label;
944 if (loc.IsWhole()) {
945 CBioseq_Handle bsh = scope.GetBioseqHandle(loc.GetWhole());
946 if (bsh) {
947 loc_label = GetBioseqIdLabel(*(bsh.GetCompleteBioseq()));
948 NStr::ReplaceInPlace(loc_label, "[", "");
949 NStr::ReplaceInPlace(loc_label, "]", "");
950 }
951 }
952 if (NStr::IsBlank(loc_label)) {
953 CSeq_loc tweaked_loc;
954 tweaked_loc.Assign(loc);
955 UpdateToBestId(tweaked_loc, scope);
956 tweaked_loc.GetLabel(&loc_label);
957 NStr::ReplaceInPlace(loc_label, "[", "(");
958 NStr::ReplaceInPlace(loc_label, "]", ")");
959 }
960 return loc_label;
961 }
962
963
964
GetBioseqIdLabel(const CBioseq & sq,bool limited)965 string GetBioseqIdLabel(const CBioseq& sq, bool limited)
966 {
967 string content;
968 int num_ids_found = 0;
969 bool id_found = false;
970
971 const auto& id_list = sq.GetId();
972
973 /* find first gi */
974 for (auto id_it : id_list) {
975 if (id_it->IsGi()) {
976 CNcbiOstrstream os;
977 id_it->WriteAsFasta(os);
978 string s = CNcbiOstrstreamToString(os);
979 content += s;
980 num_ids_found ++;
981 break;
982 }
983 }
984 /* find first accession */
985 for (auto id_it : id_list) {
986 if (id_it->IsGenbank()
987 || id_it->IsDdbj()
988 || id_it->IsEmbl()
989 || id_it->IsSwissprot()
990 || id_it->IsOther()
991 || id_it->IsTpd()
992 || id_it->IsTpe()
993 || id_it->IsTpg()) {
994 if (num_ids_found > 0) {
995 content += "|";
996 }
997 CNcbiOstrstream os;
998 id_it->WriteAsFasta(os);
999 string s = CNcbiOstrstreamToString(os);
1000 content += s;
1001 num_ids_found++;
1002 break;
1003 }
1004 }
1005
1006 if (num_ids_found == 0) {
1007 /* find first general */
1008 for (auto id_it : id_list) {
1009 if (id_it->IsGeneral()) {
1010 if (num_ids_found > 0) {
1011 content += "|";
1012 }
1013 CNcbiOstrstream os;
1014 id_it->WriteAsFasta(os);
1015 string s = CNcbiOstrstreamToString(os);
1016 content += s;
1017 num_ids_found++;
1018 break;
1019 }
1020 }
1021 }
1022 // didn't find any? print them all, but only the first local
1023 if (num_ids_found == 0) {
1024 bool found_local = false;
1025 for (auto id_it : id_list) {
1026 if (id_it->IsLocal()) {
1027 if (found_local) {
1028 continue;
1029 } else {
1030 found_local = true;
1031 }
1032 }
1033 if (id_found) {
1034 content += "|";
1035 }
1036 CNcbiOstrstream os;
1037 id_it->WriteAsFasta(os);
1038 string s = CNcbiOstrstreamToString(os);
1039 content += s;
1040 id_found = true;
1041 }
1042 }
1043
1044 return content;
1045 }
1046
1047
AppendBioseqLabel(string & str,const CBioseq & sq,bool supress_context)1048 void AppendBioseqLabel(string& str, const CBioseq& sq, bool supress_context)
1049 {
1050 str += "BIOSEQ: ";
1051
1052 string content = GetBioseqIdLabel (sq);
1053
1054 if (!supress_context) {
1055 if (!content.empty()) {
1056 content += ": ";
1057 }
1058
1059 const CEnumeratedTypeValues* tv;
1060 tv = CSeq_inst::GetTypeInfo_enum_ERepr();
1061 const CSeq_inst& inst = sq.GetInst();
1062 content += tv->FindName(inst.GetRepr(), true) + ", ";
1063 tv = CSeq_inst::GetTypeInfo_enum_EMol();
1064 content += tv->FindName(inst.GetMol(), true);
1065 if (inst.IsSetLength()) {
1066 content += string(" len= ") + NStr::IntToString(inst.GetLength());
1067 }
1068 }
1069 str += content;
1070 }
1071
HasECnumberPattern(const string & str)1072 bool HasECnumberPattern (const string& str)
1073 {
1074 bool rval = false;
1075 if (NStr::IsBlank(str)) {
1076 return false;
1077 }
1078
1079 bool is_ambig = false;
1080 int numdashes = 0;
1081 int numdigits = 0;
1082 int numperiods = 0;
1083
1084 string::const_iterator sit = str.begin();
1085 while (sit != str.end() && !rval) {
1086 if (isdigit (*sit)) {
1087 numdigits++;
1088 if (is_ambig) {
1089 is_ambig = false;
1090 numperiods = 0;
1091 numdashes = 0;
1092 }
1093 } else if (*sit == '-') {
1094 numdashes++;
1095 is_ambig = true;
1096 } else if (*sit == 'n') {
1097 numdashes++;
1098 is_ambig = true;
1099 } else if (*sit == '.') {
1100 numperiods++;
1101 if (numdigits > 0 && numdashes > 0) {
1102 is_ambig = false;
1103 numperiods = 0;
1104 } else if (numdigits == 0 && numdashes == 0) {
1105 is_ambig = false;
1106 numperiods = 0;
1107 } else if (numdashes > 1) {
1108 is_ambig = false;
1109 numperiods = 0;
1110 }
1111 numdigits = 0;
1112 numdashes = 0;
1113 } else {
1114 if (numperiods == 3) {
1115 if (numdigits > 0 && numdashes > 0) {
1116 is_ambig = false;
1117 } else if (numdigits > 0 || numdashes == 1) {
1118 rval = true;
1119 }
1120 }
1121 is_ambig = false;
1122 numperiods = 0;
1123 numdigits = 0;
1124 numdashes = 0;
1125 }
1126 ++sit;
1127 }
1128 if (numperiods == 3) {
1129 if (numdigits > 0 && numdashes > 0) {
1130 rval = false;
1131 } else if (numdigits > 0 || numdashes == 1) {
1132 rval = true;
1133 }
1134 }
1135 return rval;
1136 }
1137
1138
SeqIsPatent(const CBioseq & seq)1139 bool SeqIsPatent (const CBioseq& seq)
1140 {
1141 bool is_patent = false;
1142
1143 // some tests are suppressed if a patent ID is present
1144 FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
1145 if ((*id_it)->IsPatent()) {
1146 is_patent = true;
1147 break;
1148 }
1149 }
1150 return is_patent;
1151 }
1152
1153
SeqIsPatent(const CBioseq_Handle & seq)1154 bool SeqIsPatent (const CBioseq_Handle& seq)
1155 {
1156 return SeqIsPatent (*(seq.GetCompleteBioseq()));
1157 }
1158
1159
s_PartialAtGapOrNs(CScope * scope,const CSeq_loc & loc,unsigned int tag,bool only_gap)1160 bool s_PartialAtGapOrNs (
1161 CScope* scope,
1162 const CSeq_loc& loc,
1163 unsigned int tag,
1164 bool only_gap
1165 )
1166
1167 {
1168 if ( tag != sequence::eSeqlocPartial_Nostart && tag != sequence::eSeqlocPartial_Nostop ) {
1169 return false;
1170 }
1171
1172 CSeq_loc_CI first, last;
1173 for ( CSeq_loc_CI sl_iter(loc); sl_iter; ++sl_iter ) { // EQUIV_IS_ONE not supported
1174 if ( !first ) {
1175 first = sl_iter;
1176 }
1177 last = sl_iter;
1178 }
1179
1180 if ( first.GetStrand() != last.GetStrand() ) {
1181 return false;
1182 }
1183 CSeq_loc_CI temp = (tag == sequence::eSeqlocPartial_Nostart) ? first : last;
1184
1185 if (!scope) {
1186 return false;
1187 }
1188
1189 CConstRef<CSeq_loc> slp = temp.GetRangeAsSeq_loc();
1190 if (!slp) {
1191 return false;
1192 }
1193 const CSeq_id* id = slp->GetId();
1194 if (id == NULL) return false;
1195 CBioseq_Handle bsh = scope->GetBioseqHandle(*id);
1196 if (!bsh) {
1197 return false;
1198 }
1199
1200 TSeqPos acceptor = temp.GetRange().GetFrom();
1201 TSeqPos donor = temp.GetRange().GetTo();
1202 TSeqPos start = acceptor;
1203 TSeqPos stop = donor;
1204
1205 CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac,
1206 temp.GetStrand());
1207 TSeqPos len = vec.size();
1208
1209 if ( temp.GetStrand() == eNa_strand_minus ) {
1210 swap(acceptor, donor);
1211 stop = len - donor - 1;
1212 start = len - acceptor - 1;
1213 }
1214
1215 bool result = false;
1216
1217 try {
1218 if (tag == sequence::eSeqlocPartial_Nostop && stop < len - 1 && vec.IsInGap(stop + 1)) {
1219 return true;
1220 } else if (tag == sequence::eSeqlocPartial_Nostart && start > 0 && start < len && vec.IsInGap(start - 1)) {
1221 return true;
1222 }
1223 } catch ( exception& ) {
1224 return false;
1225 }
1226 if (only_gap) {
1227 return false;
1228 }
1229
1230 if ( (tag == sequence::eSeqlocPartial_Nostop) && (stop < len - 2) ) {
1231 try {
1232 CSeqVector::TResidue res = vec[stop + 1];
1233
1234 if ( IsResidue(res) && isalpha (res)) {
1235 if ( res == 'N' ) {
1236 result = true;
1237 }
1238 }
1239 } catch ( exception& ) {
1240 return false;
1241 }
1242 } else if ( (tag == sequence::eSeqlocPartial_Nostart) && (start > 1) ) {
1243 try {
1244 CSeqVector::TResidue res = vec[start - 1];
1245 if ( IsResidue(res) && isalpha (res)) {
1246 if ( res == 'N' ) {
1247 result = true;
1248 }
1249 }
1250 } catch ( exception& ) {
1251 return false;
1252 }
1253 }
1254
1255 return result;
1256 }
1257
1258
BioseqHandleFromLocation(CScope * m_Scope,const CSeq_loc & loc)1259 CBioseq_Handle BioseqHandleFromLocation (CScope* m_Scope, const CSeq_loc& loc)
1260
1261 {
1262 CBioseq_Handle bsh;
1263 for ( CSeq_loc_CI citer (loc); citer; ++citer) {
1264 const CSeq_id& id = citer.GetSeq_id();
1265 CSeq_id_Handle sih = CSeq_id_Handle::GetHandle(id);
1266 bsh = m_Scope->GetBioseqHandle (sih, CScope::eGetBioseq_All);
1267 if (bsh) {
1268 return bsh;
1269 }
1270 }
1271 return bsh;
1272 }
1273
1274
s_PosIsNNotGap(const CSeqVector & vec,unsigned int pos)1275 bool s_PosIsNNotGap(const CSeqVector& vec, unsigned int pos)
1276 {
1277 if (pos >= vec.size()) {
1278 return false;
1279 } else if (vec[pos] != 'N' && vec[pos] != 'n') {
1280 return false;
1281 } else if (vec.IsInGap(pos)) {
1282 return false;
1283 } else {
1284 return true;
1285 }
1286 }
1287
1288
ShouldCheckForNsAndGap(const CBioseq_Handle & bsh)1289 bool ShouldCheckForNsAndGap(const CBioseq_Handle& bsh)
1290 {
1291 if (!bsh || bsh.GetInst_Length() < 10 || (bsh.IsSetInst_Topology() && bsh.GetInst_Topology() == CSeq_inst::eTopology_circular)) {
1292 return false;
1293 } else {
1294 return true;
1295 }
1296 }
1297
1298
CheckBioseqEndsForNAndGap(const CSeqVector & vec,EBioseqEndIsType & begin_n,EBioseqEndIsType & begin_gap,EBioseqEndIsType & end_n,EBioseqEndIsType & end_gap,bool & begin_ambig,bool & end_ambig)1299 void CheckBioseqEndsForNAndGap
1300 (const CSeqVector& vec,
1301 EBioseqEndIsType& begin_n,
1302 EBioseqEndIsType& begin_gap,
1303 EBioseqEndIsType& end_n,
1304 EBioseqEndIsType& end_gap,
1305 bool& begin_ambig,
1306 bool& end_ambig)
1307 {
1308 begin_n = eBioseqEndIsType_None;
1309 begin_gap = eBioseqEndIsType_None;
1310 end_n = eBioseqEndIsType_None;
1311 end_gap = eBioseqEndIsType_None;
1312 begin_ambig = false;
1313 end_ambig = false;
1314
1315 if (vec.size() < 10) {
1316 return;
1317 }
1318
1319 try {
1320
1321 // check for gap at begining of sequence
1322 if (vec.IsInGap(0) /* || vec.IsInGap(1) */) {
1323 begin_gap = eBioseqEndIsType_All;
1324 for (int i = 0; i < 10; i++) {
1325 if (!vec.IsInGap(i)) {
1326 begin_gap = eBioseqEndIsType_Last;
1327 break;
1328 }
1329 }
1330 }
1331
1332 // check for gap at end of sequence
1333 if ( /* vec.IsInGap (vec.size() - 2) || */ vec.IsInGap(vec.size() - 1)) {
1334 end_gap = eBioseqEndIsType_All;
1335 for (unsigned int i = vec.size() - 11; i < vec.size(); i++) {
1336 if (!vec.IsInGap(i)) {
1337 end_gap = eBioseqEndIsType_Last;
1338 break;
1339 }
1340 }
1341 }
1342
1343 if (vec.IsNucleotide()) {
1344 // check for N bases at beginning of sequence
1345 if (s_PosIsNNotGap(vec, 0) /* || s_PosIsNNotGap(vec, 1) */) {
1346 begin_n = eBioseqEndIsType_All;
1347 for (unsigned int i = 0; i < 10; i++) {
1348 if (!s_PosIsNNotGap(vec, i)) {
1349 begin_n = eBioseqEndIsType_Last;
1350 break;
1351 }
1352 }
1353 }
1354
1355 // check for N bases at end of sequence
1356 if ( /* s_PosIsNNotGap(vec, vec.size() - 2) || */ s_PosIsNNotGap(vec, vec.size() - 1)) {
1357 end_n = eBioseqEndIsType_All;
1358 for (unsigned int i = vec.size() - 10; i < vec.size(); i++) {
1359 if (!s_PosIsNNotGap(vec, i)) {
1360 end_n = eBioseqEndIsType_Last;
1361 break;
1362 }
1363 }
1364 }
1365
1366 // check for ambiguous concentration
1367 size_t check_len = 50;
1368 if (vec.size() < 50) {
1369 check_len = vec.size();
1370 }
1371 size_t num_ns = 0;
1372 for (size_t i = 0; i < check_len; i++) {
1373 if (vec[i] == 'N') {
1374 num_ns++;
1375 if (num_ns >= 5 && i < 10) {
1376 begin_ambig = true;
1377 break;
1378 } else if (num_ns >= 15) {
1379 begin_ambig = true;
1380 break;
1381 }
1382 }
1383 }
1384 num_ns = 0;
1385 for (int i = 0; i < check_len; i++) {
1386 if (vec[vec.size() - i - 1] == 'N') {
1387 num_ns++;
1388 if (num_ns >= 5 && i < 10) {
1389 end_ambig = true;
1390 break;
1391 } else if (num_ns >= 15) {
1392 end_ambig = true;
1393 break;
1394 }
1395 }
1396 }
1397 }
1398 } catch (exception&) {
1399 // if there are exceptions, cannot perform this calculation
1400 }
1401 }
1402
1403
CheckBioseqEndsForNAndGap(const CBioseq_Handle & bsh,EBioseqEndIsType & begin_n,EBioseqEndIsType & begin_gap,EBioseqEndIsType & end_n,EBioseqEndIsType & end_gap,bool & begin_ambig,bool & end_ambig)1404 void CheckBioseqEndsForNAndGap
1405 (const CBioseq_Handle& bsh,
1406 EBioseqEndIsType& begin_n,
1407 EBioseqEndIsType& begin_gap,
1408 EBioseqEndIsType& end_n,
1409 EBioseqEndIsType& end_gap,
1410 bool& begin_ambig,
1411 bool& end_ambig)
1412 {
1413 begin_n = eBioseqEndIsType_None;
1414 begin_gap = eBioseqEndIsType_None;
1415 end_n = eBioseqEndIsType_None;
1416 end_gap = eBioseqEndIsType_None;
1417 begin_ambig = false;
1418 end_ambig = false;
1419 if (!ShouldCheckForNsAndGap(bsh)) {
1420 return;
1421 }
1422
1423 try {
1424 // check for gap at begining of sequence
1425 CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
1426 CheckBioseqEndsForNAndGap(vec, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
1427 } catch ( exception& ) {
1428 // if there are exceptions, cannot perform this calculation
1429 }
1430 }
1431
1432
IsLocFullLength(const CSeq_loc & loc,const CBioseq_Handle & bsh)1433 bool IsLocFullLength (const CSeq_loc& loc, const CBioseq_Handle& bsh)
1434 {
1435 if (loc.IsInt()
1436 && loc.GetInt().GetFrom() == 0
1437 && loc.GetInt().GetTo() == bsh.GetInst_Length() - 1) {
1438 return true;
1439 } else {
1440 return false;
1441 }
1442 }
1443
1444
PartialsSame(const CSeq_loc & loc1,const CSeq_loc & loc2)1445 bool PartialsSame (const CSeq_loc& loc1, const CSeq_loc& loc2)
1446 {
1447 bool loc1_partial_start =
1448 loc1.IsPartialStart(eExtreme_Biological);
1449 bool loc1_partial_stop =
1450 loc1.IsPartialStop(eExtreme_Biological);
1451 bool loc2_partial_start =
1452 loc2.IsPartialStart(eExtreme_Biological);
1453 bool loc2_partial_stop =
1454 loc2.IsPartialStop(eExtreme_Biological);
1455 if (loc1_partial_start == loc2_partial_start &&
1456 loc1_partial_stop == loc2_partial_stop) {
1457 return true;
1458 } else {
1459 return false;
1460 }
1461 }
1462
1463
1464
1465
1466 // Code for finding duplicate features
s_IsSameStrand(const CSeq_loc & l1,const CSeq_loc & l2,CScope & scope)1467 bool s_IsSameStrand(const CSeq_loc& l1, const CSeq_loc& l2, CScope& scope)
1468 {
1469 ENa_strand s1 = sequence::GetStrand(l1, &scope);
1470 ENa_strand s2 = sequence::GetStrand(l2, &scope);
1471 if ((s1 == eNa_strand_minus && s2 == eNa_strand_minus)
1472 || (s1 != eNa_strand_minus && s2 != eNa_strand_minus)) {
1473 return true;
1474 } else {
1475 return false;
1476 }
1477 }
1478
1479
1480 inline
s_IsSameSeqAnnot(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2,bool & diff_descriptions)1481 bool s_IsSameSeqAnnot(const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2, bool& diff_descriptions)
1482 {
1483 const auto& annot1 = f1.GetAnnot();
1484 const auto& annot2 = f2.GetAnnot();
1485 bool rval = annot1 == annot2;
1486 diff_descriptions = false;
1487 if (!rval) {
1488 if ((!annot1.Seq_annot_IsSetDesc() || annot1.Seq_annot_GetDesc().Get().empty()) &&
1489 (!annot2.Seq_annot_IsSetDesc() || annot2.Seq_annot_GetDesc().Get().empty())) {
1490 // neither is set
1491 diff_descriptions = false;
1492 } else if (annot1.Seq_annot_IsSetDesc() && annot2.Seq_annot_IsSetDesc()) {
1493 // both are set - are they different?
1494 const auto d1 = annot1.Seq_annot_GetDesc().Get().front();
1495 const auto d2 = annot2.Seq_annot_GetDesc().Get().front();
1496 if (d1->Which() != d2->Which()) {
1497 diff_descriptions = true;
1498 } else {
1499 if (d1->IsName()
1500 && NStr::EqualNocase(d1->GetName(), d2->GetName())) {
1501 diff_descriptions = false;
1502 } else if (d1->IsTitle()
1503 && NStr::EqualNocase(d1->GetTitle(), d2->GetTitle())) {
1504 diff_descriptions = false;
1505 } else {
1506 diff_descriptions = true;
1507 }
1508 }
1509 } else {
1510 diff_descriptions = true;
1511 }
1512 }
1513 return rval;
1514 }
1515
1516
s_AreGBQualsIdentical(const CSeq_feat_Handle & feat1,const CSeq_feat_Handle & feat2,bool case_sensitive)1517 bool s_AreGBQualsIdentical(const CSeq_feat_Handle& feat1, const CSeq_feat_Handle& feat2, bool case_sensitive)
1518 {
1519 if (!feat1.IsSetQual() || !feat2.IsSetQual()) {
1520 return true;
1521 }
1522
1523 bool rval = true;
1524
1525 CSeq_feat::TQual::const_iterator gb1 = feat1.GetQual().begin();
1526 CSeq_feat::TQual::const_iterator gb1_end = feat1.GetQual().end();
1527 CSeq_feat::TQual::const_iterator gb2 = feat2.GetQual().begin();
1528 CSeq_feat::TQual::const_iterator gb2_end = feat2.GetQual().end();
1529
1530 while ((gb1 != gb1_end) && (gb2 != gb2_end) && rval) {
1531 if (!(*gb1)->IsSetQual()) {
1532 if ((*gb2)->IsSetQual()) {
1533 rval = false;
1534 }
1535 } else if (!(*gb2)->IsSetQual()) {
1536 rval = false;
1537 } else if (!NStr::Equal ((*gb1)->GetQual(), (*gb2)->GetQual())) {
1538 rval = false;
1539 }
1540 if (rval) {
1541 string v1 = (*gb1)->IsSetVal() ? (*gb1)->GetVal() : "";
1542 string v2 = (*gb2)->IsSetVal() ? (*gb2)->GetVal() : "";
1543 NStr::TruncateSpacesInPlace(v1);
1544 NStr::TruncateSpacesInPlace(v2);
1545 rval = NStr::Equal(v1, v2, case_sensitive ? NStr::eCase : NStr::eNocase);
1546 }
1547 ++gb1;
1548 ++gb2;
1549 }
1550 if (gb1 != gb1_end || gb2 != gb2_end) {
1551 rval = false;
1552 }
1553
1554 return rval;
1555 }
1556
1557
s_AreFeatureLabelsSame(const CSeq_feat_Handle & feat,const CSeq_feat_Handle & prev,bool case_sensitive)1558 bool s_AreFeatureLabelsSame(const CSeq_feat_Handle& feat, const CSeq_feat_Handle& prev, bool case_sensitive)
1559 {
1560 if (!feat.GetData().Equals(prev.GetData())) {
1561 return false;
1562 }
1563
1564 // compare labels and comments
1565 bool same_label = true;
1566 const string& curr_comment =
1567 feat.IsSetComment() ? feat.GetComment() : kEmptyStr;
1568 const string& prev_comment =
1569 prev.IsSetComment() ? prev.GetComment() : kEmptyStr;
1570 string curr_label;
1571 string prev_label;
1572
1573 feature::GetLabel(*(feat.GetSeq_feat()),
1574 &curr_label, feature::fFGL_Content, &(feat.GetScope()));
1575 feature::GetLabel(*(prev.GetSeq_feat()),
1576 &prev_label, feature::fFGL_Content, &(prev.GetScope()));
1577
1578 bool comments_same = NStr::Equal(curr_comment, prev_comment, case_sensitive ? NStr::eCase : NStr::eNocase);
1579 bool labels_same = NStr::Equal(curr_label, prev_label, case_sensitive ? NStr::eCase : NStr::eNocase);
1580
1581 if (!comments_same || !labels_same) {
1582 same_label = false;
1583 } else if (!s_AreGBQualsIdentical(feat, prev, case_sensitive)) {
1584 same_label = false;
1585 }
1586 return same_label;
1587 }
1588
1589
s_IsDifferentDbxrefs(const TDbtags & list1,const TDbtags & list2)1590 bool s_IsDifferentDbxrefs(const TDbtags& list1, const TDbtags& list2)
1591 {
1592 if (list1.empty() || list2.empty()) {
1593 return false;
1594 } else if (list1.size() != list2.size()) {
1595 return true;
1596 }
1597
1598 TDbtags::const_iterator it1 = list1.begin();
1599 TDbtags::const_iterator it2 = list2.begin();
1600 for (; it1 != list1.end(); ++it1, ++it2) {
1601 if (!NStr::EqualNocase((*it1)->GetDb(), (*it2)->GetDb())) {
1602 return true;
1603 }
1604 string str1 =
1605 (*it1)->GetTag().IsStr() ? (*it1)->GetTag().GetStr() : "";
1606 string str2 =
1607 (*it2)->GetTag().IsStr() ? (*it2)->GetTag().GetStr() : "";
1608 if ( str1.empty() && str2.empty() ) {
1609 if (!(*it1)->GetTag().IsId() && !(*it2)->GetTag().IsId()) {
1610 continue;
1611 } else if ((*it1)->GetTag().IsId() && (*it2)->GetTag().IsId()) {
1612 if ((*it1)->GetTag().GetId() != (*it2)->GetTag().GetId()) {
1613 return true;
1614 }
1615 } else {
1616 return true;
1617 }
1618 } else if (!str1.empty() && !str2.empty() && !NStr::EqualNocase(str1, str2)) {
1619 return true;
1620 }
1621 }
1622 return false;
1623 }
1624
1625
s_AreFullLengthCodingRegionsWithDifferentFrames(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2)1626 bool s_AreFullLengthCodingRegionsWithDifferentFrames (const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2)
1627 {
1628 const auto & f1data = f1.GetData();
1629 const auto & f2data = f2.GetData();
1630 if (!f1data.IsCdregion() || !f2data.IsCdregion()) {
1631 return false;
1632 }
1633 const auto & cd1 = f1data.GetCdregion();
1634 const auto & cd2 = f2data.GetCdregion();
1635
1636 int frame1 = 1, frame2 = 1;
1637 if (cd1.IsSetFrame()) {
1638 frame1 = cd1.GetFrame();
1639 if (frame1 == 0) {
1640 frame1 = 1;
1641 }
1642 }
1643 if (cd2.IsSetFrame()) {
1644 frame2 = cd2.GetFrame();
1645 if (frame2 == 0) {
1646 frame2 = 1;
1647 }
1648 }
1649 if (frame1 == frame2) {
1650 return false;
1651 }
1652
1653 CBioseq_Handle bsh1 = f1.GetScope().GetBioseqHandle(f1.GetLocation());
1654 if (!IsLocFullLength (f1.GetLocation(), bsh1)) {
1655 return false;
1656 }
1657 CBioseq_Handle bsh2 = f2.GetScope().GetBioseqHandle(f2.GetLocation());
1658 if (!IsLocFullLength (f2.GetLocation(), bsh2)) {
1659 return false;
1660 }
1661
1662 return true;
1663 }
1664
1665
1666 //LCOV_EXCL_START
1667 // never used, because different variations generate different labels
s_ReplaceListFromQuals(const CSeq_feat::TQual & quals)1668 string s_ReplaceListFromQuals(const CSeq_feat::TQual& quals)
1669 {
1670 string replace;
1671 ITERATE(CSeq_feat::TQual, q, quals) {
1672 if ((*q)->IsSetQual() && NStr::Equal((*q)->GetQual(), "replace") && (*q)->IsSetVal()) {
1673 if (NStr::IsBlank((*q)->GetVal())) {
1674 replace += " ";
1675 } else {
1676 replace += (*q)->GetVal();
1677 }
1678 replace += ".";
1679 }
1680 }
1681 return replace;
1682 }
1683
1684
s_AreDifferentVariations(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2)1685 bool s_AreDifferentVariations(const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2)
1686 {
1687 if (f1.GetData().GetSubtype() != CSeqFeatData::eSubtype_variation
1688 || f2.GetData().GetSubtype() != CSeqFeatData::eSubtype_variation) {
1689 return false;
1690 }
1691 if (!f1.IsSetQual() || !f2.IsSetQual()) {
1692 return false;
1693 }
1694 string replace1 = s_ReplaceListFromQuals(f1.GetQual());
1695 string replace2 = s_ReplaceListFromQuals(f2.GetQual());
1696
1697 if (!NStr::Equal(replace1, replace2)) {
1698 return true;
1699 } else {
1700 return false;
1701 }
1702 }
1703 //LCOV_EXCL_STOP
1704
1705
1706 typedef vector<CConstRef<CObject_id> > TFeatIdVec;
s_AreLinkedToDifferentFeats(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2,CSeqFeatData::ESubtype s1,CSeqFeatData::ESubtype s2)1707 static bool s_AreLinkedToDifferentFeats (const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2, CSeqFeatData::ESubtype s1, CSeqFeatData::ESubtype s2)
1708 {
1709 bool rval = false;
1710
1711 if (f1.GetData().GetSubtype() == s1 && f2.GetData().GetSubtype() == s1) {
1712 CScope& scope = f1.GetScope();
1713 const CSeq_loc& loc = f1.GetLocation();
1714 CBioseq_Handle bsh = BioseqHandleFromLocation (&scope, loc);
1715 if (bsh) {
1716 const CTSE_Handle& tse = bsh.GetTSE_Handle();
1717 TFeatIdVec mrna1_id;
1718 TFeatIdVec mrna2_id;
1719 list<CSeq_feat_Handle> mrna1;
1720 list<CSeq_feat_Handle> mrna2;
1721
1722 FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, *(f1.GetSeq_feat())) {
1723 if ((*itx)->IsSetId() && (*itx)->GetId().IsLocal()) {
1724 const CObject_id& feat_id = (*itx)->GetId().GetLocal();
1725 vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, feat_id);
1726 ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1727 if (feat_it->IsSetData()
1728 && feat_it->GetData().GetSubtype() == s2) {
1729 mrna1.push_back(*feat_it);
1730 CConstRef<CObject_id> f(&feat_id);
1731 mrna1_id.push_back (f);
1732 break;
1733 }
1734 }
1735 }
1736 }
1737 FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, *(f2.GetSeq_feat())) {
1738 if ((*itx)->IsSetId() && (*itx)->GetId().IsLocal()) {
1739 const CObject_id& feat_id = (*itx)->GetId().GetLocal();
1740 vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, feat_id);
1741 ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1742 if (feat_it->IsSetData()
1743 && feat_it->GetData().GetSubtype() == s2) {
1744 mrna2.push_back(*feat_it);
1745 CConstRef<CObject_id> f(&feat_id);
1746 mrna2_id.push_back (f);
1747 }
1748 }
1749 }
1750 }
1751
1752 if (mrna1_id.size() > 0 && mrna2_id.size() > 0) {
1753 rval = true;
1754 for (auto i1 = mrna1_id.begin(); i1 != mrna1_id.end(); ++i1) {
1755 for (auto i2 = mrna2_id.begin(); i2 != mrna2_id.end(); ++i2) {
1756 if ((*i1)->Equals(**i2)) {
1757 rval = false;
1758 break;
1759 }
1760 }
1761 if (!rval) {
1762 break;
1763 }
1764 }
1765
1766 if (rval) { // Check that locations aren't the same
1767 const CSeq_feat_Handle fh1 = mrna1.front();
1768 const CSeq_feat_Handle fh2 = mrna2.front();
1769
1770
1771 if (s_IsSameStrand(fh1.GetLocation(),
1772 fh2.GetLocation(),
1773 fh1.GetScope())
1774 && (sequence::Compare(fh1.GetLocation(),
1775 fh2.GetLocation(),
1776 &(fh1.GetScope()),
1777 sequence::fCompareOverlapping) == sequence::eSame)) {
1778 rval = false;
1779 }
1780 }
1781 }
1782 }
1783 }
1784 return rval;
1785 }
1786
1787
1788
1789
s_AreCodingRegionsLinkedToDifferentmRNAs(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2)1790 static bool s_AreCodingRegionsLinkedToDifferentmRNAs (const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2)
1791 {
1792 return s_AreLinkedToDifferentFeats (f1, f2, CSeqFeatData::eSubtype_cdregion, CSeqFeatData::eSubtype_mRNA);
1793 }
1794
1795
s_AremRNAsLinkedToDifferentCodingRegions(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2)1796 bool s_AremRNAsLinkedToDifferentCodingRegions (const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2)
1797 {
1798 return s_AreLinkedToDifferentFeats (f1, f2, CSeqFeatData::eSubtype_mRNA, CSeqFeatData::eSubtype_cdregion);
1799 }
1800
1801
IsDicistronicGene(const CSeq_feat_Handle & f)1802 bool IsDicistronicGene(const CSeq_feat_Handle& f)
1803 {
1804 if ( f.GetData().GetSubtype() != CSeqFeatData::eSubtype_gene ) return false;
1805 return IsDicistronic(f);
1806 }
1807
1808
IsDicistronic(const CSeq_feat_Handle & f)1809 bool IsDicistronic(const CSeq_feat_Handle& f)
1810 {
1811 if (!f.IsSetExcept()) return false;
1812 if (!f.IsSetExcept_text()) return false;
1813
1814 const string& except_text = f.GetExcept_text();
1815 if (NStr::FindNoCase(except_text, "dicistronic gene") == NPOS) return false;
1816
1817 return true;
1818 }
1819
1820
1821 EDuplicateFeatureType
IsDuplicate(const CSeq_feat_Handle & f1,const CSeq_feat_Handle & f2,bool check_partials,bool case_sensitive)1822 IsDuplicate
1823 (const CSeq_feat_Handle& f1,
1824 const CSeq_feat_Handle& f2,
1825 bool check_partials,
1826 bool case_sensitive)
1827 {
1828
1829 EDuplicateFeatureType dup_type = eDuplicate_Not;
1830
1831 // subtypes
1832 CSeqFeatData::ESubtype feat1_subtype = f1.GetData().GetSubtype();
1833 CSeqFeatData::ESubtype feat2_subtype = f2.GetData().GetSubtype();
1834
1835 // not duplicates if not the same subtype
1836 if (feat1_subtype != feat2_subtype) {
1837 return eDuplicate_Not;
1838 }
1839
1840 // locations
1841 const CSeq_loc& feat1_loc = f1.GetLocation();
1842 const CSeq_loc& feat2_loc = f2.GetLocation();
1843
1844 // not duplicates if not the same location and strand
1845 if (!s_IsSameStrand(feat1_loc, feat2_loc, f1.GetScope()) ||
1846 sequence::Compare(feat1_loc, feat2_loc, &(f1.GetScope()),
1847 sequence::fCompareOverlapping) != sequence::eSame) {
1848 return eDuplicate_Not;
1849 }
1850
1851 // same annot?
1852 bool diff_annot_desc = false;
1853 bool same_annot = s_IsSameSeqAnnot(f1, f2, diff_annot_desc);
1854
1855 if (diff_annot_desc) {
1856 // don't report if features on different annots with different titles or names
1857 return eDuplicate_Not;
1858 }
1859
1860 // compare labels and comments
1861 bool same_label = s_AreFeatureLabelsSame (f1, f2, case_sensitive);
1862
1863 // compare dbxrefs
1864 bool different_dbxrefs = (f1.IsSetDbxref() && f2.IsSetDbxref() &&
1865 s_IsDifferentDbxrefs(f1.GetDbxref(), f2.GetDbxref()));
1866
1867 if ( feat1_subtype == CSeqFeatData::eSubtype_region && different_dbxrefs) {
1868 return eDuplicate_Not;
1869 }
1870
1871 // check for frame difference
1872 bool full_length_coding_regions_with_different_frames =
1873 s_AreFullLengthCodingRegionsWithDifferentFrames(f1, f2);
1874 if (!same_label && full_length_coding_regions_with_different_frames) {
1875 // do not report if both coding regions are full length, have different products,
1876 // and have different frames
1877 return eDuplicate_Not;
1878 }
1879
1880 if ((feat1_subtype == CSeqFeatData::eSubtype_variation && !same_label) || s_AreDifferentVariations(f1, f2)) {
1881 // don't report variations if replace quals are different or labels are different
1882 return eDuplicate_Not;
1883 }
1884
1885
1886 if (s_AreCodingRegionsLinkedToDifferentmRNAs(f1, f2)) {
1887 // do not report if features are coding regions linked to different mRNAs
1888 return eDuplicate_Not;
1889 }
1890
1891
1892 if (s_AremRNAsLinkedToDifferentCodingRegions(f1, f2)) {
1893 // do not report if features are mRNAs linked to different coding regions
1894 return eDuplicate_Not;
1895 }
1896
1897
1898 // only report pubs if they have the same label
1899 if (feat1_subtype == CSeqFeatData::eSubtype_pub && !same_label) {
1900 return eDuplicate_Not;
1901 }
1902
1903 bool partials_ok = (!check_partials || PartialsSame(feat1_loc, feat2_loc));
1904
1905 if (!partials_ok) {
1906 return eDuplicate_Not;
1907 }
1908
1909 if ( same_annot ) {
1910 if (same_label) {
1911 dup_type = eDuplicate_Duplicate;
1912 } else {
1913 dup_type = eDuplicate_SameIntervalDifferentLabel;
1914 }
1915 } else {
1916 if (same_label) {
1917 dup_type = eDuplicate_DuplicateDifferentTable;
1918 } else if ( feat2_subtype != CSeqFeatData::eSubtype_pub ) {
1919 dup_type = eDuplicate_SameIntervalDifferentLabelDifferentTable;
1920 }
1921 }
1922
1923 return dup_type;
1924 }
1925
1926 // specific-host functions
1927
IsCommonName(const CT3Data & data)1928 bool IsCommonName (const CT3Data& data)
1929 {
1930 bool is_common = false;
1931
1932 if (data.IsSetStatus()) {
1933 ITERATE (CT3Reply::TData::TStatus, status_it, data.GetStatus()) {
1934 if ((*status_it)->IsSetProperty()
1935 && NStr::Equal((*status_it)->GetProperty(), "old_name_class", NStr::eNocase)) {
1936 if ((*status_it)->IsSetValue() && (*status_it)->GetValue().IsStr()) {
1937 string value_str = (*status_it)->GetValue().GetStr();
1938 if (NStr::Equal(value_str, "common name", NStr::eCase)
1939 || NStr::Equal(value_str, "genbank common name", NStr::eCase)) {
1940 is_common = true;
1941 break;
1942 }
1943 }
1944 }
1945 }
1946 }
1947 return is_common;
1948 }
1949
HasMisSpellFlag(const CT3Data & data)1950 bool HasMisSpellFlag (const CT3Data& data)
1951 {
1952 bool has_misspell_flag = false;
1953
1954 if (data.IsSetStatus()) {
1955 ITERATE (CT3Reply::TData::TStatus, status_it, data.GetStatus()) {
1956 if ((*status_it)->IsSetProperty()) {
1957 string prop = (*status_it)->GetProperty();
1958 if (NStr::EqualNocase(prop, "misspelled_name")) {
1959 has_misspell_flag = true;
1960 break;
1961 }
1962 }
1963 }
1964 }
1965 return has_misspell_flag;
1966 }
1967
1968
FindMatchInOrgRef(const string & str,const COrg_ref & org)1969 bool FindMatchInOrgRef (const string& str, const COrg_ref& org)
1970 {
1971 string match;
1972
1973 if (NStr::IsBlank(str)) {
1974 // do nothing;
1975 } else if (org.IsSetTaxname() && NStr::EqualNocase(str, org.GetTaxname())) {
1976 match = org.GetTaxname();
1977 } else if (org.IsSetCommon() && NStr::EqualNocase(str, org.GetCommon())) {
1978 match = org.GetCommon();
1979 } else {
1980 FOR_EACH_SYN_ON_ORGREF (syn_it, org) {
1981 if (NStr::EqualNocase(str, *syn_it)) {
1982 match = *syn_it;
1983 break;
1984 }
1985 }
1986 if (NStr::IsBlank(match) && org.IsSetOrgname()) {
1987 const COrgName& orgname = org.GetOrgname();
1988 if (orgname.IsSetMod()) {
1989 for (const auto& mod_it : orgname.GetMod()) {
1990 if (mod_it->IsSetSubtype()
1991 && (mod_it->GetSubtype() == COrgMod::eSubtype_gb_synonym
1992 || mod_it->GetSubtype() == COrgMod::eSubtype_old_name)
1993 && mod_it->IsSetSubname()
1994 && NStr::EqualNocase(str, mod_it->GetSubname())) {
1995 match = mod_it->GetSubname();
1996 break;
1997 }
1998 }
1999 }
2000 }
2001 }
2002 return NStr::EqualCase(str, match);
2003 }
2004
2005
2006 static const string sIgnoreHostWordList[] = {
2007 " cf.",
2008 " cf ",
2009 " aff ",
2010 " aff.",
2011 " near",
2012 " nr.",
2013 " nr "
2014 };
2015
2016
2017 static const int kNumIgnoreHostWordList = sizeof (sIgnoreHostWordList) / sizeof (string);
2018
AdjustSpecificHostForTaxServer(string & spec_host)2019 void AdjustSpecificHostForTaxServer (string& spec_host)
2020 {
2021 for (int i = 0; i < kNumIgnoreHostWordList; i++) {
2022 NStr::ReplaceInPlace(spec_host, sIgnoreHostWordList[i], " ");
2023 }
2024 NStr::ReplaceInPlace(spec_host, " ", " ");
2025 NStr::TruncateSpacesInPlace(spec_host);
2026 }
2027
2028
SpecificHostValueToCheck(const string & val)2029 string SpecificHostValueToCheck(const string& val)
2030 {
2031 if (NStr::IsBlank(val)) {
2032 return val;
2033 #if 0
2034 } else if (! isupper (val.c_str()[0])) {
2035 return kEmptyStr;
2036 #endif
2037 }
2038
2039 string host = val;
2040 // ignore portion after semicolon
2041 size_t pos = NStr::Find(host, ";");
2042 if (pos != string::npos) {
2043 host = host.substr(0, pos);
2044 }
2045 NStr::TruncateSpacesInPlace(host);
2046 // must have at least two words to check
2047 pos = NStr::Find(host, " "); // combine with next line
2048 if (pos == string::npos) {
2049 return kEmptyStr;
2050 }
2051
2052 AdjustSpecificHostForTaxServer(host);
2053 pos = NStr::Find(host, " ");
2054 if (NStr::StartsWith(host.substr(pos + 1), "hybrid ")) {
2055 pos += 7;
2056 } else if (NStr::StartsWith(host.substr(pos + 1), "x ")) {
2057 pos += 2;
2058 }
2059 if (! NStr::StartsWith(host.substr(pos + 1), "sp.")
2060 && ! NStr::StartsWith(host.substr(pos + 1), "(")) {
2061 pos = NStr::Find(host, " ", pos + 1);
2062 if (pos != string::npos) {
2063 host = host.substr(0, pos);
2064 }
2065 } else {
2066 host = host.substr(0, pos);
2067 }
2068 return host;
2069 }
2070
2071
InterpretSpecificHostResult(const string & host,const CT3Reply & reply,const string & orig_host)2072 string InterpretSpecificHostResult(const string& host, const CT3Reply& reply, const string& orig_host)
2073 {
2074 string err_str;
2075 if (reply.IsError()) {
2076 err_str = "?";
2077 if (reply.GetError().IsSetMessage()) {
2078 err_str = reply.GetError().GetMessage();
2079 }
2080 if(NStr::FindNoCase(err_str, "ambiguous") != string::npos) {
2081 err_str = "Specific host value is ambiguous: " +
2082 (NStr::IsBlank(orig_host) ? host : orig_host);
2083 } else {
2084 err_str = "Invalid value for specific host: " +
2085 (NStr::IsBlank(orig_host) ? host : orig_host);
2086 }
2087 } else if (reply.IsData()) {
2088 const auto& rdata = reply.GetData();
2089 if (HasMisSpellFlag(rdata)) {
2090 err_str = "Specific host value is misspelled: " +
2091 (NStr::IsBlank(orig_host) ? host : orig_host);
2092 } else if (rdata.IsSetOrg()) {
2093 const auto& org = rdata.GetOrg();
2094 if (NStr::StartsWith(org.GetTaxname(), host)) {
2095 // do nothing, all good
2096 } else if (IsCommonName(rdata)) {
2097 // not actionable
2098 } else if (FindMatchInOrgRef(host, org)) {
2099 // replace with synonym
2100 err_str = "Specific host value is alternate name: " +
2101 orig_host + " should be " +
2102 org.GetTaxname();
2103 } else {
2104 err_str = "Specific host value is incorrectly capitalized: " +
2105 (NStr::IsBlank(orig_host) ? host : orig_host);
2106 }
2107 } else {
2108 err_str = "Invalid value for specific host: " +
2109 (NStr::IsBlank(orig_host) ? host : orig_host);
2110 }
2111 }
2112 return err_str;
2113 }
2114
2115
IsCommon(const COrg_ref & org,const string & val)2116 bool IsCommon(const COrg_ref& org, const string& val)
2117 {
2118 bool is_common = false;
2119 if (org.IsSetCommon() && NStr::EqualNocase(val, org.GetCommon())) {
2120 // common name, not genus
2121 is_common = true;
2122 } else if (org.IsSetOrgMod()) {
2123 for (auto& it : org.GetOrgname().GetMod()) {
2124 if (it->IsSetSubtype() &&
2125 it->GetSubtype() == COrgMod::eSubtype_common &&
2126 it->IsSetSubname() &&
2127 NStr::EqualNocase(it->GetSubname(), val)) {
2128 is_common = true;
2129 break;
2130 }
2131 }
2132 }
2133 return is_common;
2134 }
2135
2136
IsLikelyTaxname(const string & val)2137 bool IsLikelyTaxname(const string& val)
2138 {
2139 if (!isalpha(val[0])) {
2140 return false;
2141 }
2142 size_t pos = NStr::Find(val, " ");
2143 if (pos == NPOS) {
2144 return false;
2145 }
2146
2147 CTaxon1 taxon1;
2148 taxon1.Init();
2149 TTaxId taxid = taxon1.GetTaxIdByName(val.substr(0, pos));
2150 if (taxid == ZERO_TAX_ID || taxid == INVALID_TAX_ID) {
2151 return false;
2152 }
2153
2154 bool is_species = false;
2155 bool is_uncultured = false;
2156 string blast_name;
2157 CConstRef<COrg_ref> org = taxon1.GetOrgRef(taxid, is_species, is_uncultured, blast_name);
2158 if (org && IsCommon(*org, val.substr(0, pos))) {
2159 return false;
2160 } else {
2161 return true;
2162 }
2163 }
2164
2165
2166 //LCOV_EXCL_START
2167 //not used by asnvalidate but used by other applications
IsSpecificHostValid(const string & val,string & error_msg)2168 bool IsSpecificHostValid(const string& val, string& error_msg)
2169 {
2170 CTaxValidationAndCleanup tval;
2171 return tval.IsOneSpecificHostValid(val, error_msg);
2172 }
2173
2174
FixSpecificHost(const string & val)2175 string FixSpecificHost(const string& val)
2176 {
2177 string hostfix = val;
2178 validator::CTaxValidationAndCleanup tval;
2179 tval.FixOneSpecificHost(hostfix);
2180
2181 return hostfix;
2182 }
2183
2184
s_ConvertChar(char ch)2185 static char s_ConvertChar(char ch)
2186 {
2187 if (ch < 0x02 || ch > 0x7F) {
2188 // no change
2189 }
2190 else if (isalpha(ch)) {
2191 ch = tolower(ch);
2192 }
2193 else if (isdigit(ch)) {
2194 // no change
2195 }
2196 else if (ch == '\'' || ch == '/' || ch == '@' || ch == '`' || ch == ',') {
2197 // no change
2198 }
2199 else {
2200 ch = 0x20;
2201 }
2202 return ch;
2203 }
2204
2205
ConvertToEntrezTerm(string & title)2206 void ConvertToEntrezTerm(string& title)
2207 {
2208 string::iterator s = title.begin();
2209 char p = ' ';
2210 while (s != title.end()) {
2211 *s = s_ConvertChar(*s);
2212 if (isspace(*s) && isspace(p)) {
2213 s = title.erase(s);
2214 }
2215 else {
2216 p = *s;
2217 ++s;
2218 }
2219 }
2220 NStr::TruncateSpacesInPlace(title);
2221 }
2222 //LCOV_EXCL_STOP
2223
2224
FixGeneticCode(CCdregion & cdr)2225 void FixGeneticCode(CCdregion& cdr)
2226 {
2227 if (!cdr.IsSetCode()) {
2228 return;
2229 }
2230 const auto& gcode = cdr.GetCode();
2231 CGenetic_code::C_E::TId genCode = 0;
2232 for (auto& it : gcode.Get()) {
2233 if (it->IsId()) {
2234 genCode = it->GetId();
2235 }
2236 }
2237
2238 if (genCode == 7) {
2239 genCode = 4;
2240 } else if (genCode == 8) {
2241 genCode = 1;
2242 } else if (genCode == 0) {
2243 genCode = 1;
2244 }
2245 cdr.ResetCode();
2246 CRef<CGenetic_code::C_E> new_code(new CGenetic_code::C_E());
2247 new_code->SetId(genCode);
2248 cdr.SetCode().Set().push_back(new_code);
2249 }
2250
2251
TranslateCodingRegionForValidation(const CSeq_feat & feat,CScope & scope,bool & alt_start)2252 string TranslateCodingRegionForValidation(const CSeq_feat& feat, CScope &scope, bool& alt_start)
2253 {
2254 string transl_prot;
2255 CRef<CSeq_feat> tmp_cds(new CSeq_feat());
2256 tmp_cds->Assign(feat);
2257 FixGeneticCode(tmp_cds->SetData().SetCdregion());
2258 const CCdregion& cdregion = tmp_cds->GetData().GetCdregion();
2259 const CSeq_loc& cds_loc = tmp_cds->GetLocation();
2260 if (cds_loc.IsWhole()) {
2261 CBioseq_Handle bsh = scope.GetBioseqHandle(cds_loc.GetWhole());
2262 if (!bsh) {
2263 return kEmptyStr;
2264 }
2265 size_t start = 0;
2266 if (cdregion.IsSetFrame()) {
2267 if (cdregion.GetFrame() == 2) {
2268 start = 1;
2269 } else if (cdregion.GetFrame() == 3) {
2270 start = 2;
2271 }
2272 }
2273 const CGenetic_code* genetic_code = NULL;
2274 if (cdregion.IsSetCode()) {
2275 genetic_code = &(cdregion.GetCode());
2276 }
2277 CRef<CSeq_id> id(new CSeq_id());
2278 id->Assign(cds_loc.GetWhole());
2279 CRef<CSeq_loc> tmp(new CSeq_loc(*id, start, bsh.GetInst_Length() - 1));
2280 CSeqTranslator::Translate(*tmp, scope, transl_prot, genetic_code, true, false, &alt_start);
2281 } else {
2282 CSeqTranslator::Translate(*tmp_cds, scope, transl_prot,
2283 true, // include stop codons
2284 false, // do not remove trailing X/B/Z
2285 &alt_start);
2286 }
2287
2288 return transl_prot;
2289 }
2290
2291
HasBadStartCodon(const CSeq_loc & loc,const string & transl_prot)2292 bool HasBadStartCodon(const CSeq_loc& loc, const string& transl_prot)
2293 {
2294 bool got_dash = (transl_prot[0] == '-');
2295 bool got_x = (transl_prot[0] == 'X'
2296 && !loc.IsPartialStart(eExtreme_Biological));
2297
2298 if (!got_dash && !got_x) {
2299 return false;
2300 }
2301 return true;
2302 }
2303
2304
2305 static const char * kUnclassifiedTranslationDiscrepancy = "unclassified translation discrepancy";
2306
2307 static const char* const sc_BypassCdsTransCheckText[] = {
2308 "RNA editing",
2309 "adjusted for low-quality genome",
2310 "annotated by transcript or proteomic data",
2311 "rearrangement required for product",
2312 "reasons given in citation",
2313 "translated product replaced",
2314 kUnclassifiedTranslationDiscrepancy
2315 };
2316 typedef CStaticArraySet<const char*, PCase_CStr> TBypassCdsTransCheckSet;
2317 DEFINE_STATIC_ARRAY_MAP(TBypassCdsTransCheckSet, sc_BypassCdsTransCheck, sc_BypassCdsTransCheckText);
2318
2319 static const char* const sc_ForceCdsTransCheckText[] = {
2320 "artificial frameshift",
2321 "mismatches in translation"
2322 };
2323 typedef CStaticArraySet<const char*, PCase_CStr> TForceCdsTransCheckSet;
2324 DEFINE_STATIC_ARRAY_MAP(TForceCdsTransCheckSet, sc_ForceCdsTransCheck, sc_ForceCdsTransCheckText);
2325
ReportTranslationErrors(const string & except_text)2326 bool ReportTranslationErrors(const string& except_text)
2327 {
2328 bool report = true;
2329 ITERATE(TBypassCdsTransCheckSet, it, sc_BypassCdsTransCheck) {
2330 if (NStr::FindNoCase(except_text, *it) != NPOS) {
2331 report = false;
2332 }
2333 }
2334 if (!report) {
2335 ITERATE(TForceCdsTransCheckSet, it, sc_ForceCdsTransCheck) {
2336 if (NStr::FindNoCase(except_text, *it) != NPOS) {
2337 report = true;
2338 }
2339 }
2340 }
2341 return report;
2342 }
2343
2344
2345 //LCOV_EXCL_START
2346 //not used by asnvalidate but used by other applications
HasBadStartCodon(const CSeq_feat & feat,CScope & scope,bool ignore_exceptions)2347 bool HasBadStartCodon(const CSeq_feat& feat, CScope& scope, bool ignore_exceptions)
2348 {
2349 if (!feat.IsSetData() || !feat.GetData().IsCdregion()) {
2350 return false;
2351 }
2352 // do not validate for pseudo gene
2353 FOR_EACH_GBQUAL_ON_FEATURE(it, feat) {
2354 if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
2355 return false;
2356 }
2357 }
2358
2359 if (!ignore_exceptions && feat.CanGetExcept() && feat.GetExcept() &&
2360 feat.CanGetExcept_text()) {
2361 if (!ReportTranslationErrors(feat.GetExcept_text())) {
2362 return false;
2363 }
2364 }
2365
2366 bool alt_start = false;
2367 string transl_prot;
2368 try {
2369 transl_prot = TranslateCodingRegionForValidation(feat, scope, alt_start);
2370 } catch (CException& ) {
2371 return false;
2372 }
2373 return HasBadStartCodon(feat.GetLocation(), transl_prot);
2374 }
2375 //LCOV_EXCL_STOP
2376
2377
CountInternalStopCodons(const string & transl_prot)2378 size_t CountInternalStopCodons(const string& transl_prot)
2379 {
2380 if (NStr::IsBlank(transl_prot)) {
2381 return 0;
2382 }
2383 // count internal stops and Xs
2384 size_t internal_stop_count = 0;
2385
2386 ITERATE(string, it, transl_prot) {
2387 if (*it == '*') {
2388 ++internal_stop_count;
2389 }
2390 }
2391 // if stop at end, reduce count by one (since one of the stops counted isn't internal)
2392 if (transl_prot[transl_prot.length() - 1] == '*') {
2393 --internal_stop_count;
2394 }
2395 return internal_stop_count;
2396 }
2397
2398
2399 //LCOV_EXCL_START
2400 //not used by asnvalidate but used by other applications
HasInternalStop(const CSeq_feat & feat,CScope & scope,bool ignore_exceptions)2401 bool HasInternalStop(const CSeq_feat& feat, CScope& scope, bool ignore_exceptions)
2402 {
2403 if (!feat.IsSetData() || !feat.GetData().IsCdregion()) {
2404 return false;
2405 }
2406 // do not validate for pseudo gene
2407 FOR_EACH_GBQUAL_ON_FEATURE(it, feat) {
2408 if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
2409 return false;
2410 }
2411 }
2412
2413 if (!ignore_exceptions && feat.CanGetExcept() && feat.GetExcept() &&
2414 feat.CanGetExcept_text()) {
2415 const string& except_text = feat.GetExcept_text();
2416 if (NStr::Find(except_text, kUnclassifiedTranslationDiscrepancy) == string::npos
2417 && !ReportTranslationErrors(feat.GetExcept_text())) {
2418 return false;
2419 }
2420 }
2421
2422 bool alt_start = false;
2423 string transl_prot;
2424 try {
2425 transl_prot = TranslateCodingRegionForValidation(feat, scope, alt_start);
2426 } catch (CException& ) {
2427 return false;
2428 }
2429
2430 size_t internal_stop_codons = CountInternalStopCodons(transl_prot);
2431 if (internal_stop_codons > 0) {
2432 return true;
2433 } else {
2434 return false;
2435 }
2436 }
2437 //LCOV_EXCL_STOP
2438
2439
MakeSeqVectorForResidueCounting(const CBioseq_Handle & bsh)2440 CRef<CSeqVector> MakeSeqVectorForResidueCounting(const CBioseq_Handle& bsh)
2441 {
2442 CRef<CSeqVector> sv(new CSeqVector(bsh, CBioseq_Handle::eCoding_Iupac));
2443 CSeq_data::E_Choice seqtyp = bsh.GetInst().IsSetSeq_data() ?
2444 bsh.GetInst().GetSeq_data().Which() : CSeq_data::e_not_set;
2445 if (seqtyp == CSeq_data::e_Ncbieaa || seqtyp == CSeq_data::e_Ncbistdaa) {
2446 sv->SetCoding(CSeq_data::e_Ncbieaa);
2447 }
2448 return sv;
2449 }
2450
2451
HasBadProteinStart(const CSeqVector & sv)2452 bool HasBadProteinStart(const CSeqVector& sv)
2453 {
2454 if (sv.size() < 1) {
2455 return false;
2456 } else if (sv.IsInGap(0) || sv[0] == '-') {
2457 return true;
2458 } else {
2459 return false;
2460 }
2461 }
2462
2463
2464 //LCOV_EXCL_START
2465 //not used by asnvalidate but used by other applications
HasBadProteinStart(const CSeq_feat & feat,CScope & scope)2466 bool HasBadProteinStart(const CSeq_feat& feat, CScope& scope)
2467 {
2468 if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
2469 !feat.IsSetProduct()) {
2470 return false;
2471 }
2472 // use try catch for those weird situations where the product is
2473 // not specified as a single product sequence (in which case we
2474 // should just skip this test)
2475 try {
2476 CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetProduct());
2477 if (!bsh.IsAa()) {
2478 return false;
2479 }
2480 CConstRef<CSeqVector> sv = MakeSeqVectorForResidueCounting(bsh);
2481 return HasBadProteinStart(*sv);
2482 } catch (CException& ) {
2483 return false;
2484 }
2485 }
2486 //LCOV_STOP
2487
2488
CountProteinStops(const CSeqVector & sv)2489 size_t CountProteinStops(const CSeqVector& sv)
2490 {
2491 size_t terminations = 0;
2492
2493 for (CSeqVector_CI sv_iter(sv); (sv_iter); ++sv_iter) {
2494 if (*sv_iter == '*') {
2495 terminations++;
2496 }
2497 }
2498 return terminations;
2499 }
2500
2501
2502 //LCOV_EXCL_START
2503 //not used by asnvalidate but used by other applications
HasStopInProtein(const CSeq_feat & feat,CScope & scope)2504 bool HasStopInProtein(const CSeq_feat& feat, CScope& scope)
2505 {
2506 if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
2507 !feat.IsSetProduct()) {
2508 return false;
2509 }
2510 // use try catch for those weird situations where the product is
2511 // not specified as a single product sequence (in which case we
2512 // should just skip this test)
2513 try {
2514 CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetProduct());
2515 if (!bsh.IsAa()) {
2516 return false;
2517 }
2518 CConstRef<CSeqVector> sv = MakeSeqVectorForResidueCounting(bsh);
2519 if (CountProteinStops(*sv) > 0) {
2520 return true;
2521 } else {
2522 return false;
2523 }
2524 } catch (CException& ) {
2525 return false;
2526 }
2527 }
2528 //LCOV_EXCL_STOP
2529
2530
FeatureHasEnds(const CSeq_feat & feat,CScope * scope,bool & no_beg,bool & no_end)2531 void FeatureHasEnds(const CSeq_feat& feat, CScope* scope, bool& no_beg, bool& no_end)
2532 {
2533 unsigned int part_loc = sequence::SeqLocPartialCheck(feat.GetLocation(), scope);
2534 no_beg = false;
2535 no_end = false;
2536
2537 if (part_loc & sequence::eSeqlocPartial_Start) {
2538 no_beg = true;
2539 }
2540 if (part_loc & sequence::eSeqlocPartial_Stop) {
2541 no_end = true;
2542 }
2543
2544
2545 if ((!no_beg || !no_end) && feat.IsSetProduct()) {
2546 unsigned int part_prod = sequence::SeqLocPartialCheck(feat.GetProduct(), scope);
2547 if (part_prod & sequence::eSeqlocPartial_Start) {
2548 no_beg = true;
2549 }
2550 if (part_prod & sequence::eSeqlocPartial_Stop) {
2551 no_end = true;
2552 }
2553 }
2554 }
2555
2556
2557 //LCOV_EXCL_START
2558 // not used by asnvalidate but needed for other applications
GetCDSProductSequence(const CSeq_feat & feat,CScope * scope,const CTSE_Handle & tse,bool far_fetch,bool & is_far)2559 CBioseq_Handle GetCDSProductSequence(const CSeq_feat& feat, CScope* scope, const CTSE_Handle & tse, bool far_fetch, bool& is_far)
2560 {
2561 CBioseq_Handle prot_handle;
2562 is_far = false;
2563 if (!feat.IsSetProduct()) {
2564 return prot_handle;
2565 }
2566 const CSeq_id* protid = NULL;
2567 try {
2568 protid = &sequence::GetId(feat.GetProduct(), scope);
2569 } catch (CException&) {}
2570 if (protid != NULL) {
2571 prot_handle = scope->GetBioseqHandleFromTSE(*protid, tse);
2572 if (!prot_handle && far_fetch) {
2573 prot_handle = scope->GetBioseqHandle(*protid);
2574 is_far = true;
2575 }
2576 }
2577 return prot_handle;
2578 }
2579 //LCOV_EXCL_STOP
2580
2581
CalculateEffectiveTranslationLengths(const string & transl_prot,const CSeqVector & prot_vec,size_t & len,size_t & prot_len)2582 void CalculateEffectiveTranslationLengths(const string& transl_prot, const CSeqVector& prot_vec, size_t &len, size_t& prot_len)
2583 {
2584 len = transl_prot.length();
2585 prot_len = prot_vec.size();
2586
2587 if (NStr::EndsWith(transl_prot, "*") && (len == prot_len + 1)) { // ok, got stop
2588 --len;
2589 }
2590 while (len > 0) {
2591 if (transl_prot[len - 1] == 'X') { //remove terminal X
2592 --len;
2593 } else {
2594 break;
2595 }
2596 }
2597
2598 // ignore terminal 'X' from partial last codon if present
2599 while (prot_len > 0) {
2600 if (prot_vec[prot_len - 1] == 'X') { //remove terminal X
2601 --prot_len;
2602 } else {
2603 break;
2604 }
2605 }
2606 }
2607
2608
2609 //LCOV_EXCL_START
2610 // not used by asnvalidate but needed for other applications
GetMismatches(const CSeq_feat & feat,const CSeqVector & prot_vec,const string & transl_prot)2611 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CSeqVector& prot_vec, const string& transl_prot)
2612 {
2613 vector<TSeqPos> mismatches;
2614 size_t prot_len;
2615 size_t len;
2616
2617 CalculateEffectiveTranslationLengths(transl_prot, prot_vec, len, prot_len);
2618
2619 if (len == prot_len) { // could be identical
2620 for (TSeqPos i = 0; i < len; ++i) {
2621 CSeqVectorTypes::TResidue p_res = prot_vec[i];
2622 CSeqVectorTypes::TResidue t_res = transl_prot[i];
2623
2624 if (t_res != p_res) {
2625 if (i == 0) {
2626 bool no_beg, no_end;
2627 FeatureHasEnds(feat, &(prot_vec.GetScope()), no_beg, no_end);
2628 if (feat.IsSetPartial() && feat.GetPartial() && (!no_beg) && (!no_end)) {
2629 } else if (t_res == '-') {
2630 } else {
2631 mismatches.push_back(i);
2632 }
2633 } else {
2634 mismatches.push_back(i);
2635 }
2636 }
2637 }
2638 }
2639 return mismatches;
2640 }
2641
2642
GetMismatches(const CSeq_feat & feat,const CBioseq_Handle & prot_handle,const string & transl_prot)2643 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CBioseq_Handle& prot_handle, const string& transl_prot)
2644 {
2645 vector<TSeqPos> mismatches;
2646 // can't check for mismatches unless there is a product
2647 if (!prot_handle || !prot_handle.IsAa()) {
2648 return mismatches;
2649 }
2650
2651 CSeqVector prot_vec = prot_handle.GetSeqVector();
2652 prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
2653
2654 return GetMismatches(feat, prot_vec, transl_prot);
2655 }
2656
2657
HasNoStop(const CSeq_feat & feat,CScope * scope)2658 bool HasNoStop(const CSeq_feat& feat, CScope* scope)
2659 {
2660 bool no_beg, no_end;
2661 FeatureHasEnds(feat, scope, no_beg, no_end);
2662 if (no_end) {
2663 return false;
2664 }
2665
2666 string transl_prot;
2667 bool alt_start;
2668 try {
2669 transl_prot = TranslateCodingRegionForValidation(feat, *scope, alt_start);
2670 } catch (CException& ) {
2671 }
2672 if (NStr::EndsWith(transl_prot, "*")) {
2673 return false;
2674 }
2675
2676 bool show_stop = true;
2677 if (!no_beg && feat.IsSetPartial() && feat.GetPartial()) {
2678 CBioseq_Handle prot_handle;
2679 try {
2680 CBioseq_Handle bsh = scope->GetBioseqHandle(feat.GetLocation());
2681 const CTSE_Handle tse = bsh.GetTSE_Handle();
2682 bool is_far = false;
2683 prot_handle = GetCDSProductSequence(feat, scope, tse, true, is_far);
2684 if (prot_handle) {
2685 vector<TSeqPos> mismatches = GetMismatches(feat, prot_handle, transl_prot);
2686 if (mismatches.size() == 0) {
2687 show_stop = false;
2688 }
2689 }
2690 } catch (CException& ) {
2691 }
2692 }
2693
2694 return show_stop;
2695 }
2696 //LCOV_EXCL_STOP
2697
2698
IsSequenceFetchable(const CSeq_id & id)2699 bool IsSequenceFetchable(const CSeq_id& id)
2700 {
2701 bool fetchable = false;
2702 try {
2703 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(id);
2704 CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
2705 scope->AddDefaults();
2706 CBioseq_Handle bsh = scope->GetBioseqHandle(idh);
2707 if (bsh) {
2708 fetchable = true;
2709 }
2710 } catch (CException& ) {
2711 } catch (std::exception &) {
2712 }
2713 return fetchable;
2714 }
2715
2716
IsSequenceFetchable(const string & seq_id)2717 bool IsSequenceFetchable(const string& seq_id)
2718 {
2719 bool fetchable = false;
2720 try {
2721 CRef<CSeq_id> id(new CSeq_id(seq_id));
2722 if (id) {
2723 fetchable = IsSequenceFetchable(*id);
2724 }
2725 } catch (CException& ) {
2726 } catch (std::exception &) {
2727 }
2728 return fetchable;
2729 }
2730
2731
IsNTNCNWACAccession(const string & acc)2732 bool IsNTNCNWACAccession(const string& acc)
2733 {
2734 if (NStr::StartsWith(acc, "NT_") || NStr::StartsWith(acc, "NC_") ||
2735 NStr::StartsWith(acc, "AC_") || NStr::StartsWith(acc, "NW_")) {
2736 return true;
2737 } else {
2738 return false;
2739 }
2740 }
2741
2742
IsNTNCNWACAccession(const CSeq_id & id)2743 bool IsNTNCNWACAccession(const CSeq_id& id)
2744 {
2745 if (id.IsOther() && id.GetOther().IsSetAccession() &&
2746 IsNTNCNWACAccession(id.GetOther().GetAccession())) {
2747 return true;
2748 } else {
2749 return false;
2750 }
2751 }
2752
2753
IsNTNCNWACAccession(const CBioseq & seq)2754 bool IsNTNCNWACAccession(const CBioseq& seq)
2755 {
2756 bool is_it = false;
2757 FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2758 if (IsNTNCNWACAccession(**id_it)) {
2759 is_it = true;
2760 break;
2761 }
2762 }
2763 return is_it;
2764 }
2765
2766
IsNG(const CSeq_id & id)2767 bool IsNG(const CSeq_id& id)
2768 {
2769 if (id.IsOther() && id.GetOther().IsSetAccession() &&
2770 NStr::StartsWith(id.GetOther().GetAccession(), "NG_")) {
2771 return true;
2772 } else {
2773 return false;
2774 }
2775 }
2776
2777
IsNG(const CBioseq & seq)2778 bool IsNG(const CBioseq& seq)
2779 {
2780 bool is_it = false;
2781 FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2782 if (IsNG(**id_it)) {
2783 is_it = true;
2784 break;
2785 }
2786 }
2787 return is_it;
2788 }
2789
2790
2791 // See VR-728. These Seq-ids are temporary and will be stripped
2792 // by the ID Load process, so they should not be the only Seq-id
2793 // on a Bioseq, and feature locations should not use these.
IsTemporary(const CSeq_id & id)2794 bool IsTemporary(const CSeq_id& id)
2795 {
2796 if (id.IsGeneral() && id.GetGeneral().IsSetDb()) {
2797 const string& db = id.GetGeneral().GetDb();
2798 if (NStr::EqualNocase(db, "TMSMART") ||
2799 NStr::EqualNocase(db, "NCBIFILE") ||
2800 NStr::EqualNocase(db, "BankIt")) {
2801 return true;
2802 }
2803 }
2804 return false;
2805 }
2806
2807
IsOrganelle(int genome)2808 bool IsOrganelle(int genome)
2809 {
2810 bool rval = false;
2811 switch (genome) {
2812 case CBioSource::eGenome_chloroplast:
2813 case CBioSource::eGenome_chromoplast:
2814 case CBioSource::eGenome_kinetoplast:
2815 case CBioSource::eGenome_mitochondrion:
2816 case CBioSource::eGenome_cyanelle:
2817 case CBioSource::eGenome_nucleomorph:
2818 case CBioSource::eGenome_apicoplast:
2819 case CBioSource::eGenome_leucoplast:
2820 case CBioSource::eGenome_proplastid:
2821 case CBioSource::eGenome_hydrogenosome:
2822 case CBioSource::eGenome_chromatophore:
2823 case CBioSource::eGenome_plastid:
2824 rval = true;
2825 break;
2826 default:
2827 rval = false;
2828 break;
2829 }
2830 return rval;
2831 }
2832
2833
IsOrganelle(const CBioseq_Handle & seq)2834 bool IsOrganelle(const CBioseq_Handle& seq)
2835 {
2836 if (!seq) {
2837 return false;
2838 }
2839 bool rval = false;
2840 CSeqdesc_CI sd(seq, CSeqdesc::e_Source);
2841 if (sd && sd->GetSource().IsSetGenome() && IsOrganelle(sd->GetSource().GetGenome())) {
2842 rval = true;
2843 }
2844 return rval;
2845 }
2846
2847
ConsistentWithA(Char ch)2848 bool ConsistentWithA(Char ch)
2849
2850 {
2851 return (bool)(strchr("ANRMWHVD", ch) != NULL);
2852 }
2853
ConsistentWithC(Char ch)2854 bool ConsistentWithC(Char ch)
2855
2856 {
2857 return (bool)(strchr("CNYMSHBV", ch) != NULL);
2858 }
2859
ConsistentWithG(Char ch)2860 bool ConsistentWithG(Char ch)
2861
2862 {
2863 return (bool)(strchr("GNRKSBVD", ch) != NULL);
2864 }
2865
ConsistentWithT(Char ch)2866 bool ConsistentWithT(Char ch)
2867
2868 {
2869 return (bool)(strchr("TNYKWHBD", ch) != NULL);
2870 }
2871
2872
2873 //LCOV_EXCL_START
2874 //not used by validator, but used by Genome Workbench menu item for
2875 //removing unneccessary exceptions
DoesCodingRegionHaveUnnecessaryException(const CSeq_feat & feat,const CBioseq_Handle & loc_handle,CScope & scope)2876 bool DoesCodingRegionHaveUnnecessaryException(const CSeq_feat& feat, const CBioseq_Handle& loc_handle, CScope& scope)
2877 {
2878 CCDSTranslationProblems problems;
2879 CBioseq_Handle prot_handle;
2880 if (feat.IsSetProduct()) {
2881 prot_handle = scope.GetBioseqHandle(feat.GetProduct());
2882 }
2883
2884 problems.CalculateTranslationProblems(feat,
2885 loc_handle,
2886 prot_handle,
2887 false,
2888 false,
2889 false,
2890 false,
2891 false,
2892 false,
2893 false,
2894 false,
2895 false,
2896 false,
2897 &scope);
2898
2899 return (problems.GetTranslationProblemFlags() & CCDSTranslationProblems::eCDSTranslationProblem_UnnecessaryException);
2900 }
2901
2902
DoesmRNAHaveUnnecessaryException(const CSeq_feat & feat,const CBioseq_Handle & nuc,CScope & scope)2903 bool DoesmRNAHaveUnnecessaryException(const CSeq_feat& feat, const CBioseq_Handle& nuc, CScope& scope)
2904 {
2905 size_t mismatches = 0;
2906 CBioseq_Handle rna;
2907 if (feat.IsSetProduct()) {
2908 rna = scope.GetBioseqHandle(feat.GetProduct());
2909 }
2910
2911 size_t problems = GetMRNATranslationProblems
2912 (feat, mismatches, false,
2913 nuc, rna, false, false, false, &scope);
2914
2915 return (problems & eMRNAProblem_UnnecessaryException);
2916 }
2917
2918
DoesFeatureHaveUnnecessaryException(const CSeq_feat & feat,CScope & scope)2919 bool DoesFeatureHaveUnnecessaryException(const CSeq_feat& feat, CScope& scope)
2920 {
2921 if (!feat.IsSetExcept_text()) {
2922 return false;
2923 }
2924 if (!feat.IsSetData()) {
2925 return false;
2926 }
2927 if (!feat.IsSetLocation()) {
2928 return false;
2929 }
2930 try {
2931 CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetLocation());
2932 if (!bsh) {
2933 return false;
2934 }
2935 CSpliceProblems splice_problems;
2936 splice_problems.CalculateSpliceProblems(feat, true, sequence::IsPseudo(feat, scope), bsh);
2937 if (splice_problems.IsExceptionUnnecessary()) {
2938 return true;
2939 }
2940 if (feat.GetData().IsCdregion()) {
2941 return DoesCodingRegionHaveUnnecessaryException(feat, bsh, scope);
2942 } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
2943 return DoesmRNAHaveUnnecessaryException(feat, bsh, scope);
2944 } else {
2945 return false;
2946 }
2947 } catch (CException&) {
2948 }
2949 return false;
2950 }
2951 //LCOV_EXCL_STOP
2952
2953
2954 END_SCOPE(validator)
2955 END_SCOPE(objects)
2956 END_NCBI_SCOPE
2957