1 /* $Id: cleanup.cpp 632626 2021-06-03 17:38:42Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Robert Smith
27 *
28 * File Description:
29 * Basic Cleanup of CSeq_entries.
30 *
31 */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <serial/serialbase.hpp>
35 #include <objects/seq/Bioseq.hpp>
36 #include <objects/seq/Seq_annot.hpp>
37 // included for GetPubdescLabels and GetCitationList
38 #include <objects/pub/Pub.hpp>
39 #include <objects/pub/Pub_equiv.hpp>
40 #include <objects/seq/Pubdesc.hpp>
41 #include <objects/biblio/Author.hpp>
42 #include <objects/biblio/Auth_list.hpp>
43 #include <objects/general/Person_id.hpp>
44 #include <objects/general/Name_std.hpp>
45 #include <objects/misc/sequence_macros.hpp>
46
47 #include <objects/seqset/Seq_entry.hpp>
48 #include <objects/seqset/Bioseq_set.hpp>
49 #include <objects/seqset/seqset_macros.hpp>
50 #include <objects/seqfeat/Org_ref.hpp>
51 #include <objects/seqfeat/Seq_feat.hpp>
52 #include <objects/seqfeat/SeqFeatXref.hpp>
53 #include <objects/general/Object_id.hpp>
54 #include <objects/general/User_object.hpp>
55 #include <objects/submit/Seq_submit.hpp>
56 #include <objects/taxon3/taxon3.hpp>
57
58 #include <objmgr/object_manager.hpp>
59 #include <objmgr/util/sequence.hpp>
60 #include <objmgr/util/feature.hpp>
61 #include <objmgr/util/autodef.hpp>
62 #include <objmgr/seq_annot_ci.hpp>
63 #include <objmgr/seqdesc_ci.hpp>
64 #include <objmgr/seq_vector.hpp>
65 #include <objmgr/seq_vector_ci.hpp>
66 #include <objtools/edit/cds_fix.hpp>
67 #include <objtools/cleanup/cleanup.hpp>
68 #include "cleanup_utils.hpp"
69 #include <objtools/cleanup/cleanup_message.hpp>
70
71 #include <util/strsearch.hpp>
72
73 #include "newcleanupp.hpp"
74
75 #include <objtools/logging/listener.hpp>
76
77 BEGIN_NCBI_SCOPE
78 BEGIN_SCOPE(objects)
79
/// File-local change-type enumeration; only a single "unknown" value is defined.
enum EChangeType { eChange_UNKNOWN };
83
84 // *********************** CCleanup implementation **********************
85
86
CCleanup(CScope * scope,EScopeOptions scope_handling)87 CCleanup::CCleanup(CScope* scope, EScopeOptions scope_handling)
88 {
89 if (scope && scope_handling == eScope_UseInPlace) {
90 m_Scope = scope;
91 }
92 else {
93 m_Scope = new CScope(*(CObjectManager::GetInstance()));
94 if (scope) {
95 m_Scope->AddScope(*scope);
96 }
97 }
98 }
99
100
~CCleanup(void)101 CCleanup::~CCleanup(void)
102 {
103 }
104
105
SetScope(CScope * scope)106 void CCleanup::SetScope(CScope* scope)
107 {
108 m_Scope.Reset(new CScope(*(CObjectManager::GetInstance())));
109 if (scope) {
110 m_Scope->AddScope(*scope);
111 }
112 }
113
114
115 static
makeCleanupChange(Uint4 options)116 CRef<CCleanupChange> makeCleanupChange(Uint4 options)
117 {
118 CRef<CCleanupChange> changes;
119 if (! (options & CCleanup::eClean_NoReporting)) {
120 changes.Reset(new CCleanupChange);
121 }
122 return changes;
123 }
124
// Common prologue for CCleanup member functions that operate on a bare object.
// Expects `options` and `m_Scope` to be visible at the point of expansion and
// declares the locals `changes` and `clean_i`.
#define CLEANUP_SETUP                                           \
    CRef<CCleanupChange> changes(makeCleanupChange(options));   \
    CNewCleanup_imp clean_i(changes, options);                  \
    clean_i.SetScope(*m_Scope);
129
BasicCleanup(CSeq_entry & se,Uint4 options)130 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_entry& se, Uint4 options)
131 {
132 CLEANUP_SETUP
133 clean_i.BasicCleanupSeqEntry(se);
134 return changes;
135 }
136
137
BasicCleanup(CSeq_submit & ss,Uint4 options)138 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_submit& ss, Uint4 options)
139 {
140 CLEANUP_SETUP
141 clean_i.BasicCleanupSeqSubmit(ss);
142 return changes;
143 }
144
145
BasicCleanup(CSubmit_block & block,Uint4 options)146 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSubmit_block& block, Uint4 options)
147 {
148 CLEANUP_SETUP
149 clean_i.BasicCleanupSubmitblock(block);
150 return changes;
151 }
152
153
154 /// Cleanup a Bioseq.
BasicCleanup(CBioseq & bs,Uint4 options)155 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CBioseq& bs, Uint4 options)
156 {
157 CLEANUP_SETUP
158 clean_i.BasicCleanupBioseq(bs);
159 return changes;
160 }
161
162
BasicCleanup(CBioseq_set & bss,Uint4 options)163 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CBioseq_set& bss, Uint4 options)
164 {
165 CLEANUP_SETUP
166 clean_i.BasicCleanupBioseqSet(bss);
167 return changes;
168 }
169
170
BasicCleanup(CSeq_annot & sa,Uint4 options)171 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_annot& sa, Uint4 options)
172 {
173 CLEANUP_SETUP
174 clean_i.BasicCleanupSeqAnnot(sa);
175 return changes;
176 }
177
178
BasicCleanup(CSeq_feat & sf,Uint4 options)179 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_feat& sf, Uint4 options)
180 {
181 CLEANUP_SETUP
182 clean_i.BasicCleanupSeqFeat(sf);
183 return changes;
184 }
185
186
BasicCleanup(CBioSource & src,Uint4 options)187 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CBioSource& src, Uint4 options)
188 {
189 CLEANUP_SETUP
190 clean_i.BasicCleanupBioSource(src);
191 return changes;
192 }
193
194
BasicCleanup(CSeq_entry_Handle & seh,Uint4 options)195 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_entry_Handle& seh, Uint4 options)
196 {
197 CRef<CCleanupChange> changes(makeCleanupChange(options));
198 CNewCleanup_imp clean_i(changes, options);
199 clean_i.SetScope(seh.GetScope());
200 clean_i.BasicCleanupSeqEntryHandle(seh);
201 return changes;
202 }
203
204
BasicCleanup(CBioseq_Handle & bsh,Uint4 options)205 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CBioseq_Handle& bsh, Uint4 options)
206 {
207 CRef<CCleanupChange> changes(makeCleanupChange(options));
208 CNewCleanup_imp clean_i(changes, options);
209 clean_i.SetScope(bsh.GetScope());
210 clean_i.BasicCleanupBioseqHandle(bsh);
211 return changes;
212 }
213
214
BasicCleanup(CBioseq_set_Handle & bssh,Uint4 options)215 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CBioseq_set_Handle& bssh, Uint4 options)
216 {
217 CRef<CCleanupChange> changes(makeCleanupChange(options));
218 CNewCleanup_imp clean_i(changes, options);
219 clean_i.SetScope(bssh.GetScope());
220 clean_i.BasicCleanupBioseqSetHandle(bssh);
221 return changes;
222 }
223
224
BasicCleanup(CSeq_annot_Handle & sah,Uint4 options)225 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_annot_Handle& sah, Uint4 options)
226 {
227 CRef<CCleanupChange> changes(makeCleanupChange(options));
228 CNewCleanup_imp clean_i(changes, options);
229 clean_i.SetScope(sah.GetScope());
230 clean_i.BasicCleanupSeqAnnotHandle(sah);
231 return changes;
232 }
233
234
BasicCleanup(CSeq_feat_Handle & sfh,Uint4 options)235 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_feat_Handle& sfh, Uint4 options)
236 {
237 CRef<CCleanupChange> changes(makeCleanupChange(options));
238 CNewCleanup_imp clean_i(changes, options);
239 clean_i.SetScope(sfh.GetScope());
240 clean_i.BasicCleanupSeqFeatHandle(sfh);
241 return changes;
242 }
243
244
BasicCleanup(CSeqdesc & desc,Uint4 options)245 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeqdesc& desc, Uint4 options)
246 {
247 CLEANUP_SETUP
248 clean_i.BasicCleanup(desc);
249 return changes;
250
251 }
252
253
BasicCleanup(CSeq_descr & desc,Uint4 options)254 CConstRef<CCleanupChange> CCleanup::BasicCleanup(CSeq_descr & desc, Uint4 options)
255 {
256 CLEANUP_SETUP
257
258 for (auto& it : desc.Set()) {
259 clean_i.BasicCleanup(*it);
260 }
261 return changes;
262 }
263
264
265 // *********************** Extended Cleanup implementation ********************
ExtendedCleanup(CSeq_entry & se,Uint4 options)266 CConstRef<CCleanupChange> CCleanup::ExtendedCleanup(CSeq_entry& se, Uint4 options)
267 {
268 CLEANUP_SETUP
269 clean_i.ExtendedCleanupSeqEntry(se);
270
271 return changes;
272 }
273
274
ExtendedCleanup(CSeq_submit & ss,Uint4 options)275 CConstRef<CCleanupChange> CCleanup::ExtendedCleanup(CSeq_submit& ss, Uint4 options)
276 {
277 CLEANUP_SETUP
278 clean_i.ExtendedCleanupSeqSubmit(ss);
279 return changes;
280 }
281
282
ExtendedCleanup(CSeq_annot & sa,Uint4 options)283 CConstRef<CCleanupChange> CCleanup::ExtendedCleanup(CSeq_annot& sa, Uint4 options)
284 {
285 CLEANUP_SETUP
286 clean_i.ExtendedCleanupSeqAnnot(sa); // (m_Scope->GetSeq_annotHandle(sa));
287 return changes;
288 }
289
ExtendedCleanup(CSeq_entry_Handle & seh,Uint4 options)290 CConstRef<CCleanupChange> CCleanup::ExtendedCleanup(CSeq_entry_Handle& seh, Uint4 options)
291 {
292 CRef<CCleanupChange> changes(makeCleanupChange(options));
293 CNewCleanup_imp clean_i(changes, options);
294 clean_i.ExtendedCleanupSeqEntryHandle(seh); // (m_Scope->GetSeq_annotHandle(sa));
295 return changes;
296 }
297
298
299 // *********************** CCleanupChange implementation **********************
300
301
CCleanupChange()302 CCleanupChange::CCleanupChange()
303 {
304 }
305
306
ChangeCount() const307 size_t CCleanupChange::ChangeCount() const
308 {
309 return m_Changes.count();
310 }
311
312
IsChanged(CCleanupChange::EChanges e) const313 bool CCleanupChange::IsChanged(CCleanupChange::EChanges e) const
314 {
315 return m_Changes.test(e);
316 }
317
318
SetChanged(CCleanupChange::EChanges e)319 void CCleanupChange::SetChanged(CCleanupChange::EChanges e)
320 {
321 m_Changes.set(e);
322 }
323
324
GetAllChanges() const325 vector<CCleanupChange::EChanges> CCleanupChange::GetAllChanges() const
326 {
327 vector<EChanges> result;
328 for (size_t i = eNoChange + 1; i < m_Changes.size(); ++i) {
329 if (m_Changes.test(i)) {
330 result.push_back( (EChanges) i);
331 }
332 }
333 return result;
334 }
335
336
GetAllDescriptions() const337 vector<string> CCleanupChange::GetAllDescriptions() const
338 {
339 vector<string> result;
340 for (size_t i = eNoChange + 1; i < m_Changes.size(); ++i) {
341 if (m_Changes.test(i)) {
342 result.push_back( GetDescription((EChanges) i) );
343 }
344 }
345 return result;
346 }
347
348
GetDescription(EChanges e)349 string CCleanupChange::GetDescription(EChanges e)
350 {
351 if (e <= eNoChange || e >= eNumberofChangeTypes) {
352 return sm_ChangeDesc[eNoChange];
353 }
354 return sm_ChangeDesc[e];
355 }
356
357 // corresponds to the values in CCleanupChange::EChanges.
358 // They must be edited together.
359 const char* const CCleanupChange::sm_ChangeDesc[eNumberofChangeTypes + 1] = {
360 "Invalid Change Code",
361 // set when strings are changed.
362 "Trim Spaces",
363 "Clean Double Quotes",
364 "Append To String",
365 // set when lists are sorted or uniqued.
366 "Clean Qualifiers List",
367 "Clean Dbxrefs List",
368 "Clean CitonFeat List",
369 "Clean Keywords List",
370 "Clean Subsource List",
371 "Clean Orgmod List",
372 // Set when fields are moved or have content changes
373 "Repair BioseqMol", //10
374 "Change Feature Key",
375 "Normalize Authors",
376 "Change Publication",
377 "Change Qualifiers",
378 "Change Dbxrefs",
379 "Change Keywords",
380 "Change Subsource",
381 "Change Orgmod",
382 "Change Exception",
383 "Change Comment", //20
384 // Set when fields are rescued
385 "Change tRna",
386 "Change rRna",
387 "Change ITS",
388 "Change Anticodon",
389 "Change Code Break",
390 "Change Genetic Code",
391 "Copy GeneXref",
392 "Copy ProtXref",
393 // set when locations are repaired
394 "Change Seqloc",
395 "Change Strand", //30
396 "Change WholeLocation",
397 // set when MolInfo descriptors are affected
398 "Change MolInfo Descriptor",
399 // set when prot-xref is removed
400 "Remove ProtXref",
401 // set when gene-xref is removed
402 "Remove GeneXref",
403 // set when protein feature is added
404 "Add Protein Feature",
405 // set when feature is removed
406 "Remove Feature",
407 // set when feature is moved
408 "Move Feature",
409 // set when qualifier is removed
410 "Remove Qualifier",
411 // set when Gene Xref is created
412 "Add GeneXref",
413 // set when descriptor is removed
414 "Remove Descriptor", //40
415 "Remove Keyword",
416 "Add Descriptor",
417 "Move Descriptor",
418 "Convert Feature to Descriptor",
419 "Collapse Set",
420 "Change Feature Location",
421 "Remove Annotation",
422 "Convert Feature",
423 "Remove Comment",
424 "Add BioSource OrgMod", //50
425 "Add BioSource SubSource",
426 "Change BioSource Genome",
427 "Change BioSource Origin",
428 "Change BioSource Other",
429 "Change SeqId",
430 "Remove Empty Publication",
431 "Add Qualifier",
432 "Cleanup Date",
433 "Change BioseqInst",
434 "Remove SeqID", // 60
435 "Add ProtXref",
436 "Change Partial",
437 "Change Prot Names",
438 "Change Prot Activities",
439 "Change Site",
440 "Change PCR Primers",
441 "Change RNA-ref",
442 "Move To Prot Xref",
443 "Compress Spaces",
444 "Strip serial", // 70
445 "Remove Orgmod",
446 "Remove SubSource",
447 "Create Gene Nomenclature",
448 "Clean Seq-feat xref",
449 "Clean User-Object Or -Field",
450 "Letter Case Change",
451 "Change Bioseq-set Class",
452 "Unique Without Sort",
453 "Add RNA-ref",
454 "Change Gene-ref", // 80
455 "Clean Dbtag",
456 "Change Biomol",
457 "Change Cdregion",
458 "Clean EC Number",
459 "Remove Exception",
460 "Add NcbiCleanupObject",
461 "Clean Delta-ext",
462 "Trim Flanking Quotes",
463 "Clean Bioseq Title",
464 "Decode XML", // 90
465 "Remove Dup BioSource",
466 "Clean Org-ref",
467 "Trim Internal Semicolons",
468 "Add SeqFeatXref",
469 "Convert Unstructured Org-ref Modifier",
470 "Change taxname",
471 "Move GO term to GeneOntology object",
472
473 // set when any other change is made.
474 "Change Other",
475 "Invalid Change Code"
476 };
477
478
s_ProcessedFromKey(const string & key)479 CProt_ref::EProcessed s_ProcessedFromKey(const string& key)
480 {
481 if (NStr::Equal(key, "sig_peptide")) {
482 return CProt_ref::eProcessed_signal_peptide;
483 } else if (NStr::Equal(key, "mat_peptide")) {
484 return CProt_ref::eProcessed_mature;
485 } else if (NStr::Equal(key, "transit_peptide")) {
486 return CProt_ref::eProcessed_transit_peptide;
487 } else if (NStr::Equal(key, "preprotein") || NStr::Equal(key, "proprotein")) {
488 return CProt_ref::eProcessed_preprotein;
489 } else if (NStr::Equal(key, "propeptide")) {
490 return CProt_ref::eProcessed_propeptide;
491 } else {
492 return CProt_ref::eProcessed_not_set;
493 }
494 }
495
s_KeyFromProcessed(CProt_ref::EProcessed processed)496 string s_KeyFromProcessed(CProt_ref::EProcessed processed)
497 {
498 switch (processed) {
499 case CProt_ref::eProcessed_mature:
500 return "mat_peptide";
501 break;
502 case CProt_ref::eProcessed_preprotein:
503 return "preprotein";
504 break;
505 case CProt_ref::eProcessed_signal_peptide:
506 return "sig_peptide";
507 break;
508 case CProt_ref::eProcessed_transit_peptide:
509 return "transit_peptide";
510 break;
511 case CProt_ref::eProcessed_propeptide:
512 return "propeptide";
513 break;
514 case CProt_ref::eProcessed_not_set:
515 return kEmptyStr;
516 break;
517 }
518 return kEmptyStr;
519 }
520
521
ConvertProteinToImp(CSeq_feat_Handle fh)522 bool ConvertProteinToImp(CSeq_feat_Handle fh)
523 {
524 if (fh.GetData().IsProt() && fh.GetData().GetProt().IsSetProcessed()) {
525 string key = s_KeyFromProcessed(fh.GetData().GetProt().GetProcessed());
526 if (!NStr::IsBlank(key)) {
527 CRef<CSeq_feat> new_feat(new CSeq_feat());
528 new_feat->Assign(*(fh.GetSeq_feat()));
529 if (fh.GetData().GetProt().IsSetName() && !fh.GetData().GetProt().GetName().empty()) {
530 CRef<CGb_qual> q(new CGb_qual());
531 q->SetQual("product");
532 q->SetVal(fh.GetData().GetProt().GetName().front());
533 new_feat->SetQual().push_back(q);
534 }
535 new_feat->SetData().SetImp().SetKey(key);
536 CSeq_feat_EditHandle efh(fh);
537 efh.Replace(*new_feat);
538 return true;
539 }
540 }
541 return false;
542 }
543
544
s_IsPreprotein(CSeq_feat_Handle fh)545 bool s_IsPreprotein(CSeq_feat_Handle fh)
546 {
547 if (!fh.IsSetData()) {
548 return false;
549 } else if (fh.GetData().IsProt() &&
550 fh.GetData().GetProt().IsSetProcessed() &&
551 fh.GetData().GetProt().GetProcessed() == CProt_ref::eProcessed_preprotein) {
552 return true;
553 } else if (fh.GetData().IsImp() &&
554 fh.GetData().GetImp().IsSetKey() &&
555 s_ProcessedFromKey(fh.GetData().GetImp().GetKey()) == CProt_ref::eProcessed_preprotein) {
556 return true;
557 } else {
558 return false;
559 }
560 }
561
562
RescueProtProductQual(CSeq_feat & feat)563 void RescueProtProductQual(CSeq_feat& feat)
564 {
565 if (!feat.IsSetQual() ||
566 !feat.IsSetData() ||
567 !feat.GetData().IsProt() ||
568 feat.GetData().GetProt().IsSetName()) {
569 return;
570 }
571 CSeq_feat::TQual::iterator it = feat.SetQual().begin();
572 while (it != feat.SetQual().end()) {
573 if ((*it)->IsSetQual() &&
574 NStr::Equal((*it)->GetQual(), "product")) {
575 if ((*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal())) {
576 feat.SetData().SetProt().SetName().push_back((*it)->GetVal());
577 }
578 it = feat.SetQual().erase(it);
579 } else {
580 ++it;
581 }
582 }
583
584 if (feat.SetQual().empty()) {
585 feat.ResetQual();
586 }
587 }
588
589
s_GetCdsByProduct(CScope & scope,const CSeq_loc & product)590 static CConstRef<CSeq_feat> s_GetCdsByProduct(CScope& scope, const CSeq_loc& product)
591 {
592 const bool feat_by_product = true;
593 SAnnotSelector sel(CSeqFeatData::e_Cdregion, feat_by_product);
594 CFeat_CI fi(scope, product, sel);
595 if (fi) {
596 return ConstRef(&(fi->GetOriginalFeature()));
597 }
598 return CConstRef<CSeq_feat>();
599 };
600
s_GetCdsByLocation(CScope & scope,const CSeq_loc & feat_loc)601 static CConstRef<CSeq_feat> s_GetCdsByLocation(CScope& scope, const CSeq_loc& feat_loc)
602 {
603 sequence::TFeatScores cdsScores;
604 sequence::GetOverlappingFeatures(
605 feat_loc,
606 CSeqFeatData::e_Cdregion,
607 CSeqFeatData::eSubtype_cdregion,
608 sequence::eOverlap_Contained,
609 cdsScores,
610 scope);
611
612 if (cdsScores.empty()) {
613 return CConstRef<CSeq_feat>();
614 }
615
616 if (!feat_loc.IsPartialStart(eExtreme_Biological)) {
617 for (auto cdsScore : cdsScores) {
618 if (feature::IsLocationInFrame(scope.GetSeq_featHandle(*cdsScore.second), feat_loc)
619 == feature::eLocationInFrame_InFrame) {
620 return cdsScore.second;
621 }
622 }
623 }
624
625 return cdsScores.front().second;
626 }
627
628
629
MoveFeatToProtein(CSeq_feat_Handle fh)630 bool CCleanup::MoveFeatToProtein(CSeq_feat_Handle fh)
631 {
632 CProt_ref::EProcessed processed = CProt_ref::eProcessed_not_set;
633 if (fh.GetData().IsImp()) {
634 if (!fh.GetData().GetImp().IsSetKey()) {
635 return false;
636 }
637 processed = s_ProcessedFromKey(fh.GetData().GetImp().GetKey());
638 if (processed == CProt_ref::eProcessed_not_set || processed == CProt_ref::eProcessed_preprotein) {
639 return false;
640 }
641 } else if (s_IsPreprotein(fh)) {
642 return ConvertProteinToImp(fh);
643 }
644
645 CBioseq_Handle parent_bsh = fh.GetScope().GetBioseqHandle(fh.GetLocation());
646
647 if (!parent_bsh) {
648 // feature is mispackaged
649 return false;
650 }
651 if (parent_bsh.IsAa()) {
652 // feature is already on protein sequence
653 return false;
654 }
655
656 CConstRef<CSeq_feat> cds;
657 bool matched_by_product = false;
658
659 if (fh.IsSetProduct() &&
660 fh.GetData().IsProt() &&
661 fh.GetData().GetProt().IsSetProcessed() &&
662 fh.GetData().GetProt().GetProcessed() == CProt_ref::eProcessed_mature) {
663 cds = s_GetCdsByProduct(fh.GetScope(), fh.GetProduct());
664 if (cds) {
665 matched_by_product = true;
666 }
667 }
668 if (!matched_by_product) {
669 cds = s_GetCdsByLocation(fh.GetScope(), fh.GetLocation());
670 }
671 if (!cds || !cds->IsSetProduct()) {
672 // there is no overlapping coding region feature, so there is no appropriate
673 // protein sequence to move to
674 return ConvertProteinToImp(fh);
675 }
676
677 bool require_frame = false;
678 if (!require_frame) {
679 ITERATE(CBioseq::TId, id_it, parent_bsh.GetBioseqCore()->GetId()) {
680 if ((*id_it)->IsEmbl() || (*id_it)->IsDdbj()) {
681 require_frame = true;
682 break;
683 }
684 }
685 }
686
687 CRef<CSeq_loc> prot_loc = GetProteinLocationFromNucleotideLocation(fh.GetLocation(), *cds, fh.GetScope(), require_frame);
688
689 if (!prot_loc) {
690 return false;
691 }
692
693 CConstRef<CSeq_feat> orig_feat = fh.GetSeq_feat();
694 CRef<CSeq_feat> new_feat(new CSeq_feat());
695 new_feat->Assign(*orig_feat);
696 if (new_feat->GetData().Which() == CSeqFeatData::e_Imp) {
697 new_feat->SetData().SetProt().SetProcessed(processed);
698 // if possible, rescue product qual
699 RescueProtProductQual(*new_feat);
700 if (processed == CProt_ref::eProcessed_mature &&
701 !new_feat->GetData().GetProt().IsSetName()) {
702 if (orig_feat->IsSetComment() && !NStr::IsBlank(orig_feat->GetComment())) {
703 new_feat->SetData().SetProt().SetName().push_back(orig_feat->GetComment());
704 new_feat->ResetComment();
705 } else {
706 new_feat->SetData().SetProt().SetName().push_back("unnamed");
707 }
708 }
709 }
710
711 // change location to protein
712 new_feat->ResetLocation();
713 new_feat->SetLocation(*prot_loc);
714 SetFeaturePartial(*new_feat);
715 if (matched_by_product) {
716 new_feat->ResetProduct();
717 }
718
719 CSeq_feat_EditHandle edh(fh);
720 edh.Replace(*new_feat);
721 CRef<CCleanupChange> changes(makeCleanupChange(0));
722 CNewCleanup_imp clean_i(changes, 0);
723 clean_i.SetScope(fh.GetScope());
724 clean_i.BasicCleanupSeqFeat(*new_feat);
725
726 CSeq_annot_Handle ah = fh.GetAnnot();
727
728 CBioseq_Handle target_bsh = fh.GetScope().GetBioseqHandle(new_feat->GetLocation());
729 if (!target_bsh) {
730 return false;
731 }
732
733 CBioseq_EditHandle eh = target_bsh.GetEditHandle();
734
735 // Find a feature table on the protein sequence to add the feature to.
736 CSeq_annot_Handle ftable;
737 if (target_bsh.GetCompleteBioseq()->IsSetAnnot()) {
738 ITERATE(CBioseq::TAnnot, annot_it, target_bsh.GetCompleteBioseq()->GetAnnot()) {
739 if ((*annot_it)->IsFtable()) {
740 ftable = fh.GetScope().GetSeq_annotHandle(**annot_it);
741 }
742 }
743 }
744
745 // If there is no feature table present, make one
746 if (!ftable) {
747 CRef<CSeq_annot> new_annot(new CSeq_annot());
748 ftable = eh.AttachAnnot(*new_annot);
749 }
750
751 // add feature to the protein bioseq
752 CSeq_annot_EditHandle aeh(ftable);
753 aeh.TakeFeat(edh);
754
755 // remove old annot if now empty
756 if (CNewCleanup_imp::ShouldRemoveAnnot(*(ah.GetCompleteSeq_annot()))) {
757 CSeq_annot_EditHandle orig(ah);
758 orig.Remove();
759 }
760
761 return true;
762 }
763
764
MoveProteinSpecificFeats(CSeq_entry_Handle seh)765 bool CCleanup::MoveProteinSpecificFeats(CSeq_entry_Handle seh)
766 {
767 bool any_change = false;
768 CBioseq_CI bi(seh, CSeq_inst::eMol_na);
769 while (bi) {
770 SAnnotSelector sel(CSeqFeatData::e_Prot);
771 sel.IncludeFeatType(CSeqFeatData::e_Psec_str);
772 sel.IncludeFeatType(CSeqFeatData::e_Bond);
773 for (CFeat_CI prot_it(*bi, sel); prot_it; ++prot_it) {
774 any_change |= MoveFeatToProtein(*prot_it);
775 }
776 for (CFeat_CI imp_it(*bi, CSeqFeatData::e_Imp); imp_it; ++imp_it) {
777 any_change |= MoveFeatToProtein(*imp_it);
778 }
779 ++bi;
780 }
781 return any_change;
782 }
783
784
IsGeneXrefUnnecessary(const CSeq_feat & sf,CScope & scope,const CGene_ref & gene_xref)785 bool CCleanup::IsGeneXrefUnnecessary(const CSeq_feat& sf, CScope& scope, const CGene_ref& gene_xref)
786 {
787 if (gene_xref.IsSuppressed()) {
788 return false;
789 }
790
791 CConstRef<CSeq_feat> gene = sequence::GetOverlappingGene(sf.GetLocation(), scope);
792 if (!gene || !gene->IsSetData() || !gene->GetData().IsGene()) {
793 return false;
794 }
795
796 if (!gene->GetData().GetGene().RefersToSameGene(gene_xref)) {
797 return false;
798 }
799
800 // see if other gene might also match
801 sequence::TFeatScores scores;
802 sequence::GetOverlappingFeatures(sf.GetLocation(), CSeqFeatData::e_Gene, CSeqFeatData::eSubtype_gene,
803 sequence::eOverlap_Contained, scores, scope);
804 if (scores.size() == 1) {
805 return true;
806 } else if (scores.size() == 0) {
807 return false;
808 }
809
810 ITERATE(sequence::TFeatScores, g, scores) {
811 if (g->second.GetPointer() != gene.GetPointer() &&
812 sequence::Compare(g->second->GetLocation(), gene->GetLocation(), &scope, sequence::fCompareOverlapping) == sequence::eSame) {
813 return false;
814 }
815 }
816 return true;
817 }
818
819
RemoveUnnecessaryGeneXrefs(CSeq_feat & f,CScope & scope)820 bool CCleanup::RemoveUnnecessaryGeneXrefs(CSeq_feat& f, CScope& scope)
821 {
822 if (!f.IsSetXref()) {
823 return false;
824 }
825 bool any_removed = false;
826 CSeq_feat::TXref::iterator xit = f.SetXref().begin();
827 while (xit != f.SetXref().end()) {
828 if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
829 IsGeneXrefUnnecessary(f, scope, (*xit)->GetData().GetGene())) {
830 xit = f.SetXref().erase(xit);
831 any_removed = true;
832 } else {
833 ++xit;
834 }
835 }
836 if (any_removed) {
837 if (f.IsSetXref() && f.GetXref().empty()) {
838 f.ResetXref();
839 }
840 }
841 return any_removed;
842 }
843
844
RemoveUnnecessaryGeneXrefs(CSeq_entry_Handle seh)845 bool CCleanup::RemoveUnnecessaryGeneXrefs(CSeq_entry_Handle seh)
846 {
847 bool any_change = false;
848 CScope& scope = seh.GetScope();
849
850 for (CFeat_CI fi(seh); fi; ++fi) {
851 if (fi->IsSetXref()) {
852 CRef<CSeq_feat> new_feat(new CSeq_feat());
853 new_feat->Assign(*(fi->GetOriginalSeq_feat()));
854 bool any_removed = RemoveUnnecessaryGeneXrefs(*new_feat, scope);
855 if (any_removed) {
856 CSeq_feat_EditHandle edh(*fi);
857 edh.Replace(*new_feat);
858 any_change = true;
859 }
860 }
861 }
862
863 return any_change;
864 }
865
866
867 //LCOV_EXCL_START
868 //not used by asn_cleanup but used by other applications
RemoveNonsuppressingGeneXrefs(CSeq_feat & f)869 bool CCleanup::RemoveNonsuppressingGeneXrefs(CSeq_feat& f)
870 {
871 if (!f.IsSetXref()) {
872 return false;
873 }
874 bool any_removed = false;
875 CSeq_feat::TXref::iterator xit = f.SetXref().begin();
876 while (xit != f.SetXref().end()) {
877 if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
878 !(*xit)->GetData().GetGene().IsSuppressed()) {
879 xit = f.SetXref().erase(xit);
880 any_removed = true;
881 } else {
882 ++xit;
883 }
884 }
885 if (any_removed) {
886 if (f.IsSetXref() && f.GetXref().empty()) {
887 f.ResetXref();
888 }
889 }
890 return any_removed;
891 }
892 //LCOV_EXCL_STOP
893
894
RepairXrefs(const CSeq_feat & src,CSeq_feat_Handle & dst,const CTSE_Handle & tse)895 bool CCleanup::RepairXrefs(const CSeq_feat& src, CSeq_feat_Handle& dst, const CTSE_Handle& tse)
896 {
897 if (!src.IsSetId() || !src.GetId().IsLocal()) {
898 // can't create xref if no ID
899 return false;
900 }
901 if (!CSeqFeatData::AllowXref(src.GetData().GetSubtype(), dst.GetData().GetSubtype())) {
902 // only create reciprocal xrefs if permitted
903 return false;
904 }
905 // don't create xref if already have xref or if dst not gene and already has
906 // xref to feature of same type as src
907 bool has_xref = false;
908 if (dst.IsSetXref()) {
909 ITERATE(CSeq_feat::TXref, xit, dst.GetXref()) {
910 if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
911 if ((*xit)->GetId().Equals(src.GetId())) {
912 // already have xref
913 has_xref = true;
914 break;
915 } else if (!dst.GetData().IsGene()) {
916 const CTSE_Handle::TFeatureId& feat_id = (*xit)->GetId().GetLocal();
917 CTSE_Handle::TSeq_feat_Handles far_feats = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, feat_id);
918 ITERATE(CTSE_Handle::TSeq_feat_Handles, fit, far_feats) {
919 if (fit->GetData().GetSubtype() == src.GetData().GetSubtype()) {
920 has_xref = true;
921 break;
922 }
923 }
924 if (has_xref) {
925 break;
926 }
927 }
928 }
929 }
930 }
931 bool rval = false;
932 if (!has_xref) {
933 // to put into "editing mode"
934 dst.GetAnnot().GetEditHandle();
935 CSeq_feat_EditHandle eh(dst);
936 CRef<CSeq_feat> cpy(new CSeq_feat());
937 cpy->Assign(*(dst.GetSeq_feat()));
938 cpy->AddSeqFeatXref(src.GetId());
939 eh.Replace(*cpy);
940 rval = true;
941 }
942 return rval;
943 }
944
945
RepairXrefs(const CSeq_feat & f,const CTSE_Handle & tse)946 bool CCleanup::RepairXrefs(const CSeq_feat& f, const CTSE_Handle& tse)
947 {
948 bool rval = false;
949
950 if (!f.IsSetId() || !f.IsSetXref()) {
951 return rval;
952 }
953
954 ITERATE(CSeq_feat::TXref, xit, f.GetXref()) {
955 if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
956 const CTSE_Handle::TFeatureId& x_id = (*xit)->GetId().GetLocal();
957 CTSE_Handle::TSeq_feat_Handles far_feats = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, x_id);
958 if (far_feats.size() == 1) {
959 rval |= RepairXrefs(f, far_feats[0], tse);
960 }
961 }
962 }
963 return rval;
964 }
965
966
RepairXrefs(CSeq_entry_Handle seh)967 bool CCleanup::RepairXrefs(CSeq_entry_Handle seh)
968 {
969 bool rval = false;
970 const CTSE_Handle& tse = seh.GetTSE_Handle();
971
972 CFeat_CI fi(seh);
973 while (fi) {
974 rval |= RepairXrefs(*(fi->GetSeq_feat()), tse);
975 ++fi;
976 }
977 return rval;
978 }
979
980
981 //LCOV_EXCL_START
982 //not used by asn_cleanup but used by other applications
FindMatchingLocusGene(CSeq_feat & f,const CGene_ref & gene_xref,CBioseq_Handle bsh)983 bool CCleanup::FindMatchingLocusGene(CSeq_feat& f, const CGene_ref& gene_xref, CBioseq_Handle bsh)
984 {
985 bool match = false;
986 string locus1;
987 if (gene_xref.IsSetLocus())
988 locus1 = gene_xref.GetLocus();
989 for (CFeat_CI feat_ci(bsh, SAnnotSelector(CSeqFeatData::eSubtype_gene)); feat_ci; ++feat_ci)
990 {
991 string locus2;
992 if ( !f.Equals(*feat_ci->GetSeq_feat()) && feat_ci->GetSeq_feat()->IsSetData() && feat_ci->GetSeq_feat()->GetData().IsGene()
993 && feat_ci->GetSeq_feat()->GetData().GetGene().IsSetLocus())
994 {
995 locus2 = feat_ci->GetSeq_feat()->GetData().GetGene().GetLocus();
996 }
997 if (!locus1.empty() && !locus2.empty() && locus1 == locus2)
998 {
999 match = true;
1000 break;
1001 }
1002 }
1003 return match;
1004 }
1005
RemoveOrphanLocusGeneXrefs(CSeq_feat & f,CBioseq_Handle bsh)1006 bool CCleanup::RemoveOrphanLocusGeneXrefs(CSeq_feat& f, CBioseq_Handle bsh)
1007 {
1008 if (!f.IsSetXref()) {
1009 return false;
1010 }
1011 bool any_removed = false;
1012 CSeq_feat::TXref::iterator xit = f.SetXref().begin();
1013 while (xit != f.SetXref().end()) {
1014 if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
1015 !(*xit)->GetData().GetGene().IsSuppressed() && !FindMatchingLocusGene(f, (*xit)->GetData().GetGene(), bsh)) {
1016 xit = f.SetXref().erase(xit);
1017 any_removed = true;
1018 } else {
1019 ++xit;
1020 }
1021 }
1022 if (any_removed) {
1023 if (f.IsSetXref() && f.GetXref().empty()) {
1024 f.ResetXref();
1025 }
1026 }
1027 return any_removed;
1028 }
1029
1030
FindMatchingLocus_tagGene(CSeq_feat & f,const CGene_ref & gene_xref,CBioseq_Handle bsh)1031 bool CCleanup::FindMatchingLocus_tagGene(CSeq_feat& f, const CGene_ref& gene_xref, CBioseq_Handle bsh)
1032 {
1033 bool match = false;
1034 string locus_tag1;
1035 if (gene_xref.IsSetLocus_tag())
1036 locus_tag1 = gene_xref.GetLocus_tag();
1037 for (CFeat_CI feat_ci(bsh, SAnnotSelector(CSeqFeatData::eSubtype_gene)); feat_ci; ++feat_ci)
1038 {
1039 string locus_tag2;
1040 if ( !f.Equals(*feat_ci->GetSeq_feat()) && feat_ci->GetSeq_feat()->IsSetData() && feat_ci->GetSeq_feat()->GetData().IsGene()
1041 && feat_ci->GetSeq_feat()->GetData().GetGene().IsSetLocus_tag())
1042 {
1043 locus_tag2 = feat_ci->GetSeq_feat()->GetData().GetGene().GetLocus_tag();
1044 }
1045 if (!locus_tag1.empty() && !locus_tag2.empty() && locus_tag1 == locus_tag2)
1046 {
1047 match = true;
1048 break;
1049 }
1050 }
1051 return match;
1052 }
1053
RemoveOrphanLocus_tagGeneXrefs(CSeq_feat & f,CBioseq_Handle bsh)1054 bool CCleanup::RemoveOrphanLocus_tagGeneXrefs(CSeq_feat& f, CBioseq_Handle bsh)
1055 {
1056 if (!f.IsSetXref()) {
1057 return false;
1058 }
1059 bool any_removed = false;
1060 CSeq_feat::TXref::iterator xit = f.SetXref().begin();
1061 while (xit != f.SetXref().end()) {
1062 if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
1063 !(*xit)->GetData().GetGene().IsSuppressed() && !FindMatchingLocus_tagGene(f, (*xit)->GetData().GetGene(), bsh)) {
1064 xit = f.SetXref().erase(xit);
1065 any_removed = true;
1066 } else {
1067 ++xit;
1068 }
1069 }
1070 if (any_removed) {
1071 if (f.IsSetXref() && f.GetXref().empty()) {
1072 f.ResetXref();
1073 }
1074 }
1075 return any_removed;
1076 }
1077
1078
SeqLocExtend(CSeq_loc & loc,size_t pos_,CScope & scope)1079 bool CCleanup::SeqLocExtend(CSeq_loc& loc, size_t pos_, CScope& scope)
1080 {
1081 TSeqPos pos = static_cast<TSeqPos>(pos_);
1082 TSeqPos loc_start = loc.GetStart(eExtreme_Positional);
1083 TSeqPos loc_stop = loc.GetStop(eExtreme_Positional);
1084 bool partial_start = loc.IsPartialStart(eExtreme_Positional);
1085 bool partial_stop = loc.IsPartialStop(eExtreme_Positional);
1086 ENa_strand strand = loc.GetStrand();
1087 CRef<CSeq_loc> new_loc(NULL);
1088 bool changed = false;
1089
1090 if (pos < loc_start) {
1091 CRef<CSeq_id> id(new CSeq_id());
1092 id->Assign(*(loc.GetId()));
1093 CRef<CSeq_loc> add(new CSeq_loc(*id, pos, loc_start - 1, strand));
1094 add->SetPartialStart(partial_start, eExtreme_Positional);
1095 new_loc = sequence::Seq_loc_Add(loc, *add, CSeq_loc::fSort | CSeq_loc::fMerge_AbuttingOnly, &scope);
1096 changed = true;
1097 } else if (pos > loc_stop) {
1098 CRef<CSeq_id> id(new CSeq_id());
1099 id->Assign(*(loc.GetId()));
1100 CRef<CSeq_loc> add(new CSeq_loc(*id, loc_stop + 1, pos, strand));
1101 add->SetPartialStop(partial_stop, eExtreme_Positional);
1102 new_loc = sequence::Seq_loc_Add(loc, *add, CSeq_loc::fSort | CSeq_loc::fMerge_AbuttingOnly, &scope);
1103 changed = true;
1104 }
1105 if (changed) {
1106 loc.Assign(*new_loc);
1107 }
1108 return changed;
1109 }
1110 //LCOV_EXCL_STOP
1111
1112
ExtendStopPosition(CSeq_feat & f,const CSeq_feat * cdregion,size_t extension_)1113 bool CCleanup::ExtendStopPosition(CSeq_feat& f, const CSeq_feat* cdregion, size_t extension_)
1114 {
1115 TSeqPos extension = static_cast<TSeqPos>(extension_);
1116 CRef<CSeq_loc> new_loc(&f.SetLocation());
1117
1118 CRef<CSeq_loc> last_interval;
1119 if (new_loc->IsMix()) {
1120 last_interval = new_loc->SetMix().SetLastLoc();
1121 }
1122 else
1123 {
1124 last_interval = new_loc;
1125 }
1126
1127 CConstRef<CSeq_id> id(last_interval->GetId());
1128
1129 TSeqPos new_start;
1130 TSeqPos new_stop;
1131
1132 // the last element of the mix or the single location MUST be converted into interval
1133 // whethe it's whole or point, etc
1134 if (last_interval->IsSetStrand() && last_interval->GetStrand() == eNa_strand_minus) {
1135 new_start = (cdregion ? cdregion->GetLocation().GetStart(eExtreme_Positional) :
1136 last_interval->GetStart(eExtreme_Positional)) - extension;
1137
1138 new_stop = last_interval->GetStop(eExtreme_Positional);
1139 }
1140 else {
1141 new_start = last_interval->GetStart(eExtreme_Positional);
1142 new_stop = (cdregion ? cdregion->GetLocation().GetStop(eExtreme_Positional) :
1143 last_interval->GetStop(eExtreme_Positional)) + extension;
1144 }
1145 last_interval->SetInt().SetFrom(new_start);
1146 last_interval->SetInt().SetTo(new_stop);
1147 last_interval->SetInt().SetId().Assign(*id);
1148
1149 new_loc->SetPartialStop(false, eExtreme_Biological);
1150
1151 return true;
1152 }
1153
ExtendToStopCodon(CSeq_feat & f,CBioseq_Handle bsh,size_t limit)1154 bool CCleanup::ExtendToStopCodon(CSeq_feat& f, CBioseq_Handle bsh, size_t limit)
1155 {
1156 const CSeq_loc& loc = f.GetLocation();
1157
1158 CCdregion::TFrame frame = CCdregion::eFrame_not_set;
1159 const CGenetic_code* code = NULL;
1160 // we need to extract frame and cd_region from linked cd_region
1161 if (f.IsSetData() && f.GetData().IsCdregion())
1162 {
1163 if (f.GetData().GetCdregion().IsSetCode())
1164 code = &(f.GetData().GetCdregion().GetCode());
1165 if (f.GetData().GetCdregion().IsSetFrame())
1166 frame = f.GetData().GetCdregion().GetFrame();
1167 }
1168
1169 TSeqPos stop = loc.GetStop(eExtreme_Biological);
1170 if (stop < 1 || stop > bsh.GetBioseqLength() - 1) {
1171 // no room to extend
1172 return false;
1173 }
1174 // figure out if we have a partial codon at the end
1175 size_t orig_len = sequence::GetLength(loc, &(bsh.GetScope()));
1176 size_t len = orig_len;
1177
1178 if (frame == CCdregion::eFrame_two) {
1179 len -= 1;
1180 } else if (frame == CCdregion::eFrame_three) {
1181 len -= 2;
1182 }
1183
1184 TSeqPos mod = len % 3;
1185 CRef<CSeq_loc> vector_loc(new CSeq_loc());
1186 vector_loc->SetInt().SetId().Assign(*(bsh.GetId().front().GetSeqId()));
1187
1188 if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
1189 vector_loc->SetInt().SetFrom(0);
1190 vector_loc->SetInt().SetTo(stop + mod - 1);
1191 vector_loc->SetStrand(eNa_strand_minus);
1192 } else {
1193 vector_loc->SetInt().SetFrom(stop - mod + 1);
1194 vector_loc->SetInt().SetTo(bsh.GetInst_Length() - 1);
1195 }
1196
1197 CSeqVector seq(*vector_loc, bsh.GetScope(), CBioseq_Handle::eCoding_Iupac);
1198 // reserve our space
1199 size_t usable_size = seq.size();
1200
1201 if (limit > 0 && usable_size > limit) {
1202 usable_size = limit;
1203 }
1204
1205 // get appropriate translation table
1206 const CTrans_table & tbl =
1207 (code ? CGen_code_table::GetTransTable(*code) :
1208 CGen_code_table::GetTransTable(1));
1209
1210 // main loop through bases
1211 CSeqVector::const_iterator start = seq.begin();
1212
1213 size_t i;
1214 size_t k;
1215 int state = 0;
1216 size_t length = usable_size / 3;
1217
1218 for (i = 0; i < length; ++i) {
1219 // loop through one codon at a time
1220 for (k = 0; k < 3; ++k, ++start) {
1221 state = tbl.NextCodonState(state, *start);
1222 }
1223
1224 if (tbl.GetCodonResidue(state) == '*') {
1225 TSeqPos extension = static_cast<TSeqPos>(((i + 1) * 3) - mod);
1226 ExtendStopPosition(f, 0, extension);
1227 return true;
1228 }
1229 }
1230
1231 return false;
1232 }
1233
1234
SetBestFrame(CSeq_feat & cds,CScope & scope)1235 bool CCleanup::SetBestFrame(CSeq_feat& cds, CScope& scope)
1236 {
1237 bool changed = false;
1238 CCdregion::TFrame frame = CCdregion::eFrame_not_set;
1239 if (cds.GetData().GetCdregion().IsSetFrame()) {
1240 frame = cds.GetData().GetCdregion().GetFrame();
1241 }
1242
1243 CCdregion::TFrame new_frame = CSeqTranslator::FindBestFrame(cds, scope);
1244 if (frame != new_frame) {
1245 cds.SetData().SetCdregion().SetFrame(new_frame);
1246 changed = true;
1247 }
1248 return changed;
1249 }
1250
1251 // like C's function GetFrameFromLoc, but better
SetFrameFromLoc(CCdregion::EFrame & frame,const CSeq_loc & loc,CScope & scope)1252 bool CCleanup::SetFrameFromLoc(CCdregion::EFrame &frame, const CSeq_loc& loc, CScope& scope)
1253 {
1254 if (!loc.IsPartialStart(eExtreme_Biological)) {
1255 if (frame != CCdregion::eFrame_one) {
1256 frame = CCdregion::eFrame_one;
1257 return true;
1258 }
1259 return false;
1260 }
1261 if (loc.IsPartialStop(eExtreme_Biological)) {
1262 // cannot make a determination if both ends are partial
1263 return false;
1264 }
1265
1266 const TSeqPos seq_len = sequence::GetLength(loc, &scope);
1267
1268 CCdregion::EFrame desired_frame = CCdregion::eFrame_not_set;
1269
1270 // have complete last codon, get frame from length
1271 switch( (seq_len % 3) + 1 ) {
1272 case 1:
1273 desired_frame = CCdregion::eFrame_one;
1274 break;
1275 case 2:
1276 desired_frame = CCdregion::eFrame_two;
1277 break;
1278 case 3:
1279 desired_frame = CCdregion::eFrame_three;
1280 break;
1281 default:
1282 // mathematically impossible
1283 _ASSERT(false);
1284 return false;
1285 }
1286 if (frame != desired_frame) {
1287 frame = desired_frame;
1288 return true;
1289 }
1290 return false;
1291 }
1292
1293
SetFrameFromLoc(CCdregion & cdregion,const CSeq_loc & loc,CScope & scope)1294 bool CCleanup::SetFrameFromLoc(CCdregion &cdregion, const CSeq_loc& loc, CScope& scope)
1295 {
1296 CCdregion::EFrame frame = CCdregion::eFrame_not_set;
1297 if (cdregion.IsSetFrame()) {
1298 frame = cdregion.GetFrame();
1299 }
1300 if (SetFrameFromLoc(frame, loc, scope)) {
1301 cdregion.SetFrame(frame);
1302 return true;
1303 } else {
1304 return false;
1305 }
1306 }
1307
1308
s_IsLocationEndAtOtherLocationInternalEndpoint(const CSeq_loc & loc,const CSeq_loc & other_loc)1309 bool s_IsLocationEndAtOtherLocationInternalEndpoint(const CSeq_loc& loc, const CSeq_loc& other_loc)
1310 {
1311 size_t loc_end = loc.GetStop(eExtreme_Biological);
1312 CSeq_loc_CI other_int(other_loc);
1313 while (other_int) {
1314 if (other_int.IsSetStrand() &&
1315 other_int.GetStrand() == eNa_strand_minus) {
1316 if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus &&
1317 loc_end == other_int.GetRange().GetFrom()) {
1318 return true;
1319 }
1320 } else {
1321 if ((!loc.IsSetStrand() || loc.GetStrand() != eNa_strand_minus) &&
1322 loc_end == other_int.GetRange().GetTo()) {
1323 return true;
1324 }
1325 }
1326 ++other_int;
1327 }
1328 return false;
1329 }
1330
1331
ExtendToStopIfShortAndNotPartial(CSeq_feat & f,CBioseq_Handle bsh,bool check_for_stop)1332 bool CCleanup::ExtendToStopIfShortAndNotPartial(CSeq_feat& f, CBioseq_Handle bsh, bool check_for_stop)
1333 {
1334 if (!f.GetData().IsCdregion()) {
1335 // not coding region
1336 return false;
1337 }
1338 if (sequence::IsPseudo(f, bsh.GetScope())) {
1339 return false;
1340 }
1341 if (f.GetLocation().IsPartialStop(eExtreme_Biological)) {
1342 return false;
1343 }
1344 CConstRef<CSeq_feat> mrna = sequence::GetmRNAforCDS(f, bsh.GetScope());
1345 if (mrna) {
1346 if (mrna->GetLocation().GetStop(eExtreme_Biological) == f.GetLocation().GetStop(eExtreme_Biological)) {
1347 //ok
1348 } else if (s_IsLocationEndAtOtherLocationInternalEndpoint(f.GetLocation(), mrna->GetLocation())) {
1349 return false;
1350 }
1351 }
1352
1353 if (check_for_stop) {
1354 string translation;
1355 try {
1356 CSeqTranslator::Translate(f, bsh.GetScope(), translation, true);
1357 } catch (CSeqMapException&) {
1358 //unable to translate
1359 return false;
1360 } catch (CSeqVectorException&) {
1361 //unable to translate
1362 return false;
1363 }
1364 if (NStr::EndsWith(translation, "*")) {
1365 //already has stop codon
1366 return false;
1367 }
1368 }
1369
1370 return ExtendToStopCodon(f, bsh, 3);
1371 }
1372
1373
LocationMayBeExtendedToMatch(const CSeq_loc & orig,const CSeq_loc & improved)1374 bool CCleanup::LocationMayBeExtendedToMatch(const CSeq_loc& orig, const CSeq_loc& improved)
1375 {
1376 if ((orig.GetStrand() == eNa_strand_minus &&
1377 orig.GetStop(eExtreme_Biological) > improved.GetStop(eExtreme_Biological)) ||
1378 (orig.GetStrand() != eNa_strand_minus &&
1379 orig.GetStop(eExtreme_Biological) < improved.GetStop(eExtreme_Biological))) {
1380 return true;
1381 }
1382
1383 return false;
1384 }
1385
SetProteinName(CProt_ref & prot_ref,const string & protein_name,bool append)1386 void CCleanup::SetProteinName(CProt_ref& prot_ref, const string& protein_name, bool append)
1387 {
1388 if (append && prot_ref.IsSetName() && prot_ref.GetName().size() > 0) {
1389 if (!NStr::IsBlank(prot_ref.GetName().front())) {
1390 prot_ref.SetName().front() += "; ";
1391 }
1392 prot_ref.SetName().front() += protein_name;
1393 } else {
1394 prot_ref.SetName().push_back(protein_name);
1395 }
1396 }
1397
1398
SetMrnaName(CSeq_feat & mrna,const string & protein_name)1399 void CCleanup::SetMrnaName(CSeq_feat& mrna, const string& protein_name)
1400 {
1401 bool used_qual = false;
1402 if (mrna.IsSetQual()) {
1403 for (auto it = mrna.SetQual().begin(); it != mrna.SetQual().end(); it++) {
1404 if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1405 (*it)->SetVal(protein_name);
1406 used_qual = true;
1407 break;
1408 }
1409 }
1410 }
1411 if (!used_qual || (mrna.IsSetData() && mrna.GetData().IsRna() && mrna.GetData().GetRna().IsSetExt())) {
1412 string remainder;
1413 mrna.SetData().SetRna().SetRnaProductName(protein_name, remainder);
1414 }
1415 }
1416
1417
1418 //LCOV_EXCL_START
1419 //seems to be unused
s_IsProductOnFeat(const CSeq_feat & cds)1420 bool CCleanup::s_IsProductOnFeat(const CSeq_feat& cds)
1421 {
1422 if (cds.IsSetXref()) {
1423 for (auto it = cds.GetXref().begin(); it != cds.GetXref().end(); it++) {
1424 if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1425 return true;
1426 }
1427 }
1428 }
1429 if (cds.IsSetQual()) {
1430 for (auto it = cds.GetQual().begin(); it != cds.GetQual().end(); it++) {
1431 if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1432 return true;
1433 }
1434 }
1435 }
1436 return false;
1437 }
1438 //LCOV_EXCL_STOP
1439
1440
s_SetProductOnFeat(CSeq_feat & feat,const string & protein_name,bool append)1441 void CCleanup::s_SetProductOnFeat(CSeq_feat& feat, const string& protein_name, bool append)
1442 {
1443 if (feat.IsSetXref()) {
1444 // see if this seq-feat already has a prot xref
1445 for (auto it = feat.SetXref().begin(); it != feat.SetXref().end(); it++) {
1446 if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1447 SetProteinName((*it)->SetData().SetProt(), protein_name, append);
1448 break;
1449 }
1450 }
1451 }
1452 if (feat.IsSetQual()) {
1453 for (auto it = feat.SetQual().begin(); it != feat.SetQual().end(); it++) {
1454 if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1455 if ((*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal()) && append) {
1456 (*it)->SetVal((*it)->GetVal() + "; " + protein_name);
1457 } else {
1458 (*it)->SetVal(protein_name);
1459 }
1460 }
1461 }
1462 }
1463 }
1464
1465
SetProteinName(CSeq_feat & cds,const string & protein_name,bool append,CScope & scope)1466 void CCleanup::SetProteinName(CSeq_feat& cds, const string& protein_name, bool append, CScope& scope)
1467 {
1468 s_SetProductOnFeat(cds, protein_name, append);
1469 bool added = false;
1470 if (cds.IsSetProduct()) {
1471 CBioseq_Handle prot = scope.GetBioseqHandle(cds.GetProduct());
1472 if (prot) {
1473 // find main protein feature
1474 CFeat_CI feat_ci(prot, CSeqFeatData::eSubtype_prot);
1475 if (feat_ci) {
1476 CRef<CSeq_feat> new_prot(new CSeq_feat());
1477 new_prot->Assign(feat_ci->GetOriginalFeature());
1478 SetProteinName(new_prot->SetData().SetProt(), protein_name, append);
1479 CSeq_feat_EditHandle feh(feat_ci->GetSeq_feat_Handle());
1480 feh.Replace(*new_prot);
1481 } else {
1482 // make new protein feature
1483 feature::AddProteinFeature(*(prot.GetCompleteBioseq()), protein_name, cds, scope);
1484 }
1485 added = true;
1486 }
1487 }
1488 if (!added) {
1489 if (cds.IsSetXref()) {
1490 // see if this seq-feat already has a prot xref
1491 NON_CONST_ITERATE(CSeq_feat::TXref, it, cds.SetXref()) {
1492 if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1493 SetProteinName((*it)->SetData().SetProt(), protein_name, append);
1494 added = true;
1495 break;
1496 }
1497 }
1498 }
1499 if (!added) {
1500 CRef<CSeqFeatXref> xref(new CSeqFeatXref());
1501 xref->SetData().SetProt().SetName().push_back(protein_name);
1502 cds.SetXref().push_back(xref);
1503 }
1504 }
1505 }
1506
1507
GetProteinName(const CProt_ref & prot)1508 const string& CCleanup::GetProteinName(const CProt_ref& prot)
1509 {
1510 if (prot.IsSetName() && !prot.GetName().empty()) {
1511 return prot.GetName().front();
1512 } else {
1513 return kEmptyStr;
1514 }
1515 }
1516
1517
GetProteinName(const CSeq_feat & cds,CScope & scope)1518 const string& CCleanup::GetProteinName(const CSeq_feat& cds, CScope& scope)
1519 {
1520 if (cds.IsSetProduct()) {
1521 CBioseq_Handle prot = scope.GetBioseqHandle(cds.GetProduct());
1522 if (prot) {
1523 CFeat_CI f(prot, CSeqFeatData::eSubtype_prot);
1524 if (f) {
1525 return GetProteinName(f->GetData().GetProt());
1526 }
1527 }
1528 }
1529 if (cds.IsSetXref()) {
1530 ITERATE(CSeq_feat::TXref, it, cds.GetXref()) {
1531 if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1532 return GetProteinName((*it)->GetData().GetProt());
1533 }
1534 }
1535 }
1536 if (cds.IsSetQual()) {
1537 for (auto it = cds.GetQual().begin(); it != cds.GetQual().end(); it++) {
1538 if ((*it)->IsSetQual() && (*it)->IsSetVal() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1539 return (*it)->GetVal();
1540 }
1541 }
1542 }
1543 return kEmptyStr;
1544 }
1545
1546
SetCDSPartialsByFrameAndTranslation(CSeq_feat & cds,CScope & scope)1547 bool CCleanup::SetCDSPartialsByFrameAndTranslation(CSeq_feat& cds, CScope& scope)
1548 {
1549 bool any_change = false;
1550
1551 if (!cds.GetLocation().IsPartialStart(eExtreme_Biological) &&
1552 cds.GetData().GetCdregion().IsSetFrame() &&
1553 cds.GetData().GetCdregion().GetFrame() != CCdregion::eFrame_not_set &&
1554 cds.GetData().GetCdregion().GetFrame() != CCdregion::eFrame_one) {
1555 cds.SetLocation().SetPartialStart(true, eExtreme_Biological);
1556 any_change = true;
1557 }
1558
1559 if (!cds.GetLocation().IsPartialStart(eExtreme_Biological) || !cds.GetLocation().IsPartialStop(eExtreme_Biological)) {
1560 // look for start and stop codon
1561 string transl_prot;
1562 try {
1563 CSeqTranslator::Translate(cds, scope, transl_prot,
1564 true, // include stop codons
1565 false); // do not remove trailing X/B/Z
1566
1567 } catch (const runtime_error&) {
1568 }
1569 if (!NStr::IsBlank(transl_prot)) {
1570 if (!cds.GetLocation().IsPartialStart(eExtreme_Biological) && !NStr::StartsWith(transl_prot, "M")) {
1571 cds.SetLocation().SetPartialStart(true, eExtreme_Biological);
1572 any_change = true;
1573 }
1574 if (!cds.GetLocation().IsPartialStop(eExtreme_Biological) && !NStr::EndsWith(transl_prot, "*")) {
1575 cds.SetLocation().SetPartialStop(true, eExtreme_Biological);
1576 any_change = true;
1577 }
1578 }
1579 }
1580
1581 any_change |= feature::AdjustFeaturePartialFlagForLocation(cds);
1582
1583 return any_change;
1584 }
1585
1586
ClearInternalPartials(CSeq_loc & loc,bool is_first,bool is_last)1587 bool CCleanup::ClearInternalPartials(CSeq_loc& loc, bool is_first, bool is_last)
1588 {
1589 bool rval = false;
1590 switch (loc.Which()) {
1591 case CSeq_loc::e_Mix:
1592 rval |= ClearInternalPartials(loc.SetMix(), is_first, is_last);
1593 break;
1594 case CSeq_loc::e_Packed_int:
1595 rval |= ClearInternalPartials(loc.SetPacked_int(), is_first, is_last);
1596 break;
1597 default:
1598 break;
1599 }
1600 return rval;
1601 }
1602
1603
ClearInternalPartials(CSeq_loc_mix & mix,bool is_first,bool is_last)1604 bool CCleanup::ClearInternalPartials(CSeq_loc_mix& mix, bool is_first, bool is_last)
1605 {
1606 bool rval = false;
1607 NON_CONST_ITERATE(CSeq_loc::TMix::Tdata, it, mix.Set()) {
1608 bool this_is_last = is_last && (*it == mix.Set().back());
1609 if ((*it)->IsMix() || (*it)->IsPacked_int()) {
1610 rval |= ClearInternalPartials(**it, is_first, this_is_last);
1611 } else {
1612 if (!is_first &&
1613 (*it)->IsPartialStart(eExtreme_Biological)) {
1614 (*it)->SetPartialStart(false, eExtreme_Biological);
1615 rval = true;
1616 }
1617 if (!this_is_last &&
1618 (*it)->IsPartialStop(eExtreme_Biological)) {
1619 (*it)->SetPartialStop(false, eExtreme_Biological);
1620 rval = true;
1621 }
1622 }
1623 is_first = false;
1624 }
1625 return rval;
1626 }
1627
1628
ClearInternalPartials(CPacked_seqint & pint,bool is_first,bool is_last)1629 bool CCleanup::ClearInternalPartials(CPacked_seqint& pint, bool is_first, bool is_last)
1630 {
1631 bool rval = false;
1632
1633 NON_CONST_ITERATE(CSeq_loc::TPacked_int::Tdata, it, pint.Set()) {
1634 bool this_is_last = is_last && (*it == pint.Set().back());
1635 if (!is_first && (*it)->IsPartialStart(eExtreme_Biological)) {
1636 (*it)->SetPartialStart(false, eExtreme_Biological);
1637 rval = true;
1638 }
1639 if (!this_is_last && (*it)->IsPartialStop(eExtreme_Biological)) {
1640 (*it)->SetPartialStop(false, eExtreme_Biological);
1641 rval = true;
1642 }
1643 is_first = false;
1644 }
1645 return rval;
1646 }
1647
1648
ClearInternalPartials(CSeq_entry_Handle seh)1649 bool CCleanup::ClearInternalPartials(CSeq_entry_Handle seh)
1650 {
1651 bool rval = false;
1652 CFeat_CI f(seh);
1653 while (f) {
1654 CRef<CSeq_feat> new_feat(new CSeq_feat());
1655 new_feat->Assign(*(f->GetSeq_feat()));
1656 if (ClearInternalPartials(new_feat->SetLocation())) {
1657 CSeq_feat_EditHandle eh(f->GetSeq_feat_Handle());
1658 eh.Replace(*new_feat);
1659 }
1660 ++f;
1661 }
1662
1663 return rval;
1664 }
1665
1666
SetFeaturePartial(CSeq_feat & f)1667 bool CCleanup::SetFeaturePartial(CSeq_feat& f)
1668 {
1669 if (!f.IsSetLocation()) {
1670 return false;
1671 }
1672 bool partial = false;
1673 CSeq_loc_CI li(f.GetLocation());
1674 while (li && !partial) {
1675 if (li.GetFuzzFrom() || li.GetFuzzTo()) {
1676 partial = true;
1677 break;
1678 }
1679 ++li;
1680 }
1681 bool changed = false;
1682 if (f.IsSetPartial() && f.GetPartial()) {
1683 if (!partial) {
1684 f.ResetPartial();
1685 changed = true;
1686 }
1687 } else {
1688 if (partial) {
1689 f.SetPartial(true);
1690 changed = true;
1691 }
1692 }
1693 return changed;
1694 }
1695
1696
UpdateECNumbers(CProt_ref::TEc & ec_num_list)1697 bool CCleanup::UpdateECNumbers(CProt_ref::TEc & ec_num_list)
1698 {
1699 bool changed = false;
1700 // CProt_ref::TEc is a list, so the iterator stays valid even if we
1701 // add new entries after the current one
1702 NON_CONST_ITERATE(CProt_ref::TEc, ec_num_iter, ec_num_list) {
1703 string & ec_num = *ec_num_iter;
1704 size_t tlen = ec_num.length();
1705 CleanVisStringJunk(ec_num);
1706 if (tlen != ec_num.length()) {
1707 changed = true;
1708 }
1709 if (CProt_ref::GetECNumberStatus(ec_num) == CProt_ref::eEC_replaced &&
1710 !CProt_ref::IsECNumberSplit(ec_num)) {
1711 string new_val = CProt_ref::GetECNumberReplacement(ec_num);
1712 if (!NStr::IsBlank(new_val)) {
1713 ec_num = new_val;
1714 changed = true;
1715 }
1716 }
1717
1718 }
1719 return changed;
1720 }
1721
1722
RemoveBadECNumbers(CProt_ref::TEc & ec_num_list)1723 bool CCleanup::RemoveBadECNumbers(CProt_ref::TEc & ec_num_list)
1724 {
1725 bool changed = false;
1726 CProt_ref::TEc::iterator ec_num_iter = ec_num_list.begin();
1727 while (ec_num_iter != ec_num_list.end()) {
1728 string & ec_num = *ec_num_iter;
1729 size_t tlen = ec_num.length();
1730 CleanVisStringJunk(ec_num);
1731 if (tlen != ec_num.length()) {
1732 changed = true;
1733 }
1734 CProt_ref::EECNumberStatus ec_status = CProt_ref::GetECNumberStatus(ec_num);
1735 if (ec_status == CProt_ref::eEC_deleted || ec_status == CProt_ref::eEC_unknown || CProt_ref::IsECNumberSplit(ec_num)) {
1736 ec_num_iter = ec_num_list.erase(ec_num_iter);
1737 changed = true;
1738 } else {
1739 ++ec_num_iter;
1740 }
1741
1742 }
1743 return changed;
1744 }
1745
1746
FixECNumbers(CSeq_entry_Handle entry)1747 bool CCleanup::FixECNumbers(CSeq_entry_Handle entry)
1748 {
1749 bool any_change = false;
1750 CFeat_CI f(entry, CSeqFeatData::e_Prot);
1751 while (f) {
1752 if (f->GetData().GetProt().IsSetEc()) {
1753 bool this_change = false;
1754 CRef<CSeq_feat> new_feat(new CSeq_feat());
1755 new_feat->Assign(*(f->GetSeq_feat()));
1756 this_change = UpdateECNumbers(new_feat->SetData().SetProt().SetEc());
1757 this_change |= RemoveBadECNumbers(new_feat->SetData().SetProt().SetEc());
1758 if (new_feat->GetData().GetProt().GetEc().empty()) {
1759 new_feat->SetData().SetProt().ResetEc();
1760 this_change = true;
1761 }
1762 if (this_change) {
1763 CSeq_feat_EditHandle efh(*f);
1764 efh.Replace(*new_feat);
1765 }
1766 }
1767 ++f;
1768 }
1769 return any_change;
1770 }
1771
1772
SetGenePartialByLongestContainedFeature(CSeq_feat & gene,CScope & scope)1773 bool CCleanup::SetGenePartialByLongestContainedFeature(CSeq_feat& gene, CScope& scope)
1774 {
1775 CBioseq_Handle bh = scope.GetBioseqHandle(gene.GetLocation());
1776 if (!bh) {
1777 return false;
1778 }
1779 CFeat_CI under(scope, gene.GetLocation());
1780 size_t longest = 0;
1781 CConstRef<CSeq_feat> longest_feat(NULL);
1782
1783 while (under) {
1784 // ignore genes
1785 if (under->GetData().IsGene()) {
1786
1787 } else {
1788 // must be contained in gene location
1789 sequence::ECompare loc_cmp = sequence::Compare(gene.GetLocation(), under->GetLocation(), &scope, sequence::fCompareOverlapping);
1790
1791 if (loc_cmp == sequence::eSame || loc_cmp == sequence::eContains) {
1792 size_t len = sequence::GetLength(under->GetLocation(), &scope);
1793 // if longer than longest, record new length and feature
1794 if (len > longest) {
1795 longest_feat.Reset(under->GetSeq_feat());
1796 }
1797 }
1798 }
1799
1800 ++under;
1801 }
1802 bool changed = false;
1803 if (longest_feat) {
1804 changed = feature::CopyFeaturePartials(gene, *longest_feat);
1805 }
1806 return changed;
1807 }
1808
1809
SetMolinfoTech(CBioseq_Handle bsh,CMolInfo::ETech tech)1810 bool CCleanup::SetMolinfoTech(CBioseq_Handle bsh, CMolInfo::ETech tech)
1811 {
1812 CSeqdesc_CI di(bsh, CSeqdesc::e_Molinfo);
1813 if (di) {
1814 if (di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetTech() == tech) {
1815 // no change necessary
1816 return false;
1817 } else {
1818 CSeqdesc* d = const_cast<CSeqdesc*>(&(*di));
1819 d->SetMolinfo().SetTech(tech);
1820 return true;
1821 }
1822 }
1823 CRef<CSeqdesc> m(new CSeqdesc());
1824 m->SetMolinfo().SetTech(tech);
1825 if (bsh.IsSetInst() && bsh.GetInst().IsSetMol() && bsh.IsAa()) {
1826 m->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1827 }
1828 CBioseq_EditHandle eh = bsh.GetEditHandle();
1829 eh.AddSeqdesc(*m);
1830 return true;
1831 }
1832
1833
1834 //LCOV_EXCL_START
1835 //does not appear to be used
SetMolinfoBiomol(CBioseq_Handle bsh,CMolInfo::EBiomol biomol)1836 bool CCleanup::SetMolinfoBiomol(CBioseq_Handle bsh, CMolInfo::EBiomol biomol)
1837 {
1838 CSeqdesc_CI di(bsh, CSeqdesc::e_Molinfo);
1839 if (di) {
1840 if (di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetBiomol() == biomol) {
1841 // no change necessary
1842 return false;
1843 } else {
1844 CSeqdesc* d = const_cast<CSeqdesc*>(&(*di));
1845 d->SetMolinfo().SetBiomol(biomol);
1846 return true;
1847 }
1848 }
1849 CRef<CSeqdesc> m(new CSeqdesc());
1850 m->SetMolinfo().SetBiomol(biomol);
1851 CBioseq_EditHandle eh = bsh.GetEditHandle();
1852 eh.AddSeqdesc(*m);
1853 return true;
1854 }
1855 //LCOV_EXCL_STOP
1856
1857
AddMissingMolInfo(CBioseq & seq,bool is_product)1858 bool CCleanup::AddMissingMolInfo(CBioseq& seq, bool is_product)
1859 {
1860 if (!seq.IsSetInst() || !seq.GetInst().IsSetMol()) {
1861 return false;
1862 }
1863 bool needs_molinfo = true;
1864
1865 if (seq.IsSetDescr()) {
1866 NON_CONST_ITERATE(CBioseq::TDescr::Tdata, it, seq.SetDescr().Set()) {
1867 if ((*it)->IsMolinfo()) {
1868 needs_molinfo = false;
1869 if (seq.IsAa() &&
1870 (!(*it)->GetMolinfo().IsSetBiomol() ||
1871 (*it)->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_unknown)) {
1872 (*it)->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1873 }
1874 }
1875 }
1876 }
1877 if (needs_molinfo) {
1878 if (seq.IsAa()) {
1879 CRef<CSeqdesc> m(new CSeqdesc());
1880 m->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1881 if (is_product) {
1882 m->SetMolinfo().SetTech(CMolInfo::eTech_concept_trans);
1883 }
1884 seq.SetDescr().Set().push_back(m);
1885 } else if (seq.GetInst().GetMol() == CSeq_inst::eMol_rna && is_product) {
1886 CRef<CSeqdesc> m(new CSeqdesc());
1887 m->SetMolinfo().SetBiomol(CMolInfo::eBiomol_mRNA);
1888 m->SetMolinfo().SetTech(CMolInfo::eTech_standard);
1889 seq.SetDescr().Set().push_back(m);
1890 } else {
1891 needs_molinfo = false;
1892 }
1893 }
1894
1895 return needs_molinfo;
1896 }
1897
1898
AddProteinTitle(CBioseq_Handle bsh)1899 bool CCleanup::AddProteinTitle(CBioseq_Handle bsh)
1900 {
1901 if (!bsh.IsSetInst() || !bsh.GetInst().IsSetMol() || !bsh.IsAa()) {
1902 return false;
1903 }
1904 if (bsh.IsSetId()) {
1905 ITERATE(CBioseq_Handle::TId, it, bsh.GetId()) {
1906 // do not add titles for sequences with certain IDs
1907 switch (it->Which()) {
1908 case CSeq_id::e_Pir:
1909 case CSeq_id::e_Swissprot:
1910 case CSeq_id::e_Patent:
1911 case CSeq_id::e_Prf:
1912 case CSeq_id::e_Pdb:
1913 return false;
1914 break;
1915 default:
1916 break;
1917 }
1918 }
1919 }
1920
1921 string new_defline = sequence::CDeflineGenerator().GenerateDefline(bsh, sequence::CDeflineGenerator::fIgnoreExisting);
1922
1923 CAutoAddDesc title_desc(bsh.GetEditHandle().SetDescr(), CSeqdesc::e_Title);
1924
1925 bool modified = title_desc.Set().SetTitle() != new_defline; // get or create a title
1926 if (modified)
1927 title_desc.Set().SetTitle().swap(new_defline);
1928 return modified;
1929 }
1930
1931
RemoveNcbiCleanupObject(CSeq_entry & seq_entry)1932 bool CCleanup::RemoveNcbiCleanupObject(CSeq_entry &seq_entry)
1933 {
1934 bool rval = false;
1935 if (seq_entry.IsSetDescr()) {
1936 CBioseq::TDescr::Tdata::iterator it = seq_entry.SetDescr().Set().begin();
1937 while (it != seq_entry.SetDescr().Set().end()) {
1938 if ((*it)->IsUser() && (*it)->GetUser().GetObjectType() == CUser_object::eObjectType_Cleanup){
1939 it = seq_entry.SetDescr().Set().erase(it);
1940 rval = true;
1941 }
1942 else {
1943 ++it;
1944 }
1945 }
1946 if (seq_entry.SetDescr().Set().empty()) {
1947 if (seq_entry.IsSeq()) {
1948 seq_entry.SetSeq().ResetDescr();
1949 }
1950 else if (seq_entry.IsSet()) {
1951 seq_entry.SetSet().ResetDescr();
1952 }
1953 }
1954 }
1955 if (seq_entry.IsSet() && seq_entry.GetSet().IsSetSeq_set()) {
1956 NON_CONST_ITERATE(CBioseq_set::TSeq_set, it, seq_entry.SetSet().SetSeq_set()) {
1957 rval |= RemoveNcbiCleanupObject(**it);
1958 }
1959 }
1960 return rval;
1961 }
1962
1963
1964 //LCOV_EXCL_START
1965 //not used by asn_cleanup but used by functions used by other applications
GetSourceDescriptors(const CSeq_entry & se,vector<const CSeqdesc * > & src_descs)1966 void GetSourceDescriptors(const CSeq_entry& se, vector<const CSeqdesc* >& src_descs)
1967 {
1968 if (se.IsSetDescr()) {
1969 ITERATE(CBioseq::TDescr::Tdata, it, se.GetDescr().Get()) {
1970 if ((*it)->IsSource() && (*it)->GetSource().IsSetOrg()) {
1971 src_descs.push_back(*it);
1972 }
1973 }
1974 }
1975
1976 if (se.IsSet() && se.GetSet().IsSetSeq_set()) {
1977 ITERATE(CBioseq_set::TSeq_set, it, se.GetSet().GetSeq_set()) {
1978 GetSourceDescriptors(**it, src_descs);
1979 }
1980 }
1981 }
1982 //LCOV_EXCL_STOP
1983
1984
1985 //LCOV_EXCL_START
1986 //not used by asn_cleanup
TaxonomyLookup(CSeq_entry_Handle seh)1987 bool CCleanup::TaxonomyLookup(CSeq_entry_Handle seh)
1988 {
1989 bool any_changes = false;
1990
1991 vector<CRef<COrg_ref> > rq_list;
1992 vector<const CSeqdesc* > src_descs;
1993 vector<CConstRef<CSeq_feat> > src_feats;
1994
1995 GetSourceDescriptors(*(seh.GetCompleteSeq_entry()), src_descs);
1996 vector<const CSeqdesc* >::iterator desc_it = src_descs.begin();
1997 while (desc_it != src_descs.end()) {
1998 // add org ref for descriptor to request list
1999 CRef<COrg_ref> org(new COrg_ref());
2000 org->Assign((*desc_it)->GetSource().GetOrg());
2001 rq_list.push_back(org);
2002
2003 ++desc_it;
2004 }
2005
2006 CFeat_CI feat(seh, SAnnotSelector(CSeqFeatData::e_Biosrc));
2007 while (feat) {
2008 if (feat->GetData().GetBiosrc().IsSetOrg()) {
2009 // add org ref for feature to request list
2010 CRef<COrg_ref> org(new COrg_ref());
2011 org->Assign(feat->GetData().GetBiosrc().GetOrg());
2012 rq_list.push_back(org);
2013 // add feature to list
2014 src_feats.push_back(feat->GetOriginalSeq_feat());
2015 }
2016 ++feat;
2017 }
2018
2019 if (rq_list.size() > 0) {
2020 CTaxon3 taxon3;
2021 taxon3.Init();
2022 CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(rq_list);
2023 if (reply) {
2024 CTaxon3_reply::TReply::const_iterator reply_it = reply->GetReply().begin();
2025
2026 // process descriptor responses
2027 desc_it = src_descs.begin();
2028
2029 while (reply_it != reply->GetReply().end()
2030 && desc_it != src_descs.end()) {
2031 if ((*reply_it)->IsData() &&
2032 !(*desc_it)->GetSource().GetOrg().Equals((*reply_it)->GetData().GetOrg())) {
2033 any_changes = true;
2034 CSeqdesc* desc = const_cast<CSeqdesc*>(*desc_it);
2035 desc->SetSource().SetOrg().Assign((*reply_it)->GetData().GetOrg());
2036 desc->SetSource().SetOrg().CleanForGenBank();
2037 }
2038 ++reply_it;
2039 ++desc_it;
2040 }
2041
2042 // process feature responses
2043 vector<CConstRef<CSeq_feat> >::iterator feat_it = src_feats.begin();
2044 while (reply_it != reply->GetReply().end()
2045 && feat_it != src_feats.end()) {
2046 if ((*reply_it)->IsData() &&
2047 !(*feat_it)->GetData().GetBiosrc().GetOrg().Equals((*reply_it)->GetData().GetOrg())) {
2048 any_changes = true;
2049 CRef<CSeq_feat> new_feat(new CSeq_feat());
2050 new_feat->Assign(**feat_it);
2051 new_feat->SetData().SetBiosrc().SetOrg().Assign((*reply_it)->GetData().GetOrg());
2052 CSeq_feat_Handle fh = seh.GetScope().GetSeq_featHandle(**feat_it);
2053 CSeq_feat_EditHandle efh(fh);
2054 efh.Replace(*new_feat);
2055 }
2056 ++reply_it;
2057 ++feat_it;
2058 }
2059 }
2060 }
2061
2062 return any_changes;
2063 }
2064 //LCOV_EXCL_STOP
2065
2066
AddProtein(const CSeq_feat & cds,CScope & scope)2067 CRef<CSeq_entry> CCleanup::AddProtein(const CSeq_feat& cds, CScope& scope)
2068 {
2069 CBioseq_Handle cds_bsh = scope.GetBioseqHandle(cds.GetLocation());
2070 if (!cds_bsh) {
2071 return CRef<CSeq_entry>(NULL);
2072 }
2073 CSeq_entry_Handle seh = cds_bsh.GetSeq_entry_Handle();
2074 if (!seh) {
2075 return CRef<CSeq_entry>(NULL);
2076 }
2077
2078 CRef<CBioseq> new_product = CSeqTranslator::TranslateToProtein(cds, scope);
2079 if (new_product.Empty()) {
2080 return CRef<CSeq_entry>(NULL);
2081 }
2082
2083 CRef<CSeqdesc> molinfo(new CSeqdesc());
2084 molinfo->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
2085 molinfo->SetMolinfo().SetTech(CMolInfo::eTech_concept_trans);
2086 new_product->SetDescr().Set().push_back(molinfo);
2087
2088 if (cds.IsSetProduct()) {
2089 CRef<CSeq_id> prot_id(new CSeq_id());
2090 prot_id->Assign(*(cds.GetProduct().GetId()));
2091 new_product->SetId().push_back(prot_id);
2092 }
2093 CRef<CSeq_entry> prot_entry(new CSeq_entry());
2094 prot_entry->SetSeq(*new_product);
2095
2096 CSeq_entry_EditHandle eh = seh.GetEditHandle();
2097 if (!eh.IsSet()) {
2098 CBioseq_set_Handle nuc_parent = eh.GetParentBioseq_set();
2099 if (nuc_parent && nuc_parent.IsSetClass() && nuc_parent.GetClass() == objects::CBioseq_set::eClass_nuc_prot) {
2100 eh = nuc_parent.GetParentEntry().GetEditHandle();
2101 }
2102 }
2103 if (!eh.IsSet()) {
2104 eh.ConvertSeqToSet();
2105 // move all descriptors on nucleotide sequence except molinfo, title, and create-date to set
2106 eh.SetSet().SetClass(CBioseq_set::eClass_nuc_prot);
2107 CConstRef<CBioseq_set> set = eh.GetSet().GetCompleteBioseq_set();
2108 if (set && set->IsSetSeq_set()) {
2109 CConstRef<CSeq_entry> nuc = set->GetSeq_set().front();
2110 CSeq_entry_EditHandle neh = eh.GetScope().GetSeq_entryEditHandle(*nuc);
2111 CBioseq_set::TDescr::Tdata::const_iterator it = nuc->GetDescr().Get().begin();
2112 while (it != nuc->GetDescr().Get().end()) {
2113 if (!(*it)->IsMolinfo() && !(*it)->IsTitle() && !(*it)->IsCreate_date()) {
2114 CRef<CSeqdesc> copy(new CSeqdesc());
2115 copy->Assign(**it);
2116 eh.AddSeqdesc(*copy);
2117 neh.RemoveSeqdesc(**it);
2118 if (nuc->IsSetDescr()) {
2119 it = nuc->GetDescr().Get().begin();
2120 }
2121 else {
2122 break;
2123 }
2124 }
2125 else {
2126 ++it;
2127 }
2128 }
2129 }
2130 }
2131
2132 CSeq_entry_EditHandle added = eh.AttachEntry(*prot_entry);
2133 return prot_entry;
2134 }
2135
SetGeneticCodes(CBioseq_Handle bsh)2136 bool CCleanup::SetGeneticCodes(CBioseq_Handle bsh)
2137 {
2138 if (!bsh) {
2139 return false;
2140 }
2141 if (!bsh.IsNa()) {
2142 return false;
2143 }
2144
2145 CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
2146 if (!src) {
2147 // no source, don't fix
2148 return false;
2149 }
2150 const auto& bsrc = src->GetSource();
2151 if (!bsrc.IsSetOrg() || !bsrc.IsSetOrgname()) {
2152 return false;
2153 }
2154 const auto& orgname = bsrc.GetOrg().GetOrgname();
2155 if (!orgname.IsSetGcode() && !orgname.IsSetMgcode() && !orgname.IsSetPgcode()) {
2156 return false;
2157 }
2158 int bioseqGenCode = src->GetSource().GetGenCode();
2159
2160 bool any_changed = false;
2161 // set Cdregion's gcode from BioSource (unless except-text)
2162 SAnnotSelector sel(CSeqFeatData::e_Cdregion);
2163 CFeat_CI feat_ci(bsh, sel);
2164 for (; feat_ci; ++feat_ci) {
2165 const CSeq_feat& feat = feat_ci->GetOriginalFeature();
2166 const CCdregion& cds = feat.GetData().GetCdregion();
2167 int cdregionGenCode = (cds.IsSetCode() ?
2168 cds.GetCode().GetId() :
2169 0);
2170 if (cdregionGenCode != bioseqGenCode)
2171 {
2172 // make cdregion's gencode match bioseq's gencode,
2173 // if allowed
2174 if (!feat.HasExceptionText("genetic code exception"))
2175 {
2176 CRef<CSeq_feat> new_feat(new CSeq_feat);
2177 new_feat->Assign(feat);
2178 CCdregion& new_cds = new_feat->SetData().SetCdregion();
2179 new_cds.ResetCode();
2180 new_cds.SetCode().SetId(bioseqGenCode);
2181 CSeq_feat_EditHandle edit_handle(*feat_ci);
2182 edit_handle.Replace(*new_feat);
2183 any_changed = true;
2184 }
2185 }
2186 }
2187 return any_changed;
2188 }
2189
2190
2191 // return position of " [" + sOrganism + "]", but only if it's
2192 // at the end and there are characters before it.
2193 // Also, returns the position of the organelle prefix in the title.
s_TitleEndsInOrganism(const string & sTitle,const string & sOrganism,SIZE_TYPE & OrganellePos)2194 static SIZE_TYPE s_TitleEndsInOrganism(
2195 const string & sTitle,
2196 const string & sOrganism,
2197 SIZE_TYPE& OrganellePos)
2198 {
2199 OrganellePos = NPOS;
2200
2201 SIZE_TYPE answer = NPOS;
2202
2203 const string sPattern = " [" + sOrganism + "]";
2204 if (NStr::EndsWith(sTitle, sPattern, NStr::eNocase)) {
2205 answer = sTitle.length() - sPattern.length();
2206 if (answer < 1) {
2207 // title must have something before the pattern
2208 answer = NPOS;
2209 }
2210 } else {
2211 answer = NStr::Find(sTitle, sPattern, NStr::eNocase, NStr::eReverseSearch);
2212 if (answer < 1 || answer == NPOS) {
2213 // pattern not found
2214 answer = NPOS;
2215 }
2216 }
2217
2218 if (answer != NPOS) {
2219 // find organelle prefix
2220 for (unsigned int genome = CBioSource::eGenome_chloroplast;
2221 genome <= CBioSource::eGenome_chromatophore;
2222 genome++) {
2223 if (genome != CBioSource::eGenome_extrachrom &&
2224 genome != CBioSource::eGenome_transposon &&
2225 genome != CBioSource::eGenome_insertion_seq &&
2226 genome != CBioSource::eGenome_proviral &&
2227 genome != CBioSource::eGenome_virion &&
2228 genome != CBioSource::eGenome_chromosome)
2229 {
2230 string organelle = " (" + CBioSource::GetOrganelleByGenome(genome) + ")";
2231 SIZE_TYPE possible_organelle_start_pos = NStr::Find(sTitle, organelle, NStr::eNocase, NStr::eReverseSearch);
2232 if (possible_organelle_start_pos != NPOS &&
2233 NStr::EndsWith(CTempString(sTitle, 0, answer), organelle)) {
2234 OrganellePos = possible_organelle_start_pos;
2235 break;
2236 }
2237
2238 }
2239 }
2240 }
2241 return answer;
2242 }
2243
2244
s_TitleEndsInOrganism(const string & sTitle,const COrgName::TName & orgname,SIZE_TYPE & organelle_pos)2245 static SIZE_TYPE s_TitleEndsInOrganism(
2246 const string & sTitle,
2247 const COrgName::TName& orgname,
2248 SIZE_TYPE &organelle_pos)
2249 {
2250 SIZE_TYPE suffixPos = NPOS; // will point to " [${organism name}]" at end
2251 organelle_pos = NPOS;
2252
2253 if (orgname.IsBinomial() &&
2254 orgname.GetBinomial().IsSetGenus() &&
2255 !NStr::IsBlank(orgname.GetBinomial().GetGenus()) &&
2256 orgname.GetBinomial().IsSetSpecies() &&
2257 !NStr::IsBlank(orgname.GetBinomial().GetSpecies())) {
2258 string binomial = orgname.GetBinomial().GetGenus() + " " + orgname.GetBinomial().GetSpecies();
2259 suffixPos = s_TitleEndsInOrganism(sTitle, binomial, organelle_pos);
2260 }
2261 return suffixPos;
2262 }
2263
2264
IsCrossKingdom(const COrg_ref & org,string & first_kingdom,string & second_kingdom)2265 bool IsCrossKingdom(const COrg_ref& org, string& first_kingdom, string& second_kingdom)
2266 {
2267 bool is_cross_kingdom = false;
2268 first_kingdom = kEmptyStr;
2269 second_kingdom = kEmptyStr;
2270 if (org.IsSetOrgname() && org.GetOrgname().IsSetName() &&
2271 org.GetOrgname().GetName().IsPartial() &&
2272 org.GetOrgname().GetName().GetPartial().IsSet()) {
2273 ITERATE(CPartialOrgName::Tdata, it, org.GetOrgname().GetName().GetPartial().Get()) {
2274 const CTaxElement& te = **it;
2275 if (te.IsSetFixed_level() && te.GetFixed_level() == 0 &&
2276 te.IsSetLevel() &&
2277 NStr::EqualNocase(te.GetLevel(), "superkingdom") &&
2278 te.IsSetName() && !NStr::IsBlank(te.GetName())) {
2279 if (first_kingdom.empty()) {
2280 first_kingdom = te.GetName();
2281 } else if (!NStr::EqualNocase(first_kingdom, te.GetName())) {
2282 is_cross_kingdom = true;
2283 second_kingdom = te.GetName();
2284 break;
2285 }
2286 }
2287 }
2288 }
2289 return is_cross_kingdom;
2290 }
2291
2292
IsCrossKingdom(const COrg_ref & org)2293 bool IsCrossKingdom(const COrg_ref& org)
2294 {
2295 string first_kingdom, second_kingdom;
2296 return IsCrossKingdom(org, first_kingdom, second_kingdom);
2297 }
2298
2299
s_TitleEndsInOrganism(const string & sTitle,const COrg_ref & org,SIZE_TYPE & organelle_pos)2300 static SIZE_TYPE s_TitleEndsInOrganism(
2301 const string & sTitle,
2302 const COrg_ref& org,
2303 SIZE_TYPE &organelle_pos)
2304 {
2305 SIZE_TYPE suffixPos = NPOS; // will point to " [${organism name}]" at end
2306 organelle_pos = NPOS;
2307
2308 // first, check to see if protein title matches old-name
2309 if (org.IsSetOrgMod()) {
2310 ITERATE(COrgName::TMod, it, org.GetOrgname().GetMod()) {
2311 if ((*it)->IsSetSubtype() && (*it)->IsSetSubname() &&
2312 (*it)->GetSubtype() == COrgMod::eSubtype_old_name &&
2313 !NStr::IsBlank((*it)->GetSubname())) {
2314 suffixPos = s_TitleEndsInOrganism(sTitle, (*it)->GetSubname(), organelle_pos);
2315 if (suffixPos != NPOS) {
2316 return suffixPos;
2317 }
2318 }
2319 }
2320 }
2321
2322 // next, check to see if protein title matches taxname
2323 if (org.IsSetTaxname() && !NStr::IsBlank(org.GetTaxname())) {
2324 suffixPos = s_TitleEndsInOrganism(sTitle, org.GetTaxname(), organelle_pos);
2325 if (suffixPos != NPOS) {
2326 return suffixPos;
2327 }
2328 }
2329
2330 // try binomial if preset
2331 if (org.IsSetOrgname() && org.GetOrgname().IsSetName() &&
2332 org.GetOrgname().GetName().IsBinomial()) {
2333 suffixPos = s_TitleEndsInOrganism(sTitle, org.GetOrgname().GetName(), organelle_pos);
2334 if (suffixPos != NPOS) {
2335 return suffixPos;
2336 }
2337 }
2338
2339 // cross-kingdom?
2340 if (IsCrossKingdom(org)) {
2341 SIZE_TYPE sep = NStr::Find(sTitle, "][");
2342 if (sep != string::npos) {
2343 suffixPos = s_TitleEndsInOrganism(sTitle.substr(0, sep + 1), org.GetTaxname(), organelle_pos);
2344 }
2345 }
2346 return suffixPos;
2347 }
2348
2349
s_RemoveOrgFromEndOfProtein(CBioseq & seq,string taxname)2350 static void s_RemoveOrgFromEndOfProtein(CBioseq& seq, string taxname)
2351
2352 {
2353 if (taxname.empty()) return;
2354 SIZE_TYPE taxlen = taxname.length();
2355
2356 EDIT_EACH_SEQANNOT_ON_BIOSEQ(annot_it, seq) {
2357 CSeq_annot& annot = **annot_it;
2358 if (!annot.IsFtable()) continue;
2359 EDIT_EACH_FEATURE_ON_ANNOT(feat_it, annot) {
2360 CSeq_feat& feat = **feat_it;
2361 CSeqFeatData& data = feat.SetData();
2362 if (!data.IsProt()) continue;
2363 CProt_ref& prot_ref = data.SetProt();
2364 EDIT_EACH_NAME_ON_PROTREF(it, prot_ref) {
2365 string str = *it;
2366 if (str.empty()) continue;
2367 auto len = str.length();
2368 if (len < 5) continue;
2369 if (str[len - 1] != ']') continue;
2370 SIZE_TYPE cp = NStr::Find(str, "[", NStr::eCase, NStr::eReverseSearch);
2371 if (cp == NPOS) continue;
2372 string suffix = str.substr(cp + 1);
2373 if (NStr::StartsWith(suffix, "NAD")) continue;
2374 if (suffix.length() != taxlen + 1) continue;
2375 if (NStr::StartsWith(suffix, taxname)) {
2376 str.erase(cp);
2377 Asn2gnbkCompressSpaces(str);
2378 *it = str;
2379 }
2380 }
2381 }
2382 }
2383 }
2384
AddPartialToProteinTitle(CBioseq & bioseq)2385 bool CCleanup::AddPartialToProteinTitle(CBioseq &bioseq)
2386 {
2387 // Bail if not protein
2388 if (!bioseq.IsSetInst() || !bioseq.GetInst().IsSetMol() || !bioseq.GetInst().IsAa()) {
2389 return false;
2390 }
2391
2392 // Bail if record is swissprot
2393 FOR_EACH_SEQID_ON_BIOSEQ(seqid_itr, bioseq) {
2394 if ((*seqid_itr)->IsSwissprot()) {
2395 return false;
2396 }
2397 }
2398
2399 // gather some info from the Seqdesc's on the bioseq, into
2400 // the following variables
2401 bool bPartial = false;
2402 string organelle;
2403
2404 CConstRef<CSeqdesc> molinfo_desc(NULL);
2405 CConstRef<CSeqdesc> src_desc(NULL);
2406 FOR_EACH_SEQDESC_ON_BIOSEQ(descr_iter, bioseq) {
2407 if (!molinfo_desc && (*descr_iter)->IsMolinfo()) {
2408 molinfo_desc = *descr_iter;
2409 }
2410 if (!src_desc && (*descr_iter)->IsSource()) {
2411 src_desc = *descr_iter;
2412 }
2413 if (molinfo_desc && src_desc) {
2414 break;
2415 }
2416 }
2417 if (!molinfo_desc || !src_desc) {
2418 // climb up to get parent Seqdescs
2419 CConstRef<CBioseq_set> bioseq_set(bioseq.GetParentSet());
2420 for (; bioseq_set; bioseq_set = bioseq_set->GetParentSet()) {
2421 FOR_EACH_SEQDESC_ON_SEQSET(descr_iter, *bioseq_set) {
2422 if (!molinfo_desc && (*descr_iter)->IsMolinfo()) {
2423 molinfo_desc = *descr_iter;
2424 }
2425 if (!src_desc && (*descr_iter)->IsSource()) {
2426 src_desc = *descr_iter;
2427 }
2428 if (molinfo_desc && src_desc) {
2429 break;
2430 }
2431 }
2432 if (molinfo_desc && src_desc) {
2433 break;
2434 }
2435 }
2436 }
2437
2438 if (molinfo_desc && molinfo_desc->GetMolinfo().IsSetCompleteness()) {
2439 switch (molinfo_desc->GetMolinfo().GetCompleteness()) {
2440 case NCBI_COMPLETENESS(partial):
2441 case NCBI_COMPLETENESS(no_left):
2442 case NCBI_COMPLETENESS(no_right):
2443 case NCBI_COMPLETENESS(no_ends):
2444 bPartial = true;
2445 break;
2446 default:
2447 break;
2448 }
2449 }
2450
2451 CConstRef<COrg_ref> org(NULL);
2452 if (src_desc) {
2453 const TBIOSOURCE_GENOME genome = (src_desc->GetSource().IsSetGenome() ?
2454 src_desc->GetSource().GetGenome() : CBioSource::eGenome_unknown);
2455 if (genome >= CBioSource::eGenome_chloroplast &&
2456 genome <= CBioSource::eGenome_chromatophore &&
2457 genome != CBioSource::eGenome_extrachrom &&
2458 genome != CBioSource::eGenome_transposon &&
2459 genome != CBioSource::eGenome_insertion_seq &&
2460 genome != CBioSource::eGenome_proviral &&
2461 genome != CBioSource::eGenome_virion &&
2462 genome != CBioSource::eGenome_chromosome)
2463 {
2464 organelle = CBioSource::GetOrganelleByGenome(genome);
2465 }
2466
2467 if (src_desc->GetSource().IsSetOrg()) {
2468 org.Reset(&(src_desc->GetSource().GetOrg()));
2469 }
2470 }
2471
2472 if (!org) {
2473 return false;
2474 }
2475 if (org->IsSetTaxname() && !NStr::IsBlank(org->GetTaxname())) {
2476 s_RemoveOrgFromEndOfProtein(bioseq, org->GetTaxname());
2477 }
2478
2479 // find the title to edit
2480 if (!bioseq.IsSetDescr()) {
2481 return false;
2482 }
2483 CRef<CSeqdesc> title_desc(NULL);
2484 NON_CONST_ITERATE(CBioseq::TDescr::Tdata, d, bioseq.SetDescr().Set()) {
2485 if ((*d)->IsTitle()) {
2486 title_desc = *d;
2487 }
2488 }
2489 if (!title_desc) {
2490 return false;
2491 }
2492 string & sTitle = title_desc->SetTitle();
2493 // remember original so we can see if we changed it
2494 const string sOriginalTitle = sTitle;
2495
2496 // search for partial, must be just before bracketed organism
2497 SIZE_TYPE partialPos = NStr::Find(sTitle, ", partial [");
2498 if (partialPos == NPOS) {
2499 partialPos = NStr::Find(sTitle, ", partial (");
2500 }
2501
2502 // find oldname or taxname in brackets at end of protein title
2503 SIZE_TYPE penult = NPOS;
2504 SIZE_TYPE suffixPos = s_TitleEndsInOrganism(sTitle, *org, penult); // will point to " [${organism name}]" at end
2505 // do not change unless [genus species] was at the end
2506 if (suffixPos == NPOS) {
2507 return false;
2508 }
2509
2510 // truncate bracketed info from end of title, will replace with current taxname
2511 sTitle.resize(suffixPos);
2512 if (penult != NPOS) {
2513 sTitle.resize(penult);
2514 }
2515
2516 // if ", partial [" was indeed just before the [genus species], it will now be ", partial"
2517 // Note: 9 is length of ", partial"
2518 if (!bPartial &&
2519 partialPos != string::npos &&
2520 (partialPos == (sTitle.length() - 9)))
2521 {
2522 sTitle.resize(partialPos);
2523 }
2524 NStr::TruncateSpacesInPlace(sTitle);
2525
2526 //
2527 if (bPartial && partialPos == NPOS) {
2528 sTitle += ", partial";
2529 }
2530 if (!NStr::IsBlank(organelle)) {
2531 sTitle += " (" + string(organelle) + ")";
2532 }
2533 string first_kingdom, second_kingdom;
2534 if (IsCrossKingdom(*org, first_kingdom, second_kingdom)) {
2535 sTitle += " [" + first_kingdom + "][" + second_kingdom + "]";
2536 } else {
2537 sTitle += " [";
2538 if (org->IsSetTaxname()) {
2539 sTitle += org->GetTaxname();
2540 }
2541 sTitle += "]";
2542 }
2543
2544 if (sTitle != sOriginalTitle) {
2545 return true;
2546 } else {
2547 return false;
2548 }
2549 }
2550
RemovePseudoProduct(CSeq_feat & cds,CScope & scope)2551 bool CCleanup::RemovePseudoProduct(CSeq_feat& cds, CScope& scope)
2552 {
2553 if (!sequence::IsPseudo(cds, scope) ||
2554 !cds.IsSetData() || !cds.GetData().IsCdregion() ||
2555 !cds.IsSetProduct()) {
2556 return false;
2557 }
2558 CBioseq_Handle pseq = scope.GetBioseqHandle(cds.GetProduct());
2559 if (pseq) {
2560 CFeat_CI prot(pseq, CSeqFeatData::eSubtype_prot);
2561 if (prot) {
2562 string label;
2563 if (prot->GetData().GetProt().IsSetName() &&
2564 !prot->GetData().GetProt().GetName().empty()) {
2565 label = prot->GetData().GetProt().GetName().front();
2566 } else if (prot->GetData().GetProt().IsSetDesc()) {
2567 label = prot->GetData().GetProt().GetDesc();
2568 }
2569 if (!NStr::IsBlank(label)) {
2570 if (cds.IsSetComment() && !NStr::IsBlank(cds.GetComment())) {
2571 cds.SetComment(cds.GetComment() + "; " + label);
2572 } else {
2573 cds.SetComment(label);
2574 }
2575 }
2576 }
2577 CBioseq_EditHandle pseq_e(pseq);
2578 pseq_e.Remove();
2579 }
2580 cds.ResetProduct();
2581 return true;
2582 }
2583
2584
ExpandGeneToIncludeChildren(CSeq_feat & gene,CTSE_Handle & tse)2585 bool CCleanup::ExpandGeneToIncludeChildren(CSeq_feat& gene, CTSE_Handle& tse)
2586 {
2587 if (!gene.IsSetXref() || !gene.IsSetLocation() || !gene.GetLocation().IsInt()) {
2588 return false;
2589 }
2590 bool any_change = false;
2591 TSeqPos gene_start = gene.GetLocation().GetStart(eExtreme_Positional);
2592 TSeqPos gene_stop = gene.GetLocation().GetStop(eExtreme_Positional);
2593 ITERATE(CSeq_feat::TXref, xit, gene.GetXref()) {
2594 if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
2595 const CTSE_Handle::TFeatureId& feat_id = (*xit)->GetId().GetLocal();
2596 CTSE_Handle::TSeq_feat_Handles far_feats = tse.GetFeaturesWithId(CSeqFeatData::eSubtype_any, feat_id);
2597 ITERATE(CTSE_Handle::TSeq_feat_Handles, f, far_feats) {
2598 TSeqPos f_start = f->GetLocation().GetStart(eExtreme_Positional);
2599 TSeqPos f_stop = f->GetLocation().GetStop(eExtreme_Positional);
2600 if (f_start < gene_start) {
2601 gene.SetLocation().SetInt().SetFrom(f_start);
2602 gene_start = f_start;
2603 any_change = true;
2604 }
2605 if (f_stop > gene_stop) {
2606 gene.SetLocation().SetInt().SetTo(f_stop);
2607 gene_stop = f_stop;
2608 any_change = true;
2609 }
2610 }
2611 }
2612 }
2613 return any_change;
2614 }
2615
2616
2617 typedef pair<size_t, bool> TRNALength;
2618 typedef map<string, TRNALength > TRNALengthMap;
2619
2620 static const TRNALengthMap kTrnaLengthMap{
2621 { "16S", { 1000, false } },
2622 { "18S", { 1000, false } },
2623 { "23S", { 2000, false } },
2624 { "25S", { 1000, false } },
2625 { "26S", { 1000, false } },
2626 { "28S", { 3300, false } },
2627 { "small", { 1000, false } },
2628 { "large", { 1000, false } },
2629 { "5.8S", { 130, true } },
2630 { "5S", { 90, true } }
2631 // possible problem: if it matches /25S/ it would also match /5S/
2632 // luckily, if it fails the /5S/ rule it would fail the /25S/ rule
2633 };
2634
2635
s_CleanupIsShortrRNA(const CSeq_feat & f,CScope * scope)2636 static bool s_CleanupIsShortrRNA(const CSeq_feat& f, CScope* scope) // used in feature_tests.cpp
2637 {
2638 if (f.GetData().GetSubtype() != CSeqFeatData::eSubtype_rRNA) {
2639 return false;
2640 }
2641 bool is_bad = false;
2642 size_t len = sequence::GetLength(f.GetLocation(), scope);
2643 const CRNA_ref& rrna = f.GetData().GetRna();
2644 string rrna_name = rrna.GetRnaProductName();
2645 if (rrna_name.empty()) {
2646 // RNA name may still be in product GBQual
2647 if (f.IsSetQual()) {
2648 for (auto qit : f.GetQual()) {
2649 const CGb_qual& gbq = *qit;
2650 if ( gbq.IsSetQual() && gbq.GetQual() == "product" ) {
2651 rrna_name = gbq.GetVal();
2652 break;
2653 }
2654 }
2655 }
2656 }
2657 ITERATE (TRNALengthMap, it, kTrnaLengthMap) {
2658 SIZE_TYPE pos = NStr::FindNoCase(rrna_name, it->first);
2659 if (pos != string::npos && len < it->second.first && !(it->second.second && f.IsSetPartial() && f.GetPartial()) ) {
2660 is_bad = true;
2661 break;
2662 }
2663 }
2664 return is_bad;
2665 }
2666
WGSCleanup(CSeq_entry_Handle entry,bool instantiate_missing_proteins,Uint4 options,bool run_extended_cleanup)2667 bool CCleanup::WGSCleanup(CSeq_entry_Handle entry, bool instantiate_missing_proteins, Uint4 options, bool run_extended_cleanup)
2668 {
2669 bool any_changes = false;
2670
2671 int protein_id_counter = 1;
2672 bool create_general_only = objects::edit::IsGeneralIdProtPresent(entry.GetTopLevelEntry());
2673 SAnnotSelector sel(CSeqFeatData::e_Cdregion);
2674 for (CFeat_CI cds_it(entry, sel); cds_it; ++cds_it) {
2675 bool change_this_cds = false;
2676 CRef<CSeq_feat> new_cds(new CSeq_feat());
2677 new_cds->Assign(*(cds_it->GetSeq_feat()));
2678 if (sequence::IsPseudo(*(cds_it->GetSeq_feat()), entry.GetScope())) {
2679 change_this_cds = RemovePseudoProduct(*new_cds, entry.GetScope());
2680 } else {
2681 string current_name = GetProteinName(*new_cds, entry.GetScope());
2682
2683 change_this_cds |= SetBestFrame(*new_cds, entry.GetScope());
2684
2685 change_this_cds |= SetCDSPartialsByFrameAndTranslation(*new_cds, entry.GetScope());
2686
2687 // retranslate
2688 if (new_cds->IsSetProduct() && entry.GetScope().GetBioseqHandleFromTSE(*(new_cds->GetProduct().GetId()), entry)) {
2689 any_changes |= feature::RetranslateCDS(*new_cds, entry.GetScope());
2690 } else {
2691 // need to set product if not set
2692 if (!new_cds->IsSetProduct() && !sequence::IsPseudo(*new_cds, entry.GetScope())) {
2693 string id_label;
2694 CRef<CSeq_id> new_id = objects::edit::GetNewProtId(entry.GetScope().GetBioseqHandle(new_cds->GetLocation()), protein_id_counter, id_label, create_general_only);
2695 if (new_id) {
2696 new_cds->SetProduct().SetWhole().Assign(*new_id);
2697 change_this_cds = true;
2698 }
2699 }
2700 if (new_cds->IsSetProduct() && instantiate_missing_proteins) {
2701 CRef<CSeq_entry> prot = AddProtein(*new_cds, entry.GetScope());
2702 if (prot) {
2703 any_changes = true;
2704 }
2705 }
2706 any_changes |= feature::AdjustForCDSPartials(*new_cds, entry);
2707 }
2708 //prefer ncbieaa
2709 if (new_cds->IsSetProduct()) {
2710 CBioseq_Handle p = entry.GetScope().GetBioseqHandle(new_cds->GetProduct());
2711 if (p && p.IsSetInst() && p.GetInst().IsSetSeq_data() && p.GetInst().GetSeq_data().IsIupacaa()) {
2712 CBioseq_EditHandle peh(p);
2713 string current = p.GetInst().GetSeq_data().GetIupacaa().Get();
2714 CRef<CSeq_inst> new_inst(new CSeq_inst());
2715 new_inst->Assign(p.GetInst());
2716 new_inst->SetSeq_data().SetNcbieaa().Set(current);
2717 peh.SetInst(*new_inst);
2718 any_changes = true;
2719 }
2720 }
2721
2722 if (NStr::IsBlank(current_name)) {
2723 SetProteinName(*new_cds, "hypothetical protein", false, entry.GetScope());
2724 current_name = "hypothetical protein";
2725 change_this_cds = true;
2726 } else if (new_cds->IsSetProduct()) {
2727 CBioseq_Handle p = entry.GetScope().GetBioseqHandle(new_cds->GetProduct());
2728 if (p) {
2729 CFeat_CI feat_ci(p, CSeqFeatData::eSubtype_prot);
2730 if (!feat_ci) {
2731 // make new protein feature
2732 feature::AddProteinFeature(*(p.GetCompleteBioseq()), current_name, *new_cds, entry.GetScope());
2733 }
2734 }
2735 }
2736
2737 CConstRef<CSeq_feat> mrna = sequence::GetmRNAforCDS(*(cds_it->GetSeq_feat()), entry.GetScope());
2738 if (mrna) {
2739 bool change_mrna = false;
2740 CRef<CSeq_feat> new_mrna(new CSeq_feat());
2741 new_mrna->Assign(*mrna);
2742 // Make mRNA name match coding region protein
2743 string mrna_name = new_mrna->GetData().GetRna().GetRnaProductName();
2744 if (NStr::IsBlank(mrna_name) && new_mrna->IsSetQual()) {
2745 for (auto it = new_mrna->GetQual().begin(); it != new_mrna->GetQual().end(); it++) {
2746 if ((*it)->IsSetQual() && (*it)->IsSetVal() && NStr::EqualNocase((*it)->GetQual(), "product")) {
2747 mrna_name = (*it)->GetVal();
2748 break;
2749 }
2750 }
2751 }
2752 if (NStr::IsBlank(mrna_name)
2753 || (!NStr::Equal(current_name, "hypothetical protein") &&
2754 !NStr::Equal(current_name, mrna_name))) {
2755 SetMrnaName(*new_mrna, current_name);
2756 change_mrna = true;
2757 }
2758 // Adjust mRNA partials to match coding region
2759 change_mrna |= feature::CopyFeaturePartials(*new_mrna, *new_cds);
2760 if (change_mrna) {
2761 CSeq_feat_Handle fh = entry.GetScope().GetSeq_featHandle(*mrna);
2762 CSeq_feat_EditHandle feh(fh);
2763 feh.Replace(*new_mrna);
2764 any_changes = true;
2765 }
2766 }
2767 }
2768
2769 //any_changes |= feature::RetranslateCDS(*new_cds, entry.GetScope());
2770 if (change_this_cds) {
2771 CSeq_feat_EditHandle cds_h(*cds_it);
2772
2773 cds_h.Replace(*new_cds);
2774 any_changes = true;
2775
2776 //also need to redo protein title
2777 }
2778
2779 }
2780
2781 CTSE_Handle tse = entry.GetTSE_Handle();
2782
2783 for (CFeat_CI rna_it(entry, SAnnotSelector(CSeqFeatData::e_Rna)); rna_it; ++rna_it) {
2784
2785 const CSeq_feat& rna_feat = *(rna_it->GetSeq_feat());
2786 if (rna_feat.IsSetData() &&
2787 rna_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_rRNA &&
2788 s_CleanupIsShortrRNA(rna_feat, &(entry.GetScope()))) {
2789
2790 bool change_this_rrna = false;
2791 CRef<CSeq_feat> new_rrna(new CSeq_feat());
2792 new_rrna->Assign(*(rna_it->GetSeq_feat()));
2793
2794 const CSeq_loc& loc = rna_feat.GetLocation();
2795 if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
2796 if (loc.GetStart(eExtreme_Biological) >= sequence::GetLength(rna_feat.GetLocation(), &entry.GetScope())) {
2797 new_rrna->SetLocation().SetPartialStart(true, eExtreme_Biological);
2798 change_this_rrna = true;
2799 }
2800 if (loc.GetStop(eExtreme_Biological) < 1) {
2801 new_rrna->SetLocation().SetPartialStop(true, eExtreme_Biological);
2802 change_this_rrna = true;
2803 }
2804 } else {
2805 if (loc.GetStart(eExtreme_Biological) < 1) {
2806 new_rrna->SetLocation().SetPartialStart(true, eExtreme_Biological);
2807 change_this_rrna = true;
2808 }
2809 if (loc.GetStop(eExtreme_Biological) >= sequence::GetLength(rna_feat.GetLocation(), &entry.GetScope())) {
2810 new_rrna->SetLocation().SetPartialStop(true, eExtreme_Biological);
2811 change_this_rrna = true;
2812 }
2813 }
2814
2815 if (change_this_rrna) {
2816 CSeq_feat_EditHandle rrna_h(*rna_it);
2817 rrna_h.Replace(*new_rrna);
2818 any_changes = true;
2819 }
2820 }
2821 }
2822
2823 for (CFeat_CI gene_it(entry, SAnnotSelector(CSeqFeatData::e_Gene)); gene_it; ++gene_it) {
2824 bool change_this_gene;
2825 CRef<CSeq_feat> new_gene(new CSeq_feat());
2826 new_gene->Assign(*(gene_it->GetSeq_feat()));
2827
2828 change_this_gene = ExpandGeneToIncludeChildren(*new_gene, tse);
2829
2830 change_this_gene |= SetGenePartialByLongestContainedFeature(*new_gene, entry.GetScope());
2831
2832 if (change_this_gene) {
2833 CSeq_feat_EditHandle gene_h(*gene_it);
2834 gene_h.Replace(*new_gene);
2835 any_changes = true;
2836 }
2837 }
2838
2839 NormalizeDescriptorOrder(entry);
2840
2841 for (CBioseq_CI bi(entry, CSeq_inst::eMol_na); bi; ++bi) {
2842 any_changes |= SetGeneticCodes(*bi);
2843 }
2844
2845 if (run_extended_cleanup) {
2846 auto pChanged = CCleanup::ExtendedCleanup(entry, options);
2847 if (pChanged->ChangeCount()>0) {
2848 return true;
2849 }
2850 }
2851 return any_changes;
2852 }
2853
2854
x_HasShortIntron(const CSeq_loc & loc,size_t min_len)2855 bool CCleanup::x_HasShortIntron(const CSeq_loc& loc, size_t min_len)
2856 {
2857 CSeq_loc_CI li(loc);
2858 while (li && li.IsEmpty()) {
2859 ++li;
2860 }
2861 if (!li) {
2862 return false;
2863 }
2864 while (li) {
2865 TSeqPos prev_end;
2866 ENa_strand prev_strand;
2867 if (li.IsSetStrand() && li.GetStrand() == eNa_strand_minus) {
2868 prev_end = li.GetRange().GetFrom();
2869 prev_strand = eNa_strand_minus;
2870 } else {
2871 prev_end = li.GetRange().GetTo();
2872 prev_strand = eNa_strand_plus;
2873 }
2874 ++li;
2875 while (li && li.IsEmpty()) {
2876 ++li;
2877 }
2878 if (li) {
2879 TSeqPos this_start;
2880 ENa_strand this_strand;
2881 if (li.IsSetStrand() && li.GetStrand() == eNa_strand_minus) {
2882 this_start = li.GetRange().GetTo();
2883 this_strand = eNa_strand_minus;
2884 } else {
2885 this_start = li.GetRange().GetFrom();
2886 this_strand = eNa_strand_plus;
2887 }
2888 if (this_strand == prev_strand) {
2889 if (abs((long int)this_start - (long int)prev_end) < min_len) {
2890 return true;
2891 }
2892 }
2893 }
2894 }
2895 return false;
2896 }
2897
2898 //LCOV_EXCL_START
2899 //not used by asn_cleanup but used by table2asn
2900 const string kLowQualitySequence = "low-quality sequence region";
2901
x_AddLowQualityException(CSeq_feat & feat)2902 bool CCleanup::x_AddLowQualityException(CSeq_feat& feat)
2903 {
2904 bool any_change = false;
2905 if (!feat.IsSetExcept()) {
2906 any_change = true;
2907 feat.SetExcept(true);
2908 }
2909 if (!feat.IsSetExcept_text() || NStr::IsBlank(feat.GetExcept_text())) {
2910 feat.SetExcept_text(kLowQualitySequence);
2911 any_change = true;
2912 } else if (NStr::Find(feat.GetExcept_text(), kLowQualitySequence) == string::npos) {
2913 feat.SetExcept_text(feat.GetExcept_text() + "; " + kLowQualitySequence);
2914 any_change = true;
2915 }
2916 return any_change;
2917 }
2918
2919
x_AddLowQualityException(CSeq_entry_Handle entry,CSeqFeatData::ESubtype subtype)2920 bool CCleanup::x_AddLowQualityException(CSeq_entry_Handle entry, CSeqFeatData::ESubtype subtype)
2921 {
2922 bool any_changes = false;
2923
2924 SAnnotSelector sel(subtype);
2925 for (CFeat_CI cds_it(entry, sel); cds_it; ++cds_it) {
2926 bool change_this_cds = false;
2927 CRef<CSeq_feat> new_cds(new CSeq_feat());
2928 new_cds->Assign(*(cds_it->GetSeq_feat()));
2929 if (!sequence::IsPseudo(*(cds_it->GetSeq_feat()), entry.GetScope()) &&
2930 x_HasShortIntron(cds_it->GetLocation())) {
2931 change_this_cds = x_AddLowQualityException(*new_cds);
2932 }
2933
2934 if (change_this_cds) {
2935 CSeq_feat_EditHandle cds_h(*cds_it);
2936
2937 cds_h.Replace(*new_cds);
2938 any_changes = true;
2939 }
2940 }
2941 return any_changes;
2942 }
2943
2944
AddLowQualityException(CSeq_entry_Handle entry)2945 bool CCleanup::AddLowQualityException(CSeq_entry_Handle entry)
2946 {
2947 bool any_changes = x_AddLowQualityException(entry, CSeqFeatData::eSubtype_cdregion);
2948 any_changes |= x_AddLowQualityException(entry, CSeqFeatData::eSubtype_mRNA);
2949 return any_changes;
2950 }
2951 //LCOV_EXCL_STOP
2952
2953
// Maps each Seqdesc choice to the rank it should have when descriptors are
// sorted (lowest rank first).  Ranks run from 1 (Title) to 25 (Update_date);
// s_SeqDescToOrdering assigns choices that are not listed here a rank of
// one more than the number of rows.
typedef SStaticPair<CSeqdesc::E_Choice, int> TSeqdescOrderElem;
static const TSeqdescOrderElem sc_seqdesc_order_map[] = {
    // Note that ordering must match ordering
    // in CSeqdesc::E_Choice
    // (i.e. rows are listed by key, not by rank; presumably the static
    // array map requires keys in ascending order -- confirm before
    // inserting a row anywhere but in E_Choice position)
    { CSeqdesc::e_Mol_type, 13 },
    { CSeqdesc::e_Modif, 14 },
    { CSeqdesc::e_Method, 15 },
    { CSeqdesc::e_Name, 7 },
    { CSeqdesc::e_Title, 1 },
    { CSeqdesc::e_Org, 16 },
    { CSeqdesc::e_Comment, 6 },
    { CSeqdesc::e_Num, 11 },
    { CSeqdesc::e_Maploc, 9 },
    { CSeqdesc::e_Pir, 18 },
    { CSeqdesc::e_Genbank, 22 },
    { CSeqdesc::e_Pub, 5 },
    { CSeqdesc::e_Region, 10 },
    { CSeqdesc::e_User, 8 },
    { CSeqdesc::e_Sp, 17 },
    { CSeqdesc::e_Dbxref, 12 },
    { CSeqdesc::e_Embl, 21 },
    { CSeqdesc::e_Create_date, 24 },
    { CSeqdesc::e_Update_date, 25 },
    { CSeqdesc::e_Prf, 19 },
    { CSeqdesc::e_Pdb, 20 },
    { CSeqdesc::e_Het, 4 },

    { CSeqdesc::e_Source, 2 },
    { CSeqdesc::e_Molinfo, 3 },
    { CSeqdesc::e_Modelev, 23 }
};
// Lookup wrapper over the table above, used by s_SeqDescToOrdering.
typedef CStaticPairArrayMap<CSeqdesc::E_Choice, int> TSeqdescOrderMap;
DEFINE_STATIC_ARRAY_MAP(TSeqdescOrderMap, sc_SeqdescOrderMap, sc_seqdesc_order_map);
2989
2990 static
s_SeqDescToOrdering(CSeqdesc::E_Choice chs)2991 int s_SeqDescToOrdering(CSeqdesc::E_Choice chs) {
2992 // ordering assigned to unknown
2993 const int unknown_seqdesc = static_cast<int>(1 + sc_SeqdescOrderMap.size());
2994
2995 TSeqdescOrderMap::const_iterator find_iter = sc_SeqdescOrderMap.find(chs);
2996 if (find_iter == sc_SeqdescOrderMap.end()) {
2997 return unknown_seqdesc;
2998 }
2999
3000 return find_iter->second;
3001 }
3002
3003 static
s_SeqDescLessThan(const CRef<CSeqdesc> & desc1,const CRef<CSeqdesc> & desc2)3004 bool s_SeqDescLessThan(const CRef<CSeqdesc> &desc1, const CRef<CSeqdesc> &desc2)
3005 {
3006 CSeqdesc::E_Choice chs1, chs2;
3007
3008 chs1 = desc1->Which();
3009 chs2 = desc2->Which();
3010
3011 return (s_SeqDescToOrdering(chs1) < s_SeqDescToOrdering(chs2));
3012 }
3013
NormalizeDescriptorOrder(CSeq_descr & descr)3014 bool CCleanup::NormalizeDescriptorOrder(CSeq_descr& descr)
3015 {
3016 bool rval = false;
3017 if (!seq_mac_is_sorted(descr.Set().begin(), descr.Set().end(), s_SeqDescLessThan)) {
3018 descr.Set().sort(s_SeqDescLessThan);
3019 rval = true;
3020 }
3021 return rval;
3022 }
3023
NormalizeDescriptorOrder(CSeq_entry_Handle seh)3024 bool CCleanup::NormalizeDescriptorOrder(CSeq_entry_Handle seh)
3025 {
3026 bool rval = false;
3027
3028 CSeq_entry_CI ci(seh, CSeq_entry_CI::fRecursive | CSeq_entry_CI::fIncludeGivenEntry);
3029 while (ci) {
3030 CSeq_entry_EditHandle edit(*ci);
3031 if (edit.IsSetDescr()) {
3032 rval |= NormalizeDescriptorOrder(edit.SetDescr());
3033 }
3034 ++ci;
3035 }
3036
3037 return rval;
3038 }
3039
3040
RemoveUnseenTitles(CSeq_entry_EditHandle::TSeq seq)3041 bool CCleanup::RemoveUnseenTitles(CSeq_entry_EditHandle::TSeq seq)
3042 {
3043 bool removed = false;
3044 if (seq.IsSetDescr()) {
3045 CConstRef<CSeqdesc> last_title(NULL);
3046 ITERATE(CBioseq::TDescr::Tdata, d, seq.GetDescr().Get()) {
3047 if ((*d)->IsTitle()) {
3048 if (last_title) {
3049 seq.RemoveSeqdesc(*last_title);
3050 removed = true;
3051 }
3052 last_title.Reset(d->GetPointer());
3053 }
3054 }
3055 }
3056 return removed;
3057 }
3058
3059
RemoveUnseenTitles(CSeq_entry_EditHandle::TSet set)3060 bool CCleanup::RemoveUnseenTitles(CSeq_entry_EditHandle::TSet set)
3061 {
3062 bool removed = false;
3063 if (set.IsSetDescr()) {
3064 CConstRef<CSeqdesc> last_title(NULL);
3065 ITERATE(CBioseq::TDescr::Tdata, d, set.GetDescr().Get()) {
3066 if ((*d)->IsTitle()) {
3067 if (last_title) {
3068 set.RemoveSeqdesc(*last_title);
3069 removed = true;
3070 }
3071 last_title.Reset(d->GetPointer());
3072 }
3073 }
3074 }
3075 return removed;
3076 }
3077
3078
AddGenBankWrapper(CSeq_entry_Handle seh)3079 bool CCleanup::AddGenBankWrapper(CSeq_entry_Handle seh)
3080 {
3081 if (seh.IsSet() && seh.GetSet().IsSetClass() &&
3082 seh.GetSet().GetClass() == CBioseq_set::eClass_genbank) {
3083 return false;
3084 }
3085 CSeq_entry_EditHandle eh(seh);
3086 eh.ConvertSeqToSet(CBioseq_set::eClass_genbank);
3087 return true;
3088 }
3089
3090
s_GetAuthorsString(string * out_authors,const CAuth_list & auth_list)3091 void s_GetAuthorsString(string *out_authors, const CAuth_list& auth_list)
3092 {
3093 string & auth_str = *out_authors;
3094 auth_str.clear();
3095
3096 if (!auth_list.IsSetNames()) {
3097 return;
3098 }
3099
3100 vector<string> name_list;
3101
3102 if (auth_list.GetNames().IsStd()) {
3103 ITERATE(CAuth_list::TNames::TStd, auth_it, auth_list.GetNames().GetStd()) {
3104 if ((*auth_it)->IsSetName()) {
3105 string label;
3106 (*auth_it)->GetName().GetLabel(&label);
3107 name_list.push_back(label);
3108 }
3109 }
3110 } else if (auth_list.GetNames().IsMl()) {
3111 copy(BEGIN_COMMA_END(auth_list.GetNames().GetMl()),
3112 back_inserter(name_list));
3113 } else if (auth_list.GetNames().IsStr()) {
3114 copy(BEGIN_COMMA_END(auth_list.GetNames().GetStr()),
3115 back_inserter(name_list));
3116 }
3117
3118 if (name_list.size() == 0) {
3119 return;
3120 } else if (name_list.size() == 1) {
3121 auth_str = name_list.back();
3122 return;
3123 }
3124
3125 // join most of them by commas, but the last one gets an "and"
3126 string last_author;
3127 last_author.swap(name_list.back());
3128 name_list.pop_back();
3129 // swap is faster than assignment
3130 NStr::Join(name_list, ", ").swap(auth_str);
3131 auth_str += "and ";
3132 auth_str += last_author;
3133
3134 return;
3135 }
3136
3137
s_GetAuthorsString(string * out_authors_string,const CPubdesc & pd)3138 void s_GetAuthorsString(
3139 string *out_authors_string, const CPubdesc& pd)
3140 {
3141 string & authors_string = *out_authors_string;
3142 authors_string.clear();
3143
3144 FOR_EACH_PUB_ON_PUBDESC(pub, pd) {
3145 if ((*pub)->IsSetAuthors()) {
3146 s_GetAuthorsString(&authors_string, (*pub)->GetAuthors());
3147 break;
3148 }
3149 }
3150 }
3151
3152
GetPubdescLabels(const CPubdesc & pd,vector<TEntrezId> & pmids,vector<TEntrezId> & muids,vector<int> & serials,vector<string> & published_labels,vector<string> & unpublished_labels)3153 void CCleanup::GetPubdescLabels
3154 (const CPubdesc& pd,
3155 vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
3156 vector<string>& published_labels,
3157 vector<string>& unpublished_labels)
3158 {
3159 string label;
3160 bool is_published = false;
3161 bool need_label = false;
3162
3163 if (!pd.IsSetPub()) {
3164 return;
3165 }
3166 ITERATE(CPubdesc::TPub::Tdata, it, pd.GetPub().Get()) {
3167 if ((*it)->IsPmid()) {
3168 pmids.push_back((*it)->GetPmid());
3169 is_published = true;
3170 } else if ((*it)->IsMuid()) {
3171 muids.push_back((*it)->GetMuid());
3172 is_published = true;
3173 } else if ((*it)->IsGen()) {
3174 if ((*it)->GetGen().IsSetCit()
3175 && NStr::StartsWith((*it)->GetGen().GetCit(), "BackBone id_pub", NStr::eNocase)) {
3176 need_label = true;
3177 }
3178 if ((*it)->GetGen().IsSetSerial_number()) {
3179 serials.push_back((*it)->GetGen().GetSerial_number());
3180 if ((*it)->GetGen().IsSetCit()
3181 || (*it)->GetGen().IsSetJournal()
3182 || (*it)->GetGen().IsSetDate()) {
3183 need_label = true;
3184 }
3185 } else {
3186 need_label = true;
3187 }
3188 } else if ((*it)->IsArticle() && (*it)->GetArticle().IsSetIds()) {
3189 is_published = true;
3190 ITERATE(CArticleIdSet::Tdata, id, (*it)->GetArticle().GetIds().Get()) {
3191 if ((*id)->IsPubmed()) {
3192 pmids.push_back((*id)->GetPubmed());
3193 is_published = true;
3194 } else if ((*id)->IsMedline()) {
3195 muids.push_back((*id)->GetMedline());
3196 }
3197 }
3198 need_label = true;
3199 } else {
3200 need_label = true;
3201 }
3202 if (need_label && NStr::IsBlank(label)) {
3203 // create unique label
3204 (*it)->GetLabel(&label, CPub::eContent, true);
3205 string auth_str;
3206 s_GetAuthorsString(&auth_str, pd);
3207 label += "; ";
3208 label += auth_str;
3209 }
3210 }
3211 if (!NStr::IsBlank(label)) {
3212 if (is_published) {
3213 published_labels.push_back(label);
3214 } else {
3215 unpublished_labels.push_back(label);
3216 }
3217 }
3218 }
3219
3220
GetCitationList(CBioseq_Handle bsh)3221 vector<CConstRef<CPub> > CCleanup::GetCitationList(CBioseq_Handle bsh)
3222 {
3223 vector<CConstRef<CPub> > pub_list;
3224
3225 // first get descriptor pubs
3226 CSeqdesc_CI di(bsh, CSeqdesc::e_Pub);
3227 while (di) {
3228 vector<TEntrezId> pmids;
3229 vector<TEntrezId> muids;
3230 vector<int> serials;
3231 vector<string> published_labels;
3232 vector<string> unpublished_labels;
3233 GetPubdescLabels(di->GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
3234 if (pmids.size() > 0) {
3235 CRef<CPub> pub(new CPub());
3236 pub->SetPmid().Set(pmids[0]);
3237 pub_list.push_back(pub);
3238 } else if (muids.size() > 0) {
3239 CRef<CPub> pub(new CPub());
3240 pub->SetMuid(muids[0]);
3241 pub_list.push_back(pub);
3242 } else if (serials.size() > 0) {
3243 CRef<CPub> pub(new CPub());
3244 pub->SetGen().SetSerial_number(serials[0]);
3245 pub_list.push_back(pub);
3246 } else if (published_labels.size() > 0) {
3247 CRef<CPub> pub(new CPub());
3248 pub->SetGen().SetCit(published_labels[0]);
3249 pub_list.push_back(pub);
3250 } else if (unpublished_labels.size() > 0) {
3251 CRef<CPub> pub(new CPub());
3252 pub->SetGen().SetCit(unpublished_labels[0]);
3253 pub_list.push_back(pub);
3254 }
3255
3256 ++di;
3257 }
3258 // now get pub features
3259 CFeat_CI fi(bsh, SAnnotSelector(CSeqFeatData::e_Pub));
3260 while (fi) {
3261 vector<TEntrezId> pmids;
3262 vector<TEntrezId> muids;
3263 vector<int> serials;
3264 vector<string> published_labels;
3265 vector<string> unpublished_labels;
3266 GetPubdescLabels(fi->GetData().GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
3267 if (pmids.size() > 0) {
3268 CRef<CPub> pub(new CPub());
3269 pub->SetPmid().Set(pmids[0]);
3270 pub_list.push_back(pub);
3271 } else if (muids.size() > 0) {
3272 CRef<CPub> pub(new CPub());
3273 pub->SetMuid(muids[0]);
3274 pub_list.push_back(pub);
3275 } else if (serials.size() > 0) {
3276 CRef<CPub> pub(new CPub());
3277 pub->SetGen().SetSerial_number(serials[0]);
3278 pub_list.push_back(pub);
3279 } else if (published_labels.size() > 0) {
3280 CRef<CPub> pub(new CPub());
3281 pub->SetGen().SetCit(published_labels[0]);
3282 pub_list.push_back(pub);
3283 } else if (unpublished_labels.size() > 0) {
3284 CRef<CPub> pub(new CPub());
3285 pub->SetGen().SetCit(unpublished_labels[0]);
3286 pub_list.push_back(pub);
3287 }
3288
3289 ++fi;
3290 }
3291 return pub_list;
3292 }
3293
3294
RemoveDuplicatePubs(CSeq_descr & descr)3295 bool CCleanup::RemoveDuplicatePubs(CSeq_descr& descr)
3296 {
3297 bool any_change = false;
3298 CSeq_descr::Tdata::iterator it1 = descr.Set().begin();
3299 while (it1 != descr.Set().end()) {
3300 if ((*it1)->IsPub()) {
3301 CSeq_descr::Tdata::iterator it2 = it1;
3302 ++it2;
3303 while (it2 != descr.Set().end()) {
3304 if ((*it2)->IsPub() && (*it1)->GetPub().Equals((*it2)->GetPub())) {
3305 it2 = descr.Set().erase(it2);
3306 any_change = true;
3307 } else {
3308 ++it2;
3309 }
3310 }
3311 }
3312 ++it1;
3313 }
3314 return any_change;
3315 }
3316
3317
s_FirstPubMatchesSecond(const CPubdesc & pd1,const CPubdesc & pd2)3318 bool s_FirstPubMatchesSecond(const CPubdesc& pd1, const CPubdesc& pd2)
3319 {
3320 if (pd1.Equals(pd2)) {
3321 return true;
3322 } else if (pd1.IsSetPub() && pd2.IsSetPub() && pd1.GetPub().Get().size() == 1) {
3323 ITERATE(CPubdesc::TPub::Tdata, it, pd2.GetPub().Get()) {
3324 if (pd1.GetPub().Get().front()->Equals(**it)) {
3325 return true;
3326 }
3327 }
3328 }
3329 return false;
3330 }
3331
3332
PubAlreadyInSet(const CPubdesc & pd,const CSeq_descr & descr)3333 bool CCleanup::PubAlreadyInSet(const CPubdesc& pd, const CSeq_descr& descr)
3334 {
3335 ITERATE(CSeq_descr::Tdata, d, descr.Get()) {
3336 if ((*d)->IsPub() && s_FirstPubMatchesSecond(pd, (*d)->GetPub())) {
3337 return true;
3338 }
3339 }
3340 return false;
3341 }
3342
3343
OkToPromoteNpPub(const CBioseq & b)3344 bool CCleanup::OkToPromoteNpPub(const CBioseq& b)
3345 {
3346 bool is_embl_or_ddbj = false;
3347 ITERATE(CBioseq::TId, id, b.GetId()) {
3348 if ((*id)->IsEmbl() || (*id)->IsDdbj()) {
3349 is_embl_or_ddbj = true;
3350 break;
3351 }
3352 }
3353 return !is_embl_or_ddbj;
3354 }
3355
3356
OkToPromoteNpPub(const CPubdesc & pd)3357 bool CCleanup::OkToPromoteNpPub(const CPubdesc& pd)
3358 {
3359 if (pd.IsSetNum() || pd.IsSetName() || pd.IsSetFig() || pd.IsSetComment()) {
3360 return false;
3361 } else {
3362 return true;
3363 }
3364 }
3365
3366
MoveOneFeatToPubdesc(CSeq_feat_Handle feat,CRef<CSeqdesc> d,CBioseq_Handle b,bool remove_feat)3367 void CCleanup::MoveOneFeatToPubdesc(CSeq_feat_Handle feat, CRef<CSeqdesc> d, CBioseq_Handle b, bool remove_feat)
3368 {
3369 // add descriptor to nuc-prot parent or sequence itself
3370 CBioseq_set_Handle parent = b.GetParentBioseq_set();
3371 if (!CCleanup::OkToPromoteNpPub(*(b.GetCompleteBioseq()))) {
3372 // add to sequence
3373 CBioseq_EditHandle eh(b);
3374 eh.AddSeqdesc(*d);
3375 RemoveDuplicatePubs(eh.SetDescr());
3376 NormalizeDescriptorOrder(eh.SetDescr());
3377 } else if (parent && parent.IsSetClass() &&
3378 parent.GetClass() == CBioseq_set::eClass_nuc_prot &&
3379 parent.IsSetDescr() && PubAlreadyInSet(d->GetPub(), parent.GetDescr())) {
3380 // don't add descriptor, just delete feature
3381 } else if (OkToPromoteNpPub((d)->GetPub()) &&
3382 parent && parent.IsSetClass() &&
3383 parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
3384 CBioseq_set_EditHandle eh(parent);
3385 eh.AddSeqdesc(*d);
3386 RemoveDuplicatePubs(eh.SetDescr());
3387 NormalizeDescriptorOrder(eh.SetDescr());
3388 } else {
3389 CBioseq_EditHandle eh(b);
3390 eh.AddSeqdesc(*d);
3391 RemoveDuplicatePubs(eh.SetDescr());
3392 NormalizeDescriptorOrder(eh.SetDescr());
3393 }
3394 if (remove_feat) {
3395 // remove feature
3396 CSeq_feat_EditHandle feh(feat);
3397 feh.Remove();
3398 }
3399 }
3400
3401
ConvertPubFeatsToPubDescs(CSeq_entry_Handle seh)3402 bool CCleanup::ConvertPubFeatsToPubDescs(CSeq_entry_Handle seh)
3403 {
3404 bool any_change = false;
3405 for (CBioseq_CI b(seh); b; ++b) {
3406 for (CFeat_CI p(*b, CSeqFeatData::e_Pub); p; ++p) {
3407 if (p->GetLocation().IsInt() &&
3408 p->GetLocation().GetStart(eExtreme_Biological) == 0 &&
3409 p->GetLocation().GetStop(eExtreme_Biological) == b->GetBioseqLength() - 1) {
3410 CRef<CSeqdesc> d(new CSeqdesc());
3411 d->SetPub().Assign(p->GetData().GetPub());
3412 if (p->IsSetComment()) {
3413 if (d->GetPub().IsSetComment() && !NStr::IsBlank(d->GetPub().GetComment())) {
3414 d->SetPub().SetComment(d->GetPub().GetComment() + "; " + p->GetComment());
3415 } else {
3416 d->SetPub().SetComment();
3417 }
3418 }
3419 MoveOneFeatToPubdesc(*p, d, *b);
3420 any_change = true;
3421 }
3422 }
3423 }
3424 return any_change;
3425 }
3426
3427
IsSiteRef(const CSeq_feat & sf)3428 bool IsSiteRef(const CSeq_feat& sf)
3429 {
3430 if (sf.GetData().IsImp() &&
3431 sf.GetData().GetImp().IsSetKey() &&
3432 NStr::Equal(sf.GetData().GetImp().GetKey(), "Site-ref")) {
3433 return true;
3434 } else {
3435 return false;
3436 }
3437 }
3438
3439
IsMinPub(const CPubdesc & pd,bool is_refseq_prot)3440 bool CCleanup::IsMinPub(const CPubdesc& pd, bool is_refseq_prot)
3441 {
3442 if (!pd.IsSetPub()) {
3443 return true;
3444 }
3445 bool found_non_minimal = false;
3446 ITERATE(CPubdesc::TPub::Tdata, it, pd.GetPub().Get()) {
3447 if ((*it)->IsMuid() || (*it)->IsPmid()) {
3448 if (is_refseq_prot) {
3449 found_non_minimal = true;
3450 break;
3451 }
3452 } else if ((*it)->IsGen()) {
3453 const CCit_gen& gen = (*it)->GetGen();
3454 if (gen.IsSetCit() && !gen.IsSetJournal() &&
3455 !gen.IsSetAuthors() && !gen.IsSetVolume() &&
3456 !gen.IsSetPages()) {
3457 //minimalish, keep looking
3458 } else {
3459 found_non_minimal = true;
3460 }
3461 } else {
3462 found_non_minimal = true;
3463 break;
3464 }
3465 }
3466
3467 return !found_non_minimal;
3468 }
3469
3470
/// Turns pubs cited on features into pub descriptors, and removes
/// "Site-ref" features.
///
/// Does nothing (returns false) unless the entry contains at least one
/// Imp feature for which IsSiteRef is true.  Otherwise, for every feature
/// on every bioseq whose citation set is of kind Pub, each cited pub is
/// copied into a new pub descriptor (the members of a Pub-equiv are copied
/// individually), cleaned with BasicCleanup, and -- unless IsMinPub
/// considers it minimal -- passed to MoveOneFeatToPubdesc without removing
/// the feature.  Site-ref features are then removed, together with their
/// annotation if ShouldRemoveAnnot says so.
///
/// @return true if at least one feature with a Pub citation set was
///         processed (set regardless of whether a descriptor was added)
bool CCleanup::RescueSiteRefPubs(CSeq_entry_Handle seh)
{
    // Pre-scan: is there any Site-ref feature at all?
    bool found_site_ref = false;
    CFeat_CI f(seh, CSeqFeatData::e_Imp);
    while (f && !found_site_ref) {
        if (IsSiteRef(*(f->GetSeq_feat()))) {
            found_site_ref = true;
        }
        ++f;
    }
    if (!found_site_ref) {
        return false;
    }

    bool any_change = false;
    for (CBioseq_CI b(seh); b; ++b) {
        // Flag passed to IsMinPub: a protein with an "other"-type Seq-id
        bool is_refseq_prot = false;
        if (b->IsAa()) {
            ITERATE(CBioseq::TId, id_it, b->GetCompleteBioseq()->GetId()) {
                if ((*id_it)->IsOther()) {
                    is_refseq_prot = true;
                    break;
                }
            }
        }

        for (CFeat_CI p(*b); p; ++p) {
            // only features that cite pubs directly are of interest
            if (!p->IsSetCit() || p->GetCit().Which() != CPub_set::e_Pub) {
                continue;
            }

            bool is_site_ref = IsSiteRef(*(p->GetSeq_feat()));
            ITERATE(CSeq_feat::TCit::TPub, c, p->GetCit().GetPub()) {
                // one new descriptor per cited pub
                CRef<CSeqdesc> d(new CSeqdesc());
                if ((*c)->IsEquiv()) {
                    // flatten the equiv: copy each member pub
                    ITERATE(CPub_equiv::Tdata, t, (*c)->GetEquiv().Get()) {
                        CRef<CPub> pub_copy(new CPub());
                        pub_copy->Assign(**t);
                        d->SetPub().SetPub().Set().push_back(pub_copy);
                    }

                } else {
                    CRef<CPub> pub_copy(new CPub());
                    pub_copy->Assign(**c);
                    d->SetPub().SetPub().Set().push_back(pub_copy);
                }
                // record where the reference came from
                if (is_site_ref) {
                    d->SetPub().SetReftype(CPubdesc::eReftype_sites);
                } else {
                    d->SetPub().SetReftype(CPubdesc::eReftype_feats);
                }
                // clean the new pubdesc before judging whether it is minimal
                CRef<CCleanupChange> changes(makeCleanupChange(0));
                CNewCleanup_imp pubclean(changes, 0);
                pubclean.BasicCleanup(d->SetPub(), ShouldStripPubSerial(*(b->GetCompleteBioseq())));
                if (!IsMinPub(d->SetPub(), is_refseq_prot)) {
                    // remove_feat = false: the feature is dealt with below
                    MoveOneFeatToPubdesc(*p, d, *b, false);
                }
            }
            if (is_site_ref) {

                CSeq_feat_EditHandle feh(*p);
                // grab the annot before the feature is detached from it
                CSeq_annot_Handle annot = feh.GetAnnot();

                feh.Remove();

                // remove old annot if now empty
                if (CNewCleanup_imp::ShouldRemoveAnnot(*(annot.GetCompleteSeq_annot()))) {
                    CSeq_annot_EditHandle annot_edit(annot);
                    annot_edit.Remove();
                }

            }
            any_change = true;
        }
    }
    return any_change;
}
3548
3549
AreBioSourcesMergeable(const CBioSource & src1,const CBioSource & src2)3550 bool CCleanup::AreBioSourcesMergeable(const CBioSource& src1, const CBioSource& src2)
3551 {
3552 if (src1.IsSetOrg() && src1.GetOrg().IsSetTaxname() &&
3553 src2.IsSetOrg() && src2.GetOrg().IsSetTaxname() &&
3554 NStr::Equal(src1.GetOrg().GetTaxname(), src2.GetOrg().GetTaxname())) {
3555 return true;
3556 } else {
3557 return false;
3558 }
3559 }
3560
3561
s_SubsourceCompareC(const CRef<CSubSource> & st1,const CRef<CSubSource> & st2)3562 static bool s_SubsourceCompareC (
3563 const CRef<CSubSource>& st1,
3564 const CRef<CSubSource>& st2
3565 )
3566
3567 {
3568 const CSubSource& sbs1 = *(st1);
3569 const CSubSource& sbs2 = *(st2);
3570
3571 TSUBSOURCE_SUBTYPE chs1 = GET_FIELD (sbs1, Subtype);
3572 TSUBSOURCE_SUBTYPE chs2 = GET_FIELD (sbs2, Subtype);
3573
3574 if (chs1 < chs2) return true;
3575 if (chs1 > chs2) return false;
3576
3577 if (FIELD_IS_SET (sbs2, Name)) {
3578 if (! FIELD_IS_SET (sbs1, Name)) return true;
3579 if (NStr::CompareNocase(GET_FIELD (sbs1, Name), GET_FIELD (sbs2, Name)) < 0) return true;
3580 }
3581
3582 return false;
3583 }
3584
s_SameSubtypeC(const CSubSource & s1,const CSubSource & s2)3585 static bool s_SameSubtypeC(const CSubSource& s1, const CSubSource& s2)
3586 {
3587 if (!s1.IsSetSubtype() && !s2.IsSetSubtype()) {
3588 return true;
3589 } else if (!s1.IsSetSubtype() || !s2.IsSetSubtype()) {
3590 return false;
3591 } else {
3592 return s1.GetSubtype() == s2.GetSubtype();
3593 }
3594 }
3595
3596 // close enough if second name contains the first
s_NameCloseEnoughC(const CSubSource & s1,const CSubSource & s2)3597 static bool s_NameCloseEnoughC(const CSubSource& s1, const CSubSource& s2)
3598 {
3599 if (!s1.IsSetName() && !s2.IsSetName()) {
3600 return true;
3601 } else if (!s1.IsSetName() || !s2.IsSetName()) {
3602 return false;
3603 }
3604 const string& n1 = s1.GetName();
3605 const string& n2 = s2.GetName();
3606
3607 if (NStr::Equal(n1, n2)) {
3608 return true;
3609 } else {
3610 return false;
3611 }
3612 }
3613
3614
s_SubSourceListUniqued(CBioSource & biosrc)3615 bool s_SubSourceListUniqued(CBioSource& biosrc)
3616 {
3617 bool res = false;
3618
3619 // sort and remove duplicates.
3620 if (biosrc.IsSetSubtype() && biosrc.GetSubtype().size() > 1) {
3621 if (!SUBSOURCE_ON_BIOSOURCE_IS_SORTED(biosrc, s_SubsourceCompareC)) {
3622 SORT_SUBSOURCE_ON_BIOSOURCE(biosrc, s_SubsourceCompareC);
3623 }
3624
3625 // remove duplicates and subsources that contain previous values
3626 CBioSource::TSubtype::iterator s = biosrc.SetSubtype().begin();
3627 CBioSource::TSubtype::iterator s_next = s;
3628 ++s_next;
3629 while (s_next != biosrc.SetSubtype().end()) {
3630 if (s_SameSubtypeC(**s, **s_next) && s_NameCloseEnoughC(**s, **s_next)) {
3631 s = biosrc.SetSubtype().erase(s);
3632 res = true;
3633 } else {
3634 ++s;
3635 }
3636 ++s_next;
3637 }
3638 }
3639
3640 return res;
3641 }
3642
MergeDupBioSources(CBioSource & src1,const CBioSource & add)3643 bool CCleanup::MergeDupBioSources(CBioSource& src1, const CBioSource& add)
3644 {
3645 bool any_change = false;
3646 // genome
3647 if ((!src1.IsSetGenome() || src1.GetGenome() == CBioSource::eGenome_unknown) &&
3648 add.IsSetGenome() && add.GetGenome() != CBioSource::eGenome_unknown) {
3649 src1.SetGenome(add.GetGenome());
3650 any_change = true;
3651 }
3652 // origin
3653 if ((!src1.IsSetOrigin() || src1.GetOrigin() == CBioSource::eOrigin_unknown) &&
3654 add.IsSetOrigin() && add.GetOrigin() != CBioSource::eOrigin_unknown) {
3655 src1.SetOrigin(add.GetOrigin());
3656 any_change = true;
3657 }
3658 // focus
3659 if (!src1.IsSetIs_focus() && add.IsSetIs_focus()) {
3660 src1.SetIs_focus();
3661 any_change = true;
3662 }
3663
3664 // merge subtypes
3665 if (add.IsSetSubtype()) {
3666 ITERATE(CBioSource::TSubtype, it, add.GetSubtype()) {
3667 CRef<CSubSource> a(new CSubSource());
3668 a->Assign(**it);
3669 src1.SetSubtype().push_back(a);
3670 }
3671 any_change = true;
3672 }
3673
3674 x_MergeDupOrgRefs(src1.SetOrg(), add.GetOrg());
3675
3676 if (s_SubSourceListUniqued(src1)) {
3677 any_change = true;
3678 }
3679
3680 return any_change;
3681 }
3682
3683
x_MergeDupOrgNames(COrgName & on1,const COrgName & add)3684 bool CCleanup::x_MergeDupOrgNames(COrgName& on1, const COrgName& add)
3685 {
3686 bool any_change = false;
3687
3688 // OrgMods
3689 if (add.IsSetMod()) {
3690 ITERATE(COrgName::TMod, it, add.GetMod()) {
3691 CRef<COrgMod> a(new COrgMod());
3692 a->Assign(**it);
3693 on1.SetMod().push_back(a);
3694 }
3695 any_change = true;
3696 }
3697
3698 // gcode
3699 if ((!on1.IsSetGcode() || on1.GetGcode() == 0) && add.IsSetGcode() && add.GetGcode() != 0) {
3700 on1.SetGcode(add.GetGcode());
3701 any_change = true;
3702 }
3703
3704 // mgcode
3705 if ((!on1.IsSetMgcode() || on1.GetMgcode() == 0) && add.IsSetMgcode() && add.GetMgcode() != 0) {
3706 on1.SetMgcode(add.GetMgcode());
3707 any_change = true;
3708 }
3709
3710 // lineage
3711 if (!on1.IsSetLineage() && add.IsSetLineage()) {
3712 on1.SetLineage(add.GetLineage());
3713 any_change = true;
3714 }
3715
3716 // div
3717 if (!on1.IsSetDiv() && add.IsSetDiv()) {
3718 on1.SetDiv(add.GetDiv());
3719 any_change = true;
3720 }
3721
3722 return any_change;
3723 }
3724
3725
HasMod(const COrg_ref & org,const string & mod)3726 bool HasMod(const COrg_ref& org, const string& mod)
3727 {
3728 if (!org.IsSetMod()) {
3729 return false;
3730 }
3731 ITERATE(COrg_ref::TMod, it, org.GetMod()) {
3732 if (NStr::Equal(*it, mod)) {
3733 return true;
3734 }
3735 }
3736 return false;
3737 }
3738
3739
x_MergeDupOrgRefs(COrg_ref & org1,const COrg_ref & add)3740 bool CCleanup::x_MergeDupOrgRefs(COrg_ref& org1, const COrg_ref& add)
3741 {
3742 bool any_change = false;
3743 // mods
3744 if (add.IsSetMod()) {
3745 ITERATE(COrg_ref::TMod, it, add.GetMod()) {
3746 if (!HasMod(org1, *it)) {
3747 org1.SetMod().push_back(*it);
3748 any_change = true;
3749 }
3750 }
3751 }
3752
3753 // dbxrefs
3754 if (add.IsSetDb()) {
3755 ITERATE(COrg_ref::TDb, it, add.GetDb()) {
3756 CRef<CDbtag> a(new CDbtag());
3757 a->Assign(**it);
3758 org1.SetDb().push_back(a);
3759 }
3760 any_change = true;
3761 }
3762
3763 // synonyms
3764 if (add.IsSetSyn()) {
3765 ITERATE(COrg_ref::TSyn, it, add.GetSyn()) {
3766 org1.SetSyn().push_back(*it);
3767 }
3768 any_change = true;
3769 }
3770
3771 if (add.IsSetOrgname()) {
3772 any_change |= x_MergeDupOrgNames(org1.SetOrgname(), add.GetOrgname());
3773 }
3774
3775 return any_change;
3776 }
3777
3778
MergeDupBioSources(CSeq_descr & seq_descr)3779 bool CCleanup::MergeDupBioSources(CSeq_descr & seq_descr)
3780 {
3781 bool any_change = false;
3782 CSeq_descr::Tdata::iterator src1 = seq_descr.Set().begin();
3783 while (src1 != seq_descr.Set().end()) {
3784 if ((*src1)->IsSource() && (*src1)->GetSource().IsSetOrg() && (*src1)->GetSource().GetOrg().IsSetTaxname()) {
3785 CSeq_descr::Tdata::iterator src2 = src1;
3786 ++src2;
3787 while (src2 != seq_descr.Set().end()) {
3788 if ((*src2)->IsSource() &&
3789 AreBioSourcesMergeable((*src1)->GetSource(), (*src2)->GetSource())) {
3790 MergeDupBioSources((*src1)->SetSource(), (*src2)->GetSource());
3791
3792 CRef<CCleanupChange> changes(makeCleanupChange(0));
3793 CNewCleanup_imp srcclean(changes, 0);
3794 srcclean.ExtendedCleanup((*src1)->SetSource());
3795 src2 = seq_descr.Set().erase(src2);
3796 any_change = true;
3797 } else {
3798 ++src2;
3799 }
3800 }
3801 }
3802 ++src1;
3803 }
3804 return any_change;
3805 }
3806
3807 /// Remove duplicate biosource descriptors
RemoveDupBioSource(CSeq_descr & descr)3808 bool CCleanup::RemoveDupBioSource(CSeq_descr& descr)
3809 {
3810 bool any_change = false;
3811 vector<CConstRef<CBioSource> > src_list;
3812 CSeq_descr::Tdata::iterator d = descr.Set().begin();
3813 while (d != descr.Set().end()) {
3814 if ((*d)->IsSource()) {
3815 bool found = false;
3816 ITERATE(vector<CConstRef<CBioSource> >, s, src_list) {
3817 if ((*d)->GetSource().Equals(**s)) {
3818 found = true;
3819 break;
3820 }
3821 }
3822 if (found) {
3823 d = descr.Set().erase(d);
3824 any_change = true;
3825 } else {
3826 CConstRef<CBioSource> src(&((*d)->GetSource()));
3827 src_list.push_back(src);
3828 ++d;
3829 }
3830 } else {
3831 ++d;
3832 }
3833 }
3834 return any_change;
3835 }
3836
3837
BioSrcFromFeat(const CSeq_feat & f)3838 CRef<CBioSource> CCleanup::BioSrcFromFeat(const CSeq_feat& f)
3839 {
3840 if (!f.IsSetData() || !f.GetData().IsBiosrc()) {
3841 return CRef<CBioSource>(NULL);
3842 }
3843 CRef<CBioSource> src(new CBioSource());
3844 src->Assign(f.GetData().GetBiosrc());
3845
3846 // move comment to subsource note
3847 if (f.IsSetComment()) {
3848 CRef<CSubSource> s(new CSubSource());
3849 s->SetSubtype(CSubSource::eSubtype_other);
3850 s->SetName(f.GetComment());
3851 src->SetSubtype().push_back(s);
3852
3853 }
3854
3855 // move dbxrefs on feature to source
3856 if (f.IsSetDbxref()) {
3857 ITERATE(CSeq_feat::TDbxref, it, f.GetDbxref()) {
3858 CRef<CDbtag> a(new CDbtag());
3859 a->Assign(**it);
3860 src->SetOrg().SetDb().push_back(a);
3861 }
3862 }
3863 CRef<CCleanupChange> changes(makeCleanupChange(0));
3864 CNewCleanup_imp srcclean(changes, 0);
3865 srcclean.ExtendedCleanup(*src);
3866
3867 return src;
3868 }
3869
3870
ConvertSrcFeatsToSrcDescs(CSeq_entry_Handle seh)3871 bool CCleanup::ConvertSrcFeatsToSrcDescs(CSeq_entry_Handle seh)
3872 {
3873 bool any_change = false;
3874 for (CBioseq_CI b(seh); b; ++b) {
3875 bool transgenic_or_focus = false;
3876 CSeqdesc_CI existing_src(*b, CSeqdesc::e_Source);
3877 while (existing_src && !transgenic_or_focus) {
3878 if (existing_src->GetSource().IsSetIs_focus() ||
3879 existing_src->GetSource().HasSubtype(CSubSource::eSubtype_transgenic)) {
3880 transgenic_or_focus = true;
3881 }
3882 ++existing_src;
3883 }
3884 if (transgenic_or_focus) {
3885 continue;
3886 }
3887 for (CFeat_CI p(*b, CSeqFeatData::e_Biosrc); p; ++p) {
3888 if (p->GetLocation().IsInt() &&
3889 p->GetLocation().GetStart(eExtreme_Biological) == 0 &&
3890 p->GetLocation().GetStop(eExtreme_Biological) == b->GetBioseqLength() - 1) {
3891 CRef<CSeqdesc> d(new CSeqdesc());
3892 d->SetSource().Assign(*(BioSrcFromFeat(*(p->GetSeq_feat()))));
3893
3894 // add descriptor to nuc-prot parent or sequence itself
3895 CBioseq_set_Handle parent = b->GetParentBioseq_set();
3896 if (parent && parent.IsSetClass() &&
3897 parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
3898 CBioseq_set_EditHandle eh(parent);
3899 eh.AddSeqdesc(*d);
3900 MergeDupBioSources(eh.SetDescr());
3901 RemoveDupBioSource(eh.SetDescr());
3902 NormalizeDescriptorOrder(eh.SetDescr());
3903 } else {
3904 CBioseq_EditHandle eh(*b);
3905 eh.AddSeqdesc(*d);
3906 MergeDupBioSources(eh.SetDescr());
3907 RemoveDupBioSource(eh.SetDescr());
3908 NormalizeDescriptorOrder(eh.SetDescr());
3909 }
3910
3911 // remove feature
3912 CSeq_feat_EditHandle feh(*p);
3913 feh.Remove();
3914
3915 any_change = true;
3916 }
3917 }
3918 }
3919 return any_change;
3920 }
3921
3922
3923
FixGeneXrefSkew(CSeq_entry_Handle seh)3924 bool CCleanup::FixGeneXrefSkew(CSeq_entry_Handle seh)
3925 {
3926 CFeat_CI fi(seh);
3927 size_t num_gene_locus = 0;
3928 size_t num_gene_locus_tag = 0;
3929 size_t num_gene_xref_locus = 0;
3930 size_t num_gene_xref_locus_tag = 0;
3931
3932 while (fi) {
3933 if (fi->GetData().IsGene()) {
3934 if (fi->GetData().GetGene().IsSetLocus()) {
3935 num_gene_locus++;
3936 }
3937 if (fi->GetData().GetGene().IsSetLocus_tag()) {
3938 num_gene_locus_tag++;
3939 }
3940 } else if (fi->IsSetXref()) {
3941 const CGene_ref* g = fi->GetGeneXref();
3942 if (g) {
3943 if (g->IsSetLocus()) {
3944 num_gene_xref_locus++;
3945 }
3946 if (g->IsSetLocus_tag()) {
3947 num_gene_xref_locus_tag++;
3948 }
3949 }
3950 }
3951 if (num_gene_locus > 0) {
3952 if (num_gene_locus_tag > 0) {
3953 return false;
3954 }
3955 if (num_gene_xref_locus > 0) {
3956 return false;
3957 }
3958 }
3959 if (num_gene_locus_tag > 0) {
3960 if (num_gene_locus > 0) {
3961 return false;
3962 }
3963 if (num_gene_xref_locus_tag > 0) {
3964 return false;
3965 }
3966 }
3967 ++fi;
3968 }
3969
3970 bool any_change = false;
3971 if (num_gene_locus == 0 && num_gene_locus_tag > 0) {
3972 if (num_gene_xref_locus > 0 && num_gene_xref_locus_tag == 0) {
3973 fi.Rewind();
3974 while (fi) {
3975 if (!fi->GetData().IsGene() && fi->GetGeneXref() != NULL) {
3976 bool this_change = false;
3977 CRef<CSeq_feat> new_f(new CSeq_feat());
3978 new_f->Assign(*(fi->GetSeq_feat()));
3979 NON_CONST_ITERATE(CSeq_feat::TXref, it, new_f->SetXref()) {
3980 if ((*it)->IsSetData() && (*it)->GetData().IsGene()
3981 && (*it)->GetData().GetGene().IsSetLocus()) {
3982 (*it)->SetData().SetGene().SetLocus_tag((*it)->GetData().GetGene().GetLocus());
3983 (*it)->SetData().SetGene().ResetLocus();
3984 this_change = true;
3985 }
3986 }
3987 if (this_change) {
3988 CSeq_feat_EditHandle eh(*fi);
3989 eh.Replace(*new_f);
3990 }
3991 }
3992 ++fi;
3993 }
3994 }
3995 } else if (num_gene_locus > 0 && num_gene_locus_tag == 0) {
3996 if (num_gene_xref_locus == 0 && num_gene_xref_locus_tag > 0) {
3997 fi.Rewind();
3998 while (fi) {
3999 if (!fi->GetData().IsGene() && fi->GetGeneXref() != NULL) {
4000 bool this_change = false;
4001 CRef<CSeq_feat> new_f(new CSeq_feat());
4002 new_f->Assign(*(fi->GetSeq_feat()));
4003 NON_CONST_ITERATE(CSeq_feat::TXref, it, new_f->SetXref()) {
4004 if ((*it)->IsSetData() && (*it)->GetData().IsGene()
4005 && (*it)->GetData().GetGene().IsSetLocus_tag()) {
4006 (*it)->SetData().SetGene().SetLocus((*it)->GetData().GetGene().GetLocus_tag());
4007 (*it)->SetData().SetGene().ResetLocus_tag();
4008 this_change = true;
4009 }
4010 }
4011 if (this_change) {
4012 CSeq_feat_EditHandle eh(*fi);
4013 eh.Replace(*new_f);
4014 any_change = true;
4015 }
4016 }
4017 ++fi;
4018 }
4019 }
4020 }
4021 return any_change;
4022 }
4023
4024
ShouldStripPubSerial(const CBioseq & bs)4025 bool CCleanup::ShouldStripPubSerial(const CBioseq& bs)
4026 {
4027 bool strip_serial = true;
4028 ITERATE(CBioseq::TId, id, bs.GetId()) {
4029 const CSeq_id& sid = **id;
4030 switch (sid.Which()) {
4031 case NCBI_SEQID(Genbank):
4032 case NCBI_SEQID(Tpg):
4033 {
4034 const CTextseq_id& tsid = *GET_FIELD(sid, Textseq_Id);
4035 if (FIELD_IS_SET(tsid, Accession)) {
4036 const string& acc = GET_FIELD(tsid, Accession);
4037 if (acc.length() == 6) {
4038 strip_serial = false;
4039 }
4040 }
4041 }
4042 break;
4043 case NCBI_SEQID(Embl):
4044 case NCBI_SEQID(Ddbj):
4045 strip_serial = false;
4046 break;
4047 case NCBI_SEQID(not_set):
4048 case NCBI_SEQID(Local):
4049 case NCBI_SEQID(Other):
4050 case NCBI_SEQID(General):
4051 break;
4052 case NCBI_SEQID(Gibbsq):
4053 case NCBI_SEQID(Gibbmt):
4054 case NCBI_SEQID(Pir):
4055 case NCBI_SEQID(Swissprot):
4056 case NCBI_SEQID(Patent):
4057 case NCBI_SEQID(Prf):
4058 case NCBI_SEQID(Pdb):
4059 case NCBI_SEQID(Gpipe):
4060 case NCBI_SEQID(Tpe):
4061 case NCBI_SEQID(Tpd):
4062 strip_serial = false;
4063 break;
4064 default:
4065 break;
4066 }
4067 }
4068 return strip_serial;
4069 }
4070
4071
RenormalizeNucProtSets(CSeq_entry_Handle seh)4072 bool CCleanup::RenormalizeNucProtSets(CSeq_entry_Handle seh)
4073 {
4074 bool change_made = false;
4075 CConstRef<CSeq_entry> entry = seh.GetCompleteSeq_entry();
4076 if (seh.IsSet() && seh.GetSet().IsSetClass() &&
4077 entry->GetSet().IsSetSeq_set()) {
4078 CBioseq_set::TClass set_class = seh.GetSet().GetClass();
4079 if (set_class == CBioseq_set::eClass_nuc_prot) {
4080 if (entry->GetSet().GetSeq_set().size() == 1 &&
4081 entry->GetSet().GetSeq_set().front()->IsSeq()) {
4082 CSeq_entry_EditHandle eh = seh.GetEditHandle();
4083 eh.ConvertSetToSeq();
4084 if (eh.GetSeq().IsSetDescr()) {
4085 RemoveUnseenTitles(eh.SetSeq());
4086 NormalizeDescriptorOrder(eh.SetSeq().SetDescr());
4087 }
4088 change_made = true;
4089 }
4090 } else if (set_class == CBioseq_set::eClass_genbank ||
4091 set_class == CBioseq_set::eClass_mut_set ||
4092 set_class == CBioseq_set::eClass_pop_set ||
4093 set_class == CBioseq_set::eClass_phy_set ||
4094 set_class == CBioseq_set::eClass_eco_set ||
4095 set_class == CBioseq_set::eClass_wgs_set ||
4096 set_class == CBioseq_set::eClass_gen_prod_set ||
4097 set_class == CBioseq_set::eClass_small_genome_set) {
4098 ITERATE(CBioseq_set::TSeq_set, s, entry->GetSet().GetSeq_set()) {
4099 CSeq_entry_Handle ch = seh.GetScope().GetSeq_entryHandle(**s);
4100 change_made |= RenormalizeNucProtSets(ch);
4101 }
4102 }
4103 }
4104 return change_made;
4105 }
4106
4107
DecodeXMLMarkChanged(std::string & str)4108 bool CCleanup::DecodeXMLMarkChanged(std::string & str)
4109 {
4110 // return false;
4111 bool change_made = false;
4112
4113 // This is more complex than you might initially think is necessary
4114 // because this needs to be as efficient as possible since it's
4115 // called on every single string in an object.
4116
4117 SIZE_TYPE amp = str.find('&');
4118 if( NPOS == amp ) {
4119 // Check for the common case of no replacements required
4120 return change_made;
4121 }
4122
4123 // transformations done by this function:
4124 const static struct {
4125 string src_word;
4126 string result_word;
4127 } transformations[] = {
4128 // all start with an implicit ampersand
4129 // and end with an implicit semi-colon
4130 { "amp", "&" },
4131 { "apos", "\'" },
4132 { "gt", ">" },
4133 { "lt", "<" },
4134 { "quot", "\"" },
4135 { "#13
", "" },
4136 { "#13;
", "" },
4137 { "#916", "Delta" },
4138 { "#945", "alpha" },
4139 { "#946", "beta" },
4140 { "#947", "gamma" },
4141 { "#952", "theta" },
4142 { "#955", "lambda" },
4143 { "#956", "mu" },
4144 { "#957", "nu" },
4145 { "#8201", "" },
4146 { "#8206", "" },
4147 { "#8242", "'" },
4148 { "#8594", "->" },
4149 { "#8722", "-" },
4150 { "#8710", "delta" },
4151 { "#64257", "fi" },
4152 { "#64258", "fl" },
4153 { "#65292", "," }
4154 };
4155
4156 // Collisions should be rare enough that the CFastMutex is
4157 // faster than recreating the searcher each time this function is called
4158 static CTextFsm<int> searcher;
4159 // set searcher's state, if not already done
4160 {
4161 // just in case of the tiny chance that two threads try to prime
4162 // the searcher at the same time.
4163 static CFastMutex searcher_mtx;
4164 CFastMutexGuard searcher_mtx_guard( searcher_mtx );
4165 if( ! searcher.IsPrimed() ) {
4166 for( int idx = 0;
4167 idx < sizeof(transformations)/sizeof(transformations[0]);
4168 ++idx )
4169 {
4170 // match type is index into transformations array
4171 searcher.AddWord( transformations[idx].src_word, idx );
4172 }
4173 searcher.Prime();
4174 }
4175 }
4176
4177 // a smart compiler probably won't need this manual optimization,
4178 // but just in case.
4179 const SIZE_TYPE str_len = str.length();
4180
4181 // fill result up to the first '&'
4182 string result;
4183 result.reserve( str_len );
4184 copy( str.begin(), str.begin() + amp,
4185 back_inserter(result) );
4186
4187 // at the start of each loop, the result is filled in
4188 // up to the ampersand (amp)
4189 while( amp != NPOS && amp < str_len ) {
4190
4191 // find out what the ampersand code represents
4192 // (if it represents anything)
4193 int state = searcher.GetInitialState();
4194 SIZE_TYPE search_pos = (amp + 1);
4195 if (str[search_pos] == ' ') {
4196 break;
4197 }
4198 for( ; search_pos < str_len ; ++search_pos ) {
4199 const char ch = str[search_pos];
4200 if( ch == ';' ) {
4201 break;
4202 }
4203 if( ch == '&' && state == 0 ) {
4204 --search_pos; // so we don't skip over the '&'
4205 state = searcher.GetInitialState(); // force "no-match"
4206 break;
4207 }
4208 state = searcher.GetNextState(state, ch);
4209 }
4210
4211 if( search_pos == str_len && searcher.IsMatchFound(state) ) {
4212 // copy the translation of the XML code:
4213 _ASSERT( searcher.GetMatches(state).size() == 1 );
4214 const int match_idx = searcher.GetMatches(state)[0];
4215 const string & result_word = transformations[match_idx].result_word;
4216 copy( result_word.begin(), result_word.end(),
4217 back_inserter(result) );
4218 change_made = true;
4219 break;
4220 }
4221
4222 if( search_pos >= str_len ) {
4223 // we reached the end without finding anything, so
4224 // copy the rest and break
4225 copy( str.begin() + amp, str.end(),
4226 back_inserter(result) );
4227 break;
4228 }
4229
4230 if( searcher.IsMatchFound(state) ) {
4231 // copy the translation of the XML code:
4232 _ASSERT( searcher.GetMatches(state).size() == 1 );
4233 const int match_idx = searcher.GetMatches(state)[0];
4234 const string & result_word = transformations[match_idx].result_word;
4235 copy( result_word.begin(), result_word.end(),
4236 back_inserter(result) );
4237 change_made = true;
4238 } else {
4239 // no match found, so copy the text we looked at
4240 // as-is
4241 copy( str.begin() + amp, str.begin() + search_pos + 1,
4242 back_inserter(result) );
4243 }
4244
4245 // find next_amp
4246 if( str[search_pos] == '&' ) {
4247 // special case that occurs when there are multiple '&' together
4248 ++search_pos;
4249 result += '&';
4250 }
4251 SIZE_TYPE next_amp = str.find('&', search_pos );
4252 if( NPOS == next_amp ) {
4253 // no more amps; copy the rest and break
4254 copy( str.begin() + search_pos + 1, str.end(),
4255 back_inserter(result) );
4256 break;
4257 }
4258
4259 // copy up to the next amp
4260 if( (search_pos + 1) < next_amp ) {
4261 copy( str.begin() + search_pos + 1, str.begin() + next_amp,
4262 back_inserter(result) );
4263 }
4264 amp = next_amp;
4265 }
4266
4267 if (change_made) {
4268 str = result;
4269 }
4270
4271 return change_made;
4272 }
4273
4274
GetProteinLocationFromNucleotideLocation(const CSeq_loc & nuc_loc,const CSeq_feat & cds,CScope & scope,bool require_inframe)4275 CRef<CSeq_loc> CCleanup::GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, const CSeq_feat& cds, CScope& scope, bool require_inframe)
4276 {
4277 if (require_inframe) {
4278 feature::ELocationInFrame is_in_frame = feature::IsLocationInFrame(scope.GetSeq_featHandle(cds), nuc_loc);
4279 bool is_ok = false;
4280 switch (is_in_frame) {
4281 case feature::eLocationInFrame_InFrame:
4282 is_ok = true;
4283 break;
4284 case feature::eLocationInFrame_BadStart:
4285 if (cds.GetLocation().GetStart(eExtreme_Biological) == nuc_loc.GetStart(eExtreme_Biological)) {
4286 is_ok = true;
4287 }
4288 break;
4289 case feature::eLocationInFrame_BadStop:
4290 if (cds.GetLocation().GetStop(eExtreme_Biological) == nuc_loc.GetStop(eExtreme_Biological)) {
4291 is_ok = true;
4292 }
4293 break;
4294 case feature::eLocationInFrame_BadStartAndStop:
4295 if (cds.GetLocation().GetStart(eExtreme_Biological) == nuc_loc.GetStart(eExtreme_Biological) &&
4296 cds.GetLocation().GetStop(eExtreme_Biological) == nuc_loc.GetStop(eExtreme_Biological)) {
4297 is_ok = true;
4298 }
4299 break;
4300 case feature::eLocationInFrame_NotIn:
4301 break;
4302 }
4303 if (!is_ok) {
4304 return CRef<CSeq_loc>(NULL);
4305 }
4306 }
4307 CRef<CSeq_loc> new_loc;
4308 CRef<CSeq_loc_Mapper> nuc2prot_mapper(
4309 new CSeq_loc_Mapper(cds, CSeq_loc_Mapper::eLocationToProduct, &scope));
4310 new_loc = nuc2prot_mapper->Map(nuc_loc);
4311 if (!new_loc) {
4312 return CRef<CSeq_loc>(NULL);
4313 }
4314
4315 const CSeq_id* sid = new_loc->GetId();
4316 const CSeq_id* orig_id = nuc_loc.GetId();
4317 if (!sid || (orig_id && sid->Equals(*orig_id))) {
4318 // unable to map to protein location
4319 return CRef<CSeq_loc>(NULL);
4320 }
4321
4322 new_loc->ResetStrand();
4323
4324 // if location includes stop codon, remove it
4325 CBioseq_Handle prot = scope.GetBioseqHandle(*sid);
4326 if (prot && new_loc->GetStop(objects::eExtreme_Positional) >= prot.GetBioseqLength())
4327 {
4328 CRef<CSeq_id> sub_id(new CSeq_id());
4329 sub_id->Assign(*sid);
4330 CSeq_loc sub(*sub_id, prot.GetBioseqLength(), new_loc->GetStop(objects::eExtreme_Positional), new_loc->GetStrand());
4331 new_loc = sequence::Seq_loc_Subtract(*new_loc, sub, CSeq_loc::fMerge_All | CSeq_loc::fSort, &scope);
4332 if (nuc_loc.IsPartialStop(eExtreme_Biological)) {
4333 new_loc->SetPartialStop(true, eExtreme_Biological);
4334 }
4335 }
4336
4337 if (!new_loc->IsInt() && !new_loc->IsPnt()) {
4338 CRef<CSeq_loc> tmp = sequence::Seq_loc_Merge(*new_loc, CSeq_loc::fMerge_All, &scope);
4339 new_loc = tmp;
4340 }
4341
4342 // fix partials if protein feature starts or ends at beginning or end of protein sequence
4343 if (!cds.GetLocation().IsPartialStart(eExtreme_Biological) &&
4344 new_loc->GetStart(eExtreme_Biological) == 0) {
4345 if (new_loc->IsPartialStart(eExtreme_Biological)) {
4346 new_loc->SetPartialStart(false, eExtreme_Biological);
4347 }
4348 }
4349 if (!cds.GetLocation().IsPartialStop(eExtreme_Biological) &&
4350 new_loc->GetStop(eExtreme_Biological) == prot.GetBioseqLength() - 1) {
4351 if (new_loc->IsPartialStop(eExtreme_Biological)) {
4352 new_loc->SetPartialStop(false, eExtreme_Biological);
4353 }
4354 }
4355
4356 return new_loc;
4357 }
4358
4359
GetProteinLocationFromNucleotideLocation(const CSeq_loc & nuc_loc,CScope & scope)4360 CRef<CSeq_loc> CCleanup::GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, CScope& scope)
4361 {
4362 CConstRef<CSeq_feat> cds = sequence::GetOverlappingCDS(nuc_loc, scope);
4363 if (!cds || !cds->IsSetProduct()) {
4364 // there is no overlapping coding region feature, so there is no appropriate
4365 // protein sequence to move to
4366 return CRef<CSeq_loc>(NULL);
4367 }
4368
4369 return GetProteinLocationFromNucleotideLocation(nuc_loc, *cds, scope);
4370 }
4371
4372
4373
RepackageProteins(const CSeq_feat & cds,CBioseq_set_Handle np)4374 bool CCleanup::RepackageProteins(const CSeq_feat& cds, CBioseq_set_Handle np)
4375 {
4376 if (!cds.IsSetProduct() || !cds.GetProduct().IsWhole()) {
4377 // no product, or product is specified weirdly
4378 return false;
4379 }
4380 CBioseq_Handle protein = np.GetTSE_Handle().GetBioseqHandle(cds.GetProduct().GetWhole());
4381 if (!protein) {
4382 // protein is not in the same TSE
4383 return false;
4384 }
4385 if (protein.GetParentBioseq_set() == np) {
4386 // already in the right set
4387 return false;
4388 }
4389 CBioseq_set_EditHandle eh(np);
4390 CSeq_entry_Handle ph = protein.GetSeq_entry_Handle();
4391 CSeq_entry_EditHandle peh(ph);
4392 eh.TakeEntry(peh);
4393 return true;
4394 }
4395
4396
RepackageProteins(CSeq_entry_Handle seh)4397 bool CCleanup::RepackageProteins(CSeq_entry_Handle seh)
4398 {
4399 bool changed = false;
4400 CSeq_entry_CI si(seh, CSeq_entry_CI::fRecursive | CSeq_entry_CI::fIncludeGivenEntry, CSeq_entry::e_Set);
4401 while (si) {
4402 CBioseq_set_Handle set = si->GetSet();
4403 if (set.IsSetClass() && set.GetClass() == CBioseq_set::eClass_nuc_prot && set.HasAnnots()) {
4404 ITERATE(CBioseq_set::TAnnot, annot_it, set.GetCompleteBioseq_set()->GetAnnot()) {
4405 if ((*annot_it)->IsSetData() && (*annot_it)->IsFtable()) {
4406 ITERATE(CSeq_annot::TData::TFtable, feat_it, (*annot_it)->GetData().GetFtable()) {
4407 if ((*feat_it)->IsSetData() && (*feat_it)->GetData().IsCdregion()) {
4408 changed |= RepackageProteins(**feat_it, set);
4409 }
4410 }
4411 }
4412 }
4413 }
4414 ++si;
4415 }
4416 return changed;
4417 }
4418
4419
ConvertDeltaSeqToRaw(CSeq_entry_Handle seh,CSeq_inst::EMol filter)4420 bool CCleanup::ConvertDeltaSeqToRaw(CSeq_entry_Handle seh, CSeq_inst::EMol filter)
4421 {
4422 bool any_change = false;
4423 for (CBioseq_CI bi(seh, filter); bi; ++bi) {
4424 CBioseq_Handle bsh = *bi;
4425 CRef<CSeq_inst> inst(new CSeq_inst());
4426 inst->Assign(bsh.GetInst());
4427 if (inst->ConvertDeltaToRaw()) {
4428 CBioseq_EditHandle beh(bsh);
4429 beh.SetInst(*inst);
4430 any_change = true;
4431 }
4432 }
4433 return any_change;
4434 }
4435
4436
ParseCodeBreak(const CSeq_feat & feat,CCdregion & cds,const CTempString & str,CScope & scope,IObjtoolsListener * pMessageListener)4437 bool CCleanup::ParseCodeBreak(const CSeq_feat& feat,
4438 CCdregion& cds,
4439 const CTempString& str,
4440 CScope& scope,
4441 IObjtoolsListener* pMessageListener)
4442 {
4443 if (str.empty() || !feat.IsSetLocation()) {
4444 return false;
4445 }
4446
4447 const CSeq_id* feat_loc_seq_id = feat.GetLocation().GetId();
4448 if (!feat_loc_seq_id) {
4449 return false;
4450 }
4451
4452 string::size_type aa_pos = NStr::Find(str, "aa:");
4453 string::size_type len = 0;
4454 string::size_type loc_pos, end_pos;
4455 char protein_letter = 'X';
4456 CRef<CSeq_loc> break_loc;
4457
4458 if (aa_pos == string::npos) {
4459 aa_pos = NStr::Find(str, ",");
4460 if (aa_pos != string::npos) {
4461 aa_pos = NStr::Find(str, ":", aa_pos);
4462 }
4463 if (aa_pos != string::npos) {
4464 aa_pos++;
4465 }
4466 } else {
4467 aa_pos += 3;
4468 }
4469
4470 if (aa_pos != string::npos) {
4471 while (aa_pos < str.length() && isspace(str[aa_pos])) {
4472 aa_pos++;
4473 }
4474 while (aa_pos + len < str.length() && isalpha(str[aa_pos + len])) {
4475 len++;
4476 }
4477 if (len != 0) {
4478 protein_letter = ValidAminoAcid(str.substr(aa_pos, len));
4479 }
4480 }
4481
4482 loc_pos = NStr::Find(str, "(pos:");
4483
4484 using TSubcode = CCleanupMessage::ESubcode;
4485 auto postMessage =
4486 [pMessageListener](string msg, TSubcode subcode) {
4487 pMessageListener->PutMessage(
4488 CCleanupMessage(msg, eDiag_Error, CCleanupMessage::ECode::eCodeBreak, subcode));
4489 };
4490
4491 if (loc_pos == string::npos) {
4492 if (pMessageListener) {
4493 string msg = "Unable to identify code-break location in '" + str + "'";
4494 postMessage(msg, TSubcode::eParseError);
4495 }
4496 return false;
4497 }
4498 loc_pos += 5;
4499 while (loc_pos < str.length() && isspace(str[loc_pos])) {
4500 loc_pos++;
4501 }
4502
4503 end_pos = NStr::Find(str, ",aa:", loc_pos);
4504 if (end_pos == NPOS) {
4505 end_pos = NStr::Find(str, ",", loc_pos);
4506 if (end_pos == NPOS) {
4507 end_pos = str.length();
4508 }
4509 }
4510
4511 string pos = NStr::TruncateSpaces_Unsafe(str.substr(loc_pos, end_pos - loc_pos));
4512
4513 // handle multi-interval positions by adding a join() around them
4514 if (pos.find_first_of(",") != string::npos) {
4515 pos = "join(" + pos + ")";
4516 }
4517
4518 break_loc = ReadLocFromText(pos, feat_loc_seq_id, &scope);
4519
4520 if (break_loc == NULL) {
4521 if (pMessageListener) {
4522 string msg = "Unable to extract code-break location from '" + str + "'";
4523 postMessage(msg, TSubcode::eParseError);
4524 }
4525 return false;
4526 }
4527
4528 if (break_loc->IsInt() && sequence::GetLength(*break_loc, &scope) > 3) {
4529 if (pMessageListener) {
4530 string msg = "code-break location exceeds 3 bases";
4531 postMessage(msg, TSubcode::eBadLocation);
4532 }
4533 return false;
4534 }
4535 if ((break_loc->IsInt() || break_loc->IsPnt()) &&
4536 sequence::Compare(*break_loc, feat.GetLocation(), &scope, sequence::fCompareOverlapping) != sequence::eContained) {
4537 if (pMessageListener) {
4538 string msg = "code-break location lies outside of coding region";
4539 postMessage(msg, TSubcode::eBadLocation);
4540 }
4541 return false;
4542 }
4543
4544 if (FIELD_IS_SET(feat.GetLocation(), Strand) && GET_FIELD(feat.GetLocation(), Strand) == eNa_strand_minus) {
4545 break_loc->SetStrand(GET_FIELD(feat.GetLocation(), Strand));
4546 } else {
4547 RESET_FIELD(*break_loc, Strand);
4548 }
4549
4550 // need to build code break object and add it to coding region
4551 CRef<CCode_break> newCodeBreak(new CCode_break());
4552 CCode_break::TAa& aa = newCodeBreak->SetAa();
4553 aa.SetNcbieaa(protein_letter);
4554 newCodeBreak->SetLoc(*break_loc);
4555
4556 CCdregion::TCode_break& orig_list = cds.SetCode_break();
4557 orig_list.push_back(newCodeBreak);
4558
4559 return true;
4560 }
4561
4562
ParseCodeBreaks(CSeq_feat & feat,CScope & scope)4563 bool CCleanup::ParseCodeBreaks(CSeq_feat& feat, CScope& scope)
4564 {
4565 if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
4566 !feat.IsSetQual() || !feat.IsSetLocation()) {
4567 return false;
4568 }
4569
4570 bool any_removed = false;
4571 CSeq_feat::TQual::iterator it = feat.SetQual().begin();
4572 while (it != feat.SetQual().end()) {
4573 if ((*it)->IsSetQual() &&
4574 NStr::EqualNocase((*it)->GetQual(), "transl_except") &&
4575 (*it)->IsSetVal() &&
4576 ParseCodeBreak(feat, feat.SetData().SetCdregion(), (*it)->GetVal(), scope)) {
4577 it = feat.SetQual().erase(it);
4578 any_removed = true;
4579 } else {
4580 ++it;
4581 }
4582 }
4583 if (feat.GetQual().size() == 0) {
4584 feat.ResetQual();
4585 }
4586 return any_removed;
4587 }
4588
4589
4590 // From SQD-4297
4591 // Influenza is a multi-segmented virus. We would like to create
4592 // small-genome sets when all segments of a particular viral strain
4593 // are submitted together. This is made more difficult due to fact
4594 // that submitters often have large submissions with multiple strains
4595 // at one time.
4596 // This function will segregate sequences with the same taxname
4597 // plus additional qualifiers into small-genome sets, if there are enough
4598 // sequences for that type of Influenza *AND* all CDS and gene features
4599 // on the sequences are complete.
4600 // * Influenza A virus: 8 or more nucleotide sequences with same strain and serotype
4601 // * Influenza B virus: 8 or more nucleotide sequences with same strain
4602 // * Influenza C virus: 7 or more nucleotide sequences with same strain
4603 // * Influenza D virus: 7 or more records with same strain
4604 // Note that as long as we are making strain-specific organism names,
4605 // the taxname must only start with the Influenza designation, not match it.
4606 // Can only make a set if at least one instance of each segment value is represented.
4607 class CInfluenzaSet : public CObject {
4608 public:
4609 CInfluenzaSet(const string& key);
~CInfluenzaSet()4610 ~CInfluenzaSet() {}
4611
4612 static string GetKey(const COrg_ref& org);
4613 bool OkToMakeSet() const;
4614 void MakeSet();
4615
4616 typedef enum {
4617 eNotInfluenza = 0,
4618 eInfluenzaA,
4619 eInfluenzaB,
4620 eInfluenzaC,
4621 eInfluenzaD
4622 } EInfluenzaType;
4623
4624 static EInfluenzaType GetInfluenzaType(const string& taxname);
4625
4626 void AddBioseq(CBioseq_Handle bsh);
4627
4628 protected:
4629 typedef vector<CBioseq_Handle> TMembers;
4630 TMembers m_Members;
4631 const string m_Key;
4632 EInfluenzaType m_FluType;
4633 size_t m_Required;
4634 };
4635
4636
CInfluenzaSet(const string & key)4637 CInfluenzaSet::CInfluenzaSet(const string& key) : m_Key(key)
4638 {
4639 m_FluType = GetInfluenzaType(key);
4640 m_Required = 7;
4641 if (m_FluType == eInfluenzaA || m_FluType == eInfluenzaB) {
4642 m_Required = 8;
4643 }
4644 }
4645
4646
GetInfluenzaType(const string & taxname)4647 CInfluenzaSet::EInfluenzaType CInfluenzaSet::GetInfluenzaType(const string& taxname)
4648 {
4649 if (NStr::StartsWith(taxname, "Influenza A virus", NStr::eNocase)) {
4650 return eInfluenzaA;
4651 } else if (NStr::StartsWith(taxname, "Influenza B virus", NStr::eNocase)) {
4652 return eInfluenzaB;
4653 } else if (NStr::StartsWith(taxname, "Influenza C virus", NStr::eNocase)) {
4654 return eInfluenzaC;
4655 } else if (NStr::StartsWith(taxname, "Influenza D virus", NStr::eNocase)) {
4656 return eInfluenzaD;
4657 } else {
4658 return eNotInfluenza;
4659 }
4660 }
4661
4662
GetKey(const COrg_ref & org)4663 string CInfluenzaSet::GetKey(const COrg_ref& org)
4664 {
4665 if (!org.IsSetTaxname() || !org.IsSetOrgname() || !org.GetOrgname().IsSetMod()) {
4666 return kEmptyStr;
4667 }
4668 EInfluenzaType flu_type = GetInfluenzaType(org.GetTaxname());
4669 if (flu_type == eNotInfluenza) {
4670 return kEmptyStr;
4671 }
4672
4673 CTempString strain = kEmptyStr;
4674 CTempString serotype = kEmptyStr;
4675
4676 ITERATE(COrgName::TMod, it, org.GetOrgname().GetMod()) {
4677 if ((*it)->IsSetSubtype() && (*it)->IsSetSubname()) {
4678 if ((*it)->GetSubtype() == COrgMod::eSubtype_strain) {
4679 strain = (*it)->GetSubname();
4680 } else if ((*it)->GetSubtype() == COrgMod::eSubtype_serotype &&
4681 flu_type == eInfluenzaA) {
4682 serotype = (*it)->GetSubname();
4683 }
4684 }
4685 }
4686 if(NStr::IsBlank(strain)) {
4687 return kEmptyStr;
4688 }
4689 if (flu_type == eInfluenzaA) {
4690 if (NStr::IsBlank(serotype)) {
4691 return kEmptyStr;
4692 } else {
4693 return org.GetTaxname() + ":" + strain + ":" + serotype;
4694 }
4695 } else {
4696 return org.GetTaxname() + ":" + strain;
4697 }
4698 }
4699
4700
AddBioseq(CBioseq_Handle bsh)4701 void CInfluenzaSet::AddBioseq(CBioseq_Handle bsh)
4702 {
4703 m_Members.push_back(bsh);
4704 }
4705
4706
OkToMakeSet() const4707 bool CInfluenzaSet::OkToMakeSet() const
4708 {
4709 if (m_Members.size() < m_Required) {
4710 return false;
4711 }
4712
4713 bool ok = true;
4714 bool* seg_found = new bool[m_Required];
4715 for (size_t i = 0; i < m_Required; i++) {
4716 seg_found[i] = false;
4717 }
4718
4719 ITERATE(TMembers, it, m_Members) {
4720 // check to make sure one of each segment is represented
4721 CSeqdesc_CI src(*it, CSeqdesc::e_Source);
4722 if (src->GetSource().IsSetSubtype()) {
4723 bool found_seg = false;
4724 ITERATE(CBioSource::TSubtype, s, src->GetSource().GetSubtype()) {
4725 if ((*s)->IsSetSubtype() && (*s)->IsSetName() &&
4726 (*s)->GetSubtype() == CSubSource::eSubtype_segment) {
4727 try {
4728 size_t seg = NStr::StringToSizet((*s)->GetName());
4729 if (seg < 1 || seg > m_Required) {
4730 ok = false;
4731 break;
4732 }
4733 seg_found[seg - 1] = true;
4734 found_seg = true;
4735 } catch (CException&) {
4736 ok = false;
4737 break;
4738 }
4739 }
4740 }
4741 if (!found_seg) {
4742 ok = false;
4743 }
4744 } else {
4745 ok = false;
4746 }
4747 if (!ok) {
4748 break;
4749 }
4750
4751 // make sure all coding regions and genes are complete
4752 SAnnotSelector sel;
4753 sel.IncludeFeatType(CSeqFeatData::e_Cdregion);
4754 sel.IncludeFeatType(CSeqFeatData::e_Gene);
4755 CFeat_CI f(*it, sel);
4756 while (f) {
4757 if (f->GetLocation().IsPartialStart(eExtreme_Biological) ||
4758 f->GetLocation().IsPartialStop(eExtreme_Biological)) {
4759 ok = false;
4760 break;
4761 }
4762 ++f;
4763 }
4764 if (!ok) break;
4765 }
4766
4767 if (ok) {
4768 for (size_t i = 0; i < m_Required; i++) {
4769 if (!seg_found[i]) {
4770 ok = false;
4771 break;
4772 }
4773 }
4774 }
4775 delete[] seg_found;
4776
4777 return ok;
4778 }
4779
4780
MakeSet()4781 void CInfluenzaSet::MakeSet()
4782 {
4783 if (m_Members.size() == 0) {
4784 return;
4785 }
4786 CBioseq_set_Handle parent = m_Members[0].GetParentBioseq_set();
4787 if (!parent) {
4788 return;
4789 }
4790 if (parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
4791 parent = parent.GetParentBioseq_set();
4792 }
4793 if (!parent) {
4794 return;
4795 }
4796 CSeq_entry_Handle peh = parent.GetParentEntry();
4797 CSeq_entry_EditHandle peeh(peh);
4798 CBioseq_set_EditHandle parent_edit(parent);
4799 CRef<CSeq_entry> ns(new CSeq_entry());
4800 ns->SetSet().SetClass(CBioseq_set::eClass_small_genome_set);
4801 CSeq_entry_EditHandle new_set = parent_edit.AttachEntry(*ns, -1);
4802 ITERATE(TMembers, it, m_Members) {
4803 CBioseq_set_Handle np = it->GetParentBioseq_set();
4804 if (np && np.IsSetClass() && np.GetClass() == CBioseq_set::eClass_nuc_prot) {
4805 CSeq_entry_Handle nps = np.GetParentEntry();
4806 CSeq_entry_EditHandle npse(nps);
4807 npse.Remove();
4808 new_set.AttachEntry(npse);
4809 } else {
4810 CSeq_entry_Handle s = it->GetParentEntry();
4811 CSeq_entry_EditHandle se(s);
4812 se.Remove();
4813 new_set.AttachEntry(se);
4814 }
4815 }
4816 }
4817
4818
4819 typedef map<string, CRef<CInfluenzaSet> > TInfluenzaSetMap;
4820
MakeSmallGenomeSet(CSeq_entry_Handle entry)4821 size_t CCleanup::MakeSmallGenomeSet(CSeq_entry_Handle entry)
4822 {
4823 TInfluenzaSetMap flu_map;
4824
4825 CBioseq_CI bi(entry, CSeq_inst::eMol_na);
4826 while (bi) {
4827 CSeqdesc_CI src(*bi, CSeqdesc::e_Source);
4828 if (src && src->GetSource().IsSetOrg()) {
4829 string key = CInfluenzaSet::GetKey(src->GetSource().GetOrg());
4830 if (!NStr::IsBlank(key)) {
4831 // add to set
4832 TInfluenzaSetMap::iterator it = flu_map.find(key);
4833 if (it == flu_map.end()) {
4834 CRef<CInfluenzaSet> new_set(new CInfluenzaSet(key));
4835 new_set->AddBioseq(*bi);
4836 flu_map[key] = new_set;
4837 } else {
4838 it->second->AddBioseq(*bi);
4839 }
4840 }
4841 }
4842 ++bi;
4843 }
4844 // now create sets
4845 size_t added = 0;
4846 NON_CONST_ITERATE(TInfluenzaSetMap, it, flu_map) {
4847 if (it->second->OkToMakeSet()) {
4848 it->second->MakeSet();
4849 added++;
4850 }
4851 }
4852
4853 return added;
4854 }
4855
4856
AddIRDMiscFeature(CBioseq_Handle bh,const CDbtag & tag)4857 void AddIRDMiscFeature(CBioseq_Handle bh, const CDbtag& tag)
4858 {
4859 CSeq_annot_Handle ftable;
4860
4861 CSeq_annot_CI annot_ci(bh);
4862 for (; annot_ci; ++annot_ci) {
4863 if ((*annot_ci).IsFtable()) {
4864 ftable = *annot_ci;
4865 break;
4866 }
4867 }
4868
4869 if (!ftable) {
4870 CBioseq_EditHandle beh = bh.GetEditHandle();
4871 CRef<CSeq_annot> new_annot(new CSeq_annot());
4872 ftable = beh.AttachAnnot(*new_annot);
4873 }
4874
4875 CSeq_annot_EditHandle aeh(ftable);
4876
4877 CRef<CSeq_feat> f(new CSeq_feat());
4878 f->SetData().SetImp().SetKey("misc_feature");
4879 f->SetLocation().SetInt().SetFrom(0);
4880 f->SetLocation().SetInt().SetTo(bh.GetBioseqLength() - 1);
4881 f->SetLocation().SetInt().SetId().Assign(*(bh.GetSeqId()));
4882 CRef<CDbtag> xref(new CDbtag());
4883 xref->Assign(tag);
4884 f->SetDbxref().push_back(xref);
4885 CRef<CSeqFeatXref> suppress(new CSeqFeatXref());
4886 suppress->SetData().SetGene();
4887 f->SetXref().push_back(suppress);
4888 aeh.AddFeat(*f);
4889 }
4890
4891
MakeIRDFeatsFromSourceXrefs(CSeq_entry_Handle entry)4892 bool CCleanup::MakeIRDFeatsFromSourceXrefs(CSeq_entry_Handle entry)
4893 {
4894 bool any = false;
4895 CBioseq_CI bi(entry, CSeq_inst::eMol_na);
4896 while (bi) {
4897 CSeqdesc_CI src(*bi, CSeqdesc::e_Source);
4898 while (src) {
4899 if (src->GetSource().IsSetOrg() && src->GetSource().GetOrg().IsSetDb()) {
4900 CRef<COrg_ref> org(const_cast<COrg_ref *>(&(src->GetSource().GetOrg())));
4901 COrg_ref::TDb::iterator db = org->SetDb().begin();
4902 while (db != org->SetDb().end()) {
4903 if ((*db)->IsSetDb() && NStr::Equal((*db)->GetDb(), "IRD")) {
4904 AddIRDMiscFeature(*bi, **db);
4905 db = org->SetDb().erase(db);
4906 any = true;
4907 } else {
4908 ++db;
4909 }
4910 }
4911 if (org->GetDb().size() == 0) {
4912 org->ResetDb();
4913 }
4914 }
4915 ++src;
4916 }
4917 ++bi;
4918 }
4919 return any;
4920 }
4921
4922 //LCOV_EXCL_START
4923 //not used by asn_cleanup but used by other applications
4924 const unsigned int methionine_encoded = 'M' - 'A';
4925
IsMethionine(const CCode_break & cb)4926 bool CCleanup::IsMethionine(const CCode_break& cb)
4927 {
4928 if (!cb.IsSetAa()) {
4929 return false;
4930 }
4931 bool rval = false;
4932 switch (cb.GetAa().Which()) {
4933 case CCode_break::TAa::e_Ncbi8aa:
4934 if (cb.GetAa().GetNcbi8aa() == methionine_encoded) {
4935 rval = true;
4936 }
4937 break;
4938 case CCode_break::TAa::e_Ncbieaa:
4939 if (cb.GetAa().GetNcbieaa() == 'M') {
4940 rval = true;
4941 }
4942 break;
4943 case CCode_break::TAa::e_Ncbistdaa:
4944 if (cb.GetAa().GetNcbistdaa() == methionine_encoded) {
4945 rval = true;
4946 }
4947 break;
4948 default:
4949 break;
4950 }
4951 return rval;
4952 }
4953 //LCOV_EXCL_STOP
4954
4955
4956 //LCOV_EXCL_START
4957 //not used by asn_cleanup but used by other applications
GetCodeBreakForLocation(size_t pos,const CSeq_feat & cds)4958 CConstRef<CCode_break> CCleanup::GetCodeBreakForLocation(size_t pos, const CSeq_feat& cds)
4959 {
4960 if (!cds.IsSetData() || !cds.GetData().IsCdregion() ||
4961 !cds.IsSetLocation() ||
4962 !cds.GetData().GetCdregion().IsSetCode_break()) {
4963 return CConstRef<CCode_break>(NULL);
4964 }
4965
4966 TSeqPos frame = 0;
4967 if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetFrame())
4968 {
4969 switch(cds.GetData().GetCdregion().GetFrame())
4970 {
4971 case CCdregion::eFrame_not_set :
4972 case CCdregion::eFrame_one : frame = 0; break;
4973 case CCdregion::eFrame_two : frame = 1; break;
4974 case CCdregion::eFrame_three : frame = 2; break;
4975 default : frame = 0; break;
4976 }
4977 }
4978
4979 for (auto cb : cds.GetData().GetCdregion().GetCode_break()) {
4980 if (cb->IsSetLoc()) {
4981 TSeqPos offset = sequence::LocationOffset(cds.GetLocation(),
4982 cb->GetLoc());
4983 if (offset >= frame &&
4984 ((offset - frame) / 3 ) + 1 == pos) {
4985 return cb;
4986 }
4987 }
4988 }
4989 return CConstRef<CCode_break>(NULL);
4990 }
4991 //LCOV_EXCL_STOP
4992
4993 //LCOV_EXCL_START
4994 //appears not to be used
SetCodeBreakLocation(CCode_break & cb,size_t pos,const CSeq_feat & cds)4995 void CCleanup::SetCodeBreakLocation(CCode_break& cb, size_t pos, const CSeq_feat& cds)
4996 {
4997 int start = static_cast<int>((pos-1)*3);
4998 //start -= 1;
4999 //start *= 3;
5000 int frame = 0;
5001 if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetFrame())
5002 {
5003 switch(cds.GetData().GetCdregion().GetFrame())
5004 {
5005 case CCdregion::eFrame_not_set :
5006 case CCdregion::eFrame_one : frame = 0; break;
5007 case CCdregion::eFrame_two : frame = 1; break;
5008 case CCdregion::eFrame_three : frame = 2; break;
5009 default : frame = 0; break;
5010 }
5011 }
5012 int frame_shift = (start - frame) % 3;
5013 if (frame_shift < 0) {
5014 frame_shift += 3;
5015 }
5016 if (frame_shift == 1)
5017 start += 2;
5018 else if (frame_shift == 2)
5019 start += 1;
5020
5021 int offset = 0;
5022 CRef<CSeq_loc> packed (new CSeq_loc());
5023 for (CSeq_loc_CI loc_iter(cds.GetLocation()); loc_iter; ++loc_iter) {
5024 int len = loc_iter.GetRange().GetLength();
5025 if (offset <= start && offset + len > start) {
5026 CRef<CSeq_interval> tmp(new CSeq_interval());
5027 tmp->SetId().Assign(loc_iter.GetSeq_id());
5028 if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
5029 tmp->SetStrand(eNa_strand_minus);
5030 tmp->SetTo(loc_iter.GetRange().GetTo() - (start - offset) );
5031 } else {
5032 tmp->SetFrom(loc_iter.GetRange().GetFrom() + start - offset);
5033 }
5034 if (offset <= start + 2 && offset + len > start + 2) {
5035 if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
5036 tmp->SetFrom(loc_iter.GetRange().GetTo() - (start - offset + 2) );
5037 } else {
5038 tmp->SetTo(loc_iter.GetRange().GetFrom() + start - offset + 2);
5039 }
5040 } else {
5041 if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
5042 tmp->SetFrom(loc_iter.GetRange().GetFrom());
5043 } else {
5044 tmp->SetTo(loc_iter.GetRange().GetTo());
5045 }
5046 }
5047 packed->SetPacked_int().Set().push_back(tmp);
5048 } else if (offset > start && offset <= start + 2) {
5049 // add new interval
5050 CRef<CSeq_interval> tmp (new CSeq_interval());
5051 tmp->SetId().Assign(loc_iter.GetSeq_id());
5052 if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
5053 tmp->SetStrand(eNa_strand_minus);
5054 tmp->SetTo(loc_iter.GetRange().GetTo());
5055 if (offset + len >= start + 2) {
5056 tmp->SetFrom(loc_iter.GetRange().GetTo() - (start - offset + 2) );
5057 } else {
5058 tmp->SetFrom(loc_iter.GetRange().GetFrom());
5059 }
5060 } else {
5061 tmp->SetFrom(loc_iter.GetRange().GetFrom());
5062 if (offset + len >= start + 2) {
5063 tmp->SetTo(loc_iter.GetRange().GetFrom() + start - offset + 2);
5064 } else {
5065 tmp->SetTo(loc_iter.GetRange().GetTo());
5066 }
5067 }
5068
5069 packed->SetPacked_int().Set().push_back(tmp);
5070 }
5071 offset += len;
5072 }
5073 if (packed->Which() != CSeq_loc::e_Packed_int || packed->GetPacked_int().Get().size() == 0) {
5074 cb.ResetLoc();
5075 }
5076 if (packed->GetPacked_int().Get().size() == 1) {
5077 cb.SetLoc().SetInt().Assign(*(packed->GetPacked_int().Get().front()));
5078 } else {
5079 cb.SetLoc(*packed);
5080 }
5081 }
5082 //LCOV_EXCL_STOP
5083
5084
5085 //LCOV_EXCL_START
5086 //not used by asn_cleanup but used by other applications
FixRNAEditingCodingRegion(CSeq_feat & cds)5087 bool CCleanup::FixRNAEditingCodingRegion(CSeq_feat& cds)
5088 {
5089 if (!cds.IsSetData() || !cds.GetData().IsCdregion()) {
5090 return false;
5091 }
5092 if (!cds.IsSetLocation() ||
5093 cds.GetLocation().IsPartialStart(eExtreme_Biological)) {
5094 return false;
5095 }
5096 CConstRef<CCode_break> cbstart = GetCodeBreakForLocation(1, cds);
5097 if (cbstart && !CCleanup::IsMethionine(*cbstart)) {
5098 // already have a start translation exception AND it is not methionine
5099 return false;
5100 }
5101
5102 bool any_change = false;
5103 if (!cds.IsSetExcept_text() || NStr::IsBlank(cds.GetExcept_text())) {
5104 cds.SetExcept_text("RNA editing");
5105 any_change = true;
5106 } else if (NStr::Find(cds.GetExcept_text(), "RNA editing") == string::npos) {
5107 cds.SetExcept_text(cds.GetExcept_text() + "; RNA editing");
5108 any_change = true;
5109 }
5110 if (!cds.IsSetExcept() || !cds.GetExcept()) {
5111 cds.SetExcept(true);
5112 any_change = true;
5113 }
5114 return any_change;
5115 }
5116 //LCOV_EXCL_STOP
5117
5118
5119 //LCOV_EXCL_START
5120 //not used by asn_cleanup but used by other applications
CleanupCollectionDates(CSeq_entry_Handle seh,bool month_first)5121 bool CCleanup::CleanupCollectionDates(CSeq_entry_Handle seh, bool month_first)
5122 {
5123 bool any_changes = false;
5124
5125 vector<CRef<COrg_ref> > rq_list;
5126 vector<const CSeqdesc* > src_descs;
5127 vector<CConstRef<CSeq_feat> > src_feats;
5128
5129 GetSourceDescriptors(*(seh.GetCompleteSeq_entry()), src_descs);
5130 vector<const CSeqdesc* >::iterator desc_it = src_descs.begin();
5131 while (desc_it != src_descs.end()) {
5132 if ((*desc_it)->GetSource().IsSetSubtype()) {
5133 CSeqdesc* desc = const_cast<CSeqdesc*>(*desc_it);
5134 for (auto s : desc->SetSource().SetSubtype()) {
5135 if (s->IsSetSubtype() && s->GetSubtype() == CSubSource::eSubtype_collection_date
5136 && s->IsSetName()) {
5137 bool month_ambiguous = false;
5138 string new_date = CSubSource::FixDateFormat(s->GetName(), month_first, month_ambiguous);
5139 if (!NStr::Equal(new_date, s->GetName())) {
5140 s->SetName(new_date);
5141 any_changes = true;
5142 }
5143 }
5144 }
5145 }
5146 ++desc_it;
5147 }
5148
5149 CFeat_CI feat(seh, SAnnotSelector(CSeqFeatData::e_Biosrc));
5150 while (feat) {
5151 if (feat->GetData().GetBiosrc().IsSetSubtype()) {
5152 CRef<CSeq_feat> new_feat(new CSeq_feat());
5153 new_feat->Assign(*(feat->GetOriginalSeq_feat()));
5154 bool local_change = false;
5155 for (auto s : new_feat->SetData().SetBiosrc().SetSubtype()) {
5156 if (s->IsSetSubtype() && s->GetSubtype() == CSubSource::eSubtype_collection_date
5157 && s->IsSetName()) {
5158 bool month_ambiguous = false;
5159 string new_date = CSubSource::FixDateFormat(s->GetName(), month_first, month_ambiguous);
5160 if (!NStr::Equal(new_date, s->GetName())) {
5161 s->SetName(new_date);
5162 local_change = true;
5163 }
5164 }
5165 }
5166 if (local_change) {
5167 any_changes = true;
5168 CSeq_feat_EditHandle efh(*feat);
5169 efh.Replace(*new_feat);
5170 }
5171 ++feat;
5172 }
5173 }
5174
5175 return any_changes;
5176 }
5177 //LCOV_EXCL_STOP
5178
5179
AutodefId(CSeq_entry_Handle seh)5180 void CCleanup::AutodefId(CSeq_entry_Handle seh)
5181 {
5182 // remove existing options (TODO)
5183 for (CBioseq_CI b(seh); b; ++b) {
5184 bool removed = true;
5185 while (removed) {
5186 removed = false;
5187 CSeqdesc_CI ud(*b, CSeqdesc::e_User);
5188 while (ud) {
5189 if (ud->GetUser().IsAutodefOptions()) {
5190 CSeq_entry_Handle s = ud.GetSeq_entry_Handle();
5191 CSeq_entry_EditHandle se = s.GetEditHandle();
5192 se.RemoveSeqdesc(*ud);
5193 removed = true;
5194 break;
5195 }
5196 ++ud;
5197 }
5198 }
5199 }
5200
5201 // create new options
5202 CRef<CUser_object> auto_user = CAutoDef::CreateIDOptions(seh);
5203 CRef<CSeqdesc> d(new CSeqdesc());
5204 d->SetUser().Assign(*auto_user);
5205 CSeq_entry_EditHandle eh = seh.GetEditHandle();
5206 eh.AddSeqdesc(*d);
5207
5208 CAutoDef::RegenerateSequenceDefLines(seh);
5209 }
5210
5211 END_SCOPE(objects)
5212 END_NCBI_SCOPE
5213