1 /*  $Id: autodef.cpp 629259 2021-04-13 13:28:40Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Colleen Bollin
27 *
28 * File Description:
29 *   Generate unique definition lines for a set of sequences using organism
30 *   descriptions and feature clauses.
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <objmgr/util/autodef.hpp>
35 #include <corelib/ncbimisc.hpp>
36 #include <objmgr/seqdesc_ci.hpp>
37 #include <objmgr/bioseq_ci.hpp>
38 #include <objmgr/util/feature.hpp>
39 #include <objmgr/util/sequence.hpp>
40 #include <objmgr/util/seq_loc_util.hpp>
41 
42 #include <objects/seq/Seq_descr.hpp>
43 #include <objects/seq/Seqdesc.hpp>
44 #include <objects/seq/Bioseq.hpp>
45 #include <objects/seqblock/GB_block.hpp>
46 #include <objects/seqfeat/RNA_ref.hpp>
47 #include <objects/seqfeat/RNA_gen.hpp>
48 #include <objects/seqfeat/BioSource.hpp>
49 #include <objects/seqfeat/Org_ref.hpp>
50 #include <objects/general/Dbtag.hpp>
51 
52 #include <serial/iterator.hpp>
53 
54 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)55 BEGIN_SCOPE(objects)
56 
57 
58 CAutoDef::CAutoDef()
59     : m_Cancelled(false)
60 {
61 }
62 
63 
~CAutoDef()64 CAutoDef::~CAutoDef()
65 {
66 }
67 
68 
s_NeedFeatureClause(const CBioseq & b)69 bool s_NeedFeatureClause(const CBioseq& b)
70 {
71     if (!b.IsSetAnnot()) {
72         return true;
73     }
74     size_t num_features = 0;
75 
76     ITERATE(CBioseq::TAnnot, a, b.GetAnnot()) {
77         if ((*a)->IsFtable()) {
78             num_features += (*a)->GetData().GetFtable().size();
79             if (num_features > 100) {
80                 break;
81             }
82         }
83     }
84     if (num_features < 100) {
85         return true;
86     } else {
87         return false;
88     }
89 }
90 
AddSources(CSeq_entry_Handle se)91 void CAutoDef::AddSources (CSeq_entry_Handle se)
92 {
93 
94     // add sources to modifier combination groups
95     CBioseq_CI seq_iter(se, CSeq_inst::eMol_na);
96     for ( ; seq_iter; ++seq_iter ) {
97         CSeqdesc_CI dit((*seq_iter), CSeqdesc::e_Source);
98         if (dit) {
99             string feature_clauses = s_NeedFeatureClause(*(seq_iter->GetCompleteBioseq())) ? x_GetFeatureClauses(*seq_iter) : kEmptyStr;
100             const CBioSource& bsrc = dit->GetSource();
101             m_OrigModCombo.AddSource(bsrc, feature_clauses);
102         }
103     }
104 
105     // set default exclude_sp values
106     m_OrigModCombo.SetExcludeSpOrgs (m_OrigModCombo.GetDefaultExcludeSp());
107 }
108 
109 
x_SortModifierListByRank(TModifierIndexVector & index_list,CAutoDefSourceDescription::TAvailableModifierVector & modifier_list)110 void CAutoDef::x_SortModifierListByRank(TModifierIndexVector &index_list, CAutoDefSourceDescription::TAvailableModifierVector &modifier_list)
111 {
112     unsigned int k, j, tmp;
113     if (index_list.size() < 2) {
114         return;
115     }
116     for (k = 0; k < index_list.size() - 1; k++) {
117         for (j = k + 1; j < index_list.size(); j++) {
118             if (modifier_list[index_list[k]].GetRank() > modifier_list[index_list[j]].GetRank()) {
119                  tmp = index_list[k];
120                  index_list[k] = index_list[j];
121                  index_list[j] = tmp;
122              }
123          }
124      }
125 }
126 
127 
x_GetModifierIndexList(TModifierIndexVector & index_list,CAutoDefSourceDescription::TAvailableModifierVector & modifier_list)128 void CAutoDef::x_GetModifierIndexList(TModifierIndexVector &index_list, CAutoDefSourceDescription::TAvailableModifierVector &modifier_list)
129 {
130     unsigned int k;
131     TModifierIndexVector remaining_list;
132 
133     index_list.clear();
134     remaining_list.clear();
135 
136     // note - required modifiers should be removed from the list
137 
138     // first, look for all_present and all_unique modifiers
139     for (k = 0; k < modifier_list.size(); k++) {
140         if (modifier_list[k].AllPresent() && modifier_list[k].AllUnique()) {
141             index_list.push_back(k);
142         } else if (modifier_list[k].AnyPresent()) {
143             remaining_list.push_back(k);
144         }
145     }
146     x_SortModifierListByRank(index_list, modifier_list);
147     x_SortModifierListByRank(remaining_list, modifier_list);
148 
149     for (k = 0; k < remaining_list.size(); k++) {
150         index_list.push_back(remaining_list[k]);
151     }
152 }
153 
154 
x_IsOrgModRequired(unsigned int mod_type)155 bool CAutoDef::x_IsOrgModRequired(unsigned int mod_type)
156 {
157     return false;
158 }
159 
160 
x_IsSubSrcRequired(unsigned int mod_type)161 bool CAutoDef::x_IsSubSrcRequired(unsigned int mod_type)
162 {
163     if (mod_type == CSubSource::eSubtype_endogenous_virus_name
164         || mod_type == CSubSource::eSubtype_plasmid_name
165         || mod_type == CSubSource::eSubtype_transgenic) {
166         return true;
167     } else {
168         return false;
169     }
170 }
171 
172 
GetNumAvailableModifiers()173 unsigned int CAutoDef::GetNumAvailableModifiers()
174 {
175     CAutoDefSourceDescription::TAvailableModifierVector modifier_list;
176     modifier_list.clear();
177     m_OrigModCombo.GetAvailableModifiers (modifier_list);
178 
179     unsigned int num_present = 0;
180     for (unsigned int k = 0; k < modifier_list.size(); k++) {
181         if (modifier_list[k].AnyPresent()) {
182             num_present++;
183         }
184     }
185     return num_present;
186 }
187 
188 
189 struct SAutoDefModifierComboSort {
operator ()SAutoDefModifierComboSort190     bool operator()(const CRef<CAutoDefModifierCombo>& s1,
191                     const CRef<CAutoDefModifierCombo>& s2) const
192     {
193         return (*s1 < *s2);
194     }
195 };
196 
197 
198 
FindBestModifierCombo()199 CRef<CAutoDefModifierCombo> CAutoDef::FindBestModifierCombo()
200 {
201     TModifierComboVector  combo_list;
202 
203     combo_list.clear();
204     combo_list.emplace_back (new CAutoDefModifierCombo(&m_OrigModCombo));
205 
206 
207     TModifierComboVector tmp, add_list;
208     TModifierComboVector::iterator it;
209     CAutoDefSourceDescription::TModifierVector mod_list;
210     bool stop = false;
211     unsigned int  k;
212 
213     mod_list.clear();
214 
215     if (combo_list[0]->GetMaxInGroup() == 1) {
216         stop = true;
217     }
218 
219     while (!stop) {
220         stop = true;
221         it = combo_list.begin();
222         add_list.clear();
223         while (it != combo_list.end()) {
224             tmp = (*it)->ExpandByAnyPresent ();
225             if (!tmp.empty()) {
226                 stop = false;
227                 for (k = 0; k < tmp.size(); k++) {
228                     add_list.emplace_back (new CAutoDefModifierCombo(tmp[k]));
229                 }
230                 it = combo_list.erase (it);
231             } else {
232                 ++it;
233             }
234             tmp.clear();
235         }
236         for (k = 0; k < add_list.size(); k++) {
237             combo_list.emplace_back (new CAutoDefModifierCombo(add_list[k]));
238         }
239         add_list.clear();
240         std::sort (combo_list.begin(), combo_list.end(), SAutoDefModifierComboSort());
241         if (combo_list[0]->GetMaxInGroup() == 1) {
242             stop = true;
243         }
244     }
245 
246     ITERATE (CAutoDefSourceDescription::TModifierVector, it, combo_list[0]->GetModifiers()) {
247         mod_list.push_back (CAutoDefSourceModifierInfo(*it));
248     }
249 
250     return combo_list[0];
251 }
252 
253 
GetAllModifierCombo()254 CAutoDefModifierCombo* CAutoDef::GetAllModifierCombo()
255 {
256     CAutoDefModifierCombo *newm = new CAutoDefModifierCombo(&m_OrigModCombo);
257 
258     // set all modifiers in combo
259     CAutoDefSourceDescription::TAvailableModifierVector modifier_list;
260 
261     // first, get the list of modifiers that are available
262     modifier_list.clear();
263     newm->GetAvailableModifiers (modifier_list);
264 
265     // add any modifier not already in the combo to the combo
266     for (unsigned int k = 0; k < modifier_list.size(); k++) {
267         if (modifier_list[k].AnyPresent()) {
268             if (modifier_list[k].IsOrgMod()) {
269                 COrgMod::ESubtype subtype = modifier_list[k].GetOrgModType();
270                 if (!newm->HasOrgMod(subtype)) {
271                     newm->AddOrgMod(subtype);
272                 }
273             } else {
274                 CSubSource::ESubtype subtype = modifier_list[k].GetSubSourceType();
275                 if (!newm->HasSubSource(subtype)) {
276                     newm->AddSubsource(subtype);
277                 }
278             }
279         }
280     }
281     return newm;
282 }
283 
284 
GetEmptyCombo()285 CAutoDefModifierCombo* CAutoDef::GetEmptyCombo()
286 {
287     CAutoDefModifierCombo *newm = new CAutoDefModifierCombo(&m_OrigModCombo);
288 
289     return newm;
290 }
291 
292 
GetOneSourceDescription(const CBioseq_Handle & bh)293 string CAutoDef::GetOneSourceDescription(const CBioseq_Handle& bh)
294 {
295     CRef<CAutoDefModifierCombo> best = FindBestModifierCombo();
296     if (best == NULL) {
297         return "";
298     }
299 
300     for (CSeqdesc_CI dit(bh, CSeqdesc::e_Source); dit;  ++dit) {
301         const CBioSource& bsrc = dit->GetSource();
302         return best->GetSourceDescriptionString(bsrc);
303     }
304     return "";
305 }
306 
307 
x_RemoveOptionalFeatures(CAutoDefFeatureClause_Base * main_clause,const CBioseq_Handle & bh)308 void CAutoDef::x_RemoveOptionalFeatures(CAutoDefFeatureClause_Base *main_clause, const CBioseq_Handle& bh)
309 {
310     // remove optional features that have not been requested
311     if (main_clause == NULL) {
312         return;
313     }
314 
315     // keep 5' UTRs only if lonely or requested
316     if (!m_Options.GetKeep5UTRs() && !main_clause->IsFeatureTypeLonely(CSeqFeatData::eSubtype_5UTR)) {
317         main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_5UTR);
318     }
319 
320     // keep 3' UTRs only if lonely or requested
321     if (!m_Options.GetKeep3UTRs() && !main_clause->IsFeatureTypeLonely(CSeqFeatData::eSubtype_3UTR)) {
322         main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_3UTR);
323     }
324 
325     // keep LTRs only if requested or lonely and not in parent
326     if (!m_Options.GetKeepLTRs() && !m_Options.GetKeepRepeatRegion() &&
327         !main_clause->IsFeatureTypeLonely(CSeqFeatData::eSubtype_LTR)) {
328         main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_LTR);
329     }
330 
331     // keep promoters only if requested or lonely and not in mRNA
332     if (!m_Options.GetKeepRegulatoryFeatures()) {
333         if (m_Options.GetUseFakePromoters()) {
334             // promoters are requested, remove all regulatory features except promoters
335             main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_regulatory, true);
336         } else {
337             bool lonely = main_clause->IsFeatureTypeLonely(CSeqFeatData::eSubtype_regulatory);
338             if (lonely) {
339                 // remove regulatory features, including promoters, only in mRNA sequences
340                 main_clause->RemoveFeaturesInmRNAsByType(CSeqFeatData::eSubtype_regulatory, false);
341                 // remove regulatory features other than promoters everywhere else
342                 main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_regulatory, true);
343             } else {
344                 // remove all regulatory features
345                 main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_regulatory, false);
346             }
347         }
348     }
349 
350     // keep introns only if requested or lonely and not in mRNA
351     if (!m_Options.GetKeepIntrons()) {
352         if (!main_clause->IsFeatureTypeLonely(CSeqFeatData::eSubtype_intron)) {
353             main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_intron);
354         } else {
355             main_clause->RemoveFeaturesInmRNAsByType(CSeqFeatData::eSubtype_intron);
356         }
357     }
358 
359     // keep exons only if requested or lonely or in mRNA or in partial CDS or on segment
360     if (!m_Options.GetKeepExons() && !IsSegment(bh)) {
361         if (main_clause->GetMainFeatureSubtype() != CSeqFeatData::eSubtype_exon) {
362             main_clause->RemoveUnwantedExons();
363         }
364     }
365 
366     // only keep bioseq precursor RNAs if lonely or requested
367     if (!main_clause->IsBioseqPrecursorRNA() && !m_Options.GetKeepPrecursorRNA()) {
368         main_clause->RemoveBioseqPrecursorRNAs();
369     }
370 
371     // keep uORFs if lonely or requested
372     if (!m_Options.GetKeepuORFs() && main_clause->GetNumSubclauses() > 1) {
373         main_clause->RemoveuORFs();
374     }
375 
376     // remove "optional" mobile element features unless lonely or requested
377     if (!m_Options.GetKeepMobileElements() && main_clause->GetNumSubclauses() > 1) {
378         main_clause->RemoveOptionalMobileElements();
379     }
380 
381     // keep misc_recombs only if requested
382     if (!m_Options.GetKeepMiscRecomb()) {
383         main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_misc_recomb);
384     }
385 
386     // delete subclauses at end, so that loneliness calculations will be correct
387     main_clause->RemoveDeletedSubclauses();
388 }
389 
390 
x_IsFeatureSuppressed(CSeqFeatData::ESubtype subtype)391 bool CAutoDef::x_IsFeatureSuppressed(CSeqFeatData::ESubtype subtype)
392 {
393     return m_Options.IsFeatureSuppressed(subtype);
394 }
395 
396 
SuppressFeature(const objects::CFeatListItem & feat)397 void CAutoDef::SuppressFeature(const objects::CFeatListItem& feat)
398 {
399     if (feat.GetType() == CSeqFeatData::e_not_set) {
400         m_Options.SuppressAllFeatures();
401     } else {
402         m_Options.SuppressFeature((CSeqFeatData::ESubtype)(feat.GetSubtype()));
403     }
404 }
405 
406 
SuppressFeature(objects::CSeqFeatData::ESubtype subtype)407 void CAutoDef::SuppressFeature(objects::CSeqFeatData::ESubtype subtype)
408 {
409     m_Options.SuppressFeature(subtype);
410 }
411 
412 
IsSegment(const CBioseq_Handle & bh)413 bool CAutoDef::IsSegment(const CBioseq_Handle& bh)
414 {
415     CSeq_entry_Handle seh = bh.GetParentEntry();
416 
417     seh = seh.GetParentEntry();
418 
419     if (seh && seh.IsSet()) {
420         CBioseq_set_Handle bsh = seh.GetSet();
421         if (bsh.CanGetClass() && bsh.GetClass() == CBioseq_set::eClass_parts) {
422             return true;
423         }
424     }
425     return false;
426 }
427 
428 
GetMasterLocation(CBioseq_Handle & bh,CRange<TSeqPos> & range)429 void CAutoDef::GetMasterLocation(CBioseq_Handle &bh, CRange<TSeqPos>& range)
430 {
431     CSeq_entry_Handle seh = bh.GetParentEntry();
432     CBioseq_Handle    master = bh;
433     unsigned int      start = 0, stop = bh.GetBioseqLength() - 1;
434     unsigned int      offset = 0;
435 
436     seh = seh.GetParentEntry();
437 
438     if (seh && seh.IsSet()) {
439         CBioseq_set_Handle bsh = seh.GetSet();
440         if (bsh.CanGetClass() && bsh.GetClass() == CBioseq_set::eClass_parts) {
441             seh = seh.GetParentEntry();
442             if (seh.IsSet()) {
443                 bsh = seh.GetSet();
444                 if (bsh.CanGetClass() && bsh.GetClass() == CBioseq_set::eClass_segset) {
445                     CBioseq_CI seq_iter(seh);
446                     for ( ; seq_iter; ++seq_iter ) {
447                         if (seq_iter->CanGetInst_Repr()) {
448                             if (seq_iter->GetInst_Repr() == CSeq_inst::eRepr_seg) {
449                                 master = *seq_iter;
450                             } else if (seq_iter->GetInst_Repr() == CSeq_inst::eRepr_raw) {
451                                 if (*seq_iter == bh) {
452                                     start = offset;
453                                     stop = offset + bh.GetBioseqLength() - 1;
454                                 } else {
455                                     offset += seq_iter->GetBioseqLength();
456                                 }
457                             }
458                         }
459                     }
460                 }
461             }
462         }
463     }
464     bh = master;
465     range.SetFrom(start);
466     range.SetTo(stop);
467 }
468 
469 
x_Is5SList(CFeat_CI feat_ci)470 bool CAutoDef::x_Is5SList(CFeat_CI feat_ci)
471 {
472     bool is_list = true;
473     bool is_single = true;
474     bool found_single = false;
475 
476     if (!feat_ci) {
477         return false;
478     }
479     ++feat_ci;
480     if (feat_ci) {
481         is_single = false;
482     }
483     feat_ci.Rewind();
484 
485     while (feat_ci && is_list) {
486         if (feat_ci->GetData().GetSubtype() == CSeqFeatData::eSubtype_rRNA) {
487             if (!feat_ci->GetData().GetRna().IsSetExt()
488                 || !feat_ci->GetData().GetRna().GetExt().IsName()
489                 || !NStr::Equal(feat_ci->GetData().GetRna().GetExt().GetName(), "5S ribosomal RNA")) {
490                 is_list = false;
491             }
492         } else if (feat_ci->GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature) {
493             if (!feat_ci->IsSetComment()) {
494                 is_list = false;
495             } else if (NStr::Equal(feat_ci->GetComment(), "contains 5S ribosomal RNA and nontranscribed spacer")) {
496                 found_single = true;
497             } else if (!NStr::Equal(feat_ci->GetComment(), "nontranscribed spacer")) {
498                 is_list = false;
499             }
500         } else {
501             is_list = false;
502         }
503         ++feat_ci;
504     }
505     if (is_single && !found_single) {
506         is_list = false;
507     }
508     feat_ci.Rewind();
509     return is_list;
510 }
511 
512 
x_IsSingleMiscFeat(CFeat_CI feat_ci)513 bool CAutoDef::x_IsSingleMiscFeat(CFeat_CI feat_ci)
514 {
515     if (!feat_ci ||
516         feat_ci->GetData().GetSubtype() != CSeqFeatData::eSubtype_misc_feature ||
517         !feat_ci->IsSetComment()) {
518         return false;
519     }
520     bool is_single = true;
521     ++feat_ci;
522     if (feat_ci) {
523         is_single = false;
524     }
525     feat_ci.Rewind();
526     return is_single;
527 }
528 
529 
s_HasPromoter(CBioseq_Handle bh)530 bool s_HasPromoter(CBioseq_Handle bh)
531 {
532     bool has_promoter = false;
533     SAnnotSelector sel(CSeqFeatData::eSubtype_regulatory);
534     CFeat_CI f_ci (bh, sel);
535     while (f_ci && !has_promoter) {
536         has_promoter = CAutoDefFeatureClause::IsPromoter(*(f_ci->GetSeq_feat()));
537         ++f_ci;
538     }
539     return has_promoter;
540 }
541 
542 
x_GetFeatureClauses(const CBioseq_Handle & bh)543 string CAutoDef::x_GetFeatureClauses(const CBioseq_Handle& bh)
544 {
545     const string& custom = m_Options.GetCustomFeatureClause();
546     if (!NStr::IsBlank(custom)) {
547         return custom;
548     }
549 
550     CSeqdesc_CI d(bh, CSeqdesc::e_User);
551     while (d) {
552         if (x_IsHumanSTR(d->GetUser())) {
553             return x_GetHumanSTRFeatureClauses(bh, d->GetUser());
554         }
555         ++d;
556     }
557 
558 
559     CAutoDefFeatureClause_Base main_clause(m_Options);
560     CRange<TSeqPos> range;
561     CBioseq_Handle master_bh = bh;
562 
563     GetMasterLocation(master_bh, range);
564 
565     // if no promoter, and fake promoters are requested, create one
566     if (m_Options.GetUseFakePromoters() && !s_HasPromoter(bh)) {
567         CRef<CSeq_feat> fake_promoter(new CSeq_feat());
568         CRef<CSeq_loc> fake_promoter_loc(new CSeq_loc());
569         const CSeq_id* id = FindBestChoice(bh.GetBioseqCore()->GetId(), CSeq_id::BestRank);
570         CRef <CSeq_id> new_id(new CSeq_id);
571         new_id->Assign(*id);
572         fake_promoter_loc->SetInt().SetId(*new_id);
573         fake_promoter_loc->SetInt().SetFrom(0);
574         fake_promoter_loc->SetInt().SetTo(bh.GetInst_Length() - 1);
575 
576         fake_promoter->SetLocation(*fake_promoter_loc);
577 
578         main_clause.AddSubclause (CRef<CAutoDefFeatureClause>(new CAutoDefFakePromoterClause (master_bh,
579                                                                     *fake_promoter,
580                                                                     *fake_promoter_loc,
581                                                                     m_Options)));
582     }
583 
584     // now create clauses for real features
585     CFeat_CI feat_ci(master_bh);
586 
587     if (x_Is5SList(feat_ci)) {
588         return "5S ribosomal RNA gene region";
589     }
590 
591     bool is_single_misc_feat = x_IsSingleMiscFeat(feat_ci);
592 
593     while (feat_ci)
594     {
595         vector<CRef<CAutoDefFeatureClause > > fclause = FeatureClauseFactory(bh, feat_ci->GetOriginalFeature(), feat_ci->GetMappedFeature().GetLocation(), m_Options, is_single_misc_feat);
596         for (auto it : fclause) {
597             if (it &&
598                 (it->IsRecognizedFeature() ||
599                 (m_Options.GetKeepRepeatRegion() &&
600                     (it->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_repeat_region ||
601                         it->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_LTR)))) {
602                 if (it->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_exon ||
603                     it->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_intron) {
604                     it->Label(m_Options.GetSuppressAlleles());
605                 }
606                 main_clause.AddSubclause(it);
607             }
608         }
609 
610         ++feat_ci;
611     }
612 
613     // optionally remove misc_feature subfeatures
614     if (m_Options.GetSuppressMiscFeatureSubfeatures()) {
615         main_clause.RemoveFeaturesUnderType(CSeqFeatData::eSubtype_misc_feature);
616     }
617 
618     // Group alt-spliced exons first, so that they will be associated with the correct genes and mRNAs
619     main_clause.GroupAltSplicedExons(bh);
620     main_clause.RemoveDeletedSubclauses();
621 
622     // Add mRNAs to other clauses
623     main_clause.GroupmRNAs(m_Options.GetSuppressAlleles());
624     main_clause.RemoveDeletedSubclauses();
625 
626     // Add genes to clauses that need them for descriptions/products
627     main_clause.GroupGenes(m_Options.GetSuppressAlleles());
628 
629     main_clause.GroupSegmentedCDSs(m_Options.GetSuppressAlleles());
630     main_clause.RemoveDeletedSubclauses();
631 
632     // Group all features
633     main_clause.GroupClauses(m_Options.GetGeneClusterOppStrand());
634     main_clause.RemoveDeletedSubclauses();
635 
636     // now that features have been grouped, can expand lists of spliced exons
637     main_clause.ExpandExonLists();
638 
639     // assign product names for features associated with genes that have products
640     main_clause.AssignGeneProductNames(&main_clause, m_Options.GetSuppressAlleles());
641 
642     // reverse the order of clauses for minus-strand CDSfeatures
643     main_clause.ReverseCDSClauseLists();
644 
645     main_clause.Label(m_Options.GetSuppressAlleles());
646     main_clause.CountUnknownGenes();
647     main_clause.RemoveDeletedSubclauses();
648 
649     x_RemoveOptionalFeatures(&main_clause, bh);
650 
651     // if a gene is listed as part of another clause, they do not need
652     // to be listed as there own clause
653     main_clause.RemoveGenesMentionedElsewhere();
654     main_clause.RemoveDeletedSubclauses();
655 
656     if (m_Options.GetSuppressMobileElementSubfeatures()) {
657         main_clause.SuppressMobileElementAndInsertionSequenceSubfeatures();
658     }
659 
660     main_clause.Label(m_Options.GetSuppressAlleles());
661 
662     if (!m_Options.GetSuppressFeatureAltSplice()) {
663         // GB-8927
664         // no alternate splice calculations for viruses
665         bool is_virus = false;
666         CSeqdesc_CI src(bh, CSeqdesc::e_Source);
667         if (src && src->GetSource().IsSetOrg() && src->GetSource().GetOrg().IsSetDivision()
668             && NStr::EqualNocase(src->GetSource().GetOrg().GetDivision(), "VRL")) {
669             is_virus = true;
670         }
671 
672         if (!is_virus) {
673             main_clause.FindAltSplices(m_Options.GetSuppressAlleles());
674             main_clause.RemoveDeletedSubclauses();
675         }
676     }
677 
678     main_clause.ConsolidateRepeatedClauses(m_Options.GetSuppressAlleles());
679     main_clause.RemoveDeletedSubclauses();
680 
681     main_clause.GroupConsecutiveExons(bh);
682     main_clause.RemoveDeletedSubclauses();
683 
684     main_clause.Label(m_Options.GetSuppressAlleles());
685 
686     return main_clause.ListClauses(true, false, m_Options.GetSuppressAlleles());
687 }
688 
689 
OrganelleByGenome(unsigned int genome_val)690 string OrganelleByGenome(unsigned int genome_val)
691 {
692     string organelle;
693     switch (genome_val) {
694         case CBioSource::eGenome_macronuclear:
695             organelle = "macronuclear";
696             break;
697         case CBioSource::eGenome_nucleomorph:
698             organelle = "nucleomorph";
699             break;
700         case CBioSource::eGenome_mitochondrion:
701             organelle = "mitochondrion";
702             break;
703         case CBioSource::eGenome_apicoplast:
704             organelle = "apicoplast";
705             break;
706         case CBioSource::eGenome_chloroplast:
707             organelle = "chloroplast";
708             break;
709         case CBioSource::eGenome_chromoplast:
710             organelle = "chromoplast";
711             break;
712         case CBioSource::eGenome_kinetoplast:
713             organelle = "kinetoplast";
714             break;
715         case CBioSource::eGenome_plastid:
716             organelle = "plastid";
717             break;
718         case CBioSource::eGenome_cyanelle:
719             organelle = "cyanelle";
720             break;
721         case CBioSource::eGenome_leucoplast:
722             organelle = "leucoplast";
723             break;
724         case CBioSource::eGenome_proplastid:
725             organelle = "proplastid";
726             break;
727         case CBioSource::eGenome_hydrogenosome:
728             organelle = "hydrogenosome";
729             break;
730     }
731     return organelle;
732 }
733 
734 
s_GetProductFlagFromCDSProductNames(CBioseq_Handle bh)735 static unsigned int s_GetProductFlagFromCDSProductNames (CBioseq_Handle bh)
736 {
737 	unsigned int product_flag = CBioSource::eGenome_unknown;
738 	string::size_type pos;
739 
740 	SAnnotSelector sel(CSeqFeatData::eSubtype_cdregion);
741     CFeat_CI feat_ci(bh, sel);
742 	while (feat_ci && product_flag == CBioSource::eGenome_unknown) {
743         if (feat_ci->IsSetProduct()) {
744             string label;
745             CConstRef<CSeq_feat> prot
746                 = sequence::GetBestOverlappingFeat(feat_ci->GetProduct(),
747                 CSeqFeatData::e_Prot,
748                 sequence::eOverlap_Simple,
749                 bh.GetScope());
750             if (prot) {
751                 feature::GetLabel(*prot, &label, feature::fFGL_Content);
752                 if (NStr::Find(label, "mitochondrion") != NCBI_NS_STD::string::npos
753                     || NStr::Find(label, "mitochondrial") != NCBI_NS_STD::string::npos) {
754                     product_flag = CBioSource::eGenome_mitochondrion;
755                 } else if (NStr::Find(label, "apicoplast") != NCBI_NS_STD::string::npos) {
756                     product_flag = CBioSource::eGenome_apicoplast;
757                 } else if (NStr::Find(label, "chloroplast") != NCBI_NS_STD::string::npos) {
758                     product_flag = CBioSource::eGenome_chloroplast;
759                 } else if (NStr::Find(label, "chromoplast") != NCBI_NS_STD::string::npos) {
760                     product_flag = CBioSource::eGenome_chromoplast;
761                 } else if (NStr::Find(label, "kinetoplast") != NCBI_NS_STD::string::npos) {
762                     product_flag = CBioSource::eGenome_kinetoplast;
763                 } else if (NStr::Find(label, "proplastid") != NCBI_NS_STD::string::npos) {
764                     product_flag = CBioSource::eGenome_proplastid;
765                 } else if ((pos = NStr::Find(label, "plastid")) != NCBI_NS_STD::string::npos
766                     && (pos == 0 || isspace(label.c_str()[pos]))) {
767                     product_flag = CBioSource::eGenome_plastid;
768                 } else if (NStr::Find(label, "cyanelle") != NCBI_NS_STD::string::npos) {
769                     product_flag = CBioSource::eGenome_cyanelle;
770                 } else if (NStr::Find(label, "leucoplast") != NCBI_NS_STD::string::npos) {
771                     product_flag = CBioSource::eGenome_leucoplast;
772                 }
773             }
774         }
775 		++feat_ci;
776 	}
777     return product_flag;
778 }
779 
780 
x_GetFeatureClauseProductEnding(const string & feature_clauses,CBioseq_Handle bh)781 string CAutoDef::x_GetFeatureClauseProductEnding(const string& feature_clauses,
782                                                  CBioseq_Handle bh)
783 {
784     bool pluralize = false;
785 	unsigned int product_flag_to_use;
786     unsigned int nuclear_copy_flag = CBioSource::eGenome_unknown;
787 
788 	if (m_Options.GetSpecifyNuclearProduct()) {
789 	    product_flag_to_use = s_GetProductFlagFromCDSProductNames (bh);
790 	} else {
791 		product_flag_to_use = m_Options.GetProductFlag();
792         nuclear_copy_flag = m_Options.GetNuclearCopyFlag();
793 	}
794     if (NStr::Find(feature_clauses, "genes") != NCBI_NS_STD::string::npos) {
795         pluralize = true;
796     } else {
797         string::size_type pos = NStr::Find(feature_clauses, "gene");
798         if (pos != NCBI_NS_STD::string::npos
799             && NStr::Find (feature_clauses, "gene", pos + 4) != NCBI_NS_STD::string::npos) {
800             pluralize = true;
801         }
802     }
803 
804     unsigned int genome_val = CBioSource::eGenome_unknown;
805     string genome_from_mods;
806 
807     for (CSeqdesc_CI dit(bh, CSeqdesc::e_Source); dit;  ++dit) {
808         const CBioSource& bsrc = dit->GetSource();
809         if (bsrc.CanGetGenome()) {
810             genome_val = bsrc.GetGenome();
811         }
812         if (bsrc.CanGetSubtype()) {
813             ITERATE (CBioSource::TSubtype, subSrcI, bsrc.GetSubtype()) {
814                 if ((*subSrcI)->GetSubtype() == CSubSource::eSubtype_other) {
815                     string note = (*subSrcI)->GetName();
816                     if (NStr::Equal(note, "macronuclear") || NStr::Equal(note, "micronuclear")) {
817                         genome_from_mods = note;
818                     }
819                 }
820             }
821         }
822         break;
823     }
824 
825     string ending = OrganelleByGenome(genome_val);
826     if (NStr::Equal(ending, "mitochondrion")) {
827         ending = "mitochondrial";
828     }
829     if (!NStr::IsBlank(ending)) {
830         ending = "; " + ending;
831     } else {
832         if (product_flag_to_use != CBioSource::eGenome_unknown) {
833             ending = OrganelleByGenome(product_flag_to_use);
834             if (NStr::IsBlank(ending)) {
835                 if (!NStr::IsBlank(genome_from_mods)) {
836                     ending = "; " + genome_from_mods;
837                 }
838             } else {
839                 if (NStr::Equal(ending, "mitochondrion")) {
840                     ending = "mitochondrial";
841                 }
842                 if (pluralize) {
843                     ending = "; nuclear genes for " + ending + " products";
844                 } else {
845                     ending = "; nuclear gene for " + ending + " product";
846                 }
847             }
848         } else if (nuclear_copy_flag != CBioSource::eGenome_unknown) {
849             ending = OrganelleByGenome(nuclear_copy_flag);
850             if (!NStr::IsBlank(ending)) {
851                 if (NStr::Equal(ending, "mitochondrion")) {
852                     ending = "mitochondrial";
853                 }
854                 ending = "; nuclear copy of " + ending + " gene";
855             }
856         }
857     }
858     return ending;
859 }
860 
861 
x_GetNonFeatureListEnding()862 string CAutoDef::x_GetNonFeatureListEnding()
863 {
864     string end;
865     switch (m_Options.GetFeatureListType())
866     {
867         case CAutoDefOptions::eCompleteSequence:
868             end = ", complete sequence.";
869             break;
870         case CAutoDefOptions::eCompleteGenome:
871             end = ", complete genome.";
872             break;
873         case CAutoDefOptions::ePartialSequence:
874             end = ", partial sequence.";
875             break;
876         case CAutoDefOptions::ePartialGenome:
877             end = ", partial genome.";
878             break;
879         case CAutoDefOptions::eSequence:
880         case CAutoDefOptions::eListAllFeatures:
881             end = " sequence.";
882             break;
883         case CAutoDefOptions::eWholeGenomeShotgunSequence:
884             end = " whole genome shotgun sequence.";
885             break;
886         default:
887             break;
888     }
889     return end;
890 }
891 
892 
IsBioseqmRNA(CBioseq_Handle bsh)893 bool IsBioseqmRNA(CBioseq_Handle bsh)
894 {
895     bool is_mRNA = false;
896     for (CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo); desc && !is_mRNA; ++desc) {
897         if (desc->GetMolinfo().CanGetBiomol()
898             && desc->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
899             is_mRNA = true;
900         }
901     }
902     return is_mRNA;
903 }
904 
905 
IsInGenProdSet(CBioseq_Handle bh)906 bool IsInGenProdSet(CBioseq_Handle bh)
907 {
908     CBioseq_set_Handle parent = bh.GetParentBioseq_set();
909     while (parent) {
910         if (parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_gen_prod_set) {
911             return true;
912         }
913         parent = parent.GetParentBioseq_set();
914     }
915     return false;
916 }
917 
918 
x_GetOneNonFeatureClause(CBioseq_Handle bh,unsigned int genome_val)919 string CAutoDef::x_GetOneNonFeatureClause(CBioseq_Handle bh, unsigned int genome_val)
920 {
921     string feature_clauses;
922     string organelle;
923 
924     if (m_Options.GetFeatureListType() != CAutoDefOptions::eSequence
925         || genome_val == CBioSource::eGenome_apicoplast
926         || genome_val == CBioSource::eGenome_chloroplast
927         || genome_val == CBioSource::eGenome_kinetoplast
928         || genome_val == CBioSource::eGenome_leucoplast
929         || genome_val == CBioSource::eGenome_mitochondrion
930         || genome_val == CBioSource::eGenome_plastid) {
931         organelle = OrganelleByGenome(genome_val);
932     }
933     if (!NStr::IsBlank(organelle)) {
934         feature_clauses = " " + organelle;
935     } else if (m_Options.GetFeatureListType() == CAutoDefOptions::eSequence) {
936         string biomol;
937         CSeqdesc_CI mi(bh, CSeqdesc::e_Molinfo);
938         if (mi && mi->GetMolinfo().IsSetBiomol()) {
939             if (mi->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
940                 biomol = "mRNA";
941             } else {
942                 biomol = CMolInfo::GetBiomolName(mi->GetMolinfo().GetBiomol());
943             }
944         }
945         if (!NStr::IsBlank(biomol)) {
946             feature_clauses = " " + biomol;
947         }
948     }
949 
950     feature_clauses += x_GetNonFeatureListEnding();
951     return feature_clauses;
952 }
953 
954 
GetOneFeatureClauseList(CBioseq_Handle bh,unsigned int genome_val)955 string CAutoDef::GetOneFeatureClauseList(CBioseq_Handle bh, unsigned int genome_val)
956 {
957     string feature_clauses;
958     if (m_Options.GetFeatureListType() == CAutoDefOptions::eListAllFeatures ||
959         (IsBioseqmRNA(bh) && IsInGenProdSet(bh))) {
960         feature_clauses = x_GetFeatureClauses(bh);
961         if (NStr::IsBlank(feature_clauses)) {
962             feature_clauses = x_GetOneNonFeatureClause(bh, genome_val);
963         } else {
964             feature_clauses = " " + feature_clauses;
965             string ending = x_GetFeatureClauseProductEnding(feature_clauses, bh);
966             if (m_Options.GetAltSpliceFlag()) {
967                 if (NStr::IsBlank(ending)) {
968                     ending = "; alternatively spliced";
969                 } else {
970                     ending += ", alternatively spliced";
971                 }
972             }
973             feature_clauses += ending;
974             if (NStr::IsBlank(feature_clauses)) {
975                 feature_clauses = ".";
976             } else {
977                 feature_clauses += ".";
978             }
979         }
980     } else {
981         feature_clauses = x_GetOneNonFeatureClause(bh, genome_val);
982     }
983     return feature_clauses;
984 }
985 
986 
GetKeywordPrefix(CBioseq_Handle bh)987 string CAutoDef::GetKeywordPrefix(CBioseq_Handle bh)
988 {
989     string keyword;
990 
991     CSeqdesc_CI gb(bh, CSeqdesc::e_Genbank);
992     if (gb) {
993         if (gb->GetGenbank().IsSetKeywords()) {
994             ITERATE(CGB_block::TKeywords, it, gb->GetGenbank().GetKeywords()) {
995                 if (NStr::EqualNocase(*it, "TPA:inferential")) {
996                     keyword = "TPA_inf: ";
997                     break;
998                 } else if (NStr::EqualNocase(*it, "TPA:experimental")) {
999                     keyword = "TPA_exp: ";
1000                     break;
1001                 }
1002             }
1003         }
1004     } else {
1005         CSeqdesc_CI mi(bh, CSeqdesc::e_Molinfo);
1006         if (mi && mi->GetMolinfo().IsSetTech() && mi->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
1007             keyword = "TSA: ";
1008         }
1009     }
1010     return keyword;
1011 }
1012 
1013 
GetOneDefLine(CAutoDefModifierCombo * mod_combo,const CBioseq_Handle & bh)1014 string CAutoDef::GetOneDefLine(CAutoDefModifierCombo *mod_combo, const CBioseq_Handle& bh)
1015 {
1016     // for protein sequences, use sequence::GetTitle
1017     if (bh.CanGetInst() && bh.GetInst().CanGetMol() && bh.GetInst().GetMol() == CSeq_inst::eMol_aa) {
1018         return sequence::CDeflineGenerator()
1019             .GenerateDefline(bh,
1020                              sequence::CDeflineGenerator::fIgnoreExisting |
1021                              sequence::CDeflineGenerator::fAllProteinNames);
1022     }
1023     string org_desc = "Unknown organism";
1024     unsigned int genome_val = CBioSource::eGenome_unknown;
1025     mod_combo->InitOptions(m_Options);
1026 
1027     for (CSeqdesc_CI dit(bh, CSeqdesc::e_Source); dit;  ++dit) {
1028         const CBioSource& bsrc = dit->GetSource();
1029         org_desc = mod_combo->GetSourceDescriptionString(bsrc);
1030         if (bsrc.CanGetGenome()) {
1031             genome_val = bsrc.GetGenome();
1032         }
1033         break;
1034     }
1035     string feature_clauses = GetOneFeatureClauseList(bh, genome_val);
1036 
1037     if (org_desc.length() > 0 && isalpha(org_desc.c_str()[0])) {
1038         string first_letter = org_desc.substr(0, 1);
1039         string remainder = org_desc.substr(1);
1040         NStr::ToUpper(first_letter);
1041         org_desc = first_letter + remainder;
1042     }
1043 
1044     string keyword = GetKeywordPrefix(bh);
1045 
1046     return keyword + org_desc + feature_clauses;
1047 }
1048 
1049 
1050 // use internal settings to create mod combo
GetOneDefLine(const CBioseq_Handle & bh)1051 string CAutoDef::GetOneDefLine(const CBioseq_Handle& bh)
1052 {
1053     // for protein sequences, use sequence::GetTitle
1054     if (bh.CanGetInst() && bh.GetInst().CanGetMol() && bh.GetInst().GetMol() == CSeq_inst::eMol_aa) {
1055         return sequence::CDeflineGenerator()
1056             .GenerateDefline(bh,
1057             sequence::CDeflineGenerator::fIgnoreExisting);
1058     }
1059     string org_desc = "Unknown organism";
1060     unsigned int genome_val = CBioSource::eGenome_unknown;
1061 
1062     CRef<CAutoDefModifierCombo> mod_combo(GetEmptyCombo());
1063     mod_combo->InitFromOptions(m_Options);
1064 
1065     for (CSeqdesc_CI dit(bh, CSeqdesc::e_Source); dit; ++dit) {
1066         const CBioSource& bsrc = dit->GetSource();
1067         org_desc = mod_combo->GetSourceDescriptionString(bsrc);
1068         if (bsrc.CanGetGenome()) {
1069             genome_val = bsrc.GetGenome();
1070         }
1071         break;
1072     }
1073     string feature_clauses = GetOneFeatureClauseList(bh, genome_val);
1074 
1075     if (org_desc.length() > 0 && isalpha(org_desc.c_str()[0])) {
1076         string first_letter = org_desc.substr(0, 1);
1077         string remainder = org_desc.substr(1);
1078         NStr::ToUpper(first_letter);
1079         org_desc = first_letter + remainder;
1080     }
1081 
1082     string keyword = GetKeywordPrefix(bh);
1083 
1084     return keyword + org_desc + feature_clauses;
1085 }
1086 
1087 
GetAvailableModifiers(CAutoDef::TAvailableModifierSet & mod_set)1088 void CAutoDef::GetAvailableModifiers(CAutoDef::TAvailableModifierSet &mod_set)
1089 {
1090     mod_set.clear();
1091     CAutoDefSourceDescription::TAvailableModifierVector modifier_list;
1092     modifier_list.clear();
1093     m_OrigModCombo.GetAvailableModifiers (modifier_list);
1094     for (unsigned int k = 0; k < modifier_list.size(); k++) {
1095         mod_set.insert(CAutoDefAvailableModifier(modifier_list[k]));
1096     }
1097 }
1098 
1099 
SetOptionsObject(const CUser_object & user)1100 void CAutoDef::SetOptionsObject(const CUser_object& user)
1101 {
1102     m_Options.InitFromUserObject(user);
1103 }
1104 
1105 
1106 //starting here, remove when separating autodef from taxonomy options
s_GetOptionsForSet(CBioseq_set_Handle set)1107 CConstRef<CUser_object> s_GetOptionsForSet(CBioseq_set_Handle set)
1108 {
1109     CConstRef<CUser_object> options(NULL);
1110     CBioseq_CI b(set, CSeq_inst::eMol_na);
1111     while (b && !options) {
1112         CSeqdesc_CI desc(*b, CSeqdesc::e_User);
1113         while (desc && desc->GetUser().GetObjectType() != CUser_object::eObjectType_AutodefOptions) {
1114             ++desc;
1115         }
1116         if (desc) {
1117             options.Reset(&(desc->GetUser()));
1118         }
1119     }
1120     return options;
1121 }
1122 
1123 
RegenerateDefLine(CBioseq_Handle bh)1124 string CAutoDef::RegenerateDefLine(CBioseq_Handle bh)
1125 {
1126     string defline;
1127     if (bh.IsAa()) {
1128         return kEmptyStr;
1129     }
1130     CSeqdesc_CI desc(bh, CSeqdesc::e_User);
1131     while (desc && desc->GetUser().GetObjectType() != CUser_object::eObjectType_AutodefOptions) {
1132         ++desc;
1133     }
1134     if (desc) {
1135         CAutoDef autodef;
1136         autodef.SetOptionsObject(desc->GetUser());
1137         CAutoDefModifierCombo mod_combo;
1138         CAutoDefOptions options;
1139         options.InitFromUserObject(desc->GetUser());
1140         mod_combo.SetOptions(options);
1141         defline = autodef.GetOneDefLine(&mod_combo, bh);
1142     }
1143     return defline;
1144 }
1145 
1146 
RegenerateSequenceDefLines(CSeq_entry_Handle se)1147 bool CAutoDef::RegenerateSequenceDefLines(CSeq_entry_Handle se)
1148 {
1149     bool any = false;
1150     CBioseq_CI b_iter(se);
1151     for (; b_iter; ++b_iter) {
1152         if (b_iter->IsAa()) {
1153             continue;
1154         }
1155         CSeqdesc_CI desc(*b_iter, CSeqdesc::e_User);
1156         while (desc && desc->GetUser().GetObjectType() != CUser_object::eObjectType_AutodefOptions) {
1157             ++desc;
1158         }
1159         if (desc) {
1160             string defline = RegenerateDefLine(*b_iter);
1161 
1162             bool found_existing = false;
1163             CBioseq_EditHandle beh(*b_iter);
1164             NON_CONST_ITERATE(CBioseq_EditHandle::TDescr::Tdata, it, beh.SetDescr().Set()) {
1165                 if ((*it)->IsTitle()) {
1166                     if (!NStr::Equal((*it)->GetTitle(), defline)) {
1167                         (*it)->SetTitle(defline);
1168                         any = true;
1169                     }
1170                     found_existing = true;
1171                     break;
1172                 }
1173             }
1174             if (!found_existing) {
1175                 CRef<CSeqdesc> new_desc(new CSeqdesc());
1176                 new_desc->SetTitle(defline);
1177                 beh.SetDescr().Set().push_back(new_desc);
1178                 any = true;
1179             }
1180         }
1181     }
1182     return any;
1183 }
1184 
1185 
x_IsHumanSTR(const CUser_object & obj)1186 bool CAutoDef::x_IsHumanSTR(const CUser_object& obj)
1187 {
1188     if (obj.GetObjectType() != CUser_object::eObjectType_StructuredComment) {
1189         return false;
1190     }
1191     if (!obj.IsSetData()) {
1192         return false;
1193     }
1194     ITERATE(CUser_object::TData, f, obj.GetData()) {
1195         if ((*f)->IsSetLabel() && (*f)->GetLabel().IsStr() &&
1196             NStr::EqualNocase((*f)->GetLabel().GetStr(), "StructuredCommentPrefix") &&
1197             (*f)->IsSetData() && (*f)->GetData().IsStr()) {
1198             if (NStr::EqualNocase((*f)->GetData().GetStr(), "##HumanSTR-START##")) {
1199                 return true;
1200             } else {
1201                 return false;
1202             }
1203         }
1204     }
1205     return false;
1206 }
1207 
1208 
x_GetHumanSTRFeatureClauses(CBioseq_Handle bh,const CUser_object & comment)1209 string CAutoDef::x_GetHumanSTRFeatureClauses(CBioseq_Handle bh, const CUser_object& comment)
1210 {
1211     string locus_name;
1212     string allele;
1213     string repeat;
1214     string assay;
1215 
1216     if (comment.IsSetData()) {
1217         ITERATE(CUser_object::TData, it, comment.GetData()) {
1218             if ((*it)->IsSetData() && (*it)->GetData().IsStr() &&
1219                 (*it)->IsSetLabel() && (*it)->GetLabel().IsStr()) {
1220                 const string& label = (*it)->GetLabel().GetStr();
1221                 if (NStr::EqualNocase(label, "STR locus name")) {
1222                     locus_name = (*it)->GetData().GetStr();
1223                 } else if (NStr::EqualNocase(label, "Length-based allele")) {
1224                     allele = (*it)->GetData().GetStr();
1225                 } else if (NStr::EqualNocase(label, "Bracketed repeat")) {
1226                     repeat = (*it)->GetData().GetStr();
1227                 } else if (NStr::EqualNocase(label, "Sequencing assay code")) {
1228                     assay = (*it)->GetData().GetStr();
1229                 }
1230             }
1231         }
1232     }
1233 
1234     string clause = "microsatellite " + locus_name + " " + allele + " " + repeat;
1235     CFeat_CI f(bh, CSeqFeatData::eSubtype_variation);
1236     while (f) {
1237         if (f->IsSetDbxref()) {
1238             ITERATE(CSeq_feat::TDbxref, db, f->GetDbxref()) {
1239                 if ((*db)->IsSetDb() && NStr::Equal((*db)->GetDb(), "dbSNP") &&
1240                     (*db)->IsSetTag()) {
1241                     if ((*db)->GetTag().IsStr()) {
1242                         clause += " " + (*db)->GetTag().GetStr();
1243                     } else if ((*db)->GetTag().IsId()) {
1244                         clause += " " + NStr::NumericToString((*db)->GetTag().GetId());
1245                     }
1246                 }
1247             }
1248         }
1249         ++f;
1250     }
1251     if (assay != "") {
1252         clause += " " + assay;
1253     }
1254     clause += " sequence";
1255     return clause;
1256 }
1257 
1258 
s_ChooseModInModList(bool is_org_mod,int subtype,bool require_all,CAutoDefSourceDescription::TAvailableModifierVector & modifiers)1259 bool s_ChooseModInModList(bool is_org_mod, int subtype, bool require_all, CAutoDefSourceDescription::TAvailableModifierVector& modifiers)
1260 {
1261     bool rval = false;
1262     for (auto & modifier : modifiers) {
1263         if (modifier.IsOrgMod() && is_org_mod) {
1264             if (modifier.GetOrgModType() == subtype) {
1265                 if (modifier.AllPresent()) {
1266                     rval = true;
1267                 }
1268                 else if (modifier.AnyPresent() && !require_all) {
1269                     rval = true;
1270                 }
1271                 if (rval) {
1272                     modifier.SetRequested(true);
1273                 }
1274                 break;
1275             }
1276         }
1277         else if (!modifier.IsOrgMod() && !is_org_mod) {
1278             if (modifier.GetSubSourceType() == subtype) {
1279                 if (modifier.AllPresent()) {
1280                     rval = true;
1281                 }
1282                 else if (modifier.AnyPresent() && !require_all) {
1283                     rval = true;
1284                 }
1285                 if (rval) {
1286                     modifier.SetRequested(true);
1287                 }
1288                 break;
1289             }
1290         }
1291     }
1292     return rval;
1293 }
1294 
1295 
CreateIDOptions(CSeq_entry_Handle seh)1296 CRef<CUser_object> CAutoDef::CreateIDOptions(CSeq_entry_Handle seh)
1297 {
1298     CAutoDef ad;
1299     ad.AddSources(seh);
1300 
1301     CRef<CAutoDefModifierCombo> src_combo = ad.FindBestModifierCombo();
1302     CAutoDefSourceDescription::TAvailableModifierVector modifiers;
1303     src_combo->GetAvailableModifiers(modifiers);
1304 
1305     static int subtypes[] = { COrgMod::eSubtype_strain,
1306         CSubSource::eSubtype_clone,
1307         COrgMod::eSubtype_isolate,
1308         CSubSource::eSubtype_haplotype,
1309         COrgMod::eSubtype_cultivar,
1310         COrgMod::eSubtype_ecotype,
1311         COrgMod::eSubtype_breed,
1312         COrgMod::eSubtype_specimen_voucher,
1313         COrgMod::eSubtype_culture_collection,
1314         COrgMod::eSubtype_bio_material };
1315     static bool is_orgmod[] = { true, false, true, false, true, true, true, true, true, true };
1316     static int num_subtypes = sizeof(subtypes) / sizeof(int);
1317 
1318 
1319     bool found = false;
1320     // first look for best identifier found in all
1321     for (int i = 0; i < num_subtypes && !found; i++) {
1322         found = s_ChooseModInModList(is_orgmod[i], subtypes[i], true, modifiers);
1323     }
1324     if (!found) {
1325         // if not found in all, use best identifier found in some
1326         for (int i = 0; i < num_subtypes && !found; i++) {
1327             found = s_ChooseModInModList(is_orgmod[i], subtypes[i], false, modifiers);
1328         }
1329     }
1330     if (!src_combo->AreFeatureClausesUnique()) {
1331         // use best
1332         for (auto &modifier : modifiers) {
1333             if (modifier.AnyPresent()) {
1334                 if (modifier.IsOrgMod()) {
1335                     if (src_combo->HasOrgMod(modifier.GetOrgModType())) {
1336                         modifier.SetRequested(true);
1337                     }
1338                 }
1339                 else if (src_combo->HasSubSource(modifier.GetSubSourceType())) {
1340                     modifier.SetRequested(true);
1341                 }
1342             }
1343         }
1344     }
1345 
1346     CRef<CUser_object> user = ad.GetOptionsObject();
1347     CAutoDefOptions options;
1348     options.InitFromUserObject(*user);
1349     for(const auto &it : modifiers) {
1350         if (it.IsRequested()) {
1351             if (it.IsOrgMod()) {
1352                 options.AddOrgMod(it.GetOrgModType());
1353             } else {
1354                 options.AddSubSource(it.GetSubSourceType());
1355             }
1356         }
1357     }
1358     user = options.MakeUserObject();
1359     return user;
1360 }
1361 
1362 
1363 END_SCOPE(objects)
1364 END_NCBI_SCOPE
1365