1 /* $Id: autodef.cpp 629259 2021-04-13 13:28:40Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin
27 *
28 * File Description:
29 * Generate unique definition lines for a set of sequences using organism
30 * descriptions and feature clauses.
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <objmgr/util/autodef.hpp>
35 #include <corelib/ncbimisc.hpp>
36 #include <objmgr/seqdesc_ci.hpp>
37 #include <objmgr/bioseq_ci.hpp>
38 #include <objmgr/util/feature.hpp>
39 #include <objmgr/util/sequence.hpp>
40 #include <objmgr/util/seq_loc_util.hpp>
41
42 #include <objects/seq/Seq_descr.hpp>
43 #include <objects/seq/Seqdesc.hpp>
44 #include <objects/seq/Bioseq.hpp>
45 #include <objects/seqblock/GB_block.hpp>
46 #include <objects/seqfeat/RNA_ref.hpp>
47 #include <objects/seqfeat/RNA_gen.hpp>
48 #include <objects/seqfeat/BioSource.hpp>
49 #include <objects/seqfeat/Org_ref.hpp>
50 #include <objects/general/Dbtag.hpp>
51
52 #include <serial/iterator.hpp>
53
54 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)55 BEGIN_SCOPE(objects)
56
57
58 CAutoDef::CAutoDef()
59 : m_Cancelled(false)
60 {
61 }
62
63
~CAutoDef()64 CAutoDef::~CAutoDef()
65 {
66 }
67
68
s_NeedFeatureClause(const CBioseq & b)69 bool s_NeedFeatureClause(const CBioseq& b)
70 {
71 if (!b.IsSetAnnot()) {
72 return true;
73 }
74 size_t num_features = 0;
75
76 ITERATE(CBioseq::TAnnot, a, b.GetAnnot()) {
77 if ((*a)->IsFtable()) {
78 num_features += (*a)->GetData().GetFtable().size();
79 if (num_features > 100) {
80 break;
81 }
82 }
83 }
84 if (num_features < 100) {
85 return true;
86 } else {
87 return false;
88 }
89 }
90
AddSources(CSeq_entry_Handle se)91 void CAutoDef::AddSources (CSeq_entry_Handle se)
92 {
93
94 // add sources to modifier combination groups
95 CBioseq_CI seq_iter(se, CSeq_inst::eMol_na);
96 for ( ; seq_iter; ++seq_iter ) {
97 CSeqdesc_CI dit((*seq_iter), CSeqdesc::e_Source);
98 if (dit) {
99 string feature_clauses = s_NeedFeatureClause(*(seq_iter->GetCompleteBioseq())) ? x_GetFeatureClauses(*seq_iter) : kEmptyStr;
100 const CBioSource& bsrc = dit->GetSource();
101 m_OrigModCombo.AddSource(bsrc, feature_clauses);
102 }
103 }
104
105 // set default exclude_sp values
106 m_OrigModCombo.SetExcludeSpOrgs (m_OrigModCombo.GetDefaultExcludeSp());
107 }
108
109
x_SortModifierListByRank(TModifierIndexVector & index_list,CAutoDefSourceDescription::TAvailableModifierVector & modifier_list)110 void CAutoDef::x_SortModifierListByRank(TModifierIndexVector &index_list, CAutoDefSourceDescription::TAvailableModifierVector &modifier_list)
111 {
112 unsigned int k, j, tmp;
113 if (index_list.size() < 2) {
114 return;
115 }
116 for (k = 0; k < index_list.size() - 1; k++) {
117 for (j = k + 1; j < index_list.size(); j++) {
118 if (modifier_list[index_list[k]].GetRank() > modifier_list[index_list[j]].GetRank()) {
119 tmp = index_list[k];
120 index_list[k] = index_list[j];
121 index_list[j] = tmp;
122 }
123 }
124 }
125 }
126
127
x_GetModifierIndexList(TModifierIndexVector & index_list,CAutoDefSourceDescription::TAvailableModifierVector & modifier_list)128 void CAutoDef::x_GetModifierIndexList(TModifierIndexVector &index_list, CAutoDefSourceDescription::TAvailableModifierVector &modifier_list)
129 {
130 unsigned int k;
131 TModifierIndexVector remaining_list;
132
133 index_list.clear();
134 remaining_list.clear();
135
136 // note - required modifiers should be removed from the list
137
138 // first, look for all_present and all_unique modifiers
139 for (k = 0; k < modifier_list.size(); k++) {
140 if (modifier_list[k].AllPresent() && modifier_list[k].AllUnique()) {
141 index_list.push_back(k);
142 } else if (modifier_list[k].AnyPresent()) {
143 remaining_list.push_back(k);
144 }
145 }
146 x_SortModifierListByRank(index_list, modifier_list);
147 x_SortModifierListByRank(remaining_list, modifier_list);
148
149 for (k = 0; k < remaining_list.size(); k++) {
150 index_list.push_back(remaining_list[k]);
151 }
152 }
153
154
x_IsOrgModRequired(unsigned int mod_type)155 bool CAutoDef::x_IsOrgModRequired(unsigned int mod_type)
156 {
157 return false;
158 }
159
160
x_IsSubSrcRequired(unsigned int mod_type)161 bool CAutoDef::x_IsSubSrcRequired(unsigned int mod_type)
162 {
163 if (mod_type == CSubSource::eSubtype_endogenous_virus_name
164 || mod_type == CSubSource::eSubtype_plasmid_name
165 || mod_type == CSubSource::eSubtype_transgenic) {
166 return true;
167 } else {
168 return false;
169 }
170 }
171
172
GetNumAvailableModifiers()173 unsigned int CAutoDef::GetNumAvailableModifiers()
174 {
175 CAutoDefSourceDescription::TAvailableModifierVector modifier_list;
176 modifier_list.clear();
177 m_OrigModCombo.GetAvailableModifiers (modifier_list);
178
179 unsigned int num_present = 0;
180 for (unsigned int k = 0; k < modifier_list.size(); k++) {
181 if (modifier_list[k].AnyPresent()) {
182 num_present++;
183 }
184 }
185 return num_present;
186 }
187
188
189 struct SAutoDefModifierComboSort {
operator ()SAutoDefModifierComboSort190 bool operator()(const CRef<CAutoDefModifierCombo>& s1,
191 const CRef<CAutoDefModifierCombo>& s2) const
192 {
193 return (*s1 < *s2);
194 }
195 };
196
197
198
FindBestModifierCombo()199 CRef<CAutoDefModifierCombo> CAutoDef::FindBestModifierCombo()
200 {
201 TModifierComboVector combo_list;
202
203 combo_list.clear();
204 combo_list.emplace_back (new CAutoDefModifierCombo(&m_OrigModCombo));
205
206
207 TModifierComboVector tmp, add_list;
208 TModifierComboVector::iterator it;
209 CAutoDefSourceDescription::TModifierVector mod_list;
210 bool stop = false;
211 unsigned int k;
212
213 mod_list.clear();
214
215 if (combo_list[0]->GetMaxInGroup() == 1) {
216 stop = true;
217 }
218
219 while (!stop) {
220 stop = true;
221 it = combo_list.begin();
222 add_list.clear();
223 while (it != combo_list.end()) {
224 tmp = (*it)->ExpandByAnyPresent ();
225 if (!tmp.empty()) {
226 stop = false;
227 for (k = 0; k < tmp.size(); k++) {
228 add_list.emplace_back (new CAutoDefModifierCombo(tmp[k]));
229 }
230 it = combo_list.erase (it);
231 } else {
232 ++it;
233 }
234 tmp.clear();
235 }
236 for (k = 0; k < add_list.size(); k++) {
237 combo_list.emplace_back (new CAutoDefModifierCombo(add_list[k]));
238 }
239 add_list.clear();
240 std::sort (combo_list.begin(), combo_list.end(), SAutoDefModifierComboSort());
241 if (combo_list[0]->GetMaxInGroup() == 1) {
242 stop = true;
243 }
244 }
245
246 ITERATE (CAutoDefSourceDescription::TModifierVector, it, combo_list[0]->GetModifiers()) {
247 mod_list.push_back (CAutoDefSourceModifierInfo(*it));
248 }
249
250 return combo_list[0];
251 }
252
253
GetAllModifierCombo()254 CAutoDefModifierCombo* CAutoDef::GetAllModifierCombo()
255 {
256 CAutoDefModifierCombo *newm = new CAutoDefModifierCombo(&m_OrigModCombo);
257
258 // set all modifiers in combo
259 CAutoDefSourceDescription::TAvailableModifierVector modifier_list;
260
261 // first, get the list of modifiers that are available
262 modifier_list.clear();
263 newm->GetAvailableModifiers (modifier_list);
264
265 // add any modifier not already in the combo to the combo
266 for (unsigned int k = 0; k < modifier_list.size(); k++) {
267 if (modifier_list[k].AnyPresent()) {
268 if (modifier_list[k].IsOrgMod()) {
269 COrgMod::ESubtype subtype = modifier_list[k].GetOrgModType();
270 if (!newm->HasOrgMod(subtype)) {
271 newm->AddOrgMod(subtype);
272 }
273 } else {
274 CSubSource::ESubtype subtype = modifier_list[k].GetSubSourceType();
275 if (!newm->HasSubSource(subtype)) {
276 newm->AddSubsource(subtype);
277 }
278 }
279 }
280 }
281 return newm;
282 }
283
284
GetEmptyCombo()285 CAutoDefModifierCombo* CAutoDef::GetEmptyCombo()
286 {
287 CAutoDefModifierCombo *newm = new CAutoDefModifierCombo(&m_OrigModCombo);
288
289 return newm;
290 }
291
292
GetOneSourceDescription(const CBioseq_Handle & bh)293 string CAutoDef::GetOneSourceDescription(const CBioseq_Handle& bh)
294 {
295 CRef<CAutoDefModifierCombo> best = FindBestModifierCombo();
296 if (best == NULL) {
297 return "";
298 }
299
300 for (CSeqdesc_CI dit(bh, CSeqdesc::e_Source); dit; ++dit) {
301 const CBioSource& bsrc = dit->GetSource();
302 return best->GetSourceDescriptionString(bsrc);
303 }
304 return "";
305 }
306
307
x_RemoveOptionalFeatures(CAutoDefFeatureClause_Base * main_clause,const CBioseq_Handle & bh)308 void CAutoDef::x_RemoveOptionalFeatures(CAutoDefFeatureClause_Base *main_clause, const CBioseq_Handle& bh)
309 {
310 // remove optional features that have not been requested
311 if (main_clause == NULL) {
312 return;
313 }
314
315 // keep 5' UTRs only if lonely or requested
316 if (!m_Options.GetKeep5UTRs() && !main_clause->IsFeatureTypeLonely(CSeqFeatData::eSubtype_5UTR)) {
317 main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_5UTR);
318 }
319
320 // keep 3' UTRs only if lonely or requested
321 if (!m_Options.GetKeep3UTRs() && !main_clause->IsFeatureTypeLonely(CSeqFeatData::eSubtype_3UTR)) {
322 main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_3UTR);
323 }
324
325 // keep LTRs only if requested or lonely and not in parent
326 if (!m_Options.GetKeepLTRs() && !m_Options.GetKeepRepeatRegion() &&
327 !main_clause->IsFeatureTypeLonely(CSeqFeatData::eSubtype_LTR)) {
328 main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_LTR);
329 }
330
331 // keep promoters only if requested or lonely and not in mRNA
332 if (!m_Options.GetKeepRegulatoryFeatures()) {
333 if (m_Options.GetUseFakePromoters()) {
334 // promoters are requested, remove all regulatory features except promoters
335 main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_regulatory, true);
336 } else {
337 bool lonely = main_clause->IsFeatureTypeLonely(CSeqFeatData::eSubtype_regulatory);
338 if (lonely) {
339 // remove regulatory features, including promoters, only in mRNA sequences
340 main_clause->RemoveFeaturesInmRNAsByType(CSeqFeatData::eSubtype_regulatory, false);
341 // remove regulatory features other than promoters everywhere else
342 main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_regulatory, true);
343 } else {
344 // remove all regulatory features
345 main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_regulatory, false);
346 }
347 }
348 }
349
350 // keep introns only if requested or lonely and not in mRNA
351 if (!m_Options.GetKeepIntrons()) {
352 if (!main_clause->IsFeatureTypeLonely(CSeqFeatData::eSubtype_intron)) {
353 main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_intron);
354 } else {
355 main_clause->RemoveFeaturesInmRNAsByType(CSeqFeatData::eSubtype_intron);
356 }
357 }
358
359 // keep exons only if requested or lonely or in mRNA or in partial CDS or on segment
360 if (!m_Options.GetKeepExons() && !IsSegment(bh)) {
361 if (main_clause->GetMainFeatureSubtype() != CSeqFeatData::eSubtype_exon) {
362 main_clause->RemoveUnwantedExons();
363 }
364 }
365
366 // only keep bioseq precursor RNAs if lonely or requested
367 if (!main_clause->IsBioseqPrecursorRNA() && !m_Options.GetKeepPrecursorRNA()) {
368 main_clause->RemoveBioseqPrecursorRNAs();
369 }
370
371 // keep uORFs if lonely or requested
372 if (!m_Options.GetKeepuORFs() && main_clause->GetNumSubclauses() > 1) {
373 main_clause->RemoveuORFs();
374 }
375
376 // remove "optional" mobile element features unless lonely or requested
377 if (!m_Options.GetKeepMobileElements() && main_clause->GetNumSubclauses() > 1) {
378 main_clause->RemoveOptionalMobileElements();
379 }
380
381 // keep misc_recombs only if requested
382 if (!m_Options.GetKeepMiscRecomb()) {
383 main_clause->RemoveFeaturesByType(CSeqFeatData::eSubtype_misc_recomb);
384 }
385
386 // delete subclauses at end, so that loneliness calculations will be correct
387 main_clause->RemoveDeletedSubclauses();
388 }
389
390
x_IsFeatureSuppressed(CSeqFeatData::ESubtype subtype)391 bool CAutoDef::x_IsFeatureSuppressed(CSeqFeatData::ESubtype subtype)
392 {
393 return m_Options.IsFeatureSuppressed(subtype);
394 }
395
396
SuppressFeature(const objects::CFeatListItem & feat)397 void CAutoDef::SuppressFeature(const objects::CFeatListItem& feat)
398 {
399 if (feat.GetType() == CSeqFeatData::e_not_set) {
400 m_Options.SuppressAllFeatures();
401 } else {
402 m_Options.SuppressFeature((CSeqFeatData::ESubtype)(feat.GetSubtype()));
403 }
404 }
405
406
SuppressFeature(objects::CSeqFeatData::ESubtype subtype)407 void CAutoDef::SuppressFeature(objects::CSeqFeatData::ESubtype subtype)
408 {
409 m_Options.SuppressFeature(subtype);
410 }
411
412
IsSegment(const CBioseq_Handle & bh)413 bool CAutoDef::IsSegment(const CBioseq_Handle& bh)
414 {
415 CSeq_entry_Handle seh = bh.GetParentEntry();
416
417 seh = seh.GetParentEntry();
418
419 if (seh && seh.IsSet()) {
420 CBioseq_set_Handle bsh = seh.GetSet();
421 if (bsh.CanGetClass() && bsh.GetClass() == CBioseq_set::eClass_parts) {
422 return true;
423 }
424 }
425 return false;
426 }
427
428
GetMasterLocation(CBioseq_Handle & bh,CRange<TSeqPos> & range)429 void CAutoDef::GetMasterLocation(CBioseq_Handle &bh, CRange<TSeqPos>& range)
430 {
431 CSeq_entry_Handle seh = bh.GetParentEntry();
432 CBioseq_Handle master = bh;
433 unsigned int start = 0, stop = bh.GetBioseqLength() - 1;
434 unsigned int offset = 0;
435
436 seh = seh.GetParentEntry();
437
438 if (seh && seh.IsSet()) {
439 CBioseq_set_Handle bsh = seh.GetSet();
440 if (bsh.CanGetClass() && bsh.GetClass() == CBioseq_set::eClass_parts) {
441 seh = seh.GetParentEntry();
442 if (seh.IsSet()) {
443 bsh = seh.GetSet();
444 if (bsh.CanGetClass() && bsh.GetClass() == CBioseq_set::eClass_segset) {
445 CBioseq_CI seq_iter(seh);
446 for ( ; seq_iter; ++seq_iter ) {
447 if (seq_iter->CanGetInst_Repr()) {
448 if (seq_iter->GetInst_Repr() == CSeq_inst::eRepr_seg) {
449 master = *seq_iter;
450 } else if (seq_iter->GetInst_Repr() == CSeq_inst::eRepr_raw) {
451 if (*seq_iter == bh) {
452 start = offset;
453 stop = offset + bh.GetBioseqLength() - 1;
454 } else {
455 offset += seq_iter->GetBioseqLength();
456 }
457 }
458 }
459 }
460 }
461 }
462 }
463 }
464 bh = master;
465 range.SetFrom(start);
466 range.SetTo(stop);
467 }
468
469
x_Is5SList(CFeat_CI feat_ci)470 bool CAutoDef::x_Is5SList(CFeat_CI feat_ci)
471 {
472 bool is_list = true;
473 bool is_single = true;
474 bool found_single = false;
475
476 if (!feat_ci) {
477 return false;
478 }
479 ++feat_ci;
480 if (feat_ci) {
481 is_single = false;
482 }
483 feat_ci.Rewind();
484
485 while (feat_ci && is_list) {
486 if (feat_ci->GetData().GetSubtype() == CSeqFeatData::eSubtype_rRNA) {
487 if (!feat_ci->GetData().GetRna().IsSetExt()
488 || !feat_ci->GetData().GetRna().GetExt().IsName()
489 || !NStr::Equal(feat_ci->GetData().GetRna().GetExt().GetName(), "5S ribosomal RNA")) {
490 is_list = false;
491 }
492 } else if (feat_ci->GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature) {
493 if (!feat_ci->IsSetComment()) {
494 is_list = false;
495 } else if (NStr::Equal(feat_ci->GetComment(), "contains 5S ribosomal RNA and nontranscribed spacer")) {
496 found_single = true;
497 } else if (!NStr::Equal(feat_ci->GetComment(), "nontranscribed spacer")) {
498 is_list = false;
499 }
500 } else {
501 is_list = false;
502 }
503 ++feat_ci;
504 }
505 if (is_single && !found_single) {
506 is_list = false;
507 }
508 feat_ci.Rewind();
509 return is_list;
510 }
511
512
x_IsSingleMiscFeat(CFeat_CI feat_ci)513 bool CAutoDef::x_IsSingleMiscFeat(CFeat_CI feat_ci)
514 {
515 if (!feat_ci ||
516 feat_ci->GetData().GetSubtype() != CSeqFeatData::eSubtype_misc_feature ||
517 !feat_ci->IsSetComment()) {
518 return false;
519 }
520 bool is_single = true;
521 ++feat_ci;
522 if (feat_ci) {
523 is_single = false;
524 }
525 feat_ci.Rewind();
526 return is_single;
527 }
528
529
s_HasPromoter(CBioseq_Handle bh)530 bool s_HasPromoter(CBioseq_Handle bh)
531 {
532 bool has_promoter = false;
533 SAnnotSelector sel(CSeqFeatData::eSubtype_regulatory);
534 CFeat_CI f_ci (bh, sel);
535 while (f_ci && !has_promoter) {
536 has_promoter = CAutoDefFeatureClause::IsPromoter(*(f_ci->GetSeq_feat()));
537 ++f_ci;
538 }
539 return has_promoter;
540 }
541
542
x_GetFeatureClauses(const CBioseq_Handle & bh)543 string CAutoDef::x_GetFeatureClauses(const CBioseq_Handle& bh)
544 {
545 const string& custom = m_Options.GetCustomFeatureClause();
546 if (!NStr::IsBlank(custom)) {
547 return custom;
548 }
549
550 CSeqdesc_CI d(bh, CSeqdesc::e_User);
551 while (d) {
552 if (x_IsHumanSTR(d->GetUser())) {
553 return x_GetHumanSTRFeatureClauses(bh, d->GetUser());
554 }
555 ++d;
556 }
557
558
559 CAutoDefFeatureClause_Base main_clause(m_Options);
560 CRange<TSeqPos> range;
561 CBioseq_Handle master_bh = bh;
562
563 GetMasterLocation(master_bh, range);
564
565 // if no promoter, and fake promoters are requested, create one
566 if (m_Options.GetUseFakePromoters() && !s_HasPromoter(bh)) {
567 CRef<CSeq_feat> fake_promoter(new CSeq_feat());
568 CRef<CSeq_loc> fake_promoter_loc(new CSeq_loc());
569 const CSeq_id* id = FindBestChoice(bh.GetBioseqCore()->GetId(), CSeq_id::BestRank);
570 CRef <CSeq_id> new_id(new CSeq_id);
571 new_id->Assign(*id);
572 fake_promoter_loc->SetInt().SetId(*new_id);
573 fake_promoter_loc->SetInt().SetFrom(0);
574 fake_promoter_loc->SetInt().SetTo(bh.GetInst_Length() - 1);
575
576 fake_promoter->SetLocation(*fake_promoter_loc);
577
578 main_clause.AddSubclause (CRef<CAutoDefFeatureClause>(new CAutoDefFakePromoterClause (master_bh,
579 *fake_promoter,
580 *fake_promoter_loc,
581 m_Options)));
582 }
583
584 // now create clauses for real features
585 CFeat_CI feat_ci(master_bh);
586
587 if (x_Is5SList(feat_ci)) {
588 return "5S ribosomal RNA gene region";
589 }
590
591 bool is_single_misc_feat = x_IsSingleMiscFeat(feat_ci);
592
593 while (feat_ci)
594 {
595 vector<CRef<CAutoDefFeatureClause > > fclause = FeatureClauseFactory(bh, feat_ci->GetOriginalFeature(), feat_ci->GetMappedFeature().GetLocation(), m_Options, is_single_misc_feat);
596 for (auto it : fclause) {
597 if (it &&
598 (it->IsRecognizedFeature() ||
599 (m_Options.GetKeepRepeatRegion() &&
600 (it->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_repeat_region ||
601 it->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_LTR)))) {
602 if (it->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_exon ||
603 it->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_intron) {
604 it->Label(m_Options.GetSuppressAlleles());
605 }
606 main_clause.AddSubclause(it);
607 }
608 }
609
610 ++feat_ci;
611 }
612
613 // optionally remove misc_feature subfeatures
614 if (m_Options.GetSuppressMiscFeatureSubfeatures()) {
615 main_clause.RemoveFeaturesUnderType(CSeqFeatData::eSubtype_misc_feature);
616 }
617
618 // Group alt-spliced exons first, so that they will be associated with the correct genes and mRNAs
619 main_clause.GroupAltSplicedExons(bh);
620 main_clause.RemoveDeletedSubclauses();
621
622 // Add mRNAs to other clauses
623 main_clause.GroupmRNAs(m_Options.GetSuppressAlleles());
624 main_clause.RemoveDeletedSubclauses();
625
626 // Add genes to clauses that need them for descriptions/products
627 main_clause.GroupGenes(m_Options.GetSuppressAlleles());
628
629 main_clause.GroupSegmentedCDSs(m_Options.GetSuppressAlleles());
630 main_clause.RemoveDeletedSubclauses();
631
632 // Group all features
633 main_clause.GroupClauses(m_Options.GetGeneClusterOppStrand());
634 main_clause.RemoveDeletedSubclauses();
635
636 // now that features have been grouped, can expand lists of spliced exons
637 main_clause.ExpandExonLists();
638
639 // assign product names for features associated with genes that have products
640 main_clause.AssignGeneProductNames(&main_clause, m_Options.GetSuppressAlleles());
641
642 // reverse the order of clauses for minus-strand CDSfeatures
643 main_clause.ReverseCDSClauseLists();
644
645 main_clause.Label(m_Options.GetSuppressAlleles());
646 main_clause.CountUnknownGenes();
647 main_clause.RemoveDeletedSubclauses();
648
649 x_RemoveOptionalFeatures(&main_clause, bh);
650
651 // if a gene is listed as part of another clause, they do not need
652 // to be listed as there own clause
653 main_clause.RemoveGenesMentionedElsewhere();
654 main_clause.RemoveDeletedSubclauses();
655
656 if (m_Options.GetSuppressMobileElementSubfeatures()) {
657 main_clause.SuppressMobileElementAndInsertionSequenceSubfeatures();
658 }
659
660 main_clause.Label(m_Options.GetSuppressAlleles());
661
662 if (!m_Options.GetSuppressFeatureAltSplice()) {
663 // GB-8927
664 // no alternate splice calculations for viruses
665 bool is_virus = false;
666 CSeqdesc_CI src(bh, CSeqdesc::e_Source);
667 if (src && src->GetSource().IsSetOrg() && src->GetSource().GetOrg().IsSetDivision()
668 && NStr::EqualNocase(src->GetSource().GetOrg().GetDivision(), "VRL")) {
669 is_virus = true;
670 }
671
672 if (!is_virus) {
673 main_clause.FindAltSplices(m_Options.GetSuppressAlleles());
674 main_clause.RemoveDeletedSubclauses();
675 }
676 }
677
678 main_clause.ConsolidateRepeatedClauses(m_Options.GetSuppressAlleles());
679 main_clause.RemoveDeletedSubclauses();
680
681 main_clause.GroupConsecutiveExons(bh);
682 main_clause.RemoveDeletedSubclauses();
683
684 main_clause.Label(m_Options.GetSuppressAlleles());
685
686 return main_clause.ListClauses(true, false, m_Options.GetSuppressAlleles());
687 }
688
689
OrganelleByGenome(unsigned int genome_val)690 string OrganelleByGenome(unsigned int genome_val)
691 {
692 string organelle;
693 switch (genome_val) {
694 case CBioSource::eGenome_macronuclear:
695 organelle = "macronuclear";
696 break;
697 case CBioSource::eGenome_nucleomorph:
698 organelle = "nucleomorph";
699 break;
700 case CBioSource::eGenome_mitochondrion:
701 organelle = "mitochondrion";
702 break;
703 case CBioSource::eGenome_apicoplast:
704 organelle = "apicoplast";
705 break;
706 case CBioSource::eGenome_chloroplast:
707 organelle = "chloroplast";
708 break;
709 case CBioSource::eGenome_chromoplast:
710 organelle = "chromoplast";
711 break;
712 case CBioSource::eGenome_kinetoplast:
713 organelle = "kinetoplast";
714 break;
715 case CBioSource::eGenome_plastid:
716 organelle = "plastid";
717 break;
718 case CBioSource::eGenome_cyanelle:
719 organelle = "cyanelle";
720 break;
721 case CBioSource::eGenome_leucoplast:
722 organelle = "leucoplast";
723 break;
724 case CBioSource::eGenome_proplastid:
725 organelle = "proplastid";
726 break;
727 case CBioSource::eGenome_hydrogenosome:
728 organelle = "hydrogenosome";
729 break;
730 }
731 return organelle;
732 }
733
734
s_GetProductFlagFromCDSProductNames(CBioseq_Handle bh)735 static unsigned int s_GetProductFlagFromCDSProductNames (CBioseq_Handle bh)
736 {
737 unsigned int product_flag = CBioSource::eGenome_unknown;
738 string::size_type pos;
739
740 SAnnotSelector sel(CSeqFeatData::eSubtype_cdregion);
741 CFeat_CI feat_ci(bh, sel);
742 while (feat_ci && product_flag == CBioSource::eGenome_unknown) {
743 if (feat_ci->IsSetProduct()) {
744 string label;
745 CConstRef<CSeq_feat> prot
746 = sequence::GetBestOverlappingFeat(feat_ci->GetProduct(),
747 CSeqFeatData::e_Prot,
748 sequence::eOverlap_Simple,
749 bh.GetScope());
750 if (prot) {
751 feature::GetLabel(*prot, &label, feature::fFGL_Content);
752 if (NStr::Find(label, "mitochondrion") != NCBI_NS_STD::string::npos
753 || NStr::Find(label, "mitochondrial") != NCBI_NS_STD::string::npos) {
754 product_flag = CBioSource::eGenome_mitochondrion;
755 } else if (NStr::Find(label, "apicoplast") != NCBI_NS_STD::string::npos) {
756 product_flag = CBioSource::eGenome_apicoplast;
757 } else if (NStr::Find(label, "chloroplast") != NCBI_NS_STD::string::npos) {
758 product_flag = CBioSource::eGenome_chloroplast;
759 } else if (NStr::Find(label, "chromoplast") != NCBI_NS_STD::string::npos) {
760 product_flag = CBioSource::eGenome_chromoplast;
761 } else if (NStr::Find(label, "kinetoplast") != NCBI_NS_STD::string::npos) {
762 product_flag = CBioSource::eGenome_kinetoplast;
763 } else if (NStr::Find(label, "proplastid") != NCBI_NS_STD::string::npos) {
764 product_flag = CBioSource::eGenome_proplastid;
765 } else if ((pos = NStr::Find(label, "plastid")) != NCBI_NS_STD::string::npos
766 && (pos == 0 || isspace(label.c_str()[pos]))) {
767 product_flag = CBioSource::eGenome_plastid;
768 } else if (NStr::Find(label, "cyanelle") != NCBI_NS_STD::string::npos) {
769 product_flag = CBioSource::eGenome_cyanelle;
770 } else if (NStr::Find(label, "leucoplast") != NCBI_NS_STD::string::npos) {
771 product_flag = CBioSource::eGenome_leucoplast;
772 }
773 }
774 }
775 ++feat_ci;
776 }
777 return product_flag;
778 }
779
780
// Build the "; <organelle> ..." suffix that follows the feature clauses.
//
// Priority:
//   1. the genome of the first source descriptor      -> "; <organelle>"
//   2. a product flag (derived from CDS product names when the options ask
//      for that, otherwise taken from the options)
//        -> "; nuclear gene(s) for <organelle> product(s)", or, when the
//           flag maps to no organelle word, "; macronuclear" /
//           "; micronuclear" if the source carries that note
//   3. the nuclear-copy flag from the options
//        -> "; nuclear copy of <organelle> gene"
// "mitochondrion" is always written as "mitochondrial".  The plural form is
// used when feature_clauses contains "genes" or at least two occurrences of
// "gene".  An empty string is returned when nothing applies.
string CAutoDef::x_GetFeatureClauseProductEnding(const string& feature_clauses,
                                                 CBioseq_Handle bh)
{
    unsigned int product_flag_to_use;
    unsigned int nuclear_copy_flag = CBioSource::eGenome_unknown;
    if (m_Options.GetSpecifyNuclearProduct()) {
        product_flag_to_use = s_GetProductFlagFromCDSProductNames (bh);
    } else {
        product_flag_to_use = m_Options.GetProductFlag();
        nuclear_copy_flag = m_Options.GetNuclearCopyFlag();
    }

    // singular or plural wording?
    bool pluralize = NStr::Find(feature_clauses, "genes") != NCBI_NS_STD::string::npos;
    if (!pluralize) {
        const string::size_type first_gene = NStr::Find(feature_clauses, "gene");
        pluralize = first_gene != NCBI_NS_STD::string::npos
            && NStr::Find (feature_clauses, "gene", first_gene + 4) != NCBI_NS_STD::string::npos;
    }

    // genome and macro/micronuclear note come from the first source only
    unsigned int genome_val = CBioSource::eGenome_unknown;
    string genome_from_mods;
    CSeqdesc_CI src_desc(bh, CSeqdesc::e_Source);
    if (src_desc) {
        const CBioSource& bsrc = src_desc->GetSource();
        if (bsrc.CanGetGenome()) {
            genome_val = bsrc.GetGenome();
        }
        if (bsrc.CanGetSubtype()) {
            ITERATE (CBioSource::TSubtype, sub_it, bsrc.GetSubtype()) {
                if ((*sub_it)->GetSubtype() != CSubSource::eSubtype_other) {
                    continue;
                }
                string note = (*sub_it)->GetName();
                if (NStr::Equal(note, "macronuclear") || NStr::Equal(note, "micronuclear")) {
                    genome_from_mods = note;
                }
            }
        }
    }

    // 1. organelle taken straight from the source genome
    string organelle = OrganelleByGenome(genome_val);
    if (NStr::Equal(organelle, "mitochondrion")) {
        organelle = "mitochondrial";
    }
    if (!NStr::IsBlank(organelle)) {
        return "; " + organelle;
    }

    // 2. nuclear gene for an organelle product
    if (product_flag_to_use != CBioSource::eGenome_unknown) {
        string product_organelle = OrganelleByGenome(product_flag_to_use);
        if (NStr::IsBlank(product_organelle)) {
            if (!NStr::IsBlank(genome_from_mods)) {
                return "; " + genome_from_mods;
            }
            return product_organelle;
        }
        if (NStr::Equal(product_organelle, "mitochondrion")) {
            product_organelle = "mitochondrial";
        }
        if (pluralize) {
            return "; nuclear genes for " + product_organelle + " products";
        }
        return "; nuclear gene for " + product_organelle + " product";
    }

    // 3. nuclear copy of an organelle gene
    if (nuclear_copy_flag != CBioSource::eGenome_unknown) {
        string copied_organelle = OrganelleByGenome(nuclear_copy_flag);
        if (!NStr::IsBlank(copied_organelle)) {
            if (NStr::Equal(copied_organelle, "mitochondrion")) {
                copied_organelle = "mitochondrial";
            }
            return "; nuclear copy of " + copied_organelle + " gene";
        }
        return copied_organelle;
    }

    return organelle;
}
860
861
x_GetNonFeatureListEnding()862 string CAutoDef::x_GetNonFeatureListEnding()
863 {
864 string end;
865 switch (m_Options.GetFeatureListType())
866 {
867 case CAutoDefOptions::eCompleteSequence:
868 end = ", complete sequence.";
869 break;
870 case CAutoDefOptions::eCompleteGenome:
871 end = ", complete genome.";
872 break;
873 case CAutoDefOptions::ePartialSequence:
874 end = ", partial sequence.";
875 break;
876 case CAutoDefOptions::ePartialGenome:
877 end = ", partial genome.";
878 break;
879 case CAutoDefOptions::eSequence:
880 case CAutoDefOptions::eListAllFeatures:
881 end = " sequence.";
882 break;
883 case CAutoDefOptions::eWholeGenomeShotgunSequence:
884 end = " whole genome shotgun sequence.";
885 break;
886 default:
887 break;
888 }
889 return end;
890 }
891
892
IsBioseqmRNA(CBioseq_Handle bsh)893 bool IsBioseqmRNA(CBioseq_Handle bsh)
894 {
895 bool is_mRNA = false;
896 for (CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo); desc && !is_mRNA; ++desc) {
897 if (desc->GetMolinfo().CanGetBiomol()
898 && desc->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
899 is_mRNA = true;
900 }
901 }
902 return is_mRNA;
903 }
904
905
IsInGenProdSet(CBioseq_Handle bh)906 bool IsInGenProdSet(CBioseq_Handle bh)
907 {
908 CBioseq_set_Handle parent = bh.GetParentBioseq_set();
909 while (parent) {
910 if (parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_gen_prod_set) {
911 return true;
912 }
913 parent = parent.GetParentBioseq_set();
914 }
915 return false;
916 }
917
918
x_GetOneNonFeatureClause(CBioseq_Handle bh,unsigned int genome_val)919 string CAutoDef::x_GetOneNonFeatureClause(CBioseq_Handle bh, unsigned int genome_val)
920 {
921 string feature_clauses;
922 string organelle;
923
924 if (m_Options.GetFeatureListType() != CAutoDefOptions::eSequence
925 || genome_val == CBioSource::eGenome_apicoplast
926 || genome_val == CBioSource::eGenome_chloroplast
927 || genome_val == CBioSource::eGenome_kinetoplast
928 || genome_val == CBioSource::eGenome_leucoplast
929 || genome_val == CBioSource::eGenome_mitochondrion
930 || genome_val == CBioSource::eGenome_plastid) {
931 organelle = OrganelleByGenome(genome_val);
932 }
933 if (!NStr::IsBlank(organelle)) {
934 feature_clauses = " " + organelle;
935 } else if (m_Options.GetFeatureListType() == CAutoDefOptions::eSequence) {
936 string biomol;
937 CSeqdesc_CI mi(bh, CSeqdesc::e_Molinfo);
938 if (mi && mi->GetMolinfo().IsSetBiomol()) {
939 if (mi->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
940 biomol = "mRNA";
941 } else {
942 biomol = CMolInfo::GetBiomolName(mi->GetMolinfo().GetBiomol());
943 }
944 }
945 if (!NStr::IsBlank(biomol)) {
946 feature_clauses = " " + biomol;
947 }
948 }
949
950 feature_clauses += x_GetNonFeatureListEnding();
951 return feature_clauses;
952 }
953
954
GetOneFeatureClauseList(CBioseq_Handle bh,unsigned int genome_val)955 string CAutoDef::GetOneFeatureClauseList(CBioseq_Handle bh, unsigned int genome_val)
956 {
957 string feature_clauses;
958 if (m_Options.GetFeatureListType() == CAutoDefOptions::eListAllFeatures ||
959 (IsBioseqmRNA(bh) && IsInGenProdSet(bh))) {
960 feature_clauses = x_GetFeatureClauses(bh);
961 if (NStr::IsBlank(feature_clauses)) {
962 feature_clauses = x_GetOneNonFeatureClause(bh, genome_val);
963 } else {
964 feature_clauses = " " + feature_clauses;
965 string ending = x_GetFeatureClauseProductEnding(feature_clauses, bh);
966 if (m_Options.GetAltSpliceFlag()) {
967 if (NStr::IsBlank(ending)) {
968 ending = "; alternatively spliced";
969 } else {
970 ending += ", alternatively spliced";
971 }
972 }
973 feature_clauses += ending;
974 if (NStr::IsBlank(feature_clauses)) {
975 feature_clauses = ".";
976 } else {
977 feature_clauses += ".";
978 }
979 }
980 } else {
981 feature_clauses = x_GetOneNonFeatureClause(bh, genome_val);
982 }
983 return feature_clauses;
984 }
985
986
GetKeywordPrefix(CBioseq_Handle bh)987 string CAutoDef::GetKeywordPrefix(CBioseq_Handle bh)
988 {
989 string keyword;
990
991 CSeqdesc_CI gb(bh, CSeqdesc::e_Genbank);
992 if (gb) {
993 if (gb->GetGenbank().IsSetKeywords()) {
994 ITERATE(CGB_block::TKeywords, it, gb->GetGenbank().GetKeywords()) {
995 if (NStr::EqualNocase(*it, "TPA:inferential")) {
996 keyword = "TPA_inf: ";
997 break;
998 } else if (NStr::EqualNocase(*it, "TPA:experimental")) {
999 keyword = "TPA_exp: ";
1000 break;
1001 }
1002 }
1003 }
1004 } else {
1005 CSeqdesc_CI mi(bh, CSeqdesc::e_Molinfo);
1006 if (mi && mi->GetMolinfo().IsSetTech() && mi->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
1007 keyword = "TSA: ";
1008 }
1009 }
1010 return keyword;
1011 }
1012
1013
GetOneDefLine(CAutoDefModifierCombo * mod_combo,const CBioseq_Handle & bh)1014 string CAutoDef::GetOneDefLine(CAutoDefModifierCombo *mod_combo, const CBioseq_Handle& bh)
1015 {
1016 // for protein sequences, use sequence::GetTitle
1017 if (bh.CanGetInst() && bh.GetInst().CanGetMol() && bh.GetInst().GetMol() == CSeq_inst::eMol_aa) {
1018 return sequence::CDeflineGenerator()
1019 .GenerateDefline(bh,
1020 sequence::CDeflineGenerator::fIgnoreExisting |
1021 sequence::CDeflineGenerator::fAllProteinNames);
1022 }
1023 string org_desc = "Unknown organism";
1024 unsigned int genome_val = CBioSource::eGenome_unknown;
1025 mod_combo->InitOptions(m_Options);
1026
1027 for (CSeqdesc_CI dit(bh, CSeqdesc::e_Source); dit; ++dit) {
1028 const CBioSource& bsrc = dit->GetSource();
1029 org_desc = mod_combo->GetSourceDescriptionString(bsrc);
1030 if (bsrc.CanGetGenome()) {
1031 genome_val = bsrc.GetGenome();
1032 }
1033 break;
1034 }
1035 string feature_clauses = GetOneFeatureClauseList(bh, genome_val);
1036
1037 if (org_desc.length() > 0 && isalpha(org_desc.c_str()[0])) {
1038 string first_letter = org_desc.substr(0, 1);
1039 string remainder = org_desc.substr(1);
1040 NStr::ToUpper(first_letter);
1041 org_desc = first_letter + remainder;
1042 }
1043
1044 string keyword = GetKeywordPrefix(bh);
1045
1046 return keyword + org_desc + feature_clauses;
1047 }
1048
1049
1050 // use internal settings to create mod combo
GetOneDefLine(const CBioseq_Handle & bh)1051 string CAutoDef::GetOneDefLine(const CBioseq_Handle& bh)
1052 {
1053 // for protein sequences, use sequence::GetTitle
1054 if (bh.CanGetInst() && bh.GetInst().CanGetMol() && bh.GetInst().GetMol() == CSeq_inst::eMol_aa) {
1055 return sequence::CDeflineGenerator()
1056 .GenerateDefline(bh,
1057 sequence::CDeflineGenerator::fIgnoreExisting);
1058 }
1059 string org_desc = "Unknown organism";
1060 unsigned int genome_val = CBioSource::eGenome_unknown;
1061
1062 CRef<CAutoDefModifierCombo> mod_combo(GetEmptyCombo());
1063 mod_combo->InitFromOptions(m_Options);
1064
1065 for (CSeqdesc_CI dit(bh, CSeqdesc::e_Source); dit; ++dit) {
1066 const CBioSource& bsrc = dit->GetSource();
1067 org_desc = mod_combo->GetSourceDescriptionString(bsrc);
1068 if (bsrc.CanGetGenome()) {
1069 genome_val = bsrc.GetGenome();
1070 }
1071 break;
1072 }
1073 string feature_clauses = GetOneFeatureClauseList(bh, genome_val);
1074
1075 if (org_desc.length() > 0 && isalpha(org_desc.c_str()[0])) {
1076 string first_letter = org_desc.substr(0, 1);
1077 string remainder = org_desc.substr(1);
1078 NStr::ToUpper(first_letter);
1079 org_desc = first_letter + remainder;
1080 }
1081
1082 string keyword = GetKeywordPrefix(bh);
1083
1084 return keyword + org_desc + feature_clauses;
1085 }
1086
1087
GetAvailableModifiers(CAutoDef::TAvailableModifierSet & mod_set)1088 void CAutoDef::GetAvailableModifiers(CAutoDef::TAvailableModifierSet &mod_set)
1089 {
1090 mod_set.clear();
1091 CAutoDefSourceDescription::TAvailableModifierVector modifier_list;
1092 modifier_list.clear();
1093 m_OrigModCombo.GetAvailableModifiers (modifier_list);
1094 for (unsigned int k = 0; k < modifier_list.size(); k++) {
1095 mod_set.insert(CAutoDefAvailableModifier(modifier_list[k]));
1096 }
1097 }
1098
1099
SetOptionsObject(const CUser_object & user)1100 void CAutoDef::SetOptionsObject(const CUser_object& user)
1101 {
1102 m_Options.InitFromUserObject(user);
1103 }
1104
1105
1106 //starting here, remove when separating autodef from taxonomy options
s_GetOptionsForSet(CBioseq_set_Handle set)1107 CConstRef<CUser_object> s_GetOptionsForSet(CBioseq_set_Handle set)
1108 {
1109 CConstRef<CUser_object> options(NULL);
1110 CBioseq_CI b(set, CSeq_inst::eMol_na);
1111 while (b && !options) {
1112 CSeqdesc_CI desc(*b, CSeqdesc::e_User);
1113 while (desc && desc->GetUser().GetObjectType() != CUser_object::eObjectType_AutodefOptions) {
1114 ++desc;
1115 }
1116 if (desc) {
1117 options.Reset(&(desc->GetUser()));
1118 }
1119 }
1120 return options;
1121 }
1122
1123
RegenerateDefLine(CBioseq_Handle bh)1124 string CAutoDef::RegenerateDefLine(CBioseq_Handle bh)
1125 {
1126 string defline;
1127 if (bh.IsAa()) {
1128 return kEmptyStr;
1129 }
1130 CSeqdesc_CI desc(bh, CSeqdesc::e_User);
1131 while (desc && desc->GetUser().GetObjectType() != CUser_object::eObjectType_AutodefOptions) {
1132 ++desc;
1133 }
1134 if (desc) {
1135 CAutoDef autodef;
1136 autodef.SetOptionsObject(desc->GetUser());
1137 CAutoDefModifierCombo mod_combo;
1138 CAutoDefOptions options;
1139 options.InitFromUserObject(desc->GetUser());
1140 mod_combo.SetOptions(options);
1141 defline = autodef.GetOneDefLine(&mod_combo, bh);
1142 }
1143 return defline;
1144 }
1145
1146
RegenerateSequenceDefLines(CSeq_entry_Handle se)1147 bool CAutoDef::RegenerateSequenceDefLines(CSeq_entry_Handle se)
1148 {
1149 bool any = false;
1150 CBioseq_CI b_iter(se);
1151 for (; b_iter; ++b_iter) {
1152 if (b_iter->IsAa()) {
1153 continue;
1154 }
1155 CSeqdesc_CI desc(*b_iter, CSeqdesc::e_User);
1156 while (desc && desc->GetUser().GetObjectType() != CUser_object::eObjectType_AutodefOptions) {
1157 ++desc;
1158 }
1159 if (desc) {
1160 string defline = RegenerateDefLine(*b_iter);
1161
1162 bool found_existing = false;
1163 CBioseq_EditHandle beh(*b_iter);
1164 NON_CONST_ITERATE(CBioseq_EditHandle::TDescr::Tdata, it, beh.SetDescr().Set()) {
1165 if ((*it)->IsTitle()) {
1166 if (!NStr::Equal((*it)->GetTitle(), defline)) {
1167 (*it)->SetTitle(defline);
1168 any = true;
1169 }
1170 found_existing = true;
1171 break;
1172 }
1173 }
1174 if (!found_existing) {
1175 CRef<CSeqdesc> new_desc(new CSeqdesc());
1176 new_desc->SetTitle(defline);
1177 beh.SetDescr().Set().push_back(new_desc);
1178 any = true;
1179 }
1180 }
1181 }
1182 return any;
1183 }
1184
1185
x_IsHumanSTR(const CUser_object & obj)1186 bool CAutoDef::x_IsHumanSTR(const CUser_object& obj)
1187 {
1188 if (obj.GetObjectType() != CUser_object::eObjectType_StructuredComment) {
1189 return false;
1190 }
1191 if (!obj.IsSetData()) {
1192 return false;
1193 }
1194 ITERATE(CUser_object::TData, f, obj.GetData()) {
1195 if ((*f)->IsSetLabel() && (*f)->GetLabel().IsStr() &&
1196 NStr::EqualNocase((*f)->GetLabel().GetStr(), "StructuredCommentPrefix") &&
1197 (*f)->IsSetData() && (*f)->GetData().IsStr()) {
1198 if (NStr::EqualNocase((*f)->GetData().GetStr(), "##HumanSTR-START##")) {
1199 return true;
1200 } else {
1201 return false;
1202 }
1203 }
1204 }
1205 return false;
1206 }
1207
1208
x_GetHumanSTRFeatureClauses(CBioseq_Handle bh,const CUser_object & comment)1209 string CAutoDef::x_GetHumanSTRFeatureClauses(CBioseq_Handle bh, const CUser_object& comment)
1210 {
1211 string locus_name;
1212 string allele;
1213 string repeat;
1214 string assay;
1215
1216 if (comment.IsSetData()) {
1217 ITERATE(CUser_object::TData, it, comment.GetData()) {
1218 if ((*it)->IsSetData() && (*it)->GetData().IsStr() &&
1219 (*it)->IsSetLabel() && (*it)->GetLabel().IsStr()) {
1220 const string& label = (*it)->GetLabel().GetStr();
1221 if (NStr::EqualNocase(label, "STR locus name")) {
1222 locus_name = (*it)->GetData().GetStr();
1223 } else if (NStr::EqualNocase(label, "Length-based allele")) {
1224 allele = (*it)->GetData().GetStr();
1225 } else if (NStr::EqualNocase(label, "Bracketed repeat")) {
1226 repeat = (*it)->GetData().GetStr();
1227 } else if (NStr::EqualNocase(label, "Sequencing assay code")) {
1228 assay = (*it)->GetData().GetStr();
1229 }
1230 }
1231 }
1232 }
1233
1234 string clause = "microsatellite " + locus_name + " " + allele + " " + repeat;
1235 CFeat_CI f(bh, CSeqFeatData::eSubtype_variation);
1236 while (f) {
1237 if (f->IsSetDbxref()) {
1238 ITERATE(CSeq_feat::TDbxref, db, f->GetDbxref()) {
1239 if ((*db)->IsSetDb() && NStr::Equal((*db)->GetDb(), "dbSNP") &&
1240 (*db)->IsSetTag()) {
1241 if ((*db)->GetTag().IsStr()) {
1242 clause += " " + (*db)->GetTag().GetStr();
1243 } else if ((*db)->GetTag().IsId()) {
1244 clause += " " + NStr::NumericToString((*db)->GetTag().GetId());
1245 }
1246 }
1247 }
1248 }
1249 ++f;
1250 }
1251 if (assay != "") {
1252 clause += " " + assay;
1253 }
1254 clause += " sequence";
1255 return clause;
1256 }
1257
1258
s_ChooseModInModList(bool is_org_mod,int subtype,bool require_all,CAutoDefSourceDescription::TAvailableModifierVector & modifiers)1259 bool s_ChooseModInModList(bool is_org_mod, int subtype, bool require_all, CAutoDefSourceDescription::TAvailableModifierVector& modifiers)
1260 {
1261 bool rval = false;
1262 for (auto & modifier : modifiers) {
1263 if (modifier.IsOrgMod() && is_org_mod) {
1264 if (modifier.GetOrgModType() == subtype) {
1265 if (modifier.AllPresent()) {
1266 rval = true;
1267 }
1268 else if (modifier.AnyPresent() && !require_all) {
1269 rval = true;
1270 }
1271 if (rval) {
1272 modifier.SetRequested(true);
1273 }
1274 break;
1275 }
1276 }
1277 else if (!modifier.IsOrgMod() && !is_org_mod) {
1278 if (modifier.GetSubSourceType() == subtype) {
1279 if (modifier.AllPresent()) {
1280 rval = true;
1281 }
1282 else if (modifier.AnyPresent() && !require_all) {
1283 rval = true;
1284 }
1285 if (rval) {
1286 modifier.SetRequested(true);
1287 }
1288 break;
1289 }
1290 }
1291 }
1292 return rval;
1293 }
1294
1295
CreateIDOptions(CSeq_entry_Handle seh)1296 CRef<CUser_object> CAutoDef::CreateIDOptions(CSeq_entry_Handle seh)
1297 {
1298 CAutoDef ad;
1299 ad.AddSources(seh);
1300
1301 CRef<CAutoDefModifierCombo> src_combo = ad.FindBestModifierCombo();
1302 CAutoDefSourceDescription::TAvailableModifierVector modifiers;
1303 src_combo->GetAvailableModifiers(modifiers);
1304
1305 static int subtypes[] = { COrgMod::eSubtype_strain,
1306 CSubSource::eSubtype_clone,
1307 COrgMod::eSubtype_isolate,
1308 CSubSource::eSubtype_haplotype,
1309 COrgMod::eSubtype_cultivar,
1310 COrgMod::eSubtype_ecotype,
1311 COrgMod::eSubtype_breed,
1312 COrgMod::eSubtype_specimen_voucher,
1313 COrgMod::eSubtype_culture_collection,
1314 COrgMod::eSubtype_bio_material };
1315 static bool is_orgmod[] = { true, false, true, false, true, true, true, true, true, true };
1316 static int num_subtypes = sizeof(subtypes) / sizeof(int);
1317
1318
1319 bool found = false;
1320 // first look for best identifier found in all
1321 for (int i = 0; i < num_subtypes && !found; i++) {
1322 found = s_ChooseModInModList(is_orgmod[i], subtypes[i], true, modifiers);
1323 }
1324 if (!found) {
1325 // if not found in all, use best identifier found in some
1326 for (int i = 0; i < num_subtypes && !found; i++) {
1327 found = s_ChooseModInModList(is_orgmod[i], subtypes[i], false, modifiers);
1328 }
1329 }
1330 if (!src_combo->AreFeatureClausesUnique()) {
1331 // use best
1332 for (auto &modifier : modifiers) {
1333 if (modifier.AnyPresent()) {
1334 if (modifier.IsOrgMod()) {
1335 if (src_combo->HasOrgMod(modifier.GetOrgModType())) {
1336 modifier.SetRequested(true);
1337 }
1338 }
1339 else if (src_combo->HasSubSource(modifier.GetSubSourceType())) {
1340 modifier.SetRequested(true);
1341 }
1342 }
1343 }
1344 }
1345
1346 CRef<CUser_object> user = ad.GetOptionsObject();
1347 CAutoDefOptions options;
1348 options.InitFromUserObject(*user);
1349 for(const auto &it : modifiers) {
1350 if (it.IsRequested()) {
1351 if (it.IsOrgMod()) {
1352 options.AddOrgMod(it.GetOrgModType());
1353 } else {
1354 options.AddSubSource(it.GetSubSourceType());
1355 }
1356 }
1357 }
1358 user = options.MakeUserObject();
1359 return user;
1360 }
1361
1362
1363 END_SCOPE(objects)
1364 END_NCBI_SCOPE
1365