1 /*  $Id: autodef_source_desc.cpp 629259 2021-04-13 13:28:40Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Colleen Bollin
27 *
28 * File Description:
29 *   Generate unique definition lines for a set of sequences using organism
30 *   descriptions and feature clauses.
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <objmgr/util/autodef_source_desc.hpp>
35 #include <corelib/ncbimisc.hpp>
36 #include <objmgr/seqdesc_ci.hpp>
37 #include <objmgr/bioseq_ci.hpp>
38 #include <objmgr/feat_ci.hpp>
39 #include <objmgr/util/feature.hpp>
40 
41 #include <objects/seq/Seq_descr.hpp>
42 #include <objects/seq/Seqdesc.hpp>
43 #include <objects/seq/Bioseq.hpp>
44 
45 #include <serial/iterator.hpp>
46 
47 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)48 BEGIN_SCOPE(objects)
49 
50 CAutoDefSourceDescription::CAutoDefSourceDescription(const CBioSource& bs, string feature_clauses) : m_BS(bs)
51 {
52     // consider feature clauses when looking for uniqueness
53     m_FeatureClauses = feature_clauses;
54 
55     if (bs.CanGetOrg() && bs.GetOrg().CanGetTaxname()) {
56         m_DescStrings.push_back (bs.GetOrg().GetTaxname());
57     }
58     if (bs.CanGetOrg() && bs.GetOrg().CanGetOrgname() && bs.GetOrg().GetOrgname().CanGetMod()) {
59         ITERATE (COrgName::TMod, modI, bs.GetOrg().GetOrgname().GetMod()) {
60             m_Modifiers.push_back (CAutoDefSourceModifierInfo(true, (*modI)->GetSubtype(), (*modI)->GetSubname()));
61         }
62     }
63     ITERATE (CBioSource::TSubtype, subSrcI, bs.GetSubtype()) {
64         m_Modifiers.push_back (CAutoDefSourceModifierInfo(false, (*subSrcI)->GetSubtype(), (*subSrcI)->GetName()));
65     }
66     std::sort (m_Modifiers.begin(), m_Modifiers.end());
67 }
68 
69 
CAutoDefSourceDescription(CAutoDefSourceDescription * other)70 CAutoDefSourceDescription::CAutoDefSourceDescription(CAutoDefSourceDescription *other) : m_BS(other->GetBioSource())
71 {
72     // copy strings
73     ITERATE (TDescString, string_it, other->GetStrings()) {
74         m_DescStrings.push_back (*string_it);
75     }
76     // copy remaining modifier list
77     ITERATE (TModifierVector, it, other->GetModifiers()) {
78         m_Modifiers.push_back (CAutoDefSourceModifierInfo(*it));
79     }
80     // copy feature clauses
81     m_FeatureClauses = other->m_FeatureClauses;
82 }
83 
84 
~CAutoDefSourceDescription()85 CAutoDefSourceDescription::~CAutoDefSourceDescription()
86 {
87 }
88 
GetBioSource() const89 const CBioSource& CAutoDefSourceDescription::GetBioSource() const
90 {
91     return m_BS;
92 }
93 
94 
AddQual(bool isOrgMod,int subtype,bool keepAfterSemicolon)95 bool CAutoDefSourceDescription::AddQual (bool isOrgMod, int subtype, bool keepAfterSemicolon)
96 {
97     bool rval = false;
98     TModifierVector::iterator it;
99 
100     it = m_Modifiers.begin();
101     while (it != m_Modifiers.end()) {
102         if (isOrgMod) {
103             if (it->IsOrgMod() && it->GetSubtype() == subtype) {
104 				string val = it->GetValue();
105 				if (!keepAfterSemicolon) {
106                     string::size_type end = NStr::Find(val, ";");
107                     if (end != NCBI_NS_STD::string::npos) {
108                         val = val.substr(0, end);
109 					}
110 				}
111                 m_DescStrings.push_back (val);
112                 it = m_Modifiers.erase(it);
113                 rval = true;
114             } else {
115                 ++it;
116             }
117         } else {
118             if (!it->IsOrgMod() && it->GetSubtype() == subtype) {
119 				string val = it->GetValue();
120 				if (!keepAfterSemicolon) {
121                     string::size_type end = NStr::Find(val, ";");
122                     if (end != NCBI_NS_STD::string::npos) {
123                         val = val.substr(0, end);
124 					}
125 				}
126                 m_DescStrings.push_back (val);
127                 it = m_Modifiers.erase(it);
128                 rval = true;
129             } else {
130                 ++it;
131             }
132         }
133     }
134     return rval;
135 }
136 
137 
RemoveQual(bool isOrgMod,int subtype)138 bool CAutoDefSourceDescription::RemoveQual (bool isOrgMod, int subtype)
139 {
140     bool rval = false;
141     TModifierVector::iterator it;
142 
143     it = m_Modifiers.begin();
144     while (it != m_Modifiers.end()) {
145         if (isOrgMod) {
146             if (it->IsOrgMod() && it->GetSubtype() == subtype) {
147                 it = m_Modifiers.erase(it);
148                 rval = true;
149             } else {
150                 ++it;
151             }
152         } else {
153             if (!it->IsOrgMod() && it->GetSubtype() == subtype) {
154                 it = m_Modifiers.erase(it);
155                 rval = true;
156             } else {
157                 ++it;
158             }
159         }
160     }
161     return rval;
162 }
163 
164 
Compare(const CAutoDefSourceDescription & s) const165 int CAutoDefSourceDescription::Compare(const CAutoDefSourceDescription& s) const
166 {
167     unsigned int k = 0;
168     int rval = 0;
169     TDescString::const_iterator s_it, this_it;
170 
171     s_it = s.GetStrings().begin();
172     this_it = GetStrings().begin();
173     while (s_it != s.GetStrings().end()
174            && this_it != GetStrings().end()
175            && rval == 0) {
176         rval = NStr::Compare (*this_it, *s_it);
177         k++;
178         ++s_it;
179         ++this_it;
180     }
181     if (rval == 0) {
182         if (k < s.GetStrings().size()) {
183             rval = -1;
184         } else if (k < m_DescStrings.size()) {
185             rval = 1;
186         }
187     }
188     if (rval == 0) {
189         rval = NStr::Compare (GetFeatureClauses(), s.GetFeatureClauses());
190     }
191     return rval;
192 }
193 
194 
GetComboDescription(IAutoDefCombo * mod_combo)195 string CAutoDefSourceDescription::GetComboDescription(IAutoDefCombo *mod_combo)
196 {
197     string desc;
198     if (mod_combo) {
199         return mod_combo->GetSourceDescriptionString(m_BS);
200     } else {
201         return m_BS.GetOrg().GetTaxname();
202     }
203 }
204 
205 
GetAvailableModifiers(TAvailableModifierVector & modifier_list)206 void CAutoDefSourceDescription::GetAvailableModifiers (TAvailableModifierVector &modifier_list)
207 {
208     unsigned int k;
209 
210     for (k = 0; k < modifier_list.size(); k++) {
211         bool found = false;
212         if (modifier_list[k].IsOrgMod()) {
213             if (m_BS.CanGetOrg() && m_BS.GetOrg().CanGetOrgname() && m_BS.GetOrg().GetOrgname().IsSetMod()) {
214                 ITERATE (COrgName::TMod, modI, m_BS.GetOrg().GetOrgname().GetMod()) {
215                     if ((*modI)->GetSubtype() == modifier_list[k].GetOrgModType()) {
216                         found = true;
217                         modifier_list[k].ValueFound((*modI)->GetSubname() );
218                     }
219                 }
220             }
221         } else {
222             // get subsource modifiers
223             if (m_BS.CanGetSubtype()) {
224                 ITERATE (CBioSource::TSubtype, subSrcI, m_BS.GetSubtype()) {
225                     if ((*subSrcI)->GetSubtype() == modifier_list[k].GetSubSourceType()) {
226                         found = true;
227                         modifier_list[k].ValueFound((*subSrcI)->GetName());
228                     }
229                 }
230             }
231         }
232         if (!found) {
233             modifier_list[k].ValueFound("");
234         }
235     }
236 }
237 
238 // tricky HIV records have an isolate and a clone
IsTrickyHIV()239 bool CAutoDefSourceDescription::IsTrickyHIV()
240 {
241     string tax_name = m_BS.GetOrg().GetTaxname();
242     if (!NStr::Equal(tax_name, "HIV-1") && !NStr::Equal(tax_name, "HIV-2")) {
243         return false;
244     }
245 
246     bool found = false;
247 
248     if (m_BS.CanGetSubtype()) {
249         ITERATE (CBioSource::TSubtype, subSrcI, m_BS.GetSubtype()) {
250             if ((*subSrcI)->GetSubtype() == CSubSource::eSubtype_clone) {
251                 found = true;
252             }
253         }
254     }
255     if (!found) {
256         return false;
257     }
258 
259     found = false;
260     if (m_BS.CanGetOrg() && m_BS.GetOrg().CanGetOrgname() && m_BS.GetOrg().GetOrgname().IsSetMod()) {
261         ITERATE (COrgName::TMod, modI, m_BS.GetOrg().GetOrgname().GetMod()) {
262             if ((*modI)->GetSubtype() == COrgMod::eSubtype_isolate) {
263                 found = true;
264             }
265         }
266     }
267     return found;
268 }
269 
270 
CAutoDefSourceModifierInfo(bool isOrgMod,int subtype,string value)271 CAutoDefSourceModifierInfo::CAutoDefSourceModifierInfo(bool isOrgMod, int subtype, string value)
272 {
273     m_IsOrgMod = isOrgMod;
274     m_Subtype = subtype;
275     m_Value = value;
276 }
277 
278 
// Copy constructor: takes the kind flag, subtype and value from other via
// its accessors.
CAutoDefSourceModifierInfo::CAutoDefSourceModifierInfo(const CAutoDefSourceModifierInfo &other)
    : m_IsOrgMod(other.IsOrgMod()),
      m_Subtype(other.GetSubtype()),
      m_Value(other.GetValue())
{
}
285 
286 
~CAutoDefSourceModifierInfo()287 CAutoDefSourceModifierInfo::~CAutoDefSourceModifierInfo()
288 {
289 }
290 
291 
GetRank() const292 unsigned int CAutoDefSourceModifierInfo::GetRank() const
293 {
294     if (m_IsOrgMod) {
295         if (m_Subtype == COrgMod::eSubtype_strain) {
296             return 3;
297         } else if (m_Subtype == COrgMod::eSubtype_isolate) {
298             return 5;
299         } else if (m_Subtype == COrgMod::eSubtype_cultivar) {
300             return 7;
301         } else if (m_Subtype == COrgMod::eSubtype_specimen_voucher) {
302             return 8;
303         } else if (m_Subtype == COrgMod::eSubtype_ecotype) {
304             return 9;
305         } else if (m_Subtype == COrgMod::eSubtype_type) {
306             return 10;
307         } else if (m_Subtype == COrgMod::eSubtype_serotype) {
308             return 11;
309         } else if (m_Subtype == COrgMod::eSubtype_authority) {
310             return 12;
311         } else if (m_Subtype == COrgMod::eSubtype_breed) {
312             return 13;
313         }
314     } else {
315         if (m_Subtype == CSubSource::eSubtype_transgenic) {
316             return 0;
317         } else if (m_Subtype == CSubSource::eSubtype_plasmid_name) {
318             return 1;
319          } else if (m_Subtype == CSubSource::eSubtype_endogenous_virus_name)  {
320             return 2;
321         } else if (m_Subtype == CSubSource::eSubtype_clone) {
322             return 4;
323         } else if (m_Subtype == CSubSource::eSubtype_haplotype) {
324             return 6;
325         }
326     }
327     return 50;
328 }
329 
330 
Compare(const CAutoDefSourceModifierInfo & mod) const331 int CAutoDefSourceModifierInfo::Compare(const CAutoDefSourceModifierInfo& mod) const
332 {
333     int rank1, rank2;
334 
335     rank1 = GetRank();
336     rank2 = mod.GetRank();
337 
338     if (rank1 < rank2) {
339         return -1;
340     } else if (rank1 > rank2) {
341         return 1;
342     } else if (IsOrgMod() && !mod.IsOrgMod()) {
343         // prefer subsource to orgmod qualifiers
344         return -1;
345     } else if (!IsOrgMod() && mod.IsOrgMod()) {
346         return 1;
347     } else if (IsOrgMod() && mod.IsOrgMod()) {
348         if (GetSubtype() == mod.GetSubtype()) {
349             return 0;
350         } else {
351             return (GetSubtype() < mod.GetSubtype() ? -1 : 1);
352         }
353     } else {
354         if (GetSubtype() == mod.GetSubtype()) {
355             return 0;
356         } else {
357             return (GetSubtype() < mod.GetSubtype() ? -1 : 1);
358         }
359     }
360 }
361 
362 
363 END_SCOPE(objects)
364 END_NCBI_SCOPE
365