1 /* $Id: autodef_source_desc.cpp 629259 2021-04-13 13:28:40Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin
27 *
28 * File Description:
29 * Generate unique definition lines for a set of sequences using organism
30 * descriptions and feature clauses.
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <objmgr/util/autodef_source_desc.hpp>
35 #include <corelib/ncbimisc.hpp>
36 #include <objmgr/seqdesc_ci.hpp>
37 #include <objmgr/bioseq_ci.hpp>
38 #include <objmgr/feat_ci.hpp>
39 #include <objmgr/util/feature.hpp>
40
41 #include <objects/seq/Seq_descr.hpp>
42 #include <objects/seq/Seqdesc.hpp>
43 #include <objects/seq/Bioseq.hpp>
44
45 #include <serial/iterator.hpp>
46
47 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)48 BEGIN_SCOPE(objects)
49
50 CAutoDefSourceDescription::CAutoDefSourceDescription(const CBioSource& bs, string feature_clauses) : m_BS(bs)
51 {
52 // consider feature clauses when looking for uniqueness
53 m_FeatureClauses = feature_clauses;
54
55 if (bs.CanGetOrg() && bs.GetOrg().CanGetTaxname()) {
56 m_DescStrings.push_back (bs.GetOrg().GetTaxname());
57 }
58 if (bs.CanGetOrg() && bs.GetOrg().CanGetOrgname() && bs.GetOrg().GetOrgname().CanGetMod()) {
59 ITERATE (COrgName::TMod, modI, bs.GetOrg().GetOrgname().GetMod()) {
60 m_Modifiers.push_back (CAutoDefSourceModifierInfo(true, (*modI)->GetSubtype(), (*modI)->GetSubname()));
61 }
62 }
63 ITERATE (CBioSource::TSubtype, subSrcI, bs.GetSubtype()) {
64 m_Modifiers.push_back (CAutoDefSourceModifierInfo(false, (*subSrcI)->GetSubtype(), (*subSrcI)->GetName()));
65 }
66 std::sort (m_Modifiers.begin(), m_Modifiers.end());
67 }
68
69
CAutoDefSourceDescription(CAutoDefSourceDescription * other)70 CAutoDefSourceDescription::CAutoDefSourceDescription(CAutoDefSourceDescription *other) : m_BS(other->GetBioSource())
71 {
72 // copy strings
73 ITERATE (TDescString, string_it, other->GetStrings()) {
74 m_DescStrings.push_back (*string_it);
75 }
76 // copy remaining modifier list
77 ITERATE (TModifierVector, it, other->GetModifiers()) {
78 m_Modifiers.push_back (CAutoDefSourceModifierInfo(*it));
79 }
80 // copy feature clauses
81 m_FeatureClauses = other->m_FeatureClauses;
82 }
83
84
~CAutoDefSourceDescription()85 CAutoDefSourceDescription::~CAutoDefSourceDescription()
86 {
87 }
88
GetBioSource() const89 const CBioSource& CAutoDefSourceDescription::GetBioSource() const
90 {
91 return m_BS;
92 }
93
94
AddQual(bool isOrgMod,int subtype,bool keepAfterSemicolon)95 bool CAutoDefSourceDescription::AddQual (bool isOrgMod, int subtype, bool keepAfterSemicolon)
96 {
97 bool rval = false;
98 TModifierVector::iterator it;
99
100 it = m_Modifiers.begin();
101 while (it != m_Modifiers.end()) {
102 if (isOrgMod) {
103 if (it->IsOrgMod() && it->GetSubtype() == subtype) {
104 string val = it->GetValue();
105 if (!keepAfterSemicolon) {
106 string::size_type end = NStr::Find(val, ";");
107 if (end != NCBI_NS_STD::string::npos) {
108 val = val.substr(0, end);
109 }
110 }
111 m_DescStrings.push_back (val);
112 it = m_Modifiers.erase(it);
113 rval = true;
114 } else {
115 ++it;
116 }
117 } else {
118 if (!it->IsOrgMod() && it->GetSubtype() == subtype) {
119 string val = it->GetValue();
120 if (!keepAfterSemicolon) {
121 string::size_type end = NStr::Find(val, ";");
122 if (end != NCBI_NS_STD::string::npos) {
123 val = val.substr(0, end);
124 }
125 }
126 m_DescStrings.push_back (val);
127 it = m_Modifiers.erase(it);
128 rval = true;
129 } else {
130 ++it;
131 }
132 }
133 }
134 return rval;
135 }
136
137
RemoveQual(bool isOrgMod,int subtype)138 bool CAutoDefSourceDescription::RemoveQual (bool isOrgMod, int subtype)
139 {
140 bool rval = false;
141 TModifierVector::iterator it;
142
143 it = m_Modifiers.begin();
144 while (it != m_Modifiers.end()) {
145 if (isOrgMod) {
146 if (it->IsOrgMod() && it->GetSubtype() == subtype) {
147 it = m_Modifiers.erase(it);
148 rval = true;
149 } else {
150 ++it;
151 }
152 } else {
153 if (!it->IsOrgMod() && it->GetSubtype() == subtype) {
154 it = m_Modifiers.erase(it);
155 rval = true;
156 } else {
157 ++it;
158 }
159 }
160 }
161 return rval;
162 }
163
164
Compare(const CAutoDefSourceDescription & s) const165 int CAutoDefSourceDescription::Compare(const CAutoDefSourceDescription& s) const
166 {
167 unsigned int k = 0;
168 int rval = 0;
169 TDescString::const_iterator s_it, this_it;
170
171 s_it = s.GetStrings().begin();
172 this_it = GetStrings().begin();
173 while (s_it != s.GetStrings().end()
174 && this_it != GetStrings().end()
175 && rval == 0) {
176 rval = NStr::Compare (*this_it, *s_it);
177 k++;
178 ++s_it;
179 ++this_it;
180 }
181 if (rval == 0) {
182 if (k < s.GetStrings().size()) {
183 rval = -1;
184 } else if (k < m_DescStrings.size()) {
185 rval = 1;
186 }
187 }
188 if (rval == 0) {
189 rval = NStr::Compare (GetFeatureClauses(), s.GetFeatureClauses());
190 }
191 return rval;
192 }
193
194
GetComboDescription(IAutoDefCombo * mod_combo)195 string CAutoDefSourceDescription::GetComboDescription(IAutoDefCombo *mod_combo)
196 {
197 string desc;
198 if (mod_combo) {
199 return mod_combo->GetSourceDescriptionString(m_BS);
200 } else {
201 return m_BS.GetOrg().GetTaxname();
202 }
203 }
204
205
GetAvailableModifiers(TAvailableModifierVector & modifier_list)206 void CAutoDefSourceDescription::GetAvailableModifiers (TAvailableModifierVector &modifier_list)
207 {
208 unsigned int k;
209
210 for (k = 0; k < modifier_list.size(); k++) {
211 bool found = false;
212 if (modifier_list[k].IsOrgMod()) {
213 if (m_BS.CanGetOrg() && m_BS.GetOrg().CanGetOrgname() && m_BS.GetOrg().GetOrgname().IsSetMod()) {
214 ITERATE (COrgName::TMod, modI, m_BS.GetOrg().GetOrgname().GetMod()) {
215 if ((*modI)->GetSubtype() == modifier_list[k].GetOrgModType()) {
216 found = true;
217 modifier_list[k].ValueFound((*modI)->GetSubname() );
218 }
219 }
220 }
221 } else {
222 // get subsource modifiers
223 if (m_BS.CanGetSubtype()) {
224 ITERATE (CBioSource::TSubtype, subSrcI, m_BS.GetSubtype()) {
225 if ((*subSrcI)->GetSubtype() == modifier_list[k].GetSubSourceType()) {
226 found = true;
227 modifier_list[k].ValueFound((*subSrcI)->GetName());
228 }
229 }
230 }
231 }
232 if (!found) {
233 modifier_list[k].ValueFound("");
234 }
235 }
236 }
237
238 // tricky HIV records have an isolate and a clone
IsTrickyHIV()239 bool CAutoDefSourceDescription::IsTrickyHIV()
240 {
241 string tax_name = m_BS.GetOrg().GetTaxname();
242 if (!NStr::Equal(tax_name, "HIV-1") && !NStr::Equal(tax_name, "HIV-2")) {
243 return false;
244 }
245
246 bool found = false;
247
248 if (m_BS.CanGetSubtype()) {
249 ITERATE (CBioSource::TSubtype, subSrcI, m_BS.GetSubtype()) {
250 if ((*subSrcI)->GetSubtype() == CSubSource::eSubtype_clone) {
251 found = true;
252 }
253 }
254 }
255 if (!found) {
256 return false;
257 }
258
259 found = false;
260 if (m_BS.CanGetOrg() && m_BS.GetOrg().CanGetOrgname() && m_BS.GetOrg().GetOrgname().IsSetMod()) {
261 ITERATE (COrgName::TMod, modI, m_BS.GetOrg().GetOrgname().GetMod()) {
262 if ((*modI)->GetSubtype() == COrgMod::eSubtype_isolate) {
263 found = true;
264 }
265 }
266 }
267 return found;
268 }
269
270
CAutoDefSourceModifierInfo(bool isOrgMod,int subtype,string value)271 CAutoDefSourceModifierInfo::CAutoDefSourceModifierInfo(bool isOrgMod, int subtype, string value)
272 {
273 m_IsOrgMod = isOrgMod;
274 m_Subtype = subtype;
275 m_Value = value;
276 }
277
278
// Copy constructor: takes the kind flag, subtype and value from other
// through its public accessors.
CAutoDefSourceModifierInfo::CAutoDefSourceModifierInfo(const CAutoDefSourceModifierInfo &other)
    : m_IsOrgMod(other.IsOrgMod()),
      m_Subtype(other.GetSubtype()),
      m_Value(other.GetValue())
{
}
285
286
~CAutoDefSourceModifierInfo()287 CAutoDefSourceModifierInfo::~CAutoDefSourceModifierInfo()
288 {
289 }
290
291
GetRank() const292 unsigned int CAutoDefSourceModifierInfo::GetRank() const
293 {
294 if (m_IsOrgMod) {
295 if (m_Subtype == COrgMod::eSubtype_strain) {
296 return 3;
297 } else if (m_Subtype == COrgMod::eSubtype_isolate) {
298 return 5;
299 } else if (m_Subtype == COrgMod::eSubtype_cultivar) {
300 return 7;
301 } else if (m_Subtype == COrgMod::eSubtype_specimen_voucher) {
302 return 8;
303 } else if (m_Subtype == COrgMod::eSubtype_ecotype) {
304 return 9;
305 } else if (m_Subtype == COrgMod::eSubtype_type) {
306 return 10;
307 } else if (m_Subtype == COrgMod::eSubtype_serotype) {
308 return 11;
309 } else if (m_Subtype == COrgMod::eSubtype_authority) {
310 return 12;
311 } else if (m_Subtype == COrgMod::eSubtype_breed) {
312 return 13;
313 }
314 } else {
315 if (m_Subtype == CSubSource::eSubtype_transgenic) {
316 return 0;
317 } else if (m_Subtype == CSubSource::eSubtype_plasmid_name) {
318 return 1;
319 } else if (m_Subtype == CSubSource::eSubtype_endogenous_virus_name) {
320 return 2;
321 } else if (m_Subtype == CSubSource::eSubtype_clone) {
322 return 4;
323 } else if (m_Subtype == CSubSource::eSubtype_haplotype) {
324 return 6;
325 }
326 }
327 return 50;
328 }
329
330
Compare(const CAutoDefSourceModifierInfo & mod) const331 int CAutoDefSourceModifierInfo::Compare(const CAutoDefSourceModifierInfo& mod) const
332 {
333 int rank1, rank2;
334
335 rank1 = GetRank();
336 rank2 = mod.GetRank();
337
338 if (rank1 < rank2) {
339 return -1;
340 } else if (rank1 > rank2) {
341 return 1;
342 } else if (IsOrgMod() && !mod.IsOrgMod()) {
343 // prefer subsource to orgmod qualifiers
344 return -1;
345 } else if (!IsOrgMod() && mod.IsOrgMod()) {
346 return 1;
347 } else if (IsOrgMod() && mod.IsOrgMod()) {
348 if (GetSubtype() == mod.GetSubtype()) {
349 return 0;
350 } else {
351 return (GetSubtype() < mod.GetSubtype() ? -1 : 1);
352 }
353 } else {
354 if (GetSubtype() == mod.GetSubtype()) {
355 return 0;
356 } else {
357 return (GetSubtype() < mod.GetSubtype() ? -1 : 1);
358 }
359 }
360 }
361
362
363 END_SCOPE(objects)
364 END_NCBI_SCOPE
365