1 /* $Id: mod_reader.cpp 632526 2021-06-02 17:25:01Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Justin Foley
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <objects/seqset/Bioseq_set.hpp>
35 #include <objects/seq/Bioseq.hpp>
36 #include <objects/seq/MolInfo.hpp>
37 #include <objects/seq/Seq_inst.hpp>
38 #include <objects/seq/Seq_hist.hpp>
39 #include <objects/seq/Seq_hist_rec.hpp>
40 #include <objects/seqloc/Seq_loc.hpp>
41 #include <objtools/logging/message.hpp>
42 #include <objtools/logging/listener.hpp>
43 #include <objtools/readers/mod_reader.hpp>
44 #include <objtools/readers/mod_error.hpp>
45 #include <objtools/readers/message_listener.hpp>
46 #include <map>
47 #include <unordered_map>
48 #include <unordered_set>
49 #include <cassert>
50 //#include <util/compile_time.hpp>
51
52 #include "mod_to_enum.hpp"
53 #include "descr_mod_apply.hpp"
54 #include "feature_mod_apply.hpp"
55
56 BEGIN_NCBI_SCOPE
57 BEGIN_SCOPE(objects)
58
59 //MAKE_CONST_MAP(s_ModNameMap, NStr::eCase, const char*, const char*,
60
61 static const unordered_map<string, string> s_ModNameMap =
62 {{"top","topology"},
63 {"mol","molecule"},
64 {"moltype", "mol-type"},
65 {"fwd-pcr-primer-name", "fwd-primer-name"},
66 {"fwd-pcr-primer-names", "fwd-primer-name"},
67 {"fwd-primer-names", "fwd-primer-name"},
68 {"fwd-pcr-primer-seq","fwd-primer-seq"},
69 {"fwd-pcr-primer-seqs","fwd-primer-seq"},
70 {"fwd-primer-seqs","fwd-primer-seq"},
71 {"rev-pcr-primer-name", "rev-primer-name"},
72 {"rev-pcr-primer-names", "rev-primer-name"},
73 {"rev-primer-names", "rev-primer-name"},
74 {"rev-pcr-primer-seq", "rev-primer-seq"},
75 {"rev-pcr-primer-seqs", "rev-primer-seq"},
76 {"rev-primer-seqs", "rev-primer-seq"},
77 {"org", "taxname"},
78 {"organism", "taxname"},
79 {"div", "division"},
80 {"notes", "note"},
81 {"completedness", "completeness"},
82 {"gene-syn", "gene-synonym"},
83 {"genesyn", "gene-synonym"},
84 {"genesynonym", "gene-synonym"},
85 {"prot", "protein"},
86 {"prot-desc", "protein-desc"},
87 {"function", "activity"},
88 {"secondary", "secondary-accession"},
89 {"secondary-accessions", "secondary-accession"},
90 {"keywords", "keyword"},
91 {"primary", "primary-accession"},
92 {"primary-accessions", "primary-accession"},
93 {"projects", "project"},
94 {"db-xref", "dbxref"},
95 {"pubmed", "pmid"},
96 {"ft-url-mod", "ft-mod"},
97 {"ft-url", "ft-map"}
98 };
99 //);
100
101
102 const CModHandler::TNameSet CModHandler::sm_DeprecatedModifiers
103 {
104 "dosage",
105 "transposon-name",
106 "plastid-name",
107 "insertion-seq-name",
108 "old-lineage",
109 "old-name",
110 "gene",
111 "gene-synonym",
112 "allele",
113 "locus-tag"
114 };
115
116
117 const CModHandler::TNameSet CModHandler::sm_MultipleValuesForbidden =
118 {
119 "topology", // Seq-inst
120 "molecule",
121 "strand",
122 "gene", // Gene-ref
123 "allele",
124 "locus-tag",
125 "protein-desc",// Protein-ref
126 "mol-type", // MolInfo descriptor
127 "tech",
128 "completeness",
129 "location", // Biosource descriptor
130 "origin",
131 "focus",
132 "taxname", // Biosource - Org-ref
133 "common",
134 "lineage", // Biosource - Org-ref - OrgName
135 "division",
136 "gcode",
137 "mgcode",
138 "pgcode"
139 };
140
141
142 //MAKE_CONST_MAP(s_StrandStringToEnum, NStr::eCase, const char*, CSeq_inst::EStrand,
143 static const unordered_map<string, CSeq_inst::EStrand> s_StrandStringToEnum =
144 {{"single", CSeq_inst::eStrand_ss},
145 {"double", CSeq_inst::eStrand_ds},
146 {"mixed", CSeq_inst::eStrand_mixed},
147 {"other", CSeq_inst::eStrand_other}
148 };
149 //);
150
151
152 //MAKE_CONST_MAP(s_MolStringToEnum, NStr::eCase, const char*, CSeq_inst::EMol,
153 static const unordered_map<string, CSeq_inst::EMol> s_MolStringToEnum =
154 {{"dna", CSeq_inst::eMol_dna},
155 {"rna", CSeq_inst::eMol_rna},
156 {"aa", CSeq_inst::eMol_aa},
157 {"na", CSeq_inst::eMol_na},
158 {"other", CSeq_inst::eMol_other}
159 };
160 //);
161
162
163 //MAKE_CONST_MAP(s_TopologyStringToEnum, NStr::eCase, const char*, CSeq_inst::ETopology,
164 static const unordered_map<string, CSeq_inst::ETopology> s_TopologyStringToEnum =
165 {{"linear", CSeq_inst::eTopology_linear},
166 {"circular", CSeq_inst::eTopology_circular},
167 {"tandem", CSeq_inst::eTopology_tandem},
168 {"other", CSeq_inst::eTopology_other}
169 };
170 //);
171
172 /*
173 MAKE_CONST_MAP(s_BiomolEnumToMolEnum, NStr::eNocase, CMolInfo::TBiomol, CSeq_inst::EMol,
174 {{ CMolInfo::eBiomol_genomic, CSeq_inst::eMol_dna},
175 { CMolInfo::eBiomol_pre_RNA, CSeq_inst::eMol_rna},
176 { CMolInfo::eBiomol_mRNA, CSeq_inst::eMol_rna },
177 { CMolInfo::eBiomol_rRNA, CSeq_inst::eMol_rna},
178 { CMolInfo::eBiomol_tRNA, CSeq_inst::eMol_rna},
179 { CMolInfo::eBiomol_snRNA, CSeq_inst::eMol_rna},
180 { CMolInfo::eBiomol_scRNA, CSeq_inst::eMol_rna},
181 { CMolInfo::eBiomol_genomic_mRNA, CSeq_inst::eMol_rna },
182 { CMolInfo::eBiomol_cRNA, CSeq_inst::eMol_rna },
183 { CMolInfo::eBiomol_snoRNA, CSeq_inst::eMol_rna},
184 { CMolInfo::eBiomol_transcribed_RNA, CSeq_inst::eMol_rna},
185 { CMolInfo::eBiomol_ncRNA, CSeq_inst::eMol_rna},
186 { CMolInfo::eBiomol_tmRNA, CSeq_inst::eMol_rna},
187 { CMolInfo::eBiomol_peptide, CSeq_inst::eMol_aa},
188 { CMolInfo::eBiomol_other_genetic, CSeq_inst::eMol_other},
189 { CMolInfo::eBiomol_other, CSeq_inst::eMol_other}
190 });
191 */
192
193
CModHandler()194 CModHandler::CModHandler(){}
195
196
SetExcludedMods(const vector<string> & excluded_mods)197 void CModHandler::SetExcludedMods(const vector<string>& excluded_mods)
198 {
199 m_ExcludedModifiers.clear();
200 transform(excluded_mods.begin(), excluded_mods.end(),
201 inserter(m_ExcludedModifiers, m_ExcludedModifiers.end()),
202 [](const string& mod_name) { return GetCanonicalName(mod_name); });
203 }
204
205
SetMods(const TMods & mods)206 void CModHandler::SetMods(const TMods& mods)
207 {
208 m_Mods = mods;
209 }
210
211
AddMods(const TModList & mods,EHandleExisting handle_existing,TModList & rejected_mods,FReportError fPostMessage)212 void CModHandler::AddMods(const TModList& mods,
213 EHandleExisting handle_existing,
214 TModList& rejected_mods,
215 FReportError fPostMessage)
216 {
217 rejected_mods.clear();
218
219 unordered_set<string> current_set;
220 TMods accepted_mods;
221 TMods conflicting_mods;
222
223 for (const auto& mod : mods) {
224 const auto& canonical_name = GetCanonicalName(mod.GetName());
225 const auto allow_multiple_values = x_MultipleValuesAllowed(canonical_name);
226 // Don't want to check for errors if we're not going to keep the modifier
227 if (handle_existing == ePreserve ||
228 (handle_existing == eAppendPreserve &&
229 !allow_multiple_values)) {
230 if (m_Mods.find(canonical_name) != m_Mods.end()) {
231 continue;
232 }
233 }
234
235 if (m_ExcludedModifiers.find(canonical_name) !=
236 m_ExcludedModifiers.end()) {
237 string message = "The following modifier is unsupported in this context and will be ignored: " + mod.GetName() + ".";
238 if (fPostMessage) {
239 fPostMessage(mod, message, eDiag_Warning, eModSubcode_Excluded);
240 }
241 rejected_mods.push_back(mod);
242 continue;
243 }
244
245 if (x_IsDeprecated(canonical_name)) {
246 string message = "Use of the following modifier in a sequence file is discouraged and the information will be ignored: " + mod.GetName() + ".";
247 if (fPostMessage) {
248 fPostMessage(mod, message, eDiag_Warning, eModSubcode_Deprecated);
249 }
250 rejected_mods.push_back(mod);
251 continue;
252 }
253
254 const auto first_occurrence = current_set.insert(canonical_name).second;
255
256 // Put this in its own method
257 if (!first_occurrence) {
258 string msg;
259 EDiagSev sev;
260 EModSubcode subcode;
261
262 auto it = accepted_mods.find(canonical_name);
263 if (it != accepted_mods.end() &&
264 NStr::EqualNocase(it->second.front().GetValue(),
265 mod.GetValue())) {
266 msg = "Duplicated modifier value detected, ignoring duplicate, no action required: "
267 + mod.GetName() + "=" + mod.GetValue() + ".";
268 sev = eDiag_Warning;
269 subcode = eModSubcode_Duplicate;
270 }
271 else
272 if (!allow_multiple_values) {
273 msg = "Conflicting modifiers detected. Provide one modifier with one value for: " + mod.GetName() + ".";
274 sev = eDiag_Error;
275 subcode = eModSubcode_ConflictingValues;
276
277 if (it != accepted_mods.end()) {
278 conflicting_mods[canonical_name] = it->second;
279 accepted_mods.erase(it);
280 }
281 conflicting_mods[canonical_name].push_back(mod);
282 }
283 else
284 {
285 accepted_mods[canonical_name].push_back(mod);
286 continue;
287 }
288
289 CModData reportMod =
290 (subcode == eModSubcode_Duplicate) ?
291 mod :
292 CModData( mod.GetName(), kEmptyStr);
293
294 if (fPostMessage) {
295 fPostMessage(reportMod, msg, sev, subcode);
296 continue;
297 }
298 NCBI_THROW(CModReaderException, eMultipleValuesForbidden, msg);
299 }
300
301 accepted_mods[canonical_name].push_back(mod);
302 }
303
304 for (auto& conflicts : conflicting_mods) {
305 rejected_mods.splice(rejected_mods.end(), conflicts.second);
306 }
307
308 x_SaveMods(move(accepted_mods), handle_existing, m_Mods);
309 }
310
311
x_SaveMods(TMods && mods,EHandleExisting handle_existing,TMods & dest)312 void CModHandler::x_SaveMods(TMods&& mods, EHandleExisting handle_existing, TMods& dest)
313 {
314 if (handle_existing == eReplace) {
315 for (auto& mod_entry : mods) {
316 const auto& canonical_name = mod_entry.first;
317 dest[canonical_name] = mod_entry.second;
318 }
319 }
320 else
321 if (handle_existing == ePreserve) {
322 dest.insert(make_move_iterator(mods.begin()),
323 make_move_iterator(mods.end()));
324 }
325 else
326 if (handle_existing == eAppendReplace) {
327 for (auto& mod_entry : mods) {
328 const auto& canonical_name = mod_entry.first;
329 auto& dest_mod_list = dest[canonical_name];
330 if (x_MultipleValuesAllowed(canonical_name)){
331 dest_mod_list.splice(
332 dest_mod_list.end(),
333 move(mod_entry.second));
334 }
335 else {
336 dest_mod_list = move(mod_entry.second);
337 }
338 }
339 }
340 else
341 if (handle_existing == eAppendPreserve) {
342 for (auto& mod_entry : mods) {
343 const auto& canonical_name = mod_entry.first;
344 auto& dest_mod_list = dest[canonical_name];
345 if (dest_mod_list.empty()) {
346 dest_mod_list = move(mod_entry.second);
347 }
348 else
349 if (x_MultipleValuesAllowed(canonical_name)){
350 dest_mod_list.splice(
351 dest_mod_list.end(),
352 move(mod_entry.second));
353 }
354 }
355 }
356 }
357
358
x_MultipleValuesAllowed(const string & canonical_name)359 bool CModHandler::x_MultipleValuesAllowed(const string& canonical_name)
360 {
361 return (sm_MultipleValuesForbidden.find(canonical_name) ==
362 sm_MultipleValuesForbidden.end());
363 }
364
365
GetMods(void) const366 const CModHandler::TMods& CModHandler::GetMods(void) const
367 {
368 return m_Mods;
369 }
370
371
Clear(void)372 void CModHandler::Clear(void)
373 {
374 m_Mods.clear();
375 }
376
377
GetCanonicalName(const TModEntry & mod_entry)378 const string& CModHandler::GetCanonicalName(const TModEntry& mod_entry)
379 {
380 return mod_entry.first;
381 }
382
383
AssertReturnSingleValue(const TModEntry & mod_entry)384 const string& CModHandler::AssertReturnSingleValue(const TModEntry& mod_entry)
385 {
386 assert(mod_entry.second.size() == 1);
387 return mod_entry.second.front().GetValue();
388 }
389
// Converts a user-supplied modifier name to its canonical form: the name is
// normalized first, then translated through the alias table s_ModNameMap.
// A normalized name with no alias entry is returned as-is.
string CModHandler::GetCanonicalName(const string& name)
{
    const auto normalized = x_GetNormalizedString(name);
    const auto alias = s_ModNameMap.find(normalized);
    return (alias == s_ModNameMap.end()) ? normalized : alias->second;
}
400
401
x_IsDeprecated(const string & canonical_name)402 bool CModHandler::x_IsDeprecated(const string& canonical_name)
403 {
404 return (sm_DeprecatedModifiers.find(canonical_name) !=
405 sm_DeprecatedModifiers.end());
406 }
407
408
// Produces the normalized spelling of a modifier name:
//   1. lower-case the text and trim surrounding spaces (via NStr);
//   2. collapse every run of separator characters ('-', '_', ' ') to its
//      first character;
//   3. turn the remaining '_' and ' ' into '-'.
static string s_GetNormalizedString(const string& unnormalized)
{
    const auto is_separator = [](char ch) {
        return ch == '-' || ch == '_' || ch == ' ';
    };

    string result = unnormalized;
    NStr::ToLower(result);
    NStr::TruncateSpacesInPlace(result);

    // Two adjacent separators count as "equal", so unique() keeps only the
    // first character of each separator run.
    const auto tail = unique(result.begin(), result.end(),
                             [&is_separator](char lhs, char rhs) {
                                 return is_separator(lhs) && is_separator(rhs);
                             });
    result.erase(tail, result.end());

    for (auto pos = result.begin(); pos != result.end(); ++pos) {
        if (*pos == '_' || *pos == ' ') {
            *pos = '-';
        }
    }
    return result;
}
428
// Member-level entry point that forwards to the file-local
// s_GetNormalizedString() helper.
string CModHandler::x_GetNormalizedString(const string& name)
{
    string normalized = s_GetNormalizedString(name);
    return normalized;
}
433
434
Apply(const CModHandler & mod_handler,CBioseq & bioseq,TSkippedMods & skipped_mods,FReportError fPostMessage)435 void CModAdder::Apply(const CModHandler& mod_handler,
436 CBioseq& bioseq,
437 TSkippedMods& skipped_mods,
438 FReportError fPostMessage)
439 {
440 Apply(mod_handler, bioseq, skipped_mods, false, fPostMessage);
441 }
442
443
Apply(const CModHandler & mod_handler,CBioseq & bioseq,TSkippedMods & skipped_mods,bool logInfo,FReportError fPostMessage)444 void CModAdder::Apply(const CModHandler& mod_handler,
445 CBioseq& bioseq,
446 TSkippedMods& skipped_mods,
447 bool logInfo,
448 FReportError fPostMessage)
449 {
450 skipped_mods.clear();
451
452 CDescrModApply descr_mod_apply(bioseq,
453 fPostMessage,
454 skipped_mods);
455
456 CFeatModApply feat_mod_apply(bioseq,
457 fPostMessage,
458 skipped_mods);
459
460 list<string> applied_mods;
461 for (const auto& mod_entry : mod_handler.GetMods()) {
462 try {
463 bool applied = false;
464 if (descr_mod_apply.Apply(mod_entry)) {
465 const string& mod_name = x_GetModName(mod_entry);
466 if (mod_name == "secondary-accession"){
467 x_SetHist(mod_entry, bioseq.SetInst());
468 }
469 else if (mod_name == "mol-type") {
470 // mol-type appears before molecule in the default-ordered
471 // map keys. Therefore, if both mol-type and molecule are
472 // specified, molecule will take precedence over (or, more precisly, overwrite)
473 // the information extracted from mol-type when setting Seq-inst::mol
474 x_SetMoleculeFromMolType(mod_entry, bioseq.SetInst());
475 }
476 applied = true;
477 }
478 else
479 if (x_TrySeqInstMod(mod_entry, bioseq.SetInst(), skipped_mods, fPostMessage) ||
480 feat_mod_apply.Apply(mod_entry)) {
481 applied = true;
482 }
483
484 if (applied) {
485 if (logInfo) {
486 applied_mods.push_back(x_GetModName(mod_entry));
487 }
488 continue;
489 }
490
491 // Report unrecognised modifier
492 if (fPostMessage) {
493 skipped_mods.insert(skipped_mods.end(),
494 mod_entry.second.begin(),
495 mod_entry.second.end());
496
497 for (const auto& modData : mod_entry.second) {
498 string msg = "Unrecognized modifier: " + modData.GetName() + ".";
499 fPostMessage(modData, msg, eDiag_Warning, eModSubcode_Unrecognized);
500 }
501 continue;
502 }
503 string canonicalName = x_GetModName(mod_entry);
504 string msg = "Unrecognized modifier: " + canonicalName + ".";
505 NCBI_THROW(CModReaderException, eUnknownModifier, msg);
506 }
507 catch(const CModReaderException& e) {
508 skipped_mods.insert(skipped_mods.end(),
509 mod_entry.second.begin(),
510 mod_entry.second.end());
511 if (fPostMessage) {
512 string canonicalName = x_GetModName(mod_entry);
513 fPostMessage(CModData( canonicalName, kEmptyStr), e.GetMsg(), eDiag_Error, eModSubcode_Undefined);
514 }
515 else {
516 throw; // rethrow e
517 }
518 }
519 }
520
521 if (!applied_mods.empty()) {
522 string msg = "Applied mods: ";
523 for (const auto& applied_mod : applied_mods) {
524 msg += " " + applied_mod;
525 }
526 fPostMessage(CModData("",""), msg, eDiag_Info, eModSubcode_Applied);
527 }
528 }
529
530
x_ReportInvalidValue(const CModData & mod_data,TSkippedMods & skipped_mods,FReportError fPostMessage)531 void CModAdder::x_ReportInvalidValue(const CModData& mod_data,
532 TSkippedMods& skipped_mods,
533 FReportError fPostMessage)
534 {
535 const auto& mod_name = mod_data.GetName();
536 const auto& mod_value = mod_data.GetValue();
537 string msg = "Invalid value: " + mod_name + "=" + mod_value + ".";
538
539 if (fPostMessage) {
540 fPostMessage(mod_data, msg, eDiag_Error, eModSubcode_InvalidValue);
541 skipped_mods.push_back(mod_data);
542 return;
543 }
544
545 NCBI_THROW(CModReaderException, eInvalidValue, msg);
546 }
547
548
549
x_GetModName(const TModEntry & mod_entry)550 const string& CModAdder::x_GetModName(const TModEntry& mod_entry)
551 {
552 return CModHandler::GetCanonicalName(mod_entry);
553 }
554
555
x_GetModValue(const TModEntry & mod_entry)556 const string& CModAdder::x_GetModValue(const TModEntry& mod_entry)
557 {
558 return CModHandler::AssertReturnSingleValue(mod_entry);
559 }
560
561
x_TrySeqInstMod(const TModEntry & mod_entry,CSeq_inst & seq_inst,TSkippedMods & skipped_mods,FReportError fPostMessage)562 bool CModAdder::x_TrySeqInstMod(
563 const TModEntry& mod_entry,
564 CSeq_inst& seq_inst,
565 TSkippedMods& skipped_mods,
566 FReportError fPostMessage)
567 {
568 const auto& mod_name = x_GetModName(mod_entry);
569
570 if (mod_name == "strand") {
571 x_SetStrand(mod_entry, seq_inst, skipped_mods, fPostMessage);
572 return true;
573 }
574
575 if (mod_name == "molecule") {
576 x_SetMolecule(mod_entry, seq_inst, skipped_mods, fPostMessage);
577 return true;
578 }
579
580 if (mod_name == "topology") {
581 x_SetTopology(mod_entry, seq_inst, skipped_mods, fPostMessage);
582 return true;
583 }
584
585 // Note that we do not check for the 'secondary-accession' modifier here.
586 // secondary-accession also modifies the GB_block descriptor
587 // The check for secondary-accession and any resulting call
588 // to x_SetHist is performed before x_TrySeqInstMod
589 // is invoked.
590
591 return false;
592 }
593
594
595
x_SetStrand(const TModEntry & mod_entry,CSeq_inst & seq_inst,TSkippedMods & skipped_mods,FReportError fPostMessage)596 void CModAdder::x_SetStrand(const TModEntry& mod_entry,
597 CSeq_inst& seq_inst,
598 TSkippedMods& skipped_mods,
599 FReportError fPostMessage)
600 {
601 string value = x_GetModValue(mod_entry);
602 const auto it = s_StrandStringToEnum.find(g_GetNormalizedModVal(value));
603 if (it == s_StrandStringToEnum.end()) {
604 x_ReportInvalidValue(mod_entry.second.front(), skipped_mods, fPostMessage);
605 return;
606 }
607 seq_inst.SetStrand(it->second);
608 }
609
610
x_SetMolecule(const TModEntry & mod_entry,CSeq_inst & seq_inst,TSkippedMods & skipped_mods,FReportError fPostMessage)611 void CModAdder::x_SetMolecule(const TModEntry& mod_entry,
612 CSeq_inst& seq_inst,
613 TSkippedMods& skipped_mods,
614 FReportError fPostMessage)
615 {
616 string value = x_GetModValue(mod_entry);
617 const auto it = s_MolStringToEnum.find(g_GetNormalizedModVal(value));
618 if (it == s_MolStringToEnum.end()) {
619 x_ReportInvalidValue(mod_entry.second.front(), skipped_mods, fPostMessage);
620 return;
621 }
622 seq_inst.SetMol(it->second);
623 }
624
625
x_SetMoleculeFromMolType(const TModEntry & mod_entry,CSeq_inst & seq_inst)626 void CModAdder::x_SetMoleculeFromMolType(const TModEntry& mod_entry, CSeq_inst& seq_inst)
627 {
628 string value = x_GetModValue(mod_entry);
629 auto it = g_BiomolStringToEnum.find(g_GetNormalizedModVal(value));
630 if (it == g_BiomolStringToEnum.end()) {
631 // No need to report an error here.
632 // The error is reported in x_SetMolInfoType
633 return;
634 }
635 CSeq_inst::EMol mol = g_BiomolEnumToMolEnum.at(it->second);
636 seq_inst.SetMol(mol);
637 }
638
639
x_SetTopology(const TModEntry & mod_entry,CSeq_inst & seq_inst,TSkippedMods & skipped_mods,FReportError fPostMessage)640 void CModAdder::x_SetTopology(const TModEntry& mod_entry,
641 CSeq_inst& seq_inst,
642 TSkippedMods& skipped_mods,
643 FReportError fPostMessage)
644 {
645 string value = x_GetModValue(mod_entry);
646 const auto it = s_TopologyStringToEnum.find(g_GetNormalizedModVal(value));
647 if (it == s_TopologyStringToEnum.end()) {
648 x_ReportInvalidValue(mod_entry.second.front(), skipped_mods, fPostMessage);
649 return;
650 }
651 seq_inst.SetTopology(it->second);
652 }
653
654
x_SetHist(const TModEntry & mod_entry,CSeq_inst & seq_inst)655 void CModAdder::x_SetHist(const TModEntry& mod_entry, CSeq_inst& seq_inst)
656 {
657 list<string> id_list;
658 for (const auto& mod : mod_entry.second) {
659 const auto& vals = mod.GetValue();
660 list<CTempString> value_sublist;
661 NStr::Split(vals, ",; \t", value_sublist, NStr::fSplit_Tokenize);
662 for (const auto& val : value_sublist) {
663 string value = NStr::TruncateSpaces_Unsafe(val);
664 try {
665 SSeqIdRange idrange(value);
666 id_list.insert(id_list.end(), idrange.begin(), idrange.end());
667 }
668 catch (...)
669 {
670 id_list.push_back(value);
671 }
672 }
673 }
674
675 if (id_list.empty()) {
676 return;
677 }
678
679 list<CRef<CSeq_id>> secondary_ids;
680 // try catch statement
681 transform(id_list.begin(), id_list.end(), back_inserter(secondary_ids),
682 [](const string& id_string) { return Ref(new CSeq_id(id_string)); });
683
684 seq_inst.SetHist().SetReplaces().SetIds() = move(secondary_ids);
685 }
686
687
CDefaultModErrorReporter(const string & seqId,int lineNum,IObjtoolsListener * pMessageListener)688 CDefaultModErrorReporter::CDefaultModErrorReporter(
689 const string& seqId,
690 int lineNum,
691 IObjtoolsListener* pMessageListener)
692 : m_SeqId(seqId),
693 m_LineNum(lineNum),
694 m_pMessageListener(pMessageListener)
695 {}
696
697
operator ()(const CModData & mod,const string & msg,EDiagSev sev,EModSubcode subcode)698 void CDefaultModErrorReporter::operator()(
699 const CModData& mod,
700 const string& msg,
701 EDiagSev sev,
702 EModSubcode subcode)
703 {
704 if (!m_pMessageListener) {
705 if (sev == eDiag_Info) {
706 return;
707 }
708 if (sev == eDiag_Warning) {
709 ERR_POST(Warning << msg);
710 return;
711 }
712 NCBI_THROW2(CObjReaderParseException, eFormat, msg, 0);
713 }
714
715
716 if (!m_pMessageListener->SevEnabled(sev)) {
717 return;
718 }
719
720 AutoPtr<CLineErrorEx> pErr(
721 CLineErrorEx::Create(
722 ILineError::eProblem_GeneralParsingError,
723 sev,
724 EReaderCode::eReader_Mods,
725 subcode,
726 m_SeqId,
727 m_LineNum,
728 msg,
729 "",
730 mod.GetName(),
731 mod.GetValue()));
732
733 if (!m_pMessageListener->PutMessage(*pErr)) {
734 NCBI_THROW2(CObjReaderParseException, eFormat, msg, 0);
735 }
736 }
737
738
Apply(const CTempString & title,TModList & mods,string & remainder)739 void CTitleParser::Apply(const CTempString& title, TModList& mods, string& remainder)
740 {
741 mods.clear();
742 remainder.clear();
743 size_t start_pos = 0;
744 while(start_pos < title.size()) {
745 size_t lb_pos, end_pos, eq_pos;
746 lb_pos = start_pos;
747 if (x_FindBrackets(title, lb_pos, end_pos, eq_pos)) {
748 if (eq_pos < end_pos) {
749 if ((lb_pos > start_pos) ) {
750 auto left_remainder = NStr::TruncateSpaces_Unsafe(title.substr(start_pos, lb_pos-start_pos));
751 if (!left_remainder.empty()) {
752 if (!remainder.empty()) {
753 remainder.append(" ");
754 }
755 remainder.append(left_remainder);
756 }
757 }
758 auto name = NStr::TruncateSpaces_Unsafe(title.substr(lb_pos+1, eq_pos-(lb_pos+1)));
759 auto value = NStr::TruncateSpaces_Unsafe(title.substr(eq_pos+1, end_pos-(eq_pos+1)));
760 mods.emplace_back(name, value);
761 }
762 start_pos = end_pos+1;
763 }
764 else {
765 auto right_remainder = NStr::TruncateSpaces_Unsafe(title.substr(start_pos));
766 if (!right_remainder.empty()) {
767 if (!remainder.empty()) {
768 remainder.append(" ");
769 }
770 remainder.append(right_remainder);
771 }
772 return;
773 }
774 }
775 }
776
777
HasMods(const CTempString & title)778 bool CTitleParser::HasMods(const CTempString& title)
779 {
780 size_t start_pos = 0;
781 while (start_pos < title.size()) {
782 size_t lb_pos, end_pos, eq_pos;
783 lb_pos = start_pos;
784 if (x_FindBrackets(title, lb_pos, end_pos, eq_pos)) {
785 if (eq_pos < end_pos) {
786 return true;
787 }
788 start_pos = end_pos+1;
789 }
790 else {
791 return false;
792 }
793 }
794 return false;
795 }
796
797
x_FindBrackets(const CTempString & line,size_t & start,size_t & stop,size_t & eq_pos)798 bool CTitleParser::x_FindBrackets(const CTempString& line, size_t& start, size_t& stop, size_t& eq_pos)
799 { // Copied from CSourceModParser
800 size_t i = start;
801
802 eq_pos = CTempString::npos;
803 const char* s = line.data() + start;
804
805 int num_unmatched_left_brackets = 0;
806 while (i < line.size())
807 {
808 switch (*s)
809 {
810 case '[':
811 num_unmatched_left_brackets++;
812 if (num_unmatched_left_brackets == 1)
813 {
814 start = i;
815 }
816 break;
817 case '=':
818 if (num_unmatched_left_brackets > 0 && eq_pos == CTempString::npos) {
819 eq_pos = i;
820 }
821 break;
822 case ']':
823 if (num_unmatched_left_brackets == 1)
824 {
825 stop = i;
826 return (eq_pos<stop);
827 }
828 else
829 if (num_unmatched_left_brackets == 0) {
830 return false;
831 }
832 else
833 {
834 num_unmatched_left_brackets--;
835 }
836 }
837 i++; s++;
838 }
839 return false;
840 };
841
842
843 END_SCOPE(objects)
844 END_NCBI_SCOPE
845
846
847
848