1 /*  $Id: mod_reader.cpp 632526 2021-06-02 17:25:01Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:  Justin Foley
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <objects/seqset/Bioseq_set.hpp>
35 #include <objects/seq/Bioseq.hpp>
36 #include <objects/seq/MolInfo.hpp>
37 #include <objects/seq/Seq_inst.hpp>
38 #include <objects/seq/Seq_hist.hpp>
39 #include <objects/seq/Seq_hist_rec.hpp>
40 #include <objects/seqloc/Seq_loc.hpp>
41 #include <objtools/logging/message.hpp>
42 #include <objtools/logging/listener.hpp>
43 #include <objtools/readers/mod_reader.hpp>
44 #include <objtools/readers/mod_error.hpp>
45 #include <objtools/readers/message_listener.hpp>
46 #include <map>
47 #include <unordered_map>
48 #include <unordered_set>
49 #include <cassert>
50 //#include <util/compile_time.hpp>
51 
52 #include "mod_to_enum.hpp"
53 #include "descr_mod_apply.hpp"
54 #include "feature_mod_apply.hpp"
55 
56 BEGIN_NCBI_SCOPE
57 BEGIN_SCOPE(objects)
58 
59 //MAKE_CONST_MAP(s_ModNameMap, NStr::eCase, const char*, const char*,
60 
// Alias table for modifier names: each key is an alternative spelling (in the
// lower-case, hyphen-separated form produced by s_GetNormalizedString) and
// each value is the canonical modifier name it stands for.
// CModHandler::GetCanonicalName(const string&) consults this table; a name
// with no entry here is used as its own canonical name.
static const unordered_map<string, string> s_ModNameMap =
{{"top","topology"},
 {"mol","molecule"},
 {"moltype", "mol-type"},
 {"fwd-pcr-primer-name", "fwd-primer-name"},
 {"fwd-pcr-primer-names", "fwd-primer-name"},
 {"fwd-primer-names", "fwd-primer-name"},
 {"fwd-pcr-primer-seq","fwd-primer-seq"},
 {"fwd-pcr-primer-seqs","fwd-primer-seq"},
 {"fwd-primer-seqs","fwd-primer-seq"},
 {"rev-pcr-primer-name", "rev-primer-name"},
 {"rev-pcr-primer-names", "rev-primer-name"},
 {"rev-primer-names", "rev-primer-name"},
 {"rev-pcr-primer-seq", "rev-primer-seq"},
 {"rev-pcr-primer-seqs", "rev-primer-seq"},
 {"rev-primer-seqs", "rev-primer-seq"},
 {"org", "taxname"},
 {"organism", "taxname"},
 {"div", "division"},
 {"notes", "note"},
 {"completedness", "completeness"},
 {"gene-syn", "gene-synonym"},
 {"genesyn", "gene-synonym"},
 {"genesynonym", "gene-synonym"},
 {"prot", "protein"},
 {"prot-desc", "protein-desc"},
 {"function", "activity"},
 {"secondary", "secondary-accession"},
 {"secondary-accessions", "secondary-accession"},
 {"keywords", "keyword"},
 {"primary", "primary-accession"},
 {"primary-accessions", "primary-accession"},
 {"projects", "project"},
 {"db-xref", "dbxref"},
 {"pubmed", "pmid"},
 {"ft-url-mod", "ft-mod"},
 {"ft-url", "ft-map"}
 };
99 //);
100 
101 
// Canonical names of modifiers that are considered deprecated.
// x_IsDeprecated() tests membership in this set; AddMods() rejects such
// modifiers (optionally posting a warning) instead of storing them.
const CModHandler::TNameSet CModHandler::sm_DeprecatedModifiers
{
    "dosage",
    "transposon-name",
    "plastid-name",
    "insertion-seq-name",
    "old-lineage",
    "old-name",
    "gene",
    "gene-synonym",
    "allele",
    "locus-tag"
};
115 
116 
// Canonical names of modifiers that may carry at most one value.
// x_MultipleValuesAllowed() returns false for every name listed here; the
// trailing comments group the names by the object they are applied to.
const CModHandler::TNameSet CModHandler::sm_MultipleValuesForbidden =
{
    "topology", // Seq-inst
    "molecule",
    "strand",
    "gene",  // Gene-ref
    "allele",
    "locus-tag",
    "protein-desc",// Protein-ref
    "mol-type", // MolInfo descriptor
    "tech",
    "completeness",
    "location", // Biosource descriptor
    "origin",
    "focus",
    "taxname", // Biosource - Org-ref
    "common",
    "lineage", // Biosource - Org-ref - OrgName
    "division",
    "gcode",
    "mgcode",
    "pgcode"
};
140 
141 
142 //MAKE_CONST_MAP(s_StrandStringToEnum, NStr::eCase, const char*, CSeq_inst::EStrand,
// Lookup table used by CModAdder::x_SetStrand(): maps the normalized value of
// the "strand" modifier to the corresponding CSeq_inst::EStrand enumerator.
static const unordered_map<string, CSeq_inst::EStrand> s_StrandStringToEnum =
{{"single", CSeq_inst::eStrand_ss},
 {"double", CSeq_inst::eStrand_ds},
 {"mixed", CSeq_inst::eStrand_mixed},
 {"other", CSeq_inst::eStrand_other}
 };
149 //);
150 
151 
152 //MAKE_CONST_MAP(s_MolStringToEnum, NStr::eCase, const char*, CSeq_inst::EMol,
// Lookup table used by CModAdder::x_SetMolecule(): maps the normalized value
// of the "molecule" modifier to the corresponding CSeq_inst::EMol enumerator.
static const unordered_map<string, CSeq_inst::EMol> s_MolStringToEnum =
{{"dna", CSeq_inst::eMol_dna},
 {"rna", CSeq_inst::eMol_rna},
 {"aa", CSeq_inst::eMol_aa},
 {"na", CSeq_inst::eMol_na},
 {"other", CSeq_inst::eMol_other}
 };
160  //);
161 
162 
163 //MAKE_CONST_MAP(s_TopologyStringToEnum, NStr::eCase, const char*, CSeq_inst::ETopology,
// Lookup table used by CModAdder::x_SetTopology(): maps the normalized value
// of the "topology" modifier to the corresponding CSeq_inst::ETopology
// enumerator.
static const unordered_map<string, CSeq_inst::ETopology> s_TopologyStringToEnum =
{{"linear", CSeq_inst::eTopology_linear},
 {"circular", CSeq_inst::eTopology_circular},
 {"tandem", CSeq_inst::eTopology_tandem},
 {"other", CSeq_inst::eTopology_other}
 };
170  //);
171 
172 /*
173 MAKE_CONST_MAP(s_BiomolEnumToMolEnum, NStr::eNocase, CMolInfo::TBiomol, CSeq_inst::EMol,
174 {{ CMolInfo::eBiomol_genomic, CSeq_inst::eMol_dna},
175  { CMolInfo::eBiomol_pre_RNA,  CSeq_inst::eMol_rna},
176  { CMolInfo::eBiomol_mRNA,  CSeq_inst::eMol_rna },
177  { CMolInfo::eBiomol_rRNA, CSeq_inst::eMol_rna},
178  { CMolInfo::eBiomol_tRNA, CSeq_inst::eMol_rna},
179  { CMolInfo::eBiomol_snRNA, CSeq_inst::eMol_rna},
180  { CMolInfo::eBiomol_scRNA, CSeq_inst::eMol_rna},
181  { CMolInfo::eBiomol_genomic_mRNA, CSeq_inst::eMol_rna },
182  { CMolInfo::eBiomol_cRNA, CSeq_inst::eMol_rna },
183  { CMolInfo::eBiomol_snoRNA, CSeq_inst::eMol_rna},
184  { CMolInfo::eBiomol_transcribed_RNA, CSeq_inst::eMol_rna},
185  { CMolInfo::eBiomol_ncRNA, CSeq_inst::eMol_rna},
186  { CMolInfo::eBiomol_tmRNA, CSeq_inst::eMol_rna},
187  { CMolInfo::eBiomol_peptide, CSeq_inst::eMol_aa},
188  { CMolInfo::eBiomol_other_genetic, CSeq_inst::eMol_other},
189  { CMolInfo::eBiomol_other, CSeq_inst::eMol_other}
190 });
191 */
192 
193 
CModHandler()194 CModHandler::CModHandler(){}
195 
196 
SetExcludedMods(const vector<string> & excluded_mods)197 void CModHandler::SetExcludedMods(const vector<string>& excluded_mods)
198 {
199     m_ExcludedModifiers.clear();
200     transform(excluded_mods.begin(), excluded_mods.end(),
201             inserter(m_ExcludedModifiers, m_ExcludedModifiers.end()),
202             [](const string& mod_name) { return GetCanonicalName(mod_name); });
203 }
204 
205 
// Replaces the stored modifier map with a copy of 'mods'.
// No validation or name canonicalization is performed here; the map is taken
// as given.
void CModHandler::SetMods(const TMods& mods)
{
    m_Mods = mods;
}
210 
211 
// Validates a list of modifiers and merges the accepted ones into m_Mods.
//
// Modifiers are processed in order. For each one:
//  - its canonical name is obtained with GetCanonicalName();
//  - it is skipped without any report if 'handle_existing' requires the
//    existing entry to be kept (ePreserve, or eAppendPreserve for a modifier
//    that does not allow multiple values) and m_Mods already has that name;
//  - it is rejected if the name is in m_ExcludedModifiers or is deprecated;
//  - if the same canonical name was already seen during this call, it is
//    treated as a duplicate value, a conflict, or an additional value
//    (see the comments in the loop body).
//
// mods            Modifiers to add.
// handle_existing Policy for names already present in m_Mods; also passed to
//                 x_SaveMods() for the final merge.
// rejected_mods   Cleared on entry. Receives excluded, deprecated and
//                 conflicting modifiers. Ignored duplicates are not added.
// fPostMessage    Optional reporting callback. When it is empty, excluded and
//                 deprecated modifiers are rejected silently, while a
//                 duplicate or conflict throws CModReaderException
//                 (eMultipleValuesForbidden); in that case m_Mods is not
//                 modified because x_SaveMods() is never reached.
void CModHandler::AddMods(const TModList& mods,
                          EHandleExisting handle_existing,
                          TModList& rejected_mods,
                          FReportError fPostMessage)
{
    rejected_mods.clear();

    // Canonical names encountered so far in this call (after the
    // excluded/deprecated filters).
    unordered_set<string> current_set;
    TMods accepted_mods;
    TMods conflicting_mods;

    for (const auto& mod : mods) {
        const auto& canonical_name = GetCanonicalName(mod.GetName());
        const auto allow_multiple_values = x_MultipleValuesAllowed(canonical_name);
        // Don't want to check for errors if we're not going to keep the modifier
        if (handle_existing == ePreserve ||
            (handle_existing == eAppendPreserve &&
             !allow_multiple_values)) {
            if (m_Mods.find(canonical_name) != m_Mods.end()) {
                continue;
            }
        }

        // Excluded modifiers: warn (if possible) and reject.
        if (m_ExcludedModifiers.find(canonical_name) !=
            m_ExcludedModifiers.end()) {
            string message = "The following modifier is unsupported in this context and will be ignored: " + mod.GetName() + ".";
            if (fPostMessage) {
                fPostMessage(mod, message, eDiag_Warning, eModSubcode_Excluded);
            }
            rejected_mods.push_back(mod);
            continue;
        }

        // Deprecated modifiers: warn (if possible) and reject.
        if (x_IsDeprecated(canonical_name)) {
            string message = "Use of the following modifier in a sequence file is discouraged and the information will be ignored: " + mod.GetName() + ".";
            if (fPostMessage) {
                fPostMessage(mod, message, eDiag_Warning, eModSubcode_Deprecated);
            }
            rejected_mods.push_back(mod);
            continue;
        }

        // insert().second is false when the name was already in the set.
        const auto first_occurrence = current_set.insert(canonical_name).second;

       // Put this in its own method
       if (!first_occurrence) {
            string msg;
            EDiagSev sev;
            EModSubcode subcode;

            // Duplicate: same value (case-insensitive) as the FIRST value
            // accepted for this name. Later accepted values are not compared.
            auto it = accepted_mods.find(canonical_name);
            if (it != accepted_mods.end() &&
                NStr::EqualNocase(it->second.front().GetValue(),
                       mod.GetValue())) {
                msg = "Duplicated modifier value detected, ignoring duplicate, no action required: "
                    + mod.GetName() + "=" + mod.GetValue() + ".";
                sev = eDiag_Warning;
                subcode = eModSubcode_Duplicate;
            }
            else
            if (!allow_multiple_values) {
                // Conflict: a single-valued modifier was given a second,
                // different value. Everything accepted so far for this name
                // is withdrawn and, together with the current modifier,
                // recorded as conflicting.
                msg = "Conflicting modifiers detected. Provide one modifier with one value for: " + mod.GetName() + ".";
                sev = eDiag_Error;
                subcode = eModSubcode_ConflictingValues;

                if (it != accepted_mods.end()) {
                    conflicting_mods[canonical_name] = it->second;
                    accepted_mods.erase(it);
                }
                conflicting_mods[canonical_name].push_back(mod);
            }
            else
            {
                // Multi-valued modifier with a new value: simply append.
                accepted_mods[canonical_name].push_back(mod);
                continue;
            }

            // Duplicates are reported with the full name/value pair;
            // conflicts are reported with the name only (empty value).
            CModData reportMod =
                (subcode == eModSubcode_Duplicate) ?
                mod :
                CModData( mod.GetName(), kEmptyStr);

            if (fPostMessage) {
                fPostMessage(reportMod, msg, sev, subcode);
                continue;
            }
            // No callback available: escalate to an exception.
            NCBI_THROW(CModReaderException, eMultipleValuesForbidden, msg);
       }

        accepted_mods[canonical_name].push_back(mod);
    }

    // Hand all conflicting modifiers back to the caller as rejected.
    for (auto& conflicts : conflicting_mods) {
        rejected_mods.splice(rejected_mods.end(), conflicts.second);
    }

    x_SaveMods(move(accepted_mods), handle_existing, m_Mods);
}
310 
311 
x_SaveMods(TMods && mods,EHandleExisting handle_existing,TMods & dest)312 void CModHandler::x_SaveMods(TMods&& mods, EHandleExisting handle_existing, TMods& dest)
313 {
314     if (handle_existing == eReplace) {
315         for (auto& mod_entry : mods) {
316             const auto& canonical_name = mod_entry.first;
317             dest[canonical_name] = mod_entry.second;
318         }
319     }
320     else
321     if (handle_existing == ePreserve) {
322         dest.insert(make_move_iterator(mods.begin()),
323                     make_move_iterator(mods.end()));
324     }
325     else
326     if (handle_existing == eAppendReplace) {
327         for (auto& mod_entry : mods) {
328             const auto& canonical_name = mod_entry.first;
329             auto& dest_mod_list = dest[canonical_name];
330             if (x_MultipleValuesAllowed(canonical_name)){
331                 dest_mod_list.splice(
332                         dest_mod_list.end(),
333                         move(mod_entry.second));
334             }
335             else {
336                 dest_mod_list = move(mod_entry.second);
337             }
338         }
339     }
340     else
341     if (handle_existing == eAppendPreserve) {
342         for (auto& mod_entry : mods) {
343             const auto& canonical_name = mod_entry.first;
344             auto& dest_mod_list = dest[canonical_name];
345             if (dest_mod_list.empty()) {
346                 dest_mod_list = move(mod_entry.second);
347             }
348             else
349             if (x_MultipleValuesAllowed(canonical_name)){
350                 dest_mod_list.splice(
351                         dest_mod_list.end(),
352                         move(mod_entry.second));
353             }
354         }
355     }
356 }
357 
358 
x_MultipleValuesAllowed(const string & canonical_name)359 bool CModHandler::x_MultipleValuesAllowed(const string& canonical_name)
360 {
361     return (sm_MultipleValuesForbidden.find(canonical_name) ==
362             sm_MultipleValuesForbidden.end());
363 }
364 
365 
// Returns a read-only reference to the stored modifier map (m_Mods).
// The reference refers to the handler's own member, so it remains valid only
// while this object is alive.
const CModHandler::TMods& CModHandler::GetMods(void) const
{
    return m_Mods;
}
370 
371 
// Removes all stored modifiers. The set of excluded modifier names
// (m_ExcludedModifiers) is not touched.
void CModHandler::Clear(void)
{
    m_Mods.clear();
}
376 
377 
// Returns the name under which a modifier-map entry is stored, i.e. the key
// of the entry. AddMods() stores entries under their canonical name, so for
// entries created that way this is the canonical name.
const string& CModHandler::GetCanonicalName(const TModEntry& mod_entry)
{
    return mod_entry.first;
}
382 
383 
AssertReturnSingleValue(const TModEntry & mod_entry)384 const string& CModHandler::AssertReturnSingleValue(const TModEntry& mod_entry)
385 {
386     assert(mod_entry.second.size() == 1);
387     return mod_entry.second.front().GetValue();
388 }
389 
// Converts an arbitrary modifier name to its canonical form: the name is
// normalized with x_GetNormalizedString() and then, if the normalized name is
// a known alias in s_ModNameMap, replaced by the name it maps to.
string CModHandler::GetCanonicalName(const string& name)
{
    const string normalized = x_GetNormalizedString(name);
    const auto alias = s_ModNameMap.find(normalized);
    return (alias == s_ModNameMap.end()) ? normalized : alias->second;
}
400 
401 
x_IsDeprecated(const string & canonical_name)402 bool CModHandler::x_IsDeprecated(const string& canonical_name)
403 {
404     return (sm_DeprecatedModifiers.find(canonical_name) !=
405             sm_DeprecatedModifiers.end());
406 }
407 
408 
// Produces the normalized spelling of a modifier name:
//   1. convert to lower case and strip surrounding whitespace;
//   2. collapse every run of separator characters ('-', '_', ' ') to the
//      first character of the run;
//   3. turn the remaining '_' and ' ' characters into '-'.
// The net effect is that each run of separators becomes a single '-'.
static string s_GetNormalizedString(const string& unnormalized)
{
    string result(unnormalized);
    NStr::ToLower(result);
    NStr::TruncateSpacesInPlace(result);

    const auto is_separator = [](char c) {
        return c == '-' || c == '_' || c == ' ';
    };

    // Collapse adjacent separators (erase-unique idiom).
    result.erase(
        unique(result.begin(), result.end(),
               [&is_separator](char lhs, char rhs) {
                   return is_separator(lhs) && is_separator(rhs);
               }),
        result.end());

    // Whatever separator survived, spell it as a hyphen.
    replace_if(result.begin(), result.end(),
               [](char c) { return c == '_' || c == ' '; },
               '-');

    return result;
}
428 
// Class-scope wrapper around the file-local s_GetNormalizedString(): returns
// 'name' lower-cased, trimmed, and with runs of '-', '_' and ' ' collapsed
// into a single '-'.
string CModHandler::x_GetNormalizedString(const string& name)
{
    return s_GetNormalizedString(name);
}
433 
434 
// Convenience overload: applies the modifiers held by 'mod_handler' to
// 'bioseq' by forwarding to the five-argument overload with logInfo set to
// false, i.e. without the informational "applied mods" report.
void CModAdder::Apply(const CModHandler& mod_handler,
                      CBioseq& bioseq,
                      TSkippedMods& skipped_mods,
                      FReportError fPostMessage)
{
    Apply(mod_handler, bioseq, skipped_mods, false, fPostMessage);
}
442 
443 
Apply(const CModHandler & mod_handler,CBioseq & bioseq,TSkippedMods & skipped_mods,bool logInfo,FReportError fPostMessage)444 void CModAdder::Apply(const CModHandler& mod_handler,
445                       CBioseq& bioseq,
446                       TSkippedMods& skipped_mods,
447                       bool logInfo,
448                       FReportError fPostMessage)
449 {
450     skipped_mods.clear();
451 
452     CDescrModApply descr_mod_apply(bioseq,
453                                    fPostMessage,
454                                    skipped_mods);
455 
456     CFeatModApply feat_mod_apply(bioseq,
457                                  fPostMessage,
458                                  skipped_mods);
459 
460     list<string> applied_mods;
461     for (const auto& mod_entry : mod_handler.GetMods()) {
462         try {
463             bool applied = false;
464             if (descr_mod_apply.Apply(mod_entry)) {
465                 const string& mod_name = x_GetModName(mod_entry);
466                 if (mod_name == "secondary-accession"){
467                     x_SetHist(mod_entry, bioseq.SetInst());
468                 }
469                 else if (mod_name == "mol-type") {
470                     // mol-type appears before molecule in the default-ordered
471                     // map keys. Therefore, if both mol-type and molecule are
472                     // specified, molecule will take precedence over (or, more precisly, overwrite)
473                     // the information extracted from mol-type when setting Seq-inst::mol
474                     x_SetMoleculeFromMolType(mod_entry, bioseq.SetInst());
475                 }
476                 applied = true;
477             }
478             else
479             if (x_TrySeqInstMod(mod_entry, bioseq.SetInst(), skipped_mods, fPostMessage) ||
480                 feat_mod_apply.Apply(mod_entry)) {
481                 applied = true;
482             }
483 
484             if (applied) {
485                 if (logInfo) {
486                     applied_mods.push_back(x_GetModName(mod_entry));
487                 }
488                 continue;
489             }
490 
491             // Report unrecognised modifier
492             if (fPostMessage) {
493                 skipped_mods.insert(skipped_mods.end(),
494                     mod_entry.second.begin(),
495                     mod_entry.second.end());
496 
497                 for (const auto& modData : mod_entry.second) {
498                     string msg = "Unrecognized modifier: " + modData.GetName() + ".";
499                     fPostMessage(modData, msg, eDiag_Warning, eModSubcode_Unrecognized);
500                 }
501                 continue;
502             }
503             string canonicalName = x_GetModName(mod_entry);
504             string msg = "Unrecognized modifier: " + canonicalName + ".";
505             NCBI_THROW(CModReaderException, eUnknownModifier, msg);
506         }
507         catch(const CModReaderException& e) {
508             skipped_mods.insert(skipped_mods.end(),
509                     mod_entry.second.begin(),
510                     mod_entry.second.end());
511             if (fPostMessage) {
512                 string canonicalName = x_GetModName(mod_entry);
513                 fPostMessage(CModData( canonicalName, kEmptyStr), e.GetMsg(), eDiag_Error, eModSubcode_Undefined);
514             }
515             else {
516                 throw; // rethrow e
517             }
518         }
519     }
520 
521     if (!applied_mods.empty()) {
522         string msg = "Applied mods: ";
523         for (const auto& applied_mod : applied_mods) {
524             msg += " " + applied_mod;
525         }
526         fPostMessage(CModData("",""), msg, eDiag_Info, eModSubcode_Applied);
527     }
528 }
529 
530 
x_ReportInvalidValue(const CModData & mod_data,TSkippedMods & skipped_mods,FReportError fPostMessage)531 void CModAdder::x_ReportInvalidValue(const CModData& mod_data,
532                                     TSkippedMods& skipped_mods,
533                                     FReportError fPostMessage)
534 {
535     const auto& mod_name = mod_data.GetName();
536     const auto& mod_value = mod_data.GetValue();
537     string msg = "Invalid value: " + mod_name + "=" + mod_value + ".";
538 
539     if (fPostMessage) {
540         fPostMessage(mod_data, msg, eDiag_Error, eModSubcode_InvalidValue);
541         skipped_mods.push_back(mod_data);
542         return;
543     }
544 
545     NCBI_THROW(CModReaderException, eInvalidValue, msg);
546 }
547 
548 
549 
// Returns the name (map key) of a modifier entry by delegating to
// CModHandler::GetCanonicalName(const TModEntry&).
const string& CModAdder::x_GetModName(const TModEntry& mod_entry)
{
    return CModHandler::GetCanonicalName(mod_entry);
}
554 
555 
// Returns the single value of a modifier entry by delegating to
// CModHandler::AssertReturnSingleValue(), which asserts that the entry holds
// exactly one value.
const string& CModAdder::x_GetModValue(const TModEntry& mod_entry)
{
    return CModHandler::AssertReturnSingleValue(mod_entry);
}
560 
561 
x_TrySeqInstMod(const TModEntry & mod_entry,CSeq_inst & seq_inst,TSkippedMods & skipped_mods,FReportError fPostMessage)562 bool CModAdder::x_TrySeqInstMod(
563         const TModEntry& mod_entry,
564         CSeq_inst& seq_inst,
565         TSkippedMods& skipped_mods,
566         FReportError fPostMessage)
567 {
568     const auto& mod_name = x_GetModName(mod_entry);
569 
570     if (mod_name == "strand") {
571         x_SetStrand(mod_entry, seq_inst, skipped_mods, fPostMessage);
572         return true;
573     }
574 
575     if (mod_name == "molecule") {
576         x_SetMolecule(mod_entry, seq_inst, skipped_mods, fPostMessage);
577         return true;
578     }
579 
580     if (mod_name == "topology") {
581         x_SetTopology(mod_entry, seq_inst, skipped_mods, fPostMessage);
582         return true;
583     }
584 
585 //   Note that we do not check for the 'secondary-accession' modifier here.
586 //   secondary-accession also modifies the GB_block descriptor
587 //   The check for secondary-accession and any resulting call
588 //   to x_SetHist is performed before x_TrySeqInstMod
589 //   is invoked.
590 
591     return false;
592 }
593 
594 
595 
x_SetStrand(const TModEntry & mod_entry,CSeq_inst & seq_inst,TSkippedMods & skipped_mods,FReportError fPostMessage)596 void CModAdder::x_SetStrand(const TModEntry& mod_entry,
597                             CSeq_inst& seq_inst,
598                             TSkippedMods& skipped_mods,
599                             FReportError fPostMessage)
600 {
601     string value = x_GetModValue(mod_entry);
602     const auto it = s_StrandStringToEnum.find(g_GetNormalizedModVal(value));
603     if (it == s_StrandStringToEnum.end()) {
604         x_ReportInvalidValue(mod_entry.second.front(), skipped_mods, fPostMessage);
605         return;
606     }
607     seq_inst.SetStrand(it->second);
608 }
609 
610 
x_SetMolecule(const TModEntry & mod_entry,CSeq_inst & seq_inst,TSkippedMods & skipped_mods,FReportError fPostMessage)611 void CModAdder::x_SetMolecule(const TModEntry& mod_entry,
612                               CSeq_inst& seq_inst,
613                               TSkippedMods& skipped_mods,
614                               FReportError fPostMessage)
615 {
616     string value = x_GetModValue(mod_entry);
617     const auto it = s_MolStringToEnum.find(g_GetNormalizedModVal(value));
618     if (it == s_MolStringToEnum.end()) {
619         x_ReportInvalidValue(mod_entry.second.front(), skipped_mods, fPostMessage);
620         return;
621     }
622     seq_inst.SetMol(it->second);
623 }
624 
625 
x_SetMoleculeFromMolType(const TModEntry & mod_entry,CSeq_inst & seq_inst)626 void CModAdder::x_SetMoleculeFromMolType(const TModEntry& mod_entry, CSeq_inst& seq_inst)
627 {
628     string value = x_GetModValue(mod_entry);
629     auto it = g_BiomolStringToEnum.find(g_GetNormalizedModVal(value));
630     if (it == g_BiomolStringToEnum.end()) {
631         // No need to report an error here.
632         // The error is reported in x_SetMolInfoType
633         return;
634     }
635     CSeq_inst::EMol mol = g_BiomolEnumToMolEnum.at(it->second);
636     seq_inst.SetMol(mol);
637 }
638 
639 
x_SetTopology(const TModEntry & mod_entry,CSeq_inst & seq_inst,TSkippedMods & skipped_mods,FReportError fPostMessage)640 void CModAdder::x_SetTopology(const TModEntry& mod_entry,
641                               CSeq_inst& seq_inst,
642                               TSkippedMods& skipped_mods,
643                               FReportError fPostMessage)
644 {
645     string value = x_GetModValue(mod_entry);
646     const auto it = s_TopologyStringToEnum.find(g_GetNormalizedModVal(value));
647     if (it == s_TopologyStringToEnum.end()) {
648         x_ReportInvalidValue(mod_entry.second.front(), skipped_mods, fPostMessage);
649         return;
650     }
651     seq_inst.SetTopology(it->second);
652 }
653 
654 
x_SetHist(const TModEntry & mod_entry,CSeq_inst & seq_inst)655 void CModAdder::x_SetHist(const TModEntry& mod_entry, CSeq_inst& seq_inst)
656 {
657     list<string> id_list;
658     for (const auto& mod : mod_entry.second) {
659         const auto& vals = mod.GetValue();
660         list<CTempString> value_sublist;
661         NStr::Split(vals, ",; \t", value_sublist, NStr::fSplit_Tokenize);
662         for (const auto& val : value_sublist) {
663             string value = NStr::TruncateSpaces_Unsafe(val);
664             try {
665                 SSeqIdRange idrange(value);
666                 id_list.insert(id_list.end(), idrange.begin(), idrange.end());
667             }
668             catch (...)
669             {
670                 id_list.push_back(value);
671             }
672         }
673     }
674 
675     if (id_list.empty()) {
676         return;
677     }
678 
679     list<CRef<CSeq_id>> secondary_ids;
680     // try catch statement
681     transform(id_list.begin(), id_list.end(), back_inserter(secondary_ids),
682             [](const string& id_string) { return Ref(new CSeq_id(id_string)); });
683 
684     seq_inst.SetHist().SetReplaces().SetIds() = move(secondary_ids);
685 }
686 
687 
// Creates a reporter bound to a sequence id and line number.
//
// seqId             Stored and attached to every message built by operator().
// lineNum           Stored and attached to every message built by operator().
// pMessageListener  Listener that receives the messages. The pointer is
//                   stored as given; a null pointer is allowed and makes
//                   operator() fall back to its listener-less behavior.
CDefaultModErrorReporter::CDefaultModErrorReporter(
        const string& seqId,
        int lineNum,
        IObjtoolsListener* pMessageListener)
    : m_SeqId(seqId),
      m_LineNum(lineNum),
      m_pMessageListener(pMessageListener)
    {}
696 
697 
operator ()(const CModData & mod,const string & msg,EDiagSev sev,EModSubcode subcode)698 void CDefaultModErrorReporter::operator()(
699     const CModData& mod,
700     const string& msg,
701     EDiagSev sev,
702     EModSubcode subcode)
703 {
704     if (!m_pMessageListener) {
705         if (sev == eDiag_Info) {
706             return;
707         }
708         if (sev == eDiag_Warning) {
709             ERR_POST(Warning << msg);
710             return;
711         }
712         NCBI_THROW2(CObjReaderParseException, eFormat, msg, 0);
713     }
714 
715 
716     if (!m_pMessageListener->SevEnabled(sev)) {
717         return;
718     }
719 
720     AutoPtr<CLineErrorEx> pErr(
721         CLineErrorEx::Create(
722             ILineError::eProblem_GeneralParsingError,
723             sev,
724             EReaderCode::eReader_Mods,
725             subcode,
726             m_SeqId,
727             m_LineNum,
728             msg,
729             "",
730             mod.GetName(),
731             mod.GetValue()));
732 
733     if (!m_pMessageListener->PutMessage(*pErr)) {
734         NCBI_THROW2(CObjReaderParseException, eFormat, msg, 0);
735     }
736 }
737 
738 
Apply(const CTempString & title,TModList & mods,string & remainder)739 void CTitleParser::Apply(const CTempString& title, TModList& mods, string& remainder)
740 {
741     mods.clear();
742     remainder.clear();
743     size_t start_pos = 0;
744     while(start_pos < title.size()) {
745         size_t lb_pos, end_pos, eq_pos;
746         lb_pos = start_pos;
747         if (x_FindBrackets(title, lb_pos, end_pos, eq_pos)) {
748             if (eq_pos < end_pos) {
749                 if ((lb_pos > start_pos) ) {
750                     auto left_remainder = NStr::TruncateSpaces_Unsafe(title.substr(start_pos, lb_pos-start_pos));
751                     if (!left_remainder.empty()) {
752                         if (!remainder.empty()) {
753                             remainder.append(" ");
754                         }
755                         remainder.append(left_remainder);
756                     }
757                 }
758                 auto name = NStr::TruncateSpaces_Unsafe(title.substr(lb_pos+1, eq_pos-(lb_pos+1)));
759                 auto value = NStr::TruncateSpaces_Unsafe(title.substr(eq_pos+1, end_pos-(eq_pos+1)));
760                 mods.emplace_back(name, value);
761             }
762             start_pos = end_pos+1;
763         }
764         else {
765             auto right_remainder = NStr::TruncateSpaces_Unsafe(title.substr(start_pos));
766             if (!right_remainder.empty()) {
767                 if (!remainder.empty()) {
768                     remainder.append(" ");
769                 }
770                 remainder.append(right_remainder);
771             }
772             return;
773         }
774     }
775 }
776 
777 
HasMods(const CTempString & title)778 bool CTitleParser::HasMods(const CTempString& title)
779 {
780     size_t start_pos = 0;
781     while (start_pos < title.size()) {
782         size_t lb_pos, end_pos, eq_pos;
783         lb_pos = start_pos;
784         if (x_FindBrackets(title, lb_pos, end_pos, eq_pos)) {
785             if (eq_pos < end_pos) {
786                 return true;
787             }
788             start_pos = end_pos+1;
789         }
790         else {
791             return false;
792         }
793     }
794     return false;
795 }
796 
797 
x_FindBrackets(const CTempString & line,size_t & start,size_t & stop,size_t & eq_pos)798 bool CTitleParser::x_FindBrackets(const CTempString& line, size_t& start, size_t& stop, size_t& eq_pos)
799 { // Copied from CSourceModParser
800     size_t i = start;
801 
802     eq_pos = CTempString::npos;
803     const char* s = line.data() + start;
804 
805     int num_unmatched_left_brackets = 0;
806     while (i < line.size())
807     {
808         switch (*s)
809         {
810         case '[':
811             num_unmatched_left_brackets++;
812             if (num_unmatched_left_brackets == 1)
813             {
814                 start = i;
815             }
816             break;
817         case '=':
818             if (num_unmatched_left_brackets > 0 && eq_pos == CTempString::npos) {
819                 eq_pos = i;
820             }
821             break;
822         case ']':
823             if (num_unmatched_left_brackets == 1)
824             {
825                 stop = i;
826                 return (eq_pos<stop);
827             }
828             else
829             if (num_unmatched_left_brackets == 0) {
830                 return false;
831             }
832             else
833             {
834                 num_unmatched_left_brackets--;
835             }
836         }
837         i++; s++;
838     }
839     return false;
840 };
841 
842 
843 END_SCOPE(objects)
844 END_NCBI_SCOPE
845 
846 
847 
848