1 /*  $Id: source_mod_parser.cpp 632526 2021-06-02 17:25:01Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:  Aaron Ucko, Jonathan Kans, Vasuki Gobi, Michael Kornbluh
27 *
28 * File Description:
29 *   Parser for source modifiers, as found in (Sequin-targeted) FASTA files.
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include <sstream>
37 
38 #include <objtools/readers/source_mod_parser.hpp>
39 #include <objtools/readers/message_listener.hpp>
40 
41 #include <corelib/ncbiutil.hpp>
42 #include <util/static_map.hpp>
43 #include <serial/enumvalues.hpp>
44 
45 #include <objects/general/Dbtag.hpp>
46 #include <objects/general/Object_id.hpp>
47 #include <objects/general/User_field.hpp>
48 #include <objects/misc/sequence_macros.hpp>
49 #include <objects/pub/Pub.hpp>
50 #include <objects/pub/Pub_equiv.hpp>
51 #include <objects/seq/Bioseq.hpp>
52 #include <objects/seq/Pubdesc.hpp>
53 #include <objects/seq/Seq_annot.hpp>
54 #include <objects/seq/Seq_data.hpp>
55 #include <objects/seq/Seq_hist_rec.hpp>
56 #include <objects/seq/Seq_inst.hpp>
57 #include <objects/seq/Seqdesc.hpp>
58 #include <objects/seqfeat/Org_ref.hpp>
59 #include <objects/seqfeat/OrgMod.hpp>
60 #include <objects/seqfeat/OrgName.hpp>
61 #include <objects/seqfeat/PCRReactionSet.hpp>
62 #include <objects/seqfeat/PCRReaction.hpp>
63 #include <objects/seqfeat/PCRPrimer.hpp>
64 #include <objects/seqfeat/PCRPrimerSet.hpp>
65 #include <objects/seqfeat/Seq_feat.hpp>
66 #include <objects/seqfeat/SubSource.hpp>
67 #include <objects/seqloc/Seq_id.hpp>
68 #include <objects/seqloc/Seq_loc.hpp>
69 
70 #include <objects/general/User_object.hpp>
71 
72 BEGIN_NCBI_SCOPE
73 BEGIN_SCOPE(objects)
74 
75 namespace
76 {
77     class equal_subtype
78     {
79     public:
equal_subtype(CSubSource::TSubtype st)80         equal_subtype(CSubSource::TSubtype st) : m_st(st){};
operator ()(const CRef<CSubSource> & st) const81         bool operator()(const CRef<CSubSource>& st) const
82         {
83             return st->IsSetSubtype() && (st->GetSubtype() == m_st);
84         }
85     private:
86         CSubSource::TSubtype m_st;
87     };
88 
// Guard against a clash with a macro of the same name defined elsewhere;
// STATIC_SMOD is only meant to exist between here and the #undef below.
#ifdef STATIC_SMOD
#  error "STATIC_SMOD already defined"
#endif

    // The macro makes sure that the variable's name matches its key.
    // kKeyCanonicalizationTable maps '_' to '-', so a key spelled with '_'
    // here matches modifier names written with either character.
    //
    // STATIC_SMOD(key) defines three constants:
    //   s_Mod_s_<key>  - the key text as a char array
    //   s_Mod_n_<key>  - its length, excluding the terminating NUL
    //   s_Mod_<key>    - a CTempString built from that array and length


#define STATIC_SMOD(key_str) \
    const char   s_Mod_s_##key_str[] = #key_str; \
    const size_t s_Mod_n_##key_str = sizeof(#key_str)-1; \
    const CTempString s_Mod_##key_str(s_Mod_s_##key_str, s_Mod_n_##key_str)


    // For CBioseq
    STATIC_SMOD(topology);
    STATIC_SMOD(top);
    STATIC_SMOD(molecule);
    STATIC_SMOD(mol);
    STATIC_SMOD(moltype);
    STATIC_SMOD(mol_type);
    STATIC_SMOD(strand);
    STATIC_SMOD(comment);

    // For CBioSource
    STATIC_SMOD(organism);
    STATIC_SMOD(org);
    STATIC_SMOD(taxname);
    STATIC_SMOD(taxid);
    STATIC_SMOD(location);
    STATIC_SMOD(origin);
    STATIC_SMOD(sub_clone);
    STATIC_SMOD(lat_long);
    STATIC_SMOD(latitude_longitude);
    STATIC_SMOD(fwd_primer_seq);
    STATIC_SMOD(fwd_pcr_primer_seq);
    STATIC_SMOD(rev_primer_seq);
    STATIC_SMOD(rev_pcr_primer_seq);
    STATIC_SMOD(fwd_primer_name);
    STATIC_SMOD(fwd_pcr_primer_name);
    STATIC_SMOD(rev_primer_name);
    STATIC_SMOD(rev_pcr_primer_name);
    STATIC_SMOD(dbxref);
    STATIC_SMOD(db_xref);
    STATIC_SMOD(division);
    STATIC_SMOD(div);
    STATIC_SMOD(lineage);
    STATIC_SMOD(gcode);
    STATIC_SMOD(mgcode);
    STATIC_SMOD(pgcode);
    STATIC_SMOD(note);
    STATIC_SMOD(notes);
    STATIC_SMOD(focus);

    // For CMolInfo
    STATIC_SMOD(tech);
    STATIC_SMOD(completeness);
    STATIC_SMOD(completedness);

    // For CGene_ref
    STATIC_SMOD(gene);
    STATIC_SMOD(allele);
    STATIC_SMOD(gene_syn);
    STATIC_SMOD(gene_synonym);
    STATIC_SMOD(locus_tag);

    // For CProt_ref
    STATIC_SMOD(protein);
    STATIC_SMOD(prot);
    STATIC_SMOD(prot_desc);
    STATIC_SMOD(protein_desc);
    STATIC_SMOD(EC_number);
    STATIC_SMOD(activity);
    STATIC_SMOD(function);

    // For CGB_block
    STATIC_SMOD(secondary_accession);
    STATIC_SMOD(secondary_accessions);
    STATIC_SMOD(keyword);
    STATIC_SMOD(keywords);

    // NOTE(review): presumably consumed by the DBLink handling - confirm
    STATIC_SMOD(biosample);
    STATIC_SMOD(bioproject);
    // For TPA Mods (CUser_object)
    STATIC_SMOD(primary);
    STATIC_SMOD(primary_accessions);
    // For SRA (Sequence Read Archive) CUser_object
    STATIC_SMOD(SRA);

    // For Genome Project DB Mods (CUser_object)
    STATIC_SMOD(project);
    STATIC_SMOD(projects);

    // For Pub Mods (CSeq_descr)
    STATIC_SMOD(PubMed);
    STATIC_SMOD(PMID);


#undef STATIC_SMOD
189 
190     typedef set<const char*, CSourceModParser::PKeyCompare> TSModNameSet;
191 
192     // Loads up a map of SMod to subtype
193     template<typename TEnum,
194              typename TSModEnumMap = map<CSourceModParser::SMod, TEnum>,
195              typename TEnumNameToValMap = map<string, TEnum>>
196     TSModEnumMap * s_InitSmodToEnumMap(
197         const CEnumeratedTypeValues* etv,
198         // names to skip
199         const TSModNameSet & skip_enum_names,
200         // extra values to add that aren't in the enum
201         const TEnumNameToValMap & extra_enum_names_to_vals )
202     {
203         unique_ptr<TSModEnumMap> smod_enum_map(new TSModEnumMap);
204 
205         ITERATE (CEnumeratedTypeValues::TValues, it, etv->GetValues()) {
206             const string & enum_name = it->first;
207             const TEnum enum_val = static_cast<TEnum>(it->second);
208             if( skip_enum_names.find(enum_name.c_str()) !=
209                 skip_enum_names.end() )
210             {
211                 // skip this tag
212                 continue;
213             }
214             auto emplace_result =
215                 smod_enum_map->emplace(
216                     CSourceModParser::SMod(enum_name), enum_val);
217             // emplace must succeed
218             if( ! emplace_result.second) {
219                 NCBI_USER_THROW_FMT(
220                    "s_InitSmodToEnumMap " << enum_name);
221             }
222         }
223 
224         for(auto extra_smod_to_enum : extra_enum_names_to_vals) {
225             auto emplace_result =
226                 smod_enum_map->emplace(
227                     CSourceModParser::SMod(extra_smod_to_enum.first),
228                     extra_smod_to_enum.second);
229             // emplace must succeed
230             if( ! emplace_result.second) {
231                 NCBI_USER_THROW_FMT(
232                    "s_InitSmodToEnumMap " << extra_smod_to_enum.first);
233             }
234         }
235 
236         return smod_enum_map.release();
237     }
238 
239     typedef map<CSourceModParser::SMod, COrgMod::ESubtype> TSModOrgSubtypeMap;
240 
s_InitSModOrgSubtypeMap(void)241     TSModOrgSubtypeMap * s_InitSModOrgSubtypeMap(void)
242     {
243         const TSModNameSet kDeprecatedOrgSubtypes{
244             "dosage", "old-lineage", "old-name"};
245         const map<const char*, COrgMod::ESubtype> extra_smod_to_enum_names {
246             { "subspecies",    COrgMod::eSubtype_sub_species },
247             { "host",          COrgMod::eSubtype_nat_host    },
248             { "specific-host", COrgMod::eSubtype_nat_host    },
249         };
250 
251         return s_InitSmodToEnumMap<COrgMod::ESubtype>(
252             COrgMod::GetTypeInfo_enum_ESubtype(),
253             kDeprecatedOrgSubtypes,
254             extra_smod_to_enum_names
255         );
256     }
257 
258     // The subtype SMods are loaded from the names of the enum
259     // and they map to ESubtype enum values so we can't just use STATIC_SMOD
260     CSafeStatic<TSModOrgSubtypeMap> kSModOrgSubtypeMap(s_InitSModOrgSubtypeMap,
261                                                  nullptr);
262 
263     typedef map<CSourceModParser::SMod,
264                 CSubSource::ESubtype> TSModSubSrcSubtype;
265 
s_InitSModSubSrcSubtypeMap(void)266     TSModSubSrcSubtype * s_InitSModSubSrcSubtypeMap(void)
267     {
268         // some are skipped because they're handled specially and some are
269         // skipped because they're deprecated
270         TSModNameSet skip_enum_names {
271             // skip because handled specially elsewhere
272             "fwd_primer_seq", "rev_primer_seq",
273             "fwd_primer_name", "rev_primer_name",
274             "fwd_PCR_primer_seq", "rev_PCR_primer_seq",
275             "fwd_PCR_primer_name", "rev_PCR_primer_name",
276             // skip because deprecated
277             "transposon_name",
278             "plastid_name",
279             "insertion_seq_name",
280         };
281         const map<string, CSubSource::ESubtype> extra_smod_to_enum_names {
282             { "sub-clone",          CSubSource::eSubtype_subclone },
283             { "lat-long",           CSubSource::eSubtype_lat_lon  },
284             { "latitude-longitude", CSubSource::eSubtype_lat_lon  },
285         };
286         return s_InitSmodToEnumMap<CSubSource::ESubtype>(
287             CSubSource::GetTypeInfo_enum_ESubtype(),
288             skip_enum_names,
289             extra_smod_to_enum_names );
290     }
291 
292     CSafeStatic<TSModSubSrcSubtype> kSModSubSrcSubtypeMap(
293         s_InitSModSubSrcSubtypeMap, nullptr);
294 
x_FindBrackets(const CTempString & str,size_t & start,size_t & stop,size_t & eq_pos)295     bool x_FindBrackets(const CTempString& str, size_t& start, size_t& stop, size_t& eq_pos)
296     {
297         size_t i = start;
298         eq_pos = CTempString::npos;
299 
300         const char* s = str.data() + start;
301 
302         int nested_brackets = -1;
303         while (i < str.size())
304         {
305             switch (*s)
306             {
307             case '[':
308                 nested_brackets++;
309                 if (nested_brackets == 0)
310                 {
311                     start = i;
312                 }
313                 break;
314             case '=':
315                 if (nested_brackets >= 0)
316                 if (eq_pos == CTempString::npos)
317                     eq_pos = i;
318                 break;
319             case ']':
320                 if (nested_brackets == 0)
321                 {
322                     stop = i;
323                     if (eq_pos == CTempString::npos)
324                         eq_pos = i;
325                     return true;
326                 }
327                 else
328                 if (nested_brackets < 0)
329                     return false;
330                 else
331                 {
332                     nested_brackets--;
333                 }
334             }
335             i++; s++;
336         }
337         return false;
338     };
339 
x_AppendIfNonEmpty(string & s,const CTempString & o)340     void x_AppendIfNonEmpty(string& s, const CTempString& o)
341     {
342         if (!o.empty())
343         {
344             if (!s.empty())
345                 s.push_back(' ');
346             s.append(o.data(), o.length());
347         }
348     }
349 
350 };
351 
352 
// Definition of the static kEmptyMod member.
CSafeStatic<CSourceModParser::SMod> CSourceModParser::kEmptyMod;

// Byte-to-byte translation table used to canonicalize modifier keys:
// ASCII letters to lowercase, space and underscore to hyphen; every other
// byte value maps to itself.
// The array has 257 elements: 256 table entries (eight rows of 32 below the
// first two control-character rows of 16) plus the terminating NUL of the
// string literal.
const unsigned char CSourceModParser::kKeyCanonicalizationTable[257] =
    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
    "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
    "-!\"#$%&'()*+,-./0123456789:;<=>?"
    "@abcdefghijklmnopqrstuvwxyz[\\]^-"
    "`abcdefghijklmnopqrstuvwxyz{|}~\x7F"
    "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F"
    "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F"
    "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
    "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"
    "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF"
    "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF"
    "\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF"
    "\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
370 
371 
/// Pointer-like wrapper giving access to an object of type _T that is either
/// supplied directly or obtained on first dereference from a descriptor of
/// the requested CSeqdesc choice.
///
/// The target can be a CSeq_descr, a CBioseq or a CBioseq_set (the object is
/// then resolved by _getfromdesc(), which is specialized per _T), or an
/// existing _T instance.
template<class _T>
class CAutoInitDesc : protected CAutoAddDesc
{
public:
    /// Work on the given descriptor set.
    CAutoInitDesc(CSeq_descr& descr, CSeqdesc::E_Choice which);
    /// Work on the descriptors of @p bioseq, fetched on first dereference.
    CAutoInitDesc(CBioseq& bioseq, CSeqdesc::E_Choice which);
    /// Work on the descriptors of @p bioset, fetched on first dereference.
    CAutoInitDesc(CBioseq_set& bioset, CSeqdesc::E_Choice which);
    /// Wrap an already existing object; no descriptor lookup takes place.
    CAutoInitDesc(_T& obj);
    /// Returns the wrapped object, resolving it first if necessary.
    _T* operator->();
    /// Same as operator->(), returning a reference.
    _T& operator*();
protected:
    _T* m_ptr;                          // resolved object; null until first access unless wrapped directly
    void _getfromdesc();                // sets m_ptr from the descriptor (specialized per _T)
    mutable CRef<CBioseq> m_bioseq;     // set only by the CBioseq constructor
    mutable CRef<CBioseq_set> m_bioset; // set only by the CBioseq_set constructor
};
388 
389 class CAutoAddDBLink
390 {
391 public:
CAutoAddDBLink(CBioseq & seq,const CTempString & id)392     CAutoAddDBLink(CBioseq& seq, const CTempString& id)
393       :m_bioseq(seq), m_id(id)
394     {
395     }
IsInitialised() const396     bool IsInitialised() const
397     {
398         return !m_dblink.Empty();
399     }
400 
Get()401     CUser_field& Get()
402     {
403         if (m_dblink)
404             return *m_dblink;
405 
406         for (auto& d : m_bioseq.SetDescr().Set())
407         {
408             if (d->IsUser() && d->GetUser().IsDBLink())
409             {
410                 for (auto& u : d->SetUser().SetData())
411                 {
412                     if (u->IsSetLabel() && u->GetLabel().IsStr() &&
413                         NStr::EqualCase(u->GetLabel().GetStr(), m_id))
414                     {
415                         m_dblink = u;
416                         return *m_dblink;
417                     }
418                 }
419             }
420         }
421         if (m_dblink.Empty())
422         {
423             m_user_obj.Reset(new CSeqdesc);
424             m_user_obj->SetUser().SetType().SetStr() = "DBLink";
425             m_dblink.Reset(new CUser_field);
426             m_dblink->SetLabel().SetStr() = m_id;
427             m_user_obj->SetUser().SetData().push_back(m_dblink);
428             m_bioseq.SetDescr().Set().push_back(m_user_obj);
429         }
430 
431         return *m_dblink;
432     }
433 protected:
434     CBioseq& m_bioseq;
435     CTempString m_id;
436     CRef<CUser_field> m_dblink;
437     CRef<CSeqdesc> m_user_obj;
438 };
439 
440 CSafeStaticRef<CSeq_descr> fake_descr;
441 
442 template<class _T>
443 inline
CAutoInitDesc(CSeq_descr & descr,CSeqdesc::E_Choice which)444 CAutoInitDesc<_T>::CAutoInitDesc(CSeq_descr& descr, CSeqdesc::E_Choice which) :
445   CAutoAddDesc(descr, which),
446   m_ptr(0)
447 {
448 }
449 
450 template<class _T>
451 inline
CAutoInitDesc(CBioseq & bioseq,CSeqdesc::E_Choice which)452 CAutoInitDesc<_T>::CAutoInitDesc(CBioseq& bioseq, CSeqdesc::E_Choice which) :
453   CAutoAddDesc(*fake_descr, which),
454    m_ptr(0),
455    m_bioseq(&bioseq)
456 {
457     m_descr.Reset();
458 }
459 
460 template<class _T>
461 inline
CAutoInitDesc(CBioseq_set & bioset,CSeqdesc::E_Choice which)462 CAutoInitDesc<_T>::CAutoInitDesc(CBioseq_set& bioset, CSeqdesc::E_Choice which) :
463   CAutoAddDesc(*fake_descr, which),
464   m_ptr(0),
465   m_bioset(&bioset)
466 
467 {
468     m_descr.Reset();
469 }
470 
471 template<class _T>
472 inline
CAutoInitDesc(_T & obj)473 CAutoInitDesc<_T>::CAutoInitDesc(_T& obj):
474    CAutoAddDesc(*fake_descr, CSeqdesc::e_not_set), m_ptr(&obj)
475 {
476     m_descr.Reset();
477 }
478 
479 
480 template<class _T>
481 inline
operator *()482 _T& CAutoInitDesc<_T>::operator*()
483 {
484     return * operator->();
485 }
486 
487 template<class _T>
488 inline
operator ->()489 _T* CAutoInitDesc<_T>::operator->()
490 {
491     if (m_ptr == 0 &&
492         m_which != CSeqdesc::e_not_set)
493     {
494       if (m_descr.Empty())
495       {
496         if (!m_bioseq.Empty())
497           m_descr = &m_bioseq->SetDescr();
498         else
499         if (!m_bioset.Empty())
500           m_descr = &m_bioset->SetDescr();
501       }
502       _getfromdesc();
503     }
504 
505     return m_ptr;
506 }
507 
// Per-type specializations of _getfromdesc(): each one points m_ptr at the
// matching variant of the descriptor returned by Set().

/// CBioSource: uses the descriptor's Source variant.
template<>
void CAutoInitDesc<CBioSource>::_getfromdesc()
{
    m_ptr = &Set().SetSource();
}

/// CMolInfo: uses the descriptor's Molinfo variant.
template<>
void CAutoInitDesc<CMolInfo>::_getfromdesc()
{
    m_ptr = &Set().SetMolinfo();
}

/// CGB_block: uses the descriptor's Genbank variant.
template<>
void CAutoInitDesc<CGB_block>::_getfromdesc()
{
    m_ptr = &Set().SetGenbank();
}
525 
526 
ParseTitle(const CTempString & title,CConstRef<CSeq_id> seqid,size_t iMaxModsToParse)527 string CSourceModParser::ParseTitle(const CTempString& title,
528     CConstRef<CSeq_id> seqid,
529     size_t iMaxModsToParse )
530 {
531     SMod   mod;
532     string stripped_title;
533     size_t pos = 0;
534 
535     m_Mods.clear();
536 
537     mod.seqid = seqid;
538 
539     size_t iModsFoundSoFar = 0;
540     for (; (pos < title.size()) && (iModsFoundSoFar < iMaxModsToParse);
541         ++iModsFoundSoFar )
542     {
543         size_t lb_pos, end_pos, eq_pos;
544         lb_pos = pos;
545         if (x_FindBrackets(title, lb_pos, end_pos, eq_pos))
546         {
547             CTempString skipped = NStr::TruncateSpaces_Unsafe(title.substr(pos, lb_pos - pos));
548 
549             if (eq_pos < end_pos) {
550                 CTempString key = NStr::TruncateSpaces_Unsafe(title.substr(lb_pos+1, eq_pos - lb_pos - 1));
551                 CTempString value = NStr::TruncateSpaces_Unsafe(title.substr(eq_pos + 1, end_pos - eq_pos - 1));
552 
553                 mod.key = key;
554                 mod.value = value;
555                 mod.pos = lb_pos;
556                 mod.used = false;
557                 m_Mods.emplace(mod);
558             }
559 
560             x_AppendIfNonEmpty(stripped_title, skipped);
561 
562             pos = end_pos + 1;
563         }
564         else
565         { // rest of the title is unparsed
566             CTempString rest = NStr::TruncateSpaces_Unsafe(title.substr(pos));
567             x_AppendIfNonEmpty(stripped_title, rest);
568             break;
569         }
570     }
571 
572     return stripped_title;
573 }
574 
ApplyAllMods(CBioseq & seq,CTempString organism,CConstRef<CSeq_loc> location)575 void CSourceModParser::ApplyAllMods(CBioseq& seq, CTempString organism, CConstRef<CSeq_loc> location)
576 {
577     ApplyMods(seq);
578     // Although the logic below reuses some existing objects if
579     // present, it always creates new features and descriptors.
580 
581     {{
582         CRef<CSeq_id> best_id = FindBestChoice(seq.GetId(), CSeq_id::BestRank);
583         if (location.Empty() && !best_id.Empty())
584         {
585             CRef<CSeq_loc> loc(new CSeq_loc);
586             loc->SetWhole(*best_id);
587             location = loc;
588         }
589 
590         if (location)
591         {
592             CAutoInitRef<CSeq_annot> ftable;
593             bool                     had_ftable = false;
594 
595             if (seq.IsSetAnnot()) {
596                 NON_CONST_ITERATE (CBioseq::TAnnot, it, seq.SetAnnot()) {
597                     if ((*it)->GetData().IsFtable()) {
598                         ftable.Set(*it);
599                         had_ftable = true;
600                         break;
601                     }
602                 }
603             }
604 
605             // CGene_ref only on nucleotide seqs
606             if( ! FIELD_CHAIN_OF_2_IS_SET(seq, Inst, Mol) || seq.IsNa() ) {
607                 CAutoInitRef<CGene_ref> gene;
608                 x_ApplyMods(gene);
609                 if (gene.IsInitialized()) {
610                     CRef<CSeq_feat> feat(new CSeq_feat);
611                     feat->SetData().SetGene(*gene);
612                     feat->SetLocation().Assign(*location);
613                     ftable->SetData().SetFtable().push_back(feat);
614                 }
615             }
616 
617             // only add Prot_ref if amino acid (or at least not nucleic acid)
618             // (Yes, the FIELD_CHAIN_OF_2_IS_SET is necessary because IsAa()
619             // can throw an exception if mol isn't set)
620             if( ! FIELD_CHAIN_OF_2_IS_SET(seq, Inst, Mol) || seq.IsAa() ) {
621                 CAutoInitRef<CProt_ref> prot;
622                 x_ApplyMods(prot);
623                 if ( prot.IsInitialized() ) {
624                     CRef<CSeq_feat> feat(new CSeq_feat);
625                     feat->SetData().SetProt(*prot);
626                     feat->SetLocation().Assign(*location);
627                     ftable->SetData().SetFtable().push_back(feat);
628                 }
629             }
630 
631             if ( !had_ftable  &&  ftable.IsInitialized() ) {
632                 seq.SetAnnot().push_back(CRef<CSeq_annot>(&*ftable));
633             }
634         }
635     }}
636 
637     if (seq.GetInst().IsSetHist()) {
638         ApplyMods(seq.SetInst().SetHist());
639     } else {
640         CAutoInitRef<CSeq_hist> hist;
641         x_ApplyMods(hist);
642         if (hist.IsInitialized()) {
643             seq.SetInst().SetHist(*hist);
644         }
645     }
646 
647     {{
648         //CSeq_descr* descr = 0;
649         if (
650           seq.GetParentSet() && seq.GetParentSet()->IsSetClass() &&
651           seq.GetParentSet()->GetClass() == CBioseq_set::eClass_nuc_prot)
652         {
653             CBioseq_set& bioset = *(CBioseq_set*)(seq.GetParentSet().GetPointerOrNull());
654             //descr = &bioset.SetDescr();
655             CAutoInitDesc<CBioSource> bsrc(bioset, CSeqdesc::e_Source);
656             x_ApplyMods(bsrc, organism);
657         }
658         else
659         {
660           //descr = &seq.SetDescr();
661             CAutoInitDesc<CBioSource> bsrc(seq, CSeqdesc::e_Source);
662             x_ApplyMods(bsrc, organism);
663         }
664         //CAutoInitDesc<CBioSource> bsrc(*descr, CSeqdesc::e_Source);
665         //x_ApplyMods(bsrc, organism);
666     }}
667 
668     {{
669         CAutoInitDesc<CMolInfo> mi(seq, CSeqdesc::e_Molinfo);
670         x_ApplyMods(mi);
671     }}
672 
673     {{
674         CAutoInitDesc<CGB_block> gbb(seq, CSeqdesc::e_Genbank);
675         x_ApplyMods(gbb);
676     }}
677 
678     {{
679         CAutoInitRef<CUser_object> tpa;
680         x_ApplyTPAMods(tpa);
681         if (tpa.IsInitialized()) {
682             CRef<CSeqdesc> desc(new CSeqdesc);
683             desc->SetUser(*tpa);
684             seq.SetDescr().Set().push_back(desc);
685         }
686     }}
687 
688     x_ApplyDBLinkMods(seq);
689 
690     {{
691         CAutoInitRef<CUser_object> gpdb;
692         x_ApplyGenomeProjectsDBMods(gpdb);
693         if (gpdb.IsInitialized()) {
694             CRef<CSeqdesc> desc(new CSeqdesc);
695             desc->SetUser(*gpdb);
696             seq.SetDescr().Set().push_back(desc);
697         }
698     }}
699 
700     {{
701         ApplyPubMods(seq);
702     }}
703 
704     TMods unusedMods = GetMods(fUnusedMods);
705     for (TMods::const_iterator unused = unusedMods.begin(); unused != unusedMods.end(); ++unused) {
706         x_HandleUnkModValue(*unused);
707     }
708 };
709 
/// Value type of the mol-type lookup table: the biomol and Seq-inst mol that
/// a mol-type name stands for, plus whether the name is advertised to users.
struct SMolTypeInfo {

    // is it shown to the user as a possibility or just silently accepted?
    enum EShown {
        eShown_Yes, // Yes, show to user in error messages, etc.
        eShown_No   // No, don't show the user, but silently accept it if the user gives it to us
    };

    SMolTypeInfo(
        EShown eShown,
        CMolInfo::TBiomol eBiomol,
        CSeq_inst::EMol eMol ) :
        m_eBiomol(eBiomol), m_eMol(eMol), m_eShown(eShown)
    { }

    CMolInfo::TBiomol m_eBiomol; // biomol value for this name
    CSeq_inst::EMol   m_eMol;    // Seq-inst mol value for this name
    EShown m_eShown;             // whether to list this name to the user
};
typedef SStaticPair<const char*, SMolTypeInfo> TBiomolMapEntry;
// Mol-type names and what they map to. The map below is compared with
// CSourceModParser::PKeyCompare, so entries must stay sorted by their
// canonicalized key (see kKeyCanonicalizationTable: lowercase, with space
// and underscore treated as hyphen), not by their literal spelling.
static const TBiomolMapEntry sc_BiomolArray[] = {
    // careful with the sort: remember that the key is canonicalized first
    {"cRNA",                  SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_cRNA,            CSeq_inst::eMol_rna) },
    {"DNA",                   SMolTypeInfo(SMolTypeInfo::eShown_No,  CMolInfo::eBiomol_genomic,         CSeq_inst::eMol_dna) },
    {"Genomic",               SMolTypeInfo(SMolTypeInfo::eShown_No,  CMolInfo::eBiomol_genomic,         CSeq_inst::eMol_dna) },
    {"Genomic DNA",           SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_genomic,         CSeq_inst::eMol_dna) },
    {"Genomic RNA",           SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_genomic,         CSeq_inst::eMol_rna) },
    {"mRNA",                  SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_mRNA,            CSeq_inst::eMol_rna) },
    {"ncRNA",                 SMolTypeInfo(SMolTypeInfo::eShown_No,  CMolInfo::eBiomol_ncRNA,           CSeq_inst::eMol_rna) },
    {"non-coding RNA",        SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_ncRNA,           CSeq_inst::eMol_rna) },
    {"Other-Genetic",         SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_other_genetic,   CSeq_inst::eMol_other) },
    {"Precursor RNA",         SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_pre_RNA,         CSeq_inst::eMol_rna) },
    {"Ribosomal RNA",         SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_rRNA,            CSeq_inst::eMol_rna) },
    {"rRNA",                  SMolTypeInfo(SMolTypeInfo::eShown_No,  CMolInfo::eBiomol_rRNA,            CSeq_inst::eMol_rna) },
    {"Transcribed RNA",       SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_transcribed_RNA, CSeq_inst::eMol_rna) },
    {"Transfer-messenger RNA", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_tmRNA,           CSeq_inst::eMol_rna) },
    {"Transfer RNA",          SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_tRNA,            CSeq_inst::eMol_rna) },
    {"tRNA",                  SMolTypeInfo(SMolTypeInfo::eShown_No,  CMolInfo::eBiomol_tRNA,            CSeq_inst::eMol_rna) },
};
typedef CStaticPairArrayMap<const char*, SMolTypeInfo,
                        CSourceModParser::PKeyCompare>  TBiomolMap;
DEFINE_STATIC_ARRAY_MAP(TBiomolMap, sc_BiomolMap, sc_BiomolArray);
752 
ApplyMods(CBioseq & seq)753 void CSourceModParser::ApplyMods(CBioseq& seq)
754 {
755     const SMod* mod = NULL;
756 
757     // top[ology]
758     if ((mod = FindMod(s_Mod_topology, s_Mod_top)) != NULL) {
759         if (NStr::EqualNocase(mod->value, "linear")) {
760             seq.SetInst().SetTopology(CSeq_inst::eTopology_linear);
761         } else if (NStr::EqualNocase(mod->value, "circular")) {
762             seq.SetInst().SetTopology(CSeq_inst::eTopology_circular);
763         } else {
764             x_HandleBadModValue(*mod);
765         }
766     }
767 
768     // molecule information is not set for proteins at this time
769     // (Yes, the FIELD_CHAIN_OF_2_IS_SET is necessary because IsNa()
770     // can throw an exception if mol isn't set)
771     if( ! FIELD_CHAIN_OF_2_IS_SET(seq, Inst, Mol) || seq.IsNa() ) {
772         bool bMolSetViaMolMod = false;
773 
774         // mol[ecule]
775         if ((mod = FindMod(s_Mod_molecule, s_Mod_mol)) != NULL) {
776             if (NStr::EqualNocase(mod->value, "dna")) {
777                 seq.SetInst().SetMol( CSeq_inst::eMol_dna );
778                 bMolSetViaMolMod = true;
779             } else if (NStr::EqualNocase(mod->value, "rna")) {
780                 seq.SetInst().SetMol( CSeq_inst::eMol_rna );
781                 bMolSetViaMolMod = true;
782             } else {
783                 x_HandleBadModValue(*mod);
784             }
785         }
786 
787         // if mol/molecule not set right, we can use moltype instead
788 
789         // mol[-]type
790         if( ! bMolSetViaMolMod ) {
791             if ((mod = FindMod(s_Mod_moltype, s_Mod_mol_type)) != NULL) {
792                 TBiomolMap::const_iterator it = sc_BiomolMap.find(mod->value.c_str());
793                 if (it == sc_BiomolMap.end()) {
794                     x_HandleBadModValue(*mod);
795                 } else {
796                     // moltype sets biomol and inst.mol
797                     seq.SetInst().SetMol(it->second.m_eMol);
798                 }
799             }
800         }
801     }
802 
803     // strand
804     if ((mod = FindMod(s_Mod_strand)) != NULL) {
805         if (NStr::EqualNocase(mod->value, "single")) {
806             seq.SetInst().SetStrand( CSeq_inst::eStrand_ss );
807         } else if (NStr::EqualNocase(mod->value, "double")) {
808             seq.SetInst().SetStrand( CSeq_inst::eStrand_ds );
809         } else if (NStr::EqualNocase(mod->value, "mixed")) {
810             seq.SetInst().SetStrand( CSeq_inst::eStrand_mixed );
811         } else {
812             x_HandleBadModValue(*mod);
813         }
814     }
815 
816     // comment
817     if ((mod = FindMod(s_Mod_comment)) != NULL) {
818         CRef<CSeqdesc> desc(new CSeqdesc);
819         desc->SetComment( mod->value );
820         seq.SetDescr().Set().push_back(desc);
821     }
822 }
823 
824 
s_AddPrimers(const pair<string,string> & primer_info,CPCRPrimerSet & primer_set)825 static void s_AddPrimers(const pair<string, string>& primer_info, CPCRPrimerSet& primer_set)
826 {
827     vector<string> names;
828     NStr::Split(primer_info.first, ":", names, NStr::fSplit_Tokenize);
829     vector<string> seqs;
830     NStr::Split(primer_info.second, ":", seqs, NStr::fSplit_Tokenize);
831 
832     const auto num_names = names.size();
833     const auto num_seqs = seqs.size();
834     const auto num_primers = max(num_names, num_seqs);
835 
836     for(size_t i=0; i<num_primers; ++i) {
837         auto primer = Ref(new CPCRPrimer());
838 
839         if (i<num_names && !NStr::IsBlank(names[i])) {
840             primer->SetName().Set(names[i]);
841         }
842         if (i<num_seqs && !NStr::IsBlank(seqs[i])) {
843             primer->SetSeq().Set(seqs[i]);
844         }
845         primer_set.Set().push_back(primer);
846     }
847 }
848 
849 
s_GetPrimerInfo(const CSourceModParser::SMod * pNamesMod,const CSourceModParser::SMod * pSeqsMod,vector<pair<string,string>> & reaction_info)850 static void s_GetPrimerInfo(const CSourceModParser::SMod* pNamesMod,
851                             const CSourceModParser::SMod* pSeqsMod,
852                             vector<pair<string, string>>& reaction_info)
853 {
854     reaction_info.clear();
855     vector<string> names;
856     if (pNamesMod) {
857         NStr::Split(pNamesMod->value, ",", names, NStr::fSplit_Tokenize);
858     }
859 
860     vector<string> seqs;
861     if (pSeqsMod) {
862         NStr::Split(pSeqsMod->value, ",", seqs, NStr::fSplit_Tokenize);
863         if (seqs.size()>1) {
864             if (seqs.front().front() == '(') {
865                 seqs.front().erase(0,1);
866             }
867             if (seqs.back().back() == ')') {
868                 seqs.back().erase(seqs.back().size()-1, 1);
869             }
870         }
871     }
872 
873     const auto num_names = names.size();
874     const auto num_seqs = seqs.size();
875     const auto num_reactions = max(num_names, num_seqs);
876 
877     for (int i=0; i<num_reactions; ++i) {
878         const string name = (i<num_names) ? names[i] : "";
879         const string seq  = (i<num_seqs) ? seqs[i] : "";
880         reaction_info.push_back(make_pair(name, seq));
881     }
882 }
883 
884 
x_AddPCRPrimers(CAutoInitRef<CPCRReactionSet> & pcr_reaction_set)885 void CSourceModParser::x_AddPCRPrimers(CAutoInitRef<CPCRReactionSet>& pcr_reaction_set)
886 {
887     using TNameSeqPair = pair<string, string>;
888 
889     const SMod* pNameMod = nullptr;
890     const SMod* pSeqMod = nullptr;
891 
892     pNameMod = FindMod(s_Mod_fwd_primer_name, s_Mod_fwd_pcr_primer_name);
893     pSeqMod = FindMod(s_Mod_fwd_primer_seq, s_Mod_fwd_pcr_primer_seq);
894     vector<TNameSeqPair> fwd_primer_info;
895     s_GetPrimerInfo(pNameMod, pSeqMod, fwd_primer_info);
896 
897 
898     pNameMod = FindMod(s_Mod_rev_primer_name, s_Mod_rev_pcr_primer_name);
899     pSeqMod = FindMod(s_Mod_rev_primer_seq, s_Mod_rev_pcr_primer_seq);
900     vector<TNameSeqPair> rev_primer_info;
901     s_GetPrimerInfo(pNameMod, pSeqMod, rev_primer_info);
902 
903     if (fwd_primer_info.empty() &&
904         rev_primer_info.empty()) {
905         return;
906     }
907 
908     auto num_fwd_primer_info = fwd_primer_info.size();
909     auto num_rev_primer_info = rev_primer_info.size();
910 
911     if (num_fwd_primer_info == num_rev_primer_info) {
912         for (auto i=0; i<num_fwd_primer_info; ++i) {
913             CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
914             s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
915             s_AddPrimers(rev_primer_info[i], pcr_reaction->SetReverse());
916             pcr_reaction_set->Set().push_back(pcr_reaction);
917         }
918     }
919     else
920     if (num_fwd_primer_info > num_rev_primer_info) {
921         auto diff = num_fwd_primer_info - num_rev_primer_info;
922         for (int i=0; i<diff; ++i) {
923             CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
924             s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
925             pcr_reaction_set->Set().push_back(pcr_reaction);
926         }
927 
928         for (int i=diff; i<num_fwd_primer_info; ++i) {
929             CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
930             s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
931             s_AddPrimers(rev_primer_info[i-diff], pcr_reaction->SetReverse());
932             pcr_reaction_set->Set().push_back(pcr_reaction);
933         }
934     }
935     else
936     if (num_fwd_primer_info < num_rev_primer_info) {
937         for (int i=0; i<num_fwd_primer_info; ++i) {
938             CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
939             s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
940             s_AddPrimers(rev_primer_info[i], pcr_reaction->SetReverse());
941             pcr_reaction_set->Set().push_back(pcr_reaction);
942         }
943 
944         for (int i=num_fwd_primer_info; i<num_rev_primer_info; ++i) {
945             CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
946             s_AddPrimers(rev_primer_info[i], pcr_reaction->SetReverse());
947             pcr_reaction_set->Set().push_back(pcr_reaction);
948         }
949     }
950 }
951 
952 
x_ApplyMods(CAutoInitDesc<CBioSource> & bsrc,CTempString organism)953 void CSourceModParser::x_ApplyMods(CAutoInitDesc<CBioSource>& bsrc,
954                                    CTempString organism)
955 {
956     const SMod* mod = NULL;
957     bool reset_taxid = false;
958 
959     // org[anism]
960     if (organism.empty())
961     {
962         if ((mod = FindMod(s_Mod_organism, s_Mod_org)) != NULL) {
963             organism = mod->value;
964         }
965         else
966         if ((mod = FindMod(s_Mod_taxname)) != NULL) {
967             organism = mod->value;
968         }
969     }
970 
971     if ( !organism.empty())
972     {
973         if (!(bsrc->GetOrg().IsSetTaxname() && NStr::EqualNocase(bsrc->GetOrg().GetTaxname(), organism)))
974         {
975             if (bsrc->GetOrg().IsSetTaxname())
976             {
977                 bsrc->ResetOrg();
978 //                bsrc->ResetSubtype();
979             }
980             bsrc->SetOrg().SetTaxname(organism);
981             reset_taxid = true;
982         }
983     }
984 
985     // location
986     if ((mod = FindMod(s_Mod_location)) != NULL) {
987         if (NStr::EqualNocase(mod->value, "mitochondrial")) {
988             bsrc->SetGenome(CBioSource::eGenome_mitochondrion);
989         } else if (NStr::EqualNocase(mod->value, "provirus")) {
990             bsrc->SetGenome(CBioSource::eGenome_proviral);
991         } else if (NStr::EqualNocase(mod->value, "extrachromosomal")) {
992             bsrc->SetGenome(CBioSource::eGenome_extrachrom);
993         } else if (NStr::EqualNocase(mod->value, "insertion sequence")) {
994             bsrc->SetGenome(CBioSource::eGenome_insertion_seq);
995         } else {
996             try {
997                 bsrc->SetGenome(CBioSource::GetTypeInfo_enum_EGenome()
998                                 ->FindValue(mod->value));
999             } catch (CSerialException&) {
1000                 x_HandleBadModValue(*mod);
1001             }
1002         }
1003     }
1004 
1005     // origin
1006     if ((mod = FindMod(s_Mod_origin)) != NULL) {
1007         try {
1008             // also check for special cases that don't match the enum name
1009             if( NStr::EqualNocase(mod->value, "natural mutant") ) {
1010                 bsrc->SetOrigin( CBioSource::eOrigin_natmut );
1011             } else if( NStr::EqualNocase(mod->value, "mutant") ) {
1012                 bsrc->SetOrigin( CBioSource::eOrigin_mut );
1013             } else {
1014                 bsrc->SetOrigin(CBioSource::GetTypeInfo_enum_EOrigin()
1015                             ->FindValue(mod->value));
1016             }
1017         } catch (CSerialException&) {
1018             x_HandleBadModValue(*mod);
1019         }
1020     }
1021 
1022     // handle orgmods
1023     for(const auto & smod_orgsubtype : kSModOrgSubtypeMap.Get()) {
1024         const SMod & smod = smod_orgsubtype.first;
1025         const COrgMod::ESubtype e_subtype = smod_orgsubtype.second;
1026         if ((mod = FindMod(smod.key)) != NULL) {
1027             CRef<COrgMod> org_mod(new COrgMod);
1028             org_mod->SetSubtype(e_subtype);
1029             org_mod->SetSubname(mod->value);
1030             bsrc->SetOrg().SetOrgname().SetMod().push_back(org_mod);
1031             reset_taxid = true;
1032         }
1033     }
1034 
1035     // handle subsources
1036     for( const auto & smod_subsrcsubtype : kSModSubSrcSubtypeMap.Get() ) {
1037         const SMod & smod = smod_subsrcsubtype.first;
1038         const CSubSource::ESubtype e_subtype = smod_subsrcsubtype.second;
1039         if ((mod = FindMod(smod.key)) != NULL) {
1040             auto& subtype = bsrc->SetSubtype();
1041             CRef<CSubSource> subsource(new CSubSource);
1042             subsource->SetSubtype(e_subtype);
1043 
1044             if( CSubSource::NeedsNoText(e_subtype) ) {
1045                 subsource->SetName(kEmptyStr);
1046             } else {
1047                 subsource->SetName(mod->value);
1048             }
1049 
1050             if (!CSubSource::IsMultipleValuesAllowed(e_subtype))
1051             {
1052                 // since only one of this e_subtype is allowed, we erase any
1053                 // that are already in the subtype list.
1054                 // (Unfortunately, we cannot just use bsrc->RemoveSubSource
1055                 // because it will ResetSubtype if subtype ends up empty)
1056                 subtype.erase(
1057                     remove_if(subtype.begin(), subtype.end(),
1058                               equal_subtype(e_subtype)),
1059                     subtype.end());
1060             }
1061 
1062             subtype.push_back(subsource);
1063         }
1064     }
1065 
1066     // handle PCR Primers
1067     {{
1068         CAutoInitRef<CPCRReactionSet> pcr_reaction_set;
1069         x_AddPCRPrimers(pcr_reaction_set);
1070         if (pcr_reaction_set.IsInitialized()) {
1071             if (!bsrc->IsSetPcr_primers()) {
1072                 bsrc->SetPcr_primers(*pcr_reaction_set);
1073             }
1074             else {
1075                 bsrc->SetPcr_primers().Set().splice(
1076                         bsrc->SetPcr_primers().Set().end(),
1077                         pcr_reaction_set->Set());
1078             }
1079         }
1080      }}
1081 
1082 
1083     // db_xref
1084     TModsRange db_xref_mods_range = FindAllMods( s_Mod_db_xref, s_Mod_dbxref );
1085     for( TModsCI db_xref_iter = db_xref_mods_range.first;
1086             db_xref_iter != db_xref_mods_range.second;
1087             ++db_xref_iter ) {
1088         CRef< CDbtag > new_db( new CDbtag );
1089 
1090         const CTempString db_xref_str = db_xref_iter->value;
1091         CRef<CObject_id> object_id(new CObject_id);
1092 
1093         size_t colon_location = db_xref_str.find(":");
1094         if (colon_location == string::npos) {
1095             // no colon: it's just tag, and db is unknown
1096             new_db->SetDb() = "?";
1097             db_xref_str.Copy(object_id->SetStr(), 0, CTempString::npos);
1098         } else {
1099             // there's a colon, so db and tag are both known
1100             db_xref_str.Copy(new_db->SetDb(), 0, colon_location);
1101             db_xref_str.Copy(object_id->SetStr(), colon_location + 1, CTempString::npos);
1102         }
1103 
1104         new_db->SetTag( *object_id );
1105 
1106         bsrc->SetOrg().SetDb().push_back( new_db );
1107     }
1108 
1109     // div[ision]
1110     if ((mod = FindMod(s_Mod_division, s_Mod_div)) != NULL) {
1111         bsrc->SetOrg().SetOrgname().SetDiv( mod->value );
1112     }
1113 
1114     // lineage
1115     if ((mod = FindMod(s_Mod_lineage)) != NULL) {
1116         bsrc->SetOrg().SetOrgname().SetLineage( mod->value );
1117     }
1118 
1119     // gcode
1120     if ((mod = FindMod(s_Mod_gcode)) != NULL) {
1121         bsrc->SetOrg().SetOrgname().SetGcode( NStr::StringToInt(mod->value, NStr::fConvErr_NoThrow) );
1122     }
1123 
1124     // mgcode
1125     if ((mod = FindMod(s_Mod_mgcode)) != NULL) {
1126         bsrc->SetOrg().SetOrgname().SetMgcode( NStr::StringToInt(mod->value, NStr::fConvErr_NoThrow) );
1127     }
1128 
1129     // pgcode
1130     if ((mod = FindMod(s_Mod_pgcode)) != NULL) {
1131         bsrc->SetOrg().SetOrgname().SetPgcode( NStr::StringToInt(mod->value, NStr::fConvErr_NoThrow) );
1132     }
1133 
1134     // note[s]
1135     TModsRange mods[2];
1136     mods[0] = FindAllMods(s_Mod_note);
1137     mods[1] = FindAllMods(s_Mod_notes);
1138     for (size_t i = 0; i < 2; i++)
1139     {
1140         for (TModsCI it = mods[i].first; it != mods[i].second; it++)
1141         {
1142             CRef< CSubSource > new_subsource(new CSubSource);
1143             new_subsource->SetSubtype(CSubSource::eSubtype_other);
1144             new_subsource->SetName(it->value);
1145             bsrc->SetSubtype().push_back(new_subsource);
1146         }
1147     }
1148 
1149     // focus
1150     if ((mod = FindMod(s_Mod_focus)) != NULL) {
1151         if( NStr::EqualNocase( mod->value, "TRUE" ) ) {
1152             bsrc->SetIs_focus();
1153         }
1154     }
1155 
1156 
1157     if ((mod = FindMod(s_Mod_taxid)) != NULL) {
1158         bsrc->SetOrg().SetTaxId( NStr::StringToNumeric<TTaxId>(mod->value, NStr::fConvErr_NoThrow) );
1159     }
1160     else
1161     if (reset_taxid && bsrc->IsSetOrgname() && bsrc->GetOrg().GetTaxId() != ZERO_TAX_ID) {
1162        bsrc->SetOrg().SetTaxId(ZERO_TAX_ID);
1163     }
1164 }
1165 
1166 typedef SStaticPair<const char*, CMolInfo::TTech> TTechMapEntry;
1167 static const TTechMapEntry sc_TechArray[] = {
1168     { "?",                  CMolInfo::eTech_unknown },
1169     { "barcode",            CMolInfo::eTech_barcode },
1170     { "both",               CMolInfo::eTech_both },
1171     { "composite-wgs-htgs", CMolInfo::eTech_composite_wgs_htgs },
1172     { "concept-trans",      CMolInfo::eTech_concept_trans },
1173     { "concept-trans-a",    CMolInfo::eTech_concept_trans_a },
1174     { "derived",            CMolInfo::eTech_derived },
1175     { "EST",                CMolInfo::eTech_est },
1176     { "fli cDNA",           CMolInfo::eTech_fli_cdna },
1177     { "genetic map",        CMolInfo::eTech_genemap },
1178     { "htc",                CMolInfo::eTech_htc },
1179     { "htgs 0",             CMolInfo::eTech_htgs_0 },
1180     { "htgs 1",             CMolInfo::eTech_htgs_1 },
1181     { "htgs 2",             CMolInfo::eTech_htgs_2 },
1182     { "htgs 3",             CMolInfo::eTech_htgs_3 },
1183     { "physical map",       CMolInfo::eTech_physmap },
1184     { "seq-pept",           CMolInfo::eTech_seq_pept },
1185     { "seq-pept-homol",     CMolInfo::eTech_seq_pept_homol },
1186     { "seq-pept-overlap",   CMolInfo::eTech_seq_pept_overlap },
1187     { "standard",           CMolInfo::eTech_standard },
1188     { "STS",                CMolInfo::eTech_sts },
1189     { "survey",             CMolInfo::eTech_survey },
1190     { "targeted",           CMolInfo::eTech_targeted },
1191     { "tsa",                CMolInfo::eTech_tsa },
1192     { "wgs",                CMolInfo::eTech_wgs }
1193 };
1194 typedef CStaticPairArrayMap<const char*, CMolInfo::TTech,
1195 CSourceModParser::PKeyCompare>  TTechMap;
1196 DEFINE_STATIC_ARRAY_MAP(TTechMap, sc_TechMap, sc_TechArray);
1197 
1198 typedef SStaticPair<const char*, CMolInfo::TCompleteness> TCompletenessMapEntry;
1199 static const TCompletenessMapEntry sc_CompletenessArray[] = {
1200     { "complete",  CMolInfo::eCompleteness_complete  },
1201     { "has-left",  CMolInfo::eCompleteness_has_left  },
1202     { "has-right", CMolInfo::eCompleteness_has_right  },
1203     { "no-ends",   CMolInfo::eCompleteness_no_ends  },
1204     { "no-left",   CMolInfo::eCompleteness_no_left  },
1205     { "no-right",  CMolInfo::eCompleteness_no_right  },
1206     { "partial",   CMolInfo::eCompleteness_partial  }
1207 };
1208 typedef CStaticPairArrayMap<const char*, CMolInfo::TCompleteness,
1209 CSourceModParser::PKeyCompare>  TCompletenessMap;
1210 DEFINE_STATIC_ARRAY_MAP(TCompletenessMap, sc_CompletenessMap, sc_CompletenessArray);
1211 
x_ApplyMods(CAutoInitDesc<CMolInfo> & mi)1212 void CSourceModParser::x_ApplyMods(CAutoInitDesc<CMolInfo>& mi)
1213 {
1214     const SMod* mod = NULL;
1215 
1216     // mol[-]type
1217     if ((mod = FindMod(s_Mod_moltype, s_Mod_mol_type)) != NULL) {
1218         TBiomolMap::const_iterator it = sc_BiomolMap.find(mod->value.c_str());
1219         if (it == sc_BiomolMap.end()) {
1220             // construct the possible bad values by hand
1221             x_HandleBadModValue(*mod);
1222         } else {
1223             // moltype sets biomol and inst.mol
1224             mi->SetBiomol(it->second.m_eBiomol);
1225         }
1226     }
1227 
1228     // tech
1229     if ((mod = FindMod(s_Mod_tech)) != NULL) {
1230         TTechMap::const_iterator it = sc_TechMap.find(mod->value.c_str());
1231         if (it == sc_TechMap.end()) {
1232             x_HandleBadModValue(*mod);
1233         } else {
1234             mi->SetTech(it->second);
1235         }
1236     }
1237 
1238     // complete[d]ness
1239     if ((mod = FindMod(s_Mod_completeness, s_Mod_completedness)) != NULL) {
1240         TTechMap::const_iterator it = sc_CompletenessMap.find(mod->value.c_str());
1241         if (it == sc_CompletenessMap.end()) {
1242             x_HandleBadModValue(*mod);
1243         } else {
1244             mi->SetCompleteness(it->second);
1245         }
1246     }
1247 }
1248 
x_ApplyMods(CAutoInitRef<CGene_ref> & gene)1249 void CSourceModParser::x_ApplyMods(CAutoInitRef<CGene_ref>& gene)
1250 {
1251     const SMod* mod = NULL;
1252 
1253     // gene
1254     if ((mod = FindMod(s_Mod_gene)) != NULL) {
1255         gene->SetLocus(mod->value);
1256     }
1257 
1258     // allele
1259     if ((mod = FindMod(s_Mod_allele)) != NULL) {
1260         gene->SetAllele( mod->value );
1261     }
1262 
1263     // gene_syn[onym]
1264     if ((mod = FindMod(s_Mod_gene_syn, s_Mod_gene_synonym)) != NULL) {
1265         gene->SetSyn().push_back( mod->value );
1266     }
1267 
1268     // locus_tag
1269     if ((mod = FindMod(s_Mod_locus_tag)) != NULL) {
1270         gene->SetLocus_tag( mod->value );
1271     }
1272 }
1273 
1274 
x_ApplyMods(CAutoInitRef<CProt_ref> & prot)1275 void CSourceModParser::x_ApplyMods(CAutoInitRef<CProt_ref>& prot)
1276 {
1277     const SMod* mod = NULL;
1278 
1279     // prot[ein]
1280     if ((mod = FindMod(s_Mod_protein, s_Mod_prot)) != NULL) {
1281         prot->SetName().push_back(mod->value);
1282     }
1283 
1284     // prot[ein]_desc
1285     if ((mod = FindMod(s_Mod_prot_desc, s_Mod_protein_desc)) != NULL) {
1286         prot->SetDesc( mod->value );
1287     }
1288 
1289     // EC_number
1290     if ((mod = FindMod(s_Mod_EC_number)) != NULL) {
1291         prot->SetEc().push_back( mod->value );
1292     }
1293 
1294     // activity/function
1295     if ((mod = FindMod(s_Mod_activity, s_Mod_function)) != NULL) {
1296         prot->SetActivity().push_back( mod->value );
1297     }
1298 }
1299 
1300 
x_ApplyMods(CAutoInitDesc<CGB_block> & gbb)1301 void CSourceModParser::x_ApplyMods(CAutoInitDesc<CGB_block>& gbb)
1302 {
1303     const SMod* mod = NULL;
1304 
1305     // secondary-accession[s]
1306     if ((mod = FindMod(s_Mod_secondary_accession,
1307                        s_Mod_secondary_accessions)) != NULL)
1308     {
1309         list<CTempString> ranges;
1310         NStr::Split(mod->value, ",", ranges, NStr::fSplit_MergeDelimiters);
1311         ITERATE (list<CTempString>, it, ranges) {
1312             string s = NStr::TruncateSpaces_Unsafe(*it);
1313             try {
1314                 SSeqIdRange range(s);
1315                 ITERATE (SSeqIdRange, it2, range) {
1316                     gbb->SetExtra_accessions().push_back(*it2);
1317                 }
1318             } catch (CSeqIdException&) {
1319                 gbb->SetExtra_accessions().push_back(s);
1320             }
1321         }
1322     }
1323 
1324     // keyword[s]
1325     if ((mod = FindMod(s_Mod_keyword, s_Mod_keywords)) != NULL) {
1326         list<string> keywordList;
1327         NStr::Split(mod->value, ",;", keywordList, NStr::fSplit_MergeDelimiters);
1328         // trim every string and push it into the real keyword list
1329         NON_CONST_ITERATE( list<string>, keyword_iter, keywordList ) {
1330             NStr::TruncateSpacesInPlace( *keyword_iter );
1331             gbb->SetKeywords().push_back( *keyword_iter );
1332         }
1333     }
1334 }
1335 
1336 
x_ApplyMods(CAutoInitRef<CSeq_hist> & hist)1337 void CSourceModParser::x_ApplyMods(CAutoInitRef<CSeq_hist>& hist)
1338 {
1339     const SMod* mod = NULL;
1340 
1341     // secondary-accession[s]
1342     if ((mod = FindMod(s_Mod_secondary_accession,
1343                        s_Mod_secondary_accessions)) != NULL)
1344     {
1345         list<CTempString> ranges;
1346         NStr::Split(mod->value, ",", ranges, NStr::fSplit_MergeDelimiters);
1347         ITERATE (list<CTempString>, it, ranges) {
1348             string s = NStr::TruncateSpaces_Unsafe(*it);
1349             try {
1350                 SSeqIdRange range(s);
1351                 ITERATE (SSeqIdRange, it2, range) {
1352                     hist->SetReplaces().SetIds().push_back(it2.GetID());
1353                 }
1354             } catch (CSeqIdException&) {
1355                 NStr::ReplaceInPlace(s, "ref_seq|", "ref|", 0, 1);
1356                 hist->SetReplaces().SetIds()
1357                     .push_back(CRef<CSeq_id>(new CSeq_id(s)));
1358             }
1359         }
1360     }
1361 }
1362 
1363 // Note: It's untested.
1364 //
1365 // This code is currently unused, but I'm leaving it here in case
1366 // at some point in the future someone decides that we do want it.
1367 //
1368 // We're not using this because it would introduce a whole new
1369 // dependency just for a single keyword.
1370 //
1371 //void CSourceModParser::x_ApplyMods(CAutoInitRef<CSubmit_block>& sb) {
1372 //
1373 //    // hup
1374 //    if ((mod = FindMod("hup")) != NULL) {
1375 //        sb->SetHup( false );
1376 //        sb->ResetReldate();
1377 //        if( ! mod->value.empty() ) {
1378 //            if( NStr::EqualNocase( mod->value, "y" ) ) {
1379 //                sb->SetHup( true );
1380 //                // by default, release in a year
1381 //                CDate releaseDate( CTime(CTime::eCurrent) );
1382 //                _ASSERT(releaseDate.IsStd());
1383 //                releaseDate.GetStd().SetYear( releaseDate.GetStd().GetYear() + 1 );
1384 //                sb->SetReldate( releaseDate );
1385 //            } else {
1386 //                // parse string as "m/d/y" (or with "-" instead of "/" )
1387 //                try {
1388 //                    CTime hupTime( NStr::Replace( mod->value, "-", "/" ), "M/D/Y" );
1389 //                    sb->SetReldate( CDate(hupTime) );
1390 //                    sb->SetHup( true );
1391 //                } catch( const CException & e) {
1392 //                    // couldn't parse date
1393 //                    x_HandleBadModValue(*mod);
1394 //                }
1395 //            }
1396 //        }
1397 //    }
1398 //}
1399 
1400 
1401 static
s_PopulateUserObject(CUser_object & uo,const string & type,CUser_object::TData & data)1402 void s_PopulateUserObject(CUser_object& uo, const string& type,
1403                           CUser_object::TData& data)
1404 {
1405     if (uo.GetType().Which() == CObject_id::e_not_set) {
1406         uo.SetType().SetStr(type);
1407     } else if ( !uo.GetType().IsStr()  ||  uo.GetType().GetStr() != type) {
1408         // warn first?
1409         return;
1410     }
1411 
1412     swap(uo.SetData(), data);
1413 }
1414 
1415 
x_ApplyTPAMods(CAutoInitRef<CUser_object> & tpa)1416 void CSourceModParser::x_ApplyTPAMods(CAutoInitRef<CUser_object>& tpa)
1417 {
1418     const SMod* mod = NULL;
1419 
1420     // primary[-accessions]
1421     if ((mod = FindMod(s_Mod_primary, s_Mod_primary_accessions)) != NULL) {
1422         CUser_object::TData data;
1423         list<CTempString> accns;
1424         NStr::Split(mod->value, ",", accns, NStr::fSplit_MergeDelimiters);
1425         ITERATE (list<CTempString>, it, accns) {
1426             CRef<CUser_field> field(new CUser_field), subfield(new CUser_field);
1427             field->SetLabel().SetId(0);
1428             subfield->SetLabel().SetStr("accession");
1429             subfield->SetData().SetStr(CUtf8::AsUTF8(*it, eEncoding_UTF8));
1430             field->SetData().SetFields().push_back(subfield);
1431             data.push_back(field);
1432         }
1433 
1434         if ( !data.empty() ) {
1435             s_PopulateUserObject(*tpa, "TpaAssembly", data);
1436         }
1437     }
1438 }
1439 
1440 
s_SetDBLinkDesc(CBioseq & bioseq)1441 static CRef<CSeqdesc> s_SetDBLinkDesc(CBioseq& bioseq)
1442 {
1443     CConstRef<CBioseq_set> pParentSet = bioseq.GetParentSet();
1444     CSeq_descr& descriptors = (pParentSet &&
1445                                pParentSet->GetClass() == CBioseq_set::eClass_nuc_prot) ?
1446 
1447         (const_cast<CBioseq_set&>(*pParentSet)).SetDescr() :
1448         bioseq.SetDescr();
1449 
1450 
1451     for (auto pDesc : descriptors.Set()) {
1452         if (pDesc->IsUser() && pDesc->GetUser().IsDBLink()) {
1453             return pDesc;
1454         }
1455     }
1456 
1457     auto pDBLinkDesc = Ref(new CSeqdesc());
1458     pDBLinkDesc->SetUser().SetObjectType(CUser_object::eObjectType_DBLink);
1459     descriptors.Set().push_back(pDBLinkDesc);
1460     return pDBLinkDesc;
1461 }
1462 
1463 
s_SetDBLinkFieldVals(const string & label,const list<CTempString> & vals,CSeqdesc & dblink_desc)1464 static void s_SetDBLinkFieldVals(const string& label,
1465                                 const list<CTempString>& vals,
1466                                 CSeqdesc& dblink_desc)
1467 {
1468     if (vals.empty()) {
1469         return;
1470     }
1471 
1472     auto& user_obj = dblink_desc.SetUser();
1473     CRef<CUser_field> pField;
1474     if (user_obj.IsSetData()) {
1475         for (auto pUserField : user_obj.SetData()) {
1476             if (pUserField->IsSetLabel() &&
1477                 pUserField->GetLabel().IsStr() &&
1478                 NStr::EqualNocase(pUserField->GetLabel().GetStr(), label)) {
1479                 pField = pUserField;
1480                 break;
1481             }
1482         }
1483     }
1484 
1485     if (!pField) {
1486         pField = Ref(new CUser_field());
1487         pField->SetLabel().SetStr() = label;
1488         user_obj.SetData().push_back(pField);
1489     }
1490 
1491     pField->SetData().SetStrs().clear(); // RW-518 - clear any preexisting entries
1492     for (const auto& val : vals) {
1493         pField->SetData().SetStrs().push_back(val);
1494     }
1495     pField->SetNum(pField->GetData().GetStrs().size());
1496 }
1497 
1498 
s_SetDBLinkField(const string & label,const string & vals,CRef<CSeqdesc> & pDBLinkDesc,CBioseq & bioseq)1499 static void s_SetDBLinkField(const string& label,
1500                              const string& vals,
1501                              CRef<CSeqdesc>& pDBLinkDesc,
1502                              CBioseq& bioseq)
1503 {
1504     list<CTempString> value_list;
1505     NStr::Split(vals, ",", value_list, NStr::fSplit_MergeDelimiters);
1506     for (auto& val : value_list) {
1507         val = NStr::TruncateSpaces_Unsafe(val);
1508     }
1509     value_list.remove_if([](const CTempString& val){ return val.empty(); });
1510     if (value_list.empty()) { // nothing to do
1511         return;
1512     }
1513 
1514     if (!pDBLinkDesc) {
1515         pDBLinkDesc =  s_SetDBLinkDesc(bioseq);
1516     }
1517 
1518     s_SetDBLinkFieldVals(label,
1519                          value_list,
1520                          *pDBLinkDesc);
1521 }
1522 
1523 
x_ApplyDBLinkMods(CBioseq & bioseq)1524 void CSourceModParser::x_ApplyDBLinkMods(CBioseq& bioseq)
1525 {
1526     CRef<CSeqdesc> pDBLinkDesc;
1527     const SMod* mod = NULL;
1528     if ((mod = FindMod(s_Mod_SRA)) != NULL) {
1529         s_SetDBLinkField("Sequence Read Archive", mod->value, pDBLinkDesc, bioseq);
1530     }
1531 
1532     if ((mod = FindMod(s_Mod_bioproject)) != NULL) {
1533         s_SetDBLinkField("BioProject", mod->value, pDBLinkDesc, bioseq);
1534     }
1535 
1536     if ((mod = FindMod(s_Mod_biosample)) != NULL) {
1537         s_SetDBLinkField("BioSample", mod->value, pDBLinkDesc, bioseq);
1538     }
1539 }
1540 
1541 
1542 
1543 void
x_ApplyGenomeProjectsDBMods(CAutoInitRef<CUser_object> & gpdb)1544 CSourceModParser::x_ApplyGenomeProjectsDBMods(CAutoInitRef<CUser_object>& gpdb)
1545 {
1546     const SMod* mod = NULL;
1547 
1548     // project[s]
1549     if ((mod = FindMod(s_Mod_project, s_Mod_projects)) != NULL) {
1550         CUser_object::TData data;
1551         list<CTempString> ids;
1552         NStr::Split(mod->value, ",;", ids, NStr::fSplit_MergeDelimiters);
1553         ITERATE (list<CTempString>, it, ids) {
1554             unsigned int id = NStr::StringToUInt(*it, NStr::fConvErr_NoThrow);
1555             if (id > 0) {
1556                 CRef<CUser_field> field(new CUser_field),
1557                                subfield(new CUser_field);
1558                 field->SetLabel().SetId(0);
1559                 subfield->SetLabel().SetStr("ProjectID");
1560                 subfield->SetData().SetInt(id);
1561                 field->SetData().SetFields().push_back(subfield);
1562                 subfield.Reset(new CUser_field);
1563                 subfield->SetLabel().SetStr("ParentID");
1564                 subfield->SetData().SetInt(0);
1565                 field->SetData().SetFields().push_back(subfield);
1566                 data.push_back(field);
1567             }
1568         }
1569 
1570         if ( !data.empty() ) {
1571             s_PopulateUserObject(*gpdb, "GenomeProjectsDB", data);
1572         }
1573     }
1574 }
1575 
1576 
1577 static
s_ApplyPubMods(CBioseq & bioseq,const CSourceModParser::TModsRange & range)1578 void s_ApplyPubMods(CBioseq& bioseq, const CSourceModParser::TModsRange& range)
1579 {
1580     for (CSourceModParser::TModsCI it = range.first;
1581          it != range.second;  ++it) {
1582         TEntrezId pmid = NStr::StringToNumeric<TEntrezId>(it->value, NStr::fConvErr_NoThrow);
1583         CRef<CPub> pub(new CPub);
1584         pub->SetPmid().Set(pmid);
1585         CRef<CSeqdesc> pubdesc(new CSeqdesc);
1586         pubdesc->SetPub().SetPub().Set().push_back(pub);
1587         bioseq.SetDescr().Set().push_back(pubdesc);
1588     }
1589 }
1590 
1591 
ApplyPubMods(CBioseq & seq)1592 void CSourceModParser::ApplyPubMods(CBioseq& seq)
1593 {
1594     // find PubMed IDs
1595     s_ApplyPubMods(seq, FindAllMods(s_Mod_PubMed));
1596     s_ApplyPubMods(seq, FindAllMods(s_Mod_PMID));
1597 }
1598 
CBadModError(const SMod & badMod,const string & sAllowedValues)1599 CSourceModParser::CBadModError::CBadModError(
1600     const SMod & badMod,
1601     const string & sAllowedValues )
1602     : runtime_error(x_CalculateErrorString(badMod, sAllowedValues)),
1603             m_BadMod(badMod), m_sAllowedValues(sAllowedValues)
1604 {
1605     // no further work required
1606 }
1607 
// Formats the message for a bad modifier value: the sequence id (or
// "UNKNOWN" when the modifier carries none), the offending key and value,
// and the list of accepted values.
string CSourceModParser::CBadModError::x_CalculateErrorString(
            const SMod & badMod,
            const string & sAllowedValues )
{
    const string seqid_label =
        badMod.seqid ? badMod.seqid->AsFastaString() : "UNKNOWN";

    stringstream message;
    message << "Bad modifier value at seqid '" << seqid_label << "'. '";
    message << badMod.key << "' cannot have value '" << badMod.value;
    message << "'.  Accepted values are [" << sAllowedValues << "]";
    return message.str();
}
1619 
CUnkModError(const SMod & unkMod)1620 CSourceModParser::CUnkModError::CUnkModError(
1621     const SMod& unkMod )
1622     : runtime_error(x_CalculateErrorString(unkMod)), m_UnkMod(unkMod)
1623 {
1624 }
1625 
x_CalculateErrorString(const SMod & unkMod)1626 string CSourceModParser::CUnkModError::x_CalculateErrorString(
1627     const SMod& unkMod)
1628 {
1629     stringstream str_strm;
1630     str_strm << "Bad modifier key at seqid '"
1631         << ( unkMod.seqid ? unkMod.seqid->AsFastaString() : "UNKNOWN")
1632         << "'. '" << unkMod.key << "' is not a recognized modifier key";
1633     return str_strm.str();
1634 }
1635 
1636 
GetMods(TWhichMods which) const1637 CSourceModParser::TMods CSourceModParser::GetMods(TWhichMods which) const
1638 {
1639     if (which == fAllMods) {
1640         // if caller gave this they probably should prefer calling GetAllMods
1641         // to avoid the struct copy.
1642         return m_Mods;
1643     } else {
1644         TMods ret;
1645 
1646         ITERATE (TMods, it, m_Mods) {
1647             if (which == (it->used ? fUsedMods : fUnusedMods)) {
1648                 ret.insert(ret.end(), *it);
1649             }
1650         }
1651 
1652         return ret;
1653     }
1654 }
1655 
FindMod(const CTempString & key,const CTempString & alt_key)1656 const CSourceModParser::SMod* CSourceModParser::FindMod(
1657     //const SMod & smod, const SMod & alt_smod)
1658     const CTempString& key, const CTempString& alt_key)
1659 {
1660     // check against m_pModFilter, if any
1661     if( m_pModFilter ) {
1662         if( ! (*m_pModFilter)(key) || ! (*m_pModFilter)(alt_key) ) {
1663             return NULL;
1664         }
1665     }
1666 
1667     SMod mod;
1668 
1669     for (int tries = 0;  tries < 2;  ++tries) {
1670         const CTempString & modkey = ( tries == 0 ? key : alt_key );
1671         if( modkey.empty() ) {
1672             continue;
1673         }
1674         mod.key = modkey;
1675 
1676         TModsCI it = m_Mods.lower_bound(mod);
1677         if (it != m_Mods.end()  &&  EqualKeys(it->key, modkey)) {
1678             // set iterators are const since changing an object could affect
1679             // its order in the set.  However, in this case we know that
1680             // changing the `used` field won't affect the order so we know
1681             // that a const_cast to change it is safe to do.
1682             const_cast<SMod&>(*it).used = true;
1683             return &*it;
1684         }
1685     }
1686 
1687     return NULL;
1688 }
1689 
1690 
1691 CSourceModParser::TModsRange
FindAllMods(const CTempString & key)1692 CSourceModParser::FindAllMods(const CTempString& key)
1693 {
1694     SMod smod(key);
1695     return FindAllMods(smod);
1696 }
1697 
1698 CSourceModParser::TModsRange
FindAllMods(const CTempString & key,const CTempString & alt_key)1699 CSourceModParser::FindAllMods(const CTempString& key, const CTempString& alt_key)
1700 {
1701     SMod smod(key);
1702     SMod alt_smod(alt_key);
1703     return FindAllMods(smod, alt_smod);
1704 }
1705 
1706 CSourceModParser::TModsRange
FindAllMods(const SMod & smod,const SMod & alt_smod)1707 CSourceModParser::FindAllMods(const SMod & smod, const SMod & alt_smod)
1708 {
1709     TModsRange r;
1710     r.first = m_Mods.lower_bound(smod);
1711     if (r.first == m_Mods.end() || !EqualKeys(r.first->key, smod.key)) {
1712         r.first = m_Mods.lower_bound(alt_smod);
1713     }
1714     for (r.second = r.first;
1715          r.second != m_Mods.end()  &&  (EqualKeys(r.second->key, smod.key) || EqualKeys(r.second->key, alt_smod.key));
1716          ++r.second)
1717     {
1718         // set iterators are const since changing an object could affect
1719         // its order in the set.  However, in this case we know that
1720         // changing the `used` field won't affect the order so we know
1721         // that a const_cast to change it is safe to do.
1722         const_cast<SMod&>(*r.second).used = true;
1723     }
1724     return r;
1725 }
1726 
1727 
GetLabel(string * s,TWhichMods which) const1728 void CSourceModParser::GetLabel(string* s, TWhichMods which) const
1729 {
1730     // Possible (flag-conditional?) behavior changes:
1731     // - leave off spaces between modifiers
1732     // - sort by position rather than key
1733     _ASSERT(s != NULL);
1734 
1735     string delim = s->empty() ? kEmptyStr : " ";
1736 
1737     ITERATE (TMods, it, m_Mods) {
1738         if ((which & (it->used ? fUsedMods : fUnusedMods)) != 0) {
1739             *s += delim + '[' + it->key + '=' + it->value + ']';
1740             delim = " ";
1741         }
1742     }
1743 }
1744 
1745 // static
1746 const set<string> &
GetModAllowedValues(const string & mod)1747 CSourceModParser::GetModAllowedValues(const string &mod)
1748 {
1749     // since this has a lock, do NOT grab any other locks
1750     // inside here.
1751     static CMutex mutex;
1752     CMutexGuard guard(mutex);
1753 
1754     typedef map< string, set<string>, CSourceModParser::PKeyCompare> TMapModToValidValues;
1755     static TMapModToValidValues s_mapModToValidValues;
1756 
1757     // see if value is already calculated to try to save time
1758     TMapModToValidValues::const_iterator find_iter =
1759         s_mapModToValidValues.find(mod);
1760     if( find_iter != s_mapModToValidValues.end() ) {
1761         return find_iter->second;
1762     }
1763 
1764     // does canonical comparison, which goes a little beyond case-insensitivity
1765     PKeyEqual key_equal;
1766 
1767     // not cached, so we need to calculate it ourselves
1768     set<string> & set_valid_values = s_mapModToValidValues[mod];
1769     if( key_equal(mod, "topology") || key_equal(mod, "top") ) {
1770         set_valid_values.insert("linear");
1771         set_valid_values.insert("circular");
1772     } else if( key_equal(mod, "molecule") || key_equal(mod, "mol") ) {
1773         set_valid_values.insert("rna");
1774         set_valid_values.insert("dna");
1775     } else if( key_equal(mod, "moltype") || key_equal(mod, "mol-type") ) {
1776         // construct the possible bad values by hand
1777         ITERATE( TBiomolMap, map_iter, sc_BiomolMap ) {
1778             if( map_iter->second.m_eShown == SMolTypeInfo::eShown_Yes ) {
1779                 set_valid_values.insert(map_iter->first);
1780             }
1781         }
1782     } else if( key_equal(mod, "strand") ) {
1783         set_valid_values.insert("single");
1784         set_valid_values.insert("double");
1785         set_valid_values.insert("mixed");
1786     } else if( key_equal(mod, "location") ) {
1787         set_valid_values.insert("mitochondrial");
1788         set_valid_values.insert("provirus");
1789         set_valid_values.insert("extrachromosomal");
1790         set_valid_values.insert("insertion sequence");
1791     } else if( key_equal(mod, "origin") ) {
1792         set_valid_values.insert("natural mutant");
1793         set_valid_values.insert("mutant");
1794         ITERATE( CEnumeratedTypeValues::TValues, enum_iter, CBioSource::GetTypeInfo_enum_EOrigin()->GetValues() ) {
1795             set_valid_values.insert( enum_iter->first );
1796         }
1797     } else if( key_equal(mod, "tech") ) {
1798         ITERATE(TTechMap, tech_it, sc_TechMap) {
1799             set_valid_values.insert(tech_it->first);
1800         }
1801     } else if( key_equal(mod, "completeness") || key_equal(mod, "completedness") ) {
1802         ITERATE( TCompletenessMap, comp_it, sc_CompletenessMap ) {
1803             set_valid_values.insert(comp_it->first);
1804         }
1805     } else {
1806         set_valid_values.insert("ERROR TRYING TO DETERMINE ALLOWED VALUES");
1807     }
1808 
1809     return set_valid_values;
1810 }
1811 
1812 // static
1813 const string &
GetModAllowedValuesAsOneString(const string & mod)1814 CSourceModParser::GetModAllowedValuesAsOneString(const string &mod)
1815 {
1816     // do not grab any other locks while in here (except the lock in
1817     // GetModAllowedValues)
1818     static CMutex mutex;
1819     CMutexGuard guard(mutex);
1820 
1821     typedef map<string, string> TMapModNameToStringOfAllAllowedValues;
1822     static TMapModNameToStringOfAllAllowedValues mapModNameToStringOfAllAllowedValues;
1823 
1824     // see if we've already cached the value
1825     TMapModNameToStringOfAllAllowedValues::const_iterator find_iter =
1826         mapModNameToStringOfAllAllowedValues.find(mod);
1827     if( find_iter != mapModNameToStringOfAllAllowedValues.end() ) {
1828         return find_iter->second;
1829     }
1830 
1831     // not loaded, so we need to calculate it
1832     string & sAllValuesAsOneString =
1833         mapModNameToStringOfAllAllowedValues[mod];
1834     const set<string> & setAllowedValues = GetModAllowedValues(mod);
1835     ITERATE( set<string>, value_it, setAllowedValues ) {
1836         if( ! sAllValuesAsOneString.empty() ) {
1837             sAllValuesAsOneString += ", ";
1838         }
1839         sAllValuesAsOneString += "'" + *value_it + "'";
1840     }
1841 
1842     return sAllValuesAsOneString;
1843 }
1844 
x_HandleBadModValue(const SMod & mod)1845 void CSourceModParser::x_HandleBadModValue(
1846     const SMod& mod)
1847 {
1848     m_BadMods.insert(mod);
1849 
1850     if( eHandleBadMod_Ignore == m_HandleBadMod ) {
1851         return;
1852     }
1853 
1854     const string & sAllAllowedValues = GetModAllowedValuesAsOneString(mod.key);
1855 
1856     CBadModError badModError(mod, sAllAllowedValues);
1857 
1858     switch( m_HandleBadMod ) {
1859     case eHandleBadMod_Throw:
1860         throw badModError;
1861     case eHandleBadMod_PrintToCerr:
1862         cerr << badModError.what() << endl;
1863         break;
1864     case eHandleBadMod_ErrorListener: {
1865         AutoPtr<CObjReaderLineException> pErr(
1866             CObjReaderLineException::Create(
1867                 eDiag_Warning,
1868                 m_LineNumber,
1869                 badModError.what(),
1870                 ILineError::eProblem_GeneralParsingError) );
1871         x_ProcessError(*pErr);
1872         break;
1873     }
1874     default:
1875         _TROUBLE;
1876     }
1877 }
1878 
x_HandleUnkModValue(const SMod & mod)1879 void CSourceModParser::x_HandleUnkModValue(
1880     const SMod& mod)
1881 {
1882     if (m_HandleBadMod == eHandleBadMod_Ignore) {
1883         return;
1884     }
1885     if (m_pModFilter  &&  !m_pModFilter->operator()(mod.key)) {
1886         return;
1887     }
1888     CUnkModError unkModError(mod);
1889 
1890     switch( m_HandleBadMod ) {
1891     case eHandleBadMod_Throw:
1892         throw unkModError;
1893     case eHandleBadMod_PrintToCerr:
1894         cerr << unkModError.what() << endl;
1895         break;
1896     case eHandleBadMod_ErrorListener: {
1897         AutoPtr<CObjReaderLineException> pErr(
1898             CObjReaderLineException::Create(
1899                 eDiag_Warning,
1900                 m_LineNumber,
1901                 unkModError.what(),
1902                 ILineError::eProblem_GeneralParsingError) );
1903         x_ProcessError(*pErr);
1904         break;
1905     }
1906     default:
1907         _TROUBLE;
1908     }
1909 }
1910 
x_ProcessError(CObjReaderLineException & err)1911 void CSourceModParser::x_ProcessError(
1912     CObjReaderLineException& err)
1913 {
1914     if (!m_pErrorListener) {
1915         err.Throw();
1916     }
1917     if (!m_pErrorListener->PutError(err)) {
1918         AutoPtr<CObjReaderLineException> pErr(
1919             CObjReaderLineException::Create(
1920                 eDiag_Critical,
1921                 0,
1922                 "Error allowance exceeded",
1923                 ILineError::eProblem_GeneralParsingError) );
1924         pErr->Throw();
1925     }
1926 }
1927 
ApplyMods(CBioSource & bsrc,CTempString organism)1928 void CSourceModParser::ApplyMods(CBioSource& bsrc, CTempString organism)
1929 {
1930     CAutoInitDesc<CBioSource> ref(bsrc);
1931     x_ApplyMods(ref, organism);
1932 }
1933 
1934 
ApplyMods(CMolInfo & mi)1935 void CSourceModParser::ApplyMods(CMolInfo& mi)
1936 {
1937     CAutoInitDesc<CMolInfo> ref(mi);
1938     x_ApplyMods(ref);
1939 }
1940 
1941 
ApplyMods(CGB_block & gbb)1942 void CSourceModParser::ApplyMods(CGB_block& gbb)
1943 {
1944     CAutoInitDesc<CGB_block> ref(gbb);
1945     x_ApplyMods(ref);
1946 }
1947 
SetAllUnused()1948 void CSourceModParser::SetAllUnused()
1949 {
1950     NON_CONST_ITERATE(TMods, it, m_Mods)
1951     {
1952         // set iterators are const since changing an object could affect
1953         // its order in the set.  However, in this case we know that
1954         // changing the `used` field won't affect the order so we know
1955         // that a const_cast to change it is safe to do.
1956         const_cast<SMod&>(*it).used = false;
1957     }
1958 }
1959 
AddMods(const CTempString & name,const CTempString & value)1960 void CSourceModParser::AddMods(const CTempString& name, const CTempString& value)
1961 {
1962     SMod newmod(NStr::TruncateSpaces_Unsafe(name));
1963     newmod.value = NStr::TruncateSpaces_Unsafe(value);
1964     newmod.used = false;
1965 
1966     m_Mods.insert(newmod);
1967 }
1968 
1969 END_SCOPE(objects)
1970 END_NCBI_SCOPE
1971