1 /* $Id: source_mod_parser.cpp 632526 2021-06-02 17:25:01Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Aaron Ucko, Jonathan Kans, Vasuki Gobi, Michael Kornbluh
27 *
28 * File Description:
29 * Parser for source modifiers, as found in (Sequin-targeted) FASTA files.
30 *
31 * ===========================================================================
32 */
33
34 #include <ncbi_pch.hpp>
35
36 #include <sstream>
37
38 #include <objtools/readers/source_mod_parser.hpp>
39 #include <objtools/readers/message_listener.hpp>
40
41 #include <corelib/ncbiutil.hpp>
42 #include <util/static_map.hpp>
43 #include <serial/enumvalues.hpp>
44
45 #include <objects/general/Dbtag.hpp>
46 #include <objects/general/Object_id.hpp>
47 #include <objects/general/User_field.hpp>
48 #include <objects/misc/sequence_macros.hpp>
49 #include <objects/pub/Pub.hpp>
50 #include <objects/pub/Pub_equiv.hpp>
51 #include <objects/seq/Bioseq.hpp>
52 #include <objects/seq/Pubdesc.hpp>
53 #include <objects/seq/Seq_annot.hpp>
54 #include <objects/seq/Seq_data.hpp>
55 #include <objects/seq/Seq_hist_rec.hpp>
56 #include <objects/seq/Seq_inst.hpp>
57 #include <objects/seq/Seqdesc.hpp>
58 #include <objects/seqfeat/Org_ref.hpp>
59 #include <objects/seqfeat/OrgMod.hpp>
60 #include <objects/seqfeat/OrgName.hpp>
61 #include <objects/seqfeat/PCRReactionSet.hpp>
62 #include <objects/seqfeat/PCRReaction.hpp>
63 #include <objects/seqfeat/PCRPrimer.hpp>
64 #include <objects/seqfeat/PCRPrimerSet.hpp>
65 #include <objects/seqfeat/Seq_feat.hpp>
66 #include <objects/seqfeat/SubSource.hpp>
67 #include <objects/seqloc/Seq_id.hpp>
68 #include <objects/seqloc/Seq_loc.hpp>
69
70 #include <objects/general/User_object.hpp>
71
72 BEGIN_NCBI_SCOPE
73 BEGIN_SCOPE(objects)
74
75 namespace
76 {
77 class equal_subtype
78 {
79 public:
equal_subtype(CSubSource::TSubtype st)80 equal_subtype(CSubSource::TSubtype st) : m_st(st){};
operator ()(const CRef<CSubSource> & st) const81 bool operator()(const CRef<CSubSource>& st) const
82 {
83 return st->IsSetSubtype() && (st->GetSubtype() == m_st);
84 }
85 private:
86 CSubSource::TSubtype m_st;
87 };
88
// STATIC_SMOD is a file-local helper macro; refuse to silently clobber a
// definition that leaked in from some header.
#ifdef STATIC_SMOD
# error "STATIC_SMOD already defined"
#endif

// The macro makes sure that the var's name matches its key.
// Due to kKeyCanonicalizationTable, it's okay to use '_' for '-'
// because it will match both.
//
// For a key K, three constants are declared:
//   s_Mod_s_K - the key text as a char array (the stringized identifier),
//   s_Mod_n_K - its length, not counting the terminating NUL,
//   s_Mod_K   - a CTempString built from the two above; this is the name
//               the rest of the file passes to FindMod().


#define STATIC_SMOD(key_str) \
const char s_Mod_s_##key_str[] = #key_str; \
const size_t s_Mod_n_##key_str = sizeof(#key_str)-1; \
const CTempString s_Mod_##key_str(s_Mod_s_##key_str, s_Mod_n_##key_str)


// For CBioseq
STATIC_SMOD(topology);
STATIC_SMOD(top);
STATIC_SMOD(molecule);
STATIC_SMOD(mol);
STATIC_SMOD(moltype);
STATIC_SMOD(mol_type);
STATIC_SMOD(strand);
STATIC_SMOD(comment);

// For CBioSource
STATIC_SMOD(organism);
STATIC_SMOD(org);
STATIC_SMOD(taxname);
STATIC_SMOD(taxid);
STATIC_SMOD(location);
STATIC_SMOD(origin);
STATIC_SMOD(sub_clone);
STATIC_SMOD(lat_long);
STATIC_SMOD(latitude_longitude);
STATIC_SMOD(fwd_primer_seq);
STATIC_SMOD(fwd_pcr_primer_seq);
STATIC_SMOD(rev_primer_seq);
STATIC_SMOD(rev_pcr_primer_seq);
STATIC_SMOD(fwd_primer_name);
STATIC_SMOD(fwd_pcr_primer_name);
STATIC_SMOD(rev_primer_name);
STATIC_SMOD(rev_pcr_primer_name);
STATIC_SMOD(dbxref);
STATIC_SMOD(db_xref);
STATIC_SMOD(division);
STATIC_SMOD(div);
STATIC_SMOD(lineage);
STATIC_SMOD(gcode);
STATIC_SMOD(mgcode);
STATIC_SMOD(pgcode);
STATIC_SMOD(note);
STATIC_SMOD(notes);
STATIC_SMOD(focus);

// For CMolInfo
STATIC_SMOD(tech);
STATIC_SMOD(completeness);
STATIC_SMOD(completedness);

// For CGene_ref
STATIC_SMOD(gene);
STATIC_SMOD(allele);
STATIC_SMOD(gene_syn);
STATIC_SMOD(gene_synonym);
STATIC_SMOD(locus_tag);

// For CProt_ref
STATIC_SMOD(protein);
STATIC_SMOD(prot);
STATIC_SMOD(prot_desc);
STATIC_SMOD(protein_desc);
STATIC_SMOD(EC_number);
STATIC_SMOD(activity);
STATIC_SMOD(function);

// For CGB_block
STATIC_SMOD(secondary_accession);
STATIC_SMOD(secondary_accessions);
STATIC_SMOD(keyword);
STATIC_SMOD(keywords);

// NOTE(review): presumably consumed by the DBLink handling
// (x_ApplyDBLinkMods), whose body is not in view here -- confirm.
STATIC_SMOD(biosample);
STATIC_SMOD(bioproject);
// For TPA Mods (CUser_object)
STATIC_SMOD(primary);
STATIC_SMOD(primary_accessions);
// For SRA (Sequence Read Archive) CUser_object
STATIC_SMOD(SRA);

// For Genome Project DB Mods (CUser_object)
STATIC_SMOD(project);
STATIC_SMOD(projects);

// For Pub Mods (CSeq_descr)
STATIC_SMOD(PubMed);
STATIC_SMOD(PMID);


// The macro is only needed for the declarations above.
#undef STATIC_SMOD
189
190 typedef set<const char*, CSourceModParser::PKeyCompare> TSModNameSet;
191
192 // Loads up a map of SMod to subtype
193 template<typename TEnum,
194 typename TSModEnumMap = map<CSourceModParser::SMod, TEnum>,
195 typename TEnumNameToValMap = map<string, TEnum>>
196 TSModEnumMap * s_InitSmodToEnumMap(
197 const CEnumeratedTypeValues* etv,
198 // names to skip
199 const TSModNameSet & skip_enum_names,
200 // extra values to add that aren't in the enum
201 const TEnumNameToValMap & extra_enum_names_to_vals )
202 {
203 unique_ptr<TSModEnumMap> smod_enum_map(new TSModEnumMap);
204
205 ITERATE (CEnumeratedTypeValues::TValues, it, etv->GetValues()) {
206 const string & enum_name = it->first;
207 const TEnum enum_val = static_cast<TEnum>(it->second);
208 if( skip_enum_names.find(enum_name.c_str()) !=
209 skip_enum_names.end() )
210 {
211 // skip this tag
212 continue;
213 }
214 auto emplace_result =
215 smod_enum_map->emplace(
216 CSourceModParser::SMod(enum_name), enum_val);
217 // emplace must succeed
218 if( ! emplace_result.second) {
219 NCBI_USER_THROW_FMT(
220 "s_InitSmodToEnumMap " << enum_name);
221 }
222 }
223
224 for(auto extra_smod_to_enum : extra_enum_names_to_vals) {
225 auto emplace_result =
226 smod_enum_map->emplace(
227 CSourceModParser::SMod(extra_smod_to_enum.first),
228 extra_smod_to_enum.second);
229 // emplace must succeed
230 if( ! emplace_result.second) {
231 NCBI_USER_THROW_FMT(
232 "s_InitSmodToEnumMap " << extra_smod_to_enum.first);
233 }
234 }
235
236 return smod_enum_map.release();
237 }
238
239 typedef map<CSourceModParser::SMod, COrgMod::ESubtype> TSModOrgSubtypeMap;
240
s_InitSModOrgSubtypeMap(void)241 TSModOrgSubtypeMap * s_InitSModOrgSubtypeMap(void)
242 {
243 const TSModNameSet kDeprecatedOrgSubtypes{
244 "dosage", "old-lineage", "old-name"};
245 const map<const char*, COrgMod::ESubtype> extra_smod_to_enum_names {
246 { "subspecies", COrgMod::eSubtype_sub_species },
247 { "host", COrgMod::eSubtype_nat_host },
248 { "specific-host", COrgMod::eSubtype_nat_host },
249 };
250
251 return s_InitSmodToEnumMap<COrgMod::ESubtype>(
252 COrgMod::GetTypeInfo_enum_ESubtype(),
253 kDeprecatedOrgSubtypes,
254 extra_smod_to_enum_names
255 );
256 }
257
258 // The subtype SMods are loaded from the names of the enum
259 // and they map to ESubtype enum values so we can't just use STATIC_SMOD
260 CSafeStatic<TSModOrgSubtypeMap> kSModOrgSubtypeMap(s_InitSModOrgSubtypeMap,
261 nullptr);
262
263 typedef map<CSourceModParser::SMod,
264 CSubSource::ESubtype> TSModSubSrcSubtype;
265
s_InitSModSubSrcSubtypeMap(void)266 TSModSubSrcSubtype * s_InitSModSubSrcSubtypeMap(void)
267 {
268 // some are skipped because they're handled specially and some are
269 // skipped because they're deprecated
270 TSModNameSet skip_enum_names {
271 // skip because handled specially elsewhere
272 "fwd_primer_seq", "rev_primer_seq",
273 "fwd_primer_name", "rev_primer_name",
274 "fwd_PCR_primer_seq", "rev_PCR_primer_seq",
275 "fwd_PCR_primer_name", "rev_PCR_primer_name",
276 // skip because deprecated
277 "transposon_name",
278 "plastid_name",
279 "insertion_seq_name",
280 };
281 const map<string, CSubSource::ESubtype> extra_smod_to_enum_names {
282 { "sub-clone", CSubSource::eSubtype_subclone },
283 { "lat-long", CSubSource::eSubtype_lat_lon },
284 { "latitude-longitude", CSubSource::eSubtype_lat_lon },
285 };
286 return s_InitSmodToEnumMap<CSubSource::ESubtype>(
287 CSubSource::GetTypeInfo_enum_ESubtype(),
288 skip_enum_names,
289 extra_smod_to_enum_names );
290 }
291
292 CSafeStatic<TSModSubSrcSubtype> kSModSubSrcSubtypeMap(
293 s_InitSModSubSrcSubtypeMap, nullptr);
294
x_FindBrackets(const CTempString & str,size_t & start,size_t & stop,size_t & eq_pos)295 bool x_FindBrackets(const CTempString& str, size_t& start, size_t& stop, size_t& eq_pos)
296 {
297 size_t i = start;
298 eq_pos = CTempString::npos;
299
300 const char* s = str.data() + start;
301
302 int nested_brackets = -1;
303 while (i < str.size())
304 {
305 switch (*s)
306 {
307 case '[':
308 nested_brackets++;
309 if (nested_brackets == 0)
310 {
311 start = i;
312 }
313 break;
314 case '=':
315 if (nested_brackets >= 0)
316 if (eq_pos == CTempString::npos)
317 eq_pos = i;
318 break;
319 case ']':
320 if (nested_brackets == 0)
321 {
322 stop = i;
323 if (eq_pos == CTempString::npos)
324 eq_pos = i;
325 return true;
326 }
327 else
328 if (nested_brackets < 0)
329 return false;
330 else
331 {
332 nested_brackets--;
333 }
334 }
335 i++; s++;
336 }
337 return false;
338 };
339
x_AppendIfNonEmpty(string & s,const CTempString & o)340 void x_AppendIfNonEmpty(string& s, const CTempString& o)
341 {
342 if (!o.empty())
343 {
344 if (!s.empty())
345 s.push_back(' ');
346 s.append(o.data(), o.length());
347 }
348 }
349
350 };
351
352
// Definition of the class-level static kEmptyMod (a default-managed
// CSafeStatic holding an SMod).
CSafeStatic<CSourceModParser::SMod> CSourceModParser::kEmptyMod;

// ASCII letters to lowercase, space and underscore to hyphen.
//
// One entry per unsigned char value: 256 mapped characters, plus the
// string literal's terminating NUL (hence the array size of 257).
// Every byte maps to itself except:
//   ' '  (0x20)       -> '-'   (first character of the third row)
//   'A'..'Z'          -> 'a'..'z'
//   '_'  (0x5F)       -> '-'   (last character of the fourth row)
const unsigned char CSourceModParser::kKeyCanonicalizationTable[257] =
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
"-!\"#$%&'()*+,-./0123456789:;<=>?"
"@abcdefghijklmnopqrstuvwxyz[\\]^-"
"`abcdefghijklmnopqrstuvwxyz{|}~\x7F"
"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F"
"\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F"
"\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
"\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"
"\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF"
"\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF"
"\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF"
"\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
370
371
// Smart-pointer-like accessor for an object of type _T.
//
// The object is either supplied directly (the _T& constructor) or fetched
// on first dereference through _getfromdesc(), which is specialized per
// _T further down in this file. The descriptor set to use is given
// directly (CSeq_descr&) or taken from a CBioseq / CBioseq_set when the
// accessor is first dereferenced.
template<class _T>
class CAutoInitDesc : protected CAutoAddDesc
{
public:
// Use the given descriptor set; `which` selects the descriptor choice.
CAutoInitDesc(CSeq_descr& descr, CSeqdesc::E_Choice which);
// Use the descriptor set of `bioseq`, obtained on first dereference.
CAutoInitDesc(CBioseq& bioseq, CSeqdesc::E_Choice which);
// Use the descriptor set of `bioset`, obtained on first dereference.
CAutoInitDesc(CBioseq_set& bioset, CSeqdesc::E_Choice which);
// Wrap an already existing object; no descriptor lookup takes place.
CAutoInitDesc(_T& obj);
// Both dereference operators trigger the lazy lookup when needed.
_T* operator->();
_T& operator*();
protected:
// Target object; null until resolved (or set by the _T& constructor).
_T* m_ptr;
// Sets m_ptr from the descriptor; defined only as explicit
// specializations for the supported _T types.
void _getfromdesc();
// Source of the descriptor set when constructed from a CBioseq.
mutable CRef<CBioseq> m_bioseq;
// Source of the descriptor set when constructed from a CBioseq_set.
mutable CRef<CBioseq_set> m_bioset;
};
388
389 class CAutoAddDBLink
390 {
391 public:
CAutoAddDBLink(CBioseq & seq,const CTempString & id)392 CAutoAddDBLink(CBioseq& seq, const CTempString& id)
393 :m_bioseq(seq), m_id(id)
394 {
395 }
IsInitialised() const396 bool IsInitialised() const
397 {
398 return !m_dblink.Empty();
399 }
400
Get()401 CUser_field& Get()
402 {
403 if (m_dblink)
404 return *m_dblink;
405
406 for (auto& d : m_bioseq.SetDescr().Set())
407 {
408 if (d->IsUser() && d->GetUser().IsDBLink())
409 {
410 for (auto& u : d->SetUser().SetData())
411 {
412 if (u->IsSetLabel() && u->GetLabel().IsStr() &&
413 NStr::EqualCase(u->GetLabel().GetStr(), m_id))
414 {
415 m_dblink = u;
416 return *m_dblink;
417 }
418 }
419 }
420 }
421 if (m_dblink.Empty())
422 {
423 m_user_obj.Reset(new CSeqdesc);
424 m_user_obj->SetUser().SetType().SetStr() = "DBLink";
425 m_dblink.Reset(new CUser_field);
426 m_dblink->SetLabel().SetStr() = m_id;
427 m_user_obj->SetUser().SetData().push_back(m_dblink);
428 m_bioseq.SetDescr().Set().push_back(m_user_obj);
429 }
430
431 return *m_dblink;
432 }
433 protected:
434 CBioseq& m_bioseq;
435 CTempString m_id;
436 CRef<CUser_field> m_dblink;
437 CRef<CSeqdesc> m_user_obj;
438 };
439
440 CSafeStaticRef<CSeq_descr> fake_descr;
441
442 template<class _T>
443 inline
CAutoInitDesc(CSeq_descr & descr,CSeqdesc::E_Choice which)444 CAutoInitDesc<_T>::CAutoInitDesc(CSeq_descr& descr, CSeqdesc::E_Choice which) :
445 CAutoAddDesc(descr, which),
446 m_ptr(0)
447 {
448 }
449
450 template<class _T>
451 inline
CAutoInitDesc(CBioseq & bioseq,CSeqdesc::E_Choice which)452 CAutoInitDesc<_T>::CAutoInitDesc(CBioseq& bioseq, CSeqdesc::E_Choice which) :
453 CAutoAddDesc(*fake_descr, which),
454 m_ptr(0),
455 m_bioseq(&bioseq)
456 {
457 m_descr.Reset();
458 }
459
460 template<class _T>
461 inline
CAutoInitDesc(CBioseq_set & bioset,CSeqdesc::E_Choice which)462 CAutoInitDesc<_T>::CAutoInitDesc(CBioseq_set& bioset, CSeqdesc::E_Choice which) :
463 CAutoAddDesc(*fake_descr, which),
464 m_ptr(0),
465 m_bioset(&bioset)
466
467 {
468 m_descr.Reset();
469 }
470
471 template<class _T>
472 inline
CAutoInitDesc(_T & obj)473 CAutoInitDesc<_T>::CAutoInitDesc(_T& obj):
474 CAutoAddDesc(*fake_descr, CSeqdesc::e_not_set), m_ptr(&obj)
475 {
476 m_descr.Reset();
477 }
478
479
480 template<class _T>
481 inline
operator *()482 _T& CAutoInitDesc<_T>::operator*()
483 {
484 return * operator->();
485 }
486
487 template<class _T>
488 inline
operator ->()489 _T* CAutoInitDesc<_T>::operator->()
490 {
491 if (m_ptr == 0 &&
492 m_which != CSeqdesc::e_not_set)
493 {
494 if (m_descr.Empty())
495 {
496 if (!m_bioseq.Empty())
497 m_descr = &m_bioseq->SetDescr();
498 else
499 if (!m_bioset.Empty())
500 m_descr = &m_bioset->SetDescr();
501 }
502 _getfromdesc();
503 }
504
505 return m_ptr;
506 }
507
508 template<>
_getfromdesc()509 void CAutoInitDesc<CBioSource>::_getfromdesc()
510 {
511 m_ptr = &Set().SetSource();
512 }
513
514 template<>
_getfromdesc()515 void CAutoInitDesc<CMolInfo>::_getfromdesc()
516 {
517 m_ptr = &Set().SetMolinfo();
518 }
519
520 template<>
_getfromdesc()521 void CAutoInitDesc<CGB_block>::_getfromdesc()
522 {
523 m_ptr = &Set().SetGenbank();
524 }
525
526
ParseTitle(const CTempString & title,CConstRef<CSeq_id> seqid,size_t iMaxModsToParse)527 string CSourceModParser::ParseTitle(const CTempString& title,
528 CConstRef<CSeq_id> seqid,
529 size_t iMaxModsToParse )
530 {
531 SMod mod;
532 string stripped_title;
533 size_t pos = 0;
534
535 m_Mods.clear();
536
537 mod.seqid = seqid;
538
539 size_t iModsFoundSoFar = 0;
540 for (; (pos < title.size()) && (iModsFoundSoFar < iMaxModsToParse);
541 ++iModsFoundSoFar )
542 {
543 size_t lb_pos, end_pos, eq_pos;
544 lb_pos = pos;
545 if (x_FindBrackets(title, lb_pos, end_pos, eq_pos))
546 {
547 CTempString skipped = NStr::TruncateSpaces_Unsafe(title.substr(pos, lb_pos - pos));
548
549 if (eq_pos < end_pos) {
550 CTempString key = NStr::TruncateSpaces_Unsafe(title.substr(lb_pos+1, eq_pos - lb_pos - 1));
551 CTempString value = NStr::TruncateSpaces_Unsafe(title.substr(eq_pos + 1, end_pos - eq_pos - 1));
552
553 mod.key = key;
554 mod.value = value;
555 mod.pos = lb_pos;
556 mod.used = false;
557 m_Mods.emplace(mod);
558 }
559
560 x_AppendIfNonEmpty(stripped_title, skipped);
561
562 pos = end_pos + 1;
563 }
564 else
565 { // rest of the title is unparsed
566 CTempString rest = NStr::TruncateSpaces_Unsafe(title.substr(pos));
567 x_AppendIfNonEmpty(stripped_title, rest);
568 break;
569 }
570 }
571
572 return stripped_title;
573 }
574
ApplyAllMods(CBioseq & seq,CTempString organism,CConstRef<CSeq_loc> location)575 void CSourceModParser::ApplyAllMods(CBioseq& seq, CTempString organism, CConstRef<CSeq_loc> location)
576 {
577 ApplyMods(seq);
578 // Although the logic below reuses some existing objects if
579 // present, it always creates new features and descriptors.
580
581 {{
582 CRef<CSeq_id> best_id = FindBestChoice(seq.GetId(), CSeq_id::BestRank);
583 if (location.Empty() && !best_id.Empty())
584 {
585 CRef<CSeq_loc> loc(new CSeq_loc);
586 loc->SetWhole(*best_id);
587 location = loc;
588 }
589
590 if (location)
591 {
592 CAutoInitRef<CSeq_annot> ftable;
593 bool had_ftable = false;
594
595 if (seq.IsSetAnnot()) {
596 NON_CONST_ITERATE (CBioseq::TAnnot, it, seq.SetAnnot()) {
597 if ((*it)->GetData().IsFtable()) {
598 ftable.Set(*it);
599 had_ftable = true;
600 break;
601 }
602 }
603 }
604
605 // CGene_ref only on nucleotide seqs
606 if( ! FIELD_CHAIN_OF_2_IS_SET(seq, Inst, Mol) || seq.IsNa() ) {
607 CAutoInitRef<CGene_ref> gene;
608 x_ApplyMods(gene);
609 if (gene.IsInitialized()) {
610 CRef<CSeq_feat> feat(new CSeq_feat);
611 feat->SetData().SetGene(*gene);
612 feat->SetLocation().Assign(*location);
613 ftable->SetData().SetFtable().push_back(feat);
614 }
615 }
616
617 // only add Prot_ref if amino acid (or at least not nucleic acid)
618 // (Yes, the FIELD_CHAIN_OF_2_IS_SET is necessary because IsAa()
619 // can throw an exception if mol isn't set)
620 if( ! FIELD_CHAIN_OF_2_IS_SET(seq, Inst, Mol) || seq.IsAa() ) {
621 CAutoInitRef<CProt_ref> prot;
622 x_ApplyMods(prot);
623 if ( prot.IsInitialized() ) {
624 CRef<CSeq_feat> feat(new CSeq_feat);
625 feat->SetData().SetProt(*prot);
626 feat->SetLocation().Assign(*location);
627 ftable->SetData().SetFtable().push_back(feat);
628 }
629 }
630
631 if ( !had_ftable && ftable.IsInitialized() ) {
632 seq.SetAnnot().push_back(CRef<CSeq_annot>(&*ftable));
633 }
634 }
635 }}
636
637 if (seq.GetInst().IsSetHist()) {
638 ApplyMods(seq.SetInst().SetHist());
639 } else {
640 CAutoInitRef<CSeq_hist> hist;
641 x_ApplyMods(hist);
642 if (hist.IsInitialized()) {
643 seq.SetInst().SetHist(*hist);
644 }
645 }
646
647 {{
648 //CSeq_descr* descr = 0;
649 if (
650 seq.GetParentSet() && seq.GetParentSet()->IsSetClass() &&
651 seq.GetParentSet()->GetClass() == CBioseq_set::eClass_nuc_prot)
652 {
653 CBioseq_set& bioset = *(CBioseq_set*)(seq.GetParentSet().GetPointerOrNull());
654 //descr = &bioset.SetDescr();
655 CAutoInitDesc<CBioSource> bsrc(bioset, CSeqdesc::e_Source);
656 x_ApplyMods(bsrc, organism);
657 }
658 else
659 {
660 //descr = &seq.SetDescr();
661 CAutoInitDesc<CBioSource> bsrc(seq, CSeqdesc::e_Source);
662 x_ApplyMods(bsrc, organism);
663 }
664 //CAutoInitDesc<CBioSource> bsrc(*descr, CSeqdesc::e_Source);
665 //x_ApplyMods(bsrc, organism);
666 }}
667
668 {{
669 CAutoInitDesc<CMolInfo> mi(seq, CSeqdesc::e_Molinfo);
670 x_ApplyMods(mi);
671 }}
672
673 {{
674 CAutoInitDesc<CGB_block> gbb(seq, CSeqdesc::e_Genbank);
675 x_ApplyMods(gbb);
676 }}
677
678 {{
679 CAutoInitRef<CUser_object> tpa;
680 x_ApplyTPAMods(tpa);
681 if (tpa.IsInitialized()) {
682 CRef<CSeqdesc> desc(new CSeqdesc);
683 desc->SetUser(*tpa);
684 seq.SetDescr().Set().push_back(desc);
685 }
686 }}
687
688 x_ApplyDBLinkMods(seq);
689
690 {{
691 CAutoInitRef<CUser_object> gpdb;
692 x_ApplyGenomeProjectsDBMods(gpdb);
693 if (gpdb.IsInitialized()) {
694 CRef<CSeqdesc> desc(new CSeqdesc);
695 desc->SetUser(*gpdb);
696 seq.SetDescr().Set().push_back(desc);
697 }
698 }}
699
700 {{
701 ApplyPubMods(seq);
702 }}
703
704 TMods unusedMods = GetMods(fUnusedMods);
705 for (TMods::const_iterator unused = unusedMods.begin(); unused != unusedMods.end(); ++unused) {
706 x_HandleUnkModValue(*unused);
707 }
708 };
709
710 struct SMolTypeInfo {
711
712 // is it shown to the user as a possibility or just silently accepted?
713 enum EShown {
714 eShown_Yes, // Yes, show to user in error messages, etc.
715 eShown_No // No, don't show the user, but silently accept it if the user gives it to us
716 };
717
SMolTypeInfoSMolTypeInfo718 SMolTypeInfo(
719 EShown eShown,
720 CMolInfo::TBiomol eBiomol,
721 CSeq_inst::EMol eMol ) :
722 m_eBiomol(eBiomol), m_eMol(eMol), m_eShown(eShown)
723 { }
724
725 CMolInfo::TBiomol m_eBiomol;
726 CSeq_inst::EMol m_eMol;
727 EShown m_eShown;
728 };
// Table of accepted mol-type strings and what each one stands for.
// Lookup goes through CSourceModParser::PKeyCompare (see TBiomolMap below).
typedef SStaticPair<const char*, SMolTypeInfo> TBiomolMapEntry;
static const TBiomolMapEntry sc_BiomolArray[] = {
// careful with the sort: remember that the key is canonicalized first
// (per kKeyCanonicalizationTable: lowercased, ' ' and '_' turned into '-'),
// so the entries are ordered by their canonical form, e.g. "rRNA" sorts
// after "Ribosomal RNA".
{"cRNA", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_cRNA, CSeq_inst::eMol_rna) },
{"DNA", SMolTypeInfo(SMolTypeInfo::eShown_No, CMolInfo::eBiomol_genomic, CSeq_inst::eMol_dna) },
{"Genomic", SMolTypeInfo(SMolTypeInfo::eShown_No, CMolInfo::eBiomol_genomic, CSeq_inst::eMol_dna) },
{"Genomic DNA", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_genomic, CSeq_inst::eMol_dna) },
{"Genomic RNA", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_genomic, CSeq_inst::eMol_rna) },
{"mRNA", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_mRNA, CSeq_inst::eMol_rna) },
{"ncRNA", SMolTypeInfo(SMolTypeInfo::eShown_No, CMolInfo::eBiomol_ncRNA, CSeq_inst::eMol_rna) },
{"non-coding RNA", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_ncRNA, CSeq_inst::eMol_rna) },
{"Other-Genetic", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_other_genetic, CSeq_inst::eMol_other) },
{"Precursor RNA", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_pre_RNA, CSeq_inst::eMol_rna) },
{"Ribosomal RNA", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_rRNA, CSeq_inst::eMol_rna) },
{"rRNA", SMolTypeInfo(SMolTypeInfo::eShown_No, CMolInfo::eBiomol_rRNA, CSeq_inst::eMol_rna) },
{"Transcribed RNA", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_transcribed_RNA, CSeq_inst::eMol_rna) },
{"Transfer-messenger RNA", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_tmRNA, CSeq_inst::eMol_rna) },
{"Transfer RNA", SMolTypeInfo(SMolTypeInfo::eShown_Yes, CMolInfo::eBiomol_tRNA, CSeq_inst::eMol_rna) },
{"tRNA", SMolTypeInfo(SMolTypeInfo::eShown_No, CMolInfo::eBiomol_tRNA, CSeq_inst::eMol_rna) },
};
// Static map over the array above, keyed with the canonicalizing comparator.
typedef CStaticPairArrayMap<const char*, SMolTypeInfo,
CSourceModParser::PKeyCompare> TBiomolMap;
DEFINE_STATIC_ARRAY_MAP(TBiomolMap, sc_BiomolMap, sc_BiomolArray);
752
ApplyMods(CBioseq & seq)753 void CSourceModParser::ApplyMods(CBioseq& seq)
754 {
755 const SMod* mod = NULL;
756
757 // top[ology]
758 if ((mod = FindMod(s_Mod_topology, s_Mod_top)) != NULL) {
759 if (NStr::EqualNocase(mod->value, "linear")) {
760 seq.SetInst().SetTopology(CSeq_inst::eTopology_linear);
761 } else if (NStr::EqualNocase(mod->value, "circular")) {
762 seq.SetInst().SetTopology(CSeq_inst::eTopology_circular);
763 } else {
764 x_HandleBadModValue(*mod);
765 }
766 }
767
768 // molecule information is not set for proteins at this time
769 // (Yes, the FIELD_CHAIN_OF_2_IS_SET is necessary because IsNa()
770 // can throw an exception if mol isn't set)
771 if( ! FIELD_CHAIN_OF_2_IS_SET(seq, Inst, Mol) || seq.IsNa() ) {
772 bool bMolSetViaMolMod = false;
773
774 // mol[ecule]
775 if ((mod = FindMod(s_Mod_molecule, s_Mod_mol)) != NULL) {
776 if (NStr::EqualNocase(mod->value, "dna")) {
777 seq.SetInst().SetMol( CSeq_inst::eMol_dna );
778 bMolSetViaMolMod = true;
779 } else if (NStr::EqualNocase(mod->value, "rna")) {
780 seq.SetInst().SetMol( CSeq_inst::eMol_rna );
781 bMolSetViaMolMod = true;
782 } else {
783 x_HandleBadModValue(*mod);
784 }
785 }
786
787 // if mol/molecule not set right, we can use moltype instead
788
789 // mol[-]type
790 if( ! bMolSetViaMolMod ) {
791 if ((mod = FindMod(s_Mod_moltype, s_Mod_mol_type)) != NULL) {
792 TBiomolMap::const_iterator it = sc_BiomolMap.find(mod->value.c_str());
793 if (it == sc_BiomolMap.end()) {
794 x_HandleBadModValue(*mod);
795 } else {
796 // moltype sets biomol and inst.mol
797 seq.SetInst().SetMol(it->second.m_eMol);
798 }
799 }
800 }
801 }
802
803 // strand
804 if ((mod = FindMod(s_Mod_strand)) != NULL) {
805 if (NStr::EqualNocase(mod->value, "single")) {
806 seq.SetInst().SetStrand( CSeq_inst::eStrand_ss );
807 } else if (NStr::EqualNocase(mod->value, "double")) {
808 seq.SetInst().SetStrand( CSeq_inst::eStrand_ds );
809 } else if (NStr::EqualNocase(mod->value, "mixed")) {
810 seq.SetInst().SetStrand( CSeq_inst::eStrand_mixed );
811 } else {
812 x_HandleBadModValue(*mod);
813 }
814 }
815
816 // comment
817 if ((mod = FindMod(s_Mod_comment)) != NULL) {
818 CRef<CSeqdesc> desc(new CSeqdesc);
819 desc->SetComment( mod->value );
820 seq.SetDescr().Set().push_back(desc);
821 }
822 }
823
824
s_AddPrimers(const pair<string,string> & primer_info,CPCRPrimerSet & primer_set)825 static void s_AddPrimers(const pair<string, string>& primer_info, CPCRPrimerSet& primer_set)
826 {
827 vector<string> names;
828 NStr::Split(primer_info.first, ":", names, NStr::fSplit_Tokenize);
829 vector<string> seqs;
830 NStr::Split(primer_info.second, ":", seqs, NStr::fSplit_Tokenize);
831
832 const auto num_names = names.size();
833 const auto num_seqs = seqs.size();
834 const auto num_primers = max(num_names, num_seqs);
835
836 for(size_t i=0; i<num_primers; ++i) {
837 auto primer = Ref(new CPCRPrimer());
838
839 if (i<num_names && !NStr::IsBlank(names[i])) {
840 primer->SetName().Set(names[i]);
841 }
842 if (i<num_seqs && !NStr::IsBlank(seqs[i])) {
843 primer->SetSeq().Set(seqs[i]);
844 }
845 primer_set.Set().push_back(primer);
846 }
847 }
848
849
s_GetPrimerInfo(const CSourceModParser::SMod * pNamesMod,const CSourceModParser::SMod * pSeqsMod,vector<pair<string,string>> & reaction_info)850 static void s_GetPrimerInfo(const CSourceModParser::SMod* pNamesMod,
851 const CSourceModParser::SMod* pSeqsMod,
852 vector<pair<string, string>>& reaction_info)
853 {
854 reaction_info.clear();
855 vector<string> names;
856 if (pNamesMod) {
857 NStr::Split(pNamesMod->value, ",", names, NStr::fSplit_Tokenize);
858 }
859
860 vector<string> seqs;
861 if (pSeqsMod) {
862 NStr::Split(pSeqsMod->value, ",", seqs, NStr::fSplit_Tokenize);
863 if (seqs.size()>1) {
864 if (seqs.front().front() == '(') {
865 seqs.front().erase(0,1);
866 }
867 if (seqs.back().back() == ')') {
868 seqs.back().erase(seqs.back().size()-1, 1);
869 }
870 }
871 }
872
873 const auto num_names = names.size();
874 const auto num_seqs = seqs.size();
875 const auto num_reactions = max(num_names, num_seqs);
876
877 for (int i=0; i<num_reactions; ++i) {
878 const string name = (i<num_names) ? names[i] : "";
879 const string seq = (i<num_seqs) ? seqs[i] : "";
880 reaction_info.push_back(make_pair(name, seq));
881 }
882 }
883
884
x_AddPCRPrimers(CAutoInitRef<CPCRReactionSet> & pcr_reaction_set)885 void CSourceModParser::x_AddPCRPrimers(CAutoInitRef<CPCRReactionSet>& pcr_reaction_set)
886 {
887 using TNameSeqPair = pair<string, string>;
888
889 const SMod* pNameMod = nullptr;
890 const SMod* pSeqMod = nullptr;
891
892 pNameMod = FindMod(s_Mod_fwd_primer_name, s_Mod_fwd_pcr_primer_name);
893 pSeqMod = FindMod(s_Mod_fwd_primer_seq, s_Mod_fwd_pcr_primer_seq);
894 vector<TNameSeqPair> fwd_primer_info;
895 s_GetPrimerInfo(pNameMod, pSeqMod, fwd_primer_info);
896
897
898 pNameMod = FindMod(s_Mod_rev_primer_name, s_Mod_rev_pcr_primer_name);
899 pSeqMod = FindMod(s_Mod_rev_primer_seq, s_Mod_rev_pcr_primer_seq);
900 vector<TNameSeqPair> rev_primer_info;
901 s_GetPrimerInfo(pNameMod, pSeqMod, rev_primer_info);
902
903 if (fwd_primer_info.empty() &&
904 rev_primer_info.empty()) {
905 return;
906 }
907
908 auto num_fwd_primer_info = fwd_primer_info.size();
909 auto num_rev_primer_info = rev_primer_info.size();
910
911 if (num_fwd_primer_info == num_rev_primer_info) {
912 for (auto i=0; i<num_fwd_primer_info; ++i) {
913 CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
914 s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
915 s_AddPrimers(rev_primer_info[i], pcr_reaction->SetReverse());
916 pcr_reaction_set->Set().push_back(pcr_reaction);
917 }
918 }
919 else
920 if (num_fwd_primer_info > num_rev_primer_info) {
921 auto diff = num_fwd_primer_info - num_rev_primer_info;
922 for (int i=0; i<diff; ++i) {
923 CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
924 s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
925 pcr_reaction_set->Set().push_back(pcr_reaction);
926 }
927
928 for (int i=diff; i<num_fwd_primer_info; ++i) {
929 CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
930 s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
931 s_AddPrimers(rev_primer_info[i-diff], pcr_reaction->SetReverse());
932 pcr_reaction_set->Set().push_back(pcr_reaction);
933 }
934 }
935 else
936 if (num_fwd_primer_info < num_rev_primer_info) {
937 for (int i=0; i<num_fwd_primer_info; ++i) {
938 CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
939 s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
940 s_AddPrimers(rev_primer_info[i], pcr_reaction->SetReverse());
941 pcr_reaction_set->Set().push_back(pcr_reaction);
942 }
943
944 for (int i=num_fwd_primer_info; i<num_rev_primer_info; ++i) {
945 CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
946 s_AddPrimers(rev_primer_info[i], pcr_reaction->SetReverse());
947 pcr_reaction_set->Set().push_back(pcr_reaction);
948 }
949 }
950 }
951
952
x_ApplyMods(CAutoInitDesc<CBioSource> & bsrc,CTempString organism)953 void CSourceModParser::x_ApplyMods(CAutoInitDesc<CBioSource>& bsrc,
954 CTempString organism)
955 {
956 const SMod* mod = NULL;
957 bool reset_taxid = false;
958
959 // org[anism]
960 if (organism.empty())
961 {
962 if ((mod = FindMod(s_Mod_organism, s_Mod_org)) != NULL) {
963 organism = mod->value;
964 }
965 else
966 if ((mod = FindMod(s_Mod_taxname)) != NULL) {
967 organism = mod->value;
968 }
969 }
970
971 if ( !organism.empty())
972 {
973 if (!(bsrc->GetOrg().IsSetTaxname() && NStr::EqualNocase(bsrc->GetOrg().GetTaxname(), organism)))
974 {
975 if (bsrc->GetOrg().IsSetTaxname())
976 {
977 bsrc->ResetOrg();
978 // bsrc->ResetSubtype();
979 }
980 bsrc->SetOrg().SetTaxname(organism);
981 reset_taxid = true;
982 }
983 }
984
985 // location
986 if ((mod = FindMod(s_Mod_location)) != NULL) {
987 if (NStr::EqualNocase(mod->value, "mitochondrial")) {
988 bsrc->SetGenome(CBioSource::eGenome_mitochondrion);
989 } else if (NStr::EqualNocase(mod->value, "provirus")) {
990 bsrc->SetGenome(CBioSource::eGenome_proviral);
991 } else if (NStr::EqualNocase(mod->value, "extrachromosomal")) {
992 bsrc->SetGenome(CBioSource::eGenome_extrachrom);
993 } else if (NStr::EqualNocase(mod->value, "insertion sequence")) {
994 bsrc->SetGenome(CBioSource::eGenome_insertion_seq);
995 } else {
996 try {
997 bsrc->SetGenome(CBioSource::GetTypeInfo_enum_EGenome()
998 ->FindValue(mod->value));
999 } catch (CSerialException&) {
1000 x_HandleBadModValue(*mod);
1001 }
1002 }
1003 }
1004
1005 // origin
1006 if ((mod = FindMod(s_Mod_origin)) != NULL) {
1007 try {
1008 // also check for special cases that don't match the enum name
1009 if( NStr::EqualNocase(mod->value, "natural mutant") ) {
1010 bsrc->SetOrigin( CBioSource::eOrigin_natmut );
1011 } else if( NStr::EqualNocase(mod->value, "mutant") ) {
1012 bsrc->SetOrigin( CBioSource::eOrigin_mut );
1013 } else {
1014 bsrc->SetOrigin(CBioSource::GetTypeInfo_enum_EOrigin()
1015 ->FindValue(mod->value));
1016 }
1017 } catch (CSerialException&) {
1018 x_HandleBadModValue(*mod);
1019 }
1020 }
1021
1022 // handle orgmods
1023 for(const auto & smod_orgsubtype : kSModOrgSubtypeMap.Get()) {
1024 const SMod & smod = smod_orgsubtype.first;
1025 const COrgMod::ESubtype e_subtype = smod_orgsubtype.second;
1026 if ((mod = FindMod(smod.key)) != NULL) {
1027 CRef<COrgMod> org_mod(new COrgMod);
1028 org_mod->SetSubtype(e_subtype);
1029 org_mod->SetSubname(mod->value);
1030 bsrc->SetOrg().SetOrgname().SetMod().push_back(org_mod);
1031 reset_taxid = true;
1032 }
1033 }
1034
1035 // handle subsources
1036 for( const auto & smod_subsrcsubtype : kSModSubSrcSubtypeMap.Get() ) {
1037 const SMod & smod = smod_subsrcsubtype.first;
1038 const CSubSource::ESubtype e_subtype = smod_subsrcsubtype.second;
1039 if ((mod = FindMod(smod.key)) != NULL) {
1040 auto& subtype = bsrc->SetSubtype();
1041 CRef<CSubSource> subsource(new CSubSource);
1042 subsource->SetSubtype(e_subtype);
1043
1044 if( CSubSource::NeedsNoText(e_subtype) ) {
1045 subsource->SetName(kEmptyStr);
1046 } else {
1047 subsource->SetName(mod->value);
1048 }
1049
1050 if (!CSubSource::IsMultipleValuesAllowed(e_subtype))
1051 {
1052 // since only one of this e_subtype is allowed, we erase any
1053 // that are already in the subtype list.
1054 // (Unfortunately, we cannot just use bsrc->RemoveSubSource
1055 // because it will ResetSubtype if subtype ends up empty)
1056 subtype.erase(
1057 remove_if(subtype.begin(), subtype.end(),
1058 equal_subtype(e_subtype)),
1059 subtype.end());
1060 }
1061
1062 subtype.push_back(subsource);
1063 }
1064 }
1065
1066 // handle PCR Primers
1067 {{
1068 CAutoInitRef<CPCRReactionSet> pcr_reaction_set;
1069 x_AddPCRPrimers(pcr_reaction_set);
1070 if (pcr_reaction_set.IsInitialized()) {
1071 if (!bsrc->IsSetPcr_primers()) {
1072 bsrc->SetPcr_primers(*pcr_reaction_set);
1073 }
1074 else {
1075 bsrc->SetPcr_primers().Set().splice(
1076 bsrc->SetPcr_primers().Set().end(),
1077 pcr_reaction_set->Set());
1078 }
1079 }
1080 }}
1081
1082
1083 // db_xref
1084 TModsRange db_xref_mods_range = FindAllMods( s_Mod_db_xref, s_Mod_dbxref );
1085 for( TModsCI db_xref_iter = db_xref_mods_range.first;
1086 db_xref_iter != db_xref_mods_range.second;
1087 ++db_xref_iter ) {
1088 CRef< CDbtag > new_db( new CDbtag );
1089
1090 const CTempString db_xref_str = db_xref_iter->value;
1091 CRef<CObject_id> object_id(new CObject_id);
1092
1093 size_t colon_location = db_xref_str.find(":");
1094 if (colon_location == string::npos) {
1095 // no colon: it's just tag, and db is unknown
1096 new_db->SetDb() = "?";
1097 db_xref_str.Copy(object_id->SetStr(), 0, CTempString::npos);
1098 } else {
1099 // there's a colon, so db and tag are both known
1100 db_xref_str.Copy(new_db->SetDb(), 0, colon_location);
1101 db_xref_str.Copy(object_id->SetStr(), colon_location + 1, CTempString::npos);
1102 }
1103
1104 new_db->SetTag( *object_id );
1105
1106 bsrc->SetOrg().SetDb().push_back( new_db );
1107 }
1108
1109 // div[ision]
1110 if ((mod = FindMod(s_Mod_division, s_Mod_div)) != NULL) {
1111 bsrc->SetOrg().SetOrgname().SetDiv( mod->value );
1112 }
1113
1114 // lineage
1115 if ((mod = FindMod(s_Mod_lineage)) != NULL) {
1116 bsrc->SetOrg().SetOrgname().SetLineage( mod->value );
1117 }
1118
1119 // gcode
1120 if ((mod = FindMod(s_Mod_gcode)) != NULL) {
1121 bsrc->SetOrg().SetOrgname().SetGcode( NStr::StringToInt(mod->value, NStr::fConvErr_NoThrow) );
1122 }
1123
1124 // mgcode
1125 if ((mod = FindMod(s_Mod_mgcode)) != NULL) {
1126 bsrc->SetOrg().SetOrgname().SetMgcode( NStr::StringToInt(mod->value, NStr::fConvErr_NoThrow) );
1127 }
1128
1129 // pgcode
1130 if ((mod = FindMod(s_Mod_pgcode)) != NULL) {
1131 bsrc->SetOrg().SetOrgname().SetPgcode( NStr::StringToInt(mod->value, NStr::fConvErr_NoThrow) );
1132 }
1133
1134 // note[s]
1135 TModsRange mods[2];
1136 mods[0] = FindAllMods(s_Mod_note);
1137 mods[1] = FindAllMods(s_Mod_notes);
1138 for (size_t i = 0; i < 2; i++)
1139 {
1140 for (TModsCI it = mods[i].first; it != mods[i].second; it++)
1141 {
1142 CRef< CSubSource > new_subsource(new CSubSource);
1143 new_subsource->SetSubtype(CSubSource::eSubtype_other);
1144 new_subsource->SetName(it->value);
1145 bsrc->SetSubtype().push_back(new_subsource);
1146 }
1147 }
1148
1149 // focus
1150 if ((mod = FindMod(s_Mod_focus)) != NULL) {
1151 if( NStr::EqualNocase( mod->value, "TRUE" ) ) {
1152 bsrc->SetIs_focus();
1153 }
1154 }
1155
1156
1157 if ((mod = FindMod(s_Mod_taxid)) != NULL) {
1158 bsrc->SetOrg().SetTaxId( NStr::StringToNumeric<TTaxId>(mod->value, NStr::fConvErr_NoThrow) );
1159 }
1160 else
1161 if (reset_taxid && bsrc->IsSetOrgname() && bsrc->GetOrg().GetTaxId() != ZERO_TAX_ID) {
1162 bsrc->SetOrg().SetTaxId(ZERO_TAX_ID);
1163 }
1164 }
1165
1166 typedef SStaticPair<const char*, CMolInfo::TTech> TTechMapEntry;
1167 static const TTechMapEntry sc_TechArray[] = {
1168 { "?", CMolInfo::eTech_unknown },
1169 { "barcode", CMolInfo::eTech_barcode },
1170 { "both", CMolInfo::eTech_both },
1171 { "composite-wgs-htgs", CMolInfo::eTech_composite_wgs_htgs },
1172 { "concept-trans", CMolInfo::eTech_concept_trans },
1173 { "concept-trans-a", CMolInfo::eTech_concept_trans_a },
1174 { "derived", CMolInfo::eTech_derived },
1175 { "EST", CMolInfo::eTech_est },
1176 { "fli cDNA", CMolInfo::eTech_fli_cdna },
1177 { "genetic map", CMolInfo::eTech_genemap },
1178 { "htc", CMolInfo::eTech_htc },
1179 { "htgs 0", CMolInfo::eTech_htgs_0 },
1180 { "htgs 1", CMolInfo::eTech_htgs_1 },
1181 { "htgs 2", CMolInfo::eTech_htgs_2 },
1182 { "htgs 3", CMolInfo::eTech_htgs_3 },
1183 { "physical map", CMolInfo::eTech_physmap },
1184 { "seq-pept", CMolInfo::eTech_seq_pept },
1185 { "seq-pept-homol", CMolInfo::eTech_seq_pept_homol },
1186 { "seq-pept-overlap", CMolInfo::eTech_seq_pept_overlap },
1187 { "standard", CMolInfo::eTech_standard },
1188 { "STS", CMolInfo::eTech_sts },
1189 { "survey", CMolInfo::eTech_survey },
1190 { "targeted", CMolInfo::eTech_targeted },
1191 { "tsa", CMolInfo::eTech_tsa },
1192 { "wgs", CMolInfo::eTech_wgs }
1193 };
1194 typedef CStaticPairArrayMap<const char*, CMolInfo::TTech,
1195 CSourceModParser::PKeyCompare> TTechMap;
1196 DEFINE_STATIC_ARRAY_MAP(TTechMap, sc_TechMap, sc_TechArray);
1197
1198 typedef SStaticPair<const char*, CMolInfo::TCompleteness> TCompletenessMapEntry;
1199 static const TCompletenessMapEntry sc_CompletenessArray[] = {
1200 { "complete", CMolInfo::eCompleteness_complete },
1201 { "has-left", CMolInfo::eCompleteness_has_left },
1202 { "has-right", CMolInfo::eCompleteness_has_right },
1203 { "no-ends", CMolInfo::eCompleteness_no_ends },
1204 { "no-left", CMolInfo::eCompleteness_no_left },
1205 { "no-right", CMolInfo::eCompleteness_no_right },
1206 { "partial", CMolInfo::eCompleteness_partial }
1207 };
1208 typedef CStaticPairArrayMap<const char*, CMolInfo::TCompleteness,
1209 CSourceModParser::PKeyCompare> TCompletenessMap;
1210 DEFINE_STATIC_ARRAY_MAP(TCompletenessMap, sc_CompletenessMap, sc_CompletenessArray);
1211
x_ApplyMods(CAutoInitDesc<CMolInfo> & mi)1212 void CSourceModParser::x_ApplyMods(CAutoInitDesc<CMolInfo>& mi)
1213 {
1214 const SMod* mod = NULL;
1215
1216 // mol[-]type
1217 if ((mod = FindMod(s_Mod_moltype, s_Mod_mol_type)) != NULL) {
1218 TBiomolMap::const_iterator it = sc_BiomolMap.find(mod->value.c_str());
1219 if (it == sc_BiomolMap.end()) {
1220 // construct the possible bad values by hand
1221 x_HandleBadModValue(*mod);
1222 } else {
1223 // moltype sets biomol and inst.mol
1224 mi->SetBiomol(it->second.m_eBiomol);
1225 }
1226 }
1227
1228 // tech
1229 if ((mod = FindMod(s_Mod_tech)) != NULL) {
1230 TTechMap::const_iterator it = sc_TechMap.find(mod->value.c_str());
1231 if (it == sc_TechMap.end()) {
1232 x_HandleBadModValue(*mod);
1233 } else {
1234 mi->SetTech(it->second);
1235 }
1236 }
1237
1238 // complete[d]ness
1239 if ((mod = FindMod(s_Mod_completeness, s_Mod_completedness)) != NULL) {
1240 TTechMap::const_iterator it = sc_CompletenessMap.find(mod->value.c_str());
1241 if (it == sc_CompletenessMap.end()) {
1242 x_HandleBadModValue(*mod);
1243 } else {
1244 mi->SetCompleteness(it->second);
1245 }
1246 }
1247 }
1248
x_ApplyMods(CAutoInitRef<CGene_ref> & gene)1249 void CSourceModParser::x_ApplyMods(CAutoInitRef<CGene_ref>& gene)
1250 {
1251 const SMod* mod = NULL;
1252
1253 // gene
1254 if ((mod = FindMod(s_Mod_gene)) != NULL) {
1255 gene->SetLocus(mod->value);
1256 }
1257
1258 // allele
1259 if ((mod = FindMod(s_Mod_allele)) != NULL) {
1260 gene->SetAllele( mod->value );
1261 }
1262
1263 // gene_syn[onym]
1264 if ((mod = FindMod(s_Mod_gene_syn, s_Mod_gene_synonym)) != NULL) {
1265 gene->SetSyn().push_back( mod->value );
1266 }
1267
1268 // locus_tag
1269 if ((mod = FindMod(s_Mod_locus_tag)) != NULL) {
1270 gene->SetLocus_tag( mod->value );
1271 }
1272 }
1273
1274
x_ApplyMods(CAutoInitRef<CProt_ref> & prot)1275 void CSourceModParser::x_ApplyMods(CAutoInitRef<CProt_ref>& prot)
1276 {
1277 const SMod* mod = NULL;
1278
1279 // prot[ein]
1280 if ((mod = FindMod(s_Mod_protein, s_Mod_prot)) != NULL) {
1281 prot->SetName().push_back(mod->value);
1282 }
1283
1284 // prot[ein]_desc
1285 if ((mod = FindMod(s_Mod_prot_desc, s_Mod_protein_desc)) != NULL) {
1286 prot->SetDesc( mod->value );
1287 }
1288
1289 // EC_number
1290 if ((mod = FindMod(s_Mod_EC_number)) != NULL) {
1291 prot->SetEc().push_back( mod->value );
1292 }
1293
1294 // activity/function
1295 if ((mod = FindMod(s_Mod_activity, s_Mod_function)) != NULL) {
1296 prot->SetActivity().push_back( mod->value );
1297 }
1298 }
1299
1300
x_ApplyMods(CAutoInitDesc<CGB_block> & gbb)1301 void CSourceModParser::x_ApplyMods(CAutoInitDesc<CGB_block>& gbb)
1302 {
1303 const SMod* mod = NULL;
1304
1305 // secondary-accession[s]
1306 if ((mod = FindMod(s_Mod_secondary_accession,
1307 s_Mod_secondary_accessions)) != NULL)
1308 {
1309 list<CTempString> ranges;
1310 NStr::Split(mod->value, ",", ranges, NStr::fSplit_MergeDelimiters);
1311 ITERATE (list<CTempString>, it, ranges) {
1312 string s = NStr::TruncateSpaces_Unsafe(*it);
1313 try {
1314 SSeqIdRange range(s);
1315 ITERATE (SSeqIdRange, it2, range) {
1316 gbb->SetExtra_accessions().push_back(*it2);
1317 }
1318 } catch (CSeqIdException&) {
1319 gbb->SetExtra_accessions().push_back(s);
1320 }
1321 }
1322 }
1323
1324 // keyword[s]
1325 if ((mod = FindMod(s_Mod_keyword, s_Mod_keywords)) != NULL) {
1326 list<string> keywordList;
1327 NStr::Split(mod->value, ",;", keywordList, NStr::fSplit_MergeDelimiters);
1328 // trim every string and push it into the real keyword list
1329 NON_CONST_ITERATE( list<string>, keyword_iter, keywordList ) {
1330 NStr::TruncateSpacesInPlace( *keyword_iter );
1331 gbb->SetKeywords().push_back( *keyword_iter );
1332 }
1333 }
1334 }
1335
1336
x_ApplyMods(CAutoInitRef<CSeq_hist> & hist)1337 void CSourceModParser::x_ApplyMods(CAutoInitRef<CSeq_hist>& hist)
1338 {
1339 const SMod* mod = NULL;
1340
1341 // secondary-accession[s]
1342 if ((mod = FindMod(s_Mod_secondary_accession,
1343 s_Mod_secondary_accessions)) != NULL)
1344 {
1345 list<CTempString> ranges;
1346 NStr::Split(mod->value, ",", ranges, NStr::fSplit_MergeDelimiters);
1347 ITERATE (list<CTempString>, it, ranges) {
1348 string s = NStr::TruncateSpaces_Unsafe(*it);
1349 try {
1350 SSeqIdRange range(s);
1351 ITERATE (SSeqIdRange, it2, range) {
1352 hist->SetReplaces().SetIds().push_back(it2.GetID());
1353 }
1354 } catch (CSeqIdException&) {
1355 NStr::ReplaceInPlace(s, "ref_seq|", "ref|", 0, 1);
1356 hist->SetReplaces().SetIds()
1357 .push_back(CRef<CSeq_id>(new CSeq_id(s)));
1358 }
1359 }
1360 }
1361 }
1362
// Note: the code below is untested.
//
// It is currently unused, but is kept here in case someone decides at
// some point in the future that it is wanted after all.
//
// It is not enabled because it would introduce a whole new
// dependency just for a single keyword.
1370 //
1371 //void CSourceModParser::x_ApplyMods(CAutoInitRef<CSubmit_block>& sb) {
1372 //
1373 // // hup
1374 // if ((mod = FindMod("hup")) != NULL) {
1375 // sb->SetHup( false );
1376 // sb->ResetReldate();
1377 // if( ! mod->value.empty() ) {
1378 // if( NStr::EqualNocase( mod->value, "y" ) ) {
1379 // sb->SetHup( true );
1380 // // by default, release in a year
1381 // CDate releaseDate( CTime(CTime::eCurrent) );
1382 // _ASSERT(releaseDate.IsStd());
1383 // releaseDate.GetStd().SetYear( releaseDate.GetStd().GetYear() + 1 );
1384 // sb->SetReldate( releaseDate );
1385 // } else {
1386 // // parse string as "m/d/y" (or with "-" instead of "/" )
1387 // try {
1388 // CTime hupTime( NStr::Replace( mod->value, "-", "/" ), "M/D/Y" );
1389 // sb->SetReldate( CDate(hupTime) );
1390 // sb->SetHup( true );
1391 // } catch( const CException & e) {
1392 // // couldn't parse date
1393 // x_HandleBadModValue(*mod);
1394 // }
1395 // }
1396 // }
1397 // }
1398 //}
1399
1400
1401 static
s_PopulateUserObject(CUser_object & uo,const string & type,CUser_object::TData & data)1402 void s_PopulateUserObject(CUser_object& uo, const string& type,
1403 CUser_object::TData& data)
1404 {
1405 if (uo.GetType().Which() == CObject_id::e_not_set) {
1406 uo.SetType().SetStr(type);
1407 } else if ( !uo.GetType().IsStr() || uo.GetType().GetStr() != type) {
1408 // warn first?
1409 return;
1410 }
1411
1412 swap(uo.SetData(), data);
1413 }
1414
1415
x_ApplyTPAMods(CAutoInitRef<CUser_object> & tpa)1416 void CSourceModParser::x_ApplyTPAMods(CAutoInitRef<CUser_object>& tpa)
1417 {
1418 const SMod* mod = NULL;
1419
1420 // primary[-accessions]
1421 if ((mod = FindMod(s_Mod_primary, s_Mod_primary_accessions)) != NULL) {
1422 CUser_object::TData data;
1423 list<CTempString> accns;
1424 NStr::Split(mod->value, ",", accns, NStr::fSplit_MergeDelimiters);
1425 ITERATE (list<CTempString>, it, accns) {
1426 CRef<CUser_field> field(new CUser_field), subfield(new CUser_field);
1427 field->SetLabel().SetId(0);
1428 subfield->SetLabel().SetStr("accession");
1429 subfield->SetData().SetStr(CUtf8::AsUTF8(*it, eEncoding_UTF8));
1430 field->SetData().SetFields().push_back(subfield);
1431 data.push_back(field);
1432 }
1433
1434 if ( !data.empty() ) {
1435 s_PopulateUserObject(*tpa, "TpaAssembly", data);
1436 }
1437 }
1438 }
1439
1440
s_SetDBLinkDesc(CBioseq & bioseq)1441 static CRef<CSeqdesc> s_SetDBLinkDesc(CBioseq& bioseq)
1442 {
1443 CConstRef<CBioseq_set> pParentSet = bioseq.GetParentSet();
1444 CSeq_descr& descriptors = (pParentSet &&
1445 pParentSet->GetClass() == CBioseq_set::eClass_nuc_prot) ?
1446
1447 (const_cast<CBioseq_set&>(*pParentSet)).SetDescr() :
1448 bioseq.SetDescr();
1449
1450
1451 for (auto pDesc : descriptors.Set()) {
1452 if (pDesc->IsUser() && pDesc->GetUser().IsDBLink()) {
1453 return pDesc;
1454 }
1455 }
1456
1457 auto pDBLinkDesc = Ref(new CSeqdesc());
1458 pDBLinkDesc->SetUser().SetObjectType(CUser_object::eObjectType_DBLink);
1459 descriptors.Set().push_back(pDBLinkDesc);
1460 return pDBLinkDesc;
1461 }
1462
1463
s_SetDBLinkFieldVals(const string & label,const list<CTempString> & vals,CSeqdesc & dblink_desc)1464 static void s_SetDBLinkFieldVals(const string& label,
1465 const list<CTempString>& vals,
1466 CSeqdesc& dblink_desc)
1467 {
1468 if (vals.empty()) {
1469 return;
1470 }
1471
1472 auto& user_obj = dblink_desc.SetUser();
1473 CRef<CUser_field> pField;
1474 if (user_obj.IsSetData()) {
1475 for (auto pUserField : user_obj.SetData()) {
1476 if (pUserField->IsSetLabel() &&
1477 pUserField->GetLabel().IsStr() &&
1478 NStr::EqualNocase(pUserField->GetLabel().GetStr(), label)) {
1479 pField = pUserField;
1480 break;
1481 }
1482 }
1483 }
1484
1485 if (!pField) {
1486 pField = Ref(new CUser_field());
1487 pField->SetLabel().SetStr() = label;
1488 user_obj.SetData().push_back(pField);
1489 }
1490
1491 pField->SetData().SetStrs().clear(); // RW-518 - clear any preexisting entries
1492 for (const auto& val : vals) {
1493 pField->SetData().SetStrs().push_back(val);
1494 }
1495 pField->SetNum(pField->GetData().GetStrs().size());
1496 }
1497
1498
s_SetDBLinkField(const string & label,const string & vals,CRef<CSeqdesc> & pDBLinkDesc,CBioseq & bioseq)1499 static void s_SetDBLinkField(const string& label,
1500 const string& vals,
1501 CRef<CSeqdesc>& pDBLinkDesc,
1502 CBioseq& bioseq)
1503 {
1504 list<CTempString> value_list;
1505 NStr::Split(vals, ",", value_list, NStr::fSplit_MergeDelimiters);
1506 for (auto& val : value_list) {
1507 val = NStr::TruncateSpaces_Unsafe(val);
1508 }
1509 value_list.remove_if([](const CTempString& val){ return val.empty(); });
1510 if (value_list.empty()) { // nothing to do
1511 return;
1512 }
1513
1514 if (!pDBLinkDesc) {
1515 pDBLinkDesc = s_SetDBLinkDesc(bioseq);
1516 }
1517
1518 s_SetDBLinkFieldVals(label,
1519 value_list,
1520 *pDBLinkDesc);
1521 }
1522
1523
x_ApplyDBLinkMods(CBioseq & bioseq)1524 void CSourceModParser::x_ApplyDBLinkMods(CBioseq& bioseq)
1525 {
1526 CRef<CSeqdesc> pDBLinkDesc;
1527 const SMod* mod = NULL;
1528 if ((mod = FindMod(s_Mod_SRA)) != NULL) {
1529 s_SetDBLinkField("Sequence Read Archive", mod->value, pDBLinkDesc, bioseq);
1530 }
1531
1532 if ((mod = FindMod(s_Mod_bioproject)) != NULL) {
1533 s_SetDBLinkField("BioProject", mod->value, pDBLinkDesc, bioseq);
1534 }
1535
1536 if ((mod = FindMod(s_Mod_biosample)) != NULL) {
1537 s_SetDBLinkField("BioSample", mod->value, pDBLinkDesc, bioseq);
1538 }
1539 }
1540
1541
1542
1543 void
x_ApplyGenomeProjectsDBMods(CAutoInitRef<CUser_object> & gpdb)1544 CSourceModParser::x_ApplyGenomeProjectsDBMods(CAutoInitRef<CUser_object>& gpdb)
1545 {
1546 const SMod* mod = NULL;
1547
1548 // project[s]
1549 if ((mod = FindMod(s_Mod_project, s_Mod_projects)) != NULL) {
1550 CUser_object::TData data;
1551 list<CTempString> ids;
1552 NStr::Split(mod->value, ",;", ids, NStr::fSplit_MergeDelimiters);
1553 ITERATE (list<CTempString>, it, ids) {
1554 unsigned int id = NStr::StringToUInt(*it, NStr::fConvErr_NoThrow);
1555 if (id > 0) {
1556 CRef<CUser_field> field(new CUser_field),
1557 subfield(new CUser_field);
1558 field->SetLabel().SetId(0);
1559 subfield->SetLabel().SetStr("ProjectID");
1560 subfield->SetData().SetInt(id);
1561 field->SetData().SetFields().push_back(subfield);
1562 subfield.Reset(new CUser_field);
1563 subfield->SetLabel().SetStr("ParentID");
1564 subfield->SetData().SetInt(0);
1565 field->SetData().SetFields().push_back(subfield);
1566 data.push_back(field);
1567 }
1568 }
1569
1570 if ( !data.empty() ) {
1571 s_PopulateUserObject(*gpdb, "GenomeProjectsDB", data);
1572 }
1573 }
1574 }
1575
1576
1577 static
s_ApplyPubMods(CBioseq & bioseq,const CSourceModParser::TModsRange & range)1578 void s_ApplyPubMods(CBioseq& bioseq, const CSourceModParser::TModsRange& range)
1579 {
1580 for (CSourceModParser::TModsCI it = range.first;
1581 it != range.second; ++it) {
1582 TEntrezId pmid = NStr::StringToNumeric<TEntrezId>(it->value, NStr::fConvErr_NoThrow);
1583 CRef<CPub> pub(new CPub);
1584 pub->SetPmid().Set(pmid);
1585 CRef<CSeqdesc> pubdesc(new CSeqdesc);
1586 pubdesc->SetPub().SetPub().Set().push_back(pub);
1587 bioseq.SetDescr().Set().push_back(pubdesc);
1588 }
1589 }
1590
1591
ApplyPubMods(CBioseq & seq)1592 void CSourceModParser::ApplyPubMods(CBioseq& seq)
1593 {
1594 // find PubMed IDs
1595 s_ApplyPubMods(seq, FindAllMods(s_Mod_PubMed));
1596 s_ApplyPubMods(seq, FindAllMods(s_Mod_PMID));
1597 }
1598
CBadModError(const SMod & badMod,const string & sAllowedValues)1599 CSourceModParser::CBadModError::CBadModError(
1600 const SMod & badMod,
1601 const string & sAllowedValues )
1602 : runtime_error(x_CalculateErrorString(badMod, sAllowedValues)),
1603 m_BadMod(badMod), m_sAllowedValues(sAllowedValues)
1604 {
1605 // no further work required
1606 }
1607
// Compose the message for a rejected modifier value: the sequence ID (or
// "UNKNOWN" when the modifier has none), the key, the offending value and
// the accepted values.
string CSourceModParser::CBadModError::x_CalculateErrorString(
    const SMod & badMod,
    const string & sAllowedValues )
{
    const string seqid_label =
        badMod.seqid ? badMod.seqid->AsFastaString() : "UNKNOWN";

    stringstream message;
    message << "Bad modifier value at seqid '" << seqid_label;
    message << "'. '" << badMod.key;
    message << "' cannot have value '" << badMod.value;
    message << "'.  Accepted values are [" << sAllowedValues << "]";
    return message.str();
}
1619
CUnkModError(const SMod & unkMod)1620 CSourceModParser::CUnkModError::CUnkModError(
1621 const SMod& unkMod )
1622 : runtime_error(x_CalculateErrorString(unkMod)), m_UnkMod(unkMod)
1623 {
1624 }
1625
x_CalculateErrorString(const SMod & unkMod)1626 string CSourceModParser::CUnkModError::x_CalculateErrorString(
1627 const SMod& unkMod)
1628 {
1629 stringstream str_strm;
1630 str_strm << "Bad modifier key at seqid '"
1631 << ( unkMod.seqid ? unkMod.seqid->AsFastaString() : "UNKNOWN")
1632 << "'. '" << unkMod.key << "' is not a recognized modifier key";
1633 return str_strm.str();
1634 }
1635
1636
GetMods(TWhichMods which) const1637 CSourceModParser::TMods CSourceModParser::GetMods(TWhichMods which) const
1638 {
1639 if (which == fAllMods) {
1640 // if caller gave this they probably should prefer calling GetAllMods
1641 // to avoid the struct copy.
1642 return m_Mods;
1643 } else {
1644 TMods ret;
1645
1646 ITERATE (TMods, it, m_Mods) {
1647 if (which == (it->used ? fUsedMods : fUnusedMods)) {
1648 ret.insert(ret.end(), *it);
1649 }
1650 }
1651
1652 return ret;
1653 }
1654 }
1655
FindMod(const CTempString & key,const CTempString & alt_key)1656 const CSourceModParser::SMod* CSourceModParser::FindMod(
1657 //const SMod & smod, const SMod & alt_smod)
1658 const CTempString& key, const CTempString& alt_key)
1659 {
1660 // check against m_pModFilter, if any
1661 if( m_pModFilter ) {
1662 if( ! (*m_pModFilter)(key) || ! (*m_pModFilter)(alt_key) ) {
1663 return NULL;
1664 }
1665 }
1666
1667 SMod mod;
1668
1669 for (int tries = 0; tries < 2; ++tries) {
1670 const CTempString & modkey = ( tries == 0 ? key : alt_key );
1671 if( modkey.empty() ) {
1672 continue;
1673 }
1674 mod.key = modkey;
1675
1676 TModsCI it = m_Mods.lower_bound(mod);
1677 if (it != m_Mods.end() && EqualKeys(it->key, modkey)) {
1678 // set iterators are const since changing an object could affect
1679 // its order in the set. However, in this case we know that
1680 // changing the `used` field won't affect the order so we know
1681 // that a const_cast to change it is safe to do.
1682 const_cast<SMod&>(*it).used = true;
1683 return &*it;
1684 }
1685 }
1686
1687 return NULL;
1688 }
1689
1690
1691 CSourceModParser::TModsRange
FindAllMods(const CTempString & key)1692 CSourceModParser::FindAllMods(const CTempString& key)
1693 {
1694 SMod smod(key);
1695 return FindAllMods(smod);
1696 }
1697
1698 CSourceModParser::TModsRange
FindAllMods(const CTempString & key,const CTempString & alt_key)1699 CSourceModParser::FindAllMods(const CTempString& key, const CTempString& alt_key)
1700 {
1701 SMod smod(key);
1702 SMod alt_smod(alt_key);
1703 return FindAllMods(smod, alt_smod);
1704 }
1705
1706 CSourceModParser::TModsRange
FindAllMods(const SMod & smod,const SMod & alt_smod)1707 CSourceModParser::FindAllMods(const SMod & smod, const SMod & alt_smod)
1708 {
1709 TModsRange r;
1710 r.first = m_Mods.lower_bound(smod);
1711 if (r.first == m_Mods.end() || !EqualKeys(r.first->key, smod.key)) {
1712 r.first = m_Mods.lower_bound(alt_smod);
1713 }
1714 for (r.second = r.first;
1715 r.second != m_Mods.end() && (EqualKeys(r.second->key, smod.key) || EqualKeys(r.second->key, alt_smod.key));
1716 ++r.second)
1717 {
1718 // set iterators are const since changing an object could affect
1719 // its order in the set. However, in this case we know that
1720 // changing the `used` field won't affect the order so we know
1721 // that a const_cast to change it is safe to do.
1722 const_cast<SMod&>(*r.second).used = true;
1723 }
1724 return r;
1725 }
1726
1727
GetLabel(string * s,TWhichMods which) const1728 void CSourceModParser::GetLabel(string* s, TWhichMods which) const
1729 {
1730 // Possible (flag-conditional?) behavior changes:
1731 // - leave off spaces between modifiers
1732 // - sort by position rather than key
1733 _ASSERT(s != NULL);
1734
1735 string delim = s->empty() ? kEmptyStr : " ";
1736
1737 ITERATE (TMods, it, m_Mods) {
1738 if ((which & (it->used ? fUsedMods : fUnusedMods)) != 0) {
1739 *s += delim + '[' + it->key + '=' + it->value + ']';
1740 delim = " ";
1741 }
1742 }
1743 }
1744
1745 // static
1746 const set<string> &
GetModAllowedValues(const string & mod)1747 CSourceModParser::GetModAllowedValues(const string &mod)
1748 {
1749 // since this has a lock, do NOT grab any other locks
1750 // inside here.
1751 static CMutex mutex;
1752 CMutexGuard guard(mutex);
1753
1754 typedef map< string, set<string>, CSourceModParser::PKeyCompare> TMapModToValidValues;
1755 static TMapModToValidValues s_mapModToValidValues;
1756
1757 // see if value is already calculated to try to save time
1758 TMapModToValidValues::const_iterator find_iter =
1759 s_mapModToValidValues.find(mod);
1760 if( find_iter != s_mapModToValidValues.end() ) {
1761 return find_iter->second;
1762 }
1763
1764 // does canonical comparison, which goes a little beyond case-insensitivity
1765 PKeyEqual key_equal;
1766
1767 // not cached, so we need to calculate it ourselves
1768 set<string> & set_valid_values = s_mapModToValidValues[mod];
1769 if( key_equal(mod, "topology") || key_equal(mod, "top") ) {
1770 set_valid_values.insert("linear");
1771 set_valid_values.insert("circular");
1772 } else if( key_equal(mod, "molecule") || key_equal(mod, "mol") ) {
1773 set_valid_values.insert("rna");
1774 set_valid_values.insert("dna");
1775 } else if( key_equal(mod, "moltype") || key_equal(mod, "mol-type") ) {
1776 // construct the possible bad values by hand
1777 ITERATE( TBiomolMap, map_iter, sc_BiomolMap ) {
1778 if( map_iter->second.m_eShown == SMolTypeInfo::eShown_Yes ) {
1779 set_valid_values.insert(map_iter->first);
1780 }
1781 }
1782 } else if( key_equal(mod, "strand") ) {
1783 set_valid_values.insert("single");
1784 set_valid_values.insert("double");
1785 set_valid_values.insert("mixed");
1786 } else if( key_equal(mod, "location") ) {
1787 set_valid_values.insert("mitochondrial");
1788 set_valid_values.insert("provirus");
1789 set_valid_values.insert("extrachromosomal");
1790 set_valid_values.insert("insertion sequence");
1791 } else if( key_equal(mod, "origin") ) {
1792 set_valid_values.insert("natural mutant");
1793 set_valid_values.insert("mutant");
1794 ITERATE( CEnumeratedTypeValues::TValues, enum_iter, CBioSource::GetTypeInfo_enum_EOrigin()->GetValues() ) {
1795 set_valid_values.insert( enum_iter->first );
1796 }
1797 } else if( key_equal(mod, "tech") ) {
1798 ITERATE(TTechMap, tech_it, sc_TechMap) {
1799 set_valid_values.insert(tech_it->first);
1800 }
1801 } else if( key_equal(mod, "completeness") || key_equal(mod, "completedness") ) {
1802 ITERATE( TCompletenessMap, comp_it, sc_CompletenessMap ) {
1803 set_valid_values.insert(comp_it->first);
1804 }
1805 } else {
1806 set_valid_values.insert("ERROR TRYING TO DETERMINE ALLOWED VALUES");
1807 }
1808
1809 return set_valid_values;
1810 }
1811
1812 // static
1813 const string &
GetModAllowedValuesAsOneString(const string & mod)1814 CSourceModParser::GetModAllowedValuesAsOneString(const string &mod)
1815 {
1816 // do not grab any other locks while in here (except the lock in
1817 // GetModAllowedValues)
1818 static CMutex mutex;
1819 CMutexGuard guard(mutex);
1820
1821 typedef map<string, string> TMapModNameToStringOfAllAllowedValues;
1822 static TMapModNameToStringOfAllAllowedValues mapModNameToStringOfAllAllowedValues;
1823
1824 // see if we've already cached the value
1825 TMapModNameToStringOfAllAllowedValues::const_iterator find_iter =
1826 mapModNameToStringOfAllAllowedValues.find(mod);
1827 if( find_iter != mapModNameToStringOfAllAllowedValues.end() ) {
1828 return find_iter->second;
1829 }
1830
1831 // not loaded, so we need to calculate it
1832 string & sAllValuesAsOneString =
1833 mapModNameToStringOfAllAllowedValues[mod];
1834 const set<string> & setAllowedValues = GetModAllowedValues(mod);
1835 ITERATE( set<string>, value_it, setAllowedValues ) {
1836 if( ! sAllValuesAsOneString.empty() ) {
1837 sAllValuesAsOneString += ", ";
1838 }
1839 sAllValuesAsOneString += "'" + *value_it + "'";
1840 }
1841
1842 return sAllValuesAsOneString;
1843 }
1844
x_HandleBadModValue(const SMod & mod)1845 void CSourceModParser::x_HandleBadModValue(
1846 const SMod& mod)
1847 {
1848 m_BadMods.insert(mod);
1849
1850 if( eHandleBadMod_Ignore == m_HandleBadMod ) {
1851 return;
1852 }
1853
1854 const string & sAllAllowedValues = GetModAllowedValuesAsOneString(mod.key);
1855
1856 CBadModError badModError(mod, sAllAllowedValues);
1857
1858 switch( m_HandleBadMod ) {
1859 case eHandleBadMod_Throw:
1860 throw badModError;
1861 case eHandleBadMod_PrintToCerr:
1862 cerr << badModError.what() << endl;
1863 break;
1864 case eHandleBadMod_ErrorListener: {
1865 AutoPtr<CObjReaderLineException> pErr(
1866 CObjReaderLineException::Create(
1867 eDiag_Warning,
1868 m_LineNumber,
1869 badModError.what(),
1870 ILineError::eProblem_GeneralParsingError) );
1871 x_ProcessError(*pErr);
1872 break;
1873 }
1874 default:
1875 _TROUBLE;
1876 }
1877 }
1878
x_HandleUnkModValue(const SMod & mod)1879 void CSourceModParser::x_HandleUnkModValue(
1880 const SMod& mod)
1881 {
1882 if (m_HandleBadMod == eHandleBadMod_Ignore) {
1883 return;
1884 }
1885 if (m_pModFilter && !m_pModFilter->operator()(mod.key)) {
1886 return;
1887 }
1888 CUnkModError unkModError(mod);
1889
1890 switch( m_HandleBadMod ) {
1891 case eHandleBadMod_Throw:
1892 throw unkModError;
1893 case eHandleBadMod_PrintToCerr:
1894 cerr << unkModError.what() << endl;
1895 break;
1896 case eHandleBadMod_ErrorListener: {
1897 AutoPtr<CObjReaderLineException> pErr(
1898 CObjReaderLineException::Create(
1899 eDiag_Warning,
1900 m_LineNumber,
1901 unkModError.what(),
1902 ILineError::eProblem_GeneralParsingError) );
1903 x_ProcessError(*pErr);
1904 break;
1905 }
1906 default:
1907 _TROUBLE;
1908 }
1909 }
1910
x_ProcessError(CObjReaderLineException & err)1911 void CSourceModParser::x_ProcessError(
1912 CObjReaderLineException& err)
1913 {
1914 if (!m_pErrorListener) {
1915 err.Throw();
1916 }
1917 if (!m_pErrorListener->PutError(err)) {
1918 AutoPtr<CObjReaderLineException> pErr(
1919 CObjReaderLineException::Create(
1920 eDiag_Critical,
1921 0,
1922 "Error allowance exceeded",
1923 ILineError::eProblem_GeneralParsingError) );
1924 pErr->Throw();
1925 }
1926 }
1927
ApplyMods(CBioSource & bsrc,CTempString organism)1928 void CSourceModParser::ApplyMods(CBioSource& bsrc, CTempString organism)
1929 {
1930 CAutoInitDesc<CBioSource> ref(bsrc);
1931 x_ApplyMods(ref, organism);
1932 }
1933
1934
ApplyMods(CMolInfo & mi)1935 void CSourceModParser::ApplyMods(CMolInfo& mi)
1936 {
1937 CAutoInitDesc<CMolInfo> ref(mi);
1938 x_ApplyMods(ref);
1939 }
1940
1941
ApplyMods(CGB_block & gbb)1942 void CSourceModParser::ApplyMods(CGB_block& gbb)
1943 {
1944 CAutoInitDesc<CGB_block> ref(gbb);
1945 x_ApplyMods(ref);
1946 }
1947
SetAllUnused()1948 void CSourceModParser::SetAllUnused()
1949 {
1950 NON_CONST_ITERATE(TMods, it, m_Mods)
1951 {
1952 // set iterators are const since changing an object could affect
1953 // its order in the set. However, in this case we know that
1954 // changing the `used` field won't affect the order so we know
1955 // that a const_cast to change it is safe to do.
1956 const_cast<SMod&>(*it).used = false;
1957 }
1958 }
1959
AddMods(const CTempString & name,const CTempString & value)1960 void CSourceModParser::AddMods(const CTempString& name, const CTempString& value)
1961 {
1962 SMod newmod(NStr::TruncateSpaces_Unsafe(name));
1963 newmod.value = NStr::TruncateSpaces_Unsafe(value);
1964 newmod.used = false;
1965
1966 m_Mods.insert(newmod);
1967 }
1968
1969 END_SCOPE(objects)
1970 END_NCBI_SCOPE
1971