1  /*
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Jonathan Kans, Michael Kornbluh
27  *
28  * File Description:
29  *   Feature table reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbithr.hpp>
36 
37 #include <util/static_map.hpp>
38 
39 #include <serial/iterator.hpp>
40 #include <serial/objistrasn.hpp>
41 
42 // Objects includes
43 #include <objects/general/Int_fuzz.hpp>
44 #include <objects/general/Object_id.hpp>
45 #include <objects/general/User_object.hpp>
46 #include <objects/general/User_field.hpp>
47 #include <objects/general/Dbtag.hpp>
48 
49 #include <objects/seqloc/Seq_id.hpp>
50 #include <objects/seqloc/Seq_loc.hpp>
51 #include <objects/seqloc/Seq_interval.hpp>
52 #include <objects/seqloc/Seq_point.hpp>
53 
54 #include <objects/seq/Seq_annot.hpp>
55 #include <objects/seq/Annotdesc.hpp>
56 #include <objects/seq/Annot_descr.hpp>
57 #include <objects/pub/Pub.hpp>
58 #include <objects/pub/Pub_equiv.hpp>
59 #include <objects/seq/Pubdesc.hpp>
60 #include <objects/seqfeat/SeqFeatData.hpp>
61 #include <objects/seq/seq_loc_from_string.hpp>
62 
63 #include <objects/seqfeat/Seq_feat.hpp>
64 #include <objects/seqfeat/BioSource.hpp>
65 #include <objects/seqfeat/Org_ref.hpp>
66 #include <objects/seqfeat/OrgName.hpp>
67 #include <objects/seqfeat/SubSource.hpp>
68 #include <objects/seqfeat/OrgMod.hpp>
69 #include <objects/seqfeat/Gene_ref.hpp>
70 #include <objects/seqfeat/Cdregion.hpp>
71 #include <objects/seqfeat/Code_break.hpp>
72 #include <objects/seqfeat/Genetic_code.hpp>
73 #include <objects/seqfeat/Genetic_code_table.hpp>
74 #include <objects/seqfeat/RNA_ref.hpp>
75 #include <objects/seqfeat/Trna_ext.hpp>
76 #include <objects/seqfeat/RNA_gen.hpp>
77 #include <objects/seqfeat/RNA_qual_set.hpp>
78 #include <objects/seqfeat/RNA_qual.hpp>
79 #include <objects/seqfeat/Imp_feat.hpp>
80 #include <objects/seqfeat/Gb_qual.hpp>
81 
82 #include <objects/misc/sequence_macros.hpp>
83 
84 #include <objects/seqset/Bioseq_set.hpp>
85 #include <objects/seqset/Seq_entry.hpp>
86 
87 #include <objtools/readers/readfeat.hpp>
88 #include <objtools/readers/table_filter.hpp>
89 #include <objtools/error_codes.hpp>
90 
91 #include <algorithm>
92 #include <unordered_set>
93 
94 #include <objtools/readers/message_listener.hpp>
95 #include <objtools/readers/read_util.hpp>
96 #include "best_feat_finder.hpp"
97 
98 #define NCBI_USE_ERRCODE_X   Objtools_Rd_Feature
99 
100 
101 BEGIN_NCBI_SCOPE
102 
103 BEGIN_objects_SCOPE // namespace ncbi::objects::
104 
105 
106 
107 namespace {
108     static const char * const kCdsFeatName = "CDS";
109     // priorities, inherited from C toolkit
110     static Uchar std_order[CSeq_id::e_MaxChoice] = {
111         83,  /* 0 = not set */
112         80,  /* 1 = local Object-id */
113         70,  /* 2 = gibbsq */
114         70,  /* 3 = gibbmt */
115         70,  /* 4 = giim Giimport-id */
116         60,  /* 5 = genbank */
117         60,  /* 6 = embl */
118         60,  /* 7 = pir */
119         60,  /* 8 = swissprot */
120         81,  /* 9 = patent */
121         65,  /* 10 = other TextSeqId */
122         80,  /* 11 = general Dbtag */
123         82,  /* 12 = gi */
124         60,  /* 13 = ddbj */
125         60,  /* 14 = prf */
126         60,  /* 15 = pdb */
127         60,  /* 16 = tpg */
128         60,  /* 17 = tpe */
129         60,  /* 18 = tpd */
130         68,  /* 19 = gpp */
131         69   /* 20 = nat */
132     };
133 
GetBestId(const CBioseq::TId & ids)134 CRef<CSeq_id> GetBestId(const CBioseq::TId& ids)
135 {
136     if (ids.size() == 1)
137         return ids.front();
138 
139     CRef<CSeq_id> id;
140     if (!ids.empty())
141     {
142         Uchar best_weight = UCHAR_MAX;
143         ITERATE(CBioseq::TId, it, ids)
144         {
145             Uchar new_weight = std_order[(*it)->Which()];
146             if (new_weight < best_weight)
147             {
148                 id = *it;
149                 best_weight = new_weight;
150             }
151         };
152     }
153 
154     return id;
155 }
156 
157 
158 map<char, list<char>> s_IUPACmap
159 {
160     {'A', list<char>({'A'})},
161     {'G', list<char>({'G'})},
162     {'C', list<char>({'C'})},
163     {'T', list<char>({'T'})},
164     {'U', list<char>({'U'})},
165     {'M', list<char>({'A', 'C'})},
166     {'R', list<char>({'A', 'G'})},
167     {'W', list<char>({'A', 'T'})},
168     {'S', list<char>({'C', 'G'})},
169     {'Y', list<char>({'C', 'T'})},
170     {'K', list<char>({'G', 'T'})},
171     {'V', list<char>({'A', 'C', 'G'})},
172     {'H', list<char>({'A', 'C', 'T'})},
173     {'D', list<char>({'A', 'G', 'T'})},
174     {'B', list<char>({'C', 'G', 'T'})},
175     {'N', list<char>({'A', 'C', 'G', 'T'})}
176 };
177 
178 }
179 
180 
181 class /* NCBI_XOBJREAD_EXPORT */ CFeatureTableReader_Imp
182 {
183 public:
184     enum EQual {
185         eQual_allele,
186         eQual_anticodon,
187         eQual_bac_ends,
188         eQual_bond_type,
189         eQual_bound_moiety,
190         eQual_chrcnt,
191         eQual_citation,
192         eQual_clone,
193         eQual_clone_id,
194         eQual_codon_recognized,
195         eQual_codon_start,
196         eQual_compare,
197         eQual_cons_splice,
198         eQual_ctgcnt,
199         eQual_cyt_map,
200         eQual_db_xref,
201         eQual_direction,
202         eQual_EC_number,
203         eQual_estimated_length,
204         eQual_evidence,
205         eQual_exception,
206         eQual_experiment,
207         eQual_frequency,
208         eQual_function,
209         eQual_gap_type,
210         eQual_gen_map,
211         eQual_gene,
212         eQual_gene_desc,
213         eQual_gene_syn,
214         eQual_go_component,
215         eQual_go_function,
216         eQual_go_process,
217         eQual_heterogen,
218         eQual_inference,
219         eQual_insertion_seq,
220         eQual_label,
221         eQual_linkage_evidence,
222         eQual_loccnt,
223         eQual_locus_tag,
224         eQual_macronuclear,
225         eQual_map,
226         eQual_MEDLINE,
227         eQual_method,
228         eQual_mobile_element_type,
229         eQual_mod_base,
230         eQual_muid,
231         eQual_ncRNA_class,
232         eQual_nomenclature,
233         eQual_note,
234         eQual_number,
235         eQual_old_locus_tag,
236         eQual_operon,
237         eQual_organism,
238         eQual_partial,
239         eQual_PCR_conditions,
240         eQual_phenotype,
241         eQual_pmid,
242         eQual_product,
243         eQual_prot_desc,
244         eQual_prot_note,
245         eQual_protein_id,
246         eQual_pseudo,
247         eQual_pseudogene,
248         eQual_PubMed,
249         eQual_rad_map,
250         eQual_region_name,
251         eQual_regulatory_class,
252         eQual_replace,
253         eQual_ribosomal_slippage,
254         eQual_rpt_family,
255         eQual_rpt_type,
256         eQual_rpt_unit,
257         eQual_rpt_unit_range,
258         eQual_rpt_unit_seq,
259         eQual_satellite,
260         eQual_sec_str_type,
261         eQual_secondary_accession,
262         eQual_sequence,
263         eQual_site_type,
264         eQual_snp_class,
265         eQual_snp_gtype,
266         eQual_snp_het,
267         eQual_snp_het_se,
268         eQual_snp_linkout,
269         eQual_snp_maxrate,
270         eQual_snp_valid,
271         eQual_standard_name,
272         eQual_STS,
273         eQual_sts_aliases,
274         eQual_sts_dsegs,
275         eQual_tag_peptide,
276         eQual_trans_splicing,
277         eQual_transcript_id,
278         eQual_transcription,
279         eQual_transl_except,
280         eQual_transl_table,
281         eQual_translation,
282         eQual_transposon,
283         eQual_usedin,
284         eQual_weight
285     };
286 
287     enum EOrgRef {
288         eOrgRef_organism,
289         eOrgRef_organelle,
290         eOrgRef_div,
291         eOrgRef_lineage,
292         eOrgRef_gcode,
293         eOrgRef_mgcode
294     };
295 
296     using TFlags = CFeature_table_reader::TFlags;
297     using TFtable = CSeq_annot::C_Data::TFtable;
298 
299     // constructor
300     CFeatureTableReader_Imp(ILineReader* reader, unsigned int line_num, ILineErrorListener* pMessageListener);
301     // destructor
302     ~CFeatureTableReader_Imp(void);
303 
304     // read 5-column feature table and return Seq-annot
305     CRef<CSeq_annot> ReadSequinFeatureTable (const CTempString& seqid,
306                                              const CTempString& annotname,
307                                              const TFlags flags,
308                                              ITableFilter *filter);
309 
310     // create single feature from key
311     CRef<CSeq_feat> CreateSeqFeat (const string& feat,
312                                    CSeq_loc& location,
313                                    const TFlags flags,
314                                    const string &seq_id,
315                                    ITableFilter *filter);
316 
317     // add single qualifier to feature
318     void AddFeatQual (CRef<CSeq_feat> sfp,
319                       const string& feat_name,
320                       const string& qual,
321                       const string& val,
322                       const TFlags flags,
323                       const string &seq_id );
324 
325     static bool ParseInitialFeatureLine (
326         const CTempString& line_arg,
327         CTempStringEx& out_seqid,
328         CTempStringEx& out_annotname );
329 
330     static void PutProgress(const CTempString& seq_id,
331         const unsigned int line_number,
332         ILineErrorListener* pListener);
333 
GetLineReaderPtr(void)334     ILineReader* const GetLineReaderPtr(void)  {
335         return m_reader;
336     }
337 
GetErrorListenerPtr(void)338     ILineErrorListener* const GetErrorListenerPtr(void) {
339         return m_pMessageListener;
340     }
341 
342 private:
343 
344     // Prohibit copy constructor and assignment operator
345     CFeatureTableReader_Imp(const CFeatureTableReader_Imp& value);
346     CFeatureTableReader_Imp& operator=(const CFeatureTableReader_Imp& value);
347 
348     void x_InitId(const CTempString& seq_id, const TFlags flags);
349     // returns true if parsed (otherwise, out_offset is left unchanged)
350     bool x_TryToParseOffset(const CTempString & sLine, Int4 & out_offset );
351 
352 
353     struct SFeatLocInfo {
354         Int4 start_pos;
355         Int4 stop_pos;
356         bool is_5p_partial;
357         bool is_3p_partial;
358         bool is_point;
359         bool is_minus_strand;
360     };
361 
362 
363     bool x_ParseFeatureTableLine(
364             const CTempString& line,
365             SFeatLocInfo& loc_info,
366             string& feat,
367             string& qual,
368             string& val,
369             Int4 offset);
370 
371 
372     bool x_IsWebComment(CTempString line);
373 
374     bool x_AddIntervalToFeature (
375             CTempString strFeatureName,
376             CRef<CSeq_feat>& sfp,
377             const SFeatLocInfo& loc_info);
378 
379     bool x_AddQualifierToFeature (CRef<CSeq_feat> sfp,
380         const string &feat_name,
381         const string& qual, const string& val,
382         const TFlags flags);
383 
384     void x_ProcessQualifier(const string& qual_name,
385                             const string& qual_val,
386                             const string& feat_name,
387                             CRef<CSeq_feat> feat,
388                             TFlags flags);
389 
390     bool x_AddQualifierToGene     (CSeqFeatData& sfdata,
391                                    EQual qtype, const string& val);
392     bool x_AddQualifierToCdregion (CRef<CSeq_feat> sfp, CSeqFeatData& sfdata,
393                                    EQual qtype, const string& val);
394     bool x_AddQualifierToRna      (CRef<CSeq_feat> sfp,
395                                    EQual qtype, const string& val);
396     bool x_AddQualifierToImp      (CRef<CSeq_feat> sfp, CSeqFeatData& sfdata,
397                                    EQual qtype, const string& qual, const string& val);
398     bool x_AddQualifierToBioSrc   (CSeqFeatData& sfdata,
399                                    const string &feat_name,
400                                    EOrgRef rtype, const string& val);
401     bool x_AddQualifierToBioSrc   (CSeqFeatData& sfdata,
402                                    CSubSource::ESubtype stype, const string& val);
403     bool x_AddQualifierToBioSrc   (CSeqFeatData& sfdata,
404                                    COrgMod::ESubtype mtype, const string& val);
405 
406     bool x_AddNoteToFeature(CRef<CSeq_feat> sfp, const string& note);
407 
408     bool x_AddNoteToFeature(CRef<CSeq_feat> sfp,
409             const string& feat_name,
410             const string& qual,
411             const string& val);
412 
413     bool x_AddGBQualToFeature    (CRef<CSeq_feat> sfp,
414                                   const string& qual, const string& val);
415 
416     bool x_AddCodons(const string& val, CTrna_ext& trna_ext) const;
417 
418     typedef CConstRef<CSeq_feat> TFeatConstRef;
419     struct SFeatAndLineNum {
SFeatAndLineNumCFeatureTableReader_Imp::SFeatAndLineNum420         SFeatAndLineNum(
421             TFeatConstRef pFeat,
422             TSeqPos       uLineNum ) :
423         m_pFeat(pFeat), m_uLineNum(uLineNum) {
424             _ASSERT(pFeat);
425         }
426 
operator ==CFeatureTableReader_Imp::SFeatAndLineNum427         bool operator==(const SFeatAndLineNum & rhs) const {
428             return Compare(rhs) == 0; }
operator !=CFeatureTableReader_Imp::SFeatAndLineNum429         bool operator!=(const SFeatAndLineNum & rhs) const {
430             return Compare(rhs) != 0; }
operator <CFeatureTableReader_Imp::SFeatAndLineNum431         bool operator<(const SFeatAndLineNum & rhs) const {
432             return Compare(rhs) < 0; }
433 
CompareCFeatureTableReader_Imp::SFeatAndLineNum434         int Compare(const SFeatAndLineNum & rhs) const {
435             if( m_uLineNum != rhs.m_uLineNum ) {
436                 return ( m_uLineNum < rhs.m_uLineNum ? -1 : 1 );
437             }
438             return (m_pFeat.GetPointerOrNull() < rhs.m_pFeat.GetPointerOrNull() ? -1 : 1 );
439         }
440 
441         TFeatConstRef m_pFeat; // must be non-NULL
442         TSeqPos       m_uLineNum; // the line where this feature was created (or zero if programmatically created)
443     };
444     typedef multimap<CSeqFeatData::E_Choice, SFeatAndLineNum> TChoiceToFeatMap;
445     void x_CreateGenesFromCDSs(
446         CRef<CSeq_annot> sap,
447         TChoiceToFeatMap & choiceToFeatMap, // an input param, but might get more items added
448         const TFlags flags);
449 
450     bool x_StringIsJustQuotes (const string& str);
451 
452     string x_TrnaToAaString(const string& val);
453 
454     bool x_ParseTrnaExtString(CTrna_ext & ext_trna, const string & str);
455     SIZE_TYPE x_MatchingParenPos( const string &str, SIZE_TYPE open_paren_pos );
456 
457     long x_StringToLongNoThrow (
458         CTempString strToConvert,
459         CTempString strFeatureName,
460         CTempString strQualifierName,
461         // user can override the default problem types that are set on error
462         ILineError::EProblem eProblem = ILineError::eProblem_Unset
463     );
464 
465     bool x_SetupSeqFeat (CRef<CSeq_feat> sfp, const string& feat,
466                          const TFlags flags,
467                          ITableFilter *filter);
468 
469     void  x_ProcessMsg (
470         ILineError::EProblem eProblem,
471         EDiagSev eSeverity,
472         const std::string & strFeatureName = kEmptyStr,
473         const std::string & strQualifierName = kEmptyStr,
474         const std::string & strQualifierValue = kEmptyStr,
475         const std::string & strErrorMessage = kEmptyStr,
476         const ILineError::TVecOfLines & vecOfOtherLines =
477             ILineError::TVecOfLines() );
478 
479     void  x_ProcessMsg(
480         int line_num,
481         ILineError::EProblem eProblem,
482         EDiagSev eSeverity,
483         const std::string & strFeatureName = kEmptyStr,
484         const std::string & strQualifierName = kEmptyStr,
485         const std::string & strQualifierValue = kEmptyStr,
486         const std::string & strErrorMessage = kEmptyStr,
487         const ILineError::TVecOfLines & vecOfOtherLines =
488         ILineError::TVecOfLines());
489 
490     void x_TokenizeStrict( const CTempString &line, vector<string> &out_tokens );
491     void x_TokenizeLenient( const CTempString &line, vector<string> &out_tokens );
492     void x_FinishFeature(CRef<CSeq_feat>& feat, TFtable& ftable);
493     void x_ResetFeat(CRef<CSeq_feat>& feat, bool & curr_feat_intervals_done);
494     void x_UpdatePointStrand(CSeq_feat& feat, CSeq_interval::TStrand strand) const;
495     void x_GetPointStrand(const CSeq_feat& feat, CSeq_interval::TStrand& strand) const;
496 
497     bool m_need_check_strand;
498     string m_real_seqid;
499     CRef<CSeq_id> m_seq_id;
500     ILineReader* m_reader;
501     unsigned int m_LineNumber;
502     ILineErrorListener* m_pMessageListener;
503     unordered_set<string> m_ProcessedTranscriptIds;
504     unordered_set<string> m_ProcessedProteinIds;
505 };
506 
507 
508 typedef SStaticPair<const char *, CFeatureTableReader_Imp::EQual> TQualKey;
509 
510 static const TQualKey qual_key_to_subtype [] = {
511     {  "EC_number",            CFeatureTableReader_Imp::eQual_EC_number             },
512     {  "PCR_conditions",       CFeatureTableReader_Imp::eQual_PCR_conditions        },
513     {  "PubMed",               CFeatureTableReader_Imp::eQual_PubMed                },
514     {  "STS",                  CFeatureTableReader_Imp::eQual_STS                   },
515     {  "allele",               CFeatureTableReader_Imp::eQual_allele                },
516     {  "anticodon",            CFeatureTableReader_Imp::eQual_anticodon             },
517     {  "bac_ends",             CFeatureTableReader_Imp::eQual_bac_ends              },
518     {  "bond_type",            CFeatureTableReader_Imp::eQual_bond_type             },
519     {  "bound_moiety",         CFeatureTableReader_Imp::eQual_bound_moiety          },
520     {  "chrcnt",               CFeatureTableReader_Imp::eQual_chrcnt                },
521     {  "citation",             CFeatureTableReader_Imp::eQual_citation              },
522     {  "clone",                CFeatureTableReader_Imp::eQual_clone                 },
523     {  "clone_id",             CFeatureTableReader_Imp::eQual_clone_id              },
524     {  "codon_recognized",     CFeatureTableReader_Imp::eQual_codon_recognized      },
525     {  "codon_start",          CFeatureTableReader_Imp::eQual_codon_start           },
526     {  "codons_recognized",    CFeatureTableReader_Imp::eQual_codon_recognized      },
527     {  "compare",              CFeatureTableReader_Imp::eQual_compare               },
528     {  "cons_splice",          CFeatureTableReader_Imp::eQual_cons_splice           },
529     {  "ctgcnt",               CFeatureTableReader_Imp::eQual_ctgcnt                },
530     {  "cyt_map",              CFeatureTableReader_Imp::eQual_cyt_map               },
531     {  "db_xref",              CFeatureTableReader_Imp::eQual_db_xref               },
532     {  "direction",            CFeatureTableReader_Imp::eQual_direction             },
533     {  "estimated_length",     CFeatureTableReader_Imp::eQual_estimated_length      },
534     {  "evidence",             CFeatureTableReader_Imp::eQual_evidence              },
535     {  "exception",            CFeatureTableReader_Imp::eQual_exception             },
536     {  "experiment",           CFeatureTableReader_Imp::eQual_experiment            },
537     {  "frequency",            CFeatureTableReader_Imp::eQual_frequency             },
538     {  "function",             CFeatureTableReader_Imp::eQual_function              },
539     {  "gap_type",             CFeatureTableReader_Imp::eQual_gap_type              },
540     {  "gen_map",              CFeatureTableReader_Imp::eQual_gen_map               },
541     {  "gene",                 CFeatureTableReader_Imp::eQual_gene                  },
542     {  "gene_desc",            CFeatureTableReader_Imp::eQual_gene_desc             },
543     {  "gene_syn",             CFeatureTableReader_Imp::eQual_gene_syn              },
544     {  "gene_synonym",         CFeatureTableReader_Imp::eQual_gene_syn              },
545     {  "go_component",         CFeatureTableReader_Imp::eQual_go_component          },
546     {  "go_function",          CFeatureTableReader_Imp::eQual_go_function           },
547     {  "go_process",           CFeatureTableReader_Imp::eQual_go_process            },
548     {  "heterogen",            CFeatureTableReader_Imp::eQual_heterogen             },
549     {  "inference",            CFeatureTableReader_Imp::eQual_inference             },
550     {  "insertion_seq",        CFeatureTableReader_Imp::eQual_insertion_seq         },
551     {  "label",                CFeatureTableReader_Imp::eQual_label                 },
552     {  "linkage_evidence",     CFeatureTableReader_Imp::eQual_linkage_evidence      },
553     {  "loccnt",               CFeatureTableReader_Imp::eQual_loccnt                },
554     {  "locus_tag",            CFeatureTableReader_Imp::eQual_locus_tag             },
555     {  "macronuclear",         CFeatureTableReader_Imp::eQual_macronuclear          },
556     {  "map",                  CFeatureTableReader_Imp::eQual_map                   },
557     {  "method",               CFeatureTableReader_Imp::eQual_method                },
558     {  "mobile_element_type",  CFeatureTableReader_Imp::eQual_mobile_element_type   },
559     {  "mod_base",             CFeatureTableReader_Imp::eQual_mod_base              },
560     {  "ncRNA_class",          CFeatureTableReader_Imp::eQual_ncRNA_class           },
561     {  "nomenclature",         CFeatureTableReader_Imp::eQual_nomenclature          },
562     {  "note",                 CFeatureTableReader_Imp::eQual_note                  },
563     {  "number",               CFeatureTableReader_Imp::eQual_number                },
564     {  "old_locus_tag",        CFeatureTableReader_Imp::eQual_old_locus_tag         },
565     {  "operon",               CFeatureTableReader_Imp::eQual_operon                },
566     {  "organism",             CFeatureTableReader_Imp::eQual_organism              },
567     {  "partial",              CFeatureTableReader_Imp::eQual_partial               },
568     {  "phenotype",            CFeatureTableReader_Imp::eQual_phenotype             },
569     {  "product",              CFeatureTableReader_Imp::eQual_product               },
570     {  "prot_desc",            CFeatureTableReader_Imp::eQual_prot_desc             },
571     {  "prot_note",            CFeatureTableReader_Imp::eQual_prot_note             },
572     {  "protein_id",           CFeatureTableReader_Imp::eQual_protein_id            },
573     {  "pseudo",               CFeatureTableReader_Imp::eQual_pseudo                },
574     {  "pseudogene",           CFeatureTableReader_Imp::eQual_pseudogene            },
575     {  "rad_map",              CFeatureTableReader_Imp::eQual_rad_map               },
576     {  "region_name",          CFeatureTableReader_Imp::eQual_region_name           },
577     {  "regulatory_class",     CFeatureTableReader_Imp::eQual_regulatory_class      },
578     {  "replace",              CFeatureTableReader_Imp::eQual_replace               },
579     {  "ribosomal_slippage",   CFeatureTableReader_Imp::eQual_ribosomal_slippage    },
580     {  "rpt_family",           CFeatureTableReader_Imp::eQual_rpt_family            },
581     {  "rpt_type",             CFeatureTableReader_Imp::eQual_rpt_type              },
582     {  "rpt_unit",             CFeatureTableReader_Imp::eQual_rpt_unit              },
583     {  "rpt_unit_range",       CFeatureTableReader_Imp::eQual_rpt_unit_range        },
584     {  "rpt_unit_seq",         CFeatureTableReader_Imp::eQual_rpt_unit_seq          },
585     {  "satellite",            CFeatureTableReader_Imp::eQual_satellite             },
586     {  "sec_str_type",         CFeatureTableReader_Imp::eQual_sec_str_type          },
587     {  "secondary_accession",  CFeatureTableReader_Imp::eQual_secondary_accession   },
588     {  "secondary_accessions", CFeatureTableReader_Imp::eQual_secondary_accession   },
589     {  "sequence",             CFeatureTableReader_Imp::eQual_sequence              },
590     {  "site_type",            CFeatureTableReader_Imp::eQual_site_type             },
591     {  "snp_class",            CFeatureTableReader_Imp::eQual_snp_class             },
592     {  "snp_gtype",            CFeatureTableReader_Imp::eQual_snp_gtype             },
593     {  "snp_het",              CFeatureTableReader_Imp::eQual_snp_het               },
594     {  "snp_het_se",           CFeatureTableReader_Imp::eQual_snp_het_se            },
595     {  "snp_linkout",          CFeatureTableReader_Imp::eQual_snp_linkout           },
596     {  "snp_maxrate",          CFeatureTableReader_Imp::eQual_snp_maxrate           },
597     {  "snp_valid",            CFeatureTableReader_Imp::eQual_snp_valid             },
598     {  "standard_name",        CFeatureTableReader_Imp::eQual_standard_name         },
599     {  "sts_aliases",          CFeatureTableReader_Imp::eQual_sts_aliases           },
600     {  "sts_dsegs",            CFeatureTableReader_Imp::eQual_sts_dsegs             },
601     {  "tag_peptide",          CFeatureTableReader_Imp::eQual_tag_peptide           },
602     {  "trans_splicing",       CFeatureTableReader_Imp::eQual_trans_splicing        },
603     {  "transcript_id",        CFeatureTableReader_Imp::eQual_transcript_id         },
604     {  "transcription",        CFeatureTableReader_Imp::eQual_transcription         },
605     {  "transl_except",        CFeatureTableReader_Imp::eQual_transl_except         },
606     {  "transl_table",         CFeatureTableReader_Imp::eQual_transl_table          },
607     {  "translation",          CFeatureTableReader_Imp::eQual_translation           },
608     {  "transposon",           CFeatureTableReader_Imp::eQual_transposon            },
609     {  "usedin",               CFeatureTableReader_Imp::eQual_usedin                },
610     {  "weight",               CFeatureTableReader_Imp::eQual_weight                }
611 };
612 
613 typedef CStaticPairArrayMap <const char*, CFeatureTableReader_Imp::EQual, PCase_CStr> TQualMap;
614 DEFINE_STATIC_ARRAY_MAP(TQualMap, sm_QualKeys, qual_key_to_subtype);
615 
616 
617 typedef SStaticPair<const char *, CFeatureTableReader_Imp::EOrgRef> TOrgRefKey;
618 
619 static const TOrgRefKey orgref_key_to_subtype [] = {
620     {  "div",        CFeatureTableReader_Imp::eOrgRef_div        },
621     {  "gcode",      CFeatureTableReader_Imp::eOrgRef_gcode      },
622     {  "lineage",    CFeatureTableReader_Imp::eOrgRef_lineage    },
623     {  "mgcode",     CFeatureTableReader_Imp::eOrgRef_mgcode     },
624     {  "organelle",  CFeatureTableReader_Imp::eOrgRef_organelle  },
625     {  "organism",   CFeatureTableReader_Imp::eOrgRef_organism   }
626 };
627 
628 typedef CStaticPairArrayMap <const char*, CFeatureTableReader_Imp::EOrgRef, PCase_CStr> TOrgRefMap;
629 DEFINE_STATIC_ARRAY_MAP(TOrgRefMap, sm_OrgRefKeys, orgref_key_to_subtype);
630 
631 
632 typedef SStaticPair<const char *, CBioSource::EGenome> TGenomeKey;
633 
634 static const TGenomeKey genome_key_to_subtype [] = {
635     {  "apicoplast",                CBioSource::eGenome_apicoplast        },
636     {  "chloroplast",               CBioSource::eGenome_chloroplast       },
637     {  "chromatophore",             CBioSource::eGenome_chromatophore     },
638     {  "chromoplast",               CBioSource::eGenome_chromoplast       },
639     {  "chromosome",                CBioSource::eGenome_chromosome        },
640     {  "cyanelle",                  CBioSource::eGenome_cyanelle          },
641     {  "endogenous_virus",          CBioSource::eGenome_endogenous_virus  },
642     {  "extrachrom",                CBioSource::eGenome_extrachrom        },
643     {  "genomic",                   CBioSource::eGenome_genomic           },
644     {  "hydrogenosome",             CBioSource::eGenome_hydrogenosome     },
645     {  "insertion_seq",             CBioSource::eGenome_insertion_seq     },
646     {  "kinetoplast",               CBioSource::eGenome_kinetoplast       },
647     {  "leucoplast",                CBioSource::eGenome_leucoplast        },
648     {  "macronuclear",              CBioSource::eGenome_macronuclear      },
649     {  "mitochondrion",             CBioSource::eGenome_mitochondrion     },
650     {  "mitochondrion:kinetoplast", CBioSource::eGenome_kinetoplast       },
651     {  "nucleomorph",               CBioSource::eGenome_nucleomorph       },
652     {  "plasmid",                   CBioSource::eGenome_plasmid           },
653     {  "plastid",                   CBioSource::eGenome_plastid           },
654     {  "plastid:apicoplast",        CBioSource::eGenome_apicoplast        },
655     {  "plastid:chloroplast",       CBioSource::eGenome_chloroplast       },
656     {  "plastid:chromoplast",       CBioSource::eGenome_chromoplast       },
657     {  "plastid:cyanelle",          CBioSource::eGenome_cyanelle          },
658     {  "plastid:leucoplast",        CBioSource::eGenome_leucoplast        },
659     {  "plastid:proplastid",        CBioSource::eGenome_proplastid        },
660     {  "proplastid",                CBioSource::eGenome_proplastid        },
661     {  "proviral",                  CBioSource::eGenome_proviral          },
662     {  "transposon",                CBioSource::eGenome_transposon        },
663     {  "unknown",                   CBioSource::eGenome_unknown           },
664     {  "virion",                    CBioSource::eGenome_virion            }
665 };
666 
667 typedef CStaticPairArrayMap <const char*, CBioSource::EGenome, PCase_CStr> TGenomeMap;
668 DEFINE_STATIC_ARRAY_MAP(TGenomeMap, sm_GenomeKeys, genome_key_to_subtype);
669 
670 
671 typedef SStaticPair<const char *, CSubSource::ESubtype> TSubSrcKey;
672 
673 static const TSubSrcKey subsrc_key_to_subtype [] = {
674     {  "altitude",             CSubSource::eSubtype_altitude               },
675     {  "cell_line",            CSubSource::eSubtype_cell_line              },
676     {  "cell_type",            CSubSource::eSubtype_cell_type              },
677     {  "chromosome",           CSubSource::eSubtype_chromosome             },
678     {  "clone",                CSubSource::eSubtype_clone                  },
679     {  "clone_lib",            CSubSource::eSubtype_clone_lib              },
680     {  "collected_by",         CSubSource::eSubtype_collected_by           },
681     {  "collection_date",      CSubSource::eSubtype_collection_date        },
682     {  "country",              CSubSource::eSubtype_country                },
683     {  "dev_stage",            CSubSource::eSubtype_dev_stage              },
684     {  "endogenous_virus",     CSubSource::eSubtype_endogenous_virus_name  },
685     {  "environmental_sample", CSubSource::eSubtype_environmental_sample   },
686     {  "frequency",            CSubSource::eSubtype_frequency              },
687     {  "fwd_primer_name",      CSubSource::eSubtype_fwd_primer_name        },
688     {  "fwd_primer_seq",       CSubSource::eSubtype_fwd_primer_seq         },
689     {  "genotype",             CSubSource::eSubtype_genotype               },
690     {  "germline",             CSubSource::eSubtype_germline               },
691     {  "haplotype",            CSubSource::eSubtype_haplotype              },
692     {  "identified_by",        CSubSource::eSubtype_identified_by          },
693     {  "insertion_seq",        CSubSource::eSubtype_insertion_seq_name     },
694     {  "isolation_source",     CSubSource::eSubtype_isolation_source       },
695     {  "lab_host",             CSubSource::eSubtype_lab_host               },
696     {  "lat_lon",              CSubSource::eSubtype_lat_lon                },
697     {  "map",                  CSubSource::eSubtype_map                    },
698     {  "metagenomic",          CSubSource::eSubtype_metagenomic            },
699     {  "plasmid",              CSubSource::eSubtype_plasmid_name           },
700     {  "plastid",              CSubSource::eSubtype_plastid_name           },
701     {  "pop_variant",          CSubSource::eSubtype_pop_variant            },
702     {  "rearranged",           CSubSource::eSubtype_rearranged             },
703     {  "rev_primer_name",      CSubSource::eSubtype_rev_primer_name        },
704     {  "rev_primer_seq",       CSubSource::eSubtype_rev_primer_seq         },
705     {  "segment",              CSubSource::eSubtype_segment                },
706     {  "sex",                  CSubSource::eSubtype_sex                    },
707     {  "subclone",             CSubSource::eSubtype_subclone               },
708     {  "tissue_lib ",          CSubSource::eSubtype_tissue_lib             },
709     {  "tissue_type",          CSubSource::eSubtype_tissue_type            },
710     {  "transgenic",           CSubSource::eSubtype_transgenic             },
711     {  "transposon",           CSubSource::eSubtype_transposon_name        }
712 };
713 
714 typedef CStaticPairArrayMap <const char*, CSubSource::ESubtype, PCase_CStr> TSubSrcMap;
715 DEFINE_STATIC_ARRAY_MAP(TSubSrcMap, sm_SubSrcKeys, subsrc_key_to_subtype);
716 
717 // case-insensitive version of sm_SubSrcKeys
718 typedef CStaticPairArrayMap <const char*, CSubSource::ESubtype, PNocase_CStr> TSubSrcNoCaseMap;
719 DEFINE_STATIC_ARRAY_MAP(
720     TSubSrcNoCaseMap, sm_SubSrcNoCaseKeys, subsrc_key_to_subtype);
721 
// Pairs a source-qualifier key string with the COrgMod subtype it denotes.
typedef SStaticPair<const char *, COrgMod::ESubtype> TOrgModKey;

// Lookup table from qualifier key to COrgMod subtype; backs sm_OrgModKeys.
// Several keys are aliases of one subtype: nat_host, natural_host, spec_host
// and specific_host all map to eSubtype_nat_host.
// NOTE(review): entries are in ascending case-sensitive key order, which the
// static array map presumably relies on -- confirm before inserting new keys.
static const TOrgModKey orgmod_key_to_subtype [] = {
    {  "acronym",            COrgMod::eSubtype_acronym             },
    {  "anamorph",           COrgMod::eSubtype_anamorph            },
    {  "authority",          COrgMod::eSubtype_authority           },
    {  "bio_material",       COrgMod::eSubtype_bio_material        },
    {  "biotype",            COrgMod::eSubtype_biotype             },
    {  "biovar",             COrgMod::eSubtype_biovar              },
    {  "breed",              COrgMod::eSubtype_breed               },
    {  "chemovar",           COrgMod::eSubtype_chemovar            },
    {  "common",             COrgMod::eSubtype_common              },
    {  "cultivar",           COrgMod::eSubtype_cultivar            },
    {  "culture_collection", COrgMod::eSubtype_culture_collection  },
    {  "dosage",             COrgMod::eSubtype_dosage              },
    {  "ecotype",            COrgMod::eSubtype_ecotype             },
    {  "forma",              COrgMod::eSubtype_forma               },
    {  "forma_specialis",    COrgMod::eSubtype_forma_specialis     },
    {  "gb_acronym",         COrgMod::eSubtype_gb_acronym          },
    {  "gb_anamorph",        COrgMod::eSubtype_gb_anamorph         },
    {  "gb_synonym",         COrgMod::eSubtype_gb_synonym          },
    {  "group",              COrgMod::eSubtype_group               },
    {  "isolate",            COrgMod::eSubtype_isolate             },
    {  "metagenome_source",  COrgMod::eSubtype_metagenome_source   },
    {  "nat_host",           COrgMod::eSubtype_nat_host            },
    {  "natural_host",       COrgMod::eSubtype_nat_host            },
    {  "old_lineage",        COrgMod::eSubtype_old_lineage         },
    {  "old_name",           COrgMod::eSubtype_old_name            },
    {  "pathovar",           COrgMod::eSubtype_pathovar            },
    {  "serogroup",          COrgMod::eSubtype_serogroup           },
    {  "serotype",           COrgMod::eSubtype_serotype            },
    {  "serovar",            COrgMod::eSubtype_serovar             },
    {  "spec_host",          COrgMod::eSubtype_nat_host            },
    {  "specific_host",      COrgMod::eSubtype_nat_host            },
    {  "specimen_voucher",   COrgMod::eSubtype_specimen_voucher    },
    {  "strain",             COrgMod::eSubtype_strain              },
    {  "sub_species",        COrgMod::eSubtype_sub_species         },
    {  "subgroup",           COrgMod::eSubtype_subgroup            },
    {  "substrain",          COrgMod::eSubtype_substrain           },
    {  "subtype",            COrgMod::eSubtype_subtype             },
    {  "synonym",            COrgMod::eSubtype_synonym             },
    {  "teleomorph",         COrgMod::eSubtype_teleomorph          },
    {  "type",               COrgMod::eSubtype_type                },
    {  "type_material",      COrgMod::eSubtype_type_material       },
    {  "variety",            COrgMod::eSubtype_variety             }
};
768 
769 typedef CStaticPairArrayMap <const char*, COrgMod::ESubtype, PCase_CStr> TOrgModMap;
770 DEFINE_STATIC_ARRAY_MAP(TOrgModMap, sm_OrgModKeys, orgmod_key_to_subtype);
771 
// Maps an amino-acid name (three-letter abbreviation, full name, or one of a
// few special spellings such as "fMet" / "TERM") to a one-letter code stored
// as an int.  Keys are compared with PNocase_CStr, i.e. case-insensitively.
// Ambiguity codes are included: 'B' (Asp or Asn), 'Z' (Glu or Gln),
// 'J' (Leu or Ile), 'X' (other / undetermined) and '*' (termination).
static const map<const char*, int, PNocase_CStr> sm_TrnaKeys
{
    {  "Ala",            'A'  },
    {  "Alanine",        'A'  },
    {  "Arg",            'R'  },
    {  "Arginine",       'R'  },
    {  "Asn",            'N'  },
    {  "Asp",            'D'  },
    {  "Asp or Asn",     'B'  },
    {  "Asparagine",     'N'  },
    {  "Aspartate",      'D'  },
    {  "Aspartic Acid",  'D'  },
    {  "Asx",            'B'  },
    {  "Cys",            'C'  },
    {  "Cysteine",       'C'  },
    {  "Gln",            'Q'  },
    {  "Glu",            'E'  },
    {  "Glu or Gln",     'Z'  },
    {  "Glutamate",      'E'  },
    {  "Glutamic Acid",  'E'  },
    {  "Glutamine",      'Q'  },
    {  "Glx",            'Z'  },
    {  "Gly",            'G'  },
    {  "Glycine",        'G'  },
    {  "His",            'H'  },
    {  "Histidine",      'H'  },
    {  "Ile",            'I'  },
    {  "Ile2",           'I'  },
    {  "Isoleucine",     'I'  },
    {  "Leu",            'L'  },
    {  "Leu or Ile",     'J'  },
    {  "Leucine",        'L'  },
    {  "Lys",            'K'  },
    {  "Lysine",         'K'  },
    {  "Met",            'M'  },
    {  "Methionine",     'M'  },
    {  "OTHER",          'X'  },
    {  "Phe",            'F'  },
    {  "Phenylalanine",  'F'  },
    {  "Pro",            'P'  },
    {  "Proline",        'P'  },
    {  "Pyl",            'O'  },
    {  "Pyrrolysine",    'O'  },
    {  "Sec",            'U'  },
    {  "Selenocysteine", 'U'  },
    {  "Ser",            'S'  },
    {  "Serine",         'S'  },
    {  "TERM",           '*'  },
    {  "Ter",            '*'  },
    {  "Termination",    '*'  },
    {  "Thr",            'T'  },
    {  "Threonine",      'T'  },
    {  "Trp",            'W'  },
    {  "Tryptophan",     'W'  },
    {  "Tyr",            'Y'  },
    {  "Tyrosine",       'Y'  },
    {  "Val",            'V'  },
    {  "Valine",         'V'  },
    {  "Xle",            'J'  },
    {  "Xxx",            'X'  },
    {  "Undet",          'X'  },
    {  "fMet",           'M'  },
    {  "iMet",           'M'  }
};
836 
837 
// Case-sensitive set of qualifier keys.
// NOTE(review): judging by the name these are presumably qualifiers that may
// appear on their own, without a value -- confirm against the code that
// consults this set.
static
set<const char*, PCase_CStr>
sc_SingleKeys {
    "environmental_sample",
    "germline",
    "metagenomic",
    "partial",
    "pseudo",
    "rearranged",
    "ribosomal_slippage",
    "trans_splicing",
    "transgenic",
    "replace" // RW-882
};
852 
853 // constructor
CFeatureTableReader_Imp(ILineReader * reader,unsigned int line_num,ILineErrorListener * pMessageListener)854 CFeatureTableReader_Imp::CFeatureTableReader_Imp(ILineReader* reader, unsigned int line_num, ILineErrorListener* pMessageListener)
855     : m_reader(reader), m_LineNumber(line_num), m_pMessageListener(pMessageListener)
856 {
857 }
858 
859 // destructor
~CFeatureTableReader_Imp(void)860 CFeatureTableReader_Imp::~CFeatureTableReader_Imp(void)
861 {
862 }
863 
x_TryToParseOffset(const CTempString & sLine,Int4 & out_offset)864 bool CFeatureTableReader_Imp::x_TryToParseOffset(
865     const CTempString & sLine, Int4 & out_offset )
866 {
867     // offset strings are of the form [offset=SOME_NUMBER], but here we try
868     // to be as forgiving of whitespace as possible.
869 
870     CTempString sKey;
871     CTempString sValue;
872     if( ! NStr::SplitInTwo(sLine, "=", sKey, sValue) ) {
873         // "=" not found
874         return false;
875     }
876 
877     // check key
878     NStr::TruncateSpacesInPlace(sKey);
879     if( NStr::StartsWith(sKey, "[") ) {
880         sKey = sKey.substr(1); // remove initial "["
881     }
882     NStr::TruncateSpacesInPlace(sKey, NStr::eTrunc_Begin);
883     if( ! NStr::EqualNocase(sKey, "offset") ) {
884         // key is not offset
885         return false;
886     }
887 
888     // check value
889     NStr::TruncateSpacesInPlace(sValue);
890     if( ! NStr::EndsWith(sValue, "]") ) {
891         // no closing bracket
892         return false;
893     }
894     // remove closing bracket
895     sValue = sValue.substr(0, (sValue.length() - 1) );
896     NStr::TruncateSpacesInPlace(sValue, NStr::eTrunc_End);
897     // is it a number?
898     try {
899         Int4 new_offset = NStr::StringToInt(sValue);
900     //    if( new_offset < 0 ) {
901     //        return false;
902     //    }
903         out_offset = new_offset;
904         return true;
905     } catch ( CStringException & ) {
906         return false;
907     }
908 }
909 
x_ParseFeatureTableLine(const CTempString & line,SFeatLocInfo & loc_info,string & featP,string & qualP,string & valP,Int4 offset)910 bool CFeatureTableReader_Imp::x_ParseFeatureTableLine (
911     const CTempString& line,
912     SFeatLocInfo& loc_info,
913     string& featP,
914     string& qualP,
915     string& valP,
916     Int4 offset
917 )
918 
919 {
920     SIZE_TYPE      numtkns;
921     bool           isminus = false;
922     bool           ispoint = false;
923     size_t         len;
924     bool           partial5 = false;
925     bool           partial3 = false;
926     Int4           startv = -1;
927     Int4           stopv = -1;
928     Int4           swp;
929     string         start, stop, feat, qual, val, stnd;
930     vector<string> tkns;
931 
932 
933     if (line.empty ()) return false;
934 
935     /* offset and other instructions encoded in brackets */
936     if (NStr::StartsWith (line, '[')) return false;
937 
938     tkns.clear ();
939     x_TokenizeLenient(line, tkns);
940     numtkns = tkns.size ();
941 
942     if (numtkns > 0) {
943         start = NStr::TruncateSpaces(tkns[0]);
944     }
945     if (numtkns > 1) {
946         stop = NStr::TruncateSpaces(tkns[1]);
947     }
948     if (numtkns > 2) {
949         feat = NStr::TruncateSpaces(tkns[2]);
950     }
951     if (numtkns > 3) {
952         qual = NStr::TruncateSpaces(tkns[3]);
953     }
954     if (numtkns > 4) {
955         val = NStr::TruncateSpaces(tkns[4]);
956         // trim enclosing double-quotes
957         if( val.length() >= 2 && val[0] == '"' && val[val.length()-1] == '"' ) {
958             val = val.substr(1, val.length() - 2);
959         }
960     }
961     if (numtkns > 5) {
962         stnd = NStr::TruncateSpaces(tkns[5]);
963     }
964 
965     bool has_start = false;
966     if (! start.empty ()) {
967         if (start [0] == '<') {
968             partial5 = true;
969             start.erase (0, 1);
970         }
971         len = start.length ();
972         if (len > 1 && start [len - 1] == '^') {
973           ispoint = true;
974           start [len - 1] = '\0';
975         }
976         startv = x_StringToLongNoThrow(start, feat, qual,
977             ILineError::eProblem_BadFeatureInterval);
978         has_start = true;
979     }
980 
981     bool has_stop = false;
982     if (! stop.empty ()) {
983         if (stop [0] == '>') {
984             partial3 = true;
985             stop.erase (0, 1);
986         }
987         stopv = x_StringToLongNoThrow (stop, feat, qual,
988             ILineError::eProblem_BadFeatureInterval);
989         has_stop = true;
990     }
991 
992     if ( startv <= 0 || stopv <= 0 ) {
993         startv = -1;
994         stopv = -1;
995     } else {
996         startv--;
997         stopv--;
998         if (! stnd.empty ()) {
999             if (stnd == "minus" || stnd == "-" || stnd == "complement") {
1000                 if (start < stop) {
1001                     swp = startv;
1002                     startv = stopv;
1003                     stopv = swp;
1004                 }
1005                 isminus = true;
1006             }
1007         }
1008     }
1009 
1010     if (startv >= 0) {
1011         startv += offset;
1012     }
1013     if (stopv >= 0) {
1014         stopv += offset;
1015     }
1016 
1017     if ((has_start && startv < 0) || (has_stop && stopv < 0)) {
1018         x_ProcessMsg(
1019             ILineError::eProblem_FeatureBadStartAndOrStop,
1020             eDiag_Error,
1021             feat);
1022     }
1023 
1024     loc_info.start_pos = ( startv < 0 ? -1 : startv);
1025     loc_info.stop_pos = ( stopv < 0 ? -1 : stopv);
1026 
1027     loc_info.is_5p_partial = partial5;
1028     loc_info.is_3p_partial = partial3;
1029     loc_info.is_point = ispoint;
1030     loc_info.is_minus_strand = isminus;
1031     featP = feat;
1032     qualP = qual;
1033     valP = val;
1034 
1035     return true;
1036 }
1037 
1038 /*
1039 bool CFeatureTableReader_Imp::x_ParseFeatureTableLine (
1040     const CTempString& line,
1041     Int4* startP,
1042     Int4* stopP,
1043     bool* partial5P,
1044     bool* partial3P,
1045     bool* ispointP,
1046     bool* isminusP,
1047     string& featP,
1048     string& qualP,
1049     string& valP,
1050     Int4 offset
1051 )
1052 
1053 {
1054     SIZE_TYPE      numtkns;
1055     bool           isminus = false;
1056     bool           ispoint = false;
1057     size_t         len;
1058     bool           partial5 = false;
1059     bool           partial3 = false;
1060     Int4           startv = -1;
1061     Int4           stopv = -1;
1062     Int4           swp;
1063     string         start, stop, feat, qual, val, stnd;
1064     vector<string> tkns;
1065 
1066 
1067     if (line.empty ()) return false;
1068 
1069     if (NStr::StartsWith (line, '[')) return false;
1070 
1071     tkns.clear ();
1072     x_TokenizeLenient(line, tkns);
1073     numtkns = tkns.size ();
1074 
1075     if (numtkns > 0) {
1076         start = NStr::TruncateSpaces(tkns[0]);
1077     }
1078     if (numtkns > 1) {
1079         stop = NStr::TruncateSpaces(tkns[1]);
1080     }
1081     if (numtkns > 2) {
1082         feat = NStr::TruncateSpaces(tkns[2]);
1083     }
1084     if (numtkns > 3) {
1085         qual = NStr::TruncateSpaces(tkns[3]);
1086     }
1087     if (numtkns > 4) {
1088         val = NStr::TruncateSpaces(tkns[4]);
1089         // trim enclosing double-quotes
1090         if( val.length() >= 2 && val[0] == '"' && val[val.length()-1] == '"' ) {
1091             val = val.substr(1, val.length() - 2);
1092         }
1093     }
1094     if (numtkns > 5) {
1095         stnd = NStr::TruncateSpaces(tkns[5]);
1096     }
1097 
1098     bool has_start = false;
1099     if (! start.empty ()) {
1100         if (start [0] == '<') {
1101             partial5 = true;
1102             start.erase (0, 1);
1103         }
1104         len = start.length ();
1105         if (len > 1 && start [len - 1] == '^') {
1106           ispoint = true;
1107           start [len - 1] = '\0';
1108         }
1109         startv = x_StringToLongNoThrow(start, feat, qual,
1110             ILineError::eProblem_BadFeatureInterval);
1111         has_start = true;
1112     }
1113 
1114     bool has_stop = false;
1115     if (! stop.empty ()) {
1116         if (stop [0] == '>') {
1117             partial3 = true;
1118             stop.erase (0, 1);
1119         }
1120         stopv = x_StringToLongNoThrow (stop, feat, qual,
1121             ILineError::eProblem_BadFeatureInterval);
1122         has_stop = true;
1123     }
1124 
1125     if ( startv <= 0 || stopv <= 0 ) {
1126         startv = -1;
1127         stopv = -1;
1128     } else {
1129         startv--;
1130         stopv--;
1131         if (! stnd.empty ()) {
1132             if (stnd == "minus" || stnd == "-" || stnd == "complement") {
1133                 if (start < stop) {
1134                     swp = startv;
1135                     startv = stopv;
1136                     stopv = swp;
1137                 }
1138                 isminus = true;
1139             }
1140         }
1141     }
1142 
1143     if (startv >= 0) {
1144         startv += offset;
1145     }
1146     if (stopv >= 0) {
1147         stopv += offset;
1148     }
1149 
1150     if ((has_start && startv < 0) || (has_stop && stopv < 0)) {
1151         x_ProcessMsg(
1152             ILineError::eProblem_FeatureBadStartAndOrStop,
1153             eDiag_Error,
1154             feat);
1155     }
1156 
1157     *startP = ( startv < 0 ? -1 : startv);
1158     *stopP = ( stopv < 0 ? -1 : stopv);
1159 
1160     *partial5P = partial5;
1161     *partial3P = partial3;
1162     *ispointP = ispoint;
1163     *isminusP = isminus;
1164     featP = feat;
1165     qualP = qual;
1166     valP = val;
1167 
1168     return true;
1169 }
1170 */
1171 
x_TokenizeStrict(const CTempString & line,vector<string> & out_tokens)1172 void CFeatureTableReader_Imp::x_TokenizeStrict(
1173     const CTempString &line,
1174     vector<string> &out_tokens )
1175 {
1176     out_tokens.clear();
1177 
1178     // each token has spaces before it and a tab or end-of-line after it
1179     string::size_type startPosOfNextRoundOfTokenization = 0;
1180     while ( startPosOfNextRoundOfTokenization < line.size() ) {
1181         auto posAfterSpaces = line.find_first_not_of( " ", startPosOfNextRoundOfTokenization );
1182         if( posAfterSpaces == string::npos ) {
1183             return;
1184         }
1185 
1186         string::size_type posOfTab = line.find( '\t', posAfterSpaces );
1187         if( posOfTab == string::npos ) {
1188             posOfTab = line.length();
1189         }
1190 
1191         // The next token is between the spaces and the tab (or end of string)
1192         out_tokens.push_back(kEmptyStr);
1193         string &new_token = out_tokens.back();
1194         copy( line.begin() + posAfterSpaces, line.begin() + posOfTab, back_inserter(new_token) );
1195         NStr::TruncateSpacesInPlace( new_token );
1196 
1197         startPosOfNextRoundOfTokenization = ( posOfTab + 1 );
1198     }
1199 }
1200 
// Predicate object for find_if and friends, since some compilers won't let
// isspace itself be used there.  Returns true for whitespace characters.
class CIsSpace {
public:
    // The argument is converted to unsigned char first: handing a negative
    // char value to the standard isspace() is undefined behavior.
    bool operator()( char c ) const
    {
        return isspace( static_cast<unsigned char>(c) ) != 0;
    }
};
1206 
// Predicate object for find_if and friends: true for every character that
// is not whitespace.
class CIsNotSpace {
public:
    // The argument is converted to unsigned char first: handing a negative
    // char value to the standard isspace() is undefined behavior.
    bool operator()( char c ) const
    {
        return isspace( static_cast<unsigned char>(c) ) == 0;
    }
};
1211 
x_TokenizeLenient(const CTempString & line,vector<string> & out_tokens)1212 void CFeatureTableReader_Imp::x_TokenizeLenient(
1213     const CTempString &line,
1214     vector<string> &out_tokens )
1215 {
1216     out_tokens.clear();
1217 
1218     if( line.empty() ) {
1219         return;
1220     }
1221 
1222     // if it starts with whitespace, it must be a qual line, else it's a feature line
1223     if( isspace(line[0]) ) {
1224         // In regex form, we're doing something like this:
1225         // \s+(\S+)(\s+(\S.*))?
1226         // Where the first is the qual, and the rest is the val
1227         auto start_of_qual = find_if( line.begin(), line.end(), CIsNotSpace() );
1228         if( start_of_qual == line.end() ) {
1229             return;
1230         }
1231         auto start_of_whitespace_after_qual = find_if( start_of_qual, line.end(), CIsSpace() );
1232         auto start_of_val = find_if( start_of_whitespace_after_qual, line.end(), CIsNotSpace() );
1233 
1234         // first 3 are empty
1235         out_tokens.push_back(kEmptyStr);
1236         out_tokens.push_back(kEmptyStr);
1237         out_tokens.push_back(kEmptyStr);
1238 
1239         // then qual
1240         out_tokens.push_back(kEmptyStr);
1241         string &qual = out_tokens.back();
1242         copy( start_of_qual, start_of_whitespace_after_qual, back_inserter(qual) );
1243 
1244         // then val
1245         if( start_of_val != line.end() ) {
1246             out_tokens.push_back(kEmptyStr);
1247             string &val = out_tokens.back();
1248             copy( start_of_val, line.end(), back_inserter(val) );
1249             NStr::TruncateSpacesInPlace( val );
1250         }
1251 
1252     } else {
1253         // parse a feature line
1254 
1255         // Since we're being lenient, we consider it to be 3 ( or 6 ) parts separated by whitespace
1256         auto first_column_start = line.begin();
1257         auto first_whitespace = find_if( first_column_start, line.end(), CIsSpace() );
1258         auto second_column_start = find_if( first_whitespace, line.end(), CIsNotSpace() );
1259         auto second_whitespace = find_if( second_column_start, line.end(), CIsSpace() );
1260         auto third_column_start = find_if( second_whitespace, line.end(), CIsNotSpace() );
1261         auto third_whitespace = find_if( third_column_start, line.end(), CIsSpace() );
1262         // columns 4 and 5 are unused on feature lines
1263         auto sixth_column_start = find_if( third_whitespace, line.end(), CIsNotSpace() );
1264         auto sixth_whitespace = find_if( sixth_column_start, line.end(), CIsSpace() );
1265 
1266         out_tokens.push_back(kEmptyStr);
1267         string &first = out_tokens.back();
1268         copy( first_column_start, first_whitespace, back_inserter(first) );
1269 
1270         out_tokens.push_back(kEmptyStr);
1271         string &second = out_tokens.back();
1272         copy( second_column_start, second_whitespace, back_inserter(second) );
1273 
1274         out_tokens.push_back(kEmptyStr);
1275         string &third = out_tokens.back();
1276         copy( third_column_start, third_whitespace, back_inserter(third) );
1277 
1278         if( sixth_column_start != line.end() ) {
1279             // columns 4 and 5 are unused
1280             out_tokens.push_back(kEmptyStr);
1281             out_tokens.push_back(kEmptyStr);
1282 
1283             out_tokens.push_back(kEmptyStr);
1284             string &sixth = out_tokens.back();
1285             copy( sixth_column_start, sixth_whitespace, back_inserter(sixth) );
1286         }
1287     }
1288 }
1289 
1290 
x_AddQualifierToGene(CSeqFeatData & sfdata,EQual qtype,const string & val)1291 bool CFeatureTableReader_Imp::x_AddQualifierToGene (
1292     CSeqFeatData& sfdata,
1293     EQual qtype,
1294     const string& val
1295 )
1296 
1297 {
1298     CGene_ref& grp = sfdata.SetGene ();
1299     switch (qtype) {
1300         case eQual_gene:
1301             grp.SetLocus (val);
1302             return true;
1303         case eQual_allele:
1304             grp.SetAllele (val);
1305             return true;
1306         case eQual_gene_desc:
1307             grp.SetDesc (val);
1308             return true;
1309         case eQual_gene_syn:
1310             {
1311                 CGene_ref::TSyn& syn = grp.SetSyn ();
1312                 syn.push_back (val);
1313                 return true;
1314             }
1315         case eQual_map:
1316             grp.SetMaploc (val);
1317             return true;
1318         case eQual_locus_tag:
1319             grp.SetLocus_tag (val);
1320             return true;
1321         case eQual_nomenclature:
1322             /* !!! need to implement !!! */
1323             return true;
1324         default:
1325             break;
1326     }
1327     return false;
1328 }
1329 
1330 
x_AddQualifierToCdregion(CRef<CSeq_feat> sfp,CSeqFeatData & sfdata,EQual qtype,const string & val)1331 bool CFeatureTableReader_Imp::x_AddQualifierToCdregion (
1332     CRef<CSeq_feat> sfp,
1333     CSeqFeatData& sfdata,
1334     EQual qtype, const string& val
1335 )
1336 
1337 {
1338     CCdregion& crp = sfdata.SetCdregion ();
1339     switch (qtype) {
1340         case eQual_codon_start:
1341             {
1342                 int frame = x_StringToLongNoThrow (val, kCdsFeatName, "codon_start");
1343                 switch (frame) {
1344                     case 0:
1345                         crp.SetFrame (CCdregion::eFrame_not_set);
1346                         break;
1347                     case 1:
1348                         crp.SetFrame (CCdregion::eFrame_one);
1349                         break;
1350                     case 2:
1351                         crp.SetFrame (CCdregion::eFrame_two);
1352                         break;
1353                     case 3:
1354                         crp.SetFrame (CCdregion::eFrame_three);
1355                         break;
1356                     default:
1357                         break;
1358                 }
1359                 return true;
1360             }
1361         case eQual_EC_number:
1362             {
1363                 CProt_ref& prp = sfp->SetProtXref ();
1364                 CProt_ref::TEc& ec = prp.SetEc ();
1365                 ec.push_back (val);
1366                 return true;
1367             }
1368         case eQual_function:
1369             {
1370                 CProt_ref& prp = sfp->SetProtXref ();
1371                 CProt_ref::TActivity& fun = prp.SetActivity ();
1372                 fun.push_back (val);
1373                 return true;
1374             }
1375         case eQual_product:
1376             {
1377                 CProt_ref& prp = sfp->SetProtXref ();
1378                 CProt_ref::TName& prod = prp.SetName ();
1379                 prod.push_back (val);
1380                 return true;
1381             }
1382         case eQual_prot_desc:
1383             {
1384                 CProt_ref& prp = sfp->SetProtXref ();
1385                 prp.SetDesc (val);
1386                 return true;
1387             }
1388         case eQual_prot_note:
1389             return x_AddGBQualToFeature(sfp, "prot_note", val);
1390         case eQual_transl_except:
1391             // add as GBQual, let cleanup convert to code_break
1392             return x_AddGBQualToFeature(sfp, "transl_except", val);
1393         case eQual_translation:
1394             // we should accept, but ignore this qual on CDSs.
1395             // so, do nothing but return success
1396             return true;
1397         case eQual_transl_table:
1398             // set genetic code directly, or add qualifier and let cleanup convert?
1399             try {
1400                 int num = NStr::StringToLong(val);
1401                 CGen_code_table::GetTransTable(num); // throws if bad num
1402                 CRef<CGenetic_code::C_E> code(new CGenetic_code::C_E());
1403                 code->SetId(num);
1404                 crp.SetCode().Set().push_back(code);
1405                 return true;
1406             } catch( CStringException ) {
1407                 // if val is not a number, add qualifier directly and
1408                 // let cleanup convert?
1409                 return x_AddGBQualToFeature(sfp, "transl_table", val);
1410             } catch( ... ) {
1411                 // invalid genome code table so don't even try to make
1412                 // the transl_table qual
1413                 x_ProcessMsg(
1414                     ILineError::eProblem_QualifierBadValue, eDiag_Error,
1415                     kCdsFeatName, "transl_table", val);
1416                 return true;
1417             }
1418             break;
1419 
1420         default:
1421             break;
1422     }
1423     return false;
1424 }
1425 
1426 
x_StringIsJustQuotes(const string & str)1427 bool CFeatureTableReader_Imp::x_StringIsJustQuotes (
1428     const string& str
1429 )
1430 
1431 {
1432     ITERATE (string, it, str) {
1433       char ch = *it;
1434       if (ch > ' ' && ch != '"' && ch != '\'') return false;
1435     }
1436 
1437     return true;
1438 }
1439 
1440 static bool
s_LineIndicatesOrder(const CTempString & line)1441 s_LineIndicatesOrder( const CTempString & line )
1442 {
1443     // basically, this is true if the line starts with "order" (whitespaces disregarded)
1444 
1445     const static char* kOrder = "ORDER";
1446 
1447     // find first non-whitespace character
1448     string::size_type pos = 0;
1449     for( ; pos < line.length() && isspace(line[pos]); ++pos) {
1450         // nothing to do here
1451     }
1452 
1453     // line is all whitespace
1454     if( pos >= line.length() ) {
1455         return false;
1456     }
1457 
1458     // check if starts with "order" after whitespace
1459     return ( 0 == NStr::CompareNocase( line, pos, strlen(kOrder), kOrder ) );
1460 }
1461 
1462 // Turns a "join" location into an "order" by putting nulls between it
1463 // Returns an unset CRef if the loc doesn't need nulls (e.g. if it's just an interval)
1464 static CRef<CSeq_loc>
s_LocationJoinToOrder(const CSeq_loc & loc)1465 s_LocationJoinToOrder( const CSeq_loc & loc )
1466 {
1467     // create result we're returning
1468     CRef<CSeq_loc> result( new CSeq_loc );
1469     CSeq_loc_mix::Tdata & mix_pieces  = result->SetMix().Set();
1470 
1471     // keep this around for whenever we need a "null" piece
1472     CRef<CSeq_loc> loc_piece_null( new CSeq_loc );
1473     loc_piece_null->SetNull();
1474 
1475     // push pieces of source, with NULLs between
1476     CSeq_loc_CI loc_iter( loc );
1477     for( ; loc_iter; ++loc_iter ) {
1478         if( ! mix_pieces.empty() ) {
1479             mix_pieces.push_back( loc_piece_null );
1480         }
1481         CRef<CSeq_loc> new_piece( new CSeq_loc );
1482         new_piece->Assign( loc_iter.GetEmbeddingSeq_loc() );
1483         mix_pieces.push_back( new_piece );
1484     }
1485 
1486     // Only wrap in "mix" if there was more than one piece
1487     if( mix_pieces.size() > 1 ) {
1488         return result;
1489     } else {
1490         return CRef<CSeq_loc>();
1491     }
1492 }
1493 
1494 
// Extracts the amino-acid part from a tRNA product string.
// A leading "tRNA-" is dropped.  If any of the delimiter characters
// -,;:()='_~ occurs in what remains, the text is cut at the first one and
// surrounding spaces are trimmed.  Without a delimiter the remainder is
// returned as is (no trimming takes place in that case).
string CFeatureTableReader_Imp::x_TrnaToAaString(
    const string& val
)
{
    CTempString aa_name(val);

    if (NStr::StartsWith(aa_name, "tRNA-")) {
        aa_name.assign(aa_name, strlen("tRNA-"), CTempString::npos);
    }

    const CTempString::size_type delim_pos =
        aa_name.find_first_of("-,;:()=\'_~");
    if (delim_pos == CTempString::npos) {
        return string(aa_name);
    }

    aa_name.erase(delim_pos);
    NStr::TruncateSpacesInPlace(aa_name);
    return string(aa_name);
}
1513 
1514 
1515 bool
x_ParseTrnaExtString(CTrna_ext & ext_trna,const string & str)1516 CFeatureTableReader_Imp::x_ParseTrnaExtString(CTrna_ext & ext_trna, const string & str)
1517 {
1518     if (NStr::IsBlank (str)) return false;
1519 
1520     string normalized_string = str;
1521     normalized_string.erase(
1522             remove_if(begin(normalized_string),
1523                       end(normalized_string),
1524                       [](char c) { return isspace(c);}),
1525             end(normalized_string));
1526 
1527     if ( NStr::StartsWith(normalized_string, "(pos:") ) {
1528         // find position of closing paren
1529         string::size_type pos_end = x_MatchingParenPos( normalized_string, 0 );
1530         if (pos_end != string::npos) {
1531             string pos_str = normalized_string.substr (5, pos_end - 5);
1532             string::size_type aa_start = NStr::FindNoCase(pos_str, "aa:");
1533             if (aa_start != string::npos) {
1534                 auto seq_start = NStr::FindNoCase(pos_str, ",seq:");
1535                 if (seq_start != string::npos &&
1536                     seq_start < aa_start+3) {
1537                     return false;
1538                 }
1539 
1540                 size_t aa_length = (seq_start == NPOS) ?
1541                                 pos_str.size() - (aa_start+3) :
1542                                 seq_start - (aa_start+3);
1543 
1544                 string abbrev = pos_str.substr (aa_start + 3, aa_length);
1545                 //TTrnaMap::const_iterator
1546                 auto t_iter = sm_TrnaKeys.find (abbrev.c_str ());
1547                 if (t_iter == sm_TrnaKeys.end ()) {
1548                     // unable to parse
1549                     return false;
1550                 }
1551                 CRef<CTrna_ext::TAa> aa(new CTrna_ext::TAa);
1552                 aa->SetNcbieaa (t_iter->second);
1553                 ext_trna.SetAa(*aa);
1554                 pos_str = pos_str.substr (0, aa_start);
1555                 NStr::TruncateSpacesInPlace (pos_str);
1556                 if (NStr::EndsWith (pos_str, ",")) {
1557                     pos_str = pos_str.substr (0, pos_str.length() - 1);
1558                 }
1559             }
1560             CGetSeqLocFromStringHelper helper;
1561             CRef<CSeq_loc> anticodon = GetSeqLocFromString (pos_str, m_seq_id, & helper);
1562             if (anticodon == NULL) {
1563                 ext_trna.ResetAa();
1564                 return false;
1565             } else {
1566                 switch( anticodon->GetStrand() ) {
1567                 case eNa_strand_unknown:
1568                 case eNa_strand_plus:
1569                 case eNa_strand_minus:
1570                     ext_trna.SetAnticodon(*anticodon);
1571                     return true;
1572                 default:
1573                     ext_trna.ResetAa();
1574                     return false;
1575                 }
1576             }
1577         }
1578     }
1579 
1580     return false;
1581 }
1582 
1583 
x_MatchingParenPos(const string & str,SIZE_TYPE open_paren_pos)1584 SIZE_TYPE CFeatureTableReader_Imp::x_MatchingParenPos(
1585     const string &str, SIZE_TYPE open_paren_pos )
1586 {
1587     _ASSERT( str[open_paren_pos] == '(' );
1588     _ASSERT( open_paren_pos < str.length() );
1589 
1590     // nesting level. start at 1 since we know there's an open paren
1591     int level = 1;
1592 
1593     SIZE_TYPE pos = open_paren_pos + 1;
1594     for( ; pos < str.length(); ++pos ) {
1595         switch( str[pos] ) {
1596             case '(':
1597                 // nesting deeper
1598                 ++level;
1599                 break;
1600             case ')':
1601                 // closed a level of nesting
1602                 --level;
1603                 if( 0 == level ) {
1604                     // reached the top: we're closing the initial paren,
1605                     // so we return our position
1606                     return pos;
1607                 }
1608                 break;
1609             default:
1610                 // ignore other characters.
1611                 // maybe in the future we'll handle ignoring parens in quotes or
1612                 // things like that.
1613                 break;
1614         }
1615     }
1616     return NPOS;
1617 }
1618 
x_StringToLongNoThrow(CTempString strToConvert,CTempString strFeatureName,CTempString strQualifierName,ILineError::EProblem eProblem)1619 long CFeatureTableReader_Imp::x_StringToLongNoThrow (
1620     CTempString strToConvert,
1621     CTempString strFeatureName,
1622     CTempString strQualifierName,
1623     ILineError::EProblem eProblem
1624 )
1625 {
1626     try {
1627         return NStr::StringToLong(strToConvert);
1628     } catch( ... ) {
1629         // See if we start with a number, but there's extra junk after it, try again
1630         if( ! strToConvert.empty() && isdigit(strToConvert[0]) ) {
1631             try {
1632                 long result = NStr::StringToLong(strToConvert, NStr::fAllowTrailingSymbols);
1633 
1634                 ILineError::EProblem problem =
1635                     ILineError::eProblem_NumericQualifierValueHasExtraTrailingCharacters;
1636                 if( eProblem != ILineError::eProblem_Unset ) {
1637                     problem = eProblem;
1638                 }
1639 
1640                 x_ProcessMsg(
1641                     problem,
1642                     eDiag_Warning,
1643                     strFeatureName, strQualifierName, strToConvert );
1644                 return result;
1645             } catch( ... ) { } // fall-thru to usual handling
1646         }
1647 
1648         ILineError::EProblem problem =
1649             ILineError::eProblem_NumericQualifierValueIsNotANumber;
1650         if( eProblem != ILineError::eProblem_Unset ) {
1651             problem = eProblem;
1652         }
1653 
1654         x_ProcessMsg(
1655             problem,
1656             eDiag_Warning,
1657             strFeatureName, strQualifierName, strToConvert );
1658         // we have no idea, so just return zero
1659         return 0;
1660     }
1661 }
1662 
1663 
x_AddQualifierToRna(CRef<CSeq_feat> sfp,EQual qtype,const string & val)1664 bool CFeatureTableReader_Imp::x_AddQualifierToRna (
1665     CRef<CSeq_feat> sfp,
1666     EQual qtype,
1667     const string& val
1668 )
1669 {
1670     CSeqFeatData& sfdata = sfp->SetData();
1671     CRNA_ref& rrp = sfdata.SetRna ();
1672     CRNA_ref::EType rnatyp = rrp.GetType ();
1673     switch (rnatyp) {
1674         case CRNA_ref::eType_premsg:
1675         case CRNA_ref::eType_mRNA:
1676         case CRNA_ref::eType_rRNA:
1677             switch (qtype) {
1678                 case eQual_product:
1679                     {
1680                         CRNA_ref::TExt& tex = rrp.SetExt ();
1681                         CRNA_ref::C_Ext::E_Choice exttype = tex.Which ();
1682                         if (exttype == CRNA_ref::C_Ext::e_TRNA) return false;
1683                         tex.SetName (val);
1684                         return true;
1685                     }
1686                 default:
1687                     break;
1688             }
1689             break;
1690         case CRNA_ref::eType_ncRNA:
1691             switch (qtype) {
1692                 case eQual_product:
1693                     rrp.SetExt().SetGen().SetProduct(val);
1694                     return true;
1695                     break;
1696                 case eQual_ncRNA_class:
1697                     rrp.SetExt().SetGen().SetClass(val);
1698                     return true;
1699                     break;
1700                 default:
1701                     break;
1702             }
1703             break;
1704         case CRNA_ref::eType_tmRNA:
1705             switch (qtype) {
1706                 case eQual_product:
1707                     rrp.SetExt().SetGen().SetProduct(val);
1708                     return true;
1709                 case eQual_tag_peptide:
1710                   {
1711                     CRef<CRNA_qual> q(new CRNA_qual());
1712                     q->SetQual("tag_peptide");
1713                     q->SetVal(val);
1714                     rrp.SetExt().SetGen().SetQuals().Set().push_back(q);
1715                     return true;
1716                   }
1717                   break;
1718                 default:
1719                     break;
1720             }
1721             break;
1722         case CRNA_ref::eType_snRNA:
1723         case CRNA_ref::eType_scRNA:
1724         case CRNA_ref::eType_snoRNA:
1725         case CRNA_ref::eType_other:
1726             return false;
1727         case CRNA_ref::eType_tRNA:
1728             switch (qtype) {
1729                 case eQual_product: {
1730                         if (rrp.IsSetExt() && rrp.GetExt().Which() == CRNA_ref::C_Ext::e_Name)
1731                             return false;
1732 
1733                         const string& aa_string = x_TrnaToAaString(val);
1734                         const auto aaval_it = sm_TrnaKeys.find(aa_string.c_str());
1735 
1736                         if (aaval_it != sm_TrnaKeys.end()) {
1737                             CRNA_ref::TExt& tex = rrp.SetExt ();
1738                             CTrna_ext& trx = tex.SetTRNA();
1739                             CTrna_ext::TAa& taa = trx.SetAa();
1740                             taa.SetNcbieaa(aaval_it->second);
1741                             if (aa_string == "fMet" ||
1742                                 aa_string == "iMet" ||
1743                                 aa_string == "Ile2") {
1744                                x_AddGBQualToFeature(sfp, "product", val);
1745                             }
1746                         }
1747                         else {
1748                             x_ProcessMsg(
1749                                 ILineError::eProblem_QualifierBadValue, eDiag_Warning,
1750                                 "tRNA", "product", val);
1751                         }
1752                         return true;
1753                     }
1754                     break;
1755                 case eQual_anticodon:
1756                     {
1757                         CRNA_ref::TExt& tex = rrp.SetExt ();
1758                         CRNA_ref::C_Ext::TTRNA & ext_trna = tex.SetTRNA();
1759                         if( ! x_ParseTrnaExtString(ext_trna, val) ) {
1760                             x_ProcessMsg(
1761                                 ILineError::eProblem_QualifierBadValue, eDiag_Error,
1762                                 "tRNA", "anticodon", val );
1763                         }
1764                         return true;
1765                     }
1766                     break;
1767                 case eQual_codon_recognized:
1768                     {
1769                         //const auto codon_index = CGen_code_table::CodonToIndex(val);
1770                         //if (codon_index >= 0) {
1771                             CRNA_ref::TExt& tex = rrp.SetExt ();
1772                             CRNA_ref::C_Ext::TTRNA & ext_trna = tex.SetTRNA();
1773                             if (!x_AddCodons(val, ext_trna)) {
1774                                 return false;
1775                             }
1776                         //}
1777                         return true;
1778                     }
1779                     break;
1780                 default:
1781                     break;
1782             }
1783             break;
1784         default:
1785             break;
1786     }
1787     return false;
1788 }
1789 
1790 
x_AddCodons(const string & val,CTrna_ext & trna_ext) const1791 bool CFeatureTableReader_Imp::x_AddCodons(
1792         const string& val,
1793         CTrna_ext& trna_ext
1794     ) const
1795 {
1796     if (val.size() != 3) {
1797         return false;
1798     }
1799 
1800     set<int> codons;
1801     try {
1802         for (char char1 : s_IUPACmap.at(val[0])) {
1803             for (char char2 : s_IUPACmap.at(val[1])) {
1804                 for (char char3 : s_IUPACmap.at(val[2])) {
1805                     const auto codon_index = CGen_code_table::CodonToIndex(char1, char2, char3);
1806                     codons.insert(codon_index);
1807                 }
1808             }
1809         }
1810 
1811         if (!codons.empty()) {
1812             trna_ext.SetAa().SetNcbieaa();
1813             for (const auto codon_index : codons) {
1814                 trna_ext.SetCodon().push_back(codon_index);
1815             }
1816         }
1817         return true;
1818     }
1819     catch(...) {}
1820 
1821     return false;
1822 }
1823 
1824 
x_AddQualifierToImp(CRef<CSeq_feat> sfp,CSeqFeatData & sfdata,EQual qtype,const string & qual,const string & val)1825 bool CFeatureTableReader_Imp::x_AddQualifierToImp (
1826     CRef<CSeq_feat> sfp,
1827     CSeqFeatData& sfdata,
1828     EQual qtype,
1829     const string& qual,
1830     const string& val
1831 )
1832 
1833 {
1834     const char *str = NULL;
1835 
1836     CSeqFeatData::ESubtype subtype = sfdata.GetSubtype ();
1837 
1838     // used if-statement because CSeqFeatData::IsRegulatory won't work in a
1839     // switch statement.
1840     if( (subtype == CSeqFeatData::eSubtype_regulatory) ||
1841         CSeqFeatData::IsRegulatory(subtype) )
1842     {
1843         if (qtype == eQual_regulatory_class) {
1844             if (val != "other") { // RW-374 "other" is a special case
1845 
1846                 const vector<string>& allowed_values =
1847                     CSeqFeatData::GetRegulatoryClassList();
1848                 if (find(allowed_values.cbegin(), allowed_values.cend(), val)
1849                     == allowed_values.cend()) {
1850                     return false;
1851                 }
1852 
1853 /*
1854                 const CSeqFeatData::ESubtype regulatory_class_subtype =
1855                     CSeqFeatData::GetRegulatoryClass(val);
1856                 if( regulatory_class_subtype == CSeqFeatData::eSubtype_bad ) {
1857                     // msg will be sent in caller x_AddQualifierToFeature
1858                     return false;
1859                 }
1860                 */
1861             }
1862             // okay
1863             // (Note that at this time we don't validate
1864             // if the regulatory_class actually matches the
1865             // subtype)
1866             x_AddGBQualToFeature(sfp, qual, val);
1867             return true;
1868         }
1869     }
1870 
1871     switch (subtype) {
1872         case CSeqFeatData::eSubtype_variation:
1873             {
1874                 switch (qtype) {
1875                     case eQual_chrcnt:
1876                     case eQual_ctgcnt:
1877                     case eQual_loccnt:
1878                     case eQual_snp_class:
1879                     case eQual_snp_gtype:
1880                     case eQual_snp_het:
1881                     case eQual_snp_het_se:
1882                     case eQual_snp_linkout:
1883                     case eQual_snp_maxrate:
1884                     case eQual_snp_valid:
1885                     case eQual_weight:
1886                         str = "dbSnpSynonymyData";
1887                         break;
1888                     default:
1889                         break;
1890                 }
1891             }
1892             break;
1893         case CSeqFeatData::eSubtype_STS:
1894             {
1895                 switch (qtype) {
1896                     case eQual_sts_aliases:
1897                     case eQual_sts_dsegs:
1898                     case eQual_weight:
1899                         str = "stsUserObject";
1900                         break;
1901                     default:
1902                         break;
1903                 }
1904             }
1905             break;
1906         case CSeqFeatData::eSubtype_misc_feature:
1907             {
1908                 switch (qtype) {
1909                     case eQual_bac_ends:
1910                     case eQual_clone_id:
1911                     case eQual_method:
1912                     case eQual_sequence:
1913                     case eQual_STS:
1914                     case eQual_weight:
1915                         str = "cloneUserObject";
1916                         break;
1917                     default:
1918                         break;
1919                 }
1920             }
1921             break;
1922         default:
1923             break;
1924     }
1925 
1926     if( NULL != str ) {
1927         CSeq_feat::TExt& ext = sfp->SetExt ();
1928         CObject_id& obj = ext.SetType ();
1929         if ((! obj.IsStr ()) || obj.GetStr ().empty ()) {
1930             obj.SetStr ();
1931         }
1932         ext.AddField (qual, val, CUser_object::eParse_Number);
1933         return true;
1934     }
1935 
1936     return false;
1937 }
1938 
1939 
x_AddQualifierToBioSrc(CSeqFeatData & sfdata,const string & feat_name,EOrgRef rtype,const string & val)1940 bool CFeatureTableReader_Imp::x_AddQualifierToBioSrc (
1941     CSeqFeatData& sfdata,
1942     const string &feat_name,
1943     EOrgRef rtype,
1944     const string& val
1945 )
1946 {
1947     CBioSource& bsp = sfdata.SetBiosrc ();
1948 
1949     switch (rtype) {
1950         case eOrgRef_organism:
1951             {
1952                 CBioSource::TOrg& orp = bsp.SetOrg ();
1953                 orp.SetTaxname (val);
1954                 return true;
1955             }
1956         case eOrgRef_organelle:
1957             {
1958                 TGenomeMap::const_iterator g_iter = sm_GenomeKeys.find (val.c_str ());
1959                 if (g_iter != sm_GenomeKeys.end ()) {
1960                     CBioSource::EGenome gtype = g_iter->second;
1961                     bsp.SetGenome (gtype);
1962                 } else {
1963                     x_ProcessMsg(
1964                         ILineError::eProblem_QualifierBadValue, eDiag_Error,
1965                         feat_name, "organelle", val );
1966                 }
1967                 return true;
1968             }
1969         case eOrgRef_div:
1970             {
1971                 CBioSource::TOrg& orp = bsp.SetOrg ();
1972                 COrg_ref::TOrgname& onp = orp.SetOrgname ();
1973                 onp.SetDiv (val);
1974                 return true;
1975             }
1976         case eOrgRef_lineage:
1977             {
1978                 CBioSource::TOrg& orp = bsp.SetOrg ();
1979                 COrg_ref::TOrgname& onp = orp.SetOrgname ();
1980                 onp.SetLineage (val);
1981                 return true;
1982             }
1983         case eOrgRef_gcode:
1984             {
1985                 CBioSource::TOrg& orp = bsp.SetOrg ();
1986                 COrg_ref::TOrgname& onp = orp.SetOrgname ();
1987                 int code = x_StringToLongNoThrow (val, feat_name, "gcode");
1988                 onp.SetGcode (code);
1989                 return true;
1990             }
1991         case eOrgRef_mgcode:
1992             {
1993                 CBioSource::TOrg& orp = bsp.SetOrg ();
1994                 COrg_ref::TOrgname& onp = orp.SetOrgname ();
1995                 int code = x_StringToLongNoThrow (val, feat_name, "mgcode");
1996                 onp.SetMgcode (code);
1997                 return true;
1998             }
1999         default:
2000             break;
2001     }
2002     return false;
2003 }
2004 
2005 
x_AddQualifierToBioSrc(CSeqFeatData & sfdata,CSubSource::ESubtype stype,const string & val)2006 bool CFeatureTableReader_Imp::x_AddQualifierToBioSrc (
2007     CSeqFeatData& sfdata,
2008     CSubSource::ESubtype stype,
2009     const string& val
2010 )
2011 
2012 {
2013     CBioSource& bsp = sfdata.SetBiosrc ();
2014     CBioSource::TSubtype& slist = bsp.SetSubtype ();
2015     CRef<CSubSource> ssp (new CSubSource);
2016     ssp->SetSubtype (stype);
2017     ssp->SetName (val);
2018     slist.push_back (ssp);
2019     return true;
2020 }
2021 
2022 
x_AddQualifierToBioSrc(CSeqFeatData & sfdata,COrgMod::ESubtype mtype,const string & val)2023 bool CFeatureTableReader_Imp::x_AddQualifierToBioSrc (
2024     CSeqFeatData& sfdata,
2025     COrgMod::ESubtype mtype,
2026     const string& val
2027 )
2028 
2029 {
2030     CBioSource& bsp = sfdata.SetBiosrc ();
2031     CBioSource::TOrg& orp = bsp.SetOrg ();
2032     COrg_ref::TOrgname& onp = orp.SetOrgname ();
2033     COrgName::TMod& mlist = onp.SetMod ();
2034     CRef<COrgMod> omp (new COrgMod);
2035     omp->SetSubtype (mtype);
2036     omp->SetSubname (val);
2037     mlist.push_back (omp);
2038     return true;
2039 }
2040 
2041 
x_AddGBQualToFeature(CRef<CSeq_feat> sfp,const string & qual,const string & val)2042 bool CFeatureTableReader_Imp::x_AddGBQualToFeature (
2043     CRef<CSeq_feat> sfp,
2044     const string& qual,
2045     const string& val
2046 )
2047 
2048 {
2049     if (qual.empty ()) return false;
2050 
2051     // need this pointer because references can't be repointed
2052     CTempString normalized_qual = qual;
2053 
2054     // normalize qual if needed, especially regarding case, and
2055     // use as-is if no normalization applies
2056     auto qual_type = CSeqFeatData::GetQualifierType(qual);
2057     if( qual_type != CSeqFeatData::eQual_bad ) {
2058         // swap is constant time
2059         CTempString potential_normalized_qual = CSeqFeatData::GetQualifierAsString(qual_type);
2060         if( ! potential_normalized_qual.empty() ) {
2061             normalized_qual = potential_normalized_qual;
2062         }
2063     }
2064 
2065     auto& qlist = sfp->SetQual ();
2066     CRef<CGb_qual> gbq (new CGb_qual);
2067     gbq->SetQual() = normalized_qual;
2068     if (x_StringIsJustQuotes (val)) {
2069         gbq->SetVal() = kEmptyStr;
2070     } else {
2071         gbq->SetVal() = val;
2072     }
2073     qlist.push_back (gbq);
2074 
2075     return true;
2076 }
2077 
2078 
// Cross-checks every CDS feature in choiceToFeatMap against the gene features
// it references through its gene xref and, if requested, creates the genes
// that are missing.
//
// For each CDS that carries a gene xref, the already-known genes whose locus
// and/or locus_tag agree with the xref are collected.  Then, depending on
// flags:
//  - fCDSsMustBeInTheirGenes: an eProblem_FeatMustBeInXrefdGene error is
//    reported for every matching gene whose location does not completely
//    cover the CDS location.
//  - fCreateGenesFromCDSs: when no gene matched, a new gene feature is built
//    from the xref (location copied from the CDS, partial flag carried
//    over), appended to the feature table of sap and registered in
//    choiceToFeatMap and the local lookup maps so later CDSs can match it.
//
// CDSs without a gene xref are skipped silently.
void CFeatureTableReader_Imp::x_CreateGenesFromCDSs(
    CRef<CSeq_annot> sap,
    TChoiceToFeatMap & choiceToFeatMap,
    const TFlags flags)
{
    // load cds_equal_range to hold the CDSs
    typedef TChoiceToFeatMap::iterator TChoiceCI;
    typedef pair<TChoiceCI, TChoiceCI> TChoiceEqualRange;
    TChoiceEqualRange cds_equal_range =
        choiceToFeatMap.equal_range(CSeqFeatData::e_Cdregion);
    if( cds_equal_range.first == cds_equal_range.second )
    {
        // nothing to do if there are no CDSs
        return;
    }

    // load mappings from locus or locus-tag to gene
    // (multimaps, because several genes may share a locus or a locus-tag)
    typedef multimap<string, SFeatAndLineNum> TStringToGeneAndLineMap;
    TStringToGeneAndLineMap locusToGeneAndLineMap;
    TStringToGeneAndLineMap locusTagToGeneAndLineMap;
    const TChoiceEqualRange gene_equal_range =
        choiceToFeatMap.equal_range(CSeqFeatData::e_Gene);
    for( TChoiceCI gene_choice_ci = gene_equal_range.first;
        gene_choice_ci != gene_equal_range.second;
        ++gene_choice_ci )
    {
        SFeatAndLineNum gene_feat_ref_and_line = gene_choice_ci->second;
        const CGene_ref & gene_ref = gene_feat_ref_and_line.m_pFeat->GetData().GetGene();
        if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(gene_ref, Locus) ) {
            locusToGeneAndLineMap.insert(
                TStringToGeneAndLineMap::value_type(
                    gene_ref.GetLocus(), gene_feat_ref_and_line));
        }
        if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(gene_ref, Locus_tag) ) {
            locusTagToGeneAndLineMap.insert(
                TStringToGeneAndLineMap::value_type(
                    gene_ref.GetLocus_tag(), gene_feat_ref_and_line));
        }
    }

    // for each CDS, check for gene conflicts or create genes,
    // depending on various flags
    for( TChoiceCI cds_choice_ci = cds_equal_range.first;
        cds_choice_ci != cds_equal_range.second ; ++cds_choice_ci)
    {
        TFeatConstRef cds_feat_ref = cds_choice_ci->second.m_pFeat;
        const TSeqPos cds_line_num = cds_choice_ci->second.m_uLineNum;

        const CSeq_loc & cds_loc = cds_feat_ref->GetLocation();

        const CGene_ref * pGeneXrefOnCDS = cds_feat_ref->GetGeneXref();
        if( ! pGeneXrefOnCDS ) {
            // no xref, so can't do anything for this CDS
            // (this is NOT an error)
            continue;
        }

        // get all the already-existing genes that
        // this CDS xrefs.  It should be somewhat uncommon for there
        // to be more than one matching gene.
        set<SFeatAndLineNum> matchingGenes;

        // locus / locus-tag named by the xref; empty when not set
        const string locus =
            pGeneXrefOnCDS->IsSetLocus() ?
            pGeneXrefOnCDS->GetLocus() :
            "";

        const string locus_tag =
            pGeneXrefOnCDS->IsSetLocus_tag() ?
            pGeneXrefOnCDS->GetLocus_tag() :
            "";


        {{
            // all the code in this scope is all just for setting up matchingGenes

            typedef TStringToGeneAndLineMap::iterator TStrToGeneCI;
            typedef pair<TStrToGeneCI, TStrToGeneCI> TStrToGeneEqualRange;
            set<SFeatAndLineNum> locusGeneMatches;
            // collect the genes whose locus equals the xref's locus (if any),
            // skipping genes that carry a different locus-tag than the xref
            if( !NStr::IsBlank(locus) ) {
                TStrToGeneEqualRange locus_equal_range =
                    locusToGeneAndLineMap.equal_range(locus);
                for( TStrToGeneCI locus_gene_ci = locus_equal_range.first;
                    locus_gene_ci != locus_equal_range.second;
                    ++locus_gene_ci  )
                {
                    if (!NStr::IsBlank(locus_tag)) {
                        auto gene_feat = locus_gene_ci->second.m_pFeat;
                        if (gene_feat->GetData().GetGene().IsSetLocus_tag() &&
                            gene_feat->GetData().GetGene().GetLocus_tag() != locus_tag) {
                            continue;
                        }
                    }
                    locusGeneMatches.insert(locus_gene_ci->second);
                }
            }
            // collect the genes whose locus-tag equals the xref's locus-tag
            // (if any), skipping genes that carry a different locus than the xref
            set<SFeatAndLineNum> locusTagGeneMatches;
            if( !NStr::IsBlank(locus_tag) ) {
                TStrToGeneEqualRange locus_tag_equal_range =
                    locusTagToGeneAndLineMap.equal_range(locus_tag);
                for( TStrToGeneCI locus_tag_gene_ci = locus_tag_equal_range.first;
                     locus_tag_gene_ci != locus_tag_equal_range.second;
                     ++locus_tag_gene_ci )
                {
                    if (!NStr::IsBlank(locus)) {
                        auto gene_feat = locus_tag_gene_ci->second.m_pFeat;
                        if (gene_feat->GetData().GetGene().IsSetLocus() &&
                            gene_feat->GetData().GetGene().GetLocus() != locus) {
                            continue;
                        }
                    }
                    locusTagGeneMatches.insert(locus_tag_gene_ci->second);
                }
            }
            // analyze locusGeneMatches and locusTagGeneMatches to find matchingGenes:
            // if only one of the two sets has entries, that set is the result
            if( locusGeneMatches.empty() ) {
                // swap is faster than assignment
                matchingGenes.swap(locusTagGeneMatches);
            } else if( locusTagGeneMatches.empty() ) {
                // swap is faster than assignment
                matchingGenes.swap(locusGeneMatches);
            } else {
                // get only the genes that match both (that is, the intersection)
                set_intersection(
                    locusGeneMatches.begin(), locusGeneMatches.end(),
                    locusTagGeneMatches.begin(), locusTagGeneMatches.end(),
                    inserter(matchingGenes, matchingGenes.begin()));
            }
        }}

        // if requested (fCDSsMustBeInTheirGenes), check that each matching
        // gene really does contain the CDS

            ITERATE(set<SFeatAndLineNum>, gene_feat_and_line_ci, matchingGenes) {
                const CSeq_loc & gene_loc = gene_feat_and_line_ci->m_pFeat->GetLocation();
                const TSeqPos gene_line_num = gene_feat_and_line_ci->m_uLineNum;

                if ((flags & CFeature_table_reader::fCDSsMustBeInTheirGenes) != 0) {

                    // CDS's loc minus gene's loc should be an empty location
                    // because the CDS should be entirely on the gene
                    CRef<CSeq_loc> pCdsMinusGeneLoc = cds_loc.Subtract(
                        gene_loc, CSeq_loc::fSortAndMerge_All, NULL, NULL);
                    if( pCdsMinusGeneLoc &&
                        ! pCdsMinusGeneLoc->IsNull() &&
                        ! pCdsMinusGeneLoc->IsEmpty() )
                    {
                        // mention the gene's line in the message, unless the
                        // gene has line number 0 (as genes created below do)
                        ILineError::TVecOfLines gene_lines;
                        if( gene_line_num > 0 ) {
                            gene_lines.push_back(gene_line_num);
                        }
                        x_ProcessMsg(
                            cds_line_num,
                            ILineError::eProblem_FeatMustBeInXrefdGene, eDiag_Error,
                            kCdsFeatName,
                            kEmptyStr, kEmptyStr, kEmptyStr,
                            gene_lines );
                    }
                }
            }

        // if requested, create genes for the CDS if there isn't already one
        // (it is NOT an error if the gene is already created)
        if ( (flags & CFeature_table_reader::fCreateGenesFromCDSs) != 0 &&
            matchingGenes.empty() )
        {
            // create the gene: data copied from the xref, location and
            // partial flag taken from the CDS
            CRef<CSeq_feat> pNewGene( new CSeq_feat );
            pNewGene->SetData().SetGene().Assign( *pGeneXrefOnCDS );
            if( FIELD_EQUALS(*cds_feat_ref, Partial, true) ) pNewGene->SetPartial(true);
            pNewGene->SetLocation().Assign( cds_feat_ref->GetLocation() );

            // add the gene to the annot
            _ASSERT( sap->IsFtable() );
            TFtable & the_ftable = sap->SetData().SetFtable();
            the_ftable.push_back(pNewGene);

            // add it to our local information for later CDSs
            // (line number 0, since the gene has no line in the input)
            SFeatAndLineNum  gene_feat_and_line(pNewGene, 0);
            choiceToFeatMap.insert(
                TChoiceToFeatMap::value_type(
                    pNewGene->GetData().Which(), gene_feat_and_line ) );
            if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(*pGeneXrefOnCDS, Locus) ) {
                locusToGeneAndLineMap.insert(
                    TStringToGeneAndLineMap::value_type(
                        pGeneXrefOnCDS->GetLocus(), gene_feat_and_line));
            }
            if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(*pGeneXrefOnCDS, Locus_tag) ) {
                locusTagToGeneAndLineMap.insert(
                    TStringToGeneAndLineMap::value_type(
                        pGeneXrefOnCDS->GetLocus_tag(), gene_feat_and_line));
            }
        }
    } // end of iteration through the CDS's
}
2276 
// Qualifier names whose canonical spelling contains upper-case letters.
// s_FixQualCapitalization() lower-cases every incoming qualifier name and
// then restores the spelling listed here when it matches case-insensitively.
static const string s_QualsWithCaps[] = {
    "EC_number",
    "PCR_conditions",
    "PubMed",
    "STS",
    "ncRNA_class"
};

// Number of entries in s_QualsWithCaps.
static const int s_NumQualsWithCaps =
    sizeof (s_QualsWithCaps) / sizeof (s_QualsWithCaps[0]);
2286 
// Normalize the capitalization of a qualifier name.
//
// The name is lower-cased; if the result matches (ignoring case) one of the
// entries of s_QualsWithCaps, that entry's exact spelling is returned
// instead, otherwise the lower-cased name is returned.
static string s_FixQualCapitalization (const string& qual)
{
    string normalized(qual);
    NStr::ToLower(normalized);

    const string* const table_end = s_QualsWithCaps + s_NumQualsWithCaps;
    for (const string* entry = s_QualsWithCaps; entry != table_end; ++entry) {
        if (NStr::EqualNocase(normalized, *entry)) {
            return *entry;
        }
    }
    return normalized;
}
2299 
2300 
x_AddNoteToFeature(CRef<CSeq_feat> sfp,const string & note)2301 bool CFeatureTableReader_Imp::x_AddNoteToFeature(
2302         CRef<CSeq_feat> sfp,
2303         const string& note)
2304 {
2305     if (sfp.IsNull()) {
2306         return false;
2307     }
2308 
2309     if (NStr::IsBlank(note)) { // Nothing to do
2310         return true;
2311     }
2312 
2313     string comment = (sfp->CanGetComment()) ?
2314         sfp->GetComment() + "; " + note :
2315         note;
2316         sfp->SetComment(comment);
2317     return true;
2318 }
2319 
2320 
x_AddNoteToFeature(CRef<CSeq_feat> sfp,const string & feat_name,const string & qual,const string & val)2321 bool CFeatureTableReader_Imp::x_AddNoteToFeature(
2322     CRef<CSeq_feat> sfp,
2323     const string& feat_name,
2324     const string& qual,
2325     const string& val) {
2326 
2327     if (!x_AddNoteToFeature(sfp, val)) {
2328         return false;
2329     }
2330     // Else convert qualifier to note and issue warning
2331     if (qual != "note") {
2332         string error_message =
2333             qual + " is not a valid qualifier for this feature. Converting to note.";
2334         x_ProcessMsg(
2335         ILineError::eProblem_InvalidQualifier, eDiag_Warning,
2336         feat_name, qual, kEmptyStr, error_message);
2337     }
2338     return true;
2339 }
2340 
x_AddQualifierToFeature(CRef<CSeq_feat> sfp,const string & feat_name,const string & qual,const string & val,const TFlags flags)2341 bool CFeatureTableReader_Imp::x_AddQualifierToFeature (
2342     CRef<CSeq_feat> sfp,
2343     const string &feat_name,
2344     const string& qual,
2345     const string& val,
2346     const TFlags flags
2347 )
2348 
2349 {
2350     CSeqFeatData&          sfdata = sfp->SetData ();
2351     CSeqFeatData::E_Choice featType = sfdata.Which ();
2352 
2353     const CSeqFeatData::EQualifier qual_type =
2354         CSeqFeatData::GetQualifierType(qual);
2355     if( (flags & CFeature_table_reader::fReportDiscouragedKey) != 0 ) {
2356         if( CSeqFeatData::IsDiscouragedQual(qual_type) ) {
2357             x_ProcessMsg(
2358                 ILineError::eProblem_DiscouragedQualifierName,
2359                 eDiag_Warning, feat_name, qual);
2360         }
2361     }
2362 
2363     if (featType == CSeqFeatData::e_Biosrc) {
2364 
2365         TOrgRefMap::const_iterator o_iter = sm_OrgRefKeys.find (qual.c_str ());
2366         if (o_iter != sm_OrgRefKeys.end ()) {
2367             EOrgRef rtype = o_iter->second;
2368             if (x_AddQualifierToBioSrc (sfdata, feat_name, rtype, val)) return true;
2369         } else {
2370 
2371             TSubSrcMap::const_iterator s_iter = sm_SubSrcKeys.find (qual.c_str ());
2372             if (s_iter != sm_SubSrcKeys.end ()) {
2373 
2374                 CSubSource::ESubtype stype = s_iter->second;
2375                 if (x_AddQualifierToBioSrc (sfdata, stype, val)) return true;
2376 
2377             } else {
2378 
2379                 TOrgModMap::const_iterator m_iter = sm_OrgModKeys.find (qual.c_str ());
2380                 if (m_iter != sm_OrgModKeys.end ()) {
2381 
2382                     COrgMod::ESubtype  mtype = m_iter->second;
2383                     if (x_AddQualifierToBioSrc (sfdata, mtype, val)) return true;
2384                 }
2385             }
2386         }
2387         return false;
2388     }
2389 
2390 
2391     // else type != CSeqFeatData::e_Biosrc
2392     string lqual = s_FixQualCapitalization(qual);
2393     TQualMap::const_iterator q_iter = sm_QualKeys.find (lqual.c_str ());
2394     if (q_iter != sm_QualKeys.end ()) {
2395         EQual qtype = q_iter->second;
2396         switch (featType) {
2397             case CSeqFeatData::e_Gene:
2398                 if (x_AddQualifierToGene (sfdata, qtype, val)) return true;
2399                 break;
2400             case CSeqFeatData::e_Cdregion:
2401                 if (x_AddQualifierToCdregion (sfp, sfdata, qtype, val)) return true;
2402                 break;
2403             case CSeqFeatData::e_Rna:
2404                 if (x_AddQualifierToRna (sfp, qtype, val)) return true;
2405                 break;
2406             case CSeqFeatData::e_Imp:
2407                 if (x_AddQualifierToImp (sfp, sfdata, qtype, qual, val)) return true;
2408                 break;
2409             case CSeqFeatData::e_Region:
2410                 if (qtype == eQual_region_name) {
2411                     sfdata.SetRegion (val);
2412                     return true;
2413                 }
2414                 break;
2415             case CSeqFeatData::e_Bond:
2416                 if (qtype == eQual_bond_type) {
2417                     CSeqFeatData::EBond btyp = CSeqFeatData::eBond_other;
2418                     if (CSeqFeatData::GetBondList()->IsBondName(val.c_str(), btyp)) {
2419                         sfdata.SetBond (btyp);
2420                         return true;
2421                     }
2422                 }
2423                 break;
2424             case CSeqFeatData::e_Site:
2425                 if (qtype == eQual_site_type) {
2426                     CSeqFeatData::ESite styp = CSeqFeatData::eSite_other;
2427                     if (CSeqFeatData::GetSiteList()->IsSiteName( val.c_str(), styp)) {
2428                         sfdata.SetSite (styp);
2429                         return true;
2430                     }
2431                 }
2432                 break;
2433             case CSeqFeatData::e_Pub:
2434                 if( qtype == eQual_PubMed ) {
2435                     CRef<CPub> new_pub( new CPub );
2436                     new_pub->SetPmid( CPubMedId( ENTREZ_ID_FROM(long, x_StringToLongNoThrow(val, feat_name, qual)) ) );
2437                     sfdata.SetPub().SetPub().Set().push_back( new_pub );
2438                     return true;
2439                 }
2440                 break;
2441             case CSeqFeatData::e_Prot:
2442                 switch( qtype ) {
2443                 case eQual_product:
2444                     sfdata.SetProt().SetName().push_back( val );
2445                     return true;
2446                 case eQual_function:
2447                     sfdata.SetProt().SetActivity().push_back( val );
2448                     return true;
2449                 case eQual_EC_number:
2450                     sfdata.SetProt().SetEc().push_back( val );
2451                     return true;
2452                 default:
2453                     break;
2454                 }
2455                 break;
2456             default:
2457                 break;
2458             }
2459 
2460         switch (qtype) {
2461             case eQual_pseudo:
2462                 sfp->SetPseudo (true);
2463                 return true;
2464             case eQual_partial:
2465                 sfp->SetPartial (true);
2466                 return true;
2467             case eQual_exception:
2468                 sfp->SetExcept (true);
2469                 sfp->SetExcept_text (val);
2470                 return true;
2471             case eQual_ribosomal_slippage:
2472                 sfp->SetExcept (true);
2473                 sfp->SetExcept_text (qual);
2474                 return true;
2475             case eQual_trans_splicing:
2476                 sfp->SetExcept (true);
2477                 sfp->SetExcept_text (qual);
2478                 return true;
2479             case eQual_evidence:
2480                 if (val == "experimental") {
2481                     sfp->SetExp_ev (CSeq_feat::eExp_ev_experimental);
2482                 } else if (val == "not_experimental" || val == "non_experimental" ||
2483                            val == "not-experimental" || val == "non-experimental") {
2484                     sfp->SetExp_ev (CSeq_feat::eExp_ev_not_experimental);
2485                 }
2486                 return true;
2487             case eQual_note:
2488                     return x_AddNoteToFeature(sfp, val);
2489             case eQual_inference:
2490                 {
2491                     string prefix, remainder;
2492                     CInferencePrefixList::GetPrefixAndRemainder(val, prefix, remainder);
2493                     if (!NStr::IsBlank(prefix)) {
2494                         x_AddGBQualToFeature(sfp, qual, val);
2495                     }
2496                     else {
2497                         x_ProcessMsg(
2498                             ILineError::eProblem_QualifierBadValue, eDiag_Error,
2499                             feat_name, qual, val);
2500                     }
2501                     return true;
2502                 }
2503             case eQual_replace:
2504                 {
2505                     string val_copy = val;
2506                     NStr::ToLower( val_copy );
2507                     x_AddGBQualToFeature (sfp, qual, val_copy );
2508                     return true;
2509                 }
2510             case eQual_allele:
2511             case eQual_bound_moiety:
2512             case eQual_clone:
2513             case eQual_compare:
2514             case eQual_cons_splice:
2515             case eQual_direction:
2516             case eQual_EC_number:
2517             case eQual_estimated_length:
2518             case eQual_experiment:
2519             case eQual_frequency:
2520             case eQual_function:
2521             case eQual_gap_type:
2522             case eQual_insertion_seq:
2523             case eQual_label:
2524             case eQual_linkage_evidence:
2525             case eQual_map:
2526             case eQual_ncRNA_class:
2527             case eQual_number:
2528             case eQual_old_locus_tag:
2529             case eQual_operon:
2530             case eQual_organism:
2531             case eQual_PCR_conditions:
2532             case eQual_phenotype:
2533             case eQual_product:
2534             case eQual_pseudogene:
2535             case eQual_satellite:
2536             case eQual_rpt_family:
2537             case eQual_rpt_type:
2538             case eQual_rpt_unit:
2539             case eQual_rpt_unit_range:
2540             case eQual_rpt_unit_seq:
2541             case eQual_standard_name:
2542             case eQual_tag_peptide:
2543             case eQual_transposon:
2544             case eQual_usedin:
2545             case eQual_cyt_map:
2546             case eQual_gen_map:
2547             case eQual_rad_map:
2548             case eQual_mobile_element_type:
2549                 {
2550                     x_AddGBQualToFeature (sfp, qual, val);
2551                     return true;
2552                 }
2553             case eQual_gene:
2554                 {
2555                     if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2556                         CGene_ref& grp = sfp->SetGeneXref ();
2557                         if (val != "-") {
2558                             grp.SetLocus (val);
2559                         }
2560                         return true;
2561                     }
2562                     // else:
2563                     return x_AddNoteToFeature(sfp, feat_name, qual, val);
2564                 }
2565             case eQual_gene_desc:
2566                 {
2567                     if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2568                         CGene_ref& grp = sfp->SetGeneXref ();
2569                         grp.SetDesc (val);
2570                         return true;
2571                     }
2572                     // else:
2573                     return x_AddNoteToFeature(sfp, feat_name, qual, val);
2574                 }
2575             case eQual_gene_syn:
2576                 {
2577                     if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2578                         CGene_ref& grp = sfp->SetGeneXref ();
2579                         CGene_ref::TSyn& syn = grp.SetSyn ();
2580                         syn.push_back (val);
2581                         return true;
2582                     }
2583                     // else:
2584                     return x_AddNoteToFeature(sfp, feat_name, qual, val);
2585                 }
2586             case eQual_locus_tag:
2587                 {
2588                     if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2589                         CGene_ref& grp = sfp->SetGeneXref ();
2590                         grp.SetLocus_tag (val);
2591                         return true;
2592                     }
2593                     // else:
2594                     return x_AddNoteToFeature(sfp, feat_name, qual, val);
2595                 }
2596             case eQual_db_xref:
2597                 {
2598                     CTempString db, tag;
2599                     if (NStr::SplitInTwo (val, ":", db, tag)) {
2600                         CSeq_feat::TDbxref& dblist = sfp->SetDbxref ();
2601                         CRef<CDbtag> dbt (new CDbtag);
2602                         dbt->SetDb (db);
2603                         CRef<CObject_id> oid (new CObject_id);
2604                         static const char* digits = "0123456789";
2605                         if (tag.find_first_not_of(digits) == string::npos && !NStr::IsBlank(tag))
2606                             oid->SetId(NStr::StringToLong(tag));
2607                         else
2608                             oid->SetStr(tag);
2609                         dbt->SetTag (*oid);
2610                         dblist.push_back (dbt);
2611                         return true;
2612                     }
2613                     return true;
2614                 }
2615             case eQual_nomenclature:
2616                 {
2617                     /* !!! need to implement !!! */
2618                     return true;
2619                 }
2620             case eQual_go_component:
2621             case eQual_go_function:
2622             case eQual_go_process:
2623                 if (featType == CSeqFeatData::e_Gene ||
2624                     featType == CSeqFeatData::e_Cdregion ||
2625                     featType == CSeqFeatData::e_Rna) {
2626                     try {
2627                         CReadUtil::AddGeneOntologyTerm(*sfp, qual, val);
2628                     }
2629                     catch( ILineError& err) {
2630                         x_ProcessMsg(
2631                             err.Problem(),
2632                             err.Severity(),
2633                             feat_name, qual, val,
2634                             err.ErrorMessage());
2635                     }
2636                     //rw-621: throw out the faulty qualifier but retain the rest of the feature.
2637                     return true;
2638                 }
2639                 return false;
2640             case eQual_transcript_id:
2641                 {
2642                     if (featType == CSeqFeatData::e_Rna &&
2643                         sfdata.GetRna().GetType() == CRNA_ref::eType_mRNA) {
2644                         CBioseq::TId ids;
2645                         try {
2646                             CSeq_id::ParseIDs(ids, val,
2647                                 CSeq_id::fParse_ValidLocal
2648                             |   CSeq_id::fParse_PartialOK);
2649                         }
2650                         catch (CSeqIdException&)
2651                         {
2652                             x_ProcessMsg(
2653                                 ILineError::eProblem_QualifierBadValue, eDiag_Error,
2654                                 feat_name, qual, val,
2655                                 "Invalid transcript_id  : " + val);
2656                             return true;
2657                         }
2658 
2659                         for (const auto& id : ids) {
2660                             auto id_string = id->GetSeqIdString(true);
2661                             auto res = m_ProcessedTranscriptIds.insert(id_string);
2662                             if (res.second == false) { // Insertion failed because Seq-id already encountered
2663                                 x_ProcessMsg(
2664                                     ILineError::eProblem_DuplicateIDs, eDiag_Error,
2665                                     feat_name, qual, val,
2666                                     "Transcript ID " + id_string + " appears on multiple mRNA features"
2667                                 );
2668                             }
2669                         }
2670                     }
2671                     x_AddGBQualToFeature(sfp, qual, val);
2672                     return true;
2673                 }
2674             case eQual_protein_id:
2675                 // see SQD-1535 and SQD-3496
2676                 if (featType == CSeqFeatData::e_Cdregion ||
2677                     (featType == CSeqFeatData::e_Rna &&
2678                     sfdata.GetRna().GetType() == CRNA_ref::eType_mRNA) ||
2679                     (featType == CSeqFeatData::e_Prot &&
2680                      sfdata.GetProt().IsSetProcessed() &&
2681                      sfdata.GetProt().GetProcessed() == CProt_ref::eProcessed_mature))
2682                 {
2683                     CBioseq::TId ids;
2684                     try {
2685                         CSeq_id::ParseIDs(ids, val,
2686                                 CSeq_id::fParse_ValidLocal |
2687                                 CSeq_id::fParse_PartialOK);
2688                     }
2689                     catch (CSeqIdException&)
2690                     {
2691                         x_ProcessMsg(
2692                                 ILineError::eProblem_QualifierBadValue, eDiag_Error,
2693                                 feat_name, qual, val,
2694                                 "Invalid protein_id  : " + val);
2695                         return true;
2696                     }
2697 
2698                     if (featType == CSeqFeatData::e_Cdregion) {
2699                         for (const auto& id : ids) {
2700                             auto id_string = id->GetSeqIdString(true);
2701                             auto res = m_ProcessedProteinIds.insert(id_string);
2702                             if (res.second == false) { // Insertion failed because Seq-id already encountered
2703                                 x_ProcessMsg(
2704                                     ILineError::eProblem_DuplicateIDs, eDiag_Error,
2705                                     feat_name, qual, val,
2706                                     "Protein ID " + id_string + " appears on multiple CDS features"
2707                                 );
2708                             }
2709                         }
2710                     }
2711 
2712                     if (featType != CSeqFeatData::e_Rna) { // mRNA only has a protein_id qualifier
2713                         auto pBestId = GetBestId(ids);
2714                         if (pBestId) {
2715                             sfp->SetProduct().SetWhole(*pBestId);
2716                         }
2717                     }
2718                 }
2719 
2720                 if (featType != CSeqFeatData::e_Prot) { // Mat-peptide has an instantiated product, but no qualifier
2721                     x_AddGBQualToFeature(sfp, qual, val);
2722                 }
2723                 return true;
2724             case eQual_regulatory_class:
2725                 // This should've been handled up in x_AddQualifierToImp
2726                 // so it's always a bad value to be here
2727                 x_ProcessMsg(
2728                     ILineError::eProblem_QualifierBadValue, eDiag_Error,
2729                     feat_name, qual, val );
2730                 return true;
2731             default:
2732                 break;
2733         }
2734     }
2735     return false;
2736 }
2737 
x_IsWebComment(CTempString line)2738 bool CFeatureTableReader_Imp::x_IsWebComment(CTempString line)
2739 {
2740     // This function is testing for a match against the following regular
2741     // expression, but we avoid actual regexps for max speed:
2742     // "^(===================================================================| INFO:| WARNING:| ERROR:).*"
2743 
2744     // (that magic number is the size of the smallest possible match)
2745     if( line.length() < 6 ) {
2746         return false;
2747     }
2748 
2749     if( line[0] == '=' ) {
2750         static const CTempString kAllEqualsMatch =
2751             "===================================================================";
2752         if( NStr::StartsWith(line, kAllEqualsMatch) ) {
2753             return true;
2754         }
2755     } else if( line[0] == ' ') {
2756         switch(line[1]) {
2757         case 'I':
2758             {
2759                 static const CTempString kInfo = " INFO:";
2760                 if( NStr::StartsWith(line, kInfo) ) {
2761                     return true;
2762                 }
2763             }
2764             break;
2765         case 'W':
2766             {
2767                 static const CTempString kWarning = " WARNING:";
2768                 if( NStr::StartsWith(line, kWarning) ) {
2769                     return true;
2770                 }
2771             }
2772             break;
2773         case 'E':
2774             {
2775                 static const CTempString kError = " ERROR:";
2776                 if( NStr::StartsWith(line, kError) ) {
2777                     return true;
2778                 }
2779             }
2780             break;
2781         default:
2782             // no match
2783             break;
2784         }
2785     }
2786 
2787     // no match
2788     return false;
2789 }
2790 
x_AddIntervalToFeature(CTempString strFeatureName,CRef<CSeq_feat> & sfp,const SFeatLocInfo & loc_info)2791 bool CFeatureTableReader_Imp::x_AddIntervalToFeature(
2792     CTempString strFeatureName,
2793     CRef<CSeq_feat>& sfp,
2794     const SFeatLocInfo& loc_info
2795 )
2796 
2797 {
2798 
2799     auto start = loc_info.start_pos;
2800     auto stop = loc_info.stop_pos;
2801 
2802     const Int4 orig_start = start;
2803     CSeq_interval::TStrand strand = eNa_strand_plus;
2804 
2805     if (start > stop) {
2806         swap(start, stop);
2807         strand = eNa_strand_minus;
2808     }
2809     if (loc_info.is_minus_strand) {
2810         strand = eNa_strand_minus;
2811     }
2812 
2813     // construct loc, which will be added to the mix
2814     CSeq_loc_mix::Tdata & mix_set = sfp->SetLocation().SetMix();
2815     CRef<CSeq_loc> loc(new CSeq_loc);
2816     if (loc_info.is_point || start == stop ) {
2817         // a point of some kind
2818         if (mix_set.empty())
2819            m_need_check_strand = true;
2820         else
2821            x_GetPointStrand(*sfp, strand);
2822 
2823         // note usage of orig_start instead of start
2824         // because we want the first part of the point
2825         // specified in the file, not the smallest because SetRightOf
2826         // works differently for plus vs. minus strand
2827         CRef<CSeq_point> pPoint(
2828             new CSeq_point(*m_seq_id, orig_start, strand) );
2829         if( loc_info.is_point ) {
2830             // between two bases
2831             pPoint->SetRightOf (true);
2832             // warning if stop is not start plus one
2833             if( stop != (start+1) ) {
2834                 x_ProcessMsg(
2835                     ILineError::eProblem_BadFeatureInterval, eDiag_Warning,
2836                     strFeatureName );
2837             }
2838         } else {
2839             // just a point. do nothing
2840         }
2841 
2842         if (loc_info.is_5p_partial) {
2843             pPoint->SetPartialStart (true, eExtreme_Biological);
2844         }
2845         if (loc_info.is_3p_partial) {
2846             pPoint->SetPartialStop (true, eExtreme_Biological);
2847         }
2848 
2849         loc->SetPnt( *pPoint );
2850     } else {
2851         // interval
2852         CRef<CSeq_interval> pIval( new CSeq_interval(*m_seq_id, start, stop, strand) );
2853         if (loc_info.is_5p_partial) {
2854             pIval->SetPartialStart (true, eExtreme_Biological);
2855         }
2856         if (loc_info.is_3p_partial) {
2857             pIval->SetPartialStop (true, eExtreme_Biological);
2858         }
2859         loc->SetInt(*pIval);
2860         if (m_need_check_strand)
2861         {
2862             x_UpdatePointStrand(*sfp, strand);
2863             m_need_check_strand = false;
2864         }
2865     }
2866 
2867     // check for internal partials
2868     if( ! mix_set.empty() ) {
2869         const CSeq_loc & last_loc = *mix_set.back();
2870         if( last_loc.IsPartialStop(eExtreme_Biological) ||
2871             loc->IsPartialStart(eExtreme_Biological) )
2872         {
2873             // internal partials
2874             x_ProcessMsg(ILineError::eProblem_InternalPartialsInFeatLocation,
2875                 eDiag_Warning, strFeatureName );
2876         }
2877     }
2878 
2879     mix_set.push_back(loc);
2880 
2881 
2882     if (loc_info.is_5p_partial || loc_info.is_3p_partial) {
2883         sfp->SetPartial (true);
2884     }
2885 
2886     return true;
2887 }
2888 
2889 
2890 
x_SetupSeqFeat(CRef<CSeq_feat> sfp,const string & feat,const TFlags flags,ITableFilter * filter)2891 bool CFeatureTableReader_Imp::x_SetupSeqFeat (
2892     CRef<CSeq_feat> sfp,
2893     const string& feat,
2894     const TFlags flags,
2895     ITableFilter *filter
2896 )
2897 
2898 {
2899     if (feat.empty ()) return false;
2900 
2901     // check filter, if any
2902     if( NULL != filter ) {
2903         ITableFilter::EAction action = filter->GetFeatAction(feat);
2904         if( action != ITableFilter::eAction_Okay ) {
2905             x_ProcessMsg(
2906                 ILineError::eProblem_FeatureNameNotAllowed,
2907                 eDiag_Warning, feat );
2908             if( action == ITableFilter::eAction_Disallowed ) {
2909                 return false;
2910             }
2911         }
2912     }
2913 
2914     CSeqFeatData::ESubtype sbtyp = CSeqFeatData::SubtypeNameToValue(feat);
2915     if (sbtyp != CSeqFeatData::eSubtype_bad) {
2916 
2917         // populate *sfp here...
2918 
2919         CSeqFeatData::E_Choice typ = CSeqFeatData::GetTypeFromSubtype (sbtyp);
2920         sfp->SetData ().Select (typ);
2921         CSeqFeatData& sfdata = sfp->SetData ();
2922 
2923         if (typ == CSeqFeatData::e_Rna) {
2924             CRNA_ref& rrp = sfdata.SetRna ();
2925             CRNA_ref::EType rnatyp = CRNA_ref::eType_unknown;
2926             switch (sbtyp) {
2927             case CSeqFeatData::eSubtype_preRNA :
2928                 rnatyp = CRNA_ref::eType_premsg;
2929                 break;
2930             case CSeqFeatData::eSubtype_mRNA :
2931                 rnatyp = CRNA_ref::eType_mRNA;
2932                 break;
2933             case CSeqFeatData::eSubtype_tRNA :
2934                 rnatyp = CRNA_ref::eType_tRNA;
2935                 break;
2936             case CSeqFeatData::eSubtype_rRNA :
2937                 rnatyp = CRNA_ref::eType_rRNA;
2938                 break;
2939             case CSeqFeatData::eSubtype_snRNA :
2940                 rnatyp = CRNA_ref::eType_ncRNA;
2941                 rrp.SetExt().SetGen().SetClass("snRNA");
2942                 break;
2943             case CSeqFeatData::eSubtype_scRNA :
2944                 rnatyp = CRNA_ref::eType_ncRNA;
2945                 rrp.SetExt().SetGen().SetClass("scRNA");
2946                 break;
2947             case CSeqFeatData::eSubtype_snoRNA :
2948                 rnatyp = CRNA_ref::eType_ncRNA;
2949                 rrp.SetExt().SetGen().SetClass("snoRNA");
2950                 break;
2951             case CSeqFeatData::eSubtype_ncRNA :
2952                 rnatyp = CRNA_ref::eType_ncRNA;
2953                 rrp.SetExt().SetGen();
2954                 break;
2955             case CSeqFeatData::eSubtype_tmRNA :
2956                 rnatyp = CRNA_ref::eType_tmRNA;
2957                 rrp.SetExt().SetGen();
2958                 break;
2959             case CSeqFeatData::eSubtype_otherRNA :
2960                 rrp.SetExt().SetName("misc_RNA");
2961                 rnatyp = CRNA_ref::eType_other;
2962                 break;
2963             default :
2964                 break;
2965             }
2966             rrp.SetType (rnatyp);
2967 
2968         } else if (typ == CSeqFeatData::e_Imp) {
2969             CImp_feat_Base& imp = sfdata.SetImp ();
2970             imp.SetKey (feat);
2971 
2972         } else if (typ == CSeqFeatData::e_Bond) {
2973             sfdata.SetBond (CSeqFeatData::eBond_other);
2974 
2975         } else if (typ == CSeqFeatData::e_Site) {
2976             sfdata.SetSite (CSeqFeatData::eSite_other);
2977         } else if (typ == CSeqFeatData::e_Prot ) {
2978             CProt_ref &prot_ref = sfdata.SetProt();
2979             switch (sbtyp) {
2980                 default:
2981                     break;
2982                 case CSeqFeatData::eSubtype_mat_peptide_aa:
2983                     prot_ref.SetProcessed(CProt_ref::eProcessed_mature);
2984                     break;
2985                 case CSeqFeatData::eSubtype_sig_peptide_aa:
2986                     prot_ref.SetProcessed(CProt_ref::eProcessed_signal_peptide);
2987                     break;
2988                 case CSeqFeatData::eSubtype_preprotein:
2989                     prot_ref.SetProcessed(CProt_ref::eProcessed_preprotein);
2990                     break;
2991                 case CSeqFeatData::eSubtype_transit_peptide_aa:
2992                     prot_ref.SetProcessed(CProt_ref::eProcessed_transit_peptide);
2993                     break;
2994                 case CSeqFeatData::eSubtype_propeptide_aa:
2995                     prot_ref.SetProcessed(CProt_ref::eProcessed_propeptide);
2996                     break;
2997             }
2998         }
2999 
3000         // check for discouraged feature name
3001         if( (flags & CFeature_table_reader::fReportDiscouragedKey) != 0 ) {
3002             if( CSeqFeatData::IsDiscouragedSubtype(sbtyp) ) {
3003                 x_ProcessMsg(
3004                     ILineError::eProblem_DiscouragedFeatureName,
3005                     eDiag_Warning, feat);
3006             }
3007         }
3008 
3009         return true;
3010     }
3011 
3012     // unrecognized feature key
3013 
3014     if ((flags & CFeature_table_reader::fReportBadKey) != 0) {
3015         x_ProcessMsg(ILineError::eProblem_UnrecognizedFeatureName, eDiag_Warning, feat );
3016     }
3017 
3018     if ((flags & CFeature_table_reader::fTranslateBadKey) != 0) {
3019 
3020         sfp->SetData ().Select (CSeqFeatData::e_Imp);
3021         CSeqFeatData& sfdata = sfp->SetData ();
3022         CImp_feat_Base& imp = sfdata.SetImp ();
3023         imp.SetKey ("misc_feature");
3024         x_AddQualifierToFeature (sfp, kEmptyStr, "standard_name", feat, flags);
3025 
3026         return true;
3027 
3028     } else if ((flags & CFeature_table_reader::fKeepBadKey) != 0) {
3029 
3030         sfp->SetData ().Select (CSeqFeatData::e_Imp);
3031         CSeqFeatData& sfdata = sfp->SetData ();
3032         CImp_feat_Base& imp = sfdata.SetImp ();
3033         imp.SetKey (feat);
3034 
3035         return true;
3036     }
3037 
3038     return false;
3039 }
3040 
x_ProcessMsg(ILineError::EProblem eProblem,EDiagSev eSeverity,const string & strFeatureName,const string & strQualifierName,const string & strQualifierValue,const string & strErrorMessage,const ILineError::TVecOfLines & vecOfOtherLines)3041 void CFeatureTableReader_Imp::x_ProcessMsg(
3042     ILineError::EProblem eProblem,
3043     EDiagSev eSeverity,
3044     const string& strFeatureName,
3045     const string& strQualifierName,
3046     const string& strQualifierValue,
3047     const string& strErrorMessage,
3048     const ILineError::TVecOfLines & vecOfOtherLines)
3049 {
3050     x_ProcessMsg(m_reader ? m_reader->GetLineNumber() : m_LineNumber,
3051         eProblem,
3052         eSeverity,
3053         strFeatureName,
3054         strQualifierName,
3055         strQualifierValue,
3056         strErrorMessage,
3057         vecOfOtherLines);
3058 }
3059 
3060 
x_ProcessMsg(int line_num,ILineError::EProblem eProblem,EDiagSev eSeverity,const string & strFeatureName,const string & strQualifierName,const string & strQualifierValue,const string & strErrorMessage,const ILineError::TVecOfLines & vecOfOtherLines)3061 void CFeatureTableReader_Imp::x_ProcessMsg(
3062     int line_num,
3063     ILineError::EProblem eProblem,
3064     EDiagSev eSeverity,
3065     const string & strFeatureName,
3066     const string & strQualifierName,
3067     const string & strQualifierValue,
3068     const string& strErrorMessage,
3069     const ILineError::TVecOfLines & vecOfOtherLines )
3070 {
3071 
3072     if (!m_pMessageListener) {
3073         return;
3074     }
3075 
3076     AutoPtr<CObjReaderLineException> pErr (
3077         CObjReaderLineException::Create(
3078         eSeverity, line_num, strErrorMessage, eProblem, m_real_seqid, strFeatureName,
3079         strQualifierName, strQualifierValue));
3080     ITERATE( ILineError::TVecOfLines, line_it, vecOfOtherLines ) {
3081         pErr->AddOtherLine(*line_it);
3082     }
3083 
3084     if (!m_pMessageListener->PutError(*pErr)) {
3085         pErr->Throw();
3086     }
3087 }
3088 
3089 
PutProgress(const CTempString & seq_id,const unsigned int line_number,ILineErrorListener * pListener)3090 void CFeatureTableReader_Imp::PutProgress(
3091     const CTempString& seq_id,
3092     const unsigned int line_number,
3093     ILineErrorListener* pListener)
3094 {
3095     if (!pListener) {
3096         return;
3097     }
3098 
3099     string msg = "Seq-id " + seq_id + ", line " + NStr::IntToString(line_number);
3100     pListener->PutProgress(msg);
3101 }
3102 
3103 
3104 // helper for CFeatureTableReader_Imp::ReadSequinFeatureTable,
3105 // just so we don't forget a step when we reset the feature
3106 //
x_ResetFeat(CRef<CSeq_feat> & sfp,bool & curr_feat_intervals_done)3107 void CFeatureTableReader_Imp::x_ResetFeat(CRef<CSeq_feat> & sfp, bool & curr_feat_intervals_done)
3108 {
3109     m_need_check_strand = false;
3110     sfp.Reset(new CSeq_feat);
3111     //sfp->ResetLocation();
3112     curr_feat_intervals_done = false;
3113 }
3114 
x_GetPointStrand(const CSeq_feat & feat,CSeq_interval::TStrand & strand) const3115 void CFeatureTableReader_Imp::x_GetPointStrand(const CSeq_feat& feat, CSeq_interval::TStrand& strand) const
3116 {
3117     if (feat.IsSetLocation() && feat.GetLocation().IsMix())
3118     {
3119         const CSeq_loc& last = *feat.GetLocation().GetMix().Get().back();
3120         if (last.IsInt() && last.GetInt().IsSetStrand())
3121         {
3122             strand = last.GetInt().GetStrand();
3123         }
3124         else
3125         if (last.IsPnt() && last.GetPnt().IsSetStrand())
3126         {
3127             strand = last.GetPnt().GetStrand();
3128         }
3129     }
3130 }
3131 
x_UpdatePointStrand(CSeq_feat & feat,CSeq_interval::TStrand strand) const3132 void CFeatureTableReader_Imp::x_UpdatePointStrand(CSeq_feat& feat, CSeq_interval::TStrand strand) const
3133 {
3134     if (feat.IsSetLocation() && feat.GetLocation().IsMix())
3135     {
3136 
3137         for (auto pSeqLoc : feat.SetLocation().SetMix().Set()) {
3138             if (pSeqLoc->IsPnt()) {
3139                 auto& seq_point = pSeqLoc->SetPnt();
3140                 const auto old_strand =
3141                     seq_point.IsSetStrand() ?
3142                     seq_point.GetStrand() :
3143                     eNa_strand_plus;
3144 
3145                     seq_point.SetStrand(strand);
3146                     if (old_strand != strand) {
3147                         const bool is_5p_partial = seq_point.IsPartialStop(eExtreme_Biological);
3148                         const bool is_3p_partial = seq_point.IsPartialStart(eExtreme_Biological);
3149                         seq_point.SetPartialStart(is_5p_partial, eExtreme_Biological);
3150                         seq_point.SetPartialStop(is_3p_partial, eExtreme_Biological);
3151                     }
3152             }
3153         }
3154     }
3155 }
3156 
3157 
x_FinishFeature(CRef<CSeq_feat> & feat,TFtable & ftable)3158 void CFeatureTableReader_Imp::x_FinishFeature(CRef<CSeq_feat>& feat,
3159                                               TFtable& ftable)
3160 {
3161     if ( !feat ||
3162          feat.Empty() ||
3163          !feat->IsSetData() ||
3164          (feat->GetData().Which() == CSeqFeatData::e_not_set) )
3165     {
3166         return;
3167     }
3168 
3169     // Check for missing publication - RW-626
3170     if (feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_pub &&
3171         (!feat->SetData().SetPub().IsSetPub() ||
3172           feat->SetData().SetPub().GetPub().Get().empty())) {
3173         const int line_number = m_reader->AtEOF() ?
3174                                 m_reader->GetLineNumber() :
3175                                 m_reader->GetLineNumber()-1;
3176 
3177         string msg = "Reference feature is empty. Skipping feature.";
3178 
3179         x_ProcessMsg(line_number,
3180                      ILineError::eProblem_IncompleteFeature,
3181                      eDiag_Warning,
3182                      "Reference",
3183                      kEmptyStr,
3184                      kEmptyStr,
3185                      msg);
3186             return;
3187     }
3188 
3189     if (feat->IsSetLocation() && feat->GetLocation().IsMix())
3190     {
3191         if (feat->GetLocation().GetMix().Get().empty()) {
3192             // turn empty seqlocmix into a null seq-loc
3193             feat->SetLocation().SetNull();
3194         }
3195         else
3196         if (feat->GetLocation().GetMix().Get().size() == 1) {
3197             // demote 1-part seqlocmixes to seq-loc with just that part
3198             CRef<CSeq_loc> keep_loc = *feat->SetLocation().SetMix().Set().begin();
3199             feat->SetLocation(*keep_loc);
3200         }
3201     }
3202     ftable.push_back(feat);
3203 }
3204 
3205 
3206 
x_ProcessQualifier(const string & qual_name,const string & qual_val,const string & feat_name,CRef<CSeq_feat> feat,TFlags flags)3207 void CFeatureTableReader_Imp::x_ProcessQualifier(const string& qual_name,
3208                                                  const string& qual_val,
3209                                                  const string& feat_name,
3210                                                  CRef<CSeq_feat> feat,
3211                                                  TFlags flags)
3212 {
3213     if (NStr::IsBlank(qual_name)) {
3214         return;
3215     }
3216 
3217     if (!feat) {
3218         if ( flags & CFeature_table_reader::fReportBadKey ) {
3219             x_ProcessMsg(ILineError::eProblem_QualifierWithoutFeature,
3220                         eDiag_Warning, kEmptyStr, qual_name, qual_val);
3221         }
3222         return;
3223     }
3224 
3225     if (NStr::IsBlank(qual_val)) {
3226         if (sc_SingleKeys.find(qual_name.c_str()) != sc_SingleKeys.end()) {
3227             x_AddQualifierToFeature(feat, feat_name, qual_name, qual_val, flags);
3228         }
3229         else {
3230             x_ProcessMsg(ILineError::eProblem_QualifierBadValue,
3231                          eDiag_Warning, feat_name, qual_name);
3232         }
3233         return;
3234     }
3235 
3236     // else qual_name and qual_val are not blank
3237     if (!x_AddQualifierToFeature(feat, feat_name, qual_name, qual_val, flags)) {
3238         if (flags & CFeature_table_reader::fReportBadKey) {
3239             x_ProcessMsg(ILineError::eProblem_UnrecognizedQualifierName,
3240                          eDiag_Warning, feat_name, qual_name, qual_val);
3241         }
3242 
3243         if (flags & CFeature_table_reader::fKeepBadKey) {
3244             x_AddGBQualToFeature(feat, qual_name, qual_val);
3245         }
3246     }
3247 }
3248 
3249 
3250 
ReadSequinFeatureTable(const CTempString & in_seqid,const CTempString & in_annotname,const TFlags flags,ITableFilter * filter)3251 CRef<CSeq_annot> CFeatureTableReader_Imp::ReadSequinFeatureTable (
3252     const CTempString& in_seqid,
3253     const CTempString& in_annotname,
3254     const TFlags flags,
3255     ITableFilter *filter
3256 )
3257 {
3258     string feat, qual, qual_value;
3259     string curr_feat_name;
3260    // Int4 start, stop;
3261     //bool partial5, partial3, ispoint, isminus,
3262 
3263     bool ignore_until_next_feature_key = false;
3264     Int4 offset = 0;
3265     SFeatLocInfo loc_info;
3266 
3267     CRef<CSeq_annot> sap(new CSeq_annot);
3268 
3269     TFtable& ftable = sap->SetData().SetFtable();
3270     const bool bIgnoreWebComments =
3271         ( (flags & CFeature_table_reader::fIgnoreWebComments) != 0 );
3272 
3273     // if sequence ID is a list, use just one sequence ID string
3274     x_InitId(in_seqid, flags);
3275 
3276     // Use this to efficiently find the best CDS for a prot feature
3277     // (only add CDS's for it to work right)
3278     CBestFeatFinder best_CDS_finder;
3279 
3280     // map feature types to features
3281     TChoiceToFeatMap choiceToFeatMap;
3282 
3283     CRef<CSeq_feat> sfp;
3284     // This is true once this feature should not
3285     // have any more intervals.
3286     // This allows us to catch errors like the following:
3287     //
3288     //
3289     //>Feature lcl|Seq1
3290     //1	1008	CDS
3291     //			gene    THE_GENE_NAME
3292     //50	200
3293     //			product THE_GENE_PRODUCT
3294     bool curr_feat_intervals_done = false;
3295 
3296     if (! in_annotname.empty ()) {
3297       CAnnot_descr& descr = sap->SetDesc ();
3298       CRef<CAnnotdesc> annot(new CAnnotdesc);
3299       annot->SetName (in_annotname);
3300       descr.Set().push_back (annot);
3301     }
3302 
3303     while ( !m_reader->AtEOF() ) {
3304 
3305         CTempString line = *++(*m_reader);
3306 
3307         if( m_reader->GetLineNumber() % 10000 == 0 &&
3308             m_reader->GetLineNumber() > 0 )
3309         {
3310             PutProgress(m_real_seqid, m_reader->GetLineNumber(), m_pMessageListener);
3311         }
3312 
3313         // skip empty lines.
3314         // if requested, also skip webcomment lines
3315         if( line.empty () || (bIgnoreWebComments && x_IsWebComment(line) ) ) {
3316             continue;
3317         }
3318 
3319         // if next line is a new feature table, return current sap
3320         CTempStringEx dummy1, dummy2;
3321         if( ParseInitialFeatureLine(line, dummy1, dummy2) ) {
3322             m_reader->UngetLine(); // we'll get this feature line the next time around
3323             break;
3324         }
3325 
3326         if (line [0] == '[') {
3327 
3328             // try to parse it as an offset
3329             if( x_TryToParseOffset(line, offset) ) {
3330                 // okay, known command
3331             } else {
3332                 // warn for unknown square-bracket commands
3333                 x_ProcessMsg(
3334                     ILineError::eProblem_UnrecognizedSquareBracketCommand,
3335                     eDiag_Warning);
3336             }
3337 
3338         } else if ( s_LineIndicatesOrder(line) ) {
3339 
3340             // put nulls between feature intervals
3341             CRef<CSeq_loc> loc_with_nulls = s_LocationJoinToOrder( sfp->GetLocation() );
3342             // loc_with_nulls is unset if no change was needed
3343             if( loc_with_nulls ) {
3344                 sfp->SetLocation( *loc_with_nulls );
3345             }
3346 
3347         } else if (x_ParseFeatureTableLine (line, loc_info, feat, qual, qual_value, offset)) {
3348             // process line in feature table
3349 
3350             replace( qual_value.begin(), qual_value.end(), '\"', '\'' );
3351 
3352             if ((! feat.empty ()) && loc_info.start_pos >= 0 && loc_info.stop_pos >= 0) {
3353 
3354                 // process start - stop - feature line
3355 
3356                 x_FinishFeature(sfp, ftable);
3357                 x_ResetFeat( sfp, curr_feat_intervals_done );
3358 
3359                 if (x_SetupSeqFeat (sfp, feat, flags, filter)) {
3360 
3361                     // figure out type of feat, and store in map for later use
3362                     CSeqFeatData::E_Choice eChoice = CSeqFeatData::e_not_set;
3363                     if( sfp->CanGetData() ) {
3364                         eChoice = sfp->GetData().Which();
3365                     }
3366                     choiceToFeatMap.insert(
3367                         TChoiceToFeatMap::value_type(
3368                         eChoice,
3369                         SFeatAndLineNum(sfp, m_reader->GetLineNumber())));
3370 
3371                     // if new feature is a CDS, remember it for later lookups
3372                     if( eChoice == CSeqFeatData::e_Cdregion ) {
3373                         best_CDS_finder.AddFeat( *sfp );
3374                     }
3375 
3376                     // and add first interval
3377                     x_AddIntervalToFeature (curr_feat_name, sfp, loc_info);
3378 
3379                     ignore_until_next_feature_key = false;
3380 
3381                     curr_feat_name = feat;
3382 
3383                 } else {
3384 
3385                     // bad feature, set ignore flag
3386 
3387                     ignore_until_next_feature_key = true;
3388                 }
3389 
3390             } else if (ignore_until_next_feature_key) {
3391 
3392                 // bad feature was found before, so ignore
3393                 // qualifiers until next feature key
3394 
3395             }
3396             else
3397             if (loc_info.start_pos >= 0 &&
3398                 loc_info.stop_pos >= 0 &&
3399                 feat.empty () &&
3400                 qual.empty () &&
3401                 qual_value.empty ()) {
3402 
3403                 if( curr_feat_intervals_done ) {
3404                     // the feat intervals were done, so it's an error for there to be more intervals
3405                     x_ProcessMsg(ILineError::eProblem_NoFeatureProvidedOnIntervals, eDiag_Error);
3406                     // this feature is in bad shape, so we ignore the rest of it
3407                     ignore_until_next_feature_key = true;
3408                     x_ResetFeat(sfp, curr_feat_intervals_done);
3409                 } else if (sfp  &&  sfp->IsSetLocation()  &&  sfp->GetLocation().IsMix()) {
3410                     // process start - stop multiple interval line
3411                     x_AddIntervalToFeature (curr_feat_name, sfp, loc_info);
3412                                            // start, stop, partial5, partial3, ispoint, isminus);
3413                 } else {
3414                     if ((flags & CFeature_table_reader::fReportBadKey) != 0) {
3415                         x_ProcessMsg(ILineError::eProblem_NoFeatureProvidedOnIntervals,
3416                             eDiag_Warning);
3417                     }
3418                 }
3419 
3420             } else if (!NStr::IsBlank(qual)) {
3421               curr_feat_intervals_done = true;
3422               x_ProcessQualifier(qual, qual_value, curr_feat_name, sfp, flags);
3423             }
3424             else if (!feat.empty()) {
3425 
3426                 // unrecognized location
3427 
3428                 // there should no more ranges for this feature
3429                 // (although there still can be ranges for quals, of course).
3430                 curr_feat_intervals_done = true;
3431 
3432                 if ((flags & CFeature_table_reader::fReportBadKey) != 0) {
3433                     x_ProcessMsg(
3434                         ILineError::eProblem_FeatureBadStartAndOrStop, eDiag_Warning,
3435                         feat );
3436                 }
3437             }
3438         }
3439     }
3440 
3441     // make sure last feature is finished
3442     x_FinishFeature(sfp, ftable);
3443     x_ResetFeat( sfp, curr_feat_intervals_done );
3444 
3445     if ((flags & CFeature_table_reader::fCreateGenesFromCDSs) != 0 ||
3446         (flags & CFeature_table_reader::fCDSsMustBeInTheirGenes) != 0 )
3447     {
3448         x_CreateGenesFromCDSs(sap, choiceToFeatMap, flags);
3449     }
3450     return sap;
3451 }
3452 
3453 
CreateSeqFeat(const string & feat,CSeq_loc & location,const TFlags flags,const string & seq_id,ITableFilter * filter)3454 CRef<CSeq_feat> CFeatureTableReader_Imp::CreateSeqFeat (
3455     const string& feat,
3456     CSeq_loc& location,
3457     const TFlags flags,
3458     const string &seq_id,
3459     ITableFilter *filter
3460 )
3461 
3462 {
3463     CRef<CSeq_feat> sfp (new CSeq_feat);
3464 
3465     sfp->ResetLocation ();
3466 
3467     if ( ! x_SetupSeqFeat (sfp, feat, flags, filter) ) {
3468 
3469         // bad feature, make dummy
3470         sfp->SetData ().Select (CSeqFeatData::e_not_set);
3471     }
3472     sfp->SetLocation (location);
3473 
3474     return sfp;
3475 }
3476 
x_InitId(const CTempString & seq_id,const TFlags flags)3477 void CFeatureTableReader_Imp::x_InitId(const CTempString& seq_id, const TFlags flags)
3478 {
3479     if (!NStr::IsBlank(seq_id)) {
3480         CBioseq::TId ids;
3481         CSeq_id::ParseIDs(ids, seq_id,
3482             (flags & CFeature_table_reader::fAllIdsAsLocal) ? CSeq_id::fParse_AnyLocal : CSeq_id::fParse_Default);
3483 
3484         m_seq_id.Reset();
3485         if (flags & CFeature_table_reader::fPreferGenbankId)
3486         {
3487             for (auto id : ids)
3488             {
3489                 if (id->IsGenbank())
3490                     m_seq_id = id;
3491             }
3492         };
3493 
3494         if (m_seq_id.Empty())
3495             m_seq_id = ids.front();
3496 
3497         m_real_seqid.clear();
3498         m_seq_id->GetLabel(&m_real_seqid, CSeq_id::eFasta);
3499     }
3500 }
3501 
AddFeatQual(CRef<CSeq_feat> sfp,const string & feat_name,const string & qual,const string & val,const TFlags flags,const string & seq_id1)3502 void CFeatureTableReader_Imp::AddFeatQual (
3503     CRef<CSeq_feat> sfp,
3504     const string& feat_name,
3505     const string& qual,
3506     const string& val,
3507     const TFlags flags,
3508     const string &seq_id1 )
3509 
3510 {
3511     x_InitId(seq_id1, flags);
3512 
3513     if (NStr::IsBlank(qual)) {
3514         return;
3515     }
3516 
3517     if (!val.empty ()) { // Should probably use NStr::IsBlank()
3518         if (! x_AddQualifierToFeature (sfp, feat_name, qual, val, flags)) {
3519             // unrecognized qualifier key
3520             if ((flags & CFeature_table_reader::fReportBadKey) != 0) {
3521                 ERR_POST_X (5, Warning << "Unrecognized qualifier '" << qual << "'");
3522             }
3523             if ((flags & CFeature_table_reader::fKeepBadKey) != 0) {
3524                 x_AddGBQualToFeature (sfp, qual, val);
3525             }
3526         }
3527     }
3528     else { // empty val
3529         // check for the few qualifiers that do not need a value
3530         auto s_iter = sc_SingleKeys.find (qual.c_str ());
3531         if (s_iter != sc_SingleKeys.end ()) {
3532             x_AddQualifierToFeature (sfp, feat_name, qual, val, flags);
3533         }
3534     }
3535 }
3536 
3537 // static
ParseInitialFeatureLine(const CTempString & line_arg,CTempStringEx & out_seqid,CTempStringEx & out_annotname)3538 bool CFeatureTableReader_Imp::ParseInitialFeatureLine (
3539     const CTempString& line_arg,
3540     CTempStringEx& out_seqid,
3541     CTempStringEx& out_annotname )
3542 {
3543     out_seqid.clear();
3544     out_annotname.clear();
3545 
3546     // copy the line_arg because we can't edit line_arg itself
3547     CTempString line = line_arg;
3548 
3549     // handle ">"
3550     NStr::TruncateSpacesInPlace(line);
3551     if( ! NStr::StartsWith(line, ">") ) {
3552         return false;
3553     }
3554     line = line.substr(1); // remove '>'
3555 
3556     // handle "Feature"
3557     NStr::TruncateSpacesInPlace(line, NStr::eTrunc_Begin);
3558     const CTempString kFeatureStr("Feature");
3559     if( ! NStr::StartsWith(line, kFeatureStr, NStr::eNocase) ) {
3560         return false;
3561     }
3562     line = line.substr( kFeatureStr.length() ); // remove "Feature"
3563 
3564     // throw out any non-space characters at the beginning,
3565     // so we can, for example, handle ">Features" (note the "s")
3566     while( !line.empty() && !isspace(line[0])  ) {
3567         line = line.substr(1);
3568     }
3569 
3570     // extract seqid and annotname
3571     NStr::TruncateSpacesInPlace(line, NStr::eTrunc_Begin);
3572     NStr::SplitInTwo(line, " \t", out_seqid, out_annotname, NStr::fSplit_Tokenize);
3573 
3574     return true;
3575 }
3576 
3577 
3578 // public access functions
3579 
CFeature_table_reader(TReaderFlags fReaderFlags)3580 CFeature_table_reader::CFeature_table_reader(
3581     TReaderFlags fReaderFlags)
3582     : CReaderBase(fReaderFlags)
3583 {
3584 }
3585 
CFeature_table_reader(ILineReader & lr,ILineErrorListener * pErrors)3586 CFeature_table_reader::CFeature_table_reader(
3587     ILineReader& lr,
3588     ILineErrorListener* pErrors) :
3589     CReaderBase(0),
3590     m_pImpl(new CFeatureTableReader_Imp(&lr, 0, pErrors))
3591     {}
3592 
3593 CRef<CSerialObject>
ReadObject(ILineReader & lr,ILineErrorListener * pMessageListener)3594 CFeature_table_reader::ReadObject(
3595     ILineReader &lr, ILineErrorListener *pMessageListener)
3596 {
3597     CRef<CSerialObject> object(
3598         ReadSeqAnnot( lr, pMessageListener ).ReleaseOrNull() );
3599     return object;
3600 }
3601 
3602 
3603 CRef<CSeq_annot>
ReadSeqAnnot(ILineReader & lr,ILineErrorListener * pMessageListener)3604 CFeature_table_reader::ReadSeqAnnot(
3605     ILineReader &lr, ILineErrorListener *pMessageListener)
3606 {
3607     return ReadSequinFeatureTable(lr, 0, pMessageListener);
3608 }
3609 
3610 
ReadSequinFeatureTable(CNcbiIstream & ifs,const string & seqid,const string & annotname,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * filter)3611 CRef<CSeq_annot> CFeature_table_reader::ReadSequinFeatureTable (
3612     CNcbiIstream& ifs,
3613     const string& seqid,
3614     const string& annotname,
3615     const TFlags flags,
3616     ILineErrorListener* pMessageListener,
3617     ITableFilter *filter
3618 )
3619 {
3620     CStreamLineReader reader(ifs);
3621     return ReadSequinFeatureTable(reader, seqid, annotname, flags, pMessageListener, filter);
3622 }
3623 
ReadSequinFeatureTable(ILineReader & reader,const string & seqid,const string & annotname,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * filter)3624 CRef<CSeq_annot> CFeature_table_reader::ReadSequinFeatureTable (
3625     ILineReader& reader,
3626     const string& seqid,
3627     const string& annotname,
3628     const TFlags flags,
3629     ILineErrorListener* pMessageListener,
3630     ITableFilter *filter
3631 )
3632 {
3633     // just read features from 5-column table
3634     CFeatureTableReader_Imp impl(&reader, 0, pMessageListener);
3635     return impl.ReadSequinFeatureTable(seqid, annotname, flags, filter);
3636 }
3637 
x_ReadFeatureTable(CFeatureTableReader_Imp & reader,const CTempString & seqid,const CTempString & annot_name,TFlags flags,ITableFilter * filter)3638 CRef<CSeq_annot> CFeature_table_reader::x_ReadFeatureTable(
3639     CFeatureTableReader_Imp& reader,
3640     const CTempString& seqid,
3641     const CTempString& annot_name,
3642     TFlags flags,
3643     ITableFilter* filter) {
3644     return reader.ReadSequinFeatureTable(seqid, annot_name, flags, filter);
3645 }
3646 
3647 
ReadSequinFeatureTable(CNcbiIstream & ifs,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * filter)3648 CRef<CSeq_annot> CFeature_table_reader::ReadSequinFeatureTable (
3649     CNcbiIstream& ifs,
3650     const TFlags flags,
3651     ILineErrorListener* pMessageListener,
3652     ITableFilter *filter
3653 )
3654 {
3655     CStreamLineReader reader(ifs);
3656     return ReadSequinFeatureTable(reader, flags, pMessageListener, filter);
3657 }
3658 
3659 
x_ReadFeatureTable(CFeatureTableReader_Imp & reader,const TFlags flags,ITableFilter * filter,const string & seqid_prefix)3660 CRef<CSeq_annot> CFeature_table_reader::x_ReadFeatureTable(
3661         CFeatureTableReader_Imp& reader,
3662         const TFlags flags,
3663         ITableFilter* filter,
3664         const string& seqid_prefix)
3665 {
3666     auto pLineReader = reader.GetLineReaderPtr();
3667     if (!pLineReader) {
3668         return CRef<CSeq_annot>();
3669     }
3670 
3671 
3672     CTempStringEx orig_seqid, annotname;
3673     // first look for >Feature line, extract seqid and optional annotname
3674     while (orig_seqid.empty () && !pLineReader->AtEOF() ) {
3675         CTempString line = *++(*pLineReader);
3676         if( ParseInitialFeatureLine(line, orig_seqid, annotname) ) {
3677             CFeatureTableReader_Imp::PutProgress(orig_seqid,
3678                                                  pLineReader->GetLineNumber(),
3679                                                  reader.GetErrorListenerPtr());
3680         }
3681     }
3682 
3683     string temp_seqid;
3684     if (seqid_prefix.empty()) {
3685         //seqid = orig_seqid;
3686     } else {
3687         if (orig_seqid.find('|') == string::npos)
3688             temp_seqid = seqid_prefix + orig_seqid;
3689         else
3690         if (NStr::StartsWith(orig_seqid, "lcl|"))
3691         {
3692             temp_seqid = seqid_prefix + orig_seqid.substr(4);
3693         }
3694         orig_seqid = temp_seqid;
3695     }
3696     return x_ReadFeatureTable(reader, orig_seqid, annotname, flags, filter);
3697 }
3698 
3699 
ReadSequinFeatureTable(ILineReader & reader,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * pFilter,const string & seqid_prefix)3700 CRef<CSeq_annot> CFeature_table_reader::ReadSequinFeatureTable (
3701     ILineReader& reader,
3702     const TFlags flags,
3703     ILineErrorListener* pMessageListener,
3704     ITableFilter* pFilter,
3705     const string& seqid_prefix
3706 )
3707 {
3708     CFeatureTableReader_Imp ftable_reader(&reader, 0, pMessageListener);
3709     return x_ReadFeatureTable(ftable_reader, flags, pFilter, seqid_prefix);
3710 }
3711 
3712 
ReadSequinFeatureTable(const TFlags flags,ITableFilter * pFilter,const string & seqid_prefix)3713 CRef<CSeq_annot> CFeature_table_reader::ReadSequinFeatureTable(
3714     const TFlags flags,
3715     ITableFilter* pFilter,
3716     const string& seqid_prefix
3717 )
3718 {
3719     return x_ReadFeatureTable(*m_pImpl, flags, pFilter, seqid_prefix);
3720 }
3721 
3722 
ReadSequinFeatureTables(CNcbiIstream & ifs,CSeq_entry & entry,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * filter)3723 void CFeature_table_reader::ReadSequinFeatureTables(
3724     CNcbiIstream& ifs,
3725     CSeq_entry& entry,
3726     const TFlags flags,
3727     ILineErrorListener* pMessageListener,
3728     ITableFilter *filter
3729 )
3730 {
3731     CStreamLineReader reader(ifs);
3732     return ReadSequinFeatureTables(reader, entry, flags, pMessageListener, filter);
3733 }
3734 
3735 struct SCSeqidCompare
3736 {
3737   inline
operator ()SCSeqidCompare3738   bool operator()(const CSeq_id* left, const CSeq_id* right) const
3739   {
3740      return *left < *right;
3741   };
3742 };
3743 
ReadSequinFeatureTables(ILineReader & reader,CSeq_entry & entry,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * filter)3744 void CFeature_table_reader::ReadSequinFeatureTables(
3745     ILineReader& reader,
3746     CSeq_entry& entry,
3747     const TFlags flags,
3748     ILineErrorListener* pMessageListener,
3749     ITableFilter *filter
3750 )
3751 {
3752     // let's use map to speedup matching on very large files, see SQD-1847
3753     map<const CSeq_id*, CRef<CBioseq>, SCSeqidCompare> seq_map;
3754 
3755     for (CTypeIterator<CBioseq> seqit(entry);  seqit;  ++seqit) {
3756         ITERATE (CBioseq::TId, seq_id, seqit->GetId()) {
3757             seq_map[seq_id->GetPointer()].Reset(&*seqit);
3758         }
3759     }
3760 
3761     CFeatureTableReader_Imp ftable_reader(&reader, 0, pMessageListener);
3762     while ( !reader.AtEOF() ) {
3763         auto annot =  x_ReadFeatureTable(ftable_reader, flags, filter);
3764         //CRef<CSeq_annot> annot = ReadSequinFeatureTable(reader, flags, pMessageListener, filter);
3765         if (entry.IsSeq()) { // only one place to go
3766             entry.SetSeq().SetAnnot().push_back(annot);
3767             continue;
3768         }
3769         _ASSERT(annot->GetData().IsFtable());
3770         if (annot->GetData().GetFtable().empty()) {
3771             continue;
3772         }
3773         // otherwise, take the first feature, which should be representative
3774         const CSeq_feat& feat    = *annot->GetData().GetFtable().front();
3775         const CSeq_id*   feat_id = feat.GetLocation().GetId();
3776         CBioseq*         seq     = NULL;
3777         _ASSERT(feat_id); // we expect a uniform sequence ID
3778         seq = seq_map[feat_id].GetPointer();
3779         if (seq) { // found a match
3780             seq->SetAnnot().push_back(annot);
3781         } else { // just package on the set
3782             ERR_POST_X(6, Warning
3783                        << "ReadSequinFeatureTables: unable to find match for "
3784                        << feat_id->AsFastaString());
3785             entry.SetSet().SetAnnot().push_back(annot);
3786         }
3787     }
3788 }
3789 
3790 
CreateSeqFeat(const string & feat,CSeq_loc & location,const TFlags flags,ILineErrorListener * pMessageListener,unsigned int line_number,string * seq_id,ITableFilter * filter)3791 CRef<CSeq_feat> CFeature_table_reader::CreateSeqFeat (
3792     const string& feat,
3793     CSeq_loc& location,
3794     const TFlags flags,
3795     ILineErrorListener* pMessageListener,
3796     unsigned int line_number,
3797     string *seq_id,
3798     ITableFilter *filter
3799 )
3800 {
3801     CFeatureTableReader_Imp impl(nullptr, line_number, pMessageListener);
3802     return impl.CreateSeqFeat (feat, location, flags, (seq_id ? *seq_id : string() ), filter);
3803 }
3804 
3805 
AddFeatQual(CRef<CSeq_feat> sfp,const string & feat_name,const string & qual,const string & val,const CFeature_table_reader::TFlags flags,ILineErrorListener * pMessageListener,int line_number,const string & seq_id)3806 void CFeature_table_reader::AddFeatQual (
3807     CRef<CSeq_feat> sfp,
3808     const string& feat_name,
3809     const string& qual,
3810     const string& val,
3811     const CFeature_table_reader::TFlags flags,
3812     ILineErrorListener* pMessageListener,
3813     int line_number,
3814     const string &seq_id
3815 )
3816 
3817 {
3818     CFeatureTableReader_Imp impl(nullptr, line_number, pMessageListener);
3819     impl.AddFeatQual (sfp, feat_name, qual, val, flags, seq_id) ;
3820 }
3821 
3822 bool
ParseInitialFeatureLine(const CTempString & line_arg,CTempStringEx & out_seqid,CTempStringEx & out_annotname)3823 CFeature_table_reader::ParseInitialFeatureLine (
3824     const CTempString& line_arg,
3825     CTempStringEx& out_seqid,
3826     CTempStringEx& out_annotname )
3827 {
3828      return CFeatureTableReader_Imp::ParseInitialFeatureLine(line_arg, out_seqid, out_annotname);
3829 }
3830 
3831 
~CFeature_table_reader()3832 CFeature_table_reader::~CFeature_table_reader() {}
3833 
3834 END_objects_SCOPE
3835 END_NCBI_SCOPE
3836