1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jonathan Kans, Michael Kornbluh
27 *
28 * File Description:
29 * Feature table reader
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbithr.hpp>
36
37 #include <util/static_map.hpp>
38
39 #include <serial/iterator.hpp>
40 #include <serial/objistrasn.hpp>
41
42 // Objects includes
43 #include <objects/general/Int_fuzz.hpp>
44 #include <objects/general/Object_id.hpp>
45 #include <objects/general/User_object.hpp>
46 #include <objects/general/User_field.hpp>
47 #include <objects/general/Dbtag.hpp>
48
49 #include <objects/seqloc/Seq_id.hpp>
50 #include <objects/seqloc/Seq_loc.hpp>
51 #include <objects/seqloc/Seq_interval.hpp>
52 #include <objects/seqloc/Seq_point.hpp>
53
54 #include <objects/seq/Seq_annot.hpp>
55 #include <objects/seq/Annotdesc.hpp>
56 #include <objects/seq/Annot_descr.hpp>
57 #include <objects/pub/Pub.hpp>
58 #include <objects/pub/Pub_equiv.hpp>
59 #include <objects/seq/Pubdesc.hpp>
60 #include <objects/seqfeat/SeqFeatData.hpp>
61 #include <objects/seq/seq_loc_from_string.hpp>
62
63 #include <objects/seqfeat/Seq_feat.hpp>
64 #include <objects/seqfeat/BioSource.hpp>
65 #include <objects/seqfeat/Org_ref.hpp>
66 #include <objects/seqfeat/OrgName.hpp>
67 #include <objects/seqfeat/SubSource.hpp>
68 #include <objects/seqfeat/OrgMod.hpp>
69 #include <objects/seqfeat/Gene_ref.hpp>
70 #include <objects/seqfeat/Cdregion.hpp>
71 #include <objects/seqfeat/Code_break.hpp>
72 #include <objects/seqfeat/Genetic_code.hpp>
73 #include <objects/seqfeat/Genetic_code_table.hpp>
74 #include <objects/seqfeat/RNA_ref.hpp>
75 #include <objects/seqfeat/Trna_ext.hpp>
76 #include <objects/seqfeat/RNA_gen.hpp>
77 #include <objects/seqfeat/RNA_qual_set.hpp>
78 #include <objects/seqfeat/RNA_qual.hpp>
79 #include <objects/seqfeat/Imp_feat.hpp>
80 #include <objects/seqfeat/Gb_qual.hpp>
81
82 #include <objects/misc/sequence_macros.hpp>
83
84 #include <objects/seqset/Bioseq_set.hpp>
85 #include <objects/seqset/Seq_entry.hpp>
86
87 #include <objtools/readers/readfeat.hpp>
88 #include <objtools/readers/table_filter.hpp>
89 #include <objtools/error_codes.hpp>
90
91 #include <algorithm>
92 #include <unordered_set>
93
94 #include <objtools/readers/message_listener.hpp>
95 #include <objtools/readers/read_util.hpp>
96 #include "best_feat_finder.hpp"
97
98 #define NCBI_USE_ERRCODE_X Objtools_Rd_Feature
99
100
101 BEGIN_NCBI_SCOPE
102
103 BEGIN_objects_SCOPE // namespace ncbi::objects::
104
105
106
107 namespace {
108 static const char * const kCdsFeatName = "CDS";
109 // priorities, inherited from C toolkit
110 static Uchar std_order[CSeq_id::e_MaxChoice] = {
111 83, /* 0 = not set */
112 80, /* 1 = local Object-id */
113 70, /* 2 = gibbsq */
114 70, /* 3 = gibbmt */
115 70, /* 4 = giim Giimport-id */
116 60, /* 5 = genbank */
117 60, /* 6 = embl */
118 60, /* 7 = pir */
119 60, /* 8 = swissprot */
120 81, /* 9 = patent */
121 65, /* 10 = other TextSeqId */
122 80, /* 11 = general Dbtag */
123 82, /* 12 = gi */
124 60, /* 13 = ddbj */
125 60, /* 14 = prf */
126 60, /* 15 = pdb */
127 60, /* 16 = tpg */
128 60, /* 17 = tpe */
129 60, /* 18 = tpd */
130 68, /* 19 = gpp */
131 69 /* 20 = nat */
132 };
133
GetBestId(const CBioseq::TId & ids)134 CRef<CSeq_id> GetBestId(const CBioseq::TId& ids)
135 {
136 if (ids.size() == 1)
137 return ids.front();
138
139 CRef<CSeq_id> id;
140 if (!ids.empty())
141 {
142 Uchar best_weight = UCHAR_MAX;
143 ITERATE(CBioseq::TId, it, ids)
144 {
145 Uchar new_weight = std_order[(*it)->Which()];
146 if (new_weight < best_weight)
147 {
148 id = *it;
149 best_weight = new_weight;
150 }
151 };
152 }
153
154 return id;
155 }
156
157
158 map<char, list<char>> s_IUPACmap
159 {
160 {'A', list<char>({'A'})},
161 {'G', list<char>({'G'})},
162 {'C', list<char>({'C'})},
163 {'T', list<char>({'T'})},
164 {'U', list<char>({'U'})},
165 {'M', list<char>({'A', 'C'})},
166 {'R', list<char>({'A', 'G'})},
167 {'W', list<char>({'A', 'T'})},
168 {'S', list<char>({'C', 'G'})},
169 {'Y', list<char>({'C', 'T'})},
170 {'K', list<char>({'G', 'T'})},
171 {'V', list<char>({'A', 'C', 'G'})},
172 {'H', list<char>({'A', 'C', 'T'})},
173 {'D', list<char>({'A', 'G', 'T'})},
174 {'B', list<char>({'C', 'G', 'T'})},
175 {'N', list<char>({'A', 'C', 'G', 'T'})}
176 };
177
178 }
179
180
181 class /* NCBI_XOBJREAD_EXPORT */ CFeatureTableReader_Imp
182 {
183 public:
184 enum EQual {
185 eQual_allele,
186 eQual_anticodon,
187 eQual_bac_ends,
188 eQual_bond_type,
189 eQual_bound_moiety,
190 eQual_chrcnt,
191 eQual_citation,
192 eQual_clone,
193 eQual_clone_id,
194 eQual_codon_recognized,
195 eQual_codon_start,
196 eQual_compare,
197 eQual_cons_splice,
198 eQual_ctgcnt,
199 eQual_cyt_map,
200 eQual_db_xref,
201 eQual_direction,
202 eQual_EC_number,
203 eQual_estimated_length,
204 eQual_evidence,
205 eQual_exception,
206 eQual_experiment,
207 eQual_frequency,
208 eQual_function,
209 eQual_gap_type,
210 eQual_gen_map,
211 eQual_gene,
212 eQual_gene_desc,
213 eQual_gene_syn,
214 eQual_go_component,
215 eQual_go_function,
216 eQual_go_process,
217 eQual_heterogen,
218 eQual_inference,
219 eQual_insertion_seq,
220 eQual_label,
221 eQual_linkage_evidence,
222 eQual_loccnt,
223 eQual_locus_tag,
224 eQual_macronuclear,
225 eQual_map,
226 eQual_MEDLINE,
227 eQual_method,
228 eQual_mobile_element_type,
229 eQual_mod_base,
230 eQual_muid,
231 eQual_ncRNA_class,
232 eQual_nomenclature,
233 eQual_note,
234 eQual_number,
235 eQual_old_locus_tag,
236 eQual_operon,
237 eQual_organism,
238 eQual_partial,
239 eQual_PCR_conditions,
240 eQual_phenotype,
241 eQual_pmid,
242 eQual_product,
243 eQual_prot_desc,
244 eQual_prot_note,
245 eQual_protein_id,
246 eQual_pseudo,
247 eQual_pseudogene,
248 eQual_PubMed,
249 eQual_rad_map,
250 eQual_region_name,
251 eQual_regulatory_class,
252 eQual_replace,
253 eQual_ribosomal_slippage,
254 eQual_rpt_family,
255 eQual_rpt_type,
256 eQual_rpt_unit,
257 eQual_rpt_unit_range,
258 eQual_rpt_unit_seq,
259 eQual_satellite,
260 eQual_sec_str_type,
261 eQual_secondary_accession,
262 eQual_sequence,
263 eQual_site_type,
264 eQual_snp_class,
265 eQual_snp_gtype,
266 eQual_snp_het,
267 eQual_snp_het_se,
268 eQual_snp_linkout,
269 eQual_snp_maxrate,
270 eQual_snp_valid,
271 eQual_standard_name,
272 eQual_STS,
273 eQual_sts_aliases,
274 eQual_sts_dsegs,
275 eQual_tag_peptide,
276 eQual_trans_splicing,
277 eQual_transcript_id,
278 eQual_transcription,
279 eQual_transl_except,
280 eQual_transl_table,
281 eQual_translation,
282 eQual_transposon,
283 eQual_usedin,
284 eQual_weight
285 };
286
287 enum EOrgRef {
288 eOrgRef_organism,
289 eOrgRef_organelle,
290 eOrgRef_div,
291 eOrgRef_lineage,
292 eOrgRef_gcode,
293 eOrgRef_mgcode
294 };
295
296 using TFlags = CFeature_table_reader::TFlags;
297 using TFtable = CSeq_annot::C_Data::TFtable;
298
299 // constructor
300 CFeatureTableReader_Imp(ILineReader* reader, unsigned int line_num, ILineErrorListener* pMessageListener);
301 // destructor
302 ~CFeatureTableReader_Imp(void);
303
304 // read 5-column feature table and return Seq-annot
305 CRef<CSeq_annot> ReadSequinFeatureTable (const CTempString& seqid,
306 const CTempString& annotname,
307 const TFlags flags,
308 ITableFilter *filter);
309
310 // create single feature from key
311 CRef<CSeq_feat> CreateSeqFeat (const string& feat,
312 CSeq_loc& location,
313 const TFlags flags,
314 const string &seq_id,
315 ITableFilter *filter);
316
317 // add single qualifier to feature
318 void AddFeatQual (CRef<CSeq_feat> sfp,
319 const string& feat_name,
320 const string& qual,
321 const string& val,
322 const TFlags flags,
323 const string &seq_id );
324
325 static bool ParseInitialFeatureLine (
326 const CTempString& line_arg,
327 CTempStringEx& out_seqid,
328 CTempStringEx& out_annotname );
329
330 static void PutProgress(const CTempString& seq_id,
331 const unsigned int line_number,
332 ILineErrorListener* pListener);
333
GetLineReaderPtr(void)334 ILineReader* const GetLineReaderPtr(void) {
335 return m_reader;
336 }
337
GetErrorListenerPtr(void)338 ILineErrorListener* const GetErrorListenerPtr(void) {
339 return m_pMessageListener;
340 }
341
342 private:
343
344 // Prohibit copy constructor and assignment operator
345 CFeatureTableReader_Imp(const CFeatureTableReader_Imp& value);
346 CFeatureTableReader_Imp& operator=(const CFeatureTableReader_Imp& value);
347
348 void x_InitId(const CTempString& seq_id, const TFlags flags);
349 // returns true if parsed (otherwise, out_offset is left unchanged)
350 bool x_TryToParseOffset(const CTempString & sLine, Int4 & out_offset );
351
352
353 struct SFeatLocInfo {
354 Int4 start_pos;
355 Int4 stop_pos;
356 bool is_5p_partial;
357 bool is_3p_partial;
358 bool is_point;
359 bool is_minus_strand;
360 };
361
362
363 bool x_ParseFeatureTableLine(
364 const CTempString& line,
365 SFeatLocInfo& loc_info,
366 string& feat,
367 string& qual,
368 string& val,
369 Int4 offset);
370
371
372 bool x_IsWebComment(CTempString line);
373
374 bool x_AddIntervalToFeature (
375 CTempString strFeatureName,
376 CRef<CSeq_feat>& sfp,
377 const SFeatLocInfo& loc_info);
378
379 bool x_AddQualifierToFeature (CRef<CSeq_feat> sfp,
380 const string &feat_name,
381 const string& qual, const string& val,
382 const TFlags flags);
383
384 void x_ProcessQualifier(const string& qual_name,
385 const string& qual_val,
386 const string& feat_name,
387 CRef<CSeq_feat> feat,
388 TFlags flags);
389
390 bool x_AddQualifierToGene (CSeqFeatData& sfdata,
391 EQual qtype, const string& val);
392 bool x_AddQualifierToCdregion (CRef<CSeq_feat> sfp, CSeqFeatData& sfdata,
393 EQual qtype, const string& val);
394 bool x_AddQualifierToRna (CRef<CSeq_feat> sfp,
395 EQual qtype, const string& val);
396 bool x_AddQualifierToImp (CRef<CSeq_feat> sfp, CSeqFeatData& sfdata,
397 EQual qtype, const string& qual, const string& val);
398 bool x_AddQualifierToBioSrc (CSeqFeatData& sfdata,
399 const string &feat_name,
400 EOrgRef rtype, const string& val);
401 bool x_AddQualifierToBioSrc (CSeqFeatData& sfdata,
402 CSubSource::ESubtype stype, const string& val);
403 bool x_AddQualifierToBioSrc (CSeqFeatData& sfdata,
404 COrgMod::ESubtype mtype, const string& val);
405
406 bool x_AddNoteToFeature(CRef<CSeq_feat> sfp, const string& note);
407
408 bool x_AddNoteToFeature(CRef<CSeq_feat> sfp,
409 const string& feat_name,
410 const string& qual,
411 const string& val);
412
413 bool x_AddGBQualToFeature (CRef<CSeq_feat> sfp,
414 const string& qual, const string& val);
415
416 bool x_AddCodons(const string& val, CTrna_ext& trna_ext) const;
417
418 typedef CConstRef<CSeq_feat> TFeatConstRef;
419 struct SFeatAndLineNum {
SFeatAndLineNumCFeatureTableReader_Imp::SFeatAndLineNum420 SFeatAndLineNum(
421 TFeatConstRef pFeat,
422 TSeqPos uLineNum ) :
423 m_pFeat(pFeat), m_uLineNum(uLineNum) {
424 _ASSERT(pFeat);
425 }
426
operator ==CFeatureTableReader_Imp::SFeatAndLineNum427 bool operator==(const SFeatAndLineNum & rhs) const {
428 return Compare(rhs) == 0; }
operator !=CFeatureTableReader_Imp::SFeatAndLineNum429 bool operator!=(const SFeatAndLineNum & rhs) const {
430 return Compare(rhs) != 0; }
operator <CFeatureTableReader_Imp::SFeatAndLineNum431 bool operator<(const SFeatAndLineNum & rhs) const {
432 return Compare(rhs) < 0; }
433
CompareCFeatureTableReader_Imp::SFeatAndLineNum434 int Compare(const SFeatAndLineNum & rhs) const {
435 if( m_uLineNum != rhs.m_uLineNum ) {
436 return ( m_uLineNum < rhs.m_uLineNum ? -1 : 1 );
437 }
438 return (m_pFeat.GetPointerOrNull() < rhs.m_pFeat.GetPointerOrNull() ? -1 : 1 );
439 }
440
441 TFeatConstRef m_pFeat; // must be non-NULL
442 TSeqPos m_uLineNum; // the line where this feature was created (or zero if programmatically created)
443 };
444 typedef multimap<CSeqFeatData::E_Choice, SFeatAndLineNum> TChoiceToFeatMap;
445 void x_CreateGenesFromCDSs(
446 CRef<CSeq_annot> sap,
447 TChoiceToFeatMap & choiceToFeatMap, // an input param, but might get more items added
448 const TFlags flags);
449
450 bool x_StringIsJustQuotes (const string& str);
451
452 string x_TrnaToAaString(const string& val);
453
454 bool x_ParseTrnaExtString(CTrna_ext & ext_trna, const string & str);
455 SIZE_TYPE x_MatchingParenPos( const string &str, SIZE_TYPE open_paren_pos );
456
457 long x_StringToLongNoThrow (
458 CTempString strToConvert,
459 CTempString strFeatureName,
460 CTempString strQualifierName,
461 // user can override the default problem types that are set on error
462 ILineError::EProblem eProblem = ILineError::eProblem_Unset
463 );
464
465 bool x_SetupSeqFeat (CRef<CSeq_feat> sfp, const string& feat,
466 const TFlags flags,
467 ITableFilter *filter);
468
469 void x_ProcessMsg (
470 ILineError::EProblem eProblem,
471 EDiagSev eSeverity,
472 const std::string & strFeatureName = kEmptyStr,
473 const std::string & strQualifierName = kEmptyStr,
474 const std::string & strQualifierValue = kEmptyStr,
475 const std::string & strErrorMessage = kEmptyStr,
476 const ILineError::TVecOfLines & vecOfOtherLines =
477 ILineError::TVecOfLines() );
478
479 void x_ProcessMsg(
480 int line_num,
481 ILineError::EProblem eProblem,
482 EDiagSev eSeverity,
483 const std::string & strFeatureName = kEmptyStr,
484 const std::string & strQualifierName = kEmptyStr,
485 const std::string & strQualifierValue = kEmptyStr,
486 const std::string & strErrorMessage = kEmptyStr,
487 const ILineError::TVecOfLines & vecOfOtherLines =
488 ILineError::TVecOfLines());
489
490 void x_TokenizeStrict( const CTempString &line, vector<string> &out_tokens );
491 void x_TokenizeLenient( const CTempString &line, vector<string> &out_tokens );
492 void x_FinishFeature(CRef<CSeq_feat>& feat, TFtable& ftable);
493 void x_ResetFeat(CRef<CSeq_feat>& feat, bool & curr_feat_intervals_done);
494 void x_UpdatePointStrand(CSeq_feat& feat, CSeq_interval::TStrand strand) const;
495 void x_GetPointStrand(const CSeq_feat& feat, CSeq_interval::TStrand& strand) const;
496
497 bool m_need_check_strand;
498 string m_real_seqid;
499 CRef<CSeq_id> m_seq_id;
500 ILineReader* m_reader;
501 unsigned int m_LineNumber;
502 ILineErrorListener* m_pMessageListener;
503 unordered_set<string> m_ProcessedTranscriptIds;
504 unordered_set<string> m_ProcessedProteinIds;
505 };
506
507
508 typedef SStaticPair<const char *, CFeatureTableReader_Imp::EQual> TQualKey;
509
510 static const TQualKey qual_key_to_subtype [] = {
511 { "EC_number", CFeatureTableReader_Imp::eQual_EC_number },
512 { "PCR_conditions", CFeatureTableReader_Imp::eQual_PCR_conditions },
513 { "PubMed", CFeatureTableReader_Imp::eQual_PubMed },
514 { "STS", CFeatureTableReader_Imp::eQual_STS },
515 { "allele", CFeatureTableReader_Imp::eQual_allele },
516 { "anticodon", CFeatureTableReader_Imp::eQual_anticodon },
517 { "bac_ends", CFeatureTableReader_Imp::eQual_bac_ends },
518 { "bond_type", CFeatureTableReader_Imp::eQual_bond_type },
519 { "bound_moiety", CFeatureTableReader_Imp::eQual_bound_moiety },
520 { "chrcnt", CFeatureTableReader_Imp::eQual_chrcnt },
521 { "citation", CFeatureTableReader_Imp::eQual_citation },
522 { "clone", CFeatureTableReader_Imp::eQual_clone },
523 { "clone_id", CFeatureTableReader_Imp::eQual_clone_id },
524 { "codon_recognized", CFeatureTableReader_Imp::eQual_codon_recognized },
525 { "codon_start", CFeatureTableReader_Imp::eQual_codon_start },
526 { "codons_recognized", CFeatureTableReader_Imp::eQual_codon_recognized },
527 { "compare", CFeatureTableReader_Imp::eQual_compare },
528 { "cons_splice", CFeatureTableReader_Imp::eQual_cons_splice },
529 { "ctgcnt", CFeatureTableReader_Imp::eQual_ctgcnt },
530 { "cyt_map", CFeatureTableReader_Imp::eQual_cyt_map },
531 { "db_xref", CFeatureTableReader_Imp::eQual_db_xref },
532 { "direction", CFeatureTableReader_Imp::eQual_direction },
533 { "estimated_length", CFeatureTableReader_Imp::eQual_estimated_length },
534 { "evidence", CFeatureTableReader_Imp::eQual_evidence },
535 { "exception", CFeatureTableReader_Imp::eQual_exception },
536 { "experiment", CFeatureTableReader_Imp::eQual_experiment },
537 { "frequency", CFeatureTableReader_Imp::eQual_frequency },
538 { "function", CFeatureTableReader_Imp::eQual_function },
539 { "gap_type", CFeatureTableReader_Imp::eQual_gap_type },
540 { "gen_map", CFeatureTableReader_Imp::eQual_gen_map },
541 { "gene", CFeatureTableReader_Imp::eQual_gene },
542 { "gene_desc", CFeatureTableReader_Imp::eQual_gene_desc },
543 { "gene_syn", CFeatureTableReader_Imp::eQual_gene_syn },
544 { "gene_synonym", CFeatureTableReader_Imp::eQual_gene_syn },
545 { "go_component", CFeatureTableReader_Imp::eQual_go_component },
546 { "go_function", CFeatureTableReader_Imp::eQual_go_function },
547 { "go_process", CFeatureTableReader_Imp::eQual_go_process },
548 { "heterogen", CFeatureTableReader_Imp::eQual_heterogen },
549 { "inference", CFeatureTableReader_Imp::eQual_inference },
550 { "insertion_seq", CFeatureTableReader_Imp::eQual_insertion_seq },
551 { "label", CFeatureTableReader_Imp::eQual_label },
552 { "linkage_evidence", CFeatureTableReader_Imp::eQual_linkage_evidence },
553 { "loccnt", CFeatureTableReader_Imp::eQual_loccnt },
554 { "locus_tag", CFeatureTableReader_Imp::eQual_locus_tag },
555 { "macronuclear", CFeatureTableReader_Imp::eQual_macronuclear },
556 { "map", CFeatureTableReader_Imp::eQual_map },
557 { "method", CFeatureTableReader_Imp::eQual_method },
558 { "mobile_element_type", CFeatureTableReader_Imp::eQual_mobile_element_type },
559 { "mod_base", CFeatureTableReader_Imp::eQual_mod_base },
560 { "ncRNA_class", CFeatureTableReader_Imp::eQual_ncRNA_class },
561 { "nomenclature", CFeatureTableReader_Imp::eQual_nomenclature },
562 { "note", CFeatureTableReader_Imp::eQual_note },
563 { "number", CFeatureTableReader_Imp::eQual_number },
564 { "old_locus_tag", CFeatureTableReader_Imp::eQual_old_locus_tag },
565 { "operon", CFeatureTableReader_Imp::eQual_operon },
566 { "organism", CFeatureTableReader_Imp::eQual_organism },
567 { "partial", CFeatureTableReader_Imp::eQual_partial },
568 { "phenotype", CFeatureTableReader_Imp::eQual_phenotype },
569 { "product", CFeatureTableReader_Imp::eQual_product },
570 { "prot_desc", CFeatureTableReader_Imp::eQual_prot_desc },
571 { "prot_note", CFeatureTableReader_Imp::eQual_prot_note },
572 { "protein_id", CFeatureTableReader_Imp::eQual_protein_id },
573 { "pseudo", CFeatureTableReader_Imp::eQual_pseudo },
574 { "pseudogene", CFeatureTableReader_Imp::eQual_pseudogene },
575 { "rad_map", CFeatureTableReader_Imp::eQual_rad_map },
576 { "region_name", CFeatureTableReader_Imp::eQual_region_name },
577 { "regulatory_class", CFeatureTableReader_Imp::eQual_regulatory_class },
578 { "replace", CFeatureTableReader_Imp::eQual_replace },
579 { "ribosomal_slippage", CFeatureTableReader_Imp::eQual_ribosomal_slippage },
580 { "rpt_family", CFeatureTableReader_Imp::eQual_rpt_family },
581 { "rpt_type", CFeatureTableReader_Imp::eQual_rpt_type },
582 { "rpt_unit", CFeatureTableReader_Imp::eQual_rpt_unit },
583 { "rpt_unit_range", CFeatureTableReader_Imp::eQual_rpt_unit_range },
584 { "rpt_unit_seq", CFeatureTableReader_Imp::eQual_rpt_unit_seq },
585 { "satellite", CFeatureTableReader_Imp::eQual_satellite },
586 { "sec_str_type", CFeatureTableReader_Imp::eQual_sec_str_type },
587 { "secondary_accession", CFeatureTableReader_Imp::eQual_secondary_accession },
588 { "secondary_accessions", CFeatureTableReader_Imp::eQual_secondary_accession },
589 { "sequence", CFeatureTableReader_Imp::eQual_sequence },
590 { "site_type", CFeatureTableReader_Imp::eQual_site_type },
591 { "snp_class", CFeatureTableReader_Imp::eQual_snp_class },
592 { "snp_gtype", CFeatureTableReader_Imp::eQual_snp_gtype },
593 { "snp_het", CFeatureTableReader_Imp::eQual_snp_het },
594 { "snp_het_se", CFeatureTableReader_Imp::eQual_snp_het_se },
595 { "snp_linkout", CFeatureTableReader_Imp::eQual_snp_linkout },
596 { "snp_maxrate", CFeatureTableReader_Imp::eQual_snp_maxrate },
597 { "snp_valid", CFeatureTableReader_Imp::eQual_snp_valid },
598 { "standard_name", CFeatureTableReader_Imp::eQual_standard_name },
599 { "sts_aliases", CFeatureTableReader_Imp::eQual_sts_aliases },
600 { "sts_dsegs", CFeatureTableReader_Imp::eQual_sts_dsegs },
601 { "tag_peptide", CFeatureTableReader_Imp::eQual_tag_peptide },
602 { "trans_splicing", CFeatureTableReader_Imp::eQual_trans_splicing },
603 { "transcript_id", CFeatureTableReader_Imp::eQual_transcript_id },
604 { "transcription", CFeatureTableReader_Imp::eQual_transcription },
605 { "transl_except", CFeatureTableReader_Imp::eQual_transl_except },
606 { "transl_table", CFeatureTableReader_Imp::eQual_transl_table },
607 { "translation", CFeatureTableReader_Imp::eQual_translation },
608 { "transposon", CFeatureTableReader_Imp::eQual_transposon },
609 { "usedin", CFeatureTableReader_Imp::eQual_usedin },
610 { "weight", CFeatureTableReader_Imp::eQual_weight }
611 };
612
613 typedef CStaticPairArrayMap <const char*, CFeatureTableReader_Imp::EQual, PCase_CStr> TQualMap;
614 DEFINE_STATIC_ARRAY_MAP(TQualMap, sm_QualKeys, qual_key_to_subtype);
615
616
617 typedef SStaticPair<const char *, CFeatureTableReader_Imp::EOrgRef> TOrgRefKey;
618
619 static const TOrgRefKey orgref_key_to_subtype [] = {
620 { "div", CFeatureTableReader_Imp::eOrgRef_div },
621 { "gcode", CFeatureTableReader_Imp::eOrgRef_gcode },
622 { "lineage", CFeatureTableReader_Imp::eOrgRef_lineage },
623 { "mgcode", CFeatureTableReader_Imp::eOrgRef_mgcode },
624 { "organelle", CFeatureTableReader_Imp::eOrgRef_organelle },
625 { "organism", CFeatureTableReader_Imp::eOrgRef_organism }
626 };
627
628 typedef CStaticPairArrayMap <const char*, CFeatureTableReader_Imp::EOrgRef, PCase_CStr> TOrgRefMap;
629 DEFINE_STATIC_ARRAY_MAP(TOrgRefMap, sm_OrgRefKeys, orgref_key_to_subtype);
630
631
632 typedef SStaticPair<const char *, CBioSource::EGenome> TGenomeKey;
633
634 static const TGenomeKey genome_key_to_subtype [] = {
635 { "apicoplast", CBioSource::eGenome_apicoplast },
636 { "chloroplast", CBioSource::eGenome_chloroplast },
637 { "chromatophore", CBioSource::eGenome_chromatophore },
638 { "chromoplast", CBioSource::eGenome_chromoplast },
639 { "chromosome", CBioSource::eGenome_chromosome },
640 { "cyanelle", CBioSource::eGenome_cyanelle },
641 { "endogenous_virus", CBioSource::eGenome_endogenous_virus },
642 { "extrachrom", CBioSource::eGenome_extrachrom },
643 { "genomic", CBioSource::eGenome_genomic },
644 { "hydrogenosome", CBioSource::eGenome_hydrogenosome },
645 { "insertion_seq", CBioSource::eGenome_insertion_seq },
646 { "kinetoplast", CBioSource::eGenome_kinetoplast },
647 { "leucoplast", CBioSource::eGenome_leucoplast },
648 { "macronuclear", CBioSource::eGenome_macronuclear },
649 { "mitochondrion", CBioSource::eGenome_mitochondrion },
650 { "mitochondrion:kinetoplast", CBioSource::eGenome_kinetoplast },
651 { "nucleomorph", CBioSource::eGenome_nucleomorph },
652 { "plasmid", CBioSource::eGenome_plasmid },
653 { "plastid", CBioSource::eGenome_plastid },
654 { "plastid:apicoplast", CBioSource::eGenome_apicoplast },
655 { "plastid:chloroplast", CBioSource::eGenome_chloroplast },
656 { "plastid:chromoplast", CBioSource::eGenome_chromoplast },
657 { "plastid:cyanelle", CBioSource::eGenome_cyanelle },
658 { "plastid:leucoplast", CBioSource::eGenome_leucoplast },
659 { "plastid:proplastid", CBioSource::eGenome_proplastid },
660 { "proplastid", CBioSource::eGenome_proplastid },
661 { "proviral", CBioSource::eGenome_proviral },
662 { "transposon", CBioSource::eGenome_transposon },
663 { "unknown", CBioSource::eGenome_unknown },
664 { "virion", CBioSource::eGenome_virion }
665 };
666
667 typedef CStaticPairArrayMap <const char*, CBioSource::EGenome, PCase_CStr> TGenomeMap;
668 DEFINE_STATIC_ARRAY_MAP(TGenomeMap, sm_GenomeKeys, genome_key_to_subtype);
669
670
671 typedef SStaticPair<const char *, CSubSource::ESubtype> TSubSrcKey;
672
673 static const TSubSrcKey subsrc_key_to_subtype [] = {
674 { "altitude", CSubSource::eSubtype_altitude },
675 { "cell_line", CSubSource::eSubtype_cell_line },
676 { "cell_type", CSubSource::eSubtype_cell_type },
677 { "chromosome", CSubSource::eSubtype_chromosome },
678 { "clone", CSubSource::eSubtype_clone },
679 { "clone_lib", CSubSource::eSubtype_clone_lib },
680 { "collected_by", CSubSource::eSubtype_collected_by },
681 { "collection_date", CSubSource::eSubtype_collection_date },
682 { "country", CSubSource::eSubtype_country },
683 { "dev_stage", CSubSource::eSubtype_dev_stage },
684 { "endogenous_virus", CSubSource::eSubtype_endogenous_virus_name },
685 { "environmental_sample", CSubSource::eSubtype_environmental_sample },
686 { "frequency", CSubSource::eSubtype_frequency },
687 { "fwd_primer_name", CSubSource::eSubtype_fwd_primer_name },
688 { "fwd_primer_seq", CSubSource::eSubtype_fwd_primer_seq },
689 { "genotype", CSubSource::eSubtype_genotype },
690 { "germline", CSubSource::eSubtype_germline },
691 { "haplotype", CSubSource::eSubtype_haplotype },
692 { "identified_by", CSubSource::eSubtype_identified_by },
693 { "insertion_seq", CSubSource::eSubtype_insertion_seq_name },
694 { "isolation_source", CSubSource::eSubtype_isolation_source },
695 { "lab_host", CSubSource::eSubtype_lab_host },
696 { "lat_lon", CSubSource::eSubtype_lat_lon },
697 { "map", CSubSource::eSubtype_map },
698 { "metagenomic", CSubSource::eSubtype_metagenomic },
699 { "plasmid", CSubSource::eSubtype_plasmid_name },
700 { "plastid", CSubSource::eSubtype_plastid_name },
701 { "pop_variant", CSubSource::eSubtype_pop_variant },
702 { "rearranged", CSubSource::eSubtype_rearranged },
703 { "rev_primer_name", CSubSource::eSubtype_rev_primer_name },
704 { "rev_primer_seq", CSubSource::eSubtype_rev_primer_seq },
705 { "segment", CSubSource::eSubtype_segment },
706 { "sex", CSubSource::eSubtype_sex },
707 { "subclone", CSubSource::eSubtype_subclone },
708 { "tissue_lib ", CSubSource::eSubtype_tissue_lib },
709 { "tissue_type", CSubSource::eSubtype_tissue_type },
710 { "transgenic", CSubSource::eSubtype_transgenic },
711 { "transposon", CSubSource::eSubtype_transposon_name }
712 };
713
714 typedef CStaticPairArrayMap <const char*, CSubSource::ESubtype, PCase_CStr> TSubSrcMap;
715 DEFINE_STATIC_ARRAY_MAP(TSubSrcMap, sm_SubSrcKeys, subsrc_key_to_subtype);
716
717 // case-insensitive version of sm_SubSrcKeys
718 typedef CStaticPairArrayMap <const char*, CSubSource::ESubtype, PNocase_CStr> TSubSrcNoCaseMap;
719 DEFINE_STATIC_ARRAY_MAP(
720 TSubSrcNoCaseMap, sm_SubSrcNoCaseKeys, subsrc_key_to_subtype);
721
722 typedef SStaticPair<const char *, COrgMod::ESubtype> TOrgModKey;
723
724 static const TOrgModKey orgmod_key_to_subtype [] = {
725 { "acronym", COrgMod::eSubtype_acronym },
726 { "anamorph", COrgMod::eSubtype_anamorph },
727 { "authority", COrgMod::eSubtype_authority },
728 { "bio_material", COrgMod::eSubtype_bio_material },
729 { "biotype", COrgMod::eSubtype_biotype },
730 { "biovar", COrgMod::eSubtype_biovar },
731 { "breed", COrgMod::eSubtype_breed },
732 { "chemovar", COrgMod::eSubtype_chemovar },
733 { "common", COrgMod::eSubtype_common },
734 { "cultivar", COrgMod::eSubtype_cultivar },
735 { "culture_collection", COrgMod::eSubtype_culture_collection },
736 { "dosage", COrgMod::eSubtype_dosage },
737 { "ecotype", COrgMod::eSubtype_ecotype },
738 { "forma", COrgMod::eSubtype_forma },
739 { "forma_specialis", COrgMod::eSubtype_forma_specialis },
740 { "gb_acronym", COrgMod::eSubtype_gb_acronym },
741 { "gb_anamorph", COrgMod::eSubtype_gb_anamorph },
742 { "gb_synonym", COrgMod::eSubtype_gb_synonym },
743 { "group", COrgMod::eSubtype_group },
744 { "isolate", COrgMod::eSubtype_isolate },
745 { "metagenome_source", COrgMod::eSubtype_metagenome_source },
746 { "nat_host", COrgMod::eSubtype_nat_host },
747 { "natural_host", COrgMod::eSubtype_nat_host },
748 { "old_lineage", COrgMod::eSubtype_old_lineage },
749 { "old_name", COrgMod::eSubtype_old_name },
750 { "pathovar", COrgMod::eSubtype_pathovar },
751 { "serogroup", COrgMod::eSubtype_serogroup },
752 { "serotype", COrgMod::eSubtype_serotype },
753 { "serovar", COrgMod::eSubtype_serovar },
754 { "spec_host", COrgMod::eSubtype_nat_host },
755 { "specific_host", COrgMod::eSubtype_nat_host },
756 { "specimen_voucher", COrgMod::eSubtype_specimen_voucher },
757 { "strain", COrgMod::eSubtype_strain },
758 { "sub_species", COrgMod::eSubtype_sub_species },
759 { "subgroup", COrgMod::eSubtype_subgroup },
760 { "substrain", COrgMod::eSubtype_substrain },
761 { "subtype", COrgMod::eSubtype_subtype },
762 { "synonym", COrgMod::eSubtype_synonym },
763 { "teleomorph", COrgMod::eSubtype_teleomorph },
764 { "type", COrgMod::eSubtype_type },
765 { "type_material", COrgMod::eSubtype_type_material },
766 { "variety", COrgMod::eSubtype_variety }
767 };
768
769 typedef CStaticPairArrayMap <const char*, COrgMod::ESubtype, PCase_CStr> TOrgModMap;
770 DEFINE_STATIC_ARRAY_MAP(TOrgModMap, sm_OrgModKeys, orgmod_key_to_subtype);
771
772 static const map<const char*, int, PNocase_CStr> sm_TrnaKeys
773 {
774 { "Ala", 'A' },
775 { "Alanine", 'A' },
776 { "Arg", 'R' },
777 { "Arginine", 'R' },
778 { "Asn", 'N' },
779 { "Asp", 'D' },
780 { "Asp or Asn", 'B' },
781 { "Asparagine", 'N' },
782 { "Aspartate", 'D' },
783 { "Aspartic Acid", 'D' },
784 { "Asx", 'B' },
785 { "Cys", 'C' },
786 { "Cysteine", 'C' },
787 { "Gln", 'Q' },
788 { "Glu", 'E' },
789 { "Glu or Gln", 'Z' },
790 { "Glutamate", 'E' },
791 { "Glutamic Acid", 'E' },
792 { "Glutamine", 'Q' },
793 { "Glx", 'Z' },
794 { "Gly", 'G' },
795 { "Glycine", 'G' },
796 { "His", 'H' },
797 { "Histidine", 'H' },
798 { "Ile", 'I' },
799 { "Ile2", 'I' },
800 { "Isoleucine", 'I' },
801 { "Leu", 'L' },
802 { "Leu or Ile", 'J' },
803 { "Leucine", 'L' },
804 { "Lys", 'K' },
805 { "Lysine", 'K' },
806 { "Met", 'M' },
807 { "Methionine", 'M' },
808 { "OTHER", 'X' },
809 { "Phe", 'F' },
810 { "Phenylalanine", 'F' },
811 { "Pro", 'P' },
812 { "Proline", 'P' },
813 { "Pyl", 'O' },
814 { "Pyrrolysine", 'O' },
815 { "Sec", 'U' },
816 { "Selenocysteine", 'U' },
817 { "Ser", 'S' },
818 { "Serine", 'S' },
819 { "TERM", '*' },
820 { "Ter", '*' },
821 { "Termination", '*' },
822 { "Thr", 'T' },
823 { "Threonine", 'T' },
824 { "Trp", 'W' },
825 { "Tryptophan", 'W' },
826 { "Tyr", 'Y' },
827 { "Tyrosine", 'Y' },
828 { "Val", 'V' },
829 { "Valine", 'V' },
830 { "Xle", 'J' },
831 { "Xxx", 'X' },
832 { "Undet", 'X' },
833 { "fMet", 'M' },
834 { "iMet", 'M' }
835 };
836
837
838 static
839 set<const char*, PCase_CStr>
840 sc_SingleKeys {
841 "environmental_sample",
842 "germline",
843 "metagenomic",
844 "partial",
845 "pseudo",
846 "rearranged",
847 "ribosomal_slippage",
848 "trans_splicing",
849 "transgenic",
850 "replace" // RW-882
851 };
852
853 // constructor
CFeatureTableReader_Imp(ILineReader * reader,unsigned int line_num,ILineErrorListener * pMessageListener)854 CFeatureTableReader_Imp::CFeatureTableReader_Imp(ILineReader* reader, unsigned int line_num, ILineErrorListener* pMessageListener)
855 : m_reader(reader), m_LineNumber(line_num), m_pMessageListener(pMessageListener)
856 {
857 }
858
859 // destructor
~CFeatureTableReader_Imp(void)860 CFeatureTableReader_Imp::~CFeatureTableReader_Imp(void)
861 {
862 }
863
x_TryToParseOffset(const CTempString & sLine,Int4 & out_offset)864 bool CFeatureTableReader_Imp::x_TryToParseOffset(
865 const CTempString & sLine, Int4 & out_offset )
866 {
867 // offset strings are of the form [offset=SOME_NUMBER], but here we try
868 // to be as forgiving of whitespace as possible.
869
870 CTempString sKey;
871 CTempString sValue;
872 if( ! NStr::SplitInTwo(sLine, "=", sKey, sValue) ) {
873 // "=" not found
874 return false;
875 }
876
877 // check key
878 NStr::TruncateSpacesInPlace(sKey);
879 if( NStr::StartsWith(sKey, "[") ) {
880 sKey = sKey.substr(1); // remove initial "["
881 }
882 NStr::TruncateSpacesInPlace(sKey, NStr::eTrunc_Begin);
883 if( ! NStr::EqualNocase(sKey, "offset") ) {
884 // key is not offset
885 return false;
886 }
887
888 // check value
889 NStr::TruncateSpacesInPlace(sValue);
890 if( ! NStr::EndsWith(sValue, "]") ) {
891 // no closing bracket
892 return false;
893 }
894 // remove closing bracket
895 sValue = sValue.substr(0, (sValue.length() - 1) );
896 NStr::TruncateSpacesInPlace(sValue, NStr::eTrunc_End);
897 // is it a number?
898 try {
899 Int4 new_offset = NStr::StringToInt(sValue);
900 // if( new_offset < 0 ) {
901 // return false;
902 // }
903 out_offset = new_offset;
904 return true;
905 } catch ( CStringException & ) {
906 return false;
907 }
908 }
909
x_ParseFeatureTableLine(const CTempString & line,SFeatLocInfo & loc_info,string & featP,string & qualP,string & valP,Int4 offset)910 bool CFeatureTableReader_Imp::x_ParseFeatureTableLine (
911 const CTempString& line,
912 SFeatLocInfo& loc_info,
913 string& featP,
914 string& qualP,
915 string& valP,
916 Int4 offset
917 )
918
919 {
920 SIZE_TYPE numtkns;
921 bool isminus = false;
922 bool ispoint = false;
923 size_t len;
924 bool partial5 = false;
925 bool partial3 = false;
926 Int4 startv = -1;
927 Int4 stopv = -1;
928 Int4 swp;
929 string start, stop, feat, qual, val, stnd;
930 vector<string> tkns;
931
932
933 if (line.empty ()) return false;
934
935 /* offset and other instructions encoded in brackets */
936 if (NStr::StartsWith (line, '[')) return false;
937
938 tkns.clear ();
939 x_TokenizeLenient(line, tkns);
940 numtkns = tkns.size ();
941
942 if (numtkns > 0) {
943 start = NStr::TruncateSpaces(tkns[0]);
944 }
945 if (numtkns > 1) {
946 stop = NStr::TruncateSpaces(tkns[1]);
947 }
948 if (numtkns > 2) {
949 feat = NStr::TruncateSpaces(tkns[2]);
950 }
951 if (numtkns > 3) {
952 qual = NStr::TruncateSpaces(tkns[3]);
953 }
954 if (numtkns > 4) {
955 val = NStr::TruncateSpaces(tkns[4]);
956 // trim enclosing double-quotes
957 if( val.length() >= 2 && val[0] == '"' && val[val.length()-1] == '"' ) {
958 val = val.substr(1, val.length() - 2);
959 }
960 }
961 if (numtkns > 5) {
962 stnd = NStr::TruncateSpaces(tkns[5]);
963 }
964
965 bool has_start = false;
966 if (! start.empty ()) {
967 if (start [0] == '<') {
968 partial5 = true;
969 start.erase (0, 1);
970 }
971 len = start.length ();
972 if (len > 1 && start [len - 1] == '^') {
973 ispoint = true;
974 start [len - 1] = '\0';
975 }
976 startv = x_StringToLongNoThrow(start, feat, qual,
977 ILineError::eProblem_BadFeatureInterval);
978 has_start = true;
979 }
980
981 bool has_stop = false;
982 if (! stop.empty ()) {
983 if (stop [0] == '>') {
984 partial3 = true;
985 stop.erase (0, 1);
986 }
987 stopv = x_StringToLongNoThrow (stop, feat, qual,
988 ILineError::eProblem_BadFeatureInterval);
989 has_stop = true;
990 }
991
992 if ( startv <= 0 || stopv <= 0 ) {
993 startv = -1;
994 stopv = -1;
995 } else {
996 startv--;
997 stopv--;
998 if (! stnd.empty ()) {
999 if (stnd == "minus" || stnd == "-" || stnd == "complement") {
1000 if (start < stop) {
1001 swp = startv;
1002 startv = stopv;
1003 stopv = swp;
1004 }
1005 isminus = true;
1006 }
1007 }
1008 }
1009
1010 if (startv >= 0) {
1011 startv += offset;
1012 }
1013 if (stopv >= 0) {
1014 stopv += offset;
1015 }
1016
1017 if ((has_start && startv < 0) || (has_stop && stopv < 0)) {
1018 x_ProcessMsg(
1019 ILineError::eProblem_FeatureBadStartAndOrStop,
1020 eDiag_Error,
1021 feat);
1022 }
1023
1024 loc_info.start_pos = ( startv < 0 ? -1 : startv);
1025 loc_info.stop_pos = ( stopv < 0 ? -1 : stopv);
1026
1027 loc_info.is_5p_partial = partial5;
1028 loc_info.is_3p_partial = partial3;
1029 loc_info.is_point = ispoint;
1030 loc_info.is_minus_strand = isminus;
1031 featP = feat;
1032 qualP = qual;
1033 valP = val;
1034
1035 return true;
1036 }
1037
1038 /*
1039 bool CFeatureTableReader_Imp::x_ParseFeatureTableLine (
1040 const CTempString& line,
1041 Int4* startP,
1042 Int4* stopP,
1043 bool* partial5P,
1044 bool* partial3P,
1045 bool* ispointP,
1046 bool* isminusP,
1047 string& featP,
1048 string& qualP,
1049 string& valP,
1050 Int4 offset
1051 )
1052
1053 {
1054 SIZE_TYPE numtkns;
1055 bool isminus = false;
1056 bool ispoint = false;
1057 size_t len;
1058 bool partial5 = false;
1059 bool partial3 = false;
1060 Int4 startv = -1;
1061 Int4 stopv = -1;
1062 Int4 swp;
1063 string start, stop, feat, qual, val, stnd;
1064 vector<string> tkns;
1065
1066
1067 if (line.empty ()) return false;
1068
1069 if (NStr::StartsWith (line, '[')) return false;
1070
1071 tkns.clear ();
1072 x_TokenizeLenient(line, tkns);
1073 numtkns = tkns.size ();
1074
1075 if (numtkns > 0) {
1076 start = NStr::TruncateSpaces(tkns[0]);
1077 }
1078 if (numtkns > 1) {
1079 stop = NStr::TruncateSpaces(tkns[1]);
1080 }
1081 if (numtkns > 2) {
1082 feat = NStr::TruncateSpaces(tkns[2]);
1083 }
1084 if (numtkns > 3) {
1085 qual = NStr::TruncateSpaces(tkns[3]);
1086 }
1087 if (numtkns > 4) {
1088 val = NStr::TruncateSpaces(tkns[4]);
1089 // trim enclosing double-quotes
1090 if( val.length() >= 2 && val[0] == '"' && val[val.length()-1] == '"' ) {
1091 val = val.substr(1, val.length() - 2);
1092 }
1093 }
1094 if (numtkns > 5) {
1095 stnd = NStr::TruncateSpaces(tkns[5]);
1096 }
1097
1098 bool has_start = false;
1099 if (! start.empty ()) {
1100 if (start [0] == '<') {
1101 partial5 = true;
1102 start.erase (0, 1);
1103 }
1104 len = start.length ();
1105 if (len > 1 && start [len - 1] == '^') {
1106 ispoint = true;
1107 start [len - 1] = '\0';
1108 }
1109 startv = x_StringToLongNoThrow(start, feat, qual,
1110 ILineError::eProblem_BadFeatureInterval);
1111 has_start = true;
1112 }
1113
1114 bool has_stop = false;
1115 if (! stop.empty ()) {
1116 if (stop [0] == '>') {
1117 partial3 = true;
1118 stop.erase (0, 1);
1119 }
1120 stopv = x_StringToLongNoThrow (stop, feat, qual,
1121 ILineError::eProblem_BadFeatureInterval);
1122 has_stop = true;
1123 }
1124
1125 if ( startv <= 0 || stopv <= 0 ) {
1126 startv = -1;
1127 stopv = -1;
1128 } else {
1129 startv--;
1130 stopv--;
1131 if (! stnd.empty ()) {
1132 if (stnd == "minus" || stnd == "-" || stnd == "complement") {
1133 if (start < stop) {
1134 swp = startv;
1135 startv = stopv;
1136 stopv = swp;
1137 }
1138 isminus = true;
1139 }
1140 }
1141 }
1142
1143 if (startv >= 0) {
1144 startv += offset;
1145 }
1146 if (stopv >= 0) {
1147 stopv += offset;
1148 }
1149
1150 if ((has_start && startv < 0) || (has_stop && stopv < 0)) {
1151 x_ProcessMsg(
1152 ILineError::eProblem_FeatureBadStartAndOrStop,
1153 eDiag_Error,
1154 feat);
1155 }
1156
1157 *startP = ( startv < 0 ? -1 : startv);
1158 *stopP = ( stopv < 0 ? -1 : stopv);
1159
1160 *partial5P = partial5;
1161 *partial3P = partial3;
1162 *ispointP = ispoint;
1163 *isminusP = isminus;
1164 featP = feat;
1165 qualP = qual;
1166 valP = val;
1167
1168 return true;
1169 }
1170 */
1171
x_TokenizeStrict(const CTempString & line,vector<string> & out_tokens)1172 void CFeatureTableReader_Imp::x_TokenizeStrict(
1173 const CTempString &line,
1174 vector<string> &out_tokens )
1175 {
1176 out_tokens.clear();
1177
1178 // each token has spaces before it and a tab or end-of-line after it
1179 string::size_type startPosOfNextRoundOfTokenization = 0;
1180 while ( startPosOfNextRoundOfTokenization < line.size() ) {
1181 auto posAfterSpaces = line.find_first_not_of( " ", startPosOfNextRoundOfTokenization );
1182 if( posAfterSpaces == string::npos ) {
1183 return;
1184 }
1185
1186 string::size_type posOfTab = line.find( '\t', posAfterSpaces );
1187 if( posOfTab == string::npos ) {
1188 posOfTab = line.length();
1189 }
1190
1191 // The next token is between the spaces and the tab (or end of string)
1192 out_tokens.push_back(kEmptyStr);
1193 string &new_token = out_tokens.back();
1194 copy( line.begin() + posAfterSpaces, line.begin() + posOfTab, back_inserter(new_token) );
1195 NStr::TruncateSpacesInPlace( new_token );
1196
1197 startPosOfNextRoundOfTokenization = ( posOfTab + 1 );
1198 }
1199 }
1200
// since some compilers won't let me use isspace for find_if
//
// Predicate functor: returns true when `c` is a whitespace character
// according to isspace().
class CIsSpace {
public:
    bool operator()( char c ) const
    {
        // isspace() is only defined for EOF and values representable as
        // unsigned char, so convert first: plain char may be signed and
        // bytes >= 0x80 would otherwise be passed as negative numbers.
        return isspace( static_cast<unsigned char>(c) ) != 0;
    }
};
1206
// Predicate functor: returns true when `c` is NOT a whitespace character
// according to isspace().
class CIsNotSpace {
public:
    bool operator()( char c ) const
    {
        // isspace() is only defined for EOF and values representable as
        // unsigned char, so convert first: plain char may be signed and
        // bytes >= 0x80 would otherwise be passed as negative numbers.
        return isspace( static_cast<unsigned char>(c) ) == 0;
    }
};
1211
x_TokenizeLenient(const CTempString & line,vector<string> & out_tokens)1212 void CFeatureTableReader_Imp::x_TokenizeLenient(
1213 const CTempString &line,
1214 vector<string> &out_tokens )
1215 {
1216 out_tokens.clear();
1217
1218 if( line.empty() ) {
1219 return;
1220 }
1221
1222 // if it starts with whitespace, it must be a qual line, else it's a feature line
1223 if( isspace(line[0]) ) {
1224 // In regex form, we're doing something like this:
1225 // \s+(\S+)(\s+(\S.*))?
1226 // Where the first is the qual, and the rest is the val
1227 auto start_of_qual = find_if( line.begin(), line.end(), CIsNotSpace() );
1228 if( start_of_qual == line.end() ) {
1229 return;
1230 }
1231 auto start_of_whitespace_after_qual = find_if( start_of_qual, line.end(), CIsSpace() );
1232 auto start_of_val = find_if( start_of_whitespace_after_qual, line.end(), CIsNotSpace() );
1233
1234 // first 3 are empty
1235 out_tokens.push_back(kEmptyStr);
1236 out_tokens.push_back(kEmptyStr);
1237 out_tokens.push_back(kEmptyStr);
1238
1239 // then qual
1240 out_tokens.push_back(kEmptyStr);
1241 string &qual = out_tokens.back();
1242 copy( start_of_qual, start_of_whitespace_after_qual, back_inserter(qual) );
1243
1244 // then val
1245 if( start_of_val != line.end() ) {
1246 out_tokens.push_back(kEmptyStr);
1247 string &val = out_tokens.back();
1248 copy( start_of_val, line.end(), back_inserter(val) );
1249 NStr::TruncateSpacesInPlace( val );
1250 }
1251
1252 } else {
1253 // parse a feature line
1254
1255 // Since we're being lenient, we consider it to be 3 ( or 6 ) parts separated by whitespace
1256 auto first_column_start = line.begin();
1257 auto first_whitespace = find_if( first_column_start, line.end(), CIsSpace() );
1258 auto second_column_start = find_if( first_whitespace, line.end(), CIsNotSpace() );
1259 auto second_whitespace = find_if( second_column_start, line.end(), CIsSpace() );
1260 auto third_column_start = find_if( second_whitespace, line.end(), CIsNotSpace() );
1261 auto third_whitespace = find_if( third_column_start, line.end(), CIsSpace() );
1262 // columns 4 and 5 are unused on feature lines
1263 auto sixth_column_start = find_if( third_whitespace, line.end(), CIsNotSpace() );
1264 auto sixth_whitespace = find_if( sixth_column_start, line.end(), CIsSpace() );
1265
1266 out_tokens.push_back(kEmptyStr);
1267 string &first = out_tokens.back();
1268 copy( first_column_start, first_whitespace, back_inserter(first) );
1269
1270 out_tokens.push_back(kEmptyStr);
1271 string &second = out_tokens.back();
1272 copy( second_column_start, second_whitespace, back_inserter(second) );
1273
1274 out_tokens.push_back(kEmptyStr);
1275 string &third = out_tokens.back();
1276 copy( third_column_start, third_whitespace, back_inserter(third) );
1277
1278 if( sixth_column_start != line.end() ) {
1279 // columns 4 and 5 are unused
1280 out_tokens.push_back(kEmptyStr);
1281 out_tokens.push_back(kEmptyStr);
1282
1283 out_tokens.push_back(kEmptyStr);
1284 string &sixth = out_tokens.back();
1285 copy( sixth_column_start, sixth_whitespace, back_inserter(sixth) );
1286 }
1287 }
1288 }
1289
1290
x_AddQualifierToGene(CSeqFeatData & sfdata,EQual qtype,const string & val)1291 bool CFeatureTableReader_Imp::x_AddQualifierToGene (
1292 CSeqFeatData& sfdata,
1293 EQual qtype,
1294 const string& val
1295 )
1296
1297 {
1298 CGene_ref& grp = sfdata.SetGene ();
1299 switch (qtype) {
1300 case eQual_gene:
1301 grp.SetLocus (val);
1302 return true;
1303 case eQual_allele:
1304 grp.SetAllele (val);
1305 return true;
1306 case eQual_gene_desc:
1307 grp.SetDesc (val);
1308 return true;
1309 case eQual_gene_syn:
1310 {
1311 CGene_ref::TSyn& syn = grp.SetSyn ();
1312 syn.push_back (val);
1313 return true;
1314 }
1315 case eQual_map:
1316 grp.SetMaploc (val);
1317 return true;
1318 case eQual_locus_tag:
1319 grp.SetLocus_tag (val);
1320 return true;
1321 case eQual_nomenclature:
1322 /* !!! need to implement !!! */
1323 return true;
1324 default:
1325 break;
1326 }
1327 return false;
1328 }
1329
1330
x_AddQualifierToCdregion(CRef<CSeq_feat> sfp,CSeqFeatData & sfdata,EQual qtype,const string & val)1331 bool CFeatureTableReader_Imp::x_AddQualifierToCdregion (
1332 CRef<CSeq_feat> sfp,
1333 CSeqFeatData& sfdata,
1334 EQual qtype, const string& val
1335 )
1336
1337 {
1338 CCdregion& crp = sfdata.SetCdregion ();
1339 switch (qtype) {
1340 case eQual_codon_start:
1341 {
1342 int frame = x_StringToLongNoThrow (val, kCdsFeatName, "codon_start");
1343 switch (frame) {
1344 case 0:
1345 crp.SetFrame (CCdregion::eFrame_not_set);
1346 break;
1347 case 1:
1348 crp.SetFrame (CCdregion::eFrame_one);
1349 break;
1350 case 2:
1351 crp.SetFrame (CCdregion::eFrame_two);
1352 break;
1353 case 3:
1354 crp.SetFrame (CCdregion::eFrame_three);
1355 break;
1356 default:
1357 break;
1358 }
1359 return true;
1360 }
1361 case eQual_EC_number:
1362 {
1363 CProt_ref& prp = sfp->SetProtXref ();
1364 CProt_ref::TEc& ec = prp.SetEc ();
1365 ec.push_back (val);
1366 return true;
1367 }
1368 case eQual_function:
1369 {
1370 CProt_ref& prp = sfp->SetProtXref ();
1371 CProt_ref::TActivity& fun = prp.SetActivity ();
1372 fun.push_back (val);
1373 return true;
1374 }
1375 case eQual_product:
1376 {
1377 CProt_ref& prp = sfp->SetProtXref ();
1378 CProt_ref::TName& prod = prp.SetName ();
1379 prod.push_back (val);
1380 return true;
1381 }
1382 case eQual_prot_desc:
1383 {
1384 CProt_ref& prp = sfp->SetProtXref ();
1385 prp.SetDesc (val);
1386 return true;
1387 }
1388 case eQual_prot_note:
1389 return x_AddGBQualToFeature(sfp, "prot_note", val);
1390 case eQual_transl_except:
1391 // add as GBQual, let cleanup convert to code_break
1392 return x_AddGBQualToFeature(sfp, "transl_except", val);
1393 case eQual_translation:
1394 // we should accept, but ignore this qual on CDSs.
1395 // so, do nothing but return success
1396 return true;
1397 case eQual_transl_table:
1398 // set genetic code directly, or add qualifier and let cleanup convert?
1399 try {
1400 int num = NStr::StringToLong(val);
1401 CGen_code_table::GetTransTable(num); // throws if bad num
1402 CRef<CGenetic_code::C_E> code(new CGenetic_code::C_E());
1403 code->SetId(num);
1404 crp.SetCode().Set().push_back(code);
1405 return true;
1406 } catch( CStringException ) {
1407 // if val is not a number, add qualifier directly and
1408 // let cleanup convert?
1409 return x_AddGBQualToFeature(sfp, "transl_table", val);
1410 } catch( ... ) {
1411 // invalid genome code table so don't even try to make
1412 // the transl_table qual
1413 x_ProcessMsg(
1414 ILineError::eProblem_QualifierBadValue, eDiag_Error,
1415 kCdsFeatName, "transl_table", val);
1416 return true;
1417 }
1418 break;
1419
1420 default:
1421 break;
1422 }
1423 return false;
1424 }
1425
1426
x_StringIsJustQuotes(const string & str)1427 bool CFeatureTableReader_Imp::x_StringIsJustQuotes (
1428 const string& str
1429 )
1430
1431 {
1432 ITERATE (string, it, str) {
1433 char ch = *it;
1434 if (ch > ' ' && ch != '"' && ch != '\'') return false;
1435 }
1436
1437 return true;
1438 }
1439
1440 static bool
s_LineIndicatesOrder(const CTempString & line)1441 s_LineIndicatesOrder( const CTempString & line )
1442 {
1443 // basically, this is true if the line starts with "order" (whitespaces disregarded)
1444
1445 const static char* kOrder = "ORDER";
1446
1447 // find first non-whitespace character
1448 string::size_type pos = 0;
1449 for( ; pos < line.length() && isspace(line[pos]); ++pos) {
1450 // nothing to do here
1451 }
1452
1453 // line is all whitespace
1454 if( pos >= line.length() ) {
1455 return false;
1456 }
1457
1458 // check if starts with "order" after whitespace
1459 return ( 0 == NStr::CompareNocase( line, pos, strlen(kOrder), kOrder ) );
1460 }
1461
1462 // Turns a "join" location into an "order" by putting nulls between it
1463 // Returns an unset CRef if the loc doesn't need nulls (e.g. if it's just an interval)
1464 static CRef<CSeq_loc>
s_LocationJoinToOrder(const CSeq_loc & loc)1465 s_LocationJoinToOrder( const CSeq_loc & loc )
1466 {
1467 // create result we're returning
1468 CRef<CSeq_loc> result( new CSeq_loc );
1469 CSeq_loc_mix::Tdata & mix_pieces = result->SetMix().Set();
1470
1471 // keep this around for whenever we need a "null" piece
1472 CRef<CSeq_loc> loc_piece_null( new CSeq_loc );
1473 loc_piece_null->SetNull();
1474
1475 // push pieces of source, with NULLs between
1476 CSeq_loc_CI loc_iter( loc );
1477 for( ; loc_iter; ++loc_iter ) {
1478 if( ! mix_pieces.empty() ) {
1479 mix_pieces.push_back( loc_piece_null );
1480 }
1481 CRef<CSeq_loc> new_piece( new CSeq_loc );
1482 new_piece->Assign( loc_iter.GetEmbeddingSeq_loc() );
1483 mix_pieces.push_back( new_piece );
1484 }
1485
1486 // Only wrap in "mix" if there was more than one piece
1487 if( mix_pieces.size() > 1 ) {
1488 return result;
1489 } else {
1490 return CRef<CSeq_loc>();
1491 }
1492 }
1493
1494
// Reduces a tRNA product string to the text that names the amino acid.
//
// A leading "tRNA-" (case-sensitive) is dropped.  If one of the characters
// "-,;:()='_~" then occurs, the text is cut in front of the first such
// character and surrounding whitespace is removed; when none occurs the
// remaining text is returned unchanged.
//
// @param val  product value as read from the table
// @return the extracted amino-acid text.
string CFeatureTableReader_Imp::x_TrnaToAaString(
    const string& val
)
{
    CTempString aa_text(val);

    if (NStr::StartsWith(aa_text, "tRNA-")) {
        aa_text = aa_text.substr(strlen("tRNA-"));
    }

    const CTempString::size_type delim_pos = aa_text.find_first_of("-,;:()=\'_~");
    if (delim_pos != CTempString::npos) {
        aa_text = aa_text.substr(0, delim_pos);
        NStr::TruncateSpacesInPlace(aa_text);
    }

    return string(aa_text);
}
1513
1514
1515 bool
x_ParseTrnaExtString(CTrna_ext & ext_trna,const string & str)1516 CFeatureTableReader_Imp::x_ParseTrnaExtString(CTrna_ext & ext_trna, const string & str)
1517 {
1518 if (NStr::IsBlank (str)) return false;
1519
1520 string normalized_string = str;
1521 normalized_string.erase(
1522 remove_if(begin(normalized_string),
1523 end(normalized_string),
1524 [](char c) { return isspace(c);}),
1525 end(normalized_string));
1526
1527 if ( NStr::StartsWith(normalized_string, "(pos:") ) {
1528 // find position of closing paren
1529 string::size_type pos_end = x_MatchingParenPos( normalized_string, 0 );
1530 if (pos_end != string::npos) {
1531 string pos_str = normalized_string.substr (5, pos_end - 5);
1532 string::size_type aa_start = NStr::FindNoCase(pos_str, "aa:");
1533 if (aa_start != string::npos) {
1534 auto seq_start = NStr::FindNoCase(pos_str, ",seq:");
1535 if (seq_start != string::npos &&
1536 seq_start < aa_start+3) {
1537 return false;
1538 }
1539
1540 size_t aa_length = (seq_start == NPOS) ?
1541 pos_str.size() - (aa_start+3) :
1542 seq_start - (aa_start+3);
1543
1544 string abbrev = pos_str.substr (aa_start + 3, aa_length);
1545 //TTrnaMap::const_iterator
1546 auto t_iter = sm_TrnaKeys.find (abbrev.c_str ());
1547 if (t_iter == sm_TrnaKeys.end ()) {
1548 // unable to parse
1549 return false;
1550 }
1551 CRef<CTrna_ext::TAa> aa(new CTrna_ext::TAa);
1552 aa->SetNcbieaa (t_iter->second);
1553 ext_trna.SetAa(*aa);
1554 pos_str = pos_str.substr (0, aa_start);
1555 NStr::TruncateSpacesInPlace (pos_str);
1556 if (NStr::EndsWith (pos_str, ",")) {
1557 pos_str = pos_str.substr (0, pos_str.length() - 1);
1558 }
1559 }
1560 CGetSeqLocFromStringHelper helper;
1561 CRef<CSeq_loc> anticodon = GetSeqLocFromString (pos_str, m_seq_id, & helper);
1562 if (anticodon == NULL) {
1563 ext_trna.ResetAa();
1564 return false;
1565 } else {
1566 switch( anticodon->GetStrand() ) {
1567 case eNa_strand_unknown:
1568 case eNa_strand_plus:
1569 case eNa_strand_minus:
1570 ext_trna.SetAnticodon(*anticodon);
1571 return true;
1572 default:
1573 ext_trna.ResetAa();
1574 return false;
1575 }
1576 }
1577 }
1578 }
1579
1580 return false;
1581 }
1582
1583
x_MatchingParenPos(const string & str,SIZE_TYPE open_paren_pos)1584 SIZE_TYPE CFeatureTableReader_Imp::x_MatchingParenPos(
1585 const string &str, SIZE_TYPE open_paren_pos )
1586 {
1587 _ASSERT( str[open_paren_pos] == '(' );
1588 _ASSERT( open_paren_pos < str.length() );
1589
1590 // nesting level. start at 1 since we know there's an open paren
1591 int level = 1;
1592
1593 SIZE_TYPE pos = open_paren_pos + 1;
1594 for( ; pos < str.length(); ++pos ) {
1595 switch( str[pos] ) {
1596 case '(':
1597 // nesting deeper
1598 ++level;
1599 break;
1600 case ')':
1601 // closed a level of nesting
1602 --level;
1603 if( 0 == level ) {
1604 // reached the top: we're closing the initial paren,
1605 // so we return our position
1606 return pos;
1607 }
1608 break;
1609 default:
1610 // ignore other characters.
1611 // maybe in the future we'll handle ignoring parens in quotes or
1612 // things like that.
1613 break;
1614 }
1615 }
1616 return NPOS;
1617 }
1618
x_StringToLongNoThrow(CTempString strToConvert,CTempString strFeatureName,CTempString strQualifierName,ILineError::EProblem eProblem)1619 long CFeatureTableReader_Imp::x_StringToLongNoThrow (
1620 CTempString strToConvert,
1621 CTempString strFeatureName,
1622 CTempString strQualifierName,
1623 ILineError::EProblem eProblem
1624 )
1625 {
1626 try {
1627 return NStr::StringToLong(strToConvert);
1628 } catch( ... ) {
1629 // See if we start with a number, but there's extra junk after it, try again
1630 if( ! strToConvert.empty() && isdigit(strToConvert[0]) ) {
1631 try {
1632 long result = NStr::StringToLong(strToConvert, NStr::fAllowTrailingSymbols);
1633
1634 ILineError::EProblem problem =
1635 ILineError::eProblem_NumericQualifierValueHasExtraTrailingCharacters;
1636 if( eProblem != ILineError::eProblem_Unset ) {
1637 problem = eProblem;
1638 }
1639
1640 x_ProcessMsg(
1641 problem,
1642 eDiag_Warning,
1643 strFeatureName, strQualifierName, strToConvert );
1644 return result;
1645 } catch( ... ) { } // fall-thru to usual handling
1646 }
1647
1648 ILineError::EProblem problem =
1649 ILineError::eProblem_NumericQualifierValueIsNotANumber;
1650 if( eProblem != ILineError::eProblem_Unset ) {
1651 problem = eProblem;
1652 }
1653
1654 x_ProcessMsg(
1655 problem,
1656 eDiag_Warning,
1657 strFeatureName, strQualifierName, strToConvert );
1658 // we have no idea, so just return zero
1659 return 0;
1660 }
1661 }
1662
1663
x_AddQualifierToRna(CRef<CSeq_feat> sfp,EQual qtype,const string & val)1664 bool CFeatureTableReader_Imp::x_AddQualifierToRna (
1665 CRef<CSeq_feat> sfp,
1666 EQual qtype,
1667 const string& val
1668 )
1669 {
1670 CSeqFeatData& sfdata = sfp->SetData();
1671 CRNA_ref& rrp = sfdata.SetRna ();
1672 CRNA_ref::EType rnatyp = rrp.GetType ();
1673 switch (rnatyp) {
1674 case CRNA_ref::eType_premsg:
1675 case CRNA_ref::eType_mRNA:
1676 case CRNA_ref::eType_rRNA:
1677 switch (qtype) {
1678 case eQual_product:
1679 {
1680 CRNA_ref::TExt& tex = rrp.SetExt ();
1681 CRNA_ref::C_Ext::E_Choice exttype = tex.Which ();
1682 if (exttype == CRNA_ref::C_Ext::e_TRNA) return false;
1683 tex.SetName (val);
1684 return true;
1685 }
1686 default:
1687 break;
1688 }
1689 break;
1690 case CRNA_ref::eType_ncRNA:
1691 switch (qtype) {
1692 case eQual_product:
1693 rrp.SetExt().SetGen().SetProduct(val);
1694 return true;
1695 break;
1696 case eQual_ncRNA_class:
1697 rrp.SetExt().SetGen().SetClass(val);
1698 return true;
1699 break;
1700 default:
1701 break;
1702 }
1703 break;
1704 case CRNA_ref::eType_tmRNA:
1705 switch (qtype) {
1706 case eQual_product:
1707 rrp.SetExt().SetGen().SetProduct(val);
1708 return true;
1709 case eQual_tag_peptide:
1710 {
1711 CRef<CRNA_qual> q(new CRNA_qual());
1712 q->SetQual("tag_peptide");
1713 q->SetVal(val);
1714 rrp.SetExt().SetGen().SetQuals().Set().push_back(q);
1715 return true;
1716 }
1717 break;
1718 default:
1719 break;
1720 }
1721 break;
1722 case CRNA_ref::eType_snRNA:
1723 case CRNA_ref::eType_scRNA:
1724 case CRNA_ref::eType_snoRNA:
1725 case CRNA_ref::eType_other:
1726 return false;
1727 case CRNA_ref::eType_tRNA:
1728 switch (qtype) {
1729 case eQual_product: {
1730 if (rrp.IsSetExt() && rrp.GetExt().Which() == CRNA_ref::C_Ext::e_Name)
1731 return false;
1732
1733 const string& aa_string = x_TrnaToAaString(val);
1734 const auto aaval_it = sm_TrnaKeys.find(aa_string.c_str());
1735
1736 if (aaval_it != sm_TrnaKeys.end()) {
1737 CRNA_ref::TExt& tex = rrp.SetExt ();
1738 CTrna_ext& trx = tex.SetTRNA();
1739 CTrna_ext::TAa& taa = trx.SetAa();
1740 taa.SetNcbieaa(aaval_it->second);
1741 if (aa_string == "fMet" ||
1742 aa_string == "iMet" ||
1743 aa_string == "Ile2") {
1744 x_AddGBQualToFeature(sfp, "product", val);
1745 }
1746 }
1747 else {
1748 x_ProcessMsg(
1749 ILineError::eProblem_QualifierBadValue, eDiag_Warning,
1750 "tRNA", "product", val);
1751 }
1752 return true;
1753 }
1754 break;
1755 case eQual_anticodon:
1756 {
1757 CRNA_ref::TExt& tex = rrp.SetExt ();
1758 CRNA_ref::C_Ext::TTRNA & ext_trna = tex.SetTRNA();
1759 if( ! x_ParseTrnaExtString(ext_trna, val) ) {
1760 x_ProcessMsg(
1761 ILineError::eProblem_QualifierBadValue, eDiag_Error,
1762 "tRNA", "anticodon", val );
1763 }
1764 return true;
1765 }
1766 break;
1767 case eQual_codon_recognized:
1768 {
1769 //const auto codon_index = CGen_code_table::CodonToIndex(val);
1770 //if (codon_index >= 0) {
1771 CRNA_ref::TExt& tex = rrp.SetExt ();
1772 CRNA_ref::C_Ext::TTRNA & ext_trna = tex.SetTRNA();
1773 if (!x_AddCodons(val, ext_trna)) {
1774 return false;
1775 }
1776 //}
1777 return true;
1778 }
1779 break;
1780 default:
1781 break;
1782 }
1783 break;
1784 default:
1785 break;
1786 }
1787 return false;
1788 }
1789
1790
x_AddCodons(const string & val,CTrna_ext & trna_ext) const1791 bool CFeatureTableReader_Imp::x_AddCodons(
1792 const string& val,
1793 CTrna_ext& trna_ext
1794 ) const
1795 {
1796 if (val.size() != 3) {
1797 return false;
1798 }
1799
1800 set<int> codons;
1801 try {
1802 for (char char1 : s_IUPACmap.at(val[0])) {
1803 for (char char2 : s_IUPACmap.at(val[1])) {
1804 for (char char3 : s_IUPACmap.at(val[2])) {
1805 const auto codon_index = CGen_code_table::CodonToIndex(char1, char2, char3);
1806 codons.insert(codon_index);
1807 }
1808 }
1809 }
1810
1811 if (!codons.empty()) {
1812 trna_ext.SetAa().SetNcbieaa();
1813 for (const auto codon_index : codons) {
1814 trna_ext.SetCodon().push_back(codon_index);
1815 }
1816 }
1817 return true;
1818 }
1819 catch(...) {}
1820
1821 return false;
1822 }
1823
1824
x_AddQualifierToImp(CRef<CSeq_feat> sfp,CSeqFeatData & sfdata,EQual qtype,const string & qual,const string & val)1825 bool CFeatureTableReader_Imp::x_AddQualifierToImp (
1826 CRef<CSeq_feat> sfp,
1827 CSeqFeatData& sfdata,
1828 EQual qtype,
1829 const string& qual,
1830 const string& val
1831 )
1832
1833 {
1834 const char *str = NULL;
1835
1836 CSeqFeatData::ESubtype subtype = sfdata.GetSubtype ();
1837
1838 // used if-statement because CSeqFeatData::IsRegulatory won't work in a
1839 // switch statement.
1840 if( (subtype == CSeqFeatData::eSubtype_regulatory) ||
1841 CSeqFeatData::IsRegulatory(subtype) )
1842 {
1843 if (qtype == eQual_regulatory_class) {
1844 if (val != "other") { // RW-374 "other" is a special case
1845
1846 const vector<string>& allowed_values =
1847 CSeqFeatData::GetRegulatoryClassList();
1848 if (find(allowed_values.cbegin(), allowed_values.cend(), val)
1849 == allowed_values.cend()) {
1850 return false;
1851 }
1852
1853 /*
1854 const CSeqFeatData::ESubtype regulatory_class_subtype =
1855 CSeqFeatData::GetRegulatoryClass(val);
1856 if( regulatory_class_subtype == CSeqFeatData::eSubtype_bad ) {
1857 // msg will be sent in caller x_AddQualifierToFeature
1858 return false;
1859 }
1860 */
1861 }
1862 // okay
1863 // (Note that at this time we don't validate
1864 // if the regulatory_class actually matches the
1865 // subtype)
1866 x_AddGBQualToFeature(sfp, qual, val);
1867 return true;
1868 }
1869 }
1870
1871 switch (subtype) {
1872 case CSeqFeatData::eSubtype_variation:
1873 {
1874 switch (qtype) {
1875 case eQual_chrcnt:
1876 case eQual_ctgcnt:
1877 case eQual_loccnt:
1878 case eQual_snp_class:
1879 case eQual_snp_gtype:
1880 case eQual_snp_het:
1881 case eQual_snp_het_se:
1882 case eQual_snp_linkout:
1883 case eQual_snp_maxrate:
1884 case eQual_snp_valid:
1885 case eQual_weight:
1886 str = "dbSnpSynonymyData";
1887 break;
1888 default:
1889 break;
1890 }
1891 }
1892 break;
1893 case CSeqFeatData::eSubtype_STS:
1894 {
1895 switch (qtype) {
1896 case eQual_sts_aliases:
1897 case eQual_sts_dsegs:
1898 case eQual_weight:
1899 str = "stsUserObject";
1900 break;
1901 default:
1902 break;
1903 }
1904 }
1905 break;
1906 case CSeqFeatData::eSubtype_misc_feature:
1907 {
1908 switch (qtype) {
1909 case eQual_bac_ends:
1910 case eQual_clone_id:
1911 case eQual_method:
1912 case eQual_sequence:
1913 case eQual_STS:
1914 case eQual_weight:
1915 str = "cloneUserObject";
1916 break;
1917 default:
1918 break;
1919 }
1920 }
1921 break;
1922 default:
1923 break;
1924 }
1925
1926 if( NULL != str ) {
1927 CSeq_feat::TExt& ext = sfp->SetExt ();
1928 CObject_id& obj = ext.SetType ();
1929 if ((! obj.IsStr ()) || obj.GetStr ().empty ()) {
1930 obj.SetStr ();
1931 }
1932 ext.AddField (qual, val, CUser_object::eParse_Number);
1933 return true;
1934 }
1935
1936 return false;
1937 }
1938
1939
x_AddQualifierToBioSrc(CSeqFeatData & sfdata,const string & feat_name,EOrgRef rtype,const string & val)1940 bool CFeatureTableReader_Imp::x_AddQualifierToBioSrc (
1941 CSeqFeatData& sfdata,
1942 const string &feat_name,
1943 EOrgRef rtype,
1944 const string& val
1945 )
1946 {
1947 CBioSource& bsp = sfdata.SetBiosrc ();
1948
1949 switch (rtype) {
1950 case eOrgRef_organism:
1951 {
1952 CBioSource::TOrg& orp = bsp.SetOrg ();
1953 orp.SetTaxname (val);
1954 return true;
1955 }
1956 case eOrgRef_organelle:
1957 {
1958 TGenomeMap::const_iterator g_iter = sm_GenomeKeys.find (val.c_str ());
1959 if (g_iter != sm_GenomeKeys.end ()) {
1960 CBioSource::EGenome gtype = g_iter->second;
1961 bsp.SetGenome (gtype);
1962 } else {
1963 x_ProcessMsg(
1964 ILineError::eProblem_QualifierBadValue, eDiag_Error,
1965 feat_name, "organelle", val );
1966 }
1967 return true;
1968 }
1969 case eOrgRef_div:
1970 {
1971 CBioSource::TOrg& orp = bsp.SetOrg ();
1972 COrg_ref::TOrgname& onp = orp.SetOrgname ();
1973 onp.SetDiv (val);
1974 return true;
1975 }
1976 case eOrgRef_lineage:
1977 {
1978 CBioSource::TOrg& orp = bsp.SetOrg ();
1979 COrg_ref::TOrgname& onp = orp.SetOrgname ();
1980 onp.SetLineage (val);
1981 return true;
1982 }
1983 case eOrgRef_gcode:
1984 {
1985 CBioSource::TOrg& orp = bsp.SetOrg ();
1986 COrg_ref::TOrgname& onp = orp.SetOrgname ();
1987 int code = x_StringToLongNoThrow (val, feat_name, "gcode");
1988 onp.SetGcode (code);
1989 return true;
1990 }
1991 case eOrgRef_mgcode:
1992 {
1993 CBioSource::TOrg& orp = bsp.SetOrg ();
1994 COrg_ref::TOrgname& onp = orp.SetOrgname ();
1995 int code = x_StringToLongNoThrow (val, feat_name, "mgcode");
1996 onp.SetMgcode (code);
1997 return true;
1998 }
1999 default:
2000 break;
2001 }
2002 return false;
2003 }
2004
2005
x_AddQualifierToBioSrc(CSeqFeatData & sfdata,CSubSource::ESubtype stype,const string & val)2006 bool CFeatureTableReader_Imp::x_AddQualifierToBioSrc (
2007 CSeqFeatData& sfdata,
2008 CSubSource::ESubtype stype,
2009 const string& val
2010 )
2011
2012 {
2013 CBioSource& bsp = sfdata.SetBiosrc ();
2014 CBioSource::TSubtype& slist = bsp.SetSubtype ();
2015 CRef<CSubSource> ssp (new CSubSource);
2016 ssp->SetSubtype (stype);
2017 ssp->SetName (val);
2018 slist.push_back (ssp);
2019 return true;
2020 }
2021
2022
x_AddQualifierToBioSrc(CSeqFeatData & sfdata,COrgMod::ESubtype mtype,const string & val)2023 bool CFeatureTableReader_Imp::x_AddQualifierToBioSrc (
2024 CSeqFeatData& sfdata,
2025 COrgMod::ESubtype mtype,
2026 const string& val
2027 )
2028
2029 {
2030 CBioSource& bsp = sfdata.SetBiosrc ();
2031 CBioSource::TOrg& orp = bsp.SetOrg ();
2032 COrg_ref::TOrgname& onp = orp.SetOrgname ();
2033 COrgName::TMod& mlist = onp.SetMod ();
2034 CRef<COrgMod> omp (new COrgMod);
2035 omp->SetSubtype (mtype);
2036 omp->SetSubname (val);
2037 mlist.push_back (omp);
2038 return true;
2039 }
2040
2041
x_AddGBQualToFeature(CRef<CSeq_feat> sfp,const string & qual,const string & val)2042 bool CFeatureTableReader_Imp::x_AddGBQualToFeature (
2043 CRef<CSeq_feat> sfp,
2044 const string& qual,
2045 const string& val
2046 )
2047
2048 {
2049 if (qual.empty ()) return false;
2050
2051 // need this pointer because references can't be repointed
2052 CTempString normalized_qual = qual;
2053
2054 // normalize qual if needed, especially regarding case, and
2055 // use as-is if no normalization applies
2056 auto qual_type = CSeqFeatData::GetQualifierType(qual);
2057 if( qual_type != CSeqFeatData::eQual_bad ) {
2058 // swap is constant time
2059 CTempString potential_normalized_qual = CSeqFeatData::GetQualifierAsString(qual_type);
2060 if( ! potential_normalized_qual.empty() ) {
2061 normalized_qual = potential_normalized_qual;
2062 }
2063 }
2064
2065 auto& qlist = sfp->SetQual ();
2066 CRef<CGb_qual> gbq (new CGb_qual);
2067 gbq->SetQual() = normalized_qual;
2068 if (x_StringIsJustQuotes (val)) {
2069 gbq->SetVal() = kEmptyStr;
2070 } else {
2071 gbq->SetVal() = val;
2072 }
2073 qlist.push_back (gbq);
2074
2075 return true;
2076 }
2077
2078
/// Reconcile CDS features with the genes they cross-reference.
///
/// For every CDS in @p choiceToFeatMap that carries a gene xref, the genes
/// already known (from the map, plus any created earlier by this call) are
/// searched by locus and/or locus-tag. Depending on @p flags:
///  - fCDSsMustBeInTheirGenes: an error is reported for each matching gene
///    whose location does not fully cover the CDS location;
///  - fCreateGenesFromCDSs: when no gene matches, a new gene is built from
///    the xref and the CDS location, appended to the feature table of
///    @p sap, and registered in @p choiceToFeatMap.
///
/// @param sap              annotation holding the feature table
/// @param choiceToFeatMap  features (with line numbers) keyed by data choice
/// @param flags            CFeature_table_reader flag bits
void CFeatureTableReader_Imp::x_CreateGenesFromCDSs(
    CRef<CSeq_annot> sap,
    TChoiceToFeatMap & choiceToFeatMap,
    const TFlags flags)
{
    // load cds_equal_range to hold the CDSs
    typedef TChoiceToFeatMap::iterator TChoiceCI;
    typedef pair<TChoiceCI, TChoiceCI> TChoiceEqualRange;
    TChoiceEqualRange cds_equal_range =
        choiceToFeatMap.equal_range(CSeqFeatData::e_Cdregion);
    if( cds_equal_range.first == cds_equal_range.second )
    {
        // nothing to do if there are no CDSs
        return;
    }

    // load mappings from locus or locus-tag to gene
    typedef multimap<string, SFeatAndLineNum> TStringToGeneAndLineMap;
    TStringToGeneAndLineMap locusToGeneAndLineMap;
    TStringToGeneAndLineMap locusTagToGeneAndLineMap;
    const TChoiceEqualRange gene_equal_range =
        choiceToFeatMap.equal_range(CSeqFeatData::e_Gene);
    for( TChoiceCI gene_choice_ci = gene_equal_range.first;
        gene_choice_ci != gene_equal_range.second;
        ++gene_choice_ci )
    {
        SFeatAndLineNum gene_feat_ref_and_line = gene_choice_ci->second;
        const CGene_ref & gene_ref = gene_feat_ref_and_line.m_pFeat->GetData().GetGene();
        // a gene is indexed under each of the two keys it actually has
        if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(gene_ref, Locus) ) {
            locusToGeneAndLineMap.insert(
                TStringToGeneAndLineMap::value_type(
                gene_ref.GetLocus(), gene_feat_ref_and_line));
        }
        if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(gene_ref, Locus_tag) ) {
            locusTagToGeneAndLineMap.insert(
                TStringToGeneAndLineMap::value_type(
                gene_ref.GetLocus_tag(), gene_feat_ref_and_line));
        }
    }

    // for each CDS, check for gene conflicts or create genes,
    // depending on various flags
    for( TChoiceCI cds_choice_ci = cds_equal_range.first;
        cds_choice_ci != cds_equal_range.second ; ++cds_choice_ci)
    {
        TFeatConstRef cds_feat_ref = cds_choice_ci->second.m_pFeat;
        const TSeqPos cds_line_num = cds_choice_ci->second.m_uLineNum;

        const CSeq_loc & cds_loc = cds_feat_ref->GetLocation();

        const CGene_ref * pGeneXrefOnCDS = cds_feat_ref->GetGeneXref();
        if( ! pGeneXrefOnCDS ) {
            // no xref, so can't do anything for this CDS
            // (this is NOT an error)
            continue;
        }

        // get all the already-existing genes that
        // this CDS xrefs.  It should be somewhat uncommon for there
        // to be more than one matching gene.
        set<SFeatAndLineNum> matchingGenes;

        // an unset locus / locus-tag on the xref is treated as empty
        const string locus =
            pGeneXrefOnCDS->IsSetLocus() ?
            pGeneXrefOnCDS->GetLocus() :
            "";

        const string locus_tag =
            pGeneXrefOnCDS->IsSetLocus_tag() ?
            pGeneXrefOnCDS->GetLocus_tag() :
            "";


        {{
            // all the code in this scope is all just for setting up matchingGenes

            typedef TStringToGeneAndLineMap::iterator TStrToGeneCI;
            typedef pair<TStrToGeneCI, TStrToGeneCI> TStrToGeneEqualRange;
            set<SFeatAndLineNum> locusGeneMatches;
            // collect the genes whose locus matches the xref's locus (if any)
            if( !NStr::IsBlank(locus) ) {
                TStrToGeneEqualRange locus_equal_range =
                    locusToGeneAndLineMap.equal_range(locus);
                for( TStrToGeneCI locus_gene_ci = locus_equal_range.first;
                    locus_gene_ci != locus_equal_range.second;
                    ++locus_gene_ci )
                {
                    // skip a gene that has a locus-tag which differs from
                    // the one on the xref
                    if (!NStr::IsBlank(locus_tag)) {
                        auto gene_feat = locus_gene_ci->second.m_pFeat;
                        if (gene_feat->GetData().GetGene().IsSetLocus_tag() &&
                            gene_feat->GetData().GetGene().GetLocus_tag() != locus_tag) {
                            continue;
                        }
                    }
                    locusGeneMatches.insert(locus_gene_ci->second);
                }
            }
            // collect the genes whose locus-tag matches the xref's
            // locus-tag (if any)
            set<SFeatAndLineNum> locusTagGeneMatches;
            if( !NStr::IsBlank(locus_tag) ) {
                TStrToGeneEqualRange locus_tag_equal_range =
                    locusTagToGeneAndLineMap.equal_range(locus_tag);
                for( TStrToGeneCI locus_tag_gene_ci = locus_tag_equal_range.first;
                     locus_tag_gene_ci != locus_tag_equal_range.second;
                     ++locus_tag_gene_ci )
                {
                    // skip a gene that has a locus which differs from
                    // the one on the xref
                    if (!NStr::IsBlank(locus)) {
                        auto gene_feat = locus_tag_gene_ci->second.m_pFeat;
                        if (gene_feat->GetData().GetGene().IsSetLocus() &&
                            gene_feat->GetData().GetGene().GetLocus() != locus) {
                            continue;
                        }
                    }
                    locusTagGeneMatches.insert(locus_tag_gene_ci->second);
                }
            }
            // analyze locusGeneMatches and locusTagGeneMatches to find matchingGenes.
            if( locusGeneMatches.empty() ) {
                // swap is faster than assignment
                matchingGenes.swap(locusTagGeneMatches);
            } else if( locusTagGeneMatches.empty() ) {
                // swap is faster than assignment
                matchingGenes.swap(locusGeneMatches);
            } else {
                // get only the genes that match both (that is, the intersection)
                set_intersection(
                    locusGeneMatches.begin(), locusGeneMatches.end(),
                    locusTagGeneMatches.begin(), locusTagGeneMatches.end(),
                    inserter(matchingGenes, matchingGenes.begin()));
            }
        }}

        // if requested, check that the genes really do contain the CDS
        // (also check if we're trying to create a gene that already exists)

        ITERATE(set<SFeatAndLineNum>, gene_feat_and_line_ci, matchingGenes) {
            const CSeq_loc & gene_loc = gene_feat_and_line_ci->m_pFeat->GetLocation();
            const TSeqPos gene_line_num = gene_feat_and_line_ci->m_uLineNum;

            if ((flags & CFeature_table_reader::fCDSsMustBeInTheirGenes) != 0) {

                // CDS's loc minus gene's loc should be an empty location
                // because the CDS should be entirely on the gene
                CRef<CSeq_loc> pCdsMinusGeneLoc = cds_loc.Subtract(
                    gene_loc, CSeq_loc::fSortAndMerge_All, NULL, NULL);
                if( pCdsMinusGeneLoc &&
                    ! pCdsMinusGeneLoc->IsNull() &&
                    ! pCdsMinusGeneLoc->IsEmpty() )
                {
                    // only a positive gene line number is reported; genes
                    // created further below are registered with 0
                    ILineError::TVecOfLines gene_lines;
                    if( gene_line_num > 0 ) {
                        gene_lines.push_back(gene_line_num);
                    }
                    x_ProcessMsg(
                        cds_line_num,
                        ILineError::eProblem_FeatMustBeInXrefdGene, eDiag_Error,
                        kCdsFeatName,
                        kEmptyStr, kEmptyStr, kEmptyStr,
                        gene_lines );
                }
            }
        }

        // if requested, create genes for the CDS if there isn't already one
        // (it is NOT an error if the gene is already created)
        if ( (flags & CFeature_table_reader::fCreateGenesFromCDSs) != 0 &&
            matchingGenes.empty() )
        {
            // create the gene from the xref, with the CDS's location and
            // partialness
            CRef<CSeq_feat> pNewGene( new CSeq_feat );
            pNewGene->SetData().SetGene().Assign( *pGeneXrefOnCDS );
            if( FIELD_EQUALS(*cds_feat_ref, Partial, true) ) pNewGene->SetPartial(true);
            pNewGene->SetLocation().Assign( cds_feat_ref->GetLocation() );

            // add the gene to the annot
            _ASSERT( sap->IsFtable() );
            TFtable & the_ftable = sap->SetData().SetFtable();
            the_ftable.push_back(pNewGene);

            // add it to our local information for later CDSs
            SFeatAndLineNum gene_feat_and_line(pNewGene, 0);
            choiceToFeatMap.insert(
                TChoiceToFeatMap::value_type(
                pNewGene->GetData().Which(), gene_feat_and_line ) );
            if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(*pGeneXrefOnCDS, Locus) ) {
                locusToGeneAndLineMap.insert(
                    TStringToGeneAndLineMap::value_type(
                    pGeneXrefOnCDS->GetLocus(), gene_feat_and_line));
            }
            if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(*pGeneXrefOnCDS, Locus_tag) ) {
                locusTagToGeneAndLineMap.insert(
                    TStringToGeneAndLineMap::value_type(
                    pGeneXrefOnCDS->GetLocus_tag(), gene_feat_and_line));
            }
        }
    } // end of iteration through the CDS's
}
2276
2277 static const string s_QualsWithCaps[] = {
2278 "EC_number",
2279 "PCR_conditions",
2280 "PubMed",
2281 "STS",
2282 "ncRNA_class"
2283 };
2284
2285 static const int s_NumQualsWithCaps = sizeof (s_QualsWithCaps) / sizeof (string);
2286
s_FixQualCapitalization(const string & qual)2287 static string s_FixQualCapitalization (const string& qual)
2288 {
2289 string lqual = qual;
2290 lqual = NStr::ToLower(lqual);
2291 for (int j = 0; j < s_NumQualsWithCaps; j++) {
2292 if (NStr::EqualNocase(lqual, s_QualsWithCaps[j])) {
2293 lqual = s_QualsWithCaps[j];
2294 break;
2295 }
2296 }
2297 return lqual;
2298 }
2299
2300
x_AddNoteToFeature(CRef<CSeq_feat> sfp,const string & note)2301 bool CFeatureTableReader_Imp::x_AddNoteToFeature(
2302 CRef<CSeq_feat> sfp,
2303 const string& note)
2304 {
2305 if (sfp.IsNull()) {
2306 return false;
2307 }
2308
2309 if (NStr::IsBlank(note)) { // Nothing to do
2310 return true;
2311 }
2312
2313 string comment = (sfp->CanGetComment()) ?
2314 sfp->GetComment() + "; " + note :
2315 note;
2316 sfp->SetComment(comment);
2317 return true;
2318 }
2319
2320
x_AddNoteToFeature(CRef<CSeq_feat> sfp,const string & feat_name,const string & qual,const string & val)2321 bool CFeatureTableReader_Imp::x_AddNoteToFeature(
2322 CRef<CSeq_feat> sfp,
2323 const string& feat_name,
2324 const string& qual,
2325 const string& val) {
2326
2327 if (!x_AddNoteToFeature(sfp, val)) {
2328 return false;
2329 }
2330 // Else convert qualifier to note and issue warning
2331 if (qual != "note") {
2332 string error_message =
2333 qual + " is not a valid qualifier for this feature. Converting to note.";
2334 x_ProcessMsg(
2335 ILineError::eProblem_InvalidQualifier, eDiag_Warning,
2336 feat_name, qual, kEmptyStr, error_message);
2337 }
2338 return true;
2339 }
2340
x_AddQualifierToFeature(CRef<CSeq_feat> sfp,const string & feat_name,const string & qual,const string & val,const TFlags flags)2341 bool CFeatureTableReader_Imp::x_AddQualifierToFeature (
2342 CRef<CSeq_feat> sfp,
2343 const string &feat_name,
2344 const string& qual,
2345 const string& val,
2346 const TFlags flags
2347 )
2348
2349 {
2350 CSeqFeatData& sfdata = sfp->SetData ();
2351 CSeqFeatData::E_Choice featType = sfdata.Which ();
2352
2353 const CSeqFeatData::EQualifier qual_type =
2354 CSeqFeatData::GetQualifierType(qual);
2355 if( (flags & CFeature_table_reader::fReportDiscouragedKey) != 0 ) {
2356 if( CSeqFeatData::IsDiscouragedQual(qual_type) ) {
2357 x_ProcessMsg(
2358 ILineError::eProblem_DiscouragedQualifierName,
2359 eDiag_Warning, feat_name, qual);
2360 }
2361 }
2362
2363 if (featType == CSeqFeatData::e_Biosrc) {
2364
2365 TOrgRefMap::const_iterator o_iter = sm_OrgRefKeys.find (qual.c_str ());
2366 if (o_iter != sm_OrgRefKeys.end ()) {
2367 EOrgRef rtype = o_iter->second;
2368 if (x_AddQualifierToBioSrc (sfdata, feat_name, rtype, val)) return true;
2369 } else {
2370
2371 TSubSrcMap::const_iterator s_iter = sm_SubSrcKeys.find (qual.c_str ());
2372 if (s_iter != sm_SubSrcKeys.end ()) {
2373
2374 CSubSource::ESubtype stype = s_iter->second;
2375 if (x_AddQualifierToBioSrc (sfdata, stype, val)) return true;
2376
2377 } else {
2378
2379 TOrgModMap::const_iterator m_iter = sm_OrgModKeys.find (qual.c_str ());
2380 if (m_iter != sm_OrgModKeys.end ()) {
2381
2382 COrgMod::ESubtype mtype = m_iter->second;
2383 if (x_AddQualifierToBioSrc (sfdata, mtype, val)) return true;
2384 }
2385 }
2386 }
2387 return false;
2388 }
2389
2390
2391 // else type != CSeqFeatData::e_Biosrc
2392 string lqual = s_FixQualCapitalization(qual);
2393 TQualMap::const_iterator q_iter = sm_QualKeys.find (lqual.c_str ());
2394 if (q_iter != sm_QualKeys.end ()) {
2395 EQual qtype = q_iter->second;
2396 switch (featType) {
2397 case CSeqFeatData::e_Gene:
2398 if (x_AddQualifierToGene (sfdata, qtype, val)) return true;
2399 break;
2400 case CSeqFeatData::e_Cdregion:
2401 if (x_AddQualifierToCdregion (sfp, sfdata, qtype, val)) return true;
2402 break;
2403 case CSeqFeatData::e_Rna:
2404 if (x_AddQualifierToRna (sfp, qtype, val)) return true;
2405 break;
2406 case CSeqFeatData::e_Imp:
2407 if (x_AddQualifierToImp (sfp, sfdata, qtype, qual, val)) return true;
2408 break;
2409 case CSeqFeatData::e_Region:
2410 if (qtype == eQual_region_name) {
2411 sfdata.SetRegion (val);
2412 return true;
2413 }
2414 break;
2415 case CSeqFeatData::e_Bond:
2416 if (qtype == eQual_bond_type) {
2417 CSeqFeatData::EBond btyp = CSeqFeatData::eBond_other;
2418 if (CSeqFeatData::GetBondList()->IsBondName(val.c_str(), btyp)) {
2419 sfdata.SetBond (btyp);
2420 return true;
2421 }
2422 }
2423 break;
2424 case CSeqFeatData::e_Site:
2425 if (qtype == eQual_site_type) {
2426 CSeqFeatData::ESite styp = CSeqFeatData::eSite_other;
2427 if (CSeqFeatData::GetSiteList()->IsSiteName( val.c_str(), styp)) {
2428 sfdata.SetSite (styp);
2429 return true;
2430 }
2431 }
2432 break;
2433 case CSeqFeatData::e_Pub:
2434 if( qtype == eQual_PubMed ) {
2435 CRef<CPub> new_pub( new CPub );
2436 new_pub->SetPmid( CPubMedId( ENTREZ_ID_FROM(long, x_StringToLongNoThrow(val, feat_name, qual)) ) );
2437 sfdata.SetPub().SetPub().Set().push_back( new_pub );
2438 return true;
2439 }
2440 break;
2441 case CSeqFeatData::e_Prot:
2442 switch( qtype ) {
2443 case eQual_product:
2444 sfdata.SetProt().SetName().push_back( val );
2445 return true;
2446 case eQual_function:
2447 sfdata.SetProt().SetActivity().push_back( val );
2448 return true;
2449 case eQual_EC_number:
2450 sfdata.SetProt().SetEc().push_back( val );
2451 return true;
2452 default:
2453 break;
2454 }
2455 break;
2456 default:
2457 break;
2458 }
2459
2460 switch (qtype) {
2461 case eQual_pseudo:
2462 sfp->SetPseudo (true);
2463 return true;
2464 case eQual_partial:
2465 sfp->SetPartial (true);
2466 return true;
2467 case eQual_exception:
2468 sfp->SetExcept (true);
2469 sfp->SetExcept_text (val);
2470 return true;
2471 case eQual_ribosomal_slippage:
2472 sfp->SetExcept (true);
2473 sfp->SetExcept_text (qual);
2474 return true;
2475 case eQual_trans_splicing:
2476 sfp->SetExcept (true);
2477 sfp->SetExcept_text (qual);
2478 return true;
2479 case eQual_evidence:
2480 if (val == "experimental") {
2481 sfp->SetExp_ev (CSeq_feat::eExp_ev_experimental);
2482 } else if (val == "not_experimental" || val == "non_experimental" ||
2483 val == "not-experimental" || val == "non-experimental") {
2484 sfp->SetExp_ev (CSeq_feat::eExp_ev_not_experimental);
2485 }
2486 return true;
2487 case eQual_note:
2488 return x_AddNoteToFeature(sfp, val);
2489 case eQual_inference:
2490 {
2491 string prefix, remainder;
2492 CInferencePrefixList::GetPrefixAndRemainder(val, prefix, remainder);
2493 if (!NStr::IsBlank(prefix)) {
2494 x_AddGBQualToFeature(sfp, qual, val);
2495 }
2496 else {
2497 x_ProcessMsg(
2498 ILineError::eProblem_QualifierBadValue, eDiag_Error,
2499 feat_name, qual, val);
2500 }
2501 return true;
2502 }
2503 case eQual_replace:
2504 {
2505 string val_copy = val;
2506 NStr::ToLower( val_copy );
2507 x_AddGBQualToFeature (sfp, qual, val_copy );
2508 return true;
2509 }
2510 case eQual_allele:
2511 case eQual_bound_moiety:
2512 case eQual_clone:
2513 case eQual_compare:
2514 case eQual_cons_splice:
2515 case eQual_direction:
2516 case eQual_EC_number:
2517 case eQual_estimated_length:
2518 case eQual_experiment:
2519 case eQual_frequency:
2520 case eQual_function:
2521 case eQual_gap_type:
2522 case eQual_insertion_seq:
2523 case eQual_label:
2524 case eQual_linkage_evidence:
2525 case eQual_map:
2526 case eQual_ncRNA_class:
2527 case eQual_number:
2528 case eQual_old_locus_tag:
2529 case eQual_operon:
2530 case eQual_organism:
2531 case eQual_PCR_conditions:
2532 case eQual_phenotype:
2533 case eQual_product:
2534 case eQual_pseudogene:
2535 case eQual_satellite:
2536 case eQual_rpt_family:
2537 case eQual_rpt_type:
2538 case eQual_rpt_unit:
2539 case eQual_rpt_unit_range:
2540 case eQual_rpt_unit_seq:
2541 case eQual_standard_name:
2542 case eQual_tag_peptide:
2543 case eQual_transposon:
2544 case eQual_usedin:
2545 case eQual_cyt_map:
2546 case eQual_gen_map:
2547 case eQual_rad_map:
2548 case eQual_mobile_element_type:
2549 {
2550 x_AddGBQualToFeature (sfp, qual, val);
2551 return true;
2552 }
2553 case eQual_gene:
2554 {
2555 if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2556 CGene_ref& grp = sfp->SetGeneXref ();
2557 if (val != "-") {
2558 grp.SetLocus (val);
2559 }
2560 return true;
2561 }
2562 // else:
2563 return x_AddNoteToFeature(sfp, feat_name, qual, val);
2564 }
2565 case eQual_gene_desc:
2566 {
2567 if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2568 CGene_ref& grp = sfp->SetGeneXref ();
2569 grp.SetDesc (val);
2570 return true;
2571 }
2572 // else:
2573 return x_AddNoteToFeature(sfp, feat_name, qual, val);
2574 }
2575 case eQual_gene_syn:
2576 {
2577 if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2578 CGene_ref& grp = sfp->SetGeneXref ();
2579 CGene_ref::TSyn& syn = grp.SetSyn ();
2580 syn.push_back (val);
2581 return true;
2582 }
2583 // else:
2584 return x_AddNoteToFeature(sfp, feat_name, qual, val);
2585 }
2586 case eQual_locus_tag:
2587 {
2588 if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2589 CGene_ref& grp = sfp->SetGeneXref ();
2590 grp.SetLocus_tag (val);
2591 return true;
2592 }
2593 // else:
2594 return x_AddNoteToFeature(sfp, feat_name, qual, val);
2595 }
2596 case eQual_db_xref:
2597 {
2598 CTempString db, tag;
2599 if (NStr::SplitInTwo (val, ":", db, tag)) {
2600 CSeq_feat::TDbxref& dblist = sfp->SetDbxref ();
2601 CRef<CDbtag> dbt (new CDbtag);
2602 dbt->SetDb (db);
2603 CRef<CObject_id> oid (new CObject_id);
2604 static const char* digits = "0123456789";
2605 if (tag.find_first_not_of(digits) == string::npos && !NStr::IsBlank(tag))
2606 oid->SetId(NStr::StringToLong(tag));
2607 else
2608 oid->SetStr(tag);
2609 dbt->SetTag (*oid);
2610 dblist.push_back (dbt);
2611 return true;
2612 }
2613 return true;
2614 }
2615 case eQual_nomenclature:
2616 {
2617 /* !!! need to implement !!! */
2618 return true;
2619 }
2620 case eQual_go_component:
2621 case eQual_go_function:
2622 case eQual_go_process:
2623 if (featType == CSeqFeatData::e_Gene ||
2624 featType == CSeqFeatData::e_Cdregion ||
2625 featType == CSeqFeatData::e_Rna) {
2626 try {
2627 CReadUtil::AddGeneOntologyTerm(*sfp, qual, val);
2628 }
2629 catch( ILineError& err) {
2630 x_ProcessMsg(
2631 err.Problem(),
2632 err.Severity(),
2633 feat_name, qual, val,
2634 err.ErrorMessage());
2635 }
2636 //rw-621: throw out the faulty qualifier but retain the rest of the feature.
2637 return true;
2638 }
2639 return false;
2640 case eQual_transcript_id:
2641 {
2642 if (featType == CSeqFeatData::e_Rna &&
2643 sfdata.GetRna().GetType() == CRNA_ref::eType_mRNA) {
2644 CBioseq::TId ids;
2645 try {
2646 CSeq_id::ParseIDs(ids, val,
2647 CSeq_id::fParse_ValidLocal
2648 | CSeq_id::fParse_PartialOK);
2649 }
2650 catch (CSeqIdException&)
2651 {
2652 x_ProcessMsg(
2653 ILineError::eProblem_QualifierBadValue, eDiag_Error,
2654 feat_name, qual, val,
2655 "Invalid transcript_id : " + val);
2656 return true;
2657 }
2658
2659 for (const auto& id : ids) {
2660 auto id_string = id->GetSeqIdString(true);
2661 auto res = m_ProcessedTranscriptIds.insert(id_string);
2662 if (res.second == false) { // Insertion failed because Seq-id already encountered
2663 x_ProcessMsg(
2664 ILineError::eProblem_DuplicateIDs, eDiag_Error,
2665 feat_name, qual, val,
2666 "Transcript ID " + id_string + " appears on multiple mRNA features"
2667 );
2668 }
2669 }
2670 }
2671 x_AddGBQualToFeature(sfp, qual, val);
2672 return true;
2673 }
2674 case eQual_protein_id:
2675 // see SQD-1535 and SQD-3496
2676 if (featType == CSeqFeatData::e_Cdregion ||
2677 (featType == CSeqFeatData::e_Rna &&
2678 sfdata.GetRna().GetType() == CRNA_ref::eType_mRNA) ||
2679 (featType == CSeqFeatData::e_Prot &&
2680 sfdata.GetProt().IsSetProcessed() &&
2681 sfdata.GetProt().GetProcessed() == CProt_ref::eProcessed_mature))
2682 {
2683 CBioseq::TId ids;
2684 try {
2685 CSeq_id::ParseIDs(ids, val,
2686 CSeq_id::fParse_ValidLocal |
2687 CSeq_id::fParse_PartialOK);
2688 }
2689 catch (CSeqIdException&)
2690 {
2691 x_ProcessMsg(
2692 ILineError::eProblem_QualifierBadValue, eDiag_Error,
2693 feat_name, qual, val,
2694 "Invalid protein_id : " + val);
2695 return true;
2696 }
2697
2698 if (featType == CSeqFeatData::e_Cdregion) {
2699 for (const auto& id : ids) {
2700 auto id_string = id->GetSeqIdString(true);
2701 auto res = m_ProcessedProteinIds.insert(id_string);
2702 if (res.second == false) { // Insertion failed because Seq-id already encountered
2703 x_ProcessMsg(
2704 ILineError::eProblem_DuplicateIDs, eDiag_Error,
2705 feat_name, qual, val,
2706 "Protein ID " + id_string + " appears on multiple CDS features"
2707 );
2708 }
2709 }
2710 }
2711
2712 if (featType != CSeqFeatData::e_Rna) { // mRNA only has a protein_id qualifier
2713 auto pBestId = GetBestId(ids);
2714 if (pBestId) {
2715 sfp->SetProduct().SetWhole(*pBestId);
2716 }
2717 }
2718 }
2719
2720 if (featType != CSeqFeatData::e_Prot) { // Mat-peptide has an instantiated product, but no qualifier
2721 x_AddGBQualToFeature(sfp, qual, val);
2722 }
2723 return true;
2724 case eQual_regulatory_class:
2725 // This should've been handled up in x_AddQualifierToImp
2726 // so it's always a bad value to be here
2727 x_ProcessMsg(
2728 ILineError::eProblem_QualifierBadValue, eDiag_Error,
2729 feat_name, qual, val );
2730 return true;
2731 default:
2732 break;
2733 }
2734 }
2735 return false;
2736 }
2737
x_IsWebComment(CTempString line)2738 bool CFeatureTableReader_Imp::x_IsWebComment(CTempString line)
2739 {
2740 // This function is testing for a match against the following regular
2741 // expression, but we avoid actual regexps for max speed:
2742 // "^(===================================================================| INFO:| WARNING:| ERROR:).*"
2743
2744 // (that magic number is the size of the smallest possible match)
2745 if( line.length() < 6 ) {
2746 return false;
2747 }
2748
2749 if( line[0] == '=' ) {
2750 static const CTempString kAllEqualsMatch =
2751 "===================================================================";
2752 if( NStr::StartsWith(line, kAllEqualsMatch) ) {
2753 return true;
2754 }
2755 } else if( line[0] == ' ') {
2756 switch(line[1]) {
2757 case 'I':
2758 {
2759 static const CTempString kInfo = " INFO:";
2760 if( NStr::StartsWith(line, kInfo) ) {
2761 return true;
2762 }
2763 }
2764 break;
2765 case 'W':
2766 {
2767 static const CTempString kWarning = " WARNING:";
2768 if( NStr::StartsWith(line, kWarning) ) {
2769 return true;
2770 }
2771 }
2772 break;
2773 case 'E':
2774 {
2775 static const CTempString kError = " ERROR:";
2776 if( NStr::StartsWith(line, kError) ) {
2777 return true;
2778 }
2779 }
2780 break;
2781 default:
2782 // no match
2783 break;
2784 }
2785 }
2786
2787 // no match
2788 return false;
2789 }
2790
x_AddIntervalToFeature(CTempString strFeatureName,CRef<CSeq_feat> & sfp,const SFeatLocInfo & loc_info)2791 bool CFeatureTableReader_Imp::x_AddIntervalToFeature(
2792 CTempString strFeatureName,
2793 CRef<CSeq_feat>& sfp,
2794 const SFeatLocInfo& loc_info
2795 )
2796
2797 {
2798
2799 auto start = loc_info.start_pos;
2800 auto stop = loc_info.stop_pos;
2801
2802 const Int4 orig_start = start;
2803 CSeq_interval::TStrand strand = eNa_strand_plus;
2804
2805 if (start > stop) {
2806 swap(start, stop);
2807 strand = eNa_strand_minus;
2808 }
2809 if (loc_info.is_minus_strand) {
2810 strand = eNa_strand_minus;
2811 }
2812
2813 // construct loc, which will be added to the mix
2814 CSeq_loc_mix::Tdata & mix_set = sfp->SetLocation().SetMix();
2815 CRef<CSeq_loc> loc(new CSeq_loc);
2816 if (loc_info.is_point || start == stop ) {
2817 // a point of some kind
2818 if (mix_set.empty())
2819 m_need_check_strand = true;
2820 else
2821 x_GetPointStrand(*sfp, strand);
2822
2823 // note usage of orig_start instead of start
2824 // because we want the first part of the point
2825 // specified in the file, not the smallest because SetRightOf
2826 // works differently for plus vs. minus strand
2827 CRef<CSeq_point> pPoint(
2828 new CSeq_point(*m_seq_id, orig_start, strand) );
2829 if( loc_info.is_point ) {
2830 // between two bases
2831 pPoint->SetRightOf (true);
2832 // warning if stop is not start plus one
2833 if( stop != (start+1) ) {
2834 x_ProcessMsg(
2835 ILineError::eProblem_BadFeatureInterval, eDiag_Warning,
2836 strFeatureName );
2837 }
2838 } else {
2839 // just a point. do nothing
2840 }
2841
2842 if (loc_info.is_5p_partial) {
2843 pPoint->SetPartialStart (true, eExtreme_Biological);
2844 }
2845 if (loc_info.is_3p_partial) {
2846 pPoint->SetPartialStop (true, eExtreme_Biological);
2847 }
2848
2849 loc->SetPnt( *pPoint );
2850 } else {
2851 // interval
2852 CRef<CSeq_interval> pIval( new CSeq_interval(*m_seq_id, start, stop, strand) );
2853 if (loc_info.is_5p_partial) {
2854 pIval->SetPartialStart (true, eExtreme_Biological);
2855 }
2856 if (loc_info.is_3p_partial) {
2857 pIval->SetPartialStop (true, eExtreme_Biological);
2858 }
2859 loc->SetInt(*pIval);
2860 if (m_need_check_strand)
2861 {
2862 x_UpdatePointStrand(*sfp, strand);
2863 m_need_check_strand = false;
2864 }
2865 }
2866
2867 // check for internal partials
2868 if( ! mix_set.empty() ) {
2869 const CSeq_loc & last_loc = *mix_set.back();
2870 if( last_loc.IsPartialStop(eExtreme_Biological) ||
2871 loc->IsPartialStart(eExtreme_Biological) )
2872 {
2873 // internal partials
2874 x_ProcessMsg(ILineError::eProblem_InternalPartialsInFeatLocation,
2875 eDiag_Warning, strFeatureName );
2876 }
2877 }
2878
2879 mix_set.push_back(loc);
2880
2881
2882 if (loc_info.is_5p_partial || loc_info.is_3p_partial) {
2883 sfp->SetPartial (true);
2884 }
2885
2886 return true;
2887 }
2888
2889
2890
x_SetupSeqFeat(CRef<CSeq_feat> sfp,const string & feat,const TFlags flags,ITableFilter * filter)2891 bool CFeatureTableReader_Imp::x_SetupSeqFeat (
2892 CRef<CSeq_feat> sfp,
2893 const string& feat,
2894 const TFlags flags,
2895 ITableFilter *filter
2896 )
2897
2898 {
2899 if (feat.empty ()) return false;
2900
2901 // check filter, if any
2902 if( NULL != filter ) {
2903 ITableFilter::EAction action = filter->GetFeatAction(feat);
2904 if( action != ITableFilter::eAction_Okay ) {
2905 x_ProcessMsg(
2906 ILineError::eProblem_FeatureNameNotAllowed,
2907 eDiag_Warning, feat );
2908 if( action == ITableFilter::eAction_Disallowed ) {
2909 return false;
2910 }
2911 }
2912 }
2913
2914 CSeqFeatData::ESubtype sbtyp = CSeqFeatData::SubtypeNameToValue(feat);
2915 if (sbtyp != CSeqFeatData::eSubtype_bad) {
2916
2917 // populate *sfp here...
2918
2919 CSeqFeatData::E_Choice typ = CSeqFeatData::GetTypeFromSubtype (sbtyp);
2920 sfp->SetData ().Select (typ);
2921 CSeqFeatData& sfdata = sfp->SetData ();
2922
2923 if (typ == CSeqFeatData::e_Rna) {
2924 CRNA_ref& rrp = sfdata.SetRna ();
2925 CRNA_ref::EType rnatyp = CRNA_ref::eType_unknown;
2926 switch (sbtyp) {
2927 case CSeqFeatData::eSubtype_preRNA :
2928 rnatyp = CRNA_ref::eType_premsg;
2929 break;
2930 case CSeqFeatData::eSubtype_mRNA :
2931 rnatyp = CRNA_ref::eType_mRNA;
2932 break;
2933 case CSeqFeatData::eSubtype_tRNA :
2934 rnatyp = CRNA_ref::eType_tRNA;
2935 break;
2936 case CSeqFeatData::eSubtype_rRNA :
2937 rnatyp = CRNA_ref::eType_rRNA;
2938 break;
2939 case CSeqFeatData::eSubtype_snRNA :
2940 rnatyp = CRNA_ref::eType_ncRNA;
2941 rrp.SetExt().SetGen().SetClass("snRNA");
2942 break;
2943 case CSeqFeatData::eSubtype_scRNA :
2944 rnatyp = CRNA_ref::eType_ncRNA;
2945 rrp.SetExt().SetGen().SetClass("scRNA");
2946 break;
2947 case CSeqFeatData::eSubtype_snoRNA :
2948 rnatyp = CRNA_ref::eType_ncRNA;
2949 rrp.SetExt().SetGen().SetClass("snoRNA");
2950 break;
2951 case CSeqFeatData::eSubtype_ncRNA :
2952 rnatyp = CRNA_ref::eType_ncRNA;
2953 rrp.SetExt().SetGen();
2954 break;
2955 case CSeqFeatData::eSubtype_tmRNA :
2956 rnatyp = CRNA_ref::eType_tmRNA;
2957 rrp.SetExt().SetGen();
2958 break;
2959 case CSeqFeatData::eSubtype_otherRNA :
2960 rrp.SetExt().SetName("misc_RNA");
2961 rnatyp = CRNA_ref::eType_other;
2962 break;
2963 default :
2964 break;
2965 }
2966 rrp.SetType (rnatyp);
2967
2968 } else if (typ == CSeqFeatData::e_Imp) {
2969 CImp_feat_Base& imp = sfdata.SetImp ();
2970 imp.SetKey (feat);
2971
2972 } else if (typ == CSeqFeatData::e_Bond) {
2973 sfdata.SetBond (CSeqFeatData::eBond_other);
2974
2975 } else if (typ == CSeqFeatData::e_Site) {
2976 sfdata.SetSite (CSeqFeatData::eSite_other);
2977 } else if (typ == CSeqFeatData::e_Prot ) {
2978 CProt_ref &prot_ref = sfdata.SetProt();
2979 switch (sbtyp) {
2980 default:
2981 break;
2982 case CSeqFeatData::eSubtype_mat_peptide_aa:
2983 prot_ref.SetProcessed(CProt_ref::eProcessed_mature);
2984 break;
2985 case CSeqFeatData::eSubtype_sig_peptide_aa:
2986 prot_ref.SetProcessed(CProt_ref::eProcessed_signal_peptide);
2987 break;
2988 case CSeqFeatData::eSubtype_preprotein:
2989 prot_ref.SetProcessed(CProt_ref::eProcessed_preprotein);
2990 break;
2991 case CSeqFeatData::eSubtype_transit_peptide_aa:
2992 prot_ref.SetProcessed(CProt_ref::eProcessed_transit_peptide);
2993 break;
2994 case CSeqFeatData::eSubtype_propeptide_aa:
2995 prot_ref.SetProcessed(CProt_ref::eProcessed_propeptide);
2996 break;
2997 }
2998 }
2999
3000 // check for discouraged feature name
3001 if( (flags & CFeature_table_reader::fReportDiscouragedKey) != 0 ) {
3002 if( CSeqFeatData::IsDiscouragedSubtype(sbtyp) ) {
3003 x_ProcessMsg(
3004 ILineError::eProblem_DiscouragedFeatureName,
3005 eDiag_Warning, feat);
3006 }
3007 }
3008
3009 return true;
3010 }
3011
3012 // unrecognized feature key
3013
3014 if ((flags & CFeature_table_reader::fReportBadKey) != 0) {
3015 x_ProcessMsg(ILineError::eProblem_UnrecognizedFeatureName, eDiag_Warning, feat );
3016 }
3017
3018 if ((flags & CFeature_table_reader::fTranslateBadKey) != 0) {
3019
3020 sfp->SetData ().Select (CSeqFeatData::e_Imp);
3021 CSeqFeatData& sfdata = sfp->SetData ();
3022 CImp_feat_Base& imp = sfdata.SetImp ();
3023 imp.SetKey ("misc_feature");
3024 x_AddQualifierToFeature (sfp, kEmptyStr, "standard_name", feat, flags);
3025
3026 return true;
3027
3028 } else if ((flags & CFeature_table_reader::fKeepBadKey) != 0) {
3029
3030 sfp->SetData ().Select (CSeqFeatData::e_Imp);
3031 CSeqFeatData& sfdata = sfp->SetData ();
3032 CImp_feat_Base& imp = sfdata.SetImp ();
3033 imp.SetKey (feat);
3034
3035 return true;
3036 }
3037
3038 return false;
3039 }
3040
x_ProcessMsg(ILineError::EProblem eProblem,EDiagSev eSeverity,const string & strFeatureName,const string & strQualifierName,const string & strQualifierValue,const string & strErrorMessage,const ILineError::TVecOfLines & vecOfOtherLines)3041 void CFeatureTableReader_Imp::x_ProcessMsg(
3042 ILineError::EProblem eProblem,
3043 EDiagSev eSeverity,
3044 const string& strFeatureName,
3045 const string& strQualifierName,
3046 const string& strQualifierValue,
3047 const string& strErrorMessage,
3048 const ILineError::TVecOfLines & vecOfOtherLines)
3049 {
3050 x_ProcessMsg(m_reader ? m_reader->GetLineNumber() : m_LineNumber,
3051 eProblem,
3052 eSeverity,
3053 strFeatureName,
3054 strQualifierName,
3055 strQualifierValue,
3056 strErrorMessage,
3057 vecOfOtherLines);
3058 }
3059
3060
x_ProcessMsg(int line_num,ILineError::EProblem eProblem,EDiagSev eSeverity,const string & strFeatureName,const string & strQualifierName,const string & strQualifierValue,const string & strErrorMessage,const ILineError::TVecOfLines & vecOfOtherLines)3061 void CFeatureTableReader_Imp::x_ProcessMsg(
3062 int line_num,
3063 ILineError::EProblem eProblem,
3064 EDiagSev eSeverity,
3065 const string & strFeatureName,
3066 const string & strQualifierName,
3067 const string & strQualifierValue,
3068 const string& strErrorMessage,
3069 const ILineError::TVecOfLines & vecOfOtherLines )
3070 {
3071
3072 if (!m_pMessageListener) {
3073 return;
3074 }
3075
3076 AutoPtr<CObjReaderLineException> pErr (
3077 CObjReaderLineException::Create(
3078 eSeverity, line_num, strErrorMessage, eProblem, m_real_seqid, strFeatureName,
3079 strQualifierName, strQualifierValue));
3080 ITERATE( ILineError::TVecOfLines, line_it, vecOfOtherLines ) {
3081 pErr->AddOtherLine(*line_it);
3082 }
3083
3084 if (!m_pMessageListener->PutError(*pErr)) {
3085 pErr->Throw();
3086 }
3087 }
3088
3089
PutProgress(const CTempString & seq_id,const unsigned int line_number,ILineErrorListener * pListener)3090 void CFeatureTableReader_Imp::PutProgress(
3091 const CTempString& seq_id,
3092 const unsigned int line_number,
3093 ILineErrorListener* pListener)
3094 {
3095 if (!pListener) {
3096 return;
3097 }
3098
3099 string msg = "Seq-id " + seq_id + ", line " + NStr::IntToString(line_number);
3100 pListener->PutProgress(msg);
3101 }
3102
3103
3104 // helper for CFeatureTableReader_Imp::ReadSequinFeatureTable,
3105 // just so we don't forget a step when we reset the feature
3106 //
x_ResetFeat(CRef<CSeq_feat> & sfp,bool & curr_feat_intervals_done)3107 void CFeatureTableReader_Imp::x_ResetFeat(CRef<CSeq_feat> & sfp, bool & curr_feat_intervals_done)
3108 {
3109 m_need_check_strand = false;
3110 sfp.Reset(new CSeq_feat);
3111 //sfp->ResetLocation();
3112 curr_feat_intervals_done = false;
3113 }
3114
x_GetPointStrand(const CSeq_feat & feat,CSeq_interval::TStrand & strand) const3115 void CFeatureTableReader_Imp::x_GetPointStrand(const CSeq_feat& feat, CSeq_interval::TStrand& strand) const
3116 {
3117 if (feat.IsSetLocation() && feat.GetLocation().IsMix())
3118 {
3119 const CSeq_loc& last = *feat.GetLocation().GetMix().Get().back();
3120 if (last.IsInt() && last.GetInt().IsSetStrand())
3121 {
3122 strand = last.GetInt().GetStrand();
3123 }
3124 else
3125 if (last.IsPnt() && last.GetPnt().IsSetStrand())
3126 {
3127 strand = last.GetPnt().GetStrand();
3128 }
3129 }
3130 }
3131
x_UpdatePointStrand(CSeq_feat & feat,CSeq_interval::TStrand strand) const3132 void CFeatureTableReader_Imp::x_UpdatePointStrand(CSeq_feat& feat, CSeq_interval::TStrand strand) const
3133 {
3134 if (feat.IsSetLocation() && feat.GetLocation().IsMix())
3135 {
3136
3137 for (auto pSeqLoc : feat.SetLocation().SetMix().Set()) {
3138 if (pSeqLoc->IsPnt()) {
3139 auto& seq_point = pSeqLoc->SetPnt();
3140 const auto old_strand =
3141 seq_point.IsSetStrand() ?
3142 seq_point.GetStrand() :
3143 eNa_strand_plus;
3144
3145 seq_point.SetStrand(strand);
3146 if (old_strand != strand) {
3147 const bool is_5p_partial = seq_point.IsPartialStop(eExtreme_Biological);
3148 const bool is_3p_partial = seq_point.IsPartialStart(eExtreme_Biological);
3149 seq_point.SetPartialStart(is_5p_partial, eExtreme_Biological);
3150 seq_point.SetPartialStop(is_3p_partial, eExtreme_Biological);
3151 }
3152 }
3153 }
3154 }
3155 }
3156
3157
x_FinishFeature(CRef<CSeq_feat> & feat,TFtable & ftable)3158 void CFeatureTableReader_Imp::x_FinishFeature(CRef<CSeq_feat>& feat,
3159 TFtable& ftable)
3160 {
3161 if ( !feat ||
3162 feat.Empty() ||
3163 !feat->IsSetData() ||
3164 (feat->GetData().Which() == CSeqFeatData::e_not_set) )
3165 {
3166 return;
3167 }
3168
3169 // Check for missing publication - RW-626
3170 if (feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_pub &&
3171 (!feat->SetData().SetPub().IsSetPub() ||
3172 feat->SetData().SetPub().GetPub().Get().empty())) {
3173 const int line_number = m_reader->AtEOF() ?
3174 m_reader->GetLineNumber() :
3175 m_reader->GetLineNumber()-1;
3176
3177 string msg = "Reference feature is empty. Skipping feature.";
3178
3179 x_ProcessMsg(line_number,
3180 ILineError::eProblem_IncompleteFeature,
3181 eDiag_Warning,
3182 "Reference",
3183 kEmptyStr,
3184 kEmptyStr,
3185 msg);
3186 return;
3187 }
3188
3189 if (feat->IsSetLocation() && feat->GetLocation().IsMix())
3190 {
3191 if (feat->GetLocation().GetMix().Get().empty()) {
3192 // turn empty seqlocmix into a null seq-loc
3193 feat->SetLocation().SetNull();
3194 }
3195 else
3196 if (feat->GetLocation().GetMix().Get().size() == 1) {
3197 // demote 1-part seqlocmixes to seq-loc with just that part
3198 CRef<CSeq_loc> keep_loc = *feat->SetLocation().SetMix().Set().begin();
3199 feat->SetLocation(*keep_loc);
3200 }
3201 }
3202 ftable.push_back(feat);
3203 }
3204
3205
3206
x_ProcessQualifier(const string & qual_name,const string & qual_val,const string & feat_name,CRef<CSeq_feat> feat,TFlags flags)3207 void CFeatureTableReader_Imp::x_ProcessQualifier(const string& qual_name,
3208 const string& qual_val,
3209 const string& feat_name,
3210 CRef<CSeq_feat> feat,
3211 TFlags flags)
3212 {
3213 if (NStr::IsBlank(qual_name)) {
3214 return;
3215 }
3216
3217 if (!feat) {
3218 if ( flags & CFeature_table_reader::fReportBadKey ) {
3219 x_ProcessMsg(ILineError::eProblem_QualifierWithoutFeature,
3220 eDiag_Warning, kEmptyStr, qual_name, qual_val);
3221 }
3222 return;
3223 }
3224
3225 if (NStr::IsBlank(qual_val)) {
3226 if (sc_SingleKeys.find(qual_name.c_str()) != sc_SingleKeys.end()) {
3227 x_AddQualifierToFeature(feat, feat_name, qual_name, qual_val, flags);
3228 }
3229 else {
3230 x_ProcessMsg(ILineError::eProblem_QualifierBadValue,
3231 eDiag_Warning, feat_name, qual_name);
3232 }
3233 return;
3234 }
3235
3236 // else qual_name and qual_val are not blank
3237 if (!x_AddQualifierToFeature(feat, feat_name, qual_name, qual_val, flags)) {
3238 if (flags & CFeature_table_reader::fReportBadKey) {
3239 x_ProcessMsg(ILineError::eProblem_UnrecognizedQualifierName,
3240 eDiag_Warning, feat_name, qual_name, qual_val);
3241 }
3242
3243 if (flags & CFeature_table_reader::fKeepBadKey) {
3244 x_AddGBQualToFeature(feat, qual_name, qual_val);
3245 }
3246 }
3247 }
3248
3249
3250
ReadSequinFeatureTable(const CTempString & in_seqid,const CTempString & in_annotname,const TFlags flags,ITableFilter * filter)3251 CRef<CSeq_annot> CFeatureTableReader_Imp::ReadSequinFeatureTable (
3252 const CTempString& in_seqid,
3253 const CTempString& in_annotname,
3254 const TFlags flags,
3255 ITableFilter *filter
3256 )
3257 {
3258 string feat, qual, qual_value;
3259 string curr_feat_name;
3260 // Int4 start, stop;
3261 //bool partial5, partial3, ispoint, isminus,
3262
3263 bool ignore_until_next_feature_key = false;
3264 Int4 offset = 0;
3265 SFeatLocInfo loc_info;
3266
3267 CRef<CSeq_annot> sap(new CSeq_annot);
3268
3269 TFtable& ftable = sap->SetData().SetFtable();
3270 const bool bIgnoreWebComments =
3271 ( (flags & CFeature_table_reader::fIgnoreWebComments) != 0 );
3272
3273 // if sequence ID is a list, use just one sequence ID string
3274 x_InitId(in_seqid, flags);
3275
3276 // Use this to efficiently find the best CDS for a prot feature
3277 // (only add CDS's for it to work right)
3278 CBestFeatFinder best_CDS_finder;
3279
3280 // map feature types to features
3281 TChoiceToFeatMap choiceToFeatMap;
3282
3283 CRef<CSeq_feat> sfp;
3284 // This is true once this feature should not
3285 // have any more intervals.
3286 // This allows us to catch errors like the following:
3287 //
3288 //
3289 //>Feature lcl|Seq1
3290 //1 1008 CDS
3291 // gene THE_GENE_NAME
3292 //50 200
3293 // product THE_GENE_PRODUCT
3294 bool curr_feat_intervals_done = false;
3295
3296 if (! in_annotname.empty ()) {
3297 CAnnot_descr& descr = sap->SetDesc ();
3298 CRef<CAnnotdesc> annot(new CAnnotdesc);
3299 annot->SetName (in_annotname);
3300 descr.Set().push_back (annot);
3301 }
3302
3303 while ( !m_reader->AtEOF() ) {
3304
3305 CTempString line = *++(*m_reader);
3306
3307 if( m_reader->GetLineNumber() % 10000 == 0 &&
3308 m_reader->GetLineNumber() > 0 )
3309 {
3310 PutProgress(m_real_seqid, m_reader->GetLineNumber(), m_pMessageListener);
3311 }
3312
3313 // skip empty lines.
3314 // if requested, also skip webcomment lines
3315 if( line.empty () || (bIgnoreWebComments && x_IsWebComment(line) ) ) {
3316 continue;
3317 }
3318
3319 // if next line is a new feature table, return current sap
3320 CTempStringEx dummy1, dummy2;
3321 if( ParseInitialFeatureLine(line, dummy1, dummy2) ) {
3322 m_reader->UngetLine(); // we'll get this feature line the next time around
3323 break;
3324 }
3325
3326 if (line [0] == '[') {
3327
3328 // try to parse it as an offset
3329 if( x_TryToParseOffset(line, offset) ) {
3330 // okay, known command
3331 } else {
3332 // warn for unknown square-bracket commands
3333 x_ProcessMsg(
3334 ILineError::eProblem_UnrecognizedSquareBracketCommand,
3335 eDiag_Warning);
3336 }
3337
3338 } else if ( s_LineIndicatesOrder(line) ) {
3339
3340 // put nulls between feature intervals
3341 CRef<CSeq_loc> loc_with_nulls = s_LocationJoinToOrder( sfp->GetLocation() );
3342 // loc_with_nulls is unset if no change was needed
3343 if( loc_with_nulls ) {
3344 sfp->SetLocation( *loc_with_nulls );
3345 }
3346
3347 } else if (x_ParseFeatureTableLine (line, loc_info, feat, qual, qual_value, offset)) {
3348 // process line in feature table
3349
3350 replace( qual_value.begin(), qual_value.end(), '\"', '\'' );
3351
3352 if ((! feat.empty ()) && loc_info.start_pos >= 0 && loc_info.stop_pos >= 0) {
3353
3354 // process start - stop - feature line
3355
3356 x_FinishFeature(sfp, ftable);
3357 x_ResetFeat( sfp, curr_feat_intervals_done );
3358
3359 if (x_SetupSeqFeat (sfp, feat, flags, filter)) {
3360
3361 // figure out type of feat, and store in map for later use
3362 CSeqFeatData::E_Choice eChoice = CSeqFeatData::e_not_set;
3363 if( sfp->CanGetData() ) {
3364 eChoice = sfp->GetData().Which();
3365 }
3366 choiceToFeatMap.insert(
3367 TChoiceToFeatMap::value_type(
3368 eChoice,
3369 SFeatAndLineNum(sfp, m_reader->GetLineNumber())));
3370
3371 // if new feature is a CDS, remember it for later lookups
3372 if( eChoice == CSeqFeatData::e_Cdregion ) {
3373 best_CDS_finder.AddFeat( *sfp );
3374 }
3375
3376 // and add first interval
3377 x_AddIntervalToFeature (curr_feat_name, sfp, loc_info);
3378
3379 ignore_until_next_feature_key = false;
3380
3381 curr_feat_name = feat;
3382
3383 } else {
3384
3385 // bad feature, set ignore flag
3386
3387 ignore_until_next_feature_key = true;
3388 }
3389
3390 } else if (ignore_until_next_feature_key) {
3391
3392 // bad feature was found before, so ignore
3393 // qualifiers until next feature key
3394
3395 }
3396 else
3397 if (loc_info.start_pos >= 0 &&
3398 loc_info.stop_pos >= 0 &&
3399 feat.empty () &&
3400 qual.empty () &&
3401 qual_value.empty ()) {
3402
3403 if( curr_feat_intervals_done ) {
3404 // the feat intervals were done, so it's an error for there to be more intervals
3405 x_ProcessMsg(ILineError::eProblem_NoFeatureProvidedOnIntervals, eDiag_Error);
3406 // this feature is in bad shape, so we ignore the rest of it
3407 ignore_until_next_feature_key = true;
3408 x_ResetFeat(sfp, curr_feat_intervals_done);
3409 } else if (sfp && sfp->IsSetLocation() && sfp->GetLocation().IsMix()) {
3410 // process start - stop multiple interval line
3411 x_AddIntervalToFeature (curr_feat_name, sfp, loc_info);
3412 // start, stop, partial5, partial3, ispoint, isminus);
3413 } else {
3414 if ((flags & CFeature_table_reader::fReportBadKey) != 0) {
3415 x_ProcessMsg(ILineError::eProblem_NoFeatureProvidedOnIntervals,
3416 eDiag_Warning);
3417 }
3418 }
3419
3420 } else if (!NStr::IsBlank(qual)) {
3421 curr_feat_intervals_done = true;
3422 x_ProcessQualifier(qual, qual_value, curr_feat_name, sfp, flags);
3423 }
3424 else if (!feat.empty()) {
3425
3426 // unrecognized location
3427
3428 // there should no more ranges for this feature
3429 // (although there still can be ranges for quals, of course).
3430 curr_feat_intervals_done = true;
3431
3432 if ((flags & CFeature_table_reader::fReportBadKey) != 0) {
3433 x_ProcessMsg(
3434 ILineError::eProblem_FeatureBadStartAndOrStop, eDiag_Warning,
3435 feat );
3436 }
3437 }
3438 }
3439 }
3440
3441 // make sure last feature is finished
3442 x_FinishFeature(sfp, ftable);
3443 x_ResetFeat( sfp, curr_feat_intervals_done );
3444
3445 if ((flags & CFeature_table_reader::fCreateGenesFromCDSs) != 0 ||
3446 (flags & CFeature_table_reader::fCDSsMustBeInTheirGenes) != 0 )
3447 {
3448 x_CreateGenesFromCDSs(sap, choiceToFeatMap, flags);
3449 }
3450 return sap;
3451 }
3452
3453
CreateSeqFeat(const string & feat,CSeq_loc & location,const TFlags flags,const string & seq_id,ITableFilter * filter)3454 CRef<CSeq_feat> CFeatureTableReader_Imp::CreateSeqFeat (
3455 const string& feat,
3456 CSeq_loc& location,
3457 const TFlags flags,
3458 const string &seq_id,
3459 ITableFilter *filter
3460 )
3461
3462 {
3463 CRef<CSeq_feat> sfp (new CSeq_feat);
3464
3465 sfp->ResetLocation ();
3466
3467 if ( ! x_SetupSeqFeat (sfp, feat, flags, filter) ) {
3468
3469 // bad feature, make dummy
3470 sfp->SetData ().Select (CSeqFeatData::e_not_set);
3471 }
3472 sfp->SetLocation (location);
3473
3474 return sfp;
3475 }
3476
x_InitId(const CTempString & seq_id,const TFlags flags)3477 void CFeatureTableReader_Imp::x_InitId(const CTempString& seq_id, const TFlags flags)
3478 {
3479 if (!NStr::IsBlank(seq_id)) {
3480 CBioseq::TId ids;
3481 CSeq_id::ParseIDs(ids, seq_id,
3482 (flags & CFeature_table_reader::fAllIdsAsLocal) ? CSeq_id::fParse_AnyLocal : CSeq_id::fParse_Default);
3483
3484 m_seq_id.Reset();
3485 if (flags & CFeature_table_reader::fPreferGenbankId)
3486 {
3487 for (auto id : ids)
3488 {
3489 if (id->IsGenbank())
3490 m_seq_id = id;
3491 }
3492 };
3493
3494 if (m_seq_id.Empty())
3495 m_seq_id = ids.front();
3496
3497 m_real_seqid.clear();
3498 m_seq_id->GetLabel(&m_real_seqid, CSeq_id::eFasta);
3499 }
3500 }
3501
AddFeatQual(CRef<CSeq_feat> sfp,const string & feat_name,const string & qual,const string & val,const TFlags flags,const string & seq_id1)3502 void CFeatureTableReader_Imp::AddFeatQual (
3503 CRef<CSeq_feat> sfp,
3504 const string& feat_name,
3505 const string& qual,
3506 const string& val,
3507 const TFlags flags,
3508 const string &seq_id1 )
3509
3510 {
3511 x_InitId(seq_id1, flags);
3512
3513 if (NStr::IsBlank(qual)) {
3514 return;
3515 }
3516
3517 if (!val.empty ()) { // Should probably use NStr::IsBlank()
3518 if (! x_AddQualifierToFeature (sfp, feat_name, qual, val, flags)) {
3519 // unrecognized qualifier key
3520 if ((flags & CFeature_table_reader::fReportBadKey) != 0) {
3521 ERR_POST_X (5, Warning << "Unrecognized qualifier '" << qual << "'");
3522 }
3523 if ((flags & CFeature_table_reader::fKeepBadKey) != 0) {
3524 x_AddGBQualToFeature (sfp, qual, val);
3525 }
3526 }
3527 }
3528 else { // empty val
3529 // check for the few qualifiers that do not need a value
3530 auto s_iter = sc_SingleKeys.find (qual.c_str ());
3531 if (s_iter != sc_SingleKeys.end ()) {
3532 x_AddQualifierToFeature (sfp, feat_name, qual, val, flags);
3533 }
3534 }
3535 }
3536
3537 // static
ParseInitialFeatureLine(const CTempString & line_arg,CTempStringEx & out_seqid,CTempStringEx & out_annotname)3538 bool CFeatureTableReader_Imp::ParseInitialFeatureLine (
3539 const CTempString& line_arg,
3540 CTempStringEx& out_seqid,
3541 CTempStringEx& out_annotname )
3542 {
3543 out_seqid.clear();
3544 out_annotname.clear();
3545
3546 // copy the line_arg because we can't edit line_arg itself
3547 CTempString line = line_arg;
3548
3549 // handle ">"
3550 NStr::TruncateSpacesInPlace(line);
3551 if( ! NStr::StartsWith(line, ">") ) {
3552 return false;
3553 }
3554 line = line.substr(1); // remove '>'
3555
3556 // handle "Feature"
3557 NStr::TruncateSpacesInPlace(line, NStr::eTrunc_Begin);
3558 const CTempString kFeatureStr("Feature");
3559 if( ! NStr::StartsWith(line, kFeatureStr, NStr::eNocase) ) {
3560 return false;
3561 }
3562 line = line.substr( kFeatureStr.length() ); // remove "Feature"
3563
3564 // throw out any non-space characters at the beginning,
3565 // so we can, for example, handle ">Features" (note the "s")
3566 while( !line.empty() && !isspace(line[0]) ) {
3567 line = line.substr(1);
3568 }
3569
3570 // extract seqid and annotname
3571 NStr::TruncateSpacesInPlace(line, NStr::eTrunc_Begin);
3572 NStr::SplitInTwo(line, " \t", out_seqid, out_annotname, NStr::fSplit_Tokenize);
3573
3574 return true;
3575 }
3576
3577
3578 // public access functions
3579
CFeature_table_reader(TReaderFlags fReaderFlags)3580 CFeature_table_reader::CFeature_table_reader(
3581 TReaderFlags fReaderFlags)
3582 : CReaderBase(fReaderFlags)
3583 {
3584 }
3585
CFeature_table_reader(ILineReader & lr,ILineErrorListener * pErrors)3586 CFeature_table_reader::CFeature_table_reader(
3587 ILineReader& lr,
3588 ILineErrorListener* pErrors) :
3589 CReaderBase(0),
3590 m_pImpl(new CFeatureTableReader_Imp(&lr, 0, pErrors))
3591 {}
3592
3593 CRef<CSerialObject>
ReadObject(ILineReader & lr,ILineErrorListener * pMessageListener)3594 CFeature_table_reader::ReadObject(
3595 ILineReader &lr, ILineErrorListener *pMessageListener)
3596 {
3597 CRef<CSerialObject> object(
3598 ReadSeqAnnot( lr, pMessageListener ).ReleaseOrNull() );
3599 return object;
3600 }
3601
3602
3603 CRef<CSeq_annot>
ReadSeqAnnot(ILineReader & lr,ILineErrorListener * pMessageListener)3604 CFeature_table_reader::ReadSeqAnnot(
3605 ILineReader &lr, ILineErrorListener *pMessageListener)
3606 {
3607 return ReadSequinFeatureTable(lr, 0, pMessageListener);
3608 }
3609
3610
ReadSequinFeatureTable(CNcbiIstream & ifs,const string & seqid,const string & annotname,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * filter)3611 CRef<CSeq_annot> CFeature_table_reader::ReadSequinFeatureTable (
3612 CNcbiIstream& ifs,
3613 const string& seqid,
3614 const string& annotname,
3615 const TFlags flags,
3616 ILineErrorListener* pMessageListener,
3617 ITableFilter *filter
3618 )
3619 {
3620 CStreamLineReader reader(ifs);
3621 return ReadSequinFeatureTable(reader, seqid, annotname, flags, pMessageListener, filter);
3622 }
3623
ReadSequinFeatureTable(ILineReader & reader,const string & seqid,const string & annotname,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * filter)3624 CRef<CSeq_annot> CFeature_table_reader::ReadSequinFeatureTable (
3625 ILineReader& reader,
3626 const string& seqid,
3627 const string& annotname,
3628 const TFlags flags,
3629 ILineErrorListener* pMessageListener,
3630 ITableFilter *filter
3631 )
3632 {
3633 // just read features from 5-column table
3634 CFeatureTableReader_Imp impl(&reader, 0, pMessageListener);
3635 return impl.ReadSequinFeatureTable(seqid, annotname, flags, filter);
3636 }
3637
x_ReadFeatureTable(CFeatureTableReader_Imp & reader,const CTempString & seqid,const CTempString & annot_name,TFlags flags,ITableFilter * filter)3638 CRef<CSeq_annot> CFeature_table_reader::x_ReadFeatureTable(
3639 CFeatureTableReader_Imp& reader,
3640 const CTempString& seqid,
3641 const CTempString& annot_name,
3642 TFlags flags,
3643 ITableFilter* filter) {
3644 return reader.ReadSequinFeatureTable(seqid, annot_name, flags, filter);
3645 }
3646
3647
ReadSequinFeatureTable(CNcbiIstream & ifs,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * filter)3648 CRef<CSeq_annot> CFeature_table_reader::ReadSequinFeatureTable (
3649 CNcbiIstream& ifs,
3650 const TFlags flags,
3651 ILineErrorListener* pMessageListener,
3652 ITableFilter *filter
3653 )
3654 {
3655 CStreamLineReader reader(ifs);
3656 return ReadSequinFeatureTable(reader, flags, pMessageListener, filter);
3657 }
3658
3659
x_ReadFeatureTable(CFeatureTableReader_Imp & reader,const TFlags flags,ITableFilter * filter,const string & seqid_prefix)3660 CRef<CSeq_annot> CFeature_table_reader::x_ReadFeatureTable(
3661 CFeatureTableReader_Imp& reader,
3662 const TFlags flags,
3663 ITableFilter* filter,
3664 const string& seqid_prefix)
3665 {
3666 auto pLineReader = reader.GetLineReaderPtr();
3667 if (!pLineReader) {
3668 return CRef<CSeq_annot>();
3669 }
3670
3671
3672 CTempStringEx orig_seqid, annotname;
3673 // first look for >Feature line, extract seqid and optional annotname
3674 while (orig_seqid.empty () && !pLineReader->AtEOF() ) {
3675 CTempString line = *++(*pLineReader);
3676 if( ParseInitialFeatureLine(line, orig_seqid, annotname) ) {
3677 CFeatureTableReader_Imp::PutProgress(orig_seqid,
3678 pLineReader->GetLineNumber(),
3679 reader.GetErrorListenerPtr());
3680 }
3681 }
3682
3683 string temp_seqid;
3684 if (seqid_prefix.empty()) {
3685 //seqid = orig_seqid;
3686 } else {
3687 if (orig_seqid.find('|') == string::npos)
3688 temp_seqid = seqid_prefix + orig_seqid;
3689 else
3690 if (NStr::StartsWith(orig_seqid, "lcl|"))
3691 {
3692 temp_seqid = seqid_prefix + orig_seqid.substr(4);
3693 }
3694 orig_seqid = temp_seqid;
3695 }
3696 return x_ReadFeatureTable(reader, orig_seqid, annotname, flags, filter);
3697 }
3698
3699
ReadSequinFeatureTable(ILineReader & reader,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * pFilter,const string & seqid_prefix)3700 CRef<CSeq_annot> CFeature_table_reader::ReadSequinFeatureTable (
3701 ILineReader& reader,
3702 const TFlags flags,
3703 ILineErrorListener* pMessageListener,
3704 ITableFilter* pFilter,
3705 const string& seqid_prefix
3706 )
3707 {
3708 CFeatureTableReader_Imp ftable_reader(&reader, 0, pMessageListener);
3709 return x_ReadFeatureTable(ftable_reader, flags, pFilter, seqid_prefix);
3710 }
3711
3712
ReadSequinFeatureTable(const TFlags flags,ITableFilter * pFilter,const string & seqid_prefix)3713 CRef<CSeq_annot> CFeature_table_reader::ReadSequinFeatureTable(
3714 const TFlags flags,
3715 ITableFilter* pFilter,
3716 const string& seqid_prefix
3717 )
3718 {
3719 return x_ReadFeatureTable(*m_pImpl, flags, pFilter, seqid_prefix);
3720 }
3721
3722
ReadSequinFeatureTables(CNcbiIstream & ifs,CSeq_entry & entry,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * filter)3723 void CFeature_table_reader::ReadSequinFeatureTables(
3724 CNcbiIstream& ifs,
3725 CSeq_entry& entry,
3726 const TFlags flags,
3727 ILineErrorListener* pMessageListener,
3728 ITableFilter *filter
3729 )
3730 {
3731 CStreamLineReader reader(ifs);
3732 return ReadSequinFeatureTables(reader, entry, flags, pMessageListener, filter);
3733 }
3734
3735 struct SCSeqidCompare
3736 {
3737 inline
operator ()SCSeqidCompare3738 bool operator()(const CSeq_id* left, const CSeq_id* right) const
3739 {
3740 return *left < *right;
3741 };
3742 };
3743
ReadSequinFeatureTables(ILineReader & reader,CSeq_entry & entry,const TFlags flags,ILineErrorListener * pMessageListener,ITableFilter * filter)3744 void CFeature_table_reader::ReadSequinFeatureTables(
3745 ILineReader& reader,
3746 CSeq_entry& entry,
3747 const TFlags flags,
3748 ILineErrorListener* pMessageListener,
3749 ITableFilter *filter
3750 )
3751 {
3752 // let's use map to speedup matching on very large files, see SQD-1847
3753 map<const CSeq_id*, CRef<CBioseq>, SCSeqidCompare> seq_map;
3754
3755 for (CTypeIterator<CBioseq> seqit(entry); seqit; ++seqit) {
3756 ITERATE (CBioseq::TId, seq_id, seqit->GetId()) {
3757 seq_map[seq_id->GetPointer()].Reset(&*seqit);
3758 }
3759 }
3760
3761 CFeatureTableReader_Imp ftable_reader(&reader, 0, pMessageListener);
3762 while ( !reader.AtEOF() ) {
3763 auto annot = x_ReadFeatureTable(ftable_reader, flags, filter);
3764 //CRef<CSeq_annot> annot = ReadSequinFeatureTable(reader, flags, pMessageListener, filter);
3765 if (entry.IsSeq()) { // only one place to go
3766 entry.SetSeq().SetAnnot().push_back(annot);
3767 continue;
3768 }
3769 _ASSERT(annot->GetData().IsFtable());
3770 if (annot->GetData().GetFtable().empty()) {
3771 continue;
3772 }
3773 // otherwise, take the first feature, which should be representative
3774 const CSeq_feat& feat = *annot->GetData().GetFtable().front();
3775 const CSeq_id* feat_id = feat.GetLocation().GetId();
3776 CBioseq* seq = NULL;
3777 _ASSERT(feat_id); // we expect a uniform sequence ID
3778 seq = seq_map[feat_id].GetPointer();
3779 if (seq) { // found a match
3780 seq->SetAnnot().push_back(annot);
3781 } else { // just package on the set
3782 ERR_POST_X(6, Warning
3783 << "ReadSequinFeatureTables: unable to find match for "
3784 << feat_id->AsFastaString());
3785 entry.SetSet().SetAnnot().push_back(annot);
3786 }
3787 }
3788 }
3789
3790
CreateSeqFeat(const string & feat,CSeq_loc & location,const TFlags flags,ILineErrorListener * pMessageListener,unsigned int line_number,string * seq_id,ITableFilter * filter)3791 CRef<CSeq_feat> CFeature_table_reader::CreateSeqFeat (
3792 const string& feat,
3793 CSeq_loc& location,
3794 const TFlags flags,
3795 ILineErrorListener* pMessageListener,
3796 unsigned int line_number,
3797 string *seq_id,
3798 ITableFilter *filter
3799 )
3800 {
3801 CFeatureTableReader_Imp impl(nullptr, line_number, pMessageListener);
3802 return impl.CreateSeqFeat (feat, location, flags, (seq_id ? *seq_id : string() ), filter);
3803 }
3804
3805
AddFeatQual(CRef<CSeq_feat> sfp,const string & feat_name,const string & qual,const string & val,const CFeature_table_reader::TFlags flags,ILineErrorListener * pMessageListener,int line_number,const string & seq_id)3806 void CFeature_table_reader::AddFeatQual (
3807 CRef<CSeq_feat> sfp,
3808 const string& feat_name,
3809 const string& qual,
3810 const string& val,
3811 const CFeature_table_reader::TFlags flags,
3812 ILineErrorListener* pMessageListener,
3813 int line_number,
3814 const string &seq_id
3815 )
3816
3817 {
3818 CFeatureTableReader_Imp impl(nullptr, line_number, pMessageListener);
3819 impl.AddFeatQual (sfp, feat_name, qual, val, flags, seq_id) ;
3820 }
3821
3822 bool
ParseInitialFeatureLine(const CTempString & line_arg,CTempStringEx & out_seqid,CTempStringEx & out_annotname)3823 CFeature_table_reader::ParseInitialFeatureLine (
3824 const CTempString& line_arg,
3825 CTempStringEx& out_seqid,
3826 CTempStringEx& out_annotname )
3827 {
3828 return CFeatureTableReader_Imp::ParseInitialFeatureLine(line_arg, out_seqid, out_annotname);
3829 }
3830
3831
~CFeature_table_reader()3832 CFeature_table_reader::~CFeature_table_reader() {}
3833
3834 END_objects_SCOPE
3835 END_NCBI_SCOPE
3836