1 /*  $Id: gff_reader.cpp 632526 2021-06-02 17:25:01Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:  Aaron Ucko, Wratko Hlavina
27 *
28 * File Description:
29 *   Reader for GFF (including GTF) files.
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <objtools/readers/gff_reader.hpp>
36 
37 #include <corelib/ncbitime.hpp>
38 #include <corelib/ncbiutil.hpp>
39 #include <corelib/stream_utils.hpp>
40 #include <serial/iterator.hpp>
41 
42 #include <objects/general/Date.hpp>
43 #include <objects/general/Object_id.hpp>
44 #include <objects/seq/Seq_annot.hpp>
45 #include <objects/seq/Seq_descr.hpp>
46 #include <objects/seq/Seq_inst.hpp>
47 #include <objects/seq/Seqdesc.hpp>
48 #include <objects/seqalign/Dense_seg.hpp>
49 #include <objects/seqalign/Score.hpp>
50 #include <objects/seqalign/Std_seg.hpp>
51 #include <objects/seqfeat/Feat_id.hpp>
52 #include <objects/seqfeat/Cdregion.hpp>
53 #include <objects/seqfeat/SeqFeatXref.hpp>
54 #include <objects/seqfeat/Gb_qual.hpp>
55 #include <objects/seqloc/Seq_interval.hpp>
56 #include <objects/seqloc/Seq_point.hpp>
57 #include <objects/seqset/Bioseq_set.hpp>
58 
59 #include <objtools/readers/cigar.hpp>
60 #include <objtools/readers/fasta.hpp>
61 #include <objtools/readers/readfeat.hpp>
62 #include <objtools/error_codes.hpp>
63 
64 #include <algorithm>
65 #include <ctype.h>
66 
67 
68 #define NCBI_USE_ERRCODE_X   Objtools_Rd_GFF
69 
70 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)71 BEGIN_SCOPE(objects)
72 
73 static CRef<CFeat_id>
74 s_StringToFeatId( const string& str )
75 {
76     CRef<CObject_id> objid( new CObject_id );
77     objid->SetStr( str );
78     CRef<CFeat_id> featid( new CFeat_id );
79     featid->SetLocal( *objid );
80     return featid;
81 }
82 
s_URLDecode(const CTempString & s,string & out)83 static string& s_URLDecode(const CTempString& s, string& out) {
84     SIZE_TYPE pos = 0;
85     out.erase();
86     out.reserve(s.size());
87     while (pos < s.size()) {
88         SIZE_TYPE pos2 = s.find_first_of("%" /* "+" */, pos);
89         out += s.substr(pos, pos2 - pos);
90         if (pos2 == NPOS) {
91             break;
92         } else if (s[pos2] == '+') { // disabled -- often used literally
93             out += ' ';
94             pos = pos2 + 1;
95         } else if (s[pos2] == '%') {
96             try {
97                 out += (char)NStr::StringToInt(s.substr(pos2 + 1, 2), 0, 16);
98                 pos = pos2 + 3;
99             } catch (CStringException&) {
100                 // some sources neglect to encode % (!)
101                 out += '%';
102                 pos = pos2 + 1;
103             }
104         } else {
105             _TROUBLE;
106         }
107     }
108     return out;
109 }
110 
111 
Read(CNcbiIstream & in,TFlags flags)112 CRef<CSeq_entry> CGFFReader::Read(CNcbiIstream& in, TFlags flags)
113 {
114     CStreamLineReader lr(in);
115     return Read(lr, flags);
116 }
117 
Read(ILineReader & in,TFlags flags)118 CRef<CSeq_entry> CGFFReader::Read(ILineReader& in, TFlags flags)
119 {
120     x_Reset();
121     m_Flags  = flags;
122     m_LineReader = &in;
123 
124     if (m_Flags & fSetVersion3) {
125         m_Version = 3;
126     }
127 
128     TStr line;
129     while ( !in.AtEOF() ) {
130         ++m_LineNumber;
131         char c = in.PeekChar();
132         if (c == '#') {
133             line = *++in;
134             if (line.size() > 2  &&  line[1] == '#') {
135                 x_ParseStructuredComment(line);
136                 // ignore regular comments
137             }
138         } else if (c == '>') {
139             // implicit ##FASTA
140             x_ReadFastaSequences(in);
141         } else {
142             line = *++in;
143             if ( x_IsLineUcscMetaInformation(line) ) {
144                 // UCSC browser or track line. For now, we ignore those.
145                 continue;
146             }
147             if ( line.empty() ) {
148                 // too commonly used for file formatting to even warn about
149                 continue;
150             }
151             CRef<SRecord> record = x_ParseFeatureInterval(line);
152             if (record) {
153                 if (record->id.empty()) {
154                     x_ParseAndPlace(*record);
155                 } else {
156                     CRef<SRecord>& match = m_DelayedRecords[ record->id ];
157                     // _TRACE(id << " -> " << match.GetPointer());
158                     if (match) {
159                         x_MergeRecords(*match, *record);
160                     } else {
161                         match.Reset(record);
162                     }
163                 }
164             }
165         }
166     }
167 
168     NON_CONST_ITERATE (TDelayedRecords, it, m_DelayedRecords) {
169         SRecord& rec = *it->second;
170         /// merge mergeable ranges
171         NON_CONST_ITERATE (SRecord::TLoc, loc_iter, rec.loc) {
172             ITERATE (set<TSeqRange>, src_iter, loc_iter->merge_ranges) {
173                 TSeqRange range(*src_iter);
174                 set<TSeqRange>::iterator dst_iter =
175                     loc_iter->ranges.begin();
176                 for ( ;  dst_iter != loc_iter->ranges.end();  ) {
177                     TSeqRange r(range);
178                     r += *dst_iter;
179                     if (r.GetLength() <=
180                         range.GetLength() + dst_iter->GetLength()) {
181                         range += *dst_iter;
182                         _TRACE("merging overlapping ranges: "
183                                << range.GetFrom() << " - "
184                                << range.GetTo() << " <-> "
185                                << dst_iter->GetFrom() << " - "
186                                << dst_iter->GetTo());
187                         loc_iter->ranges.erase(dst_iter++);
188                         break;
189                     } else {
190                         ++dst_iter;
191                     }
192                 }
193                 loc_iter->ranges.insert(range);
194             }
195         }
196 
197         if (rec.key == "exon") {
198             rec.key = "mRNA";
199         }
200         x_ParseAndPlace(rec);
201     }
202 
203     ///
204     /// remap gene refs
205     /// we have built a set of gene-id -> gene-ref pairs
206     ///
207     if (m_TSE  &&  m_GeneRefs.size()) {
208         NON_CONST_ITERATE (TGeneRefs, iter, m_GeneRefs) {
209             if ( !iter->second->IsSetLocus()  &&
210                  !iter->second->IsSetLocus_tag()) {
211                 iter->second->SetLocus(iter->first);
212             } else if ( !iter->second->IsSetLocus()  ||
213                         iter->second->GetLocus() != iter->first) {
214                 iter->second->SetSyn().push_back(iter->first);
215             }
216         }
217 
218         CTypeIterator<CSeq_feat> feat_iter(*m_TSE);
219         for ( ;  feat_iter;  ++feat_iter) {
220             const CGene_ref* ref = NULL;
221             if (feat_iter->GetData().IsGene()) {
222                 ref = &feat_iter->GetData().GetGene();
223             } else {
224                 ref = feat_iter->GetGeneXref();
225             }
226             if (ref  &&  ref->IsSetLocus()) {
227                 TGeneRefs::const_iterator iter =
228                     m_GeneRefs.find(ref->GetLocus());
229                 if (iter != m_GeneRefs.end()) {
230                     const_cast<CGene_ref*>(ref)->Assign(*iter->second);
231                 }
232             }
233         }
234     }
235 
236     CRef<CSeq_entry> tse(m_TSE); // need to save before resetting.
237     x_Reset();
238 
239     // promote transcript_id and protein_id to products
240     if (flags & fSetProducts) {
241         CTypeIterator<CSeq_feat> feat_iter(*tse);
242         for ( ;  feat_iter;  ++feat_iter) {
243             CSeq_feat& feat = *feat_iter;
244 
245             string qual_name;
246             switch (feat.GetData().GetSubtype()) {
247             case CSeqFeatData::eSubtype_cdregion:
248                 qual_name = "protein_id";
249                 break;
250 
251             case CSeqFeatData::eSubtype_mRNA:
252                 qual_name = "transcript_id";
253                 break;
254 
255             default:
256                 continue;
257                 break;
258             }
259 
260             string id_str = feat.GetNamedQual(qual_name);
261             if ( !id_str.empty() ) {
262                 CRef<CSeq_id> id = x_ResolveSeqName(id_str);
263                 feat.SetProduct().SetWhole(*id);
264             }
265         }
266     }
267 
268     if (flags & fCreateGeneFeats) {
269         CTypeIterator<CSeq_annot> annot_iter(*tse);
270         for ( ;  annot_iter;  ++annot_iter) {
271             CSeq_annot& annot = *annot_iter;
272             if (annot.GetData().Which() != CSeq_annot::TData::e_Ftable) {
273                 continue;
274             }
275 
276             // we work within the scope of one annotation
277             CSeq_annot::TData::TFtable::iterator feat_iter =
278                 annot.SetData().SetFtable().begin();
279             CSeq_annot::TData::TFtable::iterator feat_end =
280                 annot.SetData().SetFtable().end();
281 
282             /// we plan to create a series of gene features, one for each gene
283             /// identified above
284             /// genes are identified via a 'gene_id' marker
285             typedef map<string, CRef<CSeq_feat> > TGeneMap;
286             TGeneMap genes;
287             for (bool has_genes = false;
288                  feat_iter != feat_end  &&  !has_genes;  ++feat_iter) {
289                 CSeq_feat& feat = **feat_iter;
290 
291                 switch (feat.GetData().GetSubtype()) {
292                 case CSeqFeatData::eSubtype_gene:
293                     /// we already have genes, so don't add any more
294                     has_genes = true;
295                     genes.clear();
296                     break;
297 
298                 case CSeqFeatData::eSubtype_mRNA:
299                 case CSeqFeatData::eSubtype_cdregion:
300                     /// for mRNA and CDS features, create a gene
301                     /// this is only done if the gene_id parameter was set
302                     /// in parsing, we promote gene_id to a gene xref
303                     if ( !feat.GetGeneXref() ) {
304                         continue;
305                     }
306                     {{
307                         string gene_id;
308                         feat.GetGeneXref()->GetLabel(&gene_id);
309                         _ASSERT( !gene_id.empty() );
310                         TSeqRange range = feat.GetLocation().GetTotalRange();
311 
312                         ENa_strand strand = feat.GetLocation().GetStrand();
313                         const CSeq_id* id = feat.GetLocation().GetId();
314                         if ( !id ) {
315                             x_Error("No consistent ID found; gene feature skipped");
316                             continue;
317                         }
318 
319                         TGeneMap::iterator iter = genes.find(gene_id);
320                         if (iter == genes.end()) {
321                             /// new gene feature
322                             CRef<CSeq_feat> gene(new CSeq_feat());
323                             gene->SetData().SetGene().Assign(*feat.GetGeneXref());
324 
325                             gene->SetLocation().SetInt().SetFrom(range.GetFrom());
326                             gene->SetLocation().SetInt().SetTo  (range.GetTo());
327                             gene->SetLocation().SetId(*id);
328                             gene->SetLocation().SetInt().SetStrand(strand);
329                             genes[gene_id] = gene;
330                         } else {
331                             /// we agglomerate the old location
332                             CRef<CSeq_feat> gene = iter->second;
333 
334                             TSeqRange r2 = gene->GetLocation().GetTotalRange();
335                             range += r2;
336                             gene->SetLocation().SetInt().SetFrom(range.GetFrom());
337                             gene->SetLocation().SetInt().SetTo  (range.GetTo());
338                             gene->SetLocation().InvalidateTotalRangeCache();
339                         }
340                     }}
341                     break;
342 
343                 default:
344                     break;
345                 }
346             }
347 
348             ITERATE (TGeneMap, iter, genes) {
349                 annot.SetData().SetFtable().push_back(iter->second);
350             }
351         }
352     }
353 
354     return tse;
355 }
356 
357 
x_Warn(const string & message,unsigned int line)358 void CGFFReader::x_Warn(const string& message, unsigned int line)
359 {
360     if (line) {
361         ERR_POST_X(2, Warning << message << " [GFF input, line " << line << ']');
362     } else {
363         ERR_POST_X(3, Warning << message << " [GFF input]");
364     }
365 }
366 
367 
x_Error(const string & message,unsigned int line)368 void CGFFReader::x_Error(const string& message, unsigned int line)
369 {
370     if (line) {
371         ERR_POST_X(1, Error << message << " [GFF input, line " << line << ']');
372     } else {
373         ERR_POST_X(1, Error << message << " [GFF input]");
374     }
375 }
376 
377 
x_Info(const string & message,unsigned int line)378 void CGFFReader::x_Info(const string& message, unsigned int line)
379 {
380     if (line) {
381         ERR_POST_X(1, Info << message << " [GFF input, line " << line << ']');
382     } else {
383         ERR_POST_X(1, Info << message << " [GFF input]");
384     }
385 }
386 
387 
x_Reset(void)388 void CGFFReader::x_Reset(void)
389 {
390     m_TSE.Reset(new CSeq_entry);
391     m_SeqNameCache.clear();
392     m_SeqCache.clear();
393     m_DelayedRecords.clear();
394     m_GeneRefs.clear();
395     m_DefMol.erase();
396     m_LineNumber = 0;
397     m_Version = 2;
398 }
399 
400 
x_ParseStructuredComment(const TStr & line)401 bool CGFFReader::x_ParseStructuredComment(const TStr& line)
402 {
403     if ( line.empty() || line[0] != '#' || line[1] != '#' ) {
404         return false;
405     }
406     TStrVec v;
407     NStr::Split(line, "# \t", v, NStr::fSplit_Tokenize);
408     if (v.empty()) {
409         return true;
410     }
411     if (v[0] == "date"  &&  v.size() > 1) {
412         x_ParseDateComment(v[1]);
413     } else if (v[0] == "Type"  &&  v.size() > 1) {
414         x_ParseTypeComment(v[1], v.size() > 2 ? v[2] : TStr());
415     } else if (v[0] == "gff-version"  &&  v.size() > 1) {
416         m_Version = NStr::StringToInt(v[1]);
417     } else if (v[0] == "FASTA") {
418         x_ReadFastaSequences(*m_LineReader);
419     }
420     // etc.
421     return true;
422 }
423 
424 
x_ParseDateComment(const TStr & date)425 void CGFFReader::x_ParseDateComment(const TStr& date)
426 {
427     try {
428         CRef<CSeqdesc> desc(new CSeqdesc);
429         desc->SetUpdate_date().SetToTime(CTime(date, "Y-M-D"),
430                                          CDate::ePrecision_day);
431         m_TSE->SetSet().SetDescr().Set().push_back(desc);
432     } catch (exception& e) {
433         x_Error(string("Bad ISO date: ") + e.what(), x_GetLineNumber());
434     }
435 }
436 
437 
x_ParseTypeComment(const TStr & moltype,const TStr & seqname)438 void CGFFReader::x_ParseTypeComment(const TStr& moltype, const TStr& seqname)
439 {
440     if (seqname.empty()) {
441         m_DefMol = moltype;
442     } else {
443         // automatically adds to m_TSE if new
444         x_ResolveID(*x_ResolveSeqName(seqname), moltype);
445     }
446 }
447 
448 
x_ReadFastaSequences(ILineReader & in)449 void CGFFReader::x_ReadFastaSequences(ILineReader& in)
450 {
451     CFastaReader reader(in, CFastaReader::fAssumeNuc);
452     CRef<CSeq_entry> seqs = reader.ReadSet();
453     for (CTypeIterator<CBioseq> it(*seqs);  it;  ++it) {
454         if (it->GetId().empty()) { // can this happen?
455             CRef<CSeq_entry> parent(new CSeq_entry);
456             parent->SetSeq(*it);
457             m_TSE->SetSet().SetSeq_set().push_back(parent);
458             continue;
459         }
460         CRef<CBioseq> our_bs = x_ResolveID(*it->GetId().front(), kEmptyStr);
461         // keep our annotations, but replace everything else.
462         // (XXX - should also keep mol)
463         our_bs->SetId() = it->GetId();
464         if (it->IsSetDescr()) {
465             our_bs->SetDescr(it->SetDescr());
466         }
467         our_bs->SetInst(it->SetInst());
468     }
469 }
470 
471 
472 CRef<CGFFReader::SRecord>
x_ParseFeatureInterval(const TStr & line)473 CGFFReader::x_ParseFeatureInterval(const TStr& line)
474 {
475     TStrVec v;
476     bool    misdelimited = false;
477 
478     NStr::Split(line, "\t", v);
479     if (v.size() < 8) {
480         v.clear();
481         NStr::Split(line, " \t", v, NStr::fSplit_Tokenize);
482         if (v.size() < 8) {
483             x_Error("Skipping line due to insufficient fields",
484                    x_GetLineNumber());
485             return null;
486         } else if (m_Version < 3) {
487             x_Info("(Recovered) Bad delimiters (should use tabs)", x_GetLineNumber());
488             misdelimited = true;
489         }
490     } else {
491         // XXX - warn about extra fields (if any), but only if they're
492         // not comments
493         // v.resize(9);
494     }
495 
496     CRef<SRecord> record(x_NewRecord());
497     string        accession;
498     TSeqPos       from = 0, to = numeric_limits<TSeqPos>::max();
499     ENa_strand    strand = eNa_strand_unknown;
500     s_URLDecode(v[0], accession);
501     record->source = v[1];
502     record->key = v[2];
503 
504     try {
505         from = NStr::StringToUInt(v[3]) - 1;
506     } catch (std::exception& e) {
507         x_Error(string("Bad FROM position: ") + e.what(), x_GetLineNumber());
508     }
509 
510     try {
511         to = NStr::StringToUInt(v[4]) - 1;
512     } catch (std::exception& e) {
513         x_Error(string("Bad TO position: ") + e.what(), x_GetLineNumber());
514     }
515 
516     record->score = v[5];
517 
518     if (v[6] == "+") {
519         strand = eNa_strand_plus;
520     } else if (v[6] == "-") {
521         strand = eNa_strand_minus;
522     } else if ( !(v[6] == ".") ) {
523         x_Warn("Bad strand " + string(v[6]) + " (should be [+-.])",
524                x_GetLineNumber());
525     }
526 
527     if (v[7] == "0"  ||  v[7] == "1"  ||  v[7] == "2") {
528         record->frame = v[7][0] - '0';
529     } else if (v[7] == ".") {
530         record->frame = -1;
531     } else {
532         x_Warn("Bad frame " + string(v[7]) + " (should be [012.])",
533                x_GetLineNumber());
534         record->frame = -1;
535     }
536 
537     {{
538         SRecord::SSubLoc subloc;
539         subloc.accession = accession;
540         subloc.strand    = strand;
541         subloc.ranges.insert(TSeqRange(from, to));
542 
543         record->loc.push_back(subloc);
544     }}
545 
546     SIZE_TYPE i = 8;
547     if (m_Version >= 3) {
548         x_ParseV3Attributes(*record, v, i);
549     } else {
550         x_ParseV2Attributes(*record, v, i);
551     }
552 
553     if ( !misdelimited  &&  (i > 9  ||  (i == 9  &&  v.size() > 9
554                                          &&  !NStr::StartsWith(v[9], "#") ))) {
555         x_Warn("Extra non-comment fields", x_GetLineNumber());
556     }
557 
558     if (record->FindAttribute("Target") != record->attrs.end()) {
559         record->type = SRecord::eAlign;
560     } else {
561         record->type = SRecord::eFeat;
562     }
563 
564     // extracting additional gff3 attributes
565     if (m_Version == 3) {
566         SRecord::TAttrs::const_iterator id_it = record->FindAttribute("ID");
567         if (id_it != record->attrs.end()) {
568             record->id = (*id_it)[1];
569         }
570 
571         SRecord::TAttrs::const_iterator parent_it = record->FindAttribute("Parent");
572         if (parent_it != record->attrs.end()) {
573             record->parent = (*parent_it)[1];
574         }
575 
576         SRecord::TAttrs::const_iterator name_it = record->FindAttribute("Name");
577         if (name_it != record->attrs.end()) {
578             record->name = (*name_it)[1];
579         }
580     }
581 
582     record->line_no = m_LineNumber;
583     record->id = x_FeatureID(*record);
584     return record;
585 }
586 
587 
x_ParseFeatRecord(const SRecord & record)588 CRef<CSeq_feat> CGFFReader::x_ParseFeatRecord(const SRecord& record)
589 {
590     CRef<CSeq_feat> feat(CFeature_table_reader::CreateSeqFeat
591                          (record.key, *x_ResolveLoc(record.loc),
592                           CFeature_table_reader::fTranslateBadKey));
593     if (record.frame >= 0  &&  feat->GetData().IsCdregion()) {
594         feat->SetData().SetCdregion().SetFrame
595             (static_cast<CCdregion::EFrame>(record.frame + 1));
596     }
597     if ( m_Version == 3 ) {
598         ITERATE (SRecord::TAttrs, it, record.attrs) {
599             string tag = it->front();
600             if (tag == "ID") {
601                 feat->SetId( *s_StringToFeatId( (*it)[1] ) );
602             }
603             if (tag == "Parent") {
604                 CRef<CSeqFeatXref> xref( new CSeqFeatXref );
605                 xref->SetId( *s_StringToFeatId( (*it)[1] ) );
606                 feat->SetXref().push_back( xref );
607             }
608         }
609     }
610 
611     if ( record.source != "." ) {
612         CRef<CGb_qual> source( new CGb_qual );
613         source->SetQual( "source" );
614         source->SetVal( record.source );
615         feat->SetQual().push_back( source );
616     }
617 
618     string gene_id;
619     string gene;
620     string locus_tag;
621     ITERATE (SRecord::TAttrs, it, record.attrs) {
622         string tag = it->front();
623         string value;
624         switch (it->size()) {
625         case 1:
626             break;
627         case 2:
628             value = (*it)[1];
629             break;
630         default:
631             x_Warn("Ignoring extra fields in value of " + tag, record.line_no);
632             value = (*it)[1];
633             break;
634         }
635         if (x_GetFlags() & fGBQuals) {
636             if (tag == "transcript_id") {
637                 //continue;
638             } else if (tag == "gene_id") {
639                 gene_id = value;
640                 continue;
641             } else if (tag == "gene") {
642                 gene = value;
643                 continue;
644             } else if (tag == "locus_tag") {
645                 locus_tag = value;
646                 continue;
647             } else if (tag == "exon_number") {
648                 tag = "number";
649             } else if (NStr::StartsWith(tag, "insd_")) {
650                 tag.erase(0, 5);
651             }
652 
653             CFeature_table_reader::AddFeatQual
654                 (feat, kEmptyStr, tag, value, CFeature_table_reader::fKeepBadKey);
655         } else { // don't attempt to parse, just treat as imported
656             CRef<CGb_qual> qual(new CGb_qual);
657             qual->SetQual(tag);
658             qual->SetVal(value);
659             feat->SetQual().push_back(qual);
660         }
661     }
662 
663     if ( !gene_id.empty() ) {
664         SIZE_TYPE colon = gene_id.find(':');
665         if (colon != NPOS) {
666             gene_id.erase(0, colon + 1);
667         }
668 
669         TGeneRefs::value_type val(gene_id, CRef<CGene_ref>());
670         TGeneRefs::iterator iter = m_GeneRefs.insert(val).first;
671         if ( !iter->second ) {
672             iter->second.Reset(new CGene_ref);
673         }
674         if ( !gene.empty() ) {
675             if (iter->second->IsSetLocus()  &&
676                 iter->second->GetLocus() != gene) {
677                 ERR_POST_X(4, Warning << "CGFFReader::x_ParseFeatRecord(): "
678                            << "inconsistent gene name: "
679                            << gene << " != " << iter->second->GetLocus()
680                            << ", ignoring second");
681             } else if ( !iter->second->IsSetLocus() ) {
682                 iter->second->SetLocus(gene);
683             }
684         }
685         if ( !locus_tag.empty() ) {
686             if (iter->second->IsSetLocus_tag()  &&
687                 iter->second->GetLocus_tag() != locus_tag) {
688                 ERR_POST_X(5, Warning << "CGFFReader::x_ParseFeatRecord(): "
689                            << "inconsistent locus tag: "
690                            << locus_tag << " != " << iter->second->GetLocus_tag()
691                            << ", ignoring second");
692             } else if ( !iter->second->IsSetLocus_tag() ) {
693                 iter->second->SetLocus_tag(locus_tag);
694             }
695         }
696 
697         // translate
698         CFeature_table_reader::AddFeatQual
699             (feat, kEmptyStr, "gene_id", gene_id,
700              CFeature_table_reader::fKeepBadKey);
701         if (x_GetFlags() & fGBQuals) {
702             CFeature_table_reader::AddFeatQual
703                 (feat, kEmptyStr, "gene", gene_id,
704                  CFeature_table_reader::fKeepBadKey);
705         }
706     }
707 
708     return feat;
709 }
710 
711 
x_ParseAlignRecord(const SRecord & record)712 CRef<CSeq_align> CGFFReader::x_ParseAlignRecord(const SRecord& record)
713 {
714     CRef<CSeq_align> align(new CSeq_align);
715     align->SetType(CSeq_align::eType_partial);
716     align->SetDim(2);
717     SRecord::TAttrs::const_iterator tgit = record.FindAttribute("Target");
718     vector<string> target;
719     if (tgit != record.attrs.end()) {
720         NStr::Split((*tgit)[1], " +-", target, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
721     }
722     if (target.size() != 3) {
723         x_Warn("Bad Target attribute", record.line_no);
724         return align;
725     }
726     CRef<CSeq_id> tgid    = x_ResolveSeqName(target[0]);
727     TSeqPos       tgstart = NStr::StringToUInt(target[1]) - 1;
728     TSeqPos       tgstop  = NStr::StringToUInt(target[2]) - 1;
729     TSeqPos       tglen   = tgstop - tgstart + 1;
730 
731     CRef<CSeq_loc> refloc = x_ResolveLoc(record.loc);
732     CRef<CSeq_id>  refid(&refloc->SetInt().SetId());
733     TSeqPos        reflen = 0;
734     for (CSeq_loc_CI it(*refloc);  it;  ++it) {
735         reflen += it.GetRange().GetLength();
736     }
737 
738     CRef<CSeq_loc> tgloc(new CSeq_loc);
739     tgloc->SetInt().SetId(*tgid);
740     tgloc->SetInt().SetFrom(tgstart);
741     tgloc->SetInt().SetTo(tgstop);
742 
743     SRecord::TAttrs::const_iterator gap_it = record.FindAttribute("Gap");
744     if (gap_it == record.attrs.end()) {
745         // single ungapped alignment
746         if (reflen == tglen  &&  refloc->IsInt()) {
747             CDense_seg& ds = align->SetSegs().SetDenseg();
748             ds.SetNumseg(1);
749             ds.SetIds().push_back(refid);
750             ds.SetIds().push_back(tgid);
751             ds.SetStarts().push_back(refloc->GetInt().GetFrom());
752             ds.SetStarts().push_back(tgstart);
753             ds.SetLens().push_back(reflen);
754             if (refloc->GetInt().IsSetStrand()) {
755                 ds.SetStrands().push_back(refloc->GetInt().GetStrand());
756                 ds.SetStrands().push_back(eNa_strand_plus);
757             }
758         } else {
759             if (reflen != tglen  &&  reflen != 3 * tglen) {
760                 x_Warn("Reference and target locations have an irregular"
761                        " ratio.", record.line_no);
762             }
763             CRef<CStd_seg> ss(new CStd_seg);
764             ss->SetLoc().push_back(refloc);
765             ss->SetLoc().push_back(tgloc);
766             align->SetSegs().SetStd().push_back(ss);
767         }
768     } else {
769         SCigarAlignment cigar
770             ((*gap_it)[1], SCigarAlignment::eOpFirstIfAmbiguous);
771         align = cigar(refloc->GetInt(), tgloc->GetInt());
772     }
773 
774     try {
775         CRef<CScore> score(new CScore);
776         score->SetValue().SetReal(NStr::StringToDouble(record.score));
777         align->SetScore().push_back(score);
778     } catch (...) {
779     }
780 
781     return align;
782 }
783 
784 
x_ResolveLoc(const SRecord::TLoc & loc)785 CRef<CSeq_loc> CGFFReader::x_ResolveLoc(const SRecord::TLoc& loc)
786 {
787     CRef<CSeq_loc> seqloc(new CSeq_loc);
788     ITERATE (SRecord::TLoc, it, loc) {
789         CRef<CSeq_id> id = x_ResolveSeqName(it->accession);
790         ITERATE (set<TSeqRange>, range, it->ranges) {
791             CRef<CSeq_loc> segment(new CSeq_loc);
792             if (range->GetLength() == 1) {
793                 CSeq_point& pnt = segment->SetPnt();
794                 pnt.SetId   (*id);
795                 pnt.SetPoint(range->GetFrom());
796                 if (it->strand != eNa_strand_unknown) {
797                     pnt.SetStrand(it->strand);
798                 }
799             } else {
800                 CSeq_interval& si = segment->SetInt();
801                 si.SetId  (*id);
802                 si.SetFrom(range->GetFrom());
803                 si.SetTo  (range->GetTo());
804                 if (it->strand != eNa_strand_unknown) {
805                     si.SetStrand(it->strand);
806                 }
807             }
808             if (IsReverse(it->strand)) {
809                 seqloc->SetMix().Set().push_front(segment);
810             } else {
811                 seqloc->SetMix().Set().push_back(segment);
812             }
813         }
814     }
815 
816     if (seqloc->GetMix().Get().size() == 1) {
817         return seqloc->SetMix().Set().front();
818     } else {
819         return seqloc;
820     }
821 }
822 
823 
x_ParseV2Attributes(SRecord & record,const TStrVec & v,SIZE_TYPE & i)824 void CGFFReader::x_ParseV2Attributes(SRecord& record, const TStrVec& v,
825                                      SIZE_TYPE& i)
826 {
827     string         attr_last_value;
828     vector<string> attr_values;
829     char           quote_char = 0;
830 
831     for (;  i < v.size();  ++i) {
832         string s = string(v[i]) + ' ';
833         SIZE_TYPE pos = 0;
834         while (pos < s.size()) {
835             SIZE_TYPE pos2;
836             if (quote_char) { // must be inside a value
837                 pos2 = s.find_first_of(" \'\"\\", pos);
838                 _ASSERT(pos2 != NPOS); // due to trailing space
839                 if (s[pos2] == quote_char) {
840                     if (attr_values.empty()) {
841                         x_Warn("quoted attribute tag " + attr_last_value,
842                                x_GetLineNumber());
843                     }
844                     quote_char = 0;
845                     attr_last_value += s.substr(pos, pos2 - pos);
846                     try {
847                         attr_values.push_back(NStr::ParseEscapes
848                                               (attr_last_value));
849                     } catch (CStringException& e) {
850                         attr_values.push_back(attr_last_value);
851                         x_Warn(e.what() + (" in value of " + attr_values[0]),
852                                x_GetLineNumber());
853                     }
854                     attr_last_value.erase();
855                 } else if (s[pos2] == '\\') {
856                     _VERIFY(++pos2 != s.size());
857                     attr_last_value += s.substr(pos, pos2 + 1 - pos);
858                 } else {
859                     attr_last_value += s.substr(pos, pos2 + 1 - pos);
860                 }
861             } else {
862                 pos2 = s.find_first_of(" #;\"", pos); // also look for \'?
863                 _ASSERT(pos2 != NPOS); // due to trailing space
864                 if (pos != pos2) {
865                     // grab and place the preceding token
866                     attr_last_value += s.substr(pos, pos2 - pos);
867                     attr_values.push_back(attr_last_value);
868                     attr_last_value.erase();
869                 }
870 
871                 switch (s[pos2]) {
872                 case ' ':
873                     if (pos2 == s.size() - 1) {
874                         x_AddAttribute(record, attr_values);
875                         attr_values.clear();
876                     }
877                     break;
878 
879                 case '#':
880                     return;
881 
882                 case ';':
883                     if (attr_values.empty()) {
884                         x_Warn("null attribute", x_GetLineNumber());
885                     } else {
886                         x_AddAttribute(record, attr_values);
887                         attr_values.clear();
888                     }
889                     break;
890 
891                 // NB: we don't currently search for single quotes.
892                 case '\"':
893                 case '\'':
894                     quote_char = s[pos2];
895                     break;
896 
897                 default:
898                     _TROUBLE;
899                 }
900             }
901             pos = pos2 + 1;
902         }
903     }
904 
905     if ( !attr_values.empty() ) {
906         x_Warn("unterminated attribute " + attr_values[0], x_GetLineNumber());
907         x_AddAttribute(record, attr_values);
908     }
909 }
910 
x_SplitKeyValuePair(const string & pair,string & key,string & value)911 bool CGFFReader::x_SplitKeyValuePair( const string& pair, string& key, string& value )
912 {
913     if ( NStr::SplitInTwo( pair, "=", key, value ) ) {
914         return true;
915     }
916     if ( NStr::SplitInTwo( pair, " ", key, value ) ) {
917         x_Info("(recovered) missdelimited attribute/value pair: " + key, x_GetLineNumber());
918         return true;
919     }
920     x_Warn("attribute without value: " + key, x_GetLineNumber());
921     return false;
922 }
923 
924 
x_ParseV3Attributes(SRecord & record,const TStrVec & v,SIZE_TYPE & i)925 void CGFFReader::x_ParseV3Attributes(SRecord& record, const TStrVec& v,
926                                      SIZE_TYPE& i)
927 {
928     vector<string> v2, attr;
929     NStr::Split(v[i], ";", v2, NStr::fSplit_Tokenize);
930     ITERATE (vector<string>, it, v2) {
931         attr.clear();
932         string key, values;
933         if (x_SplitKeyValuePair( *it, key, values )) {
934             vector<string> vals;
935             attr.resize(2);
936             s_URLDecode(key, attr[0]);
937             NStr::Split(values, ",", vals);
938             ITERATE (vector<string>, it2, vals) {
939                 string value( *it2 );
940                 if ( NStr::MatchesMask(value, "\"*\"") ) {
941                     //
942                     //  Note: The GFF3 spec is ambiguous on whether quoting is
943                     //  required for free text values.
944                     //
945                     value = value.substr(1, value.length()-2);
946                 }
947                 s_URLDecode(value, attr[1]);
948                 x_AddAttribute(record, attr);
949             }
950         } else {
951             x_Warn("attribute without value: " + key, x_GetLineNumber());
952             attr.resize(1);
953             s_URLDecode(*it, attr[0]);
954             x_AddAttribute(record, attr);
955             continue;
956         }
957     }
958 }
959 
960 
x_AddAttribute(SRecord & record,vector<string> & attr)961 void CGFFReader::x_AddAttribute(SRecord& record, vector<string>& attr)
962 {
963     if (attr.size() == 0) {
964         return;
965     }
966 
967     if (x_GetFlags() & fGBQuals) {
968         if (attr[0] == "gbkey"  &&  attr.size() == 2) {
969             record.key = attr[1];
970             return;
971         }
972     }
973     record.attrs.insert(attr);
974 }
975 
976 
x_FeatureID(const SRecord & record)977 string CGFFReader::x_FeatureID(const SRecord& record)
978 {
979     if (record.type != SRecord::eFeat  ||  x_GetFlags() & fNoGTF) {
980         return kEmptyStr;
981     }
982 
983     // has been retrieved in initial interval parsing
984     if (m_Version == 3) {
985         if (!record.id.empty()) {
986             return  record.id;
987         }
988         else if (!record.parent.empty()) {
989             return record.source + record.key + record.parent;
990         }
991         else {
992             return "";
993         }
994     }
995 
996     SRecord::TAttrs::const_iterator gene_it = record.FindAttribute("gene_id");
997     SRecord::TAttrs::const_iterator transcript_it
998         = record.FindAttribute("transcript_id");
999 
1000     // concatenate our IDs from above, if found
1001     string id;
1002     if (gene_it != record.attrs.end()) {
1003         id += (*gene_it)[1];
1004     }
1005 
1006     if (transcript_it != record.attrs.end()) {
1007         if ( !id.empty() ) {
1008             id += ' ';
1009         }
1010         id += (*transcript_it)[1];
1011     }
1012 
1013     // look for db xrefs
1014     SRecord::TAttrs::const_iterator dbxref_it
1015         = record.FindAttribute("db_xref");
1016     for ( ; dbxref_it != record.attrs.end()  &&
1017             dbxref_it->front() == "db_xref";  ++dbxref_it) {
1018         if ( !id.empty() ) {
1019             id += ' ';
1020         }
1021         id += (*dbxref_it)[1];
1022     }
1023 
1024     if ( id.empty() ) {
1025         return id;
1026     }
1027 
1028     if (record.key == "start_codon" ||  record.key == "stop_codon") {
1029         //id += " " + record.key;
1030         id += "CDS";
1031     } else if (record.key == "CDS"
1032                ||  NStr::FindNoCase(record.key, "rna") != NPOS) {
1033         //id += " " + record.key;
1034         id += record.key;
1035     } else if (record.key == "exon") {
1036         // normally separate intervals, but may want to merge.
1037         if (x_GetFlags() & fMergeExons) {
1038             id += record.key;
1039         } else {
1040             SRecord::TAttrs::const_iterator it
1041                 = record.FindAttribute("exon_number");
1042             if (it == record.attrs.end()) {
1043                 return kEmptyStr;
1044             } else {
1045                 id += record.key + ' ' + (*it)[1];
1046             }
1047         }
1048     } else if (x_GetFlags() & fMergeOnyCdsMrna) {
1049         return kEmptyStr;
1050     }
1051     return id;
1052 }
1053 
1054 
x_MergeRecords(SRecord & dest,const SRecord & src)1055 void CGFFReader::x_MergeRecords(SRecord& dest, const SRecord& src)
1056 {
1057     // XXX - perform sanity checks and warn on mismatch
1058 
1059     bool merge_overlaps = false;
1060     if (dest.key == "CDS"  &&
1061         (src.key == "start_codon"  ||  src.key == "stop_codon")) {
1062         // start_codon and stop_codon features should be merged into
1063         // existing CDS locations
1064         merge_overlaps = true;
1065     }
1066 
1067     if ((dest.key == "start_codon"  ||  dest.key == "stop_codon") &&
1068         src.key == "CDS") {
1069         // start_codon and stop_codon features should be merged into
1070         // existing CDS locations
1071         merge_overlaps = true;
1072         dest.key = "CDS";
1073     }
1074 
1075     // adjust the frame as needed
1076     int best_frame = dest.frame;
1077 
1078     ITERATE (SRecord::TLoc, slit, src.loc) {
1079         bool merged = false;
1080         NON_CONST_ITERATE (SRecord::TLoc, dlit, dest.loc) {
1081             if (slit->accession != dlit->accession) {
1082                 if (dest.loc.size() == 1) {
1083                     x_Warn("Multi-accession feature", src.line_no);
1084                 }
1085                 continue;
1086             } else if (slit->strand != dlit->strand) {
1087                 if (dest.loc.size() == 1) {
1088                     x_Warn("Multi-orientation feature", src.line_no);
1089                 }
1090                 continue;
1091             } else {
1092                 if (slit->strand == eNa_strand_plus) {
1093                     if (slit->ranges.begin()->GetFrom() <
1094                         dlit->ranges.begin()->GetFrom()) {
1095                         best_frame = src.frame;
1096                     }
1097                 } else {
1098                     if (slit->ranges.begin()->GetTo() >
1099                         dlit->ranges.begin()->GetTo()) {
1100                         best_frame = src.frame;
1101                     }
1102                 }
1103                 if (merge_overlaps) {
1104                     ITERATE (set<TSeqRange>, set_iter, slit->ranges) {
1105                         dlit->merge_ranges.insert(*set_iter);
1106                     }
1107                 } else {
1108                     ITERATE (set<TSeqRange>, set_iter, slit->ranges) {
1109                         dlit->ranges.insert(*set_iter);
1110                     }
1111                 }
1112                 merged = true;
1113                 break;
1114             }
1115         }
1116         if ( !merged ) {
1117             dest.loc.push_back(*slit);
1118         }
1119     }
1120 
1121     dest.frame = best_frame;
1122     if (src.key != dest.key) {
1123         if (dest.key == "CDS"  &&  NStr::EndsWith(src.key, "_codon")
1124             &&  !(x_GetFlags() & fNoGTF) ) {
1125             // ok
1126         } else if (src.key == "CDS" &&  NStr::EndsWith(dest.key, "_codon")
1127             &&  !(x_GetFlags() & fNoGTF) ) {
1128             dest.key = "CDS";
1129         } else {
1130             x_Warn("Merging features with different keys: " + dest.key
1131                    + " != " + src.key, src.line_no);
1132         }
1133     }
1134 
1135     x_MergeAttributes(dest, src);
1136 }
1137 
1138 
x_MergeAttributes(SRecord & dest,const SRecord & src)1139 void CGFFReader::x_MergeAttributes(SRecord& dest, const SRecord& src)
1140 {
1141     SRecord::TAttrs::iterator dait     = dest.attrs.begin();
1142     SRecord::TAttrs::iterator dait_end = dest.attrs.end();
1143     SRecord::TAttrs::iterator dait_tag = dait_end;
1144     ITERATE (SRecord::TAttrs, sait, src.attrs) {
1145         const string& tag = sait->front();
1146         while (dait != dait_end  &&  dait->front() < tag) {
1147             ++dait;
1148         }
1149 
1150         if (dait_tag == dait_end  ||  dait_tag->front() != tag) {
1151             dait_tag = dait;
1152         }
1153         if (dait != dait_end  &&  dait->front() == tag) {
1154             while (dait != dait_end  &&  *dait < *sait) {
1155                 ++dait;
1156             }
1157         }
1158         if (dait != dait_end  &&  *dait == *sait) {
1159             continue; // identical
1160         } else if ( !(x_GetFlags() & fNoGTF)  &&  tag == "exon_number") {
1161             if (dait_tag != dait_end) {
1162                 while (dait != dait_end  &&  dait->front() == tag) {
1163                     ++dait;
1164                 }
1165                 dest.attrs.erase(dait_tag, dait);
1166                 dait_tag = dait_end;
1167             }
1168         } else {
1169             dest.attrs.insert(dait, *sait);
1170         }
1171     }
1172 }
1173 
1174 
x_PlaceFeature(CSeq_feat & feat,const SRecord &)1175 void CGFFReader::x_PlaceFeature(CSeq_feat& feat, const SRecord&)
1176 {
1177     CRef<CBioseq> seq;
1178     if ( !feat.IsSetProduct() ) {
1179         for (CTypeConstIterator<CSeq_id> it(feat.GetLocation());  it;  ++it) {
1180             CRef<CBioseq> seq2 = x_ResolveID(*it, kEmptyStr);
1181             if ( !seq ) {
1182                 seq.Reset(seq2);
1183             } else if ( seq2.NotEmpty()  &&  seq != seq2) {
1184                 seq.Reset();
1185                 BREAK(it);
1186             }
1187         }
1188     }
1189 
1190     CBioseq::TAnnot& annots
1191         = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot();
1192     NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) {
1193         if ((*it)->GetData().IsFtable()) {
1194             (*it)->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));
1195             return;
1196         }
1197     }
1198     CRef<CSeq_annot> annot(new CSeq_annot);
1199     annot->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));
1200     annots.push_back(annot);
1201 }
1202 
1203 
x_PlaceAlignment(CSeq_align & align,const SRecord & record)1204 void CGFFReader::x_PlaceAlignment(CSeq_align& align, const SRecord& record)
1205 {
1206     CRef<CBioseq> seq;
1207     try {
1208         seq = x_ResolveID(align.GetSeq_id(0), kEmptyStr);
1209     } catch (...) {
1210     }
1211     CBioseq::TAnnot& annots
1212         = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot();
1213     NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) {
1214         if ((*it)->GetData().IsAlign()) {
1215             (*it)->SetData().SetAlign().push_back(CRef<CSeq_align>(&align));
1216             return;
1217         }
1218     }
1219     CRef<CSeq_annot> annot(new CSeq_annot);
1220     annot->SetData().SetAlign().push_back(CRef<CSeq_align>(&align));
1221     annots.push_back(annot);
1222 }
1223 
1224 
x_ParseAndPlace(const SRecord & record)1225 void CGFFReader::x_ParseAndPlace(const SRecord& record)
1226 {
1227     switch (record.type) {
1228     case SRecord::eFeat:
1229         x_PlaceFeature(*x_ParseFeatRecord(record), record);
1230         break;
1231     case SRecord::eAlign:
1232         x_PlaceAlignment(*x_ParseAlignRecord(record), record);
1233         break;
1234     default:
1235         x_Warn("Unknown record type " + NStr::IntToString(record.type),
1236                record.line_no);
1237     }
1238 }
1239 
1240 
x_ResolveSeqName(const string & name)1241 CRef<CSeq_id> CGFFReader::x_ResolveSeqName(const string& name)
1242 {
1243     CRef<CSeq_id>& id = m_SeqNameCache[name];
1244     if (id.NotEmpty()
1245         &&  (id->Which() == CSeq_id::e_not_set
1246              ||  static_cast<int>(id->Which()) >= CSeq_id::e_MaxChoice)) {
1247         x_Warn("x_ResolveSeqName: invalid cache entry for " + name);
1248         id.Reset();
1249     }
1250     if ( !id ) {
1251         id.Reset(x_ResolveNewSeqName(name));
1252     }
1253     if ( !id ||  id->Which() == CSeq_id::e_not_set
1254         ||  static_cast<int>(id->Which()) >= CSeq_id::e_MaxChoice) {
1255         x_Warn("x_ResolveNewSeqName returned null or invalid ID for " + name);
1256         id.Reset(new CSeq_id(CSeq_id::e_Local, name, name));
1257     }
1258     return id;
1259 }
1260 
1261 
x_ResolveNewSeqName(const string & name)1262 CRef<CSeq_id> CGFFReader::x_ResolveNewSeqName(const string& name)
1263 {
1264     if (m_Flags & fAllIdsAsLocal) {
1265         if (NStr::StartsWith(name, "lcl|")) {
1266             return CRef<CSeq_id>(new CSeq_id(name));
1267         } else {
1268             return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
1269         }
1270     }
1271 
1272     if (m_Flags & fNumericIdsAsLocal) {
1273         if (name.find_first_not_of("0123456789") == string::npos) {
1274             return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
1275         }
1276     }
1277     try {
1278         CRef<CSeq_id> pId(new CSeq_id(name));
1279         if (!pId || (pId->IsGi() && pId->GetGi() < GI_CONST(500)) ) {
1280             pId = new CSeq_id(CSeq_id::e_Local, name);
1281         }
1282         return pId;
1283     }
1284     catch (CSeqIdException&) {
1285         return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
1286     }
1287 }
1288 
1289 
x_ResolveID(const CSeq_id & id,const TStr & mol)1290 CRef<CBioseq> CGFFReader::x_ResolveID(const CSeq_id& id, const TStr& mol)
1291 {
1292     CRef<CBioseq>& seq = m_SeqCache[CConstRef<CSeq_id>(&id)];
1293     if ( !seq ) {
1294         seq.Reset(x_ResolveNewID(id, mol));
1295         // Derived versions of x_ResolveNewID may legimately return null
1296         // results....
1297         if (seq) {
1298             x_PlaceSeq(*seq);
1299             ITERATE (CBioseq::TId, it, seq->GetId()) {
1300                 m_SeqCache.insert(make_pair(CConstRef<CSeq_id>(*it), seq));
1301             }
1302         }
1303     }
1304     return seq;
1305 }
1306 
1307 
x_ResolveNewID(const CSeq_id & id,const string & mol0)1308 CRef<CBioseq> CGFFReader::x_ResolveNewID(const CSeq_id& id, const string& mol0)
1309 {
1310     CRef<CBioseq> seq(new CBioseq);
1311     CRef<CSeq_id> id_copy(new CSeq_id);
1312 
1313     id_copy->Assign(id);
1314     seq->SetId().push_back(id_copy);
1315     seq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);
1316 
1317     const string& mol = mol0.empty() ? m_DefMol : mol0;
1318     if (mol.empty()  ||  mol == "dna") {
1319         seq->SetInst().SetMol(CSeq_inst::eMol_dna);
1320     } else if (mol == "rna")  {
1321         seq->SetInst().SetMol(CSeq_inst::eMol_rna);
1322     } else if (mol == "protein")  {
1323         seq->SetInst().SetMol(CSeq_inst::eMol_aa);
1324     } else {
1325         x_Warn("unrecognized sequence type " + mol + "; assuming DNA");
1326         seq->SetInst().SetMol(CSeq_inst::eMol_dna);
1327     }
1328 
1329     return seq;
1330 }
1331 
x_SetProducts(CRef<CSeq_entry> & tse)1332 void CGFFReader::x_SetProducts( CRef<CSeq_entry>& tse )
1333 {
1334     CTypeIterator<CSeq_feat> feat_iter(*tse);
1335     for ( ;  feat_iter;  ++feat_iter) {
1336         CSeq_feat& feat = *feat_iter;
1337 
1338         string qual_name;
1339         switch (feat.GetData().GetSubtype()) {
1340         case CSeqFeatData::eSubtype_cdregion:
1341             qual_name = "protein_id";
1342             break;
1343 
1344         case CSeqFeatData::eSubtype_mRNA:
1345             qual_name = "transcript_id";
1346             break;
1347 
1348         default:
1349             continue;
1350             break;
1351         }
1352 
1353         string id_str = feat.GetNamedQual(qual_name);
1354         if ( !id_str.empty() ) {
1355             CRef<CSeq_id> id = x_ResolveSeqName(id_str);
1356             feat.SetProduct().SetWhole(*id);
1357         }
1358     }
1359 }
1360 
x_CreateGeneFeatures(CRef<CSeq_entry> & tse)1361 void CGFFReader::x_CreateGeneFeatures( CRef<CSeq_entry>& tse )
1362 {
1363     CTypeIterator<CSeq_annot> annot_iter(*tse);
1364     for ( ;  annot_iter;  ++annot_iter) {
1365         CSeq_annot& annot = *annot_iter;
1366         if (annot.GetData().Which() != CSeq_annot::TData::e_Ftable) {
1367             continue;
1368         }
1369 
1370         // we work within the scope of one annotation
1371         CSeq_annot::TData::TFtable::iterator feat_iter =
1372             annot.SetData().SetFtable().begin();
1373         CSeq_annot::TData::TFtable::iterator feat_end =
1374             annot.SetData().SetFtable().end();
1375 
1376         /// we plan to create a series of gene features, one for each gene
1377         /// identified above
1378         /// genes are identified via a 'gene_id' marker
1379         typedef map<string, CRef<CSeq_feat> > TGeneMap;
1380         TGeneMap genes;
1381         for (bool has_genes = false;
1382              feat_iter != feat_end  &&  !has_genes;  ++feat_iter) {
1383             CSeq_feat& feat = **feat_iter;
1384 
1385             switch (feat.GetData().GetSubtype()) {
1386             case CSeqFeatData::eSubtype_gene:
1387                 /// we already have genes, so don't add any more
1388                 has_genes = true;
1389                 genes.clear();
1390                 break;
1391 
1392             case CSeqFeatData::eSubtype_mRNA:
1393             case CSeqFeatData::eSubtype_cdregion:
1394                 /// for mRNA and CDS features, create a gene
1395                 /// this is only done if the gene_id parameter was set
1396                 /// in parsing, we promote gene_id to a gene xref
1397                 if ( !feat.GetGeneXref() ) {
1398                     continue;
1399                 }
1400                 {{
1401                     string gene_id;
1402                     feat.GetGeneXref()->GetLabel(&gene_id);
1403                     _ASSERT( !gene_id.empty() );
1404                     TSeqRange range = feat.GetLocation().GetTotalRange();
1405 
1406                     ENa_strand strand = feat.GetLocation().GetStrand();
1407                     const CSeq_id* id = feat.GetLocation().GetId();
1408                     if ( !id ) {
1409                         x_Error("No consistent ID found; gene feature skipped");
1410                         continue;
1411                     }
1412 
1413                     TGeneMap::iterator iter = genes.find(gene_id);
1414                     if (iter == genes.end()) {
1415                         /// new gene feature
1416                         CRef<CSeq_feat> gene(new CSeq_feat());
1417                         gene->SetData().SetGene().Assign(*feat.GetGeneXref());
1418 
1419                         gene->SetLocation().SetInt().SetFrom(range.GetFrom());
1420                         gene->SetLocation().SetInt().SetTo  (range.GetTo());
1421                         gene->SetLocation().SetId(*id);
1422                         gene->SetLocation().SetInt().SetStrand(strand);
1423                         genes[gene_id] = gene;
1424                     } else {
1425                         /// we agglomerate the old location
1426                         CRef<CSeq_feat> gene = iter->second;
1427 
1428                         TSeqRange r2 = gene->GetLocation().GetTotalRange();
1429                         range += r2;
1430                         gene->SetLocation().SetInt().SetFrom(range.GetFrom());
1431                         gene->SetLocation().SetInt().SetTo  (range.GetTo());
1432                         gene->SetLocation().InvalidateTotalRangeCache();
1433                     }
1434                 }}
1435                 break;
1436 
1437             default:
1438                 break;
1439             }
1440         }
1441 
1442         ITERATE (TGeneMap, iter, genes) {
1443             annot.SetData().SetFtable().push_back(iter->second);
1444         }
1445     }
1446 }
1447 
x_RemapGeneRefs(CRef<CSeq_entry> & tse,TGeneRefs & gene_refs)1448 void CGFFReader::x_RemapGeneRefs( CRef<CSeq_entry>& tse, TGeneRefs& gene_refs )
1449 {
1450     if ( !tse  ||  gene_refs.empty() ) {
1451         return;
1452     }
1453     NON_CONST_ITERATE (TGeneRefs, iter, gene_refs) {
1454         if ( !iter->second->IsSetLocus()  &&
1455              !iter->second->IsSetLocus_tag()) {
1456             iter->second->SetLocus(iter->first);
1457         } else if ( !iter->second->IsSetLocus()  ||
1458                     iter->second->GetLocus() != iter->first) {
1459             iter->second->SetSyn().push_back(iter->first);
1460         }
1461     }
1462 
1463     CTypeIterator<CSeq_feat> feat_iter(*tse);
1464     for ( ;  feat_iter;  ++feat_iter) {
1465         const CGene_ref* ref = NULL;
1466         if (feat_iter->GetData().IsGene()) {
1467             ref = &feat_iter->GetData().GetGene();
1468         } else {
1469             ref = feat_iter->GetGeneXref();
1470         }
1471         if (ref  &&  ref->IsSetLocus()) {
1472             TGeneRefs::const_iterator iter =
1473                 gene_refs.find(ref->GetLocus());
1474             if (iter != gene_refs.end()) {
1475                 const_cast<CGene_ref*>(ref)->Assign(*iter->second);
1476             }
1477         }
1478     }
1479 }
1480 
x_PlaceSeq(CBioseq & seq)1481 void CGFFReader::x_PlaceSeq(CBioseq& seq)
1482 {
1483     bool found = false;
1484     for (CTypeConstIterator<CBioseq> it(*m_TSE);  it;  ++it) {
1485         if (&*it == &seq) {
1486             found = true;
1487             BREAK(it);
1488         }
1489     }
1490     if ( !found ) {
1491         CRef<CSeq_entry> se(new CSeq_entry);
1492         se->SetSeq(seq);
1493         m_TSE->SetSet().SetSeq_set().push_back(se);
1494     }
1495 }
1496 
1497 
1498 CGFFReader::SRecord::TAttrs::const_iterator
FindAttribute(const string & att_name,size_t min_values) const1499 CGFFReader::SRecord::FindAttribute(const string& att_name, size_t min_values)
1500 const
1501 {
1502     SRecord::TAttrs::const_iterator it
1503         = attrs.lower_bound(vector<string>(1, att_name));
1504     while (it != attrs.end()  &&  it->front() == att_name
1505            &&  it->size() <= min_values) {
1506         ++it;
1507     }
1508     return (it == attrs.end() || it->front() == att_name) ? it : attrs.end();
1509 }
1510 
1511 
1512 bool
x_IsLineUcscMetaInformation(const TStr & line)1513 CGFFReader::x_IsLineUcscMetaInformation(const TStr& line)
1514 {
1515     // line starts with keyword "browser" or "track"
1516     return (NStr::StartsWith(line, "browser") || NStr::StartsWith(line, "track") );
1517 }
1518 
1519 
1520 END_SCOPE(objects)
1521 END_NCBI_SCOPE
1522