1 /* $Id: gff_reader.cpp 632526 2021-06-02 17:25:01Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Aaron Ucko, Wratko Hlavina
27 *
28 * File Description:
29 * Reader for GFF (including GTF) files.
30 *
31 * ===========================================================================
32 */
33
34 #include <ncbi_pch.hpp>
35 #include <objtools/readers/gff_reader.hpp>
36
37 #include <corelib/ncbitime.hpp>
38 #include <corelib/ncbiutil.hpp>
39 #include <corelib/stream_utils.hpp>
40 #include <serial/iterator.hpp>
41
42 #include <objects/general/Date.hpp>
43 #include <objects/general/Object_id.hpp>
44 #include <objects/seq/Seq_annot.hpp>
45 #include <objects/seq/Seq_descr.hpp>
46 #include <objects/seq/Seq_inst.hpp>
47 #include <objects/seq/Seqdesc.hpp>
48 #include <objects/seqalign/Dense_seg.hpp>
49 #include <objects/seqalign/Score.hpp>
50 #include <objects/seqalign/Std_seg.hpp>
51 #include <objects/seqfeat/Feat_id.hpp>
52 #include <objects/seqfeat/Cdregion.hpp>
53 #include <objects/seqfeat/SeqFeatXref.hpp>
54 #include <objects/seqfeat/Gb_qual.hpp>
55 #include <objects/seqloc/Seq_interval.hpp>
56 #include <objects/seqloc/Seq_point.hpp>
57 #include <objects/seqset/Bioseq_set.hpp>
58
59 #include <objtools/readers/cigar.hpp>
60 #include <objtools/readers/fasta.hpp>
61 #include <objtools/readers/readfeat.hpp>
62 #include <objtools/error_codes.hpp>
63
64 #include <algorithm>
65 #include <ctype.h>
66
67
68 #define NCBI_USE_ERRCODE_X Objtools_Rd_GFF
69
70 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)71 BEGIN_SCOPE(objects)
72
73 static CRef<CFeat_id>
74 s_StringToFeatId( const string& str )
75 {
76 CRef<CObject_id> objid( new CObject_id );
77 objid->SetStr( str );
78 CRef<CFeat_id> featid( new CFeat_id );
79 featid->SetLocal( *objid );
80 return featid;
81 }
82
s_URLDecode(const CTempString & s,string & out)83 static string& s_URLDecode(const CTempString& s, string& out) {
84 SIZE_TYPE pos = 0;
85 out.erase();
86 out.reserve(s.size());
87 while (pos < s.size()) {
88 SIZE_TYPE pos2 = s.find_first_of("%" /* "+" */, pos);
89 out += s.substr(pos, pos2 - pos);
90 if (pos2 == NPOS) {
91 break;
92 } else if (s[pos2] == '+') { // disabled -- often used literally
93 out += ' ';
94 pos = pos2 + 1;
95 } else if (s[pos2] == '%') {
96 try {
97 out += (char)NStr::StringToInt(s.substr(pos2 + 1, 2), 0, 16);
98 pos = pos2 + 3;
99 } catch (CStringException&) {
100 // some sources neglect to encode % (!)
101 out += '%';
102 pos = pos2 + 1;
103 }
104 } else {
105 _TROUBLE;
106 }
107 }
108 return out;
109 }
110
111
Read(CNcbiIstream & in,TFlags flags)112 CRef<CSeq_entry> CGFFReader::Read(CNcbiIstream& in, TFlags flags)
113 {
114 CStreamLineReader lr(in);
115 return Read(lr, flags);
116 }
117
Read(ILineReader & in,TFlags flags)118 CRef<CSeq_entry> CGFFReader::Read(ILineReader& in, TFlags flags)
119 {
120 x_Reset();
121 m_Flags = flags;
122 m_LineReader = ∈
123
124 if (m_Flags & fSetVersion3) {
125 m_Version = 3;
126 }
127
128 TStr line;
129 while ( !in.AtEOF() ) {
130 ++m_LineNumber;
131 char c = in.PeekChar();
132 if (c == '#') {
133 line = *++in;
134 if (line.size() > 2 && line[1] == '#') {
135 x_ParseStructuredComment(line);
136 // ignore regular comments
137 }
138 } else if (c == '>') {
139 // implicit ##FASTA
140 x_ReadFastaSequences(in);
141 } else {
142 line = *++in;
143 if ( x_IsLineUcscMetaInformation(line) ) {
144 // UCSC browser or track line. For now, we ignore those.
145 continue;
146 }
147 if ( line.empty() ) {
148 // too commonly used for file formatting to even warn about
149 continue;
150 }
151 CRef<SRecord> record = x_ParseFeatureInterval(line);
152 if (record) {
153 if (record->id.empty()) {
154 x_ParseAndPlace(*record);
155 } else {
156 CRef<SRecord>& match = m_DelayedRecords[ record->id ];
157 // _TRACE(id << " -> " << match.GetPointer());
158 if (match) {
159 x_MergeRecords(*match, *record);
160 } else {
161 match.Reset(record);
162 }
163 }
164 }
165 }
166 }
167
168 NON_CONST_ITERATE (TDelayedRecords, it, m_DelayedRecords) {
169 SRecord& rec = *it->second;
170 /// merge mergeable ranges
171 NON_CONST_ITERATE (SRecord::TLoc, loc_iter, rec.loc) {
172 ITERATE (set<TSeqRange>, src_iter, loc_iter->merge_ranges) {
173 TSeqRange range(*src_iter);
174 set<TSeqRange>::iterator dst_iter =
175 loc_iter->ranges.begin();
176 for ( ; dst_iter != loc_iter->ranges.end(); ) {
177 TSeqRange r(range);
178 r += *dst_iter;
179 if (r.GetLength() <=
180 range.GetLength() + dst_iter->GetLength()) {
181 range += *dst_iter;
182 _TRACE("merging overlapping ranges: "
183 << range.GetFrom() << " - "
184 << range.GetTo() << " <-> "
185 << dst_iter->GetFrom() << " - "
186 << dst_iter->GetTo());
187 loc_iter->ranges.erase(dst_iter++);
188 break;
189 } else {
190 ++dst_iter;
191 }
192 }
193 loc_iter->ranges.insert(range);
194 }
195 }
196
197 if (rec.key == "exon") {
198 rec.key = "mRNA";
199 }
200 x_ParseAndPlace(rec);
201 }
202
203 ///
204 /// remap gene refs
205 /// we have built a set of gene-id -> gene-ref pairs
206 ///
207 if (m_TSE && m_GeneRefs.size()) {
208 NON_CONST_ITERATE (TGeneRefs, iter, m_GeneRefs) {
209 if ( !iter->second->IsSetLocus() &&
210 !iter->second->IsSetLocus_tag()) {
211 iter->second->SetLocus(iter->first);
212 } else if ( !iter->second->IsSetLocus() ||
213 iter->second->GetLocus() != iter->first) {
214 iter->second->SetSyn().push_back(iter->first);
215 }
216 }
217
218 CTypeIterator<CSeq_feat> feat_iter(*m_TSE);
219 for ( ; feat_iter; ++feat_iter) {
220 const CGene_ref* ref = NULL;
221 if (feat_iter->GetData().IsGene()) {
222 ref = &feat_iter->GetData().GetGene();
223 } else {
224 ref = feat_iter->GetGeneXref();
225 }
226 if (ref && ref->IsSetLocus()) {
227 TGeneRefs::const_iterator iter =
228 m_GeneRefs.find(ref->GetLocus());
229 if (iter != m_GeneRefs.end()) {
230 const_cast<CGene_ref*>(ref)->Assign(*iter->second);
231 }
232 }
233 }
234 }
235
236 CRef<CSeq_entry> tse(m_TSE); // need to save before resetting.
237 x_Reset();
238
239 // promote transcript_id and protein_id to products
240 if (flags & fSetProducts) {
241 CTypeIterator<CSeq_feat> feat_iter(*tse);
242 for ( ; feat_iter; ++feat_iter) {
243 CSeq_feat& feat = *feat_iter;
244
245 string qual_name;
246 switch (feat.GetData().GetSubtype()) {
247 case CSeqFeatData::eSubtype_cdregion:
248 qual_name = "protein_id";
249 break;
250
251 case CSeqFeatData::eSubtype_mRNA:
252 qual_name = "transcript_id";
253 break;
254
255 default:
256 continue;
257 break;
258 }
259
260 string id_str = feat.GetNamedQual(qual_name);
261 if ( !id_str.empty() ) {
262 CRef<CSeq_id> id = x_ResolveSeqName(id_str);
263 feat.SetProduct().SetWhole(*id);
264 }
265 }
266 }
267
268 if (flags & fCreateGeneFeats) {
269 CTypeIterator<CSeq_annot> annot_iter(*tse);
270 for ( ; annot_iter; ++annot_iter) {
271 CSeq_annot& annot = *annot_iter;
272 if (annot.GetData().Which() != CSeq_annot::TData::e_Ftable) {
273 continue;
274 }
275
276 // we work within the scope of one annotation
277 CSeq_annot::TData::TFtable::iterator feat_iter =
278 annot.SetData().SetFtable().begin();
279 CSeq_annot::TData::TFtable::iterator feat_end =
280 annot.SetData().SetFtable().end();
281
282 /// we plan to create a series of gene features, one for each gene
283 /// identified above
284 /// genes are identified via a 'gene_id' marker
285 typedef map<string, CRef<CSeq_feat> > TGeneMap;
286 TGeneMap genes;
287 for (bool has_genes = false;
288 feat_iter != feat_end && !has_genes; ++feat_iter) {
289 CSeq_feat& feat = **feat_iter;
290
291 switch (feat.GetData().GetSubtype()) {
292 case CSeqFeatData::eSubtype_gene:
293 /// we already have genes, so don't add any more
294 has_genes = true;
295 genes.clear();
296 break;
297
298 case CSeqFeatData::eSubtype_mRNA:
299 case CSeqFeatData::eSubtype_cdregion:
300 /// for mRNA and CDS features, create a gene
301 /// this is only done if the gene_id parameter was set
302 /// in parsing, we promote gene_id to a gene xref
303 if ( !feat.GetGeneXref() ) {
304 continue;
305 }
306 {{
307 string gene_id;
308 feat.GetGeneXref()->GetLabel(&gene_id);
309 _ASSERT( !gene_id.empty() );
310 TSeqRange range = feat.GetLocation().GetTotalRange();
311
312 ENa_strand strand = feat.GetLocation().GetStrand();
313 const CSeq_id* id = feat.GetLocation().GetId();
314 if ( !id ) {
315 x_Error("No consistent ID found; gene feature skipped");
316 continue;
317 }
318
319 TGeneMap::iterator iter = genes.find(gene_id);
320 if (iter == genes.end()) {
321 /// new gene feature
322 CRef<CSeq_feat> gene(new CSeq_feat());
323 gene->SetData().SetGene().Assign(*feat.GetGeneXref());
324
325 gene->SetLocation().SetInt().SetFrom(range.GetFrom());
326 gene->SetLocation().SetInt().SetTo (range.GetTo());
327 gene->SetLocation().SetId(*id);
328 gene->SetLocation().SetInt().SetStrand(strand);
329 genes[gene_id] = gene;
330 } else {
331 /// we agglomerate the old location
332 CRef<CSeq_feat> gene = iter->second;
333
334 TSeqRange r2 = gene->GetLocation().GetTotalRange();
335 range += r2;
336 gene->SetLocation().SetInt().SetFrom(range.GetFrom());
337 gene->SetLocation().SetInt().SetTo (range.GetTo());
338 gene->SetLocation().InvalidateTotalRangeCache();
339 }
340 }}
341 break;
342
343 default:
344 break;
345 }
346 }
347
348 ITERATE (TGeneMap, iter, genes) {
349 annot.SetData().SetFtable().push_back(iter->second);
350 }
351 }
352 }
353
354 return tse;
355 }
356
357
x_Warn(const string & message,unsigned int line)358 void CGFFReader::x_Warn(const string& message, unsigned int line)
359 {
360 if (line) {
361 ERR_POST_X(2, Warning << message << " [GFF input, line " << line << ']');
362 } else {
363 ERR_POST_X(3, Warning << message << " [GFF input]");
364 }
365 }
366
367
x_Error(const string & message,unsigned int line)368 void CGFFReader::x_Error(const string& message, unsigned int line)
369 {
370 if (line) {
371 ERR_POST_X(1, Error << message << " [GFF input, line " << line << ']');
372 } else {
373 ERR_POST_X(1, Error << message << " [GFF input]");
374 }
375 }
376
377
x_Info(const string & message,unsigned int line)378 void CGFFReader::x_Info(const string& message, unsigned int line)
379 {
380 if (line) {
381 ERR_POST_X(1, Info << message << " [GFF input, line " << line << ']');
382 } else {
383 ERR_POST_X(1, Info << message << " [GFF input]");
384 }
385 }
386
387
x_Reset(void)388 void CGFFReader::x_Reset(void)
389 {
390 m_TSE.Reset(new CSeq_entry);
391 m_SeqNameCache.clear();
392 m_SeqCache.clear();
393 m_DelayedRecords.clear();
394 m_GeneRefs.clear();
395 m_DefMol.erase();
396 m_LineNumber = 0;
397 m_Version = 2;
398 }
399
400
x_ParseStructuredComment(const TStr & line)401 bool CGFFReader::x_ParseStructuredComment(const TStr& line)
402 {
403 if ( line.empty() || line[0] != '#' || line[1] != '#' ) {
404 return false;
405 }
406 TStrVec v;
407 NStr::Split(line, "# \t", v, NStr::fSplit_Tokenize);
408 if (v.empty()) {
409 return true;
410 }
411 if (v[0] == "date" && v.size() > 1) {
412 x_ParseDateComment(v[1]);
413 } else if (v[0] == "Type" && v.size() > 1) {
414 x_ParseTypeComment(v[1], v.size() > 2 ? v[2] : TStr());
415 } else if (v[0] == "gff-version" && v.size() > 1) {
416 m_Version = NStr::StringToInt(v[1]);
417 } else if (v[0] == "FASTA") {
418 x_ReadFastaSequences(*m_LineReader);
419 }
420 // etc.
421 return true;
422 }
423
424
x_ParseDateComment(const TStr & date)425 void CGFFReader::x_ParseDateComment(const TStr& date)
426 {
427 try {
428 CRef<CSeqdesc> desc(new CSeqdesc);
429 desc->SetUpdate_date().SetToTime(CTime(date, "Y-M-D"),
430 CDate::ePrecision_day);
431 m_TSE->SetSet().SetDescr().Set().push_back(desc);
432 } catch (exception& e) {
433 x_Error(string("Bad ISO date: ") + e.what(), x_GetLineNumber());
434 }
435 }
436
437
x_ParseTypeComment(const TStr & moltype,const TStr & seqname)438 void CGFFReader::x_ParseTypeComment(const TStr& moltype, const TStr& seqname)
439 {
440 if (seqname.empty()) {
441 m_DefMol = moltype;
442 } else {
443 // automatically adds to m_TSE if new
444 x_ResolveID(*x_ResolveSeqName(seqname), moltype);
445 }
446 }
447
448
x_ReadFastaSequences(ILineReader & in)449 void CGFFReader::x_ReadFastaSequences(ILineReader& in)
450 {
451 CFastaReader reader(in, CFastaReader::fAssumeNuc);
452 CRef<CSeq_entry> seqs = reader.ReadSet();
453 for (CTypeIterator<CBioseq> it(*seqs); it; ++it) {
454 if (it->GetId().empty()) { // can this happen?
455 CRef<CSeq_entry> parent(new CSeq_entry);
456 parent->SetSeq(*it);
457 m_TSE->SetSet().SetSeq_set().push_back(parent);
458 continue;
459 }
460 CRef<CBioseq> our_bs = x_ResolveID(*it->GetId().front(), kEmptyStr);
461 // keep our annotations, but replace everything else.
462 // (XXX - should also keep mol)
463 our_bs->SetId() = it->GetId();
464 if (it->IsSetDescr()) {
465 our_bs->SetDescr(it->SetDescr());
466 }
467 our_bs->SetInst(it->SetInst());
468 }
469 }
470
471
472 CRef<CGFFReader::SRecord>
x_ParseFeatureInterval(const TStr & line)473 CGFFReader::x_ParseFeatureInterval(const TStr& line)
474 {
475 TStrVec v;
476 bool misdelimited = false;
477
478 NStr::Split(line, "\t", v);
479 if (v.size() < 8) {
480 v.clear();
481 NStr::Split(line, " \t", v, NStr::fSplit_Tokenize);
482 if (v.size() < 8) {
483 x_Error("Skipping line due to insufficient fields",
484 x_GetLineNumber());
485 return null;
486 } else if (m_Version < 3) {
487 x_Info("(Recovered) Bad delimiters (should use tabs)", x_GetLineNumber());
488 misdelimited = true;
489 }
490 } else {
491 // XXX - warn about extra fields (if any), but only if they're
492 // not comments
493 // v.resize(9);
494 }
495
496 CRef<SRecord> record(x_NewRecord());
497 string accession;
498 TSeqPos from = 0, to = numeric_limits<TSeqPos>::max();
499 ENa_strand strand = eNa_strand_unknown;
500 s_URLDecode(v[0], accession);
501 record->source = v[1];
502 record->key = v[2];
503
504 try {
505 from = NStr::StringToUInt(v[3]) - 1;
506 } catch (std::exception& e) {
507 x_Error(string("Bad FROM position: ") + e.what(), x_GetLineNumber());
508 }
509
510 try {
511 to = NStr::StringToUInt(v[4]) - 1;
512 } catch (std::exception& e) {
513 x_Error(string("Bad TO position: ") + e.what(), x_GetLineNumber());
514 }
515
516 record->score = v[5];
517
518 if (v[6] == "+") {
519 strand = eNa_strand_plus;
520 } else if (v[6] == "-") {
521 strand = eNa_strand_minus;
522 } else if ( !(v[6] == ".") ) {
523 x_Warn("Bad strand " + string(v[6]) + " (should be [+-.])",
524 x_GetLineNumber());
525 }
526
527 if (v[7] == "0" || v[7] == "1" || v[7] == "2") {
528 record->frame = v[7][0] - '0';
529 } else if (v[7] == ".") {
530 record->frame = -1;
531 } else {
532 x_Warn("Bad frame " + string(v[7]) + " (should be [012.])",
533 x_GetLineNumber());
534 record->frame = -1;
535 }
536
537 {{
538 SRecord::SSubLoc subloc;
539 subloc.accession = accession;
540 subloc.strand = strand;
541 subloc.ranges.insert(TSeqRange(from, to));
542
543 record->loc.push_back(subloc);
544 }}
545
546 SIZE_TYPE i = 8;
547 if (m_Version >= 3) {
548 x_ParseV3Attributes(*record, v, i);
549 } else {
550 x_ParseV2Attributes(*record, v, i);
551 }
552
553 if ( !misdelimited && (i > 9 || (i == 9 && v.size() > 9
554 && !NStr::StartsWith(v[9], "#") ))) {
555 x_Warn("Extra non-comment fields", x_GetLineNumber());
556 }
557
558 if (record->FindAttribute("Target") != record->attrs.end()) {
559 record->type = SRecord::eAlign;
560 } else {
561 record->type = SRecord::eFeat;
562 }
563
564 // extracting additional gff3 attributes
565 if (m_Version == 3) {
566 SRecord::TAttrs::const_iterator id_it = record->FindAttribute("ID");
567 if (id_it != record->attrs.end()) {
568 record->id = (*id_it)[1];
569 }
570
571 SRecord::TAttrs::const_iterator parent_it = record->FindAttribute("Parent");
572 if (parent_it != record->attrs.end()) {
573 record->parent = (*parent_it)[1];
574 }
575
576 SRecord::TAttrs::const_iterator name_it = record->FindAttribute("Name");
577 if (name_it != record->attrs.end()) {
578 record->name = (*name_it)[1];
579 }
580 }
581
582 record->line_no = m_LineNumber;
583 record->id = x_FeatureID(*record);
584 return record;
585 }
586
587
x_ParseFeatRecord(const SRecord & record)588 CRef<CSeq_feat> CGFFReader::x_ParseFeatRecord(const SRecord& record)
589 {
590 CRef<CSeq_feat> feat(CFeature_table_reader::CreateSeqFeat
591 (record.key, *x_ResolveLoc(record.loc),
592 CFeature_table_reader::fTranslateBadKey));
593 if (record.frame >= 0 && feat->GetData().IsCdregion()) {
594 feat->SetData().SetCdregion().SetFrame
595 (static_cast<CCdregion::EFrame>(record.frame + 1));
596 }
597 if ( m_Version == 3 ) {
598 ITERATE (SRecord::TAttrs, it, record.attrs) {
599 string tag = it->front();
600 if (tag == "ID") {
601 feat->SetId( *s_StringToFeatId( (*it)[1] ) );
602 }
603 if (tag == "Parent") {
604 CRef<CSeqFeatXref> xref( new CSeqFeatXref );
605 xref->SetId( *s_StringToFeatId( (*it)[1] ) );
606 feat->SetXref().push_back( xref );
607 }
608 }
609 }
610
611 if ( record.source != "." ) {
612 CRef<CGb_qual> source( new CGb_qual );
613 source->SetQual( "source" );
614 source->SetVal( record.source );
615 feat->SetQual().push_back( source );
616 }
617
618 string gene_id;
619 string gene;
620 string locus_tag;
621 ITERATE (SRecord::TAttrs, it, record.attrs) {
622 string tag = it->front();
623 string value;
624 switch (it->size()) {
625 case 1:
626 break;
627 case 2:
628 value = (*it)[1];
629 break;
630 default:
631 x_Warn("Ignoring extra fields in value of " + tag, record.line_no);
632 value = (*it)[1];
633 break;
634 }
635 if (x_GetFlags() & fGBQuals) {
636 if (tag == "transcript_id") {
637 //continue;
638 } else if (tag == "gene_id") {
639 gene_id = value;
640 continue;
641 } else if (tag == "gene") {
642 gene = value;
643 continue;
644 } else if (tag == "locus_tag") {
645 locus_tag = value;
646 continue;
647 } else if (tag == "exon_number") {
648 tag = "number";
649 } else if (NStr::StartsWith(tag, "insd_")) {
650 tag.erase(0, 5);
651 }
652
653 CFeature_table_reader::AddFeatQual
654 (feat, kEmptyStr, tag, value, CFeature_table_reader::fKeepBadKey);
655 } else { // don't attempt to parse, just treat as imported
656 CRef<CGb_qual> qual(new CGb_qual);
657 qual->SetQual(tag);
658 qual->SetVal(value);
659 feat->SetQual().push_back(qual);
660 }
661 }
662
663 if ( !gene_id.empty() ) {
664 SIZE_TYPE colon = gene_id.find(':');
665 if (colon != NPOS) {
666 gene_id.erase(0, colon + 1);
667 }
668
669 TGeneRefs::value_type val(gene_id, CRef<CGene_ref>());
670 TGeneRefs::iterator iter = m_GeneRefs.insert(val).first;
671 if ( !iter->second ) {
672 iter->second.Reset(new CGene_ref);
673 }
674 if ( !gene.empty() ) {
675 if (iter->second->IsSetLocus() &&
676 iter->second->GetLocus() != gene) {
677 ERR_POST_X(4, Warning << "CGFFReader::x_ParseFeatRecord(): "
678 << "inconsistent gene name: "
679 << gene << " != " << iter->second->GetLocus()
680 << ", ignoring second");
681 } else if ( !iter->second->IsSetLocus() ) {
682 iter->second->SetLocus(gene);
683 }
684 }
685 if ( !locus_tag.empty() ) {
686 if (iter->second->IsSetLocus_tag() &&
687 iter->second->GetLocus_tag() != locus_tag) {
688 ERR_POST_X(5, Warning << "CGFFReader::x_ParseFeatRecord(): "
689 << "inconsistent locus tag: "
690 << locus_tag << " != " << iter->second->GetLocus_tag()
691 << ", ignoring second");
692 } else if ( !iter->second->IsSetLocus_tag() ) {
693 iter->second->SetLocus_tag(locus_tag);
694 }
695 }
696
697 // translate
698 CFeature_table_reader::AddFeatQual
699 (feat, kEmptyStr, "gene_id", gene_id,
700 CFeature_table_reader::fKeepBadKey);
701 if (x_GetFlags() & fGBQuals) {
702 CFeature_table_reader::AddFeatQual
703 (feat, kEmptyStr, "gene", gene_id,
704 CFeature_table_reader::fKeepBadKey);
705 }
706 }
707
708 return feat;
709 }
710
711
x_ParseAlignRecord(const SRecord & record)712 CRef<CSeq_align> CGFFReader::x_ParseAlignRecord(const SRecord& record)
713 {
714 CRef<CSeq_align> align(new CSeq_align);
715 align->SetType(CSeq_align::eType_partial);
716 align->SetDim(2);
717 SRecord::TAttrs::const_iterator tgit = record.FindAttribute("Target");
718 vector<string> target;
719 if (tgit != record.attrs.end()) {
720 NStr::Split((*tgit)[1], " +-", target, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
721 }
722 if (target.size() != 3) {
723 x_Warn("Bad Target attribute", record.line_no);
724 return align;
725 }
726 CRef<CSeq_id> tgid = x_ResolveSeqName(target[0]);
727 TSeqPos tgstart = NStr::StringToUInt(target[1]) - 1;
728 TSeqPos tgstop = NStr::StringToUInt(target[2]) - 1;
729 TSeqPos tglen = tgstop - tgstart + 1;
730
731 CRef<CSeq_loc> refloc = x_ResolveLoc(record.loc);
732 CRef<CSeq_id> refid(&refloc->SetInt().SetId());
733 TSeqPos reflen = 0;
734 for (CSeq_loc_CI it(*refloc); it; ++it) {
735 reflen += it.GetRange().GetLength();
736 }
737
738 CRef<CSeq_loc> tgloc(new CSeq_loc);
739 tgloc->SetInt().SetId(*tgid);
740 tgloc->SetInt().SetFrom(tgstart);
741 tgloc->SetInt().SetTo(tgstop);
742
743 SRecord::TAttrs::const_iterator gap_it = record.FindAttribute("Gap");
744 if (gap_it == record.attrs.end()) {
745 // single ungapped alignment
746 if (reflen == tglen && refloc->IsInt()) {
747 CDense_seg& ds = align->SetSegs().SetDenseg();
748 ds.SetNumseg(1);
749 ds.SetIds().push_back(refid);
750 ds.SetIds().push_back(tgid);
751 ds.SetStarts().push_back(refloc->GetInt().GetFrom());
752 ds.SetStarts().push_back(tgstart);
753 ds.SetLens().push_back(reflen);
754 if (refloc->GetInt().IsSetStrand()) {
755 ds.SetStrands().push_back(refloc->GetInt().GetStrand());
756 ds.SetStrands().push_back(eNa_strand_plus);
757 }
758 } else {
759 if (reflen != tglen && reflen != 3 * tglen) {
760 x_Warn("Reference and target locations have an irregular"
761 " ratio.", record.line_no);
762 }
763 CRef<CStd_seg> ss(new CStd_seg);
764 ss->SetLoc().push_back(refloc);
765 ss->SetLoc().push_back(tgloc);
766 align->SetSegs().SetStd().push_back(ss);
767 }
768 } else {
769 SCigarAlignment cigar
770 ((*gap_it)[1], SCigarAlignment::eOpFirstIfAmbiguous);
771 align = cigar(refloc->GetInt(), tgloc->GetInt());
772 }
773
774 try {
775 CRef<CScore> score(new CScore);
776 score->SetValue().SetReal(NStr::StringToDouble(record.score));
777 align->SetScore().push_back(score);
778 } catch (...) {
779 }
780
781 return align;
782 }
783
784
x_ResolveLoc(const SRecord::TLoc & loc)785 CRef<CSeq_loc> CGFFReader::x_ResolveLoc(const SRecord::TLoc& loc)
786 {
787 CRef<CSeq_loc> seqloc(new CSeq_loc);
788 ITERATE (SRecord::TLoc, it, loc) {
789 CRef<CSeq_id> id = x_ResolveSeqName(it->accession);
790 ITERATE (set<TSeqRange>, range, it->ranges) {
791 CRef<CSeq_loc> segment(new CSeq_loc);
792 if (range->GetLength() == 1) {
793 CSeq_point& pnt = segment->SetPnt();
794 pnt.SetId (*id);
795 pnt.SetPoint(range->GetFrom());
796 if (it->strand != eNa_strand_unknown) {
797 pnt.SetStrand(it->strand);
798 }
799 } else {
800 CSeq_interval& si = segment->SetInt();
801 si.SetId (*id);
802 si.SetFrom(range->GetFrom());
803 si.SetTo (range->GetTo());
804 if (it->strand != eNa_strand_unknown) {
805 si.SetStrand(it->strand);
806 }
807 }
808 if (IsReverse(it->strand)) {
809 seqloc->SetMix().Set().push_front(segment);
810 } else {
811 seqloc->SetMix().Set().push_back(segment);
812 }
813 }
814 }
815
816 if (seqloc->GetMix().Get().size() == 1) {
817 return seqloc->SetMix().Set().front();
818 } else {
819 return seqloc;
820 }
821 }
822
823
x_ParseV2Attributes(SRecord & record,const TStrVec & v,SIZE_TYPE & i)824 void CGFFReader::x_ParseV2Attributes(SRecord& record, const TStrVec& v,
825 SIZE_TYPE& i)
826 {
827 string attr_last_value;
828 vector<string> attr_values;
829 char quote_char = 0;
830
831 for (; i < v.size(); ++i) {
832 string s = string(v[i]) + ' ';
833 SIZE_TYPE pos = 0;
834 while (pos < s.size()) {
835 SIZE_TYPE pos2;
836 if (quote_char) { // must be inside a value
837 pos2 = s.find_first_of(" \'\"\\", pos);
838 _ASSERT(pos2 != NPOS); // due to trailing space
839 if (s[pos2] == quote_char) {
840 if (attr_values.empty()) {
841 x_Warn("quoted attribute tag " + attr_last_value,
842 x_GetLineNumber());
843 }
844 quote_char = 0;
845 attr_last_value += s.substr(pos, pos2 - pos);
846 try {
847 attr_values.push_back(NStr::ParseEscapes
848 (attr_last_value));
849 } catch (CStringException& e) {
850 attr_values.push_back(attr_last_value);
851 x_Warn(e.what() + (" in value of " + attr_values[0]),
852 x_GetLineNumber());
853 }
854 attr_last_value.erase();
855 } else if (s[pos2] == '\\') {
856 _VERIFY(++pos2 != s.size());
857 attr_last_value += s.substr(pos, pos2 + 1 - pos);
858 } else {
859 attr_last_value += s.substr(pos, pos2 + 1 - pos);
860 }
861 } else {
862 pos2 = s.find_first_of(" #;\"", pos); // also look for \'?
863 _ASSERT(pos2 != NPOS); // due to trailing space
864 if (pos != pos2) {
865 // grab and place the preceding token
866 attr_last_value += s.substr(pos, pos2 - pos);
867 attr_values.push_back(attr_last_value);
868 attr_last_value.erase();
869 }
870
871 switch (s[pos2]) {
872 case ' ':
873 if (pos2 == s.size() - 1) {
874 x_AddAttribute(record, attr_values);
875 attr_values.clear();
876 }
877 break;
878
879 case '#':
880 return;
881
882 case ';':
883 if (attr_values.empty()) {
884 x_Warn("null attribute", x_GetLineNumber());
885 } else {
886 x_AddAttribute(record, attr_values);
887 attr_values.clear();
888 }
889 break;
890
891 // NB: we don't currently search for single quotes.
892 case '\"':
893 case '\'':
894 quote_char = s[pos2];
895 break;
896
897 default:
898 _TROUBLE;
899 }
900 }
901 pos = pos2 + 1;
902 }
903 }
904
905 if ( !attr_values.empty() ) {
906 x_Warn("unterminated attribute " + attr_values[0], x_GetLineNumber());
907 x_AddAttribute(record, attr_values);
908 }
909 }
910
x_SplitKeyValuePair(const string & pair,string & key,string & value)911 bool CGFFReader::x_SplitKeyValuePair( const string& pair, string& key, string& value )
912 {
913 if ( NStr::SplitInTwo( pair, "=", key, value ) ) {
914 return true;
915 }
916 if ( NStr::SplitInTwo( pair, " ", key, value ) ) {
917 x_Info("(recovered) missdelimited attribute/value pair: " + key, x_GetLineNumber());
918 return true;
919 }
920 x_Warn("attribute without value: " + key, x_GetLineNumber());
921 return false;
922 }
923
924
x_ParseV3Attributes(SRecord & record,const TStrVec & v,SIZE_TYPE & i)925 void CGFFReader::x_ParseV3Attributes(SRecord& record, const TStrVec& v,
926 SIZE_TYPE& i)
927 {
928 vector<string> v2, attr;
929 NStr::Split(v[i], ";", v2, NStr::fSplit_Tokenize);
930 ITERATE (vector<string>, it, v2) {
931 attr.clear();
932 string key, values;
933 if (x_SplitKeyValuePair( *it, key, values )) {
934 vector<string> vals;
935 attr.resize(2);
936 s_URLDecode(key, attr[0]);
937 NStr::Split(values, ",", vals);
938 ITERATE (vector<string>, it2, vals) {
939 string value( *it2 );
940 if ( NStr::MatchesMask(value, "\"*\"") ) {
941 //
942 // Note: The GFF3 spec is ambiguous on whether quoting is
943 // required for free text values.
944 //
945 value = value.substr(1, value.length()-2);
946 }
947 s_URLDecode(value, attr[1]);
948 x_AddAttribute(record, attr);
949 }
950 } else {
951 x_Warn("attribute without value: " + key, x_GetLineNumber());
952 attr.resize(1);
953 s_URLDecode(*it, attr[0]);
954 x_AddAttribute(record, attr);
955 continue;
956 }
957 }
958 }
959
960
x_AddAttribute(SRecord & record,vector<string> & attr)961 void CGFFReader::x_AddAttribute(SRecord& record, vector<string>& attr)
962 {
963 if (attr.size() == 0) {
964 return;
965 }
966
967 if (x_GetFlags() & fGBQuals) {
968 if (attr[0] == "gbkey" && attr.size() == 2) {
969 record.key = attr[1];
970 return;
971 }
972 }
973 record.attrs.insert(attr);
974 }
975
976
x_FeatureID(const SRecord & record)977 string CGFFReader::x_FeatureID(const SRecord& record)
978 {
979 if (record.type != SRecord::eFeat || x_GetFlags() & fNoGTF) {
980 return kEmptyStr;
981 }
982
983 // has been retrieved in initial interval parsing
984 if (m_Version == 3) {
985 if (!record.id.empty()) {
986 return record.id;
987 }
988 else if (!record.parent.empty()) {
989 return record.source + record.key + record.parent;
990 }
991 else {
992 return "";
993 }
994 }
995
996 SRecord::TAttrs::const_iterator gene_it = record.FindAttribute("gene_id");
997 SRecord::TAttrs::const_iterator transcript_it
998 = record.FindAttribute("transcript_id");
999
1000 // concatenate our IDs from above, if found
1001 string id;
1002 if (gene_it != record.attrs.end()) {
1003 id += (*gene_it)[1];
1004 }
1005
1006 if (transcript_it != record.attrs.end()) {
1007 if ( !id.empty() ) {
1008 id += ' ';
1009 }
1010 id += (*transcript_it)[1];
1011 }
1012
1013 // look for db xrefs
1014 SRecord::TAttrs::const_iterator dbxref_it
1015 = record.FindAttribute("db_xref");
1016 for ( ; dbxref_it != record.attrs.end() &&
1017 dbxref_it->front() == "db_xref"; ++dbxref_it) {
1018 if ( !id.empty() ) {
1019 id += ' ';
1020 }
1021 id += (*dbxref_it)[1];
1022 }
1023
1024 if ( id.empty() ) {
1025 return id;
1026 }
1027
1028 if (record.key == "start_codon" || record.key == "stop_codon") {
1029 //id += " " + record.key;
1030 id += "CDS";
1031 } else if (record.key == "CDS"
1032 || NStr::FindNoCase(record.key, "rna") != NPOS) {
1033 //id += " " + record.key;
1034 id += record.key;
1035 } else if (record.key == "exon") {
1036 // normally separate intervals, but may want to merge.
1037 if (x_GetFlags() & fMergeExons) {
1038 id += record.key;
1039 } else {
1040 SRecord::TAttrs::const_iterator it
1041 = record.FindAttribute("exon_number");
1042 if (it == record.attrs.end()) {
1043 return kEmptyStr;
1044 } else {
1045 id += record.key + ' ' + (*it)[1];
1046 }
1047 }
1048 } else if (x_GetFlags() & fMergeOnyCdsMrna) {
1049 return kEmptyStr;
1050 }
1051 return id;
1052 }
1053
1054
x_MergeRecords(SRecord & dest,const SRecord & src)1055 void CGFFReader::x_MergeRecords(SRecord& dest, const SRecord& src)
1056 {
1057 // XXX - perform sanity checks and warn on mismatch
1058
1059 bool merge_overlaps = false;
1060 if (dest.key == "CDS" &&
1061 (src.key == "start_codon" || src.key == "stop_codon")) {
1062 // start_codon and stop_codon features should be merged into
1063 // existing CDS locations
1064 merge_overlaps = true;
1065 }
1066
1067 if ((dest.key == "start_codon" || dest.key == "stop_codon") &&
1068 src.key == "CDS") {
1069 // start_codon and stop_codon features should be merged into
1070 // existing CDS locations
1071 merge_overlaps = true;
1072 dest.key = "CDS";
1073 }
1074
1075 // adjust the frame as needed
1076 int best_frame = dest.frame;
1077
1078 ITERATE (SRecord::TLoc, slit, src.loc) {
1079 bool merged = false;
1080 NON_CONST_ITERATE (SRecord::TLoc, dlit, dest.loc) {
1081 if (slit->accession != dlit->accession) {
1082 if (dest.loc.size() == 1) {
1083 x_Warn("Multi-accession feature", src.line_no);
1084 }
1085 continue;
1086 } else if (slit->strand != dlit->strand) {
1087 if (dest.loc.size() == 1) {
1088 x_Warn("Multi-orientation feature", src.line_no);
1089 }
1090 continue;
1091 } else {
1092 if (slit->strand == eNa_strand_plus) {
1093 if (slit->ranges.begin()->GetFrom() <
1094 dlit->ranges.begin()->GetFrom()) {
1095 best_frame = src.frame;
1096 }
1097 } else {
1098 if (slit->ranges.begin()->GetTo() >
1099 dlit->ranges.begin()->GetTo()) {
1100 best_frame = src.frame;
1101 }
1102 }
1103 if (merge_overlaps) {
1104 ITERATE (set<TSeqRange>, set_iter, slit->ranges) {
1105 dlit->merge_ranges.insert(*set_iter);
1106 }
1107 } else {
1108 ITERATE (set<TSeqRange>, set_iter, slit->ranges) {
1109 dlit->ranges.insert(*set_iter);
1110 }
1111 }
1112 merged = true;
1113 break;
1114 }
1115 }
1116 if ( !merged ) {
1117 dest.loc.push_back(*slit);
1118 }
1119 }
1120
1121 dest.frame = best_frame;
1122 if (src.key != dest.key) {
1123 if (dest.key == "CDS" && NStr::EndsWith(src.key, "_codon")
1124 && !(x_GetFlags() & fNoGTF) ) {
1125 // ok
1126 } else if (src.key == "CDS" && NStr::EndsWith(dest.key, "_codon")
1127 && !(x_GetFlags() & fNoGTF) ) {
1128 dest.key = "CDS";
1129 } else {
1130 x_Warn("Merging features with different keys: " + dest.key
1131 + " != " + src.key, src.line_no);
1132 }
1133 }
1134
1135 x_MergeAttributes(dest, src);
1136 }
1137
1138
x_MergeAttributes(SRecord & dest,const SRecord & src)1139 void CGFFReader::x_MergeAttributes(SRecord& dest, const SRecord& src)
1140 {
1141 SRecord::TAttrs::iterator dait = dest.attrs.begin();
1142 SRecord::TAttrs::iterator dait_end = dest.attrs.end();
1143 SRecord::TAttrs::iterator dait_tag = dait_end;
1144 ITERATE (SRecord::TAttrs, sait, src.attrs) {
1145 const string& tag = sait->front();
1146 while (dait != dait_end && dait->front() < tag) {
1147 ++dait;
1148 }
1149
1150 if (dait_tag == dait_end || dait_tag->front() != tag) {
1151 dait_tag = dait;
1152 }
1153 if (dait != dait_end && dait->front() == tag) {
1154 while (dait != dait_end && *dait < *sait) {
1155 ++dait;
1156 }
1157 }
1158 if (dait != dait_end && *dait == *sait) {
1159 continue; // identical
1160 } else if ( !(x_GetFlags() & fNoGTF) && tag == "exon_number") {
1161 if (dait_tag != dait_end) {
1162 while (dait != dait_end && dait->front() == tag) {
1163 ++dait;
1164 }
1165 dest.attrs.erase(dait_tag, dait);
1166 dait_tag = dait_end;
1167 }
1168 } else {
1169 dest.attrs.insert(dait, *sait);
1170 }
1171 }
1172 }
1173
1174
x_PlaceFeature(CSeq_feat & feat,const SRecord &)1175 void CGFFReader::x_PlaceFeature(CSeq_feat& feat, const SRecord&)
1176 {
1177 CRef<CBioseq> seq;
1178 if ( !feat.IsSetProduct() ) {
1179 for (CTypeConstIterator<CSeq_id> it(feat.GetLocation()); it; ++it) {
1180 CRef<CBioseq> seq2 = x_ResolveID(*it, kEmptyStr);
1181 if ( !seq ) {
1182 seq.Reset(seq2);
1183 } else if ( seq2.NotEmpty() && seq != seq2) {
1184 seq.Reset();
1185 BREAK(it);
1186 }
1187 }
1188 }
1189
1190 CBioseq::TAnnot& annots
1191 = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot();
1192 NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) {
1193 if ((*it)->GetData().IsFtable()) {
1194 (*it)->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));
1195 return;
1196 }
1197 }
1198 CRef<CSeq_annot> annot(new CSeq_annot);
1199 annot->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));
1200 annots.push_back(annot);
1201 }
1202
1203
x_PlaceAlignment(CSeq_align & align,const SRecord & record)1204 void CGFFReader::x_PlaceAlignment(CSeq_align& align, const SRecord& record)
1205 {
1206 CRef<CBioseq> seq;
1207 try {
1208 seq = x_ResolveID(align.GetSeq_id(0), kEmptyStr);
1209 } catch (...) {
1210 }
1211 CBioseq::TAnnot& annots
1212 = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot();
1213 NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) {
1214 if ((*it)->GetData().IsAlign()) {
1215 (*it)->SetData().SetAlign().push_back(CRef<CSeq_align>(&align));
1216 return;
1217 }
1218 }
1219 CRef<CSeq_annot> annot(new CSeq_annot);
1220 annot->SetData().SetAlign().push_back(CRef<CSeq_align>(&align));
1221 annots.push_back(annot);
1222 }
1223
1224
x_ParseAndPlace(const SRecord & record)1225 void CGFFReader::x_ParseAndPlace(const SRecord& record)
1226 {
1227 switch (record.type) {
1228 case SRecord::eFeat:
1229 x_PlaceFeature(*x_ParseFeatRecord(record), record);
1230 break;
1231 case SRecord::eAlign:
1232 x_PlaceAlignment(*x_ParseAlignRecord(record), record);
1233 break;
1234 default:
1235 x_Warn("Unknown record type " + NStr::IntToString(record.type),
1236 record.line_no);
1237 }
1238 }
1239
1240
x_ResolveSeqName(const string & name)1241 CRef<CSeq_id> CGFFReader::x_ResolveSeqName(const string& name)
1242 {
1243 CRef<CSeq_id>& id = m_SeqNameCache[name];
1244 if (id.NotEmpty()
1245 && (id->Which() == CSeq_id::e_not_set
1246 || static_cast<int>(id->Which()) >= CSeq_id::e_MaxChoice)) {
1247 x_Warn("x_ResolveSeqName: invalid cache entry for " + name);
1248 id.Reset();
1249 }
1250 if ( !id ) {
1251 id.Reset(x_ResolveNewSeqName(name));
1252 }
1253 if ( !id || id->Which() == CSeq_id::e_not_set
1254 || static_cast<int>(id->Which()) >= CSeq_id::e_MaxChoice) {
1255 x_Warn("x_ResolveNewSeqName returned null or invalid ID for " + name);
1256 id.Reset(new CSeq_id(CSeq_id::e_Local, name, name));
1257 }
1258 return id;
1259 }
1260
1261
x_ResolveNewSeqName(const string & name)1262 CRef<CSeq_id> CGFFReader::x_ResolveNewSeqName(const string& name)
1263 {
1264 if (m_Flags & fAllIdsAsLocal) {
1265 if (NStr::StartsWith(name, "lcl|")) {
1266 return CRef<CSeq_id>(new CSeq_id(name));
1267 } else {
1268 return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
1269 }
1270 }
1271
1272 if (m_Flags & fNumericIdsAsLocal) {
1273 if (name.find_first_not_of("0123456789") == string::npos) {
1274 return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
1275 }
1276 }
1277 try {
1278 CRef<CSeq_id> pId(new CSeq_id(name));
1279 if (!pId || (pId->IsGi() && pId->GetGi() < GI_CONST(500)) ) {
1280 pId = new CSeq_id(CSeq_id::e_Local, name);
1281 }
1282 return pId;
1283 }
1284 catch (CSeqIdException&) {
1285 return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
1286 }
1287 }
1288
1289
x_ResolveID(const CSeq_id & id,const TStr & mol)1290 CRef<CBioseq> CGFFReader::x_ResolveID(const CSeq_id& id, const TStr& mol)
1291 {
1292 CRef<CBioseq>& seq = m_SeqCache[CConstRef<CSeq_id>(&id)];
1293 if ( !seq ) {
1294 seq.Reset(x_ResolveNewID(id, mol));
1295 // Derived versions of x_ResolveNewID may legimately return null
1296 // results....
1297 if (seq) {
1298 x_PlaceSeq(*seq);
1299 ITERATE (CBioseq::TId, it, seq->GetId()) {
1300 m_SeqCache.insert(make_pair(CConstRef<CSeq_id>(*it), seq));
1301 }
1302 }
1303 }
1304 return seq;
1305 }
1306
1307
x_ResolveNewID(const CSeq_id & id,const string & mol0)1308 CRef<CBioseq> CGFFReader::x_ResolveNewID(const CSeq_id& id, const string& mol0)
1309 {
1310 CRef<CBioseq> seq(new CBioseq);
1311 CRef<CSeq_id> id_copy(new CSeq_id);
1312
1313 id_copy->Assign(id);
1314 seq->SetId().push_back(id_copy);
1315 seq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);
1316
1317 const string& mol = mol0.empty() ? m_DefMol : mol0;
1318 if (mol.empty() || mol == "dna") {
1319 seq->SetInst().SetMol(CSeq_inst::eMol_dna);
1320 } else if (mol == "rna") {
1321 seq->SetInst().SetMol(CSeq_inst::eMol_rna);
1322 } else if (mol == "protein") {
1323 seq->SetInst().SetMol(CSeq_inst::eMol_aa);
1324 } else {
1325 x_Warn("unrecognized sequence type " + mol + "; assuming DNA");
1326 seq->SetInst().SetMol(CSeq_inst::eMol_dna);
1327 }
1328
1329 return seq;
1330 }
1331
x_SetProducts(CRef<CSeq_entry> & tse)1332 void CGFFReader::x_SetProducts( CRef<CSeq_entry>& tse )
1333 {
1334 CTypeIterator<CSeq_feat> feat_iter(*tse);
1335 for ( ; feat_iter; ++feat_iter) {
1336 CSeq_feat& feat = *feat_iter;
1337
1338 string qual_name;
1339 switch (feat.GetData().GetSubtype()) {
1340 case CSeqFeatData::eSubtype_cdregion:
1341 qual_name = "protein_id";
1342 break;
1343
1344 case CSeqFeatData::eSubtype_mRNA:
1345 qual_name = "transcript_id";
1346 break;
1347
1348 default:
1349 continue;
1350 break;
1351 }
1352
1353 string id_str = feat.GetNamedQual(qual_name);
1354 if ( !id_str.empty() ) {
1355 CRef<CSeq_id> id = x_ResolveSeqName(id_str);
1356 feat.SetProduct().SetWhole(*id);
1357 }
1358 }
1359 }
1360
x_CreateGeneFeatures(CRef<CSeq_entry> & tse)1361 void CGFFReader::x_CreateGeneFeatures( CRef<CSeq_entry>& tse )
1362 {
1363 CTypeIterator<CSeq_annot> annot_iter(*tse);
1364 for ( ; annot_iter; ++annot_iter) {
1365 CSeq_annot& annot = *annot_iter;
1366 if (annot.GetData().Which() != CSeq_annot::TData::e_Ftable) {
1367 continue;
1368 }
1369
1370 // we work within the scope of one annotation
1371 CSeq_annot::TData::TFtable::iterator feat_iter =
1372 annot.SetData().SetFtable().begin();
1373 CSeq_annot::TData::TFtable::iterator feat_end =
1374 annot.SetData().SetFtable().end();
1375
1376 /// we plan to create a series of gene features, one for each gene
1377 /// identified above
1378 /// genes are identified via a 'gene_id' marker
1379 typedef map<string, CRef<CSeq_feat> > TGeneMap;
1380 TGeneMap genes;
1381 for (bool has_genes = false;
1382 feat_iter != feat_end && !has_genes; ++feat_iter) {
1383 CSeq_feat& feat = **feat_iter;
1384
1385 switch (feat.GetData().GetSubtype()) {
1386 case CSeqFeatData::eSubtype_gene:
1387 /// we already have genes, so don't add any more
1388 has_genes = true;
1389 genes.clear();
1390 break;
1391
1392 case CSeqFeatData::eSubtype_mRNA:
1393 case CSeqFeatData::eSubtype_cdregion:
1394 /// for mRNA and CDS features, create a gene
1395 /// this is only done if the gene_id parameter was set
1396 /// in parsing, we promote gene_id to a gene xref
1397 if ( !feat.GetGeneXref() ) {
1398 continue;
1399 }
1400 {{
1401 string gene_id;
1402 feat.GetGeneXref()->GetLabel(&gene_id);
1403 _ASSERT( !gene_id.empty() );
1404 TSeqRange range = feat.GetLocation().GetTotalRange();
1405
1406 ENa_strand strand = feat.GetLocation().GetStrand();
1407 const CSeq_id* id = feat.GetLocation().GetId();
1408 if ( !id ) {
1409 x_Error("No consistent ID found; gene feature skipped");
1410 continue;
1411 }
1412
1413 TGeneMap::iterator iter = genes.find(gene_id);
1414 if (iter == genes.end()) {
1415 /// new gene feature
1416 CRef<CSeq_feat> gene(new CSeq_feat());
1417 gene->SetData().SetGene().Assign(*feat.GetGeneXref());
1418
1419 gene->SetLocation().SetInt().SetFrom(range.GetFrom());
1420 gene->SetLocation().SetInt().SetTo (range.GetTo());
1421 gene->SetLocation().SetId(*id);
1422 gene->SetLocation().SetInt().SetStrand(strand);
1423 genes[gene_id] = gene;
1424 } else {
1425 /// we agglomerate the old location
1426 CRef<CSeq_feat> gene = iter->second;
1427
1428 TSeqRange r2 = gene->GetLocation().GetTotalRange();
1429 range += r2;
1430 gene->SetLocation().SetInt().SetFrom(range.GetFrom());
1431 gene->SetLocation().SetInt().SetTo (range.GetTo());
1432 gene->SetLocation().InvalidateTotalRangeCache();
1433 }
1434 }}
1435 break;
1436
1437 default:
1438 break;
1439 }
1440 }
1441
1442 ITERATE (TGeneMap, iter, genes) {
1443 annot.SetData().SetFtable().push_back(iter->second);
1444 }
1445 }
1446 }
1447
x_RemapGeneRefs(CRef<CSeq_entry> & tse,TGeneRefs & gene_refs)1448 void CGFFReader::x_RemapGeneRefs( CRef<CSeq_entry>& tse, TGeneRefs& gene_refs )
1449 {
1450 if ( !tse || gene_refs.empty() ) {
1451 return;
1452 }
1453 NON_CONST_ITERATE (TGeneRefs, iter, gene_refs) {
1454 if ( !iter->second->IsSetLocus() &&
1455 !iter->second->IsSetLocus_tag()) {
1456 iter->second->SetLocus(iter->first);
1457 } else if ( !iter->second->IsSetLocus() ||
1458 iter->second->GetLocus() != iter->first) {
1459 iter->second->SetSyn().push_back(iter->first);
1460 }
1461 }
1462
1463 CTypeIterator<CSeq_feat> feat_iter(*tse);
1464 for ( ; feat_iter; ++feat_iter) {
1465 const CGene_ref* ref = NULL;
1466 if (feat_iter->GetData().IsGene()) {
1467 ref = &feat_iter->GetData().GetGene();
1468 } else {
1469 ref = feat_iter->GetGeneXref();
1470 }
1471 if (ref && ref->IsSetLocus()) {
1472 TGeneRefs::const_iterator iter =
1473 gene_refs.find(ref->GetLocus());
1474 if (iter != gene_refs.end()) {
1475 const_cast<CGene_ref*>(ref)->Assign(*iter->second);
1476 }
1477 }
1478 }
1479 }
1480
x_PlaceSeq(CBioseq & seq)1481 void CGFFReader::x_PlaceSeq(CBioseq& seq)
1482 {
1483 bool found = false;
1484 for (CTypeConstIterator<CBioseq> it(*m_TSE); it; ++it) {
1485 if (&*it == &seq) {
1486 found = true;
1487 BREAK(it);
1488 }
1489 }
1490 if ( !found ) {
1491 CRef<CSeq_entry> se(new CSeq_entry);
1492 se->SetSeq(seq);
1493 m_TSE->SetSet().SetSeq_set().push_back(se);
1494 }
1495 }
1496
1497
1498 CGFFReader::SRecord::TAttrs::const_iterator
FindAttribute(const string & att_name,size_t min_values) const1499 CGFFReader::SRecord::FindAttribute(const string& att_name, size_t min_values)
1500 const
1501 {
1502 SRecord::TAttrs::const_iterator it
1503 = attrs.lower_bound(vector<string>(1, att_name));
1504 while (it != attrs.end() && it->front() == att_name
1505 && it->size() <= min_values) {
1506 ++it;
1507 }
1508 return (it == attrs.end() || it->front() == att_name) ? it : attrs.end();
1509 }
1510
1511
1512 bool
x_IsLineUcscMetaInformation(const TStr & line)1513 CGFFReader::x_IsLineUcscMetaInformation(const TStr& line)
1514 {
1515 // line starts with keyword "browser" or "track"
1516 return (NStr::StartsWith(line, "browser") || NStr::StartsWith(line, "track") );
1517 }
1518
1519
1520 END_SCOPE(objects)
1521 END_NCBI_SCOPE
1522