1 /*  $Id: gtf_reader.cpp 632531 2021-06-02 17:25:37Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Frank Ludwig
27  *
28  * File Description:
 *   GTF file reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 
36 #include <util/line_reader.hpp>
37 
38 #include <objects/general/Object_id.hpp>
39 #include <objects/general/User_object.hpp>
40 #include <objects/general/Dbtag.hpp>
41 
42 #include <objects/seqloc/Seq_id.hpp>
43 #include <objects/seqloc/Seq_loc.hpp>
44 #include <objects/seqloc/Seq_interval.hpp>
45 #include <objects/seqloc/Seq_point.hpp>
46 
47 #include <objects/seq/Seq_annot.hpp>
48 #include <objects/seq/Annot_id.hpp>
49 #include <objects/seq/Annot_descr.hpp>
50 #include <objects/seqfeat/SeqFeatData.hpp>
51 #include <objects/seqfeat/SeqFeatXref.hpp>
52 
53 #include <objects/seqfeat/Seq_feat.hpp>
54 #include <objects/seqfeat/Gene_ref.hpp>
55 #include <objects/seqfeat/Genetic_code.hpp>
56 #include <objects/seqfeat/RNA_ref.hpp>
57 #include <objects/seqfeat/Gb_qual.hpp>
58 #include <objects/seqfeat/Feat_id.hpp>
59 
60 #include <objtools/readers/gtf_reader.hpp>
61 #include "gtf_location_merger.hpp"
62 #include "reader_message_handler.hpp"
63 
64 #include <algorithm>
65 
66 BEGIN_NCBI_SCOPE
67 BEGIN_objects_SCOPE // namespace ncbi::objects::
68 
69 //  ----------------------------------------------------------------------------
xAssignAttributesFromGff(const string & strGtfType,const string & strRawAttributes)70 bool CGtfReadRecord::xAssignAttributesFromGff(
71     const string& strGtfType,
72     const string& strRawAttributes )
73 //  ----------------------------------------------------------------------------
74 {
75     vector< string > attributes;
76     xSplitGffAttributes(strRawAttributes, attributes);
77 
78     for ( size_t u=0; u < attributes.size(); ++u ) {
79         string key, value;
80         string attribute(attributes[u]);
81         if (!NStr::SplitInTwo(attribute, "=", key, value)) {
82             if (!NStr::SplitInTwo(attribute, " ", key, value)) {
83                 if (strGtfType == "gene") {
84                     mAttributes.AddValue(
85                         "gene_id", xNormalizedAttributeValue(attribute));
86                     continue;
87                 }
88                 if (strGtfType == "transcript") {
89                     string gid, tid;
90                     if (!NStr::SplitInTwo(attribute, ".", gid, tid)) {
91                         return false;
92                     }
93                     mAttributes.AddValue(
94                         "gene_id", xNormalizedAttributeValue(gid));
95                     mAttributes.AddValue(
96                         "transcript_id", xNormalizedAttributeValue(attribute));
97                     continue;
98                 }
99             }
100         }
101         key = xNormalizedAttributeKey(key);
102         value = xNormalizedAttributeValue(value);
103         if ( key.empty()  &&  value.empty() ) {
104             // Probably due to trailing "; ". Sequence Ontology generates such
105             // things.
106             continue;
107         }
108         if (NStr::StartsWith(value, "\"")) {
109             value = value.substr(1, string::npos);
110         }
111         if (NStr::EndsWith(value, "\"")) {
112             value = value.substr(0, value.length() - 1);
113         }
114         mAttributes.AddValue(key, value);
115     }
116     return true;
117 }
118 
119 //  ----------------------------------------------------------------------------
CGtfReader(unsigned int uFlags,const string & strAnnotName,const string & strAnnotTitle,SeqIdResolver resolver,CReaderListener * pRL)120 CGtfReader::CGtfReader(
121     unsigned int uFlags,
122     const string& strAnnotName,
123     const string& strAnnotTitle,
124     SeqIdResolver resolver,
125     CReaderListener* pRL):
126 //  ----------------------------------------------------------------------------
127     CGff2Reader( uFlags, strAnnotName, strAnnotTitle, resolver, pRL)
128 {
129     mpLocations.reset(new CGtfLocationMerger(uFlags, resolver));
130 }
131 
132 //  ----------------------------------------------------------------------------
CGtfReader(unsigned int uFlags,CReaderListener * pRL)133 CGtfReader::CGtfReader(
134     unsigned int uFlags,
135     CReaderListener* pRL):
136 //  ----------------------------------------------------------------------------
137     CGtfReader( uFlags, "", "", CReadUtil::AsSeqId, pRL)
138 {
139 }
140 
141 
142 //  ----------------------------------------------------------------------------
~CGtfReader()143 CGtfReader::~CGtfReader()
144 //  ----------------------------------------------------------------------------
145 {
146 }
147 
148 //  ----------------------------------------------------------------------------
149 CRef<CSeq_annot>
ReadSeqAnnot(ILineReader & lineReader,ILineErrorListener * pEC)150 CGtfReader::ReadSeqAnnot(
151     ILineReader& lineReader,
152     ILineErrorListener* pEC )
153 //  ----------------------------------------------------------------------------
154 {
155     mCurrentFeatureCount = 0;
156     return CReaderBase::ReadSeqAnnot(lineReader, pEC);
157 }
158 
159 //  ----------------------------------------------------------------------------
160 void
xProcessData(const TReaderData & readerData,CSeq_annot & annot)161 CGtfReader::xProcessData(
162     const TReaderData& readerData,
163     CSeq_annot& annot)
164 //  ----------------------------------------------------------------------------
165 {
166     for (const auto& lineData: readerData) {
167         const auto& line = lineData.mData;
168         if (xIsTrackTerminator(line)) {
169             continue;
170         }
171         if (xParseStructuredComment(line)) {
172             continue;
173         }
174         if (xParseBrowserLine(line, annot)) {
175             continue;
176         }
177         if (xParseFeature(line, annot, nullptr)) {
178             continue;
179         }
180     }
181 }
182 
183 //  ----------------------------------------------------------------------------
xUpdateAnnotFeature(const CGff2Record & record,CSeq_annot & annot,ILineErrorListener * pEC)184 bool CGtfReader::xUpdateAnnotFeature(
185     const CGff2Record& record,
186     CSeq_annot& annot,
187     ILineErrorListener* pEC)
188 //  ----------------------------------------------------------------------------
189 {
190     const CGtfReadRecord& gff = dynamic_cast<const CGtfReadRecord&>(record);
191     auto recType = gff.NormalizedType();
192 
193     using TYPEHANDLER = bool (CGtfReader::*)(const CGtfReadRecord&, CSeq_annot&);
194     using HANDLERMAP = map<string, TYPEHANDLER>;
195 
196     HANDLERMAP typeHandlers = {
197         {"cds",         &CGtfReader::xUpdateAnnotCds},
198         {"start_codon", &CGtfReader::xUpdateAnnotCds},
199         {"stop_codon",  &CGtfReader::xUpdateAnnotCds},
200         {"5utr",        &CGtfReader::xUpdateAnnotTranscript},
201         {"3utr",        &CGtfReader::xUpdateAnnotTranscript},
202         {"exon",        &CGtfReader::xUpdateAnnotTranscript},
203         {"initial",     &CGtfReader::xUpdateAnnotTranscript},
204         {"internal",    &CGtfReader::xUpdateAnnotTranscript},
205         {"terminal",    &CGtfReader::xUpdateAnnotTranscript},
206         {"single",      &CGtfReader::xUpdateAnnotTranscript},
207     };
208 
209     //
210     // Handle officially recognized GTF types:
211     //
212     HANDLERMAP::iterator it = typeHandlers.find(recType);
213     if (it != typeHandlers.end()) {
214         TYPEHANDLER handler = it->second;
215         return (this->*handler)(gff, annot);
216     }
217 
218     //
219     //  Every other type is not officially sanctioned GTF, and per spec we are
220     //  supposed to ignore it. In the spirit of being lenient on input we may
221     //  try to salvage some of it anyway.
222     //
223     if (recType == "gene") {
224         return xCreateParentGene(gff, annot);
225     }
226     if (recType == "mrna"  ||  recType == "transcript") {
227         return xCreateParentMrna(gff, annot);
228     }
229     return true;
230 }
231 
232 //  ----------------------------------------------------------------------------
xUpdateAnnotCds(const CGtfReadRecord & gff,CSeq_annot & annot)233 bool CGtfReader::xUpdateAnnotCds(
234     const CGtfReadRecord& gff,
235     CSeq_annot& annot )
236 //  ----------------------------------------------------------------------------
237 {
238     auto featId = mpLocations->GetFeatureIdFor(gff, "cds");
239     mpLocations->AddRecordForId(featId, gff) ;
240     return (xFindFeatById(featId)  ||  xCreateParentCds(gff, annot));
241  }
242 
243 //  ----------------------------------------------------------------------------
xUpdateAnnotTranscript(const CGtfReadRecord & gff,CSeq_annot & annot)244 bool CGtfReader::xUpdateAnnotTranscript(
245     const CGtfReadRecord& gff,
246     CSeq_annot& annot )
247 //  ----------------------------------------------------------------------------
248 {
249     //
250     // If there is no gene feature to go with this CDS then make one. Otherwise,
251     //  make sure the existing gene feature includes the location of the CDS.
252     //
253     auto geneFeatId = mpLocations->GetFeatureIdFor(gff, "gene");
254     CRef< CSeq_feat > pGene = xFindFeatById(geneFeatId);
255     if (!pGene) {
256         if (!xCreateParentGene(gff, annot)) {
257             return false;
258         }
259         mpLocations->AddRecordForId(geneFeatId, gff);
260     }
261     else {
262         mpLocations->AddRecordForId(geneFeatId, gff);
263         if (!xFeatureTrimQualifiers(gff, *pGene)) {
264             return false;
265         }
266     }
267 
268     //
269     // If there is no mRNA feature with this gene_id|transcript_id then make one.
270     //  Otherwise, fix up the location of the existing one.
271     //
272     auto transcriptFeatId = mpLocations->GetFeatureIdFor(gff, "transcript");
273     CRef<CSeq_feat> pMrna = xFindFeatById(transcriptFeatId);
274     if (!pMrna) {
275         //
276         // Create a brand new CDS feature:
277         //
278         if (!xCreateParentMrna(gff, annot)) {
279             return false;
280         }
281         mpLocations->AddRecordForId(transcriptFeatId, gff);
282     }
283     else {
284         //
285         // Update an already existing CDS features:
286         //
287         mpLocations->AddRecordForId(transcriptFeatId, gff);
288         if (!xFeatureTrimQualifiers(gff, *pMrna)) {
289             return false;
290         }
291     }
292     return true;
293 }
294 
295 //  ----------------------------------------------------------------------------
xCreateFeatureId(const CGtfReadRecord & record,const string & prefix,CSeq_feat & feature)296 bool CGtfReader::xCreateFeatureId(
297     const CGtfReadRecord& record,
298     const string& prefix,
299     CSeq_feat& feature )
300 //  ----------------------------------------------------------------------------
301 {
302     static int seqNum(1);
303 
304     string strFeatureId = prefix;
305     if (strFeatureId.empty()) {
306         strFeatureId = "id";
307     }
308     strFeatureId += "_";
309     strFeatureId += NStr::IntToString(seqNum++);
310     feature.SetId().SetLocal().SetStr(strFeatureId);
311     return true;
312 }
313 
314 //  -----------------------------------------------------------------------------
xCreateParentGene(const CGtfReadRecord & gff,CSeq_annot & annot)315 bool CGtfReader::xCreateParentGene(
316     const CGtfReadRecord& gff,
317     CSeq_annot& annot )
318 //  -----------------------------------------------------------------------------
319 {
320     auto featId = mpLocations->GetFeatureIdFor(gff, "gene");
321     if (m_MapIdToFeature.find(featId) != m_MapIdToFeature.end()) {
322         return true;
323     }
324 
325     CRef<CSeq_feat> pFeature( new CSeq_feat );
326 
327     if (!xFeatureSetDataGene(gff, *pFeature)) {
328         return false;
329     }
330     if (!xCreateFeatureId(gff, "gene", *pFeature)) {
331         return false;
332     }
333     if ( !xFeatureSetQualifiersGene(gff, *pFeature)) {
334         return false;
335     }
336 
337     (gff.Type() == "gene") ?
338         mpLocations->AddRecordForId(featId, gff) :
339         mpLocations->AddStubForId(featId);
340     m_MapIdToFeature[featId] = pFeature;
341     xAddFeatureToAnnot(pFeature, annot);
342     return true;
343 }
344 
345 //  ----------------------------------------------------------------------------
xFeatureSetQualifiersGene(const CGtfReadRecord & record,CSeq_feat & feature)346 bool CGtfReader::xFeatureSetQualifiersGene(
347     const CGtfReadRecord& record,
348     CSeq_feat& feature )
349 //  ----------------------------------------------------------------------------
350 {
351     list<string> ignoredAttrs = {
352         "locus_tag", "transcript_id"
353     };
354     //
355     //  Create GB qualifiers for the record attributes:
356     //
357 
358     const auto& attrs = record.GtfAttributes().Get();
359     auto it = attrs.begin();
360     for (/*NOOP*/; it != attrs.end(); ++it) {
361         auto cit = std::find(ignoredAttrs.begin(), ignoredAttrs.end(), it->first);
362         if (cit != ignoredAttrs.end()) {
363             continue;
364         }
365         // special case some well-known attributes
366         if (xProcessQualifierSpecialCase(it->first, it->second, feature)) {
367             continue;
368         }
369 
370         // turn everything else into a qualifier
371         xFeatureAddQualifiers(it->first, it->second, feature);
372     }
373     return true;
374 }
375 
376 //  ----------------------------------------------------------------------------
xFeatureSetQualifiersRna(const CGtfReadRecord & record,CSeq_feat & feature)377 bool CGtfReader::xFeatureSetQualifiersRna(
378     const CGtfReadRecord& record,
379     CSeq_feat& feature )
380 //  ----------------------------------------------------------------------------
381 {
382     list<string> ignoredAttrs = {
383         "locus_tag"
384     };
385 
386     const auto& attrs = record.GtfAttributes().Get();
387     auto it = attrs.begin();
388     for (/*NOOP*/; it != attrs.end(); ++it) {
389         auto cit = std::find(ignoredAttrs.begin(), ignoredAttrs.end(), it->first);
390         if (cit != ignoredAttrs.end()) {
391             continue;
392         }
393         // special case some well-known attributes
394         if (xProcessQualifierSpecialCase(it->first, it->second, feature)) {
395             continue;
396         }
397 
398         // turn everything else into a qualifier
399         xFeatureAddQualifiers(it->first, it->second, feature);
400     }
401     return true;
402 }
403 
404 //  ----------------------------------------------------------------------------
xFeatureSetQualifiersCds(const CGtfReadRecord & record,CSeq_feat & feature)405 bool CGtfReader::xFeatureSetQualifiersCds(
406     const CGtfReadRecord& record,
407     CSeq_feat& feature )
408 //  ----------------------------------------------------------------------------
409 {
410     list<string> ignoredAttrs = {
411         "locus_tag"
412     };
413 
414     const auto& attrs = record.GtfAttributes().Get();
415     auto it = attrs.begin();
416     for (/*NOOP*/; it != attrs.end(); ++it) {
417         auto cit = std::find(ignoredAttrs.begin(), ignoredAttrs.end(), it->first);
418         if (cit != ignoredAttrs.end()) {
419             continue;
420         }
421         // special case some well-known attributes
422         if (xProcessQualifierSpecialCase(it->first, it->second, feature)) {
423             continue;
424         }
425 
426         // turn everything else into a qualifier
427         xFeatureAddQualifiers(it->first, it->second, feature);
428     }
429     return true;
430 }
431 
432 //  -----------------------------------------------------------------------------
xCreateParentCds(const CGtfReadRecord & gff,CSeq_annot & annot)433 bool CGtfReader::xCreateParentCds(
434     const CGtfReadRecord& gff,
435     CSeq_annot& annot )
436 //  -----------------------------------------------------------------------------
437 {
438     auto featId = mpLocations->GetFeatureIdFor(gff, "cds");
439     if (m_MapIdToFeature.find(featId) != m_MapIdToFeature.end()) {
440         return true;
441     }
442 
443     CRef<CSeq_feat> pFeature(new CSeq_feat);
444 
445     if (!xFeatureSetDataCds(gff, *pFeature)) {
446         return false;
447     }
448     if (!xCreateFeatureId(gff, "cds", *pFeature)) {
449         return false;
450     }
451     if (!xFeatureSetQualifiersCds(gff, *pFeature)) {
452         return false;
453     }
454     m_MapIdToFeature[featId] = pFeature;
455     return xAddFeatureToAnnot(pFeature, annot);
456 }
457 
458 //  -----------------------------------------------------------------------------
xCreateParentMrna(const CGtfReadRecord & gff,CSeq_annot & annot)459 bool CGtfReader::xCreateParentMrna(
460     const CGtfReadRecord& gff,
461     CSeq_annot& annot )
462 //  -----------------------------------------------------------------------------
463 {
464     auto featId = mpLocations->GetFeatureIdFor(gff, "transcript");
465     if (m_MapIdToFeature.find(featId) != m_MapIdToFeature.end()) {
466         return true;
467     }
468 
469     CRef< CSeq_feat > pFeature( new CSeq_feat );
470 
471     if (!xFeatureSetDataMrna(gff, *pFeature)) {
472         return false;
473     }
474     if (!xCreateFeatureId(gff, "mrna", *pFeature)) {
475         return false;
476     }
477     if ( ! xFeatureSetQualifiersRna( gff, *pFeature ) ) {
478         return false;
479     }
480 
481     mpLocations->AddStubForId(featId);
482     m_MapIdToFeature[featId] = pFeature;
483 
484     return xAddFeatureToAnnot( pFeature, annot );
485 }
486 
487 //  ----------------------------------------------------------------------------
xFindFeatById(const string & featId)488 CRef<CSeq_feat> CGtfReader::xFindFeatById(
489     const string& featId)
490 //  ----------------------------------------------------------------------------
491 {
492     auto featIt = m_MapIdToFeature.find(featId);
493     if (featIt == m_MapIdToFeature.end()) {
494         return CRef<CSeq_feat>();
495     }
496     return featIt->second;
497 }
498 
499 //  ----------------------------------------------------------------------------
xFeatureSetDataGene(const CGtfReadRecord & record,CSeq_feat & feature)500 bool CGtfReader::xFeatureSetDataGene(
501     const CGtfReadRecord& record,
502     CSeq_feat& feature )
503 //  ----------------------------------------------------------------------------
504 {
505     CGene_ref& gene = feature.SetData().SetGene();
506 
507     const auto& attributes = record.GtfAttributes();
508     string geneSynonym = attributes.ValueOf("gene_synonym");
509     if (!geneSynonym.empty()) {
510         gene.SetSyn().push_back(geneSynonym);
511     }
512     string locusTag = attributes.ValueOf("locus_tag");
513     if (!locusTag.empty()) {
514         gene.SetLocus_tag(locusTag);
515     }
516     return true;
517 }
518 
519 //  ----------------------------------------------------------------------------
xFeatureSetDataMrna(const CGtfReadRecord & record,CSeq_feat & feature)520 bool CGtfReader::xFeatureSetDataMrna(
521     const CGtfReadRecord& record,
522     CSeq_feat& feature)
523 //  ----------------------------------------------------------------------------
524 {
525     if (!xFeatureSetDataRna(record, feature, CSeqFeatData::eSubtype_mRNA)) {
526         return false;
527     }
528     CRNA_ref& rna = feature.SetData().SetRna();
529 
530     string product = record.GtfAttributes().ValueOf("product");
531     if (!product.empty()) {
532         rna.SetExt().SetName(product);
533     }
534     return true;
535 }
536 
537 //  ----------------------------------------------------------------------------
xFeatureSetDataRna(const CGtfReadRecord & record,CSeq_feat & feature,CSeqFeatData::ESubtype subType)538 bool CGtfReader::xFeatureSetDataRna(
539     const CGtfReadRecord& record,
540     CSeq_feat& feature,
541     CSeqFeatData::ESubtype subType)
542 //  ----------------------------------------------------------------------------
543 {
544     CRNA_ref& rnaRef = feature.SetData().SetRna();
545     switch (subType){
546         default:
547             rnaRef.SetType(CRNA_ref::eType_miscRNA);
548             break;
549         case CSeqFeatData::eSubtype_mRNA:
550             rnaRef.SetType(CRNA_ref::eType_mRNA);
551             break;
552         case CSeqFeatData::eSubtype_rRNA:
553             rnaRef.SetType(CRNA_ref::eType_rRNA);
554             break;
555     }
556     return true;
557 }
558 
559 //  ----------------------------------------------------------------------------
xFeatureSetDataCds(const CGtfReadRecord & record,CSeq_feat & feature)560 bool CGtfReader::xFeatureSetDataCds(
561     const CGtfReadRecord& record,
562     CSeq_feat& feature )
563 //  ----------------------------------------------------------------------------
564 {
565     CCdregion& cdr = feature.SetData().SetCdregion();
566     const auto& attributes = record.GtfAttributes();
567 
568     string proteinId = attributes.ValueOf("protein_id");
569     if (!proteinId.empty()) {
570         CRef<CSeq_id> pId = mSeqIdResolve(proteinId, m_iFlags, true);
571         if (pId->IsGenbank()) {
572             feature.SetProduct().SetWhole(*pId);
573         }
574     }
575     string ribosomalSlippage = attributes.ValueOf("ribosomal_slippage");
576     if (!ribosomalSlippage.empty()) {
577         feature.SetExcept( true );
578         feature.SetExcept_text("ribosomal slippage");
579     }
580     string transTable = attributes.ValueOf("transl_table");
581     if (!transTable.empty()) {
582         CRef< CGenetic_code::C_E > pGc( new CGenetic_code::C_E );
583         pGc->SetId(NStr::StringToUInt(transTable));
584         cdr.SetCode().Set().push_back(pGc);
585     }
586     return true;
587 }
588 
589 //  ----------------------------------------------------------------------------
xFeatureTrimQualifiers(const CGtfReadRecord & record,CSeq_feat & feature)590 bool CGtfReader::xFeatureTrimQualifiers(
591     const CGtfReadRecord& record,
592     CSeq_feat& feature )
593     //  ----------------------------------------------------------------------------
594 {
595     typedef CSeq_feat::TQual TQual;
596     //task:
597     // for each attribute of the new piece check if we already got a feature
598     //  qualifier
599     // if so, and with the same value, then the qualifier is allowed to live
600     // otherwise it is subfeature specific and hence removed from the feature
601     TQual& quals = feature.SetQual();
602     for (TQual::iterator it = quals.begin(); it != quals.end(); /**/) {
603         const string& qualKey = (*it)->GetQual();
604         if (NStr::StartsWith(qualKey, "gff_")) {
605             it++;
606             continue;
607         }
608         if (qualKey == "locus_tag") {
609             it++;
610             continue;
611         }
612         if (qualKey == "old_locus_tag") {
613             it++;
614             continue;
615         }
616         if (qualKey == "product") {
617             it++;
618             continue;
619         }
620         if (qualKey == "protein_id") {
621             it++;
622             continue;
623         }
624         const string& qualVal = (*it)->GetVal();
625         if (!record.GtfAttributes().HasValue(qualKey, qualVal)) {
626             //superfluous qualifier- squish
627             it = quals.erase(it);
628             continue;
629         }
630         it++;
631     }
632     return true;
633 }
634 
635 //  ----------------------------------------------------------------------------
xProcessQualifierSpecialCase(const string & key,const CGtfAttributes::MultiValue & values,CSeq_feat & feature)636 bool CGtfReader::xProcessQualifierSpecialCase(
637     const string& key,
638     const CGtfAttributes::MultiValue& values,
639     CSeq_feat& feature )
640 //  ----------------------------------------------------------------------------
641 {
642     CRef<CGb_qual> pQual(0);
643 
644     if (0 == NStr::CompareNocase(key, "exon_id")) {
645         return true;
646     }
647     if (0 == NStr::CompareNocase(key, "exon_number")) {
648         return true;
649     }
650     if ( 0 == NStr::CompareNocase(key, "note") ) {
651         feature.SetComment(NStr::Join(values, ";"));
652         return true;
653     }
654     if ( 0 == NStr::CompareNocase(key, "dbxref") ||
655         0 == NStr::CompareNocase(key, "db_xref"))
656     {
657         for (auto value: values) {
658             vector< string > tags;
659             NStr::Split(value, ";", tags );
660             for (auto it = tags.begin(); it != tags.end(); ++it ) {
661                 feature.SetDbxref().push_back(x_ParseDbtag(*it));
662             }
663         }
664         return true;
665     }
666 
667     if ( 0 == NStr::CompareNocase(key, "pseudo")) {
668         feature.SetPseudo( true );
669         return true;
670     }
671     if ( 0 == NStr::CompareNocase(key, "partial")) {
672         // RW-1108 - ignore partial attribute in Genbank mode
673         if (m_iFlags & CGtfReader::fGenbankMode) {
674             return true;
675         }
676     }
677     return false;
678 }
679 
680 //  ----------------------------------------------------------------------------
xFeatureAddQualifiers(const string & key,const CGtfAttributes::MultiValue & values,CSeq_feat & feature)681 void CGtfReader::xFeatureAddQualifiers(
682     const string& key,
683     const CGtfAttributes::MultiValue& values,
684     CSeq_feat& feature)
685     //  ----------------------------------------------------------------------------
686 {
687     for (auto value: values) {
688         feature.AddQualifier(key, value);
689     }
690 };
691 
692 //  ============================================================================
xSetAncestorXrefs(CSeq_feat & descendent,CSeq_feat & ancestor)693 void CGtfReader::xSetAncestorXrefs(
694     CSeq_feat& descendent,
695     CSeq_feat& ancestor)
696 //  ============================================================================
697 {
698     xSetXrefFromTo(descendent, ancestor);
699     if (m_iFlags & CGtfReader::fGenerateChildXrefs) {
700         xSetXrefFromTo(ancestor, descendent);
701     }
702 }
703 
704 //  ----------------------------------------------------------------------------
xPostProcessAnnot(CSeq_annot & annot)705 void CGtfReader::xPostProcessAnnot(
706     CSeq_annot& annot)
707 //  ----------------------------------------------------------------------------
708 {
709     //location fixup:
710     for (auto itLocation: mpLocations->LocationMap()) {
711         auto id = itLocation.first;
712         auto itFeature = m_MapIdToFeature.find(id);
713         if (itFeature == m_MapIdToFeature.end()) {
714             continue;
715         }
716         CRef<CSeq_feat> pFeature = itFeature->second;
717         auto featSubType = pFeature->GetData().GetSubtype();
718         CRef<CSeq_loc> pNewLoc = mpLocations->MergeLocation(
719             featSubType, itLocation.second);
720         pFeature->SetLocation(*pNewLoc);
721     }
722 
723     //generate xrefs:
724     for (auto itLocation: mpLocations->LocationMap()) {
725         auto id = itLocation.first;
726         auto itFeature = m_MapIdToFeature.find(id);
727         if (itFeature == m_MapIdToFeature.end()) {
728             continue;
729         }
730         CRef<CSeq_feat> pFeature = itFeature->second;
731         auto featSubType = pFeature->GetData().GetSubtype();
732         switch(featSubType) {
733             default: {
734                 break;
735             }
736             case CSeqFeatData::eSubtype_mRNA: {
737                 auto parentGeneFeatId = string("gene:") + pFeature->GetNamedQual("gene_id");
738                 CRef<CSeq_feat> pParentGene;
739                 if (x_GetFeatureById(parentGeneFeatId, pParentGene)) {
740                     xSetAncestorXrefs(*pFeature, *pParentGene);
741                 }
742                 break;
743             }
744             case CSeqFeatData::eSubtype_cdregion: {
745                 auto parentRnaFeatId = string("transcript:") + pFeature->GetNamedQual("gene_id") +
746                     "_" + pFeature->GetNamedQual("transcript_id");
747                 CRef<CSeq_feat> pParentRna;
748                 if (x_GetFeatureById(parentRnaFeatId, pParentRna)) {
749                     xSetAncestorXrefs(*pFeature, *pParentRna);
750                 }
751                 auto parentGeneFeatId = string("gene:") + pFeature->GetNamedQual("gene_id");
752                 CRef<CSeq_feat> pParentGene;
753                 if (x_GetFeatureById(parentGeneFeatId, pParentGene)) {
754                     xSetAncestorXrefs(*pFeature, *pParentGene);
755                 }
756                 break;
757             }
758         }
759     }
760     return CGff2Reader::xPostProcessAnnot(annot);
761 }
762 
763 END_objects_SCOPE
764 END_NCBI_SCOPE
765