1  /*  $Id: gtf_reader.hpp 632526 2021-06-02 17:25:01Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig
27  *
28  * File Description:
 *   GTF file reader
30  *
31  */
32 
33 #ifndef OBJTOOLS_READERS___GTF_READER__HPP
34 #define OBJTOOLS_READERS___GTF_READER__HPP
35 
36 #include <corelib/ncbistd.hpp>
37 #include <objtools/readers/gff2_reader.hpp>
38 
39 BEGIN_NCBI_SCOPE
40 
41 BEGIN_SCOPE(objects) // namespace ncbi::objects::
42 
43 class CGtfLocationMerger;
44 
45 //  ============================================================================
class CGtfAttributes
//  ============================================================================
//  Multi-valued attribute table: every key maps to an ordered list of distinct
//  string values (values are kept in the order they were first added).
//  ============================================================================
{
public:
    using MultiValue = std::vector<std::string>;
    using MultiAttributes = std::map<std::string, MultiValue>;

    //  Read-only access to the complete key -> values table.
    const MultiAttributes&
    Get() const
    {
        return mAttributes;
    }

    //  Returns the value stored under `key` if, and only if, exactly one value
    //  is recorded for it. Returns an empty string when the key is unknown or
    //  when it carries zero or several values.
    std::string
    ValueOf(
        const std::string& key) const
    {
        // Inspect the stored vector in place; no need to copy it just to
        // look at its size.
        const auto it = mAttributes.find(key);
        if (it != mAttributes.end()  &&  it->second.size() == 1) {
            return it->second.front();
        }
        return std::string();
    }

    //  With an empty `value` (the default): true if `key` has at least one
    //  value. With a non-empty `value`: true if that exact value is recorded
    //  under `key`.
    bool
    HasValue(
        const std::string& key,
        const std::string& value = "") const
    {
        const auto it = mAttributes.find(key);
        if (it == mAttributes.end()) {
            return false;
        }
        const MultiValue& values = it->second;
        if (values.empty()) {
            return false;
        }
        if (value.empty()) {
            return true;
        }
        return std::find(values.begin(), values.end(), value) != values.end();
    }

    //  Replaces the content of `values` with a copy of all values recorded
    //  under `key`; `values` ends up empty when the key is unknown.
    void
    GetValues(
        const std::string& key,
        MultiValue& values) const
    {
        const auto it = mAttributes.find(key);
        if (it == mAttributes.end()) {
            values.clear();
            return;
        }
        values = it->second;
    }

    //  Records `value` under `key`, creating the key on first use. A value
    //  that is already present for the key is not added a second time.
    void
    AddValue(
        const std::string& key,
        const std::string& value)
    {
        // operator[] creates an empty value list for a new key, so a single
        // lookup covers both the "new key" and the "known key" case.
        MultiValue& values = mAttributes[key];
        if (std::find(values.begin(), values.end(), value) == values.end()) {
            values.push_back(value);
        }
    }

protected:
    MultiAttributes mAttributes;
};
121 
122 //  ============================================================================
123 class CGtfReadRecord
124 //  ============================================================================
125     : public CGff2Record
126 {
127 public:
CGtfReadRecord()128     CGtfReadRecord(): CGff2Record() {
129     };
~CGtfReadRecord()130     ~CGtfReadRecord() {};
131 
132     const CGtfAttributes&
GtfAttributes() const133     GtfAttributes() const
134     {
135         return mAttributes;
136     };
137 
138     string
GeneKey() const139     GeneKey() const
140     {
141         string geneId = mAttributes.ValueOf("gene_id");
142         if (geneId.empty()) {
143             cerr << "Unexpected: GTF feature without a gene_id." << endl;
144         }
145         return geneId;
146     };
147 
148     string
FeatureKey() const149     FeatureKey() const
150     {
151         static unsigned int tidCounter(1);
152         if (Type() == "gene") {
153             return GeneKey();
154         }
155 
156         string transcriptId = mAttributes.ValueOf("transcript_id");
157         if (transcriptId.empty()) {
158             transcriptId = "t" + NStr::IntToString(tidCounter++);
159         }
160         return GeneKey() + "_" + transcriptId;
161     }
162 
163 protected:
164     bool xAssignAttributesFromGff(
165         const string&,
166         const string& );
167 
168     CGtfAttributes mAttributes;
169 };
170 
171 //  ----------------------------------------------------------------------------
172 class NCBI_XOBJREAD_EXPORT CGtfReader
173 //  ----------------------------------------------------------------------------
174     : public CGff2Reader
175 {
176 public:
177     enum EGtfFlags {
178         fGenerateChildXrefs = 1<<8,
179     };
180 
181     CGtfReader(
182         unsigned int =0,
183         const string& = "",
184         const string& = "",
185         SeqIdResolver = CReadUtil::AsSeqId,
186         CReaderListener* = nullptr);
187 
188     CGtfReader(
189         unsigned int,
190         CReaderListener*);
191 
192     virtual ~CGtfReader();
193 
194     CRef< CSeq_annot >
195     ReadSeqAnnot(
196         ILineReader& lr,
197         ILineErrorListener* pErrors=nullptr) override;
198 
199 protected:
200     void xProcessData(
201         const TReaderData&,
202         CSeq_annot&) override;
203 
x_CreateRecord()204     CGff2Record* x_CreateRecord() override { return new CGtfReadRecord(); }
205 
206     bool xUpdateAnnotFeature(
207         const CGff2Record&,
208         CSeq_annot&,
209         ILineErrorListener* =nullptr) override;
210 
211     virtual bool xUpdateAnnotCds(
212         const CGtfReadRecord&,
213         CSeq_annot&);
214 
215     virtual bool xUpdateAnnotTranscript(
216         const CGtfReadRecord&,
217         CSeq_annot&);
218 
219     void xPostProcessAnnot(
220         CSeq_annot&) override;
221 
222     bool xCreateFeatureId(
223         const CGtfReadRecord&,
224         const string&,
225         CSeq_feat&);
226 
227     bool xCreateParentGene(
228         const CGtfReadRecord&,
229         CSeq_annot&);
230 
231     bool xFeatureSetQualifiersGene(
232         const CGtfReadRecord& record,
233         CSeq_feat&);
234 
235     bool xFeatureSetQualifiersRna(
236         const CGtfReadRecord& record,
237         CSeq_feat&);
238 
239     bool xFeatureSetQualifiersCds(
240         const CGtfReadRecord& record,
241         CSeq_feat&);
242 
243     bool xCreateParentCds(
244         const CGtfReadRecord&,
245         CSeq_annot&);
246 
247     bool xCreateParentMrna(
248         const CGtfReadRecord&,
249         CSeq_annot&);
250 
251     bool xFeatureSetDataGene(
252         const CGtfReadRecord&,
253         CSeq_feat&);
254 
255     virtual bool xFeatureSetDataRna(
256         const CGtfReadRecord&,
257         CSeq_feat&,
258         CSeqFeatData::ESubtype );
259 
260     bool xFeatureSetDataMrna(
261         const CGtfReadRecord&,
262         CSeq_feat&);
263 
264     bool xFeatureSetDataCds(
265         const CGtfReadRecord&,
266         CSeq_feat&);
267 
268     bool xFeatureTrimQualifiers(
269         const CGtfReadRecord&,
270         CSeq_feat&);
271 
272     CRef<CSeq_feat> xFindFeatById(
273         const string&);
274 
275     bool xProcessQualifierSpecialCase(
276         const string&,
277         const CGtfAttributes::MultiValue&,
278         CSeq_feat&);
279 
280     void xFeatureAddQualifiers(
281         const string& key,
282         const CGtfAttributes::MultiValue&,
283         CSeq_feat&);
284 
285     void xSetAncestorXrefs(
286         CSeq_feat&,
287         CSeq_feat&) override;
288 
289     unique_ptr<CGtfLocationMerger> mpLocations;
290 };
291 
292 END_SCOPE(objects)
293 END_NCBI_SCOPE
294 
295 #endif // OBJTOOLS_READERS___GTF_READER__HPP
296