1 /* $Id: gtf_reader.hpp 632526 2021-06-02 17:25:01Z ivanov $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Author: Frank Ludwig 27 * 28 * File Description: 29 * BED file reader 30 * 31 */ 32 33 #ifndef OBJTOOLS_READERS___GTF_READER__HPP 34 #define OBJTOOLS_READERS___GTF_READER__HPP 35 36 #include <corelib/ncbistd.hpp> 37 #include <objtools/readers/gff2_reader.hpp> 38 39 BEGIN_NCBI_SCOPE 40 41 BEGIN_SCOPE(objects) // namespace ncbi::objects:: 42 43 class CGtfLocationMerger; 44 45 // ============================================================================ 46 class CGtfAttributes 47 // ============================================================================ 48 { 49 public: 50 using MultiValue = vector<string>; 51 using MultiAttributes = map<string, MultiValue>; 52 53 const MultiAttributes& Get() const54 Get() const 55 { 56 return mAttributes; 57 }; 58 59 string ValueOf(const string & key) const60 ValueOf( 61 const string& key) const 62 { 63 MultiValue values; 64 GetValues(key, values); 65 if (values.size() == 1) { 66 return values.front(); 67 } 68 return ""; 69 } 70 71 bool HasValue(const string & key,const string & value="") const72 HasValue( 73 const string& key, 74 const string& value = "") const 75 { 76 auto it = mAttributes.find(key); 77 if (it == mAttributes.end()) { 78 return false; 79 } 80 const auto& values = it->second; 81 if (values.empty()) { 82 return false; 83 } 84 if (value.empty()) { 85 return true; 86 } 87 return (find(values.begin(), values.end(), value) != values.end()); 88 }; 89 90 void GetValues(const string & key,MultiValue & values) const91 GetValues( 92 const string& key, 93 MultiValue& values) const 94 { 95 const MultiValue empty; 96 values = empty; 97 auto it = mAttributes.find(key); 98 if (it != mAttributes.end()) { 99 values = it->second; 100 } 101 }; 102 103 void AddValue(const string & key,const string & value)104 AddValue( 105 const string& key, 106 const string& value) 107 { 108 auto kit = mAttributes.find(key); 109 if (kit == mAttributes.end()) { 110 kit = mAttributes.insert(make_pair(key, MultiValue())).first; 111 } 112 auto vit = find(kit->second.begin(), kit->second.end(), value); 113 if (vit == kit->second.end()) { 114 kit->second.push_back(value); 115 } 116 }; 117 118 protected: 119 MultiAttributes mAttributes; 120 }; 121 122 // ============================================================================ 123 class CGtfReadRecord 124 // ============================================================================ 125 : public CGff2Record 126 { 127 public: CGtfReadRecord()128 CGtfReadRecord(): CGff2Record() { 129 }; ~CGtfReadRecord()130 ~CGtfReadRecord() {}; 131 132 const CGtfAttributes& GtfAttributes() const133 GtfAttributes() const 134 { 135 return mAttributes; 136 }; 137 138 string GeneKey() const139 GeneKey() const 140 { 141 string geneId = mAttributes.ValueOf("gene_id"); 142 if (geneId.empty()) { 143 cerr << "Unexpected: GTF feature without a gene_id." << endl; 144 } 145 return geneId; 146 }; 147 148 string FeatureKey() const149 FeatureKey() const 150 { 151 static unsigned int tidCounter(1); 152 if (Type() == "gene") { 153 return GeneKey(); 154 } 155 156 string transcriptId = mAttributes.ValueOf("transcript_id"); 157 if (transcriptId.empty()) { 158 transcriptId = "t" + NStr::IntToString(tidCounter++); 159 } 160 return GeneKey() + "_" + transcriptId; 161 } 162 163 protected: 164 bool xAssignAttributesFromGff( 165 const string&, 166 const string& ); 167 168 CGtfAttributes mAttributes; 169 }; 170 171 // ---------------------------------------------------------------------------- 172 class NCBI_XOBJREAD_EXPORT CGtfReader 173 // ---------------------------------------------------------------------------- 174 : public CGff2Reader 175 { 176 public: 177 enum EGtfFlags { 178 fGenerateChildXrefs = 1<<8, 179 }; 180 181 CGtfReader( 182 unsigned int =0, 183 const string& = "", 184 const string& = "", 185 SeqIdResolver = CReadUtil::AsSeqId, 186 CReaderListener* = nullptr); 187 188 CGtfReader( 189 unsigned int, 190 CReaderListener*); 191 192 virtual ~CGtfReader(); 193 194 CRef< CSeq_annot > 195 ReadSeqAnnot( 196 ILineReader& lr, 197 ILineErrorListener* pErrors=nullptr) override; 198 199 protected: 200 void xProcessData( 201 const TReaderData&, 202 CSeq_annot&) override; 203 x_CreateRecord()204 CGff2Record* x_CreateRecord() override { return new CGtfReadRecord(); } 205 206 bool xUpdateAnnotFeature( 207 const CGff2Record&, 208 CSeq_annot&, 209 ILineErrorListener* =nullptr) override; 210 211 virtual bool xUpdateAnnotCds( 212 const CGtfReadRecord&, 213 CSeq_annot&); 214 215 virtual bool xUpdateAnnotTranscript( 216 const CGtfReadRecord&, 217 CSeq_annot&); 218 219 void xPostProcessAnnot( 220 CSeq_annot&) override; 221 222 bool xCreateFeatureId( 223 const CGtfReadRecord&, 224 const string&, 225 CSeq_feat&); 226 227 bool xCreateParentGene( 228 const CGtfReadRecord&, 229 CSeq_annot&); 230 231 bool xFeatureSetQualifiersGene( 232 const CGtfReadRecord& record, 233 CSeq_feat&); 234 235 bool xFeatureSetQualifiersRna( 236 const CGtfReadRecord& record, 237 CSeq_feat&); 238 239 bool xFeatureSetQualifiersCds( 240 const CGtfReadRecord& record, 241 CSeq_feat&); 242 243 bool xCreateParentCds( 244 const CGtfReadRecord&, 245 CSeq_annot&); 246 247 bool xCreateParentMrna( 248 const CGtfReadRecord&, 249 CSeq_annot&); 250 251 bool xFeatureSetDataGene( 252 const CGtfReadRecord&, 253 CSeq_feat&); 254 255 virtual bool xFeatureSetDataRna( 256 const CGtfReadRecord&, 257 CSeq_feat&, 258 CSeqFeatData::ESubtype ); 259 260 bool xFeatureSetDataMrna( 261 const CGtfReadRecord&, 262 CSeq_feat&); 263 264 bool xFeatureSetDataCds( 265 const CGtfReadRecord&, 266 CSeq_feat&); 267 268 bool xFeatureTrimQualifiers( 269 const CGtfReadRecord&, 270 CSeq_feat&); 271 272 CRef<CSeq_feat> xFindFeatById( 273 const string&); 274 275 bool xProcessQualifierSpecialCase( 276 const string&, 277 const CGtfAttributes::MultiValue&, 278 CSeq_feat&); 279 280 void xFeatureAddQualifiers( 281 const string& key, 282 const CGtfAttributes::MultiValue&, 283 CSeq_feat&); 284 285 void xSetAncestorXrefs( 286 CSeq_feat&, 287 CSeq_feat&) override; 288 289 unique_ptr<CGtfLocationMerger> mpLocations; 290 }; 291 292 END_SCOPE(objects) 293 END_NCBI_SCOPE 294 295 #endif // OBJTOOLS_READERS___GTF_READER__HPP 296