1 /* $Id: bed_reader.hpp 632526 2021-06-02 17:25:01Z ivanov $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Author: Frank Ludwig 27 * 28 * File Description: 29 * BED file reader 30 * 31 */ 32 33 #ifndef OBJTOOLS_READERS___BEDREADER__HPP 34 #define OBJTOOLS_READERS___BEDREADER__HPP 35 36 #include <corelib/ncbistd.hpp> 37 #include <objects/seq/Seq_annot.hpp> 38 #include <objects/seqloc/Seq_id.hpp> 39 #include <objects/seqloc/Seq_interval.hpp> 40 #include <objects/seqset/Seq_entry.hpp> 41 #include <objtools/readers/reader_base.hpp> 42 BEGIN_NCBI_SCOPE 43 BEGIN_SCOPE(objects) 44 45 class CLinePreBuffer; 46 class CBedAutoSql; 47 class CBedColumnData; 48 49 // ---------------------------------------------------------------------------- 50 class NCBI_XOBJREAD_EXPORT CRawBedRecord 51 // ---------------------------------------------------------------------------- 52 { 53 public: CRawBedRecord()54 CRawBedRecord(): m_score(-1) {}; 55 ~CRawBedRecord()56 virtual ~CRawBedRecord() {}; 57 58 void SetInterval( 59 CSeq_id& id, 60 unsigned int start, 61 unsigned int stop, 62 ENa_strand strand); 63 64 void SetScore( 65 unsigned int score); 66 67 void Dump( 68 CNcbiOstream& ostr) const; 69 70 public: 71 CRef<CSeq_interval> m_pInterval; 72 int m_score; 73 }; 74 75 76 // ---------------------------------------------------------------------------- 77 class NCBI_XOBJREAD_EXPORT CRawBedTrack 78 // ---------------------------------------------------------------------------- 79 { 80 public: CRawBedTrack()81 CRawBedTrack() {}; ~CRawBedTrack()82 ~CRawBedTrack() {}; 83 84 public: 85 void Dump( 86 CNcbiOstream& ostr) const; 87 Reset()88 void Reset() { m_Records.clear(); }; AddRecord(CRawBedRecord & record)89 void AddRecord( 90 CRawBedRecord& record) { m_Records.push_back(record); }; Records() const91 const vector<CRawBedRecord>& Records() const { return m_Records; }; HasData() const92 bool HasData() const { return (!m_Records.empty()); }; 93 94 public: 95 CRef<CSeq_id> m_pId; 96 vector<CRawBedRecord> m_Records; 97 }; 98 99 100 // ---------------------------------------------------------------------------- 101 /// CReaderBase implementation that reads BED data files, either a single object 102 /// or all objects found. For the purpose of CBedReader, an object consists of 103 /// a run of records all with the same ID (BED comlumn 1), and all contained 104 /// within a single track. 105 /// 106 class NCBI_XOBJREAD_EXPORT CBedReader 107 // ---------------------------------------------------------------------------- 108 : public CReaderBase 109 { 110 // 111 // object management: 112 // 113 public: 114 CBedReader( 115 int = fNormal, 116 const string& = "", 117 const string& = "", 118 CReaderListener* = nullptr); 119 virtual ~CBedReader(); 120 121 // 122 // object interface: 123 // 124 public: 125 enum EBedFlags { 126 fThreeFeatFormat = 1<<8, 127 fDirectedFeatureModel = 1<<9, 128 fAutoSql = 1<<10, 129 fAddDefaultColumns = 1<<11, 130 }; 131 typedef int TFlags; 132 133 /// Read a single object from given line reader containing BED data. The 134 /// resulting Seq-annot will contain a feature table. 135 /// @param lr 136 /// line reader to read from. 137 /// @param pErrors 138 /// pointer to optional error container object. 139 /// 140 virtual CRef< CSeq_annot > 141 ReadSeqAnnot( 142 ILineReader& lr, 143 ILineErrorListener* pErrors=0 ); 144 145 virtual bool 146 ReadTrackData( 147 ILineReader&, 148 CRawBedTrack&, 149 ILineErrorListener* =0 ); 150 151 virtual bool 152 SetAutoSql( 153 const string&); 154 155 virtual bool 156 SetAutoSql( 157 CNcbiIstream&); 158 159 protected: 160 virtual CRef<CSeq_annot> xCreateSeqAnnot(); 161 162 virtual void xGetData( 163 ILineReader&, 164 TReaderData&); 165 166 virtual void xProcessData( 167 const TReaderData&, 168 CSeq_annot&); 169 170 virtual bool xDetermineLikelyColumnCount( 171 CLinePreBuffer&, 172 ILineErrorListener*); 173 174 virtual bool xParseTrackLine( 175 const string&); 176 177 bool xParseFeature( 178 const SReaderLine&, 179 CSeq_annot&, 180 ILineErrorListener*); 181 182 bool xParseFeatureAutoSql( 183 const CBedColumnData&, 184 CSeq_annot&, 185 ILineErrorListener*); 186 187 bool xParseFeatureUserFormat( 188 const CBedColumnData&, 189 CSeq_annot&, 190 ILineErrorListener*); 191 192 bool xParseFeatureThreeFeatFormat( 193 const CBedColumnData&, 194 CSeq_annot&, 195 ILineErrorListener*); 196 197 bool xParseFeatureGeneModelFormat( 198 const CBedColumnData&, 199 CSeq_annot&, 200 ILineErrorListener*); 201 202 bool xAppendFeatureChrom( 203 const CBedColumnData&, 204 CSeq_annot&, 205 unsigned int, 206 ILineErrorListener*); 207 208 bool xAppendFeatureThick( 209 const CBedColumnData&, 210 CSeq_annot&, 211 unsigned int, 212 ILineErrorListener*); 213 214 bool xAppendFeatureBlock( 215 const CBedColumnData&, 216 CSeq_annot&, 217 unsigned int, 218 ILineErrorListener*); 219 220 CRef<CSeq_feat> xAppendFeatureGene( 221 const CBedColumnData&, 222 CSeq_annot&, 223 unsigned int, 224 ILineErrorListener*); 225 226 CRef<CSeq_feat> xAppendFeatureRna( 227 const CBedColumnData&, 228 CSeq_annot&, 229 unsigned int, 230 ILineErrorListener*); 231 232 CRef<CSeq_feat> xAppendFeatureCds( 233 const CBedColumnData&, 234 CSeq_annot&, 235 unsigned int, 236 ILineErrorListener*); 237 238 void xSetFeatureLocation( 239 CRef<CSeq_feat>&, 240 const CBedColumnData&); 241 void xSetFeatureLocationChrom( 242 CRef<CSeq_feat>&, 243 const CBedColumnData&); 244 void xSetFeatureLocationGene( 245 CRef<CSeq_feat>&, 246 const CBedColumnData&); 247 void xSetFeatureLocationThick( 248 CRef<CSeq_feat>&, 249 const CBedColumnData&); 250 void xSetFeatureLocationCds( 251 CRef<CSeq_feat>&, 252 const CBedColumnData&); 253 void xSetFeatureLocationBlock( 254 CRef<CSeq_feat>&, 255 const CBedColumnData&); 256 void xSetFeatureLocationRna( 257 CRef<CSeq_feat>&, 258 const CBedColumnData&); 259 void xSetFeatureIdsChrom( 260 CRef<CSeq_feat>&, 261 const CBedColumnData&, 262 unsigned int); 263 void xSetFeatureIdsGene( 264 CRef<CSeq_feat>&, 265 const CBedColumnData&, 266 unsigned int); 267 void xSetFeatureIdsThick( 268 CRef<CSeq_feat>&, 269 const CBedColumnData&, 270 unsigned int); 271 void xSetFeatureIdsCds( 272 CRef<CSeq_feat>&, 273 const CBedColumnData&, 274 unsigned int); 275 void xSetFeatureIdsBlock( 276 CRef<CSeq_feat>&, 277 const CBedColumnData&, 278 unsigned int); 279 void xSetFeatureIdsRna( 280 CRef<CSeq_feat>&, 281 const CBedColumnData&, 282 unsigned int); 283 void xSetFeatureBedData( 284 CRef<CSeq_feat>&, 285 const CBedColumnData&, 286 ILineErrorListener*); 287 void xSetFeatureTitle( 288 CRef<CSeq_feat>&, 289 const CBedColumnData&); 290 void xSetFeatureScore( 291 CRef<CUser_object>, 292 const CBedColumnData&); 293 void xSetFeatureColor( 294 CRef<CUser_object>, 295 const CBedColumnData&, 296 ILineErrorListener*); 297 298 void xSetFeatureColorFromItemRgb( 299 CRef<CUser_object>, 300 const string&, 301 ILineErrorListener*); 302 void xSetFeatureColorFromScore( 303 CRef<CUser_object>, 304 const string&); 305 void xSetFeatureColorByStrand( 306 CRef<CUser_object>, 307 const string&, 308 ENa_strand, 309 ILineErrorListener*); 310 void xSetFeatureColorDefault( 311 CRef<CUser_object>); 312 313 bool xContainsThickFeature( 314 const CBedColumnData&) const; 315 316 bool xContainsBlockFeature( 317 const CBedColumnData&) const; 318 319 bool xContainsRnaFeature( 320 const CBedColumnData&) const; 321 322 bool xContainsCdsFeature( 323 const CBedColumnData&) const; 324 325 ENa_strand xGetStrand( 326 const CBedColumnData&) const; 327 328 virtual void xAssignBedColumnCount( 329 CSeq_annot&); 330 331 void xSetFeatureDisplayData( 332 CRef<CSeq_feat>&, 333 const CBedColumnData&); 334 335 virtual void xPostProcessAnnot( 336 CSeq_annot&); 337 338 bool 339 xReadBedDataRaw( 340 ILineReader&, 341 CRawBedTrack&, 342 ILineErrorListener*); 343 344 bool 345 xReadBedRecordRaw( 346 const string&, 347 CRawBedRecord&, 348 ILineErrorListener*); 349 350 static void xCleanColumnValues( 351 vector<string>&); 352 353 // 354 // data: 355 // 356 protected: 357 string m_currentId; 358 string mColumnSeparator; 359 NStr::TSplitFlags mColumnSplitFlags; 360 vector<string>::size_type mRealColumnCount; 361 vector<string>::size_type mValidColumnCount; 362 bool mAssumeErrorsAreRecordLevel; 363 unsigned int m_CurrentFeatureCount; 364 bool m_usescore; 365 unsigned int m_CurBatchSize; 366 const unsigned int m_MaxBatchSize; 367 unique_ptr<CLinePreBuffer> mLinePreBuffer; 368 369 unique_ptr<CBedAutoSql> mpAutoSql; 370 }; 371 372 END_SCOPE(objects) 373 END_NCBI_SCOPE 374 375 #endif // OBJTOOLS_READERS___BEDREADER__HPP 376