1 /*  $Id: bed_reader.hpp 632526 2021-06-02 17:25:01Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig
27  *
28  * File Description:
29  *   BED file reader
30  *
31  */
32 
33 #ifndef OBJTOOLS_READERS___BEDREADER__HPP
34 #define OBJTOOLS_READERS___BEDREADER__HPP
35 
36 #include <corelib/ncbistd.hpp>
37 #include <objects/seq/Seq_annot.hpp>
38 #include <objects/seqloc/Seq_id.hpp>
39 #include <objects/seqloc/Seq_interval.hpp>
40 #include <objects/seqset/Seq_entry.hpp>
41 #include <objtools/readers/reader_base.hpp>
42 BEGIN_NCBI_SCOPE
43 BEGIN_SCOPE(objects)
44 
45 class CLinePreBuffer;
46 class CBedAutoSql;
47 class CBedColumnData;
48 
49 //  ----------------------------------------------------------------------------
50 class NCBI_XOBJREAD_EXPORT CRawBedRecord
51 //  ----------------------------------------------------------------------------
52 {
53 public:
CRawBedRecord()54     CRawBedRecord(): m_score(-1) {};
55 
~CRawBedRecord()56     virtual ~CRawBedRecord() {};
57 
58     void SetInterval(
59         CSeq_id& id,
60         unsigned int start,
61         unsigned int stop,
62         ENa_strand strand);
63 
64     void SetScore(
65         unsigned int score);
66 
67     void Dump(
68         CNcbiOstream& ostr) const;
69 
70 public:
71     CRef<CSeq_interval> m_pInterval;
72     int m_score;
73 };
74 
75 
76 //  ----------------------------------------------------------------------------
77 class NCBI_XOBJREAD_EXPORT CRawBedTrack
78 //  ----------------------------------------------------------------------------
79 {
80 public:
CRawBedTrack()81     CRawBedTrack() {};
~CRawBedTrack()82     ~CRawBedTrack() {};
83 
84 public:
85     void Dump(
86         CNcbiOstream& ostr) const;
87 
Reset()88     void Reset() { m_Records.clear(); };
AddRecord(CRawBedRecord & record)89     void AddRecord(
90         CRawBedRecord& record) { m_Records.push_back(record); };
Records() const91     const vector<CRawBedRecord>& Records() const { return m_Records; };
HasData() const92     bool HasData() const { return (!m_Records.empty()); };
93 
94 public:
95     CRef<CSeq_id> m_pId;
96     vector<CRawBedRecord> m_Records;
97 };
98 
99 
100 //  ----------------------------------------------------------------------------
101 /// CReaderBase implementation that reads BED data files, either a single object
102 /// or all objects found. For the purpose of CBedReader, an object consists of
103 /// a run of records all with the same ID (BED comlumn 1), and all contained
104 /// within a single track.
105 ///
106 class NCBI_XOBJREAD_EXPORT CBedReader
107 //  ----------------------------------------------------------------------------
108     : public CReaderBase
109 {
110     //
111     //  object management:
112     //
113 public:
114     CBedReader(
115         int = fNormal,
116         const string& = "",
117         const string& = "",
118         CReaderListener* = nullptr);
119     virtual ~CBedReader();
120 
121     //
122     //  object interface:
123     //
124 public:
125     enum EBedFlags {
126         fThreeFeatFormat = 1<<8,
127         fDirectedFeatureModel = 1<<9,
128         fAutoSql = 1<<10,
129         fAddDefaultColumns = 1<<11,
130     };
131     typedef int TFlags;
132 
133     /// Read a single object from given line reader containing BED data. The
134     /// resulting Seq-annot will contain a feature table.
135     /// @param lr
136     ///   line reader to read from.
137     /// @param pErrors
138     ///   pointer to optional error container object.
139     ///
140     virtual CRef< CSeq_annot >
141     ReadSeqAnnot(
142         ILineReader& lr,
143         ILineErrorListener* pErrors=0 );
144 
145     virtual bool
146     ReadTrackData(
147         ILineReader&,
148         CRawBedTrack&,
149         ILineErrorListener* =0 );
150 
151     virtual bool
152     SetAutoSql(
153         const string&);
154 
155     virtual bool
156     SetAutoSql(
157         CNcbiIstream&);
158 
159 protected:
160     virtual CRef<CSeq_annot> xCreateSeqAnnot();
161 
162     virtual void xGetData(
163         ILineReader&,
164         TReaderData&);
165 
166     virtual void xProcessData(
167         const TReaderData&,
168         CSeq_annot&);
169 
170     virtual bool xDetermineLikelyColumnCount(
171         CLinePreBuffer&,
172         ILineErrorListener*);
173 
174     virtual bool xParseTrackLine(
175         const string&);
176 
177     bool xParseFeature(
178         const SReaderLine&,
179         CSeq_annot&,
180         ILineErrorListener*);
181 
182     bool xParseFeatureAutoSql(
183         const CBedColumnData&,
184         CSeq_annot&,
185         ILineErrorListener*);
186 
187     bool xParseFeatureUserFormat(
188         const CBedColumnData&,
189         CSeq_annot&,
190         ILineErrorListener*);
191 
192     bool xParseFeatureThreeFeatFormat(
193         const CBedColumnData&,
194         CSeq_annot&,
195         ILineErrorListener*);
196 
197     bool xParseFeatureGeneModelFormat(
198         const CBedColumnData&,
199         CSeq_annot&,
200         ILineErrorListener*);
201 
202     bool xAppendFeatureChrom(
203         const CBedColumnData&,
204         CSeq_annot&,
205         unsigned int,
206         ILineErrorListener*);
207 
208     bool xAppendFeatureThick(
209         const CBedColumnData&,
210         CSeq_annot&,
211         unsigned int,
212         ILineErrorListener*);
213 
214     bool xAppendFeatureBlock(
215         const CBedColumnData&,
216         CSeq_annot&,
217         unsigned int,
218         ILineErrorListener*);
219 
220     CRef<CSeq_feat> xAppendFeatureGene(
221         const CBedColumnData&,
222         CSeq_annot&,
223         unsigned int,
224         ILineErrorListener*);
225 
226     CRef<CSeq_feat> xAppendFeatureRna(
227         const CBedColumnData&,
228         CSeq_annot&,
229         unsigned int,
230         ILineErrorListener*);
231 
232     CRef<CSeq_feat> xAppendFeatureCds(
233         const CBedColumnData&,
234         CSeq_annot&,
235         unsigned int,
236         ILineErrorListener*);
237 
238     void xSetFeatureLocation(
239         CRef<CSeq_feat>&,
240         const CBedColumnData&);
241     void xSetFeatureLocationChrom(
242         CRef<CSeq_feat>&,
243         const CBedColumnData&);
244     void xSetFeatureLocationGene(
245         CRef<CSeq_feat>&,
246         const CBedColumnData&);
247     void xSetFeatureLocationThick(
248         CRef<CSeq_feat>&,
249         const CBedColumnData&);
250     void xSetFeatureLocationCds(
251         CRef<CSeq_feat>&,
252         const CBedColumnData&);
253     void xSetFeatureLocationBlock(
254         CRef<CSeq_feat>&,
255         const CBedColumnData&);
256     void xSetFeatureLocationRna(
257         CRef<CSeq_feat>&,
258         const CBedColumnData&);
259     void xSetFeatureIdsChrom(
260         CRef<CSeq_feat>&,
261         const CBedColumnData&,
262         unsigned int);
263     void xSetFeatureIdsGene(
264         CRef<CSeq_feat>&,
265         const CBedColumnData&,
266         unsigned int);
267     void xSetFeatureIdsThick(
268         CRef<CSeq_feat>&,
269         const CBedColumnData&,
270         unsigned int);
271     void xSetFeatureIdsCds(
272         CRef<CSeq_feat>&,
273         const CBedColumnData&,
274         unsigned int);
275     void xSetFeatureIdsBlock(
276         CRef<CSeq_feat>&,
277         const CBedColumnData&,
278         unsigned int);
279     void xSetFeatureIdsRna(
280         CRef<CSeq_feat>&,
281         const CBedColumnData&,
282         unsigned int);
283     void xSetFeatureBedData(
284         CRef<CSeq_feat>&,
285         const CBedColumnData&,
286         ILineErrorListener*);
287     void xSetFeatureTitle(
288         CRef<CSeq_feat>&,
289         const CBedColumnData&);
290     void xSetFeatureScore(
291         CRef<CUser_object>,
292         const CBedColumnData&);
293     void xSetFeatureColor(
294         CRef<CUser_object>,
295         const CBedColumnData&,
296         ILineErrorListener*);
297 
298     void xSetFeatureColorFromItemRgb(
299         CRef<CUser_object>,
300         const string&,
301         ILineErrorListener*);
302     void xSetFeatureColorFromScore(
303         CRef<CUser_object>,
304         const string&);
305     void xSetFeatureColorByStrand(
306         CRef<CUser_object>,
307         const string&,
308         ENa_strand,
309         ILineErrorListener*);
310     void xSetFeatureColorDefault(
311         CRef<CUser_object>);
312 
313     bool xContainsThickFeature(
314         const CBedColumnData&) const;
315 
316     bool xContainsBlockFeature(
317         const CBedColumnData&) const;
318 
319     bool xContainsRnaFeature(
320         const CBedColumnData&) const;
321 
322     bool xContainsCdsFeature(
323         const CBedColumnData&) const;
324 
325     ENa_strand xGetStrand(
326         const CBedColumnData&) const;
327 
328     virtual void xAssignBedColumnCount(
329         CSeq_annot&);
330 
331     void xSetFeatureDisplayData(
332         CRef<CSeq_feat>&,
333         const CBedColumnData&);
334 
335     virtual void xPostProcessAnnot(
336         CSeq_annot&);
337 
338     bool
339     xReadBedDataRaw(
340         ILineReader&,
341         CRawBedTrack&,
342         ILineErrorListener*);
343 
344     bool
345     xReadBedRecordRaw(
346         const string&,
347         CRawBedRecord&,
348         ILineErrorListener*);
349 
350     static void xCleanColumnValues(
351         vector<string>&);
352 
353     //
354     //  data:
355     //
356 protected:
357     string m_currentId;
358     string mColumnSeparator;
359     NStr::TSplitFlags mColumnSplitFlags;
360     vector<string>::size_type mRealColumnCount;
361     vector<string>::size_type mValidColumnCount;
362     bool mAssumeErrorsAreRecordLevel;
363     unsigned int m_CurrentFeatureCount;
364     bool m_usescore;
365     unsigned int m_CurBatchSize;
366     const unsigned int m_MaxBatchSize;
367     unique_ptr<CLinePreBuffer> mLinePreBuffer;
368 
369     unique_ptr<CBedAutoSql> mpAutoSql;
370 };
371 
372 END_SCOPE(objects)
373 END_NCBI_SCOPE
374 
375 #endif // OBJTOOLS_READERS___BEDREADER__HPP
376