1 /*  $Id: microarray_reader.cpp 632526 2021-06-02 17:25:01Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Frank Ludwig
27  *
28  * File Description:
29  *   MicroArray file reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <util/line_reader.hpp>
35 
36 #include <objects/general/Object_id.hpp>
37 #include <objects/general/User_object.hpp>
38 #include <objects/seqloc/Seq_id.hpp>
39 #include <objects/seqloc/Seq_interval.hpp>
40 #include <objects/seqloc/Seq_point.hpp>
41 #include <objects/seq/Seq_annot.hpp>
42 #include <objects/seq/Annotdesc.hpp>
43 #include <objects/seq/Annot_descr.hpp>
44 #include <objects/seqfeat/Seq_feat.hpp>
45 
46 #include <objtools/readers/microarray_reader.hpp>
47 
48 #include "reader_message_handler.hpp"
49 
50 BEGIN_NCBI_SCOPE
51 BEGIN_objects_SCOPE
52 
53 //  ----------------------------------------------------------------------------
CMicroArrayReader(int flags,CReaderListener * pRL)54 CMicroArrayReader::CMicroArrayReader(
55     int flags,
56     CReaderListener* pRL)
57 //  ----------------------------------------------------------------------------
58     : CReaderBase(flags, "", "", CReadUtil::AsSeqId, pRL),
59       m_currentId(""),
60       m_columncount(15),
61       m_usescore(false)
62 {
63     m_iFlags |= fReadAsBed;
64 }
65 
66 //  ----------------------------------------------------------------------------
~CMicroArrayReader()67 CMicroArrayReader::~CMicroArrayReader()
68 //  ----------------------------------------------------------------------------
69 {
70 }
71 
72 //  ----------------------------------------------------------------------------
73 CRef< CSeq_annot >
ReadSeqAnnot(ILineReader & lr,ILineErrorListener * pEC)74 CMicroArrayReader::ReadSeqAnnot(
75     ILineReader& lr,
76     ILineErrorListener* pEC)
77 //  ----------------------------------------------------------------------------
78 {
79     CRef<CSeq_annot> pAnnot = CReaderBase::ReadSeqAnnot(lr, pEC);
80     if (pAnnot) {
81         xAssignTrackData(*pAnnot);
82 
83         if(m_columncount >= 3) {
84             CRef<CUser_object> columnCountUser( new CUser_object() );
85             columnCountUser->SetType().SetStr( "NCBI_BED_COLUMN_COUNT" );
86             columnCountUser->AddField("NCBI_BED_COLUMN_COUNT", int ( m_columncount ) );
87 
88             CRef<CAnnotdesc> userDesc( new CAnnotdesc() );
89             userDesc->SetUser().Assign( *columnCountUser );
90             pAnnot->SetDesc().Set().push_back( userDesc );
91         }
92     }
93     return pAnnot;
94 }
95 
96 //  ----------------------------------------------------------------------------
97 CRef<CSeq_annot>
xCreateSeqAnnot()98 CMicroArrayReader::xCreateSeqAnnot()
99 //  ----------------------------------------------------------------------------
100 {
101     CRef<CSeq_annot> pAnnot = CReaderBase::xCreateSeqAnnot();
102     CRef<CAnnot_descr> desc(new CAnnot_descr);
103     pAnnot->SetDesc(*desc);
104     pAnnot->SetData().SetFtable();
105     return pAnnot;
106 }
107 
108 
109 //  ----------------------------------------------------------------------------
110 void
xProcessData(const TReaderData & readerData,CSeq_annot & annot)111 CMicroArrayReader::xProcessData(
112     const TReaderData& readerData,
113     CSeq_annot& annot)
114 //  ----------------------------------------------------------------------------
115 {
116     for (const auto& lineInfo: readerData) {
117         const auto& line = lineInfo.mData;
118         if (xParseBrowserLine(line, annot)) {
119             return;
120         }
121         if (xProcessTrackLine(line)) {
122             return;
123         }
124         xProcessFeature(line, annot);
125     }
126 }
127 
128 //  ----------------------------------------------------------------------------
129 void
xGetData(ILineReader & lr,TReaderData & readerData)130 CMicroArrayReader::xGetData(
131     ILineReader& lr,
132     TReaderData& readerData)
133 //  ----------------------------------------------------------------------------
134 {
135     const int MAX_RECORDS = 100000;
136 
137     readerData.clear();
138     if (m_uDataCount == MAX_RECORDS) {
139         m_uDataCount = 0;
140         m_currentId.clear();
141         return;
142     }
143 
144     string line, head, tail;
145     if (!xGetLine( lr, line)) {
146         return;
147     }
148     if (xIsTrackLine(line)) {
149         if (!m_currentId.empty()) {
150             xUngetLine(lr);
151             m_uDataCount = 0;
152             m_currentId.clear();
153             return;
154         }
155         else {
156             readerData.push_back(TReaderLine{m_uLineNumber, line});
157             ++m_uDataCount;
158             return;
159         }
160     }
161 
162     NStr::SplitInTwo(line, "\t", head, tail);
163     if (!m_currentId.empty()  &&  head != m_currentId) {
164         xUngetLine(lr);
165         m_uDataCount = 0;
166         m_currentId.clear();
167         return;
168     }
169     readerData.push_back(TReaderLine{m_uLineNumber, line});
170     if (m_currentId.empty()) {
171         m_currentId = head;
172     }
173     ++m_uDataCount;
174 }
175 
176 //  ----------------------------------------------------------------------------
xProcessFeature(const string & line,CSeq_annot & annot)177 bool CMicroArrayReader::xProcessFeature(
178     const string& line,
179     CSeq_annot& annot)
180 //  ----------------------------------------------------------------------------
181 {
182     const size_t COLUMNCOUNT = 15;
183 
184     vector<string> fields;
185     NStr::Split(line, " \t", fields, NStr::fSplit_MergeDelimiters);
186     xCleanColumnValues(fields);
187     if (fields.size() != COLUMNCOUNT) {
188         CReaderMessage error(
189             eDiag_Error,
190             m_uLineNumber,
191             "Feature Processing: Bad column count. Should be 15." );
192         throw(error);
193     }
194 
195     CRef<CSeq_feat> feature;
196     feature.Reset(new CSeq_feat);
197     xSetFeatureLocation(feature, fields);
198     xSetFeatureDisplayData(feature, fields);
199     annot.SetData().SetFtable().push_back(feature);
200     return true;
201 }
202 
203 //  ----------------------------------------------------------------------------
xSetFeatureLocation(CRef<CSeq_feat> & feature,const vector<string> & fields)204 void CMicroArrayReader::xSetFeatureLocation(
205     CRef<CSeq_feat>& feature,
206     const vector<string>& fields )
207 //  ----------------------------------------------------------------------------
208 {
209     feature->ResetLocation();
210 
211     CRef<CSeq_id> id( new CSeq_id() );
212     id->SetLocal().SetStr( fields[0] );
213 
214     CRef<CSeq_loc> location( new CSeq_loc );
215     CSeq_interval& interval = location->SetInt();
216     interval.SetFrom( NStr::StringToInt( fields[1] ) );
217     interval.SetTo( NStr::StringToInt( fields[2] ) - 1 );
218     interval.SetStrand(
219         ( fields[5] == "+" ) ? eNa_strand_plus : eNa_strand_minus );
220     location->SetId( *id );
221 
222     feature->SetLocation( *location );
223 }
224 
225 //  ----------------------------------------------------------------------------
xSetFeatureDisplayData(CRef<CSeq_feat> & feature,const vector<string> & fields)226 void CMicroArrayReader::xSetFeatureDisplayData(
227     CRef<CSeq_feat>& feature,
228     const vector<string>& fields )
229 //  ----------------------------------------------------------------------------
230 {
231     CRef<CUser_object> display_data( new CUser_object );
232     display_data->SetType().SetStr( "Display Data" );
233 
234     display_data->AddField( "name", fields[3] );
235     if ( !m_usescore ) {
236         display_data->AddField( "score", NStr::StringToInt(fields[4]) );
237     }
238     else {
239         display_data->AddField( "greylevel", NStr::StringToInt(fields[4]) );
240     }
241     display_data->AddField( "thickStart", NStr::StringToInt(fields[6]) );
242     display_data->AddField( "thickEnd", NStr::StringToInt(fields[7]) - 1 );
243     display_data->AddField( "itemRGB", NStr::StringToInt(fields[8]) );
244     display_data->AddField( "blockCount", NStr::StringToInt(fields[9]) );
245     display_data->AddField( "blockSizes", fields[10] );
246     display_data->AddField( "blockStarts", fields[11] );
247 
248     if ( !(m_iFlags & fReadAsBed) ) {
249         if ( fields.size() >= 13 ) {
250             display_data->AddField( "expCount", NStr::StringToInt(fields[12]) );
251         }
252         if ( fields.size() >= 14 ) {
253             display_data->AddField( "expIds", fields[13] );
254         }
255         if ( fields.size() >= 15 ) {
256             display_data->AddField( "expStep", NStr::StringToInt(fields[14]) );
257         }
258     }
259 
260     feature->SetData().SetUser( *display_data );
261 }
262 
263 //  ----------------------------------------------------------------------------
xProcessTrackLine(const string & strLine)264 bool CMicroArrayReader::xProcessTrackLine(
265     const string& strLine)
266 //  ----------------------------------------------------------------------------
267 {
268     m_strExpNames = "";
269     m_iExpScale = -1;
270     m_iExpStep = -1;
271 
272     if (!CReaderBase::xParseTrackLine(strLine)) {
273         return false;
274     }
275     if ( m_iFlags & fReadAsBed ) {
276         return true;
277     }
278 
279     if ( m_strExpNames.empty() ) {
280         CReaderMessage error(
281             eDiag_Warning,
282             m_uLineNumber,
283             "Track Line Processing: Missing \"expName\" parameter.");
284         m_pMessageHandler->Report(error);
285     }
286     if ( m_iExpScale == -1 ) {
287          CReaderMessage error(
288             eDiag_Warning,
289             m_uLineNumber,
290             "Track Line Processing: Missing \"expScale\" parameter." );
291         m_pMessageHandler->Report(error);
292     }
293     if ( m_iExpStep == -1 ) {
294          CReaderMessage error(
295             eDiag_Warning,
296             m_uLineNumber,
297             "Track Line Processing: Missing \"expStep\" parameter." );
298         m_pMessageHandler->Report(error);
299     }
300 
301     return true;
302 }
303 
304 //  ----------------------------------------------------------------------------
305 void
xCleanColumnValues(vector<string> & columns)306 CMicroArrayReader::xCleanColumnValues(
307    vector<string>& columns)
308 //  ----------------------------------------------------------------------------
309 {
310     string fixup;
311     auto columnCount = columns.size();
312 
313     if (columnCount <= 1) {
314         return;
315     }
316     if (NStr::EqualNocase(columns[0], "chr")) {
317         columns[1] = columns[0] + columns[1];
318         columns.erase(columns.begin());
319     }
320 
321     if (columnCount <= 2) {
322         return;
323     }
324     try {
325         NStr::Replace(columns[1], ",", "", fixup);
326         columns[1] = fixup;
327     }
328     catch (CException&) {
329         CReaderMessage error(
330             eDiag_Error,
331             0,
332             "Bad data line: Invalid \"SeqStart\" (column 2) value." );
333         throw(error);
334     }
335 
336     if (columnCount <= 3) {
337         return;
338     }
339     try {
340         NStr::Replace(columns[2], ",", "", fixup);
341         columns[2] = fixup;
342     }
343     catch (CException&) {
344         CReaderMessage error(
345             eDiag_Error,
346             0,
347             "Bad data line: Invalid \"SeqStop\" (column 3) value." );
348         throw(error);
349     }
350 }
351 
352 END_objects_SCOPE
353 END_NCBI_SCOPE
354