1 /* $Id: microarray_reader.cpp 632526 2021-06-02 17:25:01Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Frank Ludwig
27 *
28 * File Description:
29 * MicroArray file reader
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <util/line_reader.hpp>
35
36 #include <objects/general/Object_id.hpp>
37 #include <objects/general/User_object.hpp>
38 #include <objects/seqloc/Seq_id.hpp>
39 #include <objects/seqloc/Seq_interval.hpp>
40 #include <objects/seqloc/Seq_point.hpp>
41 #include <objects/seq/Seq_annot.hpp>
42 #include <objects/seq/Annotdesc.hpp>
43 #include <objects/seq/Annot_descr.hpp>
44 #include <objects/seqfeat/Seq_feat.hpp>
45
46 #include <objtools/readers/microarray_reader.hpp>
47
48 #include "reader_message_handler.hpp"
49
50 BEGIN_NCBI_SCOPE
51 BEGIN_objects_SCOPE
52
53 // ----------------------------------------------------------------------------
CMicroArrayReader(int flags,CReaderListener * pRL)54 CMicroArrayReader::CMicroArrayReader(
55 int flags,
56 CReaderListener* pRL)
57 // ----------------------------------------------------------------------------
58 : CReaderBase(flags, "", "", CReadUtil::AsSeqId, pRL),
59 m_currentId(""),
60 m_columncount(15),
61 m_usescore(false)
62 {
63 m_iFlags |= fReadAsBed;
64 }
65
66 // ----------------------------------------------------------------------------
~CMicroArrayReader()67 CMicroArrayReader::~CMicroArrayReader()
68 // ----------------------------------------------------------------------------
69 {
70 }
71
72 // ----------------------------------------------------------------------------
73 CRef< CSeq_annot >
ReadSeqAnnot(ILineReader & lr,ILineErrorListener * pEC)74 CMicroArrayReader::ReadSeqAnnot(
75 ILineReader& lr,
76 ILineErrorListener* pEC)
77 // ----------------------------------------------------------------------------
78 {
79 CRef<CSeq_annot> pAnnot = CReaderBase::ReadSeqAnnot(lr, pEC);
80 if (pAnnot) {
81 xAssignTrackData(*pAnnot);
82
83 if(m_columncount >= 3) {
84 CRef<CUser_object> columnCountUser( new CUser_object() );
85 columnCountUser->SetType().SetStr( "NCBI_BED_COLUMN_COUNT" );
86 columnCountUser->AddField("NCBI_BED_COLUMN_COUNT", int ( m_columncount ) );
87
88 CRef<CAnnotdesc> userDesc( new CAnnotdesc() );
89 userDesc->SetUser().Assign( *columnCountUser );
90 pAnnot->SetDesc().Set().push_back( userDesc );
91 }
92 }
93 return pAnnot;
94 }
95
96 // ----------------------------------------------------------------------------
97 CRef<CSeq_annot>
xCreateSeqAnnot()98 CMicroArrayReader::xCreateSeqAnnot()
99 // ----------------------------------------------------------------------------
100 {
101 CRef<CSeq_annot> pAnnot = CReaderBase::xCreateSeqAnnot();
102 CRef<CAnnot_descr> desc(new CAnnot_descr);
103 pAnnot->SetDesc(*desc);
104 pAnnot->SetData().SetFtable();
105 return pAnnot;
106 }
107
108
109 // ----------------------------------------------------------------------------
110 void
xProcessData(const TReaderData & readerData,CSeq_annot & annot)111 CMicroArrayReader::xProcessData(
112 const TReaderData& readerData,
113 CSeq_annot& annot)
114 // ----------------------------------------------------------------------------
115 {
116 for (const auto& lineInfo: readerData) {
117 const auto& line = lineInfo.mData;
118 if (xParseBrowserLine(line, annot)) {
119 return;
120 }
121 if (xProcessTrackLine(line)) {
122 return;
123 }
124 xProcessFeature(line, annot);
125 }
126 }
127
128 // ----------------------------------------------------------------------------
129 void
xGetData(ILineReader & lr,TReaderData & readerData)130 CMicroArrayReader::xGetData(
131 ILineReader& lr,
132 TReaderData& readerData)
133 // ----------------------------------------------------------------------------
134 {
135 const int MAX_RECORDS = 100000;
136
137 readerData.clear();
138 if (m_uDataCount == MAX_RECORDS) {
139 m_uDataCount = 0;
140 m_currentId.clear();
141 return;
142 }
143
144 string line, head, tail;
145 if (!xGetLine( lr, line)) {
146 return;
147 }
148 if (xIsTrackLine(line)) {
149 if (!m_currentId.empty()) {
150 xUngetLine(lr);
151 m_uDataCount = 0;
152 m_currentId.clear();
153 return;
154 }
155 else {
156 readerData.push_back(TReaderLine{m_uLineNumber, line});
157 ++m_uDataCount;
158 return;
159 }
160 }
161
162 NStr::SplitInTwo(line, "\t", head, tail);
163 if (!m_currentId.empty() && head != m_currentId) {
164 xUngetLine(lr);
165 m_uDataCount = 0;
166 m_currentId.clear();
167 return;
168 }
169 readerData.push_back(TReaderLine{m_uLineNumber, line});
170 if (m_currentId.empty()) {
171 m_currentId = head;
172 }
173 ++m_uDataCount;
174 }
175
176 // ----------------------------------------------------------------------------
xProcessFeature(const string & line,CSeq_annot & annot)177 bool CMicroArrayReader::xProcessFeature(
178 const string& line,
179 CSeq_annot& annot)
180 // ----------------------------------------------------------------------------
181 {
182 const size_t COLUMNCOUNT = 15;
183
184 vector<string> fields;
185 NStr::Split(line, " \t", fields, NStr::fSplit_MergeDelimiters);
186 xCleanColumnValues(fields);
187 if (fields.size() != COLUMNCOUNT) {
188 CReaderMessage error(
189 eDiag_Error,
190 m_uLineNumber,
191 "Feature Processing: Bad column count. Should be 15." );
192 throw(error);
193 }
194
195 CRef<CSeq_feat> feature;
196 feature.Reset(new CSeq_feat);
197 xSetFeatureLocation(feature, fields);
198 xSetFeatureDisplayData(feature, fields);
199 annot.SetData().SetFtable().push_back(feature);
200 return true;
201 }
202
203 // ----------------------------------------------------------------------------
xSetFeatureLocation(CRef<CSeq_feat> & feature,const vector<string> & fields)204 void CMicroArrayReader::xSetFeatureLocation(
205 CRef<CSeq_feat>& feature,
206 const vector<string>& fields )
207 // ----------------------------------------------------------------------------
208 {
209 feature->ResetLocation();
210
211 CRef<CSeq_id> id( new CSeq_id() );
212 id->SetLocal().SetStr( fields[0] );
213
214 CRef<CSeq_loc> location( new CSeq_loc );
215 CSeq_interval& interval = location->SetInt();
216 interval.SetFrom( NStr::StringToInt( fields[1] ) );
217 interval.SetTo( NStr::StringToInt( fields[2] ) - 1 );
218 interval.SetStrand(
219 ( fields[5] == "+" ) ? eNa_strand_plus : eNa_strand_minus );
220 location->SetId( *id );
221
222 feature->SetLocation( *location );
223 }
224
225 // ----------------------------------------------------------------------------
xSetFeatureDisplayData(CRef<CSeq_feat> & feature,const vector<string> & fields)226 void CMicroArrayReader::xSetFeatureDisplayData(
227 CRef<CSeq_feat>& feature,
228 const vector<string>& fields )
229 // ----------------------------------------------------------------------------
230 {
231 CRef<CUser_object> display_data( new CUser_object );
232 display_data->SetType().SetStr( "Display Data" );
233
234 display_data->AddField( "name", fields[3] );
235 if ( !m_usescore ) {
236 display_data->AddField( "score", NStr::StringToInt(fields[4]) );
237 }
238 else {
239 display_data->AddField( "greylevel", NStr::StringToInt(fields[4]) );
240 }
241 display_data->AddField( "thickStart", NStr::StringToInt(fields[6]) );
242 display_data->AddField( "thickEnd", NStr::StringToInt(fields[7]) - 1 );
243 display_data->AddField( "itemRGB", NStr::StringToInt(fields[8]) );
244 display_data->AddField( "blockCount", NStr::StringToInt(fields[9]) );
245 display_data->AddField( "blockSizes", fields[10] );
246 display_data->AddField( "blockStarts", fields[11] );
247
248 if ( !(m_iFlags & fReadAsBed) ) {
249 if ( fields.size() >= 13 ) {
250 display_data->AddField( "expCount", NStr::StringToInt(fields[12]) );
251 }
252 if ( fields.size() >= 14 ) {
253 display_data->AddField( "expIds", fields[13] );
254 }
255 if ( fields.size() >= 15 ) {
256 display_data->AddField( "expStep", NStr::StringToInt(fields[14]) );
257 }
258 }
259
260 feature->SetData().SetUser( *display_data );
261 }
262
263 // ----------------------------------------------------------------------------
xProcessTrackLine(const string & strLine)264 bool CMicroArrayReader::xProcessTrackLine(
265 const string& strLine)
266 // ----------------------------------------------------------------------------
267 {
268 m_strExpNames = "";
269 m_iExpScale = -1;
270 m_iExpStep = -1;
271
272 if (!CReaderBase::xParseTrackLine(strLine)) {
273 return false;
274 }
275 if ( m_iFlags & fReadAsBed ) {
276 return true;
277 }
278
279 if ( m_strExpNames.empty() ) {
280 CReaderMessage error(
281 eDiag_Warning,
282 m_uLineNumber,
283 "Track Line Processing: Missing \"expName\" parameter.");
284 m_pMessageHandler->Report(error);
285 }
286 if ( m_iExpScale == -1 ) {
287 CReaderMessage error(
288 eDiag_Warning,
289 m_uLineNumber,
290 "Track Line Processing: Missing \"expScale\" parameter." );
291 m_pMessageHandler->Report(error);
292 }
293 if ( m_iExpStep == -1 ) {
294 CReaderMessage error(
295 eDiag_Warning,
296 m_uLineNumber,
297 "Track Line Processing: Missing \"expStep\" parameter." );
298 m_pMessageHandler->Report(error);
299 }
300
301 return true;
302 }
303
304 // ----------------------------------------------------------------------------
305 void
xCleanColumnValues(vector<string> & columns)306 CMicroArrayReader::xCleanColumnValues(
307 vector<string>& columns)
308 // ----------------------------------------------------------------------------
309 {
310 string fixup;
311 auto columnCount = columns.size();
312
313 if (columnCount <= 1) {
314 return;
315 }
316 if (NStr::EqualNocase(columns[0], "chr")) {
317 columns[1] = columns[0] + columns[1];
318 columns.erase(columns.begin());
319 }
320
321 if (columnCount <= 2) {
322 return;
323 }
324 try {
325 NStr::Replace(columns[1], ",", "", fixup);
326 columns[1] = fixup;
327 }
328 catch (CException&) {
329 CReaderMessage error(
330 eDiag_Error,
331 0,
332 "Bad data line: Invalid \"SeqStart\" (column 2) value." );
333 throw(error);
334 }
335
336 if (columnCount <= 3) {
337 return;
338 }
339 try {
340 NStr::Replace(columns[2], ",", "", fixup);
341 columns[2] = fixup;
342 }
343 catch (CException&) {
344 CReaderMessage error(
345 eDiag_Error,
346 0,
347 "Bad data line: Invalid \"SeqStop\" (column 3) value." );
348 throw(error);
349 }
350 }
351
352 END_objects_SCOPE
353 END_NCBI_SCOPE
354