1 /* $Id: tab_table_reader.cpp 548810 2017-10-18 13:38:41Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Sergiy Gotvyanskyy, NCBI
27 *
28 * File Description:
29 * Reader for structured comments for sequences
30 *
31 * ===========================================================================
32 */
33
34 #include <ncbi_pch.hpp>
35
36 #include <util/line_reader.hpp>
37
38 #include "tab_table_reader.hpp"
39
40 #include <misc/xmlwrapp/xmlwrapp.hpp>
41 #include "col_validator.hpp"
42
43 #include <common/test_assert.h> /* This header must go last */
44
45 BEGIN_NCBI_SCOPE
46 USING_SCOPE(objects);
47
_Validate(int col_number,const CTempString & value)48 bool CTabDelimitedValidator::_Validate(int col_number, const CTempString& value)
49 {
50 const string& datatype = m_col_defs[col_number];
51 if (datatype.empty())
52 return false;
53
54 string error;
55 bool isfatal = CColumnValidatorRegistry::GetInstance().
56 DoValidate(datatype, value, error);
57
58 if (!error.empty())
59 _ReportError(col_number, error, datatype);
60
61 return isfatal;
62 }
63
_ReportWarning(int col_number,const CTempString & warning,const CTempString & colname)64 void CTabDelimitedValidator::_ReportWarning(int col_number, const CTempString& warning, const CTempString& colname)
65 {
66 _ReportError(col_number, warning, colname, true);
67 }
68
_ReportError(int col_number,const CTempString & error,const CTempString & colname,bool warning)69 void CTabDelimitedValidator::_ReportError(int col_number, const CTempString& error, const CTempString& colname, bool warning)
70 {
71 CTabDelimitedValidatorMessage rec;
72 rec.m_col = col_number;
73 rec.m_row = m_current_row_number;
74 rec.m_msg = error;
75 rec.m_warning = warning;
76 rec.m_colname = colname;
77 m_errors.push_back(rec);
78 }
79
_CheckHeader(const string & discouraged,const vector<string> & require_one)80 bool CTabDelimitedValidator::_CheckHeader(const string& discouraged, const vector<string>& require_one)
81 {
82 bool columns_ok = true;
83 vector<string> discouraged_cols;
84 if (!discouraged.empty())
85 {
86 NStr::Split(discouraged, ",", discouraged_cols);
87 }
88
89 ITERATE(std::vector<std::string>, it, m_col_defs)
90 {
91 if (CColumnValidator::IsDiscouraged(*it) ||
92 find(discouraged_cols.begin(), discouraged_cols.end(), *it) != discouraged_cols.end())
93 {
94 int id = it - m_col_defs.begin();
95 _ReportError(id, "Column is discouraged", *it);
96 m_ignored_cols.resize(m_col_defs.size());
97 m_ignored_cols[id] = true;
98 }
99 }
100
101 m_require_one_cols.resize(require_one.size());
102 ITERATE(vector<string>, it, require_one)
103 {
104 set<string>& require_one_cols = m_require_one_cols[it - require_one.begin()];
105 vector<string> cols;
106 NStr::Split(*it, ",", cols);
107 int found(0);
108 ITERATE(vector<string>, it_col, cols)
109 {
110 require_one_cols.insert(*it_col);
111 if (find(m_col_defs.begin(), m_col_defs.end(), *it_col) != m_col_defs.end())
112 {
113 found++;
114 }
115 }
116 if (found == 0)
117 {
118 _ReportError(0, "Not found any of require-one columns", *it);
119 }
120 }
121
122 return columns_ok;
123 }
124
ValidateInput(ILineReader & reader,const string & default_columns,const string & required,const string & ignored,const string & unique,const string & discouraged,const vector<string> & require_one)125 void CTabDelimitedValidator::ValidateInput(ILineReader& reader,
126 const string& default_columns,
127 const string& required, const string& ignored,
128 const string& unique, const string& discouraged,
129 const vector<string>& require_one)
130 {
131 m_current_row_number = 0;
132 m_delim = (m_flags & e_tab_comma_delim) ? "," : "\t";
133
134 if (_ProcessHeader(reader, default_columns) && _CheckHeader(discouraged, require_one))
135 {
136 // preprocess headers & required & ignored
137 if (_MakeColumns("Required", required, m_required_cols) &&
138 _MakeColumns("Ignored", ignored, m_ignored_cols) &&
139 _MakeColumns("Unique", unique, m_unique_cols))
140 {
141 if (!unique.empty())
142 m_unique_values.resize(m_col_defs.size());
143
144 bool ignore_unknown = (m_flags & e_tab_ignore_unknown_types) == e_tab_ignore_unknown_types;
145 map<string, int> types;
146 bool fatal = false;
147 for(size_t i=0; i<m_col_defs.size(); ++i)
148 {
149 if (m_ignored_cols[i])
150 continue;
151
152 if (!CColumnValidatorRegistry::GetInstance().IsSupported(m_col_defs[i]))
153 {
154 _ReportError(i, "Datatype is not supported", m_col_defs[i], ignore_unknown);
155 if (ignore_unknown)
156 {
157 m_ignored_cols[i] = true;
158 continue;
159 }
160 else
161 fatal = true;
162 }
163
164 int count = ++types[m_col_defs[i]];
165 if (count == 2)
166 {
167 // report only first occurance
168 _ReportError(i, "Column is not unique", m_col_defs[i]);
169 fatal = true;
170 }
171
172 }
173 if (!fatal)
174 _OperateRows(reader);
175 }
176 }
177 }
178
_ReportTab(CNcbiOstream * out_stream)179 void CTabDelimitedValidator::_ReportTab(CNcbiOstream* out_stream)
180 {
181 if (!m_errors.empty())
182 *out_stream << "Row\tColumn\tError\tWarning" << endl;
183
184 ITERATE(list<CTabDelimitedValidatorMessage>, it, m_errors)
185 {
186 *out_stream << it->m_row << "\t" << it->m_col + 1 << "\t"
187 << it->m_colname << "\t"
188 << (it->m_warning?"\t":it->m_msg.c_str())
189 << (it->m_warning?it->m_msg.c_str(): "\t")
190 << endl;
191 }
192 }
193
_ReportXML(CNcbiOstream * out_stream,bool no_headers)194 void CTabDelimitedValidator::_ReportXML(CNcbiOstream* out_stream, bool no_headers)
195 {
196 if (m_errors.empty())
197 return;
198
199 xml::document xmldoc("tab_delimited_validator");
200 xml::node& root = xmldoc.get_root_node();
201
202 int i=0;
203 ITERATE(list<CTabDelimitedValidatorMessage>, it, m_errors)
204 {
205 xml::node new_node(it->m_warning?"warning":"error");
206 new_node.get_attributes().insert("row", NStr::IntToString(it->m_row).c_str());
207 new_node.get_attributes().insert("column", NStr::IntToString(it->m_col + 1).c_str());
208 new_node.get_attributes().insert("message", it->m_msg.c_str());
209 if (!it->m_colname.empty())
210 {
211 new_node.get_attributes().insert("colname", it->m_colname.c_str());
212 }
213 // will be handled correctly
214 root.insert(new_node);
215
216 i++;
217 if (i>100)
218 break;
219 }
220
221 xmldoc.set_is_standalone(true);
222 xmldoc.set_encoding("utf-8");
223 *out_stream << xmldoc;
224
225 }
226
_MakeColumns(const string & message,const CTempString & columns,vector<bool> & col_defs)227 bool CTabDelimitedValidator::_MakeColumns(const string& message, const CTempString& columns, vector<bool>& col_defs)
228 {
229 col_defs.resize(m_col_defs.size(), false); // all values are not required
230 vector<CTempStringEx> names;
231 NStr::Split(columns, ",", names);
232 bool can_process = true;
233 for (size_t i=0; i<names.size(); i++)
234 {
235 int index = NStr::StringToInt(names[i], NStr::fConvErr_NoThrow);
236 if (index == 0)
237 {
238 vector<string>::const_iterator col_it = find(m_col_defs.begin(), m_col_defs.end(), names[i]);
239 if (col_it == m_col_defs.end())
240 {
241 _ReportError(i, message + " column does not exist", names[i]);
242 // stop processing
243 can_process = false;
244 }
245 else
246 col_defs[index = col_it - m_col_defs.begin()] = true;
247 }
248 else
249 if (index > (int)m_col_defs.size() || index<1)
250 {
251 _ReportError(i, message + " column does not exist", names[i]);
252 // stop processing
253 can_process = false;
254 }
255 else
256 col_defs[index - 1] = true;
257 }
258 return can_process;
259 }
260
_ProcessHeader(ILineReader & reader,const CTempString & default_columns)261 bool CTabDelimitedValidator::_ProcessHeader(ILineReader& reader, const CTempString& default_columns)
262 {
263 if (!default_columns.empty())
264 {
265 string lower = default_columns;
266 NStr::ToLower(lower);
267 NStr::Split(lower, ",", m_col_defs); //using comma separator always
268
269 return true;
270 }
271 else
272 {
273 while (!reader.AtEOF())
274 {
275 // skip all comment lines
276 reader.ReadLine();
277 // First line is a column definitions
278 m_current_row_number = reader.GetLineNumber();
279 string lower = reader.GetCurrentLine();
280 if (lower[0] == '#') continue;
281 NStr::ToLower(lower);
282 NStr::Split(lower, m_delim, m_col_defs);
283 break;
284 }
285
286 if (m_col_defs.size()<1)
287 {
288 _ReportError(1, "No columns specified", "");
289 return false;
290 }
291 }
292 return true;
293 }
294
_OperateRows(ILineReader & reader)295 void CTabDelimitedValidator::_OperateRows(ILineReader& reader)
296 {
297 while (!reader.AtEOF())
298 {
299 reader.ReadLine();
300 // First line is a column definitions
301 CTempString current = reader.GetCurrentLine();
302 m_current_row_number = reader.GetLineNumber();
303
304 if (current.empty())
305 {
306 if (m_flags & e_tab_ignore_empty_rows)
307 continue;
308 else
309 _ReportError(0, "Empty rows not allowed", "");
310 }
311 else
312 {
313 if (current[0] == '#') continue; // skip all comment lines
314 vector<CTempStringEx> values; values.reserve(m_col_defs.size());
315 NStr::Split(current, m_delim, values);
316
317 if (values.size() > m_col_defs.size())
318 _ReportError(m_col_defs.size(), "To many values", "");
319
320 ITERATE(vector< set<string> >, req_one_it, m_require_one_cols)
321 {
322 int count = 0;
323 for (size_t i=0; i<m_col_defs.size(); i++)
324 {
325 if (i<values.size() && !values[i].empty())
326 {
327 if (req_one_it->find(m_col_defs[i]) != req_one_it->end())
328 {
329 count++;
330 }
331 }
332 }
333 if (count==0)
334 {
335 string colname = NStr::Join(*req_one_it, ",");
336 _ReportError(-1, "None of require-one columns specified", colname);
337 }
338 }
339
340 for (size_t i=0; i<m_col_defs.size(); i++)
341 {
342 if (i>=values.size() || values[i].empty())
343 {
344 if (m_required_cols[i])
345 _ReportError(i, "Missing required value", "");
346 else
347 continue;
348 }
349 else
350 if (m_ignored_cols[i])
351 continue;
352
353 bool isfatal = _Validate(i, values[i]);
354 if (isfatal)
355 {
356 _ReportError(i, "Fatal error occured, stopping", "");
357 return;
358 }
359 if (!values[i].empty() && m_unique_cols[i])
360 {
361 int& count = m_unique_values[i][values[i]];
362 if (count++)
363 {
364 _ReportError(i, "Non unique value", "");
365 }
366
367 }
368 } // iterate over cols
369 }
370 }
371 }
372
GenerateOutput(CNcbiOstream * out_stream,bool no_headers)373 void CTabDelimitedValidator::GenerateOutput(CNcbiOstream* out_stream, bool no_headers)
374 {
375 if ( (m_flags & CTabDelimitedValidator::e_tab_tab_report) == CTabDelimitedValidator::e_tab_tab_report)
376 {
377 _ReportTab(out_stream);
378 }
379 else
380 if ( (m_flags & CTabDelimitedValidator::e_tab_xml_report) == CTabDelimitedValidator::e_tab_xml_report)
381 {
382 _ReportXML(out_stream, no_headers);
383 }
384 }
385
RegisterAliases(CNcbiIstream * in_stream)386 void CTabDelimitedValidator::RegisterAliases(CNcbiIstream* in_stream)
387 {
388 CColumnValidatorRegistry& r = CColumnValidatorRegistry::GetInstance();
389
390 // default aliases
391 r.Register("germline", "boolean");
392 r.Register("metagenomic", "boolean");
393 r.Register("rearranged", "boolean");
394 r.Register("transgenic", "boolean");
395
396 if (in_stream)
397 {
398 CRef<ILineReader> reader(ILineReader::New(*in_stream));
399 while (!reader->AtEOF())
400 {
401 reader->ReadLine();
402 CTempString line = reader->GetCurrentLine();
403 if (line.empty())
404 continue;
405 if (line[0] == '#' || line[0] == ';')
406 continue;
407 CTempString name, alias;
408 NStr::SplitInTwo(line, "\t ", name, alias, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
409 if (name.empty() || alias.empty())
410 continue;
411 r.Register(name, alias);
412 }
413 }
414 }
415
416 END_NCBI_SCOPE
417
418