1 /*  $Id: tab_table_reader.cpp 548810 2017-10-18 13:38:41Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Sergiy Gotvyanskyy, NCBI
27 *
28 * File Description:
29 *   Reader for structured comments for sequences
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include <util/line_reader.hpp>
37 
38 #include "tab_table_reader.hpp"
39 
40 #include <misc/xmlwrapp/xmlwrapp.hpp>
41 #include "col_validator.hpp"
42 
43 #include <common/test_assert.h>  /* This header must go last */
44 
45 BEGIN_NCBI_SCOPE
46 USING_SCOPE(objects);
47 
_Validate(int col_number,const CTempString & value)48 bool CTabDelimitedValidator::_Validate(int col_number, const CTempString& value)
49 {
50     const string& datatype = m_col_defs[col_number];
51     if (datatype.empty())
52         return false;
53 
54     string error;
55     bool isfatal = CColumnValidatorRegistry::GetInstance().
56         DoValidate(datatype, value, error);
57 
58     if (!error.empty())
59         _ReportError(col_number, error, datatype);
60 
61     return isfatal;
62 }
63 
_ReportWarning(int col_number,const CTempString & warning,const CTempString & colname)64 void CTabDelimitedValidator::_ReportWarning(int col_number, const CTempString& warning, const CTempString& colname)
65 {
66     _ReportError(col_number, warning, colname, true);
67 }
68 
_ReportError(int col_number,const CTempString & error,const CTempString & colname,bool warning)69 void CTabDelimitedValidator::_ReportError(int col_number, const CTempString& error, const CTempString& colname, bool warning)
70 {
71     CTabDelimitedValidatorMessage rec;
72     rec.m_col = col_number;
73     rec.m_row = m_current_row_number;
74     rec.m_msg = error;
75     rec.m_warning = warning;
76     rec.m_colname = colname;
77     m_errors.push_back(rec);
78 }
79 
_CheckHeader(const string & discouraged,const vector<string> & require_one)80 bool CTabDelimitedValidator::_CheckHeader(const string& discouraged, const vector<string>& require_one)
81 {
82     bool columns_ok = true;
83     vector<string> discouraged_cols;
84     if (!discouraged.empty())
85     {
86         NStr::Split(discouraged, ",", discouraged_cols);
87     }
88 
89     ITERATE(std::vector<std::string>, it, m_col_defs)
90     {
91         if (CColumnValidator::IsDiscouraged(*it) ||
92             find(discouraged_cols.begin(), discouraged_cols.end(), *it) != discouraged_cols.end())
93         {
94             int id = it - m_col_defs.begin();
95             _ReportError(id, "Column is discouraged", *it);
96             m_ignored_cols.resize(m_col_defs.size());
97             m_ignored_cols[id] = true;
98         }
99     }
100 
101     m_require_one_cols.resize(require_one.size());
102     ITERATE(vector<string>, it, require_one)
103     {
104         set<string>& require_one_cols = m_require_one_cols[it - require_one.begin()];
105         vector<string> cols;
106         NStr::Split(*it, ",", cols);
107         int found(0);
108         ITERATE(vector<string>, it_col, cols)
109         {
110             require_one_cols.insert(*it_col);
111             if (find(m_col_defs.begin(), m_col_defs.end(), *it_col) != m_col_defs.end())
112             {
113                 found++;
114             }
115         }
116         if (found == 0)
117         {
118            _ReportError(0, "Not found any of require-one columns", *it);
119         }
120     }
121 
122     return columns_ok;
123 }
124 
ValidateInput(ILineReader & reader,const string & default_columns,const string & required,const string & ignored,const string & unique,const string & discouraged,const vector<string> & require_one)125 void CTabDelimitedValidator::ValidateInput(ILineReader& reader,
126     const string& default_columns,
127     const string& required, const string& ignored,
128     const string& unique, const string& discouraged,
129     const vector<string>& require_one)
130 {
131     m_current_row_number = 0;
132     m_delim = (m_flags & e_tab_comma_delim) ? "," : "\t";
133 
134     if (_ProcessHeader(reader, default_columns) && _CheckHeader(discouraged, require_one))
135     {
136         // preprocess headers & required & ignored
137         if (_MakeColumns("Required", required, m_required_cols) &&
138             _MakeColumns("Ignored", ignored, m_ignored_cols) &&
139             _MakeColumns("Unique", unique, m_unique_cols))
140         {
141             if (!unique.empty())
142                 m_unique_values.resize(m_col_defs.size());
143 
144             bool ignore_unknown = (m_flags & e_tab_ignore_unknown_types) == e_tab_ignore_unknown_types;
145             map<string, int> types;
146             bool fatal = false;
147             for(size_t i=0; i<m_col_defs.size(); ++i)
148             {
149                 if (m_ignored_cols[i])
150                     continue;
151 
152                 if (!CColumnValidatorRegistry::GetInstance().IsSupported(m_col_defs[i]))
153                 {
154                     _ReportError(i, "Datatype is not supported", m_col_defs[i], ignore_unknown);
155                     if (ignore_unknown)
156                     {
157                        m_ignored_cols[i] = true;
158                        continue;
159                     }
160                     else
161                        fatal = true;
162                 }
163 
164                 int count = ++types[m_col_defs[i]];
165                 if (count == 2)
166                 {
167                     // report only first occurance
168                     _ReportError(i, "Column is not unique", m_col_defs[i]);
169                     fatal = true;
170                 }
171 
172             }
173             if (!fatal)
174                _OperateRows(reader);
175         }
176     }
177 }
178 
_ReportTab(CNcbiOstream * out_stream)179 void CTabDelimitedValidator::_ReportTab(CNcbiOstream* out_stream)
180 {
181     if (!m_errors.empty())
182         *out_stream << "Row\tColumn\tError\tWarning" << endl;
183 
184     ITERATE(list<CTabDelimitedValidatorMessage>, it, m_errors)
185     {
186         *out_stream << it->m_row << "\t" << it->m_col + 1 << "\t"
187             << it->m_colname << "\t"
188             << (it->m_warning?"\t":it->m_msg.c_str())
189             << (it->m_warning?it->m_msg.c_str(): "\t")
190             << endl;
191     }
192 }
193 
_ReportXML(CNcbiOstream * out_stream,bool no_headers)194 void CTabDelimitedValidator::_ReportXML(CNcbiOstream* out_stream, bool no_headers)
195 {
196     if (m_errors.empty())
197         return;
198 
199     xml::document xmldoc("tab_delimited_validator");
200     xml::node& root = xmldoc.get_root_node();
201 
202     int i=0;
203     ITERATE(list<CTabDelimitedValidatorMessage>, it, m_errors)
204     {
205         xml::node new_node(it->m_warning?"warning":"error");
206         new_node.get_attributes().insert("row", NStr::IntToString(it->m_row).c_str());
207         new_node.get_attributes().insert("column", NStr::IntToString(it->m_col + 1).c_str());
208         new_node.get_attributes().insert("message", it->m_msg.c_str());
209         if (!it->m_colname.empty())
210         {
211             new_node.get_attributes().insert("colname", it->m_colname.c_str());
212         }
213         // will be handled correctly
214         root.insert(new_node);
215 
216         i++;
217         if (i>100)
218             break;
219     }
220 
221     xmldoc.set_is_standalone(true);
222     xmldoc.set_encoding("utf-8");
223     *out_stream << xmldoc;
224 
225 }
226 
_MakeColumns(const string & message,const CTempString & columns,vector<bool> & col_defs)227 bool CTabDelimitedValidator::_MakeColumns(const string& message, const CTempString& columns, vector<bool>& col_defs)
228 {
229     col_defs.resize(m_col_defs.size(), false); // all values are not required
230     vector<CTempStringEx> names;
231     NStr::Split(columns, ",", names);
232     bool can_process = true;
233     for (size_t i=0; i<names.size(); i++)
234     {
235         int index = NStr::StringToInt(names[i], NStr::fConvErr_NoThrow);
236         if (index == 0)
237         {
238             vector<string>::const_iterator col_it = find(m_col_defs.begin(), m_col_defs.end(), names[i]);
239             if (col_it == m_col_defs.end())
240             {
241                 _ReportError(i, message + " column does not exist", names[i]);
242                 // stop processing
243                 can_process = false;
244             }
245             else
246                 col_defs[index = col_it - m_col_defs.begin()] = true;
247         }
248         else
249         if (index > (int)m_col_defs.size() || index<1)
250         {
251             _ReportError(i, message + " column does not exist", names[i]);
252             // stop processing
253             can_process = false;
254         }
255         else
256             col_defs[index - 1] = true;
257     }
258     return can_process;
259 }
260 
_ProcessHeader(ILineReader & reader,const CTempString & default_columns)261 bool CTabDelimitedValidator::_ProcessHeader(ILineReader& reader, const CTempString& default_columns)
262 {
263     if (!default_columns.empty())
264     {
265         string lower = default_columns;
266         NStr::ToLower(lower);
267         NStr::Split(lower, ",", m_col_defs); //using comma separator always
268 
269         return true;
270     }
271     else
272     {
273         while (!reader.AtEOF())
274         {
275             // skip all comment lines
276             reader.ReadLine();
277             // First line is a column definitions
278             m_current_row_number = reader.GetLineNumber();
279             string lower = reader.GetCurrentLine();
280             if (lower[0] == '#') continue;
281             NStr::ToLower(lower);
282             NStr::Split(lower, m_delim, m_col_defs);
283             break;
284         }
285 
286         if (m_col_defs.size()<1)
287         {
288             _ReportError(1, "No columns specified", "");
289             return false;
290         }
291     }
292     return true;
293 }
294 
_OperateRows(ILineReader & reader)295 void CTabDelimitedValidator::_OperateRows(ILineReader& reader)
296 {
297     while (!reader.AtEOF())
298     {
299         reader.ReadLine();
300         // First line is a column definitions
301         CTempString current = reader.GetCurrentLine();
302         m_current_row_number = reader.GetLineNumber();
303 
304         if (current.empty())
305         {
306             if (m_flags & e_tab_ignore_empty_rows)
307                 continue;
308             else
309                 _ReportError(0, "Empty rows not allowed", "");
310         }
311         else
312         {
313             if (current[0] == '#') continue; // skip all comment lines
314             vector<CTempStringEx> values; values.reserve(m_col_defs.size());
315             NStr::Split(current, m_delim, values);
316 
317             if (values.size() > m_col_defs.size())
318                 _ReportError(m_col_defs.size(), "To many values", "");
319 
320             ITERATE(vector< set<string> >, req_one_it, m_require_one_cols)
321             {
322                 int count = 0;
323                 for (size_t i=0; i<m_col_defs.size(); i++)
324                 {
325                     if (i<values.size() && !values[i].empty())
326                     {
327                         if (req_one_it->find(m_col_defs[i]) != req_one_it->end())
328                         {
329                             count++;
330                         }
331                     }
332                 }
333                 if (count==0)
334                 {
335                     string colname = NStr::Join(*req_one_it, ",");
336                     _ReportError(-1, "None of require-one columns specified", colname);
337                 }
338             }
339 
340             for (size_t i=0; i<m_col_defs.size(); i++)
341             {
342                 if (i>=values.size() || values[i].empty())
343                 {
344                     if (m_required_cols[i])
345                         _ReportError(i, "Missing required value", "");
346                     else
347                         continue;
348                 }
349                 else
350                 if (m_ignored_cols[i])
351                     continue;
352 
353                 bool isfatal = _Validate(i, values[i]);
354                 if (isfatal)
355                 {
356                     _ReportError(i, "Fatal error occured, stopping", "");
357                     return;
358                 }
359                 if (!values[i].empty() && m_unique_cols[i])
360                 {
361                     int& count = m_unique_values[i][values[i]];
362                     if (count++)
363                     {
364                         _ReportError(i, "Non unique value", "");
365                     }
366 
367                 }
368             } // iterate over cols
369         }
370     }
371 }
372 
GenerateOutput(CNcbiOstream * out_stream,bool no_headers)373 void CTabDelimitedValidator::GenerateOutput(CNcbiOstream* out_stream, bool no_headers)
374 {
375     if ( (m_flags & CTabDelimitedValidator::e_tab_tab_report) == CTabDelimitedValidator::e_tab_tab_report)
376     {
377         _ReportTab(out_stream);
378     }
379     else
380     if ( (m_flags & CTabDelimitedValidator::e_tab_xml_report) == CTabDelimitedValidator::e_tab_xml_report)
381     {
382         _ReportXML(out_stream, no_headers);
383     }
384 }
385 
RegisterAliases(CNcbiIstream * in_stream)386 void CTabDelimitedValidator::RegisterAliases(CNcbiIstream* in_stream)
387 {
388     CColumnValidatorRegistry& r = CColumnValidatorRegistry::GetInstance();
389 
390     // default aliases
391     r.Register("germline", "boolean");
392     r.Register("metagenomic", "boolean");
393     r.Register("rearranged", "boolean");
394     r.Register("transgenic", "boolean");
395 
396     if (in_stream)
397     {
398         CRef<ILineReader> reader(ILineReader::New(*in_stream));
399         while (!reader->AtEOF())
400         {
401             reader->ReadLine();
402             CTempString line = reader->GetCurrentLine();
403             if (line.empty())
404                 continue;
405             if (line[0] == '#' || line[0] == ';')
406                 continue;
407             CTempString name, alias;
408             NStr::SplitInTwo(line, "\t ", name, alias, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
409             if (name.empty() || alias.empty())
410                 continue;
411             r.Register(name, alias);
412         }
413     }
414 }
415 
416 END_NCBI_SCOPE
417 
418