1 #ifndef UTIL___ROW_READER_EXCEL_CSV__HPP
2 #define UTIL___ROW_READER_EXCEL_CSV__HPP
3 
4 /*  $Id: row_reader_excel_csv.hpp 564203 2018-05-23 12:13:23Z ivanov $
5 * ===========================================================================
6 *
7 *                            PUBLIC DOMAIN NOTICE
8 *               National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's official duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Authors: Denis Vakatov, Sergey Satskiy
30 *
31 * File Description:
32 *   Implementation of the CRowReader<> traits for MS EXCEL CSV
33 *
34 * ===========================================================================
35 */
36 
37 #include <util/row_reader_char_delimited.hpp>
38 
39 
40 BEGIN_NCBI_SCOPE
41 
42 
/// Note 1: Empty rows are allowed and are treated as rows with 0 fields
/// Note 2: Both CRLF and LF line endings are allowed
/// Note 3: The number of fields is not enforced
/// Note 4: There is no formal MS Excel CSV spec, so the implementation is
///         based on experiments made with MS Excel 2013.
///         See the description in JIRA: CXX-9221
/// Note 5: Two kinds of fields in a data source
///         - empty, i.e. ,,
///         - "", i.e. ,"",
///         are translated to a Null field
/// Note 6: Trailing Null fields in a data source are stripped
54 
55 
56 const CTempString   kNullFieldRepresentation = CTempString("\"\"", 2);
57 
58 
59 /// MS Excel CSV traits.
60 class CRowReaderStream_Excel_CSV : public TRowReaderStream_SingleCommaDelimited
61 {
62 public:
CRowReaderStream_Excel_CSV()63     CRowReaderStream_Excel_CSV()
64     {
65         m_LineSeparator.reserve(2);
66         m_PreviousLineSeparator.reserve(2);
67     }
68 
69     // It could be more than one raw line in one row
ReadRowData(CNcbiIstream & is,string * data)70     size_t ReadRowData(CNcbiIstream& is, string* data)
71     {
72         data->clear();
73         m_Tokens.clear();
74 
75         size_t      current_index= 0;
76         size_t      token_begin_index = 0;
77         size_t      lines_read = 0;
78         bool        in_quotes = false;
79         for (;;) {
80             x_ReadOneLine(is, data, lines_read > 0);
81             ++lines_read;
82 
83             while (current_index < data->size()) {
84                 auto    current_char = (*data)[current_index];
85                 if (current_char == ',') {
86                     if (!in_quotes) {
87                         m_Tokens.emplace_back(token_begin_index);
88                         token_begin_index = current_index + 1;
89                     }
90                 } else if (current_char == '"') {
91                     if (token_begin_index == current_index) {
92                         in_quotes = true;
93                     } else {
94                         if (in_quotes) {
95                             if (current_index + 1 < data->size() &&
96                                 (*data)[current_index + 1] == '"') {
97                                 ++current_index;
98                             } else {
99                                 in_quotes = false;
100                             }
101                         }
102                     }
103                 }
104 
105                 ++current_index;
106             }
107 
108             if (!in_quotes)
109                 break;
110 
111             // Here: need to read one more line because of the double quotes.
112             //       So check if we still can read.
113             if (!bool(is))
114                 break;
115         }
116 
117         m_Tokens.push_back(token_begin_index);
118         return lines_read;
119     }
120 
OnNextLine(CTempString)121     ERR_Action OnNextLine(CTempString /* raw_line */)
122     {
123         return eRR_Continue_Data;
124     }
125 
126 
127     // The tokenization is actually done in the ReadRowData() member
Tokenize(const CTempString raw_line,vector<CTempString> & tokens)128     ERR_Action Tokenize(const CTempString  raw_line,
129                         vector<CTempString>& tokens)
130     {
131         // Special case in accordance with CXX-9221: empty line => no fields
132         if (!raw_line.empty()) {
133             size_t      field_size;
134             for (TFieldNo field_no = 0;
135                  field_no < m_Tokens.size(); ++field_no) {
136                 if (field_no + 1 < m_Tokens.size())
137                     field_size = m_Tokens[field_no + 1] - m_Tokens[field_no] - 1;
138                 else
139                     field_size = raw_line.size() - m_Tokens[field_no];
140                 tokens.emplace_back(raw_line.data() + m_Tokens[field_no],
141                                     field_size);
142             }
143 
144             x_StripTrailingNullFields(tokens);
145         }
146         return eRR_Continue_Data;
147     }
148 
Validate(CTempString raw_line,ERR_FieldValidationMode field_validation_mode)149     ERR_Action Validate(CTempString raw_line,
150                         ERR_FieldValidationMode field_validation_mode)
151     {
152         if (field_validation_mode == eRR_NoFieldValidation)
153             return eRR_Skip;
154         if (m_FieldsToValidate.empty())
155             return eRR_Skip;
156 
157         if (raw_line.empty())
158             return eRR_Skip;
159 
160         // Here: the field values need to be validated and there is some type
161         // information
162         m_ValidationTokens.clear();
163         ERR_Action action = this->Tokenize(raw_line, m_ValidationTokens);
164 
165         if (action == eRR_Skip)
166             return eRR_Skip;
167 
168         for (const auto& info : m_FieldsToValidate) {
169             if (info.first < m_Tokens.size()) {
170                 string translated;
171                 ERR_TranslationResult translation_result =
172                     this->Translate((TFieldNo)info.first, m_ValidationTokens[info.first], translated);
173                 if (translation_result == eRR_UseOriginal) {
174                     CRR_Util::ValidateBasicTypeFieldValue(
175                         m_ValidationTokens[info.first],
176                         info.second.first, info.second.second);
177                 } else {
178                     CRR_Util::ValidateBasicTypeFieldValue(
179                         translated, info.second.first, info.second.second);
180                 }
181             }
182         }
183         return eRR_Skip;
184     }
185 
Translate(TFieldNo,const CTempString raw_value,string & translated_value)186     ERR_TranslationResult Translate(TFieldNo          /* field_no */,
187                                     const CTempString raw_value,
188                                     string&           translated_value)
189     {
190         if (x_IsNull(raw_value))
191             return eRR_Null;
192 
193         if (raw_value[0] == '=') {
194             size_t  dbl_quote_cnt = 0;
195             for (size_t index = 0; index < raw_value.size(); ++index)
196                 if (raw_value[index] == '"')
197                     ++dbl_quote_cnt;
198 
199             if (dbl_quote_cnt == 0) {
200                 translated_value = string(raw_value.data() + 1,
201                                           raw_value.size() - 1);
202                 return eRR_Translated;
203             }
204 
205             // Here: there are " in the field. They may need to be stripped
206             // together with = if:
207             // - " follows = immediately
208             // - " is the last character in a field
209             // - there is an even number of "
210             // If so then "" need to be translated into " inside the field
211             // as well
212             if (dbl_quote_cnt % 2 == 0) {
213                 if (raw_value[1] == '"' &&
214                     raw_value[raw_value.size() - 1] == '"') {
215                     // Balanced double quote and poperly surround the field
216                     // value => strip the = and surrounding " plus replace
217                     // "" with "
218                     translated_value = string(raw_value.data() + 2,
219                                               raw_value.size() - 3);
220                     NStr::ReplaceInPlace(translated_value, "\"\"", "\"");
221                     return eRR_Translated;
222                 }
223             }
224 
225             // Non balanced double quotes or they are not surrounding the
226             // value after =
227             // There is no translation for this case
228             return eRR_UseOriginal;
229         }
230 
231         if (raw_value[0] == '"') {
232             size_t      match_index = 1;
233             for (; match_index < raw_value.size(); ++match_index) {
234                 if (raw_value[match_index] == '"') {
235                     if (match_index + 1< raw_value.size() &&
236                         raw_value[match_index + 1] == '"')
237                         ++match_index;
238                     else
239                         break;
240                 }
241             }
242 
243             // Here: match_index points beyond of the field or to a
244             // matching "
245             if (match_index < raw_value.size()) {
246                 // matching " found
247                 translated_value = string(raw_value.data() + 1,
248                                           match_index - 1);
249                 NStr::ReplaceInPlace(translated_value, "\"\"", "\"");
250                 if (match_index < raw_value.size() - 1) {
251                     // tail of the field needs to ba attached as is
252                     translated_value.append(
253                         raw_value.data() + match_index + 1,
254                         raw_value.size() - match_index - 1);
255                 }
256             } else {
257                 // Unbalanced " case
258                 translated_value = string(raw_value.data() + 1,
259                                           raw_value.size() - 1);
260             }
261 
262             // This could be a case with a leading = which may need to be
263             // stripped as well...
264             if (!translated_value.empty()) {
265                 if (translated_value[0] == '=') {
266                     size_t  dbl_quote_cnt = 0;
267                     for (size_t index = 0;
268                          index < translated_value.size(); ++index)
269                         if (translated_value[index] == '"')
270                             ++dbl_quote_cnt;
271 
272                     if (dbl_quote_cnt > 0 && (dbl_quote_cnt % 2 == 0)) {
273                         if (translated_value[1] == '"' &&
274                             translated_value[translated_value.size() - 1] == '"') {
275                             translated_value = translated_value.substr(2, translated_value.size() - 3);
276                         }
277                     }
278                 }
279             }
280 
281             return eRR_Translated;
282         }
283         return eRR_UseOriginal;
284     }
285 
OnEvent(ERR_Event event,ERR_EventMode event_mode)286     ERR_EventAction OnEvent(ERR_Event event,
287                             ERR_EventMode event_mode)
288     {
289         switch (event) {
290             case eRR_Event_SourceBegin:
291                 GetMyStream().x_ClearTraitsProvidedFieldsInfo();
292 
293                 if (event_mode == eRR_EventMode_Validating)
294                     x_GetFieldTypesToValidate();
295 
296                 // fall through
297             case eRR_Event_SourceEnd:
298             case eRR_Event_SourceError:
299             default:
300                 ;
301         }
302         return eRR_EventAction_Default;
303     }
304 
305 private:
x_ReadOneLine(CNcbiIstream & is,string * data,bool joining)306     void x_ReadOneLine(CNcbiIstream& is, string* data, bool joining)
307     {
308         m_RawLine.clear();
309         std::getline(is, m_RawLine);
310         m_LineSeparator = "\n";
311         if(!m_RawLine.empty()  &&  m_RawLine.back() == '\r') {
312             m_RawLine.pop_back();
313             m_LineSeparator = "\r\n";
314         }
315 
316         if (joining)
317             data->append(m_PreviousLineSeparator);
318         data->append(m_RawLine);
319 
320         m_PreviousLineSeparator = m_LineSeparator;
321     }
322 
x_GetFieldTypesToValidate(void)323     void x_GetFieldTypesToValidate(void)
324     {
325         m_FieldsToValidate.clear();
326         for (const auto& info : GetMyStream().GetFieldsMetaInfo()) {
327             if (info.is_type_initialized) {
328                 auto field_type = info.type.GetType();
329                 if (field_type == eRR_Boolean || field_type == eRR_Integer ||
330                     field_type == eRR_Double || field_type == eRR_DateTime)
331                     m_FieldsToValidate[info.field_no] =
332                         make_pair(field_type, info.type.GetProps());
333             }
334         }
335     }
336 
x_IsNull(const CTempString & raw_field_value)337     bool x_IsNull(const CTempString&  raw_field_value)
338     {
339         return raw_field_value.empty() ||
340                (raw_field_value == kNullFieldRepresentation);
341     }
342 
x_StripTrailingNullFields(vector<CTempString> & tokens)343     void x_StripTrailingNullFields(vector<CTempString>& tokens)
344     {
345         while (!tokens.empty()) {
346             if (x_IsNull(tokens.back()))
347                 tokens.pop_back();
348             else
349                 break;
350         }
351     }
352 
353 private:
354     vector<size_t>  m_Tokens;
355     string          m_LineSeparator;
356     string          m_PreviousLineSeparator;
357     string          m_RawLine;
358 
359     map<size_t, pair<ERR_FieldType, string>> m_FieldsToValidate;
360     vector<CTempString>                      m_ValidationTokens;
361 
362     RR_TRAITS_PARENT_STREAM(CRowReaderStream_Excel_CSV);
363 };
364 
365 
366 
367 END_NCBI_SCOPE
368 
369 #endif  /* UTIL___ROW_READER_EXCEL_CSV__HPP */
370